Vector Optimized Library of Kernels 3.3.0
Architecture-tuned implementations of math kernels
Loading...
Searching...
No Matches
volk_32f_invsqrt_32f.h
Go to the documentation of this file.
1/* -*- c++ -*- */
2/*
3 * Copyright 2013, 2014 Free Software Foundation, Inc.
4 * Copyright 2023 Magnus Lundmark <magnuslundmark@gmail.com>
5 *
6 * This file is part of VOLK
7 *
8 * SPDX-License-Identifier: LGPL-3.0-or-later
9 */
10
52
53#ifndef INCLUDED_volk_32f_invsqrt_32f_a_H
54#define INCLUDED_volk_32f_invsqrt_32f_a_H
55
56#include <math.h>
57#include <volk/volk_common.h>
58
59#ifdef LV_HAVE_GENERIC
60
61static inline void volk_32f_invsqrt_32f_generic(float* cVector,
62 const float* aVector,
63 unsigned int num_points)
64{
65 for (unsigned int number = 0; number < num_points; number++) {
66 cVector[number] = sqrtf(1.0f / aVector[number]);
67 }
68}
69#endif /* LV_HAVE_GENERIC */
70
71#ifdef LV_HAVE_GENERIC
72
73static inline void volk_32f_invsqrt_32f_recip_sqrt(float* cVector,
74 const float* aVector,
75 unsigned int num_points)
76{
77 for (unsigned int number = 0; number < num_points; number++) {
78 cVector[number] = 1.0f / sqrtf(aVector[number]);
79 }
80}
81#endif /* LV_HAVE_GENERIC */
82
83#ifdef LV_HAVE_GENERIC
84
static inline void volk_32f_invsqrt_32f_Q_rsqrt(float* cVector,
                                                const float* aVector,
                                                unsigned int num_points)
{
    // The famous fast inverse square root from Quake III Arena.
    // Approximates cVector[i] = 1/sqrt(aVector[i]) by bit-twiddling the
    // float representation, then refining with Newton-Raphson iterations.
    // Unions are used for the float<->uint32_t reinterpretation; this is
    // well-defined type punning in C (unlike a pointer cast).
    union {
        float f;
        uint32_t u;
    } conv, result;

    for (unsigned int number = 0; number < num_points; number++) {
        float a = aVector[number];
        float xhalf = 0.5f * a;
        conv.f = a;
        uint32_t input_bits = conv.u; // Save original bits for edge case detection
        conv.u = 0x5f3759df - (conv.u >> 1); // The magic (note: use unsigned for shift)
        float x = conv.f;
        // Each iteration roughly doubles the number of correct bits; three
        // iterations bring the initial ~12%-error estimate to float accuracy.
        x = x * (1.5f - xhalf * x * x); // Newton-Raphson iteration 1
        x = x * (1.5f - xhalf * x * x);
        x = x * (1.5f - xhalf * x * x);

        // Branchless special case handling.
        // Each is_* value is an all-ones (0xFFFFFFFF) or all-zeros mask,
        // built by negating the 0/1 result of a comparison, so the masks
        // can be combined with bitwise AND/OR to select the output.
        result.f = x;
        uint32_t is_positive = (uint32_t)(-(int32_t)(a > 0.0f));
        uint32_t is_zero = (uint32_t)(-(int32_t)(input_bits == 0x00000000));
        uint32_t is_inf = (uint32_t)(-(int32_t)(input_bits == 0x7F800000));
        uint32_t is_normal_pos = is_positive & ~is_inf;
        result.u = (result.u & is_normal_pos) | // Normal positive: keep result
                   (0x7F800000u & is_zero) | // +0 → +Inf
                   (0x7FC00000u & ~is_positive & ~is_zero); // Negative/NaN → NaN
        // Note: +Inf → 0 is handled implicitly (all terms are 0 when is_inf)
        // NOTE(review): -0.0f (bits 0x80000000) falls into the NaN branch
        // here, whereas IEEE rsqrt(-0) would be -Inf — confirm this is the
        // intended contract for this approximation kernel.
        cVector[number] = result.f;
    }
}
119#endif /* LV_HAVE_GENERIC */
120
121#ifdef LV_HAVE_SSE
123#include <xmmintrin.h>
124
125static inline void
126volk_32f_invsqrt_32f_a_sse(float* cVector, const float* aVector, unsigned int num_points)
127{
128 unsigned int number = 0;
129 const unsigned int quarterPoints = num_points / 4;
130
131 float* cPtr = cVector;
132 const float* aPtr = aVector;
133 for (; number < quarterPoints; number++) {
134 __m128 aVal = _mm_load_ps(aPtr);
135 __m128 cVal = _mm_rsqrt_nr_ps(aVal);
136 _mm_store_ps(cPtr, cVal);
137 aPtr += 4;
138 cPtr += 4;
139 }
140
141 number = quarterPoints * 4;
142 for (; number < num_points; number++) {
143 *cPtr++ = 1.0f / sqrtf(*aPtr++);
144 }
145}
146#endif /* LV_HAVE_SSE */
147
148#ifdef LV_HAVE_AVX
149#include <immintrin.h>
151
152static inline void
153volk_32f_invsqrt_32f_a_avx(float* cVector, const float* aVector, unsigned int num_points)
154{
155 unsigned int number = 0;
156 const unsigned int eighthPoints = num_points / 8;
157
158 float* cPtr = cVector;
159 const float* aPtr = aVector;
160 for (; number < eighthPoints; number++) {
161 __m256 aVal = _mm256_load_ps(aPtr);
162 __m256 cVal = _mm256_rsqrt_nr_ps(aVal);
163 _mm256_store_ps(cPtr, cVal);
164 aPtr += 8;
165 cPtr += 8;
166 }
167
168 number = eighthPoints * 8;
169 for (; number < num_points; number++) {
170 *cPtr++ = 1.0f / sqrtf(*aPtr++);
171 }
172}
173#endif /* LV_HAVE_AVX */
174
175
176#ifdef LV_HAVE_AVX2
177#include <immintrin.h>
179
180static inline void
181volk_32f_invsqrt_32f_a_avx2(float* cVector, const float* aVector, unsigned int num_points)
182{
183 unsigned int number = 0;
184 const unsigned int eighthPoints = num_points / 8;
185
186 float* cPtr = cVector;
187 const float* aPtr = aVector;
188 for (; number < eighthPoints; number++) {
189 __m256 aVal = _mm256_load_ps(aPtr);
190 __m256 cVal = _mm256_rsqrt_nr_avx2(aVal);
191 _mm256_store_ps(cPtr, cVal);
192 aPtr += 8;
193 cPtr += 8;
194 }
195
196 number = eighthPoints * 8;
197 for (; number < num_points; number++) {
198 *cPtr++ = 1.0f / sqrtf(*aPtr++);
199 }
200}
201#endif /* LV_HAVE_AVX2 */
202
203
204#ifdef LV_HAVE_AVX512F
205#include <immintrin.h>
207
208static inline void volk_32f_invsqrt_32f_a_avx512f(float* cVector,
209 const float* aVector,
210 unsigned int num_points)
211{
212 unsigned int number = 0;
213 const unsigned int sixteenthPoints = num_points / 16;
214
215 float* cPtr = cVector;
216 const float* aPtr = aVector;
217 for (; number < sixteenthPoints; number++) {
218 __m512 aVal = _mm512_load_ps(aPtr);
219 __m512 cVal = _mm512_rsqrt_nr_ps(aVal);
220 _mm512_store_ps(cPtr, cVal);
221 aPtr += 16;
222 cPtr += 16;
223 }
224
225 number = sixteenthPoints * 16;
226 for (; number < num_points; number++) {
227 *cPtr++ = 1.0f / sqrtf(*aPtr++);
228 }
229}
230#endif /* LV_HAVE_AVX512F */
231
232#ifdef LV_HAVE_SSE
234#include <xmmintrin.h>
235
236static inline void
237volk_32f_invsqrt_32f_u_sse(float* cVector, const float* aVector, unsigned int num_points)
238{
239 unsigned int number = 0;
240 const unsigned int quarterPoints = num_points / 4;
241
242 float* cPtr = cVector;
243 const float* aPtr = aVector;
244 for (; number < quarterPoints; number++) {
245 __m128 aVal = _mm_loadu_ps(aPtr);
246 __m128 cVal = _mm_rsqrt_nr_ps(aVal);
247 _mm_storeu_ps(cPtr, cVal);
248 aPtr += 4;
249 cPtr += 4;
250 }
251
252 number = quarterPoints * 4;
253 for (; number < num_points; number++) {
254 *cPtr++ = 1.0f / sqrtf(*aPtr++);
255 }
256}
257#endif /* LV_HAVE_SSE */
258
259
260#ifdef LV_HAVE_AVX
261#include <immintrin.h>
263
264static inline void
265volk_32f_invsqrt_32f_u_avx(float* cVector, const float* aVector, unsigned int num_points)
266{
267 unsigned int number = 0;
268 const unsigned int eighthPoints = num_points / 8;
269
270 float* cPtr = cVector;
271 const float* aPtr = aVector;
272 for (; number < eighthPoints; number++) {
273 __m256 aVal = _mm256_loadu_ps(aPtr);
274 __m256 cVal = _mm256_rsqrt_nr_ps(aVal);
275 _mm256_storeu_ps(cPtr, cVal);
276 aPtr += 8;
277 cPtr += 8;
278 }
279
280 number = eighthPoints * 8;
281 for (; number < num_points; number++) {
282 *cPtr++ = 1.0f / sqrtf(*aPtr++);
283 }
284}
285#endif /* LV_HAVE_AVX */
286
287#ifdef LV_HAVE_AVX2
288#include <immintrin.h>
290
291static inline void
292volk_32f_invsqrt_32f_u_avx2(float* cVector, const float* aVector, unsigned int num_points)
293{
294 unsigned int number = 0;
295 const unsigned int eighthPoints = num_points / 8;
296
297 float* cPtr = cVector;
298 const float* aPtr = aVector;
299 for (; number < eighthPoints; number++) {
300 __m256 aVal = _mm256_loadu_ps(aPtr);
301 __m256 cVal = _mm256_rsqrt_nr_avx2(aVal);
302 _mm256_storeu_ps(cPtr, cVal);
303 aPtr += 8;
304 cPtr += 8;
305 }
306
307 number = eighthPoints * 8;
308 for (; number < num_points; number++) {
309 *cPtr++ = 1.0f / sqrtf(*aPtr++);
310 }
311}
312#endif /* LV_HAVE_AVX2 */
313
314#ifdef LV_HAVE_AVX512F
315#include <immintrin.h>
317
318static inline void volk_32f_invsqrt_32f_u_avx512f(float* cVector,
319 const float* aVector,
320 unsigned int num_points)
321{
322 unsigned int number = 0;
323 const unsigned int sixteenthPoints = num_points / 16;
324
325 float* cPtr = cVector;
326 const float* aPtr = aVector;
327 for (; number < sixteenthPoints; number++) {
328 __m512 aVal = _mm512_loadu_ps(aPtr);
329 __m512 cVal = _mm512_rsqrt_nr_ps(aVal);
330 _mm512_storeu_ps(cPtr, cVal);
331 aPtr += 16;
332 cPtr += 16;
333 }
334
335 number = sixteenthPoints * 16;
336 for (; number < num_points; number++) {
337 *cPtr++ = 1.0f / sqrtf(*aPtr++);
338 }
339}
340#endif /* LV_HAVE_AVX512F */
341
342#ifdef LV_HAVE_NEON
343#include <arm_neon.h>
345
static inline void
volk_32f_invsqrt_32f_neon(float* cVector, const float* aVector, unsigned int num_points)
{
    /* NEON kernel: 1/sqrt on 4 floats per iteration via the VOLK
     * _vinvsqrtq_f32 helper, followed by a scalar tail. */
    const unsigned int vec_iters = num_points / 4;
    const float* in = aVector;
    float* out = cVector;

    for (unsigned int i = 0; i < vec_iters; ++i) {
        vst1q_f32(out, _vinvsqrtq_f32(vld1q_f32(in)));
        in += 4;
        out += 4;
    }

    /* Scalar tail for the remaining 0-3 elements. */
    for (unsigned int i = vec_iters * 4; i < num_points; ++i) {
        *out++ = 1.0f / sqrtf(*in++);
    }
}
366#endif /* LV_HAVE_NEON */
367
368#ifdef LV_HAVE_NEONV8
369#include <arm_neon.h>
370
static inline void
volk_32f_invsqrt_32f_neonv8(float* cVector, const float* aVector, unsigned int num_points)
{
    // ARMv8 NEON kernel: cVector[i] = 1/sqrt(aVector[i]) using the hardware
    // reciprocal-square-root estimate (vrsqrteq) refined by Newton-Raphson
    // steps (vrsqrtsq), with a bit-mask blend to keep the exact results for
    // the +0 and +Inf special cases.
    unsigned int number;
    const unsigned int quarter_points = num_points / 4;

    float* cPtr = cVector;
    const float* aPtr = aVector;

    // Process 4 elements at a time (1 vector)
    for (number = 0; number < quarter_points; ++number) {
        float32x4_t a = vld1q_f32(aPtr);
        float32x4_t x0 = vrsqrteq_f32(a); // +Inf for +0, 0 for +Inf

        // Two Newton-Raphson iterations for float32 accuracy.
        // vrsqrtsq_f32(p, q) computes (3 - p*q) / 2, so each line is the
        // standard refinement x_{n+1} = x_n * (3 - a*x_n*x_n) / 2.
        float32x4_t x1 = vmulq_f32(x0, vrsqrtsq_f32(vmulq_f32(a, x0), x0));
        x1 = vmulq_f32(x1, vrsqrtsq_f32(vmulq_f32(a, x1), x1));

        // For +0 and +Inf inputs, x0 is correct but NR produces NaN due to Inf*0
        // Blend: use x0 where a == +0 or a == +Inf, else use x1
        // (exact bit-pattern compares: 0x00000000 is +0, 0x7F800000 is +Inf;
        // NOTE(review): -0.0f bits 0x80000000 are not special-cased here —
        // confirm the desired result for negative zero.)
        uint32x4_t a_bits = vreinterpretq_u32_f32(a);
        uint32x4_t zero_mask = vceqq_u32(a_bits, vdupq_n_u32(0x00000000));
        uint32x4_t inf_mask = vceqq_u32(a_bits, vdupq_n_u32(0x7F800000));
        uint32x4_t special_mask = vorrq_u32(zero_mask, inf_mask);

        // vbslq selects x0 where the mask bits are set, x1 elsewhere.
        vst1q_f32(cPtr, vbslq_f32(special_mask, x0, x1));
        aPtr += 4;
        cPtr += 4;
    }

    // Scalar tail for the remaining 0-3 elements.
    for (number = quarter_points * 4; number < num_points; number++) {
        *cPtr++ = 1.0f / sqrtf(*aPtr++);
    }
}
405#endif /* LV_HAVE_NEONV8 */
406
407#ifdef LV_HAVE_RVV
408#include <riscv_vector.h>
409
static inline void
volk_32f_invsqrt_32f_rvv(float* cVector, const float* aVector, unsigned int num_points)
{
    // RISC-V Vector kernel: cVector[i] = 1/sqrt(aVector[i]) using the
    // vfrsqrt7 estimate refined by Newton-Raphson, with a masked merge to
    // keep the exact results for the +0 and +Inf special cases.
    // Strip-mining loop: vsetvl picks the vector length for each pass and
    // the pointers advance by that amount, so all tail handling is implicit.
    size_t n = num_points;
    for (size_t vl; n > 0; n -= vl, aVector += vl, cVector += vl) {
        vl = __riscv_vsetvl_e32m8(n);
        vfloat32m8_t a = __riscv_vle32_v_f32m8(aVector, vl);
        vfloat32m8_t half = __riscv_vfmv_v_f_f32m8(0.5f, vl);
        vfloat32m8_t three_halfs = __riscv_vfmv_v_f_f32m8(1.5f, vl);

        // Initial estimate (~7-bit precision): +Inf for +0, 0 for +Inf
        vfloat32m8_t x0 = __riscv_vfrsqrt7(a, vl);

        // Two Newton-Raphson iterations: x = x * (1.5 - 0.5 * a * x * x)
        // vfnmsac(acc, p, q) computes acc - p*q, giving 1.5 - (a/2)*x*x.
        vfloat32m8_t half_a = __riscv_vfmul(half, a, vl);
        vfloat32m8_t x1 = __riscv_vfmul(
            x0, __riscv_vfnmsac(three_halfs, half_a, __riscv_vfmul(x0, x0, vl), vl), vl);
        x1 = __riscv_vfmul(
            x1, __riscv_vfnmsac(three_halfs, half_a, __riscv_vfmul(x1, x1, vl), vl), vl);

        // For +0 and +Inf inputs, x0 is correct but NR produces NaN due to Inf*0
        // Blend: use x0 where a == +0 or a == +Inf, else use x1
        // (exact bit-pattern compares: 0x00000000 is +0, 0x7F800000 is +Inf;
        // NOTE(review): -0.0f bits 0x80000000 are not special-cased here —
        // confirm the desired result for negative zero.)
        vuint32m8_t a_bits = __riscv_vreinterpret_v_f32m8_u32m8(a);
        vbool4_t zero_mask = __riscv_vmseq_vx_u32m8_b4(a_bits, 0x00000000, vl);
        vbool4_t inf_mask = __riscv_vmseq_vx_u32m8_b4(a_bits, 0x7F800000, vl);
        vbool4_t special_mask = __riscv_vmor_mm_b4(zero_mask, inf_mask, vl);
        vfloat32m8_t result = __riscv_vmerge_vvm_f32m8(x1, x0, special_mask, vl);

        __riscv_vse32(cVector, result, vl);
    }
}
441#endif /*LV_HAVE_RVV*/
442
443#endif /* INCLUDED_volk_32f_invsqrt_32f_a_H */