53#ifndef INCLUDED_volk_32f_invsqrt_32f_a_H
54#define INCLUDED_volk_32f_invsqrt_32f_a_H
63 unsigned int num_points)
65 for (
unsigned int number = 0; number < num_points; number++) {
66 cVector[number] = sqrtf(1.0f / aVector[number]);
75 unsigned int num_points)
77 for (
unsigned int number = 0; number < num_points; number++) {
78 cVector[number] = 1.0f / sqrtf(aVector[number]);
87 unsigned int num_points)
95 for (
unsigned int number = 0; number < num_points; number++) {
96 float a = aVector[number];
97 float xhalf = 0.5f * a;
99 uint32_t input_bits = conv.u;
100 conv.u = 0x5f3759df - (conv.u >> 1);
102 x = x * (1.5f - xhalf * x * x);
103 x = x * (1.5f - xhalf * x * x);
104 x = x * (1.5f - xhalf * x * x);
108 uint32_t is_positive = (uint32_t)(-(int32_t)(a > 0.0f));
109 uint32_t is_zero = (uint32_t)(-(int32_t)(input_bits == 0x00000000));
110 uint32_t is_inf = (uint32_t)(-(int32_t)(input_bits == 0x7F800000));
111 uint32_t is_normal_pos = is_positive & ~is_inf;
112 result.u = (result.u & is_normal_pos) |
113 (0x7F800000u & is_zero) |
114 (0x7FC00000u & ~is_positive & ~is_zero);
116 cVector[number] = result.f;
123#include <xmmintrin.h>
128 unsigned int number = 0;
129 const unsigned int quarterPoints = num_points / 4;
131 float* cPtr = cVector;
132 const float* aPtr = aVector;
133 for (; number < quarterPoints; number++) {
134 __m128 aVal = _mm_load_ps(aPtr);
136 _mm_store_ps(cPtr, cVal);
141 number = quarterPoints * 4;
142 for (; number < num_points; number++) {
143 *cPtr++ = 1.0f / sqrtf(*aPtr++);
149#include <immintrin.h>
155 unsigned int number = 0;
156 const unsigned int eighthPoints = num_points / 8;
158 float* cPtr = cVector;
159 const float* aPtr = aVector;
160 for (; number < eighthPoints; number++) {
161 __m256 aVal = _mm256_load_ps(aPtr);
163 _mm256_store_ps(cPtr, cVal);
168 number = eighthPoints * 8;
169 for (; number < num_points; number++) {
170 *cPtr++ = 1.0f / sqrtf(*aPtr++);
177#include <immintrin.h>
181volk_32f_invsqrt_32f_a_avx2(
float* cVector,
const float* aVector,
unsigned int num_points)
183 unsigned int number = 0;
184 const unsigned int eighthPoints = num_points / 8;
186 float* cPtr = cVector;
187 const float* aPtr = aVector;
188 for (; number < eighthPoints; number++) {
189 __m256 aVal = _mm256_load_ps(aPtr);
191 _mm256_store_ps(cPtr, cVal);
196 number = eighthPoints * 8;
197 for (; number < num_points; number++) {
198 *cPtr++ = 1.0f / sqrtf(*aPtr++);
204#ifdef LV_HAVE_AVX512F
205#include <immintrin.h>
208static inline void volk_32f_invsqrt_32f_a_avx512f(
float* cVector,
209 const float* aVector,
210 unsigned int num_points)
212 unsigned int number = 0;
213 const unsigned int sixteenthPoints = num_points / 16;
215 float* cPtr = cVector;
216 const float* aPtr = aVector;
217 for (; number < sixteenthPoints; number++) {
218 __m512 aVal = _mm512_load_ps(aPtr);
220 _mm512_store_ps(cPtr, cVal);
225 number = sixteenthPoints * 16;
226 for (; number < num_points; number++) {
227 *cPtr++ = 1.0f / sqrtf(*aPtr++);
234#include <xmmintrin.h>
239 unsigned int number = 0;
240 const unsigned int quarterPoints = num_points / 4;
242 float* cPtr = cVector;
243 const float* aPtr = aVector;
244 for (; number < quarterPoints; number++) {
245 __m128 aVal = _mm_loadu_ps(aPtr);
247 _mm_storeu_ps(cPtr, cVal);
252 number = quarterPoints * 4;
253 for (; number < num_points; number++) {
254 *cPtr++ = 1.0f / sqrtf(*aPtr++);
261#include <immintrin.h>
267 unsigned int number = 0;
268 const unsigned int eighthPoints = num_points / 8;
270 float* cPtr = cVector;
271 const float* aPtr = aVector;
272 for (; number < eighthPoints; number++) {
273 __m256 aVal = _mm256_loadu_ps(aPtr);
275 _mm256_storeu_ps(cPtr, cVal);
280 number = eighthPoints * 8;
281 for (; number < num_points; number++) {
282 *cPtr++ = 1.0f / sqrtf(*aPtr++);
288#include <immintrin.h>
292volk_32f_invsqrt_32f_u_avx2(
float* cVector,
const float* aVector,
unsigned int num_points)
294 unsigned int number = 0;
295 const unsigned int eighthPoints = num_points / 8;
297 float* cPtr = cVector;
298 const float* aPtr = aVector;
299 for (; number < eighthPoints; number++) {
300 __m256 aVal = _mm256_loadu_ps(aPtr);
302 _mm256_storeu_ps(cPtr, cVal);
307 number = eighthPoints * 8;
308 for (; number < num_points; number++) {
309 *cPtr++ = 1.0f / sqrtf(*aPtr++);
314#ifdef LV_HAVE_AVX512F
315#include <immintrin.h>
318static inline void volk_32f_invsqrt_32f_u_avx512f(
float* cVector,
319 const float* aVector,
320 unsigned int num_points)
322 unsigned int number = 0;
323 const unsigned int sixteenthPoints = num_points / 16;
325 float* cPtr = cVector;
326 const float* aPtr = aVector;
327 for (; number < sixteenthPoints; number++) {
328 __m512 aVal = _mm512_loadu_ps(aPtr);
330 _mm512_storeu_ps(cPtr, cVal);
335 number = sixteenthPoints * 16;
336 for (; number < num_points; number++) {
337 *cPtr++ = 1.0f / sqrtf(*aPtr++);
350 const unsigned int quarter_points = num_points / 4;
352 float* cPtr = cVector;
353 const float* aPtr = aVector;
354 for (number = 0; number < quarter_points; ++number) {
355 float32x4_t a_val = vld1q_f32(aPtr);
357 vst1q_f32(cPtr, c_val);
362 for (number = quarter_points * 4; number < num_points; number++) {
363 *cPtr++ = 1.0f / sqrtf(*aPtr++);
372volk_32f_invsqrt_32f_neonv8(
float* cVector,
const float* aVector,
unsigned int num_points)
375 const unsigned int quarter_points = num_points / 4;
377 float* cPtr = cVector;
378 const float* aPtr = aVector;
381 for (number = 0; number < quarter_points; ++number) {
382 float32x4_t a = vld1q_f32(aPtr);
383 float32x4_t x0 = vrsqrteq_f32(a);
386 float32x4_t x1 = vmulq_f32(x0, vrsqrtsq_f32(vmulq_f32(a, x0), x0));
387 x1 = vmulq_f32(x1, vrsqrtsq_f32(vmulq_f32(a, x1), x1));
391 uint32x4_t a_bits = vreinterpretq_u32_f32(a);
392 uint32x4_t zero_mask = vceqq_u32(a_bits, vdupq_n_u32(0x00000000));
393 uint32x4_t inf_mask = vceqq_u32(a_bits, vdupq_n_u32(0x7F800000));
394 uint32x4_t special_mask = vorrq_u32(zero_mask, inf_mask);
396 vst1q_f32(cPtr, vbslq_f32(special_mask, x0, x1));
401 for (number = quarter_points * 4; number < num_points; number++) {
402 *cPtr++ = 1.0f / sqrtf(*aPtr++);
408#include <riscv_vector.h>
411volk_32f_invsqrt_32f_rvv(
float* cVector,
const float* aVector,
unsigned int num_points)
413 size_t n = num_points;
414 for (
size_t vl; n > 0; n -= vl, aVector += vl, cVector += vl) {
415 vl = __riscv_vsetvl_e32m8(n);
416 vfloat32m8_t a = __riscv_vle32_v_f32m8(aVector, vl);
417 vfloat32m8_t half = __riscv_vfmv_v_f_f32m8(0.5f, vl);
418 vfloat32m8_t three_halfs = __riscv_vfmv_v_f_f32m8(1.5f, vl);
421 vfloat32m8_t x0 = __riscv_vfrsqrt7(a, vl);
424 vfloat32m8_t half_a = __riscv_vfmul(half, a, vl);
425 vfloat32m8_t x1 = __riscv_vfmul(
426 x0, __riscv_vfnmsac(three_halfs, half_a, __riscv_vfmul(x0, x0, vl), vl), vl);
428 x1, __riscv_vfnmsac(three_halfs, half_a, __riscv_vfmul(x1, x1, vl), vl), vl);
432 vuint32m8_t a_bits = __riscv_vreinterpret_v_f32m8_u32m8(a);
433 vbool4_t zero_mask = __riscv_vmseq_vx_u32m8_b4(a_bits, 0x00000000, vl);
434 vbool4_t inf_mask = __riscv_vmseq_vx_u32m8_b4(a_bits, 0x7F800000, vl);
435 vbool4_t special_mask = __riscv_vmor_mm_b4(zero_mask, inf_mask, vl);
436 vfloat32m8_t result = __riscv_vmerge_vvm_f32m8(x1, x0, special_mask, vl);
438 __riscv_vse32(cVector, result, vl);