42#ifndef INCLUDED_volk_16ic_s32f_magnitude_32f_a_H
43#define INCLUDED_volk_16ic_s32f_magnitude_32f_a_H
53static inline void volk_16ic_s32f_magnitude_32f_a_avx2(
float* magnitudeVector,
56 unsigned int num_points)
58 unsigned int number = 0;
59 const unsigned int eighthPoints = num_points / 8;
61 const int16_t* complexVectorPtr = (
const int16_t*)complexVector;
62 float* magnitudeVectorPtr = magnitudeVector;
64 __m256 invScalar = _mm256_set1_ps(1.0 / scalar);
66 __m256 cplxValue1, cplxValue2, result;
68 __m128i short1, short2;
69 __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
71 for (; number < eighthPoints; number++) {
73 int1 = _mm256_loadu_si256((__m256i*)complexVectorPtr);
74 complexVectorPtr += 16;
75 short1 = _mm256_extracti128_si256(int1, 0);
76 short2 = _mm256_extracti128_si256(int1, 1);
78 int1 = _mm256_cvtepi16_epi32(short1);
79 int2 = _mm256_cvtepi16_epi32(short2);
80 cplxValue1 = _mm256_cvtepi32_ps(int1);
81 cplxValue2 = _mm256_cvtepi32_ps(int2);
83 cplxValue1 = _mm256_mul_ps(cplxValue1, invScalar);
84 cplxValue2 = _mm256_mul_ps(cplxValue2, invScalar);
86 cplxValue1 = _mm256_mul_ps(cplxValue1, cplxValue1);
87 cplxValue2 = _mm256_mul_ps(cplxValue2, cplxValue2);
89 result = _mm256_hadd_ps(cplxValue1, cplxValue2);
90 result = _mm256_permutevar8x32_ps(result, idx);
92 result = _mm256_sqrt_ps(result);
94 _mm256_store_ps(magnitudeVectorPtr, result);
96 magnitudeVectorPtr += 8;
99 number = eighthPoints * 8;
100 magnitudeVectorPtr = &magnitudeVector[number];
101 complexVectorPtr = (
const int16_t*)&complexVector[number];
102 for (; number < num_points; number++) {
103 float val1Real = (float)(*complexVectorPtr++) / scalar;
104 float val1Imag = (float)(*complexVectorPtr++) / scalar;
105 *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag));
112#include <pmmintrin.h>
117 unsigned int num_points)
119 unsigned int number = 0;
120 const unsigned int quarterPoints = num_points / 4;
122 const int16_t* complexVectorPtr = (
const int16_t*)complexVector;
123 float* magnitudeVectorPtr = magnitudeVector;
125 __m128 invScalar = _mm_set_ps1(1.0 / scalar);
127 __m128 cplxValue1, cplxValue2, result;
131 for (; number < quarterPoints; number++) {
133 inputFloatBuffer[0] = (float)(complexVectorPtr[0]);
134 inputFloatBuffer[1] = (float)(complexVectorPtr[1]);
135 inputFloatBuffer[2] = (float)(complexVectorPtr[2]);
136 inputFloatBuffer[3] = (float)(complexVectorPtr[3]);
138 inputFloatBuffer[4] = (float)(complexVectorPtr[4]);
139 inputFloatBuffer[5] = (float)(complexVectorPtr[5]);
140 inputFloatBuffer[6] = (float)(complexVectorPtr[6]);
141 inputFloatBuffer[7] = (float)(complexVectorPtr[7]);
143 cplxValue1 = _mm_load_ps(&inputFloatBuffer[0]);
144 cplxValue2 = _mm_load_ps(&inputFloatBuffer[4]);
146 complexVectorPtr += 8;
148 cplxValue1 = _mm_mul_ps(cplxValue1, invScalar);
149 cplxValue2 = _mm_mul_ps(cplxValue2, invScalar);
151 cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1);
152 cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2);
154 result = _mm_hadd_ps(cplxValue1, cplxValue2);
156 result = _mm_sqrt_ps(result);
158 _mm_store_ps(magnitudeVectorPtr, result);
160 magnitudeVectorPtr += 4;
163 number = quarterPoints * 4;
164 magnitudeVectorPtr = &magnitudeVector[number];
165 complexVectorPtr = (
const int16_t*)&complexVector[number];
166 for (; number < num_points; number++) {
167 float val1Real = (float)(*complexVectorPtr++) / scalar;
168 float val1Imag = (float)(*complexVectorPtr++) / scalar;
169 *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag));
175#include <xmmintrin.h>
180 unsigned int num_points)
182 unsigned int number = 0;
183 const unsigned int quarterPoints = num_points / 4;
185 const int16_t* complexVectorPtr = (
const int16_t*)complexVector;
186 float* magnitudeVectorPtr = magnitudeVector;
188 const float iScalar = 1.0 / scalar;
189 __m128 invScalar = _mm_set_ps1(iScalar);
191 __m128 cplxValue1, cplxValue2, result, re, im;
195 for (; number < quarterPoints; number++) {
196 inputFloatBuffer[0] = (float)(complexVectorPtr[0]);
197 inputFloatBuffer[1] = (float)(complexVectorPtr[1]);
198 inputFloatBuffer[2] = (float)(complexVectorPtr[2]);
199 inputFloatBuffer[3] = (float)(complexVectorPtr[3]);
201 inputFloatBuffer[4] = (float)(complexVectorPtr[4]);
202 inputFloatBuffer[5] = (float)(complexVectorPtr[5]);
203 inputFloatBuffer[6] = (float)(complexVectorPtr[6]);
204 inputFloatBuffer[7] = (float)(complexVectorPtr[7]);
206 cplxValue1 = _mm_load_ps(&inputFloatBuffer[0]);
207 cplxValue2 = _mm_load_ps(&inputFloatBuffer[4]);
209 re = _mm_shuffle_ps(cplxValue1, cplxValue2, 0x88);
210 im = _mm_shuffle_ps(cplxValue1, cplxValue2, 0xdd);
212 complexVectorPtr += 8;
214 cplxValue1 = _mm_mul_ps(re, invScalar);
215 cplxValue2 = _mm_mul_ps(im, invScalar);
217 cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1);
218 cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2);
220 result = _mm_add_ps(cplxValue1, cplxValue2);
222 result = _mm_sqrt_ps(result);
224 _mm_store_ps(magnitudeVectorPtr, result);
226 magnitudeVectorPtr += 4;
229 number = quarterPoints * 4;
230 magnitudeVectorPtr = &magnitudeVector[number];
231 complexVectorPtr = (
const int16_t*)&complexVector[number];
232 for (; number < num_points; number++) {
233 float val1Real = (float)(*complexVectorPtr++) * iScalar;
234 float val1Imag = (float)(*complexVectorPtr++) * iScalar;
235 *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag));
242#ifdef LV_HAVE_GENERIC
247 unsigned int num_points)
249 const int16_t* complexVectorPtr = (
const int16_t*)complexVector;
250 float* magnitudeVectorPtr = magnitudeVector;
251 unsigned int number = 0;
252 const float invScalar = 1.0 / scalar;
253 for (number = 0; number < num_points; number++) {
254 float real = ((float)(*complexVectorPtr++)) * invScalar;
255 float imag = ((float)(*complexVectorPtr++)) * invScalar;
256 *magnitudeVectorPtr++ = sqrtf((real * real) + (imag * imag));
264#ifndef INCLUDED_volk_16ic_s32f_magnitude_32f_u_H
265#define INCLUDED_volk_16ic_s32f_magnitude_32f_u_H
273#include <immintrin.h>
275static inline void volk_16ic_s32f_magnitude_32f_u_avx2(
float* magnitudeVector,
278 unsigned int num_points)
280 unsigned int number = 0;
281 const unsigned int eighthPoints = num_points / 8;
283 const int16_t* complexVectorPtr = (
const int16_t*)complexVector;
284 float* magnitudeVectorPtr = magnitudeVector;
286 __m256 invScalar = _mm256_set1_ps(1.0 / scalar);
288 __m256 cplxValue1, cplxValue2, result;
290 __m128i short1, short2;
291 __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
293 for (; number < eighthPoints; number++) {
295 int1 = _mm256_loadu_si256((__m256i*)complexVectorPtr);
296 complexVectorPtr += 16;
297 short1 = _mm256_extracti128_si256(int1, 0);
298 short2 = _mm256_extracti128_si256(int1, 1);
300 int1 = _mm256_cvtepi16_epi32(short1);
301 int2 = _mm256_cvtepi16_epi32(short2);
302 cplxValue1 = _mm256_cvtepi32_ps(int1);
303 cplxValue2 = _mm256_cvtepi32_ps(int2);
305 cplxValue1 = _mm256_mul_ps(cplxValue1, invScalar);
306 cplxValue2 = _mm256_mul_ps(cplxValue2, invScalar);
308 cplxValue1 = _mm256_mul_ps(cplxValue1, cplxValue1);
309 cplxValue2 = _mm256_mul_ps(cplxValue2, cplxValue2);
311 result = _mm256_hadd_ps(cplxValue1, cplxValue2);
312 result = _mm256_permutevar8x32_ps(result, idx);
314 result = _mm256_sqrt_ps(result);
316 _mm256_storeu_ps(magnitudeVectorPtr, result);
318 magnitudeVectorPtr += 8;
321 number = eighthPoints * 8;
322 magnitudeVectorPtr = &magnitudeVector[number];
323 complexVectorPtr = (
const int16_t*)&complexVector[number];
324 for (; number < num_points; number++) {
325 float val1Real = (float)(*complexVectorPtr++) / scalar;
326 float val1Imag = (float)(*complexVectorPtr++) / scalar;
327 *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag));
338 unsigned int num_points)
340 unsigned int number = 0;
341 const unsigned int quarter_points = num_points / 4;
343 const int16_t* complexVectorPtr = (
const int16_t*)complexVector;
344 float* magnitudeVectorPtr = magnitudeVector;
345 const float invScalar = 1.0f / scalar;
346 float32x4_t vInvScalar = vdupq_n_f32(invScalar);
348 for (; number < quarter_points; number++) {
349 int16x4x2_t input = vld2_s16(complexVectorPtr);
350 complexVectorPtr += 8;
352 int32x4_t realInt = vmovl_s16(input.val[0]);
353 int32x4_t imagInt = vmovl_s16(input.val[1]);
355 float32x4_t realFloat = vcvtq_f32_s32(realInt);
356 float32x4_t imagFloat = vcvtq_f32_s32(imagInt);
358 realFloat = vmulq_f32(realFloat, vInvScalar);
359 imagFloat = vmulq_f32(imagFloat, vInvScalar);
361 float32x4_t realSquared = vmulq_f32(realFloat, realFloat);
362 float32x4_t imagSquared = vmulq_f32(imagFloat, imagFloat);
363 float32x4_t sumSquared = vaddq_f32(realSquared, imagSquared);
366 float32x4_t rsqrt = vrsqrteq_f32(sumSquared);
367 rsqrt = vmulq_f32(rsqrt, vrsqrtsq_f32(vmulq_f32(sumSquared, rsqrt), rsqrt));
368 rsqrt = vmulq_f32(rsqrt, vrsqrtsq_f32(vmulq_f32(sumSquared, rsqrt), rsqrt));
369 float32x4_t result = vmulq_f32(sumSquared, rsqrt);
372 uint32x4_t zero_mask = vceqq_f32(sumSquared, vdupq_n_f32(0.0f));
373 result = vbslq_f32(zero_mask, sumSquared, result);
375 vst1q_f32(magnitudeVectorPtr, result);
376 magnitudeVectorPtr += 4;
379 number = quarter_points * 4;
380 complexVectorPtr = (
const int16_t*)&complexVector[number];
381 for (; number < num_points; number++) {
382 float real = ((float)(*complexVectorPtr++)) * invScalar;
383 float imag = ((float)(*complexVectorPtr++)) * invScalar;
384 *magnitudeVectorPtr++ = sqrtf((real * real) + (imag * imag));
392static inline void volk_16ic_s32f_magnitude_32f_neonv8(
float* magnitudeVector,
395 unsigned int num_points)
397 unsigned int number = 0;
398 const unsigned int eighth_points = num_points / 8;
400 const int16_t* complexVectorPtr = (
const int16_t*)complexVector;
401 float* magnitudeVectorPtr = magnitudeVector;
402 const float invScalar = 1.0f / scalar;
403 float32x4_t vInvScalar = vdupq_n_f32(invScalar);
405 for (; number < eighth_points; number++) {
406 int16x8x2_t input = vld2q_s16(complexVectorPtr);
407 complexVectorPtr += 16;
411 int32x4_t realInt0 = vmovl_s16(vget_low_s16(input.val[0]));
412 int32x4_t imagInt0 = vmovl_s16(vget_low_s16(input.val[1]));
414 float32x4_t realFloat0 = vcvtq_f32_s32(realInt0);
415 float32x4_t imagFloat0 = vcvtq_f32_s32(imagInt0);
417 realFloat0 = vmulq_f32(realFloat0, vInvScalar);
418 imagFloat0 = vmulq_f32(imagFloat0, vInvScalar);
420 float32x4_t sumSquared0 =
421 vfmaq_f32(vmulq_f32(imagFloat0, imagFloat0), realFloat0, realFloat0);
422 float32x4_t result0 = vsqrtq_f32(sumSquared0);
425 int32x4_t realInt1 = vmovl_s16(vget_high_s16(input.val[0]));
426 int32x4_t imagInt1 = vmovl_s16(vget_high_s16(input.val[1]));
428 float32x4_t realFloat1 = vcvtq_f32_s32(realInt1);
429 float32x4_t imagFloat1 = vcvtq_f32_s32(imagInt1);
431 realFloat1 = vmulq_f32(realFloat1, vInvScalar);
432 imagFloat1 = vmulq_f32(imagFloat1, vInvScalar);
434 float32x4_t sumSquared1 =
435 vfmaq_f32(vmulq_f32(imagFloat1, imagFloat1), realFloat1, realFloat1);
436 float32x4_t result1 = vsqrtq_f32(sumSquared1);
438 vst1q_f32(magnitudeVectorPtr, result0);
439 vst1q_f32(magnitudeVectorPtr + 4, result1);
440 magnitudeVectorPtr += 8;
443 number = eighth_points * 8;
444 complexVectorPtr = (
const int16_t*)&complexVector[number];
445 for (; number < num_points; number++) {
446 float real = ((float)(*complexVectorPtr++)) * invScalar;
447 float imag = ((float)(*complexVectorPtr++)) * invScalar;
448 *magnitudeVectorPtr++ = sqrtf((real * real) + (imag * imag));
454#include <riscv_vector.h>
456static inline void volk_16ic_s32f_magnitude_32f_rvv(
float* magnitudeVector,
459 unsigned int num_points)
461 size_t n = num_points;
462 for (
size_t vl; n > 0; n -= vl, complexVector += vl, magnitudeVector += vl) {
463 vl = __riscv_vsetvl_e16m4(n);
464 vint32m8_t vc = __riscv_vle32_v_i32m8((
const int32_t*)complexVector, vl);
465 vint16m4_t vr = __riscv_vnsra(vc, 0, vl);
466 vint16m4_t vi = __riscv_vnsra(vc, 16, vl);
467 vfloat32m8_t vrf = __riscv_vfmul(__riscv_vfwcvt_f(vr, vl), 1.0f / scalar, vl);
468 vfloat32m8_t vif = __riscv_vfmul(__riscv_vfwcvt_f(vi, vl), 1.0f / scalar, vl);
469 vfloat32m8_t vf = __riscv_vfmacc(__riscv_vfmul(vif, vif, vl), vrf, vrf, vl);
470 __riscv_vse32(magnitudeVector, __riscv_vfsqrt(vf, vl), vl);
476#include <riscv_vector.h>
478static inline void volk_16ic_s32f_magnitude_32f_rvvseg(
float* magnitudeVector,
481 unsigned int num_points)
483 size_t n = num_points;
484 for (
size_t vl; n > 0; n -= vl, complexVector += vl, magnitudeVector += vl) {
485 vl = __riscv_vsetvl_e16m4(n);
486 vint16m4x2_t vc = __riscv_vlseg2e16_v_i16m4x2((
const int16_t*)complexVector, vl);
487 vint16m4_t vr = __riscv_vget_i16m4(vc, 0);
488 vint16m4_t vi = __riscv_vget_i16m4(vc, 1);
489 vfloat32m8_t vrf = __riscv_vfmul(__riscv_vfwcvt_f(vr, vl), 1.0f / scalar, vl);
490 vfloat32m8_t vif = __riscv_vfmul(__riscv_vfwcvt_f(vi, vl), 1.0f / scalar, vl);
491 vfloat32m8_t vf = __riscv_vfmacc(__riscv_vfmul(vif, vif, vl), vrf, vrf, vl);
492 __riscv_vse32(magnitudeVector, __riscv_vfsqrt(vf, vl), vl);