58#ifndef INCLUDED_volk_32fc_magnitude_32f_u_H
59#define INCLUDED_volk_32fc_magnitude_32f_u_H
69 unsigned int num_points)
/* Scalar loop: for each of the num_points entries, read two consecutive
 * floats (real, then imag) and write sqrtf(real^2 + imag^2). */
/* NOTE(review): the input is cast to plain float* and then stored in a
 * const float*; a (const float*) cast would avoid dropping the qualifier if
 * complexVector is const-qualified -- confirm against the signature. */
71 const float* complexVectorPtr = (
float*)complexVector;
72 float* magnitudeVectorPtr = magnitudeVector;
73 unsigned int number = 0;
74 for (number = 0; number < num_points; number++) {
/* Both pointers advance as they are used: 2 floats in, 1 float out. */
75 const float real = *complexVectorPtr++;
76 const float imag = *complexVectorPtr++;
77 *magnitudeVectorPtr++ = sqrtf((real * real) + (imag * imag));
/*
 * AVX-512F magnitude kernel using unaligned load/store intrinsics.
 *
 * Each vector iteration loads 16 floats with _mm512_loadu_ps, forms
 * i*i + q*q with a fused multiply-add, takes the square root and stores the
 * low 8 floats of the result. num_points / 8 vector iterations are run; the
 * remaining num_points % 8 points are handed to a trailing call.
 */
85static inline void volk_32fc_magnitude_32f_u_avx512f(
float* magnitudeVector,
87 unsigned int num_points)
89 unsigned int number = 0;
/* Number of full vector iterations (8 output floats each). */
90 const unsigned int eighthPoints = num_points / 8;
/* The input is read as a flat float array. */
92 const float* complexVectorPtr = (
float*)complexVector;
93 float* magnitudeVectorPtr = magnitudeVector;
96 __m512 iValues, qValues;
99 for (; number < eighthPoints; number++) {
101 cplxValue = _mm512_loadu_ps(complexVectorPtr);
/* The two permutes use index vectors that place the even-indexed floats
 * (0, 2, ..., 14) resp. the odd-indexed floats (1, 3, ..., 15) in the low
 * 8 lanes. The upper 8 lanes all use index 0; they are never stored because
 * only the low 256 bits of the result are written below.
 * NOTE(review): the permute source operand is not visible here --
 * presumably cplxValue; confirm. */
105 iValues = _mm512_permutexvar_ps(
106 _mm512_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14, 0, 0, 0, 0, 0, 0, 0, 0),
109 qValues = _mm512_permutexvar_ps(
110 _mm512_setr_epi32(1, 3, 5, 7, 9, 11, 13, 15, 0, 0, 0, 0, 0, 0, 0, 0),
/* result = iValues * iValues + qValues * qValues */
114 result = _mm512_fmadd_ps(iValues, iValues, _mm512_mul_ps(qValues, qValues));
117 result = _mm512_sqrt_ps(result);
/* Keep only the low 8 floats of the 512-bit result; unaligned store. */
120 _mm256_storeu_ps(magnitudeVectorPtr, _mm512_castps512_ps256(result));
/* 16 input floats consumed, 8 output floats produced per iteration. */
122 complexVectorPtr += 16;
123 magnitudeVectorPtr += 8;
/* Tail: the points not covered by the vector loop (num_points - number of
 * them) are passed on together with the advanced pointers.
 * NOTE(review): the callee name is not visible here -- presumably the
 * scalar/generic kernel; confirm. */
126 number = eighthPoints * 8;
128 magnitudeVectorPtr, (
const lv_32fc_t*)complexVectorPtr, num_points - number);
133#include <immintrin.h>
138 unsigned int num_points)
/* 256-bit kernel using unaligned load/store intrinsics: a vector loop that
 * consumes 16 input floats and stores 8 output floats per iteration,
 * followed by a scalar sqrtf loop for the remaining num_points % 8 points. */
140 unsigned int number = 0;
/* Number of full vector iterations. */
141 const unsigned int eighthPoints = num_points / 8;
/* The input is read as a flat float array. */
143 const float* complexVectorPtr = (
float*)complexVector;
144 float* magnitudeVectorPtr = magnitudeVector;
146 __m256 cplxValue1, cplxValue2, result;
148 for (; number < eighthPoints; number++) {
/* Two unaligned loads fetch floats [0..7] and [8..15] of the current chunk. */
149 cplxValue1 = _mm256_loadu_ps(complexVectorPtr);
150 cplxValue2 = _mm256_loadu_ps(complexVectorPtr + 8);
/* NOTE(review): the statement that computes `result` from cplxValue1 and
 * cplxValue2 is not visible here -- confirm it precedes this store. */
152 _mm256_storeu_ps(magnitudeVectorPtr, result);
154 complexVectorPtr += 16;
155 magnitudeVectorPtr += 8;
/* Scalar tail: continue from the first point the vector loop did not cover. */
158 number = eighthPoints * 8;
159 for (; number < num_points; number++) {
160 float val1Real = *complexVectorPtr++;
161 float val1Imag = *complexVectorPtr++;
162 *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag));
168#include <pmmintrin.h>
173 unsigned int num_points)
/* 128-bit kernel using unaligned load/store intrinsics: a vector loop that
 * consumes 8 input floats and stores 4 output floats per iteration,
 * followed by a scalar sqrtf loop for the remaining num_points % 4 points. */
175 unsigned int number = 0;
/* Number of full vector iterations. */
176 const unsigned int quarterPoints = num_points / 4;
/* The input is read as a flat float array. */
178 const float* complexVectorPtr = (
float*)complexVector;
179 float* magnitudeVectorPtr = magnitudeVector;
181 __m128 cplxValue1, cplxValue2, result;
182 for (; number < quarterPoints; number++) {
/* Two back-to-back unaligned loads of 4 floats each; the input pointer is
 * advanced after each load. */
183 cplxValue1 = _mm_loadu_ps(complexVectorPtr);
184 complexVectorPtr += 4;
186 cplxValue2 = _mm_loadu_ps(complexVectorPtr);
187 complexVectorPtr += 4;
/* NOTE(review): the statements that compute `result` from cplxValue1 and
 * cplxValue2 are not visible here -- confirm they precede this store. */
191 _mm_storeu_ps(magnitudeVectorPtr, result);
192 magnitudeVectorPtr += 4;
/* Scalar tail: continue from the first point the vector loop did not cover. */
195 number = quarterPoints * 4;
196 for (; number < num_points; number++) {
197 float val1Real = *complexVectorPtr++;
198 float val1Imag = *complexVectorPtr++;
199 *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag));
207#include <xmmintrin.h>
211 unsigned int num_points)
/* 128-bit kernel using unaligned load/store intrinsics: each vector
 * iteration reads 8 input floats and writes 4 output floats; a scalar sqrtf
 * loop then handles the remaining num_points % 4 points. */
213 unsigned int number = 0;
/* Number of full vector iterations. */
214 const unsigned int quarterPoints = num_points / 4;
/* The input is read as a flat float array. */
216 const float* complexVectorPtr = (
float*)complexVector;
217 float* magnitudeVectorPtr = magnitudeVector;
219 __m128 cplxValue1, cplxValue2, result;
221 for (; number < quarterPoints; number++) {
/* Load floats [0..3] and [4..7] of the current chunk (unaligned loads). */
222 cplxValue1 = _mm_loadu_ps(complexVectorPtr);
223 complexVectorPtr += 4;
225 cplxValue2 = _mm_loadu_ps(complexVectorPtr);
226 complexVectorPtr += 4;
/* NOTE(review): the statement that computes `result` from cplxValue1 and
 * cplxValue2 is not visible here -- confirm it precedes this store. */
229 _mm_storeu_ps(magnitudeVectorPtr, result);
230 magnitudeVectorPtr += 4;
/* Scalar tail: continue from the first point the vector loop did not cover. */
233 number = quarterPoints * 4;
234 for (; number < num_points; number++) {
235 float val1Real = *complexVectorPtr++;
236 float val1Imag = *complexVectorPtr++;
237 *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag));
243#ifndef INCLUDED_volk_32fc_magnitude_32f_a_H
244#define INCLUDED_volk_32fc_magnitude_32f_a_H
250#ifdef LV_HAVE_AVX512F
251#include <immintrin.h>
/*
 * AVX-512F magnitude kernel using aligned load/store intrinsics.
 *
 * Each vector iteration loads 16 floats with _mm512_load_ps, forms
 * i*i + q*q with a fused multiply-add, takes the square root and stores the
 * low 8 floats with _mm256_store_ps. These intrinsics require aligned
 * addresses (64 bytes for the 512-bit load, 32 bytes for the 256-bit store),
 * so both buffers must be suitably aligned by the caller. The remaining
 * num_points % 8 points are handed to a trailing call.
 */
253static inline void volk_32fc_magnitude_32f_a_avx512f(
float* magnitudeVector,
255 unsigned int num_points)
257 unsigned int number = 0;
/* Number of full vector iterations (8 output floats each). */
258 const unsigned int eighthPoints = num_points / 8;
/* The input is read as a flat float array. */
260 const float* complexVectorPtr = (
float*)complexVector;
261 float* magnitudeVectorPtr = magnitudeVector;
264 __m512 iValues, qValues;
267 for (; number < eighthPoints; number++) {
269 cplxValue = _mm512_load_ps(complexVectorPtr);
/* The index vectors place the even-indexed floats (0, 2, ..., 14) resp. the
 * odd-indexed floats (1, 3, ..., 15) in the low 8 lanes. The upper 8 lanes
 * all use index 0 and are discarded, since only the low 256 bits are stored.
 * NOTE(review): the permute source operand is not visible here --
 * presumably cplxValue; confirm. */
272 iValues = _mm512_permutexvar_ps(
273 _mm512_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14, 0, 0, 0, 0, 0, 0, 0, 0),
275 qValues = _mm512_permutexvar_ps(
276 _mm512_setr_epi32(1, 3, 5, 7, 9, 11, 13, 15, 0, 0, 0, 0, 0, 0, 0, 0),
/* result = iValues * iValues + qValues * qValues */
280 result = _mm512_fmadd_ps(iValues, iValues, _mm512_mul_ps(qValues, qValues));
283 result = _mm512_sqrt_ps(result);
/* Keep only the low 8 floats of the 512-bit result; aligned store. */
286 _mm256_store_ps(magnitudeVectorPtr, _mm512_castps512_ps256(result));
/* 16 input floats consumed, 8 output floats produced per iteration. */
288 complexVectorPtr += 16;
289 magnitudeVectorPtr += 8;
/* Tail: the points not covered by the vector loop (num_points - number of
 * them) are passed on together with the advanced pointers.
 * NOTE(review): the callee name is not visible here -- presumably the
 * scalar/generic kernel; confirm. */
292 number = eighthPoints * 8;
294 magnitudeVectorPtr, (
const lv_32fc_t*)complexVectorPtr, num_points - number);
299#include <immintrin.h>
304 unsigned int num_points)
/* 256-bit kernel using aligned load/store intrinsics (_mm256_load_ps /
 * _mm256_store_ps, which require 32-byte aligned addresses): each vector
 * iteration reads 16 input floats and writes 8 output floats; a scalar sqrtf
 * loop then handles the remaining num_points % 8 points. */
306 unsigned int number = 0;
/* Number of full vector iterations. */
307 const unsigned int eighthPoints = num_points / 8;
/* The input is read as a flat float array. */
309 const float* complexVectorPtr = (
float*)complexVector;
310 float* magnitudeVectorPtr = magnitudeVector;
312 __m256 cplxValue1, cplxValue2, result;
313 for (; number < eighthPoints; number++) {
/* Two back-to-back aligned loads of 8 floats each; the input pointer is
 * advanced after each load. */
314 cplxValue1 = _mm256_load_ps(complexVectorPtr);
315 complexVectorPtr += 8;
317 cplxValue2 = _mm256_load_ps(complexVectorPtr);
318 complexVectorPtr += 8;
/* NOTE(review): the statement that computes `result` from cplxValue1 and
 * cplxValue2 is not visible here -- confirm it precedes this store. */
321 _mm256_store_ps(magnitudeVectorPtr, result);
322 magnitudeVectorPtr += 8;
/* Scalar tail: continue from the first point the vector loop did not cover. */
325 number = eighthPoints * 8;
326 for (; number < num_points; number++) {
327 float val1Real = *complexVectorPtr++;
328 float val1Imag = *complexVectorPtr++;
329 *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag));
335#include <pmmintrin.h>
340 unsigned int num_points)
/* 128-bit kernel using aligned load/store intrinsics (_mm_load_ps /
 * _mm_store_ps, which require 16-byte aligned addresses): each vector
 * iteration reads 8 input floats and writes 4 output floats; a scalar sqrtf
 * loop then handles the remaining num_points % 4 points. */
342 unsigned int number = 0;
/* Number of full vector iterations. */
343 const unsigned int quarterPoints = num_points / 4;
/* The input is read as a flat float array. */
345 const float* complexVectorPtr = (
float*)complexVector;
346 float* magnitudeVectorPtr = magnitudeVector;
348 __m128 cplxValue1, cplxValue2, result;
349 for (; number < quarterPoints; number++) {
/* Two back-to-back aligned loads of 4 floats each. */
350 cplxValue1 = _mm_load_ps(complexVectorPtr);
351 complexVectorPtr += 4;
353 cplxValue2 = _mm_load_ps(complexVectorPtr);
354 complexVectorPtr += 4;
/* NOTE(review): the statements that compute `result` from cplxValue1 and
 * cplxValue2 are not visible here -- confirm they precede this store. */
357 _mm_store_ps(magnitudeVectorPtr, result);
358 magnitudeVectorPtr += 4;
/* Scalar tail: continue from the first point the vector loop did not cover. */
361 number = quarterPoints * 4;
362 for (; number < num_points; number++) {
363 float val1Real = *complexVectorPtr++;
364 float val1Imag = *complexVectorPtr++;
365 *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag));
372#include <xmmintrin.h>
376 unsigned int num_points)
/* 128-bit kernel using aligned load/store intrinsics (_mm_load_ps /
 * _mm_store_ps, which require 16-byte aligned addresses): a vector loop over
 * num_points / 4 iterations (8 floats in, 4 floats out each), then a scalar
 * sqrtf loop for the leftover points. */
378 unsigned int number = 0;
/* Number of full vector iterations. */
379 const unsigned int quarterPoints = num_points / 4;
/* The input is read as a flat float array. */
381 const float* complexVectorPtr = (
float*)complexVector;
382 float* magnitudeVectorPtr = magnitudeVector;
384 __m128 cplxValue1, cplxValue2, result;
385 for (; number < quarterPoints; number++) {
/* Load floats [0..3] and [4..7] of the current chunk (aligned loads). */
386 cplxValue1 = _mm_load_ps(complexVectorPtr);
387 complexVectorPtr += 4;
389 cplxValue2 = _mm_load_ps(complexVectorPtr);
390 complexVectorPtr += 4;
/* NOTE(review): the statement that computes `result` from cplxValue1 and
 * cplxValue2 is not visible here -- confirm it precedes this store. */
393 _mm_store_ps(magnitudeVectorPtr, result);
394 magnitudeVectorPtr += 4;
/* Scalar tail: continue from the first point the vector loop did not cover. */
397 number = quarterPoints * 4;
398 for (; number < num_points; number++) {
399 float val1Real = *complexVectorPtr++;
400 float val1Imag = *complexVectorPtr++;
401 *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag));
412 unsigned int num_points)
/* NEON kernel: processes 4 points per vector iteration (8 floats in,
 * 4 floats out) and finishes the remaining num_points % 4 points with a
 * scalar sqrtf loop. The vector path derives the square root from the
 * reciprocal-square-root and reciprocal estimate instructions, so its
 * results are approximations, whereas the tail uses sqrtf. */
415 unsigned int quarter_points = num_points / 4;
/* The input is read as a flat float array. */
416 const float* complexVectorPtr = (
float*)complexVector;
417 float* magnitudeVectorPtr = magnitudeVector;
419 float32x4x2_t complex_vec;
420 float32x4_t magnitude_vec;
421 for (number = 0; number < quarter_points; number++) {
/* De-interleaving load: even-indexed floats land in val[0], odd-indexed
 * floats in val[1]. */
422 complex_vec = vld2q_f32(complexVectorPtr);
/* val[0] <- val[0]^2, then val[0] + val[1]^2 via multiply-accumulate.
 * NOTE(review): the assignment target of the vmlaq_f32 result is not
 * visible here -- presumably magnitude_vec; confirm. */
423 complex_vec.val[0] = vmulq_f32(complex_vec.val[0], complex_vec.val[0]);
425 vmlaq_f32(complex_vec.val[0], complex_vec.val[1], complex_vec.val[1]);
/* sqrt(x) approximated as 1 / (1 / sqrt(x)) using two estimate
 * instructions, with no refinement step in between. */
426 magnitude_vec = vrsqrteq_f32(magnitude_vec);
427 magnitude_vec = vrecpeq_f32(magnitude_vec);
428 vst1q_f32(magnitudeVectorPtr, magnitude_vec);
430 complexVectorPtr += 8;
431 magnitudeVectorPtr += 4;
/* Scalar tail for the points the vector loop did not cover. */
434 for (number = quarter_points * 4; number < num_points; number++) {
435 const float real = *complexVectorPtr++;
436 const float imag = *complexVectorPtr++;
437 *magnitudeVectorPtr++ = sqrtf((real * real) + (imag * imag));
/*
 * NEON (ARMv8) magnitude kernel.
 *
 * Processes 4 points per vector iteration: a de-interleaving load, a
 * multiply followed by a fused multiply-add to form the squared magnitude,
 * then vsqrtq_f32. The remaining num_points % 4 points are handled by a
 * scalar sqrtf loop.
 */
446static inline void volk_32fc_magnitude_32f_neonv8(
float* magnitudeVector,
448 unsigned int num_points)
451 unsigned int quarter_points = num_points / 4;
/* The input is read as a flat float array. */
452 const float* complexVectorPtr = (
float*)complexVector;
453 float* magnitudeVectorPtr = magnitudeVector;
455 float32x4x2_t complex_vec;
456 float32x4_t magnitude_sq, magnitude_vec;
457 for (number = 0; number < quarter_points; number++) {
/* De-interleaving load: even-indexed floats land in val[0], odd-indexed
 * floats in val[1]. */
458 complex_vec = vld2q_f32(complexVectorPtr);
/* Accumulator is val[0]^2.
 * NOTE(review): the remaining vfmaq_f32 operands are not visible here --
 * presumably val[1], val[1]; confirm. */
461 magnitude_sq = vfmaq_f32(vmulq_f32(complex_vec.val[0], complex_vec.val[0]),
465 magnitude_vec = vsqrtq_f32(magnitude_sq);
466 vst1q_f32(magnitudeVectorPtr, magnitude_vec);
/* 8 input floats consumed, 4 output floats produced per iteration. */
468 complexVectorPtr += 8;
469 magnitudeVectorPtr += 4;
/* Scalar tail for the points the vector loop did not cover. */
472 for (number = quarter_points * 4; number < num_points; number++) {
473 const float real = *complexVectorPtr++;
474 const float imag = *complexVectorPtr++;
475 *magnitudeVectorPtr++ = sqrtf((real * real) + (imag * imag));
499 float* magnitudeVector,
const lv_32fc_t* complexVector,
unsigned int num_points)
/* NEON kernel that approximates the magnitude without a square root.
 *
 * With max = max(|re|, |im|) and min = min(|re|, |im|), each vector lane
 * computes a * max + b * min, where
 *   (a, b) = (0.84, 0.561) if min >  threshold * max
 *   (a, b) = (0.99, 0.197) if min <= threshold * max
 * Four points are processed per vector iteration. The remaining
 * num_points % 4 points are computed with sqrtf, so the tail is not an
 * approximation while the vector path is. */
502 unsigned int quarter_points = num_points / 4;
/* The input is read as a flat float array.
 * NOTE(review): complexVector is const-qualified but is cast to plain
 * float* here; (const float*) would keep the qualifier. */
503 const float* complexVectorPtr = (
float*)complexVector;
504 float* magnitudeVectorPtr = magnitudeVector;
/* Ratio min/max at which the coefficient pair switches (about sqrt(2) - 1). */
506 const float threshold = 0.4142135;
/* Coefficient pairs, broadcast to all four lanes. */
508 float32x4_t a_vec, b_vec, a_high, a_low, b_high, b_low;
509 a_high = vdupq_n_f32(0.84);
510 b_high = vdupq_n_f32(0.561);
511 a_low = vdupq_n_f32(0.99);
512 b_low = vdupq_n_f32(0.197);
/* Per-lane comparison masks. */
514 uint32x4_t comp0, comp1;
516 float32x4x2_t complex_vec;
517 float32x4_t min_vec, max_vec, magnitude_vec;
518 float32x4_t real_abs, imag_abs;
519 for (number = 0; number < quarter_points; number++) {
/* De-interleaving load: even-indexed floats land in val[0], odd-indexed
 * floats in val[1]. */
520 complex_vec = vld2q_f32(complexVectorPtr);
522 real_abs = vabsq_f32(complex_vec.val[0]);
523 imag_abs = vabsq_f32(complex_vec.val[1]);
525 min_vec = vminq_f32(real_abs, imag_abs);
526 max_vec = vmaxq_f32(real_abs, imag_abs);
/* comp0: lanes where min > threshold * max; comp1: the opposite test. */
529 comp0 = vcgtq_f32(min_vec, vmulq_n_f32(max_vec, threshold));
530 comp1 = vcleq_f32(min_vec, vmulq_n_f32(max_vec, threshold));
/* Branch-free select: each coefficient's bit pattern is ANDed with its mask
 * and the two masked values are added as integers, which yields the
 * selected coefficient when exactly one mask is set for a lane. */
533 a_vec = (float32x4_t)vaddq_s32(vandq_s32((int32x4_t)comp0, (int32x4_t)a_high),
534 vandq_s32((int32x4_t)comp1, (int32x4_t)a_low));
535 b_vec = (float32x4_t)vaddq_s32(vandq_s32((int32x4_t)comp0, (int32x4_t)b_high),
536 vandq_s32((int32x4_t)comp1, (int32x4_t)b_low));
/* magnitude ~= b * min + a * max */
539 min_vec = vmulq_f32(min_vec, b_vec);
540 max_vec = vmulq_f32(max_vec, a_vec);
542 magnitude_vec = vaddq_f32(min_vec, max_vec);
543 vst1q_f32(magnitudeVectorPtr, magnitude_vec);
/* 8 input floats consumed, 4 output floats produced per iteration. */
545 complexVectorPtr += 8;
546 magnitudeVectorPtr += 4;
/* Scalar tail for the points the vector loop did not cover (uses sqrtf). */
549 for (number = quarter_points * 4; number < num_points; number++) {
550 const float real = *complexVectorPtr++;
551 const float imag = *complexVectorPtr++;
552 *magnitudeVectorPtr++ = sqrtf((real * real) + (imag * imag));
558#include <riscv_vector.h>
/*
 * RISC-V Vector magnitude kernel (64-bit load + narrowing shifts).
 *
 * Each iteration asks for a vector length vl (e32, LMUL=4) covering up to
 * the n remaining points, loads vl 64-bit elements from the input, splits
 * each into its low and high 32-bit halves, computes
 * sqrt(low*low + high*high) as floats and stores vl results. The loop
 * advances both pointers by vl and ends when n reaches 0, so no separate
 * scalar tail is needed.
 */
560static inline void volk_32fc_magnitude_32f_rvv(
float* magnitudeVector,
562 unsigned int num_points)
/* n counts the points still to be processed. */
564 size_t n = num_points;
565 for (
size_t vl; n > 0; n -= vl, complexVector += vl, magnitudeVector += vl) {
566 vl = __riscv_vsetvl_e32m4(n);
/* Load each point as one 64-bit element.
 * NOTE(review): presumably each element holds one (real, imag) float pair
 * -- confirm against the type of complexVector. */
567 vuint64m8_t vc = __riscv_vle64_v_u64m8((
const uint64_t*)complexVector, vl);
/* Narrowing right shift by 0 keeps the low 32 bits, by 32 the high 32 bits;
 * the bit patterns are then reinterpreted as floats. */
568 vfloat32m4_t vr = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vc, 0, vl));
569 vfloat32m4_t vi = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vc, 32, vl));
/* v = vi * vi + vr * vr (multiply-accumulate onto vi * vi) */
570 vfloat32m4_t v = __riscv_vfmacc(__riscv_vfmul(vi, vi, vl), vr, vr, vl);
571 __riscv_vse32(magnitudeVector, __riscv_vfsqrt(v, vl), vl);
577#include <riscv_vector.h>
/*
 * RISC-V Vector magnitude kernel (segmented load).
 *
 * Each iteration asks for a vector length vl (e32, LMUL=4) covering up to
 * the n remaining points, uses a 2-field segment load to split the
 * interleaved floats into two vectors, computes sqrt(f0*f0 + f1*f1) and
 * stores vl results. The loop advances both pointers by vl and ends when n
 * reaches 0, so no separate scalar tail is needed.
 */
579static inline void volk_32fc_magnitude_32f_rvvseg(
float* magnitudeVector,
581 unsigned int num_points)
/* n counts the points still to be processed. */
583 size_t n = num_points;
584 for (
size_t vl; n > 0; n -= vl, complexVector += vl, magnitudeVector += vl) {
585 vl = __riscv_vsetvl_e32m4(n);
/* Segment load of vl two-float segments: field 0 receives the first float
 * of each pair, field 1 the second. */
586 vfloat32m4x2_t vc = __riscv_vlseg2e32_v_f32m4x2((
const float*)complexVector, vl);
587 vfloat32m4_t vr = __riscv_vget_f32m4(vc, 0);
588 vfloat32m4_t vi = __riscv_vget_f32m4(vc, 1);
/* v = vi * vi + vr * vr (multiply-accumulate onto vi * vi) */
589 vfloat32m4_t v = __riscv_vfmacc(__riscv_vfmul(vi, vi, vl), vr, vr, vl);
590 __riscv_vse32(magnitudeVector, __riscv_vfsqrt(v, vl), vl);