Vector Optimized Library of Kernels 3.3.0
Architecture-tuned implementations of math kernels
Loading...
Searching...
No Matches
volk_32fc_magnitude_32f.h
Go to the documentation of this file.
1/* -*- c++ -*- */
2/*
3 * Copyright 2012, 2014 Free Software Foundation, Inc.
4 *
5 * This file is part of VOLK
6 *
7 * SPDX-License-Identifier: LGPL-3.0-or-later
8 */
9
57
58#ifndef INCLUDED_volk_32fc_magnitude_32f_u_H
59#define INCLUDED_volk_32fc_magnitude_32f_u_H
60
61#include <inttypes.h>
62#include <math.h>
63#include <stdio.h>
64
65#ifdef LV_HAVE_GENERIC
66
67static inline void volk_32fc_magnitude_32f_generic(float* magnitudeVector,
68 const lv_32fc_t* complexVector,
69 unsigned int num_points)
70{
71 const float* complexVectorPtr = (float*)complexVector;
72 float* magnitudeVectorPtr = magnitudeVector;
73 unsigned int number = 0;
74 for (number = 0; number < num_points; number++) {
75 const float real = *complexVectorPtr++;
76 const float imag = *complexVectorPtr++;
77 *magnitudeVectorPtr++ = sqrtf((real * real) + (imag * imag));
78 }
79}
80#endif /* LV_HAVE_GENERIC */
81
82#ifdef LV_HAVE_AVX512F
83#include <immintrin.h>
84
85static inline void volk_32fc_magnitude_32f_u_avx512f(float* magnitudeVector,
86 const lv_32fc_t* complexVector,
87 unsigned int num_points)
88{
89 unsigned int number = 0;
90 const unsigned int eighthPoints = num_points / 8;
91
92 const float* complexVectorPtr = (float*)complexVector;
93 float* magnitudeVectorPtr = magnitudeVector;
94
95 __m512 cplxValue;
96 __m512 iValues, qValues;
97 __m512 result;
98
99 for (; number < eighthPoints; number++) {
100 // Load 8 complex numbers (16 floats): I0,Q0,I1,Q1,...,I7,Q7
101 cplxValue = _mm512_loadu_ps(complexVectorPtr);
102
103 // Separate I and Q values using permute
104 // Extract I values (even indices: 0,2,4,6,8,10,12,14)
105 iValues = _mm512_permutexvar_ps(
106 _mm512_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14, 0, 0, 0, 0, 0, 0, 0, 0),
107 cplxValue);
108 // Extract Q values (odd indices: 1,3,5,7,9,11,13,15)
109 qValues = _mm512_permutexvar_ps(
110 _mm512_setr_epi32(1, 3, 5, 7, 9, 11, 13, 15, 0, 0, 0, 0, 0, 0, 0, 0),
111 cplxValue);
112
113 // Square and add: I^2 + Q^2
114 result = _mm512_fmadd_ps(iValues, iValues, _mm512_mul_ps(qValues, qValues));
115
116 // Square root
117 result = _mm512_sqrt_ps(result);
118
119 // Store only first 8 results
120 _mm256_storeu_ps(magnitudeVectorPtr, _mm512_castps512_ps256(result));
121
122 complexVectorPtr += 16;
123 magnitudeVectorPtr += 8;
124 }
125
126 number = eighthPoints * 8;
128 magnitudeVectorPtr, (const lv_32fc_t*)complexVectorPtr, num_points - number);
129}
130#endif /* LV_HAVE_AVX512F */
131
132#ifdef LV_HAVE_AVX
133#include <immintrin.h>
135
136static inline void volk_32fc_magnitude_32f_u_avx(float* magnitudeVector,
137 const lv_32fc_t* complexVector,
138 unsigned int num_points)
139{
140 unsigned int number = 0;
141 const unsigned int eighthPoints = num_points / 8;
142
143 const float* complexVectorPtr = (float*)complexVector;
144 float* magnitudeVectorPtr = magnitudeVector;
145
146 __m256 cplxValue1, cplxValue2, result;
147
148 for (; number < eighthPoints; number++) {
149 cplxValue1 = _mm256_loadu_ps(complexVectorPtr);
150 cplxValue2 = _mm256_loadu_ps(complexVectorPtr + 8);
151 result = _mm256_magnitude_ps(cplxValue1, cplxValue2);
152 _mm256_storeu_ps(magnitudeVectorPtr, result);
153
154 complexVectorPtr += 16;
155 magnitudeVectorPtr += 8;
156 }
157
158 number = eighthPoints * 8;
159 for (; number < num_points; number++) {
160 float val1Real = *complexVectorPtr++;
161 float val1Imag = *complexVectorPtr++;
162 *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag));
163 }
164}
165#endif /* LV_HAVE_AVX */
166
167#ifdef LV_HAVE_SSE3
168#include <pmmintrin.h>
170
171static inline void volk_32fc_magnitude_32f_u_sse3(float* magnitudeVector,
172 const lv_32fc_t* complexVector,
173 unsigned int num_points)
174{
175 unsigned int number = 0;
176 const unsigned int quarterPoints = num_points / 4;
177
178 const float* complexVectorPtr = (float*)complexVector;
179 float* magnitudeVectorPtr = magnitudeVector;
180
181 __m128 cplxValue1, cplxValue2, result;
182 for (; number < quarterPoints; number++) {
183 cplxValue1 = _mm_loadu_ps(complexVectorPtr);
184 complexVectorPtr += 4;
185
186 cplxValue2 = _mm_loadu_ps(complexVectorPtr);
187 complexVectorPtr += 4;
188
189 result = _mm_magnitude_ps_sse3(cplxValue1, cplxValue2);
190
191 _mm_storeu_ps(magnitudeVectorPtr, result);
192 magnitudeVectorPtr += 4;
193 }
194
195 number = quarterPoints * 4;
196 for (; number < num_points; number++) {
197 float val1Real = *complexVectorPtr++;
198 float val1Imag = *complexVectorPtr++;
199 *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag));
200 }
201}
202#endif /* LV_HAVE_SSE3 */
203
204
205#ifdef LV_HAVE_SSE
207#include <xmmintrin.h>
208
209static inline void volk_32fc_magnitude_32f_u_sse(float* magnitudeVector,
210 const lv_32fc_t* complexVector,
211 unsigned int num_points)
212{
213 unsigned int number = 0;
214 const unsigned int quarterPoints = num_points / 4;
215
216 const float* complexVectorPtr = (float*)complexVector;
217 float* magnitudeVectorPtr = magnitudeVector;
218
219 __m128 cplxValue1, cplxValue2, result;
220
221 for (; number < quarterPoints; number++) {
222 cplxValue1 = _mm_loadu_ps(complexVectorPtr);
223 complexVectorPtr += 4;
224
225 cplxValue2 = _mm_loadu_ps(complexVectorPtr);
226 complexVectorPtr += 4;
227
228 result = _mm_magnitude_ps(cplxValue1, cplxValue2);
229 _mm_storeu_ps(magnitudeVectorPtr, result);
230 magnitudeVectorPtr += 4;
231 }
232
233 number = quarterPoints * 4;
234 for (; number < num_points; number++) {
235 float val1Real = *complexVectorPtr++;
236 float val1Imag = *complexVectorPtr++;
237 *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag));
238 }
239}
240#endif /* LV_HAVE_SSE */
241
242#endif /* INCLUDED_volk_32fc_magnitude_32f_u_H */
243#ifndef INCLUDED_volk_32fc_magnitude_32f_a_H
244#define INCLUDED_volk_32fc_magnitude_32f_a_H
245
246#include <inttypes.h>
247#include <math.h>
248#include <stdio.h>
249
250#ifdef LV_HAVE_AVX512F
251#include <immintrin.h>
252
253static inline void volk_32fc_magnitude_32f_a_avx512f(float* magnitudeVector,
254 const lv_32fc_t* complexVector,
255 unsigned int num_points)
256{
257 unsigned int number = 0;
258 const unsigned int eighthPoints = num_points / 8;
259
260 const float* complexVectorPtr = (float*)complexVector;
261 float* magnitudeVectorPtr = magnitudeVector;
262
263 __m512 cplxValue;
264 __m512 iValues, qValues;
265 __m512 result;
266
267 for (; number < eighthPoints; number++) {
268 // Load 8 complex numbers (16 floats): I0,Q0,I1,Q1,...,I7,Q7
269 cplxValue = _mm512_load_ps(complexVectorPtr);
270
271 // Separate I and Q values using permute
272 iValues = _mm512_permutexvar_ps(
273 _mm512_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14, 0, 0, 0, 0, 0, 0, 0, 0),
274 cplxValue);
275 qValues = _mm512_permutexvar_ps(
276 _mm512_setr_epi32(1, 3, 5, 7, 9, 11, 13, 15, 0, 0, 0, 0, 0, 0, 0, 0),
277 cplxValue);
278
279 // Square and add: I^2 + Q^2
280 result = _mm512_fmadd_ps(iValues, iValues, _mm512_mul_ps(qValues, qValues));
281
282 // Square root
283 result = _mm512_sqrt_ps(result);
284
285 // Store only first 8 results
286 _mm256_store_ps(magnitudeVectorPtr, _mm512_castps512_ps256(result));
287
288 complexVectorPtr += 16;
289 magnitudeVectorPtr += 8;
290 }
291
292 number = eighthPoints * 8;
294 magnitudeVectorPtr, (const lv_32fc_t*)complexVectorPtr, num_points - number);
295}
296#endif /* LV_HAVE_AVX512F */
297
298#ifdef LV_HAVE_AVX
299#include <immintrin.h>
301
302static inline void volk_32fc_magnitude_32f_a_avx(float* magnitudeVector,
303 const lv_32fc_t* complexVector,
304 unsigned int num_points)
305{
306 unsigned int number = 0;
307 const unsigned int eighthPoints = num_points / 8;
308
309 const float* complexVectorPtr = (float*)complexVector;
310 float* magnitudeVectorPtr = magnitudeVector;
311
312 __m256 cplxValue1, cplxValue2, result;
313 for (; number < eighthPoints; number++) {
314 cplxValue1 = _mm256_load_ps(complexVectorPtr);
315 complexVectorPtr += 8;
316
317 cplxValue2 = _mm256_load_ps(complexVectorPtr);
318 complexVectorPtr += 8;
319
320 result = _mm256_magnitude_ps(cplxValue1, cplxValue2);
321 _mm256_store_ps(magnitudeVectorPtr, result);
322 magnitudeVectorPtr += 8;
323 }
324
325 number = eighthPoints * 8;
326 for (; number < num_points; number++) {
327 float val1Real = *complexVectorPtr++;
328 float val1Imag = *complexVectorPtr++;
329 *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag));
330 }
331}
332#endif /* LV_HAVE_AVX */
333
334#ifdef LV_HAVE_SSE3
335#include <pmmintrin.h>
337
338static inline void volk_32fc_magnitude_32f_a_sse3(float* magnitudeVector,
339 const lv_32fc_t* complexVector,
340 unsigned int num_points)
341{
342 unsigned int number = 0;
343 const unsigned int quarterPoints = num_points / 4;
344
345 const float* complexVectorPtr = (float*)complexVector;
346 float* magnitudeVectorPtr = magnitudeVector;
347
348 __m128 cplxValue1, cplxValue2, result;
349 for (; number < quarterPoints; number++) {
350 cplxValue1 = _mm_load_ps(complexVectorPtr);
351 complexVectorPtr += 4;
352
353 cplxValue2 = _mm_load_ps(complexVectorPtr);
354 complexVectorPtr += 4;
355
356 result = _mm_magnitude_ps_sse3(cplxValue1, cplxValue2);
357 _mm_store_ps(magnitudeVectorPtr, result);
358 magnitudeVectorPtr += 4;
359 }
360
361 number = quarterPoints * 4;
362 for (; number < num_points; number++) {
363 float val1Real = *complexVectorPtr++;
364 float val1Imag = *complexVectorPtr++;
365 *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag));
366 }
367}
368#endif /* LV_HAVE_SSE3 */
369
370#ifdef LV_HAVE_SSE
372#include <xmmintrin.h>
373
374static inline void volk_32fc_magnitude_32f_a_sse(float* magnitudeVector,
375 const lv_32fc_t* complexVector,
376 unsigned int num_points)
377{
378 unsigned int number = 0;
379 const unsigned int quarterPoints = num_points / 4;
380
381 const float* complexVectorPtr = (float*)complexVector;
382 float* magnitudeVectorPtr = magnitudeVector;
383
384 __m128 cplxValue1, cplxValue2, result;
385 for (; number < quarterPoints; number++) {
386 cplxValue1 = _mm_load_ps(complexVectorPtr);
387 complexVectorPtr += 4;
388
389 cplxValue2 = _mm_load_ps(complexVectorPtr);
390 complexVectorPtr += 4;
391
392 result = _mm_magnitude_ps(cplxValue1, cplxValue2);
393 _mm_store_ps(magnitudeVectorPtr, result);
394 magnitudeVectorPtr += 4;
395 }
396
397 number = quarterPoints * 4;
398 for (; number < num_points; number++) {
399 float val1Real = *complexVectorPtr++;
400 float val1Imag = *complexVectorPtr++;
401 *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag));
402 }
403}
404#endif /* LV_HAVE_SSE */
405
406
407#ifdef LV_HAVE_NEON
408#include <arm_neon.h>
409
410static inline void volk_32fc_magnitude_32f_neon(float* magnitudeVector,
411 const lv_32fc_t* complexVector,
412 unsigned int num_points)
413{
414 unsigned int number;
415 unsigned int quarter_points = num_points / 4;
416 const float* complexVectorPtr = (float*)complexVector;
417 float* magnitudeVectorPtr = magnitudeVector;
418
419 float32x4x2_t complex_vec;
420 float32x4_t magnitude_vec;
421 for (number = 0; number < quarter_points; number++) {
422 complex_vec = vld2q_f32(complexVectorPtr);
423 complex_vec.val[0] = vmulq_f32(complex_vec.val[0], complex_vec.val[0]);
424 magnitude_vec =
425 vmlaq_f32(complex_vec.val[0], complex_vec.val[1], complex_vec.val[1]);
426 magnitude_vec = vrsqrteq_f32(magnitude_vec);
427 magnitude_vec = vrecpeq_f32(magnitude_vec); // no plain ol' sqrt
428 vst1q_f32(magnitudeVectorPtr, magnitude_vec);
429
430 complexVectorPtr += 8;
431 magnitudeVectorPtr += 4;
432 }
433
434 for (number = quarter_points * 4; number < num_points; number++) {
435 const float real = *complexVectorPtr++;
436 const float imag = *complexVectorPtr++;
437 *magnitudeVectorPtr++ = sqrtf((real * real) + (imag * imag));
438 }
439}
440#endif /* LV_HAVE_NEON */
441
442
443#ifdef LV_HAVE_NEONV8
444#include <arm_neon.h>
445
446static inline void volk_32fc_magnitude_32f_neonv8(float* magnitudeVector,
447 const lv_32fc_t* complexVector,
448 unsigned int num_points)
449{
450 unsigned int number;
451 unsigned int quarter_points = num_points / 4;
452 const float* complexVectorPtr = (float*)complexVector;
453 float* magnitudeVectorPtr = magnitudeVector;
454
455 float32x4x2_t complex_vec;
456 float32x4_t magnitude_sq, magnitude_vec;
457 for (number = 0; number < quarter_points; number++) {
458 complex_vec = vld2q_f32(complexVectorPtr);
459 __VOLK_PREFETCH(complexVectorPtr + 8);
460 // Use FMA for magnitude squared: real^2 + imag^2
461 magnitude_sq = vfmaq_f32(vmulq_f32(complex_vec.val[0], complex_vec.val[0]),
462 complex_vec.val[1],
463 complex_vec.val[1]);
464 // ARMv8 native sqrt
465 magnitude_vec = vsqrtq_f32(magnitude_sq);
466 vst1q_f32(magnitudeVectorPtr, magnitude_vec);
467
468 complexVectorPtr += 8;
469 magnitudeVectorPtr += 4;
470 }
471
472 for (number = quarter_points * 4; number < num_points; number++) {
473 const float real = *complexVectorPtr++;
474 const float imag = *complexVectorPtr++;
475 *magnitudeVectorPtr++ = sqrtf((real * real) + (imag * imag));
476 }
477}
478#endif /* LV_HAVE_NEONV8 */
479
480
481#ifdef LV_HAVE_NEON
499 float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points)
500{
501 unsigned int number;
502 unsigned int quarter_points = num_points / 4;
503 const float* complexVectorPtr = (float*)complexVector;
504 float* magnitudeVectorPtr = magnitudeVector;
505
506 const float threshold = 0.4142135;
507
508 float32x4_t a_vec, b_vec, a_high, a_low, b_high, b_low;
509 a_high = vdupq_n_f32(0.84);
510 b_high = vdupq_n_f32(0.561);
511 a_low = vdupq_n_f32(0.99);
512 b_low = vdupq_n_f32(0.197);
513
514 uint32x4_t comp0, comp1;
515
516 float32x4x2_t complex_vec;
517 float32x4_t min_vec, max_vec, magnitude_vec;
518 float32x4_t real_abs, imag_abs;
519 for (number = 0; number < quarter_points; number++) {
520 complex_vec = vld2q_f32(complexVectorPtr);
521
522 real_abs = vabsq_f32(complex_vec.val[0]);
523 imag_abs = vabsq_f32(complex_vec.val[1]);
524
525 min_vec = vminq_f32(real_abs, imag_abs);
526 max_vec = vmaxq_f32(real_abs, imag_abs);
527
528 // effective branch to choose coefficient pair.
529 comp0 = vcgtq_f32(min_vec, vmulq_n_f32(max_vec, threshold));
530 comp1 = vcleq_f32(min_vec, vmulq_n_f32(max_vec, threshold));
531
532 // and 0s or 1s with coefficients from previous effective branch
533 a_vec = (float32x4_t)vaddq_s32(vandq_s32((int32x4_t)comp0, (int32x4_t)a_high),
534 vandq_s32((int32x4_t)comp1, (int32x4_t)a_low));
535 b_vec = (float32x4_t)vaddq_s32(vandq_s32((int32x4_t)comp0, (int32x4_t)b_high),
536 vandq_s32((int32x4_t)comp1, (int32x4_t)b_low));
537
538 // coefficients chosen, do the weighted sum
539 min_vec = vmulq_f32(min_vec, b_vec);
540 max_vec = vmulq_f32(max_vec, a_vec);
541
542 magnitude_vec = vaddq_f32(min_vec, max_vec);
543 vst1q_f32(magnitudeVectorPtr, magnitude_vec);
544
545 complexVectorPtr += 8;
546 magnitudeVectorPtr += 4;
547 }
548
549 for (number = quarter_points * 4; number < num_points; number++) {
550 const float real = *complexVectorPtr++;
551 const float imag = *complexVectorPtr++;
552 *magnitudeVectorPtr++ = sqrtf((real * real) + (imag * imag));
553 }
554}
555#endif /* LV_HAVE_NEON */
556
557#ifdef LV_HAVE_RVV
558#include <riscv_vector.h>
559
560static inline void volk_32fc_magnitude_32f_rvv(float* magnitudeVector,
561 const lv_32fc_t* complexVector,
562 unsigned int num_points)
563{
564 size_t n = num_points;
565 for (size_t vl; n > 0; n -= vl, complexVector += vl, magnitudeVector += vl) {
566 vl = __riscv_vsetvl_e32m4(n);
567 vuint64m8_t vc = __riscv_vle64_v_u64m8((const uint64_t*)complexVector, vl);
568 vfloat32m4_t vr = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vc, 0, vl));
569 vfloat32m4_t vi = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vc, 32, vl));
570 vfloat32m4_t v = __riscv_vfmacc(__riscv_vfmul(vi, vi, vl), vr, vr, vl);
571 __riscv_vse32(magnitudeVector, __riscv_vfsqrt(v, vl), vl);
572 }
573}
574#endif /*LV_HAVE_RVV*/
575
576#ifdef LV_HAVE_RVVSEG
577#include <riscv_vector.h>
578
579static inline void volk_32fc_magnitude_32f_rvvseg(float* magnitudeVector,
580 const lv_32fc_t* complexVector,
581 unsigned int num_points)
582{
583 size_t n = num_points;
584 for (size_t vl; n > 0; n -= vl, complexVector += vl, magnitudeVector += vl) {
585 vl = __riscv_vsetvl_e32m4(n);
586 vfloat32m4x2_t vc = __riscv_vlseg2e32_v_f32m4x2((const float*)complexVector, vl);
587 vfloat32m4_t vr = __riscv_vget_f32m4(vc, 0);
588 vfloat32m4_t vi = __riscv_vget_f32m4(vc, 1);
589 vfloat32m4_t v = __riscv_vfmacc(__riscv_vfmul(vi, vi, vl), vr, vr, vl);
590 __riscv_vse32(magnitudeVector, __riscv_vfsqrt(v, vl), vl);
591 }
592}
593#endif /*LV_HAVE_RVVSEG*/
594
595#endif /* INCLUDED_volk_32fc_magnitude_32f_a_H */