Vector Optimized Library of Kernels 3.3.0
Architecture-tuned implementations of math kernels
volk_32fc_index_max_32u.h
/* -*- c++ -*- */
/*
 * Copyright 2016, 2018-2020 Free Software Foundation, Inc.
 *
 * This file is part of VOLK
 *
 * SPDX-License-Identifier: LGPL-3.0-or-later
 */

#ifndef INCLUDED_volk_32fc_index_max_32u_a_H
#define INCLUDED_volk_32fc_index_max_32u_a_H

#include <inttypes.h>
#include <volk/volk_common.h>
#include <volk/volk_complex.h>

#ifdef LV_HAVE_AVX2
#include <immintrin.h>
#include <volk/volk_avx2_intrinsics.h> // provides vector_32fc_index_max_variant0/1

static inline void volk_32fc_index_max_32u_a_avx2_variant_0(uint32_t* target,
                                                            const lv_32fc_t* src0,
                                                            uint32_t num_points)
{
    const __m256i indices_increment = _mm256_set1_epi32(8);
    /*
     * At the start of each loop iteration current_indices holds the indices of
     * the complex numbers loaded from memory. An explanation of the odd order
     * is given in the implementation of vector_32fc_index_max_variant0().
     */
    __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
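    /*
     * A sketch of why the order is (7, 6, 3, 2, 5, 4, 1, 0), assuming the
     * helper computes |z|^2 with an hadd-style pairwise sum: _mm256_hadd_ps
     * works within each 128-bit lane, so the squared magnitudes of complex
     * numbers 0..7 come out in element order [0, 1, 4, 5, 2, 3, 6, 7].
     * _mm256_set_epi32 takes its arguments from the highest element down, so
     * the line above stores exactly that permutation, keeping each index
     * aligned with its magnitude.
     */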

    __m256 max_values = _mm256_setzero_ps();
    __m256i max_indices = _mm256_setzero_si256();

    for (unsigned i = 0; i < num_points / 8u; ++i) {
        __m256 in0 = _mm256_load_ps((float*)src0);
        __m256 in1 = _mm256_load_ps((float*)(src0 + 4));
        vector_32fc_index_max_variant0(
            in0, in1, &max_values, &max_indices, &current_indices, indices_increment);
        src0 += 8;
    }

    // determine maximum value and index in the result of the vectorized loop
    __VOLK_ATTR_ALIGNED(32) float max_values_buffer[8];
    __VOLK_ATTR_ALIGNED(32) uint32_t max_indices_buffer[8];
    _mm256_store_ps(max_values_buffer, max_values);
    _mm256_store_si256((__m256i*)max_indices_buffer, max_indices);

    float max = 0.f;
    uint32_t index = 0;
    for (unsigned i = 0; i < 8; i++) {
        if (max_values_buffer[i] > max) {
            max = max_values_buffer[i];
            index = max_indices_buffer[i];
        }
    }

    // handle tail not processed by the vectorized loop
    for (unsigned i = num_points & (~7u); i < num_points; ++i) {
        const float abs_squared =
            lv_creal(*src0) * lv_creal(*src0) + lv_cimag(*src0) * lv_cimag(*src0);
        if (abs_squared > max) {
            max = abs_squared;
            index = i;
        }
        ++src0;
    }

    *target = index;
}

#endif /*LV_HAVE_AVX2*/

#ifdef LV_HAVE_AVX2
#include <immintrin.h>
#include <volk/volk_avx2_intrinsics.h>

static inline void volk_32fc_index_max_32u_a_avx2_variant_1(uint32_t* target,
                                                            const lv_32fc_t* src0,
                                                            uint32_t num_points)
{
    const __m256i indices_increment = _mm256_set1_epi32(8);
    /*
     * At the start of each loop iteration current_indices holds the indices of
     * the complex numbers loaded from memory. An explanation of the odd order
     * is given in the implementation of vector_32fc_index_max_variant0().
     */
    __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);

    __m256 max_values = _mm256_setzero_ps();
    __m256i max_indices = _mm256_setzero_si256();

    for (unsigned i = 0; i < num_points / 8u; ++i) {
        __m256 in0 = _mm256_load_ps((float*)src0);
        __m256 in1 = _mm256_load_ps((float*)(src0 + 4));
        vector_32fc_index_max_variant1(
            in0, in1, &max_values, &max_indices, &current_indices, indices_increment);
        src0 += 8;
    }

    // determine maximum value and index in the result of the vectorized loop
    __VOLK_ATTR_ALIGNED(32) float max_values_buffer[8];
    __VOLK_ATTR_ALIGNED(32) uint32_t max_indices_buffer[8];
    _mm256_store_ps(max_values_buffer, max_values);
    _mm256_store_si256((__m256i*)max_indices_buffer, max_indices);

    float max = 0.f;
    uint32_t index = 0;
    for (unsigned i = 0; i < 8; i++) {
        if (max_values_buffer[i] > max) {
            max = max_values_buffer[i];
            index = max_indices_buffer[i];
        }
    }

    // handle tail not processed by the vectorized loop
    for (unsigned i = num_points & (~7u); i < num_points; ++i) {
        const float abs_squared =
            lv_creal(*src0) * lv_creal(*src0) + lv_cimag(*src0) * lv_cimag(*src0);
        if (abs_squared > max) {
            max = abs_squared;
            index = i;
        }
        ++src0;
    }

    *target = index;
}

#endif /*LV_HAVE_AVX2*/


#ifdef LV_HAVE_SSE3
#include <pmmintrin.h>
#include <xmmintrin.h>

static inline void volk_32fc_index_max_32u_a_sse3(uint32_t* target,
                                                  const lv_32fc_t* src0,
                                                  uint32_t num_points)
{
    const uint32_t num_bytes = num_points * 8;

    union bit128 holderf;
    union bit128 holderi;
    float sq_dist = 0.0;

    union bit128 xmm5, xmm4;
    __m128 xmm1, xmm2, xmm3;
    __m128i xmm8, xmm11, xmm12, xmm9, xmm10;

    xmm5.int_vec = _mm_setzero_si128();
    xmm4.int_vec = _mm_setzero_si128();
    holderf.int_vec = _mm_setzero_si128();
    holderi.int_vec = _mm_setzero_si128();

    int bound = num_bytes >> 5; // number of full 4-complex (32-byte) blocks
    int i = 0;

    xmm8 = _mm_setr_epi32(0, 1, 2, 3);  // current indices per lane
    xmm9 = _mm_setzero_si128();         // indices of the running maxima
    xmm10 = _mm_setr_epi32(4, 4, 4, 4); // index increment per iteration
    xmm3 = _mm_setzero_ps();            // running maxima of |z|^2

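    /*
     * Branchless index tracking in the loop below: xmm3 keeps the running
     * maxima of the squared magnitudes computed into xmm1. After the max
     * update, the cmplt mask marks lanes where the old maximum survived and
     * the cmpeq mask marks lanes where xmm1 set a new maximum; ANDing the old
     * indices (xmm9) and the current indices (xmm8) with those masks and
     * adding the two results selects the correct index per lane without
     * branches.
     */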
    for (; i < bound; ++i) {
        xmm1 = _mm_load_ps((float*)src0);
        xmm2 = _mm_load_ps((float*)&src0[2]);

        src0 += 4;

        xmm1 = _mm_mul_ps(xmm1, xmm1);
        xmm2 = _mm_mul_ps(xmm2, xmm2);

        xmm1 = _mm_hadd_ps(xmm1, xmm2);

        xmm3 = _mm_max_ps(xmm1, xmm3);

        xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
        xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);

        xmm11 = _mm_and_si128(xmm8, xmm5.int_vec);
        xmm12 = _mm_and_si128(xmm9, xmm4.int_vec);

        xmm9 = _mm_add_epi32(xmm11, xmm12);

        xmm8 = _mm_add_epi32(xmm8, xmm10);
    }

    // if a pair of complex values (16 bytes) remains, process it here
    if (num_bytes >> 4 & 1) {
        xmm2 = _mm_load_ps((float*)src0);

        xmm1 = _mm_movelh_ps(bit128_p(&xmm8)->float_vec, bit128_p(&xmm8)->float_vec);
        xmm8 = bit128_p(&xmm1)->int_vec;

        xmm2 = _mm_mul_ps(xmm2, xmm2);

        src0 += 2;

        xmm1 = _mm_hadd_ps(xmm2, xmm2);

        xmm3 = _mm_max_ps(xmm1, xmm3);

        xmm10 = _mm_setr_epi32(2, 2, 2, 2);

        xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
        xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);

        xmm11 = _mm_and_si128(xmm8, xmm5.int_vec);
        xmm12 = _mm_and_si128(xmm9, xmm4.int_vec);

        xmm9 = _mm_add_epi32(xmm11, xmm12);

        xmm8 = _mm_add_epi32(xmm8, xmm10);
    }

    // if a single complex value (8 bytes) remains, process it here
    if (num_bytes >> 3 & 1) {
        sq_dist =
            lv_creal(src0[0]) * lv_creal(src0[0]) + lv_cimag(src0[0]) * lv_cimag(src0[0]);

        xmm2 = _mm_load1_ps(&sq_dist);

        xmm1 = xmm3;

        xmm3 = _mm_max_ss(xmm3, xmm2);

        xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
        xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);

        xmm8 = _mm_shuffle_epi32(xmm8, 0x00);

        // here xmm1 holds the old maxima (not the candidates), so the mask
        // roles are swapped relative to the main loop
        xmm11 = _mm_and_si128(xmm8, xmm4.int_vec);
        xmm12 = _mm_and_si128(xmm9, xmm5.int_vec);

        xmm9 = _mm_add_epi32(xmm11, xmm12);
    }

    _mm_store_ps((float*)&(holderf.f), xmm3);
    _mm_store_si128(&(holderi.int_vec), xmm9);

    // reduce the four lanes to a single maximum and its index
    target[0] = holderi.i[0];
    sq_dist = holderf.f[0];
    target[0] = (holderf.f[1] > sq_dist) ? holderi.i[1] : target[0];
    sq_dist = (holderf.f[1] > sq_dist) ? holderf.f[1] : sq_dist;
    target[0] = (holderf.f[2] > sq_dist) ? holderi.i[2] : target[0];
    sq_dist = (holderf.f[2] > sq_dist) ? holderf.f[2] : sq_dist;
    target[0] = (holderf.f[3] > sq_dist) ? holderi.i[3] : target[0];
    sq_dist = (holderf.f[3] > sq_dist) ? holderf.f[3] : sq_dist;
}

#endif /*LV_HAVE_SSE3*/

#ifdef LV_HAVE_GENERIC
static inline void volk_32fc_index_max_32u_generic(uint32_t* target,
                                                   const lv_32fc_t* src0,
                                                   uint32_t num_points)
{
    const uint32_t num_bytes = num_points * 8;

    float sq_dist = 0.0;
    float max = 0.0;
    uint32_t index = 0;

    uint32_t i = 0;

    for (; i < (num_bytes >> 3); ++i) {
        sq_dist =
            lv_creal(src0[i]) * lv_creal(src0[i]) + lv_cimag(src0[i]) * lv_cimag(src0[i]);

        if (sq_dist > max) {
            index = i;
            max = sq_dist;
        }
    }
    target[0] = index;
}

#endif /*LV_HAVE_GENERIC*/
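
/*
 * A minimal usage sketch: all implementations in this file are normally
 * reached through the VOLK dispatcher volk_32fc_index_max_32u(), which picks
 * the best kernel for the host at run time. Buffer names and sizes below are
 * illustrative only.
 *
 * \code
 *   #include <stdio.h>
 *   #include <volk/volk.h>
 *
 *   uint32_t num_points = 1024;
 *   lv_32fc_t* in = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t) * num_points,
 *                                           volk_get_alignment());
 *   uint32_t* max_index = (uint32_t*)volk_malloc(sizeof(uint32_t),
 *                                                volk_get_alignment());
 *
 *   for (uint32_t i = 0; i < num_points; ++i) {
 *       in[i] = lv_cmake((float)i, (float)-i); // |z|^2 grows with i
 *   }
 *
 *   volk_32fc_index_max_32u(max_index, in, num_points);
 *   printf("maximum magnitude at index %u\n", *max_index); // expect 1023
 *
 *   volk_free(in);
 *   volk_free(max_index);
 * \endcode
 */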

#ifdef LV_HAVE_AVX512F
#include <immintrin.h>

static inline void volk_32fc_index_max_32u_a_avx512f(uint32_t* target,
                                                     const lv_32fc_t* src0,
                                                     uint32_t num_points)
{
    const lv_32fc_t* src0Ptr = src0;
    const uint32_t sixteenthPoints = num_points / 16;

    // Index ordering after shuffle: [0,1,8,9, 2,3,10,11, 4,5,12,13, 6,7,14,15]
    __m512 currentIndexes =
        _mm512_setr_ps(0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15);
    const __m512 indexIncrement = _mm512_set1_ps(16);

    __m512 maxValues = _mm512_setzero_ps();
    // Indices are tracked as floats; they stay exact for num_points < 2^24
    __m512 maxIndices = _mm512_setzero_ps();

    for (uint32_t number = 0; number < sixteenthPoints; number++) {
        // Load 16 complex values (32 floats)
        __m512 in0 = _mm512_load_ps((const float*)src0Ptr);
        __m512 in1 = _mm512_load_ps((const float*)(src0Ptr + 8));
        src0Ptr += 16;

        // Square all values
        in0 = _mm512_mul_ps(in0, in0);
        in1 = _mm512_mul_ps(in1, in1);

        // Add adjacent pairs (re² + im²) using within-lane shuffle
        // 0xB1 = _MM_SHUFFLE(2,3,0,1) swaps adjacent elements
        __m512 sw0 = _mm512_shuffle_ps(in0, in0, 0xB1);
        __m512 sw1 = _mm512_shuffle_ps(in1, in1, 0xB1);
        __m512 sum0 = _mm512_add_ps(in0, sw0);
        __m512 sum1 = _mm512_add_ps(in1, sw1);

        // Compact: pick elements 0,2 from sum0 and sum1 per 128-bit lane
        // 0x88 = _MM_SHUFFLE(2,0,2,0)
        __m512 mag_sq = _mm512_shuffle_ps(sum0, sum1, 0x88);
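        // Worked check for lane 0 (c0, c1 come from in0; c8, c9 from in1):
        //   sum0 lane 0 = [|c0|², |c0|², |c1|², |c1|²]
        //   sum1 lane 0 = [|c8|², |c8|², |c9|², |c9|²]
        // so mag_sq lane 0 = [|c0|², |c1|², |c8|², |c9|²], which matches the
        // [0,1,8,9, ...] ordering of currentIndexes above.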

        // Compare and update maximums
        __mmask16 cmpMask = _mm512_cmp_ps_mask(mag_sq, maxValues, _CMP_GT_OS);
        maxIndices = _mm512_mask_blend_ps(cmpMask, maxIndices, currentIndexes);
        maxValues = _mm512_max_ps(mag_sq, maxValues);

        currentIndexes = _mm512_add_ps(currentIndexes, indexIncrement);
    }

    // Reduce 16 values to find maximum
    __VOLK_ATTR_ALIGNED(64) float maxValuesBuffer[16];
    __VOLK_ATTR_ALIGNED(64) float maxIndexesBuffer[16];
    _mm512_store_ps(maxValuesBuffer, maxValues);
    _mm512_store_ps(maxIndexesBuffer, maxIndices);

    float max = 0.0f;
    uint32_t index = 0;
    for (uint32_t i = 0; i < 16; i++) {
        if (maxValuesBuffer[i] > max) {
            max = maxValuesBuffer[i];
            index = (uint32_t)maxIndexesBuffer[i];
        } else if (maxValuesBuffer[i] == max) {
            if ((uint32_t)maxIndexesBuffer[i] < index)
                index = (uint32_t)maxIndexesBuffer[i];
        }
    }

    // Handle tail
    for (uint32_t number = sixteenthPoints * 16; number < num_points; number++) {
        const float re = lv_creal(*src0Ptr);
        const float im = lv_cimag(*src0Ptr);
        const float sq_dist = re * re + im * im;
        if (sq_dist > max) {
            max = sq_dist;
            index = number;
        }
        src0Ptr++;
    }
    *target = index;
}

#endif /*LV_HAVE_AVX512F*/

#endif /*INCLUDED_volk_32fc_index_max_32u_a_H*/

#ifndef INCLUDED_volk_32fc_index_max_32u_u_H
#define INCLUDED_volk_32fc_index_max_32u_u_H

#include <inttypes.h>
#include <volk/volk_common.h>
#include <volk/volk_complex.h>

#ifdef LV_HAVE_AVX2
#include <immintrin.h>
#include <volk/volk_avx2_intrinsics.h>

static inline void volk_32fc_index_max_32u_u_avx2_variant_0(uint32_t* target,
                                                            const lv_32fc_t* src0,
                                                            uint32_t num_points)
{
    const __m256i indices_increment = _mm256_set1_epi32(8);
    /*
     * At the start of each loop iteration current_indices holds the indices of
     * the complex numbers loaded from memory. An explanation of the odd order
     * is given in the implementation of vector_32fc_index_max_variant0().
     */
    __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);

    __m256 max_values = _mm256_setzero_ps();
    __m256i max_indices = _mm256_setzero_si256();

    for (unsigned i = 0; i < num_points / 8u; ++i) {
        __m256 in0 = _mm256_loadu_ps((float*)src0);
        __m256 in1 = _mm256_loadu_ps((float*)(src0 + 4));
        vector_32fc_index_max_variant0(
            in0, in1, &max_values, &max_indices, &current_indices, indices_increment);
        src0 += 8;
    }

    // determine maximum value and index in the result of the vectorized loop
    __VOLK_ATTR_ALIGNED(32) float max_values_buffer[8];
    __VOLK_ATTR_ALIGNED(32) uint32_t max_indices_buffer[8];
    _mm256_store_ps(max_values_buffer, max_values);
    _mm256_store_si256((__m256i*)max_indices_buffer, max_indices);

    float max = 0.f;
    uint32_t index = 0;
    for (unsigned i = 0; i < 8; i++) {
        if (max_values_buffer[i] > max) {
            max = max_values_buffer[i];
            index = max_indices_buffer[i];
        }
    }

    // handle tail not processed by the vectorized loop
    for (unsigned i = num_points & (~7u); i < num_points; ++i) {
        const float abs_squared =
            lv_creal(*src0) * lv_creal(*src0) + lv_cimag(*src0) * lv_cimag(*src0);
        if (abs_squared > max) {
            max = abs_squared;
            index = i;
        }
        ++src0;
    }

    *target = index;
}

#endif /*LV_HAVE_AVX2*/

#ifdef LV_HAVE_AVX2
#include <immintrin.h>
#include <volk/volk_avx2_intrinsics.h>

static inline void volk_32fc_index_max_32u_u_avx2_variant_1(uint32_t* target,
                                                            const lv_32fc_t* src0,
                                                            uint32_t num_points)
{
    const __m256i indices_increment = _mm256_set1_epi32(8);
    /*
     * At the start of each loop iteration current_indices holds the indices of
     * the complex numbers loaded from memory. An explanation of the odd order
     * is given in the implementation of vector_32fc_index_max_variant0().
     */
    __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);

    __m256 max_values = _mm256_setzero_ps();
    __m256i max_indices = _mm256_setzero_si256();

    for (unsigned i = 0; i < num_points / 8u; ++i) {
        __m256 in0 = _mm256_loadu_ps((float*)src0);
        __m256 in1 = _mm256_loadu_ps((float*)(src0 + 4));
        vector_32fc_index_max_variant1(
            in0, in1, &max_values, &max_indices, &current_indices, indices_increment);
        src0 += 8;
    }

    // determine maximum value and index in the result of the vectorized loop
    __VOLK_ATTR_ALIGNED(32) float max_values_buffer[8];
    __VOLK_ATTR_ALIGNED(32) uint32_t max_indices_buffer[8];
    _mm256_store_ps(max_values_buffer, max_values);
    _mm256_store_si256((__m256i*)max_indices_buffer, max_indices);

    float max = 0.f;
    uint32_t index = 0;
    for (unsigned i = 0; i < 8; i++) {
        if (max_values_buffer[i] > max) {
            max = max_values_buffer[i];
            index = max_indices_buffer[i];
        }
    }

    // handle tail not processed by the vectorized loop
    for (unsigned i = num_points & (~7u); i < num_points; ++i) {
        const float abs_squared =
            lv_creal(*src0) * lv_creal(*src0) + lv_cimag(*src0) * lv_cimag(*src0);
        if (abs_squared > max) {
            max = abs_squared;
            index = i;
        }
        ++src0;
    }

    *target = index;
}

#endif /*LV_HAVE_AVX2*/

#ifdef LV_HAVE_NEON
#include <arm_neon.h>
#include <float.h>                     // for FLT_MIN
#include <volk/volk_neon_intrinsics.h> // provides _vmagnitudesquaredq_f32

static inline void
volk_32fc_index_max_32u_neon(uint32_t* target, const lv_32fc_t* src0, uint32_t num_points)
{
    unsigned int number = 0;
    const uint32_t quarter_points = num_points / 4;
    const lv_32fc_t* src0Ptr = src0;

    uint32_t indices[4] = { 0, 1, 2, 3 };
    const uint32x4_t vec_indices_incr = vdupq_n_u32(4);
    uint32x4_t vec_indices = vld1q_u32(indices);
    uint32x4_t vec_max_indices = vec_indices;

    if (num_points) {
        float max = FLT_MIN;
        uint32_t index = 0;

        float32x4_t vec_max = vdupq_n_f32(FLT_MIN);

        for (; number < quarter_points; number++) {
            // Load complex and compute magnitude squared
            const float32x4_t vec_mag2 =
                _vmagnitudesquaredq_f32(vld2q_f32((float*)src0Ptr));
            __VOLK_PREFETCH(src0Ptr += 4);
            // a > b?
            const uint32x4_t gt_mask = vcgtq_f32(vec_mag2, vec_max);
            vec_max = vbslq_f32(gt_mask, vec_mag2, vec_max);
            vec_max_indices = vbslq_u32(gt_mask, vec_indices, vec_max_indices);
            vec_indices = vaddq_u32(vec_indices, vec_indices_incr);
        }
        uint32_t tmp_max_indices[4];
        float tmp_max[4];
        vst1q_u32(tmp_max_indices, vec_max_indices);
        vst1q_f32(tmp_max, vec_max);

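        /*
         * Scalar reduction across the four lanes. On a tie the smallest index
         * wins, matching the generic implementation: the first occurrence of
         * the maximum in memory is the one reported.
         */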
        for (int i = 0; i < 4; i++) {
            if (tmp_max[i] > max) {
                max = tmp_max[i];
                index = tmp_max_indices[i];
            } else if (tmp_max[i] == max) {
                if (tmp_max_indices[i] < index)
                    index = tmp_max_indices[i];
            }
        }

        // Deal with the rest
        for (number = quarter_points * 4; number < num_points; number++) {
            const float re = lv_creal(*src0Ptr);
            const float im = lv_cimag(*src0Ptr);
            const float sq_dist = re * re + im * im;
            if (sq_dist > max) {
                max = sq_dist;
                index = number;
            }
            src0Ptr++;
        }
        *target = index;
    }
}

#endif /*LV_HAVE_NEON*/

#ifdef LV_HAVE_NEONV8
#include <arm_neon.h>
#include <float.h>

static inline void volk_32fc_index_max_32u_neonv8(uint32_t* target,
                                                  const lv_32fc_t* src0,
                                                  uint32_t num_points)
{
    if (num_points == 0)
        return;

    const uint32_t quarter_points = num_points / 4;
    const lv_32fc_t* inputPtr = src0;

    // Use integer indices directly (no float conversion overhead)
    uint32x4_t vec_indices = { 0, 1, 2, 3 };
    const uint32x4_t vec_incr = vdupq_n_u32(4);

    float32x4_t vec_max = vdupq_n_f32(0.0f);
    uint32x4_t vec_max_idx = vdupq_n_u32(0);

    for (uint32_t i = 0; i < quarter_points; i++) {
        // Load and deinterleave complex values
        float32x4x2_t cplx = vld2q_f32((const float*)inputPtr);
        inputPtr += 4;

        // Magnitude squared using FMA: re*re + im*im
        float32x4_t mag2 =
            vfmaq_f32(vmulq_f32(cplx.val[0], cplx.val[0]), cplx.val[1], cplx.val[1]);

        // Compare BEFORE max update to know which lanes change
        uint32x4_t gt_mask = vcgtq_f32(mag2, vec_max);
        vec_max_idx = vbslq_u32(gt_mask, vec_indices, vec_max_idx);

        // vmaxq_f32 is single-cycle, no dependency on comparison result
        vec_max = vmaxq_f32(mag2, vec_max);

        vec_indices = vaddq_u32(vec_indices, vec_incr);
    }

    // ARMv8 horizontal reduction - find max value across all lanes
    float max_val = vmaxvq_f32(vec_max);

    // Find which lane(s) have the max value, get minimum index among them
    uint32x4_t max_mask = vceqq_f32(vec_max, vdupq_n_f32(max_val));
    uint32x4_t idx_masked = vbslq_u32(max_mask, vec_max_idx, vdupq_n_u32(UINT32_MAX));
    uint32_t result_idx = vminvq_u32(idx_masked);

    // Handle tail elements
    for (uint32_t i = quarter_points * 4; i < num_points; i++) {
        float re = lv_creal(src0[i]);
        float im = lv_cimag(src0[i]);
        float mag2 = re * re + im * im;
        if (mag2 > max_val) {
            max_val = mag2;
            result_idx = i;
        }
    }

    *target = result_idx;
}

#endif /*LV_HAVE_NEONV8*/

#ifdef LV_HAVE_AVX512F
#include <immintrin.h>

static inline void volk_32fc_index_max_32u_u_avx512f(uint32_t* target,
                                                     const lv_32fc_t* src0,
                                                     uint32_t num_points)
{
    const lv_32fc_t* src0Ptr = src0;
    const uint32_t sixteenthPoints = num_points / 16;

    // Index ordering after shuffle: [0,1,8,9, 2,3,10,11, 4,5,12,13, 6,7,14,15]
    __m512 currentIndexes =
        _mm512_setr_ps(0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15);
    const __m512 indexIncrement = _mm512_set1_ps(16);

    __m512 maxValues = _mm512_setzero_ps();
    // Indices are tracked as floats; they stay exact for num_points < 2^24
    __m512 maxIndices = _mm512_setzero_ps();

    for (uint32_t number = 0; number < sixteenthPoints; number++) {
        // Load 16 complex values (32 floats)
        __m512 in0 = _mm512_loadu_ps((const float*)src0Ptr);
        __m512 in1 = _mm512_loadu_ps((const float*)(src0Ptr + 8));
        src0Ptr += 16;

        // Square all values
        in0 = _mm512_mul_ps(in0, in0);
        in1 = _mm512_mul_ps(in1, in1);

        // Add adjacent pairs (re² + im²) using within-lane shuffle
        // 0xB1 = _MM_SHUFFLE(2,3,0,1) swaps adjacent elements
        __m512 sw0 = _mm512_shuffle_ps(in0, in0, 0xB1);
        __m512 sw1 = _mm512_shuffle_ps(in1, in1, 0xB1);
        __m512 sum0 = _mm512_add_ps(in0, sw0);
        __m512 sum1 = _mm512_add_ps(in1, sw1);

        // Compact: pick elements 0,2 from sum0 and sum1 per 128-bit lane
        // 0x88 = _MM_SHUFFLE(2,0,2,0)
        __m512 mag_sq = _mm512_shuffle_ps(sum0, sum1, 0x88);

        // Compare and update maximums
        __mmask16 cmpMask = _mm512_cmp_ps_mask(mag_sq, maxValues, _CMP_GT_OS);
        maxIndices = _mm512_mask_blend_ps(cmpMask, maxIndices, currentIndexes);
        maxValues = _mm512_max_ps(mag_sq, maxValues);

        currentIndexes = _mm512_add_ps(currentIndexes, indexIncrement);
    }

    // Reduce 16 values to find maximum
    __VOLK_ATTR_ALIGNED(64) float maxValuesBuffer[16];
    __VOLK_ATTR_ALIGNED(64) float maxIndexesBuffer[16];
    _mm512_store_ps(maxValuesBuffer, maxValues);
    _mm512_store_ps(maxIndexesBuffer, maxIndices);

    float max = 0.0f;
    uint32_t index = 0;
    for (uint32_t i = 0; i < 16; i++) {
        if (maxValuesBuffer[i] > max) {
            max = maxValuesBuffer[i];
            index = (uint32_t)maxIndexesBuffer[i];
        } else if (maxValuesBuffer[i] == max) {
            if ((uint32_t)maxIndexesBuffer[i] < index)
                index = (uint32_t)maxIndexesBuffer[i];
        }
    }

    // Handle tail
    for (uint32_t number = sixteenthPoints * 16; number < num_points; number++) {
        const float re = lv_creal(*src0Ptr);
        const float im = lv_cimag(*src0Ptr);
        const float sq_dist = re * re + im * im;
        if (sq_dist > max) {
            max = sq_dist;
            index = number;
        }
        src0Ptr++;
    }
    *target = index;
}

#endif /*LV_HAVE_AVX512F*/

#ifdef LV_HAVE_RVV
#include <float.h>
#include <riscv_vector.h>
#include <volk/volk_rvv_intrinsics.h> // provides RISCV_SHRINK4

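/*
 * Reading aid for the two RVV kernels below: each complex value is loaded as
 * one 64-bit unit, and vnsrl with shifts 0 and 32 narrows it into separate
 * real and imaginary vectors (the segmented-load variant uses vlseg2e32
 * instead). RISCV_SHRINK4(op, ...) is assumed to fold the four LMUL=1 parts
 * of an LMUL=4 register group with the given op (vfmax here, vminu for the
 * index), so the final reductions can run at LMUL=1.
 */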
static inline void
volk_32fc_index_max_32u_rvv(uint32_t* target, const lv_32fc_t* src0, uint32_t num_points)
{
    vfloat32m4_t vmax = __riscv_vfmv_v_f_f32m4(0, __riscv_vsetvlmax_e32m4());
    vuint32m4_t vmaxi = __riscv_vmv_v_x_u32m4(0, __riscv_vsetvlmax_e32m4());
    vuint32m4_t vidx = __riscv_vid_v_u32m4(__riscv_vsetvlmax_e32m4());
    size_t n = num_points;
    for (size_t vl; n > 0; n -= vl, src0 += vl) {
        vl = __riscv_vsetvl_e32m4(n);
        vuint64m8_t vc = __riscv_vle64_v_u64m8((const uint64_t*)src0, vl);
        vfloat32m4_t vr = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vc, 0, vl));
        vfloat32m4_t vi = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vc, 32, vl));
        vfloat32m4_t v = __riscv_vfmacc(__riscv_vfmul(vr, vr, vl), vi, vi, vl);
        vbool8_t m = __riscv_vmflt(vmax, v, vl);
        vmax = __riscv_vfmax_tu(vmax, vmax, v, vl);
        vmaxi = __riscv_vmerge_tu(vmaxi, vmaxi, vidx, m, vl);
        vidx = __riscv_vadd(vidx, vl, __riscv_vsetvlmax_e32m4());
    }
    size_t vl = __riscv_vsetvlmax_e32m4();
    float max = __riscv_vfmv_f(__riscv_vfredmax(RISCV_SHRINK4(vfmax, f, 32, vmax),
                                                __riscv_vfmv_v_f_f32m1(0, 1),
                                                __riscv_vsetvlmax_e32m1()));
    // Find lanes with max value, set others to UINT32_MAX
    vbool8_t m = __riscv_vmfeq(vmax, max, vl);
    vuint32m4_t idx_masked =
        __riscv_vmerge(__riscv_vmv_v_x_u32m4(UINT32_MAX, vl), vmaxi, m, vl);
    // Find minimum index among lanes with max value
    *target = __riscv_vmv_x(__riscv_vredminu(RISCV_SHRINK4(vminu, u, 32, idx_masked),
                                             __riscv_vmv_v_x_u32m1(UINT32_MAX, 1),
                                             __riscv_vsetvlmax_e32m1()));
}
#endif /*LV_HAVE_RVV*/

#ifdef LV_HAVE_RVVSEG
#include <float.h>
#include <riscv_vector.h>
#include <volk/volk_rvv_intrinsics.h> // provides RISCV_SHRINK4

static inline void volk_32fc_index_max_32u_rvvseg(uint32_t* target,
                                                  const lv_32fc_t* src0,
                                                  uint32_t num_points)
{
    vfloat32m4_t vmax = __riscv_vfmv_v_f_f32m4(0, __riscv_vsetvlmax_e32m4());
    vuint32m4_t vmaxi = __riscv_vmv_v_x_u32m4(0, __riscv_vsetvlmax_e32m4());
    vuint32m4_t vidx = __riscv_vid_v_u32m4(__riscv_vsetvlmax_e32m4());
    size_t n = num_points;
    for (size_t vl; n > 0; n -= vl, src0 += vl) {
        vl = __riscv_vsetvl_e32m4(n);
        vfloat32m4x2_t vc = __riscv_vlseg2e32_v_f32m4x2((const float*)src0, vl);
        vfloat32m4_t vr = __riscv_vget_f32m4(vc, 0), vi = __riscv_vget_f32m4(vc, 1);
        vfloat32m4_t v = __riscv_vfmacc(__riscv_vfmul(vr, vr, vl), vi, vi, vl);
        vbool8_t m = __riscv_vmflt(vmax, v, vl);
        vmax = __riscv_vfmax_tu(vmax, vmax, v, vl);
        vmaxi = __riscv_vmerge_tu(vmaxi, vmaxi, vidx, m, vl);
        vidx = __riscv_vadd(vidx, vl, __riscv_vsetvlmax_e32m4());
    }
    size_t vl = __riscv_vsetvlmax_e32m4();
    float max = __riscv_vfmv_f(__riscv_vfredmax(RISCV_SHRINK4(vfmax, f, 32, vmax),
                                                __riscv_vfmv_v_f_f32m1(0, 1),
                                                __riscv_vsetvlmax_e32m1()));
    // Find lanes with max value, set others to UINT32_MAX
    vbool8_t m = __riscv_vmfeq(vmax, max, vl);
    vuint32m4_t idx_masked =
        __riscv_vmerge(__riscv_vmv_v_x_u32m4(UINT32_MAX, vl), vmaxi, m, vl);
    // Find minimum index among lanes with max value
    *target = __riscv_vmv_x(__riscv_vredminu(RISCV_SHRINK4(vminu, u, 32, idx_masked),
                                             __riscv_vmv_v_x_u32m1(UINT32_MAX, 1),
                                             __riscv_vsetvlmax_e32m1()));
}
#endif /*LV_HAVE_RVVSEG*/

#endif /*INCLUDED_volk_32fc_index_max_32u_u_H*/