Vector Optimized Library of Kernels 3.3.0
Architecture-tuned implementations of math kernels
volk_32fc_index_max_16u.h
/* -*- c++ -*- */
/*
 * Copyright 2012, 2014-2016, 2018-2020 Free Software Foundation, Inc.
 *
 * This file is part of VOLK
 *
 * SPDX-License-Identifier: LGPL-3.0-or-later
 */

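/*
 * volk_32fc_index_max_16u: finds the index of the complex input element with
 * the largest magnitude. Comparisons are done on squared magnitudes
 * (re^2 + im^2), which order identically and avoid a sqrt; num_points is
 * clamped to USHRT_MAX so the result fits in 16 bits.
 *
 * Minimal usage sketch (illustrative, not upstream documentation; assumes the
 * standard VOLK dispatcher and allocation helpers):
 *
 *   #include <volk/volk.h>
 *
 *   unsigned int alignment = volk_get_alignment();
 *   lv_32fc_t* in = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t) * 64, alignment);
 *   uint16_t* max_index = (uint16_t*)volk_malloc(sizeof(uint16_t), alignment);
 *   // ... fill in[0..63] with samples ...
 *   volk_32fc_index_max_16u(max_index, in, 64);
 *   printf("peak at index %u\n", (unsigned)*max_index);
 *   volk_free(in);
 *   volk_free(max_index);
 */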
#ifndef INCLUDED_volk_32fc_index_max_16u_a_H
#define INCLUDED_volk_32fc_index_max_16u_a_H

#include <inttypes.h>
#include <limits.h>
#include <stdio.h>
#include <volk/volk_common.h>
#include <volk/volk_complex.h>

#ifdef LV_HAVE_AVX2
#include <immintrin.h>
#include <volk/volk_avx2_intrinsics.h>

static inline void volk_32fc_index_max_16u_a_avx2_variant_0(uint16_t* target,
                                                            const lv_32fc_t* src0,
                                                            uint32_t num_points)
{
    num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;

    const __m256i indices_increment = _mm256_set1_epi32(8);
    /*
     * At the start of each loop iteration current_indices holds the indices of
     * the complex numbers loaded from memory. Explanation for the odd order is
     * given in the implementation of vector_32fc_index_max_variant0().
     */
    __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
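    /*
     * Sketch of where the odd order comes from (assuming the helper computes
     * squared magnitudes with _mm256_hadd_ps, which operates within 128-bit
     * lanes): with in0 holding complex 0..3 and in1 holding complex 4..7,
     *
     *   hadd(in0*in0, in1*in1) = [ |c0|^2 |c1|^2 |c4|^2 |c5|^2    // low lane
     *                              |c2|^2 |c3|^2 |c6|^2 |c7|^2 ]  // high lane
     *
     * so element k of the magnitude vector corresponds to index
     * {0, 1, 4, 5, 2, 3, 6, 7}[k], which is exactly the _mm256_set_epi32
     * initializer above read from least- to most-significant element.
     */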

    __m256 max_values = _mm256_setzero_ps();
    __m256i max_indices = _mm256_setzero_si256();

    for (unsigned i = 0; i < num_points / 8u; ++i) {
        __m256 in0 = _mm256_load_ps((float*)src0);
        __m256 in1 = _mm256_load_ps((float*)(src0 + 4));
        vector_32fc_index_max_variant0(
            in0, in1, &max_values, &max_indices, &current_indices, indices_increment);
        src0 += 8;
    }

    // determine maximum value and index in the result of the vectorized loop
    __VOLK_ATTR_ALIGNED(32) float max_values_buffer[8];
    __VOLK_ATTR_ALIGNED(32) uint32_t max_indices_buffer[8];
    _mm256_store_ps(max_values_buffer, max_values);
    _mm256_store_si256((__m256i*)max_indices_buffer, max_indices);

    float max = 0.f;
    uint32_t index = 0;
    for (unsigned i = 0; i < 8; i++) {
        if (max_values_buffer[i] > max) {
            max = max_values_buffer[i];
            index = max_indices_buffer[i];
        }
    }

    // handle tail not processed by the vectorized loop
    for (unsigned i = num_points & (~7u); i < num_points; ++i) {
        const float abs_squared =
            lv_creal(*src0) * lv_creal(*src0) + lv_cimag(*src0) * lv_cimag(*src0);
        if (abs_squared > max) {
            max = abs_squared;
            index = i;
        }
        ++src0;
    }

    *target = index;
}

#endif /*LV_HAVE_AVX2*/

#ifdef LV_HAVE_AVX2
#include <immintrin.h>
#include <volk/volk_avx2_intrinsics.h>

static inline void volk_32fc_index_max_16u_a_avx2_variant_1(uint16_t* target,
                                                            const lv_32fc_t* src0,
                                                            uint32_t num_points)
{
    num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;

    const __m256i indices_increment = _mm256_set1_epi32(8);
    /*
     * At the start of each loop iteration current_indices holds the indices of
     * the complex numbers loaded from memory. Explanation for the odd order is
     * given in the implementation of vector_32fc_index_max_variant0().
     */
    __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);

    __m256 max_values = _mm256_setzero_ps();
    __m256i max_indices = _mm256_setzero_si256();

    for (unsigned i = 0; i < num_points / 8u; ++i) {
        __m256 in0 = _mm256_load_ps((float*)src0);
        __m256 in1 = _mm256_load_ps((float*)(src0 + 4));
        vector_32fc_index_max_variant1(
            in0, in1, &max_values, &max_indices, &current_indices, indices_increment);
        src0 += 8;
    }

    // determine maximum value and index in the result of the vectorized loop
    __VOLK_ATTR_ALIGNED(32) float max_values_buffer[8];
    __VOLK_ATTR_ALIGNED(32) uint32_t max_indices_buffer[8];
    _mm256_store_ps(max_values_buffer, max_values);
    _mm256_store_si256((__m256i*)max_indices_buffer, max_indices);

    float max = 0.f;
    uint32_t index = 0;
    for (unsigned i = 0; i < 8; i++) {
        if (max_values_buffer[i] > max) {
            max = max_values_buffer[i];
            index = max_indices_buffer[i];
        }
    }

    // handle tail not processed by the vectorized loop
    for (unsigned i = num_points & (~7u); i < num_points; ++i) {
        const float abs_squared =
            lv_creal(*src0) * lv_creal(*src0) + lv_cimag(*src0) * lv_cimag(*src0);
        if (abs_squared > max) {
            max = abs_squared;
            index = i;
        }
        ++src0;
    }

    *target = index;
}

#endif /*LV_HAVE_AVX2*/
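
/*
 * Note: variant_0 and variant_1 share this kernel body; they differ only in
 * the helper they call from volk/volk_avx2_intrinsics.h, which (assumption,
 * based on that header) implements the same compare-and-select with a
 * different instruction mix (e.g. blend vs. bitwise select). Both produce
 * identical results; volk_profile can pick the faster one per machine.
 */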

#ifdef LV_HAVE_SSE3
#include <pmmintrin.h>
#include <xmmintrin.h>

static inline void volk_32fc_index_max_16u_a_sse3(uint16_t* target,
                                                  const lv_32fc_t* src0,
                                                  uint32_t num_points)
{
    num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
    const uint32_t num_bytes = num_points * 8;

    union bit128 holderf;
    union bit128 holderi;
    float sq_dist = 0.0;

    union bit128 xmm5, xmm4;
    __m128 xmm1, xmm2, xmm3;
    __m128i xmm8, xmm11, xmm12, xmm9, xmm10;

    xmm5.int_vec = _mm_setzero_si128();
    xmm4.int_vec = _mm_setzero_si128();
    holderf.int_vec = _mm_setzero_si128();
    holderi.int_vec = _mm_setzero_si128();

    int bound = num_bytes >> 5;
    int i = 0;

    xmm8 = _mm_setr_epi32(0, 1, 2, 3);
    xmm9 = _mm_setzero_si128();
    xmm10 = _mm_setr_epi32(4, 4, 4, 4);
    xmm3 = _mm_setzero_ps();

    for (; i < bound; ++i) {
        xmm1 = _mm_load_ps((float*)src0);
        xmm2 = _mm_load_ps((float*)&src0[2]);

        src0 += 4;

        xmm1 = _mm_mul_ps(xmm1, xmm1);
        xmm2 = _mm_mul_ps(xmm2, xmm2);

        xmm1 = _mm_hadd_ps(xmm1, xmm2);

        xmm3 = _mm_max_ps(xmm1, xmm3);

        xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
        xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);

        xmm11 = _mm_and_si128(xmm8, xmm5.int_vec);
        xmm12 = _mm_and_si128(xmm9, xmm4.int_vec);

        xmm9 = _mm_add_epi32(xmm11, xmm12);

        xmm8 = _mm_add_epi32(xmm8, xmm10);
    }

    if (num_bytes >> 4 & 1) {
        xmm2 = _mm_load_ps((float*)src0);

        xmm1 = _mm_movelh_ps(bit128_p(&xmm8)->float_vec, bit128_p(&xmm8)->float_vec);
        xmm8 = bit128_p(&xmm1)->int_vec;

        xmm2 = _mm_mul_ps(xmm2, xmm2);

        src0 += 2;

        xmm1 = _mm_hadd_ps(xmm2, xmm2);

        xmm3 = _mm_max_ps(xmm1, xmm3);

        xmm10 = _mm_setr_epi32(2, 2, 2, 2);

        xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
        xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);

        xmm11 = _mm_and_si128(xmm8, xmm5.int_vec);
        xmm12 = _mm_and_si128(xmm9, xmm4.int_vec);

        xmm9 = _mm_add_epi32(xmm11, xmm12);

        xmm8 = _mm_add_epi32(xmm8, xmm10);
    }

    if (num_bytes >> 3 & 1) {
        sq_dist =
            lv_creal(src0[0]) * lv_creal(src0[0]) + lv_cimag(src0[0]) * lv_cimag(src0[0]);

        xmm2 = _mm_load1_ps(&sq_dist);

        xmm1 = xmm3;

        xmm3 = _mm_max_ss(xmm3, xmm2);

        xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
        xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);

        xmm8 = _mm_shuffle_epi32(xmm8, 0x00);

        xmm11 = _mm_and_si128(xmm8, xmm4.int_vec);
        xmm12 = _mm_and_si128(xmm9, xmm5.int_vec);

        xmm9 = _mm_add_epi32(xmm11, xmm12);
    }
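
    // Note: in this last block xmm1 holds the previous running maxima rather
    // than the new candidate, so cmplt pairs with the new index (xmm8) and
    // cmpeq with the old indices (xmm9), the opposite pairing of the main
    // loop. This is consistent with the swapped comparison operands, not a
    // typo.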

    _mm_store_ps((float*)&(holderf.f), xmm3);
    _mm_store_si128(&(holderi.int_vec), xmm9);

    target[0] = holderi.i[0];
    sq_dist = holderf.f[0];
    target[0] = (holderf.f[1] > sq_dist) ? holderi.i[1] : target[0];
    sq_dist = (holderf.f[1] > sq_dist) ? holderf.f[1] : sq_dist;
    target[0] = (holderf.f[2] > sq_dist) ? holderi.i[2] : target[0];
    sq_dist = (holderf.f[2] > sq_dist) ? holderf.f[2] : sq_dist;
    target[0] = (holderf.f[3] > sq_dist) ? holderi.i[3] : target[0];
    sq_dist = (holderf.f[3] > sq_dist) ? holderf.f[3] : sq_dist;
}

#endif /*LV_HAVE_SSE3*/
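
/*
 * For reference, the branch-free index update used by the SSE3 loop is the
 * vector form of this scalar idiom (illustrative sketch, not part of the
 * kernel; mag2/cur_idx/old_idx are hypothetical names):
 *
 *   float new_max = (mag2 > max) ? mag2 : max;   // _mm_max_ps
 *   int eq = (mag2 == new_max) ? ~0 : 0;         // _mm_cmpeq_ps
 *   int lt = (mag2 <  new_max) ? ~0 : 0;         // _mm_cmplt_ps
 *   idx = (cur_idx & eq) + (old_idx & lt);       // _mm_and_si128 + add
 *
 * Exactly one of the two masks is all-ones per lane, so the add acts as a
 * select.
 */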


#ifdef LV_HAVE_NEON
#include <arm_neon.h>
#include <float.h>
#include <limits.h>

static inline void
volk_32fc_index_max_16u_neon(uint16_t* target, const lv_32fc_t* src0, uint32_t num_points)
{
    num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;

    if (num_points == 0)
        return;

    const uint32_t quarter_points = num_points / 4;
    const lv_32fc_t* inputPtr = src0;

    // Use integer indices directly
    uint32x4_t vec_indices = { 0, 1, 2, 3 };
    const uint32x4_t vec_incr = vdupq_n_u32(4);

    float32x4_t vec_max = vdupq_n_f32(0.0f);
    uint32x4_t vec_max_idx = vdupq_n_u32(0);

    for (uint32_t i = 0; i < quarter_points; i++) {
        // Load and deinterleave complex values
        float32x4x2_t cplx = vld2q_f32((const float*)inputPtr);
        inputPtr += 4;

        // Magnitude squared: re*re + im*im
        float32x4_t mag2 =
            vmlaq_f32(vmulq_f32(cplx.val[0], cplx.val[0]), cplx.val[1], cplx.val[1]);

        // Compare BEFORE max update to know which lanes change
        uint32x4_t gt_mask = vcgtq_f32(mag2, vec_max);
        vec_max_idx = vbslq_u32(gt_mask, vec_indices, vec_max_idx);

        // vmaxq_f32 is single-cycle, no dependency on comparison result
        vec_max = vmaxq_f32(mag2, vec_max);

        vec_indices = vaddq_u32(vec_indices, vec_incr);
    }

    // Scalar reduction
    __VOLK_ATTR_ALIGNED(16) float max_buf[4];
    __VOLK_ATTR_ALIGNED(16) uint32_t idx_buf[4];
    vst1q_f32(max_buf, vec_max);
    vst1q_u32(idx_buf, vec_max_idx);

    float max_val = max_buf[0];
    uint32_t result_idx = idx_buf[0];
    for (int i = 1; i < 4; i++) {
        if (max_buf[i] > max_val) {
            max_val = max_buf[i];
            result_idx = idx_buf[i];
        } else if (max_buf[i] == max_val && idx_buf[i] < result_idx) {
            result_idx = idx_buf[i];
        }
    }

    // Handle tail
    for (uint32_t i = quarter_points * 4; i < num_points; i++) {
        float re = lv_creal(src0[i]);
        float im = lv_cimag(src0[i]);
        float mag2 = re * re + im * im;
        if (mag2 > max_val) {
            max_val = mag2;
            result_idx = i;
        }
    }

    *target = (uint16_t)result_idx;
}

#endif /*LV_HAVE_NEON*/


#ifdef LV_HAVE_NEONV8
#include <arm_neon.h>
#include <float.h>
#include <limits.h>

static inline void volk_32fc_index_max_16u_neonv8(uint16_t* target,
                                                  const lv_32fc_t* src0,
                                                  uint32_t num_points)
{
    num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;

    if (num_points == 0)
        return;

    const uint32_t quarter_points = num_points / 4;
    const lv_32fc_t* inputPtr = src0;

    // Use integer indices directly (no float conversion overhead)
    uint32x4_t vec_indices = { 0, 1, 2, 3 };
    const uint32x4_t vec_incr = vdupq_n_u32(4);

    float32x4_t vec_max = vdupq_n_f32(0.0f);
    uint32x4_t vec_max_idx = vdupq_n_u32(0);

    for (uint32_t i = 0; i < quarter_points; i++) {
        // Load and deinterleave complex values
        float32x4x2_t cplx = vld2q_f32((const float*)inputPtr);
        inputPtr += 4;

        // Magnitude squared using FMA: re*re + im*im
        float32x4_t mag2 =
            vfmaq_f32(vmulq_f32(cplx.val[0], cplx.val[0]), cplx.val[1], cplx.val[1]);

        // Compare BEFORE max update to know which lanes change
        uint32x4_t gt_mask = vcgtq_f32(mag2, vec_max);
        vec_max_idx = vbslq_u32(gt_mask, vec_indices, vec_max_idx);

        // vmaxq_f32 is single-cycle, no dependency on comparison result
        vec_max = vmaxq_f32(mag2, vec_max);

        vec_indices = vaddq_u32(vec_indices, vec_incr);
    }

    // ARMv8 horizontal reduction
    float max_val = vmaxvq_f32(vec_max);
    uint32x4_t max_mask = vceqq_f32(vec_max, vdupq_n_f32(max_val));
    uint32x4_t idx_masked = vbslq_u32(max_mask, vec_max_idx, vdupq_n_u32(UINT32_MAX));
    uint32_t result_idx = vminvq_u32(idx_masked);

    // Handle tail
    for (uint32_t i = quarter_points * 4; i < num_points; i++) {
        float re = lv_creal(src0[i]);
        float im = lv_cimag(src0[i]);
        float mag2 = re * re + im * im;
        if (mag2 > max_val) {
            max_val = mag2;
            result_idx = i;
        }
    }

    *target = (uint16_t)result_idx;
}

#endif /*LV_HAVE_NEONV8*/
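
/*
 * The ARMv8 horizontal reduction above is the vector form of this scalar
 * logic (illustrative sketch; lane_val/lane_idx stand for the four lanes of
 * vec_max/vec_max_idx): take the smallest index among lanes holding the
 * maximum, using UINT32_MAX as the identity element of the min.
 *
 *   uint32_t best = UINT32_MAX;
 *   for (int k = 0; k < 4; k++) {
 *       uint32_t cand = (lane_val[k] == max_val) ? lane_idx[k] : UINT32_MAX;
 *       if (cand < best)
 *           best = cand;
 *   }
 */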


#ifdef LV_HAVE_GENERIC
static inline void volk_32fc_index_max_16u_generic(uint16_t* target,
                                                   const lv_32fc_t* src0,
                                                   uint32_t num_points)
{
    num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;

    const uint32_t num_bytes = num_points * 8;

    float sq_dist = 0.0;
    float max = 0.0;
    uint16_t index = 0;

    uint32_t i = 0;

    for (; i < (num_bytes >> 3); ++i) {
        sq_dist =
            lv_creal(src0[i]) * lv_creal(src0[i]) + lv_cimag(src0[i]) * lv_cimag(src0[i]);

        if (sq_dist > max) {
            index = i;
            max = sq_dist;
        }
    }
    target[0] = index;
}

#endif /*LV_HAVE_GENERIC*/
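
/*
 * Worked example for the generic kernel (illustrative): for the input
 * { 1+1i, 3+4i, 2+2i } the squared magnitudes are { 2, 25, 8 }, so the
 * kernel writes index 1. Comparing squared magnitudes is equivalent to
 * comparing magnitudes because x -> x*x is monotonic for x >= 0, which is
 * why no sqrt is needed. Note that num_bytes >> 3 equals num_points, since
 * each lv_32fc_t occupies 8 bytes.
 */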

#ifdef LV_HAVE_AVX512F
#include <immintrin.h>
#include <limits.h>

static inline void volk_32fc_index_max_16u_a_avx512f(uint16_t* target,
                                                     const lv_32fc_t* src0,
                                                     uint32_t num_points)
{
    num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;

    const lv_32fc_t* src0Ptr = src0;
    const uint32_t sixteenthPoints = num_points / 16;

    // Index ordering after shuffle: [0,1,8,9, 2,3,10,11, 4,5,12,13, 6,7,14,15]
    __m512 currentIndexes =
        _mm512_setr_ps(0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15);
    const __m512 indexIncrement = _mm512_set1_ps(16);

    __m512 maxValues = _mm512_setzero_ps();
    __m512 maxIndices = _mm512_setzero_ps();
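
    // Indices are tracked in float lanes; this stays exact because num_points
    // is clamped to USHRT_MAX (65535), well below 2^24, the limit up to which
    // a float represents consecutive integers exactly.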

    for (uint32_t number = 0; number < sixteenthPoints; number++) {
        // Load 16 complex values (32 floats)
        __m512 in0 = _mm512_load_ps((const float*)src0Ptr);
        __m512 in1 = _mm512_load_ps((const float*)(src0Ptr + 8));
        src0Ptr += 16;

        // Square all values
        in0 = _mm512_mul_ps(in0, in0);
        in1 = _mm512_mul_ps(in1, in1);

        // Add adjacent pairs (re² + im²) using within-lane shuffle
        // 0xB1 = _MM_SHUFFLE(2,3,0,1) swaps adjacent elements
        __m512 sw0 = _mm512_shuffle_ps(in0, in0, 0xB1);
        __m512 sw1 = _mm512_shuffle_ps(in1, in1, 0xB1);
        __m512 sum0 = _mm512_add_ps(in0, sw0);
        __m512 sum1 = _mm512_add_ps(in1, sw1);

        // Compact: pick elements 0,2 from sum0 and sum1 per 128-bit lane
        // 0x88 = _MM_SHUFFLE(2,0,2,0)
        __m512 mag_sq = _mm512_shuffle_ps(sum0, sum1, 0x88);
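
        // Lane 0 of mag_sq, for example, now holds
        // { |c0|², |c1|², |c8|², |c9|² }: elements 0,2 of sum0 lane 0, then
        // elements 0,2 of sum1 lane 0. This is where the index ordering in
        // currentIndexes comes from.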

        // Compare and update maximums
        __mmask16 cmpMask = _mm512_cmp_ps_mask(mag_sq, maxValues, _CMP_GT_OS);
        maxIndices = _mm512_mask_blend_ps(cmpMask, maxIndices, currentIndexes);
        maxValues = _mm512_max_ps(mag_sq, maxValues);

        currentIndexes = _mm512_add_ps(currentIndexes, indexIncrement);
    }

    // Reduce 16 values to find maximum
    __VOLK_ATTR_ALIGNED(64) float maxValuesBuffer[16];
    __VOLK_ATTR_ALIGNED(64) float maxIndexesBuffer[16];
    _mm512_store_ps(maxValuesBuffer, maxValues);
    _mm512_store_ps(maxIndexesBuffer, maxIndices);

    float max = 0.0f;
    uint32_t index = 0;
    for (uint32_t i = 0; i < 16; i++) {
        if (maxValuesBuffer[i] > max) {
            max = maxValuesBuffer[i];
            index = (uint32_t)maxIndexesBuffer[i];
        } else if (maxValuesBuffer[i] == max) {
            if ((uint32_t)maxIndexesBuffer[i] < index)
                index = (uint32_t)maxIndexesBuffer[i];
        }
    }

    // Handle tail
    for (uint32_t number = sixteenthPoints * 16; number < num_points; number++) {
        const float re = lv_creal(*src0Ptr);
        const float im = lv_cimag(*src0Ptr);
        const float sq_dist = re * re + im * im;
        if (sq_dist > max) {
            max = sq_dist;
            index = number;
        }
        src0Ptr++;
    }
    *target = (uint16_t)index;
}

#endif /*LV_HAVE_AVX512F*/

#endif /*INCLUDED_volk_32fc_index_max_16u_a_H*/

#ifndef INCLUDED_volk_32fc_index_max_16u_u_H
#define INCLUDED_volk_32fc_index_max_16u_u_H

#include <inttypes.h>
#include <limits.h>
#include <stdio.h>
#include <volk/volk_common.h>
#include <volk/volk_complex.h>

#ifdef LV_HAVE_AVX2
#include <immintrin.h>
#include <volk/volk_avx2_intrinsics.h>

static inline void volk_32fc_index_max_16u_u_avx2_variant_0(uint16_t* target,
                                                            const lv_32fc_t* src0,
                                                            uint32_t num_points)
{
    num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;

    const __m256i indices_increment = _mm256_set1_epi32(8);
    /*
     * At the start of each loop iteration current_indices holds the indices of
     * the complex numbers loaded from memory. Explanation for the odd order is
     * given in the implementation of vector_32fc_index_max_variant0().
     */
    __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);

    __m256 max_values = _mm256_setzero_ps();
    __m256i max_indices = _mm256_setzero_si256();

    for (unsigned i = 0; i < num_points / 8u; ++i) {
        __m256 in0 = _mm256_loadu_ps((float*)src0);
        __m256 in1 = _mm256_loadu_ps((float*)(src0 + 4));
        vector_32fc_index_max_variant0(
            in0, in1, &max_values, &max_indices, &current_indices, indices_increment);
        src0 += 8;
    }

    // determine maximum value and index in the result of the vectorized loop
    __VOLK_ATTR_ALIGNED(32) float max_values_buffer[8];
    __VOLK_ATTR_ALIGNED(32) uint32_t max_indices_buffer[8];
    _mm256_store_ps(max_values_buffer, max_values);
    _mm256_store_si256((__m256i*)max_indices_buffer, max_indices);

    float max = 0.f;
    uint32_t index = 0;
    for (unsigned i = 0; i < 8; i++) {
        if (max_values_buffer[i] > max) {
            max = max_values_buffer[i];
            index = max_indices_buffer[i];
        }
    }

    // handle tail not processed by the vectorized loop
    for (unsigned i = num_points & (~7u); i < num_points; ++i) {
        const float abs_squared =
            lv_creal(*src0) * lv_creal(*src0) + lv_cimag(*src0) * lv_cimag(*src0);
        if (abs_squared > max) {
            max = abs_squared;
            index = i;
        }
        ++src0;
    }

    *target = index;
}

#endif /*LV_HAVE_AVX2*/

#ifdef LV_HAVE_AVX2
#include <immintrin.h>
#include <volk/volk_avx2_intrinsics.h>

static inline void volk_32fc_index_max_16u_u_avx2_variant_1(uint16_t* target,
                                                            const lv_32fc_t* src0,
                                                            uint32_t num_points)
{
    num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;

    const __m256i indices_increment = _mm256_set1_epi32(8);
    /*
     * At the start of each loop iteration current_indices holds the indices of
     * the complex numbers loaded from memory. Explanation for the odd order is
     * given in the implementation of vector_32fc_index_max_variant0().
     */
    __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);

    __m256 max_values = _mm256_setzero_ps();
    __m256i max_indices = _mm256_setzero_si256();

    for (unsigned i = 0; i < num_points / 8u; ++i) {
        __m256 in0 = _mm256_loadu_ps((float*)src0);
        __m256 in1 = _mm256_loadu_ps((float*)(src0 + 4));
        vector_32fc_index_max_variant1(
            in0, in1, &max_values, &max_indices, &current_indices, indices_increment);
        src0 += 8;
    }

    // determine maximum value and index in the result of the vectorized loop
    __VOLK_ATTR_ALIGNED(32) float max_values_buffer[8];
    __VOLK_ATTR_ALIGNED(32) uint32_t max_indices_buffer[8];
    _mm256_store_ps(max_values_buffer, max_values);
    _mm256_store_si256((__m256i*)max_indices_buffer, max_indices);

    float max = 0.f;
    uint32_t index = 0;
    for (unsigned i = 0; i < 8; i++) {
        if (max_values_buffer[i] > max) {
            max = max_values_buffer[i];
            index = max_indices_buffer[i];
        }
    }

    // handle tail not processed by the vectorized loop
    for (unsigned i = num_points & (~7u); i < num_points; ++i) {
        const float abs_squared =
            lv_creal(*src0) * lv_creal(*src0) + lv_cimag(*src0) * lv_cimag(*src0);
        if (abs_squared > max) {
            max = abs_squared;
            index = i;
        }
        ++src0;
    }

    *target = index;
}

#endif /*LV_HAVE_AVX2*/

#ifdef LV_HAVE_AVX512F
#include <immintrin.h>
#include <limits.h>

static inline void volk_32fc_index_max_16u_u_avx512f(uint16_t* target,
                                                     const lv_32fc_t* src0,
                                                     uint32_t num_points)
{
    num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;

    const lv_32fc_t* src0Ptr = src0;
    const uint32_t sixteenthPoints = num_points / 16;

    // Index ordering after shuffle: [0,1,8,9, 2,3,10,11, 4,5,12,13, 6,7,14,15]
    __m512 currentIndexes =
        _mm512_setr_ps(0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15);
    const __m512 indexIncrement = _mm512_set1_ps(16);

    __m512 maxValues = _mm512_setzero_ps();
    __m512 maxIndices = _mm512_setzero_ps();

    for (uint32_t number = 0; number < sixteenthPoints; number++) {
        // Load 16 complex values (32 floats)
        __m512 in0 = _mm512_loadu_ps((const float*)src0Ptr);
        __m512 in1 = _mm512_loadu_ps((const float*)(src0Ptr + 8));
        src0Ptr += 16;

        // Square all values
        in0 = _mm512_mul_ps(in0, in0);
        in1 = _mm512_mul_ps(in1, in1);

        // Add adjacent pairs (re² + im²) using within-lane shuffle
        // 0xB1 = _MM_SHUFFLE(2,3,0,1) swaps adjacent elements
        __m512 sw0 = _mm512_shuffle_ps(in0, in0, 0xB1);
        __m512 sw1 = _mm512_shuffle_ps(in1, in1, 0xB1);
        __m512 sum0 = _mm512_add_ps(in0, sw0);
        __m512 sum1 = _mm512_add_ps(in1, sw1);

        // Compact: pick elements 0,2 from sum0 and sum1 per 128-bit lane
        // 0x88 = _MM_SHUFFLE(2,0,2,0)
        __m512 mag_sq = _mm512_shuffle_ps(sum0, sum1, 0x88);

        // Compare and update maximums
        __mmask16 cmpMask = _mm512_cmp_ps_mask(mag_sq, maxValues, _CMP_GT_OS);
        maxIndices = _mm512_mask_blend_ps(cmpMask, maxIndices, currentIndexes);
        maxValues = _mm512_max_ps(mag_sq, maxValues);

        currentIndexes = _mm512_add_ps(currentIndexes, indexIncrement);
    }

    // Reduce 16 values to find maximum
    __VOLK_ATTR_ALIGNED(64) float maxValuesBuffer[16];
    __VOLK_ATTR_ALIGNED(64) float maxIndexesBuffer[16];
    _mm512_store_ps(maxValuesBuffer, maxValues);
    _mm512_store_ps(maxIndexesBuffer, maxIndices);

    float max = 0.0f;
    uint32_t index = 0;
    for (uint32_t i = 0; i < 16; i++) {
        if (maxValuesBuffer[i] > max) {
            max = maxValuesBuffer[i];
            index = (uint32_t)maxIndexesBuffer[i];
        } else if (maxValuesBuffer[i] == max) {
            if ((uint32_t)maxIndexesBuffer[i] < index)
                index = (uint32_t)maxIndexesBuffer[i];
        }
    }

    // Handle tail
    for (uint32_t number = sixteenthPoints * 16; number < num_points; number++) {
        const float re = lv_creal(*src0Ptr);
        const float im = lv_cimag(*src0Ptr);
        const float sq_dist = re * re + im * im;
        if (sq_dist > max) {
            max = sq_dist;
            index = number;
        }
        src0Ptr++;
    }
    *target = (uint16_t)index;
}

#endif /*LV_HAVE_AVX512F*/

#ifdef LV_HAVE_RVV
#include <float.h>
#include <riscv_vector.h>
#include <volk/volk_rvv_intrinsics.h> /* for RISCV_SHRINK4 */

static inline void
volk_32fc_index_max_16u_rvv(uint16_t* target, const lv_32fc_t* src0, uint32_t num_points)
{
    vfloat32m4_t vmax = __riscv_vfmv_v_f_f32m4(0, __riscv_vsetvlmax_e32m4());
    vuint16m2_t vmaxi = __riscv_vmv_v_x_u16m2(0, __riscv_vsetvlmax_e16m2());
    vuint16m2_t vidx = __riscv_vid_v_u16m2(__riscv_vsetvlmax_e16m2());
    size_t n = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
    for (size_t vl; n > 0; n -= vl, src0 += vl) {
        vl = __riscv_vsetvl_e32m4(n);
        vuint64m8_t vc = __riscv_vle64_v_u64m8((const uint64_t*)src0, vl);
        vfloat32m4_t vr = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vc, 0, vl));
        vfloat32m4_t vi = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vc, 32, vl));
        vfloat32m4_t v = __riscv_vfmacc(__riscv_vfmul(vr, vr, vl), vi, vi, vl);
        vbool8_t m = __riscv_vmflt(vmax, v, vl);
        vmax = __riscv_vfmax_tu(vmax, vmax, v, vl);
        vmaxi = __riscv_vmerge_tu(vmaxi, vmaxi, vidx, m, vl);
        vidx = __riscv_vadd(vidx, vl, __riscv_vsetvlmax_e16m2());
    }
    size_t vl = __riscv_vsetvlmax_e32m4();
    float max = __riscv_vfmv_f(__riscv_vfredmax(RISCV_SHRINK4(vfmax, f, 32, vmax),
                                                __riscv_vfmv_v_f_f32m1(0, 1),
                                                __riscv_vsetvlmax_e32m1()));
    // Find the lowest index among lanes holding the max value with a scalar
    // scan over the spilled vector registers (buffers sized for VLEN <= 1024,
    // i.e. at most 128 e32m4 elements)
    __attribute__((aligned(32))) float values[128];
    __attribute__((aligned(32))) uint16_t indices[128];
    __riscv_vse32(values, vmax, vl);
    __riscv_vse16(indices, vmaxi, vl);
    uint16_t min_idx = UINT16_MAX;
    for (size_t i = 0; i < vl; i++) {
        if (values[i] == max && indices[i] < min_idx) {
            min_idx = indices[i];
        }
    }
    *target = min_idx;
}
#endif /*LV_HAVE_RVV*/
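
/*
 * The 64-bit element loads above exploit the layout of lv_32fc_t: each
 * complex value is 8 bytes (re, im as adjacent 32-bit floats), so one u64
 * lane holds one complex number and the two narrowing shifts split it.
 * Illustrative scalar equivalent (assumes the little-endian layout RVV
 * targets use):
 *
 *   uint64_t c = ((const uint64_t*)src0)[i];
 *   uint32_t re_bits = (uint32_t)c;           // __riscv_vnsrl(vc, 0, vl)
 *   uint32_t im_bits = (uint32_t)(c >> 32);   // __riscv_vnsrl(vc, 32, vl)
 *   float re, im;
 *   memcpy(&re, &re_bits, sizeof(re));
 *   memcpy(&im, &im_bits, sizeof(im));
 */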

#ifdef LV_HAVE_RVVSEG
#include <float.h>
#include <riscv_vector.h>
#include <volk/volk_rvv_intrinsics.h> /* for RISCV_SHRINK4 */

static inline void volk_32fc_index_max_16u_rvvseg(uint16_t* target,
                                                  const lv_32fc_t* src0,
                                                  uint32_t num_points)
{
    vfloat32m4_t vmax = __riscv_vfmv_v_f_f32m4(0, __riscv_vsetvlmax_e32m4());
    vuint16m2_t vmaxi = __riscv_vmv_v_x_u16m2(0, __riscv_vsetvlmax_e16m2());
    vuint16m2_t vidx = __riscv_vid_v_u16m2(__riscv_vsetvlmax_e16m2());
    size_t n = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
    for (size_t vl; n > 0; n -= vl, src0 += vl) {
        vl = __riscv_vsetvl_e32m4(n);
        vfloat32m4x2_t vc = __riscv_vlseg2e32_v_f32m4x2((const float*)src0, vl);
        vfloat32m4_t vr = __riscv_vget_f32m4(vc, 0), vi = __riscv_vget_f32m4(vc, 1);
        vfloat32m4_t v = __riscv_vfmacc(__riscv_vfmul(vr, vr, vl), vi, vi, vl);
        vbool8_t m = __riscv_vmflt(vmax, v, vl);
        vmax = __riscv_vfmax_tu(vmax, vmax, v, vl);
        vmaxi = __riscv_vmerge_tu(vmaxi, vmaxi, vidx, m, vl);
        vidx = __riscv_vadd(vidx, vl, __riscv_vsetvlmax_e16m2());
    }
    size_t vl = __riscv_vsetvlmax_e32m4();
    float max = __riscv_vfmv_f(__riscv_vfredmax(RISCV_SHRINK4(vfmax, f, 32, vmax),
                                                __riscv_vfmv_v_f_f32m1(0, 1),
                                                __riscv_vsetvlmax_e32m1()));
    // Find the lowest index among lanes holding the max value with a scalar
    // scan over the spilled vector registers (buffers sized for VLEN <= 1024,
    // i.e. at most 128 e32m4 elements)
    __attribute__((aligned(32))) float values[128];
    __attribute__((aligned(32))) uint16_t indices[128];
    __riscv_vse32(values, vmax, vl);
    __riscv_vse16(indices, vmaxi, vl);
    uint16_t min_idx = UINT16_MAX;
    for (size_t i = 0; i < vl; i++) {
        if (values[i] == max && indices[i] < min_idx) {
            min_idx = indices[i];
        }
    }
    *target = min_idx;
}
#endif /*LV_HAVE_RVVSEG*/

#endif /*INCLUDED_volk_32fc_index_max_16u_u_H*/