Vector Optimized Library of Kernels 3.3.0
Architecture-tuned implementations of math kernels
volk_32fc_index_min_32u.h
/* -*- c++ -*- */
/*
 * Copyright 2021 Free Software Foundation, Inc.
 *
 * This file is part of VOLK
 *
 * SPDX-License-Identifier: LGPL-3.0-or-later
 */

#ifndef INCLUDED_volk_32fc_index_min_32u_a_H
#define INCLUDED_volk_32fc_index_min_32u_a_H

#include <float.h> /* FLT_MAX, used by the kernels below */
#include <inttypes.h>
#include <stdio.h>
#include <volk/volk_common.h>
#include <volk/volk_complex.h>

#ifdef LV_HAVE_AVX2
#include <immintrin.h>
#include <volk/volk_avx2_intrinsics.h>

static inline void volk_32fc_index_min_32u_a_avx2_variant_0(uint32_t* target,
                                                            const lv_32fc_t* source,
                                                            uint32_t num_points)
{
    const __m256i indices_increment = _mm256_set1_epi32(8);
    /*
     * At the start of each loop iteration current_indices holds the indices of
     * the complex numbers loaded from memory. The explanation for the odd order
     * is given in the implementation of vector_32fc_index_min_variant0()
     * (see the sketch after this kernel).
     */
    __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);

    __m256 min_values = _mm256_set1_ps(FLT_MAX);
    __m256i min_indices = _mm256_setzero_si256();

    for (unsigned i = 0; i < num_points / 8u; ++i) {
        __m256 in0 = _mm256_load_ps((float*)source);
        __m256 in1 = _mm256_load_ps((float*)(source + 4));
        vector_32fc_index_min_variant0(
            in0, in1, &min_values, &min_indices, &current_indices, indices_increment);
        source += 8;
    }

    // determine minimum value and index in the result of the vectorized loop
    __VOLK_ATTR_ALIGNED(32) float min_values_buffer[8];
    __VOLK_ATTR_ALIGNED(32) uint32_t min_indices_buffer[8];
    _mm256_store_ps(min_values_buffer, min_values);
    _mm256_store_si256((__m256i*)min_indices_buffer, min_indices);

    float min = FLT_MAX;
    uint32_t index = 0;
    for (unsigned i = 0; i < 8; i++) {
        if (min_values_buffer[i] < min) {
            min = min_values_buffer[i];
            index = min_indices_buffer[i];
        }
    }

    // handle the tail not processed by the vectorized loop
    for (unsigned i = num_points & (~7u); i < num_points; ++i) {
        const float abs_squared =
            lv_creal(*source) * lv_creal(*source) + lv_cimag(*source) * lv_cimag(*source);
        if (abs_squared < min) {
            min = abs_squared;
            index = i;
        }
        ++source;
    }

    *target = index;
}

#endif /*LV_HAVE_AVX2*/
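/*
 * vector_32fc_index_min_variant0() is not defined in this file; it comes from
 * <volk/volk_avx2_intrinsics.h>. The sketch below (illustrative, not the verbatim
 * upstream helper) shows the idea and why current_indices uses the odd order
 * 7, 6, 3, 2, 5, 4, 1, 0: _mm256_hadd_ps() interleaves its two inputs per 128-bit
 * lane, so the eight squared magnitudes come out in element order 0, 1, 4, 5,
 * 2, 3, 6, 7. Numbering the index register in that same shuffled order keeps
 * values and indices aligned without an extra permute.
 *
 *   static inline void index_min_step_sketch(__m256 in0,  // complex samples 0..3
 *                                            __m256 in1,  // complex samples 4..7
 *                                            __m256* min_values,
 *                                            __m256i* min_indices,
 *                                            __m256i* current_indices,
 *                                            __m256i indices_increment)
 *   {
 *       in0 = _mm256_mul_ps(in0, in0);             // re^2, im^2 pairs
 *       in1 = _mm256_mul_ps(in1, in1);
 *       __m256 abs_sq = _mm256_hadd_ps(in0, in1);  // |z|^2 in shuffled lane order
 *       __m256 lt = _mm256_cmp_ps(abs_sq, *min_values, _CMP_LT_OS);
 *       *min_indices = _mm256_blendv_epi8(
 *           *min_indices, *current_indices, _mm256_castps_si256(lt));
 *       *min_values = _mm256_min_ps(abs_sq, *min_values);
 *       *current_indices = _mm256_add_epi32(*current_indices, indices_increment);
 *   }
 *
 * The variant_0 and variant_1 kernels below call their helpers with identical
 * signatures; the helpers presumably differ only in the instruction mix used to
 * update the running minima.
 */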

#ifdef LV_HAVE_AVX2
#include <immintrin.h>
#include <volk/volk_avx2_intrinsics.h>

static inline void volk_32fc_index_min_32u_a_avx2_variant_1(uint32_t* target,
                                                            const lv_32fc_t* source,
                                                            uint32_t num_points)
{
    const __m256i indices_increment = _mm256_set1_epi32(8);
    /*
     * At the start of each loop iteration current_indices holds the indices of
     * the complex numbers loaded from memory. The explanation for the odd order
     * is given in the implementation of vector_32fc_index_min_variant0().
     */
    __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);

    __m256 min_values = _mm256_set1_ps(FLT_MAX);
    __m256i min_indices = _mm256_setzero_si256();

    for (unsigned i = 0; i < num_points / 8u; ++i) {
        __m256 in0 = _mm256_load_ps((float*)source);
        __m256 in1 = _mm256_load_ps((float*)(source + 4));
        vector_32fc_index_min_variant1(
            in0, in1, &min_values, &min_indices, &current_indices, indices_increment);
        source += 8;
    }

    // determine minimum value and index in the result of the vectorized loop
    __VOLK_ATTR_ALIGNED(32) float min_values_buffer[8];
    __VOLK_ATTR_ALIGNED(32) uint32_t min_indices_buffer[8];
    _mm256_store_ps(min_values_buffer, min_values);
    _mm256_store_si256((__m256i*)min_indices_buffer, min_indices);

    float min = FLT_MAX;
    uint32_t index = 0;
    for (unsigned i = 0; i < 8; i++) {
        if (min_values_buffer[i] < min) {
            min = min_values_buffer[i];
            index = min_indices_buffer[i];
        }
    }

    // handle the tail not processed by the vectorized loop
    for (unsigned i = num_points & (~7u); i < num_points; ++i) {
        const float abs_squared =
            lv_creal(*source) * lv_creal(*source) + lv_cimag(*source) * lv_cimag(*source);
        if (abs_squared < min) {
            min = abs_squared;
            index = i;
        }
        ++source;
    }

    *target = index;
}

#endif /*LV_HAVE_AVX2*/

#ifdef LV_HAVE_SSE3
#include <pmmintrin.h>
#include <xmmintrin.h>

static inline void volk_32fc_index_min_32u_a_sse3(uint32_t* target,
                                                  const lv_32fc_t* source,
                                                  uint32_t num_points)
{
    union bit128 holderf;
    union bit128 holderi;
    float sq_dist = 0.0;

    union bit128 xmm5, xmm4;
    __m128 xmm1, xmm2, xmm3;
    __m128i xmm8, xmm11, xmm12, xmm9, xmm10;

    xmm5.int_vec = _mm_setzero_si128();
    xmm4.int_vec = _mm_setzero_si128();
    holderf.int_vec = _mm_setzero_si128();
    holderi.int_vec = _mm_setzero_si128();

    xmm8 = _mm_setr_epi32(0, 1, 2, 3); // indices of the current four samples
    xmm9 = _mm_setzero_si128();        // per-lane indices of the running minima
    xmm10 = _mm_setr_epi32(4, 4, 4, 4); // index increment per iteration
    xmm3 = _mm_set_ps1(FLT_MAX);       // per-lane running minima of |z|^2

    int bound = num_points >> 2;

    // process four complex samples per iteration
    for (int i = 0; i < bound; ++i) {
        xmm1 = _mm_load_ps((float*)source);
        xmm2 = _mm_load_ps((float*)&source[2]);

        source += 4;

        xmm1 = _mm_mul_ps(xmm1, xmm1);
        xmm2 = _mm_mul_ps(xmm2, xmm2);

        xmm1 = _mm_hadd_ps(xmm1, xmm2); // |z|^2 of the four samples

        xmm3 = _mm_min_ps(xmm1, xmm3);

        // xmm5: lanes where the new value is the minimum; xmm4: lanes where the
        // previous minimum survives
        xmm4.float_vec = _mm_cmpgt_ps(xmm1, xmm3);
        xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);

        xmm11 = _mm_and_si128(xmm8, xmm5.int_vec); // take the new index where it won
        xmm12 = _mm_and_si128(xmm9, xmm4.int_vec); // keep the old index elsewhere

        xmm9 = _mm_add_epi32(xmm11, xmm12);

        xmm8 = _mm_add_epi32(xmm8, xmm10);
    }

    // handle a leftover pair of samples
    if (num_points >> 1 & 1) {
        xmm2 = _mm_load_ps((float*)source);

        xmm1 = _mm_movelh_ps(bit128_p(&xmm8)->float_vec, bit128_p(&xmm8)->float_vec);
        xmm8 = bit128_p(&xmm1)->int_vec;

        xmm2 = _mm_mul_ps(xmm2, xmm2);

        source += 2;

        xmm1 = _mm_hadd_ps(xmm2, xmm2);

        xmm3 = _mm_min_ps(xmm1, xmm3);

        xmm10 = _mm_setr_epi32(2, 2, 2, 2);

        xmm4.float_vec = _mm_cmpgt_ps(xmm1, xmm3);
        xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);

        xmm11 = _mm_and_si128(xmm8, xmm5.int_vec);
        xmm12 = _mm_and_si128(xmm9, xmm4.int_vec);

        xmm9 = _mm_add_epi32(xmm11, xmm12);

        xmm8 = _mm_add_epi32(xmm8, xmm10);
    }

    // handle a final single sample
    if (num_points & 1) {
        sq_dist = lv_creal(source[0]) * lv_creal(source[0]) +
                  lv_cimag(source[0]) * lv_cimag(source[0]);

        xmm2 = _mm_load1_ps(&sq_dist);

        xmm1 = xmm3;

        xmm3 = _mm_min_ss(xmm3, xmm2);

        xmm4.float_vec = _mm_cmpgt_ps(xmm1, xmm3);
        xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);

        xmm8 = _mm_shuffle_epi32(xmm8, 0x00);

        xmm11 = _mm_and_si128(xmm8, xmm4.int_vec);
        xmm12 = _mm_and_si128(xmm9, xmm5.int_vec);

        xmm9 = _mm_add_epi32(xmm11, xmm12);
    }

    // scalar reduction across the four lanes
    _mm_store_ps((float*)&(holderf.f), xmm3);
    _mm_store_si128(&(holderi.int_vec), xmm9);

    target[0] = holderi.i[0];
    sq_dist = holderf.f[0];
    target[0] = (holderf.f[1] < sq_dist) ? holderi.i[1] : target[0];
    sq_dist = (holderf.f[1] < sq_dist) ? holderf.f[1] : sq_dist;
    target[0] = (holderf.f[2] < sq_dist) ? holderi.i[2] : target[0];
    sq_dist = (holderf.f[2] < sq_dist) ? holderf.f[2] : sq_dist;
    target[0] = (holderf.f[3] < sq_dist) ? holderi.i[3] : target[0];
    sq_dist = (holderf.f[3] < sq_dist) ? holderf.f[3] : sq_dist;
}

#endif /*LV_HAVE_SSE3*/

#ifdef LV_HAVE_GENERIC
static inline void volk_32fc_index_min_32u_generic(uint32_t* target,
                                                   const lv_32fc_t* source,
                                                   uint32_t num_points)
{
    float sq_dist = 0.0;
    float min = FLT_MAX;
    uint32_t index = 0;

    for (uint32_t i = 0; i < num_points; ++i) {
        sq_dist = lv_creal(source[i]) * lv_creal(source[i]) +
                  lv_cimag(source[i]) * lv_cimag(source[i]);

        if (sq_dist < min) {
            index = i;
            min = sq_dist;
        }
    }
    target[0] = index;
}

#endif /*LV_HAVE_GENERIC*/
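/*
 * Usage sketch (illustrative; not taken from the kernel's original documentation
 * block): the generated dispatcher volk_32fc_index_min_32u() selects the best of
 * the implementations in this file at runtime; the _a_/_u_ variants can also be
 * called directly for aligned/unaligned buffers.
 *
 *   #include <stdio.h>
 *   #include <volk/volk.h>
 *
 *   const uint32_t N = 1000;
 *   lv_32fc_t* in = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t) * N, volk_get_alignment());
 *   uint32_t* min_idx = (uint32_t*)volk_malloc(sizeof(uint32_t), volk_get_alignment());
 *   for (uint32_t i = 0; i < N; ++i) {
 *       // arbitrary test data; sample 500 gets the smallest magnitude
 *       in[i] = lv_cmake((float)(i == 500 ? 0 : i % 7 + 1), (float)(i % 5));
 *   }
 *   volk_32fc_index_min_32u(min_idx, in, N);
 *   printf("index of smallest |z|^2: %u\n", *min_idx); // expect 500
 *   volk_free(in);
 *   volk_free(min_idx);
 */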

#ifdef LV_HAVE_AVX512F
#include <float.h>
#include <immintrin.h>

static inline void volk_32fc_index_min_32u_a_avx512f(uint32_t* target,
                                                     const lv_32fc_t* src0,
                                                     uint32_t num_points)
{
    const lv_32fc_t* src0Ptr = src0;
    const uint32_t sixteenthPoints = num_points / 16;

    // Index ordering after shuffle: [0,1,8,9, 2,3,10,11, 4,5,12,13, 6,7,14,15]
    __m512 currentIndexes =
        _mm512_setr_ps(0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15);
    const __m512 indexIncrement = _mm512_set1_ps(16);

    __m512 minValues = _mm512_set1_ps(FLT_MAX);
    __m512 minIndices = _mm512_setzero_ps();

    for (uint32_t number = 0; number < sixteenthPoints; number++) {
        // Load 16 complex values (32 floats)
        __m512 in0 = _mm512_load_ps((const float*)src0Ptr);
        __m512 in1 = _mm512_load_ps((const float*)(src0Ptr + 8));
        src0Ptr += 16;

        // Square all values
        in0 = _mm512_mul_ps(in0, in0);
        in1 = _mm512_mul_ps(in1, in1);

        // Add adjacent pairs (re² + im²) using within-lane shuffle
        // 0xB1 = _MM_SHUFFLE(2,3,0,1) swaps adjacent elements
        __m512 sw0 = _mm512_shuffle_ps(in0, in0, 0xB1);
        __m512 sw1 = _mm512_shuffle_ps(in1, in1, 0xB1);
        __m512 sum0 = _mm512_add_ps(in0, sw0);
        __m512 sum1 = _mm512_add_ps(in1, sw1);

        // Compact: pick elements 0,2 from sum0 and sum1 per 128-bit lane
        // 0x88 = _MM_SHUFFLE(2,0,2,0)
        __m512 mag_sq = _mm512_shuffle_ps(sum0, sum1, 0x88);

        // Compare and update minimums
        __mmask16 cmpMask = _mm512_cmp_ps_mask(mag_sq, minValues, _CMP_LT_OS);
        minIndices = _mm512_mask_blend_ps(cmpMask, minIndices, currentIndexes);
        minValues = _mm512_min_ps(mag_sq, minValues);

        currentIndexes = _mm512_add_ps(currentIndexes, indexIncrement);
    }

    // Reduce 16 values to find the minimum
    __VOLK_ATTR_ALIGNED(64) float minValuesBuffer[16];
    __VOLK_ATTR_ALIGNED(64) float minIndexesBuffer[16];
    _mm512_store_ps(minValuesBuffer, minValues);
    _mm512_store_ps(minIndexesBuffer, minIndices);

    float min = FLT_MAX;
    uint32_t index = 0;
    for (uint32_t i = 0; i < 16; i++) {
        if (minValuesBuffer[i] < min) {
            min = minValuesBuffer[i];
            index = (uint32_t)minIndexesBuffer[i];
        } else if (minValuesBuffer[i] == min) {
            if ((uint32_t)minIndexesBuffer[i] < index)
                index = (uint32_t)minIndexesBuffer[i];
        }
    }

    // Handle the tail
    for (uint32_t number = sixteenthPoints * 16; number < num_points; number++) {
        const float re = lv_creal(*src0Ptr);
        const float im = lv_cimag(*src0Ptr);
        const float sq_dist = re * re + im * im;
        if (sq_dist < min) {
            min = sq_dist;
            index = number;
        }
        src0Ptr++;
    }
    *target = index;
}

#endif /*LV_HAVE_AVX512F*/
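/*
 * Worked example for the shuffle sequence above, looking at 128-bit lane 0 of the
 * first iteration: in0 lane 0 holds (re0, im0, re1, im1) and in1 lane 0 holds
 * (re8, im8, re9, im9). After squaring, the 0xB1 shuffle swaps adjacent elements,
 * so sum0 lane 0 = (|z0|², |z0|², |z1|², |z1|²) and sum1 lane 0 =
 * (|z8|², |z8|², |z9|², |z9|²). The 0x88 shuffle then takes elements 0 and 2 of
 * each source, giving mag_sq lane 0 = (|z0|², |z1|², |z8|², |z9|²). Repeating this
 * for lanes 1-3 yields exactly the element order listed in currentIndexes:
 * [0,1,8,9, 2,3,10,11, 4,5,12,13, 6,7,14,15].
 */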

#endif /*INCLUDED_volk_32fc_index_min_32u_a_H*/

#ifndef INCLUDED_volk_32fc_index_min_32u_u_H
#define INCLUDED_volk_32fc_index_min_32u_u_H

#include <float.h> /* FLT_MAX, used by the kernels below */
#include <inttypes.h>
#include <stdio.h>
#include <volk/volk_common.h>
#include <volk/volk_complex.h>

#ifdef LV_HAVE_AVX2
#include <immintrin.h>
#include <volk/volk_avx2_intrinsics.h>

static inline void volk_32fc_index_min_32u_u_avx2_variant_0(uint32_t* target,
                                                            const lv_32fc_t* source,
                                                            uint32_t num_points)
{
    const __m256i indices_increment = _mm256_set1_epi32(8);
    /*
     * At the start of each loop iteration current_indices holds the indices of
     * the complex numbers loaded from memory. The explanation for the odd order
     * is given in the implementation of vector_32fc_index_min_variant0().
     */
    __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);

    __m256 min_values = _mm256_set1_ps(FLT_MAX);
    __m256i min_indices = _mm256_setzero_si256();

    for (unsigned i = 0; i < num_points / 8u; ++i) {
        __m256 in0 = _mm256_loadu_ps((float*)source);
        __m256 in1 = _mm256_loadu_ps((float*)(source + 4));
        vector_32fc_index_min_variant0(
            in0, in1, &min_values, &min_indices, &current_indices, indices_increment);
        source += 8;
    }

    // determine minimum value and index in the result of the vectorized loop
    __VOLK_ATTR_ALIGNED(32) float min_values_buffer[8];
    __VOLK_ATTR_ALIGNED(32) uint32_t min_indices_buffer[8];
    _mm256_store_ps(min_values_buffer, min_values);
    _mm256_store_si256((__m256i*)min_indices_buffer, min_indices);

    float min = FLT_MAX;
    uint32_t index = 0;
    for (unsigned i = 0; i < 8; i++) {
        if (min_values_buffer[i] < min) {
            min = min_values_buffer[i];
            index = min_indices_buffer[i];
        }
    }

    // handle the tail not processed by the vectorized loop
    for (unsigned i = num_points & (~7u); i < num_points; ++i) {
        const float abs_squared =
            lv_creal(*source) * lv_creal(*source) + lv_cimag(*source) * lv_cimag(*source);
        if (abs_squared < min) {
            min = abs_squared;
            index = i;
        }
        ++source;
    }

    *target = index;
}

#endif /*LV_HAVE_AVX2*/

#ifdef LV_HAVE_AVX2
#include <immintrin.h>
#include <volk/volk_avx2_intrinsics.h>

static inline void volk_32fc_index_min_32u_u_avx2_variant_1(uint32_t* target,
                                                            const lv_32fc_t* source,
                                                            uint32_t num_points)
{
    const __m256i indices_increment = _mm256_set1_epi32(8);
    /*
     * At the start of each loop iteration current_indices holds the indices of
     * the complex numbers loaded from memory. The explanation for the odd order
     * is given in the implementation of vector_32fc_index_min_variant0().
     */
    __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);

    __m256 min_values = _mm256_set1_ps(FLT_MAX);
    __m256i min_indices = _mm256_setzero_si256();

    for (unsigned i = 0; i < num_points / 8u; ++i) {
        __m256 in0 = _mm256_loadu_ps((float*)source);
        __m256 in1 = _mm256_loadu_ps((float*)(source + 4));
        vector_32fc_index_min_variant1(
            in0, in1, &min_values, &min_indices, &current_indices, indices_increment);
        source += 8;
    }

    // determine minimum value and index in the result of the vectorized loop
    __VOLK_ATTR_ALIGNED(32) float min_values_buffer[8];
    __VOLK_ATTR_ALIGNED(32) uint32_t min_indices_buffer[8];
    _mm256_store_ps(min_values_buffer, min_values);
    _mm256_store_si256((__m256i*)min_indices_buffer, min_indices);

    float min = FLT_MAX;
    uint32_t index = 0;
    for (unsigned i = 0; i < 8; i++) {
        if (min_values_buffer[i] < min) {
            min = min_values_buffer[i];
            index = min_indices_buffer[i];
        }
    }

    // handle the tail not processed by the vectorized loop
    for (unsigned i = num_points & (~7u); i < num_points; ++i) {
        const float abs_squared =
            lv_creal(*source) * lv_creal(*source) + lv_cimag(*source) * lv_cimag(*source);
        if (abs_squared < min) {
            min = abs_squared;
            index = i;
        }
        ++source;
    }

    *target = index;
}

#endif /*LV_HAVE_AVX2*/

#ifdef LV_HAVE_NEON
#include <arm_neon.h>
#include <volk/volk_neon_intrinsics.h>

static inline void volk_32fc_index_min_32u_neon(uint32_t* target,
                                                const lv_32fc_t* source,
                                                uint32_t num_points)
{
    const uint32_t quarter_points = num_points / 4;
    const lv_32fc_t* sourcePtr = source;

    uint32_t indices[4] = { 0, 1, 2, 3 };
    const uint32x4_t vec_indices_incr = vdupq_n_u32(4);
    uint32x4_t vec_indices = vld1q_u32(indices);
    uint32x4_t vec_min_indices = vec_indices;

    if (num_points) {
        float min = FLT_MAX;
        uint32_t index = 0;

        float32x4_t vec_min = vdupq_n_f32(FLT_MAX);

        for (uint32_t number = 0; number < quarter_points; number++) {
            // Load complex and compute magnitude squared
            const float32x4_t vec_mag2 =
                _vmagnitudesquaredq_f32(vld2q_f32((float*)sourcePtr));
            __VOLK_PREFETCH(sourcePtr += 4);
            // a < b?
            const uint32x4_t lt_mask = vcltq_f32(vec_mag2, vec_min);
            vec_min = vbslq_f32(lt_mask, vec_mag2, vec_min);
            vec_min_indices = vbslq_u32(lt_mask, vec_indices, vec_min_indices);
            vec_indices = vaddq_u32(vec_indices, vec_indices_incr);
        }
        uint32_t tmp_min_indices[4];
        float tmp_min[4];
        vst1q_u32(tmp_min_indices, vec_min_indices);
        vst1q_f32(tmp_min, vec_min);

        for (int i = 0; i < 4; i++) {
            if (tmp_min[i] < min) {
                min = tmp_min[i];
                index = tmp_min_indices[i];
            } else if (tmp_min[i] == min) {
                if (tmp_min_indices[i] < index)
                    index = tmp_min_indices[i];
            }
        }

        // Deal with the rest
        for (uint32_t number = quarter_points * 4; number < num_points; number++) {
            const float re = lv_creal(*sourcePtr);
            const float im = lv_cimag(*sourcePtr);
            const float sq_dist = re * re + im * im;
            if (sq_dist < min) {
                min = sq_dist;
                index = number;
            }
            sourcePtr++;
        }
        *target = index;
    }
}

#endif /*LV_HAVE_NEON*/
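/*
 * _vmagnitudesquaredq_f32() is provided by <volk/volk_neon_intrinsics.h>. A sketch
 * of its likely shape (illustrative, not the verbatim upstream helper): it takes
 * the deinterleaved real/imaginary vectors produced by vld2q_f32() and returns
 * re*re + im*im per lane.
 *
 *   static inline float32x4_t magnitude_squared_sketch(float32x4x2_t cmplx)
 *   {
 *       float32x4_t re_sq = vmulq_f32(cmplx.val[0], cmplx.val[0]); // re^2
 *       return vmlaq_f32(re_sq, cmplx.val[1], cmplx.val[1]);       // re^2 + im*im
 *   }
 */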


#ifdef LV_HAVE_NEONV8
#include <arm_neon.h>
#include <float.h>

static inline void volk_32fc_index_min_32u_neonv8(uint32_t* target,
                                                  const lv_32fc_t* source,
                                                  uint32_t num_points)
{
    if (num_points == 0)
        return;

    const uint32_t quarter_points = num_points / 4;
    const lv_32fc_t* inputPtr = source;

    // Use integer indices directly (no float conversion overhead)
    uint32x4_t vec_indices = { 0, 1, 2, 3 };
    const uint32x4_t vec_incr = vdupq_n_u32(4);

    float32x4_t vec_min = vdupq_n_f32(FLT_MAX);
    uint32x4_t vec_min_idx = vdupq_n_u32(0);

    for (uint32_t i = 0; i < quarter_points; i++) {
        // Load and deinterleave complex values
        float32x4x2_t cplx = vld2q_f32((const float*)inputPtr);
        inputPtr += 4;

        // Magnitude squared using FMA: re*re + im*im
        float32x4_t mag2 =
            vfmaq_f32(vmulq_f32(cplx.val[0], cplx.val[0]), cplx.val[1], cplx.val[1]);

        // Compare BEFORE the min update to know which lanes change
        uint32x4_t lt_mask = vcltq_f32(mag2, vec_min);
        vec_min_idx = vbslq_u32(lt_mask, vec_indices, vec_min_idx);

        // vminq_f32 is cheap and does not depend on the comparison result
        vec_min = vminq_f32(mag2, vec_min);

        vec_indices = vaddq_u32(vec_indices, vec_incr);
    }

    // ARMv8 horizontal reduction - find the min value across all lanes
    float min_val = vminvq_f32(vec_min);

    // Find which lane(s) hold the min value and take the smallest index among them
    uint32x4_t min_mask = vceqq_f32(vec_min, vdupq_n_f32(min_val));
    uint32x4_t idx_masked = vbslq_u32(min_mask, vec_min_idx, vdupq_n_u32(UINT32_MAX));
    uint32_t result_idx = vminvq_u32(idx_masked);

    // Handle tail elements
    for (uint32_t i = quarter_points * 4; i < num_points; i++) {
        float re = lv_creal(source[i]);
        float im = lv_cimag(source[i]);
        float mag2 = re * re + im * im;
        if (mag2 < min_val) {
            min_val = mag2;
            result_idx = i;
        }
    }

    *target = result_idx;
}

#endif /*LV_HAVE_NEONV8*/
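/*
 * The masked reduction above resolves ties in favor of the lowest index: lanes that
 * do not hold the global minimum are replaced by UINT32_MAX before vminvq_u32().
 * A scalar equivalent of that final step (illustrative sketch):
 *
 *   float vals[4];
 *   uint32_t idxs[4];
 *   vst1q_f32(vals, vec_min);
 *   vst1q_u32(idxs, vec_min_idx);
 *   float min_val = vals[0];
 *   uint32_t result_idx = idxs[0];
 *   for (int k = 1; k < 4; k++) {
 *       if (vals[k] < min_val || (vals[k] == min_val && idxs[k] < result_idx)) {
 *           min_val = vals[k];
 *           result_idx = idxs[k];
 *       }
 *   }
 */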


#ifdef LV_HAVE_AVX512F
#include <float.h>
#include <immintrin.h>

static inline void volk_32fc_index_min_32u_u_avx512f(uint32_t* target,
                                                     const lv_32fc_t* src0,
                                                     uint32_t num_points)
{
    const lv_32fc_t* src0Ptr = src0;
    const uint32_t sixteenthPoints = num_points / 16;

    // Index ordering after shuffle: [0,1,8,9, 2,3,10,11, 4,5,12,13, 6,7,14,15]
    __m512 currentIndexes =
        _mm512_setr_ps(0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15);
    const __m512 indexIncrement = _mm512_set1_ps(16);

    __m512 minValues = _mm512_set1_ps(FLT_MAX);
    __m512 minIndices = _mm512_setzero_ps();

    for (uint32_t number = 0; number < sixteenthPoints; number++) {
        // Load 16 complex values (32 floats)
        __m512 in0 = _mm512_loadu_ps((const float*)src0Ptr);
        __m512 in1 = _mm512_loadu_ps((const float*)(src0Ptr + 8));
        src0Ptr += 16;

        // Square all values
        in0 = _mm512_mul_ps(in0, in0);
        in1 = _mm512_mul_ps(in1, in1);

        // Add adjacent pairs (re² + im²) using within-lane shuffle
        // 0xB1 = _MM_SHUFFLE(2,3,0,1) swaps adjacent elements
        __m512 sw0 = _mm512_shuffle_ps(in0, in0, 0xB1);
        __m512 sw1 = _mm512_shuffle_ps(in1, in1, 0xB1);
        __m512 sum0 = _mm512_add_ps(in0, sw0);
        __m512 sum1 = _mm512_add_ps(in1, sw1);

        // Compact: pick elements 0,2 from sum0 and sum1 per 128-bit lane
        // 0x88 = _MM_SHUFFLE(2,0,2,0)
        __m512 mag_sq = _mm512_shuffle_ps(sum0, sum1, 0x88);

        // Compare and update minimums
        __mmask16 cmpMask = _mm512_cmp_ps_mask(mag_sq, minValues, _CMP_LT_OS);
        minIndices = _mm512_mask_blend_ps(cmpMask, minIndices, currentIndexes);
        minValues = _mm512_min_ps(mag_sq, minValues);

        currentIndexes = _mm512_add_ps(currentIndexes, indexIncrement);
    }

    // Reduce 16 values to find the minimum
    __VOLK_ATTR_ALIGNED(64) float minValuesBuffer[16];
    __VOLK_ATTR_ALIGNED(64) float minIndexesBuffer[16];
    _mm512_store_ps(minValuesBuffer, minValues);
    _mm512_store_ps(minIndexesBuffer, minIndices);

    float min = FLT_MAX;
    uint32_t index = 0;
    for (uint32_t i = 0; i < 16; i++) {
        if (minValuesBuffer[i] < min) {
            min = minValuesBuffer[i];
            index = (uint32_t)minIndexesBuffer[i];
        } else if (minValuesBuffer[i] == min) {
            if ((uint32_t)minIndexesBuffer[i] < index)
                index = (uint32_t)minIndexesBuffer[i];
        }
    }

    // Handle the tail
    for (uint32_t number = sixteenthPoints * 16; number < num_points; number++) {
        const float re = lv_creal(*src0Ptr);
        const float im = lv_cimag(*src0Ptr);
        const float sq_dist = re * re + im * im;
        if (sq_dist < min) {
            min = sq_dist;
            index = number;
        }
        src0Ptr++;
    }
    *target = index;
}

#endif /*LV_HAVE_AVX512F*/

#ifdef LV_HAVE_RVV
#include <float.h>
#include <riscv_vector.h>
#include <volk/volk_rvv_intrinsics.h> /* assumed location of RISCV_SHRINK4 */

static inline void volk_32fc_index_min_32u_rvv(uint32_t* target,
                                               const lv_32fc_t* source,
                                               uint32_t num_points)
{
    vfloat32m4_t vmin = __riscv_vfmv_v_f_f32m4(FLT_MAX, __riscv_vsetvlmax_e32m4());
    vuint32m4_t vmini = __riscv_vmv_v_x_u32m4(0, __riscv_vsetvlmax_e32m4());
    vuint32m4_t vidx = __riscv_vid_v_u32m4(__riscv_vsetvlmax_e32m4());
    size_t n = num_points;
    for (size_t vl; n > 0; n -= vl, source += vl) {
        vl = __riscv_vsetvl_e32m4(n);
        vuint64m8_t vc = __riscv_vle64_v_u64m8((const uint64_t*)source, vl);
        vfloat32m4_t vr = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vc, 0, vl));
        vfloat32m4_t vi = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vc, 32, vl));
        vfloat32m4_t v = __riscv_vfmacc(__riscv_vfmul(vr, vr, vl), vi, vi, vl);
        vbool8_t m = __riscv_vmfgt(vmin, v, vl);
        vmin = __riscv_vfmin_tu(vmin, vmin, v, vl);
        vmini = __riscv_vmerge_tu(vmini, vmini, vidx, m, vl);
        vidx = __riscv_vadd(vidx, vl, __riscv_vsetvlmax_e32m4());
    }
    size_t vl = __riscv_vsetvlmax_e32m4();
    float min = __riscv_vfmv_f(__riscv_vfredmin(RISCV_SHRINK4(vfmin, f, 32, vmin),
                                                __riscv_vfmv_v_f_f32m1(FLT_MAX, 1),
                                                __riscv_vsetvlmax_e32m1()));
    // Find lanes with the min value, set the others to UINT32_MAX
    vbool8_t m = __riscv_vmfeq(vmin, min, vl);
    vuint32m4_t idx_masked =
        __riscv_vmerge(__riscv_vmv_v_x_u32m4(UINT32_MAX, vl), vmini, m, vl);
    // Find the minimum index among lanes holding the min value
    *target = __riscv_vmv_x(__riscv_vredminu(RISCV_SHRINK4(vminu, u, 32, idx_masked),
                                             __riscv_vmv_v_x_u32m1(UINT32_MAX, 1),
                                             __riscv_vsetvlmax_e32m1()));
}
#endif /*LV_HAVE_RVV*/
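/*
 * Deinterleaving trick used in the RVV kernel above: each lv_32fc_t is a pair of
 * packed 32-bit floats, so the loop loads them as 64-bit integers and splits them
 * with narrowing shifts -- vnsrl(vc, 0, vl) keeps the low 32 bits of each element
 * (the real part) and vnsrl(vc, 32, vl) keeps the high 32 bits (the imaginary
 * part), which are then reinterpreted as f32 vectors. The RVVSEG kernel below
 * obtains the same deinterleaved layout directly from a segmented load
 * (vlseg2e32).
 */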

#ifdef LV_HAVE_RVVSEG
#include <float.h>
#include <riscv_vector.h>
#include <volk/volk_rvv_intrinsics.h> /* assumed location of RISCV_SHRINK4 */

static inline void volk_32fc_index_min_32u_rvvseg(uint32_t* target,
                                                  const lv_32fc_t* source,
                                                  uint32_t num_points)
{
    vfloat32m4_t vmin = __riscv_vfmv_v_f_f32m4(FLT_MAX, __riscv_vsetvlmax_e32m4());
    vuint32m4_t vmini = __riscv_vmv_v_x_u32m4(0, __riscv_vsetvlmax_e32m4());
    vuint32m4_t vidx = __riscv_vid_v_u32m4(__riscv_vsetvlmax_e32m4());
    size_t n = num_points;
    for (size_t vl; n > 0; n -= vl, source += vl) {
        vl = __riscv_vsetvl_e32m4(n);
        vfloat32m4x2_t vc = __riscv_vlseg2e32_v_f32m4x2((const float*)source, vl);
        vfloat32m4_t vr = __riscv_vget_f32m4(vc, 0), vi = __riscv_vget_f32m4(vc, 1);
        vfloat32m4_t v = __riscv_vfmacc(__riscv_vfmul(vr, vr, vl), vi, vi, vl);
        vbool8_t m = __riscv_vmfgt(vmin, v, vl);
        vmin = __riscv_vfmin_tu(vmin, vmin, v, vl);
        vmini = __riscv_vmerge_tu(vmini, vmini, vidx, m, vl);
        vidx = __riscv_vadd(vidx, vl, __riscv_vsetvlmax_e32m4());
    }
    size_t vl = __riscv_vsetvlmax_e32m4();
    float min = __riscv_vfmv_f(__riscv_vfredmin(RISCV_SHRINK4(vfmin, f, 32, vmin),
                                                __riscv_vfmv_v_f_f32m1(FLT_MAX, 1),
                                                __riscv_vsetvlmax_e32m1()));
    // Find lanes with the min value, set the others to UINT32_MAX
    vbool8_t m = __riscv_vmfeq(vmin, min, vl);
    vuint32m4_t idx_masked =
        __riscv_vmerge(__riscv_vmv_v_x_u32m4(UINT32_MAX, vl), vmini, m, vl);
    // Find the minimum index among lanes holding the min value
    *target = __riscv_vmv_x(__riscv_vredminu(RISCV_SHRINK4(vminu, u, 32, idx_masked),
                                             __riscv_vmv_v_x_u32m1(UINT32_MAX, 1),
                                             __riscv_vsetvlmax_e32m1()));
}
#endif /*LV_HAVE_RVVSEG*/

#endif /*INCLUDED_volk_32fc_index_min_32u_u_H*/