volk_32fc_index_min_16u.h

/* -*- c++ -*- */
/*
 * Copyright 2021 Free Software Foundation, Inc.
 *
 * This file is part of VOLK
 *
 * SPDX-License-Identifier: LGPL-3.0-or-later
 */

#ifndef INCLUDED_volk_32fc_index_min_16u_a_H
#define INCLUDED_volk_32fc_index_min_16u_a_H

#include <float.h>
#include <inttypes.h>
#include <limits.h>
#include <stdio.h>
#include <volk/volk_common.h>
#include <volk/volk_complex.h>

#ifdef LV_HAVE_AVX2
#include <immintrin.h>
#include <volk/volk_avx2_intrin.h>

static inline void volk_32fc_index_min_16u_a_avx2_variant_0(uint16_t* target,
                                                            const lv_32fc_t* source,
                                                            uint32_t num_points)
{
    num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;

    const __m256i indices_increment = _mm256_set1_epi32(8);
    /*
     * At the start of each loop iteration current_indices holds the indices of
     * the complex numbers loaded from memory. An explanation of the odd order
     * is given in the implementation of vector_32fc_index_min_variant0().
     */
    __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);

    __m256 min_values = _mm256_set1_ps(FLT_MAX);
    __m256i min_indices = _mm256_setzero_si256();

    for (unsigned i = 0; i < num_points / 8u; ++i) {
        __m256 in0 = _mm256_load_ps((float*)source);
        __m256 in1 = _mm256_load_ps((float*)(source + 4));
        vector_32fc_index_min_variant0(
            in0, in1, &min_values, &min_indices, &current_indices, indices_increment);
        source += 8;
    }

    // determine minimum value and index in the result of the vectorized loop
    __VOLK_ATTR_ALIGNED(32) float min_values_buffer[8];
    __VOLK_ATTR_ALIGNED(32) uint32_t min_indices_buffer[8];
    _mm256_store_ps(min_values_buffer, min_values);
    _mm256_store_si256((__m256i*)min_indices_buffer, min_indices);

    float min = FLT_MAX;
    uint32_t index = 0;
    for (unsigned i = 0; i < 8; i++) {
        if (min_values_buffer[i] < min) {
            min = min_values_buffer[i];
            index = min_indices_buffer[i];
        }
    }

    // handle tail not processed by the vectorized loop
    for (unsigned i = num_points & (~7u); i < num_points; ++i) {
        const float abs_squared =
            lv_creal(*source) * lv_creal(*source) + lv_cimag(*source) * lv_cimag(*source);
        if (abs_squared < min) {
            min = abs_squared;
            index = i;
        }
        ++source;
    }

    *target = index;
}

#endif /*LV_HAVE_AVX2*/
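
For context, callers normally reach these protokernels through the volk_32fc_index_min_16u dispatcher, which selects the best aligned or unaligned variant for the host CPU at runtime. A minimal usage sketch (the buffer size and test values are hypothetical):

#include <stdio.h>
#include <volk/volk.h>

int main(void)
{
    const uint32_t N = 1024; /* hypothetical buffer size */
    lv_32fc_t* in = (lv_32fc_t*)volk_malloc(N * sizeof(lv_32fc_t), volk_get_alignment());
    uint16_t* out = (uint16_t*)volk_malloc(sizeof(uint16_t), volk_get_alignment());

    for (uint32_t i = 0; i < N; ++i)
        in[i] = lv_cmake(1.0f, (float)i); /* arbitrary test data */
    in[300] = lv_cmake(0.1f, 0.0f);       /* smallest magnitude at index 300 */

    volk_32fc_index_min_16u(out, in, N); /* dispatches to a variant in this file */
    printf("min magnitude at index %u\n", (unsigned)*out);

    volk_free(in);
    volk_free(out);
    return 0;
}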

#ifdef LV_HAVE_AVX2
#include <immintrin.h>
#include <volk/volk_avx2_intrin.h>

static inline void volk_32fc_index_min_16u_a_avx2_variant_1(uint16_t* target,
                                                            const lv_32fc_t* source,
                                                            uint32_t num_points)
{
    num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;

    const __m256i indices_increment = _mm256_set1_epi32(8);
    /*
     * At the start of each loop iteration current_indices holds the indices of
     * the complex numbers loaded from memory. An explanation of the odd order
     * is given in the implementation of vector_32fc_index_min_variant0().
     */
    __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);

    __m256 min_values = _mm256_set1_ps(FLT_MAX);
    __m256i min_indices = _mm256_setzero_si256();

    for (unsigned i = 0; i < num_points / 8u; ++i) {
        __m256 in0 = _mm256_load_ps((float*)source);
        __m256 in1 = _mm256_load_ps((float*)(source + 4));
        vector_32fc_index_min_variant1(
            in0, in1, &min_values, &min_indices, &current_indices, indices_increment);
        source += 8;
    }

    // determine minimum value and index in the result of the vectorized loop
    __VOLK_ATTR_ALIGNED(32) float min_values_buffer[8];
    __VOLK_ATTR_ALIGNED(32) uint32_t min_indices_buffer[8];
    _mm256_store_ps(min_values_buffer, min_values);
    _mm256_store_si256((__m256i*)min_indices_buffer, min_indices);

    float min = FLT_MAX;
    uint32_t index = 0;
    for (unsigned i = 0; i < 8; i++) {
        if (min_values_buffer[i] < min) {
            min = min_values_buffer[i];
            index = min_indices_buffer[i];
        }
    }

    // handle tail not processed by the vectorized loop
    for (unsigned i = num_points & (~7u); i < num_points; ++i) {
        const float abs_squared =
            lv_creal(*source) * lv_creal(*source) + lv_cimag(*source) * lv_cimag(*source);
        if (abs_squared < min) {
            min = abs_squared;
            index = i;
        }
        ++source;
    }

    *target = index;
}

#endif /*LV_HAVE_AVX2*/
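
Both variants defer their inner-loop update to helpers declared in volk/volk_avx2_intrin.h. The sketch below is a reconstruction from the seeded index order, not the library code verbatim; it shows why current_indices uses the interleaved order 0,1,4,5,2,3,6,7: _mm256_hadd_ps adds adjacent pairs within each 128-bit lane, so the eight squared magnitudes emerge in exactly that permuted order.

/* Sketch only -- assumes the helper follows this pattern; see
 * volk/volk_avx2_intrin.h for the real vector_32fc_index_min_variant0(). */
static inline void sketch_index_min_update(__m256 in0,
                                           __m256 in1,
                                           __m256* min_values,
                                           __m256i* min_indices,
                                           __m256i* current_indices,
                                           __m256i indices_increment)
{
    in0 = _mm256_mul_ps(in0, in0); /* re^2 and im^2, still interleaved */
    in1 = _mm256_mul_ps(in1, in1);
    /* _mm256_hadd_ps pairs elements within each 128-bit lane, so the eight
     * magnitudes land in the order 0,1,4,5,2,3,6,7 -- matching the
     * _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0) seed used by the callers. */
    __m256 abs_squared = _mm256_hadd_ps(in0, in1);
    __m256 compare = _mm256_cmp_ps(abs_squared, *min_values, _CMP_LT_OS);
    *min_indices = _mm256_blendv_epi8(
        *min_indices, *current_indices, _mm256_castps_si256(compare));
    *min_values = _mm256_min_ps(abs_squared, *min_values);
    *current_indices = _mm256_add_epi32(*current_indices, indices_increment);
}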

#ifdef LV_HAVE_SSE3
#include <pmmintrin.h>
#include <xmmintrin.h>

static inline void volk_32fc_index_min_16u_a_sse3(uint16_t* target,
                                                  const lv_32fc_t* source,
                                                  uint32_t num_points)
{
    num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;

    union bit128 holderf;
    union bit128 holderi;
    float sq_dist = 0.0;

    union bit128 xmm5, xmm4;
    __m128 xmm1, xmm2, xmm3;
    __m128i xmm8, xmm11, xmm12, xmm9, xmm10;

    xmm5.int_vec = _mm_setzero_si128();
    xmm4.int_vec = _mm_setzero_si128();
    holderf.int_vec = _mm_setzero_si128();
    holderi.int_vec = _mm_setzero_si128();

    xmm8 = _mm_setr_epi32(0, 1, 2, 3); // current indices
    xmm9 = _mm_setzero_si128();        // indices of the per-lane minima
    xmm10 = _mm_setr_epi32(4, 4, 4, 4);
    xmm3 = _mm_set_ps1(FLT_MAX);       // per-lane minima

    int bound = num_points >> 2;

    for (int i = 0; i < bound; ++i) {
        xmm1 = _mm_load_ps((float*)source);
        xmm2 = _mm_load_ps((float*)&source[2]);

        source += 4;

        // squared magnitudes: square, then horizontally add re*re + im*im
        xmm1 = _mm_mul_ps(xmm1, xmm1);
        xmm2 = _mm_mul_ps(xmm2, xmm2);

        xmm1 = _mm_hadd_ps(xmm1, xmm2);

        xmm3 = _mm_min_ps(xmm1, xmm3);

        // after the min: equal lanes just became the minimum, greater lanes keep it
        xmm4.float_vec = _mm_cmpgt_ps(xmm1, xmm3);
        xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);

        // AND/ADD select: new index where improved, old index elsewhere
        xmm11 = _mm_and_si128(xmm8, xmm5.int_vec);
        xmm12 = _mm_and_si128(xmm9, xmm4.int_vec);

        xmm9 = _mm_add_epi32(xmm11, xmm12);

        xmm8 = _mm_add_epi32(xmm8, xmm10);
    }

    if ((num_points >> 1) & 1) { // two complex values remain
        xmm2 = _mm_load_ps((float*)source);

        xmm1 = _mm_movelh_ps(bit128_p(&xmm8)->float_vec, bit128_p(&xmm8)->float_vec);
        xmm8 = bit128_p(&xmm1)->int_vec;

        xmm2 = _mm_mul_ps(xmm2, xmm2);

        source += 2;

        xmm1 = _mm_hadd_ps(xmm2, xmm2);

        xmm3 = _mm_min_ps(xmm1, xmm3);

        xmm10 = _mm_setr_epi32(2, 2, 2, 2);

        xmm4.float_vec = _mm_cmpgt_ps(xmm1, xmm3);
        xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);

        xmm11 = _mm_and_si128(xmm8, xmm5.int_vec);
        xmm12 = _mm_and_si128(xmm9, xmm4.int_vec);

        xmm9 = _mm_add_epi32(xmm11, xmm12);

        xmm8 = _mm_add_epi32(xmm8, xmm10);
    }

    if (num_points & 1) { // one complex value remains
        sq_dist = lv_creal(source[0]) * lv_creal(source[0]) +
                  lv_cimag(source[0]) * lv_cimag(source[0]);

        xmm2 = _mm_load1_ps(&sq_dist);

        xmm1 = xmm3; // previous minima

        xmm3 = _mm_min_ss(xmm3, xmm2);

        // here the compare is against the OLD minima, so the masks swap roles
        xmm4.float_vec = _mm_cmpgt_ps(xmm1, xmm3);
        xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);

        xmm8 = _mm_shuffle_epi32(xmm8, 0x00);

        xmm11 = _mm_and_si128(xmm8, xmm4.int_vec);
        xmm12 = _mm_and_si128(xmm9, xmm5.int_vec);

        xmm9 = _mm_add_epi32(xmm11, xmm12);
    }

    _mm_store_ps((float*)&(holderf.f), xmm3);
    _mm_store_si128(&(holderi.int_vec), xmm9);

    // scalar reduction over the four lanes
    target[0] = holderi.i[0];
    sq_dist = holderf.f[0];
    target[0] = (holderf.f[1] < sq_dist) ? holderi.i[1] : target[0];
    sq_dist = (holderf.f[1] < sq_dist) ? holderf.f[1] : sq_dist;
    target[0] = (holderf.f[2] < sq_dist) ? holderi.i[2] : target[0];
    sq_dist = (holderf.f[2] < sq_dist) ? holderf.f[2] : sq_dist;
    target[0] = (holderf.f[3] < sq_dist) ? holderi.i[3] : target[0];
    sq_dist = (holderf.f[3] < sq_dist) ? holderf.f[3] : sq_dist;
}

#endif /*LV_HAVE_SSE3*/
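
A note on the AND/ADD index update in the loop above: SSE2/SSE3 has no variable blend (that arrives with SSE4.1), so the selection is done arithmetically. The identity it relies on, assuming NaN-free input:

/* After xmm3 = _mm_min_ps(xmm1, xmm3), every lane satisfies xmm1 >= xmm3,
 * so exactly one of the two masks is set per lane:
 *   xmm5 = (xmm1 == xmm3)  -- lane just became (or tied) the minimum
 *   xmm4 = (xmm1 >  xmm3)  -- lane keeps its previous minimum
 * Because the masks are disjoint and together cover all lanes,
 *   xmm9 = (xmm8 & xmm5) + (xmm9 & xmm4)
 * selects the new index where the value improved and the old index
 * elsewhere -- an AND/ADD select in plain SSE2 integer ops. */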


#ifdef LV_HAVE_NEON
#include <arm_neon.h>
#include <float.h>
#include <limits.h>

static inline void volk_32fc_index_min_16u_neon(uint16_t* target,
                                                const lv_32fc_t* source,
                                                uint32_t num_points)
{
    num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;

    if (num_points == 0)
        return;

    const uint32_t quarter_points = num_points / 4;
    const lv_32fc_t* inputPtr = source;

    // Use integer indices directly
    uint32x4_t vec_indices = { 0, 1, 2, 3 };
    const uint32x4_t vec_incr = vdupq_n_u32(4);

    float32x4_t vec_min = vdupq_n_f32(FLT_MAX);
    uint32x4_t vec_min_idx = vdupq_n_u32(0);

    for (uint32_t i = 0; i < quarter_points; i++) {
        // Load and deinterleave complex values
        float32x4x2_t cplx = vld2q_f32((const float*)inputPtr);
        inputPtr += 4;

        // Magnitude squared: re*re + im*im
        float32x4_t mag2 =
            vmlaq_f32(vmulq_f32(cplx.val[0], cplx.val[0]), cplx.val[1], cplx.val[1]);

        // Compare BEFORE the min update to know which lanes change
        uint32x4_t lt_mask = vcltq_f32(mag2, vec_min);
        vec_min_idx = vbslq_u32(lt_mask, vec_indices, vec_min_idx);

        // vminq_f32 has no dependency on the comparison result, so the two
        // can execute in parallel
        vec_min = vminq_f32(mag2, vec_min);

        vec_indices = vaddq_u32(vec_indices, vec_incr);
    }

    // Scalar reduction; on ties, prefer the lower index
    __VOLK_ATTR_ALIGNED(16) float min_buf[4];
    __VOLK_ATTR_ALIGNED(16) uint32_t idx_buf[4];
    vst1q_f32(min_buf, vec_min);
    vst1q_u32(idx_buf, vec_min_idx);

    float min_val = min_buf[0];
    uint32_t result_idx = idx_buf[0];
    for (int i = 1; i < 4; i++) {
        if (min_buf[i] < min_val) {
            min_val = min_buf[i];
            result_idx = idx_buf[i];
        } else if (min_buf[i] == min_val && idx_buf[i] < result_idx) {
            result_idx = idx_buf[i];
        }
    }

    // Handle tail
    for (uint32_t i = quarter_points * 4; i < num_points; i++) {
        float re = lv_creal(source[i]);
        float im = lv_cimag(source[i]);
        float mag2 = re * re + im * im;
        if (mag2 < min_val) {
            min_val = mag2;
            result_idx = i;
        }
    }

    *target = (uint16_t)result_idx;
}

#endif /*LV_HAVE_NEON*/


#ifdef LV_HAVE_NEONV8
#include <arm_neon.h>
#include <float.h>
#include <limits.h>

static inline void volk_32fc_index_min_16u_neonv8(uint16_t* target,
                                                  const lv_32fc_t* source,
                                                  uint32_t num_points)
{
    num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;

    if (num_points == 0)
        return;

    const uint32_t quarter_points = num_points / 4;
    const lv_32fc_t* inputPtr = source;

    // Use integer indices directly (no float conversion overhead)
    uint32x4_t vec_indices = { 0, 1, 2, 3 };
    const uint32x4_t vec_incr = vdupq_n_u32(4);

    float32x4_t vec_min = vdupq_n_f32(FLT_MAX);
    uint32x4_t vec_min_idx = vdupq_n_u32(0);

    for (uint32_t i = 0; i < quarter_points; i++) {
        // Load and deinterleave complex values
        float32x4x2_t cplx = vld2q_f32((const float*)inputPtr);
        inputPtr += 4;

        // Magnitude squared using FMA: re*re + im*im
        float32x4_t mag2 =
            vfmaq_f32(vmulq_f32(cplx.val[0], cplx.val[0]), cplx.val[1], cplx.val[1]);

        // Compare BEFORE the min update to know which lanes change
        uint32x4_t lt_mask = vcltq_f32(mag2, vec_min);
        vec_min_idx = vbslq_u32(lt_mask, vec_indices, vec_min_idx);

        // vminq_f32 has no dependency on the comparison result, so the two
        // can execute in parallel
        vec_min = vminq_f32(mag2, vec_min);

        vec_indices = vaddq_u32(vec_indices, vec_incr);
    }

    // ARMv8 horizontal reduction
    float min_val = vminvq_f32(vec_min);
    uint32x4_t min_mask = vceqq_f32(vec_min, vdupq_n_f32(min_val));
    uint32x4_t idx_masked = vbslq_u32(min_mask, vec_min_idx, vdupq_n_u32(UINT32_MAX));
    uint32_t result_idx = vminvq_u32(idx_masked);

    // Handle tail
    for (uint32_t i = quarter_points * 4; i < num_points; i++) {
        float re = lv_creal(source[i]);
        float im = lv_cimag(source[i]);
        float mag2 = re * re + im * im;
        if (mag2 < min_val) {
            min_val = mag2;
            result_idx = i;
        }
    }

    *target = (uint16_t)result_idx;
}

#endif /*LV_HAVE_NEONV8*/
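
The branch-free reduction above is worth unpacking; a worked illustration with hypothetical lane contents:

/* Hypothetical state after the main loop:
 *   vec_min     = { 5.0, 2.0, 2.0, 9.0 }
 *   vec_min_idx = {  12,  37,  21,   6 }
 * vminvq_f32(vec_min)                          -> 2.0
 * vceqq_f32(vec_min, vdupq_n_f32(2.0))         -> {  0, ~0, ~0,  0 }
 * vbslq_u32(mask, vec_min_idx, dup(UINT32_MAX))-> { ~0, 37, 21, ~0 }
 * vminvq_u32(...)                              -> 21, the lowest tied index
 */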


#ifdef LV_HAVE_GENERIC
static inline void volk_32fc_index_min_16u_generic(uint16_t* target,
                                                   const lv_32fc_t* source,
                                                   uint32_t num_points)
{
    num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;

    float sq_dist = 0.0;
    float min = FLT_MAX;
    uint16_t index = 0;

    for (uint32_t i = 0; i < num_points; ++i) {
        sq_dist = lv_creal(source[i]) * lv_creal(source[i]) +
                  lv_cimag(source[i]) * lv_cimag(source[i]);

        if (sq_dist < min) {
            index = i;
            min = sq_dist;
        }
    }
    target[0] = index;
}

#endif /*LV_HAVE_GENERIC*/
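
A quick sanity check of the semantics every variant in this file shares, using a hypothetical three-point input worked by hand:

lv_32fc_t buf[3] = { lv_cmake(3.0f, 4.0f), lv_cmake(1.0f, 1.0f), lv_cmake(0.0f, 2.0f) };
uint16_t idx;
volk_32fc_index_min_16u_generic(&idx, buf, 3);
/* squared magnitudes: 25.0, 2.0, 4.0 -> minimum at index 1, so idx == 1 */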

#ifdef LV_HAVE_AVX512F
#include <float.h>
#include <immintrin.h>
#include <limits.h>

static inline void volk_32fc_index_min_16u_a_avx512f(uint16_t* target,
                                                     const lv_32fc_t* src0,
                                                     uint32_t num_points)
{
    num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;

    const lv_32fc_t* src0Ptr = src0;
    const uint32_t sixteenthPoints = num_points / 16;

    // Index ordering after shuffle: [0,1,8,9, 2,3,10,11, 4,5,12,13, 6,7,14,15]
    __m512 currentIndexes =
        _mm512_setr_ps(0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15);
    const __m512 indexIncrement = _mm512_set1_ps(16);

    __m512 minValues = _mm512_set1_ps(FLT_MAX);
    __m512 minIndices = _mm512_setzero_ps();

    for (uint32_t number = 0; number < sixteenthPoints; number++) {
        // Load 16 complex values (32 floats)
        __m512 in0 = _mm512_load_ps((const float*)src0Ptr);
        __m512 in1 = _mm512_load_ps((const float*)(src0Ptr + 8));
        src0Ptr += 16;

        // Square all values
        in0 = _mm512_mul_ps(in0, in0);
        in1 = _mm512_mul_ps(in1, in1);

        // Add adjacent pairs (re² + im²) using within-lane shuffle
        // 0xB1 = _MM_SHUFFLE(2,3,0,1) swaps adjacent elements
        __m512 sw0 = _mm512_shuffle_ps(in0, in0, 0xB1);
        __m512 sw1 = _mm512_shuffle_ps(in1, in1, 0xB1);
        __m512 sum0 = _mm512_add_ps(in0, sw0);
        __m512 sum1 = _mm512_add_ps(in1, sw1);

        // Compact: pick elements 0,2 from sum0 and sum1 per 128-bit lane
        // 0x88 = _MM_SHUFFLE(2,0,2,0)
        __m512 mag_sq = _mm512_shuffle_ps(sum0, sum1, 0x88);

        // Compare and update minimums
        __mmask16 cmpMask = _mm512_cmp_ps_mask(mag_sq, minValues, _CMP_LT_OS);
        minIndices = _mm512_mask_blend_ps(cmpMask, minIndices, currentIndexes);
        minValues = _mm512_min_ps(mag_sq, minValues);

        currentIndexes = _mm512_add_ps(currentIndexes, indexIncrement);
    }

    // Reduce 16 values to find minimum
    __VOLK_ATTR_ALIGNED(64) float minValuesBuffer[16];
    __VOLK_ATTR_ALIGNED(64) float minIndexesBuffer[16];
    _mm512_store_ps(minValuesBuffer, minValues);
    _mm512_store_ps(minIndexesBuffer, minIndices);

    float min = FLT_MAX;
    uint32_t index = 0;
    for (uint32_t i = 0; i < 16; i++) {
        if (minValuesBuffer[i] < min) {
            min = minValuesBuffer[i];
            index = (uint32_t)minIndexesBuffer[i];
        } else if (minValuesBuffer[i] == min) {
            if ((uint32_t)minIndexesBuffer[i] < index)
                index = (uint32_t)minIndexesBuffer[i];
        }
    }

    // Handle tail
    for (uint32_t number = sixteenthPoints * 16; number < num_points; number++) {
        const float re = lv_creal(*src0Ptr);
        const float im = lv_cimag(*src0Ptr);
        const float sq_dist = re * re + im * im;
        if (sq_dist < min) {
            min = sq_dist;
            index = number;
        }
        src0Ptr++;
    }
    *target = (uint16_t)index;
}

#endif /*LV_HAVE_AVX512F*/
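
To make the two shuffle constants concrete, here is the first 128-bit lane of in0 traced through the sequence (values hypothetical; all four lanes behave identically):

/* in0 lane 0 after squaring:   { re0², im0², re1², im1² }
 * sw0 = shuffle(in0, 0xB1):    { im0², re0², im1², re1² }  adjacent swap
 * sum0 = in0 + sw0:            { mag0, mag0, mag1, mag1 }
 * mag_sq = shuffle(sum0, sum1, 0x88) picks elements 0,2 of sum0 and of
 * sum1 per lane, giving { mag0, mag1, mag8, mag9 } in lane 0 -- hence the
 * 0,1,8,9, 2,3,10,11, ... ordering seeded into currentIndexes. */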

#endif /*INCLUDED_volk_32fc_index_min_16u_a_H*/

#ifndef INCLUDED_volk_32fc_index_min_16u_u_H
#define INCLUDED_volk_32fc_index_min_16u_u_H

#include <float.h>
#include <inttypes.h>
#include <limits.h>
#include <stdio.h>
#include <volk/volk_common.h>
#include <volk/volk_complex.h>

#ifdef LV_HAVE_AVX2
#include <immintrin.h>
#include <volk/volk_avx2_intrin.h>

static inline void volk_32fc_index_min_16u_u_avx2_variant_0(uint16_t* target,
                                                            const lv_32fc_t* source,
                                                            uint32_t num_points)
{
    num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;

    const __m256i indices_increment = _mm256_set1_epi32(8);
    /*
     * At the start of each loop iteration current_indices holds the indices of
     * the complex numbers loaded from memory. An explanation of the odd order
     * is given in the implementation of vector_32fc_index_min_variant0().
     */
    __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);

    __m256 min_values = _mm256_set1_ps(FLT_MAX);
    __m256i min_indices = _mm256_setzero_si256();

    for (unsigned i = 0; i < num_points / 8u; ++i) {
        __m256 in0 = _mm256_loadu_ps((float*)source);
        __m256 in1 = _mm256_loadu_ps((float*)(source + 4));
        vector_32fc_index_min_variant0(
            in0, in1, &min_values, &min_indices, &current_indices, indices_increment);
        source += 8;
    }

    // determine minimum value and index in the result of the vectorized loop
    __VOLK_ATTR_ALIGNED(32) float min_values_buffer[8];
    __VOLK_ATTR_ALIGNED(32) uint32_t min_indices_buffer[8];
    _mm256_store_ps(min_values_buffer, min_values);
    _mm256_store_si256((__m256i*)min_indices_buffer, min_indices);

    float min = FLT_MAX;
    uint32_t index = 0;
    for (unsigned i = 0; i < 8; i++) {
        if (min_values_buffer[i] < min) {
            min = min_values_buffer[i];
            index = min_indices_buffer[i];
        }
    }

    // handle tail not processed by the vectorized loop
    for (unsigned i = num_points & (~7u); i < num_points; ++i) {
        const float abs_squared =
            lv_creal(*source) * lv_creal(*source) + lv_cimag(*source) * lv_cimag(*source);
        if (abs_squared < min) {
            min = abs_squared;
            index = i;
        }
        ++source;
    }

    *target = index;
}

#endif /*LV_HAVE_AVX2*/

#ifdef LV_HAVE_AVX2
#include <immintrin.h>
#include <volk/volk_avx2_intrin.h>

static inline void volk_32fc_index_min_16u_u_avx2_variant_1(uint16_t* target,
                                                            const lv_32fc_t* source,
                                                            uint32_t num_points)
{
    num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;

    const __m256i indices_increment = _mm256_set1_epi32(8);
    /*
     * At the start of each loop iteration current_indices holds the indices of
     * the complex numbers loaded from memory. An explanation of the odd order
     * is given in the implementation of vector_32fc_index_min_variant0().
     */
    __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);

    __m256 min_values = _mm256_set1_ps(FLT_MAX);
    __m256i min_indices = _mm256_setzero_si256();

    for (unsigned i = 0; i < num_points / 8u; ++i) {
        __m256 in0 = _mm256_loadu_ps((float*)source);
        __m256 in1 = _mm256_loadu_ps((float*)(source + 4));
        vector_32fc_index_min_variant1(
            in0, in1, &min_values, &min_indices, &current_indices, indices_increment);
        source += 8;
    }

    // determine minimum value and index in the result of the vectorized loop
    __VOLK_ATTR_ALIGNED(32) float min_values_buffer[8];
    __VOLK_ATTR_ALIGNED(32) uint32_t min_indices_buffer[8];
    _mm256_store_ps(min_values_buffer, min_values);
    _mm256_store_si256((__m256i*)min_indices_buffer, min_indices);

    float min = FLT_MAX;
    uint32_t index = 0;
    for (unsigned i = 0; i < 8; i++) {
        if (min_values_buffer[i] < min) {
            min = min_values_buffer[i];
            index = min_indices_buffer[i];
        }
    }

    // handle tail not processed by the vectorized loop
    for (unsigned i = num_points & (~7u); i < num_points; ++i) {
        const float abs_squared =
            lv_creal(*source) * lv_creal(*source) + lv_cimag(*source) * lv_cimag(*source);
        if (abs_squared < min) {
            min = abs_squared;
            index = i;
        }
        ++source;
    }

    *target = index;
}

#endif /*LV_HAVE_AVX2*/

#ifdef LV_HAVE_AVX512F
#include <float.h>
#include <immintrin.h>
#include <limits.h>

static inline void volk_32fc_index_min_16u_u_avx512f(uint16_t* target,
                                                     const lv_32fc_t* src0,
                                                     uint32_t num_points)
{
    num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;

    const lv_32fc_t* src0Ptr = src0;
    const uint32_t sixteenthPoints = num_points / 16;

    // Index ordering after shuffle: [0,1,8,9, 2,3,10,11, 4,5,12,13, 6,7,14,15]
    __m512 currentIndexes =
        _mm512_setr_ps(0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15);
    const __m512 indexIncrement = _mm512_set1_ps(16);

    __m512 minValues = _mm512_set1_ps(FLT_MAX);
    __m512 minIndices = _mm512_setzero_ps();

    for (uint32_t number = 0; number < sixteenthPoints; number++) {
        // Load 16 complex values (32 floats)
        __m512 in0 = _mm512_loadu_ps((const float*)src0Ptr);
        __m512 in1 = _mm512_loadu_ps((const float*)(src0Ptr + 8));
        src0Ptr += 16;

        // Square all values
        in0 = _mm512_mul_ps(in0, in0);
        in1 = _mm512_mul_ps(in1, in1);

        // Add adjacent pairs (re² + im²) using within-lane shuffle
        // 0xB1 = _MM_SHUFFLE(2,3,0,1) swaps adjacent elements
        __m512 sw0 = _mm512_shuffle_ps(in0, in0, 0xB1);
        __m512 sw1 = _mm512_shuffle_ps(in1, in1, 0xB1);
        __m512 sum0 = _mm512_add_ps(in0, sw0);
        __m512 sum1 = _mm512_add_ps(in1, sw1);

        // Compact: pick elements 0,2 from sum0 and sum1 per 128-bit lane
        // 0x88 = _MM_SHUFFLE(2,0,2,0)
        __m512 mag_sq = _mm512_shuffle_ps(sum0, sum1, 0x88);

        // Compare and update minimums
        __mmask16 cmpMask = _mm512_cmp_ps_mask(mag_sq, minValues, _CMP_LT_OS);
        minIndices = _mm512_mask_blend_ps(cmpMask, minIndices, currentIndexes);
        minValues = _mm512_min_ps(mag_sq, minValues);

        currentIndexes = _mm512_add_ps(currentIndexes, indexIncrement);
    }

    // Reduce 16 values to find minimum
    __VOLK_ATTR_ALIGNED(64) float minValuesBuffer[16];
    __VOLK_ATTR_ALIGNED(64) float minIndexesBuffer[16];
    _mm512_store_ps(minValuesBuffer, minValues);
    _mm512_store_ps(minIndexesBuffer, minIndices);

    float min = FLT_MAX;
    uint32_t index = 0;
    for (uint32_t i = 0; i < 16; i++) {
        if (minValuesBuffer[i] < min) {
            min = minValuesBuffer[i];
            index = (uint32_t)minIndexesBuffer[i];
        } else if (minValuesBuffer[i] == min) {
            if ((uint32_t)minIndexesBuffer[i] < index)
                index = (uint32_t)minIndexesBuffer[i];
        }
    }

    // Handle tail
    for (uint32_t number = sixteenthPoints * 16; number < num_points; number++) {
        const float re = lv_creal(*src0Ptr);
        const float im = lv_cimag(*src0Ptr);
        const float sq_dist = re * re + im * im;
        if (sq_dist < min) {
            min = sq_dist;
            index = number;
        }
        src0Ptr++;
    }
    *target = (uint16_t)index;
}

#endif /*LV_HAVE_AVX512F*/

#ifdef LV_HAVE_RVV
#include <float.h>
#include <riscv_vector.h>
#include <volk/volk_rvv_intrin.h> /* RISCV_SHRINK4 */

static inline void volk_32fc_index_min_16u_rvv(uint16_t* target,
                                               const lv_32fc_t* source,
                                               uint32_t num_points)
{
    vfloat32m4_t vmin = __riscv_vfmv_v_f_f32m4(FLT_MAX, __riscv_vsetvlmax_e32m4());
    vuint16m2_t vmini = __riscv_vmv_v_x_u16m2(0, __riscv_vsetvlmax_e16m2());
    vuint16m2_t vidx = __riscv_vid_v_u16m2(__riscv_vsetvlmax_e16m2());
    size_t n = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
    for (size_t vl; n > 0; n -= vl, source += vl) {
        vl = __riscv_vsetvl_e32m4(n);
        // load one complex per 64-bit lane, then split re/im with narrowing shifts
        vuint64m8_t vc = __riscv_vle64_v_u64m8((const uint64_t*)source, vl);
        vfloat32m4_t vr = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vc, 0, vl));
        vfloat32m4_t vi = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vc, 32, vl));
        vfloat32m4_t v = __riscv_vfmacc(__riscv_vfmul(vr, vr, vl), vi, vi, vl);
        vbool8_t m = __riscv_vmfgt(vmin, v, vl);
        vmin = __riscv_vfmin_tu(vmin, vmin, v, vl);
        vmini = __riscv_vmerge_tu(vmini, vmini, vidx, m, vl);
        vidx = __riscv_vadd(vidx, vl, __riscv_vsetvlmax_e16m2());
    }
    size_t vl = __riscv_vsetvlmax_e32m4();
    float min = __riscv_vfmv_f(__riscv_vfredmin(RISCV_SHRINK4(vfmin, f, 32, vmin),
                                                __riscv_vfmv_v_f_f32m1(FLT_MAX, 1),
                                                __riscv_vsetvlmax_e32m1()));
    // Find the smallest index among lanes that hold the minimum value.
    // The buffers assume VLMAX for e32m4 (VLEN/8) is at most 128, i.e. VLEN <= 1024.
    __attribute__((aligned(32))) float values[128];
    __attribute__((aligned(32))) uint16_t indices[128];
    __riscv_vse32(values, vmin, vl);
    __riscv_vse16(indices, vmini, vl);
    uint16_t min_idx = UINT16_MAX;
    for (size_t i = 0; i < vl; i++) {
        if (values[i] == min && indices[i] < min_idx) {
            min_idx = indices[i];
        }
    }
    *target = min_idx;
}
#endif /*LV_HAVE_RVV*/
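
A note on the deinterleave trick used above, since it differs from the segment load in the next variant:

/* Each lv_32fc_t occupies 8 bytes: re in the low 32 bits, im in the high 32.
 * Loading with vle64 places one complex number per 64-bit lane, and the
 * narrowing logical shifts split it:
 *   __riscv_vnsrl(vc,  0, vl) -> low halves  = all real parts
 *   __riscv_vnsrl(vc, 32, vl) -> high halves = all imaginary parts
 * The RVVSEG variant below obtains the same vr/vi pair with a two-field
 * segment load (vlseg2e32) instead. */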

#ifdef LV_HAVE_RVVSEG
#include <float.h>
#include <riscv_vector.h>
#include <volk/volk_rvv_intrin.h> /* RISCV_SHRINK4 */

static inline void volk_32fc_index_min_16u_rvvseg(uint16_t* target,
                                                  const lv_32fc_t* source,
                                                  uint32_t num_points)
{
    vfloat32m4_t vmin = __riscv_vfmv_v_f_f32m4(FLT_MAX, __riscv_vsetvlmax_e32m4());
    vuint16m2_t vmini = __riscv_vmv_v_x_u16m2(0, __riscv_vsetvlmax_e16m2());
    vuint16m2_t vidx = __riscv_vid_v_u16m2(__riscv_vsetvlmax_e16m2());
    size_t n = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
    for (size_t vl; n > 0; n -= vl, source += vl) {
        vl = __riscv_vsetvl_e32m4(n);
        // two-field segment load deinterleaves re/im directly
        vfloat32m4x2_t vc = __riscv_vlseg2e32_v_f32m4x2((const float*)source, vl);
        vfloat32m4_t vr = __riscv_vget_f32m4(vc, 0), vi = __riscv_vget_f32m4(vc, 1);
        vfloat32m4_t v = __riscv_vfmacc(__riscv_vfmul(vr, vr, vl), vi, vi, vl);
        vbool8_t m = __riscv_vmfgt(vmin, v, vl);
        vmin = __riscv_vfmin_tu(vmin, vmin, v, vl);
        vmini = __riscv_vmerge_tu(vmini, vmini, vidx, m, vl);
        vidx = __riscv_vadd(vidx, vl, __riscv_vsetvlmax_e16m2());
    }
    size_t vl = __riscv_vsetvlmax_e32m4();
    float min = __riscv_vfmv_f(__riscv_vfredmin(RISCV_SHRINK4(vfmin, f, 32, vmin),
                                                __riscv_vfmv_v_f_f32m1(FLT_MAX, 1),
                                                __riscv_vsetvlmax_e32m1()));
    // Find the smallest index among lanes that hold the minimum value.
    // The buffers assume VLMAX for e32m4 (VLEN/8) is at most 128, i.e. VLEN <= 1024.
    __attribute__((aligned(32))) float values[128];
    __attribute__((aligned(32))) uint16_t indices[128];
    __riscv_vse32(values, vmin, vl);
    __riscv_vse16(indices, vmini, vl);
    uint16_t min_idx = UINT16_MAX;
    for (size_t i = 0; i < vl; i++) {
        if (values[i] == min && indices[i] < min_idx) {
            min_idx = indices[i];
        }
    }
    *target = min_idx;
}
#endif /*LV_HAVE_RVVSEG*/

#endif /*INCLUDED_volk_32fc_index_min_16u_u_H*/