#ifndef INCLUDED_volk_32fc_index_min_32u_a_H
#define INCLUDED_volk_32fc_index_min_32u_a_H
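/*
 * volk_32fc_index_min_32u: writes to *target the index of the complex sample
 * in source[] with the smallest magnitude squared (re*re + im*im).
 *
 * Usage sketch (assumes the standard VOLK allocation helpers):
 *
 *   uint32_t index;
 *   lv_32fc_t* samples =
 *       (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t) * N, volk_get_alignment());
 *   // ... fill samples ...
 *   volk_32fc_index_min_32u(&index, samples, N);
 *   volk_free(samples);
 */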
static inline void volk_32fc_index_min_32u_a_avx2_variant_0(uint32_t* target,
    const __m256i indices_increment = _mm256_set1_epi32(8);
    __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
    __m256 min_values = _mm256_set1_ps(FLT_MAX);
    __m256i min_indices = _mm256_setzero_si256();
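    // Main loop: process 8 complex samples per iteration, tracking per-lane
    // minimum magnitudes and the indices at which they occurred.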
    for (unsigned i = 0; i < num_points / 8u; ++i) {
        __m256 in0 = _mm256_load_ps((float*)source);
        __m256 in1 = _mm256_load_ps((float*)(source + 4));
            in0, in1, &min_values, &min_indices, &current_indices, indices_increment);
    _mm256_store_ps(min_values_buffer, min_values);
    _mm256_store_si256((__m256i*)min_indices_buffer, min_indices);
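    // Reduce the 8 per-lane minima spilled above to a single scalar minimum
    // and its index.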
    for (unsigned i = 0; i < 8; i++) {
        if (min_values_buffer[i] < min) {
            min = min_values_buffer[i];
            index = min_indices_buffer[i];
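    // Scalar tail: handle the remaining num_points % 8 samples.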
    for (unsigned i = num_points & (~7u); i < num_points; ++i) {
        const float abs_squared =
        if (abs_squared < min) {
#include <immintrin.h>
static inline void volk_32fc_index_min_32u_a_avx2_variant_1(uint32_t* target,
    const __m256i indices_increment = _mm256_set1_epi32(8);
    __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
    __m256 min_values = _mm256_set1_ps(FLT_MAX);
    __m256i min_indices = _mm256_setzero_si256();
    for (unsigned i = 0; i < num_points / 8u; ++i) {
        __m256 in0 = _mm256_load_ps((float*)source);
        __m256 in1 = _mm256_load_ps((float*)(source + 4));
            in0, in1, &min_values, &min_indices, &current_indices, indices_increment);
    _mm256_store_ps(min_values_buffer, min_values);
    _mm256_store_si256((__m256i*)min_indices_buffer, min_indices);
    for (unsigned i = 0; i < 8; i++) {
        if (min_values_buffer[i] < min) {
            min = min_values_buffer[i];
            index = min_indices_buffer[i];
    for (unsigned i = num_points & (~7u); i < num_points; ++i) {
        const float abs_squared =
        if (abs_squared < min) {
#include <pmmintrin.h>
#include <xmmintrin.h>
    __m128 xmm1, xmm2, xmm3;
    __m128i xmm8, xmm11, xmm12, xmm9, xmm10;
    xmm5.int_vec = _mm_setzero_si128();
    xmm4.int_vec = _mm_setzero_si128();
    holderf.int_vec = _mm_setzero_si128();
    holderi.int_vec = _mm_setzero_si128();
    xmm8 = _mm_setr_epi32(0, 1, 2, 3);
    xmm9 = _mm_setzero_si128();
    xmm10 = _mm_setr_epi32(4, 4, 4, 4);
    xmm3 = _mm_set_ps1(FLT_MAX);
    int bound = num_points >> 2;
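    // Main loop: 4 complex samples per iteration. Square and horizontally add
    // to form magnitude squared, then update the per-lane minima (xmm3) and
    // their indices (xmm9); xmm8 carries the running sample indices.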
    for (int i = 0; i < bound; ++i) {
        xmm1 = _mm_load_ps((float*)source);
        xmm2 = _mm_load_ps((float*)&source[2]);
        xmm1 = _mm_mul_ps(xmm1, xmm1);
        xmm2 = _mm_mul_ps(xmm2, xmm2);
        xmm1 = _mm_hadd_ps(xmm1, xmm2);
        xmm3 = _mm_min_ps(xmm1, xmm3);
        xmm4.float_vec = _mm_cmpgt_ps(xmm1, xmm3);
        xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);
        xmm11 = _mm_and_si128(xmm8, xmm5.int_vec);
        xmm12 = _mm_and_si128(xmm9, xmm4.int_vec);
        xmm9 = _mm_add_epi32(xmm11, xmm12);
        xmm8 = _mm_add_epi32(xmm8, xmm10);
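    // If bit 1 of num_points is set, two complex samples remain; process them
    // as a pair.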
    if (num_points >> 1 & 1) {
        xmm2 = _mm_load_ps((float*)source);
        xmm2 = _mm_mul_ps(xmm2, xmm2);
        xmm1 = _mm_hadd_ps(xmm2, xmm2);
        xmm3 = _mm_min_ps(xmm1, xmm3);
        xmm10 = _mm_setr_epi32(2, 2, 2, 2);
        xmm4.float_vec = _mm_cmpgt_ps(xmm1, xmm3);
        xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);
        xmm11 = _mm_and_si128(xmm8, xmm5.int_vec);
        xmm12 = _mm_and_si128(xmm9, xmm4.int_vec);
        xmm9 = _mm_add_epi32(xmm11, xmm12);
        xmm8 = _mm_add_epi32(xmm8, xmm10);
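    // If num_points is odd, one final complex sample remains.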
    if (num_points & 1) {
        xmm2 = _mm_load1_ps(&sq_dist);
        xmm3 = _mm_min_ss(xmm3, xmm2);
        xmm4.float_vec = _mm_cmpgt_ps(xmm1, xmm3);
        xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);
        xmm8 = _mm_shuffle_epi32(xmm8, 0x00);
        xmm11 = _mm_and_si128(xmm8, xmm4.int_vec);
        xmm12 = _mm_and_si128(xmm9, xmm5.int_vec);
        xmm9 = _mm_add_epi32(xmm11, xmm12);
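    // Spill the four candidate minima and their indices, then scan them to
    // find the overall minimum.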
    _mm_store_ps((float*)&(holderf.f), xmm3);
    _mm_store_si128(&(holderi.int_vec), xmm9);
    target[0] = holderi.i[0];
    sq_dist = holderf.f[0];
    target[0] = (holderf.f[1] < sq_dist) ? holderi.i[1] : target[0];
    sq_dist = (holderf.f[1] < sq_dist) ? holderf.f[1] : sq_dist;
    target[0] = (holderf.f[2] < sq_dist) ? holderi.i[2] : target[0];
    sq_dist = (holderf.f[2] < sq_dist) ? holderf.f[2] : sq_dist;
    target[0] = (holderf.f[3] < sq_dist) ? holderi.i[3] : target[0];
    sq_dist = (holderf.f[3] < sq_dist) ? holderf.f[3] : sq_dist;
#ifdef LV_HAVE_GENERIC
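// Scalar reference implementation: compute |z|^2 for every sample and keep the
// index of the smallest value.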
    for (uint32_t i = 0; i < num_points; ++i) {
#ifdef LV_HAVE_AVX512F
#include <immintrin.h>
static inline void volk_32fc_index_min_32u_a_avx512f(uint32_t* target,
    const uint32_t sixteenthPoints = num_points / 16;
    __m512 currentIndexes =
        _mm512_setr_ps(0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15);
    const __m512 indexIncrement = _mm512_set1_ps(16);
    __m512 minValues = _mm512_set1_ps(FLT_MAX);
    __m512 minIndices = _mm512_setzero_ps();
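    // Each iteration consumes 16 complex samples (two 512-bit loads). The
    // interleaved index pattern matches the lane order produced by the
    // shuffle-based magnitude computation in the loop.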
    for (uint32_t number = 0; number < sixteenthPoints; number++) {
        __m512 in0 = _mm512_load_ps((const float*)src0Ptr);
        __m512 in1 = _mm512_load_ps((const float*)(src0Ptr + 8));
        in0 = _mm512_mul_ps(in0, in0);
        in1 = _mm512_mul_ps(in1, in1);
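        // 0xB1 swaps the re^2/im^2 products within each pair so the add yields
        // re^2 + im^2 in both lanes of the pair; 0x88 then gathers one copy per
        // sample, interleaving results from in0 and in1.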
        __m512 sw0 = _mm512_shuffle_ps(in0, in0, 0xB1);
        __m512 sw1 = _mm512_shuffle_ps(in1, in1, 0xB1);
        __m512 sum0 = _mm512_add_ps(in0, sw0);
        __m512 sum1 = _mm512_add_ps(in1, sw1);
        __m512 mag_sq = _mm512_shuffle_ps(sum0, sum1, 0x88);
        __mmask16 cmpMask = _mm512_cmp_ps_mask(mag_sq, minValues, _CMP_LT_OS);
        minIndices = _mm512_mask_blend_ps(cmpMask, minIndices, currentIndexes);
        minValues = _mm512_min_ps(mag_sq, minValues);
        currentIndexes = _mm512_add_ps(currentIndexes, indexIncrement);
    _mm512_store_ps(minValuesBuffer, minValues);
    _mm512_store_ps(minIndexesBuffer, minIndices);
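    // Scan the 16 per-lane results; on ties, prefer the lower index.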
    for (uint32_t i = 0; i < 16; i++) {
        if (minValuesBuffer[i] < min) {
            min = minValuesBuffer[i];
            index = (uint32_t)minIndexesBuffer[i];
        } else if (minValuesBuffer[i] == min) {
            if ((uint32_t)minIndexesBuffer[i] < index)
                index = (uint32_t)minIndexesBuffer[i];
    for (uint32_t number = sixteenthPoints * 16; number < num_points; number++) {
        const float re = lv_creal(*src0Ptr);
        const float im = lv_cimag(*src0Ptr);
        const float sq_dist = re * re + im * im;
#ifndef INCLUDED_volk_32fc_index_min_32u_u_H
#define INCLUDED_volk_32fc_index_min_32u_u_H
#include <immintrin.h>
static inline void volk_32fc_index_min_32u_u_avx2_variant_0(uint32_t* target,
    const __m256i indices_increment = _mm256_set1_epi32(8);
    __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
    __m256 min_values = _mm256_set1_ps(FLT_MAX);
    __m256i min_indices = _mm256_setzero_si256();
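    // Same algorithm as the aligned variant, but using unaligned (loadu) loads.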
    for (unsigned i = 0; i < num_points / 8u; ++i) {
        __m256 in0 = _mm256_loadu_ps((float*)source);
        __m256 in1 = _mm256_loadu_ps((float*)(source + 4));
            in0, in1, &min_values, &min_indices, &current_indices, indices_increment);
    _mm256_store_ps(min_values_buffer, min_values);
    _mm256_store_si256((__m256i*)min_indices_buffer, min_indices);
    for (unsigned i = 0; i < 8; i++) {
        if (min_values_buffer[i] < min) {
            min = min_values_buffer[i];
            index = min_indices_buffer[i];
    for (unsigned i = num_points & (~7u); i < num_points; ++i) {
        const float abs_squared =
        if (abs_squared < min) {
#include <immintrin.h>
static inline void volk_32fc_index_min_32u_u_avx2_variant_1(uint32_t* target,
    const __m256i indices_increment = _mm256_set1_epi32(8);
    __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
    __m256 min_values = _mm256_set1_ps(FLT_MAX);
    __m256i min_indices = _mm256_setzero_si256();
    for (unsigned i = 0; i < num_points / 8u; ++i) {
        __m256 in0 = _mm256_loadu_ps((float*)source);
        __m256 in1 = _mm256_loadu_ps((float*)(source + 4));
            in0, in1, &min_values, &min_indices, &current_indices, indices_increment);
    _mm256_store_ps(min_values_buffer, min_values);
    _mm256_store_si256((__m256i*)min_indices_buffer, min_indices);
    for (unsigned i = 0; i < 8; i++) {
        if (min_values_buffer[i] < min) {
            min = min_values_buffer[i];
            index = min_indices_buffer[i];
    for (unsigned i = num_points & (~7u); i < num_points; ++i) {
        const float abs_squared =
        if (abs_squared < min) {
    const uint32_t quarter_points = num_points / 4;
    uint32_t indices[4] = { 0, 1, 2, 3 };
    const uint32x4_t vec_indices_incr = vdupq_n_u32(4);
    uint32x4_t vec_indices = vld1q_u32(indices);
    uint32x4_t vec_min_indices = vec_indices;
    float32x4_t vec_min = vdupq_n_f32(FLT_MAX);
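    // Main loop: 4 complex samples per iteration; lt_mask marks lanes where a
    // new minimum was found so both the value and its index are captured.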
    for (uint32_t number = 0; number < quarter_points; number++) {
        const float32x4_t vec_mag2 =
        const uint32x4_t lt_mask = vcltq_f32(vec_mag2, vec_min);
        vec_min = vbslq_f32(lt_mask, vec_mag2, vec_min);
        vec_min_indices = vbslq_u32(lt_mask, vec_indices, vec_min_indices);
        vec_indices = vaddq_u32(vec_indices, vec_indices_incr);
    uint32_t tmp_min_indices[4];
    vst1q_u32(tmp_min_indices, vec_min_indices);
    vst1q_f32(tmp_min, vec_min);
    for (int i = 0; i < 4; i++) {
        if (tmp_min[i] < min) {
            index = tmp_min_indices[i];
        } else if (tmp_min[i] == min) {
            if (tmp_min_indices[i] < index)
                index = tmp_min_indices[i];
    for (uint32_t number = quarter_points * 4; number < num_points; number++) {
        const float re = lv_creal(*sourcePtr);
        const float im = lv_cimag(*sourcePtr);
        const float sq_dist = re * re + im * im;
static inline void volk_32fc_index_min_32u_neonv8(uint32_t* target,
    const uint32_t quarter_points = num_points / 4;
    uint32x4_t vec_indices = { 0, 1, 2, 3 };
    const uint32x4_t vec_incr = vdupq_n_u32(4);
    float32x4_t vec_min = vdupq_n_f32(FLT_MAX);
    uint32x4_t vec_min_idx = vdupq_n_u32(0);
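    // ARMv8 variant: uses fused multiply-add for the magnitude squared and the
    // vminvq_f32 / vminvq_u32 horizontal reductions after the loop.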
    for (uint32_t i = 0; i < quarter_points; i++) {
        float32x4x2_t cplx = vld2q_f32((const float*)inputPtr);
            vfmaq_f32(vmulq_f32(cplx.val[0], cplx.val[0]), cplx.val[1], cplx.val[1]);
        uint32x4_t lt_mask = vcltq_f32(mag2, vec_min);
        vec_min_idx = vbslq_u32(lt_mask, vec_indices, vec_min_idx);
        vec_min = vminq_f32(mag2, vec_min);
        vec_indices = vaddq_u32(vec_indices, vec_incr);
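    // Horizontal reduction: find the overall minimum, then the lowest lane
    // index holding that value.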
    float min_val = vminvq_f32(vec_min);
    uint32x4_t min_mask = vceqq_f32(vec_min, vdupq_n_f32(min_val));
    uint32x4_t idx_masked = vbslq_u32(min_mask, vec_min_idx, vdupq_n_u32(UINT32_MAX));
    uint32_t result_idx = vminvq_u32(idx_masked);
    for (uint32_t i = quarter_points * 4; i < num_points; i++) {
        float mag2 = re * re + im * im;
        if (mag2 < min_val) {
    *target = result_idx;
#ifdef LV_HAVE_AVX512F
#include <immintrin.h>
static inline void volk_32fc_index_min_32u_u_avx512f(uint32_t* target,
    const uint32_t sixteenthPoints = num_points / 16;
    __m512 currentIndexes =
        _mm512_setr_ps(0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15);
    const __m512 indexIncrement = _mm512_set1_ps(16);
    __m512 minValues = _mm512_set1_ps(FLT_MAX);
    __m512 minIndices = _mm512_setzero_ps();
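    // Unaligned-pointer version of the AVX-512 kernel above; identical logic
    // with loadu accesses.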
    for (uint32_t number = 0; number < sixteenthPoints; number++) {
        __m512 in0 = _mm512_loadu_ps((const float*)src0Ptr);
        __m512 in1 = _mm512_loadu_ps((const float*)(src0Ptr + 8));
        in0 = _mm512_mul_ps(in0, in0);
        in1 = _mm512_mul_ps(in1, in1);
        __m512 sw0 = _mm512_shuffle_ps(in0, in0, 0xB1);
        __m512 sw1 = _mm512_shuffle_ps(in1, in1, 0xB1);
        __m512 sum0 = _mm512_add_ps(in0, sw0);
        __m512 sum1 = _mm512_add_ps(in1, sw1);
        __m512 mag_sq = _mm512_shuffle_ps(sum0, sum1, 0x88);
        __mmask16 cmpMask = _mm512_cmp_ps_mask(mag_sq, minValues, _CMP_LT_OS);
        minIndices = _mm512_mask_blend_ps(cmpMask, minIndices, currentIndexes);
        minValues = _mm512_min_ps(mag_sq, minValues);
        currentIndexes = _mm512_add_ps(currentIndexes, indexIncrement);
    _mm512_store_ps(minValuesBuffer, minValues);
    _mm512_store_ps(minIndexesBuffer, minIndices);
    for (uint32_t i = 0; i < 16; i++) {
        if (minValuesBuffer[i] < min) {
            min = minValuesBuffer[i];
            index = (uint32_t)minIndexesBuffer[i];
        } else if (minValuesBuffer[i] == min) {
            if ((uint32_t)minIndexesBuffer[i] < index)
                index = (uint32_t)minIndexesBuffer[i];
    for (uint32_t number = sixteenthPoints * 16; number < num_points; number++) {
        const float re = lv_creal(*src0Ptr);
        const float im = lv_cimag(*src0Ptr);
        const float sq_dist = re * re + im * im;
#include <riscv_vector.h>
static inline void volk_32fc_index_min_32u_rvv(uint32_t* target,
    vfloat32m4_t vmin = __riscv_vfmv_v_f_f32m4(FLT_MAX, __riscv_vsetvlmax_e32m4());
    vuint32m4_t vmini = __riscv_vmv_v_x_u32m4(0, __riscv_vsetvlmax_e32m4());
    vuint32m4_t vidx = __riscv_vid_v_u32m4(__riscv_vsetvlmax_e32m4());
    size_t n = num_points;
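    // RVV loop (LMUL = 4): load complex samples as 64-bit words, split them
    // into real and imaginary halves with narrowing shifts, and track the
    // per-lane minima and their indices.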
    for (size_t vl; n > 0; n -= vl, source += vl) {
        vl = __riscv_vsetvl_e32m4(n);
        vuint64m8_t vc = __riscv_vle64_v_u64m8((const uint64_t*)source, vl);
        vfloat32m4_t vr = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vc, 0, vl));
        vfloat32m4_t vi = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vc, 32, vl));
        vfloat32m4_t v = __riscv_vfmacc(__riscv_vfmul(vr, vr, vl), vi, vi, vl);
        vbool8_t m = __riscv_vmfgt(vmin, v, vl);
        vmin = __riscv_vfmin_tu(vmin, vmin, v, vl);
        vmini = __riscv_vmerge_tu(vmini, vmini, vidx, m, vl);
        vidx = __riscv_vadd(vidx, vl, __riscv_vsetvlmax_e32m4());
    size_t vl = __riscv_vsetvlmax_e32m4();
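    // Reduce: find the global minimum value, then the smallest index whose
    // lane holds that value.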
    float min = __riscv_vfmv_f(__riscv_vfredmin(RISCV_SHRINK4(vfmin, f, 32, vmin),
                                                __riscv_vfmv_v_f_f32m1(FLT_MAX, 1),
                                                __riscv_vsetvlmax_e32m1()));
    vbool8_t m = __riscv_vmfeq(vmin, min, vl);
    vuint32m4_t idx_masked =
        __riscv_vmerge(__riscv_vmv_v_x_u32m4(UINT32_MAX, vl), vmini, m, vl);
    *target = __riscv_vmv_x(__riscv_vredminu(RISCV_SHRINK4(vminu, u, 32, idx_masked),
                                             __riscv_vmv_v_x_u32m1(UINT32_MAX, 1),
                                             __riscv_vsetvlmax_e32m1()));
#include <riscv_vector.h>
static inline void volk_32fc_index_min_32u_rvvseg(uint32_t* target,
    vfloat32m4_t vmin = __riscv_vfmv_v_f_f32m4(FLT_MAX, __riscv_vsetvlmax_e32m4());
    vuint32m4_t vmini = __riscv_vmv_v_x_u32m4(0, __riscv_vsetvlmax_e32m4());
    vuint32m4_t vidx = __riscv_vid_v_u32m4(__riscv_vsetvlmax_e32m4());
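    // Segmented-load variant: vlseg2e32 deinterleaves the real and imaginary
    // parts directly instead of using 64-bit loads and narrowing shifts.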
    size_t n = num_points;
    for (size_t vl; n > 0; n -= vl, source += vl) {
        vl = __riscv_vsetvl_e32m4(n);
        vfloat32m4x2_t vc = __riscv_vlseg2e32_v_f32m4x2((const float*)source, vl);
        vfloat32m4_t vr = __riscv_vget_f32m4(vc, 0), vi = __riscv_vget_f32m4(vc, 1);
        vfloat32m4_t v = __riscv_vfmacc(__riscv_vfmul(vr, vr, vl), vi, vi, vl);
        vbool8_t m = __riscv_vmfgt(vmin, v, vl);
        vmin = __riscv_vfmin_tu(vmin, vmin, v, vl);
        vmini = __riscv_vmerge_tu(vmini, vmini, vidx, m, vl);
        vidx = __riscv_vadd(vidx, vl, __riscv_vsetvlmax_e32m4());
    size_t vl = __riscv_vsetvlmax_e32m4();
    float min = __riscv_vfmv_f(__riscv_vfredmin(RISCV_SHRINK4(vfmin, f, 32, vmin),
                                                __riscv_vfmv_v_f_f32m1(FLT_MAX, 1),
                                                __riscv_vsetvlmax_e32m1()));
    vbool8_t m = __riscv_vmfeq(vmin, min, vl);
    vuint32m4_t idx_masked =
        __riscv_vmerge(__riscv_vmv_v_x_u32m4(UINT32_MAX, vl), vmini, m, vl);
    *target = __riscv_vmv_x(__riscv_vredminu(RISCV_SHRINK4(vminu, u, 32, idx_masked),
                                             __riscv_vmv_v_x_u32m1(UINT32_MAX, 1),
                                             __riscv_vsetvlmax_e32m1()));