#ifndef INCLUDED_volk_32fc_index_min_16u_a_H
#define INCLUDED_volk_32fc_index_min_16u_a_H
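/*!
 * \page volk_32fc_index_min_16u
 *
 * \b Overview
 *
 * Returns the index of the complex sample with the smallest magnitude in
 * the input buffer (the comparison is done on |z|^2, which preserves the
 * ordering). Because the result is a uint16_t, every kernel clamps
 * num_points to USHRT_MAX.
 *
 * A minimal usage sketch (the dispatcher name follows the usual VOLK
 * convention; the buffer setup is illustrative only):
 *
 * \code
 *   uint16_t min_index;
 *   lv_32fc_t samples[1024];
 *   // ... fill samples ...
 *   volk_32fc_index_min_16u(&min_index, samples, 1024);
 * \endcode
 */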
#include <float.h>  /* FLT_MAX */
#include <limits.h> /* USHRT_MAX */
#include <volk/volk_common.h>
#include <volk/volk_complex.h>

#ifdef LV_HAVE_AVX2
#include <immintrin.h>
#include <volk/volk_avx2_intrinsics.h>

static inline void volk_32fc_index_min_16u_a_avx2_variant_0(uint16_t* target,
                                                            const lv_32fc_t* source,
                                                            uint32_t num_points)
{
    num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
    const __m256i indices_increment = _mm256_set1_epi32(8);
    /* At the start of each iteration, current_indices holds the indices of
     * the 8 samples processed in that iteration. The odd ordering matches
     * the per-lane hadd used by the AVX2 min/argmin helper. */
    __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
    __m256 min_values = _mm256_set1_ps(FLT_MAX);
    __m256i min_indices = _mm256_setzero_si256();
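    /* Each of the 8 lanes tracks its own running minimum and the index at
     * which it occurred; the lanes are reduced to a single scalar result
     * after the main loop. */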
    for (unsigned i = 0; i < num_points / 8u; ++i) {
        __m256 in0 = _mm256_load_ps((float*)source);
        __m256 in1 = _mm256_load_ps((float*)(source + 4));
        vector_32fc_index_min_variant0(
            in0, in1, &min_values, &min_indices, &current_indices, indices_increment);
        source += 8;
    }
    /* determine the minimum value and index from the vectorized loop */
    __VOLK_ATTR_ALIGNED(32) float min_values_buffer[8];
    __VOLK_ATTR_ALIGNED(32) int32_t min_indices_buffer[8];
    _mm256_store_ps(min_values_buffer, min_values);
    _mm256_store_si256((__m256i*)min_indices_buffer, min_indices);
    float min = FLT_MAX;
    uint32_t index = 0;
    for (unsigned i = 0; i < 8; i++) {
        if (min_values_buffer[i] < min) {
            min = min_values_buffer[i];
            index = min_indices_buffer[i];
        }
    }
    /* handle the tail not processed by the vectorized loop */
    for (unsigned i = num_points & (~7u); i < num_points; ++i) {
        const float abs_squared = lv_creal(*source) * lv_creal(*source) +
                                  lv_cimag(*source) * lv_cimag(*source);
        if (abs_squared < min) {
            min = abs_squared;
            index = i;
        }
        ++source;
    }

    *target = index;
}
#endif /*LV_HAVE_AVX2*/

#ifdef LV_HAVE_AVX2
#include <immintrin.h>
#include <volk/volk_avx2_intrinsics.h>
static inline void volk_32fc_index_min_16u_a_avx2_variant_1(uint16_t* target,
                                                            const lv_32fc_t* source,
                                                            uint32_t num_points)
{
    num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
    const __m256i indices_increment = _mm256_set1_epi32(8);
    /* see variant_0 above for the reason behind the odd index ordering */
    __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
    __m256 min_values = _mm256_set1_ps(FLT_MAX);
    __m256i min_indices = _mm256_setzero_si256();
    for (unsigned i = 0; i < num_points / 8u; ++i) {
        __m256 in0 = _mm256_load_ps((float*)source);
        __m256 in1 = _mm256_load_ps((float*)(source + 4));
        vector_32fc_index_min_variant1(
            in0, in1, &min_values, &min_indices, &current_indices, indices_increment);
        source += 8;
    }
    /* determine the minimum value and index from the vectorized loop */
    __VOLK_ATTR_ALIGNED(32) float min_values_buffer[8];
    __VOLK_ATTR_ALIGNED(32) int32_t min_indices_buffer[8];
    _mm256_store_ps(min_values_buffer, min_values);
    _mm256_store_si256((__m256i*)min_indices_buffer, min_indices);
    float min = FLT_MAX;
    uint32_t index = 0;
    for (unsigned i = 0; i < 8; i++) {
        if (min_values_buffer[i] < min) {
            min = min_values_buffer[i];
            index = min_indices_buffer[i];
        }
    }
    /* handle the tail not processed by the vectorized loop */
    for (unsigned i = num_points & (~7u); i < num_points; ++i) {
        const float abs_squared = lv_creal(*source) * lv_creal(*source) +
                                  lv_cimag(*source) * lv_cimag(*source);
        if (abs_squared < min) {
            min = abs_squared;
            index = i;
        }
        ++source;
    }

    *target = index;
}
#endif /*LV_HAVE_AVX2*/

#ifdef LV_HAVE_SSE3
#include <pmmintrin.h>
#include <xmmintrin.h>
static inline void volk_32fc_index_min_16u_a_sse3(uint16_t* target,
                                                  const lv_32fc_t* source,
                                                  uint32_t num_points)
{
    num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
    union bit128 holderf;
    union bit128 holderi;
    float sq_dist = 0.0;

    union bit128 xmm5, xmm4;
    __m128 xmm1, xmm2, xmm3;
    __m128i xmm8, xmm11, xmm12, xmm9, xmm10;
    xmm5.int_vec = _mm_setzero_si128();
    xmm4.int_vec = _mm_setzero_si128();
    holderf.int_vec = _mm_setzero_si128();
    holderi.int_vec = _mm_setzero_si128();
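    /* Index bookkeeping: xmm8 holds the indices of the four magnitudes
     * processed per iteration, xmm9 accumulates the per-lane argmin, and
     * xmm10 is the index stride per iteration; xmm3 carries the running
     * minima, seeded with FLT_MAX. */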
    xmm8 = _mm_setr_epi32(0, 1, 2, 3);
    xmm9 = _mm_setzero_si128();
    xmm10 = _mm_setr_epi32(4, 4, 4, 4);
    xmm3 = _mm_set_ps1(FLT_MAX);
    int bound = num_points >> 2;
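    /* Per iteration: square and pairwise-add to get four magnitudes, fold
     * them into the running minima, then rebuild the per-lane argmin:
     * _mm_cmpeq_ps marks lanes where the new value equals the updated
     * minimum (a new minimum) and _mm_cmpgt_ps marks lanes where the old
     * minimum survived; masking xmm8/xmm9 with these masks and adding
     * selects the right index in every lane. */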
    for (int i = 0; i < bound; ++i) {
        xmm1 = _mm_load_ps((float*)source);
        xmm2 = _mm_load_ps((float*)&source[2]);

        source += 4;

        xmm1 = _mm_mul_ps(xmm1, xmm1);
        xmm2 = _mm_mul_ps(xmm2, xmm2);

        xmm1 = _mm_hadd_ps(xmm1, xmm2);

        xmm3 = _mm_min_ps(xmm1, xmm3);

        xmm4.float_vec = _mm_cmpgt_ps(xmm1, xmm3);
        xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);

        xmm11 = _mm_and_si128(xmm8, xmm5.int_vec);
        xmm12 = _mm_and_si128(xmm9, xmm4.int_vec);

        xmm9 = _mm_add_epi32(xmm11, xmm12);

        xmm8 = _mm_add_epi32(xmm8, xmm10);
    }
    if ((num_points >> 1) & 1) {
        xmm2 = _mm_load_ps((float*)source);

        /* duplicate the low two indices so the lanes map onto the two
         * remaining points */
        xmm1 = _mm_movelh_ps(bit128_p(&xmm8)->float_vec, bit128_p(&xmm8)->float_vec);
        xmm8 = bit128_p(&xmm1)->int_vec;

        xmm2 = _mm_mul_ps(xmm2, xmm2);

        source += 2;

        xmm1 = _mm_hadd_ps(xmm2, xmm2);

        xmm3 = _mm_min_ps(xmm1, xmm3);

        xmm10 = _mm_setr_epi32(2, 2, 2, 2);

        xmm4.float_vec = _mm_cmpgt_ps(xmm1, xmm3);
        xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);

        xmm11 = _mm_and_si128(xmm8, xmm5.int_vec);
        xmm12 = _mm_and_si128(xmm9, xmm4.int_vec);

        xmm9 = _mm_add_epi32(xmm11, xmm12);

        xmm8 = _mm_add_epi32(xmm8, xmm10);
    }
    if (num_points & 1) {
        sq_dist = lv_creal(source[0]) * lv_creal(source[0]) +
                  lv_cimag(source[0]) * lv_cimag(source[0]);

        xmm2 = _mm_load1_ps(&sq_dist);

        xmm1 = xmm3;

        xmm3 = _mm_min_ss(xmm3, xmm2);

        xmm4.float_vec = _mm_cmpgt_ps(xmm1, xmm3);
        xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);

        xmm8 = _mm_shuffle_epi32(xmm8, 0x00);

        xmm11 = _mm_and_si128(xmm8, xmm4.int_vec);
        xmm12 = _mm_and_si128(xmm9, xmm5.int_vec);

        xmm9 = _mm_add_epi32(xmm11, xmm12);
    }
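    /* Reduce the four lanes to a single result; the strict < comparisons
     * below keep the earlier lane on ties. */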
    _mm_store_ps((float*)&(holderf.f), xmm3);
    _mm_store_si128(&(holderi.int_vec), xmm9);

    target[0] = holderi.i[0];
    sq_dist = holderf.f[0];
    target[0] = (holderf.f[1] < sq_dist) ? holderi.i[1] : target[0];
    sq_dist = (holderf.f[1] < sq_dist) ? holderf.f[1] : sq_dist;
    target[0] = (holderf.f[2] < sq_dist) ? holderi.i[2] : target[0];
    sq_dist = (holderf.f[2] < sq_dist) ? holderf.f[2] : sq_dist;
    target[0] = (holderf.f[3] < sq_dist) ? holderi.i[3] : target[0];
    sq_dist = (holderf.f[3] < sq_dist) ? holderf.f[3] : sq_dist;
}
#endif /*LV_HAVE_SSE3*/

#ifdef LV_HAVE_NEON
#include <arm_neon.h>

static inline void volk_32fc_index_min_16u_neon(uint16_t* target,
                                                const lv_32fc_t* source,
                                                uint32_t num_points)
{
    num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
    const lv_32fc_t* inputPtr = source;
    const uint32_t quarter_points = num_points / 4;
    uint32x4_t vec_indices = { 0, 1, 2, 3 };
    const uint32x4_t vec_incr = vdupq_n_u32(4);

    float32x4_t vec_min = vdupq_n_f32(FLT_MAX);
    uint32x4_t vec_min_idx = vdupq_n_u32(0);
    for (uint32_t i = 0; i < quarter_points; i++) {
        /* load 4 complex samples, deinterleaved into real/imag vectors */
        float32x4x2_t cplx = vld2q_f32((const float*)inputPtr);
        inputPtr += 4;

        /* mag2 = re*re + im*im */
        float32x4_t mag2 =
            vmlaq_f32(vmulq_f32(cplx.val[0], cplx.val[0]), cplx.val[1], cplx.val[1]);

        /* record indices where a new per-lane minimum appears */
        uint32x4_t lt_mask = vcltq_f32(mag2, vec_min);
        vec_min_idx = vbslq_u32(lt_mask, vec_indices, vec_min_idx);
        vec_min = vminq_f32(mag2, vec_min);

        vec_indices = vaddq_u32(vec_indices, vec_incr);
    }
    /* reduce the 4 lanes, preferring the lowest index on ties */
    float min_buf[4];
    uint32_t idx_buf[4];
    vst1q_f32(min_buf, vec_min);
    vst1q_u32(idx_buf, vec_min_idx);

    float min_val = min_buf[0];
    uint32_t result_idx = idx_buf[0];
    for (int i = 1; i < 4; i++) {
        if (min_buf[i] < min_val) {
            min_val = min_buf[i];
            result_idx = idx_buf[i];
        } else if (min_buf[i] == min_val && idx_buf[i] < result_idx) {
            result_idx = idx_buf[i];
        }
    }
    /* scalar tail for num_points not divisible by 4 */
    for (uint32_t i = quarter_points * 4; i < num_points; i++) {
        const float re = lv_creal(*inputPtr);
        const float im = lv_cimag(*inputPtr);
        const float mag2 = re * re + im * im;
        if (mag2 < min_val) {
            min_val = mag2;
            result_idx = i;
        }
        inputPtr++;
    }

    *target = (uint16_t)result_idx;
}
#endif /*LV_HAVE_NEON*/
#ifdef LV_HAVE_NEONV8
#include <arm_neon.h>

static inline void volk_32fc_index_min_16u_neonv8(uint16_t* target,
                                                  const lv_32fc_t* source,
                                                  uint32_t num_points)
{
    num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
    const lv_32fc_t* inputPtr = source;
    const uint32_t quarter_points = num_points / 4;
    uint32x4_t vec_indices = { 0, 1, 2, 3 };
    const uint32x4_t vec_incr = vdupq_n_u32(4);

    float32x4_t vec_min = vdupq_n_f32(FLT_MAX);
    uint32x4_t vec_min_idx = vdupq_n_u32(0);
    for (uint32_t i = 0; i < quarter_points; i++) {
        float32x4x2_t cplx = vld2q_f32((const float*)inputPtr);
        inputPtr += 4;

        /* fused multiply-add: mag2 = re*re + im*im */
        float32x4_t mag2 =
            vfmaq_f32(vmulq_f32(cplx.val[0], cplx.val[0]), cplx.val[1], cplx.val[1]);

        uint32x4_t lt_mask = vcltq_f32(mag2, vec_min);
        vec_min_idx = vbslq_u32(lt_mask, vec_indices, vec_min_idx);
        vec_min = vminq_f32(mag2, vec_min);

        vec_indices = vaddq_u32(vec_indices, vec_incr);
    }
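    /* Horizontal reduction: every lane that does not hold the global
     * minimum is masked to UINT32_MAX, so vminvq_u32 returns the smallest
     * index among the lanes that tie for the minimum. */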
    float min_val = vminvq_f32(vec_min);
    uint32x4_t min_mask = vceqq_f32(vec_min, vdupq_n_f32(min_val));
    uint32x4_t idx_masked = vbslq_u32(min_mask, vec_min_idx, vdupq_n_u32(UINT32_MAX));
    uint32_t result_idx = vminvq_u32(idx_masked);
    /* scalar tail for num_points not divisible by 4 */
    for (uint32_t i = quarter_points * 4; i < num_points; i++) {
        const float re = lv_creal(*inputPtr);
        const float im = lv_cimag(*inputPtr);
        const float mag2 = re * re + im * im;
        if (mag2 < min_val) {
            min_val = mag2;
            result_idx = i;
        }
        inputPtr++;
    }

    *target = (uint16_t)result_idx;
}
#endif /*LV_HAVE_NEONV8*/
#ifdef LV_HAVE_GENERIC
static inline void volk_32fc_index_min_16u_generic(uint16_t* target,
                                                   const lv_32fc_t* source,
                                                   uint32_t num_points)
{
    num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;

    float min = FLT_MAX;
    uint16_t index = 0;
    for (uint32_t i = 0; i < num_points; ++i) {
        const float sq_dist = lv_creal(source[i]) * lv_creal(source[i]) +
                              lv_cimag(source[i]) * lv_cimag(source[i]);
        if (sq_dist < min) {
            min = sq_dist;
            index = i;
        }
    }
    *target = index;
}
#endif /*LV_HAVE_GENERIC*/
#ifdef LV_HAVE_AVX512F
#include <immintrin.h>
static inline void volk_32fc_index_min_16u_a_avx512f(uint16_t* target,
                                                     const lv_32fc_t* source,
                                                     uint32_t num_points)
{
    num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
    const lv_32fc_t* src0Ptr = source;
    const uint32_t sixteenthPoints = num_points / 16;
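    /* _mm512_shuffle_ps works within 128-bit lanes, so the 16 magnitudes
     * computed in the loop come out in a permuted order; currentIndexes is
     * seeded with exactly that permutation (0,1,8,9,...) to keep the index
     * bookkeeping correct. */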
    __m512 currentIndexes =
        _mm512_setr_ps(0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15);
    const __m512 indexIncrement = _mm512_set1_ps(16);
    __m512 minValues = _mm512_set1_ps(FLT_MAX);
    __m512 minIndices = _mm512_setzero_ps();
    for (uint32_t number = 0; number < sixteenthPoints; number++) {
        __m512 in0 = _mm512_load_ps((const float*)src0Ptr);
        __m512 in1 = _mm512_load_ps((const float*)(src0Ptr + 8));
        src0Ptr += 16;

        in0 = _mm512_mul_ps(in0, in0);
        in1 = _mm512_mul_ps(in1, in1);

        /* swap re/im squares within each pair and add: every element now
         * holds |z|^2 for its complex sample */
        __m512 sw0 = _mm512_shuffle_ps(in0, in0, 0xB1);
        __m512 sw1 = _mm512_shuffle_ps(in1, in1, 0xB1);
        __m512 sum0 = _mm512_add_ps(in0, sw0);
        __m512 sum1 = _mm512_add_ps(in1, sw1);

        /* gather the even elements of both sums: 16 magnitudes, in the
         * permuted order matched by currentIndexes */
        __m512 mag_sq = _mm512_shuffle_ps(sum0, sum1, 0x88);

        __mmask16 cmpMask = _mm512_cmp_ps_mask(mag_sq, minValues, _CMP_LT_OS);
        minIndices = _mm512_mask_blend_ps(cmpMask, minIndices, currentIndexes);
        minValues = _mm512_min_ps(mag_sq, minValues);

        currentIndexes = _mm512_add_ps(currentIndexes, indexIncrement);
    }
    __VOLK_ATTR_ALIGNED(64) float minValuesBuffer[16];
    __VOLK_ATTR_ALIGNED(64) float minIndexesBuffer[16];
    _mm512_store_ps(minValuesBuffer, minValues);
    _mm512_store_ps(minIndexesBuffer, minIndices);
    float min = FLT_MAX;
    uint32_t index = 0;
    for (uint32_t i = 0; i < 16; i++) {
        if (minValuesBuffer[i] < min) {
            min = minValuesBuffer[i];
            index = (uint32_t)minIndexesBuffer[i];
        } else if (minValuesBuffer[i] == min) {
            if ((uint32_t)minIndexesBuffer[i] < index)
                index = (uint32_t)minIndexesBuffer[i];
        }
    }
    for (uint32_t number = sixteenthPoints * 16; number < num_points; number++) {
        const float re = lv_creal(*src0Ptr);
        const float im = lv_cimag(*src0Ptr);
        const float sq_dist = re * re + im * im;
        if (sq_dist < min) {
            min = sq_dist;
            index = number;
        }
        src0Ptr++;
    }

    *target = (uint16_t)index;
}
#endif /*LV_HAVE_AVX512F*/

#endif /*INCLUDED_volk_32fc_index_min_16u_a_H*/
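/* The kernels below mirror the aligned implementations above but use
 * unaligned loads (_mm256_loadu_ps / _mm512_loadu_ps), so the source
 * buffer does not need 32- or 64-byte alignment. */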
#ifndef INCLUDED_volk_32fc_index_min_16u_u_H
#define INCLUDED_volk_32fc_index_min_16u_u_H
#ifdef LV_HAVE_AVX2
#include <immintrin.h>
#include <volk/volk_avx2_intrinsics.h>
static inline void volk_32fc_index_min_16u_u_avx2_variant_0(uint16_t* target,
                                                            const lv_32fc_t* source,
                                                            uint32_t num_points)
{
    num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
    const __m256i indices_increment = _mm256_set1_epi32(8);
    __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
    __m256 min_values = _mm256_set1_ps(FLT_MAX);
    __m256i min_indices = _mm256_setzero_si256();
    for (unsigned i = 0; i < num_points / 8u; ++i) {
        __m256 in0 = _mm256_loadu_ps((float*)source);
        __m256 in1 = _mm256_loadu_ps((float*)(source + 4));
        vector_32fc_index_min_variant0(
            in0, in1, &min_values, &min_indices, &current_indices, indices_increment);
        source += 8;
    }
    /* determine the minimum value and index from the vectorized loop */
    __VOLK_ATTR_ALIGNED(32) float min_values_buffer[8];
    __VOLK_ATTR_ALIGNED(32) int32_t min_indices_buffer[8];
    _mm256_store_ps(min_values_buffer, min_values);
    _mm256_store_si256((__m256i*)min_indices_buffer, min_indices);
    float min = FLT_MAX;
    uint32_t index = 0;
    for (unsigned i = 0; i < 8; i++) {
        if (min_values_buffer[i] < min) {
            min = min_values_buffer[i];
            index = min_indices_buffer[i];
        }
    }
    /* handle the tail not processed by the vectorized loop */
    for (unsigned i = num_points & (~7u); i < num_points; ++i) {
        const float abs_squared = lv_creal(*source) * lv_creal(*source) +
                                  lv_cimag(*source) * lv_cimag(*source);
        if (abs_squared < min) {
            min = abs_squared;
            index = i;
        }
        ++source;
    }

    *target = index;
}
#endif /*LV_HAVE_AVX2*/
#ifdef LV_HAVE_AVX2
#include <immintrin.h>
#include <volk/volk_avx2_intrinsics.h>
static inline void volk_32fc_index_min_16u_u_avx2_variant_1(uint16_t* target,
                                                            const lv_32fc_t* source,
                                                            uint32_t num_points)
{
    num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
    const __m256i indices_increment = _mm256_set1_epi32(8);
    __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
    __m256 min_values = _mm256_set1_ps(FLT_MAX);
    __m256i min_indices = _mm256_setzero_si256();
    for (unsigned i = 0; i < num_points / 8u; ++i) {
        __m256 in0 = _mm256_loadu_ps((float*)source);
        __m256 in1 = _mm256_loadu_ps((float*)(source + 4));
        vector_32fc_index_min_variant1(
            in0, in1, &min_values, &min_indices, &current_indices, indices_increment);
        source += 8;
    }
    /* determine the minimum value and index from the vectorized loop */
    __VOLK_ATTR_ALIGNED(32) float min_values_buffer[8];
    __VOLK_ATTR_ALIGNED(32) int32_t min_indices_buffer[8];
    _mm256_store_ps(min_values_buffer, min_values);
    _mm256_store_si256((__m256i*)min_indices_buffer, min_indices);
    float min = FLT_MAX;
    uint32_t index = 0;
    for (unsigned i = 0; i < 8; i++) {
        if (min_values_buffer[i] < min) {
            min = min_values_buffer[i];
            index = min_indices_buffer[i];
        }
    }
    /* handle the tail not processed by the vectorized loop */
    for (unsigned i = num_points & (~7u); i < num_points; ++i) {
        const float abs_squared = lv_creal(*source) * lv_creal(*source) +
                                  lv_cimag(*source) * lv_cimag(*source);
        if (abs_squared < min) {
            min = abs_squared;
            index = i;
        }
        ++source;
    }

    *target = index;
}
#endif /*LV_HAVE_AVX2*/
#ifdef LV_HAVE_AVX512F
#include <immintrin.h>
static inline void volk_32fc_index_min_16u_u_avx512f(uint16_t* target,
                                                     const lv_32fc_t* source,
                                                     uint32_t num_points)
{
    num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
    const lv_32fc_t* src0Ptr = source;
    const uint32_t sixteenthPoints = num_points / 16;
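    /* index order matches the 128-bit-lane gather done by
     * _mm512_shuffle_ps below; see the note in the aligned AVX512F kernel */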
    __m512 currentIndexes =
        _mm512_setr_ps(0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15);
    const __m512 indexIncrement = _mm512_set1_ps(16);
    __m512 minValues = _mm512_set1_ps(FLT_MAX);
    __m512 minIndices = _mm512_setzero_ps();
    for (uint32_t number = 0; number < sixteenthPoints; number++) {
        __m512 in0 = _mm512_loadu_ps((const float*)src0Ptr);
        __m512 in1 = _mm512_loadu_ps((const float*)(src0Ptr + 8));
        src0Ptr += 16;

        in0 = _mm512_mul_ps(in0, in0);
        in1 = _mm512_mul_ps(in1, in1);

        /* swap re/im squares within each pair and add, then gather the even
         * elements: 16 magnitudes in the permuted order */
        __m512 sw0 = _mm512_shuffle_ps(in0, in0, 0xB1);
        __m512 sw1 = _mm512_shuffle_ps(in1, in1, 0xB1);
        __m512 sum0 = _mm512_add_ps(in0, sw0);
        __m512 sum1 = _mm512_add_ps(in1, sw1);

        __m512 mag_sq = _mm512_shuffle_ps(sum0, sum1, 0x88);

        __mmask16 cmpMask = _mm512_cmp_ps_mask(mag_sq, minValues, _CMP_LT_OS);
        minIndices = _mm512_mask_blend_ps(cmpMask, minIndices, currentIndexes);
        minValues = _mm512_min_ps(mag_sq, minValues);

        currentIndexes = _mm512_add_ps(currentIndexes, indexIncrement);
    }
    __VOLK_ATTR_ALIGNED(64) float minValuesBuffer[16];
    __VOLK_ATTR_ALIGNED(64) float minIndexesBuffer[16];
    _mm512_store_ps(minValuesBuffer, minValues);
    _mm512_store_ps(minIndexesBuffer, minIndices);
    float min = FLT_MAX;
    uint32_t index = 0;
    for (uint32_t i = 0; i < 16; i++) {
        if (minValuesBuffer[i] < min) {
            min = minValuesBuffer[i];
            index = (uint32_t)minIndexesBuffer[i];
        } else if (minValuesBuffer[i] == min) {
            if ((uint32_t)minIndexesBuffer[i] < index)
                index = (uint32_t)minIndexesBuffer[i];
        }
    }
    for (uint32_t number = sixteenthPoints * 16; number < num_points; number++) {
        const float re = lv_creal(*src0Ptr);
        const float im = lv_cimag(*src0Ptr);
        const float sq_dist = re * re + im * im;
        if (sq_dist < min) {
            min = sq_dist;
            index = number;
        }
        src0Ptr++;
    }

    *target = (uint16_t)index;
}
#endif /*LV_HAVE_AVX512F*/
#ifdef LV_HAVE_RVV
#include <riscv_vector.h>
#include <volk/volk_rvv_intrinsics.h>
static inline void volk_32fc_index_min_16u_rvv(uint16_t* target,
                                               const lv_32fc_t* source,
                                               uint32_t num_points)
{
    vfloat32m4_t vmin = __riscv_vfmv_v_f_f32m4(FLT_MAX, __riscv_vsetvlmax_e32m4());
    vuint16m2_t vmini = __riscv_vmv_v_x_u16m2(0, __riscv_vsetvlmax_e16m2());
    vuint16m2_t vidx = __riscv_vid_v_u16m2(__riscv_vsetvlmax_e16m2());
    size_t n = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
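    /* Each lv_32fc_t is loaded as one 64-bit word; narrowing shifts
     * (vnsrl) by 0 and 32 bits then split it into real and imaginary parts
     * without a segmented load. The _tu (tail-undisturbed) forms keep
     * lanes beyond vl intact across the strip-mined loop. */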
    for (size_t vl; n > 0; n -= vl, source += vl) {
        vl = __riscv_vsetvl_e32m4(n);
        vuint64m8_t vc = __riscv_vle64_v_u64m8((const uint64_t*)source, vl);
        vfloat32m4_t vr = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vc, 0, vl));
        vfloat32m4_t vi = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vc, 32, vl));
        vfloat32m4_t v = __riscv_vfmacc(__riscv_vfmul(vr, vr, vl), vi, vi, vl);
        vbool8_t m = __riscv_vmfgt(vmin, v, vl);
        vmin = __riscv_vfmin_tu(vmin, vmin, v, vl);
        vmini = __riscv_vmerge_tu(vmini, vmini, vidx, m, vl);
        vidx = __riscv_vadd(vidx, vl, __riscv_vsetvlmax_e16m2()); /* vidx is u16m2 */
    }
    size_t vl = __riscv_vsetvlmax_e32m4();
    float min = __riscv_vfmv_f(__riscv_vfredmin(RISCV_SHRINK4(vfmin, f, 32, vmin),
                                                __riscv_vfmv_v_f_f32m1(FLT_MAX, 1),
                                                __riscv_vsetvlmax_e32m1()));
    __attribute__((aligned(32))) float values[128];
    __attribute__((aligned(32))) uint16_t indices[128];
    __riscv_vse32(values, vmin, vl);
    __riscv_vse16(indices, vmini, vl);
    uint16_t min_idx = UINT16_MAX;
    for (size_t i = 0; i < vl; i++) {
        if (values[i] == min && indices[i] < min_idx) {
            min_idx = indices[i];
        }
    }
    *target = min_idx;
}
#endif /*LV_HAVE_RVV*/
#ifdef LV_HAVE_RVVSEG
#include <riscv_vector.h>
#include <volk/volk_rvv_intrinsics.h>
static inline void volk_32fc_index_min_16u_rvvseg(uint16_t* target,
                                                  const lv_32fc_t* source,
                                                  uint32_t num_points)
{
    vfloat32m4_t vmin = __riscv_vfmv_v_f_f32m4(FLT_MAX, __riscv_vsetvlmax_e32m4());
    vuint16m2_t vmini = __riscv_vmv_v_x_u16m2(0, __riscv_vsetvlmax_e16m2());
    vuint16m2_t vidx = __riscv_vid_v_u16m2(__riscv_vsetvlmax_e16m2());
    size_t n = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
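    /* The segmented load (vlseg2e32) deinterleaves real and imaginary
     * parts directly, replacing the 64-bit-load + vnsrl trick used in the
     * plain RVV kernel above. */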
    for (size_t vl; n > 0; n -= vl, source += vl) {
        vl = __riscv_vsetvl_e32m4(n);
        vfloat32m4x2_t vc = __riscv_vlseg2e32_v_f32m4x2((const float*)source, vl);
        vfloat32m4_t vr = __riscv_vget_f32m4(vc, 0), vi = __riscv_vget_f32m4(vc, 1);
        vfloat32m4_t v = __riscv_vfmacc(__riscv_vfmul(vr, vr, vl), vi, vi, vl);
        vbool8_t m = __riscv_vmfgt(vmin, v, vl);
        vmin = __riscv_vfmin_tu(vmin, vmin, v, vl);
        vmini = __riscv_vmerge_tu(vmini, vmini, vidx, m, vl);
        vidx = __riscv_vadd(vidx, vl, __riscv_vsetvlmax_e16m2()); /* vidx is u16m2 */
    }
    size_t vl = __riscv_vsetvlmax_e32m4();
    float min = __riscv_vfmv_f(__riscv_vfredmin(RISCV_SHRINK4(vfmin, f, 32, vmin),
                                                __riscv_vfmv_v_f_f32m1(FLT_MAX, 1),
                                                __riscv_vsetvlmax_e32m1()));
    __attribute__((aligned(32))) float values[128];
    __attribute__((aligned(32))) uint16_t indices[128];
    __riscv_vse32(values, vmin, vl);
    __riscv_vse16(indices, vmini, vl);
    uint16_t min_idx = UINT16_MAX;
    for (size_t i = 0; i < vl; i++) {
        if (values[i] == min && indices[i] < min_idx) {
            min_idx = indices[i];
        }
    }
    *target = min_idx;
}
#endif /*LV_HAVE_RVVSEG*/

#endif /*INCLUDED_volk_32fc_index_min_16u_u_H*/