#ifndef INCLUDED_volk_32fc_index_max_16u_a_H
#define INCLUDED_volk_32fc_index_max_16u_a_H

#include <inttypes.h>
#include <limits.h>
#include <volk/volk_common.h>
#include <volk/volk_complex.h>
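/*
 * volk_32fc_index_max_16u: stores in *target the index of the complex input
 * sample with the largest squared magnitude (re*re + im*im). The result type
 * is uint16_t, so every kernel first clamps num_points to USHRT_MAX to keep
 * the winning index representable.
 */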
#ifdef LV_HAVE_AVX2
#include <immintrin.h>
#include <volk/volk_avx2_intrinsics.h>

static inline void volk_32fc_index_max_16u_a_avx2_variant_0(uint16_t* target,
                                                            const lv_32fc_t* src0,
                                                            uint32_t num_points)
{
    num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;

    const __m256i indices_increment = _mm256_set1_epi32(8);
    /*
     * At the start of each loop iteration current_indices holds the indices of
     * the complex numbers loaded from memory. The reason for the odd ordering
     * is explained in the implementation of vector_32fc_index_max_variant0().
     */
    __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);

    __m256 max_values = _mm256_setzero_ps();
    __m256i max_indices = _mm256_setzero_si256();

    for (unsigned i = 0; i < num_points / 8u; ++i) {
        __m256 in0 = _mm256_load_ps((float*)src0);
        __m256 in1 = _mm256_load_ps((float*)(src0 + 4));
        vector_32fc_index_max_variant0(
            in0, in1, &max_values, &max_indices, &current_indices, indices_increment);
        src0 += 8;
    }

    // determine the maximum value and index from the vectorized loop
    __VOLK_ATTR_ALIGNED(32) float max_values_buffer[8];
    __VOLK_ATTR_ALIGNED(32) int32_t max_indices_buffer[8];
    _mm256_store_ps(max_values_buffer, max_values);
    _mm256_store_si256((__m256i*)max_indices_buffer, max_indices);

    float max = 0.f;
    uint32_t index = 0;
    for (unsigned i = 0; i < 8; i++) {
        if (max_values_buffer[i] > max) {
            max = max_values_buffer[i];
            index = max_indices_buffer[i];
        }
    }

    // handle the tail not processed by the vectorized loop
    for (unsigned i = num_points & (~7u); i < num_points; ++i) {
        const float abs_squared =
            lv_creal(*src0) * lv_creal(*src0) + lv_cimag(*src0) * lv_cimag(*src0);
        if (abs_squared > max) {
            max = abs_squared;
            index = i;
        }
        ++src0;
    }

    *target = index;
}

#endif /*LV_HAVE_AVX2*/
#ifdef LV_HAVE_AVX2
#include <immintrin.h>
#include <volk/volk_avx2_intrinsics.h>
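/*
 * Variant 1 below is identical to variant 0 apart from the AVX2 helper it
 * calls (vector_32fc_index_max_variant1() instead of
 * vector_32fc_index_max_variant0(), both from volk/volk_avx2_intrinsics.h);
 * the two helpers compute the same result with different instruction
 * selections.
 */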
static inline void volk_32fc_index_max_16u_a_avx2_variant_1(uint16_t* target,
                                                            const lv_32fc_t* src0,
                                                            uint32_t num_points)
{
    num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;

    const __m256i indices_increment = _mm256_set1_epi32(8);
    /*
     * At the start of each loop iteration current_indices holds the indices of
     * the complex numbers loaded from memory; see variant 0 for the reason
     * behind the odd ordering.
     */
    __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);

    __m256 max_values = _mm256_setzero_ps();
    __m256i max_indices = _mm256_setzero_si256();

    for (unsigned i = 0; i < num_points / 8u; ++i) {
        __m256 in0 = _mm256_load_ps((float*)src0);
        __m256 in1 = _mm256_load_ps((float*)(src0 + 4));
        vector_32fc_index_max_variant1(
            in0, in1, &max_values, &max_indices, &current_indices, indices_increment);
        src0 += 8;
    }

    // determine the maximum value and index from the vectorized loop
    __VOLK_ATTR_ALIGNED(32) float max_values_buffer[8];
    __VOLK_ATTR_ALIGNED(32) int32_t max_indices_buffer[8];
    _mm256_store_ps(max_values_buffer, max_values);
    _mm256_store_si256((__m256i*)max_indices_buffer, max_indices);

    float max = 0.f;
    uint32_t index = 0;
    for (unsigned i = 0; i < 8; i++) {
        if (max_values_buffer[i] > max) {
            max = max_values_buffer[i];
            index = max_indices_buffer[i];
        }
    }

    // handle the tail not processed by the vectorized loop
    for (unsigned i = num_points & (~7u); i < num_points; ++i) {
        const float abs_squared =
            lv_creal(*src0) * lv_creal(*src0) + lv_cimag(*src0) * lv_cimag(*src0);
        if (abs_squared > max) {
            max = abs_squared;
            index = i;
        }
        ++src0;
    }

    *target = index;
}

#endif /*LV_HAVE_AVX2*/
#ifdef LV_HAVE_SSE3
#include <pmmintrin.h>
#include <xmmintrin.h>
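/*
 * The SSE3 kernel tracks candidate indices entirely in SIMD registers: xmm8
 * holds the indices of the four magnitudes currently being examined, xmm9 the
 * indices of the running maxima. After each comparison the masks produced by
 * _mm_cmpeq_ps/_mm_cmplt_ps select, lane by lane, whether the new or the old
 * index is kept.
 */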
static inline void volk_32fc_index_max_16u_a_sse3(uint16_t* target,
                                                  const lv_32fc_t* src0,
                                                  uint32_t num_points)
{
    num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
    const uint32_t num_bytes = num_points * 8;

    union bit128 holderf;
    union bit128 holderi;
    float sq_dist = 0.0;

    union bit128 xmm5, xmm4;
    __m128 xmm1, xmm2, xmm3;
    __m128i xmm8, xmm11, xmm12, xmm9, xmm10;

    xmm5.int_vec = _mm_setzero_si128();
    xmm4.int_vec = _mm_setzero_si128();
    holderf.int_vec = _mm_setzero_si128();
    holderi.int_vec = _mm_setzero_si128();

    int bound = num_bytes >> 5; // number of 4-sample blocks
    int i = 0;

    xmm8 = _mm_setr_epi32(0, 1, 2, 3); // indices of the current block
    xmm9 = _mm_setzero_si128();        // indices of the running maxima
    xmm10 = _mm_setr_epi32(4, 4, 4, 4);
    xmm3 = _mm_setzero_ps();           // running maxima

    for (; i < bound; ++i) {
        xmm1 = _mm_load_ps((float*)src0);
        xmm2 = _mm_load_ps((float*)&src0[2]);

        src0 += 4;

        xmm1 = _mm_mul_ps(xmm1, xmm1);
        xmm2 = _mm_mul_ps(xmm2, xmm2);

        // horizontal add pairs re^2 + im^2 into four magnitudes
        xmm1 = _mm_hadd_ps(xmm1, xmm2);

        xmm3 = _mm_max_ps(xmm1, xmm3);

        xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
        xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);

        // keep the new index where the new value won, the old index elsewhere
        xmm11 = _mm_and_si128(xmm8, xmm5.int_vec);
        xmm12 = _mm_and_si128(xmm9, xmm4.int_vec);

        xmm9 = _mm_add_epi32(xmm11, xmm12);

        xmm8 = _mm_add_epi32(xmm8, xmm10);
    }

    // two remaining complex samples (16 bytes)
    if (num_bytes >> 4 & 1) {
        xmm2 = _mm_load_ps((float*)src0);

        xmm1 = _mm_movelh_ps(bit128_p(&xmm8)->float_vec, bit128_p(&xmm8)->float_vec);
        xmm8 = bit128_p(&xmm1)->int_vec;

        xmm2 = _mm_mul_ps(xmm2, xmm2);

        src0 += 2;

        xmm1 = _mm_hadd_ps(xmm2, xmm2);

        xmm3 = _mm_max_ps(xmm1, xmm3);

        xmm10 = _mm_setr_epi32(2, 2, 2, 2);

        xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
        xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);

        xmm11 = _mm_and_si128(xmm8, xmm5.int_vec);
        xmm12 = _mm_and_si128(xmm9, xmm4.int_vec);

        xmm9 = _mm_add_epi32(xmm11, xmm12);

        xmm8 = _mm_add_epi32(xmm8, xmm10);
    }

    // one remaining complex sample (8 bytes)
    if (num_bytes >> 3 & 1) {
        sq_dist =
            lv_creal(src0[0]) * lv_creal(src0[0]) + lv_cimag(src0[0]) * lv_cimag(src0[0]);

        xmm2 = _mm_load1_ps(&sq_dist);

        xmm1 = xmm3;

        xmm3 = _mm_max_ss(xmm3, xmm2);

        xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
        xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);

        xmm8 = _mm_shuffle_epi32(xmm8, 0x00);

        xmm11 = _mm_and_si128(xmm8, xmm4.int_vec);
        xmm12 = _mm_and_si128(xmm9, xmm5.int_vec);

        xmm9 = _mm_add_epi32(xmm11, xmm12);
    }

    _mm_store_ps((float*)&(holderf.f), xmm3);
    _mm_store_si128(&(holderi.int_vec), xmm9);

    // scalar reduction over the four lanes
    target[0] = holderi.i[0];
    sq_dist = holderf.f[0];
    target[0] = (holderf.f[1] > sq_dist) ? holderi.i[1] : target[0];
    sq_dist = (holderf.f[1] > sq_dist) ? holderf.f[1] : sq_dist;
    target[0] = (holderf.f[2] > sq_dist) ? holderi.i[2] : target[0];
    sq_dist = (holderf.f[2] > sq_dist) ? holderf.f[2] : sq_dist;
    target[0] = (holderf.f[3] > sq_dist) ? holderi.i[3] : target[0];
    sq_dist = (holderf.f[3] > sq_dist) ? holderf.f[3] : sq_dist;
}

#endif /*LV_HAVE_SSE3*/
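/*
 * The NEON kernel below de-interleaves four complex samples per iteration
 * with vld2q_f32 and keeps per-lane running maxima plus their indices; ties
 * in the final lane scan are broken toward the smaller index.
 */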
#ifdef LV_HAVE_NEON
#include <arm_neon.h>

static inline void volk_32fc_index_max_16u_neon(uint16_t* target,
                                                const lv_32fc_t* src0,
                                                uint32_t num_points)
{
    num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;

    const lv_32fc_t* inputPtr = src0;
    const uint32_t quarter_points = num_points / 4;

    // indices of the four samples processed per iteration
    uint32x4_t vec_indices = { 0, 1, 2, 3 };
    const uint32x4_t vec_incr = vdupq_n_u32(4);

    float32x4_t vec_max = vdupq_n_f32(0.0f);
    uint32x4_t vec_max_idx = vdupq_n_u32(0);

    for (uint32_t i = 0; i < quarter_points; i++) {
        // de-interleaving load: val[0] = real parts, val[1] = imaginary parts
        float32x4x2_t cplx = vld2q_f32((const float*)inputPtr);
        inputPtr += 4;

        // squared magnitude: re*re + im*im
        float32x4_t mag2 =
            vmlaq_f32(vmulq_f32(cplx.val[0], cplx.val[0]), cplx.val[1], cplx.val[1]);

        // update the index only where a strictly greater value appears, so
        // the earliest occurrence of a maximum is kept
        uint32x4_t gt_mask = vcgtq_f32(mag2, vec_max);
        vec_max_idx = vbslq_u32(gt_mask, vec_indices, vec_max_idx);
        vec_max = vmaxq_f32(mag2, vec_max);

        vec_indices = vaddq_u32(vec_indices, vec_incr);
    }

    // scalar reduction over the four lanes
    float max_buf[4];
    uint32_t idx_buf[4];
    vst1q_f32(max_buf, vec_max);
    vst1q_u32(idx_buf, vec_max_idx);

    float max_val = max_buf[0];
    uint32_t result_idx = idx_buf[0];
    for (int i = 1; i < 4; i++) {
        if (max_buf[i] > max_val) {
            max_val = max_buf[i];
            result_idx = idx_buf[i];
        } else if (max_buf[i] == max_val && idx_buf[i] < result_idx) {
            result_idx = idx_buf[i];
        }
    }

    // handle the tail not processed by the vectorized loop
    for (uint32_t i = quarter_points * 4; i < num_points; i++) {
        const float re = lv_creal(*inputPtr);
        const float im = lv_cimag(*inputPtr);
        float mag2 = re * re + im * im;
        if (mag2 > max_val) {
            max_val = mag2;
            result_idx = i;
        }
        inputPtr++;
    }

    *target = (uint16_t)result_idx;
}

#endif /*LV_HAVE_NEON*/
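/*
 * ARMv8 variant: uses the fused multiply-add vfmaq_f32 for the magnitudes
 * and the horizontal reductions vmaxvq_f32/vminvq_u32, which replace the
 * scalar lane scan needed on ARMv7.
 */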
#ifdef LV_HAVE_NEONV8
#include <arm_neon.h>

static inline void volk_32fc_index_max_16u_neonv8(uint16_t* target,
                                                  const lv_32fc_t* src0,
                                                  uint32_t num_points)
{
    num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;

    const lv_32fc_t* inputPtr = src0;
    const uint32_t quarter_points = num_points / 4;

    uint32x4_t vec_indices = { 0, 1, 2, 3 };
    const uint32x4_t vec_incr = vdupq_n_u32(4);

    float32x4_t vec_max = vdupq_n_f32(0.0f);
    uint32x4_t vec_max_idx = vdupq_n_u32(0);

    for (uint32_t i = 0; i < quarter_points; i++) {
        float32x4x2_t cplx = vld2q_f32((const float*)inputPtr);
        inputPtr += 4;

        // fused multiply-add: re*re + im*im
        float32x4_t mag2 =
            vfmaq_f32(vmulq_f32(cplx.val[0], cplx.val[0]), cplx.val[1], cplx.val[1]);

        uint32x4_t gt_mask = vcgtq_f32(mag2, vec_max);
        vec_max_idx = vbslq_u32(gt_mask, vec_indices, vec_max_idx);
        vec_max = vmaxq_f32(mag2, vec_max);

        vec_indices = vaddq_u32(vec_indices, vec_incr);
    }

    // horizontal reductions: find the max value, then the smallest index
    // among the lanes that hold it
    float max_val = vmaxvq_f32(vec_max);
    uint32x4_t max_mask = vceqq_f32(vec_max, vdupq_n_f32(max_val));
    uint32x4_t idx_masked = vbslq_u32(max_mask, vec_max_idx, vdupq_n_u32(UINT32_MAX));
    uint32_t result_idx = vminvq_u32(idx_masked);

    // handle the tail not processed by the vectorized loop
    for (uint32_t i = quarter_points * 4; i < num_points; i++) {
        const float re = lv_creal(*inputPtr);
        const float im = lv_cimag(*inputPtr);
        float mag2 = re * re + im * im;
        if (mag2 > max_val) {
            max_val = mag2;
            result_idx = i;
        }
        inputPtr++;
    }

    *target = (uint16_t)result_idx;
}

#endif /*LV_HAVE_NEONV8*/
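/*
 * Portable scalar implementation; it defines the reference behavior the
 * SIMD kernels in this header are expected to match.
 */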
#ifdef LV_HAVE_GENERIC

static inline void volk_32fc_index_max_16u_generic(uint16_t* target,
                                                   const lv_32fc_t* src0,
                                                   uint32_t num_points)
{
    num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;

    const uint32_t num_bytes = num_points * 8;

    float sq_dist = 0.0;
    float max = 0.0;
    uint16_t index = 0;
    uint32_t i = 0;

    for (; i < (num_bytes >> 3); ++i) {
        sq_dist =
            lv_creal(src0[i]) * lv_creal(src0[i]) + lv_cimag(src0[i]) * lv_cimag(src0[i]);
        if (sq_dist > max) {
            index = i;
            max = sq_dist;
        }
    }
    target[0] = index;
}

#endif /*LV_HAVE_GENERIC*/
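/*
 * The AVX512F kernels carry candidate indices in float vectors so that the
 * same __mmask16 produced by the value comparison can update them with a
 * single _mm512_mask_blend_ps. float32 represents every index up to
 * USHRT_MAX exactly (65535 < 2^24), so no precision is lost.
 */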
#ifdef LV_HAVE_AVX512F
#include <immintrin.h>

static inline void volk_32fc_index_max_16u_a_avx512f(uint16_t* target,
                                                     const lv_32fc_t* src0,
                                                     uint32_t num_points)
{
    num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;

    const lv_32fc_t* src0Ptr = src0;
    const uint32_t sixteenthPoints = num_points / 16;

    // index order matches how _mm512_shuffle_ps interleaves its two inputs
    // (each 128-bit lane is shuffled independently)
    __m512 currentIndexes =
        _mm512_setr_ps(0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15);
    const __m512 indexIncrement = _mm512_set1_ps(16);

    __m512 maxValues = _mm512_setzero_ps();
    __m512 maxIndices = _mm512_setzero_ps();

    for (uint32_t number = 0; number < sixteenthPoints; number++) {
        __m512 in0 = _mm512_load_ps((const float*)src0Ptr);
        __m512 in1 = _mm512_load_ps((const float*)(src0Ptr + 8));
        src0Ptr += 16;

        in0 = _mm512_mul_ps(in0, in0);
        in1 = _mm512_mul_ps(in1, in1);

        // swap re/im within each pair and add: every lane now holds |z|^2
        __m512 sw0 = _mm512_shuffle_ps(in0, in0, 0xB1);
        __m512 sw1 = _mm512_shuffle_ps(in1, in1, 0xB1);
        __m512 sum0 = _mm512_add_ps(in0, sw0);
        __m512 sum1 = _mm512_add_ps(in1, sw1);

        // compact the duplicated magnitudes of both vectors into one
        __m512 mag_sq = _mm512_shuffle_ps(sum0, sum1, 0x88);

        __mmask16 cmpMask = _mm512_cmp_ps_mask(mag_sq, maxValues, _CMP_GT_OS);
        maxIndices = _mm512_mask_blend_ps(cmpMask, maxIndices, currentIndexes);
        maxValues = _mm512_max_ps(mag_sq, maxValues);

        currentIndexes = _mm512_add_ps(currentIndexes, indexIncrement);
    }

    // reduce the sixteen lanes, breaking ties toward the smaller index
    __VOLK_ATTR_ALIGNED(64) float maxValuesBuffer[16];
    __VOLK_ATTR_ALIGNED(64) float maxIndexesBuffer[16];
    _mm512_store_ps(maxValuesBuffer, maxValues);
    _mm512_store_ps(maxIndexesBuffer, maxIndices);

    float max = 0.f;
    uint32_t index = 0;
    for (uint32_t i = 0; i < 16; i++) {
        if (maxValuesBuffer[i] > max) {
            max = maxValuesBuffer[i];
            index = (uint32_t)maxIndexesBuffer[i];
        } else if (maxValuesBuffer[i] == max) {
            if ((uint32_t)maxIndexesBuffer[i] < index)
                index = (uint32_t)maxIndexesBuffer[i];
        }
    }

    // handle the tail not processed by the vectorized loop
    for (uint32_t number = sixteenthPoints * 16; number < num_points; number++) {
        const float re = lv_creal(*src0Ptr);
        const float im = lv_cimag(*src0Ptr);
        const float sq_dist = re * re + im * im;
        if (sq_dist > max) {
            max = sq_dist;
            index = number;
        }
        src0Ptr++;
    }

    *target = (uint16_t)index;
}

#endif /*LV_HAVE_AVX512F*/

#endif /*INCLUDED_volk_32fc_index_max_16u_a_H*/
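/*
 * The unaligned (_u_) kernels below mirror the aligned (_a_) kernels above,
 * swapping aligned loads for unaligned ones so src0 does not have to satisfy
 * any alignment requirement.
 */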
#ifndef INCLUDED_volk_32fc_index_max_16u_u_H
#define INCLUDED_volk_32fc_index_max_16u_u_H

#include <inttypes.h>
#include <limits.h>
#include <volk/volk_common.h>
#include <volk/volk_complex.h>
#ifdef LV_HAVE_AVX2
#include <immintrin.h>
#include <volk/volk_avx2_intrinsics.h>
static inline void volk_32fc_index_max_16u_u_avx2_variant_0(uint16_t* target,
                                                            const lv_32fc_t* src0,
                                                            uint32_t num_points)
{
    num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;

    const __m256i indices_increment = _mm256_set1_epi32(8);
    /*
     * At the start of each loop iteration current_indices holds the indices of
     * the complex numbers loaded from memory; see the aligned variant for the
     * reason behind the odd ordering.
     */
    __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);

    __m256 max_values = _mm256_setzero_ps();
    __m256i max_indices = _mm256_setzero_si256();

    for (unsigned i = 0; i < num_points / 8u; ++i) {
        __m256 in0 = _mm256_loadu_ps((float*)src0);
        __m256 in1 = _mm256_loadu_ps((float*)(src0 + 4));
        vector_32fc_index_max_variant0(
            in0, in1, &max_values, &max_indices, &current_indices, indices_increment);
        src0 += 8;
    }

    // determine the maximum value and index from the vectorized loop
    __VOLK_ATTR_ALIGNED(32) float max_values_buffer[8];
    __VOLK_ATTR_ALIGNED(32) int32_t max_indices_buffer[8];
    _mm256_store_ps(max_values_buffer, max_values);
    _mm256_store_si256((__m256i*)max_indices_buffer, max_indices);

    float max = 0.f;
    uint32_t index = 0;
    for (unsigned i = 0; i < 8; i++) {
        if (max_values_buffer[i] > max) {
            max = max_values_buffer[i];
            index = max_indices_buffer[i];
        }
    }

    // handle the tail not processed by the vectorized loop
    for (unsigned i = num_points & (~7u); i < num_points; ++i) {
        const float abs_squared =
            lv_creal(*src0) * lv_creal(*src0) + lv_cimag(*src0) * lv_cimag(*src0);
        if (abs_squared > max) {
            max = abs_squared;
            index = i;
        }
        ++src0;
    }

    *target = index;
}

#endif /*LV_HAVE_AVX2*/
#ifdef LV_HAVE_AVX2
#include <immintrin.h>
#include <volk/volk_avx2_intrinsics.h>
static inline void volk_32fc_index_max_16u_u_avx2_variant_1(uint16_t* target,
                                                            const lv_32fc_t* src0,
                                                            uint32_t num_points)
{
    num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;

    const __m256i indices_increment = _mm256_set1_epi32(8);
    /*
     * At the start of each loop iteration current_indices holds the indices of
     * the complex numbers loaded from memory; see the aligned variant for the
     * reason behind the odd ordering.
     */
    __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);

    __m256 max_values = _mm256_setzero_ps();
    __m256i max_indices = _mm256_setzero_si256();

    for (unsigned i = 0; i < num_points / 8u; ++i) {
        __m256 in0 = _mm256_loadu_ps((float*)src0);
        __m256 in1 = _mm256_loadu_ps((float*)(src0 + 4));
        vector_32fc_index_max_variant1(
            in0, in1, &max_values, &max_indices, &current_indices, indices_increment);
        src0 += 8;
    }

    // determine the maximum value and index from the vectorized loop
    __VOLK_ATTR_ALIGNED(32) float max_values_buffer[8];
    __VOLK_ATTR_ALIGNED(32) int32_t max_indices_buffer[8];
    _mm256_store_ps(max_values_buffer, max_values);
    _mm256_store_si256((__m256i*)max_indices_buffer, max_indices);

    float max = 0.f;
    uint32_t index = 0;
    for (unsigned i = 0; i < 8; i++) {
        if (max_values_buffer[i] > max) {
            max = max_values_buffer[i];
            index = max_indices_buffer[i];
        }
    }

    // handle the tail not processed by the vectorized loop
    for (unsigned i = num_points & (~7u); i < num_points; ++i) {
        const float abs_squared =
            lv_creal(*src0) * lv_creal(*src0) + lv_cimag(*src0) * lv_cimag(*src0);
        if (abs_squared > max) {
            max = abs_squared;
            index = i;
        }
        ++src0;
    }

    *target = index;
}

#endif /*LV_HAVE_AVX2*/
#ifdef LV_HAVE_AVX512F
#include <immintrin.h>
static inline void volk_32fc_index_max_16u_u_avx512f(uint16_t* target,
                                                     const lv_32fc_t* src0,
                                                     uint32_t num_points)
{
    num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;

    const lv_32fc_t* src0Ptr = src0;
    const uint32_t sixteenthPoints = num_points / 16;

    // index order matches how _mm512_shuffle_ps interleaves its two inputs
    __m512 currentIndexes =
        _mm512_setr_ps(0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15);
    const __m512 indexIncrement = _mm512_set1_ps(16);

    __m512 maxValues = _mm512_setzero_ps();
    __m512 maxIndices = _mm512_setzero_ps();

    for (uint32_t number = 0; number < sixteenthPoints; number++) {
        __m512 in0 = _mm512_loadu_ps((const float*)src0Ptr);
        __m512 in1 = _mm512_loadu_ps((const float*)(src0Ptr + 8));
        src0Ptr += 16;

        in0 = _mm512_mul_ps(in0, in0);
        in1 = _mm512_mul_ps(in1, in1);

        // swap re/im within each pair and add: every lane now holds |z|^2
        __m512 sw0 = _mm512_shuffle_ps(in0, in0, 0xB1);
        __m512 sw1 = _mm512_shuffle_ps(in1, in1, 0xB1);
        __m512 sum0 = _mm512_add_ps(in0, sw0);
        __m512 sum1 = _mm512_add_ps(in1, sw1);

        // compact the duplicated magnitudes of both vectors into one
        __m512 mag_sq = _mm512_shuffle_ps(sum0, sum1, 0x88);

        __mmask16 cmpMask = _mm512_cmp_ps_mask(mag_sq, maxValues, _CMP_GT_OS);
        maxIndices = _mm512_mask_blend_ps(cmpMask, maxIndices, currentIndexes);
        maxValues = _mm512_max_ps(mag_sq, maxValues);

        currentIndexes = _mm512_add_ps(currentIndexes, indexIncrement);
    }

    // reduce the sixteen lanes, breaking ties toward the smaller index
    __VOLK_ATTR_ALIGNED(64) float maxValuesBuffer[16];
    __VOLK_ATTR_ALIGNED(64) float maxIndexesBuffer[16];
    _mm512_store_ps(maxValuesBuffer, maxValues);
    _mm512_store_ps(maxIndexesBuffer, maxIndices);

    float max = 0.f;
    uint32_t index = 0;
    for (uint32_t i = 0; i < 16; i++) {
        if (maxValuesBuffer[i] > max) {
            max = maxValuesBuffer[i];
            index = (uint32_t)maxIndexesBuffer[i];
        } else if (maxValuesBuffer[i] == max) {
            if ((uint32_t)maxIndexesBuffer[i] < index)
                index = (uint32_t)maxIndexesBuffer[i];
        }
    }

    // handle the tail not processed by the vectorized loop
    for (uint32_t number = sixteenthPoints * 16; number < num_points; number++) {
        const float re = lv_creal(*src0Ptr);
        const float im = lv_cimag(*src0Ptr);
        const float sq_dist = re * re + im * im;
        if (sq_dist > max) {
            max = sq_dist;
            index = number;
        }
        src0Ptr++;
    }

    *target = (uint16_t)index;
}

#endif /*LV_HAVE_AVX512F*/
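/*
 * RISC-V Vector kernels: the loop is strip-mined with vsetvl, so each pass
 * handles vl samples regardless of the hardware vector length. Per-lane
 * running maxima are kept with tail-undisturbed (_tu) max/merge operations,
 * and the final reduction picks the smallest index among the lanes that hold
 * the global maximum.
 */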
#ifdef LV_HAVE_RVV
#include <riscv_vector.h>
#include <volk/volk_rvv_intrinsics.h>
static inline void
volk_32fc_index_max_16u_rvv(uint16_t* target, const lv_32fc_t* src0, uint32_t num_points)
{
    vfloat32m4_t vmax = __riscv_vfmv_v_f_f32m4(0, __riscv_vsetvlmax_e32m4());
    vuint16m2_t vmaxi = __riscv_vmv_v_x_u16m2(0, __riscv_vsetvlmax_e16m2());
    vuint16m2_t vidx = __riscv_vid_v_u16m2(__riscv_vsetvlmax_e16m2());
    size_t n = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
    for (size_t vl; n > 0; n -= vl, src0 += vl) {
        vl = __riscv_vsetvl_e32m4(n);
        // load complex samples as u64 and split re/im with narrowing shifts
        vuint64m8_t vc = __riscv_vle64_v_u64m8((const uint64_t*)src0, vl);
        vfloat32m4_t vr = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vc, 0, vl));
        vfloat32m4_t vi = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vc, 32, vl));
        vfloat32m4_t v = __riscv_vfmacc(__riscv_vfmul(vr, vr, vl), vi, vi, vl);
        vbool8_t m = __riscv_vmflt(vmax, v, vl);
        vmax = __riscv_vfmax_tu(vmax, vmax, v, vl);
        vmaxi = __riscv_vmerge_tu(vmaxi, vmaxi, vidx, m, vl);
        vidx = __riscv_vadd(vidx, vl, __riscv_vsetvlmax_e16m2());
    }
    size_t vl = __riscv_vsetvlmax_e32m4();
    float max = __riscv_vfmv_f(__riscv_vfredmax(RISCV_SHRINK4(vfmax, f, 32, vmax),
                                                __riscv_vfmv_v_f_f32m1(0, 1),
                                                __riscv_vsetvlmax_e32m1()));
    // spill the lanes to memory and pick the smallest index holding the max
    __attribute__((aligned(32))) float values[128];
    __attribute__((aligned(32))) uint16_t indices[128];
    __riscv_vse32(values, vmax, vl);
    __riscv_vse16(indices, vmaxi, vl);
    uint16_t min_idx = UINT16_MAX;
    for (size_t i = 0; i < vl; i++) {
        if (values[i] == max && indices[i] < min_idx) {
            min_idx = indices[i];
        }
    }
    *target = min_idx;
}

#endif /*LV_HAVE_RVV*/
#ifdef LV_HAVE_RVVSEG
#include <riscv_vector.h>
#include <volk/volk_rvv_intrinsics.h>
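/*
 * Segmented-load variant: vlseg2e32 de-interleaves the real and imaginary
 * parts directly, replacing the u64 loads and narrowing shifts used in the
 * kernel above.
 */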
static inline void volk_32fc_index_max_16u_rvvseg(uint16_t* target,
                                                  const lv_32fc_t* src0,
                                                  uint32_t num_points)
{
    vfloat32m4_t vmax = __riscv_vfmv_v_f_f32m4(0, __riscv_vsetvlmax_e32m4());
    vuint16m2_t vmaxi = __riscv_vmv_v_x_u16m2(0, __riscv_vsetvlmax_e16m2());
    vuint16m2_t vidx = __riscv_vid_v_u16m2(__riscv_vsetvlmax_e16m2());
    size_t n = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
    for (size_t vl; n > 0; n -= vl, src0 += vl) {
        vl = __riscv_vsetvl_e32m4(n);
        vfloat32m4x2_t vc = __riscv_vlseg2e32_v_f32m4x2((const float*)src0, vl);
        vfloat32m4_t vr = __riscv_vget_f32m4(vc, 0), vi = __riscv_vget_f32m4(vc, 1);
        vfloat32m4_t v = __riscv_vfmacc(__riscv_vfmul(vr, vr, vl), vi, vi, vl);
        vbool8_t m = __riscv_vmflt(vmax, v, vl);
        vmax = __riscv_vfmax_tu(vmax, vmax, v, vl);
        vmaxi = __riscv_vmerge_tu(vmaxi, vmaxi, vidx, m, vl);
        vidx = __riscv_vadd(vidx, vl, __riscv_vsetvlmax_e16m2());
    }
    size_t vl = __riscv_vsetvlmax_e32m4();
    float max = __riscv_vfmv_f(__riscv_vfredmax(RISCV_SHRINK4(vfmax, f, 32, vmax),
                                                __riscv_vfmv_v_f_f32m1(0, 1),
                                                __riscv_vsetvlmax_e32m1()));
    // spill the lanes to memory and pick the smallest index holding the max
    __attribute__((aligned(32))) float values[128];
    __attribute__((aligned(32))) uint16_t indices[128];
    __riscv_vse32(values, vmax, vl);
    __riscv_vse16(indices, vmaxi, vl);
    uint16_t min_idx = UINT16_MAX;
    for (size_t i = 0; i < vl; i++) {
        if (values[i] == max && indices[i] < min_idx) {
            min_idx = indices[i];
        }
    }
    *target = min_idx;
}

#endif /*LV_HAVE_RVVSEG*/

#endif /*INCLUDED_volk_32fc_index_max_16u_u_H*/