58#ifndef INCLUDED_volk_32f_index_max_16u_a_H
59#define INCLUDED_volk_32f_index_max_16u_a_H
/* Fragment of an 8-lane (__m256) max-index search. Only part of the routine
 * is visible in this view; each comment describes the line(s) directly
 * below it. */
/* Cap the element count at USHRT_MAX. */
72 num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
/* Count of complete 8-float groups for the vector loop. */
75 const uint32_t eighthPoints = num_points / 8;
/* NOTE(review): the cast produces a mutable float* from src0; presumably it
 * drops a const qualifier, as src0 is const in the sibling signatures --
 * confirm. */
77 float* inputPtr = (
float*)src0;
/* Index step per iteration, and lane indexes primed to -8..-1 (lowest lane
 * holds -8) so that the first add inside the loop yields 0..7. */
79 __m256 indexIncrementValues = _mm256_set1_ps(8);
80 __m256 currentIndexes = _mm256_set_ps(-1, -2, -3, -4, -5, -6, -7, -8);
/* Running per-lane maximum, seeded from the scalar max, and the index at
 * which each lane's maximum was seen (initially 0). */
84 __m256 maxValues = _mm256_set1_ps(max);
85 __m256 maxValuesIndex = _mm256_setzero_ps();
86 __m256 compareResults;
/* Vector loop over the complete 8-float groups. */
92 for (; number < eighthPoints; number++) {
/* Aligned 256-bit load from inputPtr. */
94 currentValues = _mm256_load_ps(inputPtr);
/* Step the lane indexes forward by 8. */
96 currentIndexes = _mm256_add_ps(currentIndexes, indexIncrementValues);
/* Lane mask: loaded value strictly greater than the running maximum. */
98 compareResults = _mm256_cmp_ps(currentValues, maxValues, _CMP_GT_OS);
/* Where the mask is set take the new index / value, otherwise keep the old. */
100 maxValuesIndex = _mm256_blendv_ps(maxValuesIndex, currentIndexes, compareResults);
101 maxValues = _mm256_blendv_ps(maxValues, currentValues, compareResults);
/* Spill both vectors to scalar buffers for the cross-lane reduction. */
105 _mm256_store_ps(maxValuesBuffer, maxValues);
106 _mm256_store_ps(maxIndexesBuffer, maxValuesIndex);
/* Scan the 8 lanes: a strictly larger value wins; on an equal value the
 * smaller index is kept. */
108 for (number = 0; number < 8; number++) {
109 if (maxValuesBuffer[number] > max) {
110 index = maxIndexesBuffer[number];
111 max = maxValuesBuffer[number];
112 }
else if (maxValuesBuffer[number] == max) {
113 if (index > maxIndexesBuffer[number])
114 index = maxIndexesBuffer[number];
/* Scalar pass starting at the first element after the last complete group. */
118 number = eighthPoints * 8;
119 for (; number < num_points; number++) {
120 if (src0[number] > max) {
/* Write the result index, narrowed to uint16_t. */
125 target[0] = (uint16_t)index;
131#include <smmintrin.h>
/* 4-lane (__m128) max-index search using the SSE4.1 blend instruction.
 * Takes the output slot `target`, the input floats `src0` and the element
 * count `num_points`. Only part of the body is visible in this view; each
 * comment describes the line(s) directly below it. */
134volk_32f_index_max_16u_a_sse4_1(uint16_t* target,
const float* src0, uint32_t num_points)
/* Cap the element count at USHRT_MAX. */
136 num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
/* Count of complete 4-float groups for the vector loop. */
139 const uint32_t quarterPoints = num_points / 4;
/* The cast drops the const qualifier of src0. */
141 float* inputPtr = (
float*)src0;
/* Index step per iteration, and lane indexes primed to -4..-1 (lowest lane
 * holds -4) so that the first add inside the loop yields 0..3. */
143 __m128 indexIncrementValues = _mm_set1_ps(4);
144 __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4);
/* Running per-lane maximum, seeded from the scalar max, and the index at
 * which each lane's maximum was seen (initially 0). */
148 __m128 maxValues = _mm_set1_ps(max);
149 __m128 maxValuesIndex = _mm_setzero_ps();
150 __m128 compareResults;
151 __m128 currentValues;
/* Vector loop over the complete 4-float groups. */
156 for (; number < quarterPoints; number++) {
/* Aligned 128-bit load from inputPtr. */
158 currentValues = _mm_load_ps(inputPtr);
/* Step the lane indexes forward by 4. */
160 currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
/* Lane mask: loaded value strictly greater than the running maximum. */
162 compareResults = _mm_cmpgt_ps(currentValues, maxValues);
/* Where the mask is set take the new index / value, otherwise keep the old. */
164 maxValuesIndex = _mm_blendv_ps(maxValuesIndex, currentIndexes, compareResults);
165 maxValues = _mm_blendv_ps(maxValues, currentValues, compareResults);
/* Spill both vectors to scalar buffers for the cross-lane reduction. */
169 _mm_store_ps(maxValuesBuffer, maxValues);
170 _mm_store_ps(maxIndexesBuffer, maxValuesIndex);
/* Scan the 4 lanes: a strictly larger value wins; on an equal value the
 * smaller index is kept. */
172 for (number = 0; number < 4; number++) {
173 if (maxValuesBuffer[number] > max) {
174 index = maxIndexesBuffer[number];
175 max = maxValuesBuffer[number];
176 }
else if (maxValuesBuffer[number] == max) {
177 if (index > maxIndexesBuffer[number])
178 index = maxIndexesBuffer[number];
/* Scalar pass starting at the first element after the last complete group. */
182 number = quarterPoints * 4;
183 for (; number < num_points; number++) {
184 if (src0[number] > max) {
/* Write the result index, narrowed to uint16_t. */
189 target[0] = (uint16_t)index;
197#include <xmmintrin.h>
/* Fragment of a 4-lane (__m128) max-index search that builds the select
 * from and/andnot/or instead of a blend instruction. Only part of the
 * routine is visible in this view; each comment describes the line(s)
 * directly below it. */
/* Cap the element count at USHRT_MAX. */
202 num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
/* Count of complete 4-float groups for the vector loop. */
205 const uint32_t quarterPoints = num_points / 4;
/* NOTE(review): the cast produces a mutable float* from src0; presumably it
 * drops a const qualifier, as src0 is const in the sibling signatures --
 * confirm. */
207 float* inputPtr = (
float*)src0;
/* Index step per iteration, and lane indexes primed to -4..-1 (lowest lane
 * holds -4) so that the first add inside the loop yields 0..3. */
209 __m128 indexIncrementValues = _mm_set1_ps(4);
210 __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4);
/* Running per-lane maximum, seeded from the scalar max, and the index at
 * which each lane's maximum was seen (initially 0). */
214 __m128 maxValues = _mm_set1_ps(max);
215 __m128 maxValuesIndex = _mm_setzero_ps();
216 __m128 compareResults;
217 __m128 currentValues;
/* Vector loop over the complete 4-float groups. */
222 for (; number < quarterPoints; number++) {
/* Aligned 128-bit load from inputPtr. */
224 currentValues = _mm_load_ps(inputPtr);
/* Step the lane indexes forward by 4. */
226 currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
/* Lane mask: loaded value strictly greater than the running maximum. */
228 compareResults = _mm_cmpgt_ps(currentValues, maxValues);
/* Bitwise select: (mask & new) | (~mask & old), first for the indexes and
 * then for the values. */
230 maxValuesIndex = _mm_or_ps(_mm_and_ps(compareResults, currentIndexes),
231 _mm_andnot_ps(compareResults, maxValuesIndex));
232 maxValues = _mm_or_ps(_mm_and_ps(compareResults, currentValues),
233 _mm_andnot_ps(compareResults, maxValues));
/* Spill both vectors to scalar buffers for the cross-lane reduction. */
237 _mm_store_ps(maxValuesBuffer, maxValues);
238 _mm_store_ps(maxIndexesBuffer, maxValuesIndex);
/* Scan the 4 lanes: a strictly larger value wins; on an equal value the
 * smaller index is kept. */
240 for (number = 0; number < 4; number++) {
241 if (maxValuesBuffer[number] > max) {
242 index = maxIndexesBuffer[number];
243 max = maxValuesBuffer[number];
244 }
else if (maxValuesBuffer[number] == max) {
245 if (index > maxIndexesBuffer[number])
246 index = maxIndexesBuffer[number];
/* Scalar pass starting at the first element after the last complete group. */
250 number = quarterPoints * 4;
251 for (; number < num_points; number++) {
252 if (src0[number] > max) {
/* Write the result index, narrowed to uint16_t. */
257 target[0] = (uint16_t)index;
/* Fragment of a 4-lane NEON max-index search that reduces across lanes
 * through scalar buffers. Only part of the routine is visible in this view;
 * each comment describes the line(s) directly below it. */
/* Cap the element count at USHRT_MAX. */
271 num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
/* Count of complete 4-float groups for the vector loop. */
276 const uint32_t quarter_points = num_points / 4;
277 const float* inputPtr = src0;
/* Lane indexes start at 0..3 and step by 4. */
280 uint32x4_t vec_indices = { 0, 1, 2, 3 };
281 const uint32x4_t vec_incr = vdupq_n_u32(4);
/* Running per-lane maximum seeded with -FLT_MAX, and the index at which
 * each lane's maximum was seen (initially 0). */
283 float32x4_t vec_max = vdupq_n_f32(-FLT_MAX);
284 uint32x4_t vec_max_idx = vdupq_n_u32(0);
/* Vector loop over the complete 4-float groups. */
286 for (uint32_t i = 0; i < quarter_points; i++) {
287 float32x4_t vec_val = vld1q_f32(inputPtr);
/* Lane mask: loaded value strictly greater than the running maximum; where
 * it is set the current index replaces the stored one. */
291 uint32x4_t gt_mask = vcgtq_f32(vec_val, vec_max);
292 vec_max_idx = vbslq_u32(gt_mask, vec_indices, vec_max_idx);
/* Lane-wise maximum of the loaded values and the running maximum. */
295 vec_max = vmaxq_f32(vec_val, vec_max);
/* Step the lane indexes forward by 4. */
297 vec_indices = vaddq_u32(vec_indices, vec_incr);
/* Spill both vectors to scalar buffers for the cross-lane reduction. */
303 vst1q_f32(max_buf, vec_max);
304 vst1q_u32(idx_buf, vec_max_idx);
/* Start from lane 0 and scan lanes 1..3: a strictly larger value wins; on
 * an equal value the smaller index is kept. */
306 float max_val = max_buf[0];
307 uint32_t result_idx = idx_buf[0];
308 for (
int i = 1; i < 4; i++) {
309 if (max_buf[i] > max_val) {
310 max_val = max_buf[i];
311 result_idx = idx_buf[i];
312 }
else if (max_buf[i] == max_val && idx_buf[i] < result_idx) {
313 result_idx = idx_buf[i];
/* Scalar pass starting at the first element after the last complete group. */
318 for (uint32_t i = quarter_points * 4; i < num_points; i++) {
319 if (src0[i] > max_val) {
/* Write the result index, narrowed to uint16_t. */
325 *target = (uint16_t)result_idx;
/* 4-lane NEON max-index search that reduces across lanes with the
 * across-vector max/min intrinsics. Takes the output slot `target`, the
 * input floats `src0` and the element count `num_points`. Only part of the
 * body is visible in this view; each comment describes the line(s) directly
 * below it. */
337volk_32f_index_max_16u_neonv8(uint16_t* target,
const float* src0, uint32_t num_points)
/* Cap the element count at USHRT_MAX. */
339 num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
/* Count of complete 4-float groups for the vector loop. */
344 const uint32_t quarter_points = num_points / 4;
345 const float* inputPtr = src0;
/* Lane indexes start at 0..3 and step by 4. */
348 uint32x4_t vec_indices = { 0, 1, 2, 3 };
349 const uint32x4_t vec_incr = vdupq_n_u32(4);
/* Running per-lane maximum seeded with -FLT_MAX, and the index at which
 * each lane's maximum was seen (initially 0). */
351 float32x4_t vec_max = vdupq_n_f32(-FLT_MAX);
352 uint32x4_t vec_max_idx = vdupq_n_u32(0);
/* Vector loop over the complete 4-float groups. */
354 for (uint32_t i = 0; i < quarter_points; i++) {
355 float32x4_t vec_val = vld1q_f32(inputPtr);
/* Lane mask: loaded value strictly greater than the running maximum; where
 * it is set the current index replaces the stored one. */
359 uint32x4_t gt_mask = vcgtq_f32(vec_val, vec_max);
360 vec_max_idx = vbslq_u32(gt_mask, vec_indices, vec_max_idx);
/* Lane-wise maximum of the loaded values and the running maximum. */
363 vec_max = vmaxq_f32(vec_val, vec_max);
/* Step the lane indexes forward by 4. */
365 vec_indices = vaddq_u32(vec_indices, vec_incr);
/* Largest value across the four lanes. */
369 float max_val = vmaxvq_f32(vec_max);
/* Keep the stored index only in lanes whose maximum equals max_val; every
 * other lane becomes UINT32_MAX so the minimum below ignores it. */
370 uint32x4_t max_mask = vceqq_f32(vec_max, vdupq_n_f32(max_val));
371 uint32x4_t idx_masked = vbslq_u32(max_mask, vec_max_idx, vdupq_n_u32(UINT32_MAX));
/* Smallest index among the lanes that hold the maximum. */
372 uint32_t result_idx = vminvq_u32(idx_masked);
/* Scalar pass starting at the first element after the last complete group. */
375 for (uint32_t i = quarter_points * 4; i < num_points; i++) {
376 if (src0[i] > max_val) {
/* Write the result index, narrowed to uint16_t. */
382 *target = (uint16_t)result_idx;
388#ifdef LV_HAVE_GENERIC
/* Fragment of the scalar routine guarded by LV_HAVE_GENERIC; only these two
 * lines are visible in this view. */
/* Cap the element count at USHRT_MAX. */
393 num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
/* Iterate i up to the capped count; the initial value of i and the loop
 * body are not visible here. */
400 for (; i < num_points; ++i) {
411#ifdef LV_HAVE_AVX512F
412#include <immintrin.h>
/* 16-lane (__m512) max-index search using aligned loads and mask-register
 * compares. Takes the output slot `target`, the input floats `src0` and the
 * element count `num_points`. Only part of the body is visible in this
 * view; each comment describes the line(s) directly below it. */
416volk_32f_index_max_16u_a_avx512f(uint16_t* target,
const float* src0, uint32_t num_points)
/* Cap the element count at USHRT_MAX. */
418 num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
/* Count of complete 16-float groups for the vector loop. */
421 const uint32_t sixteenthPoints = num_points / 16;
423 const float* inputPtr = src0;
/* Index step per iteration, and lane indexes primed to -16..-1 (lowest lane
 * holds -16) so that the first add inside the loop yields 0..15. */
425 __m512 indexIncrementValues = _mm512_set1_ps(16);
426 __m512 currentIndexes = _mm512_set_ps(
427 -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15, -16);
/* Running per-lane maximum, seeded from the scalar max, and the index at
 * which each lane's maximum was seen (initially 0). The compare result is a
 * 16-bit mask, one bit per lane. */
431 __m512 maxValues = _mm512_set1_ps(max);
432 __m512 maxValuesIndex = _mm512_setzero_ps();
433 __mmask16 compareResults;
434 __m512 currentValues;
/* Vector loop over the complete 16-float groups. */
439 for (; number < sixteenthPoints; number++) {
/* Aligned 512-bit load from inputPtr. */
440 currentValues = _mm512_load_ps(inputPtr);
/* Step the lane indexes forward by 16. */
442 currentIndexes = _mm512_add_ps(currentIndexes, indexIncrementValues);
/* Lane mask: loaded value strictly greater than the running maximum. */
443 compareResults = _mm512_cmp_ps_mask(currentValues, maxValues, _CMP_GT_OS);
/* Select the new index where the mask bit is set, otherwise the old one.
 * NOTE(review): as visible here the blend result is not assigned to
 * anything; presumably the `maxValuesIndex =` target sits on a preceding
 * line that is not part of this view -- confirm. */
445 _mm512_mask_blend_ps(compareResults, maxValuesIndex, currentIndexes);
/* Select the new value where the mask bit is set, otherwise keep the old. */
446 maxValues = _mm512_mask_blend_ps(compareResults, maxValues, currentValues);
/* Spill both vectors to scalar buffers for the cross-lane reduction. */
450 _mm512_store_ps(maxValuesBuffer, maxValues);
451 _mm512_store_ps(maxIndexesBuffer, maxValuesIndex);
/* Scan the 16 lanes: a strictly larger value wins; on an equal value the
 * smaller index is kept. */
453 for (number = 0; number < 16; number++) {
454 if (maxValuesBuffer[number] > max) {
455 index = maxIndexesBuffer[number];
456 max = maxValuesBuffer[number];
457 }
else if (maxValuesBuffer[number] == max) {
458 if (index > maxIndexesBuffer[number])
459 index = maxIndexesBuffer[number];
/* Scalar pass starting at the first element after the last complete group. */
463 number = sixteenthPoints * 16;
464 for (; number < num_points; number++) {
465 if (src0[number] > max) {
/* Write the result index, narrowed to uint16_t. */
470 target[0] = (uint16_t)index;
478#ifndef INCLUDED_volk_32f_index_max_16u_u_H
479#define INCLUDED_volk_32f_index_max_16u_u_H
487#include <immintrin.h>
/* Fragment of an 8-lane (__m256) max-index search that uses unaligned loads
 * and stores. Only part of the routine is visible in this view; each
 * comment describes the line(s) directly below it. */
/* Cap the element count at USHRT_MAX. */
492 num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
/* Count of complete 8-float groups for the vector loop. */
495 const uint32_t eighthPoints = num_points / 8;
/* NOTE(review): the cast produces a mutable float* from src0; presumably it
 * drops a const qualifier, as src0 is const in the sibling signatures --
 * confirm. */
497 float* inputPtr = (
float*)src0;
/* Index step per iteration, and lane indexes primed to -8..-1 (lowest lane
 * holds -8) so that the first add inside the loop yields 0..7. */
499 __m256 indexIncrementValues = _mm256_set1_ps(8);
500 __m256 currentIndexes = _mm256_set_ps(-1, -2, -3, -4, -5, -6, -7, -8);
/* Running per-lane maximum, seeded from the scalar max, and the index at
 * which each lane's maximum was seen (initially 0). */
504 __m256 maxValues = _mm256_set1_ps(max);
505 __m256 maxValuesIndex = _mm256_setzero_ps();
506 __m256 compareResults;
507 __m256 currentValues;
/* Vector loop over the complete 8-float groups. */
512 for (; number < eighthPoints; number++) {
/* Unaligned 256-bit load from inputPtr. */
514 currentValues = _mm256_loadu_ps(inputPtr);
/* Step the lane indexes forward by 8. */
516 currentIndexes = _mm256_add_ps(currentIndexes, indexIncrementValues);
/* Lane mask: loaded value strictly greater than the running maximum. */
518 compareResults = _mm256_cmp_ps(currentValues, maxValues, _CMP_GT_OS);
/* Where the mask is set take the new index / value, otherwise keep the old. */
520 maxValuesIndex = _mm256_blendv_ps(maxValuesIndex, currentIndexes, compareResults);
521 maxValues = _mm256_blendv_ps(maxValues, currentValues, compareResults);
/* Spill both vectors to scalar buffers (unaligned stores) for the
 * cross-lane reduction. */
525 _mm256_storeu_ps(maxValuesBuffer, maxValues);
526 _mm256_storeu_ps(maxIndexesBuffer, maxValuesIndex);
/* Scan the 8 lanes: a strictly larger value wins; on an equal value the
 * smaller index is kept. */
528 for (number = 0; number < 8; number++) {
529 if (maxValuesBuffer[number] > max) {
530 index = maxIndexesBuffer[number];
531 max = maxValuesBuffer[number];
532 }
else if (maxValuesBuffer[number] == max) {
533 if (index > maxIndexesBuffer[number])
534 index = maxIndexesBuffer[number];
/* Scalar pass starting at the first element after the last complete group. */
538 number = eighthPoints * 8;
539 for (; number < num_points; number++) {
540 if (src0[number] > max) {
/* Write the result index, narrowed to uint16_t. */
545 target[0] = (uint16_t)index;
550#ifdef LV_HAVE_AVX512F
551#include <immintrin.h>
/* 16-lane (__m512) max-index search using unaligned loads and mask-register
 * compares. Takes the output slot `target`, the input floats `src0` and the
 * element count `num_points`. Only part of the body is visible in this
 * view; each comment describes the line(s) directly below it. */
555volk_32f_index_max_16u_u_avx512f(uint16_t* target,
const float* src0, uint32_t num_points)
/* Cap the element count at USHRT_MAX. */
557 num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
/* Count of complete 16-float groups for the vector loop. */
560 const uint32_t sixteenthPoints = num_points / 16;
562 const float* inputPtr = src0;
/* Index step per iteration, and lane indexes primed to -16..-1 (lowest lane
 * holds -16) so that the first add inside the loop yields 0..15. */
564 __m512 indexIncrementValues = _mm512_set1_ps(16);
565 __m512 currentIndexes = _mm512_set_ps(
566 -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15, -16);
/* Running per-lane maximum, seeded from the scalar max, and the index at
 * which each lane's maximum was seen (initially 0). The compare result is a
 * 16-bit mask, one bit per lane. */
570 __m512 maxValues = _mm512_set1_ps(max);
571 __m512 maxValuesIndex = _mm512_setzero_ps();
572 __mmask16 compareResults;
573 __m512 currentValues;
/* Vector loop over the complete 16-float groups. */
578 for (; number < sixteenthPoints; number++) {
/* Unaligned 512-bit load from inputPtr. */
579 currentValues = _mm512_loadu_ps(inputPtr);
/* Step the lane indexes forward by 16. */
581 currentIndexes = _mm512_add_ps(currentIndexes, indexIncrementValues);
/* Lane mask: loaded value strictly greater than the running maximum. */
582 compareResults = _mm512_cmp_ps_mask(currentValues, maxValues, _CMP_GT_OS);
/* Select the new index where the mask bit is set, otherwise the old one.
 * NOTE(review): as visible here the blend result is not assigned to
 * anything; presumably the `maxValuesIndex =` target sits on a preceding
 * line that is not part of this view -- confirm. */
584 _mm512_mask_blend_ps(compareResults, maxValuesIndex, currentIndexes);
/* Select the new value where the mask bit is set, otherwise keep the old. */
585 maxValues = _mm512_mask_blend_ps(compareResults, maxValues, currentValues);
/* Spill both vectors to scalar buffers for the cross-lane reduction.
 * NOTE(review): these are aligned stores, which need 64-byte aligned
 * destinations; the buffer declarations are not visible here -- confirm
 * they carry that alignment, since this variant otherwise uses unaligned
 * access. */
589 _mm512_store_ps(maxValuesBuffer, maxValues);
590 _mm512_store_ps(maxIndexesBuffer, maxValuesIndex);
/* Scan the 16 lanes: a strictly larger value wins; on an equal value the
 * smaller index is kept. */
592 for (number = 0; number < 16; number++) {
593 if (maxValuesBuffer[number] > max) {
594 index = maxIndexesBuffer[number];
595 max = maxValuesBuffer[number];
596 }
else if (maxValuesBuffer[number] == max) {
597 if (index > maxIndexesBuffer[number])
598 index = maxIndexesBuffer[number];
/* Scalar pass starting at the first element after the last complete group. */
602 number = sixteenthPoints * 16;
603 for (; number < num_points; number++) {
604 if (src0[number] > max) {
/* Write the result index, narrowed to uint16_t. */
609 target[0] = (uint16_t)index;
616#include <riscv_vector.h>
/* RISC-V Vector max-index search: f32 values in LMUL=8 register groups with
 * matching u16 indexes in LMUL=4 groups. Takes the output slot `target`,
 * the input floats `src0` and the element count `num_points`. Only part of
 * the body is visible in this view; each comment describes the line(s)
 * directly below it. */
619volk_32f_index_max_16u_rvv(uint16_t* target,
const float* src0, uint32_t num_points)
/* Running per-element maximum seeded with -FLT_MAX, the index stored for
 * each element's maximum (initially 0), and the current element indexes
 * 0, 1, 2, ... */
621 vfloat32m8_t vmax = __riscv_vfmv_v_f_f32m8(-FLT_MAX, __riscv_vsetvlmax_e32m8());
622 vuint16m4_t vmaxi = __riscv_vmv_v_x_u16m4(0, __riscv_vsetvlmax_e16m4());
623 vuint16m4_t vidx = __riscv_vid_v_u16m4(__riscv_vsetvlmax_e16m4());
/* Element count capped at USHRT_MAX. */
624 size_t n = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
/* Strip-mined loop: each pass handles vl elements, then shrinks n and
 * advances src0 by vl. */
625 for (
size_t vl; n > 0; n -= vl, src0 += vl) {
626 vl = __riscv_vsetvl_e32m8(n);
627 vfloat32m8_t v = __riscv_vle32_v_f32m8(src0, vl);
/* Mask: loaded value strictly greater than the running maximum. */
628 vbool4_t m = __riscv_vmfgt(v, vmax, vl);
/* Tail-undisturbed updates: elements past vl keep their previous maximum
 * and index; within vl the index is replaced only where the mask is set. */
629 vmax = __riscv_vfmax_tu(vmax, vmax, v, vl);
630 vmaxi = __riscv_vmerge_tu(vmaxi, vmaxi, vidx, m, vl);
/* Step every element index forward by vl (applied over the full register
 * group). */
631 vidx = __riscv_vadd(vidx, vl, __riscv_vsetvlmax_e16m4());
/* The final reduction works on the full register group. */
633 size_t vl = __riscv_vsetvlmax_e32m8();
/* Max-reduce to one scalar, seeded with -FLT_MAX. RISCV_SHRINK8 is a macro
 * defined elsewhere; presumably it folds the m8 group down to m1 with vfmax
 * -- confirm against its definition. */
634 float max = __riscv_vfmv_f(__riscv_vfredmax(
RISCV_SHRINK8(vfmax, f, 32, vmax),
635 __riscv_vfmv_v_f_f32m1(-FLT_MAX, 1),
636 __riscv_vsetvlmax_e32m1()));
/* Keep the stored index only where the element's maximum equals the overall
 * maximum; every other element becomes UINT16_MAX so the minimum below
 * ignores it. */
638 vbool4_t m = __riscv_vmfeq(vmax, max, vl);
639 vuint16m4_t idx_masked = __riscv_vmerge(
640 __riscv_vmv_v_x_u16m4(UINT16_MAX, __riscv_vsetvlmax_e16m4()), vmaxi, m, vl);
/* Unsigned min-reduce of the masked indexes, seeded with UINT16_MAX, written
 * to *target. RISCV_SHRINK4 is a macro defined elsewhere; presumably it
 * folds the m4 group down to m1 with vminu -- confirm. */
642 *target = __riscv_vmv_x(__riscv_vredminu(
RISCV_SHRINK4(vminu, u, 16, idx_masked),
643 __riscv_vmv_v_x_u16m1(UINT16_MAX, 1),
644 __riscv_vsetvlmax_e16m1()));