Vector Optimized Library of Kernels 3.3.0
Architecture-tuned implementations of math kernels
Loading...
Searching...
No Matches
volk_32f_index_max_32u.h
Go to the documentation of this file.
1/* -*- c++ -*- */
2/*
3 * Copyright 2016 Free Software Foundation, Inc.
4 *
5 * This file is part of VOLK
6 *
7 * SPDX-License-Identifier: LGPL-3.0-or-later
8 */
9
51
52#ifndef INCLUDED_volk_32f_index_max_32u_a_H
53#define INCLUDED_volk_32f_index_max_32u_a_H
54
#include <inttypes.h>
#include <math.h>
#include <stdio.h>
#include <volk/volk_common.h>
58
59#ifdef LV_HAVE_SSE4_1
60#include <smmintrin.h>
61
62static inline void
63volk_32f_index_max_32u_a_sse4_1(uint32_t* target, const float* src0, uint32_t num_points)
64{
65 if (num_points > 0) {
66 uint32_t number = 0;
67 const uint32_t quarterPoints = num_points / 4;
68
69 float* inputPtr = (float*)src0;
70
71 __m128 indexIncrementValues = _mm_set1_ps(4);
72 __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4);
73
74 float max = src0[0];
75 float index = 0;
76 __m128 maxValues = _mm_set1_ps(max);
77 __m128 maxValuesIndex = _mm_setzero_ps();
78 __m128 compareResults;
79 __m128 currentValues;
80
81 __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4];
82 __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4];
83
84 for (; number < quarterPoints; number++) {
85
86 currentValues = _mm_load_ps(inputPtr);
87 inputPtr += 4;
88 currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
89
90 compareResults = _mm_cmpgt_ps(currentValues, maxValues);
91
92 maxValuesIndex =
93 _mm_blendv_ps(maxValuesIndex, currentIndexes, compareResults);
94 maxValues = _mm_blendv_ps(maxValues, currentValues, compareResults);
95 }
96
97 // Calculate the largest value from the remaining 4 points
98 _mm_store_ps(maxValuesBuffer, maxValues);
99 _mm_store_ps(maxIndexesBuffer, maxValuesIndex);
100
101 for (number = 0; number < 4; number++) {
102 if (maxValuesBuffer[number] > max) {
103 index = maxIndexesBuffer[number];
104 max = maxValuesBuffer[number];
105 } else if (maxValuesBuffer[number] == max) {
106 if (index > maxIndexesBuffer[number])
107 index = maxIndexesBuffer[number];
108 }
109 }
110
111 number = quarterPoints * 4;
112 for (; number < num_points; number++) {
113 if (src0[number] > max) {
114 index = number;
115 max = src0[number];
116 }
117 }
118 target[0] = (uint32_t)index;
119 }
120}
121
122#endif /*LV_HAVE_SSE4_1*/
123
124
125#ifdef LV_HAVE_SSE
126
127#include <xmmintrin.h>
128
129static inline void
130volk_32f_index_max_32u_a_sse(uint32_t* target, const float* src0, uint32_t num_points)
131{
132 if (num_points > 0) {
133 uint32_t number = 0;
134 const uint32_t quarterPoints = num_points / 4;
135
136 float* inputPtr = (float*)src0;
137
138 __m128 indexIncrementValues = _mm_set1_ps(4);
139 __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4);
140
141 float max = src0[0];
142 float index = 0;
143 __m128 maxValues = _mm_set1_ps(max);
144 __m128 maxValuesIndex = _mm_setzero_ps();
145 __m128 compareResults;
146 __m128 currentValues;
147
148 __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4];
149 __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4];
150
151 for (; number < quarterPoints; number++) {
152
153 currentValues = _mm_load_ps(inputPtr);
154 inputPtr += 4;
155 currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
156
157 compareResults = _mm_cmpgt_ps(currentValues, maxValues);
158
159 maxValuesIndex = _mm_or_ps(_mm_and_ps(compareResults, currentIndexes),
160 _mm_andnot_ps(compareResults, maxValuesIndex));
161
162 maxValues = _mm_or_ps(_mm_and_ps(compareResults, currentValues),
163 _mm_andnot_ps(compareResults, maxValues));
164 }
165
166 // Calculate the largest value from the remaining 4 points
167 _mm_store_ps(maxValuesBuffer, maxValues);
168 _mm_store_ps(maxIndexesBuffer, maxValuesIndex);
169
170 for (number = 0; number < 4; number++) {
171 if (maxValuesBuffer[number] > max) {
172 index = maxIndexesBuffer[number];
173 max = maxValuesBuffer[number];
174 } else if (maxValuesBuffer[number] == max) {
175 if (index > maxIndexesBuffer[number])
176 index = maxIndexesBuffer[number];
177 }
178 }
179
180 number = quarterPoints * 4;
181 for (; number < num_points; number++) {
182 if (src0[number] > max) {
183 index = number;
184 max = src0[number];
185 }
186 }
187 target[0] = (uint32_t)index;
188 }
189}
190
191#endif /*LV_HAVE_SSE*/
192
193
194#ifdef LV_HAVE_AVX
195#include <immintrin.h>
196
197static inline void
198volk_32f_index_max_32u_a_avx(uint32_t* target, const float* src0, uint32_t num_points)
199{
200 if (num_points > 0) {
201 uint32_t number = 0;
202 const uint32_t quarterPoints = num_points / 8;
203
204 float* inputPtr = (float*)src0;
205
206 __m256 indexIncrementValues = _mm256_set1_ps(8);
207 __m256 currentIndexes = _mm256_set_ps(-1, -2, -3, -4, -5, -6, -7, -8);
208
209 float max = src0[0];
210 float index = 0;
211 __m256 maxValues = _mm256_set1_ps(max);
212 __m256 maxValuesIndex = _mm256_setzero_ps();
213 __m256 compareResults;
214 __m256 currentValues;
215
216 __VOLK_ATTR_ALIGNED(32) float maxValuesBuffer[8];
217 __VOLK_ATTR_ALIGNED(32) float maxIndexesBuffer[8];
218
219 for (; number < quarterPoints; number++) {
220 currentValues = _mm256_load_ps(inputPtr);
221 inputPtr += 8;
222 currentIndexes = _mm256_add_ps(currentIndexes, indexIncrementValues);
223 compareResults = _mm256_cmp_ps(currentValues, maxValues, _CMP_GT_OS);
224 maxValuesIndex =
225 _mm256_blendv_ps(maxValuesIndex, currentIndexes, compareResults);
226 maxValues = _mm256_blendv_ps(maxValues, currentValues, compareResults);
227 }
228
229 // Calculate the largest value from the remaining 8 points
230 _mm256_store_ps(maxValuesBuffer, maxValues);
231 _mm256_store_ps(maxIndexesBuffer, maxValuesIndex);
232
233 for (number = 0; number < 8; number++) {
234 if (maxValuesBuffer[number] > max) {
235 index = maxIndexesBuffer[number];
236 max = maxValuesBuffer[number];
237 } else if (maxValuesBuffer[number] == max) {
238 if (index > maxIndexesBuffer[number])
239 index = maxIndexesBuffer[number];
240 }
241 }
242
243 number = quarterPoints * 8;
244 for (; number < num_points; number++) {
245 if (src0[number] > max) {
246 index = number;
247 max = src0[number];
248 }
249 }
250 target[0] = (uint32_t)index;
251 }
252}
253
254#endif /*LV_HAVE_AVX*/
255
256
257#ifdef LV_HAVE_NEON
258#include <arm_neon.h>
259
/*
 * Finds the index of the largest value in src0 and writes it to target[0]
 * (NEON, 4 floats per iteration).
 *
 * target:     receives the index of the first occurrence of the maximum; it is
 *             left untouched when num_points is 0.
 * src0:       input samples.
 * num_points: number of floats in src0.
 *
 * A lane is updated only when the new sample is strictly greater than the
 * lane's running maximum, and the value and the index are selected with the
 * same mask. A NaN sample therefore never replaces a lane maximum, which also
 * means it cannot stick in a lane and hide later samples.
 * Indices are tracked as 32-bit integers, so they stay exact for any
 * num_points.
 */
static inline void
volk_32f_index_max_32u_neon(uint32_t* target, const float* src0, uint32_t num_points)
{
    if (num_points == 0) {
        return;
    }

    const uint32_t quarterPoints = num_points / 4;
    const float* inputPtr = src0;

    const uint32_t firstIndexes[4] = { 0, 1, 2, 3 };
    uint32x4_t currentIndexes = vld1q_u32(firstIndexes);
    const uint32x4_t indexIncrementValues = vdupq_n_u32(4);

    float max = src0[0];
    uint32_t index = 0;
    float32x4_t maxValues = vdupq_n_f32(max);
    uint32x4_t maxValuesIndex = vdupq_n_u32(0);

    __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4];
    __VOLK_ATTR_ALIGNED(16) uint32_t maxIndexesBuffer[4];

    for (uint32_t number = 0; number < quarterPoints; number++) {
        const float32x4_t currentValues = vld1q_f32(inputPtr);
        inputPtr += 4;

        // All-ones in lanes where the new sample is strictly larger.
        const uint32x4_t compareResults = vcgtq_f32(currentValues, maxValues);
        maxValuesIndex = vbslq_u32(compareResults, currentIndexes, maxValuesIndex);
        maxValues = vbslq_f32(compareResults, currentValues, maxValues);

        currentIndexes = vaddq_u32(currentIndexes, indexIncrementValues);
    }

    // Reduce the four per-lane maxima to a single one; ties go to the lowest index.
    vst1q_f32(maxValuesBuffer, maxValues);
    vst1q_u32(maxIndexesBuffer, maxValuesIndex);

    for (uint32_t lane = 0; lane < 4; lane++) {
        if (maxValuesBuffer[lane] > max) {
            index = maxIndexesBuffer[lane];
            max = maxValuesBuffer[lane];
        } else if (maxValuesBuffer[lane] == max && maxIndexesBuffer[lane] < index) {
            index = maxIndexesBuffer[lane];
        }
    }

    // Scalar tail for the last num_points % 4 samples.
    for (uint32_t number = quarterPoints * 4; number < num_points; number++) {
        if (src0[number] > max) {
            index = number;
            max = src0[number];
        }
    }
    target[0] = index;
}
318
319#endif /*LV_HAVE_NEON*/
320
321
322#ifdef LV_HAVE_NEONV8
323#include <arm_neon.h>
324#include <float.h>
325
/*
 * Finds the index of the largest value in src0 and writes it to target[0]
 * (ARMv8 NEON, 4 floats per iteration, horizontal reductions).
 *
 * target:     receives the index of the first occurrence of the maximum; it is
 *             left untouched when num_points is 0.
 * src0:       input samples.
 * num_points: number of floats in src0.
 *
 * The running maxima are seeded with src0[0] (index 0) rather than a finite
 * sentinel, so inputs made of -FLT_MAX and -infinity are ranked correctly.
 * Value and index are selected with the same strict-greater mask, so a NaN
 * sample never enters a lane maximum.
 */
static inline void
volk_32f_index_max_32u_neonv8(uint32_t* target, const float* src0, uint32_t num_points)
{
    if (num_points == 0)
        return;

    const uint32_t quarter_points = num_points / 4;
    const float* inputPtr = src0;

    // Use integer indices directly (no float conversion overhead)
    uint32x4_t vec_indices = { 0, 1, 2, 3 };
    const uint32x4_t vec_incr = vdupq_n_u32(4);

    float32x4_t vec_max = vdupq_n_f32(src0[0]);
    uint32x4_t vec_max_idx = vdupq_n_u32(0);

    for (uint32_t i = 0; i < quarter_points; i++) {
        const float32x4_t vec_val = vld1q_f32(inputPtr);
        inputPtr += 4;

        // One mask drives both selects so value and index cannot disagree.
        const uint32x4_t gt_mask = vcgtq_f32(vec_val, vec_max);
        vec_max_idx = vbslq_u32(gt_mask, vec_indices, vec_max_idx);
        vec_max = vbslq_f32(gt_mask, vec_val, vec_max);

        vec_indices = vaddq_u32(vec_indices, vec_incr);
    }

    // Horizontal reduction: largest value across the four lanes.
    float max_val = vmaxvq_f32(vec_max);

    // Among the lanes holding that value, take the smallest recorded index.
    const uint32x4_t max_mask = vceqq_f32(vec_max, vdupq_n_f32(max_val));
    const uint32x4_t idx_masked =
        vbslq_u32(max_mask, vec_max_idx, vdupq_n_u32(UINT32_MAX));
    uint32_t result_idx = vminvq_u32(idx_masked);

    // No lane compared equal: the lanes still hold a NaN seed from src0[0].
    // Nothing compares greater than NaN, so index 0 is the answer. UINT32_MAX
    // is never a valid lane index because lane indices are below num_points.
    if (result_idx == UINT32_MAX) {
        result_idx = 0;
    }

    // Handle tail elements
    for (uint32_t i = quarter_points * 4; i < num_points; i++) {
        if (src0[i] > max_val) {
            max_val = src0[i];
            result_idx = i;
        }
    }

    *target = result_idx;
}
374
375#endif /*LV_HAVE_NEONV8*/
376
377
378#ifdef LV_HAVE_GENERIC
379
/*
 * Finds the index of the largest value in src0 and writes it to target[0]
 * (portable scalar implementation).
 *
 * target:     receives the index of the first occurrence of the maximum; it is
 *             left untouched when num_points is 0.
 * src0:       input samples.
 * num_points: number of floats in src0.
 *
 * The comparison is strict (>), so on ties the earliest index is kept and a
 * NaN sample never replaces the running maximum.
 */
static inline void
volk_32f_index_max_32u_generic(uint32_t* target, const float* src0, uint32_t num_points)
{
    if (num_points == 0) {
        return;
    }

    float max = src0[0];
    uint32_t index = 0;

    for (uint32_t i = 1; i < num_points; ++i) {
        if (src0[i] > max) {
            index = i;
            max = src0[i];
        }
    }
    target[0] = index;
}
398
399#endif /*LV_HAVE_GENERIC*/
400
401#ifdef LV_HAVE_AVX512F
402#include <immintrin.h>
403
404static inline void
405volk_32f_index_max_32u_a_avx512f(uint32_t* target, const float* src0, uint32_t num_points)
406{
407 if (num_points > 0) {
408 uint32_t number = 0;
409 const uint32_t sixteenthPoints = num_points / 16;
410
411 const float* inputPtr = src0;
412
413 __m512 indexIncrementValues = _mm512_set1_ps(16);
414 __m512 currentIndexes = _mm512_set_ps(
415 -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15, -16);
416
417 float max = src0[0];
418 float index = 0;
419 __m512 maxValues = _mm512_set1_ps(max);
420 __m512 maxValuesIndex = _mm512_setzero_ps();
421 __mmask16 compareResults;
422 __m512 currentValues;
423
424 __VOLK_ATTR_ALIGNED(64) float maxValuesBuffer[16];
425 __VOLK_ATTR_ALIGNED(64) float maxIndexesBuffer[16];
426
427 for (; number < sixteenthPoints; number++) {
428 currentValues = _mm512_load_ps(inputPtr);
429 inputPtr += 16;
430 currentIndexes = _mm512_add_ps(currentIndexes, indexIncrementValues);
431 compareResults = _mm512_cmp_ps_mask(currentValues, maxValues, _CMP_GT_OS);
432 maxValuesIndex =
433 _mm512_mask_blend_ps(compareResults, maxValuesIndex, currentIndexes);
434 maxValues = _mm512_mask_blend_ps(compareResults, maxValues, currentValues);
435 }
436
437 // Calculate the largest value from the remaining 16 points
438 _mm512_store_ps(maxValuesBuffer, maxValues);
439 _mm512_store_ps(maxIndexesBuffer, maxValuesIndex);
440
441 for (number = 0; number < 16; number++) {
442 if (maxValuesBuffer[number] > max) {
443 index = maxIndexesBuffer[number];
444 max = maxValuesBuffer[number];
445 } else if (maxValuesBuffer[number] == max) {
446 if (index > maxIndexesBuffer[number])
447 index = maxIndexesBuffer[number];
448 }
449 }
450
451 number = sixteenthPoints * 16;
452 for (; number < num_points; number++) {
453 if (src0[number] > max) {
454 index = number;
455 max = src0[number];
456 }
457 }
458 target[0] = (uint32_t)index;
459 }
460}
461
462#endif /*LV_HAVE_AVX512F*/
463
464#endif /*INCLUDED_volk_32f_index_max_32u_a_H*/
465
466
467#ifndef INCLUDED_volk_32f_index_max_32u_u_H
468#define INCLUDED_volk_32f_index_max_32u_u_H
469
470#include <inttypes.h>
471#include <stdio.h>
472#include <volk/volk_common.h>
473
474
475#ifdef LV_HAVE_AVX
476#include <immintrin.h>
477
478static inline void
479volk_32f_index_max_32u_u_avx(uint32_t* target, const float* src0, uint32_t num_points)
480{
481 if (num_points > 0) {
482 uint32_t number = 0;
483 const uint32_t quarterPoints = num_points / 8;
484
485 float* inputPtr = (float*)src0;
486
487 __m256 indexIncrementValues = _mm256_set1_ps(8);
488 __m256 currentIndexes = _mm256_set_ps(-1, -2, -3, -4, -5, -6, -7, -8);
489
490 float max = src0[0];
491 float index = 0;
492 __m256 maxValues = _mm256_set1_ps(max);
493 __m256 maxValuesIndex = _mm256_setzero_ps();
494 __m256 compareResults;
495 __m256 currentValues;
496
497 __VOLK_ATTR_ALIGNED(32) float maxValuesBuffer[8];
498 __VOLK_ATTR_ALIGNED(32) float maxIndexesBuffer[8];
499
500 for (; number < quarterPoints; number++) {
501 currentValues = _mm256_loadu_ps(inputPtr);
502 inputPtr += 8;
503 currentIndexes = _mm256_add_ps(currentIndexes, indexIncrementValues);
504 compareResults = _mm256_cmp_ps(currentValues, maxValues, _CMP_GT_OS);
505 maxValuesIndex =
506 _mm256_blendv_ps(maxValuesIndex, currentIndexes, compareResults);
507 maxValues = _mm256_blendv_ps(maxValues, currentValues, compareResults);
508 }
509
510 // Calculate the largest value from the remaining 8 points
511 _mm256_store_ps(maxValuesBuffer, maxValues);
512 _mm256_store_ps(maxIndexesBuffer, maxValuesIndex);
513
514 for (number = 0; number < 8; number++) {
515 if (maxValuesBuffer[number] > max) {
516 index = maxIndexesBuffer[number];
517 max = maxValuesBuffer[number];
518 } else if (maxValuesBuffer[number] == max) {
519 if (index > maxIndexesBuffer[number])
520 index = maxIndexesBuffer[number];
521 }
522 }
523
524 number = quarterPoints * 8;
525 for (; number < num_points; number++) {
526 if (src0[number] > max) {
527 index = number;
528 max = src0[number];
529 }
530 }
531 target[0] = (uint32_t)index;
532 }
533}
534
535#endif /*LV_HAVE_AVX*/
536
537
538#ifdef LV_HAVE_SSE4_1
539#include <smmintrin.h>
540
541static inline void
542volk_32f_index_max_32u_u_sse4_1(uint32_t* target, const float* src0, uint32_t num_points)
543{
544 if (num_points > 0) {
545 uint32_t number = 0;
546 const uint32_t quarterPoints = num_points / 4;
547
548 float* inputPtr = (float*)src0;
549
550 __m128 indexIncrementValues = _mm_set1_ps(4);
551 __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4);
552
553 float max = src0[0];
554 float index = 0;
555 __m128 maxValues = _mm_set1_ps(max);
556 __m128 maxValuesIndex = _mm_setzero_ps();
557 __m128 compareResults;
558 __m128 currentValues;
559
560 __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4];
561 __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4];
562
563 for (; number < quarterPoints; number++) {
564 currentValues = _mm_loadu_ps(inputPtr);
565 inputPtr += 4;
566 currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
567 compareResults = _mm_cmpgt_ps(currentValues, maxValues);
568 maxValuesIndex =
569 _mm_blendv_ps(maxValuesIndex, currentIndexes, compareResults);
570 maxValues = _mm_blendv_ps(maxValues, currentValues, compareResults);
571 }
572
573 // Calculate the largest value from the remaining 4 points
574 _mm_store_ps(maxValuesBuffer, maxValues);
575 _mm_store_ps(maxIndexesBuffer, maxValuesIndex);
576
577 for (number = 0; number < 4; number++) {
578 if (maxValuesBuffer[number] > max) {
579 index = maxIndexesBuffer[number];
580 max = maxValuesBuffer[number];
581 } else if (maxValuesBuffer[number] == max) {
582 if (index > maxIndexesBuffer[number])
583 index = maxIndexesBuffer[number];
584 }
585 }
586
587 number = quarterPoints * 4;
588 for (; number < num_points; number++) {
589 if (src0[number] > max) {
590 index = number;
591 max = src0[number];
592 }
593 }
594 target[0] = (uint32_t)index;
595 }
596}
597
598#endif /*LV_HAVE_SSE4_1*/
599
600#ifdef LV_HAVE_SSE
601#include <xmmintrin.h>
602
603static inline void
604volk_32f_index_max_32u_u_sse(uint32_t* target, const float* src0, uint32_t num_points)
605{
606 if (num_points > 0) {
607 uint32_t number = 0;
608 const uint32_t quarterPoints = num_points / 4;
609
610 float* inputPtr = (float*)src0;
611
612 __m128 indexIncrementValues = _mm_set1_ps(4);
613 __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4);
614
615 float max = src0[0];
616 float index = 0;
617 __m128 maxValues = _mm_set1_ps(max);
618 __m128 maxValuesIndex = _mm_setzero_ps();
619 __m128 compareResults;
620 __m128 currentValues;
621
622 __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4];
623 __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4];
624
625 for (; number < quarterPoints; number++) {
626 currentValues = _mm_loadu_ps(inputPtr);
627 inputPtr += 4;
628 currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
629 compareResults = _mm_cmpgt_ps(currentValues, maxValues);
630 maxValuesIndex = _mm_or_ps(_mm_and_ps(compareResults, currentIndexes),
631 _mm_andnot_ps(compareResults, maxValuesIndex));
632 maxValues = _mm_or_ps(_mm_and_ps(compareResults, currentValues),
633 _mm_andnot_ps(compareResults, maxValues));
634 }
635
636 // Calculate the largest value from the remaining 4 points
637 _mm_store_ps(maxValuesBuffer, maxValues);
638 _mm_store_ps(maxIndexesBuffer, maxValuesIndex);
639
640 for (number = 0; number < 4; number++) {
641 if (maxValuesBuffer[number] > max) {
642 index = maxIndexesBuffer[number];
643 max = maxValuesBuffer[number];
644 } else if (maxValuesBuffer[number] == max) {
645 if (index > maxIndexesBuffer[number])
646 index = maxIndexesBuffer[number];
647 }
648 }
649
650 number = quarterPoints * 4;
651 for (; number < num_points; number++) {
652 if (src0[number] > max) {
653 index = number;
654 max = src0[number];
655 }
656 }
657 target[0] = (uint32_t)index;
658 }
659}
660
661#endif /*LV_HAVE_SSE*/
662
663#ifdef LV_HAVE_AVX512F
664#include <immintrin.h>
665
666static inline void
667volk_32f_index_max_32u_u_avx512f(uint32_t* target, const float* src0, uint32_t num_points)
668{
669 if (num_points > 0) {
670 uint32_t number = 0;
671 const uint32_t sixteenthPoints = num_points / 16;
672
673 const float* inputPtr = src0;
674
675 __m512 indexIncrementValues = _mm512_set1_ps(16);
676 __m512 currentIndexes = _mm512_set_ps(
677 -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15, -16);
678
679 float max = src0[0];
680 float index = 0;
681 __m512 maxValues = _mm512_set1_ps(max);
682 __m512 maxValuesIndex = _mm512_setzero_ps();
683 __mmask16 compareResults;
684 __m512 currentValues;
685
686 __VOLK_ATTR_ALIGNED(64) float maxValuesBuffer[16];
687 __VOLK_ATTR_ALIGNED(64) float maxIndexesBuffer[16];
688
689 for (; number < sixteenthPoints; number++) {
690 currentValues = _mm512_loadu_ps(inputPtr);
691 inputPtr += 16;
692 currentIndexes = _mm512_add_ps(currentIndexes, indexIncrementValues);
693 compareResults = _mm512_cmp_ps_mask(currentValues, maxValues, _CMP_GT_OS);
694 maxValuesIndex =
695 _mm512_mask_blend_ps(compareResults, maxValuesIndex, currentIndexes);
696 maxValues = _mm512_mask_blend_ps(compareResults, maxValues, currentValues);
697 }
698
699 // Calculate the largest value from the remaining 16 points
700 _mm512_store_ps(maxValuesBuffer, maxValues);
701 _mm512_store_ps(maxIndexesBuffer, maxValuesIndex);
702
703 for (number = 0; number < 16; number++) {
704 if (maxValuesBuffer[number] > max) {
705 index = maxIndexesBuffer[number];
706 max = maxValuesBuffer[number];
707 } else if (maxValuesBuffer[number] == max) {
708 if (index > maxIndexesBuffer[number])
709 index = maxIndexesBuffer[number];
710 }
711 }
712
713 number = sixteenthPoints * 16;
714 for (; number < num_points; number++) {
715 if (src0[number] > max) {
716 index = number;
717 max = src0[number];
718 }
719 }
720 target[0] = (uint32_t)index;
721 }
722}
723
724#endif /*LV_HAVE_AVX512F*/
725
726#ifdef LV_HAVE_RVV
727#include <float.h>
728#include <riscv_vector.h>
729
730static inline void
731volk_32f_index_max_32u_rvv(uint32_t* target, const float* src0, uint32_t num_points)
732{
733 vfloat32m4_t vmax = __riscv_vfmv_v_f_f32m4(-FLT_MAX, __riscv_vsetvlmax_e32m4());
734 vuint32m4_t vmaxi = __riscv_vmv_v_x_u32m4(0, __riscv_vsetvlmax_e32m4());
735 vuint32m4_t vidx = __riscv_vid_v_u32m4(__riscv_vsetvlmax_e32m4());
736 size_t n = num_points;
737 for (size_t vl; n > 0; n -= vl, src0 += vl) {
738 vl = __riscv_vsetvl_e32m4(n);
739 vfloat32m4_t v = __riscv_vle32_v_f32m4(src0, vl);
740 vbool8_t m = __riscv_vmfgt(v, vmax, vl);
741 vmax = __riscv_vfmax_tu(vmax, vmax, v, vl);
742 vmaxi = __riscv_vmerge_tu(vmaxi, vmaxi, vidx, m, vl);
743 vidx = __riscv_vadd(vidx, vl, __riscv_vsetvlmax_e32m4());
744 }
745 size_t vl = __riscv_vsetvlmax_e32m4();
746 float max = __riscv_vfmv_f(__riscv_vfredmax(RISCV_SHRINK4(vfmax, f, 32, vmax),
747 __riscv_vfmv_v_f_f32m1(-FLT_MAX, 1),
748 __riscv_vsetvlmax_e32m1()));
749 // Find lanes with max value, set others to UINT32_MAX
750 vbool8_t m = __riscv_vmfeq(vmax, max, vl);
751 vuint32m4_t idx_masked =
752 __riscv_vmerge(__riscv_vmv_v_x_u32m4(UINT32_MAX, vl), vmaxi, m, vl);
753 // Find minimum index among lanes with max value
754 *target = __riscv_vmv_x(__riscv_vredminu(RISCV_SHRINK4(vminu, u, 32, idx_masked),
755 __riscv_vmv_v_x_u32m1(UINT32_MAX, 1),
756 __riscv_vsetvlmax_e32m1()));
757}
758#endif /*LV_HAVE_RVV*/
759
760#endif /*INCLUDED_volk_32f_index_max_32u_u_H*/