Vector Optimized Library of Kernels 3.3.0
Architecture-tuned implementations of math kernels
Loading...
Searching...
No Matches
volk_32f_index_max_16u.h
Go to the documentation of this file.
1/* -*- c++ -*- */
2/*
3 * Copyright 2012, 2014 Free Software Foundation, Inc.
4 *
5 * This file is part of VOLK
6 *
7 * SPDX-License-Identifier: LGPL-3.0-or-later
8 */
9
57
58#ifndef INCLUDED_volk_32f_index_max_16u_a_H
59#define INCLUDED_volk_32f_index_max_16u_a_H
60
61#include <inttypes.h>
62#include <limits.h>
63#include <stdio.h>
64#include <volk/volk_common.h>
65
66#ifdef LV_HAVE_AVX
67#include <immintrin.h>
68
69static inline void
70volk_32f_index_max_16u_a_avx(uint16_t* target, const float* src0, uint32_t num_points)
71{
72 num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
73
74 uint32_t number = 0;
75 const uint32_t eighthPoints = num_points / 8;
76
77 float* inputPtr = (float*)src0;
78
79 __m256 indexIncrementValues = _mm256_set1_ps(8);
80 __m256 currentIndexes = _mm256_set_ps(-1, -2, -3, -4, -5, -6, -7, -8);
81
82 float max = src0[0];
83 float index = 0;
84 __m256 maxValues = _mm256_set1_ps(max);
85 __m256 maxValuesIndex = _mm256_setzero_ps();
86 __m256 compareResults;
87 __m256 currentValues;
88
89 __VOLK_ATTR_ALIGNED(32) float maxValuesBuffer[8];
90 __VOLK_ATTR_ALIGNED(32) float maxIndexesBuffer[8];
91
92 for (; number < eighthPoints; number++) {
93
94 currentValues = _mm256_load_ps(inputPtr);
95 inputPtr += 8;
96 currentIndexes = _mm256_add_ps(currentIndexes, indexIncrementValues);
97
98 compareResults = _mm256_cmp_ps(currentValues, maxValues, _CMP_GT_OS);
99
100 maxValuesIndex = _mm256_blendv_ps(maxValuesIndex, currentIndexes, compareResults);
101 maxValues = _mm256_blendv_ps(maxValues, currentValues, compareResults);
102 }
103
104 // Calculate the largest value from the remaining 4 points
105 _mm256_store_ps(maxValuesBuffer, maxValues);
106 _mm256_store_ps(maxIndexesBuffer, maxValuesIndex);
107
108 for (number = 0; number < 8; number++) {
109 if (maxValuesBuffer[number] > max) {
110 index = maxIndexesBuffer[number];
111 max = maxValuesBuffer[number];
112 } else if (maxValuesBuffer[number] == max) {
113 if (index > maxIndexesBuffer[number])
114 index = maxIndexesBuffer[number];
115 }
116 }
117
118 number = eighthPoints * 8;
119 for (; number < num_points; number++) {
120 if (src0[number] > max) {
121 index = number;
122 max = src0[number];
123 }
124 }
125 target[0] = (uint16_t)index;
126}
127
128#endif /*LV_HAVE_AVX*/
129
130#ifdef LV_HAVE_SSE4_1
131#include <smmintrin.h>
132
133static inline void
134volk_32f_index_max_16u_a_sse4_1(uint16_t* target, const float* src0, uint32_t num_points)
135{
136 num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
137
138 uint32_t number = 0;
139 const uint32_t quarterPoints = num_points / 4;
140
141 float* inputPtr = (float*)src0;
142
143 __m128 indexIncrementValues = _mm_set1_ps(4);
144 __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4);
145
146 float max = src0[0];
147 float index = 0;
148 __m128 maxValues = _mm_set1_ps(max);
149 __m128 maxValuesIndex = _mm_setzero_ps();
150 __m128 compareResults;
151 __m128 currentValues;
152
153 __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4];
154 __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4];
155
156 for (; number < quarterPoints; number++) {
157
158 currentValues = _mm_load_ps(inputPtr);
159 inputPtr += 4;
160 currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
161
162 compareResults = _mm_cmpgt_ps(currentValues, maxValues);
163
164 maxValuesIndex = _mm_blendv_ps(maxValuesIndex, currentIndexes, compareResults);
165 maxValues = _mm_blendv_ps(maxValues, currentValues, compareResults);
166 }
167
168 // Calculate the largest value from the remaining 4 points
169 _mm_store_ps(maxValuesBuffer, maxValues);
170 _mm_store_ps(maxIndexesBuffer, maxValuesIndex);
171
172 for (number = 0; number < 4; number++) {
173 if (maxValuesBuffer[number] > max) {
174 index = maxIndexesBuffer[number];
175 max = maxValuesBuffer[number];
176 } else if (maxValuesBuffer[number] == max) {
177 if (index > maxIndexesBuffer[number])
178 index = maxIndexesBuffer[number];
179 }
180 }
181
182 number = quarterPoints * 4;
183 for (; number < num_points; number++) {
184 if (src0[number] > max) {
185 index = number;
186 max = src0[number];
187 }
188 }
189 target[0] = (uint16_t)index;
190}
191
192#endif /*LV_HAVE_SSE4_1*/
193
194
195#ifdef LV_HAVE_SSE
196
197#include <xmmintrin.h>
198
199static inline void
200volk_32f_index_max_16u_a_sse(uint16_t* target, const float* src0, uint32_t num_points)
201{
202 num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
203
204 uint32_t number = 0;
205 const uint32_t quarterPoints = num_points / 4;
206
207 float* inputPtr = (float*)src0;
208
209 __m128 indexIncrementValues = _mm_set1_ps(4);
210 __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4);
211
212 float max = src0[0];
213 float index = 0;
214 __m128 maxValues = _mm_set1_ps(max);
215 __m128 maxValuesIndex = _mm_setzero_ps();
216 __m128 compareResults;
217 __m128 currentValues;
218
219 __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4];
220 __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4];
221
222 for (; number < quarterPoints; number++) {
223
224 currentValues = _mm_load_ps(inputPtr);
225 inputPtr += 4;
226 currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
227
228 compareResults = _mm_cmpgt_ps(currentValues, maxValues);
229
230 maxValuesIndex = _mm_or_ps(_mm_and_ps(compareResults, currentIndexes),
231 _mm_andnot_ps(compareResults, maxValuesIndex));
232 maxValues = _mm_or_ps(_mm_and_ps(compareResults, currentValues),
233 _mm_andnot_ps(compareResults, maxValues));
234 }
235
236 // Calculate the largest value from the remaining 4 points
237 _mm_store_ps(maxValuesBuffer, maxValues);
238 _mm_store_ps(maxIndexesBuffer, maxValuesIndex);
239
240 for (number = 0; number < 4; number++) {
241 if (maxValuesBuffer[number] > max) {
242 index = maxIndexesBuffer[number];
243 max = maxValuesBuffer[number];
244 } else if (maxValuesBuffer[number] == max) {
245 if (index > maxIndexesBuffer[number])
246 index = maxIndexesBuffer[number];
247 }
248 }
249
250 number = quarterPoints * 4;
251 for (; number < num_points; number++) {
252 if (src0[number] > max) {
253 index = number;
254 max = src0[number];
255 }
256 }
257 target[0] = (uint16_t)index;
258}
259
260#endif /*LV_HAVE_SSE*/
261
262
263#ifdef LV_HAVE_NEON
264#include <arm_neon.h>
265#include <float.h>
266#include <limits.h>
267
268static inline void
269volk_32f_index_max_16u_neon(uint16_t* target, const float* src0, uint32_t num_points)
270{
271 num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
272
273 if (num_points == 0)
274 return;
275
276 const uint32_t quarter_points = num_points / 4;
277 const float* inputPtr = src0;
278
279 // Use integer indices directly
280 uint32x4_t vec_indices = { 0, 1, 2, 3 };
281 const uint32x4_t vec_incr = vdupq_n_u32(4);
282
283 float32x4_t vec_max = vdupq_n_f32(-FLT_MAX);
284 uint32x4_t vec_max_idx = vdupq_n_u32(0);
285
286 for (uint32_t i = 0; i < quarter_points; i++) {
287 float32x4_t vec_val = vld1q_f32(inputPtr);
288 inputPtr += 4;
289
290 // Compare BEFORE max update to know which lanes change
291 uint32x4_t gt_mask = vcgtq_f32(vec_val, vec_max);
292 vec_max_idx = vbslq_u32(gt_mask, vec_indices, vec_max_idx);
293
294 // vmaxq_f32 is single-cycle, no dependency on comparison result
295 vec_max = vmaxq_f32(vec_val, vec_max);
296
297 vec_indices = vaddq_u32(vec_indices, vec_incr);
298 }
299
300 // Scalar reduction
301 __VOLK_ATTR_ALIGNED(16) float max_buf[4];
302 __VOLK_ATTR_ALIGNED(16) uint32_t idx_buf[4];
303 vst1q_f32(max_buf, vec_max);
304 vst1q_u32(idx_buf, vec_max_idx);
305
306 float max_val = max_buf[0];
307 uint32_t result_idx = idx_buf[0];
308 for (int i = 1; i < 4; i++) {
309 if (max_buf[i] > max_val) {
310 max_val = max_buf[i];
311 result_idx = idx_buf[i];
312 } else if (max_buf[i] == max_val && idx_buf[i] < result_idx) {
313 result_idx = idx_buf[i];
314 }
315 }
316
317 // Handle tail
318 for (uint32_t i = quarter_points * 4; i < num_points; i++) {
319 if (src0[i] > max_val) {
320 max_val = src0[i];
321 result_idx = i;
322 }
323 }
324
325 *target = (uint16_t)result_idx;
326}
327
328#endif /*LV_HAVE_NEON*/
329
330
331#ifdef LV_HAVE_NEONV8
332#include <arm_neon.h>
333#include <float.h>
334#include <limits.h>
335
336static inline void
337volk_32f_index_max_16u_neonv8(uint16_t* target, const float* src0, uint32_t num_points)
338{
339 num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
340
341 if (num_points == 0)
342 return;
343
344 const uint32_t quarter_points = num_points / 4;
345 const float* inputPtr = src0;
346
347 // Use integer indices directly (no float conversion overhead)
348 uint32x4_t vec_indices = { 0, 1, 2, 3 };
349 const uint32x4_t vec_incr = vdupq_n_u32(4);
350
351 float32x4_t vec_max = vdupq_n_f32(-FLT_MAX);
352 uint32x4_t vec_max_idx = vdupq_n_u32(0);
353
354 for (uint32_t i = 0; i < quarter_points; i++) {
355 float32x4_t vec_val = vld1q_f32(inputPtr);
356 inputPtr += 4;
357
358 // Compare BEFORE max update to know which lanes change
359 uint32x4_t gt_mask = vcgtq_f32(vec_val, vec_max);
360 vec_max_idx = vbslq_u32(gt_mask, vec_indices, vec_max_idx);
361
362 // vmaxq_f32 is single-cycle, no dependency on comparison result
363 vec_max = vmaxq_f32(vec_val, vec_max);
364
365 vec_indices = vaddq_u32(vec_indices, vec_incr);
366 }
367
368 // ARMv8 horizontal reduction
369 float max_val = vmaxvq_f32(vec_max);
370 uint32x4_t max_mask = vceqq_f32(vec_max, vdupq_n_f32(max_val));
371 uint32x4_t idx_masked = vbslq_u32(max_mask, vec_max_idx, vdupq_n_u32(UINT32_MAX));
372 uint32_t result_idx = vminvq_u32(idx_masked);
373
374 // Handle tail
375 for (uint32_t i = quarter_points * 4; i < num_points; i++) {
376 if (src0[i] > max_val) {
377 max_val = src0[i];
378 result_idx = i;
379 }
380 }
381
382 *target = (uint16_t)result_idx;
383}
384
385#endif /*LV_HAVE_NEONV8*/
386
387
388#ifdef LV_HAVE_GENERIC
389
/*!
 * \brief Finds the index of the largest value in a float buffer (portable C).
 *
 * At most USHRT_MAX leading points are examined so the result fits in the
 * 16-bit output. When several points compare equal to the maximum, the lowest
 * index is reported.
 *
 * \param target     Receives the index of the maximum. Set to 0 when
 *                   \p num_points is 0.
 * \param src0       Input samples; not read when \p num_points is 0.
 * \param num_points Number of samples available in \p src0.
 */
static inline void
volk_32f_index_max_16u_generic(uint16_t* target, const float* src0, uint32_t num_points)
{
    num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;

    // With no samples there is no src0[0] to seed the search from.
    if (num_points == 0) {
        target[0] = 0;
        return;
    }

    float max = src0[0];
    uint16_t index = 0;

    for (uint32_t i = 1; i < num_points; ++i) {
        // Strict greater-than keeps the first occurrence of the maximum.
        if (src0[i] > max) {
            index = (uint16_t)i; // i < USHRT_MAX after the clamp above
            max = src0[i];
        }
    }
    target[0] = index;
}
408
409#endif /*LV_HAVE_GENERIC*/
410
411#ifdef LV_HAVE_AVX512F
412#include <immintrin.h>
413#include <limits.h>
414
415static inline void
416volk_32f_index_max_16u_a_avx512f(uint16_t* target, const float* src0, uint32_t num_points)
417{
418 num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
419
420 uint32_t number = 0;
421 const uint32_t sixteenthPoints = num_points / 16;
422
423 const float* inputPtr = src0;
424
425 __m512 indexIncrementValues = _mm512_set1_ps(16);
426 __m512 currentIndexes = _mm512_set_ps(
427 -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15, -16);
428
429 float max = src0[0];
430 float index = 0;
431 __m512 maxValues = _mm512_set1_ps(max);
432 __m512 maxValuesIndex = _mm512_setzero_ps();
433 __mmask16 compareResults;
434 __m512 currentValues;
435
436 __VOLK_ATTR_ALIGNED(64) float maxValuesBuffer[16];
437 __VOLK_ATTR_ALIGNED(64) float maxIndexesBuffer[16];
438
439 for (; number < sixteenthPoints; number++) {
440 currentValues = _mm512_load_ps(inputPtr);
441 inputPtr += 16;
442 currentIndexes = _mm512_add_ps(currentIndexes, indexIncrementValues);
443 compareResults = _mm512_cmp_ps_mask(currentValues, maxValues, _CMP_GT_OS);
444 maxValuesIndex =
445 _mm512_mask_blend_ps(compareResults, maxValuesIndex, currentIndexes);
446 maxValues = _mm512_mask_blend_ps(compareResults, maxValues, currentValues);
447 }
448
449 // Calculate the largest value from the remaining 16 points
450 _mm512_store_ps(maxValuesBuffer, maxValues);
451 _mm512_store_ps(maxIndexesBuffer, maxValuesIndex);
452
453 for (number = 0; number < 16; number++) {
454 if (maxValuesBuffer[number] > max) {
455 index = maxIndexesBuffer[number];
456 max = maxValuesBuffer[number];
457 } else if (maxValuesBuffer[number] == max) {
458 if (index > maxIndexesBuffer[number])
459 index = maxIndexesBuffer[number];
460 }
461 }
462
463 number = sixteenthPoints * 16;
464 for (; number < num_points; number++) {
465 if (src0[number] > max) {
466 index = number;
467 max = src0[number];
468 }
469 }
470 target[0] = (uint16_t)index;
471}
472
473#endif /*LV_HAVE_AVX512F*/
474
475#endif /*INCLUDED_volk_32f_index_max_16u_a_H*/
476
477
478#ifndef INCLUDED_volk_32f_index_max_16u_u_H
479#define INCLUDED_volk_32f_index_max_16u_u_H
480
481#include <inttypes.h>
482#include <limits.h>
483#include <stdio.h>
484#include <volk/volk_common.h>
485
486#ifdef LV_HAVE_AVX
487#include <immintrin.h>
488
489static inline void
490volk_32f_index_max_16u_u_avx(uint16_t* target, const float* src0, uint32_t num_points)
491{
492 num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
493
494 uint32_t number = 0;
495 const uint32_t eighthPoints = num_points / 8;
496
497 float* inputPtr = (float*)src0;
498
499 __m256 indexIncrementValues = _mm256_set1_ps(8);
500 __m256 currentIndexes = _mm256_set_ps(-1, -2, -3, -4, -5, -6, -7, -8);
501
502 float max = src0[0];
503 float index = 0;
504 __m256 maxValues = _mm256_set1_ps(max);
505 __m256 maxValuesIndex = _mm256_setzero_ps();
506 __m256 compareResults;
507 __m256 currentValues;
508
509 __VOLK_ATTR_ALIGNED(32) float maxValuesBuffer[8];
510 __VOLK_ATTR_ALIGNED(32) float maxIndexesBuffer[8];
511
512 for (; number < eighthPoints; number++) {
513
514 currentValues = _mm256_loadu_ps(inputPtr);
515 inputPtr += 8;
516 currentIndexes = _mm256_add_ps(currentIndexes, indexIncrementValues);
517
518 compareResults = _mm256_cmp_ps(currentValues, maxValues, _CMP_GT_OS);
519
520 maxValuesIndex = _mm256_blendv_ps(maxValuesIndex, currentIndexes, compareResults);
521 maxValues = _mm256_blendv_ps(maxValues, currentValues, compareResults);
522 }
523
524 // Calculate the largest value from the remaining 4 points
525 _mm256_storeu_ps(maxValuesBuffer, maxValues);
526 _mm256_storeu_ps(maxIndexesBuffer, maxValuesIndex);
527
528 for (number = 0; number < 8; number++) {
529 if (maxValuesBuffer[number] > max) {
530 index = maxIndexesBuffer[number];
531 max = maxValuesBuffer[number];
532 } else if (maxValuesBuffer[number] == max) {
533 if (index > maxIndexesBuffer[number])
534 index = maxIndexesBuffer[number];
535 }
536 }
537
538 number = eighthPoints * 8;
539 for (; number < num_points; number++) {
540 if (src0[number] > max) {
541 index = number;
542 max = src0[number];
543 }
544 }
545 target[0] = (uint16_t)index;
546}
547
548#endif /*LV_HAVE_AVX*/
549
550#ifdef LV_HAVE_AVX512F
551#include <immintrin.h>
552#include <limits.h>
553
554static inline void
555volk_32f_index_max_16u_u_avx512f(uint16_t* target, const float* src0, uint32_t num_points)
556{
557 num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
558
559 uint32_t number = 0;
560 const uint32_t sixteenthPoints = num_points / 16;
561
562 const float* inputPtr = src0;
563
564 __m512 indexIncrementValues = _mm512_set1_ps(16);
565 __m512 currentIndexes = _mm512_set_ps(
566 -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15, -16);
567
568 float max = src0[0];
569 float index = 0;
570 __m512 maxValues = _mm512_set1_ps(max);
571 __m512 maxValuesIndex = _mm512_setzero_ps();
572 __mmask16 compareResults;
573 __m512 currentValues;
574
575 __VOLK_ATTR_ALIGNED(64) float maxValuesBuffer[16];
576 __VOLK_ATTR_ALIGNED(64) float maxIndexesBuffer[16];
577
578 for (; number < sixteenthPoints; number++) {
579 currentValues = _mm512_loadu_ps(inputPtr);
580 inputPtr += 16;
581 currentIndexes = _mm512_add_ps(currentIndexes, indexIncrementValues);
582 compareResults = _mm512_cmp_ps_mask(currentValues, maxValues, _CMP_GT_OS);
583 maxValuesIndex =
584 _mm512_mask_blend_ps(compareResults, maxValuesIndex, currentIndexes);
585 maxValues = _mm512_mask_blend_ps(compareResults, maxValues, currentValues);
586 }
587
588 // Calculate the largest value from the remaining 16 points
589 _mm512_store_ps(maxValuesBuffer, maxValues);
590 _mm512_store_ps(maxIndexesBuffer, maxValuesIndex);
591
592 for (number = 0; number < 16; number++) {
593 if (maxValuesBuffer[number] > max) {
594 index = maxIndexesBuffer[number];
595 max = maxValuesBuffer[number];
596 } else if (maxValuesBuffer[number] == max) {
597 if (index > maxIndexesBuffer[number])
598 index = maxIndexesBuffer[number];
599 }
600 }
601
602 number = sixteenthPoints * 16;
603 for (; number < num_points; number++) {
604 if (src0[number] > max) {
605 index = number;
606 max = src0[number];
607 }
608 }
609 target[0] = (uint16_t)index;
610}
611
612#endif /*LV_HAVE_AVX512F*/
613
614#ifdef LV_HAVE_RVV
615#include <float.h>
616#include <riscv_vector.h>
617
618static inline void
619volk_32f_index_max_16u_rvv(uint16_t* target, const float* src0, uint32_t num_points)
620{
621 vfloat32m8_t vmax = __riscv_vfmv_v_f_f32m8(-FLT_MAX, __riscv_vsetvlmax_e32m8());
622 vuint16m4_t vmaxi = __riscv_vmv_v_x_u16m4(0, __riscv_vsetvlmax_e16m4());
623 vuint16m4_t vidx = __riscv_vid_v_u16m4(__riscv_vsetvlmax_e16m4());
624 size_t n = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
625 for (size_t vl; n > 0; n -= vl, src0 += vl) {
626 vl = __riscv_vsetvl_e32m8(n);
627 vfloat32m8_t v = __riscv_vle32_v_f32m8(src0, vl);
628 vbool4_t m = __riscv_vmfgt(v, vmax, vl);
629 vmax = __riscv_vfmax_tu(vmax, vmax, v, vl);
630 vmaxi = __riscv_vmerge_tu(vmaxi, vmaxi, vidx, m, vl);
631 vidx = __riscv_vadd(vidx, vl, __riscv_vsetvlmax_e16m4());
632 }
633 size_t vl = __riscv_vsetvlmax_e32m8();
634 float max = __riscv_vfmv_f(__riscv_vfredmax(RISCV_SHRINK8(vfmax, f, 32, vmax),
635 __riscv_vfmv_v_f_f32m1(-FLT_MAX, 1),
636 __riscv_vsetvlmax_e32m1()));
637 // Find lanes with max value, set others to UINT16_MAX
638 vbool4_t m = __riscv_vmfeq(vmax, max, vl);
639 vuint16m4_t idx_masked = __riscv_vmerge(
640 __riscv_vmv_v_x_u16m4(UINT16_MAX, __riscv_vsetvlmax_e16m4()), vmaxi, m, vl);
641 // Find minimum index among lanes with max value
642 *target = __riscv_vmv_x(__riscv_vredminu(RISCV_SHRINK4(vminu, u, 16, idx_masked),
643 __riscv_vmv_v_x_u16m1(UINT16_MAX, 1),
644 __riscv_vsetvlmax_e16m1()));
645}
646#endif /*LV_HAVE_RVV*/
647
648#endif /*INCLUDED_volk_32f_index_max_16u_u_H*/