#ifndef INCLUDED_volk_32f_s32f_convert_16i_u_H
#define INCLUDED_volk_32f_s32f_convert_16i_u_H

#include <inttypes.h>
#include <limits.h>
#include <math.h>

#ifdef LV_HAVE_AVX2
#include <immintrin.h>

/*
 * Multiply every input float by `scalar`, clamp the product to
 * [SHRT_MIN, SHRT_MAX], round it and store it as int16_t.
 *
 * AVX2 kernel with unaligned loads and stores. Sixteen points are handled per
 * vector iteration; the remaining (num_points % 16) points use scalar code.
 *
 * outputVector: destination buffer, num_points int16_t values are written
 * inputVector:  source buffer, num_points floats are read
 * scalar:       factor applied to each input before clamping
 * num_points:   number of points to convert
 */
static inline void volk_32f_s32f_convert_16i_u_avx2(int16_t* outputVector,
                                                    const float* inputVector,
                                                    const float scalar,
                                                    unsigned int num_points)
{
    const unsigned int sixteenthPoints = num_points / 16;

    const float* inputVectorPtr = inputVector;
    int16_t* outputVectorPtr = outputVector;

    const float min_val = SHRT_MIN;
    const float max_val = SHRT_MAX;

    const __m256 vScalar = _mm256_set1_ps(scalar);
    const __m256 vmin_val = _mm256_set1_ps(min_val);
    const __m256 vmax_val = _mm256_set1_ps(max_val);

    for (unsigned int number = 0; number < sixteenthPoints; number++) {
        const __m256 inputVal1 = _mm256_loadu_ps(inputVectorPtr);
        inputVectorPtr += 8;
        const __m256 inputVal2 = _mm256_loadu_ps(inputVectorPtr);
        inputVectorPtr += 8;

        /* Scale and clip. */
        const __m256 ret1 = _mm256_max_ps(
            _mm256_min_ps(_mm256_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
        const __m256 ret2 = _mm256_max_ps(
            _mm256_min_ps(_mm256_mul_ps(inputVal2, vScalar), vmax_val), vmin_val);

        __m256i intInputVal1 = _mm256_cvtps_epi32(ret1);
        const __m256i intInputVal2 = _mm256_cvtps_epi32(ret2);

        /*
         * The 256-bit pack works per 128-bit lane, which interleaves the two
         * sources as [a0..3 b0..3 | a4..7 b4..7]. Swapping the two middle
         * 64-bit quarters (selector 0xD8 == 0b11011000) restores input order.
         */
        intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2);
        intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0xD8);

        _mm256_storeu_si256((__m256i*)outputVectorPtr, intInputVal1);
        outputVectorPtr += 16;
    }

    /* Scalar tail for the points that do not fill a whole vector iteration. */
    for (unsigned int number = sixteenthPoints * 16; number < num_points; number++) {
        float r = inputVector[number] * scalar;
        if (r > max_val) {
            r = max_val;
        } else if (r < min_val) {
            r = min_val;
        }
        outputVector[number] = (int16_t)rintf(r);
    }
}
#endif /* LV_HAVE_AVX2 */
#ifdef LV_HAVE_AVX512F
#include <immintrin.h>

/*
 * Multiply every input float by `scalar`, clamp the product to
 * [SHRT_MIN, SHRT_MAX], round it and store it as int16_t.
 *
 * AVX-512F kernel with unaligned loads and stores. Sixteen points are handled
 * per vector iteration; the remaining (num_points % 16) points use scalar code.
 *
 * outputVector: destination buffer, num_points int16_t values are written
 * inputVector:  source buffer, num_points floats are read
 * scalar:       factor applied to each input before clamping
 * num_points:   number of points to convert
 */
static inline void volk_32f_s32f_convert_16i_u_avx512(int16_t* outputVector,
                                                      const float* inputVector,
                                                      const float scalar,
                                                      unsigned int num_points)
{
    const unsigned int sixteenthPoints = num_points / 16;

    const float* inputVectorPtr = inputVector;
    int16_t* outputVectorPtr = outputVector;

    const float min_val = SHRT_MIN;
    const float max_val = SHRT_MAX;

    const __m512 vScalar = _mm512_set1_ps(scalar);
    const __m512 vmin_val = _mm512_set1_ps(min_val);
    const __m512 vmax_val = _mm512_set1_ps(max_val);

    for (unsigned int number = 0; number < sixteenthPoints; number++) {
        const __m512 inputVal = _mm512_loadu_ps(inputVectorPtr);
        inputVectorPtr += 16;

        /* Scale and clip. */
        const __m512 ret = _mm512_max_ps(
            _mm512_min_ps(_mm512_mul_ps(inputVal, vScalar), vmax_val), vmin_val);

        /* Convert to int32, then narrow the sixteen lanes to int16 with
         * signed saturation; the result fills one 256-bit register. */
        const __m256i intInputVal = _mm512_cvtsepi32_epi16(_mm512_cvtps_epi32(ret));

        _mm256_storeu_si256((__m256i*)outputVectorPtr, intInputVal);
        outputVectorPtr += 16;
    }

    /* Scalar tail for the points that do not fill a whole vector iteration. */
    for (unsigned int number = sixteenthPoints * 16; number < num_points; number++) {
        float r = inputVector[number] * scalar;
        if (r > max_val) {
            r = max_val;
        } else if (r < min_val) {
            r = min_val;
        }
        outputVector[number] = (int16_t)rintf(r);
    }
}
#endif /* LV_HAVE_AVX512F */
#ifdef LV_HAVE_AVX
#include <immintrin.h>

/*
 * Multiply every input float by `scalar`, clamp the product to
 * [SHRT_MIN, SHRT_MAX], round it and store it as int16_t.
 *
 * AVX kernel with unaligned loads and stores. Eight points are handled per
 * vector iteration; the remaining (num_points % 8) points use scalar code.
 *
 * outputVector: destination buffer, num_points int16_t values are written
 * inputVector:  source buffer, num_points floats are read
 * scalar:       factor applied to each input before clamping
 * num_points:   number of points to convert
 */
static inline void volk_32f_s32f_convert_16i_u_avx(int16_t* outputVector,
                                                   const float* inputVector,
                                                   const float scalar,
                                                   unsigned int num_points)
{
    const unsigned int eighthPoints = num_points / 8;

    const float* inputVectorPtr = inputVector;
    int16_t* outputVectorPtr = outputVector;

    const float min_val = SHRT_MIN;
    const float max_val = SHRT_MAX;

    const __m256 vScalar = _mm256_set1_ps(scalar);
    const __m256 vmin_val = _mm256_set1_ps(min_val);
    const __m256 vmax_val = _mm256_set1_ps(max_val);

    for (unsigned int number = 0; number < eighthPoints; number++) {
        const __m256 inputVal = _mm256_loadu_ps(inputVectorPtr);
        inputVectorPtr += 8;

        /* Scale and clip. */
        const __m256 ret = _mm256_max_ps(
            _mm256_min_ps(_mm256_mul_ps(inputVal, vScalar), vmax_val), vmin_val);

        const __m256i intInputVal = _mm256_cvtps_epi32(ret);

        /* Split into 128-bit halves so the 128-bit pack can narrow them to
         * eight int16 values in input order. */
        __m128i intInputVal1 = _mm256_extractf128_si256(intInputVal, 0);
        const __m128i intInputVal2 = _mm256_extractf128_si256(intInputVal, 1);

        intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);

        _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1);
        outputVectorPtr += 8;
    }

    /* Scalar tail for the points that do not fill a whole vector iteration. */
    for (unsigned int number = eighthPoints * 8; number < num_points; number++) {
        float r = inputVector[number] * scalar;
        if (r > max_val) {
            r = max_val;
        } else if (r < min_val) {
            r = min_val;
        }
        outputVector[number] = (int16_t)rintf(r);
    }
}
#endif /* LV_HAVE_AVX */
235#include <emmintrin.h>
238 const float* inputVector,
240 unsigned int num_points)
242 unsigned int number = 0;
244 const unsigned int eighthPoints = num_points / 8;
246 const float* inputVectorPtr = (
const float*)inputVector;
247 int16_t* outputVectorPtr = outputVector;
249 float min_val = SHRT_MIN;
250 float max_val = SHRT_MAX;
253 __m128 vScalar = _mm_set_ps1(scalar);
254 __m128 inputVal1, inputVal2;
255 __m128i intInputVal1, intInputVal2;
257 __m128 vmin_val = _mm_set_ps1(min_val);
258 __m128 vmax_val = _mm_set_ps1(max_val);
260 for (; number < eighthPoints; number++) {
261 inputVal1 = _mm_loadu_ps(inputVectorPtr);
263 inputVal2 = _mm_loadu_ps(inputVectorPtr);
267 ret1 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
268 ret2 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal2, vScalar), vmax_val), vmin_val);
270 intInputVal1 = _mm_cvtps_epi32(ret1);
271 intInputVal2 = _mm_cvtps_epi32(ret2);
273 intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
275 _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1);
276 outputVectorPtr += 8;
279 number = eighthPoints * 8;
280 for (; number < num_points; number++) {
281 r = inputVector[number] * scalar;
284 else if (r < min_val)
286 outputVector[number] = (int16_t)rintf(r);
293#include <xmmintrin.h>
296 const float* inputVector,
298 unsigned int num_points)
300 unsigned int number = 0;
302 const unsigned int quarterPoints = num_points / 4;
304 const float* inputVectorPtr = (
const float*)inputVector;
305 int16_t* outputVectorPtr = outputVector;
307 float min_val = SHRT_MIN;
308 float max_val = SHRT_MAX;
311 __m128 vScalar = _mm_set_ps1(scalar);
313 __m128 vmin_val = _mm_set_ps1(min_val);
314 __m128 vmax_val = _mm_set_ps1(max_val);
318 for (; number < quarterPoints; number++) {
319 ret = _mm_loadu_ps(inputVectorPtr);
323 ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val);
325 _mm_store_ps(outputFloatBuffer, ret);
326 *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[0]);
327 *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[1]);
328 *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[2]);
329 *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[3]);
332 number = quarterPoints * 4;
333 for (; number < num_points; number++) {
334 r = inputVector[number] * scalar;
337 else if (r < min_val)
339 outputVector[number] = (int16_t)rintf(r);
345#ifdef LV_HAVE_GENERIC
348 const float* inputVector,
350 unsigned int num_points)
352 int16_t* outputVectorPtr = outputVector;
353 const float* inputVectorPtr = inputVector;
354 unsigned int number = 0;
355 float min_val = SHRT_MIN;
356 float max_val = SHRT_MAX;
359 for (number = 0; number < num_points; number++) {
360 r = *inputVectorPtr++ * scalar;
363 else if (r < min_val)
365 *outputVectorPtr++ = (int16_t)rintf(r);
#endif /* INCLUDED_volk_32f_s32f_convert_16i_u_H */

#ifndef INCLUDED_volk_32f_s32f_convert_16i_a_H
#define INCLUDED_volk_32f_s32f_convert_16i_a_H

#ifdef LV_HAVE_AVX2
#include <immintrin.h>

/*
 * Multiply every input float by `scalar`, clamp the product to
 * [SHRT_MIN, SHRT_MAX], round it and store it as int16_t.
 *
 * AVX2 kernel using aligned 256-bit loads and stores, so both buffers must be
 * 32-byte aligned. Sixteen points are handled per vector iteration; the
 * remaining (num_points % 16) points use scalar code.
 *
 * outputVector: destination buffer, num_points int16_t values are written
 * inputVector:  source buffer, num_points floats are read
 * scalar:       factor applied to each input before clamping
 * num_points:   number of points to convert
 */
static inline void volk_32f_s32f_convert_16i_a_avx2(int16_t* outputVector,
                                                    const float* inputVector,
                                                    const float scalar,
                                                    unsigned int num_points)
{
    const unsigned int sixteenthPoints = num_points / 16;

    const float* inputVectorPtr = inputVector;
    int16_t* outputVectorPtr = outputVector;

    const float min_val = SHRT_MIN;
    const float max_val = SHRT_MAX;

    const __m256 vScalar = _mm256_set1_ps(scalar);
    const __m256 vmin_val = _mm256_set1_ps(min_val);
    const __m256 vmax_val = _mm256_set1_ps(max_val);

    for (unsigned int number = 0; number < sixteenthPoints; number++) {
        const __m256 inputVal1 = _mm256_load_ps(inputVectorPtr);
        inputVectorPtr += 8;
        const __m256 inputVal2 = _mm256_load_ps(inputVectorPtr);
        inputVectorPtr += 8;

        /* Scale and clip. */
        const __m256 ret1 = _mm256_max_ps(
            _mm256_min_ps(_mm256_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
        const __m256 ret2 = _mm256_max_ps(
            _mm256_min_ps(_mm256_mul_ps(inputVal2, vScalar), vmax_val), vmin_val);

        __m256i intInputVal1 = _mm256_cvtps_epi32(ret1);
        const __m256i intInputVal2 = _mm256_cvtps_epi32(ret2);

        /*
         * The 256-bit pack works per 128-bit lane, which interleaves the two
         * sources as [a0..3 b0..3 | a4..7 b4..7]. Swapping the two middle
         * 64-bit quarters (selector 0xD8 == 0b11011000) restores input order.
         */
        intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2);
        intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0xD8);

        _mm256_store_si256((__m256i*)outputVectorPtr, intInputVal1);
        outputVectorPtr += 16;
    }

    /* Scalar tail for the points that do not fill a whole vector iteration. */
    for (unsigned int number = sixteenthPoints * 16; number < num_points; number++) {
        float r = inputVector[number] * scalar;
        if (r > max_val) {
            r = max_val;
        } else if (r < min_val) {
            r = min_val;
        }
        outputVector[number] = (int16_t)rintf(r);
    }
}
#endif /* LV_HAVE_AVX2 */
#ifdef LV_HAVE_AVX512F
#include <immintrin.h>

/*
 * Multiply every input float by `scalar`, clamp the product to
 * [SHRT_MIN, SHRT_MAX], round it and store it as int16_t.
 *
 * AVX-512F kernel using an aligned 512-bit load and an aligned 256-bit store,
 * so the input must be 64-byte aligned and the output 32-byte aligned.
 * Sixteen points are handled per vector iteration; the remaining
 * (num_points % 16) points use scalar code.
 *
 * outputVector: destination buffer, num_points int16_t values are written
 * inputVector:  source buffer, num_points floats are read
 * scalar:       factor applied to each input before clamping
 * num_points:   number of points to convert
 */
static inline void volk_32f_s32f_convert_16i_a_avx512(int16_t* outputVector,
                                                      const float* inputVector,
                                                      const float scalar,
                                                      unsigned int num_points)
{
    const unsigned int sixteenthPoints = num_points / 16;

    const float* inputVectorPtr = inputVector;
    int16_t* outputVectorPtr = outputVector;

    const float min_val = SHRT_MIN;
    const float max_val = SHRT_MAX;

    const __m512 vScalar = _mm512_set1_ps(scalar);
    const __m512 vmin_val = _mm512_set1_ps(min_val);
    const __m512 vmax_val = _mm512_set1_ps(max_val);

    for (unsigned int number = 0; number < sixteenthPoints; number++) {
        const __m512 inputVal = _mm512_load_ps(inputVectorPtr);
        inputVectorPtr += 16;

        /* Scale and clip. */
        const __m512 ret = _mm512_max_ps(
            _mm512_min_ps(_mm512_mul_ps(inputVal, vScalar), vmax_val), vmin_val);

        /* Convert to int32, then narrow the sixteen lanes to int16 with
         * signed saturation; the result fills one 256-bit register. */
        const __m256i intInputVal = _mm512_cvtsepi32_epi16(_mm512_cvtps_epi32(ret));

        _mm256_store_si256((__m256i*)outputVectorPtr, intInputVal);
        outputVectorPtr += 16;
    }

    /* Scalar tail for the points that do not fill a whole vector iteration. */
    for (unsigned int number = sixteenthPoints * 16; number < num_points; number++) {
        float r = inputVector[number] * scalar;
        if (r > max_val) {
            r = max_val;
        } else if (r < min_val) {
            r = min_val;
        }
        outputVector[number] = (int16_t)rintf(r);
    }
}
#endif /* LV_HAVE_AVX512F */
#ifdef LV_HAVE_AVX
#include <immintrin.h>

/*
 * Multiply every input float by `scalar`, clamp the product to
 * [SHRT_MIN, SHRT_MAX], round it and store it as int16_t.
 *
 * AVX kernel using an aligned 256-bit load and an aligned 128-bit store, so
 * the input must be 32-byte aligned and the output 16-byte aligned. Eight
 * points are handled per vector iteration; the remaining (num_points % 8)
 * points use scalar code.
 *
 * outputVector: destination buffer, num_points int16_t values are written
 * inputVector:  source buffer, num_points floats are read
 * scalar:       factor applied to each input before clamping
 * num_points:   number of points to convert
 */
static inline void volk_32f_s32f_convert_16i_a_avx(int16_t* outputVector,
                                                   const float* inputVector,
                                                   const float scalar,
                                                   unsigned int num_points)
{
    const unsigned int eighthPoints = num_points / 8;

    const float* inputVectorPtr = inputVector;
    int16_t* outputVectorPtr = outputVector;

    const float min_val = SHRT_MIN;
    const float max_val = SHRT_MAX;

    const __m256 vScalar = _mm256_set1_ps(scalar);
    const __m256 vmin_val = _mm256_set1_ps(min_val);
    const __m256 vmax_val = _mm256_set1_ps(max_val);

    for (unsigned int number = 0; number < eighthPoints; number++) {
        const __m256 inputVal = _mm256_load_ps(inputVectorPtr);
        inputVectorPtr += 8;

        /* Scale and clip. */
        const __m256 ret = _mm256_max_ps(
            _mm256_min_ps(_mm256_mul_ps(inputVal, vScalar), vmax_val), vmin_val);

        const __m256i intInputVal = _mm256_cvtps_epi32(ret);

        /* Split into 128-bit halves so the 128-bit pack can narrow them to
         * eight int16 values in input order. */
        __m128i intInputVal1 = _mm256_extractf128_si256(intInputVal, 0);
        const __m128i intInputVal2 = _mm256_extractf128_si256(intInputVal, 1);

        intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);

        _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1);
        outputVectorPtr += 8;
    }

    /* Scalar tail for the points that do not fill a whole vector iteration. */
    for (unsigned int number = eighthPoints * 8; number < num_points; number++) {
        float r = inputVector[number] * scalar;
        if (r > max_val) {
            r = max_val;
        } else if (r < min_val) {
            r = min_val;
        }
        outputVector[number] = (int16_t)rintf(r);
    }
}
#endif /* LV_HAVE_AVX */
552#include <emmintrin.h>
555 const float* inputVector,
557 unsigned int num_points)
559 unsigned int number = 0;
561 const unsigned int eighthPoints = num_points / 8;
563 const float* inputVectorPtr = (
const float*)inputVector;
564 int16_t* outputVectorPtr = outputVector;
566 float min_val = SHRT_MIN;
567 float max_val = SHRT_MAX;
570 __m128 vScalar = _mm_set_ps1(scalar);
571 __m128 inputVal1, inputVal2;
572 __m128i intInputVal1, intInputVal2;
574 __m128 vmin_val = _mm_set_ps1(min_val);
575 __m128 vmax_val = _mm_set_ps1(max_val);
577 for (; number < eighthPoints; number++) {
578 inputVal1 = _mm_load_ps(inputVectorPtr);
580 inputVal2 = _mm_load_ps(inputVectorPtr);
584 ret1 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
585 ret2 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal2, vScalar), vmax_val), vmin_val);
587 intInputVal1 = _mm_cvtps_epi32(ret1);
588 intInputVal2 = _mm_cvtps_epi32(ret2);
590 intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
592 _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1);
593 outputVectorPtr += 8;
596 number = eighthPoints * 8;
597 for (; number < num_points; number++) {
598 r = inputVector[number] * scalar;
601 else if (r < min_val)
603 outputVector[number] = (int16_t)rintf(r);
610#include <xmmintrin.h>
613 const float* inputVector,
615 unsigned int num_points)
617 unsigned int number = 0;
619 const unsigned int quarterPoints = num_points / 4;
621 const float* inputVectorPtr = (
const float*)inputVector;
622 int16_t* outputVectorPtr = outputVector;
624 float min_val = SHRT_MIN;
625 float max_val = SHRT_MAX;
628 __m128 vScalar = _mm_set_ps1(scalar);
630 __m128 vmin_val = _mm_set_ps1(min_val);
631 __m128 vmax_val = _mm_set_ps1(max_val);
635 for (; number < quarterPoints; number++) {
636 ret = _mm_load_ps(inputVectorPtr);
640 ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val);
642 _mm_store_ps(outputFloatBuffer, ret);
643 *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[0]);
644 *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[1]);
645 *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[2]);
646 *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[3]);
649 number = quarterPoints * 4;
650 for (; number < num_points; number++) {
651 r = inputVector[number] * scalar;
654 else if (r < min_val)
656 outputVector[number] = (int16_t)rintf(r);
#ifdef LV_HAVE_NEON
#include <arm_neon.h>

/*
 * Multiply every input float by `scalar`, clamp the product to
 * [SHRT_MIN, SHRT_MAX], round it and store it as int16_t.
 *
 * NEON kernel. Eight points are handled per vector iteration; the remaining
 * (num_points % 8) points use scalar code.
 *
 * NOTE(review): the vector path rounds by adding +/-0.5 and truncating, which
 * sends exact .5 ties away from zero, whereas the scalar tail uses rintf().
 * The two paths can therefore differ by one on tie values; confirm whether
 * that is acceptable to callers.
 *
 * outputVector: destination buffer, num_points int16_t values are written
 * inputVector:  source buffer, num_points floats are read
 * scalar:       factor applied to each input before clamping
 * num_points:   number of points to convert
 */
static inline void volk_32f_s32f_convert_16i_neon(int16_t* outputVector,
                                                  const float* inputVector,
                                                  const float scalar,
                                                  unsigned int num_points)
{
    const unsigned int eighthPoints = num_points / 8;

    const float* inputVectorPtr = inputVector;
    int16_t* outputVectorPtr = outputVector;

    const float min_val = SHRT_MIN;
    const float max_val = SHRT_MAX;

    const float32x4_t vScalar = vdupq_n_f32(scalar);
    const float32x4_t vmin_val = vdupq_n_f32(min_val);
    const float32x4_t vmax_val = vdupq_n_f32(max_val);

    /* Loop-invariant rounding constants. */
    const float32x4_t half = vdupq_n_f32(0.5f);
    const float32x4_t neg_half = vdupq_n_f32(-0.5f);
    const float32x4_t zero = vdupq_n_f32(0.0f);

    for (unsigned int number = 0; number < eighthPoints; number++) {
        const float32x4_t inputVal1 = vld1q_f32(inputVectorPtr);
        const float32x4_t inputVal2 = vld1q_f32(inputVectorPtr + 4);
        inputVectorPtr += 8;

        /* Scale and clip. */
        float32x4_t ret1 =
            vmaxq_f32(vminq_f32(vmulq_f32(inputVal1, vScalar), vmax_val), vmin_val);
        float32x4_t ret2 =
            vmaxq_f32(vminq_f32(vmulq_f32(inputVal2, vScalar), vmax_val), vmin_val);

        /* Bias by -0.5 for negative lanes and +0.5 otherwise, so that the
         * truncating conversion below rounds to the nearest integer. */
        const uint32x4_t neg1 = vcltq_f32(ret1, zero);
        const uint32x4_t neg2 = vcltq_f32(ret2, zero);
        ret1 = vaddq_f32(ret1, vbslq_f32(neg1, neg_half, half));
        ret2 = vaddq_f32(ret2, vbslq_f32(neg2, neg_half, half));

        /* Convert to int32 (truncating toward zero). */
        const int32x4_t intVal1 = vcvtq_s32_f32(ret1);
        const int32x4_t intVal2 = vcvtq_s32_f32(ret2);

        /* Narrow to int16 with saturation and join the two halves. */
        const int16x4_t narrow1 = vqmovn_s32(intVal1);
        const int16x4_t narrow2 = vqmovn_s32(intVal2);
        const int16x8_t result = vcombine_s16(narrow1, narrow2);

        vst1q_s16(outputVectorPtr, result);
        outputVectorPtr += 8;
    }

    /* Scalar tail for the points that do not fill a whole vector iteration. */
    for (unsigned int number = eighthPoints * 8; number < num_points; number++) {
        float r = inputVector[number] * scalar;
        if (r > max_val) {
            r = max_val;
        } else if (r < min_val) {
            r = min_val;
        }
        outputVector[number] = (int16_t)rintf(r);
    }
}
#endif /* LV_HAVE_NEON */
#ifdef LV_HAVE_NEONV8
#include <arm_neon.h>

/*
 * Multiply every input float by `scalar`, clamp the product to
 * [SHRT_MIN, SHRT_MAX], round it and store it as int16_t.
 *
 * ARMv8 NEON kernel. Sixteen points are handled per vector iteration using
 * the round-to-nearest conversion vcvtnq_s32_f32; the remaining
 * (num_points % 16) points use scalar code.
 *
 * outputVector: destination buffer, num_points int16_t values are written
 * inputVector:  source buffer, num_points floats are read
 * scalar:       factor applied to each input before clamping
 * num_points:   number of points to convert
 */
static inline void volk_32f_s32f_convert_16i_neonv8(int16_t* outputVector,
                                                    const float* inputVector,
                                                    const float scalar,
                                                    unsigned int num_points)
{
    const unsigned int sixteenthPoints = num_points / 16;

    const float* inputVectorPtr = inputVector;
    int16_t* outputVectorPtr = outputVector;

    const float min_val = SHRT_MIN;
    const float max_val = SHRT_MAX;

    const float32x4_t vScalar = vdupq_n_f32(scalar);
    const float32x4_t vmin_val = vdupq_n_f32(min_val);
    const float32x4_t vmax_val = vdupq_n_f32(max_val);

    for (unsigned int number = 0; number < sixteenthPoints; number++) {
        const float32x4_t inputVal0 = vld1q_f32(inputVectorPtr);
        const float32x4_t inputVal1 = vld1q_f32(inputVectorPtr + 4);
        const float32x4_t inputVal2 = vld1q_f32(inputVectorPtr + 8);
        const float32x4_t inputVal3 = vld1q_f32(inputVectorPtr + 12);
        inputVectorPtr += 16;

        /* Scale and clip. */
        const float32x4_t ret0 =
            vmaxq_f32(vminq_f32(vmulq_f32(inputVal0, vScalar), vmax_val), vmin_val);
        const float32x4_t ret1 =
            vmaxq_f32(vminq_f32(vmulq_f32(inputVal1, vScalar), vmax_val), vmin_val);
        const float32x4_t ret2 =
            vmaxq_f32(vminq_f32(vmulq_f32(inputVal2, vScalar), vmax_val), vmin_val);
        const float32x4_t ret3 =
            vmaxq_f32(vminq_f32(vmulq_f32(inputVal3, vScalar), vmax_val), vmin_val);

        /* Convert to int32 with round-to-nearest. */
        const int32x4_t intVal0 = vcvtnq_s32_f32(ret0);
        const int32x4_t intVal1 = vcvtnq_s32_f32(ret1);
        const int32x4_t intVal2 = vcvtnq_s32_f32(ret2);
        const int32x4_t intVal3 = vcvtnq_s32_f32(ret3);

        /* Narrow to int16 with saturation and join the halves pairwise. */
        const int16x4_t narrow0 = vqmovn_s32(intVal0);
        const int16x4_t narrow1 = vqmovn_s32(intVal1);
        const int16x4_t narrow2 = vqmovn_s32(intVal2);
        const int16x4_t narrow3 = vqmovn_s32(intVal3);
        const int16x8_t result0 = vcombine_s16(narrow0, narrow1);
        const int16x8_t result1 = vcombine_s16(narrow2, narrow3);

        vst1q_s16(outputVectorPtr, result0);
        vst1q_s16(outputVectorPtr + 8, result1);
        outputVectorPtr += 16;
    }

    /* Scalar tail for the points that do not fill a whole vector iteration. */
    for (unsigned int number = sixteenthPoints * 16; number < num_points; number++) {
        float r = inputVector[number] * scalar;
        if (r > max_val) {
            r = max_val;
        } else if (r < min_val) {
            r = min_val;
        }
        outputVector[number] = (int16_t)rintf(r);
    }
}
#endif /* LV_HAVE_NEONV8 */
803#include <riscv_vector.h>
805static inline void volk_32f_s32f_convert_16i_rvv(int16_t* outputVector,
806 const float* inputVector,
808 unsigned int num_points)
810 size_t n = num_points;
811 for (
size_t vl; n > 0; n -= vl, inputVector += vl, outputVector += vl) {
812 vl = __riscv_vsetvl_e32m8(n);
813 vfloat32m8_t v = __riscv_vle32_v_f32m8(inputVector, vl);
814 v = __riscv_vfmul(v, scalar, vl);
815 __riscv_vse16(outputVector, __riscv_vfncvt_x(v, vl), vl);