41#ifndef INCLUDED_volk_16i_s32f_convert_32f_u_H
42#define INCLUDED_volk_16i_s32f_convert_32f_u_H
50static inline void volk_16i_s32f_convert_32f_u_avx2(
float* outputVector,
51 const int16_t* inputVector,
53 unsigned int num_points)
55 unsigned int number = 0;
56 const unsigned int eighthPoints = num_points / 8;
58 float* outputVectorPtr = outputVector;
59 __m256 invScalar = _mm256_set1_ps(1.0 / scalar);
60 int16_t* inputPtr = (int16_t*)inputVector;
65 for (; number < eighthPoints; number++) {
68 inputVal = _mm_loadu_si128((__m128i*)inputPtr);
71 inputVal2 = _mm256_cvtepi16_epi32(inputVal);
73 ret = _mm256_cvtepi32_ps(inputVal2);
74 ret = _mm256_mul_ps(ret, invScalar);
76 _mm256_storeu_ps(outputVectorPtr, ret);
83 number = eighthPoints * 8;
84 for (; number < num_points; number++) {
85 outputVector[number] = ((float)(inputVector[number])) / scalar;
93static inline void volk_16i_s32f_convert_32f_u_avx512(
float* outputVector,
94 const int16_t* inputVector,
96 unsigned int num_points)
98 unsigned int number = 0;
99 const unsigned int sixteenthPoints = num_points / 16;
101 float* outputVectorPtr = outputVector;
102 __m512 invScalar = _mm512_set1_ps(1.0 / scalar);
103 int16_t* inputPtr = (int16_t*)inputVector;
108 for (; number < sixteenthPoints; number++) {
111 inputVal = _mm256_loadu_si256((__m256i*)inputPtr);
114 inputVal2 = _mm512_cvtepi16_epi32(inputVal);
115 ret = _mm512_cvtepi32_ps(inputVal2);
116 ret = _mm512_mul_ps(ret, invScalar);
118 _mm512_storeu_ps(outputVectorPtr, ret);
120 outputVectorPtr += 16;
124 number = sixteenthPoints * 16;
125 for (; number < num_points; number++) {
126 outputVector[number] = ((float)(inputVector[number])) / scalar;
132#include <immintrin.h>
/*
 * int16 -> float conversion with scaling, built from 128-bit integer
 * intrinsics and a 256-bit unaligned float store.
 *
 * NOTE(review): the line that names this function, the declaration of
 * `scalar`, and the declarations of `ret` and `output` are not visible in
 * this view. The <immintrin.h> include just above and the unaligned
 * load/store intrinsics suggest this is the unaligned AVX variant - confirm.
 */
135 const int16_t* inputVector,
137 unsigned int num_points)
139 unsigned int number = 0;
/* Number of full 8-sample groups handled by the vector loop. */
140 const unsigned int eighthPoints = num_points / 8;
142 float* outputVectorPtr = outputVector;
/* Reciprocal computed once so the loop can multiply instead of divide. */
143 __m128 invScalar = _mm_set_ps1(1.0 / scalar);
/* The cast drops the const qualifier; the visible code only reads through it. */
144 int16_t* inputPtr = (int16_t*)inputVector;
145 __m128i inputVal, inputVal2;
/* Zeroed 256-bit register used as the base for the first lane insert. */
148 __m256 dummy = _mm256_setzero_ps();
150 for (; number < eighthPoints; number++) {
/* Load eight 16-bit samples (128 bits) without an alignment requirement. */
154 inputVal = _mm_loadu_si128((__m128i*)inputPtr);
/* Shift right by 8 bytes so the upper four samples land in the low half. */
157 inputVal2 = _mm_srli_si128(inputVal, 8);
/* Sign-extend the low four 16-bit values of each register to 32 bits. */
160 inputVal = _mm_cvtepi16_epi32(inputVal);
161 inputVal2 = _mm_cvtepi16_epi32(inputVal2);
/* Samples 0..3: convert, scale, place in the lower 128-bit lane. */
163 ret = _mm_cvtepi32_ps(inputVal);
164 ret = _mm_mul_ps(ret, invScalar);
165 output = _mm256_insertf128_ps(dummy, ret, 0);
/* Samples 4..7: convert, scale, place in the upper 128-bit lane. */
167 ret = _mm_cvtepi32_ps(inputVal2);
168 ret = _mm_mul_ps(ret, invScalar);
169 output = _mm256_insertf128_ps(output, ret, 1);
171 _mm256_storeu_ps(outputVectorPtr, output);
/* NOTE(review): no matching advance of inputPtr is visible in this view - confirm it exists. */
173 outputVectorPtr += 8;
/* Scalar tail: remaining num_points % 8 samples use a direct division. */
178 number = eighthPoints * 8;
179 for (; number < num_points; number++) {
180 outputVector[number] = ((float)(inputVector[number])) / scalar;
186#include <smmintrin.h>
188static inline void volk_16i_s32f_convert_32f_u_sse4_1(
float* outputVector,
189 const int16_t* inputVector,
191 unsigned int num_points)
193 unsigned int number = 0;
194 const unsigned int eighthPoints = num_points / 8;
196 float* outputVectorPtr = outputVector;
197 __m128 invScalar = _mm_set_ps1(1.0 / scalar);
198 int16_t* inputPtr = (int16_t*)inputVector;
203 for (; number < eighthPoints; number++) {
206 inputVal = _mm_loadu_si128((__m128i*)inputPtr);
209 inputVal2 = _mm_srli_si128(inputVal, 8);
212 inputVal = _mm_cvtepi16_epi32(inputVal);
213 inputVal2 = _mm_cvtepi16_epi32(inputVal2);
215 ret = _mm_cvtepi32_ps(inputVal);
216 ret = _mm_mul_ps(ret, invScalar);
217 _mm_storeu_ps(outputVectorPtr, ret);
218 outputVectorPtr += 4;
220 ret = _mm_cvtepi32_ps(inputVal2);
221 ret = _mm_mul_ps(ret, invScalar);
222 _mm_storeu_ps(outputVectorPtr, ret);
224 outputVectorPtr += 4;
229 number = eighthPoints * 8;
230 for (; number < num_points; number++) {
231 outputVector[number] = ((float)(inputVector[number])) / scalar;
237#include <xmmintrin.h>
/*
 * int16 -> float conversion with scaling, four samples per iteration, using
 * scalar casts packed into one 128-bit float register.
 *
 * NOTE(review): the line that names this function and the declarations of
 * `scalar` and `ret` are not visible in this view. The <xmmintrin.h> include
 * just above suggests this is the unaligned SSE variant - confirm.
 */
240 const int16_t* inputVector,
242 unsigned int num_points)
244 unsigned int number = 0;
/* Number of full 4-sample groups handled by the vector loop. */
245 const unsigned int quarterPoints = num_points / 4;
247 float* outputVectorPtr = outputVector;
/* Reciprocal computed once so the loop can multiply instead of divide. */
248 __m128 invScalar = _mm_set_ps1(1.0 / scalar);
/* The cast drops the const qualifier; the visible code only reads through it. */
249 int16_t* inputPtr = (int16_t*)inputVector;
252 for (; number < quarterPoints; number++) {
/* _mm_set_ps takes its arguments highest element first, so inputPtr[0]
 * ends up in the lowest element and memory order is preserved on store. */
253 ret = _mm_set_ps((
float)(inputPtr[3]),
254 (
float)(inputPtr[2]),
255 (
float)(inputPtr[1]),
256 (
float)(inputPtr[0]));
258 ret = _mm_mul_ps(ret, invScalar);
259 _mm_storeu_ps(outputVectorPtr, ret);
/* NOTE(review): no matching advance of inputPtr is visible in this view - confirm it exists. */
262 outputVectorPtr += 4;
/* Scalar tail: remaining num_points % 4 samples use a direct division. */
265 number = quarterPoints * 4;
266 for (; number < num_points; number++) {
267 outputVector[number] = (float)(inputVector[number]) / scalar;
272#ifdef LV_HAVE_GENERIC
/*
 * Plain C conversion: each int16 sample is cast to float and divided by
 * `scalar`, one sample at a time.
 *
 * NOTE(review): the line that names this function and the declaration of
 * `scalar` are not visible in this view; the LV_HAVE_GENERIC guard just above
 * suggests this is the generic variant - confirm.
 */
275 const int16_t* inputVector,
277 unsigned int num_points)
/* Local cursors so the caller-visible pointers are not modified. */
279 float* outputVectorPtr = outputVector;
280 const int16_t* inputVectorPtr = inputVector;
281 unsigned int number = 0;
283 for (number = 0; number < num_points; number++) {
/* Convert, scale, and advance both cursors in a single statement. */
284 *outputVectorPtr++ = ((float)(*inputVectorPtr++)) / scalar;
/*
 * int16 -> float conversion with scaling using NEON intrinsics, eight samples
 * per vector iteration.
 *
 * NOTE(review): the line that names this function and the declarations of
 * `scalar` and `input16` are not visible in this view - confirm against the
 * full file.
 */
293 const int16_t* inputVector,
295 unsigned int num_points)
297 float* outputPtr = outputVector;
298 const int16_t* inputPtr = inputVector;
299 unsigned int number = 0;
/* Number of full 8-sample groups handled by the vector loop. */
300 unsigned int eighth_points = num_points / 8;
303 int32x4_t input32_0, input32_1;
304 float32x4_t input_float_0, input_float_1;
305 float32x4x2_t output_float;
306 float32x4_t inv_scale;
/* Reciprocal broadcast to all four lanes; the loop multiplies by it. */
308 inv_scale = vdupq_n_f32(1.0 / scalar);
314 for (number = 0; number < eighth_points; number++) {
/* De-interleaving load: val[0] receives the even-indexed samples and
 * val[1] the odd-indexed ones. */
315 input16 = vld2_s16(inputPtr);
/* Widen each group of four 16-bit values to 32 bits. */
317 input32_0 = vmovl_s16(input16.val[0]);
318 input32_1 = vmovl_s16(input16.val[1]);
320 input_float_0 = vcvtq_f32_s32(input32_0);
321 input_float_1 = vcvtq_f32_s32(input32_1);
322 output_float.val[0] = vmulq_f32(input_float_0, inv_scale);
323 output_float.val[1] = vmulq_f32(input_float_1, inv_scale);
/* Interleaving store puts the two groups back in the original order.
 * NOTE(review): no advance of inputPtr/outputPtr is visible inside this
 * loop in this view - confirm it exists. */
324 vst2q_f32(outputPtr, output_float);
/* Scalar tail: remaining num_points % 8 samples use a direct division. */
329 for (number = eighth_points * 8; number < num_points; number++) {
330 *outputPtr++ = ((float)(*inputPtr++)) / scalar;
/*
 * Converts num_points signed 16-bit samples to float and scales each one by
 * 1 / scalar using NEON intrinsics.
 *
 * outputVector: receives num_points floats
 * inputVector:  num_points int16_t samples to convert
 * scalar:       divisor applied to every converted sample
 * num_points:   number of samples to process
 *
 * Work is split into three stages: groups of eight, then at most one group
 * of four, then a scalar loop for whatever is left. The vector stages
 * multiply by a precomputed reciprocal; the scalar stage divides directly.
 */
static inline void volk_16i_s32f_convert_32f_neonv8(float* outputVector,
                                                    const int16_t* inputVector,
                                                    const float scalar,
                                                    unsigned int num_points)
{
    unsigned int n = num_points;
    float* out = outputVector;
    const int16_t* in = inputVector;

    const float32x4_t inv_scale = vdupq_n_f32(1.0f / scalar);

    /* Eight samples per iteration, as two independent 4-lane halves. */
    while (n >= 8) {
        int16x4_t v0 = vld1_s16(in);
        int16x4_t v1 = vld1_s16(in + 4);

        /* Widen to 32 bits, convert to float, scale. */
        float32x4_t f0 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(v0)), inv_scale);
        float32x4_t f1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(v1)), inv_scale);

        vst1q_f32(out, f0);
        vst1q_f32(out + 4, f1);

        in += 8;
        out += 8;
        n -= 8;
    }

    /* At most one remaining group of four. */
    if (n >= 4) {
        int16x4_t v0 = vld1_s16(in);
        vst1q_f32(out, vmulq_f32(vcvtq_f32_s32(vmovl_s16(v0)), inv_scale));

        in += 4;
        out += 4;
        n -= 4;
    }

    /* Scalar tail for the final 0..3 samples. */
    while (n > 0) {
        *out++ = ((float)(*in++)) / scalar;
        n--;
    }
}
388#ifndef INCLUDED_volk_16i_s32f_convert_32f_a_H
389#define INCLUDED_volk_16i_s32f_convert_32f_a_H
395#include <immintrin.h>
397static inline void volk_16i_s32f_convert_32f_a_avx2(
float* outputVector,
398 const int16_t* inputVector,
400 unsigned int num_points)
402 unsigned int number = 0;
403 const unsigned int eighthPoints = num_points / 8;
405 float* outputVectorPtr = outputVector;
406 __m256 invScalar = _mm256_set1_ps(1.0 / scalar);
407 int16_t* inputPtr = (int16_t*)inputVector;
412 for (; number < eighthPoints; number++) {
415 inputVal = _mm_load_si128((__m128i*)inputPtr);
418 inputVal2 = _mm256_cvtepi16_epi32(inputVal);
420 ret = _mm256_cvtepi32_ps(inputVal2);
421 ret = _mm256_mul_ps(ret, invScalar);
423 _mm256_store_ps(outputVectorPtr, ret);
425 outputVectorPtr += 8;
430 number = eighthPoints * 8;
431 for (; number < num_points; number++) {
432 outputVector[number] = ((float)(inputVector[number])) / scalar;
437#ifdef LV_HAVE_AVX512F
438#include <immintrin.h>
440static inline void volk_16i_s32f_convert_32f_a_avx512(
float* outputVector,
441 const int16_t* inputVector,
443 unsigned int num_points)
445 unsigned int number = 0;
446 const unsigned int sixteenthPoints = num_points / 16;
448 float* outputVectorPtr = outputVector;
449 __m512 invScalar = _mm512_set1_ps(1.0 / scalar);
450 int16_t* inputPtr = (int16_t*)inputVector;
455 for (; number < sixteenthPoints; number++) {
458 inputVal = _mm256_load_si256((__m256i*)inputPtr);
461 inputVal2 = _mm512_cvtepi16_epi32(inputVal);
462 ret = _mm512_cvtepi32_ps(inputVal2);
463 ret = _mm512_mul_ps(ret, invScalar);
465 _mm512_store_ps(outputVectorPtr, ret);
467 outputVectorPtr += 16;
471 number = sixteenthPoints * 16;
472 for (; number < num_points; number++) {
473 outputVector[number] = ((float)(inputVector[number])) / scalar;
479#include <immintrin.h>
/*
 * int16 -> float conversion with scaling, built from 128-bit integer
 * intrinsics and a 256-bit aligned float store.
 *
 * NOTE(review): the line that names this function, the declaration of
 * `scalar`, and the declarations of `ret` and `output` are not visible in
 * this view. The <immintrin.h> include just above and the aligned load/store
 * intrinsics suggest this is the aligned AVX variant - confirm.
 */
482 const int16_t* inputVector,
484 unsigned int num_points)
486 unsigned int number = 0;
/* Number of full 8-sample groups handled by the vector loop. */
487 const unsigned int eighthPoints = num_points / 8;
489 float* outputVectorPtr = outputVector;
/* Reciprocal computed once so the loop can multiply instead of divide. */
490 __m128 invScalar = _mm_set_ps1(1.0 / scalar);
/* The cast drops the const qualifier; the visible code only reads through it. */
491 int16_t* inputPtr = (int16_t*)inputVector;
492 __m128i inputVal, inputVal2;
/* Zeroed 256-bit register used as the base for the first lane insert. */
495 __m256 dummy = _mm256_setzero_ps();
497 for (; number < eighthPoints; number++) {
/* Aligned 128-bit load: inputPtr must be 16-byte aligned here. */
501 inputVal = _mm_load_si128((__m128i*)inputPtr);
/* Shift right by 8 bytes so the upper four samples land in the low half. */
504 inputVal2 = _mm_srli_si128(inputVal, 8);
/* Sign-extend the low four 16-bit values of each register to 32 bits. */
507 inputVal = _mm_cvtepi16_epi32(inputVal);
508 inputVal2 = _mm_cvtepi16_epi32(inputVal2);
/* Samples 0..3: convert, scale, place in the lower 128-bit lane. */
510 ret = _mm_cvtepi32_ps(inputVal);
511 ret = _mm_mul_ps(ret, invScalar);
512 output = _mm256_insertf128_ps(dummy, ret, 0);
/* Samples 4..7: convert, scale, place in the upper 128-bit lane. */
514 ret = _mm_cvtepi32_ps(inputVal2);
515 ret = _mm_mul_ps(ret, invScalar);
516 output = _mm256_insertf128_ps(output, ret, 1);
/* Aligned 256-bit store: outputVectorPtr must be 32-byte aligned here. */
518 _mm256_store_ps(outputVectorPtr, output);
/* NOTE(review): no matching advance of inputPtr is visible in this view - confirm it exists. */
520 outputVectorPtr += 8;
/* Scalar tail: remaining num_points % 8 samples use a direct division. */
525 number = eighthPoints * 8;
526 for (; number < num_points; number++) {
527 outputVector[number] = ((float)(inputVector[number])) / scalar;
533#include <smmintrin.h>
535static inline void volk_16i_s32f_convert_32f_a_sse4_1(
float* outputVector,
536 const int16_t* inputVector,
538 unsigned int num_points)
540 unsigned int number = 0;
541 const unsigned int eighthPoints = num_points / 8;
543 float* outputVectorPtr = outputVector;
544 __m128 invScalar = _mm_set_ps1(1.0 / scalar);
545 int16_t* inputPtr = (int16_t*)inputVector;
550 for (; number < eighthPoints; number++) {
553 inputVal = _mm_loadu_si128((__m128i*)inputPtr);
556 inputVal2 = _mm_srli_si128(inputVal, 8);
559 inputVal = _mm_cvtepi16_epi32(inputVal);
560 inputVal2 = _mm_cvtepi16_epi32(inputVal2);
562 ret = _mm_cvtepi32_ps(inputVal);
563 ret = _mm_mul_ps(ret, invScalar);
564 _mm_storeu_ps(outputVectorPtr, ret);
565 outputVectorPtr += 4;
567 ret = _mm_cvtepi32_ps(inputVal2);
568 ret = _mm_mul_ps(ret, invScalar);
569 _mm_storeu_ps(outputVectorPtr, ret);
571 outputVectorPtr += 4;
576 number = eighthPoints * 8;
577 for (; number < num_points; number++) {
578 outputVector[number] = ((float)(inputVector[number])) / scalar;
584#include <xmmintrin.h>
/*
 * int16 -> float conversion with scaling, four samples per iteration, using
 * scalar casts packed into one 128-bit float register.
 *
 * NOTE(review): the line that names this function and the declarations of
 * `scalar` and `ret` are not visible in this view. The <xmmintrin.h> include
 * just above suggests this is the aligned SSE variant - confirm.
 */
587 const int16_t* inputVector,
589 unsigned int num_points)
591 unsigned int number = 0;
/* Number of full 4-sample groups handled by the vector loop. */
592 const unsigned int quarterPoints = num_points / 4;
594 float* outputVectorPtr = outputVector;
/* Reciprocal computed once so the loop can multiply instead of divide. */
595 __m128 invScalar = _mm_set_ps1(1.0 / scalar);
/* The cast drops the const qualifier; the visible code only reads through it. */
596 int16_t* inputPtr = (int16_t*)inputVector;
599 for (; number < quarterPoints; number++) {
/* _mm_set_ps takes its arguments highest element first, so inputPtr[0]
 * ends up in the lowest element and memory order is preserved on store. */
600 ret = _mm_set_ps((
float)(inputPtr[3]),
601 (
float)(inputPtr[2]),
602 (
float)(inputPtr[1]),
603 (
float)(inputPtr[0]));
605 ret = _mm_mul_ps(ret, invScalar);
/* The store intrinsic used here is the unaligned one. */
606 _mm_storeu_ps(outputVectorPtr, ret);
/* NOTE(review): no matching advance of inputPtr is visible in this view - confirm it exists. */
609 outputVectorPtr += 4;
/* Scalar tail: remaining num_points % 4 samples use a direct division. */
612 number = quarterPoints * 4;
613 for (; number < num_points; number++) {
614 outputVector[number] = (float)(inputVector[number]) / scalar;
620#include <riscv_vector.h>
/*
 * Converts num_points signed 16-bit samples to float and scales each one by
 * 1 / scalar using the RISC-V Vector extension.
 *
 * outputVector: receives num_points floats
 * inputVector:  num_points int16_t samples to convert
 * scalar:       divisor applied to every converted sample
 * num_points:   number of samples to process
 *
 * The loop is strip-mined: each pass asks the hardware how many elements it
 * will handle (vl), processes that many, and advances by vl, so no separate
 * tail loop is needed.
 */
static inline void volk_16i_s32f_convert_32f_rvv(float* outputVector,
                                                 const int16_t* inputVector,
                                                 const float scalar,
                                                 unsigned int num_points)
{
    size_t n = num_points;
    for (size_t vl; n > 0; n -= vl, inputVector += vl, outputVector += vl) {
        /* Element count for this pass, for 16-bit elements at LMUL=4. */
        vl = __riscv_vsetvl_e16m4(n);

        /* Widening convert: int16 (m4) -> float32 (m8). */
        vfloat32m8_t v = __riscv_vfwcvt_f(__riscv_vle16_v_i16m4(inputVector, vl), vl);

        /* Scale by the reciprocal and store. */
        __riscv_vse32(outputVector, __riscv_vfmul(v, 1.0f / scalar, vl), vl);
    }
}