41#ifndef INCLUDED_volk_8i_s32f_convert_32f_u_H
42#define INCLUDED_volk_8i_s32f_convert_32f_u_H
50static inline void volk_8i_s32f_convert_32f_u_avx2(
float* outputVector,
51 const int8_t* inputVector,
53 unsigned int num_points)
55 unsigned int number = 0;
56 const unsigned int sixteenthPoints = num_points / 16;
58 float* outputVectorPtr = outputVector;
59 const float iScalar = 1.0 / scalar;
60 __m256 invScalar = _mm256_set1_ps(iScalar);
61 const int8_t* inputVectorPtr = inputVector;
66 for (; number < sixteenthPoints; number++) {
67 inputVal128 = _mm_loadu_si128((__m128i*)inputVectorPtr);
69 interimVal = _mm256_cvtepi8_epi32(inputVal128);
70 ret = _mm256_cvtepi32_ps(interimVal);
71 ret = _mm256_mul_ps(ret, invScalar);
72 _mm256_storeu_ps(outputVectorPtr, ret);
75 inputVal128 = _mm_srli_si128(inputVal128, 8);
76 interimVal = _mm256_cvtepi8_epi32(inputVal128);
77 ret = _mm256_cvtepi32_ps(interimVal);
78 ret = _mm256_mul_ps(ret, invScalar);
79 _mm256_storeu_ps(outputVectorPtr, ret);
85 number = sixteenthPoints * 16;
86 for (; number < num_points; number++) {
87 outputVector[number] = (float)(inputVector[number]) * iScalar;
95static inline void volk_8i_s32f_convert_32f_u_avx512(
float* outputVector,
96 const int8_t* inputVector,
98 unsigned int num_points)
100 unsigned int number = 0;
101 const unsigned int sixteenthPoints = num_points / 16;
103 float* outputVectorPtr = outputVector;
104 const float iScalar = 1.0 / scalar;
105 __m512 invScalar = _mm512_set1_ps(iScalar);
106 const int8_t* inputVectorPtr = inputVector;
111 for (; number < sixteenthPoints; number++) {
112 inputVal128 = _mm_loadu_si128((__m128i*)inputVectorPtr);
114 interimVal = _mm512_cvtepi8_epi32(inputVal128);
115 ret = _mm512_cvtepi32_ps(interimVal);
116 ret = _mm512_mul_ps(ret, invScalar);
117 _mm512_storeu_ps(outputVectorPtr, ret);
118 outputVectorPtr += 16;
120 inputVectorPtr += 16;
123 number = sixteenthPoints * 16;
124 for (; number < num_points; number++) {
125 outputVector[number] = (float)(inputVector[number]) * iScalar;
132#include <smmintrin.h>
134static inline void volk_8i_s32f_convert_32f_u_sse4_1(
float* outputVector,
135 const int8_t* inputVector,
137 unsigned int num_points)
139 unsigned int number = 0;
140 const unsigned int sixteenthPoints = num_points / 16;
142 float* outputVectorPtr = outputVector;
143 const float iScalar = 1.0 / scalar;
144 __m128 invScalar = _mm_set_ps1(iScalar);
145 const int8_t* inputVectorPtr = inputVector;
150 for (; number < sixteenthPoints; number++) {
151 inputVal = _mm_loadu_si128((__m128i*)inputVectorPtr);
153 interimVal = _mm_cvtepi8_epi32(inputVal);
154 ret = _mm_cvtepi32_ps(interimVal);
155 ret = _mm_mul_ps(ret, invScalar);
156 _mm_storeu_ps(outputVectorPtr, ret);
157 outputVectorPtr += 4;
159 inputVal = _mm_srli_si128(inputVal, 4);
160 interimVal = _mm_cvtepi8_epi32(inputVal);
161 ret = _mm_cvtepi32_ps(interimVal);
162 ret = _mm_mul_ps(ret, invScalar);
163 _mm_storeu_ps(outputVectorPtr, ret);
164 outputVectorPtr += 4;
166 inputVal = _mm_srli_si128(inputVal, 4);
167 interimVal = _mm_cvtepi8_epi32(inputVal);
168 ret = _mm_cvtepi32_ps(interimVal);
169 ret = _mm_mul_ps(ret, invScalar);
170 _mm_storeu_ps(outputVectorPtr, ret);
171 outputVectorPtr += 4;
173 inputVal = _mm_srli_si128(inputVal, 4);
174 interimVal = _mm_cvtepi8_epi32(inputVal);
175 ret = _mm_cvtepi32_ps(interimVal);
176 ret = _mm_mul_ps(ret, invScalar);
177 _mm_storeu_ps(outputVectorPtr, ret);
178 outputVectorPtr += 4;
180 inputVectorPtr += 16;
183 number = sixteenthPoints * 16;
184 for (; number < num_points; number++) {
185 outputVector[number] = (float)(inputVector[number]) * iScalar;
190#ifdef LV_HAVE_GENERIC
/*
 * Portable reference kernel: convert num_points signed 8-bit samples to
 * 32-bit floats and scale each result by 1/scalar.
 *
 * NOTE(review): the line carrying the return type and function name was
 * not present in the listing; the name below follows the naming pattern of
 * the other kernels in this file and the LV_HAVE_GENERIC guard -- confirm
 * against the dispatcher.
 *
 * outputVector  destination buffer, receives num_points floats
 * inputVector   source buffer holding num_points int8_t samples
 * scalar        divisor applied to every sample (done as a multiply by
 *               its reciprocal)
 * num_points    number of samples to convert
 */
static inline void volk_8i_s32f_convert_32f_generic(float* outputVector,
                                                    const int8_t* inputVector,
                                                    const float scalar,
                                                    unsigned int num_points)
{
    float* outputVectorPtr = outputVector;
    const int8_t* inputVectorPtr = inputVector;
    unsigned int number = 0;
    const float iScalar = 1.0 / scalar;

    for (number = 0; number < num_points; number++) {
        *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar;
    }
}
211#ifndef INCLUDED_volk_8i_s32f_convert_32f_a_H
212#define INCLUDED_volk_8i_s32f_convert_32f_a_H
218#include <immintrin.h>
220static inline void volk_8i_s32f_convert_32f_a_avx2(
float* outputVector,
221 const int8_t* inputVector,
223 unsigned int num_points)
225 unsigned int number = 0;
226 const unsigned int sixteenthPoints = num_points / 16;
228 float* outputVectorPtr = outputVector;
229 const float iScalar = 1.0 / scalar;
230 __m256 invScalar = _mm256_set1_ps(iScalar);
231 const int8_t* inputVectorPtr = inputVector;
236 for (; number < sixteenthPoints; number++) {
237 inputVal128 = _mm_load_si128((__m128i*)inputVectorPtr);
239 interimVal = _mm256_cvtepi8_epi32(inputVal128);
240 ret = _mm256_cvtepi32_ps(interimVal);
241 ret = _mm256_mul_ps(ret, invScalar);
242 _mm256_store_ps(outputVectorPtr, ret);
243 outputVectorPtr += 8;
245 inputVal128 = _mm_srli_si128(inputVal128, 8);
246 interimVal = _mm256_cvtepi8_epi32(inputVal128);
247 ret = _mm256_cvtepi32_ps(interimVal);
248 ret = _mm256_mul_ps(ret, invScalar);
249 _mm256_store_ps(outputVectorPtr, ret);
250 outputVectorPtr += 8;
252 inputVectorPtr += 16;
255 number = sixteenthPoints * 16;
256 for (; number < num_points; number++) {
257 outputVector[number] = (float)(inputVector[number]) * iScalar;
262#ifdef LV_HAVE_AVX512F
263#include <immintrin.h>
265static inline void volk_8i_s32f_convert_32f_a_avx512(
float* outputVector,
266 const int8_t* inputVector,
268 unsigned int num_points)
270 unsigned int number = 0;
271 const unsigned int sixteenthPoints = num_points / 16;
273 float* outputVectorPtr = outputVector;
274 const float iScalar = 1.0 / scalar;
275 __m512 invScalar = _mm512_set1_ps(iScalar);
276 const int8_t* inputVectorPtr = inputVector;
281 for (; number < sixteenthPoints; number++) {
282 inputVal128 = _mm_load_si128((__m128i*)inputVectorPtr);
284 interimVal = _mm512_cvtepi8_epi32(inputVal128);
285 ret = _mm512_cvtepi32_ps(interimVal);
286 ret = _mm512_mul_ps(ret, invScalar);
287 _mm512_store_ps(outputVectorPtr, ret);
288 outputVectorPtr += 16;
290 inputVectorPtr += 16;
293 number = sixteenthPoints * 16;
294 for (; number < num_points; number++) {
295 outputVector[number] = (float)(inputVector[number]) * iScalar;
301#include <smmintrin.h>
303static inline void volk_8i_s32f_convert_32f_a_sse4_1(
float* outputVector,
304 const int8_t* inputVector,
306 unsigned int num_points)
308 unsigned int number = 0;
309 const unsigned int sixteenthPoints = num_points / 16;
311 float* outputVectorPtr = outputVector;
312 const float iScalar = 1.0 / scalar;
313 __m128 invScalar = _mm_set_ps1(iScalar);
314 const int8_t* inputVectorPtr = inputVector;
319 for (; number < sixteenthPoints; number++) {
320 inputVal = _mm_load_si128((__m128i*)inputVectorPtr);
322 interimVal = _mm_cvtepi8_epi32(inputVal);
323 ret = _mm_cvtepi32_ps(interimVal);
324 ret = _mm_mul_ps(ret, invScalar);
325 _mm_store_ps(outputVectorPtr, ret);
326 outputVectorPtr += 4;
328 inputVal = _mm_srli_si128(inputVal, 4);
329 interimVal = _mm_cvtepi8_epi32(inputVal);
330 ret = _mm_cvtepi32_ps(interimVal);
331 ret = _mm_mul_ps(ret, invScalar);
332 _mm_store_ps(outputVectorPtr, ret);
333 outputVectorPtr += 4;
335 inputVal = _mm_srli_si128(inputVal, 4);
336 interimVal = _mm_cvtepi8_epi32(inputVal);
337 ret = _mm_cvtepi32_ps(interimVal);
338 ret = _mm_mul_ps(ret, invScalar);
339 _mm_store_ps(outputVectorPtr, ret);
340 outputVectorPtr += 4;
342 inputVal = _mm_srli_si128(inputVal, 4);
343 interimVal = _mm_cvtepi8_epi32(inputVal);
344 ret = _mm_cvtepi32_ps(interimVal);
345 ret = _mm_mul_ps(ret, invScalar);
346 _mm_store_ps(outputVectorPtr, ret);
347 outputVectorPtr += 4;
349 inputVectorPtr += 16;
352 number = sixteenthPoints * 16;
353 for (; number < num_points; number++) {
354 outputVector[number] = (float)(inputVector[number]) * iScalar;
/*
 * Convert num_points signed 8-bit samples to 32-bit floats and scale each
 * result by 1/scalar. NEON kernel processing sixteen samples per iteration.
 *
 * NOTE(review): the line carrying the return type and function name was
 * not present in the listing; the name below follows the naming pattern of
 * the other kernels in this file -- confirm against the dispatcher.
 *
 * outputVector  destination buffer, receives num_points floats
 * inputVector   source buffer holding num_points int8_t samples
 * scalar        divisor applied to every sample (done as a multiply by
 *               its reciprocal)
 * num_points    number of samples to convert
 */
static inline void volk_8i_s32f_convert_32f_neon(float* outputVector,
                                                 const int8_t* inputVector,
                                                 const float scalar,
                                                 unsigned int num_points)
{
    float* outputVectorPtr = outputVector;
    const int8_t* inputVectorPtr = inputVector;

    const float iScalar = 1.0 / scalar;
    const float32x4_t qiScalar = vdupq_n_f32(iScalar);

    int8x16_t inputVal;
    int16x8_t lower;
    int16x8_t higher;
    float32x4_t outputFloat;

    unsigned int number = 0;
    const unsigned int sixteenthPoints = num_points / 16;
    for (; number < sixteenthPoints; number++) {
        inputVal = vld1q_s8(inputVectorPtr);
        inputVectorPtr += 16;

        /* Widen the two 8-sample halves from int8 to int16. */
        lower = vmovl_s8(vget_low_s8(inputVal));
        higher = vmovl_s8(vget_high_s8(inputVal));

        /* Each int16 half splits again into two groups of four int32s. */
        outputFloat = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(lower))), qiScalar);
        vst1q_f32(outputVectorPtr, outputFloat);
        outputVectorPtr += 4;

        outputFloat = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(lower))), qiScalar);
        vst1q_f32(outputVectorPtr, outputFloat);
        outputVectorPtr += 4;

        outputFloat = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(higher))), qiScalar);
        vst1q_f32(outputVectorPtr, outputFloat);
        outputVectorPtr += 4;

        /* The result must be assigned, otherwise the previous group is stored again. */
        outputFloat =
            vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(higher))), qiScalar);
        vst1q_f32(outputVectorPtr, outputFloat);
        outputVectorPtr += 4;
    }

    /* Scalar tail; the pointers already sit just past the vectorised part. */
    for (number = sixteenthPoints * 16; number < num_points; number++) {
        *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar;
    }
}
/*
 * Convert num_points signed 8-bit samples to 32-bit floats and scale each
 * result by 1/scalar. NEON kernel processing thirty-two samples (two
 * 16-byte loads) per iteration.
 *
 * outputVector  destination buffer, receives num_points floats
 * inputVector   source buffer holding num_points int8_t samples
 * scalar        divisor applied to every sample (done as a multiply by
 *               its reciprocal)
 * num_points    number of samples to convert
 */
static inline void volk_8i_s32f_convert_32f_neonv8(float* outputVector,
                                                   const int8_t* inputVector,
                                                   const float scalar,
                                                   unsigned int num_points)
{
    float* outputVectorPtr = outputVector;
    const int8_t* inputVectorPtr = inputVector;
    const float iScalar = 1.0f / scalar;
    const float32x4_t qiScalar = vdupq_n_f32(iScalar);
    const unsigned int thirtysecondPoints = num_points / 32;

    for (unsigned int number = 0; number < thirtysecondPoints; number++) {
        int8x16_t in0 = vld1q_s8(inputVectorPtr);
        int8x16_t in1 = vld1q_s8(inputVectorPtr + 16);

        /* Widen each 16-sample load into two int16x8 halves. */
        int16x8_t lo0 = vmovl_s8(vget_low_s8(in0));
        int16x8_t hi0 = vmovl_s8(vget_high_s8(in0));
        int16x8_t lo1 = vmovl_s8(vget_low_s8(in1));
        int16x8_t hi1 = vmovl_s8(vget_high_s8(in1));

        /* Widen to int32, convert to float, scale, and store in input order. */
        vst1q_f32(outputVectorPtr,
                  vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(lo0))), qiScalar));
        vst1q_f32(outputVectorPtr + 4,
                  vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(lo0))), qiScalar));
        vst1q_f32(outputVectorPtr + 8,
                  vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(hi0))), qiScalar));
        vst1q_f32(outputVectorPtr + 12,
                  vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(hi0))), qiScalar));
        vst1q_f32(outputVectorPtr + 16,
                  vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(lo1))), qiScalar));
        vst1q_f32(outputVectorPtr + 20,
                  vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(lo1))), qiScalar));
        vst1q_f32(outputVectorPtr + 24,
                  vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(hi1))), qiScalar));
        vst1q_f32(outputVectorPtr + 28,
                  vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(hi1))), qiScalar));

        inputVectorPtr += 32;
        outputVectorPtr += 32;
    }

    /* Scalar tail for the remaining num_points % 32 samples. */
    for (unsigned int number = thirtysecondPoints * 32; number < num_points; number++) {
        *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar;
    }
}
/*
 * Declaration of the externally defined ORC implementation that the
 * volk_8i_s32f_convert_32f_u_orc wrapper forwards to.
 * NOTE(review): the end of this declaration is not visible in this listing.
 * The call site passes four arguments (output, input, reciprocal scale,
 * point count), so two trailing parameters are presumably missing here --
 * confirm their exact types against the ORC source.
 */
466extern void volk_8i_s32f_convert_32f_a_orc_impl(
float* outputVector,
467 const int8_t* inputVector,
/*
 * ORC-backed variant: computes the reciprocal of scalar once and forwards
 * the buffers, the reciprocal and the point count to the externally
 * defined volk_8i_s32f_convert_32f_a_orc_impl.
 *
 * outputVector  destination buffer, receives num_points floats
 * inputVector   source buffer holding num_points int8_t samples
 * scalar        divisor; its reciprocal is what gets passed on
 * num_points    number of samples to convert
 */
static inline void volk_8i_s32f_convert_32f_u_orc(float* outputVector,
                                                  const int8_t* inputVector,
                                                  const float scalar,
                                                  unsigned int num_points)
{
    float invscalar = 1.0 / scalar;
    volk_8i_s32f_convert_32f_a_orc_impl(outputVector, inputVector, invscalar, num_points);
}
482#include <riscv_vector.h>
/*
 * RISC-V Vector kernel: walks the input in chunks of vl elements, where vl
 * is chosen each pass by __riscv_vsetvl_e8m2 from the remaining count n.
 * NOTE(review): this listing is incomplete for this function -- the
 * `scalar` parameter used below is not in the visible signature, the head
 * of the final call (presumably a 32-bit vector store to outputVector) is
 * missing, and the closing braces fall outside the visible lines. Confirm
 * against the full source before editing.
 */
484static inline void volk_8i_s32f_convert_32f_rvv(
float* outputVector,
485 const int8_t* inputVector,
487 unsigned int num_points)
489 size_t n = num_points;
/* Each pass consumes vl samples and advances both pointers by vl. */
490 for (
size_t vl; n > 0; n -= vl, inputVector += vl, outputVector += vl) {
491 vl = __riscv_vsetvl_e8m2(n);
/* Load vl int8 samples and sign-extend them to 16-bit lanes. */
492 vint16m4_t v = __riscv_vsext_vf2(__riscv_vle8_v_i8m2(inputVector, vl), vl);
/* Widening convert to floating point, then multiply by 1.0f / scalar. */
494 outputVector, __riscv_vfmul(__riscv_vfwcvt_f(v, vl), 1.0f / scalar, vl), vl);