Vector Optimized Library of Kernels 3.3.0
Architecture-tuned implementations of math kernels
Loading...
Searching...
No Matches
volk_8i_s32f_convert_32f.h
Go to the documentation of this file.
1/* -*- c++ -*- */
2/*
3 * Copyright 2012, 2014 Free Software Foundation, Inc.
4 *
5 * This file is part of VOLK
6 *
7 * SPDX-License-Identifier: LGPL-3.0-or-later
8 */
9
40
41#ifndef INCLUDED_volk_8i_s32f_convert_32f_u_H
42#define INCLUDED_volk_8i_s32f_convert_32f_u_H
43
44#include <inttypes.h>
45#include <stdio.h>
46
47#ifdef LV_HAVE_AVX2
48#include <immintrin.h>
49
/*
 * Unaligned AVX2 kernel: converts num_points signed 8-bit integers to
 * 32-bit floats and multiplies each result by 1/scalar.
 *
 * outputVector - destination buffer, receives num_points floats
 * inputVector  - source buffer holding num_points int8_t samples
 * scalar       - divisor applied (as a reciprocal multiply) to every sample
 * num_points   - number of samples to convert
 *
 * Sixteen samples are consumed per vector iteration using unaligned
 * loads/stores; the remaining (num_points % 16) samples are handled by the
 * scalar tail loop.
 */
static inline void volk_8i_s32f_convert_32f_u_avx2(float* outputVector,
                                                   const int8_t* inputVector,
                                                   const float scalar,
                                                   unsigned int num_points)
{
    const float iScalar = 1.0 / scalar;
    const __m256 vInvScalar = _mm256_set1_ps(iScalar);
    const unsigned int blockCount = num_points / 16;
    const int8_t* src = inputVector;
    float* dst = outputVector;

    for (unsigned int block = 0; block < blockCount; block++) {
        /* One 128-bit load supplies all 16 input bytes of this block. */
        const __m128i bytes = _mm_loadu_si128((const __m128i*)src);
        /* Move bytes 8..15 down so they can be sign-extended as well. */
        const __m128i upperBytes = _mm_srli_si128(bytes, 8);

        /* Sign-extend 8 bytes -> 8 x int32, then convert to float. */
        const __m256 lowFloats = _mm256_cvtepi32_ps(_mm256_cvtepi8_epi32(bytes));
        const __m256 highFloats = _mm256_cvtepi32_ps(_mm256_cvtepi8_epi32(upperBytes));

        _mm256_storeu_ps(dst, _mm256_mul_ps(lowFloats, vInvScalar));
        _mm256_storeu_ps(dst + 8, _mm256_mul_ps(highFloats, vInvScalar));

        src += 16;
        dst += 16;
    }

    /* Scalar clean-up for the samples that did not fill a 16-wide block. */
    for (unsigned int i = blockCount * 16; i < num_points; i++) {
        outputVector[i] = (float)(inputVector[i]) * iScalar;
    }
}
90#endif /* LV_HAVE_AVX2 */
91
92#ifdef LV_HAVE_AVX512F
93#include <immintrin.h>
94
/*
 * Unaligned AVX-512F kernel: converts num_points signed 8-bit integers to
 * 32-bit floats and multiplies each result by 1/scalar.
 *
 * outputVector - destination buffer, receives num_points floats
 * inputVector  - source buffer holding num_points int8_t samples
 * scalar       - divisor applied (as a reciprocal multiply) to every sample
 * num_points   - number of samples to convert
 *
 * Each vector iteration widens 16 bytes straight into one 512-bit register;
 * the remaining (num_points % 16) samples go through the scalar tail loop.
 */
static inline void volk_8i_s32f_convert_32f_u_avx512(float* outputVector,
                                                     const int8_t* inputVector,
                                                     const float scalar,
                                                     unsigned int num_points)
{
    const float iScalar = 1.0 / scalar;
    const __m512 vInvScalar = _mm512_set1_ps(iScalar);
    const unsigned int blockCount = num_points / 16;

    for (unsigned int block = 0; block < blockCount; block++) {
        const unsigned int offset = block * 16;

        const __m128i bytes = _mm_loadu_si128((const __m128i*)(inputVector + offset));
        /* Sign-extend 16 bytes -> 16 x int32, then convert to float. */
        const __m512 floats = _mm512_cvtepi32_ps(_mm512_cvtepi8_epi32(bytes));

        _mm512_storeu_ps(outputVector + offset, _mm512_mul_ps(floats, vInvScalar));
    }

    /* Scalar clean-up for the samples that did not fill a 16-wide block. */
    for (unsigned int i = blockCount * 16; i < num_points; i++) {
        outputVector[i] = (float)(inputVector[i]) * iScalar;
    }
}
128#endif /* LV_HAVE_AVX512F */
129
130
131#ifdef LV_HAVE_SSE4_1
132#include <smmintrin.h>
133
/*
 * Unaligned SSE4.1 kernel: converts num_points signed 8-bit integers to
 * 32-bit floats and multiplies each result by 1/scalar.
 *
 * outputVector - destination buffer, receives num_points floats
 * inputVector  - source buffer holding num_points int8_t samples
 * scalar       - divisor applied (as a reciprocal multiply) to every sample
 * num_points   - number of samples to convert
 *
 * Sixteen samples are loaded per outer iteration and converted four at a
 * time; the remaining (num_points % 16) samples go through the scalar tail
 * loop.
 */
static inline void volk_8i_s32f_convert_32f_u_sse4_1(float* outputVector,
                                                     const int8_t* inputVector,
                                                     const float scalar,
                                                     unsigned int num_points)
{
    const float iScalar = 1.0 / scalar;
    const __m128 vInvScalar = _mm_set_ps1(iScalar);
    const unsigned int blockCount = num_points / 16;
    const int8_t* src = inputVector;
    float* dst = outputVector;

    for (unsigned int block = 0; block < blockCount; block++) {
        __m128i bytes = _mm_loadu_si128((const __m128i*)src);

        for (int quad = 0; quad < 4; quad++) {
            /* Sign-extend the four lowest bytes -> 4 x int32 -> 4 x float. */
            const __m128 floats = _mm_cvtepi32_ps(_mm_cvtepi8_epi32(bytes));
            _mm_storeu_ps(dst, _mm_mul_ps(floats, vInvScalar));
            dst += 4;

            /* Bring the next four bytes down into the lowest lanes. */
            bytes = _mm_srli_si128(bytes, 4);
        }

        src += 16;
    }

    /* Scalar clean-up for the samples that did not fill a 16-wide block. */
    for (unsigned int i = blockCount * 16; i < num_points; i++) {
        outputVector[i] = (float)(inputVector[i]) * iScalar;
    }
}
188#endif /* LV_HAVE_SSE4_1 */
189
190#ifdef LV_HAVE_GENERIC
191
/*
 * Portable reference kernel: converts num_points signed 8-bit integers to
 * 32-bit floats and multiplies each result by 1/scalar.
 *
 * outputVector - destination buffer, receives num_points floats
 * inputVector  - source buffer holding num_points int8_t samples
 * scalar       - divisor applied (as a reciprocal multiply) to every sample
 * num_points   - number of samples to convert
 */
static inline void volk_8i_s32f_convert_32f_generic(float* outputVector,
                                                    const int8_t* inputVector,
                                                    const float scalar,
                                                    unsigned int num_points)
{
    /* The reciprocal is computed once so the loop only multiplies. */
    const float iScalar = 1.0 / scalar;

    for (unsigned int i = 0; i < num_points; i++) {
        outputVector[i] = ((float)inputVector[i]) * iScalar;
    }
}
206#endif /* LV_HAVE_GENERIC */
207
208
209#endif /* INCLUDED_volk_8i_s32f_convert_32f_u_H */
210
211#ifndef INCLUDED_volk_8i_s32f_convert_32f_a_H
212#define INCLUDED_volk_8i_s32f_convert_32f_a_H
213
214#include <inttypes.h>
215#include <stdio.h>
216
217#ifdef LV_HAVE_AVX2
218#include <immintrin.h>
219
/*
 * Aligned AVX2 kernel: converts num_points signed 8-bit integers to 32-bit
 * floats and multiplies each result by 1/scalar.
 *
 * outputVector - destination buffer, receives num_points floats; written
 *                with aligned 256-bit stores
 * inputVector  - source buffer holding num_points int8_t samples; read with
 *                aligned 128-bit loads
 * scalar       - divisor applied (as a reciprocal multiply) to every sample
 * num_points   - number of samples to convert
 *
 * Sixteen samples are consumed per vector iteration; the remaining
 * (num_points % 16) samples are handled by the scalar tail loop.
 */
static inline void volk_8i_s32f_convert_32f_a_avx2(float* outputVector,
                                                   const int8_t* inputVector,
                                                   const float scalar,
                                                   unsigned int num_points)
{
    const float iScalar = 1.0 / scalar;
    const __m256 vInvScalar = _mm256_set1_ps(iScalar);
    const unsigned int blockCount = num_points / 16;

    for (unsigned int block = 0; block < blockCount; block++) {
        const unsigned int offset = block * 16;

        /* One aligned 128-bit load supplies all 16 bytes of this block. */
        const __m128i bytes = _mm_load_si128((const __m128i*)(inputVector + offset));
        /* Move bytes 8..15 down so they can be sign-extended as well. */
        const __m128i upperBytes = _mm_srli_si128(bytes, 8);

        /* Sign-extend 8 bytes -> 8 x int32, then convert to float. */
        const __m256 lowFloats = _mm256_cvtepi32_ps(_mm256_cvtepi8_epi32(bytes));
        const __m256 highFloats = _mm256_cvtepi32_ps(_mm256_cvtepi8_epi32(upperBytes));

        _mm256_store_ps(outputVector + offset, _mm256_mul_ps(lowFloats, vInvScalar));
        _mm256_store_ps(outputVector + offset + 8,
                        _mm256_mul_ps(highFloats, vInvScalar));
    }

    /* Scalar clean-up for the samples that did not fill a 16-wide block. */
    for (unsigned int i = blockCount * 16; i < num_points; i++) {
        outputVector[i] = (float)(inputVector[i]) * iScalar;
    }
}
260#endif /* LV_HAVE_AVX2 */
261
262#ifdef LV_HAVE_AVX512F
263#include <immintrin.h>
264
/*
 * Aligned AVX-512F kernel: converts num_points signed 8-bit integers to
 * 32-bit floats and multiplies each result by 1/scalar.
 *
 * outputVector - destination buffer, receives num_points floats; written
 *                with aligned 512-bit stores
 * inputVector  - source buffer holding num_points int8_t samples; read with
 *                aligned 128-bit loads
 * scalar       - divisor applied (as a reciprocal multiply) to every sample
 * num_points   - number of samples to convert
 *
 * Each vector iteration widens 16 bytes straight into one 512-bit register;
 * the remaining (num_points % 16) samples go through the scalar tail loop.
 */
static inline void volk_8i_s32f_convert_32f_a_avx512(float* outputVector,
                                                     const int8_t* inputVector,
                                                     const float scalar,
                                                     unsigned int num_points)
{
    const float iScalar = 1.0 / scalar;
    const __m512 vInvScalar = _mm512_set1_ps(iScalar);
    const unsigned int blockCount = num_points / 16;
    const int8_t* src = inputVector;
    float* dst = outputVector;

    for (unsigned int block = 0; block < blockCount; block++) {
        const __m128i bytes = _mm_load_si128((const __m128i*)src);
        /* Sign-extend 16 bytes -> 16 x int32, then convert to float. */
        const __m512 floats = _mm512_cvtepi32_ps(_mm512_cvtepi8_epi32(bytes));

        _mm512_store_ps(dst, _mm512_mul_ps(floats, vInvScalar));

        src += 16;
        dst += 16;
    }

    /* Scalar clean-up for the samples that did not fill a 16-wide block. */
    for (unsigned int i = blockCount * 16; i < num_points; i++) {
        outputVector[i] = (float)(inputVector[i]) * iScalar;
    }
}
298#endif /* LV_HAVE_AVX512F */
299
300#ifdef LV_HAVE_SSE4_1
301#include <smmintrin.h>
302
/*
 * Aligned SSE4.1 kernel: converts num_points signed 8-bit integers to
 * 32-bit floats and multiplies each result by 1/scalar.
 *
 * outputVector - destination buffer, receives num_points floats; written
 *                with aligned 128-bit stores
 * inputVector  - source buffer holding num_points int8_t samples; read with
 *                aligned 128-bit loads
 * scalar       - divisor applied (as a reciprocal multiply) to every sample
 * num_points   - number of samples to convert
 *
 * Sixteen samples are loaded per outer iteration and converted four at a
 * time; the remaining (num_points % 16) samples go through the scalar tail
 * loop.
 */
static inline void volk_8i_s32f_convert_32f_a_sse4_1(float* outputVector,
                                                     const int8_t* inputVector,
                                                     const float scalar,
                                                     unsigned int num_points)
{
    const float iScalar = 1.0 / scalar;
    const __m128 vInvScalar = _mm_set_ps1(iScalar);
    const unsigned int blockCount = num_points / 16;
    const int8_t* src = inputVector;
    float* dst = outputVector;

    for (unsigned int block = 0; block < blockCount; block++) {
        __m128i bytes = _mm_load_si128((const __m128i*)src);

        for (int quad = 0; quad < 4; quad++) {
            /* Sign-extend the four lowest bytes -> 4 x int32 -> 4 x float. */
            const __m128 floats = _mm_cvtepi32_ps(_mm_cvtepi8_epi32(bytes));
            _mm_store_ps(dst, _mm_mul_ps(floats, vInvScalar));
            dst += 4;

            /* Bring the next four bytes down into the lowest lanes. */
            bytes = _mm_srli_si128(bytes, 4);
        }

        src += 16;
    }

    /* Scalar clean-up for the samples that did not fill a 16-wide block. */
    for (unsigned int i = blockCount * 16; i < num_points; i++) {
        outputVector[i] = (float)(inputVector[i]) * iScalar;
    }
}
357#endif /* LV_HAVE_SSE4_1 */
358
359#ifdef LV_HAVE_NEON
360#include <arm_neon.h>
361
/*
 * NEON kernel: converts num_points signed 8-bit integers to 32-bit floats
 * and multiplies each result by 1/scalar.
 *
 * outputVector - destination buffer, receives num_points floats
 * inputVector  - source buffer holding num_points int8_t samples
 * scalar       - divisor applied (as a reciprocal multiply) to every sample
 * num_points   - number of samples to convert
 *
 * Sixteen samples are widened int8 -> int16 -> int32 -> float per vector
 * iteration; the remaining (num_points % 16) samples go through the scalar
 * tail loop.
 */
static inline void volk_8i_s32f_convert_32f_neon(float* outputVector,
                                                 const int8_t* inputVector,
                                                 const float scalar,
                                                 unsigned int num_points)
{
    const float iScalar = 1.0 / scalar;
    const float32x4_t vInvScalar = vdupq_n_f32(iScalar);
    const unsigned int blockCount = num_points / 16;
    const int8_t* src = inputVector;
    float* dst = outputVector;

    for (unsigned int block = 0; block < blockCount; block++) {
        const int8x16_t bytes = vld1q_s8(src);

        /* Sign-extend int8 -> int16, one 8-lane half at a time. */
        const int16x8_t firstEight = vmovl_s8(vget_low_s8(bytes));
        const int16x8_t lastEight = vmovl_s8(vget_high_s8(bytes));

        /* Sign-extend int16 -> int32, four lanes at a time, in input order. */
        const int32x4_t quad0 = vmovl_s16(vget_low_s16(firstEight));
        const int32x4_t quad1 = vmovl_s16(vget_high_s16(firstEight));
        const int32x4_t quad2 = vmovl_s16(vget_low_s16(lastEight));
        const int32x4_t quad3 = vmovl_s16(vget_high_s16(lastEight));

        vst1q_f32(dst, vmulq_f32(vcvtq_f32_s32(quad0), vInvScalar));
        vst1q_f32(dst + 4, vmulq_f32(vcvtq_f32_s32(quad1), vInvScalar));
        vst1q_f32(dst + 8, vmulq_f32(vcvtq_f32_s32(quad2), vInvScalar));
        vst1q_f32(dst + 12, vmulq_f32(vcvtq_f32_s32(quad3), vInvScalar));

        src += 16;
        dst += 16;
    }

    /* Scalar clean-up for the samples that did not fill a 16-wide block. */
    for (unsigned int i = blockCount * 16; i < num_points; i++) {
        outputVector[i] = ((float)inputVector[i]) * iScalar;
    }
}
410
411#endif /* LV_HAVE_NEON */
412
413#ifdef LV_HAVE_NEONV8
414#include <arm_neon.h>
415
/*
 * NEONv8 kernel: converts num_points signed 8-bit integers to 32-bit floats
 * and multiplies each result by 1/scalar.
 *
 * outputVector - destination buffer, receives num_points floats
 * inputVector  - source buffer holding num_points int8_t samples
 * scalar       - divisor applied (as a reciprocal multiply) to every sample
 * num_points   - number of samples to convert
 *
 * Thirty-two samples are processed per vector iteration (two 16-byte
 * loads); the remaining (num_points % 32) samples go through the scalar
 * tail loop.
 */
static inline void volk_8i_s32f_convert_32f_neonv8(float* outputVector,
                                                   const int8_t* inputVector,
                                                   const float scalar,
                                                   unsigned int num_points)
{
    const float iScalar = 1.0f / scalar;
    const float32x4_t vInvScalar = vdupq_n_f32(iScalar);
    const unsigned int blockCount = num_points / 32;
    const int8_t* src = inputVector;
    float* dst = outputVector;

    for (unsigned int block = 0; block < blockCount; block++) {
        const int8x16_t firstBytes = vld1q_s8(src);
        const int8x16_t secondBytes = vld1q_s8(src + 16);
        __VOLK_PREFETCH(src + 64);

        /* Sign-extend int8 -> int16; entries are kept in input order. */
        const int16x8_t widened[4] = {
            vmovl_s8(vget_low_s8(firstBytes)),
            vmovl_s8(vget_high_s8(firstBytes)),
            vmovl_s8(vget_low_s8(secondBytes)),
            vmovl_s8(vget_high_s8(secondBytes)),
        };

        /* Each int16x8 yields eight outputs: int16 -> int32 -> float. */
        for (unsigned int k = 0; k < 4; k++) {
            const int32x4_t lowQuad = vmovl_s16(vget_low_s16(widened[k]));
            const int32x4_t highQuad = vmovl_s16(vget_high_s16(widened[k]));

            vst1q_f32(dst + 8 * k, vmulq_f32(vcvtq_f32_s32(lowQuad), vInvScalar));
            vst1q_f32(dst + 8 * k + 4,
                      vmulq_f32(vcvtq_f32_s32(highQuad), vInvScalar));
        }

        src += 32;
        dst += 32;
    }

    /* Scalar clean-up for the samples that did not fill a 32-wide block. */
    for (unsigned int i = blockCount * 32; i < num_points; i++) {
        outputVector[i] = ((float)inputVector[i]) * iScalar;
    }
}
463#endif /* LV_HAVE_NEONV8 */
464
465#ifdef LV_HAVE_ORC
/*
 * ORC implementation, declared here and defined elsewhere. Note that its
 * num_points parameter is a signed int, and that the wrapper below passes
 * the reciprocal of the user's scalar in the `scalar` slot.
 */
extern void volk_8i_s32f_convert_32f_a_orc_impl(float* outputVector,
                                                const int8_t* inputVector,
                                                const float scalar,
                                                int num_points);

/*
 * ORC kernel wrapper: converts num_points signed 8-bit integers to 32-bit
 * floats scaled by 1/scalar by delegating to the ORC implementation.
 *
 * outputVector - destination buffer, receives num_points floats
 * inputVector  - source buffer holding num_points int8_t samples
 * scalar       - divisor; its reciprocal is what gets forwarded
 * num_points   - number of samples to convert
 */
static inline void volk_8i_s32f_convert_32f_u_orc(float* outputVector,
                                                  const int8_t* inputVector,
                                                  const float scalar,
                                                  unsigned int num_points)
{
    const float reciprocal = 1.0 / scalar;

    volk_8i_s32f_convert_32f_a_orc_impl(
        outputVector, inputVector, reciprocal, num_points);
}
479#endif /* LV_HAVE_ORC */
480
481#ifdef LV_HAVE_RVV
482#include <riscv_vector.h>
483
/*
 * RISC-V Vector kernel: converts num_points signed 8-bit integers to 32-bit
 * floats and multiplies each result by 1/scalar.
 *
 * outputVector - destination buffer, receives num_points floats
 * inputVector  - source buffer holding num_points int8_t samples
 * scalar       - divisor applied (as a reciprocal multiply) to every sample
 * num_points   - number of samples to convert
 *
 * The loop is vector-length agnostic: each pass asks the hardware how many
 * elements it will handle (vl) and advances by exactly that many, so no
 * separate tail loop is needed.
 */
static inline void volk_8i_s32f_convert_32f_rvv(float* outputVector,
                                                const int8_t* inputVector,
                                                const float scalar,
                                                unsigned int num_points)
{
    size_t remaining = num_points;

    while (remaining > 0) {
        const size_t vl = __riscv_vsetvl_e8m2(remaining);

        /* int8 -> int16 by sign extension, then widening convert to float32. */
        vint8m2_t bytes = __riscv_vle8_v_i8m2(inputVector, vl);
        vint16m4_t halves = __riscv_vsext_vf2(bytes, vl);

        __riscv_vse32(outputVector,
                      __riscv_vfmul(__riscv_vfwcvt_f(halves, vl), 1.0f / scalar, vl),
                      vl);

        remaining -= vl;
        inputVector += vl;
        outputVector += vl;
    }
}
497#endif /*LV_HAVE_RVV*/
498
499#endif /* INCLUDED_volk_8i_s32f_convert_32f_a_H */