Vector Optimized Library of Kernels 3.3.0
Architecture-tuned implementations of math kernels
Loading...
Searching...
No Matches
volk_16i_s32f_convert_32f.h
Go to the documentation of this file.
1/* -*- c++ -*- */
2/*
3 * Copyright 2012, 2014 Free Software Foundation, Inc.
4 *
5 * This file is part of VOLK
6 *
7 * SPDX-License-Identifier: LGPL-3.0-or-later
8 */
9
40
41#ifndef INCLUDED_volk_16i_s32f_convert_32f_u_H
42#define INCLUDED_volk_16i_s32f_convert_32f_u_H
43
44#include <inttypes.h>
45#include <stdio.h>
46
47#ifdef LV_HAVE_AVX2
48#include <immintrin.h>
49
50static inline void volk_16i_s32f_convert_32f_u_avx2(float* outputVector,
51 const int16_t* inputVector,
52 const float scalar,
53 unsigned int num_points)
54{
55 unsigned int number = 0;
56 const unsigned int eighthPoints = num_points / 8;
57
58 float* outputVectorPtr = outputVector;
59 __m256 invScalar = _mm256_set1_ps(1.0 / scalar);
60 int16_t* inputPtr = (int16_t*)inputVector;
61 __m128i inputVal;
62 __m256i inputVal2;
63 __m256 ret;
64
65 for (; number < eighthPoints; number++) {
66
67 // Load the 8 values
68 inputVal = _mm_loadu_si128((__m128i*)inputPtr);
69
70 // Convert
71 inputVal2 = _mm256_cvtepi16_epi32(inputVal);
72
73 ret = _mm256_cvtepi32_ps(inputVal2);
74 ret = _mm256_mul_ps(ret, invScalar);
75
76 _mm256_storeu_ps(outputVectorPtr, ret);
77
78 outputVectorPtr += 8;
79
80 inputPtr += 8;
81 }
82
83 number = eighthPoints * 8;
84 for (; number < num_points; number++) {
85 outputVector[number] = ((float)(inputVector[number])) / scalar;
86 }
87}
88#endif /* LV_HAVE_AVX2 */
89
90#ifdef LV_HAVE_AVX512F
91#include <immintrin.h>
92
93static inline void volk_16i_s32f_convert_32f_u_avx512(float* outputVector,
94 const int16_t* inputVector,
95 const float scalar,
96 unsigned int num_points)
97{
98 unsigned int number = 0;
99 const unsigned int sixteenthPoints = num_points / 16;
100
101 float* outputVectorPtr = outputVector;
102 __m512 invScalar = _mm512_set1_ps(1.0 / scalar);
103 int16_t* inputPtr = (int16_t*)inputVector;
104 __m256i inputVal;
105 __m512i inputVal2;
106 __m512 ret;
107
108 for (; number < sixteenthPoints; number++) {
109
110 // Load 16 int16 values
111 inputVal = _mm256_loadu_si256((__m256i*)inputPtr);
112
113 // Convert int16 → int32 → float
114 inputVal2 = _mm512_cvtepi16_epi32(inputVal);
115 ret = _mm512_cvtepi32_ps(inputVal2);
116 ret = _mm512_mul_ps(ret, invScalar);
117
118 _mm512_storeu_ps(outputVectorPtr, ret);
119
120 outputVectorPtr += 16;
121 inputPtr += 16;
122 }
123
124 number = sixteenthPoints * 16;
125 for (; number < num_points; number++) {
126 outputVector[number] = ((float)(inputVector[number])) / scalar;
127 }
128}
129#endif /* LV_HAVE_AVX512F */
130
131#ifdef LV_HAVE_AVX
132#include <immintrin.h>
133
134static inline void volk_16i_s32f_convert_32f_u_avx(float* outputVector,
135 const int16_t* inputVector,
136 const float scalar,
137 unsigned int num_points)
138{
139 unsigned int number = 0;
140 const unsigned int eighthPoints = num_points / 8;
141
142 float* outputVectorPtr = outputVector;
143 __m128 invScalar = _mm_set_ps1(1.0 / scalar);
144 int16_t* inputPtr = (int16_t*)inputVector;
145 __m128i inputVal, inputVal2;
146 __m128 ret;
147 __m256 output;
148 __m256 dummy = _mm256_setzero_ps();
149
150 for (; number < eighthPoints; number++) {
151
152 // Load the 8 values
153 // inputVal = _mm_loadu_si128((__m128i*)inputPtr);
154 inputVal = _mm_loadu_si128((__m128i*)inputPtr);
155
156 // Shift the input data to the right by 64 bits ( 8 bytes )
157 inputVal2 = _mm_srli_si128(inputVal, 8);
158
159 // Convert the lower 4 values into 32 bit words
160 inputVal = _mm_cvtepi16_epi32(inputVal);
161 inputVal2 = _mm_cvtepi16_epi32(inputVal2);
162
163 ret = _mm_cvtepi32_ps(inputVal);
164 ret = _mm_mul_ps(ret, invScalar);
165 output = _mm256_insertf128_ps(dummy, ret, 0);
166
167 ret = _mm_cvtepi32_ps(inputVal2);
168 ret = _mm_mul_ps(ret, invScalar);
169 output = _mm256_insertf128_ps(output, ret, 1);
170
171 _mm256_storeu_ps(outputVectorPtr, output);
172
173 outputVectorPtr += 8;
174
175 inputPtr += 8;
176 }
177
178 number = eighthPoints * 8;
179 for (; number < num_points; number++) {
180 outputVector[number] = ((float)(inputVector[number])) / scalar;
181 }
182}
183#endif /* LV_HAVE_AVX */
184
185#ifdef LV_HAVE_SSE4_1
186#include <smmintrin.h>
187
188static inline void volk_16i_s32f_convert_32f_u_sse4_1(float* outputVector,
189 const int16_t* inputVector,
190 const float scalar,
191 unsigned int num_points)
192{
193 unsigned int number = 0;
194 const unsigned int eighthPoints = num_points / 8;
195
196 float* outputVectorPtr = outputVector;
197 __m128 invScalar = _mm_set_ps1(1.0 / scalar);
198 int16_t* inputPtr = (int16_t*)inputVector;
199 __m128i inputVal;
200 __m128i inputVal2;
201 __m128 ret;
202
203 for (; number < eighthPoints; number++) {
204
205 // Load the 8 values
206 inputVal = _mm_loadu_si128((__m128i*)inputPtr);
207
208 // Shift the input data to the right by 64 bits ( 8 bytes )
209 inputVal2 = _mm_srli_si128(inputVal, 8);
210
211 // Convert the lower 4 values into 32 bit words
212 inputVal = _mm_cvtepi16_epi32(inputVal);
213 inputVal2 = _mm_cvtepi16_epi32(inputVal2);
214
215 ret = _mm_cvtepi32_ps(inputVal);
216 ret = _mm_mul_ps(ret, invScalar);
217 _mm_storeu_ps(outputVectorPtr, ret);
218 outputVectorPtr += 4;
219
220 ret = _mm_cvtepi32_ps(inputVal2);
221 ret = _mm_mul_ps(ret, invScalar);
222 _mm_storeu_ps(outputVectorPtr, ret);
223
224 outputVectorPtr += 4;
225
226 inputPtr += 8;
227 }
228
229 number = eighthPoints * 8;
230 for (; number < num_points; number++) {
231 outputVector[number] = ((float)(inputVector[number])) / scalar;
232 }
233}
234#endif /* LV_HAVE_SSE4_1 */
235
236#ifdef LV_HAVE_SSE
237#include <xmmintrin.h>
238
/*!
 * \brief Converts 16-bit signed integers to floats scaled by 1/scalar
 *        (SSE, unaligned stores).
 *
 * The int16-to-float conversion itself is done per element in scalar code;
 * only the scaling and the store are vectorised, four values at a time.
 * The remaining (num_points % 4) values are converted one at a time with a
 * division, so tail results may differ from vector results in the last bit.
 *
 * \param outputVector Destination buffer; num_points floats are written.
 * \param inputVector  Source buffer; num_points int16_t values are read.
 * \param scalar       Divisor applied to every converted value.
 * \param num_points   Number of values to convert.
 */
static inline void volk_16i_s32f_convert_32f_u_sse(float* outputVector,
                                                   const int16_t* inputVector,
                                                   const float scalar,
                                                   unsigned int num_points)
{
    const unsigned int quarterPoints = num_points / 4;
    const __m128 invScalar = _mm_set_ps1(1.0f / scalar);

    const int16_t* inputPtr = inputVector;
    float* outputVectorPtr = outputVector;

    for (unsigned int number = 0; number < quarterPoints; number++) {
        // _mm_set_ps takes its arguments highest lane first, so inputPtr[0]
        // ends up in lane 0 and is stored first.
        const __m128 converted = _mm_set_ps((float)(inputPtr[3]),
                                            (float)(inputPtr[2]),
                                            (float)(inputPtr[1]),
                                            (float)(inputPtr[0]));

        _mm_storeu_ps(outputVectorPtr, _mm_mul_ps(converted, invScalar));

        inputPtr += 4;
        outputVectorPtr += 4;
    }

    // Scalar tail for the values that do not fill a whole vector
    for (unsigned int number = quarterPoints * 4; number < num_points; number++) {
        outputVector[number] = (float)(inputVector[number]) / scalar;
    }
}
270#endif /* LV_HAVE_SSE */
271
272#ifdef LV_HAVE_GENERIC
273
/*!
 * \brief Converts 16-bit signed integers to floats and divides each by
 *        \p scalar (portable C implementation).
 *
 * \param outputVector Destination buffer; num_points floats are written.
 * \param inputVector  Source buffer; num_points int16_t values are read.
 * \param scalar       Divisor applied to every converted value.
 * \param num_points   Number of values to convert.
 */
static inline void volk_16i_s32f_convert_32f_generic(float* outputVector,
                                                     const int16_t* inputVector,
                                                     const float scalar,
                                                     unsigned int num_points)
{
    float* outputVectorPtr = outputVector;
    const int16_t* inputVectorPtr = inputVector;

    for (unsigned int number = 0; number < num_points; number++) {
        *outputVectorPtr++ = ((float)(*inputVectorPtr++)) / scalar;
    }
}
287#endif /* LV_HAVE_GENERIC */
288
289#ifdef LV_HAVE_NEON
290#include <arm_neon.h>
291
/*!
 * \brief Converts 16-bit signed integers to floats scaled by 1/scalar (NEON).
 *
 * Each iteration handles eight values. The remaining (num_points % 8)
 * values are converted one at a time with a division, so tail results may
 * differ from vector results in the last bit.
 *
 * \param outputVector Destination buffer; num_points floats are written.
 * \param inputVector  Source buffer; num_points int16_t values are read.
 * \param scalar       Divisor applied to every converted value.
 * \param num_points   Number of values to convert.
 */
static inline void volk_16i_s32f_convert_32f_neon(float* outputVector,
                                                  const int16_t* inputVector,
                                                  const float scalar,
                                                  unsigned int num_points)
{
    const unsigned int eighth_points = num_points / 8;
    const float32x4_t inv_scale = vdupq_n_f32(1.0f / scalar);

    float* outputPtr = outputVector;
    const int16_t* inputPtr = inputVector;

    for (unsigned int number = 0; number < eighth_points; number++) {
        // vld2 splits the eight inputs into even-indexed (val[0]) and
        // odd-indexed (val[1]) elements; the matching vst2q below
        // re-interleaves them, so the output order equals the input order.
        const int16x4x2_t input16 = vld2_s16(inputPtr);

        // widen 16-bit int to 32-bit int
        const int32x4_t input32_0 = vmovl_s16(input16.val[0]);
        const int32x4_t input32_1 = vmovl_s16(input16.val[1]);

        // convert 32-bit int to float, then scale
        float32x4x2_t output_float;
        output_float.val[0] = vmulq_f32(vcvtq_f32_s32(input32_0), inv_scale);
        output_float.val[1] = vmulq_f32(vcvtq_f32_s32(input32_1), inv_scale);
        vst2q_f32(outputPtr, output_float);

        inputPtr += 8;
        outputPtr += 8;
    }

    // Scalar tail; the pointers already sit just past the vectorised part
    for (unsigned int number = eighth_points * 8; number < num_points; number++) {
        *outputPtr++ = ((float)(*inputPtr++)) / scalar;
    }
}
333#endif /* LV_HAVE_NEON */
334
335
336#ifdef LV_HAVE_NEONV8
337#include <arm_neon.h>
338
/*!
 * \brief Converts 16-bit signed integers to floats scaled by 1/scalar
 *        (NEON, ARMv8 kernel).
 *
 * Works in three stages: blocks of eight values, then at most one block of
 * four, then a scalar tail of up to three values. The vector stages
 * multiply by the reciprocal of \p scalar while the tail divides, so tail
 * results may differ from vector results in the last bit.
 *
 * \param outputVector Destination buffer; num_points floats are written.
 * \param inputVector  Source buffer; num_points int16_t values are read.
 * \param scalar       Divisor applied to every converted value.
 * \param num_points   Number of values to convert.
 */
static inline void volk_16i_s32f_convert_32f_neonv8(float* outputVector,
                                                    const int16_t* inputVector,
                                                    const float scalar,
                                                    unsigned int num_points)
{
    unsigned int n = num_points;
    float* out = outputVector;
    const int16_t* in = inputVector;

    const float32x4_t inv_scale = vdupq_n_f32(1.0f / scalar);

    /* Process 8 int16 values per iteration using two 64-bit loads */
    while (n >= 8) {
        int16x4_t v0 = vld1_s16(in);
        int16x4_t v1 = vld1_s16(in + 4);
        /* Prefetch hint for data two iterations ahead */
        __VOLK_PREFETCH(in + 16);

        /* Widen int16 to int32, convert to float, scale */
        float32x4_t f0 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(v0)), inv_scale);
        float32x4_t f1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(v1)), inv_scale);

        vst1q_f32(out, f0);
        vst1q_f32(out + 4, f1);

        in += 8;
        out += 8;
        n -= 8;
    }

    /* At most one further block of 4 values can remain */
    if (n >= 4) {
        int16x4_t v0 = vld1_s16(in);
        vst1q_f32(out, vmulq_f32(vcvtq_f32_s32(vmovl_s16(v0)), inv_scale));
        in += 4;
        out += 4;
        n -= 4;
    }

    /* Scalar tail (0 to 3 values) */
    while (n > 0) {
        *out++ = ((float)(*in++)) / scalar;
        n--;
    }
}
383
384#endif /* LV_HAVE_NEONV8 */
385
386
387#endif /* INCLUDED_volk_16i_s32f_convert_32f_u_H */
388#ifndef INCLUDED_volk_16i_s32f_convert_32f_a_H
389#define INCLUDED_volk_16i_s32f_convert_32f_a_H
390
391#include <inttypes.h>
392#include <stdio.h>
393
394#ifdef LV_HAVE_AVX2
395#include <immintrin.h>
396
397static inline void volk_16i_s32f_convert_32f_a_avx2(float* outputVector,
398 const int16_t* inputVector,
399 const float scalar,
400 unsigned int num_points)
401{
402 unsigned int number = 0;
403 const unsigned int eighthPoints = num_points / 8;
404
405 float* outputVectorPtr = outputVector;
406 __m256 invScalar = _mm256_set1_ps(1.0 / scalar);
407 int16_t* inputPtr = (int16_t*)inputVector;
408 __m128i inputVal;
409 __m256i inputVal2;
410 __m256 ret;
411
412 for (; number < eighthPoints; number++) {
413
414 // Load the 8 values
415 inputVal = _mm_load_si128((__m128i*)inputPtr);
416
417 // Convert
418 inputVal2 = _mm256_cvtepi16_epi32(inputVal);
419
420 ret = _mm256_cvtepi32_ps(inputVal2);
421 ret = _mm256_mul_ps(ret, invScalar);
422
423 _mm256_store_ps(outputVectorPtr, ret);
424
425 outputVectorPtr += 8;
426
427 inputPtr += 8;
428 }
429
430 number = eighthPoints * 8;
431 for (; number < num_points; number++) {
432 outputVector[number] = ((float)(inputVector[number])) / scalar;
433 }
434}
435#endif /* LV_HAVE_AVX2 */
436
437#ifdef LV_HAVE_AVX512F
438#include <immintrin.h>
439
440static inline void volk_16i_s32f_convert_32f_a_avx512(float* outputVector,
441 const int16_t* inputVector,
442 const float scalar,
443 unsigned int num_points)
444{
445 unsigned int number = 0;
446 const unsigned int sixteenthPoints = num_points / 16;
447
448 float* outputVectorPtr = outputVector;
449 __m512 invScalar = _mm512_set1_ps(1.0 / scalar);
450 int16_t* inputPtr = (int16_t*)inputVector;
451 __m256i inputVal;
452 __m512i inputVal2;
453 __m512 ret;
454
455 for (; number < sixteenthPoints; number++) {
456
457 // Load 16 int16 values
458 inputVal = _mm256_load_si256((__m256i*)inputPtr);
459
460 // Convert int16 → int32 → float
461 inputVal2 = _mm512_cvtepi16_epi32(inputVal);
462 ret = _mm512_cvtepi32_ps(inputVal2);
463 ret = _mm512_mul_ps(ret, invScalar);
464
465 _mm512_store_ps(outputVectorPtr, ret);
466
467 outputVectorPtr += 16;
468 inputPtr += 16;
469 }
470
471 number = sixteenthPoints * 16;
472 for (; number < num_points; number++) {
473 outputVector[number] = ((float)(inputVector[number])) / scalar;
474 }
475}
476#endif /* LV_HAVE_AVX512F */
477
478#ifdef LV_HAVE_AVX
479#include <immintrin.h>
480
481static inline void volk_16i_s32f_convert_32f_a_avx(float* outputVector,
482 const int16_t* inputVector,
483 const float scalar,
484 unsigned int num_points)
485{
486 unsigned int number = 0;
487 const unsigned int eighthPoints = num_points / 8;
488
489 float* outputVectorPtr = outputVector;
490 __m128 invScalar = _mm_set_ps1(1.0 / scalar);
491 int16_t* inputPtr = (int16_t*)inputVector;
492 __m128i inputVal, inputVal2;
493 __m128 ret;
494 __m256 output;
495 __m256 dummy = _mm256_setzero_ps();
496
497 for (; number < eighthPoints; number++) {
498
499 // Load the 8 values
500 // inputVal = _mm_loadu_si128((__m128i*)inputPtr);
501 inputVal = _mm_load_si128((__m128i*)inputPtr);
502
503 // Shift the input data to the right by 64 bits ( 8 bytes )
504 inputVal2 = _mm_srli_si128(inputVal, 8);
505
506 // Convert the lower 4 values into 32 bit words
507 inputVal = _mm_cvtepi16_epi32(inputVal);
508 inputVal2 = _mm_cvtepi16_epi32(inputVal2);
509
510 ret = _mm_cvtepi32_ps(inputVal);
511 ret = _mm_mul_ps(ret, invScalar);
512 output = _mm256_insertf128_ps(dummy, ret, 0);
513
514 ret = _mm_cvtepi32_ps(inputVal2);
515 ret = _mm_mul_ps(ret, invScalar);
516 output = _mm256_insertf128_ps(output, ret, 1);
517
518 _mm256_store_ps(outputVectorPtr, output);
519
520 outputVectorPtr += 8;
521
522 inputPtr += 8;
523 }
524
525 number = eighthPoints * 8;
526 for (; number < num_points; number++) {
527 outputVector[number] = ((float)(inputVector[number])) / scalar;
528 }
529}
530#endif /* LV_HAVE_AVX */
531
532#ifdef LV_HAVE_SSE4_1
533#include <smmintrin.h>
534
535static inline void volk_16i_s32f_convert_32f_a_sse4_1(float* outputVector,
536 const int16_t* inputVector,
537 const float scalar,
538 unsigned int num_points)
539{
540 unsigned int number = 0;
541 const unsigned int eighthPoints = num_points / 8;
542
543 float* outputVectorPtr = outputVector;
544 __m128 invScalar = _mm_set_ps1(1.0 / scalar);
545 int16_t* inputPtr = (int16_t*)inputVector;
546 __m128i inputVal;
547 __m128i inputVal2;
548 __m128 ret;
549
550 for (; number < eighthPoints; number++) {
551
552 // Load the 8 values
553 inputVal = _mm_loadu_si128((__m128i*)inputPtr);
554
555 // Shift the input data to the right by 64 bits ( 8 bytes )
556 inputVal2 = _mm_srli_si128(inputVal, 8);
557
558 // Convert the lower 4 values into 32 bit words
559 inputVal = _mm_cvtepi16_epi32(inputVal);
560 inputVal2 = _mm_cvtepi16_epi32(inputVal2);
561
562 ret = _mm_cvtepi32_ps(inputVal);
563 ret = _mm_mul_ps(ret, invScalar);
564 _mm_storeu_ps(outputVectorPtr, ret);
565 outputVectorPtr += 4;
566
567 ret = _mm_cvtepi32_ps(inputVal2);
568 ret = _mm_mul_ps(ret, invScalar);
569 _mm_storeu_ps(outputVectorPtr, ret);
570
571 outputVectorPtr += 4;
572
573 inputPtr += 8;
574 }
575
576 number = eighthPoints * 8;
577 for (; number < num_points; number++) {
578 outputVector[number] = ((float)(inputVector[number])) / scalar;
579 }
580}
581#endif /* LV_HAVE_SSE4_1 */
582
583#ifdef LV_HAVE_SSE
584#include <xmmintrin.h>
585
/*!
 * \brief Converts 16-bit signed integers to floats scaled by 1/scalar
 *        (SSE, aligned stores).
 *
 * Uses 128-bit aligned stores, so \p outputVector must be 16-byte aligned.
 * The int16-to-float conversion itself is done per element in scalar code;
 * only the scaling and the store are vectorised, four values at a time.
 * The remaining (num_points % 4) values are converted one at a time with a
 * division, so tail results may differ from vector results in the last bit.
 *
 * \param outputVector Destination buffer; num_points floats are written.
 * \param inputVector  Source buffer; num_points int16_t values are read.
 * \param scalar       Divisor applied to every converted value.
 * \param num_points   Number of values to convert.
 */
static inline void volk_16i_s32f_convert_32f_a_sse(float* outputVector,
                                                   const int16_t* inputVector,
                                                   const float scalar,
                                                   unsigned int num_points)
{
    const unsigned int quarterPoints = num_points / 4;
    const __m128 invScalar = _mm_set_ps1(1.0f / scalar);

    const int16_t* inputPtr = inputVector;
    float* outputVectorPtr = outputVector;

    for (unsigned int number = 0; number < quarterPoints; number++) {
        // _mm_set_ps takes its arguments highest lane first, so inputPtr[0]
        // ends up in lane 0 and is stored first.
        const __m128 converted = _mm_set_ps((float)(inputPtr[3]),
                                            (float)(inputPtr[2]),
                                            (float)(inputPtr[1]),
                                            (float)(inputPtr[0]));

        _mm_store_ps(outputVectorPtr, _mm_mul_ps(converted, invScalar));

        inputPtr += 4;
        outputVectorPtr += 4;
    }

    // Scalar tail for the values that do not fill a whole vector
    for (unsigned int number = quarterPoints * 4; number < num_points; number++) {
        outputVector[number] = (float)(inputVector[number]) / scalar;
    }
}
617#endif /* LV_HAVE_SSE */
618
619#ifdef LV_HAVE_RVV
620#include <riscv_vector.h>
621
/*!
 * \brief Converts 16-bit signed integers to floats scaled by 1/scalar
 *        (RISC-V Vector extension).
 *
 * The loop asks the hardware for a vector length on every pass and
 * advances by whatever it was granted, so no separate tail loop is needed.
 *
 * \param outputVector Destination buffer; num_points floats are written.
 * \param inputVector  Source buffer; num_points int16_t values are read.
 * \param scalar       Divisor applied to every converted value.
 * \param num_points   Number of values to convert.
 */
static inline void volk_16i_s32f_convert_32f_rvv(float* outputVector,
                                                 const int16_t* inputVector,
                                                 const float scalar,
                                                 unsigned int num_points)
{
    // Loop-invariant reciprocal; every element is multiplied by it
    const float invScalar = 1.0f / scalar;

    size_t n = num_points;
    for (size_t vl; n > 0; n -= vl, inputVector += vl, outputVector += vl) {
        vl = __riscv_vsetvl_e16m4(n);
        // Widening convert: int16 elements (LMUL=4) become float32 (LMUL=8)
        vint16m4_t raw = __riscv_vle16_v_i16m4(inputVector, vl);
        vfloat32m8_t v = __riscv_vfwcvt_f(raw, vl);
        __riscv_vse32(outputVector, __riscv_vfmul(v, invScalar, vl), vl);
    }
}
634#endif /*LV_HAVE_RVV*/
635
636#endif /* INCLUDED_volk_16i_s32f_convert_32f_a_H */