Vector Optimized Library of Kernels 3.3.0
Architecture-tuned implementations of math kernels
Loading...
Searching...
No Matches
volk_32fc_convert_16ic.h
Go to the documentation of this file.
1/* -*- c++ -*- */
2/*
3 * Copyright 2016 Free Software Foundation, Inc.
4 *
5 * This file is part of VOLK
6 *
7 * SPDX-License-Identifier: LGPL-3.0-or-later
8 */
9
32
33#ifndef INCLUDED_volk_32fc_convert_16ic_a_H
34#define INCLUDED_volk_32fc_convert_16ic_a_H
35
36#include "volk/volk_complex.h"
37#include <limits.h>
38#include <math.h>
39
40#ifdef LV_HAVE_AVX2
41#include <immintrin.h>
42
43static inline void volk_32fc_convert_16ic_a_avx2(lv_16sc_t* outputVector,
44 const lv_32fc_t* inputVector,
45 unsigned int num_points)
46{
47 const unsigned int avx_iters = num_points / 8;
48
49 float* inputVectorPtr = (float*)inputVector;
50 int16_t* outputVectorPtr = (int16_t*)outputVector;
51 float aux;
52
53 const float min_val = (float)SHRT_MIN;
54 const float max_val = (float)SHRT_MAX;
55
56 __m256 inputVal1, inputVal2;
57 __m256i intInputVal1, intInputVal2;
58 __m256 ret1, ret2;
59 const __m256 vmin_val = _mm256_set1_ps(min_val);
60 const __m256 vmax_val = _mm256_set1_ps(max_val);
61 unsigned int i;
62
63 for (i = 0; i < avx_iters; i++) {
64 inputVal1 = _mm256_load_ps((float*)inputVectorPtr);
65 inputVectorPtr += 8;
66 inputVal2 = _mm256_load_ps((float*)inputVectorPtr);
67 inputVectorPtr += 8;
68 __VOLK_PREFETCH(inputVectorPtr + 16);
69
70 // Clip
71 ret1 = _mm256_max_ps(_mm256_min_ps(inputVal1, vmax_val), vmin_val);
72 ret2 = _mm256_max_ps(_mm256_min_ps(inputVal2, vmax_val), vmin_val);
73
74 intInputVal1 = _mm256_cvtps_epi32(ret1);
75 intInputVal2 = _mm256_cvtps_epi32(ret2);
76
77 intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2);
78 intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0xd8);
79
80 _mm256_store_si256((__m256i*)outputVectorPtr, intInputVal1);
81 outputVectorPtr += 16;
82 }
83
84 for (i = avx_iters * 16; i < num_points * 2; i++) {
85 aux = *inputVectorPtr++;
86 if (aux > max_val)
87 aux = max_val;
88 else if (aux < min_val)
89 aux = min_val;
90 *outputVectorPtr++ = (int16_t)rintf(aux);
91 }
92}
93#endif /* LV_HAVE_AVX2 */
94
95#ifdef LV_HAVE_AVX512F
96#include <immintrin.h>
97
98static inline void volk_32fc_convert_16ic_a_avx512(lv_16sc_t* outputVector,
99 const lv_32fc_t* inputVector,
100 unsigned int num_points)
101{
102 const unsigned int avx512_iters = num_points / 8;
103
104 float* inputVectorPtr = (float*)inputVector;
105 int16_t* outputVectorPtr = (int16_t*)outputVector;
106 float aux;
107
108 const float min_val = (float)SHRT_MIN;
109 const float max_val = (float)SHRT_MAX;
110
111 __m512 inputVal1;
112 __m256i intInputVal;
113 __m512 ret1;
114 const __m512 vmin_val = _mm512_set1_ps(min_val);
115 const __m512 vmax_val = _mm512_set1_ps(max_val);
116 unsigned int i;
117
118 for (i = 0; i < avx512_iters; i++) {
119 inputVal1 = _mm512_load_ps((float*)inputVectorPtr);
120 inputVectorPtr += 16;
121 __VOLK_PREFETCH(inputVectorPtr + 16);
122
123 // Clip
124 ret1 = _mm512_max_ps(_mm512_min_ps(inputVal1, vmax_val), vmin_val);
125
126 // Convert float to int32, then pack to int16 with saturation
127 intInputVal = _mm512_cvtsepi32_epi16(_mm512_cvtps_epi32(ret1));
128
129 _mm256_store_si256((__m256i*)outputVectorPtr, intInputVal);
130 outputVectorPtr += 16;
131 }
132
133 for (i = avx512_iters * 16; i < num_points * 2; i++) {
134 aux = *inputVectorPtr++;
135 if (aux > max_val)
136 aux = max_val;
137 else if (aux < min_val)
138 aux = min_val;
139 *outputVectorPtr++ = (int16_t)rintf(aux);
140 }
141}
142#endif /* LV_HAVE_AVX512F */
143
144#ifdef LV_HAVE_SSE2
145#include <emmintrin.h>
146
147static inline void volk_32fc_convert_16ic_a_sse2(lv_16sc_t* outputVector,
148 const lv_32fc_t* inputVector,
149 unsigned int num_points)
150{
151 const unsigned int sse_iters = num_points / 4;
152
153 float* inputVectorPtr = (float*)inputVector;
154 int16_t* outputVectorPtr = (int16_t*)outputVector;
155 float aux;
156
157 const float min_val = (float)SHRT_MIN;
158 const float max_val = (float)SHRT_MAX;
159
160 __m128 inputVal1, inputVal2;
161 __m128i intInputVal1, intInputVal2;
162 __m128 ret1, ret2;
163 const __m128 vmin_val = _mm_set_ps1(min_val);
164 const __m128 vmax_val = _mm_set_ps1(max_val);
165 unsigned int i;
166
167 for (i = 0; i < sse_iters; i++) {
168 inputVal1 = _mm_load_ps((float*)inputVectorPtr);
169 inputVectorPtr += 4;
170 inputVal2 = _mm_load_ps((float*)inputVectorPtr);
171 inputVectorPtr += 4;
172 __VOLK_PREFETCH(inputVectorPtr + 8);
173
174 // Clip
175 ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val);
176 ret2 = _mm_max_ps(_mm_min_ps(inputVal2, vmax_val), vmin_val);
177
178 intInputVal1 = _mm_cvtps_epi32(ret1);
179 intInputVal2 = _mm_cvtps_epi32(ret2);
180
181 intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
182
183 _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1);
184 outputVectorPtr += 8;
185 }
186
187 for (i = sse_iters * 8; i < num_points * 2; i++) {
188 aux = *inputVectorPtr++;
189 if (aux > max_val)
190 aux = max_val;
191 else if (aux < min_val)
192 aux = min_val;
193 *outputVectorPtr++ = (int16_t)rintf(aux);
194 }
195}
196#endif /* LV_HAVE_SSE2 */
197
198
199#if LV_HAVE_NEONV7
200#include <arm_neon.h>
201
202static inline void volk_32fc_convert_16ic_neon(lv_16sc_t* outputVector,
203 const lv_32fc_t* inputVector,
204 unsigned int num_points)
205{
206
207 const unsigned int neon_iters = num_points / 4;
208
209 float32_t* inputVectorPtr = (float32_t*)inputVector;
210 int16_t* outputVectorPtr = (int16_t*)outputVector;
211
212 const float min_val_f = (float)SHRT_MIN;
213 const float max_val_f = (float)SHRT_MAX;
214 float32_t aux;
215 unsigned int i;
216
217 const float32x4_t min_val = vmovq_n_f32(min_val_f);
218 const float32x4_t max_val = vmovq_n_f32(max_val_f);
219 float32x4_t half = vdupq_n_f32(0.5f);
220 float32x4_t ret1, ret2, a, b, sign, PlusHalf, Round;
221
222 int32x4_t toint_a = { 0, 0, 0, 0 };
223 int32x4_t toint_b = { 0, 0, 0, 0 };
224 int16x4_t intInputVal1, intInputVal2;
225 int16x8_t res;
226
227 for (i = 0; i < neon_iters; i++) {
228 a = vld1q_f32((const float32_t*)(inputVectorPtr));
229 inputVectorPtr += 4;
230 b = vld1q_f32((const float32_t*)(inputVectorPtr));
231 inputVectorPtr += 4;
232 __VOLK_PREFETCH(inputVectorPtr + 8);
233
234 ret1 = vmaxq_f32(vminq_f32(a, max_val), min_val);
235 ret2 = vmaxq_f32(vminq_f32(b, max_val), min_val);
236
237 sign = vcvtq_f32_u32((vshrq_n_u32(vreinterpretq_u32_f32(ret1), 31)));
238 PlusHalf = vaddq_f32(ret1, half);
239 Round = vsubq_f32(PlusHalf, sign);
240 toint_a = vcvtq_s32_f32(Round);
241
242 sign = vcvtq_f32_u32((vshrq_n_u32(vreinterpretq_u32_f32(ret2), 31)));
243 PlusHalf = vaddq_f32(ret2, half);
244 Round = vsubq_f32(PlusHalf, sign);
245 toint_b = vcvtq_s32_f32(Round);
246
247 intInputVal1 = vqmovn_s32(toint_a);
248 intInputVal2 = vqmovn_s32(toint_b);
249
250 res = vcombine_s16(intInputVal1, intInputVal2);
251 vst1q_s16((int16_t*)outputVectorPtr, res);
252 outputVectorPtr += 8;
253 }
254
255 for (i = neon_iters * 8; i < num_points * 2; i++) {
256 aux = *inputVectorPtr++;
257 if (aux > max_val_f)
258 aux = max_val_f;
259 else if (aux < min_val_f)
260 aux = min_val_f;
261 *outputVectorPtr++ = (int16_t)rintf(aux);
262 }
263}
264
265#endif /* LV_HAVE_NEONV7 */
266
267#if LV_HAVE_NEONV8
268#include <arm_neon.h>
269
270static inline void volk_32fc_convert_16ic_neonv8(lv_16sc_t* outputVector,
271 const lv_32fc_t* inputVector,
272 unsigned int num_points)
273{
274 const unsigned int neon_iters = num_points / 4;
275
276 float32_t* inputVectorPtr = (float32_t*)inputVector;
277 int16_t* outputVectorPtr = (int16_t*)outputVector;
278
279 const float min_val_f = (float)SHRT_MIN;
280 const float max_val_f = (float)SHRT_MAX;
281 float32_t aux;
282 unsigned int i;
283
284 const float32x4_t min_val = vmovq_n_f32(min_val_f);
285 const float32x4_t max_val = vmovq_n_f32(max_val_f);
286 float32x4_t ret1, ret2, a, b;
287
288 int32x4_t toint_a = { 0, 0, 0, 0 }, toint_b = { 0, 0, 0, 0 };
289 int16x4_t intInputVal1, intInputVal2;
290 int16x8_t res;
291
292 for (i = 0; i < neon_iters; i++) {
293 a = vld1q_f32((const float32_t*)(inputVectorPtr));
294 inputVectorPtr += 4;
295 b = vld1q_f32((const float32_t*)(inputVectorPtr));
296 inputVectorPtr += 4;
297 __VOLK_PREFETCH(inputVectorPtr + 8);
298
299 ret1 = vmaxq_f32(vminq_f32(a, max_val), min_val);
300 ret2 = vmaxq_f32(vminq_f32(b, max_val), min_val);
301
302 // vrndiq takes into account the current rounding mode (as does rintf)
303 toint_a = vcvtq_s32_f32(vrndiq_f32(ret1));
304 toint_b = vcvtq_s32_f32(vrndiq_f32(ret2));
305
306 intInputVal1 = vqmovn_s32(toint_a);
307 intInputVal2 = vqmovn_s32(toint_b);
308
309 res = vcombine_s16(intInputVal1, intInputVal2);
310 vst1q_s16((int16_t*)outputVectorPtr, res);
311 outputVectorPtr += 8;
312 }
313
314 for (i = neon_iters * 8; i < num_points * 2; i++) {
315 aux = *inputVectorPtr++;
316 if (aux > max_val_f)
317 aux = max_val_f;
318 else if (aux < min_val_f)
319 aux = min_val_f;
320 *outputVectorPtr++ = (int16_t)rintf(aux);
321 }
322}
323#endif /* LV_HAVE_NEONV8 */
324
325
326#ifdef LV_HAVE_GENERIC
327
328static inline void volk_32fc_convert_16ic_generic(lv_16sc_t* outputVector,
329 const lv_32fc_t* inputVector,
330 unsigned int num_points)
331{
332 float* inputVectorPtr = (float*)inputVector;
333 int16_t* outputVectorPtr = (int16_t*)outputVector;
334 const float min_val = (float)SHRT_MIN;
335 const float max_val = (float)SHRT_MAX;
336 float aux;
337 unsigned int i;
338 for (i = 0; i < num_points * 2; i++) {
339 aux = *inputVectorPtr++;
340 if (aux > max_val)
341 aux = max_val;
342 else if (aux < min_val)
343 aux = min_val;
344 *outputVectorPtr++ = (int16_t)rintf(aux);
345 }
346}
347#endif /* LV_HAVE_GENERIC */
348
349#endif /* INCLUDED_volk_32fc_convert_16ic_a_H */
350
351#ifndef INCLUDED_volk_32fc_convert_16ic_u_H
352#define INCLUDED_volk_32fc_convert_16ic_u_H
353
354#include "volk/volk_complex.h"
355#include <limits.h>
356#include <math.h>
357
358
359#ifdef LV_HAVE_AVX2
360#include <immintrin.h>
361
362static inline void volk_32fc_convert_16ic_u_avx2(lv_16sc_t* outputVector,
363 const lv_32fc_t* inputVector,
364 unsigned int num_points)
365{
366 const unsigned int avx_iters = num_points / 8;
367
368 float* inputVectorPtr = (float*)inputVector;
369 int16_t* outputVectorPtr = (int16_t*)outputVector;
370 float aux;
371
372 const float min_val = (float)SHRT_MIN;
373 const float max_val = (float)SHRT_MAX;
374
375 __m256 inputVal1, inputVal2;
376 __m256i intInputVal1, intInputVal2;
377 __m256 ret1, ret2;
378 const __m256 vmin_val = _mm256_set1_ps(min_val);
379 const __m256 vmax_val = _mm256_set1_ps(max_val);
380 unsigned int i;
381
382 for (i = 0; i < avx_iters; i++) {
383 inputVal1 = _mm256_loadu_ps((float*)inputVectorPtr);
384 inputVectorPtr += 8;
385 inputVal2 = _mm256_loadu_ps((float*)inputVectorPtr);
386 inputVectorPtr += 8;
387 __VOLK_PREFETCH(inputVectorPtr + 16);
388
389 // Clip
390 ret1 = _mm256_max_ps(_mm256_min_ps(inputVal1, vmax_val), vmin_val);
391 ret2 = _mm256_max_ps(_mm256_min_ps(inputVal2, vmax_val), vmin_val);
392
393 intInputVal1 = _mm256_cvtps_epi32(ret1);
394 intInputVal2 = _mm256_cvtps_epi32(ret2);
395
396 intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2);
397 intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0xd8);
398
399 _mm256_storeu_si256((__m256i*)outputVectorPtr, intInputVal1);
400 outputVectorPtr += 16;
401 }
402
403 for (i = avx_iters * 16; i < num_points * 2; i++) {
404 aux = *inputVectorPtr++;
405 if (aux > max_val)
406 aux = max_val;
407 else if (aux < min_val)
408 aux = min_val;
409 *outputVectorPtr++ = (int16_t)rintf(aux);
410 }
411}
412#endif /* LV_HAVE_AVX2 */
413
414#ifdef LV_HAVE_AVX512F
415#include <immintrin.h>
416
417static inline void volk_32fc_convert_16ic_u_avx512(lv_16sc_t* outputVector,
418 const lv_32fc_t* inputVector,
419 unsigned int num_points)
420{
421 const unsigned int avx512_iters = num_points / 8;
422
423 float* inputVectorPtr = (float*)inputVector;
424 int16_t* outputVectorPtr = (int16_t*)outputVector;
425 float aux;
426
427 const float min_val = (float)SHRT_MIN;
428 const float max_val = (float)SHRT_MAX;
429
430 __m512 inputVal1;
431 __m256i intInputVal;
432 __m512 ret1;
433 const __m512 vmin_val = _mm512_set1_ps(min_val);
434 const __m512 vmax_val = _mm512_set1_ps(max_val);
435 unsigned int i;
436
437 for (i = 0; i < avx512_iters; i++) {
438 inputVal1 = _mm512_loadu_ps((float*)inputVectorPtr);
439 inputVectorPtr += 16;
440 __VOLK_PREFETCH(inputVectorPtr + 16);
441
442 // Clip
443 ret1 = _mm512_max_ps(_mm512_min_ps(inputVal1, vmax_val), vmin_val);
444
445 // Convert float to int32, then pack to int16 with saturation
446 intInputVal = _mm512_cvtsepi32_epi16(_mm512_cvtps_epi32(ret1));
447
448 _mm256_storeu_si256((__m256i*)outputVectorPtr, intInputVal);
449 outputVectorPtr += 16;
450 }
451
452 for (i = avx512_iters * 16; i < num_points * 2; i++) {
453 aux = *inputVectorPtr++;
454 if (aux > max_val)
455 aux = max_val;
456 else if (aux < min_val)
457 aux = min_val;
458 *outputVectorPtr++ = (int16_t)rintf(aux);
459 }
460}
461#endif /* LV_HAVE_AVX512F */
462
463
464#ifdef LV_HAVE_SSE2
465#include <emmintrin.h>
466
467static inline void volk_32fc_convert_16ic_u_sse2(lv_16sc_t* outputVector,
468 const lv_32fc_t* inputVector,
469 unsigned int num_points)
470{
471 const unsigned int sse_iters = num_points / 4;
472
473 float* inputVectorPtr = (float*)inputVector;
474 int16_t* outputVectorPtr = (int16_t*)outputVector;
475 float aux;
476
477 const float min_val = (float)SHRT_MIN;
478 const float max_val = (float)SHRT_MAX;
479
480 __m128 inputVal1, inputVal2;
481 __m128i intInputVal1, intInputVal2;
482 __m128 ret1, ret2;
483 const __m128 vmin_val = _mm_set_ps1(min_val);
484 const __m128 vmax_val = _mm_set_ps1(max_val);
485
486 unsigned int i;
487 for (i = 0; i < sse_iters; i++) {
488 inputVal1 = _mm_loadu_ps((float*)inputVectorPtr);
489 inputVectorPtr += 4;
490 inputVal2 = _mm_loadu_ps((float*)inputVectorPtr);
491 inputVectorPtr += 4;
492 __VOLK_PREFETCH(inputVectorPtr + 8);
493
494 // Clip
495 ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val);
496 ret2 = _mm_max_ps(_mm_min_ps(inputVal2, vmax_val), vmin_val);
497
498 intInputVal1 = _mm_cvtps_epi32(ret1);
499 intInputVal2 = _mm_cvtps_epi32(ret2);
500
501 intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
502
503 _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1);
504 outputVectorPtr += 8;
505 }
506
507 for (i = sse_iters * 8; i < num_points * 2; i++) {
508 aux = *inputVectorPtr++;
509 if (aux > max_val)
510 aux = max_val;
511 else if (aux < min_val)
512 aux = min_val;
513 *outputVectorPtr++ = (int16_t)rintf(aux);
514 }
515}
516#endif /* LV_HAVE_SSE2 */
517
518#ifdef LV_HAVE_RVV
519#include <riscv_vector.h>
520
521static inline void volk_32fc_convert_16ic_rvv(lv_16sc_t* outputVector,
522 const lv_32fc_t* inputVector,
523 unsigned int num_points)
524{
525 int16_t* out = (int16_t*)outputVector;
526 float* in = (float*)inputVector;
527 size_t n = num_points * 2;
528 for (size_t vl; n > 0; n -= vl, in += vl, out += vl) {
529 vl = __riscv_vsetvl_e32m8(n);
530 vfloat32m8_t v = __riscv_vle32_v_f32m8(in, vl);
531 __riscv_vse16(out, __riscv_vfncvt_x(v, vl), vl);
532 }
533}
534#endif /*LV_HAVE_RVV*/
535
536#endif /* INCLUDED_volk_32fc_convert_16ic_u_H */