33#ifndef INCLUDED_volk_32fc_convert_16ic_a_H
34#define INCLUDED_volk_32fc_convert_16ic_a_H
/*
 * AVX2 kernel using aligned loads/stores: clamps float values to the int16_t
 * range, converts them to integers and stores them as int16_t. The vector
 * loop emits 16 outputs per iteration; a scalar loop handles the rest of the
 * num_points * 2 values.
 * NOTE(review): several lines of this function (input parameter, braces,
 * some declarations and pointer increments) are not visible in this excerpt;
 * the comments below describe only the statements that are shown.
 */
43static inline void volk_32fc_convert_16ic_a_avx2(
lv_16sc_t* outputVector,
45 unsigned int num_points)
// Vector-loop trip count; the scalar tail resumes at avx_iters * 16.
47 const unsigned int avx_iters = num_points / 8;
// View both buffers as flat scalar arrays (float in, int16_t out).
49 float* inputVectorPtr = (
float*)inputVector;
50 int16_t* outputVectorPtr = (int16_t*)outputVector;
// Saturation limits: the int16_t range expressed as floats.
53 const float min_val = (float)SHRT_MIN;
54 const float max_val = (float)SHRT_MAX;
56 __m256 inputVal1, inputVal2;
57 __m256i intInputVal1, intInputVal2;
// Broadcast the limits to all eight lanes.
59 const __m256 vmin_val = _mm256_set1_ps(min_val);
60 const __m256 vmax_val = _mm256_set1_ps(max_val);
63 for (i = 0; i < avx_iters; i++) {
// Aligned 256-bit loads (8 floats each).
// NOTE(review): both loads read inputVectorPtr as shown; the pointer advance
// between them is presumably on a line missing from this excerpt - confirm.
64 inputVal1 = _mm256_load_ps((
float*)inputVectorPtr);
66 inputVal2 = _mm256_load_ps((
float*)inputVectorPtr);
// Clamp every lane to [min_val, max_val].
71 ret1 = _mm256_max_ps(_mm256_min_ps(inputVal1, vmax_val), vmin_val);
72 ret2 = _mm256_max_ps(_mm256_min_ps(inputVal2, vmax_val), vmin_val);
// Float -> int32 conversion.
74 intInputVal1 = _mm256_cvtps_epi32(ret1);
75 intInputVal2 = _mm256_cvtps_epi32(ret2);
// Pack to int16 with signed saturation. The pack works within each 128-bit
// lane, so the 0xd8 permute reorders the 64-bit quarters as 0,2,1,3 to put
// the results of the first vector ahead of those of the second.
77 intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2);
78 intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0xd8);
// Aligned 256-bit store of 16 int16_t values.
80 _mm256_store_si256((__m256i*)outputVectorPtr, intInputVal1);
81 outputVectorPtr += 16;
// Scalar tail for the values the vector loop did not cover.
// NOTE(review): only the lower-bound branch of the clamp is visible here.
84 for (i = avx_iters * 16; i < num_points * 2; i++) {
85 aux = *inputVectorPtr++;
88 else if (aux < min_val)
// Round with rintf, then narrow to int16_t.
90 *outputVectorPtr++ = (int16_t)rintf(aux);
/*
 * AVX-512 kernel using aligned loads/stores: clamps float values to the
 * int16_t range and stores them as int16_t, 16 values per vector iteration,
 * with a scalar loop for the rest of the num_points * 2 values.
 * NOTE(review): several lines of this function (input parameter, braces,
 * some declarations) are not visible in this excerpt; the comments below
 * describe only the statements that are shown.
 */
98static inline void volk_32fc_convert_16ic_a_avx512(
lv_16sc_t* outputVector,
100 unsigned int num_points)
// Vector-loop trip count; the scalar tail resumes at avx512_iters * 16.
102 const unsigned int avx512_iters = num_points / 8;
// View both buffers as flat scalar arrays (float in, int16_t out).
104 float* inputVectorPtr = (
float*)inputVector;
105 int16_t* outputVectorPtr = (int16_t*)outputVector;
// Saturation limits: the int16_t range expressed as floats.
108 const float min_val = (float)SHRT_MIN;
109 const float max_val = (float)SHRT_MAX;
// Broadcast the limits to all sixteen lanes.
114 const __m512 vmin_val = _mm512_set1_ps(min_val);
115 const __m512 vmax_val = _mm512_set1_ps(max_val);
118 for (i = 0; i < avx512_iters; i++) {
// Aligned 512-bit load of 16 floats.
119 inputVal1 = _mm512_load_ps((
float*)inputVectorPtr);
120 inputVectorPtr += 16;
// Clamp every lane to [min_val, max_val].
124 ret1 = _mm512_max_ps(_mm512_min_ps(inputVal1, vmax_val), vmin_val);
// Float -> int32, then narrow 16 x int32 to 16 x int16 with signed
// saturation; the result is a 256-bit vector.
127 intInputVal = _mm512_cvtsepi32_epi16(_mm512_cvtps_epi32(ret1));
// Aligned 256-bit store of 16 int16_t values.
129 _mm256_store_si256((__m256i*)outputVectorPtr, intInputVal);
130 outputVectorPtr += 16;
// Scalar tail for the values the vector loop did not cover.
// NOTE(review): only the lower-bound branch of the clamp is visible here.
133 for (i = avx512_iters * 16; i < num_points * 2; i++) {
134 aux = *inputVectorPtr++;
137 else if (aux < min_val)
// Round with rintf, then narrow to int16_t.
139 *outputVectorPtr++ = (int16_t)rintf(aux);
145#include <emmintrin.h>
/*
 * SSE2 kernel using aligned loads/stores: clamps float values to the int16_t
 * range and stores them as int16_t, 8 values per vector iteration, with a
 * scalar loop for the rest of the num_points * 2 values.
 * NOTE(review): the first lines of this function's signature, its braces and
 * some declarations are not visible in this excerpt; the comments below
 * describe only the statements that are shown.
 */
149 unsigned int num_points)
// Vector-loop trip count; the scalar tail resumes at sse_iters * 8.
151 const unsigned int sse_iters = num_points / 4;
// View both buffers as flat scalar arrays (float in, int16_t out).
153 float* inputVectorPtr = (
float*)inputVector;
154 int16_t* outputVectorPtr = (int16_t*)outputVector;
// Saturation limits: the int16_t range expressed as floats.
157 const float min_val = (float)SHRT_MIN;
158 const float max_val = (float)SHRT_MAX;
160 __m128 inputVal1, inputVal2;
161 __m128i intInputVal1, intInputVal2;
// Broadcast the limits to all four lanes.
163 const __m128 vmin_val = _mm_set_ps1(min_val);
164 const __m128 vmax_val = _mm_set_ps1(max_val);
167 for (i = 0; i < sse_iters; i++) {
// Aligned 128-bit loads (4 floats each).
// NOTE(review): both loads read inputVectorPtr as shown; the pointer advance
// between them is presumably on a line missing from this excerpt - confirm.
168 inputVal1 = _mm_load_ps((
float*)inputVectorPtr);
170 inputVal2 = _mm_load_ps((
float*)inputVectorPtr);
// Clamp every lane to [min_val, max_val].
175 ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val);
176 ret2 = _mm_max_ps(_mm_min_ps(inputVal2, vmax_val), vmin_val);
// Float -> int32 conversion.
178 intInputVal1 = _mm_cvtps_epi32(ret1);
179 intInputVal2 = _mm_cvtps_epi32(ret2);
// Pack 2 x 4 int32 into 8 int16 with signed saturation.
181 intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
// Aligned 128-bit store of 8 int16_t values.
183 _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1);
184 outputVectorPtr += 8;
// Scalar tail for the values the vector loop did not cover.
// NOTE(review): only the lower-bound branch of the clamp is visible here.
187 for (i = sse_iters * 8; i < num_points * 2; i++) {
188 aux = *inputVectorPtr++;
191 else if (aux < min_val)
// Round with rintf, then narrow to int16_t.
193 *outputVectorPtr++ = (int16_t)rintf(aux);
/*
 * NEON kernel: clamps float values to the int16_t range, rounds them to the
 * nearest integer with an add-half-then-truncate sequence, and stores them
 * as int16_t, 8 values per vector iteration, with a scalar loop for the rest
 * of the num_points * 2 values.
 * NOTE(review): several lines of this function (input parameter, braces,
 * some declarations and pointer increments) are not visible in this excerpt;
 * the comments below describe only the statements that are shown.
 */
202static inline void volk_32fc_convert_16ic_neon(
lv_16sc_t* outputVector,
204 unsigned int num_points)
// Vector-loop trip count; the scalar tail resumes at neon_iters * 8.
207 const unsigned int neon_iters = num_points / 4;
// View both buffers as flat scalar arrays (float32_t in, int16_t out).
209 float32_t* inputVectorPtr = (float32_t*)inputVector;
210 int16_t* outputVectorPtr = (int16_t*)outputVector;
// Saturation limits: the int16_t range expressed as floats.
212 const float min_val_f = (float)SHRT_MIN;
213 const float max_val_f = (float)SHRT_MAX;
// Broadcast the limits and the 0.5 rounding offset to all four lanes.
217 const float32x4_t min_val = vmovq_n_f32(min_val_f);
218 const float32x4_t max_val = vmovq_n_f32(max_val_f);
219 float32x4_t half = vdupq_n_f32(0.5f);
220 float32x4_t ret1, ret2, a, b, sign, PlusHalf, Round;
222 int32x4_t toint_a = { 0, 0, 0, 0 };
223 int32x4_t toint_b = { 0, 0, 0, 0 };
224 int16x4_t intInputVal1, intInputVal2;
227 for (i = 0; i < neon_iters; i++) {
// Load 4 floats into each of a and b.
// NOTE(review): both loads read inputVectorPtr as shown; the pointer advance
// between them is presumably on a line missing from this excerpt - confirm.
228 a = vld1q_f32((
const float32_t*)(inputVectorPtr));
230 b = vld1q_f32((
const float32_t*)(inputVectorPtr));
// Clamp every lane to [min_val, max_val].
234 ret1 = vmaxq_f32(vminq_f32(a, max_val), min_val);
235 ret2 = vmaxq_f32(vminq_f32(b, max_val), min_val);
// Rounding: sign is 1.0f where the sign bit is set and 0.0f otherwise, so
// Round is x + 0.5 for non-negative lanes and x - 0.5 for negative lanes;
// the truncating vcvtq_s32_f32 then yields round-to-nearest with ties away
// from zero.
// NOTE(review): the scalar tail below uses rintf instead, so tie cases may
// round differently between the two paths.
237 sign = vcvtq_f32_u32((vshrq_n_u32(vreinterpretq_u32_f32(ret1), 31)));
238 PlusHalf = vaddq_f32(ret1, half);
239 Round = vsubq_f32(PlusHalf, sign);
240 toint_a = vcvtq_s32_f32(Round);
// Same rounding sequence for the second vector.
242 sign = vcvtq_f32_u32((vshrq_n_u32(vreinterpretq_u32_f32(ret2), 31)));
243 PlusHalf = vaddq_f32(ret2, half);
244 Round = vsubq_f32(PlusHalf, sign);
245 toint_b = vcvtq_s32_f32(Round);
// Saturating narrow from int32 to int16.
247 intInputVal1 = vqmovn_s32(toint_a);
248 intInputVal2 = vqmovn_s32(toint_b);
// Join the two halves and store 8 int16_t values.
250 res = vcombine_s16(intInputVal1, intInputVal2);
251 vst1q_s16((int16_t*)outputVectorPtr, res);
252 outputVectorPtr += 8;
// Scalar tail for the values the vector loop did not cover.
// NOTE(review): only the lower-bound branch of the clamp is visible here.
255 for (i = neon_iters * 8; i < num_points * 2; i++) {
256 aux = *inputVectorPtr++;
259 else if (aux < min_val_f)
// Round with rintf, then narrow to int16_t.
261 *outputVectorPtr++ = (int16_t)rintf(aux);
/*
 * NEON (ARMv8) kernel: clamps float values to the int16_t range, rounds them
 * with vrndiq_f32 and stores them as int16_t, 8 values per vector iteration,
 * with a scalar loop for the rest of the num_points * 2 values.
 * NOTE(review): several lines of this function (input parameter, braces,
 * some declarations and pointer increments) are not visible in this excerpt;
 * the comments below describe only the statements that are shown.
 */
270static inline void volk_32fc_convert_16ic_neonv8(
lv_16sc_t* outputVector,
272 unsigned int num_points)
// Vector-loop trip count; the scalar tail resumes at neon_iters * 8.
274 const unsigned int neon_iters = num_points / 4;
// View both buffers as flat scalar arrays (float32_t in, int16_t out).
276 float32_t* inputVectorPtr = (float32_t*)inputVector;
277 int16_t* outputVectorPtr = (int16_t*)outputVector;
// Saturation limits: the int16_t range expressed as floats.
279 const float min_val_f = (float)SHRT_MIN;
280 const float max_val_f = (float)SHRT_MAX;
// Broadcast the limits to all four lanes.
284 const float32x4_t min_val = vmovq_n_f32(min_val_f);
285 const float32x4_t max_val = vmovq_n_f32(max_val_f);
286 float32x4_t ret1, ret2, a, b;
288 int32x4_t toint_a = { 0, 0, 0, 0 }, toint_b = { 0, 0, 0, 0 };
289 int16x4_t intInputVal1, intInputVal2;
292 for (i = 0; i < neon_iters; i++) {
// Load 4 floats into each of a and b.
// NOTE(review): both loads read inputVectorPtr as shown; the pointer advance
// between them is presumably on a line missing from this excerpt - confirm.
293 a = vld1q_f32((
const float32_t*)(inputVectorPtr));
295 b = vld1q_f32((
const float32_t*)(inputVectorPtr));
// Clamp every lane to [min_val, max_val].
299 ret1 = vmaxq_f32(vminq_f32(a, max_val), min_val);
300 ret2 = vmaxq_f32(vminq_f32(b, max_val), min_val);
// Round to an integral float value (vrndiq_f32 uses the current rounding
// mode), then convert to int32.
303 toint_a = vcvtq_s32_f32(vrndiq_f32(ret1));
304 toint_b = vcvtq_s32_f32(vrndiq_f32(ret2));
// Saturating narrow from int32 to int16.
306 intInputVal1 = vqmovn_s32(toint_a);
307 intInputVal2 = vqmovn_s32(toint_b);
// Join the two halves and store 8 int16_t values.
309 res = vcombine_s16(intInputVal1, intInputVal2);
310 vst1q_s16((int16_t*)outputVectorPtr, res);
311 outputVectorPtr += 8;
// Scalar tail for the values the vector loop did not cover.
// NOTE(review): only the lower-bound branch of the clamp is visible here.
314 for (i = neon_iters * 8; i < num_points * 2; i++) {
315 aux = *inputVectorPtr++;
318 else if (aux < min_val_f)
// Round with rintf, then narrow to int16_t.
320 *outputVectorPtr++ = (int16_t)rintf(aux);
326#ifdef LV_HAVE_GENERIC
/*
 * Portable scalar kernel: walks num_points * 2 float values, rounds each with
 * rintf and stores it as int16_t.
 * NOTE(review): the first lines of this function's signature, its braces,
 * some declarations and the upper-bound clamp branch are not visible in this
 * excerpt; the comments below describe only the statements that are shown.
 */
330 unsigned int num_points)
// View both buffers as flat scalar arrays (float in, int16_t out).
332 float* inputVectorPtr = (
float*)inputVector;
333 int16_t* outputVectorPtr = (int16_t*)outputVector;
// Saturation limits: the int16_t range expressed as floats.
334 const float min_val = (float)SHRT_MIN;
335 const float max_val = (float)SHRT_MAX;
// One iteration per scalar value.
338 for (i = 0; i < num_points * 2; i++) {
339 aux = *inputVectorPtr++;
342 else if (aux < min_val)
// Round with rintf, then narrow to int16_t.
344 *outputVectorPtr++ = (int16_t)rintf(aux);
351#ifndef INCLUDED_volk_32fc_convert_16ic_u_H
352#define INCLUDED_volk_32fc_convert_16ic_u_H
360#include <immintrin.h>
/*
 * AVX2 kernel using unaligned loads/stores: clamps float values to the
 * int16_t range, converts them to integers and stores them as int16_t. The
 * vector loop emits 16 outputs per iteration; a scalar loop handles the rest
 * of the num_points * 2 values.
 * NOTE(review): several lines of this function (input parameter, braces,
 * some declarations and pointer increments) are not visible in this excerpt;
 * the comments below describe only the statements that are shown.
 */
362static inline void volk_32fc_convert_16ic_u_avx2(
lv_16sc_t* outputVector,
364 unsigned int num_points)
// Vector-loop trip count; the scalar tail resumes at avx_iters * 16.
366 const unsigned int avx_iters = num_points / 8;
// View both buffers as flat scalar arrays (float in, int16_t out).
368 float* inputVectorPtr = (
float*)inputVector;
369 int16_t* outputVectorPtr = (int16_t*)outputVector;
// Saturation limits: the int16_t range expressed as floats.
372 const float min_val = (float)SHRT_MIN;
373 const float max_val = (float)SHRT_MAX;
375 __m256 inputVal1, inputVal2;
376 __m256i intInputVal1, intInputVal2;
// Broadcast the limits to all eight lanes.
378 const __m256 vmin_val = _mm256_set1_ps(min_val);
379 const __m256 vmax_val = _mm256_set1_ps(max_val);
382 for (i = 0; i < avx_iters; i++) {
// Unaligned 256-bit loads (8 floats each).
// NOTE(review): both loads read inputVectorPtr as shown; the pointer advance
// between them is presumably on a line missing from this excerpt - confirm.
383 inputVal1 = _mm256_loadu_ps((
float*)inputVectorPtr);
385 inputVal2 = _mm256_loadu_ps((
float*)inputVectorPtr);
// Clamp every lane to [min_val, max_val].
390 ret1 = _mm256_max_ps(_mm256_min_ps(inputVal1, vmax_val), vmin_val);
391 ret2 = _mm256_max_ps(_mm256_min_ps(inputVal2, vmax_val), vmin_val);
// Float -> int32 conversion.
393 intInputVal1 = _mm256_cvtps_epi32(ret1);
394 intInputVal2 = _mm256_cvtps_epi32(ret2);
// Pack to int16 with signed saturation. The pack works within each 128-bit
// lane, so the 0xd8 permute reorders the 64-bit quarters as 0,2,1,3 to put
// the results of the first vector ahead of those of the second.
396 intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2);
397 intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0xd8);
// Unaligned 256-bit store of 16 int16_t values.
399 _mm256_storeu_si256((__m256i*)outputVectorPtr, intInputVal1);
400 outputVectorPtr += 16;
// Scalar tail for the values the vector loop did not cover.
// NOTE(review): only the lower-bound branch of the clamp is visible here.
403 for (i = avx_iters * 16; i < num_points * 2; i++) {
404 aux = *inputVectorPtr++;
407 else if (aux < min_val)
// Round with rintf, then narrow to int16_t.
409 *outputVectorPtr++ = (int16_t)rintf(aux);
414#ifdef LV_HAVE_AVX512F
415#include <immintrin.h>
/*
 * AVX-512 kernel using unaligned loads/stores: clamps float values to the
 * int16_t range and stores them as int16_t, 16 values per vector iteration,
 * with a scalar loop for the rest of the num_points * 2 values.
 * NOTE(review): several lines of this function (input parameter, braces,
 * some declarations) are not visible in this excerpt; the comments below
 * describe only the statements that are shown.
 */
417static inline void volk_32fc_convert_16ic_u_avx512(
lv_16sc_t* outputVector,
419 unsigned int num_points)
// Vector-loop trip count; the scalar tail resumes at avx512_iters * 16.
421 const unsigned int avx512_iters = num_points / 8;
// View both buffers as flat scalar arrays (float in, int16_t out).
423 float* inputVectorPtr = (
float*)inputVector;
424 int16_t* outputVectorPtr = (int16_t*)outputVector;
// Saturation limits: the int16_t range expressed as floats.
427 const float min_val = (float)SHRT_MIN;
428 const float max_val = (float)SHRT_MAX;
// Broadcast the limits to all sixteen lanes.
433 const __m512 vmin_val = _mm512_set1_ps(min_val);
434 const __m512 vmax_val = _mm512_set1_ps(max_val);
437 for (i = 0; i < avx512_iters; i++) {
// Unaligned 512-bit load of 16 floats.
438 inputVal1 = _mm512_loadu_ps((
float*)inputVectorPtr);
439 inputVectorPtr += 16;
// Clamp every lane to [min_val, max_val].
443 ret1 = _mm512_max_ps(_mm512_min_ps(inputVal1, vmax_val), vmin_val);
// Float -> int32, then narrow 16 x int32 to 16 x int16 with signed
// saturation; the result is a 256-bit vector.
446 intInputVal = _mm512_cvtsepi32_epi16(_mm512_cvtps_epi32(ret1));
// Unaligned 256-bit store of 16 int16_t values.
448 _mm256_storeu_si256((__m256i*)outputVectorPtr, intInputVal);
449 outputVectorPtr += 16;
// Scalar tail for the values the vector loop did not cover.
// NOTE(review): only the lower-bound branch of the clamp is visible here.
452 for (i = avx512_iters * 16; i < num_points * 2; i++) {
453 aux = *inputVectorPtr++;
456 else if (aux < min_val)
// Round with rintf, then narrow to int16_t.
458 *outputVectorPtr++ = (int16_t)rintf(aux);
465#include <emmintrin.h>
/*
 * SSE2 kernel using unaligned loads/stores: clamps float values to the
 * int16_t range and stores them as int16_t, 8 values per vector iteration,
 * with a scalar loop for the rest of the num_points * 2 values.
 * NOTE(review): the first lines of this function's signature, its braces and
 * some declarations are not visible in this excerpt; the comments below
 * describe only the statements that are shown.
 */
469 unsigned int num_points)
// Vector-loop trip count; the scalar tail resumes at sse_iters * 8.
471 const unsigned int sse_iters = num_points / 4;
// View both buffers as flat scalar arrays (float in, int16_t out).
473 float* inputVectorPtr = (
float*)inputVector;
474 int16_t* outputVectorPtr = (int16_t*)outputVector;
// Saturation limits: the int16_t range expressed as floats.
477 const float min_val = (float)SHRT_MIN;
478 const float max_val = (float)SHRT_MAX;
480 __m128 inputVal1, inputVal2;
481 __m128i intInputVal1, intInputVal2;
// Broadcast the limits to all four lanes.
483 const __m128 vmin_val = _mm_set_ps1(min_val);
484 const __m128 vmax_val = _mm_set_ps1(max_val);
487 for (i = 0; i < sse_iters; i++) {
// Unaligned 128-bit loads (4 floats each).
// NOTE(review): both loads read inputVectorPtr as shown; the pointer advance
// between them is presumably on a line missing from this excerpt - confirm.
488 inputVal1 = _mm_loadu_ps((
float*)inputVectorPtr);
490 inputVal2 = _mm_loadu_ps((
float*)inputVectorPtr);
// Clamp every lane to [min_val, max_val].
495 ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val);
496 ret2 = _mm_max_ps(_mm_min_ps(inputVal2, vmax_val), vmin_val);
// Float -> int32 conversion.
498 intInputVal1 = _mm_cvtps_epi32(ret1);
499 intInputVal2 = _mm_cvtps_epi32(ret2);
// Pack 2 x 4 int32 into 8 int16 with signed saturation.
501 intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
// Unaligned 128-bit store of 8 int16_t values.
503 _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1);
504 outputVectorPtr += 8;
// Scalar tail for the values the vector loop did not cover.
// NOTE(review): only the lower-bound branch of the clamp is visible here.
507 for (i = sse_iters * 8; i < num_points * 2; i++) {
508 aux = *inputVectorPtr++;
511 else if (aux < min_val)
// Round with rintf, then narrow to int16_t.
513 *outputVectorPtr++ = (int16_t)rintf(aux);
519#include <riscv_vector.h>
/*
 * RISC-V Vector kernel: converts num_points * 2 float values to int16_t in
 * strips whose length is chosen by vsetvl on each pass.
 * NOTE(review): the input parameter line and the closing braces are not
 * visible in this excerpt; the comments below describe only the statements
 * that are shown.
 */
521static inline void volk_32fc_convert_16ic_rvv(
lv_16sc_t* outputVector,
523 unsigned int num_points)
// View both buffers as flat scalar arrays (float in, int16_t out).
525 int16_t* out = (int16_t*)outputVector;
526 float* in = (
float*)inputVector;
// Total number of scalar values still to convert.
527 size_t n = num_points * 2;
// Each pass handles vl elements and advances both pointers by vl.
528 for (
size_t vl; n > 0; n -= vl, in += vl, out += vl) {
// Ask for up to n 32-bit elements at LMUL=8; vl is the granted count.
529 vl = __riscv_vsetvl_e32m8(n);
530 vfloat32m8_t v = __riscv_vle32_v_f32m8(in, vl);
// Narrowing float32 -> int16 conversion followed by a 16-bit store.
// NOTE(review): no explicit clamp to the int16_t range is visible here;
// presumably the narrowing conversion itself saturates - confirm against
// the RVV specification.
531 __riscv_vse16(out, __riscv_vfncvt_x(v, vl), vl);