45#ifndef INCLUDED_volk_16ic_convert_32fc_a_H
46#define INCLUDED_volk_16ic_convert_32fc_a_H
53static inline void volk_16ic_convert_32fc_a_avx2(
lv_32fc_t* outputVector,
55 unsigned int num_points)
57 const unsigned int avx_iters = num_points / 4;
58 unsigned int number = 0;
59 const int16_t* complexVectorPtr = (int16_t*)inputVector;
60 float* outputVectorPtr = (
float*)outputVector;
65 for (number = 0; number < avx_iters; number++) {
66 cplxValue = _mm_load_si128((__m128i*)complexVectorPtr);
68 complexVectorPtr += 8;
70 outValInt = _mm256_cvtepi16_epi32(cplxValue);
71 outVal = _mm256_cvtepi32_ps(outValInt);
72 _mm256_store_ps((
float*)outputVectorPtr, outVal);
77 number = avx_iters * 8;
78 for (; number < num_points * 2; number++) {
79 *outputVectorPtr++ = (float)*complexVectorPtr++;
88static inline void volk_16ic_convert_32fc_a_avx512(
lv_32fc_t* outputVector,
90 unsigned int num_points)
92 const unsigned int avx512_iters = num_points / 8;
93 unsigned int number = 0;
94 const int16_t* complexVectorPtr = (int16_t*)inputVector;
95 float* outputVectorPtr = (
float*)outputVector;
100 for (number = 0; number < avx512_iters; number++) {
102 cplxValue = _mm256_load_si256((__m256i*)complexVectorPtr);
104 complexVectorPtr += 16;
107 outValInt = _mm512_cvtepi16_epi32(cplxValue);
108 outVal = _mm512_cvtepi32_ps(outValInt);
109 _mm512_store_ps((
float*)outputVectorPtr, outVal);
111 outputVectorPtr += 16;
114 number = avx512_iters * 16;
115 for (; number < num_points * 2; number++) {
116 *outputVectorPtr++ = (float)*complexVectorPtr++;
122#ifdef LV_HAVE_GENERIC
/* Fragment of the kernel guarded by LV_HAVE_GENERIC: only the last parameter
 * and the loop header are visible here. The function name, the remaining
 * parameters and the loop body are not shown. */
126 unsigned int num_points)
/* Runs once per point, i = 0 .. num_points - 1. */
129 for (i = 0; i < num_points; i++) {
139#include <emmintrin.h>
/* Fragment of an SSE kernel (follows the <emmintrin.h> include): the function
 * header, local declarations and most of the loop body are not visible. */
143 unsigned int num_points)
/* One SIMD iteration per two points; presumably two complex points, i.e.
 * four floats, per 128-bit store - confirm against the full body. */
145 const unsigned int sse_iters = num_points / 2;
152 for (number = 0; number < sse_iters; number++) {
/* Aligned 128-bit store of the converted floats held in `a`. */
159 _mm_store_ps((
float*)_out, a);
/* An odd num_points leaves one point that the SIMD loop did not cover;
 * the handling inside this branch is not visible here. */
163 if (num_points & 1) {
172#include <immintrin.h>
/* Fragment of an AVX kernel (follows the <immintrin.h> include): the function
 * header, most declarations and most of the loop body are not visible. */
176 unsigned int num_points)
/* One SIMD iteration per four points (the variable keeps the `sse_` name). */
178 const unsigned int sse_iters = num_points / 4;
183 unsigned int i, number;
185 for (number = 0; number < sse_iters; number++) {
/* Aligned 256-bit store of the eight converted floats held in `a`. */
196 _mm256_store_ps((
float*)_out, a);
/* Remainder loop: runs for the 0..3 points left over after the SIMD loop;
 * its body is not visible here. */
201 for (i = 0; i < (num_points % 4); ++i) {
/* Fragment of a NEON kernel: the function name, the other parameters, the
 * loop headers and the pointer/counter updates are not visible here. */
215 unsigned int num_points)
/* View both buffers as flat scalar arrays. */
217 const int16_t* _in = (
const int16_t*)inputVector;
218 float* _out = (
float*)outputVector;
/* Remaining point count. NOTE(review): the code that decrements it is not
 * visible in this excerpt. */
219 unsigned int n = num_points;
/* Load sixteen int16 scalars as four 4-lane vectors. */
224 int16x4_t v0 = vld1_s16(_in);
225 int16x4_t v1 = vld1_s16(_in + 4);
226 int16x4_t v2 = vld1_s16(_in + 8);
227 int16x4_t v3 = vld1_s16(_in + 12);
/* Widen each vector to 32-bit lanes, convert to float and store the
 * resulting four floats at the matching output offset. */
230 vst1q_f32(_out, vcvtq_f32_s32(vmovl_s16(v0)));
231 vst1q_f32(_out + 4, vcvtq_f32_s32(vmovl_s16(v1)));
232 vst1q_f32(_out + 8, vcvtq_f32_s32(vmovl_s16(v2)));
233 vst1q_f32(_out + 12, vcvtq_f32_s32(vmovl_s16(v3)));
/* Scalar conversion of two consecutive values (presumably the real and
 * imaginary parts of one point - confirm against the enclosing loop). */
242 *_out++ = (float)*_in++;
243 *_out++ = (float)*_in++;
/*
 * NEON (neonv8 variant) kernel converting 16-bit integer samples to 32-bit
 * floats.
 *
 * outputVector: destination buffer, accessed below as a flat float array.
 * num_points:   number of points to convert.
 *
 * NOTE(review): the inputVector parameter line, the loop headers and the
 * pointer/counter updates are not visible in this excerpt.
 */
251static inline void volk_16ic_convert_32fc_neonv8(
lv_32fc_t* outputVector,
253 unsigned int num_points)
/* View both buffers as flat scalar arrays. */
255 const int16_t* _in = (
const int16_t*)inputVector;
256 float* _out = (
float*)outputVector;
/* Remaining point count. */
257 unsigned int n = num_points;
/* Load sixteen int16 scalars as four 4-lane vectors. */
261 int16x4_t v0 = vld1_s16(_in);
262 int16x4_t v1 = vld1_s16(_in + 4);
263 int16x4_t v2 = vld1_s16(_in + 8);
264 int16x4_t v3 = vld1_s16(_in + 12);
/* Widen each vector to 32-bit lanes, convert to float and store the
 * resulting four floats at the matching output offset. */
267 vst1q_f32(_out, vcvtq_f32_s32(vmovl_s16(v0)));
268 vst1q_f32(_out + 4, vcvtq_f32_s32(vmovl_s16(v1)));
269 vst1q_f32(_out + 8, vcvtq_f32_s32(vmovl_s16(v2)));
270 vst1q_f32(_out + 12, vcvtq_f32_s32(vmovl_s16(v3)));
/* Scalar conversion of two consecutive values (presumably the real and
 * imaginary parts of one point - confirm against the enclosing loop). */
279 *_out++ = (float)*_in++;
280 *_out++ = (float)*_in++;
287#ifndef INCLUDED_volk_16ic_convert_32fc_u_H
288#define INCLUDED_volk_16ic_convert_32fc_u_H
294#include <immintrin.h>
296static inline void volk_16ic_convert_32fc_u_avx2(
lv_32fc_t* outputVector,
298 unsigned int num_points)
300 const unsigned int avx_iters = num_points / 4;
301 unsigned int number = 0;
302 const int16_t* complexVectorPtr = (int16_t*)inputVector;
303 float* outputVectorPtr = (
float*)outputVector;
308 for (number = 0; number < avx_iters; number++) {
309 cplxValue = _mm_loadu_si128((__m128i*)complexVectorPtr);
311 complexVectorPtr += 8;
313 outValInt = _mm256_cvtepi16_epi32(cplxValue);
314 outVal = _mm256_cvtepi32_ps(outValInt);
315 _mm256_storeu_ps((
float*)outputVectorPtr, outVal);
317 outputVectorPtr += 8;
320 number = avx_iters * 8;
321 for (; number < num_points * 2; number++) {
322 *outputVectorPtr++ = (float)*complexVectorPtr++;
328#ifdef LV_HAVE_AVX512F
329#include <immintrin.h>
331static inline void volk_16ic_convert_32fc_u_avx512(
lv_32fc_t* outputVector,
333 unsigned int num_points)
335 const unsigned int avx512_iters = num_points / 8;
336 unsigned int number = 0;
337 const int16_t* complexVectorPtr = (int16_t*)inputVector;
338 float* outputVectorPtr = (
float*)outputVector;
343 for (number = 0; number < avx512_iters; number++) {
345 cplxValue = _mm256_loadu_si256((__m256i*)complexVectorPtr);
347 complexVectorPtr += 16;
350 outValInt = _mm512_cvtepi16_epi32(cplxValue);
351 outVal = _mm512_cvtepi32_ps(outValInt);
352 _mm512_storeu_ps((
float*)outputVectorPtr, outVal);
354 outputVectorPtr += 16;
357 number = avx512_iters * 16;
358 for (; number < num_points * 2; number++) {
359 *outputVectorPtr++ = (float)*complexVectorPtr++;
366#include <emmintrin.h>
/* Fragment of an SSE kernel (follows the <emmintrin.h> include): the function
 * header, local declarations and most of the loop body are not visible. */
370 unsigned int num_points)
/* One SIMD iteration per two points; presumably two complex points, i.e.
 * four floats, per 128-bit store - confirm against the full body. */
372 const unsigned int sse_iters = num_points / 2;
379 for (number = 0; number < sse_iters; number++) {
/* Unaligned 128-bit store of the converted floats held in `a`. */
386 _mm_storeu_ps((
float*)_out, a);
/* An odd num_points leaves one point that the SIMD loop did not cover;
 * the handling inside this branch is not visible here. */
390 if (num_points & 1) {
400#include <immintrin.h>
/* Fragment of an AVX kernel (follows the <immintrin.h> include): the function
 * header, most declarations and most of the loop body are not visible. */
404 unsigned int num_points)
/* One SIMD iteration per four points (the variable keeps the `sse_` name). */
406 const unsigned int sse_iters = num_points / 4;
411 unsigned int i, number;
413 for (number = 0; number < sse_iters; number++) {
/* Unaligned 256-bit store of the eight converted floats held in `a`. */
424 _mm256_storeu_ps((
float*)_out, a);
/* Remainder loop: runs for the 0..3 points left over after the SIMD loop;
 * its body is not visible here. */
429 for (i = 0; i < (num_points % 4); ++i) {
438#include <riscv_vector.h>
440static inline void volk_16ic_convert_32fc_rvv(
lv_32fc_t* outputVector,
442 unsigned int num_points)
444 const int16_t* in = (
const int16_t*)inputVector;
445 float* out = (
float*)outputVector;
446 size_t n = num_points * 2;
447 for (
size_t vl; n > 0; n -= vl, in += vl, out += vl) {
448 vl = __riscv_vsetvl_e16m4(n);
449 vint16m4_t v = __riscv_vle16_v_i16m4(in, vl);
450 __riscv_vse32(out, __riscv_vfwcvt_f(v, vl), vl);