Vector Optimized Library of Kernels 3.3.0
Architecture-tuned implementations of math kernels
Loading...
Searching...
No Matches
volk_16ic_convert_32fc.h
Go to the documentation of this file.
1/* -*- c++ -*- */
2/*
3 * Copyright 2016 Free Software Foundation, Inc.
4 *
5 * This file is part of VOLK
6 *
7 * SPDX-License-Identifier: LGPL-3.0-or-later
8 */
9
43
44
45#ifndef INCLUDED_volk_16ic_convert_32fc_a_H
46#define INCLUDED_volk_16ic_convert_32fc_a_H
47
48#include <volk/volk_complex.h>
49
50#ifdef LV_HAVE_AVX2
51#include <immintrin.h>
52
53static inline void volk_16ic_convert_32fc_a_avx2(lv_32fc_t* outputVector,
54 const lv_16sc_t* inputVector,
55 unsigned int num_points)
56{
57 const unsigned int avx_iters = num_points / 4;
58 unsigned int number = 0;
59 const int16_t* complexVectorPtr = (int16_t*)inputVector;
60 float* outputVectorPtr = (float*)outputVector;
61 __m256 outVal;
62 __m256i outValInt;
63 __m128i cplxValue;
64
65 for (number = 0; number < avx_iters; number++) {
66 cplxValue = _mm_load_si128((__m128i*)complexVectorPtr);
67 __VOLK_PREFETCH(complexVectorPtr + 16);
68 complexVectorPtr += 8;
69
70 outValInt = _mm256_cvtepi16_epi32(cplxValue);
71 outVal = _mm256_cvtepi32_ps(outValInt);
72 _mm256_store_ps((float*)outputVectorPtr, outVal);
73
74 outputVectorPtr += 8;
75 }
76
77 number = avx_iters * 8;
78 for (; number < num_points * 2; number++) {
79 *outputVectorPtr++ = (float)*complexVectorPtr++;
80 }
81}
82
83#endif /* LV_HAVE_AVX2 */
84
85#ifdef LV_HAVE_AVX512F
86#include <immintrin.h>
87
88static inline void volk_16ic_convert_32fc_a_avx512(lv_32fc_t* outputVector,
89 const lv_16sc_t* inputVector,
90 unsigned int num_points)
91{
92 const unsigned int avx512_iters = num_points / 8;
93 unsigned int number = 0;
94 const int16_t* complexVectorPtr = (int16_t*)inputVector;
95 float* outputVectorPtr = (float*)outputVector;
96 __m512 outVal;
97 __m512i outValInt;
98 __m256i cplxValue;
99
100 for (number = 0; number < avx512_iters; number++) {
101 // Load 16 int16 values (8 complex = 16 floats)
102 cplxValue = _mm256_load_si256((__m256i*)complexVectorPtr);
103 __VOLK_PREFETCH(complexVectorPtr + 32);
104 complexVectorPtr += 16;
105
106 // Convert int16 → int32 → float
107 outValInt = _mm512_cvtepi16_epi32(cplxValue);
108 outVal = _mm512_cvtepi32_ps(outValInt);
109 _mm512_store_ps((float*)outputVectorPtr, outVal);
110
111 outputVectorPtr += 16;
112 }
113
114 number = avx512_iters * 16;
115 for (; number < num_points * 2; number++) {
116 *outputVectorPtr++ = (float)*complexVectorPtr++;
117 }
118}
119
120#endif /* LV_HAVE_AVX512F */
121
122#ifdef LV_HAVE_GENERIC
123
124static inline void volk_16ic_convert_32fc_generic(lv_32fc_t* outputVector,
125 const lv_16sc_t* inputVector,
126 unsigned int num_points)
127{
128 unsigned int i;
129 for (i = 0; i < num_points; i++) {
130 outputVector[i] =
131 lv_cmake((float)lv_creal(inputVector[i]), (float)lv_cimag(inputVector[i]));
132 }
133}
134
135#endif /* LV_HAVE_GENERIC */
136
137
138#ifdef LV_HAVE_SSE2
139#include <emmintrin.h>
140
141static inline void volk_16ic_convert_32fc_a_sse2(lv_32fc_t* outputVector,
142 const lv_16sc_t* inputVector,
143 unsigned int num_points)
144{
145 const unsigned int sse_iters = num_points / 2;
146
147 const lv_16sc_t* _in = inputVector;
148 lv_32fc_t* _out = outputVector;
149 __m128 a;
150 unsigned int number;
151
152 for (number = 0; number < sse_iters; number++) {
153 a = _mm_set_ps(
154 (float)(lv_cimag(_in[1])),
155 (float)(lv_creal(_in[1])),
156 (float)(lv_cimag(_in[0])),
157 (float)(lv_creal(
158 _in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg
159 _mm_store_ps((float*)_out, a);
160 _in += 2;
161 _out += 2;
162 }
163 if (num_points & 1) {
164 *_out++ = lv_cmake((float)lv_creal(*_in), (float)lv_cimag(*_in));
165 _in++;
166 }
167}
168
169#endif /* LV_HAVE_SSE2 */
170
171#ifdef LV_HAVE_AVX
172#include <immintrin.h>
173
174static inline void volk_16ic_convert_32fc_a_avx(lv_32fc_t* outputVector,
175 const lv_16sc_t* inputVector,
176 unsigned int num_points)
177{
178 const unsigned int sse_iters = num_points / 4;
179
180 const lv_16sc_t* _in = inputVector;
181 lv_32fc_t* _out = outputVector;
182 __m256 a;
183 unsigned int i, number;
184
185 for (number = 0; number < sse_iters; number++) {
186 a = _mm256_set_ps(
187 (float)(lv_cimag(_in[3])),
188 (float)(lv_creal(_in[3])),
189 (float)(lv_cimag(_in[2])),
190 (float)(lv_creal(_in[2])),
191 (float)(lv_cimag(_in[1])),
192 (float)(lv_creal(_in[1])),
193 (float)(lv_cimag(_in[0])),
194 (float)(lv_creal(
195 _in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg
196 _mm256_store_ps((float*)_out, a);
197 _in += 4;
198 _out += 4;
199 }
200
201 for (i = 0; i < (num_points % 4); ++i) {
202 *_out++ = lv_cmake((float)lv_creal(*_in), (float)lv_cimag(*_in));
203 _in++;
204 }
205}
206
207#endif /* LV_HAVE_AVX */
208
209
210#ifdef LV_HAVE_NEON
211#include <arm_neon.h>
212
/*!
 * Converts interleaved 16-bit integer complex samples to 32-bit float
 * complex samples using NEON, 8 complex samples per iteration.
 */
static inline void volk_16ic_convert_32fc_neon(lv_32fc_t* outputVector,
                                               const lv_16sc_t* inputVector,
                                               unsigned int num_points)
{
    const int16_t* src = (const int16_t*)inputVector;
    float* dst = (float*)outputVector;
    const unsigned int chunks = num_points / 8; /* 8 complex = 16 int16 components */
    unsigned int i;

    for (i = 0; i < chunks; i++) {
        /* Four 64-bit loads avoid vget_low/vget_high shuffles. */
        const int16x4_t a = vld1_s16(src);
        const int16x4_t b = vld1_s16(src + 4);
        const int16x4_t c = vld1_s16(src + 8);
        const int16x4_t d = vld1_s16(src + 12);
        __VOLK_PREFETCH(src + 32);

        /* Widen int16 -> int32, then convert to float, 4 lanes per store. */
        vst1q_f32(dst, vcvtq_f32_s32(vmovl_s16(a)));
        vst1q_f32(dst + 4, vcvtq_f32_s32(vmovl_s16(b)));
        vst1q_f32(dst + 8, vcvtq_f32_s32(vmovl_s16(c)));
        vst1q_f32(dst + 12, vcvtq_f32_s32(vmovl_s16(d)));

        src += 16;
        dst += 16;
    }

    /* Scalar tail: 2 components per remaining complex sample. */
    for (i = (num_points % 8) * 2; i > 0; i--) {
        *dst++ = (float)*src++;
    }
}
246#endif /* LV_HAVE_NEON */
247
248#ifdef LV_HAVE_NEONV8
249#include <arm_neon.h>
250
/*!
 * ARMv8 NEON variant: converts interleaved 16-bit integer complex samples
 * to 32-bit float complex samples, 8 complex samples per iteration.
 */
static inline void volk_16ic_convert_32fc_neonv8(lv_32fc_t* outputVector,
                                                 const lv_16sc_t* inputVector,
                                                 unsigned int num_points)
{
    const int16_t* src = (const int16_t*)inputVector;
    float* dst = (float*)outputVector;
    unsigned int remaining = num_points;

    /* Vector body: 64-bit loads sidestep vget_low/vget_high overhead. */
    for (; remaining >= 8; remaining -= 8) {
        const int16x4_t lane0 = vld1_s16(src);
        const int16x4_t lane1 = vld1_s16(src + 4);
        const int16x4_t lane2 = vld1_s16(src + 8);
        const int16x4_t lane3 = vld1_s16(src + 12);
        __VOLK_PREFETCH(src + 32);

        /* Widen int16 -> int32 and convert to float, 4 lanes per store. */
        vst1q_f32(dst + 0, vcvtq_f32_s32(vmovl_s16(lane0)));
        vst1q_f32(dst + 4, vcvtq_f32_s32(vmovl_s16(lane1)));
        vst1q_f32(dst + 8, vcvtq_f32_s32(vmovl_s16(lane2)));
        vst1q_f32(dst + 12, vcvtq_f32_s32(vmovl_s16(lane3)));

        src += 16;
        dst += 16;
    }

    /* Scalar tail: real then imaginary component of each leftover sample. */
    while (remaining-- > 0) {
        *dst++ = (float)*src++;
        *dst++ = (float)*src++;
    }
}
283#endif /* LV_HAVE_NEONV8 */
284
#endif /* INCLUDED_volk_16ic_convert_32fc_a_H */
286
287#ifndef INCLUDED_volk_16ic_convert_32fc_u_H
288#define INCLUDED_volk_16ic_convert_32fc_u_H
289
290#include <volk/volk_complex.h>
291
292
293#ifdef LV_HAVE_AVX2
294#include <immintrin.h>
295
296static inline void volk_16ic_convert_32fc_u_avx2(lv_32fc_t* outputVector,
297 const lv_16sc_t* inputVector,
298 unsigned int num_points)
299{
300 const unsigned int avx_iters = num_points / 4;
301 unsigned int number = 0;
302 const int16_t* complexVectorPtr = (int16_t*)inputVector;
303 float* outputVectorPtr = (float*)outputVector;
304 __m256 outVal;
305 __m256i outValInt;
306 __m128i cplxValue;
307
308 for (number = 0; number < avx_iters; number++) {
309 cplxValue = _mm_loadu_si128((__m128i*)complexVectorPtr);
310 __VOLK_PREFETCH(complexVectorPtr + 16);
311 complexVectorPtr += 8;
312
313 outValInt = _mm256_cvtepi16_epi32(cplxValue);
314 outVal = _mm256_cvtepi32_ps(outValInt);
315 _mm256_storeu_ps((float*)outputVectorPtr, outVal);
316
317 outputVectorPtr += 8;
318 }
319
320 number = avx_iters * 8;
321 for (; number < num_points * 2; number++) {
322 *outputVectorPtr++ = (float)*complexVectorPtr++;
323 }
324}
325
326#endif /* LV_HAVE_AVX2 */
327
328#ifdef LV_HAVE_AVX512F
329#include <immintrin.h>
330
331static inline void volk_16ic_convert_32fc_u_avx512(lv_32fc_t* outputVector,
332 const lv_16sc_t* inputVector,
333 unsigned int num_points)
334{
335 const unsigned int avx512_iters = num_points / 8;
336 unsigned int number = 0;
337 const int16_t* complexVectorPtr = (int16_t*)inputVector;
338 float* outputVectorPtr = (float*)outputVector;
339 __m512 outVal;
340 __m512i outValInt;
341 __m256i cplxValue;
342
343 for (number = 0; number < avx512_iters; number++) {
344 // Load 16 int16 values (8 complex = 16 floats) - unaligned
345 cplxValue = _mm256_loadu_si256((__m256i*)complexVectorPtr);
346 __VOLK_PREFETCH(complexVectorPtr + 32);
347 complexVectorPtr += 16;
348
349 // Convert int16 → int32 → float
350 outValInt = _mm512_cvtepi16_epi32(cplxValue);
351 outVal = _mm512_cvtepi32_ps(outValInt);
352 _mm512_storeu_ps((float*)outputVectorPtr, outVal);
353
354 outputVectorPtr += 16;
355 }
356
357 number = avx512_iters * 16;
358 for (; number < num_points * 2; number++) {
359 *outputVectorPtr++ = (float)*complexVectorPtr++;
360 }
361}
362
363#endif /* LV_HAVE_AVX512F */
364
365#ifdef LV_HAVE_SSE2
366#include <emmintrin.h>
367
368static inline void volk_16ic_convert_32fc_u_sse2(lv_32fc_t* outputVector,
369 const lv_16sc_t* inputVector,
370 unsigned int num_points)
371{
372 const unsigned int sse_iters = num_points / 2;
373
374 const lv_16sc_t* _in = inputVector;
375 lv_32fc_t* _out = outputVector;
376 __m128 a;
377 unsigned int number;
378
379 for (number = 0; number < sse_iters; number++) {
380 a = _mm_set_ps(
381 (float)(lv_cimag(_in[1])),
382 (float)(lv_creal(_in[1])),
383 (float)(lv_cimag(_in[0])),
384 (float)(lv_creal(
385 _in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg
386 _mm_storeu_ps((float*)_out, a);
387 _in += 2;
388 _out += 2;
389 }
390 if (num_points & 1) {
391 *_out++ = lv_cmake((float)lv_creal(*_in), (float)lv_cimag(*_in));
392 _in++;
393 }
394}
395
396#endif /* LV_HAVE_SSE2 */
397
398
399#ifdef LV_HAVE_AVX
400#include <immintrin.h>
401
402static inline void volk_16ic_convert_32fc_u_avx(lv_32fc_t* outputVector,
403 const lv_16sc_t* inputVector,
404 unsigned int num_points)
405{
406 const unsigned int sse_iters = num_points / 4;
407
408 const lv_16sc_t* _in = inputVector;
409 lv_32fc_t* _out = outputVector;
410 __m256 a;
411 unsigned int i, number;
412
413 for (number = 0; number < sse_iters; number++) {
414 a = _mm256_set_ps(
415 (float)(lv_cimag(_in[3])),
416 (float)(lv_creal(_in[3])),
417 (float)(lv_cimag(_in[2])),
418 (float)(lv_creal(_in[2])),
419 (float)(lv_cimag(_in[1])),
420 (float)(lv_creal(_in[1])),
421 (float)(lv_cimag(_in[0])),
422 (float)(lv_creal(
423 _in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg
424 _mm256_storeu_ps((float*)_out, a);
425 _in += 4;
426 _out += 4;
427 }
428
429 for (i = 0; i < (num_points % 4); ++i) {
430 *_out++ = lv_cmake((float)lv_creal(*_in), (float)lv_cimag(*_in));
431 _in++;
432 }
433}
434
435#endif /* LV_HAVE_AVX */
436
437#ifdef LV_HAVE_RVV
438#include <riscv_vector.h>
439
/*!
 * Converts interleaved 16-bit integer complex samples to 32-bit float
 * complex samples using the RISC-V Vector extension; the widening
 * convert handles int16 -> float directly, stripmined by vsetvl.
 */
static inline void volk_16ic_convert_32fc_rvv(lv_32fc_t* outputVector,
                                              const lv_16sc_t* inputVector,
                                              unsigned int num_points)
{
    const int16_t* in = (const int16_t*)inputVector;
    float* out = (float*)outputVector;
    /* Widen to size_t BEFORE doubling: num_points * 2 in unsigned int
     * would wrap for num_points >= 2^31 and drop half the samples. */
    size_t n = (size_t)num_points * 2;
    for (size_t vl; n > 0; n -= vl, in += vl, out += vl) {
        vl = __riscv_vsetvl_e16m4(n);
        vint16m4_t v = __riscv_vle16_v_i16m4(in, vl);
        __riscv_vse32(out, __riscv_vfwcvt_f(v, vl), vl);
    }
}
453#endif /*LV_HAVE_RVV*/
454
#endif /* INCLUDED_volk_16ic_convert_32fc_u_H */