Vector Optimized Library of Kernels 3.3.0
Architecture-tuned implementations of math kernels
Loading...
Searching...
No Matches
volk_32f_x2_s32f_interleave_16ic.h
Go to the documentation of this file.
1/* -*- c++ -*- */
2/*
3 * Copyright 2012, 2014 Free Software Foundation, Inc.
4 *
5 * This file is part of VOLK
6 *
7 * SPDX-License-Identifier: LGPL-3.0-or-later
8 */
9
61
62#ifndef INCLUDED_volk_32f_x2_s32f_interleave_16ic_a_H
63#define INCLUDED_volk_32f_x2_s32f_interleave_16ic_a_H
64
65#include <inttypes.h>
66#include <stdio.h>
67#include <volk/volk_common.h>
68
69#ifdef LV_HAVE_AVX2
70#include <immintrin.h>
71
72static inline void volk_32f_x2_s32f_interleave_16ic_a_avx2(lv_16sc_t* complexVector,
73 const float* iBuffer,
74 const float* qBuffer,
75 const float scalar,
76 unsigned int num_points)
77{
78 unsigned int number = 0;
79 const float* iBufferPtr = iBuffer;
80 const float* qBufferPtr = qBuffer;
81
82 __m256 vScalar = _mm256_set1_ps(scalar);
83
84 const unsigned int eighthPoints = num_points / 8;
85
86 __m256 iValue, qValue, cplxValue1, cplxValue2;
87 __m256i intValue1, intValue2;
88
89 int16_t* complexVectorPtr = (int16_t*)complexVector;
90
91 for (; number < eighthPoints; number++) {
92 iValue = _mm256_load_ps(iBufferPtr);
93 qValue = _mm256_load_ps(qBufferPtr);
94
95 // Interleaves the lower two values in the i and q variables into one buffer
96 cplxValue1 = _mm256_unpacklo_ps(iValue, qValue);
97 cplxValue1 = _mm256_mul_ps(cplxValue1, vScalar);
98
99 // Interleaves the upper two values in the i and q variables into one buffer
100 cplxValue2 = _mm256_unpackhi_ps(iValue, qValue);
101 cplxValue2 = _mm256_mul_ps(cplxValue2, vScalar);
102
103 intValue1 = _mm256_cvtps_epi32(cplxValue1);
104 intValue2 = _mm256_cvtps_epi32(cplxValue2);
105
106 intValue1 = _mm256_packs_epi32(intValue1, intValue2);
107
108 _mm256_store_si256((__m256i*)complexVectorPtr, intValue1);
109 complexVectorPtr += 16;
110
111 iBufferPtr += 8;
112 qBufferPtr += 8;
113 }
114
115 number = eighthPoints * 8;
116 complexVectorPtr = (int16_t*)(&complexVector[number]);
117 for (; number < num_points; number++) {
118 *complexVectorPtr++ = (int16_t)rintf(*iBufferPtr++ * scalar);
119 *complexVectorPtr++ = (int16_t)rintf(*qBufferPtr++ * scalar);
120 }
121}
122#endif /* LV_HAVE_AVX2 */
123
124
125#ifdef LV_HAVE_SSE2
126#include <emmintrin.h>
127
128static inline void volk_32f_x2_s32f_interleave_16ic_a_sse2(lv_16sc_t* complexVector,
129 const float* iBuffer,
130 const float* qBuffer,
131 const float scalar,
132 unsigned int num_points)
133{
134 unsigned int number = 0;
135 const float* iBufferPtr = iBuffer;
136 const float* qBufferPtr = qBuffer;
137
138 __m128 vScalar = _mm_set_ps1(scalar);
139
140 const unsigned int quarterPoints = num_points / 4;
141
142 __m128 iValue, qValue, cplxValue1, cplxValue2;
143 __m128i intValue1, intValue2;
144
145 int16_t* complexVectorPtr = (int16_t*)complexVector;
146
147 for (; number < quarterPoints; number++) {
148 iValue = _mm_load_ps(iBufferPtr);
149 qValue = _mm_load_ps(qBufferPtr);
150
151 // Interleaves the lower two values in the i and q variables into one buffer
152 cplxValue1 = _mm_unpacklo_ps(iValue, qValue);
153 cplxValue1 = _mm_mul_ps(cplxValue1, vScalar);
154
155 // Interleaves the upper two values in the i and q variables into one buffer
156 cplxValue2 = _mm_unpackhi_ps(iValue, qValue);
157 cplxValue2 = _mm_mul_ps(cplxValue2, vScalar);
158
159 intValue1 = _mm_cvtps_epi32(cplxValue1);
160 intValue2 = _mm_cvtps_epi32(cplxValue2);
161
162 intValue1 = _mm_packs_epi32(intValue1, intValue2);
163
164 _mm_store_si128((__m128i*)complexVectorPtr, intValue1);
165 complexVectorPtr += 8;
166
167 iBufferPtr += 4;
168 qBufferPtr += 4;
169 }
170
171 number = quarterPoints * 4;
172 complexVectorPtr = (int16_t*)(&complexVector[number]);
173 for (; number < num_points; number++) {
174 *complexVectorPtr++ = (int16_t)rintf(*iBufferPtr++ * scalar);
175 *complexVectorPtr++ = (int16_t)rintf(*qBufferPtr++ * scalar);
176 }
177}
178#endif /* LV_HAVE_SSE2 */
179
180
181#ifdef LV_HAVE_SSE
182#include <xmmintrin.h>
183
184static inline void volk_32f_x2_s32f_interleave_16ic_a_sse(lv_16sc_t* complexVector,
185 const float* iBuffer,
186 const float* qBuffer,
187 const float scalar,
188 unsigned int num_points)
189{
190 unsigned int number = 0;
191 const float* iBufferPtr = iBuffer;
192 const float* qBufferPtr = qBuffer;
193
194 __m128 vScalar = _mm_set_ps1(scalar);
195
196 const unsigned int quarterPoints = num_points / 4;
197
198 __m128 iValue, qValue, cplxValue;
199
200 int16_t* complexVectorPtr = (int16_t*)complexVector;
201
202 __VOLK_ATTR_ALIGNED(16) float floatBuffer[4];
203
204 for (; number < quarterPoints; number++) {
205 iValue = _mm_load_ps(iBufferPtr);
206 qValue = _mm_load_ps(qBufferPtr);
207
208 // Interleaves the lower two values in the i and q variables into one buffer
209 cplxValue = _mm_unpacklo_ps(iValue, qValue);
210 cplxValue = _mm_mul_ps(cplxValue, vScalar);
211
212 _mm_store_ps(floatBuffer, cplxValue);
213
214 *complexVectorPtr++ = (int16_t)rintf(floatBuffer[0]);
215 *complexVectorPtr++ = (int16_t)rintf(floatBuffer[1]);
216 *complexVectorPtr++ = (int16_t)rintf(floatBuffer[2]);
217 *complexVectorPtr++ = (int16_t)rintf(floatBuffer[3]);
218
219 // Interleaves the upper two values in the i and q variables into one buffer
220 cplxValue = _mm_unpackhi_ps(iValue, qValue);
221 cplxValue = _mm_mul_ps(cplxValue, vScalar);
222
223 _mm_store_ps(floatBuffer, cplxValue);
224
225 *complexVectorPtr++ = (int16_t)rintf(floatBuffer[0]);
226 *complexVectorPtr++ = (int16_t)rintf(floatBuffer[1]);
227 *complexVectorPtr++ = (int16_t)rintf(floatBuffer[2]);
228 *complexVectorPtr++ = (int16_t)rintf(floatBuffer[3]);
229
230 iBufferPtr += 4;
231 qBufferPtr += 4;
232 }
233
234 number = quarterPoints * 4;
235 complexVectorPtr = (int16_t*)(&complexVector[number]);
236 for (; number < num_points; number++) {
237 *complexVectorPtr++ = (int16_t)rintf(*iBufferPtr++ * scalar);
238 *complexVectorPtr++ = (int16_t)rintf(*qBufferPtr++ * scalar);
239 }
240}
241#endif /* LV_HAVE_SSE */
242
243
244#ifdef LV_HAVE_GENERIC
245
246static inline void volk_32f_x2_s32f_interleave_16ic_generic(lv_16sc_t* complexVector,
247 const float* iBuffer,
248 const float* qBuffer,
249 const float scalar,
250 unsigned int num_points)
251{
252 int16_t* complexVectorPtr = (int16_t*)complexVector;
253 const float* iBufferPtr = iBuffer;
254 const float* qBufferPtr = qBuffer;
255 unsigned int number = 0;
256
257 for (number = 0; number < num_points; number++) {
258 *complexVectorPtr++ = (int16_t)rintf(*iBufferPtr++ * scalar);
259 *complexVectorPtr++ = (int16_t)rintf(*qBufferPtr++ * scalar);
260 }
261}
262#endif /* LV_HAVE_GENERIC */
263
264
265#endif /* INCLUDED_volk_32f_x2_s32f_interleave_16ic_a_H */
266
267#ifndef INCLUDED_volk_32f_x2_s32f_interleave_16ic_u_H
268#define INCLUDED_volk_32f_x2_s32f_interleave_16ic_u_H
269
270#include <inttypes.h>
271#include <stdio.h>
272#include <volk/volk_common.h>
273
274#ifdef LV_HAVE_AVX2
275#include <immintrin.h>
276
277static inline void volk_32f_x2_s32f_interleave_16ic_u_avx2(lv_16sc_t* complexVector,
278 const float* iBuffer,
279 const float* qBuffer,
280 const float scalar,
281 unsigned int num_points)
282{
283 unsigned int number = 0;
284 const float* iBufferPtr = iBuffer;
285 const float* qBufferPtr = qBuffer;
286
287 __m256 vScalar = _mm256_set1_ps(scalar);
288
289 const unsigned int eighthPoints = num_points / 8;
290
291 __m256 iValue, qValue, cplxValue1, cplxValue2;
292 __m256i intValue1, intValue2;
293
294 int16_t* complexVectorPtr = (int16_t*)complexVector;
295
296 for (; number < eighthPoints; number++) {
297 iValue = _mm256_loadu_ps(iBufferPtr);
298 qValue = _mm256_loadu_ps(qBufferPtr);
299
300 // Interleaves the lower two values in the i and q variables into one buffer
301 cplxValue1 = _mm256_unpacklo_ps(iValue, qValue);
302 cplxValue1 = _mm256_mul_ps(cplxValue1, vScalar);
303
304 // Interleaves the upper two values in the i and q variables into one buffer
305 cplxValue2 = _mm256_unpackhi_ps(iValue, qValue);
306 cplxValue2 = _mm256_mul_ps(cplxValue2, vScalar);
307
308 intValue1 = _mm256_cvtps_epi32(cplxValue1);
309 intValue2 = _mm256_cvtps_epi32(cplxValue2);
310
311 intValue1 = _mm256_packs_epi32(intValue1, intValue2);
312
313 _mm256_storeu_si256((__m256i*)complexVectorPtr, intValue1);
314 complexVectorPtr += 16;
315
316 iBufferPtr += 8;
317 qBufferPtr += 8;
318 }
319
320 number = eighthPoints * 8;
321 complexVectorPtr = (int16_t*)(&complexVector[number]);
322 for (; number < num_points; number++) {
323 *complexVectorPtr++ = (int16_t)rintf(*iBufferPtr++ * scalar);
324 *complexVectorPtr++ = (int16_t)rintf(*qBufferPtr++ * scalar);
325 }
326}
327#endif /* LV_HAVE_AVX2 */
328
329#ifdef LV_HAVE_NEON
330#include <arm_neon.h>
331
332static inline void volk_32f_x2_s32f_interleave_16ic_neon(lv_16sc_t* complexVector,
333 const float* iBuffer,
334 const float* qBuffer,
335 const float scalar,
336 unsigned int num_points)
337{
338 unsigned int number = 0;
339 const unsigned int quarter_points = num_points / 4;
340
341 const float* iBufferPtr = iBuffer;
342 const float* qBufferPtr = qBuffer;
343 int16_t* complexVectorPtr = (int16_t*)complexVector;
344
345 float32x4_t vScalar = vdupq_n_f32(scalar);
346 float32x4_t half = vdupq_n_f32(0.5f);
347 float32x4_t neg_half = vdupq_n_f32(-0.5f);
348 float32x4_t zero = vdupq_n_f32(0.0f);
349
350 for (; number < quarter_points; number++) {
351 float32x4_t iValue = vld1q_f32(iBufferPtr);
352 float32x4_t qValue = vld1q_f32(qBufferPtr);
353
354 iValue = vmulq_f32(iValue, vScalar);
355 qValue = vmulq_f32(qValue, vScalar);
356
357 // Round to nearest: add copysign(0.5, x) before truncating
358 uint32x4_t iNeg = vcltq_f32(iValue, zero);
359 uint32x4_t qNeg = vcltq_f32(qValue, zero);
360 iValue = vaddq_f32(iValue, vbslq_f32(iNeg, neg_half, half));
361 qValue = vaddq_f32(qValue, vbslq_f32(qNeg, neg_half, half));
362
363 int32x4_t iInt = vcvtq_s32_f32(iValue);
364 int32x4_t qInt = vcvtq_s32_f32(qValue);
365
366 int16x4_t iShort = vqmovn_s32(iInt);
367 int16x4_t qShort = vqmovn_s32(qInt);
368
369 int16x4x2_t interleaved;
370 interleaved.val[0] = iShort;
371 interleaved.val[1] = qShort;
372 vst2_s16(complexVectorPtr, interleaved);
373
374 complexVectorPtr += 8;
375 iBufferPtr += 4;
376 qBufferPtr += 4;
377 }
378
379 number = quarter_points * 4;
380 complexVectorPtr = (int16_t*)(&complexVector[number]);
381 for (; number < num_points; number++) {
382 *complexVectorPtr++ = (int16_t)rintf(*iBufferPtr++ * scalar);
383 *complexVectorPtr++ = (int16_t)rintf(*qBufferPtr++ * scalar);
384 }
385}
386#endif /* LV_HAVE_NEON */
387
388#ifdef LV_HAVE_NEONV8
389#include <arm_neon.h>
390
391static inline void volk_32f_x2_s32f_interleave_16ic_neonv8(lv_16sc_t* complexVector,
392 const float* iBuffer,
393 const float* qBuffer,
394 const float scalar,
395 unsigned int num_points)
396{
397 unsigned int number = 0;
398 const unsigned int eighth_points = num_points / 8;
399
400 const float* iBufferPtr = iBuffer;
401 const float* qBufferPtr = qBuffer;
402 int16_t* complexVectorPtr = (int16_t*)complexVector;
403
404 float32x4_t vScalar = vdupq_n_f32(scalar);
405
406 for (; number < eighth_points; number++) {
407 float32x4_t iValue0 = vld1q_f32(iBufferPtr);
408 float32x4_t iValue1 = vld1q_f32(iBufferPtr + 4);
409 float32x4_t qValue0 = vld1q_f32(qBufferPtr);
410 float32x4_t qValue1 = vld1q_f32(qBufferPtr + 4);
411 __VOLK_PREFETCH(iBufferPtr + 8);
412 __VOLK_PREFETCH(qBufferPtr + 8);
413
414 iValue0 = vmulq_f32(iValue0, vScalar);
415 iValue1 = vmulq_f32(iValue1, vScalar);
416 qValue0 = vmulq_f32(qValue0, vScalar);
417 qValue1 = vmulq_f32(qValue1, vScalar);
418
419 int32x4_t iInt0 = vcvtnq_s32_f32(iValue0);
420 int32x4_t iInt1 = vcvtnq_s32_f32(iValue1);
421 int32x4_t qInt0 = vcvtnq_s32_f32(qValue0);
422 int32x4_t qInt1 = vcvtnq_s32_f32(qValue1);
423
424 int16x4_t iShort0 = vqmovn_s32(iInt0);
425 int16x4_t iShort1 = vqmovn_s32(iInt1);
426 int16x4_t qShort0 = vqmovn_s32(qInt0);
427 int16x4_t qShort1 = vqmovn_s32(qInt1);
428
429 int16x4x2_t interleaved0, interleaved1;
430 interleaved0.val[0] = iShort0;
431 interleaved0.val[1] = qShort0;
432 interleaved1.val[0] = iShort1;
433 interleaved1.val[1] = qShort1;
434
435 vst2_s16(complexVectorPtr, interleaved0);
436 vst2_s16(complexVectorPtr + 8, interleaved1);
437
438 complexVectorPtr += 16;
439 iBufferPtr += 8;
440 qBufferPtr += 8;
441 }
442
443 number = eighth_points * 8;
444 complexVectorPtr = (int16_t*)(&complexVector[number]);
445 for (; number < num_points; number++) {
446 *complexVectorPtr++ = (int16_t)rintf(*iBufferPtr++ * scalar);
447 *complexVectorPtr++ = (int16_t)rintf(*qBufferPtr++ * scalar);
448 }
449}
450#endif /* LV_HAVE_NEONV8 */
451
452#ifdef LV_HAVE_RVV
453#include <riscv_vector.h>
454
455static inline void volk_32f_x2_s32f_interleave_16ic_rvv(lv_16sc_t* complexVector,
456 const float* iBuffer,
457 const float* qBuffer,
458 const float scalar,
459 unsigned int num_points)
460{
461 uint32_t* out = (uint32_t*)complexVector;
462 size_t n = num_points;
463 for (size_t vl; n > 0; n -= vl, out += vl, iBuffer += vl, qBuffer += vl) {
464 vl = __riscv_vsetvl_e32m8(n);
465 vfloat32m8_t vrf = __riscv_vle32_v_f32m8(iBuffer, vl);
466 vfloat32m8_t vif = __riscv_vle32_v_f32m8(qBuffer, vl);
467 vint16m4_t vri = __riscv_vfncvt_x(__riscv_vfmul(vrf, scalar, vl), vl);
468 vint16m4_t vii = __riscv_vfncvt_x(__riscv_vfmul(vif, scalar, vl), vl);
469 vuint16m4_t vr = __riscv_vreinterpret_u16m4(vri);
470 vuint16m4_t vi = __riscv_vreinterpret_u16m4(vii);
471 vuint32m8_t vc = __riscv_vwmaccu(__riscv_vwaddu_vv(vr, vi, vl), 0xFFFF, vi, vl);
472 __riscv_vse32(out, vc, vl);
473 }
474}
475#endif /*LV_HAVE_RVV*/
476
477#ifdef LV_HAVE_RVVSEG
478#include <riscv_vector.h>
479
480static inline void volk_32f_x2_s32f_interleave_16ic_rvvseg(lv_16sc_t* complexVector,
481 const float* iBuffer,
482 const float* qBuffer,
483 const float scalar,
484 unsigned int num_points)
485{
486 size_t n = num_points;
487 for (size_t vl; n > 0; n -= vl, complexVector += vl, iBuffer += vl, qBuffer += vl) {
488 vl = __riscv_vsetvl_e32m8(n);
489 vfloat32m8_t vrf = __riscv_vle32_v_f32m8(iBuffer, vl);
490 vfloat32m8_t vif = __riscv_vle32_v_f32m8(qBuffer, vl);
491 vint16m4_t vri = __riscv_vfncvt_x(__riscv_vfmul(vrf, scalar, vl), vl);
492 vint16m4_t vii = __riscv_vfncvt_x(__riscv_vfmul(vif, scalar, vl), vl);
493 __riscv_vsseg2e16(
494 (int16_t*)complexVector, __riscv_vcreate_v_i16m4x2(vri, vii), vl);
495 }
496}
497#endif /*LV_HAVE_RVVSEG*/
498
499#endif /* INCLUDED_volk_32f_x2_s32f_interleave_16ic_u_H */