62#ifndef INCLUDED_volk_32f_x2_s32f_interleave_16ic_a_H
63#define INCLUDED_volk_32f_x2_s32f_interleave_16ic_a_H
/*
 * AVX2 kernel using aligned loads/stores: scales the samples of iBuffer and
 * qBuffer by `scalar`, converts them to 16-bit integers and writes them
 * interleaved (I, Q, I, Q, ...) into complexVector.
 *
 * The aligned intrinsics used below require 32-byte aligned addresses.
 *
 * NOTE(review): the numerals fused to the start of the lines look like
 * leftover line numbers from the originating file; the gaps between them
 * suggest that some lines (braces, further parameters such as `scalar`,
 * `iBuffer`, `qBuffer`) are not shown here. The comments describe only the
 * statements that are visible.
 */
72static inline void volk_32f_x2_s32f_interleave_16ic_a_avx2(
lv_16sc_t* complexVector,
76 unsigned int num_points)
78 unsigned int number = 0;
79 const float* iBufferPtr = iBuffer;
80 const float* qBufferPtr = qBuffer;
// Broadcast the scale factor to all eight float lanes.
82 __m256 vScalar = _mm256_set1_ps(scalar);
// Number of complete 8-sample groups handled by the vector loop.
84 const unsigned int eighthPoints = num_points / 8;
86 __m256 iValue, qValue, cplxValue1, cplxValue2;
87 __m256i intValue1, intValue2;
// Output addressed as a flat int16_t stream.
// Assumes lv_16sc_t is a pair of int16_t (I then Q) -- TODO confirm.
89 int16_t* complexVectorPtr = (int16_t*)complexVector;
91 for (; number < eighthPoints; number++) {
92 iValue = _mm256_load_ps(iBufferPtr);
93 qValue = _mm256_load_ps(qBufferPtr);
// unpacklo works per 128-bit lane: i0 q0 i1 q1 | i4 q4 i5 q5
96 cplxValue1 = _mm256_unpacklo_ps(iValue, qValue);
97 cplxValue1 = _mm256_mul_ps(cplxValue1, vScalar);
// unpackhi works per 128-bit lane: i2 q2 i3 q3 | i6 q6 i7 q7
100 cplxValue2 = _mm256_unpackhi_ps(iValue, qValue);
101 cplxValue2 = _mm256_mul_ps(cplxValue2, vScalar);
// Float -> int32 conversion, rounded according to the current MXCSR mode.
103 intValue1 = _mm256_cvtps_epi32(cplxValue1);
104 intValue2 = _mm256_cvtps_epi32(cplxValue2);
// Saturating int32 -> int16 pack, also per 128-bit lane, which puts the
// samples back in order: i0 q0 .. i3 q3 | i4 q4 .. i7 q7.
106 intValue1 = _mm256_packs_epi32(intValue1, intValue2);
// 16 int16_t values = 8 complex samples per store.
108 _mm256_store_si256((__m256i*)complexVectorPtr, intValue1);
109 complexVectorPtr += 16;
// NOTE(review): no advance of iBufferPtr/qBufferPtr is visible in this loop,
// yet the tail loop below reads through them as if they pointed at sample
// eighthPoints * 8. Presumably the increments are among the lines not shown
// -- confirm.
// Tail: handle the remaining (num_points % 8) samples one at a time.
115 number = eighthPoints * 8;
116 complexVectorPtr = (int16_t*)(&complexVector[number]);
117 for (; number < num_points; number++) {
// Unlike the saturating pack above, this cast does not clamp values that
// fall outside the int16_t range after scaling.
118 *complexVectorPtr++ = (int16_t)rintf(*iBufferPtr++ * scalar);
119 *complexVectorPtr++ = (int16_t)rintf(*qBufferPtr++ * scalar);
126#include <emmintrin.h>
/*
 * 128-bit SSE2 variant (follows the <emmintrin.h> include) using aligned
 * loads/stores: scales the I and Q float samples by `scalar`, converts them to
 * 16-bit integers and writes them interleaved (I, Q, I, Q, ...).
 *
 * The aligned intrinsics used below require 16-byte aligned addresses.
 *
 * NOTE(review): the line carrying the function name, the `complexVector` and
 * `scalar` parameters and the braces are not among the visible lines (the
 * numerals fused to the line starts look like leftover source line numbers
 * and have gaps). Comments describe only what is visible.
 */
129 const float* iBuffer,
130 const float* qBuffer,
132 unsigned int num_points)
134 unsigned int number = 0;
135 const float* iBufferPtr = iBuffer;
136 const float* qBufferPtr = qBuffer;
// Broadcast the scale factor to all four float lanes.
138 __m128 vScalar = _mm_set_ps1(scalar);
// Number of complete 4-sample groups handled by the vector loop.
140 const unsigned int quarterPoints = num_points / 4;
142 __m128 iValue, qValue, cplxValue1, cplxValue2;
143 __m128i intValue1, intValue2;
// Output addressed as a flat int16_t stream.
// Assumes lv_16sc_t is a pair of int16_t (I then Q) -- TODO confirm.
145 int16_t* complexVectorPtr = (int16_t*)complexVector;
147 for (; number < quarterPoints; number++) {
148 iValue = _mm_load_ps(iBufferPtr);
149 qValue = _mm_load_ps(qBufferPtr);
// Low halves interleaved: i0 q0 i1 q1
152 cplxValue1 = _mm_unpacklo_ps(iValue, qValue);
153 cplxValue1 = _mm_mul_ps(cplxValue1, vScalar);
// High halves interleaved: i2 q2 i3 q3
156 cplxValue2 = _mm_unpackhi_ps(iValue, qValue);
157 cplxValue2 = _mm_mul_ps(cplxValue2, vScalar);
// Float -> int32 conversion, rounded according to the current MXCSR mode.
159 intValue1 = _mm_cvtps_epi32(cplxValue1);
160 intValue2 = _mm_cvtps_epi32(cplxValue2);
// Saturating int32 -> int16 pack: i0 q0 i1 q1 i2 q2 i3 q3
162 intValue1 = _mm_packs_epi32(intValue1, intValue2);
// 8 int16_t values = 4 complex samples per store.
164 _mm_store_si128((__m128i*)complexVectorPtr, intValue1);
165 complexVectorPtr += 8;
// NOTE(review): no advance of iBufferPtr/qBufferPtr is visible in this loop;
// the tail loop below relies on them pointing at sample quarterPoints * 4.
// Presumably the increments are among the lines not shown -- confirm.
// Tail: handle the remaining (num_points % 4) samples one at a time.
171 number = quarterPoints * 4;
172 complexVectorPtr = (int16_t*)(&complexVector[number]);
173 for (; number < num_points; number++) {
// This cast does not clamp out-of-range values, unlike the pack above.
174 *complexVectorPtr++ = (int16_t)rintf(*iBufferPtr++ * scalar);
175 *complexVectorPtr++ = (int16_t)rintf(*qBufferPtr++ * scalar);
182#include <xmmintrin.h>
/*
 * SSE-only variant (follows the <xmmintrin.h> include) using aligned loads:
 * the interleave and scaling are done with SSE float operations, then each
 * value is rounded and narrowed to int16_t in scalar code via a temporary
 * float buffer.
 *
 * NOTE(review): the line carrying the function name, the `complexVector` and
 * `scalar` parameters, the declaration of `floatBuffer` and the braces are not
 * among the visible lines (the numerals fused to the line starts look like
 * leftover source line numbers and have gaps). _mm_store_ps needs a 16-byte
 * aligned destination, so `floatBuffer` is presumably declared with that
 * alignment -- confirm.
 */
185 const float* iBuffer,
186 const float* qBuffer,
188 unsigned int num_points)
190 unsigned int number = 0;
191 const float* iBufferPtr = iBuffer;
192 const float* qBufferPtr = qBuffer;
// Broadcast the scale factor to all four float lanes.
194 __m128 vScalar = _mm_set_ps1(scalar);
// Number of complete 4-sample groups handled by the vector loop.
196 const unsigned int quarterPoints = num_points / 4;
198 __m128 iValue, qValue, cplxValue;
// Output addressed as a flat int16_t stream.
// Assumes lv_16sc_t is a pair of int16_t (I then Q) -- TODO confirm.
200 int16_t* complexVectorPtr = (int16_t*)complexVector;
204 for (; number < quarterPoints; number++) {
205 iValue = _mm_load_ps(iBufferPtr);
206 qValue = _mm_load_ps(qBufferPtr);
// First half of the group: i0 q0 i1 q1, scaled.
209 cplxValue = _mm_unpacklo_ps(iValue, qValue);
210 cplxValue = _mm_mul_ps(cplxValue, vScalar);
// Spill to memory so the values can be rounded and narrowed one by one.
212 _mm_store_ps(floatBuffer, cplxValue);
214 *complexVectorPtr++ = (int16_t)rintf(floatBuffer[0]);
215 *complexVectorPtr++ = (int16_t)rintf(floatBuffer[1]);
216 *complexVectorPtr++ = (int16_t)rintf(floatBuffer[2]);
217 *complexVectorPtr++ = (int16_t)rintf(floatBuffer[3]);
// Second half of the group: i2 q2 i3 q3, scaled.
220 cplxValue = _mm_unpackhi_ps(iValue, qValue);
221 cplxValue = _mm_mul_ps(cplxValue, vScalar);
223 _mm_store_ps(floatBuffer, cplxValue);
225 *complexVectorPtr++ = (int16_t)rintf(floatBuffer[0]);
226 *complexVectorPtr++ = (int16_t)rintf(floatBuffer[1]);
227 *complexVectorPtr++ = (int16_t)rintf(floatBuffer[2]);
228 *complexVectorPtr++ = (int16_t)rintf(floatBuffer[3]);
// NOTE(review): no advance of iBufferPtr/qBufferPtr is visible in this loop;
// the tail loop below relies on them pointing at sample quarterPoints * 4.
// Presumably the increments are among the lines not shown -- confirm.
// Tail: handle the remaining (num_points % 4) samples one at a time.
234 number = quarterPoints * 4;
235 complexVectorPtr = (int16_t*)(&complexVector[number]);
236 for (; number < num_points; number++) {
237 *complexVectorPtr++ = (int16_t)rintf(*iBufferPtr++ * scalar);
238 *complexVectorPtr++ = (int16_t)rintf(*qBufferPtr++ * scalar);
244#ifdef LV_HAVE_GENERIC
/*
 * Plain C implementation (inside the LV_HAVE_GENERIC guard): for each of the
 * num_points samples, writes rintf(I * scalar) followed by rintf(Q * scalar)
 * to the output as int16_t values.
 *
 * NOTE(review): the line carrying the function name, the `complexVector` and
 * `scalar` parameters and the braces are not among the visible lines (the
 * numerals fused to the line starts look like leftover source line numbers
 * and have gaps).
 */
247 const float* iBuffer,
248 const float* qBuffer,
250 unsigned int num_points)
// Output addressed as a flat int16_t stream.
// Assumes lv_16sc_t is a pair of int16_t (I then Q) -- TODO confirm.
252 int16_t* complexVectorPtr = (int16_t*)complexVector;
253 const float* iBufferPtr = iBuffer;
254 const float* qBufferPtr = qBuffer;
255 unsigned int number = 0;
257 for (number = 0; number < num_points; number++) {
// rintf rounds using the current floating-point rounding mode; the cast to
// int16_t does not clamp values that fall outside the int16_t range.
258 *complexVectorPtr++ = (int16_t)rintf(*iBufferPtr++ * scalar);
259 *complexVectorPtr++ = (int16_t)rintf(*qBufferPtr++ * scalar);
267#ifndef INCLUDED_volk_32f_x2_s32f_interleave_16ic_u_H
268#define INCLUDED_volk_32f_x2_s32f_interleave_16ic_u_H
275#include <immintrin.h>
/*
 * AVX2 kernel using unaligned loads/stores: scales the samples of iBuffer and
 * qBuffer by `scalar`, converts them to 16-bit integers and writes them
 * interleaved (I, Q, I, Q, ...) into complexVector. The loadu/storeu
 * intrinsics place no alignment requirement on the three buffers.
 *
 * NOTE(review): the numerals fused to the start of the lines look like
 * leftover line numbers from the originating file; the gaps between them
 * suggest that some lines (braces, the `scalar` parameter) are not shown
 * here. The comments describe only the statements that are visible.
 */
277static inline void volk_32f_x2_s32f_interleave_16ic_u_avx2(
lv_16sc_t* complexVector,
278 const float* iBuffer,
279 const float* qBuffer,
281 unsigned int num_points)
283 unsigned int number = 0;
284 const float* iBufferPtr = iBuffer;
285 const float* qBufferPtr = qBuffer;
// Broadcast the scale factor to all eight float lanes.
287 __m256 vScalar = _mm256_set1_ps(scalar);
// Number of complete 8-sample groups handled by the vector loop.
289 const unsigned int eighthPoints = num_points / 8;
291 __m256 iValue, qValue, cplxValue1, cplxValue2;
292 __m256i intValue1, intValue2;
// Output addressed as a flat int16_t stream.
// Assumes lv_16sc_t is a pair of int16_t (I then Q) -- TODO confirm.
294 int16_t* complexVectorPtr = (int16_t*)complexVector;
296 for (; number < eighthPoints; number++) {
297 iValue = _mm256_loadu_ps(iBufferPtr);
298 qValue = _mm256_loadu_ps(qBufferPtr);
// unpacklo works per 128-bit lane: i0 q0 i1 q1 | i4 q4 i5 q5
301 cplxValue1 = _mm256_unpacklo_ps(iValue, qValue);
302 cplxValue1 = _mm256_mul_ps(cplxValue1, vScalar);
// unpackhi works per 128-bit lane: i2 q2 i3 q3 | i6 q6 i7 q7
305 cplxValue2 = _mm256_unpackhi_ps(iValue, qValue);
306 cplxValue2 = _mm256_mul_ps(cplxValue2, vScalar);
// Float -> int32 conversion, rounded according to the current MXCSR mode.
308 intValue1 = _mm256_cvtps_epi32(cplxValue1);
309 intValue2 = _mm256_cvtps_epi32(cplxValue2);
// Saturating int32 -> int16 pack, also per 128-bit lane, which puts the
// samples back in order: i0 q0 .. i3 q3 | i4 q4 .. i7 q7.
311 intValue1 = _mm256_packs_epi32(intValue1, intValue2);
// 16 int16_t values = 8 complex samples per store.
313 _mm256_storeu_si256((__m256i*)complexVectorPtr, intValue1);
314 complexVectorPtr += 16;
// NOTE(review): no advance of iBufferPtr/qBufferPtr is visible in this loop;
// the tail loop below relies on them pointing at sample eighthPoints * 8.
// Presumably the increments are among the lines not shown -- confirm.
// Tail: handle the remaining (num_points % 8) samples one at a time.
320 number = eighthPoints * 8;
321 complexVectorPtr = (int16_t*)(&complexVector[number]);
322 for (; number < num_points; number++) {
// This cast does not clamp out-of-range values, unlike the pack above.
323 *complexVectorPtr++ = (int16_t)rintf(*iBufferPtr++ * scalar);
324 *complexVectorPtr++ = (int16_t)rintf(*qBufferPtr++ * scalar);
/*
 * NEON variant processing four samples per iteration: scales I and Q by
 * `scalar`, rounds by adding +/-0.5 and truncating, narrows to int16_t with
 * saturation and stores the result interleaved (I, Q, I, Q, ...).
 *
 * NOTE(review): the line carrying the function name, the `complexVector` and
 * `scalar` parameters and the braces are not among the visible lines (the
 * numerals fused to the line starts look like leftover source line numbers
 * and have gaps). Comments describe only what is visible.
 */
333 const float* iBuffer,
334 const float* qBuffer,
336 unsigned int num_points)
338 unsigned int number = 0;
// Number of complete 4-sample groups handled by the vector loop.
339 const unsigned int quarter_points = num_points / 4;
341 const float* iBufferPtr = iBuffer;
342 const float* qBufferPtr = qBuffer;
// Output addressed as a flat int16_t stream.
// Assumes lv_16sc_t is a pair of int16_t (I then Q) -- TODO confirm.
343 int16_t* complexVectorPtr = (int16_t*)complexVector;
// Constants for scaling and for the sign-dependent rounding offset.
345 float32x4_t vScalar = vdupq_n_f32(scalar);
346 float32x4_t half = vdupq_n_f32(0.5f);
347 float32x4_t neg_half = vdupq_n_f32(-0.5f);
348 float32x4_t zero = vdupq_n_f32(0.0f);
350 for (; number < quarter_points; number++) {
351 float32x4_t iValue = vld1q_f32(iBufferPtr);
352 float32x4_t qValue = vld1q_f32(qBufferPtr);
354 iValue = vmulq_f32(iValue, vScalar);
355 qValue = vmulq_f32(qValue, vScalar);
// Per-lane mask of negative values; select -0.5 for those lanes and +0.5
// for the rest, then add. Combined with the truncating conversion below
// this rounds to nearest with ties away from zero.
358 uint32x4_t iNeg = vcltq_f32(iValue, zero);
359 uint32x4_t qNeg = vcltq_f32(qValue, zero);
360 iValue = vaddq_f32(iValue, vbslq_f32(iNeg, neg_half, half));
361 qValue = vaddq_f32(qValue, vbslq_f32(qNeg, neg_half, half));
// Float -> int32 conversion (truncates toward zero).
363 int32x4_t iInt = vcvtq_s32_f32(iValue);
364 int32x4_t qInt = vcvtq_s32_f32(qValue);
// Saturating narrow int32 -> int16.
366 int16x4_t iShort = vqmovn_s32(iInt);
367 int16x4_t qShort = vqmovn_s32(qInt);
// vst2 interleaves the two vectors element by element: i0 q0 i1 q1 ...
369 int16x4x2_t interleaved;
370 interleaved.val[0] = iShort;
371 interleaved.val[1] = qShort;
372 vst2_s16(complexVectorPtr, interleaved);
374 complexVectorPtr += 8;
// NOTE(review): no advance of iBufferPtr/qBufferPtr is visible in this loop;
// the tail loop below relies on them pointing at sample quarter_points * 4.
// Presumably the increments are among the lines not shown -- confirm.
// Tail: handle the remaining (num_points % 4) samples one at a time.
379 number = quarter_points * 4;
380 complexVectorPtr = (int16_t*)(&complexVector[number]);
381 for (; number < num_points; number++) {
// rintf follows the current rounding mode, so exact .5 ties can round
// differently here than in the vector loop; the cast also does not saturate.
382 *complexVectorPtr++ = (int16_t)rintf(*iBufferPtr++ * scalar);
383 *complexVectorPtr++ = (int16_t)rintf(*qBufferPtr++ * scalar);
/*
 * ARMv8 NEON variant processing eight samples per iteration (two 4-lane
 * vectors each for I and Q): scales by `scalar`, converts to int32 with
 * vcvtnq_s32_f32 (round to nearest, ties to even), narrows to int16_t with
 * saturation and stores the result interleaved (I, Q, I, Q, ...).
 *
 * NOTE(review): the numerals fused to the start of the lines look like
 * leftover line numbers from the originating file; the gaps between them
 * suggest that some lines (braces, the `scalar` parameter) are not shown
 * here. The comments describe only the statements that are visible.
 */
391static inline void volk_32f_x2_s32f_interleave_16ic_neonv8(
lv_16sc_t* complexVector,
392 const float* iBuffer,
393 const float* qBuffer,
395 unsigned int num_points)
397 unsigned int number = 0;
// Number of complete 8-sample groups handled by the vector loop.
398 const unsigned int eighth_points = num_points / 8;
400 const float* iBufferPtr = iBuffer;
401 const float* qBufferPtr = qBuffer;
// Output addressed as a flat int16_t stream.
// Assumes lv_16sc_t is a pair of int16_t (I then Q) -- TODO confirm.
402 int16_t* complexVectorPtr = (int16_t*)complexVector;
404 float32x4_t vScalar = vdupq_n_f32(scalar);
406 for (; number < eighth_points; number++) {
// Load samples 0..3 and 4..7 of each input.
407 float32x4_t iValue0 = vld1q_f32(iBufferPtr);
408 float32x4_t iValue1 = vld1q_f32(iBufferPtr + 4);
409 float32x4_t qValue0 = vld1q_f32(qBufferPtr);
410 float32x4_t qValue1 = vld1q_f32(qBufferPtr + 4);
414 iValue0 = vmulq_f32(iValue0, vScalar);
415 iValue1 = vmulq_f32(iValue1, vScalar);
416 qValue0 = vmulq_f32(qValue0, vScalar);
417 qValue1 = vmulq_f32(qValue1, vScalar);
// Float -> int32 conversion, round to nearest with ties to even.
419 int32x4_t iInt0 = vcvtnq_s32_f32(iValue0);
420 int32x4_t iInt1 = vcvtnq_s32_f32(iValue1);
421 int32x4_t qInt0 = vcvtnq_s32_f32(qValue0);
422 int32x4_t qInt1 = vcvtnq_s32_f32(qValue1);
// Saturating narrow int32 -> int16.
424 int16x4_t iShort0 = vqmovn_s32(iInt0);
425 int16x4_t iShort1 = vqmovn_s32(iInt1);
426 int16x4_t qShort0 = vqmovn_s32(qInt0);
427 int16x4_t qShort1 = vqmovn_s32(qInt1);
// Pair up I and Q so vst2 can interleave them element by element.
429 int16x4x2_t interleaved0, interleaved1;
430 interleaved0.val[0] = iShort0;
431 interleaved0.val[1] = qShort0;
432 interleaved1.val[0] = iShort1;
433 interleaved1.val[1] = qShort1;
// Each store writes 8 int16_t values (4 complex samples).
435 vst2_s16(complexVectorPtr, interleaved0);
436 vst2_s16(complexVectorPtr + 8, interleaved1);
438 complexVectorPtr += 16;
// NOTE(review): no advance of iBufferPtr/qBufferPtr is visible in this loop;
// the tail loop below relies on them pointing at sample eighth_points * 8.
// Presumably the increments are among the lines not shown -- confirm.
// Tail: handle the remaining (num_points % 8) samples one at a time.
443 number = eighth_points * 8;
444 complexVectorPtr = (int16_t*)(&complexVector[number]);
445 for (; number < num_points; number++) {
// This cast does not clamp out-of-range values, unlike vqmovn above.
446 *complexVectorPtr++ = (int16_t)rintf(*iBufferPtr++ * scalar);
447 *complexVectorPtr++ = (int16_t)rintf(*qBufferPtr++ * scalar);
453#include <riscv_vector.h>
/*
 * RISC-V Vector variant without segment stores: scales I and Q by `scalar`,
 * narrows them to 16-bit integers and packs each I/Q pair into one 32-bit
 * word (I in the low half, Q in the high half) that is stored with a plain
 * 32-bit vector store. The loop is strip-mined with vsetvl, so there is no
 * separate scalar tail.
 *
 * NOTE(review): the numerals fused to the start of the lines look like
 * leftover line numbers from the originating file; the gaps between them
 * suggest that some lines (braces, the `scalar` parameter) are not shown
 * here. The comments describe only the statements that are visible.
 */
455static inline void volk_32f_x2_s32f_interleave_16ic_rvv(
lv_16sc_t* complexVector,
456 const float* iBuffer,
457 const float* qBuffer,
459 unsigned int num_points)
// Output addressed as one 32-bit word per complex sample.
// Assumes sizeof(lv_16sc_t) == 4 with I stored before Q and a little-endian
// target -- TODO confirm.
461 uint32_t* out = (uint32_t*)complexVector;
462 size_t n = num_points;
// Each pass handles vl samples and advances all three pointers by vl.
463 for (
size_t vl; n > 0; n -= vl, out += vl, iBuffer += vl, qBuffer += vl) {
// Ask for as many 32-bit elements (LMUL = 8) as remain.
464 vl = __riscv_vsetvl_e32m8(n);
465 vfloat32m8_t vrf = __riscv_vle32_v_f32m8(iBuffer, vl);
466 vfloat32m8_t vif = __riscv_vle32_v_f32m8(qBuffer, vl);
// Scale, then narrowing convert float32 -> int16.
467 vint16m4_t vri = __riscv_vfncvt_x(__riscv_vfmul(vrf, scalar, vl), vl);
468 vint16m4_t vii = __riscv_vfncvt_x(__riscv_vfmul(vif, scalar, vl), vl);
// Treat the bit patterns as unsigned for the packing arithmetic below.
469 vuint16m4_t vr = __riscv_vreinterpret_u16m4(vri);
470 vuint16m4_t vi = __riscv_vreinterpret_u16m4(vii);
// Widening add gives (vr + vi) in 32 bits; the widening multiply-accumulate
// then adds 0xFFFF * vi, so the total is vr + 0x10000 * vi, i.e.
// vr | (vi << 16).
471 vuint32m8_t vc = __riscv_vwmaccu(__riscv_vwaddu_vv(vr, vi, vl), 0xFFFF, vi, vl);
472 __riscv_vse32(out, vc, vl);
478#include <riscv_vector.h>
/*
 * RISC-V Vector variant that builds a two-vector tuple (I, Q) of 16-bit
 * integers per pass and hands it to a store call together with the output
 * pointer. The loop is strip-mined with vsetvl, so there is no separate
 * scalar tail.
 *
 * NOTE(review): the numerals fused to the start of the lines look like
 * leftover line numbers from the originating file; the gaps between them
 * suggest that some lines (braces, the `scalar` parameter, the opening of the
 * final store call) are not shown here. The comments describe only the
 * statements that are visible.
 */
480static inline void volk_32f_x2_s32f_interleave_16ic_rvvseg(
lv_16sc_t* complexVector,
481 const float* iBuffer,
482 const float* qBuffer,
484 unsigned int num_points)
486 size_t n = num_points;
// Each pass handles vl samples and advances all three pointers by vl.
487 for (
size_t vl; n > 0; n -= vl, complexVector += vl, iBuffer += vl, qBuffer += vl) {
// Ask for as many 32-bit elements (LMUL = 8) as remain.
488 vl = __riscv_vsetvl_e32m8(n);
489 vfloat32m8_t vrf = __riscv_vle32_v_f32m8(iBuffer, vl);
490 vfloat32m8_t vif = __riscv_vle32_v_f32m8(qBuffer, vl);
// Scale, then narrowing convert float32 -> int16.
491 vint16m4_t vri = __riscv_vfncvt_x(__riscv_vfmul(vrf, scalar, vl), vl);
492 vint16m4_t vii = __riscv_vfncvt_x(__riscv_vfmul(vif, scalar, vl), vl);
// NOTE(review): the line that opens this call is not visible; the arguments
// (int16_t destination, tuple of I and Q vectors, vl) look like those of a
// two-field 16-bit segment store, which would interleave I and Q in memory
// -- confirm.
494 (int16_t*)complexVector, __riscv_vcreate_v_i16m4x2(vri, vii), vl);