41#ifndef INCLUDED_volk_16ic_deinterleave_16i_x2_a_H
42#define INCLUDED_volk_16ic_deinterleave_16i_x2_a_H
/* AVX2, aligned-memory kernel: split an interleaved 16-bit I/Q stream into
 * separate int16 I and Q buffers. The scalar tail below establishes the
 * contract: samples alternate I, Q, I, Q, ... in complexVector.
 * NOTE(review): some signature lines and most mask bytes are not visible in
 * this excerpt. */
49static inline void volk_16ic_deinterleave_16i_x2_a_avx2(int16_t* iBuffer,
52                                                        unsigned int num_points)
54 unsigned int number = 0;
55 const int8_t* complexVectorPtr = (int8_t*)complexVector;
56 int16_t* iBufferPtr = iBuffer;
57 int16_t* qBufferPtr = qBuffer;
/* Byte-shuffle control for _mm256_shuffle_epi8; remaining set_epi8
 * arguments are elided from this view. */
59 __m256i MoveMask = _mm256_set_epi8(15,
92 __m256i iMove2, iMove1;
93 __m256i complexVal1, complexVal2, iOutputVal, qOutputVal;
/* Each vector iteration consumes 16 complex samples = 2 x 32 bytes. */
95 unsigned int sixteenthPoints = num_points / 16;
97 for (number = 0; number < sixteenthPoints; number++) {
/* Aligned 256-bit loads: complexVector must be 32-byte aligned. */
98 complexVal1 = _mm256_load_si256((__m256i*)complexVectorPtr);
99 complexVectorPtr += 32;
100 complexVal2 = _mm256_load_si256((__m256i*)complexVectorPtr);
101 complexVectorPtr += 32;
/* Regroup 16-bit lanes within each 128-bit half via the byte shuffle. */
103 iMove2 = _mm256_shuffle_epi8(complexVal2, MoveMask);
104 iMove1 = _mm256_shuffle_epi8(complexVal1, MoveMask);
/* Combine 64-bit groups from both registers into one full I vector and
 * one full Q vector (third permute2x128 argument elided from view). */
106 iOutputVal = _mm256_permute2x128_si256(_mm256_permute4x64_epi64(iMove1, 0x08),
107                                        _mm256_permute4x64_epi64(iMove2, 0x80),
109 qOutputVal = _mm256_permute2x128_si256(_mm256_permute4x64_epi64(iMove1, 0x0d),
110                                        _mm256_permute4x64_epi64(iMove2, 0xd0),
/* Aligned stores: iBuffer and qBuffer must be 32-byte aligned. */
113 _mm256_store_si256((__m256i*)iBufferPtr, iOutputVal);
114 _mm256_store_si256((__m256i*)qBufferPtr, qOutputVal);
/* Scalar tail: handle the remaining num_points % 16 samples. */
120 number = sixteenthPoints * 16;
121 int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr;
122 for (; number < num_points; number++) {
123 *iBufferPtr++ = *int16ComplexVectorPtr++;
124 *qBufferPtr++ = *int16ComplexVectorPtr++;
130#include <tmmintrin.h>
/* SSSE3 aligned-memory kernel (uses _mm_shuffle_epi8 from <tmmintrin.h>):
 * de-interleave int16 I/Q pairs into separate I and Q buffers.
 * NOTE(review): the function signature head is not visible in this excerpt. */
135 unsigned int num_points)
137 unsigned int number = 0;
138 const int8_t* complexVectorPtr = (int8_t*)complexVector;
139 int16_t* iBufferPtr = iBuffer;
140 int16_t* qBufferPtr = qBuffer;
/* Shuffle controls: bytes 0,1,4,5,8,9,12,13 are the I (even) int16 lanes;
 * 0x80 entries zero the destination byte, so mask1/mask2 each fill one
 * half of the result and the halves are OR-merged below. */
142 __m128i iMoveMask1 = _mm_set_epi8(
143 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0);
144 __m128i iMoveMask2 = _mm_set_epi8(
145 13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
/* Q (odd) int16 lanes live at bytes 2,3,6,7,10,11,14,15. */
147 __m128i qMoveMask1 = _mm_set_epi8(
148 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 15, 14, 11, 10, 7, 6, 3, 2);
149 __m128i qMoveMask2 = _mm_set_epi8(
150 15, 14, 11, 10, 7, 6, 3, 2, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
152 __m128i complexVal1, complexVal2, iOutputVal, qOutputVal;
/* Each vector iteration consumes 8 complex samples = 2 x 16 bytes. */
154 unsigned int eighthPoints = num_points / 8;
156 for (number = 0; number < eighthPoints; number++) {
/* Aligned 128-bit loads: complexVector must be 16-byte aligned. */
157 complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr);
158 complexVectorPtr += 16;
159 complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr);
160 complexVectorPtr += 16;
/* Each shuffle packs its register's I (or Q) lanes into one half,
 * leaving the other half zero; OR combines the two halves. */
162 iOutputVal = _mm_or_si128(_mm_shuffle_epi8(complexVal1, iMoveMask1),
163                           _mm_shuffle_epi8(complexVal2, iMoveMask2));
164 qOutputVal = _mm_or_si128(_mm_shuffle_epi8(complexVal1, qMoveMask1),
165                           _mm_shuffle_epi8(complexVal2, qMoveMask2));
/* Aligned stores: iBuffer and qBuffer must be 16-byte aligned. */
167 _mm_store_si128((__m128i*)iBufferPtr, iOutputVal);
168 _mm_store_si128((__m128i*)qBufferPtr, qOutputVal);
/* Scalar tail: handle the remaining num_points % 8 samples. */
174 number = eighthPoints * 8;
175 int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr;
176 for (; number < num_points; number++) {
177 *iBufferPtr++ = *int16ComplexVectorPtr++;
178 *qBufferPtr++ = *int16ComplexVectorPtr++;
184#include <emmintrin.h>
/* SSE2 aligned-memory kernel (no byte shuffle available): de-interleave
 * int16 I/Q pairs using 16-bit/32-bit shuffles plus mask-and-OR merges.
 * NOTE(review): the function signature head is not visible in this excerpt. */
189 unsigned int num_points)
191 unsigned int number = 0;
192 const int16_t* complexVectorPtr = (int16_t*)complexVector;
193 int16_t* iBufferPtr = iBuffer;
194 int16_t* qBufferPtr = qBuffer;
195 __m128i complexVal1, complexVal2, iComplexVal1, iComplexVal2, qComplexVal1,
196 qComplexVal2, iOutputVal, qOutputVal;
/* lowMask keeps the low 64 bits, highMask the high 64 bits, so the two
 * partially-shuffled registers can be OR-merged into one result. */
197 __m128i lowMask = _mm_set_epi32(0x0, 0x0, 0xFFFFFFFF, 0xFFFFFFFF);
198 __m128i highMask = _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0x0, 0x0);
/* Each vector iteration consumes 8 complex samples = 2 x 16 bytes. */
200 unsigned int eighthPoints = num_points / 8;
202 for (number = 0; number < eighthPoints; number++) {
/* Aligned 128-bit loads: complexVector must be 16-byte aligned. */
203 complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr);
204 complexVectorPtr += 8;
205 complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr);
206 complexVectorPtr += 8;
/* _MM_SHUFFLE(3,1,2,0) on each 64-bit half gathers the even (I) 16-bit
 * lanes first, then the 32-bit shuffle packs them together. */
208 iComplexVal1 = _mm_shufflelo_epi16(complexVal1, _MM_SHUFFLE(3, 1, 2, 0));
210 iComplexVal1 = _mm_shufflehi_epi16(iComplexVal1, _MM_SHUFFLE(3, 1, 2, 0));
212 iComplexVal1 = _mm_shuffle_epi32(iComplexVal1, _MM_SHUFFLE(3, 1, 2, 0));
214 iComplexVal2 = _mm_shufflelo_epi16(complexVal2, _MM_SHUFFLE(3, 1, 2, 0));
216 iComplexVal2 = _mm_shufflehi_epi16(iComplexVal2, _MM_SHUFFLE(3, 1, 2, 0));
/* Second register's I lanes are steered to the high half (2,0,3,1). */
218 iComplexVal2 = _mm_shuffle_epi32(iComplexVal2, _MM_SHUFFLE(2, 0, 3, 1));
/* Merge: low 64 bits from register 1, high 64 bits from register 2. */
220 iOutputVal = _mm_or_si128(_mm_and_si128(iComplexVal1, lowMask),
221                           _mm_and_si128(iComplexVal2, highMask));
223 _mm_store_si128((__m128i*)iBufferPtr, iOutputVal);
/* Same dance for the odd (Q) 16-bit lanes, using (2,0,3,1) first. */
225 qComplexVal1 = _mm_shufflelo_epi16(complexVal1, _MM_SHUFFLE(2, 0, 3, 1));
227 qComplexVal1 = _mm_shufflehi_epi16(qComplexVal1, _MM_SHUFFLE(2, 0, 3, 1));
229 qComplexVal1 = _mm_shuffle_epi32(qComplexVal1, _MM_SHUFFLE(3, 1, 2, 0));
231 qComplexVal2 = _mm_shufflelo_epi16(complexVal2, _MM_SHUFFLE(2, 0, 3, 1));
233 qComplexVal2 = _mm_shufflehi_epi16(qComplexVal2, _MM_SHUFFLE(2, 0, 3, 1));
235 qComplexVal2 = _mm_shuffle_epi32(qComplexVal2, _MM_SHUFFLE(2, 0, 3, 1));
237 qOutputVal = _mm_or_si128(_mm_and_si128(qComplexVal1, lowMask),
238                           _mm_and_si128(qComplexVal2, highMask));
240 _mm_store_si128((__m128i*)qBufferPtr, qOutputVal);
/* Scalar tail: handle the remaining num_points % 8 samples. */
246 number = eighthPoints * 8;
247 for (; number < num_points; number++) {
248 *iBufferPtr++ = *complexVectorPtr++;
249 *qBufferPtr++ = *complexVectorPtr++;
254#ifdef LV_HAVE_GENERIC
/* Portable reference implementation: walk the interleaved stream two
 * int16 values at a time, writing the first of each pair to iBuffer and
 * the second to qBuffer.
 * NOTE(review): the signature head and the declaration of `number` are
 * not visible in this excerpt. */
259 unsigned int num_points)
261 const int16_t* complexVectorPtr = (
const int16_t*)complexVector;
262 int16_t* iBufferPtr = iBuffer;
263 int16_t* qBufferPtr = qBuffer;
265 for (number = 0; number < num_points; number++) {
266 *iBufferPtr++ = *complexVectorPtr++;
267 *qBufferPtr++ = *complexVectorPtr++;
/* ARM NEON kernel: vld2q_s16 performs the de-interleave in the load
 * itself, yielding val[0] = I lanes and val[1] = Q lanes (matching the
 * I-first order of the scalar tail below).
 * NOTE(review): the function signature head is not visible in this excerpt. */
279 unsigned int num_points)
281 unsigned int number = 0;
/* Each vector iteration consumes 8 complex samples = 16 int16 values. */
282 const unsigned int eighthPoints = num_points / 8;
283 const int16_t* complexVectorPtr = (
const int16_t*)complexVector;
284 int16_t* iBufferPtr = iBuffer;
285 int16_t* qBufferPtr = qBuffer;
287 int16x8x2_t complexVal;
289 for (; number < eighthPoints; number++) {
/* De-interleaving structure load: even int16 -> val[0], odd -> val[1]. */
290 complexVal = vld2q_s16(complexVectorPtr);
291 vst1q_s16(iBufferPtr, complexVal.val[0]);
292 vst1q_s16(qBufferPtr, complexVal.val[1]);
293 complexVectorPtr += 16;
/* Scalar tail: handle the remaining num_points % 8 samples. */
298 number = eighthPoints * 8;
299 for (; number < num_points; number++) {
300 *iBufferPtr++ = *complexVectorPtr++;
301 *qBufferPtr++ = *complexVectorPtr++;
/* ARMv8 NEON kernel: identical contract to the NEON variant but unrolled
 * 2x, processing 16 complex samples (two vld2q_s16 loads) per iteration.
 * NOTE(review): some signature lines are not visible in this excerpt. */
310static inline void volk_16ic_deinterleave_16i_x2_neonv8(int16_t* iBuffer,
313                                                        unsigned int num_points)
315 unsigned int number = 0;
316 const unsigned int sixteenthPoints = num_points / 16;
317 const int16_t* complexVectorPtr = (
const int16_t*)complexVector;
318 int16_t* iBufferPtr = iBuffer;
319 int16_t* qBufferPtr = qBuffer;
321 int16x8x2_t complexVal0, complexVal1;
323 for (; number < sixteenthPoints; number++) {
/* Two de-interleaving loads cover 16 complex samples (32 int16). */
324 complexVal0 = vld2q_s16(complexVectorPtr);
325 complexVal1 = vld2q_s16(complexVectorPtr + 16);
/* val[0] holds the I (even) lanes, val[1] the Q (odd) lanes. */
328 vst1q_s16(iBufferPtr, complexVal0.val[0]);
329 vst1q_s16(iBufferPtr + 8, complexVal1.val[0]);
330 vst1q_s16(qBufferPtr, complexVal0.val[1]);
331 vst1q_s16(qBufferPtr + 8, complexVal1.val[1]);
333 complexVectorPtr += 32;
/* Scalar tail: handle the remaining num_points % 16 samples. */
338 number = sixteenthPoints * 16;
339 for (; number < num_points; number++) {
340 *iBufferPtr++ = *complexVectorPtr++;
341 *qBufferPtr++ = *complexVectorPtr++;
/* ORC (Oil Runtime Compiler) implementation, generated elsewhere from a
 * .orc kernel description; only the extern declaration appears here.
 * NOTE(review): remaining parameter lines are not visible in this excerpt. */
349extern void volk_16ic_deinterleave_16i_x2_a_orc_impl(int16_t* iBuffer,
/* Thin wrapper: the unaligned entry point simply forwards to the ORC
 * implementation (which handles alignment itself). */
353static inline void volk_16ic_deinterleave_16i_x2_u_orc(int16_t* iBuffer,
356                                                       unsigned int num_points)
358 volk_16ic_deinterleave_16i_x2_a_orc_impl(iBuffer, qBuffer, complexVector, num_points);
365#ifndef INCLUDED_volk_16ic_deinterleave_16i_x2_u_H
366#define INCLUDED_volk_16ic_deinterleave_16i_x2_u_H
371#include <immintrin.h>
/* AVX2, unaligned-memory kernel: identical algorithm to the aligned
 * _a_avx2 variant above, but using loadu/storeu so no alignment is
 * required of any buffer.
 * NOTE(review): some signature lines and most mask bytes are not visible
 * in this excerpt. */
373static inline void volk_16ic_deinterleave_16i_x2_u_avx2(int16_t* iBuffer,
376                                                        unsigned int num_points)
378 unsigned int number = 0;
379 const int8_t* complexVectorPtr = (int8_t*)complexVector;
380 int16_t* iBufferPtr = iBuffer;
381 int16_t* qBufferPtr = qBuffer;
/* Byte-shuffle control for _mm256_shuffle_epi8; remaining set_epi8
 * arguments are elided from this view. */
383 __m256i MoveMask = _mm256_set_epi8(15,
416 __m256i iMove2, iMove1;
417 __m256i complexVal1, complexVal2, iOutputVal, qOutputVal;
/* Each vector iteration consumes 16 complex samples = 2 x 32 bytes. */
419 unsigned int sixteenthPoints = num_points / 16;
421 for (number = 0; number < sixteenthPoints; number++) {
/* Unaligned loads: no alignment requirement on complexVector. */
422 complexVal1 = _mm256_loadu_si256((__m256i*)complexVectorPtr);
423 complexVectorPtr += 32;
424 complexVal2 = _mm256_loadu_si256((__m256i*)complexVectorPtr);
425 complexVectorPtr += 32;
/* Regroup 16-bit lanes within each 128-bit half via the byte shuffle. */
427 iMove2 = _mm256_shuffle_epi8(complexVal2, MoveMask);
428 iMove1 = _mm256_shuffle_epi8(complexVal1, MoveMask);
/* Combine 64-bit groups from both registers into one full I vector and
 * one full Q vector (third permute2x128 argument elided from view). */
430 iOutputVal = _mm256_permute2x128_si256(_mm256_permute4x64_epi64(iMove1, 0x08),
431                                        _mm256_permute4x64_epi64(iMove2, 0x80),
433 qOutputVal = _mm256_permute2x128_si256(_mm256_permute4x64_epi64(iMove1, 0x0d),
434                                        _mm256_permute4x64_epi64(iMove2, 0xd0),
/* Unaligned stores: no alignment requirement on iBuffer/qBuffer. */
437 _mm256_storeu_si256((__m256i*)iBufferPtr, iOutputVal);
438 _mm256_storeu_si256((__m256i*)qBufferPtr, qOutputVal);
/* Scalar tail: handle the remaining num_points % 16 samples. */
444 number = sixteenthPoints * 16;
445 int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr;
446 for (; number < num_points; number++) {
447 *iBufferPtr++ = *int16ComplexVectorPtr++;
448 *qBufferPtr++ = *int16ComplexVectorPtr++;
454#include <riscv_vector.h>
/* RISC-V Vector kernel: load each complex sample as one 32-bit word,
 * then use narrowing shifts to split it — low 16 bits (first int16 of
 * the pair, i.e. I on little-endian RISC-V) and high 16 bits (Q).
 * Stripmined via vsetvl, so no scalar tail loop is needed.
 * NOTE(review): some signature lines are not visible in this excerpt. */
456static inline void volk_16ic_deinterleave_16i_x2_rvv(int16_t* iBuffer,
459                                                     unsigned int num_points)
461 size_t n = num_points;
462 for (
size_t vl; n > 0; n -= vl, complexVector += vl, iBuffer += vl, qBuffer += vl) {
/* vl = number of complex samples processed this pass. */
463 vl = __riscv_vsetvl_e16m4(n);
/* One u32 element per complex sample (I in low half, Q in high half). */
464 vuint32m8_t vc = __riscv_vle32_v_u32m8((
const uint32_t*)complexVector, vl);
/* Narrowing shift right: >>0 extracts I, >>16 extracts Q. */
465 vuint16m4_t vr = __riscv_vnsrl(vc, 0, vl);
466 vuint16m4_t vi = __riscv_vnsrl(vc, 16, vl);
467 __riscv_vse16((uint16_t*)iBuffer, vr, vl);
468 __riscv_vse16((uint16_t*)qBuffer, vi, vl);
474#include <riscv_vector.h>
/* RISC-V Vector kernel using segment loads: vlseg2e16 de-interleaves in
 * the load itself, producing field 0 = I lanes and field 1 = Q lanes.
 * Stripmined via vsetvl, so no scalar tail loop is needed.
 * NOTE(review): some signature lines (and the lhs of the segment load,
 * presumably a vuint16m4x2_t `vc`) are not visible in this excerpt. */
476static inline void volk_16ic_deinterleave_16i_x2_rvvseg(int16_t* iBuffer,
479                                                        unsigned int num_points)
481 size_t n = num_points;
482 for (
size_t vl; n > 0; n -= vl, complexVector += vl, iBuffer += vl, qBuffer += vl) {
/* vl = number of complex samples processed this pass. */
483 vl = __riscv_vsetvl_e16m4(n);
/* Two-field segment load: alternating int16 values split into fields. */
485 __riscv_vlseg2e16_v_u16m4x2((
const uint16_t*)complexVector, vl);
/* Field 0 carries the I samples, field 1 the Q samples. */
486 vuint16m4_t vr = __riscv_vget_u16m4(vc, 0);
487 vuint16m4_t vi = __riscv_vget_u16m4(vc, 1);
488 __riscv_vse16((uint16_t*)iBuffer, vr, vl);
489 __riscv_vse16((uint16_t*)qBuffer, vi, vl);