41#ifndef INCLUDED_volk_16ic_deinterleave_real_8i_a_H
42#define INCLUDED_volk_16ic_deinterleave_real_8i_a_H
/*
 * AVX2 kernel using aligned loads and stores.
 *
 * iBuffer       - output buffer of int8_t values.
 * complexVector - input buffer (its declaration is not visible in this
 *                 excerpt); it is walked here as raw bytes.
 * num_points    - number of int8_t values to produce.
 *
 * The scalar tail at the bottom defines the per-point result: the upper
 * 8 bits of every other int16_t of the input.
 */
51static inline void volk_16ic_deinterleave_real_8i_a_avx2(int8_t* iBuffer,
53 unsigned int num_points)
55 unsigned int number = 0;
56 const int8_t* complexVectorPtr = (int8_t*)complexVector;
57 int8_t* iBufferPtr = iBuffer;
/* NOTE(review): the arguments of both shuffle masks are truncated in this
 * excerpt; presumably they gather the same int16_t values the scalar tail
 * reads - confirm against the full file. */
58 __m256i iMoveMask1 = _mm256_set_epi8(0x80,
90 __m256i iMoveMask2 = _mm256_set_epi8(13,
122 __m256i complexVal1, complexVal2, complexVal3, complexVal4, iOutputVal;
/* Number of full 32-point iterations handled by the vector loop. */
124 unsigned int thirtysecondPoints = num_points / 32;
126 for (number = 0; number < thirtysecondPoints; number++) {
/* Four 32-byte loads (128 input bytes) per iteration.
 * _mm256_load_si256 requires a 32-byte-aligned address. */
127 complexVal1 = _mm256_load_si256((__m256i*)complexVectorPtr);
128 complexVectorPtr += 32;
129 complexVal2 = _mm256_load_si256((__m256i*)complexVectorPtr);
130 complexVectorPtr += 32;
132 complexVal3 = _mm256_load_si256((__m256i*)complexVectorPtr);
133 complexVectorPtr += 32;
134 complexVal4 = _mm256_load_si256((__m256i*)complexVectorPtr);
135 complexVectorPtr += 32;
/* Shuffle each register with its mask, then OR the pair into one register. */
137 complexVal1 = _mm256_shuffle_epi8(complexVal1, iMoveMask1);
138 complexVal2 = _mm256_shuffle_epi8(complexVal2, iMoveMask2);
140 complexVal1 = _mm256_or_si256(complexVal1, complexVal2);
/* 0xd8 selects 64-bit lanes in the order 0,2,1,3 (swaps the middle two);
 * needed because the byte shuffle works per 128-bit half. */
141 complexVal1 = _mm256_permute4x64_epi64(complexVal1, 0xd8);
143 complexVal3 = _mm256_shuffle_epi8(complexVal3, iMoveMask1);
144 complexVal4 = _mm256_shuffle_epi8(complexVal4, iMoveMask2);
146 complexVal3 = _mm256_or_si256(complexVal3, complexVal4);
147 complexVal3 = _mm256_permute4x64_epi64(complexVal3, 0xd8);
/* Arithmetic shift right by 8 leaves the sign-extended high byte in each
 * 16-bit lane, so every lane is already within the int8_t range. */
149 complexVal1 = _mm256_srai_epi16(complexVal1, 8);
150 complexVal3 = _mm256_srai_epi16(complexVal3, 8);
/* Saturating 16->8 bit pack (saturation cannot trigger after the shift).
 * The pack also works per 128-bit half, hence the second lane fix-up. */
152 iOutputVal = _mm256_packs_epi16(complexVal1, complexVal3);
153 iOutputVal = _mm256_permute4x64_epi64(iOutputVal, 0xd8);
/* Aligned 32-byte store of 32 results.
 * NOTE(review): no advance of iBufferPtr is visible in this excerpt -
 * confirm it exists in the full file. */
155 _mm256_store_si256((__m256i*)iBufferPtr, iOutputVal);
/* Scalar tail for the remaining num_points % 32 points. */
160 number = thirtysecondPoints * 32;
161 int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr;
162 for (; number < num_points; number++) {
/* Keep the high byte of the first int16_t of the pair ... */
163 *iBufferPtr++ = ((int8_t)(*int16ComplexVectorPtr++ >> 8));
/* ... and skip the second int16_t of the pair. */
164 int16ComplexVectorPtr++;
171#include <tmmintrin.h>
/*
 * 128-bit SIMD kernel using _mm_shuffle_epi8 with aligned loads and stores.
 * NOTE(review): the line carrying the function name and the first parameters
 * is not visible in this excerpt.
 *
 * num_points - number of int8_t values to produce.
 *
 * Per point, the result is the upper 8 bits of every other int16_t of the
 * input (see the scalar tail at the bottom).
 */
175 unsigned int num_points)
177 unsigned int number = 0;
178 const int8_t* complexVectorPtr = (int8_t*)complexVector;
179 int8_t* iBufferPtr = iBuffer;
/* _mm_set_epi8 lists bytes from most to least significant.
 * Mask 1 moves source bytes 0,1,4,5,8,9,12,13 (the first int16_t of each
 * 4-byte group) into the low 8 bytes; a 0x80 selector zeroes its byte. */
180 __m128i iMoveMask1 = _mm_set_epi8(
181 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0);
/* Mask 2 moves the same source bytes into the high 8 bytes instead, so the
 * two shuffled registers can be merged with a bitwise OR. */
182 __m128i iMoveMask2 = _mm_set_epi8(
183 13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
184 __m128i complexVal1, complexVal2, complexVal3, complexVal4, iOutputVal;
/* Number of full 16-point iterations handled by the vector loop. */
186 unsigned int sixteenthPoints = num_points / 16;
188 for (number = 0; number < sixteenthPoints; number++) {
/* Four 16-byte loads (64 input bytes) per iteration.
 * _mm_load_si128 requires a 16-byte-aligned address. */
189 complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr);
190 complexVectorPtr += 16;
191 complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr);
192 complexVectorPtr += 16;
194 complexVal3 = _mm_load_si128((__m128i*)complexVectorPtr);
195 complexVectorPtr += 16;
196 complexVal4 = _mm_load_si128((__m128i*)complexVectorPtr);
197 complexVectorPtr += 16;
/* Gather 4 selected int16_t values from each register and merge each pair
 * of registers into one register of 8 values. */
199 complexVal1 = _mm_shuffle_epi8(complexVal1, iMoveMask1);
200 complexVal2 = _mm_shuffle_epi8(complexVal2, iMoveMask2);
202 complexVal1 = _mm_or_si128(complexVal1, complexVal2);
204 complexVal3 = _mm_shuffle_epi8(complexVal3, iMoveMask1);
205 complexVal4 = _mm_shuffle_epi8(complexVal4, iMoveMask2);
207 complexVal3 = _mm_or_si128(complexVal3, complexVal4);
/* Arithmetic shift right by 8 leaves the sign-extended high byte in each
 * 16-bit lane, so every lane is already within the int8_t range. */
210 complexVal1 = _mm_srai_epi16(complexVal1, 8);
211 complexVal3 = _mm_srai_epi16(complexVal3, 8);
/* Saturating 16->8 bit pack of both registers into 16 output bytes. */
213 iOutputVal = _mm_packs_epi16(complexVal1, complexVal3);
/* Aligned 16-byte store.
 * NOTE(review): no advance of iBufferPtr is visible in this excerpt -
 * confirm it exists in the full file. */
215 _mm_store_si128((__m128i*)iBufferPtr, iOutputVal);
/* Scalar tail for the remaining num_points % 16 points. */
220 number = sixteenthPoints * 16;
221 int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr;
222 for (; number < num_points; number++) {
/* Keep the high byte of the first int16_t of the pair ... */
223 *iBufferPtr++ = ((int8_t)(*int16ComplexVectorPtr++ >> 8));
/* ... and skip the second int16_t of the pair. */
224 int16ComplexVectorPtr++;
229#ifdef LV_HAVE_GENERIC
/*
 * Plain C kernel: one output byte per loop iteration, num_points iterations.
 * NOTE(review): the line carrying the function name and the first parameters
 * is not visible in this excerpt.
 */
233 unsigned int num_points)
235 unsigned int number = 0;
236 int16_t* complexVectorPtr = (int16_t*)complexVector;
237 int8_t* iBufferPtr = iBuffer;
238 for (number = 0; number < num_points; number++) {
/* Store the upper 8 bits of the current int16_t.
 * NOTE(review): only one advance of complexVectorPtr is visible here; the
 * other kernels skip a second int16_t per point - confirm the following
 * (elided) line does the same. */
239 *iBufferPtr++ = ((int8_t)(*complexVectorPtr++ >> 8));
/*
 * NEON kernel processing 8 points per vector iteration.
 * NOTE(review): the function name line and the declarations of `number` and
 * `realOutput` are not visible in this excerpt.
 */
250 unsigned int num_points)
252 const int16_t* complexVectorPtr = (
const int16_t*)complexVector;
253 int8_t* iBufferPtr = iBuffer;
/* Number of full 8-point iterations handled by the vector loop. */
254 unsigned int eighth_points = num_points / 8;
257 int16x8x2_t complexInput;
259 for (number = 0; number < eighth_points; number++) {
/* vld2q_s16 reads 16 int16_t values and de-interleaves them:
 * even-indexed elements go to val[0], odd-indexed ones to val[1]. */
260 complexInput = vld2q_s16(complexVectorPtr);
/* Shift each val[0] lane right by 8 and narrow it to 8 bits. */
261 realOutput = vshrn_n_s16(complexInput.val[0], 8);
/* Store the 8 resulting bytes.
 * NOTE(review): no advance of iBufferPtr is visible in this excerpt -
 * confirm it exists in the full file. */
262 vst1_s8(iBufferPtr, realOutput);
263 complexVectorPtr += 16;
/* Scalar tail for the remaining num_points % 8 points. */
267 for (number = eighth_points * 8; number < num_points; number++) {
/* NOTE(review): only one advance of complexVectorPtr is visible per point;
 * confirm the following (elided) line skips the second int16_t. */
268 *iBufferPtr++ = ((int8_t)(*complexVectorPtr++ >> 8));
/*
 * NEON kernel processing 16 points per vector iteration (two de-interleaving
 * loads per pass).
 *
 * iBuffer       - output buffer of int8_t values.
 * complexVector - input buffer (its declaration is not visible in this
 *                 excerpt); read here as int16_t values.
 * num_points    - number of int8_t values to produce.
 */
277static inline void volk_16ic_deinterleave_real_8i_neonv8(int8_t* iBuffer,
279 unsigned int num_points)
281 const int16_t* complexVectorPtr = (
const int16_t*)complexVector;
282 int8_t* iBufferPtr = iBuffer;
/* Number of full 16-point iterations handled by the vector loop. */
283 const unsigned int sixteenthPoints = num_points / 16;
285 for (
unsigned int number = 0; number < sixteenthPoints; number++) {
/* Each vld2q_s16 reads 16 int16_t values and de-interleaves them:
 * even-indexed elements go to val[0], odd-indexed ones to val[1]. */
286 int16x8x2_t cplx0 = vld2q_s16(complexVectorPtr);
287 int16x8x2_t cplx1 = vld2q_s16(complexVectorPtr + 16);
/* Shift each val[0] lane right by 8 and narrow it to 8 bits. */
290 int8x8_t out0 = vshrn_n_s16(cplx0.val[0], 8);
291 int8x8_t out1 = vshrn_n_s16(cplx1.val[0], 8);
/* Two 8-byte stores, 16 output bytes in total.
 * NOTE(review): no advance of iBufferPtr is visible in this excerpt -
 * confirm it exists in the full file. */
293 vst1_s8(iBufferPtr, out0);
294 vst1_s8(iBufferPtr + 8, out1);
/* 32 int16_t values were consumed by the two loads. */
296 complexVectorPtr += 32;
/* Scalar tail for the remaining num_points % 16 points. */
300 for (
unsigned int number = sixteenthPoints * 16; number < num_points; number++) {
/* NOTE(review): only one advance of complexVectorPtr is visible per point;
 * confirm the following (elided) line skips the second int16_t. */
301 *iBufferPtr++ = ((int8_t)(*complexVectorPtr++ >> 8));
/* Externally defined implementation; the rest of its parameter list is not
 * visible in this excerpt. */
309extern void volk_16ic_deinterleave_real_8i_a_orc_impl(int8_t* iBuffer,
/* Thin wrapper: forwards iBuffer, complexVector and num_points unchanged to
 * volk_16ic_deinterleave_real_8i_a_orc_impl. */
313static inline void volk_16ic_deinterleave_real_8i_u_orc(int8_t* iBuffer,
315 unsigned int num_points)
317 volk_16ic_deinterleave_real_8i_a_orc_impl(iBuffer, complexVector, num_points);
324#ifndef INCLUDED_volk_16ic_deinterleave_real_8i_u_H
325#define INCLUDED_volk_16ic_deinterleave_real_8i_u_H
332#include <immintrin.h>
/*
 * AVX2 kernel using unaligned loads and stores (_mm256_loadu_si256 /
 * _mm256_storeu_si256), so the buffers need no particular alignment.
 *
 * iBuffer       - output buffer of int8_t values.
 * complexVector - input buffer (its declaration is not visible in this
 *                 excerpt); it is walked here as raw bytes.
 * num_points    - number of int8_t values to produce.
 *
 * The scalar tail at the bottom defines the per-point result: the upper
 * 8 bits of every other int16_t of the input.
 */
334static inline void volk_16ic_deinterleave_real_8i_u_avx2(int8_t* iBuffer,
336 unsigned int num_points)
338 unsigned int number = 0;
339 const int8_t* complexVectorPtr = (int8_t*)complexVector;
340 int8_t* iBufferPtr = iBuffer;
/* NOTE(review): the arguments of both shuffle masks are truncated in this
 * excerpt; presumably they gather the same int16_t values the scalar tail
 * reads - confirm against the full file. */
341 __m256i iMoveMask1 = _mm256_set_epi8(0x80,
373 __m256i iMoveMask2 = _mm256_set_epi8(13,
405 __m256i complexVal1, complexVal2, complexVal3, complexVal4, iOutputVal;
/* Number of full 32-point iterations handled by the vector loop. */
407 unsigned int thirtysecondPoints = num_points / 32;
409 for (number = 0; number < thirtysecondPoints; number++) {
/* Four unaligned 32-byte loads (128 input bytes) per iteration. */
410 complexVal1 = _mm256_loadu_si256((__m256i*)complexVectorPtr);
411 complexVectorPtr += 32;
412 complexVal2 = _mm256_loadu_si256((__m256i*)complexVectorPtr);
413 complexVectorPtr += 32;
415 complexVal3 = _mm256_loadu_si256((__m256i*)complexVectorPtr);
416 complexVectorPtr += 32;
417 complexVal4 = _mm256_loadu_si256((__m256i*)complexVectorPtr);
418 complexVectorPtr += 32;
/* Shuffle each register with its mask, then OR the pair into one register. */
420 complexVal1 = _mm256_shuffle_epi8(complexVal1, iMoveMask1);
421 complexVal2 = _mm256_shuffle_epi8(complexVal2, iMoveMask2);
423 complexVal1 = _mm256_or_si256(complexVal1, complexVal2);
/* 0xd8 selects 64-bit lanes in the order 0,2,1,3 (swaps the middle two);
 * needed because the byte shuffle works per 128-bit half. */
424 complexVal1 = _mm256_permute4x64_epi64(complexVal1, 0xd8);
426 complexVal3 = _mm256_shuffle_epi8(complexVal3, iMoveMask1);
427 complexVal4 = _mm256_shuffle_epi8(complexVal4, iMoveMask2);
429 complexVal3 = _mm256_or_si256(complexVal3, complexVal4);
430 complexVal3 = _mm256_permute4x64_epi64(complexVal3, 0xd8);
/* Arithmetic shift right by 8 leaves the sign-extended high byte in each
 * 16-bit lane, so every lane is already within the int8_t range. */
432 complexVal1 = _mm256_srai_epi16(complexVal1, 8);
433 complexVal3 = _mm256_srai_epi16(complexVal3, 8);
/* Saturating 16->8 bit pack (saturation cannot trigger after the shift).
 * The pack also works per 128-bit half, hence the second lane fix-up. */
435 iOutputVal = _mm256_packs_epi16(complexVal1, complexVal3);
436 iOutputVal = _mm256_permute4x64_epi64(iOutputVal, 0xd8);
/* Unaligned 32-byte store of 32 results.
 * NOTE(review): no advance of iBufferPtr is visible in this excerpt -
 * confirm it exists in the full file. */
438 _mm256_storeu_si256((__m256i*)iBufferPtr, iOutputVal);
/* Scalar tail for the remaining num_points % 32 points. */
443 number = thirtysecondPoints * 32;
444 int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr;
445 for (; number < num_points; number++) {
/* Keep the high byte of the first int16_t of the pair ... */
446 *iBufferPtr++ = ((int8_t)(*int16ComplexVectorPtr++ >> 8));
/* ... and skip the second int16_t of the pair. */
447 int16ComplexVectorPtr++;
454#include <riscv_vector.h>
/*
 * RISC-V Vector kernel: treats the input as 32-bit words and reduces each
 * word to a single output byte.
 *
 * iBuffer       - output buffer of int8_t values; advanced in place.
 * complexVector - input buffer (its declaration is not visible in this
 *                 excerpt); read here as uint32_t words.
 * num_points    - number of 32-bit input words to process.
 */
456static inline void volk_16ic_deinterleave_real_8i_rvv(int8_t* iBuffer,
458 unsigned int num_points)
460 const uint32_t* in = (
const uint32_t*)complexVector;
461 size_t n = num_points;
/* Strip-mined loop: each pass handles vl elements, then advances both
 * pointers and decrements the remaining count by vl. */
462 for (
size_t vl; n > 0; n -= vl, in += vl, iBuffer += vl) {
/* Ask for up to n 32-bit elements at LMUL=8; vl is the count granted. */
463 vl = __riscv_vsetvl_e32m8(n);
464 vuint32m8_t vc = __riscv_vle32_v_u32m8(in, vl);
/* Inner vnsrl (shift 0) narrows 32->16 bits, keeping the low half-word;
 * outer vnsrl (shift 8) narrows 16->8 bits, keeping bits 8..15.
 * NOTE(review): the call these arguments belong to is on a line not visible
 * in this excerpt; presumably a vector store to iBuffer - confirm. */
466 (uint8_t*)iBuffer, __riscv_vnsrl(__riscv_vnsrl(vc, 0, vl), 8, vl), vl);