60#ifndef INCLUDED_volk_32fc_deinterleave_32f_x2_a_H
61#define INCLUDED_volk_32fc_deinterleave_32f_x2_a_H
/*
 * Scalar deinterleave loop.
 * Walks complexVector as a flat float sequence and, for each of num_points
 * iterations, copies two consecutive floats: the first to iBuffer and the
 * second to qBuffer.
 * NOTE(review): the function's name line, its other parameter lines and the
 * declaration of `number` are not visible in this part of the file.
 */
71 unsigned int num_points)
/* View the complex input as plain floats so it can be read one float at a time. */
73 const float* complexVectorPtr = (
float*)complexVector;
74 float* iBufferPtr = iBuffer;
75 float* qBufferPtr = qBuffer;
/* One iteration per point: two input floats consumed, one written to each output. */
77 for (number = 0; number < num_points; number++) {
78 *iBufferPtr++ = *complexVectorPtr++;
79 *qBufferPtr++ = *complexVectorPtr++;
/*
 * AVX-512F kernel using aligned loads and stores.
 * Each vector iteration reads 16 floats from complexVector and writes the
 * even-indexed floats to iBuffer and the odd-indexed floats to qBuffer
 * (8 points per iteration).
 * NOTE(review): the remaining parameter lines, the declaration of cplxValue,
 * the second argument of each permute call and the output-pointer increments
 * are not visible in this part of the file.
 */
87static inline void volk_32fc_deinterleave_32f_x2_a_avx512f(
float* iBuffer,
90 unsigned int num_points)
/* View the complex input as plain floats. */
92 const float* complexVectorPtr = (
float*)complexVector;
93 float* iBufferPtr = iBuffer;
94 float* qBufferPtr = qBuffer;
96 unsigned int number = 0;
/* Number of full 8-point (16-float) groups handled by the vector loop. */
97 const unsigned int eighthPoints = num_points / 8;
100 __m512 iValue, qValue;
102 for (; number < eighthPoints; number++) {
/* Aligned 512-bit load of 16 floats; _mm512_load_ps requires a 64-byte aligned address. */
104 cplxValue = _mm512_load_ps(complexVectorPtr);
/*
 * The index vector gathers source floats 0,2,...,14 into the low 8 lanes.
 * The upper 8 indices are filler: only the low 256 bits are stored below.
 */
108 iValue = _mm512_permutexvar_ps(
109 _mm512_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14, 0, 0, 0, 0, 0, 0, 0, 0),
/* Same gather with the odd indices 1,3,...,15. */
113 qValue = _mm512_permutexvar_ps(
114 _mm512_setr_epi32(1, 3, 5, 7, 9, 11, 13, 15, 0, 0, 0, 0, 0, 0, 0, 0),
/* Store the low 8 floats of each result; _mm256_store_ps requires a 32-byte aligned address. */
118 _mm256_store_ps(iBufferPtr, _mm512_castps512_ps256(iValue));
119 _mm256_store_ps(qBufferPtr, _mm512_castps512_ps256(qValue));
121 complexVectorPtr += 16;
/*
 * Tail: the num_points - number leftover points are handed to another routine
 * at the current pointer positions.
 * NOTE(review): the callee name is not visible here; presumably the scalar
 * fallback kernel — confirm.
 */
126 number = eighthPoints * 8;
128 iBufferPtr, qBufferPtr, (
const lv_32fc_t*)complexVectorPtr, num_points - number);
133#include <immintrin.h>
/*
 * AVX kernel using aligned loads and stores.
 * Each vector iteration reads 16 floats (two 256-bit loads) from complexVector
 * and writes the 8 even-indexed floats to iBuffer and the 8 odd-indexed floats
 * to qBuffer. A scalar loop handles the leftover points.
 * NOTE(review): the function's name line, its other parameter lines and the
 * output-pointer increments inside the vector loop are not visible in this
 * part of the file.
 */
137 unsigned int num_points)
/* View the complex input as plain floats. */
139 const float* complexVectorPtr = (
float*)complexVector;
140 float* iBufferPtr = iBuffer;
141 float* qBufferPtr = qBuffer;
143 unsigned int number = 0;
/* Number of full 8-point groups handled by the vector loop. */
145 const unsigned int eighthPoints = num_points / 8;
146 __m256 cplxValue1, cplxValue2, complex1, complex2, iValue, qValue;
147 for (; number < eighthPoints; number++) {
/* Two aligned 256-bit loads (32-byte alignment required) cover floats 0-7 and 8-15. */
148 cplxValue1 = _mm256_load_ps(complexVectorPtr);
149 complexVectorPtr += 8;
151 cplxValue2 = _mm256_load_ps(complexVectorPtr);
152 complexVectorPtr += 8;
/*
 * Regroup 128-bit halves so the per-lane shuffle below yields in-order output:
 * 0x20 -> { low half of cplxValue1, low half of cplxValue2 }
 * 0x31 -> { high half of cplxValue1, high half of cplxValue2 }
 */
154 complex1 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x20);
155 complex2 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x31);
/* 0x88 picks elements 0 and 2 of each source per 128-bit lane (even-indexed floats). */
158 iValue = _mm256_shuffle_ps(complex1, complex2, 0x88);
/* 0xdd picks elements 1 and 3 of each source per 128-bit lane (odd-indexed floats). */
160 qValue = _mm256_shuffle_ps(complex1, complex2, 0xdd);
/* Aligned stores; both output addresses must be 32-byte aligned. */
162 _mm256_store_ps(iBufferPtr, iValue);
163 _mm256_store_ps(qBufferPtr, qValue);
/* Scalar tail for the num_points % 8 points the vector loop did not cover. */
169 number = eighthPoints * 8;
170 for (; number < num_points; number++) {
171 *iBufferPtr++ = *complexVectorPtr++;
172 *qBufferPtr++ = *complexVectorPtr++;
178#include <xmmintrin.h>
/*
 * SSE kernel using aligned loads and stores.
 * Each vector iteration reads 8 floats (two 128-bit loads) from complexVector
 * and writes the 4 even-indexed floats to iBuffer and the 4 odd-indexed floats
 * to qBuffer. A scalar loop handles the leftover points.
 * NOTE(review): the function's name line, its other parameter lines and the
 * output-pointer increments inside the vector loop are not visible in this
 * part of the file.
 */
183 unsigned int num_points)
/* View the complex input as plain floats. */
185 const float* complexVectorPtr = (
float*)complexVector;
186 float* iBufferPtr = iBuffer;
187 float* qBufferPtr = qBuffer;
189 unsigned int number = 0;
/* Number of full 4-point groups handled by the vector loop. */
190 const unsigned int quarterPoints = num_points / 4;
191 __m128 cplxValue1, cplxValue2, iValue, qValue;
192 for (; number < quarterPoints; number++) {
/* Two aligned 128-bit loads (16-byte alignment required) cover floats 0-3 and 4-7. */
193 cplxValue1 = _mm_load_ps(complexVectorPtr);
194 complexVectorPtr += 4;
196 cplxValue2 = _mm_load_ps(complexVectorPtr);
197 complexVectorPtr += 4;
/* Elements 0 and 2 of each input: the even-indexed floats, in order. */
200 iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2, 0, 2, 0));
/* Elements 1 and 3 of each input: the odd-indexed floats, in order. */
202 qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3, 1, 3, 1));
/* Aligned stores; both output addresses must be 16-byte aligned. */
204 _mm_store_ps(iBufferPtr, iValue);
205 _mm_store_ps(qBufferPtr, qValue);
/* Scalar tail for the num_points % 4 points the vector loop did not cover. */
211 number = quarterPoints * 4;
212 for (; number < num_points; number++) {
213 *iBufferPtr++ = *complexVectorPtr++;
214 *qBufferPtr++ = *complexVectorPtr++;
/*
 * NEON kernel processing 4 points per vector iteration.
 * vld2q_f32 reads 8 floats and splits them into two registers by alternating
 * element; val[0] is written to iBuffer and val[1] to qBuffer. A scalar loop
 * handles the leftover points.
 * NOTE(review): the function's name line, its other parameter lines and the
 * output-pointer increments inside the vector loop are not visible in this
 * part of the file.
 */
226 unsigned int num_points)
228 unsigned int number = 0;
/* Number of full 4-point groups handled by the vector loop. */
229 unsigned int quarter_points = num_points / 4;
/* View the complex input as plain floats. */
230 const float* complexVectorPtr = (
float*)complexVector;
231 float* iBufferPtr = iBuffer;
232 float* qBufferPtr = qBuffer;
233 float32x4x2_t complexInput;
235 for (number = 0; number < quarter_points; number++) {
/* De-interleaving load: val[0] gets floats 0,2,4,6 and val[1] gets floats 1,3,5,7. */
236 complexInput = vld2q_f32(complexVectorPtr);
237 vst1q_f32(iBufferPtr, complexInput.val[0]);
238 vst1q_f32(qBufferPtr, complexInput.val[1]);
239 complexVectorPtr += 8;
/* Scalar tail for the num_points % 4 points the vector loop did not cover. */
244 for (number = quarter_points * 4; number < num_points; number++) {
245 *iBufferPtr++ = *complexVectorPtr++;
246 *qBufferPtr++ = *complexVectorPtr++;
/*
 * NEON kernel processing 8 points per vector iteration.
 * Each iteration issues two de-interleaving loads; the val[0] halves are
 * written to iBuffer and the val[1] halves to qBuffer. A scalar loop handles
 * the leftover points.
 * NOTE(review): the remaining parameter lines and the output-pointer
 * increments inside the vector loop are not visible in this part of the file.
 */
254static inline void volk_32fc_deinterleave_32f_x2_neonv8(
float* iBuffer,
257 unsigned int num_points)
/* Number of full 8-point groups handled by the vector loop. */
259 const unsigned int eighthPoints = num_points / 8;
/* View the complex input as plain floats. */
260 const float* complexVectorPtr = (
float*)complexVector;
261 float* iBufferPtr = iBuffer;
262 float* qBufferPtr = qBuffer;
264 for (
unsigned int number = 0; number < eighthPoints; number++) {
/* Two independent de-interleaving loads cover floats 0-7 and 8-15. */
265 float32x4x2_t cplx0 = vld2q_f32(complexVectorPtr);
266 float32x4x2_t cplx1 = vld2q_f32(complexVectorPtr + 8);
/* val[0] holds the even-indexed floats, val[1] the odd-indexed floats. */
269 vst1q_f32(iBufferPtr, cplx0.val[0]);
270 vst1q_f32(iBufferPtr + 4, cplx1.val[0]);
271 vst1q_f32(qBufferPtr, cplx0.val[1]);
272 vst1q_f32(qBufferPtr + 4, cplx1.val[1]);
274 complexVectorPtr += 16;
/* Scalar tail for the num_points % 8 points the vector loop did not cover. */
279 for (
unsigned int number = eighthPoints * 8; number < num_points; number++) {
280 *iBufferPtr++ = *complexVectorPtr++;
281 *qBufferPtr++ = *complexVectorPtr++;
289#ifndef INCLUDED_volk_32fc_deinterleave_32f_x2_u_H
290#define INCLUDED_volk_32fc_deinterleave_32f_x2_u_H
295#ifdef LV_HAVE_AVX512F
296#include <immintrin.h>
/*
 * AVX-512F kernel using unaligned loads and stores.
 * Each vector iteration reads 16 floats from complexVector and writes the
 * even-indexed floats to iBuffer and the odd-indexed floats to qBuffer
 * (8 points per iteration).
 * NOTE(review): the remaining parameter lines, the declaration of cplxValue,
 * the second argument of each permute call and the output-pointer increments
 * are not visible in this part of the file.
 */
298static inline void volk_32fc_deinterleave_32f_x2_u_avx512f(
float* iBuffer,
301 unsigned int num_points)
/* View the complex input as plain floats. */
303 const float* complexVectorPtr = (
float*)complexVector;
304 float* iBufferPtr = iBuffer;
305 float* qBufferPtr = qBuffer;
307 unsigned int number = 0;
/* Number of full 8-point (16-float) groups handled by the vector loop. */
308 const unsigned int eighthPoints = num_points / 8;
311 __m512 iValue, qValue;
313 for (; number < eighthPoints; number++) {
/* Unaligned 512-bit load of 16 floats; no alignment requirement on the address. */
315 cplxValue = _mm512_loadu_ps(complexVectorPtr);
/*
 * The index vector gathers source floats 0,2,...,14 into the low 8 lanes.
 * The upper 8 indices are filler: only the low 256 bits are stored below.
 */
319 iValue = _mm512_permutexvar_ps(
320 _mm512_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14, 0, 0, 0, 0, 0, 0, 0, 0),
/* Same gather with the odd indices 1,3,...,15. */
324 qValue = _mm512_permutexvar_ps(
325 _mm512_setr_epi32(1, 3, 5, 7, 9, 11, 13, 15, 0, 0, 0, 0, 0, 0, 0, 0),
/* Unaligned stores of the low 8 floats of each result. */
329 _mm256_storeu_ps(iBufferPtr, _mm512_castps512_ps256(iValue));
330 _mm256_storeu_ps(qBufferPtr, _mm512_castps512_ps256(qValue));
332 complexVectorPtr += 16;
/*
 * Tail: the num_points - number leftover points are handed to another routine
 * at the current pointer positions.
 * NOTE(review): the callee name is not visible here; presumably the scalar
 * fallback kernel — confirm.
 */
337 number = eighthPoints * 8;
339 iBufferPtr, qBufferPtr, (
const lv_32fc_t*)complexVectorPtr, num_points - number);
344#include <immintrin.h>
/*
 * AVX kernel using unaligned loads and stores.
 * Each vector iteration reads 16 floats (two 256-bit loads) from complexVector
 * and writes the 8 even-indexed floats to iBuffer and the 8 odd-indexed floats
 * to qBuffer. A scalar loop handles the leftover points.
 * NOTE(review): the function's name line, its other parameter lines and the
 * output-pointer increments inside the vector loop are not visible in this
 * part of the file.
 */
348 unsigned int num_points)
/* View the complex input as plain floats. */
350 const float* complexVectorPtr = (
float*)complexVector;
351 float* iBufferPtr = iBuffer;
352 float* qBufferPtr = qBuffer;
354 unsigned int number = 0;
/* Number of full 8-point groups handled by the vector loop. */
356 const unsigned int eighthPoints = num_points / 8;
357 __m256 cplxValue1, cplxValue2, complex1, complex2, iValue, qValue;
358 for (; number < eighthPoints; number++) {
/* Two unaligned 256-bit loads cover floats 0-7 and 8-15. */
359 cplxValue1 = _mm256_loadu_ps(complexVectorPtr);
360 complexVectorPtr += 8;
362 cplxValue2 = _mm256_loadu_ps(complexVectorPtr);
363 complexVectorPtr += 8;
/*
 * Regroup 128-bit halves so the per-lane shuffle below yields in-order output:
 * 0x20 -> { low half of cplxValue1, low half of cplxValue2 }
 * 0x31 -> { high half of cplxValue1, high half of cplxValue2 }
 */
365 complex1 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x20);
366 complex2 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x31);
/* 0x88 picks elements 0 and 2 of each source per 128-bit lane (even-indexed floats). */
369 iValue = _mm256_shuffle_ps(complex1, complex2, 0x88);
/* 0xdd picks elements 1 and 3 of each source per 128-bit lane (odd-indexed floats). */
371 qValue = _mm256_shuffle_ps(complex1, complex2, 0xdd);
/* Unaligned stores; no alignment requirement on the output addresses. */
373 _mm256_storeu_ps(iBufferPtr, iValue);
374 _mm256_storeu_ps(qBufferPtr, qValue);
/* Scalar tail for the num_points % 8 points the vector loop did not cover. */
380 number = eighthPoints * 8;
381 for (; number < num_points; number++) {
382 *iBufferPtr++ = *complexVectorPtr++;
383 *qBufferPtr++ = *complexVectorPtr++;
389#include <riscv_vector.h>
/*
 * RISC-V Vector kernel using a wide load plus narrowing shifts.
 * Each input element is loaded as one 64-bit value; its low 32 bits are
 * written to iBuffer and its high 32 bits to qBuffer.
 * NOTE(review): the remaining parameter lines are not visible in this part of
 * the file.
 */
391static inline void volk_32fc_deinterleave_32f_x2_rvv(
float* iBuffer,
394 unsigned int num_points)
396 size_t n = num_points;
/* Strip-mined loop: each pass handles vl elements and advances all three pointers by vl. */
397 for (
size_t vl; n > 0; n -= vl, complexVector += vl, iBuffer += vl, qBuffer += vl) {
/* vl is chosen for 32-bit elements at LMUL=4, matching the 64-bit LMUL=8 load below. */
398 vl = __riscv_vsetvl_e32m4(n);
/* Load vl input elements as raw 64-bit values. */
399 vuint64m8_t vc = __riscv_vle64_v_u64m8((
const uint64_t*)complexVector, vl);
/* Narrowing shift right by 0 keeps the low 32 bits; by 32 keeps the high 32 bits. */
400 vuint32m4_t vr = __riscv_vnsrl(vc, 0, vl);
401 vuint32m4_t vi = __riscv_vnsrl(vc, 32, vl);
/* Outputs are stored as raw 32-bit words, so the float bit patterns pass through unchanged. */
402 __riscv_vse32((uint32_t*)iBuffer, vr, vl);
403 __riscv_vse32((uint32_t*)qBuffer, vi, vl);
409#include <riscv_vector.h>
/*
 * RISC-V Vector kernel using a two-field segment load.
 * The segment load splits the interleaved 32-bit words into two vector
 * fields; field 0 is written to iBuffer and field 1 to qBuffer.
 * NOTE(review): the remaining parameter lines and the declaration that
 * receives the segment-load result (`vc`) are not visible in this part of
 * the file.
 */
411static inline void volk_32fc_deinterleave_32f_x2_rvvseg(
float* iBuffer,
414 unsigned int num_points)
416 size_t n = num_points;
/* Strip-mined loop: each pass handles vl elements and advances all three pointers by vl. */
417 for (
size_t vl; n > 0; n -= vl, complexVector += vl, iBuffer += vl, qBuffer += vl) {
418 vl = __riscv_vsetvl_e32m4(n);
/* Segment load of vl two-word segments, read as raw 32-bit words. */
420 __riscv_vlseg2e32_v_u32m4x2((
const uint32_t*)complexVector, vl);
/* Field 0 is the first word of each segment, field 1 the second. */
421 vuint32m4_t vr = __riscv_vget_u32m4(vc, 0);
422 vuint32m4_t vi = __riscv_vget_u32m4(vc, 1);
/* Outputs are stored as raw 32-bit words, so the float bit patterns pass through unchanged. */
423 __riscv_vse32((uint32_t*)iBuffer, vr, vl);
424 __riscv_vse32((uint32_t*)qBuffer, vi, vl);