Vector Optimized Library of Kernels 3.3.0
Architecture-tuned implementations of math kernels
Loading...
Searching...
No Matches
volk_32fc_deinterleave_32f_x2.h
Go to the documentation of this file.
1/* -*- c++ -*- */
2/*
3 * Copyright 2012, 2014 Free Software Foundation, Inc.
4 *
5 * This file is part of VOLK
6 *
7 * SPDX-License-Identifier: LGPL-3.0-or-later
8 */
9
59
60#ifndef INCLUDED_volk_32fc_deinterleave_32f_x2_a_H
61#define INCLUDED_volk_32fc_deinterleave_32f_x2_a_H
62
63#include <inttypes.h>
64#include <stdio.h>
65
66#ifdef LV_HAVE_GENERIC
67
68static inline void volk_32fc_deinterleave_32f_x2_generic(float* iBuffer,
69 float* qBuffer,
70 const lv_32fc_t* complexVector,
71 unsigned int num_points)
72{
73 const float* complexVectorPtr = (float*)complexVector;
74 float* iBufferPtr = iBuffer;
75 float* qBufferPtr = qBuffer;
76 unsigned int number;
77 for (number = 0; number < num_points; number++) {
78 *iBufferPtr++ = *complexVectorPtr++;
79 *qBufferPtr++ = *complexVectorPtr++;
80 }
81}
82#endif /* LV_HAVE_GENERIC */
83
84#ifdef LV_HAVE_AVX512F
85#include <immintrin.h>
86
87static inline void volk_32fc_deinterleave_32f_x2_a_avx512f(float* iBuffer,
88 float* qBuffer,
89 const lv_32fc_t* complexVector,
90 unsigned int num_points)
91{
92 const float* complexVectorPtr = (float*)complexVector;
93 float* iBufferPtr = iBuffer;
94 float* qBufferPtr = qBuffer;
95
96 unsigned int number = 0;
97 const unsigned int eighthPoints = num_points / 8;
98
99 __m512 cplxValue;
100 __m512 iValue, qValue;
101
102 for (; number < eighthPoints; number++) {
103 // Load 8 complex numbers (16 floats): I0,Q0,I1,Q1,...,I7,Q7
104 cplxValue = _mm512_load_ps(complexVectorPtr);
105
106 // Deinterleave using permute
107 // Extract all I values (even indices: 0,2,4,6,8,10,12,14)
108 iValue = _mm512_permutexvar_ps(
109 _mm512_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14, 0, 0, 0, 0, 0, 0, 0, 0),
110 cplxValue);
111
112 // Extract all Q values (odd indices: 1,3,5,7,9,11,13,15)
113 qValue = _mm512_permutexvar_ps(
114 _mm512_setr_epi32(1, 3, 5, 7, 9, 11, 13, 15, 0, 0, 0, 0, 0, 0, 0, 0),
115 cplxValue);
116
117 // Store only the first 8 results (lower 256 bits)
118 _mm256_store_ps(iBufferPtr, _mm512_castps512_ps256(iValue));
119 _mm256_store_ps(qBufferPtr, _mm512_castps512_ps256(qValue));
120
121 complexVectorPtr += 16;
122 iBufferPtr += 8;
123 qBufferPtr += 8;
124 }
125
126 number = eighthPoints * 8;
128 iBufferPtr, qBufferPtr, (const lv_32fc_t*)complexVectorPtr, num_points - number);
129}
130#endif /* LV_HAVE_AVX512F */
131
132#ifdef LV_HAVE_AVX
133#include <immintrin.h>
134static inline void volk_32fc_deinterleave_32f_x2_a_avx(float* iBuffer,
135 float* qBuffer,
136 const lv_32fc_t* complexVector,
137 unsigned int num_points)
138{
139 const float* complexVectorPtr = (float*)complexVector;
140 float* iBufferPtr = iBuffer;
141 float* qBufferPtr = qBuffer;
142
143 unsigned int number = 0;
144 // Mask for real and imaginary parts
145 const unsigned int eighthPoints = num_points / 8;
146 __m256 cplxValue1, cplxValue2, complex1, complex2, iValue, qValue;
147 for (; number < eighthPoints; number++) {
148 cplxValue1 = _mm256_load_ps(complexVectorPtr);
149 complexVectorPtr += 8;
150
151 cplxValue2 = _mm256_load_ps(complexVectorPtr);
152 complexVectorPtr += 8;
153
154 complex1 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x20);
155 complex2 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x31);
156
157 // Arrange in i1i2i3i4 format
158 iValue = _mm256_shuffle_ps(complex1, complex2, 0x88);
159 // Arrange in q1q2q3q4 format
160 qValue = _mm256_shuffle_ps(complex1, complex2, 0xdd);
161
162 _mm256_store_ps(iBufferPtr, iValue);
163 _mm256_store_ps(qBufferPtr, qValue);
164
165 iBufferPtr += 8;
166 qBufferPtr += 8;
167 }
168
169 number = eighthPoints * 8;
170 for (; number < num_points; number++) {
171 *iBufferPtr++ = *complexVectorPtr++;
172 *qBufferPtr++ = *complexVectorPtr++;
173 }
174}
175#endif /* LV_HAVE_AVX */
176
177#ifdef LV_HAVE_SSE
178#include <xmmintrin.h>
179
180static inline void volk_32fc_deinterleave_32f_x2_a_sse(float* iBuffer,
181 float* qBuffer,
182 const lv_32fc_t* complexVector,
183 unsigned int num_points)
184{
185 const float* complexVectorPtr = (float*)complexVector;
186 float* iBufferPtr = iBuffer;
187 float* qBufferPtr = qBuffer;
188
189 unsigned int number = 0;
190 const unsigned int quarterPoints = num_points / 4;
191 __m128 cplxValue1, cplxValue2, iValue, qValue;
192 for (; number < quarterPoints; number++) {
193 cplxValue1 = _mm_load_ps(complexVectorPtr);
194 complexVectorPtr += 4;
195
196 cplxValue2 = _mm_load_ps(complexVectorPtr);
197 complexVectorPtr += 4;
198
199 // Arrange in i1i2i3i4 format
200 iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2, 0, 2, 0));
201 // Arrange in q1q2q3q4 format
202 qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3, 1, 3, 1));
203
204 _mm_store_ps(iBufferPtr, iValue);
205 _mm_store_ps(qBufferPtr, qValue);
206
207 iBufferPtr += 4;
208 qBufferPtr += 4;
209 }
210
211 number = quarterPoints * 4;
212 for (; number < num_points; number++) {
213 *iBufferPtr++ = *complexVectorPtr++;
214 *qBufferPtr++ = *complexVectorPtr++;
215 }
216}
217#endif /* LV_HAVE_SSE */
218
219
220#ifdef LV_HAVE_NEON
221#include <arm_neon.h>
222
/*
 * NEON deinterleave.
 *
 * Uses the structure load vld2q_f32, which splits 8 consecutive floats into
 * two 4-float registers (even positions / odd positions), so 4 input pairs
 * are handled per iteration. Any remaining num_points % 4 pairs are copied
 * one at a time.
 *
 * iBuffer       - destination for the first float of each pair
 * qBuffer       - destination for the second float of each pair
 * complexVector - input, read as 2 * num_points consecutive floats
 * num_points    - number of float pairs to split
 */
static inline void volk_32fc_deinterleave_32f_x2_neon(float* iBuffer,
                                                      float* qBuffer,
                                                      const lv_32fc_t* complexVector,
                                                      unsigned int num_points)
{
    const float* in = (const float*)complexVector;
    float* iOut = iBuffer;
    float* qOut = qBuffer;
    const unsigned int blocks = num_points / 4;

    for (unsigned int blk = 0; blk < blocks; ++blk) {
        // val[0] receives floats 0,2,4,6 and val[1] receives floats 1,3,5,7.
        const float32x4x2_t split = vld2q_f32(in);
        vst1q_f32(iOut, split.val[0]);
        vst1q_f32(qOut, split.val[1]);

        in += 8;
        iOut += 4;
        qOut += 4;
    }

    // Scalar tail for the pairs that do not fill a whole block.
    for (unsigned int left = num_points - blocks * 4; left != 0; --left) {
        *iOut++ = in[0];
        *qOut++ = in[1];
        in += 2;
    }
}
249#endif /* LV_HAVE_NEON */
250
251#ifdef LV_HAVE_NEONV8
252#include <arm_neon.h>
253
/*
 * NEONv8 deinterleave, two structure loads per iteration.
 *
 * Each iteration issues two vld2q_f32 loads (8 input pairs, 16 floats) and a
 * prefetch hint 32 floats ahead of the current read position. Any remaining
 * num_points % 8 pairs are copied one at a time.
 *
 * iBuffer       - destination for the first float of each pair
 * qBuffer       - destination for the second float of each pair
 * complexVector - input, read as 2 * num_points consecutive floats
 * num_points    - number of float pairs to split
 */
static inline void volk_32fc_deinterleave_32f_x2_neonv8(float* iBuffer,
                                                        float* qBuffer,
                                                        const lv_32fc_t* complexVector,
                                                        unsigned int num_points)
{
    const float* in = (const float*)complexVector;
    float* iOut = iBuffer;
    float* qOut = qBuffer;
    const unsigned int blocks = num_points / 8;

    for (unsigned int blk = 0; blk < blocks; ++blk) {
        // Each load splits 8 floats into even-position / odd-position registers.
        const float32x4x2_t head = vld2q_f32(in);
        const float32x4x2_t tail = vld2q_f32(in + 8);
        __VOLK_PREFETCH(in + 32);

        vst1q_f32(iOut, head.val[0]);
        vst1q_f32(iOut + 4, tail.val[0]);
        vst1q_f32(qOut, head.val[1]);
        vst1q_f32(qOut + 4, tail.val[1]);

        in += 16;
        iOut += 8;
        qOut += 8;
    }

    // Scalar tail for the pairs that do not fill a whole block.
    for (unsigned int left = num_points - blocks * 8; left != 0; --left) {
        *iOut++ = in[0];
        *qOut++ = in[1];
        in += 2;
    }
}
284#endif /* LV_HAVE_NEONV8 */
285
286#endif /* INCLUDED_volk_32fc_deinterleave_32f_x2_a_H */
287
288
289#ifndef INCLUDED_volk_32fc_deinterleave_32f_x2_u_H
290#define INCLUDED_volk_32fc_deinterleave_32f_x2_u_H
291
292#include <inttypes.h>
293#include <stdio.h>
294
295#ifdef LV_HAVE_AVX512F
296#include <immintrin.h>
297
298static inline void volk_32fc_deinterleave_32f_x2_u_avx512f(float* iBuffer,
299 float* qBuffer,
300 const lv_32fc_t* complexVector,
301 unsigned int num_points)
302{
303 const float* complexVectorPtr = (float*)complexVector;
304 float* iBufferPtr = iBuffer;
305 float* qBufferPtr = qBuffer;
306
307 unsigned int number = 0;
308 const unsigned int eighthPoints = num_points / 8;
309
310 __m512 cplxValue;
311 __m512 iValue, qValue;
312
313 for (; number < eighthPoints; number++) {
314 // Load 8 complex numbers (16 floats): I0,Q0,I1,Q1,...,I7,Q7 - unaligned
315 cplxValue = _mm512_loadu_ps(complexVectorPtr);
316
317 // Deinterleave using permute
318 // Extract all I values (even indices: 0,2,4,6,8,10,12,14)
319 iValue = _mm512_permutexvar_ps(
320 _mm512_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14, 0, 0, 0, 0, 0, 0, 0, 0),
321 cplxValue);
322
323 // Extract all Q values (odd indices: 1,3,5,7,9,11,13,15)
324 qValue = _mm512_permutexvar_ps(
325 _mm512_setr_epi32(1, 3, 5, 7, 9, 11, 13, 15, 0, 0, 0, 0, 0, 0, 0, 0),
326 cplxValue);
327
328 // Store only the first 8 results (lower 256 bits) - unaligned
329 _mm256_storeu_ps(iBufferPtr, _mm512_castps512_ps256(iValue));
330 _mm256_storeu_ps(qBufferPtr, _mm512_castps512_ps256(qValue));
331
332 complexVectorPtr += 16;
333 iBufferPtr += 8;
334 qBufferPtr += 8;
335 }
336
337 number = eighthPoints * 8;
339 iBufferPtr, qBufferPtr, (const lv_32fc_t*)complexVectorPtr, num_points - number);
340}
341#endif /* LV_HAVE_AVX512F */
342
343#ifdef LV_HAVE_AVX
344#include <immintrin.h>
345static inline void volk_32fc_deinterleave_32f_x2_u_avx(float* iBuffer,
346 float* qBuffer,
347 const lv_32fc_t* complexVector,
348 unsigned int num_points)
349{
350 const float* complexVectorPtr = (float*)complexVector;
351 float* iBufferPtr = iBuffer;
352 float* qBufferPtr = qBuffer;
353
354 unsigned int number = 0;
355 // Mask for real and imaginary parts
356 const unsigned int eighthPoints = num_points / 8;
357 __m256 cplxValue1, cplxValue2, complex1, complex2, iValue, qValue;
358 for (; number < eighthPoints; number++) {
359 cplxValue1 = _mm256_loadu_ps(complexVectorPtr);
360 complexVectorPtr += 8;
361
362 cplxValue2 = _mm256_loadu_ps(complexVectorPtr);
363 complexVectorPtr += 8;
364
365 complex1 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x20);
366 complex2 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x31);
367
368 // Arrange in i1i2i3i4 format
369 iValue = _mm256_shuffle_ps(complex1, complex2, 0x88);
370 // Arrange in q1q2q3q4 format
371 qValue = _mm256_shuffle_ps(complex1, complex2, 0xdd);
372
373 _mm256_storeu_ps(iBufferPtr, iValue);
374 _mm256_storeu_ps(qBufferPtr, qValue);
375
376 iBufferPtr += 8;
377 qBufferPtr += 8;
378 }
379
380 number = eighthPoints * 8;
381 for (; number < num_points; number++) {
382 *iBufferPtr++ = *complexVectorPtr++;
383 *qBufferPtr++ = *complexVectorPtr++;
384 }
385}
386#endif /* LV_HAVE_AVX */
387
388#ifdef LV_HAVE_RVV
389#include <riscv_vector.h>
390
/*
 * RISC-V Vector deinterleave using narrowing shifts.
 *
 * Each input element is loaded as one 64-bit integer. A narrowing right shift
 * by 0 keeps its low 32 bits (stored to iBuffer) and a narrowing right shift
 * by 32 keeps its high 32 bits (stored to qBuffer). The bits are moved as
 * integers and never interpreted as floating-point values.
 *
 * iBuffer       - destination for the low 32 bits of each 64-bit element
 * qBuffer       - destination for the high 32 bits of each 64-bit element
 * complexVector - input, read as num_points 64-bit elements
 * num_points    - number of elements to split
 */
static inline void volk_32fc_deinterleave_32f_x2_rvv(float* iBuffer,
                                                     float* qBuffer,
                                                     const lv_32fc_t* complexVector,
                                                     unsigned int num_points)
{
    size_t remaining = num_points;

    while (remaining > 0) {
        // Let the hardware pick how many elements this pass handles.
        const size_t vl = __riscv_vsetvl_e32m4(remaining);

        const vuint64m8_t packed =
            __riscv_vle64_v_u64m8((const uint64_t*)complexVector, vl);
        const vuint32m4_t lowWords = __riscv_vnsrl(packed, 0, vl);
        const vuint32m4_t highWords = __riscv_vnsrl(packed, 32, vl);

        __riscv_vse32((uint32_t*)iBuffer, lowWords, vl);
        __riscv_vse32((uint32_t*)qBuffer, highWords, vl);

        remaining -= vl;
        complexVector += vl;
        iBuffer += vl;
        qBuffer += vl;
    }
}
406#endif /*LV_HAVE_RVV*/
407
408#ifdef LV_HAVE_RVVSEG
409#include <riscv_vector.h>
410
/*
 * RISC-V Vector deinterleave using a two-field segment load.
 *
 * The segment load reads the input as consecutive 32-bit word pairs and
 * places the first word of every pair in field 0 and the second in field 1.
 * Field 0 is stored to iBuffer and field 1 to qBuffer. The bits are moved as
 * integers and never interpreted as floating-point values.
 *
 * iBuffer       - destination for the first 32-bit word of each pair
 * qBuffer       - destination for the second 32-bit word of each pair
 * complexVector - input, read as 2 * num_points consecutive 32-bit words
 * num_points    - number of pairs to split
 */
static inline void volk_32fc_deinterleave_32f_x2_rvvseg(float* iBuffer,
                                                        float* qBuffer,
                                                        const lv_32fc_t* complexVector,
                                                        unsigned int num_points)
{
    size_t remaining = num_points;

    while (remaining > 0) {
        // Let the hardware pick how many pairs this pass handles.
        const size_t vl = __riscv_vsetvl_e32m4(remaining);

        const vuint32m4x2_t fields =
            __riscv_vlseg2e32_v_u32m4x2((const uint32_t*)complexVector, vl);

        __riscv_vse32((uint32_t*)iBuffer, __riscv_vget_u32m4(fields, 0), vl);
        __riscv_vse32((uint32_t*)qBuffer, __riscv_vget_u32m4(fields, 1), vl);

        remaining -= vl;
        complexVector += vl;
        iBuffer += vl;
        qBuffer += vl;
    }
}
427#endif /*LV_HAVE_RVVSEG*/
428
429#endif /* INCLUDED_volk_32fc_deinterleave_32f_x2_u_H */