Vector Optimized Library of Kernels 3.3.0
Architecture-tuned implementations of math kernels
Loading...
Searching...
No Matches
volk_32f_x2_interleave_32fc.h
Go to the documentation of this file.
1/* -*- c++ -*- */
2/*
3 * Copyright 2012, 2014 Free Software Foundation, Inc.
4 *
5 * This file is part of VOLK
6 *
7 * SPDX-License-Identifier: LGPL-3.0-or-later
8 */
9
59
60#ifndef INCLUDED_volk_32f_x2_interleave_32fc_a_H
61#define INCLUDED_volk_32f_x2_interleave_32fc_a_H
62
63#include <inttypes.h>
64#include <stdio.h>
65
66#ifdef LV_HAVE_AVX
67#include <immintrin.h>
68
69static inline void volk_32f_x2_interleave_32fc_a_avx(lv_32fc_t* complexVector,
70 const float* iBuffer,
71 const float* qBuffer,
72 unsigned int num_points)
73{
74 unsigned int number = 0;
75 float* complexVectorPtr = (float*)complexVector;
76 const float* iBufferPtr = iBuffer;
77 const float* qBufferPtr = qBuffer;
78
79 const uint64_t eighthPoints = num_points / 8;
80
81 __m256 iValue, qValue, cplxValue1, cplxValue2, cplxValue;
82 for (; number < eighthPoints; number++) {
83 iValue = _mm256_load_ps(iBufferPtr);
84 qValue = _mm256_load_ps(qBufferPtr);
85
86 // Interleaves the lower two values in the i and q variables into one buffer
87 cplxValue1 = _mm256_unpacklo_ps(iValue, qValue);
88 // Interleaves the upper two values in the i and q variables into one buffer
89 cplxValue2 = _mm256_unpackhi_ps(iValue, qValue);
90
91 cplxValue = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x20);
92 _mm256_store_ps(complexVectorPtr, cplxValue);
93 complexVectorPtr += 8;
94
95 cplxValue = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x31);
96 _mm256_store_ps(complexVectorPtr, cplxValue);
97 complexVectorPtr += 8;
98
99 iBufferPtr += 8;
100 qBufferPtr += 8;
101 }
102
103 number = eighthPoints * 8;
104 for (; number < num_points; number++) {
105 *complexVectorPtr++ = *iBufferPtr++;
106 *complexVectorPtr++ = *qBufferPtr++;
107 }
108}
109
110#endif /* LV_HAVE_AVX */
111
112#ifdef LV_HAVE_SSE
113#include <xmmintrin.h>
114
115static inline void volk_32f_x2_interleave_32fc_a_sse(lv_32fc_t* complexVector,
116 const float* iBuffer,
117 const float* qBuffer,
118 unsigned int num_points)
119{
120 unsigned int number = 0;
121 float* complexVectorPtr = (float*)complexVector;
122 const float* iBufferPtr = iBuffer;
123 const float* qBufferPtr = qBuffer;
124
125 const uint64_t quarterPoints = num_points / 4;
126
127 __m128 iValue, qValue, cplxValue;
128 for (; number < quarterPoints; number++) {
129 iValue = _mm_load_ps(iBufferPtr);
130 qValue = _mm_load_ps(qBufferPtr);
131
132 // Interleaves the lower two values in the i and q variables into one buffer
133 cplxValue = _mm_unpacklo_ps(iValue, qValue);
134 _mm_store_ps(complexVectorPtr, cplxValue);
135 complexVectorPtr += 4;
136
137 // Interleaves the upper two values in the i and q variables into one buffer
138 cplxValue = _mm_unpackhi_ps(iValue, qValue);
139 _mm_store_ps(complexVectorPtr, cplxValue);
140 complexVectorPtr += 4;
141
142 iBufferPtr += 4;
143 qBufferPtr += 4;
144 }
145
146 number = quarterPoints * 4;
147 for (; number < num_points; number++) {
148 *complexVectorPtr++ = *iBufferPtr++;
149 *complexVectorPtr++ = *qBufferPtr++;
150 }
151}
152#endif /* LV_HAVE_SSE */
153
154
155#ifdef LV_HAVE_NEON
156#include <arm_neon.h>
157
/*
 * Interleave two float buffers into one complex buffer (I, Q, I, Q, ...)
 * using NEON, four points per iteration.
 *
 * complexVector: output, receives num_points interleaved I/Q pairs
 * iBuffer:       input, written to the first float of each pair
 * qBuffer:       input, written to the second float of each pair
 * num_points:    number of I/Q pairs to produce
 */
static inline void volk_32f_x2_interleave_32fc_neon(lv_32fc_t* complexVector,
                                                    const float* iBuffer,
                                                    const float* qBuffer,
                                                    unsigned int num_points)
{
    float* out = (float*)complexVector;
    const float* inI = iBuffer;
    const float* inQ = qBuffer;

    const unsigned int vectorIterations = num_points / 4;
    unsigned int idx;

    for (idx = 0; idx < vectorIterations; ++idx) {
        float32x4x2_t iqPair;
        iqPair.val[0] = vld1q_f32(inI);
        iqPair.val[1] = vld1q_f32(inQ);

        // vst2q_f32 is a 2-way interleaving store: it alternates elements of
        // val[0] and val[1] in memory, so no explicit shuffle is needed.
        vst2q_f32(out, iqPair);

        inI += 4;
        inQ += 4;
        out += 8;
    }

    // Scalar tail for the points that do not fill a whole 4-wide vector.
    for (idx = vectorIterations * 4; idx < num_points; ++idx) {
        *out++ = *inI++;
        *out++ = *inQ++;
    }
}
182#endif /* LV_HAVE_NEON */
183
184#ifdef LV_HAVE_NEONV8
185#include <arm_neon.h>
186
/*
 * Interleave two float buffers into one complex buffer (I, Q, I, Q, ...)
 * using NEON, eight points (two 4-wide vectors) per iteration.
 *
 * complexVector: output, receives num_points interleaved I/Q pairs
 * iBuffer:       input, written to the first float of each pair
 * qBuffer:       input, written to the second float of each pair
 * num_points:    number of I/Q pairs to produce
 */
static inline void volk_32f_x2_interleave_32fc_neonv8(lv_32fc_t* complexVector,
                                                      const float* iBuffer,
                                                      const float* qBuffer,
                                                      unsigned int num_points)
{
    float* out = (float*)complexVector;
    const float* inI = iBuffer;
    const float* inQ = qBuffer;

    unsigned int blocksLeft = num_points / 8;
    unsigned int tailLeft = num_points - blocksLeft * 8;

    while (blocksLeft > 0) {
        float32x4x2_t firstHalf;
        float32x4x2_t secondHalf;

        firstHalf.val[0] = vld1q_f32(inI);
        firstHalf.val[1] = vld1q_f32(inQ);
        secondHalf.val[0] = vld1q_f32(inI + 4);
        secondHalf.val[1] = vld1q_f32(inQ + 4);

        // Prefetch hint for input 16 floats ahead of the current position.
        __VOLK_PREFETCH(inI + 16);
        __VOLK_PREFETCH(inQ + 16);

        // Each vst2q_f32 writes four I/Q pairs, alternating val[0] and val[1].
        vst2q_f32(out, firstHalf);
        vst2q_f32(out + 8, secondHalf);

        inI += 8;
        inQ += 8;
        out += 16;
        blocksLeft--;
    }

    // Scalar tail for the points that do not fill a whole 8-point block.
    while (tailLeft > 0) {
        *out++ = *inI++;
        *out++ = *inQ++;
        tailLeft--;
    }
}
220#endif /* LV_HAVE_NEONV8 */
221
222
223#ifdef LV_HAVE_GENERIC
224
225static inline void volk_32f_x2_interleave_32fc_generic(lv_32fc_t* complexVector,
226 const float* iBuffer,
227 const float* qBuffer,
228 unsigned int num_points)
229{
230 float* complexVectorPtr = (float*)complexVector;
231 const float* iBufferPtr = iBuffer;
232 const float* qBufferPtr = qBuffer;
233 unsigned int number;
234
235 for (number = 0; number < num_points; number++) {
236 *complexVectorPtr++ = *iBufferPtr++;
237 *complexVectorPtr++ = *qBufferPtr++;
238 }
239}
240#endif /* LV_HAVE_GENERIC */
241
242
243#endif /* INCLUDED_volk_32f_x2_interleave_32fc_a_H */
244
245#ifndef INCLUDED_volk_32f_x2_interleave_32fc_u_H
246#define INCLUDED_volk_32f_x2_interleave_32fc_u_H
247
248#include <inttypes.h>
249#include <stdio.h>
250
251#ifdef LV_HAVE_AVX
252#include <immintrin.h>
253
254static inline void volk_32f_x2_interleave_32fc_u_avx(lv_32fc_t* complexVector,
255 const float* iBuffer,
256 const float* qBuffer,
257 unsigned int num_points)
258{
259 unsigned int number = 0;
260 float* complexVectorPtr = (float*)complexVector;
261 const float* iBufferPtr = iBuffer;
262 const float* qBufferPtr = qBuffer;
263
264 const uint64_t eighthPoints = num_points / 8;
265
266 __m256 iValue, qValue, cplxValue1, cplxValue2, cplxValue;
267 for (; number < eighthPoints; number++) {
268 iValue = _mm256_loadu_ps(iBufferPtr);
269 qValue = _mm256_loadu_ps(qBufferPtr);
270
271 // Interleaves the lower two values in the i and q variables into one buffer
272 cplxValue1 = _mm256_unpacklo_ps(iValue, qValue);
273 // Interleaves the upper two values in the i and q variables into one buffer
274 cplxValue2 = _mm256_unpackhi_ps(iValue, qValue);
275
276 cplxValue = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x20);
277 _mm256_storeu_ps(complexVectorPtr, cplxValue);
278 complexVectorPtr += 8;
279
280 cplxValue = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x31);
281 _mm256_storeu_ps(complexVectorPtr, cplxValue);
282 complexVectorPtr += 8;
283
284 iBufferPtr += 8;
285 qBufferPtr += 8;
286 }
287
288 number = eighthPoints * 8;
289 for (; number < num_points; number++) {
290 *complexVectorPtr++ = *iBufferPtr++;
291 *complexVectorPtr++ = *qBufferPtr++;
292 }
293}
294#endif /* LV_HAVE_AVX */
295
296#ifdef LV_HAVE_RVV
297#include <riscv_vector.h>
298
/*
 * Interleave two float buffers into one complex buffer using RVV without
 * segment stores: each I/Q pair is assembled as a single 64-bit integer.
 *
 * complexVector: output, receives num_points interleaved I/Q pairs
 * iBuffer:       input, placed in the low 32 bits of each 64-bit element
 * qBuffer:       input, placed in the high 32 bits of each 64-bit element
 * num_points:    number of I/Q pairs to produce
 *
 * The float samples are only moved as raw 32-bit patterns; no floating-point
 * arithmetic is performed on them.
 */
static inline void volk_32f_x2_interleave_32fc_rvv(lv_32fc_t* complexVector,
                                                   const float* iBuffer,
                                                   const float* qBuffer,
                                                   unsigned int num_points)
{
    uint64_t* out = (uint64_t*)complexVector;
    size_t remaining = num_points;

    while (remaining > 0) {
        // vsetvl returns how many elements this pass handles, so no scalar
        // tail loop is needed.
        const size_t vl = __riscv_vsetvl_e32m4(remaining);

        vuint32m4_t iBits = __riscv_vle32_v_u32m4((const uint32_t*)iBuffer, vl);
        vuint32m4_t qBits = __riscv_vle32_v_u32m4((const uint32_t*)qBuffer, vl);

        // Widening add: i + q as 64-bit values.
        vuint64m8_t widened = __riscv_vwaddu_vv(iBits, qBits, vl);
        // Widening multiply-accumulate adds 0xFFFFFFFF * q, for a total of
        // i + q * 2^32, i.e. i in the low word and q in the high word.
        // On a little-endian target that is stored as I followed by Q.
        vuint64m8_t packed = __riscv_vwmaccu(widened, 0xFFFFFFFF, qBits, vl);

        __riscv_vse64(out, packed, vl);

        remaining -= vl;
        out += vl;
        iBuffer += vl;
        qBuffer += vl;
    }
}
315#endif /*LV_HAVE_RVV*/
316
317#ifdef LV_HAVE_RVVSEG
318#include <riscv_vector.h>
319
/*
 * Interleave two float buffers into one complex buffer using an RVV
 * two-field segment store.
 *
 * complexVector: output, receives num_points interleaved I/Q pairs
 * iBuffer:       input, written as the first field of each segment
 * qBuffer:       input, written as the second field of each segment
 * num_points:    number of I/Q pairs to produce
 */
static inline void volk_32f_x2_interleave_32fc_rvvseg(lv_32fc_t* complexVector,
                                                      const float* iBuffer,
                                                      const float* qBuffer,
                                                      unsigned int num_points)
{
    size_t remaining = num_points;

    while (remaining > 0) {
        // vsetvl returns how many elements this pass handles, so no scalar
        // tail loop is needed.
        const size_t vl = __riscv_vsetvl_e32m4(remaining);

        vfloat32m4_t iVals = __riscv_vle32_v_f32m4(iBuffer, vl);
        vfloat32m4_t qVals = __riscv_vle32_v_f32m4(qBuffer, vl);

        // Bundle both vectors into a tuple; the segment store then writes
        // the two fields alternately to memory.
        vfloat32m4x2_t iqTuple = __riscv_vcreate_v_f32m4x2(iVals, qVals);
        __riscv_vsseg2e32((float*)complexVector, iqTuple, vl);

        remaining -= vl;
        complexVector += vl;
        iBuffer += vl;
        qBuffer += vl;
    }
}
333#endif /*LV_HAVE_RVVSEG*/
334
335#endif /* INCLUDED_volk_32f_x2_interleave_32fc_u_H */