Vector Optimized Library of Kernels 3.3.0
Architecture-tuned implementations of math kernels
Loading...
Searching...
No Matches
volk_32fc_deinterleave_imag_32f.h
Go to the documentation of this file.
1/* -*- c++ -*- */
2/*
3 * Copyright 2012, 2014 Free Software Foundation, Inc.
4 *
5 * This file is part of VOLK
6 *
7 * SPDX-License-Identifier: LGPL-3.0-or-later
8 */
9
56
57#ifndef INCLUDED_volk_32fc_deinterleave_imag_32f_a_H
58#define INCLUDED_volk_32fc_deinterleave_imag_32f_a_H
59
60#include <inttypes.h>
61#include <stdio.h>
62
63#ifdef LV_HAVE_AVX
64#include <immintrin.h>
65
66static inline void volk_32fc_deinterleave_imag_32f_a_avx(float* qBuffer,
67 const lv_32fc_t* complexVector,
68 unsigned int num_points)
69{
70 unsigned int number = 0;
71 const unsigned int eighthPoints = num_points / 8;
72 const float* complexVectorPtr = (const float*)complexVector;
73 float* qBufferPtr = qBuffer;
74
75 __m256 cplxValue1, cplxValue2, complex1, complex2, qValue;
76 for (; number < eighthPoints; number++) {
77
78 cplxValue1 = _mm256_load_ps(complexVectorPtr);
79 complexVectorPtr += 8;
80
81 cplxValue2 = _mm256_load_ps(complexVectorPtr);
82 complexVectorPtr += 8;
83
84 complex1 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x20);
85 complex2 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x31);
86
87 // Arrange in q1q2q3q4 format
88 qValue = _mm256_shuffle_ps(complex1, complex2, 0xdd);
89
90 _mm256_store_ps(qBufferPtr, qValue);
91
92 qBufferPtr += 8;
93 }
94
95 number = eighthPoints * 8;
96 for (; number < num_points; number++) {
97 complexVectorPtr++;
98 *qBufferPtr++ = *complexVectorPtr++;
99 }
100}
101#endif /* LV_HAVE_AVX */
102
103#ifdef LV_HAVE_SSE
104#include <xmmintrin.h>
105
106static inline void volk_32fc_deinterleave_imag_32f_a_sse(float* qBuffer,
107 const lv_32fc_t* complexVector,
108 unsigned int num_points)
109{
110 unsigned int number = 0;
111 const unsigned int quarterPoints = num_points / 4;
112
113 const float* complexVectorPtr = (const float*)complexVector;
114 float* qBufferPtr = qBuffer;
115
116 __m128 cplxValue1, cplxValue2, iValue;
117 for (; number < quarterPoints; number++) {
118
119 cplxValue1 = _mm_load_ps(complexVectorPtr);
120 complexVectorPtr += 4;
121
122 cplxValue2 = _mm_load_ps(complexVectorPtr);
123 complexVectorPtr += 4;
124
125 // Arrange in q1q2q3q4 format
126 iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3, 1, 3, 1));
127
128 _mm_store_ps(qBufferPtr, iValue);
129
130 qBufferPtr += 4;
131 }
132
133 number = quarterPoints * 4;
134 for (; number < num_points; number++) {
135 complexVectorPtr++;
136 *qBufferPtr++ = *complexVectorPtr++;
137 }
138}
139#endif /* LV_HAVE_SSE */
140
141#ifdef LV_HAVE_NEON
142#include <arm_neon.h>
143
144static inline void volk_32fc_deinterleave_imag_32f_neon(float* qBuffer,
145 const lv_32fc_t* complexVector,
146 unsigned int num_points)
147{
148 unsigned int number = 0;
149 unsigned int quarter_points = num_points / 4;
150 const float* complexVectorPtr = (float*)complexVector;
151 float* qBufferPtr = qBuffer;
152 float32x4x2_t complexInput;
153
154 for (number = 0; number < quarter_points; number++) {
155 complexInput = vld2q_f32(complexVectorPtr);
156 vst1q_f32(qBufferPtr, complexInput.val[1]);
157 complexVectorPtr += 8;
158 qBufferPtr += 4;
159 }
160
161 for (number = quarter_points * 4; number < num_points; number++) {
162 complexVectorPtr++;
163 *qBufferPtr++ = *complexVectorPtr++;
164 }
165}
166#endif /* LV_HAVE_NEON */
167
168#ifdef LV_HAVE_NEONV8
169#include <arm_neon.h>
170
171static inline void volk_32fc_deinterleave_imag_32f_neonv8(float* qBuffer,
172 const lv_32fc_t* complexVector,
173 unsigned int num_points)
174{
175 const unsigned int eighthPoints = num_points / 8;
176 const float* complexVectorPtr = (float*)complexVector;
177 float* qBufferPtr = qBuffer;
178
179 for (unsigned int number = 0; number < eighthPoints; number++) {
180 float32x4x2_t cplx0 = vld2q_f32(complexVectorPtr);
181 float32x4x2_t cplx1 = vld2q_f32(complexVectorPtr + 8);
182 __VOLK_PREFETCH(complexVectorPtr + 32);
183
184 vst1q_f32(qBufferPtr, cplx0.val[1]);
185 vst1q_f32(qBufferPtr + 4, cplx1.val[1]);
186
187 complexVectorPtr += 16;
188 qBufferPtr += 8;
189 }
190
191 for (unsigned int number = eighthPoints * 8; number < num_points; number++) {
192 complexVectorPtr++;
193 *qBufferPtr++ = *complexVectorPtr++;
194 }
195}
196#endif /* LV_HAVE_NEONV8 */
197
198#ifdef LV_HAVE_GENERIC
199
200static inline void volk_32fc_deinterleave_imag_32f_generic(float* qBuffer,
201 const lv_32fc_t* complexVector,
202 unsigned int num_points)
203{
204 unsigned int number = 0;
205 const float* complexVectorPtr = (float*)complexVector;
206 float* qBufferPtr = qBuffer;
207 for (number = 0; number < num_points; number++) {
208 complexVectorPtr++;
209 *qBufferPtr++ = *complexVectorPtr++;
210 }
211}
212#endif /* LV_HAVE_GENERIC */
213
214
215#endif /* INCLUDED_volk_32fc_deinterleave_imag_32f_a_H */
216
217#ifndef INCLUDED_volk_32fc_deinterleave_imag_32f_u_H
218#define INCLUDED_volk_32fc_deinterleave_imag_32f_u_H
219
220#include <inttypes.h>
221#include <stdio.h>
222
223#ifdef LV_HAVE_AVX
224#include <immintrin.h>
225
226static inline void volk_32fc_deinterleave_imag_32f_u_avx(float* qBuffer,
227 const lv_32fc_t* complexVector,
228 unsigned int num_points)
229{
230 unsigned int number = 0;
231 const unsigned int eighthPoints = num_points / 8;
232 const float* complexVectorPtr = (const float*)complexVector;
233 float* qBufferPtr = qBuffer;
234
235 __m256 cplxValue1, cplxValue2, complex1, complex2, qValue;
236 for (; number < eighthPoints; number++) {
237
238 cplxValue1 = _mm256_loadu_ps(complexVectorPtr);
239 complexVectorPtr += 8;
240
241 cplxValue2 = _mm256_loadu_ps(complexVectorPtr);
242 complexVectorPtr += 8;
243
244 complex1 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x20);
245 complex2 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x31);
246
247 // Arrange in q1q2q3q4 format
248 qValue = _mm256_shuffle_ps(complex1, complex2, 0xdd);
249
250 _mm256_storeu_ps(qBufferPtr, qValue);
251
252 qBufferPtr += 8;
253 }
254
255 number = eighthPoints * 8;
256 for (; number < num_points; number++) {
257 complexVectorPtr++;
258 *qBufferPtr++ = *complexVectorPtr++;
259 }
260}
261#endif /* LV_HAVE_AVX */
262
263#ifdef LV_HAVE_RVV
264#include <riscv_vector.h>
265
266static inline void volk_32fc_deinterleave_imag_32f_rvv(float* qBuffer,
267 const lv_32fc_t* complexVector,
268 unsigned int num_points)
269{
270 const uint64_t* in = (const uint64_t*)complexVector;
271 size_t n = num_points;
272 for (size_t vl; n > 0; n -= vl, in += vl, qBuffer += vl) {
273 vl = __riscv_vsetvl_e64m8(n);
274 vuint64m8_t vc = __riscv_vle64_v_u64m8(in, vl);
275 __riscv_vse32((uint32_t*)qBuffer, __riscv_vnsrl(vc, 32, vl), vl);
276 }
277}
278#endif /*LV_HAVE_RVV*/
279
280#endif /* INCLUDED_volk_32fc_deinterleave_imag_32f_u_H */