Vector Optimized Library of Kernels 3.3.0
Architecture-tuned implementations of math kernels
Loading...
Searching...
No Matches
volk_32fc_deinterleave_real_32f.h
Go to the documentation of this file.
1/* -*- c++ -*- */
2/*
3 * Copyright 2012, 2014 Free Software Foundation, Inc.
4 *
5 * This file is part of VOLK
6 *
7 * SPDX-License-Identifier: LGPL-3.0-or-later
8 */
9
56
57#ifndef INCLUDED_volk_32fc_deinterleave_real_32f_a_H
58#define INCLUDED_volk_32fc_deinterleave_real_32f_a_H
59
60#include <inttypes.h>
61#include <stdio.h>
62
63#ifdef LV_HAVE_AVX2
64#include <immintrin.h>
65
66static inline void volk_32fc_deinterleave_real_32f_a_avx2(float* iBuffer,
67 const lv_32fc_t* complexVector,
68 unsigned int num_points)
69{
70 unsigned int number = 0;
71 const unsigned int eighthPoints = num_points / 8;
72
73 const float* complexVectorPtr = (const float*)complexVector;
74 float* iBufferPtr = iBuffer;
75
76 __m256 cplxValue1, cplxValue2;
77 __m256 iValue;
78 __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
79 for (; number < eighthPoints; number++) {
80
81 cplxValue1 = _mm256_load_ps(complexVectorPtr);
82 complexVectorPtr += 8;
83
84 cplxValue2 = _mm256_load_ps(complexVectorPtr);
85 complexVectorPtr += 8;
86
87 // Arrange in i1i2i3i4 format
88 iValue = _mm256_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2, 0, 2, 0));
89 iValue = _mm256_permutevar8x32_ps(iValue, idx);
90
91 _mm256_store_ps(iBufferPtr, iValue);
92
93 iBufferPtr += 8;
94 }
95
96 number = eighthPoints * 8;
97 for (; number < num_points; number++) {
98 *iBufferPtr++ = *complexVectorPtr++;
99 complexVectorPtr++;
100 }
101}
102#endif /* LV_HAVE_AVX2 */
103
104#ifdef LV_HAVE_SSE
105#include <xmmintrin.h>
106
107static inline void volk_32fc_deinterleave_real_32f_a_sse(float* iBuffer,
108 const lv_32fc_t* complexVector,
109 unsigned int num_points)
110{
111 unsigned int number = 0;
112 const unsigned int quarterPoints = num_points / 4;
113
114 const float* complexVectorPtr = (const float*)complexVector;
115 float* iBufferPtr = iBuffer;
116
117 __m128 cplxValue1, cplxValue2, iValue;
118 for (; number < quarterPoints; number++) {
119
120 cplxValue1 = _mm_load_ps(complexVectorPtr);
121 complexVectorPtr += 4;
122
123 cplxValue2 = _mm_load_ps(complexVectorPtr);
124 complexVectorPtr += 4;
125
126 // Arrange in i1i2i3i4 format
127 iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2, 0, 2, 0));
128
129 _mm_store_ps(iBufferPtr, iValue);
130
131 iBufferPtr += 4;
132 }
133
134 number = quarterPoints * 4;
135 for (; number < num_points; number++) {
136 *iBufferPtr++ = *complexVectorPtr++;
137 complexVectorPtr++;
138 }
139}
140#endif /* LV_HAVE_SSE */
141
142
143#ifdef LV_HAVE_GENERIC
144
145static inline void volk_32fc_deinterleave_real_32f_generic(float* iBuffer,
146 const lv_32fc_t* complexVector,
147 unsigned int num_points)
148{
149 unsigned int number = 0;
150 const float* complexVectorPtr = (float*)complexVector;
151 float* iBufferPtr = iBuffer;
152 for (number = 0; number < num_points; number++) {
153 *iBufferPtr++ = *complexVectorPtr++;
154 complexVectorPtr++;
155 }
156}
157#endif /* LV_HAVE_GENERIC */
158
159
160#ifdef LV_HAVE_NEON
161#include <arm_neon.h>
162
163static inline void volk_32fc_deinterleave_real_32f_neon(float* iBuffer,
164 const lv_32fc_t* complexVector,
165 unsigned int num_points)
166{
167 unsigned int number = 0;
168 unsigned int quarter_points = num_points / 4;
169 const float* complexVectorPtr = (float*)complexVector;
170 float* iBufferPtr = iBuffer;
171 float32x4x2_t complexInput;
172
173 for (number = 0; number < quarter_points; number++) {
174 complexInput = vld2q_f32(complexVectorPtr);
175 vst1q_f32(iBufferPtr, complexInput.val[0]);
176 complexVectorPtr += 8;
177 iBufferPtr += 4;
178 }
179
180 for (number = quarter_points * 4; number < num_points; number++) {
181 *iBufferPtr++ = *complexVectorPtr++;
182 complexVectorPtr++;
183 }
184}
185#endif /* LV_HAVE_NEON */
186
187#ifdef LV_HAVE_NEONV8
188#include <arm_neon.h>
189
190static inline void volk_32fc_deinterleave_real_32f_neonv8(float* iBuffer,
191 const lv_32fc_t* complexVector,
192 unsigned int num_points)
193{
194 const unsigned int eighthPoints = num_points / 8;
195 const float* complexVectorPtr = (float*)complexVector;
196 float* iBufferPtr = iBuffer;
197
198 for (unsigned int number = 0; number < eighthPoints; number++) {
199 float32x4x2_t cplx0 = vld2q_f32(complexVectorPtr);
200 float32x4x2_t cplx1 = vld2q_f32(complexVectorPtr + 8);
201 __VOLK_PREFETCH(complexVectorPtr + 32);
202
203 vst1q_f32(iBufferPtr, cplx0.val[0]);
204 vst1q_f32(iBufferPtr + 4, cplx1.val[0]);
205
206 complexVectorPtr += 16;
207 iBufferPtr += 8;
208 }
209
210 for (unsigned int number = eighthPoints * 8; number < num_points; number++) {
211 *iBufferPtr++ = *complexVectorPtr++;
212 complexVectorPtr++;
213 }
214}
215#endif /* LV_HAVE_NEONV8 */
216
217#endif /* INCLUDED_volk_32fc_deinterleave_real_32f_a_H */
218
219
220#ifndef INCLUDED_volk_32fc_deinterleave_real_32f_u_H
221#define INCLUDED_volk_32fc_deinterleave_real_32f_u_H
222
223#include <inttypes.h>
224#include <stdio.h>
225
226#ifdef LV_HAVE_AVX2
227#include <immintrin.h>
228
229static inline void volk_32fc_deinterleave_real_32f_u_avx2(float* iBuffer,
230 const lv_32fc_t* complexVector,
231 unsigned int num_points)
232{
233 unsigned int number = 0;
234 const unsigned int eighthPoints = num_points / 8;
235
236 const float* complexVectorPtr = (const float*)complexVector;
237 float* iBufferPtr = iBuffer;
238
239 __m256 cplxValue1, cplxValue2;
240 __m256 iValue;
241 __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
242 for (; number < eighthPoints; number++) {
243
244 cplxValue1 = _mm256_loadu_ps(complexVectorPtr);
245 complexVectorPtr += 8;
246
247 cplxValue2 = _mm256_loadu_ps(complexVectorPtr);
248 complexVectorPtr += 8;
249
250 // Arrange in i1i2i3i4 format
251 iValue = _mm256_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2, 0, 2, 0));
252 iValue = _mm256_permutevar8x32_ps(iValue, idx);
253
254 _mm256_storeu_ps(iBufferPtr, iValue);
255
256 iBufferPtr += 8;
257 }
258
259 number = eighthPoints * 8;
260 for (; number < num_points; number++) {
261 *iBufferPtr++ = *complexVectorPtr++;
262 complexVectorPtr++;
263 }
264}
265#endif /* LV_HAVE_AVX2 */
266
267#ifdef LV_HAVE_RVV
268#include <riscv_vector.h>
269
270static inline void volk_32fc_deinterleave_real_32f_rvv(float* iBuffer,
271 const lv_32fc_t* complexVector,
272 unsigned int num_points)
273{
274 const uint64_t* in = (const uint64_t*)complexVector;
275 size_t n = num_points;
276 for (size_t vl; n > 0; n -= vl, in += vl, iBuffer += vl) {
277 vl = __riscv_vsetvl_e64m8(n);
278 vuint64m8_t vc = __riscv_vle64_v_u64m8(in, vl);
279 __riscv_vse32((uint32_t*)iBuffer, __riscv_vnsrl(vc, 0, vl), vl);
280 }
281}
282#endif /*LV_HAVE_RVV*/
283
284#endif /* INCLUDED_volk_32fc_deinterleave_real_32f_u_H */