Vector Optimized Library of Kernels 3.3.0
Architecture-tuned implementations of math kernels
Loading...
Searching...
No Matches
volk_32fc_s32f_deinterleave_real_16i.h
Go to the documentation of this file.
1/* -*- c++ -*- */
2/*
3 * Copyright 2012, 2014 Free Software Foundation, Inc.
4 *
5 * This file is part of VOLK
6 *
7 * SPDX-License-Identifier: LGPL-3.0-or-later
8 */
9
59
60#ifndef INCLUDED_volk_32fc_s32f_deinterleave_real_16i_a_H
61#define INCLUDED_volk_32fc_s32f_deinterleave_real_16i_a_H
62
63#include <inttypes.h>
64#include <stdio.h>
65#include <volk/volk_common.h>
66
67
68#ifdef LV_HAVE_AVX2
69#include <immintrin.h>
70
71static inline void
72volk_32fc_s32f_deinterleave_real_16i_a_avx2(int16_t* iBuffer,
73 const lv_32fc_t* complexVector,
74 const float scalar,
75 unsigned int num_points)
76{
77 unsigned int number = 0;
78 const unsigned int eighthPoints = num_points / 8;
79
80 const float* complexVectorPtr = (float*)complexVector;
81 int16_t* iBufferPtr = iBuffer;
82
83 __m256 vScalar = _mm256_set1_ps(scalar);
84
85 __m256 cplxValue1, cplxValue2, iValue;
86 __m256i a;
87 __m128i b;
88
89 __m256i idx = _mm256_set_epi32(3, 3, 3, 3, 5, 1, 4, 0);
90
91 for (; number < eighthPoints; number++) {
92 cplxValue1 = _mm256_load_ps(complexVectorPtr);
93 complexVectorPtr += 8;
94
95 cplxValue2 = _mm256_load_ps(complexVectorPtr);
96 complexVectorPtr += 8;
97
98 // Arrange in i1i2i3i4 format
99 iValue = _mm256_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2, 0, 2, 0));
100
101 iValue = _mm256_mul_ps(iValue, vScalar);
102
103 a = _mm256_cvtps_epi32(iValue);
104 a = _mm256_packs_epi32(a, a);
105 a = _mm256_permutevar8x32_epi32(a, idx);
106 b = _mm256_extracti128_si256(a, 0);
107
108 _mm_store_si128((__m128i*)iBufferPtr, b);
109 iBufferPtr += 8;
110 }
111
112 number = eighthPoints * 8;
113 iBufferPtr = &iBuffer[number];
114 for (; number < num_points; number++) {
115 *iBufferPtr++ = (int16_t)rintf(*complexVectorPtr++ * scalar);
116 complexVectorPtr++;
117 }
118}
119
120
121#endif /* LV_HAVE_AVX2 */
122
123#ifdef LV_HAVE_SSE
124#include <xmmintrin.h>
125
126static inline void
128 const lv_32fc_t* complexVector,
129 const float scalar,
130 unsigned int num_points)
131{
132 unsigned int number = 0;
133 const unsigned int quarterPoints = num_points / 4;
134
135 const float* complexVectorPtr = (float*)complexVector;
136 int16_t* iBufferPtr = iBuffer;
137
138 __m128 vScalar = _mm_set_ps1(scalar);
139
140 __m128 cplxValue1, cplxValue2, iValue;
141
142 __VOLK_ATTR_ALIGNED(16) float floatBuffer[4];
143
144 for (; number < quarterPoints; number++) {
145 cplxValue1 = _mm_load_ps(complexVectorPtr);
146 complexVectorPtr += 4;
147
148 cplxValue2 = _mm_load_ps(complexVectorPtr);
149 complexVectorPtr += 4;
150
151 // Arrange in i1i2i3i4 format
152 iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2, 0, 2, 0));
153
154 iValue = _mm_mul_ps(iValue, vScalar);
155
156 _mm_store_ps(floatBuffer, iValue);
157 *iBufferPtr++ = (int16_t)rintf(floatBuffer[0]);
158 *iBufferPtr++ = (int16_t)rintf(floatBuffer[1]);
159 *iBufferPtr++ = (int16_t)rintf(floatBuffer[2]);
160 *iBufferPtr++ = (int16_t)rintf(floatBuffer[3]);
161 }
162
163 number = quarterPoints * 4;
164 iBufferPtr = &iBuffer[number];
165 for (; number < num_points; number++) {
166 *iBufferPtr++ = (int16_t)rintf(*complexVectorPtr++ * scalar);
167 complexVectorPtr++;
168 }
169}
170
171#endif /* LV_HAVE_SSE */
172
173
174#ifdef LV_HAVE_GENERIC
175
176static inline void
178 const lv_32fc_t* complexVector,
179 const float scalar,
180 unsigned int num_points)
181{
182 const float* complexVectorPtr = (float*)complexVector;
183 int16_t* iBufferPtr = iBuffer;
184 unsigned int number = 0;
185 for (number = 0; number < num_points; number++) {
186 *iBufferPtr++ = (int16_t)rintf(*complexVectorPtr++ * scalar);
187 complexVectorPtr++;
188 }
189}
190
191#endif /* LV_HAVE_GENERIC */
192
193#endif /* INCLUDED_volk_32fc_s32f_deinterleave_real_16i_a_H */
194
195#ifndef INCLUDED_volk_32fc_s32f_deinterleave_real_16i_u_H
196#define INCLUDED_volk_32fc_s32f_deinterleave_real_16i_u_H
197
198#include <inttypes.h>
199#include <stdio.h>
200#include <volk/volk_common.h>
201
202#ifdef LV_HAVE_AVX2
203#include <immintrin.h>
204
205static inline void
206volk_32fc_s32f_deinterleave_real_16i_u_avx2(int16_t* iBuffer,
207 const lv_32fc_t* complexVector,
208 const float scalar,
209 unsigned int num_points)
210{
211 unsigned int number = 0;
212 const unsigned int eighthPoints = num_points / 8;
213
214 const float* complexVectorPtr = (float*)complexVector;
215 int16_t* iBufferPtr = iBuffer;
216
217 __m256 vScalar = _mm256_set1_ps(scalar);
218
219 __m256 cplxValue1, cplxValue2, iValue;
220 __m256i a;
221 __m128i b;
222
223 __m256i idx = _mm256_set_epi32(3, 3, 3, 3, 5, 1, 4, 0);
224
225 for (; number < eighthPoints; number++) {
226 cplxValue1 = _mm256_loadu_ps(complexVectorPtr);
227 complexVectorPtr += 8;
228
229 cplxValue2 = _mm256_loadu_ps(complexVectorPtr);
230 complexVectorPtr += 8;
231
232 // Arrange in i1i2i3i4 format
233 iValue = _mm256_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2, 0, 2, 0));
234
235 iValue = _mm256_mul_ps(iValue, vScalar);
236
237 a = _mm256_cvtps_epi32(iValue);
238 a = _mm256_packs_epi32(a, a);
239 a = _mm256_permutevar8x32_epi32(a, idx);
240 b = _mm256_extracti128_si256(a, 0);
241
242 _mm_storeu_si128((__m128i*)iBufferPtr, b);
243 iBufferPtr += 8;
244 }
245
246 number = eighthPoints * 8;
247 iBufferPtr = &iBuffer[number];
248 for (; number < num_points; number++) {
249 *iBufferPtr++ = (int16_t)rintf(*complexVectorPtr++ * scalar);
250 complexVectorPtr++;
251 }
252}
253
254#endif /* LV_HAVE_AVX2 */
255
256#ifdef LV_HAVE_NEON
257#include <arm_neon.h>
258
259static inline void
261 const lv_32fc_t* complexVector,
262 const float scalar,
263 unsigned int num_points)
264{
265 unsigned int number = 0;
266 const unsigned int quarter_points = num_points / 4;
267
268 const float* complexVectorPtr = (float*)complexVector;
269 int16_t* iBufferPtr = iBuffer;
270 float32x4_t vScalar = vdupq_n_f32(scalar);
271
272 float32x4_t half = vdupq_n_f32(0.5f);
273 float32x4_t neg_half = vdupq_n_f32(-0.5f);
274 float32x4_t zero = vdupq_n_f32(0.0f);
275
276 for (; number < quarter_points; number++) {
277 float32x4x2_t input = vld2q_f32(complexVectorPtr);
278 complexVectorPtr += 8;
279
280 float32x4_t scaled = vmulq_f32(input.val[0], vScalar);
281 // Round to nearest: add copysign(0.5, x) before truncating
282 uint32x4_t neg = vcltq_f32(scaled, zero);
283 scaled = vaddq_f32(scaled, vbslq_f32(neg, neg_half, half));
284 int32x4_t intVal = vcvtq_s32_f32(scaled);
285 int16x4_t shortVal = vqmovn_s32(intVal);
286
287 vst1_s16(iBufferPtr, shortVal);
288 iBufferPtr += 4;
289 }
290
291 number = quarter_points * 4;
292 for (; number < num_points; number++) {
293 *iBufferPtr++ = (int16_t)rintf(*complexVectorPtr++ * scalar);
294 complexVectorPtr++;
295 }
296}
297#endif /* LV_HAVE_NEON */
298
299#ifdef LV_HAVE_NEONV8
300#include <arm_neon.h>
301
302static inline void
303volk_32fc_s32f_deinterleave_real_16i_neonv8(int16_t* iBuffer,
304 const lv_32fc_t* complexVector,
305 const float scalar,
306 unsigned int num_points)
307{
308 unsigned int number = 0;
309 const unsigned int eighth_points = num_points / 8;
310
311 const float* complexVectorPtr = (float*)complexVector;
312 int16_t* iBufferPtr = iBuffer;
313 float32x4_t vScalar = vdupq_n_f32(scalar);
314
315 for (; number < eighth_points; number++) {
316 float32x4x2_t input0 = vld2q_f32(complexVectorPtr);
317 float32x4x2_t input1 = vld2q_f32(complexVectorPtr + 8);
318 complexVectorPtr += 16;
319 __VOLK_PREFETCH(complexVectorPtr + 16);
320
321 float32x4_t scaled0 = vmulq_f32(input0.val[0], vScalar);
322 float32x4_t scaled1 = vmulq_f32(input1.val[0], vScalar);
323
324 int32x4_t intVal0 = vcvtnq_s32_f32(scaled0);
325 int32x4_t intVal1 = vcvtnq_s32_f32(scaled1);
326
327 int16x4_t shortVal0 = vqmovn_s32(intVal0);
328 int16x4_t shortVal1 = vqmovn_s32(intVal1);
329
330 vst1_s16(iBufferPtr, shortVal0);
331 vst1_s16(iBufferPtr + 4, shortVal1);
332 iBufferPtr += 8;
333 }
334
335 number = eighth_points * 8;
336 for (; number < num_points; number++) {
337 *iBufferPtr++ = (int16_t)rintf(*complexVectorPtr++ * scalar);
338 complexVectorPtr++;
339 }
340}
341#endif /* LV_HAVE_NEONV8 */
342
343#ifdef LV_HAVE_RVV
344#include <riscv_vector.h>
345
346static inline void
347volk_32fc_s32f_deinterleave_real_16i_rvv(int16_t* iBuffer,
348 const lv_32fc_t* complexVector,
349 const float scalar,
350 unsigned int num_points)
351{
352 const uint64_t* in = (const uint64_t*)complexVector;
353 size_t n = num_points;
354 for (size_t vl; n > 0; n -= vl, in += vl, iBuffer += vl) {
355 vl = __riscv_vsetvl_e64m8(n);
356 vuint32m4_t vi = __riscv_vnsrl(__riscv_vle64_v_u64m8(in, vl), 0, vl);
357 vfloat32m4_t vif = __riscv_vfmul(__riscv_vreinterpret_f32m4(vi), scalar, vl);
358 __riscv_vse16(iBuffer, __riscv_vncvt_x(__riscv_vfcvt_x(vif, vl), vl), vl);
359 }
360}
361#endif /*LV_HAVE_RVV*/
362
363#endif /* INCLUDED_volk_32fc_s32f_deinterleave_real_16i_u_H */