Vector Optimized Library of Kernels 3.3.0
Architecture-tuned implementations of math kernels
Loading...
Searching...
No Matches
volk_16ic_deinterleave_real_16i.h
Go to the documentation of this file.
1/* -*- c++ -*- */
2/*
3 * Copyright 2012, 2014 Free Software Foundation, Inc.
4 *
5 * This file is part of VOLK
6 *
7 * SPDX-License-Identifier: LGPL-3.0-or-later
8 */
9
40
41#ifndef INCLUDED_volk_16ic_deinterleave_real_16i_a_H
42#define INCLUDED_volk_16ic_deinterleave_real_16i_a_H
43
44#include <inttypes.h>
45#include <stdio.h>
46
47
48#ifdef LV_HAVE_AVX2
49#include <immintrin.h>
50
51static inline void volk_16ic_deinterleave_real_16i_a_avx2(int16_t* iBuffer,
52 const lv_16sc_t* complexVector,
53 unsigned int num_points)
54{
55 unsigned int number = 0;
56 const int16_t* complexVectorPtr = (int16_t*)complexVector;
57 int16_t* iBufferPtr = iBuffer;
58
59 __m256i iMoveMask1 = _mm256_set_epi8(0x80,
60 0x80,
61 0x80,
62 0x80,
63 0x80,
64 0x80,
65 0x80,
66 0x80,
67 13,
68 12,
69 9,
70 8,
71 5,
72 4,
73 1,
74 0,
75 0x80,
76 0x80,
77 0x80,
78 0x80,
79 0x80,
80 0x80,
81 0x80,
82 0x80,
83 13,
84 12,
85 9,
86 8,
87 5,
88 4,
89 1,
90 0);
91 __m256i iMoveMask2 = _mm256_set_epi8(13,
92 12,
93 9,
94 8,
95 5,
96 4,
97 1,
98 0,
99 0x80,
100 0x80,
101 0x80,
102 0x80,
103 0x80,
104 0x80,
105 0x80,
106 0x80,
107 13,
108 12,
109 9,
110 8,
111 5,
112 4,
113 1,
114 0,
115 0x80,
116 0x80,
117 0x80,
118 0x80,
119 0x80,
120 0x80,
121 0x80,
122 0x80);
123
124 __m256i complexVal1, complexVal2, iOutputVal;
125
126 unsigned int sixteenthPoints = num_points / 16;
127
128 for (number = 0; number < sixteenthPoints; number++) {
129 complexVal1 = _mm256_load_si256((__m256i*)complexVectorPtr);
130 complexVectorPtr += 16;
131 complexVal2 = _mm256_load_si256((__m256i*)complexVectorPtr);
132 complexVectorPtr += 16;
133
134 complexVal1 = _mm256_shuffle_epi8(complexVal1, iMoveMask1);
135 complexVal2 = _mm256_shuffle_epi8(complexVal2, iMoveMask2);
136
137 iOutputVal = _mm256_or_si256(complexVal1, complexVal2);
138 iOutputVal = _mm256_permute4x64_epi64(iOutputVal, 0xd8);
139
140 _mm256_store_si256((__m256i*)iBufferPtr, iOutputVal);
141
142 iBufferPtr += 16;
143 }
144
145 number = sixteenthPoints * 16;
146 for (; number < num_points; number++) {
147 *iBufferPtr++ = *complexVectorPtr++;
148 complexVectorPtr++;
149 }
150}
151#endif /* LV_HAVE_AVX2 */
152
153#ifdef LV_HAVE_SSSE3
154#include <tmmintrin.h>
155
156static inline void volk_16ic_deinterleave_real_16i_a_ssse3(int16_t* iBuffer,
157 const lv_16sc_t* complexVector,
158 unsigned int num_points)
159{
160 unsigned int number = 0;
161 const int16_t* complexVectorPtr = (int16_t*)complexVector;
162 int16_t* iBufferPtr = iBuffer;
163
164 __m128i iMoveMask1 = _mm_set_epi8(
165 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0);
166 __m128i iMoveMask2 = _mm_set_epi8(
167 13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
168
169 __m128i complexVal1, complexVal2, iOutputVal;
170
171 unsigned int eighthPoints = num_points / 8;
172
173 for (number = 0; number < eighthPoints; number++) {
174 complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr);
175 complexVectorPtr += 8;
176 complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr);
177 complexVectorPtr += 8;
178
179 complexVal1 = _mm_shuffle_epi8(complexVal1, iMoveMask1);
180 complexVal2 = _mm_shuffle_epi8(complexVal2, iMoveMask2);
181
182 iOutputVal = _mm_or_si128(complexVal1, complexVal2);
183
184 _mm_store_si128((__m128i*)iBufferPtr, iOutputVal);
185
186 iBufferPtr += 8;
187 }
188
189 number = eighthPoints * 8;
190 for (; number < num_points; number++) {
191 *iBufferPtr++ = *complexVectorPtr++;
192 complexVectorPtr++;
193 }
194}
195#endif /* LV_HAVE_SSSE3 */
196
197
198#ifdef LV_HAVE_SSE2
199#include <emmintrin.h>
200
201static inline void volk_16ic_deinterleave_real_16i_a_sse2(int16_t* iBuffer,
202 const lv_16sc_t* complexVector,
203 unsigned int num_points)
204{
205 unsigned int number = 0;
206 const int16_t* complexVectorPtr = (int16_t*)complexVector;
207 int16_t* iBufferPtr = iBuffer;
208 __m128i complexVal1, complexVal2, iOutputVal;
209 __m128i lowMask = _mm_set_epi32(0x0, 0x0, 0xFFFFFFFF, 0xFFFFFFFF);
210 __m128i highMask = _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0x0, 0x0);
211
212 unsigned int eighthPoints = num_points / 8;
213
214 for (number = 0; number < eighthPoints; number++) {
215 complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr);
216 complexVectorPtr += 8;
217 complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr);
218 complexVectorPtr += 8;
219
220 complexVal1 = _mm_shufflelo_epi16(complexVal1, _MM_SHUFFLE(3, 1, 2, 0));
221
222 complexVal1 = _mm_shufflehi_epi16(complexVal1, _MM_SHUFFLE(3, 1, 2, 0));
223
224 complexVal1 = _mm_shuffle_epi32(complexVal1, _MM_SHUFFLE(3, 1, 2, 0));
225
226 complexVal2 = _mm_shufflelo_epi16(complexVal2, _MM_SHUFFLE(3, 1, 2, 0));
227
228 complexVal2 = _mm_shufflehi_epi16(complexVal2, _MM_SHUFFLE(3, 1, 2, 0));
229
230 complexVal2 = _mm_shuffle_epi32(complexVal2, _MM_SHUFFLE(2, 0, 3, 1));
231
232 iOutputVal = _mm_or_si128(_mm_and_si128(complexVal1, lowMask),
233 _mm_and_si128(complexVal2, highMask));
234
235 _mm_store_si128((__m128i*)iBufferPtr, iOutputVal);
236
237 iBufferPtr += 8;
238 }
239
240 number = eighthPoints * 8;
241 for (; number < num_points; number++) {
242 *iBufferPtr++ = *complexVectorPtr++;
243 complexVectorPtr++;
244 }
245}
246#endif /* LV_HAVE_SSE2 */
247
248#ifdef LV_HAVE_GENERIC
249
250static inline void volk_16ic_deinterleave_real_16i_generic(int16_t* iBuffer,
251 const lv_16sc_t* complexVector,
252 unsigned int num_points)
253{
254 unsigned int number = 0;
255 const int16_t* complexVectorPtr = (int16_t*)complexVector;
256 int16_t* iBufferPtr = iBuffer;
257 for (number = 0; number < num_points; number++) {
258 *iBufferPtr++ = *complexVectorPtr++;
259 complexVectorPtr++;
260 }
261}
262#endif /* LV_HAVE_GENERIC */
263
264
265#ifdef LV_HAVE_NEON
266#include <arm_neon.h>
267
268static inline void volk_16ic_deinterleave_real_16i_neon(int16_t* iBuffer,
269 const lv_16sc_t* complexVector,
270 unsigned int num_points)
271{
272 unsigned int number = 0;
273 const unsigned int eighthPoints = num_points / 8;
274 const int16_t* complexVectorPtr = (const int16_t*)complexVector;
275 int16_t* iBufferPtr = iBuffer;
276
277 int16x8x2_t complexVal;
278
279 for (; number < eighthPoints; number++) {
280 complexVal = vld2q_s16(complexVectorPtr);
281 vst1q_s16(iBufferPtr, complexVal.val[0]);
282 complexVectorPtr += 16;
283 iBufferPtr += 8;
284 }
285
286 number = eighthPoints * 8;
287 for (; number < num_points; number++) {
288 *iBufferPtr++ = *complexVectorPtr++;
289 complexVectorPtr++;
290 }
291}
292#endif /* LV_HAVE_NEON */
293
294
295#ifdef LV_HAVE_NEONV8
296#include <arm_neon.h>
297
298static inline void volk_16ic_deinterleave_real_16i_neonv8(int16_t* iBuffer,
299 const lv_16sc_t* complexVector,
300 unsigned int num_points)
301{
302 unsigned int number = 0;
303 const unsigned int sixteenthPoints = num_points / 16;
304 const int16_t* complexVectorPtr = (const int16_t*)complexVector;
305 int16_t* iBufferPtr = iBuffer;
306
307 int16x8x2_t complexVal0, complexVal1;
308
309 for (; number < sixteenthPoints; number++) {
310 complexVal0 = vld2q_s16(complexVectorPtr);
311 complexVal1 = vld2q_s16(complexVectorPtr + 16);
312 __VOLK_PREFETCH(complexVectorPtr + 32);
313
314 vst1q_s16(iBufferPtr, complexVal0.val[0]);
315 vst1q_s16(iBufferPtr + 8, complexVal1.val[0]);
316
317 complexVectorPtr += 32;
318 iBufferPtr += 16;
319 }
320
321 number = sixteenthPoints * 16;
322 for (; number < num_points; number++) {
323 *iBufferPtr++ = *complexVectorPtr++;
324 complexVectorPtr++;
325 }
326}
327#endif /* LV_HAVE_NEONV8 */
328
329
330#endif /* INCLUDED_volk_16ic_deinterleave_real_16i_a_H */
331
332
333#ifndef INCLUDED_volk_16ic_deinterleave_real_16i_u_H
334#define INCLUDED_volk_16ic_deinterleave_real_16i_u_H
335
336#include <inttypes.h>
337#include <stdio.h>
338
339
340#ifdef LV_HAVE_AVX2
341#include <immintrin.h>
342
343static inline void volk_16ic_deinterleave_real_16i_u_avx2(int16_t* iBuffer,
344 const lv_16sc_t* complexVector,
345 unsigned int num_points)
346{
347 unsigned int number = 0;
348 const int16_t* complexVectorPtr = (int16_t*)complexVector;
349 int16_t* iBufferPtr = iBuffer;
350
351 __m256i iMoveMask1 = _mm256_set_epi8(0x80,
352 0x80,
353 0x80,
354 0x80,
355 0x80,
356 0x80,
357 0x80,
358 0x80,
359 13,
360 12,
361 9,
362 8,
363 5,
364 4,
365 1,
366 0,
367 0x80,
368 0x80,
369 0x80,
370 0x80,
371 0x80,
372 0x80,
373 0x80,
374 0x80,
375 13,
376 12,
377 9,
378 8,
379 5,
380 4,
381 1,
382 0);
383 __m256i iMoveMask2 = _mm256_set_epi8(13,
384 12,
385 9,
386 8,
387 5,
388 4,
389 1,
390 0,
391 0x80,
392 0x80,
393 0x80,
394 0x80,
395 0x80,
396 0x80,
397 0x80,
398 0x80,
399 13,
400 12,
401 9,
402 8,
403 5,
404 4,
405 1,
406 0,
407 0x80,
408 0x80,
409 0x80,
410 0x80,
411 0x80,
412 0x80,
413 0x80,
414 0x80);
415
416 __m256i complexVal1, complexVal2, iOutputVal;
417
418 unsigned int sixteenthPoints = num_points / 16;
419
420 for (number = 0; number < sixteenthPoints; number++) {
421 complexVal1 = _mm256_loadu_si256((__m256i*)complexVectorPtr);
422 complexVectorPtr += 16;
423 complexVal2 = _mm256_loadu_si256((__m256i*)complexVectorPtr);
424 complexVectorPtr += 16;
425
426 complexVal1 = _mm256_shuffle_epi8(complexVal1, iMoveMask1);
427 complexVal2 = _mm256_shuffle_epi8(complexVal2, iMoveMask2);
428
429 iOutputVal = _mm256_or_si256(complexVal1, complexVal2);
430 iOutputVal = _mm256_permute4x64_epi64(iOutputVal, 0xd8);
431
432 _mm256_storeu_si256((__m256i*)iBufferPtr, iOutputVal);
433
434 iBufferPtr += 16;
435 }
436
437 number = sixteenthPoints * 16;
438 for (; number < num_points; number++) {
439 *iBufferPtr++ = *complexVectorPtr++;
440 complexVectorPtr++;
441 }
442}
443#endif /* LV_HAVE_AVX2 */
444
445#ifdef LV_HAVE_RVV
446#include <riscv_vector.h>
447
448static inline void volk_16ic_deinterleave_real_16i_rvv(int16_t* iBuffer,
449 const lv_16sc_t* complexVector,
450 unsigned int num_points)
451{
452 const uint32_t* in = (const uint32_t*)complexVector;
453 size_t n = num_points;
454 for (size_t vl; n > 0; n -= vl, in += vl, iBuffer += vl) {
455 vl = __riscv_vsetvl_e32m8(n);
456 vuint32m8_t vc = __riscv_vle32_v_u32m8(in, vl);
457 __riscv_vse16((uint16_t*)iBuffer, __riscv_vnsrl(vc, 0, vl), vl);
458 }
459}
460#endif /*LV_HAVE_RVV*/
461
462#endif /* INCLUDED_volk_16ic_deinterleave_real_16i_u_H */