Vector Optimized Library of Kernels 3.3.0
Architecture-tuned implementations of math kernels
Loading...
Searching...
No Matches
volk_16ic_deinterleave_16i_x2.h
Go to the documentation of this file.
/* -*- c++ -*- */
/*
 * Copyright 2012, 2014 Free Software Foundation, Inc.
 *
 * This file is part of VOLK
 *
 * SPDX-License-Identifier: LGPL-3.0-or-later
 */
9
40
41#ifndef INCLUDED_volk_16ic_deinterleave_16i_x2_a_H
42#define INCLUDED_volk_16ic_deinterleave_16i_x2_a_H
43
44#include <inttypes.h>
45#include <stdio.h>
46#ifdef LV_HAVE_AVX2
47#include <immintrin.h>
48
49static inline void volk_16ic_deinterleave_16i_x2_a_avx2(int16_t* iBuffer,
50 int16_t* qBuffer,
51 const lv_16sc_t* complexVector,
52 unsigned int num_points)
53{
54 unsigned int number = 0;
55 const int8_t* complexVectorPtr = (int8_t*)complexVector;
56 int16_t* iBufferPtr = iBuffer;
57 int16_t* qBufferPtr = qBuffer;
58
59 __m256i MoveMask = _mm256_set_epi8(15,
60 14,
61 11,
62 10,
63 7,
64 6,
65 3,
66 2,
67 13,
68 12,
69 9,
70 8,
71 5,
72 4,
73 1,
74 0,
75 15,
76 14,
77 11,
78 10,
79 7,
80 6,
81 3,
82 2,
83 13,
84 12,
85 9,
86 8,
87 5,
88 4,
89 1,
90 0);
91
92 __m256i iMove2, iMove1;
93 __m256i complexVal1, complexVal2, iOutputVal, qOutputVal;
94
95 unsigned int sixteenthPoints = num_points / 16;
96
97 for (number = 0; number < sixteenthPoints; number++) {
98 complexVal1 = _mm256_load_si256((__m256i*)complexVectorPtr);
99 complexVectorPtr += 32;
100 complexVal2 = _mm256_load_si256((__m256i*)complexVectorPtr);
101 complexVectorPtr += 32;
102
103 iMove2 = _mm256_shuffle_epi8(complexVal2, MoveMask);
104 iMove1 = _mm256_shuffle_epi8(complexVal1, MoveMask);
105
106 iOutputVal = _mm256_permute2x128_si256(_mm256_permute4x64_epi64(iMove1, 0x08),
107 _mm256_permute4x64_epi64(iMove2, 0x80),
108 0x30);
109 qOutputVal = _mm256_permute2x128_si256(_mm256_permute4x64_epi64(iMove1, 0x0d),
110 _mm256_permute4x64_epi64(iMove2, 0xd0),
111 0x30);
112
113 _mm256_store_si256((__m256i*)iBufferPtr, iOutputVal);
114 _mm256_store_si256((__m256i*)qBufferPtr, qOutputVal);
115
116 iBufferPtr += 16;
117 qBufferPtr += 16;
118 }
119
120 number = sixteenthPoints * 16;
121 int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr;
122 for (; number < num_points; number++) {
123 *iBufferPtr++ = *int16ComplexVectorPtr++;
124 *qBufferPtr++ = *int16ComplexVectorPtr++;
125 }
126}
127#endif /* LV_HAVE_AVX2 */
128
129#ifdef LV_HAVE_SSSE3
130#include <tmmintrin.h>
131
132static inline void volk_16ic_deinterleave_16i_x2_a_ssse3(int16_t* iBuffer,
133 int16_t* qBuffer,
134 const lv_16sc_t* complexVector,
135 unsigned int num_points)
136{
137 unsigned int number = 0;
138 const int8_t* complexVectorPtr = (int8_t*)complexVector;
139 int16_t* iBufferPtr = iBuffer;
140 int16_t* qBufferPtr = qBuffer;
141
142 __m128i iMoveMask1 = _mm_set_epi8(
143 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0);
144 __m128i iMoveMask2 = _mm_set_epi8(
145 13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
146
147 __m128i qMoveMask1 = _mm_set_epi8(
148 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 15, 14, 11, 10, 7, 6, 3, 2);
149 __m128i qMoveMask2 = _mm_set_epi8(
150 15, 14, 11, 10, 7, 6, 3, 2, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
151
152 __m128i complexVal1, complexVal2, iOutputVal, qOutputVal;
153
154 unsigned int eighthPoints = num_points / 8;
155
156 for (number = 0; number < eighthPoints; number++) {
157 complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr);
158 complexVectorPtr += 16;
159 complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr);
160 complexVectorPtr += 16;
161
162 iOutputVal = _mm_or_si128(_mm_shuffle_epi8(complexVal1, iMoveMask1),
163 _mm_shuffle_epi8(complexVal2, iMoveMask2));
164 qOutputVal = _mm_or_si128(_mm_shuffle_epi8(complexVal1, qMoveMask1),
165 _mm_shuffle_epi8(complexVal2, qMoveMask2));
166
167 _mm_store_si128((__m128i*)iBufferPtr, iOutputVal);
168 _mm_store_si128((__m128i*)qBufferPtr, qOutputVal);
169
170 iBufferPtr += 8;
171 qBufferPtr += 8;
172 }
173
174 number = eighthPoints * 8;
175 int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr;
176 for (; number < num_points; number++) {
177 *iBufferPtr++ = *int16ComplexVectorPtr++;
178 *qBufferPtr++ = *int16ComplexVectorPtr++;
179 }
180}
181#endif /* LV_HAVE_SSSE3 */
182
183#ifdef LV_HAVE_SSE2
184#include <emmintrin.h>
185
186static inline void volk_16ic_deinterleave_16i_x2_a_sse2(int16_t* iBuffer,
187 int16_t* qBuffer,
188 const lv_16sc_t* complexVector,
189 unsigned int num_points)
190{
191 unsigned int number = 0;
192 const int16_t* complexVectorPtr = (int16_t*)complexVector;
193 int16_t* iBufferPtr = iBuffer;
194 int16_t* qBufferPtr = qBuffer;
195 __m128i complexVal1, complexVal2, iComplexVal1, iComplexVal2, qComplexVal1,
196 qComplexVal2, iOutputVal, qOutputVal;
197 __m128i lowMask = _mm_set_epi32(0x0, 0x0, 0xFFFFFFFF, 0xFFFFFFFF);
198 __m128i highMask = _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0x0, 0x0);
199
200 unsigned int eighthPoints = num_points / 8;
201
202 for (number = 0; number < eighthPoints; number++) {
203 complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr);
204 complexVectorPtr += 8;
205 complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr);
206 complexVectorPtr += 8;
207
208 iComplexVal1 = _mm_shufflelo_epi16(complexVal1, _MM_SHUFFLE(3, 1, 2, 0));
209
210 iComplexVal1 = _mm_shufflehi_epi16(iComplexVal1, _MM_SHUFFLE(3, 1, 2, 0));
211
212 iComplexVal1 = _mm_shuffle_epi32(iComplexVal1, _MM_SHUFFLE(3, 1, 2, 0));
213
214 iComplexVal2 = _mm_shufflelo_epi16(complexVal2, _MM_SHUFFLE(3, 1, 2, 0));
215
216 iComplexVal2 = _mm_shufflehi_epi16(iComplexVal2, _MM_SHUFFLE(3, 1, 2, 0));
217
218 iComplexVal2 = _mm_shuffle_epi32(iComplexVal2, _MM_SHUFFLE(2, 0, 3, 1));
219
220 iOutputVal = _mm_or_si128(_mm_and_si128(iComplexVal1, lowMask),
221 _mm_and_si128(iComplexVal2, highMask));
222
223 _mm_store_si128((__m128i*)iBufferPtr, iOutputVal);
224
225 qComplexVal1 = _mm_shufflelo_epi16(complexVal1, _MM_SHUFFLE(2, 0, 3, 1));
226
227 qComplexVal1 = _mm_shufflehi_epi16(qComplexVal1, _MM_SHUFFLE(2, 0, 3, 1));
228
229 qComplexVal1 = _mm_shuffle_epi32(qComplexVal1, _MM_SHUFFLE(3, 1, 2, 0));
230
231 qComplexVal2 = _mm_shufflelo_epi16(complexVal2, _MM_SHUFFLE(2, 0, 3, 1));
232
233 qComplexVal2 = _mm_shufflehi_epi16(qComplexVal2, _MM_SHUFFLE(2, 0, 3, 1));
234
235 qComplexVal2 = _mm_shuffle_epi32(qComplexVal2, _MM_SHUFFLE(2, 0, 3, 1));
236
237 qOutputVal = _mm_or_si128(_mm_and_si128(qComplexVal1, lowMask),
238 _mm_and_si128(qComplexVal2, highMask));
239
240 _mm_store_si128((__m128i*)qBufferPtr, qOutputVal);
241
242 iBufferPtr += 8;
243 qBufferPtr += 8;
244 }
245
246 number = eighthPoints * 8;
247 for (; number < num_points; number++) {
248 *iBufferPtr++ = *complexVectorPtr++;
249 *qBufferPtr++ = *complexVectorPtr++;
250 }
251}
252#endif /* LV_HAVE_SSE2 */
253
254#ifdef LV_HAVE_GENERIC
255
256static inline void volk_16ic_deinterleave_16i_x2_generic(int16_t* iBuffer,
257 int16_t* qBuffer,
258 const lv_16sc_t* complexVector,
259 unsigned int num_points)
260{
261 const int16_t* complexVectorPtr = (const int16_t*)complexVector;
262 int16_t* iBufferPtr = iBuffer;
263 int16_t* qBufferPtr = qBuffer;
264 unsigned int number;
265 for (number = 0; number < num_points; number++) {
266 *iBufferPtr++ = *complexVectorPtr++;
267 *qBufferPtr++ = *complexVectorPtr++;
268 }
269}
270#endif /* LV_HAVE_GENERIC */
271
272
273#ifdef LV_HAVE_NEON
274#include <arm_neon.h>
275
276static inline void volk_16ic_deinterleave_16i_x2_neon(int16_t* iBuffer,
277 int16_t* qBuffer,
278 const lv_16sc_t* complexVector,
279 unsigned int num_points)
280{
281 unsigned int number = 0;
282 const unsigned int eighthPoints = num_points / 8;
283 const int16_t* complexVectorPtr = (const int16_t*)complexVector;
284 int16_t* iBufferPtr = iBuffer;
285 int16_t* qBufferPtr = qBuffer;
286
287 int16x8x2_t complexVal;
288
289 for (; number < eighthPoints; number++) {
290 complexVal = vld2q_s16(complexVectorPtr);
291 vst1q_s16(iBufferPtr, complexVal.val[0]);
292 vst1q_s16(qBufferPtr, complexVal.val[1]);
293 complexVectorPtr += 16;
294 iBufferPtr += 8;
295 qBufferPtr += 8;
296 }
297
298 number = eighthPoints * 8;
299 for (; number < num_points; number++) {
300 *iBufferPtr++ = *complexVectorPtr++;
301 *qBufferPtr++ = *complexVectorPtr++;
302 }
303}
304#endif /* LV_HAVE_NEON */
305
306
307#ifdef LV_HAVE_NEONV8
308#include <arm_neon.h>
309
310static inline void volk_16ic_deinterleave_16i_x2_neonv8(int16_t* iBuffer,
311 int16_t* qBuffer,
312 const lv_16sc_t* complexVector,
313 unsigned int num_points)
314{
315 unsigned int number = 0;
316 const unsigned int sixteenthPoints = num_points / 16;
317 const int16_t* complexVectorPtr = (const int16_t*)complexVector;
318 int16_t* iBufferPtr = iBuffer;
319 int16_t* qBufferPtr = qBuffer;
320
321 int16x8x2_t complexVal0, complexVal1;
322
323 for (; number < sixteenthPoints; number++) {
324 complexVal0 = vld2q_s16(complexVectorPtr);
325 complexVal1 = vld2q_s16(complexVectorPtr + 16);
326 __VOLK_PREFETCH(complexVectorPtr + 32);
327
328 vst1q_s16(iBufferPtr, complexVal0.val[0]);
329 vst1q_s16(iBufferPtr + 8, complexVal1.val[0]);
330 vst1q_s16(qBufferPtr, complexVal0.val[1]);
331 vst1q_s16(qBufferPtr + 8, complexVal1.val[1]);
332
333 complexVectorPtr += 32;
334 iBufferPtr += 16;
335 qBufferPtr += 16;
336 }
337
338 number = sixteenthPoints * 16;
339 for (; number < num_points; number++) {
340 *iBufferPtr++ = *complexVectorPtr++;
341 *qBufferPtr++ = *complexVectorPtr++;
342 }
343}
344#endif /* LV_HAVE_NEONV8 */
345
346
347#ifdef LV_HAVE_ORC
348
349extern void volk_16ic_deinterleave_16i_x2_a_orc_impl(int16_t* iBuffer,
350 int16_t* qBuffer,
351 const lv_16sc_t* complexVector,
352 int num_points);
353static inline void volk_16ic_deinterleave_16i_x2_u_orc(int16_t* iBuffer,
354 int16_t* qBuffer,
355 const lv_16sc_t* complexVector,
356 unsigned int num_points)
357{
358 volk_16ic_deinterleave_16i_x2_a_orc_impl(iBuffer, qBuffer, complexVector, num_points);
359}
360#endif /* LV_HAVE_ORC */
361
362#endif /* INCLUDED_volk_16ic_deinterleave_16i_x2_a_H */
363
364
365#ifndef INCLUDED_volk_16ic_deinterleave_16i_x2_u_H
366#define INCLUDED_volk_16ic_deinterleave_16i_x2_u_H
367
368#include <inttypes.h>
369#include <stdio.h>
370#ifdef LV_HAVE_AVX2
371#include <immintrin.h>
372
373static inline void volk_16ic_deinterleave_16i_x2_u_avx2(int16_t* iBuffer,
374 int16_t* qBuffer,
375 const lv_16sc_t* complexVector,
376 unsigned int num_points)
377{
378 unsigned int number = 0;
379 const int8_t* complexVectorPtr = (int8_t*)complexVector;
380 int16_t* iBufferPtr = iBuffer;
381 int16_t* qBufferPtr = qBuffer;
382
383 __m256i MoveMask = _mm256_set_epi8(15,
384 14,
385 11,
386 10,
387 7,
388 6,
389 3,
390 2,
391 13,
392 12,
393 9,
394 8,
395 5,
396 4,
397 1,
398 0,
399 15,
400 14,
401 11,
402 10,
403 7,
404 6,
405 3,
406 2,
407 13,
408 12,
409 9,
410 8,
411 5,
412 4,
413 1,
414 0);
415
416 __m256i iMove2, iMove1;
417 __m256i complexVal1, complexVal2, iOutputVal, qOutputVal;
418
419 unsigned int sixteenthPoints = num_points / 16;
420
421 for (number = 0; number < sixteenthPoints; number++) {
422 complexVal1 = _mm256_loadu_si256((__m256i*)complexVectorPtr);
423 complexVectorPtr += 32;
424 complexVal2 = _mm256_loadu_si256((__m256i*)complexVectorPtr);
425 complexVectorPtr += 32;
426
427 iMove2 = _mm256_shuffle_epi8(complexVal2, MoveMask);
428 iMove1 = _mm256_shuffle_epi8(complexVal1, MoveMask);
429
430 iOutputVal = _mm256_permute2x128_si256(_mm256_permute4x64_epi64(iMove1, 0x08),
431 _mm256_permute4x64_epi64(iMove2, 0x80),
432 0x30);
433 qOutputVal = _mm256_permute2x128_si256(_mm256_permute4x64_epi64(iMove1, 0x0d),
434 _mm256_permute4x64_epi64(iMove2, 0xd0),
435 0x30);
436
437 _mm256_storeu_si256((__m256i*)iBufferPtr, iOutputVal);
438 _mm256_storeu_si256((__m256i*)qBufferPtr, qOutputVal);
439
440 iBufferPtr += 16;
441 qBufferPtr += 16;
442 }
443
444 number = sixteenthPoints * 16;
445 int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr;
446 for (; number < num_points; number++) {
447 *iBufferPtr++ = *int16ComplexVectorPtr++;
448 *qBufferPtr++ = *int16ComplexVectorPtr++;
449 }
450}
451#endif /* LV_HAVE_AVX2 */
452
453#ifdef LV_HAVE_RVV
454#include <riscv_vector.h>
455
456static inline void volk_16ic_deinterleave_16i_x2_rvv(int16_t* iBuffer,
457 int16_t* qBuffer,
458 const lv_16sc_t* complexVector,
459 unsigned int num_points)
460{
461 size_t n = num_points;
462 for (size_t vl; n > 0; n -= vl, complexVector += vl, iBuffer += vl, qBuffer += vl) {
463 vl = __riscv_vsetvl_e16m4(n);
464 vuint32m8_t vc = __riscv_vle32_v_u32m8((const uint32_t*)complexVector, vl);
465 vuint16m4_t vr = __riscv_vnsrl(vc, 0, vl);
466 vuint16m4_t vi = __riscv_vnsrl(vc, 16, vl);
467 __riscv_vse16((uint16_t*)iBuffer, vr, vl);
468 __riscv_vse16((uint16_t*)qBuffer, vi, vl);
469 }
470}
471#endif /*LV_HAVE_RVV*/
472
473#ifdef LV_HAVE_RVVSEG
474#include <riscv_vector.h>
475
476static inline void volk_16ic_deinterleave_16i_x2_rvvseg(int16_t* iBuffer,
477 int16_t* qBuffer,
478 const lv_16sc_t* complexVector,
479 unsigned int num_points)
480{
481 size_t n = num_points;
482 for (size_t vl; n > 0; n -= vl, complexVector += vl, iBuffer += vl, qBuffer += vl) {
483 vl = __riscv_vsetvl_e16m4(n);
484 vuint16m4x2_t vc =
485 __riscv_vlseg2e16_v_u16m4x2((const uint16_t*)complexVector, vl);
486 vuint16m4_t vr = __riscv_vget_u16m4(vc, 0);
487 vuint16m4_t vi = __riscv_vget_u16m4(vc, 1);
488 __riscv_vse16((uint16_t*)iBuffer, vr, vl);
489 __riscv_vse16((uint16_t*)qBuffer, vi, vl);
490 }
491}
492#endif /*LV_HAVE_RVVSEG*/
493
494#endif /* INCLUDED_volk_16ic_deinterleave_16i_x2_u_H */