Vector Optimized Library of Kernels 3.3.0
Architecture-tuned implementations of math kernels
Loading...
Searching...
No Matches
volk_16ic_deinterleave_real_8i.h
Go to the documentation of this file.
1/* -*- c++ -*- */
2/*
3 * Copyright 2012, 2014 Free Software Foundation, Inc.
4 *
5 * This file is part of VOLK
6 *
7 * SPDX-License-Identifier: LGPL-3.0-or-later
8 */
9
40
41#ifndef INCLUDED_volk_16ic_deinterleave_real_8i_a_H
42#define INCLUDED_volk_16ic_deinterleave_real_8i_a_H
43
44#include <inttypes.h>
45#include <stdio.h>
46
47
48#ifdef LV_HAVE_AVX2
49#include <immintrin.h>
50
51static inline void volk_16ic_deinterleave_real_8i_a_avx2(int8_t* iBuffer,
52 const lv_16sc_t* complexVector,
53 unsigned int num_points)
54{
55 unsigned int number = 0;
56 const int8_t* complexVectorPtr = (int8_t*)complexVector;
57 int8_t* iBufferPtr = iBuffer;
58 __m256i iMoveMask1 = _mm256_set_epi8(0x80,
59 0x80,
60 0x80,
61 0x80,
62 0x80,
63 0x80,
64 0x80,
65 0x80,
66 13,
67 12,
68 9,
69 8,
70 5,
71 4,
72 1,
73 0,
74 0x80,
75 0x80,
76 0x80,
77 0x80,
78 0x80,
79 0x80,
80 0x80,
81 0x80,
82 13,
83 12,
84 9,
85 8,
86 5,
87 4,
88 1,
89 0);
90 __m256i iMoveMask2 = _mm256_set_epi8(13,
91 12,
92 9,
93 8,
94 5,
95 4,
96 1,
97 0,
98 0x80,
99 0x80,
100 0x80,
101 0x80,
102 0x80,
103 0x80,
104 0x80,
105 0x80,
106 13,
107 12,
108 9,
109 8,
110 5,
111 4,
112 1,
113 0,
114 0x80,
115 0x80,
116 0x80,
117 0x80,
118 0x80,
119 0x80,
120 0x80,
121 0x80);
122 __m256i complexVal1, complexVal2, complexVal3, complexVal4, iOutputVal;
123
124 unsigned int thirtysecondPoints = num_points / 32;
125
126 for (number = 0; number < thirtysecondPoints; number++) {
127 complexVal1 = _mm256_load_si256((__m256i*)complexVectorPtr);
128 complexVectorPtr += 32;
129 complexVal2 = _mm256_load_si256((__m256i*)complexVectorPtr);
130 complexVectorPtr += 32;
131
132 complexVal3 = _mm256_load_si256((__m256i*)complexVectorPtr);
133 complexVectorPtr += 32;
134 complexVal4 = _mm256_load_si256((__m256i*)complexVectorPtr);
135 complexVectorPtr += 32;
136
137 complexVal1 = _mm256_shuffle_epi8(complexVal1, iMoveMask1);
138 complexVal2 = _mm256_shuffle_epi8(complexVal2, iMoveMask2);
139
140 complexVal1 = _mm256_or_si256(complexVal1, complexVal2);
141 complexVal1 = _mm256_permute4x64_epi64(complexVal1, 0xd8);
142
143 complexVal3 = _mm256_shuffle_epi8(complexVal3, iMoveMask1);
144 complexVal4 = _mm256_shuffle_epi8(complexVal4, iMoveMask2);
145
146 complexVal3 = _mm256_or_si256(complexVal3, complexVal4);
147 complexVal3 = _mm256_permute4x64_epi64(complexVal3, 0xd8);
148
149 complexVal1 = _mm256_srai_epi16(complexVal1, 8);
150 complexVal3 = _mm256_srai_epi16(complexVal3, 8);
151
152 iOutputVal = _mm256_packs_epi16(complexVal1, complexVal3);
153 iOutputVal = _mm256_permute4x64_epi64(iOutputVal, 0xd8);
154
155 _mm256_store_si256((__m256i*)iBufferPtr, iOutputVal);
156
157 iBufferPtr += 32;
158 }
159
160 number = thirtysecondPoints * 32;
161 int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr;
162 for (; number < num_points; number++) {
163 *iBufferPtr++ = ((int8_t)(*int16ComplexVectorPtr++ >> 8));
164 int16ComplexVectorPtr++;
165 }
166}
167#endif /* LV_HAVE_AVX2 */
168
169
170#ifdef LV_HAVE_SSSE3
171#include <tmmintrin.h>
172
173static inline void volk_16ic_deinterleave_real_8i_a_ssse3(int8_t* iBuffer,
174 const lv_16sc_t* complexVector,
175 unsigned int num_points)
176{
177 unsigned int number = 0;
178 const int8_t* complexVectorPtr = (int8_t*)complexVector;
179 int8_t* iBufferPtr = iBuffer;
180 __m128i iMoveMask1 = _mm_set_epi8(
181 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0);
182 __m128i iMoveMask2 = _mm_set_epi8(
183 13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
184 __m128i complexVal1, complexVal2, complexVal3, complexVal4, iOutputVal;
185
186 unsigned int sixteenthPoints = num_points / 16;
187
188 for (number = 0; number < sixteenthPoints; number++) {
189 complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr);
190 complexVectorPtr += 16;
191 complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr);
192 complexVectorPtr += 16;
193
194 complexVal3 = _mm_load_si128((__m128i*)complexVectorPtr);
195 complexVectorPtr += 16;
196 complexVal4 = _mm_load_si128((__m128i*)complexVectorPtr);
197 complexVectorPtr += 16;
198
199 complexVal1 = _mm_shuffle_epi8(complexVal1, iMoveMask1);
200 complexVal2 = _mm_shuffle_epi8(complexVal2, iMoveMask2);
201
202 complexVal1 = _mm_or_si128(complexVal1, complexVal2);
203
204 complexVal3 = _mm_shuffle_epi8(complexVal3, iMoveMask1);
205 complexVal4 = _mm_shuffle_epi8(complexVal4, iMoveMask2);
206
207 complexVal3 = _mm_or_si128(complexVal3, complexVal4);
208
209
210 complexVal1 = _mm_srai_epi16(complexVal1, 8);
211 complexVal3 = _mm_srai_epi16(complexVal3, 8);
212
213 iOutputVal = _mm_packs_epi16(complexVal1, complexVal3);
214
215 _mm_store_si128((__m128i*)iBufferPtr, iOutputVal);
216
217 iBufferPtr += 16;
218 }
219
220 number = sixteenthPoints * 16;
221 int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr;
222 for (; number < num_points; number++) {
223 *iBufferPtr++ = ((int8_t)(*int16ComplexVectorPtr++ >> 8));
224 int16ComplexVectorPtr++;
225 }
226}
227#endif /* LV_HAVE_SSSE3 */
228
229#ifdef LV_HAVE_GENERIC
230
231static inline void volk_16ic_deinterleave_real_8i_generic(int8_t* iBuffer,
232 const lv_16sc_t* complexVector,
233 unsigned int num_points)
234{
235 unsigned int number = 0;
236 int16_t* complexVectorPtr = (int16_t*)complexVector;
237 int8_t* iBufferPtr = iBuffer;
238 for (number = 0; number < num_points; number++) {
239 *iBufferPtr++ = ((int8_t)(*complexVectorPtr++ >> 8));
240 complexVectorPtr++;
241 }
242}
243#endif /* LV_HAVE_GENERIC */
244
245#ifdef LV_HAVE_NEON
246#include <arm_neon.h>
247
/*
 * NEON kernel: for every complex sample, stores the upper byte of the first
 * 16-bit component (the real part, going by the kernel name) into iBuffer
 * and drops the second component.
 *
 * iBuffer       - output, one int8_t per complex sample
 * complexVector - input, read as interleaved int16_t pairs
 * num_points    - number of complex samples to convert
 *
 * 8 samples are handled per vector iteration; the remainder is finished by
 * a scalar loop.
 */
static inline void volk_16ic_deinterleave_real_8i_neon(int8_t* iBuffer,
                                                       const lv_16sc_t* complexVector,
                                                       unsigned int num_points)
{
    const unsigned int vectorIterations = num_points / 8;
    const int16_t* src = (const int16_t*)complexVector;
    int8_t* dst = iBuffer;

    for (unsigned int i = 0; i < vectorIterations; i++) {
        /* The 2-way de-interleaving load puts the even-indexed int16_t
         * values (first component of each pair) into val[0]. */
        const int16x8x2_t samples = vld2q_s16(src);

        /* Shift right by 8 and narrow to 8 bits: keeps the upper byte. */
        vst1_s8(dst, vshrn_n_s16(samples.val[0], 8));

        src += 16;
        dst += 8;
    }

    /* Scalar tail for the samples that do not fill a whole vector. */
    for (unsigned int i = vectorIterations * 8; i < num_points; i++) {
        *dst++ = (int8_t)(*src >> 8);
        src += 2; /* step over the unused second component */
    }
}
272#endif
273
274#ifdef LV_HAVE_NEONV8
275#include <arm_neon.h>
276
/*
 * NEONv8 kernel: same conversion as the other variants (upper byte of the
 * first 16-bit component of every complex sample), processing 16 samples per
 * iteration with two de-interleaving loads and a prefetch hint.
 *
 * iBuffer       - output, one int8_t per complex sample
 * complexVector - input, read as interleaved int16_t pairs
 * num_points    - number of complex samples to convert
 */
static inline void volk_16ic_deinterleave_real_8i_neonv8(int8_t* iBuffer,
                                                         const lv_16sc_t* complexVector,
                                                         unsigned int num_points)
{
    const unsigned int blocks = num_points / 16;
    const int16_t* src = (const int16_t*)complexVector;
    int8_t* dst = iBuffer;
    unsigned int i;

    for (i = 0; i < blocks; i++, src += 32, dst += 16) {
        /* Each load de-interleaves 8 samples; val[0] holds the first
         * component of every pair. */
        const int16x8x2_t firstHalf = vld2q_s16(src);
        const int16x8x2_t secondHalf = vld2q_s16(src + 16);
        __VOLK_PREFETCH(src + 64);

        /* Shift right by 8 and narrow to 8 bits: keeps the upper byte. */
        vst1_s8(dst, vshrn_n_s16(firstHalf.val[0], 8));
        vst1_s8(dst + 8, vshrn_n_s16(secondHalf.val[0], 8));
    }

    /* Scalar tail for the samples that do not fill a whole 16-sample block. */
    for (i = blocks * 16; i < num_points; i++) {
        *dst++ = (int8_t)(*src >> 8);
        src += 2; /* step over the unused second component */
    }
}
305#endif /* LV_HAVE_NEONV8 */
306
307#ifdef LV_HAVE_ORC
308
309extern void volk_16ic_deinterleave_real_8i_a_orc_impl(int8_t* iBuffer,
310 const lv_16sc_t* complexVector,
311 int num_points);
312
313static inline void volk_16ic_deinterleave_real_8i_u_orc(int8_t* iBuffer,
314 const lv_16sc_t* complexVector,
315 unsigned int num_points)
316{
317 volk_16ic_deinterleave_real_8i_a_orc_impl(iBuffer, complexVector, num_points);
318}
319#endif /* LV_HAVE_ORC */
320
321
322#endif /* INCLUDED_volk_16ic_deinterleave_real_8i_a_H */
323
324#ifndef INCLUDED_volk_16ic_deinterleave_real_8i_u_H
325#define INCLUDED_volk_16ic_deinterleave_real_8i_u_H
326
327#include <inttypes.h>
328#include <stdio.h>
329
330
331#ifdef LV_HAVE_AVX2
332#include <immintrin.h>
333
334static inline void volk_16ic_deinterleave_real_8i_u_avx2(int8_t* iBuffer,
335 const lv_16sc_t* complexVector,
336 unsigned int num_points)
337{
338 unsigned int number = 0;
339 const int8_t* complexVectorPtr = (int8_t*)complexVector;
340 int8_t* iBufferPtr = iBuffer;
341 __m256i iMoveMask1 = _mm256_set_epi8(0x80,
342 0x80,
343 0x80,
344 0x80,
345 0x80,
346 0x80,
347 0x80,
348 0x80,
349 13,
350 12,
351 9,
352 8,
353 5,
354 4,
355 1,
356 0,
357 0x80,
358 0x80,
359 0x80,
360 0x80,
361 0x80,
362 0x80,
363 0x80,
364 0x80,
365 13,
366 12,
367 9,
368 8,
369 5,
370 4,
371 1,
372 0);
373 __m256i iMoveMask2 = _mm256_set_epi8(13,
374 12,
375 9,
376 8,
377 5,
378 4,
379 1,
380 0,
381 0x80,
382 0x80,
383 0x80,
384 0x80,
385 0x80,
386 0x80,
387 0x80,
388 0x80,
389 13,
390 12,
391 9,
392 8,
393 5,
394 4,
395 1,
396 0,
397 0x80,
398 0x80,
399 0x80,
400 0x80,
401 0x80,
402 0x80,
403 0x80,
404 0x80);
405 __m256i complexVal1, complexVal2, complexVal3, complexVal4, iOutputVal;
406
407 unsigned int thirtysecondPoints = num_points / 32;
408
409 for (number = 0; number < thirtysecondPoints; number++) {
410 complexVal1 = _mm256_loadu_si256((__m256i*)complexVectorPtr);
411 complexVectorPtr += 32;
412 complexVal2 = _mm256_loadu_si256((__m256i*)complexVectorPtr);
413 complexVectorPtr += 32;
414
415 complexVal3 = _mm256_loadu_si256((__m256i*)complexVectorPtr);
416 complexVectorPtr += 32;
417 complexVal4 = _mm256_loadu_si256((__m256i*)complexVectorPtr);
418 complexVectorPtr += 32;
419
420 complexVal1 = _mm256_shuffle_epi8(complexVal1, iMoveMask1);
421 complexVal2 = _mm256_shuffle_epi8(complexVal2, iMoveMask2);
422
423 complexVal1 = _mm256_or_si256(complexVal1, complexVal2);
424 complexVal1 = _mm256_permute4x64_epi64(complexVal1, 0xd8);
425
426 complexVal3 = _mm256_shuffle_epi8(complexVal3, iMoveMask1);
427 complexVal4 = _mm256_shuffle_epi8(complexVal4, iMoveMask2);
428
429 complexVal3 = _mm256_or_si256(complexVal3, complexVal4);
430 complexVal3 = _mm256_permute4x64_epi64(complexVal3, 0xd8);
431
432 complexVal1 = _mm256_srai_epi16(complexVal1, 8);
433 complexVal3 = _mm256_srai_epi16(complexVal3, 8);
434
435 iOutputVal = _mm256_packs_epi16(complexVal1, complexVal3);
436 iOutputVal = _mm256_permute4x64_epi64(iOutputVal, 0xd8);
437
438 _mm256_storeu_si256((__m256i*)iBufferPtr, iOutputVal);
439
440 iBufferPtr += 32;
441 }
442
443 number = thirtysecondPoints * 32;
444 int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr;
445 for (; number < num_points; number++) {
446 *iBufferPtr++ = ((int8_t)(*int16ComplexVectorPtr++ >> 8));
447 int16ComplexVectorPtr++;
448 }
449}
450#endif /* LV_HAVE_AVX2 */
451
452
453#ifdef LV_HAVE_RVV
454#include <riscv_vector.h>
455
/*
 * RISC-V Vector kernel: loads each complex sample as one 32-bit element and
 * extracts bits 8..15 of it with two narrowing shifts, writing one byte per
 * sample to iBuffer.
 *
 * iBuffer       - output, one byte per complex sample
 * complexVector - input, read as one uint32_t per complex sample
 * num_points    - number of complex samples to convert
 *
 * NOTE(review): bits 8..15 being the upper byte of the first int16_t
 * component assumes a little-endian layout - confirm for the target.
 */
static inline void volk_16ic_deinterleave_real_8i_rvv(int8_t* iBuffer,
                                                      const lv_16sc_t* complexVector,
                                                      unsigned int num_points)
{
    const uint32_t* src = (const uint32_t*)complexVector;
    uint8_t* dst = (uint8_t*)iBuffer;
    size_t remaining = num_points;

    while (remaining > 0) {
        /* Number of elements processed in this pass. */
        const size_t vl = __riscv_vsetvl_e32m8(remaining);

        vuint32m8_t packedSamples = __riscv_vle32_v_u32m8(src, vl);

        /* Narrowing shift by 0 keeps the low 16 bits of each 32-bit element. */
        vuint16m4_t lowHalves = __riscv_vnsrl(packedSamples, 0, vl);

        /* Narrowing shift by 8 keeps the upper byte of each 16-bit element. */
        vuint8m2_t upperBytes = __riscv_vnsrl(lowHalves, 8, vl);

        __riscv_vse8(dst, upperBytes, vl);

        src += vl;
        dst += vl;
        remaining -= vl;
    }
}
469#endif /*LV_HAVE_RVV*/
470
471#endif /* INCLUDED_volk_16ic_deinterleave_real_8i_u_H */