Vector Optimized Library of Kernels 3.3.0
Architecture-tuned implementations of math kernels
Loading...
Searching...
No Matches
volk_8ic_deinterleave_real_8i.h
Go to the documentation of this file.
1/* -*- c++ -*- */
2/*
3 * Copyright 2012, 2014 Free Software Foundation, Inc.
4 *
5 * This file is part of VOLK
6 *
7 * SPDX-License-Identifier: LGPL-3.0-or-later
8 */
9
39
40#ifndef INCLUDED_VOLK_8sc_DEINTERLEAVE_REAL_8s_ALIGNED8_H
41#define INCLUDED_VOLK_8sc_DEINTERLEAVE_REAL_8s_ALIGNED8_H
42
43#include <inttypes.h>
44#include <stdio.h>
45
46#ifdef LV_HAVE_AVX2
47#include <immintrin.h>
48
49static inline void volk_8ic_deinterleave_real_8i_a_avx2(int8_t* iBuffer,
50 const lv_8sc_t* complexVector,
51 unsigned int num_points)
52{
53 unsigned int number = 0;
54 const int8_t* complexVectorPtr = (int8_t*)complexVector;
55 int8_t* iBufferPtr = iBuffer;
56 __m256i moveMask1 = _mm256_set_epi8(0x80,
57 0x80,
58 0x80,
59 0x80,
60 0x80,
61 0x80,
62 0x80,
63 0x80,
64 14,
65 12,
66 10,
67 8,
68 6,
69 4,
70 2,
71 0,
72 0x80,
73 0x80,
74 0x80,
75 0x80,
76 0x80,
77 0x80,
78 0x80,
79 0x80,
80 14,
81 12,
82 10,
83 8,
84 6,
85 4,
86 2,
87 0);
88 __m256i moveMask2 = _mm256_set_epi8(14,
89 12,
90 10,
91 8,
92 6,
93 4,
94 2,
95 0,
96 0x80,
97 0x80,
98 0x80,
99 0x80,
100 0x80,
101 0x80,
102 0x80,
103 0x80,
104 14,
105 12,
106 10,
107 8,
108 6,
109 4,
110 2,
111 0,
112 0x80,
113 0x80,
114 0x80,
115 0x80,
116 0x80,
117 0x80,
118 0x80,
119 0x80);
120 __m256i complexVal1, complexVal2, outputVal;
121
122 unsigned int thirtysecondPoints = num_points / 32;
123
124 for (number = 0; number < thirtysecondPoints; number++) {
125
126 complexVal1 = _mm256_load_si256((__m256i*)complexVectorPtr);
127 complexVectorPtr += 32;
128 complexVal2 = _mm256_load_si256((__m256i*)complexVectorPtr);
129 complexVectorPtr += 32;
130
131 complexVal1 = _mm256_shuffle_epi8(complexVal1, moveMask1);
132 complexVal2 = _mm256_shuffle_epi8(complexVal2, moveMask2);
133 outputVal = _mm256_or_si256(complexVal1, complexVal2);
134 outputVal = _mm256_permute4x64_epi64(outputVal, 0xd8);
135
136 _mm256_store_si256((__m256i*)iBufferPtr, outputVal);
137 iBufferPtr += 32;
138 }
139
140 number = thirtysecondPoints * 32;
141 for (; number < num_points; number++) {
142 *iBufferPtr++ = *complexVectorPtr++;
143 complexVectorPtr++;
144 }
145}
146#endif /* LV_HAVE_AVX2 */
147
148
149#ifdef LV_HAVE_SSSE3
150#include <tmmintrin.h>
151
152static inline void volk_8ic_deinterleave_real_8i_a_ssse3(int8_t* iBuffer,
153 const lv_8sc_t* complexVector,
154 unsigned int num_points)
155{
156 unsigned int number = 0;
157 const int8_t* complexVectorPtr = (int8_t*)complexVector;
158 int8_t* iBufferPtr = iBuffer;
159 __m128i moveMask1 = _mm_set_epi8(
160 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0);
161 __m128i moveMask2 = _mm_set_epi8(
162 14, 12, 10, 8, 6, 4, 2, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
163 __m128i complexVal1, complexVal2, outputVal;
164
165 unsigned int sixteenthPoints = num_points / 16;
166
167 for (number = 0; number < sixteenthPoints; number++) {
168 complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr);
169 complexVectorPtr += 16;
170 complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr);
171 complexVectorPtr += 16;
172
173 complexVal1 = _mm_shuffle_epi8(complexVal1, moveMask1);
174 complexVal2 = _mm_shuffle_epi8(complexVal2, moveMask2);
175
176 outputVal = _mm_or_si128(complexVal1, complexVal2);
177
178 _mm_store_si128((__m128i*)iBufferPtr, outputVal);
179 iBufferPtr += 16;
180 }
181
182 number = sixteenthPoints * 16;
183 for (; number < num_points; number++) {
184 *iBufferPtr++ = *complexVectorPtr++;
185 complexVectorPtr++;
186 }
187}
188#endif /* LV_HAVE_SSSE3 */
189
190
191#ifdef LV_HAVE_AVX
192#include <immintrin.h>
193
194static inline void volk_8ic_deinterleave_real_8i_a_avx(int8_t* iBuffer,
195 const lv_8sc_t* complexVector,
196 unsigned int num_points)
197{
198 unsigned int number = 0;
199 const int8_t* complexVectorPtr = (int8_t*)complexVector;
200 int8_t* iBufferPtr = iBuffer;
201 __m128i moveMaskL = _mm_set_epi8(
202 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0);
203 __m128i moveMaskH = _mm_set_epi8(
204 14, 12, 10, 8, 6, 4, 2, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
205 __m256i complexVal1, complexVal2, outputVal;
206 __m128i complexVal1H, complexVal1L, complexVal2H, complexVal2L, outputVal1,
207 outputVal2;
208
209 unsigned int thirtysecondPoints = num_points / 32;
210
211 for (number = 0; number < thirtysecondPoints; number++) {
212
213 complexVal1 = _mm256_load_si256((__m256i*)complexVectorPtr);
214 complexVectorPtr += 32;
215 complexVal2 = _mm256_load_si256((__m256i*)complexVectorPtr);
216 complexVectorPtr += 32;
217
218 complexVal1H = _mm256_extractf128_si256(complexVal1, 1);
219 complexVal1L = _mm256_extractf128_si256(complexVal1, 0);
220 complexVal2H = _mm256_extractf128_si256(complexVal2, 1);
221 complexVal2L = _mm256_extractf128_si256(complexVal2, 0);
222
223 complexVal1H = _mm_shuffle_epi8(complexVal1H, moveMaskH);
224 complexVal1L = _mm_shuffle_epi8(complexVal1L, moveMaskL);
225 outputVal1 = _mm_or_si128(complexVal1H, complexVal1L);
226
227
228 complexVal2H = _mm_shuffle_epi8(complexVal2H, moveMaskH);
229 complexVal2L = _mm_shuffle_epi8(complexVal2L, moveMaskL);
230 outputVal2 = _mm_or_si128(complexVal2H, complexVal2L);
231
232 __m256i dummy = _mm256_setzero_si256();
233 outputVal = _mm256_insertf128_si256(dummy, outputVal1, 0);
234 outputVal = _mm256_insertf128_si256(outputVal, outputVal2, 1);
235
236
237 _mm256_store_si256((__m256i*)iBufferPtr, outputVal);
238 iBufferPtr += 32;
239 }
240
241 number = thirtysecondPoints * 32;
242 for (; number < num_points; number++) {
243 *iBufferPtr++ = *complexVectorPtr++;
244 complexVectorPtr++;
245 }
246}
247#endif /* LV_HAVE_AVX */
248
249
250#ifdef LV_HAVE_GENERIC
251
252static inline void volk_8ic_deinterleave_real_8i_generic(int8_t* iBuffer,
253 const lv_8sc_t* complexVector,
254 unsigned int num_points)
255{
256 unsigned int number = 0;
257 const int8_t* complexVectorPtr = (int8_t*)complexVector;
258 int8_t* iBufferPtr = iBuffer;
259 for (number = 0; number < num_points; number++) {
260 *iBufferPtr++ = *complexVectorPtr++;
261 complexVectorPtr++;
262 }
263}
264#endif /* LV_HAVE_GENERIC */
265
266
267#ifdef LV_HAVE_NEON
268#include <arm_neon.h>
269
270static inline void volk_8ic_deinterleave_real_8i_neon(int8_t* iBuffer,
271 const lv_8sc_t* complexVector,
272 unsigned int num_points)
273{
274 unsigned int number;
275 unsigned int sixteenth_points = num_points / 16;
276
277 int8x16x2_t input_vector;
278 for (number = 0; number < sixteenth_points; ++number) {
279 input_vector = vld2q_s8((int8_t*)complexVector);
280 vst1q_s8(iBuffer, input_vector.val[0]);
281 iBuffer += 16;
282 complexVector += 16;
283 }
284
285 const int8_t* complexVectorPtr = (int8_t*)complexVector;
286 int8_t* iBufferPtr = iBuffer;
287 for (number = sixteenth_points * 16; number < num_points; number++) {
288 *iBufferPtr++ = *complexVectorPtr++;
289 complexVectorPtr++;
290 }
291}
292#endif /* LV_HAVE_NEON */
293
294#ifdef LV_HAVE_NEONV8
295#include <arm_neon.h>
296
297static inline void volk_8ic_deinterleave_real_8i_neonv8(int8_t* iBuffer,
298 const lv_8sc_t* complexVector,
299 unsigned int num_points)
300{
301 const unsigned int thirtysecondPoints = num_points / 32;
302
303 for (unsigned int number = 0; number < thirtysecondPoints; number++) {
304 int8x16x2_t cplx0 = vld2q_s8((const int8_t*)complexVector);
305 int8x16x2_t cplx1 = vld2q_s8((const int8_t*)complexVector + 32);
306 __VOLK_PREFETCH((const int8_t*)complexVector + 64);
307
308 vst1q_s8(iBuffer, cplx0.val[0]);
309 vst1q_s8(iBuffer + 16, cplx1.val[0]);
310
311 iBuffer += 32;
312 complexVector += 32;
313 }
314
315 const int8_t* complexVectorPtr = (const int8_t*)complexVector;
316 for (unsigned int number = thirtysecondPoints * 32; number < num_points; number++) {
317 *iBuffer++ = *complexVectorPtr++;
318 complexVectorPtr++;
319 }
320}
321#endif /* LV_HAVE_NEONV8 */
322
323
324#endif /* INCLUDED_VOLK_8sc_DEINTERLEAVE_REAL_8s_ALIGNED8_H */
325
326#ifndef INCLUDED_VOLK_8sc_DEINTERLEAVE_REAL_8s_UNALIGNED8_H
327#define INCLUDED_VOLK_8sc_DEINTERLEAVE_REAL_8s_UNALIGNED8_H
328
329#include <inttypes.h>
330#include <stdio.h>
331
332#ifdef LV_HAVE_AVX2
333#include <immintrin.h>
334
335static inline void volk_8ic_deinterleave_real_8i_u_avx2(int8_t* iBuffer,
336 const lv_8sc_t* complexVector,
337 unsigned int num_points)
338{
339 unsigned int number = 0;
340 const int8_t* complexVectorPtr = (int8_t*)complexVector;
341 int8_t* iBufferPtr = iBuffer;
342 __m256i moveMask1 = _mm256_set_epi8(0x80,
343 0x80,
344 0x80,
345 0x80,
346 0x80,
347 0x80,
348 0x80,
349 0x80,
350 14,
351 12,
352 10,
353 8,
354 6,
355 4,
356 2,
357 0,
358 0x80,
359 0x80,
360 0x80,
361 0x80,
362 0x80,
363 0x80,
364 0x80,
365 0x80,
366 14,
367 12,
368 10,
369 8,
370 6,
371 4,
372 2,
373 0);
374 __m256i moveMask2 = _mm256_set_epi8(14,
375 12,
376 10,
377 8,
378 6,
379 4,
380 2,
381 0,
382 0x80,
383 0x80,
384 0x80,
385 0x80,
386 0x80,
387 0x80,
388 0x80,
389 0x80,
390 14,
391 12,
392 10,
393 8,
394 6,
395 4,
396 2,
397 0,
398 0x80,
399 0x80,
400 0x80,
401 0x80,
402 0x80,
403 0x80,
404 0x80,
405 0x80);
406 __m256i complexVal1, complexVal2, outputVal;
407
408 unsigned int thirtysecondPoints = num_points / 32;
409
410 for (number = 0; number < thirtysecondPoints; number++) {
411
412 complexVal1 = _mm256_loadu_si256((__m256i*)complexVectorPtr);
413 complexVectorPtr += 32;
414 complexVal2 = _mm256_loadu_si256((__m256i*)complexVectorPtr);
415 complexVectorPtr += 32;
416
417 complexVal1 = _mm256_shuffle_epi8(complexVal1, moveMask1);
418 complexVal2 = _mm256_shuffle_epi8(complexVal2, moveMask2);
419 outputVal = _mm256_or_si256(complexVal1, complexVal2);
420 outputVal = _mm256_permute4x64_epi64(outputVal, 0xd8);
421
422 _mm256_storeu_si256((__m256i*)iBufferPtr, outputVal);
423 iBufferPtr += 32;
424 }
425
426 number = thirtysecondPoints * 32;
427 for (; number < num_points; number++) {
428 *iBufferPtr++ = *complexVectorPtr++;
429 complexVectorPtr++;
430 }
431}
432#endif /* LV_HAVE_AVX2 */
433
434#ifdef LV_HAVE_RVV
435#include <riscv_vector.h>
436
437static inline void volk_8ic_deinterleave_real_8i_rvv(int8_t* iBuffer,
438 const lv_8sc_t* complexVector,
439 unsigned int num_points)
440{
441 const uint16_t* in = (const uint16_t*)complexVector;
442 size_t n = num_points;
443 for (size_t vl; n > 0; n -= vl, in += vl, iBuffer += vl) {
444 vl = __riscv_vsetvl_e16m8(n);
445 vuint16m8_t vc = __riscv_vle16_v_u16m8(in, vl);
446 __riscv_vse8((uint8_t*)iBuffer, __riscv_vnsrl(vc, 0, vl), vl);
447 }
448}
449#endif /*LV_HAVE_RVV*/
450
451#endif /* INCLUDED_VOLK_8sc_DEINTERLEAVE_REAL_8s_UNALIGNED8_H */