/* -*- c++ -*- */
/*
 * Copyright 2012, 2014 Free Software Foundation, Inc.
 *
 * This file is part of VOLK
 *
 * SPDX-License-Identifier: LGPL-3.0-or-later
 */

/* volk_64u_byteswap.h — architecture-tuned in-place byteswap of 64-bit words
 * (Vector Optimized Library of Kernels 3.3.0). */

#ifndef INCLUDED_volk_64u_byteswap_u_H
#define INCLUDED_volk_64u_byteswap_u_H

#include <inttypes.h>
#include <stdio.h>

#ifdef LV_HAVE_SSE2
#include <emmintrin.h>

static inline void volk_64u_byteswap_u_sse2(uint64_t* intsToSwap, unsigned int num_points)
{
    /* Byteswap each 64-bit word of the buffer in place (unaligned loads).
     * Strategy: byte-reverse every 32-bit lane with shifts and masks, then
     * swap the two 32-bit halves of each 64-bit element with a shuffle. */
    uint32_t* wordPtr = (uint32_t*)intsToSwap;
    const __m128i midHiMask = _mm_set1_epi32(0x00FF0000);
    const __m128i midLoMask = _mm_set1_epi32(0x0000FF00);
    const unsigned int vecIters = num_points / 2;

    // Vector loop: two 64-bit points (one 128-bit register) per iteration.
    for (unsigned int i = 0; i < vecIters; i++) {
        const __m128i chunk = _mm_loadu_si128((__m128i*)wordPtr);

        // Move every byte of each 32-bit lane to its mirrored position and
        // mask off the bits that shifted out of place.
        const __m128i b0 = _mm_slli_epi32(chunk, 24);
        const __m128i b1 = _mm_and_si128(_mm_slli_epi32(chunk, 8), midHiMask);
        const __m128i b2 = _mm_and_si128(_mm_srli_epi32(chunk, 8), midLoMask);
        const __m128i b3 = _mm_srli_epi32(chunk, 24);
        __m128i swapped = _mm_or_si128(_mm_or_si128(b0, b3), _mm_or_si128(b1, b2));

        // Exchange the 32-bit halves of each 64-bit element.
        swapped = _mm_shuffle_epi32(swapped, _MM_SHUFFLE(2, 3, 0, 1));

        _mm_storeu_si128((__m128i*)wordPtr, swapped);
        wordPtr += 4; // four 32-bit words == two 64-bit points
    }

    // Scalar tail: at most one 64-bit point remains when num_points is odd.
    for (unsigned int i = vecIters * 2; i < num_points; i++) {
        const uint32_t lo = wordPtr[0];
        const uint32_t hi = wordPtr[1];
        // Store the byte-reversed halves in exchanged order to complete the
        // full 64-bit byte reversal.
        wordPtr[0] = ((hi >> 24) & 0xff) | ((hi >> 8) & 0x0000ff00) |
                     ((hi << 8) & 0x00ff0000) | ((hi << 24) & 0xff000000);
        wordPtr[1] = ((lo >> 24) & 0xff) | ((lo >> 8) & 0x0000ff00) |
                     ((lo << 8) & 0x00ff0000) | ((lo << 24) & 0xff000000);
        wordPtr += 2;
    }
}
#endif /* LV_HAVE_SSE2 */


#ifdef LV_HAVE_NEON
#include <arm_neon.h>

static inline void volk_64u_byteswap_neon(uint64_t* intsToSwap, unsigned int num_points)
{
    /* Byteswap each 64-bit word of the buffer in place.
     * vrev64q_u8 reverses the byte order inside each 64-bit element of a
     * 128-bit register, which is exactly the 64-bit byteswap we need. */
    uint8_t* inputPtr = (uint8_t*)intsToSwap;
    unsigned int number = 0;
    const unsigned int eighth_points = num_points / 8;

    // Main loop: 8 points (64 bytes, four q-registers) per iteration.
    for (; number < eighth_points; number++) {
        uint8x16_t input0 = vld1q_u8(inputPtr);
        uint8x16_t input1 = vld1q_u8(inputPtr + 16);
        uint8x16_t input2 = vld1q_u8(inputPtr + 32);
        uint8x16_t input3 = vld1q_u8(inputPtr + 48);

        // Reverse bytes within each 64-bit element
        uint8x16_t output0 = vrev64q_u8(input0);
        uint8x16_t output1 = vrev64q_u8(input1);
        uint8x16_t output2 = vrev64q_u8(input2);
        uint8x16_t output3 = vrev64q_u8(input3);

        vst1q_u8(inputPtr, output0);
        vst1q_u8(inputPtr + 16, output1);
        vst1q_u8(inputPtr + 32, output2);
        vst1q_u8(inputPtr + 48, output3);

        inputPtr += 64;
    }

    // Handle remaining points (< 8) one 64-bit word at a time: byte-reverse
    // each 32-bit half, then store the halves in exchanged order.
    number = eighth_points * 8;
    uint32_t* intPtr = (uint32_t*)(intsToSwap + number);
    for (; number < num_points; number++) {
        uint32_t output1 = *intPtr;
        uint32_t output2 = intPtr[1];

        output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) |
                   ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000));

        output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) |
                   ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000));

        // Swapped store completes the full 64-bit byte reversal.
        *intPtr++ = output2;
        *intPtr++ = output1;
    }
}
#endif /* LV_HAVE_NEON */


#ifdef LV_HAVE_GENERIC

static inline void volk_64u_byteswap_generic(uint64_t* intsToSwap,
                                             unsigned int num_points)
{
    /* Portable fallback: byteswap each 64-bit word of the buffer in place.
     * Works on the buffer as pairs of 32-bit words — byte-reverse each half,
     * then store the halves in exchanged order. */
    uint32_t* words = (uint32_t*)intsToSwap;
    for (unsigned int i = 0; i < num_points; i++) {
        const uint32_t lo = words[0];
        const uint32_t hi = words[1];
        words[0] = ((hi & 0x000000ffu) << 24) | ((hi & 0x0000ff00u) << 8) |
                   ((hi & 0x00ff0000u) >> 8) | ((hi & 0xff000000u) >> 24);
        words[1] = ((lo & 0x000000ffu) << 24) | ((lo & 0x0000ff00u) << 8) |
                   ((lo & 0x00ff0000u) >> 8) | ((lo & 0xff000000u) >> 24);
        words += 2;
    }
}
#endif /* LV_HAVE_GENERIC */

#if LV_HAVE_AVX2
#include <immintrin.h>
static inline void volk_64u_byteswap_a_avx2(uint64_t* intsToSwap, unsigned int num_points)
{
    /* Byteswap each 64-bit word of the buffer in place via AVX2 byte
     * shuffles (four points per 256-bit vector).
     * NOTE(review): this "_a_" variant uses aligned load/store yet sits in
     * the unaligned (_u_H) section of the header; intsToSwap must be
     * 32-byte aligned — confirm against the dispatcher's contract. */
    unsigned int number = 0;

    const unsigned int nPerSet = 4;
    const uint64_t nSets = num_points / nPerSet;

    uint32_t* inputPtr = (uint32_t*)intsToSwap;

    // Per-byte shuffle indices that reverse each 8-byte group.
    const uint8_t shuffleVector[32] = { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13,
                                        12, 11, 10, 9, 8, 23, 22, 21, 20, 19, 18,
                                        17, 16, 31, 30, 29, 28, 27, 26, 25, 24 };

    const __m256i myShuffle = _mm256_loadu_si256((__m256i*)&shuffleVector[0]);

    for (; number < nSets; number++) {

        // Load the 32t values, increment inputPtr later since we're doing it in-place.
        const __m256i input = _mm256_load_si256((__m256i*)inputPtr);
        const __m256i output = _mm256_shuffle_epi8(input, myShuffle);

        // Store the results
        _mm256_store_si256((__m256i*)inputPtr, output);

        /* inputPtr is 32bit so increment twice */
        inputPtr += 2 * nPerSet;
    }

    // Byteswap any remaining points: byte-reverse each 32-bit half, then
    // store the halves in exchanged order.
    for (number = nSets * nPerSet; number < num_points; ++number) {
        uint32_t output1 = *inputPtr;
        uint32_t output2 = inputPtr[1];
        uint32_t out1 =
            ((((output1) >> 24) & 0x000000ff) | (((output1) >> 8) & 0x0000ff00) |
             (((output1) << 8) & 0x00ff0000) | (((output1) << 24) & 0xff000000));

        uint32_t out2 =
            ((((output2) >> 24) & 0x000000ff) | (((output2) >> 8) & 0x0000ff00) |
             (((output2) << 8) & 0x00ff0000) | (((output2) << 24) & 0xff000000));
        *inputPtr++ = out2;
        *inputPtr++ = out1;
    }
}

#endif /* LV_HAVE_AVX2 */


#if LV_HAVE_SSSE3
#include <tmmintrin.h>
static inline void volk_64u_byteswap_a_ssse3(uint64_t* intsToSwap,
                                             unsigned int num_points)
{
    /* Byteswap each 64-bit word of the buffer in place via a single SSSE3
     * pshufb (two points per 128-bit vector). Uses aligned load/store, so
     * intsToSwap must be 16-byte aligned. */
    unsigned int number = 0;

    const unsigned int nPerSet = 2;
    const uint64_t nSets = num_points / nPerSet;

    uint32_t* inputPtr = (uint32_t*)intsToSwap;

    // Per-byte shuffle indices that reverse each 8-byte group.
    uint8_t shuffleVector[16] = { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 };

    const __m128i myShuffle = _mm_loadu_si128((__m128i*)&shuffleVector);

    for (; number < nSets; number++) {

        // Load the 32t values, increment inputPtr later since we're doing it in-place.
        const __m128i input = _mm_load_si128((__m128i*)inputPtr);
        const __m128i output = _mm_shuffle_epi8(input, myShuffle);

        // Store the results
        _mm_store_si128((__m128i*)inputPtr, output);

        /* inputPtr is 32bit so increment twice */
        inputPtr += 2 * nPerSet;
    }

    // Byteswap any remaining points (at most one): byte-reverse each 32-bit
    // half, then store the halves in exchanged order.
    for (number = nSets * nPerSet; number < num_points; ++number) {
        uint32_t output1 = *inputPtr;
        uint32_t output2 = inputPtr[1];
        uint32_t out1 =
            ((((output1) >> 24) & 0x000000ff) | (((output1) >> 8) & 0x0000ff00) |
             (((output1) << 8) & 0x00ff0000) | (((output1) << 24) & 0xff000000));

        uint32_t out2 =
            ((((output2) >> 24) & 0x000000ff) | (((output2) >> 8) & 0x0000ff00) |
             (((output2) << 8) & 0x00ff0000) | (((output2) << 24) & 0xff000000));
        *inputPtr++ = out2;
        *inputPtr++ = out1;
    }
}
#endif /* LV_HAVE_SSSE3 */
#endif /* INCLUDED_volk_64u_byteswap_u_H */


#ifndef INCLUDED_volk_64u_byteswap_a_H
#define INCLUDED_volk_64u_byteswap_a_H

#include <inttypes.h>
#include <stdio.h>

#ifdef LV_HAVE_SSE2
#include <emmintrin.h>

static inline void volk_64u_byteswap_a_sse2(uint64_t* intsToSwap, unsigned int num_points)
{
    /* Byteswap each 64-bit word of the buffer in place (aligned variant:
     * intsToSwap must be 16-byte aligned). Strategy: byte-reverse every
     * 32-bit lane with shifts and masks, then swap the two 32-bit halves of
     * each 64-bit element with a shuffle. */
    uint32_t* wordPtr = (uint32_t*)intsToSwap;
    const __m128i midHiMask = _mm_set1_epi32(0x00FF0000);
    const __m128i midLoMask = _mm_set1_epi32(0x0000FF00);
    const unsigned int vecIters = num_points / 2;

    // Vector loop: two 64-bit points (one 128-bit register) per iteration.
    for (unsigned int i = 0; i < vecIters; i++) {
        const __m128i chunk = _mm_load_si128((__m128i*)wordPtr);

        // Move every byte of each 32-bit lane to its mirrored position and
        // mask off the bits that shifted out of place.
        const __m128i b0 = _mm_slli_epi32(chunk, 24);
        const __m128i b1 = _mm_and_si128(_mm_slli_epi32(chunk, 8), midHiMask);
        const __m128i b2 = _mm_and_si128(_mm_srli_epi32(chunk, 8), midLoMask);
        const __m128i b3 = _mm_srli_epi32(chunk, 24);
        __m128i swapped = _mm_or_si128(_mm_or_si128(b0, b3), _mm_or_si128(b1, b2));

        // Exchange the 32-bit halves of each 64-bit element.
        swapped = _mm_shuffle_epi32(swapped, _MM_SHUFFLE(2, 3, 0, 1));

        _mm_store_si128((__m128i*)wordPtr, swapped);
        wordPtr += 4; // four 32-bit words == two 64-bit points
    }

    // Scalar tail: at most one 64-bit point remains when num_points is odd.
    for (unsigned int i = vecIters * 2; i < num_points; i++) {
        const uint32_t lo = wordPtr[0];
        const uint32_t hi = wordPtr[1];
        // Store the byte-reversed halves in exchanged order to complete the
        // full 64-bit byte reversal.
        wordPtr[0] = ((hi >> 24) & 0xff) | ((hi >> 8) & 0x0000ff00) |
                     ((hi << 8) & 0x00ff0000) | ((hi << 24) & 0xff000000);
        wordPtr[1] = ((lo >> 24) & 0xff) | ((lo >> 8) & 0x0000ff00) |
                     ((lo << 8) & 0x00ff0000) | ((lo << 24) & 0xff000000);
        wordPtr += 2;
    }
}
#endif /* LV_HAVE_SSE2 */

#if LV_HAVE_AVX2
#include <immintrin.h>
static inline void volk_64u_byteswap_u_avx2(uint64_t* intsToSwap, unsigned int num_points)
{
    /* Byteswap each 64-bit word of the buffer in place via AVX2 byte
     * shuffles (four points per 256-bit vector). Uses unaligned load/store.
     * NOTE(review): this "_u_" variant lives in the aligned (_a_H) section
     * of the header — verify the naming/placement against the dispatcher. */
    unsigned int number = 0;

    const unsigned int nPerSet = 4;
    const uint64_t nSets = num_points / nPerSet;

    uint32_t* inputPtr = (uint32_t*)intsToSwap;

    // Per-byte shuffle indices that reverse each 8-byte group.
    const uint8_t shuffleVector[32] = { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13,
                                        12, 11, 10, 9, 8, 23, 22, 21, 20, 19, 18,
                                        17, 16, 31, 30, 29, 28, 27, 26, 25, 24 };

    const __m256i myShuffle = _mm256_loadu_si256((__m256i*)&shuffleVector[0]);

    for (; number < nSets; number++) {
        // Load the 32t values, increment inputPtr later since we're doing it in-place.
        const __m256i input = _mm256_loadu_si256((__m256i*)inputPtr);
        const __m256i output = _mm256_shuffle_epi8(input, myShuffle);

        // Store the results
        _mm256_storeu_si256((__m256i*)inputPtr, output);

        /* inputPtr is 32bit so increment twice */
        inputPtr += 2 * nPerSet;
    }

    // Byteswap any remaining points: byte-reverse each 32-bit half, then
    // store the halves in exchanged order.
    for (number = nSets * nPerSet; number < num_points; ++number) {
        uint32_t output1 = *inputPtr;
        uint32_t output2 = inputPtr[1];
        uint32_t out1 =
            ((((output1) >> 24) & 0x000000ff) | (((output1) >> 8) & 0x0000ff00) |
             (((output1) << 8) & 0x00ff0000) | (((output1) << 24) & 0xff000000));

        uint32_t out2 =
            ((((output2) >> 24) & 0x000000ff) | (((output2) >> 8) & 0x0000ff00) |
             (((output2) << 8) & 0x00ff0000) | (((output2) << 24) & 0xff000000));
        *inputPtr++ = out2;
        *inputPtr++ = out1;
    }
}

#endif /* LV_HAVE_AVX2 */


#if LV_HAVE_SSSE3
#include <tmmintrin.h>
static inline void volk_64u_byteswap_u_ssse3(uint64_t* intsToSwap,
                                             unsigned int num_points)
{
    /* Byteswap each 64-bit word of the buffer in place via a single SSSE3
     * pshufb (two points per 128-bit vector). Uses unaligned load/store, so
     * no alignment requirement on intsToSwap. */
    unsigned int number = 0;

    const unsigned int nPerSet = 2;
    const uint64_t nSets = num_points / nPerSet;

    uint32_t* inputPtr = (uint32_t*)intsToSwap;

    // Per-byte shuffle indices that reverse each 8-byte group.
    uint8_t shuffleVector[16] = { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 };

    const __m128i myShuffle = _mm_loadu_si128((__m128i*)&shuffleVector);

    for (; number < nSets; number++) {
        // Load the 32t values, increment inputPtr later since we're doing it in-place.
        const __m128i input = _mm_loadu_si128((__m128i*)inputPtr);
        const __m128i output = _mm_shuffle_epi8(input, myShuffle);

        // Store the results
        _mm_storeu_si128((__m128i*)inputPtr, output);

        /* inputPtr is 32bit so increment twice */
        inputPtr += 2 * nPerSet;
    }

    // Byteswap any remaining points (at most one): byte-reverse each 32-bit
    // half, then store the halves in exchanged order.
    for (number = nSets * nPerSet; number < num_points; ++number) {
        uint32_t output1 = *inputPtr;
        uint32_t output2 = inputPtr[1];
        uint32_t out1 =
            ((((output1) >> 24) & 0x000000ff) | (((output1) >> 8) & 0x0000ff00) |
             (((output1) << 8) & 0x00ff0000) | (((output1) << 24) & 0xff000000));

        uint32_t out2 =
            ((((output2) >> 24) & 0x000000ff) | (((output2) >> 8) & 0x0000ff00) |
             (((output2) << 8) & 0x00ff0000) | (((output2) << 24) & 0xff000000));
        *inputPtr++ = out2;
        *inputPtr++ = out1;
    }
}
#endif /* LV_HAVE_SSSE3 */


#ifdef LV_HAVE_RVV
#include <riscv_vector.h>

static inline void volk_64u_byteswap_rvv(uint64_t* intsToSwap, unsigned int num_points)
{
    /* Byteswap each 64-bit word of the buffer in place using RVV byte
     * gathers. Relies on the RISCV_PERM8 helper macro (defined in VOLK's
     * RVV intrinsics header, not visible in this file) to apply a gather
     * across an m8 register group — TODO confirm its exact semantics. */
    size_t n = num_points;
    size_t vlmax = __riscv_vsetvlmax_e8m1();
    if (vlmax <= 256) {
        // 8-bit gather indices suffice. Build, per 8-byte group, the
        // reversed index pattern { base+7, ..., base+0 } by subtracting a
        // per-64-bit-lane constant from the identity index vector
        // (0x0706050403020100 - 0x0001020304050607 per lane).
        vuint8m1_t vidx = __riscv_vreinterpret_u8m1(
            __riscv_vsub(__riscv_vreinterpret_u64m1(__riscv_vid_v_u8m1(vlmax)),
                         0x0706050403020100 - 0x1020304050607,
                         vlmax / 8));
        // Strip-mine over the buffer; vl elements are handled per pass.
        for (size_t vl; n > 0; n -= vl, intsToSwap += vl) {
            vl = __riscv_vsetvl_e64m8(n);
            vuint8m8_t v =
                __riscv_vreinterpret_u8m8(__riscv_vle64_v_u64m8(intsToSwap, vl));
            v = RISCV_PERM8(__riscv_vrgather, v, vidx);
            __riscv_vse64(intsToSwap, __riscv_vreinterpret_u64m8(v), vl);
        }
    } else {
        // Wider vectors need 16-bit gather indices (vrgatherei16):
        // idx = (id - id%8) + (7 - id%8), i.e. reverse within each 8-byte group.
        vuint16m2_t vid = __riscv_vid_v_u16m2(vlmax);
        vuint16m2_t voff1 = __riscv_vand(vid, 0x7, vlmax);
        vuint16m2_t voff2 = __riscv_vrsub(voff1, 0x7, vlmax);
        vuint16m2_t vidx = __riscv_vadd(__riscv_vsub(vid, voff1, vlmax), voff2, vlmax);
        for (size_t vl; n > 0; n -= vl, intsToSwap += vl) {
            vl = __riscv_vsetvl_e64m8(n);
            vuint8m8_t v =
                __riscv_vreinterpret_u8m8(__riscv_vle64_v_u64m8(intsToSwap, vl));
            v = RISCV_PERM8(__riscv_vrgatherei16, v, vidx);
            __riscv_vse64(intsToSwap, __riscv_vreinterpret_u64m8(v), vl);
        }
    }
}
#endif /* LV_HAVE_RVV */

#ifdef LV_HAVE_RVA23
#include <riscv_vector.h>

static inline void volk_64u_byteswap_rva23(uint64_t* intsToSwap, unsigned int num_points)
{
    /* Byteswap each 64-bit word of the buffer in place using the vector
     * byte-reverse instruction vrev8 (presumably from the Zvbb extension
     * mandated by the RVA23 profile — confirm against the toolchain). */
    size_t n = num_points;
    // Strip-mine: vl elements per pass, pointer advanced accordingly.
    for (size_t vl; n > 0; n -= vl, intsToSwap += vl) {
        vl = __riscv_vsetvl_e64m8(n);
        vuint64m8_t v = __riscv_vle64_v_u64m8(intsToSwap, vl);
        __riscv_vse64(intsToSwap, __riscv_vrev8(v, vl), vl);
    }
}
#endif /* LV_HAVE_RVA23 */

#endif /* INCLUDED_volk_64u_byteswap_a_H */