53#ifndef INCLUDED_volk_64u_byteswap_u_H
54#define INCLUDED_volk_64u_byteswap_u_H
/* In-place byte reversal of 64-bit values using 128-bit integer intrinsics
 * with unaligned loads/stores.
 * NOTE(review): this is a fragment -- the enclosing function's signature
 * (presumably volk_64u_byteswap_u_sse2 -- confirm), the declaration of
 * `number`, the per-iteration pointer advance and the closing braces are not
 * visible in this view. */
/* Treat the 64-bit buffer as a sequence of 32-bit words. */
64 uint32_t* inputPtr = (uint32_t*)intsToSwap;
65 __m128i input, byte1, byte2, byte3, byte4, output;
/* Per-32-bit-lane masks selecting bits 16..23 and bits 8..15 respectively. */
66 __m128i byte2mask = _mm_set1_epi32(0x00FF0000);
67 __m128i byte3mask = _mm_set1_epi32(0x0000FF00);
/* One 128-bit vector holds two 64-bit values, hence num_points / 2 iterations. */
69 const unsigned int halfPoints = num_points / 2;
70 for (; number < halfPoints; number++) {
/* Load four 32-bit words without any alignment requirement. */
72 input = _mm_loadu_si128((__m128i*)inputPtr);
/* Move each byte of every 32-bit lane to its mirrored position. The shifts by
 * 24 leave exactly one byte in the lane, so byte1/byte4 need no masking. */
75 byte1 = _mm_slli_epi32(input, 24);
76 byte2 = _mm_slli_epi32(input, 8);
77 byte3 = _mm_srli_epi32(input, 8);
78 byte4 = _mm_srli_epi32(input, 24);
/* Combine the four byte positions; the shifts by 8 carry neighbouring bytes
 * along, so byte2/byte3 are masked before being OR-ed in. */
80 output = _mm_or_si128(byte1, byte4);
81 byte2 = _mm_and_si128(byte2, byte2mask);
82 output = _mm_or_si128(output, byte2);
83 byte3 = _mm_and_si128(byte3, byte3mask);
84 output = _mm_or_si128(output, byte3);
/* Exchange the two 32-bit lanes inside each 64-bit half; together with the
 * per-lane reversal above this reverses all eight bytes of each 64-bit value. */
87 output = _mm_shuffle_epi32(output, _MM_SHUFFLE(2, 3, 0, 1));
/* Write the result back over the input (unaligned store). */
90 _mm_storeu_si128((__m128i*)inputPtr, output);
/* Scalar tail: runs at most once, for the last element when num_points is odd. */
95 number = halfPoints * 2;
96 for (; number < num_points; number++) {
/* Read the two 32-bit halves of the current 64-bit value. */
97 uint32_t output1 = *inputPtr;
98 uint32_t output2 = inputPtr[1];
/* Byte-reverse each 32-bit half independently. */
100 output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) |
101 ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000));
103 output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) |
104 ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000));
/* Store the halves in exchanged order and advance by one 64-bit value. */
106 *inputPtr++ = output2;
107 *inputPtr++ = output1;
/* In-place byte reversal of 64-bit values using NEON byte vectors, processing
 * eight 64-bit values (64 bytes) per iteration.
 * NOTE(review): this is a fragment -- the function signature, the pointer
 * advance inside the vector loop, the tail write-back and the closing braces
 * are not visible in this view. */
/* Address the buffer byte-wise for the vector loads/stores. */
118 uint8_t* inputPtr = (uint8_t*)intsToSwap;
119 unsigned int number = 0;
/* Four 16-byte vectors per iteration cover eight 64-bit values. */
120 const unsigned int eighth_points = num_points / 8;
122 for (; number < eighth_points; number++) {
/* Load 64 consecutive bytes as four 16-byte vectors. */
123 uint8x16_t input0 = vld1q_u8(inputPtr);
124 uint8x16_t input1 = vld1q_u8(inputPtr + 16);
125 uint8x16_t input2 = vld1q_u8(inputPtr + 32);
126 uint8x16_t input3 = vld1q_u8(inputPtr + 48);
/* vrev64q_u8 reverses the byte order inside each 64-bit doubleword. */
129 uint8x16_t output0 = vrev64q_u8(input0);
130 uint8x16_t output1 = vrev64q_u8(input1);
131 uint8x16_t output2 = vrev64q_u8(input2);
132 uint8x16_t output3 = vrev64q_u8(input3);
/* Store the reversed data back to the same addresses. */
134 vst1q_u8(inputPtr, output0);
135 vst1q_u8(inputPtr + 16, output1);
136 vst1q_u8(inputPtr + 32, output2);
137 vst1q_u8(inputPtr + 48, output3);
/* Scalar tail for the remaining (num_points % 8) values; the tail pointer is
 * derived from the original 64-bit pointer plus the number of values done. */
143 number = eighth_points * 8;
144 uint32_t* intPtr = (uint32_t*)(intsToSwap + number);
145 for (; number < num_points; number++) {
/* Read the two 32-bit halves of the current 64-bit value. */
146 uint32_t output1 = *intPtr;
147 uint32_t output2 = intPtr[1];
/* Byte-reverse each 32-bit half independently.
 * NOTE(review): the statements that write the halves back are not visible
 * here -- confirm against the full file. */
149 output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) |
150 ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000));
152 output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) |
153 ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000));
162#ifdef LV_HAVE_GENERIC
/* Portable scalar in-place byte reversal of num_points 64-bit values.
 * NOTE(review): this is a fragment -- the first line of the signature, the
 * declaration of `point` and the braces are not visible in this view. */
165 unsigned int num_points)
/* Treat the 64-bit buffer as a sequence of 32-bit words. */
167 uint32_t* inputPtr = (uint32_t*)intsToSwap;
169 for (point = 0; point < num_points; point++) {
/* Read the two 32-bit halves of the current 64-bit value. */
170 uint32_t output1 = *inputPtr;
171 uint32_t output2 = inputPtr[1];
/* Byte-reverse each 32-bit half independently. */
173 output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) |
174 ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000));
176 output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) |
177 ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000));
/* Store the halves in exchanged order, completing the 64-bit reversal, and
 * advance by one 64-bit value. */
179 *inputPtr++ = output2;
180 *inputPtr++ = output1;
186#include <immintrin.h>
/* In-place byte reversal of num_points 64-bit values using 256-bit byte
 * shuffles with aligned loads/stores (_mm256_load_si256 / _mm256_store_si256
 * require a 32-byte aligned address).
 * NOTE(review): braces, the tail's assignment targets and the tail write-back
 * are not visible in this view. */
187static inline void volk_64u_byteswap_a_avx2(uint64_t* intsToSwap,
unsigned int num_points)
189 unsigned int number = 0;
/* One 256-bit vector holds four 64-bit values. */
191 const unsigned int nPerSet = 4;
192 const uint64_t nSets = num_points / nPerSet;
/* Treat the 64-bit buffer as a sequence of 32-bit words. */
194 uint32_t* inputPtr = (uint32_t*)intsToSwap;
/* Byte-selection table: within every group of eight bytes the indices run
 * from high to low. _mm256_shuffle_epi8 selects within each 128-bit lane using
 * only the low four bits of an index, so entries 16..31 act as 0..15 in the
 * upper lane. */
196 const uint8_t shuffleVector[32] = { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13,
197 12, 11, 10, 9, 8, 23, 22, 21, 20, 19, 18,
198 17, 16, 31, 30, 29, 28, 27, 26, 25, 24 };
/* The table itself is loaded with an unaligned load. */
200 const __m256i myShuffle = _mm256_loadu_si256((__m256i*)&shuffleVector[0]);
202 for (; number < nSets; number++) {
/* Aligned load, reverse the bytes of each 64-bit value, aligned store back. */
205 const __m256i input = _mm256_load_si256((__m256i*)inputPtr);
206 const __m256i output = _mm256_shuffle_epi8(input, myShuffle);
209 _mm256_store_si256((__m256i*)inputPtr, output);
/* Advance by eight 32-bit words, i.e. four 64-bit values. */
212 inputPtr += 2 * nPerSet;
/* Scalar tail for the remaining (num_points % 4) values. */
216 for (number = nSets * nPerSet; number < num_points; ++number) {
/* Read the two 32-bit halves of the current 64-bit value. */
217 uint32_t output1 = *inputPtr;
218 uint32_t output2 = inputPtr[1];
/* Byte-reversed form of each 32-bit half.
 * NOTE(review): the variables receiving these expressions and the statements
 * that store them back are not visible here -- confirm against the full file. */
220 ((((output1) >> 24) & 0x000000ff) | (((output1) >> 8) & 0x0000ff00) |
221 (((output1) << 8) & 0x00ff0000) | (((output1) << 24) & 0xff000000));
224 ((((output2) >> 24) & 0x000000ff) | (((output2) >> 8) & 0x0000ff00) |
225 (((output2) << 8) & 0x00ff0000) | (((output2) << 24) & 0xff000000));
235#include <tmmintrin.h>
/* In-place byte reversal of num_points 64-bit values using 128-bit byte
 * shuffles with aligned loads/stores (_mm_load_si128 / _mm_store_si128
 * require a 16-byte aligned address).
 * NOTE(review): this is a fragment -- the first line of the signature, the
 * braces, the tail's assignment targets and the tail write-back are not
 * visible in this view. */
237 unsigned int num_points)
239 unsigned int number = 0;
/* One 128-bit vector holds two 64-bit values. */
241 const unsigned int nPerSet = 2;
242 const uint64_t nSets = num_points / nPerSet;
/* Treat the 64-bit buffer as a sequence of 32-bit words. */
244 uint32_t* inputPtr = (uint32_t*)intsToSwap;
/* Byte-selection table: indices run from high to low within each group of
 * eight bytes, which reverses every 64-bit value. */
246 uint8_t shuffleVector[16] = { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 };
/* The table itself is loaded with an unaligned load. */
248 const __m128i myShuffle = _mm_loadu_si128((__m128i*)&shuffleVector);
250 for (; number < nSets; number++) {
/* Aligned load, byte shuffle, aligned store back over the input. */
253 const __m128i input = _mm_load_si128((__m128i*)inputPtr);
254 const __m128i output = _mm_shuffle_epi8(input, myShuffle);
257 _mm_store_si128((__m128i*)inputPtr, output);
/* Advance by four 32-bit words, i.e. two 64-bit values. */
260 inputPtr += 2 * nPerSet;
/* Scalar tail: runs at most once, when num_points is odd. */
264 for (number = nSets * nPerSet; number < num_points; ++number) {
/* Read the two 32-bit halves of the current 64-bit value. */
265 uint32_t output1 = *inputPtr;
266 uint32_t output2 = inputPtr[1];
/* Byte-reversed form of each 32-bit half.
 * NOTE(review): the variables receiving these expressions and the statements
 * that store them back are not visible here -- confirm against the full file. */
268 ((((output1) >> 24) & 0x000000ff) | (((output1) >> 8) & 0x0000ff00) |
269 (((output1) << 8) & 0x00ff0000) | (((output1) << 24) & 0xff000000));
272 ((((output2) >> 24) & 0x000000ff) | (((output2) >> 8) & 0x0000ff00) |
273 (((output2) << 8) & 0x00ff0000) | (((output2) << 24) & 0xff000000));
282#ifndef INCLUDED_volk_64u_byteswap_a_H
283#define INCLUDED_volk_64u_byteswap_a_H
289#include <emmintrin.h>
/* In-place byte reversal of 64-bit values using 128-bit integer intrinsics
 * with aligned loads/stores (_mm_load_si128 / _mm_store_si128 require a
 * 16-byte aligned address).
 * NOTE(review): this is a fragment -- the enclosing function's signature
 * (presumably volk_64u_byteswap_a_sse2 -- confirm), the declaration of
 * `number`, the per-iteration pointer advance and the closing braces are not
 * visible in this view. */
/* Treat the 64-bit buffer as a sequence of 32-bit words. */
293 uint32_t* inputPtr = (uint32_t*)intsToSwap;
294 __m128i input, byte1, byte2, byte3, byte4, output;
/* Per-32-bit-lane masks selecting bits 16..23 and bits 8..15 respectively. */
295 __m128i byte2mask = _mm_set1_epi32(0x00FF0000);
296 __m128i byte3mask = _mm_set1_epi32(0x0000FF00);
/* One 128-bit vector holds two 64-bit values, hence num_points / 2 iterations. */
298 const unsigned int halfPoints = num_points / 2;
299 for (; number < halfPoints; number++) {
/* Aligned load of four 32-bit words. */
301 input = _mm_load_si128((__m128i*)inputPtr);
/* Move each byte of every 32-bit lane to its mirrored position. The shifts by
 * 24 leave exactly one byte in the lane, so byte1/byte4 need no masking. */
304 byte1 = _mm_slli_epi32(input, 24);
305 byte2 = _mm_slli_epi32(input, 8);
306 byte3 = _mm_srli_epi32(input, 8);
307 byte4 = _mm_srli_epi32(input, 24);
/* Combine the four byte positions; the shifts by 8 carry neighbouring bytes
 * along, so byte2/byte3 are masked before being OR-ed in. */
309 output = _mm_or_si128(byte1, byte4);
310 byte2 = _mm_and_si128(byte2, byte2mask);
311 output = _mm_or_si128(output, byte2);
312 byte3 = _mm_and_si128(byte3, byte3mask);
313 output = _mm_or_si128(output, byte3);
/* Exchange the two 32-bit lanes inside each 64-bit half; together with the
 * per-lane reversal above this reverses all eight bytes of each 64-bit value. */
316 output = _mm_shuffle_epi32(output, _MM_SHUFFLE(2, 3, 0, 1));
/* Aligned store back over the input. */
319 _mm_store_si128((__m128i*)inputPtr, output);
/* Scalar tail: runs at most once, for the last element when num_points is odd. */
324 number = halfPoints * 2;
325 for (; number < num_points; number++) {
/* Read the two 32-bit halves of the current 64-bit value. */
326 uint32_t output1 = *inputPtr;
327 uint32_t output2 = inputPtr[1];
/* Byte-reverse each 32-bit half independently. */
329 output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) |
330 ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000));
332 output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) |
333 ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000));
/* Store the halves in exchanged order and advance by one 64-bit value. */
335 *inputPtr++ = output2;
336 *inputPtr++ = output1;
342#include <immintrin.h>
/* In-place byte reversal of num_points 64-bit values using 256-bit byte
 * shuffles with unaligned loads/stores, so the buffer needs no particular
 * alignment for the vector accesses.
 * NOTE(review): braces, the tail's assignment targets and the tail write-back
 * are not visible in this view. */
343static inline void volk_64u_byteswap_u_avx2(uint64_t* intsToSwap,
unsigned int num_points)
345 unsigned int number = 0;
/* One 256-bit vector holds four 64-bit values. */
347 const unsigned int nPerSet = 4;
348 const uint64_t nSets = num_points / nPerSet;
/* Treat the 64-bit buffer as a sequence of 32-bit words. */
350 uint32_t* inputPtr = (uint32_t*)intsToSwap;
/* Byte-selection table: within every group of eight bytes the indices run
 * from high to low. _mm256_shuffle_epi8 selects within each 128-bit lane using
 * only the low four bits of an index, so entries 16..31 act as 0..15 in the
 * upper lane. */
352 const uint8_t shuffleVector[32] = { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13,
353 12, 11, 10, 9, 8, 23, 22, 21, 20, 19, 18,
354 17, 16, 31, 30, 29, 28, 27, 26, 25, 24 };
356 const __m256i myShuffle = _mm256_loadu_si256((__m256i*)&shuffleVector[0]);
358 for (; number < nSets; number++) {
/* Unaligned load, reverse the bytes of each 64-bit value, unaligned store back. */
360 const __m256i input = _mm256_loadu_si256((__m256i*)inputPtr);
361 const __m256i output = _mm256_shuffle_epi8(input, myShuffle);
364 _mm256_storeu_si256((__m256i*)inputPtr, output);
/* Advance by eight 32-bit words, i.e. four 64-bit values. */
367 inputPtr += 2 * nPerSet;
/* Scalar tail for the remaining (num_points % 4) values. */
371 for (number = nSets * nPerSet; number < num_points; ++number) {
/* Read the two 32-bit halves of the current 64-bit value. */
372 uint32_t output1 = *inputPtr;
373 uint32_t output2 = inputPtr[1];
/* Byte-reversed form of each 32-bit half.
 * NOTE(review): the variables receiving these expressions and the statements
 * that store them back are not visible here -- confirm against the full file. */
375 ((((output1) >> 24) & 0x000000ff) | (((output1) >> 8) & 0x0000ff00) |
376 (((output1) << 8) & 0x00ff0000) | (((output1) << 24) & 0xff000000));
379 ((((output2) >> 24) & 0x000000ff) | (((output2) >> 8) & 0x0000ff00) |
380 (((output2) << 8) & 0x00ff0000) | (((output2) << 24) & 0xff000000));
390#include <tmmintrin.h>
/* In-place byte reversal of num_points 64-bit values using 128-bit byte
 * shuffles with unaligned loads/stores.
 * NOTE(review): this is a fragment -- the first line of the signature, the
 * braces, the tail's assignment targets and the tail write-back are not
 * visible in this view. */
392 unsigned int num_points)
394 unsigned int number = 0;
/* One 128-bit vector holds two 64-bit values. */
396 const unsigned int nPerSet = 2;
397 const uint64_t nSets = num_points / nPerSet;
/* Treat the 64-bit buffer as a sequence of 32-bit words. */
399 uint32_t* inputPtr = (uint32_t*)intsToSwap;
/* Byte-selection table: indices run from high to low within each group of
 * eight bytes, which reverses every 64-bit value. */
401 uint8_t shuffleVector[16] = { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 };
403 const __m128i myShuffle = _mm_loadu_si128((__m128i*)&shuffleVector);
405 for (; number < nSets; number++) {
/* Unaligned load, byte shuffle, unaligned store back over the input. */
407 const __m128i input = _mm_loadu_si128((__m128i*)inputPtr);
408 const __m128i output = _mm_shuffle_epi8(input, myShuffle);
411 _mm_storeu_si128((__m128i*)inputPtr, output);
/* Advance by four 32-bit words, i.e. two 64-bit values. */
414 inputPtr += 2 * nPerSet;
/* Scalar tail: runs at most once, when num_points is odd. */
418 for (number = nSets * nPerSet; number < num_points; ++number) {
/* Read the two 32-bit halves of the current 64-bit value. */
419 uint32_t output1 = *inputPtr;
420 uint32_t output2 = inputPtr[1];
/* Byte-reversed form of each 32-bit half.
 * NOTE(review): the variables receiving these expressions and the statements
 * that store them back are not visible here -- confirm against the full file. */
422 ((((output1) >> 24) & 0x000000ff) | (((output1) >> 8) & 0x0000ff00) |
423 (((output1) << 8) & 0x00ff0000) | (((output1) << 24) & 0xff000000));
426 ((((output2) >> 24) & 0x000000ff) | (((output2) >> 8) & 0x0000ff00) |
427 (((output2) << 8) & 0x00ff0000) | (((output2) << 24) & 0xff000000));
436#include <riscv_vector.h>
/* In-place byte reversal of num_points 64-bit values using RISC-V vector
 * intrinsics. Two index-vector variants are visible: one built from 8-bit
 * indices and one from 16-bit indices.
 * NOTE(review): the braces, the condition that selects between the two
 * variants (the two `vidx` declarations must live in separate scopes), the
 * declaration of `v` and the statement that applies `vidx` to `v` are not
 * visible in this view -- confirm against the full file. */
438static inline void volk_64u_byteswap_rvv(uint64_t* intsToSwap,
unsigned int num_points)
/* Remaining number of 64-bit elements to process. */
440 size_t n = num_points;
/* Maximum element count for 8-bit elements at LMUL=1. */
441 size_t vlmax = __riscv_vsetvlmax_e8m1();
/* 8-bit index variant: take the byte sequence 0,1,2,... and, viewing it as
 * 64-bit lanes, subtract a constant so that the eight indices of each lane
 * come out in descending order (assumes little-endian lane layout -- TODO
 * confirm). NOTE(review): the remaining arguments of this call are not
 * visible here. */
443 vuint8m1_t vidx = __riscv_vreinterpret_u8m1(
444 __riscv_vsub(__riscv_vreinterpret_u64m1(__riscv_vid_v_u8m1(vlmax)),
445 0x0706050403020100 - 0x1020304050607,
/* Strip-mined loop: each pass handles `vl` 64-bit elements, then advances the
 * pointer and decrements the remaining count by `vl`. */
447 for (
size_t vl; n > 0; n -= vl, intsToSwap += vl) {
448 vl = __riscv_vsetvl_e64m8(n);
/* Load `vl` 64-bit elements and view them as bytes. */
450 __riscv_vreinterpret_u8m8(__riscv_vle64_v_u64m8(intsToSwap, vl));
/* Store `v`, viewed again as 64-bit elements, back over the input. */
452 __riscv_vse64(intsToSwap, __riscv_vreinterpret_u64m8(v), vl);
/* 16-bit index variant: for each position i compute
 * (i - (i & 7)) + (7 - (i & 7)), i.e. the start of i's group of eight plus
 * the mirrored offset inside that group. */
455 vuint16m2_t vid = __riscv_vid_v_u16m2(vlmax);
456 vuint16m2_t voff1 = __riscv_vand(vid, 0x7, vlmax);
457 vuint16m2_t voff2 = __riscv_vrsub(voff1, 0x7, vlmax);
458 vuint16m2_t vidx = __riscv_vadd(__riscv_vsub(vid, voff1, vlmax), voff2, vlmax);
/* Same strip-mined load/store loop as above. */
459 for (
size_t vl; n > 0; n -= vl, intsToSwap += vl) {
460 vl = __riscv_vsetvl_e64m8(n);
/* Load `vl` 64-bit elements and view them as bytes. */
462 __riscv_vreinterpret_u8m8(__riscv_vle64_v_u64m8(intsToSwap, vl));
/* Store `v`, viewed again as 64-bit elements, back over the input. */
464 __riscv_vse64(intsToSwap, __riscv_vreinterpret_u64m8(v), vl);
471#include <riscv_vector.h>
/* In-place byte reversal of num_points 64-bit values using the RISC-V vector
 * byte-reverse operation (__riscv_vrev8) on 64-bit elements.
 * NOTE(review): the function's braces are not visible in this view. */
473static inline void volk_64u_byteswap_rva23(uint64_t* intsToSwap,
unsigned int num_points)
/* Remaining number of 64-bit elements to process. */
475 size_t n = num_points;
/* Strip-mined loop: each pass handles `vl` elements, then advances the pointer
 * and decrements the remaining count by `vl`. */
476 for (
size_t vl; n > 0; n -= vl, intsToSwap += vl) {
/* Element count for this pass, for 64-bit elements at LMUL=8. */
477 vl = __riscv_vsetvl_e64m8(n);
/* Load, reverse the bytes within each 64-bit element, store back in place. */
478 vuint64m8_t v = __riscv_vle64_v_u64m8(intsToSwap, vl);
479 __riscv_vse64(intsToSwap, __riscv_vrev8(v, vl), vl);