Vector Optimized Library of Kernels 3.3.0
Architecture-tuned implementations of math kernels
Loading...
Searching...
No Matches
volk_16u_byteswap.h
Go to the documentation of this file.
/* -*- c++ -*- */
/*
 * Copyright 2012, 2014 Free Software Foundation, Inc.
 *
 * This file is part of VOLK
 *
 * SPDX-License-Identifier: LGPL-3.0-or-later
 */
9
39
40#ifndef INCLUDED_volk_16u_byteswap_u_H
41#define INCLUDED_volk_16u_byteswap_u_H
42
43#include <inttypes.h>
44
45#ifdef LV_HAVE_GENERIC
46
/*
 * Portable C in-place byteswap.
 *
 * Exchanges the high and low byte of each of the first `num_points` 16-bit
 * values starting at `intsToSwap`. The buffer is modified in place; a count
 * of zero leaves it untouched.
 */
static inline void volk_16u_byteswap_generic(uint16_t* intsToSwap,
                                             unsigned int num_points)
{
    uint16_t* inputPtr = intsToSwap;
    for (unsigned int point = 0; point < num_points; point++) {
        const uint16_t value = *inputPtr;
        /* `value` is promoted to int before shifting, so mask each half
         * before narrowing back to 16 bits. */
        *inputPtr = (uint16_t)(((value >> 8) & 0xff) | ((value << 8) & 0xff00));
        inputPtr++;
    }
}
58#endif /* LV_HAVE_GENERIC */
59
60
#ifdef LV_HAVE_AVX2
#include <immintrin.h>
/*
 * AVX2 in-place byteswap using aligned loads and stores.
 *
 * Swaps the two bytes of each of the first `num_points` 16-bit values at
 * `intsToSwap`. Full groups of 16 values are processed with 256-bit aligned
 * accesses (_mm256_load_si256 / _mm256_store_si256), so `intsToSwap` must be
 * 32-byte aligned. Any remaining values are swapped one at a time.
 */
static inline void volk_16u_byteswap_a_avx2(uint16_t* intsToSwap, unsigned int num_points)
{
    const unsigned int nPerSet = 16;
    const unsigned int nSets = num_points / nPerSet;

    uint16_t* inputPtr = (uint16_t*)intsToSwap;

    /* Byte indices that exchange neighbouring bytes. _mm256_shuffle_epi8
     * shuffles within each 128-bit lane and only uses the low four bits of
     * an index, so the entries 16..31 select bytes 0..15 of the upper lane. */
    const uint8_t shuffleVector[32] = { 1,  0,  3,  2,  5,  4,  7,  6,  9,  8,  11,
                                        10, 13, 12, 15, 14, 17, 16, 19, 18, 21, 20,
                                        23, 22, 25, 24, 27, 26, 29, 28, 31, 30 };

    const __m256i myShuffle = _mm256_loadu_si256((__m256i*)&shuffleVector[0]);

    for (unsigned int number = 0; number < nSets; number++) {
        // Load 16 values; inputPtr is advanced after the store because the
        // operation is done in place.
        const __m256i input = _mm256_load_si256((__m256i*)inputPtr);
        const __m256i output = _mm256_shuffle_epi8(input, myShuffle);

        // Store the results
        _mm256_store_si256((__m256i*)inputPtr, output);
        inputPtr += nPerSet;
    }

    // Byteswap any remaining points:
    for (unsigned int number = nPerSet * nSets; number < num_points; number++) {
        uint16_t outputVal = *inputPtr;
        outputVal = (((outputVal >> 8) & 0xff) | ((outputVal << 8) & 0xff00));
        *inputPtr = outputVal;
        inputPtr++;
    }
}
#endif /* LV_HAVE_AVX2 */
95
96
#ifdef LV_HAVE_AVX2
#include <immintrin.h>
/*
 * AVX2 in-place byteswap using unaligned loads and stores.
 *
 * Swaps the two bytes of each of the first `num_points` 16-bit values at
 * `intsToSwap`. Full groups of 16 values are processed with 256-bit
 * unaligned accesses (_mm256_loadu_si256 / _mm256_storeu_si256). Any
 * remaining values are swapped one at a time.
 */
static inline void volk_16u_byteswap_u_avx2(uint16_t* intsToSwap, unsigned int num_points)
{
    const unsigned int nPerSet = 16;
    const unsigned int nSets = num_points / nPerSet;

    uint16_t* inputPtr = (uint16_t*)intsToSwap;

    /* Byte indices that exchange neighbouring bytes. _mm256_shuffle_epi8
     * shuffles within each 128-bit lane and only uses the low four bits of
     * an index, so the entries 16..31 select bytes 0..15 of the upper lane. */
    const uint8_t shuffleVector[32] = { 1,  0,  3,  2,  5,  4,  7,  6,  9,  8,  11,
                                        10, 13, 12, 15, 14, 17, 16, 19, 18, 21, 20,
                                        23, 22, 25, 24, 27, 26, 29, 28, 31, 30 };

    const __m256i myShuffle = _mm256_loadu_si256((__m256i*)&shuffleVector[0]);

    for (unsigned int number = 0; number < nSets; number++) {
        // Load 16 values; inputPtr is advanced after the store because the
        // operation is done in place.
        const __m256i input = _mm256_loadu_si256((__m256i*)inputPtr);
        const __m256i output = _mm256_shuffle_epi8(input, myShuffle);

        // Store the results
        _mm256_storeu_si256((__m256i*)inputPtr, output);
        inputPtr += nPerSet;
    }

    // Byteswap any remaining points:
    for (unsigned int number = nPerSet * nSets; number < num_points; number++) {
        uint16_t outputVal = *inputPtr;
        outputVal = (((outputVal >> 8) & 0xff) | ((outputVal << 8) & 0xff00));
        *inputPtr = outputVal;
        inputPtr++;
    }
}
#endif /* LV_HAVE_AVX2 */
131
132
133#ifdef LV_HAVE_SSE2
134#include <emmintrin.h>
135
/*
 * SSE2 in-place byteswap using unaligned loads and stores.
 *
 * Swaps the two bytes of each of the first `num_points` 16-bit values at
 * `intsToSwap`. Eight values are handled per iteration; the remaining
 * (num_points % 8) values are swapped one at a time.
 */
static inline void volk_16u_byteswap_u_sse2(uint16_t* intsToSwap, unsigned int num_points)
{
    uint16_t* inputPtr = intsToSwap;
    const unsigned int eighthPoints = num_points / 8;

    for (unsigned int number = 0; number < eighthPoints; number++) {
        // Load 8 values; inputPtr is advanced after the store because the
        // operation is done in place.
        const __m128i input = _mm_loadu_si128((const __m128i*)inputPtr);
        // Per 16-bit element: low byte moved up, high byte moved down.
        const __m128i left = _mm_slli_epi16(input, 8);
        const __m128i right = _mm_srli_epi16(input, 8);
        // The two shifted halves occupy disjoint bits, so OR merges them.
        const __m128i output = _mm_or_si128(left, right);
        _mm_storeu_si128((__m128i*)inputPtr, output);
        inputPtr += 8;
    }

    // Byteswap any remaining points:
    for (unsigned int number = eighthPoints * 8; number < num_points; number++) {
        uint16_t outputVal = *inputPtr;
        outputVal = (((outputVal >> 8) & 0xff) | ((outputVal << 8) & 0xff00));
        *inputPtr = outputVal;
        inputPtr++;
    }
}
163#endif /* LV_HAVE_SSE2 */
164
165
166#endif /* INCLUDED_volk_16u_byteswap_u_H */
167#ifndef INCLUDED_volk_16u_byteswap_a_H
168#define INCLUDED_volk_16u_byteswap_a_H
169
170#include <inttypes.h>
171
172#ifdef LV_HAVE_SSE2
173#include <emmintrin.h>
174
/*
 * SSE2 in-place byteswap using aligned loads and stores.
 *
 * Swaps the two bytes of each of the first `num_points` 16-bit values at
 * `intsToSwap`. Eight values are handled per iteration with
 * _mm_load_si128 / _mm_store_si128, so `intsToSwap` must be 16-byte aligned.
 * The remaining (num_points % 8) values are passed to
 * volk_16u_byteswap_generic.
 */
static inline void volk_16u_byteswap_a_sse2(uint16_t* intsToSwap, unsigned int num_points)
{
    uint16_t* inputPtr = intsToSwap;
    const unsigned int eighthPoints = num_points / 8;

    for (unsigned int number = 0; number < eighthPoints; number++) {
        // Load 8 values; inputPtr is advanced after the store because the
        // operation is done in place.
        const __m128i input = _mm_load_si128((const __m128i*)inputPtr);
        // Per 16-bit element: low byte moved up, high byte moved down.
        const __m128i left = _mm_slli_epi16(input, 8);
        const __m128i right = _mm_srli_epi16(input, 8);
        // The two shifted halves occupy disjoint bits, so OR merges them.
        const __m128i output = _mm_or_si128(left, right);
        _mm_store_si128((__m128i*)inputPtr, output);
        inputPtr += 8;
    }

    // Byteswap any remaining points:
    volk_16u_byteswap_generic(inputPtr, num_points - eighthPoints * 8);
}
197#endif /* LV_HAVE_SSE2 */
198
199#ifdef LV_HAVE_NEON
200#include <arm_neon.h>
201
/*
 * NEON in-place byteswap built from shift-and-insert instructions.
 *
 * Swaps the two bytes of each of the first `num_points` 16-bit values at
 * `intsToSwap`, eight values per iteration. The remaining (num_points % 8)
 * values are passed to volk_16u_byteswap_generic.
 */
static inline void volk_16u_byteswap_neon(uint16_t* intsToSwap, unsigned int num_points)
{
    unsigned int eighth_points = num_points / 8;
    uint16x8_t input;
    uint16x8_t output = { 0, 0, 0, 0, 0, 0, 0, 0 };
    uint16_t* inputPtr = intsToSwap;

    for (unsigned int number = 0; number < eighth_points; number++) {
        input = vld1q_u16(inputPtr);
        /* Shift-right-insert writes input >> 8 into the low byte of each
         * element, shift-left-insert then writes input << 8 into the high
         * byte. Together they overwrite all 16 bits, so the value `output`
         * carried over from the previous iteration never leaks through. */
        output = vsriq_n_u16(output, input, 8);
        output = vsliq_n_u16(output, input, 8);
        vst1q_u16(inputPtr, output);
        inputPtr += 8;
    }

    /* Byteswap any remaining points. */
    volk_16u_byteswap_generic(inputPtr, num_points - eighth_points * 8);
}
219#endif /* LV_HAVE_NEON */
220
221#ifdef LV_HAVE_NEON
222#include <arm_neon.h>
223
/*
 * NEON in-place byteswap using a 32-byte table lookup.
 *
 * Swaps the two bytes of each of the first `num_points` 16-bit values at
 * `intsToSwap`, sixteen values (32 bytes) per iteration. The remaining
 * (num_points % 16) values are passed to volk_16u_byteswap_generic.
 */
static inline void volk_16u_byteswap_neon_table(uint16_t* intsToSwap,
                                                unsigned int num_points)
{
    uint16_t* inputPtr = intsToSwap;
    unsigned int n16points = num_points / 16;

    uint8x8x4_t input_table;
    uint8x8_t int_lookup01, int_lookup23, int_lookup45, int_lookup67;
    uint8x8_t swapped_int01, swapped_int23, swapped_int45, swapped_int67;

    /* These constants are byte indices into the 32-byte table, packed into a
       64-bit value with the first index in the least significant byte.

       vld4_u8 de-interleaves with a stride of four, so memory byte
       b = 4*k + j ends up at table index 8*j + k. To emit the memory bytes
       in the swapped order 1, 0, 3, 2, 5, 4, 7, 6 the first lookup therefore
       holds the indices {8, 0, 24, 16, 9, 1, 25, 17}:
           uint8_t chars[8] = {8, 0, 24, 16, 9, 1, 25, 17};
           for(ii=0; ii < 8; ++ii) {
               index += ((uint64_t)(*(chars+ii))) << (ii*8);
           }
       Each following lookup covers the next eight memory bytes, which adds 2
       to every index.
     */
    int_lookup01 = vcreate_u8(0x1119010910180008ULL); /* 1232017111498883080 */
    int_lookup23 = vcreate_u8(0x131B030B121A020AULL); /* 1376697457175036426 */
    int_lookup45 = vcreate_u8(0x151D050D141C040CULL); /* 1521377802851189772 */
    int_lookup67 = vcreate_u8(0x171F070F161E060EULL); /* 1666058148527343118 */

    for (unsigned int number = 0; number < n16points; ++number) {
        input_table = vld4_u8((uint8_t*)inputPtr);
        swapped_int01 = vtbl4_u8(input_table, int_lookup01);
        swapped_int23 = vtbl4_u8(input_table, int_lookup23);
        swapped_int45 = vtbl4_u8(input_table, int_lookup45);
        swapped_int67 = vtbl4_u8(input_table, int_lookup67);
        /* Each result holds four swapped 16-bit values (8 bytes). */
        vst1_u8((uint8_t*)inputPtr, swapped_int01);
        vst1_u8((uint8_t*)(inputPtr + 4), swapped_int23);
        vst1_u8((uint8_t*)(inputPtr + 8), swapped_int45);
        vst1_u8((uint8_t*)(inputPtr + 12), swapped_int67);

        inputPtr += 16;
    }

    /* Byteswap any remaining points. */
    volk_16u_byteswap_generic(inputPtr, num_points - n16points * 16);
}
263#endif /* LV_HAVE_NEON */
264
265#ifdef LV_HAVE_NEONV8
266#include <arm_neon.h>
267
/*
 * ARMv8 NEON in-place byteswap using vrev16q_u8.
 *
 * Swaps the two bytes of each of the first `num_points` 16-bit values at
 * `intsToSwap`. Sixteen values are handled per iteration as two 128-bit
 * vectors; the remaining (num_points % 16) values are swapped one at a time.
 */
static inline void volk_16u_byteswap_neonv8(uint16_t* intsToSwap, unsigned int num_points)
{
    const unsigned int sixteenthPoints = num_points / 16;
    uint16_t* inputPtr = intsToSwap;

    for (unsigned int number = 0; number < sixteenthPoints; number++) {
        uint8x16_t in0 = vld1q_u8((const uint8_t*)inputPtr);
        uint8x16_t in1 = vld1q_u8((const uint8_t*)(inputPtr + 8));
        /* __VOLK_PREFETCH is defined outside this file; it is handed the
         * address two iterations (32 values) ahead. */
        __VOLK_PREFETCH(inputPtr + 32);

        /* ARMv8 has vrev16q_u8 which reverses bytes within 16-bit elements */
        vst1q_u8((uint8_t*)inputPtr, vrev16q_u8(in0));
        vst1q_u8((uint8_t*)(inputPtr + 8), vrev16q_u8(in1));

        inputPtr += 16;
    }

    /* Byteswap any remaining points. */
    for (unsigned int number = sixteenthPoints * 16; number < num_points; number++) {
        uint16_t output = *inputPtr;
        output = (((output >> 8) & 0xff) | ((output << 8) & 0xff00));
        *inputPtr++ = output;
    }
}
291#endif /* LV_HAVE_NEONV8 */
292
293#ifdef LV_HAVE_ORC
294
/* Implemented outside this header (presumably the ORC-compiled kernel --
 * confirm against the build). Note that it takes a signed count. */
extern void volk_16u_byteswap_a_orc_impl(uint16_t* intsToSwap, int num_points);

/*
 * In-place byteswap that forwards to volk_16u_byteswap_a_orc_impl.
 *
 * `num_points` is converted from unsigned int to int for the call; the cast
 * only makes the conversion that already happened implicitly visible.
 */
static inline void volk_16u_byteswap_u_orc(uint16_t* intsToSwap, unsigned int num_points)
{
    volk_16u_byteswap_a_orc_impl(intsToSwap, (int)num_points);
}
300#endif /* LV_HAVE_ORC */
301
302#ifdef LV_HAVE_RVV
303#include <riscv_vector.h>
305
/*
 * RISC-V Vector in-place byteswap using a register gather.
 *
 * Swaps the two bytes of each of the first `num_points` 16-bit values at
 * `intsToSwap`. An index vector of the form 1, 0, 3, 2, ... is built once
 * and then applied to every loaded chunk through RISCV_PERM8.
 *
 * NOTE(review): RISCV_PERM8 is defined outside this view; presumably it
 * applies the given gather to each LMUL=1 part of the m8 register group --
 * confirm against its definition.
 */
static inline void volk_16u_byteswap_rvv(uint16_t* intsToSwap, unsigned int num_points)
{
    size_t n = num_points;
    /* Number of 8-bit elements in a single (LMUL=1) vector register. */
    size_t vlmax = __riscv_vsetvlmax_e8m1();
    if (vlmax <= 256) {
        /* All byte positions fit into 8-bit indices. vid yields 0, 1, 2, ...;
         * viewed as 16-bit elements each pair is (2i) | (2i+1) << 8, and
         * subtracting 0xFF turns that into (2i+1) | (2i) << 8, i.e. the two
         * indices of every pair trade places. */
        vuint8m1_t vidx = __riscv_vreinterpret_u8m1(
            __riscv_vsub(__riscv_vreinterpret_u16m1(__riscv_vid_v_u8m1(vlmax)),
                         0x100 - 0x1,
                         vlmax / 2));
        for (size_t vl; n > 0; n -= vl, intsToSwap += vl) {
            vl = __riscv_vsetvl_e16m8(n);
            vuint8m8_t v =
                __riscv_vreinterpret_u8m8(__riscv_vle16_v_u16m8(intsToSwap, vl));
            v = RISCV_PERM8(__riscv_vrgather, v, vidx);
            __riscv_vse16(intsToSwap, __riscv_vreinterpret_u16m8(v), vl);
        }
    } else {
        /* More than 256 bytes per register: same construction with 16-bit
         * indices (pairs viewed as 32-bit elements, subtracting 0xFFFF) and
         * the 16-bit-index gather. */
        vuint16m2_t vidx = __riscv_vreinterpret_u16m2(
            __riscv_vsub(__riscv_vreinterpret_u32m2(__riscv_vid_v_u16m2(vlmax)),
                         0x10000 - 0x1,
                         vlmax / 2));
        for (size_t vl; n > 0; n -= vl, intsToSwap += vl) {
            vl = __riscv_vsetvl_e16m8(n);
            vuint8m8_t v =
                __riscv_vreinterpret_u8m8(__riscv_vle16_v_u16m8(intsToSwap, vl));
            v = RISCV_PERM8(__riscv_vrgatherei16, v, vidx);
            __riscv_vse16(intsToSwap, __riscv_vreinterpret_u16m8(v), vl);
        }
    }
}
336#endif /* LV_HAVE_RVV */
337
338#ifdef LV_HAVE_RVA23
339#include <riscv_vector.h>
340
/*
 * RISC-V Vector in-place byteswap using the byte-reverse instruction.
 *
 * Swaps the two bytes of each of the first `num_points` 16-bit values at
 * `intsToSwap`. Each iteration handles as many elements as
 * __riscv_vsetvl_e16m8 grants for the remaining count, so no separate tail
 * loop is needed.
 */
static inline void volk_16u_byteswap_rva23(uint16_t* intsToSwap, unsigned int num_points)
{
    size_t n = num_points;
    for (size_t vl; n > 0; n -= vl, intsToSwap += vl) {
        vl = __riscv_vsetvl_e16m8(n);
        vuint16m8_t v = __riscv_vle16_v_u16m8(intsToSwap, vl);
        /* vrev8 reverses the byte order inside every element; for 16-bit
         * elements that is exactly the byteswap. */
        __riscv_vse16(intsToSwap, __riscv_vrev8(v, vl), vl);
    }
}
350#endif /* LV_HAVE_RVA23 */
351
352#endif /* INCLUDED_volk_16u_byteswap_a_H */