Vector Optimized Library of Kernels 3.3.0
Architecture-tuned implementations of math kernels
Loading...
Searching...
No Matches
volk_16i_convert_8i.h
Go to the documentation of this file.
/* -*- c++ -*- */
/*
 * Copyright 2012, 2014 Free Software Foundation, Inc.
 *
 * This file is part of VOLK
 *
 * SPDX-License-Identifier: LGPL-3.0-or-later
 */
9
39
40#ifndef INCLUDED_volk_16i_convert_8i_u_H
41#define INCLUDED_volk_16i_convert_8i_u_H
42
43#include <inttypes.h>
44#include <stdio.h>
45
46#ifdef LV_HAVE_AVX2
47#include <immintrin.h>
48
/*
 * Convert signed 16-bit samples to signed 8-bit samples by keeping the high
 * byte of every sample: out[i] = (int8_t)(in[i] >> 8).
 *
 * AVX2 implementation using unaligned loads and stores, so neither buffer
 * needs any particular alignment.
 *
 * outputVector  destination; num_points int8_t values are written
 * inputVector   source; num_points int16_t values are read
 * num_points    number of samples to convert
 */
static inline void volk_16i_convert_8i_u_avx2(int8_t* outputVector,
                                              const int16_t* inputVector,
                                              unsigned int num_points)
{
    const unsigned int thirtysecondPoints = num_points / 32;

    int8_t* outputVectorPtr = outputVector;
    const int16_t* inputPtr = inputVector;

    for (unsigned int number = 0; number < thirtysecondPoints; number++) {
        // Load 32 int16 values as two 256-bit registers of 16 values each
        __m256i inputVal1 = _mm256_loadu_si256((const __m256i*)inputPtr);
        inputPtr += 16;
        __m256i inputVal2 = _mm256_loadu_si256((const __m256i*)inputPtr);
        inputPtr += 16;

        // Arithmetic shift brings the high byte down and preserves the sign
        inputVal1 = _mm256_srai_epi16(inputVal1, 8);
        inputVal2 = _mm256_srai_epi16(inputVal2, 8);

        // The shifted values already fit in int8, so the saturating pack never clamps
        __m256i ret = _mm256_packs_epi16(inputVal1, inputVal2);
        // The pack works per 128-bit lane and leaves the 64-bit quarters ordered
        // (val1 lo, val2 lo, val1 hi, val2 hi); 0xD8 picks quarters 0, 2, 1, 3
        // to restore sample order
        ret = _mm256_permute4x64_epi64(ret, 0xD8);

        _mm256_storeu_si256((__m256i*)outputVectorPtr, ret);
        outputVectorPtr += 32;
    }

    // Scalar tail for the samples that do not fill a block of 32
    for (unsigned int number = thirtysecondPoints * 32; number < num_points; number++) {
        outputVector[number] = (int8_t)(inputVector[number] >> 8);
    }
}
86#endif /* LV_HAVE_AVX2 */
87
88#ifdef LV_HAVE_AVX512BW
89#include <immintrin.h>
90
/*
 * Convert signed 16-bit samples to signed 8-bit samples by keeping the high
 * byte of every sample: out[i] = (int8_t)(in[i] >> 8).
 *
 * AVX-512BW implementation using unaligned loads and stores, so neither
 * buffer needs any particular alignment.
 *
 * outputVector  destination; num_points int8_t values are written
 * inputVector   source; num_points int16_t values are read
 * num_points    number of samples to convert
 */
static inline void volk_16i_convert_8i_u_avx512bw(int8_t* outputVector,
                                                  const int16_t* inputVector,
                                                  unsigned int num_points)
{
    const unsigned int sixtyfourthPoints = num_points / 64;

    int8_t* outputVectorPtr = outputVector;
    const int16_t* inputPtr = inputVector;

    for (unsigned int number = 0; number < sixtyfourthPoints; number++) {
        // Load 64 int16 values as two 512-bit registers of 32 values each
        const __m512i inputVal1 = _mm512_loadu_si512((const __m512i*)inputPtr);
        inputPtr += 32;
        const __m512i inputVal2 = _mm512_loadu_si512((const __m512i*)inputPtr);
        inputPtr += 32;

        // Arithmetic shift brings the high byte down and preserves the sign
        const __m512i shifted1 = _mm512_srai_epi16(inputVal1, 8);
        const __m512i shifted2 = _mm512_srai_epi16(inputVal2, 8);

        // Narrow every 16-bit lane to 8 bits with signed saturation; the
        // shifted values already fit in int8, so nothing is clamped
        const __m256i ret1 = _mm512_cvtsepi16_epi8(shifted1);
        const __m256i ret2 = _mm512_cvtsepi16_epi8(shifted2);

        _mm256_storeu_si256((__m256i*)outputVectorPtr, ret1);
        outputVectorPtr += 32;
        _mm256_storeu_si256((__m256i*)outputVectorPtr, ret2);
        outputVectorPtr += 32;
    }

    // Scalar tail for the samples that do not fill a block of 64
    for (unsigned int number = sixtyfourthPoints * 64; number < num_points; number++) {
        outputVector[number] = (int8_t)(inputVector[number] >> 8);
    }
}
130#endif /* LV_HAVE_AVX512BW */
131
132
133#ifdef LV_HAVE_SSE2
134#include <emmintrin.h>
135
/*
 * Convert signed 16-bit samples to signed 8-bit samples by keeping the high
 * byte of every sample: out[i] = (int8_t)(in[i] >> 8).
 *
 * SSE2 implementation using unaligned loads and stores, so neither buffer
 * needs any particular alignment.
 *
 * outputVector  destination; num_points int8_t values are written
 * inputVector   source; num_points int16_t values are read
 * num_points    number of samples to convert
 */
static inline void volk_16i_convert_8i_u_sse2(int8_t* outputVector,
                                              const int16_t* inputVector,
                                              unsigned int num_points)
{
    const unsigned int sixteenthPoints = num_points / 16;

    int8_t* outputVectorPtr = outputVector;
    const int16_t* inputPtr = inputVector;

    for (unsigned int number = 0; number < sixteenthPoints; number++) {
        // Load 16 int16 values as two 128-bit registers of 8 values each
        __m128i inputVal1 = _mm_loadu_si128((const __m128i*)inputPtr);
        inputPtr += 8;
        __m128i inputVal2 = _mm_loadu_si128((const __m128i*)inputPtr);
        inputPtr += 8;

        // Arithmetic shift brings the high byte down and preserves the sign
        inputVal1 = _mm_srai_epi16(inputVal1, 8);
        inputVal2 = _mm_srai_epi16(inputVal2, 8);

        // The shifted values already fit in int8, so the saturating pack never clamps
        const __m128i ret = _mm_packs_epi16(inputVal1, inputVal2);

        _mm_storeu_si128((__m128i*)outputVectorPtr, ret);
        outputVectorPtr += 16;
    }

    // Scalar tail for the samples that do not fill a block of 16
    for (unsigned int number = sixteenthPoints * 16; number < num_points; number++) {
        outputVector[number] = (int8_t)(inputVector[number] >> 8);
    }
}
172#endif /* LV_HAVE_SSE2 */
173
174
175#ifdef LV_HAVE_GENERIC
176
/*
 * Convert signed 16-bit samples to signed 8-bit samples by keeping the high
 * byte of every sample: out[i] = (int8_t)(in[i] >> 8).
 *
 * Portable scalar implementation. Right-shifting a negative value is
 * implementation-defined in C; this code relies on the usual arithmetic
 * (sign-preserving) shift.
 *
 * outputVector  destination; num_points int8_t values are written
 * inputVector   source; num_points int16_t values are read
 * num_points    number of samples to convert
 */
static inline void volk_16i_convert_8i_generic(int8_t* outputVector,
                                               const int16_t* inputVector,
                                               unsigned int num_points)
{
    for (unsigned int number = 0; number < num_points; number++) {
        outputVector[number] = (int8_t)(inputVector[number] >> 8);
    }
}
189#endif /* LV_HAVE_GENERIC */
190
191
192#endif /* INCLUDED_volk_16i_convert_8i_u_H */
193#ifndef INCLUDED_volk_16i_convert_8i_a_H
194#define INCLUDED_volk_16i_convert_8i_a_H
195
196#include <inttypes.h>
197#include <stdio.h>
198
199#ifdef LV_HAVE_AVX2
200#include <immintrin.h>
201
/*
 * Convert signed 16-bit samples to signed 8-bit samples by keeping the high
 * byte of every sample: out[i] = (int8_t)(in[i] >> 8).
 *
 * AVX2 implementation using aligned loads and stores: both inputVector and
 * outputVector must be 32-byte aligned.
 *
 * outputVector  destination; num_points int8_t values are written
 * inputVector   source; num_points int16_t values are read
 * num_points    number of samples to convert
 */
static inline void volk_16i_convert_8i_a_avx2(int8_t* outputVector,
                                              const int16_t* inputVector,
                                              unsigned int num_points)
{
    const unsigned int thirtysecondPoints = num_points / 32;

    int8_t* outputVectorPtr = outputVector;
    const int16_t* inputPtr = inputVector;

    for (unsigned int number = 0; number < thirtysecondPoints; number++) {
        // Load 32 int16 values as two 256-bit registers of 16 values each
        __m256i inputVal1 = _mm256_load_si256((const __m256i*)inputPtr);
        inputPtr += 16;
        __m256i inputVal2 = _mm256_load_si256((const __m256i*)inputPtr);
        inputPtr += 16;

        // Arithmetic shift brings the high byte down and preserves the sign
        inputVal1 = _mm256_srai_epi16(inputVal1, 8);
        inputVal2 = _mm256_srai_epi16(inputVal2, 8);

        // The shifted values already fit in int8, so the saturating pack never clamps
        __m256i ret = _mm256_packs_epi16(inputVal1, inputVal2);
        // The pack works per 128-bit lane and leaves the 64-bit quarters ordered
        // (val1 lo, val2 lo, val1 hi, val2 hi); 0xD8 picks quarters 0, 2, 1, 3
        // to restore sample order
        ret = _mm256_permute4x64_epi64(ret, 0xD8);

        _mm256_store_si256((__m256i*)outputVectorPtr, ret);
        outputVectorPtr += 32;
    }

    // Scalar tail for the samples that do not fill a block of 32
    for (unsigned int number = thirtysecondPoints * 32; number < num_points; number++) {
        outputVector[number] = (int8_t)(inputVector[number] >> 8);
    }
}
239#endif /* LV_HAVE_AVX2 */
240
241#ifdef LV_HAVE_AVX512BW
242#include <immintrin.h>
243
/*
 * Convert signed 16-bit samples to signed 8-bit samples by keeping the high
 * byte of every sample: out[i] = (int8_t)(in[i] >> 8).
 *
 * AVX-512BW implementation using aligned accesses: inputVector is read with
 * 512-bit aligned loads (64-byte alignment required) and outputVector is
 * written with 256-bit aligned stores (32-byte alignment required).
 *
 * outputVector  destination; num_points int8_t values are written
 * inputVector   source; num_points int16_t values are read
 * num_points    number of samples to convert
 */
static inline void volk_16i_convert_8i_a_avx512bw(int8_t* outputVector,
                                                  const int16_t* inputVector,
                                                  unsigned int num_points)
{
    const unsigned int sixtyfourthPoints = num_points / 64;

    int8_t* outputVectorPtr = outputVector;
    const int16_t* inputPtr = inputVector;

    for (unsigned int number = 0; number < sixtyfourthPoints; number++) {
        // Load 64 int16 values as two 512-bit registers of 32 values each
        const __m512i inputVal1 = _mm512_load_si512((const __m512i*)inputPtr);
        inputPtr += 32;
        const __m512i inputVal2 = _mm512_load_si512((const __m512i*)inputPtr);
        inputPtr += 32;

        // Arithmetic shift brings the high byte down and preserves the sign
        const __m512i shifted1 = _mm512_srai_epi16(inputVal1, 8);
        const __m512i shifted2 = _mm512_srai_epi16(inputVal2, 8);

        // Narrow every 16-bit lane to 8 bits with signed saturation; the
        // shifted values already fit in int8, so nothing is clamped
        const __m256i ret1 = _mm512_cvtsepi16_epi8(shifted1);
        const __m256i ret2 = _mm512_cvtsepi16_epi8(shifted2);

        _mm256_store_si256((__m256i*)outputVectorPtr, ret1);
        outputVectorPtr += 32;
        _mm256_store_si256((__m256i*)outputVectorPtr, ret2);
        outputVectorPtr += 32;
    }

    // Scalar tail for the samples that do not fill a block of 64
    for (unsigned int number = sixtyfourthPoints * 64; number < num_points; number++) {
        outputVector[number] = (int8_t)(inputVector[number] >> 8);
    }
}
283#endif /* LV_HAVE_AVX512BW */
284
285
286#ifdef LV_HAVE_SSE2
287#include <emmintrin.h>
288
/*
 * Convert signed 16-bit samples to signed 8-bit samples by keeping the high
 * byte of every sample: out[i] = (int8_t)(in[i] >> 8).
 *
 * SSE2 implementation using aligned loads and stores: both inputVector and
 * outputVector must be 16-byte aligned.
 *
 * outputVector  destination; num_points int8_t values are written
 * inputVector   source; num_points int16_t values are read
 * num_points    number of samples to convert
 */
static inline void volk_16i_convert_8i_a_sse2(int8_t* outputVector,
                                              const int16_t* inputVector,
                                              unsigned int num_points)
{
    const unsigned int sixteenthPoints = num_points / 16;

    int8_t* outputVectorPtr = outputVector;
    const int16_t* inputPtr = inputVector;

    for (unsigned int number = 0; number < sixteenthPoints; number++) {
        // Load 16 int16 values as two 128-bit registers of 8 values each
        __m128i inputVal1 = _mm_load_si128((const __m128i*)inputPtr);
        inputPtr += 8;
        __m128i inputVal2 = _mm_load_si128((const __m128i*)inputPtr);
        inputPtr += 8;

        // Arithmetic shift brings the high byte down and preserves the sign
        inputVal1 = _mm_srai_epi16(inputVal1, 8);
        inputVal2 = _mm_srai_epi16(inputVal2, 8);

        // The shifted values already fit in int8, so the saturating pack never clamps
        const __m128i ret = _mm_packs_epi16(inputVal1, inputVal2);

        _mm_store_si128((__m128i*)outputVectorPtr, ret);
        outputVectorPtr += 16;
    }

    // Scalar tail for the samples that do not fill a block of 16
    for (unsigned int number = sixteenthPoints * 16; number < num_points; number++) {
        outputVector[number] = (int8_t)(inputVector[number] >> 8);
    }
}
325#endif /* LV_HAVE_SSE2 */
326
327
328#ifdef LV_HAVE_NEON
329#include <arm_neon.h>
330
/*
 * Convert signed 16-bit samples to signed 8-bit samples by keeping the high
 * byte of every sample: out[i] = (int8_t)(in[i] >> 8).
 *
 * NEON implementation processing 16 samples per iteration.
 *
 * outputVector  destination; num_points int8_t values are written
 * inputVector   source; num_points int16_t values are read
 * num_points    number of samples to convert
 */
static inline void volk_16i_convert_8i_neon(int8_t* outputVector,
                                            const int16_t* inputVector,
                                            unsigned int num_points)
{
    int8_t* outputVectorPtr = outputVector;
    const int16_t* inputVectorPtr = inputVector;
    const unsigned int sixteenth_points = num_points / 16;

    for (unsigned int number = 0; number < sixteenth_points; number++) {
        // Load two input vectors of 8 int16 values each
        const int16x8_t inputVal0 = vld1q_s16(inputVectorPtr);
        const int16x8_t inputVal1 = vld1q_s16(inputVectorPtr + 8);
        // Shift right by 8 and narrow each lane to 8 bits in one step
        const int8x8_t outputVal0 = vshrn_n_s16(inputVal0, 8);
        const int8x8_t outputVal1 = vshrn_n_s16(inputVal1, 8);
        // Join the two halves and write 16 output bytes
        vst1q_s8(outputVectorPtr, vcombine_s8(outputVal0, outputVal1));
        inputVectorPtr += 16;
        outputVectorPtr += 16;
    }

    // Scalar tail for the samples that do not fill a block of 16
    for (unsigned int number = sixteenth_points * 16; number < num_points; number++) {
        *outputVectorPtr++ = ((int8_t)(*inputVectorPtr++ >> 8));
    }
}
364#endif /* LV_HAVE_NEON */
365
366#ifdef LV_HAVE_NEONV8
367#include <arm_neon.h>
368
/*
 * Convert signed 16-bit samples to signed 8-bit samples by keeping the high
 * byte of every sample: out[i] = (int8_t)(in[i] >> 8).
 *
 * NEON (ARMv8) implementation processing 32 samples per iteration.
 *
 * outputVector  destination; num_points int8_t values are written
 * inputVector   source; num_points int16_t values are read
 * num_points    number of samples to convert
 */
static inline void volk_16i_convert_8i_neonv8(int8_t* outputVector,
                                              const int16_t* inputVector,
                                              unsigned int num_points)
{
    int8_t* outputVectorPtr = outputVector;
    const int16_t* inputVectorPtr = inputVector;
    const unsigned int thirtysecondPoints = num_points / 32;

    for (unsigned int number = 0; number < thirtysecondPoints; number++) {
        // Load four input vectors of 8 int16 values each
        const int16x8_t in0 = vld1q_s16(inputVectorPtr);
        const int16x8_t in1 = vld1q_s16(inputVectorPtr + 8);
        const int16x8_t in2 = vld1q_s16(inputVectorPtr + 16);
        const int16x8_t in3 = vld1q_s16(inputVectorPtr + 24);
        // __VOLK_PREFETCH is defined outside this block; presumably a prefetch
        // hint for the input two iterations (64 samples) ahead
        __VOLK_PREFETCH(inputVectorPtr + 64);

        // Shift right by 8 and narrow each lane to 8 bits in one step
        const int8x8_t out0 = vshrn_n_s16(in0, 8);
        const int8x8_t out1 = vshrn_n_s16(in1, 8);
        const int8x8_t out2 = vshrn_n_s16(in2, 8);
        const int8x8_t out3 = vshrn_n_s16(in3, 8);

        // Join the halves pairwise and write 32 output bytes
        vst1q_s8(outputVectorPtr, vcombine_s8(out0, out1));
        vst1q_s8(outputVectorPtr + 16, vcombine_s8(out2, out3));

        inputVectorPtr += 32;
        outputVectorPtr += 32;
    }

    // Scalar tail for the samples that do not fill a block of 32
    for (unsigned int number = thirtysecondPoints * 32; number < num_points; number++) {
        *outputVectorPtr++ = ((int8_t)(*inputVectorPtr++ >> 8));
    }
}
400#endif /* LV_HAVE_NEONV8 */
401
402#ifdef LV_HAVE_RVV
403#include <riscv_vector.h>
404
/*
 * Convert signed 16-bit samples to signed 8-bit samples by keeping the high
 * byte of every sample: out[i] = (int8_t)(in[i] >> 8).
 *
 * RISC-V Vector implementation; each pass handles as many samples as the
 * hardware grants for 16-bit elements at LMUL=8, so no scalar tail is needed.
 *
 * outputVector  destination; num_points int8_t values are written
 * inputVector   source; num_points int16_t values are read
 * num_points    number of samples to convert
 */
static inline void volk_16i_convert_8i_rvv(int8_t* outputVector,
                                           const int16_t* inputVector,
                                           unsigned int num_points)
{
    size_t n = num_points;
    while (n > 0) {
        // Number of samples processed in this pass (never more than n)
        const size_t vl = __riscv_vsetvl_e16m8(n);
        vint16m8_t v = __riscv_vle16_v_i16m8(inputVector, vl);
        // Narrowing arithmetic shift right by 8 keeps the high byte of each sample
        __riscv_vse8(outputVector, __riscv_vnsra(v, 8, vl), vl);
        n -= vl;
        inputVector += vl;
        outputVector += vl;
    }
}
416#endif /*LV_HAVE_RVV*/
417
418#endif /* INCLUDED_volk_16i_convert_8i_a_H */