40#ifndef INCLUDED_volk_16i_convert_8i_u_H
41#define INCLUDED_volk_16i_convert_8i_u_H
/*!
 * \brief Converts 16-bit samples to 8-bit samples by keeping the upper byte of
 *        each sample (arithmetic shift right by 8). AVX2 kernel using
 *        unaligned loads and stores.
 *
 * \param outputVector destination buffer, at least num_points int8_t values
 * \param inputVector  source buffer, at least num_points int16_t values
 * \param num_points   number of samples to convert
 */
static inline void volk_16i_convert_8i_u_avx2(int8_t* outputVector,
                                              const int16_t* inputVector,
                                              unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int thirtysecondPoints = num_points / 32;

    int8_t* outputVectorPtr = outputVector;
    const int16_t* inputPtr = inputVector;
    __m256i inputVal1;
    __m256i inputVal2;
    __m256i ret;

    for (; number < thirtysecondPoints; number++) {
        /* Each iteration consumes 32 samples: two consecutive 16-sample registers. */
        inputVal1 = _mm256_loadu_si256((const __m256i*)inputPtr);
        inputVal2 = _mm256_loadu_si256((const __m256i*)(inputPtr + 16));
        inputPtr += 32;

        /* Arithmetic shift keeps the sign and moves the high byte into the low byte,
         * so the saturating pack below never actually clamps. */
        inputVal1 = _mm256_srai_epi16(inputVal1, 8);
        inputVal2 = _mm256_srai_epi16(inputVal2, 8);

        /* The pack works per 128-bit lane, leaving the 64-bit quarters ordered
         * [a0 b0 a1 b1]; the permute (selector 0xD8 = quarters 0,2,1,3) restores
         * source order [a0 a1 b0 b1]. */
        ret = _mm256_packs_epi16(inputVal1, inputVal2);
        ret = _mm256_permute4x64_epi64(ret, 0xD8);

        _mm256_storeu_si256((__m256i*)outputVectorPtr, ret);
        outputVectorPtr += 32;
    }

    /* Scalar tail for the samples left over after the last full group of 32. */
    number = thirtysecondPoints * 32;
    for (; number < num_points; number++) {
        outputVector[number] = (int8_t)(inputVector[number] >> 8);
    }
}
88#ifdef LV_HAVE_AVX512BW
/*!
 * \brief Converts 16-bit samples to 8-bit samples by keeping the upper byte of
 *        each sample (arithmetic shift right by 8). AVX-512BW kernel using
 *        unaligned loads and stores.
 *
 * \param outputVector destination buffer, at least num_points int8_t values
 * \param inputVector  source buffer, at least num_points int16_t values
 * \param num_points   number of samples to convert
 */
static inline void volk_16i_convert_8i_u_avx512bw(int8_t* outputVector,
                                                  const int16_t* inputVector,
                                                  unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int sixtyfourthPoints = num_points / 64;

    int8_t* outputVectorPtr = outputVector;
    const int16_t* inputPtr = inputVector;
    __m512i inputVal1, inputVal2;
    __m512i shifted1, shifted2;
    __m256i ret1, ret2;

    for (; number < sixtyfourthPoints; number++) {
        /* Each iteration consumes 64 samples: two consecutive 32-sample registers. */
        inputVal1 = _mm512_loadu_si512((const __m512i*)inputPtr);
        inputVal2 = _mm512_loadu_si512((const __m512i*)(inputPtr + 32));
        inputPtr += 64;

        /* Arithmetic shift keeps the sign and moves the high byte into the low byte. */
        shifted1 = _mm512_srai_epi16(inputVal1, 8);
        shifted2 = _mm512_srai_epi16(inputVal2, 8);

        /* Narrow 32 x int16 to 32 x int8 in element order; values already fit,
         * so the saturation never clamps. */
        ret1 = _mm512_cvtsepi16_epi8(shifted1);
        ret2 = _mm512_cvtsepi16_epi8(shifted2);

        _mm256_storeu_si256((__m256i*)outputVectorPtr, ret1);
        _mm256_storeu_si256((__m256i*)(outputVectorPtr + 32), ret2);
        outputVectorPtr += 64;
    }

    /* Scalar tail for the samples left over after the last full group of 64. */
    number = sixtyfourthPoints * 64;
    for (; number < num_points; number++) {
        outputVector[number] = (int8_t)(inputVector[number] >> 8);
    }
}
134#include <emmintrin.h>
/*
 * SSE2 conversion of 16-bit samples to 8-bit samples using the unaligned
 * load/store intrinsics: each sample is shifted right arithmetically by 8 so
 * only its upper byte is kept.
 *
 * NOTE(review): the line carrying the function name and first parameter is not
 * present in this excerpt; outputVector is used below as the int8_t
 * destination. The declarations of inputVal1, inputVal2 and ret are likewise
 * not visible here.
 */
137 const int16_t* inputVector,
138 unsigned int num_points)
140 unsigned int number = 0;
/* Number of full 16-sample groups handled by the vector loop. */
141 const unsigned int sixteenthPoints = num_points / 16;
143 int8_t* outputVectorPtr = outputVector;
144 int16_t* inputPtr = (int16_t*)inputVector;
149 for (; number < sixteenthPoints; number++) {
/*
 * Two unaligned 128-bit loads (8 samples each).
 * NOTE(review): no advance of inputPtr is visible between or after these loads
 * in this excerpt, so as shown both read the same address on every iteration.
 * Confirm the increments exist in the full file.
 */
152 inputVal1 = _mm_loadu_si128((__m128i*)inputPtr);
154 inputVal2 = _mm_loadu_si128((__m128i*)inputPtr);
/* Arithmetic shift keeps the sign while moving the high byte into the low byte. */
157 inputVal1 = _mm_srai_epi16(inputVal1, 8);
158 inputVal2 = _mm_srai_epi16(inputVal2, 8);
/* Narrow both registers into a single register of 16 signed bytes. */
160 ret = _mm_packs_epi16(inputVal1, inputVal2);
162 _mm_storeu_si128((__m128i*)outputVectorPtr, ret);
164 outputVectorPtr += 16;
/* Scalar tail for the samples left over after the last full group of 16. */
167 number = sixteenthPoints * 16;
168 for (; number < num_points; number++) {
169 outputVector[number] = (int8_t)(inputVector[number] >> 8);
175#ifdef LV_HAVE_GENERIC
/*
 * Portable scalar conversion of 16-bit samples to 8-bit samples: every sample
 * is shifted right by 8 and the result is stored as an int8_t.
 *
 * NOTE(review): the line carrying the function name and first parameter is not
 * present in this excerpt; outputVector is used below as the int8_t
 * destination.
 */
178 const int16_t* inputVector,
179 unsigned int num_points)
181 int8_t* outputVectorPtr = outputVector;
182 const int16_t* inputVectorPtr = inputVector;
183 unsigned int number = 0;
/* One sample per iteration; both cursors advance by one element each time. */
185 for (number = 0; number < num_points; number++) {
186 *outputVectorPtr++ = ((int8_t)(*inputVectorPtr++ >> 8));
193#ifndef INCLUDED_volk_16i_convert_8i_a_H
194#define INCLUDED_volk_16i_convert_8i_a_H
200#include <immintrin.h>
/*!
 * \brief Converts 16-bit samples to 8-bit samples by keeping the upper byte of
 *        each sample (arithmetic shift right by 8). AVX2 kernel using aligned
 *        loads and stores.
 *
 * \param outputVector destination buffer, at least num_points int8_t values;
 *                     must be 32-byte aligned (aligned 256-bit stores)
 * \param inputVector  source buffer, at least num_points int16_t values;
 *                     must be 32-byte aligned (aligned 256-bit loads)
 * \param num_points   number of samples to convert
 */
static inline void volk_16i_convert_8i_a_avx2(int8_t* outputVector,
                                              const int16_t* inputVector,
                                              unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int thirtysecondPoints = num_points / 32;

    int8_t* outputVectorPtr = outputVector;
    const int16_t* inputPtr = inputVector;
    __m256i inputVal1;
    __m256i inputVal2;
    __m256i ret;

    for (; number < thirtysecondPoints; number++) {
        /* Each iteration consumes 32 samples: two consecutive 16-sample registers. */
        inputVal1 = _mm256_load_si256((const __m256i*)inputPtr);
        inputVal2 = _mm256_load_si256((const __m256i*)(inputPtr + 16));
        inputPtr += 32;

        /* Arithmetic shift keeps the sign and moves the high byte into the low byte,
         * so the saturating pack below never actually clamps. */
        inputVal1 = _mm256_srai_epi16(inputVal1, 8);
        inputVal2 = _mm256_srai_epi16(inputVal2, 8);

        /* The pack works per 128-bit lane, leaving the 64-bit quarters ordered
         * [a0 b0 a1 b1]; the permute (selector 0xD8 = quarters 0,2,1,3) restores
         * source order [a0 a1 b0 b1]. */
        ret = _mm256_packs_epi16(inputVal1, inputVal2);
        ret = _mm256_permute4x64_epi64(ret, 0xD8);

        _mm256_store_si256((__m256i*)outputVectorPtr, ret);
        outputVectorPtr += 32;
    }

    /* Scalar tail for the samples left over after the last full group of 32. */
    number = thirtysecondPoints * 32;
    for (; number < num_points; number++) {
        outputVector[number] = (int8_t)(inputVector[number] >> 8);
    }
}
241#ifdef LV_HAVE_AVX512BW
242#include <immintrin.h>
/*!
 * \brief Converts 16-bit samples to 8-bit samples by keeping the upper byte of
 *        each sample (arithmetic shift right by 8). AVX-512BW kernel using
 *        aligned loads and stores.
 *
 * \param outputVector destination buffer, at least num_points int8_t values;
 *                     must be 32-byte aligned (aligned 256-bit stores)
 * \param inputVector  source buffer, at least num_points int16_t values;
 *                     must be 64-byte aligned (aligned 512-bit loads)
 * \param num_points   number of samples to convert
 */
static inline void volk_16i_convert_8i_a_avx512bw(int8_t* outputVector,
                                                  const int16_t* inputVector,
                                                  unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int sixtyfourthPoints = num_points / 64;

    int8_t* outputVectorPtr = outputVector;
    const int16_t* inputPtr = inputVector;
    __m512i inputVal1, inputVal2;
    __m512i shifted1, shifted2;
    __m256i ret1, ret2;

    for (; number < sixtyfourthPoints; number++) {
        /* Each iteration consumes 64 samples: two consecutive 32-sample registers. */
        inputVal1 = _mm512_load_si512((const __m512i*)inputPtr);
        inputVal2 = _mm512_load_si512((const __m512i*)(inputPtr + 32));
        inputPtr += 64;

        /* Arithmetic shift keeps the sign and moves the high byte into the low byte. */
        shifted1 = _mm512_srai_epi16(inputVal1, 8);
        shifted2 = _mm512_srai_epi16(inputVal2, 8);

        /* Narrow 32 x int16 to 32 x int8 in element order; values already fit,
         * so the saturation never clamps. */
        ret1 = _mm512_cvtsepi16_epi8(shifted1);
        ret2 = _mm512_cvtsepi16_epi8(shifted2);

        _mm256_store_si256((__m256i*)outputVectorPtr, ret1);
        _mm256_store_si256((__m256i*)(outputVectorPtr + 32), ret2);
        outputVectorPtr += 64;
    }

    /* Scalar tail for the samples left over after the last full group of 64. */
    number = sixtyfourthPoints * 64;
    for (; number < num_points; number++) {
        outputVector[number] = (int8_t)(inputVector[number] >> 8);
    }
}
287#include <emmintrin.h>
/*
 * SSE2 conversion of 16-bit samples to 8-bit samples using the aligned
 * load/store intrinsics (which require 16-byte aligned addresses): each sample
 * is shifted right arithmetically by 8 so only its upper byte is kept.
 *
 * NOTE(review): the line carrying the function name and first parameter is not
 * present in this excerpt; outputVector is used below as the int8_t
 * destination. The declarations of inputVal1, inputVal2 and ret are likewise
 * not visible here.
 */
290 const int16_t* inputVector,
291 unsigned int num_points)
293 unsigned int number = 0;
/* Number of full 16-sample groups handled by the vector loop. */
294 const unsigned int sixteenthPoints = num_points / 16;
296 int8_t* outputVectorPtr = outputVector;
297 int16_t* inputPtr = (int16_t*)inputVector;
302 for (; number < sixteenthPoints; number++) {
/*
 * Two aligned 128-bit loads (8 samples each).
 * NOTE(review): no advance of inputPtr is visible between or after these loads
 * in this excerpt, so as shown both read the same address on every iteration.
 * Confirm the increments exist in the full file.
 */
305 inputVal1 = _mm_load_si128((__m128i*)inputPtr);
307 inputVal2 = _mm_load_si128((__m128i*)inputPtr);
/* Arithmetic shift keeps the sign while moving the high byte into the low byte. */
310 inputVal1 = _mm_srai_epi16(inputVal1, 8);
311 inputVal2 = _mm_srai_epi16(inputVal2, 8);
/* Narrow both registers into a single register of 16 signed bytes. */
313 ret = _mm_packs_epi16(inputVal1, inputVal2);
315 _mm_store_si128((__m128i*)outputVectorPtr, ret);
317 outputVectorPtr += 16;
/* Scalar tail for the samples left over after the last full group of 16. */
320 number = sixteenthPoints * 16;
321 for (; number < num_points; number++) {
322 outputVector[number] = (int8_t)(inputVector[number] >> 8);
/*
 * NEON conversion of 16-bit samples to 8-bit samples, 16 samples per vector
 * iteration, followed by a scalar loop for the remainder.
 *
 * NOTE(review): the line carrying the function name and first parameter is not
 * present in this excerpt; outputVector is used below as the int8_t
 * destination. The declarations of inputVal0/inputVal1, outputVal0/outputVal1
 * and outputVal are likewise not visible here.
 */
332 const int16_t* inputVector,
333 unsigned int num_points)
335 int8_t* outputVectorPtr = outputVector;
336 const int16_t* inputVectorPtr = inputVector;
337 unsigned int number = 0;
/* Number of full 16-sample groups handled by the vector loop. */
338 unsigned int sixteenth_points = num_points / 16;
346 for (number = 0; number < sixteenth_points; number++) {
/* Load two adjacent groups of 8 samples. */
348 inputVal0 = vld1q_s16(inputVectorPtr);
349 inputVal1 = vld1q_s16(inputVectorPtr + 8);
/* Shift right by 8 and narrow to 8 bits, i.e. keep each sample's upper byte. */
351 outputVal0 = vshrn_n_s16(inputVal0, 8);
352 outputVal1 = vshrn_n_s16(inputVal1, 8);
/* Join the two 8-byte halves and store all 16 results at once. */
354 outputVal = vcombine_s8(outputVal0, outputVal1);
355 vst1q_s8(outputVectorPtr, outputVal);
356 inputVectorPtr += 16;
357 outputVectorPtr += 16;
/* Scalar tail; the cursors already point just past the vectorised samples. */
360 for (number = sixteenth_points * 16; number < num_points; number++) {
361 *outputVectorPtr++ = ((int8_t)(*inputVectorPtr++ >> 8));
/*!
 * \brief Converts 16-bit samples to 8-bit samples by keeping the upper byte of
 *        each sample. NEON kernel processing 32 samples per iteration.
 *
 * \param outputVector destination buffer, at least num_points int8_t values
 * \param inputVector  source buffer, at least num_points int16_t values
 * \param num_points   number of samples to convert
 */
static inline void volk_16i_convert_8i_neonv8(int8_t* outputVector,
                                              const int16_t* inputVector,
                                              unsigned int num_points)
{
    int8_t* outputVectorPtr = outputVector;
    const int16_t* inputVectorPtr = inputVector;
    const unsigned int thirtysecondPoints = num_points / 32;

    for (unsigned int number = 0; number < thirtysecondPoints; number++) {
        /* Four adjacent groups of 8 samples. */
        int16x8_t in0 = vld1q_s16(inputVectorPtr);
        int16x8_t in1 = vld1q_s16(inputVectorPtr + 8);
        int16x8_t in2 = vld1q_s16(inputVectorPtr + 16);
        int16x8_t in3 = vld1q_s16(inputVectorPtr + 24);

        /* Shift right by 8 and narrow to 8 bits, i.e. keep each sample's upper byte. */
        int8x8_t out0 = vshrn_n_s16(in0, 8);
        int8x8_t out1 = vshrn_n_s16(in1, 8);
        int8x8_t out2 = vshrn_n_s16(in2, 8);
        int8x8_t out3 = vshrn_n_s16(in3, 8);

        /* Two 16-byte stores cover the 32 results of this iteration. */
        vst1q_s8(outputVectorPtr, vcombine_s8(out0, out1));
        vst1q_s8(outputVectorPtr + 16, vcombine_s8(out2, out3));

        inputVectorPtr += 32;
        outputVectorPtr += 32;
    }

    /* Scalar tail; the cursors already point just past the vectorised samples. */
    for (unsigned int number = thirtysecondPoints * 32; number < num_points; number++) {
        *outputVectorPtr++ = ((int8_t)(*inputVectorPtr++ >> 8));
    }
}
403#include <riscv_vector.h>
/*
 * RISC-V Vector conversion of 16-bit samples to 8-bit samples.
 *
 * outputVector: destination for the int8_t results
 * inputVector:  source int16_t samples
 * num_points:   number of samples to convert
 *
 * NOTE(review): the closing of this definition is not visible in this excerpt.
 */
405static inline void volk_16i_convert_8i_rvv(int8_t* outputVector,
406 const int16_t* inputVector,
407 unsigned int num_points)
/* n counts the samples still to be converted. */
409 size_t n = num_points;
/* Each pass handles vl samples, then moves both pointers and n forward by vl. */
410 for (
size_t vl; n > 0; n -= vl, inputVector += vl, outputVector += vl) {
/* Ask for the number of 16-bit elements to process this pass (at most n). */
411 vl = __riscv_vsetvl_e16m8(n);
412 vint16m8_t v = __riscv_vle16_v_i16m8(inputVector, vl);
/* Narrowing arithmetic shift right by 8, stored as vl 8-bit elements. */
413 __riscv_vse8(outputVector, __riscv_vnsra(v, 8, vl), vl);