40#ifndef INCLUDED_volk_8i_convert_16i_u_H
41#define INCLUDED_volk_8i_convert_16i_u_H
49static inline void volk_8i_convert_16i_u_avx2(int16_t* outputVector,
50 const int8_t* inputVector,
51 unsigned int num_points)
53 unsigned int number = 0;
54 const unsigned int sixteenthPoints = num_points / 16;
56 const __m128i* inputVectorPtr = (
const __m128i*)inputVector;
57 __m256i* outputVectorPtr = (__m256i*)outputVector;
61 for (; number < sixteenthPoints; number++) {
62 inputVal = _mm_loadu_si128(inputVectorPtr);
63 ret = _mm256_cvtepi8_epi16(inputVal);
64 ret = _mm256_slli_epi16(ret, 8);
65 _mm256_storeu_si256(outputVectorPtr, ret);
71 number = sixteenthPoints * 16;
72 for (; number < num_points; number++) {
73 outputVector[number] = (int16_t)(inputVector[number]) * 256;
78#ifdef LV_HAVE_AVX512BW
81static inline void volk_8i_convert_16i_u_avx512bw(int16_t* outputVector,
82 const int8_t* inputVector,
83 unsigned int num_points)
85 unsigned int number = 0;
86 const unsigned int thirtysecondPoints = num_points / 32;
88 const __m256i* inputVectorPtr = (
const __m256i*)inputVector;
89 __m512i* outputVectorPtr = (__m512i*)outputVector;
93 for (; number < thirtysecondPoints; number++) {
94 inputVal = _mm256_loadu_si256(inputVectorPtr);
95 ret = _mm512_cvtepi8_epi16(inputVal);
96 ret = _mm512_slli_epi16(ret, 8);
97 _mm512_storeu_si512(outputVectorPtr, ret);
103 number = thirtysecondPoints * 32;
104 for (; number < num_points; number++) {
105 outputVector[number] = (int16_t)(inputVector[number]) * 256;
112#include <smmintrin.h>
114static inline void volk_8i_convert_16i_u_sse4_1(int16_t* outputVector,
115 const int8_t* inputVector,
116 unsigned int num_points)
118 unsigned int number = 0;
119 const unsigned int sixteenthPoints = num_points / 16;
121 const __m128i* inputVectorPtr = (
const __m128i*)inputVector;
122 __m128i* outputVectorPtr = (__m128i*)outputVector;
126 for (; number < sixteenthPoints; number++) {
127 inputVal = _mm_loadu_si128(inputVectorPtr);
128 ret = _mm_cvtepi8_epi16(inputVal);
129 ret = _mm_slli_epi16(ret, 8);
130 _mm_storeu_si128(outputVectorPtr, ret);
134 inputVal = _mm_srli_si128(inputVal, 8);
135 ret = _mm_cvtepi8_epi16(inputVal);
136 ret = _mm_slli_epi16(ret, 8);
137 _mm_storeu_si128(outputVectorPtr, ret);
144 number = sixteenthPoints * 16;
145 for (; number < num_points; number++) {
146 outputVector[number] = (int16_t)(inputVector[number]) * 256;
152#ifdef LV_HAVE_GENERIC
/*
 * Convert int8 samples to int16 samples scaled by 256 (portable C).
 *
 * Every output element equals the matching input element multiplied by 256.
 * Nothing is read or written when num_points is 0.
 *
 * outputVector: destination, room for num_points int16_t values
 * inputVector:  source, num_points int8_t values
 * num_points:   number of samples to convert
 */
static inline void volk_8i_convert_16i_generic(int16_t* outputVector,
                                               const int8_t* inputVector,
                                               unsigned int num_points)
{
    int16_t* outputVectorPtr = outputVector;
    const int8_t* inputVectorPtr = inputVector;
    unsigned int number = 0;

    for (number = 0; number < num_points; number++) {
        *outputVectorPtr++ = ((int16_t)(*inputVectorPtr++)) * 256;
    }
}
172#ifndef INCLUDED_volk_8i_convert_16i_a_H
173#define INCLUDED_volk_8i_convert_16i_a_H
179#include <immintrin.h>
181static inline void volk_8i_convert_16i_a_avx2(int16_t* outputVector,
182 const int8_t* inputVector,
183 unsigned int num_points)
185 unsigned int number = 0;
186 const unsigned int sixteenthPoints = num_points / 16;
188 const __m128i* inputVectorPtr = (
const __m128i*)inputVector;
189 __m256i* outputVectorPtr = (__m256i*)outputVector;
193 for (; number < sixteenthPoints; number++) {
194 inputVal = _mm_load_si128(inputVectorPtr);
195 ret = _mm256_cvtepi8_epi16(inputVal);
196 ret = _mm256_slli_epi16(ret, 8);
197 _mm256_store_si256(outputVectorPtr, ret);
203 number = sixteenthPoints * 16;
204 for (; number < num_points; number++) {
205 outputVector[number] = (int16_t)(inputVector[number]) * 256;
210#ifdef LV_HAVE_AVX512BW
211#include <immintrin.h>
213static inline void volk_8i_convert_16i_a_avx512bw(int16_t* outputVector,
214 const int8_t* inputVector,
215 unsigned int num_points)
217 unsigned int number = 0;
218 const unsigned int thirtysecondPoints = num_points / 32;
220 const __m256i* inputVectorPtr = (
const __m256i*)inputVector;
221 __m512i* outputVectorPtr = (__m512i*)outputVector;
225 for (; number < thirtysecondPoints; number++) {
226 inputVal = _mm256_load_si256(inputVectorPtr);
227 ret = _mm512_cvtepi8_epi16(inputVal);
228 ret = _mm512_slli_epi16(ret, 8);
229 _mm512_store_si512(outputVectorPtr, ret);
235 number = thirtysecondPoints * 32;
236 for (; number < num_points; number++) {
237 outputVector[number] = (int16_t)(inputVector[number]) * 256;
244#include <smmintrin.h>
246static inline void volk_8i_convert_16i_a_sse4_1(int16_t* outputVector,
247 const int8_t* inputVector,
248 unsigned int num_points)
250 unsigned int number = 0;
251 const unsigned int sixteenthPoints = num_points / 16;
253 const __m128i* inputVectorPtr = (
const __m128i*)inputVector;
254 __m128i* outputVectorPtr = (__m128i*)outputVector;
258 for (; number < sixteenthPoints; number++) {
259 inputVal = _mm_load_si128(inputVectorPtr);
260 ret = _mm_cvtepi8_epi16(inputVal);
261 ret = _mm_slli_epi16(ret, 8);
262 _mm_store_si128(outputVectorPtr, ret);
266 inputVal = _mm_srli_si128(inputVal, 8);
267 ret = _mm_cvtepi8_epi16(inputVal);
268 ret = _mm_slli_epi16(ret, 8);
269 _mm_store_si128(outputVectorPtr, ret);
276 number = sixteenthPoints * 16;
277 for (; number < num_points; number++) {
278 outputVector[number] = (int16_t)(inputVector[number]) * 256;
288 const int8_t* inputVector,
289 unsigned int num_points)
291 int16_t* outputVectorPtr = outputVector;
292 const int8_t* inputVectorPtr = inputVector;
294 const unsigned int eighth_points = num_points / 8;
297 int16x8_t converted_vec;
302 for (number = 0; number < eighth_points; ++number) {
303 input_vec = vld1_s8(inputVectorPtr);
304 converted_vec = vmovl_s8(input_vec);
306 converted_vec = vshlq_n_s16(converted_vec, 8);
307 vst1q_s16(outputVectorPtr, converted_vec);
310 outputVectorPtr += 8;
313 for (number = eighth_points * 8; number < num_points; number++) {
314 *outputVectorPtr++ = ((int16_t)(*inputVectorPtr++)) * 256;
322static inline void volk_8i_convert_16i_neonv8(int16_t* outputVector,
323 const int8_t* inputVector,
324 unsigned int num_points)
326 int16_t* outputVectorPtr = outputVector;
327 const int8_t* inputVectorPtr = inputVector;
328 const unsigned int sixteenthPoints = num_points / 16;
330 for (
unsigned int number = 0; number < sixteenthPoints; number++) {
331 int8x16_t in = vld1q_s8(inputVectorPtr);
334 int16x8_t out_lo = vshll_n_s8(vget_low_s8(in), 8);
335 int16x8_t out_hi = vshll_n_s8(vget_high_s8(in), 8);
337 vst1q_s16(outputVectorPtr, out_lo);
338 vst1q_s16(outputVectorPtr + 8, out_hi);
340 inputVectorPtr += 16;
341 outputVectorPtr += 16;
344 for (
unsigned int number = sixteenthPoints * 16; number < num_points; number++) {
345 *outputVectorPtr++ = ((int16_t)(*inputVectorPtr++)) * 256;
/*
 * Externally defined conversion routine that the ORC wrapper forwards to.
 *
 * NOTE(review): the type of the last parameter mirrors the argument passed by
 * volk_8i_convert_16i_u_orc (unsigned int) -- confirm it matches the actual
 * definition of this symbol.
 */
extern void volk_8i_convert_16i_a_orc_impl(int16_t* outputVector,
                                           const int8_t* inputVector,
                                           unsigned int num_points);
/*
 * ORC entry point: forwards all three arguments unchanged to
 * volk_8i_convert_16i_a_orc_impl.
 *
 * outputVector: destination, room for num_points int16_t values
 * inputVector:  source, num_points int8_t values
 * num_points:   number of samples to convert
 */
static inline void volk_8i_convert_16i_u_orc(int16_t* outputVector,
                                             const int8_t* inputVector,
                                             unsigned int num_points)
{
    volk_8i_convert_16i_a_orc_impl(outputVector, inputVector, num_points);
}
365#include <riscv_vector.h>
367static inline void volk_8i_convert_16i_rvv(int16_t* outputVector,
368 const int8_t* inputVector,
369 unsigned int num_points)
371 size_t n = num_points;
372 for (
size_t vl; n > 0; n -= vl, inputVector += vl, outputVector += vl) {
373 vl = __riscv_vsetvl_e8m4(n);
374 vint16m8_t v = __riscv_vsext_vf2(__riscv_vle8_v_i8m4(inputVector, vl), vl);
375 __riscv_vse16(outputVector, __riscv_vsll(v, 8, vl), vl);