Vector Optimized Library of Kernels 3.3.0
Architecture-tuned implementations of math kernels
Loading...
Searching...
No Matches
volk_8i_convert_16i.h
Go to the documentation of this file.
1/* -*- c++ -*- */
2/*
3 * Copyright 2012, 2014 Free Software Foundation, Inc.
4 *
5 * This file is part of VOLK
6 *
7 * SPDX-License-Identifier: LGPL-3.0-or-later
8 */
9
39
40#ifndef INCLUDED_volk_8i_convert_16i_u_H
41#define INCLUDED_volk_8i_convert_16i_u_H
42
43#include <inttypes.h>
44#include <stdio.h>
45
46#ifdef LV_HAVE_AVX2
47#include <immintrin.h>
48
/*
 * Convert signed 8-bit samples to signed 16-bit samples scaled by 256
 * (AVX2, unaligned buffers).
 *
 * outputVector: destination, receives num_points int16_t values.
 * inputVector:  source, num_points int8_t values.
 * num_points:   number of samples to convert.
 *
 * Unaligned load/store intrinsics are used, so neither buffer needs any
 * particular alignment.
 */
static inline void volk_8i_convert_16i_u_avx2(int16_t* outputVector,
                                              const int8_t* inputVector,
                                              unsigned int num_points)
{
    const unsigned int num_blocks = num_points / 16;
    const int8_t* src = inputVector;
    int16_t* dst = outputVector;

    /* Vector body: 16 samples per iteration. */
    for (unsigned int block = 0; block < num_blocks; block++) {
        const __m128i bytes = _mm_loadu_si128((const __m128i*)src);
        /* Sign-extend to 16 bits, then shift left by 8 (i.e. multiply by 256). */
        const __m256i widened = _mm256_cvtepi8_epi16(bytes);
        _mm256_storeu_si256((__m256i*)dst, _mm256_slli_epi16(widened, 8));

        src += 16;
        dst += 16;
    }

    /* Scalar tail: the remaining (num_points % 16) samples. */
    for (unsigned int i = num_blocks * 16; i < num_points; i++) {
        outputVector[i] = (int16_t)(inputVector[i] * 256);
    }
}
76#endif /* LV_HAVE_AVX2 */
77
78#ifdef LV_HAVE_AVX512BW
79#include <immintrin.h>
80
/*
 * Convert signed 8-bit samples to signed 16-bit samples scaled by 256
 * (AVX-512BW, unaligned buffers).
 *
 * outputVector: destination, receives num_points int16_t values.
 * inputVector:  source, num_points int8_t values.
 * num_points:   number of samples to convert.
 *
 * Unaligned load/store intrinsics are used, so neither buffer needs any
 * particular alignment.
 */
static inline void volk_8i_convert_16i_u_avx512bw(int16_t* outputVector,
                                                  const int8_t* inputVector,
                                                  unsigned int num_points)
{
    const unsigned int num_blocks = num_points / 32;
    unsigned int done = 0;

    /* Vector body: 32 samples per iteration. */
    for (unsigned int block = 0; block < num_blocks; block++, done += 32) {
        const __m256i bytes = _mm256_loadu_si256((const __m256i*)(inputVector + done));
        /* Sign-extend to 16 bits, then shift left by 8 (i.e. multiply by 256). */
        const __m512i scaled = _mm512_slli_epi16(_mm512_cvtepi8_epi16(bytes), 8);
        _mm512_storeu_si512((__m512i*)(outputVector + done), scaled);
    }

    /* Scalar tail: the remaining (num_points % 32) samples. */
    for (; done < num_points; done++) {
        outputVector[done] = (int16_t)(inputVector[done] * 256);
    }
}
108#endif /* LV_HAVE_AVX512BW */
109
110
111#ifdef LV_HAVE_SSE4_1
112#include <smmintrin.h>
113
/*
 * Convert signed 8-bit samples to signed 16-bit samples scaled by 256
 * (SSE4.1, unaligned buffers).
 *
 * outputVector: destination, receives num_points int16_t values.
 * inputVector:  source, num_points int8_t values.
 * num_points:   number of samples to convert.
 *
 * Unaligned load/store intrinsics are used, so neither buffer needs any
 * particular alignment.
 */
static inline void volk_8i_convert_16i_u_sse4_1(int16_t* outputVector,
                                                const int8_t* inputVector,
                                                unsigned int num_points)
{
    const unsigned int num_blocks = num_points / 16;
    const int8_t* src = inputVector;
    int16_t* dst = outputVector;

    /* Vector body: 16 samples per iteration, widened as two groups of 8. */
    for (unsigned int block = 0; block < num_blocks; block++) {
        const __m128i bytes = _mm_loadu_si128((const __m128i*)src);

        /* Low 8 bytes: sign-extend, then shift left by 8 (multiply by 256). */
        const __m128i low = _mm_slli_epi16(_mm_cvtepi8_epi16(bytes), 8);
        _mm_storeu_si128((__m128i*)dst, low);

        /* Move the high 8 bytes into the low half and repeat. */
        const __m128i upper_bytes = _mm_srli_si128(bytes, 8);
        const __m128i high = _mm_slli_epi16(_mm_cvtepi8_epi16(upper_bytes), 8);
        _mm_storeu_si128((__m128i*)(dst + 8), high);

        src += 16;
        dst += 16;
    }

    /* Scalar tail: the remaining (num_points % 16) samples. */
    for (unsigned int i = num_blocks * 16; i < num_points; i++) {
        outputVector[i] = (int16_t)(inputVector[i] * 256);
    }
}
149#endif /* LV_HAVE_SSE4_1 */
150
151
152#ifdef LV_HAVE_GENERIC
153
/*
 * Convert signed 8-bit samples to signed 16-bit samples scaled by 256
 * (portable scalar implementation).
 *
 * outputVector: destination, receives num_points int16_t values.
 * inputVector:  source, num_points int8_t values.
 * num_points:   number of samples to convert.
 */
static inline void volk_8i_convert_16i_generic(int16_t* outputVector,
                                               const int8_t* inputVector,
                                               unsigned int num_points)
{
    for (unsigned int i = 0; i < num_points; i++) {
        /* int8 * 256 spans [-32768, 32512], so it always fits in int16_t. */
        outputVector[i] = (int16_t)(inputVector[i] * 256);
    }
}
166#endif /* LV_HAVE_GENERIC */
167
168
169#endif /* INCLUDED_volk_8i_convert_16i_u_H */
170
171
172#ifndef INCLUDED_volk_8i_convert_16i_a_H
173#define INCLUDED_volk_8i_convert_16i_a_H
174
175#include <inttypes.h>
176#include <stdio.h>
177
178#ifdef LV_HAVE_AVX2
179#include <immintrin.h>
180
/*
 * Convert signed 8-bit samples to signed 16-bit samples scaled by 256
 * (AVX2, aligned buffers).
 *
 * outputVector: destination, receives num_points int16_t values.
 * inputVector:  source, num_points int8_t values.
 * num_points:   number of samples to convert.
 *
 * Aligned load/store intrinsics are used: inputVector is read with a
 * 128-bit aligned load and outputVector is written with a 256-bit aligned
 * store, so the buffers must be 16-byte and 32-byte aligned respectively.
 */
static inline void volk_8i_convert_16i_a_avx2(int16_t* outputVector,
                                              const int8_t* inputVector,
                                              unsigned int num_points)
{
    const unsigned int num_blocks = num_points / 16;

    /* Vector body: 16 samples per iteration. */
    for (unsigned int block = 0; block < num_blocks; block++) {
        const unsigned int offset = block * 16;
        const __m128i bytes = _mm_load_si128((const __m128i*)(inputVector + offset));
        /* Sign-extend to 16 bits, then shift left by 8 (i.e. multiply by 256). */
        const __m256i scaled = _mm256_slli_epi16(_mm256_cvtepi8_epi16(bytes), 8);
        _mm256_store_si256((__m256i*)(outputVector + offset), scaled);
    }

    /* Scalar tail: the remaining (num_points % 16) samples. */
    for (unsigned int i = num_blocks * 16; i < num_points; i++) {
        outputVector[i] = (int16_t)(inputVector[i] * 256);
    }
}
208#endif /* LV_HAVE_AVX2 */
209
210#ifdef LV_HAVE_AVX512BW
211#include <immintrin.h>
212
/*
 * Convert signed 8-bit samples to signed 16-bit samples scaled by 256
 * (AVX-512BW, aligned buffers).
 *
 * outputVector: destination, receives num_points int16_t values.
 * inputVector:  source, num_points int8_t values.
 * num_points:   number of samples to convert.
 *
 * Aligned load/store intrinsics are used: inputVector is read with a
 * 256-bit aligned load and outputVector is written with a 512-bit aligned
 * store, so the buffers must be 32-byte and 64-byte aligned respectively.
 */
static inline void volk_8i_convert_16i_a_avx512bw(int16_t* outputVector,
                                                  const int8_t* inputVector,
                                                  unsigned int num_points)
{
    const unsigned int num_blocks = num_points / 32;
    const int8_t* src = inputVector;
    int16_t* dst = outputVector;

    /* Vector body: 32 samples per iteration. */
    for (unsigned int block = 0; block < num_blocks; block++) {
        const __m256i bytes = _mm256_load_si256((const __m256i*)src);
        /* Sign-extend to 16 bits, then shift left by 8 (i.e. multiply by 256). */
        const __m512i widened = _mm512_cvtepi8_epi16(bytes);
        _mm512_store_si512((__m512i*)dst, _mm512_slli_epi16(widened, 8));

        src += 32;
        dst += 32;
    }

    /* Scalar tail: the remaining (num_points % 32) samples. */
    for (unsigned int i = num_blocks * 32; i < num_points; i++) {
        outputVector[i] = (int16_t)(inputVector[i] * 256);
    }
}
240#endif /* LV_HAVE_AVX512BW */
241
242
243#ifdef LV_HAVE_SSE4_1
244#include <smmintrin.h>
245
/*
 * Convert signed 8-bit samples to signed 16-bit samples scaled by 256
 * (SSE4.1, aligned buffers).
 *
 * outputVector: destination, receives num_points int16_t values.
 * inputVector:  source, num_points int8_t values.
 * num_points:   number of samples to convert.
 *
 * Aligned 128-bit load/store intrinsics are used, so both buffers must be
 * 16-byte aligned.
 */
static inline void volk_8i_convert_16i_a_sse4_1(int16_t* outputVector,
                                                const int8_t* inputVector,
                                                unsigned int num_points)
{
    const unsigned int num_blocks = num_points / 16;
    unsigned int done = 0;

    /* Vector body: 16 samples per iteration, widened as two groups of 8. */
    for (unsigned int block = 0; block < num_blocks; block++, done += 16) {
        const __m128i bytes = _mm_load_si128((const __m128i*)(inputVector + done));

        /* Low 8 bytes: sign-extend, then shift left by 8 (multiply by 256). */
        const __m128i low = _mm_slli_epi16(_mm_cvtepi8_epi16(bytes), 8);
        _mm_store_si128((__m128i*)(outputVector + done), low);

        /* Move the high 8 bytes into the low half and repeat. */
        const __m128i upper_bytes = _mm_srli_si128(bytes, 8);
        const __m128i high = _mm_slli_epi16(_mm_cvtepi8_epi16(upper_bytes), 8);
        _mm_store_si128((__m128i*)(outputVector + done + 8), high);
    }

    /* Scalar tail: the remaining (num_points % 16) samples. */
    for (; done < num_points; done++) {
        outputVector[done] = (int16_t)(inputVector[done] * 256);
    }
}
281#endif /* LV_HAVE_SSE4_1 */
282
283
284#ifdef LV_HAVE_NEON
285#include <arm_neon.h>
286
/*
 * Convert signed 8-bit samples to signed 16-bit samples scaled by 256 (NEON).
 *
 * outputVector: destination, receives num_points int16_t values.
 * inputVector:  source, num_points int8_t values.
 * num_points:   number of samples to convert.
 */
static inline void volk_8i_convert_16i_neon(int16_t* outputVector,
                                            const int8_t* inputVector,
                                            unsigned int num_points)
{
    const unsigned int num_blocks = num_points / 8;
    unsigned int done = 0;

    /*
     * Vector body: 8 samples per iteration. The 8-bit lanes are first widened
     * to 16 bits (vmovl) and then shifted left by 8, i.e. multiplied by 256.
     */
    for (unsigned int block = 0; block < num_blocks; block++, done += 8) {
        const int8x8_t bytes = vld1_s8(inputVector + done);
        const int16x8_t widened = vmovl_s8(bytes);
        vst1q_s16(outputVector + done, vshlq_n_s16(widened, 8));
    }

    /* Scalar tail: the remaining (num_points % 8) samples. */
    for (; done < num_points; done++) {
        outputVector[done] = (int16_t)(inputVector[done] * 256);
    }
}
317#endif /* LV_HAVE_NEON */
318
319#ifdef LV_HAVE_NEONV8
320#include <arm_neon.h>
321
/*
 * Convert signed 8-bit samples to signed 16-bit samples scaled by 256
 * (NEON, 16 samples per iteration).
 *
 * outputVector: destination, receives num_points int16_t values.
 * inputVector:  source, num_points int8_t values.
 * num_points:   number of samples to convert.
 */
static inline void volk_8i_convert_16i_neonv8(int16_t* outputVector,
                                              const int8_t* inputVector,
                                              unsigned int num_points)
{
    const unsigned int num_blocks = num_points / 16;
    unsigned int done = 0;

    for (unsigned int block = 0; block < num_blocks; block++, done += 16) {
        const int8_t* src = inputVector + done;
        int16_t* dst = outputVector + done;

        const int8x16_t bytes = vld1q_s8(src);
        /* Prefetch hint for data 32 bytes past the start of this block. */
        __VOLK_PREFETCH(src + 32);

        /* Widening shift: sign-extend each half and shift left by 8 in one step. */
        vst1q_s16(dst, vshll_n_s8(vget_low_s8(bytes), 8));
        vst1q_s16(dst + 8, vshll_n_s8(vget_high_s8(bytes), 8));
    }

    /* Scalar tail: the remaining (num_points % 16) samples. */
    for (; done < num_points; done++) {
        outputVector[done] = (int16_t)(inputVector[done] * 256);
    }
}
348#endif /* LV_HAVE_NEONV8 */
349
350
351#ifdef LV_HAVE_ORC
/*
 * Conversion routine defined outside this header (presumably the ORC-built
 * kernel -- confirm against the build). Note that it takes the sample count
 * as a signed int.
 */
extern void volk_8i_convert_16i_a_orc_impl(int16_t* outputVector,
                                           const int8_t* inputVector,
                                           int num_points);

/*
 * Thin wrapper exposing the external ORC implementation under the common
 * kernel signature.
 *
 * outputVector: destination, receives num_points int16_t values.
 * inputVector:  source, num_points int8_t values.
 * num_points:   number of samples to convert; converted to int for the callee.
 */
static inline void volk_8i_convert_16i_u_orc(int16_t* outputVector,
                                             const int8_t* inputVector,
                                             unsigned int num_points)
{
    const int count = (int)num_points;

    volk_8i_convert_16i_a_orc_impl(outputVector, inputVector, count);
}
362#endif /* LV_HAVE_ORC */
363
364#ifdef LV_HAVE_RVV
365#include <riscv_vector.h>
366
/*
 * Convert signed 8-bit samples to signed 16-bit samples scaled by 256
 * (RISC-V Vector extension).
 *
 * outputVector: destination, receives num_points int16_t values.
 * inputVector:  source, num_points int8_t values.
 * num_points:   number of samples to convert.
 */
static inline void volk_8i_convert_16i_rvv(int16_t* outputVector,
                                           const int8_t* inputVector,
                                           unsigned int num_points)
{
    size_t remaining = num_points;

    while (remaining > 0) {
        /* Ask the hardware how many 8-bit elements (LMUL=4) to handle this pass. */
        size_t vl = __riscv_vsetvl_e8m4(remaining);

        /* Load, sign-extend to 16 bits, then shift left by 8 (multiply by 256). */
        vint8m4_t bytes = __riscv_vle8_v_i8m4(inputVector, vl);
        vint16m8_t widened = __riscv_vsext_vf2(bytes, vl);
        __riscv_vse16(outputVector, __riscv_vsll(widened, 8, vl), vl);

        remaining -= vl;
        inputVector += vl;
        outputVector += vl;
    }
}
378#endif /*LV_HAVE_RVV*/
379
380#endif /* INCLUDED_volk_8i_convert_16i_a_H */