Vector Optimized Library of Kernels 3.3.0
Architecture-tuned implementations of math kernels
Loading...
Searching...
No Matches
volk_8i_x2_add_saturated_8i.h
Go to the documentation of this file.
/* -*- c++ -*- */
/*
 * Copyright 2025 Magnus Lundmark <magnuslundmark@gmail.com>
 *
 * This file is part of VOLK
 *
 * SPDX-License-Identifier: LGPL-3.0-or-later
 */
9
51
52#ifndef INCLUDED_volk_8i_x2_add_saturated_8i_u_H
53#define INCLUDED_volk_8i_x2_add_saturated_8i_u_H
54
55#include <inttypes.h>
56
57#ifdef LV_HAVE_GENERIC
58
/*
 * Portable reference kernel: element-wise saturating addition of two int8
 * buffers.
 *
 * For each i in [0, num_points):
 *     outVector[i] = clamp(inVectorA[i] + inVectorB[i], -128, 127)
 *
 * outVector   destination buffer, at least num_points elements
 * inVectorA   first addend buffer, at least num_points elements
 * inVectorB   second addend buffer, at least num_points elements
 * num_points  number of elements to process; 0 leaves outVector untouched
 *
 * The sum is formed in a wider type and clamped explicitly. This avoids
 * narrowing an out-of-range value to int8_t and right-shifting negative
 * values, both of which are implementation-defined in C, and matches the
 * scalar tail loops of the SIMD kernels in this file.
 */
static inline void volk_8i_x2_add_saturated_8i_generic(int8_t* outVector,
                                                       const int8_t* inVectorA,
                                                       const int8_t* inVectorB,
                                                       unsigned int num_points)
{
    for (unsigned int i = 0; i < num_points; i++) {
        // The exact sum lies in [-256, 254], which always fits in int16_t.
        int16_t sum = (int16_t)inVectorA[i] + (int16_t)inVectorB[i];
        if (sum > 127) {
            sum = 127;
        } else if (sum < -128) {
            sum = -128;
        }
        // sum is now within int8_t range, so the conversion is value-preserving.
        outVector[i] = (int8_t)sum;
    }
}
75
76#endif /* LV_HAVE_GENERIC */
77
78
79#ifdef LV_HAVE_SSE2
80#include <emmintrin.h>
81
/*
 * SSE2 kernel (unaligned loads/stores): element-wise saturating addition of
 * two int8 buffers, outVector[i] = clamp(inVectorA[i] + inVectorB[i], -128, 127).
 *
 * Full groups of 16 elements go through _mm_adds_epi8; the remaining
 * (num_points % 16) elements are handled by a scalar clamp.
 */
static inline void volk_8i_x2_add_saturated_8i_u_sse2(int8_t* outVector,
                                                      const int8_t* inVectorA,
                                                      const int8_t* inVectorB,
                                                      unsigned int num_points)
{
    // Largest multiple of 16 that does not exceed num_points.
    const unsigned int simdEnd = num_points - (num_points % 16);
    unsigned int idx = 0;

    while (idx < simdEnd) {
        const __m128i lhs = _mm_loadu_si128((const __m128i*)&inVectorA[idx]);
        const __m128i rhs = _mm_loadu_si128((const __m128i*)&inVectorB[idx]);
        _mm_storeu_si128((__m128i*)&outVector[idx], _mm_adds_epi8(lhs, rhs));
        idx += 16;
    }

    // Scalar tail: widen, add, clamp back into int8_t range.
    for (; idx < num_points; ++idx) {
        const int16_t wide = (int16_t)inVectorA[idx] + (int16_t)inVectorB[idx];
        const int16_t clamped = wide > 127 ? 127 : (wide < -128 ? -128 : wide);
        outVector[idx] = (int8_t)clamped;
    }
}
106
107#endif /* LV_HAVE_SSE2 */
108
109
110#ifdef LV_HAVE_AVX2
111#include <immintrin.h>
112
113static inline void volk_8i_x2_add_saturated_8i_u_avx2(int8_t* outVector,
114 const int8_t* inVectorA,
115 const int8_t* inVectorB,
116 unsigned int num_points)
117{
118 const unsigned int thirtysecondPoints = num_points / 32;
119 unsigned int number = 0;
120
121 for (; number < thirtysecondPoints; number++) {
122 __m256i a = _mm256_loadu_si256((const __m256i*)(inVectorA + 32 * number));
123 __m256i b = _mm256_loadu_si256((const __m256i*)(inVectorB + 32 * number));
124 __m256i result = _mm256_adds_epi8(a, b);
125 _mm256_storeu_si256((__m256i*)(outVector + 32 * number), result);
126 }
127
128 for (number = thirtysecondPoints * 32; number < num_points; number++) {
129 int16_t sum = (int16_t)inVectorA[number] + (int16_t)inVectorB[number];
130 if (sum > 127)
131 sum = 127;
132 else if (sum < -128)
133 sum = -128;
134 outVector[number] = (int8_t)sum;
135 }
136}
137
138#endif /* LV_HAVE_AVX2 */
139
140
141#ifdef LV_HAVE_AVX512BW
142#include <immintrin.h>
143
144static inline void volk_8i_x2_add_saturated_8i_u_avx512bw(int8_t* outVector,
145 const int8_t* inVectorA,
146 const int8_t* inVectorB,
147 unsigned int num_points)
148{
149 const unsigned int sixtyfourthPoints = num_points / 64;
150 unsigned int number = 0;
151
152 for (; number < sixtyfourthPoints; number++) {
153 __m512i a = _mm512_loadu_si512((const __m512i*)(inVectorA + 64 * number));
154 __m512i b = _mm512_loadu_si512((const __m512i*)(inVectorB + 64 * number));
155 __m512i result = _mm512_adds_epi8(a, b);
156 _mm512_storeu_si512((__m512i*)(outVector + 64 * number), result);
157 }
158
159 for (number = sixtyfourthPoints * 64; number < num_points; number++) {
160 int16_t sum = (int16_t)inVectorA[number] + (int16_t)inVectorB[number];
161 if (sum > 127)
162 sum = 127;
163 else if (sum < -128)
164 sum = -128;
165 outVector[number] = (int8_t)sum;
166 }
167}
168
169#endif /* LV_HAVE_AVX512BW */
170
171
172#endif /* INCLUDED_volk_8i_x2_add_saturated_8i_u_H */
173
174
175#ifndef INCLUDED_volk_8i_x2_add_saturated_8i_a_H
176#define INCLUDED_volk_8i_x2_add_saturated_8i_a_H
177
178#include <inttypes.h>
179
180#ifdef LV_HAVE_SSE2
181#include <emmintrin.h>
182
/*
 * SSE2 kernel (aligned loads/stores): element-wise saturating addition of two
 * int8 buffers, outVector[i] = clamp(inVectorA[i] + inVectorB[i], -128, 127).
 *
 * Uses _mm_load_si128 / _mm_store_si128, so all three buffers must be
 * 16-byte aligned. Full groups of 16 elements go through _mm_adds_epi8; the
 * remaining (num_points % 16) elements are handled by a scalar clamp.
 */
static inline void volk_8i_x2_add_saturated_8i_a_sse2(int8_t* outVector,
                                                      const int8_t* inVectorA,
                                                      const int8_t* inVectorB,
                                                      unsigned int num_points)
{
    // Largest multiple of 16 that does not exceed num_points.
    const unsigned int simdEnd = num_points - (num_points % 16);
    unsigned int idx = 0;

    while (idx < simdEnd) {
        const __m128i lhs = _mm_load_si128((const __m128i*)&inVectorA[idx]);
        const __m128i rhs = _mm_load_si128((const __m128i*)&inVectorB[idx]);
        _mm_store_si128((__m128i*)&outVector[idx], _mm_adds_epi8(lhs, rhs));
        idx += 16;
    }

    // Scalar tail: widen, add, clamp back into int8_t range.
    for (; idx < num_points; ++idx) {
        const int16_t wide = (int16_t)inVectorA[idx] + (int16_t)inVectorB[idx];
        const int16_t clamped = wide > 127 ? 127 : (wide < -128 ? -128 : wide);
        outVector[idx] = (int8_t)clamped;
    }
}
207
208#endif /* LV_HAVE_SSE2 */
209
210
211#ifdef LV_HAVE_AVX2
212#include <immintrin.h>
213
214static inline void volk_8i_x2_add_saturated_8i_a_avx2(int8_t* outVector,
215 const int8_t* inVectorA,
216 const int8_t* inVectorB,
217 unsigned int num_points)
218{
219 const unsigned int thirtysecondPoints = num_points / 32;
220 unsigned int number = 0;
221
222 for (; number < thirtysecondPoints; number++) {
223 __m256i a = _mm256_load_si256((const __m256i*)(inVectorA + 32 * number));
224 __m256i b = _mm256_load_si256((const __m256i*)(inVectorB + 32 * number));
225 __m256i result = _mm256_adds_epi8(a, b);
226 _mm256_store_si256((__m256i*)(outVector + 32 * number), result);
227 }
228
229 for (number = thirtysecondPoints * 32; number < num_points; number++) {
230 int16_t sum = (int16_t)inVectorA[number] + (int16_t)inVectorB[number];
231 if (sum > 127)
232 sum = 127;
233 else if (sum < -128)
234 sum = -128;
235 outVector[number] = (int8_t)sum;
236 }
237}
238
239#endif /* LV_HAVE_AVX2 */
240
241
242#ifdef LV_HAVE_AVX512BW
243#include <immintrin.h>
244
245static inline void volk_8i_x2_add_saturated_8i_a_avx512bw(int8_t* outVector,
246 const int8_t* inVectorA,
247 const int8_t* inVectorB,
248 unsigned int num_points)
249{
250 const unsigned int sixtyfourthPoints = num_points / 64;
251 unsigned int number = 0;
252
253 for (; number < sixtyfourthPoints; number++) {
254 __m512i a = _mm512_load_si512((const __m512i*)(inVectorA + 64 * number));
255 __m512i b = _mm512_load_si512((const __m512i*)(inVectorB + 64 * number));
256 __m512i result = _mm512_adds_epi8(a, b);
257 _mm512_store_si512((__m512i*)(outVector + 64 * number), result);
258 }
259
260 for (number = sixtyfourthPoints * 64; number < num_points; number++) {
261 int16_t sum = (int16_t)inVectorA[number] + (int16_t)inVectorB[number];
262 if (sum > 127)
263 sum = 127;
264 else if (sum < -128)
265 sum = -128;
266 outVector[number] = (int8_t)sum;
267 }
268}
269
270#endif /* LV_HAVE_AVX512BW */
271
272
273#ifdef LV_HAVE_NEON
274#include <arm_neon.h>
275
/*
 * NEON kernel: element-wise saturating addition of two int8 buffers,
 * outVector[i] = clamp(inVectorA[i] + inVectorB[i], -128, 127).
 *
 * Full groups of 16 elements go through vqaddq_s8; the remaining
 * (num_points % 16) elements are handled by a scalar clamp.
 */
static inline void volk_8i_x2_add_saturated_8i_neon(int8_t* outVector,
                                                    const int8_t* inVectorA,
                                                    const int8_t* inVectorB,
                                                    unsigned int num_points)
{
    // Largest multiple of 16 that does not exceed num_points.
    const unsigned int simdEnd = num_points - (num_points % 16);
    unsigned int idx = 0;

    while (idx < simdEnd) {
        const int8x16_t lhs = vld1q_s8(&inVectorA[idx]);
        const int8x16_t rhs = vld1q_s8(&inVectorB[idx]);
        const int8x16_t saturated = vqaddq_s8(lhs, rhs);
        vst1q_s8(&outVector[idx], saturated);
        idx += 16;
    }

    // Scalar tail: widen, add, clamp back into int8_t range.
    for (; idx < num_points; ++idx) {
        int16_t wide = (int16_t)inVectorA[idx] + (int16_t)inVectorB[idx];
        if (wide < -128) {
            wide = -128;
        } else if (wide > 127) {
            wide = 127;
        }
        outVector[idx] = (int8_t)wide;
    }
}
299
300#endif /* LV_HAVE_NEON */
301
302
303#ifdef LV_HAVE_NEONV8
304#include <arm_neon.h>
305#include <volk/volk_common.h>
306
/*
 * NEONv8 kernel: element-wise saturating addition of two int8 buffers,
 * outVector[i] = clamp(inVectorA[i] + inVectorB[i], -128, 127).
 *
 * The main loop handles 32 elements per iteration as two independent 16-lane
 * vqaddq_s8 operations. Before loading, it invokes __VOLK_PREFETCH (from
 * volk/volk_common.h; presumably a cache prefetch hint, not visible here) on
 * the address 64 bytes past the current position of each input. Near the end
 * of the data that address can lie beyond the buffers. The remaining
 * (num_points % 32) elements are handled by a scalar clamp.
 */
static inline void volk_8i_x2_add_saturated_8i_neonv8(int8_t* outVector,
                                                      const int8_t* inVectorA,
                                                      const int8_t* inVectorB,
                                                      unsigned int num_points)
{
    // Largest multiple of 32 that does not exceed num_points.
    const unsigned int simdEnd = (num_points / 32) * 32;
    unsigned int idx;

    for (idx = 0; idx < simdEnd; idx += 32) {
        const int8_t* srcA = inVectorA + idx;
        const int8_t* srcB = inVectorB + idx;
        int8_t* dst = outVector + idx;

        __VOLK_PREFETCH(srcA + 64);
        __VOLK_PREFETCH(srcB + 64);

        const int8x16_t lhsLow = vld1q_s8(srcA);
        const int8x16_t rhsLow = vld1q_s8(srcB);
        const int8x16_t lhsHigh = vld1q_s8(srcA + 16);
        const int8x16_t rhsHigh = vld1q_s8(srcB + 16);

        vst1q_s8(dst, vqaddq_s8(lhsLow, rhsLow));
        vst1q_s8(dst + 16, vqaddq_s8(lhsHigh, rhsHigh));
    }

    // Scalar tail: widen, add, clamp back into int8_t range.
    for (; idx < num_points; ++idx) {
        const int16_t wide = (int16_t)inVectorA[idx] + (int16_t)inVectorB[idx];
        const int16_t clamped = wide > 127 ? 127 : (wide < -128 ? -128 : wide);
        outVector[idx] = (int8_t)clamped;
    }
}
338
339#endif /* LV_HAVE_NEONV8 */
340
341
342#ifdef LV_HAVE_RVV
343#include <riscv_vector.h>
344
/*
 * RISC-V Vector kernel: element-wise saturating addition of two int8 buffers.
 *
 * Each pass asks __riscv_vsetvl_e8m8 how many 8-bit elements (vl) to handle
 * for the remaining count, loads vl elements from each input, combines them
 * with __riscv_vsadd and stores vl results. All three pointers then advance
 * by vl. No scalar tail loop is needed because vl never exceeds what is left.
 */
static inline void volk_8i_x2_add_saturated_8i_rvv(int8_t* outVector,
                                                   const int8_t* inVectorA,
                                                   const int8_t* inVectorB,
                                                   unsigned int num_points)
{
    size_t remaining = num_points;

    while (remaining > 0) {
        size_t vl = __riscv_vsetvl_e8m8(remaining);
        vint8m8_t lhs = __riscv_vle8_v_i8m8(inVectorA, vl);
        vint8m8_t rhs = __riscv_vle8_v_i8m8(inVectorB, vl);
        vint8m8_t saturated = __riscv_vsadd(lhs, rhs, vl);
        __riscv_vse8(outVector, saturated, vl);

        inVectorA += vl;
        inVectorB += vl;
        outVector += vl;
        remaining -= vl;
    }
}
358
359#endif /* LV_HAVE_RVV */
360
361
362#endif /* INCLUDED_volk_8i_x2_add_saturated_8i_a_H */