52#ifndef INCLUDED_volk_8u_x2_add_saturated_8u_u_H
53#define INCLUDED_volk_8u_x2_add_saturated_8u_u_H
/* NOTE(review): the line carrying this function's name and first parameter is
 * not visible here; presumably this is the portable scalar kernel and
 * outVector is a uint8_t* -- confirm against the full header. */
60 const uint8_t* inVectorA,
61 const uint8_t* inVectorB,
62 unsigned int num_points)
/* Process every element with plain C, one byte per iteration. */
64 for (
unsigned int i = 0; i < num_points; i++) {
/* The 8-bit sum wraps around on overflow. */
65 uint8_t sum = inVectorA[i] + inVectorB[i];
/* Branch-free saturation: a wrapped sum is smaller than either addend, so
 * (sum < inVectorA[i]) is 1 exactly when overflow happened. Negating that
 * 0/1 flag yields an all-zeros or all-ones mask; OR-ing it into the sum
 * leaves the sum untouched or sets its low 8 bits to 0xFF. */
66 outVector[i] = sum | -(uint8_t)(sum < inVectorA[i]);
/* NOTE(review): the line carrying this function's name and first parameter is
 * not visible here; the unaligned SSE2 intrinsics below suggest this is the
 * unaligned SSE2 kernel -- confirm against the full header. */
77 const uint8_t* inVectorA,
78 const uint8_t* inVectorB,
79 unsigned int num_points)
/* Number of complete 16-byte groups handled by the vector loop. */
81 const unsigned int sixteenthPoints = num_points / 16;
82 unsigned int number = 0;
/* Vector loop: 16 bytes per iteration using unaligned loads and stores. */
84 for (; number < sixteenthPoints; number++) {
85 __m128i a = _mm_loadu_si128((
const __m128i*)(inVectorA + 16 * number));
86 __m128i b = _mm_loadu_si128((
const __m128i*)(inVectorB + 16 * number));
/* _mm_adds_epu8 is the unsigned saturating byte add. */
87 __m128i result = _mm_adds_epu8(a, b);
88 _mm_storeu_si128((__m128i*)(outVector + 16 * number), result);
/* Scalar tail for the remaining num_points % 16 elements; the sum is widened
 * to 16 bits so the carry is not lost. */
91 for (number = sixteenthPoints * 16; number < num_points; number++) {
92 uint16_t sum = (uint16_t)inVectorA[number] + (uint16_t)inVectorB[number];
/* NOTE(review): no clamp to 255 is visible between the addition and this
 * narrowing cast, so as shown a sum above 255 wraps instead of saturating,
 * unlike the vector loop -- confirm whether a saturation check is missing. */
95 outVector[number] = (uint8_t)sum;
103#include <immintrin.h>
/*!
 * \brief Saturating element-wise addition of two uint8_t vectors (AVX2,
 *        unaligned buffers).
 *
 * Computes outVector[i] = min(inVectorA[i] + inVectorB[i], 255) for every
 * i in [0, num_points).
 *
 * \param outVector  destination buffer, num_points bytes are written
 * \param inVectorA  first addend vector
 * \param inVectorB  second addend vector
 * \param num_points number of elements to process
 */
static inline void volk_8u_x2_add_saturated_8u_u_avx2(uint8_t* outVector,
                                                      const uint8_t* inVectorA,
                                                      const uint8_t* inVectorB,
                                                      unsigned int num_points)
{
    /* Number of complete 32-byte groups handled by the vector loop. */
    const unsigned int thirtysecondPoints = num_points / 32;
    unsigned int number = 0;

    for (; number < thirtysecondPoints; number++) {
        __m256i a = _mm256_loadu_si256((const __m256i*)(inVectorA + 32 * number));
        __m256i b = _mm256_loadu_si256((const __m256i*)(inVectorB + 32 * number));
        /* Unsigned saturating byte add: lanes clamp at 255 instead of wrapping. */
        __m256i result = _mm256_adds_epu8(a, b);
        _mm256_storeu_si256((__m256i*)(outVector + 32 * number), result);
    }

    /* Scalar tail for the remaining num_points % 32 elements. The sum is
     * widened to 16 bits so the overflow can be detected and clamped, keeping
     * the tail consistent with the saturating vector loop. */
    for (number = thirtysecondPoints * 32; number < num_points; number++) {
        uint16_t sum = (uint16_t)inVectorA[number] + (uint16_t)inVectorB[number];
        if (sum > 255) {
            sum = 255;
        }
        outVector[number] = (uint8_t)sum;
    }
}
131#ifdef LV_HAVE_AVX512BW
132#include <immintrin.h>
/*!
 * \brief Saturating element-wise addition of two uint8_t vectors (AVX-512BW,
 *        unaligned buffers).
 *
 * Computes outVector[i] = min(inVectorA[i] + inVectorB[i], 255) for every
 * i in [0, num_points).
 *
 * \param outVector  destination buffer, num_points bytes are written
 * \param inVectorA  first addend vector
 * \param inVectorB  second addend vector
 * \param num_points number of elements to process
 */
static inline void volk_8u_x2_add_saturated_8u_u_avx512bw(uint8_t* outVector,
                                                          const uint8_t* inVectorA,
                                                          const uint8_t* inVectorB,
                                                          unsigned int num_points)
{
    /* Number of complete 64-byte groups handled by the vector loop. */
    const unsigned int sixtyfourthPoints = num_points / 64;
    unsigned int number = 0;

    for (; number < sixtyfourthPoints; number++) {
        __m512i a = _mm512_loadu_si512((const __m512i*)(inVectorA + 64 * number));
        __m512i b = _mm512_loadu_si512((const __m512i*)(inVectorB + 64 * number));
        /* Unsigned saturating byte add: lanes clamp at 255 instead of wrapping. */
        __m512i result = _mm512_adds_epu8(a, b);
        _mm512_storeu_si512((__m512i*)(outVector + 64 * number), result);
    }

    /* Scalar tail for the remaining num_points % 64 elements. The sum is
     * widened to 16 bits so the overflow can be detected and clamped, keeping
     * the tail consistent with the saturating vector loop. */
    for (number = sixtyfourthPoints * 64; number < num_points; number++) {
        uint16_t sum = (uint16_t)inVectorA[number] + (uint16_t)inVectorB[number];
        if (sum > 255) {
            sum = 255;
        }
        outVector[number] = (uint8_t)sum;
    }
}
163#ifndef INCLUDED_volk_8u_x2_add_saturated_8u_a_H
164#define INCLUDED_volk_8u_x2_add_saturated_8u_a_H
169#include <emmintrin.h>
/* NOTE(review): the line carrying this function's name and first parameter is
 * not visible here; the aligned SSE2 intrinsics below suggest this is the
 * aligned SSE2 kernel -- confirm against the full header. */
172 const uint8_t* inVectorA,
173 const uint8_t* inVectorB,
174 unsigned int num_points)
/* Number of complete 16-byte groups handled by the vector loop. */
176 const unsigned int sixteenthPoints = num_points / 16;
177 unsigned int number = 0;
/* Vector loop: 16 bytes per iteration. The aligned load/store intrinsics
 * require every accessed address to be 16-byte aligned. */
179 for (; number < sixteenthPoints; number++) {
180 __m128i a = _mm_load_si128((
const __m128i*)(inVectorA + 16 * number));
181 __m128i b = _mm_load_si128((
const __m128i*)(inVectorB + 16 * number));
/* _mm_adds_epu8 is the unsigned saturating byte add. */
182 __m128i result = _mm_adds_epu8(a, b);
183 _mm_store_si128((__m128i*)(outVector + 16 * number), result);
/* Scalar tail for the remaining num_points % 16 elements; the sum is widened
 * to 16 bits so the carry is not lost. */
186 for (number = sixteenthPoints * 16; number < num_points; number++) {
187 uint16_t sum = (uint16_t)inVectorA[number] + (uint16_t)inVectorB[number];
/* NOTE(review): no clamp to 255 is visible between the addition and this
 * narrowing cast, so as shown a sum above 255 wraps instead of saturating,
 * unlike the vector loop -- confirm whether a saturation check is missing. */
190 outVector[number] = (uint8_t)sum;
198#include <immintrin.h>
/*!
 * \brief Saturating element-wise addition of two uint8_t vectors (AVX2,
 *        aligned buffers).
 *
 * Computes outVector[i] = min(inVectorA[i] + inVectorB[i], 255) for every
 * i in [0, num_points). The vector loop uses aligned load/store intrinsics,
 * so all three buffers must be 32-byte aligned.
 *
 * \param outVector  destination buffer, num_points bytes are written
 * \param inVectorA  first addend vector
 * \param inVectorB  second addend vector
 * \param num_points number of elements to process
 */
static inline void volk_8u_x2_add_saturated_8u_a_avx2(uint8_t* outVector,
                                                      const uint8_t* inVectorA,
                                                      const uint8_t* inVectorB,
                                                      unsigned int num_points)
{
    /* Number of complete 32-byte groups handled by the vector loop. */
    const unsigned int thirtysecondPoints = num_points / 32;
    unsigned int number = 0;

    for (; number < thirtysecondPoints; number++) {
        __m256i a = _mm256_load_si256((const __m256i*)(inVectorA + 32 * number));
        __m256i b = _mm256_load_si256((const __m256i*)(inVectorB + 32 * number));
        /* Unsigned saturating byte add: lanes clamp at 255 instead of wrapping. */
        __m256i result = _mm256_adds_epu8(a, b);
        _mm256_store_si256((__m256i*)(outVector + 32 * number), result);
    }

    /* Scalar tail for the remaining num_points % 32 elements. The sum is
     * widened to 16 bits so the overflow can be detected and clamped, keeping
     * the tail consistent with the saturating vector loop. */
    for (number = thirtysecondPoints * 32; number < num_points; number++) {
        uint16_t sum = (uint16_t)inVectorA[number] + (uint16_t)inVectorB[number];
        if (sum > 255) {
            sum = 255;
        }
        outVector[number] = (uint8_t)sum;
    }
}
226#ifdef LV_HAVE_AVX512BW
227#include <immintrin.h>
/*!
 * \brief Saturating element-wise addition of two uint8_t vectors (AVX-512BW,
 *        aligned buffers).
 *
 * Computes outVector[i] = min(inVectorA[i] + inVectorB[i], 255) for every
 * i in [0, num_points). The vector loop uses aligned load/store intrinsics,
 * so all three buffers must be 64-byte aligned.
 *
 * \param outVector  destination buffer, num_points bytes are written
 * \param inVectorA  first addend vector
 * \param inVectorB  second addend vector
 * \param num_points number of elements to process
 */
static inline void volk_8u_x2_add_saturated_8u_a_avx512bw(uint8_t* outVector,
                                                          const uint8_t* inVectorA,
                                                          const uint8_t* inVectorB,
                                                          unsigned int num_points)
{
    /* Number of complete 64-byte groups handled by the vector loop. */
    const unsigned int sixtyfourthPoints = num_points / 64;
    unsigned int number = 0;

    for (; number < sixtyfourthPoints; number++) {
        __m512i a = _mm512_load_si512((const __m512i*)(inVectorA + 64 * number));
        __m512i b = _mm512_load_si512((const __m512i*)(inVectorB + 64 * number));
        /* Unsigned saturating byte add: lanes clamp at 255 instead of wrapping. */
        __m512i result = _mm512_adds_epu8(a, b);
        _mm512_store_si512((__m512i*)(outVector + 64 * number), result);
    }

    /* Scalar tail for the remaining num_points % 64 elements. The sum is
     * widened to 16 bits so the overflow can be detected and clamped, keeping
     * the tail consistent with the saturating vector loop. */
    for (number = sixtyfourthPoints * 64; number < num_points; number++) {
        uint16_t sum = (uint16_t)inVectorA[number] + (uint16_t)inVectorB[number];
        if (sum > 255) {
            sum = 255;
        }
        outVector[number] = (uint8_t)sum;
    }
}
/* NOTE(review): the line carrying this function's name and first parameter is
 * not visible here; the NEON intrinsics below suggest this is the NEON
 * kernel -- confirm against the full header. */
259 const uint8_t* inVectorA,
260 const uint8_t* inVectorB,
261 unsigned int num_points)
/* Number of complete 16-byte groups handled by the vector loop. */
263 const unsigned int sixteenthPoints = num_points / 16;
264 unsigned int number = 0;
/* Vector loop: load 16 bytes from each input, add them with the unsigned
 * saturating add vqaddq_u8, and store the 16 results. */
266 for (; number < sixteenthPoints; number++) {
267 uint8x16_t a = vld1q_u8(inVectorA + 16 * number);
268 uint8x16_t b = vld1q_u8(inVectorB + 16 * number);
269 vst1q_u8(outVector + 16 * number, vqaddq_u8(a, b));
/* Scalar tail for the remaining num_points % 16 elements; the sum is widened
 * to 16 bits so the carry is not lost. */
272 for (number = sixteenthPoints * 16; number < num_points; number++) {
273 uint16_t sum = (uint16_t)inVectorA[number] + (uint16_t)inVectorB[number];
/* NOTE(review): no clamp to 255 is visible between the addition and this
 * narrowing cast, so as shown a sum above 255 wraps instead of saturating,
 * unlike the vector loop -- confirm whether a saturation check is missing. */
276 outVector[number] = (uint8_t)sum;
/*!
 * \brief Saturating element-wise addition of two uint8_t vectors (NEON,
 *        two 16-byte registers per iteration).
 *
 * Computes outVector[i] = min(inVectorA[i] + inVectorB[i], 255) for every
 * i in [0, num_points).
 *
 * \param outVector  destination buffer, num_points bytes are written
 * \param inVectorA  first addend vector
 * \param inVectorB  second addend vector
 * \param num_points number of elements to process
 */
static inline void volk_8u_x2_add_saturated_8u_neonv8(uint8_t* outVector,
                                                      const uint8_t* inVectorA,
                                                      const uint8_t* inVectorB,
                                                      unsigned int num_points)
{
    /* Number of complete 32-byte groups handled by the vector loop. */
    const unsigned int thirtysecondPoints = num_points / 32;
    unsigned int number = 0;

    for (; number < thirtysecondPoints; number++) {
        /* Two independent 16-byte lanes per iteration. */
        uint8x16_t a0 = vld1q_u8(inVectorA);
        uint8x16_t b0 = vld1q_u8(inVectorB);
        uint8x16_t a1 = vld1q_u8(inVectorA + 16);
        uint8x16_t b1 = vld1q_u8(inVectorB + 16);
        /* vqaddq_u8 is the unsigned saturating byte add. */
        vst1q_u8(outVector, vqaddq_u8(a0, b0));
        vst1q_u8(outVector + 16, vqaddq_u8(a1, b1));

        /* Advance all three cursors; the loads and stores above are relative
         * to the current pointers, and the scalar tail below continues from
         * wherever they end up. */
        inVectorA += 32;
        inVectorB += 32;
        outVector += 32;
    }

    /* Scalar tail for the remaining num_points % 32 elements. The sum is
     * widened to 16 bits so the overflow can be detected and clamped, keeping
     * the tail consistent with the saturating vector loop. */
    for (number = thirtysecondPoints * 32; number < num_points; number++) {
        uint16_t sum = (uint16_t)(*inVectorA++) + (uint16_t)(*inVectorB++);
        if (sum > 255) {
            sum = 255;
        }
        *outVector++ = (uint8_t)sum;
    }
}
321#include <riscv_vector.h>
/* Saturating element-wise addition of two uint8_t vectors using the RISC-V
 * Vector extension: writes num_points results to outVector, each computed
 * from inVectorA and inVectorB with the unsigned saturating add vsaddu. */
323static inline void volk_8u_x2_add_saturated_8u_rvv(uint8_t* outVector,
324 const uint8_t* inVectorA,
325 const uint8_t* inVectorB,
326 unsigned int num_points)
/* n counts the elements still to be processed. */
328 size_t n = num_points;
/* Each iteration handles vl elements, then advances all three pointers and
 * shrinks n by the same amount, so no separate scalar tail loop is used. */
329 for (
size_t vl; n > 0; n -= vl, inVectorA += vl, inVectorB += vl, outVector += vl) {
/* Ask for a vector length covering up to n 8-bit elements at LMUL=8. */
330 vl = __riscv_vsetvl_e8m8(n);
331 vuint8m8_t a = __riscv_vle8_v_u8m8(inVectorA, vl);
332 vuint8m8_t b = __riscv_vle8_v_u8m8(inVectorB, vl);
/* Saturating unsigned add of the two loaded vectors, stored directly. */
333 __riscv_vse8(outVector, __riscv_vsaddu(a, b, vl), vl);