52#ifndef INCLUDED_volk_16i_x2_add_saturated_16i_u_H
53#define INCLUDED_volk_16i_x2_add_saturated_16i_u_H
/*
 * Scalar kernel fragment: element-wise add of two int16_t arrays where an
 * overflowing result is replaced by a clamp value instead of wrapping.
 *
 * NOTE(review): the line carrying the function name and the outVector
 * parameter, the braces, and the declaration of `sum` are not visible in this
 * excerpt; the comments below describe only the visible statements.
 *
 * inVectorA:  first input array, read at indices 0 .. num_points - 1.
 * inVectorB:  second input array, read at the same indices.
 * num_points: number of elements processed.
 */
60 const int16_t* inVectorA,
61 const int16_t* inVectorB,
62 unsigned int num_points)
64 for (
unsigned int i = 0; i < num_points; i++) {
65 int16_t a = inVectorA[i];
66 int16_t b = inVectorB[i];
// `sum` is declared on a line not shown here -- presumably the wrapped 16-bit
// a + b (TODO confirm). The AND of the two XORs has its sign bit set only when
// sum's sign differs from both a's and b's sign; shifting by 15 turns that
// into a mask. NOTE(review): right-shifting a negative value is
// implementation-defined in C.
69 int16_t overflow = ((a ^ sum) & (b ^ sum)) >> 15;
// Clamp value selected by a's sign: 0x7FFF when a >= 0, otherwise the int16_t
// minimum (bit pattern 0x8000).
71 int16_t sat_val = (a >> 15) ^ 0x7FFF;
// Branch-free select: sat_val where the overflow mask is set, sum elsewhere.
72 outVector[i] = (overflow & sat_val) | (~overflow & sum);
/*
 * SSE2 kernel fragment using unaligned loads/stores: saturating element-wise
 * add of two int16_t arrays, 8 elements per 128-bit vector.
 *
 * NOTE(review): the line carrying the function name and the outVector
 * parameter, the braces, and part of the scalar clamp are not visible in this
 * excerpt; the comments below describe only the visible statements.
 *
 * inVectorA:  first input array.
 * inVectorB:  second input array.
 * num_points: number of int16_t elements to process.
 */
83 const int16_t* inVectorA,
84 const int16_t* inVectorB,
85 unsigned int num_points)
// Number of full 8-element chunks.
87 const unsigned int eighthPoints = num_points / 8;
88 unsigned int number = 0;
// Vector path: _mm_adds_epi16 adds packed signed 16-bit lanes with signed
// saturation; the loadu/storeu forms place no alignment requirement on the
// buffers.
90 for (; number < eighthPoints; number++) {
91 __m128i a = _mm_loadu_si128((
const __m128i*)(inVectorA + 8 * number));
92 __m128i b = _mm_loadu_si128((
const __m128i*)(inVectorB + 8 * number));
93 __m128i result = _mm_adds_epi16(a, b);
94 _mm_storeu_si128((__m128i*)(outVector + 8 * number), result);
// Scalar tail for the remaining num_points % 8 elements.
97 for (number = eighthPoints * 8; number < num_points; number++) {
// Widen to 32 bits so the addition itself cannot overflow.
98 int32_t sum = (int32_t)inVectorA[number] + (int32_t)inVectorB[number];
// Lower-bound check; the matching upper-bound check and both assignments are
// on lines not shown (presumably clamping to [-32768, 32767] -- TODO confirm).
101 else if (sum < -32768)
103 outVector[number] = (int16_t)sum;
111#include <immintrin.h>
/*
 * Unaligned AVX2 kernel: saturating element-wise add of two int16_t arrays,
 * 16 elements per 256-bit vector, followed by a scalar tail.
 *
 * outVector:  destination array; indices 0 .. num_points - 1 are written.
 * inVectorA:  first input array.
 * inVectorB:  second input array.
 * num_points: number of int16_t elements to process.
 *
 * NOTE(review): the braces and part of the scalar clamp are on lines that are
 * not visible in this excerpt; comments describe the visible statements only.
 */
113static inline void volk_16i_x2_add_saturated_16i_u_avx2(int16_t* outVector,
114 const int16_t* inVectorA,
115 const int16_t* inVectorB,
116 unsigned int num_points)
// Number of full 16-element chunks.
118 const unsigned int sixteenthPoints = num_points / 16;
119 unsigned int number = 0;
// Vector path: _mm256_adds_epi16 adds packed signed 16-bit lanes with signed
// saturation; the loadu/storeu forms place no alignment requirement on the
// buffers.
121 for (; number < sixteenthPoints; number++) {
122 __m256i a = _mm256_loadu_si256((
const __m256i*)(inVectorA + 16 * number));
123 __m256i b = _mm256_loadu_si256((
const __m256i*)(inVectorB + 16 * number));
124 __m256i result = _mm256_adds_epi16(a, b);
125 _mm256_storeu_si256((__m256i*)(outVector + 16 * number), result);
// Scalar tail for the remaining num_points % 16 elements.
128 for (number = sixteenthPoints * 16; number < num_points; number++) {
// Widen to 32 bits so the addition itself cannot overflow.
129 int32_t sum = (int32_t)inVectorA[number] + (int32_t)inVectorB[number];
// Lower-bound check; the matching upper-bound check and both assignments are
// on lines not shown (presumably clamping to [-32768, 32767] -- TODO confirm).
132 else if (sum < -32768)
134 outVector[number] = (int16_t)sum;
141#ifdef LV_HAVE_AVX512BW
142#include <immintrin.h>
/*
 * Unaligned AVX-512BW kernel: saturating element-wise add of two int16_t
 * arrays, 32 elements per 512-bit vector, followed by a scalar tail.
 *
 * outVector:  destination array; indices 0 .. num_points - 1 are written.
 * inVectorA:  first input array.
 * inVectorB:  second input array.
 * num_points: number of int16_t elements to process.
 *
 * NOTE(review): the braces and part of the scalar clamp are on lines that are
 * not visible in this excerpt; comments describe the visible statements only.
 */
144static inline void volk_16i_x2_add_saturated_16i_u_avx512bw(int16_t* outVector,
145 const int16_t* inVectorA,
146 const int16_t* inVectorB,
147 unsigned int num_points)
// Number of full 32-element chunks.
149 const unsigned int thirtysecondPoints = num_points / 32;
150 unsigned int number = 0;
// Vector path: _mm512_adds_epi16 adds packed signed 16-bit lanes with signed
// saturation; the loadu/storeu forms place no alignment requirement on the
// buffers.
152 for (; number < thirtysecondPoints; number++) {
153 __m512i a = _mm512_loadu_si512((
const __m512i*)(inVectorA + 32 * number));
154 __m512i b = _mm512_loadu_si512((
const __m512i*)(inVectorB + 32 * number));
155 __m512i result = _mm512_adds_epi16(a, b);
156 _mm512_storeu_si512((__m512i*)(outVector + 32 * number), result);
// Scalar tail for the remaining num_points % 32 elements.
159 for (number = thirtysecondPoints * 32; number < num_points; number++) {
// Widen to 32 bits so the addition itself cannot overflow.
160 int32_t sum = (int32_t)inVectorA[number] + (int32_t)inVectorB[number];
// Lower-bound check; the matching upper-bound check and both assignments are
// on lines not shown (presumably clamping to [-32768, 32767] -- TODO confirm).
163 else if (sum < -32768)
165 outVector[number] = (int16_t)sum;
175#ifndef INCLUDED_volk_16i_x2_add_saturated_16i_a_H
176#define INCLUDED_volk_16i_x2_add_saturated_16i_a_H
181#include <emmintrin.h>
/*
 * SSE2 kernel fragment using aligned loads/stores: saturating element-wise
 * add of two int16_t arrays, 8 elements per 128-bit vector.
 *
 * The _mm_load_si128 / _mm_store_si128 forms require 16-byte-aligned
 * addresses, so all three buffers must be 16-byte aligned.
 *
 * NOTE(review): the line carrying the function name and the outVector
 * parameter, the braces, and part of the scalar clamp are not visible in this
 * excerpt; the comments below describe only the visible statements.
 *
 * inVectorA:  first input array.
 * inVectorB:  second input array.
 * num_points: number of int16_t elements to process.
 */
184 const int16_t* inVectorA,
185 const int16_t* inVectorB,
186 unsigned int num_points)
// Number of full 8-element chunks.
188 const unsigned int eighthPoints = num_points / 8;
189 unsigned int number = 0;
// Vector path: aligned 128-bit loads, signed-saturating 16-bit add, aligned
// store. Each step advances by 8 elements (16 bytes), preserving alignment.
191 for (; number < eighthPoints; number++) {
192 __m128i a = _mm_load_si128((
const __m128i*)(inVectorA + 8 * number));
193 __m128i b = _mm_load_si128((
const __m128i*)(inVectorB + 8 * number));
194 __m128i result = _mm_adds_epi16(a, b);
195 _mm_store_si128((__m128i*)(outVector + 8 * number), result);
// Scalar tail for the remaining num_points % 8 elements.
198 for (number = eighthPoints * 8; number < num_points; number++) {
// Widen to 32 bits so the addition itself cannot overflow.
199 int32_t sum = (int32_t)inVectorA[number] + (int32_t)inVectorB[number];
// Lower-bound check; the matching upper-bound check and both assignments are
// on lines not shown (presumably clamping to [-32768, 32767] -- TODO confirm).
202 else if (sum < -32768)
204 outVector[number] = (int16_t)sum;
212#include <immintrin.h>
/*
 * Aligned AVX2 kernel: saturating element-wise add of two int16_t arrays,
 * 16 elements per 256-bit vector, followed by a scalar tail.
 *
 * The _mm256_load_si256 / _mm256_store_si256 forms require 32-byte-aligned
 * addresses, so all three buffers must be 32-byte aligned.
 *
 * outVector:  destination array; indices 0 .. num_points - 1 are written.
 * inVectorA:  first input array.
 * inVectorB:  second input array.
 * num_points: number of int16_t elements to process.
 *
 * NOTE(review): the braces and part of the scalar clamp are on lines that are
 * not visible in this excerpt; comments describe the visible statements only.
 */
214static inline void volk_16i_x2_add_saturated_16i_a_avx2(int16_t* outVector,
215 const int16_t* inVectorA,
216 const int16_t* inVectorB,
217 unsigned int num_points)
// Number of full 16-element chunks.
219 const unsigned int sixteenthPoints = num_points / 16;
220 unsigned int number = 0;
// Vector path: aligned 256-bit loads, signed-saturating 16-bit add, aligned
// store. Each step advances by 16 elements (32 bytes), preserving alignment.
222 for (; number < sixteenthPoints; number++) {
223 __m256i a = _mm256_load_si256((
const __m256i*)(inVectorA + 16 * number));
224 __m256i b = _mm256_load_si256((
const __m256i*)(inVectorB + 16 * number));
225 __m256i result = _mm256_adds_epi16(a, b);
226 _mm256_store_si256((__m256i*)(outVector + 16 * number), result);
// Scalar tail for the remaining num_points % 16 elements.
229 for (number = sixteenthPoints * 16; number < num_points; number++) {
// Widen to 32 bits so the addition itself cannot overflow.
230 int32_t sum = (int32_t)inVectorA[number] + (int32_t)inVectorB[number];
// Lower-bound check; the matching upper-bound check and both assignments are
// on lines not shown (presumably clamping to [-32768, 32767] -- TODO confirm).
233 else if (sum < -32768)
235 outVector[number] = (int16_t)sum;
242#ifdef LV_HAVE_AVX512BW
243#include <immintrin.h>
/*
 * Aligned AVX-512BW kernel: saturating element-wise add of two int16_t
 * arrays, 32 elements per 512-bit vector, followed by a scalar tail.
 *
 * The _mm512_load_si512 / _mm512_store_si512 forms require 64-byte-aligned
 * addresses, so all three buffers must be 64-byte aligned.
 *
 * outVector:  destination array; indices 0 .. num_points - 1 are written.
 * inVectorA:  first input array.
 * inVectorB:  second input array.
 * num_points: number of int16_t elements to process.
 *
 * NOTE(review): the braces and part of the scalar clamp are on lines that are
 * not visible in this excerpt; comments describe the visible statements only.
 */
245static inline void volk_16i_x2_add_saturated_16i_a_avx512bw(int16_t* outVector,
246 const int16_t* inVectorA,
247 const int16_t* inVectorB,
248 unsigned int num_points)
// Number of full 32-element chunks.
250 const unsigned int thirtysecondPoints = num_points / 32;
251 unsigned int number = 0;
// Vector path: aligned 512-bit loads, signed-saturating 16-bit add, aligned
// store. Each step advances by 32 elements (64 bytes), preserving alignment.
253 for (; number < thirtysecondPoints; number++) {
254 __m512i a = _mm512_load_si512((
const __m512i*)(inVectorA + 32 * number));
255 __m512i b = _mm512_load_si512((
const __m512i*)(inVectorB + 32 * number));
256 __m512i result = _mm512_adds_epi16(a, b);
257 _mm512_store_si512((__m512i*)(outVector + 32 * number), result);
// Scalar tail for the remaining num_points % 32 elements.
260 for (number = thirtysecondPoints * 32; number < num_points; number++) {
// Widen to 32 bits so the addition itself cannot overflow.
261 int32_t sum = (int32_t)inVectorA[number] + (int32_t)inVectorB[number];
// Lower-bound check; the matching upper-bound check and both assignments are
// on lines not shown (presumably clamping to [-32768, 32767] -- TODO confirm).
264 else if (sum < -32768)
266 outVector[number] = (int16_t)sum;
/*
 * NEON kernel fragment: saturating element-wise add of two int16_t arrays,
 * 8 elements per 128-bit vector, followed by a scalar tail.
 *
 * NOTE(review): the line carrying the function name and the outVector
 * parameter, the braces, and part of the scalar clamp are not visible in this
 * excerpt; the comments below describe only the visible statements.
 *
 * inVectorA:  first input array.
 * inVectorB:  second input array.
 * num_points: number of int16_t elements to process.
 */
277 const int16_t* inVectorA,
278 const int16_t* inVectorB,
279 unsigned int num_points)
// Number of full 8-element chunks.
281 const unsigned int eighthPoints = num_points / 8;
282 unsigned int number = 0;
// Vector path: load 8 lanes from each input, vqaddq_s16 performs the
// signed-saturating add, and the result is stored straight to outVector.
284 for (; number < eighthPoints; number++) {
285 int16x8_t a = vld1q_s16(inVectorA + 8 * number);
286 int16x8_t b = vld1q_s16(inVectorB + 8 * number);
287 vst1q_s16(outVector + 8 * number, vqaddq_s16(a, b));
// Scalar tail for the remaining num_points % 8 elements.
290 for (number = eighthPoints * 8; number < num_points; number++) {
// Widen to 32 bits so the addition itself cannot overflow.
291 int32_t sum = (int32_t)inVectorA[number] + (int32_t)inVectorB[number];
// Lower-bound check; the matching upper-bound check and both assignments are
// on lines not shown (presumably clamping to [-32768, 32767] -- TODO confirm).
294 else if (sum < -32768)
296 outVector[number] = (int16_t)sum;
/*
 * NEON (v8) kernel: saturating element-wise add of two int16_t arrays,
 * handling 16 elements per iteration as two 8-lane vectors, followed by a
 * scalar tail.
 *
 * outVector:  destination array.
 * inVectorA:  first input array.
 * inVectorB:  second input array.
 * num_points: number of int16_t elements to process.
 *
 * NOTE(review): the braces, part of the scalar clamp, and several lines inside
 * the vector loop are not visible in this excerpt; comments describe the
 * visible statements only.
 */
307static inline void volk_16i_x2_add_saturated_16i_neonv8(int16_t* outVector,
308 const int16_t* inVectorA,
309 const int16_t* inVectorB,
310 unsigned int num_points)
// Number of full 16-element chunks.
312 const unsigned int sixteenthPoints = num_points / 16;
313 unsigned int number = 0;
315 for (; number < sixteenthPoints; number++) {
// Two independent 8-lane load / saturating-add / store pairs per iteration.
// Addresses are relative to the pointers themselves, not to `number`.
// NOTE(review): no pointer advance is visible in this excerpt; the tail loop's
// post-increment access implies inVectorA, inVectorB and outVector are each
// advanced by 16 on lines not shown -- confirm against the full file.
318 int16x8_t a0 = vld1q_s16(inVectorA);
319 int16x8_t b0 = vld1q_s16(inVectorB);
320 int16x8_t a1 = vld1q_s16(inVectorA + 8);
321 int16x8_t b1 = vld1q_s16(inVectorB + 8);
322 vst1q_s16(outVector, vqaddq_s16(a0, b0));
323 vst1q_s16(outVector + 8, vqaddq_s16(a1, b1));
// Scalar tail for the remaining num_points % 16 elements; `number` only counts
// iterations here, the data is reached through the running pointers.
329 for (number = sixteenthPoints * 16; number < num_points; number++) {
// Widen to 32 bits so the addition itself cannot overflow.
330 int32_t sum = (int32_t)(*inVectorA++) + (int32_t)(*inVectorB++);
// Lower-bound check; the matching upper-bound check and both assignments are
// on lines not shown (presumably clamping to [-32768, 32767] -- TODO confirm).
333 else if (sum < -32768)
335 *outVector++ = (int16_t)sum;
343#include <riscv_vector.h>
/*
 * RISC-V Vector kernel: saturating element-wise add of two int16_t arrays.
 *
 * outVector:  destination array; num_points elements are written.
 * inVectorA:  first input array.
 * inVectorB:  second input array.
 * num_points: number of int16_t elements to process.
 *
 * The loop is length-agnostic: every pass asks the hardware how many elements
 * it will handle (vl), so no separate scalar tail is present.
 *
 * NOTE(review): the braces are on lines that are not visible in this excerpt.
 */
345static inline void volk_16i_x2_add_saturated_16i_rvv(int16_t* outVector,
346 const int16_t* inVectorA,
347 const int16_t* inVectorB,
348 unsigned int num_points)
// Remaining element count; the loop runs until it reaches zero.
350 size_t n = num_points;
// After each pass, consume vl elements and advance all three pointers by vl.
351 for (
size_t vl; n > 0; n -= vl, inVectorA += vl, inVectorB += vl, outVector += vl) {
// Elements handled this pass, for 16-bit elements with register grouping m8.
352 vl = __riscv_vsetvl_e16m8(n);
353 vint16m8_t a = __riscv_vle16_v_i16m8(inVectorA, vl);
354 vint16m8_t b = __riscv_vle16_v_i16m8(inVectorB, vl);
// __riscv_vsadd is the signed saturating add; store vl results.
355 __riscv_vse16(outVector, __riscv_vsadd(a, b, vl), vl);