Vector Optimized Library of Kernels 3.3.0
Architecture-tuned implementations of math kernels
Loading...
Searching...
No Matches
volk_16u_x2_add_saturated_16u.h
Go to the documentation of this file.
1/* -*- c++ -*- */
2/*
3 * Copyright 2025 Magnus Lundmark <magnuslundmark@gmail.com>
4 *
5 * This file is part of VOLK
6 *
7 * SPDX-License-Identifier: LGPL-3.0-or-later
8 */
9
51
52#ifndef INCLUDED_volk_16u_x2_add_saturated_16u_u_H
53#define INCLUDED_volk_16u_x2_add_saturated_16u_u_H
54
55#include <inttypes.h>
56
57#ifdef LV_HAVE_GENERIC
58
/*
 * Portable reference kernel: element-wise unsigned 16-bit addition that
 * clamps to 65535 instead of wrapping.
 *
 * outVector  - destination, receives num_points results
 * inVectorA  - first operand array
 * inVectorB  - second operand array
 * num_points - number of elements to process
 */
static inline void volk_16u_x2_add_saturated_16u_generic(uint16_t* outVector,
                                                         const uint16_t* inVectorA,
                                                         const uint16_t* inVectorB,
                                                         unsigned int num_points)
{
    unsigned int idx = 0;

    while (idx < num_points) {
        const uint16_t lhs = inVectorA[idx];
        /* Truncate the promoted sum back to 16 bits so overflow shows up as a wrap. */
        const uint16_t wrapped = (uint16_t)(lhs + inVectorB[idx]);

        /* A wrapped sum is always smaller than either operand. */
        outVector[idx] = (wrapped < lhs) ? (uint16_t)0xFFFF : wrapped;
        idx++;
    }
}
69
70#endif /* LV_HAVE_GENERIC */
71
72
73#ifdef LV_HAVE_SSE2
74#include <emmintrin.h>
75
/*
 * SSE2 kernel (unaligned loads/stores): saturating unsigned 16-bit addition.
 * Eight elements are handled per 128-bit operation; any leftover elements
 * are finished with scalar code that clamps at 65535.
 *
 * outVector  - destination, receives num_points results
 * inVectorA  - first operand array
 * inVectorB  - second operand array
 * num_points - number of elements to process
 */
static inline void volk_16u_x2_add_saturated_16u_u_sse2(uint16_t* outVector,
                                                        const uint16_t* inVectorA,
                                                        const uint16_t* inVectorB,
                                                        unsigned int num_points)
{
    /* Number of elements covered by whole 8-lane vectors. */
    const unsigned int vectorized = (num_points / 8) * 8;
    unsigned int idx;

    for (idx = 0; idx < vectorized; idx += 8) {
        const __m128i lhs = _mm_loadu_si128((const __m128i*)&inVectorA[idx]);
        const __m128i rhs = _mm_loadu_si128((const __m128i*)&inVectorB[idx]);
        _mm_storeu_si128((__m128i*)&outVector[idx], _mm_adds_epu16(lhs, rhs));
    }

    /* Scalar tail: widen to 32 bits so the overflow can be detected. */
    for (; idx < num_points; idx++) {
        const uint32_t wide = (uint32_t)inVectorA[idx] + (uint32_t)inVectorB[idx];
        outVector[idx] = (uint16_t)(wide > 65535 ? 65535 : wide);
    }
}
98
99#endif /* LV_HAVE_SSE2 */
100
101
102#ifdef LV_HAVE_AVX2
103#include <immintrin.h>
104
/*
 * AVX2 kernel (unaligned loads/stores): saturating unsigned 16-bit addition.
 * Sixteen elements are handled per 256-bit operation; the remainder is
 * processed one element at a time with an explicit clamp at 65535.
 *
 * outVector  - destination, receives num_points results
 * inVectorA  - first operand array
 * inVectorB  - second operand array
 * num_points - number of elements to process
 */
static inline void volk_16u_x2_add_saturated_16u_u_avx2(uint16_t* outVector,
                                                        const uint16_t* inVectorA,
                                                        const uint16_t* inVectorB,
                                                        unsigned int num_points)
{
    const unsigned int blockCount = num_points / 16;
    const uint16_t* srcA = inVectorA;
    const uint16_t* srcB = inVectorB;
    uint16_t* dst = outVector;

    for (unsigned int blk = 0; blk < blockCount; blk++) {
        const __m256i lhs = _mm256_loadu_si256((const __m256i*)srcA);
        const __m256i rhs = _mm256_loadu_si256((const __m256i*)srcB);
        _mm256_storeu_si256((__m256i*)dst, _mm256_adds_epu16(lhs, rhs));
        srcA += 16;
        srcB += 16;
        dst += 16;
    }

    /* Elements that did not fill a whole 16-lane vector. */
    for (unsigned int left = num_points - blockCount * 16; left != 0; left--) {
        const uint32_t wide = (uint32_t)(*srcA++) + (uint32_t)(*srcB++);
        *dst++ = (uint16_t)(wide > 65535 ? 65535 : wide);
    }
}
127
128#endif /* LV_HAVE_AVX2 */
129
130
131#ifdef LV_HAVE_AVX512BW
132#include <immintrin.h>
133
/*
 * AVX-512BW kernel (unaligned loads/stores): saturating unsigned 16-bit
 * addition. Thirty-two elements are handled per 512-bit operation; leftover
 * elements fall through to a scalar loop that clamps at 65535.
 *
 * outVector  - destination, receives num_points results
 * inVectorA  - first operand array
 * inVectorB  - second operand array
 * num_points - number of elements to process
 */
static inline void volk_16u_x2_add_saturated_16u_u_avx512bw(uint16_t* outVector,
                                                            const uint16_t* inVectorA,
                                                            const uint16_t* inVectorB,
                                                            unsigned int num_points)
{
    /* Number of elements covered by whole 32-lane vectors. */
    const unsigned int vectorized = (num_points / 32) * 32;
    unsigned int idx;

    for (idx = 0; idx < vectorized; idx += 32) {
        const __m512i lhs = _mm512_loadu_si512((const __m512i*)&inVectorA[idx]);
        const __m512i rhs = _mm512_loadu_si512((const __m512i*)&inVectorB[idx]);
        const __m512i clamped = _mm512_adds_epu16(lhs, rhs);
        _mm512_storeu_si512((__m512i*)&outVector[idx], clamped);
    }

    /* Scalar tail: widen to 32 bits so the overflow can be detected. */
    for (; idx < num_points; idx++) {
        uint32_t wide = (uint32_t)inVectorA[idx];
        wide += (uint32_t)inVectorB[idx];
        outVector[idx] = (uint16_t)(wide > 65535 ? 65535 : wide);
    }
}
156
157#endif /* LV_HAVE_AVX512BW */
158
159
160#endif /* INCLUDED_volk_16u_x2_add_saturated_16u_u_H */
161
162
163#ifndef INCLUDED_volk_16u_x2_add_saturated_16u_a_H
164#define INCLUDED_volk_16u_x2_add_saturated_16u_a_H
165
166#include <inttypes.h>
167
168#ifdef LV_HAVE_SSE2
169#include <emmintrin.h>
170
/*
 * SSE2 kernel (aligned loads/stores): saturating unsigned 16-bit addition.
 * The vector loop uses the aligned load/store intrinsics, so all three
 * buffers must be 16-byte aligned. Eight elements are handled per 128-bit
 * operation; leftovers are clamped at 65535 in scalar code.
 *
 * outVector  - destination, receives num_points results
 * inVectorA  - first operand array
 * inVectorB  - second operand array
 * num_points - number of elements to process
 */
static inline void volk_16u_x2_add_saturated_16u_a_sse2(uint16_t* outVector,
                                                        const uint16_t* inVectorA,
                                                        const uint16_t* inVectorB,
                                                        unsigned int num_points)
{
    const unsigned int blockCount = num_points / 8;
    const __m128i* vecA = (const __m128i*)inVectorA;
    const __m128i* vecB = (const __m128i*)inVectorB;
    __m128i* vecOut = (__m128i*)outVector;
    unsigned int idx;

    for (unsigned int blk = 0; blk < blockCount; blk++) {
        const __m128i lhs = _mm_load_si128(vecA + blk);
        const __m128i rhs = _mm_load_si128(vecB + blk);
        _mm_store_si128(vecOut + blk, _mm_adds_epu16(lhs, rhs));
    }

    /* Finish the elements beyond the last complete 8-lane vector. */
    idx = blockCount * 8;
    while (idx < num_points) {
        const uint32_t wide = (uint32_t)inVectorA[idx] + (uint32_t)inVectorB[idx];
        outVector[idx] = (uint16_t)(wide > 65535 ? 65535 : wide);
        idx++;
    }
}
193
194#endif /* LV_HAVE_SSE2 */
195
196
197#ifdef LV_HAVE_AVX2
198#include <immintrin.h>
199
/*
 * AVX2 kernel (aligned loads/stores): saturating unsigned 16-bit addition.
 * The vector loop uses the aligned load/store intrinsics, so all three
 * buffers must be 32-byte aligned. Sixteen elements are handled per 256-bit
 * operation; leftovers are clamped at 65535 in scalar code.
 *
 * outVector  - destination, receives num_points results
 * inVectorA  - first operand array
 * inVectorB  - second operand array
 * num_points - number of elements to process
 */
static inline void volk_16u_x2_add_saturated_16u_a_avx2(uint16_t* outVector,
                                                        const uint16_t* inVectorA,
                                                        const uint16_t* inVectorB,
                                                        unsigned int num_points)
{
    /* Number of elements covered by whole 16-lane vectors. */
    const unsigned int vectorized = (num_points / 16) * 16;
    unsigned int idx = 0;

    while (idx < vectorized) {
        const __m256i lhs = _mm256_load_si256((const __m256i*)&inVectorA[idx]);
        const __m256i rhs = _mm256_load_si256((const __m256i*)&inVectorB[idx]);
        const __m256i clamped = _mm256_adds_epu16(lhs, rhs);
        _mm256_store_si256((__m256i*)&outVector[idx], clamped);
        idx += 16;
    }

    /* Scalar tail: widen to 32 bits so the overflow can be detected. */
    while (idx < num_points) {
        const uint32_t wide = (uint32_t)inVectorA[idx] + (uint32_t)inVectorB[idx];
        outVector[idx] = (uint16_t)(wide > 65535 ? 65535 : wide);
        idx++;
    }
}
222
223#endif /* LV_HAVE_AVX2 */
224
225
226#ifdef LV_HAVE_AVX512BW
227#include <immintrin.h>
228
/*
 * AVX-512BW kernel (aligned loads/stores): saturating unsigned 16-bit
 * addition. The vector loop uses the aligned load/store intrinsics, so all
 * three buffers must be 64-byte aligned. Thirty-two elements are handled per
 * 512-bit operation; leftovers are clamped at 65535 in scalar code.
 *
 * outVector  - destination, receives num_points results
 * inVectorA  - first operand array
 * inVectorB  - second operand array
 * num_points - number of elements to process
 */
static inline void volk_16u_x2_add_saturated_16u_a_avx512bw(uint16_t* outVector,
                                                            const uint16_t* inVectorA,
                                                            const uint16_t* inVectorB,
                                                            unsigned int num_points)
{
    const uint16_t* srcA = inVectorA;
    const uint16_t* srcB = inVectorB;
    uint16_t* dst = outVector;

    /* Whole 32-lane vectors first. */
    for (unsigned int left = num_points / 32; left != 0; left--) {
        const __m512i lhs = _mm512_load_si512((const __m512i*)srcA);
        const __m512i rhs = _mm512_load_si512((const __m512i*)srcB);
        _mm512_store_si512((__m512i*)dst, _mm512_adds_epu16(lhs, rhs));
        srcA += 32;
        srcB += 32;
        dst += 32;
    }

    /* Then the remaining (num_points mod 32) elements, one at a time. */
    for (unsigned int left = num_points % 32; left != 0; left--) {
        const uint32_t wide = (uint32_t)(*srcA++) + (uint32_t)(*srcB++);
        *dst++ = (uint16_t)(wide > 65535 ? 65535 : wide);
    }
}
251
252#endif /* LV_HAVE_AVX512BW */
253
254
255#ifdef LV_HAVE_NEON
256#include <arm_neon.h>
257
/*
 * NEON kernel: saturating unsigned 16-bit addition. Eight elements are
 * handled per 128-bit vqaddq_u16; any leftover elements are finished with
 * scalar code that clamps at 65535.
 *
 * outVector  - destination, receives num_points results
 * inVectorA  - first operand array
 * inVectorB  - second operand array
 * num_points - number of elements to process
 */
static inline void volk_16u_x2_add_saturated_16u_neon(uint16_t* outVector,
                                                      const uint16_t* inVectorA,
                                                      const uint16_t* inVectorB,
                                                      unsigned int num_points)
{
    /* Number of elements covered by whole 8-lane vectors. */
    const unsigned int vectorized = (num_points / 8) * 8;
    unsigned int idx;

    for (idx = 0; idx < vectorized; idx += 8) {
        const uint16x8_t lhs = vld1q_u16(&inVectorA[idx]);
        const uint16x8_t rhs = vld1q_u16(&inVectorB[idx]);
        const uint16x8_t clamped = vqaddq_u16(lhs, rhs);
        vst1q_u16(&outVector[idx], clamped);
    }

    /* Scalar tail: widen to 32 bits so the overflow can be detected. */
    for (; idx < num_points; idx++) {
        const uint32_t wide = (uint32_t)inVectorA[idx] + (uint32_t)inVectorB[idx];
        outVector[idx] = (uint16_t)(wide > 65535 ? 65535 : wide);
    }
}
279
280#endif /* LV_HAVE_NEON */
281
282
283#ifdef LV_HAVE_NEONV8
284#include <arm_neon.h>
285#include <volk/volk_common.h>
286
/*
 * NEONv8 kernel: saturating unsigned 16-bit addition, unrolled to process
 * sixteen elements (two 128-bit vectors) per iteration, with a prefetch
 * issued 32 elements ahead of the current read position on both inputs.
 * Leftover elements are finished with scalar code that clamps at 65535.
 *
 * outVector  - destination, receives num_points results
 * inVectorA  - first operand array
 * inVectorB  - second operand array
 * num_points - number of elements to process
 */
static inline void volk_16u_x2_add_saturated_16u_neonv8(uint16_t* outVector,
                                                        const uint16_t* inVectorA,
                                                        const uint16_t* inVectorB,
                                                        unsigned int num_points)
{
    /* Number of elements covered by whole 16-element iterations. */
    const unsigned int vectorized = (num_points / 16) * 16;
    unsigned int idx;

    for (idx = 0; idx < vectorized; idx += 16) {
        const uint16_t* srcA = inVectorA + idx;
        const uint16_t* srcB = inVectorB + idx;
        uint16_t* dst = outVector + idx;

        __VOLK_PREFETCH(srcA + 32);
        __VOLK_PREFETCH(srcB + 32);

        /* Issue all four loads before the first store, as two independent halves. */
        const uint16x8_t lowA = vld1q_u16(srcA);
        const uint16x8_t lowB = vld1q_u16(srcB);
        const uint16x8_t highA = vld1q_u16(srcA + 8);
        const uint16x8_t highB = vld1q_u16(srcB + 8);

        vst1q_u16(dst, vqaddq_u16(lowA, lowB));
        vst1q_u16(dst + 8, vqaddq_u16(highA, highB));
    }

    /* Scalar tail: widen to 32 bits so the overflow can be detected. */
    for (; idx < num_points; idx++) {
        const uint32_t wide = (uint32_t)inVectorA[idx] + (uint32_t)inVectorB[idx];
        outVector[idx] = (uint16_t)(wide > 65535 ? 65535 : wide);
    }
}
316
317#endif /* LV_HAVE_NEONV8 */
318
319
320#ifdef LV_HAVE_RVV
321#include <riscv_vector.h>
322
/*
 * RISC-V Vector kernel: saturating unsigned 16-bit addition. Each pass asks
 * the hardware how many e16/m8 elements it will handle (vl), processes that
 * many with vsaddu, and advances; no separate scalar tail is required.
 *
 * outVector  - destination, receives num_points results
 * inVectorA  - first operand array
 * inVectorB  - second operand array
 * num_points - number of elements to process
 */
static inline void volk_16u_x2_add_saturated_16u_rvv(uint16_t* outVector,
                                                     const uint16_t* inVectorA,
                                                     const uint16_t* inVectorB,
                                                     unsigned int num_points)
{
    size_t remaining = num_points;

    while (remaining > 0) {
        /* Elements granted for this pass; never more than `remaining`. */
        const size_t vl = __riscv_vsetvl_e16m8(remaining);

        vuint16m8_t lhs = __riscv_vle16_v_u16m8(inVectorA, vl);
        vuint16m8_t rhs = __riscv_vle16_v_u16m8(inVectorB, vl);
        vuint16m8_t clamped = __riscv_vsaddu(lhs, rhs, vl);
        __riscv_vse16(outVector, clamped, vl);

        inVectorA += vl;
        inVectorB += vl;
        outVector += vl;
        remaining -= vl;
    }
}
336
337#endif /* LV_HAVE_RVV */
338
339
340#endif /* INCLUDED_volk_16u_x2_add_saturated_16u_a_H */