Vector Optimized Library of Kernels 3.3.0
Architecture-tuned implementations of math kernels
Loading...
Searching...
No Matches
volk_32f_x2_add_32f.h
Go to the documentation of this file.
1/* -*- c++ -*- */
2/*
3 * Copyright 2012, 2014 Free Software Foundation, Inc.
4 *
5 * This file is part of VOLK
6 *
7 * SPDX-License-Identifier: LGPL-3.0-or-later
8 */
9
60
61#ifndef INCLUDED_volk_32f_x2_add_32f_u_H
62#define INCLUDED_volk_32f_x2_add_32f_u_H
63
64#include <inttypes.h>
65#include <stdio.h>
66
67#ifdef LV_HAVE_AVX512F
68#include <immintrin.h>
69
70static inline void volk_32f_x2_add_32f_u_avx512f(float* cVector,
71 const float* aVector,
72 const float* bVector,
73 unsigned int num_points)
74{
75 unsigned int number = 0;
76 const unsigned int sixteenthPoints = num_points / 16;
77
78 float* cPtr = cVector;
79 const float* aPtr = aVector;
80 const float* bPtr = bVector;
81
82 __m512 aVal, bVal, cVal;
83 for (; number < sixteenthPoints; number++) {
84
85 aVal = _mm512_loadu_ps(aPtr);
86 bVal = _mm512_loadu_ps(bPtr);
87
88 cVal = _mm512_add_ps(aVal, bVal);
89
90 _mm512_storeu_ps(cPtr, cVal); // Store the results back into the C container
91
92 aPtr += 16;
93 bPtr += 16;
94 cPtr += 16;
95 }
96
97 number = sixteenthPoints * 16;
98
99 for (; number < num_points; number++) {
100 *cPtr++ = (*aPtr++) + (*bPtr++);
101 }
102}
103
104#endif /* LV_HAVE_AVX512F */
105
106
107#ifdef LV_HAVE_AVX
108#include <immintrin.h>
109
110static inline void volk_32f_x2_add_32f_u_avx(float* cVector,
111 const float* aVector,
112 const float* bVector,
113 unsigned int num_points)
114{
115 unsigned int number = 0;
116 const unsigned int eighthPoints = num_points / 8;
117 float* cPtr = cVector;
118 const float* aPtr = aVector;
119 const float* bPtr = bVector;
120 __m256 aVal, bVal, cVal;
121 for (; number < eighthPoints; number++) {
122
123 aVal = _mm256_loadu_ps(aPtr);
124 bVal = _mm256_loadu_ps(bPtr);
125
126 cVal = _mm256_add_ps(aVal, bVal);
127
128 _mm256_storeu_ps(cPtr, cVal); // Store the results back into the C container
129
130 aPtr += 8;
131 bPtr += 8;
132 cPtr += 8;
133 }
134
135 number = eighthPoints * 8;
136
137 for (; number < num_points; number++) {
138 *cPtr++ = (*aPtr++) + (*bPtr++);
139 }
140}
141#endif /* LV_HAVE_AVX */
142
143
144#ifdef LV_HAVE_SSE
145#include <xmmintrin.h>
146
/* Element-wise float add, unaligned SSE path: cVector[i] = aVector[i] + bVector[i]. */
static inline void volk_32f_x2_add_32f_u_sse(float* cVector,
                                             const float* aVector,
                                             const float* bVector,
                                             unsigned int num_points)
{
    float* out = cVector;
    const float* inA = aVector;
    const float* inB = bVector;

    const unsigned int vec_iters = num_points / 4;
    unsigned int i;

    for (i = 0; i < vec_iters; i++) {
        const __m128 a = _mm_loadu_ps(inA);
        const __m128 b = _mm_loadu_ps(inB);
        _mm_storeu_ps(out, _mm_add_ps(a, b)); // write 4 sums back out

        inA += 4;
        inB += 4;
        out += 4;
    }

    // Scalar tail: the final num_points % 4 elements.
    for (i = vec_iters * 4; i < num_points; i++) {
        *out++ = (*inA++) + (*inB++);
    }
}
179#endif /* LV_HAVE_SSE */
180
181
182#ifdef LV_HAVE_GENERIC
183
/* Element-wise float add, portable scalar reference: cVector[i] = aVector[i] + bVector[i]. */
static inline void volk_32f_x2_add_32f_generic(float* cVector,
                                               const float* aVector,
                                               const float* bVector,
                                               unsigned int num_points)
{
    unsigned int i;
    for (i = 0; i < num_points; i++) {
        cVector[i] = aVector[i] + bVector[i];
    }
}
198#endif /* LV_HAVE_GENERIC */
199
200
201#endif /* INCLUDED_volk_32f_x2_add_32f_u_H */
202#ifndef INCLUDED_volk_32f_x2_add_32f_a_H
203#define INCLUDED_volk_32f_x2_add_32f_a_H
204
205#include <inttypes.h>
206#include <stdio.h>
207
208#ifdef LV_HAVE_AVX512F
209#include <immintrin.h>
210
211static inline void volk_32f_x2_add_32f_a_avx512f(float* cVector,
212 const float* aVector,
213 const float* bVector,
214 unsigned int num_points)
215{
216 unsigned int number = 0;
217 const unsigned int sixteenthPoints = num_points / 16;
218
219 float* cPtr = cVector;
220 const float* aPtr = aVector;
221 const float* bPtr = bVector;
222
223 __m512 aVal, bVal, cVal;
224 for (; number < sixteenthPoints; number++) {
225
226 aVal = _mm512_load_ps(aPtr);
227 bVal = _mm512_load_ps(bPtr);
228
229 cVal = _mm512_add_ps(aVal, bVal);
230
231 _mm512_store_ps(cPtr, cVal); // Store the results back into the C container
232
233 aPtr += 16;
234 bPtr += 16;
235 cPtr += 16;
236 }
237
238 number = sixteenthPoints * 16;
239
240 for (; number < num_points; number++) {
241 *cPtr++ = (*aPtr++) + (*bPtr++);
242 }
243}
244
245#endif /* LV_HAVE_AVX512F */
246
247
248#ifdef LV_HAVE_AVX
249#include <immintrin.h>
250
251static inline void volk_32f_x2_add_32f_a_avx(float* cVector,
252 const float* aVector,
253 const float* bVector,
254 unsigned int num_points)
255{
256 unsigned int number = 0;
257 const unsigned int eighthPoints = num_points / 8;
258
259 float* cPtr = cVector;
260 const float* aPtr = aVector;
261 const float* bPtr = bVector;
262
263 __m256 aVal, bVal, cVal;
264 for (; number < eighthPoints; number++) {
265
266 aVal = _mm256_load_ps(aPtr);
267 bVal = _mm256_load_ps(bPtr);
268
269 cVal = _mm256_add_ps(aVal, bVal);
270
271 _mm256_store_ps(cPtr, cVal); // Store the results back into the C container
272
273 aPtr += 8;
274 bPtr += 8;
275 cPtr += 8;
276 }
277
278 number = eighthPoints * 8;
279 for (; number < num_points; number++) {
280 *cPtr++ = (*aPtr++) + (*bPtr++);
281 }
282}
283#endif /* LV_HAVE_AVX */
284
285#ifdef LV_HAVE_SSE
286#include <xmmintrin.h>
287
/* Element-wise float add, aligned SSE path.
 * All three buffers must be 16-byte aligned (aligned load/store intrinsics). */
static inline void volk_32f_x2_add_32f_a_sse(float* cVector,
                                             const float* aVector,
                                             const float* bVector,
                                             unsigned int num_points)
{
    float* out = cVector;
    const float* inA = aVector;
    const float* inB = bVector;

    const unsigned int vec_iters = num_points / 4;
    unsigned int i;

    for (i = 0; i < vec_iters; i++) {
        const __m128 a = _mm_load_ps(inA);
        const __m128 b = _mm_load_ps(inB);
        _mm_store_ps(out, _mm_add_ps(a, b)); // write 4 sums back out

        inA += 4;
        inB += 4;
        out += 4;
    }

    // Scalar tail: the final num_points % 4 elements.
    for (i = vec_iters * 4; i < num_points; i++) {
        *out++ = (*inA++) + (*inB++);
    }
}
319#endif /* LV_HAVE_SSE */
320
321
322#ifdef LV_HAVE_NEON
323#include <arm_neon.h>
324
/* Element-wise float add, NEON path: cVector[i] = aVector[i] + bVector[i].
 * Processes 4 floats per iteration, prefetching the next quadword. */
static inline void volk_32f_x2_add_32f_u_neon(float* cVector,
                                              const float* aVector,
                                              const float* bVector,
                                              unsigned int num_points)
{
    float* out = cVector;
    const float* inA = aVector;
    const float* inB = bVector;

    const unsigned int vec_iters = num_points / 4;
    unsigned int i;

    for (i = 0; i < vec_iters; i++) {
        // Load one quadword (4 floats) from each input.
        const float32x4_t a = vld1q_f32(inA);
        const float32x4_t b = vld1q_f32(inB);
        __VOLK_PREFETCH(inA + 4);
        __VOLK_PREFETCH(inB + 4);

        // Vector add and store back into the output buffer.
        vst1q_f32(out, vaddq_f32(a, b));

        inA += 4;
        inB += 4;
        out += 4;
    }

    // Scalar tail: the final num_points % 4 elements.
    for (i = vec_iters * 4; i < num_points; i++) {
        *out++ = (*inA++) + (*inB++);
    }
}
359
360#endif /* LV_HAVE_NEON */
361
362#ifdef LV_HAVE_NEONV7
363extern void volk_32f_x2_add_32f_a_neonasm(float* cVector,
364 const float* aVector,
365 const float* bVector,
366 unsigned int num_points);
367#endif /* LV_HAVE_NEONV7 */
368
369#ifdef LV_HAVE_NEONV7
370extern void volk_32f_x2_add_32f_a_neonpipeline(float* cVector,
371 const float* aVector,
372 const float* bVector,
373 unsigned int num_points);
374#endif /* LV_HAVE_NEONV7 */
375
376
377#ifdef LV_HAVE_NEONV8
378#include <arm_neon.h>
379
/* Element-wise float add, AArch64 NEON path with 2x-unrolled main loop
 * (8 floats per iteration), a single 4-float step, then a scalar tail. */
static inline void volk_32f_x2_add_32f_neonv8(float* cVector,
                                              const float* aVector,
                                              const float* bVector,
                                              unsigned int num_points)
{
    float* outPtr = cVector;
    const float* aPtr = aVector;
    const float* bPtr = bVector;
    unsigned int remaining = num_points;

    /* Main loop: two quadword adds per iteration. */
    for (; remaining >= 8; remaining -= 8) {
        const float32x4_t a_lo = vld1q_f32(aPtr);
        const float32x4_t a_hi = vld1q_f32(aPtr + 4);
        const float32x4_t b_lo = vld1q_f32(bPtr);
        const float32x4_t b_hi = vld1q_f32(bPtr + 4);
        __VOLK_PREFETCH(aPtr + 16);
        __VOLK_PREFETCH(bPtr + 16);

        vst1q_f32(outPtr, vaddq_f32(a_lo, b_lo));
        vst1q_f32(outPtr + 4, vaddq_f32(a_hi, b_hi));

        aPtr += 8;
        bPtr += 8;
        outPtr += 8;
    }

    /* At most one 4-float chunk remains. */
    if (remaining >= 4) {
        vst1q_f32(outPtr, vaddq_f32(vld1q_f32(aPtr), vld1q_f32(bPtr)));
        aPtr += 4;
        bPtr += 4;
        outPtr += 4;
        remaining -= 4;
    }

    /* Scalar tail: fewer than 4 elements left. */
    for (; remaining > 0; remaining--) {
        *outPtr++ = *aPtr++ + *bPtr++;
    }
}
423
424#endif /* LV_HAVE_NEONV8 */
425
426
427#ifdef LV_HAVE_ORC
428
429extern void volk_32f_x2_add_32f_a_orc_impl(float* cVector,
430 const float* aVector,
431 const float* bVector,
432 int num_points);
433
/* Unaligned ORC dispatch: forwards directly to the ORC-generated kernel.
 * NOTE(review): num_points (unsigned int) narrows to int in the ORC impl's
 * signature — counts above INT_MAX would wrap; confirm callers stay below that. */
static inline void volk_32f_x2_add_32f_u_orc(float* cVector,
                                             const float* aVector,
                                             const float* bVector,
                                             unsigned int num_points)
{
    volk_32f_x2_add_32f_a_orc_impl(cVector, aVector, bVector, num_points);
}
441
442#endif /* LV_HAVE_ORC */
443
444#ifdef LV_HAVE_RVV
445#include <riscv_vector.h>
446
/* Element-wise float add, RISC-V Vector path.
 * Strip-mines the whole input: each pass lets the hardware pick the vector
 * length (vl) for the remaining count, so no scalar tail loop is needed. */
static inline void volk_32f_x2_add_32f_rvv(float* cVector,
                                           const float* aVector,
                                           const float* bVector,
                                           unsigned int num_points)
{
    size_t remaining = num_points;
    while (remaining > 0) {
        const size_t vl = __riscv_vsetvl_e32m8(remaining);
        const vfloat32m8_t va = __riscv_vle32_v_f32m8(aVector, vl);
        const vfloat32m8_t vb = __riscv_vle32_v_f32m8(bVector, vl);
        __riscv_vse32(cVector, __riscv_vfadd(va, vb, vl), vl);
        aVector += vl;
        bVector += vl;
        cVector += vl;
        remaining -= vl;
    }
}
460#endif /*LV_HAVE_RVV*/
461
462#endif /* INCLUDED_volk_32f_x2_add_32f_a_H */