Vector Optimized Library of Kernels 3.3.0
Architecture-tuned implementations of math kernels
Loading...
Searching...
No Matches
volk_32f_accumulator_s32f.h
Go to the documentation of this file.
1/* -*- c++ -*- */
2/*
3 * Copyright 2012, 2014 Free Software Foundation, Inc.
4 *
5 * This file is part of VOLK
6 *
7 * SPDX-License-Identifier: LGPL-3.0-or-later
8 */
9
49
50#ifndef INCLUDED_volk_32f_accumulator_s32f_a_H
51#define INCLUDED_volk_32f_accumulator_s32f_a_H
52
53#include <inttypes.h>
54#include <volk/volk_common.h>
55
56#ifdef LV_HAVE_AVX512F
57#include <immintrin.h>
58
59static inline void volk_32f_accumulator_s32f_a_avx512f(float* result,
60 const float* inputBuffer,
61 unsigned int num_points)
62{
63 float returnValue = 0;
64 unsigned int number = 0;
65 const unsigned int sixteenthPoints = num_points / 16;
66
67 const float* aPtr = inputBuffer;
68
69 __m512 accumulator = _mm512_setzero_ps();
70 __m512 aVal = _mm512_setzero_ps();
71
72 for (; number < sixteenthPoints; number++) {
73 aVal = _mm512_load_ps(aPtr);
74 accumulator = _mm512_add_ps(accumulator, aVal);
75 aPtr += 16;
76 }
77
78 // Horizontal sum using AVX512 reduce instruction
79 returnValue = _mm512_reduce_add_ps(accumulator);
80
81 number = sixteenthPoints * 16;
82 for (; number < num_points; number++) {
83 returnValue += (*aPtr++);
84 }
85 *result = returnValue;
86}
87#endif /* LV_HAVE_AVX512F */
88
89
90#ifdef LV_HAVE_AVX
91#include <immintrin.h>
92
93static inline void volk_32f_accumulator_s32f_a_avx(float* result,
94 const float* inputBuffer,
95 unsigned int num_points)
96{
97 float returnValue = 0;
98 unsigned int number = 0;
99 const unsigned int eighthPoints = num_points / 8;
100
101 const float* aPtr = inputBuffer;
102 __VOLK_ATTR_ALIGNED(32) float tempBuffer[8];
103
104 __m256 accumulator = _mm256_setzero_ps();
105 __m256 aVal = _mm256_setzero_ps();
106
107 for (; number < eighthPoints; number++) {
108 aVal = _mm256_load_ps(aPtr);
109 accumulator = _mm256_add_ps(accumulator, aVal);
110 aPtr += 8;
111 }
112
113 _mm256_store_ps(tempBuffer, accumulator);
114
115 returnValue = tempBuffer[0];
116 returnValue += tempBuffer[1];
117 returnValue += tempBuffer[2];
118 returnValue += tempBuffer[3];
119 returnValue += tempBuffer[4];
120 returnValue += tempBuffer[5];
121 returnValue += tempBuffer[6];
122 returnValue += tempBuffer[7];
123
124 number = eighthPoints * 8;
125 for (; number < num_points; number++) {
126 returnValue += (*aPtr++);
127 }
128 *result = returnValue;
129}
130#endif /* LV_HAVE_AVX */
131
132
133#ifdef LV_HAVE_AVX512F
134#include <immintrin.h>
135
136static inline void volk_32f_accumulator_s32f_u_avx512f(float* result,
137 const float* inputBuffer,
138 unsigned int num_points)
139{
140 float returnValue = 0;
141 unsigned int number = 0;
142 const unsigned int sixteenthPoints = num_points / 16;
143
144 const float* aPtr = inputBuffer;
145
146 __m512 accumulator = _mm512_setzero_ps();
147 __m512 aVal = _mm512_setzero_ps();
148
149 for (; number < sixteenthPoints; number++) {
150 aVal = _mm512_loadu_ps(aPtr);
151 accumulator = _mm512_add_ps(accumulator, aVal);
152 aPtr += 16;
153 }
154
155 // Horizontal sum using AVX512 reduce instruction
156 returnValue = _mm512_reduce_add_ps(accumulator);
157
158 number = sixteenthPoints * 16;
159 for (; number < num_points; number++) {
160 returnValue += (*aPtr++);
161 }
162 *result = returnValue;
163}
164#endif /* LV_HAVE_AVX512F */
165
166
167#ifdef LV_HAVE_AVX
168#include <immintrin.h>
169
170static inline void volk_32f_accumulator_s32f_u_avx(float* result,
171 const float* inputBuffer,
172 unsigned int num_points)
173{
174 float returnValue = 0;
175 unsigned int number = 0;
176 const unsigned int eighthPoints = num_points / 8;
177
178 const float* aPtr = inputBuffer;
179 __VOLK_ATTR_ALIGNED(32) float tempBuffer[8];
180
181 __m256 accumulator = _mm256_setzero_ps();
182 __m256 aVal = _mm256_setzero_ps();
183
184 for (; number < eighthPoints; number++) {
185 aVal = _mm256_loadu_ps(aPtr);
186 accumulator = _mm256_add_ps(accumulator, aVal);
187 aPtr += 8;
188 }
189
190 _mm256_store_ps(tempBuffer, accumulator);
191
192 returnValue = tempBuffer[0];
193 returnValue += tempBuffer[1];
194 returnValue += tempBuffer[2];
195 returnValue += tempBuffer[3];
196 returnValue += tempBuffer[4];
197 returnValue += tempBuffer[5];
198 returnValue += tempBuffer[6];
199 returnValue += tempBuffer[7];
200
201 number = eighthPoints * 8;
202 for (; number < num_points; number++) {
203 returnValue += (*aPtr++);
204 }
205 *result = returnValue;
206}
207#endif /* LV_HAVE_AVX */
208
209
210#ifdef LV_HAVE_SSE
211#include <xmmintrin.h>
212
213static inline void volk_32f_accumulator_s32f_a_sse(float* result,
214 const float* inputBuffer,
215 unsigned int num_points)
216{
217 float returnValue = 0;
218 unsigned int number = 0;
219 const unsigned int quarterPoints = num_points / 4;
220
221 const float* aPtr = inputBuffer;
222 __VOLK_ATTR_ALIGNED(16) float tempBuffer[4];
223
224 __m128 accumulator = _mm_setzero_ps();
225 __m128 aVal = _mm_setzero_ps();
226
227 for (; number < quarterPoints; number++) {
228 aVal = _mm_load_ps(aPtr);
229 accumulator = _mm_add_ps(accumulator, aVal);
230 aPtr += 4;
231 }
232
233 _mm_store_ps(tempBuffer, accumulator);
234
235 returnValue = tempBuffer[0];
236 returnValue += tempBuffer[1];
237 returnValue += tempBuffer[2];
238 returnValue += tempBuffer[3];
239
240 number = quarterPoints * 4;
241 for (; number < num_points; number++) {
242 returnValue += (*aPtr++);
243 }
244 *result = returnValue;
245}
246#endif /* LV_HAVE_SSE */
247
248
249#ifdef LV_HAVE_SSE
250#include <xmmintrin.h>
251
252static inline void volk_32f_accumulator_s32f_u_sse(float* result,
253 const float* inputBuffer,
254 unsigned int num_points)
255{
256 float returnValue = 0;
257 unsigned int number = 0;
258 const unsigned int quarterPoints = num_points / 4;
259
260 const float* aPtr = inputBuffer;
261 __VOLK_ATTR_ALIGNED(16) float tempBuffer[4];
262
263 __m128 accumulator = _mm_setzero_ps();
264 __m128 aVal = _mm_setzero_ps();
265
266 for (; number < quarterPoints; number++) {
267 aVal = _mm_loadu_ps(aPtr);
268 accumulator = _mm_add_ps(accumulator, aVal);
269 aPtr += 4;
270 }
271
272 _mm_store_ps(tempBuffer, accumulator);
273
274 returnValue = tempBuffer[0];
275 returnValue += tempBuffer[1];
276 returnValue += tempBuffer[2];
277 returnValue += tempBuffer[3];
278
279 number = quarterPoints * 4;
280 for (; number < num_points; number++) {
281 returnValue += (*aPtr++);
282 }
283 *result = returnValue;
284}
285#endif /* LV_HAVE_SSE */
286
287
288#ifdef LV_HAVE_NEON
289#include <arm_neon.h>
290
291static inline void volk_32f_accumulator_s32f_neon(float* result,
292 const float* inputBuffer,
293 unsigned int num_points)
294{
295 float returnValue = 0;
296 unsigned int number = 0;
297 const unsigned int quarterPoints = num_points / 4;
298
299 const float* aPtr = inputBuffer;
300 float32x4_t accumulator = vdupq_n_f32(0.0f);
301 float32x4_t aVal;
302
303 for (; number < quarterPoints; number++) {
304 aVal = vld1q_f32(aPtr);
305 accumulator = vaddq_f32(accumulator, aVal);
306 aPtr += 4;
307 }
308
309 // Horizontal sum - manual for NEON (ARMv7 compatible)
310 float32x2_t sum_pair =
311 vadd_f32(vget_low_f32(accumulator), vget_high_f32(accumulator));
312 sum_pair = vpadd_f32(sum_pair, sum_pair);
313 returnValue = vget_lane_f32(sum_pair, 0);
314
315 number = quarterPoints * 4;
316 for (; number < num_points; number++) {
317 returnValue += (*aPtr++);
318 }
319 *result = returnValue;
320}
321#endif /* LV_HAVE_NEON */
322
323
324#ifdef LV_HAVE_NEONV8
325#include <arm_neon.h>
326
327static inline void volk_32f_accumulator_s32f_neonv8(float* result,
328 const float* inputBuffer,
329 unsigned int num_points)
330{
331 float returnValue = 0;
332 unsigned int number = 0;
333 const unsigned int eighthPoints = num_points / 8;
334
335 const float* aPtr = inputBuffer;
336 float32x4_t accumulator0 = vdupq_n_f32(0.0f);
337 float32x4_t accumulator1 = vdupq_n_f32(0.0f);
338
339 // 2x unrolled loop for better instruction-level parallelism
340 for (; number < eighthPoints; number++) {
341 float32x4_t aVal0 = vld1q_f32(aPtr);
342 float32x4_t aVal1 = vld1q_f32(aPtr + 4);
343 __VOLK_PREFETCH(aPtr + 8);
344 accumulator0 = vaddq_f32(accumulator0, aVal0);
345 accumulator1 = vaddq_f32(accumulator1, aVal1);
346 aPtr += 8;
347 }
348
349 // Combine accumulators
350 accumulator0 = vaddq_f32(accumulator0, accumulator1);
351
352 // ARMv8 horizontal sum using vaddvq_f32
353 returnValue = vaddvq_f32(accumulator0);
354
355 number = eighthPoints * 8;
356 for (; number < num_points; number++) {
357 returnValue += (*aPtr++);
358 }
359 *result = returnValue;
360}
361#endif /* LV_HAVE_NEONV8 */
362
363
364#ifdef LV_HAVE_GENERIC
/*
 * Portable reference kernel: adds up num_points floats from inputBuffer in
 * index order, starting from 0, and stores the total in *result.
 *
 * When num_points is 0 the input is never read and *result is set to 0.
 */
static inline void volk_32f_accumulator_s32f_generic(float* result,
                                                     const float* inputBuffer,
                                                     unsigned int num_points)
{
    float total = 0;
    const float* src = inputBuffer;
    unsigned int remaining = num_points;

    /* Count down instead of indexing; elements are still visited first to last. */
    while (remaining != 0) {
        total += *src;
        ++src;
        --remaining;
    }

    *result = total;
}
378#endif /* LV_HAVE_GENERIC */
379
380#ifdef LV_HAVE_RVV
381#include <riscv_vector.h>
383
384static inline void volk_32f_accumulator_s32f_rvv(float* result,
385 const float* inputBuffer,
386 unsigned int num_points)
387{
388 vfloat32m8_t vsum = __riscv_vfmv_v_f_f32m8(0, __riscv_vsetvlmax_e32m8());
389 size_t n = num_points;
390 for (size_t vl; n > 0; n -= vl, inputBuffer += vl) {
391 vl = __riscv_vsetvl_e32m8(n);
392 vfloat32m8_t v = __riscv_vle32_v_f32m8(inputBuffer, vl);
393 vsum = __riscv_vfadd_tu(vsum, vsum, v, vl);
394 }
395 size_t vl = __riscv_vsetvlmax_e32m1();
396 vfloat32m1_t v = RISCV_SHRINK8(vfadd, f, 32, vsum);
397 vfloat32m1_t z = __riscv_vfmv_s_f_f32m1(0, vl);
398 *result = __riscv_vfmv_f(__riscv_vfredusum(v, z, vl));
399}
400#endif /*LV_HAVE_RVV*/
401
402#endif /* INCLUDED_volk_32f_accumulator_s32f_a_H */