50#ifndef INCLUDED_volk_32f_accumulator_s32f_a_H
51#define INCLUDED_volk_32f_accumulator_s32f_a_H
59static inline void volk_32f_accumulator_s32f_a_avx512f(
float* result,
60 const float* inputBuffer,
61 unsigned int num_points)
63 float returnValue = 0;
64 unsigned int number = 0;
65 const unsigned int sixteenthPoints = num_points / 16;
67 const float* aPtr = inputBuffer;
69 __m512 accumulator = _mm512_setzero_ps();
70 __m512 aVal = _mm512_setzero_ps();
72 for (; number < sixteenthPoints; number++) {
73 aVal = _mm512_load_ps(aPtr);
74 accumulator = _mm512_add_ps(accumulator, aVal);
79 returnValue = _mm512_reduce_add_ps(accumulator);
81 number = sixteenthPoints * 16;
82 for (; number < num_points; number++) {
83 returnValue += (*aPtr++);
85 *result = returnValue;
/* Fragment of a 256-bit AVX accumulation kernel that sums num_points floats
 * into *result. Only the tail of the parameter list and the body statements
 * are visible here: the line naming the function, the declaration of `result`
 * and the declaration of `tempBuffer` are outside this view. */
94 const float* inputBuffer,
95 unsigned int num_points)
/* Running scalar total, loop counter, and number of complete 8-float groups. */
97 float returnValue = 0;
98 unsigned int number = 0;
99 const unsigned int eighthPoints = num_points / 8;
/* Read cursor over the input. */
101 const float* aPtr = inputBuffer;
/* Eight independent partial sums, all starting at zero. */
104 __m256 accumulator = _mm256_setzero_ps();
105 __m256 aVal = _mm256_setzero_ps();
/* Vector loop: _mm256_load_ps is the aligned load, so aPtr must be 32-byte
 * aligned here.
 * NOTE(review): no visible statement advances aPtr inside this loop. Confirm
 * that an 8-float stride increment exists in the full file; otherwise every
 * iteration re-reads the same eight values. */
107 for (; number < eighthPoints; number++) {
108 aVal = _mm256_load_ps(aPtr);
109 accumulator = _mm256_add_ps(accumulator, aVal);
/* Horizontal sum: spill the eight lanes to memory and add them in lane order.
 * NOTE(review): _mm256_store_ps needs a 32-byte-aligned destination; the
 * declaration of tempBuffer is not visible. Confirm it is an aligned float[8]. */
113 _mm256_store_ps(tempBuffer, accumulator);
115 returnValue = tempBuffer[0];
116 returnValue += tempBuffer[1];
117 returnValue += tempBuffer[2];
118 returnValue += tempBuffer[3];
119 returnValue += tempBuffer[4];
120 returnValue += tempBuffer[5];
121 returnValue += tempBuffer[6];
122 returnValue += tempBuffer[7];
/* Scalar tail: add the num_points % 8 elements the vector loop did not cover. */
124 number = eighthPoints * 8;
125 for (; number < num_points; number++) {
126 returnValue += (*aPtr++);
/* Publish the total through the output pointer. */
128 *result = returnValue;
133#ifdef LV_HAVE_AVX512F
134#include <immintrin.h>
136static inline void volk_32f_accumulator_s32f_u_avx512f(
float* result,
137 const float* inputBuffer,
138 unsigned int num_points)
140 float returnValue = 0;
141 unsigned int number = 0;
142 const unsigned int sixteenthPoints = num_points / 16;
144 const float* aPtr = inputBuffer;
146 __m512 accumulator = _mm512_setzero_ps();
147 __m512 aVal = _mm512_setzero_ps();
149 for (; number < sixteenthPoints; number++) {
150 aVal = _mm512_loadu_ps(aPtr);
151 accumulator = _mm512_add_ps(accumulator, aVal);
156 returnValue = _mm512_reduce_add_ps(accumulator);
158 number = sixteenthPoints * 16;
159 for (; number < num_points; number++) {
160 returnValue += (*aPtr++);
162 *result = returnValue;
168#include <immintrin.h>
/* Fragment of a 256-bit AVX accumulation kernel (unaligned-load flavour) that
 * sums num_points floats into *result. The line naming the function, the
 * declaration of `result` and the declaration of `tempBuffer` are outside
 * this view. */
171 const float* inputBuffer,
172 unsigned int num_points)
/* Running scalar total, loop counter, and number of complete 8-float groups. */
174 float returnValue = 0;
175 unsigned int number = 0;
176 const unsigned int eighthPoints = num_points / 8;
/* Read cursor over the input. */
178 const float* aPtr = inputBuffer;
/* Eight independent partial sums, all starting at zero. */
181 __m256 accumulator = _mm256_setzero_ps();
182 __m256 aVal = _mm256_setzero_ps();
/* Vector loop: _mm256_loadu_ps places no alignment requirement on aPtr.
 * NOTE(review): no visible statement advances aPtr inside this loop. Confirm
 * that an 8-float stride increment exists in the full file; otherwise every
 * iteration re-reads the same eight values. */
184 for (; number < eighthPoints; number++) {
185 aVal = _mm256_loadu_ps(aPtr);
186 accumulator = _mm256_add_ps(accumulator, aVal);
/* Horizontal sum: spill the eight lanes to memory and add them in lane order.
 * NOTE(review): the store is the aligned _mm256_store_ps even though the loads
 * are unaligned, so tempBuffer must be 32-byte aligned. Its declaration is not
 * visible; confirm. */
190 _mm256_store_ps(tempBuffer, accumulator);
192 returnValue = tempBuffer[0];
193 returnValue += tempBuffer[1];
194 returnValue += tempBuffer[2];
195 returnValue += tempBuffer[3];
196 returnValue += tempBuffer[4];
197 returnValue += tempBuffer[5];
198 returnValue += tempBuffer[6];
199 returnValue += tempBuffer[7];
/* Scalar tail: add the num_points % 8 elements the vector loop did not cover. */
201 number = eighthPoints * 8;
202 for (; number < num_points; number++) {
203 returnValue += (*aPtr++);
/* Publish the total through the output pointer. */
205 *result = returnValue;
211#include <xmmintrin.h>
/* Fragment of a 128-bit SSE accumulation kernel (aligned-load flavour) that
 * sums num_points floats into *result. The line naming the function, the
 * declaration of `result` and the declaration of `tempBuffer` are outside
 * this view. */
214 const float* inputBuffer,
215 unsigned int num_points)
/* Running scalar total, loop counter, and number of complete 4-float groups. */
217 float returnValue = 0;
218 unsigned int number = 0;
219 const unsigned int quarterPoints = num_points / 4;
/* Read cursor over the input. */
221 const float* aPtr = inputBuffer;
/* Four independent partial sums, all starting at zero. */
224 __m128 accumulator = _mm_setzero_ps();
225 __m128 aVal = _mm_setzero_ps();
/* Vector loop: _mm_load_ps is the aligned load, so aPtr must be 16-byte
 * aligned here.
 * NOTE(review): no visible statement advances aPtr inside this loop. Confirm
 * that a 4-float stride increment exists in the full file. */
227 for (; number < quarterPoints; number++) {
228 aVal = _mm_load_ps(aPtr);
229 accumulator = _mm_add_ps(accumulator, aVal);
/* Horizontal sum: spill the four lanes to memory and add them in lane order.
 * NOTE(review): _mm_store_ps needs a 16-byte-aligned destination; the
 * declaration of tempBuffer is not visible. Confirm it is an aligned float[4]. */
233 _mm_store_ps(tempBuffer, accumulator);
235 returnValue = tempBuffer[0];
236 returnValue += tempBuffer[1];
237 returnValue += tempBuffer[2];
238 returnValue += tempBuffer[3];
/* Scalar tail: add the num_points % 4 elements the vector loop did not cover. */
240 number = quarterPoints * 4;
241 for (; number < num_points; number++) {
242 returnValue += (*aPtr++);
/* Publish the total through the output pointer. */
244 *result = returnValue;
250#include <xmmintrin.h>
/* Fragment of a 128-bit SSE accumulation kernel (unaligned-load flavour) that
 * sums num_points floats into *result. The line naming the function, the
 * declaration of `result` and the declaration of `tempBuffer` are outside
 * this view. */
253 const float* inputBuffer,
254 unsigned int num_points)
/* Running scalar total, loop counter, and number of complete 4-float groups. */
256 float returnValue = 0;
257 unsigned int number = 0;
258 const unsigned int quarterPoints = num_points / 4;
/* Read cursor over the input. */
260 const float* aPtr = inputBuffer;
/* Four independent partial sums, all starting at zero. */
263 __m128 accumulator = _mm_setzero_ps();
264 __m128 aVal = _mm_setzero_ps();
/* Vector loop: _mm_loadu_ps places no alignment requirement on aPtr.
 * NOTE(review): no visible statement advances aPtr inside this loop. Confirm
 * that a 4-float stride increment exists in the full file. */
266 for (; number < quarterPoints; number++) {
267 aVal = _mm_loadu_ps(aPtr);
268 accumulator = _mm_add_ps(accumulator, aVal);
/* Horizontal sum: spill the four lanes to memory and add them in lane order.
 * NOTE(review): the store is the aligned _mm_store_ps, so tempBuffer must be
 * 16-byte aligned. Its declaration is not visible; confirm. */
272 _mm_store_ps(tempBuffer, accumulator);
274 returnValue = tempBuffer[0];
275 returnValue += tempBuffer[1];
276 returnValue += tempBuffer[2];
277 returnValue += tempBuffer[3];
/* Scalar tail: add the num_points % 4 elements the vector loop did not cover. */
279 number = quarterPoints * 4;
280 for (; number < num_points; number++) {
281 returnValue += (*aPtr++);
/* Publish the total through the output pointer. */
283 *result = returnValue;
/* Fragment of a NEON accumulation kernel that sums num_points floats into
 * *result. The line naming the function, the declaration of `result` and the
 * declaration of `aVal` are outside this view. */
292 const float* inputBuffer,
293 unsigned int num_points)
/* Running scalar total, loop counter, and number of complete 4-float groups. */
295 float returnValue = 0;
296 unsigned int number = 0;
297 const unsigned int quarterPoints = num_points / 4;
/* Read cursor over the input, and four partial sums starting at zero. */
299 const float* aPtr = inputBuffer;
300 float32x4_t accumulator = vdupq_n_f32(0.0f);
/* Vector loop: load four floats and add them lane-wise into the accumulator.
 * NOTE(review): no visible statement advances aPtr inside this loop, and aVal
 * has no visible declaration. Confirm both exist in the full file. */
303 for (; number < quarterPoints; number++) {
304 aVal = vld1q_f32(aPtr);
305 accumulator = vaddq_f32(accumulator, aVal);
/* Horizontal sum in two steps: add the low and high halves (4 lanes -> 2),
 * then a pairwise add (2 -> 1); lane 0 then holds the total of all four. */
310 float32x2_t sum_pair =
311 vadd_f32(vget_low_f32(accumulator), vget_high_f32(accumulator));
312 sum_pair = vpadd_f32(sum_pair, sum_pair);
313 returnValue = vget_lane_f32(sum_pair, 0);
/* Scalar tail: add the num_points % 4 elements the vector loop did not cover. */
315 number = quarterPoints * 4;
316 for (; number < num_points; number++) {
317 returnValue += (*aPtr++);
/* Publish the total through the output pointer. */
319 *result = returnValue;
/*!
 * \brief Sums num_points floats using AArch64 NEON with two accumulators.
 *
 * \param result      Receives the sum of all input values (0 when num_points is 0).
 * \param inputBuffer Input samples.
 * \param num_points  Number of floats to accumulate.
 */
static inline void volk_32f_accumulator_s32f_neonv8(float* result,
                                                    const float* inputBuffer,
                                                    unsigned int num_points)
{
    float returnValue = 0;
    unsigned int number = 0;
    const unsigned int eighthPoints = num_points / 8;

    const float* aPtr = inputBuffer;
    /* Two independent accumulators so consecutive adds do not depend on each
     * other. */
    float32x4_t accumulator0 = vdupq_n_f32(0.0f);
    float32x4_t accumulator1 = vdupq_n_f32(0.0f);

    /* Consume eight floats per iteration, four per accumulator. */
    for (; number < eighthPoints; number++) {
        float32x4_t aVal0 = vld1q_f32(aPtr);
        float32x4_t aVal1 = vld1q_f32(aPtr + 4);

        accumulator0 = vaddq_f32(accumulator0, aVal0);
        accumulator1 = vaddq_f32(accumulator1, aVal1);
        /* Step to the next group of 8 floats; without this the loop would
         * re-read the same data and the scalar tail below would start at the
         * wrong element. */
        aPtr += 8;
    }

    /* Merge the two accumulators, then reduce the four lanes to one scalar. */
    accumulator0 = vaddq_f32(accumulator0, accumulator1);
    returnValue = vaddvq_f32(accumulator0);

    /* Scalar tail for the remaining num_points % 8 elements. */
    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        returnValue += (*aPtr++);
    }

    *result = returnValue;
}
364#ifdef LV_HAVE_GENERIC
/* Fragment of the portable scalar accumulation kernel that sums num_points
 * floats into *result. The line naming the function and declaring `result` is
 * outside this view. */
366 const float* inputBuffer,
367 unsigned int num_points)
/* Read cursor, loop counter, and running total. */
369 const float* aPtr = inputBuffer;
370 unsigned int number = 0;
371 float returnValue = 0;
/* Add each element in input order, advancing the cursor as it goes. */
373 for (; number < num_points; number++) {
374 returnValue += (*aPtr++);
/* Publish the total through the output pointer (0 when num_points is 0). */
376 *result = returnValue;
381#include <riscv_vector.h>
/* RISC-V Vector kernel: sums num_points floats from inputBuffer into *result.
 * Partial sums are kept in an LMUL=8 register group and reduced to a scalar
 * after the loop. */
384static inline void volk_32f_accumulator_s32f_rvv(
float* result,
385 const float* inputBuffer,
386 unsigned int num_points)
/* Zero every lane of the m8 accumulator (vl = VLMAX for e32/m8). */
388 vfloat32m8_t vsum = __riscv_vfmv_v_f_f32m8(0, __riscv_vsetvlmax_e32m8());
/* Strip-mined loop: each pass asks the hardware how many elements (vl) it will
 * process, then advances the input pointer and decrements the remaining count
 * by that amount. */
389 size_t n = num_points;
390 for (
size_t vl; n > 0; n -= vl, inputBuffer += vl) {
391 vl = __riscv_vsetvl_e32m8(n);
392 vfloat32m8_t v = __riscv_vle32_v_f32m8(inputBuffer, vl);
/* The tail-undisturbed (_tu) add leaves lanes at index >= vl holding their
 * previous partial sums, so a short final pass does not clobber them. */
393 vsum = __riscv_vfadd_tu(vsum, vsum, v, vl);
/* Final reduction, performed at m1 width. */
395 size_t vl = __riscv_vsetvlmax_e32m1();
/* Scalar seed of 0 for the reduction. */
397 vfloat32m1_t z = __riscv_vfmv_s_f_f32m1(0, vl);
/* NOTE(review): the `v` reduced here has no visible declaration in this scope.
 * The only `v` shown is the m8 load inside the loop. Presumably a statement
 * that folds `vsum` down to an m1 vector named `v` sits between these lines
 * in the full file; confirm.
 * vfredusum is the unordered sum reduction, so the order in which lanes are
 * added is not fixed. */
398 *result = __riscv_vfmv_f(__riscv_vfredusum(v, z, vl));