51#ifndef INCLUDED_volk_32fc_accumulator_s32fc_a_H
52#define INCLUDED_volk_32fc_accumulator_s32fc_a_H
#ifdef LV_HAVE_AVX512F
#include <immintrin.h>

/*!
 * \brief Sum every sample of a complex float vector (AVX-512F, aligned input).
 *
 * \param[out] result      Receives the complex sum of all input samples.
 * \param[in]  inputBuffer Complex input samples. They are read with
 *                         _mm512_load_ps, so the buffer must be 64-byte aligned.
 * \param[in]  num_points  Number of complex samples in inputBuffer.
 */
static inline void volk_32fc_accumulator_s32fc_a_avx512f(lv_32fc_t* result,
                                                         const lv_32fc_t* inputBuffer,
                                                         unsigned int num_points)
{
    /* One 512-bit register holds 16 floats, i.e. 8 interleaved complex samples. */
    const unsigned int eighthPoints = num_points / 8;
    const lv_32fc_t* aPtr = inputBuffer;
    float tempBuffer[16];
    lv_32fc_t returnValue;
    unsigned int number;

    __m512 accumulator = _mm512_setzero_ps();

    for (number = 0; number < eighthPoints; number++) {
        const __m512 aVal = _mm512_load_ps((const float*)aPtr);
        accumulator = _mm512_add_ps(accumulator, aVal);
        aPtr += 8; /* step past the 8 samples just accumulated */
    }

    /* Unaligned store: the scratch array carries no alignment guarantee. */
    _mm512_storeu_ps(tempBuffer, accumulator);

    /* Fold the 8 partial (real, imag) sums in lane order. */
    returnValue = lv_cmake(tempBuffer[0], tempBuffer[1]);
    for (number = 2; number < 16; number += 2) {
        returnValue += lv_cmake(tempBuffer[number], tempBuffer[number + 1]);
    }

    /* Scalar tail for the num_points % 8 samples the vector loop skipped. */
    for (number = eighthPoints * 8; number < num_points; number++) {
        returnValue += (*aPtr++);
    }

    *result = returnValue;
}
#endif /* LV_HAVE_AVX512F */
#ifdef LV_HAVE_AVX512F
#include <immintrin.h>

/*!
 * \brief Sum every sample of a complex float vector (AVX-512F, unaligned input).
 *
 * \param[out] result      Receives the complex sum of all input samples.
 * \param[in]  inputBuffer Complex input samples. They are read with
 *                         _mm512_loadu_ps, so no particular alignment is needed.
 * \param[in]  num_points  Number of complex samples in inputBuffer.
 */
static inline void volk_32fc_accumulator_s32fc_u_avx512f(lv_32fc_t* result,
                                                         const lv_32fc_t* inputBuffer,
                                                         unsigned int num_points)
{
    /* One 512-bit register holds 16 floats, i.e. 8 interleaved complex samples. */
    const unsigned int eighthPoints = num_points / 8;
    const lv_32fc_t* aPtr = inputBuffer;
    float tempBuffer[16];
    lv_32fc_t returnValue;
    unsigned int number;

    __m512 accumulator = _mm512_setzero_ps();

    for (number = 0; number < eighthPoints; number++) {
        const __m512 aVal = _mm512_loadu_ps((const float*)aPtr);
        accumulator = _mm512_add_ps(accumulator, aVal);
        aPtr += 8; /* step past the 8 samples just accumulated */
    }

    /* Unaligned store: the scratch array carries no alignment guarantee. */
    _mm512_storeu_ps(tempBuffer, accumulator);

    /* Fold the 8 partial (real, imag) sums in lane order. */
    returnValue = lv_cmake(tempBuffer[0], tempBuffer[1]);
    for (number = 2; number < 16; number += 2) {
        returnValue += lv_cmake(tempBuffer[number], tempBuffer[number + 1]);
    }

    /* Scalar tail for the num_points % 8 samples the vector loop skipped. */
    for (number = eighthPoints * 8; number < num_points; number++) {
        returnValue += (*aPtr++);
    }

    *result = returnValue;
}
#endif /* LV_HAVE_AVX512F */
/* Scalar accumulator fragment, guarded by LV_HAVE_GENERIC.
 * NOTE(review): this extract is damaged. Each line carries a stray leading
 * number, and the function name, the remaining parameters, the opening brace,
 * the declarations of aPtr/returnValue, the closing braces and the #endif are
 * not visible. Presumably this is the generic variant of the accumulator
 * kernel; restore the missing lines before compiling. */
145#ifdef LV_HAVE_GENERIC
148 unsigned int num_points)
151 unsigned int number = 0;
/* Add each of the num_points samples to the running sum, advancing aPtr. */
154 for (; number < num_points; number++) {
155 returnValue += (*aPtr++);
/* Hand the accumulated sum back through the output pointer. */
157 *result = returnValue;
162#include <immintrin.h>
/* AVX accumulator fragment that reads its input with unaligned loads
 * (_mm256_loadu_ps).
 * NOTE(review): this extract is damaged. Each line carries a stray leading
 * number, and the function header, the declarations of
 * aPtr/tempBuffer/returnValue and all closing braces are not visible.
 * Presumably this is the unaligned AVX variant; confirm the name against the
 * complete file. */
166 unsigned int num_points)
169 unsigned int number = 0;
/* A 256-bit register holds 8 floats, consumed below as 4 lv_cmake pairs. */
170 const unsigned int quarterPoints = num_points / 4;
175 __m256 accumulator = _mm256_setzero_ps();
176 __m256 aVal = _mm256_setzero_ps();
/* Lane-wise accumulation of 8 floats per iteration.
 * NOTE(review): no advance of aPtr is visible inside this loop; presumably
 * that line was dropped from the extract. */
178 for (; number < quarterPoints; number++) {
179 aVal = _mm256_loadu_ps((
float*)aPtr);
180 accumulator = _mm256_add_ps(accumulator, aVal);
/* Spill the lane sums; _mm256_store_ps requires tempBuffer to be 32-byte
 * aligned. */
184 _mm256_store_ps(tempBuffer, accumulator);
/* Fold the four partial sums, taking consecutive floats as lv_cmake pairs. */
186 returnValue =
lv_cmake(tempBuffer[0], tempBuffer[1]);
187 returnValue +=
lv_cmake(tempBuffer[2], tempBuffer[3]);
188 returnValue +=
lv_cmake(tempBuffer[4], tempBuffer[5]);
189 returnValue +=
lv_cmake(tempBuffer[6], tempBuffer[7]);
/* Scalar tail: the num_points % 4 samples not covered by the vector loop. */
191 number = quarterPoints * 4;
192 for (; number < num_points; number++) {
193 returnValue += (*aPtr++);
195 *result = returnValue;
200#include <xmmintrin.h>
/* SSE accumulator fragment that reads its input with unaligned loads
 * (_mm_loadu_ps).
 * NOTE(review): this extract is damaged. Each line carries a stray leading
 * number, and the function header, the declarations of
 * aPtr/tempBuffer/returnValue and all closing braces are not visible.
 * Presumably this is the unaligned SSE variant; confirm the name against the
 * complete file. */
204 unsigned int num_points)
207 unsigned int number = 0;
/* A 128-bit register holds 4 floats, consumed below as 2 lv_cmake pairs. */
208 const unsigned int halfPoints = num_points / 2;
213 __m128 accumulator = _mm_setzero_ps();
214 __m128 aVal = _mm_setzero_ps();
/* Lane-wise accumulation of 4 floats per iteration.
 * NOTE(review): no advance of aPtr is visible inside this loop; presumably
 * that line was dropped from the extract. */
216 for (; number < halfPoints; number++) {
217 aVal = _mm_loadu_ps((
float*)aPtr);
218 accumulator = _mm_add_ps(accumulator, aVal);
/* Spill the lane sums; _mm_store_ps requires tempBuffer to be 16-byte
 * aligned. */
222 _mm_store_ps(tempBuffer, accumulator);
/* Fold the two partial sums, taking consecutive floats as lv_cmake pairs. */
224 returnValue =
lv_cmake(tempBuffer[0], tempBuffer[1]);
225 returnValue +=
lv_cmake(tempBuffer[2], tempBuffer[3]);
/* Scalar tail: at most one sample when num_points is odd. */
227 number = halfPoints * 2;
228 for (; number < num_points; number++) {
229 returnValue += (*aPtr++);
231 *result = returnValue;
236#include <immintrin.h>
/* AVX accumulator fragment that reads its input with aligned loads
 * (_mm256_load_ps), so the input address must be 32-byte aligned.
 * NOTE(review): this extract is damaged. Each line carries a stray leading
 * number, and the function header, the declarations of
 * aPtr/tempBuffer/returnValue and all closing braces are not visible.
 * Presumably this is the aligned AVX variant; confirm the name against the
 * complete file. */
240 unsigned int num_points)
243 unsigned int number = 0;
/* A 256-bit register holds 8 floats, consumed below as 4 lv_cmake pairs. */
244 const unsigned int quarterPoints = num_points / 4;
249 __m256 accumulator = _mm256_setzero_ps();
250 __m256 aVal = _mm256_setzero_ps();
/* Lane-wise accumulation of 8 floats per iteration.
 * NOTE(review): no advance of aPtr is visible inside this loop; presumably
 * that line was dropped from the extract. */
252 for (; number < quarterPoints; number++) {
253 aVal = _mm256_load_ps((
float*)aPtr);
254 accumulator = _mm256_add_ps(accumulator, aVal);
/* Spill the lane sums; _mm256_store_ps requires tempBuffer to be 32-byte
 * aligned. */
258 _mm256_store_ps(tempBuffer, accumulator);
/* Fold the four partial sums, taking consecutive floats as lv_cmake pairs. */
260 returnValue =
lv_cmake(tempBuffer[0], tempBuffer[1]);
261 returnValue +=
lv_cmake(tempBuffer[2], tempBuffer[3]);
262 returnValue +=
lv_cmake(tempBuffer[4], tempBuffer[5]);
263 returnValue +=
lv_cmake(tempBuffer[6], tempBuffer[7]);
/* Scalar tail: the num_points % 4 samples not covered by the vector loop. */
265 number = quarterPoints * 4;
266 for (; number < num_points; number++) {
267 returnValue += (*aPtr++);
269 *result = returnValue;
274#include <xmmintrin.h>
/* SSE accumulator fragment that reads its input with aligned loads
 * (_mm_load_ps), so the input address must be 16-byte aligned.
 * NOTE(review): this extract is damaged. Each line carries a stray leading
 * number, and the function header, the declarations of
 * aPtr/tempBuffer/returnValue and all closing braces are not visible.
 * Presumably this is the aligned SSE variant; confirm the name against the
 * complete file. */
278 unsigned int num_points)
281 unsigned int number = 0;
/* A 128-bit register holds 4 floats, consumed below as 2 lv_cmake pairs. */
282 const unsigned int halfPoints = num_points / 2;
287 __m128 accumulator = _mm_setzero_ps();
288 __m128 aVal = _mm_setzero_ps();
/* Lane-wise accumulation of 4 floats per iteration.
 * NOTE(review): no advance of aPtr is visible inside this loop; presumably
 * that line was dropped from the extract. */
290 for (; number < halfPoints; number++) {
291 aVal = _mm_load_ps((
float*)aPtr);
292 accumulator = _mm_add_ps(accumulator, aVal);
/* Spill the lane sums; _mm_store_ps requires tempBuffer to be 16-byte
 * aligned. */
296 _mm_store_ps(tempBuffer, accumulator);
/* Fold the two partial sums, taking consecutive floats as lv_cmake pairs. */
298 returnValue =
lv_cmake(tempBuffer[0], tempBuffer[1]);
299 returnValue +=
lv_cmake(tempBuffer[2], tempBuffer[3]);
/* Scalar tail: at most one sample when num_points is odd. */
301 number = halfPoints * 2;
302 for (; number < num_points; number++) {
303 returnValue += (*aPtr++);
305 *result = returnValue;
/* NEON accumulator fragment using four independent 128-bit accumulators.
 * NOTE(review): this extract is damaged. Each line carries a stray leading
 * number, and the function header, the declarations of
 * aPtr/in_vec/tempBuffer/returnValue and all closing braces are not visible.
 * Presumably this is the NEON variant; confirm the name against the complete
 * file. */
313 unsigned int num_points)
316 unsigned int number = 0;
/* Four loads of 4 floats per iteration = 16 floats, consumed as 8 pairs. */
318 unsigned int eighthPoints = num_points / 8;
320 float32x4_t out_vec0 = { 0.f, 0.f, 0.f, 0.f };
321 float32x4_t out_vec1 = { 0.f, 0.f, 0.f, 0.f };
322 float32x4_t out_vec2 = { 0.f, 0.f, 0.f, 0.f };
323 float32x4_t out_vec3 = { 0.f, 0.f, 0.f, 0.f };
/* Each load feeds its own accumulator, keeping the four add chains separate.
 * NOTE(review): no advance of aPtr is visible between the four loads;
 * presumably those lines were dropped from the extract. */
326 for (; number < eighthPoints; number++) {
327 in_vec = vld1q_f32((
float*)aPtr);
328 out_vec0 = vaddq_f32(in_vec, out_vec0);
331 in_vec = vld1q_f32((
float*)aPtr);
332 out_vec1 = vaddq_f32(in_vec, out_vec1);
335 in_vec = vld1q_f32((
float*)aPtr);
336 out_vec2 = vaddq_f32(in_vec, out_vec2);
339 in_vec = vld1q_f32((
float*)aPtr);
340 out_vec3 = vaddq_f32(in_vec, out_vec3);
/* Spill each accumulator in turn through the same scratch buffer and fold its
 * two lv_cmake pairs into the running sum. */
343 vst1q_f32(tempBuffer, out_vec0);
344 returnValue =
lv_cmake(tempBuffer[0], tempBuffer[1]);
345 returnValue +=
lv_cmake(tempBuffer[2], tempBuffer[3]);
347 vst1q_f32(tempBuffer, out_vec1);
348 returnValue +=
lv_cmake(tempBuffer[0], tempBuffer[1]);
349 returnValue +=
lv_cmake(tempBuffer[2], tempBuffer[3]);
351 vst1q_f32(tempBuffer, out_vec2);
352 returnValue +=
lv_cmake(tempBuffer[0], tempBuffer[1]);
353 returnValue +=
lv_cmake(tempBuffer[2], tempBuffer[3]);
355 vst1q_f32(tempBuffer, out_vec3);
356 returnValue +=
lv_cmake(tempBuffer[0], tempBuffer[1]);
357 returnValue +=
lv_cmake(tempBuffer[2], tempBuffer[3]);
/* Scalar tail: the num_points % 8 samples not covered by the vector loop. */
359 number = eighthPoints * 8;
360 for (; number < num_points; number++) {
361 returnValue += (*aPtr++);
363 *result = returnValue;
/* NEONv8 accumulator: sums the input samples with four independent 128-bit
 * accumulators that are combined in registers rather than through a scratch
 * buffer.
 * NOTE(review): this extract is damaged. Each line carries a stray leading
 * number, and the second parameter, the opening brace, the declarations of
 * aPtr/in_vec/returnValue and all closing braces are not visible. */
370static inline void volk_32fc_accumulator_s32fc_neonv8(
lv_32fc_t* result,
372 unsigned int num_points)
375 unsigned int number = 0;
/* Four loads of 4 floats per iteration = 16 floats per pass. */
376 const unsigned int eighthPoints = num_points / 8;
380 float32x4_t out_vec0 = vdupq_n_f32(0.f);
381 float32x4_t out_vec1 = vdupq_n_f32(0.f);
382 float32x4_t out_vec2 = vdupq_n_f32(0.f);
383 float32x4_t out_vec3 = vdupq_n_f32(0.f);
/* Each load feeds its own accumulator, keeping the four add chains separate.
 * NOTE(review): no advance of aPtr is visible between the four loads;
 * presumably those lines were dropped from the extract. */
385 for (; number < eighthPoints; number++) {
386 in_vec = vld1q_f32((
float*)aPtr);
387 out_vec0 = vaddq_f32(in_vec, out_vec0);
390 in_vec = vld1q_f32((
float*)aPtr);
391 out_vec1 = vaddq_f32(in_vec, out_vec1);
394 in_vec = vld1q_f32((
float*)aPtr);
395 out_vec2 = vaddq_f32(in_vec, out_vec2);
398 in_vec = vld1q_f32((
float*)aPtr);
399 out_vec3 = vaddq_f32(in_vec, out_vec3);
/* Pairwise tree reduction of the four accumulators into out_vec0. */
404 out_vec0 = vaddq_f32(out_vec0, out_vec1);
405 out_vec2 = vaddq_f32(out_vec2, out_vec3);
406 out_vec0 = vaddq_f32(out_vec0, out_vec2);
/* Add the upper two lanes onto the lower two, leaving one 2-float total.
 * NOTE(review): the statement that moves `sum` into returnValue is not
 * visible here; presumably lane 0 is the real part and lane 1 the imaginary
 * part. */
410 float32x2_t low = vget_low_f32(out_vec0);
411 float32x2_t high = vget_high_f32(out_vec0);
412 float32x2_t sum = vadd_f32(low, high);
/* Scalar tail: the num_points % 8 samples not covered by the vector loop. */
417 for (number = eighthPoints * 8; number < num_points; number++) {
418 returnValue += (*aPtr++);
421 *result = returnValue;
427#include <riscv_vector.h>
/* RISC-V Vector accumulator: adds the input up as a flat float stream in an
 * LMUL=8 register group, then separates the low and high 32-bit halves of each
 * 64-bit element and reduces each half to one scalar.
 * NOTE(review): this extract is damaged. Each line carries a stray leading
 * number, and the second parameter, the opening brace, the definitions of
 * vr/vi and all closing braces are not visible. */
430static inline void volk_32fc_accumulator_s32fc_rvv(
lv_32fc_t* result,
432 unsigned int num_points)
/* Largest vector length available for 32-bit elements at LMUL=8. */
434 size_t vlmax = __riscv_vsetvlmax_e32m8();
/* Per-lane running sums, all lanes starting at zero. */
435 vfloat32m8_t vsum = __riscv_vfmv_v_f_f32m8(0, vlmax);
436 const float* in = (
const float*)inputBuffer;
/* Two floats are consumed per input point. */
437 size_t n = num_points * 2;
/* Strip-mined loop: vl is the number of floats handled in this pass. */
438 for (
size_t vl; n > 0; n -= vl, in += vl) {
439 vl = __riscv_vsetvl_e32m8(n < vlmax ? n : vlmax);
440 vfloat32m8_t v = __riscv_vle32_v_f32m8(in, vl);
/* Tail-undisturbed add: lanes at or beyond vl keep their earlier sums on a
 * short final pass. */
441 vsum = __riscv_vfadd_tu(vsum, vsum, v, vl);
/* View each pair of adjacent 32-bit lanes as one 64-bit element. */
443 vuint64m8_t vsumu = __riscv_vreinterpret_u64m8(__riscv_vreinterpret_u32m8(vsum));
/* Narrowing shift by 0 keeps the low 32 bits of each 64-bit element
 * (presumably the real parts); vlmax is still the e32m8 value here. */
444 vfloat32m4_t vsum1 = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vsumu, 0, vlmax));
/* Narrowing shift by 32 keeps the high 32 bits (presumably the imaginary
 * parts). */
445 vfloat32m4_t vsum2 = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vsumu, 32, vlmax));
/* From here on vlmax is the LMUL=1 vector length. */
446 vlmax = __riscv_vsetvlmax_e32m1();
/* Scalar zero in element 0, used as the initial value of both reductions. */
449 vfloat32m1_t z = __riscv_vfmv_s_f_f32m1(0, vlmax);
/* Reduce vr and vi with unordered sums and pack the two scalars with lv_cmake.
 * NOTE(review): vr and vi are not defined in the visible lines; presumably
 * they are LMUL=1 reductions of vsum1 and vsum2. */
450 *result =
lv_cmake(__riscv_vfmv_f(__riscv_vfredusum(vr, z, vlmax)),
451 __riscv_vfmv_f(__riscv_vfredusum(vi, z, vlmax)));