/*
 * Vector Optimized Library of Kernels (VOLK) 3.3.0
 * Architecture-tuned implementations of math kernels
 *
 * File: volk_32fc_accumulator_s32fc.h
 */
1/* -*- c++ -*- */
2/*
3 * Copyright 2019 Free Software Foundation, Inc.
4 *
5 * This file is part of VOLK
6 *
7 * SPDX-License-Identifier: LGPL-3.0-or-later
8 */
9
/*!
 * \page volk_32fc_accumulator_s32fc
 *
 * \b Overview
 *
 * Accumulates the values in the input complex buffer.
 *
 * <b>Dispatcher Prototype</b>
 * \code
 * void volk_32fc_accumulator_s32fc(lv_32fc_t* result,
 *                                  const lv_32fc_t* inputBuffer,
 *                                  unsigned int num_points)
 * \endcode
 */
51#ifndef INCLUDED_volk_32fc_accumulator_s32fc_a_H
52#define INCLUDED_volk_32fc_accumulator_s32fc_a_H
53
54#include <inttypes.h>
55#include <volk/volk_common.h>
56
57#ifdef LV_HAVE_AVX512F
58#include <immintrin.h>
59
60static inline void volk_32fc_accumulator_s32fc_a_avx512f(lv_32fc_t* result,
61 const lv_32fc_t* inputBuffer,
62 unsigned int num_points)
63{
64 lv_32fc_t returnValue = lv_cmake(0.f, 0.f);
65 unsigned int number = 0;
66 const unsigned int eighthPoints = num_points / 8;
67
68 const lv_32fc_t* aPtr = inputBuffer;
69 __VOLK_ATTR_ALIGNED(64) float tempBuffer[16];
70
71 __m512 accumulator = _mm512_setzero_ps();
72 __m512 aVal = _mm512_setzero_ps();
73
74 for (; number < eighthPoints; number++) {
75 aVal = _mm512_load_ps((float*)aPtr);
76 accumulator = _mm512_add_ps(accumulator, aVal);
77 aPtr += 8;
78 }
79
80 _mm512_store_ps(tempBuffer, accumulator);
81
82 // Sum pairs as complex numbers
83 returnValue = lv_cmake(tempBuffer[0], tempBuffer[1]);
84 returnValue += lv_cmake(tempBuffer[2], tempBuffer[3]);
85 returnValue += lv_cmake(tempBuffer[4], tempBuffer[5]);
86 returnValue += lv_cmake(tempBuffer[6], tempBuffer[7]);
87 returnValue += lv_cmake(tempBuffer[8], tempBuffer[9]);
88 returnValue += lv_cmake(tempBuffer[10], tempBuffer[11]);
89 returnValue += lv_cmake(tempBuffer[12], tempBuffer[13]);
90 returnValue += lv_cmake(tempBuffer[14], tempBuffer[15]);
91
92 number = eighthPoints * 8;
93 for (; number < num_points; number++) {
94 returnValue += (*aPtr++);
95 }
96 *result = returnValue;
97}
98#endif /* LV_HAVE_AVX512F */
99
100
101#ifdef LV_HAVE_AVX512F
102#include <immintrin.h>
103
104static inline void volk_32fc_accumulator_s32fc_u_avx512f(lv_32fc_t* result,
105 const lv_32fc_t* inputBuffer,
106 unsigned int num_points)
107{
108 lv_32fc_t returnValue = lv_cmake(0.f, 0.f);
109 unsigned int number = 0;
110 const unsigned int eighthPoints = num_points / 8;
111
112 const lv_32fc_t* aPtr = inputBuffer;
113 __VOLK_ATTR_ALIGNED(64) float tempBuffer[16];
114
115 __m512 accumulator = _mm512_setzero_ps();
116 __m512 aVal = _mm512_setzero_ps();
117
118 for (; number < eighthPoints; number++) {
119 aVal = _mm512_loadu_ps((float*)aPtr);
120 accumulator = _mm512_add_ps(accumulator, aVal);
121 aPtr += 8;
122 }
123
124 _mm512_store_ps(tempBuffer, accumulator);
125
126 // Sum pairs as complex numbers
127 returnValue = lv_cmake(tempBuffer[0], tempBuffer[1]);
128 returnValue += lv_cmake(tempBuffer[2], tempBuffer[3]);
129 returnValue += lv_cmake(tempBuffer[4], tempBuffer[5]);
130 returnValue += lv_cmake(tempBuffer[6], tempBuffer[7]);
131 returnValue += lv_cmake(tempBuffer[8], tempBuffer[9]);
132 returnValue += lv_cmake(tempBuffer[10], tempBuffer[11]);
133 returnValue += lv_cmake(tempBuffer[12], tempBuffer[13]);
134 returnValue += lv_cmake(tempBuffer[14], tempBuffer[15]);
135
136 number = eighthPoints * 8;
137 for (; number < num_points; number++) {
138 returnValue += (*aPtr++);
139 }
140 *result = returnValue;
141}
142#endif /* LV_HAVE_AVX512F */
143
144
145#ifdef LV_HAVE_GENERIC
147 const lv_32fc_t* inputBuffer,
148 unsigned int num_points)
149{
150 const lv_32fc_t* aPtr = inputBuffer;
151 unsigned int number = 0;
152 lv_32fc_t returnValue = lv_cmake(0.f, 0.f);
153
154 for (; number < num_points; number++) {
155 returnValue += (*aPtr++);
156 }
157 *result = returnValue;
158}
159#endif /* LV_HAVE_GENERIC */
160
161#ifdef LV_HAVE_AVX
162#include <immintrin.h>
163
165 const lv_32fc_t* inputBuffer,
166 unsigned int num_points)
167{
168 lv_32fc_t returnValue = lv_cmake(0.f, 0.f);
169 unsigned int number = 0;
170 const unsigned int quarterPoints = num_points / 4;
171
172 const lv_32fc_t* aPtr = inputBuffer;
173 __VOLK_ATTR_ALIGNED(32) float tempBuffer[8];
174
175 __m256 accumulator = _mm256_setzero_ps();
176 __m256 aVal = _mm256_setzero_ps();
177
178 for (; number < quarterPoints; number++) {
179 aVal = _mm256_loadu_ps((float*)aPtr);
180 accumulator = _mm256_add_ps(accumulator, aVal);
181 aPtr += 4;
182 }
183
184 _mm256_store_ps(tempBuffer, accumulator);
185
186 returnValue = lv_cmake(tempBuffer[0], tempBuffer[1]);
187 returnValue += lv_cmake(tempBuffer[2], tempBuffer[3]);
188 returnValue += lv_cmake(tempBuffer[4], tempBuffer[5]);
189 returnValue += lv_cmake(tempBuffer[6], tempBuffer[7]);
190
191 number = quarterPoints * 4;
192 for (; number < num_points; number++) {
193 returnValue += (*aPtr++);
194 }
195 *result = returnValue;
196}
197#endif /* LV_HAVE_AVX */
198
199#ifdef LV_HAVE_SSE
200#include <xmmintrin.h>
201
203 const lv_32fc_t* inputBuffer,
204 unsigned int num_points)
205{
206 lv_32fc_t returnValue = lv_cmake(0.f, 0.f);
207 unsigned int number = 0;
208 const unsigned int halfPoints = num_points / 2;
209
210 const lv_32fc_t* aPtr = inputBuffer;
211 __VOLK_ATTR_ALIGNED(16) float tempBuffer[4];
212
213 __m128 accumulator = _mm_setzero_ps();
214 __m128 aVal = _mm_setzero_ps();
215
216 for (; number < halfPoints; number++) {
217 aVal = _mm_loadu_ps((float*)aPtr);
218 accumulator = _mm_add_ps(accumulator, aVal);
219 aPtr += 2;
220 }
221
222 _mm_store_ps(tempBuffer, accumulator);
223
224 returnValue = lv_cmake(tempBuffer[0], tempBuffer[1]);
225 returnValue += lv_cmake(tempBuffer[2], tempBuffer[3]);
226
227 number = halfPoints * 2;
228 for (; number < num_points; number++) {
229 returnValue += (*aPtr++);
230 }
231 *result = returnValue;
232}
233#endif /* LV_HAVE_SSE */
234
235#ifdef LV_HAVE_AVX
236#include <immintrin.h>
237
239 const lv_32fc_t* inputBuffer,
240 unsigned int num_points)
241{
242 lv_32fc_t returnValue = lv_cmake(0.f, 0.f);
243 unsigned int number = 0;
244 const unsigned int quarterPoints = num_points / 4;
245
246 const lv_32fc_t* aPtr = inputBuffer;
247 __VOLK_ATTR_ALIGNED(32) float tempBuffer[8];
248
249 __m256 accumulator = _mm256_setzero_ps();
250 __m256 aVal = _mm256_setzero_ps();
251
252 for (; number < quarterPoints; number++) {
253 aVal = _mm256_load_ps((float*)aPtr);
254 accumulator = _mm256_add_ps(accumulator, aVal);
255 aPtr += 4;
256 }
257
258 _mm256_store_ps(tempBuffer, accumulator);
259
260 returnValue = lv_cmake(tempBuffer[0], tempBuffer[1]);
261 returnValue += lv_cmake(tempBuffer[2], tempBuffer[3]);
262 returnValue += lv_cmake(tempBuffer[4], tempBuffer[5]);
263 returnValue += lv_cmake(tempBuffer[6], tempBuffer[7]);
264
265 number = quarterPoints * 4;
266 for (; number < num_points; number++) {
267 returnValue += (*aPtr++);
268 }
269 *result = returnValue;
270}
271#endif /* LV_HAVE_AVX */
272
273#ifdef LV_HAVE_SSE
274#include <xmmintrin.h>
275
277 const lv_32fc_t* inputBuffer,
278 unsigned int num_points)
279{
280 lv_32fc_t returnValue = lv_cmake(0.f, 0.f);
281 unsigned int number = 0;
282 const unsigned int halfPoints = num_points / 2;
283
284 const lv_32fc_t* aPtr = inputBuffer;
285 __VOLK_ATTR_ALIGNED(16) float tempBuffer[4];
286
287 __m128 accumulator = _mm_setzero_ps();
288 __m128 aVal = _mm_setzero_ps();
289
290 for (; number < halfPoints; number++) {
291 aVal = _mm_load_ps((float*)aPtr);
292 accumulator = _mm_add_ps(accumulator, aVal);
293 aPtr += 2;
294 }
295
296 _mm_store_ps(tempBuffer, accumulator);
297
298 returnValue = lv_cmake(tempBuffer[0], tempBuffer[1]);
299 returnValue += lv_cmake(tempBuffer[2], tempBuffer[3]);
300
301 number = halfPoints * 2;
302 for (; number < num_points; number++) {
303 returnValue += (*aPtr++);
304 }
305 *result = returnValue;
306}
307#endif /* LV_HAVE_SSE */
308
309#ifdef LV_HAVE_NEON
310#include <arm_neon.h>
static inline void volk_32fc_accumulator_s32fc_neon(lv_32fc_t* result,
                                                    const lv_32fc_t* inputBuffer,
                                                    unsigned int num_points)
{
    /* Accumulate num_points complex floats into *result.
     * NEON variant: four independent accumulators process 8 complex values
     * (2 per 128-bit vector) each iteration to hide add latency; data stays
     * re/im interleaved throughout.
     * (Restores the signature line dropped by the doc extraction.) */
    const lv_32fc_t* aPtr = inputBuffer;
    unsigned int number = 0;
    lv_32fc_t returnValue = lv_cmake(0.f, 0.f);
    unsigned int eighthPoints = num_points / 8;
    float32x4_t in_vec;
    float32x4_t out_vec0 = { 0.f, 0.f, 0.f, 0.f };
    float32x4_t out_vec1 = { 0.f, 0.f, 0.f, 0.f };
    float32x4_t out_vec2 = { 0.f, 0.f, 0.f, 0.f };
    float32x4_t out_vec3 = { 0.f, 0.f, 0.f, 0.f };
    __VOLK_ATTR_ALIGNED(32) float tempBuffer[4];

    for (; number < eighthPoints; number++) {
        in_vec = vld1q_f32((float*)aPtr);
        out_vec0 = vaddq_f32(in_vec, out_vec0);
        aPtr += 2; /* 2 complex values = 4 floats per vector */

        in_vec = vld1q_f32((float*)aPtr);
        out_vec1 = vaddq_f32(in_vec, out_vec1);
        aPtr += 2;

        in_vec = vld1q_f32((float*)aPtr);
        out_vec2 = vaddq_f32(in_vec, out_vec2);
        aPtr += 2;

        in_vec = vld1q_f32((float*)aPtr);
        out_vec3 = vaddq_f32(in_vec, out_vec3);
        aPtr += 2;
    }

    /* Reduce each accumulator's two interleaved (re, im) pairs. */
    vst1q_f32(tempBuffer, out_vec0);
    returnValue = lv_cmake(tempBuffer[0], tempBuffer[1]);
    returnValue += lv_cmake(tempBuffer[2], tempBuffer[3]);

    vst1q_f32(tempBuffer, out_vec1);
    returnValue += lv_cmake(tempBuffer[0], tempBuffer[1]);
    returnValue += lv_cmake(tempBuffer[2], tempBuffer[3]);

    vst1q_f32(tempBuffer, out_vec2);
    returnValue += lv_cmake(tempBuffer[0], tempBuffer[1]);
    returnValue += lv_cmake(tempBuffer[2], tempBuffer[3]);

    vst1q_f32(tempBuffer, out_vec3);
    returnValue += lv_cmake(tempBuffer[0], tempBuffer[1]);
    returnValue += lv_cmake(tempBuffer[2], tempBuffer[3]);

    /* Scalar tail for the remaining num_points % 8 values. */
    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        returnValue += (*aPtr++);
    }
    *result = returnValue;
}
365#endif /* LV_HAVE_NEON */
366
367#ifdef LV_HAVE_NEONV8
368#include <arm_neon.h>
369
static inline void volk_32fc_accumulator_s32fc_neonv8(lv_32fc_t* result,
                                                      const lv_32fc_t* inputBuffer,
                                                      unsigned int num_points)
{
    /* Accumulate num_points complex floats into *result (ARMv8 NEON). */
    const lv_32fc_t* inPtr = inputBuffer;
    const unsigned int vecIters = num_points / 8;
    unsigned int i;

    /* Four independent accumulators hide add latency; data is kept
     * interleaved like the neon version (vld1q is faster than vld2q). */
    float32x4_t acc0 = vdupq_n_f32(0.f);
    float32x4_t acc1 = vdupq_n_f32(0.f);
    float32x4_t acc2 = vdupq_n_f32(0.f);
    float32x4_t acc3 = vdupq_n_f32(0.f);

    for (i = 0; i < vecIters; i++) {
        acc0 = vaddq_f32(vld1q_f32((float*)inPtr), acc0);
        inPtr += 2; /* 2 complex values = 4 floats per vector */
        acc1 = vaddq_f32(vld1q_f32((float*)inPtr), acc1);
        inPtr += 2;
        acc2 = vaddq_f32(vld1q_f32((float*)inPtr), acc2);
        inPtr += 2;
        acc3 = vaddq_f32(vld1q_f32((float*)inPtr), acc3);
        inPtr += 2;
    }

    /* Tree-combine the four accumulators. */
    acc0 = vaddq_f32(acc0, acc1);
    acc2 = vaddq_f32(acc2, acc3);
    acc0 = vaddq_f32(acc0, acc2);

    /* Horizontal reduction: acc0 = [r0, i0, r1, i1]; we need
     * real = r0 + r1 and imag = i0 + i1. */
    float32x2_t lowHalf = vget_low_f32(acc0);   /* [r0, i0] */
    float32x2_t highHalf = vget_high_f32(acc0); /* [r1, i1] */
    float32x2_t pairSum = vadd_f32(lowHalf, highHalf);

    lv_32fc_t acc = lv_cmake(vget_lane_f32(pairSum, 0), vget_lane_f32(pairSum, 1));

    /* Scalar tail for the remaining num_points % 8 values. */
    for (i = vecIters * 8; i < num_points; i++) {
        acc += (*inPtr++);
    }

    *result = acc;
}
423
424#endif /* LV_HAVE_NEONV8 */
425
426#ifdef LV_HAVE_RVV
427#include <riscv_vector.h>
429
static inline void volk_32fc_accumulator_s32fc_rvv(lv_32fc_t* result,
                                                   const lv_32fc_t* inputBuffer,
                                                   unsigned int num_points)
{
    /* Accumulate num_points complex floats into *result (RISC-V Vector).
     * Strategy: sum the interleaved re/im float stream lane-wise into an
     * e32m8 accumulator, then deinterleave real and imaginary lanes and
     * reduce each to a scalar. */
    size_t vlmax = __riscv_vsetvlmax_e32m8();
    vfloat32m8_t vsum = __riscv_vfmv_v_f_f32m8(0, vlmax);
    const float* in = (const float*)inputBuffer;
    size_t n = num_points * 2; /* total scalar floats: one re + one im per point */
    for (size_t vl; n > 0; n -= vl, in += vl) {
        vl = __riscv_vsetvl_e32m8(n < vlmax ? n : vlmax); /* force exact vl */
        vfloat32m8_t v = __riscv_vle32_v_f32m8(in, vl);
        /* Tail-undisturbed add: lanes at and beyond vl keep their previous
         * partial sums in vsum on the final short iteration. */
        vsum = __riscv_vfadd_tu(vsum, vsum, v, vl);
    }
    /* Reinterpret as u64 so each element holds one packed (re, im) pair;
     * narrowing shifts by 0 and 32 split out the low (real) and high
     * (imaginary) 32-bit halves into separate m4 vectors. */
    vuint64m8_t vsumu = __riscv_vreinterpret_u64m8(__riscv_vreinterpret_u32m8(vsum));
    vfloat32m4_t vsum1 = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vsumu, 0, vlmax));
    vfloat32m4_t vsum2 = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vsumu, 32, vlmax));
    vlmax = __riscv_vsetvlmax_e32m1();
    /* RISCV_SHRINK4 (VOLK helper macro) folds each m4 vector down to m1
     * with vfadd before the final reduction. */
    vfloat32m1_t vr = RISCV_SHRINK4(vfadd, f, 32, vsum1);
    vfloat32m1_t vi = RISCV_SHRINK4(vfadd, f, 32, vsum2);
    vfloat32m1_t z = __riscv_vfmv_s_f_f32m1(0, vlmax);
    /* Unordered single-width reductions produce the real and imaginary sums. */
    *result = lv_cmake(__riscv_vfmv_f(__riscv_vfredusum(vr, z, vlmax)),
                       __riscv_vfmv_f(__riscv_vfredusum(vi, z, vlmax)));
}
453#endif /*LV_HAVE_RVV*/
454
455#endif /* INCLUDED_volk_32fc_accumulator_s32fc_a_H */