71#ifndef INCLUDED_volk_32f_x3_sum_of_poly_32f_a_H
72#define INCLUDED_volk_32f_x3_sum_of_poly_32f_a_H
79#define MAX(X, Y) ((X) > (Y) ? (X) : (Y))
/*
 * Computes sum_i p(max(src0[i], *cutoff)) + num_points * c4, where
 * p(x) = c0*x + c1*x^2 + c2*x^3 + c3*x^4 and c = center_point_array[0..4].
 * Aligned SSE3 implementation: src0 must be 16-byte aligned.
 *
 * target             - output: the accumulated sum (single float)
 * src0               - input vector, 16-byte aligned
 * center_point_array - the five polynomial coefficients c0..c4
 * cutoff             - lower clamp applied to every input sample
 * num_points         - number of input samples
 */
static inline void volk_32f_x3_sum_of_poly_32f_a_sse3(float* target,
                                                      float* src0,
                                                      float* center_point_array,
                                                      float* cutoff,
                                                      unsigned int num_points)
{
    float result = 0.0f;
    float fst = 0.0f;
    float sq = 0.0f;
    float thrd = 0.0f;
    float frth = 0.0f;

    __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;

    /* Two independent accumulators (xmm9, xmm1) hide the add latency of the
     * 2x-unrolled loop below. */
    xmm9 = _mm_setzero_ps();
    xmm1 = _mm_setzero_ps();
    /* Broadcast the four polynomial coefficients and the cutoff. */
    xmm0 = _mm_load1_ps(&center_point_array[0]);
    xmm6 = _mm_load1_ps(&center_point_array[1]);
    xmm7 = _mm_load1_ps(&center_point_array[2]);
    xmm8 = _mm_load1_ps(&center_point_array[3]);
    xmm10 = _mm_load1_ps(cutoff);

    int bound = num_points / 8;
    int leftovers = num_points - 8 * bound;
    int i = 0;

    for (; i < bound; ++i) {
        /* First group of 4 samples -> accumulator xmm9. */
        xmm2 = _mm_load_ps(src0);
        xmm2 = _mm_max_ps(xmm10, xmm2); /* clamp to cutoff */
        xmm3 = _mm_mul_ps(xmm2, xmm2);  /* x^2 */
        xmm4 = _mm_mul_ps(xmm2, xmm3);  /* x^3 */
        xmm5 = _mm_mul_ps(xmm3, xmm3);  /* x^4 */

        xmm2 = _mm_mul_ps(xmm2, xmm0);
        xmm3 = _mm_mul_ps(xmm3, xmm6);
        xmm4 = _mm_mul_ps(xmm4, xmm7);
        xmm5 = _mm_mul_ps(xmm5, xmm8);

        xmm2 = _mm_add_ps(xmm2, xmm3);
        xmm3 = _mm_add_ps(xmm4, xmm5);

        src0 += 4;

        xmm9 = _mm_add_ps(xmm2, xmm9);
        xmm9 = _mm_add_ps(xmm3, xmm9);

        /* Second group of 4 samples -> accumulator xmm1. */
        xmm2 = _mm_load_ps(src0);
        xmm2 = _mm_max_ps(xmm10, xmm2);
        xmm3 = _mm_mul_ps(xmm2, xmm2);
        xmm4 = _mm_mul_ps(xmm2, xmm3);
        xmm5 = _mm_mul_ps(xmm3, xmm3);

        xmm2 = _mm_mul_ps(xmm2, xmm0);
        xmm3 = _mm_mul_ps(xmm3, xmm6);
        xmm4 = _mm_mul_ps(xmm4, xmm7);
        xmm5 = _mm_mul_ps(xmm5, xmm8);

        xmm2 = _mm_add_ps(xmm2, xmm3);
        xmm3 = _mm_add_ps(xmm4, xmm5);

        src0 += 4;

        xmm1 = _mm_add_ps(xmm2, xmm1);
        xmm1 = _mm_add_ps(xmm3, xmm1);
    }

    /* Horizontal reduction of both accumulators to one float. */
    xmm2 = _mm_hadd_ps(xmm9, xmm1);
    xmm3 = _mm_hadd_ps(xmm2, xmm2);
    xmm4 = _mm_hadd_ps(xmm3, xmm3);

    _mm_store_ss(&result, xmm4);

    /* Scalar tail for the remaining num_points % 8 samples. */
    for (i = 0; i < leftovers; ++i) {
        fst = *src0++;
        fst = MAX(fst, *cutoff);
        sq = fst * fst;
        thrd = fst * sq;
        frth = sq * sq;
        result += (center_point_array[0] * fst + center_point_array[1] * sq +
                   center_point_array[2] * thrd + center_point_array[3] * frth);
    }

    /* The constant term c4 contributes once per input point. */
    result += (float)(num_points)*center_point_array[4];

    *target = result;
}
174#if LV_HAVE_AVX && LV_HAVE_FMA
175#include <immintrin.h>
/*
 * Computes sum_i p(max(src0[i], *cutoff)) + num_points * c4, where
 * p(x) = c0*x + c1*x^2 + c2*x^3 + c3*x^4 and c = center_point_array[0..4].
 * Aligned AVX2+FMA implementation: src0 must be 32-byte aligned.
 */
static inline void volk_32f_x3_sum_of_poly_32f_a_avx2_fma(float* target,
                                                          float* src0,
                                                          float* center_point_array,
                                                          float* cutoff,
                                                          unsigned int num_points)
{
    const unsigned int eighth_points = num_points / 8;
    float fst = 0.0f;
    float sq = 0.0f;
    float thrd = 0.0f;
    float frth = 0.0f;

    __m256 cpa0, cpa1, cpa2, cpa3, cutoff_vec;
    __m256 target_vec;
    __m256 x_to_1, x_to_2, x_to_3, x_to_4;

    cpa0 = _mm256_set1_ps(center_point_array[0]);
    cpa1 = _mm256_set1_ps(center_point_array[1]);
    cpa2 = _mm256_set1_ps(center_point_array[2]);
    cpa3 = _mm256_set1_ps(center_point_array[3]);
    cutoff_vec = _mm256_set1_ps(*cutoff);
    target_vec = _mm256_setzero_ps();

    unsigned int i;
    for (i = 0; i < eighth_points; ++i) {
        x_to_1 = _mm256_load_ps(src0);
        x_to_1 = _mm256_max_ps(x_to_1, cutoff_vec); /* clamp to cutoff */
        x_to_2 = _mm256_mul_ps(x_to_1, x_to_1);     /* x^2 */
        x_to_3 = _mm256_mul_ps(x_to_1, x_to_2);     /* x^3 */
        x_to_4 = _mm256_mul_ps(x_to_1, x_to_3);     /* x^4 */

        x_to_2 = _mm256_mul_ps(x_to_2, cpa1); /* c1*x^2 */
        x_to_4 = _mm256_mul_ps(x_to_4, cpa3); /* c3*x^4 */

        /* FMA folds the odd-power multiply into the pairwise add. */
        x_to_1 = _mm256_fmadd_ps(x_to_1, cpa0, x_to_2); /* c0*x   + c1*x^2 */
        x_to_3 = _mm256_fmadd_ps(x_to_3, cpa2, x_to_4); /* c2*x^3 + c3*x^4 */

        target_vec = _mm256_add_ps(x_to_1, target_vec);
        target_vec = _mm256_add_ps(x_to_3, target_vec);

        src0 += 8;
    }

    /* Horizontal reduction: after hadd, lanes 0,1,4,5 hold the four distinct
     * pairwise sums.  Unaligned store keeps this block free of alignment
     * attributes on the scratch array. */
    float temp_results[8];
    target_vec = _mm256_hadd_ps(target_vec, target_vec);
    _mm256_storeu_ps(temp_results, target_vec);
    *target = temp_results[0] + temp_results[1] + temp_results[4] + temp_results[5];

    /* Scalar tail for the remaining num_points % 8 samples. */
    for (i = eighth_points * 8; i < num_points; ++i) {
        fst = *src0++;
        fst = MAX(fst, *cutoff);
        sq = fst * fst;
        thrd = fst * sq;
        frth = sq * sq;
        *target += (center_point_array[0] * fst + center_point_array[1] * sq +
                    center_point_array[2] * thrd + center_point_array[3] * frth);
    }

    /* The constant term c4 contributes once per input point. */
    *target += (float)(num_points)*center_point_array[4];
}
244#include <immintrin.h>
/*
 * Computes sum_i p(max(src0[i], *cutoff)) + num_points * c4, where
 * p(x) = c0*x + c1*x^2 + c2*x^3 + c3*x^4 and c = center_point_array[0..4].
 * Aligned AVX implementation (no FMA): src0 must be 32-byte aligned.
 */
static inline void volk_32f_x3_sum_of_poly_32f_a_avx(float* target,
                                                     float* src0,
                                                     float* center_point_array,
                                                     float* cutoff,
                                                     unsigned int num_points)
{
    const unsigned int eighth_points = num_points / 8;
    float fst = 0.0f;
    float sq = 0.0f;
    float thrd = 0.0f;
    float frth = 0.0f;

    __m256 cpa0, cpa1, cpa2, cpa3, cutoff_vec;
    __m256 target_vec;
    __m256 x_to_1, x_to_2, x_to_3, x_to_4;

    cpa0 = _mm256_set1_ps(center_point_array[0]);
    cpa1 = _mm256_set1_ps(center_point_array[1]);
    cpa2 = _mm256_set1_ps(center_point_array[2]);
    cpa3 = _mm256_set1_ps(center_point_array[3]);
    cutoff_vec = _mm256_set1_ps(*cutoff);
    target_vec = _mm256_setzero_ps();

    unsigned int i;
    for (i = 0; i < eighth_points; ++i) {
        x_to_1 = _mm256_load_ps(src0);
        x_to_1 = _mm256_max_ps(x_to_1, cutoff_vec); /* clamp to cutoff */
        x_to_2 = _mm256_mul_ps(x_to_1, x_to_1);     /* x^2 */
        x_to_3 = _mm256_mul_ps(x_to_1, x_to_2);     /* x^3 */
        x_to_4 = _mm256_mul_ps(x_to_1, x_to_3);     /* x^4 */

        x_to_1 = _mm256_mul_ps(x_to_1, cpa0); /* c0*x   */
        x_to_2 = _mm256_mul_ps(x_to_2, cpa1); /* c1*x^2 */
        x_to_3 = _mm256_mul_ps(x_to_3, cpa2); /* c2*x^3 */
        x_to_4 = _mm256_mul_ps(x_to_4, cpa3); /* c3*x^4 */

        x_to_1 = _mm256_add_ps(x_to_1, x_to_2);
        x_to_3 = _mm256_add_ps(x_to_3, x_to_4);

        target_vec = _mm256_add_ps(x_to_1, target_vec);
        target_vec = _mm256_add_ps(x_to_3, target_vec);

        src0 += 8;
    }

    /* Horizontal reduction: after hadd, lanes 0,1,4,5 hold the four distinct
     * pairwise sums.  Unaligned store keeps this block free of alignment
     * attributes on the scratch array. */
    float temp_results[8];
    target_vec = _mm256_hadd_ps(target_vec, target_vec);
    _mm256_storeu_ps(temp_results, target_vec);
    *target = temp_results[0] + temp_results[1] + temp_results[4] + temp_results[5];

    /* Scalar tail for the remaining num_points % 8 samples. */
    for (i = eighth_points * 8; i < num_points; ++i) {
        fst = *src0++;
        fst = MAX(fst, *cutoff);
        sq = fst * fst;
        thrd = fst * sq;
        frth = sq * sq;
        *target += (center_point_array[0] * fst + center_point_array[1] * sq +
                    center_point_array[2] * thrd + center_point_array[3] * frth);
    }

    /* The constant term c4 contributes once per input point. */
    *target += (float)(num_points)*center_point_array[4];
}
315#ifdef LV_HAVE_GENERIC
/*
 * Computes sum_i p(max(src0[i], *cutoff)) + num_points * c4, where
 * p(x) = c0*x + c1*x^2 + c2*x^3 + c3*x^4 and c = center_point_array[0..4].
 * Portable scalar reference implementation.
 *
 * target             - output: the accumulated sum (single float)
 * src0               - input vector
 * center_point_array - the five polynomial coefficients c0..c4
 * cutoff             - lower clamp applied to every input sample
 * num_points         - number of input samples
 */
static inline void volk_32f_x3_sum_of_poly_32f_generic(float* target,
                                                       float* src0,
                                                       float* center_point_array,
                                                       float* cutoff,
                                                       unsigned int num_points)
{
    const unsigned int eighth_points = num_points / 8;

    /* Eight partial accumulators mirror the lane-wise accumulation order of
     * the SIMD kernels and shorten the float dependency chain. */
    float result[8] = { 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f };
    float fst = 0.0f;
    float sq = 0.0f;
    float thrd = 0.0f;
    float frth = 0.0f;
    unsigned int i = 0;
    unsigned int k = 0;

    for (i = 0; i < eighth_points; ++i) {
        for (k = 0; k < 8; ++k) {
            fst = *src0++;
            fst = (fst > *cutoff) ? fst : *cutoff; /* clamp to cutoff */
            sq = fst * fst;
            thrd = fst * sq;
            frth = sq * sq;
            result[k] += center_point_array[0] * fst + center_point_array[1] * sq;
            result[k] += center_point_array[2] * thrd + center_point_array[3] * frth;
        }
    }

    /* Pairwise reduction of the eight partial sums. */
    for (k = 0; k < 8; k += 2) {
        result[k] = result[k] + result[k + 1];
    }

    *target = result[0] + result[2] + result[4] + result[6];

    /* Scalar tail for the remaining num_points % 8 samples. */
    for (i = eighth_points * 8; i < num_points; ++i) {
        fst = *src0++;
        fst = (fst > *cutoff) ? fst : *cutoff;
        sq = fst * fst;
        thrd = fst * sq;
        frth = sq * sq;
        *target += (center_point_array[0] * fst + center_point_array[1] * sq +
                    center_point_array[2] * thrd + center_point_array[3] * frth);
    }

    /* The constant term c4 contributes once per input point. */
    *target += (float)(num_points)*center_point_array[4];
}
/*
 * Computes sum_i p(max(src0[i], *cutoff)) + num_points * c4, where
 * p(x) = c0*x + c1*x^2 + c2*x^3 + c3*x^4 and c = center_point_array[0..4].
 * NEON implementation; the __restrict qualifiers promise the buffers do
 * not alias each other.
 */
static inline void
volk_32f_x3_sum_of_poly_32f_a_neon(float* __restrict target,
                                   float* __restrict src0,
                                   float* __restrict center_point_array,
                                   float* __restrict cutoff,
                                   unsigned int num_points)
{
    unsigned int i;
    float zero[4] = { 0.0f, 0.0f, 0.0f, 0.0f };

    /* Four independent accumulators (one per polynomial term) hide the
     * vector-add latency; they are merged after the loop. */
    float32x4_t accumulator1_vec, accumulator2_vec, accumulator3_vec, accumulator4_vec;
    accumulator1_vec = vld1q_f32(zero);
    accumulator2_vec = vld1q_f32(zero);
    accumulator3_vec = vld1q_f32(zero);
    accumulator4_vec = vld1q_f32(zero);
    float32x4_t x_to_1, x_to_2, x_to_3, x_to_4;
    float32x4_t cutoff_vector, cpa_0, cpa_1, cpa_2, cpa_3;

    /* Broadcast the cutoff and the four polynomial coefficients. */
    cutoff_vector = vdupq_n_f32(*cutoff);
    cpa_0 = vdupq_n_f32(center_point_array[0]);
    cpa_1 = vdupq_n_f32(center_point_array[1]);
    cpa_2 = vdupq_n_f32(center_point_array[2]);
    cpa_3 = vdupq_n_f32(center_point_array[3]);

    for (i = 0; i < num_points / 4; ++i) {
        x_to_1 = vld1q_f32(src0);
        src0 += 4;
        x_to_1 = vmaxq_f32(x_to_1, cutoff_vector); /* clamp to cutoff */
        x_to_2 = vmulq_f32(x_to_1, x_to_1);        /* x^2 */
        x_to_3 = vmulq_f32(x_to_2, x_to_1);        /* x^3 */
        x_to_4 = vmulq_f32(x_to_3, x_to_1);        /* x^4 */
        x_to_1 = vmulq_f32(x_to_1, cpa_0);
        x_to_2 = vmulq_f32(x_to_2, cpa_1);
        x_to_3 = vmulq_f32(x_to_3, cpa_2);
        x_to_4 = vmulq_f32(x_to_4, cpa_3);
        accumulator1_vec = vaddq_f32(accumulator1_vec, x_to_1);
        accumulator2_vec = vaddq_f32(accumulator2_vec, x_to_2);
        accumulator3_vec = vaddq_f32(accumulator3_vec, x_to_3);
        accumulator4_vec = vaddq_f32(accumulator4_vec, x_to_4);
    }

    /* Merge the four accumulators, then reduce the lanes to one float. */
    accumulator1_vec = vaddq_f32(accumulator1_vec, accumulator2_vec);
    accumulator3_vec = vaddq_f32(accumulator3_vec, accumulator4_vec);
    accumulator1_vec = vaddq_f32(accumulator1_vec, accumulator3_vec);

    float res_accumulators[4];
    vst1q_f32(res_accumulators, accumulator1_vec);
    float accumulator = res_accumulators[0] + res_accumulators[1] + res_accumulators[2] +
                        res_accumulators[3];

    /* Scalar tail for the remaining num_points % 4 samples. */
    float fst = 0.0f;
    float sq = 0.0f;
    float thrd = 0.0f;
    float frth = 0.0f;
    for (i = 4 * (num_points / 4); i < num_points; ++i) {
        fst = *src0++;
        fst = MAX(fst, *cutoff);
        sq = fst * fst;
        thrd = fst * sq;
        frth = sq * sq;
        accumulator += (center_point_array[0] * fst + center_point_array[1] * sq +
                        center_point_array[2] * thrd + center_point_array[3] * frth);
    }

    /* The constant term c4 contributes once per input point. */
    *target = accumulator + (float)num_points * center_point_array[4];
}
448#ifndef INCLUDED_volk_32f_x3_sum_of_poly_32f_u_H
449#define INCLUDED_volk_32f_x3_sum_of_poly_32f_u_H
456#define MAX(X, Y) ((X) > (Y) ? (X) : (Y))
459#if LV_HAVE_AVX && LV_HAVE_FMA
460#include <immintrin.h>
/*
 * Computes sum_i p(max(src0[i], *cutoff)) + num_points * c4, where
 * p(x) = c0*x + c1*x^2 + c2*x^3 + c3*x^4 and c = center_point_array[0..4].
 * Unaligned AVX+FMA implementation: src0 may have any alignment.
 */
static inline void volk_32f_x3_sum_of_poly_32f_u_avx_fma(float* target,
                                                         float* src0,
                                                         float* center_point_array,
                                                         float* cutoff,
                                                         unsigned int num_points)
{
    const unsigned int eighth_points = num_points / 8;
    float fst = 0.0f;
    float sq = 0.0f;
    float thrd = 0.0f;
    float frth = 0.0f;

    __m256 cpa0, cpa1, cpa2, cpa3, cutoff_vec;
    __m256 target_vec;
    __m256 x_to_1, x_to_2, x_to_3, x_to_4;

    cpa0 = _mm256_set1_ps(center_point_array[0]);
    cpa1 = _mm256_set1_ps(center_point_array[1]);
    cpa2 = _mm256_set1_ps(center_point_array[2]);
    cpa3 = _mm256_set1_ps(center_point_array[3]);
    cutoff_vec = _mm256_set1_ps(*cutoff);
    target_vec = _mm256_setzero_ps();

    unsigned int i;
    for (i = 0; i < eighth_points; ++i) {
        x_to_1 = _mm256_loadu_ps(src0);
        x_to_1 = _mm256_max_ps(x_to_1, cutoff_vec); /* clamp to cutoff */
        x_to_2 = _mm256_mul_ps(x_to_1, x_to_1);     /* x^2 */
        x_to_3 = _mm256_mul_ps(x_to_1, x_to_2);     /* x^3 */
        x_to_4 = _mm256_mul_ps(x_to_1, x_to_3);     /* x^4 */

        x_to_2 = _mm256_mul_ps(x_to_2, cpa1); /* c1*x^2 */
        x_to_4 = _mm256_mul_ps(x_to_4, cpa3); /* c3*x^4 */

        /* FMA folds the odd-power multiply into the pairwise add. */
        x_to_1 = _mm256_fmadd_ps(x_to_1, cpa0, x_to_2); /* c0*x   + c1*x^2 */
        x_to_3 = _mm256_fmadd_ps(x_to_3, cpa2, x_to_4); /* c2*x^3 + c3*x^4 */

        target_vec = _mm256_add_ps(x_to_1, target_vec);
        target_vec = _mm256_add_ps(x_to_3, target_vec);

        src0 += 8;
    }

    /* Horizontal reduction: after hadd, lanes 0,1,4,5 hold the four distinct
     * pairwise sums. */
    float temp_results[8];
    target_vec = _mm256_hadd_ps(target_vec, target_vec);
    _mm256_storeu_ps(temp_results, target_vec);
    *target = temp_results[0] + temp_results[1] + temp_results[4] + temp_results[5];

    /* Scalar tail for the remaining num_points % 8 samples. */
    for (i = eighth_points * 8; i < num_points; ++i) {
        fst = *src0++;
        fst = MAX(fst, *cutoff);
        sq = fst * fst;
        thrd = fst * sq;
        frth = sq * sq;
        *target += (center_point_array[0] * fst + center_point_array[1] * sq +
                    center_point_array[2] * thrd + center_point_array[3] * frth);
    }

    /* The constant term c4 contributes once per input point. */
    *target += (float)(num_points)*center_point_array[4];
}
530#include <immintrin.h>
/*
 * Computes sum_i p(max(src0[i], *cutoff)) + num_points * c4, where
 * p(x) = c0*x + c1*x^2 + c2*x^3 + c3*x^4 and c = center_point_array[0..4].
 * Unaligned AVX implementation (no FMA): src0 may have any alignment.
 */
static inline void volk_32f_x3_sum_of_poly_32f_u_avx(float* target,
                                                     float* src0,
                                                     float* center_point_array,
                                                     float* cutoff,
                                                     unsigned int num_points)
{
    const unsigned int eighth_points = num_points / 8;
    float fst = 0.0f;
    float sq = 0.0f;
    float thrd = 0.0f;
    float frth = 0.0f;

    __m256 cpa0, cpa1, cpa2, cpa3, cutoff_vec;
    __m256 target_vec;
    __m256 x_to_1, x_to_2, x_to_3, x_to_4;

    cpa0 = _mm256_set1_ps(center_point_array[0]);
    cpa1 = _mm256_set1_ps(center_point_array[1]);
    cpa2 = _mm256_set1_ps(center_point_array[2]);
    cpa3 = _mm256_set1_ps(center_point_array[3]);
    cutoff_vec = _mm256_set1_ps(*cutoff);
    target_vec = _mm256_setzero_ps();

    unsigned int i;
    for (i = 0; i < eighth_points; ++i) {
        x_to_1 = _mm256_loadu_ps(src0);
        x_to_1 = _mm256_max_ps(x_to_1, cutoff_vec); /* clamp to cutoff */
        x_to_2 = _mm256_mul_ps(x_to_1, x_to_1);     /* x^2 */
        x_to_3 = _mm256_mul_ps(x_to_1, x_to_2);     /* x^3 */
        x_to_4 = _mm256_mul_ps(x_to_1, x_to_3);     /* x^4 */

        x_to_1 = _mm256_mul_ps(x_to_1, cpa0); /* c0*x   */
        x_to_2 = _mm256_mul_ps(x_to_2, cpa1); /* c1*x^2 */
        x_to_3 = _mm256_mul_ps(x_to_3, cpa2); /* c2*x^3 */
        x_to_4 = _mm256_mul_ps(x_to_4, cpa3); /* c3*x^4 */

        x_to_1 = _mm256_add_ps(x_to_1, x_to_2);
        x_to_3 = _mm256_add_ps(x_to_3, x_to_4);

        target_vec = _mm256_add_ps(x_to_1, target_vec);
        target_vec = _mm256_add_ps(x_to_3, target_vec);

        src0 += 8;
    }

    /* Horizontal reduction: after hadd, lanes 0,1,4,5 hold the four distinct
     * pairwise sums. */
    float temp_results[8];
    target_vec = _mm256_hadd_ps(target_vec, target_vec);
    _mm256_storeu_ps(temp_results, target_vec);
    *target = temp_results[0] + temp_results[1] + temp_results[4] + temp_results[5];

    /* Scalar tail for the remaining num_points % 8 samples. */
    for (i = eighth_points * 8; i < num_points; ++i) {
        fst = *src0++;
        fst = MAX(fst, *cutoff);
        sq = fst * fst;
        thrd = fst * sq;
        frth = sq * sq;
        *target += (center_point_array[0] * fst + center_point_array[1] * sq +
                    center_point_array[2] * thrd + center_point_array[3] * frth);
    }

    /* The constant term c4 contributes once per input point. */
    *target += (float)(num_points)*center_point_array[4];
}
603#include <riscv_vector.h>
/*
 * Computes sum_i p(max(src0[i], *cutoff)) + num_points * c4, where
 * p(x) = c0*x + c1*x^2 + c2*x^3 + c3*x^4 and c = center_point_array[0..4].
 * RISC-V Vector (RVV) implementation using LMUL=4 and a tail-undisturbed
 * accumulator so partial final strips do not pollute inactive lanes.
 */
static inline void volk_32f_x3_sum_of_poly_32f_rvv(float* target,
                                                   float* src0,
                                                   float* center_point_array,
                                                   float* cutoff,
                                                   unsigned int num_points)
{
    size_t vlmax = __riscv_vsetvlmax_e32m4();
    vfloat32m4_t vsum = __riscv_vfmv_v_f_f32m4(0, vlmax);
    /* c0 and c1 stay scalar (used via vfmadd/vfmul scalar forms); c2, c3 and
     * the cutoff are broadcast once outside the loop. */
    float mul1 = center_point_array[0];
    float mul2 = center_point_array[1];
    vfloat32m4_t vmul3 = __riscv_vfmv_v_f_f32m4(center_point_array[2], vlmax);
    vfloat32m4_t vmul4 = __riscv_vfmv_v_f_f32m4(center_point_array[3], vlmax);
    vfloat32m4_t vmax = __riscv_vfmv_v_f_f32m4(*cutoff, vlmax);

    size_t n = num_points;
    for (size_t vl; n > 0; n -= vl, src0 += vl) {
        vl = __riscv_vsetvl_e32m4(n);
        vfloat32m4_t v = __riscv_vle32_v_f32m4(src0, vl);
        vfloat32m4_t v1 = __riscv_vfmax(v, vmax, vl);  /* clamp to cutoff */
        vfloat32m4_t v2 = __riscv_vfmul(v1, v1, vl);   /* x^2 */
        vfloat32m4_t v3 = __riscv_vfmul(v1, v2, vl);   /* x^3 */
        vfloat32m4_t v4 = __riscv_vfmul(v2, v2, vl);   /* x^4 */
        v2 = __riscv_vfmul(v2, mul2, vl);              /* c1*x^2 */
        v4 = __riscv_vfmul(v4, vmul4, vl);             /* c3*x^4 */
        v1 = __riscv_vfmadd(v1, mul1, v2, vl);         /* c0*x   + c1*x^2 */
        v3 = __riscv_vfmadd(v3, vmul3, v4, vl);        /* c2*x^3 + c3*x^4 */
        v1 = __riscv_vfadd(v1, v3, vl);
        /* Tail-undisturbed add: lanes past vl keep their previous sums. */
        vsum = __riscv_vfadd_tu(vsum, vsum, v1, vl);
    }

    /* Unordered reduction of the whole m4 accumulator down to one float. */
    vfloat32m1_t z = __riscv_vfmv_s_f_f32m1(0, __riscv_vsetvlmax_e32m1());
    float sum = __riscv_vfmv_f(__riscv_vfredusum(vsum, z, vlmax));

    /* The constant term c4 contributes once per input point. */
    *target = sum + (float)num_points * center_point_array[4];
}