#ifndef INCLUDED_volk_32fc_x2_dot_prod_32fc_u_H
#define INCLUDED_volk_32fc_x2_dot_prod_32fc_u_H

#include <stdio.h>
#include <string.h>
#include <volk/volk_common.h>
#include <volk/volk_complex.h>
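/*!
 * \b Overview
 *
 * Computes the dot product (without conjugation) of \p input and \p taps:
 *
 *   result = sum_{i=0}^{num_points-1} input[i] * taps[i]
 *
 * A minimal usage sketch, assuming the standard VOLK dispatcher and
 * allocation helpers (volk_32fc_x2_dot_prod_32fc, volk_malloc, volk_free,
 * volk_get_alignment):
 *
 * \code
 *   unsigned int N = 1024;
 *   size_t alignment = volk_get_alignment();
 *   lv_32fc_t* in = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t) * N, alignment);
 *   lv_32fc_t* taps = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t) * N, alignment);
 *   lv_32fc_t result;
 *   // ... fill in[] and taps[] ...
 *   volk_32fc_x2_dot_prod_32fc(&result, in, taps, N);
 *   volk_free(in);
 *   volk_free(taps);
 * \endcode
 */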
extern void volk_32fc_x2_dot_prod_32fc_sifive_u74(lv_32fc_t* result,
                                                  const lv_32fc_t* input,
                                                  const lv_32fc_t* taps,
                                                  unsigned int num_points);
#ifdef LV_HAVE_GENERIC

static inline void volk_32fc_x2_dot_prod_32fc_generic(lv_32fc_t* result,
                                                      const lv_32fc_t* input,
                                                      const lv_32fc_t* taps,
                                                      unsigned int num_points)
{
    float* res = (float*)result;
    float* in = (float*)input;
    float* tp = (float*)taps;
    unsigned int n_2_ccomplex_blocks = num_points / 2;

    float sum0[2] = { 0, 0 };
    float sum1[2] = { 0, 0 };
    unsigned int i = 0;

    // process two complex samples per iteration with independent accumulators
    for (i = 0; i < n_2_ccomplex_blocks; ++i) {
        sum0[0] += in[0] * tp[0] - in[1] * tp[1];
        sum0[1] += in[0] * tp[1] + in[1] * tp[0];
        sum1[0] += in[2] * tp[2] - in[3] * tp[3];
        sum1[1] += in[2] * tp[3] + in[3] * tp[2];

        in += 4;
        tp += 4;
    }

    res[0] = sum0[0] + sum1[0];
    res[1] = sum0[1] + sum1[1];

    // cleanup if we had an odd number of points
    if (num_points & 1) {
        *result += input[num_points - 1] * taps[num_points - 1];
    }
}

#endif /* LV_HAVE_GENERIC */
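/* All of the SSE3/AVX kernels below use the same interleaved complex-multiply
 * scheme. With x = (ar, ai, br, bi) and y = (cr, ci, dr, di):
 *   moveldup(y) duplicates the even lanes    -> (cr, cr, dr, dr)
 *   movehdup(y) duplicates the odd lanes     -> (ci, ci, di, di)
 *   shuffle(x, x, 0xB1) swaps adjacent lanes -> (ai, ar, bi, br)
 *   addsub(t1, t2) computes t1 - t2 in even lanes and t1 + t2 in odd lanes,
 * so the result lanes are (ar*cr - ai*ci, ai*cr + ar*ci, ...), i.e. two full
 * complex products per 128-bit register. */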
#ifdef LV_HAVE_SSE3

#include <pmmintrin.h>

static inline void volk_32fc_x2_dot_prod_32fc_u_sse3(lv_32fc_t* result,
                                                     const lv_32fc_t* input,
                                                     const lv_32fc_t* taps,
                                                     unsigned int num_points)
{
    lv_32fc_t dotProduct;
    memset(&dotProduct, 0x0, 2 * sizeof(float));
    unsigned int number = 0;
    const unsigned int halfPoints = num_points / 2;
    unsigned int isodd = num_points & 1;
    __m128 x, y, yl, yh, z, tmp1, tmp2, dotProdVal;

    const lv_32fc_t* a = input;
    const lv_32fc_t* b = taps;

    dotProdVal = _mm_setzero_ps();

    for (; number < halfPoints; number++) {
        x = _mm_loadu_ps((float*)a); // load ar,ai,br,bi (two complex samples)
        y = _mm_loadu_ps((float*)b); // load cr,ci,dr,di

        yl = _mm_moveldup_ps(y); // yl = cr,cr,dr,dr
        yh = _mm_movehdup_ps(y); // yh = ci,ci,di,di

        tmp1 = _mm_mul_ps(x, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr

        x = _mm_shuffle_ps(x, x, 0xB1); // re-arrange x to be ai,ar,bi,br

        tmp2 = _mm_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di

        z = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci | ai*cr+ar*ci | br*dr-bi*di | bi*dr+br*di

        dotProdVal = _mm_add_ps(dotProdVal, z); // accumulate the partial products

        a += 2;
        b += 2;
    }

    __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector[2];
    _mm_storeu_ps((float*)dotProductVector, dotProdVal); // store the accumulator

    dotProduct += (dotProductVector[0] + dotProductVector[1]);
    if (isodd) {
        dotProduct += input[num_points - 1] * taps[num_points - 1];
    }

    *result = dotProduct;
}

#endif /* LV_HAVE_SSE3 */
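/* The AVX kernel below is the same moveldup/movehdup/addsub scheme widened
 * from 128 to 256 bits: four complex samples per iteration instead of two,
 * with the leftover (up to three) points handled in a scalar tail loop. */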
#ifdef LV_HAVE_AVX

#include <immintrin.h>

static inline void volk_32fc_x2_dot_prod_32fc_u_avx(lv_32fc_t* result,
                                                    const lv_32fc_t* input,
                                                    const lv_32fc_t* taps,
                                                    unsigned int num_points)
{
    unsigned int isodd = num_points & 3;
    unsigned int i = 0;
    lv_32fc_t dotProduct;
    memset(&dotProduct, 0x0, 2 * sizeof(float));
    unsigned int number = 0;
    const unsigned int quarterPoints = num_points / 4;

    __m256 x, y, yl, yh, z, tmp1, tmp2, dotProdVal;

    const lv_32fc_t* a = input;
    const lv_32fc_t* b = taps;

    dotProdVal = _mm256_setzero_ps();

    for (; number < quarterPoints; number++) {
        x = _mm256_loadu_ps((float*)a); // load ar,ai,br,bi,... (four complex samples)
        y = _mm256_loadu_ps((float*)b); // load er,ei,fr,fi,...

        yl = _mm256_moveldup_ps(y); // yl = er,er,fr,fr,...
        yh = _mm256_movehdup_ps(y); // yh = ei,ei,fi,fi,...

        tmp1 = _mm256_mul_ps(x, yl); // tmp1 = ar*er,ai*er,br*fr,bi*fr,...

        x = _mm256_shuffle_ps(x, x, 0xB1); // re-arrange x to be ai,ar,bi,br,...

        tmp2 = _mm256_mul_ps(x, yh); // tmp2 = ai*ei,ar*ei,bi*fi,br*fi,...

        z = _mm256_addsub_ps(tmp1, tmp2); // ar*er-ai*ei | ai*er+ar*ei | ...

        dotProdVal = _mm256_add_ps(dotProdVal, z); // accumulate the partial products

        a += 4;
        b += 4;
    }

    __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector[4];
    _mm256_storeu_ps((float*)dotProductVector, dotProdVal); // store the accumulator

    dotProduct += (dotProductVector[0] + dotProductVector[1] + dotProductVector[2] +
                   dotProductVector[3]);
    for (i = num_points - isodd; i < num_points; i++) {
        dotProduct += input[i] * taps[i];
    }

    *result = dotProduct;
}

#endif /* LV_HAVE_AVX */
#if LV_HAVE_AVX && LV_HAVE_FMA
#include <immintrin.h>

static inline void volk_32fc_x2_dot_prod_32fc_u_avx_fma(lv_32fc_t* result,
                                                        const lv_32fc_t* input,
                                                        const lv_32fc_t* taps,
                                                        unsigned int num_points)
{
    unsigned int isodd = num_points & 3;
    unsigned int i = 0;
    lv_32fc_t dotProduct;
    memset(&dotProduct, 0x0, 2 * sizeof(float));
    unsigned int number = 0;
    const unsigned int quarterPoints = num_points / 4;

    __m256 x, y, yl, yh, z, tmp1, tmp2, dotProdVal;

    const lv_32fc_t* a = input;
    const lv_32fc_t* b = taps;

    dotProdVal = _mm256_setzero_ps();

    for (; number < quarterPoints; number++) {
        x = _mm256_loadu_ps((float*)a); // load ar,ai,br,bi,... (four complex samples)
        y = _mm256_loadu_ps((float*)b); // load er,ei,fr,fi,...

        yl = _mm256_moveldup_ps(y); // yl = er,er,fr,fr,...
        yh = _mm256_movehdup_ps(y); // yh = ei,ei,fi,fi,...

        tmp1 = x; // keep the original ar,ai,... order for the fused multiply

        x = _mm256_shuffle_ps(x, x, 0xB1); // re-arrange x to be ai,ar,bi,br,...

        tmp2 = _mm256_mul_ps(x, yh); // tmp2 = ai*ei,ar*ei,bi*fi,br*fi,...

        z = _mm256_fmaddsub_ps(tmp1, yl, tmp2); // tmp1*yl -/+ tmp2, fused

        dotProdVal = _mm256_add_ps(dotProdVal, z); // accumulate the partial products

        a += 4;
        b += 4;
    }

    __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector[4];
    _mm256_storeu_ps((float*)dotProductVector, dotProdVal); // store the accumulator

    dotProduct += (dotProductVector[0] + dotProductVector[1] + dotProductVector[2] +
                   dotProductVector[3]);
    for (i = num_points - isodd; i < num_points; i++) {
        dotProduct += input[i] * taps[i];
    }

    *result = dotProduct;
}

#endif /* LV_HAVE_AVX && LV_HAVE_FMA */
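/* _mm256_fmaddsub_ps(a, b, c) computes a*b - c in even lanes and a*b + c in
 * odd lanes as a single fused instruction, so the FMA variant above replaces
 * the separate multiply and addsub of the plain AVX kernel with one op per
 * iteration (and a single rounding step per lane). */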
#endif /* INCLUDED_volk_32fc_x2_dot_prod_32fc_u_H */

#ifndef INCLUDED_volk_32fc_x2_dot_prod_32fc_a_H
#define INCLUDED_volk_32fc_x2_dot_prod_32fc_a_H

#include <stdio.h>
#include <string.h>
#include <volk/volk_common.h>
#include <volk/volk_complex.h>
#ifdef LV_HAVE_SSE3

#include <pmmintrin.h>

static inline void volk_32fc_x2_dot_prod_32fc_a_sse3(lv_32fc_t* result,
                                                     const lv_32fc_t* input,
                                                     const lv_32fc_t* taps,
                                                     unsigned int num_points)
{
    const unsigned int num_bytes = num_points * 8;
    unsigned int isodd = num_points & 1;

    lv_32fc_t dotProduct;
    memset(&dotProduct, 0x0, 2 * sizeof(float));
    unsigned int number = 0;
    const unsigned int halfPoints = num_bytes >> 4;

    __m128 x, y, yl, yh, z, tmp1, tmp2, dotProdVal;

    const lv_32fc_t* a = input;
    const lv_32fc_t* b = taps;

    dotProdVal = _mm_setzero_ps();

    for (; number < halfPoints; number++) {
        x = _mm_load_ps((float*)a); // load ar,ai,br,bi (two complex samples)
        y = _mm_load_ps((float*)b); // load cr,ci,dr,di

        yl = _mm_moveldup_ps(y); // yl = cr,cr,dr,dr
        yh = _mm_movehdup_ps(y); // yh = ci,ci,di,di

        tmp1 = _mm_mul_ps(x, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr

        x = _mm_shuffle_ps(x, x, 0xB1); // re-arrange x to be ai,ar,bi,br

        tmp2 = _mm_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di

        z = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci | ai*cr+ar*ci | br*dr-bi*di | bi*dr+br*di

        dotProdVal = _mm_add_ps(dotProdVal, z); // accumulate the partial products

        a += 2;
        b += 2;
    }

    __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector[2];
    _mm_store_ps((float*)dotProductVector, dotProdVal); // store the accumulator

    dotProduct += (dotProductVector[0] + dotProductVector[1]);
    if (isodd) {
        dotProduct += input[num_points - 1] * taps[num_points - 1];
    }

    *result = dotProduct;
}

#endif /* LV_HAVE_SSE3 */
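/* Note on the _a_ (aligned) kernels in this half of the header: _mm_load_ps
 * and _mm_store_ps require 16-byte aligned pointers (32-byte for the AVX
 * variants below); the _u_ kernels above use loadu/storeu and accept any
 * alignment at a small performance cost on some CPUs. */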
#ifdef LV_HAVE_NEON
#include <arm_neon.h>

static inline void volk_32fc_x2_dot_prod_32fc_neon(lv_32fc_t* result,
                                                   const lv_32fc_t* input,
                                                   const lv_32fc_t* taps,
                                                   unsigned int num_points)
{
    unsigned int quarter_points = num_points / 4;
    unsigned int number;

    lv_32fc_t* a_ptr = (lv_32fc_t*)taps;
    lv_32fc_t* b_ptr = (lv_32fc_t*)input;

    // for 2-lane vectors, the 1st lane holds the real part,
    // the 2nd lane holds the imaginary part
    float32x4x2_t a_val, b_val, c_val, accumulator;
    float32x4x2_t tmp_real, tmp_imag;
    accumulator.val[0] = vdupq_n_f32(0);
    accumulator.val[1] = vdupq_n_f32(0);
    for (number = 0; number < quarter_points; ++number) {
        a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
        b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
        __VOLK_PREFETCH(a_ptr + 8);
        __VOLK_PREFETCH(b_ptr + 8);

        // multiply real*real and imag*imag to get the real result
        tmp_real.val[0] = vmulq_f32(a_val.val[0], b_val.val[0]); // a0r*b0r|...
        tmp_real.val[1] = vmulq_f32(a_val.val[1], b_val.val[1]); // a0i*b0i|...

        // multiply the cross terms to get the imaginary result
        tmp_imag.val[0] = vmulq_f32(a_val.val[0], b_val.val[1]); // a0r*b0i|...
        tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]); // a0i*b0r|...

        c_val.val[0] = vsubq_f32(tmp_real.val[0], tmp_real.val[1]);
        c_val.val[1] = vaddq_f32(tmp_imag.val[0], tmp_imag.val[1]);

        accumulator.val[0] = vaddq_f32(accumulator.val[0], c_val.val[0]);
        accumulator.val[1] = vaddq_f32(accumulator.val[1], c_val.val[1]);

        a_ptr += 4;
        b_ptr += 4;
    }

    lv_32fc_t accum_result[4];
    vst2q_f32((float*)accum_result, accumulator);
    *result = accum_result[0] + accum_result[1] + accum_result[2] + accum_result[3];

    // tail case
    for (number = quarter_points * 4; number < num_points; ++number) {
        *result += (*a_ptr++) * (*b_ptr++);
    }
}
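/* vld2q_f32 de-interleaves four complex samples into planar registers:
 * val[0] = (a0r, a1r, a2r, a3r) and val[1] = (a0i, a1i, a2i, a3i). The kernel
 * above then applies the textbook four-multiply complex product per lane; the
 * variants that follow fold those multiplies into fused multiply-accumulates
 * step by step. */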
static inline void volk_32fc_x2_dot_prod_32fc_neon_opttests(lv_32fc_t* result,
                                                            const lv_32fc_t* input,
                                                            const lv_32fc_t* taps,
                                                            unsigned int num_points)
{
    unsigned int quarter_points = num_points / 4;
    unsigned int number;

    lv_32fc_t* a_ptr = (lv_32fc_t*)taps;
    lv_32fc_t* b_ptr = (lv_32fc_t*)input;

    float32x4x2_t a_val, b_val, accumulator;
    float32x4x2_t tmp_imag;
    accumulator.val[0] = vdupq_n_f32(0);
    accumulator.val[1] = vdupq_n_f32(0);
    for (number = 0; number < quarter_points; ++number) {
        a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
        b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
        __VOLK_PREFETCH(a_ptr + 8);
        __VOLK_PREFETCH(b_ptr + 8);

        // do the first multiply
        tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]);
        tmp_imag.val[0] = vmulq_f32(a_val.val[0], b_val.val[0]);

        // use multiply-accumulate/subtract to finish the complex product
        tmp_imag.val[1] = vmlaq_f32(tmp_imag.val[1], a_val.val[0], b_val.val[1]);
        tmp_imag.val[0] = vmlsq_f32(tmp_imag.val[0], a_val.val[1], b_val.val[1]);

        accumulator.val[0] = vaddq_f32(accumulator.val[0], tmp_imag.val[0]);
        accumulator.val[1] = vaddq_f32(accumulator.val[1], tmp_imag.val[1]);

        a_ptr += 4;
        b_ptr += 4;
    }

    lv_32fc_t accum_result[4];
    vst2q_f32((float*)accum_result, accumulator);
    *result = accum_result[0] + accum_result[1] + accum_result[2] + accum_result[3];

    // tail case
    for (number = quarter_points * 4; number < num_points; ++number) {
        *result += (*a_ptr++) * (*b_ptr++);
    }
}
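/* The variant above trades two of the four plain multiplies for vmla/vmls
 * (multiply-accumulate/subtract), saving the separate vsub/vadd combine step
 * of the first NEON kernel: six vector arithmetic ops per iteration instead
 * of eight. */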
static inline void volk_32fc_x2_dot_prod_32fc_neon_optfma(lv_32fc_t* result,
                                                          const lv_32fc_t* input,
                                                          const lv_32fc_t* taps,
                                                          unsigned int num_points)
{
    unsigned int quarter_points = num_points / 4;
    unsigned int number;

    lv_32fc_t* a_ptr = (lv_32fc_t*)taps;
    lv_32fc_t* b_ptr = (lv_32fc_t*)input;

    // use 2 accumulators to remove inter-instruction data dependencies
    float32x4x2_t a_val, b_val, accumulator1, accumulator2;
    accumulator1.val[0] = vdupq_n_f32(0);
    accumulator1.val[1] = vdupq_n_f32(0);
    accumulator2.val[0] = vdupq_n_f32(0);
    accumulator2.val[1] = vdupq_n_f32(0);
    for (number = 0; number < quarter_points; ++number) {
        a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
        b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
        __VOLK_PREFETCH(a_ptr + 8);
        __VOLK_PREFETCH(b_ptr + 8);

        // accumulate the four partial products directly
        accumulator1.val[0] = vmlaq_f32(accumulator1.val[0], a_val.val[0], b_val.val[0]);
        accumulator1.val[1] = vmlaq_f32(accumulator1.val[1], a_val.val[0], b_val.val[1]);
        accumulator2.val[0] = vmlsq_f32(accumulator2.val[0], a_val.val[1], b_val.val[1]);
        accumulator2.val[1] = vmlaq_f32(accumulator2.val[1], a_val.val[1], b_val.val[0]);

        a_ptr += 4;
        b_ptr += 4;
    }
    // combine the two accumulator banks
    accumulator1.val[0] = vaddq_f32(accumulator1.val[0], accumulator2.val[0]);
    accumulator1.val[1] = vaddq_f32(accumulator1.val[1], accumulator2.val[1]);

    lv_32fc_t accum_result[4];
    vst2q_f32((float*)accum_result, accumulator1);
    *result = accum_result[0] + accum_result[1] + accum_result[2] + accum_result[3];
    // tail case
    for (number = quarter_points * 4; number < num_points; ++number) {
        *result += (*a_ptr++) * (*b_ptr++);
    }
}
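/* The optfma variant above goes one step further and accumulates straight
 * into two accumulator banks with four vmla/vmls per iteration; the banks are
 * only combined once after the loop, which also breaks the dependency chain
 * between consecutive iterations. */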
static inline void volk_32fc_x2_dot_prod_32fc_neon_optfmaunroll(lv_32fc_t* result,
                                                                const lv_32fc_t* input,
                                                                const lv_32fc_t* taps,
                                                                unsigned int num_points)
{
    // processes 8 complex samples per iteration
    unsigned int quarter_points = num_points / 8;
    unsigned int number;

    lv_32fc_t* a_ptr = (lv_32fc_t*)taps;
    lv_32fc_t* b_ptr = (lv_32fc_t*)input;

    // use 2 accumulator banks to remove inter-instruction data dependencies
    float32x4x4_t a_val, b_val, accumulator1, accumulator2;
    float32x4x2_t reduced_accumulator;
    accumulator1.val[0] = vdupq_n_f32(0);
    accumulator1.val[1] = vdupq_n_f32(0);
    accumulator1.val[2] = vdupq_n_f32(0);
    accumulator1.val[3] = vdupq_n_f32(0);
    accumulator2.val[0] = vdupq_n_f32(0);
    accumulator2.val[1] = vdupq_n_f32(0);
    accumulator2.val[2] = vdupq_n_f32(0);
    accumulator2.val[3] = vdupq_n_f32(0);
    for (number = 0; number < quarter_points; ++number) {
        // vld4q de-interleaves with stride 4:
        // val[0]=r0|r2|r4|r6, val[1]=i0|i2|i4|i6, val[2]=r1|r3|r5|r7, val[3]=i1|i3|i5|i7
        a_val = vld4q_f32((float*)a_ptr);
        b_val = vld4q_f32((float*)b_ptr);
        __VOLK_PREFETCH(a_ptr + 8);
        __VOLK_PREFETCH(b_ptr + 8);

        // even-indexed samples
        accumulator1.val[0] = vmlaq_f32(accumulator1.val[0], a_val.val[0], b_val.val[0]);
        accumulator1.val[1] = vmlaq_f32(accumulator1.val[1], a_val.val[0], b_val.val[1]);
        // odd-indexed samples
        accumulator1.val[2] = vmlaq_f32(accumulator1.val[2], a_val.val[2], b_val.val[2]);
        accumulator1.val[3] = vmlaq_f32(accumulator1.val[3], a_val.val[2], b_val.val[3]);

        accumulator2.val[0] = vmlsq_f32(accumulator2.val[0], a_val.val[1], b_val.val[1]);
        accumulator2.val[1] = vmlaq_f32(accumulator2.val[1], a_val.val[1], b_val.val[0]);
        accumulator2.val[2] = vmlsq_f32(accumulator2.val[2], a_val.val[3], b_val.val[3]);
        accumulator2.val[3] = vmlaq_f32(accumulator2.val[3], a_val.val[3], b_val.val[2]);

        a_ptr += 8;
        b_ptr += 8;
    }
    // reduce the 8 accumulators down to 2 (1 real, 1 imaginary)
    accumulator1.val[0] = vaddq_f32(accumulator1.val[0], accumulator1.val[2]);
    accumulator1.val[1] = vaddq_f32(accumulator1.val[1], accumulator1.val[3]);
    accumulator2.val[0] = vaddq_f32(accumulator2.val[0], accumulator2.val[2]);
    accumulator2.val[1] = vaddq_f32(accumulator2.val[1], accumulator2.val[3]);
    reduced_accumulator.val[0] = vaddq_f32(accumulator1.val[0], accumulator2.val[0]);
    reduced_accumulator.val[1] = vaddq_f32(accumulator1.val[1], accumulator2.val[1]);

    lv_32fc_t accum_result[4];
    vst2q_f32((float*)accum_result, reduced_accumulator);
    *result = accum_result[0] + accum_result[1] + accum_result[2] + accum_result[3];
    // tail case
    for (number = quarter_points * 8; number < num_points; ++number) {
        *result += (*a_ptr++) * (*b_ptr++);
    }
}
#endif /* LV_HAVE_NEON */
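/* The unrolled variant above uses vld4q_f32 to pull in eight complex samples
 * at once, split into even/odd sample groups, and spreads the work over eight
 * accumulator registers; everything is reduced to one real and one imaginary
 * vector only after the loop. */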
#ifdef LV_HAVE_NEONV8
#include <arm_neon.h>

static inline void volk_32fc_x2_dot_prod_32fc_neonv8(lv_32fc_t* result,
                                                     const lv_32fc_t* input,
                                                     const lv_32fc_t* taps,
                                                     unsigned int num_points)
{
    unsigned int n = num_points;
    const lv_32fc_t* a = input;
    const lv_32fc_t* b = taps;
    // planar accumulators: two independent real/imag pairs
    float32x4_t acc0_r = vdupq_n_f32(0);
    float32x4_t acc0_i = vdupq_n_f32(0);
    float32x4_t acc1_r = vdupq_n_f32(0);
    float32x4_t acc1_i = vdupq_n_f32(0);

    // main loop: 8 complex samples per iteration
    while (n >= 8) {
        float32x4x2_t a0 = vld2q_f32((const float*)a);
        float32x4x2_t b0 = vld2q_f32((const float*)b);
        float32x4x2_t a1 = vld2q_f32((const float*)(a + 4));
        float32x4x2_t b1 = vld2q_f32((const float*)(b + 4));

        __VOLK_PREFETCH(a + 8);
        __VOLK_PREFETCH(b + 8);
        acc0_r = vfmaq_f32(acc0_r, a0.val[0], b0.val[0]);
        acc0_r = vfmsq_f32(acc0_r, a0.val[1], b0.val[1]);
        acc0_i = vfmaq_f32(acc0_i, a0.val[0], b0.val[1]);
        acc0_i = vfmaq_f32(acc0_i, a0.val[1], b0.val[0]);

        acc1_r = vfmaq_f32(acc1_r, a1.val[0], b1.val[0]);
        acc1_r = vfmsq_f32(acc1_r, a1.val[1], b1.val[1]);
        acc1_i = vfmaq_f32(acc1_i, a1.val[0], b1.val[1]);
        acc1_i = vfmaq_f32(acc1_i, a1.val[1], b1.val[0]);

        a += 8;
        b += 8;
        n -= 8;
    }

    // one remaining block of 4 complex samples, if any
    if (n >= 4) {
        float32x4x2_t a0 = vld2q_f32((const float*)a);
        float32x4x2_t b0 = vld2q_f32((const float*)b);

        acc0_r = vfmaq_f32(acc0_r, a0.val[0], b0.val[0]);
        acc0_r = vfmsq_f32(acc0_r, a0.val[1], b0.val[1]);
        acc0_i = vfmaq_f32(acc0_i, a0.val[0], b0.val[1]);
        acc0_i = vfmaq_f32(acc0_i, a0.val[1], b0.val[0]);

        a += 4;
        b += 4;
        n -= 4;
    }
    acc0_r = vaddq_f32(acc0_r, acc1_r);
    acc0_i = vaddq_f32(acc0_i, acc1_i);

    // horizontal reduction of each 4-lane accumulator
    float32x2_t sum_r = vadd_f32(vget_low_f32(acc0_r), vget_high_f32(acc0_r));
    float32x2_t sum_i = vadd_f32(vget_low_f32(acc0_i), vget_high_f32(acc0_i));
    sum_r = vpadd_f32(sum_r, sum_r);
    sum_i = vpadd_f32(sum_i, sum_i);

    float res_r = vget_lane_f32(sum_r, 0);
    float res_i = vget_lane_f32(sum_i, 0);

    // scalar tail for the remaining 0-3 samples
    while (n > 0) {
        res_r += lv_creal(*a) * lv_creal(*b) - lv_cimag(*a) * lv_cimag(*b);
        res_i += lv_creal(*a) * lv_cimag(*b) + lv_cimag(*a) * lv_creal(*b);
        ++a;
        ++b;
        --n;
    }

    *result = lv_cmake(res_r, res_i);
}
#endif /* LV_HAVE_NEONV8 */
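/* The ARMv8 kernel above keeps fully planar accumulators (separate real and
 * imaginary float32x4_t registers) and uses vfmaq/vfmsq fused ops, then
 * reduces each accumulator horizontally: vadd_f32 of the low and high halves
 * followed by vpadd_f32 collapses four lanes to one scalar in two steps. */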
#ifdef LV_HAVE_AVX
#include <immintrin.h>

static inline void volk_32fc_x2_dot_prod_32fc_a_avx(lv_32fc_t* result,
                                                    const lv_32fc_t* input,
                                                    const lv_32fc_t* taps,
                                                    unsigned int num_points)
{
    unsigned int isodd = num_points & 3;
    unsigned int i = 0;
    lv_32fc_t dotProduct;
    memset(&dotProduct, 0x0, 2 * sizeof(float));
    unsigned int number = 0;
    const unsigned int quarterPoints = num_points / 4;

    __m256 x, y, yl, yh, z, tmp1, tmp2, dotProdVal;

    const lv_32fc_t* a = input;
    const lv_32fc_t* b = taps;

    dotProdVal = _mm256_setzero_ps();

    for (; number < quarterPoints; number++) {
        x = _mm256_load_ps((float*)a); // load ar,ai,br,bi,... (four complex samples)
        y = _mm256_load_ps((float*)b); // load er,ei,fr,fi,...

        yl = _mm256_moveldup_ps(y); // yl = er,er,fr,fr,...
        yh = _mm256_movehdup_ps(y); // yh = ei,ei,fi,fi,...

        tmp1 = _mm256_mul_ps(x, yl); // tmp1 = ar*er,ai*er,br*fr,bi*fr,...

        x = _mm256_shuffle_ps(x, x, 0xB1); // re-arrange x to be ai,ar,bi,br,...

        tmp2 = _mm256_mul_ps(x, yh); // tmp2 = ai*ei,ar*ei,bi*fi,br*fi,...

        z = _mm256_addsub_ps(tmp1, tmp2); // ar*er-ai*ei | ai*er+ar*ei | ...

        dotProdVal = _mm256_add_ps(dotProdVal, z); // accumulate the partial products

        a += 4;
        b += 4;
    }

    __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector[4];
    _mm256_store_ps((float*)dotProductVector, dotProdVal); // store the accumulator

    dotProduct += (dotProductVector[0] + dotProductVector[1] + dotProductVector[2] +
                   dotProductVector[3]);
    for (i = num_points - isodd; i < num_points; i++) {
        dotProduct += input[i] * taps[i];
    }

    *result = dotProduct;
}
#endif /* LV_HAVE_AVX */
#if LV_HAVE_AVX && LV_HAVE_FMA
#include <immintrin.h>

static inline void volk_32fc_x2_dot_prod_32fc_a_avx_fma(lv_32fc_t* result,
                                                        const lv_32fc_t* input,
                                                        const lv_32fc_t* taps,
                                                        unsigned int num_points)
{
    unsigned int isodd = num_points & 3;
    unsigned int i = 0;
    lv_32fc_t dotProduct;
    memset(&dotProduct, 0x0, 2 * sizeof(float));
    unsigned int number = 0;
    const unsigned int quarterPoints = num_points / 4;

    __m256 x, y, yl, yh, z, tmp1, tmp2, dotProdVal;

    const lv_32fc_t* a = input;
    const lv_32fc_t* b = taps;

    dotProdVal = _mm256_setzero_ps();

    for (; number < quarterPoints; number++) {
        x = _mm256_load_ps((float*)a); // load ar,ai,br,bi,... (four complex samples)
        y = _mm256_load_ps((float*)b); // load er,ei,fr,fi,...

        yl = _mm256_moveldup_ps(y); // yl = er,er,fr,fr,...
        yh = _mm256_movehdup_ps(y); // yh = ei,ei,fi,fi,...

        tmp1 = x; // keep the original ar,ai,... order for the fused multiply

        x = _mm256_shuffle_ps(x, x, 0xB1); // re-arrange x to be ai,ar,bi,br,...

        tmp2 = _mm256_mul_ps(x, yh); // tmp2 = ai*ei,ar*ei,bi*fi,br*fi,...

        z = _mm256_fmaddsub_ps(tmp1, yl, tmp2); // tmp1*yl -/+ tmp2, fused

        dotProdVal = _mm256_add_ps(dotProdVal, z); // accumulate the partial products

        a += 4;
        b += 4;
    }

    __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector[4];
    _mm256_store_ps((float*)dotProductVector, dotProdVal); // store the accumulator

    dotProduct += (dotProductVector[0] + dotProductVector[1] + dotProductVector[2] +
                   dotProductVector[3]);
    for (i = num_points - isodd; i < num_points; i++) {
        dotProduct += input[i] * taps[i];
    }

    *result = dotProduct;
}
#endif /* LV_HAVE_AVX && LV_HAVE_FMA */
#ifdef LV_HAVE_RVV
#include <riscv_vector.h>
#include <volk/volk_rvv_intrinsics.h>

static inline void volk_32fc_x2_dot_prod_32fc_rvv(lv_32fc_t* result,
                                                  const lv_32fc_t* input,
                                                  const lv_32fc_t* taps,
                                                  unsigned int num_points)
{
    vfloat32m2_t vsumr = __riscv_vfmv_v_f_f32m2(0, __riscv_vsetvlmax_e32m2());
    vfloat32m2_t vsumi = vsumr;
    size_t n = num_points;
    for (size_t vl; n > 0; n -= vl, input += vl, taps += vl) {
        vl = __riscv_vsetvl_e32m2(n);
        // load each complex sample as one 64-bit element
        vuint64m4_t va = __riscv_vle64_v_u64m4((const uint64_t*)input, vl);
        vuint64m4_t vb = __riscv_vle64_v_u64m4((const uint64_t*)taps, vl);
        // split into real (low 32 bits) and imaginary (high 32 bits) parts
        vfloat32m2_t var = __riscv_vreinterpret_f32m2(__riscv_vnsrl(va, 0, vl));
        vfloat32m2_t vbr = __riscv_vreinterpret_f32m2(__riscv_vnsrl(vb, 0, vl));
        vfloat32m2_t vai = __riscv_vreinterpret_f32m2(__riscv_vnsrl(va, 32, vl));
        vfloat32m2_t vbi = __riscv_vreinterpret_f32m2(__riscv_vnsrl(vb, 32, vl));
        vfloat32m2_t vr = __riscv_vfnmsac(__riscv_vfmul(var, vbr, vl), vai, vbi, vl);
        vfloat32m2_t vi = __riscv_vfmacc(__riscv_vfmul(var, vbi, vl), vai, vbr, vl);
        vsumr = __riscv_vfadd_tu(vsumr, vsumr, vr, vl);
        vsumi = __riscv_vfadd_tu(vsumi, vsumi, vi, vl);
    }
    size_t vl = __riscv_vsetvlmax_e32m1();
    // fold the m2 accumulators to m1 (RISCV_SHRINK2 is VOLK's helper macro)
    vfloat32m1_t vr = RISCV_SHRINK2(vfadd, f, 32, vsumr);
    vfloat32m1_t vi = RISCV_SHRINK2(vfadd, f, 32, vsumi);
    vfloat32m1_t z = __riscv_vfmv_s_f_f32m1(0, vl);
    *result = lv_cmake(__riscv_vfmv_f(__riscv_vfredusum(vr, z, vl)),
                       __riscv_vfmv_f(__riscv_vfredusum(vi, z, vl)));
}
#endif /* LV_HAVE_RVV */
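/* The rvv kernel above loads each complex sample as one 64-bit element and
 * splits real/imag with narrowing shifts (vnsrl by 0 and by 32); the rvvseg
 * kernel below obtains the same planar layout directly from a segmented load
 * (vlseg2e32), trading the shift work for a more expensive load instruction. */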
#ifdef LV_HAVE_RVVSEG
#include <riscv_vector.h>
#include <volk/volk_rvv_intrinsics.h>

static inline void volk_32fc_x2_dot_prod_32fc_rvvseg(lv_32fc_t* result,
                                                     const lv_32fc_t* input,
                                                     const lv_32fc_t* taps,
                                                     unsigned int num_points)
{
    vfloat32m4_t vsumr = __riscv_vfmv_v_f_f32m4(0, __riscv_vsetvlmax_e32m4());
    vfloat32m4_t vsumi = vsumr;
    size_t n = num_points;
    for (size_t vl; n > 0; n -= vl, input += vl, taps += vl) {
        vl = __riscv_vsetvl_e32m4(n);
        // segmented loads de-interleave directly into real/imag registers
        vfloat32m4x2_t va = __riscv_vlseg2e32_v_f32m4x2((const float*)input, vl);
        vfloat32m4x2_t vb = __riscv_vlseg2e32_v_f32m4x2((const float*)taps, vl);
        vfloat32m4_t var = __riscv_vget_f32m4(va, 0), vai = __riscv_vget_f32m4(va, 1);
        vfloat32m4_t vbr = __riscv_vget_f32m4(vb, 0), vbi = __riscv_vget_f32m4(vb, 1);
        vfloat32m4_t vr = __riscv_vfnmsac(__riscv_vfmul(var, vbr, vl), vai, vbi, vl);
        vfloat32m4_t vi = __riscv_vfmacc(__riscv_vfmul(var, vbi, vl), vai, vbr, vl);
        vsumr = __riscv_vfadd_tu(vsumr, vsumr, vr, vl);
        vsumi = __riscv_vfadd_tu(vsumi, vsumi, vi, vl);
    }
    size_t vl = __riscv_vsetvlmax_e32m1();
    // fold the m4 accumulators to m1 (RISCV_SHRINK4 is VOLK's helper macro)
    vfloat32m1_t vr = RISCV_SHRINK4(vfadd, f, 32, vsumr);
    vfloat32m1_t vi = RISCV_SHRINK4(vfadd, f, 32, vsumi);
    vfloat32m1_t z = __riscv_vfmv_s_f_f32m1(0, vl);
    *result = lv_cmake(__riscv_vfmv_f(__riscv_vfredusum(vr, z, vl)),
                       __riscv_vfmv_f(__riscv_vfredusum(vi, z, vl)));
}
#endif /* LV_HAVE_RVVSEG */

#endif /* INCLUDED_volk_32fc_x2_dot_prod_32fc_a_H */