45#ifndef INCLUDED_volk_16i_32fc_dot_prod_32fc_H
46#define INCLUDED_volk_16i_32fc_dot_prod_32fc_H
/* Generic (scalar) kernel: accumulates sum_i taps[i] * (float)input[i],
 * i.e. a dot product of 16-bit real samples against complex-float taps.
 * NOTE(review): the function name, earlier parameters, the declarations
 * of i and acc0..acc3 (presumably lv_32fc_t initialized to 0), and the
 * closing braces are missing from this excerpt -- confirm against the
 * full file. Only comments were added here. */
57 unsigned int num_points)
/* 4-way manual unroll: four independent accumulator chains. */
60 static const int N_UNROLL = 4;
/* n = largest multiple of N_UNROLL not exceeding num_points. */
68 unsigned n = (num_points / N_UNROLL) * N_UNROLL;
70 for (i = 0; i < n; i += N_UNROLL) {
71 acc0 += taps[i + 0] * (float)input[i + 0];
72 acc1 += taps[i + 1] * (float)input[i + 1];
73 acc2 += taps[i + 2] * (float)input[i + 2];
74 acc3 += taps[i + 3] * (float)input[i + 3];
/* Scalar tail for the remaining num_points % N_UNROLL samples. */
77 for (; i < num_points; i++) {
78 acc0 += taps[i] * (float)input[i];
/* Fold the four partial sums into the single complex result. */
81 *result = acc0 + acc1 + acc2 + acc3;
/* ARM NEON kernel: processes 4 samples per iteration.  Taps are loaded
 * de-interleaved (vld2q_f32 -> val[0]=real parts, val[1]=imag parts);
 * the four int16 inputs are widened to float and multiplied against
 * both component vectors.
 * NOTE(review): the function name/signature, the declarations of ii,
 * tapsPtr, input16, input32 and accumulator_vec, and the loop closing
 * braces are missing from this excerpt.  accumulator_vec is presumably
 * an lv_32fc_t[4] scratch array -- confirm against the full file. */
91 unsigned int num_points)
95 unsigned quarter_points = num_points / 4;
97 short* inputPtr = (
short*)input;
/* Two-register accumulator: val[0] real sums, val[1] imag sums. */
100 float32x4x2_t tapsVal, accumulator_val;
103 float32x4_t input_float, prod_re, prod_im;
105 accumulator_val.val[0] = vdupq_n_f32(0.0);
106 accumulator_val.val[1] = vdupq_n_f32(0.0);
108 for (ii = 0; ii < quarter_points; ++ii) {
/* Load 4 complex taps, splitting real/imag into separate vectors. */
109 tapsVal = vld2q_f32((
float*)tapsPtr);
110 input16 = vld1_s16(inputPtr);
/* Widen int16x4 -> int32x4 -> float32x4. */
112 input32 = vmovl_s16(input16);
114 input_float = vcvtq_f32_s32(input32);
116 prod_re = vmulq_f32(input_float, tapsVal.val[0]);
117 prod_im = vmulq_f32(input_float, tapsVal.val[1]);
119 accumulator_val.val[0] = vaddq_f32(prod_re, accumulator_val.val[0]);
120 accumulator_val.val[1] = vaddq_f32(prod_im, accumulator_val.val[1]);
/* Spill the vector accumulators (re-interleaved by vst2q) and reduce
 * the four partial complex sums horizontally in scalar code. */
125 vst2q_f32((
float*)accumulator_vec, accumulator_val);
126 accumulator_vec[0] += accumulator_vec[1];
127 accumulator_vec[2] += accumulator_vec[3];
128 accumulator_vec[0] += accumulator_vec[2];
/* Scalar tail for the remaining num_points % 4 samples. */
130 for (ii = quarter_points * 4; ii < num_points; ++ii) {
131 accumulator_vec[0] += *(tapsPtr++) * (
float)(*(inputPtr++));
134 *result = accumulator_vec[0];
/* ARMv8 (AArch64) NEON kernel: 8 samples per iteration using fused
 * multiply-accumulate (vfmaq_f32) and the v8-only horizontal add
 * (vaddvq_f32).  Taps are loaded de-interleaved into real/imag planes.
 * NOTE(review): the taps/input parameter lines, tapsPtr and returnValue
 * declarations (returnValue presumably seeded with
 * lv_cmake(real_sum, imag_sum)), the tail-loop pointer increments and
 * closing braces are missing from this excerpt. */
142static inline void volk_16i_32fc_dot_prod_32fc_neonv8(
lv_32fc_t* result,
145 unsigned int num_points)
147 const unsigned int eighthPoints = num_points / 8;
148 const short* inputPtr = input;
/* Two accumulator pairs (lo/hi halves of each 8-sample chunk). */
152 float32x4_t real_acc0 = vdupq_n_f32(0);
153 float32x4_t imag_acc0 = vdupq_n_f32(0);
154 float32x4_t real_acc1 = vdupq_n_f32(0);
155 float32x4_t imag_acc1 = vdupq_n_f32(0);
157 for (
unsigned int number = 0; number < eighthPoints; number++) {
/* Load 8 int16 samples; widen each half to float32x4. */
159 int16x8_t input16 = vld1q_s16(inputPtr);
160 float32x4_t input_lo = vcvtq_f32_s32(vmovl_s16(vget_low_s16(input16)));
161 float32x4_t input_hi = vcvtq_f32_s32(vmovl_s16(vget_high_s16(input16)));
/* 8 complex taps, de-interleaved: val[0]=real, val[1]=imag. */
164 float32x4x2_t taps0 = vld2q_f32((
const float*)tapsPtr);
165 float32x4x2_t taps1 = vld2q_f32((
const float*)(tapsPtr + 4));
/* Fused multiply-accumulate into the four running sums. */
170 real_acc0 = vfmaq_f32(real_acc0, input_lo, taps0.val[0]);
171 imag_acc0 = vfmaq_f32(imag_acc0, input_lo, taps0.val[1]);
172 real_acc1 = vfmaq_f32(real_acc1, input_hi, taps1.val[0]);
173 imag_acc1 = vfmaq_f32(imag_acc1, input_hi, taps1.val[1]);
/* Combine the two accumulator pairs, then reduce across lanes. */
180 real_acc0 = vaddq_f32(real_acc0, real_acc1);
181 imag_acc0 = vaddq_f32(imag_acc0, imag_acc1);
184 float real_sum = vaddvq_f32(real_acc0);
185 float imag_sum = vaddvq_f32(imag_acc0);
/* Scalar tail over the interleaved taps (bPtr[0]=re, bPtr[1]=im).
 * NOTE(review): inputPtr/bPtr increments are not visible here. */
190 const float* bPtr = (
const float*)tapsPtr;
191 for (
unsigned int number = eighthPoints * 8; number < num_points; number++) {
192 returnValue +=
lv_cmake(inputPtr[0] * bPtr[0], inputPtr[0] * bPtr[1]);
197 *result = returnValue;
201#if LV_HAVE_SSE && LV_HAVE_MMX
/* SSE + MMX kernel, unaligned tap loads: 8 samples per iteration across
 * four accumulators.  Each real input sample is duplicated into adjacent
 * lanes (unpack of f with itself) so that one sample multiplies both the
 * real and imaginary components of the interleaved complex taps.
 * NOTE(review): parameter lines, the m0/m1, returnValue and
 * dotProductVector declarations, loop pointer increments and closing
 * braces are missing from this excerpt.  dotProductVector is presumably
 * a 16-byte-aligned float[4] -- confirm against the full file. */
203static inline void volk_16i_32fc_dot_prod_32fc_u_sse(
lv_32fc_t* result,
206 unsigned int num_points)
209 unsigned int number = 0;
210 const unsigned int eighthPoints = num_points / 8;
213 const short* aPtr = input;
/* Treat complex taps as a flat float stream: [re, im, re, im, ...]. */
214 const float* bPtr = (
float*)taps;
217 __m128 f0, f1, f2, f3;
218 __m128 a0Val, a1Val, a2Val, a3Val;
219 __m128 b0Val, b1Val, b2Val, b3Val;
220 __m128 c0Val, c1Val, c2Val, c3Val;
222 __m128 dotProdVal0 = _mm_setzero_ps();
223 __m128 dotProdVal1 = _mm_setzero_ps();
224 __m128 dotProdVal2 = _mm_setzero_ps();
225 __m128 dotProdVal3 = _mm_setzero_ps();
227 for (; number < eighthPoints; number++) {
/* Pack 4 shorts into an MMX register, then convert to 4 floats. */
229 m0 = _mm_set_pi16(*(aPtr + 3), *(aPtr + 2), *(aPtr + 1), *(aPtr + 0));
230 m1 = _mm_set_pi16(*(aPtr + 7), *(aPtr + 6), *(aPtr + 5), *(aPtr + 4));
231 f0 = _mm_cvtpi16_ps(m0);
232 f1 = _mm_cvtpi16_ps(m0);
233 f2 = _mm_cvtpi16_ps(m1);
234 f3 = _mm_cvtpi16_ps(m1);
/* unpack(f, f) duplicates each sample: [x0,x0,x1,x1] etc., matching
 * the interleaved re/im layout of the taps below. */
236 a0Val = _mm_unpacklo_ps(f0, f1);
237 a1Val = _mm_unpackhi_ps(f0, f1);
238 a2Val = _mm_unpacklo_ps(f2, f3);
239 a3Val = _mm_unpackhi_ps(f2, f3);
/* 16 floats = 8 complex taps (unaligned loads). */
241 b0Val = _mm_loadu_ps(bPtr);
242 b1Val = _mm_loadu_ps(bPtr + 4);
243 b2Val = _mm_loadu_ps(bPtr + 8);
244 b3Val = _mm_loadu_ps(bPtr + 12);
246 c0Val = _mm_mul_ps(a0Val, b0Val);
247 c1Val = _mm_mul_ps(a1Val, b1Val);
248 c2Val = _mm_mul_ps(a2Val, b2Val);
249 c3Val = _mm_mul_ps(a3Val, b3Val);
251 dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
252 dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
253 dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
254 dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
/* Fold the four accumulators into one, then spill and sum as complex. */
262 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
263 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
264 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
268 _mm_store_ps(dotProductVector,
/* [0]/[1] and [2]/[3] are (re, im) pairs of the two partial sums. */
271 returnValue +=
lv_cmake(dotProductVector[0], dotProductVector[1]);
272 returnValue +=
lv_cmake(dotProductVector[2], dotProductVector[3]);
/* Scalar tail; aPtr/bPtr increments are not visible in this excerpt. */
274 number = eighthPoints * 8;
275 for (; number < num_points; number++) {
276 returnValue +=
lv_cmake(aPtr[0] * bPtr[0], aPtr[0] * bPtr[1]);
281 *result = returnValue;
287#if LV_HAVE_AVX2 && LV_HAVE_FMA
/* AVX2 + FMA kernel, unaligned loads: 16 samples per iteration, fused
 * multiply-add into four 8-float accumulators.
 * NOTE(review): parameter lines, the m0/m1/f0/f1, returnValue and
 * dotProductVector declarations, pointer increments and closing braces
 * are missing from this excerpt; only comments were added. */
289static inline void volk_16i_32fc_dot_prod_32fc_u_avx2_fma(
lv_32fc_t* result,
292 unsigned int num_points)
295 unsigned int number = 0;
296 const unsigned int sixteenthPoints = num_points / 16;
299 const short* aPtr = input;
/* Flat float view of the interleaved complex taps. */
300 const float* bPtr = (
float*)taps;
304 __m256 g0, g1, h0, h1, h2, h3;
305 __m256 a0Val, a1Val, a2Val, a3Val;
306 __m256 b0Val, b1Val, b2Val, b3Val;
308 __m256 dotProdVal0 = _mm256_setzero_ps();
309 __m256 dotProdVal1 = _mm256_setzero_ps();
310 __m256 dotProdVal2 = _mm256_setzero_ps();
311 __m256 dotProdVal3 = _mm256_setzero_ps();
313 for (; number < sixteenthPoints; number++) {
/* Load 2x8 int16 samples (unaligned). */
315 m0 = _mm_loadu_si128((__m128i
const*)aPtr);
316 m1 = _mm_loadu_si128((__m128i
const*)(aPtr + 8));
/* Widen int16 -> int32 -> float. */
318 f0 = _mm256_cvtepi16_epi32(m0);
319 g0 = _mm256_cvtepi32_ps(f0);
320 f1 = _mm256_cvtepi16_epi32(m1);
321 g1 = _mm256_cvtepi32_ps(f1);
/* unpack(g, g) duplicates samples within each 128-bit lane; the
 * permute2f128 below (0x20 lo/lo, 0x31 hi/hi) restores sequential
 * order so each sample lines up with its interleaved re/im taps. */
323 h0 = _mm256_unpacklo_ps(g0, g0);
324 h1 = _mm256_unpackhi_ps(g0, g0);
325 h2 = _mm256_unpacklo_ps(g1, g1);
326 h3 = _mm256_unpackhi_ps(g1, g1);
328 a0Val = _mm256_permute2f128_ps(h0, h1, 0x20);
329 a1Val = _mm256_permute2f128_ps(h0, h1, 0x31);
330 a2Val = _mm256_permute2f128_ps(h2, h3, 0x20);
331 a3Val = _mm256_permute2f128_ps(h2, h3, 0x31);
/* 32 floats = 16 complex taps. */
333 b0Val = _mm256_loadu_ps(bPtr);
334 b1Val = _mm256_loadu_ps(bPtr + 8);
335 b2Val = _mm256_loadu_ps(bPtr + 16);
336 b3Val = _mm256_loadu_ps(bPtr + 24);
338 dotProdVal0 = _mm256_fmadd_ps(a0Val, b0Val, dotProdVal0);
339 dotProdVal1 = _mm256_fmadd_ps(a1Val, b1Val, dotProdVal1);
340 dotProdVal2 = _mm256_fmadd_ps(a2Val, b2Val, dotProdVal2);
341 dotProdVal3 = _mm256_fmadd_ps(a3Val, b3Val, dotProdVal3);
/* Reduce to one vector, spill, and sum the four (re, im) pairs. */
347 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
348 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
349 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
353 _mm256_store_ps(dotProductVector,
356 returnValue +=
lv_cmake(dotProductVector[0], dotProductVector[1]);
357 returnValue +=
lv_cmake(dotProductVector[2], dotProductVector[3]);
358 returnValue +=
lv_cmake(dotProductVector[4], dotProductVector[5]);
359 returnValue +=
lv_cmake(dotProductVector[6], dotProductVector[7]);
/* Scalar tail; pointer increments are not visible in this excerpt. */
361 number = sixteenthPoints * 16;
362 for (; number < num_points; number++) {
363 returnValue +=
lv_cmake(aPtr[0] * bPtr[0], aPtr[0] * bPtr[1]);
368 *result = returnValue;
/* AVX2 kernel, unaligned loads: same data flow as the FMA variant but
 * with separate multiply and add (no FMA requirement).
 * NOTE(review): parameter lines, the m0/m1/f0/f1, returnValue and
 * dotProductVector declarations, pointer increments and closing braces
 * are missing from this excerpt; only comments were added. */
376static inline void volk_16i_32fc_dot_prod_32fc_u_avx2(
lv_32fc_t* result,
379 unsigned int num_points)
382 unsigned int number = 0;
383 const unsigned int sixteenthPoints = num_points / 16;
386 const short* aPtr = input;
/* Flat float view of the interleaved complex taps. */
387 const float* bPtr = (
float*)taps;
391 __m256 g0, g1, h0, h1, h2, h3;
392 __m256 a0Val, a1Val, a2Val, a3Val;
393 __m256 b0Val, b1Val, b2Val, b3Val;
394 __m256 c0Val, c1Val, c2Val, c3Val;
396 __m256 dotProdVal0 = _mm256_setzero_ps();
397 __m256 dotProdVal1 = _mm256_setzero_ps();
398 __m256 dotProdVal2 = _mm256_setzero_ps();
399 __m256 dotProdVal3 = _mm256_setzero_ps();
401 for (; number < sixteenthPoints; number++) {
/* Load 2x8 int16 samples and widen to float. */
403 m0 = _mm_loadu_si128((__m128i
const*)aPtr);
404 m1 = _mm_loadu_si128((__m128i
const*)(aPtr + 8));
406 f0 = _mm256_cvtepi16_epi32(m0);
407 g0 = _mm256_cvtepi32_ps(f0);
408 f1 = _mm256_cvtepi16_epi32(m1);
409 g1 = _mm256_cvtepi32_ps(f1);
/* Duplicate each sample (unpack with self) and fix cross-lane order
 * with permute2f128 so samples align with interleaved re/im taps. */
411 h0 = _mm256_unpacklo_ps(g0, g0);
412 h1 = _mm256_unpackhi_ps(g0, g0);
413 h2 = _mm256_unpacklo_ps(g1, g1);
414 h3 = _mm256_unpackhi_ps(g1, g1);
416 a0Val = _mm256_permute2f128_ps(h0, h1, 0x20);
417 a1Val = _mm256_permute2f128_ps(h0, h1, 0x31);
418 a2Val = _mm256_permute2f128_ps(h2, h3, 0x20);
419 a3Val = _mm256_permute2f128_ps(h2, h3, 0x31);
/* 32 floats = 16 complex taps. */
421 b0Val = _mm256_loadu_ps(bPtr);
422 b1Val = _mm256_loadu_ps(bPtr + 8);
423 b2Val = _mm256_loadu_ps(bPtr + 16);
424 b3Val = _mm256_loadu_ps(bPtr + 24);
426 c0Val = _mm256_mul_ps(a0Val, b0Val);
427 c1Val = _mm256_mul_ps(a1Val, b1Val);
428 c2Val = _mm256_mul_ps(a2Val, b2Val);
429 c3Val = _mm256_mul_ps(a3Val, b3Val);
431 dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
432 dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
433 dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2);
434 dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3);
/* Reduce to one vector, spill, and sum the four (re, im) pairs. */
440 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
441 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
442 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
446 _mm256_store_ps(dotProductVector,
449 returnValue +=
lv_cmake(dotProductVector[0], dotProductVector[1]);
450 returnValue +=
lv_cmake(dotProductVector[2], dotProductVector[3]);
451 returnValue +=
lv_cmake(dotProductVector[4], dotProductVector[5]);
452 returnValue +=
lv_cmake(dotProductVector[6], dotProductVector[7]);
/* Scalar tail; pointer increments are not visible in this excerpt. */
454 number = sixteenthPoints * 16;
455 for (; number < num_points; number++) {
456 returnValue +=
lv_cmake(aPtr[0] * bPtr[0], aPtr[0] * bPtr[1]);
461 *result = returnValue;
467#if LV_HAVE_SSE && LV_HAVE_MMX
/* SSE + MMX kernel, aligned tap loads (_mm_load_ps): identical data
 * flow to the unaligned variant -- callers must supply 16-byte-aligned
 * taps.
 * NOTE(review): parameter lines, the m0/m1, returnValue and
 * dotProductVector declarations, pointer increments and closing braces
 * are missing from this excerpt; only comments were added. */
470static inline void volk_16i_32fc_dot_prod_32fc_a_sse(
lv_32fc_t* result,
473 unsigned int num_points)
476 unsigned int number = 0;
477 const unsigned int eighthPoints = num_points / 8;
480 const short* aPtr = input;
/* Flat float view of the interleaved complex taps. */
481 const float* bPtr = (
float*)taps;
484 __m128 f0, f1, f2, f3;
485 __m128 a0Val, a1Val, a2Val, a3Val;
486 __m128 b0Val, b1Val, b2Val, b3Val;
487 __m128 c0Val, c1Val, c2Val, c3Val;
489 __m128 dotProdVal0 = _mm_setzero_ps();
490 __m128 dotProdVal1 = _mm_setzero_ps();
491 __m128 dotProdVal2 = _mm_setzero_ps();
492 __m128 dotProdVal3 = _mm_setzero_ps();
494 for (; number < eighthPoints; number++) {
/* Pack 4 shorts into MMX registers and convert to floats. */
496 m0 = _mm_set_pi16(*(aPtr + 3), *(aPtr + 2), *(aPtr + 1), *(aPtr + 0));
497 m1 = _mm_set_pi16(*(aPtr + 7), *(aPtr + 6), *(aPtr + 5), *(aPtr + 4));
498 f0 = _mm_cvtpi16_ps(m0);
499 f1 = _mm_cvtpi16_ps(m0);
500 f2 = _mm_cvtpi16_ps(m1);
501 f3 = _mm_cvtpi16_ps(m1);
/* unpack(f, f) duplicates each sample to match interleaved re/im. */
503 a0Val = _mm_unpacklo_ps(f0, f1);
504 a1Val = _mm_unpackhi_ps(f0, f1);
505 a2Val = _mm_unpacklo_ps(f2, f3);
506 a3Val = _mm_unpackhi_ps(f2, f3);
/* 16 floats = 8 complex taps (aligned loads). */
508 b0Val = _mm_load_ps(bPtr);
509 b1Val = _mm_load_ps(bPtr + 4);
510 b2Val = _mm_load_ps(bPtr + 8);
511 b3Val = _mm_load_ps(bPtr + 12);
513 c0Val = _mm_mul_ps(a0Val, b0Val);
514 c1Val = _mm_mul_ps(a1Val, b1Val);
515 c2Val = _mm_mul_ps(a2Val, b2Val);
516 c3Val = _mm_mul_ps(a3Val, b3Val);
518 dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
519 dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
520 dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
521 dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
/* Fold accumulators, spill, and sum the (re, im) pairs. */
529 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
530 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
531 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
535 _mm_store_ps(dotProductVector,
538 returnValue +=
lv_cmake(dotProductVector[0], dotProductVector[1]);
539 returnValue +=
lv_cmake(dotProductVector[2], dotProductVector[3]);
/* Scalar tail; pointer increments are not visible in this excerpt. */
541 number = eighthPoints * 8;
542 for (; number < num_points; number++) {
543 returnValue +=
lv_cmake(aPtr[0] * bPtr[0], aPtr[0] * bPtr[1]);
548 *result = returnValue;
/* AVX2 kernel, aligned loads (_mm_load_si128 / _mm256_load_ps):
 * 16 samples per iteration, mul + add (no FMA).  Same shuffle scheme
 * as the unaligned AVX2 variant.
 * NOTE(review): parameter lines, the m0/m1/f0/f1, returnValue and
 * dotProductVector declarations, pointer increments and closing braces
 * are missing from this excerpt; only comments were added. */
555static inline void volk_16i_32fc_dot_prod_32fc_a_avx2(
lv_32fc_t* result,
558 unsigned int num_points)
561 unsigned int number = 0;
562 const unsigned int sixteenthPoints = num_points / 16;
565 const short* aPtr = input;
/* Flat float view of the interleaved complex taps. */
566 const float* bPtr = (
float*)taps;
570 __m256 g0, g1, h0, h1, h2, h3;
571 __m256 a0Val, a1Val, a2Val, a3Val;
572 __m256 b0Val, b1Val, b2Val, b3Val;
573 __m256 c0Val, c1Val, c2Val, c3Val;
575 __m256 dotProdVal0 = _mm256_setzero_ps();
576 __m256 dotProdVal1 = _mm256_setzero_ps();
577 __m256 dotProdVal2 = _mm256_setzero_ps();
578 __m256 dotProdVal3 = _mm256_setzero_ps();
580 for (; number < sixteenthPoints; number++) {
/* Load 2x8 int16 samples (aligned) and widen to float. */
582 m0 = _mm_load_si128((__m128i
const*)aPtr);
583 m1 = _mm_load_si128((__m128i
const*)(aPtr + 8));
585 f0 = _mm256_cvtepi16_epi32(m0);
586 g0 = _mm256_cvtepi32_ps(f0);
587 f1 = _mm256_cvtepi16_epi32(m1);
588 g1 = _mm256_cvtepi32_ps(f1);
/* Duplicate samples in-lane, then permute2f128 restores sequential
 * order so each sample pairs with its interleaved re/im taps. */
590 h0 = _mm256_unpacklo_ps(g0, g0);
591 h1 = _mm256_unpackhi_ps(g0, g0);
592 h2 = _mm256_unpacklo_ps(g1, g1);
593 h3 = _mm256_unpackhi_ps(g1, g1);
595 a0Val = _mm256_permute2f128_ps(h0, h1, 0x20);
596 a1Val = _mm256_permute2f128_ps(h0, h1, 0x31);
597 a2Val = _mm256_permute2f128_ps(h2, h3, 0x20);
598 a3Val = _mm256_permute2f128_ps(h2, h3, 0x31);
/* 32 floats = 16 complex taps (aligned loads). */
600 b0Val = _mm256_load_ps(bPtr);
601 b1Val = _mm256_load_ps(bPtr + 8);
602 b2Val = _mm256_load_ps(bPtr + 16);
603 b3Val = _mm256_load_ps(bPtr + 24);
605 c0Val = _mm256_mul_ps(a0Val, b0Val);
606 c1Val = _mm256_mul_ps(a1Val, b1Val);
607 c2Val = _mm256_mul_ps(a2Val, b2Val);
608 c3Val = _mm256_mul_ps(a3Val, b3Val);
610 dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
611 dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
612 dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2);
613 dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3);
/* Fold accumulators, spill, and sum the four (re, im) pairs. */
619 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
620 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
621 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
625 _mm256_store_ps(dotProductVector,
628 returnValue +=
lv_cmake(dotProductVector[0], dotProductVector[1]);
629 returnValue +=
lv_cmake(dotProductVector[2], dotProductVector[3]);
630 returnValue +=
lv_cmake(dotProductVector[4], dotProductVector[5]);
631 returnValue +=
lv_cmake(dotProductVector[6], dotProductVector[7]);
/* Scalar tail; pointer increments are not visible in this excerpt. */
633 number = sixteenthPoints * 16;
634 for (; number < num_points; number++) {
635 returnValue +=
lv_cmake(aPtr[0] * bPtr[0], aPtr[0] * bPtr[1]);
640 *result = returnValue;
646#if LV_HAVE_AVX2 && LV_HAVE_FMA
/* AVX2 + FMA kernel, aligned loads: 16 samples per iteration with
 * fused multiply-add accumulation.
 * NOTE(review): parameter lines, the m0/m1/f0/f1, returnValue and
 * dotProductVector declarations, pointer increments and closing braces
 * are missing from this excerpt; only comments were added. */
648static inline void volk_16i_32fc_dot_prod_32fc_a_avx2_fma(
lv_32fc_t* result,
651 unsigned int num_points)
654 unsigned int number = 0;
655 const unsigned int sixteenthPoints = num_points / 16;
658 const short* aPtr = input;
/* Flat float view of the interleaved complex taps. */
659 const float* bPtr = (
float*)taps;
663 __m256 g0, g1, h0, h1, h2, h3;
664 __m256 a0Val, a1Val, a2Val, a3Val;
665 __m256 b0Val, b1Val, b2Val, b3Val;
667 __m256 dotProdVal0 = _mm256_setzero_ps();
668 __m256 dotProdVal1 = _mm256_setzero_ps();
669 __m256 dotProdVal2 = _mm256_setzero_ps();
670 __m256 dotProdVal3 = _mm256_setzero_ps();
672 for (; number < sixteenthPoints; number++) {
/* Load 2x8 int16 samples (aligned) and widen to float. */
674 m0 = _mm_load_si128((__m128i
const*)aPtr);
675 m1 = _mm_load_si128((__m128i
const*)(aPtr + 8));
677 f0 = _mm256_cvtepi16_epi32(m0);
678 g0 = _mm256_cvtepi32_ps(f0);
679 f1 = _mm256_cvtepi16_epi32(m1);
680 g1 = _mm256_cvtepi32_ps(f1);
/* Duplicate samples in-lane; permute2f128 fixes cross-lane order. */
682 h0 = _mm256_unpacklo_ps(g0, g0);
683 h1 = _mm256_unpackhi_ps(g0, g0);
684 h2 = _mm256_unpacklo_ps(g1, g1);
685 h3 = _mm256_unpackhi_ps(g1, g1);
687 a0Val = _mm256_permute2f128_ps(h0, h1, 0x20);
688 a1Val = _mm256_permute2f128_ps(h0, h1, 0x31);
689 a2Val = _mm256_permute2f128_ps(h2, h3, 0x20);
690 a3Val = _mm256_permute2f128_ps(h2, h3, 0x31);
/* 32 floats = 16 complex taps (aligned loads). */
692 b0Val = _mm256_load_ps(bPtr);
693 b1Val = _mm256_load_ps(bPtr + 8);
694 b2Val = _mm256_load_ps(bPtr + 16);
695 b3Val = _mm256_load_ps(bPtr + 24);
697 dotProdVal0 = _mm256_fmadd_ps(a0Val, b0Val, dotProdVal0);
698 dotProdVal1 = _mm256_fmadd_ps(a1Val, b1Val, dotProdVal1);
699 dotProdVal2 = _mm256_fmadd_ps(a2Val, b2Val, dotProdVal2);
700 dotProdVal3 = _mm256_fmadd_ps(a3Val, b3Val, dotProdVal3);
/* Fold accumulators, spill, and sum the four (re, im) pairs. */
706 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
707 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
708 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
712 _mm256_store_ps(dotProductVector,
715 returnValue +=
lv_cmake(dotProductVector[0], dotProductVector[1]);
716 returnValue +=
lv_cmake(dotProductVector[2], dotProductVector[3]);
717 returnValue +=
lv_cmake(dotProductVector[4], dotProductVector[5]);
718 returnValue +=
lv_cmake(dotProductVector[6], dotProductVector[7]);
/* Scalar tail; pointer increments are not visible in this excerpt. */
720 number = sixteenthPoints * 16;
721 for (; number < num_points; number++) {
722 returnValue +=
lv_cmake(aPtr[0] * bPtr[0], aPtr[0] * bPtr[1]);
727 *result = returnValue;
734#include <riscv_vector.h>
/* RISC-V Vector kernel: loads interleaved complex taps as 64-bit units
 * and splits real/imag halves with narrowing shifts (vnsrl by 0 / 32),
 * widens int16 input to float, and accumulates with tail-undisturbed
 * FMA so partial tails do not corrupt the running sums.
 * NOTE(review): parameter lines, the declaration of v (the widened
 * input vector), and the final m1-width reductions of vsumr/vsumi
 * (which presumably define the vr/vi used at the end) are missing from
 * this excerpt -- confirm against the full file. */
737static inline void volk_16i_32fc_dot_prod_32fc_rvv(
lv_32fc_t* result,
740 unsigned int num_points)
742 vfloat32m4_t vsumr = __riscv_vfmv_v_f_f32m4(0, __riscv_vsetvlmax_e32m4());
743 vfloat32m4_t vsumi = vsumr;
744 size_t n = num_points;
/* Strip-mining loop: vl elements per pass until n is exhausted. */
745 for (
size_t vl; n > 0; n -= vl, input += vl, taps += vl) {
746 vl = __riscv_vsetvl_e32m4(n);
/* Each 64-bit element holds one complex tap: low 32 = re, high 32 = im. */
747 vuint64m8_t vc = __riscv_vle64_v_u64m8((
const uint64_t*)taps, vl);
748 vfloat32m4_t vr = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vc, 0, vl));
749 vfloat32m4_t vi = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vc, 32, vl));
/* Widen int16 input to float (assigned to v; declaration not visible). */
751 __riscv_vfwcvt_f(__riscv_vle16_v_i16m2((
const int16_t*)input, vl), vl);
/* _tu: tail-undisturbed keeps lanes beyond vl intact across passes. */
752 vsumr = __riscv_vfmacc_tu(vsumr, vr, v, vl);
753 vsumi = __riscv_vfmacc_tu(vsumi, vi, v, vl);
755 size_t vl = __riscv_vsetvlmax_e32m1();
/* Unordered float reduction of the real and imaginary sums. */
758 vfloat32m1_t z = __riscv_vfmv_s_f_f32m1(0, vl);
759 *result =
lv_cmake(__riscv_vfmv_f(__riscv_vfredusum(vr, z, vl)),
760 __riscv_vfmv_f(__riscv_vfredusum(vi, z, vl)));
765#include <riscv_vector.h>
/* RISC-V Vector kernel (segment-load variant): vlseg2e32 de-interleaves
 * the complex taps directly into real/imag vectors instead of the
 * 64-bit load + narrowing-shift trick used by the plain rvv kernel.
 * NOTE(review): parameter lines, the declaration of v (the widened
 * input vector), and the final m1-width reductions of vsumr/vsumi
 * (which presumably define the vr/vi used at the end) are missing from
 * this excerpt -- confirm against the full file. */
768static inline void volk_16i_32fc_dot_prod_32fc_rvvseg(
lv_32fc_t* result,
771 unsigned int num_points)
773 vfloat32m4_t vsumr = __riscv_vfmv_v_f_f32m4(0, __riscv_vsetvlmax_e32m4());
774 vfloat32m4_t vsumi = vsumr;
775 size_t n = num_points;
/* Strip-mining loop: vl elements per pass until n is exhausted. */
776 for (
size_t vl; n > 0; n -= vl, input += vl, taps += vl) {
777 vl = __riscv_vsetvl_e32m4(n);
/* Segment load: field 0 = real parts, field 1 = imaginary parts. */
778 vfloat32m4x2_t vc = __riscv_vlseg2e32_v_f32m4x2((
const float*)taps, vl);
779 vfloat32m4_t vr = __riscv_vget_f32m4(vc, 0);
780 vfloat32m4_t vi = __riscv_vget_f32m4(vc, 1);
/* Widen int16 input to float (assigned to v; declaration not visible). */
782 __riscv_vfwcvt_f(__riscv_vle16_v_i16m2((
const int16_t*)input, vl), vl);
/* _tu: tail-undisturbed keeps lanes beyond vl intact across passes. */
783 vsumr = __riscv_vfmacc_tu(vsumr, vr, v, vl);
784 vsumi = __riscv_vfmacc_tu(vsumi, vi, v, vl);
786 size_t vl = __riscv_vsetvlmax_e32m1();
/* Unordered float reduction of the real and imaginary sums. */
789 vfloat32m1_t z = __riscv_vfmv_s_f_f32m1(0, vl);
790 *result =
lv_cmake(__riscv_vfmv_f(__riscv_vfredusum(vr, z, vl)),
791 __riscv_vfmv_f(__riscv_vfredusum(vi, z, vl)));