50#ifndef INCLUDED_volk_32fc_32f_dot_prod_32fc_a_H
51#define INCLUDED_volk_32fc_32f_dot_prod_32fc_a_H
/* Fragment of a scalar kernel: sums (complex input sample) x (real tap) over
 * num_points and writes the total to *result.
 * NOTE(review): the function header, the returnValue declaration, the pointer
 * increments and the closing braces are not visible in this excerpt. */
61 unsigned int num_points)
// View the complex input as a flat float array; aPtr[0] and aPtr[1] are the
// two components of one sample (presumably real, then imaginary -- confirm).
65 const float* aPtr = (
float*)input;
66 const float* bPtr = taps;
67 unsigned int number = 0;
// One complex sample and one real tap per iteration.
69 for (number = 0; number < num_points; number++) {
// Both components of the sample are scaled by the same tap bPtr[0].
70 returnValue +=
lv_cmake(aPtr[0] * bPtr[0], aPtr[1] * bPtr[0]);
// Hand the accumulated sum back to the caller.
75 *result = returnValue;
/* AVX-512F kernel using aligned loads: dot product of complex input with real
 * taps, 16 complex samples per vector iteration; the sum is written to *result.
 * NOTE(review): the middle parameters, several local declarations
 * (returnValue, returnTail, dotProductVector, a0Val..., idx2), the pointer
 * increments and the closing braces are not visible in this excerpt. */
84static inline void volk_32fc_32f_dot_prod_32fc_a_avx512f(
lv_32fc_t* result,
87 unsigned int num_points)
89 unsigned int number = 0;
// Number of full 16-sample iterations.
90 const unsigned int sixteenthPoints = num_points / 16;
// View the complex input as a flat float array (two floats per sample).
93 const float* aPtr = (
float*)input;
94 const float* bPtr = taps;
// Two independent accumulators, one per 16-float half of the input block.
100 __m512 dotProdVal0 = _mm512_setzero_ps();
101 __m512 dotProdVal1 = _mm512_setzero_ps();
// idx repeats each of lanes 0..7 twice so that one tap lines up with both
// floats of a complex sample; the second index vector (used as idx2 below)
// does the same for lanes 8..15.
104 const __m512i idx = _mm512_setr_epi32(0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7);
106 _mm512_setr_epi32(8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15);
108 for (; number < sixteenthPoints; number++) {
// Aligned loads of 32 floats = 16 complex samples (aPtr must be 64-byte aligned).
110 a0Val = _mm512_load_ps(aPtr);
111 a1Val = _mm512_load_ps(aPtr + 16);
// Aligned load of the 16 matching taps.
114 xVal = _mm512_load_ps(bPtr);
// Duplicate every tap into two adjacent lanes.
117 b0Val = _mm512_permutexvar_ps(idx, xVal);
118 b1Val = _mm512_permutexvar_ps(idx2, xVal);
// Fused multiply-add into the running sums.
120 dotProdVal0 = _mm512_fmadd_ps(a0Val, b0Val, dotProdVal0);
121 dotProdVal1 = _mm512_fmadd_ps(a1Val, b1Val, dotProdVal1);
// Merge the two accumulators into one vector of 16 partial sums.
127 dotProdVal0 = _mm512_add_ps(dotProdVal0, dotProdVal1);
// Spill to a float buffer (aligned store; the buffer declaration is not visible here).
130 _mm512_store_ps(dotProductVector, dotProdVal0);
// Fold the 16 floats, taken as 8 adjacent pairs, into the complex result.
132 for (
unsigned int i = 0; i < 16; i += 2) {
133 returnValue +=
lv_cmake(dotProductVector[i], dotProductVector[i + 1]);
// Samples already consumed by the vector loop.
136 number = sixteenthPoints * 16;
// The remaining num_points - number samples are passed to a helper whose name
// is not visible in this excerpt; its partial result comes back in returnTail.
139 &returnTail, input + number, bPtr, num_points - number);
140 returnValue += returnTail;
142 *result = returnValue;
147#if LV_HAVE_AVX2 && LV_HAVE_FMA
149#include <immintrin.h>
/* AVX2+FMA kernel using aligned loads: dot product of complex input with real
 * taps, 16 complex samples per vector iteration; the sum is written to *result.
 * NOTE(review): the middle parameters, the returnValue/returnTail/
 * dotProductVector declarations, the pointer increments and the closing
 * braces are not visible in this excerpt. */
151static inline void volk_32fc_32f_dot_prod_32fc_a_avx2_fma(
lv_32fc_t* result,
154 unsigned int num_points)
157 unsigned int number = 0;
// Number of full 16-sample iterations.
158 const unsigned int sixteenthPoints = num_points / 16;
// View the complex input as a flat float array (two floats per sample).
161 const float* aPtr = (
float*)input;
162 const float* bPtr = taps;
164 __m256 a0Val, a1Val, a2Val, a3Val;
165 __m256 b0Val, b1Val, b2Val, b3Val;
166 __m256 x0Val, x1Val, x0loVal, x0hiVal, x1loVal, x1hiVal;
// Four independent accumulators, one per 8-float slice of the input block.
168 __m256 dotProdVal0 = _mm256_setzero_ps();
169 __m256 dotProdVal1 = _mm256_setzero_ps();
170 __m256 dotProdVal2 = _mm256_setzero_ps();
171 __m256 dotProdVal3 = _mm256_setzero_ps();
173 for (; number < sixteenthPoints; number++) {
// Aligned loads of 32 floats = 16 complex samples (aPtr must be 32-byte aligned).
175 a0Val = _mm256_load_ps(aPtr);
176 a1Val = _mm256_load_ps(aPtr + 8);
177 a2Val = _mm256_load_ps(aPtr + 16);
178 a3Val = _mm256_load_ps(aPtr + 24);
// Aligned loads of the 16 matching taps.
180 x0Val = _mm256_load_ps(bPtr);
181 x1Val = _mm256_load_ps(bPtr + 8);
// Unpacking a register with itself duplicates each tap, but only within each
// 128-bit lane, so the duplicated taps come out in lane-interleaved order.
182 x0loVal = _mm256_unpacklo_ps(x0Val, x0Val);
183 x0hiVal = _mm256_unpackhi_ps(x0Val, x0Val);
184 x1loVal = _mm256_unpacklo_ps(x1Val, x1Val);
185 x1hiVal = _mm256_unpackhi_ps(x1Val, x1Val);
// 0x20 pairs the low 128-bit lanes and 0x31 the high lanes, restoring tap
// order: t0,t0,t1,t1,t2,t2,t3,t3 and t4,t4,...,t7,t7.
188 b0Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x20);
189 b1Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x31);
190 b2Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x20);
191 b3Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x31);
// Fused multiply-add into the running sums.
193 dotProdVal0 = _mm256_fmadd_ps(a0Val, b0Val, dotProdVal0);
194 dotProdVal1 = _mm256_fmadd_ps(a1Val, b1Val, dotProdVal1);
195 dotProdVal2 = _mm256_fmadd_ps(a2Val, b2Val, dotProdVal2);
196 dotProdVal3 = _mm256_fmadd_ps(a3Val, b3Val, dotProdVal3);
// Merge the four accumulators into one vector of 8 partial sums.
202 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
203 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
204 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
// Spill to a float buffer (aligned store; the rest of this call and the
// buffer declaration are not visible here), then fold the 4 adjacent pairs.
208 _mm256_store_ps(dotProductVector,
211 returnValue +=
lv_cmake(dotProductVector[0], dotProductVector[1]);
212 returnValue +=
lv_cmake(dotProductVector[2], dotProductVector[3]);
213 returnValue +=
lv_cmake(dotProductVector[4], dotProductVector[5]);
214 returnValue +=
lv_cmake(dotProductVector[6], dotProductVector[7]);
// Samples already consumed by the vector loop.
216 number = sixteenthPoints * 16;
// The remaining num_points - number samples are passed to a helper whose name
// is not visible in this excerpt; its partial result comes back in returnTail.
219 &returnTail, input + number, bPtr, num_points - number);
220 returnValue += returnTail;
222 *result = returnValue;
229#include <immintrin.h>
/* Fragment of a 256-bit kernel using aligned loads and separate multiply/add:
 * dot product of complex input with real taps, 16 complex samples per vector
 * iteration, followed by a scalar tail loop.
 * NOTE(review): the function header, the returnValue/dotProductVector
 * declarations, the pointer increments and the closing braces are not
 * visible in this excerpt. */
234 unsigned int num_points)
237 unsigned int number = 0;
// Number of full 16-sample iterations.
238 const unsigned int sixteenthPoints = num_points / 16;
// View the complex input as a flat float array (two floats per sample).
241 const float* aPtr = (
float*)input;
242 const float* bPtr = taps;
244 __m256 a0Val, a1Val, a2Val, a3Val;
245 __m256 b0Val, b1Val, b2Val, b3Val;
246 __m256 x0Val, x1Val, x0loVal, x0hiVal, x1loVal, x1hiVal;
247 __m256 c0Val, c1Val, c2Val, c3Val;
// Four independent accumulators, one per 8-float slice of the input block.
249 __m256 dotProdVal0 = _mm256_setzero_ps();
250 __m256 dotProdVal1 = _mm256_setzero_ps();
251 __m256 dotProdVal2 = _mm256_setzero_ps();
252 __m256 dotProdVal3 = _mm256_setzero_ps();
254 for (; number < sixteenthPoints; number++) {
// Aligned loads of 32 floats = 16 complex samples (aPtr must be 32-byte aligned).
256 a0Val = _mm256_load_ps(aPtr);
257 a1Val = _mm256_load_ps(aPtr + 8);
258 a2Val = _mm256_load_ps(aPtr + 16);
259 a3Val = _mm256_load_ps(aPtr + 24);
// Aligned loads of the 16 matching taps.
261 x0Val = _mm256_load_ps(bPtr);
262 x1Val = _mm256_load_ps(bPtr + 8);
// Duplicate each tap within its 128-bit lane.
263 x0loVal = _mm256_unpacklo_ps(x0Val, x0Val);
264 x0hiVal = _mm256_unpackhi_ps(x0Val, x0Val);
265 x1loVal = _mm256_unpacklo_ps(x1Val, x1Val);
266 x1hiVal = _mm256_unpackhi_ps(x1Val, x1Val);
// 0x20 pairs the low 128-bit lanes and 0x31 the high lanes, restoring tap order.
269 b0Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x20);
270 b1Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x31);
271 b2Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x20);
272 b3Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x31);
// Element-wise products of input floats and duplicated taps.
274 c0Val = _mm256_mul_ps(a0Val, b0Val);
275 c1Val = _mm256_mul_ps(a1Val, b1Val);
276 c2Val = _mm256_mul_ps(a2Val, b2Val);
277 c3Val = _mm256_mul_ps(a3Val, b3Val);
// Add the products to the running sums.
279 dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
280 dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
281 dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2);
282 dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3);
// Merge the four accumulators into one vector of 8 partial sums.
288 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
289 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
290 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
// Spill to a float buffer (aligned store; the rest of this call and the
// buffer declaration are not visible here), then fold the 4 adjacent pairs.
294 _mm256_store_ps(dotProductVector,
297 returnValue +=
lv_cmake(dotProductVector[0], dotProductVector[1]);
298 returnValue +=
lv_cmake(dotProductVector[2], dotProductVector[3]);
299 returnValue +=
lv_cmake(dotProductVector[4], dotProductVector[5]);
300 returnValue +=
lv_cmake(dotProductVector[6], dotProductVector[7]);
// Scalar tail for the samples left over after the vector loop.
302 number = sixteenthPoints * 16;
303 for (; number < num_points; number++) {
304 returnValue +=
lv_cmake(aPtr[0] * bPtr[0], aPtr[1] * bPtr[0]);
309 *result = returnValue;
/* Fragment of a 128-bit SSE kernel using aligned loads: dot product of complex
 * input with real taps, 8 complex samples per vector iteration, followed by a
 * scalar tail loop.
 * NOTE(review): the function header, the returnValue/dotProductVector
 * declarations, the pointer increments and the closing braces are not
 * visible in this excerpt. */
321 unsigned int num_points)
324 unsigned int number = 0;
// Number of full 8-sample iterations.
325 const unsigned int eighthPoints = num_points / 8;
// View the complex input as a flat float array (two floats per sample).
328 const float* aPtr = (
float*)input;
329 const float* bPtr = taps;
331 __m128 a0Val, a1Val, a2Val, a3Val;
332 __m128 b0Val, b1Val, b2Val, b3Val;
333 __m128 x0Val, x1Val, x2Val, x3Val;
334 __m128 c0Val, c1Val, c2Val, c3Val;
// Four independent accumulators, one per 4-float slice of the input block.
336 __m128 dotProdVal0 = _mm_setzero_ps();
337 __m128 dotProdVal1 = _mm_setzero_ps();
338 __m128 dotProdVal2 = _mm_setzero_ps();
339 __m128 dotProdVal3 = _mm_setzero_ps();
341 for (; number < eighthPoints; number++) {
// Aligned loads of 16 floats = 8 complex samples (aPtr must be 16-byte aligned).
343 a0Val = _mm_load_ps(aPtr);
344 a1Val = _mm_load_ps(aPtr + 4);
345 a2Val = _mm_load_ps(aPtr + 8);
346 a3Val = _mm_load_ps(aPtr + 12);
// Each group of four taps is loaded twice (x0 == x1, x2 == x3) so that the
// unpack below interleaves a register with its own copy.
348 x0Val = _mm_load_ps(bPtr);
349 x1Val = _mm_load_ps(bPtr);
350 x2Val = _mm_load_ps(bPtr + 4);
351 x3Val = _mm_load_ps(bPtr + 4);
// Duplicate each tap into two adjacent lanes: t0,t0,t1,t1 / t2,t2,t3,t3 / ...
352 b0Val = _mm_unpacklo_ps(x0Val, x1Val);
353 b1Val = _mm_unpackhi_ps(x0Val, x1Val);
354 b2Val = _mm_unpacklo_ps(x2Val, x3Val);
355 b3Val = _mm_unpackhi_ps(x2Val, x3Val);
// Element-wise products of input floats and duplicated taps.
357 c0Val = _mm_mul_ps(a0Val, b0Val);
358 c1Val = _mm_mul_ps(a1Val, b1Val);
359 c2Val = _mm_mul_ps(a2Val, b2Val);
360 c3Val = _mm_mul_ps(a3Val, b3Val);
// Add the products to the running sums.
362 dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
363 dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
364 dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
365 dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
// Merge the four accumulators into one vector of 4 partial sums.
371 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
372 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
373 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
// Spill to a float buffer (aligned store; the rest of this call and the
// buffer declaration are not visible here), then fold the 2 adjacent pairs.
377 _mm_store_ps(dotProductVector,
380 returnValue +=
lv_cmake(dotProductVector[0], dotProductVector[1]);
381 returnValue +=
lv_cmake(dotProductVector[2], dotProductVector[3]);
// Scalar tail for the samples left over after the vector loop.
383 number = eighthPoints * 8;
384 for (; number < num_points; number++) {
385 returnValue +=
lv_cmake(aPtr[0] * bPtr[0], aPtr[1] * bPtr[0]);
390 *result = returnValue;
395#ifdef LV_HAVE_AVX512F
397#include <immintrin.h>
/* AVX-512F kernel using unaligned loads: dot product of complex input with
 * real taps, 16 complex samples per vector iteration; the sum is written to
 * *result.
 * NOTE(review): the middle parameters, several local declarations
 * (returnValue, returnTail, dotProductVector, a0Val..., idx2), the pointer
 * increments and the closing braces are not visible in this excerpt. */
399static inline void volk_32fc_32f_dot_prod_32fc_u_avx512f(
lv_32fc_t* result,
402 unsigned int num_points)
404 unsigned int number = 0;
// Number of full 16-sample iterations.
405 const unsigned int sixteenthPoints = num_points / 16;
// View the complex input as a flat float array (two floats per sample).
408 const float* aPtr = (
float*)input;
409 const float* bPtr = taps;
// Two independent accumulators, one per 16-float half of the input block.
415 __m512 dotProdVal0 = _mm512_setzero_ps();
416 __m512 dotProdVal1 = _mm512_setzero_ps();
// idx repeats each of lanes 0..7 twice so that one tap lines up with both
// floats of a complex sample; the second index vector (used as idx2 below)
// does the same for lanes 8..15.
419 const __m512i idx = _mm512_setr_epi32(0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7);
421 _mm512_setr_epi32(8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15);
423 for (; number < sixteenthPoints; number++) {
// Unaligned loads of 32 floats = 16 complex samples.
425 a0Val = _mm512_loadu_ps(aPtr);
426 a1Val = _mm512_loadu_ps(aPtr + 16);
// Unaligned load of the 16 matching taps.
429 xVal = _mm512_loadu_ps(bPtr);
// Duplicate every tap into two adjacent lanes.
432 b0Val = _mm512_permutexvar_ps(idx, xVal);
433 b1Val = _mm512_permutexvar_ps(idx2, xVal);
// Fused multiply-add into the running sums.
435 dotProdVal0 = _mm512_fmadd_ps(a0Val, b0Val, dotProdVal0);
436 dotProdVal1 = _mm512_fmadd_ps(a1Val, b1Val, dotProdVal1);
// Merge the two accumulators into one vector of 16 partial sums.
442 dotProdVal0 = _mm512_add_ps(dotProdVal0, dotProdVal1);
// The spill uses the aligned store even in this unaligned-input variant, so
// dotProductVector itself must be 64-byte aligned (its declaration is not
// visible here -- confirm).
445 _mm512_store_ps(dotProductVector, dotProdVal0);
// Fold the 16 floats, taken as 8 adjacent pairs, into the complex result.
447 for (
unsigned int i = 0; i < 16; i += 2) {
448 returnValue +=
lv_cmake(dotProductVector[i], dotProductVector[i + 1]);
// Samples already consumed by the vector loop.
451 number = sixteenthPoints * 16;
// The remaining num_points - number samples are passed to a helper whose name
// is not visible in this excerpt; its partial result comes back in returnTail.
454 &returnTail, input + number, bPtr, num_points - number);
455 returnValue += returnTail;
457 *result = returnValue;
462#if LV_HAVE_AVX2 && LV_HAVE_FMA
464#include <immintrin.h>
/* AVX2+FMA kernel using unaligned loads: dot product of complex input with
 * real taps, 16 complex samples per vector iteration, followed by a scalar
 * tail loop; the sum is written to *result.
 * NOTE(review): the middle parameters, the returnValue/dotProductVector
 * declarations, the pointer increments and the closing braces are not
 * visible in this excerpt. */
466static inline void volk_32fc_32f_dot_prod_32fc_u_avx2_fma(
lv_32fc_t* result,
469 unsigned int num_points)
472 unsigned int number = 0;
// Number of full 16-sample iterations.
473 const unsigned int sixteenthPoints = num_points / 16;
// View the complex input as a flat float array (two floats per sample).
476 const float* aPtr = (
float*)input;
477 const float* bPtr = taps;
479 __m256 a0Val, a1Val, a2Val, a3Val;
480 __m256 b0Val, b1Val, b2Val, b3Val;
481 __m256 x0Val, x1Val, x0loVal, x0hiVal, x1loVal, x1hiVal;
// Four independent accumulators, one per 8-float slice of the input block.
483 __m256 dotProdVal0 = _mm256_setzero_ps();
484 __m256 dotProdVal1 = _mm256_setzero_ps();
485 __m256 dotProdVal2 = _mm256_setzero_ps();
486 __m256 dotProdVal3 = _mm256_setzero_ps();
488 for (; number < sixteenthPoints; number++) {
// Unaligned loads of 32 floats = 16 complex samples.
490 a0Val = _mm256_loadu_ps(aPtr);
491 a1Val = _mm256_loadu_ps(aPtr + 8);
492 a2Val = _mm256_loadu_ps(aPtr + 16);
493 a3Val = _mm256_loadu_ps(aPtr + 24);
// Unaligned loads of the 16 matching taps.
495 x0Val = _mm256_loadu_ps(bPtr);
496 x1Val = _mm256_loadu_ps(bPtr + 8);
// Duplicate each tap within its 128-bit lane.
497 x0loVal = _mm256_unpacklo_ps(x0Val, x0Val);
498 x0hiVal = _mm256_unpackhi_ps(x0Val, x0Val);
499 x1loVal = _mm256_unpacklo_ps(x1Val, x1Val);
500 x1hiVal = _mm256_unpackhi_ps(x1Val, x1Val);
// 0x20 pairs the low 128-bit lanes and 0x31 the high lanes, restoring tap order.
503 b0Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x20);
504 b1Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x31);
505 b2Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x20);
506 b3Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x31);
// Fused multiply-add into the running sums.
508 dotProdVal0 = _mm256_fmadd_ps(a0Val, b0Val, dotProdVal0);
509 dotProdVal1 = _mm256_fmadd_ps(a1Val, b1Val, dotProdVal1);
510 dotProdVal2 = _mm256_fmadd_ps(a2Val, b2Val, dotProdVal2);
511 dotProdVal3 = _mm256_fmadd_ps(a3Val, b3Val, dotProdVal3);
// Merge the four accumulators into one vector of 8 partial sums.
517 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
518 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
519 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
// The spill uses the aligned store even in this unaligned-input variant, so
// dotProductVector must be 32-byte aligned (the rest of this call and the
// buffer declaration are not visible here -- confirm).
523 _mm256_store_ps(dotProductVector,
// Fold the 4 adjacent pairs into the complex result.
526 returnValue +=
lv_cmake(dotProductVector[0], dotProductVector[1]);
527 returnValue +=
lv_cmake(dotProductVector[2], dotProductVector[3]);
528 returnValue +=
lv_cmake(dotProductVector[4], dotProductVector[5]);
529 returnValue +=
lv_cmake(dotProductVector[6], dotProductVector[7]);
// Scalar tail for the samples left over after the vector loop.
531 number = sixteenthPoints * 16;
532 for (; number < num_points; number++) {
533 returnValue +=
lv_cmake(aPtr[0] * bPtr[0], aPtr[1] * bPtr[0]);
538 *result = returnValue;
545#include <immintrin.h>
/* Fragment of a 256-bit kernel using unaligned loads and separate
 * multiply/add: dot product of complex input with real taps, 16 complex
 * samples per vector iteration, followed by a scalar tail loop.
 * NOTE(review): the function header, the returnValue/dotProductVector
 * declarations, the pointer increments and the closing braces are not
 * visible in this excerpt. */
550 unsigned int num_points)
553 unsigned int number = 0;
// Number of full 16-sample iterations.
554 const unsigned int sixteenthPoints = num_points / 16;
// View the complex input as a flat float array (two floats per sample).
557 const float* aPtr = (
float*)input;
558 const float* bPtr = taps;
560 __m256 a0Val, a1Val, a2Val, a3Val;
561 __m256 b0Val, b1Val, b2Val, b3Val;
562 __m256 x0Val, x1Val, x0loVal, x0hiVal, x1loVal, x1hiVal;
563 __m256 c0Val, c1Val, c2Val, c3Val;
// Four independent accumulators, one per 8-float slice of the input block.
565 __m256 dotProdVal0 = _mm256_setzero_ps();
566 __m256 dotProdVal1 = _mm256_setzero_ps();
567 __m256 dotProdVal2 = _mm256_setzero_ps();
568 __m256 dotProdVal3 = _mm256_setzero_ps();
570 for (; number < sixteenthPoints; number++) {
// Unaligned loads of 32 floats = 16 complex samples.
572 a0Val = _mm256_loadu_ps(aPtr);
573 a1Val = _mm256_loadu_ps(aPtr + 8);
574 a2Val = _mm256_loadu_ps(aPtr + 16);
575 a3Val = _mm256_loadu_ps(aPtr + 24);
// Unaligned loads of the 16 matching taps.
577 x0Val = _mm256_loadu_ps(bPtr);
578 x1Val = _mm256_loadu_ps(bPtr + 8);
// Duplicate each tap within its 128-bit lane.
579 x0loVal = _mm256_unpacklo_ps(x0Val, x0Val);
580 x0hiVal = _mm256_unpackhi_ps(x0Val, x0Val);
581 x1loVal = _mm256_unpacklo_ps(x1Val, x1Val);
582 x1hiVal = _mm256_unpackhi_ps(x1Val, x1Val);
// 0x20 pairs the low 128-bit lanes and 0x31 the high lanes, restoring tap order.
585 b0Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x20);
586 b1Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x31);
587 b2Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x20);
588 b3Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x31);
// Element-wise products of input floats and duplicated taps.
590 c0Val = _mm256_mul_ps(a0Val, b0Val);
591 c1Val = _mm256_mul_ps(a1Val, b1Val);
592 c2Val = _mm256_mul_ps(a2Val, b2Val);
593 c3Val = _mm256_mul_ps(a3Val, b3Val);
// Add the products to the running sums.
595 dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
596 dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
597 dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2);
598 dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3);
// Merge the four accumulators into one vector of 8 partial sums.
604 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
605 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
606 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
// The spill uses the aligned store even though the input loads are unaligned,
// so dotProductVector must be 32-byte aligned (the rest of this call and the
// buffer declaration are not visible here -- confirm).
610 _mm256_store_ps(dotProductVector,
// Fold the 4 adjacent pairs into the complex result.
613 returnValue +=
lv_cmake(dotProductVector[0], dotProductVector[1]);
614 returnValue +=
lv_cmake(dotProductVector[2], dotProductVector[3]);
615 returnValue +=
lv_cmake(dotProductVector[4], dotProductVector[5]);
616 returnValue +=
lv_cmake(dotProductVector[6], dotProductVector[7]);
// Scalar tail for the samples left over after the vector loop.
618 number = sixteenthPoints * 16;
619 for (; number < num_points; number++) {
620 returnValue +=
lv_cmake(aPtr[0] * bPtr[0], aPtr[1] * bPtr[0]);
625 *result = returnValue;
/* Fragment of a NEON kernel that handles 8 complex samples per iteration
 * using two sets of accumulators, followed by a scalar tail loop.
 * NOTE(review): the function header, the number/returnValue declarations,
 * the pointer increments, the start of the statement that consumes the
 * horizontal sums and the closing braces are not visible in this excerpt. */
635 const float* __restrict taps,
636 unsigned int num_points)
// Despite its name, this is the count of 8-sample iterations (num_points / 8).
640 const unsigned int quarterPoints = num_points / 8;
// View the complex input as a flat float array (two floats per sample).
643 const float* inputPtr = (
float*)input;
644 const float* tapsPtr = taps;
// Zero source for the accumulators and scratch space for the final sums.
645 float zero[4] = { 0.0f, 0.0f, 0.0f, 0.0f };
646 float accVector_real[4];
647 float accVector_imag[4];
649 float32x4x2_t inputVector0, inputVector1;
650 float32x4_t tapsVector0, tapsVector1;
651 float32x4_t tmp_real0, tmp_imag0;
652 float32x4_t tmp_real1, tmp_imag1;
653 float32x4_t real_accumulator0, imag_accumulator0;
654 float32x4_t real_accumulator1, imag_accumulator1;
// Clear all four accumulators.
658 real_accumulator0 = vld1q_f32(zero);
659 imag_accumulator0 = vld1q_f32(zero);
660 real_accumulator1 = vld1q_f32(zero);
661 imag_accumulator1 = vld1q_f32(zero);
663 for (number = 0; number < quarterPoints; number++) {
// Load 8 taps.
665 tapsVector0 = vld1q_f32(tapsPtr);
666 tapsVector1 = vld1q_f32(tapsPtr + 4);
// De-interleaving loads: val[0] gets the even-indexed floats and val[1] the
// odd-indexed floats of 4 complex samples each.
669 inputVector0 = vld2q_f32(inputPtr);
670 inputVector1 = vld2q_f32(inputPtr + 8);
// Scale each component vector by the taps.
673 tmp_real0 = vmulq_f32(tapsVector0, inputVector0.val[0]);
674 tmp_imag0 = vmulq_f32(tapsVector0, inputVector0.val[1]);
676 tmp_real1 = vmulq_f32(tapsVector1, inputVector1.val[0]);
677 tmp_imag1 = vmulq_f32(tapsVector1, inputVector1.val[1]);
// Add the products to the running sums.
679 real_accumulator0 = vaddq_f32(real_accumulator0, tmp_real0);
680 imag_accumulator0 = vaddq_f32(imag_accumulator0, tmp_imag0);
682 real_accumulator1 = vaddq_f32(real_accumulator1, tmp_real1);
683 imag_accumulator1 = vaddq_f32(imag_accumulator1, tmp_imag1);
// Merge the two accumulator sets.
689 real_accumulator0 = vaddq_f32(real_accumulator0, real_accumulator1);
690 imag_accumulator0 = vaddq_f32(imag_accumulator0, imag_accumulator1);
// Spill to scalar arrays for the horizontal sums below.
693 vst1q_f32(accVector_real, real_accumulator0);
694 vst1q_f32(accVector_imag, imag_accumulator0);
// Horizontal sums of the two spilled vectors; the statement they belong to
// starts on a line not visible here.
696 accVector_real[0] + accVector_real[1] + accVector_real[2] + accVector_real[3],
697 accVector_imag[0] + accVector_imag[1] + accVector_imag[2] + accVector_imag[3]);
// Scalar tail for the samples left over after the vector loop.
700 for (number = quarterPoints * 8; number < num_points; number++) {
701 returnValue +=
lv_cmake(inputPtr[0] * tapsPtr[0], inputPtr[1] * tapsPtr[0]);
706 *result = returnValue;
/* Fragment of a NEON kernel that handles 4 complex samples per iteration,
 * followed by a scalar tail loop.
 * NOTE(review): the function header, the number/returnValue declarations,
 * the pointer increments, the start of the statement that consumes the
 * horizontal sums and the closing braces are not visible in this excerpt. */
716 const float* __restrict taps,
717 unsigned int num_points)
// Number of full 4-sample iterations.
721 const unsigned int quarterPoints = num_points / 4;
// View the complex input as a flat float array (two floats per sample).
724 const float* inputPtr = (
float*)input;
725 const float* tapsPtr = taps;
// Zero source for the accumulators and scratch space for the final sums.
726 float zero[4] = { 0.0f, 0.0f, 0.0f, 0.0f };
727 float accVector_real[4];
728 float accVector_imag[4];
730 float32x4x2_t inputVector;
731 float32x4_t tapsVector;
732 float32x4_t tmp_real, tmp_imag;
733 float32x4_t real_accumulator, imag_accumulator;
// Clear both accumulators.
738 real_accumulator = vld1q_f32(zero);
739 imag_accumulator = vld1q_f32(zero);
741 for (number = 0; number < quarterPoints; number++) {
// Load 4 taps.
744 tapsVector = vld1q_f32(tapsPtr);
// De-interleaving load: val[0] gets the even-indexed floats and val[1] the
// odd-indexed floats of 4 complex samples.
747 inputVector = vld2q_f32(inputPtr);
// Scale each component vector by the taps.
749 tmp_real = vmulq_f32(tapsVector, inputVector.val[0]);
750 tmp_imag = vmulq_f32(tapsVector, inputVector.val[1]);
// Add the products to the running sums.
752 real_accumulator = vaddq_f32(real_accumulator, tmp_real);
753 imag_accumulator = vaddq_f32(imag_accumulator, tmp_imag);
// Spill to scalar arrays for the horizontal sums below.
761 vst1q_f32(accVector_real, real_accumulator);
762 vst1q_f32(accVector_imag, imag_accumulator);
// Horizontal sums of the two spilled vectors; the statement they belong to
// starts on a line not visible here.
764 accVector_real[0] + accVector_real[1] + accVector_real[2] + accVector_real[3],
765 accVector_imag[0] + accVector_imag[1] + accVector_imag[2] + accVector_imag[3]);
// Scalar tail for the samples left over after the vector loop.
768 for (number = quarterPoints * 4; number < num_points; number++) {
769 returnValue +=
lv_cmake(inputPtr[0] * tapsPtr[0], inputPtr[1] * tapsPtr[0]);
774 *result = returnValue;
/* NEON kernel using fused multiply-add and a vector-wide horizontal add:
 * dot product of complex input with real taps, 8 complex samples per
 * iteration, followed by a scalar tail loop; the sum is written to *result.
 * NOTE(review): the middle parameters, the returnValue declaration (which
 * presumably consumes real_sum/imag_sum), the pointer increments and the
 * closing braces are not visible in this excerpt. */
782static inline void volk_32fc_32f_dot_prod_32fc_neonv8(
lv_32fc_t* result,
785 unsigned int num_points)
// Number of full 8-sample iterations.
787 const unsigned int eighthPoints = num_points / 8;
// View the complex input as a flat float array (two floats per sample).
788 const float* inputPtr = (
const float*)input;
789 const float* tapsPtr = taps;
// Two sets of zeroed accumulators, one per group of 4 samples.
792 float32x4_t real_acc0 = vdupq_n_f32(0);
793 float32x4_t imag_acc0 = vdupq_n_f32(0);
794 float32x4_t real_acc1 = vdupq_n_f32(0);
795 float32x4_t imag_acc1 = vdupq_n_f32(0);
797 for (
unsigned int number = 0; number < eighthPoints; number++) {
// De-interleaving loads: val[0] gets the even-indexed floats and val[1] the
// odd-indexed floats of 4 complex samples each.
799 float32x4x2_t cplx0 = vld2q_f32(inputPtr);
800 float32x4x2_t cplx1 = vld2q_f32(inputPtr + 8);
// Load the 8 matching taps.
803 float32x4_t taps0 = vld1q_f32(tapsPtr);
804 float32x4_t taps1 = vld1q_f32(tapsPtr + 4);
// acc += taps * component, as a fused multiply-add.
809 real_acc0 = vfmaq_f32(real_acc0, taps0, cplx0.val[0]);
810 imag_acc0 = vfmaq_f32(imag_acc0, taps0, cplx0.val[1]);
811 real_acc1 = vfmaq_f32(real_acc1, taps1, cplx1.val[0]);
812 imag_acc1 = vfmaq_f32(imag_acc1, taps1, cplx1.val[1]);
// Merge the two accumulator sets.
819 real_acc0 = vaddq_f32(real_acc0, real_acc1);
820 imag_acc0 = vaddq_f32(imag_acc0, imag_acc1);
// Add the four lanes of each accumulator into a scalar.
823 float real_sum = vaddvq_f32(real_acc0);
824 float imag_sum = vaddvq_f32(imag_acc0);
// Scalar tail for the samples left over after the vector loop.
829 for (
unsigned int number = eighthPoints * 8; number < num_points; number++) {
830 returnValue +=
lv_cmake(inputPtr[0] * tapsPtr[0], inputPtr[1] * tapsPtr[0]);
835 *result = returnValue;
/* Prototype only: this kernel variant is declared extern and has no definition
 * in this excerpt. NOTE(review): the parameters between result and
 * num_points are missing from this view. */
840extern void volk_32fc_32f_dot_prod_32fc_a_neonasm(
lv_32fc_t* result,
843 unsigned int num_points);
/* Prototype only: this kernel variant is declared extern and has no definition
 * in this excerpt. NOTE(review): the parameters between result and
 * num_points are missing from this view. */
847extern void volk_32fc_32f_dot_prod_32fc_a_neonasmvmla(
lv_32fc_t* result,
850 unsigned int num_points);
/* Prototype only: this kernel variant is declared extern and has no definition
 * in this excerpt. NOTE(review): the parameters between result and
 * num_points are missing from this view. */
854extern void volk_32fc_32f_dot_prod_32fc_a_neonpipeline(
lv_32fc_t* result,
857 unsigned int num_points);
/* Fragment of a 128-bit SSE kernel using unaligned loads: dot product of
 * complex input with real taps, 8 complex samples per vector iteration,
 * followed by a scalar tail loop.
 * NOTE(review): the function header, the returnValue/dotProductVector
 * declarations, the pointer increments and the closing braces are not
 * visible in this excerpt. */
865 unsigned int num_points)
868 unsigned int number = 0;
// Number of full 8-sample iterations.
869 const unsigned int eighthPoints = num_points / 8;
// View the complex input as a flat float array (two floats per sample).
872 const float* aPtr = (
float*)input;
873 const float* bPtr = taps;
875 __m128 a0Val, a1Val, a2Val, a3Val;
876 __m128 b0Val, b1Val, b2Val, b3Val;
877 __m128 x0Val, x1Val, x2Val, x3Val;
878 __m128 c0Val, c1Val, c2Val, c3Val;
// Four independent accumulators, one per 4-float slice of the input block.
880 __m128 dotProdVal0 = _mm_setzero_ps();
881 __m128 dotProdVal1 = _mm_setzero_ps();
882 __m128 dotProdVal2 = _mm_setzero_ps();
883 __m128 dotProdVal3 = _mm_setzero_ps();
885 for (; number < eighthPoints; number++) {
// Unaligned loads of 16 floats = 8 complex samples.
887 a0Val = _mm_loadu_ps(aPtr);
888 a1Val = _mm_loadu_ps(aPtr + 4);
889 a2Val = _mm_loadu_ps(aPtr + 8);
890 a3Val = _mm_loadu_ps(aPtr + 12);
// Each group of four taps is loaded twice (x0 == x1, x2 == x3) so that the
// unpack below interleaves a register with its own copy.
892 x0Val = _mm_loadu_ps(bPtr);
893 x1Val = _mm_loadu_ps(bPtr);
894 x2Val = _mm_loadu_ps(bPtr + 4);
895 x3Val = _mm_loadu_ps(bPtr + 4);
// Duplicate each tap into two adjacent lanes: t0,t0,t1,t1 / t2,t2,t3,t3 / ...
896 b0Val = _mm_unpacklo_ps(x0Val, x1Val);
897 b1Val = _mm_unpackhi_ps(x0Val, x1Val);
898 b2Val = _mm_unpacklo_ps(x2Val, x3Val);
899 b3Val = _mm_unpackhi_ps(x2Val, x3Val);
// Element-wise products of input floats and duplicated taps.
901 c0Val = _mm_mul_ps(a0Val, b0Val);
902 c1Val = _mm_mul_ps(a1Val, b1Val);
903 c2Val = _mm_mul_ps(a2Val, b2Val);
904 c3Val = _mm_mul_ps(a3Val, b3Val);
// Add the products to the running sums.
906 dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
907 dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
908 dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
909 dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
// Merge the four accumulators into one vector of 4 partial sums.
915 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
916 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
917 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
// The spill uses the aligned store even though the input loads are unaligned,
// so dotProductVector must be 16-byte aligned (the rest of this call and the
// buffer declaration are not visible here -- confirm).
921 _mm_store_ps(dotProductVector,
// Fold the 2 adjacent pairs into the complex result.
924 returnValue +=
lv_cmake(dotProductVector[0], dotProductVector[1]);
925 returnValue +=
lv_cmake(dotProductVector[2], dotProductVector[3]);
// Scalar tail for the samples left over after the vector loop.
927 number = eighthPoints * 8;
928 for (; number < num_points; number++) {
929 returnValue +=
lv_cmake(aPtr[0] * bPtr[0], aPtr[1] * bPtr[0]);
934 *result = returnValue;
940#include <riscv_vector.h>
/* RISC-V Vector kernel: dot product of complex input with real taps; each
 * complex sample is loaded as one 64-bit element and split by narrowing
 * shifts. The reduced sum is written to *result.
 * NOTE(review): the middle parameters, the closing brace of the loop and the
 * definitions of vr and vi are not visible in this excerpt. */
943static inline void volk_32fc_32f_dot_prod_32fc_rvv(
lv_32fc_t* result,
946 unsigned int num_points)
// Zero-filled accumulators spanning the maximum e32/m4 vector length.
948 vfloat32m4_t vsumr = __riscv_vfmv_v_f_f32m4(0, __riscv_vsetvlmax_e32m4());
949 vfloat32m4_t vsumi = vsumr;
950 size_t n = num_points;
// Each pass handles vl samples and advances both pointers by vl.
951 for (
size_t vl; n > 0; n -= vl, input += vl, taps += vl) {
952 vl = __riscv_vsetvl_e32m4(n);
// Load vl complex samples, each as a single 64-bit element.
953 vuint64m8_t va = __riscv_vle64_v_u64m8((
const uint64_t*)input, vl);
// Load vl taps; the same taps are applied to both components.
954 vfloat32m4_t vbr = __riscv_vle32_v_f32m4(taps, vl), vbi = vbr;
// A narrowing shift by 0 keeps the low 32 bits of each element and a shift by
// 32 keeps the high 32 bits, separating the two floats of every sample.
955 vfloat32m4_t var = __riscv_vreinterpret_f32m4(__riscv_vnsrl(va, 0, vl));
956 vfloat32m4_t vai = __riscv_vreinterpret_f32m4(__riscv_vnsrl(va, 32, vl));
// Tail-undisturbed multiply-accumulate: lanes at or beyond vl keep the
// partial sums from earlier passes.
957 vsumr = __riscv_vfmacc_tu(vsumr, var, vbr, vl);
958 vsumi = __riscv_vfmacc_tu(vsumi, vai, vbi, vl);
// Final reduction at e32/m1 width; vr and vi are defined on lines that are
// not visible here (presumably narrowed from vsumr/vsumi -- confirm).
960 size_t vl = __riscv_vsetvlmax_e32m1();
// Scalar zero used as the initial value of both reductions.
963 vfloat32m1_t z = __riscv_vfmv_s_f_f32m1(0, vl);
964 *result =
lv_cmake(__riscv_vfmv_f(__riscv_vfredusum(vr, z, vl)),
965 __riscv_vfmv_f(__riscv_vfredusum(vi, z, vl)));
970#include <riscv_vector.h>
/* RISC-V Vector kernel using a two-field segment load: dot product of complex
 * input with real taps. The reduced sum is written to *result.
 * NOTE(review): the middle parameters, the closing brace of the loop and the
 * definitions of vr and vi are not visible in this excerpt. */
973static inline void volk_32fc_32f_dot_prod_32fc_rvvseg(
lv_32fc_t* result,
976 unsigned int num_points)
// Zero-filled accumulators spanning the maximum e32/m4 vector length.
978 vfloat32m4_t vsumr = __riscv_vfmv_v_f_f32m4(0, __riscv_vsetvlmax_e32m4());
979 vfloat32m4_t vsumi = vsumr;
980 size_t n = num_points;
// Each pass handles vl samples and advances both pointers by vl.
981 for (
size_t vl; n > 0; n -= vl, input += vl, taps += vl) {
982 vl = __riscv_vsetvl_e32m4(n);
// Segment load splits the interleaved floats into two vectors: field 0 holds
// the first float of every sample, field 1 the second.
983 vfloat32m4x2_t va = __riscv_vlseg2e32_v_f32m4x2((
const float*)input, vl);
984 vfloat32m4_t var = __riscv_vget_f32m4(va, 0), vai = __riscv_vget_f32m4(va, 1);
// Load vl taps; the same taps are applied to both components.
985 vfloat32m4_t vbr = __riscv_vle32_v_f32m4(taps, vl), vbi = vbr;
// Tail-undisturbed multiply-accumulate: lanes at or beyond vl keep the
// partial sums from earlier passes.
986 vsumr = __riscv_vfmacc_tu(vsumr, var, vbr, vl);
987 vsumi = __riscv_vfmacc_tu(vsumi, vai, vbi, vl);
// Final reduction at e32/m1 width; vr and vi are defined on lines that are
// not visible here (presumably narrowed from vsumr/vsumi -- confirm).
989 size_t vl = __riscv_vsetvlmax_e32m1();
// Scalar zero used as the initial value of both reductions.
992 vfloat32m1_t z = __riscv_vfmv_s_f_f32m1(0, vl);
993 *result =
lv_cmake(__riscv_vfmv_f(__riscv_vfredusum(vr, z, vl)),
994 __riscv_vfmv_f(__riscv_vfredusum(vi, z, vl)));