Vector Optimized Library of Kernels 3.3.0
Architecture-tuned implementations of math kernels
Loading...
Searching...
No Matches
volk_32f_x2_dot_prod_32f.h
Go to the documentation of this file.
1/* -*- c++ -*- */
2/*
3 * Copyright 2012, 2014 Free Software Foundation, Inc.
4 *
5 * This file is part of VOLK
6 *
7 * SPDX-License-Identifier: LGPL-3.0-or-later
8 */
9
57
58#ifndef INCLUDED_volk_32f_x2_dot_prod_32f_u_H
59#define INCLUDED_volk_32f_x2_dot_prod_32f_u_H
60
61#include <stdio.h>
62#include <volk/volk_common.h>
63
64
65#ifdef LV_HAVE_GENERIC
66
67
/*
 * Portable scalar dot product.
 *
 * Adds input[i] * taps[i] for i = 0 .. num_points-1, in ascending index
 * order, into a single float and stores the total in *result.
 * When num_points is 0 the stored value is 0.
 */
static inline void volk_32f_x2_dot_prod_32f_generic(float* result,
                                                    const float* input,
                                                    const float* taps,
                                                    unsigned int num_points)
{
    float acc = 0;

    for (unsigned int i = 0; i < num_points; i++) {
        acc += input[i] * taps[i];
    }

    *result = acc;
}
85
86#endif /*LV_HAVE_GENERIC*/
87
88
89#ifdef LV_HAVE_SSE
90
91
92static inline void volk_32f_x2_dot_prod_32f_u_sse(float* result,
93 const float* input,
94 const float* taps,
95 unsigned int num_points)
96{
97
98 unsigned int number = 0;
99 const unsigned int sixteenthPoints = num_points / 16;
100
101 float dotProduct = 0;
102 const float* aPtr = input;
103 const float* bPtr = taps;
104
105 __m128 a0Val, a1Val, a2Val, a3Val;
106 __m128 b0Val, b1Val, b2Val, b3Val;
107 __m128 c0Val, c1Val, c2Val, c3Val;
108
109 __m128 dotProdVal0 = _mm_setzero_ps();
110 __m128 dotProdVal1 = _mm_setzero_ps();
111 __m128 dotProdVal2 = _mm_setzero_ps();
112 __m128 dotProdVal3 = _mm_setzero_ps();
113
114 for (; number < sixteenthPoints; number++) {
115
116 a0Val = _mm_loadu_ps(aPtr);
117 a1Val = _mm_loadu_ps(aPtr + 4);
118 a2Val = _mm_loadu_ps(aPtr + 8);
119 a3Val = _mm_loadu_ps(aPtr + 12);
120 b0Val = _mm_loadu_ps(bPtr);
121 b1Val = _mm_loadu_ps(bPtr + 4);
122 b2Val = _mm_loadu_ps(bPtr + 8);
123 b3Val = _mm_loadu_ps(bPtr + 12);
124
125 c0Val = _mm_mul_ps(a0Val, b0Val);
126 c1Val = _mm_mul_ps(a1Val, b1Val);
127 c2Val = _mm_mul_ps(a2Val, b2Val);
128 c3Val = _mm_mul_ps(a3Val, b3Val);
129
130 dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
131 dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
132 dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
133 dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
134
135 aPtr += 16;
136 bPtr += 16;
137 }
138
139 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
140 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
141 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
142
143 __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
144
145 _mm_store_ps(dotProductVector,
146 dotProdVal0); // Store the results back into the dot product vector
147
148 dotProduct = dotProductVector[0];
149 dotProduct += dotProductVector[1];
150 dotProduct += dotProductVector[2];
151 dotProduct += dotProductVector[3];
152
153 number = sixteenthPoints * 16;
154 for (; number < num_points; number++) {
155 dotProduct += ((*aPtr++) * (*bPtr++));
156 }
157
158 *result = dotProduct;
159}
160
161#endif /*LV_HAVE_SSE*/
162
163#ifdef LV_HAVE_SSE3
164
165#include <pmmintrin.h>
166
167static inline void volk_32f_x2_dot_prod_32f_u_sse3(float* result,
168 const float* input,
169 const float* taps,
170 unsigned int num_points)
171{
172 unsigned int number = 0;
173 const unsigned int sixteenthPoints = num_points / 16;
174
175 float dotProduct = 0;
176 const float* aPtr = input;
177 const float* bPtr = taps;
178
179 __m128 a0Val, a1Val, a2Val, a3Val;
180 __m128 b0Val, b1Val, b2Val, b3Val;
181 __m128 c0Val, c1Val, c2Val, c3Val;
182
183 __m128 dotProdVal0 = _mm_setzero_ps();
184 __m128 dotProdVal1 = _mm_setzero_ps();
185 __m128 dotProdVal2 = _mm_setzero_ps();
186 __m128 dotProdVal3 = _mm_setzero_ps();
187
188 for (; number < sixteenthPoints; number++) {
189
190 a0Val = _mm_loadu_ps(aPtr);
191 a1Val = _mm_loadu_ps(aPtr + 4);
192 a2Val = _mm_loadu_ps(aPtr + 8);
193 a3Val = _mm_loadu_ps(aPtr + 12);
194 b0Val = _mm_loadu_ps(bPtr);
195 b1Val = _mm_loadu_ps(bPtr + 4);
196 b2Val = _mm_loadu_ps(bPtr + 8);
197 b3Val = _mm_loadu_ps(bPtr + 12);
198
199 c0Val = _mm_mul_ps(a0Val, b0Val);
200 c1Val = _mm_mul_ps(a1Val, b1Val);
201 c2Val = _mm_mul_ps(a2Val, b2Val);
202 c3Val = _mm_mul_ps(a3Val, b3Val);
203
204 dotProdVal0 = _mm_add_ps(dotProdVal0, c0Val);
205 dotProdVal1 = _mm_add_ps(dotProdVal1, c1Val);
206 dotProdVal2 = _mm_add_ps(dotProdVal2, c2Val);
207 dotProdVal3 = _mm_add_ps(dotProdVal3, c3Val);
208
209 aPtr += 16;
210 bPtr += 16;
211 }
212
213 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
214 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
215 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
216
217 __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
218 _mm_store_ps(dotProductVector,
219 dotProdVal0); // Store the results back into the dot product vector
220
221 dotProduct = dotProductVector[0];
222 dotProduct += dotProductVector[1];
223 dotProduct += dotProductVector[2];
224 dotProduct += dotProductVector[3];
225
226 number = sixteenthPoints * 16;
227 for (; number < num_points; number++) {
228 dotProduct += ((*aPtr++) * (*bPtr++));
229 }
230
231 *result = dotProduct;
232}
233
234#endif /*LV_HAVE_SSE3*/
235
236#ifdef LV_HAVE_SSE4_1
237
238#include <smmintrin.h>
239
240static inline void volk_32f_x2_dot_prod_32f_u_sse4_1(float* result,
241 const float* input,
242 const float* taps,
243 unsigned int num_points)
244{
245 unsigned int number = 0;
246 const unsigned int sixteenthPoints = num_points / 16;
247
248 float dotProduct = 0;
249 const float* aPtr = input;
250 const float* bPtr = taps;
251
252 __m128 aVal1, bVal1, cVal1;
253 __m128 aVal2, bVal2, cVal2;
254 __m128 aVal3, bVal3, cVal3;
255 __m128 aVal4, bVal4, cVal4;
256
257 __m128 dotProdVal = _mm_setzero_ps();
258
259 for (; number < sixteenthPoints; number++) {
260
261 aVal1 = _mm_loadu_ps(aPtr);
262 aPtr += 4;
263 aVal2 = _mm_loadu_ps(aPtr);
264 aPtr += 4;
265 aVal3 = _mm_loadu_ps(aPtr);
266 aPtr += 4;
267 aVal4 = _mm_loadu_ps(aPtr);
268 aPtr += 4;
269
270 bVal1 = _mm_loadu_ps(bPtr);
271 bPtr += 4;
272 bVal2 = _mm_loadu_ps(bPtr);
273 bPtr += 4;
274 bVal3 = _mm_loadu_ps(bPtr);
275 bPtr += 4;
276 bVal4 = _mm_loadu_ps(bPtr);
277 bPtr += 4;
278
279 cVal1 = _mm_dp_ps(aVal1, bVal1, 0xF1);
280 cVal2 = _mm_dp_ps(aVal2, bVal2, 0xF2);
281 cVal3 = _mm_dp_ps(aVal3, bVal3, 0xF4);
282 cVal4 = _mm_dp_ps(aVal4, bVal4, 0xF8);
283
284 cVal1 = _mm_or_ps(cVal1, cVal2);
285 cVal3 = _mm_or_ps(cVal3, cVal4);
286 cVal1 = _mm_or_ps(cVal1, cVal3);
287
288 dotProdVal = _mm_add_ps(dotProdVal, cVal1);
289 }
290
291 __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
292 _mm_store_ps(dotProductVector,
293 dotProdVal); // Store the results back into the dot product vector
294
295 dotProduct = dotProductVector[0];
296 dotProduct += dotProductVector[1];
297 dotProduct += dotProductVector[2];
298 dotProduct += dotProductVector[3];
299
300 number = sixteenthPoints * 16;
301 for (; number < num_points; number++) {
302 dotProduct += ((*aPtr++) * (*bPtr++));
303 }
304
305 *result = dotProduct;
306}
307
308#endif /*LV_HAVE_SSE4_1*/
309
310#ifdef LV_HAVE_AVX
311
312#include <immintrin.h>
313
314static inline void volk_32f_x2_dot_prod_32f_u_avx(float* result,
315 const float* input,
316 const float* taps,
317 unsigned int num_points)
318{
319
320 unsigned int number = 0;
321 const unsigned int sixteenthPoints = num_points / 16;
322
323 float dotProduct = 0;
324 const float* aPtr = input;
325 const float* bPtr = taps;
326
327 __m256 a0Val, a1Val;
328 __m256 b0Val, b1Val;
329 __m256 c0Val, c1Val;
330
331 __m256 dotProdVal0 = _mm256_setzero_ps();
332 __m256 dotProdVal1 = _mm256_setzero_ps();
333
334 for (; number < sixteenthPoints; number++) {
335
336 a0Val = _mm256_loadu_ps(aPtr);
337 a1Val = _mm256_loadu_ps(aPtr + 8);
338 b0Val = _mm256_loadu_ps(bPtr);
339 b1Val = _mm256_loadu_ps(bPtr + 8);
340
341 c0Val = _mm256_mul_ps(a0Val, b0Val);
342 c1Val = _mm256_mul_ps(a1Val, b1Val);
343
344 dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
345 dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
346
347 aPtr += 16;
348 bPtr += 16;
349 }
350
351 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
352
353 __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
354
355 _mm256_storeu_ps(dotProductVector,
356 dotProdVal0); // Store the results back into the dot product vector
357
358 dotProduct = dotProductVector[0];
359 dotProduct += dotProductVector[1];
360 dotProduct += dotProductVector[2];
361 dotProduct += dotProductVector[3];
362 dotProduct += dotProductVector[4];
363 dotProduct += dotProductVector[5];
364 dotProduct += dotProductVector[6];
365 dotProduct += dotProductVector[7];
366
367 number = sixteenthPoints * 16;
368 for (; number < num_points; number++) {
369 dotProduct += ((*aPtr++) * (*bPtr++));
370 }
371
372 *result = dotProduct;
373}
374
375#endif /*LV_HAVE_AVX*/
376
377#if LV_HAVE_AVX2 && LV_HAVE_FMA
378#include <immintrin.h>
379static inline void volk_32f_x2_dot_prod_32f_u_avx2_fma(float* result,
380 const float* input,
381 const float* taps,
382 unsigned int num_points)
383{
384 unsigned int number;
385 const unsigned int eighthPoints = num_points / 8;
386
387 const float* aPtr = input;
388 const float* bPtr = taps;
389
390 __m256 dotProdVal = _mm256_setzero_ps();
391 __m256 aVal1, bVal1;
392
393 for (number = 0; number < eighthPoints; number++) {
394
395 aVal1 = _mm256_loadu_ps(aPtr);
396 bVal1 = _mm256_loadu_ps(bPtr);
397 aPtr += 8;
398 bPtr += 8;
399
400 dotProdVal = _mm256_fmadd_ps(aVal1, bVal1, dotProdVal);
401 }
402
403 __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
404 _mm256_storeu_ps(dotProductVector,
405 dotProdVal); // Store the results back into the dot product vector
406
407 float dotProduct = dotProductVector[0] + dotProductVector[1] + dotProductVector[2] +
408 dotProductVector[3] + dotProductVector[4] + dotProductVector[5] +
409 dotProductVector[6] + dotProductVector[7];
410
411 for (number = eighthPoints * 8; number < num_points; number++) {
412 dotProduct += ((*aPtr++) * (*bPtr++));
413 }
414
415 *result = dotProduct;
416}
417#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA */
418
419#if LV_HAVE_AVX512F
420#include <immintrin.h>
421static inline void volk_32f_x2_dot_prod_32f_u_avx512f(float* result,
422 const float* input,
423 const float* taps,
424 unsigned int num_points)
425{
426 unsigned int number;
427 const unsigned int sixteenthPoints = num_points / 16;
428
429 const float* aPtr = input;
430 const float* bPtr = taps;
431
432 __m512 dotProdVal = _mm512_setzero_ps();
433 __m512 aVal1, bVal1;
434
435 for (number = 0; number < sixteenthPoints; number++) {
436
437 aVal1 = _mm512_loadu_ps(aPtr);
438 bVal1 = _mm512_loadu_ps(bPtr);
439 aPtr += 16;
440 bPtr += 16;
441
442 dotProdVal = _mm512_fmadd_ps(aVal1, bVal1, dotProdVal);
443 }
444
445 __VOLK_ATTR_ALIGNED(64) float dotProductVector[16];
446 _mm512_storeu_ps(dotProductVector,
447 dotProdVal); // Store the results back into the dot product vector
448
449 float dotProduct = dotProductVector[0] + dotProductVector[1] + dotProductVector[2] +
450 dotProductVector[3] + dotProductVector[4] + dotProductVector[5] +
451 dotProductVector[6] + dotProductVector[7] + dotProductVector[8] +
452 dotProductVector[9] + dotProductVector[10] + dotProductVector[11] +
453 dotProductVector[12] + dotProductVector[13] +
454 dotProductVector[14] + dotProductVector[15];
455
456 for (number = sixteenthPoints * 16; number < num_points; number++) {
457 dotProduct += ((*aPtr++) * (*bPtr++));
458 }
459
460 *result = dotProduct;
461}
462#endif /* LV_HAVE_AVX512F */
463
464#endif /*INCLUDED_volk_32f_x2_dot_prod_32f_u_H*/
465
466#ifndef INCLUDED_volk_32f_x2_dot_prod_32f_a_H
467#define INCLUDED_volk_32f_x2_dot_prod_32f_a_H
468
469#include <stdio.h>
470#include <volk/volk_common.h>
471
472
473#ifdef LV_HAVE_SSE
474
475
476static inline void volk_32f_x2_dot_prod_32f_a_sse(float* result,
477 const float* input,
478 const float* taps,
479 unsigned int num_points)
480{
481
482 unsigned int number = 0;
483 const unsigned int sixteenthPoints = num_points / 16;
484
485 float dotProduct = 0;
486 const float* aPtr = input;
487 const float* bPtr = taps;
488
489 __m128 a0Val, a1Val, a2Val, a3Val;
490 __m128 b0Val, b1Val, b2Val, b3Val;
491 __m128 c0Val, c1Val, c2Val, c3Val;
492
493 __m128 dotProdVal0 = _mm_setzero_ps();
494 __m128 dotProdVal1 = _mm_setzero_ps();
495 __m128 dotProdVal2 = _mm_setzero_ps();
496 __m128 dotProdVal3 = _mm_setzero_ps();
497
498 for (; number < sixteenthPoints; number++) {
499
500 a0Val = _mm_load_ps(aPtr);
501 a1Val = _mm_load_ps(aPtr + 4);
502 a2Val = _mm_load_ps(aPtr + 8);
503 a3Val = _mm_load_ps(aPtr + 12);
504 b0Val = _mm_load_ps(bPtr);
505 b1Val = _mm_load_ps(bPtr + 4);
506 b2Val = _mm_load_ps(bPtr + 8);
507 b3Val = _mm_load_ps(bPtr + 12);
508
509 c0Val = _mm_mul_ps(a0Val, b0Val);
510 c1Val = _mm_mul_ps(a1Val, b1Val);
511 c2Val = _mm_mul_ps(a2Val, b2Val);
512 c3Val = _mm_mul_ps(a3Val, b3Val);
513
514 dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
515 dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
516 dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
517 dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
518
519 aPtr += 16;
520 bPtr += 16;
521 }
522
523 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
524 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
525 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
526
527 __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
528
529 _mm_store_ps(dotProductVector,
530 dotProdVal0); // Store the results back into the dot product vector
531
532 dotProduct = dotProductVector[0];
533 dotProduct += dotProductVector[1];
534 dotProduct += dotProductVector[2];
535 dotProduct += dotProductVector[3];
536
537 number = sixteenthPoints * 16;
538 for (; number < num_points; number++) {
539 dotProduct += ((*aPtr++) * (*bPtr++));
540 }
541
542 *result = dotProduct;
543}
544
545#endif /*LV_HAVE_SSE*/
546
547#ifdef LV_HAVE_SSE3
548
549#include <pmmintrin.h>
550
551static inline void volk_32f_x2_dot_prod_32f_a_sse3(float* result,
552 const float* input,
553 const float* taps,
554 unsigned int num_points)
555{
556 unsigned int number = 0;
557 const unsigned int sixteenthPoints = num_points / 16;
558
559 float dotProduct = 0;
560 const float* aPtr = input;
561 const float* bPtr = taps;
562
563 __m128 a0Val, a1Val, a2Val, a3Val;
564 __m128 b0Val, b1Val, b2Val, b3Val;
565 __m128 c0Val, c1Val, c2Val, c3Val;
566
567 __m128 dotProdVal0 = _mm_setzero_ps();
568 __m128 dotProdVal1 = _mm_setzero_ps();
569 __m128 dotProdVal2 = _mm_setzero_ps();
570 __m128 dotProdVal3 = _mm_setzero_ps();
571
572 for (; number < sixteenthPoints; number++) {
573
574 a0Val = _mm_load_ps(aPtr);
575 a1Val = _mm_load_ps(aPtr + 4);
576 a2Val = _mm_load_ps(aPtr + 8);
577 a3Val = _mm_load_ps(aPtr + 12);
578 b0Val = _mm_load_ps(bPtr);
579 b1Val = _mm_load_ps(bPtr + 4);
580 b2Val = _mm_load_ps(bPtr + 8);
581 b3Val = _mm_load_ps(bPtr + 12);
582
583 c0Val = _mm_mul_ps(a0Val, b0Val);
584 c1Val = _mm_mul_ps(a1Val, b1Val);
585 c2Val = _mm_mul_ps(a2Val, b2Val);
586 c3Val = _mm_mul_ps(a3Val, b3Val);
587
588 dotProdVal0 = _mm_add_ps(dotProdVal0, c0Val);
589 dotProdVal1 = _mm_add_ps(dotProdVal1, c1Val);
590 dotProdVal2 = _mm_add_ps(dotProdVal2, c2Val);
591 dotProdVal3 = _mm_add_ps(dotProdVal3, c3Val);
592
593 aPtr += 16;
594 bPtr += 16;
595 }
596
597 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
598 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
599 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
600
601 __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
602 _mm_store_ps(dotProductVector,
603 dotProdVal0); // Store the results back into the dot product vector
604
605 dotProduct = dotProductVector[0];
606 dotProduct += dotProductVector[1];
607 dotProduct += dotProductVector[2];
608 dotProduct += dotProductVector[3];
609
610 number = sixteenthPoints * 16;
611 for (; number < num_points; number++) {
612 dotProduct += ((*aPtr++) * (*bPtr++));
613 }
614
615 *result = dotProduct;
616}
617
618#endif /*LV_HAVE_SSE3*/
619
620#ifdef LV_HAVE_SSE4_1
621
622#include <smmintrin.h>
623
624static inline void volk_32f_x2_dot_prod_32f_a_sse4_1(float* result,
625 const float* input,
626 const float* taps,
627 unsigned int num_points)
628{
629 unsigned int number = 0;
630 const unsigned int sixteenthPoints = num_points / 16;
631
632 float dotProduct = 0;
633 const float* aPtr = input;
634 const float* bPtr = taps;
635
636 __m128 aVal1, bVal1, cVal1;
637 __m128 aVal2, bVal2, cVal2;
638 __m128 aVal3, bVal3, cVal3;
639 __m128 aVal4, bVal4, cVal4;
640
641 __m128 dotProdVal = _mm_setzero_ps();
642
643 for (; number < sixteenthPoints; number++) {
644
645 aVal1 = _mm_load_ps(aPtr);
646 aPtr += 4;
647 aVal2 = _mm_load_ps(aPtr);
648 aPtr += 4;
649 aVal3 = _mm_load_ps(aPtr);
650 aPtr += 4;
651 aVal4 = _mm_load_ps(aPtr);
652 aPtr += 4;
653
654 bVal1 = _mm_load_ps(bPtr);
655 bPtr += 4;
656 bVal2 = _mm_load_ps(bPtr);
657 bPtr += 4;
658 bVal3 = _mm_load_ps(bPtr);
659 bPtr += 4;
660 bVal4 = _mm_load_ps(bPtr);
661 bPtr += 4;
662
663 cVal1 = _mm_dp_ps(aVal1, bVal1, 0xF1);
664 cVal2 = _mm_dp_ps(aVal2, bVal2, 0xF2);
665 cVal3 = _mm_dp_ps(aVal3, bVal3, 0xF4);
666 cVal4 = _mm_dp_ps(aVal4, bVal4, 0xF8);
667
668 cVal1 = _mm_or_ps(cVal1, cVal2);
669 cVal3 = _mm_or_ps(cVal3, cVal4);
670 cVal1 = _mm_or_ps(cVal1, cVal3);
671
672 dotProdVal = _mm_add_ps(dotProdVal, cVal1);
673 }
674
675 __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
676 _mm_store_ps(dotProductVector,
677 dotProdVal); // Store the results back into the dot product vector
678
679 dotProduct = dotProductVector[0];
680 dotProduct += dotProductVector[1];
681 dotProduct += dotProductVector[2];
682 dotProduct += dotProductVector[3];
683
684 number = sixteenthPoints * 16;
685 for (; number < num_points; number++) {
686 dotProduct += ((*aPtr++) * (*bPtr++));
687 }
688
689 *result = dotProduct;
690}
691
692#endif /*LV_HAVE_SSE4_1*/
693
694#ifdef LV_HAVE_AVX
695
696#include <immintrin.h>
697
698static inline void volk_32f_x2_dot_prod_32f_a_avx(float* result,
699 const float* input,
700 const float* taps,
701 unsigned int num_points)
702{
703
704 unsigned int number = 0;
705 const unsigned int sixteenthPoints = num_points / 16;
706
707 float dotProduct = 0;
708 const float* aPtr = input;
709 const float* bPtr = taps;
710
711 __m256 a0Val, a1Val;
712 __m256 b0Val, b1Val;
713 __m256 c0Val, c1Val;
714
715 __m256 dotProdVal0 = _mm256_setzero_ps();
716 __m256 dotProdVal1 = _mm256_setzero_ps();
717
718 for (; number < sixteenthPoints; number++) {
719
720 a0Val = _mm256_load_ps(aPtr);
721 a1Val = _mm256_load_ps(aPtr + 8);
722 b0Val = _mm256_load_ps(bPtr);
723 b1Val = _mm256_load_ps(bPtr + 8);
724
725 c0Val = _mm256_mul_ps(a0Val, b0Val);
726 c1Val = _mm256_mul_ps(a1Val, b1Val);
727
728 dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
729 dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
730
731 aPtr += 16;
732 bPtr += 16;
733 }
734
735 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
736
737 __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
738
739 _mm256_store_ps(dotProductVector,
740 dotProdVal0); // Store the results back into the dot product vector
741
742 dotProduct = dotProductVector[0];
743 dotProduct += dotProductVector[1];
744 dotProduct += dotProductVector[2];
745 dotProduct += dotProductVector[3];
746 dotProduct += dotProductVector[4];
747 dotProduct += dotProductVector[5];
748 dotProduct += dotProductVector[6];
749 dotProduct += dotProductVector[7];
750
751 number = sixteenthPoints * 16;
752 for (; number < num_points; number++) {
753 dotProduct += ((*aPtr++) * (*bPtr++));
754 }
755
756 *result = dotProduct;
757}
758#endif /*LV_HAVE_AVX*/
759
760
761#if LV_HAVE_AVX2 && LV_HAVE_FMA
762#include <immintrin.h>
763static inline void volk_32f_x2_dot_prod_32f_a_avx2_fma(float* result,
764 const float* input,
765 const float* taps,
766 unsigned int num_points)
767{
768 unsigned int number;
769 const unsigned int eighthPoints = num_points / 8;
770
771 const float* aPtr = input;
772 const float* bPtr = taps;
773
774 __m256 dotProdVal = _mm256_setzero_ps();
775 __m256 aVal1, bVal1;
776
777 for (number = 0; number < eighthPoints; number++) {
778
779 aVal1 = _mm256_load_ps(aPtr);
780 bVal1 = _mm256_load_ps(bPtr);
781 aPtr += 8;
782 bPtr += 8;
783
784 dotProdVal = _mm256_fmadd_ps(aVal1, bVal1, dotProdVal);
785 }
786
787 __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
788 _mm256_store_ps(dotProductVector,
789 dotProdVal); // Store the results back into the dot product vector
790
791 float dotProduct = dotProductVector[0] + dotProductVector[1] + dotProductVector[2] +
792 dotProductVector[3] + dotProductVector[4] + dotProductVector[5] +
793 dotProductVector[6] + dotProductVector[7];
794
795 for (number = eighthPoints * 8; number < num_points; number++) {
796 dotProduct += ((*aPtr++) * (*bPtr++));
797 }
798
799 *result = dotProduct;
800}
801#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA */
802
803#if LV_HAVE_AVX512F
804#include <immintrin.h>
805static inline void volk_32f_x2_dot_prod_32f_a_avx512f(float* result,
806 const float* input,
807 const float* taps,
808 unsigned int num_points)
809{
810 unsigned int number;
811 const unsigned int sixteenthPoints = num_points / 16;
812
813 const float* aPtr = input;
814 const float* bPtr = taps;
815
816 __m512 dotProdVal = _mm512_setzero_ps();
817 __m512 aVal1, bVal1;
818
819 for (number = 0; number < sixteenthPoints; number++) {
820
821 aVal1 = _mm512_load_ps(aPtr);
822 bVal1 = _mm512_load_ps(bPtr);
823 aPtr += 16;
824 bPtr += 16;
825
826 dotProdVal = _mm512_fmadd_ps(aVal1, bVal1, dotProdVal);
827 }
828
829 __VOLK_ATTR_ALIGNED(64) float dotProductVector[16];
830 _mm512_store_ps(dotProductVector,
831 dotProdVal); // Store the results back into the dot product vector
832
833 float dotProduct = dotProductVector[0] + dotProductVector[1] + dotProductVector[2] +
834 dotProductVector[3] + dotProductVector[4] + dotProductVector[5] +
835 dotProductVector[6] + dotProductVector[7] + dotProductVector[8] +
836 dotProductVector[9] + dotProductVector[10] + dotProductVector[11] +
837 dotProductVector[12] + dotProductVector[13] +
838 dotProductVector[14] + dotProductVector[15];
839
840 for (number = sixteenthPoints * 16; number < num_points; number++) {
841 dotProduct += ((*aPtr++) * (*bPtr++));
842 }
843
844 *result = dotProduct;
845}
846#endif /* LV_HAVE_AVX512F */
847
848#ifdef LV_HAVE_NEON
849#include <arm_neon.h>
850
/*
 * NEON dot product (ARMv7-compatible reduction).
 *
 * Each iteration consumes 16 floats from both arrays, multiply-accumulating
 * (vmlaq_f32) into four independent 4-lane accumulators. The accumulators
 * are combined pairwise, the four lanes are reduced with vadd_f32 and
 * vpadd_f32, and the remaining num_points % 16 elements are processed one
 * at a time. The total is written to *result.
 */
static inline void volk_32f_x2_dot_prod_32f_neon(float* result,
                                                 const float* input,
                                                 const float* taps,
                                                 unsigned int num_points)
{
    const unsigned int blockCount = num_points / 16;
    const float* in = input;
    const float* tp = taps;

    /* Four separate accumulators, one per 4-float group of a block. */
    float32x4_t sum0 = vdupq_n_f32(0);
    float32x4_t sum1 = vdupq_n_f32(0);
    float32x4_t sum2 = vdupq_n_f32(0);
    float32x4_t sum3 = vdupq_n_f32(0);

    for (unsigned int blk = 0; blk < blockCount; blk++) {
        const float32x4_t x0 = vld1q_f32(in);
        const float32x4_t x1 = vld1q_f32(in + 4);
        const float32x4_t x2 = vld1q_f32(in + 8);
        const float32x4_t x3 = vld1q_f32(in + 12);

        const float32x4_t y0 = vld1q_f32(tp);
        const float32x4_t y1 = vld1q_f32(tp + 4);
        const float32x4_t y2 = vld1q_f32(tp + 8);
        const float32x4_t y3 = vld1q_f32(tp + 12);

        sum0 = vmlaq_f32(sum0, x0, y0);
        sum1 = vmlaq_f32(sum1, x1, y1);
        sum2 = vmlaq_f32(sum2, x2, y2);
        sum3 = vmlaq_f32(sum3, x3, y3);

        in += 16;
        tp += 16;
    }

    /* Pairwise merge: (sum0 + sum1) + (sum2 + sum3). */
    sum0 = vaddq_f32(sum0, sum1);
    sum2 = vaddq_f32(sum2, sum3);
    sum0 = vaddq_f32(sum0, sum2);

    /* Reduce 4 lanes to 2, then 2 to 1, and read lane 0. */
    const float32x2_t halves = vadd_f32(vget_low_f32(sum0), vget_high_f32(sum0));
    const float32x2_t folded = vpadd_f32(halves, halves);
    float total = vget_lane_f32(folded, 0);

    /* Scalar tail: elements that did not fill a whole 16-float block. */
    const unsigned int tail = num_points - blockCount * 16;
    for (unsigned int i = 0; i < tail; i++) {
        total += in[i] * tp[i];
    }

    *result = total;
}
903
904#endif /* LV_HAVE_NEON */
905
906#ifdef LV_HAVE_NEONV8
907#include <arm_neon.h>
908
/*
 * NEON (ARMv8) dot product using fused multiply-add.
 *
 * Each iteration consumes 16 floats from both arrays, issues a prefetch
 * 32 floats ahead of the current position on each array, and accumulates
 * with vfmaq_f32 into four independent 4-lane accumulators. The
 * accumulators are combined pairwise, reduced with vaddvq_f32, and the
 * remaining num_points % 16 elements are processed one at a time.
 * The total is written to *result.
 */
static inline void volk_32f_x2_dot_prod_32f_neonv8(float* result,
                                                   const float* input,
                                                   const float* taps,
                                                   unsigned int num_points)
{
    const unsigned int blockCount = num_points / 16;
    const float* in = input;
    const float* tp = taps;

    /* Four separate accumulators, one per 4-float group of a block. */
    float32x4_t sum0 = vdupq_n_f32(0);
    float32x4_t sum1 = vdupq_n_f32(0);
    float32x4_t sum2 = vdupq_n_f32(0);
    float32x4_t sum3 = vdupq_n_f32(0);

    for (unsigned int blk = 0; blk < blockCount; blk++) {
        const float32x4_t x0 = vld1q_f32(in);
        const float32x4_t x1 = vld1q_f32(in + 4);
        const float32x4_t x2 = vld1q_f32(in + 8);
        const float32x4_t x3 = vld1q_f32(in + 12);

        const float32x4_t y0 = vld1q_f32(tp);
        const float32x4_t y1 = vld1q_f32(tp + 4);
        const float32x4_t y2 = vld1q_f32(tp + 8);
        const float32x4_t y3 = vld1q_f32(tp + 12);
        __VOLK_PREFETCH(in + 32);
        __VOLK_PREFETCH(tp + 32);

        sum0 = vfmaq_f32(sum0, x0, y0);
        sum1 = vfmaq_f32(sum1, x1, y1);
        sum2 = vfmaq_f32(sum2, x2, y2);
        sum3 = vfmaq_f32(sum3, x3, y3);

        in += 16;
        tp += 16;
    }

    /* Pairwise merge: (sum0 + sum1) + (sum2 + sum3). */
    sum0 = vaddq_f32(sum0, sum1);
    sum2 = vaddq_f32(sum2, sum3);
    sum0 = vaddq_f32(sum0, sum2);

    /* Across-lane add of the merged accumulator. */
    float total = vaddvq_f32(sum0);

    /* Scalar tail: elements that did not fill a whole 16-float block. */
    const unsigned int tail = num_points - blockCount * 16;
    for (unsigned int i = 0; i < tail; i++) {
        total += in[i] * tp[i];
    }

    *result = total;
}
962#endif /* LV_HAVE_NEONV8 */
963
964#ifdef LV_HAVE_NEONV7
/*
 * External declaration only; the definition is not part of this header.
 * Takes one writable float pointer (cVector), two read-only float pointers
 * (aVector, bVector) and an element count (num_points).
 * NOTE(review): the name suggests a hand-written NEON assembly dot product
 * that stores a single result through cVector - confirm against the
 * implementation.
 */
965extern void volk_32f_x2_dot_prod_32f_a_neonasm(float* cVector,
966 const float* aVector,
967 const float* bVector,
968 unsigned int num_points);
969#endif /* LV_HAVE_NEONV7 */
970
971#ifdef LV_HAVE_NEONV7
/*
 * External declaration only; the definition is not part of this header.
 * Same parameter list as volk_32f_x2_dot_prod_32f_a_neonasm: one writable
 * float pointer (cVector), two read-only float pointers (aVector, bVector)
 * and an element count (num_points).
 * NOTE(review): presumably a further-optimized variant of the assembly
 * kernel - confirm against the implementation.
 */
972extern void volk_32f_x2_dot_prod_32f_a_neonasm_opts(float* cVector,
973 const float* aVector,
974 const float* bVector,
975 unsigned int num_points);
976#endif /* LV_HAVE_NEONV7 */
977
978#ifdef LV_HAVE_RVV
979#include <riscv_vector.h>
981
/*
 * RISC-V Vector dot product.
 *
 * Strip-mines the inputs with LMUL=8 registers: every pass asks the
 * hardware for a vector length (vl) covering the elements still pending,
 * loads that many floats from each array and multiply-accumulates them
 * into an m8 accumulator. The _tu (tail-undisturbed) form of vfmacc is
 * used so lanes beyond vl keep their earlier partial sums on a short final
 * pass. The accumulator is then narrowed to one m1 register and reduced
 * with vfredusum; the scalar total is written to *result.
 */
static inline void volk_32f_x2_dot_prod_32f_rvv(float* result,
                                                const float* input,
                                                const float* taps,
                                                unsigned int num_points)
{
    vfloat32m8_t partial = __riscv_vfmv_v_f_f32m8(0, __riscv_vsetvlmax_e32m8());

    size_t remaining = num_points;
    while (remaining > 0) {
        const size_t vl = __riscv_vsetvl_e32m8(remaining);
        const vfloat32m8_t x = __riscv_vle32_v_f32m8(input, vl);
        const vfloat32m8_t y = __riscv_vle32_v_f32m8(taps, vl);
        partial = __riscv_vfmacc_tu(partial, x, y, vl);

        remaining -= vl;
        input += vl;
        taps += vl;
    }

    const size_t vlmax1 = __riscv_vsetvlmax_e32m1();
    /* NOTE(review): RISCV_SHRINK8 is defined outside this file; presumably it
     * folds the m8 group into one m1 register using vfadd - confirm. */
    vfloat32m1_t reduced = RISCV_SHRINK8(vfadd, f, 32, partial);
    reduced = __riscv_vfredusum(reduced, __riscv_vfmv_s_f_f32m1(0, vlmax1), vlmax1);
    *result = __riscv_vfmv_f(reduced);
}
1000#endif /*LV_HAVE_RVV*/
1001
1002#endif /*INCLUDED_volk_32f_x2_dot_prod_32f_a_H*/