Vector Optimized Library of Kernels 3.3.0
Architecture-tuned implementations of math kernels
Loading...
Searching...
No Matches
volk_32fc_32f_dot_prod_32fc.h
Go to the documentation of this file.
1/* -*- c++ -*- */
2/*
3 * Copyright 2012, 2013, 2014 Free Software Foundation, Inc.
4 *
5 * This file is part of VOLK
6 *
7 * SPDX-License-Identifier: LGPL-3.0-or-later
8 */
9
49
50#ifndef INCLUDED_volk_32fc_32f_dot_prod_32fc_a_H
51#define INCLUDED_volk_32fc_32f_dot_prod_32fc_a_H
52
53#include <stdio.h>
54#include <volk/volk_common.h>
55
56#ifdef LV_HAVE_GENERIC
57
59 const lv_32fc_t* input,
60 const float* taps,
61 unsigned int num_points)
62{
63
64 lv_32fc_t returnValue = lv_cmake(0.0f, 0.0f);
65 const float* aPtr = (float*)input;
66 const float* bPtr = taps;
67 unsigned int number = 0;
68
69 for (number = 0; number < num_points; number++) {
70 returnValue += lv_cmake(aPtr[0] * bPtr[0], aPtr[1] * bPtr[0]);
71 aPtr += 2;
72 bPtr += 1;
73 }
74
75 *result = returnValue;
76}
77
78#endif /*LV_HAVE_GENERIC*/
79
80#ifdef LV_HAVE_AVX512F
81
82#include <immintrin.h>
83
84static inline void volk_32fc_32f_dot_prod_32fc_a_avx512f(lv_32fc_t* result,
85 const lv_32fc_t* input,
86 const float* taps,
87 unsigned int num_points)
88{
89 unsigned int number = 0;
90 const unsigned int sixteenthPoints = num_points / 16;
91
92 lv_32fc_t returnValue = lv_cmake(0.0f, 0.0f);
93 const float* aPtr = (float*)input;
94 const float* bPtr = taps;
95
96 __m512 a0Val, a1Val;
97 __m512 b0Val, b1Val;
98 __m512 xVal;
99
100 __m512 dotProdVal0 = _mm512_setzero_ps();
101 __m512 dotProdVal1 = _mm512_setzero_ps();
102
103 // Create index patterns for duplication: 0,0,1,1,2,2,3,3,...,15,15
104 const __m512i idx = _mm512_setr_epi32(0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7);
105 const __m512i idx2 =
106 _mm512_setr_epi32(8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15);
107
108 for (; number < sixteenthPoints; number++) {
109 // Load 16 complex numbers (32 floats)
110 a0Val = _mm512_load_ps(aPtr); // 8 complex (I0,Q0,I1,Q1,...)
111 a1Val = _mm512_load_ps(aPtr + 16); // 8 complex (I8,Q8,I9,Q9,...)
112
113 // Load 16 real taps
114 xVal = _mm512_load_ps(bPtr); // t0|t1|t2|...|t15
115
116 // Duplicate each tap value to match complex format using permutexvar
117 b0Val = _mm512_permutexvar_ps(idx, xVal);
118 b1Val = _mm512_permutexvar_ps(idx2, xVal);
119
120 dotProdVal0 = _mm512_fmadd_ps(a0Val, b0Val, dotProdVal0);
121 dotProdVal1 = _mm512_fmadd_ps(a1Val, b1Val, dotProdVal1);
122
123 aPtr += 32;
124 bPtr += 16;
125 }
126
127 dotProdVal0 = _mm512_add_ps(dotProdVal0, dotProdVal1);
128
129 __VOLK_ATTR_ALIGNED(64) float dotProductVector[16];
130 _mm512_store_ps(dotProductVector, dotProdVal0);
131
132 for (unsigned int i = 0; i < 16; i += 2) {
133 returnValue += lv_cmake(dotProductVector[i], dotProductVector[i + 1]);
134 }
135
136 number = sixteenthPoints * 16;
137 lv_32fc_t returnTail = lv_cmake(0.0f, 0.0f);
139 &returnTail, input + number, bPtr, num_points - number);
140 returnValue += returnTail;
141
142 *result = returnValue;
143}
144
145#endif /*LV_HAVE_AVX512F*/
146
147#if LV_HAVE_AVX2 && LV_HAVE_FMA
148
149#include <immintrin.h>
150
151static inline void volk_32fc_32f_dot_prod_32fc_a_avx2_fma(lv_32fc_t* result,
152 const lv_32fc_t* input,
153 const float* taps,
154 unsigned int num_points)
155{
156
157 unsigned int number = 0;
158 const unsigned int sixteenthPoints = num_points / 16;
159
160 lv_32fc_t returnValue = lv_cmake(0.0f, 0.0f);
161 const float* aPtr = (float*)input;
162 const float* bPtr = taps;
163
164 __m256 a0Val, a1Val, a2Val, a3Val;
165 __m256 b0Val, b1Val, b2Val, b3Val;
166 __m256 x0Val, x1Val, x0loVal, x0hiVal, x1loVal, x1hiVal;
167
168 __m256 dotProdVal0 = _mm256_setzero_ps();
169 __m256 dotProdVal1 = _mm256_setzero_ps();
170 __m256 dotProdVal2 = _mm256_setzero_ps();
171 __m256 dotProdVal3 = _mm256_setzero_ps();
172
173 for (; number < sixteenthPoints; number++) {
174
175 a0Val = _mm256_load_ps(aPtr);
176 a1Val = _mm256_load_ps(aPtr + 8);
177 a2Val = _mm256_load_ps(aPtr + 16);
178 a3Val = _mm256_load_ps(aPtr + 24);
179
180 x0Val = _mm256_load_ps(bPtr); // t0|t1|t2|t3|t4|t5|t6|t7
181 x1Val = _mm256_load_ps(bPtr + 8);
182 x0loVal = _mm256_unpacklo_ps(x0Val, x0Val); // t0|t0|t1|t1|t4|t4|t5|t5
183 x0hiVal = _mm256_unpackhi_ps(x0Val, x0Val); // t2|t2|t3|t3|t6|t6|t7|t7
184 x1loVal = _mm256_unpacklo_ps(x1Val, x1Val);
185 x1hiVal = _mm256_unpackhi_ps(x1Val, x1Val);
186
187 // TODO: it may be possible to rearrange swizzling to better pipeline data
188 b0Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x20); // t0|t0|t1|t1|t2|t2|t3|t3
189 b1Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x31); // t4|t4|t5|t5|t6|t6|t7|t7
190 b2Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x20);
191 b3Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x31);
192
193 dotProdVal0 = _mm256_fmadd_ps(a0Val, b0Val, dotProdVal0);
194 dotProdVal1 = _mm256_fmadd_ps(a1Val, b1Val, dotProdVal1);
195 dotProdVal2 = _mm256_fmadd_ps(a2Val, b2Val, dotProdVal2);
196 dotProdVal3 = _mm256_fmadd_ps(a3Val, b3Val, dotProdVal3);
197
198 aPtr += 32;
199 bPtr += 16;
200 }
201
202 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
203 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
204 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
205
206 __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
207
208 _mm256_store_ps(dotProductVector,
209 dotProdVal0); // Store the results back into the dot product vector
210
211 returnValue += lv_cmake(dotProductVector[0], dotProductVector[1]);
212 returnValue += lv_cmake(dotProductVector[2], dotProductVector[3]);
213 returnValue += lv_cmake(dotProductVector[4], dotProductVector[5]);
214 returnValue += lv_cmake(dotProductVector[6], dotProductVector[7]);
215
216 number = sixteenthPoints * 16;
217 lv_32fc_t returnTail = lv_cmake(0.0f, 0.0f);
219 &returnTail, input + number, bPtr, num_points - number);
220 returnValue += returnTail;
221
222 *result = returnValue;
223}
224
225#endif /*LV_HAVE_AVX2 && LV_HAVE_FMA*/
226
227#ifdef LV_HAVE_AVX
228
229#include <immintrin.h>
230
232 const lv_32fc_t* input,
233 const float* taps,
234 unsigned int num_points)
235{
236
237 unsigned int number = 0;
238 const unsigned int sixteenthPoints = num_points / 16;
239
240 lv_32fc_t returnValue = lv_cmake(0.0f, 0.0f);
241 const float* aPtr = (float*)input;
242 const float* bPtr = taps;
243
244 __m256 a0Val, a1Val, a2Val, a3Val;
245 __m256 b0Val, b1Val, b2Val, b3Val;
246 __m256 x0Val, x1Val, x0loVal, x0hiVal, x1loVal, x1hiVal;
247 __m256 c0Val, c1Val, c2Val, c3Val;
248
249 __m256 dotProdVal0 = _mm256_setzero_ps();
250 __m256 dotProdVal1 = _mm256_setzero_ps();
251 __m256 dotProdVal2 = _mm256_setzero_ps();
252 __m256 dotProdVal3 = _mm256_setzero_ps();
253
254 for (; number < sixteenthPoints; number++) {
255
256 a0Val = _mm256_load_ps(aPtr);
257 a1Val = _mm256_load_ps(aPtr + 8);
258 a2Val = _mm256_load_ps(aPtr + 16);
259 a3Val = _mm256_load_ps(aPtr + 24);
260
261 x0Val = _mm256_load_ps(bPtr); // t0|t1|t2|t3|t4|t5|t6|t7
262 x1Val = _mm256_load_ps(bPtr + 8);
263 x0loVal = _mm256_unpacklo_ps(x0Val, x0Val); // t0|t0|t1|t1|t4|t4|t5|t5
264 x0hiVal = _mm256_unpackhi_ps(x0Val, x0Val); // t2|t2|t3|t3|t6|t6|t7|t7
265 x1loVal = _mm256_unpacklo_ps(x1Val, x1Val);
266 x1hiVal = _mm256_unpackhi_ps(x1Val, x1Val);
267
268 // TODO: it may be possible to rearrange swizzling to better pipeline data
269 b0Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x20); // t0|t0|t1|t1|t2|t2|t3|t3
270 b1Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x31); // t4|t4|t5|t5|t6|t6|t7|t7
271 b2Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x20);
272 b3Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x31);
273
274 c0Val = _mm256_mul_ps(a0Val, b0Val);
275 c1Val = _mm256_mul_ps(a1Val, b1Val);
276 c2Val = _mm256_mul_ps(a2Val, b2Val);
277 c3Val = _mm256_mul_ps(a3Val, b3Val);
278
279 dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
280 dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
281 dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2);
282 dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3);
283
284 aPtr += 32;
285 bPtr += 16;
286 }
287
288 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
289 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
290 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
291
292 __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
293
294 _mm256_store_ps(dotProductVector,
295 dotProdVal0); // Store the results back into the dot product vector
296
297 returnValue += lv_cmake(dotProductVector[0], dotProductVector[1]);
298 returnValue += lv_cmake(dotProductVector[2], dotProductVector[3]);
299 returnValue += lv_cmake(dotProductVector[4], dotProductVector[5]);
300 returnValue += lv_cmake(dotProductVector[6], dotProductVector[7]);
301
302 number = sixteenthPoints * 16;
303 for (; number < num_points; number++) {
304 returnValue += lv_cmake(aPtr[0] * bPtr[0], aPtr[1] * bPtr[0]);
305 aPtr += 2;
306 bPtr += 1;
307 }
308
309 *result = returnValue;
310}
311
312#endif /*LV_HAVE_AVX*/
313
314
315#ifdef LV_HAVE_SSE
316
317
319 const lv_32fc_t* input,
320 const float* taps,
321 unsigned int num_points)
322{
323
324 unsigned int number = 0;
325 const unsigned int eighthPoints = num_points / 8;
326
327 lv_32fc_t returnValue = lv_cmake(0.0f, 0.0f);
328 const float* aPtr = (float*)input;
329 const float* bPtr = taps;
330
331 __m128 a0Val, a1Val, a2Val, a3Val;
332 __m128 b0Val, b1Val, b2Val, b3Val;
333 __m128 x0Val, x1Val, x2Val, x3Val;
334 __m128 c0Val, c1Val, c2Val, c3Val;
335
336 __m128 dotProdVal0 = _mm_setzero_ps();
337 __m128 dotProdVal1 = _mm_setzero_ps();
338 __m128 dotProdVal2 = _mm_setzero_ps();
339 __m128 dotProdVal3 = _mm_setzero_ps();
340
341 for (; number < eighthPoints; number++) {
342
343 a0Val = _mm_load_ps(aPtr);
344 a1Val = _mm_load_ps(aPtr + 4);
345 a2Val = _mm_load_ps(aPtr + 8);
346 a3Val = _mm_load_ps(aPtr + 12);
347
348 x0Val = _mm_load_ps(bPtr);
349 x1Val = _mm_load_ps(bPtr);
350 x2Val = _mm_load_ps(bPtr + 4);
351 x3Val = _mm_load_ps(bPtr + 4);
352 b0Val = _mm_unpacklo_ps(x0Val, x1Val);
353 b1Val = _mm_unpackhi_ps(x0Val, x1Val);
354 b2Val = _mm_unpacklo_ps(x2Val, x3Val);
355 b3Val = _mm_unpackhi_ps(x2Val, x3Val);
356
357 c0Val = _mm_mul_ps(a0Val, b0Val);
358 c1Val = _mm_mul_ps(a1Val, b1Val);
359 c2Val = _mm_mul_ps(a2Val, b2Val);
360 c3Val = _mm_mul_ps(a3Val, b3Val);
361
362 dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
363 dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
364 dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
365 dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
366
367 aPtr += 16;
368 bPtr += 8;
369 }
370
371 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
372 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
373 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
374
375 __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
376
377 _mm_store_ps(dotProductVector,
378 dotProdVal0); // Store the results back into the dot product vector
379
380 returnValue += lv_cmake(dotProductVector[0], dotProductVector[1]);
381 returnValue += lv_cmake(dotProductVector[2], dotProductVector[3]);
382
383 number = eighthPoints * 8;
384 for (; number < num_points; number++) {
385 returnValue += lv_cmake(aPtr[0] * bPtr[0], aPtr[1] * bPtr[0]);
386 aPtr += 2;
387 bPtr += 1;
388 }
389
390 *result = returnValue;
391}
392
393#endif /*LV_HAVE_SSE*/
394
395#ifdef LV_HAVE_AVX512F
396
397#include <immintrin.h>
398
399static inline void volk_32fc_32f_dot_prod_32fc_u_avx512f(lv_32fc_t* result,
400 const lv_32fc_t* input,
401 const float* taps,
402 unsigned int num_points)
403{
404 unsigned int number = 0;
405 const unsigned int sixteenthPoints = num_points / 16;
406
407 lv_32fc_t returnValue = lv_cmake(0.0f, 0.0f);
408 const float* aPtr = (float*)input;
409 const float* bPtr = taps;
410
411 __m512 a0Val, a1Val;
412 __m512 b0Val, b1Val;
413 __m512 xVal;
414
415 __m512 dotProdVal0 = _mm512_setzero_ps();
416 __m512 dotProdVal1 = _mm512_setzero_ps();
417
418 // Create index patterns for duplication: 0,0,1,1,2,2,3,3,...,15,15
419 const __m512i idx = _mm512_setr_epi32(0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7);
420 const __m512i idx2 =
421 _mm512_setr_epi32(8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15);
422
423 for (; number < sixteenthPoints; number++) {
424 // Load 16 complex numbers (32 floats) - unaligned
425 a0Val = _mm512_loadu_ps(aPtr); // 8 complex (I0,Q0,I1,Q1,...)
426 a1Val = _mm512_loadu_ps(aPtr + 16); // 8 complex (I8,Q8,I9,Q9,...)
427
428 // Load 16 real taps - unaligned
429 xVal = _mm512_loadu_ps(bPtr); // t0|t1|t2|...|t15
430
431 // Duplicate each tap value to match complex format using permutexvar
432 b0Val = _mm512_permutexvar_ps(idx, xVal);
433 b1Val = _mm512_permutexvar_ps(idx2, xVal);
434
435 dotProdVal0 = _mm512_fmadd_ps(a0Val, b0Val, dotProdVal0);
436 dotProdVal1 = _mm512_fmadd_ps(a1Val, b1Val, dotProdVal1);
437
438 aPtr += 32;
439 bPtr += 16;
440 }
441
442 dotProdVal0 = _mm512_add_ps(dotProdVal0, dotProdVal1);
443
444 __VOLK_ATTR_ALIGNED(64) float dotProductVector[16];
445 _mm512_store_ps(dotProductVector, dotProdVal0);
446
447 for (unsigned int i = 0; i < 16; i += 2) {
448 returnValue += lv_cmake(dotProductVector[i], dotProductVector[i + 1]);
449 }
450
451 number = sixteenthPoints * 16;
452 lv_32fc_t returnTail = lv_cmake(0.0f, 0.0f);
454 &returnTail, input + number, bPtr, num_points - number);
455 returnValue += returnTail;
456
457 *result = returnValue;
458}
459
460#endif /*LV_HAVE_AVX512F*/
461
462#if LV_HAVE_AVX2 && LV_HAVE_FMA
463
464#include <immintrin.h>
465
static inline void volk_32fc_32f_dot_prod_32fc_u_avx2_fma(lv_32fc_t* result,
                                                          const lv_32fc_t* input,
                                                          const float* taps,
                                                          unsigned int num_points)
{
    /* Unaligned AVX2+FMA kernel: *result = sum(input[k] * taps[k]) with real
     * taps.  Processes 16 complex samples (32 floats) per iteration using four
     * independent FMA accumulators; the tail is an inline scalar loop. */

    unsigned int number = 0;
    const unsigned int sixteenthPoints = num_points / 16;

    lv_32fc_t returnValue = lv_cmake(0.0f, 0.0f);
    const float* aPtr = (float*)input; /* interleaved I/Q view of the input */
    const float* bPtr = taps;

    __m256 a0Val, a1Val, a2Val, a3Val;
    __m256 b0Val, b1Val, b2Val, b3Val;
    __m256 x0Val, x1Val, x0loVal, x0hiVal, x1loVal, x1hiVal;

    __m256 dotProdVal0 = _mm256_setzero_ps();
    __m256 dotProdVal1 = _mm256_setzero_ps();
    __m256 dotProdVal2 = _mm256_setzero_ps();
    __m256 dotProdVal3 = _mm256_setzero_ps();

    for (; number < sixteenthPoints; number++) {

        /* 16 complex samples as 32 interleaved floats (unaligned loads). */
        a0Val = _mm256_loadu_ps(aPtr);
        a1Val = _mm256_loadu_ps(aPtr + 8);
        a2Val = _mm256_loadu_ps(aPtr + 16);
        a3Val = _mm256_loadu_ps(aPtr + 24);

        x0Val = _mm256_loadu_ps(bPtr); // t0|t1|t2|t3|t4|t5|t6|t7
        x1Val = _mm256_loadu_ps(bPtr + 8);
        /* Duplicate each real tap within the 128-bit lanes... */
        x0loVal = _mm256_unpacklo_ps(x0Val, x0Val); // t0|t0|t1|t1|t4|t4|t5|t5
        x0hiVal = _mm256_unpackhi_ps(x0Val, x0Val); // t2|t2|t3|t3|t6|t6|t7|t7
        x1loVal = _mm256_unpacklo_ps(x1Val, x1Val);
        x1hiVal = _mm256_unpackhi_ps(x1Val, x1Val);

        // TODO: it may be possible to rearrange swizzling to better pipeline data
        /* ...then fix the cross-lane order so taps line up with interleaved I/Q. */
        b0Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x20); // t0|t0|t1|t1|t2|t2|t3|t3
        b1Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x31); // t4|t4|t5|t5|t6|t6|t7|t7
        b2Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x20);
        b3Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x31);

        dotProdVal0 = _mm256_fmadd_ps(a0Val, b0Val, dotProdVal0);
        dotProdVal1 = _mm256_fmadd_ps(a1Val, b1Val, dotProdVal1);
        dotProdVal2 = _mm256_fmadd_ps(a2Val, b2Val, dotProdVal2);
        dotProdVal3 = _mm256_fmadd_ps(a3Val, b3Val, dotProdVal3);

        aPtr += 32;
        bPtr += 16;
    }

    /* Collapse the four accumulators, then reduce horizontally. */
    dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
    dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
    dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);

    __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];

    _mm256_store_ps(dotProductVector,
                    dotProdVal0); // Store the results back into the dot product vector

    returnValue += lv_cmake(dotProductVector[0], dotProductVector[1]);
    returnValue += lv_cmake(dotProductVector[2], dotProductVector[3]);
    returnValue += lv_cmake(dotProductVector[4], dotProductVector[5]);
    returnValue += lv_cmake(dotProductVector[6], dotProductVector[7]);

    /* Scalar tail for the remaining (num_points % 16) samples. */
    number = sixteenthPoints * 16;
    for (; number < num_points; number++) {
        returnValue += lv_cmake(aPtr[0] * bPtr[0], aPtr[1] * bPtr[0]);
        aPtr += 2;
        bPtr += 1;
    }

    *result = returnValue;
}
540
541#endif /*LV_HAVE_AVX2 && LV_HAVE_FMA*/
542
543#ifdef LV_HAVE_AVX
544
545#include <immintrin.h>
546
548 const lv_32fc_t* input,
549 const float* taps,
550 unsigned int num_points)
551{
552
553 unsigned int number = 0;
554 const unsigned int sixteenthPoints = num_points / 16;
555
556 lv_32fc_t returnValue = lv_cmake(0.0f, 0.0f);
557 const float* aPtr = (float*)input;
558 const float* bPtr = taps;
559
560 __m256 a0Val, a1Val, a2Val, a3Val;
561 __m256 b0Val, b1Val, b2Val, b3Val;
562 __m256 x0Val, x1Val, x0loVal, x0hiVal, x1loVal, x1hiVal;
563 __m256 c0Val, c1Val, c2Val, c3Val;
564
565 __m256 dotProdVal0 = _mm256_setzero_ps();
566 __m256 dotProdVal1 = _mm256_setzero_ps();
567 __m256 dotProdVal2 = _mm256_setzero_ps();
568 __m256 dotProdVal3 = _mm256_setzero_ps();
569
570 for (; number < sixteenthPoints; number++) {
571
572 a0Val = _mm256_loadu_ps(aPtr);
573 a1Val = _mm256_loadu_ps(aPtr + 8);
574 a2Val = _mm256_loadu_ps(aPtr + 16);
575 a3Val = _mm256_loadu_ps(aPtr + 24);
576
577 x0Val = _mm256_loadu_ps(bPtr); // t0|t1|t2|t3|t4|t5|t6|t7
578 x1Val = _mm256_loadu_ps(bPtr + 8);
579 x0loVal = _mm256_unpacklo_ps(x0Val, x0Val); // t0|t0|t1|t1|t4|t4|t5|t5
580 x0hiVal = _mm256_unpackhi_ps(x0Val, x0Val); // t2|t2|t3|t3|t6|t6|t7|t7
581 x1loVal = _mm256_unpacklo_ps(x1Val, x1Val);
582 x1hiVal = _mm256_unpackhi_ps(x1Val, x1Val);
583
584 // TODO: it may be possible to rearrange swizzling to better pipeline data
585 b0Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x20); // t0|t0|t1|t1|t2|t2|t3|t3
586 b1Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x31); // t4|t4|t5|t5|t6|t6|t7|t7
587 b2Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x20);
588 b3Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x31);
589
590 c0Val = _mm256_mul_ps(a0Val, b0Val);
591 c1Val = _mm256_mul_ps(a1Val, b1Val);
592 c2Val = _mm256_mul_ps(a2Val, b2Val);
593 c3Val = _mm256_mul_ps(a3Val, b3Val);
594
595 dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
596 dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
597 dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2);
598 dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3);
599
600 aPtr += 32;
601 bPtr += 16;
602 }
603
604 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
605 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
606 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
607
608 __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
609
610 _mm256_store_ps(dotProductVector,
611 dotProdVal0); // Store the results back into the dot product vector
612
613 returnValue += lv_cmake(dotProductVector[0], dotProductVector[1]);
614 returnValue += lv_cmake(dotProductVector[2], dotProductVector[3]);
615 returnValue += lv_cmake(dotProductVector[4], dotProductVector[5]);
616 returnValue += lv_cmake(dotProductVector[6], dotProductVector[7]);
617
618 number = sixteenthPoints * 16;
619 for (; number < num_points; number++) {
620 returnValue += lv_cmake(aPtr[0] * bPtr[0], aPtr[1] * bPtr[0]);
621 aPtr += 2;
622 bPtr += 1;
623 }
624
625 *result = returnValue;
626}
627#endif /*LV_HAVE_AVX*/
628
629#ifdef LV_HAVE_NEON
630#include <arm_neon.h>
631
632static inline void
634 const lv_32fc_t* __restrict input,
635 const float* __restrict taps,
636 unsigned int num_points)
637{
638
639 unsigned int number;
640 const unsigned int quarterPoints = num_points / 8;
641
642 lv_32fc_t returnValue = lv_cmake(0.0f, 0.0f);
643 const float* inputPtr = (float*)input;
644 const float* tapsPtr = taps;
645 float zero[4] = { 0.0f, 0.0f, 0.0f, 0.0f };
646 float accVector_real[4];
647 float accVector_imag[4];
648
649 float32x4x2_t inputVector0, inputVector1;
650 float32x4_t tapsVector0, tapsVector1;
651 float32x4_t tmp_real0, tmp_imag0;
652 float32x4_t tmp_real1, tmp_imag1;
653 float32x4_t real_accumulator0, imag_accumulator0;
654 float32x4_t real_accumulator1, imag_accumulator1;
655
656 // zero out accumulators
657 // take a *float, return float32x4_t
658 real_accumulator0 = vld1q_f32(zero);
659 imag_accumulator0 = vld1q_f32(zero);
660 real_accumulator1 = vld1q_f32(zero);
661 imag_accumulator1 = vld1q_f32(zero);
662
663 for (number = 0; number < quarterPoints; number++) {
664 // load doublewords and duplicate in to second lane
665 tapsVector0 = vld1q_f32(tapsPtr);
666 tapsVector1 = vld1q_f32(tapsPtr + 4);
667
668 // load quadword of complex numbers in to 2 lanes. 1st lane is real, 2dn imag
669 inputVector0 = vld2q_f32(inputPtr);
670 inputVector1 = vld2q_f32(inputPtr + 8);
671 // inputVector is now a struct of two vectors, 0th is real, 1st is imag
672
673 tmp_real0 = vmulq_f32(tapsVector0, inputVector0.val[0]);
674 tmp_imag0 = vmulq_f32(tapsVector0, inputVector0.val[1]);
675
676 tmp_real1 = vmulq_f32(tapsVector1, inputVector1.val[0]);
677 tmp_imag1 = vmulq_f32(tapsVector1, inputVector1.val[1]);
678
679 real_accumulator0 = vaddq_f32(real_accumulator0, tmp_real0);
680 imag_accumulator0 = vaddq_f32(imag_accumulator0, tmp_imag0);
681
682 real_accumulator1 = vaddq_f32(real_accumulator1, tmp_real1);
683 imag_accumulator1 = vaddq_f32(imag_accumulator1, tmp_imag1);
684
685 tapsPtr += 8;
686 inputPtr += 16;
687 }
688
689 real_accumulator0 = vaddq_f32(real_accumulator0, real_accumulator1);
690 imag_accumulator0 = vaddq_f32(imag_accumulator0, imag_accumulator1);
691 // void vst1q_f32( float32_t * ptr, float32x4_t val);
692 // store results back to a complex (array of 2 floats)
693 vst1q_f32(accVector_real, real_accumulator0);
694 vst1q_f32(accVector_imag, imag_accumulator0);
695 returnValue += lv_cmake(
696 accVector_real[0] + accVector_real[1] + accVector_real[2] + accVector_real[3],
697 accVector_imag[0] + accVector_imag[1] + accVector_imag[2] + accVector_imag[3]);
698
699 // clean up the remainder
700 for (number = quarterPoints * 8; number < num_points; number++) {
701 returnValue += lv_cmake(inputPtr[0] * tapsPtr[0], inputPtr[1] * tapsPtr[0]);
702 inputPtr += 2;
703 tapsPtr += 1;
704 }
705
706 *result = returnValue;
707}
708
709#endif /*LV_HAVE_NEON*/
710
711#ifdef LV_HAVE_NEON
712#include <arm_neon.h>
713
static inline void volk_32fc_32f_dot_prod_32fc_a_neon(lv_32fc_t* __restrict result,
                                                      const lv_32fc_t* __restrict input,
                                                      const float* __restrict taps,
                                                      unsigned int num_points)
{
    /* NEON kernel (no unrolling): *result = sum(input[k] * taps[k]) with real
     * taps.  vld2q_f32 deinterleaves the complex input so real and imaginary
     * parts accumulate in separate vector registers; 4 samples per iteration. */

    unsigned int number;
    const unsigned int quarterPoints = num_points / 4;

    lv_32fc_t returnValue = lv_cmake(0.0f, 0.0f);
    const float* inputPtr = (float*)input;
    const float* tapsPtr = taps;
    float zero[4] = { 0.0f, 0.0f, 0.0f, 0.0f };
    float accVector_real[4];
    float accVector_imag[4];

    float32x4x2_t inputVector;
    float32x4_t tapsVector;
    float32x4_t tmp_real, tmp_imag;
    float32x4_t real_accumulator, imag_accumulator;


    // zero out accumulators
    // take a *float, return float32x4_t
    real_accumulator = vld1q_f32(zero);
    imag_accumulator = vld1q_f32(zero);

    for (number = 0; number < quarterPoints; number++) {
        // load taps ( float32x2x2_t = vld1q_f32( float32_t const * ptr) )
        // load doublewords and duplicate in to second lane
        tapsVector = vld1q_f32(tapsPtr);

        // load quadword of complex numbers in to 2 lanes. 1st lane is real, 2dn imag
        inputVector = vld2q_f32(inputPtr);

        /* Real taps scale both components independently. */
        tmp_real = vmulq_f32(tapsVector, inputVector.val[0]);
        tmp_imag = vmulq_f32(tapsVector, inputVector.val[1]);

        real_accumulator = vaddq_f32(real_accumulator, tmp_real);
        imag_accumulator = vaddq_f32(imag_accumulator, tmp_imag);


        tapsPtr += 4;
        inputPtr += 8;
    }

    // store results back to a complex (array of 2 floats)
    vst1q_f32(accVector_real, real_accumulator);
    vst1q_f32(accVector_imag, imag_accumulator);
    /* Horizontal sum of the 4 lanes of each accumulator. */
    returnValue += lv_cmake(
        accVector_real[0] + accVector_real[1] + accVector_real[2] + accVector_real[3],
        accVector_imag[0] + accVector_imag[1] + accVector_imag[2] + accVector_imag[3]);

    // clean up the remainder
    for (number = quarterPoints * 4; number < num_points; number++) {
        returnValue += lv_cmake(inputPtr[0] * tapsPtr[0], inputPtr[1] * tapsPtr[0]);
        inputPtr += 2;
        tapsPtr += 1;
    }

    *result = returnValue;
}
776
777#endif /*LV_HAVE_NEON*/
778
779#ifdef LV_HAVE_NEONV8
780#include <arm_neon.h>
781
static inline void volk_32fc_32f_dot_prod_32fc_neonv8(lv_32fc_t* result,
                                                      const lv_32fc_t* input,
                                                      const float* taps,
                                                      unsigned int num_points)
{
    /* ARMv8 (AArch64) NEON kernel using fused multiply-add and vaddvq
     * horizontal reduction: *result = sum(input[k] * taps[k]) with real taps.
     * Processes 8 complex samples per iteration. */
    const unsigned int eighthPoints = num_points / 8;
    const float* inputPtr = (const float*)input;
    const float* tapsPtr = taps;

    /* Use 2 independent real/imag accumulators for FMA pipelining */
    float32x4_t real_acc0 = vdupq_n_f32(0);
    float32x4_t imag_acc0 = vdupq_n_f32(0);
    float32x4_t real_acc1 = vdupq_n_f32(0);
    float32x4_t imag_acc1 = vdupq_n_f32(0);

    for (unsigned int number = 0; number < eighthPoints; number++) {
        /* Load 8 complex values deinterleaved */
        float32x4x2_t cplx0 = vld2q_f32(inputPtr);
        float32x4x2_t cplx1 = vld2q_f32(inputPtr + 8);

        /* Load 8 real taps */
        float32x4_t taps0 = vld1q_f32(tapsPtr);
        float32x4_t taps1 = vld1q_f32(tapsPtr + 4);
        __VOLK_PREFETCH(inputPtr + 32);
        __VOLK_PREFETCH(tapsPtr + 16);

        /* FMA: acc += taps * complex */
        real_acc0 = vfmaq_f32(real_acc0, taps0, cplx0.val[0]);
        imag_acc0 = vfmaq_f32(imag_acc0, taps0, cplx0.val[1]);
        real_acc1 = vfmaq_f32(real_acc1, taps1, cplx1.val[0]);
        imag_acc1 = vfmaq_f32(imag_acc1, taps1, cplx1.val[1]);

        inputPtr += 16;
        tapsPtr += 8;
    }

    /* Combine accumulators */
    real_acc0 = vaddq_f32(real_acc0, real_acc1);
    imag_acc0 = vaddq_f32(imag_acc0, imag_acc1);

    /* Horizontal sum */
    float real_sum = vaddvq_f32(real_acc0);
    float imag_sum = vaddvq_f32(imag_acc0);

    lv_32fc_t returnValue = lv_cmake(real_sum, imag_sum);

    /* Handle remainder */
    for (unsigned int number = eighthPoints * 8; number < num_points; number++) {
        returnValue += lv_cmake(inputPtr[0] * tapsPtr[0], inputPtr[1] * tapsPtr[0]);
        inputPtr += 2;
        tapsPtr += 1;
    }

    *result = returnValue;
}
837#endif /*LV_HAVE_NEONV8*/
838
#ifdef LV_HAVE_NEONV7
/* Hand-written NEONv7 assembly kernel; definition lives in a separate .s file. */
extern void volk_32fc_32f_dot_prod_32fc_a_neonasm(lv_32fc_t* result,
                                                  const lv_32fc_t* input,
                                                  const float* taps,
                                                  unsigned int num_points);
#endif /*LV_HAVE_NEONV7*/

#ifdef LV_HAVE_NEONV7
/* NEONv7 assembly variant using the vmla (multiply-accumulate) instruction. */
extern void volk_32fc_32f_dot_prod_32fc_a_neonasmvmla(lv_32fc_t* result,
                                                      const lv_32fc_t* input,
                                                      const float* taps,
                                                      unsigned int num_points);
#endif /*LV_HAVE_NEONV7*/

#ifdef LV_HAVE_NEONV7
/* NEONv7 assembly variant scheduled for deeper pipelining. */
extern void volk_32fc_32f_dot_prod_32fc_a_neonpipeline(lv_32fc_t* result,
                                                       const lv_32fc_t* input,
                                                       const float* taps,
                                                       unsigned int num_points);
#endif /*LV_HAVE_NEONV7*/
859
860#ifdef LV_HAVE_SSE
861
863 const lv_32fc_t* input,
864 const float* taps,
865 unsigned int num_points)
866{
867
868 unsigned int number = 0;
869 const unsigned int eighthPoints = num_points / 8;
870
871 lv_32fc_t returnValue = lv_cmake(0.0f, 0.0f);
872 const float* aPtr = (float*)input;
873 const float* bPtr = taps;
874
875 __m128 a0Val, a1Val, a2Val, a3Val;
876 __m128 b0Val, b1Val, b2Val, b3Val;
877 __m128 x0Val, x1Val, x2Val, x3Val;
878 __m128 c0Val, c1Val, c2Val, c3Val;
879
880 __m128 dotProdVal0 = _mm_setzero_ps();
881 __m128 dotProdVal1 = _mm_setzero_ps();
882 __m128 dotProdVal2 = _mm_setzero_ps();
883 __m128 dotProdVal3 = _mm_setzero_ps();
884
885 for (; number < eighthPoints; number++) {
886
887 a0Val = _mm_loadu_ps(aPtr);
888 a1Val = _mm_loadu_ps(aPtr + 4);
889 a2Val = _mm_loadu_ps(aPtr + 8);
890 a3Val = _mm_loadu_ps(aPtr + 12);
891
892 x0Val = _mm_loadu_ps(bPtr);
893 x1Val = _mm_loadu_ps(bPtr);
894 x2Val = _mm_loadu_ps(bPtr + 4);
895 x3Val = _mm_loadu_ps(bPtr + 4);
896 b0Val = _mm_unpacklo_ps(x0Val, x1Val);
897 b1Val = _mm_unpackhi_ps(x0Val, x1Val);
898 b2Val = _mm_unpacklo_ps(x2Val, x3Val);
899 b3Val = _mm_unpackhi_ps(x2Val, x3Val);
900
901 c0Val = _mm_mul_ps(a0Val, b0Val);
902 c1Val = _mm_mul_ps(a1Val, b1Val);
903 c2Val = _mm_mul_ps(a2Val, b2Val);
904 c3Val = _mm_mul_ps(a3Val, b3Val);
905
906 dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
907 dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
908 dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
909 dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
910
911 aPtr += 16;
912 bPtr += 8;
913 }
914
915 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
916 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
917 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
918
919 __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
920
921 _mm_store_ps(dotProductVector,
922 dotProdVal0); // Store the results back into the dot product vector
923
924 returnValue += lv_cmake(dotProductVector[0], dotProductVector[1]);
925 returnValue += lv_cmake(dotProductVector[2], dotProductVector[3]);
926
927 number = eighthPoints * 8;
928 for (; number < num_points; number++) {
929 returnValue += lv_cmake(aPtr[0] * bPtr[0], aPtr[1] * bPtr[0]);
930 aPtr += 2;
931 bPtr += 1;
932 }
933
934 *result = returnValue;
935}
936
937#endif /*LV_HAVE_SSE*/
938
939#ifdef LV_HAVE_RVV
940#include <riscv_vector.h>
942
943static inline void volk_32fc_32f_dot_prod_32fc_rvv(lv_32fc_t* result,
944 const lv_32fc_t* input,
945 const float* taps,
946 unsigned int num_points)
947{
948 vfloat32m4_t vsumr = __riscv_vfmv_v_f_f32m4(0, __riscv_vsetvlmax_e32m4());
949 vfloat32m4_t vsumi = vsumr;
950 size_t n = num_points;
951 for (size_t vl; n > 0; n -= vl, input += vl, taps += vl) {
952 vl = __riscv_vsetvl_e32m4(n);
953 vuint64m8_t va = __riscv_vle64_v_u64m8((const uint64_t*)input, vl);
954 vfloat32m4_t vbr = __riscv_vle32_v_f32m4(taps, vl), vbi = vbr;
955 vfloat32m4_t var = __riscv_vreinterpret_f32m4(__riscv_vnsrl(va, 0, vl));
956 vfloat32m4_t vai = __riscv_vreinterpret_f32m4(__riscv_vnsrl(va, 32, vl));
957 vsumr = __riscv_vfmacc_tu(vsumr, var, vbr, vl);
958 vsumi = __riscv_vfmacc_tu(vsumi, vai, vbi, vl);
959 }
960 size_t vl = __riscv_vsetvlmax_e32m1();
961 vfloat32m1_t vr = RISCV_SHRINK4(vfadd, f, 32, vsumr);
962 vfloat32m1_t vi = RISCV_SHRINK4(vfadd, f, 32, vsumi);
963 vfloat32m1_t z = __riscv_vfmv_s_f_f32m1(0, vl);
964 *result = lv_cmake(__riscv_vfmv_f(__riscv_vfredusum(vr, z, vl)),
965 __riscv_vfmv_f(__riscv_vfredusum(vi, z, vl)));
966}
967#endif /*LV_HAVE_RVV*/
968
969#ifdef LV_HAVE_RVVSEG
970#include <riscv_vector.h>
972
973static inline void volk_32fc_32f_dot_prod_32fc_rvvseg(lv_32fc_t* result,
974 const lv_32fc_t* input,
975 const float* taps,
976 unsigned int num_points)
977{
978 vfloat32m4_t vsumr = __riscv_vfmv_v_f_f32m4(0, __riscv_vsetvlmax_e32m4());
979 vfloat32m4_t vsumi = vsumr;
980 size_t n = num_points;
981 for (size_t vl; n > 0; n -= vl, input += vl, taps += vl) {
982 vl = __riscv_vsetvl_e32m4(n);
983 vfloat32m4x2_t va = __riscv_vlseg2e32_v_f32m4x2((const float*)input, vl);
984 vfloat32m4_t var = __riscv_vget_f32m4(va, 0), vai = __riscv_vget_f32m4(va, 1);
985 vfloat32m4_t vbr = __riscv_vle32_v_f32m4(taps, vl), vbi = vbr;
986 vsumr = __riscv_vfmacc_tu(vsumr, var, vbr, vl);
987 vsumi = __riscv_vfmacc_tu(vsumi, vai, vbi, vl);
988 }
989 size_t vl = __riscv_vsetvlmax_e32m1();
990 vfloat32m1_t vr = RISCV_SHRINK4(vfadd, f, 32, vsumr);
991 vfloat32m1_t vi = RISCV_SHRINK4(vfadd, f, 32, vsumi);
992 vfloat32m1_t z = __riscv_vfmv_s_f_f32m1(0, vl);
993 *result = lv_cmake(__riscv_vfmv_f(__riscv_vfredusum(vr, z, vl)),
994 __riscv_vfmv_f(__riscv_vfredusum(vi, z, vl)));
995}
996#endif /*LV_HAVE_RVVSEG*/
997
#endif /*INCLUDED_volk_32fc_32f_dot_prod_32fc_a_H*/