Vector Optimized Library of Kernels (VOLK) 3.3.0
Architecture-tuned implementations of math kernels
Documentation for file: volk_32fc_x2_dot_prod_32fc.h
1/* -*- c++ -*- */
2/*
3 * Copyright 2012, 2013, 2014 Free Software Foundation, Inc.
4 *
5 * This file is part of VOLK
6 *
7 * SPDX-License-Identifier: LGPL-3.0-or-later
8 */
9
44
45#ifndef INCLUDED_volk_32fc_x2_dot_prod_32fc_u_H
46#define INCLUDED_volk_32fc_x2_dot_prod_32fc_u_H
47
48#include <stdio.h>
49#include <string.h>
50#include <volk/volk_common.h>
51#include <volk/volk_complex.h>
52
53
#ifdef LV_HAVE_RISCV64
/* Declaration only: defined in a separate translation unit (no body in this
 * header). Presumably a hand-tuned kernel for the SiFive U74 core — confirm
 * against the implementation file. Same contract as the other kernels:
 * *result = sum over i of input[i] * taps[i]. */
extern void volk_32fc_x2_dot_prod_32fc_sifive_u74(lv_32fc_t* result,
                                                  const lv_32fc_t* input,
                                                  const lv_32fc_t* taps,
                                                  unsigned int num_points);
#endif
60
61#ifdef LV_HAVE_GENERIC
62
63
65 const lv_32fc_t* input,
66 const lv_32fc_t* taps,
67 unsigned int num_points)
68{
69
70 float* res = (float*)result;
71 float* in = (float*)input;
72 float* tp = (float*)taps;
73 unsigned int n_2_ccomplex_blocks = num_points / 2;
74
75 float sum0[2] = { 0, 0 };
76 float sum1[2] = { 0, 0 };
77 unsigned int i = 0;
78
79 for (i = 0; i < n_2_ccomplex_blocks; ++i) {
80 sum0[0] += in[0] * tp[0] - in[1] * tp[1];
81 sum0[1] += in[0] * tp[1] + in[1] * tp[0];
82 sum1[0] += in[2] * tp[2] - in[3] * tp[3];
83 sum1[1] += in[2] * tp[3] + in[3] * tp[2];
84
85 in += 4;
86 tp += 4;
87 }
88
89 res[0] = sum0[0] + sum1[0];
90 res[1] = sum0[1] + sum1[1];
91
92 // Cleanup if we had an odd number of points
93 if (num_points & 1) {
94 *result += input[num_points - 1] * taps[num_points - 1];
95 }
96}
97
98#endif /*LV_HAVE_GENERIC*/
99
100
101#ifdef LV_HAVE_SSE3
102
103#include <pmmintrin.h>
104
106 const lv_32fc_t* input,
107 const lv_32fc_t* taps,
108 unsigned int num_points)
109{
110
111 lv_32fc_t dotProduct;
112 memset(&dotProduct, 0x0, 2 * sizeof(float));
113
114 unsigned int number = 0;
115 const unsigned int halfPoints = num_points / 2;
116 unsigned int isodd = num_points & 1;
117
118 __m128 x, y, yl, yh, z, tmp1, tmp2, dotProdVal;
119
120 const lv_32fc_t* a = input;
121 const lv_32fc_t* b = taps;
122
123 dotProdVal = _mm_setzero_ps();
124
125 for (; number < halfPoints; number++) {
126
127 x = _mm_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
128 y = _mm_loadu_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di
129
130 yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
131 yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di
132
133 tmp1 = _mm_mul_ps(x, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
134
135 x = _mm_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br
136
137 tmp2 = _mm_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
138
139 z = _mm_addsub_ps(tmp1,
140 tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
141
142 dotProdVal =
143 _mm_add_ps(dotProdVal, z); // Add the complex multiplication results together
144
145 a += 2;
146 b += 2;
147 }
148
149 __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector[2];
150
151 _mm_storeu_ps((float*)dotProductVector,
152 dotProdVal); // Store the results back into the dot product vector
153
154 dotProduct += (dotProductVector[0] + dotProductVector[1]);
155
156 if (isodd) {
157 dotProduct += input[num_points - 1] * taps[num_points - 1];
158 }
159
160 *result = dotProduct;
161}
162
163#endif /*LV_HAVE_SSE3*/
164
165#ifdef LV_HAVE_AVX
166
167#include <immintrin.h>
168
170 const lv_32fc_t* input,
171 const lv_32fc_t* taps,
172 unsigned int num_points)
173{
174
175 unsigned int isodd = num_points & 3;
176 unsigned int i = 0;
177 lv_32fc_t dotProduct;
178 memset(&dotProduct, 0x0, 2 * sizeof(float));
179
180 unsigned int number = 0;
181 const unsigned int quarterPoints = num_points / 4;
182
183 __m256 x, y, yl, yh, z, tmp1, tmp2, dotProdVal;
184
185 const lv_32fc_t* a = input;
186 const lv_32fc_t* b = taps;
187
188 dotProdVal = _mm256_setzero_ps();
189
190 for (; number < quarterPoints; number++) {
191 x = _mm256_loadu_ps((float*)a); // Load a,b,e,f as ar,ai,br,bi,er,ei,fr,fi
192 y = _mm256_loadu_ps((float*)b); // Load c,d,g,h as cr,ci,dr,di,gr,gi,hr,hi
193
194 yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr,gr,gr,hr,hr
195 yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di,gi,gi,hi,hi
196
197 tmp1 = _mm256_mul_ps(x, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr ...
198
199 x = _mm256_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br,ei,er,fi,fr
200
201 tmp2 = _mm256_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ...
202
203 z = _mm256_addsub_ps(tmp1,
204 tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
205
206 dotProdVal = _mm256_add_ps(dotProdVal,
207 z); // Add the complex multiplication results together
208
209 a += 4;
210 b += 4;
211 }
212
213 __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector[4];
214
215 _mm256_storeu_ps((float*)dotProductVector,
216 dotProdVal); // Store the results back into the dot product vector
217
218 dotProduct += (dotProductVector[0] + dotProductVector[1] + dotProductVector[2] +
219 dotProductVector[3]);
220
221 for (i = num_points - isodd; i < num_points; i++) {
222 dotProduct += input[i] * taps[i];
223 }
224
225 *result = dotProduct;
226}
227
228#endif /*LV_HAVE_AVX*/
229
230#if LV_HAVE_AVX && LV_HAVE_FMA
231#include <immintrin.h>
232
/*!
 * \brief AVX + FMA (unaligned) complex dot product: *result = sum(input[i] * taps[i]).
 *
 * Processes four complex points per 256-bit iteration using fused
 * multiply-add/subtract; the remaining (num_points % 4) points are
 * finished with scalar complex arithmetic.
 */
static inline void volk_32fc_x2_dot_prod_32fc_u_avx_fma(lv_32fc_t* result,
                                                        const lv_32fc_t* input,
                                                        const lv_32fc_t* taps,
                                                        unsigned int num_points)
{

    unsigned int isodd = num_points & 3; // tail length, 0..3 (not strictly "odd")
    unsigned int i = 0;
    lv_32fc_t dotProduct;
    memset(&dotProduct, 0x0, 2 * sizeof(float));

    unsigned int number = 0;
    const unsigned int quarterPoints = num_points / 4;

    __m256 x, y, yl, yh, z, tmp1, tmp2, dotProdVal;

    const lv_32fc_t* a = input;
    const lv_32fc_t* b = taps;

    dotProdVal = _mm256_setzero_ps();

    for (; number < quarterPoints; number++) {

        x = _mm256_loadu_ps((float*)a); // Load a,b,e,f as ar,ai,br,bi,er,ei,fr,fi
        y = _mm256_loadu_ps((float*)b); // Load c,d,g,h as cr,ci,dr,di,gr,gi,hr,hi

        yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr,gr,gr,hr,hr
        yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di,gi,gi,hi,hi

        tmp1 = x; // keep the unshuffled x; the fmaddsub below consumes it

        x = _mm256_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br,ei,er,fi,fr

        tmp2 = _mm256_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ...

        z = _mm256_fmaddsub_ps(
            tmp1, yl, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di

        dotProdVal = _mm256_add_ps(dotProdVal,
                                   z); // Add the complex multiplication results together

        a += 4;
        b += 4;
    }

    __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector[4];

    _mm256_storeu_ps((float*)dotProductVector,
                     dotProdVal); // Store the results back into the dot product vector

    dotProduct += (dotProductVector[0] + dotProductVector[1] + dotProductVector[2] +
                   dotProductVector[3]);

    for (i = num_points - isodd; i < num_points; i++) {
        dotProduct += input[i] * taps[i];
    }

    *result = dotProduct;
}
292
293#endif /*LV_HAVE_AVX && LV_HAVE_FMA*/
294
295#endif /*INCLUDED_volk_32fc_x2_dot_prod_32fc_u_H*/
296
297#ifndef INCLUDED_volk_32fc_x2_dot_prod_32fc_a_H
298#define INCLUDED_volk_32fc_x2_dot_prod_32fc_a_H
299
300#include <stdio.h>
301#include <string.h>
302#include <volk/volk_common.h>
303#include <volk/volk_complex.h>
304
305
306#ifdef LV_HAVE_SSE3
307
308#include <pmmintrin.h>
309
311 const lv_32fc_t* input,
312 const lv_32fc_t* taps,
313 unsigned int num_points)
314{
315
316 const unsigned int num_bytes = num_points * 8;
317 unsigned int isodd = num_points & 1;
318
319 lv_32fc_t dotProduct;
320 memset(&dotProduct, 0x0, 2 * sizeof(float));
321
322 unsigned int number = 0;
323 const unsigned int halfPoints = num_bytes >> 4;
324
325 __m128 x, y, yl, yh, z, tmp1, tmp2, dotProdVal;
326
327 const lv_32fc_t* a = input;
328 const lv_32fc_t* b = taps;
329
330 dotProdVal = _mm_setzero_ps();
331
332 for (; number < halfPoints; number++) {
333
334 x = _mm_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
335 y = _mm_load_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di
336
337 yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
338 yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di
339
340 tmp1 = _mm_mul_ps(x, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
341
342 x = _mm_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br
343
344 tmp2 = _mm_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
345
346 z = _mm_addsub_ps(tmp1,
347 tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
348
349 dotProdVal =
350 _mm_add_ps(dotProdVal, z); // Add the complex multiplication results together
351
352 a += 2;
353 b += 2;
354 }
355
356 __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector[2];
357
358 _mm_store_ps((float*)dotProductVector,
359 dotProdVal); // Store the results back into the dot product vector
360
361 dotProduct += (dotProductVector[0] + dotProductVector[1]);
362
363 if (isodd) {
364 dotProduct += input[num_points - 1] * taps[num_points - 1];
365 }
366
367 *result = dotProduct;
368}
369
370#endif /*LV_HAVE_SSE3*/
371
372
373#ifdef LV_HAVE_NEON
374#include <arm_neon.h>
375
/*!
 * \brief NEON complex dot product: *result = sum(input[i] * taps[i]).
 *
 * NOTE(review): the first line of this signature was lost in extraction and
 * has been restored to the canonical VOLK kernel name.
 *
 * Uses vld2q de-interleaving loads (real lane / imag lane) and processes
 * four complex points per iteration; the remaining points are finished
 * with scalar complex arithmetic.
 */
static inline void volk_32fc_x2_dot_prod_32fc_neon(lv_32fc_t* result,
                                                   const lv_32fc_t* input,
                                                   const lv_32fc_t* taps,
                                                   unsigned int num_points)
{

    unsigned int quarter_points = num_points / 4;
    unsigned int number;

    lv_32fc_t* a_ptr = (lv_32fc_t*)taps;
    lv_32fc_t* b_ptr = (lv_32fc_t*)input;
    // for 2-lane vectors, 1st lane holds the real part,
    // 2nd lane holds the imaginary part
    float32x4x2_t a_val, b_val, c_val, accumulator;
    float32x4x2_t tmp_real, tmp_imag;
    accumulator.val[0] = vdupq_n_f32(0);
    accumulator.val[1] = vdupq_n_f32(0);

    for (number = 0; number < quarter_points; ++number) {
        a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
        b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
        __VOLK_PREFETCH(a_ptr + 8);
        __VOLK_PREFETCH(b_ptr + 8);

        // multiply the real*real and imag*imag to get real result
        // a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r
        tmp_real.val[0] = vmulq_f32(a_val.val[0], b_val.val[0]);
        // a0i*b0i|a1i*b1i|a2i*b2i|a3i*b3i
        tmp_real.val[1] = vmulq_f32(a_val.val[1], b_val.val[1]);

        // Multiply cross terms to get the imaginary result
        // a0r*b0i|a1r*b1i|a2r*b2i|a3r*b3i
        tmp_imag.val[0] = vmulq_f32(a_val.val[0], b_val.val[1]);
        // a0i*b0r|a1i*b1r|a2i*b2r|a3i*b3r
        tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]);

        c_val.val[0] = vsubq_f32(tmp_real.val[0], tmp_real.val[1]);
        c_val.val[1] = vaddq_f32(tmp_imag.val[0], tmp_imag.val[1]);

        accumulator.val[0] = vaddq_f32(accumulator.val[0], c_val.val[0]);
        accumulator.val[1] = vaddq_f32(accumulator.val[1], c_val.val[1]);

        a_ptr += 4;
        b_ptr += 4;
    }
    lv_32fc_t accum_result[4];
    vst2q_f32((float*)accum_result, accumulator);
    *result = accum_result[0] + accum_result[1] + accum_result[2] + accum_result[3];

    // tail case
    for (number = quarter_points * 4; number < num_points; ++number) {
        *result += (*a_ptr++) * (*b_ptr++);
    }
}
430#endif /*LV_HAVE_NEON*/
431
432#ifdef LV_HAVE_NEON
433#include <arm_neon.h>
/*!
 * \brief NEON complex dot product using multiply-accumulate/subtract.
 *
 * NOTE(review): the first line of this signature was lost in extraction and
 * has been restored to the canonical VOLK kernel name.
 *
 * Same contract as volk_32fc_x2_dot_prod_32fc_neon, but uses vmlaq/vmlsq to
 * fold the cross-term multiplies into the first products.
 */
static inline void volk_32fc_x2_dot_prod_32fc_neon_opttests(lv_32fc_t* result,
                                                            const lv_32fc_t* input,
                                                            const lv_32fc_t* taps,
                                                            unsigned int num_points)
{

    unsigned int quarter_points = num_points / 4;
    unsigned int number;

    lv_32fc_t* a_ptr = (lv_32fc_t*)taps;
    lv_32fc_t* b_ptr = (lv_32fc_t*)input;
    // for 2-lane vectors, 1st lane holds the real part,
    // 2nd lane holds the imaginary part
    float32x4x2_t a_val, b_val, accumulator;
    float32x4x2_t tmp_imag;
    accumulator.val[0] = vdupq_n_f32(0);
    accumulator.val[1] = vdupq_n_f32(0);

    for (number = 0; number < quarter_points; ++number) {
        a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
        b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
        __VOLK_PREFETCH(a_ptr + 8);
        __VOLK_PREFETCH(b_ptr + 8);

        // do the first multiply
        tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]);
        tmp_imag.val[0] = vmulq_f32(a_val.val[0], b_val.val[0]);

        // use multiply accumulate/subtract to get result
        tmp_imag.val[1] = vmlaq_f32(tmp_imag.val[1], a_val.val[0], b_val.val[1]);
        tmp_imag.val[0] = vmlsq_f32(tmp_imag.val[0], a_val.val[1], b_val.val[1]);

        accumulator.val[0] = vaddq_f32(accumulator.val[0], tmp_imag.val[0]);
        accumulator.val[1] = vaddq_f32(accumulator.val[1], tmp_imag.val[1]);

        // increment pointers
        a_ptr += 4;
        b_ptr += 4;
    }
    lv_32fc_t accum_result[4];
    vst2q_f32((float*)accum_result, accumulator);
    *result = accum_result[0] + accum_result[1] + accum_result[2] + accum_result[3];

    // tail case
    for (number = quarter_points * 4; number < num_points; ++number) {
        *result += (*a_ptr++) * (*b_ptr++);
    }
}
482#endif /*LV_HAVE_NEON*/
483
484#ifdef LV_HAVE_NEON
/*!
 * \brief NEON complex dot product accumulating directly with vmla/vmls.
 *
 * NOTE(review): the first line of this signature was lost in extraction and
 * has been restored to the canonical VOLK kernel name.
 *
 * Same contract as volk_32fc_x2_dot_prod_32fc_neon; uses two accumulator
 * sets so the four multiply-accumulates per iteration are independent.
 */
static inline void volk_32fc_x2_dot_prod_32fc_neon_optfma(lv_32fc_t* result,
                                                          const lv_32fc_t* input,
                                                          const lv_32fc_t* taps,
                                                          unsigned int num_points)
{

    unsigned int quarter_points = num_points / 4;
    unsigned int number;

    lv_32fc_t* a_ptr = (lv_32fc_t*)taps;
    lv_32fc_t* b_ptr = (lv_32fc_t*)input;
    // for 2-lane vectors, 1st lane holds the real part,
    // 2nd lane holds the imaginary part
    float32x4x2_t a_val, b_val, accumulator1, accumulator2;
    accumulator1.val[0] = vdupq_n_f32(0);
    accumulator1.val[1] = vdupq_n_f32(0);
    accumulator2.val[0] = vdupq_n_f32(0);
    accumulator2.val[1] = vdupq_n_f32(0);

    for (number = 0; number < quarter_points; ++number) {
        a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
        b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
        __VOLK_PREFETCH(a_ptr + 8);
        __VOLK_PREFETCH(b_ptr + 8);

        // use 2 accumulators to remove inter-instruction data dependencies
        accumulator1.val[0] = vmlaq_f32(accumulator1.val[0], a_val.val[0], b_val.val[0]);
        accumulator1.val[1] = vmlaq_f32(accumulator1.val[1], a_val.val[0], b_val.val[1]);
        accumulator2.val[0] = vmlsq_f32(accumulator2.val[0], a_val.val[1], b_val.val[1]);
        accumulator2.val[1] = vmlaq_f32(accumulator2.val[1], a_val.val[1], b_val.val[0]);
        // increment pointers
        a_ptr += 4;
        b_ptr += 4;
    }
    accumulator1.val[0] = vaddq_f32(accumulator1.val[0], accumulator2.val[0]);
    accumulator1.val[1] = vaddq_f32(accumulator1.val[1], accumulator2.val[1]);
    lv_32fc_t accum_result[4];
    vst2q_f32((float*)accum_result, accumulator1);
    *result = accum_result[0] + accum_result[1] + accum_result[2] + accum_result[3];

    // tail case
    for (number = quarter_points * 4; number < num_points; ++number) {
        *result += (*a_ptr++) * (*b_ptr++);
    }
}
530#endif /*LV_HAVE_NEON*/
531
532#ifdef LV_HAVE_NEON
/*!
 * \brief NEON complex dot product, 8 points per iteration via vld4q unrolling.
 *
 * NOTE(review): the first line of this signature was lost in extraction and
 * has been restored to the canonical VOLK kernel name.
 *
 * Same contract as volk_32fc_x2_dot_prod_32fc_neon. Despite the name,
 * `quarter_points` below counts blocks of EIGHT complex points.
 */
static inline void volk_32fc_x2_dot_prod_32fc_neon_optfmaunroll(lv_32fc_t* result,
                                                                const lv_32fc_t* input,
                                                                const lv_32fc_t* taps,
                                                                unsigned int num_points)
{
    // NOTE: GCC does a poor job with this kernel, but the equivalent ASM code is very
    // fast

    unsigned int quarter_points = num_points / 8;
    unsigned int number;

    lv_32fc_t* a_ptr = (lv_32fc_t*)taps;
    lv_32fc_t* b_ptr = (lv_32fc_t*)input;
    // vld4q de-interleaves by 4, so val[0]/val[2] hold real parts and
    // val[1]/val[3] hold imaginary parts of alternating complex points
    float32x4x4_t a_val, b_val, accumulator1, accumulator2;
    float32x4x2_t reduced_accumulator;
    accumulator1.val[0] = vdupq_n_f32(0);
    accumulator1.val[1] = vdupq_n_f32(0);
    accumulator1.val[2] = vdupq_n_f32(0);
    accumulator1.val[3] = vdupq_n_f32(0);
    accumulator2.val[0] = vdupq_n_f32(0);
    accumulator2.val[1] = vdupq_n_f32(0);
    accumulator2.val[2] = vdupq_n_f32(0);
    accumulator2.val[3] = vdupq_n_f32(0);

    // 8 input regs, 8 accumulators -> 16/16 neon regs are used
    for (number = 0; number < quarter_points; ++number) {
        a_val = vld4q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
        b_val = vld4q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
        __VOLK_PREFETCH(a_ptr + 8);
        __VOLK_PREFETCH(b_ptr + 8);

        // use 2 accumulators to remove inter-instruction data dependencies
        accumulator1.val[0] = vmlaq_f32(accumulator1.val[0], a_val.val[0], b_val.val[0]);
        accumulator1.val[1] = vmlaq_f32(accumulator1.val[1], a_val.val[0], b_val.val[1]);

        accumulator1.val[2] = vmlaq_f32(accumulator1.val[2], a_val.val[2], b_val.val[2]);
        accumulator1.val[3] = vmlaq_f32(accumulator1.val[3], a_val.val[2], b_val.val[3]);

        accumulator2.val[0] = vmlsq_f32(accumulator2.val[0], a_val.val[1], b_val.val[1]);
        accumulator2.val[1] = vmlaq_f32(accumulator2.val[1], a_val.val[1], b_val.val[0]);

        accumulator2.val[2] = vmlsq_f32(accumulator2.val[2], a_val.val[3], b_val.val[3]);
        accumulator2.val[3] = vmlaq_f32(accumulator2.val[3], a_val.val[3], b_val.val[2]);
        // increment pointers
        a_ptr += 8;
        b_ptr += 8;
    }
    // reduce 8 accumulator lanes down to 2 (1 real and 1 imag)
    accumulator1.val[0] = vaddq_f32(accumulator1.val[0], accumulator1.val[2]);
    accumulator1.val[1] = vaddq_f32(accumulator1.val[1], accumulator1.val[3]);
    accumulator2.val[0] = vaddq_f32(accumulator2.val[0], accumulator2.val[2]);
    accumulator2.val[1] = vaddq_f32(accumulator2.val[1], accumulator2.val[3]);
    reduced_accumulator.val[0] = vaddq_f32(accumulator1.val[0], accumulator2.val[0]);
    reduced_accumulator.val[1] = vaddq_f32(accumulator1.val[1], accumulator2.val[1]);
    // now reduce accumulators to scalars
    lv_32fc_t accum_result[4];
    vst2q_f32((float*)accum_result, reduced_accumulator);
    *result = accum_result[0] + accum_result[1] + accum_result[2] + accum_result[3];

    // tail case
    for (number = quarter_points * 8; number < num_points; ++number) {
        *result += (*a_ptr++) * (*b_ptr++);
    }
}
599#endif /*LV_HAVE_NEON*/
600
601
602#ifdef LV_HAVE_NEONV8
603#include <arm_neon.h>
604
/*!
 * \brief ARMv8 NEON complex dot product using fused multiply-add/subtract.
 *
 * *result = sum over i of input[i] * taps[i]. Processes 8 complex points per
 * main-loop iteration, then up to 4 more vectorized, then a scalar tail.
 */
static inline void volk_32fc_x2_dot_prod_32fc_neonv8(lv_32fc_t* result,
                                                     const lv_32fc_t* input,
                                                     const lv_32fc_t* taps,
                                                     unsigned int num_points)
{
    unsigned int n = num_points;
    const lv_32fc_t* a = input;
    const lv_32fc_t* b = taps;

    /* Use 4 accumulators to break data dependencies */
    float32x4_t acc0_r = vdupq_n_f32(0);
    float32x4_t acc0_i = vdupq_n_f32(0);
    float32x4_t acc1_r = vdupq_n_f32(0);
    float32x4_t acc1_i = vdupq_n_f32(0);

    /* Process 8 complex numbers per iteration (2x unroll) */
    while (n >= 8) {
        /* vld2q de-interleaves: val[0] = real lanes, val[1] = imag lanes */
        float32x4x2_t a0 = vld2q_f32((const float*)a);
        float32x4x2_t b0 = vld2q_f32((const float*)b);
        float32x4x2_t a1 = vld2q_f32((const float*)(a + 4));
        float32x4x2_t b1 = vld2q_f32((const float*)(b + 4));
        __VOLK_PREFETCH(a + 8);
        __VOLK_PREFETCH(b + 8);

        /* Complex dot product accumulation using FMA:
         * real += ar*br - ai*bi
         * imag += ar*bi + ai*br
         */
        acc0_r = vfmaq_f32(acc0_r, a0.val[0], b0.val[0]); /* ar*br */
        acc0_r = vfmsq_f32(acc0_r, a0.val[1], b0.val[1]); /* - ai*bi */
        acc0_i = vfmaq_f32(acc0_i, a0.val[0], b0.val[1]); /* ar*bi */
        acc0_i = vfmaq_f32(acc0_i, a0.val[1], b0.val[0]); /* + ai*br */

        acc1_r = vfmaq_f32(acc1_r, a1.val[0], b1.val[0]);
        acc1_r = vfmsq_f32(acc1_r, a1.val[1], b1.val[1]);
        acc1_i = vfmaq_f32(acc1_i, a1.val[0], b1.val[1]);
        acc1_i = vfmaq_f32(acc1_i, a1.val[1], b1.val[0]);

        a += 8;
        b += 8;
        n -= 8;
    }

    /* Process remaining 4 */
    if (n >= 4) {
        float32x4x2_t a0 = vld2q_f32((const float*)a);
        float32x4x2_t b0 = vld2q_f32((const float*)b);

        acc0_r = vfmaq_f32(acc0_r, a0.val[0], b0.val[0]);
        acc0_r = vfmsq_f32(acc0_r, a0.val[1], b0.val[1]);
        acc0_i = vfmaq_f32(acc0_i, a0.val[0], b0.val[1]);
        acc0_i = vfmaq_f32(acc0_i, a0.val[1], b0.val[0]);

        a += 4;
        b += 4;
        n -= 4;
    }

    /* Combine accumulators */
    acc0_r = vaddq_f32(acc0_r, acc1_r);
    acc0_i = vaddq_f32(acc0_i, acc1_i);

    /* Horizontal sum using pairwise add */
    float32x2_t sum_r = vadd_f32(vget_low_f32(acc0_r), vget_high_f32(acc0_r));
    float32x2_t sum_i = vadd_f32(vget_low_f32(acc0_i), vget_high_f32(acc0_i));
    sum_r = vpadd_f32(sum_r, sum_r);
    sum_i = vpadd_f32(sum_i, sum_i);

    float res_r = vget_lane_f32(sum_r, 0);
    float res_i = vget_lane_f32(sum_i, 0);

    /* Scalar tail */
    while (n > 0) {
        res_r += lv_creal(*a) * lv_creal(*b) - lv_cimag(*a) * lv_cimag(*b);
        res_i += lv_creal(*a) * lv_cimag(*b) + lv_cimag(*a) * lv_creal(*b);
        a++;
        b++;
        n--;
    }

    *result = lv_cmake(res_r, res_i);
}
687
688#endif /* LV_HAVE_NEONV8 */
689
690
691#ifdef LV_HAVE_AVX
692
693#include <immintrin.h>
694
696 const lv_32fc_t* input,
697 const lv_32fc_t* taps,
698 unsigned int num_points)
699{
700
701 unsigned int isodd = num_points & 3;
702 unsigned int i = 0;
703 lv_32fc_t dotProduct;
704 memset(&dotProduct, 0x0, 2 * sizeof(float));
705
706 unsigned int number = 0;
707 const unsigned int quarterPoints = num_points / 4;
708
709 __m256 x, y, yl, yh, z, tmp1, tmp2, dotProdVal;
710
711 const lv_32fc_t* a = input;
712 const lv_32fc_t* b = taps;
713
714 dotProdVal = _mm256_setzero_ps();
715
716 for (; number < quarterPoints; number++) {
717
718 x = _mm256_load_ps((float*)a); // Load a,b,e,f as ar,ai,br,bi,er,ei,fr,fi
719 y = _mm256_load_ps((float*)b); // Load c,d,g,h as cr,ci,dr,di,gr,gi,hr,hi
720
721 yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr,gr,gr,hr,hr
722 yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di,gi,gi,hi,hi
723
724 tmp1 = _mm256_mul_ps(x, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr ...
725
726 x = _mm256_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br,ei,er,fi,fr
727
728 tmp2 = _mm256_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ...
729
730 z = _mm256_addsub_ps(tmp1,
731 tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
732
733 dotProdVal = _mm256_add_ps(dotProdVal,
734 z); // Add the complex multiplication results together
735
736 a += 4;
737 b += 4;
738 }
739
740 __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector[4];
741
742 _mm256_store_ps((float*)dotProductVector,
743 dotProdVal); // Store the results back into the dot product vector
744
745 dotProduct += (dotProductVector[0] + dotProductVector[1] + dotProductVector[2] +
746 dotProductVector[3]);
747
748 for (i = num_points - isodd; i < num_points; i++) {
749 dotProduct += input[i] * taps[i];
750 }
751
752 *result = dotProduct;
753}
754
755#endif /*LV_HAVE_AVX*/
756
757#if LV_HAVE_AVX && LV_HAVE_FMA
758#include <immintrin.h>
759
/*!
 * \brief AVX + FMA (aligned) complex dot product: *result = sum(input[i] * taps[i]).
 *
 * Requires 32-byte aligned input and taps. Processes four complex points per
 * 256-bit iteration using fused multiply-add/subtract; the remaining
 * (num_points % 4) points are finished with scalar complex arithmetic.
 */
static inline void volk_32fc_x2_dot_prod_32fc_a_avx_fma(lv_32fc_t* result,
                                                        const lv_32fc_t* input,
                                                        const lv_32fc_t* taps,
                                                        unsigned int num_points)
{

    unsigned int isodd = num_points & 3; // tail length, 0..3 (not strictly "odd")
    unsigned int i = 0;
    lv_32fc_t dotProduct;
    memset(&dotProduct, 0x0, 2 * sizeof(float));

    unsigned int number = 0;
    const unsigned int quarterPoints = num_points / 4;

    __m256 x, y, yl, yh, z, tmp1, tmp2, dotProdVal;

    const lv_32fc_t* a = input;
    const lv_32fc_t* b = taps;

    dotProdVal = _mm256_setzero_ps();

    for (; number < quarterPoints; number++) {

        x = _mm256_load_ps((float*)a); // Load a,b,e,f as ar,ai,br,bi,er,ei,fr,fi
        y = _mm256_load_ps((float*)b); // Load c,d,g,h as cr,ci,dr,di,gr,gi,hr,hi

        yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr,gr,gr,hr,hr
        yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di,gi,gi,hi,hi

        tmp1 = x; // keep the unshuffled x; the fmaddsub below consumes it

        x = _mm256_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br,ei,er,fi,fr

        tmp2 = _mm256_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ...

        z = _mm256_fmaddsub_ps(
            tmp1, yl, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di

        dotProdVal = _mm256_add_ps(dotProdVal,
                                   z); // Add the complex multiplication results together

        a += 4;
        b += 4;
    }

    __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector[4];

    _mm256_store_ps((float*)dotProductVector,
                    dotProdVal); // Store the results back into the dot product vector

    dotProduct += (dotProductVector[0] + dotProductVector[1] + dotProductVector[2] +
                   dotProductVector[3]);

    for (i = num_points - isodd; i < num_points; i++) {
        dotProduct += input[i] * taps[i];
    }

    *result = dotProduct;
}
819
820#endif /*LV_HAVE_AVX && LV_HAVE_FMA*/
821
822#ifdef LV_HAVE_RVV
823#include <riscv_vector.h>
825
/*!
 * \brief RISC-V Vector complex dot product: *result = sum(input[i] * taps[i]).
 *
 * Loads each complex sample as one 64-bit element and splits it with narrowing
 * shifts (vnsrl by 0 -> low 32 bits, by 32 -> high 32 bits; on a little-endian
 * layout these are the real and imaginary parts respectively). Accumulates with
 * tail-undisturbed adds so lanes beyond the last vl keep their partial sums,
 * then tree-reduces (RISCV_SHRINK2) and does a final unordered sum reduction.
 */
static inline void volk_32fc_x2_dot_prod_32fc_rvv(lv_32fc_t* result,
                                                  const lv_32fc_t* input,
                                                  const lv_32fc_t* taps,
                                                  unsigned int num_points)
{
    vfloat32m2_t vsumr = __riscv_vfmv_v_f_f32m2(0, __riscv_vsetvlmax_e32m2());
    vfloat32m2_t vsumi = vsumr;
    size_t n = num_points;
    for (size_t vl; n > 0; n -= vl, input += vl, taps += vl) {
        vl = __riscv_vsetvl_e32m2(n);
        vuint64m4_t va = __riscv_vle64_v_u64m4((const uint64_t*)input, vl);
        vuint64m4_t vb = __riscv_vle64_v_u64m4((const uint64_t*)taps, vl);
        vfloat32m2_t var = __riscv_vreinterpret_f32m2(__riscv_vnsrl(va, 0, vl));
        vfloat32m2_t vbr = __riscv_vreinterpret_f32m2(__riscv_vnsrl(vb, 0, vl));
        vfloat32m2_t vai = __riscv_vreinterpret_f32m2(__riscv_vnsrl(va, 32, vl));
        vfloat32m2_t vbi = __riscv_vreinterpret_f32m2(__riscv_vnsrl(vb, 32, vl));
        // real = ar*br - ai*bi ; imag = ar*bi + ai*br
        vfloat32m2_t vr = __riscv_vfnmsac(__riscv_vfmul(var, vbr, vl), vai, vbi, vl);
        vfloat32m2_t vi = __riscv_vfmacc(__riscv_vfmul(var, vbi, vl), vai, vbr, vl);
        vsumr = __riscv_vfadd_tu(vsumr, vsumr, vr, vl);
        vsumi = __riscv_vfadd_tu(vsumi, vsumi, vi, vl);
    }
    size_t vl = __riscv_vsetvlmax_e32m1();
    vfloat32m1_t vr = RISCV_SHRINK2(vfadd, f, 32, vsumr);
    vfloat32m1_t vi = RISCV_SHRINK2(vfadd, f, 32, vsumi);
    vfloat32m1_t z = __riscv_vfmv_s_f_f32m1(0, vl);
    *result = lv_cmake(__riscv_vfmv_f(__riscv_vfredusum(vr, z, vl)),
                       __riscv_vfmv_f(__riscv_vfredusum(vi, z, vl)));
}
854#endif /*LV_HAVE_RVV*/
855
856#ifdef LV_HAVE_RVVSEG
857#include <riscv_vector.h>
859
/*!
 * \brief RISC-V Vector (segmented-load) complex dot product.
 *
 * Same contract as volk_32fc_x2_dot_prod_32fc_rvv, but uses vlseg2e32
 * segment loads to de-interleave real/imag lanes directly instead of
 * 64-bit loads plus narrowing shifts. Accumulates with tail-undisturbed
 * adds, then tree-reduces (RISCV_SHRINK4) and sums each lane group.
 */
static inline void volk_32fc_x2_dot_prod_32fc_rvvseg(lv_32fc_t* result,
                                                     const lv_32fc_t* input,
                                                     const lv_32fc_t* taps,
                                                     unsigned int num_points)
{
    vfloat32m4_t vsumr = __riscv_vfmv_v_f_f32m4(0, __riscv_vsetvlmax_e32m4());
    vfloat32m4_t vsumi = vsumr;
    size_t n = num_points;
    for (size_t vl; n > 0; n -= vl, input += vl, taps += vl) {
        vl = __riscv_vsetvl_e32m4(n);
        vfloat32m4x2_t va = __riscv_vlseg2e32_v_f32m4x2((const float*)input, vl);
        vfloat32m4x2_t vb = __riscv_vlseg2e32_v_f32m4x2((const float*)taps, vl);
        vfloat32m4_t var = __riscv_vget_f32m4(va, 0), vai = __riscv_vget_f32m4(va, 1);
        vfloat32m4_t vbr = __riscv_vget_f32m4(vb, 0), vbi = __riscv_vget_f32m4(vb, 1);
        // real = ar*br - ai*bi ; imag = ar*bi + ai*br
        vfloat32m4_t vr = __riscv_vfnmsac(__riscv_vfmul(var, vbr, vl), vai, vbi, vl);
        vfloat32m4_t vi = __riscv_vfmacc(__riscv_vfmul(var, vbi, vl), vai, vbr, vl);
        vsumr = __riscv_vfadd_tu(vsumr, vsumr, vr, vl);
        vsumi = __riscv_vfadd_tu(vsumi, vsumi, vi, vl);
    }
    size_t vl = __riscv_vsetvlmax_e32m1();
    vfloat32m1_t vr = RISCV_SHRINK4(vfadd, f, 32, vsumr);
    vfloat32m1_t vi = RISCV_SHRINK4(vfadd, f, 32, vsumi);
    vfloat32m1_t z = __riscv_vfmv_s_f_f32m1(0, vl);
    *result = lv_cmake(__riscv_vfmv_f(__riscv_vfredusum(vr, z, vl)),
                       __riscv_vfmv_f(__riscv_vfredusum(vi, z, vl)));
}
886#endif /*LV_HAVE_RVVSEG*/
887
888#endif /*INCLUDED_volk_32fc_x2_dot_prod_32fc_a_H*/