/* -*- c++ -*- */
/*
 * Copyright 2012, 2014 Free Software Foundation, Inc.
 *
 * This file is part of VOLK
 *
 * SPDX-License-Identifier: LGPL-3.0-or-later
 */

/*
 * volk_16i_32fc_dot_prod_32fc.h (VOLK 3.3.0): dot product of a real int16
 * vector with a vector of complex float taps, producing one complex float.
 * (The Doxygen usage-comment block, original lines 10-43, was omitted in
 * this extract.)
 */

#ifndef INCLUDED_volk_16i_32fc_dot_prod_32fc_H
#define INCLUDED_volk_16i_32fc_dot_prod_32fc_H

#include <stdio.h>
#include <volk/volk_common.h>
51
#ifdef LV_HAVE_GENERIC

/* Portable reference implementation: result = sum_i taps[i] * input[i].
 * NOTE(review): the signature line was dropped by the doc extraction; the
 * name below follows the VOLK <kernel>_generic convention - confirm against
 * upstream. */
static inline void volk_16i_32fc_dot_prod_32fc_generic(lv_32fc_t* result,
                                                       const short* input,
                                                       const lv_32fc_t* taps,
                                                       unsigned int num_points)
{
    /* Four independent accumulators break the floating-point add dependency
     * chain so the compiler can pipeline (and possibly vectorize) the loop. */
    static const int N_UNROLL = 4;

    lv_32fc_t acc0 = 0;
    lv_32fc_t acc1 = 0;
    lv_32fc_t acc2 = 0;
    lv_32fc_t acc3 = 0;

    unsigned i = 0;
    unsigned n = (num_points / N_UNROLL) * N_UNROLL;

    for (i = 0; i < n; i += N_UNROLL) {
        acc0 += taps[i + 0] * (float)input[i + 0];
        acc1 += taps[i + 1] * (float)input[i + 1];
        acc2 += taps[i + 2] * (float)input[i + 2];
        acc3 += taps[i + 3] * (float)input[i + 3];
    }

    /* Scalar tail when num_points is not a multiple of N_UNROLL. */
    for (; i < num_points; i++) {
        acc0 += taps[i] * (float)input[i];
    }

    *result = acc0 + acc1 + acc2 + acc3;
}

#endif /*LV_HAVE_GENERIC*/
85
#ifdef LV_HAVE_NEON
#include <arm_neon.h>

/* ARMv7 NEON implementation: processes 4 samples per iteration.
 * NOTE(review): the signature line was dropped by the doc extraction; the
 * name below follows the VOLK <kernel>_neon convention - confirm against
 * upstream. */
static inline void volk_16i_32fc_dot_prod_32fc_neon(lv_32fc_t* result,
                                                    const short* input,
                                                    const lv_32fc_t* taps,
                                                    unsigned int num_points)
{
    unsigned ii;
    unsigned quarter_points = num_points / 4;
    lv_32fc_t* tapsPtr = (lv_32fc_t*)taps;
    short* inputPtr = (short*)input;
    lv_32fc_t accumulator_vec[4];

    float32x4x2_t tapsVal, accumulator_val;
    int16x4_t input16;
    int32x4_t input32;
    float32x4_t input_float, prod_re, prod_im;

    accumulator_val.val[0] = vdupq_n_f32(0.0); /* real partial sums */
    accumulator_val.val[1] = vdupq_n_f32(0.0); /* imag partial sums */

    for (ii = 0; ii < quarter_points; ++ii) {
        /* De-interleave 4 complex taps into real/imag lanes. */
        tapsVal = vld2q_f32((float*)tapsPtr);
        input16 = vld1_s16(inputPtr);
        /* widen 16-bit int to 32-bit int */
        input32 = vmovl_s16(input16);
        /* convert 32-bit int to float (no scaling) */
        input_float = vcvtq_f32_s32(input32);

        prod_re = vmulq_f32(input_float, tapsVal.val[0]);
        prod_im = vmulq_f32(input_float, tapsVal.val[1]);

        accumulator_val.val[0] = vaddq_f32(prod_re, accumulator_val.val[0]);
        accumulator_val.val[1] = vaddq_f32(prod_im, accumulator_val.val[1]);

        tapsPtr += 4;
        inputPtr += 4;
    }
    /* Re-interleave the per-lane (re, im) partial sums as 4 complex values
     * and fold them into a single complex total. */
    vst2q_f32((float*)accumulator_vec, accumulator_val);
    accumulator_vec[0] += accumulator_vec[1];
    accumulator_vec[2] += accumulator_vec[3];
    accumulator_vec[0] += accumulator_vec[2];

    /* Scalar tail when num_points is not a multiple of 4. */
    for (ii = quarter_points * 4; ii < num_points; ++ii) {
        accumulator_vec[0] += *(tapsPtr++) * (float)(*(inputPtr++));
    }

    *result = accumulator_vec[0];
}

#endif /*LV_HAVE_NEON*/
138
#ifdef LV_HAVE_NEONV8
#include <arm_neon.h>

/* AArch64 (ARMv8) implementation: processes 8 samples per iteration using
 * fused multiply-add and the v8-only vaddvq_f32 horizontal reduction. */
static inline void volk_16i_32fc_dot_prod_32fc_neonv8(lv_32fc_t* result,
                                                      const short* input,
                                                      const lv_32fc_t* taps,
                                                      unsigned int num_points)
{
    const unsigned int eighthPoints = num_points / 8;
    const short* inputPtr = input;
    const lv_32fc_t* tapsPtr = taps;

    /* Use 2 independent real/imag accumulators for FMA pipelining */
    float32x4_t real_acc0 = vdupq_n_f32(0);
    float32x4_t imag_acc0 = vdupq_n_f32(0);
    float32x4_t real_acc1 = vdupq_n_f32(0);
    float32x4_t imag_acc1 = vdupq_n_f32(0);

    for (unsigned int number = 0; number < eighthPoints; number++) {
        /* Load 8 int16 values and convert to float */
        int16x8_t input16 = vld1q_s16(inputPtr);
        float32x4_t input_lo = vcvtq_f32_s32(vmovl_s16(vget_low_s16(input16)));
        float32x4_t input_hi = vcvtq_f32_s32(vmovl_s16(vget_high_s16(input16)));

        /* Load 8 complex taps deinterleaved into real/imag planes */
        float32x4x2_t taps0 = vld2q_f32((const float*)tapsPtr);
        float32x4x2_t taps1 = vld2q_f32((const float*)(tapsPtr + 4));
        __VOLK_PREFETCH(inputPtr + 16);
        __VOLK_PREFETCH(tapsPtr + 16);

        /* FMA: acc += input * taps */
        real_acc0 = vfmaq_f32(real_acc0, input_lo, taps0.val[0]);
        imag_acc0 = vfmaq_f32(imag_acc0, input_lo, taps0.val[1]);
        real_acc1 = vfmaq_f32(real_acc1, input_hi, taps1.val[0]);
        imag_acc1 = vfmaq_f32(imag_acc1, input_hi, taps1.val[1]);

        inputPtr += 8;
        tapsPtr += 8;
    }

    /* Combine the two accumulator pairs */
    real_acc0 = vaddq_f32(real_acc0, real_acc1);
    imag_acc0 = vaddq_f32(imag_acc0, imag_acc1);

    /* Horizontal sum across each vector */
    float real_sum = vaddvq_f32(real_acc0);
    float imag_sum = vaddvq_f32(imag_acc0);

    lv_32fc_t returnValue = lv_cmake(real_sum, imag_sum);

    /* Scalar tail when num_points is not a multiple of 8; bPtr walks the
     * taps as interleaved (re, im) float pairs. */
    const float* bPtr = (const float*)tapsPtr;
    for (unsigned int number = eighthPoints * 8; number < num_points; number++) {
        returnValue += lv_cmake(inputPtr[0] * bPtr[0], inputPtr[0] * bPtr[1]);
        inputPtr += 1;
        bPtr += 2;
    }

    *result = returnValue;
}
#endif /*LV_HAVE_NEONV8*/
200
#if LV_HAVE_SSE && LV_HAVE_MMX

/* SSE implementation, unaligned loads: processes 8 samples per iteration.
 * Uses MMX _mm_cvtpi16_ps for the int16->float conversion, so the MMX state
 * must be cleared with _mm_empty() afterwards. */
static inline void volk_16i_32fc_dot_prod_32fc_u_sse(lv_32fc_t* result,
                                                     const short* input,
                                                     const lv_32fc_t* taps,
                                                     unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int eighthPoints = num_points / 8;

    lv_32fc_t returnValue = lv_cmake(0.0f, 0.0f);
    const short* aPtr = input;
    const float* bPtr = (float*)taps; /* taps viewed as interleaved (re, im) */

    __m64 m0, m1;
    __m128 f0, f1, f2, f3;
    __m128 a0Val, a1Val, a2Val, a3Val;
    __m128 b0Val, b1Val, b2Val, b3Val;
    __m128 c0Val, c1Val, c2Val, c3Val;

    /* Four independent accumulators to break the add dependency chain. */
    __m128 dotProdVal0 = _mm_setzero_ps();
    __m128 dotProdVal1 = _mm_setzero_ps();
    __m128 dotProdVal2 = _mm_setzero_ps();
    __m128 dotProdVal3 = _mm_setzero_ps();

    for (; number < eighthPoints; number++) {

        m0 = _mm_set_pi16(*(aPtr + 3), *(aPtr + 2), *(aPtr + 1), *(aPtr + 0));
        m1 = _mm_set_pi16(*(aPtr + 7), *(aPtr + 6), *(aPtr + 5), *(aPtr + 4));
        /* f0==f1 and f2==f3 on purpose: unpacking a vector with itself
         * duplicates every sample, so each real input lines up with the
         * (re, im) pair of its tap below. */
        f0 = _mm_cvtpi16_ps(m0);
        f1 = _mm_cvtpi16_ps(m0);
        f2 = _mm_cvtpi16_ps(m1);
        f3 = _mm_cvtpi16_ps(m1);

        a0Val = _mm_unpacklo_ps(f0, f1); /* s0 s0 s1 s1 */
        a1Val = _mm_unpackhi_ps(f0, f1); /* s2 s2 s3 s3 */
        a2Val = _mm_unpacklo_ps(f2, f3);
        a3Val = _mm_unpackhi_ps(f2, f3);

        b0Val = _mm_loadu_ps(bPtr);
        b1Val = _mm_loadu_ps(bPtr + 4);
        b2Val = _mm_loadu_ps(bPtr + 8);
        b3Val = _mm_loadu_ps(bPtr + 12);

        c0Val = _mm_mul_ps(a0Val, b0Val);
        c1Val = _mm_mul_ps(a1Val, b1Val);
        c2Val = _mm_mul_ps(a2Val, b2Val);
        c3Val = _mm_mul_ps(a3Val, b3Val);

        dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
        dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
        dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
        dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);

        aPtr += 8;
        bPtr += 16;
    }

    _mm_empty(); // clear the mmx technology state

    dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
    dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
    dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);

    __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];

    _mm_store_ps(dotProductVector,
                 dotProdVal0); // Store the results back into the dot product vector

    returnValue += lv_cmake(dotProductVector[0], dotProductVector[1]);
    returnValue += lv_cmake(dotProductVector[2], dotProductVector[3]);

    /* Scalar tail when num_points is not a multiple of 8. */
    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        returnValue += lv_cmake(aPtr[0] * bPtr[0], aPtr[0] * bPtr[1]);
        aPtr += 1;
        bPtr += 2;
    }

    *result = returnValue;
}

#endif /*LV_HAVE_SSE && LV_HAVE_MMX*/
285
286
#if LV_HAVE_AVX2 && LV_HAVE_FMA

/* AVX2+FMA implementation, unaligned loads: 16 samples per iteration. */
static inline void volk_16i_32fc_dot_prod_32fc_u_avx2_fma(lv_32fc_t* result,
                                                          const short* input,
                                                          const lv_32fc_t* taps,
                                                          unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int sixteenthPoints = num_points / 16;

    lv_32fc_t returnValue = lv_cmake(0.0f, 0.0f);
    const short* aPtr = input;
    const float* bPtr = (float*)taps; /* taps viewed as interleaved (re, im) */

    __m128i m0, m1;
    __m256i f0, f1;
    __m256 g0, g1, h0, h1, h2, h3;
    __m256 a0Val, a1Val, a2Val, a3Val;
    __m256 b0Val, b1Val, b2Val, b3Val;

    /* Four independent accumulators to break the FMA dependency chain. */
    __m256 dotProdVal0 = _mm256_setzero_ps();
    __m256 dotProdVal1 = _mm256_setzero_ps();
    __m256 dotProdVal2 = _mm256_setzero_ps();
    __m256 dotProdVal3 = _mm256_setzero_ps();

    for (; number < sixteenthPoints; number++) {

        m0 = _mm_loadu_si128((__m128i const*)aPtr);
        m1 = _mm_loadu_si128((__m128i const*)(aPtr + 8));

        /* Widen int16 -> int32 -> float. */
        f0 = _mm256_cvtepi16_epi32(m0);
        g0 = _mm256_cvtepi32_ps(f0);
        f1 = _mm256_cvtepi16_epi32(m1);
        g1 = _mm256_cvtepi32_ps(f1);

        /* Unpack each vector with itself to duplicate every sample, then
         * permute 128-bit lanes so the duplicated samples line up with the
         * interleaved (re, im) tap pairs. */
        h0 = _mm256_unpacklo_ps(g0, g0);
        h1 = _mm256_unpackhi_ps(g0, g0);
        h2 = _mm256_unpacklo_ps(g1, g1);
        h3 = _mm256_unpackhi_ps(g1, g1);

        a0Val = _mm256_permute2f128_ps(h0, h1, 0x20);
        a1Val = _mm256_permute2f128_ps(h0, h1, 0x31);
        a2Val = _mm256_permute2f128_ps(h2, h3, 0x20);
        a3Val = _mm256_permute2f128_ps(h2, h3, 0x31);

        b0Val = _mm256_loadu_ps(bPtr);
        b1Val = _mm256_loadu_ps(bPtr + 8);
        b2Val = _mm256_loadu_ps(bPtr + 16);
        b3Val = _mm256_loadu_ps(bPtr + 24);

        dotProdVal0 = _mm256_fmadd_ps(a0Val, b0Val, dotProdVal0);
        dotProdVal1 = _mm256_fmadd_ps(a1Val, b1Val, dotProdVal1);
        dotProdVal2 = _mm256_fmadd_ps(a2Val, b2Val, dotProdVal2);
        dotProdVal3 = _mm256_fmadd_ps(a3Val, b3Val, dotProdVal3);

        aPtr += 16;
        bPtr += 32;
    }

    dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
    dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
    dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);

    __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];

    _mm256_store_ps(dotProductVector,
                    dotProdVal0); // Store the results back into the dot product vector

    returnValue += lv_cmake(dotProductVector[0], dotProductVector[1]);
    returnValue += lv_cmake(dotProductVector[2], dotProductVector[3]);
    returnValue += lv_cmake(dotProductVector[4], dotProductVector[5]);
    returnValue += lv_cmake(dotProductVector[6], dotProductVector[7]);

    /* Scalar tail when num_points is not a multiple of 16. */
    number = sixteenthPoints * 16;
    for (; number < num_points; number++) {
        returnValue += lv_cmake(aPtr[0] * bPtr[0], aPtr[0] * bPtr[1]);
        aPtr += 1;
        bPtr += 2;
    }

    *result = returnValue;
}

#endif /*LV_HAVE_AVX2 && LV_HAVE_FMA*/
372
373
#ifdef LV_HAVE_AVX2

/* AVX2 implementation (no FMA), unaligned loads: 16 samples per iteration. */
static inline void volk_16i_32fc_dot_prod_32fc_u_avx2(lv_32fc_t* result,
                                                      const short* input,
                                                      const lv_32fc_t* taps,
                                                      unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int sixteenthPoints = num_points / 16;

    lv_32fc_t returnValue = lv_cmake(0.0f, 0.0f);
    const short* aPtr = input;
    const float* bPtr = (float*)taps; /* taps viewed as interleaved (re, im) */

    __m128i m0, m1;
    __m256i f0, f1;
    __m256 g0, g1, h0, h1, h2, h3;
    __m256 a0Val, a1Val, a2Val, a3Val;
    __m256 b0Val, b1Val, b2Val, b3Val;
    __m256 c0Val, c1Val, c2Val, c3Val;

    /* Four independent accumulators to break the add dependency chain. */
    __m256 dotProdVal0 = _mm256_setzero_ps();
    __m256 dotProdVal1 = _mm256_setzero_ps();
    __m256 dotProdVal2 = _mm256_setzero_ps();
    __m256 dotProdVal3 = _mm256_setzero_ps();

    for (; number < sixteenthPoints; number++) {

        m0 = _mm_loadu_si128((__m128i const*)aPtr);
        m1 = _mm_loadu_si128((__m128i const*)(aPtr + 8));

        /* Widen int16 -> int32 -> float. */
        f0 = _mm256_cvtepi16_epi32(m0);
        g0 = _mm256_cvtepi32_ps(f0);
        f1 = _mm256_cvtepi16_epi32(m1);
        g1 = _mm256_cvtepi32_ps(f1);

        /* Unpack each vector with itself to duplicate every sample, then
         * permute 128-bit lanes so the duplicated samples line up with the
         * interleaved (re, im) tap pairs. */
        h0 = _mm256_unpacklo_ps(g0, g0);
        h1 = _mm256_unpackhi_ps(g0, g0);
        h2 = _mm256_unpacklo_ps(g1, g1);
        h3 = _mm256_unpackhi_ps(g1, g1);

        a0Val = _mm256_permute2f128_ps(h0, h1, 0x20);
        a1Val = _mm256_permute2f128_ps(h0, h1, 0x31);
        a2Val = _mm256_permute2f128_ps(h2, h3, 0x20);
        a3Val = _mm256_permute2f128_ps(h2, h3, 0x31);

        b0Val = _mm256_loadu_ps(bPtr);
        b1Val = _mm256_loadu_ps(bPtr + 8);
        b2Val = _mm256_loadu_ps(bPtr + 16);
        b3Val = _mm256_loadu_ps(bPtr + 24);

        c0Val = _mm256_mul_ps(a0Val, b0Val);
        c1Val = _mm256_mul_ps(a1Val, b1Val);
        c2Val = _mm256_mul_ps(a2Val, b2Val);
        c3Val = _mm256_mul_ps(a3Val, b3Val);

        dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
        dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
        dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2);
        dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3);

        aPtr += 16;
        bPtr += 32;
    }

    dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
    dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
    dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);

    __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];

    _mm256_store_ps(dotProductVector,
                    dotProdVal0); // Store the results back into the dot product vector

    returnValue += lv_cmake(dotProductVector[0], dotProductVector[1]);
    returnValue += lv_cmake(dotProductVector[2], dotProductVector[3]);
    returnValue += lv_cmake(dotProductVector[4], dotProductVector[5]);
    returnValue += lv_cmake(dotProductVector[6], dotProductVector[7]);

    /* Scalar tail when num_points is not a multiple of 16. */
    number = sixteenthPoints * 16;
    for (; number < num_points; number++) {
        returnValue += lv_cmake(aPtr[0] * bPtr[0], aPtr[0] * bPtr[1]);
        aPtr += 1;
        bPtr += 2;
    }

    *result = returnValue;
}

#endif /*LV_HAVE_AVX2*/
465
466
#if LV_HAVE_SSE && LV_HAVE_MMX

/* SSE implementation, aligned loads (taps must be 16-byte aligned):
 * processes 8 samples per iteration. Uses MMX _mm_cvtpi16_ps for the
 * int16->float conversion, so _mm_empty() clears the MMX state afterwards. */
static inline void volk_16i_32fc_dot_prod_32fc_a_sse(lv_32fc_t* result,
                                                     const short* input,
                                                     const lv_32fc_t* taps,
                                                     unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int eighthPoints = num_points / 8;

    lv_32fc_t returnValue = lv_cmake(0.0f, 0.0f);
    const short* aPtr = input;
    const float* bPtr = (float*)taps; /* taps viewed as interleaved (re, im) */

    __m64 m0, m1;
    __m128 f0, f1, f2, f3;
    __m128 a0Val, a1Val, a2Val, a3Val;
    __m128 b0Val, b1Val, b2Val, b3Val;
    __m128 c0Val, c1Val, c2Val, c3Val;

    /* Four independent accumulators to break the add dependency chain. */
    __m128 dotProdVal0 = _mm_setzero_ps();
    __m128 dotProdVal1 = _mm_setzero_ps();
    __m128 dotProdVal2 = _mm_setzero_ps();
    __m128 dotProdVal3 = _mm_setzero_ps();

    for (; number < eighthPoints; number++) {

        m0 = _mm_set_pi16(*(aPtr + 3), *(aPtr + 2), *(aPtr + 1), *(aPtr + 0));
        m1 = _mm_set_pi16(*(aPtr + 7), *(aPtr + 6), *(aPtr + 5), *(aPtr + 4));
        /* f0==f1 and f2==f3 on purpose: unpacking a vector with itself
         * duplicates every sample, so each real input lines up with the
         * (re, im) pair of its tap below. */
        f0 = _mm_cvtpi16_ps(m0);
        f1 = _mm_cvtpi16_ps(m0);
        f2 = _mm_cvtpi16_ps(m1);
        f3 = _mm_cvtpi16_ps(m1);

        a0Val = _mm_unpacklo_ps(f0, f1); /* s0 s0 s1 s1 */
        a1Val = _mm_unpackhi_ps(f0, f1); /* s2 s2 s3 s3 */
        a2Val = _mm_unpacklo_ps(f2, f3);
        a3Val = _mm_unpackhi_ps(f2, f3);

        b0Val = _mm_load_ps(bPtr);
        b1Val = _mm_load_ps(bPtr + 4);
        b2Val = _mm_load_ps(bPtr + 8);
        b3Val = _mm_load_ps(bPtr + 12);

        c0Val = _mm_mul_ps(a0Val, b0Val);
        c1Val = _mm_mul_ps(a1Val, b1Val);
        c2Val = _mm_mul_ps(a2Val, b2Val);
        c3Val = _mm_mul_ps(a3Val, b3Val);

        dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
        dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
        dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
        dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);

        aPtr += 8;
        bPtr += 16;
    }

    _mm_empty(); // clear the mmx technology state

    dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
    dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
    dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);

    __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];

    _mm_store_ps(dotProductVector,
                 dotProdVal0); // Store the results back into the dot product vector

    returnValue += lv_cmake(dotProductVector[0], dotProductVector[1]);
    returnValue += lv_cmake(dotProductVector[2], dotProductVector[3]);

    /* Scalar tail when num_points is not a multiple of 8. */
    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        returnValue += lv_cmake(aPtr[0] * bPtr[0], aPtr[0] * bPtr[1]);
        aPtr += 1;
        bPtr += 2;
    }

    *result = returnValue;
}

#endif /*LV_HAVE_SSE && LV_HAVE_MMX*/
552
#ifdef LV_HAVE_AVX2

/* AVX2 implementation (no FMA), aligned loads (input 16-byte aligned,
 * taps 32-byte aligned): 16 samples per iteration. */
static inline void volk_16i_32fc_dot_prod_32fc_a_avx2(lv_32fc_t* result,
                                                      const short* input,
                                                      const lv_32fc_t* taps,
                                                      unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int sixteenthPoints = num_points / 16;

    lv_32fc_t returnValue = lv_cmake(0.0f, 0.0f);
    const short* aPtr = input;
    const float* bPtr = (float*)taps; /* taps viewed as interleaved (re, im) */

    __m128i m0, m1;
    __m256i f0, f1;
    __m256 g0, g1, h0, h1, h2, h3;
    __m256 a0Val, a1Val, a2Val, a3Val;
    __m256 b0Val, b1Val, b2Val, b3Val;
    __m256 c0Val, c1Val, c2Val, c3Val;

    /* Four independent accumulators to break the add dependency chain. */
    __m256 dotProdVal0 = _mm256_setzero_ps();
    __m256 dotProdVal1 = _mm256_setzero_ps();
    __m256 dotProdVal2 = _mm256_setzero_ps();
    __m256 dotProdVal3 = _mm256_setzero_ps();

    for (; number < sixteenthPoints; number++) {

        m0 = _mm_load_si128((__m128i const*)aPtr);
        m1 = _mm_load_si128((__m128i const*)(aPtr + 8));

        /* Widen int16 -> int32 -> float. */
        f0 = _mm256_cvtepi16_epi32(m0);
        g0 = _mm256_cvtepi32_ps(f0);
        f1 = _mm256_cvtepi16_epi32(m1);
        g1 = _mm256_cvtepi32_ps(f1);

        /* Unpack each vector with itself to duplicate every sample, then
         * permute 128-bit lanes so the duplicated samples line up with the
         * interleaved (re, im) tap pairs. */
        h0 = _mm256_unpacklo_ps(g0, g0);
        h1 = _mm256_unpackhi_ps(g0, g0);
        h2 = _mm256_unpacklo_ps(g1, g1);
        h3 = _mm256_unpackhi_ps(g1, g1);

        a0Val = _mm256_permute2f128_ps(h0, h1, 0x20);
        a1Val = _mm256_permute2f128_ps(h0, h1, 0x31);
        a2Val = _mm256_permute2f128_ps(h2, h3, 0x20);
        a3Val = _mm256_permute2f128_ps(h2, h3, 0x31);

        b0Val = _mm256_load_ps(bPtr);
        b1Val = _mm256_load_ps(bPtr + 8);
        b2Val = _mm256_load_ps(bPtr + 16);
        b3Val = _mm256_load_ps(bPtr + 24);

        c0Val = _mm256_mul_ps(a0Val, b0Val);
        c1Val = _mm256_mul_ps(a1Val, b1Val);
        c2Val = _mm256_mul_ps(a2Val, b2Val);
        c3Val = _mm256_mul_ps(a3Val, b3Val);

        dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
        dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
        dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2);
        dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3);

        aPtr += 16;
        bPtr += 32;
    }

    dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
    dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
    dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);

    __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];

    _mm256_store_ps(dotProductVector,
                    dotProdVal0); // Store the results back into the dot product vector

    returnValue += lv_cmake(dotProductVector[0], dotProductVector[1]);
    returnValue += lv_cmake(dotProductVector[2], dotProductVector[3]);
    returnValue += lv_cmake(dotProductVector[4], dotProductVector[5]);
    returnValue += lv_cmake(dotProductVector[6], dotProductVector[7]);

    /* Scalar tail when num_points is not a multiple of 16. */
    number = sixteenthPoints * 16;
    for (; number < num_points; number++) {
        returnValue += lv_cmake(aPtr[0] * bPtr[0], aPtr[0] * bPtr[1]);
        aPtr += 1;
        bPtr += 2;
    }

    *result = returnValue;
}


#endif /*LV_HAVE_AVX2*/
645
#if LV_HAVE_AVX2 && LV_HAVE_FMA

/* AVX2+FMA implementation, aligned loads (input 16-byte aligned, taps
 * 32-byte aligned): 16 samples per iteration. */
static inline void volk_16i_32fc_dot_prod_32fc_a_avx2_fma(lv_32fc_t* result,
                                                          const short* input,
                                                          const lv_32fc_t* taps,
                                                          unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int sixteenthPoints = num_points / 16;

    lv_32fc_t returnValue = lv_cmake(0.0f, 0.0f);
    const short* aPtr = input;
    const float* bPtr = (float*)taps; /* taps viewed as interleaved (re, im) */

    __m128i m0, m1;
    __m256i f0, f1;
    __m256 g0, g1, h0, h1, h2, h3;
    __m256 a0Val, a1Val, a2Val, a3Val;
    __m256 b0Val, b1Val, b2Val, b3Val;

    /* Four independent accumulators to break the FMA dependency chain. */
    __m256 dotProdVal0 = _mm256_setzero_ps();
    __m256 dotProdVal1 = _mm256_setzero_ps();
    __m256 dotProdVal2 = _mm256_setzero_ps();
    __m256 dotProdVal3 = _mm256_setzero_ps();

    for (; number < sixteenthPoints; number++) {

        m0 = _mm_load_si128((__m128i const*)aPtr);
        m1 = _mm_load_si128((__m128i const*)(aPtr + 8));

        /* Widen int16 -> int32 -> float. */
        f0 = _mm256_cvtepi16_epi32(m0);
        g0 = _mm256_cvtepi32_ps(f0);
        f1 = _mm256_cvtepi16_epi32(m1);
        g1 = _mm256_cvtepi32_ps(f1);

        /* Unpack each vector with itself to duplicate every sample, then
         * permute 128-bit lanes so the duplicated samples line up with the
         * interleaved (re, im) tap pairs. */
        h0 = _mm256_unpacklo_ps(g0, g0);
        h1 = _mm256_unpackhi_ps(g0, g0);
        h2 = _mm256_unpacklo_ps(g1, g1);
        h3 = _mm256_unpackhi_ps(g1, g1);

        a0Val = _mm256_permute2f128_ps(h0, h1, 0x20);
        a1Val = _mm256_permute2f128_ps(h0, h1, 0x31);
        a2Val = _mm256_permute2f128_ps(h2, h3, 0x20);
        a3Val = _mm256_permute2f128_ps(h2, h3, 0x31);

        b0Val = _mm256_load_ps(bPtr);
        b1Val = _mm256_load_ps(bPtr + 8);
        b2Val = _mm256_load_ps(bPtr + 16);
        b3Val = _mm256_load_ps(bPtr + 24);

        dotProdVal0 = _mm256_fmadd_ps(a0Val, b0Val, dotProdVal0);
        dotProdVal1 = _mm256_fmadd_ps(a1Val, b1Val, dotProdVal1);
        dotProdVal2 = _mm256_fmadd_ps(a2Val, b2Val, dotProdVal2);
        dotProdVal3 = _mm256_fmadd_ps(a3Val, b3Val, dotProdVal3);

        aPtr += 16;
        bPtr += 32;
    }

    dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
    dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
    dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);

    __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];

    _mm256_store_ps(dotProductVector,
                    dotProdVal0); // Store the results back into the dot product vector

    returnValue += lv_cmake(dotProductVector[0], dotProductVector[1]);
    returnValue += lv_cmake(dotProductVector[2], dotProductVector[3]);
    returnValue += lv_cmake(dotProductVector[4], dotProductVector[5]);
    returnValue += lv_cmake(dotProductVector[6], dotProductVector[7]);

    /* Scalar tail when num_points is not a multiple of 16. */
    number = sixteenthPoints * 16;
    for (; number < num_points; number++) {
        returnValue += lv_cmake(aPtr[0] * bPtr[0], aPtr[0] * bPtr[1]);
        aPtr += 1;
        bPtr += 2;
    }

    *result = returnValue;
}


#endif /*LV_HAVE_AVX2 && LV_HAVE_FMA*/
732
#ifdef LV_HAVE_RVV
#include <riscv_vector.h>
/* NOTE(review): the doc extraction dropped original line 735 here; the
 * RISCV_SHRINK4 macro used below lives in volk_rvv_intrinsics.h, so the
 * include is restored - confirm against upstream. */
#include <volk/volk_rvv_intrinsics.h>

/* RISC-V Vector implementation: strided view of the taps via a 64-bit load
 * plus narrowing shifts to split real/imag parts. */
static inline void volk_16i_32fc_dot_prod_32fc_rvv(lv_32fc_t* result,
                                                   const short* input,
                                                   const lv_32fc_t* taps,
                                                   unsigned int num_points)
{
    vfloat32m4_t vsumr = __riscv_vfmv_v_f_f32m4(0, __riscv_vsetvlmax_e32m4());
    vfloat32m4_t vsumi = vsumr;
    size_t n = num_points;
    for (size_t vl; n > 0; n -= vl, input += vl, taps += vl) {
        vl = __riscv_vsetvl_e32m4(n);
        /* Load each complex tap as one u64, then narrow-shift by 0 / 32 bits
         * to extract the low (real) and high (imag) 32-bit halves. */
        vuint64m8_t vc = __riscv_vle64_v_u64m8((const uint64_t*)taps, vl);
        vfloat32m4_t vr = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vc, 0, vl));
        vfloat32m4_t vi = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vc, 32, vl));
        /* Widen int16 input to float. */
        vfloat32m4_t v =
            __riscv_vfwcvt_f(__riscv_vle16_v_i16m2((const int16_t*)input, vl), vl);
        /* Tail-undisturbed FMA keeps inactive lanes' partial sums intact. */
        vsumr = __riscv_vfmacc_tu(vsumr, vr, v, vl);
        vsumi = __riscv_vfmacc_tu(vsumi, vi, v, vl);
    }
    /* Reduce the m4 accumulators to m1, then to scalars. */
    size_t vl = __riscv_vsetvlmax_e32m1();
    vfloat32m1_t vr = RISCV_SHRINK4(vfadd, f, 32, vsumr);
    vfloat32m1_t vi = RISCV_SHRINK4(vfadd, f, 32, vsumi);
    vfloat32m1_t z = __riscv_vfmv_s_f_f32m1(0, vl);
    *result = lv_cmake(__riscv_vfmv_f(__riscv_vfredusum(vr, z, vl)),
                       __riscv_vfmv_f(__riscv_vfredusum(vi, z, vl)));
}
#endif /*LV_HAVE_RVV*/
763
#ifdef LV_HAVE_RVVSEG
#include <riscv_vector.h>
/* NOTE(review): the doc extraction dropped original line 766 here; the
 * RISCV_SHRINK4 macro used below lives in volk_rvv_intrinsics.h, so the
 * include is restored - confirm against upstream. */
#include <volk/volk_rvv_intrinsics.h>

/* RISC-V Vector implementation using segmented loads to de-interleave the
 * complex taps into real/imag register groups directly. */
static inline void volk_16i_32fc_dot_prod_32fc_rvvseg(lv_32fc_t* result,
                                                      const short* input,
                                                      const lv_32fc_t* taps,
                                                      unsigned int num_points)
{
    vfloat32m4_t vsumr = __riscv_vfmv_v_f_f32m4(0, __riscv_vsetvlmax_e32m4());
    vfloat32m4_t vsumi = vsumr;
    size_t n = num_points;
    for (size_t vl; n > 0; n -= vl, input += vl, taps += vl) {
        vl = __riscv_vsetvl_e32m4(n);
        /* Segment-2 load splits interleaved (re, im) pairs into two groups. */
        vfloat32m4x2_t vc = __riscv_vlseg2e32_v_f32m4x2((const float*)taps, vl);
        vfloat32m4_t vr = __riscv_vget_f32m4(vc, 0);
        vfloat32m4_t vi = __riscv_vget_f32m4(vc, 1);
        /* Widen int16 input to float. */
        vfloat32m4_t v =
            __riscv_vfwcvt_f(__riscv_vle16_v_i16m2((const int16_t*)input, vl), vl);
        /* Tail-undisturbed FMA keeps inactive lanes' partial sums intact. */
        vsumr = __riscv_vfmacc_tu(vsumr, vr, v, vl);
        vsumi = __riscv_vfmacc_tu(vsumi, vi, v, vl);
    }
    /* Reduce the m4 accumulators to m1, then to scalars. */
    size_t vl = __riscv_vsetvlmax_e32m1();
    vfloat32m1_t vr = RISCV_SHRINK4(vfadd, f, 32, vsumr);
    vfloat32m1_t vi = RISCV_SHRINK4(vfadd, f, 32, vsumi);
    vfloat32m1_t z = __riscv_vfmv_s_f_f32m1(0, vl);
    *result = lv_cmake(__riscv_vfmv_f(__riscv_vfredusum(vr, z, vl)),
                       __riscv_vfmv_f(__riscv_vfredusum(vi, z, vl)));
}
#endif /*LV_HAVE_RVVSEG*/
794
795#endif /*INCLUDED_volk_16i_32fc_dot_prod_32fc_H*/