/*
 * volk_64f_x2_dot_prod_64f.h
 * Part of VOLK (Vector Optimized Library of Kernels), v3.3.0.
 * Architecture-tuned implementations of the dot product of two
 * double-precision vectors: *result = sum_i input[i] * taps[i].
 */
1/* -*- c++ -*- */
2/*
3 * Copyright 2025 Magnus Lundmark <magnuslundmark@gmail.com>
4 *
5 * This file is part of VOLK
6 *
7 * SPDX-License-Identifier: LGPL-3.0-or-later
8 */
9
52
53#ifndef INCLUDED_volk_64f_x2_dot_prod_64f_u_H
54#define INCLUDED_volk_64f_x2_dot_prod_64f_u_H
55
56#include <volk/volk_common.h>
57
#ifdef LV_HAVE_GENERIC

/*
 * Scalar reference implementation.
 * result: output scalar, receives the dot product
 * input/taps: the two operand vectors (num_points doubles each)
 */
static inline void volk_64f_x2_dot_prod_64f_generic(double* result,
                                                    const double* input,
                                                    const double* taps,
                                                    unsigned int num_points)
{
    double acc = 0.0;
    const double* a = input;
    const double* b = taps;
    /* Accumulate one product per element, in order. */
    for (unsigned int remaining = num_points; remaining != 0; remaining--) {
        acc += (*a++) * (*b++);
    }
    *result = acc;
}

#endif /* LV_HAVE_GENERIC */
73
74
#ifdef LV_HAVE_SSE2
#include <emmintrin.h>

/*
 * Unaligned SSE2 kernel: processes 8 doubles per iteration using four
 * independent 2-lane accumulators to hide FP add latency, then handles
 * the remaining num_points % 8 elements with a scalar tail loop.
 */
static inline void volk_64f_x2_dot_prod_64f_u_sse2(double* result,
                                                   const double* input,
                                                   const double* taps,
                                                   unsigned int num_points)
{
    const unsigned int chunks = num_points / 8;

    __m128d sum_a = _mm_setzero_pd();
    __m128d sum_b = _mm_setzero_pd();
    __m128d sum_c = _mm_setzero_pd();
    __m128d sum_d = _mm_setzero_pd();

    for (unsigned int i = 0; i < chunks; i++) {
        sum_a = _mm_add_pd(sum_a,
                           _mm_mul_pd(_mm_loadu_pd(input + 0), _mm_loadu_pd(taps + 0)));
        sum_b = _mm_add_pd(sum_b,
                           _mm_mul_pd(_mm_loadu_pd(input + 2), _mm_loadu_pd(taps + 2)));
        sum_c = _mm_add_pd(sum_c,
                           _mm_mul_pd(_mm_loadu_pd(input + 4), _mm_loadu_pd(taps + 4)));
        sum_d = _mm_add_pd(sum_d,
                           _mm_mul_pd(_mm_loadu_pd(input + 6), _mm_loadu_pd(taps + 6)));
        input += 8;
        taps += 8;
    }

    /* Combine pairwise, then reduce to scalar as low lane + high lane
     * (same summation order as storing to memory and adding tmp[0]+tmp[1]). */
    const __m128d total = _mm_add_pd(_mm_add_pd(sum_a, sum_b), _mm_add_pd(sum_c, sum_d));
    double dot = _mm_cvtsd_f64(total) + _mm_cvtsd_f64(_mm_unpackhi_pd(total, total));

    /* Scalar tail. */
    for (unsigned int i = chunks * 8; i < num_points; i++) {
        dot += (*input++) * (*taps++);
    }
    *result = dot;
}

#endif /* LV_HAVE_SSE2 */
118
119
120#ifdef LV_HAVE_AVX
121#include <immintrin.h>
122
123static inline void volk_64f_x2_dot_prod_64f_u_avx(double* result,
124 const double* input,
125 const double* taps,
126 unsigned int num_points)
127{
128 const unsigned int eighthPoints = num_points / 8;
129 unsigned int number = 0;
130
131 __m256d acc0 = _mm256_setzero_pd();
132 __m256d acc1 = _mm256_setzero_pd();
133
134 for (; number < eighthPoints; number++) {
135 acc0 = _mm256_add_pd(
136 acc0, _mm256_mul_pd(_mm256_loadu_pd(input), _mm256_loadu_pd(taps)));
137 acc1 = _mm256_add_pd(
138 acc1, _mm256_mul_pd(_mm256_loadu_pd(input + 4), _mm256_loadu_pd(taps + 4)));
139 input += 8;
140 taps += 8;
141 }
142
143 acc0 = _mm256_add_pd(acc0, acc1);
144
145 __VOLK_ATTR_ALIGNED(32) double tmp[4];
146 _mm256_storeu_pd(tmp, acc0);
147 double dot = tmp[0] + tmp[1] + tmp[2] + tmp[3];
148
149 for (number = eighthPoints * 8; number < num_points; number++) {
150 dot += (*input++) * (*taps++);
151 }
152 *result = dot;
153}
154
155#endif /* LV_HAVE_AVX */
156
157
158#if LV_HAVE_AVX2 && LV_HAVE_FMA
159#include <immintrin.h>
160
161static inline void volk_64f_x2_dot_prod_64f_u_avx2_fma(double* result,
162 const double* input,
163 const double* taps,
164 unsigned int num_points)
165{
166 const unsigned int eighthPoints = num_points / 8;
167 unsigned int number = 0;
168
169 __m256d acc0 = _mm256_setzero_pd();
170 __m256d acc1 = _mm256_setzero_pd();
171
172 for (; number < eighthPoints; number++) {
173 acc0 = _mm256_fmadd_pd(_mm256_loadu_pd(input), _mm256_loadu_pd(taps), acc0);
174 acc1 =
175 _mm256_fmadd_pd(_mm256_loadu_pd(input + 4), _mm256_loadu_pd(taps + 4), acc1);
176 input += 8;
177 taps += 8;
178 }
179
180 acc0 = _mm256_add_pd(acc0, acc1);
181
182 __VOLK_ATTR_ALIGNED(32) double tmp[4];
183 _mm256_storeu_pd(tmp, acc0);
184 double dot = tmp[0] + tmp[1] + tmp[2] + tmp[3];
185
186 for (number = eighthPoints * 8; number < num_points; number++) {
187 dot += (*input++) * (*taps++);
188 }
189 *result = dot;
190}
191
192#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA */
193
194
195#ifdef LV_HAVE_AVX512F
196#include <immintrin.h>
197
198static inline void volk_64f_x2_dot_prod_64f_u_avx512f(double* result,
199 const double* input,
200 const double* taps,
201 unsigned int num_points)
202{
203 const unsigned int eighthPoints = num_points / 8;
204 unsigned int number = 0;
205
206 __m512d acc = _mm512_setzero_pd();
207
208 for (; number < eighthPoints; number++) {
209 acc = _mm512_fmadd_pd(_mm512_loadu_pd(input), _mm512_loadu_pd(taps), acc);
210 input += 8;
211 taps += 8;
212 }
213
214 double dot = _mm512_reduce_add_pd(acc);
215
216 for (number = eighthPoints * 8; number < num_points; number++) {
217 dot += (*input++) * (*taps++);
218 }
219 *result = dot;
220}
221
222#endif /* LV_HAVE_AVX512F */
223
224
225#endif /* INCLUDED_volk_64f_x2_dot_prod_64f_u_H */
226
227
228#ifndef INCLUDED_volk_64f_x2_dot_prod_64f_a_H
229#define INCLUDED_volk_64f_x2_dot_prod_64f_a_H
230
231#include <volk/volk_common.h>
232
#ifdef LV_HAVE_SSE2
#include <emmintrin.h>

/*
 * Aligned SSE2 kernel: identical to the unaligned variant except that
 * input and taps must be 16-byte aligned (_mm_load_pd). Four independent
 * 2-lane accumulators, 8 doubles per iteration, scalar tail.
 */
static inline void volk_64f_x2_dot_prod_64f_a_sse2(double* result,
                                                   const double* input,
                                                   const double* taps,
                                                   unsigned int num_points)
{
    const unsigned int chunks = num_points / 8;

    __m128d sum_a = _mm_setzero_pd();
    __m128d sum_b = _mm_setzero_pd();
    __m128d sum_c = _mm_setzero_pd();
    __m128d sum_d = _mm_setzero_pd();

    for (unsigned int i = 0; i < chunks; i++) {
        sum_a = _mm_add_pd(sum_a,
                           _mm_mul_pd(_mm_load_pd(input + 0), _mm_load_pd(taps + 0)));
        sum_b = _mm_add_pd(sum_b,
                           _mm_mul_pd(_mm_load_pd(input + 2), _mm_load_pd(taps + 2)));
        sum_c = _mm_add_pd(sum_c,
                           _mm_mul_pd(_mm_load_pd(input + 4), _mm_load_pd(taps + 4)));
        sum_d = _mm_add_pd(sum_d,
                           _mm_mul_pd(_mm_load_pd(input + 6), _mm_load_pd(taps + 6)));
        input += 8;
        taps += 8;
    }

    /* Combine pairwise, then reduce to scalar as low lane + high lane
     * (same summation order as storing to memory and adding tmp[0]+tmp[1]). */
    const __m128d total = _mm_add_pd(_mm_add_pd(sum_a, sum_b), _mm_add_pd(sum_c, sum_d));
    double dot = _mm_cvtsd_f64(total) + _mm_cvtsd_f64(_mm_unpackhi_pd(total, total));

    /* Scalar tail. */
    for (unsigned int i = chunks * 8; i < num_points; i++) {
        dot += (*input++) * (*taps++);
    }
    *result = dot;
}

#endif /* LV_HAVE_SSE2 */
276
277
278#ifdef LV_HAVE_AVX
279#include <immintrin.h>
280
281static inline void volk_64f_x2_dot_prod_64f_a_avx(double* result,
282 const double* input,
283 const double* taps,
284 unsigned int num_points)
285{
286 const unsigned int eighthPoints = num_points / 8;
287 unsigned int number = 0;
288
289 __m256d acc0 = _mm256_setzero_pd();
290 __m256d acc1 = _mm256_setzero_pd();
291
292 for (; number < eighthPoints; number++) {
293 acc0 = _mm256_add_pd(acc0,
294 _mm256_mul_pd(_mm256_load_pd(input), _mm256_load_pd(taps)));
295 acc1 = _mm256_add_pd(
296 acc1, _mm256_mul_pd(_mm256_load_pd(input + 4), _mm256_load_pd(taps + 4)));
297 input += 8;
298 taps += 8;
299 }
300
301 acc0 = _mm256_add_pd(acc0, acc1);
302
303 __VOLK_ATTR_ALIGNED(32) double tmp[4];
304 _mm256_store_pd(tmp, acc0);
305 double dot = tmp[0] + tmp[1] + tmp[2] + tmp[3];
306
307 for (number = eighthPoints * 8; number < num_points; number++) {
308 dot += (*input++) * (*taps++);
309 }
310 *result = dot;
311}
312
313#endif /* LV_HAVE_AVX */
314
315
316#if LV_HAVE_AVX2 && LV_HAVE_FMA
317#include <immintrin.h>
318
319static inline void volk_64f_x2_dot_prod_64f_a_avx2_fma(double* result,
320 const double* input,
321 const double* taps,
322 unsigned int num_points)
323{
324 const unsigned int eighthPoints = num_points / 8;
325 unsigned int number = 0;
326
327 __m256d acc0 = _mm256_setzero_pd();
328 __m256d acc1 = _mm256_setzero_pd();
329
330 for (; number < eighthPoints; number++) {
331 acc0 = _mm256_fmadd_pd(_mm256_load_pd(input), _mm256_load_pd(taps), acc0);
332 acc1 = _mm256_fmadd_pd(_mm256_load_pd(input + 4), _mm256_load_pd(taps + 4), acc1);
333 input += 8;
334 taps += 8;
335 }
336
337 acc0 = _mm256_add_pd(acc0, acc1);
338
339 __VOLK_ATTR_ALIGNED(32) double tmp[4];
340 _mm256_store_pd(tmp, acc0);
341 double dot = tmp[0] + tmp[1] + tmp[2] + tmp[3];
342
343 for (number = eighthPoints * 8; number < num_points; number++) {
344 dot += (*input++) * (*taps++);
345 }
346 *result = dot;
347}
348
349#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA */
350
351
352#ifdef LV_HAVE_AVX512F
353#include <immintrin.h>
354
355static inline void volk_64f_x2_dot_prod_64f_a_avx512f(double* result,
356 const double* input,
357 const double* taps,
358 unsigned int num_points)
359{
360 const unsigned int eighthPoints = num_points / 8;
361 unsigned int number = 0;
362
363 __m512d acc = _mm512_setzero_pd();
364
365 for (; number < eighthPoints; number++) {
366 acc = _mm512_fmadd_pd(_mm512_load_pd(input), _mm512_load_pd(taps), acc);
367 input += 8;
368 taps += 8;
369 }
370
371 double dot = _mm512_reduce_add_pd(acc);
372
373 for (number = eighthPoints * 8; number < num_points; number++) {
374 dot += (*input++) * (*taps++);
375 }
376 *result = dot;
377}
378
379#endif /* LV_HAVE_AVX512F */
380
381
#ifdef LV_HAVE_NEONV8
#include <arm_neon.h>

/*
 * ARMv8 NEON kernel: four independent 2-lane FMA accumulators, 8 doubles
 * per iteration with software prefetch, scalar tail for the remainder.
 */
static inline void volk_64f_x2_dot_prod_64f_neonv8(double* result,
                                                   const double* input,
                                                   const double* taps,
                                                   unsigned int num_points)
{
    const unsigned int chunks = num_points / 8;

    float64x2_t sum0 = vdupq_n_f64(0.0);
    float64x2_t sum1 = vdupq_n_f64(0.0);
    float64x2_t sum2 = vdupq_n_f64(0.0);
    float64x2_t sum3 = vdupq_n_f64(0.0);

    for (unsigned int i = 0; i < chunks; i++) {
        /* Prefetch two iterations ahead (16 doubles). */
        __VOLK_PREFETCH(input + 16);
        __VOLK_PREFETCH(taps + 16);

        sum0 = vfmaq_f64(sum0, vld1q_f64(input + 0), vld1q_f64(taps + 0));
        sum1 = vfmaq_f64(sum1, vld1q_f64(input + 2), vld1q_f64(taps + 2));
        sum2 = vfmaq_f64(sum2, vld1q_f64(input + 4), vld1q_f64(taps + 4));
        sum3 = vfmaq_f64(sum3, vld1q_f64(input + 6), vld1q_f64(taps + 6));
        input += 8;
        taps += 8;
    }

    /* (sum0 + sum1) + (sum2 + sum3), then add the two lanes across. */
    const float64x2_t total = vaddq_f64(vaddq_f64(sum0, sum1), vaddq_f64(sum2, sum3));
    double dot = vaddvq_f64(total);

    /* Scalar tail for num_points % 8 leftovers. */
    for (unsigned int i = chunks * 8; i < num_points; i++) {
        dot += (*input++) * (*taps++);
    }
    *result = dot;
}

#endif /* LV_HAVE_NEONV8 */
423
424
#ifdef LV_HAVE_RVV
#include <riscv_vector.h>

/*
 * RISC-V Vector (RVV) kernel: vector-length-agnostic strip-mining loop
 * using an 8-register group (LMUL=8) accumulator of 64-bit floats.
 */
static inline void volk_64f_x2_dot_prod_64f_rvv(double* result,
                                                const double* input,
                                                const double* taps,
                                                unsigned int num_points)
{
    // Zero the full-width m8 accumulator once, at the maximum vector length.
    vfloat64m8_t vsum = __riscv_vfmv_v_f_f64m8(0, __riscv_vsetvlmax_e64m8());
    size_t n = num_points;
    for (size_t vl; n > 0; n -= vl, input += vl, taps += vl) {
        // vl = number of elements the hardware will process this pass (<= n).
        vl = __riscv_vsetvl_e64m8(n);
        vfloat64m8_t v0 = __riscv_vle64_v_f64m8(input, vl);
        vfloat64m8_t v1 = __riscv_vle64_v_f64m8(taps, vl);
        // Fused multiply-accumulate; the _tu (tail-undisturbed) policy keeps
        // accumulator lanes beyond vl intact on the final short pass.
        vsum = __riscv_vfmacc_tu(vsum, v0, v1, vl);
    }
    size_t vl = __riscv_vsetvlmax_e64m1();
    // RISCV_SHRINK8 (project macro from volk_common.h): presumably folds the
    // m8 register group down to a single m1 register via vfadd — TODO confirm.
    vfloat64m1_t v = RISCV_SHRINK8(vfadd, f, 64, vsum);
    // Unordered reduction sum of the m1 lanes, seeded with scalar 0,
    // then extract lane 0 as the final dot product.
    v = __riscv_vfredusum(v, __riscv_vfmv_s_f_f64m1(0, vl), vl);
    *result = __riscv_vfmv_f(v);
}

#endif /* LV_HAVE_RVV */
449
450
451#endif /* INCLUDED_volk_64f_x2_dot_prod_64f_a_H */