58#ifndef INCLUDED_volk_64f_x2_add_64f_H
59#define INCLUDED_volk_64f_x2_add_64f_H
/* Tail of a parameter list. The line carrying the function name and the
 * first parameter is not visible in this view. */
67 const double* aVector,
68 const double* bVector,
69 unsigned int num_points)
/* Working cursors over the three buffers. cVector is not declared in the
 * visible lines; presumably it is the output-buffer parameter on the missing
 * first line -- confirm. */
71 double* cPtr = cVector;
72 const double* aPtr = aVector;
73 const double* bPtr = bVector;
74 unsigned int number = 0;
/* Scalar element-wise sum: one output element per iteration, with all three
 * cursors advancing together. Performs no work when num_points is 0. */
76 for (number = 0; number < num_points; number++) {
77 *cPtr++ = (*aPtr++) + (*bPtr++);
/*!
 * \brief Element-wise sum of two double-precision vectors using ARMv8 NEON.
 *
 * Computes cVector[i] = aVector[i] + bVector[i] for i in [0, num_points).
 * The bulk of the work is done four elements at a time (two 128-bit vectors
 * of two doubles each); any remaining 1..3 elements are summed one by one.
 *
 * \param cVector    Output buffer; receives num_points doubles.
 * \param aVector    First input buffer; num_points doubles are read.
 * \param bVector    Second input buffer; num_points doubles are read.
 * \param num_points Number of elements to process; 0 performs no work.
 */
static inline void volk_64f_x2_add_64f_neonv8(double* cVector,
                                              const double* aVector,
                                              const double* bVector,
                                              unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int quarter_points = num_points / 4;

    double* cPtr = cVector;
    const double* aPtr = aVector;
    const double* bPtr = bVector;

    for (; number < quarter_points; number++) {
        /* Two independent 2-lane loads per input cover four elements. */
        float64x2_t aVal0 = vld1q_f64(aPtr);
        float64x2_t aVal1 = vld1q_f64(aPtr + 2);
        float64x2_t bVal0 = vld1q_f64(bPtr);
        float64x2_t bVal1 = vld1q_f64(bPtr + 2);

        float64x2_t cVal0 = vaddq_f64(aVal0, bVal0);
        float64x2_t cVal1 = vaddq_f64(aVal1, bVal1);

        vst1q_f64(cPtr, cVal0);
        vst1q_f64(cPtr + 2, cVal1);

        /* Step past the four elements just handled. Without this, every
         * iteration would reprocess elements 0..3 and the scalar tail below
         * would start from the wrong offset. */
        aPtr += 4;
        bPtr += 4;
        cPtr += 4;
    }

    /* Scalar tail: the cursors now point at the first unprocessed element. */
    number = quarter_points * 4;
    for (; number < num_points; number++) {
        *cPtr++ = (*aPtr++) + (*bPtr++);
    }
}
133#include <emmintrin.h>
/* Tail of a parameter list. The line carrying the function name and the
 * first parameter is not visible in this view. */
136 const double* aVector,
137 const double* bVector,
138 unsigned int num_points)
140 unsigned int number = 0;
/* Number of complete 2-element groups; each group fills one __m128d. */
141 const unsigned int half_points = num_points / 2;
/* Working cursors. cVector is not declared in the visible lines; presumably
 * it is the output-buffer parameter on the missing first line -- confirm. */
143 double* cPtr = cVector;
144 const double* aPtr = aVector;
145 const double* bPtr = bVector;
147 __m128d aVal, bVal, cVal;
/* Vector pass: unaligned 128-bit loads, packed double add, unaligned store.
 * NOTE(review): no cursor advance is visible inside this loop; as shown,
 * each iteration would reread and rewrite the same two elements. Confirm
 * that aPtr, bPtr and cPtr step by 2 per iteration. */
148 for (; number < half_points; number++) {
149 aVal = _mm_loadu_pd(aPtr);
150 bVal = _mm_loadu_pd(bPtr);
152 cVal = _mm_add_pd(aVal, bVal);
154 _mm_storeu_pd(cPtr, cVal);
/* Scalar pass for the single element left over when num_points is odd. */
161 number = half_points * 2;
162 for (; number < num_points; number++) {
163 *cPtr++ = (*aPtr++) + (*bPtr++);
172#include <immintrin.h>
/* Tail of a parameter list. The line carrying the function name and the
 * first parameter is not visible in this view. */
175 const double* aVector,
176 const double* bVector,
177 unsigned int num_points)
179 unsigned int number = 0;
/* Number of complete 4-element groups; each group fills one __m256d. */
180 const unsigned int quarter_points = num_points / 4;
/* Working cursors. cVector is not declared in the visible lines; presumably
 * it is the output-buffer parameter on the missing first line -- confirm. */
182 double* cPtr = cVector;
183 const double* aPtr = aVector;
184 const double* bPtr = bVector;
186 __m256d aVal, bVal, cVal;
/* Vector pass: unaligned 256-bit loads, packed double add, unaligned store.
 * NOTE(review): no cursor advance is visible inside this loop; as shown,
 * each iteration would reread and rewrite the same four elements. Confirm
 * that aPtr, bPtr and cPtr step by 4 per iteration. */
187 for (; number < quarter_points; number++) {
189 aVal = _mm256_loadu_pd(aPtr);
190 bVal = _mm256_loadu_pd(bPtr);
192 cVal = _mm256_add_pd(aVal, bVal);
194 _mm256_storeu_pd(cPtr, cVal);
/* Scalar pass for the 1..3 elements left over after the 4-wide groups. */
201 number = quarter_points * 4;
202 for (; number < num_points; number++) {
203 *cPtr++ = (*aPtr++) + (*bPtr++);
215#include <emmintrin.h>
/* Tail of a parameter list. The line carrying the function name and the
 * first parameter is not visible in this view. */
218 const double* aVector,
219 const double* bVector,
220 unsigned int num_points)
222 unsigned int number = 0;
/* Number of complete 2-element groups; each group fills one __m128d. */
223 const unsigned int half_points = num_points / 2;
/* Working cursors. cVector is not declared in the visible lines; presumably
 * it is the output-buffer parameter on the missing first line -- confirm. */
225 double* cPtr = cVector;
226 const double* aPtr = aVector;
227 const double* bPtr = bVector;
229 __m128d aVal, bVal, cVal;
/* Vector pass using the aligned load/store forms: _mm_load_pd and
 * _mm_store_pd require 16-byte-aligned addresses, so the buffers these
 * cursors start from must be 16-byte aligned.
 * NOTE(review): no cursor advance is visible inside this loop; as shown,
 * each iteration would reread and rewrite the same two elements. Confirm
 * that aPtr, bPtr and cPtr step by 2 per iteration. */
230 for (; number < half_points; number++) {
231 aVal = _mm_load_pd(aPtr);
232 bVal = _mm_load_pd(bPtr);
234 cVal = _mm_add_pd(aVal, bVal);
236 _mm_store_pd(cPtr, cVal);
/* Scalar pass for the single element left over when num_points is odd. */
243 number = half_points * 2;
244 for (; number < num_points; number++) {
245 *cPtr++ = (*aPtr++) + (*bPtr++);
254#include <immintrin.h>
/* Tail of a parameter list. The line carrying the function name and the
 * first parameter is not visible in this view. */
257 const double* aVector,
258 const double* bVector,
259 unsigned int num_points)
261 unsigned int number = 0;
/* Number of complete 4-element groups; each group fills one __m256d. */
262 const unsigned int quarter_points = num_points / 4;
/* Working cursors. cVector is not declared in the visible lines; presumably
 * it is the output-buffer parameter on the missing first line -- confirm. */
264 double* cPtr = cVector;
265 const double* aPtr = aVector;
266 const double* bPtr = bVector;
268 __m256d aVal, bVal, cVal;
/* Vector pass using the aligned load/store forms: _mm256_load_pd and
 * _mm256_store_pd require 32-byte-aligned addresses, so the buffers these
 * cursors start from must be 32-byte aligned.
 * NOTE(review): no cursor advance is visible inside this loop; as shown,
 * each iteration would reread and rewrite the same four elements. Confirm
 * that aPtr, bPtr and cPtr step by 4 per iteration. */
269 for (; number < quarter_points; number++) {
271 aVal = _mm256_load_pd(aPtr);
272 bVal = _mm256_load_pd(bPtr);
274 cVal = _mm256_add_pd(aVal, bVal);
276 _mm256_store_pd(cPtr, cVal);
/* Scalar pass for the 1..3 elements left over after the 4-wide groups. */
283 number = quarter_points * 4;
284 for (; number < num_points; number++) {
285 *cPtr++ = (*aPtr++) + (*bPtr++);
292#include <riscv_vector.h>
/*!
 * \brief Element-wise sum of two double-precision vectors using the RISC-V
 *        Vector extension.
 *
 * Computes cVector[i] = aVector[i] + bVector[i] for i in [0, num_points).
 * Each pass requests a vector length for the remaining element count
 * (64-bit elements, LMUL = 8) and processes exactly that many elements, so
 * no separate scalar tail is required.
 *
 * \param cVector    Output buffer; receives num_points doubles.
 * \param aVector    First input buffer; num_points doubles are read.
 * \param bVector    Second input buffer; num_points doubles are read.
 * \param num_points Number of elements to process; 0 performs no work.
 */
static inline void volk_64f_x2_add_64f_rvv(double* cVector,
                                           const double* aVector,
                                           const double* bVector,
                                           unsigned int num_points)
{
    size_t n = num_points;
    for (size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl, cVector += vl) {
        /* vl is the number of elements handled in this pass (at most n). */
        vl = __riscv_vsetvl_e64m8(n);
        vfloat64m8_t va = __riscv_vle64_v_f64m8(aVector, vl);
        vfloat64m8_t vb = __riscv_vle64_v_f64m8(bVector, vl);
        __riscv_vse64(cVector, __riscv_vfadd(va, vb, vl), vl);
    }
}