60#ifndef INCLUDED_volk_32f_64f_add_64f_H
61#define INCLUDED_volk_32f_64f_add_64f_H
/*!
 * \brief Adds a float vector to a double vector, producing doubles (portable C).
 *
 * c[i] = (double)a[i] + b[i] for each of the num_points elements.
 *
 * \param cVector    Output buffer of num_points doubles.
 * \param aVector    Input buffer of num_points floats (widened to double).
 * \param bVector    Input buffer of num_points doubles.
 * \param num_points Number of elements to process.
 */
static inline void volk_32f_64f_add_64f_generic(double* cVector,
                                                const float* aVector,
                                                const double* bVector,
                                                unsigned int num_points)
{
    double* cPtr = cVector;
    const float* aPtr = aVector;
    const double* bPtr = bVector;
    unsigned int number = 0;

    for (number = 0; number < num_points; number++) {
        // Widen the float operand before the double-precision add.
        *cPtr++ = ((double)(*aPtr++)) + (*bPtr++);
    }
}
/*!
 * \brief AArch64 NEON protokernel: c[i] = (double)a[i] + b[i].
 *
 * Processes four elements per iteration: one float32x4 load is widened to
 * two float64x2 vectors and added to two float64x2 loads from bVector.
 * The tail (num_points % 4) falls back to scalar code.
 *
 * \param cVector    Output buffer of num_points doubles.
 * \param aVector    Input buffer of num_points floats.
 * \param bVector    Input buffer of num_points doubles.
 * \param num_points Number of elements to process.
 */
static inline void volk_32f_64f_add_64f_neonv8(double* cVector,
                                               const float* aVector,
                                               const double* bVector,
                                               unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int quarter_points = num_points / 4;

    double* cPtr = cVector;
    const float* aPtr = aVector;
    const double* bPtr = bVector;

    for (; number < quarter_points; number++) {
        // Load 4 floats and 4 doubles (as two 2-lane vectors).
        float32x4_t aVal_f32 = vld1q_f32(aPtr);
        float64x2_t bVal0 = vld1q_f64(bPtr);
        float64x2_t bVal1 = vld1q_f64(bPtr + 2);

        // Widen low/high float pairs to double precision.
        float64x2_t aVal0 = vcvt_f64_f32(vget_low_f32(aVal_f32));
        float64x2_t aVal1 = vcvt_f64_f32(vget_high_f32(aVal_f32));

        float64x2_t cVal0 = vaddq_f64(aVal0, bVal0);
        float64x2_t cVal1 = vaddq_f64(aVal1, bVal1);

        vst1q_f64(cPtr, cVal0);
        vst1q_f64(cPtr + 2, cVal1);

        aPtr += 4;
        bPtr += 4;
        cPtr += 4;
    }

    // Scalar tail for the remaining num_points % 4 elements.
    number = quarter_points * 4;
    for (; number < num_points; number++) {
        *cPtr++ = ((double)(*aPtr++)) + (*bPtr++);
    }
}
135#include <immintrin.h>
136#include <xmmintrin.h>
/*!
 * \brief AVX protokernel (unaligned loads/stores): c[i] = (double)a[i] + b[i].
 *
 * Processes eight elements per iteration: one 8-float load is split into
 * two 128-bit halves, each widened to four doubles and added to the
 * corresponding unaligned double loads. The tail (num_points % 8) falls
 * back to scalar code.
 *
 * \param cVector    Output buffer of num_points doubles (no alignment required).
 * \param aVector    Input buffer of num_points floats (no alignment required).
 * \param bVector    Input buffer of num_points doubles (no alignment required).
 * \param num_points Number of elements to process.
 */
static inline void volk_32f_64f_add_64f_u_avx(double* cVector,
                                              const float* aVector,
                                              const double* bVector,
                                              unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int eighth_points = num_points / 8;

    double* cPtr = cVector;
    const float* aPtr = aVector;
    const double* bPtr = bVector;

    __m256 aVal;
    __m128 aVal1, aVal2;
    __m256d aDbl1, aDbl2, bVal1, bVal2, cVal1, cVal2;
    for (; number < eighth_points; number++) {

        aVal = _mm256_loadu_ps(aPtr);
        bVal1 = _mm256_loadu_pd(bPtr);
        bVal2 = _mm256_loadu_pd(bPtr + 4);

        // Split the 8 floats into low/high 128-bit halves ...
        aVal1 = _mm256_extractf128_ps(aVal, 0);
        aVal2 = _mm256_extractf128_ps(aVal, 1);

        // ... and widen each half to 4 doubles.
        aDbl1 = _mm256_cvtps_pd(aVal1);
        aDbl2 = _mm256_cvtps_pd(aVal2);

        cVal1 = _mm256_add_pd(aDbl1, bVal1);
        cVal2 = _mm256_add_pd(aDbl2, bVal2);

        _mm256_storeu_pd(cPtr, cVal1); // Store the results back into the C container
        _mm256_storeu_pd(cPtr + 4, cVal2);

        aPtr += 8;
        bPtr += 8;
        cPtr += 8;
    }

    // Scalar tail for the remaining num_points % 8 elements.
    number = eighth_points * 8;
    for (; number < num_points; number++) {
        *cPtr++ = ((double)(*aPtr++)) + (*bPtr++);
    }
}
188#include <immintrin.h>
189#include <xmmintrin.h>
/*!
 * \brief AVX protokernel (aligned loads/stores): c[i] = (double)a[i] + b[i].
 *
 * Identical to the unaligned AVX variant but uses aligned load/store
 * intrinsics; all three buffers must be 32-byte aligned. Processes eight
 * elements per iteration, with a scalar tail for num_points % 8.
 *
 * \param cVector    Output buffer of num_points doubles (32-byte aligned).
 * \param aVector    Input buffer of num_points floats (32-byte aligned).
 * \param bVector    Input buffer of num_points doubles (32-byte aligned).
 * \param num_points Number of elements to process.
 */
static inline void volk_32f_64f_add_64f_a_avx(double* cVector,
                                              const float* aVector,
                                              const double* bVector,
                                              unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int eighth_points = num_points / 8;

    double* cPtr = cVector;
    const float* aPtr = aVector;
    const double* bPtr = bVector;

    __m256 aVal;
    __m128 aVal1, aVal2;
    __m256d aDbl1, aDbl2, bVal1, bVal2, cVal1, cVal2;
    for (; number < eighth_points; number++) {

        aVal = _mm256_load_ps(aPtr);
        bVal1 = _mm256_load_pd(bPtr);
        bVal2 = _mm256_load_pd(bPtr + 4);

        // Split the 8 floats into low/high 128-bit halves ...
        aVal1 = _mm256_extractf128_ps(aVal, 0);
        aVal2 = _mm256_extractf128_ps(aVal, 1);

        // ... and widen each half to 4 doubles.
        aDbl1 = _mm256_cvtps_pd(aVal1);
        aDbl2 = _mm256_cvtps_pd(aVal2);

        cVal1 = _mm256_add_pd(aDbl1, bVal1);
        cVal2 = _mm256_add_pd(aDbl2, bVal2);

        _mm256_store_pd(cPtr, cVal1); // Store the results back into the C container
        _mm256_store_pd(cPtr + 4, cVal2);

        aPtr += 8;
        bPtr += 8;
        cPtr += 8;
    }

    // Scalar tail for the remaining num_points % 8 elements.
    number = eighth_points * 8;
    for (; number < num_points; number++) {
        *cPtr++ = ((double)(*aPtr++)) + (*bPtr++);
    }
}
239#include <riscv_vector.h>
/*!
 * \brief RISC-V Vector protokernel: c[i] = (double)a[i] + b[i].
 *
 * Strip-mines over num_points with vsetvl: each iteration widens a group
 * of floats (LMUL=4) to doubles (LMUL=8), adds the matching doubles, and
 * stores the result. No scalar tail is needed — vsetvl handles the
 * remainder on the final iteration.
 *
 * \param cVector    Output buffer of num_points doubles.
 * \param aVector    Input buffer of num_points floats.
 * \param bVector    Input buffer of num_points doubles.
 * \param num_points Number of elements to process.
 */
static inline void volk_32f_64f_add_64f_rvv(double* cVector,
                                            const float* aVector,
                                            const double* bVector,
                                            unsigned int num_points)
{
    size_t n = num_points;
    for (size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl, cVector += vl) {
        vl = __riscv_vsetvl_e64m8(n);
        // Load vl floats and widen-convert them to doubles in one step.
        vfloat64m8_t va = __riscv_vfwcvt_f(__riscv_vle32_v_f32m4(aVector, vl), vl);
        vfloat64m8_t vb = __riscv_vle64_v_f64m8(bVector, vl);
        __riscv_vse64(cVector, __riscv_vfadd(va, vb, vl), vl);
    }
}