58#ifndef INCLUDED_volk_32f_64f_multiply_64f_H
59#define INCLUDED_volk_32f_64f_multiply_64f_H
// Generic (scalar) kernel: for each of num_points elements, widen the
// float from aVector to double and multiply it by the matching double
// from bVector, storing the double product in cVector.
// NOTE(review): the opening signature line (presumably
// volk_32f_64f_multiply_64f_generic(double* cVector, const float* aVector, ...)
// and the surrounding braces are elided from this view — confirm against
// the full file.
68 const double* bVector,
69 unsigned int num_points)
71 double* cPtr = cVector;
72 const float* aPtr = aVector;
73 const double* bPtr = bVector;
74 unsigned int number = 0;
// One multiply per point; the float operand is explicitly promoted to
// double before the multiplication, so no precision is lost.
76 for (number = 0; number < num_points; number++) {
77 *cPtr++ = ((double)(*aPtr++)) * (*bPtr++);
// Unaligned AVX kernel: processes 8 points per loop iteration using
// 256-bit unaligned loads/stores, then finishes the remaining
// num_points % 8 elements with a scalar tail loop.
// NOTE(review): the opening signature (presumably
// volk_32f_64f_multiply_64f_u_avx), the declarations of aVal/aVal1/aVal2,
// and the per-iteration pointer advances (aPtr += 8; bPtr += 8; cPtr += 8)
// are elided from this view — confirm against the full file; without the
// pointer advances the loop would reprocess the same 8 elements.
95 const double* bVector,
96 unsigned int num_points)
98 unsigned int number = 0;
99 const unsigned int eighth_points = num_points / 8;
101 double* cPtr = cVector;
102 const float* aPtr = aVector;
103 const double* bPtr = bVector;
107 __m256d aDbl1, aDbl2, bVal1, bVal2, cVal1, cVal2;
108 for (; number < eighth_points; number++) {
// Load 8 floats and the 8 matching doubles (two 4-double vectors).
// _mm256_loadu_* places no alignment requirement on the pointers.
110 aVal = _mm256_loadu_ps(aPtr);
111 bVal1 = _mm256_loadu_pd(bPtr);
112 bVal2 = _mm256_loadu_pd(bPtr + 4);
// Split the 8 floats into low/high 128-bit halves, then widen each
// half (4 floats) to 4 doubles with _mm256_cvtps_pd.
114 aVal1 = _mm256_extractf128_ps(aVal, 0);
115 aVal2 = _mm256_extractf128_ps(aVal, 1);
117 aDbl1 = _mm256_cvtps_pd(aVal1);
118 aDbl2 = _mm256_cvtps_pd(aVal2);
// Multiply in double precision and store the 8 products (unaligned).
120 cVal1 = _mm256_mul_pd(aDbl1, bVal1);
121 cVal2 = _mm256_mul_pd(aDbl2, bVal2);
123 _mm256_storeu_pd(cPtr, cVal1);
124 _mm256_storeu_pd(cPtr + 4, cVal2);
// Scalar tail: handle the leftover elements exactly as the generic
// kernel does.
131 number = eighth_points * 8;
132 for (; number < num_points; number++) {
133 *cPtr++ = ((double)(*aPtr++)) * (*bPtr++);
142#include <immintrin.h>
143#include <xmmintrin.h>
// Aligned AVX kernel: identical algorithm to the unaligned variant, but
// uses _mm256_load_* / _mm256_store_*, which require the pointers to be
// 32-byte aligned (VOLK's "a_" convention).
// NOTE(review): the opening signature (presumably
// volk_32f_64f_multiply_64f_a_avx), the aVal/aVal1/aVal2 declarations,
// and the per-iteration pointer advances (aPtr/bPtr/cPtr += 8) are
// elided from this view — confirm against the full file.
146 const float* aVector,
147 const double* bVector,
148 unsigned int num_points)
150 unsigned int number = 0;
151 const unsigned int eighth_points = num_points / 8;
153 double* cPtr = cVector;
154 const float* aPtr = aVector;
155 const double* bPtr = bVector;
159 __m256d aDbl1, aDbl2, bVal1, bVal2, cVal1, cVal2;
160 for (; number < eighth_points; number++) {
// Aligned loads of 8 floats and 8 doubles per iteration.
162 aVal = _mm256_load_ps(aPtr);
163 bVal1 = _mm256_load_pd(bPtr);
164 bVal2 = _mm256_load_pd(bPtr + 4);
// Widen each 128-bit half (4 floats) to 4 doubles.
166 aVal1 = _mm256_extractf128_ps(aVal, 0);
167 aVal2 = _mm256_extractf128_ps(aVal, 1);
169 aDbl1 = _mm256_cvtps_pd(aVal1);
170 aDbl2 = _mm256_cvtps_pd(aVal2);
// Multiply in double precision; aligned stores of the 8 products.
172 cVal1 = _mm256_mul_pd(aDbl1, bVal1);
173 cVal2 = _mm256_mul_pd(aDbl2, bVal2);
175 _mm256_store_pd(cPtr, cVal1);
176 _mm256_store_pd(cPtr + 4, cVal2);
// Scalar tail for the leftover num_points % 8 elements.
183 number = eighth_points * 8;
184 for (; number < num_points; number++) {
185 *cPtr++ = ((double)(*aPtr++)) * (*bPtr++);
// ARMv8 (AArch64) NEON kernel: processes 8 points per iteration as four
// float64x2_t lanes, then a scalar tail for the remainder.
// NOTE(review): the per-iteration pointer advances (aPtr += 8;
// bPtr += 8; cPtr += 8) and the loop/function closing braces are elided
// from this view — confirm against the full file.
194static inline void volk_32f_64f_multiply_64f_neonv8(
double* cVector,
195 const float* aVector,
196 const double* bVector,
197 unsigned int num_points)
199 unsigned int number = 0;
200 const unsigned int eighth_points = num_points / 8;
202 double* cPtr = cVector;
203 const float* aPtr = aVector;
204 const double* bPtr = bVector;
206 for (; number < eighth_points; number++) {
// Load 8 floats (two float32x4_t) and 8 doubles (four float64x2_t).
207 float32x4_t aVal0 = vld1q_f32(aPtr);
208 float32x4_t aVal1 = vld1q_f32(aPtr + 4);
211 float64x2_t bVal0 = vld1q_f64(bPtr);
212 float64x2_t bVal1 = vld1q_f64(bPtr + 2);
213 float64x2_t bVal2 = vld1q_f64(bPtr + 4);
214 float64x2_t bVal3 = vld1q_f64(bPtr + 6);
// Widen each pair of floats to a pair of doubles: vcvt_f64_f32 takes
// a float32x2_t half (vget_low/high) and yields a float64x2_t.
217 float64x2_t aDbl0 = vcvt_f64_f32(vget_low_f32(aVal0));
218 float64x2_t aDbl1 = vcvt_f64_f32(vget_high_f32(aVal0));
219 float64x2_t aDbl2 = vcvt_f64_f32(vget_low_f32(aVal1));
220 float64x2_t aDbl3 = vcvt_f64_f32(vget_high_f32(aVal1));
// Multiply in double precision, two elements at a time.
222 float64x2_t cVal0 = vmulq_f64(aDbl0, bVal0);
223 float64x2_t cVal1 = vmulq_f64(aDbl1, bVal1);
224 float64x2_t cVal2 = vmulq_f64(aDbl2, bVal2);
225 float64x2_t cVal3 = vmulq_f64(aDbl3, bVal3);
// Store the 8 double products.
227 vst1q_f64(cPtr, cVal0);
228 vst1q_f64(cPtr + 2, cVal1);
229 vst1q_f64(cPtr + 4, cVal2);
230 vst1q_f64(cPtr + 6, cVal3);
// Scalar tail, identical to the generic kernel.
237 number = eighth_points * 8;
238 for (; number < num_points; number++) {
239 *cPtr++ = ((double)(*aPtr++)) * (*bPtr++);
245#include <riscv_vector.h>
// RISC-V Vector (RVV) kernel: strip-mines over num_points with
// vsetvl, widening each group of floats (f32 LMUL=4) to doubles
// (f64 LMUL=8) via vfwcvt, multiplying by the doubles from bVector,
// and storing the products. vl elements are handled per trip; the
// pointers advance by vl in the for-clause, so no scalar tail loop is
// needed.
// NOTE(review): the loop/function closing braces fall past the end of
// this view — confirm against the full file.
247static inline void volk_32f_64f_multiply_64f_rvv(
double* cVector,
248 const float* aVector,
249 const double* bVector,
250 unsigned int num_points)
252 size_t n = num_points;
253 for (
size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl, cVector += vl) {
// vl = number of elements the hardware will process this trip.
254 vl = __riscv_vsetvl_e64m8(n);
// Load vl floats and widen-convert them to doubles in one step.
255 vfloat64m8_t va = __riscv_vfwcvt_f(__riscv_vle32_v_f32m4(aVector, vl), vl);
256 vfloat64m8_t vb = __riscv_vle64_v_f64m8(bVector, vl);
// Multiply and store vl double products.
257 __riscv_vse64(cVector, __riscv_vfmul(va, vb, vl), vl);