Vector Optimized Library of Kernels 3.3.0
Architecture-tuned implementations of math kernels
Loading...
Searching...
No Matches
volk_32f_64f_add_64f.h
Go to the documentation of this file.
/* -*- c++ -*- */
/*
 * Copyright 2018 Free Software Foundation, Inc.
 *
 * This file is part of VOLK
 *
 * SPDX-License-Identifier: LGPL-3.0-or-later
 */

#ifndef INCLUDED_volk_32f_64f_add_64f_H
#define INCLUDED_volk_32f_64f_add_64f_H

#include <inttypes.h>

#ifdef LV_HAVE_GENERIC

/* Portable C fallback: cVector[i] = (double)aVector[i] + bVector[i]
 * for every i in [0, num_points). Each float is widened to double
 * before the addition. All three buffers must hold num_points elements. */
static inline void volk_32f_64f_add_64f_generic(double* cVector,
                                                const float* aVector,
                                                const double* bVector,
                                                unsigned int num_points)
{
    unsigned int i;

    for (i = 0; i < num_points; i++) {
        cVector[i] = (double)aVector[i] + bVector[i];
    }
}

#endif /* LV_HAVE_GENERIC */

#ifdef LV_HAVE_NEONV8
#include <arm_neon.h>

/* ARMv8 NEON implementation: cVector[i] = (double)aVector[i] + bVector[i]
 * for i in [0, num_points). The vector loop processes 4 elements per
 * iteration, widening the 4 floats to two 2-lane double vectors; a scalar
 * tail loop handles the final num_points % 4 elements.
 * NOTE(review): assumes the output buffer does not overlap the inputs —
 * standard VOLK kernel contract, confirm against the kernel docs. */
static inline void volk_32f_64f_add_64f_neonv8(double* cVector,
                                               const float* aVector,
                                               const double* bVector,
                                               unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int quarter_points = num_points / 4;

    double* cPtr = cVector;
    const float* aPtr = aVector;
    const double* bPtr = bVector;

    for (; number < quarter_points; number++) {
        // Load 4 floats
        float32x4_t aVal_f32 = vld1q_f32(aPtr);
        // Load 4 doubles (2x2)
        float64x2_t bVal0 = vld1q_f64(bPtr);
        float64x2_t bVal1 = vld1q_f64(bPtr + 2);
        // Hint the next iteration's operands into cache.
        __VOLK_PREFETCH(aPtr + 4);
        __VOLK_PREFETCH(bPtr + 4);

        // Convert float to double (low and high halves)
        float64x2_t aVal0 = vcvt_f64_f32(vget_low_f32(aVal_f32));
        float64x2_t aVal1 = vcvt_f64_f32(vget_high_f32(aVal_f32));

        // Add
        float64x2_t cVal0 = vaddq_f64(aVal0, bVal0);
        float64x2_t cVal1 = vaddq_f64(aVal1, bVal1);

        // Store
        vst1q_f64(cPtr, cVal0);
        vst1q_f64(cPtr + 2, cVal1);

        aPtr += 4;
        bPtr += 4;
        cPtr += 4;
    }

    // Scalar tail for the remaining num_points % 4 elements.
    number = quarter_points * 4;
    for (; number < num_points; number++) {
        *cPtr++ = ((double)(*aPtr++)) + (*bPtr++);
    }
}

#endif /* LV_HAVE_NEONV8 */

#ifdef LV_HAVE_AVX

#include <immintrin.h>
#include <xmmintrin.h>

/* AVX implementation, unaligned variant:
 * cVector[i] = (double)aVector[i] + bVector[i] for i in [0, num_points).
 * The vector loop handles 8 elements per iteration: one 8-float load is
 * split into its 128-bit halves, each half widened to 4 doubles, added to
 * the matching 4 input doubles, and stored. A scalar tail loop finishes
 * the remaining num_points % 8 elements. Uses _mm256_loadu/_storeu, so no
 * alignment requirement on the pointers.
 * NOTE(review): assumes the output buffer does not overlap the inputs —
 * standard VOLK kernel contract, confirm against the kernel docs. */
static inline void volk_32f_64f_add_64f_u_avx(double* cVector,
                                              const float* aVector,
                                              const double* bVector,
                                              unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int eighth_points = num_points / 8;

    double* cPtr = cVector;
    const float* aPtr = aVector;
    const double* bPtr = bVector;

    __m256 aVal;
    __m128 aVal1, aVal2;
    __m256d aDbl1, aDbl2, bVal1, bVal2, cVal1, cVal2;
    for (; number < eighth_points; number++) {

        aVal = _mm256_loadu_ps(aPtr);
        bVal1 = _mm256_loadu_pd(bPtr);
        bVal2 = _mm256_loadu_pd(bPtr + 4);

        // Split the 8 floats into low and high 128-bit halves.
        aVal1 = _mm256_extractf128_ps(aVal, 0);
        aVal2 = _mm256_extractf128_ps(aVal, 1);

        // Widen each 4-float half to 4 doubles.
        aDbl1 = _mm256_cvtps_pd(aVal1);
        aDbl2 = _mm256_cvtps_pd(aVal2);

        cVal1 = _mm256_add_pd(aDbl1, bVal1);
        cVal2 = _mm256_add_pd(aDbl2, bVal2);

        _mm256_storeu_pd(cPtr,
                         cVal1); // Store the results back into the C container
        _mm256_storeu_pd(cPtr + 4,
                         cVal2); // Store the results back into the C container

        aPtr += 8;
        bPtr += 8;
        cPtr += 8;
    }

    // Scalar tail for the remaining num_points % 8 elements.
    number = eighth_points * 8;
    for (; number < num_points; number++) {
        *cPtr++ = ((double)(*aPtr++)) + (*bPtr++);
    }
}

#endif /* LV_HAVE_AVX */

#ifdef LV_HAVE_AVX

#include <immintrin.h>
#include <xmmintrin.h>

/* AVX implementation, aligned variant:
 * cVector[i] = (double)aVector[i] + bVector[i] for i in [0, num_points).
 * Identical structure to the _u_avx version but uses _mm256_load/_store,
 * which require aVector, bVector, and cVector to be 32-byte aligned.
 * The vector loop handles 8 elements per iteration (8 floats split into
 * two 128-bit halves, each widened to 4 doubles and added); a scalar
 * tail loop finishes the remaining num_points % 8 elements.
 * NOTE(review): assumes the output buffer does not overlap the inputs —
 * standard VOLK kernel contract, confirm against the kernel docs. */
static inline void volk_32f_64f_add_64f_a_avx(double* cVector,
                                              const float* aVector,
                                              const double* bVector,
                                              unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int eighth_points = num_points / 8;

    double* cPtr = cVector;
    const float* aPtr = aVector;
    const double* bPtr = bVector;

    __m256 aVal;
    __m128 aVal1, aVal2;
    __m256d aDbl1, aDbl2, bVal1, bVal2, cVal1, cVal2;
    for (; number < eighth_points; number++) {

        aVal = _mm256_load_ps(aPtr);
        bVal1 = _mm256_load_pd(bPtr);
        bVal2 = _mm256_load_pd(bPtr + 4);

        // Split the 8 floats into low and high 128-bit halves.
        aVal1 = _mm256_extractf128_ps(aVal, 0);
        aVal2 = _mm256_extractf128_ps(aVal, 1);

        // Widen each 4-float half to 4 doubles.
        aDbl1 = _mm256_cvtps_pd(aVal1);
        aDbl2 = _mm256_cvtps_pd(aVal2);

        cVal1 = _mm256_add_pd(aDbl1, bVal1);
        cVal2 = _mm256_add_pd(aDbl2, bVal2);

        _mm256_store_pd(cPtr, cVal1); // Store the results back into the C container
        _mm256_store_pd(cPtr + 4,
                        cVal2); // Store the results back into the C container

        aPtr += 8;
        bPtr += 8;
        cPtr += 8;
    }

    // Scalar tail for the remaining num_points % 8 elements.
    number = eighth_points * 8;
    for (; number < num_points; number++) {
        *cPtr++ = ((double)(*aPtr++)) + (*bPtr++);
    }
}

#endif /* LV_HAVE_AVX */

#ifdef LV_HAVE_RVV
#include <riscv_vector.h>

/* RISC-V Vector (RVV) implementation:
 * cVector[i] = (double)aVector[i] + bVector[i] for i in [0, num_points).
 * Strip-mined loop: each pass processes vl elements, where vl is chosen
 * by vsetvl for e64/m8 from the remaining count, so no scalar tail loop
 * is needed. */
static inline void volk_32f_64f_add_64f_rvv(double* cVector,
                                            const float* aVector,
                                            const double* bVector,
                                            unsigned int num_points)
{
    size_t n = num_points;
    for (size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl, cVector += vl) {
        vl = __riscv_vsetvl_e64m8(n);
        // Widening convert: floats loaded at LMUL=4 become doubles at LMUL=8.
        vfloat64m8_t va = __riscv_vfwcvt_f(__riscv_vle32_v_f32m4(aVector, vl), vl);
        vfloat64m8_t vb = __riscv_vle64_v_f64m8(bVector, vl);
        __riscv_vse64(cVector, __riscv_vfadd(va, vb, vl), vl);
    }
}
#endif /* LV_HAVE_RVV */

#endif /* INCLUDED_volk_32f_64f_add_64f_H */