Vector Optimized Library of Kernels 3.3.0
Architecture-tuned implementations of math kernels
volk_64f_x2_add_64f.h
/* -*- c++ -*- */
/*
 * Copyright 2018 Free Software Foundation, Inc.
 *
 * This file is part of VOLK
 *
 * SPDX-License-Identifier: LGPL-3.0-or-later
 */

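/*!
 * \b Overview
 *
 * Adds the two input vectors element by element and writes the result to the
 * output vector:
 *
 *   cVector[i] = aVector[i] + bVector[i]
 *
 * \b Example
 *
 * A minimal usage sketch, assuming the generated dispatcher
 * volk_64f_x2_add_64f() and the allocation helpers volk_get_alignment(),
 * volk_malloc(), and volk_free() from <volk/volk.h>:
 *
 * \code
 *   #include <stdio.h>
 *   #include <volk/volk.h>
 *
 *   int main(void)
 *   {
 *       unsigned int N = 10;
 *       size_t alignment = volk_get_alignment();
 *       double* a = (double*)volk_malloc(sizeof(double) * N, alignment);
 *       double* b = (double*)volk_malloc(sizeof(double) * N, alignment);
 *       double* c = (double*)volk_malloc(sizeof(double) * N, alignment);
 *
 *       for (unsigned int i = 0; i < N; ++i) {
 *           a[i] = (double)i;        // 0, 1, 2, ...
 *           b[i] = (double)(N - i);  // 10, 9, 8, ...
 *       }
 *
 *       volk_64f_x2_add_64f(c, a, b, N);  // c[i] = a[i] + b[i]
 *
 *       for (unsigned int i = 0; i < N; ++i) {
 *           printf("c[%u] = %1.1f\n", i, c[i]);
 *       }
 *
 *       volk_free(a);
 *       volk_free(b);
 *       volk_free(c);
 *       return 0;
 *   }
 * \endcode
 */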
#ifndef INCLUDED_volk_64f_x2_add_64f_H
#define INCLUDED_volk_64f_x2_add_64f_H

#include <inttypes.h>
#include <volk/volk_common.h> /* assumed source of the __VOLK_PREFETCH macro used below */

#ifdef LV_HAVE_GENERIC

static inline void volk_64f_x2_add_64f_generic(double* cVector,
                                               const double* aVector,
                                               const double* bVector,
                                               unsigned int num_points)
{
    double* cPtr = cVector;
    const double* aPtr = aVector;
    const double* bPtr = bVector;
    unsigned int number = 0;

    for (number = 0; number < num_points; number++) {
        *cPtr++ = (*aPtr++) + (*bPtr++);
    }
}

#endif /* LV_HAVE_GENERIC */

#ifdef LV_HAVE_NEONV8
#include <arm_neon.h>

static inline void volk_64f_x2_add_64f_neonv8(double* cVector,
                                              const double* aVector,
                                              const double* bVector,
                                              unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int quarter_points = num_points / 4;

    double* cPtr = cVector;
    const double* aPtr = aVector;
    const double* bPtr = bVector;

    // Process four doubles per iteration using two 2-lane float64x2_t registers.
    for (; number < quarter_points; number++) {
        float64x2_t aVal0 = vld1q_f64(aPtr);
        float64x2_t aVal1 = vld1q_f64(aPtr + 2);
        float64x2_t bVal0 = vld1q_f64(bPtr);
        float64x2_t bVal1 = vld1q_f64(bPtr + 2);
        __VOLK_PREFETCH(aPtr + 4);
        __VOLK_PREFETCH(bPtr + 4);

        float64x2_t cVal0 = vaddq_f64(aVal0, bVal0);
        float64x2_t cVal1 = vaddq_f64(aVal1, bVal1);

        vst1q_f64(cPtr, cVal0);
        vst1q_f64(cPtr + 2, cVal1);

        aPtr += 4;
        bPtr += 4;
        cPtr += 4;
    }

    // Handle the remaining 0-3 points one at a time.
    number = quarter_points * 4;
    for (; number < num_points; number++) {
        *cPtr++ = (*aPtr++) + (*bPtr++);
    }
}

#endif /* LV_HAVE_NEONV8 */

/*
 * Unaligned versions: these make no assumption about the alignment of the
 * input or output pointers.
 */

#ifdef LV_HAVE_SSE2

#include <emmintrin.h>

static inline void volk_64f_x2_add_64f_u_sse2(double* cVector,
                                              const double* aVector,
                                              const double* bVector,
                                              unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int half_points = num_points / 2;

    double* cPtr = cVector;
    const double* aPtr = aVector;
    const double* bPtr = bVector;

    __m128d aVal, bVal, cVal;
    for (; number < half_points; number++) {
        aVal = _mm_loadu_pd(aPtr);
        bVal = _mm_loadu_pd(bPtr);

        cVal = _mm_add_pd(aVal, bVal);

        _mm_storeu_pd(cPtr, cVal); // Store the results back into the C container

        aPtr += 2;
        bPtr += 2;
        cPtr += 2;
    }

    number = half_points * 2;
    for (; number < num_points; number++) {
        *cPtr++ = (*aPtr++) + (*bPtr++);
    }
}

#endif /* LV_HAVE_SSE2 */

#ifdef LV_HAVE_AVX

#include <immintrin.h>

static inline void volk_64f_x2_add_64f_u_avx(double* cVector,
                                             const double* aVector,
                                             const double* bVector,
                                             unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int quarter_points = num_points / 4;

    double* cPtr = cVector;
    const double* aPtr = aVector;
    const double* bPtr = bVector;

    __m256d aVal, bVal, cVal;
    for (; number < quarter_points; number++) {
        aVal = _mm256_loadu_pd(aPtr);
        bVal = _mm256_loadu_pd(bPtr);

        cVal = _mm256_add_pd(aVal, bVal);

        _mm256_storeu_pd(cPtr, cVal); // Store the results back into the C container

        aPtr += 4;
        bPtr += 4;
        cPtr += 4;
    }

    number = quarter_points * 4;
    for (; number < num_points; number++) {
        *cPtr++ = (*aPtr++) + (*bPtr++);
    }
}

#endif /* LV_HAVE_AVX */

/*
 * Aligned versions: these require the input and output pointers to meet the
 * machine's SIMD alignment (see the sketch below).
 */
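/* A minimal illustration of obtaining suitably aligned buffers for the _a_
 * kernels that follow, assuming volk_malloc(), volk_get_alignment(), and
 * volk_free() from <volk/volk.h>:
 *
 * \code
 *   size_t alignment = volk_get_alignment();
 *   double* buf = (double*)volk_malloc(sizeof(double) * num_points, alignment);
 *   // ... safe to pass buf to the _a_ kernels ...
 *   volk_free(buf);
 * \endcode
 */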

#ifdef LV_HAVE_SSE2

#include <emmintrin.h>

static inline void volk_64f_x2_add_64f_a_sse2(double* cVector,
                                              const double* aVector,
                                              const double* bVector,
                                              unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int half_points = num_points / 2;

    double* cPtr = cVector;
    const double* aPtr = aVector;
    const double* bPtr = bVector;

    __m128d aVal, bVal, cVal;
    for (; number < half_points; number++) {
        aVal = _mm_load_pd(aPtr);
        bVal = _mm_load_pd(bPtr);

        cVal = _mm_add_pd(aVal, bVal);

        _mm_store_pd(cPtr, cVal); // Store the results back into the C container

        aPtr += 2;
        bPtr += 2;
        cPtr += 2;
    }

    number = half_points * 2;
    for (; number < num_points; number++) {
        *cPtr++ = (*aPtr++) + (*bPtr++);
    }
}

#endif /* LV_HAVE_SSE2 */

#ifdef LV_HAVE_AVX

#include <immintrin.h>

static inline void volk_64f_x2_add_64f_a_avx(double* cVector,
                                             const double* aVector,
                                             const double* bVector,
                                             unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int quarter_points = num_points / 4;

    double* cPtr = cVector;
    const double* aPtr = aVector;
    const double* bPtr = bVector;

    __m256d aVal, bVal, cVal;
    for (; number < quarter_points; number++) {
        aVal = _mm256_load_pd(aPtr);
        bVal = _mm256_load_pd(bPtr);

        cVal = _mm256_add_pd(aVal, bVal);

        _mm256_store_pd(cPtr, cVal); // Store the results back into the C container

        aPtr += 4;
        bPtr += 4;
        cPtr += 4;
    }

    number = quarter_points * 4;
    for (; number < num_points; number++) {
        *cPtr++ = (*aPtr++) + (*bPtr++);
    }
}

#endif /* LV_HAVE_AVX */

#ifdef LV_HAVE_RVV
#include <riscv_vector.h>

static inline void volk_64f_x2_add_64f_rvv(double* cVector,
                                           const double* aVector,
                                           const double* bVector,
                                           unsigned int num_points)
{
    size_t n = num_points;
    // Strip-mine over the input: vsetvl picks the largest legal vector length
    // (<= n) for each pass, so no scalar tail loop is needed.
    for (size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl, cVector += vl) {
        vl = __riscv_vsetvl_e64m8(n);
        vfloat64m8_t va = __riscv_vle64_v_f64m8(aVector, vl);
        vfloat64m8_t vb = __riscv_vle64_v_f64m8(bVector, vl);
        __riscv_vse64(cVector, __riscv_vfadd(va, vb, vl), vl);
    }
}
#endif /* LV_HAVE_RVV */
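
/* A small test sketch: when the matching LV_HAVE_* macro is defined at
 * compile time, any protokernel above can also be called directly, bypassing
 * the runtime dispatcher. For example, against the generic reference:
 *
 * \code
 *   double a[4] = { 1.0, 2.0, 3.0, 4.0 };
 *   double b[4] = { 4.0, 3.0, 2.0, 1.0 };
 *   double c[4];
 *   volk_64f_x2_add_64f_generic(c, a, b, 4);  // c[i] == 5.0 for all i
 * \endcode
 */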

#endif /* INCLUDED_volk_64f_x2_add_64f_H */