Vector Optimized Library of Kernels 3.3.0
Architecture-tuned implementations of math kernels
Loading...
Searching...
No Matches
volk_32f_convert_64f.h
Go to the documentation of this file.
/* -*- c++ -*- */
/*
 * Copyright 2012, 2014 Free Software Foundation, Inc.
 *
 * This file is part of VOLK
 *
 * SPDX-License-Identifier: LGPL-3.0-or-later
 */
9
51
52
53#ifndef INCLUDED_volk_32f_convert_64f_u_H
54#define INCLUDED_volk_32f_convert_64f_u_H
55
56#include <inttypes.h>
57#include <stdio.h>
58
59#ifdef LV_HAVE_AVX
60#include <immintrin.h>
61
/*
 * Convert num_points single-precision values from inputVector into
 * double-precision values in outputVector (AVX, unaligned variant).
 *
 * Full groups of four elements go through _mm256_cvtps_pd using the
 * unaligned load/store intrinsics; the remaining 0-3 elements are converted
 * with a plain cast.
 *
 * outputVector: destination, written at indices [0, num_points)
 * inputVector:  source, read at indices [0, num_points)
 * num_points:   number of elements to convert
 */
static inline void volk_32f_convert_64f_u_avx(double* outputVector,
                                              const float* inputVector,
                                              unsigned int num_points)
{
    /* First index that is not covered by a complete group of four. */
    const unsigned int tailStart = (num_points / 4) * 4;
    unsigned int idx = 0;

    /* Vector part: four floats in, four doubles out per iteration. */
    while (idx < tailStart) {
        const __m128 singles = _mm_loadu_ps(inputVector + idx);
        _mm256_storeu_pd(outputVector + idx, _mm256_cvtps_pd(singles));
        idx += 4;
    }

    /* Scalar part: whatever is left after the last full group. */
    while (idx < num_points) {
        outputVector[idx] = (double)inputVector[idx];
        idx++;
    }
}
90
91#endif /* LV_HAVE_AVX */
92
93#ifdef LV_HAVE_SSE2
94#include <emmintrin.h>
95
/*
 * Convert num_points single-precision values from inputVector into
 * double-precision values in outputVector (SSE2, unaligned variant).
 *
 * Each iteration loads four floats with an unaligned load. _mm_cvtps_pd only
 * widens the two low lanes, so the upper pair is moved down with
 * _mm_movehl_ps and converted in a second step. The remaining 0-3 elements
 * are converted with a plain cast.
 *
 * outputVector: destination, written at indices [0, num_points)
 * inputVector:  source, read at indices [0, num_points)
 * num_points:   number of elements to convert
 */
static inline void volk_32f_convert_64f_u_sse2(double* outputVector,
                                               const float* inputVector,
                                               unsigned int num_points)
{
    /* First index that is not covered by a complete group of four. */
    const unsigned int tailStart = (num_points / 4) * 4;
    unsigned int idx = 0;

    while (idx < tailStart) {
        const __m128 quad = _mm_loadu_ps(inputVector + idx);

        /* Lanes 0 and 1. */
        _mm_storeu_pd(outputVector + idx, _mm_cvtps_pd(quad));

        /* Lanes 2 and 3, shifted into the low half before widening. */
        const __m128 upperPair = _mm_movehl_ps(quad, quad);
        _mm_storeu_pd(outputVector + idx + 2, _mm_cvtps_pd(upperPair));

        idx += 4;
    }

    /* Scalar part: whatever is left after the last full group. */
    while (idx < num_points) {
        outputVector[idx] = (double)inputVector[idx];
        idx++;
    }
}
131#endif /* LV_HAVE_SSE2 */
132
133
134#ifdef LV_HAVE_GENERIC
135
/*
 * Portable reference implementation: widen each of the num_points floats in
 * inputVector to a double and store it at the same index in outputVector.
 *
 * outputVector: destination, written at indices [0, num_points)
 * inputVector:  source, read at indices [0, num_points)
 * num_points:   number of elements to convert
 */
static inline void volk_32f_convert_64f_generic(double* outputVector,
                                                const float* inputVector,
                                                unsigned int num_points)
{
    for (unsigned int idx = 0; idx < num_points; ++idx) {
        outputVector[idx] = (double)inputVector[idx];
    }
}
148#endif /* LV_HAVE_GENERIC */
149
150
151#endif /* INCLUDED_volk_32f_convert_64f_u_H */
152
153
154#ifndef INCLUDED_volk_32f_convert_64f_a_H
155#define INCLUDED_volk_32f_convert_64f_a_H
156
157#include <inttypes.h>
158#include <stdio.h>
159
160#ifdef LV_HAVE_AVX
161#include <immintrin.h>
162
/*
 * Convert num_points single-precision values from inputVector into
 * double-precision values in outputVector (AVX, aligned variant).
 *
 * Full groups of four elements are loaded with _mm_load_ps and stored with
 * _mm256_store_pd. Per Intel's intrinsic documentation these require 16-byte
 * and 32-byte aligned addresses respectively, so both buffers must be
 * suitably aligned. The remaining 0-3 elements are converted with a plain
 * cast.
 *
 * outputVector: destination, written at indices [0, num_points)
 * inputVector:  source, read at indices [0, num_points)
 * num_points:   number of elements to convert
 */
static inline void volk_32f_convert_64f_a_avx(double* outputVector,
                                              const float* inputVector,
                                              unsigned int num_points)
{
    /* First index that is not covered by a complete group of four. */
    const unsigned int tailStart = (num_points / 4) * 4;
    unsigned int idx = 0;

    /* Vector part: four floats in, four doubles out per iteration. */
    while (idx < tailStart) {
        const __m128 singles = _mm_load_ps(inputVector + idx);
        _mm256_store_pd(outputVector + idx, _mm256_cvtps_pd(singles));
        idx += 4;
    }

    /* Scalar part: whatever is left after the last full group. */
    while (idx < num_points) {
        outputVector[idx] = (double)inputVector[idx];
        idx++;
    }
}
191#endif /* LV_HAVE_AVX */
192
193#ifdef LV_HAVE_SSE2
194#include <emmintrin.h>
195
/*
 * Convert num_points single-precision values from inputVector into
 * double-precision values in outputVector (SSE2, aligned variant).
 *
 * Each iteration loads four floats with _mm_load_ps and writes two pairs of
 * doubles with _mm_store_pd; per Intel's intrinsic documentation both require
 * 16-byte aligned addresses. _mm_cvtps_pd only widens the two low lanes, so
 * the upper pair is moved down with _mm_movehl_ps and converted in a second
 * step. The remaining 0-3 elements are converted with a plain cast.
 *
 * outputVector: destination, written at indices [0, num_points)
 * inputVector:  source, read at indices [0, num_points)
 * num_points:   number of elements to convert
 */
static inline void volk_32f_convert_64f_a_sse2(double* outputVector,
                                               const float* inputVector,
                                               unsigned int num_points)
{
    /* First index that is not covered by a complete group of four. */
    const unsigned int tailStart = (num_points / 4) * 4;
    unsigned int idx = 0;

    while (idx < tailStart) {
        const __m128 quad = _mm_load_ps(inputVector + idx);

        /* Lanes 0 and 1. */
        _mm_store_pd(outputVector + idx, _mm_cvtps_pd(quad));

        /* Lanes 2 and 3, shifted into the low half before widening. */
        const __m128 upperPair = _mm_movehl_ps(quad, quad);
        _mm_store_pd(outputVector + idx + 2, _mm_cvtps_pd(upperPair));

        idx += 4;
    }

    /* Scalar part: whatever is left after the last full group. */
    while (idx < num_points) {
        outputVector[idx] = (double)inputVector[idx];
        idx++;
    }
}
231#endif /* LV_HAVE_SSE2 */
232
233#ifdef LV_HAVE_NEONV8
234#include <arm_neon.h>
235
236static inline void volk_32f_convert_64f_neonv8(double* outputVector,
237 const float* inputVector,
238 unsigned int num_points)
239{
240 unsigned int number = 0;
241 const unsigned int eighth_points = num_points / 8;
242
243 const float* inputPtr = inputVector;
244 double* outputPtr = outputVector;
245
246 for (; number < eighth_points; number++) {
247 float32x4_t in0 = vld1q_f32(inputPtr);
248 float32x4_t in1 = vld1q_f32(inputPtr + 4);
249 __VOLK_PREFETCH(inputPtr + 8);
250
251 float64x2_t out0 = vcvt_f64_f32(vget_low_f32(in0));
252 float64x2_t out1 = vcvt_f64_f32(vget_high_f32(in0));
253 float64x2_t out2 = vcvt_f64_f32(vget_low_f32(in1));
254 float64x2_t out3 = vcvt_f64_f32(vget_high_f32(in1));
255
256 vst1q_f64(outputPtr, out0);
257 vst1q_f64(outputPtr + 2, out1);
258 vst1q_f64(outputPtr + 4, out2);
259 vst1q_f64(outputPtr + 6, out3);
260
261 inputPtr += 8;
262 outputPtr += 8;
263 }
264
265 number = eighth_points * 8;
266 for (; number < num_points; number++) {
267 *outputPtr++ = (double)(*inputPtr++);
268 }
269}
270#endif /* LV_HAVE_NEONV8 */
271
272#ifdef LV_HAVE_RVV
273#include <riscv_vector.h>
274
275static inline void volk_32f_convert_64f_rvv(double* outputVector,
276 const float* inputVector,
277 unsigned int num_points)
278{
279 size_t n = num_points;
280 for (size_t vl; n > 0; n -= vl, inputVector += vl, outputVector += vl) {
281 vl = __riscv_vsetvl_e32m4(n);
282 vfloat32m4_t v = __riscv_vle32_v_f32m4(inputVector, vl);
283 __riscv_vse64(outputVector, __riscv_vfwcvt_f(v, vl), vl);
284 }
285}
286#endif /*LV_HAVE_RVV*/
287
288#endif /* INCLUDED_volk_32f_convert_64f_a_H */