Vector Optimized Library of Kernels 3.3.0
Architecture-tuned implementations of math kernels
Loading...
Searching...
No Matches
volk_32f_sqrt_32f.h
Go to the documentation of this file.
/* -*- c++ -*- */
/*
 * Copyright 2012, 2014 Free Software Foundation, Inc.
 *
 * This file is part of VOLK
 *
 * SPDX-License-Identifier: LGPL-3.0-or-later
 */
9
51
52#ifndef INCLUDED_volk_32f_sqrt_32f_a_H
53#define INCLUDED_volk_32f_sqrt_32f_a_H
54
55#include <inttypes.h>
56#include <math.h>
57#include <stdio.h>
58
59#ifdef LV_HAVE_GENERIC
60
61static inline void
62volk_32f_sqrt_32f_generic(float* cVector, const float* aVector, unsigned int num_points)
63{
64 float* cPtr = cVector;
65 const float* aPtr = aVector;
66 unsigned int number = 0;
67
68 for (number = 0; number < num_points; number++) {
69 *cPtr++ = sqrtf(*aPtr++);
70 }
71}
72
73#endif /* LV_HAVE_GENERIC */
74
75
76#ifdef LV_HAVE_SSE
77#include <xmmintrin.h>
78
79static inline void
80volk_32f_sqrt_32f_a_sse(float* cVector, const float* aVector, unsigned int num_points)
81{
82 unsigned int number = 0;
83 const unsigned int quarterPoints = num_points / 4;
84
85 float* cPtr = cVector;
86 const float* aPtr = aVector;
87
88 __m128 aVal, cVal;
89 for (; number < quarterPoints; number++) {
90 aVal = _mm_load_ps(aPtr);
91
92 cVal = _mm_sqrt_ps(aVal);
93
94 _mm_store_ps(cPtr, cVal); // Store the results back into the C container
95
96 aPtr += 4;
97 cPtr += 4;
98 }
99
100 number = quarterPoints * 4;
101 for (; number < num_points; number++) {
102 *cPtr++ = sqrtf(*aPtr++);
103 }
104}
105
106#endif /* LV_HAVE_SSE */
107
#ifdef LV_HAVE_AVX512F
#include <immintrin.h>

/*
 * AVX-512F kernel (aligned): element-wise square root of aVector into cVector.
 *
 * Sixteen floats are processed per iteration with _mm512_sqrt_ps; the
 * remaining num_points % 16 elements are finished with scalar sqrtf.
 *
 * The guard is written as #ifdef, like every other LV_HAVE_* guard in this
 * header, so it only depends on the macro being defined.
 *
 * cVector     output buffer; accessed with _mm512_store_ps, so it must be
 *             64-byte aligned
 * aVector     input buffer; accessed with _mm512_load_ps, so it must be
 *             64-byte aligned
 * num_points  number of elements to process
 */
static inline void
volk_32f_sqrt_32f_a_avx512(float* cVector, const float* aVector, unsigned int num_points)
{
    const unsigned int sixteenthPoints = num_points / 16;
    float* cPtr = cVector;
    const float* aPtr = aVector;

    // Vector body: one 512-bit register (16 floats) per pass.
    for (unsigned int i = 0; i < sixteenthPoints; i++) {
        const __m512 aVal = _mm512_load_ps(aPtr);
        _mm512_store_ps(cPtr, _mm512_sqrt_ps(aVal));
        aPtr += 16;
        cPtr += 16;
    }

    // Scalar tail for the elements that do not fill a whole register.
    for (unsigned int i = sixteenthPoints * 16; i < num_points; i++) {
        *cPtr++ = sqrtf(*aPtr++);
    }
}

#endif /* LV_HAVE_AVX512F */
137
138
139#ifdef LV_HAVE_AVX2
140#include <immintrin.h>
141
/*
 * AVX2-dispatched kernel (aligned): element-wise square root of aVector into
 * cVector.
 *
 * Eight floats are processed per iteration with _mm256_sqrt_ps; the
 * remaining num_points % 8 elements are finished with scalar sqrtf.
 *
 * cVector     output buffer; accessed with _mm256_store_ps, so it must be
 *             32-byte aligned
 * aVector     input buffer; accessed with _mm256_load_ps, so it must be
 *             32-byte aligned
 * num_points  number of elements to process
 */
static inline void
volk_32f_sqrt_32f_a_avx2(float* cVector, const float* aVector, unsigned int num_points)
{
    const unsigned int eighthPoints = num_points / 8;
    float* cPtr = cVector;
    const float* aPtr = aVector;

    // Vector body: one 256-bit register (8 floats) per pass.
    for (unsigned int i = 0; i < eighthPoints; i++) {
        const __m256 aVal = _mm256_load_ps(aPtr);
        _mm256_store_ps(cPtr, _mm256_sqrt_ps(aVal));
        aPtr += 8;
        cPtr += 8;
    }

    // Scalar tail for the elements that do not fill a whole register.
    for (unsigned int i = eighthPoints * 8; i < num_points; i++) {
        *cPtr++ = sqrtf(*aPtr++);
    }
}
166
167#endif /* LV_HAVE_AVX2 */
168
169
170#ifdef LV_HAVE_AVX
171#include <immintrin.h>
172
/*
 * AVX kernel (aligned): element-wise square root of aVector into cVector.
 *
 * Eight floats are processed per iteration with _mm256_sqrt_ps; the
 * remaining num_points % 8 elements are finished with scalar sqrtf.
 *
 * cVector     output buffer; accessed with _mm256_store_ps, so it must be
 *             32-byte aligned
 * aVector     input buffer; accessed with _mm256_load_ps, so it must be
 *             32-byte aligned
 * num_points  number of elements to process
 */
static inline void
volk_32f_sqrt_32f_a_avx(float* cVector, const float* aVector, unsigned int num_points)
{
    const unsigned int eighthPoints = num_points / 8;
    float* cPtr = cVector;
    const float* aPtr = aVector;

    // Vector body: one 256-bit register (8 floats) per pass.
    for (unsigned int i = 0; i < eighthPoints; i++) {
        const __m256 aVal = _mm256_load_ps(aPtr);
        const __m256 cVal = _mm256_sqrt_ps(aVal);
        _mm256_store_ps(cPtr, cVal); // Store the results back into the C container
        aPtr += 8;
        cPtr += 8;
    }

    // Scalar tail for the elements that do not fill a whole register.
    for (unsigned int i = eighthPoints * 8; i < num_points; i++) {
        *cPtr++ = sqrtf(*aPtr++);
    }
}
199
200#endif /* LV_HAVE_AVX */
201
202
203#ifdef LV_HAVE_NEON
204#include <arm_neon.h>
205
/*
 * NEON kernel: element-wise square root of aVector into cVector.
 *
 * Four floats are processed per iteration. The vector path does not use a
 * true square-root instruction: it takes the reciprocal estimate of the
 * reciprocal-square-root estimate, with no refinement step, so its results
 * are approximations. The remaining num_points % 4 elements go through
 * scalar sqrtf, so tail elements may be more accurate than the vectorised
 * ones.
 *
 * cVector     output buffer; num_points floats are written
 * aVector     input buffer; num_points floats are read
 * num_points  number of elements to process
 */
static inline void
volk_32f_sqrt_32f_neon(float* cVector, const float* aVector, unsigned int num_points)
{
    const unsigned int quarter_points = num_points / 4;
    float* cPtr = cVector;
    const float* aPtr = aVector;

    for (unsigned int i = 0; i < quarter_points; i++) {
        const float32x4_t in_vec = vld1q_f32(aPtr);
        // sqrt(x) ~= 1 / (1 / sqrt(x)), both steps being hardware estimates.
        // AArch64 provides vsqrtq_f32 (see the neonv8 kernel in this file),
        // which computes the square root directly.
        const float32x4_t out_vec = vrecpeq_f32(vrsqrteq_f32(in_vec));
        vst1q_f32(cPtr, out_vec);
        aPtr += 4;
        cPtr += 4;
    }

    // Scalar tail for the elements that do not fill a whole register.
    for (unsigned int i = quarter_points * 4; i < num_points; i++) {
        *cPtr++ = sqrtf(*aPtr++);
    }
}
228
229#endif /* LV_HAVE_NEON */
230
231#ifdef LV_HAVE_NEONV8
232#include <arm_neon.h>
233
/*
 * ARMv8 NEON kernel: element-wise square root of aVector into cVector.
 *
 * Four floats are processed per iteration with vsqrtq_f32; the remaining
 * num_points % 4 elements are finished with scalar sqrtf.
 *
 * cVector     output buffer; num_points floats are written
 * aVector     input buffer; num_points floats are read
 * num_points  number of elements to process
 */
static inline void
volk_32f_sqrt_32f_neonv8(float* cVector, const float* aVector, unsigned int num_points)
{
    const unsigned int quarter_points = num_points / 4;
    float* cPtr = cVector;
    const float* aPtr = aVector;

    // Vector body: one 128-bit register (4 floats) per pass.
    for (unsigned int i = 0; i < quarter_points; i++) {
        vst1q_f32(cPtr, vsqrtq_f32(vld1q_f32(aPtr)));
        aPtr += 4;
        cPtr += 4;
    }

    // Scalar tail for the elements that do not fill a whole register.
    for (unsigned int i = quarter_points * 4; i < num_points; i++) {
        *cPtr++ = sqrtf(*aPtr++);
    }
}
254
255#endif /* LV_HAVE_NEONV8 */
256
257#endif /* INCLUDED_volk_32f_sqrt_32f_a_H */
258
259#ifndef INCLUDED_volk_32f_sqrt_32f_u_H
260#define INCLUDED_volk_32f_sqrt_32f_u_H
261
262#include <inttypes.h>
263#include <math.h>
264#include <stdio.h>
265
266#ifdef LV_HAVE_SSE
267#include <xmmintrin.h>
268
269static inline void
270volk_32f_sqrt_32f_u_sse(float* cVector, const float* aVector, unsigned int num_points)
271{
272 unsigned int number = 0;
273 const unsigned int quarterPoints = num_points / 4;
274
275 float* cPtr = cVector;
276 const float* aPtr = aVector;
277
278 __m128 aVal, cVal;
279 for (; number < quarterPoints; number++) {
280 aVal = _mm_loadu_ps(aPtr);
281 cVal = _mm_sqrt_ps(aVal);
282 _mm_storeu_ps(cPtr, cVal);
283
284 aPtr += 4;
285 cPtr += 4;
286 }
287
288 number = quarterPoints * 4;
289 for (; number < num_points; number++) {
290 *cPtr++ = sqrtf(*aPtr++);
291 }
292}
293
294#endif /* LV_HAVE_SSE */
295
296
#ifdef LV_HAVE_AVX512F
#include <immintrin.h>

/*
 * AVX-512F kernel (unaligned): element-wise square root of aVector into
 * cVector.
 *
 * Sixteen floats are processed per iteration with _mm512_sqrt_ps using
 * unaligned loads and stores, so the buffers need no particular alignment.
 * The remaining num_points % 16 elements are finished with scalar sqrtf.
 *
 * The guard is written as #ifdef, like every other LV_HAVE_* guard in this
 * header, so it only depends on the macro being defined.
 *
 * cVector     output buffer; num_points floats are written
 * aVector     input buffer; num_points floats are read
 * num_points  number of elements to process
 */
static inline void
volk_32f_sqrt_32f_u_avx512(float* cVector, const float* aVector, unsigned int num_points)
{
    const unsigned int sixteenthPoints = num_points / 16;
    float* cPtr = cVector;
    const float* aPtr = aVector;

    // Vector body: one 512-bit register (16 floats) per pass.
    for (unsigned int i = 0; i < sixteenthPoints; i++) {
        const __m512 aVal = _mm512_loadu_ps(aPtr);
        _mm512_storeu_ps(cPtr, _mm512_sqrt_ps(aVal));
        aPtr += 16;
        cPtr += 16;
    }

    // Scalar tail for the elements that do not fill a whole register.
    for (unsigned int i = sixteenthPoints * 16; i < num_points; i++) {
        *cPtr++ = sqrtf(*aPtr++);
    }
}

#endif /* LV_HAVE_AVX512F */
326
327
328#ifdef LV_HAVE_AVX2
329#include <immintrin.h>
330
/*
 * AVX2-dispatched kernel (unaligned): element-wise square root of aVector
 * into cVector.
 *
 * Eight floats are processed per iteration with _mm256_sqrt_ps using
 * unaligned loads and stores, so the buffers need no particular alignment.
 * The remaining num_points % 8 elements are finished with scalar sqrtf.
 *
 * cVector     output buffer; num_points floats are written
 * aVector     input buffer; num_points floats are read
 * num_points  number of elements to process
 */
static inline void
volk_32f_sqrt_32f_u_avx2(float* cVector, const float* aVector, unsigned int num_points)
{
    const unsigned int eighthPoints = num_points / 8;
    float* cPtr = cVector;
    const float* aPtr = aVector;

    // Vector body: one 256-bit register (8 floats) per pass.
    for (unsigned int i = 0; i < eighthPoints; i++) {
        const __m256 aVal = _mm256_loadu_ps(aPtr);
        _mm256_storeu_ps(cPtr, _mm256_sqrt_ps(aVal));
        aPtr += 8;
        cPtr += 8;
    }

    // Scalar tail for the elements that do not fill a whole register.
    for (unsigned int i = eighthPoints * 8; i < num_points; i++) {
        *cPtr++ = sqrtf(*aPtr++);
    }
}
355
356#endif /* LV_HAVE_AVX2 */
357
358
359#ifdef LV_HAVE_AVX
360#include <immintrin.h>
361
/*
 * AVX kernel (unaligned): element-wise square root of aVector into cVector.
 *
 * Eight floats are processed per iteration with _mm256_sqrt_ps using
 * unaligned loads and stores, so the buffers need no particular alignment.
 * The remaining num_points % 8 elements are finished with scalar sqrtf.
 *
 * cVector     output buffer; num_points floats are written
 * aVector     input buffer; num_points floats are read
 * num_points  number of elements to process
 */
static inline void
volk_32f_sqrt_32f_u_avx(float* cVector, const float* aVector, unsigned int num_points)
{
    const unsigned int eighthPoints = num_points / 8;
    float* cPtr = cVector;
    const float* aPtr = aVector;

    // Vector body: one 256-bit register (8 floats) per pass.
    for (unsigned int i = 0; i < eighthPoints; i++) {
        const __m256 aVal = _mm256_loadu_ps(aPtr);
        const __m256 cVal = _mm256_sqrt_ps(aVal);
        _mm256_storeu_ps(cPtr, cVal); // Store the results back into the C container
        aPtr += 8;
        cPtr += 8;
    }

    // Scalar tail for the elements that do not fill a whole register.
    for (unsigned int i = eighthPoints * 8; i < num_points; i++) {
        *cPtr++ = sqrtf(*aPtr++);
    }
}
388
389#endif /* LV_HAVE_AVX */
390
391#ifdef LV_HAVE_RVV
392#include <riscv_vector.h>
393
/*
 * RISC-V Vector kernel: element-wise square root of aVector into cVector.
 *
 * Each pass asks __riscv_vsetvl_e32m8 how many 32-bit elements (LMUL = 8) to
 * handle for the elements still outstanding, processes that many, and then
 * advances both pointers by the same count. There is no separate scalar
 * tail; the loop ends when no elements remain.
 *
 * cVector     output buffer; num_points floats are written
 * aVector     input buffer; num_points floats are read
 * num_points  number of elements to process
 */
static inline void
volk_32f_sqrt_32f_rvv(float* cVector, const float* aVector, unsigned int num_points)
{
    size_t remaining = num_points;

    while (remaining > 0) {
        const size_t vl = __riscv_vsetvl_e32m8(remaining);
        const vfloat32m8_t in = __riscv_vle32_v_f32m8(aVector, vl);
        __riscv_vse32(cVector, __riscv_vfsqrt(in, vl), vl);

        remaining -= vl;
        aVector += vl;
        cVector += vl;
    }
}
404#endif /*LV_HAVE_RVV*/
405
406#endif /* INCLUDED_volk_32f_sqrt_32f_u_H */