52#ifndef INCLUDED_volk_32f_sqrt_32f_a_H
53#define INCLUDED_volk_32f_sqrt_32f_a_H
/* Scalar path: computes the square root of every input float one element at a
 * time. aPtr is the read cursor over aVector and cPtr the write cursor over
 * cVector.
 * NOTE(review): the enclosing function header and closing braces are not
 * visible in this excerpt. */
64 float* cPtr = cVector;
65 const float* aPtr = aVector;
66 unsigned int number = 0;
/* Visits all num_points elements; each pass reads one float, stores its
 * sqrtf() result and advances both cursors by one. */
68 for (number = 0; number < num_points; number++) {
69 *cPtr++ = sqrtf(*aPtr++);
/* SSE path, aligned variant: square roots are taken four floats at a time,
 * then a scalar loop handles the elements that do not fill a group of four.
 * NOTE(review): the function header, the declarations of aVal/cVal and the
 * statements that advance aPtr/cPtr inside the vector loop are not visible in
 * this excerpt. */
82 unsigned int number = 0;
/* Count of complete 4-float groups in the input. */
83 const unsigned int quarterPoints = num_points / 4;
85 float* cPtr = cVector;
86 const float* aPtr = aVector;
89 for (; number < quarterPoints; number++) {
/* _mm_load_ps and _mm_store_ps require 16-byte aligned addresses. */
90 aVal = _mm_load_ps(aPtr);
/* Square root of each of the four packed floats. */
92 cVal = _mm_sqrt_ps(aVal);
94 _mm_store_ps(cPtr, cVal);
/* Restart the element index just past the vectorised groups and finish the
 * remaining (num_points % 4) elements with sqrtf(). */
100 number = quarterPoints * 4;
101 for (; number < num_points; number++) {
102 *cPtr++ = sqrtf(*aPtr++);
109#include <immintrin.h>
/*!
 * \brief Element-wise square root of a float vector, 512-bit aligned variant.
 *
 * Processes the input sixteen floats at a time with _mm512_sqrt_ps and
 * finishes the leftover elements with sqrtf().
 *
 * \param cVector    Output buffer; results are written through cPtr.
 * \param aVector    Input buffer; values are read through aPtr.
 * \param num_points Number of floats to process.
 *
 * NOTE(review): the return type, braces, the declarations of aVal/cVal and
 * the per-iteration advance of aPtr/cPtr are not visible in this excerpt.
 */
112volk_32f_sqrt_32f_a_avx512(
float* cVector,
const float* aVector,
unsigned int num_points)
114 unsigned int number = 0;
/* Count of complete 16-float groups in the input. */
115 const unsigned int sixteenthPoints = num_points / 16;
117 float* cPtr = cVector;
118 const float* aPtr = aVector;
121 for (; number < sixteenthPoints; number++) {
/* _mm512_load_ps and _mm512_store_ps require 64-byte aligned addresses. */
122 aVal = _mm512_load_ps(aPtr);
123 cVal = _mm512_sqrt_ps(aVal);
124 _mm512_store_ps(cPtr, cVal);
/* Scalar tail: restart the index past the vectorised groups and handle the
 * remaining (num_points % 16) elements with sqrtf(). */
130 number = sixteenthPoints * 16;
131 for (; number < num_points; number++) {
132 *cPtr++ = sqrtf(*aPtr++);
140#include <immintrin.h>
/*!
 * \brief Element-wise square root of a float vector, 256-bit aligned variant.
 *
 * Processes the input eight floats at a time with _mm256_sqrt_ps and finishes
 * the leftover elements with sqrtf().
 *
 * \param cVector    Output buffer; results are written through cPtr.
 * \param aVector    Input buffer; values are read through aPtr.
 * \param num_points Number of floats to process.
 *
 * NOTE(review): the return type, braces, the declarations of aVal/cVal and
 * the per-iteration advance of aPtr/cPtr are not visible in this excerpt.
 * NOTE(review): the visible intrinsics are the 256-bit float load, sqrt and
 * store only; confirm whether anything specific to AVX2 is needed here.
 */
143volk_32f_sqrt_32f_a_avx2(
float* cVector,
const float* aVector,
unsigned int num_points)
145 unsigned int number = 0;
/* Count of complete 8-float groups in the input. */
146 const unsigned int eighthPoints = num_points / 8;
148 float* cPtr = cVector;
149 const float* aPtr = aVector;
152 for (; number < eighthPoints; number++) {
/* _mm256_load_ps and _mm256_store_ps require 32-byte aligned addresses. */
153 aVal = _mm256_load_ps(aPtr);
154 cVal = _mm256_sqrt_ps(aVal);
155 _mm256_store_ps(cPtr, cVal);
/* Scalar tail: restart the index past the vectorised groups and handle the
 * remaining (num_points % 8) elements with sqrtf(). */
161 number = eighthPoints * 8;
162 for (; number < num_points; number++) {
163 *cPtr++ = sqrtf(*aPtr++);
171#include <immintrin.h>
/* 256-bit path, aligned variant: square roots are taken eight floats at a
 * time, then a scalar loop handles the elements that do not fill a group of
 * eight.
 * NOTE(review): the function header, the declarations of aVal/cVal and the
 * statements that advance aPtr/cPtr inside the vector loop are not visible in
 * this excerpt. */
176 unsigned int number = 0;
/* Count of complete 8-float groups in the input. */
177 const unsigned int eighthPoints = num_points / 8;
179 float* cPtr = cVector;
180 const float* aPtr = aVector;
183 for (; number < eighthPoints; number++) {
/* _mm256_load_ps and _mm256_store_ps require 32-byte aligned addresses. */
184 aVal = _mm256_load_ps(aPtr);
/* Square root of each of the eight packed floats. */
186 cVal = _mm256_sqrt_ps(aVal);
188 _mm256_store_ps(cPtr, cVal);
/* Restart the element index just past the vectorised groups and finish the
 * remaining (num_points % 8) elements with sqrtf(). */
194 number = eighthPoints * 8;
195 for (; number < num_points; number++) {
196 *cPtr++ = sqrtf(*aPtr++);
/* NEON path: handles four floats per iteration using estimate instructions,
 * then a scalar loop finishes the elements that do not fill a group of four.
 * NOTE(review): the function header and the statements that advance aPtr/cPtr
 * inside the vector loop are not visible in this excerpt. */
209 float* cPtr = cVector;
210 const float* aPtr = aVector;
211 unsigned int number = 0;
/* Count of complete 4-float groups in the input. */
212 unsigned int quarter_points = num_points / 4;
213 float32x4_t in_vec, out_vec;
215 for (number = 0; number < quarter_points; number++) {
/* Load four consecutive floats from the input cursor. */
216 in_vec = vld1q_f32(aPtr);
/* sqrt(x) is formed as 1 / (1 / sqrt(x)): vrsqrteq_f32 yields a reciprocal
 * square-root estimate and vrecpeq_f32 a reciprocal estimate of that.
 * NOTE(review): both intrinsics are low-precision estimates and no refinement
 * step (vrsqrtsq_f32 / vrecpsq_f32) is visible here, so the vectorised lanes
 * are approximate while the tail below uses exact sqrtf(). Confirm that this
 * accuracy difference is acceptable to callers. */
218 out_vec = vrecpeq_f32(vrsqrteq_f32(in_vec));
219 vst1q_f32(cPtr, out_vec);
/* Scalar tail for the remaining (num_points % 4) elements. */
224 for (number = quarter_points * 4; number < num_points; number++) {
225 *cPtr++ = sqrtf(*aPtr++);
/*!
 * \brief Element-wise square root of a float vector using vsqrtq_f32.
 *
 * Processes the input four floats at a time and finishes the leftover
 * elements with sqrtf().
 *
 * \param cVector    Output buffer; results are written through cPtr.
 * \param aVector    Input buffer; values are read through aPtr.
 * \param num_points Number of floats to process.
 *
 * NOTE(review): the return type, braces and the per-iteration advance of
 * aPtr/cPtr are not visible in this excerpt.
 */
235volk_32f_sqrt_32f_neonv8(
float* cVector,
const float* aVector,
unsigned int num_points)
237 float* cPtr = cVector;
238 const float* aPtr = aVector;
239 unsigned int number = 0;
/* Count of complete 4-float groups in the input. */
240 unsigned int quarter_points = num_points / 4;
242 for (number = 0; number < quarter_points; number++) {
/* Load four floats, take the square root of each lane, store four floats. */
243 float32x4_t in_vec = vld1q_f32(aPtr);
244 float32x4_t out_vec = vsqrtq_f32(in_vec);
245 vst1q_f32(cPtr, out_vec);
/* Scalar tail for the remaining (num_points % 4) elements. */
250 for (number = quarter_points * 4; number < num_points; number++) {
251 *cPtr++ = sqrtf(*aPtr++);
259#ifndef INCLUDED_volk_32f_sqrt_32f_u_H
260#define INCLUDED_volk_32f_sqrt_32f_u_H
267#include <xmmintrin.h>
/* SSE path, unaligned variant: square roots are taken four floats at a time,
 * then a scalar loop handles the elements that do not fill a group of four.
 * NOTE(review): the function header, the declarations of aVal/cVal and the
 * statements that advance aPtr/cPtr inside the vector loop are not visible in
 * this excerpt. */
272 unsigned int number = 0;
/* Count of complete 4-float groups in the input. */
273 const unsigned int quarterPoints = num_points / 4;
275 float* cPtr = cVector;
276 const float* aPtr = aVector;
279 for (; number < quarterPoints; number++) {
/* The loadu/storeu forms place no alignment requirement on the buffers. */
280 aVal = _mm_loadu_ps(aPtr);
281 cVal = _mm_sqrt_ps(aVal);
282 _mm_storeu_ps(cPtr, cVal);
/* Restart the element index just past the vectorised groups and finish the
 * remaining (num_points % 4) elements with sqrtf(). */
288 number = quarterPoints * 4;
289 for (; number < num_points; number++) {
290 *cPtr++ = sqrtf(*aPtr++);
298#include <immintrin.h>
/*!
 * \brief Element-wise square root of a float vector, 512-bit unaligned variant.
 *
 * Processes the input sixteen floats at a time with _mm512_sqrt_ps and
 * finishes the leftover elements with sqrtf().
 *
 * \param cVector    Output buffer; results are written through cPtr.
 * \param aVector    Input buffer; values are read through aPtr.
 * \param num_points Number of floats to process.
 *
 * NOTE(review): the return type, braces, the declarations of aVal/cVal and
 * the per-iteration advance of aPtr/cPtr are not visible in this excerpt.
 */
301volk_32f_sqrt_32f_u_avx512(
float* cVector,
const float* aVector,
unsigned int num_points)
303 unsigned int number = 0;
/* Count of complete 16-float groups in the input. */
304 const unsigned int sixteenthPoints = num_points / 16;
306 float* cPtr = cVector;
307 const float* aPtr = aVector;
310 for (; number < sixteenthPoints; number++) {
/* The loadu/storeu forms place no alignment requirement on the buffers. */
311 aVal = _mm512_loadu_ps(aPtr);
312 cVal = _mm512_sqrt_ps(aVal);
313 _mm512_storeu_ps(cPtr, cVal);
/* Scalar tail: restart the index past the vectorised groups and handle the
 * remaining (num_points % 16) elements with sqrtf(). */
319 number = sixteenthPoints * 16;
320 for (; number < num_points; number++) {
321 *cPtr++ = sqrtf(*aPtr++);
329#include <immintrin.h>
/*!
 * \brief Element-wise square root of a float vector, 256-bit unaligned variant.
 *
 * Processes the input eight floats at a time with _mm256_sqrt_ps and finishes
 * the leftover elements with sqrtf().
 *
 * \param cVector    Output buffer; results are written through cPtr.
 * \param aVector    Input buffer; values are read through aPtr.
 * \param num_points Number of floats to process.
 *
 * NOTE(review): the return type, braces, the declarations of aVal/cVal and
 * the per-iteration advance of aPtr/cPtr are not visible in this excerpt.
 * NOTE(review): the visible intrinsics are the 256-bit float load, sqrt and
 * store only; confirm whether anything specific to AVX2 is needed here.
 */
332volk_32f_sqrt_32f_u_avx2(
float* cVector,
const float* aVector,
unsigned int num_points)
334 unsigned int number = 0;
/* Count of complete 8-float groups in the input. */
335 const unsigned int eighthPoints = num_points / 8;
337 float* cPtr = cVector;
338 const float* aPtr = aVector;
341 for (; number < eighthPoints; number++) {
/* The loadu/storeu forms place no alignment requirement on the buffers. */
342 aVal = _mm256_loadu_ps(aPtr);
343 cVal = _mm256_sqrt_ps(aVal);
344 _mm256_storeu_ps(cPtr, cVal);
/* Scalar tail: restart the index past the vectorised groups and handle the
 * remaining (num_points % 8) elements with sqrtf(). */
350 number = eighthPoints * 8;
351 for (; number < num_points; number++) {
352 *cPtr++ = sqrtf(*aPtr++);
360#include <immintrin.h>
/* 256-bit path, unaligned variant: square roots are taken eight floats at a
 * time, then a scalar loop handles the elements that do not fill a group of
 * eight.
 * NOTE(review): the function header, the declarations of aVal/cVal and the
 * statements that advance aPtr/cPtr inside the vector loop are not visible in
 * this excerpt. */
365 unsigned int number = 0;
/* Count of complete 8-float groups in the input. */
366 const unsigned int eighthPoints = num_points / 8;
368 float* cPtr = cVector;
369 const float* aPtr = aVector;
372 for (; number < eighthPoints; number++) {
/* The loadu/storeu forms place no alignment requirement on the buffers. */
373 aVal = _mm256_loadu_ps(aPtr);
/* Square root of each of the eight packed floats. */
375 cVal = _mm256_sqrt_ps(aVal);
377 _mm256_storeu_ps(cPtr, cVal);
/* Restart the element index just past the vectorised groups and finish the
 * remaining (num_points % 8) elements with sqrtf(). */
383 number = eighthPoints * 8;
384 for (; number < num_points; number++) {
385 *cPtr++ = sqrtf(*aPtr++);
392#include <riscv_vector.h>
/*!
 * \brief Element-wise square root of a float vector using RISC-V vector
 *        intrinsics.
 *
 * \param cVector    Output buffer; advanced directly as results are stored.
 * \param aVector    Input buffer; advanced directly as values are loaded.
 * \param num_points Number of floats to process.
 *
 * NOTE(review): the return type and braces are not visible in this excerpt.
 */
395volk_32f_sqrt_32f_rvv(
float* cVector,
const float* aVector,
unsigned int num_points)
/* n counts the elements still to be processed. */
397 size_t n = num_points;
/* Each pass consumes vl elements: n shrinks by vl and both pointers move
 * forward by vl, so the loop ends once every element has been handled. No
 * separate scalar tail appears in the visible code. */
398 for (
size_t vl; n > 0; n -= vl, aVector += vl, cVector += vl) {
/* Ask for the element count of this pass (32-bit elements, m8 register
 * grouping) given that n elements remain. */
399 vl = __riscv_vsetvl_e32m8(n);
/* Load vl floats, take their square roots and store vl results. */
400 vfloat32m8_t v = __riscv_vle32_v_f32m8(aVector, vl);
401 __riscv_vse32(cVector, __riscv_vfsqrt(v, vl), vl);