63#ifndef INCLUDED_volk_32f_asin_32f_a_H
64#define INCLUDED_volk_32f_asin_32f_a_H
// Scalar loop header: i runs from 0 up to (not including) num_points.
// NOTE(review): the loop body and the enclosing function are not visible here.
71 for (
unsigned int i = 0; i < num_points; i++) {
// volk_32f_asin_32f_a_sse4_1: 128-bit path that reads aVector and writes
// bVector four floats at a time with aligned load/store intrinsics.
//   bVector    - output buffer
//   aVector    - input buffer
//   num_points - number of floats to process
// NOTE(review): this view is incomplete; the definitions of poly_small and
// poly_large, any pointer advances and the tail-loop body are not visible.
83volk_32f_asin_32f_a_sse4_1(
float* bVector,
const float* aVector,
unsigned int num_points)
// Broadcast constants. 0x1.921fb6p0f is the hex-float literal for pi/2.
85 const __m128 pi_2 = _mm_set1_ps(0x1.921fb6p0f);
86 const __m128 half = _mm_set1_ps(0.5f);
87 const __m128 one = _mm_set1_ps(1.0f);
88 const __m128 two = _mm_set1_ps(2.0f);
// -0.0f has only the sign bit set, so it serves as a per-lane sign-bit mask.
89 const __m128 sign_mask = _mm_set1_ps(-0.0f);
91 unsigned int number = 0;
// Number of full 4-float groups.
92 const unsigned int quarterPoints = num_points / 4;
94 for (; number < quarterPoints; number++) {
// Aligned load of four input floats.
95 __m128 aVal = _mm_load_ps(aVector);
// Split each lane into its sign bit and its magnitude |aVal|.
98 __m128 sign = _mm_and_ps(aVal, sign_mask);
99 __m128 ax = _mm_andnot_ps(sign_mask, aVal);
// t = (1 - |x|) / 2 and s = sqrt(t).
// NOTE(review): presumably s feeds poly_large on a line not shown; confirm.
105 __m128 t = _mm_mul_ps(_mm_sub_ps(one, ax), half);
106 __m128 s = _mm_sqrt_ps(t);
// result_large = pi/2 - 2 * poly_large.
113 __m128 result_large = _mm_sub_ps(pi_2, _mm_mul_ps(two, poly_large));
// Lanes with |x| > 0.5 take result_large; the others take poly_small.
116 __m128 mask = _mm_cmpgt_ps(ax, half);
117 __m128 result = _mm_blendv_ps(poly_small, result_large, mask);
// OR the saved input sign bit back into the result.
120 result = _mm_or_ps(result, sign);
// Aligned store of four output floats.
122 _mm_store_ps(bVector, result);
// quarterPoints * 4 is the element count covered by full 4-float groups;
// the loop below runs over whatever remains up to num_points.
128 number = quarterPoints * 4;
129 for (; number < num_points; number++) {
137#include <immintrin.h>
// Fragment of a 256-bit AVX routine using aligned load/store, eight floats
// per iteration.
// NOTE(review): the function header, the definitions of poly_small and
// poly_large, any pointer advances and the tail-loop body are not visible.
// Broadcast constants. 0x1.921fb6p0f is the hex-float literal for pi/2.
143 const __m256 pi_2 = _mm256_set1_ps(0x1.921fb6p0f);
144 const __m256 half = _mm256_set1_ps(0.5f);
145 const __m256 one = _mm256_set1_ps(1.0f);
146 const __m256 two = _mm256_set1_ps(2.0f);
// -0.0f has only the sign bit set, so it serves as a per-lane sign-bit mask.
147 const __m256 sign_mask = _mm256_set1_ps(-0.0f);
149 unsigned int number = 0;
// Number of full 8-float groups.
150 const unsigned int eighthPoints = num_points / 8;
152 for (; number < eighthPoints; number++) {
// Aligned load of eight input floats.
153 __m256 aVal = _mm256_load_ps(aVector);
// Split each lane into its sign bit and its magnitude |aVal|.
156 __m256 sign = _mm256_and_ps(aVal, sign_mask);
157 __m256 ax = _mm256_andnot_ps(sign_mask, aVal);
// t = (1 - |x|) / 2 and s = sqrt(t).
// NOTE(review): presumably s feeds poly_large on a line not shown; confirm.
160 __m256 t = _mm256_mul_ps(_mm256_sub_ps(one, ax), half);
161 __m256 s = _mm256_sqrt_ps(t);
// result_large = pi/2 - 2 * poly_large.
168 __m256 result_large = _mm256_sub_ps(pi_2, _mm256_mul_ps(two, poly_large));
// Ordered greater-than: lanes with |x| > 0.5 take result_large, all other
// lanes (including NaN, for which the ordered compare is false) take poly_small.
171 __m256 mask = _mm256_cmp_ps(ax, half, _CMP_GT_OS);
172 __m256 result = _mm256_blendv_ps(poly_small, result_large, mask);
// OR the saved input sign bit back into the result.
175 result = _mm256_or_ps(result, sign);
// Aligned store of eight output floats.
177 _mm256_store_ps(bVector, result);
// eighthPoints * 8 is the element count covered by full 8-float groups;
// the loop below runs over whatever remains up to num_points.
183 number = eighthPoints * 8;
184 for (; number < num_points; number++) {
192#include <immintrin.h>
// volk_32f_asin_32f_a_avx2_fma: 256-bit path with aligned load/store that
// uses a fused negate-multiply-add for the large-|x| branch.
//   bVector    - output buffer
//   aVector    - input buffer
//   num_points - number of floats to process
// NOTE(review): this view is incomplete; the definitions of poly_small and
// poly_large, any pointer advances and the tail-loop body are not visible.
195static inline void volk_32f_asin_32f_a_avx2_fma(
float* bVector,
196 const float* aVector,
197 unsigned int num_points)
// Broadcast constants. 0x1.921fb6p0f is the hex-float literal for pi/2.
199 const __m256 pi_2 = _mm256_set1_ps(0x1.921fb6p0f);
200 const __m256 half = _mm256_set1_ps(0.5f);
201 const __m256 one = _mm256_set1_ps(1.0f);
202 const __m256 two = _mm256_set1_ps(2.0f);
// -0.0f has only the sign bit set, so it serves as a per-lane sign-bit mask.
203 const __m256 sign_mask = _mm256_set1_ps(-0.0f);
205 unsigned int number = 0;
// Number of full 8-float groups.
206 const unsigned int eighthPoints = num_points / 8;
208 for (; number < eighthPoints; number++) {
// Aligned load of eight input floats.
209 __m256 aVal = _mm256_load_ps(aVector);
// Split each lane into its sign bit and its magnitude |aVal|.
212 __m256 sign = _mm256_and_ps(aVal, sign_mask);
213 __m256 ax = _mm256_andnot_ps(sign_mask, aVal);
// t = (1 - |x|) / 2 and s = sqrt(t).
// NOTE(review): presumably s feeds poly_large on a line not shown; confirm.
216 __m256 t = _mm256_mul_ps(_mm256_sub_ps(one, ax), half);
217 __m256 s = _mm256_sqrt_ps(t);
// fnmadd(a, b, c) computes -(a * b) + c, i.e. pi/2 - 2 * poly_large.
224 __m256 result_large = _mm256_fnmadd_ps(two, poly_large, pi_2);
// Ordered greater-than: lanes with |x| > 0.5 take result_large, all other
// lanes (including NaN, for which the ordered compare is false) take poly_small.
227 __m256 mask = _mm256_cmp_ps(ax, half, _CMP_GT_OS);
228 __m256 result = _mm256_blendv_ps(poly_small, result_large, mask);
// OR the saved input sign bit back into the result.
231 result = _mm256_or_ps(result, sign);
// Aligned store of eight output floats.
233 _mm256_store_ps(bVector, result);
// eighthPoints * 8 is the element count covered by full 8-float groups;
// the loop below runs over whatever remains up to num_points.
239 number = eighthPoints * 8;
240 for (; number < num_points; number++) {
247#ifdef LV_HAVE_AVX512F
248#include <immintrin.h>
// volk_32f_asin_32f_a_avx512: 512-bit path that reads aVector and writes
// bVector sixteen floats at a time with aligned load/store intrinsics.
//   bVector    - output buffer
//   aVector    - input buffer
//   num_points - number of floats to process
// NOTE(review): this view is incomplete; the definitions of poly_small and
// poly_large, any pointer advances and the tail-loop body are not visible.
252volk_32f_asin_32f_a_avx512(
float* bVector,
const float* aVector,
unsigned int num_points)
// Broadcast constants. 0x1.921fb6p0f is the hex-float literal for pi/2.
254 const __m512 pi_2 = _mm512_set1_ps(0x1.921fb6p0f);
255 const __m512 half = _mm512_set1_ps(0.5f);
256 const __m512 one = _mm512_set1_ps(1.0f);
257 const __m512 two = _mm512_set1_ps(2.0f);
// Sign-bit mask held as 32-bit integers; the bitwise steps below run in the
// integer domain. NOTE(review): presumably because the float-typed logic
// intrinsics need AVX512DQ rather than plain AVX512F; confirm.
258 const __m512i sign_mask = _mm512_set1_epi32(0x80000000);
260 unsigned int number = 0;
// Number of full 16-float groups.
261 const unsigned int sixteenthPoints = num_points / 16;
263 for (; number < sixteenthPoints; number++) {
// Aligned load of sixteen input floats.
264 __m512 aVal = _mm512_load_ps(aVector);
// Reinterpret as integers, then split into sign bit and magnitude |aVal|.
267 __m512i aVal_i = _mm512_castps_si512(aVal);
268 __m512i sign = _mm512_and_epi32(aVal_i, sign_mask);
269 __m512 ax = _mm512_castsi512_ps(_mm512_andnot_epi32(sign_mask, aVal_i));
// t = (1 - |x|) / 2 and s = sqrt(t).
// NOTE(review): presumably s feeds poly_large on a line not shown; confirm.
272 __m512 t = _mm512_mul_ps(_mm512_sub_ps(one, ax), half);
273 __m512 s = _mm512_sqrt_ps(t);
// fnmadd(a, b, c) computes -(a * b) + c, i.e. pi/2 - 2 * poly_large.
280 __m512 result_large = _mm512_fnmadd_ps(two, poly_large, pi_2);
// One mask bit per lane, set where |x| > 0.5; mask_blend takes result_large
// for set bits and poly_small for clear bits.
283 __mmask16 mask = _mm512_cmp_ps_mask(ax, half, _CMP_GT_OS);
284 __m512 result = _mm512_mask_blend_ps(mask, poly_small, result_large);
// OR the saved input sign bit back into the result (via integer casts).
287 result = _mm512_castsi512_ps(_mm512_or_epi32(_mm512_castps_si512(result), sign));
// Aligned store of sixteen output floats.
289 _mm512_store_ps(bVector, result);
// sixteenthPoints * 16 is the element count covered by full 16-float groups;
// the loop below runs over whatever remains up to num_points.
295 number = sixteenthPoints * 16;
296 for (; number < num_points; number++) {
305#ifndef INCLUDED_volk_32f_asin_32f_u_H
306#define INCLUDED_volk_32f_asin_32f_u_H
309#include <smmintrin.h>
// volk_32f_asin_32f_u_sse4_1: 128-bit path that reads aVector and writes
// bVector four floats at a time with unaligned load/store intrinsics.
//   bVector    - output buffer
//   aVector    - input buffer
//   num_points - number of floats to process
// NOTE(review): this view is incomplete; the definitions of poly_small and
// poly_large, any pointer advances and the tail-loop body are not visible.
313volk_32f_asin_32f_u_sse4_1(
float* bVector,
const float* aVector,
unsigned int num_points)
// Broadcast constants. 0x1.921fb6p0f is the hex-float literal for pi/2.
315 const __m128 pi_2 = _mm_set1_ps(0x1.921fb6p0f);
316 const __m128 half = _mm_set1_ps(0.5f);
317 const __m128 one = _mm_set1_ps(1.0f);
318 const __m128 two = _mm_set1_ps(2.0f);
// -0.0f has only the sign bit set, so it serves as a per-lane sign-bit mask.
319 const __m128 sign_mask = _mm_set1_ps(-0.0f);
321 unsigned int number = 0;
// Number of full 4-float groups.
322 const unsigned int quarterPoints = num_points / 4;
324 for (; number < quarterPoints; number++) {
// Unaligned load of four input floats.
325 __m128 aVal = _mm_loadu_ps(aVector);
// Split each lane into its sign bit and its magnitude |aVal|.
327 __m128 sign = _mm_and_ps(aVal, sign_mask);
328 __m128 ax = _mm_andnot_ps(sign_mask, aVal);
// t = (1 - |x|) / 2 and s = sqrt(t).
// NOTE(review): presumably s feeds poly_large on a line not shown; confirm.
330 __m128 t = _mm_mul_ps(_mm_sub_ps(one, ax), half);
331 __m128 s = _mm_sqrt_ps(t);
// result_large = pi/2 - 2 * poly_large.
336 __m128 result_large = _mm_sub_ps(pi_2, _mm_mul_ps(two, poly_large));
// Lanes with |x| > 0.5 take result_large; the others take poly_small.
338 __m128 mask = _mm_cmpgt_ps(ax, half);
339 __m128 result = _mm_blendv_ps(poly_small, result_large, mask);
// OR the saved input sign bit back into the result.
341 result = _mm_or_ps(result, sign);
// Unaligned store of four output floats.
343 _mm_storeu_ps(bVector, result);
// quarterPoints * 4 is the element count covered by full 4-float groups;
// the loop below runs over whatever remains up to num_points.
349 number = quarterPoints * 4;
350 for (; number < num_points; number++) {
358#include <immintrin.h>
// Fragment of a 256-bit AVX routine using unaligned load/store, eight floats
// per iteration.
// NOTE(review): the function header, the definitions of poly_small and
// poly_large, any pointer advances and the tail-loop body are not visible.
// Broadcast constants. 0x1.921fb6p0f is the hex-float literal for pi/2.
364 const __m256 pi_2 = _mm256_set1_ps(0x1.921fb6p0f);
365 const __m256 half = _mm256_set1_ps(0.5f);
366 const __m256 one = _mm256_set1_ps(1.0f);
367 const __m256 two = _mm256_set1_ps(2.0f);
// -0.0f has only the sign bit set, so it serves as a per-lane sign-bit mask.
368 const __m256 sign_mask = _mm256_set1_ps(-0.0f);
370 unsigned int number = 0;
// Number of full 8-float groups.
371 const unsigned int eighthPoints = num_points / 8;
373 for (; number < eighthPoints; number++) {
// Unaligned load of eight input floats.
374 __m256 aVal = _mm256_loadu_ps(aVector);
// Split each lane into its sign bit and its magnitude |aVal|.
376 __m256 sign = _mm256_and_ps(aVal, sign_mask);
377 __m256 ax = _mm256_andnot_ps(sign_mask, aVal);
// t = (1 - |x|) / 2 and s = sqrt(t).
// NOTE(review): presumably s feeds poly_large on a line not shown; confirm.
379 __m256 t = _mm256_mul_ps(_mm256_sub_ps(one, ax), half);
380 __m256 s = _mm256_sqrt_ps(t);
// result_large = pi/2 - 2 * poly_large.
385 __m256 result_large = _mm256_sub_ps(pi_2, _mm256_mul_ps(two, poly_large));
// Ordered greater-than: lanes with |x| > 0.5 take result_large, all other
// lanes (including NaN, for which the ordered compare is false) take poly_small.
387 __m256 mask = _mm256_cmp_ps(ax, half, _CMP_GT_OS);
388 __m256 result = _mm256_blendv_ps(poly_small, result_large, mask);
// OR the saved input sign bit back into the result.
390 result = _mm256_or_ps(result, sign);
// Unaligned store of eight output floats.
392 _mm256_storeu_ps(bVector, result);
// eighthPoints * 8 is the element count covered by full 8-float groups;
// the loop below runs over whatever remains up to num_points.
398 number = eighthPoints * 8;
399 for (; number < num_points; number++) {
407#include <immintrin.h>
// volk_32f_asin_32f_u_avx2_fma: 256-bit path with unaligned load/store that
// uses a fused negate-multiply-add for the large-|x| branch.
//   bVector    - output buffer
//   aVector    - input buffer
//   num_points - number of floats to process
// NOTE(review): this view is incomplete; the definitions of poly_small and
// poly_large, any pointer advances and the tail-loop body are not visible.
410static inline void volk_32f_asin_32f_u_avx2_fma(
float* bVector,
411 const float* aVector,
412 unsigned int num_points)
// Broadcast constants. 0x1.921fb6p0f is the hex-float literal for pi/2.
414 const __m256 pi_2 = _mm256_set1_ps(0x1.921fb6p0f);
415 const __m256 half = _mm256_set1_ps(0.5f);
416 const __m256 one = _mm256_set1_ps(1.0f);
417 const __m256 two = _mm256_set1_ps(2.0f);
// -0.0f has only the sign bit set, so it serves as a per-lane sign-bit mask.
418 const __m256 sign_mask = _mm256_set1_ps(-0.0f);
420 unsigned int number = 0;
// Number of full 8-float groups.
421 const unsigned int eighthPoints = num_points / 8;
423 for (; number < eighthPoints; number++) {
// Unaligned load of eight input floats.
424 __m256 aVal = _mm256_loadu_ps(aVector);
// Split each lane into its sign bit and its magnitude |aVal|.
426 __m256 sign = _mm256_and_ps(aVal, sign_mask);
427 __m256 ax = _mm256_andnot_ps(sign_mask, aVal);
// t = (1 - |x|) / 2 and s = sqrt(t).
// NOTE(review): presumably s feeds poly_large on a line not shown; confirm.
429 __m256 t = _mm256_mul_ps(_mm256_sub_ps(one, ax), half);
430 __m256 s = _mm256_sqrt_ps(t);
// fnmadd(a, b, c) computes -(a * b) + c, i.e. pi/2 - 2 * poly_large.
435 __m256 result_large = _mm256_fnmadd_ps(two, poly_large, pi_2);
// Ordered greater-than: lanes with |x| > 0.5 take result_large, all other
// lanes (including NaN, for which the ordered compare is false) take poly_small.
437 __m256 mask = _mm256_cmp_ps(ax, half, _CMP_GT_OS);
438 __m256 result = _mm256_blendv_ps(poly_small, result_large, mask);
// OR the saved input sign bit back into the result.
440 result = _mm256_or_ps(result, sign);
// Unaligned store of eight output floats.
442 _mm256_storeu_ps(bVector, result);
// eighthPoints * 8 is the element count covered by full 8-float groups;
// the loop below runs over whatever remains up to num_points.
448 number = eighthPoints * 8;
449 for (; number < num_points; number++) {
456#ifdef LV_HAVE_AVX512F
457#include <immintrin.h>
// volk_32f_asin_32f_u_avx512: 512-bit path that reads aVector and writes
// bVector sixteen floats at a time with unaligned load/store intrinsics.
//   bVector    - output buffer
//   aVector    - input buffer
//   num_points - number of floats to process
// NOTE(review): this view is incomplete; the definitions of poly_small and
// poly_large, any pointer advances and the tail-loop body are not visible.
461volk_32f_asin_32f_u_avx512(
float* bVector,
const float* aVector,
unsigned int num_points)
// Broadcast constants. 0x1.921fb6p0f is the hex-float literal for pi/2.
463 const __m512 pi_2 = _mm512_set1_ps(0x1.921fb6p0f);
464 const __m512 half = _mm512_set1_ps(0.5f);
465 const __m512 one = _mm512_set1_ps(1.0f);
466 const __m512 two = _mm512_set1_ps(2.0f);
// Sign-bit mask held as 32-bit integers; the bitwise steps below run in the
// integer domain. NOTE(review): presumably because the float-typed logic
// intrinsics need AVX512DQ rather than plain AVX512F; confirm.
467 const __m512i sign_mask = _mm512_set1_epi32(0x80000000);
469 unsigned int number = 0;
// Number of full 16-float groups.
470 const unsigned int sixteenthPoints = num_points / 16;
472 for (; number < sixteenthPoints; number++) {
// Unaligned load of sixteen input floats.
473 __m512 aVal = _mm512_loadu_ps(aVector);
// Reinterpret as integers, then split into sign bit and magnitude |aVal|.
475 __m512i aVal_i = _mm512_castps_si512(aVal);
476 __m512i sign = _mm512_and_epi32(aVal_i, sign_mask);
477 __m512 ax = _mm512_castsi512_ps(_mm512_andnot_epi32(sign_mask, aVal_i));
// t = (1 - |x|) / 2 and s = sqrt(t).
// NOTE(review): presumably s feeds poly_large on a line not shown; confirm.
479 __m512 t = _mm512_mul_ps(_mm512_sub_ps(one, ax), half);
480 __m512 s = _mm512_sqrt_ps(t);
// fnmadd(a, b, c) computes -(a * b) + c, i.e. pi/2 - 2 * poly_large.
485 __m512 result_large = _mm512_fnmadd_ps(two, poly_large, pi_2);
// One mask bit per lane, set where |x| > 0.5; mask_blend takes result_large
// for set bits and poly_small for clear bits.
487 __mmask16 mask = _mm512_cmp_ps_mask(ax, half, _CMP_GT_OS);
488 __m512 result = _mm512_mask_blend_ps(mask, poly_small, result_large);
// OR the saved input sign bit back into the result (via integer casts).
490 result = _mm512_castsi512_ps(_mm512_or_epi32(_mm512_castps_si512(result), sign));
// Unaligned store of sixteen output floats.
492 _mm512_storeu_ps(bVector, result);
// sixteenthPoints * 16 is the element count covered by full 16-float groups;
// the loop below runs over whatever remains up to num_points.
498 number = sixteenthPoints * 16;
499 for (; number < num_points; number++) {
// Fragment of a NEON routine processing four floats per iteration.
// NOTE(review): the function header, the square-root step, the definitions of
// poly_small and poly_large, any pointer advances and the tail-loop body are
// not visible here.
// Broadcast constants. 0x1.921fb6p0f is the hex-float literal for pi/2.
513 const float32x4_t pi_2 = vdupq_n_f32(0x1.921fb6p0f);
514 const float32x4_t half = vdupq_n_f32(0.5f);
515 const float32x4_t one = vdupq_n_f32(1.0f);
516 const float32x4_t two = vdupq_n_f32(2.0f);
518 unsigned int number = 0;
// Number of full 4-float groups.
519 const unsigned int quarterPoints = num_points / 4;
521 for (; number < quarterPoints; number++) {
// Load four input floats.
522 float32x4_t aVal = vld1q_f32(aVector);
// ax = |aVal|; sign_bits keeps only bit 31 of each lane's bit pattern.
525 float32x4_t ax = vabsq_f32(aVal);
526 uint32x4_t sign_bits =
527 vandq_u32(vreinterpretq_u32_f32(aVal), vdupq_n_u32(0x80000000));
// t = (1 - |x|) / 2.
530 float32x4_t t = vmulq_f32(vsubq_f32(one, ax), half);
// vmlsq_f32(a, b, c) computes a - b * c, i.e. pi/2 - 2 * poly_large.
538 float32x4_t result_large = vmlsq_f32(pi_2, two, poly_large);
// Lanes with |x| > 0.5 take result_large; the others take poly_small.
541 uint32x4_t mask = vcgtq_f32(ax, half);
542 float32x4_t result = vbslq_f32(mask, result_large, poly_small);
// ORs sign_bits into the bit pattern of result.
// NOTE(review): the left-hand side of this statement is on a line not visible
// here; presumably it assigns back to result. Confirm.
546 vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(result), sign_bits));
// Store four output floats.
548 vst1q_f32(bVector, result);
// quarterPoints * 4 is the element count covered by full 4-float groups;
// the loop below runs over whatever remains up to num_points.
554 number = quarterPoints * 4;
555 for (; number < num_points; number++) {
// volk_32f_asin_32f_neonv8: NEON path that reads aVector and writes bVector
// four floats at a time. It calls _varcsinq_f32_neonv8 (defined outside this
// view) on both |x| and sqrt((1 - |x|) / 2), then selects per lane.
//   bVector    - output buffer
//   aVector    - input buffer
//   num_points - number of floats to process
// NOTE(review): this view is incomplete; any pointer advances and the
// tail-loop body are not visible.
567volk_32f_asin_32f_neonv8(
float* bVector,
const float* aVector,
unsigned int num_points)
// Broadcast constants. 0x1.921fb6p0f is the hex-float literal for pi/2.
569 const float32x4_t pi_2 = vdupq_n_f32(0x1.921fb6p0f);
570 const float32x4_t half = vdupq_n_f32(0.5f);
571 const float32x4_t one = vdupq_n_f32(1.0f);
572 const float32x4_t two = vdupq_n_f32(2.0f);
574 unsigned int number = 0;
// Number of full 4-float groups.
575 const unsigned int quarterPoints = num_points / 4;
577 for (; number < quarterPoints; number++) {
// Load four input floats.
578 float32x4_t aVal = vld1q_f32(aVector);
// ax = |aVal|; sign_bits keeps only bit 31 of each lane's bit pattern.
580 float32x4_t ax = vabsq_f32(aVal);
581 uint32x4_t sign_bits =
582 vandq_u32(vreinterpretq_u32_f32(aVal), vdupq_n_u32(0x80000000));
// t = (1 - |x|) / 2 and s = sqrt(t).
584 float32x4_t t = vmulq_f32(vsubq_f32(one, ax), half);
585 float32x4_t s = vsqrtq_f32(t);
// Evaluate the helper on |x| (small branch) and on s (large branch).
587 float32x4_t poly_small = _varcsinq_f32_neonv8(ax);
588 float32x4_t poly_large = _varcsinq_f32_neonv8(s);
// vfmsq_f32(a, b, c) is the fused a - b * c, i.e. pi/2 - 2 * poly_large.
590 float32x4_t result_large = vfmsq_f32(pi_2, two, poly_large);
// Lanes with |x| > 0.5 take result_large; the others take poly_small.
592 uint32x4_t mask = vcgtq_f32(ax, half);
593 float32x4_t result = vbslq_f32(mask, result_large, poly_small);
// ORs sign_bits into the bit pattern of result.
// NOTE(review): the left-hand side of this statement is on a line not visible
// here; presumably it assigns back to result. Confirm.
596 vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(result), sign_bits));
// Store four output floats.
598 vst1q_f32(bVector, result);
// quarterPoints * 4 is the element count covered by full 4-float groups;
// the loop below runs over whatever remains up to num_points.
604 number = quarterPoints * 4;
605 for (; number < num_points; number++) {