63#ifndef INCLUDED_volk_32f_acos_32f_a_H
64#define INCLUDED_volk_32f_acos_32f_a_H
71 for (
unsigned int i = 0; i < num_points; i++) {
/*
 * SSE4.1, aligned: computes acos(x) for 4 floats per iteration.
 *
 * Scheme (visible in the arithmetic below): acos(x) = pi/2 - asin(x),
 * with a two-range asin on |x|:
 *   |x| <= 0.5 : asin(|x|) evaluated directly (poly_small),
 *   |x| >  0.5 : asin(|x|) = pi/2 - 2*asin(sqrt((1 - |x|)/2)).
 *
 * NOTE(review): this excerpt is missing the return-type line and opening
 * brace, the computations of poly_small/poly_large (presumably a shared
 * arcsin polynomial helper — confirm against the full file), the
 * aVector/bVector pointer advances, the loop's closing brace, and the
 * scalar tail-loop body.
 */
volk_32f_acos_32f_a_sse4_1(
    float* bVector,
    const float* aVector,
    unsigned int num_points)
    /* pi/2 rounded to the nearest single-precision value. */
    const __m128 pi_2 = _mm_set1_ps(0x1.921fb6p0f);
    const __m128 half = _mm_set1_ps(0.5f);
    const __m128 one = _mm_set1_ps(1.0f);
    const __m128 two = _mm_set1_ps(2.0f);
    /* -0.0f has only the IEEE-754 sign bit set; used to split sign/magnitude. */
    const __m128 sign_mask = _mm_set1_ps(-0.0f);

    unsigned int number = 0;
    const unsigned int quarterPoints = num_points / 4;

    for (; number < quarterPoints; number++) {
        __m128 aVal = _mm_load_ps(aVector);

        /* Extract the sign bit and the absolute value of the input. */
        __m128 sign = _mm_and_ps(aVal, sign_mask);
        __m128 ax = _mm_andnot_ps(sign_mask, aVal);

        /* Range reduction for |x| > 0.5: t = (1 - |x|)/2, s = sqrt(t). */
        __m128 t = _mm_mul_ps(_mm_sub_ps(one, ax), half);
        __m128 s = _mm_sqrt_ps(t);

        /* NOTE(review): poly_small = asin-approx(ax) and
         * poly_large = asin-approx(s) are computed on lines not visible here. */
        /* Large-range identity: asin(|x|) = pi/2 - 2*asin(s). */
        __m128 asin_large = _mm_sub_ps(pi_2, _mm_mul_ps(two, poly_large));

        /* Select the large-argument path where |x| > 0.5. */
        __m128 mask = _mm_cmpgt_ps(ax, half);
        __m128 asin_result = _mm_blendv_ps(poly_small, asin_large, mask);

        /* asin is odd: OR the original sign bit back in. */
        asin_result = _mm_or_ps(asin_result, sign);

        /* Final step: acos(x) = pi/2 - asin(x). */
        __m128 result = _mm_sub_ps(pi_2, asin_result);

        _mm_store_ps(bVector, result);
        /* NOTE(review): aVector/bVector advances and the loop's closing
         * brace are not visible in this excerpt. */

    /* Scalar tail for the remaining (num_points % 4) elements;
     * the loop body is not visible in this excerpt. */
    number = quarterPoints * 4;
    for (; number < num_points; number++) {
#include <immintrin.h>

/*
 * AVX, aligned: acos(x) = pi/2 - asin(x), 8 floats per iteration, using
 * the same two-range asin reduction as the SSE4.1 path.
 *
 * NOTE(review): the function signature (presumably
 * volk_32f_acos_32f_a_avx) is not visible in this excerpt; nor are the
 * poly_small/poly_large computations, pointer advances, the loop's
 * closing brace, or the scalar tail-loop body — confirm against the
 * full file.
 */
    /* pi/2 rounded to the nearest single-precision value. */
    const __m256 pi_2 = _mm256_set1_ps(0x1.921fb6p0f);
    const __m256 half = _mm256_set1_ps(0.5f);
    const __m256 one = _mm256_set1_ps(1.0f);
    const __m256 two = _mm256_set1_ps(2.0f);
    /* -0.0f: only the sign bit set; splits sign from magnitude. */
    const __m256 sign_mask = _mm256_set1_ps(-0.0f);

    unsigned int number = 0;
    const unsigned int eighthPoints = num_points / 8;

    for (; number < eighthPoints; number++) {
        __m256 aVal = _mm256_load_ps(aVector);

        /* Extract sign bit and absolute value. */
        __m256 sign = _mm256_and_ps(aVal, sign_mask);
        __m256 ax = _mm256_andnot_ps(sign_mask, aVal);

        /* Range reduction: t = (1 - |x|)/2, s = sqrt(t). */
        __m256 t = _mm256_mul_ps(_mm256_sub_ps(one, ax), half);
        __m256 s = _mm256_sqrt_ps(t);

        /* NOTE(review): poly_small/poly_large computations are on lines
         * not visible in this excerpt. */
        /* asin(|x|) = pi/2 - 2*asin(s) for the large range. */
        __m256 asin_large = _mm256_sub_ps(pi_2, _mm256_mul_ps(two, poly_large));

        /* Ordered, signaling greater-than; picks the large path for |x| > 0.5. */
        __m256 mask = _mm256_cmp_ps(ax, half, _CMP_GT_OS);
        __m256 asin_result = _mm256_blendv_ps(poly_small, asin_large, mask);

        /* Restore the input's sign (asin is odd). */
        asin_result = _mm256_or_ps(asin_result, sign);

        /* acos(x) = pi/2 - asin(x). */
        __m256 result = _mm256_sub_ps(pi_2, asin_result);

        _mm256_store_ps(bVector, result);
        /* NOTE(review): pointer advances and the loop's closing brace are
         * not visible in this excerpt. */

    /* Scalar tail; body not visible in this excerpt. */
    number = eighthPoints * 8;
    for (; number < num_points; number++) {
#include <immintrin.h>

/*
 * AVX2+FMA, aligned: acos(x) = pi/2 - asin(x), 8 floats per iteration.
 * Identical scheme to the plain AVX path, except the large-range
 * identity uses a fused negative multiply-add.
 *
 * NOTE(review): the opening brace, poly_small/poly_large computations,
 * pointer advances, loop closing brace, and tail-loop body are not
 * visible in this excerpt — confirm against the full file.
 */
static inline void volk_32f_acos_32f_a_avx2_fma(
    float* bVector,
    const float* aVector,
    unsigned int num_points)
    /* pi/2 rounded to the nearest single-precision value. */
    const __m256 pi_2 = _mm256_set1_ps(0x1.921fb6p0f);
    const __m256 half = _mm256_set1_ps(0.5f);
    const __m256 one = _mm256_set1_ps(1.0f);
    const __m256 two = _mm256_set1_ps(2.0f);
    /* -0.0f: only the sign bit set. */
    const __m256 sign_mask = _mm256_set1_ps(-0.0f);

    unsigned int number = 0;
    const unsigned int eighthPoints = num_points / 8;

    for (; number < eighthPoints; number++) {
        __m256 aVal = _mm256_load_ps(aVector);

        /* Split sign bit and absolute value. */
        __m256 sign = _mm256_and_ps(aVal, sign_mask);
        __m256 ax = _mm256_andnot_ps(sign_mask, aVal);

        /* Range reduction: t = (1 - |x|)/2, s = sqrt(t). */
        __m256 t = _mm256_mul_ps(_mm256_sub_ps(one, ax), half);
        __m256 s = _mm256_sqrt_ps(t);

        /* NOTE(review): poly_small/poly_large are computed on lines not
         * visible in this excerpt. */
        /* fnmadd: pi/2 - 2*poly_large, fused (single rounding). */
        __m256 asin_large = _mm256_fnmadd_ps(two, poly_large, pi_2);

        /* Choose the large-argument path where |x| > 0.5. */
        __m256 mask = _mm256_cmp_ps(ax, half, _CMP_GT_OS);
        __m256 asin_result = _mm256_blendv_ps(poly_small, asin_large, mask);

        /* Restore the original sign (asin is odd). */
        asin_result = _mm256_or_ps(asin_result, sign);

        /* acos(x) = pi/2 - asin(x). */
        __m256 result = _mm256_sub_ps(pi_2, asin_result);

        _mm256_store_ps(bVector, result);
        /* NOTE(review): pointer advances and the loop's closing brace are
         * not visible in this excerpt. */

    /* Scalar tail; body not visible in this excerpt. */
    number = eighthPoints * 8;
    for (; number < num_points; number++) {
#ifdef LV_HAVE_AVX512F
#include <immintrin.h>

/*
 * AVX-512F, aligned: acos(x) = pi/2 - asin(x), 16 floats per iteration.
 * Unlike the SSE/AVX paths, sign handling is done with integer ops and
 * branching uses a k-mask blend.
 *
 * NOTE(review): the return-type line, opening brace, the
 * poly_small/poly_large computations, the left-hand side of the
 * sign-restore assignment, pointer advances, loop closing brace, and
 * tail-loop body are not visible in this excerpt — confirm against the
 * full file.
 */
volk_32f_acos_32f_a_avx512(
    float* bVector,
    const float* aVector,
    unsigned int num_points)
    /* pi/2 rounded to the nearest single-precision value. */
    const __m512 pi_2 = _mm512_set1_ps(0x1.921fb6p0f);
    const __m512 half = _mm512_set1_ps(0.5f);
    const __m512 one = _mm512_set1_ps(1.0f);
    const __m512 two = _mm512_set1_ps(2.0f);
    /* IEEE-754 sign bit as an integer mask. */
    const __m512i sign_mask = _mm512_set1_epi32(0x80000000);

    unsigned int number = 0;
    const unsigned int sixteenthPoints = num_points / 16;

    for (; number < sixteenthPoints; number++) {
        __m512 aVal = _mm512_load_ps(aVector);

        /* Sign/magnitude split via integer ops; andnot(a,b) = ~a & b
         * clears the sign bit. */
        __m512i aVal_i = _mm512_castps_si512(aVal);
        __m512i sign = _mm512_and_epi32(aVal_i, sign_mask);
        __m512 ax = _mm512_castsi512_ps(_mm512_andnot_epi32(sign_mask, aVal_i));

        /* Range reduction: t = (1 - |x|)/2, s = sqrt(t). */
        __m512 t = _mm512_mul_ps(_mm512_sub_ps(one, ax), half);
        __m512 s = _mm512_sqrt_ps(t);

        /* NOTE(review): poly_small/poly_large are computed on lines not
         * visible in this excerpt. */
        /* fnmadd: pi/2 - 2*poly_large, fused. */
        __m512 asin_large = _mm512_fnmadd_ps(two, poly_large, pi_2);

        /* k-mask blend: asin_large where |x| > 0.5, else poly_small. */
        __mmask16 mask = _mm512_cmp_ps_mask(ax, half, _CMP_GT_OS);
        __m512 asin_result = _mm512_mask_blend_ps(mask, poly_small, asin_large);

        /* Restore the sign bit (asin is odd).
         * NOTE(review): the `asin_result =` left-hand side of this
         * statement is on a line not visible in this excerpt. */
            _mm512_castsi512_ps(_mm512_or_epi32(_mm512_castps_si512(asin_result), sign));

        /* acos(x) = pi/2 - asin(x). */
        __m512 result = _mm512_sub_ps(pi_2, asin_result);

        _mm512_store_ps(bVector, result);
        /* NOTE(review): pointer advances and the loop's closing brace are
         * not visible in this excerpt. */

    /* Scalar tail; body not visible in this excerpt. */
    number = sixteenthPoints * 16;
    for (; number < num_points; number++) {
292#ifndef INCLUDED_volk_32f_acos_32f_u_H
293#define INCLUDED_volk_32f_acos_32f_u_H
#include <smmintrin.h>

/*
 * SSE4.1, unaligned: identical algorithm to the aligned SSE4.1 path
 * (acos(x) = pi/2 - asin(x), two-range asin), but with unaligned
 * loads/stores.
 *
 * NOTE(review): the return-type line, opening brace, the
 * poly_small/poly_large computations, pointer advances, loop closing
 * brace, and tail-loop body are not visible in this excerpt.
 */
volk_32f_acos_32f_u_sse4_1(
    float* bVector,
    const float* aVector,
    unsigned int num_points)
    /* pi/2 rounded to the nearest single-precision value. */
    const __m128 pi_2 = _mm_set1_ps(0x1.921fb6p0f);
    const __m128 half = _mm_set1_ps(0.5f);
    const __m128 one = _mm_set1_ps(1.0f);
    const __m128 two = _mm_set1_ps(2.0f);
    /* -0.0f: only the sign bit set. */
    const __m128 sign_mask = _mm_set1_ps(-0.0f);

    unsigned int number = 0;
    const unsigned int quarterPoints = num_points / 4;

    for (; number < quarterPoints; number++) {
        __m128 aVal = _mm_loadu_ps(aVector);

        /* Split sign bit and absolute value. */
        __m128 sign = _mm_and_ps(aVal, sign_mask);
        __m128 ax = _mm_andnot_ps(sign_mask, aVal);

        /* Range reduction: t = (1 - |x|)/2, s = sqrt(t). */
        __m128 t = _mm_mul_ps(_mm_sub_ps(one, ax), half);
        __m128 s = _mm_sqrt_ps(t);

        /* NOTE(review): poly_small/poly_large are computed on lines not
         * visible in this excerpt. */
        /* asin(|x|) = pi/2 - 2*asin(s) for |x| > 0.5. */
        __m128 asin_large = _mm_sub_ps(pi_2, _mm_mul_ps(two, poly_large));

        /* Choose per-lane between the two ranges. */
        __m128 mask = _mm_cmpgt_ps(ax, half);
        __m128 asin_result = _mm_blendv_ps(poly_small, asin_large, mask);

        /* Restore the original sign (asin is odd). */
        asin_result = _mm_or_ps(asin_result, sign);

        /* acos(x) = pi/2 - asin(x). */
        __m128 result = _mm_sub_ps(pi_2, asin_result);

        _mm_storeu_ps(bVector, result);
        /* NOTE(review): pointer advances and the loop's closing brace are
         * not visible in this excerpt. */

    /* Scalar tail; body not visible in this excerpt. */
    number = quarterPoints * 4;
    for (; number < num_points; number++) {
#include <immintrin.h>

/*
 * AVX, unaligned: same algorithm as the aligned AVX path but with
 * unaligned loads/stores, 8 floats per iteration.
 *
 * NOTE(review): the function signature (presumably
 * volk_32f_acos_32f_u_avx) is not visible in this excerpt; nor are the
 * poly_small/poly_large computations, pointer advances, loop closing
 * brace, or tail-loop body — confirm against the full file.
 */
    /* pi/2 rounded to the nearest single-precision value. */
    const __m256 pi_2 = _mm256_set1_ps(0x1.921fb6p0f);
    const __m256 half = _mm256_set1_ps(0.5f);
    const __m256 one = _mm256_set1_ps(1.0f);
    const __m256 two = _mm256_set1_ps(2.0f);
    /* -0.0f: only the sign bit set. */
    const __m256 sign_mask = _mm256_set1_ps(-0.0f);

    unsigned int number = 0;
    const unsigned int eighthPoints = num_points / 8;

    for (; number < eighthPoints; number++) {
        __m256 aVal = _mm256_loadu_ps(aVector);

        /* Split sign bit and absolute value. */
        __m256 sign = _mm256_and_ps(aVal, sign_mask);
        __m256 ax = _mm256_andnot_ps(sign_mask, aVal);

        /* Range reduction: t = (1 - |x|)/2, s = sqrt(t). */
        __m256 t = _mm256_mul_ps(_mm256_sub_ps(one, ax), half);
        __m256 s = _mm256_sqrt_ps(t);

        /* NOTE(review): poly_small/poly_large are computed on lines not
         * visible in this excerpt. */
        /* asin(|x|) = pi/2 - 2*asin(s) for the large range. */
        __m256 asin_large = _mm256_sub_ps(pi_2, _mm256_mul_ps(two, poly_large));

        /* Large path selected where |x| > 0.5. */
        __m256 mask = _mm256_cmp_ps(ax, half, _CMP_GT_OS);
        __m256 asin_result = _mm256_blendv_ps(poly_small, asin_large, mask);

        /* Restore the original sign (asin is odd). */
        asin_result = _mm256_or_ps(asin_result, sign);

        /* acos(x) = pi/2 - asin(x). */
        __m256 result = _mm256_sub_ps(pi_2, asin_result);

        _mm256_storeu_ps(bVector, result);
        /* NOTE(review): pointer advances and the loop's closing brace are
         * not visible in this excerpt. */

    /* Scalar tail; body not visible in this excerpt. */
    number = eighthPoints * 8;
    for (; number < num_points; number++) {
#include <immintrin.h>

/*
 * AVX2+FMA, unaligned: same algorithm as the aligned AVX2+FMA path but
 * with unaligned loads/stores, 8 floats per iteration.
 *
 * NOTE(review): the opening brace, poly_small/poly_large computations,
 * pointer advances, loop closing brace, and tail-loop body are not
 * visible in this excerpt.
 */
static inline void volk_32f_acos_32f_u_avx2_fma(
    float* bVector,
    const float* aVector,
    unsigned int num_points)
    /* pi/2 rounded to the nearest single-precision value. */
    const __m256 pi_2 = _mm256_set1_ps(0x1.921fb6p0f);
    const __m256 half = _mm256_set1_ps(0.5f);
    const __m256 one = _mm256_set1_ps(1.0f);
    const __m256 two = _mm256_set1_ps(2.0f);
    /* -0.0f: only the sign bit set. */
    const __m256 sign_mask = _mm256_set1_ps(-0.0f);

    unsigned int number = 0;
    const unsigned int eighthPoints = num_points / 8;

    for (; number < eighthPoints; number++) {
        __m256 aVal = _mm256_loadu_ps(aVector);

        /* Split sign bit and absolute value. */
        __m256 sign = _mm256_and_ps(aVal, sign_mask);
        __m256 ax = _mm256_andnot_ps(sign_mask, aVal);

        /* Range reduction: t = (1 - |x|)/2, s = sqrt(t). */
        __m256 t = _mm256_mul_ps(_mm256_sub_ps(one, ax), half);
        __m256 s = _mm256_sqrt_ps(t);

        /* NOTE(review): poly_small/poly_large are computed on lines not
         * visible in this excerpt. */
        /* fnmadd: pi/2 - 2*poly_large, fused (single rounding). */
        __m256 asin_large = _mm256_fnmadd_ps(two, poly_large, pi_2);

        /* Large path selected where |x| > 0.5. */
        __m256 mask = _mm256_cmp_ps(ax, half, _CMP_GT_OS);
        __m256 asin_result = _mm256_blendv_ps(poly_small, asin_large, mask);

        /* Restore the original sign (asin is odd). */
        asin_result = _mm256_or_ps(asin_result, sign);

        /* acos(x) = pi/2 - asin(x). */
        __m256 result = _mm256_sub_ps(pi_2, asin_result);

        _mm256_storeu_ps(bVector, result);
        /* NOTE(review): pointer advances and the loop's closing brace are
         * not visible in this excerpt. */

    /* Scalar tail; body not visible in this excerpt. */
    number = eighthPoints * 8;
    for (; number < num_points; number++) {
#ifdef LV_HAVE_AVX512F
#include <immintrin.h>

/*
 * AVX-512F, unaligned: same algorithm as the aligned AVX-512 path but
 * with unaligned loads/stores, 16 floats per iteration. Sign handling
 * uses integer ops; the range select uses a k-mask blend.
 *
 * NOTE(review): the return-type line, opening brace, the
 * poly_small/poly_large computations, the left-hand side of the
 * sign-restore assignment, pointer advances, loop closing brace, and
 * tail-loop body are not visible in this excerpt.
 */
volk_32f_acos_32f_u_avx512(
    float* bVector,
    const float* aVector,
    unsigned int num_points)
    /* pi/2 rounded to the nearest single-precision value. */
    const __m512 pi_2 = _mm512_set1_ps(0x1.921fb6p0f);
    const __m512 half = _mm512_set1_ps(0.5f);
    const __m512 one = _mm512_set1_ps(1.0f);
    const __m512 two = _mm512_set1_ps(2.0f);
    /* IEEE-754 sign bit as an integer mask. */
    const __m512i sign_mask = _mm512_set1_epi32(0x80000000);

    unsigned int number = 0;
    const unsigned int sixteenthPoints = num_points / 16;

    for (; number < sixteenthPoints; number++) {
        __m512 aVal = _mm512_loadu_ps(aVector);

        /* Sign/magnitude split via integer ops; andnot clears the sign bit. */
        __m512i aVal_i = _mm512_castps_si512(aVal);
        __m512i sign = _mm512_and_epi32(aVal_i, sign_mask);
        __m512 ax = _mm512_castsi512_ps(_mm512_andnot_epi32(sign_mask, aVal_i));

        /* Range reduction: t = (1 - |x|)/2, s = sqrt(t). */
        __m512 t = _mm512_mul_ps(_mm512_sub_ps(one, ax), half);
        __m512 s = _mm512_sqrt_ps(t);

        /* NOTE(review): poly_small/poly_large are computed on lines not
         * visible in this excerpt. */
        /* fnmadd: pi/2 - 2*poly_large, fused. */
        __m512 asin_large = _mm512_fnmadd_ps(two, poly_large, pi_2);

        /* k-mask blend: asin_large where |x| > 0.5, else poly_small. */
        __mmask16 mask = _mm512_cmp_ps_mask(ax, half, _CMP_GT_OS);
        __m512 asin_result = _mm512_mask_blend_ps(mask, poly_small, asin_large);

        /* Restore the sign bit (asin is odd).
         * NOTE(review): the `asin_result =` left-hand side of this
         * statement is on a line not visible in this excerpt. */
            _mm512_castsi512_ps(_mm512_or_epi32(_mm512_castps_si512(asin_result), sign));

        /* acos(x) = pi/2 - asin(x). */
        __m512 result = _mm512_sub_ps(pi_2, asin_result);

        _mm512_storeu_ps(bVector, result);
        /* NOTE(review): pointer advances and the loop's closing brace are
         * not visible in this excerpt. */

    /* Scalar tail; body not visible in this excerpt. */
    number = sixteenthPoints * 16;
    for (; number < num_points; number++) {
    /* NEON (ARMv7-compatible) body: acos(x) = pi/2 - asin(x), 4 floats
     * per iteration, same two-range asin scheme as the x86 paths.
     *
     * NOTE(review): the function signature (presumably
     * volk_32f_acos_32f_neon) is not visible in this excerpt; nor are
     * the computation of s = sqrt(t), the poly_small/poly_large helper
     * calls, pointer advances, the loop's closing brace, or the
     * tail-loop body — confirm against the full file. */
    /* pi/2 rounded to the nearest single-precision value. */
    const float32x4_t pi_2 = vdupq_n_f32(0x1.921fb6p0f);
    const float32x4_t half = vdupq_n_f32(0.5f);
    const float32x4_t one = vdupq_n_f32(1.0f);
    const float32x4_t two = vdupq_n_f32(2.0f);

    unsigned int number = 0;
    const unsigned int quarterPoints = num_points / 4;

    for (; number < quarterPoints; number++) {
        float32x4_t aVal = vld1q_f32(aVector);

        /* Absolute value, plus the raw sign bits for later restoration. */
        float32x4_t ax = vabsq_f32(aVal);
        uint32x4_t sign_bits =
            vandq_u32(vreinterpretq_u32_f32(aVal), vdupq_n_u32(0x80000000));

        /* Range reduction: t = (1 - |x|)/2. */
        float32x4_t t = vmulq_f32(vsubq_f32(one, ax), half);
        /* NOTE(review): s (sqrt of t) and poly_small/poly_large are
         * computed on lines not visible in this excerpt. */

        /* vmlsq: pi/2 - two*poly_large (non-fused multiply-subtract). */
        float32x4_t asin_large = vmlsq_f32(pi_2, two, poly_large);

        /* Per-lane select: asin_large where |x| > 0.5, else poly_small. */
        uint32x4_t mask = vcgtq_f32(ax, half);
        float32x4_t asin_result = vbslq_f32(mask, asin_large, poly_small);

        /* Restore the original sign (asin is odd). */
        asin_result = vreinterpretq_f32_u32(
            vorrq_u32(vreinterpretq_u32_f32(asin_result), sign_bits));

        /* acos(x) = pi/2 - asin(x). */
        float32x4_t result = vsubq_f32(pi_2, asin_result);

        vst1q_f32(bVector, result);
        /* NOTE(review): pointer advances and the loop's closing brace are
         * not visible in this excerpt. */

    /* Scalar tail; body not visible in this excerpt. */
    number = quarterPoints * 4;
    for (; number < num_points; number++) {
/*
 * NEON for ARMv8/AArch64: acos(x) = pi/2 - asin(x), 4 floats per
 * iteration. Uses the AArch64-only vsqrtq_f32 and fused vfmsq_f32, and
 * the _varcsinq_f32_neonv8 helper (defined elsewhere in this file) for
 * the asin approximation.
 *
 * NOTE(review): the return-type line, opening brace, pointer advances,
 * the loop's closing brace, and the tail-loop body are not visible in
 * this excerpt.
 */
volk_32f_acos_32f_neonv8(
    float* bVector,
    const float* aVector,
    unsigned int num_points)
    /* pi/2 rounded to the nearest single-precision value. */
    const float32x4_t pi_2 = vdupq_n_f32(0x1.921fb6p0f);
    const float32x4_t half = vdupq_n_f32(0.5f);
    const float32x4_t one = vdupq_n_f32(1.0f);
    const float32x4_t two = vdupq_n_f32(2.0f);

    unsigned int number = 0;
    const unsigned int quarterPoints = num_points / 4;

    for (; number < quarterPoints; number++) {
        float32x4_t aVal = vld1q_f32(aVector);

        /* Absolute value, plus the raw sign bits for later restoration. */
        float32x4_t ax = vabsq_f32(aVal);
        uint32x4_t sign_bits =
            vandq_u32(vreinterpretq_u32_f32(aVal), vdupq_n_u32(0x80000000));

        /* Range reduction for |x| > 0.5: t = (1 - |x|)/2, s = sqrt(t). */
        float32x4_t t = vmulq_f32(vsubq_f32(one, ax), half);
        float32x4_t s = vsqrtq_f32(t);

        /* asin approximations of |x| (small range) and s (large range). */
        float32x4_t poly_small = _varcsinq_f32_neonv8(ax);
        float32x4_t poly_large = _varcsinq_f32_neonv8(s);

        /* vfmsq: pi/2 - two*poly_large, fused (single rounding). */
        float32x4_t asin_large = vfmsq_f32(pi_2, two, poly_large);

        /* Per-lane select: asin_large where |x| > 0.5, else poly_small. */
        uint32x4_t mask = vcgtq_f32(ax, half);
        float32x4_t asin_result = vbslq_f32(mask, asin_large, poly_small);

        /* Restore the original sign (asin is odd). */
        asin_result = vreinterpretq_f32_u32(
            vorrq_u32(vreinterpretq_u32_f32(asin_result), sign_bits));

        /* acos(x) = pi/2 - asin(x). */
        float32x4_t result = vsubq_f32(pi_2, asin_result);

        vst1q_f32(bVector, result);
        /* NOTE(review): pointer advances and the loop's closing brace are
         * not visible in this excerpt. */

    /* Scalar tail; body not visible in this excerpt. */
    number = quarterPoints * 4;
    for (; number < num_points; number++) {