Vector Optimized Library of Kernels 3.3.0
Architecture-tuned implementations of math kernels
Loading...
Searching...
No Matches
volk_32f_acos_32f.h
Go to the documentation of this file.
1/* -*- c++ -*- */
2/*
3 * Copyright 2014 Free Software Foundation, Inc.
4 * Copyright 2025 Magnus Lundmark <magnuslundmark@gmail.com>
5 *
6 * This file is part of VOLK
7 *
8 * SPDX-License-Identifier: LGPL-3.0-or-later
9 */
10
57
58#include <inttypes.h>
59#include <math.h>
60#include <stdio.h>
61#include <volk/volk_common.h>
62
63#ifndef INCLUDED_volk_32f_acos_32f_a_H
64#define INCLUDED_volk_32f_acos_32f_a_H
65
66#ifdef LV_HAVE_GENERIC
67
68static inline void
69volk_32f_acos_32f_generic(float* bVector, const float* aVector, unsigned int num_points)
70{
71 for (unsigned int i = 0; i < num_points; i++) {
72 bVector[i] = volk_arccos(aVector[i]);
73 }
74}
75
76#endif /* LV_HAVE_GENERIC */
77
78#ifdef LV_HAVE_SSE4_1
79#include <smmintrin.h>
81
82static inline void
83volk_32f_acos_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
84{
85 const __m128 pi_2 = _mm_set1_ps(0x1.921fb6p0f);
86 const __m128 half = _mm_set1_ps(0.5f);
87 const __m128 one = _mm_set1_ps(1.0f);
88 const __m128 two = _mm_set1_ps(2.0f);
89 const __m128 sign_mask = _mm_set1_ps(-0.0f);
90
91 unsigned int number = 0;
92 const unsigned int quarterPoints = num_points / 4;
93
94 for (; number < quarterPoints; number++) {
95 __m128 aVal = _mm_load_ps(aVector);
96
97 // acos(x) = pi/2 - asin(x)
98 // Get absolute value and sign
99 __m128 sign = _mm_and_ps(aVal, sign_mask);
100 __m128 ax = _mm_andnot_ps(sign_mask, aVal);
101
102 // Two-range asin computation
103 __m128 t = _mm_mul_ps(_mm_sub_ps(one, ax), half);
104 __m128 s = _mm_sqrt_ps(t);
105
106 __m128 poly_small = _mm_arcsin_poly_sse(ax);
107 __m128 poly_large = _mm_arcsin_poly_sse(s);
108
109 __m128 asin_large = _mm_sub_ps(pi_2, _mm_mul_ps(two, poly_large));
110
111 __m128 mask = _mm_cmpgt_ps(ax, half);
112 __m128 asin_result = _mm_blendv_ps(poly_small, asin_large, mask);
113
114 // Apply sign to get asin(x)
115 asin_result = _mm_or_ps(asin_result, sign);
116
117 // acos(x) = pi/2 - asin(x)
118 __m128 result = _mm_sub_ps(pi_2, asin_result);
119
120 _mm_store_ps(bVector, result);
121
122 aVector += 4;
123 bVector += 4;
124 }
125
126 number = quarterPoints * 4;
127 for (; number < num_points; number++) {
128 *bVector++ = volk_arccos(*aVector++);
129 }
130}
131
132#endif /* LV_HAVE_SSE4_1 */
133
134#ifdef LV_HAVE_AVX
135#include <immintrin.h>
137
138static inline void
139volk_32f_acos_32f_a_avx(float* bVector, const float* aVector, unsigned int num_points)
140{
141 const __m256 pi_2 = _mm256_set1_ps(0x1.921fb6p0f);
142 const __m256 half = _mm256_set1_ps(0.5f);
143 const __m256 one = _mm256_set1_ps(1.0f);
144 const __m256 two = _mm256_set1_ps(2.0f);
145 const __m256 sign_mask = _mm256_set1_ps(-0.0f);
146
147 unsigned int number = 0;
148 const unsigned int eighthPoints = num_points / 8;
149
150 for (; number < eighthPoints; number++) {
151 __m256 aVal = _mm256_load_ps(aVector);
152
153 __m256 sign = _mm256_and_ps(aVal, sign_mask);
154 __m256 ax = _mm256_andnot_ps(sign_mask, aVal);
155
156 __m256 t = _mm256_mul_ps(_mm256_sub_ps(one, ax), half);
157 __m256 s = _mm256_sqrt_ps(t);
158
159 __m256 poly_small = _mm256_arcsin_poly_avx(ax);
160 __m256 poly_large = _mm256_arcsin_poly_avx(s);
161
162 __m256 asin_large = _mm256_sub_ps(pi_2, _mm256_mul_ps(two, poly_large));
163
164 __m256 mask = _mm256_cmp_ps(ax, half, _CMP_GT_OS);
165 __m256 asin_result = _mm256_blendv_ps(poly_small, asin_large, mask);
166
167 asin_result = _mm256_or_ps(asin_result, sign);
168
169 __m256 result = _mm256_sub_ps(pi_2, asin_result);
170
171 _mm256_store_ps(bVector, result);
172
173 aVector += 8;
174 bVector += 8;
175 }
176
177 number = eighthPoints * 8;
178 for (; number < num_points; number++) {
179 *bVector++ = volk_arccos(*aVector++);
180 }
181}
182
183#endif /* LV_HAVE_AVX */
184
185#ifdef LV_HAVE_AVX2
186#include <immintrin.h>
188
189static inline void volk_32f_acos_32f_a_avx2_fma(float* bVector,
190 const float* aVector,
191 unsigned int num_points)
192{
193 const __m256 pi_2 = _mm256_set1_ps(0x1.921fb6p0f);
194 const __m256 half = _mm256_set1_ps(0.5f);
195 const __m256 one = _mm256_set1_ps(1.0f);
196 const __m256 two = _mm256_set1_ps(2.0f);
197 const __m256 sign_mask = _mm256_set1_ps(-0.0f);
198
199 unsigned int number = 0;
200 const unsigned int eighthPoints = num_points / 8;
201
202 for (; number < eighthPoints; number++) {
203 __m256 aVal = _mm256_load_ps(aVector);
204
205 __m256 sign = _mm256_and_ps(aVal, sign_mask);
206 __m256 ax = _mm256_andnot_ps(sign_mask, aVal);
207
208 __m256 t = _mm256_mul_ps(_mm256_sub_ps(one, ax), half);
209 __m256 s = _mm256_sqrt_ps(t);
210
211 __m256 poly_small = _mm256_arcsin_poly_avx2_fma(ax);
212 __m256 poly_large = _mm256_arcsin_poly_avx2_fma(s);
213
214 __m256 asin_large = _mm256_fnmadd_ps(two, poly_large, pi_2);
215
216 __m256 mask = _mm256_cmp_ps(ax, half, _CMP_GT_OS);
217 __m256 asin_result = _mm256_blendv_ps(poly_small, asin_large, mask);
218
219 asin_result = _mm256_or_ps(asin_result, sign);
220
221 __m256 result = _mm256_sub_ps(pi_2, asin_result);
222
223 _mm256_store_ps(bVector, result);
224
225 aVector += 8;
226 bVector += 8;
227 }
228
229 number = eighthPoints * 8;
230 for (; number < num_points; number++) {
231 *bVector++ = volk_arccos(*aVector++);
232 }
233}
234
235#endif /* LV_HAVE_AVX2 */
236
237#ifdef LV_HAVE_AVX512F
238#include <immintrin.h>
240
241static inline void
242volk_32f_acos_32f_a_avx512(float* bVector, const float* aVector, unsigned int num_points)
243{
244 const __m512 pi_2 = _mm512_set1_ps(0x1.921fb6p0f);
245 const __m512 half = _mm512_set1_ps(0.5f);
246 const __m512 one = _mm512_set1_ps(1.0f);
247 const __m512 two = _mm512_set1_ps(2.0f);
248 const __m512i sign_mask = _mm512_set1_epi32(0x80000000);
249
250 unsigned int number = 0;
251 const unsigned int sixteenthPoints = num_points / 16;
252
253 for (; number < sixteenthPoints; number++) {
254 __m512 aVal = _mm512_load_ps(aVector);
255
256 __m512i aVal_i = _mm512_castps_si512(aVal);
257 __m512i sign = _mm512_and_epi32(aVal_i, sign_mask);
258 __m512 ax = _mm512_castsi512_ps(_mm512_andnot_epi32(sign_mask, aVal_i));
259
260 __m512 t = _mm512_mul_ps(_mm512_sub_ps(one, ax), half);
261 __m512 s = _mm512_sqrt_ps(t);
262
263 __m512 poly_small = _mm512_arcsin_poly_avx512(ax);
264 __m512 poly_large = _mm512_arcsin_poly_avx512(s);
265
266 __m512 asin_large = _mm512_fnmadd_ps(two, poly_large, pi_2);
267
268 __mmask16 mask = _mm512_cmp_ps_mask(ax, half, _CMP_GT_OS);
269 __m512 asin_result = _mm512_mask_blend_ps(mask, poly_small, asin_large);
270
271 asin_result =
272 _mm512_castsi512_ps(_mm512_or_epi32(_mm512_castps_si512(asin_result), sign));
273
274 __m512 result = _mm512_sub_ps(pi_2, asin_result);
275
276 _mm512_store_ps(bVector, result);
277
278 aVector += 16;
279 bVector += 16;
280 }
281
282 number = sixteenthPoints * 16;
283 for (; number < num_points; number++) {
284 *bVector++ = volk_arccos(*aVector++);
285 }
286}
287
288#endif /* LV_HAVE_AVX512F */
289
290#endif /* INCLUDED_volk_32f_acos_32f_a_H */
291
292#ifndef INCLUDED_volk_32f_acos_32f_u_H
293#define INCLUDED_volk_32f_acos_32f_u_H
294
295#ifdef LV_HAVE_SSE4_1
296#include <smmintrin.h>
298
299static inline void
300volk_32f_acos_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
301{
302 const __m128 pi_2 = _mm_set1_ps(0x1.921fb6p0f);
303 const __m128 half = _mm_set1_ps(0.5f);
304 const __m128 one = _mm_set1_ps(1.0f);
305 const __m128 two = _mm_set1_ps(2.0f);
306 const __m128 sign_mask = _mm_set1_ps(-0.0f);
307
308 unsigned int number = 0;
309 const unsigned int quarterPoints = num_points / 4;
310
311 for (; number < quarterPoints; number++) {
312 __m128 aVal = _mm_loadu_ps(aVector);
313
314 __m128 sign = _mm_and_ps(aVal, sign_mask);
315 __m128 ax = _mm_andnot_ps(sign_mask, aVal);
316
317 __m128 t = _mm_mul_ps(_mm_sub_ps(one, ax), half);
318 __m128 s = _mm_sqrt_ps(t);
319
320 __m128 poly_small = _mm_arcsin_poly_sse(ax);
321 __m128 poly_large = _mm_arcsin_poly_sse(s);
322
323 __m128 asin_large = _mm_sub_ps(pi_2, _mm_mul_ps(two, poly_large));
324
325 __m128 mask = _mm_cmpgt_ps(ax, half);
326 __m128 asin_result = _mm_blendv_ps(poly_small, asin_large, mask);
327
328 asin_result = _mm_or_ps(asin_result, sign);
329
330 __m128 result = _mm_sub_ps(pi_2, asin_result);
331
332 _mm_storeu_ps(bVector, result);
333
334 aVector += 4;
335 bVector += 4;
336 }
337
338 number = quarterPoints * 4;
339 for (; number < num_points; number++) {
340 *bVector++ = volk_arccos(*aVector++);
341 }
342}
343
344#endif /* LV_HAVE_SSE4_1 */
345
346#ifdef LV_HAVE_AVX
347#include <immintrin.h>
349
350static inline void
351volk_32f_acos_32f_u_avx(float* bVector, const float* aVector, unsigned int num_points)
352{
353 const __m256 pi_2 = _mm256_set1_ps(0x1.921fb6p0f);
354 const __m256 half = _mm256_set1_ps(0.5f);
355 const __m256 one = _mm256_set1_ps(1.0f);
356 const __m256 two = _mm256_set1_ps(2.0f);
357 const __m256 sign_mask = _mm256_set1_ps(-0.0f);
358
359 unsigned int number = 0;
360 const unsigned int eighthPoints = num_points / 8;
361
362 for (; number < eighthPoints; number++) {
363 __m256 aVal = _mm256_loadu_ps(aVector);
364
365 __m256 sign = _mm256_and_ps(aVal, sign_mask);
366 __m256 ax = _mm256_andnot_ps(sign_mask, aVal);
367
368 __m256 t = _mm256_mul_ps(_mm256_sub_ps(one, ax), half);
369 __m256 s = _mm256_sqrt_ps(t);
370
371 __m256 poly_small = _mm256_arcsin_poly_avx(ax);
372 __m256 poly_large = _mm256_arcsin_poly_avx(s);
373
374 __m256 asin_large = _mm256_sub_ps(pi_2, _mm256_mul_ps(two, poly_large));
375
376 __m256 mask = _mm256_cmp_ps(ax, half, _CMP_GT_OS);
377 __m256 asin_result = _mm256_blendv_ps(poly_small, asin_large, mask);
378
379 asin_result = _mm256_or_ps(asin_result, sign);
380
381 __m256 result = _mm256_sub_ps(pi_2, asin_result);
382
383 _mm256_storeu_ps(bVector, result);
384
385 aVector += 8;
386 bVector += 8;
387 }
388
389 number = eighthPoints * 8;
390 for (; number < num_points; number++) {
391 *bVector++ = volk_arccos(*aVector++);
392 }
393}
394
395#endif /* LV_HAVE_AVX */
396
397#ifdef LV_HAVE_AVX2
398#include <immintrin.h>
400
401static inline void volk_32f_acos_32f_u_avx2_fma(float* bVector,
402 const float* aVector,
403 unsigned int num_points)
404{
405 const __m256 pi_2 = _mm256_set1_ps(0x1.921fb6p0f);
406 const __m256 half = _mm256_set1_ps(0.5f);
407 const __m256 one = _mm256_set1_ps(1.0f);
408 const __m256 two = _mm256_set1_ps(2.0f);
409 const __m256 sign_mask = _mm256_set1_ps(-0.0f);
410
411 unsigned int number = 0;
412 const unsigned int eighthPoints = num_points / 8;
413
414 for (; number < eighthPoints; number++) {
415 __m256 aVal = _mm256_loadu_ps(aVector);
416
417 __m256 sign = _mm256_and_ps(aVal, sign_mask);
418 __m256 ax = _mm256_andnot_ps(sign_mask, aVal);
419
420 __m256 t = _mm256_mul_ps(_mm256_sub_ps(one, ax), half);
421 __m256 s = _mm256_sqrt_ps(t);
422
423 __m256 poly_small = _mm256_arcsin_poly_avx2_fma(ax);
424 __m256 poly_large = _mm256_arcsin_poly_avx2_fma(s);
425
426 __m256 asin_large = _mm256_fnmadd_ps(two, poly_large, pi_2);
427
428 __m256 mask = _mm256_cmp_ps(ax, half, _CMP_GT_OS);
429 __m256 asin_result = _mm256_blendv_ps(poly_small, asin_large, mask);
430
431 asin_result = _mm256_or_ps(asin_result, sign);
432
433 __m256 result = _mm256_sub_ps(pi_2, asin_result);
434
435 _mm256_storeu_ps(bVector, result);
436
437 aVector += 8;
438 bVector += 8;
439 }
440
441 number = eighthPoints * 8;
442 for (; number < num_points; number++) {
443 *bVector++ = volk_arccos(*aVector++);
444 }
445}
446
447#endif /* LV_HAVE_AVX2 */
448
449#ifdef LV_HAVE_AVX512F
450#include <immintrin.h>
452
453static inline void
454volk_32f_acos_32f_u_avx512(float* bVector, const float* aVector, unsigned int num_points)
455{
456 const __m512 pi_2 = _mm512_set1_ps(0x1.921fb6p0f);
457 const __m512 half = _mm512_set1_ps(0.5f);
458 const __m512 one = _mm512_set1_ps(1.0f);
459 const __m512 two = _mm512_set1_ps(2.0f);
460 const __m512i sign_mask = _mm512_set1_epi32(0x80000000);
461
462 unsigned int number = 0;
463 const unsigned int sixteenthPoints = num_points / 16;
464
465 for (; number < sixteenthPoints; number++) {
466 __m512 aVal = _mm512_loadu_ps(aVector);
467
468 __m512i aVal_i = _mm512_castps_si512(aVal);
469 __m512i sign = _mm512_and_epi32(aVal_i, sign_mask);
470 __m512 ax = _mm512_castsi512_ps(_mm512_andnot_epi32(sign_mask, aVal_i));
471
472 __m512 t = _mm512_mul_ps(_mm512_sub_ps(one, ax), half);
473 __m512 s = _mm512_sqrt_ps(t);
474
475 __m512 poly_small = _mm512_arcsin_poly_avx512(ax);
476 __m512 poly_large = _mm512_arcsin_poly_avx512(s);
477
478 __m512 asin_large = _mm512_fnmadd_ps(two, poly_large, pi_2);
479
480 __mmask16 mask = _mm512_cmp_ps_mask(ax, half, _CMP_GT_OS);
481 __m512 asin_result = _mm512_mask_blend_ps(mask, poly_small, asin_large);
482
483 asin_result =
484 _mm512_castsi512_ps(_mm512_or_epi32(_mm512_castps_si512(asin_result), sign));
485
486 __m512 result = _mm512_sub_ps(pi_2, asin_result);
487
488 _mm512_storeu_ps(bVector, result);
489
490 aVector += 16;
491 bVector += 16;
492 }
493
494 number = sixteenthPoints * 16;
495 for (; number < num_points; number++) {
496 *bVector++ = volk_arccos(*aVector++);
497 }
498}
499
500#endif /* LV_HAVE_AVX512F */
501
502#ifdef LV_HAVE_NEON
503#include <arm_neon.h>
505
506static inline void
507volk_32f_acos_32f_neon(float* bVector, const float* aVector, unsigned int num_points)
508{
509 const float32x4_t pi_2 = vdupq_n_f32(0x1.921fb6p0f);
510 const float32x4_t half = vdupq_n_f32(0.5f);
511 const float32x4_t one = vdupq_n_f32(1.0f);
512 const float32x4_t two = vdupq_n_f32(2.0f);
513
514 unsigned int number = 0;
515 const unsigned int quarterPoints = num_points / 4;
516
517 for (; number < quarterPoints; number++) {
518 float32x4_t aVal = vld1q_f32(aVector);
519
520 float32x4_t ax = vabsq_f32(aVal);
521 uint32x4_t sign_bits =
522 vandq_u32(vreinterpretq_u32_f32(aVal), vdupq_n_u32(0x80000000));
523
524 float32x4_t t = vmulq_f32(vsubq_f32(one, ax), half);
525 float32x4_t s = _vsqrtq_f32(t);
526
527 float32x4_t poly_small = _varcsinq_f32(ax);
528 float32x4_t poly_large = _varcsinq_f32(s);
529
530 float32x4_t asin_large = vmlsq_f32(pi_2, two, poly_large);
531
532 uint32x4_t mask = vcgtq_f32(ax, half);
533 float32x4_t asin_result = vbslq_f32(mask, asin_large, poly_small);
534
535 asin_result = vreinterpretq_f32_u32(
536 vorrq_u32(vreinterpretq_u32_f32(asin_result), sign_bits));
537
538 float32x4_t result = vsubq_f32(pi_2, asin_result);
539
540 vst1q_f32(bVector, result);
541
542 aVector += 4;
543 bVector += 4;
544 }
545
546 number = quarterPoints * 4;
547 for (; number < num_points; number++) {
548 *bVector++ = volk_arccos(*aVector++);
549 }
550}
551
552#endif /* LV_HAVE_NEON */
553
554#ifdef LV_HAVE_NEONV8
555#include <arm_neon.h>
557
558static inline void
559volk_32f_acos_32f_neonv8(float* bVector, const float* aVector, unsigned int num_points)
560{
561 const float32x4_t pi_2 = vdupq_n_f32(0x1.921fb6p0f);
562 const float32x4_t half = vdupq_n_f32(0.5f);
563 const float32x4_t one = vdupq_n_f32(1.0f);
564 const float32x4_t two = vdupq_n_f32(2.0f);
565
566 unsigned int number = 0;
567 const unsigned int quarterPoints = num_points / 4;
568
569 for (; number < quarterPoints; number++) {
570 float32x4_t aVal = vld1q_f32(aVector);
571
572 float32x4_t ax = vabsq_f32(aVal);
573 uint32x4_t sign_bits =
574 vandq_u32(vreinterpretq_u32_f32(aVal), vdupq_n_u32(0x80000000));
575
576 float32x4_t t = vmulq_f32(vsubq_f32(one, ax), half);
577 float32x4_t s = vsqrtq_f32(t);
578
579 float32x4_t poly_small = _varcsinq_f32_neonv8(ax);
580 float32x4_t poly_large = _varcsinq_f32_neonv8(s);
581
582 float32x4_t asin_large = vfmsq_f32(pi_2, two, poly_large);
583
584 uint32x4_t mask = vcgtq_f32(ax, half);
585 float32x4_t asin_result = vbslq_f32(mask, asin_large, poly_small);
586
587 asin_result = vreinterpretq_f32_u32(
588 vorrq_u32(vreinterpretq_u32_f32(asin_result), sign_bits));
589
590 float32x4_t result = vsubq_f32(pi_2, asin_result);
591
592 vst1q_f32(bVector, result);
593
594 aVector += 4;
595 bVector += 4;
596 }
597
598 number = quarterPoints * 4;
599 for (; number < num_points; number++) {
600 *bVector++ = volk_arccos(*aVector++);
601 }
602}
603
604#endif /* LV_HAVE_NEONV8 */
605
606#endif /* INCLUDED_volk_32f_acos_32f_u_H */