Vector Optimized Library of Kernels 3.3.0
Architecture-tuned implementations of math kernels
Loading...
Searching...
No Matches
volk_32f_asin_32f.h
Go to the documentation of this file.
1/* -*- c++ -*- */
2/*
3 * Copyright 2014 Free Software Foundation, Inc.
4 * Copyright 2025 Magnus Lundmark <magnuslundmark@gmail.com>
5 *
6 * This file is part of VOLK
7 *
8 * SPDX-License-Identifier: LGPL-3.0-or-later
9 */
10
57
58#include <inttypes.h>
59#include <math.h>
60#include <stdio.h>
61#include <volk/volk_common.h>
62
63#ifndef INCLUDED_volk_32f_asin_32f_a_H
64#define INCLUDED_volk_32f_asin_32f_a_H
65
66#ifdef LV_HAVE_GENERIC
67
68static inline void
69volk_32f_asin_32f_generic(float* bVector, const float* aVector, unsigned int num_points)
70{
71 for (unsigned int i = 0; i < num_points; i++) {
72 bVector[i] = volk_arcsin(aVector[i]);
73 }
74}
75
76#endif /* LV_HAVE_GENERIC */
77
78#ifdef LV_HAVE_SSE4_1
79#include <smmintrin.h>
81
82static inline void
83volk_32f_asin_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
84{
85 const __m128 pi_2 = _mm_set1_ps(0x1.921fb6p0f);
86 const __m128 half = _mm_set1_ps(0.5f);
87 const __m128 one = _mm_set1_ps(1.0f);
88 const __m128 two = _mm_set1_ps(2.0f);
89 const __m128 sign_mask = _mm_set1_ps(-0.0f);
90
91 unsigned int number = 0;
92 const unsigned int quarterPoints = num_points / 4;
93
94 for (; number < quarterPoints; number++) {
95 __m128 aVal = _mm_load_ps(aVector);
96
97 // Get absolute value and sign
98 __m128 sign = _mm_and_ps(aVal, sign_mask);
99 __m128 ax = _mm_andnot_ps(sign_mask, aVal);
100
101 // Two-range computation
102 // Small: result = arcsin_poly(x)
103 // Large: result = pi/2 - 2*arcsin_poly(sqrt((1-|x|)/2))
104
105 __m128 t = _mm_mul_ps(_mm_sub_ps(one, ax), half);
106 __m128 s = _mm_sqrt_ps(t);
107
108 // Compute polynomial for both ranges
109 __m128 poly_small = _mm_arcsin_poly_sse(ax);
110 __m128 poly_large = _mm_arcsin_poly_sse(s);
111
112 // Large range: pi/2 - 2*poly_large
113 __m128 result_large = _mm_sub_ps(pi_2, _mm_mul_ps(two, poly_large));
114
115 // Blend based on |x| > 0.5
116 __m128 mask = _mm_cmpgt_ps(ax, half);
117 __m128 result = _mm_blendv_ps(poly_small, result_large, mask);
118
119 // Apply sign
120 result = _mm_or_ps(result, sign);
121
122 _mm_store_ps(bVector, result);
123
124 aVector += 4;
125 bVector += 4;
126 }
127
128 number = quarterPoints * 4;
129 for (; number < num_points; number++) {
130 *bVector++ = volk_arcsin(*aVector++);
131 }
132}
133
134#endif /* LV_HAVE_SSE4_1 */
135
136#ifdef LV_HAVE_AVX
137#include <immintrin.h>
139
140static inline void
141volk_32f_asin_32f_a_avx(float* bVector, const float* aVector, unsigned int num_points)
142{
143 const __m256 pi_2 = _mm256_set1_ps(0x1.921fb6p0f);
144 const __m256 half = _mm256_set1_ps(0.5f);
145 const __m256 one = _mm256_set1_ps(1.0f);
146 const __m256 two = _mm256_set1_ps(2.0f);
147 const __m256 sign_mask = _mm256_set1_ps(-0.0f);
148
149 unsigned int number = 0;
150 const unsigned int eighthPoints = num_points / 8;
151
152 for (; number < eighthPoints; number++) {
153 __m256 aVal = _mm256_load_ps(aVector);
154
155 // Get absolute value and sign
156 __m256 sign = _mm256_and_ps(aVal, sign_mask);
157 __m256 ax = _mm256_andnot_ps(sign_mask, aVal);
158
159 // Two-range computation
160 __m256 t = _mm256_mul_ps(_mm256_sub_ps(one, ax), half);
161 __m256 s = _mm256_sqrt_ps(t);
162
163 // Compute polynomial for both ranges
164 __m256 poly_small = _mm256_arcsin_poly_avx(ax);
165 __m256 poly_large = _mm256_arcsin_poly_avx(s);
166
167 // Large range: pi/2 - 2*poly_large
168 __m256 result_large = _mm256_sub_ps(pi_2, _mm256_mul_ps(two, poly_large));
169
170 // Blend based on |x| > 0.5
171 __m256 mask = _mm256_cmp_ps(ax, half, _CMP_GT_OS);
172 __m256 result = _mm256_blendv_ps(poly_small, result_large, mask);
173
174 // Apply sign
175 result = _mm256_or_ps(result, sign);
176
177 _mm256_store_ps(bVector, result);
178
179 aVector += 8;
180 bVector += 8;
181 }
182
183 number = eighthPoints * 8;
184 for (; number < num_points; number++) {
185 *bVector++ = volk_arcsin(*aVector++);
186 }
187}
188
189#endif /* LV_HAVE_AVX */
190
191#ifdef LV_HAVE_AVX2
192#include <immintrin.h>
194
195static inline void volk_32f_asin_32f_a_avx2_fma(float* bVector,
196 const float* aVector,
197 unsigned int num_points)
198{
199 const __m256 pi_2 = _mm256_set1_ps(0x1.921fb6p0f);
200 const __m256 half = _mm256_set1_ps(0.5f);
201 const __m256 one = _mm256_set1_ps(1.0f);
202 const __m256 two = _mm256_set1_ps(2.0f);
203 const __m256 sign_mask = _mm256_set1_ps(-0.0f);
204
205 unsigned int number = 0;
206 const unsigned int eighthPoints = num_points / 8;
207
208 for (; number < eighthPoints; number++) {
209 __m256 aVal = _mm256_load_ps(aVector);
210
211 // Get absolute value and sign
212 __m256 sign = _mm256_and_ps(aVal, sign_mask);
213 __m256 ax = _mm256_andnot_ps(sign_mask, aVal);
214
215 // Two-range computation
216 __m256 t = _mm256_mul_ps(_mm256_sub_ps(one, ax), half);
217 __m256 s = _mm256_sqrt_ps(t);
218
219 // Compute polynomial for both ranges
220 __m256 poly_small = _mm256_arcsin_poly_avx2_fma(ax);
221 __m256 poly_large = _mm256_arcsin_poly_avx2_fma(s);
222
223 // Large range: pi/2 - 2*poly_large
224 __m256 result_large = _mm256_fnmadd_ps(two, poly_large, pi_2);
225
226 // Blend based on |x| > 0.5
227 __m256 mask = _mm256_cmp_ps(ax, half, _CMP_GT_OS);
228 __m256 result = _mm256_blendv_ps(poly_small, result_large, mask);
229
230 // Apply sign
231 result = _mm256_or_ps(result, sign);
232
233 _mm256_store_ps(bVector, result);
234
235 aVector += 8;
236 bVector += 8;
237 }
238
239 number = eighthPoints * 8;
240 for (; number < num_points; number++) {
241 *bVector++ = volk_arcsin(*aVector++);
242 }
243}
244
245#endif /* LV_HAVE_AVX2 */
246
247#ifdef LV_HAVE_AVX512F
248#include <immintrin.h>
250
251static inline void
252volk_32f_asin_32f_a_avx512(float* bVector, const float* aVector, unsigned int num_points)
253{
254 const __m512 pi_2 = _mm512_set1_ps(0x1.921fb6p0f);
255 const __m512 half = _mm512_set1_ps(0.5f);
256 const __m512 one = _mm512_set1_ps(1.0f);
257 const __m512 two = _mm512_set1_ps(2.0f);
258 const __m512i sign_mask = _mm512_set1_epi32(0x80000000);
259
260 unsigned int number = 0;
261 const unsigned int sixteenthPoints = num_points / 16;
262
263 for (; number < sixteenthPoints; number++) {
264 __m512 aVal = _mm512_load_ps(aVector);
265
266 // Get absolute value and sign using integer ops (AVX512F compatible)
267 __m512i aVal_i = _mm512_castps_si512(aVal);
268 __m512i sign = _mm512_and_epi32(aVal_i, sign_mask);
269 __m512 ax = _mm512_castsi512_ps(_mm512_andnot_epi32(sign_mask, aVal_i));
270
271 // Two-range computation
272 __m512 t = _mm512_mul_ps(_mm512_sub_ps(one, ax), half);
273 __m512 s = _mm512_sqrt_ps(t);
274
275 // Compute polynomial for both ranges
276 __m512 poly_small = _mm512_arcsin_poly_avx512(ax);
277 __m512 poly_large = _mm512_arcsin_poly_avx512(s);
278
279 // Large range: pi/2 - 2*poly_large
280 __m512 result_large = _mm512_fnmadd_ps(two, poly_large, pi_2);
281
282 // Blend based on |x| > 0.5
283 __mmask16 mask = _mm512_cmp_ps_mask(ax, half, _CMP_GT_OS);
284 __m512 result = _mm512_mask_blend_ps(mask, poly_small, result_large);
285
286 // Apply sign
287 result = _mm512_castsi512_ps(_mm512_or_epi32(_mm512_castps_si512(result), sign));
288
289 _mm512_store_ps(bVector, result);
290
291 aVector += 16;
292 bVector += 16;
293 }
294
295 number = sixteenthPoints * 16;
296 for (; number < num_points; number++) {
297 *bVector++ = volk_arcsin(*aVector++);
298 }
299}
300
301#endif /* LV_HAVE_AVX512F */
302
303#endif /* INCLUDED_volk_32f_asin_32f_a_H */
304
305#ifndef INCLUDED_volk_32f_asin_32f_u_H
306#define INCLUDED_volk_32f_asin_32f_u_H
307
308#ifdef LV_HAVE_SSE4_1
309#include <smmintrin.h>
311
312static inline void
313volk_32f_asin_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
314{
315 const __m128 pi_2 = _mm_set1_ps(0x1.921fb6p0f);
316 const __m128 half = _mm_set1_ps(0.5f);
317 const __m128 one = _mm_set1_ps(1.0f);
318 const __m128 two = _mm_set1_ps(2.0f);
319 const __m128 sign_mask = _mm_set1_ps(-0.0f);
320
321 unsigned int number = 0;
322 const unsigned int quarterPoints = num_points / 4;
323
324 for (; number < quarterPoints; number++) {
325 __m128 aVal = _mm_loadu_ps(aVector);
326
327 __m128 sign = _mm_and_ps(aVal, sign_mask);
328 __m128 ax = _mm_andnot_ps(sign_mask, aVal);
329
330 __m128 t = _mm_mul_ps(_mm_sub_ps(one, ax), half);
331 __m128 s = _mm_sqrt_ps(t);
332
333 __m128 poly_small = _mm_arcsin_poly_sse(ax);
334 __m128 poly_large = _mm_arcsin_poly_sse(s);
335
336 __m128 result_large = _mm_sub_ps(pi_2, _mm_mul_ps(two, poly_large));
337
338 __m128 mask = _mm_cmpgt_ps(ax, half);
339 __m128 result = _mm_blendv_ps(poly_small, result_large, mask);
340
341 result = _mm_or_ps(result, sign);
342
343 _mm_storeu_ps(bVector, result);
344
345 aVector += 4;
346 bVector += 4;
347 }
348
349 number = quarterPoints * 4;
350 for (; number < num_points; number++) {
351 *bVector++ = volk_arcsin(*aVector++);
352 }
353}
354
355#endif /* LV_HAVE_SSE4_1 */
356
357#ifdef LV_HAVE_AVX
358#include <immintrin.h>
360
361static inline void
362volk_32f_asin_32f_u_avx(float* bVector, const float* aVector, unsigned int num_points)
363{
364 const __m256 pi_2 = _mm256_set1_ps(0x1.921fb6p0f);
365 const __m256 half = _mm256_set1_ps(0.5f);
366 const __m256 one = _mm256_set1_ps(1.0f);
367 const __m256 two = _mm256_set1_ps(2.0f);
368 const __m256 sign_mask = _mm256_set1_ps(-0.0f);
369
370 unsigned int number = 0;
371 const unsigned int eighthPoints = num_points / 8;
372
373 for (; number < eighthPoints; number++) {
374 __m256 aVal = _mm256_loadu_ps(aVector);
375
376 __m256 sign = _mm256_and_ps(aVal, sign_mask);
377 __m256 ax = _mm256_andnot_ps(sign_mask, aVal);
378
379 __m256 t = _mm256_mul_ps(_mm256_sub_ps(one, ax), half);
380 __m256 s = _mm256_sqrt_ps(t);
381
382 __m256 poly_small = _mm256_arcsin_poly_avx(ax);
383 __m256 poly_large = _mm256_arcsin_poly_avx(s);
384
385 __m256 result_large = _mm256_sub_ps(pi_2, _mm256_mul_ps(two, poly_large));
386
387 __m256 mask = _mm256_cmp_ps(ax, half, _CMP_GT_OS);
388 __m256 result = _mm256_blendv_ps(poly_small, result_large, mask);
389
390 result = _mm256_or_ps(result, sign);
391
392 _mm256_storeu_ps(bVector, result);
393
394 aVector += 8;
395 bVector += 8;
396 }
397
398 number = eighthPoints * 8;
399 for (; number < num_points; number++) {
400 *bVector++ = volk_arcsin(*aVector++);
401 }
402}
403
404#endif /* LV_HAVE_AVX */
405
406#ifdef LV_HAVE_AVX2
407#include <immintrin.h>
409
410static inline void volk_32f_asin_32f_u_avx2_fma(float* bVector,
411 const float* aVector,
412 unsigned int num_points)
413{
414 const __m256 pi_2 = _mm256_set1_ps(0x1.921fb6p0f);
415 const __m256 half = _mm256_set1_ps(0.5f);
416 const __m256 one = _mm256_set1_ps(1.0f);
417 const __m256 two = _mm256_set1_ps(2.0f);
418 const __m256 sign_mask = _mm256_set1_ps(-0.0f);
419
420 unsigned int number = 0;
421 const unsigned int eighthPoints = num_points / 8;
422
423 for (; number < eighthPoints; number++) {
424 __m256 aVal = _mm256_loadu_ps(aVector);
425
426 __m256 sign = _mm256_and_ps(aVal, sign_mask);
427 __m256 ax = _mm256_andnot_ps(sign_mask, aVal);
428
429 __m256 t = _mm256_mul_ps(_mm256_sub_ps(one, ax), half);
430 __m256 s = _mm256_sqrt_ps(t);
431
432 __m256 poly_small = _mm256_arcsin_poly_avx2_fma(ax);
433 __m256 poly_large = _mm256_arcsin_poly_avx2_fma(s);
434
435 __m256 result_large = _mm256_fnmadd_ps(two, poly_large, pi_2);
436
437 __m256 mask = _mm256_cmp_ps(ax, half, _CMP_GT_OS);
438 __m256 result = _mm256_blendv_ps(poly_small, result_large, mask);
439
440 result = _mm256_or_ps(result, sign);
441
442 _mm256_storeu_ps(bVector, result);
443
444 aVector += 8;
445 bVector += 8;
446 }
447
448 number = eighthPoints * 8;
449 for (; number < num_points; number++) {
450 *bVector++ = volk_arcsin(*aVector++);
451 }
452}
453
454#endif /* LV_HAVE_AVX2 */
455
456#ifdef LV_HAVE_AVX512F
457#include <immintrin.h>
459
460static inline void
461volk_32f_asin_32f_u_avx512(float* bVector, const float* aVector, unsigned int num_points)
462{
463 const __m512 pi_2 = _mm512_set1_ps(0x1.921fb6p0f);
464 const __m512 half = _mm512_set1_ps(0.5f);
465 const __m512 one = _mm512_set1_ps(1.0f);
466 const __m512 two = _mm512_set1_ps(2.0f);
467 const __m512i sign_mask = _mm512_set1_epi32(0x80000000);
468
469 unsigned int number = 0;
470 const unsigned int sixteenthPoints = num_points / 16;
471
472 for (; number < sixteenthPoints; number++) {
473 __m512 aVal = _mm512_loadu_ps(aVector);
474
475 __m512i aVal_i = _mm512_castps_si512(aVal);
476 __m512i sign = _mm512_and_epi32(aVal_i, sign_mask);
477 __m512 ax = _mm512_castsi512_ps(_mm512_andnot_epi32(sign_mask, aVal_i));
478
479 __m512 t = _mm512_mul_ps(_mm512_sub_ps(one, ax), half);
480 __m512 s = _mm512_sqrt_ps(t);
481
482 __m512 poly_small = _mm512_arcsin_poly_avx512(ax);
483 __m512 poly_large = _mm512_arcsin_poly_avx512(s);
484
485 __m512 result_large = _mm512_fnmadd_ps(two, poly_large, pi_2);
486
487 __mmask16 mask = _mm512_cmp_ps_mask(ax, half, _CMP_GT_OS);
488 __m512 result = _mm512_mask_blend_ps(mask, poly_small, result_large);
489
490 result = _mm512_castsi512_ps(_mm512_or_epi32(_mm512_castps_si512(result), sign));
491
492 _mm512_storeu_ps(bVector, result);
493
494 aVector += 16;
495 bVector += 16;
496 }
497
498 number = sixteenthPoints * 16;
499 for (; number < num_points; number++) {
500 *bVector++ = volk_arcsin(*aVector++);
501 }
502}
503
504#endif /* LV_HAVE_AVX512F */
505
506#ifdef LV_HAVE_NEON
507#include <arm_neon.h>
509
510static inline void
511volk_32f_asin_32f_neon(float* bVector, const float* aVector, unsigned int num_points)
512{
513 const float32x4_t pi_2 = vdupq_n_f32(0x1.921fb6p0f);
514 const float32x4_t half = vdupq_n_f32(0.5f);
515 const float32x4_t one = vdupq_n_f32(1.0f);
516 const float32x4_t two = vdupq_n_f32(2.0f);
517
518 unsigned int number = 0;
519 const unsigned int quarterPoints = num_points / 4;
520
521 for (; number < quarterPoints; number++) {
522 float32x4_t aVal = vld1q_f32(aVector);
523
524 // Get absolute value and sign
525 float32x4_t ax = vabsq_f32(aVal);
526 uint32x4_t sign_bits =
527 vandq_u32(vreinterpretq_u32_f32(aVal), vdupq_n_u32(0x80000000));
528
529 // Two-range computation
530 float32x4_t t = vmulq_f32(vsubq_f32(one, ax), half);
531 float32x4_t s = _vsqrtq_f32(t);
532
533 // Compute polynomial for both ranges
534 float32x4_t poly_small = _varcsinq_f32(ax);
535 float32x4_t poly_large = _varcsinq_f32(s);
536
537 // Large range: pi/2 - 2*poly_large
538 float32x4_t result_large = vmlsq_f32(pi_2, two, poly_large);
539
540 // Blend based on |x| > 0.5
541 uint32x4_t mask = vcgtq_f32(ax, half);
542 float32x4_t result = vbslq_f32(mask, result_large, poly_small);
543
544 // Apply sign
545 result =
546 vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(result), sign_bits));
547
548 vst1q_f32(bVector, result);
549
550 aVector += 4;
551 bVector += 4;
552 }
553
554 number = quarterPoints * 4;
555 for (; number < num_points; number++) {
556 *bVector++ = volk_arcsin(*aVector++);
557 }
558}
559
560#endif /* LV_HAVE_NEON */
561
562#ifdef LV_HAVE_NEONV8
563#include <arm_neon.h>
565
566static inline void
567volk_32f_asin_32f_neonv8(float* bVector, const float* aVector, unsigned int num_points)
568{
569 const float32x4_t pi_2 = vdupq_n_f32(0x1.921fb6p0f);
570 const float32x4_t half = vdupq_n_f32(0.5f);
571 const float32x4_t one = vdupq_n_f32(1.0f);
572 const float32x4_t two = vdupq_n_f32(2.0f);
573
574 unsigned int number = 0;
575 const unsigned int quarterPoints = num_points / 4;
576
577 for (; number < quarterPoints; number++) {
578 float32x4_t aVal = vld1q_f32(aVector);
579
580 float32x4_t ax = vabsq_f32(aVal);
581 uint32x4_t sign_bits =
582 vandq_u32(vreinterpretq_u32_f32(aVal), vdupq_n_u32(0x80000000));
583
584 float32x4_t t = vmulq_f32(vsubq_f32(one, ax), half);
585 float32x4_t s = vsqrtq_f32(t);
586
587 float32x4_t poly_small = _varcsinq_f32_neonv8(ax);
588 float32x4_t poly_large = _varcsinq_f32_neonv8(s);
589
590 float32x4_t result_large = vfmsq_f32(pi_2, two, poly_large);
591
592 uint32x4_t mask = vcgtq_f32(ax, half);
593 float32x4_t result = vbslq_f32(mask, result_large, poly_small);
594
595 result =
596 vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(result), sign_bits));
597
598 vst1q_f32(bVector, result);
599
600 aVector += 4;
601 bVector += 4;
602 }
603
604 number = quarterPoints * 4;
605 for (; number < num_points; number++) {
606 *bVector++ = volk_arcsin(*aVector++);
607 }
608}
609
610#endif /* LV_HAVE_NEONV8 */
611
612#endif /* INCLUDED_volk_32f_asin_32f_u_H */