Vector Optimized Library of Kernels 3.3.0
Architecture-tuned implementations of math kernels
volk_32f_atan_32f.h
/* -*- c++ -*- */
/*
 * Copyright 2014 Free Software Foundation, Inc.
 * Copyright 2023, 2024 Magnus Lundmark <magnuslundmark@gmail.com>
 *
 * This file is part of VOLK
 *
 * SPDX-License-Identifier: LGPL-3.0-or-later
 */

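/*
 * Overview (illustrative summary; the original Doxygen block is not
 * reproduced here): this kernel computes the arctangent of each float
 * in the input buffer and stores the results in the output buffer.
 * The public dispatcher follows the usual VOLK signature:
 *
 *   void volk_32f_atan_32f(float* out, const float* in, unsigned int num_points);
 *
 * A minimal usage sketch, assuming the standard VOLK allocation API
 * from <volk/volk.h>:
 *
 *   #include <volk/volk.h>
 *
 *   unsigned int N = 1024;
 *   float* in = (float*)volk_malloc(sizeof(float) * N, volk_get_alignment());
 *   float* out = (float*)volk_malloc(sizeof(float) * N, volk_get_alignment());
 *   for (unsigned int i = 0; i < N; i++) {
 *       in[i] = 2.f * ((float)i / (float)N) - 1.f; // values in [-1, 1)
 *   }
 *   volk_32f_atan_32f(out, in, N);
 *   volk_free(in);
 *   volk_free(out);
 */
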
#include <math.h>
#include <volk/volk_common.h> /* assumed home of the scalar volk_arctan() helper */

#ifndef INCLUDED_volk_32f_atan_32f_a_H
#define INCLUDED_volk_32f_atan_32f_a_H

#ifdef LV_HAVE_GENERIC
static inline void
volk_32f_atan_32f_generic(float* out, const float* in, unsigned int num_points)
{
    for (unsigned int number = 0; number < num_points; number++) {
        *out++ = atanf(*in++);
    }
}
#endif /* LV_HAVE_GENERIC */

#ifdef LV_HAVE_GENERIC
static inline void
volk_32f_atan_32f_polynomial(float* out, const float* in, unsigned int num_points)
{
    for (unsigned int number = 0; number < num_points; number++) {
        *out++ = volk_arctan(*in++);
    }
}
#endif /* LV_HAVE_GENERIC */
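
/*
 * All SIMD kernels below share the same range-reduction scheme: a
 * polynomial approximates atan(x) on |x| <= 1, and for |x| > 1 the
 * identity
 *
 *     atan(x) = sign(x) * pi/2 - atan(1/x)
 *
 * maps the argument back into the polynomial's domain. Each kernel
 * therefore (1) builds a lane mask for |x| > 1, (2) replaces those
 * lanes with an (approximate) reciprocal, (3) evaluates the
 * polynomial, and (4) folds the swapped lanes back as
 * sign(x_star) * pi/2 minus the polynomial value.
 */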

#if LV_HAVE_AVX512F && LV_HAVE_AVX512DQ
#include <immintrin.h>
#include <volk/volk_avx512_intrinsics.h> /* _mm512_arctan_poly_avx512() */

static inline void
volk_32f_atan_32f_a_avx512dq(float* out, const float* in, unsigned int num_points)
{
    const __m512 one = _mm512_set1_ps(1.f);
    const __m512 pi_over_2 = _mm512_set1_ps(0x1.921fb6p0f);
    const __m512 abs_mask = _mm512_castsi512_ps(_mm512_set1_epi32(0x7FFFFFFF));
    const __m512 sign_mask = _mm512_castsi512_ps(_mm512_set1_epi32(0x80000000));

    const unsigned int sixteenth_points = num_points / 16;

    for (unsigned int number = 0; number < sixteenth_points; number++) {
        __m512 x = _mm512_load_ps(in);
        in += 16;
        /* swap_mask = |x| > 1 */
        __mmask16 swap_mask =
            _mm512_cmp_ps_mask(_mm512_and_ps(x, abs_mask), one, _CMP_GT_OS);
        /* x_star = swap_mask ? 1/x : x (rcp14 is an approximate reciprocal) */
        __m512 x_star = _mm512_mask_blend_ps(swap_mask, x, _mm512_rcp14_ps(x));
        __m512 result = _mm512_arctan_poly_avx512(x_star);
        /* If swapped: result = sign(x_star) * pi/2 - result */
        __m512 term = _mm512_and_ps(x_star, sign_mask);
        term = _mm512_or_ps(pi_over_2, term);
        term = _mm512_sub_ps(term, result);
        result = _mm512_mask_blend_ps(swap_mask, result, term);
        _mm512_store_ps(out, result);
        out += 16;
    }

    for (unsigned int number = sixteenth_points * 16; number < num_points; number++) {
        *out++ = volk_arctan(*in++);
    }
}
#endif /* LV_HAVE_AVX512F && LV_HAVE_AVX512DQ for aligned */
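
/*
 * The per-ISA polynomial helpers used above and below
 * (_mm512_arctan_poly_avx512 and friends) are defined in VOLK's
 * intrinsics headers. The sketch below is an assumption-based
 * illustration, not the library's exact code: it evaluates the odd
 * degree-13 approximation of atan on [-1, 1] by Horner's method with
 * FMAs, reusing the coefficients that appear verbatim in the RVV
 * kernel at the bottom of this file.
 */
#if 0 /* illustrative sketch only */
static inline __m512 example_arctan_poly_avx512(const __m512 x)
{
    const __m512 c1 = _mm512_set1_ps(+0x1.ffffeap-1f);
    const __m512 c3 = _mm512_set1_ps(-0x1.55437p-2f);
    const __m512 c5 = _mm512_set1_ps(+0x1.972be6p-3f);
    const __m512 c7 = _mm512_set1_ps(-0x1.1436ap-3f);
    const __m512 c9 = _mm512_set1_ps(+0x1.5785aap-4f);
    const __m512 c11 = _mm512_set1_ps(-0x1.2f3004p-5f);
    const __m512 c13 = _mm512_set1_ps(+0x1.01a37cp-7f);

    const __m512 xx = _mm512_mul_ps(x, x); /* x^2 */
    /* Horner: ((((((c13*xx + c11)*xx + c9)*xx + c7)*xx + c5)*xx + c3)*xx + c1) * x */
    __m512 p = c13;
    p = _mm512_fmadd_ps(p, xx, c11);
    p = _mm512_fmadd_ps(p, xx, c9);
    p = _mm512_fmadd_ps(p, xx, c7);
    p = _mm512_fmadd_ps(p, xx, c5);
    p = _mm512_fmadd_ps(p, xx, c3);
    p = _mm512_fmadd_ps(p, xx, c1);
    return _mm512_mul_ps(p, x);
}
#endif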

#if LV_HAVE_AVX2 && LV_HAVE_FMA
#include <immintrin.h>
#include <volk/volk_avx2_fma_intrinsics.h> /* _mm256_arctan_poly_avx2_fma() */

static inline void
volk_32f_atan_32f_a_avx2_fma(float* out, const float* in, unsigned int num_points)
{
    const __m256 one = _mm256_set1_ps(1.f);
    const __m256 pi_over_2 = _mm256_set1_ps(0x1.921fb6p0f);
    const __m256 abs_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF));
    const __m256 sign_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000));

    const unsigned int eighth_points = num_points / 8;

    for (unsigned int number = 0; number < eighth_points; number++) {
        __m256 x = _mm256_load_ps(in);
        in += 8;
        __m256 swap_mask = _mm256_cmp_ps(_mm256_and_ps(x, abs_mask), one, _CMP_GT_OS);
        /* rcp_ps is an approximate reciprocal (relative error <= 1.5 * 2^-12) */
        __m256 x_star = _mm256_blendv_ps(x, _mm256_rcp_ps(x), swap_mask);
        __m256 result = _mm256_arctan_poly_avx2_fma(x_star);
        __m256 term = _mm256_and_ps(x_star, sign_mask);
        term = _mm256_or_ps(pi_over_2, term);
        term = _mm256_sub_ps(term, result);
        result = _mm256_blendv_ps(result, term, swap_mask);
        _mm256_store_ps(out, result);
        out += 8;
    }

    for (unsigned int number = eighth_points * 8; number < num_points; number++) {
        *out++ = volk_arctan(*in++);
    }
}
#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for aligned */

#if LV_HAVE_AVX
#include <immintrin.h>
#include <volk/volk_avx_intrinsics.h> /* _mm256_arctan_poly_avx() */

static inline void
volk_32f_atan_32f_a_avx2(float* out, const float* in, unsigned int num_points)
{
    const __m256 one = _mm256_set1_ps(1.f);
    const __m256 pi_over_2 = _mm256_set1_ps(0x1.921fb6p0f);
    const __m256 abs_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF));
    const __m256 sign_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000));

    const unsigned int eighth_points = num_points / 8;

    for (unsigned int number = 0; number < eighth_points; number++) {
        __m256 x = _mm256_load_ps(in);
        in += 8;
        __m256 swap_mask = _mm256_cmp_ps(_mm256_and_ps(x, abs_mask), one, _CMP_GT_OS);
        __m256 x_star = _mm256_blendv_ps(x, _mm256_rcp_ps(x), swap_mask);
        __m256 result = _mm256_arctan_poly_avx(x_star);
        __m256 term = _mm256_and_ps(x_star, sign_mask);
        term = _mm256_or_ps(pi_over_2, term);
        term = _mm256_sub_ps(term, result);
        result = _mm256_blendv_ps(result, term, swap_mask);
        _mm256_store_ps(out, result);
        out += 8;
    }

    for (unsigned int number = eighth_points * 8; number < num_points; number++) {
        *out++ = volk_arctan(*in++);
    }
}
#endif /* LV_HAVE_AVX for aligned */

#ifdef LV_HAVE_SSE4_1
#include <smmintrin.h>
#include <volk/volk_sse_intrinsics.h> /* _mm_arctan_poly_sse() */

static inline void
volk_32f_atan_32f_a_sse4_1(float* out, const float* in, unsigned int num_points)
{
    const __m128 one = _mm_set1_ps(1.f);
    const __m128 pi_over_2 = _mm_set1_ps(0x1.921fb6p0f);
    const __m128 abs_mask = _mm_castsi128_ps(_mm_set1_epi32(0x7FFFFFFF));
    const __m128 sign_mask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000));

    const unsigned int quarter_points = num_points / 4;

    for (unsigned int number = 0; number < quarter_points; number++) {
        __m128 x = _mm_load_ps(in);
        in += 4;
        __m128 swap_mask = _mm_cmpgt_ps(_mm_and_ps(x, abs_mask), one);
        __m128 x_star = _mm_blendv_ps(x, _mm_rcp_ps(x), swap_mask);
        __m128 result = _mm_arctan_poly_sse(x_star);
        __m128 term = _mm_and_ps(x_star, sign_mask);
        term = _mm_or_ps(pi_over_2, term);
        term = _mm_sub_ps(term, result);
        result = _mm_blendv_ps(result, term, swap_mask);
        _mm_store_ps(out, result);
        out += 4;
    }

    for (unsigned int number = quarter_points * 4; number < num_points; number++) {
        *out++ = volk_arctan(*in++);
    }
}
#endif /* LV_HAVE_SSE4_1 for aligned */

#ifdef LV_HAVE_NEON
#include <arm_neon.h>
#include <volk/volk_neon_intrinsics.h> /* _vinvq_f32(), _varctan_poly_f32() */

static inline void
volk_32f_atan_32f_neon(float* out, const float* in, unsigned int num_points)
{
    const float32x4_t one = vdupq_n_f32(1.f);
    const float32x4_t pi_over_2 = vdupq_n_f32(0x1.921fb6p0f);
    const uint32x4_t sign_mask = vdupq_n_u32(0x80000000);

    const unsigned int quarter_points = num_points / 4;

    for (unsigned int number = 0; number < quarter_points; number++) {
        float32x4_t x = vld1q_f32(in);
        in += 4;

        // swap_mask = |x| > 1
        float32x4_t abs_x = vabsq_f32(x);
        uint32x4_t swap_mask = vcgtq_f32(abs_x, one);

        // x_star = swap_mask ? 1/x : x
        float32x4_t x_star = vbslq_f32(swap_mask, _vinvq_f32(x), x);

        // Compute arctan polynomial
        float32x4_t result = _varctan_poly_f32(x_star);

        // If swapped: result = sign(x_star) * pi/2 - result
        uint32x4_t x_star_sign = vandq_u32(vreinterpretq_u32_f32(x_star), sign_mask);
        float32x4_t term = vreinterpretq_f32_u32(
            vorrq_u32(vreinterpretq_u32_f32(pi_over_2), x_star_sign));
        term = vsubq_f32(term, result);
        result = vbslq_f32(swap_mask, term, result);

        vst1q_f32(out, result);
        out += 4;
    }

    for (unsigned int number = quarter_points * 4; number < num_points; number++) {
        *out++ = volk_arctan(*in++);
    }
}
#endif /* LV_HAVE_NEON */
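
/*
 * _vinvq_f32() comes from VOLK's NEON intrinsics header. A plausible
 * sketch is shown below (an assumption, not VOLK's exact code): NEON's
 * reciprocal estimate refined with Newton-Raphson steps, since
 * vrecpeq_f32 alone is only accurate to roughly 8 bits.
 */
#if 0 /* illustrative sketch only */
static inline float32x4_t example_vinvq_f32(float32x4_t x)
{
    float32x4_t recip = vrecpeq_f32(x);              /* ~8-bit estimate */
    recip = vmulq_f32(vrecpsq_f32(x, recip), recip); /* 1st Newton-Raphson step */
    recip = vmulq_f32(vrecpsq_f32(x, recip), recip); /* 2nd Newton-Raphson step */
    return recip;
}
#endif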

#ifdef LV_HAVE_NEONV8
#include <arm_neon.h>
#include <volk/volk_neon_intrinsics.h> /* _vinvq_f32(); assumed to also provide _varctan_poly_neonv8() */

/* ARMv8 NEON with FMA and 2x unrolling for better instruction-level parallelism */
static inline void
volk_32f_atan_32f_neonv8(float* out, const float* in, unsigned int num_points)
{
    const float32x4_t one = vdupq_n_f32(1.f);
    const float32x4_t pi_over_2 = vdupq_n_f32(0x1.921fb6p0f);
    const uint32x4_t sign_mask = vdupq_n_u32(0x80000000);

    const unsigned int eighth_points = num_points / 8;

    for (unsigned int number = 0; number < eighth_points; number++) {
        /* Load 8 floats */
        float32x4_t x0 = vld1q_f32(in);
        float32x4_t x1 = vld1q_f32(in + 4);
        in += 8;

        /* Process first 4 */
        float32x4_t abs_x0 = vabsq_f32(x0);
        uint32x4_t swap_mask0 = vcgtq_f32(abs_x0, one);
        float32x4_t x_star0 = vbslq_f32(swap_mask0, _vinvq_f32(x0), x0);
        float32x4_t result0 = _varctan_poly_neonv8(x_star0);
        uint32x4_t x_star_sign0 = vandq_u32(vreinterpretq_u32_f32(x_star0), sign_mask);
        float32x4_t term0 = vreinterpretq_f32_u32(
            vorrq_u32(vreinterpretq_u32_f32(pi_over_2), x_star_sign0));
        term0 = vsubq_f32(term0, result0);
        result0 = vbslq_f32(swap_mask0, term0, result0);

        /* Process second 4 */
        float32x4_t abs_x1 = vabsq_f32(x1);
        uint32x4_t swap_mask1 = vcgtq_f32(abs_x1, one);
        float32x4_t x_star1 = vbslq_f32(swap_mask1, _vinvq_f32(x1), x1);
        float32x4_t result1 = _varctan_poly_neonv8(x_star1);
        uint32x4_t x_star_sign1 = vandq_u32(vreinterpretq_u32_f32(x_star1), sign_mask);
        float32x4_t term1 = vreinterpretq_f32_u32(
            vorrq_u32(vreinterpretq_u32_f32(pi_over_2), x_star_sign1));
        term1 = vsubq_f32(term1, result1);
        result1 = vbslq_f32(swap_mask1, term1, result1);

        vst1q_f32(out, result0);
        vst1q_f32(out + 4, result1);
        out += 8;
    }

    for (unsigned int number = eighth_points * 8; number < num_points; number++) {
        *out++ = volk_arctan(*in++);
    }
}
#endif /* LV_HAVE_NEONV8 */

#endif /* INCLUDED_volk_32f_atan_32f_a_H */

#ifndef INCLUDED_volk_32f_atan_32f_u_H
#define INCLUDED_volk_32f_atan_32f_u_H
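
/*
 * The unaligned kernels below mirror the aligned ones above; the only
 * difference is the use of unaligned loads and stores
 * (e.g. _mm256_loadu_ps / _mm256_storeu_ps instead of
 * _mm256_load_ps / _mm256_store_ps).
 */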

#if LV_HAVE_AVX512F && LV_HAVE_AVX512DQ
#include <immintrin.h>
#include <volk/volk_avx512_intrinsics.h> /* _mm512_arctan_poly_avx512() */

static inline void
volk_32f_atan_32f_u_avx512dq(float* out, const float* in, unsigned int num_points)
{
    const __m512 one = _mm512_set1_ps(1.f);
    const __m512 pi_over_2 = _mm512_set1_ps(0x1.921fb6p0f);
    const __m512 abs_mask = _mm512_castsi512_ps(_mm512_set1_epi32(0x7FFFFFFF));
    const __m512 sign_mask = _mm512_castsi512_ps(_mm512_set1_epi32(0x80000000));

    const unsigned int sixteenth_points = num_points / 16;

    for (unsigned int number = 0; number < sixteenth_points; number++) {
        __m512 x = _mm512_loadu_ps(in);
        in += 16;
        __mmask16 swap_mask =
            _mm512_cmp_ps_mask(_mm512_and_ps(x, abs_mask), one, _CMP_GT_OS);
        __m512 x_star = _mm512_mask_blend_ps(swap_mask, x, _mm512_rcp14_ps(x));
        __m512 result = _mm512_arctan_poly_avx512(x_star);
        __m512 term = _mm512_and_ps(x_star, sign_mask);
        term = _mm512_or_ps(pi_over_2, term);
        term = _mm512_sub_ps(term, result);
        result = _mm512_mask_blend_ps(swap_mask, result, term);
        _mm512_storeu_ps(out, result);
        out += 16;
    }

    for (unsigned int number = sixteenth_points * 16; number < num_points; number++) {
        *out++ = volk_arctan(*in++);
    }
}
#endif /* LV_HAVE_AVX512F && LV_HAVE_AVX512DQ for unaligned */

#if LV_HAVE_AVX2 && LV_HAVE_FMA
#include <immintrin.h>

static inline void
volk_32f_atan_32f_u_avx2_fma(float* out, const float* in, unsigned int num_points)
{
    const __m256 one = _mm256_set1_ps(1.f);
    const __m256 pi_over_2 = _mm256_set1_ps(0x1.921fb6p0f);
    const __m256 abs_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF));
    const __m256 sign_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000));

    const unsigned int eighth_points = num_points / 8;

    for (unsigned int number = 0; number < eighth_points; number++) {
        __m256 x = _mm256_loadu_ps(in);
        in += 8;
        __m256 swap_mask = _mm256_cmp_ps(_mm256_and_ps(x, abs_mask), one, _CMP_GT_OS);
        __m256 x_star = _mm256_blendv_ps(x, _mm256_rcp_ps(x), swap_mask);
        __m256 result = _mm256_arctan_poly_avx2_fma(x_star);
        __m256 term = _mm256_and_ps(x_star, sign_mask);
        term = _mm256_or_ps(pi_over_2, term);
        term = _mm256_sub_ps(term, result);
        result = _mm256_blendv_ps(result, term, swap_mask);
        _mm256_storeu_ps(out, result);
        out += 8;
    }

    for (unsigned int number = eighth_points * 8; number < num_points; number++) {
        *out++ = volk_arctan(*in++);
    }
}
#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for unaligned */

#if LV_HAVE_AVX
#include <immintrin.h>

static inline void
volk_32f_atan_32f_u_avx2(float* out, const float* in, unsigned int num_points)
{
    const __m256 one = _mm256_set1_ps(1.f);
    const __m256 pi_over_2 = _mm256_set1_ps(0x1.921fb6p0f);
    const __m256 abs_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF));
    const __m256 sign_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000));

    const unsigned int eighth_points = num_points / 8;

    for (unsigned int number = 0; number < eighth_points; number++) {
        __m256 x = _mm256_loadu_ps(in);
        in += 8;
        __m256 swap_mask = _mm256_cmp_ps(_mm256_and_ps(x, abs_mask), one, _CMP_GT_OS);
        __m256 x_star = _mm256_blendv_ps(x, _mm256_rcp_ps(x), swap_mask);
        __m256 result = _mm256_arctan_poly_avx(x_star);
        __m256 term = _mm256_and_ps(x_star, sign_mask);
        term = _mm256_or_ps(pi_over_2, term);
        term = _mm256_sub_ps(term, result);
        result = _mm256_blendv_ps(result, term, swap_mask);
        _mm256_storeu_ps(out, result);
        out += 8;
    }

    for (unsigned int number = eighth_points * 8; number < num_points; number++) {
        *out++ = volk_arctan(*in++);
    }
}
#endif /* LV_HAVE_AVX for unaligned */

#ifdef LV_HAVE_SSE4_1
#include <smmintrin.h>
#include <volk/volk_sse_intrinsics.h> /* _mm_arctan_poly_sse() */

static inline void
volk_32f_atan_32f_u_sse4_1(float* out, const float* in, unsigned int num_points)
{
    const __m128 one = _mm_set1_ps(1.f);
    const __m128 pi_over_2 = _mm_set1_ps(0x1.921fb6p0f);
    const __m128 abs_mask = _mm_castsi128_ps(_mm_set1_epi32(0x7FFFFFFF));
    const __m128 sign_mask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000));

    const unsigned int quarter_points = num_points / 4;

    for (unsigned int number = 0; number < quarter_points; number++) {
        __m128 x = _mm_loadu_ps(in);
        in += 4;
        __m128 swap_mask = _mm_cmpgt_ps(_mm_and_ps(x, abs_mask), one);
        __m128 x_star = _mm_blendv_ps(x, _mm_rcp_ps(x), swap_mask);
        __m128 result = _mm_arctan_poly_sse(x_star);
        __m128 term = _mm_and_ps(x_star, sign_mask);
        term = _mm_or_ps(pi_over_2, term);
        term = _mm_sub_ps(term, result);
        result = _mm_blendv_ps(result, term, swap_mask);
        _mm_storeu_ps(out, result);
        out += 4;
    }

    for (unsigned int number = quarter_points * 4; number < num_points; number++) {
        *out++ = volk_arctan(*in++);
    }
}
#endif /* LV_HAVE_SSE4_1 for unaligned */

#ifdef LV_HAVE_RVV
#include <riscv_vector.h>

static inline void
volk_32f_atan_32f_rvv(float* out, const float* in, unsigned int num_points)
{
    size_t vlmax = __riscv_vsetvlmax_e32m2();

    const vfloat32m2_t cpio2 = __riscv_vfmv_v_f_f32m2(1.5707964f, vlmax);
    const vfloat32m2_t cf1 = __riscv_vfmv_v_f_f32m2(1.0f, vlmax);
    /* Coefficients of the odd degree-13 polynomial approximating atan on [-1, 1] */
    const vfloat32m2_t c1 = __riscv_vfmv_v_f_f32m2(+0x1.ffffeap-1f, vlmax);
    const vfloat32m2_t c3 = __riscv_vfmv_v_f_f32m2(-0x1.55437p-2f, vlmax);
    const vfloat32m2_t c5 = __riscv_vfmv_v_f_f32m2(+0x1.972be6p-3f, vlmax);
    const vfloat32m2_t c7 = __riscv_vfmv_v_f_f32m2(-0x1.1436ap-3f, vlmax);
    const vfloat32m2_t c9 = __riscv_vfmv_v_f_f32m2(+0x1.5785aap-4f, vlmax);
    const vfloat32m2_t c11 = __riscv_vfmv_v_f_f32m2(-0x1.2f3004p-5f, vlmax);
    const vfloat32m2_t c13 = __riscv_vfmv_v_f_f32m2(+0x1.01a37cp-7f, vlmax);

    size_t n = num_points;
    for (size_t vl; n > 0; n -= vl, in += vl, out += vl) {
        vl = __riscv_vsetvl_e32m2(n);
        vfloat32m2_t v = __riscv_vle32_v_f32m2(in, vl);
        /* mswap = |v| > 1 */
        vbool16_t mswap = __riscv_vmfgt(__riscv_vfabs(v, vl), cf1, vl);
        /* x = mswap ? 1/v : v (masked divide; inactive lanes keep v) */
        vfloat32m2_t x = __riscv_vfdiv_mu(mswap, v, cf1, v, vl);
        vfloat32m2_t xx = __riscv_vfmul(x, x, vl);
        /* Horner evaluation in x^2, multiplied by x at the end */
        vfloat32m2_t p = c13;
        p = __riscv_vfmadd(p, xx, c11, vl);
        p = __riscv_vfmadd(p, xx, c9, vl);
        p = __riscv_vfmadd(p, xx, c7, vl);
        p = __riscv_vfmadd(p, xx, c5, vl);
        p = __riscv_vfmadd(p, xx, c3, vl);
        p = __riscv_vfmadd(p, xx, c1, vl);
        p = __riscv_vfmul(p, x, vl);

        /* Swapped lanes: atan(v) = sign(x) * pi/2 - p */
        vfloat32m2_t t = __riscv_vfsub(__riscv_vfsgnj(cpio2, x, vl), p, vl);
        p = __riscv_vmerge(p, t, mswap, vl);

        __riscv_vse32(out, p, vl);
    }
}
#endif /* LV_HAVE_RVV */
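
/*
 * A scalar sketch of what volk_arctan() (used in all the tail loops
 * above) plausibly computes, assuming the same coefficients as the RVV
 * kernel; the actual helper lives elsewhere in VOLK and may differ.
 */
#if 0 /* illustrative sketch only */
static inline float example_arctan(const float x)
{
    const float c1 = +0x1.ffffeap-1f;
    const float c3 = -0x1.55437p-2f;
    const float c5 = +0x1.972be6p-3f;
    const float c7 = -0x1.1436ap-3f;
    const float c9 = +0x1.5785aap-4f;
    const float c11 = -0x1.2f3004p-5f;
    const float c13 = +0x1.01a37cp-7f;

    /* Range reduction: evaluate directly for |x| <= 1, otherwise use
     * atan(x) = sign(x) * pi/2 - atan(1/x). */
    const int swap = fabsf(x) > 1.f;
    const float xs = swap ? 1.f / x : x;
    const float xx = xs * xs;
    float p = c13;
    p = p * xx + c11;
    p = p * xx + c9;
    p = p * xx + c7;
    p = p * xx + c5;
    p = p * xx + c3;
    p = p * xx + c1;
    p *= xs;
    return swap ? copysignf(0x1.921fb6p0f, xs) - p : p;
}
#endif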

#endif /* INCLUDED_volk_32f_atan_32f_u_H */