Vector Optimized Library of Kernels 3.3.0
Architecture-tuned implementations of math kernels
Loading...
Searching...
No Matches
volk_avx2_fma_intrinsics.h
Go to the documentation of this file.
1/* -*- c++ -*- */
2/*
3 * Copyright 2023 - 2025 Magnus Lundmark <magnuslundmark@gmail.com>
4 *
5 * This file is part of VOLK
6 *
7 * SPDX-License-Identifier: LGPL-3.0-or-later
8 */
9
10/*
11 * This file is intended to hold AVX2 FMA intrinsics.
12 * They should be used in VOLK kernels to avoid copy-paste.
13 */
14
15#ifndef INCLUDE_VOLK_VOLK_AVX2_FMA_INTRINSICS_H_
16#define INCLUDE_VOLK_VOLK_AVX2_FMA_INTRINSICS_H_
17#include <immintrin.h>
18
19/*
20 * Approximate arctan(x) via polynomial expansion
21 * on the interval [-1, 1]
22 *
23 * Maximum relative error ~6.5e-7
24 * Polynomial evaluated via Horner's method
25 */
26static inline __m256 _mm256_arctan_poly_avx2_fma(const __m256 x)
27{
28 const __m256 a1 = _mm256_set1_ps(+0x1.ffffeap-1f);
29 const __m256 a3 = _mm256_set1_ps(-0x1.55437p-2f);
30 const __m256 a5 = _mm256_set1_ps(+0x1.972be6p-3f);
31 const __m256 a7 = _mm256_set1_ps(-0x1.1436ap-3f);
32 const __m256 a9 = _mm256_set1_ps(+0x1.5785aap-4f);
33 const __m256 a11 = _mm256_set1_ps(-0x1.2f3004p-5f);
34 const __m256 a13 = _mm256_set1_ps(+0x1.01a37cp-7f);
35
36 const __m256 x_times_x = _mm256_mul_ps(x, x);
37 __m256 arctan;
38 arctan = a13;
39 arctan = _mm256_fmadd_ps(x_times_x, arctan, a11);
40 arctan = _mm256_fmadd_ps(x_times_x, arctan, a9);
41 arctan = _mm256_fmadd_ps(x_times_x, arctan, a7);
42 arctan = _mm256_fmadd_ps(x_times_x, arctan, a5);
43 arctan = _mm256_fmadd_ps(x_times_x, arctan, a3);
44 arctan = _mm256_fmadd_ps(x_times_x, arctan, a1);
45 arctan = _mm256_mul_ps(x, arctan);
46
47 return arctan;
48}
49
50/*
51 * Approximate arcsin(x) via polynomial expansion
52 * P(u) such that asin(x) = x * P(x^2) on |x| <= 0.5
53 *
54 * Maximum relative error ~1.5e-6
55 * Polynomial evaluated via Horner's method
56 */
57static inline __m256 _mm256_arcsin_poly_avx2_fma(const __m256 x)
58{
59 const __m256 c0 = _mm256_set1_ps(0x1.ffffcep-1f);
60 const __m256 c1 = _mm256_set1_ps(0x1.55b648p-3f);
61 const __m256 c2 = _mm256_set1_ps(0x1.24d192p-4f);
62 const __m256 c3 = _mm256_set1_ps(0x1.0a788p-4f);
63
64 const __m256 u = _mm256_mul_ps(x, x);
65 __m256 p = c3;
66 p = _mm256_fmadd_ps(u, p, c2);
67 p = _mm256_fmadd_ps(u, p, c1);
68 p = _mm256_fmadd_ps(u, p, c0);
69
70 return _mm256_mul_ps(x, p);
71}
72
73/*
74 * Minimax polynomial for sin(x) on [-pi/4, pi/4]
75 * Coefficients via Remez algorithm (Sollya)
76 * Max |error| < 7.3e-9
77 * sin(x) = x + x^3 * (s1 + x^2 * (s2 + x^2 * s3))
78 */
79static inline __m256 _mm256_sin_poly_avx2_fma(const __m256 x)
80{
81 const __m256 s1 = _mm256_set1_ps(-0x1.555552p-3f);
82 const __m256 s2 = _mm256_set1_ps(+0x1.110be2p-7f);
83 const __m256 s3 = _mm256_set1_ps(-0x1.9ab22ap-13f);
84
85 const __m256 x2 = _mm256_mul_ps(x, x);
86 const __m256 x3 = _mm256_mul_ps(x2, x);
87
88 __m256 poly = _mm256_fmadd_ps(x2, s3, s2);
89 poly = _mm256_fmadd_ps(x2, poly, s1);
90 return _mm256_fmadd_ps(x3, poly, x);
91}
92
93/*
94 * Minimax polynomial for cos(x) on [-pi/4, pi/4]
95 * Coefficients via Remez algorithm (Sollya)
96 * Max |error| < 1.1e-7
97 * cos(x) = 1 + x^2 * (c1 + x^2 * (c2 + x^2 * c3))
98 */
99static inline __m256 _mm256_cos_poly_avx2_fma(const __m256 x)
100{
101 const __m256 c1 = _mm256_set1_ps(-0x1.fffff4p-2f);
102 const __m256 c2 = _mm256_set1_ps(+0x1.554a46p-5f);
103 const __m256 c3 = _mm256_set1_ps(-0x1.661be2p-10f);
104 const __m256 one = _mm256_set1_ps(1.0f);
105
106 const __m256 x2 = _mm256_mul_ps(x, x);
107
108 __m256 poly = _mm256_fmadd_ps(x2, c3, c2);
109 poly = _mm256_fmadd_ps(x2, poly, c1);
110 return _mm256_fmadd_ps(x2, poly, one);
111}
112
113/*
114 * Polynomial coefficients for log2(x)/(x-1) on [1, 2]
115 * Generated with Sollya: remez(log2(x)/(x-1), 6, [1+1b-20, 2])
116 * Max error: ~1.55e-6
117 *
118 * Usage: log2(x) ≈ poly(x) * (x - 1) for x ∈ [1, 2]
119 * Polynomial evaluated via Horner's method with FMA
120 */
121static inline __m256 _mm256_log2_poly_avx2_fma(const __m256 x)
122{
123 const __m256 c0 = _mm256_set1_ps(+0x1.a8a726p+1f);
124 const __m256 c1 = _mm256_set1_ps(-0x1.0b7f7ep+2f);
125 const __m256 c2 = _mm256_set1_ps(+0x1.05d9ccp+2f);
126 const __m256 c3 = _mm256_set1_ps(-0x1.4d476cp+1f);
127 const __m256 c4 = _mm256_set1_ps(+0x1.04fc3ap+0f);
128 const __m256 c5 = _mm256_set1_ps(-0x1.c97982p-3f);
129 const __m256 c6 = _mm256_set1_ps(+0x1.57aa42p-6f);
130
131 // Horner's method with FMA: c0 + x*(c1 + x*(c2 + ...))
132 __m256 poly = c6;
133 poly = _mm256_fmadd_ps(poly, x, c5);
134 poly = _mm256_fmadd_ps(poly, x, c4);
135 poly = _mm256_fmadd_ps(poly, x, c3);
136 poly = _mm256_fmadd_ps(poly, x, c2);
137 poly = _mm256_fmadd_ps(poly, x, c1);
138 poly = _mm256_fmadd_ps(poly, x, c0);
139 return poly;
140}
141
142#endif /* INCLUDE_VOLK_VOLK_AVX2_FMA_INTRINSICS_H_ */