28 const __m512 HALF = _mm512_set1_ps(0.5f);
29 const __m512 THREE_HALFS = _mm512_set1_ps(1.5f);
31 const __m512 x0 = _mm512_rsqrt14_ps(a);
34 __m512 x1 = _mm512_mul_ps(
35 x0, _mm512_fnmadd_ps(HALF, _mm512_mul_ps(_mm512_mul_ps(x0, x0), a), THREE_HALFS));
39 __m512i a_si = _mm512_castps_si512(a);
40 __mmask16 zero_mask = _mm512_cmpeq_epi32_mask(a_si, _mm512_setzero_si512());
41 __mmask16 inf_mask = _mm512_cmpeq_epi32_mask(a_si, _mm512_set1_epi32(0x7F800000));
42 return _mm512_mask_blend_ps(zero_mask | inf_mask, x1, x0);
/*
 * _mm512_real: build one __m512 from two input vectors z1 and z2 with a
 * two-source permute that picks the even-numbered float positions
 * (0, 2, ..., 30) of the 32-element z1:z2 pair.
 *
 * Index values 0..14 address elements of z1 and 16..30 address elements
 * 0..14 of z2, so the low eight result lanes come from z1 and the high
 * eight from z2.
 *
 * NOTE(review): presumably z1/z2 hold interleaved (re, im) floats, which
 * would make the result the 16 real parts -- confirm against callers.
 * NOTE(review): the opening brace and the declaration that binds the
 * index vector to `idx` are not visible in this excerpt.
 */
49static inline __m512
_mm512_real(
const __m512 z1,
const __m512 z2)
/* _mm512_set_epi32 lists elements from highest (15) to lowest (0), so
 * lane i of the index vector holds 2*i. */
52 _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
53 return _mm512_permutex2var_ps(z1, idx, z2);
/*
 * _mm512_imag: build one __m512 from two input vectors z1 and z2 with a
 * two-source permute that picks the odd-numbered float positions
 * (1, 3, ..., 31) of the 32-element z1:z2 pair.
 *
 * Index values 1..15 address elements of z1 and 17..31 address elements
 * 1..15 of z2, so the low eight result lanes come from z1 and the high
 * eight from z2.
 *
 * NOTE(review): presumably z1/z2 hold interleaved (re, im) floats, which
 * would make the result the 16 imaginary parts -- confirm against callers.
 * NOTE(review): the opening brace and the declaration that binds the
 * index vector to `idx` are not visible in this excerpt.
 */
60static inline __m512
_mm512_imag(
const __m512 z1,
const __m512 z2)
/* _mm512_set_epi32 lists elements from highest (15) to lowest (0), so
 * lane i of the index vector holds 2*i + 1. */
63 _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
64 return _mm512_permutex2var_ps(z1, idx, z2);
/*
 * Fragment: per-lane evaluation of an odd polynomial in x,
 *   x * (a1 + a3*x^2 + a5*x^4 + ... + a11*x^10 + <seed>*x^12),
 * using Horner's scheme on x^2 with fused multiply-adds.
 *
 * NOTE(review): the function header, the statement that seeds `arctan`
 * (presumably with a13, which is otherwise unused here) and the return
 * are not visible in this excerpt. The variable name suggests an
 * arctangent approximation; the valid input range is not shown -- confirm.
 */
/* Polynomial coefficients, broadcast to all 16 lanes. */
75 const __m512 a1 = _mm512_set1_ps(+0x1.ffffeap-1f);
76 const __m512 a3 = _mm512_set1_ps(-0x1.55437p-2f);
77 const __m512 a5 = _mm512_set1_ps(+0x1.972be6p-3f);
78 const __m512 a7 = _mm512_set1_ps(-0x1.1436ap-3f);
79 const __m512 a9 = _mm512_set1_ps(+0x1.5785aap-4f);
80 const __m512 a11 = _mm512_set1_ps(-0x1.2f3004p-5f);
81 const __m512 a13 = _mm512_set1_ps(+0x1.01a37cp-7f);
83 const __m512 x_times_x = _mm512_mul_ps(x, x);
/* Horner steps: arctan = x^2 * arctan + next lower coefficient. */
86 arctan = _mm512_fmadd_ps(x_times_x, arctan, a11);
87 arctan = _mm512_fmadd_ps(x_times_x, arctan, a9);
88 arctan = _mm512_fmadd_ps(x_times_x, arctan, a7);
89 arctan = _mm512_fmadd_ps(x_times_x, arctan, a5);
90 arctan = _mm512_fmadd_ps(x_times_x, arctan, a3);
91 arctan = _mm512_fmadd_ps(x_times_x, arctan, a1);
/* Final multiply by x turns the even polynomial in x^2 into an odd one. */
92 arctan = _mm512_mul_ps(x, arctan);
/*
 * Fragment: per-lane evaluation of an odd polynomial in x,
 *   x * (c0 + c1*u + c2*u^2 + <seed>*u^3)   with u = x^2,
 * using Horner's scheme with fused multiply-adds.
 *
 * NOTE(review): the function header and the statement that seeds `p`
 * (presumably with c3, which is otherwise unused here) are not visible in
 * this excerpt. c0, c1, c2 are close to 1, 1/6 and 3/40, the leading
 * arcsine series terms, so this is likely an arcsin approximation --
 * confirm the function identity and its valid input range.
 */
/* Polynomial coefficients, broadcast to all 16 lanes. */
106 const __m512 c0 = _mm512_set1_ps(0x1.ffffcep-1f);
107 const __m512 c1 = _mm512_set1_ps(0x1.55b648p-3f);
108 const __m512 c2 = _mm512_set1_ps(0x1.24d192p-4f);
109 const __m512 c3 = _mm512_set1_ps(0x1.0a788p-4f);
111 const __m512 u = _mm512_mul_ps(x, x);
/* Horner steps: p = u * p + next lower coefficient. */
113 p = _mm512_fmadd_ps(u, p, c2);
114 p = _mm512_fmadd_ps(u, p, c1);
115 p = _mm512_fmadd_ps(u, p, c0);
117 return _mm512_mul_ps(x, p);
/*
 * Fragment: pairwise multiply of two vectors x and y, treating each
 * adjacent (even, odd) float pair as one value. For a pair (x0, x1) of x
 * and (y0, y1) of y the result pair is
 *   even lane: x0*y0 - x1*y1
 *   odd lane:  x1*y0 + x0*y1
 * which is the complex product when each pair is stored as (re, im).
 *
 * NOTE(review): the function header is not visible in this excerpt; the
 * (re, im) interleaved layout is an assumption -- confirm against callers.
 */
/* yl: each pair becomes (y0, y0); yh: each pair becomes (y1, y1). */
126 const __m512 yl = _mm512_moveldup_ps(y);
127 const __m512 yh = _mm512_movehdup_ps(y);
128 const __m512 tmp1 = _mm512_mul_ps(x, yl);
/* 0xB1 swaps the two floats of every adjacent pair: (x0, x1) -> (x1, x0). */
129 const __m512 x_swap =
130 _mm512_permute_ps(x, 0xB1);
135 const __m512 tmp2 = _mm512_mul_ps(x_swap, yh);
/* 0x5555 selects the even lanes. Those lanes get tmp1 - tmp2; the
 * unselected odd lanes keep the tmp1 + tmp2 value passed as the source. */
138 const __mmask16 addsub_mask = 0x5555;
139 return _mm512_mask_sub_ps(_mm512_add_ps(tmp1, tmp2), addsub_mask, tmp1, tmp2);
/*
 * Fragment: pairwise multiply of x by y where the duplicated odd-lane
 * ("imaginary") values of y are first XORed with a per-lane integer mask.
 * The result is  nswap * dimagconj + x * dreal  in every lane.
 *
 * NOTE(review): the function header and all but the first element (0) of
 * the `conjugator_i` initializer are missing from this excerpt.
 * Presumably the mask holds the float sign bit in the odd lanes, which
 * would make this x * conj(y) for (re, im) interleaved data -- confirm
 * against the full source.
 */
/* 0xb1 swaps the two floats of every adjacent pair of x. */
149 const __m512 nswap = _mm512_permute_ps(x, 0xb1);
/* dreal: each pair of y becomes (y0, y0); dimag: each pair becomes (y1, y1). */
150 const __m512 dreal = _mm512_moveldup_ps(y);
151 const __m512 dimag = _mm512_movehdup_ps(y);
/* Integer mask XORed into dimag below (initializer truncated here). */
154 const __m512i conjugator_i = _mm512_setr_epi32(0,
/* The XOR is done on the integer view of the floats, so it flips exactly
 * the bits set in the mask and leaves lanes with a zero mask unchanged. */
170 const __m512 dimagconj = _mm512_castsi512_ps(_mm512_xor_epi32(
171 _mm512_castps_si512(dimag), conjugator_i));
174 return _mm512_fmadd_ps(nswap, dimagconj, _mm512_mul_ps(x, dreal));
/*
 * Fragment: divide every adjacent (even, odd) float pair of `val` by the
 * pair's Euclidean length sqrt(v0^2 + v1^2).
 *
 * There is no guard for a zero-length pair: both lanes then compute
 * 0 / 0, which is NaN under IEEE-754 arithmetic.
 *
 * NOTE(review): the function header is not visible in this excerpt;
 * presumably each pair is one complex sample stored as (re, im) --
 * confirm against callers.
 */
184 __m512 tmp1 = _mm512_mul_ps(val, val);
/* 0xB1 swaps the two squares of each pair so the add below leaves
 * v0^2 + v1^2 in both lanes of the pair. */
187 const __m512 tmp1_swapped = _mm512_permute_ps(tmp1, 0xB1);
190 __m512 mag_sq = _mm512_add_ps(tmp1, tmp1_swapped);
193 const __m512 mag = _mm512_sqrt_ps(mag_sq);
196 return _mm512_div_ps(val, mag);
/*
 * Fragment: per-lane evaluation of the odd polynomial
 *   x + x^3 * (s1 + s2*x^2 + s3*x^4)
 * using fused multiply-adds.
 *
 * NOTE(review): the function header is not visible in this excerpt.
 * s1, s2, s3 are close to -1/6, 1/120 and -1/5040, the leading sine
 * series terms, so this is likely a sine approximation for a reduced
 * argument -- confirm the function identity and its valid input range.
 */
/* Polynomial coefficients, broadcast to all 16 lanes. */
208 const __m512 s1 = _mm512_set1_ps(-0x1.555552p-3f);
209 const __m512 s2 = _mm512_set1_ps(+0x1.110be2p-7f);
210 const __m512 s3 = _mm512_set1_ps(-0x1.9ab22ap-13f);
212 const __m512 x2 = _mm512_mul_ps(x, x);
213 const __m512 x3 = _mm512_mul_ps(x2, x);
/* Horner on x^2: poly = s1 + x^2 * (s2 + x^2 * s3). */
215 __m512 poly = _mm512_fmadd_ps(x2, s3, s2);
216 poly = _mm512_fmadd_ps(x2, poly, s1);
/* Adding x through the last FMA keeps the linear term's coefficient
 * exactly 1. */
217 return _mm512_fmadd_ps(x3, poly, x);
/*
 * Fragment: per-lane evaluation of the even polynomial
 *   1 + x^2 * (c1 + c2*x^2 + c3*x^4)
 * using fused multiply-adds.
 *
 * NOTE(review): the function header is not visible in this excerpt.
 * c1, c2, c3 are close to -1/2, 1/24 and -1/720, the leading cosine
 * series terms, so this is likely a cosine approximation for a reduced
 * argument -- confirm the function identity and its valid input range.
 */
/* Polynomial coefficients and the constant term, broadcast to all 16 lanes. */
229 const __m512 c1 = _mm512_set1_ps(-0x1.fffff4p-2f);
230 const __m512 c2 = _mm512_set1_ps(+0x1.554a46p-5f);
231 const __m512 c3 = _mm512_set1_ps(-0x1.661be2p-10f);
232 const __m512 one = _mm512_set1_ps(1.0f);
234 const __m512 x2 = _mm512_mul_ps(x, x);
/* Horner on x^2: poly = c1 + x^2 * (c2 + x^2 * c3). */
236 __m512 poly = _mm512_fmadd_ps(x2, c3, c2);
237 poly = _mm512_fmadd_ps(x2, poly, c1);
238 return _mm512_fmadd_ps(x2, poly, one);
252 const __m512 c0 = _mm512_set1_ps(+0x1.a8a726p+1f);
253 const __m512 c1 = _mm512_set1_ps(-0x1.0b7f7ep+2f);
254 const __m512 c2 = _mm512_set1_ps(+0x1.05d9ccp+2f);
255 const __m512 c3 = _mm512_set1_ps(-0x1.4d476cp+1f);
256 const __m512 c4 = _mm512_set1_ps(+0x1.04fc3ap+0f);
257 const __m512 c5 = _mm512_set1_ps(-0x1.c97982p-3f);
258 const __m512 c6 = _mm512_set1_ps(+0x1.57aa42p-6f);
262 poly = _mm512_fmadd_ps(poly, x, c5);
263 poly = _mm512_fmadd_ps(poly, x, c4);
264 poly = _mm512_fmadd_ps(poly, x, c3);
265 poly = _mm512_fmadd_ps(poly, x, c2);
266 poly = _mm512_fmadd_ps(poly, x, c1);
267 poly = _mm512_fmadd_ps(poly, x, c0);