// One Newton–Raphson refinement of the hardware reciprocal-square-root
// estimate: x1 = x0 * (1.5 - 0.5 * a * x0^2). _mm_rsqrt_ps alone gives a
// low-precision estimate; one NR step roughly doubles its precision.
// NOTE(review): the function signature (and the definition of `a`) is on a
// line not shown here — presumably `a` is the __m128 input vector; confirm.
29 const __m128 HALF = _mm_set1_ps(0.5f);
30 const __m128 THREE_HALFS = _mm_set1_ps(1.5f);
// Raw hardware estimate x0 ~= 1/sqrt(a).
32 const __m128 x0 = _mm_rsqrt_ps(a);
// Refined estimate x1 = x0 * (1.5 - 0.5 * x0 * x0 * a).
35 __m128 x1 = _mm_mul_ps(
36 x0, _mm_sub_ps(THREE_HALFS, _mm_mul_ps(HALF, _mm_mul_ps(_mm_mul_ps(x0, x0), a))));
// Detect inputs whose bit pattern is exactly +0.0f or +Inf; for those the NR
// step would produce NaN (0*Inf), so keep the raw x0 result instead
// (rsqrt(+0) = +Inf, rsqrt(+Inf) = 0 are already correct in x0).
40 __m128i a_si = _mm_castps_si128(a);
41 __m128i zero_mask = _mm_cmpeq_epi32(a_si, _mm_setzero_si128());
42 __m128i inf_mask = _mm_cmpeq_epi32(a_si, _mm_set1_epi32(0x7F800000));
43 __m128 special_mask = _mm_castsi128_ps(_mm_or_si128(zero_mask, inf_mask));
// NOTE(review): these are exact integer compares, so -0.0f (0x80000000) and
// negative inputs are NOT caught and take the refined path — confirm whether
// such inputs are excluded by the caller.
// Lane-wise select: special lanes keep x0, all others take the refined x1.
45 return _mm_or_ps(_mm_and_ps(special_mask, x0), _mm_andnot_ps(special_mask, x1));
// Odd-polynomial approximation of arctan(x) in Horner form:
//   arctan ~= a1*x + a3*x^3 + a5*x^5 + ... + a13*x^13.
// The leading coefficients are close to the arctan Taylor series
// (a1 ~= 1, a3 ~= -1/3, a5 ~= +1/5, ...) but tuned (minimax-style) so the
// higher terms deviate from the pure Taylor values. Presumably valid only on
// a reduced range such as |x| <= 1 — the range reduction is not visible here.
57 const __m128 a1 = _mm_set1_ps(+0x1.ffffeap-1f);
58 const __m128 a3 = _mm_set1_ps(-0x1.55437p-2f);
59 const __m128 a5 = _mm_set1_ps(+0x1.972be6p-3f);
60 const __m128 a7 = _mm_set1_ps(-0x1.1436ap-3f);
61 const __m128 a9 = _mm_set1_ps(+0x1.5785aap-4f);
62 const __m128 a11 = _mm_set1_ps(-0x1.2f3004p-5f);
63 const __m128 a13 = _mm_set1_ps(+0x1.01a37cp-7f);
// Horner evaluation in x^2; odd polynomial factored as x * P(x^2).
65 const __m128 x_times_x = _mm_mul_ps(x, x);
// NOTE(review): `arctan` is presumably seeded with a13 on a line not shown
// here (original line 67) — confirm.
68 arctan = _mm_mul_ps(x_times_x, arctan);
69 arctan = _mm_add_ps(arctan, a11);
70 arctan = _mm_mul_ps(x_times_x, arctan);
71 arctan = _mm_add_ps(arctan, a9);
72 arctan = _mm_mul_ps(x_times_x, arctan);
73 arctan = _mm_add_ps(arctan, a7);
74 arctan = _mm_mul_ps(x_times_x, arctan);
75 arctan = _mm_add_ps(arctan, a5);
76 arctan = _mm_mul_ps(x_times_x, arctan);
77 arctan = _mm_add_ps(arctan, a3);
78 arctan = _mm_mul_ps(x_times_x, arctan);
79 arctan = _mm_add_ps(arctan, a1);
// Final multiply by x restores the odd symmetry: result = x * P(x^2).
80 arctan = _mm_mul_ps(x, arctan);
// Odd-polynomial approximation x * (c0 + c1*x^2 + c2*x^4 + c3*x^6) in
// Horner form. The coefficients (c0 ~= 1, c1 ~= 1/6, c2 ~= 0.0715) resemble
// the arcsin(x) series (x + x^3/6 + 3x^5/40 + ...) with tuned higher terms —
// NOTE(review): the enclosing function name/signature is not visible; confirm
// the target function and the valid input range from the surrounding file.
94 const __m128 c0 = _mm_set1_ps(0x1.ffffcep-1f);
95 const __m128 c1 = _mm_set1_ps(0x1.55b648p-3f);
96 const __m128 c2 = _mm_set1_ps(0x1.24d192p-4f);
97 const __m128 c3 = _mm_set1_ps(0x1.0a788p-4f);
// Evaluate in u = x^2 so the odd polynomial factors as x * P(u).
99 const __m128 u = _mm_mul_ps(x, x);
// NOTE(review): `p` is presumably seeded with c3 on a line not shown here
// (original line 100) — confirm.
101 p = _mm_mul_ps(u, p);
102 p = _mm_add_ps(p, c2);
103 p = _mm_mul_ps(u, p);
104 p = _mm_add_ps(p, c1);
105 p = _mm_mul_ps(u, p);
106 p = _mm_add_ps(p, c0);
// Restore odd symmetry: result = x * P(x^2).
108 return _mm_mul_ps(x, p);
// Computes |z|^2 = re^2 + im^2 for four complex floats packed across two
// __m128 registers. NOTE(review): `cplxValue1`/`cplxValue2` come from the
// signature not shown here — presumably interleaved (re, im) pairs; confirm.
113 __m128 iValue, qValue;
// Gather the even lanes (elements 0 and 2) of both inputs: the real parts.
115 iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2, 0, 2, 0));
// Gather the odd lanes (elements 1 and 3) of both inputs: the imaginary parts.
117 qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3, 1, 3, 1));
118 iValue = _mm_mul_ps(iValue, iValue);
119 qValue = _mm_mul_ps(qValue, qValue);
// Per-lane squared magnitude of the four complex values.
120 return _mm_add_ps(iValue, qValue);
// Returns sq_acc + rec * (aux * val - acc)^2, per lane.
// NOTE(review): the function name and the first line of the parameter list
// are not shown here. The shape of the computation (a scaled squared
// difference folded into a running sum of squares) suggests this is the
// update step of an incremental/parallel variance accumulation — confirm
// against the caller; `rec` is presumably a reciprocal count.
142 __m128 sq_acc, __m128 acc, __m128 val, __m128 rec, __m128 aux)
144 aux = _mm_mul_ps(aux, val);
145 aux = _mm_sub_ps(aux, acc);
// Square the difference, scale it by `rec`, and add into the accumulator.
146 aux = _mm_mul_ps(aux, aux);
147 aux = _mm_mul_ps(aux, rec);
148 return _mm_add_ps(sq_acc, aux);
// Polynomial approximation of sin(x):
//   sin(x) ~= x + x^3 * (s1 + s2*x^2 + s3*x^4).
// Coefficients are close to the Taylor values -1/6, +1/120, -1/5040 but
// slightly tuned, so this is presumably a minimax fit valid on a reduced
// range (e.g. |x| <= pi/4) — the range reduction is not visible here.
159 const __m128 s1 = _mm_set1_ps(-0x1.555552p-3f);
160 const __m128 s2 = _mm_set1_ps(+0x1.110be2p-7f);
161 const __m128 s3 = _mm_set1_ps(-0x1.9ab22ap-13f);
163 const __m128 x2 = _mm_mul_ps(x, x);
164 const __m128 x3 = _mm_mul_ps(x2, x);
// Horner evaluation of the inner polynomial in x^2.
166 __m128 poly = _mm_add_ps(_mm_mul_ps(x2, s3), s2);
167 poly = _mm_add_ps(_mm_mul_ps(x2, poly), s1);
// Final form x + x^3 * P(x^2) keeps the leading term exact.
168 return _mm_add_ps(_mm_mul_ps(x3, poly), x);
// Polynomial approximation of cos(x):
//   cos(x) ~= 1 + x^2 * (c1 + c2*x^2 + c3*x^4).
// Coefficients are close to the Taylor values -1/2, +1/24, -1/720 but
// slightly tuned — presumably a minimax fit on a reduced range (e.g.
// |x| <= pi/4); the range reduction is not visible here.
179 const __m128 c1 = _mm_set1_ps(-0x1.fffff4p-2f);
180 const __m128 c2 = _mm_set1_ps(+0x1.554a46p-5f);
181 const __m128 c3 = _mm_set1_ps(-0x1.661be2p-10f);
182 const __m128 one = _mm_set1_ps(1.0f);
184 const __m128 x2 = _mm_mul_ps(x, x);
// Horner evaluation of the inner polynomial in x^2.
186 __m128 poly = _mm_add_ps(_mm_mul_ps(x2, c3), c2);
187 poly = _mm_add_ps(_mm_mul_ps(x2, poly), c1);
// Final form 1 + x^2 * P(x^2) keeps the leading term exact.
188 return _mm_add_ps(_mm_mul_ps(x2, poly), one);
// Degree-6 polynomial c0 + c1*x + c2*x^2 + ... + c6*x^6 evaluated by Horner's
// method. NOTE(review): the enclosing function signature and the target
// function being approximated are not visible from this chunk; the
// alternating-sign coefficients suggest a minimax fit on a bounded interval —
// confirm the intended function and domain from the surrounding file.
201 const __m128 c0 = _mm_set1_ps(+0x1.a8a726p+1f);
202 const __m128 c1 = _mm_set1_ps(-0x1.0b7f7ep+2f);
203 const __m128 c2 = _mm_set1_ps(+0x1.05d9ccp+2f);
204 const __m128 c3 = _mm_set1_ps(-0x1.4d476cp+1f);
205 const __m128 c4 = _mm_set1_ps(+0x1.04fc3ap+0f);
206 const __m128 c5 = _mm_set1_ps(-0x1.c97982p-3f);
207 const __m128 c6 = _mm_set1_ps(+0x1.57aa42p-6f);
// NOTE(review): `poly` is presumably seeded with c6 on a line not shown here
// (around original line 210) — confirm.
211 poly = _mm_add_ps(_mm_mul_ps(poly, x), c5);
212 poly = _mm_add_ps(_mm_mul_ps(poly, x), c4);
213 poly = _mm_add_ps(_mm_mul_ps(poly, x), c3);
214 poly = _mm_add_ps(_mm_mul_ps(poly, x), c2);
215 poly = _mm_add_ps(_mm_mul_ps(poly, x), c1);
216 poly = _mm_add_ps(_mm_mul_ps(poly, x), c0);