/*
 * Listing fragment (the function header and some continuation lines are not
 * shown): refines the hardware reciprocal-square-root estimate of `a`, but
 * keeps the raw estimate for lanes whose bits are exactly 0x00000000 or
 * 0x7F800000.
 */
28 const __m256 HALF = _mm256_set1_ps(0.5f);
29 const __m256 THREE_HALFS = _mm256_set1_ps(1.5f);
/* x0: per-lane approximate 1/sqrt(a) produced by the rsqrt intrinsic. */
31 const __m256 x0 = _mm256_rsqrt_ps(a);
/*
 * x1 = <first operand> * (1.5 - 0.5 * ((x0 * x0) * a)).
 * The first operand of the outer multiply sits on listing line 35, which is
 * not shown here (presumably x0, which would make this one Newton-Raphson
 * step -- TODO confirm).
 */
34 __m256 x1 = _mm256_mul_ps(
36 _mm256_sub_ps(THREE_HALFS,
37 _mm256_mul_ps(HALF, _mm256_mul_ps(_mm256_mul_ps(x0, x0), a))));
/*
 * Reinterpret `a` as integers and split it into its low and high 128-bit
 * halves so that the lanes can be tested with 32-bit integer equality.
 * NOTE(review): presumably split so that only 128-bit integer compares are
 * required -- confirm against the instruction set this file targets.
 */
42 __m128i a_lo = _mm256_castsi256_si128(_mm256_castps_si256(a));
43 __m128i a_hi = _mm_castps_si128(_mm256_extractf128_ps(a, 1));
/* All-zero bits match +0.0f only; -0.0f (0x80000000) does not compare equal. */
44 __m128i zero_si = _mm_setzero_si128();
/* 0x7F800000 is the IEEE-754 binary32 bit pattern of +infinity. */
45 __m128i inf_si = _mm_set1_epi32(0x7F800000);
46 __m128i zero_mask_lo = _mm_cmpeq_epi32(a_lo, zero_si);
47 __m128i zero_mask_hi = _mm_cmpeq_epi32(a_hi, zero_si);
48 __m128i inf_mask_lo = _mm_cmpeq_epi32(a_lo, inf_si);
49 __m128i inf_mask_hi = _mm_cmpeq_epi32(a_hi, inf_si);
/* A lane is "special" when it matched either pattern (all-ones in that lane). */
50 __m128 mask_lo = _mm_castsi128_ps(_mm_or_si128(zero_mask_lo, inf_mask_lo));
51 __m128 mask_hi = _mm_castsi128_ps(_mm_or_si128(zero_mask_hi, inf_mask_hi));
/*
 * Rejoin the two half masks into one 256-bit value. The declaration that
 * receives this expression (listing line 52) is not shown; presumably it is
 * `special_mask`, which the blend below reads.
 */
53 _mm256_insertf128_ps(_mm256_castps128_ps256(mask_lo), mask_hi, 1);
/* Lanes whose mask sign bit is set take x0; every other lane takes x1. */
54 return _mm256_blendv_ps(x1, x0, special_mask);
/*
 * Listing fragment: evaluates a polynomial in x with coefficients a1..a13
 * using Horner's scheme on x*x, then multiplies by x once at the end.
 * The statement that first assigns `arctan` (around listing lines 75-76) is
 * not shown; presumably it starts from a13, the only coefficient that is
 * declared here but not used in the visible lines -- TODO confirm.
 */
66 const __m256 a1 = _mm256_set1_ps(+0x1.ffffeap-1f);
67 const __m256 a3 = _mm256_set1_ps(-0x1.55437p-2f);
68 const __m256 a5 = _mm256_set1_ps(+0x1.972be6p-3f);
69 const __m256 a7 = _mm256_set1_ps(-0x1.1436ap-3f);
70 const __m256 a9 = _mm256_set1_ps(+0x1.5785aap-4f);
71 const __m256 a11 = _mm256_set1_ps(-0x1.2f3004p-5f);
72 const __m256 a13 = _mm256_set1_ps(+0x1.01a37cp-7f);
/* The recurrence below steps in x^2, so the square is computed once. */
74 const __m256 x_times_x = _mm256_mul_ps(x, x);
/* Horner steps: arctan = arctan * x^2 + a_k, for k = 11, 9, 7, 5, 3, 1. */
77 arctan = _mm256_mul_ps(x_times_x, arctan);
78 arctan = _mm256_add_ps(arctan, a11);
79 arctan = _mm256_mul_ps(x_times_x, arctan);
80 arctan = _mm256_add_ps(arctan, a9);
81 arctan = _mm256_mul_ps(x_times_x, arctan);
82 arctan = _mm256_add_ps(arctan, a7);
83 arctan = _mm256_mul_ps(x_times_x, arctan);
84 arctan = _mm256_add_ps(arctan, a5);
85 arctan = _mm256_mul_ps(x_times_x, arctan);
86 arctan = _mm256_add_ps(arctan, a3);
87 arctan = _mm256_mul_ps(x_times_x, arctan);
88 arctan = _mm256_add_ps(arctan, a1);
/* Final multiply by x turns the polynomial in x^2 into odd powers of x. */
89 arctan = _mm256_mul_ps(x, arctan);
/*
 * Listing fragment: returns x * P(x*x), where P is evaluated with Horner's
 * scheme from the coefficients c0..c3. The statement that first assigns `p`
 * (listing line 109) is not shown; presumably it starts from c3, which is
 * declared here but not used in the visible lines -- TODO confirm.
 * NOTE(review): the function being approximated is not identifiable from
 * these lines alone; check the missing function header.
 */
103 const __m256 c0 = _mm256_set1_ps(0x1.ffffcep-1f);
104 const __m256 c1 = _mm256_set1_ps(0x1.55b648p-3f);
105 const __m256 c2 = _mm256_set1_ps(0x1.24d192p-4f);
106 const __m256 c3 = _mm256_set1_ps(0x1.0a788p-4f);
/* u = x^2 is the variable of the inner polynomial. */
108 const __m256 u = _mm256_mul_ps(x, x);
/* Horner steps: p = p * u + c_k, for k = 2, 1, 0. */
110 p = _mm256_mul_ps(u, p);
111 p = _mm256_add_ps(p, c2);
112 p = _mm256_mul_ps(u, p);
113 p = _mm256_add_ps(p, c1);
114 p = _mm256_mul_ps(u, p);
115 p = _mm256_add_ps(p, c0);
/* Multiplying by x yields only odd powers of x in the result. */
117 return _mm256_mul_ps(x, p);
/*
 * Listing fragment (function header not shown): for each adjacent lane pair
 * (x[2k], x[2k+1]) and (y[2k], y[2k+1]) this produces
 *   even lane: x[2k]   * y[2k] - x[2k+1] * y[2k+1]
 *   odd lane:  x[2k+1] * y[2k] + x[2k]   * y[2k+1]
 * NOTE(review): this is the complex product x * y if lanes hold interleaved
 * (real, imaginary) pairs -- confirm the layout with the callers.
 */
122 __m256 yl, yh, tmp1, tmp2;
/* yl: even-indexed elements of y duplicated into both lanes of each pair. */
123 yl = _mm256_moveldup_ps(y);
/* yh: odd-indexed elements of y duplicated into both lanes of each pair. */
124 yh = _mm256_movehdup_ps(y);
125 tmp1 = _mm256_mul_ps(x, yl);
/* 0xB1 selects elements 1,0,3,2 of each 128-bit half: swap adjacent lanes. */
126 x = _mm256_shuffle_ps(x, x, 0xB1);
127 tmp2 = _mm256_mul_ps(x, yh);
/* addsub subtracts in even lanes and adds in odd lanes. */
130 return _mm256_addsub_ps(tmp1, tmp2);
/*
 * Listing fragment (function header not shown): flips the sign bit of every
 * odd-indexed lane of x and leaves even-indexed lanes unchanged.
 * -0.f has only the sign bit set, so XOR with it negates that lane; XOR with
 * 0 is a no-op. _mm256_setr_ps lists elements starting from lane 0.
 * NOTE(review): this is complex conjugation if lanes hold interleaved
 * (real, imaginary) pairs -- confirm the layout with the callers.
 */
135 const __m256 conjugator = _mm256_setr_ps(0, -0.f, 0, -0.f, 0, -0.f, 0, -0.f);
136 return _mm256_xor_ps(x, conjugator);
/*
 * Listing fragment (function header not shown): for each adjacent lane pair
 * (x[2k], x[2k+1]) and (y[2k], y[2k+1]) this produces
 *   even lane: x[2k]   * y[2k] + x[2k+1] * y[2k+1]
 *   odd lane:  x[2k+1] * y[2k] - x[2k]   * y[2k+1]
 * NOTE(review): this is x * conj(y) if lanes hold interleaved
 * (real, imaginary) pairs -- confirm the layout with the callers.
 */
/* 0xb1 selects elements 1,0,3,2 of each 128-bit half: swap adjacent lanes. */
141 const __m256 nswap = _mm256_permute_ps(x, 0xb1);
/* dreal / dimag: even- and odd-indexed elements of y, each duplicated. */
142 const __m256 dreal = _mm256_moveldup_ps(y);
143 const __m256 dimag = _mm256_movehdup_ps(y);
/* Sign-bit-only constant in odd lanes; XOR with it negates those lanes. */
145 const __m256 conjugator = _mm256_setr_ps(0, -0.f, 0, -0.f, 0, -0.f, 0, -0.f);
146 const __m256 dimagconj = _mm256_xor_ps(dimag, conjugator);
147 const __m256 multreal = _mm256_mul_ps(x, dreal);
148 const __m256 multimag = _mm256_mul_ps(nswap, dimagconj);
149 return _mm256_add_ps(multreal, multimag);
/*
 * Listing fragment (function header not shown): divides every adjacent lane
 * pair of `val` by sqrt(val[2k]^2 + val[2k+1]^2).
 * There is no guard for a pair that is entirely zero; such a pair divides
 * 0 by 0.
 */
154 __m256 tmp1 = _mm256_mul_ps(val, val);
/* Per 128-bit half this gives [s01, s23, s01, s23] (pairwise sums of squares). */
155 tmp1 = _mm256_hadd_ps(tmp1, tmp1);
/* Reorder to [s01, s01, s23, s23] so each lane sees the sum of its own pair. */
156 tmp1 = _mm256_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(3, 1, 2, 0));
157 tmp1 = _mm256_sqrt_ps(tmp1);
158 return _mm256_div_ps(val, tmp1);
/*
 * Listing fragment (function header not shown): returns the sums of squares
 * of adjacent lane pairs, four results from cplxValue1 in the low 128 bits
 * followed by four results from cplxValue2 in the high 128 bits.
 */
163 __m256 complex1, complex2;
/* Square every lane in place. */
164 cplxValue1 = _mm256_mul_ps(cplxValue1, cplxValue1);
165 cplxValue2 = _mm256_mul_ps(cplxValue2, cplxValue2);
/* 0x20: low half of cplxValue1 then low half of cplxValue2. */
166 complex1 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x20);
/* 0x31: high half of cplxValue1 then high half of cplxValue2. */
167 complex2 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x31);
/*
 * hadd works per 128-bit half, so the rearrangement above makes the pair sums
 * come out in their original order for each input vector.
 */
168 return _mm256_hadd_ps(complex1, complex2);
/*
 * Listing fragment (function header, the declaration of `fbits`, and most of
 * both shuffle-mask initialisers are not shown): turns the bytes of `fbits`
 * into a 256-bit float mask built from two byte shuffles.
 */
/* Zeroed 256-bit value used as the base the low half is inserted into. */
195 __m256 sign_mask_dummy = _mm256_setzero_ps();
196 const __m128i zeros = _mm_set1_epi8(0x00);
/* 0x80 is the top bit of each byte. */
197 const __m128i sign_extract = _mm_set1_epi8(0x80);
/*
 * Only the first control byte (0xff) of each shuffle mask is visible; the
 * remaining arguments are on listing lines that are not shown. In
 * _mm_shuffle_epi8 a control byte with its top bit set yields a zero byte.
 */
198 const __m128i shuffle_mask0 = _mm_setr_epi8(0xff,
214 const __m128i shuffle_mask1 = _mm_setr_epi8(0xff,
/* Bytes that are > 0 as signed values become 0xFF, all others 0x00 ... */
231 fbits = _mm_cmpgt_epi8(fbits, zeros);
/* ... and are then reduced to just the top bit (0x80 or 0x00). */
232 fbits = _mm_and_si128(fbits, sign_extract);
/* Place the per-byte flags at the positions chosen by each shuffle mask. */
233 __m128i sign_bits0 = _mm_shuffle_epi8(fbits, shuffle_mask0);
234 __m128i sign_bits1 = _mm_shuffle_epi8(fbits, shuffle_mask1);
/*
 * sign_bits0 fills the low 128 bits. The declaration receiving this
 * expression (listing line 236) is not shown; presumably it is `sign_mask`,
 * which the return statement reads.
 */
237 _mm256_insertf128_ps(sign_mask_dummy, _mm_castsi128_ps(sign_bits0), 0x0);
/* sign_bits1 fills the high 128 bits. */
238 return _mm256_insertf128_ps(sign_mask, _mm_castsi128_ps(sign_bits1), 0x1);
/*
 * Listing fragment (function header not shown): splits the 16 floats of
 * src0 followed by src1 into even-indexed elements (*llr0) and odd-indexed
 * elements (*llr1), keeping src0's elements in the low 128 bits and src1's
 * in the high 128 bits of each output.
 */
/* part0: low halves of src0 and src1; part1: their high halves. */
249 __m256 part0 = _mm256_permute2f128_ps(src0, src1, 0x20);
250 __m256 part1 = _mm256_permute2f128_ps(src0, src1, 0x31);
/* 0x88 picks elements 0 and 2 from each operand (even indices). */
251 *llr0 = _mm256_shuffle_ps(part0, part1, 0x88);
/* 0xdd picks elements 1 and 3 from each operand (odd indices). */
252 *llr1 = _mm256_shuffle_ps(part0, part1, 0xdd);
/*
 * Listing fragment (function header, the definitions of llr0/llr1, and the
 * declarations receiving the two middle expressions are not shown): combines
 * the product of the signs of llr0 and llr1 with the smaller of their
 * magnitudes, per lane.
 */
/* -0.0f has only the sign bit set in each 32-bit lane. */
257 const __m256 sign_mask = _mm256_set1_ps(-0.0f);
/* abs_mask: every bit set except the sign bit (NOT sign_mask AND all-ones). */
258 const __m256 abs_mask =
259 _mm256_andnot_ps(sign_mask, _mm256_castsi256_ps(_mm256_set1_epi8(0xff)));
/*
 * XOR of the two isolated sign bits: set when exactly one input is negative.
 * Presumably assigned to `sign` on a listing line that is not shown.
 */
266 _mm256_xor_ps(_mm256_and_ps(llr0, sign_mask), _mm256_and_ps(llr1, sign_mask));
/*
 * Minimum of the two absolute values.
 * Presumably assigned to `dst` on a listing line that is not shown.
 */
268 _mm256_min_ps(_mm256_and_ps(llr0, abs_mask), _mm256_and_ps(llr1, abs_mask));
/* dst has a clear sign bit, so OR-ing installs the combined sign. */
269 return _mm256_or_ps(dst, sign);
/*
 * Listing fragment (function header and the definitions of llr0, llr1 and
 * sign_mask are not shown): XORs llr0 with sign_mask and adds llr1.
 * NOTE(review): if sign_mask carries only sign bits, the XOR negates the
 * selected lanes of llr0 before the addition -- confirm where sign_mask is
 * built.
 */
281 llr0 = _mm256_xor_ps(llr0, sign_mask);
282 __m256 dst = _mm256_add_ps(llr0, llr1);
/*
 * Listing fragment: tail of a parameter list (the function name and return
 * type are on a listing line that is not shown) followed by the body.
 * Per lane it returns  sq_acc + rec * (aux * val - acc)^2.
 */
287 __m256 sq_acc, __m256 acc, __m256 val, __m256 rec, __m256 aux)
/* The by-value parameter aux is reused as scratch: aux = aux * val - acc. */
289 aux = _mm256_mul_ps(aux, val);
290 aux = _mm256_sub_ps(aux, acc);
/* Square the difference, then scale it by rec. */
291 aux = _mm256_mul_ps(aux, aux);
292 aux = _mm256_mul_ps(aux, rec);
293 return _mm256_add_ps(sq_acc, aux);
/*
 * Listing fragment (function header, the first assignment of `poly`, and the
 * use of the result are not shown): evaluates a polynomial in x with Horner's
 * scheme using the coefficients c0..c6. Presumably `poly` starts from c6,
 * which is declared here but not used in the visible lines -- TODO confirm.
 * NOTE(review): the function being approximated is not identifiable from
 * these lines alone; check the missing function header.
 */
306 const __m256 c0 = _mm256_set1_ps(+0x1.a8a726p+1f);
307 const __m256 c1 = _mm256_set1_ps(-0x1.0b7f7ep+2f);
308 const __m256 c2 = _mm256_set1_ps(+0x1.05d9ccp+2f);
309 const __m256 c3 = _mm256_set1_ps(-0x1.4d476cp+1f);
310 const __m256 c4 = _mm256_set1_ps(+0x1.04fc3ap+0f);
311 const __m256 c5 = _mm256_set1_ps(-0x1.c97982p-3f);
312 const __m256 c6 = _mm256_set1_ps(+0x1.57aa42p-6f);
/* Horner steps: poly = poly * x + c_k, for k = 5, 4, 3, 2, 1, 0. */
316 poly = _mm256_add_ps(_mm256_mul_ps(poly, x), c5);
317 poly = _mm256_add_ps(_mm256_mul_ps(poly, x), c4);
318 poly = _mm256_add_ps(_mm256_mul_ps(poly, x), c3);
319 poly = _mm256_add_ps(_mm256_mul_ps(poly, x), c2);
320 poly = _mm256_add_ps(_mm256_mul_ps(poly, x), c1);
321 poly = _mm256_add_ps(_mm256_mul_ps(poly, x), c0);