/* Reciprocal square root with one Newton-Raphson refinement step.
 * (Function head reconstructed; the name is an assumption.) */
static inline __m256 _mm256_rsqrt_newton_ps(const __m256 a)
{
    const __m256 HALF = _mm256_set1_ps(0.5f);
    const __m256 THREE_HALFS = _mm256_set1_ps(1.5f);

    /* Hardware estimate (~12 bits), refined once: x1 = x0*(1.5 - 0.5*a*x0*x0). */
    const __m256 x0 = _mm256_rsqrt_ps(a);
    __m256 x1 = _mm256_mul_ps(
        x0,
        _mm256_sub_ps(THREE_HALFS,
                      _mm256_mul_ps(HALF, _mm256_mul_ps(_mm256_mul_ps(x0, x0), a))));

    /* The refinement maps rsqrt(+0.0f)=+inf and rsqrt(+inf)=+0.0f to NaN
     * (0*inf); keep the raw estimate for those exact bit patterns. */
    __m256i a_si = _mm256_castps_si256(a);
    __m256i zero_mask = _mm256_cmpeq_epi32(a_si, _mm256_setzero_si256());
    __m256i inf_mask = _mm256_cmpeq_epi32(a_si, _mm256_set1_epi32(0x7F800000));
    __m256 special_mask = _mm256_castsi256_ps(_mm256_or_si256(zero_mask, inf_mask));
    return _mm256_blendv_ps(x1, x0, special_mask);
}
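
/* Illustrative sketch (added, not part of the original file): element-wise
 * 1/sqrt over eight floats using the helper above. The function and pointer
 * names are assumptions for the example. */
static inline void example_inv_sqrt8(float* out, const float* in)
{
    const __m256 v = _mm256_loadu_ps(in);
    _mm256_storeu_ps(out, _mm256_rsqrt_newton_ps(v));
}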

/* Gather the real parts of two complex-interleaved vectors (z1 holds
 * complex 0..3, z2 holds complex 4..7) back into element order 0..7. */
static inline __m256 _mm256_real(const __m256 z1, const __m256 z2)
{
    const __m256i permute_mask = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
    __m256 r = _mm256_shuffle_ps(z1, z2, _MM_SHUFFLE(2, 0, 2, 0));
    return _mm256_permutevar8x32_ps(r, permute_mask);
}

/* Gather the imaginary parts of two complex-interleaved vectors into
 * element order 0..7. */
static inline __m256 _mm256_imag(const __m256 z1, const __m256 z2)
{
    const __m256i permute_mask = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
    __m256 i = _mm256_shuffle_ps(z1, z2, _MM_SHUFFLE(3, 1, 3, 1));
    return _mm256_permutevar8x32_ps(i, permute_mask);
}
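
/* Illustrative sketch (added): splitting eight interleaved complex floats
 * (re, im pairs) into separate real and imaginary vectors with the two
 * helpers above. Names and the unaligned loads are assumptions. */
static inline void example_deinterleave8(const float* cplx, __m256* re, __m256* im)
{
    const __m256 z1 = _mm256_loadu_ps(cplx);     /* complex 0..3 */
    const __m256 z2 = _mm256_loadu_ps(cplx + 8); /* complex 4..7 */
    *re = _mm256_real(z1, z2);
    *im = _mm256_imag(z1, z2);
}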

/* Expand eight packed int8 sign flags into eight float sign masks.
 * (Function head and the elided shuffle-mask bytes reconstructed; the
 * name is an assumption.) */
static inline __m256 _mm256_polar_sign_mask_avx2(__m128i fbits)
{
    const __m128i zeros = _mm_set1_epi8(0x00);
    const __m128i sign_extract = _mm_set1_epi8(0x80);
    /* Move byte k of fbits into the top byte of 32-bit lane k (0xff = zero). */
    const __m256i shuffle_mask = _mm256_setr_epi8(
        0xff, 0xff, 0xff, 0x00, 0xff, 0xff, 0xff, 0x01, 0xff, 0xff, 0xff, 0x02,
        0xff, 0xff, 0xff, 0x03, 0xff, 0xff, 0xff, 0x04, 0xff, 0xff, 0xff, 0x05,
        0xff, 0xff, 0xff, 0x06, 0xff, 0xff, 0xff, 0x07);
    __m256i sign_bits = _mm256_setzero_si256();

    fbits = _mm_cmpgt_epi8(fbits, zeros);
    fbits = _mm_and_si128(fbits, sign_extract);
    sign_bits = _mm256_insertf128_si256(sign_bits, fbits, 0);
    sign_bits = _mm256_insertf128_si256(sign_bits, fbits, 1);
    sign_bits = _mm256_shuffle_epi8(sign_bits, shuffle_mask);

    return _mm256_castsi256_ps(sign_bits);
}
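
/* Added note: lane k of the result is 0x80000000 when byte k of fbits is
 * positive and 0x00000000 otherwise, so XOR-ing the mask with a float
 * vector flips exactly the signs selected by fbits. */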

/* |z|^2 for eight interleaved complex floats split across two vectors.
 * (First parameter and function name reconstructed.) */
static inline __m256 _mm256_magnitudesquared_ps_avx2(const __m256 cplxValue0,
                                                     const __m256 cplxValue1)
{
    const __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
    const __m256 squared0 = _mm256_mul_ps(cplxValue0, cplxValue0);
    const __m256 squared1 = _mm256_mul_ps(cplxValue1, cplxValue1);
    /* hadd interleaves the 128-bit halves; the permute restores order. */
    const __m256 complex_result = _mm256_hadd_ps(squared0, squared1);
    return _mm256_permutevar8x32_ps(complex_result, idx);
}
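
/* Illustrative sketch (added): |z|^2 for eight interleaved complex floats
 * loaded from memory. The name and unaligned loads are assumptions. */
static inline __m256 example_mag_sq8(const float* cplx)
{
    const __m256 z0 = _mm256_loadu_ps(cplx);     /* complex 0..3 */
    const __m256 z1 = _mm256_loadu_ps(cplx + 8); /* complex 4..7 */
    return _mm256_magnitudesquared_ps_avx2(z0, z1);
}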

/* Track the running maximum of |z|^2 and its index across two complex
 * input vectors. (Function head reconstructed; the name is an assumption.) */
static inline void vector_32fc_index_max_variant0(__m256 in0,
                                                  __m256 in1,
                                                  __m256* max_values,
                                                  __m256i* max_indices,
                                                  __m256i* current_indices,
                                                  __m256i indices_increment)
{
    in0 = _mm256_mul_ps(in0, in0);
    in1 = _mm256_mul_ps(in1, in1);
    /* hadd emits |z|^2 in a permuted but iteration-invariant order, so the
     * caller only needs to seed current_indices to match (see sketch below). */
    __m256 abs_squared = _mm256_hadd_ps(in0, in1);
    __m256 compare_mask = _mm256_cmp_ps(abs_squared, *max_values, _CMP_GT_OS);
    *max_values = _mm256_blendv_ps(*max_values, abs_squared, compare_mask);
    *max_indices =
        _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(*max_indices),
                                             _mm256_castsi256_ps(*current_indices),
                                             compare_mask));
    *current_indices = _mm256_add_epi32(*current_indices, indices_increment);
}
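
/* Illustrative driver sketch (added): scanning n_chunks blocks of eight
 * complex floats for the maximum |z|^2. All names, the seed of
 * current_indices (which matches the hadd output order above), and the
 * scalar reduction at the end are assumptions for the example. */
static inline unsigned example_index_max(const float* cplx, unsigned n_chunks)
{
    __m256 max_values = _mm256_setzero_ps();
    __m256i max_indices = _mm256_setzero_si256();
    __m256i current_indices = _mm256_setr_epi32(0, 1, 4, 5, 2, 3, 6, 7);
    const __m256i indices_increment = _mm256_set1_epi32(8);
    for (unsigned i = 0; i < n_chunks; i++) {
        const __m256 in0 = _mm256_loadu_ps(cplx + 16 * i);     /* complex 0..3 */
        const __m256 in1 = _mm256_loadu_ps(cplx + 16 * i + 8); /* complex 4..7 */
        vector_32fc_index_max_variant0(in0,
                                       in1,
                                       &max_values,
                                       &max_indices,
                                       &current_indices,
                                       indices_increment);
    }
    /* Reduce the eight lanes to a single winner on the scalar side. */
    float vals[8];
    int idxs[8];
    _mm256_storeu_ps(vals, max_values);
    _mm256_storeu_si256((__m256i*)idxs, max_indices);
    unsigned best = 0;
    for (int k = 1; k < 8; k++)
        if (vals[k] > vals[best])
            best = k;
    return (unsigned)idxs[best];
}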

/* Same search as the variant above, but with _mm256_max_ps for the running
 * values. (Function head reconstructed; the name is an assumption.) */
static inline void vector_32fc_index_max_variant1(__m256 in0,
                                                  __m256 in1,
                                                  __m256* max_values,
                                                  __m256i* max_indices,
                                                  __m256i* current_indices,
                                                  __m256i indices_increment)
{
    in0 = _mm256_mul_ps(in0, in0);
    in1 = _mm256_mul_ps(in1, in1);
    __m256 abs_squared = _mm256_hadd_ps(in0, in1);
    __m256 compare_mask = _mm256_cmp_ps(abs_squared, *max_values, _CMP_GT_OS);
    /* max_ps does not wait on compare_mask, shortening the critical path. */
    *max_values = _mm256_max_ps(abs_squared, *max_values);
    *max_indices =
        _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(*max_indices),
                                             _mm256_castsi256_ps(*current_indices),
                                             compare_mask));
    *current_indices = _mm256_add_epi32(*current_indices, indices_increment);
}
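
/* Added note: the two variants compute identical results for non-NaN input;
 * they differ only in instruction mix (blendv vs. max), so the faster one
 * depends on the target microarchitecture. */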

/* Blend-based minimum counterpart of the max search above.
 * (Function head reconstructed; the name is an assumption.) */
static inline void vector_32fc_index_min_variant0(__m256 in0,
                                                  __m256 in1,
                                                  __m256* min_values,
                                                  __m256i* min_indices,
                                                  __m256i* current_indices,
                                                  __m256i indices_increment)
{
    in0 = _mm256_mul_ps(in0, in0);
    in1 = _mm256_mul_ps(in1, in1);
    __m256 abs_squared = _mm256_hadd_ps(in0, in1);
    __m256 compare_mask = _mm256_cmp_ps(abs_squared, *min_values, _CMP_LT_OS);
    *min_values = _mm256_blendv_ps(*min_values, abs_squared, compare_mask);
    *min_indices =
        _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(*min_indices),
                                             _mm256_castsi256_ps(*current_indices),
                                             compare_mask));
    *current_indices = _mm256_add_epi32(*current_indices, indices_increment);
}

/* Minimum search using _mm256_min_ps for the running values.
 * (Function head reconstructed; the name is an assumption.) */
static inline void vector_32fc_index_min_variant1(__m256 in0,
                                                  __m256 in1,
                                                  __m256* min_values,
                                                  __m256i* min_indices,
                                                  __m256i* current_indices,
                                                  __m256i indices_increment)
{
    in0 = _mm256_mul_ps(in0, in0);
    in1 = _mm256_mul_ps(in1, in1);
    __m256 abs_squared = _mm256_hadd_ps(in0, in1);
    __m256 compare_mask = _mm256_cmp_ps(abs_squared, *min_values, _CMP_LT_OS);
    *min_values = _mm256_min_ps(abs_squared, *min_values);
    *min_indices =
        _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(*min_indices),
                                             _mm256_castsi256_ps(*current_indices),
                                             compare_mask));
    *current_indices = _mm256_add_epi32(*current_indices, indices_increment);
}

/* Degree-7 odd polynomial for sin(x) on a reduced argument range; the
 * coefficients are close to -1/6, 1/120, -1/5040.
 * (Function head reconstructed; the name is an assumption.) */
static inline __m256 _mm256_sin_poly_avx2(const __m256 x)
{
    const __m256 s1 = _mm256_set1_ps(-0x1.555552p-3f);
    const __m256 s2 = _mm256_set1_ps(+0x1.110be2p-7f);
    const __m256 s3 = _mm256_set1_ps(-0x1.9ab22ap-13f);

    const __m256 x2 = _mm256_mul_ps(x, x);
    const __m256 x3 = _mm256_mul_ps(x2, x);
    /* Horner evaluation: sin(x) ~= x + x^3*(s1 + x^2*(s2 + x^2*s3)). */
    __m256 poly = _mm256_add_ps(_mm256_mul_ps(x2, s3), s2);
    poly = _mm256_add_ps(_mm256_mul_ps(x2, poly), s1);
    return _mm256_add_ps(_mm256_mul_ps(x3, poly), x);
}

/* Degree-6 even polynomial for cos(x) on a reduced argument range; the
 * coefficients are close to -1/2, 1/24, -1/720.
 * (Function head reconstructed; the name is an assumption.) */
static inline __m256 _mm256_cos_poly_avx2(const __m256 x)
{
    const __m256 c1 = _mm256_set1_ps(-0x1.fffff4p-2f);
    const __m256 c2 = _mm256_set1_ps(+0x1.554a46p-5f);
    const __m256 c3 = _mm256_set1_ps(-0x1.661be2p-10f);
    const __m256 one = _mm256_set1_ps(1.0f);

    const __m256 x2 = _mm256_mul_ps(x, x);
    /* Horner evaluation: cos(x) ~= 1 + x^2*(c1 + x^2*(c2 + x^2*c3)). */
    __m256 poly = _mm256_add_ps(_mm256_mul_ps(x2, c3), c2);
    poly = _mm256_add_ps(_mm256_mul_ps(x2, poly), c1);
    return _mm256_add_ps(_mm256_mul_ps(x2, poly), one);
}
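
/* Added note: both polynomials above are low-degree fits that are only
 * accurate for small arguments; callers are expected to range-reduce first,
 * i.e. fold x into a region around zero and fix up sign/quadrant before
 * calling _mm256_sin_poly_avx2 / _mm256_cos_poly_avx2. */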

/* Degree-6 Horner chain. The coefficients are consistent with a minimax fit
 * of log2(x)/(x - 1) on [1, 2] (the polynomial evaluates to ~1/ln 2 at 1 and
 * ~1 at 2), i.e. log2(x) ~= (x - 1) * poly. (Function head, the poly = c6
 * seed, and the return were reconstructed; the name is an assumption.) */
static inline __m256 _mm256_log2_poly_avx2(const __m256 x)
{
    const __m256 c0 = _mm256_set1_ps(+0x1.a8a726p+1f);
    const __m256 c1 = _mm256_set1_ps(-0x1.0b7f7ep+2f);
    const __m256 c2 = _mm256_set1_ps(+0x1.05d9ccp+2f);
    const __m256 c3 = _mm256_set1_ps(-0x1.4d476cp+1f);
    const __m256 c4 = _mm256_set1_ps(+0x1.04fc3ap+0f);
    const __m256 c5 = _mm256_set1_ps(-0x1.c97982p-3f);
    const __m256 c6 = _mm256_set1_ps(+0x1.57aa42p-6f);

    __m256 poly = c6;
    poly = _mm256_add_ps(_mm256_mul_ps(poly, x), c5);
    poly = _mm256_add_ps(_mm256_mul_ps(poly, x), c4);
    poly = _mm256_add_ps(_mm256_mul_ps(poly, x), c3);
    poly = _mm256_add_ps(_mm256_mul_ps(poly, x), c2);
    poly = _mm256_add_ps(_mm256_mul_ps(poly, x), c1);
    poly = _mm256_add_ps(_mm256_mul_ps(poly, x), c0);
    return poly;
}
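
/* Illustrative sketch (added, and an assumption: it presumes the polynomial
 * above is a mantissa fit p with log2(m) ~= (m - 1) * p(m) on [1, 2), which
 * the coefficients suggest). Assembling log2(x) for positive finite x: */
static inline __m256 example_log2(const __m256 x)
{
    const __m256i bits = _mm256_castps_si256(x);
    /* Unbiased exponent, converted to float. */
    const __m256 exponent = _mm256_cvtepi32_ps(_mm256_sub_epi32(
        _mm256_srli_epi32(_mm256_and_si256(bits, _mm256_set1_epi32(0x7F800000)), 23),
        _mm256_set1_epi32(127)));
    /* Mantissa rebuilt into [1, 2). */
    const __m256 m = _mm256_castsi256_ps(
        _mm256_or_si256(_mm256_and_si256(bits, _mm256_set1_epi32(0x007FFFFF)),
                        _mm256_set1_epi32(0x3F800000)));
    const __m256 p = _mm256_log2_poly_avx2(m);
    return _mm256_add_ps(exponent,
                         _mm256_mul_ps(_mm256_sub_ps(m, _mm256_set1_ps(1.0f)), p));
}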