61#ifndef INCLUDED_volk_32fc_s32f_atan2_32f_a_H
62#define INCLUDED_volk_32fc_s32f_atan2_32f_a_H
// Scalar fallback fragment: per complex point, writes
// atan2f(imag, real) * (1/normalizeFactor) to the output.
// NOTE(review): this extract is lossy — the opening of this definition
// (name/first parameters) and its closing braces are not visible here;
// code lines are reproduced verbatim.
69 const float normalizeFactor,
70 unsigned int num_points)
72 float* outPtr = outputVector;
73 const float* inPtr = (
float*)inputVector;
// Hoist the reciprocal so the loop multiplies instead of divides.
74 const float invNormalizeFactor = 1.f / normalizeFactor;
76 for (
unsigned int number = 0; number < num_points; number++) {
// Interleaved complex layout: real component first, then imaginary.
77 const float real = *inPtr++;
78 const float imag = *inPtr++;
79 *outPtr++ = atan2f(imag, real) * invNormalizeFactor;
// Scalar fragment using the project's volk_atan2() helper (polynomial
// approximation) instead of libm atan2f, scaled by 1/normalizeFactor.
// NOTE(review): the opening of this definition (name/first parameters)
// and its closing braces are not visible in this extract; code lines are
// reproduced verbatim.
88 const float normalizeFactor,
89 unsigned int num_points)
91 float* outPtr = outputVector;
92 const float* inPtr = (
float*)inputVector;
// Hoist the reciprocal so the loop multiplies instead of divides.
93 const float invNormalizeFactor = 1.f / normalizeFactor;
95 for (
unsigned int number = 0; number < num_points; number++) {
// Interleaved complex layout: x = real, y = imaginary.
96 const float x = *inPtr++;
97 const float y = *inPtr++;
98 *outPtr++ =
volk_atan2(y, x) * invNormalizeFactor;
// AVX-512DQ kernel (aligned loads/stores): 16 complex points per
// iteration; each output lane is atan2(imag, real) * (1/normalizeFactor).
// NOTE(review): this extract is missing several interior lines (pointer
// advances, the z1/z2 -> x/y deinterleave, the polynomial call producing
// `result`, and the loop/tail plumbing); remaining lines are verbatim.
103#if LV_HAVE_AVX512F && LV_HAVE_AVX512DQ
104#include <immintrin.h>
106static inline void volk_32fc_s32f_atan2_32f_a_avx512dq(
float* outputVector,
108 const float normalizeFactor,
109 unsigned int num_points)
111 const float* in = (
float*)complexVector;
112 float* out = (
float*)outputVector;
// Hoist the reciprocal: per-lane scale becomes a multiply.
114 const float invNormalizeFactor = 1.f / normalizeFactor;
115 const __m512 vinvNormalizeFactor = _mm512_set1_ps(invNormalizeFactor);
// pi and pi/2 as exact single-precision hex-float constants.
116 const __m512 pi = _mm512_set1_ps(0x1.921fb6p1f);
117 const __m512 pi_2 = _mm512_set1_ps(0x1.921fb6p0f);
118 const __m512 abs_mask = _mm512_castsi512_ps(_mm512_set1_epi32(0x7FFFFFFF));
119 const __m512 sign_mask = _mm512_castsi512_ps(_mm512_set1_epi32(0x80000000));
121 unsigned int number = 0;
122 const unsigned int sixteenth_points = num_points / 16;
123 for (; number < sixteenth_points; number++) {
124 __m512 z1 = _mm512_load_ps(in);
126 __m512 z2 = _mm512_load_ps(in);
// Remember lanes where either input was already NaN, so the 0/0 repair
// below does not mask a genuine NaN input.
133 __mmask16 input_nan_mask = _mm512_cmp_ps_mask(x, x, _CMP_UNORD_Q) |
134 _mm512_cmp_ps_mask(y, y, _CMP_UNORD_Q);
137 const __m512 zero = _mm512_setzero_ps();
138 const __m512 pi_4 = _mm512_set1_ps(0x1.921fb6p-1f);
139 const __m512 three_pi_4 = _mm512_set1_ps(0x1.2d97c8p1f);
// 0x18 selects +/- infinity in _mm512_fpclass_ps_mask.
141 __mmask16 y_inf_mask = _mm512_fpclass_ps_mask(y, 0x18);
142 __mmask16 x_inf_mask = _mm512_fpclass_ps_mask(x, 0x18);
143 __mmask16 x_pos_mask = _mm512_cmp_ps_mask(x, zero, _CMP_GT_OS);
// Precompute results for the IEEE atan2 infinity special cases;
// blended in at the end via any_inf_mask.
146 __m512 inf_result = zero;
148 __mmask16 both_inf = y_inf_mask & x_inf_mask;
// both infinite: +-pi/4 when x>0, +-3pi/4 otherwise; sign taken from y.
149 __m512 both_inf_result = _mm512_mask_blend_ps(x_pos_mask, three_pi_4, pi_4);
150 both_inf_result = _mm512_or_ps(both_inf_result, _mm512_and_ps(y, sign_mask));
151 inf_result = _mm512_mask_blend_ps(both_inf, inf_result, both_inf_result);
// only y infinite: +-pi/2 with y's sign.
154 __mmask16 y_inf_only = y_inf_mask & ~x_inf_mask;
155 __m512 y_inf_result = _mm512_or_ps(pi_2, _mm512_and_ps(y, sign_mask));
156 inf_result = _mm512_mask_blend_ps(y_inf_only, inf_result, y_inf_result);
// only x infinite: +-0 when x>0, +-pi otherwise, sign from y.
159 __mmask16 x_inf_only = x_inf_mask & ~y_inf_mask;
160 __m512 x_inf_result =
161 _mm512_mask_blend_ps(x_pos_mask,
162 _mm512_or_ps(pi, _mm512_and_ps(y, sign_mask)),
163 _mm512_or_ps(zero, _mm512_and_ps(y, sign_mask)));
164 inf_result = _mm512_mask_blend_ps(x_inf_only, inf_result, x_inf_result);
166 __mmask16 any_inf_mask = y_inf_mask | x_inf_mask;
// Argument reduction: divide the smaller magnitude by the larger so the
// polynomial input stays in [-1, 1]; swap_mask records when y/x was
// replaced by x/y (corrected after the polynomial).
168 __mmask16 swap_mask = _mm512_cmp_ps_mask(
169 _mm512_and_ps(y, abs_mask), _mm512_and_ps(x, abs_mask), _CMP_GT_OS);
170 __m512 numerator = _mm512_mask_blend_ps(swap_mask, y, x);
171 __m512 denominator = _mm512_mask_blend_ps(swap_mask, x, y);
172 __m512 input = _mm512_div_ps(numerator, denominator);
// 0/0 produced a NaN that was not in the inputs: replace with the
// numerator (0) so the polynomial yields a finite result.
176 __mmask16 div_nan_mask =
177 _mm512_cmp_ps_mask(input, input, _CMP_UNORD_Q) & ~input_nan_mask;
178 input = _mm512_mask_blend_ps(div_nan_mask, input, numerator);
// Swapped lanes: atan(t) = sign(t)*pi/2 - atan(1/t).
182 _mm512_sub_ps(_mm512_or_ps(pi_2, _mm512_and_ps(input, sign_mask)), result);
183 result = _mm512_mask_blend_ps(swap_mask, result, input);
// Quadrant correction: arithmetic-shift of x's sign bit gives an
// all-ones mask for x<0 (and -0); add +-pi there, sign from y.
186 _mm512_castsi512_ps(_mm512_srai_epi32(_mm512_castps_si512(x), 31));
188 result = _mm512_add_ps(
189 _mm512_and_ps(_mm512_xor_ps(pi, _mm512_and_ps(sign_mask, y)), x_sign_mask),
// Overwrite lanes with infinite inputs with the precomputed values.
193 result = _mm512_mask_blend_ps(any_inf_mask, result, inf_result);
195 result = _mm512_mul_ps(result, vinvNormalizeFactor);
197 _mm512_store_ps(out, result);
// Tail: remaining < 16 points are handled by a scalar fallback.
201 number = sixteenth_points * 16;
203 out, complexVector + number, normalizeFactor, num_points - number);
// AVX2+FMA kernel (aligned loads/stores): 8 complex points per iteration;
// each output lane is atan2(imag, real) * (1/normalizeFactor), computed
// via argument reduction + polynomial with explicit IEEE special-case
// handling for NaN and infinite inputs.
// NOTE(review): this extract is missing interior lines (pointer advances,
// the z1/z2 -> x/y deinterleave, the polynomial call producing `result`,
// and loop/tail plumbing); remaining lines are verbatim.
207#if LV_HAVE_AVX2 && LV_HAVE_FMA
208#include <immintrin.h>
210static inline void volk_32fc_s32f_atan2_32f_a_avx2_fma(
float* outputVector,
212 const float normalizeFactor,
213 unsigned int num_points)
215 const float* in = (
float*)complexVector;
216 float* out = (
float*)outputVector;
// Hoist the reciprocal: per-lane scale becomes a multiply.
218 const float invNormalizeFactor = 1.f / normalizeFactor;
219 const __m256 vinvNormalizeFactor = _mm256_set1_ps(invNormalizeFactor);
220 const __m256 pi = _mm256_set1_ps(0x1.921fb6p1f);
221 const __m256 pi_2 = _mm256_set1_ps(0x1.921fb6p0f);
222 const __m256 abs_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF));
223 const __m256 sign_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000));
225 unsigned int number = 0;
226 const unsigned int eighth_points = num_points / 8;
227 for (; number < eighth_points; number++) {
228 __m256 z1 = _mm256_load_ps(in);
230 __m256 z2 = _mm256_load_ps(in);
// Remember lanes where either input was already NaN, so the 0/0 repair
// below does not mask a genuine NaN input.
237 __m256 input_nan_mask = _mm256_or_ps(_mm256_cmp_ps(x, x, _CMP_UNORD_Q),
238 _mm256_cmp_ps(y, y, _CMP_UNORD_Q));
241 const __m256 zero = _mm256_setzero_ps();
242 const __m256 inf = _mm256_set1_ps(HUGE_VALF);
243 const __m256 pi_4 = _mm256_set1_ps(0x1.921fb6p-1f);
244 const __m256 three_pi_4 = _mm256_set1_ps(0x1.2d97c8p1f);
// Infinity detection via |v| == +inf (no fpclass below AVX-512).
246 __m256 y_abs = _mm256_and_ps(y, abs_mask);
247 __m256 x_abs = _mm256_and_ps(x, abs_mask);
248 __m256 y_inf_mask = _mm256_cmp_ps(y_abs, inf, _CMP_EQ_OQ);
249 __m256 x_inf_mask = _mm256_cmp_ps(x_abs, inf, _CMP_EQ_OQ);
250 __m256 x_pos_mask = _mm256_cmp_ps(x, zero, _CMP_GT_OS);
// Precomputed IEEE atan2 results for infinite inputs, blended at the end.
253 __m256 inf_result = zero;
// both infinite: +-pi/4 when x>0, +-3pi/4 otherwise; sign from y.
255 __m256 both_inf = _mm256_and_ps(y_inf_mask, x_inf_mask);
256 __m256 both_inf_result = _mm256_blendv_ps(three_pi_4, pi_4, x_pos_mask);
257 both_inf_result = _mm256_or_ps(both_inf_result, _mm256_and_ps(y, sign_mask));
258 inf_result = _mm256_blendv_ps(inf_result, both_inf_result, both_inf);
// only y infinite: +-pi/2 with y's sign.
261 __m256 y_inf_only = _mm256_andnot_ps(x_inf_mask, y_inf_mask);
262 __m256 y_inf_result = _mm256_or_ps(pi_2, _mm256_and_ps(y, sign_mask));
263 inf_result = _mm256_blendv_ps(inf_result, y_inf_result, y_inf_only);
// only x infinite: +-0 when x>0, +-pi otherwise, sign from y.
266 __m256 x_inf_only = _mm256_andnot_ps(y_inf_mask, x_inf_mask);
267 __m256 x_inf_result =
268 _mm256_blendv_ps(_mm256_or_ps(pi, _mm256_and_ps(y, sign_mask)),
269 _mm256_or_ps(zero, _mm256_and_ps(y, sign_mask)),
271 inf_result = _mm256_blendv_ps(inf_result, x_inf_result, x_inf_only);
273 __m256 any_inf_mask = _mm256_or_ps(y_inf_mask, x_inf_mask);
// Argument reduction: smaller/larger magnitude keeps |input| <= 1.
275 __m256 swap_mask = _mm256_cmp_ps(
276 _mm256_and_ps(y, abs_mask), _mm256_and_ps(x, abs_mask), _CMP_GT_OS);
277 __m256 numerator = _mm256_blendv_ps(y, x, swap_mask);
278 __m256 denominator = _mm256_blendv_ps(x, y, swap_mask);
279 __m256 input = _mm256_div_ps(numerator, denominator);
// 0/0 produced a NaN not present in the inputs: substitute the
// numerator (0) so the polynomial yields a finite result.
283 __m256 div_nan_mask =
284 _mm256_andnot_ps(input_nan_mask, _mm256_cmp_ps(input, input, _CMP_UNORD_Q));
285 input = _mm256_blendv_ps(input, numerator, div_nan_mask);
// Swapped lanes: atan(t) = sign(t)*pi/2 - atan(1/t).
289 _mm256_sub_ps(_mm256_or_ps(pi_2, _mm256_and_ps(input, sign_mask)), result);
290 result = _mm256_blendv_ps(result, input, swap_mask);
// Quadrant correction: arithmetic shift of x's sign bit -> all-ones
// mask for x<0 (and -0); add +-pi there, sign from y.
293 _mm256_castsi256_ps(_mm256_srai_epi32(_mm256_castps_si256(x), 31));
295 result = _mm256_add_ps(
296 _mm256_and_ps(_mm256_xor_ps(pi, _mm256_and_ps(sign_mask, y)), x_sign_mask),
// Overwrite lanes with infinite inputs with the precomputed values.
300 result = _mm256_blendv_ps(result, inf_result, any_inf_mask);
302 result = _mm256_mul_ps(result, vinvNormalizeFactor);
304 _mm256_store_ps(out, result);
// Tail: remaining < 8 points go through a scalar fallback.
308 number = eighth_points * 8;
310 out, (
lv_32fc_t*)in, normalizeFactor, num_points - number);
// AVX2 kernel (aligned loads; no FMA requirement): 8 complex points per
// iteration; each output lane is atan2(imag, real) * (1/normalizeFactor).
// Same special-case strategy as the other SIMD variants in this file:
// record input NaNs, precompute infinity results, reduce the argument so
// |input| <= 1, evaluate a polynomial, then fix up quadrant and sign.
// NOTE(review): this extract is missing interior lines (pointer advances,
// the z1/z2 -> x/y deinterleave, the polynomial call producing `result`,
// and loop/tail plumbing); remaining lines are verbatim.
315#include <immintrin.h>
317static inline void volk_32fc_s32f_atan2_32f_a_avx2(
float* outputVector,
319 const float normalizeFactor,
320 unsigned int num_points)
322 const float* in = (
float*)complexVector;
323 float* out = (
float*)outputVector;
// Hoist the reciprocal: per-lane scale becomes a multiply.
325 const float invNormalizeFactor = 1.f / normalizeFactor;
326 const __m256 vinvNormalizeFactor = _mm256_set1_ps(invNormalizeFactor);
327 const __m256 pi = _mm256_set1_ps(0x1.921fb6p1f);
328 const __m256 pi_2 = _mm256_set1_ps(0x1.921fb6p0f);
329 const __m256 abs_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF));
330 const __m256 sign_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000));
332 unsigned int number = 0;
333 const unsigned int eighth_points = num_points / 8;
334 for (; number < eighth_points; number++) {
335 __m256 z1 = _mm256_load_ps(in);
337 __m256 z2 = _mm256_load_ps(in);
// Lanes where either input was already NaN (protects the 0/0 repair).
344 __m256 input_nan_mask = _mm256_or_ps(_mm256_cmp_ps(x, x, _CMP_UNORD_Q),
345 _mm256_cmp_ps(y, y, _CMP_UNORD_Q));
348 const __m256 zero = _mm256_setzero_ps();
349 const __m256 inf = _mm256_set1_ps(HUGE_VALF);
350 const __m256 pi_4 = _mm256_set1_ps(0x1.921fb6p-1f);
351 const __m256 three_pi_4 = _mm256_set1_ps(0x1.2d97c8p1f);
// Infinity detection via |v| == +inf.
353 __m256 y_abs = _mm256_and_ps(y, abs_mask);
354 __m256 x_abs = _mm256_and_ps(x, abs_mask);
355 __m256 y_inf_mask = _mm256_cmp_ps(y_abs, inf, _CMP_EQ_OQ);
356 __m256 x_inf_mask = _mm256_cmp_ps(x_abs, inf, _CMP_EQ_OQ);
357 __m256 x_pos_mask = _mm256_cmp_ps(x, zero, _CMP_GT_OS);
// Precomputed IEEE atan2 results for infinite inputs, blended at the end.
360 __m256 inf_result = zero;
// both infinite: +-pi/4 when x>0, +-3pi/4 otherwise; sign from y.
362 __m256 both_inf = _mm256_and_ps(y_inf_mask, x_inf_mask);
363 __m256 both_inf_result = _mm256_blendv_ps(three_pi_4, pi_4, x_pos_mask);
364 both_inf_result = _mm256_or_ps(both_inf_result, _mm256_and_ps(y, sign_mask));
365 inf_result = _mm256_blendv_ps(inf_result, both_inf_result, both_inf);
// only y infinite: +-pi/2 with y's sign.
368 __m256 y_inf_only = _mm256_andnot_ps(x_inf_mask, y_inf_mask);
369 __m256 y_inf_result = _mm256_or_ps(pi_2, _mm256_and_ps(y, sign_mask));
370 inf_result = _mm256_blendv_ps(inf_result, y_inf_result, y_inf_only);
// only x infinite: +-0 when x>0, +-pi otherwise, sign from y.
373 __m256 x_inf_only = _mm256_andnot_ps(y_inf_mask, x_inf_mask);
374 __m256 x_inf_result =
375 _mm256_blendv_ps(_mm256_or_ps(pi, _mm256_and_ps(y, sign_mask)),
376 _mm256_or_ps(zero, _mm256_and_ps(y, sign_mask)),
378 inf_result = _mm256_blendv_ps(inf_result, x_inf_result, x_inf_only);
380 __m256 any_inf_mask = _mm256_or_ps(y_inf_mask, x_inf_mask);
// Argument reduction: smaller/larger magnitude keeps |input| <= 1.
382 __m256 swap_mask = _mm256_cmp_ps(
383 _mm256_and_ps(y, abs_mask), _mm256_and_ps(x, abs_mask), _CMP_GT_OS);
384 __m256 numerator = _mm256_blendv_ps(y, x, swap_mask);
385 __m256 denominator = _mm256_blendv_ps(x, y, swap_mask);
386 __m256 input = _mm256_div_ps(numerator, denominator);
// 0/0 produced a NaN not present in the inputs: substitute the
// numerator (0) so the polynomial yields a finite result.
390 __m256 div_nan_mask =
391 _mm256_andnot_ps(input_nan_mask, _mm256_cmp_ps(input, input, _CMP_UNORD_Q));
392 input = _mm256_blendv_ps(input, numerator, div_nan_mask);
// Swapped lanes: atan(t) = sign(t)*pi/2 - atan(1/t).
396 _mm256_sub_ps(_mm256_or_ps(pi_2, _mm256_and_ps(input, sign_mask)), result);
397 result = _mm256_blendv_ps(result, input, swap_mask);
// Quadrant correction mask: all-ones where x's sign bit is set.
400 _mm256_castsi256_ps(_mm256_srai_epi32(_mm256_castps_si256(x), 31));
402 result = _mm256_add_ps(
403 _mm256_and_ps(_mm256_xor_ps(pi, _mm256_and_ps(sign_mask, y)), x_sign_mask),
// Overwrite lanes with infinite inputs with the precomputed values.
407 result = _mm256_blendv_ps(result, inf_result, any_inf_mask);
409 result = _mm256_mul_ps(result, vinvNormalizeFactor);
411 _mm256_store_ps(out, result);
// Tail: remaining < 8 points go through a scalar fallback.
415 number = eighth_points * 8;
417 out, (
lv_32fc_t*)in, normalizeFactor, num_points - number);
// NEON kernel fragment: 4 complex points per iteration via vld2q_f32
// deinterleaving; per lane atan2(imag, real) * (1/normalizeFactor) with
// explicit NaN / infinity special-case handling.
// NOTE(review): this extract is lossy — the function's opening (name and
// first parameters) and several interior lines (e.g. the polynomial call
// producing `result`, the `term`/`pi_adj` declarations, pointer advances,
// and loop/tail plumbing) are missing; remaining lines are verbatim.
426 const float normalizeFactor,
427 unsigned int num_points)
429 const float* in = (
float*)complexVector;
430 float* out = outputVector;
// Hoist the reciprocal: per-lane scale becomes a multiply.
432 const float invNormalizeFactor = 1.f / normalizeFactor;
433 const float32x4_t vInvNorm = vdupq_n_f32(invNormalizeFactor);
434 const float32x4_t pi = vdupq_n_f32(0x1.921fb6p1f);
435 const float32x4_t pi_2 = vdupq_n_f32(0x1.921fb6p0f);
436 const float32x4_t pi_4 = vdupq_n_f32(0x1.921fb6p-1f);
437 const float32x4_t three_pi_4 = vdupq_n_f32(0x1.2d97c8p1f);
438 const float32x4_t zero = vdupq_n_f32(0.f);
439 const float32x4_t inf = vdupq_n_f32(__builtin_huge_valf());
440 const uint32x4_t sign_mask = vdupq_n_u32(0x80000000);
442 const unsigned int quarter_points = num_points / 4;
444 for (
unsigned int number = 0; number < quarter_points; number++) {
// vld2q deinterleaves: val[0] = reals (x), val[1] = imaginaries (y).
445 float32x4x2_t z = vld2q_f32(in);
448 float32x4_t x = z.val[0];
449 float32x4_t y = z.val[1];
451 float32x4_t abs_x = vabsq_f32(x);
452 float32x4_t abs_y = vabsq_f32(y);
// Lanes where either input was already NaN (v != v).
455 uint32x4_t input_nan =
456 vorrq_u32(vmvnq_u32(vceqq_f32(x, x)), vmvnq_u32(vceqq_f32(y, y)));
// Infinity special-case masks (IEEE atan2 semantics).
459 uint32x4_t y_inf = vceqq_f32(abs_y, inf);
460 uint32x4_t x_inf = vceqq_f32(abs_x, inf);
461 uint32x4_t x_pos = vcgtq_f32(x, zero);
462 uint32x4_t both_inf = vandq_u32(y_inf, x_inf);
463 uint32x4_t y_inf_only = vbicq_u32(y_inf, x_inf);
464 uint32x4_t x_inf_only = vbicq_u32(x_inf, y_inf);
465 uint32x4_t any_inf = vorrq_u32(y_inf, x_inf);
467 float32x4_t inf_result = zero;
// both infinite: +-pi/4 when x>0, +-3pi/4 otherwise; sign from y.
469 float32x4_t both_inf_r = vbslq_f32(x_pos, pi_4, three_pi_4);
470 both_inf_r = vreinterpretq_f32_u32(
471 vorrq_u32(vreinterpretq_u32_f32(both_inf_r),
472 vandq_u32(vreinterpretq_u32_f32(y), sign_mask)));
473 inf_result = vbslq_f32(both_inf, both_inf_r, inf_result);
// only y infinite: +-pi/2 with y's sign.
475 float32x4_t y_inf_r = vreinterpretq_f32_u32(vorrq_u32(
476 vreinterpretq_u32_f32(pi_2), vandq_u32(vreinterpretq_u32_f32(y), sign_mask)));
477 inf_result = vbslq_f32(y_inf_only, y_inf_r, inf_result);
// only x infinite: +-0 when x>0, +-pi otherwise, sign from y.
479 float32x4_t x_inf_r = vbslq_f32(
481 vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(y), sign_mask)),
482 vreinterpretq_f32_u32(
483 vorrq_u32(vreinterpretq_u32_f32(pi),
484 vandq_u32(vreinterpretq_u32_f32(y), sign_mask))));
485 inf_result = vbslq_f32(x_inf_only, x_inf_r, inf_result);
// Argument reduction: smaller/larger magnitude keeps |input| <= 1;
// division is done as num * reciprocal via the _vinvq_f32 helper.
488 uint32x4_t swap_mask = vcgtq_f32(abs_y, abs_x);
489 float32x4_t num = vbslq_f32(swap_mask, x, y);
490 float32x4_t den = vbslq_f32(swap_mask, y, x);
491 float32x4_t input = vmulq_f32(num,
_vinvq_f32(den));
// 0/0 produced a NaN not present in the inputs: substitute num (0).
494 uint32x4_t div_nan = vbicq_u32(vmvnq_u32(vceqq_f32(input, input)), input_nan);
495 input = vbslq_f32(div_nan, num, input);
// Swapped lanes: atan(t) = sign(t)*pi/2 - atan(1/t).
498 uint32x4_t input_sign = vandq_u32(vreinterpretq_u32_f32(input), sign_mask);
500 vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(pi_2), input_sign));
501 term = vsubq_f32(term, result);
502 result = vbslq_f32(swap_mask, term, result);
// Quadrant correction: add +-pi (sign from y) where x < 0.
504 uint32x4_t x_neg = vcltq_f32(x, zero);
505 uint32x4_t y_sign = vandq_u32(vreinterpretq_u32_f32(y), sign_mask);
507 vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(pi), y_sign));
508 result = vbslq_f32(x_neg, vaddq_f32(result, pi_adj), result);
// Overwrite infinite-input lanes, then apply the normalization scale.
511 result = vbslq_f32(any_inf, inf_result, result);
512 result = vmulq_f32(result, vInvNorm);
514 vst1q_f32(out, result);
// Tail: remaining < 4 points go through a scalar fallback.
518 unsigned int number = quarter_points * 4;
520 out, complexVector + number, normalizeFactor, num_points - number);
// NEONv8 kernel: processes 8 complex points per iteration as two
// independent 4-lane halves (z0/z1) to expose instruction-level
// parallelism; per lane atan2(imag, real) * (1/normalizeFactor) using the
// _varctan_poly_neonv8 polynomial helper plus explicit NaN / infinity
// special-case handling.
// NOTE(review): this extract is lossy — some interior lines (the
// `term0`/`term1` declarations, pointer advances, loop close, and the
// tail-call opening) are missing; remaining lines are verbatim.
528static inline void volk_32fc_s32f_atan2_32f_neonv8(
float* outputVector,
530 const float normalizeFactor,
531 unsigned int num_points)
533 const float* in = (
float*)complexVector;
534 float* out = outputVector;
// Hoist the reciprocal: per-lane scale becomes a multiply.
536 const float invNormalizeFactor = 1.f / normalizeFactor;
537 const float32x4_t vInvNorm = vdupq_n_f32(invNormalizeFactor);
538 const float32x4_t pi = vdupq_n_f32(0x1.921fb6p1f);
539 const float32x4_t pi_2 = vdupq_n_f32(0x1.921fb6p0f);
540 const float32x4_t pi_4 = vdupq_n_f32(0x1.921fb6p-1f);
541 const float32x4_t three_pi_4 = vdupq_n_f32(0x1.2d97c8p1f);
542 const float32x4_t zero = vdupq_n_f32(0.f);
543 const float32x4_t inf = vdupq_n_f32(__builtin_huge_valf());
544 const uint32x4_t sign_mask = vdupq_n_u32(0x80000000);
546 const unsigned int eighth_points = num_points / 8;
548 for (
unsigned int number = 0; number < eighth_points; number++) {
// Two deinterleaving loads: val[0] = reals, val[1] = imaginaries.
550 float32x4x2_t z0 = vld2q_f32(in);
551 float32x4x2_t z1 = vld2q_f32(in + 8);
554 float32x4_t x0 = z0.val[0], y0 = z0.val[1];
555 float32x4_t x1 = z1.val[0], y1 = z1.val[1];
// ---- first half (lanes 0-3) ----
558 float32x4_t abs_x0 = vabsq_f32(x0);
559 float32x4_t abs_y0 = vabsq_f32(y0);
// Lanes where either input was already NaN (v != v).
562 uint32x4_t input_nan0 =
563 vorrq_u32(vmvnq_u32(vceqq_f32(x0, x0)), vmvnq_u32(vceqq_f32(y0, y0)));
// Infinity special-case masks (IEEE atan2 semantics).
566 uint32x4_t y_inf0 = vceqq_f32(abs_y0, inf);
567 uint32x4_t x_inf0 = vceqq_f32(abs_x0, inf);
568 uint32x4_t x_pos0 = vcgtq_f32(x0, zero);
569 uint32x4_t both_inf0 = vandq_u32(y_inf0, x_inf0);
570 uint32x4_t y_inf_only0 = vbicq_u32(y_inf0, x_inf0);
571 uint32x4_t x_inf_only0 = vbicq_u32(x_inf0, y_inf0);
572 uint32x4_t any_inf0 = vorrq_u32(y_inf0, x_inf0);
574 float32x4_t inf_result0 = zero;
// both infinite: +-pi/4 when x>0, +-3pi/4 otherwise; sign from y.
575 float32x4_t both_inf_r0 = vbslq_f32(x_pos0, pi_4, three_pi_4);
576 both_inf_r0 = vreinterpretq_f32_u32(
577 vorrq_u32(vreinterpretq_u32_f32(both_inf_r0),
578 vandq_u32(vreinterpretq_u32_f32(y0), sign_mask)));
579 inf_result0 = vbslq_f32(both_inf0, both_inf_r0, inf_result0);
// only y infinite: +-pi/2 with y's sign.
580 float32x4_t y_inf_r0 = vreinterpretq_f32_u32(
581 vorrq_u32(vreinterpretq_u32_f32(pi_2),
582 vandq_u32(vreinterpretq_u32_f32(y0), sign_mask)));
583 inf_result0 = vbslq_f32(y_inf_only0, y_inf_r0, inf_result0);
// only x infinite: +-0 when x>0, +-pi otherwise, sign from y.
584 float32x4_t x_inf_r0 = vbslq_f32(
586 vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(y0), sign_mask)),
587 vreinterpretq_f32_u32(
588 vorrq_u32(vreinterpretq_u32_f32(pi),
589 vandq_u32(vreinterpretq_u32_f32(y0), sign_mask))));
590 inf_result0 = vbslq_f32(x_inf_only0, x_inf_r0, inf_result0);
// Argument reduction: smaller/larger magnitude keeps |input0| <= 1.
593 uint32x4_t swap_mask0 = vcgtq_f32(abs_y0, abs_x0);
594 float32x4_t num0 = vbslq_f32(swap_mask0, x0, y0);
595 float32x4_t den0 = vbslq_f32(swap_mask0, y0, x0);
596 float32x4_t input0 = vmulq_f32(num0,
_vinvq_f32(den0));
// 0/0 produced a NaN not present in the inputs: substitute num0 (0).
598 uint32x4_t div_nan0 = vbicq_u32(vmvnq_u32(vceqq_f32(input0, input0)), input_nan0);
599 input0 = vbslq_f32(div_nan0, num0, input0);
// Polynomial arctan, then undo the swap: atan(t) = sgn*pi/2 - atan(1/t).
601 float32x4_t result0 = _varctan_poly_neonv8(input0);
602 uint32x4_t input_sign0 = vandq_u32(vreinterpretq_u32_f32(input0), sign_mask);
604 vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(pi_2), input_sign0));
605 term0 = vsubq_f32(term0, result0);
606 result0 = vbslq_f32(swap_mask0, term0, result0);
// Quadrant correction: add +-pi (sign from y) where x < 0.
608 uint32x4_t x_neg0 = vcltq_f32(x0, zero);
609 uint32x4_t y_sign0 = vandq_u32(vreinterpretq_u32_f32(y0), sign_mask);
610 float32x4_t pi_adj0 =
611 vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(pi), y_sign0));
612 result0 = vbslq_f32(x_neg0, vaddq_f32(result0, pi_adj0), result0);
// Overwrite infinite-input lanes, then apply the normalization scale.
614 result0 = vbslq_f32(any_inf0, inf_result0, result0);
615 result0 = vmulq_f32(result0, vInvNorm);
// ---- second half (lanes 4-7), identical steps on x1/y1 ----
618 float32x4_t abs_x1 = vabsq_f32(x1);
619 float32x4_t abs_y1 = vabsq_f32(y1);
621 uint32x4_t input_nan1 =
622 vorrq_u32(vmvnq_u32(vceqq_f32(x1, x1)), vmvnq_u32(vceqq_f32(y1, y1)));
624 uint32x4_t y_inf1 = vceqq_f32(abs_y1, inf);
625 uint32x4_t x_inf1 = vceqq_f32(abs_x1, inf);
626 uint32x4_t x_pos1 = vcgtq_f32(x1, zero);
627 uint32x4_t both_inf1 = vandq_u32(y_inf1, x_inf1);
628 uint32x4_t y_inf_only1 = vbicq_u32(y_inf1, x_inf1);
629 uint32x4_t x_inf_only1 = vbicq_u32(x_inf1, y_inf1);
630 uint32x4_t any_inf1 = vorrq_u32(y_inf1, x_inf1);
632 float32x4_t inf_result1 = zero;
633 float32x4_t both_inf_r1 = vbslq_f32(x_pos1, pi_4, three_pi_4);
634 both_inf_r1 = vreinterpretq_f32_u32(
635 vorrq_u32(vreinterpretq_u32_f32(both_inf_r1),
636 vandq_u32(vreinterpretq_u32_f32(y1), sign_mask)));
637 inf_result1 = vbslq_f32(both_inf1, both_inf_r1, inf_result1);
638 float32x4_t y_inf_r1 = vreinterpretq_f32_u32(
639 vorrq_u32(vreinterpretq_u32_f32(pi_2),
640 vandq_u32(vreinterpretq_u32_f32(y1), sign_mask)));
641 inf_result1 = vbslq_f32(y_inf_only1, y_inf_r1, inf_result1);
642 float32x4_t x_inf_r1 = vbslq_f32(
644 vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(y1), sign_mask)),
645 vreinterpretq_f32_u32(
646 vorrq_u32(vreinterpretq_u32_f32(pi),
647 vandq_u32(vreinterpretq_u32_f32(y1), sign_mask))));
648 inf_result1 = vbslq_f32(x_inf_only1, x_inf_r1, inf_result1);
650 uint32x4_t swap_mask1 = vcgtq_f32(abs_y1, abs_x1);
651 float32x4_t num1 = vbslq_f32(swap_mask1, x1, y1);
652 float32x4_t den1 = vbslq_f32(swap_mask1, y1, x1);
653 float32x4_t input1 = vmulq_f32(num1,
_vinvq_f32(den1));
655 uint32x4_t div_nan1 = vbicq_u32(vmvnq_u32(vceqq_f32(input1, input1)), input_nan1);
656 input1 = vbslq_f32(div_nan1, num1, input1);
658 float32x4_t result1 = _varctan_poly_neonv8(input1);
659 uint32x4_t input_sign1 = vandq_u32(vreinterpretq_u32_f32(input1), sign_mask);
661 vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(pi_2), input_sign1));
662 term1 = vsubq_f32(term1, result1);
663 result1 = vbslq_f32(swap_mask1, term1, result1);
665 uint32x4_t x_neg1 = vcltq_f32(x1, zero);
666 uint32x4_t y_sign1 = vandq_u32(vreinterpretq_u32_f32(y1), sign_mask);
667 float32x4_t pi_adj1 =
668 vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(pi), y_sign1));
669 result1 = vbslq_f32(x_neg1, vaddq_f32(result1, pi_adj1), result1);
671 result1 = vbslq_f32(any_inf1, inf_result1, result1);
672 result1 = vmulq_f32(result1, vInvNorm);
// Store both halves contiguously.
674 vst1q_f32(out, result0);
675 vst1q_f32(out + 4, result1);
// Tail: remaining < 8 points go through a scalar fallback.
679 unsigned int number = eighth_points * 8;
681 out, complexVector + number, normalizeFactor, num_points - number);
687#ifndef INCLUDED_volk_32fc_s32f_atan2_32f_u_H
688#define INCLUDED_volk_32fc_s32f_atan2_32f_u_H
// AVX-512DQ kernel, unaligned variant: identical algorithm to the aligned
// AVX-512DQ kernel but using loadu/storeu so the buffers need no 64-byte
// alignment. 16 complex points per iteration; each output lane is
// atan2(imag, real) * (1/normalizeFactor).
// NOTE(review): this extract is missing interior lines (pointer advances,
// the z1/z2 -> x/y deinterleave, the polynomial call producing `result`,
// and loop/tail plumbing); remaining lines are verbatim.
690#if LV_HAVE_AVX512F && LV_HAVE_AVX512DQ
691#include <immintrin.h>
693static inline void volk_32fc_s32f_atan2_32f_u_avx512dq(
float* outputVector,
695 const float normalizeFactor,
696 unsigned int num_points)
698 const float* in = (
float*)complexVector;
699 float* out = (
float*)outputVector;
// Hoist the reciprocal: per-lane scale becomes a multiply.
701 const float invNormalizeFactor = 1.f / normalizeFactor;
702 const __m512 vinvNormalizeFactor = _mm512_set1_ps(invNormalizeFactor);
703 const __m512 pi = _mm512_set1_ps(0x1.921fb6p1f);
704 const __m512 pi_2 = _mm512_set1_ps(0x1.921fb6p0f);
705 const __m512 abs_mask = _mm512_castsi512_ps(_mm512_set1_epi32(0x7FFFFFFF));
706 const __m512 sign_mask = _mm512_castsi512_ps(_mm512_set1_epi32(0x80000000));
708 const unsigned int sixteenth_points = num_points / 16;
710 for (
unsigned int number = 0; number < sixteenth_points; number++) {
711 __m512 z1 = _mm512_loadu_ps(in);
713 __m512 z2 = _mm512_loadu_ps(in);
// Lanes where either input was already NaN (protects the 0/0 repair).
720 __mmask16 input_nan_mask = _mm512_cmp_ps_mask(x, x, _CMP_UNORD_Q) |
721 _mm512_cmp_ps_mask(y, y, _CMP_UNORD_Q);
724 const __m512 zero = _mm512_setzero_ps();
725 const __m512 pi_4 = _mm512_set1_ps(0x1.921fb6p-1f);
726 const __m512 three_pi_4 = _mm512_set1_ps(0x1.2d97c8p1f);
// 0x18 selects +/- infinity in _mm512_fpclass_ps_mask.
728 __mmask16 y_inf_mask = _mm512_fpclass_ps_mask(y, 0x18);
729 __mmask16 x_inf_mask = _mm512_fpclass_ps_mask(x, 0x18);
730 __mmask16 x_pos_mask = _mm512_cmp_ps_mask(x, zero, _CMP_GT_OS);
// Precomputed IEEE atan2 results for infinite inputs, blended at the end.
733 __m512 inf_result = zero;
// both infinite: +-pi/4 when x>0, +-3pi/4 otherwise; sign from y.
735 __mmask16 both_inf = y_inf_mask & x_inf_mask;
736 __m512 both_inf_result = _mm512_mask_blend_ps(x_pos_mask, three_pi_4, pi_4);
737 both_inf_result = _mm512_or_ps(both_inf_result, _mm512_and_ps(y, sign_mask));
738 inf_result = _mm512_mask_blend_ps(both_inf, inf_result, both_inf_result);
// only y infinite: +-pi/2 with y's sign.
741 __mmask16 y_inf_only = y_inf_mask & ~x_inf_mask;
742 __m512 y_inf_result = _mm512_or_ps(pi_2, _mm512_and_ps(y, sign_mask));
743 inf_result = _mm512_mask_blend_ps(y_inf_only, inf_result, y_inf_result);
// only x infinite: +-0 when x>0, +-pi otherwise, sign from y.
746 __mmask16 x_inf_only = x_inf_mask & ~y_inf_mask;
747 __m512 x_inf_result =
748 _mm512_mask_blend_ps(x_pos_mask,
749 _mm512_or_ps(pi, _mm512_and_ps(y, sign_mask)),
750 _mm512_or_ps(zero, _mm512_and_ps(y, sign_mask)));
751 inf_result = _mm512_mask_blend_ps(x_inf_only, inf_result, x_inf_result);
753 __mmask16 any_inf_mask = y_inf_mask | x_inf_mask;
// Argument reduction: smaller/larger magnitude keeps |input| <= 1.
755 __mmask16 swap_mask = _mm512_cmp_ps_mask(
756 _mm512_and_ps(y, abs_mask), _mm512_and_ps(x, abs_mask), _CMP_GT_OS);
757 __m512 numerator = _mm512_mask_blend_ps(swap_mask, y, x);
758 __m512 denominator = _mm512_mask_blend_ps(swap_mask, x, y);
759 __m512 input = _mm512_div_ps(numerator, denominator);
// 0/0 produced a NaN not present in the inputs: substitute the
// numerator (0) so the polynomial yields a finite result.
763 __mmask16 div_nan_mask =
764 _mm512_cmp_ps_mask(input, input, _CMP_UNORD_Q) & ~input_nan_mask;
765 input = _mm512_mask_blend_ps(div_nan_mask, input, numerator);
// Swapped lanes: atan(t) = sign(t)*pi/2 - atan(1/t).
769 _mm512_sub_ps(_mm512_or_ps(pi_2, _mm512_and_ps(input, sign_mask)), result);
770 result = _mm512_mask_blend_ps(swap_mask, result, input);
// Quadrant correction mask: all-ones where x's sign bit is set.
773 _mm512_castsi512_ps(_mm512_srai_epi32(_mm512_castps_si512(x), 31));
775 result = _mm512_add_ps(
776 _mm512_and_ps(_mm512_xor_ps(pi, _mm512_and_ps(sign_mask, y)), x_sign_mask),
// Overwrite lanes with infinite inputs with the precomputed values.
780 result = _mm512_mask_blend_ps(any_inf_mask, result, inf_result);
782 result = _mm512_mul_ps(result, vinvNormalizeFactor);
784 _mm512_storeu_ps(out, result);
// Tail: remaining < 16 points go through a scalar fallback.
788 unsigned int number = sixteenth_points * 16;
790 out, complexVector + number, normalizeFactor, num_points - number);
// AVX2+FMA kernel, unaligned variant: identical algorithm to the aligned
// AVX2+FMA kernel but using loadu/storeu. 8 complex points per iteration;
// each output lane is atan2(imag, real) * (1/normalizeFactor).
// NOTE(review): this extract is missing interior lines (pointer advances,
// the z1/z2 -> x/y deinterleave, the polynomial call producing `result`,
// and loop/tail plumbing); remaining lines are verbatim.
794#if LV_HAVE_AVX2 && LV_HAVE_FMA
795#include <immintrin.h>
797static inline void volk_32fc_s32f_atan2_32f_u_avx2_fma(
float* outputVector,
799 const float normalizeFactor,
800 unsigned int num_points)
802 const float* in = (
float*)complexVector;
803 float* out = (
float*)outputVector;
// Hoist the reciprocal: per-lane scale becomes a multiply.
805 const float invNormalizeFactor = 1.f / normalizeFactor;
806 const __m256 vinvNormalizeFactor = _mm256_set1_ps(invNormalizeFactor);
807 const __m256 pi = _mm256_set1_ps(0x1.921fb6p1f);
808 const __m256 pi_2 = _mm256_set1_ps(0x1.921fb6p0f);
809 const __m256 abs_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF));
810 const __m256 sign_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000));
812 unsigned int number = 0;
813 const unsigned int eighth_points = num_points / 8;
814 for (; number < eighth_points; number++) {
815 __m256 z1 = _mm256_loadu_ps(in);
817 __m256 z2 = _mm256_loadu_ps(in);
// Lanes where either input was already NaN (protects the 0/0 repair).
824 __m256 input_nan_mask = _mm256_or_ps(_mm256_cmp_ps(x, x, _CMP_UNORD_Q),
825 _mm256_cmp_ps(y, y, _CMP_UNORD_Q));
828 const __m256 zero = _mm256_setzero_ps();
829 const __m256 inf = _mm256_set1_ps(HUGE_VALF);
830 const __m256 pi_4 = _mm256_set1_ps(0x1.921fb6p-1f);
831 const __m256 three_pi_4 = _mm256_set1_ps(0x1.2d97c8p1f);
// Infinity detection via |v| == +inf.
833 __m256 y_abs = _mm256_and_ps(y, abs_mask);
834 __m256 x_abs = _mm256_and_ps(x, abs_mask);
835 __m256 y_inf_mask = _mm256_cmp_ps(y_abs, inf, _CMP_EQ_OQ);
836 __m256 x_inf_mask = _mm256_cmp_ps(x_abs, inf, _CMP_EQ_OQ);
837 __m256 x_pos_mask = _mm256_cmp_ps(x, zero, _CMP_GT_OS);
// Precomputed IEEE atan2 results for infinite inputs, blended at the end.
840 __m256 inf_result = zero;
// both infinite: +-pi/4 when x>0, +-3pi/4 otherwise; sign from y.
842 __m256 both_inf = _mm256_and_ps(y_inf_mask, x_inf_mask);
843 __m256 both_inf_result = _mm256_blendv_ps(three_pi_4, pi_4, x_pos_mask);
844 both_inf_result = _mm256_or_ps(both_inf_result, _mm256_and_ps(y, sign_mask));
845 inf_result = _mm256_blendv_ps(inf_result, both_inf_result, both_inf);
// only y infinite: +-pi/2 with y's sign.
848 __m256 y_inf_only = _mm256_andnot_ps(x_inf_mask, y_inf_mask);
849 __m256 y_inf_result = _mm256_or_ps(pi_2, _mm256_and_ps(y, sign_mask));
850 inf_result = _mm256_blendv_ps(inf_result, y_inf_result, y_inf_only);
// only x infinite: +-0 when x>0, +-pi otherwise, sign from y.
853 __m256 x_inf_only = _mm256_andnot_ps(y_inf_mask, x_inf_mask);
854 __m256 x_inf_result =
855 _mm256_blendv_ps(_mm256_or_ps(pi, _mm256_and_ps(y, sign_mask)),
856 _mm256_or_ps(zero, _mm256_and_ps(y, sign_mask)),
858 inf_result = _mm256_blendv_ps(inf_result, x_inf_result, x_inf_only);
860 __m256 any_inf_mask = _mm256_or_ps(y_inf_mask, x_inf_mask);
// Argument reduction: smaller/larger magnitude keeps |input| <= 1.
862 __m256 swap_mask = _mm256_cmp_ps(
863 _mm256_and_ps(y, abs_mask), _mm256_and_ps(x, abs_mask), _CMP_GT_OS);
864 __m256 numerator = _mm256_blendv_ps(y, x, swap_mask);
865 __m256 denominator = _mm256_blendv_ps(x, y, swap_mask);
866 __m256 input = _mm256_div_ps(numerator, denominator);
// 0/0 produced a NaN not present in the inputs: substitute the
// numerator (0) so the polynomial yields a finite result.
870 __m256 div_nan_mask =
871 _mm256_andnot_ps(input_nan_mask, _mm256_cmp_ps(input, input, _CMP_UNORD_Q));
872 input = _mm256_blendv_ps(input, numerator, div_nan_mask);
// Swapped lanes: atan(t) = sign(t)*pi/2 - atan(1/t).
876 _mm256_sub_ps(_mm256_or_ps(pi_2, _mm256_and_ps(input, sign_mask)), result);
877 result = _mm256_blendv_ps(result, input, swap_mask);
// Quadrant correction mask: all-ones where x's sign bit is set.
880 _mm256_castsi256_ps(_mm256_srai_epi32(_mm256_castps_si256(x), 31));
882 result = _mm256_add_ps(
883 _mm256_and_ps(_mm256_xor_ps(pi, _mm256_and_ps(sign_mask, y)), x_sign_mask),
// Overwrite lanes with infinite inputs with the precomputed values.
887 result = _mm256_blendv_ps(result, inf_result, any_inf_mask);
889 result = _mm256_mul_ps(result, vinvNormalizeFactor);
891 _mm256_storeu_ps(out, result);
// Tail: remaining < 8 points go through a scalar fallback.
895 number = eighth_points * 8;
897 out, (
lv_32fc_t*)in, normalizeFactor, num_points - number);
// AVX2 kernel, unaligned variant (no FMA requirement): identical
// algorithm to the aligned AVX2 kernel but using loadu/storeu. 8 complex
// points per iteration; each output lane is
// atan2(imag, real) * (1/normalizeFactor).
// NOTE(review): this extract is missing interior lines (pointer advances,
// the z1/z2 -> x/y deinterleave, the polynomial call producing `result`,
// and loop/tail plumbing); remaining lines are verbatim.
902#include <immintrin.h>
904static inline void volk_32fc_s32f_atan2_32f_u_avx2(
float* outputVector,
906 const float normalizeFactor,
907 unsigned int num_points)
909 const float* in = (
float*)complexVector;
910 float* out = (
float*)outputVector;
// Hoist the reciprocal: per-lane scale becomes a multiply.
912 const float invNormalizeFactor = 1.f / normalizeFactor;
913 const __m256 vinvNormalizeFactor = _mm256_set1_ps(invNormalizeFactor);
914 const __m256 pi = _mm256_set1_ps(0x1.921fb6p1f);
915 const __m256 pi_2 = _mm256_set1_ps(0x1.921fb6p0f);
916 const __m256 abs_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF));
917 const __m256 sign_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000));
919 unsigned int number = 0;
920 const unsigned int eighth_points = num_points / 8;
921 for (; number < eighth_points; number++) {
922 __m256 z1 = _mm256_loadu_ps(in);
924 __m256 z2 = _mm256_loadu_ps(in);
// Lanes where either input was already NaN (protects the 0/0 repair).
931 __m256 input_nan_mask = _mm256_or_ps(_mm256_cmp_ps(x, x, _CMP_UNORD_Q),
932 _mm256_cmp_ps(y, y, _CMP_UNORD_Q));
935 const __m256 zero = _mm256_setzero_ps();
936 const __m256 inf = _mm256_set1_ps(HUGE_VALF);
937 const __m256 pi_4 = _mm256_set1_ps(0x1.921fb6p-1f);
938 const __m256 three_pi_4 = _mm256_set1_ps(0x1.2d97c8p1f);
// Infinity detection via |v| == +inf.
940 __m256 y_abs = _mm256_and_ps(y, abs_mask);
941 __m256 x_abs = _mm256_and_ps(x, abs_mask);
942 __m256 y_inf_mask = _mm256_cmp_ps(y_abs, inf, _CMP_EQ_OQ);
943 __m256 x_inf_mask = _mm256_cmp_ps(x_abs, inf, _CMP_EQ_OQ);
944 __m256 x_pos_mask = _mm256_cmp_ps(x, zero, _CMP_GT_OS);
// Precomputed IEEE atan2 results for infinite inputs, blended at the end.
947 __m256 inf_result = zero;
// both infinite: +-pi/4 when x>0, +-3pi/4 otherwise; sign from y.
949 __m256 both_inf = _mm256_and_ps(y_inf_mask, x_inf_mask);
950 __m256 both_inf_result = _mm256_blendv_ps(three_pi_4, pi_4, x_pos_mask);
951 both_inf_result = _mm256_or_ps(both_inf_result, _mm256_and_ps(y, sign_mask));
952 inf_result = _mm256_blendv_ps(inf_result, both_inf_result, both_inf);
// only y infinite: +-pi/2 with y's sign.
955 __m256 y_inf_only = _mm256_andnot_ps(x_inf_mask, y_inf_mask);
956 __m256 y_inf_result = _mm256_or_ps(pi_2, _mm256_and_ps(y, sign_mask));
957 inf_result = _mm256_blendv_ps(inf_result, y_inf_result, y_inf_only);
// only x infinite: +-0 when x>0, +-pi otherwise, sign from y.
960 __m256 x_inf_only = _mm256_andnot_ps(y_inf_mask, x_inf_mask);
961 __m256 x_inf_result =
962 _mm256_blendv_ps(_mm256_or_ps(pi, _mm256_and_ps(y, sign_mask)),
963 _mm256_or_ps(zero, _mm256_and_ps(y, sign_mask)),
965 inf_result = _mm256_blendv_ps(inf_result, x_inf_result, x_inf_only);
967 __m256 any_inf_mask = _mm256_or_ps(y_inf_mask, x_inf_mask);
// Argument reduction: smaller/larger magnitude keeps |input| <= 1.
969 __m256 swap_mask = _mm256_cmp_ps(
970 _mm256_and_ps(y, abs_mask), _mm256_and_ps(x, abs_mask), _CMP_GT_OS);
971 __m256 numerator = _mm256_blendv_ps(y, x, swap_mask);
972 __m256 denominator = _mm256_blendv_ps(x, y, swap_mask);
973 __m256 input = _mm256_div_ps(numerator, denominator);
// 0/0 produced a NaN not present in the inputs: substitute the
// numerator (0) so the polynomial yields a finite result.
977 __m256 div_nan_mask =
978 _mm256_andnot_ps(input_nan_mask, _mm256_cmp_ps(input, input, _CMP_UNORD_Q));
979 input = _mm256_blendv_ps(input, numerator, div_nan_mask);
// Swapped lanes: atan(t) = sign(t)*pi/2 - atan(1/t).
983 _mm256_sub_ps(_mm256_or_ps(pi_2, _mm256_and_ps(input, sign_mask)), result);
984 result = _mm256_blendv_ps(result, input, swap_mask);
// Quadrant correction mask: all-ones where x's sign bit is set.
987 _mm256_castsi256_ps(_mm256_srai_epi32(_mm256_castps_si256(x), 31));
989 result = _mm256_add_ps(
990 _mm256_and_ps(_mm256_xor_ps(pi, _mm256_and_ps(sign_mask, y)), x_sign_mask),
// Overwrite lanes with infinite inputs with the precomputed values.
994 result = _mm256_blendv_ps(result, inf_result, any_inf_mask);
996 result = _mm256_mul_ps(result, vinvNormalizeFactor);
998 _mm256_storeu_ps(out, result);
// Tail: remaining < 8 points go through a scalar fallback.
1002 number = eighth_points * 8;
1004 out, (
lv_32fc_t*)in, normalizeFactor, num_points - number);
1009#include <riscv_vector.h>
1012static inline void volk_32fc_s32f_atan2_32f_rvv(
float* outputVector,
1014 const float normalizeFactor,
1015 unsigned int num_points)
1017 size_t vlmax = __riscv_vsetvlmax_e32m2();
1019 const vfloat32m2_t norm = __riscv_vfmv_v_f_f32m2(1 / normalizeFactor, vlmax);
1020 const vfloat32m2_t cpi = __riscv_vfmv_v_f_f32m2(3.1415927f, vlmax);
1021 const vfloat32m2_t cpio2 = __riscv_vfmv_v_f_f32m2(1.5707964f, vlmax);
1022 const vfloat32m2_t c1 = __riscv_vfmv_v_f_f32m2(+0x1.ffffeap-1f, vlmax);
1023 const vfloat32m2_t c3 = __riscv_vfmv_v_f_f32m2(-0x1.55437p-2f, vlmax);
1024 const vfloat32m2_t c5 = __riscv_vfmv_v_f_f32m2(+0x1.972be6p-3f, vlmax);
1025 const vfloat32m2_t c7 = __riscv_vfmv_v_f_f32m2(-0x1.1436ap-3f, vlmax);
1026 const vfloat32m2_t c9 = __riscv_vfmv_v_f_f32m2(+0x1.5785aap-4f, vlmax);
1027 const vfloat32m2_t c11 = __riscv_vfmv_v_f_f32m2(-0x1.2f3004p-5f, vlmax);
1028 const vfloat32m2_t c13 = __riscv_vfmv_v_f_f32m2(+0x1.01a37cp-7f, vlmax);
1030 const vfloat32m2_t zero = __riscv_vfmv_v_f_f32m2(0.0f, vlmax);
1031 const vfloat32m2_t inf = __riscv_vfmv_v_f_f32m2(HUGE_VALF, vlmax);
1032 const vfloat32m2_t pi_4 = __riscv_vfmv_v_f_f32m2(0x1.921fb6p-1f, vlmax);
1033 const vfloat32m2_t three_pi_4 = __riscv_vfmv_v_f_f32m2(0x1.2d97c8p1f, vlmax);
1035 size_t n = num_points;
1036 for (
size_t vl; n > 0; n -= vl, inputVector += vl, outputVector += vl) {
1037 vl = __riscv_vsetvl_e32m2(n);
1038 vuint64m4_t v = __riscv_vle64_v_u64m4((
const uint64_t*)inputVector, vl);
1039 vfloat32m2_t vr = __riscv_vreinterpret_f32m2(__riscv_vnsrl(v, 0, vl));
1040 vfloat32m2_t vi = __riscv_vreinterpret_f32m2(__riscv_vnsrl(v, 32, vl));
1043 vbool16_t input_nan_mask =
1044 __riscv_vmor(__riscv_vmfne(vr, vr, vl), __riscv_vmfne(vi, vi, vl), vl);
1047 vfloat32m2_t vr_abs = __riscv_vfabs(vr, vl);
1048 vfloat32m2_t vi_abs = __riscv_vfabs(vi, vl);
1049 vbool16_t vr_inf_mask = __riscv_vmfeq(vr_abs, inf, vl);
1050 vbool16_t vi_inf_mask = __riscv_vmfeq(vi_abs, inf, vl);
1051 vbool16_t vr_pos_mask = __riscv_vmfgt(vr, zero, vl);
1054 vfloat32m2_t inf_result = zero;
1056 vbool16_t both_inf = __riscv_vmand(vi_inf_mask, vr_inf_mask, vl);
1057 vfloat32m2_t both_inf_result = __riscv_vmerge(three_pi_4, pi_4, vr_pos_mask, vl);
1058 both_inf_result = __riscv_vfsgnj(both_inf_result, vi, vl);
1059 inf_result = __riscv_vmerge(inf_result, both_inf_result, both_inf, vl);
1062 vbool16_t vi_inf_only = __riscv_vmandn(vi_inf_mask, vr_inf_mask, vl);
1063 vfloat32m2_t vi_inf_result = __riscv_vfsgnj(cpio2, vi, vl);
1064 inf_result = __riscv_vmerge(inf_result, vi_inf_result, vi_inf_only, vl);
1067 vbool16_t vr_inf_only = __riscv_vmandn(vr_inf_mask, vi_inf_mask, vl);
1068 vfloat32m2_t vr_inf_result =
1069 __riscv_vmerge(__riscv_vfsgnj(cpi, vi, vl),
1070 __riscv_vfsgnj(zero, vi, vl),
1073 inf_result = __riscv_vmerge(inf_result, vr_inf_result, vr_inf_only, vl);
1075 vbool16_t any_inf_mask = __riscv_vmor(vi_inf_mask, vr_inf_mask, vl);
1077 vbool16_t mswap = __riscv_vmfgt(vi_abs, vr_abs, vl);
1078 vfloat32m2_t numerator = __riscv_vmerge(vi, vr, mswap, vl);
1079 vfloat32m2_t denominator = __riscv_vmerge(vr, vi, mswap, vl);
1080 vfloat32m2_t x = __riscv_vfdiv(numerator, denominator, vl);
1084 vbool16_t x_nan_mask = __riscv_vmfne(x, x, vl);
1086 vbool16_t div_nan_mask = __riscv_vmandn(x_nan_mask, input_nan_mask, vl);
1087 x = __riscv_vmerge(x, numerator, div_nan_mask, vl);
1089 vfloat32m2_t xx = __riscv_vfmul(x, x, vl);
1090 vfloat32m2_t p = c13;
1091 p = __riscv_vfmadd(p, xx, c11, vl);
1092 p = __riscv_vfmadd(p, xx, c9, vl);
1093 p = __riscv_vfmadd(p, xx, c7, vl);
1094 p = __riscv_vfmadd(p, xx, c5, vl);
1095 p = __riscv_vfmadd(p, xx, c3, vl);
1096 p = __riscv_vfmadd(p, xx, c1, vl);
1097 p = __riscv_vfmul(p, x, vl);
1099 x = __riscv_vfsub(__riscv_vfsgnj(cpio2, x, vl), p, vl);
1100 p = __riscv_vmerge(p, x, mswap, vl);
1101 p = __riscv_vfadd_mu(
1102 RISCV_VMFLTZ(32m2, vr, vl), p, p, __riscv_vfsgnjx(cpi, vi, vl), vl);
1105 p = __riscv_vmerge(p, inf_result, any_inf_mask, vl);
1107 __riscv_vse32(outputVector, __riscv_vfmul(p, norm, vl), vl);
1112#ifdef LV_HAVE_RVVSEG
1113#include <riscv_vector.h>
1116static inline void volk_32fc_s32f_atan2_32f_rvvseg(
float* outputVector,
1118 const float normalizeFactor,
1119 unsigned int num_points)
1121 size_t vlmax = __riscv_vsetvlmax_e32m2();
1123 const vfloat32m2_t norm = __riscv_vfmv_v_f_f32m2(1 / normalizeFactor, vlmax);
1124 const vfloat32m2_t cpi = __riscv_vfmv_v_f_f32m2(3.1415927f, vlmax);
1125 const vfloat32m2_t cpio2 = __riscv_vfmv_v_f_f32m2(1.5707964f, vlmax);
1126 const vfloat32m2_t c1 = __riscv_vfmv_v_f_f32m2(+0x1.ffffeap-1f, vlmax);
1127 const vfloat32m2_t c3 = __riscv_vfmv_v_f_f32m2(-0x1.55437p-2f, vlmax);
1128 const vfloat32m2_t c5 = __riscv_vfmv_v_f_f32m2(+0x1.972be6p-3f, vlmax);
1129 const vfloat32m2_t c7 = __riscv_vfmv_v_f_f32m2(-0x1.1436ap-3f, vlmax);
1130 const vfloat32m2_t c9 = __riscv_vfmv_v_f_f32m2(+0x1.5785aap-4f, vlmax);
1131 const vfloat32m2_t c11 = __riscv_vfmv_v_f_f32m2(-0x1.2f3004p-5f, vlmax);
1132 const vfloat32m2_t c13 = __riscv_vfmv_v_f_f32m2(+0x1.01a37cp-7f, vlmax);
1134 const vfloat32m2_t zero = __riscv_vfmv_v_f_f32m2(0.0f, vlmax);
1135 const vfloat32m2_t inf = __riscv_vfmv_v_f_f32m2(HUGE_VALF, vlmax);
1136 const vfloat32m2_t pi_4 = __riscv_vfmv_v_f_f32m2(0x1.921fb6p-1f, vlmax);
1137 const vfloat32m2_t three_pi_4 = __riscv_vfmv_v_f_f32m2(0x1.2d97c8p1f, vlmax);
1139 size_t n = num_points;
1140 for (
size_t vl; n > 0; n -= vl, inputVector += vl, outputVector += vl) {
1141 vl = __riscv_vsetvl_e32m2(n);
1142 vfloat32m2x2_t v = __riscv_vlseg2e32_v_f32m2x2((
const float*)inputVector, vl);
1143 vfloat32m2_t vr = __riscv_vget_f32m2(v, 0), vi = __riscv_vget_f32m2(v, 1);
1146 vbool16_t input_nan_mask =
1147 __riscv_vmor(__riscv_vmfne(vr, vr, vl), __riscv_vmfne(vi, vi, vl), vl);
1150 vfloat32m2_t vr_abs = __riscv_vfabs(vr, vl);
1151 vfloat32m2_t vi_abs = __riscv_vfabs(vi, vl);
1152 vbool16_t vr_inf_mask = __riscv_vmfeq(vr_abs, inf, vl);
1153 vbool16_t vi_inf_mask = __riscv_vmfeq(vi_abs, inf, vl);
1154 vbool16_t vr_pos_mask = __riscv_vmfgt(vr, zero, vl);
1157 vfloat32m2_t inf_result = zero;
1159 vbool16_t both_inf = __riscv_vmand(vi_inf_mask, vr_inf_mask, vl);
1160 vfloat32m2_t both_inf_result = __riscv_vmerge(three_pi_4, pi_4, vr_pos_mask, vl);
1161 both_inf_result = __riscv_vfsgnj(both_inf_result, vi, vl);
1162 inf_result = __riscv_vmerge(inf_result, both_inf_result, both_inf, vl);
1165 vbool16_t vi_inf_only = __riscv_vmandn(vi_inf_mask, vr_inf_mask, vl);
1166 vfloat32m2_t vi_inf_result = __riscv_vfsgnj(cpio2, vi, vl);
1167 inf_result = __riscv_vmerge(inf_result, vi_inf_result, vi_inf_only, vl);
1170 vbool16_t vr_inf_only = __riscv_vmandn(vr_inf_mask, vi_inf_mask, vl);
1171 vfloat32m2_t vr_inf_result =
1172 __riscv_vmerge(__riscv_vfsgnj(cpi, vi, vl),
1173 __riscv_vfsgnj(zero, vi, vl),
1176 inf_result = __riscv_vmerge(inf_result, vr_inf_result, vr_inf_only, vl);
1178 vbool16_t any_inf_mask = __riscv_vmor(vi_inf_mask, vr_inf_mask, vl);
1180 vbool16_t mswap = __riscv_vmfgt(vi_abs, vr_abs, vl);
1181 vfloat32m2_t numerator = __riscv_vmerge(vi, vr, mswap, vl);
1182 vfloat32m2_t denominator = __riscv_vmerge(vr, vi, mswap, vl);
1183 vfloat32m2_t x = __riscv_vfdiv(numerator, denominator, vl);
1187 vbool16_t x_nan_mask = __riscv_vmfne(x, x, vl);
1189 vbool16_t div_nan_mask = __riscv_vmandn(x_nan_mask, input_nan_mask, vl);
1190 x = __riscv_vmerge(x, numerator, div_nan_mask, vl);
1192 vfloat32m2_t xx = __riscv_vfmul(x, x, vl);
1193 vfloat32m2_t p = c13;
1194 p = __riscv_vfmadd(p, xx, c11, vl);
1195 p = __riscv_vfmadd(p, xx, c9, vl);
1196 p = __riscv_vfmadd(p, xx, c7, vl);
1197 p = __riscv_vfmadd(p, xx, c5, vl);
1198 p = __riscv_vfmadd(p, xx, c3, vl);
1199 p = __riscv_vfmadd(p, xx, c1, vl);
1200 p = __riscv_vfmul(p, x, vl);
1202 x = __riscv_vfsub(__riscv_vfsgnj(cpio2, x, vl), p, vl);
1203 p = __riscv_vmerge(p, x, mswap, vl);
1204 p = __riscv_vfadd_mu(
1205 RISCV_VMFLTZ(32m2, vr, vl), p, p, __riscv_vfsgnjx(cpi, vi, vl), vl);
1208 p = __riscv_vmerge(p, inf_result, any_inf_mask, vl);
1210 __riscv_vse32(outputVector, __riscv_vfmul(p, norm, vl), vl);