volk_32fc_s32f_atan2_32f.h
/* -*- c++ -*- */
/*
 * Copyright 2012, 2014 Free Software Foundation, Inc.
 * Copyright 2023, 2024 Magnus Lundmark <magnuslundmark@gmail.com>
 *
 * This file is part of VOLK
 *
 * SPDX-License-Identifier: LGPL-3.0-or-later
 */

#ifndef INCLUDED_volk_32fc_s32f_atan2_32f_a_H
#define INCLUDED_volk_32fc_s32f_atan2_32f_a_H

#include <math.h>

#ifdef LV_HAVE_GENERIC
static inline void volk_32fc_s32f_atan2_32f_generic(float* outputVector,
                                                    const lv_32fc_t* inputVector,
                                                    const float normalizeFactor,
                                                    unsigned int num_points)
{
    float* outPtr = outputVector;
    const float* inPtr = (float*)inputVector;
    const float invNormalizeFactor = 1.f / normalizeFactor;

    for (unsigned int number = 0; number < num_points; number++) {
        const float real = *inPtr++;
        const float imag = *inPtr++;
        *outPtr++ = atan2f(imag, real) * invNormalizeFactor;
    }
}
#endif /* LV_HAVE_GENERIC */
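
/*
 * Illustrative sketch (not from the original header): normalizeFactor
 * divides the result, so with normalizeFactor = 3.1415927f the phase is
 * rescaled from [-π, π] radians to [-1, 1]. Here `samples` is a
 * hypothetical lv_32fc_t buffer of at least 1024 elements:
 *
 *   float phase[1024];
 *   volk_32fc_s32f_atan2_32f_generic(phase, samples, 3.1415927f, 1024);
 */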

#ifdef LV_HAVE_GENERIC
#include <volk/volk_common.h>
static inline void volk_32fc_s32f_atan2_32f_polynomial(float* outputVector,
                                                       const lv_32fc_t* inputVector,
                                                       const float normalizeFactor,
                                                       unsigned int num_points)
{
    float* outPtr = outputVector;
    const float* inPtr = (float*)inputVector;
    const float invNormalizeFactor = 1.f / normalizeFactor;

    for (unsigned int number = 0; number < num_points; number++) {
        const float x = *inPtr++;
        const float y = *inPtr++;
        *outPtr++ = volk_atan2(y, x) * invNormalizeFactor;
    }
}
#endif /* LV_HAVE_GENERIC */
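
/*
 * Shared reduction used by the SIMD kernels below (a sketch inferred from
 * the code, not an authoritative derivation). The smaller-magnitude
 * component is divided by the larger so the polynomial only ever sees
 * |input| <= 1:
 *
 *   input = y / x                    if |y| <= |x|   (no swap)
 *   input = x / y                    if |y| >  |x|   (swap)
 *
 *   r  = atan(input)                                 (polynomial)
 *   r  = copysign(π/2, input) - r    if swapped
 *   r += copysign(π, y)              if x < 0        (quadrant fix)
 *
 * Infinities and NaNs are patched in separately per IEEE 754.
 */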

#if LV_HAVE_AVX512F && LV_HAVE_AVX512DQ
#include <immintrin.h>
#include <volk/volk_avx512_intrinsics.h> /* _mm512_real/_mm512_imag, arctan helper */
static inline void volk_32fc_s32f_atan2_32f_a_avx512dq(float* outputVector,
                                                       const lv_32fc_t* complexVector,
                                                       const float normalizeFactor,
                                                       unsigned int num_points)
{
    const float* in = (float*)complexVector;
    float* out = (float*)outputVector;

    const float invNormalizeFactor = 1.f / normalizeFactor;
    const __m512 vinvNormalizeFactor = _mm512_set1_ps(invNormalizeFactor);
    const __m512 pi = _mm512_set1_ps(0x1.921fb6p1f);
    const __m512 pi_2 = _mm512_set1_ps(0x1.921fb6p0f);
    const __m512 abs_mask = _mm512_castsi512_ps(_mm512_set1_epi32(0x7FFFFFFF));
    const __m512 sign_mask = _mm512_castsi512_ps(_mm512_set1_epi32(0x80000000));

    unsigned int number = 0;
    const unsigned int sixteenth_points = num_points / 16;
    for (; number < sixteenth_points; number++) {
        __m512 z1 = _mm512_load_ps(in);
        in += 16;
        __m512 z2 = _mm512_load_ps(in);
        in += 16;

        __m512 x = _mm512_real(z1, z2);
        __m512 y = _mm512_imag(z1, z2);

        // Detect NaN in original inputs before division
        __mmask16 input_nan_mask = _mm512_cmp_ps_mask(x, x, _CMP_UNORD_Q) |
                                   _mm512_cmp_ps_mask(y, y, _CMP_UNORD_Q);

        // Handle infinity cases per IEEE 754
        const __m512 zero = _mm512_setzero_ps();
        const __m512 pi_4 = _mm512_set1_ps(0x1.921fb6p-1f);      // π/4
        const __m512 three_pi_4 = _mm512_set1_ps(0x1.2d97c8p1f); // 3π/4

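        // _mm512_fpclass_ps_mask imm8 bits: 0x08 = +inf, 0x10 = -inf,
        // so 0x18 flags lanes that are ±inf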
        __mmask16 y_inf_mask = _mm512_fpclass_ps_mask(y, 0x18); // ±inf
        __mmask16 x_inf_mask = _mm512_fpclass_ps_mask(x, 0x18); // ±inf
        __mmask16 x_pos_mask = _mm512_cmp_ps_mask(x, zero, _CMP_GT_OS);

        // Build infinity result
        __m512 inf_result = zero;
        // Both infinite: ±π/4 or ±3π/4
        __mmask16 both_inf = y_inf_mask & x_inf_mask;
        __m512 both_inf_result = _mm512_mask_blend_ps(x_pos_mask, three_pi_4, pi_4);
        both_inf_result = _mm512_or_ps(both_inf_result, _mm512_and_ps(y, sign_mask));
        inf_result = _mm512_mask_blend_ps(both_inf, inf_result, both_inf_result);

        // y infinite, x finite: ±π/2
        __mmask16 y_inf_only = y_inf_mask & ~x_inf_mask;
        __m512 y_inf_result = _mm512_or_ps(pi_2, _mm512_and_ps(y, sign_mask));
        inf_result = _mm512_mask_blend_ps(y_inf_only, inf_result, y_inf_result);

        // x infinite, y finite: 0 or ±π
        __mmask16 x_inf_only = x_inf_mask & ~y_inf_mask;
        __m512 x_inf_result =
            _mm512_mask_blend_ps(x_pos_mask,
                                 _mm512_or_ps(pi, _mm512_and_ps(y, sign_mask)),
                                 _mm512_or_ps(zero, _mm512_and_ps(y, sign_mask)));
        inf_result = _mm512_mask_blend_ps(x_inf_only, inf_result, x_inf_result);

        __mmask16 any_inf_mask = y_inf_mask | x_inf_mask;

        __mmask16 swap_mask = _mm512_cmp_ps_mask(
            _mm512_and_ps(y, abs_mask), _mm512_and_ps(x, abs_mask), _CMP_GT_OS);
        __m512 numerator = _mm512_mask_blend_ps(swap_mask, y, x);
        __m512 denominator = _mm512_mask_blend_ps(swap_mask, x, y);
        __m512 input = _mm512_div_ps(numerator, denominator);

        // Only handle NaN from division (0/0, inf/inf), not from NaN inputs
        // Replace with numerator to preserve sign (e.g., atan2(-0, 0) = -0)
        __mmask16 div_nan_mask =
            _mm512_cmp_ps_mask(input, input, _CMP_UNORD_Q) & ~input_nan_mask;
        input = _mm512_mask_blend_ps(div_nan_mask, input, numerator);
        __m512 result = _mm512_arctan_poly_avx512(input);

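        // Swapped lanes computed atan(x/y); undo the swap via
        // copysign(π/2, input) - atan(input)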
        input =
            _mm512_sub_ps(_mm512_or_ps(pi_2, _mm512_and_ps(input, sign_mask)), result);
        result = _mm512_mask_blend_ps(swap_mask, result, input);

        __m512 x_sign_mask =
            _mm512_castsi512_ps(_mm512_srai_epi32(_mm512_castps_si512(x), 31));

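        // Quadrant fix: where x < 0 (x_sign_mask all-ones), add π with the sign of y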
        result = _mm512_add_ps(
            _mm512_and_ps(_mm512_xor_ps(pi, _mm512_and_ps(sign_mask, y)), x_sign_mask),
            result);

        // Select infinity result or normal result
        result = _mm512_mask_blend_ps(any_inf_mask, result, inf_result);

        result = _mm512_mul_ps(result, vinvNormalizeFactor);

        _mm512_store_ps(out, result);
        out += 16;
    }

    number = sixteenth_points * 16;
    volk_32fc_s32f_atan2_32f_polynomial(
        out, complexVector + number, normalizeFactor, num_points - number);
}
#endif /* LV_HAVE_AVX512F && LV_HAVE_AVX512DQ for aligned */

#if LV_HAVE_AVX2 && LV_HAVE_FMA
#include <immintrin.h>
#include <volk/volk_avx2_fma_intrinsics.h> /* arctan polynomial helper */
static inline void volk_32fc_s32f_atan2_32f_a_avx2_fma(float* outputVector,
                                                       const lv_32fc_t* complexVector,
                                                       const float normalizeFactor,
                                                       unsigned int num_points)
{
    const float* in = (float*)complexVector;
    float* out = (float*)outputVector;

    const float invNormalizeFactor = 1.f / normalizeFactor;
    const __m256 vinvNormalizeFactor = _mm256_set1_ps(invNormalizeFactor);
    const __m256 pi = _mm256_set1_ps(0x1.921fb6p1f);
    const __m256 pi_2 = _mm256_set1_ps(0x1.921fb6p0f);
    const __m256 abs_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF));
    const __m256 sign_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000));

    unsigned int number = 0;
    const unsigned int eighth_points = num_points / 8;
    for (; number < eighth_points; number++) {
        __m256 z1 = _mm256_load_ps(in);
        in += 8;
        __m256 z2 = _mm256_load_ps(in);
        in += 8;

        __m256 x = _mm256_real(z1, z2);
        __m256 y = _mm256_imag(z1, z2);

        // Detect NaN in original inputs before division
        __m256 input_nan_mask = _mm256_or_ps(_mm256_cmp_ps(x, x, _CMP_UNORD_Q),
                                             _mm256_cmp_ps(y, y, _CMP_UNORD_Q));

        // Handle infinity cases per IEEE 754
        const __m256 zero = _mm256_setzero_ps();
        const __m256 inf = _mm256_set1_ps(HUGE_VALF);
        const __m256 pi_4 = _mm256_set1_ps(0x1.921fb6p-1f);      // π/4
        const __m256 three_pi_4 = _mm256_set1_ps(0x1.2d97c8p1f); // 3π/4

        __m256 y_abs = _mm256_and_ps(y, abs_mask);
        __m256 x_abs = _mm256_and_ps(x, abs_mask);
        __m256 y_inf_mask = _mm256_cmp_ps(y_abs, inf, _CMP_EQ_OQ); // |y| == inf
        __m256 x_inf_mask = _mm256_cmp_ps(x_abs, inf, _CMP_EQ_OQ); // |x| == inf
        __m256 x_pos_mask = _mm256_cmp_ps(x, zero, _CMP_GT_OS);

        // Build infinity result
        __m256 inf_result = zero;
        // Both infinite: ±π/4 or ±3π/4
        __m256 both_inf = _mm256_and_ps(y_inf_mask, x_inf_mask);
        __m256 both_inf_result = _mm256_blendv_ps(three_pi_4, pi_4, x_pos_mask);
        both_inf_result = _mm256_or_ps(both_inf_result, _mm256_and_ps(y, sign_mask));
        inf_result = _mm256_blendv_ps(inf_result, both_inf_result, both_inf);

        // y infinite, x finite: ±π/2
        __m256 y_inf_only = _mm256_andnot_ps(x_inf_mask, y_inf_mask);
        __m256 y_inf_result = _mm256_or_ps(pi_2, _mm256_and_ps(y, sign_mask));
        inf_result = _mm256_blendv_ps(inf_result, y_inf_result, y_inf_only);

        // x infinite, y finite: 0 or ±π
        __m256 x_inf_only = _mm256_andnot_ps(y_inf_mask, x_inf_mask);
        __m256 x_inf_result =
            _mm256_blendv_ps(_mm256_or_ps(pi, _mm256_and_ps(y, sign_mask)),
                             _mm256_or_ps(zero, _mm256_and_ps(y, sign_mask)),
                             x_pos_mask);
        inf_result = _mm256_blendv_ps(inf_result, x_inf_result, x_inf_only);

        __m256 any_inf_mask = _mm256_or_ps(y_inf_mask, x_inf_mask);

        __m256 swap_mask = _mm256_cmp_ps(
            _mm256_and_ps(y, abs_mask), _mm256_and_ps(x, abs_mask), _CMP_GT_OS);
        __m256 numerator = _mm256_blendv_ps(y, x, swap_mask);
        __m256 denominator = _mm256_blendv_ps(x, y, swap_mask);
        __m256 input = _mm256_div_ps(numerator, denominator);

        // Only handle NaN from division (0/0, inf/inf), not from NaN inputs
        // Replace with numerator to preserve sign (e.g., atan2(-0, 0) = -0)
        __m256 div_nan_mask =
            _mm256_andnot_ps(input_nan_mask, _mm256_cmp_ps(input, input, _CMP_UNORD_Q));
        input = _mm256_blendv_ps(input, numerator, div_nan_mask);
        __m256 result = _mm256_arctan_poly_avx2_fma(input);

        input =
            _mm256_sub_ps(_mm256_or_ps(pi_2, _mm256_and_ps(input, sign_mask)), result);
        result = _mm256_blendv_ps(result, input, swap_mask);

        __m256 x_sign_mask =
            _mm256_castsi256_ps(_mm256_srai_epi32(_mm256_castps_si256(x), 31));

        result = _mm256_add_ps(
            _mm256_and_ps(_mm256_xor_ps(pi, _mm256_and_ps(sign_mask, y)), x_sign_mask),
            result);

        // Select infinity result or normal result
        result = _mm256_blendv_ps(result, inf_result, any_inf_mask);

        result = _mm256_mul_ps(result, vinvNormalizeFactor);

        _mm256_store_ps(out, result);
        out += 8;
    }

    number = eighth_points * 8;
    volk_32fc_s32f_atan2_32f_polynomial(
        out, (lv_32fc_t*)in, normalizeFactor, num_points - number);
}
#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for aligned */

#if LV_HAVE_AVX2
#include <immintrin.h>
#include <volk/volk_avx2_intrinsics.h> /* _mm256_real/_mm256_imag, arctan helper */
static inline void volk_32fc_s32f_atan2_32f_a_avx2(float* outputVector,
                                                   const lv_32fc_t* complexVector,
                                                   const float normalizeFactor,
                                                   unsigned int num_points)
{
    const float* in = (float*)complexVector;
    float* out = (float*)outputVector;

    const float invNormalizeFactor = 1.f / normalizeFactor;
    const __m256 vinvNormalizeFactor = _mm256_set1_ps(invNormalizeFactor);
    const __m256 pi = _mm256_set1_ps(0x1.921fb6p1f);
    const __m256 pi_2 = _mm256_set1_ps(0x1.921fb6p0f);
    const __m256 abs_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF));
    const __m256 sign_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000));

    unsigned int number = 0;
    const unsigned int eighth_points = num_points / 8;
    for (; number < eighth_points; number++) {
        __m256 z1 = _mm256_load_ps(in);
        in += 8;
        __m256 z2 = _mm256_load_ps(in);
        in += 8;

        __m256 x = _mm256_real(z1, z2);
        __m256 y = _mm256_imag(z1, z2);

        // Detect NaN in original inputs before division
        __m256 input_nan_mask = _mm256_or_ps(_mm256_cmp_ps(x, x, _CMP_UNORD_Q),
                                             _mm256_cmp_ps(y, y, _CMP_UNORD_Q));

        // Handle infinity cases per IEEE 754
        const __m256 zero = _mm256_setzero_ps();
        const __m256 inf = _mm256_set1_ps(HUGE_VALF);
        const __m256 pi_4 = _mm256_set1_ps(0x1.921fb6p-1f);      // π/4
        const __m256 three_pi_4 = _mm256_set1_ps(0x1.2d97c8p1f); // 3π/4

        __m256 y_abs = _mm256_and_ps(y, abs_mask);
        __m256 x_abs = _mm256_and_ps(x, abs_mask);
        __m256 y_inf_mask = _mm256_cmp_ps(y_abs, inf, _CMP_EQ_OQ); // |y| == inf
        __m256 x_inf_mask = _mm256_cmp_ps(x_abs, inf, _CMP_EQ_OQ); // |x| == inf
        __m256 x_pos_mask = _mm256_cmp_ps(x, zero, _CMP_GT_OS);

        // Build infinity result
        __m256 inf_result = zero;
        // Both infinite: ±π/4 or ±3π/4
        __m256 both_inf = _mm256_and_ps(y_inf_mask, x_inf_mask);
        __m256 both_inf_result = _mm256_blendv_ps(three_pi_4, pi_4, x_pos_mask);
        both_inf_result = _mm256_or_ps(both_inf_result, _mm256_and_ps(y, sign_mask));
        inf_result = _mm256_blendv_ps(inf_result, both_inf_result, both_inf);

        // y infinite, x finite: ±π/2
        __m256 y_inf_only = _mm256_andnot_ps(x_inf_mask, y_inf_mask);
        __m256 y_inf_result = _mm256_or_ps(pi_2, _mm256_and_ps(y, sign_mask));
        inf_result = _mm256_blendv_ps(inf_result, y_inf_result, y_inf_only);

        // x infinite, y finite: 0 or ±π
        __m256 x_inf_only = _mm256_andnot_ps(y_inf_mask, x_inf_mask);
        __m256 x_inf_result =
            _mm256_blendv_ps(_mm256_or_ps(pi, _mm256_and_ps(y, sign_mask)),
                             _mm256_or_ps(zero, _mm256_and_ps(y, sign_mask)),
                             x_pos_mask);
        inf_result = _mm256_blendv_ps(inf_result, x_inf_result, x_inf_only);

        __m256 any_inf_mask = _mm256_or_ps(y_inf_mask, x_inf_mask);

        __m256 swap_mask = _mm256_cmp_ps(
            _mm256_and_ps(y, abs_mask), _mm256_and_ps(x, abs_mask), _CMP_GT_OS);
        __m256 numerator = _mm256_blendv_ps(y, x, swap_mask);
        __m256 denominator = _mm256_blendv_ps(x, y, swap_mask);
        __m256 input = _mm256_div_ps(numerator, denominator);

        // Only handle NaN from division (0/0, inf/inf), not from NaN inputs
        // Replace with numerator to preserve sign (e.g., atan2(-0, 0) = -0)
        __m256 div_nan_mask =
            _mm256_andnot_ps(input_nan_mask, _mm256_cmp_ps(input, input, _CMP_UNORD_Q));
        input = _mm256_blendv_ps(input, numerator, div_nan_mask);
        __m256 result = _mm256_arctan_poly_avx(input);

        input =
            _mm256_sub_ps(_mm256_or_ps(pi_2, _mm256_and_ps(input, sign_mask)), result);
        result = _mm256_blendv_ps(result, input, swap_mask);

        __m256 x_sign_mask =
            _mm256_castsi256_ps(_mm256_srai_epi32(_mm256_castps_si256(x), 31));

        result = _mm256_add_ps(
            _mm256_and_ps(_mm256_xor_ps(pi, _mm256_and_ps(sign_mask, y)), x_sign_mask),
            result);

        // Select infinity result or normal result
        result = _mm256_blendv_ps(result, inf_result, any_inf_mask);

        result = _mm256_mul_ps(result, vinvNormalizeFactor);

        _mm256_store_ps(out, result);
        out += 8;
    }

    number = eighth_points * 8;
    volk_32fc_s32f_atan2_32f_polynomial(
        out, (lv_32fc_t*)in, normalizeFactor, num_points - number);
}
#endif /* LV_HAVE_AVX2 for aligned */

#ifdef LV_HAVE_NEON
#include <arm_neon.h>
#include <volk/volk_neon_intrinsics.h> /* _vinvq_f32, _varctan_poly_f32 */
static inline void volk_32fc_s32f_atan2_32f_neon(float* outputVector,
                                                 const lv_32fc_t* complexVector,
                                                 const float normalizeFactor,
                                                 unsigned int num_points)
{
    const float* in = (float*)complexVector;
    float* out = outputVector;

    const float invNormalizeFactor = 1.f / normalizeFactor;
    const float32x4_t vInvNorm = vdupq_n_f32(invNormalizeFactor);
    const float32x4_t pi = vdupq_n_f32(0x1.921fb6p1f);
    const float32x4_t pi_2 = vdupq_n_f32(0x1.921fb6p0f);
    const float32x4_t pi_4 = vdupq_n_f32(0x1.921fb6p-1f);
    const float32x4_t three_pi_4 = vdupq_n_f32(0x1.2d97c8p1f);
    const float32x4_t zero = vdupq_n_f32(0.f);
    const float32x4_t inf = vdupq_n_f32(__builtin_huge_valf());
    const uint32x4_t sign_mask = vdupq_n_u32(0x80000000);

    const unsigned int quarter_points = num_points / 4;

    for (unsigned int number = 0; number < quarter_points; number++) {
        float32x4x2_t z = vld2q_f32(in);
        in += 8;

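        /* vld2q_f32 deinterleaves on load: z.val[0] holds the four real
         * parts, z.val[1] the four imaginary parts */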
        float32x4_t x = z.val[0];
        float32x4_t y = z.val[1];

        float32x4_t abs_x = vabsq_f32(x);
        float32x4_t abs_y = vabsq_f32(y);

        /* Detect input NaN */
        uint32x4_t input_nan =
            vorrq_u32(vmvnq_u32(vceqq_f32(x, x)), vmvnq_u32(vceqq_f32(y, y)));

        /* Infinity handling */
        uint32x4_t y_inf = vceqq_f32(abs_y, inf);
        uint32x4_t x_inf = vceqq_f32(abs_x, inf);
        uint32x4_t x_pos = vcgtq_f32(x, zero);
        uint32x4_t both_inf = vandq_u32(y_inf, x_inf);
        uint32x4_t y_inf_only = vbicq_u32(y_inf, x_inf);
        uint32x4_t x_inf_only = vbicq_u32(x_inf, y_inf);
        uint32x4_t any_inf = vorrq_u32(y_inf, x_inf);

        float32x4_t inf_result = zero;
        /* Both infinite: ±π/4 or ±3π/4 */
        float32x4_t both_inf_r = vbslq_f32(x_pos, pi_4, three_pi_4);
        both_inf_r = vreinterpretq_f32_u32(
            vorrq_u32(vreinterpretq_u32_f32(both_inf_r),
                      vandq_u32(vreinterpretq_u32_f32(y), sign_mask)));
        inf_result = vbslq_f32(both_inf, both_inf_r, inf_result);
        /* y infinite, x finite: ±π/2 */
        float32x4_t y_inf_r = vreinterpretq_f32_u32(vorrq_u32(
            vreinterpretq_u32_f32(pi_2), vandq_u32(vreinterpretq_u32_f32(y), sign_mask)));
        inf_result = vbslq_f32(y_inf_only, y_inf_r, inf_result);
        /* x infinite, y finite: 0 or ±π */
        float32x4_t x_inf_r = vbslq_f32(
            x_pos,
            vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(y), sign_mask)),
            vreinterpretq_f32_u32(
                vorrq_u32(vreinterpretq_u32_f32(pi),
                          vandq_u32(vreinterpretq_u32_f32(y), sign_mask))));
        inf_result = vbslq_f32(x_inf_only, x_inf_r, inf_result);

        /* Normal computation */
        uint32x4_t swap_mask = vcgtq_f32(abs_y, abs_x);
        float32x4_t num = vbslq_f32(swap_mask, x, y);
        float32x4_t den = vbslq_f32(swap_mask, y, x);
        float32x4_t input = vmulq_f32(num, _vinvq_f32(den));

        /* Handle division NaN (0/0), but not input NaN */
        uint32x4_t div_nan = vbicq_u32(vmvnq_u32(vceqq_f32(input, input)), input_nan);
        input = vbslq_f32(div_nan, num, input);

        float32x4_t result = _varctan_poly_f32(input);
        uint32x4_t input_sign = vandq_u32(vreinterpretq_u32_f32(input), sign_mask);
        float32x4_t term =
            vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(pi_2), input_sign));
        term = vsubq_f32(term, result);
        result = vbslq_f32(swap_mask, term, result);

        uint32x4_t x_neg = vcltq_f32(x, zero);
        uint32x4_t y_sign = vandq_u32(vreinterpretq_u32_f32(y), sign_mask);
        float32x4_t pi_adj =
            vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(pi), y_sign));
        result = vbslq_f32(x_neg, vaddq_f32(result, pi_adj), result);

        /* Select infinity result or normal result */
        result = vbslq_f32(any_inf, inf_result, result);
        result = vmulq_f32(result, vInvNorm);

        vst1q_f32(out, result);
        out += 4;
    }

    unsigned int number = quarter_points * 4;
    volk_32fc_s32f_atan2_32f_polynomial(
        out, complexVector + number, normalizeFactor, num_points - number);
}
#endif /* LV_HAVE_NEON */

#ifdef LV_HAVE_NEONV8
#include <arm_neon.h>
#include <volk/volk_neon_intrinsics.h> /* _vinvq_f32, _varctan_poly_neonv8 */
/* ARMv8 NEON with FMA, proper infinity/NaN handling, and 2x unrolling */
static inline void volk_32fc_s32f_atan2_32f_neonv8(float* outputVector,
                                                   const lv_32fc_t* complexVector,
                                                   const float normalizeFactor,
                                                   unsigned int num_points)
{
    const float* in = (float*)complexVector;
    float* out = outputVector;

    const float invNormalizeFactor = 1.f / normalizeFactor;
    const float32x4_t vInvNorm = vdupq_n_f32(invNormalizeFactor);
    const float32x4_t pi = vdupq_n_f32(0x1.921fb6p1f);
    const float32x4_t pi_2 = vdupq_n_f32(0x1.921fb6p0f);
    const float32x4_t pi_4 = vdupq_n_f32(0x1.921fb6p-1f);
    const float32x4_t three_pi_4 = vdupq_n_f32(0x1.2d97c8p1f);
    const float32x4_t zero = vdupq_n_f32(0.f);
    const float32x4_t inf = vdupq_n_f32(__builtin_huge_valf());
    const uint32x4_t sign_mask = vdupq_n_u32(0x80000000);

    const unsigned int eighth_points = num_points / 8;

    for (unsigned int number = 0; number < eighth_points; number++) {
        /* Load 8 complex numbers and deinterleave */
        float32x4x2_t z0 = vld2q_f32(in);
        float32x4x2_t z1 = vld2q_f32(in + 8);
        in += 16;

        float32x4_t x0 = z0.val[0], y0 = z0.val[1];
        float32x4_t x1 = z1.val[0], y1 = z1.val[1];

        /* --- Process first 4 --- */
        float32x4_t abs_x0 = vabsq_f32(x0);
        float32x4_t abs_y0 = vabsq_f32(y0);

        /* Detect input NaN */
        uint32x4_t input_nan0 =
            vorrq_u32(vmvnq_u32(vceqq_f32(x0, x0)), vmvnq_u32(vceqq_f32(y0, y0)));

        /* Infinity handling */
        uint32x4_t y_inf0 = vceqq_f32(abs_y0, inf);
        uint32x4_t x_inf0 = vceqq_f32(abs_x0, inf);
        uint32x4_t x_pos0 = vcgtq_f32(x0, zero);
        uint32x4_t both_inf0 = vandq_u32(y_inf0, x_inf0);
        uint32x4_t y_inf_only0 = vbicq_u32(y_inf0, x_inf0);
        uint32x4_t x_inf_only0 = vbicq_u32(x_inf0, y_inf0);
        uint32x4_t any_inf0 = vorrq_u32(y_inf0, x_inf0);

        float32x4_t inf_result0 = zero;
        float32x4_t both_inf_r0 = vbslq_f32(x_pos0, pi_4, three_pi_4);
        both_inf_r0 = vreinterpretq_f32_u32(
            vorrq_u32(vreinterpretq_u32_f32(both_inf_r0),
                      vandq_u32(vreinterpretq_u32_f32(y0), sign_mask)));
        inf_result0 = vbslq_f32(both_inf0, both_inf_r0, inf_result0);
        float32x4_t y_inf_r0 = vreinterpretq_f32_u32(
            vorrq_u32(vreinterpretq_u32_f32(pi_2),
                      vandq_u32(vreinterpretq_u32_f32(y0), sign_mask)));
        inf_result0 = vbslq_f32(y_inf_only0, y_inf_r0, inf_result0);
        float32x4_t x_inf_r0 = vbslq_f32(
            x_pos0,
            vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(y0), sign_mask)),
            vreinterpretq_f32_u32(
                vorrq_u32(vreinterpretq_u32_f32(pi),
                          vandq_u32(vreinterpretq_u32_f32(y0), sign_mask))));
        inf_result0 = vbslq_f32(x_inf_only0, x_inf_r0, inf_result0);

        /* Normal computation */
        uint32x4_t swap_mask0 = vcgtq_f32(abs_y0, abs_x0);
        float32x4_t num0 = vbslq_f32(swap_mask0, x0, y0);
        float32x4_t den0 = vbslq_f32(swap_mask0, y0, x0);
        float32x4_t input0 = vmulq_f32(num0, _vinvq_f32(den0));

        uint32x4_t div_nan0 = vbicq_u32(vmvnq_u32(vceqq_f32(input0, input0)), input_nan0);
        input0 = vbslq_f32(div_nan0, num0, input0);

        float32x4_t result0 = _varctan_poly_neonv8(input0);
        uint32x4_t input_sign0 = vandq_u32(vreinterpretq_u32_f32(input0), sign_mask);
        float32x4_t term0 =
            vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(pi_2), input_sign0));
        term0 = vsubq_f32(term0, result0);
        result0 = vbslq_f32(swap_mask0, term0, result0);

        uint32x4_t x_neg0 = vcltq_f32(x0, zero);
        uint32x4_t y_sign0 = vandq_u32(vreinterpretq_u32_f32(y0), sign_mask);
        float32x4_t pi_adj0 =
            vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(pi), y_sign0));
        result0 = vbslq_f32(x_neg0, vaddq_f32(result0, pi_adj0), result0);

        result0 = vbslq_f32(any_inf0, inf_result0, result0);
        result0 = vmulq_f32(result0, vInvNorm);

        /* --- Process second 4 --- */
        float32x4_t abs_x1 = vabsq_f32(x1);
        float32x4_t abs_y1 = vabsq_f32(y1);

        uint32x4_t input_nan1 =
            vorrq_u32(vmvnq_u32(vceqq_f32(x1, x1)), vmvnq_u32(vceqq_f32(y1, y1)));

        uint32x4_t y_inf1 = vceqq_f32(abs_y1, inf);
        uint32x4_t x_inf1 = vceqq_f32(abs_x1, inf);
        uint32x4_t x_pos1 = vcgtq_f32(x1, zero);
        uint32x4_t both_inf1 = vandq_u32(y_inf1, x_inf1);
        uint32x4_t y_inf_only1 = vbicq_u32(y_inf1, x_inf1);
        uint32x4_t x_inf_only1 = vbicq_u32(x_inf1, y_inf1);
        uint32x4_t any_inf1 = vorrq_u32(y_inf1, x_inf1);

        float32x4_t inf_result1 = zero;
        float32x4_t both_inf_r1 = vbslq_f32(x_pos1, pi_4, three_pi_4);
        both_inf_r1 = vreinterpretq_f32_u32(
            vorrq_u32(vreinterpretq_u32_f32(both_inf_r1),
                      vandq_u32(vreinterpretq_u32_f32(y1), sign_mask)));
        inf_result1 = vbslq_f32(both_inf1, both_inf_r1, inf_result1);
        float32x4_t y_inf_r1 = vreinterpretq_f32_u32(
            vorrq_u32(vreinterpretq_u32_f32(pi_2),
                      vandq_u32(vreinterpretq_u32_f32(y1), sign_mask)));
        inf_result1 = vbslq_f32(y_inf_only1, y_inf_r1, inf_result1);
        float32x4_t x_inf_r1 = vbslq_f32(
            x_pos1,
            vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(y1), sign_mask)),
            vreinterpretq_f32_u32(
                vorrq_u32(vreinterpretq_u32_f32(pi),
                          vandq_u32(vreinterpretq_u32_f32(y1), sign_mask))));
        inf_result1 = vbslq_f32(x_inf_only1, x_inf_r1, inf_result1);

        uint32x4_t swap_mask1 = vcgtq_f32(abs_y1, abs_x1);
        float32x4_t num1 = vbslq_f32(swap_mask1, x1, y1);
        float32x4_t den1 = vbslq_f32(swap_mask1, y1, x1);
        float32x4_t input1 = vmulq_f32(num1, _vinvq_f32(den1));

        uint32x4_t div_nan1 = vbicq_u32(vmvnq_u32(vceqq_f32(input1, input1)), input_nan1);
        input1 = vbslq_f32(div_nan1, num1, input1);

        float32x4_t result1 = _varctan_poly_neonv8(input1);
        uint32x4_t input_sign1 = vandq_u32(vreinterpretq_u32_f32(input1), sign_mask);
        float32x4_t term1 =
            vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(pi_2), input_sign1));
        term1 = vsubq_f32(term1, result1);
        result1 = vbslq_f32(swap_mask1, term1, result1);

        uint32x4_t x_neg1 = vcltq_f32(x1, zero);
        uint32x4_t y_sign1 = vandq_u32(vreinterpretq_u32_f32(y1), sign_mask);
        float32x4_t pi_adj1 =
            vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(pi), y_sign1));
        result1 = vbslq_f32(x_neg1, vaddq_f32(result1, pi_adj1), result1);

        result1 = vbslq_f32(any_inf1, inf_result1, result1);
        result1 = vmulq_f32(result1, vInvNorm);

        vst1q_f32(out, result0);
        vst1q_f32(out + 4, result1);
        out += 8;
    }

    unsigned int number = eighth_points * 8;
    volk_32fc_s32f_atan2_32f_polynomial(
        out, complexVector + number, normalizeFactor, num_points - number);
}
#endif /* LV_HAVE_NEONV8 */

#endif /* INCLUDED_volk_32fc_s32f_atan2_32f_a_H */
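
/*
 * Usage sketch (illustrative; assumes the volk_32fc_s32f_atan2_32f
 * dispatcher that the VOLK build generates for this kernel, plus the
 * standard VOLK allocation helpers):
 *
 *   #include <volk/volk.h>
 *
 *   unsigned int N = 1024;
 *   lv_32fc_t* in = (lv_32fc_t*)volk_malloc(N * sizeof(lv_32fc_t),
 *                                           volk_get_alignment());
 *   float* out = (float*)volk_malloc(N * sizeof(float), volk_get_alignment());
 *   // ... fill in[] with complex samples ...
 *   volk_32fc_s32f_atan2_32f(out, in, 1.f, N); // phase in radians
 *   volk_free(in);
 *   volk_free(out);
 */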

#ifndef INCLUDED_volk_32fc_s32f_atan2_32f_u_H
#define INCLUDED_volk_32fc_s32f_atan2_32f_u_H

#if LV_HAVE_AVX512F && LV_HAVE_AVX512DQ
#include <immintrin.h>
#include <volk/volk_avx512_intrinsics.h> /* _mm512_real/_mm512_imag, arctan helper */
static inline void volk_32fc_s32f_atan2_32f_u_avx512dq(float* outputVector,
                                                       const lv_32fc_t* complexVector,
                                                       const float normalizeFactor,
                                                       unsigned int num_points)
{
    const float* in = (float*)complexVector;
    float* out = (float*)outputVector;

    const float invNormalizeFactor = 1.f / normalizeFactor;
    const __m512 vinvNormalizeFactor = _mm512_set1_ps(invNormalizeFactor);
    const __m512 pi = _mm512_set1_ps(0x1.921fb6p1f);
    const __m512 pi_2 = _mm512_set1_ps(0x1.921fb6p0f);
    const __m512 abs_mask = _mm512_castsi512_ps(_mm512_set1_epi32(0x7FFFFFFF));
    const __m512 sign_mask = _mm512_castsi512_ps(_mm512_set1_epi32(0x80000000));

    const unsigned int sixteenth_points = num_points / 16;

    for (unsigned int number = 0; number < sixteenth_points; number++) {
        __m512 z1 = _mm512_loadu_ps(in);
        in += 16;
        __m512 z2 = _mm512_loadu_ps(in);
        in += 16;

        __m512 x = _mm512_real(z1, z2);
        __m512 y = _mm512_imag(z1, z2);

        // Detect NaN in original inputs before division
        __mmask16 input_nan_mask = _mm512_cmp_ps_mask(x, x, _CMP_UNORD_Q) |
                                   _mm512_cmp_ps_mask(y, y, _CMP_UNORD_Q);

        // Handle infinity cases per IEEE 754
        const __m512 zero = _mm512_setzero_ps();
        const __m512 pi_4 = _mm512_set1_ps(0x1.921fb6p-1f);      // π/4
        const __m512 three_pi_4 = _mm512_set1_ps(0x1.2d97c8p1f); // 3π/4

        __mmask16 y_inf_mask = _mm512_fpclass_ps_mask(y, 0x18); // ±inf
        __mmask16 x_inf_mask = _mm512_fpclass_ps_mask(x, 0x18); // ±inf
        __mmask16 x_pos_mask = _mm512_cmp_ps_mask(x, zero, _CMP_GT_OS);

        // Build infinity result
        __m512 inf_result = zero;
        // Both infinite: ±π/4 or ±3π/4
        __mmask16 both_inf = y_inf_mask & x_inf_mask;
        __m512 both_inf_result = _mm512_mask_blend_ps(x_pos_mask, three_pi_4, pi_4);
        both_inf_result = _mm512_or_ps(both_inf_result, _mm512_and_ps(y, sign_mask));
        inf_result = _mm512_mask_blend_ps(both_inf, inf_result, both_inf_result);

        // y infinite, x finite: ±π/2
        __mmask16 y_inf_only = y_inf_mask & ~x_inf_mask;
        __m512 y_inf_result = _mm512_or_ps(pi_2, _mm512_and_ps(y, sign_mask));
        inf_result = _mm512_mask_blend_ps(y_inf_only, inf_result, y_inf_result);

        // x infinite, y finite: 0 or ±π
        __mmask16 x_inf_only = x_inf_mask & ~y_inf_mask;
        __m512 x_inf_result =
            _mm512_mask_blend_ps(x_pos_mask,
                                 _mm512_or_ps(pi, _mm512_and_ps(y, sign_mask)),
                                 _mm512_or_ps(zero, _mm512_and_ps(y, sign_mask)));
        inf_result = _mm512_mask_blend_ps(x_inf_only, inf_result, x_inf_result);

        __mmask16 any_inf_mask = y_inf_mask | x_inf_mask;

        __mmask16 swap_mask = _mm512_cmp_ps_mask(
            _mm512_and_ps(y, abs_mask), _mm512_and_ps(x, abs_mask), _CMP_GT_OS);
        __m512 numerator = _mm512_mask_blend_ps(swap_mask, y, x);
        __m512 denominator = _mm512_mask_blend_ps(swap_mask, x, y);
        __m512 input = _mm512_div_ps(numerator, denominator);

        // Only handle NaN from division (0/0, inf/inf), not from NaN inputs
        // Replace with numerator to preserve sign (e.g., atan2(-0, 0) = -0)
        __mmask16 div_nan_mask =
            _mm512_cmp_ps_mask(input, input, _CMP_UNORD_Q) & ~input_nan_mask;
        input = _mm512_mask_blend_ps(div_nan_mask, input, numerator);
        __m512 result = _mm512_arctan_poly_avx512(input);

        input =
            _mm512_sub_ps(_mm512_or_ps(pi_2, _mm512_and_ps(input, sign_mask)), result);
        result = _mm512_mask_blend_ps(swap_mask, result, input);

        __m512 x_sign_mask =
            _mm512_castsi512_ps(_mm512_srai_epi32(_mm512_castps_si512(x), 31));

        result = _mm512_add_ps(
            _mm512_and_ps(_mm512_xor_ps(pi, _mm512_and_ps(sign_mask, y)), x_sign_mask),
            result);

        // Select infinity result or normal result
        result = _mm512_mask_blend_ps(any_inf_mask, result, inf_result);

        result = _mm512_mul_ps(result, vinvNormalizeFactor);

        _mm512_storeu_ps(out, result);
        out += 16;
    }

    unsigned int number = sixteenth_points * 16;
    volk_32fc_s32f_atan2_32f_polynomial(
        out, complexVector + number, normalizeFactor, num_points - number);
}
#endif /* LV_HAVE_AVX512F && LV_HAVE_AVX512DQ for unaligned */

#if LV_HAVE_AVX2 && LV_HAVE_FMA
#include <immintrin.h>
#include <volk/volk_avx2_fma_intrinsics.h> /* arctan polynomial helper */
static inline void volk_32fc_s32f_atan2_32f_u_avx2_fma(float* outputVector,
                                                       const lv_32fc_t* complexVector,
                                                       const float normalizeFactor,
                                                       unsigned int num_points)
{
    const float* in = (float*)complexVector;
    float* out = (float*)outputVector;

    const float invNormalizeFactor = 1.f / normalizeFactor;
    const __m256 vinvNormalizeFactor = _mm256_set1_ps(invNormalizeFactor);
    const __m256 pi = _mm256_set1_ps(0x1.921fb6p1f);
    const __m256 pi_2 = _mm256_set1_ps(0x1.921fb6p0f);
    const __m256 abs_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF));
    const __m256 sign_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000));

    unsigned int number = 0;
    const unsigned int eighth_points = num_points / 8;
    for (; number < eighth_points; number++) {
        __m256 z1 = _mm256_loadu_ps(in);
        in += 8;
        __m256 z2 = _mm256_loadu_ps(in);
        in += 8;

        __m256 x = _mm256_real(z1, z2);
        __m256 y = _mm256_imag(z1, z2);

        // Detect NaN in original inputs before division
        __m256 input_nan_mask = _mm256_or_ps(_mm256_cmp_ps(x, x, _CMP_UNORD_Q),
                                             _mm256_cmp_ps(y, y, _CMP_UNORD_Q));

        // Handle infinity cases per IEEE 754
        const __m256 zero = _mm256_setzero_ps();
        const __m256 inf = _mm256_set1_ps(HUGE_VALF);
        const __m256 pi_4 = _mm256_set1_ps(0x1.921fb6p-1f);      // π/4
        const __m256 three_pi_4 = _mm256_set1_ps(0x1.2d97c8p1f); // 3π/4

        __m256 y_abs = _mm256_and_ps(y, abs_mask);
        __m256 x_abs = _mm256_and_ps(x, abs_mask);
        __m256 y_inf_mask = _mm256_cmp_ps(y_abs, inf, _CMP_EQ_OQ); // |y| == inf
        __m256 x_inf_mask = _mm256_cmp_ps(x_abs, inf, _CMP_EQ_OQ); // |x| == inf
        __m256 x_pos_mask = _mm256_cmp_ps(x, zero, _CMP_GT_OS);

        // Build infinity result
        __m256 inf_result = zero;
        // Both infinite: ±π/4 or ±3π/4
        __m256 both_inf = _mm256_and_ps(y_inf_mask, x_inf_mask);
        __m256 both_inf_result = _mm256_blendv_ps(three_pi_4, pi_4, x_pos_mask);
        both_inf_result = _mm256_or_ps(both_inf_result, _mm256_and_ps(y, sign_mask));
        inf_result = _mm256_blendv_ps(inf_result, both_inf_result, both_inf);

        // y infinite, x finite: ±π/2
        __m256 y_inf_only = _mm256_andnot_ps(x_inf_mask, y_inf_mask);
        __m256 y_inf_result = _mm256_or_ps(pi_2, _mm256_and_ps(y, sign_mask));
        inf_result = _mm256_blendv_ps(inf_result, y_inf_result, y_inf_only);

        // x infinite, y finite: 0 or ±π
        __m256 x_inf_only = _mm256_andnot_ps(y_inf_mask, x_inf_mask);
        __m256 x_inf_result =
            _mm256_blendv_ps(_mm256_or_ps(pi, _mm256_and_ps(y, sign_mask)),
                             _mm256_or_ps(zero, _mm256_and_ps(y, sign_mask)),
                             x_pos_mask);
        inf_result = _mm256_blendv_ps(inf_result, x_inf_result, x_inf_only);

        __m256 any_inf_mask = _mm256_or_ps(y_inf_mask, x_inf_mask);

        __m256 swap_mask = _mm256_cmp_ps(
            _mm256_and_ps(y, abs_mask), _mm256_and_ps(x, abs_mask), _CMP_GT_OS);
        __m256 numerator = _mm256_blendv_ps(y, x, swap_mask);
        __m256 denominator = _mm256_blendv_ps(x, y, swap_mask);
        __m256 input = _mm256_div_ps(numerator, denominator);

        // Only handle NaN from division (0/0, inf/inf), not from NaN inputs
        // Replace with numerator to preserve sign (e.g., atan2(-0, 0) = -0)
        __m256 div_nan_mask =
            _mm256_andnot_ps(input_nan_mask, _mm256_cmp_ps(input, input, _CMP_UNORD_Q));
        input = _mm256_blendv_ps(input, numerator, div_nan_mask);
        __m256 result = _mm256_arctan_poly_avx2_fma(input);

        input =
            _mm256_sub_ps(_mm256_or_ps(pi_2, _mm256_and_ps(input, sign_mask)), result);
        result = _mm256_blendv_ps(result, input, swap_mask);

        __m256 x_sign_mask =
            _mm256_castsi256_ps(_mm256_srai_epi32(_mm256_castps_si256(x), 31));

        result = _mm256_add_ps(
            _mm256_and_ps(_mm256_xor_ps(pi, _mm256_and_ps(sign_mask, y)), x_sign_mask),
            result);

        // Select infinity result or normal result
        result = _mm256_blendv_ps(result, inf_result, any_inf_mask);

        result = _mm256_mul_ps(result, vinvNormalizeFactor);

        _mm256_storeu_ps(out, result);
        out += 8;
    }

    number = eighth_points * 8;
    volk_32fc_s32f_atan2_32f_polynomial(
        out, (lv_32fc_t*)in, normalizeFactor, num_points - number);
}
#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for unaligned */

#if LV_HAVE_AVX2
#include <immintrin.h>
#include <volk/volk_avx2_intrinsics.h> /* _mm256_real/_mm256_imag, arctan helper */
static inline void volk_32fc_s32f_atan2_32f_u_avx2(float* outputVector,
                                                   const lv_32fc_t* complexVector,
                                                   const float normalizeFactor,
                                                   unsigned int num_points)
{
    const float* in = (float*)complexVector;
    float* out = (float*)outputVector;

    const float invNormalizeFactor = 1.f / normalizeFactor;
    const __m256 vinvNormalizeFactor = _mm256_set1_ps(invNormalizeFactor);
    const __m256 pi = _mm256_set1_ps(0x1.921fb6p1f);
    const __m256 pi_2 = _mm256_set1_ps(0x1.921fb6p0f);
    const __m256 abs_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF));
    const __m256 sign_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000));

    unsigned int number = 0;
    const unsigned int eighth_points = num_points / 8;
    for (; number < eighth_points; number++) {
        __m256 z1 = _mm256_loadu_ps(in);
        in += 8;
        __m256 z2 = _mm256_loadu_ps(in);
        in += 8;

        __m256 x = _mm256_real(z1, z2);
        __m256 y = _mm256_imag(z1, z2);

        // Detect NaN in original inputs before division
        __m256 input_nan_mask = _mm256_or_ps(_mm256_cmp_ps(x, x, _CMP_UNORD_Q),
                                             _mm256_cmp_ps(y, y, _CMP_UNORD_Q));

        // Handle infinity cases per IEEE 754
        const __m256 zero = _mm256_setzero_ps();
        const __m256 inf = _mm256_set1_ps(HUGE_VALF);
        const __m256 pi_4 = _mm256_set1_ps(0x1.921fb6p-1f);      // π/4
        const __m256 three_pi_4 = _mm256_set1_ps(0x1.2d97c8p1f); // 3π/4

        __m256 y_abs = _mm256_and_ps(y, abs_mask);
        __m256 x_abs = _mm256_and_ps(x, abs_mask);
        __m256 y_inf_mask = _mm256_cmp_ps(y_abs, inf, _CMP_EQ_OQ); // |y| == inf
        __m256 x_inf_mask = _mm256_cmp_ps(x_abs, inf, _CMP_EQ_OQ); // |x| == inf
        __m256 x_pos_mask = _mm256_cmp_ps(x, zero, _CMP_GT_OS);

        // Build infinity result
        __m256 inf_result = zero;
        // Both infinite: ±π/4 or ±3π/4
        __m256 both_inf = _mm256_and_ps(y_inf_mask, x_inf_mask);
        __m256 both_inf_result = _mm256_blendv_ps(three_pi_4, pi_4, x_pos_mask);
        both_inf_result = _mm256_or_ps(both_inf_result, _mm256_and_ps(y, sign_mask));
        inf_result = _mm256_blendv_ps(inf_result, both_inf_result, both_inf);

        // y infinite, x finite: ±π/2
        __m256 y_inf_only = _mm256_andnot_ps(x_inf_mask, y_inf_mask);
        __m256 y_inf_result = _mm256_or_ps(pi_2, _mm256_and_ps(y, sign_mask));
        inf_result = _mm256_blendv_ps(inf_result, y_inf_result, y_inf_only);

        // x infinite, y finite: 0 or ±π
        __m256 x_inf_only = _mm256_andnot_ps(y_inf_mask, x_inf_mask);
        __m256 x_inf_result =
            _mm256_blendv_ps(_mm256_or_ps(pi, _mm256_and_ps(y, sign_mask)),
                             _mm256_or_ps(zero, _mm256_and_ps(y, sign_mask)),
                             x_pos_mask);
        inf_result = _mm256_blendv_ps(inf_result, x_inf_result, x_inf_only);

        __m256 any_inf_mask = _mm256_or_ps(y_inf_mask, x_inf_mask);

        __m256 swap_mask = _mm256_cmp_ps(
            _mm256_and_ps(y, abs_mask), _mm256_and_ps(x, abs_mask), _CMP_GT_OS);
        __m256 numerator = _mm256_blendv_ps(y, x, swap_mask);
        __m256 denominator = _mm256_blendv_ps(x, y, swap_mask);
        __m256 input = _mm256_div_ps(numerator, denominator);

        // Only handle NaN from division (0/0, inf/inf), not from NaN inputs
        // Replace with numerator to preserve sign (e.g., atan2(-0, 0) = -0)
        __m256 div_nan_mask =
            _mm256_andnot_ps(input_nan_mask, _mm256_cmp_ps(input, input, _CMP_UNORD_Q));
        input = _mm256_blendv_ps(input, numerator, div_nan_mask);
        __m256 result = _mm256_arctan_poly_avx(input);

        input =
            _mm256_sub_ps(_mm256_or_ps(pi_2, _mm256_and_ps(input, sign_mask)), result);
        result = _mm256_blendv_ps(result, input, swap_mask);

        __m256 x_sign_mask =
            _mm256_castsi256_ps(_mm256_srai_epi32(_mm256_castps_si256(x), 31));

        result = _mm256_add_ps(
            _mm256_and_ps(_mm256_xor_ps(pi, _mm256_and_ps(sign_mask, y)), x_sign_mask),
            result);

        // Select infinity result or normal result
        result = _mm256_blendv_ps(result, inf_result, any_inf_mask);

        result = _mm256_mul_ps(result, vinvNormalizeFactor);

        _mm256_storeu_ps(out, result);
        out += 8;
    }

    number = eighth_points * 8;
    volk_32fc_s32f_atan2_32f_polynomial(
        out, (lv_32fc_t*)in, normalizeFactor, num_points - number);
}
#endif /* LV_HAVE_AVX2 for unaligned */

#ifdef LV_HAVE_RVV
#include <riscv_vector.h>
#include <volk/volk_rvv_intrinsics.h> /* RISCV_VMFLTZ */

static inline void volk_32fc_s32f_atan2_32f_rvv(float* outputVector,
                                                const lv_32fc_t* inputVector,
                                                const float normalizeFactor,
                                                unsigned int num_points)
{
    size_t vlmax = __riscv_vsetvlmax_e32m2();

    const vfloat32m2_t norm = __riscv_vfmv_v_f_f32m2(1 / normalizeFactor, vlmax);
    const vfloat32m2_t cpi = __riscv_vfmv_v_f_f32m2(3.1415927f, vlmax);
    const vfloat32m2_t cpio2 = __riscv_vfmv_v_f_f32m2(1.5707964f, vlmax);
    const vfloat32m2_t c1 = __riscv_vfmv_v_f_f32m2(+0x1.ffffeap-1f, vlmax);
    const vfloat32m2_t c3 = __riscv_vfmv_v_f_f32m2(-0x1.55437p-2f, vlmax);
    const vfloat32m2_t c5 = __riscv_vfmv_v_f_f32m2(+0x1.972be6p-3f, vlmax);
    const vfloat32m2_t c7 = __riscv_vfmv_v_f_f32m2(-0x1.1436ap-3f, vlmax);
    const vfloat32m2_t c9 = __riscv_vfmv_v_f_f32m2(+0x1.5785aap-4f, vlmax);
    const vfloat32m2_t c11 = __riscv_vfmv_v_f_f32m2(-0x1.2f3004p-5f, vlmax);
    const vfloat32m2_t c13 = __riscv_vfmv_v_f_f32m2(+0x1.01a37cp-7f, vlmax);

    const vfloat32m2_t zero = __riscv_vfmv_v_f_f32m2(0.0f, vlmax);
    const vfloat32m2_t inf = __riscv_vfmv_v_f_f32m2(HUGE_VALF, vlmax);
    const vfloat32m2_t pi_4 = __riscv_vfmv_v_f_f32m2(0x1.921fb6p-1f, vlmax);      // π/4
    const vfloat32m2_t three_pi_4 = __riscv_vfmv_v_f_f32m2(0x1.2d97c8p1f, vlmax); // 3π/4

    size_t n = num_points;
    for (size_t vl; n > 0; n -= vl, inputVector += vl, outputVector += vl) {
        vl = __riscv_vsetvl_e32m2(n);
        vuint64m4_t v = __riscv_vle64_v_u64m4((const uint64_t*)inputVector, vl);
        vfloat32m2_t vr = __riscv_vreinterpret_f32m2(__riscv_vnsrl(v, 0, vl));
        vfloat32m2_t vi = __riscv_vreinterpret_f32m2(__riscv_vnsrl(v, 32, vl));

        // Detect NaN in original inputs before division
        vbool16_t input_nan_mask =
            __riscv_vmor(__riscv_vmfne(vr, vr, vl), __riscv_vmfne(vi, vi, vl), vl);

        // Handle infinity cases per IEEE 754
        vfloat32m2_t vr_abs = __riscv_vfabs(vr, vl);
        vfloat32m2_t vi_abs = __riscv_vfabs(vi, vl);
        vbool16_t vr_inf_mask = __riscv_vmfeq(vr_abs, inf, vl); // |vr| == inf
        vbool16_t vi_inf_mask = __riscv_vmfeq(vi_abs, inf, vl); // |vi| == inf
        vbool16_t vr_pos_mask = __riscv_vmfgt(vr, zero, vl);

        // Build infinity result
        vfloat32m2_t inf_result = zero;
        // Both infinite: ±π/4 or ±3π/4
        vbool16_t both_inf = __riscv_vmand(vi_inf_mask, vr_inf_mask, vl);
        vfloat32m2_t both_inf_result = __riscv_vmerge(three_pi_4, pi_4, vr_pos_mask, vl);
        both_inf_result = __riscv_vfsgnj(both_inf_result, vi, vl); // Copy sign from vi
        inf_result = __riscv_vmerge(inf_result, both_inf_result, both_inf, vl);

        // vi infinite, vr finite: ±π/2
        vbool16_t vi_inf_only = __riscv_vmandn(vi_inf_mask, vr_inf_mask, vl);
        vfloat32m2_t vi_inf_result = __riscv_vfsgnj(cpio2, vi, vl); // π/2 with sign of vi
        inf_result = __riscv_vmerge(inf_result, vi_inf_result, vi_inf_only, vl);

        // vr infinite, vi finite: 0 or ±π
        vbool16_t vr_inf_only = __riscv_vmandn(vr_inf_mask, vi_inf_mask, vl);
        vfloat32m2_t vr_inf_result =
            __riscv_vmerge(__riscv_vfsgnj(cpi, vi, vl),  // π with sign of vi
                           __riscv_vfsgnj(zero, vi, vl), // 0 with sign of vi
                           vr_pos_mask,
                           vl);
        inf_result = __riscv_vmerge(inf_result, vr_inf_result, vr_inf_only, vl);

        vbool16_t any_inf_mask = __riscv_vmor(vi_inf_mask, vr_inf_mask, vl);

        vbool16_t mswap = __riscv_vmfgt(vi_abs, vr_abs, vl);
        vfloat32m2_t numerator = __riscv_vmerge(vi, vr, mswap, vl);
        vfloat32m2_t denominator = __riscv_vmerge(vr, vi, mswap, vl);
        vfloat32m2_t x = __riscv_vfdiv(numerator, denominator, vl);

        // Only handle NaN from division (0/0, inf/inf), not from NaN inputs
        // Replace with numerator to preserve sign (e.g., atan2(-0, 0) = -0)
        vbool16_t x_nan_mask = __riscv_vmfne(x, x, vl);
        // div_nan_mask = x_nan_mask & ~input_nan_mask (vmandn computes vs2 & ~vs1)
        vbool16_t div_nan_mask = __riscv_vmandn(x_nan_mask, input_nan_mask, vl);
        x = __riscv_vmerge(x, numerator, div_nan_mask, vl);

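        // atan(x) ≈ x·(c1 + c3·x² + ... + c13·x¹²) for |x| ≤ 1,
        // evaluated with Horner's scheme in x²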
        vfloat32m2_t xx = __riscv_vfmul(x, x, vl);
        vfloat32m2_t p = c13;
        p = __riscv_vfmadd(p, xx, c11, vl);
        p = __riscv_vfmadd(p, xx, c9, vl);
        p = __riscv_vfmadd(p, xx, c7, vl);
        p = __riscv_vfmadd(p, xx, c5, vl);
        p = __riscv_vfmadd(p, xx, c3, vl);
        p = __riscv_vfmadd(p, xx, c1, vl);
        p = __riscv_vfmul(p, x, vl);

        x = __riscv_vfsub(__riscv_vfsgnj(cpio2, x, vl), p, vl);
        p = __riscv_vmerge(p, x, mswap, vl);
        p = __riscv_vfadd_mu(
            RISCV_VMFLTZ(32m2, vr, vl), p, p, __riscv_vfsgnjx(cpi, vi, vl), vl);

        // Select infinity result or normal result
        p = __riscv_vmerge(p, inf_result, any_inf_mask, vl);

        __riscv_vse32(outputVector, __riscv_vfmul(p, norm, vl), vl);
    }
}
#endif /* LV_HAVE_RVV */

#ifdef LV_HAVE_RVVSEG
#include <riscv_vector.h>
#include <volk/volk_rvv_intrinsics.h> /* RISCV_VMFLTZ */

static inline void volk_32fc_s32f_atan2_32f_rvvseg(float* outputVector,
                                                   const lv_32fc_t* inputVector,
                                                   const float normalizeFactor,
                                                   unsigned int num_points)
{
    size_t vlmax = __riscv_vsetvlmax_e32m2();

    const vfloat32m2_t norm = __riscv_vfmv_v_f_f32m2(1 / normalizeFactor, vlmax);
    const vfloat32m2_t cpi = __riscv_vfmv_v_f_f32m2(3.1415927f, vlmax);
    const vfloat32m2_t cpio2 = __riscv_vfmv_v_f_f32m2(1.5707964f, vlmax);
    const vfloat32m2_t c1 = __riscv_vfmv_v_f_f32m2(+0x1.ffffeap-1f, vlmax);
    const vfloat32m2_t c3 = __riscv_vfmv_v_f_f32m2(-0x1.55437p-2f, vlmax);
    const vfloat32m2_t c5 = __riscv_vfmv_v_f_f32m2(+0x1.972be6p-3f, vlmax);
    const vfloat32m2_t c7 = __riscv_vfmv_v_f_f32m2(-0x1.1436ap-3f, vlmax);
    const vfloat32m2_t c9 = __riscv_vfmv_v_f_f32m2(+0x1.5785aap-4f, vlmax);
    const vfloat32m2_t c11 = __riscv_vfmv_v_f_f32m2(-0x1.2f3004p-5f, vlmax);
    const vfloat32m2_t c13 = __riscv_vfmv_v_f_f32m2(+0x1.01a37cp-7f, vlmax);

    const vfloat32m2_t zero = __riscv_vfmv_v_f_f32m2(0.0f, vlmax);
    const vfloat32m2_t inf = __riscv_vfmv_v_f_f32m2(HUGE_VALF, vlmax);
    const vfloat32m2_t pi_4 = __riscv_vfmv_v_f_f32m2(0x1.921fb6p-1f, vlmax);      // π/4
    const vfloat32m2_t three_pi_4 = __riscv_vfmv_v_f_f32m2(0x1.2d97c8p1f, vlmax); // 3π/4

    size_t n = num_points;
    for (size_t vl; n > 0; n -= vl, inputVector += vl, outputVector += vl) {
        vl = __riscv_vsetvl_e32m2(n);
        vfloat32m2x2_t v = __riscv_vlseg2e32_v_f32m2x2((const float*)inputVector, vl);
        vfloat32m2_t vr = __riscv_vget_f32m2(v, 0), vi = __riscv_vget_f32m2(v, 1);

        // Detect NaN in original inputs before division
        vbool16_t input_nan_mask =
            __riscv_vmor(__riscv_vmfne(vr, vr, vl), __riscv_vmfne(vi, vi, vl), vl);

        // Handle infinity cases per IEEE 754
        vfloat32m2_t vr_abs = __riscv_vfabs(vr, vl);
        vfloat32m2_t vi_abs = __riscv_vfabs(vi, vl);
        vbool16_t vr_inf_mask = __riscv_vmfeq(vr_abs, inf, vl); // |vr| == inf
        vbool16_t vi_inf_mask = __riscv_vmfeq(vi_abs, inf, vl); // |vi| == inf
        vbool16_t vr_pos_mask = __riscv_vmfgt(vr, zero, vl);

        // Build infinity result
        vfloat32m2_t inf_result = zero;
        // Both infinite: ±π/4 or ±3π/4
        vbool16_t both_inf = __riscv_vmand(vi_inf_mask, vr_inf_mask, vl);
        vfloat32m2_t both_inf_result = __riscv_vmerge(three_pi_4, pi_4, vr_pos_mask, vl);
        both_inf_result = __riscv_vfsgnj(both_inf_result, vi, vl); // Copy sign from vi
        inf_result = __riscv_vmerge(inf_result, both_inf_result, both_inf, vl);

        // vi infinite, vr finite: ±π/2
        vbool16_t vi_inf_only = __riscv_vmandn(vi_inf_mask, vr_inf_mask, vl);
        vfloat32m2_t vi_inf_result = __riscv_vfsgnj(cpio2, vi, vl); // π/2 with sign of vi
        inf_result = __riscv_vmerge(inf_result, vi_inf_result, vi_inf_only, vl);

        // vr infinite, vi finite: 0 or ±π
        vbool16_t vr_inf_only = __riscv_vmandn(vr_inf_mask, vi_inf_mask, vl);
        vfloat32m2_t vr_inf_result =
            __riscv_vmerge(__riscv_vfsgnj(cpi, vi, vl),  // π with sign of vi
                           __riscv_vfsgnj(zero, vi, vl), // 0 with sign of vi
                           vr_pos_mask,
                           vl);
        inf_result = __riscv_vmerge(inf_result, vr_inf_result, vr_inf_only, vl);

        vbool16_t any_inf_mask = __riscv_vmor(vi_inf_mask, vr_inf_mask, vl);

        vbool16_t mswap = __riscv_vmfgt(vi_abs, vr_abs, vl);
        vfloat32m2_t numerator = __riscv_vmerge(vi, vr, mswap, vl);
        vfloat32m2_t denominator = __riscv_vmerge(vr, vi, mswap, vl);
        vfloat32m2_t x = __riscv_vfdiv(numerator, denominator, vl);

        // Only handle NaN from division (0/0, inf/inf), not from NaN inputs
        // Replace with numerator to preserve sign (e.g., atan2(-0, 0) = -0)
        vbool16_t x_nan_mask = __riscv_vmfne(x, x, vl);
        // div_nan_mask = x_nan_mask & ~input_nan_mask (vmandn computes vs2 & ~vs1)
        vbool16_t div_nan_mask = __riscv_vmandn(x_nan_mask, input_nan_mask, vl);
        x = __riscv_vmerge(x, numerator, div_nan_mask, vl);

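        // atan(x) ≈ x·(c1 + c3·x² + ... + c13·x¹²) for |x| ≤ 1,
        // evaluated with Horner's scheme in x²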
        vfloat32m2_t xx = __riscv_vfmul(x, x, vl);
        vfloat32m2_t p = c13;
        p = __riscv_vfmadd(p, xx, c11, vl);
        p = __riscv_vfmadd(p, xx, c9, vl);
        p = __riscv_vfmadd(p, xx, c7, vl);
        p = __riscv_vfmadd(p, xx, c5, vl);
        p = __riscv_vfmadd(p, xx, c3, vl);
        p = __riscv_vfmadd(p, xx, c1, vl);
        p = __riscv_vfmul(p, x, vl);

        x = __riscv_vfsub(__riscv_vfsgnj(cpio2, x, vl), p, vl);
        p = __riscv_vmerge(p, x, mswap, vl);
        p = __riscv_vfadd_mu(
            RISCV_VMFLTZ(32m2, vr, vl), p, p, __riscv_vfsgnjx(cpi, vi, vl), vl);

        // Select infinity result or normal result
        p = __riscv_vmerge(p, inf_result, any_inf_mask, vl);

        __riscv_vse32(outputVector, __riscv_vfmul(p, norm, vl), vl);
    }
}
#endif /* LV_HAVE_RVVSEG */

#endif /* INCLUDED_volk_32fc_s32f_atan2_32f_u_H */