Vector Optimized Library of Kernels 3.3.0
Architecture-tuned implementations of math kernels
Loading...
Searching...
No Matches
volk_32f_sincos_32f_x2.h
Go to the documentation of this file.
1/* -*- c++ -*- */
2/*
3 * Copyright 2025 Magnus Lundmark <magnuslundmark@gmail.com>
4 *
5 * This file is part of VOLK
6 *
7 * SPDX-License-Identifier: LGPL-3.0-or-later
8 */
9
57
58#include <inttypes.h>
59#include <math.h>
60#include <stdio.h>
61
62#ifndef INCLUDED_volk_32f_sincos_32f_x2_a_H
63#define INCLUDED_volk_32f_sincos_32f_x2_a_H
64
65#ifdef LV_HAVE_GENERIC
66
67static inline void volk_32f_sincos_32f_x2_generic(float* sinVector,
68 float* cosVector,
69 const float* inVector,
70 unsigned int num_points)
71{
72 for (unsigned int i = 0; i < num_points; i++) {
73 sinVector[i] = sinf(inVector[i]);
74 cosVector[i] = cosf(inVector[i]);
75 }
76}
77
78#endif /* LV_HAVE_GENERIC */
79
80#ifdef LV_HAVE_GENERIC
81#include <volk/volk_common.h>
82
83static inline void volk_32f_sincos_32f_x2_polynomial(float* sinVector,
84 float* cosVector,
85 const float* inVector,
86 unsigned int num_points)
87{
88 /*
89 * Cody-Waite argument reduction with shared sin/cos polynomial evaluation
90 */
91 const float two_over_pi = 0x1.45f306p-1f;
92 const float pi_over_2_hi = 0x1.921fb6p+0f;
93 const float pi_over_2_lo = -0x1.777a5cp-25f;
94
95 for (unsigned int i = 0; i < num_points; i++) {
96 float x = inVector[i];
97
98 float n_f = rintf(x * two_over_pi);
99 int n = (int)n_f;
100
101 float r = fmaf(-n_f, pi_over_2_hi, x);
102 r = fmaf(-n_f, pi_over_2_lo, r);
103
104 float sin_r = volk_sin_poly(r);
105 float cos_r = volk_cos_poly(r);
106
107 // Sin: n&1 swaps, n&2 negates
108 float sin_result = (n & 1) ? cos_r : sin_r;
109 sinVector[i] = (n & 2) ? -sin_result : sin_result;
110
111 // Cos: n&1 swaps, (n+1)&2 negates
112 float cos_result = (n & 1) ? sin_r : cos_r;
113 cosVector[i] = ((n + 1) & 2) ? -cos_result : cos_result;
114 }
115}
116#endif /* LV_HAVE_GENERIC */
117
118#ifdef LV_HAVE_AVX512F
119#include <immintrin.h>
121
122static inline void volk_32f_sincos_32f_x2_a_avx512f(float* sinVector,
123 float* cosVector,
124 const float* inVector,
125 unsigned int num_points)
126{
127 float* sinPtr = sinVector;
128 float* cosPtr = cosVector;
129 const float* inPtr = inVector;
130
131 unsigned int number = 0;
132 unsigned int sixteenPoints = num_points / 16;
133
134 // Constants for Cody-Waite argument reduction
135 const __m512 two_over_pi = _mm512_set1_ps(0x1.45f306p-1f);
136 const __m512 pi_over_2_hi = _mm512_set1_ps(0x1.921fb6p+0f);
137 const __m512 pi_over_2_lo = _mm512_set1_ps(-0x1.777a5cp-25f);
138
139 const __m512i ones = _mm512_set1_epi32(1);
140 const __m512i twos = _mm512_set1_epi32(2);
141 const __m512i sign_bit = _mm512_set1_epi32(0x80000000);
142
143 for (; number < sixteenPoints; number++) {
144 __m512 x = _mm512_load_ps(inPtr);
145
146 // Argument reduction: n = round(x * 2/pi)
147 __m512 n_f = _mm512_roundscale_ps(_mm512_mul_ps(x, two_over_pi),
148 _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
149 __m512i n = _mm512_cvtps_epi32(n_f);
150
151 // r = x - n * (pi/2)
152 __m512 r = _mm512_fnmadd_ps(n_f, pi_over_2_hi, x);
153 r = _mm512_fnmadd_ps(n_f, pi_over_2_lo, r);
154
155 // Evaluate both sin and cos polynomials
156 __m512 sin_r = _mm512_sin_poly_avx512(r);
157 __m512 cos_r = _mm512_cos_poly_avx512(r);
158
159 // Quadrant selection
160 __m512i n_and_1 = _mm512_and_si512(n, ones);
161 __m512i n_and_2 = _mm512_and_si512(n, twos);
162 __m512i n_plus_1_and_2 = _mm512_and_si512(_mm512_add_epi32(n, ones), twos);
163
164 // For sin: swap when n&1, negate when n&2
165 __mmask16 sin_swap = _mm512_cmpeq_epi32_mask(n_and_1, ones);
166 __m512 sin_result = _mm512_mask_blend_ps(sin_swap, sin_r, cos_r);
167 __mmask16 sin_neg = _mm512_cmpeq_epi32_mask(n_and_2, twos);
168 sin_result =
169 _mm512_castsi512_ps(_mm512_mask_xor_epi32(_mm512_castps_si512(sin_result),
170 sin_neg,
171 _mm512_castps_si512(sin_result),
172 sign_bit));
173
174 // For cos: swap when n&1, negate when (n+1)&2
175 __mmask16 cos_swap = sin_swap;
176 __m512 cos_result = _mm512_mask_blend_ps(cos_swap, cos_r, sin_r);
177 __mmask16 cos_neg = _mm512_cmpeq_epi32_mask(n_plus_1_and_2, twos);
178 cos_result =
179 _mm512_castsi512_ps(_mm512_mask_xor_epi32(_mm512_castps_si512(cos_result),
180 cos_neg,
181 _mm512_castps_si512(cos_result),
182 sign_bit));
183
184 _mm512_store_ps(sinPtr, sin_result);
185 _mm512_store_ps(cosPtr, cos_result);
186 inPtr += 16;
187 sinPtr += 16;
188 cosPtr += 16;
189 }
190
191 number = sixteenPoints * 16;
192 for (; number < num_points; number++) {
193 *sinPtr++ = sinf(*inPtr);
194 *cosPtr++ = cosf(*inPtr++);
195 }
196}
197
198#endif /* LV_HAVE_AVX512F for aligned */
199
200#if LV_HAVE_AVX2 && LV_HAVE_FMA
201#include <immintrin.h>
203
204static inline void volk_32f_sincos_32f_x2_a_avx2_fma(float* sinVector,
205 float* cosVector,
206 const float* inVector,
207 unsigned int num_points)
208{
209 float* sinPtr = sinVector;
210 float* cosPtr = cosVector;
211 const float* inPtr = inVector;
212
213 unsigned int number = 0;
214 unsigned int eighthPoints = num_points / 8;
215
216 // Constants for Cody-Waite argument reduction
217 const __m256 two_over_pi = _mm256_set1_ps(0x1.45f306p-1f);
218 const __m256 pi_over_2_hi = _mm256_set1_ps(0x1.921fb6p+0f);
219 const __m256 pi_over_2_lo = _mm256_set1_ps(-0x1.777a5cp-25f);
220
221 const __m256i ones = _mm256_set1_epi32(1);
222 const __m256i twos = _mm256_set1_epi32(2);
223 const __m256 sign_bit = _mm256_set1_ps(-0.0f);
224
225 for (; number < eighthPoints; number++) {
226 __m256 x = _mm256_load_ps(inPtr);
227
228 // Argument reduction: n = round(x * 2/pi)
229 __m256 n_f = _mm256_round_ps(_mm256_mul_ps(x, two_over_pi),
230 _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
231 __m256i n = _mm256_cvtps_epi32(n_f);
232
233 // r = x - n * (pi/2)
234 __m256 r = _mm256_fnmadd_ps(n_f, pi_over_2_hi, x);
235 r = _mm256_fnmadd_ps(n_f, pi_over_2_lo, r);
236
237 // Evaluate both sin and cos polynomials
238 __m256 sin_r = _mm256_sin_poly_avx2_fma(r);
239 __m256 cos_r = _mm256_cos_poly_avx2_fma(r);
240
241 // Quadrant selection
242 __m256i n_and_1 = _mm256_and_si256(n, ones);
243 __m256i n_and_2 = _mm256_and_si256(n, twos);
244 __m256i n_plus_1_and_2 = _mm256_and_si256(_mm256_add_epi32(n, ones), twos);
245
246 // For sin: swap when n&1, negate when n&2
247 __m256 sin_swap = _mm256_castsi256_ps(_mm256_cmpeq_epi32(n_and_1, ones));
248 __m256 sin_result = _mm256_blendv_ps(sin_r, cos_r, sin_swap);
249 __m256 sin_neg = _mm256_castsi256_ps(_mm256_cmpeq_epi32(n_and_2, twos));
250 sin_result = _mm256_xor_ps(sin_result, _mm256_and_ps(sin_neg, sign_bit));
251
252 // For cos: swap when n&1, negate when (n+1)&2
253 __m256 cos_result = _mm256_blendv_ps(cos_r, sin_r, sin_swap);
254 __m256 cos_neg = _mm256_castsi256_ps(_mm256_cmpeq_epi32(n_plus_1_and_2, twos));
255 cos_result = _mm256_xor_ps(cos_result, _mm256_and_ps(cos_neg, sign_bit));
256
257 _mm256_store_ps(sinPtr, sin_result);
258 _mm256_store_ps(cosPtr, cos_result);
259 inPtr += 8;
260 sinPtr += 8;
261 cosPtr += 8;
262 }
263
264 number = eighthPoints * 8;
265 for (; number < num_points; number++) {
266 *sinPtr++ = sinf(*inPtr);
267 *cosPtr++ = cosf(*inPtr++);
268 }
269}
270
271#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for aligned */
272
273#ifdef LV_HAVE_AVX2
274#include <immintrin.h>
276
277static inline void volk_32f_sincos_32f_x2_a_avx2(float* sinVector,
278 float* cosVector,
279 const float* inVector,
280 unsigned int num_points)
281{
282 float* sinPtr = sinVector;
283 float* cosPtr = cosVector;
284 const float* inPtr = inVector;
285
286 unsigned int number = 0;
287 unsigned int eighthPoints = num_points / 8;
288
289 // Constants for Cody-Waite argument reduction
290 const __m256 two_over_pi = _mm256_set1_ps(0x1.45f306p-1f);
291 const __m256 pi_over_2_hi = _mm256_set1_ps(0x1.921fb6p+0f);
292 const __m256 pi_over_2_lo = _mm256_set1_ps(-0x1.777a5cp-25f);
293
294 const __m256i ones = _mm256_set1_epi32(1);
295 const __m256i twos = _mm256_set1_epi32(2);
296 const __m256 sign_bit = _mm256_set1_ps(-0.0f);
297
298 for (; number < eighthPoints; number++) {
299 __m256 x = _mm256_load_ps(inPtr);
300
301 // Argument reduction: n = round(x * 2/pi)
302 __m256 n_f = _mm256_round_ps(_mm256_mul_ps(x, two_over_pi),
303 _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
304 __m256i n = _mm256_cvtps_epi32(n_f);
305
306 // r = x - n * (pi/2)
307 __m256 r = _mm256_sub_ps(x, _mm256_mul_ps(n_f, pi_over_2_hi));
308 r = _mm256_sub_ps(r, _mm256_mul_ps(n_f, pi_over_2_lo));
309
310 // Evaluate both sin and cos polynomials
311 __m256 sin_r = _mm256_sin_poly_avx2(r);
312 __m256 cos_r = _mm256_cos_poly_avx2(r);
313
314 // Quadrant selection
315 __m256i n_and_1 = _mm256_and_si256(n, ones);
316 __m256i n_and_2 = _mm256_and_si256(n, twos);
317 __m256i n_plus_1_and_2 = _mm256_and_si256(_mm256_add_epi32(n, ones), twos);
318
319 // For sin: swap when n&1, negate when n&2
320 __m256 sin_swap = _mm256_castsi256_ps(_mm256_cmpeq_epi32(n_and_1, ones));
321 __m256 sin_result = _mm256_blendv_ps(sin_r, cos_r, sin_swap);
322 __m256 sin_neg = _mm256_castsi256_ps(_mm256_cmpeq_epi32(n_and_2, twos));
323 sin_result = _mm256_xor_ps(sin_result, _mm256_and_ps(sin_neg, sign_bit));
324
325 // For cos: swap when n&1, negate when (n+1)&2
326 __m256 cos_result = _mm256_blendv_ps(cos_r, sin_r, sin_swap);
327 __m256 cos_neg = _mm256_castsi256_ps(_mm256_cmpeq_epi32(n_plus_1_and_2, twos));
328 cos_result = _mm256_xor_ps(cos_result, _mm256_and_ps(cos_neg, sign_bit));
329
330 _mm256_store_ps(sinPtr, sin_result);
331 _mm256_store_ps(cosPtr, cos_result);
332 inPtr += 8;
333 sinPtr += 8;
334 cosPtr += 8;
335 }
336
337 number = eighthPoints * 8;
338 for (; number < num_points; number++) {
339 *sinPtr++ = sinf(*inPtr);
340 *cosPtr++ = cosf(*inPtr++);
341 }
342}
343
344#endif /* LV_HAVE_AVX2 for aligned */
345
346#ifdef LV_HAVE_SSE4_1
347#include <smmintrin.h>
349
350static inline void volk_32f_sincos_32f_x2_a_sse4_1(float* sinVector,
351 float* cosVector,
352 const float* inVector,
353 unsigned int num_points)
354{
355 float* sinPtr = sinVector;
356 float* cosPtr = cosVector;
357 const float* inPtr = inVector;
358
359 unsigned int number = 0;
360 unsigned int quarterPoints = num_points / 4;
361
362 // Constants for Cody-Waite argument reduction
363 const __m128 two_over_pi = _mm_set1_ps(0x1.45f306p-1f);
364 const __m128 pi_over_2_hi = _mm_set1_ps(0x1.921fb6p+0f);
365 const __m128 pi_over_2_lo = _mm_set1_ps(-0x1.777a5cp-25f);
366
367 const __m128i ones = _mm_set1_epi32(1);
368 const __m128i twos = _mm_set1_epi32(2);
369 const __m128 sign_bit = _mm_set1_ps(-0.0f);
370
371 for (; number < quarterPoints; number++) {
372 __m128 x = _mm_load_ps(inPtr);
373
374 // Argument reduction: n = round(x * 2/pi)
375 __m128 n_f = _mm_round_ps(_mm_mul_ps(x, two_over_pi),
376 _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
377 __m128i n = _mm_cvtps_epi32(n_f);
378
379 // r = x - n * (pi/2)
380 __m128 r = _mm_sub_ps(x, _mm_mul_ps(n_f, pi_over_2_hi));
381 r = _mm_sub_ps(r, _mm_mul_ps(n_f, pi_over_2_lo));
382
383 // Evaluate both sin and cos polynomials
384 __m128 sin_r = _mm_sin_poly_sse(r);
385 __m128 cos_r = _mm_cos_poly_sse(r);
386
387 // Quadrant selection
388 __m128i n_and_1 = _mm_and_si128(n, ones);
389 __m128i n_and_2 = _mm_and_si128(n, twos);
390 __m128i n_plus_1_and_2 = _mm_and_si128(_mm_add_epi32(n, ones), twos);
391
392 // For sin: swap when n&1, negate when n&2
393 __m128 sin_swap = _mm_castsi128_ps(_mm_cmpeq_epi32(n_and_1, ones));
394 __m128 sin_result = _mm_blendv_ps(sin_r, cos_r, sin_swap);
395 __m128 sin_neg = _mm_castsi128_ps(_mm_cmpeq_epi32(n_and_2, twos));
396 sin_result = _mm_xor_ps(sin_result, _mm_and_ps(sin_neg, sign_bit));
397
398 // For cos: swap when n&1, negate when (n+1)&2
399 __m128 cos_result = _mm_blendv_ps(cos_r, sin_r, sin_swap);
400 __m128 cos_neg = _mm_castsi128_ps(_mm_cmpeq_epi32(n_plus_1_and_2, twos));
401 cos_result = _mm_xor_ps(cos_result, _mm_and_ps(cos_neg, sign_bit));
402
403 _mm_store_ps(sinPtr, sin_result);
404 _mm_store_ps(cosPtr, cos_result);
405 inPtr += 4;
406 sinPtr += 4;
407 cosPtr += 4;
408 }
409
410 number = quarterPoints * 4;
411 for (; number < num_points; number++) {
412 *sinPtr++ = sinf(*inPtr);
413 *cosPtr++ = cosf(*inPtr++);
414 }
415}
416
417#endif /* LV_HAVE_SSE4_1 for aligned */
418
419#endif /* INCLUDED_volk_32f_sincos_32f_x2_a_H */
420
421
422#ifndef INCLUDED_volk_32f_sincos_32f_x2_u_H
423#define INCLUDED_volk_32f_sincos_32f_x2_u_H
424
425#ifdef LV_HAVE_AVX512F
426#include <immintrin.h>
428
429static inline void volk_32f_sincos_32f_x2_u_avx512f(float* sinVector,
430 float* cosVector,
431 const float* inVector,
432 unsigned int num_points)
433{
434 float* sinPtr = sinVector;
435 float* cosPtr = cosVector;
436 const float* inPtr = inVector;
437
438 unsigned int number = 0;
439 unsigned int sixteenPoints = num_points / 16;
440
441 // Constants for Cody-Waite argument reduction
442 const __m512 two_over_pi = _mm512_set1_ps(0x1.45f306p-1f);
443 const __m512 pi_over_2_hi = _mm512_set1_ps(0x1.921fb6p+0f);
444 const __m512 pi_over_2_lo = _mm512_set1_ps(-0x1.777a5cp-25f);
445
446 const __m512i ones = _mm512_set1_epi32(1);
447 const __m512i twos = _mm512_set1_epi32(2);
448 const __m512i sign_bit = _mm512_set1_epi32(0x80000000);
449
450 for (; number < sixteenPoints; number++) {
451 __m512 x = _mm512_loadu_ps(inPtr);
452
453 // Argument reduction: n = round(x * 2/pi)
454 __m512 n_f = _mm512_roundscale_ps(_mm512_mul_ps(x, two_over_pi),
455 _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
456 __m512i n = _mm512_cvtps_epi32(n_f);
457
458 // r = x - n * (pi/2)
459 __m512 r = _mm512_fnmadd_ps(n_f, pi_over_2_hi, x);
460 r = _mm512_fnmadd_ps(n_f, pi_over_2_lo, r);
461
462 // Evaluate both sin and cos polynomials
463 __m512 sin_r = _mm512_sin_poly_avx512(r);
464 __m512 cos_r = _mm512_cos_poly_avx512(r);
465
466 // Quadrant selection
467 __m512i n_and_1 = _mm512_and_si512(n, ones);
468 __m512i n_and_2 = _mm512_and_si512(n, twos);
469 __m512i n_plus_1_and_2 = _mm512_and_si512(_mm512_add_epi32(n, ones), twos);
470
471 // For sin: swap when n&1, negate when n&2
472 __mmask16 sin_swap = _mm512_cmpeq_epi32_mask(n_and_1, ones);
473 __m512 sin_result = _mm512_mask_blend_ps(sin_swap, sin_r, cos_r);
474 __mmask16 sin_neg = _mm512_cmpeq_epi32_mask(n_and_2, twos);
475 sin_result =
476 _mm512_castsi512_ps(_mm512_mask_xor_epi32(_mm512_castps_si512(sin_result),
477 sin_neg,
478 _mm512_castps_si512(sin_result),
479 sign_bit));
480
481 // For cos: swap when n&1, negate when (n+1)&2
482 __mmask16 cos_swap = sin_swap;
483 __m512 cos_result = _mm512_mask_blend_ps(cos_swap, cos_r, sin_r);
484 __mmask16 cos_neg = _mm512_cmpeq_epi32_mask(n_plus_1_and_2, twos);
485 cos_result =
486 _mm512_castsi512_ps(_mm512_mask_xor_epi32(_mm512_castps_si512(cos_result),
487 cos_neg,
488 _mm512_castps_si512(cos_result),
489 sign_bit));
490
491 _mm512_storeu_ps(sinPtr, sin_result);
492 _mm512_storeu_ps(cosPtr, cos_result);
493 inPtr += 16;
494 sinPtr += 16;
495 cosPtr += 16;
496 }
497
498 number = sixteenPoints * 16;
499 for (; number < num_points; number++) {
500 *sinPtr++ = sinf(*inPtr);
501 *cosPtr++ = cosf(*inPtr++);
502 }
503}
504
505#endif /* LV_HAVE_AVX512F for unaligned */
506
507#if LV_HAVE_AVX2 && LV_HAVE_FMA
508#include <immintrin.h>
510
511static inline void volk_32f_sincos_32f_x2_u_avx2_fma(float* sinVector,
512 float* cosVector,
513 const float* inVector,
514 unsigned int num_points)
515{
516 float* sinPtr = sinVector;
517 float* cosPtr = cosVector;
518 const float* inPtr = inVector;
519
520 unsigned int number = 0;
521 unsigned int eighthPoints = num_points / 8;
522
523 // Constants for Cody-Waite argument reduction
524 const __m256 two_over_pi = _mm256_set1_ps(0x1.45f306p-1f);
525 const __m256 pi_over_2_hi = _mm256_set1_ps(0x1.921fb6p+0f);
526 const __m256 pi_over_2_lo = _mm256_set1_ps(-0x1.777a5cp-25f);
527
528 const __m256i ones = _mm256_set1_epi32(1);
529 const __m256i twos = _mm256_set1_epi32(2);
530 const __m256 sign_bit = _mm256_set1_ps(-0.0f);
531
532 for (; number < eighthPoints; number++) {
533 __m256 x = _mm256_loadu_ps(inPtr);
534
535 // Argument reduction: n = round(x * 2/pi)
536 __m256 n_f = _mm256_round_ps(_mm256_mul_ps(x, two_over_pi),
537 _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
538 __m256i n = _mm256_cvtps_epi32(n_f);
539
540 // r = x - n * (pi/2)
541 __m256 r = _mm256_fnmadd_ps(n_f, pi_over_2_hi, x);
542 r = _mm256_fnmadd_ps(n_f, pi_over_2_lo, r);
543
544 // Evaluate both sin and cos polynomials
545 __m256 sin_r = _mm256_sin_poly_avx2_fma(r);
546 __m256 cos_r = _mm256_cos_poly_avx2_fma(r);
547
548 // Quadrant selection
549 __m256i n_and_1 = _mm256_and_si256(n, ones);
550 __m256i n_and_2 = _mm256_and_si256(n, twos);
551 __m256i n_plus_1_and_2 = _mm256_and_si256(_mm256_add_epi32(n, ones), twos);
552
553 // For sin: swap when n&1, negate when n&2
554 __m256 sin_swap = _mm256_castsi256_ps(_mm256_cmpeq_epi32(n_and_1, ones));
555 __m256 sin_result = _mm256_blendv_ps(sin_r, cos_r, sin_swap);
556 __m256 sin_neg = _mm256_castsi256_ps(_mm256_cmpeq_epi32(n_and_2, twos));
557 sin_result = _mm256_xor_ps(sin_result, _mm256_and_ps(sin_neg, sign_bit));
558
559 // For cos: swap when n&1, negate when (n+1)&2
560 __m256 cos_result = _mm256_blendv_ps(cos_r, sin_r, sin_swap);
561 __m256 cos_neg = _mm256_castsi256_ps(_mm256_cmpeq_epi32(n_plus_1_and_2, twos));
562 cos_result = _mm256_xor_ps(cos_result, _mm256_and_ps(cos_neg, sign_bit));
563
564 _mm256_storeu_ps(sinPtr, sin_result);
565 _mm256_storeu_ps(cosPtr, cos_result);
566 inPtr += 8;
567 sinPtr += 8;
568 cosPtr += 8;
569 }
570
571 number = eighthPoints * 8;
572 for (; number < num_points; number++) {
573 *sinPtr++ = sinf(*inPtr);
574 *cosPtr++ = cosf(*inPtr++);
575 }
576}
577
578#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for unaligned */
579
580#ifdef LV_HAVE_AVX2
581#include <immintrin.h>
583
584static inline void volk_32f_sincos_32f_x2_u_avx2(float* sinVector,
585 float* cosVector,
586 const float* inVector,
587 unsigned int num_points)
588{
589 float* sinPtr = sinVector;
590 float* cosPtr = cosVector;
591 const float* inPtr = inVector;
592
593 unsigned int number = 0;
594 unsigned int eighthPoints = num_points / 8;
595
596 // Constants for Cody-Waite argument reduction
597 const __m256 two_over_pi = _mm256_set1_ps(0x1.45f306p-1f);
598 const __m256 pi_over_2_hi = _mm256_set1_ps(0x1.921fb6p+0f);
599 const __m256 pi_over_2_lo = _mm256_set1_ps(-0x1.777a5cp-25f);
600
601 const __m256i ones = _mm256_set1_epi32(1);
602 const __m256i twos = _mm256_set1_epi32(2);
603 const __m256 sign_bit = _mm256_set1_ps(-0.0f);
604
605 for (; number < eighthPoints; number++) {
606 __m256 x = _mm256_loadu_ps(inPtr);
607
608 // Argument reduction: n = round(x * 2/pi)
609 __m256 n_f = _mm256_round_ps(_mm256_mul_ps(x, two_over_pi),
610 _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
611 __m256i n = _mm256_cvtps_epi32(n_f);
612
613 // r = x - n * (pi/2)
614 __m256 r = _mm256_sub_ps(x, _mm256_mul_ps(n_f, pi_over_2_hi));
615 r = _mm256_sub_ps(r, _mm256_mul_ps(n_f, pi_over_2_lo));
616
617 // Evaluate both sin and cos polynomials
618 __m256 sin_r = _mm256_sin_poly_avx2(r);
619 __m256 cos_r = _mm256_cos_poly_avx2(r);
620
621 // Quadrant selection
622 __m256i n_and_1 = _mm256_and_si256(n, ones);
623 __m256i n_and_2 = _mm256_and_si256(n, twos);
624 __m256i n_plus_1_and_2 = _mm256_and_si256(_mm256_add_epi32(n, ones), twos);
625
626 // For sin: swap when n&1, negate when n&2
627 __m256 sin_swap = _mm256_castsi256_ps(_mm256_cmpeq_epi32(n_and_1, ones));
628 __m256 sin_result = _mm256_blendv_ps(sin_r, cos_r, sin_swap);
629 __m256 sin_neg = _mm256_castsi256_ps(_mm256_cmpeq_epi32(n_and_2, twos));
630 sin_result = _mm256_xor_ps(sin_result, _mm256_and_ps(sin_neg, sign_bit));
631
632 // For cos: swap when n&1, negate when (n+1)&2
633 __m256 cos_result = _mm256_blendv_ps(cos_r, sin_r, sin_swap);
634 __m256 cos_neg = _mm256_castsi256_ps(_mm256_cmpeq_epi32(n_plus_1_and_2, twos));
635 cos_result = _mm256_xor_ps(cos_result, _mm256_and_ps(cos_neg, sign_bit));
636
637 _mm256_storeu_ps(sinPtr, sin_result);
638 _mm256_storeu_ps(cosPtr, cos_result);
639 inPtr += 8;
640 sinPtr += 8;
641 cosPtr += 8;
642 }
643
644 number = eighthPoints * 8;
645 for (; number < num_points; number++) {
646 *sinPtr++ = sinf(*inPtr);
647 *cosPtr++ = cosf(*inPtr++);
648 }
649}
650
651#endif /* LV_HAVE_AVX2 for unaligned */
652
653#ifdef LV_HAVE_SSE4_1
654#include <smmintrin.h>
656
657static inline void volk_32f_sincos_32f_x2_u_sse4_1(float* sinVector,
658 float* cosVector,
659 const float* inVector,
660 unsigned int num_points)
661{
662 float* sinPtr = sinVector;
663 float* cosPtr = cosVector;
664 const float* inPtr = inVector;
665
666 unsigned int number = 0;
667 unsigned int quarterPoints = num_points / 4;
668
669 // Constants for Cody-Waite argument reduction
670 const __m128 two_over_pi = _mm_set1_ps(0x1.45f306p-1f);
671 const __m128 pi_over_2_hi = _mm_set1_ps(0x1.921fb6p+0f);
672 const __m128 pi_over_2_lo = _mm_set1_ps(-0x1.777a5cp-25f);
673
674 const __m128i ones = _mm_set1_epi32(1);
675 const __m128i twos = _mm_set1_epi32(2);
676 const __m128 sign_bit = _mm_set1_ps(-0.0f);
677
678 for (; number < quarterPoints; number++) {
679 __m128 x = _mm_loadu_ps(inPtr);
680
681 // Argument reduction: n = round(x * 2/pi)
682 __m128 n_f = _mm_round_ps(_mm_mul_ps(x, two_over_pi),
683 _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
684 __m128i n = _mm_cvtps_epi32(n_f);
685
686 // r = x - n * (pi/2)
687 __m128 r = _mm_sub_ps(x, _mm_mul_ps(n_f, pi_over_2_hi));
688 r = _mm_sub_ps(r, _mm_mul_ps(n_f, pi_over_2_lo));
689
690 // Evaluate both sin and cos polynomials
691 __m128 sin_r = _mm_sin_poly_sse(r);
692 __m128 cos_r = _mm_cos_poly_sse(r);
693
694 // Quadrant selection
695 __m128i n_and_1 = _mm_and_si128(n, ones);
696 __m128i n_and_2 = _mm_and_si128(n, twos);
697 __m128i n_plus_1_and_2 = _mm_and_si128(_mm_add_epi32(n, ones), twos);
698
699 // For sin: swap when n&1, negate when n&2
700 __m128 sin_swap = _mm_castsi128_ps(_mm_cmpeq_epi32(n_and_1, ones));
701 __m128 sin_result = _mm_blendv_ps(sin_r, cos_r, sin_swap);
702 __m128 sin_neg = _mm_castsi128_ps(_mm_cmpeq_epi32(n_and_2, twos));
703 sin_result = _mm_xor_ps(sin_result, _mm_and_ps(sin_neg, sign_bit));
704
705 // For cos: swap when n&1, negate when (n+1)&2
706 __m128 cos_result = _mm_blendv_ps(cos_r, sin_r, sin_swap);
707 __m128 cos_neg = _mm_castsi128_ps(_mm_cmpeq_epi32(n_plus_1_and_2, twos));
708 cos_result = _mm_xor_ps(cos_result, _mm_and_ps(cos_neg, sign_bit));
709
710 _mm_storeu_ps(sinPtr, sin_result);
711 _mm_storeu_ps(cosPtr, cos_result);
712 inPtr += 4;
713 sinPtr += 4;
714 cosPtr += 4;
715 }
716
717 number = quarterPoints * 4;
718 for (; number < num_points; number++) {
719 *sinPtr++ = sinf(*inPtr);
720 *cosPtr++ = cosf(*inPtr++);
721 }
722}
723
724#endif /* LV_HAVE_SSE4_1 for unaligned */
725
726#ifdef LV_HAVE_NEON
727#include <arm_neon.h>
729
730/* NEON polynomial-based sincos with shared argument reduction */
731static inline void volk_32f_sincos_32f_x2_neon(float* sinVector,
732 float* cosVector,
733 const float* inVector,
734 unsigned int num_points)
735{
736 // Cody-Waite argument reduction: n = round(x * 2/pi), r = x - n * pi/2
737 const float32x4_t two_over_pi = vdupq_n_f32(0x1.45f306p-1f);
738 const float32x4_t pi_over_2_hi = vdupq_n_f32(0x1.921fb6p+0f);
739 const float32x4_t pi_over_2_lo = vdupq_n_f32(-0x1.777a5cp-25f);
740
741 const int32x4_t ones = vdupq_n_s32(1);
742 const int32x4_t twos = vdupq_n_s32(2);
743 const float32x4_t sign_bit = vdupq_n_f32(-0.0f);
744 const float32x4_t half = vdupq_n_f32(0.5f);
745 const float32x4_t neg_half = vdupq_n_f32(-0.5f);
746 const float32x4_t fzeroes = vdupq_n_f32(0.0f);
747
748 unsigned int number = 0;
749 const unsigned int quarterPoints = num_points / 4;
750
751 for (; number < quarterPoints; number++) {
752 float32x4_t x = vld1q_f32(inVector);
753 inVector += 4;
754
755 // n = round(x * 2/pi) - emulate round-to-nearest for ARMv7
756 float32x4_t scaled = vmulq_f32(x, two_over_pi);
757 uint32x4_t is_neg = vcltq_f32(scaled, fzeroes);
758 float32x4_t adj = vbslq_f32(is_neg, neg_half, half);
759 float32x4_t n_f = vcvtq_f32_s32(vcvtq_s32_f32(vaddq_f32(scaled, adj)));
760 int32x4_t n = vcvtq_s32_f32(n_f);
761
762 // r = x - n * (pi/2) using extended precision
763 float32x4_t r = vmlsq_f32(x, n_f, pi_over_2_hi);
764 r = vmlsq_f32(r, n_f, pi_over_2_lo);
765
766 // Evaluate sin and cos polynomials (shared for both outputs)
767 float32x4_t sin_r = _vsin_poly_f32(r);
768 float32x4_t cos_r = _vcos_poly_f32(r);
769
770 // Quadrant selection
771 int32x4_t n_and_1 = vandq_s32(n, ones);
772 int32x4_t n_and_2 = vandq_s32(n, twos);
773 int32x4_t n_plus_1_and_2 = vandq_s32(vaddq_s32(n, ones), twos);
774
775 uint32x4_t swap_mask = vceqq_s32(n_and_1, ones);
776
777 // Sin result: use cos when n&1, negate when n&2
778 float32x4_t sin_result = vbslq_f32(swap_mask, cos_r, sin_r);
779 uint32x4_t sin_neg = vceqq_s32(n_and_2, twos);
780 sin_result = vreinterpretq_f32_u32(
781 veorq_u32(vreinterpretq_u32_f32(sin_result),
782 vandq_u32(sin_neg, vreinterpretq_u32_f32(sign_bit))));
783
784 // Cos result: use sin when n&1, negate when (n+1)&2
785 float32x4_t cos_result = vbslq_f32(swap_mask, sin_r, cos_r);
786 uint32x4_t cos_neg = vceqq_s32(n_plus_1_and_2, twos);
787 cos_result = vreinterpretq_f32_u32(
788 veorq_u32(vreinterpretq_u32_f32(cos_result),
789 vandq_u32(cos_neg, vreinterpretq_u32_f32(sign_bit))));
790
791 vst1q_f32(sinVector, sin_result);
792 vst1q_f32(cosVector, cos_result);
793 sinVector += 4;
794 cosVector += 4;
795 }
796
797 for (number = quarterPoints * 4; number < num_points; number++) {
798 *sinVector++ = sinf(*inVector);
799 *cosVector++ = cosf(*inVector++);
800 }
801}
802#endif /* LV_HAVE_NEON */
803
804#ifdef LV_HAVE_NEONV8
805#include <arm_neon.h>
807
808/* NEONv8 polynomial-based sincos with FMA and shared argument reduction */
809static inline void volk_32f_sincos_32f_x2_neonv8(float* sinVector,
810 float* cosVector,
811 const float* inVector,
812 unsigned int num_points)
813{
814 // Cody-Waite argument reduction: n = round(x * 2/pi), r = x - n * pi/2
815 const float32x4_t two_over_pi = vdupq_n_f32(0x1.45f306p-1f);
816 const float32x4_t pi_over_2_hi = vdupq_n_f32(0x1.921fb6p+0f);
817 const float32x4_t pi_over_2_lo = vdupq_n_f32(-0x1.777a5cp-25f);
818
819 const int32x4_t ones = vdupq_n_s32(1);
820 const int32x4_t twos = vdupq_n_s32(2);
821 const float32x4_t sign_bit = vdupq_n_f32(-0.0f);
822
823 unsigned int number = 0;
824 const unsigned int quarterPoints = num_points / 4;
825
826 for (; number < quarterPoints; number++) {
827 float32x4_t x = vld1q_f32(inVector);
828 inVector += 4;
829
830 // n = round(x * 2/pi) using ARMv8 vrndnq_f32
831 float32x4_t n_f = vrndnq_f32(vmulq_f32(x, two_over_pi));
832 int32x4_t n = vcvtq_s32_f32(n_f);
833
834 // r = x - n * (pi/2) using FMA for extended precision
835 float32x4_t r = vfmsq_f32(x, n_f, pi_over_2_hi);
836 r = vfmsq_f32(r, n_f, pi_over_2_lo);
837
838 // Evaluate sin and cos polynomials using FMA (shared for both outputs)
839 float32x4_t sin_r = _vsin_poly_neonv8(r);
840 float32x4_t cos_r = _vcos_poly_neonv8(r);
841
842 // Quadrant selection
843 int32x4_t n_and_1 = vandq_s32(n, ones);
844 int32x4_t n_and_2 = vandq_s32(n, twos);
845 int32x4_t n_plus_1_and_2 = vandq_s32(vaddq_s32(n, ones), twos);
846
847 uint32x4_t swap_mask = vceqq_s32(n_and_1, ones);
848
849 // Sin result: use cos when n&1, negate when n&2
850 float32x4_t sin_result = vbslq_f32(swap_mask, cos_r, sin_r);
851 uint32x4_t sin_neg = vceqq_s32(n_and_2, twos);
852 sin_result = vreinterpretq_f32_u32(
853 veorq_u32(vreinterpretq_u32_f32(sin_result),
854 vandq_u32(sin_neg, vreinterpretq_u32_f32(sign_bit))));
855
856 // Cos result: use sin when n&1, negate when (n+1)&2
857 float32x4_t cos_result = vbslq_f32(swap_mask, sin_r, cos_r);
858 uint32x4_t cos_neg = vceqq_s32(n_plus_1_and_2, twos);
859 cos_result = vreinterpretq_f32_u32(
860 veorq_u32(vreinterpretq_u32_f32(cos_result),
861 vandq_u32(cos_neg, vreinterpretq_u32_f32(sign_bit))));
862
863 vst1q_f32(sinVector, sin_result);
864 vst1q_f32(cosVector, cos_result);
865 sinVector += 4;
866 cosVector += 4;
867 }
868
869 for (number = quarterPoints * 4; number < num_points; number++) {
870 *sinVector++ = sinf(*inVector);
871 *cosVector++ = cosf(*inVector++);
872 }
873}
874#endif /* LV_HAVE_NEONV8 */
875
876#endif /* INCLUDED_volk_32f_sincos_32f_x2_u_H */