Vector Optimized Library of Kernels 3.3.0
Architecture-tuned implementations of math kernels
Loading...
Searching...
No Matches
volk_32f_sin_32f.h
Go to the documentation of this file.
1/* -*- c++ -*- */
2/*
3 * Copyright 2014 Free Software Foundation, Inc.
4 * Copyright 2025 Magnus Lundmark <magnuslundmark@gmail.com>
5 *
6 * This file is part of VOLK
7 *
8 * SPDX-License-Identifier: LGPL-3.0-or-later
9 */
10
59
60#include <inttypes.h>
61#include <math.h>
62#include <stdio.h>
63
64#ifndef INCLUDED_volk_32f_sin_32f_a_H
65#define INCLUDED_volk_32f_sin_32f_a_H
66
67#ifdef LV_HAVE_GENERIC
68#include <volk/volk_common.h>
69
70static inline void
71volk_32f_sin_32f_polynomial(float* bVector, const float* aVector, unsigned int num_points)
72{
73 for (unsigned int number = 0; number < num_points; number++) {
74 *bVector++ = volk_sin(*aVector++);
75 }
76}
77#endif /* LV_HAVE_GENERIC */
78
79#ifdef LV_HAVE_AVX512F
80#include <immintrin.h>
82
83static inline void volk_32f_sin_32f_a_avx512f(float* sinVector,
84 const float* inVector,
85 unsigned int num_points)
86{
87 float* sinPtr = sinVector;
88 const float* inPtr = inVector;
89
90 unsigned int number = 0;
91 unsigned int sixteenPoints = num_points / 16;
92
93 // Constants for Cody-Waite argument reduction
94 // n = round(x * 2/pi), then r = x - n * pi/2
95 const __m512 two_over_pi = _mm512_set1_ps(0x1.45f306p-1f); // 2/pi
96 const __m512 pi_over_2_hi = _mm512_set1_ps(0x1.921fb6p+0f); // pi/2 high
97 const __m512 pi_over_2_lo = _mm512_set1_ps(-0x1.777a5cp-25f); // pi/2 low
98
99 const __m512i ones = _mm512_set1_epi32(1);
100 const __m512i twos = _mm512_set1_epi32(2);
101 const __m512i sign_bit = _mm512_set1_epi32(0x80000000);
102
103 for (; number < sixteenPoints; number++) {
104 __m512 x = _mm512_load_ps(inPtr);
105
106 // Argument reduction: n = round(x * 2/pi)
107 __m512 n_f = _mm512_roundscale_ps(_mm512_mul_ps(x, two_over_pi),
108 _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
109 __m512i n = _mm512_cvtps_epi32(n_f);
110
111 // r = x - n * (pi/2), using extended precision
112 __m512 r = _mm512_fnmadd_ps(n_f, pi_over_2_hi, x);
113 r = _mm512_fnmadd_ps(n_f, pi_over_2_lo, r);
114
115 // Evaluate both sin and cos polynomials
116 __m512 sin_r = _mm512_sin_poly_avx512(r);
117 __m512 cos_r = _mm512_cos_poly_avx512(r);
118
119 // Reconstruct sin(x) based on quadrant (n mod 4):
120 // n&1 == 0: use sin_r, n&1 == 1: use cos_r
121 // n&2 == 0: positive, n&2 == 2: negative
122 __m512i n_and_1 = _mm512_and_si512(n, ones);
123 __m512i n_and_2 = _mm512_and_si512(n, twos);
124
125 // swap_mask: where n&1 != 0, we use cos instead of sin
126 __mmask16 swap_mask = _mm512_cmpeq_epi32_mask(n_and_1, ones);
127 __m512 result = _mm512_mask_blend_ps(swap_mask, sin_r, cos_r);
128
129 // neg_mask: where n&2 != 0, we negate the result (use integer xor for AVX512F)
130 __mmask16 neg_mask = _mm512_cmpeq_epi32_mask(n_and_2, twos);
131 result = _mm512_castsi512_ps(_mm512_mask_xor_epi32(_mm512_castps_si512(result),
132 neg_mask,
133 _mm512_castps_si512(result),
134 sign_bit));
135
136 _mm512_store_ps(sinPtr, result);
137 inPtr += 16;
138 sinPtr += 16;
139 }
140
141 number = sixteenPoints * 16;
142 for (; number < num_points; number++) {
143 *sinPtr++ = sinf(*inPtr++);
144 }
145}
146#endif /* LV_HAVE_AVX512F */
147
148#if LV_HAVE_AVX2 && LV_HAVE_FMA
149#include <immintrin.h>
151
152static inline void
153volk_32f_sin_32f_a_avx2_fma(float* bVector, const float* aVector, unsigned int num_points)
154{
155 float* bPtr = bVector;
156 const float* aPtr = aVector;
157
158 unsigned int number = 0;
159 unsigned int eighthPoints = num_points / 8;
160
161 // Constants for Cody-Waite argument reduction
162 // n = round(x * 2/pi), then r = x - n * pi/2
163 const __m256 two_over_pi = _mm256_set1_ps(0x1.45f306p-1f); // 2/pi
164 const __m256 pi_over_2_hi = _mm256_set1_ps(0x1.921fb6p+0f); // pi/2 high
165 const __m256 pi_over_2_lo = _mm256_set1_ps(-0x1.777a5cp-25f); // pi/2 low
166
167 const __m256i ones = _mm256_set1_epi32(1);
168 const __m256i twos = _mm256_set1_epi32(2);
169 const __m256 sign_bit = _mm256_set1_ps(-0.0f);
170
171 for (; number < eighthPoints; number++) {
172 __m256 x = _mm256_load_ps(aPtr);
173
174 // Argument reduction: n = round(x * 2/pi)
175 __m256 n_f = _mm256_round_ps(_mm256_mul_ps(x, two_over_pi),
176 _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
177 __m256i n = _mm256_cvtps_epi32(n_f);
178
179 // r = x - n * (pi/2), using extended precision
180 __m256 r = _mm256_fnmadd_ps(n_f, pi_over_2_hi, x);
181 r = _mm256_fnmadd_ps(n_f, pi_over_2_lo, r);
182
183 // Evaluate both sin and cos polynomials
184 __m256 sin_r = _mm256_sin_poly_avx2_fma(r);
185 __m256 cos_r = _mm256_cos_poly_avx2_fma(r);
186
187 // Reconstruct sin(x) based on quadrant (n mod 4):
188 // n&1 == 0: use sin_r, n&1 == 1: use cos_r
189 // n&2 == 0: positive, n&2 == 2: negative
190 __m256i n_and_1 = _mm256_and_si256(n, ones);
191 __m256i n_and_2 = _mm256_and_si256(n, twos);
192
193 // swap_mask: where n&1 != 0, we use cos instead of sin
194 __m256 swap_mask = _mm256_castsi256_ps(_mm256_cmpeq_epi32(n_and_1, ones));
195 __m256 result = _mm256_blendv_ps(sin_r, cos_r, swap_mask);
196
197 // neg_mask: where n&2 != 0, we negate the result
198 __m256 neg_mask = _mm256_castsi256_ps(_mm256_cmpeq_epi32(n_and_2, twos));
199 result = _mm256_xor_ps(result, _mm256_and_ps(neg_mask, sign_bit));
200
201 _mm256_store_ps(bPtr, result);
202 aPtr += 8;
203 bPtr += 8;
204 }
205
206 number = eighthPoints * 8;
207 for (; number < num_points; number++) {
208 *bPtr++ = sinf(*aPtr++);
209 }
210}
211
212#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA */
213
214#ifdef LV_HAVE_AVX2
215#include <immintrin.h>
217
218static inline void
219volk_32f_sin_32f_a_avx2(float* bVector, const float* aVector, unsigned int num_points)
220{
221 float* bPtr = bVector;
222 const float* aPtr = aVector;
223
224 unsigned int number = 0;
225 unsigned int eighthPoints = num_points / 8;
226
227 // Constants for Cody-Waite argument reduction
228 // n = round(x * 2/pi), then r = x - n * pi/2
229 const __m256 two_over_pi = _mm256_set1_ps(0x1.45f306p-1f); // 2/pi
230 const __m256 pi_over_2_hi = _mm256_set1_ps(0x1.921fb6p+0f); // pi/2 high
231 const __m256 pi_over_2_lo = _mm256_set1_ps(-0x1.777a5cp-25f); // pi/2 low
232
233 const __m256i ones = _mm256_set1_epi32(1);
234 const __m256i twos = _mm256_set1_epi32(2);
235 const __m256 sign_bit = _mm256_set1_ps(-0.0f);
236
237 for (; number < eighthPoints; number++) {
238 __m256 x = _mm256_load_ps(aPtr);
239
240 // Argument reduction: n = round(x * 2/pi)
241 __m256 n_f = _mm256_round_ps(_mm256_mul_ps(x, two_over_pi),
242 _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
243 __m256i n = _mm256_cvtps_epi32(n_f);
244
245 // r = x - n * (pi/2), using extended precision
246 __m256 r = _mm256_sub_ps(x, _mm256_mul_ps(n_f, pi_over_2_hi));
247 r = _mm256_sub_ps(r, _mm256_mul_ps(n_f, pi_over_2_lo));
248
249 // Evaluate both sin and cos polynomials
250 __m256 sin_r = _mm256_sin_poly_avx2(r);
251 __m256 cos_r = _mm256_cos_poly_avx2(r);
252
253 // Reconstruct sin(x) based on quadrant (n mod 4):
254 // n&1 == 0: use sin_r, n&1 == 1: use cos_r
255 // n&2 == 0: positive, n&2 == 2: negative
256 __m256i n_and_1 = _mm256_and_si256(n, ones);
257 __m256i n_and_2 = _mm256_and_si256(n, twos);
258
259 // swap_mask: where n&1 != 0, we use cos instead of sin
260 __m256 swap_mask = _mm256_castsi256_ps(_mm256_cmpeq_epi32(n_and_1, ones));
261 __m256 result = _mm256_blendv_ps(sin_r, cos_r, swap_mask);
262
263 // neg_mask: where n&2 != 0, we negate the result
264 __m256 neg_mask = _mm256_castsi256_ps(_mm256_cmpeq_epi32(n_and_2, twos));
265 result = _mm256_xor_ps(result, _mm256_and_ps(neg_mask, sign_bit));
266
267 _mm256_store_ps(bPtr, result);
268 aPtr += 8;
269 bPtr += 8;
270 }
271
272 number = eighthPoints * 8;
273 for (; number < num_points; number++) {
274 *bPtr++ = sinf(*aPtr++);
275 }
276}
277
278#endif /* LV_HAVE_AVX2 */
279
280#ifdef LV_HAVE_SSE4_1
281#include <smmintrin.h>
283
284static inline void
285volk_32f_sin_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
286{
287 float* bPtr = bVector;
288 const float* aPtr = aVector;
289
290 unsigned int number = 0;
291 unsigned int quarterPoints = num_points / 4;
292
293 // Constants for Cody-Waite argument reduction
294 // n = round(x * 2/pi), then r = x - n * pi/2
295 const __m128 two_over_pi = _mm_set1_ps(0x1.45f306p-1f); // 2/pi
296 const __m128 pi_over_2_hi = _mm_set1_ps(0x1.921fb6p+0f); // pi/2 high
297 const __m128 pi_over_2_lo = _mm_set1_ps(-0x1.777a5cp-25f); // pi/2 low
298
299 const __m128i ones = _mm_set1_epi32(1);
300 const __m128i twos = _mm_set1_epi32(2);
301 const __m128 sign_bit = _mm_set1_ps(-0.0f);
302
303 for (; number < quarterPoints; number++) {
304 __m128 x = _mm_load_ps(aPtr);
305
306 // Argument reduction: n = round(x * 2/pi)
307 __m128 n_f = _mm_round_ps(_mm_mul_ps(x, two_over_pi),
308 _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
309 __m128i n = _mm_cvtps_epi32(n_f);
310
311 // r = x - n * (pi/2), using extended precision
312 __m128 r = _mm_sub_ps(x, _mm_mul_ps(n_f, pi_over_2_hi));
313 r = _mm_sub_ps(r, _mm_mul_ps(n_f, pi_over_2_lo));
314
315 // Evaluate both sin and cos polynomials
316 __m128 sin_r = _mm_sin_poly_sse(r);
317 __m128 cos_r = _mm_cos_poly_sse(r);
318
319 // Reconstruct sin(x) based on quadrant (n mod 4):
320 // n&1 == 0: use sin_r, n&1 == 1: use cos_r
321 // n&2 == 0: positive, n&2 == 2: negative
322 __m128i n_and_1 = _mm_and_si128(n, ones);
323 __m128i n_and_2 = _mm_and_si128(n, twos);
324
325 // swap_mask: where n&1 != 0, we use cos instead of sin
326 __m128 swap_mask = _mm_castsi128_ps(_mm_cmpeq_epi32(n_and_1, ones));
327 __m128 result = _mm_blendv_ps(sin_r, cos_r, swap_mask);
328
329 // neg_mask: where n&2 != 0, we negate the result
330 __m128 neg_mask = _mm_castsi128_ps(_mm_cmpeq_epi32(n_and_2, twos));
331 result = _mm_xor_ps(result, _mm_and_ps(neg_mask, sign_bit));
332
333 _mm_store_ps(bPtr, result);
334 aPtr += 4;
335 bPtr += 4;
336 }
337
338 number = quarterPoints * 4;
339 for (; number < num_points; number++) {
340 *bPtr++ = sinf(*aPtr++);
341 }
342}
343
344#endif /* LV_HAVE_SSE4_1 */
345
346
347#endif /* INCLUDED_volk_32f_sin_32f_a_H */
348
349#ifndef INCLUDED_volk_32f_sin_32f_u_H
350#define INCLUDED_volk_32f_sin_32f_u_H
351
352#ifdef LV_HAVE_AVX512F
353#include <immintrin.h>
355
356static inline void volk_32f_sin_32f_u_avx512f(float* sinVector,
357 const float* inVector,
358 unsigned int num_points)
359{
360 float* sinPtr = sinVector;
361 const float* inPtr = inVector;
362
363 unsigned int number = 0;
364 unsigned int sixteenPoints = num_points / 16;
365
366 // Constants for Cody-Waite argument reduction
367 // n = round(x * 2/pi), then r = x - n * pi/2
368 const __m512 two_over_pi = _mm512_set1_ps(0x1.45f306p-1f); // 2/pi
369 const __m512 pi_over_2_hi = _mm512_set1_ps(0x1.921fb6p+0f); // pi/2 high
370 const __m512 pi_over_2_lo = _mm512_set1_ps(-0x1.777a5cp-25f); // pi/2 low
371
372 const __m512i ones = _mm512_set1_epi32(1);
373 const __m512i twos = _mm512_set1_epi32(2);
374 const __m512i sign_bit = _mm512_set1_epi32(0x80000000);
375
376 for (; number < sixteenPoints; number++) {
377 __m512 x = _mm512_loadu_ps(inPtr);
378
379 // Argument reduction: n = round(x * 2/pi)
380 __m512 n_f = _mm512_roundscale_ps(_mm512_mul_ps(x, two_over_pi),
381 _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
382 __m512i n = _mm512_cvtps_epi32(n_f);
383
384 // r = x - n * (pi/2), using extended precision
385 __m512 r = _mm512_fnmadd_ps(n_f, pi_over_2_hi, x);
386 r = _mm512_fnmadd_ps(n_f, pi_over_2_lo, r);
387
388 // Evaluate both sin and cos polynomials
389 __m512 sin_r = _mm512_sin_poly_avx512(r);
390 __m512 cos_r = _mm512_cos_poly_avx512(r);
391
392 // Reconstruct sin(x) based on quadrant (n mod 4):
393 // n&1 == 0: use sin_r, n&1 == 1: use cos_r
394 // n&2 == 0: positive, n&2 == 2: negative
395 __m512i n_and_1 = _mm512_and_si512(n, ones);
396 __m512i n_and_2 = _mm512_and_si512(n, twos);
397
398 // swap_mask: where n&1 != 0, we use cos instead of sin
399 __mmask16 swap_mask = _mm512_cmpeq_epi32_mask(n_and_1, ones);
400 __m512 result = _mm512_mask_blend_ps(swap_mask, sin_r, cos_r);
401
402 // neg_mask: where n&2 != 0, we negate the result (use integer xor for AVX512F)
403 __mmask16 neg_mask = _mm512_cmpeq_epi32_mask(n_and_2, twos);
404 result = _mm512_castsi512_ps(_mm512_mask_xor_epi32(_mm512_castps_si512(result),
405 neg_mask,
406 _mm512_castps_si512(result),
407 sign_bit));
408
409 _mm512_storeu_ps(sinPtr, result);
410 inPtr += 16;
411 sinPtr += 16;
412 }
413
414 number = sixteenPoints * 16;
415 for (; number < num_points; number++) {
416 *sinPtr++ = sinf(*inPtr++);
417 }
418}
419#endif /* LV_HAVE_AVX512F */
420
421#if LV_HAVE_AVX2 && LV_HAVE_FMA
422#include <immintrin.h>
424
425static inline void
426volk_32f_sin_32f_u_avx2_fma(float* bVector, const float* aVector, unsigned int num_points)
427{
428 float* bPtr = bVector;
429 const float* aPtr = aVector;
430
431 unsigned int number = 0;
432 unsigned int eighthPoints = num_points / 8;
433
434 // Constants for Cody-Waite argument reduction
435 // n = round(x * 2/pi), then r = x - n * pi/2
436 const __m256 two_over_pi = _mm256_set1_ps(0x1.45f306p-1f); // 2/pi
437 const __m256 pi_over_2_hi = _mm256_set1_ps(0x1.921fb6p+0f); // pi/2 high
438 const __m256 pi_over_2_lo = _mm256_set1_ps(-0x1.777a5cp-25f); // pi/2 low
439
440 const __m256i ones = _mm256_set1_epi32(1);
441 const __m256i twos = _mm256_set1_epi32(2);
442 const __m256 sign_bit = _mm256_set1_ps(-0.0f);
443
444 for (; number < eighthPoints; number++) {
445 __m256 x = _mm256_loadu_ps(aPtr);
446
447 // Argument reduction: n = round(x * 2/pi)
448 __m256 n_f = _mm256_round_ps(_mm256_mul_ps(x, two_over_pi),
449 _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
450 __m256i n = _mm256_cvtps_epi32(n_f);
451
452 // r = x - n * (pi/2), using extended precision
453 __m256 r = _mm256_fnmadd_ps(n_f, pi_over_2_hi, x);
454 r = _mm256_fnmadd_ps(n_f, pi_over_2_lo, r);
455
456 // Evaluate both sin and cos polynomials
457 __m256 sin_r = _mm256_sin_poly_avx2_fma(r);
458 __m256 cos_r = _mm256_cos_poly_avx2_fma(r);
459
460 // Reconstruct sin(x) based on quadrant (n mod 4):
461 // n&1 == 0: use sin_r, n&1 == 1: use cos_r
462 // n&2 == 0: positive, n&2 == 2: negative
463 __m256i n_and_1 = _mm256_and_si256(n, ones);
464 __m256i n_and_2 = _mm256_and_si256(n, twos);
465
466 // swap_mask: where n&1 != 0, we use cos instead of sin
467 __m256 swap_mask = _mm256_castsi256_ps(_mm256_cmpeq_epi32(n_and_1, ones));
468 __m256 result = _mm256_blendv_ps(sin_r, cos_r, swap_mask);
469
470 // neg_mask: where n&2 != 0, we negate the result
471 __m256 neg_mask = _mm256_castsi256_ps(_mm256_cmpeq_epi32(n_and_2, twos));
472 result = _mm256_xor_ps(result, _mm256_and_ps(neg_mask, sign_bit));
473
474 _mm256_storeu_ps(bPtr, result);
475 aPtr += 8;
476 bPtr += 8;
477 }
478
479 number = eighthPoints * 8;
480 for (; number < num_points; number++) {
481 *bPtr++ = sinf(*aPtr++);
482 }
483}
484
485#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA */
486
487#ifdef LV_HAVE_AVX2
488#include <immintrin.h>
490
491static inline void
492volk_32f_sin_32f_u_avx2(float* bVector, const float* aVector, unsigned int num_points)
493{
494 float* bPtr = bVector;
495 const float* aPtr = aVector;
496
497 unsigned int number = 0;
498 unsigned int eighthPoints = num_points / 8;
499
500 // Constants for Cody-Waite argument reduction
501 // n = round(x * 2/pi), then r = x - n * pi/2
502 const __m256 two_over_pi = _mm256_set1_ps(0x1.45f306p-1f); // 2/pi
503 const __m256 pi_over_2_hi = _mm256_set1_ps(0x1.921fb6p+0f); // pi/2 high
504 const __m256 pi_over_2_lo = _mm256_set1_ps(-0x1.777a5cp-25f); // pi/2 low
505
506 const __m256i ones = _mm256_set1_epi32(1);
507 const __m256i twos = _mm256_set1_epi32(2);
508 const __m256 sign_bit = _mm256_set1_ps(-0.0f);
509
510 for (; number < eighthPoints; number++) {
511 __m256 x = _mm256_loadu_ps(aPtr);
512
513 // Argument reduction: n = round(x * 2/pi)
514 __m256 n_f = _mm256_round_ps(_mm256_mul_ps(x, two_over_pi),
515 _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
516 __m256i n = _mm256_cvtps_epi32(n_f);
517
518 // r = x - n * (pi/2), using extended precision
519 __m256 r = _mm256_sub_ps(x, _mm256_mul_ps(n_f, pi_over_2_hi));
520 r = _mm256_sub_ps(r, _mm256_mul_ps(n_f, pi_over_2_lo));
521
522 // Evaluate both sin and cos polynomials
523 __m256 sin_r = _mm256_sin_poly_avx2(r);
524 __m256 cos_r = _mm256_cos_poly_avx2(r);
525
526 // Reconstruct sin(x) based on quadrant (n mod 4):
527 // n&1 == 0: use sin_r, n&1 == 1: use cos_r
528 // n&2 == 0: positive, n&2 == 2: negative
529 __m256i n_and_1 = _mm256_and_si256(n, ones);
530 __m256i n_and_2 = _mm256_and_si256(n, twos);
531
532 // swap_mask: where n&1 != 0, we use cos instead of sin
533 __m256 swap_mask = _mm256_castsi256_ps(_mm256_cmpeq_epi32(n_and_1, ones));
534 __m256 result = _mm256_blendv_ps(sin_r, cos_r, swap_mask);
535
536 // neg_mask: where n&2 != 0, we negate the result
537 __m256 neg_mask = _mm256_castsi256_ps(_mm256_cmpeq_epi32(n_and_2, twos));
538 result = _mm256_xor_ps(result, _mm256_and_ps(neg_mask, sign_bit));
539
540 _mm256_storeu_ps(bPtr, result);
541 aPtr += 8;
542 bPtr += 8;
543 }
544
545 number = eighthPoints * 8;
546 for (; number < num_points; number++) {
547 *bPtr++ = sinf(*aPtr++);
548 }
549}
550
551#endif /* LV_HAVE_AVX2 */
552
553
554#ifdef LV_HAVE_SSE4_1
555#include <smmintrin.h>
557
558static inline void
559volk_32f_sin_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
560{
561 float* bPtr = bVector;
562 const float* aPtr = aVector;
563
564 unsigned int number = 0;
565 unsigned int quarterPoints = num_points / 4;
566
567 // Constants for Cody-Waite argument reduction
568 // n = round(x * 2/pi), then r = x - n * pi/2
569 const __m128 two_over_pi = _mm_set1_ps(0x1.45f306p-1f); // 2/pi
570 const __m128 pi_over_2_hi = _mm_set1_ps(0x1.921fb6p+0f); // pi/2 high
571 const __m128 pi_over_2_lo = _mm_set1_ps(-0x1.777a5cp-25f); // pi/2 low
572
573 const __m128i ones = _mm_set1_epi32(1);
574 const __m128i twos = _mm_set1_epi32(2);
575 const __m128 sign_bit = _mm_set1_ps(-0.0f);
576
577 for (; number < quarterPoints; number++) {
578 __m128 x = _mm_loadu_ps(aPtr);
579
580 // Argument reduction: n = round(x * 2/pi)
581 __m128 n_f = _mm_round_ps(_mm_mul_ps(x, two_over_pi),
582 _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
583 __m128i n = _mm_cvtps_epi32(n_f);
584
585 // r = x - n * (pi/2), using extended precision
586 __m128 r = _mm_sub_ps(x, _mm_mul_ps(n_f, pi_over_2_hi));
587 r = _mm_sub_ps(r, _mm_mul_ps(n_f, pi_over_2_lo));
588
589 // Evaluate both sin and cos polynomials
590 __m128 sin_r = _mm_sin_poly_sse(r);
591 __m128 cos_r = _mm_cos_poly_sse(r);
592
593 // Reconstruct sin(x) based on quadrant (n mod 4):
594 // n&1 == 0: use sin_r, n&1 == 1: use cos_r
595 // n&2 == 0: positive, n&2 == 2: negative
596 __m128i n_and_1 = _mm_and_si128(n, ones);
597 __m128i n_and_2 = _mm_and_si128(n, twos);
598
599 // swap_mask: where n&1 != 0, we use cos instead of sin
600 __m128 swap_mask = _mm_castsi128_ps(_mm_cmpeq_epi32(n_and_1, ones));
601 __m128 result = _mm_blendv_ps(sin_r, cos_r, swap_mask);
602
603 // neg_mask: where n&2 != 0, we negate the result
604 __m128 neg_mask = _mm_castsi128_ps(_mm_cmpeq_epi32(n_and_2, twos));
605 result = _mm_xor_ps(result, _mm_and_ps(neg_mask, sign_bit));
606
607 _mm_storeu_ps(bPtr, result);
608 aPtr += 4;
609 bPtr += 4;
610 }
611
612 number = quarterPoints * 4;
613 for (; number < num_points; number++) {
614 *bPtr++ = sinf(*aPtr++);
615 }
616}
617
618#endif /* LV_HAVE_SSE4_1 */
619
620
621#ifdef LV_HAVE_GENERIC
622
623static inline void
624volk_32f_sin_32f_generic(float* bVector, const float* aVector, unsigned int num_points)
625{
626 float* bPtr = bVector;
627 const float* aPtr = aVector;
628 unsigned int number = 0;
629
630 for (number = 0; number < num_points; number++) {
631 *bPtr++ = sinf(*aPtr++);
632 }
633}
634
635#endif /* LV_HAVE_GENERIC */
636
637#ifdef LV_HAVE_NEON
638#include <arm_neon.h>
640
641/* NEON polynomial-based sin using Cody-Waite argument reduction */
642static inline void
643volk_32f_sin_32f_neon(float* bVector, const float* aVector, unsigned int num_points)
644{
645 // Cody-Waite argument reduction: n = round(x * 2/pi), r = x - n * pi/2
646 const float32x4_t two_over_pi = vdupq_n_f32(0x1.45f306p-1f); // 2/pi
647 const float32x4_t pi_over_2_hi = vdupq_n_f32(0x1.921fb6p+0f); // pi/2 high
648 const float32x4_t pi_over_2_lo = vdupq_n_f32(-0x1.777a5cp-25f); // pi/2 low
649
650 const int32x4_t ones = vdupq_n_s32(1);
651 const int32x4_t twos = vdupq_n_s32(2);
652 const float32x4_t sign_bit = vdupq_n_f32(-0.0f);
653 const float32x4_t half = vdupq_n_f32(0.5f);
654 const float32x4_t neg_half = vdupq_n_f32(-0.5f);
655 const float32x4_t fzeroes = vdupq_n_f32(0.0f);
656
657 unsigned int number = 0;
658 const unsigned int quarterPoints = num_points / 4;
659
660 for (; number < quarterPoints; number++) {
661 float32x4_t x = vld1q_f32(aVector);
662 aVector += 4;
663
664 // n = round(x * 2/pi) - emulate round-to-nearest for ARMv7
665 float32x4_t scaled = vmulq_f32(x, two_over_pi);
666 uint32x4_t is_neg = vcltq_f32(scaled, fzeroes);
667 float32x4_t adj = vbslq_f32(is_neg, neg_half, half);
668 float32x4_t n_f = vcvtq_f32_s32(vcvtq_s32_f32(vaddq_f32(scaled, adj)));
669 int32x4_t n = vcvtq_s32_f32(n_f);
670
671 // r = x - n * (pi/2) using extended precision
672 float32x4_t r = vmlsq_f32(x, n_f, pi_over_2_hi);
673 r = vmlsq_f32(r, n_f, pi_over_2_lo);
674
675 // Evaluate sin and cos polynomials
676 float32x4_t sin_r = _vsin_poly_f32(r);
677 float32x4_t cos_r = _vcos_poly_f32(r);
678
679 // Quadrant-based reconstruction:
680 // n&1 == 0: use sin_r, n&1 == 1: use cos_r
681 // n&2 == 0: positive, n&2 == 2: negative
682 int32x4_t n_and_1 = vandq_s32(n, ones);
683 int32x4_t n_and_2 = vandq_s32(n, twos);
684
685 uint32x4_t swap_mask = vceqq_s32(n_and_1, ones);
686 float32x4_t result = vbslq_f32(swap_mask, cos_r, sin_r);
687
688 uint32x4_t neg_mask = vceqq_s32(n_and_2, twos);
689 result = vreinterpretq_f32_u32(
690 veorq_u32(vreinterpretq_u32_f32(result),
691 vandq_u32(neg_mask, vreinterpretq_u32_f32(sign_bit))));
692
693 vst1q_f32(bVector, result);
694 bVector += 4;
695 }
696
697 for (number = quarterPoints * 4; number < num_points; number++) {
698 *bVector++ = sinf(*aVector++);
699 }
700}
701#endif /* LV_HAVE_NEON */
702
703#ifdef LV_HAVE_NEONV8
704#include <arm_neon.h>
706
707/* NEONv8 polynomial-based sin using Cody-Waite argument reduction with FMA */
708static inline void
709volk_32f_sin_32f_neonv8(float* bVector, const float* aVector, unsigned int num_points)
710{
711 // Cody-Waite argument reduction: n = round(x * 2/pi), r = x - n * pi/2
712 const float32x4_t two_over_pi = vdupq_n_f32(0x1.45f306p-1f); // 2/pi
713 const float32x4_t pi_over_2_hi = vdupq_n_f32(0x1.921fb6p+0f); // pi/2 high
714 const float32x4_t pi_over_2_lo = vdupq_n_f32(-0x1.777a5cp-25f); // pi/2 low
715
716 const int32x4_t ones = vdupq_n_s32(1);
717 const int32x4_t twos = vdupq_n_s32(2);
718 const float32x4_t sign_bit = vdupq_n_f32(-0.0f);
719
720 unsigned int number = 0;
721 const unsigned int quarterPoints = num_points / 4;
722
723 for (; number < quarterPoints; number++) {
724 float32x4_t x = vld1q_f32(aVector);
725 aVector += 4;
726
727 // n = round(x * 2/pi) using ARMv8 vrndnq_f32
728 float32x4_t n_f = vrndnq_f32(vmulq_f32(x, two_over_pi));
729 int32x4_t n = vcvtq_s32_f32(n_f);
730
731 // r = x - n * (pi/2) using FMA for extended precision
732 float32x4_t r = vfmsq_f32(x, n_f, pi_over_2_hi);
733 r = vfmsq_f32(r, n_f, pi_over_2_lo);
734
735 // Evaluate sin and cos polynomials using FMA
736 float32x4_t sin_r = _vsin_poly_neonv8(r);
737 float32x4_t cos_r = _vcos_poly_neonv8(r);
738
739 // Quadrant-based reconstruction:
740 // n&1 == 0: use sin_r, n&1 == 1: use cos_r
741 // n&2 == 0: positive, n&2 == 2: negative
742 int32x4_t n_and_1 = vandq_s32(n, ones);
743 int32x4_t n_and_2 = vandq_s32(n, twos);
744
745 uint32x4_t swap_mask = vceqq_s32(n_and_1, ones);
746 float32x4_t result = vbslq_f32(swap_mask, cos_r, sin_r);
747
748 uint32x4_t neg_mask = vceqq_s32(n_and_2, twos);
749 result = vreinterpretq_f32_u32(
750 veorq_u32(vreinterpretq_u32_f32(result),
751 vandq_u32(neg_mask, vreinterpretq_u32_f32(sign_bit))));
752
753 vst1q_f32(bVector, result);
754 bVector += 4;
755 }
756
757 for (number = quarterPoints * 4; number < num_points; number++) {
758 *bVector++ = sinf(*aVector++);
759 }
760}
761
762#endif /* LV_HAVE_NEONV8 */
763
764#ifdef LV_HAVE_RVV
765#include <riscv_vector.h>
766
767static inline void
768volk_32f_sin_32f_rvv(float* bVector, const float* aVector, unsigned int num_points)
769{
770 size_t vlmax = __riscv_vsetvlmax_e32m2();
771
772 const vfloat32m2_t c4oPi = __riscv_vfmv_v_f_f32m2(1.2732395f, vlmax);
773 const vfloat32m2_t cPio4a = __riscv_vfmv_v_f_f32m2(0.7853982f, vlmax);
774 const vfloat32m2_t cPio4b = __riscv_vfmv_v_f_f32m2(7.946627e-09f, vlmax);
775 const vfloat32m2_t cPio4c = __riscv_vfmv_v_f_f32m2(3.061617e-17f, vlmax);
776
777 const vfloat32m2_t cf1 = __riscv_vfmv_v_f_f32m2(1.0f, vlmax);
778 const vfloat32m2_t cf4 = __riscv_vfmv_v_f_f32m2(4.0f, vlmax);
779
780 const vfloat32m2_t c2 = __riscv_vfmv_v_f_f32m2(0.0833333333f, vlmax);
781 const vfloat32m2_t c3 = __riscv_vfmv_v_f_f32m2(0.0027777778f, vlmax);
782 const vfloat32m2_t c4 = __riscv_vfmv_v_f_f32m2(4.9603175e-05, vlmax);
783 const vfloat32m2_t c5 = __riscv_vfmv_v_f_f32m2(5.5114638e-07, vlmax);
784
785 size_t n = num_points;
786 for (size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl) {
787 vl = __riscv_vsetvl_e32m2(n);
788 vfloat32m2_t v = __riscv_vle32_v_f32m2(aVector, vl);
789 vfloat32m2_t s = __riscv_vfabs(v, vl);
790 vint32m2_t q = __riscv_vfcvt_x(__riscv_vfmul(s, c4oPi, vl), vl);
791 vfloat32m2_t r = __riscv_vfcvt_f(__riscv_vadd(q, __riscv_vand(q, 1, vl), vl), vl);
792
793 s = __riscv_vfnmsac(s, cPio4a, r, vl);
794 s = __riscv_vfnmsac(s, cPio4b, r, vl);
795 s = __riscv_vfnmsac(s, cPio4c, r, vl);
796
797 s = __riscv_vfmul(s, 1 / 8.0f, vl);
798 s = __riscv_vfmul(s, s, vl);
799 vfloat32m2_t t = s;
800 s = __riscv_vfmsub(s, c5, c4, vl);
801 s = __riscv_vfmadd(s, t, c3, vl);
802 s = __riscv_vfmsub(s, t, c2, vl);
803 s = __riscv_vfmadd(s, t, cf1, vl);
804 s = __riscv_vfmul(s, t, vl);
805 s = __riscv_vfmul(s, __riscv_vfsub(cf4, s, vl), vl);
806 s = __riscv_vfmul(s, __riscv_vfsub(cf4, s, vl), vl);
807 s = __riscv_vfmul(s, __riscv_vfsub(cf4, s, vl), vl);
808 s = __riscv_vfmul(s, 1 / 2.0f, vl);
809
810 vfloat32m2_t sine =
811 __riscv_vfsqrt(__riscv_vfmul(__riscv_vfrsub(s, 2.0f, vl), s, vl), vl);
812 vfloat32m2_t cosine = __riscv_vfsub(cf1, s, vl);
813
814 vbool16_t m1 = __riscv_vmsne(__riscv_vand(__riscv_vadd(q, 1, vl), 2, vl), 0, vl);
815 vbool16_t m2 = __riscv_vmxor(__riscv_vmslt(__riscv_vreinterpret_i32m2(v), 0, vl),
816 __riscv_vmsne(__riscv_vand(q, 4, vl), 0, vl),
817 vl);
818
819 sine = __riscv_vmerge(sine, cosine, m1, vl);
820 sine = __riscv_vfneg_mu(m2, sine, sine, vl);
821
822 __riscv_vse32(bVector, sine, vl);
823 }
824}
825#endif /*LV_HAVE_RVV*/
826
827#endif /* INCLUDED_volk_32f_sin_32f_u_H */