62#ifndef INCLUDED_volk_32f_sincos_32f_x2_a_H
63#define INCLUDED_volk_32f_sincos_32f_x2_a_H
/*
 * Scalar reference loop: for each of the num_points inputs, stores
 * sinf(inVector[i]) in sinVector[i] and cosf(inVector[i]) in cosVector[i].
 *
 * NOTE(review): the opening of the signature (function name and the
 * sinVector / cosVector parameters) and the closing braces are not visible
 * in this excerpt -- confirm against the full file.
 */
69 const float* inVector,
70 unsigned int num_points)
72 for (
unsigned int i = 0; i < num_points; i++) {
// Both outputs are indexed in lockstep with the input.
73 sinVector[i] = sinf(inVector[i]);
74 cosVector[i] = cosf(inVector[i]);
/*
 * Scalar sine/cosine with quadrant range reduction: each input x is split
 * as x = n * (pi/2) + r, where n is x * (2/pi) rounded to an integer, and
 * the two outputs are assembled from sin_r / cos_r according to n mod 4.
 *
 * NOTE(review): the signature opening, the declaration of the integer n,
 * the computation of sin_r / cos_r and the closing braces are not visible
 * in this excerpt -- confirm against the full file.
 */
85 const float* inVector,
86 unsigned int num_points)
// 2/pi, and pi/2 split into a float "hi" part plus a small "lo" correction
// so that subtracting n * (pi/2) in two steps keeps more precision.
91 const float two_over_pi = 0x1.45f306p-1f;
92 const float pi_over_2_hi = 0x1.921fb6p+0f;
93 const float pi_over_2_lo = -0x1.777a5cp-25f;
95 for (
unsigned int i = 0; i < num_points; i++) {
96 float x = inVector[i];
// Quadrant index as a float; rintf rounds using the current rounding mode.
98 float n_f = rintf(x * two_over_pi);
// r = x - n_f * pi_over_2_hi - n_f * pi_over_2_lo, each step a fused
// multiply-add.
101 float r = fmaf(-n_f, pi_over_2_hi, x);
102 r = fmaf(-n_f, pi_over_2_lo, r);
// sin(r + n*pi/2): odd n selects cos_r instead of sin_r; the value is
// negated when bit 1 of n is set (n mod 4 == 2 or 3).
108 float sin_result = (n & 1) ? cos_r : sin_r;
109 sinVector[i] = (n & 2) ? -sin_result : sin_result;
// cos(r + n*pi/2): odd n selects sin_r instead of cos_r; the value is
// negated when n mod 4 == 1 or 2, which is bit 1 of (n + 1).
112 float cos_result = (n & 1) ? sin_r : cos_r;
113 cosVector[i] = ((n + 1) & 2) ? -cos_result : cos_result;
118#ifdef LV_HAVE_AVX512F
119#include <immintrin.h>
/*
 * AVX-512F sine/cosine kernel, aligned variant. Handles 16 floats per
 * iteration using _mm512_load_ps / _mm512_store_ps (these require 64-byte
 * aligned addresses), then finishes the remaining num_points % 16 elements
 * with sinf / cosf.
 *
 * Per lane: n = round(x * 2/pi), r = x - n * (pi/2), and the outputs are
 * chosen from sin_r / cos_r and sign-flipped according to n mod 4.
 *
 * NOTE(review): in this excerpt the cosVector parameter line, the
 * computation of sin_r / cos_r, the remaining arguments of the two
 * _mm512_mask_xor_epi32 calls, the pointer advances inside the vector loop
 * and the closing braces are not visible -- confirm against the full file.
 */
122static inline void volk_32f_sincos_32f_x2_a_avx512f(
float* sinVector,
124 const float* inVector,
125 unsigned int num_points)
// Working cursors over the three arrays.
127 float* sinPtr = sinVector;
128 float* cosPtr = cosVector;
129 const float* inPtr = inVector;
131 unsigned int number = 0;
// Number of full 16-float vectors.
132 unsigned int sixteenPoints = num_points / 16;
// 2/pi, and pi/2 split into a float "hi" part plus a small "lo" correction.
135 const __m512 two_over_pi = _mm512_set1_ps(0x1.45f306p-1f);
136 const __m512 pi_over_2_hi = _mm512_set1_ps(0x1.921fb6p+0f);
137 const __m512 pi_over_2_lo = _mm512_set1_ps(-0x1.777a5cp-25f);
// Integer constants for the quadrant tests and the IEEE-754 sign bit.
139 const __m512i ones = _mm512_set1_epi32(1);
140 const __m512i twos = _mm512_set1_epi32(2);
141 const __m512i sign_bit = _mm512_set1_epi32(0x80000000);
143 for (; number < sixteenPoints; number++) {
144 __m512 x = _mm512_load_ps(inPtr);
// Quadrant index: round x * 2/pi to the nearest integer (as float), then
// convert that already-integral value to int32.
147 __m512 n_f = _mm512_roundscale_ps(_mm512_mul_ps(x, two_over_pi),
148 _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
149 __m512i n = _mm512_cvtps_epi32(n_f);
// r = x - n_f * pi_over_2_hi - n_f * pi_over_2_lo (fused negate-multiply-add).
152 __m512 r = _mm512_fnmadd_ps(n_f, pi_over_2_hi, x);
153 r = _mm512_fnmadd_ps(n_f, pi_over_2_lo, r);
// Bit 0 of n, bit 1 of n, and bit 1 of (n + 1) drive the selection below.
160 __m512i n_and_1 = _mm512_and_si512(n, ones);
161 __m512i n_and_2 = _mm512_and_si512(n, twos);
162 __m512i n_plus_1_and_2 = _mm512_and_si512(_mm512_add_epi32(n, ones), twos);
// Sine: lanes with odd n take cos_r; sin_neg marks lanes with bit 1 of n set.
165 __mmask16 sin_swap = _mm512_cmpeq_epi32_mask(n_and_1, ones);
166 __m512 sin_result = _mm512_mask_blend_ps(sin_swap, sin_r, cos_r);
167 __mmask16 sin_neg = _mm512_cmpeq_epi32_mask(n_and_2, twos);
// Masked integer XOR on the float bit pattern; presumably keyed by sin_neg
// with sign_bit as the XOR operand (those arguments are not visible here).
169 _mm512_castsi512_ps(_mm512_mask_xor_epi32(_mm512_castps_si512(sin_result),
171 _mm512_castps_si512(sin_result),
// Cosine: same swap lanes, but odd n takes sin_r; cos_neg marks lanes where
// bit 1 of (n + 1) is set (n mod 4 == 1 or 2).
175 __mmask16 cos_swap = sin_swap;
176 __m512 cos_result = _mm512_mask_blend_ps(cos_swap, cos_r, sin_r);
177 __mmask16 cos_neg = _mm512_cmpeq_epi32_mask(n_plus_1_and_2, twos);
// Presumably the same masked sign flip, keyed by cos_neg -- TODO confirm.
179 _mm512_castsi512_ps(_mm512_mask_xor_epi32(_mm512_castps_si512(cos_result),
181 _mm512_castps_si512(cos_result),
184 _mm512_store_ps(sinPtr, sin_result);
185 _mm512_store_ps(cosPtr, cos_result);
// Scalar tail for the elements that do not fill a whole vector.
191 number = sixteenPoints * 16;
192 for (; number < num_points; number++) {
193 *sinPtr++ = sinf(*inPtr);
194 *cosPtr++ = cosf(*inPtr++);
200#if LV_HAVE_AVX2 && LV_HAVE_FMA
201#include <immintrin.h>
/*
 * AVX2 + FMA sine/cosine kernel, aligned variant. Handles 8 floats per
 * iteration using _mm256_load_ps / _mm256_store_ps (these require 32-byte
 * aligned addresses), then finishes the remaining num_points % 8 elements
 * with sinf / cosf.
 *
 * Per lane: n = round(x * 2/pi), r = x - n * (pi/2) via fused operations,
 * and the outputs are chosen from sin_r / cos_r and sign-flipped according
 * to n mod 4.
 *
 * NOTE(review): in this excerpt the cosVector parameter line, the
 * computation of sin_r / cos_r, the pointer advances inside the vector loop
 * and the closing braces are not visible -- confirm against the full file.
 */
204static inline void volk_32f_sincos_32f_x2_a_avx2_fma(
float* sinVector,
206 const float* inVector,
207 unsigned int num_points)
// Working cursors over the three arrays.
209 float* sinPtr = sinVector;
210 float* cosPtr = cosVector;
211 const float* inPtr = inVector;
213 unsigned int number = 0;
// Number of full 8-float vectors.
214 unsigned int eighthPoints = num_points / 8;
// 2/pi, and pi/2 split into a float "hi" part plus a small "lo" correction.
217 const __m256 two_over_pi = _mm256_set1_ps(0x1.45f306p-1f);
218 const __m256 pi_over_2_hi = _mm256_set1_ps(0x1.921fb6p+0f);
219 const __m256 pi_over_2_lo = _mm256_set1_ps(-0x1.777a5cp-25f);
// Integer constants for the quadrant tests; -0.0f has only the sign bit set.
221 const __m256i ones = _mm256_set1_epi32(1);
222 const __m256i twos = _mm256_set1_epi32(2);
223 const __m256 sign_bit = _mm256_set1_ps(-0.0f);
225 for (; number < eighthPoints; number++) {
226 __m256 x = _mm256_load_ps(inPtr);
// Quadrant index: round x * 2/pi to the nearest integer (as float), then
// convert that already-integral value to int32.
229 __m256 n_f = _mm256_round_ps(_mm256_mul_ps(x, two_over_pi),
230 _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
231 __m256i n = _mm256_cvtps_epi32(n_f);
// r = x - n_f * pi_over_2_hi - n_f * pi_over_2_lo (fused negate-multiply-add).
234 __m256 r = _mm256_fnmadd_ps(n_f, pi_over_2_hi, x);
235 r = _mm256_fnmadd_ps(n_f, pi_over_2_lo, r);
// Bit 0 of n, bit 1 of n, and bit 1 of (n + 1) drive the selection below.
242 __m256i n_and_1 = _mm256_and_si256(n, ones);
243 __m256i n_and_2 = _mm256_and_si256(n, twos);
244 __m256i n_plus_1_and_2 = _mm256_and_si256(_mm256_add_epi32(n, ones), twos);
// Sine: lanes with odd n take cos_r; lanes with bit 1 of n set get their
// sign bit flipped (all-ones compare mask AND sign_bit, then XOR).
247 __m256 sin_swap = _mm256_castsi256_ps(_mm256_cmpeq_epi32(n_and_1, ones));
248 __m256 sin_result = _mm256_blendv_ps(sin_r, cos_r, sin_swap);
249 __m256 sin_neg = _mm256_castsi256_ps(_mm256_cmpeq_epi32(n_and_2, twos));
250 sin_result = _mm256_xor_ps(sin_result, _mm256_and_ps(sin_neg, sign_bit));
// Cosine: lanes with odd n take sin_r; negated where bit 1 of (n + 1) is
// set, i.e. n mod 4 == 1 or 2.
253 __m256 cos_result = _mm256_blendv_ps(cos_r, sin_r, sin_swap);
254 __m256 cos_neg = _mm256_castsi256_ps(_mm256_cmpeq_epi32(n_plus_1_and_2, twos));
255 cos_result = _mm256_xor_ps(cos_result, _mm256_and_ps(cos_neg, sign_bit));
257 _mm256_store_ps(sinPtr, sin_result);
258 _mm256_store_ps(cosPtr, cos_result);
// Scalar tail for the elements that do not fill a whole vector.
264 number = eighthPoints * 8;
265 for (; number < num_points; number++) {
266 *sinPtr++ = sinf(*inPtr);
267 *cosPtr++ = cosf(*inPtr++);
274#include <immintrin.h>
/*
 * AVX2 (no FMA) sine/cosine kernel, aligned variant. Handles 8 floats per
 * iteration using _mm256_load_ps / _mm256_store_ps (these require 32-byte
 * aligned addresses), then finishes the remaining num_points % 8 elements
 * with sinf / cosf.
 *
 * Per lane: n = round(x * 2/pi), r = x - n * (pi/2) using separate multiply
 * and subtract instructions, and the outputs are chosen from sin_r / cos_r
 * and sign-flipped according to n mod 4.
 *
 * NOTE(review): in this excerpt the cosVector parameter line, the
 * computation of sin_r / cos_r, the pointer advances inside the vector loop
 * and the closing braces are not visible -- confirm against the full file.
 */
277static inline void volk_32f_sincos_32f_x2_a_avx2(
float* sinVector,
279 const float* inVector,
280 unsigned int num_points)
// Working cursors over the three arrays.
282 float* sinPtr = sinVector;
283 float* cosPtr = cosVector;
284 const float* inPtr = inVector;
286 unsigned int number = 0;
// Number of full 8-float vectors.
287 unsigned int eighthPoints = num_points / 8;
// 2/pi, and pi/2 split into a float "hi" part plus a small "lo" correction.
290 const __m256 two_over_pi = _mm256_set1_ps(0x1.45f306p-1f);
291 const __m256 pi_over_2_hi = _mm256_set1_ps(0x1.921fb6p+0f);
292 const __m256 pi_over_2_lo = _mm256_set1_ps(-0x1.777a5cp-25f);
// Integer constants for the quadrant tests; -0.0f has only the sign bit set.
294 const __m256i ones = _mm256_set1_epi32(1);
295 const __m256i twos = _mm256_set1_epi32(2);
296 const __m256 sign_bit = _mm256_set1_ps(-0.0f);
298 for (; number < eighthPoints; number++) {
299 __m256 x = _mm256_load_ps(inPtr);
// Quadrant index: round x * 2/pi to the nearest integer (as float), then
// convert that already-integral value to int32.
302 __m256 n_f = _mm256_round_ps(_mm256_mul_ps(x, two_over_pi),
303 _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
304 __m256i n = _mm256_cvtps_epi32(n_f);
// r = x - n_f * pi_over_2_hi - n_f * pi_over_2_lo; unlike the FMA variant,
// each product is rounded before the subtraction.
307 __m256 r = _mm256_sub_ps(x, _mm256_mul_ps(n_f, pi_over_2_hi));
308 r = _mm256_sub_ps(r, _mm256_mul_ps(n_f, pi_over_2_lo));
// Bit 0 of n, bit 1 of n, and bit 1 of (n + 1) drive the selection below.
315 __m256i n_and_1 = _mm256_and_si256(n, ones);
316 __m256i n_and_2 = _mm256_and_si256(n, twos);
317 __m256i n_plus_1_and_2 = _mm256_and_si256(_mm256_add_epi32(n, ones), twos);
// Sine: lanes with odd n take cos_r; lanes with bit 1 of n set get their
// sign bit flipped (all-ones compare mask AND sign_bit, then XOR).
320 __m256 sin_swap = _mm256_castsi256_ps(_mm256_cmpeq_epi32(n_and_1, ones));
321 __m256 sin_result = _mm256_blendv_ps(sin_r, cos_r, sin_swap);
322 __m256 sin_neg = _mm256_castsi256_ps(_mm256_cmpeq_epi32(n_and_2, twos));
323 sin_result = _mm256_xor_ps(sin_result, _mm256_and_ps(sin_neg, sign_bit));
// Cosine: lanes with odd n take sin_r; negated where bit 1 of (n + 1) is
// set, i.e. n mod 4 == 1 or 2.
326 __m256 cos_result = _mm256_blendv_ps(cos_r, sin_r, sin_swap);
327 __m256 cos_neg = _mm256_castsi256_ps(_mm256_cmpeq_epi32(n_plus_1_and_2, twos));
328 cos_result = _mm256_xor_ps(cos_result, _mm256_and_ps(cos_neg, sign_bit));
330 _mm256_store_ps(sinPtr, sin_result);
331 _mm256_store_ps(cosPtr, cos_result);
// Scalar tail for the elements that do not fill a whole vector.
337 number = eighthPoints * 8;
338 for (; number < num_points; number++) {
339 *sinPtr++ = sinf(*inPtr);
340 *cosPtr++ = cosf(*inPtr++);
347#include <smmintrin.h>
/*
 * SSE4.1 sine/cosine kernel, aligned variant. Handles 4 floats per
 * iteration using _mm_load_ps / _mm_store_ps (these require 16-byte aligned
 * addresses), then finishes the remaining num_points % 4 elements with
 * sinf / cosf.
 *
 * Per lane: n = round(x * 2/pi), r = x - n * (pi/2) using separate multiply
 * and subtract instructions, and the outputs are chosen from sin_r / cos_r
 * and sign-flipped according to n mod 4.
 *
 * NOTE(review): in this excerpt the cosVector parameter line, the
 * computation of sin_r / cos_r, the pointer advances inside the vector loop
 * and the closing braces are not visible -- confirm against the full file.
 */
350static inline void volk_32f_sincos_32f_x2_a_sse4_1(
float* sinVector,
352 const float* inVector,
353 unsigned int num_points)
// Working cursors over the three arrays.
355 float* sinPtr = sinVector;
356 float* cosPtr = cosVector;
357 const float* inPtr = inVector;
359 unsigned int number = 0;
// Number of full 4-float vectors.
360 unsigned int quarterPoints = num_points / 4;
// 2/pi, and pi/2 split into a float "hi" part plus a small "lo" correction.
363 const __m128 two_over_pi = _mm_set1_ps(0x1.45f306p-1f);
364 const __m128 pi_over_2_hi = _mm_set1_ps(0x1.921fb6p+0f);
365 const __m128 pi_over_2_lo = _mm_set1_ps(-0x1.777a5cp-25f);
// Integer constants for the quadrant tests; -0.0f has only the sign bit set.
367 const __m128i ones = _mm_set1_epi32(1);
368 const __m128i twos = _mm_set1_epi32(2);
369 const __m128 sign_bit = _mm_set1_ps(-0.0f);
371 for (; number < quarterPoints; number++) {
372 __m128 x = _mm_load_ps(inPtr);
// Quadrant index: round x * 2/pi to the nearest integer (as float), then
// convert that already-integral value to int32.
375 __m128 n_f = _mm_round_ps(_mm_mul_ps(x, two_over_pi),
376 _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
377 __m128i n = _mm_cvtps_epi32(n_f);
// r = x - n_f * pi_over_2_hi - n_f * pi_over_2_lo, with each product
// rounded before the subtraction (no fused multiply-add here).
380 __m128 r = _mm_sub_ps(x, _mm_mul_ps(n_f, pi_over_2_hi));
381 r = _mm_sub_ps(r, _mm_mul_ps(n_f, pi_over_2_lo));
// Bit 0 of n, bit 1 of n, and bit 1 of (n + 1) drive the selection below.
388 __m128i n_and_1 = _mm_and_si128(n, ones);
389 __m128i n_and_2 = _mm_and_si128(n, twos);
390 __m128i n_plus_1_and_2 = _mm_and_si128(_mm_add_epi32(n, ones), twos);
// Sine: lanes with odd n take cos_r; lanes with bit 1 of n set get their
// sign bit flipped (all-ones compare mask AND sign_bit, then XOR).
393 __m128 sin_swap = _mm_castsi128_ps(_mm_cmpeq_epi32(n_and_1, ones));
394 __m128 sin_result = _mm_blendv_ps(sin_r, cos_r, sin_swap);
395 __m128 sin_neg = _mm_castsi128_ps(_mm_cmpeq_epi32(n_and_2, twos));
396 sin_result = _mm_xor_ps(sin_result, _mm_and_ps(sin_neg, sign_bit));
// Cosine: lanes with odd n take sin_r; negated where bit 1 of (n + 1) is
// set, i.e. n mod 4 == 1 or 2.
399 __m128 cos_result = _mm_blendv_ps(cos_r, sin_r, sin_swap);
400 __m128 cos_neg = _mm_castsi128_ps(_mm_cmpeq_epi32(n_plus_1_and_2, twos));
401 cos_result = _mm_xor_ps(cos_result, _mm_and_ps(cos_neg, sign_bit));
403 _mm_store_ps(sinPtr, sin_result);
404 _mm_store_ps(cosPtr, cos_result);
// Scalar tail for the elements that do not fill a whole vector.
410 number = quarterPoints * 4;
411 for (; number < num_points; number++) {
412 *sinPtr++ = sinf(*inPtr);
413 *cosPtr++ = cosf(*inPtr++);
422#ifndef INCLUDED_volk_32f_sincos_32f_x2_u_H
423#define INCLUDED_volk_32f_sincos_32f_x2_u_H
425#ifdef LV_HAVE_AVX512F
426#include <immintrin.h>
/*
 * AVX-512F sine/cosine kernel, unaligned variant. Handles 16 floats per
 * iteration using _mm512_loadu_ps / _mm512_storeu_ps (no alignment
 * requirement on the pointers), then finishes the remaining
 * num_points % 16 elements with sinf / cosf.
 *
 * Per lane: n = round(x * 2/pi), r = x - n * (pi/2), and the outputs are
 * chosen from sin_r / cos_r and sign-flipped according to n mod 4.
 *
 * NOTE(review): in this excerpt the cosVector parameter line, the
 * computation of sin_r / cos_r, the remaining arguments of the two
 * _mm512_mask_xor_epi32 calls, the pointer advances inside the vector loop
 * and the closing braces are not visible -- confirm against the full file.
 */
429static inline void volk_32f_sincos_32f_x2_u_avx512f(
float* sinVector,
431 const float* inVector,
432 unsigned int num_points)
// Working cursors over the three arrays.
434 float* sinPtr = sinVector;
435 float* cosPtr = cosVector;
436 const float* inPtr = inVector;
438 unsigned int number = 0;
// Number of full 16-float vectors.
439 unsigned int sixteenPoints = num_points / 16;
// 2/pi, and pi/2 split into a float "hi" part plus a small "lo" correction.
442 const __m512 two_over_pi = _mm512_set1_ps(0x1.45f306p-1f);
443 const __m512 pi_over_2_hi = _mm512_set1_ps(0x1.921fb6p+0f);
444 const __m512 pi_over_2_lo = _mm512_set1_ps(-0x1.777a5cp-25f);
// Integer constants for the quadrant tests and the IEEE-754 sign bit.
446 const __m512i ones = _mm512_set1_epi32(1);
447 const __m512i twos = _mm512_set1_epi32(2);
448 const __m512i sign_bit = _mm512_set1_epi32(0x80000000);
450 for (; number < sixteenPoints; number++) {
451 __m512 x = _mm512_loadu_ps(inPtr);
// Quadrant index: round x * 2/pi to the nearest integer (as float), then
// convert that already-integral value to int32.
454 __m512 n_f = _mm512_roundscale_ps(_mm512_mul_ps(x, two_over_pi),
455 _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
456 __m512i n = _mm512_cvtps_epi32(n_f);
// r = x - n_f * pi_over_2_hi - n_f * pi_over_2_lo (fused negate-multiply-add).
459 __m512 r = _mm512_fnmadd_ps(n_f, pi_over_2_hi, x);
460 r = _mm512_fnmadd_ps(n_f, pi_over_2_lo, r);
// Bit 0 of n, bit 1 of n, and bit 1 of (n + 1) drive the selection below.
467 __m512i n_and_1 = _mm512_and_si512(n, ones);
468 __m512i n_and_2 = _mm512_and_si512(n, twos);
469 __m512i n_plus_1_and_2 = _mm512_and_si512(_mm512_add_epi32(n, ones), twos);
// Sine: lanes with odd n take cos_r; sin_neg marks lanes with bit 1 of n set.
472 __mmask16 sin_swap = _mm512_cmpeq_epi32_mask(n_and_1, ones);
473 __m512 sin_result = _mm512_mask_blend_ps(sin_swap, sin_r, cos_r);
474 __mmask16 sin_neg = _mm512_cmpeq_epi32_mask(n_and_2, twos);
// Masked integer XOR on the float bit pattern; presumably keyed by sin_neg
// with sign_bit as the XOR operand (those arguments are not visible here).
476 _mm512_castsi512_ps(_mm512_mask_xor_epi32(_mm512_castps_si512(sin_result),
478 _mm512_castps_si512(sin_result),
// Cosine: same swap lanes, but odd n takes sin_r; cos_neg marks lanes where
// bit 1 of (n + 1) is set (n mod 4 == 1 or 2).
482 __mmask16 cos_swap = sin_swap;
483 __m512 cos_result = _mm512_mask_blend_ps(cos_swap, cos_r, sin_r);
484 __mmask16 cos_neg = _mm512_cmpeq_epi32_mask(n_plus_1_and_2, twos);
// Presumably the same masked sign flip, keyed by cos_neg -- TODO confirm.
486 _mm512_castsi512_ps(_mm512_mask_xor_epi32(_mm512_castps_si512(cos_result),
488 _mm512_castps_si512(cos_result),
491 _mm512_storeu_ps(sinPtr, sin_result);
492 _mm512_storeu_ps(cosPtr, cos_result);
// Scalar tail for the elements that do not fill a whole vector.
498 number = sixteenPoints * 16;
499 for (; number < num_points; number++) {
500 *sinPtr++ = sinf(*inPtr);
501 *cosPtr++ = cosf(*inPtr++);
507#if LV_HAVE_AVX2 && LV_HAVE_FMA
508#include <immintrin.h>
/*
 * AVX2 + FMA sine/cosine kernel, unaligned variant. Handles 8 floats per
 * iteration using _mm256_loadu_ps / _mm256_storeu_ps (no alignment
 * requirement on the pointers), then finishes the remaining
 * num_points % 8 elements with sinf / cosf.
 *
 * Per lane: n = round(x * 2/pi), r = x - n * (pi/2) via fused operations,
 * and the outputs are chosen from sin_r / cos_r and sign-flipped according
 * to n mod 4.
 *
 * NOTE(review): in this excerpt the cosVector parameter line, the
 * computation of sin_r / cos_r, the pointer advances inside the vector loop
 * and the closing braces are not visible -- confirm against the full file.
 */
511static inline void volk_32f_sincos_32f_x2_u_avx2_fma(
float* sinVector,
513 const float* inVector,
514 unsigned int num_points)
// Working cursors over the three arrays.
516 float* sinPtr = sinVector;
517 float* cosPtr = cosVector;
518 const float* inPtr = inVector;
520 unsigned int number = 0;
// Number of full 8-float vectors.
521 unsigned int eighthPoints = num_points / 8;
// 2/pi, and pi/2 split into a float "hi" part plus a small "lo" correction.
524 const __m256 two_over_pi = _mm256_set1_ps(0x1.45f306p-1f);
525 const __m256 pi_over_2_hi = _mm256_set1_ps(0x1.921fb6p+0f);
526 const __m256 pi_over_2_lo = _mm256_set1_ps(-0x1.777a5cp-25f);
// Integer constants for the quadrant tests; -0.0f has only the sign bit set.
528 const __m256i ones = _mm256_set1_epi32(1);
529 const __m256i twos = _mm256_set1_epi32(2);
530 const __m256 sign_bit = _mm256_set1_ps(-0.0f);
532 for (; number < eighthPoints; number++) {
533 __m256 x = _mm256_loadu_ps(inPtr);
// Quadrant index: round x * 2/pi to the nearest integer (as float), then
// convert that already-integral value to int32.
536 __m256 n_f = _mm256_round_ps(_mm256_mul_ps(x, two_over_pi),
537 _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
538 __m256i n = _mm256_cvtps_epi32(n_f);
// r = x - n_f * pi_over_2_hi - n_f * pi_over_2_lo (fused negate-multiply-add).
541 __m256 r = _mm256_fnmadd_ps(n_f, pi_over_2_hi, x);
542 r = _mm256_fnmadd_ps(n_f, pi_over_2_lo, r);
// Bit 0 of n, bit 1 of n, and bit 1 of (n + 1) drive the selection below.
549 __m256i n_and_1 = _mm256_and_si256(n, ones);
550 __m256i n_and_2 = _mm256_and_si256(n, twos);
551 __m256i n_plus_1_and_2 = _mm256_and_si256(_mm256_add_epi32(n, ones), twos);
// Sine: lanes with odd n take cos_r; lanes with bit 1 of n set get their
// sign bit flipped (all-ones compare mask AND sign_bit, then XOR).
554 __m256 sin_swap = _mm256_castsi256_ps(_mm256_cmpeq_epi32(n_and_1, ones));
555 __m256 sin_result = _mm256_blendv_ps(sin_r, cos_r, sin_swap);
556 __m256 sin_neg = _mm256_castsi256_ps(_mm256_cmpeq_epi32(n_and_2, twos));
557 sin_result = _mm256_xor_ps(sin_result, _mm256_and_ps(sin_neg, sign_bit));
// Cosine: lanes with odd n take sin_r; negated where bit 1 of (n + 1) is
// set, i.e. n mod 4 == 1 or 2.
560 __m256 cos_result = _mm256_blendv_ps(cos_r, sin_r, sin_swap);
561 __m256 cos_neg = _mm256_castsi256_ps(_mm256_cmpeq_epi32(n_plus_1_and_2, twos));
562 cos_result = _mm256_xor_ps(cos_result, _mm256_and_ps(cos_neg, sign_bit));
564 _mm256_storeu_ps(sinPtr, sin_result);
565 _mm256_storeu_ps(cosPtr, cos_result);
// Scalar tail for the elements that do not fill a whole vector.
571 number = eighthPoints * 8;
572 for (; number < num_points; number++) {
573 *sinPtr++ = sinf(*inPtr);
574 *cosPtr++ = cosf(*inPtr++);
581#include <immintrin.h>
/*
 * AVX2 (no FMA) sine/cosine kernel, unaligned variant. Handles 8 floats per
 * iteration using _mm256_loadu_ps / _mm256_storeu_ps (no alignment
 * requirement on the pointers), then finishes the remaining
 * num_points % 8 elements with sinf / cosf.
 *
 * Per lane: n = round(x * 2/pi), r = x - n * (pi/2) using separate multiply
 * and subtract instructions, and the outputs are chosen from sin_r / cos_r
 * and sign-flipped according to n mod 4.
 *
 * NOTE(review): in this excerpt the cosVector parameter line, the
 * computation of sin_r / cos_r, the pointer advances inside the vector loop
 * and the closing braces are not visible -- confirm against the full file.
 */
584static inline void volk_32f_sincos_32f_x2_u_avx2(
float* sinVector,
586 const float* inVector,
587 unsigned int num_points)
// Working cursors over the three arrays.
589 float* sinPtr = sinVector;
590 float* cosPtr = cosVector;
591 const float* inPtr = inVector;
593 unsigned int number = 0;
// Number of full 8-float vectors.
594 unsigned int eighthPoints = num_points / 8;
// 2/pi, and pi/2 split into a float "hi" part plus a small "lo" correction.
597 const __m256 two_over_pi = _mm256_set1_ps(0x1.45f306p-1f);
598 const __m256 pi_over_2_hi = _mm256_set1_ps(0x1.921fb6p+0f);
599 const __m256 pi_over_2_lo = _mm256_set1_ps(-0x1.777a5cp-25f);
// Integer constants for the quadrant tests; -0.0f has only the sign bit set.
601 const __m256i ones = _mm256_set1_epi32(1);
602 const __m256i twos = _mm256_set1_epi32(2);
603 const __m256 sign_bit = _mm256_set1_ps(-0.0f);
605 for (; number < eighthPoints; number++) {
606 __m256 x = _mm256_loadu_ps(inPtr);
// Quadrant index: round x * 2/pi to the nearest integer (as float), then
// convert that already-integral value to int32.
609 __m256 n_f = _mm256_round_ps(_mm256_mul_ps(x, two_over_pi),
610 _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
611 __m256i n = _mm256_cvtps_epi32(n_f);
// r = x - n_f * pi_over_2_hi - n_f * pi_over_2_lo; unlike the FMA variant,
// each product is rounded before the subtraction.
614 __m256 r = _mm256_sub_ps(x, _mm256_mul_ps(n_f, pi_over_2_hi));
615 r = _mm256_sub_ps(r, _mm256_mul_ps(n_f, pi_over_2_lo));
// Bit 0 of n, bit 1 of n, and bit 1 of (n + 1) drive the selection below.
622 __m256i n_and_1 = _mm256_and_si256(n, ones);
623 __m256i n_and_2 = _mm256_and_si256(n, twos);
624 __m256i n_plus_1_and_2 = _mm256_and_si256(_mm256_add_epi32(n, ones), twos);
// Sine: lanes with odd n take cos_r; lanes with bit 1 of n set get their
// sign bit flipped (all-ones compare mask AND sign_bit, then XOR).
627 __m256 sin_swap = _mm256_castsi256_ps(_mm256_cmpeq_epi32(n_and_1, ones));
628 __m256 sin_result = _mm256_blendv_ps(sin_r, cos_r, sin_swap);
629 __m256 sin_neg = _mm256_castsi256_ps(_mm256_cmpeq_epi32(n_and_2, twos));
630 sin_result = _mm256_xor_ps(sin_result, _mm256_and_ps(sin_neg, sign_bit));
// Cosine: lanes with odd n take sin_r; negated where bit 1 of (n + 1) is
// set, i.e. n mod 4 == 1 or 2.
633 __m256 cos_result = _mm256_blendv_ps(cos_r, sin_r, sin_swap);
634 __m256 cos_neg = _mm256_castsi256_ps(_mm256_cmpeq_epi32(n_plus_1_and_2, twos));
635 cos_result = _mm256_xor_ps(cos_result, _mm256_and_ps(cos_neg, sign_bit));
637 _mm256_storeu_ps(sinPtr, sin_result);
638 _mm256_storeu_ps(cosPtr, cos_result);
// Scalar tail for the elements that do not fill a whole vector.
644 number = eighthPoints * 8;
645 for (; number < num_points; number++) {
646 *sinPtr++ = sinf(*inPtr);
647 *cosPtr++ = cosf(*inPtr++);
654#include <smmintrin.h>
/*
 * SSE4.1 sine/cosine kernel, unaligned variant. Handles 4 floats per
 * iteration using _mm_loadu_ps / _mm_storeu_ps (no alignment requirement on
 * the pointers), then finishes the remaining num_points % 4 elements with
 * sinf / cosf.
 *
 * Per lane: n = round(x * 2/pi), r = x - n * (pi/2) using separate multiply
 * and subtract instructions, and the outputs are chosen from sin_r / cos_r
 * and sign-flipped according to n mod 4.
 *
 * NOTE(review): in this excerpt the cosVector parameter line, the
 * computation of sin_r / cos_r, the pointer advances inside the vector loop
 * and the closing braces are not visible -- confirm against the full file.
 */
657static inline void volk_32f_sincos_32f_x2_u_sse4_1(
float* sinVector,
659 const float* inVector,
660 unsigned int num_points)
// Working cursors over the three arrays.
662 float* sinPtr = sinVector;
663 float* cosPtr = cosVector;
664 const float* inPtr = inVector;
666 unsigned int number = 0;
// Number of full 4-float vectors.
667 unsigned int quarterPoints = num_points / 4;
// 2/pi, and pi/2 split into a float "hi" part plus a small "lo" correction.
670 const __m128 two_over_pi = _mm_set1_ps(0x1.45f306p-1f);
671 const __m128 pi_over_2_hi = _mm_set1_ps(0x1.921fb6p+0f);
672 const __m128 pi_over_2_lo = _mm_set1_ps(-0x1.777a5cp-25f);
// Integer constants for the quadrant tests; -0.0f has only the sign bit set.
674 const __m128i ones = _mm_set1_epi32(1);
675 const __m128i twos = _mm_set1_epi32(2);
676 const __m128 sign_bit = _mm_set1_ps(-0.0f);
678 for (; number < quarterPoints; number++) {
679 __m128 x = _mm_loadu_ps(inPtr);
// Quadrant index: round x * 2/pi to the nearest integer (as float), then
// convert that already-integral value to int32.
682 __m128 n_f = _mm_round_ps(_mm_mul_ps(x, two_over_pi),
683 _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
684 __m128i n = _mm_cvtps_epi32(n_f);
// r = x - n_f * pi_over_2_hi - n_f * pi_over_2_lo, with each product
// rounded before the subtraction (no fused multiply-add here).
687 __m128 r = _mm_sub_ps(x, _mm_mul_ps(n_f, pi_over_2_hi));
688 r = _mm_sub_ps(r, _mm_mul_ps(n_f, pi_over_2_lo));
// Bit 0 of n, bit 1 of n, and bit 1 of (n + 1) drive the selection below.
695 __m128i n_and_1 = _mm_and_si128(n, ones);
696 __m128i n_and_2 = _mm_and_si128(n, twos);
697 __m128i n_plus_1_and_2 = _mm_and_si128(_mm_add_epi32(n, ones), twos);
// Sine: lanes with odd n take cos_r; lanes with bit 1 of n set get their
// sign bit flipped (all-ones compare mask AND sign_bit, then XOR).
700 __m128 sin_swap = _mm_castsi128_ps(_mm_cmpeq_epi32(n_and_1, ones));
701 __m128 sin_result = _mm_blendv_ps(sin_r, cos_r, sin_swap);
702 __m128 sin_neg = _mm_castsi128_ps(_mm_cmpeq_epi32(n_and_2, twos));
703 sin_result = _mm_xor_ps(sin_result, _mm_and_ps(sin_neg, sign_bit));
// Cosine: lanes with odd n take sin_r; negated where bit 1 of (n + 1) is
// set, i.e. n mod 4 == 1 or 2.
706 __m128 cos_result = _mm_blendv_ps(cos_r, sin_r, sin_swap);
707 __m128 cos_neg = _mm_castsi128_ps(_mm_cmpeq_epi32(n_plus_1_and_2, twos));
708 cos_result = _mm_xor_ps(cos_result, _mm_and_ps(cos_neg, sign_bit));
710 _mm_storeu_ps(sinPtr, sin_result);
711 _mm_storeu_ps(cosPtr, cos_result);
// Scalar tail for the elements that do not fill a whole vector.
717 number = quarterPoints * 4;
718 for (; number < num_points; number++) {
719 *sinPtr++ = sinf(*inPtr);
720 *cosPtr++ = cosf(*inPtr++);
/*
 * NEON sine/cosine kernel. Handles 4 floats per iteration and finishes the
 * remaining num_points % 4 elements with sinf / cosf. Rounding to the
 * nearest quadrant is emulated with an add-and-truncate sequence, and the
 * range reduction uses vmlsq_f32 (multiply-subtract).
 *
 * Per lane: n = round(x * 2/pi), r = x - n * (pi/2), and the outputs are
 * chosen from sin_r / cos_r and sign-flipped according to n mod 4.
 *
 * NOTE(review): in this excerpt the opening of the signature (function name
 * and the sinVector / cosVector parameters), the computation of
 * sin_r / cos_r, the pointer advances inside the vector loop and the closing
 * braces are not visible -- confirm against the full file.
 */
733 const float* inVector,
734 unsigned int num_points)
// 2/pi, and pi/2 split into a float "hi" part plus a small "lo" correction.
737 const float32x4_t two_over_pi = vdupq_n_f32(0x1.45f306p-1f);
738 const float32x4_t pi_over_2_hi = vdupq_n_f32(0x1.921fb6p+0f);
739 const float32x4_t pi_over_2_lo = vdupq_n_f32(-0x1.777a5cp-25f);
// Integer constants for the quadrant tests; -0.0f has only the sign bit set.
741 const int32x4_t ones = vdupq_n_s32(1);
742 const int32x4_t twos = vdupq_n_s32(2);
743 const float32x4_t sign_bit = vdupq_n_f32(-0.0f);
// Helpers for the manual rounding below.
744 const float32x4_t half = vdupq_n_f32(0.5f);
745 const float32x4_t neg_half = vdupq_n_f32(-0.5f);
746 const float32x4_t fzeroes = vdupq_n_f32(0.0f);
748 unsigned int number = 0;
// Number of full 4-float vectors.
749 const unsigned int quarterPoints = num_points / 4;
751 for (; number < quarterPoints; number++) {
752 float32x4_t x = vld1q_f32(inVector);
// Round x * 2/pi to the nearest integer by adding +0.5 (or -0.5 for negative
// lanes) and truncating through a float -> int -> float round trip; ties
// therefore round away from zero.
756 float32x4_t scaled = vmulq_f32(x, two_over_pi);
757 uint32x4_t is_neg = vcltq_f32(scaled, fzeroes);
758 float32x4_t adj = vbslq_f32(is_neg, neg_half, half);
759 float32x4_t n_f = vcvtq_f32_s32(vcvtq_s32_f32(vaddq_f32(scaled, adj)));
// Integer copy of the quadrant index for the bit tests.
760 int32x4_t n = vcvtq_s32_f32(n_f);
// r = x - n_f * pi_over_2_hi - n_f * pi_over_2_lo.
763 float32x4_t r = vmlsq_f32(x, n_f, pi_over_2_hi);
764 r = vmlsq_f32(r, n_f, pi_over_2_lo);
// Bit 0 of n, bit 1 of n, and bit 1 of (n + 1) drive the selection below.
771 int32x4_t n_and_1 = vandq_s32(n, ones);
772 int32x4_t n_and_2 = vandq_s32(n, twos);
773 int32x4_t n_plus_1_and_2 = vandq_s32(vaddq_s32(n, ones), twos);
// All-ones in lanes where n is odd.
775 uint32x4_t swap_mask = vceqq_s32(n_and_1, ones);
// Sine: odd-n lanes take cos_r; lanes with bit 1 of n set get their sign
// bit flipped via XOR with (mask AND sign_bit).
778 float32x4_t sin_result = vbslq_f32(swap_mask, cos_r, sin_r);
779 uint32x4_t sin_neg = vceqq_s32(n_and_2, twos);
780 sin_result = vreinterpretq_f32_u32(
781 veorq_u32(vreinterpretq_u32_f32(sin_result),
782 vandq_u32(sin_neg, vreinterpretq_u32_f32(sign_bit))));
// Cosine: odd-n lanes take sin_r; negated where bit 1 of (n + 1) is set,
// i.e. n mod 4 == 1 or 2.
785 float32x4_t cos_result = vbslq_f32(swap_mask, sin_r, cos_r);
786 uint32x4_t cos_neg = vceqq_s32(n_plus_1_and_2, twos);
787 cos_result = vreinterpretq_f32_u32(
788 veorq_u32(vreinterpretq_u32_f32(cos_result),
789 vandq_u32(cos_neg, vreinterpretq_u32_f32(sign_bit))));
791 vst1q_f32(sinVector, sin_result);
792 vst1q_f32(cosVector, cos_result);
// Scalar tail for the elements that do not fill a whole vector; the
// parameter pointers themselves are advanced here.
797 for (number = quarterPoints * 4; number < num_points; number++) {
798 *sinVector++ = sinf(*inVector);
799 *cosVector++ = cosf(*inVector++);
/*
 * NEON (ARMv8) sine/cosine kernel. Handles 4 floats per iteration and
 * finishes the remaining num_points % 4 elements with sinf / cosf. Uses
 * vrndnq_f32 for rounding and vfmsq_f32 (fused multiply-subtract) for the
 * range reduction.
 *
 * Per lane: n = round(x * 2/pi), r = x - n * (pi/2), and the outputs are
 * chosen from sin_r / cos_r and sign-flipped according to n mod 4.
 *
 * NOTE(review): in this excerpt the cosVector parameter line, the pointer
 * advances inside the vector loop and the closing braces are not visible --
 * confirm against the full file.
 */
809static inline void volk_32f_sincos_32f_x2_neonv8(
float* sinVector,
811 const float* inVector,
812 unsigned int num_points)
// 2/pi, and pi/2 split into a float "hi" part plus a small "lo" correction.
815 const float32x4_t two_over_pi = vdupq_n_f32(0x1.45f306p-1f);
816 const float32x4_t pi_over_2_hi = vdupq_n_f32(0x1.921fb6p+0f);
817 const float32x4_t pi_over_2_lo = vdupq_n_f32(-0x1.777a5cp-25f);
// Integer constants for the quadrant tests; -0.0f has only the sign bit set.
819 const int32x4_t ones = vdupq_n_s32(1);
820 const int32x4_t twos = vdupq_n_s32(2);
821 const float32x4_t sign_bit = vdupq_n_f32(-0.0f);
823 unsigned int number = 0;
// Number of full 4-float vectors.
824 const unsigned int quarterPoints = num_points / 4;
826 for (; number < quarterPoints; number++) {
827 float32x4_t x = vld1q_f32(inVector);
// Quadrant index: round x * 2/pi to nearest (ties to even), then convert
// the already-integral value to int32.
831 float32x4_t n_f = vrndnq_f32(vmulq_f32(x, two_over_pi));
832 int32x4_t n = vcvtq_s32_f32(n_f);
// r = x - n_f * pi_over_2_hi - n_f * pi_over_2_lo (fused multiply-subtract).
835 float32x4_t r = vfmsq_f32(x, n_f, pi_over_2_hi);
836 r = vfmsq_f32(r, n_f, pi_over_2_lo);
// Sine and cosine of the reduced argument; the helpers are defined outside
// this excerpt (presumably polynomial approximations -- TODO confirm).
839 float32x4_t sin_r = _vsin_poly_neonv8(r);
840 float32x4_t cos_r = _vcos_poly_neonv8(r);
// Bit 0 of n, bit 1 of n, and bit 1 of (n + 1) drive the selection below.
843 int32x4_t n_and_1 = vandq_s32(n, ones);
844 int32x4_t n_and_2 = vandq_s32(n, twos);
845 int32x4_t n_plus_1_and_2 = vandq_s32(vaddq_s32(n, ones), twos);
// All-ones in lanes where n is odd.
847 uint32x4_t swap_mask = vceqq_s32(n_and_1, ones);
// Sine: odd-n lanes take cos_r; lanes with bit 1 of n set get their sign
// bit flipped via XOR with (mask AND sign_bit).
850 float32x4_t sin_result = vbslq_f32(swap_mask, cos_r, sin_r);
851 uint32x4_t sin_neg = vceqq_s32(n_and_2, twos);
852 sin_result = vreinterpretq_f32_u32(
853 veorq_u32(vreinterpretq_u32_f32(sin_result),
854 vandq_u32(sin_neg, vreinterpretq_u32_f32(sign_bit))));
// Cosine: odd-n lanes take sin_r; negated where bit 1 of (n + 1) is set,
// i.e. n mod 4 == 1 or 2.
857 float32x4_t cos_result = vbslq_f32(swap_mask, sin_r, cos_r);
858 uint32x4_t cos_neg = vceqq_s32(n_plus_1_and_2, twos);
859 cos_result = vreinterpretq_f32_u32(
860 veorq_u32(vreinterpretq_u32_f32(cos_result),
861 vandq_u32(cos_neg, vreinterpretq_u32_f32(sign_bit))));
863 vst1q_f32(sinVector, sin_result);
864 vst1q_f32(cosVector, cos_result);
// Scalar tail for the elements that do not fill a whole vector; the
// parameter pointers themselves are advanced here.
869 for (number = quarterPoints * 4; number < num_points; number++) {
870 *sinVector++ = sinf(*inVector);
871 *cosVector++ = cosf(*inVector++);