64#ifndef INCLUDED_volk_32f_sin_32f_a_H
65#define INCLUDED_volk_32f_sin_32f_a_H
73 for (
unsigned int number = 0; number < num_points; number++) {
// Element-wise sin(x): aligned AVX-512F path, 16 floats per iteration with a
// scalar sinf() tail for the remainder.
// NOTE(review): several interior lines (opening brace, the sin_r/cos_r
// polynomial evaluations, pointer increments, closing braces) are not visible
// in this chunk of the file.
static inline void volk_32f_sin_32f_a_avx512f(
    float* sinVector,
    const float* inVector,
    unsigned int num_points)
    float* sinPtr = sinVector;
    const float* inPtr = inVector;

    unsigned int number = 0;
    unsigned int sixteenPoints = num_points / 16;

    // Range-reduction constants: 2/pi, plus pi/2 split into a high part and a
    // small low correction so x - n*pi/2 can be subtracted in two steps.
    const __m512 two_over_pi = _mm512_set1_ps(0x1.45f306p-1f);
    const __m512 pi_over_2_hi = _mm512_set1_ps(0x1.921fb6p+0f);
    const __m512 pi_over_2_lo = _mm512_set1_ps(-0x1.777a5cp-25f);

    const __m512i ones = _mm512_set1_epi32(1);
    const __m512i twos = _mm512_set1_epi32(2);
    const __m512i sign_bit = _mm512_set1_epi32(0x80000000); // float sign bit

    for (; number < sixteenPoints; number++) {
        __m512 x = _mm512_load_ps(inPtr);

        // Quadrant index: n = round-to-nearest(x * 2/pi).
        __m512 n_f = _mm512_roundscale_ps(_mm512_mul_ps(x, two_over_pi),
                                          _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
        __m512i n = _mm512_cvtps_epi32(n_f);

        // r = x - n*pi/2 via fused negate-multiply-add: hi part, then lo correction.
        __m512 r = _mm512_fnmadd_ps(n_f, pi_over_2_hi, x);
        r = _mm512_fnmadd_ps(n_f, pi_over_2_lo, r);

        // NOTE(review): sin_r / cos_r below are computed on lines elided from
        // this chunk (presumably polynomial approximations of sin(r)/cos(r)).
        __m512i n_and_1 = _mm512_and_si512(n, ones);
        __m512i n_and_2 = _mm512_and_si512(n, twos);

        // Odd quadrants (n & 1): use the cosine result instead of the sine result.
        __mmask16 swap_mask = _mm512_cmpeq_epi32_mask(n_and_1, ones);
        __m512 result = _mm512_mask_blend_ps(swap_mask, sin_r, cos_r);

        // Quadrants where (n & 2) is set: flip the sign via masked sign-bit XOR.
        __mmask16 neg_mask = _mm512_cmpeq_epi32_mask(n_and_2, twos);
        result = _mm512_castsi512_ps(_mm512_mask_xor_epi32(_mm512_castps_si512(result),
                                                           _mm512_castps_si512(result),

        _mm512_store_ps(sinPtr, result);

    // Scalar tail for the remaining num_points % 16 elements.
    number = sixteenPoints * 16;
    for (; number < num_points; number++) {
        *sinPtr++ = sinf(*inPtr++);
148#if LV_HAVE_AVX2 && LV_HAVE_FMA
149#include <immintrin.h>
// Element-wise sin(x): aligned AVX2+FMA path, 8 floats per iteration with a
// scalar sinf() tail.
// NOTE(review): the return-type line, opening brace, sin_r/cos_r polynomial
// evaluations, pointer increments, and closing braces are elided in this chunk.
volk_32f_sin_32f_a_avx2_fma(
    float* bVector,
    const float* aVector,
    unsigned int num_points)
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    unsigned int eighthPoints = num_points / 8;

    // 2/pi and the hi/lo split of pi/2 used for two-step range reduction.
    const __m256 two_over_pi = _mm256_set1_ps(0x1.45f306p-1f);
    const __m256 pi_over_2_hi = _mm256_set1_ps(0x1.921fb6p+0f);
    const __m256 pi_over_2_lo = _mm256_set1_ps(-0x1.777a5cp-25f);

    const __m256i ones = _mm256_set1_epi32(1);
    const __m256i twos = _mm256_set1_epi32(2);
    const __m256 sign_bit = _mm256_set1_ps(-0.0f); // only the sign bit set

    for (; number < eighthPoints; number++) {
        __m256 x = _mm256_load_ps(aPtr);

        // Quadrant index: n = round-to-nearest(x * 2/pi).
        __m256 n_f = _mm256_round_ps(_mm256_mul_ps(x, two_over_pi),
                                     _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
        __m256i n = _mm256_cvtps_epi32(n_f);

        // r = x - n*pi/2 in two fused negate-multiply-add steps (hi, then lo).
        __m256 r = _mm256_fnmadd_ps(n_f, pi_over_2_hi, x);
        r = _mm256_fnmadd_ps(n_f, pi_over_2_lo, r);

        // NOTE(review): sin_r / cos_r are produced by lines elided from this chunk.
        __m256i n_and_1 = _mm256_and_si256(n, ones);
        __m256i n_and_2 = _mm256_and_si256(n, twos);

        // Odd quadrant: select the cosine result.
        __m256 swap_mask = _mm256_castsi256_ps(_mm256_cmpeq_epi32(n_and_1, ones));
        __m256 result = _mm256_blendv_ps(sin_r, cos_r, swap_mask);

        // Quadrants with (n & 2) set: negate by XOR-ing the sign bit.
        __m256 neg_mask = _mm256_castsi256_ps(_mm256_cmpeq_epi32(n_and_2, twos));
        result = _mm256_xor_ps(result, _mm256_and_ps(neg_mask, sign_bit));

        _mm256_store_ps(bPtr, result);

    // Scalar tail for the remaining elements.
    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        *bPtr++ = sinf(*aPtr++);
215#include <immintrin.h>
// Element-wise sin(x): aligned AVX2 (no FMA) path, 8 floats per iteration.
// Same algorithm as the FMA variant but using separate multiply and subtract.
// NOTE(review): the return-type line, opening brace, sin_r/cos_r polynomial
// evaluations, pointer increments, and closing braces are elided in this chunk.
volk_32f_sin_32f_a_avx2(
    float* bVector,
    const float* aVector,
    unsigned int num_points)
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    unsigned int eighthPoints = num_points / 8;

    // 2/pi and the hi/lo split of pi/2 used for two-step range reduction.
    const __m256 two_over_pi = _mm256_set1_ps(0x1.45f306p-1f);
    const __m256 pi_over_2_hi = _mm256_set1_ps(0x1.921fb6p+0f);
    const __m256 pi_over_2_lo = _mm256_set1_ps(-0x1.777a5cp-25f);

    const __m256i ones = _mm256_set1_epi32(1);
    const __m256i twos = _mm256_set1_epi32(2);
    const __m256 sign_bit = _mm256_set1_ps(-0.0f); // only the sign bit set

    for (; number < eighthPoints; number++) {
        __m256 x = _mm256_load_ps(aPtr);

        // Quadrant index: n = round-to-nearest(x * 2/pi).
        __m256 n_f = _mm256_round_ps(_mm256_mul_ps(x, two_over_pi),
                                     _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
        __m256i n = _mm256_cvtps_epi32(n_f);

        // r = x - n*pi/2 done as mul+sub twice (hi part, then lo correction).
        __m256 r = _mm256_sub_ps(x, _mm256_mul_ps(n_f, pi_over_2_hi));
        r = _mm256_sub_ps(r, _mm256_mul_ps(n_f, pi_over_2_lo));

        // NOTE(review): sin_r / cos_r are produced by lines elided from this chunk.
        __m256i n_and_1 = _mm256_and_si256(n, ones);
        __m256i n_and_2 = _mm256_and_si256(n, twos);

        // Odd quadrant: select the cosine result.
        __m256 swap_mask = _mm256_castsi256_ps(_mm256_cmpeq_epi32(n_and_1, ones));
        __m256 result = _mm256_blendv_ps(sin_r, cos_r, swap_mask);

        // Quadrants with (n & 2) set: negate by XOR-ing the sign bit.
        __m256 neg_mask = _mm256_castsi256_ps(_mm256_cmpeq_epi32(n_and_2, twos));
        result = _mm256_xor_ps(result, _mm256_and_ps(neg_mask, sign_bit));

        _mm256_store_ps(bPtr, result);

    // Scalar tail for the remaining elements.
    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        *bPtr++ = sinf(*aPtr++);
281#include <smmintrin.h>
// Element-wise sin(x): aligned SSE4.1 path, 4 floats per iteration.
// NOTE(review): the return-type line, opening brace, sin_r/cos_r polynomial
// evaluations, pointer increments, and closing braces are elided in this chunk.
volk_32f_sin_32f_a_sse4_1(
    float* bVector,
    const float* aVector,
    unsigned int num_points)
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    unsigned int quarterPoints = num_points / 4;

    // 2/pi and the hi/lo split of pi/2 used for two-step range reduction.
    const __m128 two_over_pi = _mm_set1_ps(0x1.45f306p-1f);
    const __m128 pi_over_2_hi = _mm_set1_ps(0x1.921fb6p+0f);
    const __m128 pi_over_2_lo = _mm_set1_ps(-0x1.777a5cp-25f);

    const __m128i ones = _mm_set1_epi32(1);
    const __m128i twos = _mm_set1_epi32(2);
    const __m128 sign_bit = _mm_set1_ps(-0.0f); // only the sign bit set

    for (; number < quarterPoints; number++) {
        __m128 x = _mm_load_ps(aPtr);

        // Quadrant index: n = round-to-nearest(x * 2/pi).
        __m128 n_f = _mm_round_ps(_mm_mul_ps(x, two_over_pi),
                                  _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
        __m128i n = _mm_cvtps_epi32(n_f);

        // r = x - n*pi/2 done as mul+sub twice (hi part, then lo correction).
        __m128 r = _mm_sub_ps(x, _mm_mul_ps(n_f, pi_over_2_hi));
        r = _mm_sub_ps(r, _mm_mul_ps(n_f, pi_over_2_lo));

        // NOTE(review): sin_r / cos_r are produced by lines elided from this chunk.
        __m128i n_and_1 = _mm_and_si128(n, ones);
        __m128i n_and_2 = _mm_and_si128(n, twos);

        // Odd quadrant: select the cosine result.
        __m128 swap_mask = _mm_castsi128_ps(_mm_cmpeq_epi32(n_and_1, ones));
        __m128 result = _mm_blendv_ps(sin_r, cos_r, swap_mask);

        // Quadrants with (n & 2) set: negate by XOR-ing the sign bit.
        __m128 neg_mask = _mm_castsi128_ps(_mm_cmpeq_epi32(n_and_2, twos));
        result = _mm_xor_ps(result, _mm_and_ps(neg_mask, sign_bit));

        _mm_store_ps(bPtr, result);

    // Scalar tail for the remaining elements.
    number = quarterPoints * 4;
    for (; number < num_points; number++) {
        *bPtr++ = sinf(*aPtr++);
349#ifndef INCLUDED_volk_32f_sin_32f_u_H
350#define INCLUDED_volk_32f_sin_32f_u_H
352#ifdef LV_HAVE_AVX512F
353#include <immintrin.h>
// Element-wise sin(x): unaligned AVX-512F path — identical algorithm to the
// aligned variant but using loadu/storeu.
// NOTE(review): opening brace, sin_r/cos_r polynomial evaluations, pointer
// increments, and closing braces are elided in this chunk.
static inline void volk_32f_sin_32f_u_avx512f(
    float* sinVector,
    const float* inVector,
    unsigned int num_points)
    float* sinPtr = sinVector;
    const float* inPtr = inVector;

    unsigned int number = 0;
    unsigned int sixteenPoints = num_points / 16;

    // 2/pi and the hi/lo split of pi/2 used for two-step range reduction.
    const __m512 two_over_pi = _mm512_set1_ps(0x1.45f306p-1f);
    const __m512 pi_over_2_hi = _mm512_set1_ps(0x1.921fb6p+0f);
    const __m512 pi_over_2_lo = _mm512_set1_ps(-0x1.777a5cp-25f);

    const __m512i ones = _mm512_set1_epi32(1);
    const __m512i twos = _mm512_set1_epi32(2);
    const __m512i sign_bit = _mm512_set1_epi32(0x80000000); // float sign bit

    for (; number < sixteenPoints; number++) {
        __m512 x = _mm512_loadu_ps(inPtr); // unaligned load

        // Quadrant index: n = round-to-nearest(x * 2/pi).
        __m512 n_f = _mm512_roundscale_ps(_mm512_mul_ps(x, two_over_pi),
                                          _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
        __m512i n = _mm512_cvtps_epi32(n_f);

        // r = x - n*pi/2 via fused negate-multiply-add: hi part, then lo correction.
        __m512 r = _mm512_fnmadd_ps(n_f, pi_over_2_hi, x);
        r = _mm512_fnmadd_ps(n_f, pi_over_2_lo, r);

        // NOTE(review): sin_r / cos_r are produced by lines elided from this chunk.
        __m512i n_and_1 = _mm512_and_si512(n, ones);
        __m512i n_and_2 = _mm512_and_si512(n, twos);

        // Odd quadrant: use the cosine result instead of the sine result.
        __mmask16 swap_mask = _mm512_cmpeq_epi32_mask(n_and_1, ones);
        __m512 result = _mm512_mask_blend_ps(swap_mask, sin_r, cos_r);

        // Quadrants with (n & 2) set: flip the sign via masked sign-bit XOR.
        __mmask16 neg_mask = _mm512_cmpeq_epi32_mask(n_and_2, twos);
        result = _mm512_castsi512_ps(_mm512_mask_xor_epi32(_mm512_castps_si512(result),
                                                           _mm512_castps_si512(result),

        _mm512_storeu_ps(sinPtr, result); // unaligned store

    // Scalar tail for the remaining num_points % 16 elements.
    number = sixteenPoints * 16;
    for (; number < num_points; number++) {
        *sinPtr++ = sinf(*inPtr++);
421#if LV_HAVE_AVX2 && LV_HAVE_FMA
422#include <immintrin.h>
// Element-wise sin(x): unaligned AVX2+FMA path (loadu/storeu variant of the
// aligned kernel).
// NOTE(review): the return-type line, opening brace, sin_r/cos_r polynomial
// evaluations, pointer increments, and closing braces are elided in this chunk.
volk_32f_sin_32f_u_avx2_fma(
    float* bVector,
    const float* aVector,
    unsigned int num_points)
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    unsigned int eighthPoints = num_points / 8;

    // 2/pi and the hi/lo split of pi/2 used for two-step range reduction.
    const __m256 two_over_pi = _mm256_set1_ps(0x1.45f306p-1f);
    const __m256 pi_over_2_hi = _mm256_set1_ps(0x1.921fb6p+0f);
    const __m256 pi_over_2_lo = _mm256_set1_ps(-0x1.777a5cp-25f);

    const __m256i ones = _mm256_set1_epi32(1);
    const __m256i twos = _mm256_set1_epi32(2);
    const __m256 sign_bit = _mm256_set1_ps(-0.0f); // only the sign bit set

    for (; number < eighthPoints; number++) {
        __m256 x = _mm256_loadu_ps(aPtr); // unaligned load

        // Quadrant index: n = round-to-nearest(x * 2/pi).
        __m256 n_f = _mm256_round_ps(_mm256_mul_ps(x, two_over_pi),
                                     _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
        __m256i n = _mm256_cvtps_epi32(n_f);

        // r = x - n*pi/2 in two fused negate-multiply-add steps (hi, then lo).
        __m256 r = _mm256_fnmadd_ps(n_f, pi_over_2_hi, x);
        r = _mm256_fnmadd_ps(n_f, pi_over_2_lo, r);

        // NOTE(review): sin_r / cos_r are produced by lines elided from this chunk.
        __m256i n_and_1 = _mm256_and_si256(n, ones);
        __m256i n_and_2 = _mm256_and_si256(n, twos);

        // Odd quadrant: select the cosine result.
        __m256 swap_mask = _mm256_castsi256_ps(_mm256_cmpeq_epi32(n_and_1, ones));
        __m256 result = _mm256_blendv_ps(sin_r, cos_r, swap_mask);

        // Quadrants with (n & 2) set: negate by XOR-ing the sign bit.
        __m256 neg_mask = _mm256_castsi256_ps(_mm256_cmpeq_epi32(n_and_2, twos));
        result = _mm256_xor_ps(result, _mm256_and_ps(neg_mask, sign_bit));

        _mm256_storeu_ps(bPtr, result); // unaligned store

    // Scalar tail for the remaining elements.
    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        *bPtr++ = sinf(*aPtr++);
488#include <immintrin.h>
// Element-wise sin(x): unaligned AVX2 (no FMA) path — mul+sub reduction,
// loadu/storeu.
// NOTE(review): the return-type line, opening brace, sin_r/cos_r polynomial
// evaluations, pointer increments, and closing braces are elided in this chunk.
volk_32f_sin_32f_u_avx2(
    float* bVector,
    const float* aVector,
    unsigned int num_points)
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    unsigned int eighthPoints = num_points / 8;

    // 2/pi and the hi/lo split of pi/2 used for two-step range reduction.
    const __m256 two_over_pi = _mm256_set1_ps(0x1.45f306p-1f);
    const __m256 pi_over_2_hi = _mm256_set1_ps(0x1.921fb6p+0f);
    const __m256 pi_over_2_lo = _mm256_set1_ps(-0x1.777a5cp-25f);

    const __m256i ones = _mm256_set1_epi32(1);
    const __m256i twos = _mm256_set1_epi32(2);
    const __m256 sign_bit = _mm256_set1_ps(-0.0f); // only the sign bit set

    for (; number < eighthPoints; number++) {
        __m256 x = _mm256_loadu_ps(aPtr); // unaligned load

        // Quadrant index: n = round-to-nearest(x * 2/pi).
        __m256 n_f = _mm256_round_ps(_mm256_mul_ps(x, two_over_pi),
                                     _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
        __m256i n = _mm256_cvtps_epi32(n_f);

        // r = x - n*pi/2 done as mul+sub twice (hi part, then lo correction).
        __m256 r = _mm256_sub_ps(x, _mm256_mul_ps(n_f, pi_over_2_hi));
        r = _mm256_sub_ps(r, _mm256_mul_ps(n_f, pi_over_2_lo));

        // NOTE(review): sin_r / cos_r are produced by lines elided from this chunk.
        __m256i n_and_1 = _mm256_and_si256(n, ones);
        __m256i n_and_2 = _mm256_and_si256(n, twos);

        // Odd quadrant: select the cosine result.
        __m256 swap_mask = _mm256_castsi256_ps(_mm256_cmpeq_epi32(n_and_1, ones));
        __m256 result = _mm256_blendv_ps(sin_r, cos_r, swap_mask);

        // Quadrants with (n & 2) set: negate by XOR-ing the sign bit.
        __m256 neg_mask = _mm256_castsi256_ps(_mm256_cmpeq_epi32(n_and_2, twos));
        result = _mm256_xor_ps(result, _mm256_and_ps(neg_mask, sign_bit));

        _mm256_storeu_ps(bPtr, result); // unaligned store

    // Scalar tail for the remaining elements.
    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        *bPtr++ = sinf(*aPtr++);
555#include <smmintrin.h>
// Element-wise sin(x): unaligned SSE4.1 path, 4 floats per iteration
// (loadu/storeu variant of the aligned kernel).
// NOTE(review): the return-type line, opening brace, sin_r/cos_r polynomial
// evaluations, pointer increments, and closing braces are elided in this chunk.
volk_32f_sin_32f_u_sse4_1(
    float* bVector,
    const float* aVector,
    unsigned int num_points)
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    unsigned int quarterPoints = num_points / 4;

    // 2/pi and the hi/lo split of pi/2 used for two-step range reduction.
    const __m128 two_over_pi = _mm_set1_ps(0x1.45f306p-1f);
    const __m128 pi_over_2_hi = _mm_set1_ps(0x1.921fb6p+0f);
    const __m128 pi_over_2_lo = _mm_set1_ps(-0x1.777a5cp-25f);

    const __m128i ones = _mm_set1_epi32(1);
    const __m128i twos = _mm_set1_epi32(2);
    const __m128 sign_bit = _mm_set1_ps(-0.0f); // only the sign bit set

    for (; number < quarterPoints; number++) {
        __m128 x = _mm_loadu_ps(aPtr); // unaligned load

        // Quadrant index: n = round-to-nearest(x * 2/pi).
        __m128 n_f = _mm_round_ps(_mm_mul_ps(x, two_over_pi),
                                  _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
        __m128i n = _mm_cvtps_epi32(n_f);

        // r = x - n*pi/2 done as mul+sub twice (hi part, then lo correction).
        __m128 r = _mm_sub_ps(x, _mm_mul_ps(n_f, pi_over_2_hi));
        r = _mm_sub_ps(r, _mm_mul_ps(n_f, pi_over_2_lo));

        // NOTE(review): sin_r / cos_r are produced by lines elided from this chunk.
        __m128i n_and_1 = _mm_and_si128(n, ones);
        __m128i n_and_2 = _mm_and_si128(n, twos);

        // Odd quadrant: select the cosine result.
        __m128 swap_mask = _mm_castsi128_ps(_mm_cmpeq_epi32(n_and_1, ones));
        __m128 result = _mm_blendv_ps(sin_r, cos_r, swap_mask);

        // Quadrants with (n & 2) set: negate by XOR-ing the sign bit.
        __m128 neg_mask = _mm_castsi128_ps(_mm_cmpeq_epi32(n_and_2, twos));
        result = _mm_xor_ps(result, _mm_and_ps(neg_mask, sign_bit));

        _mm_storeu_ps(bPtr, result); // unaligned store

    // Scalar tail for the remaining elements.
    number = quarterPoints * 4;
    for (; number < num_points; number++) {
        *bPtr++ = sinf(*aPtr++);
621#ifdef LV_HAVE_GENERIC
    // Portable scalar fallback: one libm sinf() call per element.
    // NOTE(review): the function signature line is elided from this chunk;
    // presumably this is the body of the LV_HAVE_GENERIC kernel.
    float* bPtr = bVector;
    const float* aPtr = aVector;
    unsigned int number = 0;

    for (number = 0; number < num_points; number++) {
        *bPtr++ = sinf(*aPtr++);
    // NOTE(review): the function signature is elided from this chunk;
    // presumably this is the body of the ARMv7 NEON sine kernel
    // (4 floats per iteration, scalar sinf() tail).
    // 2/pi and the hi/lo split of pi/2 used for two-step range reduction.
    const float32x4_t two_over_pi = vdupq_n_f32(0x1.45f306p-1f);
    const float32x4_t pi_over_2_hi = vdupq_n_f32(0x1.921fb6p+0f);
    const float32x4_t pi_over_2_lo = vdupq_n_f32(-0x1.777a5cp-25f);

    const int32x4_t ones = vdupq_n_s32(1);
    const int32x4_t twos = vdupq_n_s32(2);
    const float32x4_t sign_bit = vdupq_n_f32(-0.0f); // only the sign bit set
    const float32x4_t half = vdupq_n_f32(0.5f);
    const float32x4_t neg_half = vdupq_n_f32(-0.5f);
    const float32x4_t fzeroes = vdupq_n_f32(0.0f);

    unsigned int number = 0;
    const unsigned int quarterPoints = num_points / 4;

    for (; number < quarterPoints; number++) {
        float32x4_t x = vld1q_f32(aVector);

        // Quadrant index n: emulate round-to-nearest by adding +/-0.5 (sign of
        // the scaled value) and truncating with vcvt — i.e. rounds half away
        // from zero. The v8 path uses vrndn instead.
        float32x4_t scaled = vmulq_f32(x, two_over_pi);
        uint32x4_t is_neg = vcltq_f32(scaled, fzeroes);
        float32x4_t adj = vbslq_f32(is_neg, neg_half, half);
        float32x4_t n_f = vcvtq_f32_s32(vcvtq_s32_f32(vaddq_f32(scaled, adj)));
        int32x4_t n = vcvtq_s32_f32(n_f);

        // r = x - n*pi/2 via multiply-subtract (vmls): hi part, then lo correction.
        float32x4_t r = vmlsq_f32(x, n_f, pi_over_2_hi);
        r = vmlsq_f32(r, n_f, pi_over_2_lo);

        // NOTE(review): sin_r / cos_r are produced by lines elided from this chunk.
        int32x4_t n_and_1 = vandq_s32(n, ones);
        int32x4_t n_and_2 = vandq_s32(n, twos);

        // Odd quadrant: select the cosine result.
        uint32x4_t swap_mask = vceqq_s32(n_and_1, ones);
        float32x4_t result = vbslq_f32(swap_mask, cos_r, sin_r);

        // Quadrants with (n & 2) set: flip the sign bit with XOR.
        uint32x4_t neg_mask = vceqq_s32(n_and_2, twos);
        result = vreinterpretq_f32_u32(
            veorq_u32(vreinterpretq_u32_f32(result),
                      vandq_u32(neg_mask, vreinterpretq_u32_f32(sign_bit))));

        vst1q_f32(bVector, result);

    // Scalar tail for the remaining elements.
    for (number = quarterPoints * 4; number < num_points; number++) {
        *bVector++ = sinf(*aVector++);
// Element-wise sin(x): AArch64 NEON path using vrndn (round-to-nearest) and
// fused multiply-subtract; 4 floats per iteration, scalar sinf() tail.
// NOTE(review): the return-type line, opening brace, pointer increments, and
// closing braces are elided in this chunk.
volk_32f_sin_32f_neonv8(
    float* bVector,
    const float* aVector,
    unsigned int num_points)
    // 2/pi and the hi/lo split of pi/2 used for two-step range reduction.
    const float32x4_t two_over_pi = vdupq_n_f32(0x1.45f306p-1f);
    const float32x4_t pi_over_2_hi = vdupq_n_f32(0x1.921fb6p+0f);
    const float32x4_t pi_over_2_lo = vdupq_n_f32(-0x1.777a5cp-25f);

    const int32x4_t ones = vdupq_n_s32(1);
    const int32x4_t twos = vdupq_n_s32(2);
    const float32x4_t sign_bit = vdupq_n_f32(-0.0f); // only the sign bit set

    unsigned int number = 0;
    const unsigned int quarterPoints = num_points / 4;

    for (; number < quarterPoints; number++) {
        float32x4_t x = vld1q_f32(aVector);

        // Quadrant index: n = round-to-nearest(x * 2/pi).
        float32x4_t n_f = vrndnq_f32(vmulq_f32(x, two_over_pi));
        int32x4_t n = vcvtq_s32_f32(n_f);

        // r = x - n*pi/2 via fused multiply-subtract: hi part, then lo correction.
        float32x4_t r = vfmsq_f32(x, n_f, pi_over_2_hi);
        r = vfmsq_f32(r, n_f, pi_over_2_lo);

        // sin/cos of the reduced argument via helpers defined elsewhere in the file.
        float32x4_t sin_r = _vsin_poly_neonv8(r);
        float32x4_t cos_r = _vcos_poly_neonv8(r);

        int32x4_t n_and_1 = vandq_s32(n, ones);
        int32x4_t n_and_2 = vandq_s32(n, twos);

        // Odd quadrant: select the cosine result.
        uint32x4_t swap_mask = vceqq_s32(n_and_1, ones);
        float32x4_t result = vbslq_f32(swap_mask, cos_r, sin_r);

        // Quadrants with (n & 2) set: flip the sign bit with XOR.
        uint32x4_t neg_mask = vceqq_s32(n_and_2, twos);
        result = vreinterpretq_f32_u32(
            veorq_u32(vreinterpretq_u32_f32(result),
                      vandq_u32(neg_mask, vreinterpretq_u32_f32(sign_bit))));

        vst1q_f32(bVector, result);

    // Scalar tail for the remaining elements.
    for (number = quarterPoints * 4; number < num_points; number++) {
        *bVector++ = sinf(*aVector++);
765#include <riscv_vector.h>
768volk_32f_sin_32f_rvv(
float* bVector,
const float* aVector,
unsigned int num_points)
770 size_t vlmax = __riscv_vsetvlmax_e32m2();
772 const vfloat32m2_t c4oPi = __riscv_vfmv_v_f_f32m2(1.2732395f, vlmax);
773 const vfloat32m2_t cPio4a = __riscv_vfmv_v_f_f32m2(0.7853982f, vlmax);
774 const vfloat32m2_t cPio4b = __riscv_vfmv_v_f_f32m2(7.946627e-09f, vlmax);
775 const vfloat32m2_t cPio4c = __riscv_vfmv_v_f_f32m2(3.061617e-17f, vlmax);
777 const vfloat32m2_t cf1 = __riscv_vfmv_v_f_f32m2(1.0f, vlmax);
778 const vfloat32m2_t cf4 = __riscv_vfmv_v_f_f32m2(4.0f, vlmax);
780 const vfloat32m2_t c2 = __riscv_vfmv_v_f_f32m2(0.0833333333f, vlmax);
781 const vfloat32m2_t c3 = __riscv_vfmv_v_f_f32m2(0.0027777778f, vlmax);
782 const vfloat32m2_t c4 = __riscv_vfmv_v_f_f32m2(4.9603175e-05, vlmax);
783 const vfloat32m2_t c5 = __riscv_vfmv_v_f_f32m2(5.5114638e-07, vlmax);
785 size_t n = num_points;
786 for (
size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl) {
787 vl = __riscv_vsetvl_e32m2(n);
788 vfloat32m2_t v = __riscv_vle32_v_f32m2(aVector, vl);
789 vfloat32m2_t s = __riscv_vfabs(v, vl);
790 vint32m2_t q = __riscv_vfcvt_x(__riscv_vfmul(s, c4oPi, vl), vl);
791 vfloat32m2_t r = __riscv_vfcvt_f(__riscv_vadd(q, __riscv_vand(q, 1, vl), vl), vl);
793 s = __riscv_vfnmsac(s, cPio4a, r, vl);
794 s = __riscv_vfnmsac(s, cPio4b, r, vl);
795 s = __riscv_vfnmsac(s, cPio4c, r, vl);
797 s = __riscv_vfmul(s, 1 / 8.0f, vl);
798 s = __riscv_vfmul(s, s, vl);
800 s = __riscv_vfmsub(s, c5, c4, vl);
801 s = __riscv_vfmadd(s, t, c3, vl);
802 s = __riscv_vfmsub(s, t, c2, vl);
803 s = __riscv_vfmadd(s, t, cf1, vl);
804 s = __riscv_vfmul(s, t, vl);
805 s = __riscv_vfmul(s, __riscv_vfsub(cf4, s, vl), vl);
806 s = __riscv_vfmul(s, __riscv_vfsub(cf4, s, vl), vl);
807 s = __riscv_vfmul(s, __riscv_vfsub(cf4, s, vl), vl);
808 s = __riscv_vfmul(s, 1 / 2.0f, vl);
811 __riscv_vfsqrt(__riscv_vfmul(__riscv_vfrsub(s, 2.0f, vl), s, vl), vl);
812 vfloat32m2_t cosine = __riscv_vfsub(cf1, s, vl);
814 vbool16_t m1 = __riscv_vmsne(__riscv_vand(__riscv_vadd(q, 1, vl), 2, vl), 0, vl);
815 vbool16_t m2 = __riscv_vmxor(__riscv_vmslt(__riscv_vreinterpret_i32m2(v), 0, vl),
816 __riscv_vmsne(__riscv_vand(q, 4, vl), 0, vl),
819 sine = __riscv_vmerge(sine, cosine, m1, vl);
820 sine = __riscv_vfneg_mu(m2, sine, sine, vl);
822 __riscv_vse32(bVector, sine, vl);