63#ifndef INCLUDED_volk_32f_cos_32f_a_H
64#define INCLUDED_volk_32f_cos_32f_a_H
// Scalar reference implementation fragments: compute cosf() of each input
// element one at a time.
// NOTE(review): the surrounding function signatures/braces are not visible
// in this chunk -- presumably volk_32f_cos_32f_generic and a sibling
// variant; confirm against the full file.
71 float* bPtr = bVector;
72 const float* aPtr = aVector;
73 unsigned int number = 0;
75 for (; number < num_points; number++) {
76 *bPtr++ = cosf(*aPtr++);
// Second loop fragment; appears to belong to a different (also generic)
// variant whose header lines were dropped by the extraction.
88 for (
unsigned int number = 0; number < num_points; number++) {
// Cosine of each element using AVX-512F intrinsics with aligned loads/stores.
// NOTE(review): several original lines (opening brace, the polynomial
// evaluation producing sin_r/cos_r, closing braces) are missing from this
// chunk; code below is annotated as-is.
98static inline void volk_32f_cos_32f_a_avx512f(
float* cosVector,
99 const float* inVector,
100 unsigned int num_points)
102 float* cosPtr = cosVector;
103 const float* inPtr = inVector;
105 unsigned int number = 0;
// Number of full 16-float vectors to process with SIMD.
106 unsigned int sixteenPoints = num_points / 16;
// 2/pi plus a hi/lo split of pi/2 for two-step (Cody-Waite style)
// argument reduction in single precision.
110 const __m512 two_over_pi = _mm512_set1_ps(0x1.45f306p-1f);
111 const __m512 pi_over_2_hi = _mm512_set1_ps(0x1.921fb6p+0f);
112 const __m512 pi_over_2_lo = _mm512_set1_ps(-0x1.777a5cp-25f);
114 const __m512i ones = _mm512_set1_epi32(1);
115 const __m512i twos = _mm512_set1_epi32(2);
// IEEE-754 float sign-bit mask, used to negate lanes via XOR.
116 const __m512i sign_bit = _mm512_set1_epi32(0x80000000);
118 for (; number < sixteenPoints; number++) {
119 __m512 x = _mm512_load_ps(inPtr);
// n = round-to-nearest(x * 2/pi): index of the nearest multiple of pi/2.
122 __m512 n_f = _mm512_roundscale_ps(_mm512_mul_ps(x, two_over_pi),
123 _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
124 __m512i n = _mm512_cvtps_epi32(n_f);
// r = x - n*pi/2, subtracted in two steps (hi then lo part) so the
// reduced argument keeps extra precision.
127 __m512 r = _mm512_fnmadd_ps(n_f, pi_over_2_hi, x);
128 r = _mm512_fnmadd_ps(n_f, pi_over_2_lo, r);
// Quadrant decoding: cos(x) = cos(r), -sin(r), -cos(r), sin(r)
// for n mod 4 = 0, 1, 2, 3 respectively.
137 __m512i n_and_1 = _mm512_and_si512(n, ones);
138 __m512i n_plus_1_and_2 = _mm512_and_si512(_mm512_add_epi32(n, ones), twos);
// Odd n selects sin(r) instead of cos(r).
// NOTE(review): the sin_r/cos_r polynomial evaluation lines are not
// visible in this chunk -- confirm against the full file.
141 __mmask16 swap_mask = _mm512_cmpeq_epi32_mask(n_and_1, ones);
142 __m512 result = _mm512_mask_blend_ps(swap_mask, cos_r, sin_r);
// ((n+1) & 2) != 0 (i.e. n mod 4 is 1 or 2) means the result is negated:
// flip the sign bit only on those lanes.
146 __mmask16 neg_mask = _mm512_cmpeq_epi32_mask(n_plus_1_and_2, twos);
147 result = _mm512_castsi512_ps(_mm512_mask_xor_epi32(_mm512_castps_si512(result),
149 _mm512_castps_si512(result),
152 _mm512_store_ps(cosPtr, result);
// Scalar tail loop for the remaining (num_points % 16) elements.
157 number = sixteenPoints * 16;
158 for (; number < num_points; number++) {
159 *cosPtr++ = cosf(*inPtr++);
164#if LV_HAVE_AVX2 && LV_HAVE_FMA
165#include <immintrin.h>
// Cosine of each element using AVX2+FMA intrinsics with aligned
// loads/stores.
// NOTE(review): several original lines (qualifiers, braces, the polynomial
// evaluation producing sin_r/cos_r) are missing from this chunk.
169volk_32f_cos_32f_a_avx2_fma(
float* bVector,
const float* aVector,
unsigned int num_points)
171 float* bPtr = bVector;
172 const float* aPtr = aVector;
174 unsigned int number = 0;
// Number of full 8-float vectors to process with SIMD.
175 unsigned int eighthPoints = num_points / 8;
// 2/pi plus a hi/lo split of pi/2 for two-step argument reduction.
179 const __m256 two_over_pi = _mm256_set1_ps(0x1.45f306p-1f);
180 const __m256 pi_over_2_hi = _mm256_set1_ps(0x1.921fb6p+0f);
181 const __m256 pi_over_2_lo = _mm256_set1_ps(-0x1.777a5cp-25f);
183 const __m256i ones = _mm256_set1_epi32(1);
184 const __m256i twos = _mm256_set1_epi32(2);
// -0.0f carries only the sign bit; used to negate lanes via XOR.
185 const __m256 sign_bit = _mm256_set1_ps(-0.0f);
187 for (; number < eighthPoints; number++) {
188 __m256 x = _mm256_load_ps(aPtr);
// n = round-to-nearest(x * 2/pi): nearest multiple of pi/2.
191 __m256 n_f = _mm256_round_ps(_mm256_mul_ps(x, two_over_pi),
192 _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
193 __m256i n = _mm256_cvtps_epi32(n_f);
// r = x - n*pi/2 via fused negative-multiply-add, hi part then lo part.
196 __m256 r = _mm256_fnmadd_ps(n_f, pi_over_2_hi, x);
197 r = _mm256_fnmadd_ps(n_f, pi_over_2_lo, r);
// Quadrant decoding: cos(x) = cos(r), -sin(r), -cos(r), sin(r)
// for n mod 4 = 0..3.
206 __m256i n_and_1 = _mm256_and_si256(n, ones);
207 __m256i n_plus_1_and_2 = _mm256_and_si256(_mm256_add_epi32(n, ones), twos);
// Odd n selects sin(r).  NOTE(review): sin_r/cos_r producers are not
// visible in this chunk -- confirm against the full file.
210 __m256 swap_mask = _mm256_castsi256_ps(_mm256_cmpeq_epi32(n_and_1, ones));
211 __m256 result = _mm256_blendv_ps(cos_r, sin_r, swap_mask);
// Negate lanes where (n+1) & 2 is set (n mod 4 is 1 or 2).
214 __m256 neg_mask = _mm256_castsi256_ps(_mm256_cmpeq_epi32(n_plus_1_and_2, twos));
215 result = _mm256_xor_ps(result, _mm256_and_ps(neg_mask, sign_bit));
217 _mm256_store_ps(bPtr, result);
// Scalar tail for the remaining (num_points % 8) elements.
222 number = eighthPoints * 8;
223 for (; number < num_points; number++) {
224 *bPtr++ = cosf(*aPtr++);
231#include <immintrin.h>
// Cosine of each element using AVX2 (no FMA) with aligned loads/stores.
// Identical algorithm to the FMA variant but the reduction uses separate
// multiply and subtract.
// NOTE(review): several original lines (qualifiers, braces, the polynomial
// evaluation producing sin_r/cos_r) are missing from this chunk.
235volk_32f_cos_32f_a_avx2(
float* bVector,
const float* aVector,
unsigned int num_points)
237 float* bPtr = bVector;
238 const float* aPtr = aVector;
240 unsigned int number = 0;
// Number of full 8-float vectors to process with SIMD.
241 unsigned int eighthPoints = num_points / 8;
// 2/pi plus a hi/lo split of pi/2 for two-step argument reduction.
245 const __m256 two_over_pi = _mm256_set1_ps(0x1.45f306p-1f);
246 const __m256 pi_over_2_hi = _mm256_set1_ps(0x1.921fb6p+0f);
247 const __m256 pi_over_2_lo = _mm256_set1_ps(-0x1.777a5cp-25f);
249 const __m256i ones = _mm256_set1_epi32(1);
250 const __m256i twos = _mm256_set1_epi32(2);
// -0.0f carries only the sign bit; used to negate lanes via XOR.
251 const __m256 sign_bit = _mm256_set1_ps(-0.0f);
253 for (; number < eighthPoints; number++) {
254 __m256 x = _mm256_load_ps(aPtr);
// n = round-to-nearest(x * 2/pi): nearest multiple of pi/2.
257 __m256 n_f = _mm256_round_ps(_mm256_mul_ps(x, two_over_pi),
258 _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
259 __m256i n = _mm256_cvtps_epi32(n_f);
// r = x - n*pi/2 with unfused mul+sub, hi part then lo part.
262 __m256 r = _mm256_sub_ps(x, _mm256_mul_ps(n_f, pi_over_2_hi));
263 r = _mm256_sub_ps(r, _mm256_mul_ps(n_f, pi_over_2_lo));
// Quadrant decoding: cos(x) = cos(r), -sin(r), -cos(r), sin(r)
// for n mod 4 = 0..3.
272 __m256i n_and_1 = _mm256_and_si256(n, ones);
273 __m256i n_plus_1_and_2 = _mm256_and_si256(_mm256_add_epi32(n, ones), twos);
// Odd n selects sin(r).  NOTE(review): sin_r/cos_r producers are not
// visible in this chunk -- confirm against the full file.
276 __m256 swap_mask = _mm256_castsi256_ps(_mm256_cmpeq_epi32(n_and_1, ones));
277 __m256 result = _mm256_blendv_ps(cos_r, sin_r, swap_mask);
// Negate lanes where (n+1) & 2 is set (n mod 4 is 1 or 2).
280 __m256 neg_mask = _mm256_castsi256_ps(_mm256_cmpeq_epi32(n_plus_1_and_2, twos));
281 result = _mm256_xor_ps(result, _mm256_and_ps(neg_mask, sign_bit));
283 _mm256_store_ps(bPtr, result);
// Scalar tail for the remaining (num_points % 8) elements.
288 number = eighthPoints * 8;
289 for (; number < num_points; number++) {
290 *bPtr++ = cosf(*aPtr++);
297#include <smmintrin.h>
// Cosine of each element using SSE4.1 with aligned loads/stores,
// processing 4 floats per iteration.
// NOTE(review): several original lines (qualifiers, braces, the polynomial
// evaluation producing sin_r/cos_r) are missing from this chunk.
301volk_32f_cos_32f_a_sse4_1(
float* bVector,
const float* aVector,
unsigned int num_points)
303 float* bPtr = bVector;
304 const float* aPtr = aVector;
306 unsigned int number = 0;
// Number of full 4-float vectors to process with SIMD.
307 unsigned int quarterPoints = num_points / 4;
// 2/pi plus a hi/lo split of pi/2 for two-step argument reduction.
311 const __m128 two_over_pi = _mm_set1_ps(0x1.45f306p-1f);
312 const __m128 pi_over_2_hi = _mm_set1_ps(0x1.921fb6p+0f);
313 const __m128 pi_over_2_lo = _mm_set1_ps(-0x1.777a5cp-25f);
315 const __m128i ones = _mm_set1_epi32(1);
316 const __m128i twos = _mm_set1_epi32(2);
// -0.0f carries only the sign bit; used to negate lanes via XOR.
317 const __m128 sign_bit = _mm_set1_ps(-0.0f);
319 for (; number < quarterPoints; number++) {
320 __m128 x = _mm_load_ps(aPtr);
// n = round-to-nearest(x * 2/pi): nearest multiple of pi/2.
323 __m128 n_f = _mm_round_ps(_mm_mul_ps(x, two_over_pi),
324 _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
325 __m128i n = _mm_cvtps_epi32(n_f);
// r = x - n*pi/2 with unfused mul+sub, hi part then lo part.
328 __m128 r = _mm_sub_ps(x, _mm_mul_ps(n_f, pi_over_2_hi));
329 r = _mm_sub_ps(r, _mm_mul_ps(n_f, pi_over_2_lo));
// Quadrant decoding: cos(x) = cos(r), -sin(r), -cos(r), sin(r)
// for n mod 4 = 0..3.
338 __m128i n_and_1 = _mm_and_si128(n, ones);
339 __m128i n_plus_1_and_2 = _mm_and_si128(_mm_add_epi32(n, ones), twos);
// Odd n selects sin(r).  NOTE(review): sin_r/cos_r producers are not
// visible in this chunk -- confirm against the full file.
342 __m128 swap_mask = _mm_castsi128_ps(_mm_cmpeq_epi32(n_and_1, ones));
343 __m128 result = _mm_blendv_ps(cos_r, sin_r, swap_mask);
// Negate lanes where (n+1) & 2 is set (n mod 4 is 1 or 2).
346 __m128 neg_mask = _mm_castsi128_ps(_mm_cmpeq_epi32(n_plus_1_and_2, twos));
347 result = _mm_xor_ps(result, _mm_and_ps(neg_mask, sign_bit));
349 _mm_store_ps(bPtr, result);
// Scalar tail for the remaining (num_points % 4) elements.
354 number = quarterPoints * 4;
355 for (; number < num_points; number++) {
356 *bPtr++ = cosf(*aPtr++);
365#ifndef INCLUDED_volk_32f_cos_32f_u_H
366#define INCLUDED_volk_32f_cos_32f_u_H
368#ifdef LV_HAVE_AVX512F
369#include <immintrin.h>
// Unaligned-memory AVX-512F cosine kernel: identical algorithm to the
// aligned variant but uses loadu/storeu.
// NOTE(review): several original lines (opening brace, the polynomial
// evaluation producing sin_r/cos_r, closing braces) are missing from this
// chunk; code below is annotated as-is.
372static inline void volk_32f_cos_32f_u_avx512f(
float* cosVector,
373 const float* inVector,
374 unsigned int num_points)
376 float* cosPtr = cosVector;
377 const float* inPtr = inVector;
379 unsigned int number = 0;
// Number of full 16-float vectors to process with SIMD.
380 unsigned int sixteenPoints = num_points / 16;
// 2/pi plus a hi/lo split of pi/2 for two-step argument reduction.
384 const __m512 two_over_pi = _mm512_set1_ps(0x1.45f306p-1f);
385 const __m512 pi_over_2_hi = _mm512_set1_ps(0x1.921fb6p+0f);
386 const __m512 pi_over_2_lo = _mm512_set1_ps(-0x1.777a5cp-25f);
388 const __m512i ones = _mm512_set1_epi32(1);
389 const __m512i twos = _mm512_set1_epi32(2);
// IEEE-754 float sign-bit mask, used to negate lanes via XOR.
390 const __m512i sign_bit = _mm512_set1_epi32(0x80000000);
392 for (; number < sixteenPoints; number++) {
393 __m512 x = _mm512_loadu_ps(inPtr);
// n = round-to-nearest(x * 2/pi): nearest multiple of pi/2.
396 __m512 n_f = _mm512_roundscale_ps(_mm512_mul_ps(x, two_over_pi),
397 _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
398 __m512i n = _mm512_cvtps_epi32(n_f);
// r = x - n*pi/2 via fused fnmadd, hi part then lo part, for precision.
401 __m512 r = _mm512_fnmadd_ps(n_f, pi_over_2_hi, x);
402 r = _mm512_fnmadd_ps(n_f, pi_over_2_lo, r);
// Quadrant decoding: cos(x) = cos(r), -sin(r), -cos(r), sin(r)
// for n mod 4 = 0..3.
411 __m512i n_and_1 = _mm512_and_si512(n, ones);
412 __m512i n_plus_1_and_2 = _mm512_and_si512(_mm512_add_epi32(n, ones), twos);
// Odd n selects sin(r).  NOTE(review): sin_r/cos_r producers are not
// visible in this chunk -- confirm against the full file.
415 __mmask16 swap_mask = _mm512_cmpeq_epi32_mask(n_and_1, ones);
416 __m512 result = _mm512_mask_blend_ps(swap_mask, cos_r, sin_r);
// Negate lanes where (n+1) & 2 is set by flipping the sign bit.
420 __mmask16 neg_mask = _mm512_cmpeq_epi32_mask(n_plus_1_and_2, twos);
421 result = _mm512_castsi512_ps(_mm512_mask_xor_epi32(_mm512_castps_si512(result),
423 _mm512_castps_si512(result),
426 _mm512_storeu_ps(cosPtr, result);
// Scalar tail for the remaining (num_points % 16) elements.
431 number = sixteenPoints * 16;
432 for (; number < num_points; number++) {
433 *cosPtr++ = cosf(*inPtr++);
438#if LV_HAVE_AVX2 && LV_HAVE_FMA
439#include <immintrin.h>
// Unaligned-memory AVX2+FMA cosine kernel: identical algorithm to the
// aligned variant but uses loadu/storeu.
// NOTE(review): several original lines (qualifiers, braces, the polynomial
// evaluation producing sin_r/cos_r) are missing from this chunk.
443volk_32f_cos_32f_u_avx2_fma(
float* bVector,
const float* aVector,
unsigned int num_points)
445 float* bPtr = bVector;
446 const float* aPtr = aVector;
448 unsigned int number = 0;
// Number of full 8-float vectors to process with SIMD.
449 unsigned int eighthPoints = num_points / 8;
// 2/pi plus a hi/lo split of pi/2 for two-step argument reduction.
453 const __m256 two_over_pi = _mm256_set1_ps(0x1.45f306p-1f);
454 const __m256 pi_over_2_hi = _mm256_set1_ps(0x1.921fb6p+0f);
455 const __m256 pi_over_2_lo = _mm256_set1_ps(-0x1.777a5cp-25f);
457 const __m256i ones = _mm256_set1_epi32(1);
458 const __m256i twos = _mm256_set1_epi32(2);
// -0.0f carries only the sign bit; used to negate lanes via XOR.
459 const __m256 sign_bit = _mm256_set1_ps(-0.0f);
461 for (; number < eighthPoints; number++) {
462 __m256 x = _mm256_loadu_ps(aPtr);
// n = round-to-nearest(x * 2/pi): nearest multiple of pi/2.
465 __m256 n_f = _mm256_round_ps(_mm256_mul_ps(x, two_over_pi),
466 _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
467 __m256i n = _mm256_cvtps_epi32(n_f);
// r = x - n*pi/2 via fused fnmadd, hi part then lo part.
470 __m256 r = _mm256_fnmadd_ps(n_f, pi_over_2_hi, x);
471 r = _mm256_fnmadd_ps(n_f, pi_over_2_lo, r);
// Quadrant decoding: cos(x) = cos(r), -sin(r), -cos(r), sin(r)
// for n mod 4 = 0..3.
480 __m256i n_and_1 = _mm256_and_si256(n, ones);
481 __m256i n_plus_1_and_2 = _mm256_and_si256(_mm256_add_epi32(n, ones), twos);
// Odd n selects sin(r).  NOTE(review): sin_r/cos_r producers are not
// visible in this chunk -- confirm against the full file.
484 __m256 swap_mask = _mm256_castsi256_ps(_mm256_cmpeq_epi32(n_and_1, ones));
485 __m256 result = _mm256_blendv_ps(cos_r, sin_r, swap_mask);
// Negate lanes where (n+1) & 2 is set (n mod 4 is 1 or 2).
488 __m256 neg_mask = _mm256_castsi256_ps(_mm256_cmpeq_epi32(n_plus_1_and_2, twos));
489 result = _mm256_xor_ps(result, _mm256_and_ps(neg_mask, sign_bit));
491 _mm256_storeu_ps(bPtr, result);
// Scalar tail for the remaining (num_points % 8) elements.
496 number = eighthPoints * 8;
497 for (; number < num_points; number++) {
498 *bPtr++ = cosf(*aPtr++);
505#include <immintrin.h>
// Unaligned-memory AVX2 (no FMA) cosine kernel: identical algorithm to the
// aligned variant but uses loadu/storeu and unfused mul+sub reduction.
// NOTE(review): several original lines (qualifiers, braces, the polynomial
// evaluation producing sin_r/cos_r) are missing from this chunk.
509volk_32f_cos_32f_u_avx2(
float* bVector,
const float* aVector,
unsigned int num_points)
511 float* bPtr = bVector;
512 const float* aPtr = aVector;
514 unsigned int number = 0;
// Number of full 8-float vectors to process with SIMD.
515 unsigned int eighthPoints = num_points / 8;
// 2/pi plus a hi/lo split of pi/2 for two-step argument reduction.
519 const __m256 two_over_pi = _mm256_set1_ps(0x1.45f306p-1f);
520 const __m256 pi_over_2_hi = _mm256_set1_ps(0x1.921fb6p+0f);
521 const __m256 pi_over_2_lo = _mm256_set1_ps(-0x1.777a5cp-25f);
523 const __m256i ones = _mm256_set1_epi32(1);
524 const __m256i twos = _mm256_set1_epi32(2);
// -0.0f carries only the sign bit; used to negate lanes via XOR.
525 const __m256 sign_bit = _mm256_set1_ps(-0.0f);
527 for (; number < eighthPoints; number++) {
528 __m256 x = _mm256_loadu_ps(aPtr);
// n = round-to-nearest(x * 2/pi): nearest multiple of pi/2.
531 __m256 n_f = _mm256_round_ps(_mm256_mul_ps(x, two_over_pi),
532 _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
533 __m256i n = _mm256_cvtps_epi32(n_f);
// r = x - n*pi/2 with unfused mul+sub, hi part then lo part.
536 __m256 r = _mm256_sub_ps(x, _mm256_mul_ps(n_f, pi_over_2_hi));
537 r = _mm256_sub_ps(r, _mm256_mul_ps(n_f, pi_over_2_lo));
// Quadrant decoding: cos(x) = cos(r), -sin(r), -cos(r), sin(r)
// for n mod 4 = 0..3.
546 __m256i n_and_1 = _mm256_and_si256(n, ones);
547 __m256i n_plus_1_and_2 = _mm256_and_si256(_mm256_add_epi32(n, ones), twos);
// Odd n selects sin(r).  NOTE(review): sin_r/cos_r producers are not
// visible in this chunk -- confirm against the full file.
550 __m256 swap_mask = _mm256_castsi256_ps(_mm256_cmpeq_epi32(n_and_1, ones));
551 __m256 result = _mm256_blendv_ps(cos_r, sin_r, swap_mask);
// Negate lanes where (n+1) & 2 is set (n mod 4 is 1 or 2).
554 __m256 neg_mask = _mm256_castsi256_ps(_mm256_cmpeq_epi32(n_plus_1_and_2, twos));
555 result = _mm256_xor_ps(result, _mm256_and_ps(neg_mask, sign_bit));
557 _mm256_storeu_ps(bPtr, result);
// Scalar tail for the remaining (num_points % 8) elements.
562 number = eighthPoints * 8;
563 for (; number < num_points; number++) {
564 *bPtr++ = cosf(*aPtr++);
571#include <smmintrin.h>
// Unaligned-memory SSE4.1 cosine kernel, 4 floats per iteration.
// NOTE(review): several original lines (qualifiers, braces, the polynomial
// evaluation producing sin_r/cos_r) are missing from this chunk.
575volk_32f_cos_32f_u_sse4_1(
float* bVector,
const float* aVector,
unsigned int num_points)
577 float* bPtr = bVector;
578 const float* aPtr = aVector;
580 unsigned int number = 0;
// Number of full 4-float vectors to process with SIMD.
581 unsigned int quarterPoints = num_points / 4;
// 2/pi plus a hi/lo split of pi/2 for two-step argument reduction.
585 const __m128 two_over_pi = _mm_set1_ps(0x1.45f306p-1f);
586 const __m128 pi_over_2_hi = _mm_set1_ps(0x1.921fb6p+0f);
587 const __m128 pi_over_2_lo = _mm_set1_ps(-0x1.777a5cp-25f);
589 const __m128i ones = _mm_set1_epi32(1);
590 const __m128i twos = _mm_set1_epi32(2);
// -0.0f carries only the sign bit; used to negate lanes via XOR.
591 const __m128 sign_bit = _mm_set1_ps(-0.0f);
593 for (; number < quarterPoints; number++) {
594 __m128 x = _mm_loadu_ps(aPtr);
// n = round-to-nearest(x * 2/pi): nearest multiple of pi/2.
597 __m128 n_f = _mm_round_ps(_mm_mul_ps(x, two_over_pi),
598 _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
599 __m128i n = _mm_cvtps_epi32(n_f);
// r = x - n*pi/2 with unfused mul+sub, hi part then lo part.
602 __m128 r = _mm_sub_ps(x, _mm_mul_ps(n_f, pi_over_2_hi));
603 r = _mm_sub_ps(r, _mm_mul_ps(n_f, pi_over_2_lo));
// Quadrant decoding: cos(x) = cos(r), -sin(r), -cos(r), sin(r)
// for n mod 4 = 0..3.
612 __m128i n_and_1 = _mm_and_si128(n, ones);
613 __m128i n_plus_1_and_2 = _mm_and_si128(_mm_add_epi32(n, ones), twos);
// Odd n selects sin(r).  NOTE(review): sin_r/cos_r producers are not
// visible in this chunk -- confirm against the full file.
616 __m128 swap_mask = _mm_castsi128_ps(_mm_cmpeq_epi32(n_and_1, ones));
617 __m128 result = _mm_blendv_ps(cos_r, sin_r, swap_mask);
// Negate lanes where (n+1) & 2 is set (n mod 4 is 1 or 2).
620 __m128 neg_mask = _mm_castsi128_ps(_mm_cmpeq_epi32(n_plus_1_and_2, twos));
621 result = _mm_xor_ps(result, _mm_and_ps(neg_mask, sign_bit));
623 _mm_storeu_ps(bPtr, result);
// Scalar tail for the remaining (num_points % 4) elements.
628 number = quarterPoints * 4;
629 for (; number < num_points; number++) {
630 *bPtr++ = cosf(*aPtr++);
// ARMv7 NEON cosine kernel body, 4 floats per iteration.
// NOTE(review): the function signature and several interior lines
// (including the polynomial evaluation producing sin_r/cos_r) are not
// visible in this chunk -- presumably volk_32f_cos_32f_neon; confirm
// against the full file.
// 2/pi plus a hi/lo split of pi/2 for two-step argument reduction.
646 const float32x4_t two_over_pi = vdupq_n_f32(0x1.45f306p-1f);
647 const float32x4_t pi_over_2_hi = vdupq_n_f32(0x1.921fb6p+0f);
648 const float32x4_t pi_over_2_lo = vdupq_n_f32(-0x1.777a5cp-25f);
650 const int32x4_t ones = vdupq_n_s32(1);
651 const int32x4_t twos = vdupq_n_s32(2);
// -0.0f carries only the sign bit; used to negate lanes via XOR.
652 const float32x4_t sign_bit = vdupq_n_f32(-0.0f);
// +/-0.5 biases used below to emulate round-to-nearest with a
// truncating float->int conversion.
653 const float32x4_t half = vdupq_n_f32(0.5f);
654 const float32x4_t neg_half = vdupq_n_f32(-0.5f);
655 const float32x4_t fzeroes = vdupq_n_f32(0.0f);
657 unsigned int number = 0;
658 const unsigned int quarterPoints = num_points / 4;
660 for (; number < quarterPoints; number++) {
661 float32x4_t x = vld1q_f32(aVector);
// n = round-to-nearest(x * 2/pi).  vcvtq truncates toward zero, so add
// +0.5 for non-negative values and -0.5 for negative values first.
665 float32x4_t scaled = vmulq_f32(x, two_over_pi);
666 uint32x4_t is_neg = vcltq_f32(scaled, fzeroes);
667 float32x4_t adj = vbslq_f32(is_neg, neg_half, half);
668 float32x4_t n_f = vcvtq_f32_s32(vcvtq_s32_f32(vaddq_f32(scaled, adj)));
669 int32x4_t n = vcvtq_s32_f32(n_f);
// r = x - n*pi/2 using multiply-subtract, hi part then lo part.
672 float32x4_t r = vmlsq_f32(x, n_f, pi_over_2_hi);
673 r = vmlsq_f32(r, n_f, pi_over_2_lo);
// Quadrant decoding: cos(x) = cos(r), -sin(r), -cos(r), sin(r)
// for n mod 4 = 0..3.
682 int32x4_t n_and_1 = vandq_s32(n, ones);
683 int32x4_t n_plus_1_and_2 = vandq_s32(vaddq_s32(n, ones), twos);
// Odd n selects sin(r).  NOTE(review): sin_r/cos_r producers are not
// visible in this chunk -- confirm against the full file.
685 uint32x4_t swap_mask = vceqq_s32(n_and_1, ones);
686 float32x4_t result = vbslq_f32(swap_mask, sin_r, cos_r);
// Negate lanes where (n+1) & 2 is set by XOR-ing in the sign bit.
688 uint32x4_t neg_mask = vceqq_s32(n_plus_1_and_2, twos);
689 result = vreinterpretq_f32_u32(
690 veorq_u32(vreinterpretq_u32_f32(result),
691 vandq_u32(neg_mask, vreinterpretq_u32_f32(sign_bit))));
693 vst1q_f32(bVector, result);
// Scalar tail for the remaining (num_points % 4) elements.
697 for (number = quarterPoints * 4; number < num_points; number++) {
698 *bVector++ = cosf(*aVector++);
// ARMv8 (AArch64) NEON cosine kernel: uses native round-to-nearest
// (vrndnq) and fused multiply-subtract (vfmsq), unlike the ARMv7 variant.
// NOTE(review): qualifiers/braces around this function are missing from
// this chunk; code below is annotated as-is.
709volk_32f_cos_32f_neonv8(
float* bVector,
const float* aVector,
unsigned int num_points)
// 2/pi plus a hi/lo split of pi/2 for two-step argument reduction.
712 const float32x4_t two_over_pi = vdupq_n_f32(0x1.45f306p-1f);
713 const float32x4_t pi_over_2_hi = vdupq_n_f32(0x1.921fb6p+0f);
714 const float32x4_t pi_over_2_lo = vdupq_n_f32(-0x1.777a5cp-25f);
716 const int32x4_t ones = vdupq_n_s32(1);
717 const int32x4_t twos = vdupq_n_s32(2);
// -0.0f carries only the sign bit; used to negate lanes via XOR.
718 const float32x4_t sign_bit = vdupq_n_f32(-0.0f);
720 unsigned int number = 0;
721 const unsigned int quarterPoints = num_points / 4;
723 for (; number < quarterPoints; number++) {
724 float32x4_t x = vld1q_f32(aVector);
// n = round-to-nearest(x * 2/pi) using the native vrndnq instruction.
728 float32x4_t n_f = vrndnq_f32(vmulq_f32(x, two_over_pi));
729 int32x4_t n = vcvtq_s32_f32(n_f);
// r = x - n*pi/2 via fused multiply-subtract, hi part then lo part.
732 float32x4_t r = vfmsq_f32(x, n_f, pi_over_2_hi);
733 r = vfmsq_f32(r, n_f, pi_over_2_lo);
// Polynomial approximations of sin(r) and cos(r) on the reduced range.
736 float32x4_t sin_r = _vsin_poly_neonv8(r);
737 float32x4_t cos_r = _vcos_poly_neonv8(r);
// Quadrant decoding: cos(x) = cos(r), -sin(r), -cos(r), sin(r)
// for n mod 4 = 0..3.
742 int32x4_t n_and_1 = vandq_s32(n, ones);
743 int32x4_t n_plus_1_and_2 = vandq_s32(vaddq_s32(n, ones), twos);
// Odd n selects sin(r) instead of cos(r).
745 uint32x4_t swap_mask = vceqq_s32(n_and_1, ones);
746 float32x4_t result = vbslq_f32(swap_mask, sin_r, cos_r);
// Negate lanes where (n+1) & 2 is set by XOR-ing in the sign bit.
748 uint32x4_t neg_mask = vceqq_s32(n_plus_1_and_2, twos);
749 result = vreinterpretq_f32_u32(
750 veorq_u32(vreinterpretq_u32_f32(result),
751 vandq_u32(neg_mask, vreinterpretq_u32_f32(sign_bit))));
753 vst1q_f32(bVector, result);
// Scalar tail for the remaining (num_points % 4) elements.
757 for (number = quarterPoints * 4; number < num_points; number++) {
758 *bVector++ = cosf(*aVector++);
765#include <riscv_vector.h>
768volk_32f_cos_32f_rvv(
float* bVector,
const float* aVector,
unsigned int num_points)
770 size_t vlmax = __riscv_vsetvlmax_e32m2();
772 const vfloat32m2_t c4oPi = __riscv_vfmv_v_f_f32m2(1.2732395f, vlmax);
773 const vfloat32m2_t cPio4a = __riscv_vfmv_v_f_f32m2(0.7853982f, vlmax);
774 const vfloat32m2_t cPio4b = __riscv_vfmv_v_f_f32m2(7.946627e-09f, vlmax);
775 const vfloat32m2_t cPio4c = __riscv_vfmv_v_f_f32m2(3.061617e-17f, vlmax);
777 const vfloat32m2_t cf1 = __riscv_vfmv_v_f_f32m2(1.0f, vlmax);
778 const vfloat32m2_t cf4 = __riscv_vfmv_v_f_f32m2(4.0f, vlmax);
780 const vfloat32m2_t c2 = __riscv_vfmv_v_f_f32m2(0.0833333333f, vlmax);
781 const vfloat32m2_t c3 = __riscv_vfmv_v_f_f32m2(0.0027777778f, vlmax);
782 const vfloat32m2_t c4 = __riscv_vfmv_v_f_f32m2(4.9603175e-05f, vlmax);
783 const vfloat32m2_t c5 = __riscv_vfmv_v_f_f32m2(5.5114638e-07f, vlmax);
785 size_t n = num_points;
786 for (
size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl) {
787 vl = __riscv_vsetvl_e32m2(n);
788 vfloat32m2_t v = __riscv_vle32_v_f32m2(aVector, vl);
789 vfloat32m2_t s = __riscv_vfabs(v, vl);
790 vint32m2_t q = __riscv_vfcvt_x(__riscv_vfmul(s, c4oPi, vl), vl);
791 vfloat32m2_t r = __riscv_vfcvt_f(__riscv_vadd(q, __riscv_vand(q, 1, vl), vl), vl);
793 s = __riscv_vfnmsac(s, cPio4a, r, vl);
794 s = __riscv_vfnmsac(s, cPio4b, r, vl);
795 s = __riscv_vfnmsac(s, cPio4c, r, vl);
797 s = __riscv_vfmul(s, 1 / 8.0f, vl);
798 s = __riscv_vfmul(s, s, vl);
800 s = __riscv_vfmsub(s, c5, c4, vl);
801 s = __riscv_vfmadd(s, t, c3, vl);
802 s = __riscv_vfmsub(s, t, c2, vl);
803 s = __riscv_vfmadd(s, t, cf1, vl);
804 s = __riscv_vfmul(s, t, vl);
805 s = __riscv_vfmul(s, __riscv_vfsub(cf4, s, vl), vl);
806 s = __riscv_vfmul(s, __riscv_vfsub(cf4, s, vl), vl);
807 s = __riscv_vfmul(s, __riscv_vfsub(cf4, s, vl), vl);
808 s = __riscv_vfmul(s, 1 / 2.0f, vl);
811 __riscv_vfsqrt(__riscv_vfmul(__riscv_vfrsub(s, 2.0f, vl), s, vl), vl);
812 vfloat32m2_t cosine = __riscv_vfsub(cf1, s, vl);
814 vbool16_t m1 = __riscv_vmsne(__riscv_vand(__riscv_vadd(q, 1, vl), 2, vl), 0, vl);
815 vbool16_t m2 = __riscv_vmsne(__riscv_vand(__riscv_vadd(q, 2, vl), 4, vl), 0, vl);
817 cosine = __riscv_vmerge(cosine, sine, m1, vl);
818 cosine = __riscv_vfneg_mu(m2, cosine, cosine, vl);
820 __riscv_vse32(bVector, cosine, vl);