68#ifndef INCLUDED_volk_32fc_s32fc_rotator2_32fc_a_H
69#define INCLUDED_volk_32fc_s32fc_rotator2_32fc_a_H
/* Points per reload block: the kernels' outer loops in this file iterate
 * num_points / ROTATOR_RELOAD times. */
76#define ROTATOR_RELOAD 512
/* ROTATOR_RELOAD divided by 2, 4 and 8.
 * NOTE(review): presumably the inner-loop trip counts for kernels that handle
 * 2, 4 or 8 points per iteration -- their uses are not visible in this view. */
77#define ROTATOR_RELOAD_2 (ROTATOR_RELOAD / 2)
78#define ROTATOR_RELOAD_4 (ROTATOR_RELOAD / 4)
79#define ROTATOR_RELOAD_8 (ROTATOR_RELOAD / 8)
/* Fragment of a scalar rotator kernel; the function header is not visible in
 * this view. */
88 unsigned int num_points)
/* This loop runs once per complete group of ROTATOR_RELOAD points. */
92 for (i = 0; i < (
unsigned int)(num_points /
ROTATOR_RELOAD); ++i) {
/* Rotate one sample by the current phase (advancing both pointers), then
 * step the phase by the increment. */
94 *outVector++ = *inVector++ * (*phase);
95 (*phase) *= *phase_inc;
/* Same rotate-and-advance step a second time.
 * NOTE(review): presumably the body of a tail loop for the points left over
 * after the full groups -- the enclosing loop header is not shown. */
101 *outVector++ = *inVector++ * (*phase);
102 (*phase) *= *phase_inc;
/* NEON rotator kernel fragment (function header not visible in this view).
 * pi constant used by the angle arithmetic below.
 * NOTE(review): presumably wrapped in an #ifndef M_PI guard on lines not shown. */
118#define M_PI 3.14159265358979323846
131 unsigned int num_points)
/* Read-only cursor over the input samples. */
134 const lv_32fc_t* inputVectorPtr = inVector;
/* The phase is tracked as an angle in double precision: a starting angle and
 * the angle step between consecutive samples.
 * NOTE(review): the initialisers are on lines not shown; presumably derived
 * from *phase and *phase_inc -- confirm. */
137 const double initial_angle =
139 const double delta_angle =
/* Running angle and the error-compensation term used by the Kahan-style
 * updates further down. */
143 double angle_sum = initial_angle;
144 double angle_c = 0.0;
/* Angle advance for one group of four samples. */
150 const double delta4 = 4.0 * delta_angle;
/* Four complex samples per vector, held de-interleaved (vld2q_f32/vst2q_f32
 * split and re-join the real and imaginary lanes). */
153 float32x4x2_t input_vec;
154 float32x4x2_t output_vec;
/* The increment replicated into all four lanes. */
157 const lv_32fc_t incrPtr[4] = { incr, incr, incr, incr };
158 const float32x4x2_t incr_vec = vld2q_f32((
float*)incrPtr);
159 float32x4x2_t phase_vec;
/* REDUCE_ANGLE(a): folds the angle with fmod(a, 2*pi) and has a branch for
 * a < -pi. NOTE(review): the remaining lines of the macro are not visible. */
162#define REDUCE_ANGLE(a) \
164 (a) = fmod((a), 2.0 * M_PI); \
167 else if ((a) < -M_PI) \
172 for (
unsigned int k = 0; k < 4; ++k) {
/* Lane k starts at the running angle plus k sample steps; its phasor is
 * evaluated with cos/sin in double and narrowed to float. */
173 double a = angle_sum + (double)k * delta_angle;
175 phasePtr[k] =
lv_cmake((
float)cos(a), (
float)sin(a));
/* Load the four seeded phasors into the phase vector. */
177 phase_vec = vld2q_f32((
float*)phasePtr);
/* One iteration per complete block of ROTATOR_RELOAD points. */
182 for (i = 0; i < (
unsigned int)(num_points /
ROTATOR_RELOAD); i++) {
/* Load four input samples; NOTE(review): the multiply that produces
 * output_vec is on lines not shown. */
185 input_vec = vld2q_f32((
float*)inputVectorPtr);
/* Store four rotated samples and advance the output cursor. */
193 vst2q_f32((
float*)outputVectorPtr, output_vec);
195 outputVectorPtr += 4;
/* Kahan-style compensated addition of block_delta to the running angle:
 * y is the error-corrected increment, t the new sum, and angle_c captures the
 * rounding error of this addition.
 * NOTE(review): block_delta's definition and the write-back of t to
 * angle_sum are on lines not shown. */
201 double y = block_delta - angle_c;
202 double t = angle_sum + y;
203 angle_c = (t - angle_sum) - y;
/* Re-seed the four lane phasors from the accumulated angle. */
208 for (
unsigned int k = 0; k < 4; ++k) {
209 double a = angle_sum + (double)k * delta_angle;
211 phasePtr[k] =
lv_cmake((
float)cos(a), (
float)sin(a));
213 phase_vec = vld2q_f32((
float*)phasePtr);
/* Second load/store sequence.
 * NOTE(review): presumably the loop over the remaining groups of four after
 * the full blocks -- its header is not shown. */
218 input_vec = vld2q_f32((
float*)inputVectorPtr);
223 vst2q_f32((
float*)outputVectorPtr, output_vec);
225 outputVectorPtr += 4;
/* Compensated addition of a four-sample angle step to the running angle. */
230 double y = (4.0 * delta_angle) - angle_c;
231 double t = angle_sum + y;
232 angle_c = (t - angle_sum) - y;
/* Rebuild the scalar phase from the running angle. */
239 double a = angle_sum;
241 *phase =
lv_cmake((
float)cos(a), (
float)sin(a));
/* Scalar tail: the num_points % 4 samples that do not fill a vector. */
245 for (i = 0; i < num_points % 4; i++) {
246 *outputVectorPtr++ = *inputVectorPtr++ * (*phase);
/* Compensated addition of one sample step to the running angle. */
248 double y = delta_angle - angle_c;
249 double t = angle_sum + y;
250 angle_c = (t - angle_sum) - y;
/* Recompute *phase from the updated angle. */
253 double a = angle_sum;
255 *phase =
lv_cmake((
float)cos(a), (
float)sin(a));
/* 256-bit AVX rotator kernel fragment (function header not visible in this
 * view). */
265#include <immintrin.h>
/* pi constant used by the angle arithmetic below.
 * NOTE(review): presumably wrapped in an #ifndef M_PI guard on lines not shown. */
269#define M_PI 3.14159265358979323846
282 unsigned int num_points)
/* The phase is tracked as an angle in double precision: a starting angle and
 * the angle step between consecutive samples.
 * NOTE(review): the initialisers are on lines not shown; presumably derived
 * from *phase and *phase_inc -- confirm. */
288 const double initial_angle =
290 const double delta_angle =
/* Running angle and the error-compensation term used by the Kahan-style
 * updates further down. */
294 double angle_sum = initial_angle;
295 double angle_c = 0.0;
/* Angle advance for one group of four samples. */
301 const double delta4 = 4.0 * delta_angle;
/* Input vector, current phase vector and result vector (8 floats = 4 complex
 * samples each). */
304 __m256 aVal, phase_Val, z;
/* Increment vector; NOTE(review): only the first argument of the
 * _mm256_set_ps call is visible here. */
307 const __m256 inc_Val = _mm256_set_ps(
lv_cimag(incr),
/* REDUCE_ANGLE(a): folds the angle with fmod(a, 2*pi) and has a branch for
 * a < -pi. NOTE(review): the remaining lines of the macro are not visible. */
317#define REDUCE_ANGLE(a) \
319 (a) = fmod((a), 2.0 * M_PI); \
322 else if ((a) < -M_PI) \
327 for (
unsigned int k = 0; k < 4; ++k) {
/* Lane k starts at the running angle plus k sample steps; its phasor is
 * evaluated with cos/sin in double and narrowed to float. */
328 double a = angle_sum + (double)k * delta_angle;
330 phase_Ptr[k] =
lv_cmake((
float)cos(a), (
float)sin(a));
/* Load the four seeded phasors with an unaligned load. */
332 phase_Val = _mm256_loadu_ps((
float*)phase_Ptr);
/* One iteration per complete block of ROTATOR_RELOAD points. */
337 for (i = 0; i < (
unsigned int)(num_points /
ROTATOR_RELOAD); ++i) {
/* Unaligned load of four input samples; NOTE(review): the multiply that
 * produces z is on lines not shown. */
340 aVal = _mm256_loadu_ps((
float*)aPtr);
/* Unaligned store of four rotated samples. */
345 _mm256_storeu_ps((
float*)cPtr, z);
/* Kahan-style compensated addition of block_delta to the running angle:
 * y is the error-corrected increment, t the new sum, and angle_c captures the
 * rounding error of this addition.
 * NOTE(review): block_delta's definition and the write-back of t to
 * angle_sum are on lines not shown. */
353 double y = block_delta - angle_c;
354 double t = angle_sum + y;
355 angle_c = (t - angle_sum) - y;
/* Re-seed the four lane phasors from the accumulated angle. */
360 for (
unsigned int k = 0; k < 4; ++k) {
361 double a = angle_sum + (double)k * delta_angle;
363 phase_Ptr[k] =
lv_cmake((
float)cos(a), (
float)sin(a));
365 phase_Val = _mm256_loadu_ps((
float*)phase_Ptr);
/* Second load/store sequence.
 * NOTE(review): presumably the loop over the remaining groups of four after
 * the full blocks -- its header is not shown. */
370 aVal = _mm256_loadu_ps((
float*)aPtr);
375 _mm256_storeu_ps((
float*)cPtr, z);
/* Compensated addition of a four-sample angle step to the running angle. */
382 double y = (4.0 * delta_angle) - angle_c;
383 double t = angle_sum + y;
384 angle_c = (t - angle_sum) - y;
/* Rebuild the scalar phase from the running angle. */
391 double a = angle_sum;
393 *phase =
lv_cmake((
float)cos(a), (
float)sin(a));
/* Scalar tail: the num_points % 4 samples that do not fill a vector. */
397 for (i = 0; i < num_points % 4; ++i) {
398 *cPtr++ = *aPtr++ * (*phase);
/* Compensated addition of one sample step to the running angle. */
400 double y = delta_angle - angle_c;
401 double t = angle_sum + y;
402 angle_c = (t - angle_sum) - y;
/* Recompute *phase from the updated angle. */
405 double a = angle_sum;
407 *phase =
lv_cmake((
float)cos(a), (
float)sin(a));
/* Second 256-bit AVX rotator kernel fragment (function header not visible in
 * this view). The visible lines mirror the preceding 256-bit kernel. */
417#include <immintrin.h>
429 unsigned int num_points)
/* The phase is tracked as an angle in double precision: a starting angle and
 * the angle step between consecutive samples.
 * NOTE(review): the initialisers are on lines not shown; presumably derived
 * from *phase and *phase_inc -- confirm. */
435 const double initial_angle =
437 const double delta_angle =
/* Running angle and the error-compensation term used by the Kahan-style
 * updates further down. */
441 double angle_sum = initial_angle;
442 double angle_c = 0.0;
/* Angle advance for one group of four samples. */
448 const double delta4 = 4.0 * delta_angle;
/* Input vector, current phase vector and result vector (8 floats = 4 complex
 * samples each). */
451 __m256 aVal, phase_Val, z;
/* Increment vector; NOTE(review): only the first argument of the
 * _mm256_set_ps call is visible here. */
454 const __m256 inc_Val = _mm256_set_ps(
lv_cimag(incr),
/* REDUCE_ANGLE(a): folds the angle with fmod(a, 2*pi) and has a branch for
 * a < -pi. NOTE(review): the remaining lines of the macro are not visible. */
463#define REDUCE_ANGLE(a) \
465 (a) = fmod((a), 2.0 * M_PI); \
468 else if ((a) < -M_PI) \
473 for (
unsigned int k = 0; k < 4; ++k) {
/* Lane k starts at the running angle plus k sample steps; its phasor is
 * evaluated with cos/sin in double and narrowed to float. */
474 double a = angle_sum + (double)k * delta_angle;
476 phase_Ptr[k] =
lv_cmake((
float)cos(a), (
float)sin(a));
/* Load the four seeded phasors with an unaligned load. */
478 phase_Val = _mm256_loadu_ps((
float*)phase_Ptr);
/* One iteration per complete block of ROTATOR_RELOAD points. */
483 for (i = 0; i < (
unsigned int)(num_points /
ROTATOR_RELOAD); ++i) {
/* Unaligned load of four input samples; NOTE(review): the multiply that
 * produces z is on lines not shown. */
485 aVal = _mm256_loadu_ps((
float*)aPtr);
/* Unaligned store of four rotated samples. */
488 _mm256_storeu_ps((
float*)cPtr, z);
/* Kahan-style compensated addition of block_delta to the running angle:
 * y is the error-corrected increment, t the new sum, and angle_c captures the
 * rounding error of this addition.
 * NOTE(review): block_delta's definition and the write-back of t to
 * angle_sum are on lines not shown. */
495 double y = block_delta - angle_c;
496 double t = angle_sum + y;
497 angle_c = (t - angle_sum) - y;
/* Re-seed the four lane phasors from the accumulated angle. */
502 for (
unsigned int k = 0; k < 4; ++k) {
503 double a = angle_sum + (double)k * delta_angle;
505 phase_Ptr[k] =
lv_cmake((
float)cos(a), (
float)sin(a));
507 phase_Val = _mm256_loadu_ps((
float*)phase_Ptr);
/* Second load/store sequence.
 * NOTE(review): presumably the loop over the remaining groups of four after
 * the full blocks -- its header is not shown. */
512 aVal = _mm256_loadu_ps((
float*)aPtr);
515 _mm256_storeu_ps((
float*)cPtr, z);
/* Compensated addition of a four-sample angle step to the running angle. */
520 double y = (4.0 * delta_angle) - angle_c;
521 double t = angle_sum + y;
522 angle_c = (t - angle_sum) - y;
/* Rebuild the scalar phase from the running angle. */
529 double a = angle_sum;
531 *phase =
lv_cmake((
float)cos(a), (
float)sin(a));
/* Scalar tail: the num_points % 4 samples that do not fill a vector. */
535 for (i = 0; i < num_points % 4; ++i) {
536 *cPtr++ = *aPtr++ * (*phase);
/* Compensated addition of one sample step to the running angle. */
538 double y = delta_angle - angle_c;
539 double t = angle_sum + y;
540 angle_c = (t - angle_sum) - y;
/* Recompute *phase from the updated angle. */
543 double a = angle_sum;
545 *phase =
lv_cmake((
float)cos(a), (
float)sin(a));
/* AVX-512F rotator kernel, aligned variant: every vector access visible here
 * uses _mm512_load_ps / _mm512_store_ps, which require 64-byte-aligned
 * addresses. Eight complex samples are handled per vector. */
554#ifdef LV_HAVE_AVX512F
555#include <immintrin.h>
564static inline void volk_32fc_s32fc_x2_rotator2_32fc_a_avx512f(
lv_32fc_t* outVector,
568 unsigned int num_points)
/* The phase is tracked as an angle in double precision: a starting angle and
 * the angle step between consecutive samples.
 * NOTE(review): the initialisers are on lines not shown; presumably derived
 * from *phase and *phase_inc -- confirm. */
574 const double initial_angle =
576 const double delta_angle =
/* Running angle and the error-compensation term used by the Kahan-style
 * updates further down. */
580 double angle_sum = initial_angle;
581 double angle_c = 0.0;
/* Angle advance for one group of eight samples. */
587 const double delta8 = 8.0 * delta_angle;
/* Input vector, current phase vector and result vector (16 floats = 8 complex
 * samples each). */
590 __m512 aVal, phase_Val, z;
/* Increment vector; NOTE(review): only the first argument of the
 * _mm512_set_ps call is visible here. */
594 const __m512 inc_Val = _mm512_set_ps(
lv_cimag(incr),
/* REDUCE_ANGLE(a): folds the angle with fmod(a, 2*pi) and has a branch for
 * a < -pi. NOTE(review): the remaining lines of the macro are not visible. */
611#define REDUCE_ANGLE(a) \
613 (a) = fmod((a), 2.0 * M_PI); \
616 else if ((a) < -M_PI) \
621 for (
unsigned int k = 0; k < 8; ++k) {
/* Lane k starts at the running angle plus k sample steps; its phasor is
 * evaluated with cos/sin in double and narrowed to float. */
622 double a = angle_sum + (double)k * delta_angle;
624 phase_Ptr[k] =
lv_cmake((
float)cos(a), (
float)sin(a));
/* Aligned load of the eight seeded phasors.
 * NOTE(review): phase_Ptr's declaration is not visible -- confirm it is
 * declared with 64-byte alignment, otherwise this load can fault. */
626 phase_Val = _mm512_load_ps((
float*)phase_Ptr);
/* One iteration per complete block of ROTATOR_RELOAD points. */
631 for (i = 0; i < (
unsigned int)(num_points /
ROTATOR_RELOAD); ++i) {
/* Aligned load of eight input samples; NOTE(review): the multiply that
 * produces z is on lines not shown. */
633 aVal = _mm512_load_ps((
float*)aPtr);
/* Aligned store of eight rotated samples. */
636 _mm512_store_ps((
float*)cPtr, z);
/* Kahan-style compensated addition of block_delta to the running angle:
 * y is the error-corrected increment, t the new sum, and angle_c captures the
 * rounding error of this addition.
 * NOTE(review): block_delta's definition and the write-back of t to
 * angle_sum are on lines not shown. */
643 double y = block_delta - angle_c;
644 double t = angle_sum + y;
645 angle_c = (t - angle_sum) - y;
/* Re-seed the eight lane phasors from the accumulated angle. */
650 for (
unsigned int k = 0; k < 8; ++k) {
651 double a = angle_sum + (double)k * delta_angle;
653 phase_Ptr[k] =
lv_cmake((
float)cos(a), (
float)sin(a));
655 phase_Val = _mm512_load_ps((
float*)phase_Ptr);
/* Second load/store sequence.
 * NOTE(review): presumably the loop over the remaining groups of eight after
 * the full blocks -- its header is not shown. */
660 aVal = _mm512_load_ps((
float*)aPtr);
663 _mm512_store_ps((
float*)cPtr, z);
/* Compensated addition of an eight-sample angle step to the running angle. */
668 double y = (8.0 * delta_angle) - angle_c;
669 double t = angle_sum + y;
670 angle_c = (t - angle_sum) - y;
/* Rebuild the scalar phase from the running angle. */
677 double a = angle_sum;
679 *phase =
lv_cmake((
float)cos(a), (
float)sin(a));
/* Scalar tail: the num_points % 8 samples that do not fill a vector. */
683 for (i = 0; i < num_points % 8; ++i) {
684 *cPtr++ = *aPtr++ * (*phase);
/* Compensated addition of one sample step to the running angle. */
686 double y = delta_angle - angle_c;
687 double t = angle_sum + y;
688 angle_c = (t - angle_sum) - y;
/* Recompute *phase from the updated angle. */
691 double a = angle_sum;
693 *phase =
lv_cmake((
float)cos(a), (
float)sin(a));
/* AVX-512F rotator kernel, unaligned variant: every vector access visible
 * here uses _mm512_loadu_ps / _mm512_storeu_ps, so no alignment is required
 * of the buffers. Eight complex samples are handled per vector. */
702#ifdef LV_HAVE_AVX512F
703#include <immintrin.h>
710static inline void volk_32fc_s32fc_x2_rotator2_32fc_u_avx512f(
lv_32fc_t* outVector,
714 unsigned int num_points)
/* The phase is tracked as an angle in double precision: a starting angle and
 * the angle step between consecutive samples.
 * NOTE(review): the initialisers are on lines not shown; presumably derived
 * from *phase and *phase_inc -- confirm. */
719 const double initial_angle =
721 const double delta_angle =
/* Running angle and the error-compensation term used by the Kahan-style
 * updates further down. */
724 double angle_sum = initial_angle;
725 double angle_c = 0.0;
/* Angle advance for one group of eight samples. */
729 const double delta8 = 8.0 * delta_angle;
/* Input vector, current phase vector and result vector (16 floats = 8 complex
 * samples each). */
732 __m512 aVal, phase_Val, z;
/* Increment vector; NOTE(review): only the first argument of the
 * _mm512_set_ps call is visible here. */
735 const __m512 inc_Val = _mm512_set_ps(
lv_cimag(incr),
/* REDUCE_ANGLE(a): folds the angle with fmod(a, 2*pi) and has a branch for
 * a < -pi. NOTE(review): the remaining lines of the macro are not visible. */
752#define REDUCE_ANGLE(a) \
754 (a) = fmod((a), 2.0 * M_PI); \
757 else if ((a) < -M_PI) \
761 for (
unsigned int k = 0; k < 8; ++k) {
/* Lane k starts at the running angle plus k sample steps; its phasor is
 * evaluated with cos/sin in double and narrowed to float. */
762 double a = angle_sum + (double)k * delta_angle;
764 phase_Ptr[k] =
lv_cmake((
float)cos(a), (
float)sin(a));
/* Unaligned load of the eight seeded phasors. */
766 phase_Val = _mm512_loadu_ps((
float*)phase_Ptr);
/* One iteration per complete block of ROTATOR_RELOAD points. */
770 for (i = 0; i < (
unsigned int)(num_points /
ROTATOR_RELOAD); ++i) {
/* Unaligned load of eight input samples; NOTE(review): the multiply that
 * produces z is on lines not shown. */
772 aVal = _mm512_loadu_ps((
float*)aPtr);
/* Unaligned store of eight rotated samples. */
775 _mm512_storeu_ps((
float*)cPtr, z);
/* Kahan-style compensated addition of block_delta to the running angle:
 * y is the error-corrected increment, t the new sum, and angle_c captures the
 * rounding error of this addition.
 * NOTE(review): block_delta's definition and the write-back of t to
 * angle_sum are on lines not shown. */
781 double y = block_delta - angle_c;
782 double t = angle_sum + y;
783 angle_c = (t - angle_sum) - y;
/* Re-seed the eight lane phasors from the accumulated angle. */
787 for (
unsigned int k = 0; k < 8; ++k) {
788 double a = angle_sum + (double)k * delta_angle;
790 phase_Ptr[k] =
lv_cmake((
float)cos(a), (
float)sin(a));
792 phase_Val = _mm512_loadu_ps((
float*)phase_Ptr);
/* Second load/store sequence.
 * NOTE(review): presumably the loop over the remaining groups of eight after
 * the full blocks -- its header is not shown. */
796 aVal = _mm512_loadu_ps((
float*)aPtr);
799 _mm512_storeu_ps((
float*)cPtr, z);
/* Compensated addition of an eight-sample angle step to the running angle. */
804 double y = (8.0 * delta_angle) - angle_c;
805 double t = angle_sum + y;
806 angle_c = (t - angle_sum) - y;
/* Rebuild the scalar phase from the running angle. */
812 double a = angle_sum;
814 *phase =
lv_cmake((
float)cos(a), (
float)sin(a));
/* Scalar tail: the num_points % 8 samples that do not fill a vector. */
817 for (i = 0; i < num_points % 8; ++i) {
818 *cPtr++ = *aPtr++ * (*phase);
/* Compensated addition of one sample step to the running angle. */
820 double y = delta_angle - angle_c;
821 double t = angle_sum + y;
822 angle_c = (t - angle_sum) - y;
/* Recompute *phase from the updated angle. */
825 double a = angle_sum;
827 *phase =
lv_cmake((
float)cos(a), (
float)sin(a));
/* RISC-V Vector rotator kernel. Complex samples are moved as 64-bit elements
 * and split into / re-joined from separate real and imaginary vectors with
 * narrowing and widening integer operations. */
842#include <riscv_vector.h>
844static inline void volk_32fc_s32fc_x2_rotator2_32fc_rvv(
lv_32fc_t* outVector,
848 unsigned int num_points)
/* Maximum number of 32-bit elements in an LMUL=2 register group. */
850 size_t vlmax = __riscv_vsetvlmax_e32m2();
/* Real and imaginary phase vectors, zero-initialised and then filled one
 * element per iteration by sliding in the parts of ph.
 * NOTE(review): how ph changes between iterations is on lines not shown;
 * presumably it is multiplied by the phase increment each time. */
854 vfloat32m2_t phr = __riscv_vfmv_v_f_f32m2(0, vlmax), phi = phr;
855 for (
size_t i = 0; i < vlmax; ++i) {
859 phr = __riscv_vfslide1down(phr,
lv_creal(ph), vlmax);
860 phi = __riscv_vfslide1down(phi,
lv_cimag(ph), vlmax);
/* Broadcast the real and imaginary parts of inc to every lane.
 * NOTE(review): inc's definition is not visible; presumably the phase step
 * for a whole vector of vlmax samples -- confirm. */
865 vfloat32m2_t incr = __riscv_vfmv_v_f_f32m2(
lv_creal(inc), vlmax);
866 vfloat32m2_t inci = __riscv_vfmv_v_f_f32m2(
lv_cimag(inc), vlmax);
/* Strip-mined main loop: each pass handles vl samples and advances both
 * buffers by vl. */
874 for (; n > 0; n -= vl, inVector += vl, outVector += vl) {
/* Request min(n, vlmax) elements for this pass. */
876 vl = __riscv_vsetvl_e32m2(n < vlmax ? n : vlmax);
/* Load each complex sample as one 64-bit element, then narrow: shift 0 keeps
 * the low 32 bits (var), shift 32 keeps the high 32 bits (vai). */
878 vuint64m4_t va = __riscv_vle64_v_u64m4((
const uint64_t*)inVector, vl);
879 vfloat32m2_t var = __riscv_vreinterpret_f32m2(__riscv_vnsrl(va, 0, vl));
880 vfloat32m2_t vai = __riscv_vreinterpret_f32m2(__riscv_vnsrl(va, 32, vl));
/* Complex product, first part: var*phr - vai*phi. */
883 __riscv_vfnmsac(__riscv_vfmul(var, phr, vl), vai, phi, vl);
/* Complex product, second part: var*phi + vai*phr. */
885 __riscv_vfmacc(__riscv_vfmul(var, phi, vl), vai, phr, vl);
/* Re-interleave: (vru + viu) + 0xFFFFFFFF * viu equals vru + (viu << 32) in
 * 64-bit arithmetic, so vru lands in the low and viu in the high half. */
887 vuint32m2_t vru = __riscv_vreinterpret_u32m2(vr);
888 vuint32m2_t viu = __riscv_vreinterpret_u32m2(vi);
890 __riscv_vwmaccu(__riscv_vwaddu_vv(vru, viu, vl), 0xFFFFFFFF, viu, vl);
891 __riscv_vse64((uint64_t*)outVector, res, vl);
/* Advance every lane's phase: complex multiply of (phr, phi) by (incr, inci).
 * tmp keeps the old phr because phr is overwritten before phi is computed. */
893 vfloat32m2_t tmp = phr;
894 phr = __riscv_vfnmsac(__riscv_vfmul(tmp, incr, vl), phi, inci, vl);
895 phi = __riscv_vfmacc(__riscv_vfmul(tmp, inci, vl), phi, incr, vl);
/* Renormalise the phase to unit magnitude: divide both parts by
 * sqrt(phr^2 + phi^2).
 * NOTE(review): the condition that decides when this runs is not visible. */
903 __riscv_vfmacc(__riscv_vfmul(phr, phr, vl), phi, phi, vl);
904 scale = __riscv_vfsqrt(scale, vl);
905 phr = __riscv_vfdiv(phr, scale, vl);
906 phi = __riscv_vfdiv(phi, scale, vl);
/* Runs vlmax - vl times, using the vl of the final pass.
 * NOTE(review): the loop body is not visible; presumably it derives the final
 * *phase for a partially filled last vector. */
910 for (
size_t i = 0; i < vlmax - vl; ++i) {
/* RISC-V Vector rotator kernel using two-field segment loads and stores to
 * split and re-join the real and imaginary parts of each complex sample. */
918#include <riscv_vector.h>
920static inline void volk_32fc_s32fc_x2_rotator2_32fc_rvvseg(
lv_32fc_t* outVector,
924 unsigned int num_points)
/* Maximum number of 32-bit elements in an LMUL=2 register group. */
926 size_t vlmax = __riscv_vsetvlmax_e32m2();
/* Real and imaginary phase vectors, zero-initialised and then filled one
 * element per iteration by sliding in the parts of ph.
 * NOTE(review): how ph changes between iterations is on lines not shown;
 * presumably it is multiplied by the phase increment each time. */
930 vfloat32m2_t phr = __riscv_vfmv_v_f_f32m2(0, vlmax), phi = phr;
931 for (
size_t i = 0; i < vlmax; ++i) {
935 phr = __riscv_vfslide1down(phr,
lv_creal(ph), vlmax);
936 phi = __riscv_vfslide1down(phi,
lv_cimag(ph), vlmax);
/* Broadcast the real and imaginary parts of inc to every lane.
 * NOTE(review): inc's definition is not visible; presumably the phase step
 * for a whole vector of vlmax samples -- confirm. */
941 vfloat32m2_t incr = __riscv_vfmv_v_f_f32m2(
lv_creal(inc), vlmax);
942 vfloat32m2_t inci = __riscv_vfmv_v_f_f32m2(
lv_cimag(inc), vlmax);
/* Strip-mined main loop: each pass handles vl samples and advances both
 * buffers by vl. */
950 for (; n > 0; n -= vl, inVector += vl, outVector += vl) {
/* Request min(n, vlmax) elements for this pass. */
952 vl = __riscv_vsetvl_e32m2(n < vlmax ? n : vlmax);
/* Segment load of vl (field 0, field 1) float pairs; field 0 becomes var and
 * field 1 becomes vai. */
955 __riscv_vlseg2e32_v_f32m2x2((
const float*)inVector, vl);
956 vfloat32m2_t var = __riscv_vget_f32m2(va, 0);
957 vfloat32m2_t vai = __riscv_vget_f32m2(va, 1);
/* Complex product, first part: var*phr - vai*phi. */
960 __riscv_vfnmsac(__riscv_vfmul(var, phr, vl), vai, phi, vl);
/* Complex product, second part: var*phi + vai*phr. */
962 __riscv_vfmacc(__riscv_vfmul(var, phi, vl), vai, phr, vl);
/* Pack (vr, vi) into a two-field tuple and store it interleaved. */
963 vfloat32m2x2_t vc = __riscv_vcreate_v_f32m2x2(vr, vi);
964 __riscv_vsseg2e32_v_f32m2x2((
float*)outVector, vc, vl);
/* Advance every lane's phase: complex multiply of (phr, phi) by (incr, inci).
 * tmp keeps the old phr because phr is overwritten before phi is computed. */
966 vfloat32m2_t tmp = phr;
967 phr = __riscv_vfnmsac(__riscv_vfmul(tmp, incr, vl), phi, inci, vl);
968 phi = __riscv_vfmacc(__riscv_vfmul(tmp, inci, vl), phi, incr, vl);
/* Renormalise the phase to unit magnitude: divide both parts by
 * sqrt(phr^2 + phi^2).
 * NOTE(review): the condition that decides when this runs is not visible. */
976 __riscv_vfmacc(__riscv_vfmul(phr, phr, vl), phi, phi, vl);
977 scale = __riscv_vfsqrt(scale, vl);
978 phr = __riscv_vfdiv(phr, scale, vl);
979 phi = __riscv_vfdiv(phi, scale, vl);
/* Runs vlmax - vl times, using the vl of the final pass.
 * NOTE(review): the loop body is not visible; presumably it derives the final
 * *phase for a partially filled last vector. */
983 for (
size_t i = 0; i < vlmax - vl; ++i) {