98#ifndef INCLUDED_volk_32fc_x2_divide_32fc_u_H
99#define INCLUDED_volk_32fc_x2_divide_32fc_u_H
#ifdef LV_HAVE_GENERIC

/*!
 * \brief Element-wise complex division: cVector[i] = aVector[i] / bVector[i].
 *
 * Scalar reference implementation; relies on the compiler's complex division.
 *
 * \param cVector    output vector (num_points complex floats)
 * \param aVector    numerator input vector
 * \param bVector    denominator input vector
 * \param num_points number of complex values to process
 */
static inline void volk_32fc_x2_divide_32fc_generic(lv_32fc_t* cVector,
                                                    const lv_32fc_t* aVector,
                                                    const lv_32fc_t* bVector,
                                                    unsigned int num_points)
{
    lv_32fc_t* cPtr = cVector;
    const lv_32fc_t* aPtr = aVector;
    const lv_32fc_t* bPtr = bVector;

    for (unsigned int number = 0; number < num_points; number++) {
        *cPtr++ = (*aPtr++) / (*bPtr++);
    }
}
#endif /* LV_HAVE_GENERIC */
#ifdef LV_HAVE_SSE3
#include <pmmintrin.h>
#include <volk/volk_sse3_intrinsics.h>

/*!
 * \brief c = a / b using unaligned SSE3 loads/stores, 4 complex points per iteration.
 *
 * Uses the identity a / b = (a * conj(b)) / |b|^2, turning the complex division
 * into one complex multiply plus a real divide per point. The remainder
 * (num_points % 4) is handled by a scalar tail loop.
 */
static inline void volk_32fc_x2_divide_32fc_u_sse3(lv_32fc_t* cVector,
                                                   const lv_32fc_t* numeratorVector,
                                                   const lv_32fc_t* denumeratorVector,
                                                   unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int quarterPoints = num_points / 4;

    __m128 num01, num23, den01, den23, norm, result;
    lv_32fc_t* c = cVector;
    const lv_32fc_t* a = numeratorVector;
    const lv_32fc_t* b = denumeratorVector;

    for (; number < quarterPoints; number++) {
        num01 = _mm_loadu_ps((float*)a); // points 0 and 1
        den01 = _mm_loadu_ps((float*)b);
        num01 = _mm_complexconjugatemul_ps(num01, den01); // a * conj(b)
        a += 2;
        b += 2;

        num23 = _mm_loadu_ps((float*)a); // points 2 and 3
        den23 = _mm_loadu_ps((float*)b);
        num23 = _mm_complexconjugatemul_ps(num23, den23);
        a += 2;
        b += 2;

        // |b|^2 for all four points packed into one register
        norm = _mm_magnitudesquared_ps_sse3(den01, den23);
        // duplicate each |b|^2 so it lines up with its re/im pair
        den01 = _mm_unpacklo_ps(norm, norm);
        den23 = _mm_unpackhi_ps(norm, norm);

        result = _mm_div_ps(num01, den01);
        _mm_storeu_ps((float*)c, result);
        c += 2;
        result = _mm_div_ps(num23, den23);
        _mm_storeu_ps((float*)c, result);
        c += 2;
    }

    // scalar tail for the remaining 0..3 points
    number = quarterPoints * 4;
    for (; number < num_points; number++) {
        *c++ = (*a++) / (*b++);
    }
}
#endif /* LV_HAVE_SSE3 */
#ifdef LV_HAVE_AVX
#include <immintrin.h>
#include <volk/volk_avx_intrinsics.h>

/*!
 * \brief c = a / b using unaligned AVX loads/stores, 4 complex points per iteration.
 *
 * Same a*conj(b)/|b|^2 scheme as the SSE3 kernel, but on a full 256-bit register.
 */
static inline void volk_32fc_x2_divide_32fc_u_avx(lv_32fc_t* cVector,
                                                  const lv_32fc_t* numeratorVector,
                                                  const lv_32fc_t* denumeratorVector,
                                                  unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int quarterPoints = num_points / 4;

    __m256 num, denum, mul_conj, sq, mag_sq, mag_sq_un, div;
    lv_32fc_t* c = cVector;
    const lv_32fc_t* a = numeratorVector;
    const lv_32fc_t* b = denumeratorVector;

    for (; number < quarterPoints; number++) {
        num = _mm256_loadu_ps((float*)a);   // [ar0 ai0 ar1 ai1 | ar2 ai2 ar3 ai3]
        denum = _mm256_loadu_ps((float*)b);
        mul_conj = _mm256_complexconjugatemul_ps(num, denum); // a * conj(b)
        sq = _mm256_mul_ps(denum, denum);   // [br^2 bi^2 ...]
        // hadd pairs re^2+im^2, but leaves the sums interleaved ...
        mag_sq_un = _mm256_hadd_ps(sq, sq);
        // ... so re-order with 0xd8 (0,2,1,3 per lane) to duplicate each |b|^2
        // next to its own re/im pair
        mag_sq = _mm256_permute_ps(mag_sq_un, 0xd8);
        div = _mm256_div_ps(mul_conj, mag_sq);

        _mm256_storeu_ps((float*)c, div);

        a += 4;
        b += 4;
        c += 4;
    }

    // scalar tail for the remaining 0..3 points
    number = quarterPoints * 4;

    for (; number < num_points; number++) {
        *c++ = (*a++) / (*b++);
    }
}
#endif /* LV_HAVE_AVX */
#if LV_HAVE_AVX2 && LV_HAVE_FMA
#include <immintrin.h>

/*!
 * \brief c = a / b using unaligned AVX2+FMA, 8 complex points per iteration.
 *
 * Same a*conj(b)/|b|^2 scheme as the SSE3/AVX kernels; the conjugate multiply
 * is fused using FMA (fmsubadd), so rounding may differ in the last ulp from
 * the non-FMA paths.
 */
static inline void volk_32fc_x2_divide_32fc_u_avx2_fma(lv_32fc_t* cVector,
                                                       const lv_32fc_t* numeratorVector,
                                                       const lv_32fc_t* denumeratorVector,
                                                       unsigned int num_points)
{
    lv_32fc_t* c = cVector;
    const lv_32fc_t* a = numeratorVector;
    const lv_32fc_t* b = denumeratorVector;

    const unsigned int eighthPoints = num_points / 8;

    __m256 num01, num23, denum01, denum23, complex_result, result0, result1;

    for (unsigned int number = 0; number < eighthPoints; number++) {
        num01 = _mm256_loadu_ps((float*)a);
        denum01 = _mm256_loadu_ps((float*)b);
        // num01 <- num01 * conj(denum01): re = ar*br + ai*bi, im = ai*br - ar*bi
        num01 = _mm256_fmsubadd_ps(
            _mm256_moveldup_ps(denum01), num01,
            _mm256_mul_ps(_mm256_permute_ps(num01, 0xb1), _mm256_movehdup_ps(denum01)));
        a += 4;
        b += 4;

        num23 = _mm256_loadu_ps((float*)a);
        denum23 = _mm256_loadu_ps((float*)b);
        num23 = _mm256_fmsubadd_ps(
            _mm256_moveldup_ps(denum23), num23,
            _mm256_mul_ps(_mm256_permute_ps(num23, 0xb1), _mm256_movehdup_ps(denum23)));
        a += 4;
        b += 4;

        // |b|^2 of all 8 points: hadd pairs re^2+im^2 (lane-interleaved order)
        complex_result = _mm256_hadd_ps(_mm256_mul_ps(denum01, denum01),
                                        _mm256_mul_ps(denum23, denum23));

        // broadcast each |b|^2 into the re and im slot of its own point
        denum01 = _mm256_shuffle_ps(complex_result, complex_result, 0x50);
        denum23 = _mm256_shuffle_ps(complex_result, complex_result, 0xfa);

        result0 = _mm256_div_ps(num01, denum01);
        result1 = _mm256_div_ps(num23, denum23);

        _mm256_storeu_ps((float*)c, result0);
        c += 4;
        _mm256_storeu_ps((float*)c, result1);
        c += 4;
    }

    // scalar tail for the remaining 0..7 points
    unsigned int number = eighthPoints * 8;
    for (; number < num_points; number++) {
        *c++ = (*a++) / (*b++);
    }
}
#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA */
#ifdef LV_HAVE_AVX512F
#include <immintrin.h>

/*!
 * \brief c = a / b using unaligned AVX512F, 16 complex points per iteration.
 *
 * a*conj(b)/|b|^2 scheme; |b|^2 is built by squaring, swapping each re/im pair
 * (shuffle 0xb1) and adding, which lands re^2+im^2 in both slots of the lane.
 */
static inline void volk_32fc_x2_divide_32fc_u_avx512(lv_32fc_t* cVector,
                                                     const lv_32fc_t* numeratorVector,
                                                     const lv_32fc_t* denumeratorVector,
                                                     unsigned int num_points)
{
    lv_32fc_t* c = cVector;
    const lv_32fc_t* a = numeratorVector;
    const lv_32fc_t* b = denumeratorVector;

    const unsigned int sixteenthPoints = num_points / 16;

    __m512 num01, num23, denum01, denum23;
    __m512 mag_sq01_shuf, mag_sq23_shuf, mag_sq01, mag_sq23;
    __m512 result0, result1;

    for (unsigned int number = 0; number < sixteenthPoints; number++) {
        num01 = _mm512_loadu_ps((float*)a);
        denum01 = _mm512_loadu_ps((float*)b);
        // num01 <- num01 * conj(denum01): re = ar*br + ai*bi, im = ai*br - ar*bi
        num01 = _mm512_fmsubadd_ps(
            _mm512_moveldup_ps(denum01), num01,
            _mm512_mul_ps(_mm512_shuffle_ps(num01, num01, 0xb1),
                          _mm512_movehdup_ps(denum01)));
        a += 8;
        b += 8;

        num23 = _mm512_loadu_ps((float*)a);
        denum23 = _mm512_loadu_ps((float*)b);
        num23 = _mm512_fmsubadd_ps(
            _mm512_moveldup_ps(denum23), num23,
            _mm512_mul_ps(_mm512_shuffle_ps(num23, num23, 0xb1),
                          _mm512_movehdup_ps(denum23)));
        a += 8;
        b += 8;

        // |b|^2 duplicated into the re and im slot of every complex lane
        mag_sq01_shuf = _mm512_shuffle_ps(denum01, denum01, 0xb1);
        mag_sq01 = _mm512_add_ps(_mm512_mul_ps(denum01, denum01),
                                 _mm512_mul_ps(mag_sq01_shuf, mag_sq01_shuf));

        mag_sq23_shuf = _mm512_shuffle_ps(denum23, denum23, 0xb1);
        mag_sq23 = _mm512_add_ps(_mm512_mul_ps(denum23, denum23),
                                 _mm512_mul_ps(mag_sq23_shuf, mag_sq23_shuf));

        result0 = _mm512_div_ps(num01, mag_sq01);
        result1 = _mm512_div_ps(num23, mag_sq23);

        _mm512_storeu_ps((float*)c, result0);
        c += 8;
        _mm512_storeu_ps((float*)c, result1);
        c += 8;
    }

    // scalar tail for the remaining 0..15 points
    unsigned int number = sixteenthPoints * 16;
    for (; number < num_points; number++) {
        *c++ = (*a++) / (*b++);
    }
}
#endif /* LV_HAVE_AVX512F */
344#ifndef INCLUDED_volk_32fc_x2_divide_32fc_a_H
345#define INCLUDED_volk_32fc_x2_divide_32fc_a_H
#ifdef LV_HAVE_SSE3
#include <pmmintrin.h>
#include <volk/volk_sse3_intrinsics.h>

/*!
 * \brief c = a / b using aligned SSE3 loads/stores, 4 complex points per iteration.
 *
 * Identical to the unaligned variant except that all vectors must be 16-byte
 * aligned. Uses a / b = (a * conj(b)) / |b|^2.
 */
static inline void volk_32fc_x2_divide_32fc_a_sse3(lv_32fc_t* cVector,
                                                   const lv_32fc_t* numeratorVector,
                                                   const lv_32fc_t* denumeratorVector,
                                                   unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int quarterPoints = num_points / 4;

    __m128 num01, num23, den01, den23, norm, result;
    lv_32fc_t* c = cVector;
    const lv_32fc_t* a = numeratorVector;
    const lv_32fc_t* b = denumeratorVector;

    for (; number < quarterPoints; number++) {
        num01 = _mm_load_ps((float*)a); // points 0 and 1
        den01 = _mm_load_ps((float*)b);
        num01 = _mm_complexconjugatemul_ps(num01, den01); // a * conj(b)
        a += 2;
        b += 2;

        num23 = _mm_load_ps((float*)a); // points 2 and 3
        den23 = _mm_load_ps((float*)b);
        num23 = _mm_complexconjugatemul_ps(num23, den23);
        a += 2;
        b += 2;

        // |b|^2 for all four points packed into one register
        norm = _mm_magnitudesquared_ps_sse3(den01, den23);
        // duplicate each |b|^2 so it lines up with its re/im pair
        den01 = _mm_unpacklo_ps(norm, norm);
        den23 = _mm_unpackhi_ps(norm, norm);

        result = _mm_div_ps(num01, den01);
        _mm_store_ps((float*)c, result);
        c += 2;
        result = _mm_div_ps(num23, den23);
        _mm_store_ps((float*)c, result);
        c += 2;
    }

    // scalar tail for the remaining 0..3 points
    number = quarterPoints * 4;
    for (; number < num_points; number++) {
        *c++ = (*a++) / (*b++);
    }
}
#endif /* LV_HAVE_SSE3 */
#ifdef LV_HAVE_AVX
#include <immintrin.h>
#include <volk/volk_avx_intrinsics.h>

/*!
 * \brief c = a / b using aligned AVX loads/stores, 8 complex points per iteration.
 *
 * Uses a / b = (a * conj(b)) / |b|^2; all vectors must be 32-byte aligned.
 */
static inline void volk_32fc_x2_divide_32fc_a_avx(lv_32fc_t* cVector,
                                                  const lv_32fc_t* numeratorVector,
                                                  const lv_32fc_t* denumeratorVector,
                                                  unsigned int num_points)
{
    lv_32fc_t* c = cVector;
    const lv_32fc_t* a = numeratorVector;
    const lv_32fc_t* b = denumeratorVector;

    const unsigned int eighthPoints = num_points / 8;

    __m256 num01, num23, denum01, denum23, complex_result, result0, result1;

    for (unsigned int number = 0; number < eighthPoints; number++) {
        num01 = _mm256_load_ps((float*)a);
        denum01 = _mm256_load_ps((float*)b);
        num01 = _mm256_complexconjugatemul_ps(num01, denum01); // a * conj(b)
        a += 4;
        b += 4;

        num23 = _mm256_load_ps((float*)a);
        denum23 = _mm256_load_ps((float*)b);
        num23 = _mm256_complexconjugatemul_ps(num23, denum23);
        a += 4;
        b += 4;

        // |b|^2 of all 8 points: hadd pairs re^2+im^2 (lane-interleaved order)
        complex_result = _mm256_hadd_ps(_mm256_mul_ps(denum01, denum01),
                                        _mm256_mul_ps(denum23, denum23));

        // broadcast each |b|^2 into the re and im slot of its own point
        denum01 = _mm256_shuffle_ps(complex_result, complex_result, 0x50);
        denum23 = _mm256_shuffle_ps(complex_result, complex_result, 0xfa);

        result0 = _mm256_div_ps(num01, denum01);
        result1 = _mm256_div_ps(num23, denum23);

        _mm256_store_ps((float*)c, result0);
        c += 4;
        _mm256_store_ps((float*)c, result1);
        c += 4;
    }

    // scalar tail for the remaining 0..7 points
    unsigned int number = eighthPoints * 8;
    for (; number < num_points; number++) {
        *c++ = (*a++) / (*b++);
    }
}
#endif /* LV_HAVE_AVX */
#if LV_HAVE_AVX2 && LV_HAVE_FMA
#include <immintrin.h>

/*!
 * \brief c = a / b using aligned AVX2+FMA, 8 complex points per iteration.
 *
 * Same scheme as the unaligned AVX2+FMA kernel; all vectors must be 32-byte
 * aligned. The conjugate multiply is fused with FMA (fmsubadd).
 */
static inline void volk_32fc_x2_divide_32fc_a_avx2_fma(lv_32fc_t* cVector,
                                                       const lv_32fc_t* numeratorVector,
                                                       const lv_32fc_t* denumeratorVector,
                                                       unsigned int num_points)
{
    lv_32fc_t* c = cVector;
    const lv_32fc_t* a = numeratorVector;
    const lv_32fc_t* b = denumeratorVector;

    const unsigned int eighthPoints = num_points / 8;

    __m256 num01, num23, denum01, denum23, complex_result, result0, result1;

    for (unsigned int number = 0; number < eighthPoints; number++) {
        num01 = _mm256_load_ps((float*)a);
        denum01 = _mm256_load_ps((float*)b);
        // num01 <- num01 * conj(denum01): re = ar*br + ai*bi, im = ai*br - ar*bi
        num01 = _mm256_fmsubadd_ps(
            _mm256_moveldup_ps(denum01), num01,
            _mm256_mul_ps(_mm256_permute_ps(num01, 0xb1), _mm256_movehdup_ps(denum01)));
        a += 4;
        b += 4;

        num23 = _mm256_load_ps((float*)a);
        denum23 = _mm256_load_ps((float*)b);
        num23 = _mm256_fmsubadd_ps(
            _mm256_moveldup_ps(denum23), num23,
            _mm256_mul_ps(_mm256_permute_ps(num23, 0xb1), _mm256_movehdup_ps(denum23)));
        a += 4;
        b += 4;

        // |b|^2 of all 8 points: hadd pairs re^2+im^2 (lane-interleaved order)
        complex_result = _mm256_hadd_ps(_mm256_mul_ps(denum01, denum01),
                                        _mm256_mul_ps(denum23, denum23));

        // broadcast each |b|^2 into the re and im slot of its own point
        denum01 = _mm256_shuffle_ps(complex_result, complex_result, 0x50);
        denum23 = _mm256_shuffle_ps(complex_result, complex_result, 0xfa);

        result0 = _mm256_div_ps(num01, denum01);
        result1 = _mm256_div_ps(num23, denum23);

        _mm256_store_ps((float*)c, result0);
        c += 4;
        _mm256_store_ps((float*)c, result1);
        c += 4;
    }

    // scalar tail for the remaining 0..7 points
    unsigned int number = eighthPoints * 8;
    for (; number < num_points; number++) {
        *c++ = (*a++) / (*b++);
    }
}
#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA */
#ifdef LV_HAVE_AVX512F
#include <immintrin.h>

/*!
 * \brief c = a / b using aligned AVX512F, 16 complex points per iteration.
 *
 * Same scheme as the unaligned AVX512F kernel; all vectors must be 64-byte
 * aligned. |b|^2 is built by squaring, swapping each re/im pair (shuffle 0xb1)
 * and adding, which lands re^2+im^2 in both slots of the lane.
 */
static inline void volk_32fc_x2_divide_32fc_a_avx512(lv_32fc_t* cVector,
                                                     const lv_32fc_t* numeratorVector,
                                                     const lv_32fc_t* denumeratorVector,
                                                     unsigned int num_points)
{
    lv_32fc_t* c = cVector;
    const lv_32fc_t* a = numeratorVector;
    const lv_32fc_t* b = denumeratorVector;

    const unsigned int sixteenthPoints = num_points / 16;

    __m512 num01, num23, denum01, denum23;
    __m512 mag_sq01_shuf, mag_sq23_shuf, mag_sq01, mag_sq23;
    __m512 result0, result1;

    for (unsigned int number = 0; number < sixteenthPoints; number++) {
        num01 = _mm512_load_ps((float*)a);
        denum01 = _mm512_load_ps((float*)b);
        // num01 <- num01 * conj(denum01): re = ar*br + ai*bi, im = ai*br - ar*bi
        num01 = _mm512_fmsubadd_ps(
            _mm512_moveldup_ps(denum01), num01,
            _mm512_mul_ps(_mm512_shuffle_ps(num01, num01, 0xb1),
                          _mm512_movehdup_ps(denum01)));
        a += 8;
        b += 8;

        num23 = _mm512_load_ps((float*)a);
        denum23 = _mm512_load_ps((float*)b);
        num23 = _mm512_fmsubadd_ps(
            _mm512_moveldup_ps(denum23), num23,
            _mm512_mul_ps(_mm512_shuffle_ps(num23, num23, 0xb1),
                          _mm512_movehdup_ps(denum23)));
        a += 8;
        b += 8;

        // |b|^2 duplicated into the re and im slot of every complex lane
        mag_sq01_shuf = _mm512_shuffle_ps(denum01, denum01, 0xb1);
        mag_sq01 = _mm512_add_ps(_mm512_mul_ps(denum01, denum01),
                                 _mm512_mul_ps(mag_sq01_shuf, mag_sq01_shuf));

        mag_sq23_shuf = _mm512_shuffle_ps(denum23, denum23, 0xb1);
        mag_sq23 = _mm512_add_ps(_mm512_mul_ps(denum23, denum23),
                                 _mm512_mul_ps(mag_sq23_shuf, mag_sq23_shuf));

        result0 = _mm512_div_ps(num01, mag_sq01);
        result1 = _mm512_div_ps(num23, mag_sq23);

        _mm512_store_ps((float*)c, result0);
        c += 8;
        _mm512_store_ps((float*)c, result1);
        c += 8;
    }

    // scalar tail for the remaining 0..15 points
    unsigned int number = sixteenthPoints * 16;
    for (; number < num_points; number++) {
        *c++ = (*a++) / (*b++);
    }
}
#endif /* LV_HAVE_AVX512F */
#ifdef LV_HAVE_NEON
#include <arm_neon.h>

/*!
 * \brief c = a / b on NEON (ARMv7-compatible), 4 complex points per iteration.
 *
 * vld2q deinterleaves re/im into separate registers. 1/|b|^2 is computed with
 * vrecpe plus two Newton-Raphson refinement steps (vrecps), so results may
 * differ from exact division in the last ulps.
 */
static inline void volk_32fc_x2_divide_32fc_neon(lv_32fc_t* cVector,
                                                 const lv_32fc_t* aVector,
                                                 const lv_32fc_t* bVector,
                                                 unsigned int num_points)
{
    lv_32fc_t* cPtr = cVector;
    const lv_32fc_t* aPtr = aVector;
    const lv_32fc_t* bPtr = bVector;

    float32x4x2_t aVal, bVal, cVal;
    float32x4_t bAbs, bAbsInv;

    const unsigned int quarterPoints = num_points / 4;
    unsigned int number = 0;
    for (; number < quarterPoints; number++) {
        aVal = vld2q_f32((const float*)(aPtr)); // val[0] = re, val[1] = im
        bVal = vld2q_f32((const float*)(bPtr));
        aPtr += 4;
        bPtr += 4;

        // |b|^2 = br^2 + bi^2
        bAbs = vmulq_f32(bVal.val[0], bVal.val[0]);
        bAbs = vmlaq_f32(bAbs, bVal.val[1], bVal.val[1]);

        // reciprocal estimate plus two Newton-Raphson iterations
        bAbsInv = vrecpeq_f32(bAbs);
        bAbsInv = vmulq_f32(bAbsInv, vrecpsq_f32(bAbsInv, bAbs));
        bAbsInv = vmulq_f32(bAbsInv, vrecpsq_f32(bAbsInv, bAbs));

        // re(c) = (ar*br + ai*bi) / |b|^2
        cVal.val[0] = vmulq_f32(aVal.val[0], bVal.val[0]);
        cVal.val[0] = vmlaq_f32(cVal.val[0], aVal.val[1], bVal.val[1]);
        cVal.val[0] = vmulq_f32(cVal.val[0], bAbsInv);

        // im(c) = (ai*br - ar*bi) / |b|^2
        cVal.val[1] = vmulq_f32(aVal.val[1], bVal.val[0]);
        cVal.val[1] = vmlsq_f32(cVal.val[1], aVal.val[0], bVal.val[1]);
        cVal.val[1] = vmulq_f32(cVal.val[1], bAbsInv);

        vst2q_f32((float*)(cPtr), cVal); // re-interleave and store
        cPtr += 4;
    }

    // scalar tail for the remaining 0..3 points
    for (number = quarterPoints * 4; number < num_points; number++) {
        *cPtr++ = (*aPtr++) / (*bPtr++);
    }
}
#endif /* LV_HAVE_NEON */
#ifdef LV_HAVE_NEONV8
#include <arm_neon.h>

/*!
 * \brief c = a / b on ARMv8 NEON, 4 complex points per iteration.
 *
 * ARMv8 has vector divide (vdivq) and fused multiply-add/subtract, so this
 * variant computes an exact 1/|b|^2 instead of the vrecpe estimate used by
 * the ARMv7 kernel.
 */
static inline void volk_32fc_x2_divide_32fc_neonv8(lv_32fc_t* cVector,
                                                   const lv_32fc_t* aVector,
                                                   const lv_32fc_t* bVector,
                                                   unsigned int num_points)
{
    lv_32fc_t* cPtr = cVector;
    const lv_32fc_t* aPtr = aVector;
    const lv_32fc_t* bPtr = bVector;

    float32x4x2_t aVal, bVal, cVal;
    float32x4_t bMagSq;

    const unsigned int quarterPoints = num_points / 4;
    unsigned int number = 0;

    for (; number < quarterPoints; number++) {
        aVal = vld2q_f32((const float*)(aPtr)); // val[0] = re, val[1] = im
        bVal = vld2q_f32((const float*)(bPtr));
        aPtr += 4;
        bPtr += 4;

        // |b|^2 = br^2 + bi^2, with a fused multiply-add
        bMagSq = vfmaq_f32(vmulq_f32(bVal.val[0], bVal.val[0]), bVal.val[1], bVal.val[1]);

        // exact reciprocal via the ARMv8 vector divide
        float32x4_t bMagSqInv = vdivq_f32(vdupq_n_f32(1.0f), bMagSq);

        // re(c) = (ar*br + ai*bi) / |b|^2
        cVal.val[0] =
            vfmaq_f32(vmulq_f32(aVal.val[0], bVal.val[0]), aVal.val[1], bVal.val[1]);
        cVal.val[0] = vmulq_f32(cVal.val[0], bMagSqInv);

        // im(c) = (ai*br - ar*bi) / |b|^2
        cVal.val[1] =
            vfmsq_f32(vmulq_f32(aVal.val[1], bVal.val[0]), aVal.val[0], bVal.val[1]);
        cVal.val[1] = vmulq_f32(cVal.val[1], bMagSqInv);

        vst2q_f32((float*)(cPtr), cVal); // re-interleave and store
        cPtr += 4;
    }

    // scalar tail for the remaining 0..3 points
    for (number = quarterPoints * 4; number < num_points; number++) {
        *cPtr++ = (*aPtr++) / (*bPtr++);
    }
}
#endif /* LV_HAVE_NEONV8 */
#ifdef LV_HAVE_RVV
#include <riscv_vector.h>

/*!
 * \brief c = a / b on RISC-V Vector, strip-mined with vsetvl.
 *
 * Complex values are loaded as packed 64-bit words; vnsrl by 0/32 splits them
 * into separate real and imaginary f32 vectors. Computes a*conj(b) * (1/|b|^2)
 * and re-interleaves the result with a widening add/multiply-accumulate.
 */
static inline void volk_32fc_x2_divide_32fc_rvv(lv_32fc_t* cVector,
                                                const lv_32fc_t* aVector,
                                                const lv_32fc_t* bVector,
                                                unsigned int num_points)
{
    uint64_t* out = (uint64_t*)cVector;
    size_t n = num_points;
    for (size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl, out += vl) {
        vl = __riscv_vsetvl_e32m4(n);
        vuint64m8_t va = __riscv_vle64_v_u64m8((const uint64_t*)aVector, vl);
        vuint64m8_t vb = __riscv_vle64_v_u64m8((const uint64_t*)bVector, vl);
        // deinterleave: low 32 bits = real part, high 32 bits = imaginary part
        vfloat32m4_t var = __riscv_vreinterpret_f32m4(__riscv_vnsrl(va, 0, vl));
        vfloat32m4_t vbr = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vb, 0, vl));
        vfloat32m4_t vai = __riscv_vreinterpret_f32m4(__riscv_vnsrl(va, 32, vl));
        vfloat32m4_t vbi = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vb, 32, vl));
        // mul = 1 / (br^2 + bi^2)
        vfloat32m4_t mul = __riscv_vfrdiv(
            __riscv_vfmacc(__riscv_vfmul(vbi, vbi, vl), vbr, vbr, vl), 1.0f, vl);
        // re = (ar*br + ai*bi) / |b|^2
        vfloat32m4_t vr = __riscv_vfmul(
            __riscv_vfmacc(__riscv_vfmul(var, vbr, vl), vai, vbi, vl), mul, vl);
        // im = (ai*br - ar*bi) / |b|^2
        vfloat32m4_t vi = __riscv_vfmul(
            __riscv_vfnmsac(__riscv_vfmul(vai, vbr, vl), var, vbi, vl), mul, vl);
        // re-interleave re/im back into packed 64-bit complex words:
        // widen re, then place im in the upper half via 0xFFFFFFFF multiply-acc
        vuint32m4_t vru = __riscv_vreinterpret_u32m4(vr);
        vuint32m4_t viu = __riscv_vreinterpret_u32m4(vi);
        vuint64m8_t v =
            __riscv_vwmaccu(__riscv_vwaddu_vv(vru, viu, vl), 0xFFFFFFFF, viu, vl);
        __riscv_vse64(out, v, vl);
    }
}
#endif /* LV_HAVE_RVV */
715#include <riscv_vector.h>
717static inline void volk_32fc_x2_divide_32fc_rvvseg(
lv_32fc_t* cVector,
720 unsigned int num_points)
722 size_t n = num_points;
723 for (
size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl, cVector += vl) {
724 vl = __riscv_vsetvl_e32m4(n);
725 vfloat32m4x2_t va = __riscv_vlseg2e32_v_f32m4x2((
const float*)aVector, vl);
726 vfloat32m4x2_t vb = __riscv_vlseg2e32_v_f32m4x2((
const float*)bVector, vl);
727 vfloat32m4_t var = __riscv_vget_f32m4(va, 0), vai = __riscv_vget_f32m4(va, 1);
728 vfloat32m4_t vbr = __riscv_vget_f32m4(vb, 0), vbi = __riscv_vget_f32m4(vb, 1);
729 vfloat32m4_t mul = __riscv_vfrdiv(
730 __riscv_vfmacc(__riscv_vfmul(vbi, vbi, vl), vbr, vbr, vl), 1.0f, vl);
731 vfloat32m4_t vr = __riscv_vfmul(
732 __riscv_vfmacc(__riscv_vfmul(var, vbr, vl), vai, vbi, vl), mul, vl);
733 vfloat32m4_t vi = __riscv_vfmul(
734 __riscv_vfnmsac(__riscv_vfmul(vai, vbr, vl), var, vbi, vl), mul, vl);
735 __riscv_vsseg2e32_v_f32m4x2(
736 (
float*)cVector, __riscv_vcreate_v_f32m4x2(vr, vi), vl);