66#ifndef INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_H
67#define INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_H
/* Generic (scalar) implementation fragment: for every constellation point,
 * compute the squared distance |symbol - point|^2 and multiply by `scalar`.
 * NOTE(review): the function signature and several body lines are missing
 * from this extracted view -- confirm against the full kernel source. */
76 const unsigned int num_points)
79 for (
unsigned int i = 0; i < num_points; ++i) {
/* Complex difference; the squared magnitude is formed on the (missing)
 * following lines. `points` advances one complex sample per iteration. */
84 diff = symbol - *points++;
/* AVX2 aligned-load variant: target[i] = scalar * |src0 - points[i]|^2.
 * Processes 8 complex points per main-loop iteration, then 4-point and
 * 2-point tails.  Requires 32-byte-aligned `points`/`target`.
 * NOTE(review): several lines (parameter list, helper-call name, pointer
 * advances, closing braces) are missing from this extracted view. */
96volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_avx2(
float* target,
100 unsigned int num_points)
/* Each lv_32fc_t point is 8 bytes (2 x float32). */
102 const unsigned int num_bytes = num_points * 8;
105 __m256 xmm_points0, xmm_points1, xmm_result;
/* bound = number of 64-byte (8-point) main-loop iterations. */
107 const unsigned int bound = num_bytes >> 6;
/* Broadcast the (re,im) pair of the symbol as one 64-bit lane to all lanes. */
110 const __m256 xmm_symbol = _mm256_castpd_ps(_mm256_broadcast_sd((
const double*)src0));
111 const __m128 xmm128_symbol = _mm256_extractf128_ps(xmm_symbol, 1);
114 const __m256 xmm_scalar = _mm256_broadcast_ss(&scalar);
115 const __m128 xmm128_scalar = _mm256_extractf128_ps(xmm_scalar, 1);
/* Permutation that undoes the lane-interleaving produced by _mm256_hadd_ps. */
118 const __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
120 for (
unsigned int i = 0; i < bound; ++i) {
121 xmm_points0 = _mm256_load_ps((
float*)points);
122 xmm_points1 = _mm256_load_ps((
float*)(points + 4));
/* Arguments to a distance/scale helper whose name line is missing here. */
127 xmm_symbol, xmm_symbol, xmm_points0, xmm_points1, xmm_scalar);
129 _mm256_store_ps(target, xmm_result);
/* Tail: 4 remaining complex points (32 bytes). */
133 if (num_bytes >> 5 & 1) {
134 xmm_points0 = _mm256_load_ps((
float*)points);
136 xmm4 = _mm256_sub_ps(xmm_symbol, xmm_points0);
140 xmm6 = _mm256_mul_ps(xmm4, xmm4);
/* hadd sums re^2+im^2 per point; permute restores output order. */
142 xmm4 = _mm256_hadd_ps(xmm6, xmm6);
143 xmm4 = _mm256_permutevar8x32_ps(xmm4, idx);
145 xmm_result = _mm256_mul_ps(xmm4, xmm_scalar);
147 xmm9 = _mm256_extractf128_ps(xmm_result, 1);
148 _mm_store_ps(target, xmm9);
/* Tail: 2 remaining complex points (16 bytes). */
152 if (num_bytes >> 4 & 1) {
153 xmm9 = _mm_load_ps((
float*)points);
155 xmm10 = _mm_sub_ps(xmm128_symbol, xmm9);
159 xmm9 = _mm_mul_ps(xmm10, xmm10);
161 xmm10 = _mm_hadd_ps(xmm9, xmm9);
163 xmm10 = _mm_mul_ps(xmm10, xmm128_scalar);
/* Store the two results from the high half of the register. */
165 _mm_storeh_pi((__m64*)target, xmm10);
176#include <immintrin.h>
/* AVX aligned variant (presumably `..._a_avx` -- the name line is missing
 * from this view): 8 complex points per iteration; the leftover `remainder`
 * points are presumably handled by a generic fallback on missing lines. */
184 unsigned int num_points)
186 const int eightsPoints = num_points / 8;
187 const int remainder = num_points - 8 * eightsPoints;
189 __m256 xmm_points0, xmm_points1, xmm_result;
/* Broadcast the complex symbol (re,im) to every 64-bit lane. */
192 const __m256 xmm_symbol = _mm256_castpd_ps(_mm256_broadcast_sd((
const double*)src0));
195 const __m256 xmm_scalar = _mm256_broadcast_ss(&scalar);
197 for (
int i = 0; i < eightsPoints; ++i) {
198 xmm_points0 = _mm256_load_ps((
float*)points);
199 xmm_points1 = _mm256_load_ps((
float*)(points + 4));
/* Arguments to a distance/scale helper whose name line is missing here. */
203 xmm_symbol, xmm_symbol, xmm_points0, xmm_points1, xmm_scalar);
205 _mm256_store_ps(target, xmm_result);
217#include <pmmintrin.h>
/* SSE3 aligned variant (presumably `..._a_sse3` -- name line missing from
 * this view): 4 complex points per main iteration, then a 2-point and a
 * 1-point tail (`leftovers1` handling is on missing lines). */
225 unsigned int num_points)
227 __m128 xmm_points0, xmm_points1, xmm_result;
235 const int quarterPoints = num_points / 4;
/* leftovers0 in {0,1}: one extra pair of points after the 4-point loop. */
236 const int leftovers0 = (num_points / 2) - 2 * quarterPoints;
237 const int leftovers1 = num_points % 2;
/* Broadcast the complex symbol (re,im) into both halves of the register. */
240 const __m128 xmm_symbol = _mm_castpd_ps(_mm_load1_pd((
const double*)src0));
243 const __m128 xmm_scalar = _mm_load1_ps(&scalar);
245 for (
int i = 0; i < quarterPoints; ++i) {
246 xmm_points0 = _mm_load_ps((
float*)points);
247 xmm_points1 = _mm_load_ps((
float*)(points + 2));
/* Arguments to a distance/scale helper whose name line is missing here. */
252 xmm_symbol, xmm_symbol, xmm_points0, xmm_points1, xmm_scalar);
254 _mm_store_ps(target, xmm_result);
/* Tail: one load of 2 complex points; hadd forms re^2+im^2 per point. */
258 for (
int i = 0; i < leftovers0; ++i) {
259 xmm_points0 = _mm_load_ps((
float*)points);
262 xmm_points0 = _mm_sub_ps(xmm_symbol, xmm_points0);
263 xmm_points0 = _mm_mul_ps(xmm_points0, xmm_points0);
264 xmm_points0 = _mm_hadd_ps(xmm_points0, xmm_points0);
265 xmm_result = _mm_mul_ps(xmm_points0, xmm_scalar);
267 _mm_storeh_pi((__m64*)target, xmm_result);
278#include <xmmintrin.h>
/* SSE aligned variant (presumably `..._a_sse` -- name line missing from
 * this view): 4 complex points per iteration; leftover handling is on
 * missing lines. */
284 unsigned int num_points)
286 const __m128 xmm_scalar = _mm_set1_ps(scalar);
/* Broadcast the complex symbol (re,im) into both halves of the register. */
287 const __m128 xmm_symbol = _mm_castpd_ps(_mm_load1_pd((
const double*)src0));
289 for (
unsigned i = 0; i < num_points / 4; ++i) {
290 __m128 xmm_points0 = _mm_load_ps((
float*)points);
291 __m128 xmm_points1 = _mm_load_ps((
float*)(points + 2));
/* Arguments to a distance/scale helper whose name line is missing here. */
294 xmm_symbol, xmm_symbol, xmm_points0, xmm_points1, xmm_scalar);
295 _mm_store_ps((
float*)target, xmm_result);
/* Portable fallback section; only this signature fragment is visible here. */
303#ifdef LV_HAVE_GENERIC
309 unsigned int num_points)
/* NEON variant (presumably `..._neon` -- name line missing from this view):
 * deinterleaving load of 4 complex points per iteration, squared distance
 * via multiply-accumulate, tail delegated to the generic kernel. */
325 unsigned int num_points)
327 unsigned int number = 0;
328 const unsigned int quarterPoints = num_points / 4;
/* Splat real and imaginary parts of the symbol into separate registers. */
331 const float32x4_t symbolReal = vdupq_n_f32(
lv_creal(*src0));
332 const float32x4_t symbolImag = vdupq_n_f32(
lv_cimag(*src0));
333 const float32x4_t vScalar = vdupq_n_f32(scalar);
335 for (; number < quarterPoints; number++) {
/* vld2q deinterleaves: val[0] = 4 reals, val[1] = 4 imags. */
337 float32x4x2_t pts = vld2q_f32((
const float*)points);
341 float32x4_t diffReal = vsubq_f32(symbolReal, pts.val[0]);
342 float32x4_t diffImag = vsubq_f32(symbolImag, pts.val[1]);
/* result = dr*dr + di*di, then scale. */
345 float32x4_t result = vmulq_f32(diffReal, diffReal);
346 result = vmlaq_f32(result, diffImag, diffImag);
347 result = vmulq_f32(result, vScalar);
349 vst1q_f32(target, result);
/* Tail: remaining (num_points % 4) points go through the generic kernel
 * (callee name is on a missing line). */
355 target, *src0, points, scalar, num_points - quarterPoints * 4);
/* NEONv8 (AArch64) variant: like the NEON kernel but unrolled to 8 complex
 * points per iteration and using fused multiply-add (vfmaq_f32).
 * NOTE(review): parameter lines, pointer advances, and the tail's generic
 * fallback call are on lines missing from this view. */
364volk_32fc_x2_s32f_square_dist_scalar_mult_32f_neonv8(
float* target,
368 unsigned int num_points)
370 unsigned int number = 0;
371 const unsigned int eighthPoints = num_points / 8;
/* Splat real and imaginary parts of the symbol into separate registers. */
374 const float32x4_t symbolReal = vdupq_n_f32(
lv_creal(*src0));
375 const float32x4_t symbolImag = vdupq_n_f32(
lv_cimag(*src0));
376 const float32x4_t vScalar = vdupq_n_f32(scalar);
378 for (; number < eighthPoints; number++) {
/* Two deinterleaving loads: 8 complex points total. */
382 float32x4x2_t pts0 = vld2q_f32((
const float*)points);
383 float32x4x2_t pts1 = vld2q_f32((
const float*)(points + 4));
387 float32x4_t diffReal0 = vsubq_f32(symbolReal, pts0.val[0]);
388 float32x4_t diffImag0 = vsubq_f32(symbolImag, pts0.val[1]);
389 float32x4_t diffReal1 = vsubq_f32(symbolReal, pts1.val[0]);
390 float32x4_t diffImag1 = vsubq_f32(symbolImag, pts1.val[1]);
/* dr*dr + di*di with a single fused multiply-add per half. */
393 float32x4_t result0 =
394 vfmaq_f32(vmulq_f32(diffReal0, diffReal0), diffImag0, diffImag0);
395 float32x4_t result1 =
396 vfmaq_f32(vmulq_f32(diffReal1, diffReal1), diffImag1, diffImag1);
399 result0 = vmulq_f32(result0, vScalar);
400 result1 = vmulq_f32(result1, vScalar);
402 vst1q_f32(target, result0);
403 vst1q_f32(target + 4, result1);
/* Tail: remaining (num_points % 8) points handled on missing lines,
 * presumably via the generic kernel. */
408 const unsigned int remaining = num_points - eighthPoints * 8;
416#ifndef INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_H
417#define INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_H
423#include <immintrin.h>
/* AVX2 unaligned variant: identical structure to the aligned `_a_avx2`
 * kernel but using loadu/storeu, so no alignment requirement on
 * `points`/`target`.  NOTE(review): several lines (parameter list,
 * helper-call name, pointer advances, closing braces) are missing from
 * this extracted view. */
427volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_avx2(
float* target,
431 unsigned int num_points)
/* Each lv_32fc_t point is 8 bytes (2 x float32). */
433 const unsigned int num_bytes = num_points * 8;
436 __m256 xmm_points0, xmm_points1, xmm_result;
/* bound = number of 64-byte (8-point) main-loop iterations. */
438 const unsigned int bound = num_bytes >> 6;
/* Broadcast the (re,im) pair of the symbol as one 64-bit lane to all lanes. */
441 const __m256 xmm_symbol = _mm256_castpd_ps(_mm256_broadcast_sd((
const double*)src0));
442 const __m128 xmm128_symbol = _mm256_extractf128_ps(xmm_symbol, 1);
445 const __m256 xmm_scalar = _mm256_broadcast_ss(&scalar);
446 const __m128 xmm128_scalar = _mm256_extractf128_ps(xmm_scalar, 1);
/* Permutation that undoes the lane-interleaving produced by _mm256_hadd_ps. */
449 const __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
451 for (
unsigned int i = 0; i < bound; ++i) {
452 xmm_points0 = _mm256_loadu_ps((
float*)points);
453 xmm_points1 = _mm256_loadu_ps((
float*)(points + 4));
/* Arguments to a distance/scale helper whose name line is missing here. */
458 xmm_symbol, xmm_symbol, xmm_points0, xmm_points1, xmm_scalar);
460 _mm256_storeu_ps(target, xmm_result);
/* Tail: 4 remaining complex points (32 bytes). */
464 if (num_bytes >> 5 & 1) {
465 xmm_points0 = _mm256_loadu_ps((
float*)points);
467 xmm4 = _mm256_sub_ps(xmm_symbol, xmm_points0);
471 xmm6 = _mm256_mul_ps(xmm4, xmm4);
/* hadd sums re^2+im^2 per point; permute restores output order. */
473 xmm4 = _mm256_hadd_ps(xmm6, xmm6);
474 xmm4 = _mm256_permutevar8x32_ps(xmm4, idx);
476 xmm_result = _mm256_mul_ps(xmm4, xmm_scalar);
478 xmm9 = _mm256_extractf128_ps(xmm_result, 1);
479 _mm_storeu_ps(target, xmm9);
/* Tail: 2 remaining complex points (16 bytes). */
483 if (num_bytes >> 4 & 1) {
484 xmm9 = _mm_loadu_ps((
float*)points);
486 xmm10 = _mm_sub_ps(xmm128_symbol, xmm9);
490 xmm9 = _mm_mul_ps(xmm10, xmm10);
492 xmm10 = _mm_hadd_ps(xmm9, xmm9);
494 xmm10 = _mm_mul_ps(xmm10, xmm128_scalar);
/* Store the two results from the high half of the register. */
496 _mm_storeh_pi((__m64*)target, xmm10);
507#include <immintrin.h>
/* AVX unaligned variant (presumably `..._u_avx` -- name line missing from
 * this view): 8 complex points per iteration with loadu/storeu; leftover
 * `remainder` points presumably fall back to the generic kernel on
 * missing lines. */
515 unsigned int num_points)
517 const int eightsPoints = num_points / 8;
518 const int remainder = num_points - 8 * eightsPoints;
520 __m256 xmm_points0, xmm_points1, xmm_result;
/* Broadcast the complex symbol (re,im) to every 64-bit lane. */
523 const __m256 xmm_symbol = _mm256_castpd_ps(_mm256_broadcast_sd((
const double*)src0));
526 const __m256 xmm_scalar = _mm256_broadcast_ss(&scalar);
528 for (
int i = 0; i < eightsPoints; ++i) {
529 xmm_points0 = _mm256_loadu_ps((
float*)points);
530 xmm_points1 = _mm256_loadu_ps((
float*)(points + 4));
/* Arguments to a distance/scale helper whose name line is missing here. */
534 xmm_symbol, xmm_symbol, xmm_points0, xmm_points1, xmm_scalar);
536 _mm256_storeu_ps(target, xmm_result);
548#include <pmmintrin.h>
/* SSE3 unaligned variant (presumably `..._u_sse3` -- name line missing
 * from this view): 4 complex points per main iteration with loadu/storeu,
 * then a 2-point tail; the final odd point (`leftovers1`) is handled on
 * missing lines. */
556 unsigned int num_points)
558 __m128 xmm_points0, xmm_points1, xmm_result;
566 const int quarterPoints = num_points / 4;
/* leftovers0 in {0,1}: one extra pair of points after the 4-point loop. */
567 const int leftovers0 = (num_points / 2) - 2 * quarterPoints;
568 const int leftovers1 = num_points % 2;
/* Broadcast the complex symbol (re,im) into both halves of the register. */
571 const __m128 xmm_symbol = _mm_castpd_ps(_mm_load1_pd((
const double*)src0));
574 const __m128 xmm_scalar = _mm_load1_ps(&scalar);
576 for (
int i = 0; i < quarterPoints; ++i) {
577 xmm_points0 = _mm_loadu_ps((
float*)points);
578 xmm_points1 = _mm_loadu_ps((
float*)(points + 2));
/* Arguments to a distance/scale helper whose name line is missing here. */
583 xmm_symbol, xmm_symbol, xmm_points0, xmm_points1, xmm_scalar);
585 _mm_storeu_ps(target, xmm_result);
/* Tail: one load of 2 complex points; hadd forms re^2+im^2 per point. */
589 for (
int i = 0; i < leftovers0; ++i) {
590 xmm_points0 = _mm_loadu_ps((
float*)points);
593 xmm_points0 = _mm_sub_ps(xmm_symbol, xmm_points0);
594 xmm_points0 = _mm_mul_ps(xmm_points0, xmm_points0);
595 xmm_points0 = _mm_hadd_ps(xmm_points0, xmm_points0);
596 xmm_result = _mm_mul_ps(xmm_points0, xmm_scalar);
598 _mm_storeh_pi((__m64*)target, xmm_result);
609#include <xmmintrin.h>
/* SSE unaligned variant (presumably `..._u_sse` -- name line missing from
 * this view): 4 complex points per iteration with loadu/storeu; leftover
 * handling is on missing lines. */
615 unsigned int num_points)
617 const __m128 xmm_scalar = _mm_set1_ps(scalar);
/* Broadcast the complex symbol (re,im) into both halves of the register. */
618 const __m128 xmm_symbol = _mm_castpd_ps(_mm_load1_pd((
const double*)src0));
620 for (
unsigned i = 0; i < num_points / 4; ++i) {
621 __m128 xmm_points0 = _mm_loadu_ps((
float*)points);
622 __m128 xmm_points1 = _mm_loadu_ps((
float*)(points + 2));
/* Arguments to a distance/scale helper whose name line is missing here. */
625 xmm_symbol, xmm_symbol, xmm_points0, xmm_points1, xmm_scalar);
626 _mm_storeu_ps((
float*)target, xmm_result);
635#include <riscv_vector.h>
/* RISC-V Vector variant: strip-mined loop (vl chosen per iteration by
 * vsetvl), deinterleaving complex samples via a 64-bit load plus narrowing
 * shifts.  NOTE(review): parameter lines and the closing braces are on
 * lines missing from this extracted view. */
638volk_32fc_x2_s32f_square_dist_scalar_mult_32f_rvv(
float* target,
642 unsigned int num_points)
644 size_t vlmax = __riscv_vsetvlmax_e32m4();
/* Splat symbol real/imag parts and the scale factor across full vectors. */
645 vfloat32m4_t var = __riscv_vfmv_v_f_f32m4(
lv_creal(*src0), vlmax);
646 vfloat32m4_t vai = __riscv_vfmv_v_f_f32m4(
lv_cimag(*src0), vlmax);
647 vfloat32m4_t vscale = __riscv_vfmv_v_f_f32m4(scalar, vlmax);
649 size_t n = num_points;
650 for (
size_t vl; n > 0; n -= vl, target += vl, points += vl) {
651 vl = __riscv_vsetvl_e32m4(n);
/* Load each complex sample as one u64; vnsrl by 0/32 splits it into the
 * real (low 32 bits) and imaginary (high 32 bits) float halves. */
652 vuint64m8_t vb = __riscv_vle64_v_u64m8((
const uint64_t*)points, vl);
653 vfloat32m4_t vbr = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vb, 0, vl));
654 vfloat32m4_t vbi = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vb, 32, vl));
655 vfloat32m4_t vr = __riscv_vfsub(var, vbr, vl);
656 vfloat32m4_t vi = __riscv_vfsub(vai, vbi, vl);
/* v = vr*vr + vi*vi via fused multiply-accumulate, then scale and store. */
657 vfloat32m4_t v = __riscv_vfmacc(__riscv_vfmul(vi, vi, vl), vr, vr, vl);
658 __riscv_vse32(target, __riscv_vfmul(v, vscale, vl), vl);
664#include <riscv_vector.h>
/* RISC-V Vector segmented-load variant: same math as the `_rvv` kernel but
 * deinterleaves real/imag with a two-field segment load (vlseg2e32) instead
 * of 64-bit loads plus narrowing shifts.  NOTE(review): parameter lines and
 * the closing braces are on lines missing from this extracted view. */
667volk_32fc_x2_s32f_square_dist_scalar_mult_32f_rvvseg(
float* target,
671 unsigned int num_points)
673 size_t vlmax = __riscv_vsetvlmax_e32m4();
/* Splat symbol real/imag parts and the scale factor across full vectors. */
674 vfloat32m4_t var = __riscv_vfmv_v_f_f32m4(
lv_creal(*src0), vlmax);
675 vfloat32m4_t vai = __riscv_vfmv_v_f_f32m4(
lv_cimag(*src0), vlmax);
676 vfloat32m4_t vscale = __riscv_vfmv_v_f_f32m4(scalar, vlmax);
678 size_t n = num_points;
679 for (
size_t vl; n > 0; n -= vl, target += vl, points += vl) {
680 vl = __riscv_vsetvl_e32m4(n);
/* Segment load: field 0 = reals, field 1 = imags. */
681 vfloat32m4x2_t vb = __riscv_vlseg2e32_v_f32m4x2((
const float*)points, vl);
682 vfloat32m4_t vbr = __riscv_vget_f32m4(vb, 0);
683 vfloat32m4_t vbi = __riscv_vget_f32m4(vb, 1);
684 vfloat32m4_t vr = __riscv_vfsub(var, vbr, vl);
685 vfloat32m4_t vi = __riscv_vfsub(vai, vbi, vl);
/* v = vr*vr + vi*vi via fused multiply-accumulate, then scale and store. */
686 vfloat32m4_t v = __riscv_vfmacc(__riscv_vfmul(vi, vi, vl), vr, vr, vl);
687 __riscv_vse32(target, __riscv_vfmul(v, vscale, vl), vl);