44#ifndef INCLUDED_volk_8ic_x2_s32f_multiply_conjugate_32fc_a_H
45#define INCLUDED_volk_8ic_x2_s32f_multiply_conjugate_32fc_a_H
// AVX2 protokernel (aligned loads): c[i] = (a[i] * conj(b[i])) / scalar,
// widening 8-bit integer complex inputs to 32-bit float complex outputs,
// 8 complex samples per iteration.
// NOTE(review): this extract is missing several original lines (the
// 'static inline void' qualifier, the aVector/bVector/scalar parameters,
// the a/b/c working-pointer setup, the per-iteration pointer advances, and
// the scalar-tail 'temp' = a*conj(b) computation). Recover them from the
// upstream VOLK source before compiling.
55volk_8ic_x2_s32f_multiply_conjugate_32fc_a_avx2(
lv_32fc_t* cVector,
59 unsigned int num_points)
61 unsigned int number = 0;
62 const unsigned int oneEigthPoints = num_points / 8;
64 __m256i x, y, realz, imagz;
65 __m256 ret, retlo, rethi;
// Sign vector that flips every odd (imaginary) 16-bit lane: used to form
// the conjugate of b. _mm256_set_epi16 lists lanes high-to-low, so the
// lowest lane (real) gets +1 and the next (imag) gets -1.
69 __m256i conjugateSign =
70 _mm256_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1);
// Multiply by the reciprocal once instead of dividing every lane.
72 __m256 invScalar = _mm256_set1_ps(1.0 / scalar);
74 for (; number < oneEigthPoints; number++) {
// Load 16 int8 values (8 complex samples) and widen to int16 lanes.
76 x = _mm256_cvtepi8_epi16(_mm_load_si128((__m128i*)a));
77 y = _mm256_cvtepi8_epi16(_mm_load_si128((__m128i*)b));
// madd over (ar,ai)*(br,bi) pairs: ar*br + ai*bi = Re(a * conj(b)).
80 realz = _mm256_madd_epi16(x, y);
// Negate the imaginary lanes of b: y becomes (br, -bi) per sample.
83 y = _mm256_sign_epi16(y, conjugateSign);
// Swap the two 16-bit halves of each 32-bit pair: (br,-bi) -> (-bi,br),
// so the next madd yields ai*br - ar*bi = Im(a * conj(b)).
86 y = _mm256_shufflehi_epi16(_mm256_shufflelo_epi16(y, _MM_SHUFFLE(2, 3, 0, 1)),
87 _MM_SHUFFLE(2, 3, 0, 1));
90 imagz = _mm256_madd_epi16(x, y);
// Interleave real/imag int32 results back into (re,im) order, convert to
// float, and apply the 1/scalar scaling.
93 retlo = _mm256_cvtepi32_ps(_mm256_unpacklo_epi32(realz, imagz));
96 retlo = _mm256_mul_ps(retlo, invScalar);
99 rethi = _mm256_cvtepi32_ps(_mm256_unpackhi_epi32(realz, imagz));
102 rethi = _mm256_mul_ps(rethi, invScalar);
// Recombine 128-bit halves so samples are stored in their original order.
// NOTE(review): the 'c += 8' style pointer advance between the two stores
// was dropped by the extraction -- confirm against upstream.
104 ret = _mm256_permute2f128_ps(retlo, rethi, 0b00100000);
105 _mm256_store_ps((
float*)c, ret);
108 ret = _mm256_permute2f128_ps(retlo, rethi, 0b00110001);
109 _mm256_store_ps((
float*)c, ret);
// Scalar tail: process the num_points % 8 remainder one sample at a time.
116 number = oneEigthPoints * 8;
117 float* cFloatPtr = (
float*)&cVector[number];
118 int8_t* a8Ptr = (int8_t*)&aVector[number];
119 int8_t* b8Ptr = (int8_t*)&bVector[number];
120 for (; number < num_points; number++) {
121 float aReal = (float)*a8Ptr++;
122 float aImag = (float)*a8Ptr++;
124 float bReal = (float)*b8Ptr++;
125 float bImag = (float)*b8Ptr++;
// NOTE(review): 'temp' (the a * conj(b) product) is computed on original
// lines elided from this extract.
129 *cFloatPtr++ =
lv_creal(temp) / scalar;
130 *cFloatPtr++ =
lv_cimag(temp) / scalar;
137#include <smmintrin.h>
// SSE4.1 protokernel (aligned stores): c[i] = (a[i] * conj(b[i])) / scalar,
// 4 complex samples per iteration. Same algorithm as the AVX2 variant but
// on 128-bit vectors.
// NOTE(review): the extraction dropped the function qualifier, the
// aVector/bVector/scalar parameters, the a/b/c working-pointer setup, the
// '__m128 ret;' declaration, pointer advances, and the scalar-tail 'temp'
// computation -- recover from upstream VOLK before compiling.
140volk_8ic_x2_s32f_multiply_conjugate_32fc_a_sse4_1(
lv_32fc_t* cVector,
144 unsigned int num_points)
146 unsigned int number = 0;
147 const unsigned int quarterPoints = num_points / 4;
149 __m128i x, y, realz, imagz;
// Flips each imaginary (odd) 16-bit lane to conjugate b.
154 __m128i conjugateSign = _mm_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1);
// Multiply by the reciprocal instead of dividing per lane.
156 __m128 invScalar = _mm_set_ps1(1.0 / scalar);
158 for (; number < quarterPoints; number++) {
// Load 8 int8 values (4 complex samples) and widen to int16 lanes.
160 x = _mm_cvtepi8_epi16(_mm_loadl_epi64((__m128i*)a));
161 y = _mm_cvtepi8_epi16(_mm_loadl_epi64((__m128i*)b));
// madd over (ar,ai)*(br,bi) pairs: ar*br + ai*bi = Re(a * conj(b)).
164 realz = _mm_madd_epi16(x, y);
// y becomes (br, -bi) per sample.
167 y = _mm_sign_epi16(y, conjugateSign);
// Swap 16-bit halves of each 32-bit pair: (br,-bi) -> (-bi,br), so the
// next madd yields ai*br - ar*bi = Im(a * conj(b)).
170 y = _mm_shufflehi_epi16(_mm_shufflelo_epi16(y, _MM_SHUFFLE(2, 3, 0, 1)),
171 _MM_SHUFFLE(2, 3, 0, 1));
174 imagz = _mm_madd_epi16(x, y);
// Interleave low real/imag pairs, convert to float, scale, store.
177 ret = _mm_cvtepi32_ps(_mm_unpacklo_epi32(realz, imagz));
180 ret = _mm_mul_ps(ret, invScalar);
183 _mm_store_ps((
float*)c, ret);
// Same for the high two samples.
187 ret = _mm_cvtepi32_ps(_mm_unpackhi_epi32(realz, imagz));
190 ret = _mm_mul_ps(ret, invScalar);
193 _mm_store_ps((
float*)c, ret);
// Scalar tail: process the num_points % 4 remainder one sample at a time.
200 number = quarterPoints * 4;
201 float* cFloatPtr = (
float*)&cVector[number];
202 int8_t* a8Ptr = (int8_t*)&aVector[number];
203 int8_t* b8Ptr = (int8_t*)&bVector[number];
204 for (; number < num_points; number++) {
205 float aReal = (float)*a8Ptr++;
206 float aImag = (float)*a8Ptr++;
208 float bReal = (float)*b8Ptr++;
209 float bImag = (float)*b8Ptr++;
// NOTE(review): 'temp' (a * conj(b)) is computed on elided original lines.
213 *cFloatPtr++ =
lv_creal(temp) / scalar;
214 *cFloatPtr++ =
lv_cimag(temp) / scalar;
220#ifdef LV_HAVE_GENERIC
// Generic (scalar) protokernel, guarded by LV_HAVE_GENERIC above:
// c[i] = (a[i] * conj(b[i])) / scalar for every sample.
// NOTE(review): the function name/signature lines were dropped by the
// extraction -- presumably this is the *_generic variant taking
// (lv_32fc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector,
// const float scalar, unsigned int num_points); confirm against upstream.
227 unsigned int num_points)
229 unsigned int number = 0;
// Treat the complex output as an interleaved float array (re, im, re, im...).
230 float* cPtr = (
float*)cVector;
// Hoist the division out of the loop.
231 const float invScalar = 1.0 / scalar;
// Walk the int8 complex inputs component-by-component.
232 int8_t* a8Ptr = (int8_t*)aVector;
233 int8_t* b8Ptr = (int8_t*)bVector;
234 for (number = 0; number < num_points; number++) {
235 float aReal = (float)*a8Ptr++;
236 float aImag = (float)*a8Ptr++;
238 float bReal = (float)*b8Ptr++;
239 float bImag = (float)*b8Ptr++;
// NOTE(review): the lines forming 'temp' (a * conj(b), presumably via
// lv_cmake/lv_conj) were elided from this extract.
243 *cPtr++ = (
lv_creal(temp) * invScalar);
244 *cPtr++ = (
lv_cimag(temp) * invScalar);
// NEON protokernel: c[i] = (a[i] * conj(b[i])) / scalar, 8 complex samples
// per iteration using de-interleaving vld2/vst2 loads and stores.
// NOTE(review): the function name/signature and the aPtr/bPtr/cPtr setup
// lines were dropped by the extraction -- recover from upstream VOLK.
257 unsigned int num_points)
259 unsigned int number = 0;
260 const unsigned int eighthPoints = num_points / 8;
// Broadcast 1/scalar so the division is a per-lane multiply.
265 const float invScalar = 1.0f / scalar;
266 float32x4_t vInvScalar = vdupq_n_f32(invScalar);
268 int8x8x2_t aVal, bVal;
269 int16x8_t aReal, aImag, bReal, bImag;
270 int32x4_t realLo, realHi, imagLo, imagHi;
271 float32x4_t realFloatLo, realFloatHi, imagFloatLo, imagFloatHi;
273 for (; number < eighthPoints; number++) {
// vld2 de-interleaves: val[0] = 8 real parts, val[1] = 8 imaginary parts.
275 aVal = vld2_s8((
const int8_t*)aPtr);
276 bVal = vld2_s8((
const int8_t*)bPtr);
// Widen int8 components to int16 for overflow-safe products.
279 aReal = vmovl_s8(aVal.val[0]);
280 aImag = vmovl_s8(aVal.val[1]);
281 bReal = vmovl_s8(bVal.val[0]);
282 bImag = vmovl_s8(bVal.val[1]);
// Re(a*conj(b)) = ar*br + ai*bi ; Im(a*conj(b)) = ai*br - ar*bi.
// NOTE(review): the middle argument of these vmlal/vmlsl calls (original
// lines 290 and 293) was dropped by the extraction -- by the math above it
// should be vget_low_s16(aImag) resp. vget_low_s16(aReal); confirm.
289 realLo = vmlal_s16(vmull_s16(vget_low_s16(aReal), vget_low_s16(bReal)),
291 vget_low_s16(bImag));
292 imagLo = vmlsl_s16(vmull_s16(vget_low_s16(aImag), vget_low_s16(bReal)),
294 vget_low_s16(bImag));
// Same computation for the high four samples.
297 realHi = vmlal_s16(vmull_s16(vget_high_s16(aReal), vget_high_s16(bReal)),
298 vget_high_s16(aImag),
299 vget_high_s16(bImag));
300 imagHi = vmlsl_s16(vmull_s16(vget_high_s16(aImag), vget_high_s16(bReal)),
301 vget_high_s16(aReal),
302 vget_high_s16(bImag));
// Convert to float and apply the 1/scalar scaling.
305 realFloatLo = vmulq_f32(vcvtq_f32_s32(realLo), vInvScalar);
306 imagFloatLo = vmulq_f32(vcvtq_f32_s32(imagLo), vInvScalar);
307 realFloatHi = vmulq_f32(vcvtq_f32_s32(realHi), vInvScalar);
308 imagFloatHi = vmulq_f32(vcvtq_f32_s32(imagHi), vInvScalar);
// vst2 re-interleaves real/imag planes back into complex order.
311 float32x4x2_t resultLo;
312 resultLo.val[0] = realFloatLo;
313 resultLo.val[1] = imagFloatLo;
314 vst2q_f32((
float*)cPtr, resultLo);
318 float32x4x2_t resultHi;
319 resultHi.val[0] = realFloatHi;
320 resultHi.val[1] = imagFloatHi;
321 vst2q_f32((
float*)cPtr, resultHi);
// Scalar tail for the num_points % 8 remainder.
328 number = eighthPoints * 8;
329 float* cFloatPtr = (
float*)&cVector[number];
330 int8_t* a8Ptr = (int8_t*)&aVector[number];
331 int8_t* b8Ptr = (int8_t*)&bVector[number];
332 for (; number < num_points; number++) {
333 float aReal_f = (float)*a8Ptr++;
334 float aImag_f = (float)*a8Ptr++;
336 float bReal_f = (float)*b8Ptr++;
337 float bImag_f = (float)*b8Ptr++;
// NOTE(review): 'temp' (a * conj(b)) is computed on elided original lines.
341 *cFloatPtr++ =
lv_creal(temp) * invScalar;
342 *cFloatPtr++ =
lv_cimag(temp) * invScalar;
350#ifndef INCLUDED_volk_8ic_x2_s32f_multiply_conjugate_32fc_u_H
351#define INCLUDED_volk_8ic_x2_s32f_multiply_conjugate_32fc_u_H
358#include <immintrin.h>
// AVX2 protokernel (unaligned loads/stores): identical algorithm to the
// aligned *_a_avx2 variant, but uses _mm_loadu_si128/_mm256_storeu_ps so the
// buffers need not be 32-byte aligned.
// NOTE(review): as with the aligned variant, the extraction dropped the
// 'static inline void' qualifier, the aVector/bVector/scalar parameters,
// the a/b/c working-pointer setup, pointer advances, and the scalar-tail
// 'temp' computation -- recover from upstream VOLK before compiling.
361volk_8ic_x2_s32f_multiply_conjugate_32fc_u_avx2(
lv_32fc_t* cVector,
365 unsigned int num_points)
367 unsigned int number = 0;
368 const unsigned int oneEigthPoints = num_points / 8;
370 __m256i x, y, realz, imagz;
371 __m256 ret, retlo, rethi;
// Sign vector that negates every imaginary (odd) 16-bit lane -> conj(b).
375 __m256i conjugateSign =
376 _mm256_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1);
// Multiply by the reciprocal instead of dividing per lane.
378 __m256 invScalar = _mm256_set1_ps(1.0 / scalar);
380 for (; number < oneEigthPoints; number++) {
// Unaligned load of 16 int8 values (8 complex samples), widened to int16.
382 x = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)a));
383 y = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)b));
// madd over (ar,ai)*(br,bi) pairs: ar*br + ai*bi = Re(a * conj(b)).
386 realz = _mm256_madd_epi16(x, y);
// y becomes (br, -bi) per sample.
389 y = _mm256_sign_epi16(y, conjugateSign);
// Swap 16-bit halves of each 32-bit pair so the next madd yields
// ai*br - ar*bi = Im(a * conj(b)).
392 y = _mm256_shufflehi_epi16(_mm256_shufflelo_epi16(y, _MM_SHUFFLE(2, 3, 0, 1)),
393 _MM_SHUFFLE(2, 3, 0, 1));
396 imagz = _mm256_madd_epi16(x, y);
// Interleave real/imag, convert to float, scale.
399 retlo = _mm256_cvtepi32_ps(_mm256_unpacklo_epi32(realz, imagz));
402 retlo = _mm256_mul_ps(retlo, invScalar);
405 rethi = _mm256_cvtepi32_ps(_mm256_unpackhi_epi32(realz, imagz));
408 rethi = _mm256_mul_ps(rethi, invScalar);
// Recombine 128-bit halves into original sample order and store unaligned.
410 ret = _mm256_permute2f128_ps(retlo, rethi, 0b00100000);
411 _mm256_storeu_ps((
float*)c, ret);
414 ret = _mm256_permute2f128_ps(retlo, rethi, 0b00110001);
415 _mm256_storeu_ps((
float*)c, ret);
// Scalar tail for the num_points % 8 remainder.
422 number = oneEigthPoints * 8;
423 float* cFloatPtr = (
float*)&cVector[number];
424 int8_t* a8Ptr = (int8_t*)&aVector[number];
425 int8_t* b8Ptr = (int8_t*)&bVector[number];
426 for (; number < num_points; number++) {
427 float aReal = (float)*a8Ptr++;
428 float aImag = (float)*a8Ptr++;
430 float bReal = (float)*b8Ptr++;
431 float bImag = (float)*b8Ptr++;
// NOTE(review): 'temp' (a * conj(b)) is computed on elided original lines.
435 *cFloatPtr++ =
lv_creal(temp) / scalar;
436 *cFloatPtr++ =
lv_cimag(temp) / scalar;
443#include <riscv_vector.h>
// RISC-V Vector protokernel: c[i] = (a[i] * conj(b[i])) / scalar, processing
// vl samples per strip-mined iteration. Loads the interleaved int8 complex
// data as int16 elements and splits real/imag with narrowing shifts.
// NOTE(review): parameter lines (aVector/bVector/scalar) and the
// declarations of 'vi' (vint16m2_t, original line 459) and 'v'
// (the widened u64 result, original line 465) were dropped by the
// extraction -- recover from upstream VOLK before compiling.
445static inline void volk_8ic_x2_s32f_multiply_conjugate_32fc_rvv(
lv_32fc_t* cVector,
449 unsigned int num_points)
451 size_t n = num_points;
// Strip-mine: each pass handles vl complex samples and advances all pointers.
452 for (
size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl, cVector += vl) {
453 vl = __riscv_vsetvl_e8m1(n);
// Load each (re, im) int8 pair as one little-endian int16 element.
454 vint16m2_t va = __riscv_vle16_v_i16m2((
const int16_t*)aVector, vl);
455 vint16m2_t vb = __riscv_vle16_v_i16m2((
const int16_t*)bVector, vl);
// Narrowing shifts split the pair: low byte = real, high byte = imag.
456 vint8m1_t var = __riscv_vnsra(va, 0, vl), vai = __riscv_vnsra(va, 8, vl);
457 vint8m1_t vbr = __riscv_vnsra(vb, 0, vl), vbi = __riscv_vnsra(vb, 8, vl);
// vr = ar*br + ai*bi = Re(a * conj(b)), widening int8 products to int16.
458 vint16m2_t vr = __riscv_vwmacc(__riscv_vwmul(var, vbr, vl), vai, vbi, vl);
// ai*br - ar*bi = Im(a * conj(b)); assigned to 'vi' on the elided line 459.
460 __riscv_vsub(__riscv_vwmul(vai, vbr, vl), __riscv_vwmul(var, vbi, vl), vl);
// Widen to f32 and scale by 1/scalar.
461 vfloat32m4_t vrf = __riscv_vfmul(__riscv_vfwcvt_f(vr, vl), 1.0 / scalar, vl);
462 vfloat32m4_t vif = __riscv_vfmul(__riscv_vfwcvt_f(vi, vl), 1.0 / scalar, vl);
// Re-interleave via integer bit tricks: vwaddu gives (vru + viu) widened to
// u64, then vwmaccu adds (2^32 - 1) * viu, so the total is vru + 2^32*viu,
// i.e. real in the low 32 bits and imag in the high 32 bits of each u64.
463 vuint32m4_t vru = __riscv_vreinterpret_u32m4(vrf);
464 vuint32m4_t viu = __riscv_vreinterpret_u32m4(vif);
466 __riscv_vwmaccu(__riscv_vwaddu_vv(vru, viu, vl), 0xFFFFFFFF, viu, vl);
// Store each packed (re, im) pair as one 64-bit element.
467 __riscv_vse64((uint64_t*)cVector, v, vl);
473#include <riscv_vector.h>
// RISC-V Vector protokernel using segmented loads/stores:
// c[i] = (a[i] * conj(b[i])) / scalar. vlseg2e8 de-interleaves the int8
// complex input into separate real/imag vectors; vsseg2e32 re-interleaves
// the float output -- avoiding the bit-packing tricks of the *_rvv variant.
// NOTE(review): the 'static inline void' qualifier, the
// aVector/bVector/scalar parameter lines, and the declaration of 'vi'
// (vint16m2_t, original line 490) were dropped by the extraction --
// recover from upstream VOLK before compiling.
476volk_8ic_x2_s32f_multiply_conjugate_32fc_rvvseg(
lv_32fc_t* cVector,
480 unsigned int num_points)
482 size_t n = num_points;
// Strip-mine: each pass handles vl complex samples and advances all pointers.
483 for (
size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl, cVector += vl) {
484 vl = __riscv_vsetvl_e8m1(n);
// Segmented loads split interleaved (re, im) pairs into two int8 vectors.
485 vint8m1x2_t va = __riscv_vlseg2e8_v_i8m1x2((
const int8_t*)aVector, vl);
486 vint8m1x2_t vb = __riscv_vlseg2e8_v_i8m1x2((
const int8_t*)bVector, vl);
487 vint8m1_t var = __riscv_vget_i8m1(va, 0), vai = __riscv_vget_i8m1(va, 1);
488 vint8m1_t vbr = __riscv_vget_i8m1(vb, 0), vbi = __riscv_vget_i8m1(vb, 1);
// vr = ar*br + ai*bi = Re(a * conj(b)), widening int8 products to int16.
489 vint16m2_t vr = __riscv_vwmacc(__riscv_vwmul(var, vbr, vl), vai, vbi, vl);
// ai*br - ar*bi = Im(a * conj(b)); assigned to 'vi' on the elided line 490.
491 __riscv_vsub(__riscv_vwmul(vai, vbr, vl), __riscv_vwmul(var, vbi, vl), vl);
// Widen to f32 and scale by 1/scalar.
492 vfloat32m4_t vrf = __riscv_vfmul(__riscv_vfwcvt_f(vr, vl), 1.0 / scalar, vl);
493 vfloat32m4_t vif = __riscv_vfmul(__riscv_vfwcvt_f(vi, vl), 1.0 / scalar, vl);
// Segmented store re-interleaves the real/imag planes into complex output.
494 __riscv_vsseg2e32_v_f32m4x2(
495 (
float*)cVector, __riscv_vcreate_v_f32m4x2(vrf, vif), vl);