Vector Optimized Library of Kernels 3.3.0
Architecture-tuned implementations of math kernels

volk_8ic_x2_s32f_multiply_conjugate_32fc.h
Go to the documentation of this file.
1/* -*- c++ -*- */
2/*
3 * Copyright 2012, 2014 Free Software Foundation, Inc.
4 *
5 * This file is part of VOLK
6 *
7 * SPDX-License-Identifier: LGPL-3.0-or-later
8 */
9
44#ifndef INCLUDED_volk_8ic_x2_s32f_multiply_conjugate_32fc_a_H
45#define INCLUDED_volk_8ic_x2_s32f_multiply_conjugate_32fc_a_H
46
47#include <inttypes.h>
48#include <stdio.h>
49#include <volk/volk_complex.h>
50
#ifdef LV_HAVE_AVX2
#include <immintrin.h>

/*!
 * \brief Multiplies aVector by the conjugate of bVector and scales each
 *        component of the 32fc result by 1/scalar. AVX2, aligned buffers.
 *
 * \param cVector output buffer of num_points complex floats (16-byte aligned)
 * \param aVector first input vector of 8-bit complex samples
 * \param bVector second input vector; its conjugate is used in the multiply
 * \param scalar each output component is divided by this value
 * \param num_points number of complex points to process
 */
static inline void
volk_8ic_x2_s32f_multiply_conjugate_32fc_a_avx2(lv_32fc_t* cVector,
                                                const lv_8sc_t* aVector,
                                                const lv_8sc_t* bVector,
                                                const float scalar,
                                                unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int oneEigthPoints = num_points / 8;

    __m256i x, y, realz, imagz;
    __m256 ret, retlo, rethi;
    lv_32fc_t* c = cVector;
    const lv_8sc_t* a = aVector;
    const lv_8sc_t* b = bVector;
    // Per-16-bit-lane multipliers (1, -1, 1, -1, ...); negates imaginary lanes
    __m256i conjugateSign =
        _mm256_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1);

    __m256 invScalar = _mm256_set1_ps(1.0 / scalar);

    for (; number < oneEigthPoints; number++) {
        // Sign-extend 8 complex 8-bit samples to 16-bit lanes
        x = _mm256_cvtepi8_epi16(_mm_load_si128((__m128i*)a));
        y = _mm256_cvtepi8_epi16(_mm_load_si128((__m128i*)b));

        // real = ar*br + ai*bi  (real part of a * conj(b))
        realz = _mm256_madd_epi16(x, y);

        // Negate the imaginary lanes of b: [br, bi] -> [br, -bi]
        y = _mm256_sign_epi16(y, conjugateSign);

        // Swap real/imag within each pair: [br, -bi] -> [-bi, br]
        y = _mm256_shufflehi_epi16(_mm256_shufflelo_epi16(y, _MM_SHUFFLE(2, 3, 0, 1)),
                                   _MM_SHUFFLE(2, 3, 0, 1));

        // imag = ai*br - ar*bi  (imaginary part of a * conj(b))
        imagz = _mm256_madd_epi16(x, y);

        // Interleave real/imag (low 32-bit halves) and convert to float
        retlo = _mm256_cvtepi32_ps(_mm256_unpacklo_epi32(realz, imagz));

        // Normalize the floating point values
        retlo = _mm256_mul_ps(retlo, invScalar);

        // Interleave real/imag (high 32-bit halves) and convert to float
        rethi = _mm256_cvtepi32_ps(_mm256_unpackhi_epi32(realz, imagz));

        // Normalize the floating point values
        rethi = _mm256_mul_ps(rethi, invScalar);

        // Recombine the 128-bit lanes into the natural output order
        ret = _mm256_permute2f128_ps(retlo, rethi, 0b00100000);
        _mm256_store_ps((float*)c, ret);
        c += 4;

        ret = _mm256_permute2f128_ps(retlo, rethi, 0b00110001);
        _mm256_store_ps((float*)c, ret);
        c += 4;

        a += 8;
        b += 8;
    }

    // Scalar tail for the remaining (num_points % 8) points
    number = oneEigthPoints * 8;
    float* cFloatPtr = (float*)&cVector[number];
    int8_t* a8Ptr = (int8_t*)&aVector[number];
    int8_t* b8Ptr = (int8_t*)&bVector[number];
    for (; number < num_points; number++) {
        float aReal = (float)*a8Ptr++;
        float aImag = (float)*a8Ptr++;
        lv_32fc_t aVal = lv_cmake(aReal, aImag);
        float bReal = (float)*b8Ptr++;
        float bImag = (float)*b8Ptr++;
        lv_32fc_t bVal = lv_cmake(bReal, -bImag); // conjugate of b
        lv_32fc_t temp = aVal * bVal;

        *cFloatPtr++ = lv_creal(temp) / scalar;
        *cFloatPtr++ = lv_cimag(temp) / scalar;
    }
}
#endif /* LV_HAVE_AVX2 */
134
135
#ifdef LV_HAVE_SSE4_1
#include <smmintrin.h>

/*!
 * \brief Multiplies aVector by the conjugate of bVector and scales each
 *        component of the 32fc result by 1/scalar. SSE4.1, aligned buffers.
 *
 * \param cVector output buffer of num_points complex floats (16-byte aligned)
 * \param aVector first input vector of 8-bit complex samples
 * \param bVector second input vector; its conjugate is used in the multiply
 * \param scalar each output component is divided by this value
 * \param num_points number of complex points to process
 */
static inline void
volk_8ic_x2_s32f_multiply_conjugate_32fc_a_sse4_1(lv_32fc_t* cVector,
                                                  const lv_8sc_t* aVector,
                                                  const lv_8sc_t* bVector,
                                                  const float scalar,
                                                  unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int quarterPoints = num_points / 4;

    __m128i x, y, realz, imagz;
    __m128 ret;
    lv_32fc_t* c = cVector;
    const lv_8sc_t* a = aVector;
    const lv_8sc_t* b = bVector;
    // Per-16-bit-lane multipliers (1, -1, ...); negates imaginary lanes
    __m128i conjugateSign = _mm_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1);

    __m128 invScalar = _mm_set_ps1(1.0 / scalar);

    for (; number < quarterPoints; number++) {
        // Sign-extend 4 complex 8-bit samples to 16-bit lanes
        x = _mm_cvtepi8_epi16(_mm_loadl_epi64((__m128i*)a));
        y = _mm_cvtepi8_epi16(_mm_loadl_epi64((__m128i*)b));

        // real = ar*br + ai*bi  (real part of a * conj(b))
        realz = _mm_madd_epi16(x, y);

        // Negate the imaginary lanes of b: [br, bi] -> [br, -bi]
        y = _mm_sign_epi16(y, conjugateSign);

        // Swap real/imag within each pair: [br, -bi] -> [-bi, br]
        y = _mm_shufflehi_epi16(_mm_shufflelo_epi16(y, _MM_SHUFFLE(2, 3, 0, 1)),
                                _MM_SHUFFLE(2, 3, 0, 1));

        // imag = ai*br - ar*bi  (imaginary part of a * conj(b))
        imagz = _mm_madd_epi16(x, y);

        // Interleave real/imag (low halves) and convert to float
        ret = _mm_cvtepi32_ps(_mm_unpacklo_epi32(realz, imagz));

        // Normalize the floating point values
        ret = _mm_mul_ps(ret, invScalar);

        // Store the floating point values
        _mm_store_ps((float*)c, ret);
        c += 2;

        // Interleave real/imag (high halves) and convert to float
        ret = _mm_cvtepi32_ps(_mm_unpackhi_epi32(realz, imagz));

        // Normalize the floating point values
        ret = _mm_mul_ps(ret, invScalar);

        // Store the floating point values
        _mm_store_ps((float*)c, ret);
        c += 2;

        a += 4;
        b += 4;
    }

    // Scalar tail for the remaining (num_points % 4) points
    number = quarterPoints * 4;
    float* cFloatPtr = (float*)&cVector[number];
    int8_t* a8Ptr = (int8_t*)&aVector[number];
    int8_t* b8Ptr = (int8_t*)&bVector[number];
    for (; number < num_points; number++) {
        float aReal = (float)*a8Ptr++;
        float aImag = (float)*a8Ptr++;
        lv_32fc_t aVal = lv_cmake(aReal, aImag);
        float bReal = (float)*b8Ptr++;
        float bImag = (float)*b8Ptr++;
        lv_32fc_t bVal = lv_cmake(bReal, -bImag); // conjugate of b
        lv_32fc_t temp = aVal * bVal;

        *cFloatPtr++ = lv_creal(temp) / scalar;
        *cFloatPtr++ = lv_cimag(temp) / scalar;
    }
}
#endif /* LV_HAVE_SSE4_1 */
218
219
#ifdef LV_HAVE_GENERIC

/*!
 * \brief Multiplies aVector by the conjugate of bVector and scales each
 *        component of the 32fc result by 1/scalar. Portable reference version.
 *
 * \param cVector output buffer of num_points complex floats
 * \param aVector first input vector of 8-bit complex samples
 * \param bVector second input vector; its conjugate is used in the multiply
 * \param scalar each output component is divided by this value
 * \param num_points number of complex points to process
 */
static inline void
volk_8ic_x2_s32f_multiply_conjugate_32fc_generic(lv_32fc_t* cVector,
                                                 const lv_8sc_t* aVector,
                                                 const lv_8sc_t* bVector,
                                                 const float scalar,
                                                 unsigned int num_points)
{
    unsigned int number = 0;
    float* cPtr = (float*)cVector;
    const float invScalar = 1.0 / scalar;
    int8_t* a8Ptr = (int8_t*)aVector;
    int8_t* b8Ptr = (int8_t*)bVector;
    for (number = 0; number < num_points; number++) {
        float aReal = (float)*a8Ptr++;
        float aImag = (float)*a8Ptr++;
        lv_32fc_t aVal = lv_cmake(aReal, aImag);
        float bReal = (float)*b8Ptr++;
        float bImag = (float)*b8Ptr++;
        lv_32fc_t bVal = lv_cmake(bReal, -bImag); // conjugate of b
        lv_32fc_t temp = aVal * bVal;

        *cPtr++ = (lv_creal(temp) * invScalar);
        *cPtr++ = (lv_cimag(temp) * invScalar);
    }
}
#endif /* LV_HAVE_GENERIC */
248
249
#ifdef LV_HAVE_NEON
#include <arm_neon.h>

/*!
 * \brief Multiplies aVector by the conjugate of bVector and scales each
 *        component of the 32fc result by 1/scalar. NEON version.
 *
 * \param cVector output buffer of num_points complex floats
 * \param aVector first input vector of 8-bit complex samples
 * \param bVector second input vector; its conjugate is used in the multiply
 * \param scalar each output component is divided by this value
 * \param num_points number of complex points to process
 */
static inline void
volk_8ic_x2_s32f_multiply_conjugate_32fc_neon(lv_32fc_t* cVector,
                                              const lv_8sc_t* aVector,
                                              const lv_8sc_t* bVector,
                                              const float scalar,
                                              unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int eighthPoints = num_points / 8;

    lv_32fc_t* cPtr = cVector;
    const lv_8sc_t* aPtr = aVector;
    const lv_8sc_t* bPtr = bVector;
    const float invScalar = 1.0f / scalar;
    float32x4_t vInvScalar = vdupq_n_f32(invScalar);

    int8x8x2_t aVal, bVal;
    int16x8_t aReal, aImag, bReal, bImag;
    int32x4_t realLo, realHi, imagLo, imagHi;
    float32x4_t realFloatLo, realFloatHi, imagFloatLo, imagFloatHi;

    for (; number < eighthPoints; number++) {
        // Load 8 complex 8-bit values (deinterleaved)
        aVal = vld2_s8((const int8_t*)aPtr);
        bVal = vld2_s8((const int8_t*)bPtr);

        // Widen to 16-bit
        aReal = vmovl_s8(aVal.val[0]);
        aImag = vmovl_s8(aVal.val[1]);
        bReal = vmovl_s8(bVal.val[0]);
        bImag = vmovl_s8(bVal.val[1]);

        // Complex multiply with conjugate: (ar + ai*j) * (br - bi*j)
        // real = ar*br + ai*bi
        // imag = ai*br - ar*bi

        // Low half (first 4 complex values)
        realLo = vmlal_s16(vmull_s16(vget_low_s16(aReal), vget_low_s16(bReal)),
                           vget_low_s16(aImag),
                           vget_low_s16(bImag));
        imagLo = vmlsl_s16(vmull_s16(vget_low_s16(aImag), vget_low_s16(bReal)),
                           vget_low_s16(aReal),
                           vget_low_s16(bImag));

        // High half (next 4 complex values)
        realHi = vmlal_s16(vmull_s16(vget_high_s16(aReal), vget_high_s16(bReal)),
                           vget_high_s16(aImag),
                           vget_high_s16(bImag));
        imagHi = vmlsl_s16(vmull_s16(vget_high_s16(aImag), vget_high_s16(bReal)),
                           vget_high_s16(aReal),
                           vget_high_s16(bImag));

        // Convert to float and scale
        realFloatLo = vmulq_f32(vcvtq_f32_s32(realLo), vInvScalar);
        imagFloatLo = vmulq_f32(vcvtq_f32_s32(imagLo), vInvScalar);
        realFloatHi = vmulq_f32(vcvtq_f32_s32(realHi), vInvScalar);
        imagFloatHi = vmulq_f32(vcvtq_f32_s32(imagHi), vInvScalar);

        // Store interleaved (first 4 complex values)
        float32x4x2_t resultLo;
        resultLo.val[0] = realFloatLo;
        resultLo.val[1] = imagFloatLo;
        vst2q_f32((float*)cPtr, resultLo);
        cPtr += 4;

        // Store interleaved (next 4 complex values)
        float32x4x2_t resultHi;
        resultHi.val[0] = realFloatHi;
        resultHi.val[1] = imagFloatHi;
        vst2q_f32((float*)cPtr, resultHi);
        cPtr += 4;

        aPtr += 8;
        bPtr += 8;
    }

    // Scalar tail for the remaining (num_points % 8) points
    number = eighthPoints * 8;
    float* cFloatPtr = (float*)&cVector[number];
    int8_t* a8Ptr = (int8_t*)&aVector[number];
    int8_t* b8Ptr = (int8_t*)&bVector[number];
    for (; number < num_points; number++) {
        float aReal_f = (float)*a8Ptr++;
        float aImag_f = (float)*a8Ptr++;
        lv_32fc_t aVal_c = lv_cmake(aReal_f, aImag_f);
        float bReal_f = (float)*b8Ptr++;
        float bImag_f = (float)*b8Ptr++;
        lv_32fc_t bVal_c = lv_cmake(bReal_f, -bImag_f); // conjugate of b
        lv_32fc_t temp = aVal_c * bVal_c;

        *cFloatPtr++ = lv_creal(temp) * invScalar;
        *cFloatPtr++ = lv_cimag(temp) * invScalar;
    }
}
#endif /* LV_HAVE_NEON */
346
347
348#endif /* INCLUDED_volk_8ic_x2_s32f_multiply_conjugate_32fc_a_H */
349
350#ifndef INCLUDED_volk_8ic_x2_s32f_multiply_conjugate_32fc_u_H
351#define INCLUDED_volk_8ic_x2_s32f_multiply_conjugate_32fc_u_H
352
353#include <inttypes.h>
354#include <stdio.h>
355#include <volk/volk_complex.h>
356
#ifdef LV_HAVE_AVX2
#include <immintrin.h>

/*!
 * \brief Multiplies aVector by the conjugate of bVector and scales each
 *        component of the 32fc result by 1/scalar. AVX2, unaligned buffers.
 *
 * \param cVector output buffer of num_points complex floats
 * \param aVector first input vector of 8-bit complex samples
 * \param bVector second input vector; its conjugate is used in the multiply
 * \param scalar each output component is divided by this value
 * \param num_points number of complex points to process
 */
static inline void
volk_8ic_x2_s32f_multiply_conjugate_32fc_u_avx2(lv_32fc_t* cVector,
                                                const lv_8sc_t* aVector,
                                                const lv_8sc_t* bVector,
                                                const float scalar,
                                                unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int oneEigthPoints = num_points / 8;

    __m256i x, y, realz, imagz;
    __m256 ret, retlo, rethi;
    lv_32fc_t* c = cVector;
    const lv_8sc_t* a = aVector;
    const lv_8sc_t* b = bVector;
    // Per-16-bit-lane multipliers (1, -1, 1, -1, ...); negates imaginary lanes
    __m256i conjugateSign =
        _mm256_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1);

    __m256 invScalar = _mm256_set1_ps(1.0 / scalar);

    for (; number < oneEigthPoints; number++) {
        // Sign-extend 8 complex 8-bit samples to 16-bit lanes
        x = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)a));
        y = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)b));

        // real = ar*br + ai*bi  (real part of a * conj(b))
        realz = _mm256_madd_epi16(x, y);

        // Negate the imaginary lanes of b: [br, bi] -> [br, -bi]
        y = _mm256_sign_epi16(y, conjugateSign);

        // Swap real/imag within each pair: [br, -bi] -> [-bi, br]
        y = _mm256_shufflehi_epi16(_mm256_shufflelo_epi16(y, _MM_SHUFFLE(2, 3, 0, 1)),
                                   _MM_SHUFFLE(2, 3, 0, 1));

        // imag = ai*br - ar*bi  (imaginary part of a * conj(b))
        imagz = _mm256_madd_epi16(x, y);

        // Interleave real/imag (low 32-bit halves) and convert to float
        retlo = _mm256_cvtepi32_ps(_mm256_unpacklo_epi32(realz, imagz));

        // Normalize the floating point values
        retlo = _mm256_mul_ps(retlo, invScalar);

        // Interleave real/imag (high 32-bit halves) and convert to float
        rethi = _mm256_cvtepi32_ps(_mm256_unpackhi_epi32(realz, imagz));

        // Normalize the floating point values
        rethi = _mm256_mul_ps(rethi, invScalar);

        // Recombine the 128-bit lanes into the natural output order
        ret = _mm256_permute2f128_ps(retlo, rethi, 0b00100000);
        _mm256_storeu_ps((float*)c, ret);
        c += 4;

        ret = _mm256_permute2f128_ps(retlo, rethi, 0b00110001);
        _mm256_storeu_ps((float*)c, ret);
        c += 4;

        a += 8;
        b += 8;
    }

    // Scalar tail for the remaining (num_points % 8) points
    number = oneEigthPoints * 8;
    float* cFloatPtr = (float*)&cVector[number];
    int8_t* a8Ptr = (int8_t*)&aVector[number];
    int8_t* b8Ptr = (int8_t*)&bVector[number];
    for (; number < num_points; number++) {
        float aReal = (float)*a8Ptr++;
        float aImag = (float)*a8Ptr++;
        lv_32fc_t aVal = lv_cmake(aReal, aImag);
        float bReal = (float)*b8Ptr++;
        float bImag = (float)*b8Ptr++;
        lv_32fc_t bVal = lv_cmake(bReal, -bImag); // conjugate of b
        lv_32fc_t temp = aVal * bVal;

        *cFloatPtr++ = lv_creal(temp) / scalar;
        *cFloatPtr++ = lv_cimag(temp) / scalar;
    }
}
#endif /* LV_HAVE_AVX2 */
440
441
#ifdef LV_HAVE_RVV
#include <riscv_vector.h>

/*!
 * \brief Multiplies aVector by the conjugate of bVector and scales each
 *        component of the 32fc result by 1/scalar. RISC-V Vector version.
 *
 * \param cVector output buffer of num_points complex floats
 * \param aVector first input vector of 8-bit complex samples
 * \param bVector second input vector; its conjugate is used in the multiply
 * \param scalar each output component is divided by this value
 * \param num_points number of complex points to process
 */
static inline void volk_8ic_x2_s32f_multiply_conjugate_32fc_rvv(lv_32fc_t* cVector,
                                                                const lv_8sc_t* aVector,
                                                                const lv_8sc_t* bVector,
                                                                const float scalar,
                                                                unsigned int num_points)
{
    size_t n = num_points;
    for (size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl, cVector += vl) {
        vl = __riscv_vsetvl_e8m1(n);
        // Load each interleaved 8-bit complex pair as one 16-bit unit, then
        // split into real (low byte) and imaginary (high byte) vectors via
        // narrowing shifts.
        vint16m2_t va = __riscv_vle16_v_i16m2((const int16_t*)aVector, vl);
        vint16m2_t vb = __riscv_vle16_v_i16m2((const int16_t*)bVector, vl);
        vint8m1_t var = __riscv_vnsra(va, 0, vl), vai = __riscv_vnsra(va, 8, vl);
        vint8m1_t vbr = __riscv_vnsra(vb, 0, vl), vbi = __riscv_vnsra(vb, 8, vl);
        // a * conj(b): real = ar*br + ai*bi, imag = ai*br - ar*bi
        vint16m2_t vr = __riscv_vwmacc(__riscv_vwmul(var, vbr, vl), vai, vbi, vl);
        vint16m2_t vi =
            __riscv_vsub(__riscv_vwmul(vai, vbr, vl), __riscv_vwmul(var, vbi, vl), vl);
        // Widen to float and scale by 1/scalar
        vfloat32m4_t vrf = __riscv_vfmul(__riscv_vfwcvt_f(vr, vl), 1.0 / scalar, vl);
        vfloat32m4_t vif = __riscv_vfmul(__riscv_vfwcvt_f(vi, vl), 1.0 / scalar, vl);
        // Re-interleave: pack each (real, imag) float pair into one 64-bit
        // element (imag bits in the upper half) and store as uint64.
        vuint32m4_t vru = __riscv_vreinterpret_u32m4(vrf);
        vuint32m4_t viu = __riscv_vreinterpret_u32m4(vif);
        vuint64m8_t v =
            __riscv_vwmaccu(__riscv_vwaddu_vv(vru, viu, vl), 0xFFFFFFFF, viu, vl);
        __riscv_vse64((uint64_t*)cVector, v, vl);
    }
}
#endif /*LV_HAVE_RVV*/
471
#ifdef LV_HAVE_RVVSEG
#include <riscv_vector.h>

/*!
 * \brief Multiplies aVector by the conjugate of bVector and scales each
 *        component of the 32fc result by 1/scalar. RISC-V Vector version
 *        using segmented (deinterleaving) loads/stores.
 *
 * \param cVector output buffer of num_points complex floats
 * \param aVector first input vector of 8-bit complex samples
 * \param bVector second input vector; its conjugate is used in the multiply
 * \param scalar each output component is divided by this value
 * \param num_points number of complex points to process
 */
static inline void
volk_8ic_x2_s32f_multiply_conjugate_32fc_rvvseg(lv_32fc_t* cVector,
                                                const lv_8sc_t* aVector,
                                                const lv_8sc_t* bVector,
                                                const float scalar,
                                                unsigned int num_points)
{
    size_t n = num_points;
    for (size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl, cVector += vl) {
        vl = __riscv_vsetvl_e8m1(n);
        // Segmented loads deinterleave real/imag components directly
        vint8m1x2_t va = __riscv_vlseg2e8_v_i8m1x2((const int8_t*)aVector, vl);
        vint8m1x2_t vb = __riscv_vlseg2e8_v_i8m1x2((const int8_t*)bVector, vl);
        vint8m1_t var = __riscv_vget_i8m1(va, 0), vai = __riscv_vget_i8m1(va, 1);
        vint8m1_t vbr = __riscv_vget_i8m1(vb, 0), vbi = __riscv_vget_i8m1(vb, 1);
        // a * conj(b): real = ar*br + ai*bi, imag = ai*br - ar*bi
        vint16m2_t vr = __riscv_vwmacc(__riscv_vwmul(var, vbr, vl), vai, vbi, vl);
        vint16m2_t vi =
            __riscv_vsub(__riscv_vwmul(vai, vbr, vl), __riscv_vwmul(var, vbi, vl), vl);
        // Widen to float and scale by 1/scalar
        vfloat32m4_t vrf = __riscv_vfmul(__riscv_vfwcvt_f(vr, vl), 1.0 / scalar, vl);
        vfloat32m4_t vif = __riscv_vfmul(__riscv_vfwcvt_f(vi, vl), 1.0 / scalar, vl);
        // Segmented store re-interleaves real/imag into the output
        __riscv_vsseg2e32_v_f32m4x2(
            (float*)cVector, __riscv_vcreate_v_f32m4x2(vrf, vif), vl);
    }
}

#endif /*LV_HAVE_RVVSEG*/
500
501#endif /* INCLUDED_volk_8ic_x2_s32f_multiply_conjugate_32fc_u_H */