Vector Optimized Library of Kernels (VOLK) 3.3.0
Architecture-tuned implementations of math kernels
volk_8ic_x2_multiply_conjugate_16ic.h — documentation of this file.
/* -*- c++ -*- */
/*
 * Copyright 2012, 2014 Free Software Foundation, Inc.
 *
 * This file is part of VOLK
 *
 * SPDX-License-Identifier: LGPL-3.0-or-later
 */

#ifndef INCLUDED_volk_8ic_x2_multiply_conjugate_16ic_a_H
#define INCLUDED_volk_8ic_x2_multiply_conjugate_16ic_a_H

#include <inttypes.h>
#include <limits.h>
#include <stdio.h>
#include <volk/volk_complex.h>
#ifdef LV_HAVE_AVX2
#include <immintrin.h>
28static inline void volk_8ic_x2_multiply_conjugate_16ic_a_avx2(lv_16sc_t* cVector,
29 const lv_8sc_t* aVector,
30 const lv_8sc_t* bVector,
31 unsigned int num_points)
32{
33 unsigned int number = 0;
34 const unsigned int quarterPoints = num_points / 8;
35
36 __m256i x, y, realz, imagz;
37 lv_16sc_t* c = cVector;
38 const lv_8sc_t* a = aVector;
39 const lv_8sc_t* b = bVector;
40 __m256i conjugateSign =
41 _mm256_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1);
42
43 for (; number < quarterPoints; number++) {
44 // Convert 8 bit values into 16 bit values
45 x = _mm256_cvtepi8_epi16(_mm_load_si128((__m128i*)a));
46 y = _mm256_cvtepi8_epi16(_mm_load_si128((__m128i*)b));
47
48 // Calculate the ar*cr - ai*(-ci) portions
49 realz = _mm256_madd_epi16(x, y);
50
51 // Calculate the complex conjugate of the cr + ci j values
52 y = _mm256_sign_epi16(y, conjugateSign);
53
54 // Shift the order of the cr and ci values
55 y = _mm256_shufflehi_epi16(_mm256_shufflelo_epi16(y, _MM_SHUFFLE(2, 3, 0, 1)),
56 _MM_SHUFFLE(2, 3, 0, 1));
57
58 // Calculate the ar*(-ci) + cr*(ai)
59 imagz = _mm256_madd_epi16(x, y);
60
61 // Perform the addition of products
62
63 _mm256_store_si256((__m256i*)c,
64 _mm256_packs_epi32(_mm256_unpacklo_epi32(realz, imagz),
65 _mm256_unpackhi_epi32(realz, imagz)));
66
67 a += 8;
68 b += 8;
69 c += 8;
70 }
71
72 number = quarterPoints * 8;
73 int16_t* c16Ptr = (int16_t*)&cVector[number];
74 int8_t* a8Ptr = (int8_t*)&aVector[number];
75 int8_t* b8Ptr = (int8_t*)&bVector[number];
76 for (; number < num_points; number++) {
77 float aReal = (float)*a8Ptr++;
78 float aImag = (float)*a8Ptr++;
79 lv_32fc_t aVal = lv_cmake(aReal, aImag);
80 float bReal = (float)*b8Ptr++;
81 float bImag = (float)*b8Ptr++;
82 lv_32fc_t bVal = lv_cmake(bReal, -bImag);
83 lv_32fc_t temp = aVal * bVal;
84
85 *c16Ptr++ = (int16_t)(lv_creal(temp) > SHRT_MAX ? SHRT_MAX : lv_creal(temp));
86 *c16Ptr++ = (int16_t)lv_cimag(temp);
87 }
88}
#endif /* LV_HAVE_AVX2 */


#ifdef LV_HAVE_SSE4_1
#include <smmintrin.h>
102static inline void volk_8ic_x2_multiply_conjugate_16ic_a_sse4_1(lv_16sc_t* cVector,
103 const lv_8sc_t* aVector,
104 const lv_8sc_t* bVector,
105 unsigned int num_points)
106{
107 unsigned int number = 0;
108 const unsigned int quarterPoints = num_points / 4;
109
110 __m128i x, y, realz, imagz;
111 lv_16sc_t* c = cVector;
112 const lv_8sc_t* a = aVector;
113 const lv_8sc_t* b = bVector;
114 __m128i conjugateSign = _mm_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1);
115
116 for (; number < quarterPoints; number++) {
117 // Convert into 8 bit values into 16 bit values
118 x = _mm_cvtepi8_epi16(_mm_loadl_epi64((__m128i*)a));
119 y = _mm_cvtepi8_epi16(_mm_loadl_epi64((__m128i*)b));
120
121 // Calculate the ar*cr - ai*(-ci) portions
122 realz = _mm_madd_epi16(x, y);
123
124 // Calculate the complex conjugate of the cr + ci j values
125 y = _mm_sign_epi16(y, conjugateSign);
126
127 // Shift the order of the cr and ci values
128 y = _mm_shufflehi_epi16(_mm_shufflelo_epi16(y, _MM_SHUFFLE(2, 3, 0, 1)),
129 _MM_SHUFFLE(2, 3, 0, 1));
130
131 // Calculate the ar*(-ci) + cr*(ai)
132 imagz = _mm_madd_epi16(x, y);
133
134 _mm_store_si128((__m128i*)c,
135 _mm_packs_epi32(_mm_unpacklo_epi32(realz, imagz),
136 _mm_unpackhi_epi32(realz, imagz)));
137
138 a += 4;
139 b += 4;
140 c += 4;
141 }
142
143 number = quarterPoints * 4;
144 int16_t* c16Ptr = (int16_t*)&cVector[number];
145 int8_t* a8Ptr = (int8_t*)&aVector[number];
146 int8_t* b8Ptr = (int8_t*)&bVector[number];
147 for (; number < num_points; number++) {
148 float aReal = (float)*a8Ptr++;
149 float aImag = (float)*a8Ptr++;
150 lv_32fc_t aVal = lv_cmake(aReal, aImag);
151 float bReal = (float)*b8Ptr++;
152 float bImag = (float)*b8Ptr++;
153 lv_32fc_t bVal = lv_cmake(bReal, -bImag);
154 lv_32fc_t temp = aVal * bVal;
155
156 *c16Ptr++ = (int16_t)(lv_creal(temp) > SHRT_MAX ? SHRT_MAX : lv_creal(temp));
157 *c16Ptr++ = (int16_t)lv_cimag(temp);
158 }
159}
#endif /* LV_HAVE_SSE4_1 */

#ifdef LV_HAVE_GENERIC
172 const lv_8sc_t* aVector,
173 const lv_8sc_t* bVector,
174 unsigned int num_points)
175{
176 unsigned int number = 0;
177 int16_t* c16Ptr = (int16_t*)cVector;
178 int8_t* a8Ptr = (int8_t*)aVector;
179 int8_t* b8Ptr = (int8_t*)bVector;
180 for (number = 0; number < num_points; number++) {
181 float aReal = (float)*a8Ptr++;
182 float aImag = (float)*a8Ptr++;
183 lv_32fc_t aVal = lv_cmake(aReal, aImag);
184 float bReal = (float)*b8Ptr++;
185 float bImag = (float)*b8Ptr++;
186 lv_32fc_t bVal = lv_cmake(bReal, -bImag);
187 lv_32fc_t temp = aVal * bVal;
188
189 *c16Ptr++ = (int16_t)(lv_creal(temp) > SHRT_MAX ? SHRT_MAX : lv_creal(temp));
190 *c16Ptr++ = (int16_t)lv_cimag(temp);
191 }
192}
#endif /* LV_HAVE_GENERIC */


#ifdef LV_HAVE_NEON
#include <arm_neon.h>

200 const lv_8sc_t* aVector,
201 const lv_8sc_t* bVector,
202 unsigned int num_points)
203{
204 unsigned int number = 0;
205 const unsigned int eighthPoints = num_points / 8;
206
207 lv_16sc_t* cPtr = cVector;
208 const lv_8sc_t* aPtr = aVector;
209 const lv_8sc_t* bPtr = bVector;
210
211 int8x8x2_t aVal, bVal;
212 int16x8_t aReal, aImag, bReal, bImag;
213 int32x4_t realLo, realHi, imagLo, imagHi;
214 int16x4_t realNarrowLo, realNarrowHi, imagNarrowLo, imagNarrowHi;
215
216 for (; number < eighthPoints; number++) {
217 // Load 8 complex 8-bit values (deinterleaved)
218 aVal = vld2_s8((const int8_t*)aPtr);
219 bVal = vld2_s8((const int8_t*)bPtr);
220
221 // Widen to 16-bit
222 aReal = vmovl_s8(aVal.val[0]);
223 aImag = vmovl_s8(aVal.val[1]);
224 bReal = vmovl_s8(bVal.val[0]);
225 bImag = vmovl_s8(bVal.val[1]);
226
227 // Complex multiply with conjugate: (ar + ai*j) * (br - bi*j)
228 // real = ar*br + ai*bi
229 // imag = ai*br - ar*bi
230
231 // Low half (first 4 complex values)
232 realLo = vmlal_s16(vmull_s16(vget_low_s16(aReal), vget_low_s16(bReal)),
233 vget_low_s16(aImag),
234 vget_low_s16(bImag));
235 imagLo = vmlsl_s16(vmull_s16(vget_low_s16(aImag), vget_low_s16(bReal)),
236 vget_low_s16(aReal),
237 vget_low_s16(bImag));
238
239 // High half (next 4 complex values)
240 realHi = vmlal_s16(vmull_s16(vget_high_s16(aReal), vget_high_s16(bReal)),
241 vget_high_s16(aImag),
242 vget_high_s16(bImag));
243 imagHi = vmlsl_s16(vmull_s16(vget_high_s16(aImag), vget_high_s16(bReal)),
244 vget_high_s16(aReal),
245 vget_high_s16(bImag));
246
247 // Narrow with saturation to 16-bit
248 realNarrowLo = vqmovn_s32(realLo);
249 realNarrowHi = vqmovn_s32(realHi);
250 imagNarrowLo = vqmovn_s32(imagLo);
251 imagNarrowHi = vqmovn_s32(imagHi);
252
253 // Interleave real and imaginary
254 int16x8_t realResult = vcombine_s16(realNarrowLo, realNarrowHi);
255 int16x8_t imagResult = vcombine_s16(imagNarrowLo, imagNarrowHi);
256
257 // Store interleaved
258 int16x8x2_t result;
259 result.val[0] = realResult;
260 result.val[1] = imagResult;
261 vst2q_s16((int16_t*)cPtr, result);
262
263 aPtr += 8;
264 bPtr += 8;
265 cPtr += 8;
266 }
267
268 number = eighthPoints * 8;
269 int16_t* c16Ptr = (int16_t*)&cVector[number];
270 int8_t* a8Ptr = (int8_t*)&aVector[number];
271 int8_t* b8Ptr = (int8_t*)&bVector[number];
272 for (; number < num_points; number++) {
273 float aReal_f = (float)*a8Ptr++;
274 float aImag_f = (float)*a8Ptr++;
275 lv_32fc_t aVal_c = lv_cmake(aReal_f, aImag_f);
276 float bReal_f = (float)*b8Ptr++;
277 float bImag_f = (float)*b8Ptr++;
278 lv_32fc_t bVal_c = lv_cmake(bReal_f, -bImag_f);
279 lv_32fc_t temp = aVal_c * bVal_c;
280
281 *c16Ptr++ = (int16_t)(lv_creal(temp) > SHRT_MAX ? SHRT_MAX : lv_creal(temp));
282 *c16Ptr++ = (int16_t)lv_cimag(temp);
283 }
284}
#endif /* LV_HAVE_NEON */


#endif /* INCLUDED_volk_8ic_x2_multiply_conjugate_16ic_a_H */

#ifndef INCLUDED_volk_8ic_x2_multiply_conjugate_16ic_u_H
#define INCLUDED_volk_8ic_x2_multiply_conjugate_16ic_u_H

#include <inttypes.h>
#include <stdio.h>
#include <volk/volk_complex.h>

#ifdef LV_HAVE_AVX2
#include <immintrin.h>
307static inline void volk_8ic_x2_multiply_conjugate_16ic_u_avx2(lv_16sc_t* cVector,
308 const lv_8sc_t* aVector,
309 const lv_8sc_t* bVector,
310 unsigned int num_points)
311{
312 unsigned int number = 0;
313 const unsigned int oneEigthPoints = num_points / 8;
314
315 __m256i x, y, realz, imagz;
316 lv_16sc_t* c = cVector;
317 const lv_8sc_t* a = aVector;
318 const lv_8sc_t* b = bVector;
319 __m256i conjugateSign =
320 _mm256_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1);
321
322 for (; number < oneEigthPoints; number++) {
323 // Convert 8 bit values into 16 bit values
324 x = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)a));
325 y = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)b));
326
327 // Calculate the ar*cr - ai*(-ci) portions
328 realz = _mm256_madd_epi16(x, y);
329
330 // Calculate the complex conjugate of the cr + ci j values
331 y = _mm256_sign_epi16(y, conjugateSign);
332
333 // Shift the order of the cr and ci values
334 y = _mm256_shufflehi_epi16(_mm256_shufflelo_epi16(y, _MM_SHUFFLE(2, 3, 0, 1)),
335 _MM_SHUFFLE(2, 3, 0, 1));
336
337 // Calculate the ar*(-ci) + cr*(ai)
338 imagz = _mm256_madd_epi16(x, y);
339
340 // Perform the addition of products
341
342 _mm256_storeu_si256((__m256i*)c,
343 _mm256_packs_epi32(_mm256_unpacklo_epi32(realz, imagz),
344 _mm256_unpackhi_epi32(realz, imagz)));
345
346 a += 8;
347 b += 8;
348 c += 8;
349 }
350
351 number = oneEigthPoints * 8;
352 int16_t* c16Ptr = (int16_t*)&cVector[number];
353 int8_t* a8Ptr = (int8_t*)&aVector[number];
354 int8_t* b8Ptr = (int8_t*)&bVector[number];
355 for (; number < num_points; number++) {
356 float aReal = (float)*a8Ptr++;
357 float aImag = (float)*a8Ptr++;
358 lv_32fc_t aVal = lv_cmake(aReal, aImag);
359 float bReal = (float)*b8Ptr++;
360 float bImag = (float)*b8Ptr++;
361 lv_32fc_t bVal = lv_cmake(bReal, -bImag);
362 lv_32fc_t temp = aVal * bVal;
363
364 *c16Ptr++ = (int16_t)(lv_creal(temp) > SHRT_MAX ? SHRT_MAX : lv_creal(temp));
365 *c16Ptr++ = (int16_t)lv_cimag(temp);
366 }
367}
#endif /* LV_HAVE_AVX2 */

#ifdef LV_HAVE_RVV
#include <riscv_vector.h>

373static inline void volk_8ic_x2_multiply_conjugate_16ic_rvv(lv_16sc_t* cVector,
374 const lv_8sc_t* aVector,
375 const lv_8sc_t* bVector,
376 unsigned int num_points)
377{
378 size_t n = num_points;
379 for (size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl, cVector += vl) {
380 vl = __riscv_vsetvl_e8m2(n);
381 vint16m4_t va = __riscv_vle16_v_i16m4((const int16_t*)aVector, vl);
382 vint16m4_t vb = __riscv_vle16_v_i16m4((const int16_t*)bVector, vl);
383 vint8m2_t var = __riscv_vnsra(va, 0, vl), vai = __riscv_vnsra(va, 8, vl);
384 vint8m2_t vbr = __riscv_vnsra(vb, 0, vl), vbi = __riscv_vnsra(vb, 8, vl);
385 vint16m4_t vr = __riscv_vwmacc(__riscv_vwmul(var, vbr, vl), vai, vbi, vl);
386 vint16m4_t vi =
387 __riscv_vsub(__riscv_vwmul(vai, vbr, vl), __riscv_vwmul(var, vbi, vl), vl);
388 vuint16m4_t vru = __riscv_vreinterpret_u16m4(vr);
389 vuint16m4_t viu = __riscv_vreinterpret_u16m4(vi);
390 vuint32m8_t v = __riscv_vwmaccu(__riscv_vwaddu_vv(vru, viu, vl), 0xFFFF, viu, vl);
391 __riscv_vse32((uint32_t*)cVector, v, vl);
392 }
393}
#endif /*LV_HAVE_RVV*/

#ifdef LV_HAVE_RVVSEG
#include <riscv_vector.h>

399static inline void volk_8ic_x2_multiply_conjugate_16ic_rvvseg(lv_16sc_t* cVector,
400 const lv_8sc_t* aVector,
401 const lv_8sc_t* bVector,
402 unsigned int num_points)
403{
404 size_t n = num_points;
405 for (size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl, cVector += vl) {
406 vl = __riscv_vsetvl_e8m2(n);
407 vint8m2x2_t va = __riscv_vlseg2e8_v_i8m2x2((const int8_t*)aVector, vl);
408 vint8m2x2_t vb = __riscv_vlseg2e8_v_i8m2x2((const int8_t*)bVector, vl);
409 vint8m2_t var = __riscv_vget_i8m2(va, 0), vai = __riscv_vget_i8m2(va, 1);
410 vint8m2_t vbr = __riscv_vget_i8m2(vb, 0), vbi = __riscv_vget_i8m2(vb, 1);
411 vint16m4_t vr = __riscv_vwmacc(__riscv_vwmul(var, vbr, vl), vai, vbi, vl);
412 vint16m4_t vi =
413 __riscv_vsub(__riscv_vwmul(vai, vbr, vl), __riscv_vwmul(var, vbi, vl), vl);
414 __riscv_vsseg2e16_v_i16m4x2(
415 (int16_t*)cVector, __riscv_vcreate_v_i16m4x2(vr, vi), vl);
416 }
417}

#endif /*LV_HAVE_RVVSEG*/

#endif /* INCLUDED_volk_8ic_x2_multiply_conjugate_16ic_u_H */