/* volk_32fc_32f_multiply_32fc.h — VOLK (Vector Optimized Library of Kernels) */
/* -*- c++ -*- */
/*
 * Copyright 2012, 2014 Free Software Foundation, Inc.
 *
 * This file is part of VOLK
 *
 * SPDX-License-Identifier: LGPL-3.0-or-later
 */

#ifndef INCLUDED_volk_32fc_32f_multiply_32fc_a_H
#define INCLUDED_volk_32fc_32f_multiply_32fc_a_H

#include <inttypes.h>
#include <stdio.h>

#ifdef LV_HAVE_AVX
#include <immintrin.h>

/*!
 * \brief Multiplies each complex sample in aVector by the matching real
 *        scalar in bVector (AVX, aligned-pointer variant).
 *
 * \param cVector    output: cVector[i] = aVector[i] * bVector[i]
 * \param aVector    input complex vector (interleaved 32-bit float re/im)
 * \param bVector    input vector of real 32-bit float scalars
 * \param num_points number of complex samples to process
 */
static inline void volk_32fc_32f_multiply_32fc_a_avx(lv_32fc_t* cVector,
                                                     const lv_32fc_t* aVector,
                                                     const float* bVector,
                                                     unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int eighthPoints = num_points / 8; /* 8 complex samples per iteration */

    lv_32fc_t* cPtr = cVector;
    const lv_32fc_t* aPtr = aVector;
    const float* bPtr = bVector;

    __m256 aVal1, aVal2, bVal, bVal1, bVal2, cVal1, cVal2;

    /* In-lane index pattern that duplicates each scalar tap so it lines up
       with both the real and the imaginary part of one complex sample. */
    __m256i permute_mask = _mm256_set_epi32(3, 3, 2, 2, 1, 1, 0, 0);

    for (; number < eighthPoints; number++) {

        /* Load 8 complex samples (16 floats) as two AVX registers;
           each aPtr += 4 advances 4 lv_32fc_t == 8 floats. */
        aVal1 = _mm256_load_ps((float*)aPtr);
        aPtr += 4;

        aVal2 = _mm256_load_ps((float*)aPtr);
        aPtr += 4;

        bVal = _mm256_load_ps(bPtr); // b0|b1|b2|b3|b4|b5|b6|b7
        bPtr += 8;

        /* Broadcast the low/high 128-bit half across both lanes... */
        bVal1 = _mm256_permute2f128_ps(bVal, bVal, 0x00); // b0|b1|b2|b3|b0|b1|b2|b3
        bVal2 = _mm256_permute2f128_ps(bVal, bVal, 0x11); // b4|b5|b6|b7|b4|b5|b6|b7

        /* ...then duplicate each tap within its 128-bit lane. */
        bVal1 = _mm256_permutevar_ps(bVal1, permute_mask); // b0|b0|b1|b1|b2|b2|b3|b3
        bVal2 = _mm256_permutevar_ps(bVal2, permute_mask); // b4|b4|b5|b5|b6|b6|b7|b7

        /* Complex * real is an element-wise scale of both re and im. */
        cVal1 = _mm256_mul_ps(aVal1, bVal1);
        cVal2 = _mm256_mul_ps(aVal2, bVal2);

        _mm256_store_ps((float*)cPtr,
                        cVal1); // Store the results back into the C container
        cPtr += 4;

        _mm256_store_ps((float*)cPtr,
                        cVal2); // Store the results back into the C container
        cPtr += 4;
    }

    /* Scalar tail for the remaining (num_points % 8) samples. */
    number = eighthPoints * 8;
    for (; number < num_points; ++number) {
        *cPtr++ = (*aPtr++) * (*bPtr++);
    }
}
#endif /* LV_HAVE_AVX */

#ifdef LV_HAVE_SSE
#include <xmmintrin.h>

/*!
 * \brief Multiplies each complex sample in aVector by the matching real
 *        scalar in bVector (SSE, aligned-pointer variant).
 *
 * \param cVector    output: cVector[i] = aVector[i] * bVector[i]
 * \param aVector    input complex vector (interleaved 32-bit float re/im)
 * \param bVector    input vector of real 32-bit float scalars
 * \param num_points number of complex samples to process
 */
static inline void volk_32fc_32f_multiply_32fc_a_sse(lv_32fc_t* cVector,
                                                     const lv_32fc_t* aVector,
                                                     const float* bVector,
                                                     unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int quarterPoints = num_points / 4; /* 4 complex samples per iteration */

    lv_32fc_t* cPtr = cVector;
    const lv_32fc_t* aPtr = aVector;
    const float* bPtr = bVector;

    __m128 aVal1, aVal2, bVal, bVal1, bVal2, cVal;
    for (; number < quarterPoints; number++) {

        /* Load 4 complex samples (8 floats) as two SSE registers;
           each aPtr += 2 advances 2 lv_32fc_t == 4 floats. */
        aVal1 = _mm_load_ps((const float*)aPtr);
        aPtr += 2;

        aVal2 = _mm_load_ps((const float*)aPtr);
        aPtr += 2;

        bVal = _mm_load_ps(bPtr); // b0|b1|b2|b3
        bPtr += 4;

        /* Duplicate each tap so it lines up with re and im of one sample. */
        bVal1 = _mm_shuffle_ps(bVal, bVal, _MM_SHUFFLE(1, 1, 0, 0)); // b0|b0|b1|b1
        bVal2 = _mm_shuffle_ps(bVal, bVal, _MM_SHUFFLE(3, 3, 2, 2)); // b2|b2|b3|b3

        cVal = _mm_mul_ps(aVal1, bVal1);

        _mm_store_ps((float*)cPtr, cVal); // Store the results back into the C container
        cPtr += 2;

        cVal = _mm_mul_ps(aVal2, bVal2);

        _mm_store_ps((float*)cPtr, cVal); // Store the results back into the C container

        cPtr += 2;
    }

    /* Scalar tail for the remaining (num_points % 4) samples. */
    number = quarterPoints * 4;
    for (; number < num_points; number++) {
        *cPtr++ = (*aPtr++) * (*bPtr);
        bPtr++;
    }
}
#endif /* LV_HAVE_SSE */

#ifdef LV_HAVE_GENERIC

/*!
 * \brief Portable reference implementation: cVector[i] = aVector[i] * bVector[i].
 *
 * \param cVector    output complex vector
 * \param aVector    input complex vector (interleaved 32-bit float re/im)
 * \param bVector    input vector of real 32-bit float scalars
 * \param num_points number of complex samples to process
 */
static inline void volk_32fc_32f_multiply_32fc_generic(lv_32fc_t* cVector,
                                                       const lv_32fc_t* aVector,
                                                       const float* bVector,
                                                       unsigned int num_points)
{
    lv_32fc_t* cPtr = cVector;
    const lv_32fc_t* aPtr = aVector;
    const float* bPtr = bVector;
    unsigned int number = 0;

    for (number = 0; number < num_points; number++) {
        *cPtr++ = (*aPtr++) * (*bPtr++);
    }
}
#endif /* LV_HAVE_GENERIC */

#ifdef LV_HAVE_NEON
#include <arm_neon.h>

/*!
 * \brief Multiplies each complex sample in aVector by the matching real
 *        scalar in bVector (ARMv7 NEON variant).
 *
 * \param cVector    output: cVector[i] = aVector[i] * bVector[i]
 * \param aVector    input complex vector (interleaved 32-bit float re/im)
 * \param bVector    input vector of real 32-bit float scalars
 * \param num_points number of complex samples to process
 */
static inline void volk_32fc_32f_multiply_32fc_neon(lv_32fc_t* cVector,
                                                    const lv_32fc_t* aVector,
                                                    const float* bVector,
                                                    unsigned int num_points)
{
    lv_32fc_t* out = cVector;
    const lv_32fc_t* in = aVector;
    const float* taps = bVector;
    const unsigned int quarter_points = num_points / 4;
    unsigned int idx;

    /* Vector loop: 4 complex samples per pass. */
    for (idx = 0; idx < quarter_points; idx++) {
        /* vld2q de-interleaves: val[0] = 4 real parts, val[1] = 4 imag parts. */
        float32x4x2_t sample = vld2q_f32((float*)in);
        float32x4_t scale = vld1q_f32(taps);

        float32x4x2_t product;
        product.val[0] = vmulq_f32(sample.val[0], scale);
        product.val[1] = vmulq_f32(sample.val[1], scale);

        /* vst2q re-interleaves re/im back into the output. */
        vst2q_f32((float*)out, product);
        in += 4;
        taps += 4;
        out += 4;
    }

    /* Scalar tail for the remaining (num_points % 4) samples. */
    for (idx = quarter_points * 4; idx < num_points; idx++) {
        *out++ = (*in++) * (*taps++);
    }
}
#endif /* LV_HAVE_NEON */

#ifdef LV_HAVE_NEONV8
#include <arm_neon.h>

/*!
 * \brief Multiplies each complex sample in aVector by the matching real
 *        scalar in bVector (ARMv8 NEON variant, 2x unrolled).
 *
 * \param cVector    output: cVector[i] = aVector[i] * bVector[i]
 * \param aVector    input complex vector (interleaved 32-bit float re/im)
 * \param bVector    input vector of real 32-bit float scalars
 * \param num_points number of complex samples to process
 */
static inline void volk_32fc_32f_multiply_32fc_neonv8(lv_32fc_t* cVector,
                                                      const lv_32fc_t* aVector,
                                                      const float* bVector,
                                                      unsigned int num_points)
{
    unsigned int n = num_points;
    lv_32fc_t* c = cVector;
    const lv_32fc_t* a = aVector;
    const float* b = bVector;

    /* Process 8 complex numbers per iteration (2x unroll) */
    while (n >= 8) {
        /* vld2q de-interleaves: val[0] = real parts, val[1] = imag parts. */
        float32x4x2_t a0 = vld2q_f32((const float*)a);
        float32x4x2_t a1 = vld2q_f32((const float*)(a + 4));
        float32x4_t b0 = vld1q_f32(b);
        float32x4_t b1 = vld1q_f32(b + 4);
        /* Hint the next iteration's data into cache (project macro). */
        __VOLK_PREFETCH(a + 8);
        __VOLK_PREFETCH(b + 8);

        /* Complex × real: just scale both real and imag parts */
        float32x4x2_t c0, c1;
        c0.val[0] = vmulq_f32(a0.val[0], b0);
        c0.val[1] = vmulq_f32(a0.val[1], b0);
        c1.val[0] = vmulq_f32(a1.val[0], b1);
        c1.val[1] = vmulq_f32(a1.val[1], b1);

        /* vst2q re-interleaves re/im back into the output. */
        vst2q_f32((float*)c, c0);
        vst2q_f32((float*)(c + 4), c1);

        a += 8;
        b += 8;
        c += 8;
        n -= 8;
    }

    /* Process remaining 4 */
    if (n >= 4) {
        float32x4x2_t a0 = vld2q_f32((const float*)a);
        float32x4_t b0 = vld1q_f32(b);
        float32x4x2_t c0;
        c0.val[0] = vmulq_f32(a0.val[0], b0);
        c0.val[1] = vmulq_f32(a0.val[1], b0);
        vst2q_f32((float*)c, c0);
        a += 4;
        b += 4;
        c += 4;
        n -= 4;
    }

    /* Scalar tail */
    while (n > 0) {
        *c++ = (*a++) * (*b++);
        n--;
    }
}

#endif /* LV_HAVE_NEONV8 */

#ifdef LV_HAVE_ORC

/* ORC (Oil Runtime Compiler) implementation; generated from the kernel's
   .orc source elsewhere in the build. */
extern void volk_32fc_32f_multiply_32fc_a_orc_impl(lv_32fc_t* cVector,
                                                   const lv_32fc_t* aVector,
                                                   const float* bVector,
                                                   int num_points);

/*!
 * \brief Thin wrapper dispatching to the ORC implementation.
 *
 * NOTE(review): num_points is narrowed from unsigned int to int at the ORC
 * ABI boundary; counts above INT_MAX would wrap — presumably never reached
 * in practice, but worth confirming against callers.
 */
static inline void volk_32fc_32f_multiply_32fc_u_orc(lv_32fc_t* cVector,
                                                     const lv_32fc_t* aVector,
                                                     const float* bVector,
                                                     unsigned int num_points)
{
    volk_32fc_32f_multiply_32fc_a_orc_impl(cVector, aVector, bVector, num_points);
}

#endif /* LV_HAVE_ORC */

#ifdef LV_HAVE_RVV
#include <riscv_vector.h>

/*!
 * \brief Multiplies each complex sample in aVector by the matching real
 *        scalar in bVector (RISC-V Vector extension variant).
 *
 * \param cVector    output: cVector[i] = aVector[i] * bVector[i]
 * \param aVector    input complex vector (interleaved 32-bit float re/im)
 * \param bVector    input vector of real 32-bit float scalars
 * \param num_points number of complex samples to process
 */
static inline void volk_32fc_32f_multiply_32fc_rvv(lv_32fc_t* cVector,
                                                   const lv_32fc_t* aVector,
                                                   const float* bVector,
                                                   unsigned int num_points)
{
    size_t n = num_points;
    /* vl counts complex samples at e32/LMUL=4; the interleaved re/im data is
       handled as 2*vl floats at LMUL=8. */
    for (size_t vl; n > 0; n -= vl, cVector += vl, aVector += vl, bVector += vl) {
        vl = __riscv_vsetvl_e32m4(n);
        /* 2*vl interleaved floats: re0, im0, re1, im1, ... */
        vfloat32m8_t vc = __riscv_vle32_v_f32m8((const float*)aVector, vl * 2);
        /* Load vl real taps as raw 32-bit words for the duplication trick. */
        vuint32m4_t v = __riscv_vle32_v_u32m4((const uint32_t*)bVector, vl);
        /* Duplicate each tap into adjacent lanes via widening arithmetic:
           vwaddu(v, v) = 2*v as u64, then vwmaccu adds (2^32 - 1) * v,
           totalling v * 2^32 + v — the same 32-bit word in both halves of
           each 64-bit element. Reinterpreted as f32 this is t0,t0,t1,t1,...
           aligned with the re/im pairs of vc. */
        vfloat32m8_t vf = __riscv_vreinterpret_f32m8(__riscv_vreinterpret_u32m8(
            __riscv_vwmaccu(__riscv_vwaddu_vv(v, v, vl), 0xFFFFFFFF, v, vl)));
        /* Scale re and im of each sample by its tap and store. */
        __riscv_vse32((float*)cVector, __riscv_vfmul(vc, vf, vl * 2), vl * 2);
    }
}
#endif /*LV_HAVE_RVV*/

#endif /* INCLUDED_volk_32fc_32f_multiply_32fc_a_H */