Vector Optimized Library of Kernels (VOLK) 3.3.0 — architecture-tuned implementations of math kernels.
File: volk_32f_s32f_multiply_32f.h — multiplies each element of a float vector by a scalar, with
generic, SSE, AVX, NEON, ORC, and RISC-V vector implementations.
1/* -*- c++ -*- */
2/*
3 * Copyright 2012, 2014 Free Software Foundation, Inc.
4 *
5 * This file is part of VOLK
6 *
7 * SPDX-License-Identifier: LGPL-3.0-or-later
8 */
9
55
56#ifndef INCLUDED_volk_32f_s32f_multiply_32f_u_H
57#define INCLUDED_volk_32f_s32f_multiply_32f_u_H
58
59#include <inttypes.h>
60#include <stdio.h>
61
62#ifdef LV_HAVE_GENERIC
static inline void volk_32f_s32f_multiply_32f_generic(float* cVector,
                                                      const float* aVector,
                                                      const float scalar,
                                                      unsigned int num_points)
{
    /* Portable fallback kernel: cVector[i] = aVector[i] * scalar
     * for the first num_points samples. A num_points of 0 is a no-op. */
    const float* in = aVector;
    const float* const end = aVector + num_points;
    float* out = cVector;
    while (in != end) {
        *out++ = *in++ * scalar;
    }
}
72#endif /* LV_HAVE_GENERIC */
73
74#ifdef LV_HAVE_SSE
75#include <xmmintrin.h>
76
static inline void volk_32f_s32f_multiply_32f_u_sse(float* cVector,
                                                    const float* aVector,
                                                    const float scalar,
                                                    unsigned int num_points)
{
    /* SSE kernel, unaligned variant: scales 4 floats per iteration using
     * unaligned loads/stores, then finishes the last 0-3 samples in scalar
     * code. Same contract as the generic kernel. */
    const float* inPtr = aVector;
    float* outPtr = cVector;
    const __m128 scalarVec = _mm_set_ps1(scalar);

    unsigned int remaining = num_points;
    while (remaining >= 4) {
        const __m128 inVec = _mm_loadu_ps(inPtr);
        _mm_storeu_ps(outPtr, _mm_mul_ps(inVec, scalarVec));
        inPtr += 4;
        outPtr += 4;
        remaining -= 4;
    }

    /* Scalar tail for the leftover samples. */
    while (remaining-- > 0) {
        *outPtr++ = (*inPtr++) * scalar;
    }
}
103#endif /* LV_HAVE_SSE */
104
105#ifdef LV_HAVE_AVX
106#include <immintrin.h>
107
108static inline void volk_32f_s32f_multiply_32f_u_avx(float* cVector,
109 const float* aVector,
110 const float scalar,
111 unsigned int num_points)
112{
113 const unsigned int eighthPoints = num_points / 8;
114
115 float* cPtr = cVector;
116 const float* aPtr = aVector;
117
118 const __m256 bVal = _mm256_set1_ps(scalar);
119 for (unsigned int number = 0; number < eighthPoints; number++) {
120 __m256 aVal = _mm256_loadu_ps(aPtr);
121
122 __m256 cVal = _mm256_mul_ps(aVal, bVal);
123
124 _mm256_storeu_ps(cPtr, cVal); // Store the results back into the C container
125
126 aPtr += 8;
127 cPtr += 8;
128 }
129
130 for (unsigned int number = eighthPoints * 8; number < num_points; number++) {
131 *cPtr++ = (*aPtr++) * scalar;
132 }
133}
134#endif /* LV_HAVE_AVX */
135
136#ifdef LV_HAVE_RISCV64
137extern void volk_32f_s32f_multiply_32f_sifive_u74(float* cVector,
138 const float* aVector,
139 const float scalar,
140 unsigned int num_points);
141#endif /* LV_HAVE_RISCV64 */
142
143
144#endif /* INCLUDED_volk_32f_s32f_multiply_32f_u_H */
145
146
147#ifndef INCLUDED_volk_32f_s32f_multiply_32f_a_H
148#define INCLUDED_volk_32f_s32f_multiply_32f_a_H
149
150#include <inttypes.h>
151#include <stdio.h>
152
153#ifdef LV_HAVE_SSE
154#include <xmmintrin.h>
155
static inline void volk_32f_s32f_multiply_32f_a_sse(float* cVector,
                                                    const float* aVector,
                                                    const float scalar,
                                                    unsigned int num_points)
{
    /* SSE kernel, aligned variant: both aVector and cVector must be
     * 16-byte aligned. Scales 4 floats per iteration with aligned
     * loads/stores, then finishes the last 0-3 samples in scalar code. */
    const float* inPtr = aVector;
    float* outPtr = cVector;
    const __m128 scalarVec = _mm_set_ps1(scalar);

    unsigned int remaining = num_points;
    while (remaining >= 4) {
        const __m128 inVec = _mm_load_ps(inPtr);
        _mm_store_ps(outPtr, _mm_mul_ps(inVec, scalarVec));
        inPtr += 4;
        outPtr += 4;
        remaining -= 4;
    }

    /* Scalar tail for the leftover samples. */
    while (remaining-- > 0) {
        *outPtr++ = (*inPtr++) * scalar;
    }
}
182#endif /* LV_HAVE_SSE */
183
184#ifdef LV_HAVE_AVX
185#include <immintrin.h>
186
187static inline void volk_32f_s32f_multiply_32f_a_avx(float* cVector,
188 const float* aVector,
189 const float scalar,
190 unsigned int num_points)
191{
192 const unsigned int eighthPoints = num_points / 8;
193
194 float* cPtr = cVector;
195 const float* aPtr = aVector;
196
197 const __m256 bVal = _mm256_set1_ps(scalar);
198 for (unsigned int number = 0; number < eighthPoints; number++) {
199 __m256 aVal = _mm256_load_ps(aPtr);
200
201 __m256 cVal = _mm256_mul_ps(aVal, bVal);
202
203 _mm256_store_ps(cPtr, cVal); // Store the results back into the C container
204
205 aPtr += 8;
206 cPtr += 8;
207 }
208
209 for (unsigned int number = eighthPoints * 8; number < num_points; number++) {
210 *cPtr++ = (*aPtr++) * scalar;
211 }
212}
213#endif /* LV_HAVE_AVX */
214
215#ifdef LV_HAVE_NEON
216#include <arm_neon.h>
217
static inline void volk_32f_s32f_multiply_32f_u_neon(float* cVector,
                                                     const float* aVector,
                                                     const float scalar,
                                                     unsigned int num_points)
{
    /* NEON kernel: scales 4 floats per iteration with vmulq_n_f32
     * (vector-by-scalar multiply), then finishes the last 0-3 samples
     * in scalar code. Same contract as the generic kernel. */
    const float* srcPtr = aVector;
    float* dstPtr = cVector;

    unsigned int remaining = num_points;
    while (remaining >= 4) {
        const float32x4_t srcVec = vld1q_f32(srcPtr);
        vst1q_f32(dstPtr, vmulq_n_f32(srcVec, scalar));
        srcPtr += 4;
        dstPtr += 4;
        remaining -= 4;
    }

    /* Scalar tail for the leftover samples. */
    while (remaining-- > 0) {
        *dstPtr++ = (*srcPtr++) * scalar;
    }
}
240#endif /* LV_HAVE_NEON */
241
242#ifdef LV_HAVE_NEONV8
243#include <arm_neon.h>
244
static inline void volk_32f_s32f_multiply_32f_neonv8(float* cVector,
                                                     const float* aVector,
                                                     const float scalar,
                                                     unsigned int num_points)
{
    /* NEON (ARMv8) kernel: processes 8 floats per iteration as two
     * 4-lane multiplies against a broadcast scalar register, issuing a
     * prefetch ahead of the loads, then finishes the last 0-7 samples
     * in scalar code. */
    const float* srcPtr = aVector;
    float* dstPtr = cVector;
    const float32x4_t scalarVec = vdupq_n_f32(scalar);

    unsigned int remaining = num_points;
    while (remaining >= 8) {
        const float32x4_t lo = vld1q_f32(srcPtr);
        const float32x4_t hi = vld1q_f32(srcPtr + 4);
        __VOLK_PREFETCH(srcPtr + 16);

        vst1q_f32(dstPtr, vmulq_f32(lo, scalarVec));
        vst1q_f32(dstPtr + 4, vmulq_f32(hi, scalarVec));

        srcPtr += 8;
        dstPtr += 8;
        remaining -= 8;
    }

    /* Scalar tail for the leftover samples. */
    while (remaining-- > 0) {
        *dstPtr++ = (*srcPtr++) * scalar;
    }
}
272#endif /* LV_HAVE_NEONV8 */
273
274
275#ifdef LV_HAVE_ORC
276
277extern void volk_32f_s32f_multiply_32f_a_orc_impl(float* dst,
278 const float* src,
279 const float scalar,
280 int num_points);
281
static inline void volk_32f_s32f_multiply_32f_u_orc(float* cVector,
                                                    const float* aVector,
                                                    const float scalar,
                                                    unsigned int num_points)
{
    // Unaligned entry point for the ORC-generated implementation: simply
    // forwards to the "_a_" impl symbol.
    // NOTE(review): num_points (unsigned int) is implicitly narrowed to the
    // int parameter of the ORC impl — fine for realistic sizes, but values
    // above INT_MAX would be misinterpreted; confirm against the .orc source.
    volk_32f_s32f_multiply_32f_a_orc_impl(cVector, aVector, scalar, num_points);
}
289
290#endif /* LV_HAVE_ORC */
291
292#ifdef LV_HAVE_RVV
293#include <riscv_vector.h>
294
static inline void volk_32f_s32f_multiply_32f_rvv(float* cVector,
                                                  const float* aVector,
                                                  const float scalar,
                                                  unsigned int num_points)
{
    // RISC-V Vector (RVV) kernel: cVector[i] = aVector[i] * scalar.
    // Stripmines over the input: each iteration processes `vl` elements,
    // where vl is chosen by vsetvl (SEW=32, LMUL=8) from the remaining
    // count, so no scalar tail loop is needed.
    size_t n = num_points;
    for (size_t vl; n > 0; n -= vl, aVector += vl, cVector += vl) {
        vl = __riscv_vsetvl_e32m8(n);
        vfloat32m8_t v = __riscv_vle32_v_f32m8(aVector, vl);
        // Overloaded intrinsics: __riscv_vfmul resolves to the
        // vector-scalar form (vfmul.vf) and __riscv_vse32 infers the
        // f32m8 store from the value's type.
        __riscv_vse32(cVector, __riscv_vfmul(v, scalar, vl), vl);
    }
}
307#endif /*LV_HAVE_RVV*/
308
309#endif /* INCLUDED_volk_32f_s32f_multiply_32f_a_H */