56#ifndef INCLUDED_volk_32f_s32f_multiply_32f_u_H
57#define INCLUDED_volk_32f_s32f_multiply_32f_u_H
66 unsigned int num_points)
/* NOTE(review): the beginning of this definition (its name and the
 * parameters before num_points) and its braces are not visible in this
 * excerpt, so only the visible loop is described here.
 *
 * The loop makes num_points iterations. Each one reads one element through
 * aVector, multiplies it by scalar, writes the product through cVector, and
 * post-increments both pointers. With num_points == 0 nothing is read or
 * written. */
68 for (
unsigned int number = 0; number < num_points; number++) {
69 *cVector++ = (*aVector++) * scalar;
80 unsigned int num_points)
/* NOTE(review): the beginning of this definition (its name and the
 * parameters before num_points) and its braces are not visible in this
 * excerpt.
 *
 * Visible behavior: the input is multiplied by scalar four floats at a time
 * using SSE intrinsics with unaligned load/store, followed by a one-element-
 * at-a-time loop that runs num_points % 4 iterations. */
// Number of complete groups of four floats.
82 const unsigned int quarterPoints = num_points / 4;
// Working cursors into the output and input buffers.
84 float* cPtr = cVector;
85 const float* aPtr = aVector;
// Broadcast scalar into all four lanes once, outside the loop.
87 const __m128 bVal = _mm_set_ps1(scalar);
88 for (
unsigned int number = 0; number < quarterPoints; number++) {
// Unaligned load: aPtr needs no particular alignment.
89 __m128 aVal = _mm_loadu_ps(aPtr);
91 __m128 cVal = _mm_mul_ps(aVal, bVal);
// Unaligned store of the four products.
93 _mm_storeu_ps(cPtr, cVal);
// NOTE(review): the statements that advance aPtr/cPtr inside the vector loop
// are not visible in this excerpt; the tail loop below presumably relies on
// them having moved past the vectorized portion -- confirm against the file.
99 for (
unsigned int number = quarterPoints * 4; number < num_points; number++) {
// Scalar tail: one element per iteration.
100 *cPtr++ = (*aPtr++) * scalar;
106#include <immintrin.h>
109 const float* aVector,
111 unsigned int num_points)
/* NOTE(review): the beginning of this definition (its name and first
 * parameter), the parameter between aVector and num_points, and the braces
 * are not visible in this excerpt.
 *
 * Visible behavior: the input is multiplied by scalar eight floats at a time
 * using AVX intrinsics with unaligned load/store, followed by a one-element-
 * at-a-time loop that runs num_points % 8 iterations. */
// Number of complete groups of eight floats.
113 const unsigned int eighthPoints = num_points / 8;
// Working cursors into the output and input buffers.
115 float* cPtr = cVector;
116 const float* aPtr = aVector;
// Broadcast scalar into all eight lanes once, outside the loop.
118 const __m256 bVal = _mm256_set1_ps(scalar);
119 for (
unsigned int number = 0; number < eighthPoints; number++) {
// Unaligned load: aPtr needs no particular alignment.
120 __m256 aVal = _mm256_loadu_ps(aPtr);
122 __m256 cVal = _mm256_mul_ps(aVal, bVal);
// Unaligned store of the eight products.
124 _mm256_storeu_ps(cPtr, cVal);
// NOTE(review): the statements that advance aPtr/cPtr inside the vector loop
// are not visible in this excerpt; the tail loop below presumably relies on
// them having moved past the vectorized portion -- confirm against the file.
130 for (
unsigned int number = eighthPoints * 8; number < num_points; number++) {
// Scalar tail: one element per iteration.
131 *cPtr++ = (*aPtr++) * scalar;
136#ifdef LV_HAVE_RISCV64
/* Declaration of a routine that is defined outside this file (extern).
 * NOTE(review): one parameter line between aVector and num_points is not
 * visible in this excerpt; by analogy with the other kernels here it is
 * presumably the scalar multiplier -- confirm against the full file. */
137extern void volk_32f_s32f_multiply_32f_sifive_u74(
float* cVector,
138 const float* aVector,
140 unsigned int num_points);
147#ifndef INCLUDED_volk_32f_s32f_multiply_32f_a_H
148#define INCLUDED_volk_32f_s32f_multiply_32f_a_H
154#include <xmmintrin.h>
157 const float* aVector,
159 unsigned int num_points)
/* NOTE(review): the beginning of this definition (its name and first
 * parameter), the parameter between aVector and num_points, and the braces
 * are not visible in this excerpt.
 *
 * Visible behavior: the input is multiplied by scalar four floats at a time
 * using SSE intrinsics with ALIGNED load/store, followed by a one-element-
 * at-a-time loop that runs num_points % 4 iterations. */
// Number of complete groups of four floats.
161 const unsigned int quarterPoints = num_points / 4;
// Working cursors into the output and input buffers.
163 float* cPtr = cVector;
164 const float* aPtr = aVector;
// Broadcast scalar into all four lanes once, outside the loop.
166 const __m128 bVal = _mm_set_ps1(scalar);
167 for (
unsigned int number = 0; number < quarterPoints; number++) {
// Aligned load: _mm_load_ps requires a 16-byte-aligned address.
168 __m128 aVal = _mm_load_ps(aPtr);
170 __m128 cVal = _mm_mul_ps(aVal, bVal);
// Aligned store: _mm_store_ps requires a 16-byte-aligned address.
172 _mm_store_ps(cPtr, cVal);
// NOTE(review): the statements that advance aPtr/cPtr inside the vector loop
// are not visible in this excerpt; the tail loop below presumably relies on
// them having moved past the vectorized portion -- confirm against the file.
178 for (
unsigned int number = quarterPoints * 4; number < num_points; number++) {
// Scalar tail: one element per iteration.
179 *cPtr++ = (*aPtr++) * scalar;
185#include <immintrin.h>
188 const float* aVector,
190 unsigned int num_points)
/* NOTE(review): the beginning of this definition (its name and first
 * parameter), the parameter between aVector and num_points, and the braces
 * are not visible in this excerpt.
 *
 * Visible behavior: the input is multiplied by scalar eight floats at a time
 * using AVX intrinsics with ALIGNED load/store, followed by a one-element-
 * at-a-time loop that runs num_points % 8 iterations. */
// Number of complete groups of eight floats.
192 const unsigned int eighthPoints = num_points / 8;
// Working cursors into the output and input buffers.
194 float* cPtr = cVector;
195 const float* aPtr = aVector;
// Broadcast scalar into all eight lanes once, outside the loop.
197 const __m256 bVal = _mm256_set1_ps(scalar);
198 for (
unsigned int number = 0; number < eighthPoints; number++) {
// Aligned load: _mm256_load_ps requires a 32-byte-aligned address.
199 __m256 aVal = _mm256_load_ps(aPtr);
201 __m256 cVal = _mm256_mul_ps(aVal, bVal);
// Aligned store: _mm256_store_ps requires a 32-byte-aligned address.
203 _mm256_store_ps(cPtr, cVal);
// NOTE(review): the statements that advance aPtr/cPtr inside the vector loop
// are not visible in this excerpt; the tail loop below presumably relies on
// them having moved past the vectorized portion -- confirm against the file.
209 for (
unsigned int number = eighthPoints * 8; number < num_points; number++) {
// Scalar tail: one element per iteration.
210 *cPtr++ = (*aPtr++) * scalar;
219 const float* aVector,
221 unsigned int num_points)
/* NOTE(review): the beginning of this definition (its name and first
 * parameter), the parameter between aVector and num_points, and the braces
 * are not visible in this excerpt.
 *
 * Visible behavior: the input is multiplied by scalar four floats at a time
 * using NEON intrinsics, followed by a one-element-at-a-time loop that runs
 * num_points % 4 iterations. */
// Number of complete groups of four floats.
223 const unsigned int quarterPoints = num_points / 4;
// Working cursors into the input and output buffers.
225 const float* inputPtr = aVector;
226 float* outputPtr = cVector;
228 for (
unsigned int number = 0; number < quarterPoints; number++) {
// Load four floats, multiply every lane by scalar, store four products.
229 float32x4_t aVal = vld1q_f32(inputPtr);
230 float32x4_t cVal = vmulq_n_f32(aVal, scalar);
231 vst1q_f32(outputPtr, cVal);
// NOTE(review): the statements that advance inputPtr/outputPtr inside the
// vector loop are not visible in this excerpt; the tail loop below presumably
// relies on them having moved past the vectorized portion -- confirm.
236 for (
unsigned int number = quarterPoints * 4; number < num_points; number++) {
// Scalar tail: one element per iteration.
237 *outputPtr++ = (*inputPtr++) * scalar;
/* Multiplies floats read from aVector by scalar and writes the products to
 * cVector, handling eight floats per vector-loop iteration as two 4-lane NEON
 * vectors, then a one-element-at-a-time loop that runs num_points % 8
 * iterations.
 *
 * NOTE(review): this excerpt is missing some original lines, including the
 * parameter between aVector and num_points (the body uses `scalar`, so it is
 * presumably that), the braces, and whatever sits between the loads and the
 * stores in the vector loop. */
245static inline void volk_32f_s32f_multiply_32f_neonv8(
float* cVector,
246 const float* aVector,
248 unsigned int num_points)
// Number of complete groups of eight floats.
250 const unsigned int eighthPoints = num_points / 8;
// Working cursors into the input and output buffers.
252 const float* aPtr = aVector;
253 float* cPtr = cVector;
// Broadcast scalar into all four lanes once, outside the loop.
254 const float32x4_t scalarVec = vdupq_n_f32(scalar);
256 for (
unsigned int number = 0; number < eighthPoints; number++) {
// Two independent 4-float loads covering elements [0..3] and [4..7].
257 float32x4_t a0 = vld1q_f32(aPtr);
258 float32x4_t a1 = vld1q_f32(aPtr + 4);
// Multiply each vector by the broadcast scalar and store at matching offsets.
261 vst1q_f32(cPtr, vmulq_f32(a0, scalarVec));
262 vst1q_f32(cPtr + 4, vmulq_f32(a1, scalarVec));
// NOTE(review): the statements that advance aPtr/cPtr inside the vector loop
// are not visible in this excerpt; the tail loop below presumably relies on
// them having moved past the vectorized portion -- confirm against the file.
268 for (
unsigned int number = eighthPoints * 8; number < num_points; number++) {
// Scalar tail: one element per iteration.
269 *cPtr++ = (*aPtr++) * scalar;
/* Declaration of a routine that is defined outside this file (extern).
 * NOTE(review): only the first parameter (float* dst) is visible in this
 * excerpt; the rest of the parameter list is cut off. The one visible call
 * passes (cVector, aVector, scalar, num_points) -- confirm the full
 * prototype against the file. */
277extern void volk_32f_s32f_multiply_32f_a_orc_impl(
float* dst,
/* Thin wrapper: forwards cVector, aVector, scalar and num_points, in that
 * order and unchanged, to volk_32f_s32f_multiply_32f_a_orc_impl.
 *
 * NOTE(review): the parameter between aVector and num_points and the braces
 * are not visible in this excerpt; the call uses `scalar`, so the missing
 * parameter is presumably that -- confirm against the full file. */
282static inline void volk_32f_s32f_multiply_32f_u_orc(
float* cVector,
283 const float* aVector,
285 unsigned int num_points)
287 volk_32f_s32f_multiply_32f_a_orc_impl(cVector, aVector, scalar, num_points);
293#include <riscv_vector.h>
/* Multiplies floats read from aVector by scalar and writes the products to
 * cVector using RISC-V Vector intrinsics. A single strip-mined loop handles
 * every element, so there is no separate scalar tail.
 *
 * NOTE(review): the parameter between aVector and num_points and the braces
 * are not visible in this excerpt; the body uses `scalar`, so the missing
 * parameter is presumably that -- confirm against the full file. */
295static inline void volk_32f_s32f_multiply_32f_rvv(
float* cVector,
296 const float* aVector,
298 unsigned int num_points)
// Elements still to be processed.
300 size_t n = num_points;
// vl is assigned at the top of each pass, before the loop's update clause
// subtracts it from n and advances both pointers by the same amount.
301 for (
size_t vl; n > 0; n -= vl, aVector += vl, cVector += vl) {
// Request a vector length for 32-bit elements with LMUL=8; vl is the number
// of elements handled in this pass (at most n).
302 vl = __riscv_vsetvl_e32m8(n);
// Load vl floats, multiply each by scalar, and store vl products.
303 vfloat32m8_t v = __riscv_vle32_v_f32m8(aVector, vl);
304 __riscv_vse32(cVector, __riscv_vfmul(v, scalar, vl), vl);