55#ifndef INCLUDED_volk_32f_s32f_stddev_32f_a_H
56#define INCLUDED_volk_32f_s32f_stddev_32f_a_H
/*
 * SSE4.1 kernel using aligned loads. From the visible statements it sums the
 * squares of the input samples, divides by num_points, subtracts mean * mean,
 * takes sqrtf of the result and writes it to *stddev.
 *
 * stddev      - output: receives the single computed value.
 * inputBuffer - input samples; read with _mm_load_ps, which requires a
 *               16-byte aligned address.
 * num_points  - number of samples; also the divisor of the sum of squares.
 *
 * NOTE(review): `mean` and `squareBuffer` are used below but their
 * declarations are not visible in this excerpt, and neither are the braces
 * that open and close the function body -- confirm against the full file.
 */
66static inline void volk_32f_s32f_stddev_32f_a_sse4_1(
float* stddev,
67 const float* inputBuffer,
69 unsigned int num_points)
71 float returnValue = 0;
73 unsigned int number = 0;
/* Each vector iteration issues four 128-bit loads, i.e. 16 floats. */
74 const unsigned int sixteenthPoints = num_points / 16;
76 const float* aPtr = inputBuffer;
80 __m128 squareAccumulator = _mm_setzero_ps();
81 __m128 aVal1, aVal2, aVal3, aVal4;
82 __m128 cVal1, cVal2, cVal3, cVal4;
83 for (; number < sixteenthPoints; number++) {
/*
 * _mm_dp_ps(x, x, mask): the high nibble 0xF multiplies all four lanes; the
 * low nibble selects the single output lane that receives the sum (0x1, 0x2,
 * 0x4, 0x8 -> lanes 0..3) while the other lanes are zeroed.
 * NOTE(review): no pointer advance is visible between the loads here; as
 * shown all four loads read the same address -- presumably `aPtr += 4`
 * lines exist in the full file; confirm.
 */
84 aVal1 = _mm_load_ps(aPtr);
86 cVal1 = _mm_dp_ps(aVal1, aVal1, 0xF1);
88 aVal2 = _mm_load_ps(aPtr);
90 cVal2 = _mm_dp_ps(aVal2, aVal2, 0xF2);
92 aVal3 = _mm_load_ps(aPtr);
94 cVal3 = _mm_dp_ps(aVal3, aVal3, 0xF4);
96 aVal4 = _mm_load_ps(aPtr);
98 cVal4 = _mm_dp_ps(aVal4, aVal4, 0xF8);
/* The four results occupy disjoint lanes, so OR merges them into one vector. */
100 cVal1 = _mm_or_ps(cVal1, cVal2);
101 cVal3 = _mm_or_ps(cVal3, cVal4);
102 cVal1 = _mm_or_ps(cVal1, cVal3);
/*
 * NOTE(review): as shown the result of _mm_add_ps is discarded; presumably
 * the `squareAccumulator =` half of this statement is on a line that is not
 * visible here -- confirm.
 */
105 _mm_add_ps(squareAccumulator, cVal1);
/* Spill to squareBuffer (second argument not visible) for the lane sum below. */
107 _mm_store_ps(squareBuffer,
/* Horizontal sum of the four stored lanes. */
109 returnValue = squareBuffer[0];
110 returnValue += squareBuffer[1];
111 returnValue += squareBuffer[2];
112 returnValue += squareBuffer[3];
/* Scalar tail: the num_points % 16 samples the vector loop did not cover. */
114 number = sixteenthPoints * 16;
115 for (; number < num_points; number++) {
/* NOTE(review): no advance of aPtr is visible in this loop -- confirm. */
116 returnValue += (*aPtr) * (*aPtr);
/*
 * Mean of squares minus the square of the mean, then the square root.
 * NOTE(review): if rounding leaves a negative value here, sqrtf returns NaN.
 */
119 returnValue /= num_points;
120 returnValue -= (mean * mean);
121 returnValue = sqrtf(returnValue);
123 *stddev = returnValue;
129#include <xmmintrin.h>
/*
 * SSE kernel using aligned loads (the line carrying the function name is not
 * visible in this excerpt). From the visible statements it sums the squares
 * of the input samples, divides by num_points, subtracts mean * mean, takes
 * sqrtf and writes the result to *stddev.
 *
 * inputBuffer - input samples; read with _mm_load_ps, which requires a
 *               16-byte aligned address.
 * num_points  - number of samples; the computation is skipped when it is 0.
 *
 * NOTE(review): `stddev`, `mean` and `squareBuffer` are used below but their
 * declarations are not visible here -- confirm against the full file.
 */
132 const float* inputBuffer,
134 unsigned int num_points)
136 float returnValue = 0;
137 if (num_points > 0) {
138 unsigned int number = 0;
/* One 128-bit load (4 floats) per vector iteration. */
139 const unsigned int quarterPoints = num_points / 4;
141 const float* aPtr = inputBuffer;
145 __m128 squareAccumulator = _mm_setzero_ps();
146 __m128 aVal = _mm_setzero_ps();
147 for (; number < quarterPoints; number++) {
/* Square the four samples lane-wise and add them to the running sums. */
/* NOTE(review): no advance of aPtr is visible in this loop -- confirm. */
148 aVal = _mm_load_ps(aPtr);
149 aVal = _mm_mul_ps(aVal, aVal);
150 squareAccumulator = _mm_add_ps(squareAccumulator, aVal);
/* Spill to squareBuffer (second argument not visible) for the lane sum below. */
153 _mm_store_ps(squareBuffer,
/* Horizontal sum of the four stored lanes. */
155 returnValue = squareBuffer[0];
156 returnValue += squareBuffer[1];
157 returnValue += squareBuffer[2];
158 returnValue += squareBuffer[3];
/* Scalar tail: the num_points % 4 samples the vector loop did not cover. */
160 number = quarterPoints * 4;
161 for (; number < num_points; number++) {
162 returnValue += (*aPtr) * (*aPtr);
/*
 * Mean of squares minus the square of the mean, then the square root.
 * NOTE(review): if rounding leaves a negative value here, sqrtf returns NaN.
 */
165 returnValue /= num_points;
166 returnValue -= (mean * mean);
167 returnValue = sqrtf(returnValue);
169 *stddev = returnValue;
175#include <immintrin.h>
/*
 * AVX kernel using aligned loads (the line carrying the function name is not
 * visible in this excerpt). From the visible statements it sums the squares
 * of the input samples into stdDev, divides by num_points, subtracts
 * mean * mean and takes sqrtf.
 *
 * inputBuffer - input samples; read with _mm256_load_ps, which requires a
 *               32-byte aligned address.
 * num_points  - number of samples; the computation is skipped when it is 0.
 *
 * NOTE(review): `stdDev`, `mean` and `squareBuffer` are used below but their
 * declarations are not visible here, and neither is the statement that
 * hands stdDev back to the caller -- confirm against the full file.
 */
178 const float* inputBuffer,
180 unsigned int num_points)
183 if (num_points > 0) {
184 unsigned int number = 0;
/* Each vector iteration issues four 256-bit loads, i.e. 32 floats. */
185 const unsigned int thirtySecondthPoints = num_points / 32;
187 const float* aPtr = inputBuffer;
190 __m256 squareAccumulator = _mm256_setzero_ps();
191 __m256 aVal1, aVal2, aVal3, aVal4;
192 __m256 cVal1, cVal2, cVal3, cVal4;
193 for (; number < thirtySecondthPoints; number++) {
/*
 * _mm256_dp_ps works independently on each 128-bit half: the high nibble
 * 0xF multiplies all four lanes of the half, the low nibble picks the one
 * output lane of that half that receives the sum (others are zeroed).
 * NOTE(review): no pointer advance is visible between the loads here; as
 * shown all four loads read the same address -- presumably `aPtr += 8`
 * lines exist in the full file; confirm.
 */
194 aVal1 = _mm256_load_ps(aPtr);
196 cVal1 = _mm256_dp_ps(aVal1, aVal1, 0xF1);
198 aVal2 = _mm256_load_ps(aPtr);
200 cVal2 = _mm256_dp_ps(aVal2, aVal2, 0xF2);
202 aVal3 = _mm256_load_ps(aPtr);
204 cVal3 = _mm256_dp_ps(aVal3, aVal3, 0xF4);
206 aVal4 = _mm256_load_ps(aPtr);
208 cVal4 = _mm256_dp_ps(aVal4, aVal4, 0xF8);
/* The results occupy disjoint lanes, so OR merges them into one vector. */
210 cVal1 = _mm256_or_ps(cVal1, cVal2);
211 cVal3 = _mm256_or_ps(cVal3, cVal4);
212 cVal1 = _mm256_or_ps(cVal1, cVal3);
/*
 * NOTE(review): as shown the result of _mm256_add_ps is discarded;
 * presumably the `squareAccumulator =` half of this statement is on a line
 * that is not visible here -- confirm.
 */
215 _mm256_add_ps(squareAccumulator, cVal1);
/* Spill to squareBuffer (second argument not visible) for the lane sum below. */
217 _mm256_store_ps(squareBuffer,
/* Horizontal sum of the eight stored lanes. */
219 stdDev = squareBuffer[0];
220 stdDev += squareBuffer[1];
221 stdDev += squareBuffer[2];
222 stdDev += squareBuffer[3];
223 stdDev += squareBuffer[4];
224 stdDev += squareBuffer[5];
225 stdDev += squareBuffer[6];
226 stdDev += squareBuffer[7];
/* Scalar tail: the num_points % 32 samples the vector loop did not cover. */
228 number = thirtySecondthPoints * 32;
229 for (; number < num_points; number++) {
/* NOTE(review): no advance of aPtr is visible in this loop -- confirm. */
230 stdDev += (*aPtr) * (*aPtr);
/*
 * Mean of squares minus the square of the mean, then the square root.
 * NOTE(review): if rounding leaves a negative value here, sqrtf returns NaN.
 */
233 stdDev /= num_points;
234 stdDev -= (mean * mean);
235 stdDev = sqrtf(stdDev);
242#ifdef LV_HAVE_GENERIC
/*
 * Portable scalar kernel (the line carrying the function name is not visible
 * in this excerpt). From the visible statements it sums the squares of the
 * input samples, divides by num_points, subtracts mean * mean, takes sqrtf
 * and writes the result to *stddev.
 *
 * inputBuffer - input samples.
 * num_points  - number of samples; the computation is skipped when it is 0.
 *
 * NOTE(review): `stddev` and `mean` are used below but their declarations
 * are not visible here -- confirm against the full file.
 */
245 const float* inputBuffer,
247 unsigned int num_points)
249 float returnValue = 0;
250 if (num_points > 0) {
251 const float* aPtr = inputBuffer;
252 unsigned int number = 0;
/* Accumulate the square of every sample. */
254 for (number = 0; number < num_points; number++) {
/* NOTE(review): no advance of aPtr is visible in this loop -- confirm. */
255 returnValue += (*aPtr) * (*aPtr);
/*
 * Mean of squares minus the square of the mean, then the square root.
 * NOTE(review): if rounding leaves a negative value here, sqrtf returns NaN.
 */
259 returnValue /= num_points;
260 returnValue -= (mean * mean);
261 returnValue = sqrtf(returnValue);
263 *stddev = returnValue;
/*
 * NEON kernel (the line carrying the function name is not visible in this
 * excerpt). From the visible statements it sums the squares of the input
 * samples, divides by num_points, subtracts mean * mean, takes sqrtf and
 * writes the result to *stddev.
 *
 * inputBuffer - input samples.
 * num_points  - number of samples; the computation is skipped when it is 0.
 *
 * NOTE(review): `stddev`, `mean` and `sum` are used below but their
 * declarations are not visible here -- confirm against the full file.
 */
273 const float* inputBuffer,
275 unsigned int num_points)
277 float returnValue = 0;
278 if (num_points > 0) {
279 unsigned int number = 0;
/* One 4-float vector load per iteration. */
280 const unsigned int quarterPoints = num_points / 4;
282 const float* aPtr = inputBuffer;
284 float32x4_t squareAccumulator = vdupq_n_f32(0.0f);
286 for (; number < quarterPoints; number++) {
/* vmlaq_f32(acc, a, a): lane-wise acc + a * a. */
/* NOTE(review): no advance of aPtr is visible in this loop -- confirm. */
287 float32x4_t aVal = vld1q_f32(aPtr);
288 squareAccumulator = vmlaq_f32(squareAccumulator, aVal, aVal);
/*
 * Fold the four lanes: add the low and high halves, then a pairwise add
 * leaves the total in lane 0.
 * NOTE(review): the left-hand side of the next statement (presumably the
 * declaration of `sum`) is on a line that is not visible here -- confirm.
 */
294 vadd_f32(vget_low_f32(squareAccumulator), vget_high_f32(squareAccumulator));
295 sum = vpadd_f32(sum, sum);
296 returnValue = vget_lane_f32(sum, 0);
/* Scalar tail: the num_points % 4 samples the vector loop did not cover. */
298 number = quarterPoints * 4;
299 for (; number < num_points; number++) {
300 returnValue += (*aPtr) * (*aPtr);
/*
 * Mean of squares minus the square of the mean, then the square root.
 * NOTE(review): if rounding leaves a negative value here, sqrtf returns NaN.
 */
303 returnValue /= num_points;
304 returnValue -= (mean * mean);
305 returnValue = sqrtf(returnValue);
307 *stddev = returnValue;
/*
 * NEON (ARMv8) kernel. From the visible statements it sums the squares of
 * the input samples using two independent vector accumulators, divides by
 * num_points, subtracts mean * mean, takes sqrtf and writes the result to
 * *stddev.
 *
 * stddev      - output: receives the single computed value.
 * inputBuffer - input samples.
 * num_points  - number of samples; the computation is skipped when it is 0.
 *
 * NOTE(review): `mean` is used below but its declaration is not visible in
 * this excerpt, and neither are the braces that open and close the function
 * body -- confirm against the full file.
 */
315static inline void volk_32f_s32f_stddev_32f_neonv8(
float* stddev,
316 const float* inputBuffer,
318 unsigned int num_points)
320 float returnValue = 0;
321 if (num_points > 0) {
322 unsigned int number = 0;
/* Two 4-float vector loads per iteration, i.e. 8 floats. */
323 const unsigned int eighthPoints = num_points / 8;
325 const float* aPtr = inputBuffer;
327 float32x4_t squareAccumulator0 = vdupq_n_f32(0.0f);
328 float32x4_t squareAccumulator1 = vdupq_n_f32(0.0f);
330 for (; number < eighthPoints; number++) {
/* vfmaq_f32(acc, a, a): lane-wise fused acc + a * a, one chain per accumulator. */
/* NOTE(review): no advance of aPtr is visible in this loop -- confirm. */
332 float32x4_t aVal0 = vld1q_f32(aPtr);
333 float32x4_t aVal1 = vld1q_f32(aPtr + 4);
334 squareAccumulator0 = vfmaq_f32(squareAccumulator0, aVal0, aVal0);
335 squareAccumulator1 = vfmaq_f32(squareAccumulator1, aVal1, aVal1);
/* Merge both accumulators, then add across the four lanes. */
340 float32x4_t squareAccumulator = vaddq_f32(squareAccumulator0, squareAccumulator1);
341 returnValue = vaddvq_f32(squareAccumulator);
/* Scalar tail: the num_points % 8 samples the vector loop did not cover. */
343 number = eighthPoints * 8;
344 for (; number < num_points; number++) {
345 returnValue += (*aPtr) * (*aPtr);
/*
 * Mean of squares minus the square of the mean, then the square root.
 * NOTE(review): if rounding leaves a negative value here, sqrtf returns NaN.
 */
348 returnValue /= num_points;
349 returnValue -= (mean * mean);
350 returnValue = sqrtf(returnValue);
352 *stddev = returnValue;
359#ifndef INCLUDED_volk_32f_s32f_stddev_32f_u_H
360#define INCLUDED_volk_32f_s32f_stddev_32f_u_H
368#include <immintrin.h>
/*
 * AVX kernel using unaligned loads (the line carrying the function name is
 * not visible in this excerpt). From the visible statements it sums the
 * squares of the input samples into stdDev, divides by num_points,
 * subtracts mean * mean and takes sqrtf.
 *
 * inputBuffer - input samples; read with _mm256_loadu_ps, which has no
 *               alignment requirement.
 * num_points  - number of samples; the computation is skipped when it is 0.
 *
 * NOTE(review): `stdDev`, `mean` and `squareBuffer` are used below but their
 * declarations are not visible here, and neither are the store that fills
 * squareBuffer or the statement that hands stdDev back to the caller --
 * confirm against the full file.
 */
371 const float* inputBuffer,
373 unsigned int num_points)
376 if (num_points > 0) {
377 unsigned int number = 0;
/* Each vector iteration issues four 256-bit loads, i.e. 32 floats. */
378 const unsigned int thirtySecondthPoints = num_points / 32;
380 const float* aPtr = inputBuffer;
383 __m256 squareAccumulator = _mm256_setzero_ps();
384 __m256 aVal1, aVal2, aVal3, aVal4;
385 __m256 cVal1, cVal2, cVal3, cVal4;
386 for (; number < thirtySecondthPoints; number++) {
/*
 * _mm256_dp_ps works independently on each 128-bit half: the high nibble
 * 0xF multiplies all four lanes of the half, the low nibble picks the one
 * output lane of that half that receives the sum (others are zeroed).
 * NOTE(review): no pointer advance is visible between the loads here; as
 * shown all four loads read the same address -- presumably `aPtr += 8`
 * lines exist in the full file; confirm.
 */
387 aVal1 = _mm256_loadu_ps(aPtr);
389 cVal1 = _mm256_dp_ps(aVal1, aVal1, 0xF1);
391 aVal2 = _mm256_loadu_ps(aPtr);
393 cVal2 = _mm256_dp_ps(aVal2, aVal2, 0xF2);
395 aVal3 = _mm256_loadu_ps(aPtr);
397 cVal3 = _mm256_dp_ps(aVal3, aVal3, 0xF4);
399 aVal4 = _mm256_loadu_ps(aPtr);
401 cVal4 = _mm256_dp_ps(aVal4, aVal4, 0xF8);
/* The results occupy disjoint lanes, so OR merges them into one vector. */
403 cVal1 = _mm256_or_ps(cVal1, cVal2);
404 cVal3 = _mm256_or_ps(cVal3, cVal4);
405 cVal1 = _mm256_or_ps(cVal1, cVal3);
/*
 * NOTE(review): as shown the result of _mm256_add_ps is discarded;
 * presumably the `squareAccumulator =` half of this statement is on a line
 * that is not visible here -- confirm.
 */
408 _mm256_add_ps(squareAccumulator, cVal1);
/* Horizontal sum of the eight lanes read back from squareBuffer. */
413 stdDev = squareBuffer[0];
414 stdDev += squareBuffer[1];
415 stdDev += squareBuffer[2];
416 stdDev += squareBuffer[3];
417 stdDev += squareBuffer[4];
418 stdDev += squareBuffer[5];
419 stdDev += squareBuffer[6];
420 stdDev += squareBuffer[7];
/* Scalar tail: the num_points % 32 samples the vector loop did not cover. */
422 number = thirtySecondthPoints * 32;
423 for (; number < num_points; number++) {
/* NOTE(review): no advance of aPtr is visible in this loop -- confirm. */
424 stdDev += (*aPtr) * (*aPtr);
/*
 * Mean of squares minus the square of the mean, then the square root.
 * NOTE(review): if rounding leaves a negative value here, sqrtf returns NaN.
 */
427 stdDev /= num_points;
428 stdDev -= (mean * mean);
429 stdDev = sqrtf(stdDev);
436#include <riscv_vector.h>
/*
 * RISC-V Vector kernel. From the visible statements it accumulates the
 * squares of the input samples in an LMUL=8 vector, reduces them to a scalar
 * sum and writes sqrtf(sum / num_points - mean * mean) to *stddev.
 *
 * stddev      - output: receives the single computed value.
 * inputBuffer - input samples; the pointer is advanced as the loop consumes them.
 * num_points  - number of samples; 0 is special-cased (that branch's body is
 *               not visible in this excerpt).
 *
 * NOTE(review): `mean` is used below but its declaration is not visible
 * here, and neither are the braces that open and close the function body --
 * confirm against the full file.
 */
439static inline void volk_32f_s32f_stddev_32f_rvv(
float* stddev,
440 const float* inputBuffer,
442 unsigned int num_points)
444 if (num_points == 0) {
/* Zero every element of the accumulator across the maximum e32/m8 vector length. */
448 vfloat32m8_t vsum = __riscv_vfmv_v_f_f32m8(0, __riscv_vsetvlmax_e32m8());
449 size_t n = num_points;
/* Strip-mined loop: each pass handles vl samples, where vl is chosen by vsetvl. */
450 for (
size_t vl; n > 0; n -= vl, inputBuffer += vl) {
451 vl = __riscv_vsetvl_e32m8(n);
452 vfloat32m8_t v = __riscv_vle32_v_f32m8(inputBuffer, vl);
/*
 * vsum += v * v on the first vl elements; the tail-undisturbed (_tu) form
 * keeps the sums held in elements past vl when the last pass is short.
 */
453 vsum = __riscv_vfmacc_tu(vsum, v, v, vl);
455 size_t vl = __riscv_vsetvlmax_e32m1();
/*
 * NOTE(review): the `v` reduced here is not the loop-local `v` above; its
 * declaration (presumably vsum narrowed to an m1 vector) is on a line that
 * is not visible in this excerpt -- confirm.
 * vfredusum is the unordered sum reduction, seeded with a scalar 0.
 */
457 v = __riscv_vfredusum(v, __riscv_vfmv_s_f_f32m1(0, vl), vl);
458 float sum = __riscv_vfmv_f(v);
/*
 * Mean of squares minus the square of the mean, then the square root.
 * NOTE(review): if rounding leaves a negative value here, sqrtf returns NaN.
 */
459 *stddev = sqrtf((sum / num_points) - (mean * mean));