Vector Optimized Library of Kernels 3.3.0
Architecture-tuned implementations of math kernels
Loading...
Searching...
No Matches
volk_32f_s32f_stddev_32f.h
Go to the documentation of this file.
1/* -*- c++ -*- */
2/*
3 * Copyright 2012, 2014 Free Software Foundation, Inc.
4 *
5 * This file is part of VOLK
6 *
7 * SPDX-License-Identifier: LGPL-3.0-or-later
8 */
9
54
55#ifndef INCLUDED_volk_32f_s32f_stddev_32f_a_H
56#define INCLUDED_volk_32f_s32f_stddev_32f_a_H
57
58#include <inttypes.h>
59#include <math.h>
60#include <stdio.h>
61#include <volk/volk_common.h>
62
63#ifdef LV_HAVE_SSE4_1
64#include <smmintrin.h>
65
66static inline void volk_32f_s32f_stddev_32f_a_sse4_1(float* stddev,
67 const float* inputBuffer,
68 const float mean,
69 unsigned int num_points)
70{
71 float returnValue = 0;
72 if (num_points > 0) {
73 unsigned int number = 0;
74 const unsigned int sixteenthPoints = num_points / 16;
75
76 const float* aPtr = inputBuffer;
77
78 __VOLK_ATTR_ALIGNED(16) float squareBuffer[4];
79
80 __m128 squareAccumulator = _mm_setzero_ps();
81 __m128 aVal1, aVal2, aVal3, aVal4;
82 __m128 cVal1, cVal2, cVal3, cVal4;
83 for (; number < sixteenthPoints; number++) {
84 aVal1 = _mm_load_ps(aPtr);
85 aPtr += 4;
86 cVal1 = _mm_dp_ps(aVal1, aVal1, 0xF1);
87
88 aVal2 = _mm_load_ps(aPtr);
89 aPtr += 4;
90 cVal2 = _mm_dp_ps(aVal2, aVal2, 0xF2);
91
92 aVal3 = _mm_load_ps(aPtr);
93 aPtr += 4;
94 cVal3 = _mm_dp_ps(aVal3, aVal3, 0xF4);
95
96 aVal4 = _mm_load_ps(aPtr);
97 aPtr += 4;
98 cVal4 = _mm_dp_ps(aVal4, aVal4, 0xF8);
99
100 cVal1 = _mm_or_ps(cVal1, cVal2);
101 cVal3 = _mm_or_ps(cVal3, cVal4);
102 cVal1 = _mm_or_ps(cVal1, cVal3);
103
104 squareAccumulator =
105 _mm_add_ps(squareAccumulator, cVal1); // squareAccumulator += x^2
106 }
107 _mm_store_ps(squareBuffer,
108 squareAccumulator); // Store the results back into the C container
109 returnValue = squareBuffer[0];
110 returnValue += squareBuffer[1];
111 returnValue += squareBuffer[2];
112 returnValue += squareBuffer[3];
113
114 number = sixteenthPoints * 16;
115 for (; number < num_points; number++) {
116 returnValue += (*aPtr) * (*aPtr);
117 aPtr++;
118 }
119 returnValue /= num_points;
120 returnValue -= (mean * mean);
121 returnValue = sqrtf(returnValue);
122 }
123 *stddev = returnValue;
124}
125
126#endif /* LV_HAVE_SSE4_1 */
127
128#ifdef LV_HAVE_SSE
129#include <xmmintrin.h>
130
131static inline void volk_32f_s32f_stddev_32f_a_sse(float* stddev,
132 const float* inputBuffer,
133 const float mean,
134 unsigned int num_points)
135{
136 float returnValue = 0;
137 if (num_points > 0) {
138 unsigned int number = 0;
139 const unsigned int quarterPoints = num_points / 4;
140
141 const float* aPtr = inputBuffer;
142
143 __VOLK_ATTR_ALIGNED(16) float squareBuffer[4];
144
145 __m128 squareAccumulator = _mm_setzero_ps();
146 __m128 aVal = _mm_setzero_ps();
147 for (; number < quarterPoints; number++) {
148 aVal = _mm_load_ps(aPtr); // aVal = x
149 aVal = _mm_mul_ps(aVal, aVal); // squareAccumulator += x^2
150 squareAccumulator = _mm_add_ps(squareAccumulator, aVal);
151 aPtr += 4;
152 }
153 _mm_store_ps(squareBuffer,
154 squareAccumulator); // Store the results back into the C container
155 returnValue = squareBuffer[0];
156 returnValue += squareBuffer[1];
157 returnValue += squareBuffer[2];
158 returnValue += squareBuffer[3];
159
160 number = quarterPoints * 4;
161 for (; number < num_points; number++) {
162 returnValue += (*aPtr) * (*aPtr);
163 aPtr++;
164 }
165 returnValue /= num_points;
166 returnValue -= (mean * mean);
167 returnValue = sqrtf(returnValue);
168 }
169 *stddev = returnValue;
170}
171#endif /* LV_HAVE_SSE */
172
173
174#ifdef LV_HAVE_AVX
175#include <immintrin.h>
176
177static inline void volk_32f_s32f_stddev_32f_a_avx(float* stddev,
178 const float* inputBuffer,
179 const float mean,
180 unsigned int num_points)
181{
182 float stdDev = 0;
183 if (num_points > 0) {
184 unsigned int number = 0;
185 const unsigned int thirtySecondthPoints = num_points / 32;
186
187 const float* aPtr = inputBuffer;
188 __VOLK_ATTR_ALIGNED(32) float squareBuffer[8];
189
190 __m256 squareAccumulator = _mm256_setzero_ps();
191 __m256 aVal1, aVal2, aVal3, aVal4;
192 __m256 cVal1, cVal2, cVal3, cVal4;
193 for (; number < thirtySecondthPoints; number++) {
194 aVal1 = _mm256_load_ps(aPtr);
195 aPtr += 8;
196 cVal1 = _mm256_dp_ps(aVal1, aVal1, 0xF1);
197
198 aVal2 = _mm256_load_ps(aPtr);
199 aPtr += 8;
200 cVal2 = _mm256_dp_ps(aVal2, aVal2, 0xF2);
201
202 aVal3 = _mm256_load_ps(aPtr);
203 aPtr += 8;
204 cVal3 = _mm256_dp_ps(aVal3, aVal3, 0xF4);
205
206 aVal4 = _mm256_load_ps(aPtr);
207 aPtr += 8;
208 cVal4 = _mm256_dp_ps(aVal4, aVal4, 0xF8);
209
210 cVal1 = _mm256_or_ps(cVal1, cVal2);
211 cVal3 = _mm256_or_ps(cVal3, cVal4);
212 cVal1 = _mm256_or_ps(cVal1, cVal3);
213
214 squareAccumulator =
215 _mm256_add_ps(squareAccumulator, cVal1); // squareAccumulator += x^2
216 }
217 _mm256_store_ps(squareBuffer,
218 squareAccumulator); // Store the results back into the C container
219 stdDev = squareBuffer[0];
220 stdDev += squareBuffer[1];
221 stdDev += squareBuffer[2];
222 stdDev += squareBuffer[3];
223 stdDev += squareBuffer[4];
224 stdDev += squareBuffer[5];
225 stdDev += squareBuffer[6];
226 stdDev += squareBuffer[7];
227
228 number = thirtySecondthPoints * 32;
229 for (; number < num_points; number++) {
230 stdDev += (*aPtr) * (*aPtr);
231 aPtr++;
232 }
233 stdDev /= num_points;
234 stdDev -= (mean * mean);
235 stdDev = sqrtf(stdDev);
236 }
237 *stddev = stdDev;
238}
239#endif /* LV_HAVE_AVX */
240
241
242#ifdef LV_HAVE_GENERIC
243
244static inline void volk_32f_s32f_stddev_32f_generic(float* stddev,
245 const float* inputBuffer,
246 const float mean,
247 unsigned int num_points)
248{
249 float returnValue = 0;
250 if (num_points > 0) {
251 const float* aPtr = inputBuffer;
252 unsigned int number = 0;
253
254 for (number = 0; number < num_points; number++) {
255 returnValue += (*aPtr) * (*aPtr);
256 aPtr++;
257 }
258
259 returnValue /= num_points;
260 returnValue -= (mean * mean);
261 returnValue = sqrtf(returnValue);
262 }
263 *stddev = returnValue;
264}
265
266#endif /* LV_HAVE_GENERIC */
267
268
269#ifdef LV_HAVE_NEON
270#include <arm_neon.h>
271
272static inline void volk_32f_s32f_stddev_32f_neon(float* stddev,
273 const float* inputBuffer,
274 const float mean,
275 unsigned int num_points)
276{
277 float returnValue = 0;
278 if (num_points > 0) {
279 unsigned int number = 0;
280 const unsigned int quarterPoints = num_points / 4;
281
282 const float* aPtr = inputBuffer;
283
284 float32x4_t squareAccumulator = vdupq_n_f32(0.0f);
285
286 for (; number < quarterPoints; number++) {
287 float32x4_t aVal = vld1q_f32(aPtr);
288 squareAccumulator = vmlaq_f32(squareAccumulator, aVal, aVal);
289 aPtr += 4;
290 }
291
292 // Reduce the accumulator
293 float32x2_t sum =
294 vadd_f32(vget_low_f32(squareAccumulator), vget_high_f32(squareAccumulator));
295 sum = vpadd_f32(sum, sum);
296 returnValue = vget_lane_f32(sum, 0);
297
298 number = quarterPoints * 4;
299 for (; number < num_points; number++) {
300 returnValue += (*aPtr) * (*aPtr);
301 aPtr++;
302 }
303 returnValue /= num_points;
304 returnValue -= (mean * mean);
305 returnValue = sqrtf(returnValue);
306 }
307 *stddev = returnValue;
308}
309
310#endif /* LV_HAVE_NEON */
311
312#ifdef LV_HAVE_NEONV8
313#include <arm_neon.h>
314
315static inline void volk_32f_s32f_stddev_32f_neonv8(float* stddev,
316 const float* inputBuffer,
317 const float mean,
318 unsigned int num_points)
319{
320 float returnValue = 0;
321 if (num_points > 0) {
322 unsigned int number = 0;
323 const unsigned int eighthPoints = num_points / 8;
324
325 const float* aPtr = inputBuffer;
326
327 float32x4_t squareAccumulator0 = vdupq_n_f32(0.0f);
328 float32x4_t squareAccumulator1 = vdupq_n_f32(0.0f);
329
330 for (; number < eighthPoints; number++) {
331 __VOLK_PREFETCH(aPtr + 16);
332 float32x4_t aVal0 = vld1q_f32(aPtr);
333 float32x4_t aVal1 = vld1q_f32(aPtr + 4);
334 squareAccumulator0 = vfmaq_f32(squareAccumulator0, aVal0, aVal0);
335 squareAccumulator1 = vfmaq_f32(squareAccumulator1, aVal1, aVal1);
336 aPtr += 8;
337 }
338
339 // Combine and reduce the accumulators
340 float32x4_t squareAccumulator = vaddq_f32(squareAccumulator0, squareAccumulator1);
341 returnValue = vaddvq_f32(squareAccumulator);
342
343 number = eighthPoints * 8;
344 for (; number < num_points; number++) {
345 returnValue += (*aPtr) * (*aPtr);
346 aPtr++;
347 }
348 returnValue /= num_points;
349 returnValue -= (mean * mean);
350 returnValue = sqrtf(returnValue);
351 }
352 *stddev = returnValue;
353}
354
355#endif /* LV_HAVE_NEONV8 */
356
357#endif /* INCLUDED_volk_32f_s32f_stddev_32f_a_H */
358
359#ifndef INCLUDED_volk_32f_s32f_stddev_32f_u_H
360#define INCLUDED_volk_32f_s32f_stddev_32f_u_H
361
362#include <inttypes.h>
363#include <math.h>
364#include <stdio.h>
365#include <volk/volk_common.h>
366
367#ifdef LV_HAVE_AVX
368#include <immintrin.h>
369
370static inline void volk_32f_s32f_stddev_32f_u_avx(float* stddev,
371 const float* inputBuffer,
372 const float mean,
373 unsigned int num_points)
374{
375 float stdDev = 0;
376 if (num_points > 0) {
377 unsigned int number = 0;
378 const unsigned int thirtySecondthPoints = num_points / 32;
379
380 const float* aPtr = inputBuffer;
381 __VOLK_ATTR_ALIGNED(32) float squareBuffer[8];
382
383 __m256 squareAccumulator = _mm256_setzero_ps();
384 __m256 aVal1, aVal2, aVal3, aVal4;
385 __m256 cVal1, cVal2, cVal3, cVal4;
386 for (; number < thirtySecondthPoints; number++) {
387 aVal1 = _mm256_loadu_ps(aPtr);
388 aPtr += 8;
389 cVal1 = _mm256_dp_ps(aVal1, aVal1, 0xF1);
390
391 aVal2 = _mm256_loadu_ps(aPtr);
392 aPtr += 8;
393 cVal2 = _mm256_dp_ps(aVal2, aVal2, 0xF2);
394
395 aVal3 = _mm256_loadu_ps(aPtr);
396 aPtr += 8;
397 cVal3 = _mm256_dp_ps(aVal3, aVal3, 0xF4);
398
399 aVal4 = _mm256_loadu_ps(aPtr);
400 aPtr += 8;
401 cVal4 = _mm256_dp_ps(aVal4, aVal4, 0xF8);
402
403 cVal1 = _mm256_or_ps(cVal1, cVal2);
404 cVal3 = _mm256_or_ps(cVal3, cVal4);
405 cVal1 = _mm256_or_ps(cVal1, cVal3);
406
407 squareAccumulator =
408 _mm256_add_ps(squareAccumulator, cVal1); // squareAccumulator += x^2
409 }
410 _mm256_storeu_ps(
411 squareBuffer,
412 squareAccumulator); // Store the results back into the C container
413 stdDev = squareBuffer[0];
414 stdDev += squareBuffer[1];
415 stdDev += squareBuffer[2];
416 stdDev += squareBuffer[3];
417 stdDev += squareBuffer[4];
418 stdDev += squareBuffer[5];
419 stdDev += squareBuffer[6];
420 stdDev += squareBuffer[7];
421
422 number = thirtySecondthPoints * 32;
423 for (; number < num_points; number++) {
424 stdDev += (*aPtr) * (*aPtr);
425 aPtr++;
426 }
427 stdDev /= num_points;
428 stdDev -= (mean * mean);
429 stdDev = sqrtf(stdDev);
430 }
431 *stddev = stdDev;
432}
433#endif /* LV_HAVE_AVX */
434
435#ifdef LV_HAVE_RVV
436#include <riscv_vector.h>
438
439static inline void volk_32f_s32f_stddev_32f_rvv(float* stddev,
440 const float* inputBuffer,
441 const float mean,
442 unsigned int num_points)
443{
444 if (num_points == 0) {
445 *stddev = 0;
446 return;
447 }
448 vfloat32m8_t vsum = __riscv_vfmv_v_f_f32m8(0, __riscv_vsetvlmax_e32m8());
449 size_t n = num_points;
450 for (size_t vl; n > 0; n -= vl, inputBuffer += vl) {
451 vl = __riscv_vsetvl_e32m8(n);
452 vfloat32m8_t v = __riscv_vle32_v_f32m8(inputBuffer, vl);
453 vsum = __riscv_vfmacc_tu(vsum, v, v, vl);
454 }
455 size_t vl = __riscv_vsetvlmax_e32m1();
456 vfloat32m1_t v = RISCV_SHRINK8(vfadd, f, 32, vsum);
457 v = __riscv_vfredusum(v, __riscv_vfmv_s_f_f32m1(0, vl), vl);
458 float sum = __riscv_vfmv_f(v);
459 *stddev = sqrtf((sum / num_points) - (mean * mean));
460}
461#endif /*LV_HAVE_RVV*/
462
463#endif /* INCLUDED_volk_32f_s32f_stddev_32f_u_H */