Vector Optimized Library of Kernels 3.3.0
Architecture-tuned implementations of math kernels
Loading...
Searching...
No Matches
volk_32f_s32f_32f_fm_detect_32f.h
Go to the documentation of this file.
/* -*- c++ -*- */
/*
 * Copyright 2012, 2014 Free Software Foundation, Inc.
 *
 * This file is part of VOLK
 *
 * SPDX-License-Identifier: LGPL-3.0-or-later
 */
9
43
44#ifndef INCLUDED_volk_32f_s32f_32f_fm_detect_32f_a_H
45#define INCLUDED_volk_32f_s32f_32f_fm_detect_32f_a_H
46
47#include <inttypes.h>
48#include <stdio.h>
49
50#ifdef LV_HAVE_AVX
51#include <immintrin.h>
52
53static inline void volk_32f_s32f_32f_fm_detect_32f_a_avx(float* outputVector,
54 const float* inputVector,
55 const float bound,
56 float* saveValue,
57 unsigned int num_points)
58{
59 if (num_points < 1) {
60 return;
61 }
62 unsigned int number = 1;
63 unsigned int j = 0;
64 // num_points-1 keeps Fedora 7's gcc from crashing...
65 // num_points won't work. :(
66 const unsigned int eighthPoints = (num_points - 1) / 8;
67
68 float* outPtr = outputVector;
69 const float* inPtr = inputVector;
70 __m256 upperBound = _mm256_set1_ps(bound);
71 __m256 lowerBound = _mm256_set1_ps(-bound);
72 __m256 next3old1;
73 __m256 next4;
74 __m256 boundAdjust;
75 __m256 posBoundAdjust = _mm256_set1_ps(-2 * bound); // Subtract when we're above.
76 __m256 negBoundAdjust = _mm256_set1_ps(2 * bound); // Add when we're below.
77 // Do the first 8 by hand since we're going in from the saveValue:
78 *outPtr = *inPtr - *saveValue;
79 if (*outPtr > bound)
80 *outPtr -= 2 * bound;
81 if (*outPtr < -bound)
82 *outPtr += 2 * bound;
83 inPtr++;
84 outPtr++;
85 for (j = 1; j < ((8 < num_points) ? 8 : num_points); j++) {
86 *outPtr = *(inPtr) - *(inPtr - 1);
87 if (*outPtr > bound)
88 *outPtr -= 2 * bound;
89 if (*outPtr < -bound)
90 *outPtr += 2 * bound;
91 inPtr++;
92 outPtr++;
93 }
94
95 for (; number < eighthPoints; number++) {
96 // Load data
97 next3old1 = _mm256_loadu_ps((float*)(inPtr - 1));
98 next4 = _mm256_load_ps(inPtr);
99 inPtr += 8;
100 // Subtract and store:
101 next3old1 = _mm256_sub_ps(next4, next3old1);
102 // Bound:
103 boundAdjust = _mm256_cmp_ps(next3old1, upperBound, _CMP_GT_OS);
104 boundAdjust = _mm256_and_ps(boundAdjust, posBoundAdjust);
105 next4 = _mm256_cmp_ps(next3old1, lowerBound, _CMP_LT_OS);
106 next4 = _mm256_and_ps(next4, negBoundAdjust);
107 boundAdjust = _mm256_or_ps(next4, boundAdjust);
108 // Make sure we're in the bounding interval:
109 next3old1 = _mm256_add_ps(next3old1, boundAdjust);
110 _mm256_store_ps(outPtr, next3old1); // Store the results back into the output
111 outPtr += 8;
112 }
113
114 for (number = (8 > (eighthPoints * 8) ? 8 : (8 * eighthPoints)); number < num_points;
115 number++) {
116 *outPtr = *(inPtr) - *(inPtr - 1);
117 if (*outPtr > bound)
118 *outPtr -= 2 * bound;
119 if (*outPtr < -bound)
120 *outPtr += 2 * bound;
121 inPtr++;
122 outPtr++;
123 }
124
125 *saveValue = inputVector[num_points - 1];
126}
127#endif /* LV_HAVE_AVX */
128
129
130#ifdef LV_HAVE_SSE
131#include <xmmintrin.h>
132
/*
 * FM detect, SSE kernel using aligned load/store on the bulk of the data.
 *
 * For every sample i the output is inputVector[i] - inputVector[i - 1], where
 * the predecessor of the first sample is *saveValue.  A difference greater
 * than bound has 2 * bound subtracted; a difference less than -bound has
 * 2 * bound added (a single wrap, not a full modulo).
 *
 * outputVector  receives num_points wrapped differences
 * inputVector   num_points input samples
 * bound         wrap threshold
 * saveValue     in: sample preceding inputVector[0];
 *               out: inputVector[num_points - 1]
 * num_points    number of samples; 0 leaves everything untouched
 *
 * The vector loop uses _mm_load_ps / _mm_store_ps on inputVector + 4k and
 * outputVector + 4k, so both buffers must be 16-byte aligned.
 * NOTE(review): the vector path ORs the two masked adjustments together, which
 * matches the scalar path only while the two comparisons are mutually
 * exclusive (true for bound >= 0) - confirm callers never pass a negative bound.
 */
static inline void volk_32f_s32f_32f_fm_detect_32f_a_sse(float* outputVector,
                                                         const float* inputVector,
                                                         const float bound,
                                                         float* saveValue,
                                                         unsigned int num_points)
{
    if (num_points < 1) {
        return;
    }

    // Historical note kept from the original authors: (num_points - 1) is used
    // instead of num_points because the latter crashed Fedora 7's gcc.
    const unsigned int quarterPoints = (num_points - 1) / 4;
    // The first block of (up to) 4 samples is always processed in scalar code.
    const unsigned int headPoints = (4 < num_points) ? 4 : num_points;

    float* outPtr = outputVector;
    const float* inPtr = inputVector;

    const __m128 upperBound = _mm_set_ps1(bound);
    const __m128 lowerBound = _mm_set_ps1(-bound);
    const __m128 posBoundAdjust = _mm_set_ps1(-2 * bound); // Subtract when above.
    const __m128 negBoundAdjust = _mm_set_ps1(2 * bound);  // Add when below.

    // Sample 0 is differenced against the value saved by the previous call.
    *outPtr = *inPtr - *saveValue;
    if (*outPtr > bound) {
        *outPtr -= 2 * bound;
    }
    if (*outPtr < -bound) {
        *outPtr += 2 * bound;
    }
    inPtr++;
    outPtr++;

    // Remaining samples of the first block, scalar.
    for (unsigned int j = 1; j < headPoints; j++) {
        *outPtr = *inPtr - *(inPtr - 1);
        if (*outPtr > bound) {
            *outPtr -= 2 * bound;
        }
        if (*outPtr < -bound) {
            *outPtr += 2 * bound;
        }
        inPtr++;
        outPtr++;
    }

    // Full blocks 1 .. quarterPoints - 1, four samples per iteration.
    for (unsigned int number = 1; number < quarterPoints; number++) {
        // The "previous sample" vector starts one float earlier, hence unaligned.
        const __m128 previous = _mm_loadu_ps(inPtr - 1);
        const __m128 current = _mm_load_ps(inPtr);
        inPtr += 4;

        __m128 diff = _mm_sub_ps(current, previous);

        // Select -2*bound where diff > bound and +2*bound where diff < -bound.
        const __m128 aboveAdjust =
            _mm_and_ps(_mm_cmpgt_ps(diff, upperBound), posBoundAdjust);
        const __m128 belowAdjust =
            _mm_and_ps(_mm_cmplt_ps(diff, lowerBound), negBoundAdjust);

        // Make sure we're in the bounding interval:
        diff = _mm_add_ps(diff, _mm_or_ps(belowAdjust, aboveAdjust));
        _mm_store_ps(outPtr, diff);
        outPtr += 4;
    }

    // Scalar tail: everything after the last vectorised block.
    for (unsigned int number = (4 > (quarterPoints * 4) ? 4 : (4 * quarterPoints));
         number < num_points;
         number++) {
        *outPtr = *inPtr - *(inPtr - 1);
        if (*outPtr > bound) {
            *outPtr -= 2 * bound;
        }
        if (*outPtr < -bound) {
            *outPtr += 2 * bound;
        }
        inPtr++;
        outPtr++;
    }

    // Hand the last input sample to the next call.
    *saveValue = inputVector[num_points - 1];
}
208#endif /* LV_HAVE_SSE */
209
210#ifdef LV_HAVE_GENERIC
211
/*
 * FM detect, portable scalar kernel.
 *
 * For every sample i the output is inputVector[i] - inputVector[i - 1], where
 * the predecessor of the first sample is *saveValue.  A difference greater
 * than bound has 2 * bound subtracted; afterwards a value less than -bound has
 * 2 * bound added (a single wrap each way, not a full modulo).
 *
 * outputVector  receives num_points wrapped differences
 * inputVector   num_points input samples
 * bound         wrap threshold
 * saveValue     in: sample preceding inputVector[0];
 *               out: inputVector[num_points - 1]
 * num_points    number of samples; 0 leaves everything untouched
 */
static inline void volk_32f_s32f_32f_fm_detect_32f_generic(float* outputVector,
                                                           const float* inputVector,
                                                           const float bound,
                                                           float* saveValue,
                                                           unsigned int num_points)
{
    if (num_points < 1) {
        return;
    }

    float* outPtr = outputVector;
    const float* inPtr = inputVector;

    // Sample 0 is differenced against the value saved by the previous call.
    *outPtr = *inPtr - *saveValue;
    if (*outPtr > bound) {
        *outPtr -= 2 * bound;
    }
    if (*outPtr < -bound) {
        *outPtr += 2 * bound;
    }
    inPtr++;
    outPtr++;

    // Every later sample is differenced against its predecessor in the buffer.
    for (unsigned int number = 1; number < num_points; number++) {
        *outPtr = *inPtr - *(inPtr - 1);
        if (*outPtr > bound) {
            *outPtr -= 2 * bound;
        }
        if (*outPtr < -bound) {
            *outPtr += 2 * bound;
        }
        inPtr++;
        outPtr++;
    }

    // Hand the last input sample to the next call.
    *saveValue = inputVector[num_points - 1];
}
246#endif /* LV_HAVE_GENERIC */
247
248
249#ifdef LV_HAVE_NEON
250#include <arm_neon.h>
251
/*
 * FM detect, NEON kernel (four samples per vector iteration).
 *
 * For every sample i the output is inputVector[i] - inputVector[i - 1], where
 * the predecessor of the first sample is *saveValue.  A difference greater
 * than bound has 2 * bound subtracted; a difference less than -bound has
 * 2 * bound added (a single wrap, not a full modulo).
 *
 * outputVector  receives num_points wrapped differences
 * inputVector   num_points input samples
 * bound         wrap threshold
 * saveValue     in: sample preceding inputVector[0];
 *               out: inputVector[num_points - 1]
 * num_points    number of samples; 0 leaves everything untouched
 *
 * NOTE(review): in the vector path the "below" select overrides the "above"
 * select when both comparisons hold, which can only happen for a negative
 * bound - confirm callers always pass bound >= 0.
 */
static inline void volk_32f_s32f_32f_fm_detect_32f_neon(float* outputVector,
                                                        const float* inputVector,
                                                        const float bound,
                                                        float* saveValue,
                                                        unsigned int num_points)
{
    if (num_points < 1) {
        return;
    }

    float* outPtr = outputVector;
    const float* inPtr = inputVector;

    const float32x4_t upperBound = vdupq_n_f32(bound);
    const float32x4_t lowerBound = vdupq_n_f32(-bound);
    const float32x4_t posBoundAdjust = vdupq_n_f32(-2.f * bound);
    const float32x4_t negBoundAdjust = vdupq_n_f32(2.f * bound);
    // Loop-invariant "no adjustment" vector, built once outside the loop.
    const float32x4_t zeros = vdupq_n_f32(0);

    // Sample 0 is differenced against the value saved by the previous call.
    *outPtr = *inPtr - *saveValue;
    if (*outPtr > bound) {
        *outPtr -= 2 * bound;
    }
    if (*outPtr < -bound) {
        *outPtr += 2 * bound;
    }
    inPtr++;
    outPtr++;

    // Remaining samples of the first block of (up to) 4, scalar.
    const unsigned int headPoints = (4 < num_points) ? 4 : num_points;
    for (unsigned int j = 1; j < headPoints; j++) {
        *outPtr = *inPtr - *(inPtr - 1);
        if (*outPtr > bound) {
            *outPtr -= 2 * bound;
        }
        if (*outPtr < -bound) {
            *outPtr += 2 * bound;
        }
        inPtr++;
        outPtr++;
    }

    // Full blocks 1 .. quarterPoints - 1, four samples per iteration.
    const unsigned int quarterPoints = (num_points - 1) / 4;
    for (unsigned int number = 1; number < quarterPoints; number++) {
        // Current samples and the same window shifted back by one sample.
        const float32x4_t curr = vld1q_f32(inPtr);
        const float32x4_t prev = vld1q_f32(inPtr - 1);
        inPtr += 4;

        float32x4_t diff = vsubq_f32(curr, prev);

        // Per-lane select of -2*bound, +2*bound or 0.
        const uint32x4_t aboveMask = vcgtq_f32(diff, upperBound);
        const uint32x4_t belowMask = vcltq_f32(diff, lowerBound);
        float32x4_t adjust = vbslq_f32(aboveMask, posBoundAdjust, zeros);
        adjust = vbslq_f32(belowMask, negBoundAdjust, adjust);

        diff = vaddq_f32(diff, adjust);

        vst1q_f32(outPtr, diff);
        outPtr += 4;
    }

    // Scalar tail: everything after the last vectorised block.
    for (unsigned int number = (4 > (quarterPoints * 4) ? 4 : (4 * quarterPoints));
         number < num_points;
         number++) {
        *outPtr = *inPtr - *(inPtr - 1);
        if (*outPtr > bound) {
            *outPtr -= 2 * bound;
        }
        if (*outPtr < -bound) {
            *outPtr += 2 * bound;
        }
        inPtr++;
        outPtr++;
    }

    // Hand the last input sample to the next call.
    *saveValue = inputVector[num_points - 1];
}
328#endif /* LV_HAVE_NEON */
329
330
331#ifdef LV_HAVE_NEONV8
332#include <arm_neon.h>
333
/*
 * FM detect, NEON kernel unrolled to eight samples per iteration.
 *
 * For every sample i the output is inputVector[i] - inputVector[i - 1], where
 * the predecessor of the first sample is *saveValue.  A difference greater
 * than bound has 2 * bound subtracted; a difference less than -bound has
 * 2 * bound added (a single wrap, not a full modulo).
 *
 * outputVector  receives num_points wrapped differences
 * inputVector   num_points input samples
 * bound         wrap threshold
 * saveValue     in: sample preceding inputVector[0];
 *               out: inputVector[num_points - 1]
 * num_points    number of samples; 0 leaves everything untouched
 *
 * Relies on the __VOLK_PREFETCH macro, which is not defined in this header
 * (presumably provided by VOLK's common header - confirm it is included first).
 * NOTE(review): in the vector path the "below" select overrides the "above"
 * select when both comparisons hold, which can only happen for a negative
 * bound - confirm callers always pass bound >= 0.
 */
static inline void volk_32f_s32f_32f_fm_detect_32f_neonv8(float* outputVector,
                                                          const float* inputVector,
                                                          const float bound,
                                                          float* saveValue,
                                                          unsigned int num_points)
{
    if (num_points < 1) {
        return;
    }

    float* outPtr = outputVector;
    const float* inPtr = inputVector;

    const float32x4_t upperBound = vdupq_n_f32(bound);
    const float32x4_t lowerBound = vdupq_n_f32(-bound);
    const float32x4_t posBoundAdjust = vdupq_n_f32(-2.f * bound);
    const float32x4_t negBoundAdjust = vdupq_n_f32(2.f * bound);
    const float32x4_t zeros = vdupq_n_f32(0);

    /* Sample 0 is differenced against the value saved by the previous call. */
    *outPtr = *inPtr - *saveValue;
    if (*outPtr > bound) {
        *outPtr -= 2 * bound;
    }
    if (*outPtr < -bound) {
        *outPtr += 2 * bound;
    }
    inPtr++;
    outPtr++;

    /* Remaining samples of the first block of (up to) 8, scalar. */
    const unsigned int headPoints = (8 < num_points) ? 8 : num_points;
    for (unsigned int j = 1; j < headPoints; j++) {
        *outPtr = *inPtr - *(inPtr - 1);
        if (*outPtr > bound) {
            *outPtr -= 2 * bound;
        }
        if (*outPtr < -bound) {
            *outPtr += 2 * bound;
        }
        inPtr++;
        outPtr++;
    }

    /* Full blocks 1 .. eighthPoints - 1: 8 floats per iteration (2x unroll). */
    const unsigned int eighthPoints = (num_points - 1) / 8;
    for (unsigned int number = 1; number < eighthPoints; number++) {
        /* Current samples and the same windows shifted back by one sample. */
        const float32x4_t curr0 = vld1q_f32(inPtr);
        const float32x4_t prev0 = vld1q_f32(inPtr - 1);
        const float32x4_t curr1 = vld1q_f32(inPtr + 4);
        const float32x4_t prev1 = vld1q_f32(inPtr + 3);
        __VOLK_PREFETCH(inPtr + 16);
        inPtr += 8;

        float32x4_t diff0 = vsubq_f32(curr0, prev0);
        float32x4_t diff1 = vsubq_f32(curr1, prev1);

        /* Bound wrapping, lanes 0..3: select -2*bound, +2*bound or 0. */
        const uint32x4_t above0 = vcgtq_f32(diff0, upperBound);
        const uint32x4_t below0 = vcltq_f32(diff0, lowerBound);
        float32x4_t adj0 = vbslq_f32(above0, posBoundAdjust, zeros);
        adj0 = vbslq_f32(below0, negBoundAdjust, adj0);
        diff0 = vaddq_f32(diff0, adj0);

        /* Bound wrapping, lanes 4..7. */
        const uint32x4_t above1 = vcgtq_f32(diff1, upperBound);
        const uint32x4_t below1 = vcltq_f32(diff1, lowerBound);
        float32x4_t adj1 = vbslq_f32(above1, posBoundAdjust, zeros);
        adj1 = vbslq_f32(below1, negBoundAdjust, adj1);
        diff1 = vaddq_f32(diff1, adj1);

        vst1q_f32(outPtr, diff0);
        vst1q_f32(outPtr + 4, diff1);
        outPtr += 8;
    }

    /* Scalar tail: everything after the last vectorised block. */
    for (unsigned int number = (8 > (eighthPoints * 8) ? 8 : (8 * eighthPoints));
         number < num_points;
         number++) {
        *outPtr = *inPtr - *(inPtr - 1);
        if (*outPtr > bound) {
            *outPtr -= 2 * bound;
        }
        if (*outPtr < -bound) {
            *outPtr += 2 * bound;
        }
        inPtr++;
        outPtr++;
    }

    /* Hand the last input sample to the next call. */
    *saveValue = inputVector[num_points - 1];
}
422#endif /* LV_HAVE_NEONV8 */
423
424#endif /* INCLUDED_volk_32f_s32f_32f_fm_detect_32f_a_H */
425
426
427#ifndef INCLUDED_volk_32f_s32f_32f_fm_detect_32f_u_H
428#define INCLUDED_volk_32f_s32f_32f_fm_detect_32f_u_H
429
430#include <inttypes.h>
431#include <stdio.h>
432
433#ifdef LV_HAVE_AVX
434#include <immintrin.h>
435
436static inline void volk_32f_s32f_32f_fm_detect_32f_u_avx(float* outputVector,
437 const float* inputVector,
438 const float bound,
439 float* saveValue,
440 unsigned int num_points)
441{
442 if (num_points < 1) {
443 return;
444 }
445 unsigned int number = 1;
446 unsigned int j = 0;
447 // num_points-1 keeps Fedora 7's gcc from crashing...
448 // num_points won't work. :(
449 const unsigned int eighthPoints = (num_points - 1) / 8;
450
451 float* outPtr = outputVector;
452 const float* inPtr = inputVector;
453 __m256 upperBound = _mm256_set1_ps(bound);
454 __m256 lowerBound = _mm256_set1_ps(-bound);
455 __m256 next3old1;
456 __m256 next4;
457 __m256 boundAdjust;
458 __m256 posBoundAdjust = _mm256_set1_ps(-2 * bound); // Subtract when we're above.
459 __m256 negBoundAdjust = _mm256_set1_ps(2 * bound); // Add when we're below.
460 // Do the first 8 by hand since we're going in from the saveValue:
461 *outPtr = *inPtr - *saveValue;
462 if (*outPtr > bound)
463 *outPtr -= 2 * bound;
464 if (*outPtr < -bound)
465 *outPtr += 2 * bound;
466 inPtr++;
467 outPtr++;
468 for (j = 1; j < ((8 < num_points) ? 8 : num_points); j++) {
469 *outPtr = *(inPtr) - *(inPtr - 1);
470 if (*outPtr > bound)
471 *outPtr -= 2 * bound;
472 if (*outPtr < -bound)
473 *outPtr += 2 * bound;
474 inPtr++;
475 outPtr++;
476 }
477
478 for (; number < eighthPoints; number++) {
479 // Load data
480 next3old1 = _mm256_loadu_ps((float*)(inPtr - 1));
481 next4 = _mm256_loadu_ps(inPtr);
482 inPtr += 8;
483 // Subtract and store:
484 next3old1 = _mm256_sub_ps(next4, next3old1);
485 // Bound:
486 boundAdjust = _mm256_cmp_ps(next3old1, upperBound, _CMP_GT_OS);
487 boundAdjust = _mm256_and_ps(boundAdjust, posBoundAdjust);
488 next4 = _mm256_cmp_ps(next3old1, lowerBound, _CMP_LT_OS);
489 next4 = _mm256_and_ps(next4, negBoundAdjust);
490 boundAdjust = _mm256_or_ps(next4, boundAdjust);
491 // Make sure we're in the bounding interval:
492 next3old1 = _mm256_add_ps(next3old1, boundAdjust);
493 _mm256_storeu_ps(outPtr, next3old1); // Store the results back into the output
494 outPtr += 8;
495 }
496
497 for (number = (8 > (eighthPoints * 8) ? 8 : (8 * eighthPoints)); number < num_points;
498 number++) {
499 *outPtr = *(inPtr) - *(inPtr - 1);
500 if (*outPtr > bound)
501 *outPtr -= 2 * bound;
502 if (*outPtr < -bound)
503 *outPtr += 2 * bound;
504 inPtr++;
505 outPtr++;
506 }
507
508 *saveValue = inputVector[num_points - 1];
509}
510#endif /* LV_HAVE_AVX */
511
512
513#ifdef LV_HAVE_RVV
514#include <riscv_vector.h>
515
/*
 * FM detect, RISC-V Vector kernel (e32, LMUL = 8).
 *
 * For every sample i the output is inputVector[i] - inputVector[i - 1], where
 * the predecessor of the first sample is *saveValue.  A difference greater
 * than bound has 2 * bound subtracted; afterwards a value less than -bound has
 * 2 * bound added (a single wrap each way, not a full modulo).
 *
 * outputVector  receives num_points wrapped differences
 * inputVector   num_points input samples
 * bound         wrap threshold
 * saveValue     in: sample preceding inputVector[0];
 *               out: inputVector[num_points - 1]
 * num_points    number of samples; 0 leaves everything untouched
 */
static inline void volk_32f_s32f_32f_fm_detect_32f_rvv(float* outputVector,
                                                       const float* inputVector,
                                                       const float bound,
                                                       float* saveValue,
                                                       unsigned int num_points)
{
    if (num_points < 1) {
        return;
    }

    // Sample 0 is differenced against the value saved by the previous call.
    *outputVector = *inputVector - *saveValue;
    if (*outputVector > bound) {
        *outputVector -= 2 * bound;
    }
    if (*outputVector < -bound) {
        *outputVector += 2 * bound;
    }
    ++inputVector;
    ++outputVector;

    // 2 * bound splatted across a maximum-length vector; reused every iteration.
    const vfloat32m8_t v2bound =
        __riscv_vfmv_v_f_f32m8(bound * 2, __riscv_vsetvlmax_e32m8());

    // Process the remaining num_points - 1 samples, vl elements per pass, where
    // vl is whatever vsetvl grants for the samples still outstanding.
    size_t n = num_points - 1;
    for (size_t vl; n > 0; n -= vl, inputVector += vl, outputVector += vl) {
        vl = __riscv_vsetvl_e32m8(n);
        // Current samples and the same window shifted back by one sample.
        vfloat32m8_t va = __riscv_vle32_v_f32m8(inputVector, vl);
        vfloat32m8_t vb = __riscv_vle32_v_f32m8(inputVector - 1, vl);
        vfloat32m8_t v = __riscv_vfsub(va, vb, vl);
        // Masked, mask-undisturbed ops: only lanes selected by the compare
        // are changed, the others keep their value of v.
        v = __riscv_vfsub_mu(__riscv_vmfgt(v, bound, vl), v, v, v2bound, vl);
        v = __riscv_vfadd_mu(__riscv_vmflt(v, -bound, vl), v, v, v2bound, vl);
        __riscv_vse32(outputVector, v, vl);
    }

    // inputVector now points one past the last sample, so [-1] is the last input.
    *saveValue = inputVector[-1];
}
548#endif /*LV_HAVE_RVV*/
549
550#endif /* INCLUDED_volk_32f_s32f_32f_fm_detect_32f_u_H */