58#ifndef INCLUDED_volk_32f_x2_min_32f_a_H
59#define INCLUDED_volk_32f_x2_min_32f_a_H
70 unsigned int num_points)
72 unsigned int number = 0;
73 const unsigned int quarterPoints = num_points / 4;
75 float* cPtr = cVector;
76 const float* aPtr = aVector;
77 const float* bPtr = bVector;
79 __m128 aVal, bVal, cVal;
80 for (; number < quarterPoints; number++) {
81 aVal = _mm_load_ps(aPtr);
82 bVal = _mm_load_ps(bPtr);
84 cVal = _mm_min_ps(aVal, bVal);
86 _mm_store_ps(cPtr, cVal);
93 number = quarterPoints * 4;
94 for (; number < num_points; number++) {
95 const float a = *aPtr++;
96 const float b = *bPtr++;
97 *cPtr++ = (a < b ? a : b);
107 const float* aVector,
108 const float* bVector,
109 unsigned int num_points)
111 float* cPtr = cVector;
112 const float* aPtr = aVector;
113 const float* bPtr = bVector;
114 unsigned int number = 0;
115 unsigned int quarter_points = num_points / 4;
117 float32x4_t a_vec, b_vec, c_vec;
118 for (number = 0; number < quarter_points; number++) {
119 a_vec = vld1q_f32(aPtr);
120 b_vec = vld1q_f32(bPtr);
122 c_vec = vminq_f32(a_vec, b_vec);
124 vst1q_f32(cPtr, c_vec);
130 for (number = quarter_points * 4; number < num_points; number++) {
131 const float a = *aPtr++;
132 const float b = *bPtr++;
133 *cPtr++ = (a < b ? a : b);
/*!
 * \brief Element-wise minimum of two float vectors using NEON intrinsics.
 *
 * Stores the smaller of aVector[i] and bVector[i] in cVector[i] for every i in
 * [0, num_points). Eight floats are handled per iteration as two independent
 * 4-lane vminq_f32 operations; the remaining num_points % 8 elements are
 * handled by a scalar loop.
 *
 * \param cVector    Output buffer; num_points floats are written.
 * \param aVector    First input buffer; num_points floats are read.
 * \param bVector    Second input buffer; num_points floats are read.
 * \param num_points Number of elements to process.
 */
static inline void volk_32f_x2_min_32f_neonv8(float* cVector,
                                              const float* aVector,
                                              const float* bVector,
                                              unsigned int num_points)
{
    const unsigned int eighthPoints = num_points / 8;

    const float* aPtr = aVector;
    const float* bPtr = bVector;
    float* cPtr = cVector;

    for (unsigned int number = 0; number < eighthPoints; number++) {
        float32x4_t a0 = vld1q_f32(aPtr);
        float32x4_t a1 = vld1q_f32(aPtr + 4);
        float32x4_t b0 = vld1q_f32(bPtr);
        float32x4_t b1 = vld1q_f32(bPtr + 4);

        vst1q_f32(cPtr, vminq_f32(a0, b0));
        vst1q_f32(cPtr + 4, vminq_f32(a1, b1));

        /* Step past the eight floats just handled; without this every
         * iteration would reprocess the first eight elements and the scalar
         * tail below would start at the wrong position. */
        aPtr += 8;
        bPtr += 8;
        cPtr += 8;
    }

    /* Scalar tail: the pointers now sit on the first unprocessed element. */
    for (unsigned int number = eighthPoints * 8; number < num_points; number++) {
        const float a = *aPtr++;
        const float b = *bPtr++;
        *cPtr++ = (a < b ? a : b);
    }
}
177#ifdef LV_HAVE_GENERIC
180 const float* aVector,
181 const float* bVector,
182 unsigned int num_points)
184 float* cPtr = cVector;
185 const float* aPtr = aVector;
186 const float* bPtr = bVector;
187 unsigned int number = 0;
189 for (number = 0; number < num_points; number++) {
190 const float a = *aPtr++;
191 const float b = *bPtr++;
192 *cPtr++ = (a < b ? a : b);
200extern void volk_32f_x2_min_32f_a_orc_impl(
float* cVector,
201 const float* aVector,
202 const float* bVector,
/*!
 * \brief ORC entry point for the element-wise float minimum.
 *
 * Forwards all four arguments, unchanged and in the same order, to
 * volk_32f_x2_min_32f_a_orc_impl(), which does the actual work.
 *
 * \param cVector    Output buffer, passed through to the ORC implementation.
 * \param aVector    First input buffer, passed through.
 * \param bVector    Second input buffer, passed through.
 * \param num_points Element count, passed through.
 */
static inline void volk_32f_x2_min_32f_u_orc(float* cVector,
                                             const float* aVector,
                                             const float* bVector,
                                             unsigned int num_points)
{
    volk_32f_x2_min_32f_a_orc_impl(cVector, aVector, bVector, num_points);
}
215#include <immintrin.h>
218 const float* aVector,
219 const float* bVector,
220 unsigned int num_points)
222 unsigned int number = 0;
223 const unsigned int eighthPoints = num_points / 8;
225 float* cPtr = cVector;
226 const float* aPtr = aVector;
227 const float* bPtr = bVector;
229 __m256 aVal, bVal, cVal;
230 for (; number < eighthPoints; number++) {
231 aVal = _mm256_load_ps(aPtr);
232 bVal = _mm256_load_ps(bPtr);
234 cVal = _mm256_min_ps(aVal, bVal);
236 _mm256_store_ps(cPtr, cVal);
243 number = eighthPoints * 8;
244 for (; number < num_points; number++) {
245 const float a = *aPtr++;
246 const float b = *bPtr++;
247 *cPtr++ = (a < b ? a : b);
252#ifdef LV_HAVE_AVX512F
253#include <immintrin.h>
255static inline void volk_32f_x2_min_32f_a_avx512f(
float* cVector,
256 const float* aVector,
257 const float* bVector,
258 unsigned int num_points)
260 unsigned int number = 0;
261 const unsigned int sixteenthPoints = num_points / 16;
263 float* cPtr = cVector;
264 const float* aPtr = aVector;
265 const float* bPtr = bVector;
267 __m512 aVal, bVal, cVal;
268 for (; number < sixteenthPoints; number++) {
269 aVal = _mm512_load_ps(aPtr);
270 bVal = _mm512_load_ps(bPtr);
272 cVal = _mm512_min_ps(aVal, bVal);
274 _mm512_store_ps(cPtr, cVal);
281 number = sixteenthPoints * 16;
282 for (; number < num_points; number++) {
283 const float a = *aPtr++;
284 const float b = *bPtr++;
285 *cPtr++ = (a < b ? a : b);
293#ifndef INCLUDED_volk_32f_x2_min_32f_u_H
294#define INCLUDED_volk_32f_x2_min_32f_u_H
299#ifdef LV_HAVE_AVX512F
300#include <immintrin.h>
302static inline void volk_32f_x2_min_32f_u_avx512f(
float* cVector,
303 const float* aVector,
304 const float* bVector,
305 unsigned int num_points)
307 unsigned int number = 0;
308 const unsigned int sixteenthPoints = num_points / 16;
310 float* cPtr = cVector;
311 const float* aPtr = aVector;
312 const float* bPtr = bVector;
314 __m512 aVal, bVal, cVal;
315 for (; number < sixteenthPoints; number++) {
316 aVal = _mm512_loadu_ps(aPtr);
317 bVal = _mm512_loadu_ps(bPtr);
319 cVal = _mm512_min_ps(aVal, bVal);
321 _mm512_storeu_ps(cPtr, cVal);
328 number = sixteenthPoints * 16;
329 for (; number < num_points; number++) {
330 const float a = *aPtr++;
331 const float b = *bPtr++;
332 *cPtr++ = (a < b ? a : b);
338#include <immintrin.h>
341 const float* aVector,
342 const float* bVector,
343 unsigned int num_points)
345 unsigned int number = 0;
346 const unsigned int eighthPoints = num_points / 8;
348 float* cPtr = cVector;
349 const float* aPtr = aVector;
350 const float* bPtr = bVector;
352 __m256 aVal, bVal, cVal;
353 for (; number < eighthPoints; number++) {
354 aVal = _mm256_loadu_ps(aPtr);
355 bVal = _mm256_loadu_ps(bPtr);
357 cVal = _mm256_min_ps(aVal, bVal);
359 _mm256_storeu_ps(cPtr, cVal);
366 number = eighthPoints * 8;
367 for (; number < num_points; number++) {
368 const float a = *aPtr++;
369 const float b = *bPtr++;
370 *cPtr++ = (a < b ? a : b);
376#include <riscv_vector.h>
/*!
 * \brief Element-wise minimum of two float vectors using RISC-V Vector
 *        intrinsics.
 *
 * Processes the input in strips. Each pass asks __riscv_vsetvl_e32m8() how
 * many 32-bit elements to handle given the count still outstanding, loads
 * that many floats from each input, applies __riscv_vfmin and stores the
 * result. No separate scalar tail is needed because the last strip is simply
 * shorter.
 *
 * \param cVector    Output buffer; num_points floats are written.
 * \param aVector    First input buffer; num_points floats are read.
 * \param bVector    Second input buffer; num_points floats are read.
 * \param num_points Number of elements to process.
 */
static inline void volk_32f_x2_min_32f_rvv(float* cVector,
                                           const float* aVector,
                                           const float* bVector,
                                           unsigned int num_points)
{
    const size_t total = num_points;
    size_t done = 0;

    while (done < total) {
        /* Strip length granted for the elements still outstanding. */
        const size_t vl = __riscv_vsetvl_e32m8(total - done);

        const vfloat32m8_t lhs = __riscv_vle32_v_f32m8(aVector + done, vl);
        const vfloat32m8_t rhs = __riscv_vle32_v_f32m8(bVector + done, vl);
        const vfloat32m8_t smaller = __riscv_vfmin(lhs, rhs, vl);

        __riscv_vse32(cVector + done, smaller, vl);

        done += vl;
    }
}