57#ifndef INCLUDED_volk_32f_s32f_convert_32i_u_H
58#define INCLUDED_volk_32f_s32f_convert_32i_u_H
/*
 * AVX path using unaligned loads/stores: multiplies each input float by
 * `scalar`, clamps the product to [min_val, max_val] and converts it to a
 * 32-bit signed integer, eight elements per iteration, followed by a scalar
 * loop for the leftover elements.
 * NOTE(review): the start of the signature (function name, outputVector and
 * scalar parameters) and several statements are not visible in this excerpt.
 */
68 const float* inputVector,
70 unsigned int num_points)
72 unsigned int number = 0;
/* Number of complete groups of 8 floats handled by the vector loop. */
74 const unsigned int eighthPoints = num_points / 8;
76 const float* inputVectorPtr = (
const float*)inputVector;
77 int32_t* outputVectorPtr = outputVector;
/* Clamp limits held as floats: INT_MIN and INT_MAX + 1. The uint32_t cast
 * makes the "+ 1" unsigned arithmetic, so it cannot overflow a signed int. */
79 float min_val = INT_MIN;
80 float max_val = (uint32_t)INT_MAX + 1;
/* Broadcast the scale factor and both limits to all 8 lanes. */
83 __m256 vScalar = _mm256_set1_ps(scalar);
86 __m256 vmin_val = _mm256_set1_ps(min_val);
87 __m256 vmax_val = _mm256_set1_ps(max_val);
/* Vector loop. NOTE(review): the pointer increments are not visible in this
 * excerpt - confirm both pointers advance by 8 per iteration. */
89 for (; number < eighthPoints; number++) {
90 inputVal1 = _mm256_loadu_ps(inputVectorPtr);
/* Scale, then clamp: min against the upper limit, max against the lower. */
93 inputVal1 = _mm256_max_ps(
94 _mm256_min_ps(_mm256_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
/* NOTE(review): a lane clamped to exactly max_val (2^31) is outside the
 * int32 range; the Intel docs say cvtps_epi32 returns 0x80000000 for such
 * inputs, so positive saturation may yield INT_MIN - confirm. */
95 intInputVal1 = _mm256_cvtps_epi32(inputVal1);
97 _mm256_storeu_si256((__m256i*)outputVectorPtr, intInputVal1);
/* Scalar tail: restart at the first index not covered by the vector loop. */
101 number = eighthPoints * 8;
102 for (; number < num_points; number++) {
103 r = inputVector[number] * scalar;
/* NOTE(review): the upper-limit test and both clamp assignments are not
 * visible here. If r is set to max_val, the int32_t cast below is out of
 * range - confirm. */
106 else if (r < min_val)
/* rintf rounds according to the current floating-point rounding mode. */
108 outputVector[number] = (int32_t)rintf(r);
115#include <emmintrin.h>
/*
 * SSE2 path using unaligned loads/stores: multiplies each input float by
 * `scalar`, clamps the product to [min_val, max_val] and converts it to a
 * 32-bit signed integer, four elements per iteration, followed by a scalar
 * loop for the leftover elements.
 * NOTE(review): the start of the signature (function name, outputVector and
 * scalar parameters) and several statements are not visible in this excerpt.
 */
118 const float* inputVector,
120 unsigned int num_points)
122 unsigned int number = 0;
/* Number of complete groups of 4 floats handled by the vector loop. */
124 const unsigned int quarterPoints = num_points / 4;
126 const float* inputVectorPtr = (
const float*)inputVector;
127 int32_t* outputVectorPtr = outputVector;
/* Clamp limits held as floats: INT_MIN and INT_MAX + 1. The uint32_t cast
 * makes the "+ 1" unsigned arithmetic, so it cannot overflow a signed int. */
129 float min_val = INT_MIN;
130 float max_val = (uint32_t)INT_MAX + 1;
/* Broadcast the scale factor and both limits to all 4 lanes. */
133 __m128 vScalar = _mm_set_ps1(scalar);
135 __m128i intInputVal1;
136 __m128 vmin_val = _mm_set_ps1(min_val);
137 __m128 vmax_val = _mm_set_ps1(max_val);
/* Vector loop. NOTE(review): the input pointer increment is not visible in
 * this excerpt - confirm it advances by 4 per iteration. */
139 for (; number < quarterPoints; number++) {
140 inputVal1 = _mm_loadu_ps(inputVectorPtr);
/* Scale, then clamp: min against the upper limit, max against the lower.
 * The left-hand side of this expression is not visible in this excerpt. */
144 _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
/* NOTE(review): a lane clamped to exactly max_val (2^31) is outside the
 * int32 range; the Intel docs say cvtps_epi32 returns 0x80000000 for such
 * inputs, so positive saturation may yield INT_MIN - confirm. */
145 intInputVal1 = _mm_cvtps_epi32(inputVal1);
147 _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1);
148 outputVectorPtr += 4;
/* Scalar tail: restart at the first index not covered by the vector loop. */
151 number = quarterPoints * 4;
152 for (; number < num_points; number++) {
153 r = inputVector[number] * scalar;
/* NOTE(review): the upper-limit test and both clamp assignments are not
 * visible here. If r is set to max_val, the int32_t cast below is out of
 * range - confirm. */
156 else if (r < min_val)
/* rintf rounds according to the current floating-point rounding mode. */
158 outputVector[number] = (int32_t)rintf(r);
166#include <xmmintrin.h>
/*
 * SSE path using unaligned loads: the multiply and clamp are done on four
 * floats at a time, but the float-to-int conversion is done per element with
 * rintf after spilling the vector to a float buffer. A scalar loop handles
 * the leftover elements.
 * NOTE(review): the start of the signature (function name, outputVector and
 * scalar parameters) and several statements are not visible in this excerpt.
 */
169 const float* inputVector,
171 unsigned int num_points)
173 unsigned int number = 0;
/* Number of complete groups of 4 floats handled by the vector loop. */
175 const unsigned int quarterPoints = num_points / 4;
177 const float* inputVectorPtr = (
const float*)inputVector;
178 int32_t* outputVectorPtr = outputVector;
/* Clamp limits held as floats: INT_MIN and INT_MAX + 1. The uint32_t cast
 * makes the "+ 1" unsigned arithmetic, so it cannot overflow a signed int. */
180 float min_val = INT_MIN;
181 float max_val = (uint32_t)INT_MAX + 1;
/* Broadcast the scale factor and both limits to all 4 lanes. */
184 __m128 vScalar = _mm_set_ps1(scalar);
186 __m128 vmin_val = _mm_set_ps1(min_val);
187 __m128 vmax_val = _mm_set_ps1(max_val);
/* Vector loop. NOTE(review): the input pointer increment is not visible in
 * this excerpt - confirm it advances by 4 per iteration. */
191 for (; number < quarterPoints; number++) {
192 ret = _mm_loadu_ps(inputVectorPtr);
/* Scale, then clamp: min against the upper limit, max against the lower. */
195 ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val);
/* _mm_store_ps is an aligned store; outputFloatBuffer's declaration is not
 * visible here - presumably a 16-byte aligned float[4], confirm. */
197 _mm_store_ps(outputFloatBuffer, ret);
/* Round each lane with rintf and write it out, advancing the output by 4.
 * NOTE(review): a lane clamped to max_val (2^31) is out of range for the
 * int32_t cast - confirm intended saturation behaviour. */
198 *outputVectorPtr++ = (int32_t)rintf(outputFloatBuffer[0]);
199 *outputVectorPtr++ = (int32_t)rintf(outputFloatBuffer[1]);
200 *outputVectorPtr++ = (int32_t)rintf(outputFloatBuffer[2]);
201 *outputVectorPtr++ = (int32_t)rintf(outputFloatBuffer[3]);
/* Scalar tail: restart at the first index not covered by the vector loop. */
204 number = quarterPoints * 4;
205 for (; number < num_points; number++) {
206 r = inputVector[number] * scalar;
/* NOTE(review): the upper-limit test and both clamp assignments are not
 * visible here - confirm how r is limited before the cast below. */
209 else if (r < min_val)
211 outputVector[number] = (int32_t)rintf(r);
218#ifdef LV_HAVE_GENERIC
/*
 * Portable scalar path: for each of num_points inputs, multiplies by
 * `scalar`, limits the result against min_val/max_val and stores the value
 * rounded with rintf as int32_t.
 * NOTE(review): the start of the signature (function name, outputVector and
 * scalar parameters), the declaration of `s` and the saturation branches are
 * not visible in this excerpt.
 */
221 const float* inputVector,
223 unsigned int num_points)
225 int32_t* outputVectorPtr = outputVector;
226 const float* inputVectorPtr = inputVector;
/* Limits as floats: INT_MIN and INT_MAX + 1. The uint32_t cast makes the
 * "+ 1" unsigned arithmetic, so it cannot overflow a signed int. */
227 const float min_val = (float)INT_MIN;
228 const float max_val = (float)((uint32_t)INT_MAX + 1);
230 for (
unsigned int number = 0; number < num_points; number++) {
/* Read one input (advancing the read pointer) and scale it. */
231 const float r = *inputVectorPtr++ * scalar;
/* NOTE(review): the upper-limit test and the values assigned on saturation
 * are not visible here - confirm r == max_val never reaches the cast. */
235 else if (r < min_val)
/* In-range case: rintf rounds according to the current rounding mode. */
238 s = (int32_t)rintf(r);
239 *outputVectorPtr++ = s;
247#ifndef INCLUDED_volk_32f_s32f_convert_32i_a_H
248#define INCLUDED_volk_32f_s32f_convert_32i_a_H
255#include <immintrin.h>
/*
 * AVX path using aligned loads/stores: multiplies each input float by
 * `scalar`, clamps the product to [min_val, max_val] and converts it to a
 * 32-bit signed integer, eight elements per iteration, followed by a scalar
 * loop for the leftover elements. _mm256_load_ps / _mm256_store_si256 require
 * 32-byte aligned addresses.
 * NOTE(review): the start of the signature (function name, outputVector and
 * scalar parameters) and several statements are not visible in this excerpt.
 */
258 const float* inputVector,
260 unsigned int num_points)
262 unsigned int number = 0;
/* Number of complete groups of 8 floats handled by the vector loop. */
264 const unsigned int eighthPoints = num_points / 8;
266 const float* inputVectorPtr = (
const float*)inputVector;
267 int32_t* outputVectorPtr = outputVector;
/* Clamp limits held as floats: INT_MIN and INT_MAX + 1. The uint32_t cast
 * makes the "+ 1" unsigned arithmetic, so it cannot overflow a signed int. */
269 float min_val = INT_MIN;
270 float max_val = (uint32_t)INT_MAX + 1;
/* Broadcast the scale factor and both limits to all 8 lanes. */
273 __m256 vScalar = _mm256_set1_ps(scalar);
275 __m256i intInputVal1;
276 __m256 vmin_val = _mm256_set1_ps(min_val);
277 __m256 vmax_val = _mm256_set1_ps(max_val);
/* Vector loop. NOTE(review): the input pointer increment is not visible in
 * this excerpt - confirm it advances by 8 per iteration. */
279 for (; number < eighthPoints; number++) {
280 inputVal1 = _mm256_load_ps(inputVectorPtr);
/* Scale, then clamp: min against the upper limit, max against the lower. */
283 inputVal1 = _mm256_max_ps(
284 _mm256_min_ps(_mm256_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
/* NOTE(review): a lane clamped to exactly max_val (2^31) is outside the
 * int32 range; the Intel docs say cvtps_epi32 returns 0x80000000 for such
 * inputs, so positive saturation may yield INT_MIN - confirm. */
285 intInputVal1 = _mm256_cvtps_epi32(inputVal1);
287 _mm256_store_si256((__m256i*)outputVectorPtr, intInputVal1);
288 outputVectorPtr += 8;
/* Scalar tail: restart at the first index not covered by the vector loop. */
291 number = eighthPoints * 8;
292 for (; number < num_points; number++) {
293 r = inputVector[number] * scalar;
/* NOTE(review): the upper-limit test and both clamp assignments are not
 * visible here. If r is set to max_val, the int32_t cast below is out of
 * range - confirm. */
296 else if (r < min_val)
/* rintf rounds according to the current floating-point rounding mode. */
298 outputVector[number] = (int32_t)rintf(r);
306#include <emmintrin.h>
/*
 * SSE2 path using aligned loads/stores: multiplies each input float by
 * `scalar`, clamps the product to [min_val, max_val] and converts it to a
 * 32-bit signed integer, four elements per iteration, followed by a scalar
 * loop for the leftover elements. _mm_load_ps / _mm_store_si128 require
 * 16-byte aligned addresses.
 * NOTE(review): the start of the signature (function name, outputVector and
 * scalar parameters) and several statements are not visible in this excerpt.
 */
309 const float* inputVector,
311 unsigned int num_points)
313 unsigned int number = 0;
/* Number of complete groups of 4 floats handled by the vector loop. */
315 const unsigned int quarterPoints = num_points / 4;
317 const float* inputVectorPtr = (
const float*)inputVector;
318 int32_t* outputVectorPtr = outputVector;
/* Clamp limits held as floats: INT_MIN and INT_MAX + 1. The uint32_t cast
 * makes the "+ 1" unsigned arithmetic, so it cannot overflow a signed int. */
320 float min_val = INT_MIN;
321 float max_val = (uint32_t)INT_MAX + 1;
/* Broadcast the scale factor and both limits to all 4 lanes. */
324 __m128 vScalar = _mm_set_ps1(scalar);
326 __m128i intInputVal1;
327 __m128 vmin_val = _mm_set_ps1(min_val);
328 __m128 vmax_val = _mm_set_ps1(max_val);
/* Vector loop. NOTE(review): the input pointer increment is not visible in
 * this excerpt - confirm it advances by 4 per iteration. */
330 for (; number < quarterPoints; number++) {
331 inputVal1 = _mm_load_ps(inputVectorPtr);
/* Scale, then clamp: min against the upper limit, max against the lower.
 * The left-hand side of this expression is not visible in this excerpt. */
335 _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
/* NOTE(review): a lane clamped to exactly max_val (2^31) is outside the
 * int32 range; the Intel docs say cvtps_epi32 returns 0x80000000 for such
 * inputs, so positive saturation may yield INT_MIN - confirm. */
336 intInputVal1 = _mm_cvtps_epi32(inputVal1);
338 _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1);
339 outputVectorPtr += 4;
/* Scalar tail: restart at the first index not covered by the vector loop. */
342 number = quarterPoints * 4;
343 for (; number < num_points; number++) {
344 r = inputVector[number] * scalar;
/* NOTE(review): the upper-limit test and both clamp assignments are not
 * visible here. If r is set to max_val, the int32_t cast below is out of
 * range - confirm. */
347 else if (r < min_val)
/* rintf rounds according to the current floating-point rounding mode. */
349 outputVector[number] = (int32_t)rintf(r);
357#include <xmmintrin.h>
/*
 * SSE path using aligned loads: the multiply and clamp are done on four
 * floats at a time, but the float-to-int conversion is done per element with
 * rintf after spilling the vector to a float buffer. A scalar loop handles
 * the leftover elements. _mm_load_ps requires a 16-byte aligned address.
 * NOTE(review): the start of the signature (function name, outputVector and
 * scalar parameters) and several statements are not visible in this excerpt.
 */
360 const float* inputVector,
362 unsigned int num_points)
364 unsigned int number = 0;
/* Number of complete groups of 4 floats handled by the vector loop. */
366 const unsigned int quarterPoints = num_points / 4;
368 const float* inputVectorPtr = (
const float*)inputVector;
369 int32_t* outputVectorPtr = outputVector;
/* Clamp limits held as floats: INT_MIN and INT_MAX + 1. The uint32_t cast
 * makes the "+ 1" unsigned arithmetic, so it cannot overflow a signed int. */
371 float min_val = INT_MIN;
372 float max_val = (uint32_t)INT_MAX + 1;
/* Broadcast the scale factor and both limits to all 4 lanes. */
375 __m128 vScalar = _mm_set_ps1(scalar);
377 __m128 vmin_val = _mm_set_ps1(min_val);
378 __m128 vmax_val = _mm_set_ps1(max_val);
/* Vector loop. NOTE(review): the input pointer increment is not visible in
 * this excerpt - confirm it advances by 4 per iteration. */
382 for (; number < quarterPoints; number++) {
383 ret = _mm_load_ps(inputVectorPtr);
/* Scale, then clamp: min against the upper limit, max against the lower. */
386 ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val);
/* _mm_store_ps is an aligned store; outputFloatBuffer's declaration is not
 * visible here - presumably a 16-byte aligned float[4], confirm. */
388 _mm_store_ps(outputFloatBuffer, ret);
/* Round each lane with rintf and write it out, advancing the output by 4.
 * NOTE(review): a lane clamped to max_val (2^31) is out of range for the
 * int32_t cast - confirm intended saturation behaviour. */
389 *outputVectorPtr++ = (int32_t)rintf(outputFloatBuffer[0]);
390 *outputVectorPtr++ = (int32_t)rintf(outputFloatBuffer[1]);
391 *outputVectorPtr++ = (int32_t)rintf(outputFloatBuffer[2]);
392 *outputVectorPtr++ = (int32_t)rintf(outputFloatBuffer[3]);
/* Scalar tail: restart at the first index not covered by the vector loop. */
395 number = quarterPoints * 4;
396 for (; number < num_points; number++) {
397 r = inputVector[number] * scalar;
/* NOTE(review): the upper-limit test and both clamp assignments are not
 * visible here - confirm how r is limited before the cast below. */
400 else if (r < min_val)
402 outputVector[number] = (int32_t)rintf(r);
/*
 * NEON path: multiplies each input float by `scalar`, clamps the product to
 * [min_val, max_val] and converts it to int32, four elements per iteration.
 * Rounding is emulated by adding +/-0.5 (by sign) before a truncating
 * conversion. A scalar loop handles the leftover elements.
 * NOTE(review): the start of the signature (function name, outputVector and
 * scalar parameters) and several statements are not visible in this excerpt.
 */
412 const float* inputVector,
414 unsigned int num_points)
416 unsigned int number = 0;
/* Number of complete groups of 4 floats handled by the vector loop. */
417 const unsigned int quarter_points = num_points / 4;
419 const float* inputPtr = inputVector;
420 int32_t* outputPtr = outputVector;
/* Limits as floats: INT_MIN and INT_MAX + 1. The uint32_t cast makes the
 * "+ 1" unsigned arithmetic, so it cannot overflow a signed int. */
422 const float min_val = (float)INT_MIN;
423 const float max_val = (float)((uint32_t)INT_MAX + 1);
/* Broadcast constants: scale factor, clamp limits, rounding offsets, zero. */
425 float32x4_t vScalar = vdupq_n_f32(scalar);
426 float32x4_t vmin_val = vdupq_n_f32(min_val);
427 float32x4_t vmax_val = vdupq_n_f32(max_val);
428 float32x4_t half = vdupq_n_f32(0.5f);
429 float32x4_t neg_half = vdupq_n_f32(-0.5f);
430 float32x4_t zero = vdupq_n_f32(0.0f);
/* Vector loop. NOTE(review): the pointer increments are not visible in this
 * excerpt - confirm both pointers advance by 4 per iteration, since the tail
 * loop below continues from inputPtr/outputPtr. */
432 for (; number < quarter_points; number++) {
433 float32x4_t inputVal = vld1q_f32(inputPtr);
434 inputVal = vmulq_f32(inputVal, vScalar);
435 inputVal = vmaxq_f32(vminq_f32(inputVal, vmax_val), vmin_val);
/* Lane mask of negative values; add -0.5 to those lanes and +0.5 to the
 * rest so the truncating vcvtq_s32_f32 rounds to nearest.
 * NOTE(review): ties round away from zero here, whereas rintf in the tail
 * follows the current rounding mode - results may differ on exact halves. */
437 uint32x4_t neg = vcltq_f32(inputVal, zero);
438 inputVal = vaddq_f32(inputVal, vbslq_f32(neg, neg_half, half));
439 int32x4_t intVal = vcvtq_s32_f32(inputVal);
440 vst1q_s32(outputPtr, intVal);
/* Scalar tail for the remaining num_points % 4 elements. */
445 number = quarter_points * 4;
446 for (; number < num_points; number++) {
447 float r = *inputPtr++ * scalar;
/* Saturate to INT_MAX / INT_MIN, otherwise round with rintf. The condition
 * guarding the INT_MAX store is not visible in this excerpt. */
449 *outputPtr++ = INT_MAX;
450 else if (r < min_val)
451 *outputPtr++ = INT_MIN;
453 *outputPtr++ = (int32_t)rintf(r);
/*
 * ARMv8 NEON path: multiplies each input float by `scalar`, clamps the
 * product to [min_val, max_val] and converts it to int32 with
 * vcvtnq_s32_f32 (round to nearest, ties to even), processing two 4-float
 * vectors (8 elements) per iteration. A scalar loop handles the leftovers.
 *
 * outputVector: destination for the converted int32 values.
 * inputVector:  source floats.
 * num_points:   number of elements to convert.
 * NOTE(review): the `scalar` parameter line and some statements are not
 * visible in this excerpt.
 */
461static inline void volk_32f_s32f_convert_32i_neonv8(int32_t* outputVector,
462 const float* inputVector,
464 unsigned int num_points)
466 unsigned int number = 0;
/* Number of complete groups of 8 floats handled by the vector loop. */
467 const unsigned int eighth_points = num_points / 8;
469 const float* inputPtr = inputVector;
470 int32_t* outputPtr = outputVector;
/* Limits as floats: INT_MIN and INT_MAX + 1. The uint32_t cast makes the
 * "+ 1" unsigned arithmetic, so it cannot overflow a signed int. */
472 const float min_val = (float)INT_MIN;
473 const float max_val = (float)((uint32_t)INT_MAX + 1);
/* Broadcast the scale factor and both limits to all 4 lanes. */
475 float32x4_t vScalar = vdupq_n_f32(scalar);
476 float32x4_t vmin_val = vdupq_n_f32(min_val);
477 float32x4_t vmax_val = vdupq_n_f32(max_val);
/* Vector loop. NOTE(review): the pointer increments are not visible in this
 * excerpt - confirm both pointers advance by 8 per iteration, since the tail
 * loop below continues from inputPtr/outputPtr. */
479 for (; number < eighth_points; number++) {
/* Load two adjacent 4-float vectors. */
480 float32x4_t inputVal0 = vld1q_f32(inputPtr);
481 float32x4_t inputVal1 = vld1q_f32(inputPtr + 4);
/* Scale, then clamp each vector to [min_val, max_val]. */
484 inputVal0 = vmulq_f32(inputVal0, vScalar);
485 inputVal1 = vmulq_f32(inputVal1, vScalar);
486 inputVal0 = vmaxq_f32(vminq_f32(inputVal0, vmax_val), vmin_val);
487 inputVal1 = vmaxq_f32(vminq_f32(inputVal1, vmax_val), vmin_val);
/* Convert with round-to-nearest-even. */
489 int32x4_t intVal0 = vcvtnq_s32_f32(inputVal0);
490 int32x4_t intVal1 = vcvtnq_s32_f32(inputVal1);
492 vst1q_s32(outputPtr, intVal0);
493 vst1q_s32(outputPtr + 4, intVal1);
/* Scalar tail for the remaining num_points % 8 elements. */
498 number = eighth_points * 8;
499 for (; number < num_points; number++) {
500 float r = *inputPtr++ * scalar;
/* Saturate to INT_MAX / INT_MIN, otherwise round with rintf. The condition
 * guarding the INT_MAX store is not visible in this excerpt. */
502 *outputPtr++ = INT_MAX;
503 else if (r < min_val)
504 *outputPtr++ = INT_MIN;
506 *outputPtr++ = (int32_t)rintf(r);
512#include <riscv_vector.h>
/*
 * RISC-V Vector path: processes the input in strips whose length is chosen
 * by vsetvl, multiplying each float by `scalar` and storing the result
 * converted to int32.
 *
 * outputVector: destination for the converted int32 values.
 * inputVector:  source floats.
 * num_points:   number of elements to convert.
 * NOTE(review): the `scalar` parameter line and the closing lines are not
 * visible in this excerpt. No explicit min/max clamp appears in the visible
 * lines; presumably the conversion instruction's own saturation is relied
 * on - confirm.
 */
514static inline void volk_32f_s32f_convert_32i_rvv(int32_t* outputVector,
515 const float* inputVector,
517 unsigned int num_points)
/* n counts the elements still to be processed. */
519 size_t n = num_points;
/* Each pass consumes vl elements and advances both pointers by vl. */
520 for (
size_t vl; n > 0; n -= vl, inputVector += vl, outputVector += vl) {
/* Ask for a strip length for 32-bit elements with LMUL = 8. */
521 vl = __riscv_vsetvl_e32m8(n);
522 vfloat32m8_t v = __riscv_vle32_v_f32m8(inputVector, vl);
523 v = __riscv_vfmul(v, scalar, vl);
/* Convert float -> signed int32 and store the strip. */
524 __riscv_vse32(outputVector, __riscv_vfcvt_x(v, vl), vl);