60#ifndef INCLUDED_volk_32f_s32f_convert_8i_u_H
61#define INCLUDED_volk_32f_s32f_convert_8i_u_H
/* Fragment of a scalar float -> int8_t conversion with saturation.
 * Only part of the definition survives in this excerpt: the signature, the
 * opening test for the upper bound and the final `else` sit on listing lines
 * that are not shown.
 * NOTE(review): confirm against the complete header before relying on this. */
67 const float min_val = INT8_MIN;
68 const float max_val = INT8_MAX;
/* Saturate to INT8_MAX (presumably guarded by an `in > max_val` test -- the
 * condition itself is not visible here). */
70 *out = (int8_t)(max_val);
71 }
else if (in < min_val) {
/* Input below the representable range saturates to INT8_MIN. */
72 *out = (int8_t)(min_val);
/* Remaining case: round with rintf() and narrow to int8_t. */
74 *out = (int8_t)(rintf(in));
/* Fragment of a plain C (non-SIMD) kernel: walks num_points input floats and
 * multiplies each by `scalar`.
 * NOTE(review): the function name, the output and `scalar` parameters, and the
 * statement that consumes `r` are on listing lines missing from this excerpt. */
81 const float* inputVector,
83 unsigned int num_points)
85 const float* inputVectorPtr = inputVector;
87 for (
unsigned int number = 0; number < num_points; number++) {
/* Read the next sample, advance the read pointer by one float, then scale. */
88 const float r = *inputVectorPtr++ * scalar;
/* AVX2 kernel, unaligned variant (uses loadu/storeu): scales floats by `scalar`,
 * clamps to [INT8_MIN, INT8_MAX], converts to 32-bit integers and packs them
 * down to int8_t, 32 outputs per main-loop iteration.
 *
 * outputVector: destination int8_t buffer.
 * inputVector:  source float buffer.
 * num_points:   number of samples to convert.
 * NOTE(review): the `scalar` parameter line and the braces are not visible in
 * this excerpt (listing lines are missing). */
99static inline void volk_32f_s32f_convert_8i_u_avx2(int8_t* outputVector,
100 const float* inputVector,
102 unsigned int num_points)
/* Number of full 32-sample groups handled by the vector loop. */
104 const unsigned int thirtysecondPoints = num_points / 32;
106 const float* inputVectorPtr = (
const float*)inputVector;
107 int8_t* outputVectorPtr = outputVector;
/* Saturation bounds, broadcast to all eight float lanes. */
109 const float min_val = INT8_MIN;
110 const float max_val = INT8_MAX;
111 const __m256 vmin_val = _mm256_set1_ps(min_val);
112 const __m256 vmax_val = _mm256_set1_ps(max_val);
114 const __m256 vScalar = _mm256_set1_ps(scalar);
116 for (
unsigned int number = 0; number < thirtysecondPoints; number++) {
/* Four unaligned 8-float loads.
 * NOTE(review): as shown, all four loads use the same pointer value; the
 * pointer advances presumably sit on the missing listing lines between them
 * -- confirm against the full file. */
117 __m256 inputVal1 = _mm256_loadu_ps(inputVectorPtr);
119 __m256 inputVal2 = _mm256_loadu_ps(inputVectorPtr);
121 __m256 inputVal3 = _mm256_loadu_ps(inputVectorPtr);
123 __m256 inputVal4 = _mm256_loadu_ps(inputVectorPtr);
/* Scale, then clamp: min against the upper bound first, max against the lower. */
126 inputVal1 = _mm256_max_ps(
127 _mm256_min_ps(_mm256_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
128 inputVal2 = _mm256_max_ps(
129 _mm256_min_ps(_mm256_mul_ps(inputVal2, vScalar), vmax_val), vmin_val);
130 inputVal3 = _mm256_max_ps(
131 _mm256_min_ps(_mm256_mul_ps(inputVal3, vScalar), vmax_val), vmin_val);
132 inputVal4 = _mm256_max_ps(
133 _mm256_min_ps(_mm256_mul_ps(inputVal4, vScalar), vmax_val), vmin_val);
/* Float -> int32 conversion; rounding follows the current MXCSR mode. */
135 __m256i intInputVal1 = _mm256_cvtps_epi32(inputVal1);
136 __m256i intInputVal2 = _mm256_cvtps_epi32(inputVal2);
137 __m256i intInputVal3 = _mm256_cvtps_epi32(inputVal3);
138 __m256i intInputVal4 = _mm256_cvtps_epi32(inputVal4);
/* The 256-bit pack instructions work per 128-bit lane, which interleaves the
 * two sources. The permute with 0b11011000 (64-bit element order 0,2,1,3)
 * puts the packed values back into sequential order after each pack. */
140 intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2);
141 intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0b11011000);
142 intInputVal3 = _mm256_packs_epi32(intInputVal3, intInputVal4);
143 intInputVal3 = _mm256_permute4x64_epi64(intInputVal3, 0b11011000);
/* int16 -> int8 with signed saturation, followed by the same lane fix-up. */
145 intInputVal1 = _mm256_packs_epi16(intInputVal1, intInputVal3);
146 const __m256i intInputVal = _mm256_permute4x64_epi64(intInputVal1, 0b11011000);
/* Unaligned 32-byte store of the 32 converted samples. */
148 _mm256_storeu_si256((__m256i*)outputVectorPtr, intInputVal);
149 outputVectorPtr += 32;
/* Scalar tail: the remaining num_points % 32 samples, indexed from the base
 * pointer. The statement that converts `r` is not visible in this excerpt. */
152 for (
unsigned int number = thirtysecondPoints * 32; number < num_points; number++) {
153 float r = inputVector[number] * scalar;
160#ifdef LV_HAVE_AVX512F
161#include <immintrin.h>
/* AVX-512F kernel, unaligned variant (uses loadu/storeu): scales floats by
 * `scalar`, clamps to [INT8_MIN, INT8_MAX], converts to int32 and narrows to
 * int8_t, 32 outputs per main-loop iteration (two 16-lane halves).
 *
 * outputVector: destination int8_t buffer.
 * inputVector:  source float buffer.
 * num_points:   number of samples to convert.
 * NOTE(review): the `scalar` parameter line, the declaration of `r` and the
 * braces are not visible in this excerpt (listing lines are missing). */
163static inline void volk_32f_s32f_convert_8i_u_avx512(int8_t* outputVector,
164 const float* inputVector,
166 unsigned int num_points)
168 unsigned int number = 0;
/* Number of full 32-sample groups handled by the vector loop. */
170 const unsigned int thirtysecondPoints = num_points / 32;
172 const float* inputVectorPtr = (
const float*)inputVector;
173 int8_t* outputVectorPtr = outputVector;
/* Saturation bounds as floats; broadcast into vectors below. */
175 float min_val = INT8_MIN;
176 float max_val = INT8_MAX;
179 __m512 vScalar = _mm512_set1_ps(scalar);
180 __m512 inputVal1, inputVal2;
181 __m512i intInputVal1, intInputVal2;
182 __m512 vmin_val = _mm512_set1_ps(min_val);
183 __m512 vmax_val = _mm512_set1_ps(max_val);
184 __m128i packed_result;
186 for (; number < thirtysecondPoints; number++) {
/* Two unaligned 16-float loads, advancing the read pointer after each. */
187 inputVal1 = _mm512_loadu_ps(inputVectorPtr);
188 inputVectorPtr += 16;
189 inputVal2 = _mm512_loadu_ps(inputVectorPtr);
190 inputVectorPtr += 16;
/* Scale, then clamp: min against the upper bound first, max against the lower. */
192 inputVal1 = _mm512_max_ps(
193 _mm512_min_ps(_mm512_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
194 inputVal2 = _mm512_max_ps(
195 _mm512_min_ps(_mm512_mul_ps(inputVal2, vScalar), vmax_val), vmin_val);
/* Float -> int32 conversion; rounding follows the current MXCSR mode. */
197 intInputVal1 = _mm512_cvtps_epi32(inputVal1);
198 intInputVal2 = _mm512_cvtps_epi32(inputVal2);
/* Narrow 16 x int32 to 16 x int8 with signed saturation; the result fits a
 * 128-bit register and is written with an unaligned store. */
201 packed_result = _mm512_cvtsepi32_epi8(intInputVal1);
202 _mm_storeu_si128((__m128i*)outputVectorPtr, packed_result);
203 outputVectorPtr += 16;
205 packed_result = _mm512_cvtsepi32_epi8(intInputVal2);
206 _mm_storeu_si128((__m128i*)outputVectorPtr, packed_result);
207 outputVectorPtr += 16;
/* Scalar tail: restart the index at the first unprocessed sample. The
 * statement that converts `r` is not visible in this excerpt. */
210 number = thirtysecondPoints * 32;
211 for (; number < num_points; number++) {
212 r = inputVector[number] * scalar;
221#include <emmintrin.h>
/* SSE2 kernel body, unaligned variant (uses loadu/storeu): scales floats by
 * `scalar`, clamps to [INT8_MIN, INT8_MAX], converts to int32 and packs down to
 * int8_t, 16 outputs per main-loop iteration.
 * NOTE(review): the function name line, the `scalar` parameter line and the
 * braces are on listing lines missing from this excerpt. */
224 const float* inputVector,
226 unsigned int num_points)
/* Number of full 16-sample groups handled by the vector loop. */
228 const unsigned int sixteenthPoints = num_points / 16;
230 const float* inputVectorPtr = (
const float*)inputVector;
231 int8_t* outputVectorPtr = outputVector;
/* Saturation bounds, broadcast to all four float lanes. */
233 const float min_val = INT8_MIN;
234 const float max_val = INT8_MAX;
235 const __m128 vmin_val = _mm_set_ps1(min_val);
236 const __m128 vmax_val = _mm_set_ps1(max_val);
238 const __m128 vScalar = _mm_set_ps1(scalar);
240 for (
unsigned int number = 0; number < sixteenthPoints; number++) {
/* Four unaligned 4-float loads.
 * NOTE(review): as shown, all four loads use the same pointer value; the
 * pointer advances presumably sit on the missing listing lines between them
 * -- confirm against the full file. */
241 __m128 inputVal1 = _mm_loadu_ps(inputVectorPtr);
243 __m128 inputVal2 = _mm_loadu_ps(inputVectorPtr);
245 __m128 inputVal3 = _mm_loadu_ps(inputVectorPtr);
247 __m128 inputVal4 = _mm_loadu_ps(inputVectorPtr);
/* Scale, then clamp to the int8 range. The assignment targets of these four
 * expressions are on listing lines not shown here. */
251 _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
253 _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal2, vScalar), vmax_val), vmin_val);
255 _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal3, vScalar), vmax_val), vmin_val);
257 _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal4, vScalar), vmax_val), vmin_val);
/* Float -> int32 conversion; rounding follows the current MXCSR mode. */
259 __m128i intInputVal1 = _mm_cvtps_epi32(inputVal1);
260 __m128i intInputVal2 = _mm_cvtps_epi32(inputVal2);
261 __m128i intInputVal3 = _mm_cvtps_epi32(inputVal3);
262 __m128i intInputVal4 = _mm_cvtps_epi32(inputVal4);
/* int32 -> int16 with signed saturation (two registers into one each). */
264 intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
265 intInputVal3 = _mm_packs_epi32(intInputVal3, intInputVal4);
/* int16 -> int8 with signed saturation: 16 results in one register. */
267 intInputVal1 = _mm_packs_epi16(intInputVal1, intInputVal3);
269 _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1);
270 outputVectorPtr += 16;
/* Scalar tail: the remaining num_points % 16 samples, indexed from the base
 * pointer. The statement that converts `r` is not visible in this excerpt. */
273 for (
unsigned int number = sixteenthPoints * 16; number < num_points; number++) {
274 const float r = inputVector[number] * scalar;
283#include <xmmintrin.h>
/* SSE kernel body, unaligned-load variant: scales and clamps four floats at a
 * time in a vector register, spills them to a float buffer, then rounds and
 * narrows each lane to int8_t in scalar code.
 * NOTE(review): the function name line, the `scalar` parameter line, the
 * declaration of `outputFloatBuffer` and the braces are on listing lines
 * missing from this excerpt. */
286 const float* inputVector,
288 unsigned int num_points)
/* Number of full 4-sample groups handled by the vector loop. */
290 const unsigned int quarterPoints = num_points / 4;
292 const float* inputVectorPtr = (
const float*)inputVector;
293 int8_t* outputVectorPtr = outputVector;
/* Saturation bounds, broadcast to all four float lanes. */
295 const float min_val = INT8_MIN;
296 const float max_val = INT8_MAX;
297 const __m128 vmin_val = _mm_set_ps1(min_val);
298 const __m128 vmax_val = _mm_set_ps1(max_val);
300 const __m128 vScalar = _mm_set_ps1(scalar);
304 for (
unsigned int number = 0; number < quarterPoints; number++) {
/* Unaligned 4-float load. The matching pointer advance is not visible here. */
305 __m128 ret = _mm_loadu_ps(inputVectorPtr);
/* Scale, then clamp: min against the upper bound first, max against the lower. */
308 ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val);
/* _mm_store_ps is the aligned store, so outputFloatBuffer must be 16-byte
 * aligned -- its declaration is not visible; confirm. */
310 _mm_store_ps(outputFloatBuffer, ret);
/* Values are already clamped, so the rounded result fits the int8_t cast. */
311 for (
size_t inner_loop = 0; inner_loop < 4; inner_loop++) {
312 *outputVectorPtr++ = (int8_t)(rintf(outputFloatBuffer[inner_loop]));
/* Scalar tail: the remaining num_points % 4 samples, indexed from the base
 * pointer. The statement that converts `r` is not visible in this excerpt. */
316 for (
unsigned int number = quarterPoints * 4; number < num_points; number++) {
317 const float r = inputVector[number] * scalar;
326#ifndef INCLUDED_volk_32f_s32f_convert_8i_a_H
327#define INCLUDED_volk_32f_s32f_convert_8i_a_H
332#include <immintrin.h>
/* AVX2 kernel, aligned variant: scales floats by `scalar`, clamps to
 * [INT8_MIN, INT8_MAX], converts to 32-bit integers and packs them down to
 * int8_t, 32 outputs per main-loop iteration. It uses _mm256_load_ps and
 * _mm256_store_si256, which require 32-byte aligned addresses.
 *
 * outputVector: destination int8_t buffer.
 * inputVector:  source float buffer.
 * num_points:   number of samples to convert.
 * NOTE(review): the `scalar` parameter line and the braces are not visible in
 * this excerpt (listing lines are missing). */
334static inline void volk_32f_s32f_convert_8i_a_avx2(int8_t* outputVector,
335 const float* inputVector,
337 unsigned int num_points)
/* Number of full 32-sample groups handled by the vector loop. */
339 const unsigned int thirtysecondPoints = num_points / 32;
341 const float* inputVectorPtr = (
const float*)inputVector;
342 int8_t* outputVectorPtr = outputVector;
/* Saturation bounds, broadcast to all eight float lanes. */
344 const float min_val = INT8_MIN;
345 const float max_val = INT8_MAX;
346 const __m256 vmin_val = _mm256_set1_ps(min_val);
347 const __m256 vmax_val = _mm256_set1_ps(max_val);
349 const __m256 vScalar = _mm256_set1_ps(scalar);
351 for (
unsigned int number = 0; number < thirtysecondPoints; number++) {
/* Four aligned 8-float loads.
 * NOTE(review): as shown, all four loads use the same pointer value; the
 * pointer advances presumably sit on the missing listing lines between them
 * -- confirm against the full file. */
352 __m256 inputVal1 = _mm256_load_ps(inputVectorPtr);
354 __m256 inputVal2 = _mm256_load_ps(inputVectorPtr);
356 __m256 inputVal3 = _mm256_load_ps(inputVectorPtr);
358 __m256 inputVal4 = _mm256_load_ps(inputVectorPtr);
/* Scale, then clamp: min against the upper bound first, max against the lower. */
361 inputVal1 = _mm256_max_ps(
362 _mm256_min_ps(_mm256_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
363 inputVal2 = _mm256_max_ps(
364 _mm256_min_ps(_mm256_mul_ps(inputVal2, vScalar), vmax_val), vmin_val);
365 inputVal3 = _mm256_max_ps(
366 _mm256_min_ps(_mm256_mul_ps(inputVal3, vScalar), vmax_val), vmin_val);
367 inputVal4 = _mm256_max_ps(
368 _mm256_min_ps(_mm256_mul_ps(inputVal4, vScalar), vmax_val), vmin_val);
/* Float -> int32 conversion; rounding follows the current MXCSR mode. */
370 __m256i intInputVal1 = _mm256_cvtps_epi32(inputVal1);
371 __m256i intInputVal2 = _mm256_cvtps_epi32(inputVal2);
372 __m256i intInputVal3 = _mm256_cvtps_epi32(inputVal3);
373 __m256i intInputVal4 = _mm256_cvtps_epi32(inputVal4);
/* The 256-bit pack instructions work per 128-bit lane, which interleaves the
 * two sources. The permute with 0b11011000 (64-bit element order 0,2,1,3)
 * puts the packed values back into sequential order after each pack. */
375 intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2);
376 intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0b11011000);
377 intInputVal3 = _mm256_packs_epi32(intInputVal3, intInputVal4);
378 intInputVal3 = _mm256_permute4x64_epi64(intInputVal3, 0b11011000);
/* int16 -> int8 with signed saturation, followed by the same lane fix-up. */
380 intInputVal1 = _mm256_packs_epi16(intInputVal1, intInputVal3);
381 __m256i intInputVal = _mm256_permute4x64_epi64(intInputVal1, 0b11011000);
/* Aligned 32-byte store of the 32 converted samples. */
383 _mm256_store_si256((__m256i*)outputVectorPtr, intInputVal);
384 outputVectorPtr += 32;
/* Scalar tail: the remaining num_points % 32 samples, indexed from the base
 * pointer. The statement that converts `r` is not visible in this excerpt. */
387 for (
unsigned int number = thirtysecondPoints * 32; number < num_points; number++) {
388 const float r = inputVector[number] * scalar;
395#ifdef LV_HAVE_AVX512F
396#include <immintrin.h>
/* AVX-512F kernel, aligned variant: scales floats by `scalar`, clamps to
 * [INT8_MIN, INT8_MAX], converts to int32 and narrows to int8_t, 32 outputs per
 * main-loop iteration (two 16-lane halves). It uses _mm512_load_ps (64-byte
 * aligned source) and _mm_store_si128 (16-byte aligned destination).
 *
 * outputVector: destination int8_t buffer.
 * inputVector:  source float buffer.
 * num_points:   number of samples to convert.
 * NOTE(review): the `scalar` parameter line, the declaration of `r` and the
 * braces are not visible in this excerpt (listing lines are missing). */
398static inline void volk_32f_s32f_convert_8i_a_avx512(int8_t* outputVector,
399 const float* inputVector,
401 unsigned int num_points)
403 unsigned int number = 0;
/* Number of full 32-sample groups handled by the vector loop. */
405 const unsigned int thirtysecondPoints = num_points / 32;
407 const float* inputVectorPtr = (
const float*)inputVector;
408 int8_t* outputVectorPtr = outputVector;
/* Saturation bounds as floats; broadcast into vectors below. */
410 float min_val = INT8_MIN;
411 float max_val = INT8_MAX;
414 __m512 vScalar = _mm512_set1_ps(scalar);
415 __m512 inputVal1, inputVal2;
416 __m512i intInputVal1, intInputVal2;
417 __m512 vmin_val = _mm512_set1_ps(min_val);
418 __m512 vmax_val = _mm512_set1_ps(max_val);
419 __m128i packed_result;
421 for (; number < thirtysecondPoints; number++) {
/* Two aligned 16-float loads, advancing the read pointer after each. */
422 inputVal1 = _mm512_load_ps(inputVectorPtr);
423 inputVectorPtr += 16;
424 inputVal2 = _mm512_load_ps(inputVectorPtr);
425 inputVectorPtr += 16;
/* Scale, then clamp: min against the upper bound first, max against the lower. */
427 inputVal1 = _mm512_max_ps(
428 _mm512_min_ps(_mm512_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
429 inputVal2 = _mm512_max_ps(
430 _mm512_min_ps(_mm512_mul_ps(inputVal2, vScalar), vmax_val), vmin_val);
/* Float -> int32 conversion; rounding follows the current MXCSR mode. */
432 intInputVal1 = _mm512_cvtps_epi32(inputVal1);
433 intInputVal2 = _mm512_cvtps_epi32(inputVal2);
/* Narrow 16 x int32 to 16 x int8 with signed saturation; the result fits a
 * 128-bit register and is written with an aligned store. */
436 packed_result = _mm512_cvtsepi32_epi8(intInputVal1);
437 _mm_store_si128((__m128i*)outputVectorPtr, packed_result);
438 outputVectorPtr += 16;
440 packed_result = _mm512_cvtsepi32_epi8(intInputVal2);
441 _mm_store_si128((__m128i*)outputVectorPtr, packed_result);
442 outputVectorPtr += 16;
/* Scalar tail: restart the index at the first unprocessed sample. The
 * statement that converts `r` is not visible in this excerpt. */
445 number = thirtysecondPoints * 32;
446 for (; number < num_points; number++) {
447 r = inputVector[number] * scalar;
456#include <emmintrin.h>
/* SSE2 kernel body, aligned variant: scales floats by `scalar`, clamps to
 * [INT8_MIN, INT8_MAX], converts to int32 and packs down to int8_t, 16 outputs
 * per main-loop iteration. It uses _mm_load_ps and _mm_store_si128, which
 * require 16-byte aligned addresses.
 * NOTE(review): the function name line, the `scalar` parameter line and the
 * braces are on listing lines missing from this excerpt. */
459 const float* inputVector,
461 unsigned int num_points)
/* Number of full 16-sample groups handled by the vector loop. */
463 const unsigned int sixteenthPoints = num_points / 16;
465 const float* inputVectorPtr = (
const float*)inputVector;
466 int8_t* outputVectorPtr = outputVector;
/* Saturation bounds, broadcast to all four float lanes. */
468 const float min_val = INT8_MIN;
469 const float max_val = INT8_MAX;
470 const __m128 vmin_val = _mm_set_ps1(min_val);
471 const __m128 vmax_val = _mm_set_ps1(max_val);
473 const __m128 vScalar = _mm_set_ps1(scalar);
475 for (
unsigned int number = 0; number < sixteenthPoints; number++) {
/* Four aligned 4-float loads.
 * NOTE(review): as shown, all four loads use the same pointer value; the
 * pointer advances presumably sit on the missing listing lines between them
 * -- confirm against the full file. */
476 __m128 inputVal1 = _mm_load_ps(inputVectorPtr);
478 __m128 inputVal2 = _mm_load_ps(inputVectorPtr);
480 __m128 inputVal3 = _mm_load_ps(inputVectorPtr);
482 __m128 inputVal4 = _mm_load_ps(inputVectorPtr);
/* Scale, then clamp to the int8 range. The assignment targets of these four
 * expressions are on listing lines not shown here. */
486 _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
488 _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal2, vScalar), vmax_val), vmin_val);
490 _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal3, vScalar), vmax_val), vmin_val);
492 _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal4, vScalar), vmax_val), vmin_val);
/* Float -> int32 conversion; rounding follows the current MXCSR mode. */
494 __m128i intInputVal1 = _mm_cvtps_epi32(inputVal1);
495 __m128i intInputVal2 = _mm_cvtps_epi32(inputVal2);
496 __m128i intInputVal3 = _mm_cvtps_epi32(inputVal3);
497 __m128i intInputVal4 = _mm_cvtps_epi32(inputVal4);
/* int32 -> int16 with signed saturation (two registers into one each). */
499 intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
500 intInputVal3 = _mm_packs_epi32(intInputVal3, intInputVal4);
/* int16 -> int8 with signed saturation: 16 results in one register. */
502 intInputVal1 = _mm_packs_epi16(intInputVal1, intInputVal3);
504 _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1);
505 outputVectorPtr += 16;
/* Scalar tail: the remaining num_points % 16 samples, indexed from the base
 * pointer. The statement that converts `r` is not visible in this excerpt. */
508 for (
unsigned int number = sixteenthPoints * 16; number < num_points; number++) {
509 const float r = inputVector[number] * scalar;
517#include <xmmintrin.h>
/* SSE kernel body, aligned-load variant: scales and clamps four floats at a
 * time in a vector register, spills them to a float buffer, then rounds and
 * narrows each lane to int8_t in scalar code.
 * NOTE(review): the function name line, the `scalar` parameter line, the
 * declaration of `outputFloatBuffer` and the braces are on listing lines
 * missing from this excerpt. */
520 const float* inputVector,
522 unsigned int num_points)
/* Number of full 4-sample groups handled by the vector loop. */
524 const unsigned int quarterPoints = num_points / 4;
526 const float* inputVectorPtr = (
const float*)inputVector;
527 int8_t* outputVectorPtr = outputVector;
/* Saturation bounds, broadcast to all four float lanes. */
529 const float min_val = INT8_MIN;
530 const float max_val = INT8_MAX;
531 const __m128 vmin_val = _mm_set_ps1(min_val);
532 const __m128 vmax_val = _mm_set_ps1(max_val);
534 const __m128 vScalar = _mm_set_ps1(scalar);
538 for (
unsigned int number = 0; number < quarterPoints; number++) {
/* Aligned 4-float load (16-byte aligned source required). The matching
 * pointer advance is not visible here. */
539 __m128 ret = _mm_load_ps(inputVectorPtr);
/* Scale, then clamp: min against the upper bound first, max against the lower. */
542 ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val);
/* _mm_store_ps is the aligned store, so outputFloatBuffer must be 16-byte
 * aligned -- its declaration is not visible; confirm. */
544 _mm_store_ps(outputFloatBuffer, ret);
/* Values are already clamped, so the rounded result fits the int8_t cast. */
545 for (
size_t inner_loop = 0; inner_loop < 4; inner_loop++) {
546 *outputVectorPtr++ = (int8_t)(rintf(outputFloatBuffer[inner_loop]));
/* Scalar tail: the remaining num_points % 4 samples, indexed from the base
 * pointer. The statement that converts `r` is not visible in this excerpt. */
550 for (
unsigned int number = quarterPoints * 4; number < num_points; number++) {
551 const float r = inputVector[number] * scalar;
/* NEON kernel body: scales floats by `scalar`, clamps to [INT8_MIN, INT8_MAX],
 * rounds by adding +/-0.5 and truncating, then narrows int32 -> int16 -> int8
 * with saturation, 16 outputs per main-loop iteration.
 * NOTE(review): the function name line, the `scalar` parameter line, the
 * declarations of ret0..ret3 and the braces are on listing lines missing from
 * this excerpt. */
563 const float* inputVector,
565 unsigned int num_points)
567 unsigned int number = 0;
/* Number of full 16-sample groups handled by the vector loop. */
568 const unsigned int sixteenthPoints = num_points / 16;
570 const float* inputVectorPtr = inputVector;
571 int8_t* outputVectorPtr = outputVector;
573 const float min_val = INT8_MIN;
574 const float max_val = INT8_MAX;
/* Broadcast constants: scale factor, clamp bounds, and the rounding offsets. */
576 float32x4_t vScalar = vdupq_n_f32(scalar);
577 float32x4_t vmin_val = vdupq_n_f32(min_val);
578 float32x4_t vmax_val = vdupq_n_f32(max_val);
579 float32x4_t half = vdupq_n_f32(0.5f);
580 float32x4_t neg_half = vdupq_n_f32(-0.5f);
581 float32x4_t zero = vdupq_n_f32(0.0f);
583 for (; number < sixteenthPoints; number++) {
/* Load 16 consecutive floats as four 4-lane vectors, then advance. */
584 float32x4_t inputVal0 = vld1q_f32(inputVectorPtr);
585 float32x4_t inputVal1 = vld1q_f32(inputVectorPtr + 4);
586 float32x4_t inputVal2 = vld1q_f32(inputVectorPtr + 8);
587 float32x4_t inputVal3 = vld1q_f32(inputVectorPtr + 12);
588 inputVectorPtr += 16;
/* Scale, then clamp to the int8 range. The left-hand sides (presumably the
 * declarations of ret0..ret3) are on listing lines not shown here. */
592 vmaxq_f32(vminq_f32(vmulq_f32(inputVal0, vScalar), vmax_val), vmin_val);
594 vmaxq_f32(vminq_f32(vmulq_f32(inputVal1, vScalar), vmax_val), vmin_val);
596 vmaxq_f32(vminq_f32(vmulq_f32(inputVal2, vScalar), vmax_val), vmin_val);
598 vmaxq_f32(vminq_f32(vmulq_f32(inputVal3, vScalar), vmax_val), vmin_val);
/* Manual rounding: lanes below zero get -0.5 added, all others +0.5, so the
 * truncating conversion below rounds exact ties away from zero.
 * NOTE(review): rintf() in the default rounding mode rounds ties to even, so
 * results can differ from the rintf-based paths on exact .5 inputs. */
601 uint32x4_t neg0 = vcltq_f32(ret0, zero);
602 uint32x4_t neg1 = vcltq_f32(ret1, zero);
603 uint32x4_t neg2 = vcltq_f32(ret2, zero);
604 uint32x4_t neg3 = vcltq_f32(ret3, zero);
605 ret0 = vaddq_f32(ret0, vbslq_f32(neg0, neg_half, half));
606 ret1 = vaddq_f32(ret1, vbslq_f32(neg1, neg_half, half));
607 ret2 = vaddq_f32(ret2, vbslq_f32(neg2, neg_half, half));
608 ret3 = vaddq_f32(ret3, vbslq_f32(neg3, neg_half, half));
/* vcvtq_s32_f32 converts with rounding toward zero (truncation). */
611 int32x4_t intVal0 = vcvtq_s32_f32(ret0);
612 int32x4_t intVal1 = vcvtq_s32_f32(ret1);
613 int32x4_t intVal2 = vcvtq_s32_f32(ret2);
614 int32x4_t intVal3 = vcvtq_s32_f32(ret3);
/* Saturating narrow int32 -> int16, then pair halves into 8-lane vectors. */
617 int16x4_t narrow16_0 = vqmovn_s32(intVal0);
618 int16x4_t narrow16_1 = vqmovn_s32(intVal1);
619 int16x4_t narrow16_2 = vqmovn_s32(intVal2);
620 int16x4_t narrow16_3 = vqmovn_s32(intVal3);
621 int16x8_t wide16_0 = vcombine_s16(narrow16_0, narrow16_1);
622 int16x8_t wide16_1 = vcombine_s16(narrow16_2, narrow16_3);
/* Saturating narrow int16 -> int8 and combine into one 16-byte result. */
625 int8x8_t narrow8_0 = vqmovn_s16(wide16_0);
626 int8x8_t narrow8_1 = vqmovn_s16(wide16_1);
627 int8x16_t result = vcombine_s8(narrow8_0, narrow8_1);
629 vst1q_s8(outputVectorPtr, result);
630 outputVectorPtr += 16;
/* Scalar tail: restart the index at the first unprocessed sample. The
 * statement that converts `r` is not visible in this excerpt. */
633 number = sixteenthPoints * 16;
634 for (; number < num_points; number++) {
635 float r = inputVector[number] * scalar;
/* ARMv8 NEON kernel: scales floats by `scalar`, clamps to [INT8_MIN, INT8_MAX],
 * converts with vcvtnq_s32_f32 (round to nearest, ties to even) and narrows
 * int32 -> int16 -> int8 with saturation, 32 outputs per main-loop iteration.
 *
 * outputVector: destination int8_t buffer.
 * inputVector:  source float buffer.
 * num_points:   number of samples to convert.
 * NOTE(review): the `scalar` parameter line, the declarations of ret0..ret7
 * and the braces are not visible in this excerpt (listing lines are missing). */
645static inline void volk_32f_s32f_convert_8i_neonv8(int8_t* outputVector,
646 const float* inputVector,
648 unsigned int num_points)
650 unsigned int number = 0;
/* Number of full 32-sample groups handled by the vector loop. */
651 const unsigned int thirtysecondPoints = num_points / 32;
653 const float* inputVectorPtr = inputVector;
654 int8_t* outputVectorPtr = outputVector;
656 const float min_val = INT8_MIN;
657 const float max_val = INT8_MAX;
/* Broadcast the scale factor and the clamp bounds to all four lanes. */
659 float32x4_t vScalar = vdupq_n_f32(scalar);
660 float32x4_t vmin_val = vdupq_n_f32(min_val);
661 float32x4_t vmax_val = vdupq_n_f32(max_val);
663 for (; number < thirtysecondPoints; number++) {
/* Load 32 consecutive floats as eight 4-lane vectors, then advance. */
664 float32x4_t inputVal0 = vld1q_f32(inputVectorPtr);
665 float32x4_t inputVal1 = vld1q_f32(inputVectorPtr + 4);
666 float32x4_t inputVal2 = vld1q_f32(inputVectorPtr + 8);
667 float32x4_t inputVal3 = vld1q_f32(inputVectorPtr + 12);
668 float32x4_t inputVal4 = vld1q_f32(inputVectorPtr + 16);
669 float32x4_t inputVal5 = vld1q_f32(inputVectorPtr + 20);
670 float32x4_t inputVal6 = vld1q_f32(inputVectorPtr + 24);
671 float32x4_t inputVal7 = vld1q_f32(inputVectorPtr + 28);
673 inputVectorPtr += 32;
/* Scale, then clamp to the int8 range. The left-hand sides (presumably the
 * declarations of ret0..ret7) are on listing lines not shown here. */
677 vmaxq_f32(vminq_f32(vmulq_f32(inputVal0, vScalar), vmax_val), vmin_val);
679 vmaxq_f32(vminq_f32(vmulq_f32(inputVal1, vScalar), vmax_val), vmin_val);
681 vmaxq_f32(vminq_f32(vmulq_f32(inputVal2, vScalar), vmax_val), vmin_val);
683 vmaxq_f32(vminq_f32(vmulq_f32(inputVal3, vScalar), vmax_val), vmin_val);
685 vmaxq_f32(vminq_f32(vmulq_f32(inputVal4, vScalar), vmax_val), vmin_val);
687 vmaxq_f32(vminq_f32(vmulq_f32(inputVal5, vScalar), vmax_val), vmin_val);
689 vmaxq_f32(vminq_f32(vmulq_f32(inputVal6, vScalar), vmax_val), vmin_val);
691 vmaxq_f32(vminq_f32(vmulq_f32(inputVal7, vScalar), vmax_val), vmin_val);
/* Float -> int32 with round-to-nearest, ties-to-even. */
694 int32x4_t intVal0 = vcvtnq_s32_f32(ret0);
695 int32x4_t intVal1 = vcvtnq_s32_f32(ret1);
696 int32x4_t intVal2 = vcvtnq_s32_f32(ret2);
697 int32x4_t intVal3 = vcvtnq_s32_f32(ret3);
698 int32x4_t intVal4 = vcvtnq_s32_f32(ret4);
699 int32x4_t intVal5 = vcvtnq_s32_f32(ret5);
700 int32x4_t intVal6 = vcvtnq_s32_f32(ret6);
701 int32x4_t intVal7 = vcvtnq_s32_f32(ret7);
/* Saturating narrow int32 -> int16. */
704 int16x4_t narrow16_0 = vqmovn_s32(intVal0);
705 int16x4_t narrow16_1 = vqmovn_s32(intVal1);
706 int16x4_t narrow16_2 = vqmovn_s32(intVal2);
707 int16x4_t narrow16_3 = vqmovn_s32(intVal3);
708 int16x4_t narrow16_4 = vqmovn_s32(intVal4);
709 int16x4_t narrow16_5 = vqmovn_s32(intVal5);
710 int16x4_t narrow16_6 = vqmovn_s32(intVal6);
711 int16x4_t narrow16_7 = vqmovn_s32(intVal7);
/* Pair adjacent 4-lane halves into 8-lane vectors, preserving sample order. */
713 int16x8_t wide16_0 = vcombine_s16(narrow16_0, narrow16_1);
714 int16x8_t wide16_1 = vcombine_s16(narrow16_2, narrow16_3);
715 int16x8_t wide16_2 = vcombine_s16(narrow16_4, narrow16_5);
716 int16x8_t wide16_3 = vcombine_s16(narrow16_6, narrow16_7);
/* Saturating narrow int16 -> int8. */
719 int8x8_t narrow8_0 = vqmovn_s16(wide16_0);
720 int8x8_t narrow8_1 = vqmovn_s16(wide16_1);
721 int8x8_t narrow8_2 = vqmovn_s16(wide16_2);
722 int8x8_t narrow8_3 = vqmovn_s16(wide16_3);
724 int8x16_t result0 = vcombine_s8(narrow8_0, narrow8_1);
725 int8x16_t result1 = vcombine_s8(narrow8_2, narrow8_3);
/* Two 16-byte stores cover the 32 converted samples. */
727 vst1q_s8(outputVectorPtr, result0);
728 vst1q_s8(outputVectorPtr + 16, result1);
729 outputVectorPtr += 32;
/* Scalar tail: restart the index at the first unprocessed sample. The
 * statement that converts `r` is not visible in this excerpt. */
732 number = thirtysecondPoints * 32;
733 for (; number < num_points; number++) {
734 float r = inputVector[number] * scalar;
742#include <riscv_vector.h>
/* RISC-V Vector kernel: converts floats to int8_t in chunks whose length is
 * chosen by the hardware each iteration. Each chunk is multiplied by `scalar`,
 * narrowed float32 -> int16, then narrowed int16 -> int8 and stored.
 *
 * outputVector: destination int8_t buffer (advanced by vl per iteration).
 * inputVector:  source float buffer (advanced by vl per iteration).
 * num_points:   number of samples to convert.
 * NOTE(review): the `scalar` parameter line and the closing braces are not
 * visible in this excerpt. No explicit min/max clamp appears here; range
 * limiting relies on the narrowing operations -- confirm their saturation
 * semantics against the intrinsics specification. */
744static inline void volk_32f_s32f_convert_8i_rvv(int8_t* outputVector,
745 const float* inputVector,
747 unsigned int num_points)
749 size_t n = num_points;
/* Each pass consumes vl elements; both pointers and the remaining count are
 * updated in the loop's increment expression. */
750 for (
size_t vl; n > 0; n -= vl, inputVector += vl, outputVector += vl) {
/* Request a vector length for 32-bit elements with LMUL=8, capped by n. */
751 vl = __riscv_vsetvl_e32m8(n);
752 vfloat32m8_t v = __riscv_vle32_v_f32m8(inputVector, vl);
/* Scale, then narrowing float -> int16 conversion (result uses LMUL=4). */
753 vint16m4_t vi = __riscv_vfncvt_x(__riscv_vfmul(v, scalar, vl), vl);
/* Narrowing clip int16 -> int8 with a shift amount of 0, then store vl bytes. */
754 __riscv_vse8(outputVector, __riscv_vnclip(vi, 0, 0, vl), vl);