Vector Optimized Library of Kernels 3.3.0
Architecture-tuned implementations of math kernels
Loading...
Searching...
No Matches
volk_32f_binary_slicer_32i.h
Go to the documentation of this file.
1/* -*- c++ -*- */
2/*
3 * Copyright 2014 Free Software Foundation, Inc.
4 *
5 * This file is part of VOLK
6 *
7 * SPDX-License-Identifier: LGPL-3.0-or-later
8 */
9
56
57#ifndef INCLUDED_volk_32f_binary_slicer_32i_H
58#define INCLUDED_volk_32f_binary_slicer_32i_H
59
60
61#ifdef LV_HAVE_GENERIC
62
/*
 * Portable binary slicer.
 *
 * For every index i in [0, num_points), stores 1 in cVector[i] when
 * aVector[i] >= 0 and 0 otherwise. Nothing is read or written when
 * num_points is 0.
 */
static inline void volk_32f_binary_slicer_32i_generic(int* cVector,
                                                      const float* aVector,
                                                      unsigned int num_points)
{
    unsigned int i;

    for (i = 0; i < num_points; ++i) {
        /* Hard decision on the sign of the sample; -0.0f compares >= 0. */
        const int bit = (aVector[i] >= 0) ? 1 : 0;
        cVector[i] = bit;
    }
}
79#endif /* LV_HAVE_GENERIC */
80
81
82#ifdef LV_HAVE_GENERIC
83
/*
 * Portable binary slicer written without an explicit if/else.
 *
 * The result of the comparison (*in >= 0) is itself the integer 1 or 0 in C,
 * so it is stored directly into the output for each of the num_points samples.
 */
static inline void volk_32f_binary_slicer_32i_generic_branchless(int* cVector,
                                                                 const float* aVector,
                                                                 unsigned int num_points)
{
    const float* in = aVector;
    int* out = cVector;
    unsigned int remaining;

    for (remaining = num_points; remaining > 0; --remaining) {
        const float sample = *in++;
        *out++ = (sample >= 0);
    }
}
96#endif /* LV_HAVE_GENERIC */
97
98
99#ifdef LV_HAVE_SSE2
100#include <emmintrin.h>
101
/*
 * SSE2 binary slicer using aligned loads and stores.
 *
 * For every index i in [0, num_points), stores 1 in cVector[i] when
 * aVector[i] >= 0 and 0 otherwise. Four samples are handled per iteration
 * with _mm_load_ps / _mm_store_si128, which require 16-byte aligned
 * addresses; the remaining (num_points % 4) samples are handled by a scalar
 * loop.
 */
static inline void volk_32f_binary_slicer_32i_a_sse2(int* cVector,
                                                     const float* aVector,
                                                     unsigned int num_points)
{
    int* cPtr = cVector;
    const float* aPtr = aVector;
    unsigned int number = 0;

    const unsigned int quarter_points = num_points / 4;
    const __m128 zero_val = _mm_set1_ps(0.0f);

    for (number = 0; number < quarter_points; number++) {
        const __m128 a_val = _mm_load_ps(aPtr);

        /* Each 32-bit lane becomes all-ones where a_val >= 0, else all-zeros. */
        const __m128 res_f = _mm_cmpge_ps(a_val, zero_val);

        /*
         * Reinterpret the mask bits as integers instead of converting them:
         * an all-ones lane is a NaN bit pattern, so a float->int conversion
         * would raise the invalid-operation flag and only yield the wanted
         * top bit through the "integer indefinite" result.
         */
        const __m128i res_i = _mm_castps_si128(res_f);

        /* Move the top mask bit down to bit 0: 0xFFFFFFFF -> 1, 0 -> 0. */
        const __m128i binary_i = _mm_srli_epi32(res_i, 31);

        _mm_store_si128((__m128i*)cPtr, binary_i);

        cPtr += 4;
        aPtr += 4;
    }

    /* Scalar tail for the samples that do not fill a whole vector. */
    for (number = quarter_points * 4; number < num_points; number++) {
        *cPtr++ = (*aPtr++ >= 0) ? 1 : 0;
    }
}
137#endif /* LV_HAVE_SSE2 */
138
139
140#ifdef LV_HAVE_AVX
141#include <immintrin.h>
142
/*
 * AVX binary slicer using aligned loads and stores.
 *
 * For every index i in [0, num_points), stores 1 in cVector[i] when
 * aVector[i] >= 0 and 0 otherwise. Eight samples are handled per iteration
 * with _mm256_load_ps / _mm256_store_si256, which require 32-byte aligned
 * addresses; the remaining (num_points % 8) samples are handled by a scalar
 * loop.
 */
static inline void volk_32f_binary_slicer_32i_a_avx(int* cVector,
                                                    const float* aVector,
                                                    unsigned int num_points)
{
    const unsigned int eighth_points = num_points / 8;
    const __m256 zeros = _mm256_set1_ps(0.0f);
    const __m256 ones = _mm256_set1_ps(1.0f);
    unsigned int idx = 0;
    unsigned int block;

    for (block = 0; block < eighth_points; block++, idx += 8) {
        const __m256 samples = _mm256_load_ps(aVector + idx);

        /* All-ones lanes where the sample is >= 0, all-zeros elsewhere. */
        const __m256 ge_mask = _mm256_cmp_ps(samples, zeros, _CMP_GE_OS);

        /* Masking 1.0f leaves exactly 1.0f or 0.0f in each lane ... */
        const __m256 one_or_zero = _mm256_and_ps(ge_mask, ones);

        /* ... which converts to the integers 1 or 0. */
        _mm256_store_si256((__m256i*)(cVector + idx), _mm256_cvtps_epi32(one_or_zero));
    }

    /* Scalar tail; idx equals eighth_points * 8 here. */
    for (; idx < num_points; idx++) {
        cVector[idx] = (aVector[idx] >= 0) ? 1 : 0;
    }
}
179#endif /* LV_HAVE_AVX */
180
181
182#ifdef LV_HAVE_SSE2
183#include <emmintrin.h>
184
/*
 * SSE2 binary slicer using unaligned loads and stores.
 *
 * For every index i in [0, num_points), stores 1 in cVector[i] when
 * aVector[i] >= 0 and 0 otherwise. Four samples are handled per iteration
 * with _mm_loadu_ps / _mm_storeu_si128, so no particular buffer alignment is
 * needed; the remaining (num_points % 4) samples are handled by a scalar
 * loop.
 */
static inline void volk_32f_binary_slicer_32i_u_sse2(int* cVector,
                                                     const float* aVector,
                                                     unsigned int num_points)
{
    int* cPtr = cVector;
    const float* aPtr = aVector;
    unsigned int number = 0;

    const unsigned int quarter_points = num_points / 4;
    const __m128 zero_val = _mm_set1_ps(0.0f);

    for (number = 0; number < quarter_points; number++) {
        const __m128 a_val = _mm_loadu_ps(aPtr);

        /* Each 32-bit lane becomes all-ones where a_val >= 0, else all-zeros. */
        const __m128 res_f = _mm_cmpge_ps(a_val, zero_val);

        /*
         * Reinterpret the mask bits as integers instead of converting them:
         * an all-ones lane is a NaN bit pattern, so a float->int conversion
         * would raise the invalid-operation flag and only yield the wanted
         * top bit through the "integer indefinite" result.
         */
        const __m128i res_i = _mm_castps_si128(res_f);

        /* Move the top mask bit down to bit 0: 0xFFFFFFFF -> 1, 0 -> 0. */
        const __m128i binary_i = _mm_srli_epi32(res_i, 31);

        _mm_storeu_si128((__m128i*)cPtr, binary_i);

        cPtr += 4;
        aPtr += 4;
    }

    /* Scalar tail for the samples that do not fill a whole vector. */
    for (number = quarter_points * 4; number < num_points; number++) {
        *cPtr++ = (*aPtr++ >= 0) ? 1 : 0;
    }
}
220#endif /* LV_HAVE_SSE2 */
221
222
223#ifdef LV_HAVE_AVX
224#include <immintrin.h>
225
/*
 * AVX binary slicer using unaligned loads and stores.
 *
 * For every index i in [0, num_points), stores 1 in cVector[i] when
 * aVector[i] >= 0 and 0 otherwise. Eight samples are handled per iteration
 * with _mm256_loadu_ps / _mm256_storeu_si256, so no particular buffer
 * alignment is needed; the remaining (num_points % 8) samples are handled by
 * a scalar loop.
 */
static inline void volk_32f_binary_slicer_32i_u_avx(int* cVector,
                                                    const float* aVector,
                                                    unsigned int num_points)
{
    const unsigned int eighth_points = num_points / 8;
    const unsigned int vector_end = eighth_points * 8;
    const __m256 zeros = _mm256_set1_ps(0.0f);
    const __m256 ones = _mm256_set1_ps(1.0f);
    unsigned int idx;

    for (idx = 0; idx < vector_end; idx += 8) {
        const __m256 samples = _mm256_loadu_ps(&aVector[idx]);

        /* All-ones lanes where the sample is >= 0, all-zeros elsewhere. */
        const __m256 ge_mask = _mm256_cmp_ps(samples, zeros, _CMP_GE_OS);

        /* Keep 1.0f in the selected lanes, 0.0f in the others, then convert. */
        const __m256i bits = _mm256_cvtps_epi32(_mm256_and_ps(ge_mask, ones));

        _mm256_storeu_si256((__m256i*)&cVector[idx], bits);
    }

    /* Scalar tail for the samples that do not fill a whole vector. */
    for (idx = vector_end; idx < num_points; idx++) {
        cVector[idx] = (aVector[idx] >= 0) ? 1 : 0;
    }
}
262#endif /* LV_HAVE_AVX */
263
264#ifdef LV_HAVE_NEON
265#include <arm_neon.h>
266
/*
 * NEON binary slicer.
 *
 * For every index i in [0, num_points), stores 1 in cVector[i] when
 * aVector[i] >= 0 and 0 otherwise. Four samples are handled per iteration;
 * the remaining (num_points % 4) samples are handled by a scalar loop.
 */
static inline void volk_32f_binary_slicer_32i_neon(int* cVector,
                                                   const float* aVector,
                                                   unsigned int num_points)
{
    const unsigned int num_quads = num_points / 4;
    const float32x4_t zeros = vdupq_n_f32(0.0f);
    const float* in = aVector;
    int* out = cVector;
    unsigned int quad;
    unsigned int left;

    for (quad = 0; quad < num_quads; quad++) {
        /* Lanes are all-ones where the sample is >= 0, all-zeros elsewhere. */
        const uint32x4_t ge_mask = vcgeq_f32(vld1q_f32(in), zeros);

        /* Logical shift keeps only the top mask bit: 0xFFFFFFFF -> 1, 0 -> 0. */
        const uint32x4_t bits = vshrq_n_u32(ge_mask, 31);

        vst1q_s32(out, vreinterpretq_s32_u32(bits));
        in += 4;
        out += 4;
    }

    /* Scalar tail for the samples that do not fill a whole vector. */
    for (left = num_points - num_quads * 4; left > 0; left--) {
        *out++ = (*in++ >= 0) ? 1 : 0;
    }
}
291#endif /* LV_HAVE_NEON */
292
293#ifdef LV_HAVE_NEONV8
294#include <arm_neon.h>
295
/*
 * NEON (ARMv8) binary slicer, processing two 4-lane vectors per iteration.
 *
 * For every index i in [0, num_points), stores 1 in cVector[i] when
 * aVector[i] >= 0 and 0 otherwise. Eight samples are handled per iteration;
 * the remaining (num_points % 8) samples are handled by a scalar loop.
 */
static inline void volk_32f_binary_slicer_32i_neonv8(int* cVector,
                                                     const float* aVector,
                                                     unsigned int num_points)
{
    const unsigned int num_octets = num_points / 8;
    const float32x4_t zeros = vdupq_n_f32(0.0f);
    const float* in = aVector;
    int* out = cVector;
    unsigned int block;
    unsigned int left;

    for (block = 0; block < num_octets; block++) {
        const float32x4_t lo = vld1q_f32(in);
        const float32x4_t hi = vld1q_f32(in + 4);
        /* NOTE(review): __VOLK_PREFETCH is defined outside this file;
         * presumably a prefetch hint for the next block - confirm. */
        __VOLK_PREFETCH(in + 8);

        /* All-ones lanes where the sample is >= 0, reduced to the top bit. */
        const uint32x4_t lo_bits = vshrq_n_u32(vcgeq_f32(lo, zeros), 31);
        const uint32x4_t hi_bits = vshrq_n_u32(vcgeq_f32(hi, zeros), 31);

        vst1q_s32(out, vreinterpretq_s32_u32(lo_bits));
        vst1q_s32(out + 4, vreinterpretq_s32_u32(hi_bits));
        in += 8;
        out += 8;
    }

    /* Scalar tail for the samples that do not fill a whole block. */
    for (left = num_points - num_octets * 8; left > 0; left--) {
        *out++ = (*in++ >= 0) ? 1 : 0;
    }
}
327#endif /* LV_HAVE_NEONV8 */
328
329#ifdef LV_HAVE_RVV
330#include <riscv_vector.h>
331
/*
 * RISC-V Vector binary slicer.
 *
 * For every index i in [0, num_points), stores 1 in cVector[i] when
 * aVector[i] >= 0 and 0 otherwise.
 *
 * The decision is made with a floating-point compare rather than by
 * inspecting the IEEE sign bit, so that -0.0f yields 1 and NaN yields 0,
 * matching the scalar comparison (x >= 0).
 */
static inline void volk_32f_binary_slicer_32i_rvv(int* cVector,
                                                  const float* aVector,
                                                  unsigned int num_points)
{
    size_t n = num_points;
    /* Loop-invariant vector of zeros used as the "false" side of the merge. */
    const vint32m8_t zeros = __riscv_vmv_v_x_i32m8(0, __riscv_vsetvlmax_e32m8());

    for (size_t vl; n > 0; n -= vl, aVector += vl, cVector += vl) {
        /* vl is the number of elements handled in this pass. */
        vl = __riscv_vsetvl_e32m8(n);
        vfloat32m8_t v = __riscv_vle32_v_f32m8(aVector, vl);
        /* Mask bit set for each element with v >= 0.0f. */
        vbool4_t ge = __riscv_vmfge_vf_f32m8_b4(v, 0.0f, vl);
        /* Select 1 where the mask is set, keep 0 elsewhere. */
        vint32m8_t bits = __riscv_vmerge_vxm_i32m8(zeros, 1, ge, vl);
        __riscv_vse32_v_i32m8((int32_t*)cVector, bits, vl);
    }
}
344#endif /*LV_HAVE_RVV*/
345
346#endif /* INCLUDED_volk_32f_binary_slicer_32i_H */