Vector Optimized Library of Kernels 3.3.0
Architecture-tuned implementations of math kernels
Loading...
Searching...
No Matches
volk_32f_s32f_x2_clamp_32f.h
Go to the documentation of this file.
1/* -*- c++ -*- */
2/*
3 * Copyright 2023 Magnus Lundmark <magnuslundmark@gmail.com>
4 *
5 * This file is part of VOLK
6 *
7 * SPDX-License-Identifier: LGPL-3.0-or-later
8 */
9
43
44#ifndef INCLUDED_volk_32fc_s32f_x2_clamp_32f_a_H
45#define INCLUDED_volk_32fc_s32f_x2_clamp_32f_a_H
46
47#ifdef LV_HAVE_GENERIC
/*
 * Portable reference kernel: clamp every element of `in` to the closed
 * interval [min, max] and store the result in `out`.
 *
 * out        - destination buffer; num_points floats are written
 * in         - source buffer; num_points floats are read
 * min        - lower bound
 * max        - upper bound
 * num_points - number of elements to process (0 leaves `out` untouched)
 *
 * The upper bound is tested first, so a value greater than `max` becomes
 * `max` even when min > max. A value for which both comparisons are false
 * (for example NaN) is copied through unchanged.
 */
static inline void volk_32f_s32f_x2_clamp_32f_generic(float* out,
                                                      const float* in,
                                                      const float min,
                                                      const float max,
                                                      unsigned int num_points)
{
    for (unsigned int i = 0; i < num_points; i++) {
        /* Read once before the store so that out == in keeps working. */
        const float value = in[i];
        out[i] = (value > max) ? max : ((value < min) ? min : value);
    }
}
67#endif /* LV_HAVE_GENERIC */
68
69#if LV_HAVE_AVX2
70#include <immintrin.h>
/*
 * AVX2 kernel, aligned variant: clamp `in` to [min, max] eight floats at a
 * time and hand the remaining (num_points % 8) elements to the generic
 * kernel.
 *
 * out        - destination buffer; accessed with _mm256_store_ps, so it must
 *              satisfy that intrinsic's 32-byte alignment requirement
 * in         - source buffer; accessed with _mm256_load_ps (same requirement)
 * min        - lower bound
 * max        - upper bound
 * num_points - number of elements to process
 *
 * Both comparisons are ordered less-than, so NaN lanes match neither mask
 * and pass through unchanged. The min blend is applied last: a lane that
 * matches both masks (only possible when min > max) ends up as `min`.
 */
static inline void volk_32f_s32f_x2_clamp_32f_a_avx2(float* out,
                                                     const float* in,
                                                     const float min,
                                                     const float max,
                                                     unsigned int num_points)
{
    const __m256 vmin = _mm256_set1_ps(min);
    const __m256 vmax = _mm256_set1_ps(max);
    const unsigned int eighth_points = num_points / 8;

    for (unsigned int block = 0; block < eighth_points; block++) {
        __m256 res = _mm256_load_ps(in);
        const __m256 above_max = _mm256_cmp_ps(vmax, res, _CMP_LT_OS);
        const __m256 below_min = _mm256_cmp_ps(res, vmin, _CMP_LT_OS);
        res = _mm256_blendv_ps(res, vmax, above_max);
        res = _mm256_blendv_ps(res, vmin, below_min);
        _mm256_store_ps(out, res);
        in += 8;
        out += 8;
    }

    /* `in` and `out` already point past the vectorised part. */
    volk_32f_s32f_x2_clamp_32f_generic(out, in, min, max, num_points % 8);
}
96#endif /* LV_HAVE_AVX2 */
97
98#if LV_HAVE_SSE4_1
99#include <immintrin.h>
/*
 * SSE4.1 kernel, aligned variant: clamp `in` to [min, max] four floats at a
 * time and hand the remaining (num_points % 4) elements to the generic
 * kernel.
 *
 * out        - destination buffer; accessed with _mm_store_ps, so it must
 *              satisfy that intrinsic's 16-byte alignment requirement
 * in         - source buffer; accessed with _mm_load_ps (same requirement)
 * min        - lower bound
 * max        - upper bound
 * num_points - number of elements to process
 *
 * NaN lanes match neither less-than mask and pass through unchanged. The
 * min blend is applied last: a lane that matches both masks (only possible
 * when min > max) ends up as `min`.
 */
static inline void volk_32f_s32f_x2_clamp_32f_a_sse4_1(float* out,
                                                       const float* in,
                                                       const float min,
                                                       const float max,
                                                       unsigned int num_points)
{
    const __m128 vmin = _mm_set1_ps(min);
    const __m128 vmax = _mm_set1_ps(max);
    const unsigned int quarter_points = num_points / 4;

    for (unsigned int block = 0; block < quarter_points; block++) {
        __m128 res = _mm_load_ps(in);
        const __m128 above_max = _mm_cmplt_ps(vmax, res);
        const __m128 below_min = _mm_cmplt_ps(res, vmin);
        res = _mm_blendv_ps(res, vmax, above_max);
        res = _mm_blendv_ps(res, vmin, below_min);
        _mm_store_ps(out, res);
        in += 4;
        out += 4;
    }

    /* `in` and `out` already point past the vectorised part. */
    volk_32f_s32f_x2_clamp_32f_generic(out, in, min, max, num_points % 4);
}
125#endif /* LV_HAVE_SSE4_1 */
126
127#endif /* INCLUDED_volk_32fc_s32f_x2_clamp_32f_a_H */
128
129#ifndef INCLUDED_volk_32fc_s32f_x2_clamp_32f_u_H
130#define INCLUDED_volk_32fc_s32f_x2_clamp_32f_u_H
131
132#if LV_HAVE_AVX2
133#include <immintrin.h>
/*
 * AVX2 kernel, unaligned variant: clamp `in` to [min, max] eight floats at a
 * time using unaligned loads/stores, then hand the remaining
 * (num_points % 8) elements to the generic kernel.
 *
 * out        - destination buffer; num_points floats are written
 * in         - source buffer; num_points floats are read
 * min        - lower bound
 * max        - upper bound
 * num_points - number of elements to process
 *
 * Both comparisons are ordered less-than, so NaN lanes match neither mask
 * and pass through unchanged. The min blend is applied last: a lane that
 * matches both masks (only possible when min > max) ends up as `min`.
 */
static inline void volk_32f_s32f_x2_clamp_32f_u_avx2(float* out,
                                                     const float* in,
                                                     const float min,
                                                     const float max,
                                                     unsigned int num_points)
{
    const __m256 vmin = _mm256_set1_ps(min);
    const __m256 vmax = _mm256_set1_ps(max);
    const unsigned int eighth_points = num_points / 8;

    for (unsigned int block = 0; block < eighth_points; block++) {
        __m256 res = _mm256_loadu_ps(in);
        const __m256 above_max = _mm256_cmp_ps(vmax, res, _CMP_LT_OS);
        const __m256 below_min = _mm256_cmp_ps(res, vmin, _CMP_LT_OS);
        res = _mm256_blendv_ps(res, vmax, above_max);
        res = _mm256_blendv_ps(res, vmin, below_min);
        _mm256_storeu_ps(out, res);
        in += 8;
        out += 8;
    }

    /* `in` and `out` already point past the vectorised part. */
    volk_32f_s32f_x2_clamp_32f_generic(out, in, min, max, num_points % 8);
}
159#endif /* LV_HAVE_AVX2 */
160
161#if LV_HAVE_SSE4_1
162#include <immintrin.h>
/*
 * SSE4.1 kernel, unaligned variant: clamp `in` to [min, max] four floats at
 * a time using unaligned loads/stores, then hand the remaining
 * (num_points % 4) elements to the generic kernel.
 *
 * out        - destination buffer; num_points floats are written
 * in         - source buffer; num_points floats are read
 * min        - lower bound
 * max        - upper bound
 * num_points - number of elements to process
 *
 * NaN lanes match neither less-than mask and pass through unchanged. The
 * min blend is applied last: a lane that matches both masks (only possible
 * when min > max) ends up as `min`.
 */
static inline void volk_32f_s32f_x2_clamp_32f_u_sse4_1(float* out,
                                                       const float* in,
                                                       const float min,
                                                       const float max,
                                                       unsigned int num_points)
{
    const __m128 vmin = _mm_set1_ps(min);
    const __m128 vmax = _mm_set1_ps(max);
    const unsigned int quarter_points = num_points / 4;

    for (unsigned int block = 0; block < quarter_points; block++) {
        __m128 res = _mm_loadu_ps(in);
        const __m128 above_max = _mm_cmplt_ps(vmax, res);
        const __m128 below_min = _mm_cmplt_ps(res, vmin);
        res = _mm_blendv_ps(res, vmax, above_max);
        res = _mm_blendv_ps(res, vmin, below_min);
        _mm_storeu_ps(out, res);
        in += 4;
        out += 4;
    }

    /* `in` and `out` already point past the vectorised part. */
    volk_32f_s32f_x2_clamp_32f_generic(out, in, min, max, num_points % 4);
}
188#endif /* LV_HAVE_SSE4_1 */
189
190#ifdef LV_HAVE_NEON
191#include <arm_neon.h>
192
/*
 * NEON kernel: clamp `in` to [min, max] four floats at a time with
 * vmaxq_f32/vminq_f32, then finish the remaining (num_points % 4) elements
 * with a scalar loop.
 *
 * out        - destination buffer; num_points floats are written
 * in         - source buffer; num_points floats are read
 * min        - lower bound
 * max        - upper bound
 * num_points - number of elements to process
 *
 * The vector body applies the lower bound first and the upper bound second;
 * the scalar tail tests `min` before `max`.
 */
static inline void volk_32f_s32f_x2_clamp_32f_neon(float* out,
                                                   const float* in,
                                                   const float min,
                                                   const float max,
                                                   unsigned int num_points)
{
    const float32x4_t vmin = vdupq_n_f32(min);
    const float32x4_t vmax = vdupq_n_f32(max);
    const unsigned int quarter_points = num_points / 4;

    for (unsigned int block = 0; block < quarter_points; block++) {
        float32x4_t val = vld1q_f32(in);
        val = vmaxq_f32(val, vmin);
        val = vminq_f32(val, vmax);
        vst1q_f32(out, val);
        in += 4;
        out += 4;
    }

    /* Scalar tail for the elements that do not fill a whole vector. */
    for (unsigned int i = quarter_points * 4; i < num_points; i++) {
        float val = *in++;
        if (val < min) {
            val = min;
        } else if (val > max) {
            val = max;
        }
        *out++ = val;
    }
}
224#endif /* LV_HAVE_NEON */
225
226#ifdef LV_HAVE_NEONV8
227#include <arm_neon.h>
228
/*
 * NEONv8 kernel: clamp `in` to [min, max] eight floats per iteration (two
 * 4-lane vectors), issuing a prefetch hint for the next eight inputs, then
 * finish the remaining (num_points % 8) elements with a scalar loop.
 *
 * out        - destination buffer; num_points floats are written
 * in         - source buffer; num_points floats are read
 * min        - lower bound
 * max        - upper bound
 * num_points - number of elements to process
 *
 * The vector body applies the lower bound first and the upper bound second;
 * the scalar tail tests `min` before `max`.
 */
static inline void volk_32f_s32f_x2_clamp_32f_neonv8(float* out,
                                                     const float* in,
                                                     const float min,
                                                     const float max,
                                                     unsigned int num_points)
{
    const float32x4_t vmin = vdupq_n_f32(min);
    const float32x4_t vmax = vdupq_n_f32(max);
    const unsigned int eighth_points = num_points / 8;

    for (unsigned int block = 0; block < eighth_points; block++) {
        float32x4_t val0 = vld1q_f32(in);
        float32x4_t val1 = vld1q_f32(in + 4);
        __VOLK_PREFETCH(in + 8);

        val0 = vmaxq_f32(val0, vmin);
        val1 = vmaxq_f32(val1, vmin);
        val0 = vminq_f32(val0, vmax);
        val1 = vminq_f32(val1, vmax);

        vst1q_f32(out, val0);
        vst1q_f32(out + 4, val1);
        in += 8;
        out += 8;
    }

    /* Scalar tail for the elements that do not fill a whole iteration. */
    for (unsigned int i = eighth_points * 8; i < num_points; i++) {
        float val = *in++;
        if (val < min) {
            val = min;
        } else if (val > max) {
            val = max;
        }
        *out++ = val;
    }
}
267#endif /* LV_HAVE_NEONV8 */
268
269#ifdef LV_HAVE_RVV
270#include <riscv_vector.h>
271
/*
 * RISC-V Vector kernel: clamp `in` to [min, max] using LMUL=8 vectors. Each
 * iteration asks the hardware how many elements it will take (vl), so no
 * separate scalar tail is needed.
 *
 * out        - destination buffer; num_points floats are written
 * in         - source buffer; num_points floats are read
 * min        - lower bound
 * max        - upper bound
 * num_points - number of elements to process
 *
 * NOTE(review): RISC-V vfmax/vfmin are specified to return the non-NaN
 * operand, so NaN inputs probably come out as a bound here rather than
 * passing through as in the generic kernel - confirm if that matters.
 */
static inline void volk_32f_s32f_x2_clamp_32f_rvv(float* out,
                                                  const float* in,
                                                  const float min,
                                                  const float max,
                                                  unsigned int num_points)
{
    const size_t vlmax = __riscv_vsetvlmax_e32m8();
    vfloat32m8_t vmin = __riscv_vfmv_v_f_f32m8(min, vlmax);
    vfloat32m8_t vmax = __riscv_vfmv_v_f_f32m8(max, vlmax);

    size_t remaining = num_points;
    while (remaining > 0) {
        const size_t vl = __riscv_vsetvl_e32m8(remaining);
        vfloat32m8_t v = __riscv_vle32_v_f32m8(in, vl);
        /* Lower bound first, then upper bound. */
        v = __riscv_vfmin(__riscv_vfmax(v, vmin, vl), vmax, vl);
        __riscv_vse32(out, v, vl);
        remaining -= vl;
        in += vl;
        out += vl;
    }
}
288#endif /*LV_HAVE_RVV*/
289
290#endif /* INCLUDED_volk_32fc_s32f_x2_clamp_32f_u_H */