Vector Optimized Library of Kernels 3.3.0
Architecture-tuned implementations of math kernels
Loading...
Searching...
No Matches
volk_32i_x2_and_32i.h
Go to the documentation of this file.
1/* -*- c++ -*- */
2/*
3 * Copyright 2012, 2014 Free Software Foundation, Inc.
4 *
5 * This file is part of VOLK
6 *
7 * SPDX-License-Identifier: LGPL-3.0-or-later
8 */
9
67
68#ifndef INCLUDED_volk_32i_x2_and_32i_a_H
69#define INCLUDED_volk_32i_x2_and_32i_a_H
70
71#include <inttypes.h>
72#include <stdio.h>
73
#ifdef LV_HAVE_AVX512F
#include <immintrin.h>

/*
 * Element-wise bitwise AND of two int32 vectors (AVX-512F, aligned access):
 *   cVector[i] = aVector[i] & bVector[i]   for 0 <= i < num_points
 *
 * cVector    - output buffer, at least num_points elements
 * aVector    - first input buffer
 * bVector    - second input buffer
 * num_points - number of int32 elements to process
 *
 * The vector loop uses _mm512_load_si512 / _mm512_store_si512, which require
 * 64-byte aligned addresses, so all three buffers must be 64-byte aligned
 * whenever num_points >= 16.
 */
static inline void volk_32i_x2_and_32i_a_avx512f(int32_t* cVector,
                                                 const int32_t* aVector,
                                                 const int32_t* bVector,
                                                 unsigned int num_points)
{
    const unsigned int sixteenthPoints = num_points / 16;

    int32_t* cPtr = cVector;
    const int32_t* aPtr = aVector;
    const int32_t* bPtr = bVector;

    /* Main loop: one 512-bit register holds 16 int32 lanes. */
    for (unsigned int number = 0; number < sixteenthPoints; number++) {
        const __m512i aVal = _mm512_load_si512(aPtr);
        const __m512i bVal = _mm512_load_si512(bPtr);

        _mm512_store_si512(cPtr, _mm512_and_si512(aVal, bVal));

        aPtr += 16;
        bPtr += 16;
        cPtr += 16;
    }

    /* Scalar tail: the num_points % 16 elements the vector loop did not reach. */
    for (unsigned int number = sixteenthPoints * 16; number < num_points; number++) {
        cVector[number] = aVector[number] & bVector[number];
    }
}
#endif /* LV_HAVE_AVX512F */
110
#ifdef LV_HAVE_AVX2
#include <immintrin.h>

/*
 * Element-wise bitwise AND of two int32 vectors (AVX2, aligned access):
 *   cVector[i] = aVector[i] & bVector[i]   for 0 <= i < num_points
 *
 * cVector    - output buffer, at least num_points elements
 * aVector    - first input buffer
 * bVector    - second input buffer
 * num_points - number of int32 elements to process
 *
 * The vector loop uses _mm256_load_si256 / _mm256_store_si256, which require
 * 32-byte aligned addresses, so all three buffers must be 32-byte aligned
 * whenever num_points >= 8.
 */
static inline void volk_32i_x2_and_32i_a_avx2(int32_t* cVector,
                                              const int32_t* aVector,
                                              const int32_t* bVector,
                                              unsigned int num_points)
{
    const unsigned int oneEightPoints = num_points / 8;

    int32_t* cPtr = cVector;
    const int32_t* aPtr = aVector;
    const int32_t* bPtr = bVector;

    /* Main loop: one 256-bit register holds 8 int32 lanes. */
    for (unsigned int number = 0; number < oneEightPoints; number++) {
        /* Inputs are read-only, so keep const in the pointer casts. */
        const __m256i aVal = _mm256_load_si256((const __m256i*)aPtr);
        const __m256i bVal = _mm256_load_si256((const __m256i*)bPtr);

        _mm256_store_si256((__m256i*)cPtr, _mm256_and_si256(aVal, bVal));

        aPtr += 8;
        bPtr += 8;
        cPtr += 8;
    }

    /* Scalar tail: the num_points % 8 elements the vector loop did not reach. */
    for (unsigned int number = oneEightPoints * 8; number < num_points; number++) {
        cVector[number] = aVector[number] & bVector[number];
    }
}
#endif /* LV_HAVE_AVX2 */
148
149
150#ifdef LV_HAVE_SSE
151#include <xmmintrin.h>
152
/*
 * Element-wise bitwise AND of two int32 vectors (SSE, aligned access):
 *   cVector[i] = aVector[i] & bVector[i]   for 0 <= i < num_points
 *
 * cVector    - output buffer, at least num_points elements
 * aVector    - first input buffer
 * bVector    - second input buffer
 * num_points - number of int32 elements to process
 *
 * The data is moved through __m128 (float) registers only as a 128-bit
 * container: _mm_and_ps is a pure bitwise AND, so the integer bit patterns
 * come out unchanged. _mm_load_ps / _mm_store_ps require 16-byte aligned
 * addresses, so all three buffers must be 16-byte aligned whenever
 * num_points >= 4.
 */
static inline void volk_32i_x2_and_32i_a_sse(int32_t* cVector,
                                             const int32_t* aVector,
                                             const int32_t* bVector,
                                             unsigned int num_points)
{
    const unsigned int quarterPoints = num_points / 4;

    /* Reinterpret as float lanes for the SSE load/store intrinsics; the
     * inputs keep their const qualifier. */
    float* cPtr = (float*)cVector;
    const float* aPtr = (const float*)aVector;
    const float* bPtr = (const float*)bVector;

    /* Main loop: one 128-bit register holds 4 32-bit lanes. */
    for (unsigned int number = 0; number < quarterPoints; number++) {
        const __m128 aVal = _mm_load_ps(aPtr);
        const __m128 bVal = _mm_load_ps(bPtr);

        _mm_store_ps(cPtr, _mm_and_ps(aVal, bVal));

        aPtr += 4;
        bPtr += 4;
        cPtr += 4;
    }

    /* Scalar tail: the num_points % 4 elements the vector loop did not reach. */
    for (unsigned int number = quarterPoints * 4; number < num_points; number++) {
        cVector[number] = aVector[number] & bVector[number];
    }
}
185#endif /* LV_HAVE_SSE */
186
187
#ifdef LV_HAVE_NEON
#include <arm_neon.h>

/*
 * Element-wise bitwise AND of two int32 vectors (ARM NEON):
 *   cVector[i] = aVector[i] & bVector[i]   for 0 <= i < num_points
 *
 * cVector    - output buffer, at least num_points elements
 * aVector    - first input buffer
 * bVector    - second input buffer
 * num_points - number of int32 elements to process
 */
static inline void volk_32i_x2_and_32i_neon(int32_t* cVector,
                                            const int32_t* aVector,
                                            const int32_t* bVector,
                                            unsigned int num_points)
{
    const unsigned int quarter_points = num_points / 4;

    int32_t* cPtr = cVector;
    const int32_t* aPtr = aVector;
    const int32_t* bPtr = bVector;

    /* Main loop: one 128-bit Q register holds 4 int32 lanes. */
    for (unsigned int number = 0; number < quarter_points; number++) {
        const int32x4_t a_val = vld1q_s32(aPtr);
        const int32x4_t b_val = vld1q_s32(bPtr);

        vst1q_s32(cPtr, vandq_s32(a_val, b_val));

        aPtr += 4;
        bPtr += 4;
        cPtr += 4;
    }

    /* Scalar tail: the pointers already sit on the first unprocessed element. */
    for (unsigned int number = quarter_points * 4; number < num_points; number++) {
        *cPtr++ = (*aPtr++) & (*bPtr++);
    }
}
#endif /* LV_HAVE_NEON */
219
#ifdef LV_HAVE_NEONV8
#include <arm_neon.h>

/*
 * Element-wise bitwise AND of two int32 vectors (ARMv8 NEON, 2x unrolled):
 *   cVector[i] = aVector[i] & bVector[i]   for 0 <= i < num_points
 *
 * cVector    - output buffer, at least num_points elements
 * aVector    - first input buffer
 * bVector    - second input buffer
 * num_points - number of int32 elements to process
 *
 * NOTE(review): __VOLK_PREFETCH is not defined in this header; it is expected
 * to be provided by whatever includes this file - confirm.
 */
static inline void volk_32i_x2_and_32i_neonv8(int32_t* cVector,
                                              const int32_t* aVector,
                                              const int32_t* bVector,
                                              unsigned int num_points)
{
    const unsigned int eighthPoints = num_points / 8;

    const int32_t* aPtr = aVector;
    const int32_t* bPtr = bVector;
    int32_t* cPtr = cVector;

    /* Main loop: two 4-lane registers per input, i.e. 8 elements per pass. */
    for (unsigned int number = 0; number < eighthPoints; number++) {
        const int32x4_t a0 = vld1q_s32(aPtr);
        const int32x4_t a1 = vld1q_s32(aPtr + 4);
        const int32x4_t b0 = vld1q_s32(bPtr);
        const int32x4_t b1 = vld1q_s32(bPtr + 4);

        /* Hint at the data two passes (16 elements) ahead of the current one. */
        __VOLK_PREFETCH(aPtr + 16);
        __VOLK_PREFETCH(bPtr + 16);

        vst1q_s32(cPtr, vandq_s32(a0, b0));
        vst1q_s32(cPtr + 4, vandq_s32(a1, b1));

        aPtr += 8;
        bPtr += 8;
        cPtr += 8;
    }

    /* Scalar tail: the pointers already sit on the first unprocessed element. */
    for (unsigned int number = eighthPoints * 8; number < num_points; number++) {
        *cPtr++ = (*aPtr++) & (*bPtr++);
    }
}
#endif /* LV_HAVE_NEONV8 */
255
256
257#ifdef LV_HAVE_GENERIC
258
/*
 * Element-wise bitwise AND of two int32 vectors (portable C):
 *   cVector[i] = aVector[i] & bVector[i]   for 0 <= i < num_points
 *
 * cVector    - output buffer, at least num_points elements
 * aVector    - first input buffer
 * bVector    - second input buffer
 * num_points - number of int32 elements to process; 0 leaves cVector untouched
 *
 * Each output element depends only on the input elements at the same index,
 * so cVector may be the same buffer as aVector or bVector.
 */
static inline void volk_32i_x2_and_32i_generic(int32_t* cVector,
                                               const int32_t* aVector,
                                               const int32_t* bVector,
                                               unsigned int num_points)
{
    for (unsigned int number = 0; number < num_points; number++) {
        cVector[number] = aVector[number] & bVector[number];
    }
}
273#endif /* LV_HAVE_GENERIC */
274
275
#ifdef LV_HAVE_ORC
/* Defined outside this header; note that it takes a signed element count. */
extern void volk_32i_x2_and_32i_a_orc_impl(int32_t* cVector,
                                           const int32_t* aVector,
                                           const int32_t* bVector,
                                           int num_points);

/*
 * Bitwise AND of two int32 vectors, delegated to the ORC implementation.
 *
 * cVector    - output buffer
 * aVector    - first input buffer
 * bVector    - second input buffer
 * num_points - number of int32 elements to process
 *
 * This wrapper only forwards its arguments. The implementation's count
 * parameter is a signed int, so num_points is converted explicitly; values
 * above INT_MAX do not survive that conversion.
 */
static inline void volk_32i_x2_and_32i_u_orc(int32_t* cVector,
                                             const int32_t* aVector,
                                             const int32_t* bVector,
                                             unsigned int num_points)
{
    volk_32i_x2_and_32i_a_orc_impl(cVector, aVector, bVector, (int)num_points);
}
#endif /* LV_HAVE_ORC */
290
291
292#endif /* INCLUDED_volk_32i_x2_and_32i_a_H */
293
294
295#ifndef INCLUDED_volk_32i_x2_and_32i_u_H
296#define INCLUDED_volk_32i_x2_and_32i_u_H
297
298#include <inttypes.h>
299#include <stdio.h>
300
#ifdef LV_HAVE_AVX512F
#include <immintrin.h>

/*
 * Element-wise bitwise AND of two int32 vectors (AVX-512F, unaligned access):
 *   cVector[i] = aVector[i] & bVector[i]   for 0 <= i < num_points
 *
 * cVector    - output buffer, at least num_points elements
 * aVector    - first input buffer
 * bVector    - second input buffer
 * num_points - number of int32 elements to process
 *
 * The vector loop uses _mm512_loadu_si512 / _mm512_storeu_si512, which place
 * no alignment requirement on the buffers.
 */
static inline void volk_32i_x2_and_32i_u_avx512f(int32_t* cVector,
                                                 const int32_t* aVector,
                                                 const int32_t* bVector,
                                                 unsigned int num_points)
{
    const unsigned int sixteenthPoints = num_points / 16;

    int32_t* cPtr = cVector;
    const int32_t* aPtr = aVector;
    const int32_t* bPtr = bVector;

    /* Main loop: one 512-bit register holds 16 int32 lanes. */
    for (unsigned int number = 0; number < sixteenthPoints; number++) {
        const __m512i aVal = _mm512_loadu_si512(aPtr);
        const __m512i bVal = _mm512_loadu_si512(bPtr);

        _mm512_storeu_si512(cPtr, _mm512_and_si512(aVal, bVal));

        aPtr += 16;
        bPtr += 16;
        cPtr += 16;
    }

    /* Scalar tail: the num_points % 16 elements the vector loop did not reach. */
    for (unsigned int number = sixteenthPoints * 16; number < num_points; number++) {
        cVector[number] = aVector[number] & bVector[number];
    }
}
#endif /* LV_HAVE_AVX512F */
337
#ifdef LV_HAVE_AVX2
#include <immintrin.h>

/*
 * Element-wise bitwise AND of two int32 vectors (AVX2, unaligned access):
 *   cVector[i] = aVector[i] & bVector[i]   for 0 <= i < num_points
 *
 * cVector    - output buffer, at least num_points elements
 * aVector    - first input buffer
 * bVector    - second input buffer
 * num_points - number of int32 elements to process
 *
 * The vector loop uses _mm256_loadu_si256 / _mm256_storeu_si256, which place
 * no alignment requirement on the buffers.
 */
static inline void volk_32i_x2_and_32i_u_avx2(int32_t* cVector,
                                              const int32_t* aVector,
                                              const int32_t* bVector,
                                              unsigned int num_points)
{
    const unsigned int oneEightPoints = num_points / 8;

    int32_t* cPtr = cVector;
    const int32_t* aPtr = aVector;
    const int32_t* bPtr = bVector;

    /* Main loop: one 256-bit register holds 8 int32 lanes. */
    for (unsigned int number = 0; number < oneEightPoints; number++) {
        /* Inputs are read-only, so keep const in the pointer casts. */
        const __m256i aVal = _mm256_loadu_si256((const __m256i*)aPtr);
        const __m256i bVal = _mm256_loadu_si256((const __m256i*)bPtr);

        _mm256_storeu_si256((__m256i*)cPtr, _mm256_and_si256(aVal, bVal));

        aPtr += 8;
        bPtr += 8;
        cPtr += 8;
    }

    /* Scalar tail: the num_points % 8 elements the vector loop did not reach. */
    for (unsigned int number = oneEightPoints * 8; number < num_points; number++) {
        cVector[number] = aVector[number] & bVector[number];
    }
}
#endif /* LV_HAVE_AVX2 */
375
#ifdef LV_HAVE_RVV
#include <riscv_vector.h>

/*
 * Element-wise bitwise AND of two int32 vectors (RISC-V Vector extension):
 *   cVector[i] = aVector[i] & bVector[i]   for 0 <= i < num_points
 *
 * cVector    - output buffer, at least num_points elements
 * aVector    - first input buffer
 * bVector    - second input buffer
 * num_points - number of int32 elements to process
 *
 * There is no separate scalar tail: every pass asks __riscv_vsetvl_e32m8 how
 * many of the remaining elements to handle, and the loop ends once none are
 * left.
 */
static inline void volk_32i_x2_and_32i_rvv(int32_t* cVector,
                                           const int32_t* aVector,
                                           const int32_t* bVector,
                                           unsigned int num_points)
{
    size_t remaining = num_points;

    while (remaining > 0) {
        /* Element count for this pass (32-bit elements, LMUL = 8). */
        const size_t vl = __riscv_vsetvl_e32m8(remaining);

        const vint32m8_t va = __riscv_vle32_v_i32m8(aVector, vl);
        const vint32m8_t vb = __riscv_vle32_v_i32m8(bVector, vl);
        __riscv_vse32(cVector, __riscv_vand(va, vb, vl), vl);

        aVector += vl;
        bVector += vl;
        cVector += vl;
        remaining -= vl;
    }
}
#endif /*LV_HAVE_RVV*/
393
394#endif /* INCLUDED_volk_32i_x2_and_32i_u_H */