68#ifndef INCLUDED_volk_32i_x2_or_32i_a_H
69#define INCLUDED_volk_32i_x2_or_32i_a_H
77static inline void volk_32i_x2_or_32i_a_avx512f(int32_t* cVector,
78 const int32_t* aVector,
79 const int32_t* bVector,
80 unsigned int num_points)
82 unsigned int number = 0;
83 const unsigned int sixteenthPoints = num_points / 16;
85 int32_t* cPtr = (int32_t*)cVector;
86 const int32_t* aPtr = (int32_t*)aVector;
87 const int32_t* bPtr = (int32_t*)bVector;
89 __m512i aVal, bVal, cVal;
90 for (; number < sixteenthPoints; number++) {
92 aVal = _mm512_load_si512(aPtr);
93 bVal = _mm512_load_si512(bPtr);
95 cVal = _mm512_or_si512(aVal, bVal);
97 _mm512_store_si512(cPtr, cVal);
104 number = sixteenthPoints * 16;
105 for (; number < num_points; number++) {
106 cVector[number] = aVector[number] | bVector[number];
112#include <immintrin.h>
114static inline void volk_32i_x2_or_32i_a_avx2(int32_t* cVector,
115 const int32_t* aVector,
116 const int32_t* bVector,
117 unsigned int num_points)
119 unsigned int number = 0;
120 const unsigned int oneEightPoints = num_points / 8;
122 int32_t* cPtr = cVector;
123 const int32_t* aPtr = aVector;
124 const int32_t* bPtr = bVector;
126 __m256i aVal, bVal, cVal;
127 for (; number < oneEightPoints; number++) {
129 aVal = _mm256_load_si256((__m256i*)aPtr);
130 bVal = _mm256_load_si256((__m256i*)bPtr);
132 cVal = _mm256_or_si256(aVal, bVal);
134 _mm256_store_si256((__m256i*)cPtr,
142 number = oneEightPoints * 8;
143 for (; number < num_points; number++) {
144 cVector[number] = aVector[number] | bVector[number];
151#include <xmmintrin.h>
/* NOTE(review): the first line of this definition (return type, function name
 * and the output-pointer parameter) is not visible in this view. The comments
 * below describe only the statements that are visible. */
154 const int32_t* aVector,
155 const int32_t* bVector,
156 unsigned int num_points)
158 unsigned int number = 0;
/* Number of complete 4-element groups; one group fills a 128-bit register. */
159 const unsigned int quarterPoints = num_points / 4;
/* The integer buffers are viewed as float so the _ps load/or/store intrinsics
 * can be used on them. _mm_or_ps is a bitwise OR, so the values are only
 * combined bit-for-bit, never interpreted arithmetically. */
161 float* cPtr = (
float*)cVector;
162 const float* aPtr = (
float*)aVector;
163 const float* bPtr = (
float*)bVector;
165 __m128 aVal, bVal, cVal;
/* Bulk part: OR four 32-bit lanes per pass using aligned loads and an aligned store.
 * NOTE(review): no statements advancing aPtr/bPtr/cPtr are visible inside this
 * loop in this view; confirm they are present in the full file. */
166 for (; number < quarterPoints; number++) {
167 aVal = _mm_load_ps(aPtr);
168 bVal = _mm_load_ps(bPtr);
170 cVal = _mm_or_ps(aVal, bVal);
172 _mm_store_ps(cPtr, cVal);
/* Tail: the last num_points % 4 elements, indexed directly on the original buffers. */
179 number = quarterPoints * 4;
180 for (; number < num_points; number++) {
181 cVector[number] = aVector[number] | bVector[number];
/* NOTE(review): the first line of this definition (return type, function name
 * and the output-pointer parameter) is not visible in this view. The comments
 * below describe only the statements that are visible. */
191 const int32_t* aVector,
192 const int32_t* bVector,
193 unsigned int num_points)
195 int32_t* cPtr = cVector;
196 const int32_t* aPtr = aVector;
197 const int32_t* bPtr = bVector;
198 unsigned int number = 0;
/* Number of complete 4-element groups; one group fills an int32x4_t register. */
199 unsigned int quarter_points = num_points / 4;
201 int32x4_t a_val, b_val, c_val;
/* Bulk part: load four int32 values from each input, OR them, store four results.
 * NOTE(review): no statements advancing aPtr/bPtr/cPtr are visible inside this
 * loop in this view; the tail loop below relies on the pointers already pointing
 * past the bulk range, so confirm the advance exists in the full file. */
203 for (number = 0; number < quarter_points; number++) {
204 a_val = vld1q_s32(aPtr);
205 b_val = vld1q_s32(bPtr);
206 c_val = vorrq_s32(a_val, b_val);
207 vst1q_s32(cPtr, c_val);
/* Tail: the last num_points % 4 elements, one at a time through the running pointers. */
213 for (number = quarter_points * 4; number < num_points; number++) {
214 *cPtr++ = (*aPtr++) | (*bPtr++);
/*!
 * \brief Element-wise bitwise OR of two int32 buffers (NEON, two registers per pass).
 *
 * Computes cVector[i] = aVector[i] | bVector[i] for i in [0, num_points).
 *
 * \param cVector    Output buffer.
 * \param aVector    First input buffer.
 * \param bVector    Second input buffer.
 * \param num_points Number of int32 elements to process.
 */
static inline void volk_32i_x2_or_32i_neonv8(int32_t* cVector,
                                             const int32_t* aVector,
                                             const int32_t* bVector,
                                             unsigned int num_points)
{
    const unsigned int eighthPoints = num_points / 8;

    const int32_t* aPtr = aVector;
    const int32_t* bPtr = bVector;
    int32_t* cPtr = cVector;

    /* Bulk part: 8 elements per pass, as two independent 4-lane operations. */
    for (unsigned int number = 0; number < eighthPoints; number++) {
        const int32x4_t a0 = vld1q_s32(aPtr);
        const int32x4_t a1 = vld1q_s32(aPtr + 4);
        const int32x4_t b0 = vld1q_s32(bPtr);
        const int32x4_t b1 = vld1q_s32(bPtr + 4);

        vst1q_s32(cPtr, vorrq_s32(a0, b0));
        vst1q_s32(cPtr + 4, vorrq_s32(a1, b1));

        /* Step to the next group. The tail loop below continues from these
         * pointers, so they must end up just past the bulk range. */
        aPtr += 8;
        bPtr += 8;
        cPtr += 8;
    }

    /* Tail: the last num_points % 8 elements, one at a time. */
    for (unsigned int number = eighthPoints * 8; number < num_points; number++) {
        *cPtr++ = (*aPtr++) | (*bPtr++);
    }
}
256#ifdef LV_HAVE_GENERIC
/* NOTE(review): the first line of this definition (return type, function name
 * and the output-pointer parameter) is not visible in this view. The comments
 * below describe only the statements that are visible. */
259 const int32_t* aVector,
260 const int32_t* bVector,
261 unsigned int num_points)
/* Running pointers over the output and the two inputs. */
263 int32_t* cPtr = cVector;
264 const int32_t* aPtr = aVector;
265 const int32_t* bPtr = bVector;
266 unsigned int number = 0;
/* Plain scalar loop: one bitwise OR per element, for num_points elements. */
268 for (number = 0; number < num_points; number++) {
269 *cPtr++ = (*aPtr++) | (*bPtr++);
/* Declaration of an implementation that is defined outside this file.
 * NOTE(review): the end of the parameter list and the terminating ");" are not
 * visible in this view; confirm the remaining parameter(s) against the definition. */
276extern void volk_32i_x2_or_32i_a_orc_impl(int32_t* cVector,
277 const int32_t* aVector,
278 const int32_t* bVector,
/*!
 * \brief Thin wrapper that forwards to volk_32i_x2_or_32i_a_orc_impl.
 *
 * \param cVector    Output buffer, passed through unchanged.
 * \param aVector    First input buffer, passed through unchanged.
 * \param bVector    Second input buffer, passed through unchanged.
 * \param num_points Element count, passed through unchanged.
 */
static inline void volk_32i_x2_or_32i_u_orc(int32_t* cVector,
                                            const int32_t* aVector,
                                            const int32_t* bVector,
                                            unsigned int num_points)
{
    volk_32i_x2_or_32i_a_orc_impl(cVector, aVector, bVector, num_points);
}
294#ifndef INCLUDED_volk_32i_x2_or_32i_u_H
295#define INCLUDED_volk_32i_x2_or_32i_u_H
300#ifdef LV_HAVE_AVX512F
301#include <immintrin.h>
303static inline void volk_32i_x2_or_32i_u_avx512f(int32_t* cVector,
304 const int32_t* aVector,
305 const int32_t* bVector,
306 unsigned int num_points)
308 unsigned int number = 0;
309 const unsigned int sixteenthPoints = num_points / 16;
311 int32_t* cPtr = (int32_t*)cVector;
312 const int32_t* aPtr = (int32_t*)aVector;
313 const int32_t* bPtr = (int32_t*)bVector;
315 __m512i aVal, bVal, cVal;
316 for (; number < sixteenthPoints; number++) {
318 aVal = _mm512_loadu_si512(aPtr);
319 bVal = _mm512_loadu_si512(bPtr);
321 cVal = _mm512_or_si512(aVal, bVal);
323 _mm512_storeu_si512(cPtr, cVal);
330 number = sixteenthPoints * 16;
331 for (; number < num_points; number++) {
332 cVector[number] = aVector[number] | bVector[number];
338#include <immintrin.h>
340static inline void volk_32i_x2_or_32i_u_avx2(int32_t* cVector,
341 const int32_t* aVector,
342 const int32_t* bVector,
343 unsigned int num_points)
345 unsigned int number = 0;
346 const unsigned int oneEightPoints = num_points / 8;
348 int32_t* cPtr = cVector;
349 const int32_t* aPtr = aVector;
350 const int32_t* bPtr = bVector;
352 __m256i aVal, bVal, cVal;
353 for (; number < oneEightPoints; number++) {
355 aVal = _mm256_loadu_si256((__m256i*)aPtr);
356 bVal = _mm256_loadu_si256((__m256i*)bPtr);
358 cVal = _mm256_or_si256(aVal, bVal);
360 _mm256_storeu_si256((__m256i*)cPtr,
368 number = oneEightPoints * 8;
369 for (; number < num_points; number++) {
370 cVector[number] = aVector[number] | bVector[number];
376#include <riscv_vector.h>
/* Element-wise bitwise OR of two int32 buffers using RISC-V Vector intrinsics:
 * writes aVector[i] | bVector[i] to cVector[i] for num_points elements.
 * NOTE(review): the end of this definition (closing braces) is not visible in this view. */
378static inline void volk_32i_x2_or_32i_rvv(int32_t* cVector,
379 const int32_t* aVector,
380 const int32_t* bVector,
381 unsigned int num_points)
/* n counts the elements still to be processed. */
383 size_t n = num_points;
/* Each pass handles vl elements; afterwards n shrinks by vl and all three
 * buffer pointers move forward by vl. The loop ends when nothing is left. */
384 for (
size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl, cVector += vl) {
/* Request a vector length for 32-bit elements with register grouping m8,
 * given n elements remaining. */
385 vl = __riscv_vsetvl_e32m8(n);
/* Load vl elements from each input. */
386 vint32m8_t va = __riscv_vle32_v_i32m8(aVector, vl);
387 vint32m8_t vb = __riscv_vle32_v_i32m8(bVector, vl);
/* OR the two vectors and store vl results to the output. */
388 __riscv_vse32(cVector, __riscv_vor(va, vb, vl), vl);