Vector Optimized Library of Kernels 3.3.0
Architecture-tuned implementations of math kernels
Loading...
Searching...
No Matches
volk_32u_popcnt.h
Go to the documentation of this file.
1/* -*- c++ -*- */
2/*
3 * Copyright 2012, 2014 Free Software Foundation, Inc.
4 *
5 * This file is part of VOLK
6 *
7 * SPDX-License-Identifier: LGPL-3.0-or-later
8 */
9
42
43#ifndef INCLUDED_VOLK_32u_POPCNT_A16_H
44#define INCLUDED_VOLK_32u_POPCNT_A16_H
45
46#include <inttypes.h>
47#include <stdio.h>
48
49#ifdef LV_HAVE_GENERIC
50
/*
 * Portable population count: stores the number of set bits of `value`
 * into `*ret`.
 *
 * The word is reduced in parallel: adjacent 1-bit fields are summed into
 * 2-bit fields, those into 4-bit fields, then into per-byte totals, and
 * finally the four byte totals are folded together.
 */
static inline void volk_32u_popcnt_generic(uint32_t* ret, const uint32_t value)
{
    uint32_t acc = value;

    // Pairwise sums of neighbouring bits, then of neighbouring 2-bit fields.
    acc = ((acc >> 1) & 0x55555555) + (acc & 0x55555555);
    acc = ((acc >> 2) & 0x33333333) + (acc & 0x33333333);
    // Each nibble sum is at most 4, so adding two nibbles cannot overflow
    // a nibble; mask afterwards to keep one total per byte.
    acc = ((acc >> 4) + acc) & 0x0F0F0F0F;
    // Fold the byte totals down into the least significant byte.
    acc += acc >> 8;
    acc += acc >> 16;

    // The result is at most 32, so six bits are enough; the mask discards
    // the partial sums left behind in the upper bits.
    *ret = acc & 0x0000003F;
}
64
65#endif /*LV_HAVE_GENERIC*/
66
67
68#ifdef LV_HAVE_NEON
69#include <arm_neon.h>
70
71static inline void volk_32u_popcnt_neon(uint32_t* ret, const uint32_t value)
72{
73 // Load value into a 64-bit vector (as 8 bytes)
74 uint8x8_t input = vreinterpret_u8_u32(vdup_n_u32(value));
75 // Count bits in each byte
76 uint8x8_t counts = vcnt_u8(input);
77 // Sum across all bytes (only first 4 matter for 32-bit value)
78 // Use vpaddl to widen and add: 8x8 -> 4x16 -> 2x32 -> 1x64
79 uint16x4_t sum16 = vpaddl_u8(counts);
80 uint32x2_t sum32 = vpaddl_u16(sum16);
81 // Extract the lower 32-bit element which contains the sum of the lower 4 bytes
82 *ret = vget_lane_u32(sum32, 0);
83}
84#endif /* LV_HAVE_NEON */
85
86
87#ifdef LV_HAVE_SSE4_2
88
89#include <nmmintrin.h>
90
/*
 * SSE4.2 population count: stores the number of set bits of `value`
 * into `*ret` using the _mm_popcnt_u32 intrinsic.
 */
static inline void volk_32u_popcnt_a_sse4_2(uint32_t* ret, const uint32_t value)
{
    const uint32_t set_bits = (uint32_t)_mm_popcnt_u32(value);
    *ret = set_bits;
}
95
96#endif /*LV_HAVE_SSE4_2*/
97
98#ifdef LV_HAVE_RVV
99#include <riscv_vector.h>
100
/*
 * RISC-V Vector population count: stores the number of set bits of
 * `value` into `*ret`.
 */
static inline void volk_32u_popcnt_rvv(uint32_t* ret, const uint32_t value)
{
    // Put the word into a vector register, reinterpret that register as a
    // mask, and count the set mask bits among the first 32 positions.
    const uint32_t set_bits = (uint32_t)__riscv_vcpop(
        __riscv_vreinterpret_b4(__riscv_vmv_s_x_u64m1(value, 1)), 32);
    *ret = set_bits;
}
105#endif /*LV_HAVE_RVV*/
106
107#ifdef LV_HAVE_RVA22V
108#include <riscv_bitmanip.h>
109
/*
 * RISC-V scalar bit-manipulation population count: stores the number of
 * set bits of `value` into `*ret` using the __riscv_cpop_32 intrinsic.
 */
static inline void volk_32u_popcnt_rva22(uint32_t* ret, const uint32_t value)
{
    const uint32_t set_bits = (uint32_t)__riscv_cpop_32(value);
    *ret = set_bits;
}
114#endif /*LV_HAVE_RVA22V*/
115
116#endif /*INCLUDED_VOLK_32u_POPCNT_A16_H*/