TC++: ビット操作関連の関数
C++ でビット操作の実装。
・最右ビットの分離、最左ビットの分離(=2の冪乗で切り捨て)、1のビットの数を数えるコードと実行例
#include <cmath>
#include <iostream>
#include <ctime>
using namespace std;
// RMB(right-most bit) : 01011000 => 00001000, 00000000 => 00000000
// LMB(left-most bit) : 01011000 => 01000000, 00000000 => 00000000
// POP(population count): 01011000 => 3
template <typename T> T RMB(T x){return x&-(unsigned long long)x;}
unsigned int LMB32(unsigned int x){x|=x>>1;x|=x>>2;x|=x>>4;x|=x>>8;x|=x>>16;return x-(x>>1);}
unsigned long long LMB(unsigned long long x){x|=x>>1;x|=x>>2;x|=x>>4;x|=x>>8;x|=x>>16;x|=x>>32;return x-(x>>1);}
int POP32(unsigned x) {
unsigned y=0x33333333;x-=(x>>1)&0x55555555;
x=(x&y)+((x>>2)&y);x=(x+(x>>4))&0x0f0f0f0f;x+=x>>8;x+=x>>16;return x&0x3f;}
int POP(unsigned long long x) {
unsigned long long y=0x3333333333333333ULL;x-=(x>>1)&0x5555555555555555ULL;
x=(x&y)+((x>>2)&y);x=(x+(x>>4))&0x0f0f0f0f0f0f0f0fULL;x+=x>>8;x+=x>>16;x+=x>>32;return x&0x7f;}
#define sz(a) static_cast<int>(sizeof(a) / sizeof((a)[0]))
#define rep(i,n) for(int i=0;i<n;++i)
#define check32(val, name) \
rep(i, sz(val)) { \
printf("%s %08x: RMB=%08x, LMB=%08x, POP=%d\n", name, \
val[i], RMB(val[i]), LMB32(val[i]), POP32(val[i])); \
}
#define check64(val, name) \
rep(i, sz(val)) { \
printf("%s %016llx: RMB=%016llx, LMB=%016llx, POP=%d\n", name, \
val[i], RMB(val[i]), LMB(val[i]), POP(val[i])); \
}
int main(int argc, char *argv[]) {
int ints[] = {0, 1, 88, 0x7fffffff, 0x80000000, 0xffffffff};
unsigned uints[] = {0, 1, 88, 0x7fffffff, 0x80000000, 0xffffffff};
long long lls[] = {0, 1, 88, 0x7fffffff, 0x80000000LL, 0xffffffffLL,
0x7fffffffffffffffLL, 0x8000000000000000LL, 0xffffffffffffffffLL};
unsigned long long ulls[] = {0, 1, 88, 0x7fffffff, 0x80000000LL, 0xffffffffLL,
0x7fffffffffffffffLL, 0x8000000000000000LL, 0xffffffffffffffffLL};
check32(ints , "int ");
check32(uints, "uint");
check64(lls , "ll ");
check64(ulls , "ull ");
return 0;
}
・実行結果
int 00000000: RMB=00000000, LMB=00000000, POP=0
int 00000001: RMB=00000001, LMB=00000001, POP=1
int 00000058: RMB=00000008, LMB=00000040, POP=3
int 7fffffff: RMB=00000001, LMB=40000000, POP=31
int 80000000: RMB=80000000, LMB=80000000, POP=1
int ffffffff: RMB=00000001, LMB=80000000, POP=32
uint 00000000: RMB=00000000, LMB=00000000, POP=0
uint 00000001: RMB=00000001, LMB=00000001, POP=1
uint 00000058: RMB=00000008, LMB=00000040, POP=3
uint 7fffffff: RMB=00000001, LMB=40000000, POP=31
uint 80000000: RMB=80000000, LMB=80000000, POP=1
uint ffffffff: RMB=00000001, LMB=80000000, POP=32
ll 0000000000000000: RMB=0000000000000000, LMB=0000000000000000, POP=0
ll 0000000000000001: RMB=0000000000000001, LMB=0000000000000001, POP=1
ll 0000000000000058: RMB=0000000000000008, LMB=0000000000000040, POP=3
ll 000000007fffffff: RMB=0000000000000001, LMB=0000000040000000, POP=31
ll 0000000080000000: RMB=0000000080000000, LMB=0000000080000000, POP=1
ll 00000000ffffffff: RMB=0000000000000001, LMB=0000000080000000, POP=32
ll 7fffffffffffffff: RMB=0000000000000001, LMB=4000000000000000, POP=63
ll 8000000000000000: RMB=8000000000000000, LMB=8000000000000000, POP=1
ll ffffffffffffffff: RMB=0000000000000001, LMB=8000000000000000, POP=64
ull 0000000000000000: RMB=0000000000000000, LMB=0000000000000000, POP=0
ull 0000000000000001: RMB=0000000000000001, LMB=0000000000000001, POP=1
ull 0000000000000058: RMB=0000000000000008, LMB=0000000000000040, POP=3
ull 000000007fffffff: RMB=0000000000000001, LMB=0000000040000000, POP=31
ull 0000000080000000: RMB=0000000080000000, LMB=0000000080000000, POP=1
ull 00000000ffffffff: RMB=0000000000000001, LMB=0000000080000000, POP=32
ull 7fffffffffffffff: RMB=0000000000000001, LMB=4000000000000000, POP=63
ull 8000000000000000: RMB=8000000000000000, LMB=8000000000000000, POP=1
ull ffffffffffffffff: RMB=0000000000000001, LMB=8000000000000000, POP=64
・ アルゴリズム実測
LMB(最左のビットを求める計算)については、いくつかの異なるアプローチがある。
1) Divide & Conquer で 左端の1のビットを伝播させる。分岐不要。
2) 左端から1ビットずつ移動してチェック。 大きいビットが立っている場合に有利。
3) 最右の1のビットを消し込んでいく方法。1のビットが少ない場合に有利。
4) log2関数を使う方法。ただし、long long型の大きな数になると精度不足で誤った結果となってしまったので
最初に一回だけ分岐を行った。
次のようなコードで実装した。
#include <cmath>
#include <iostream>
#include <ctime>
using namespace std;
typedef unsigned long long ULL;
#define sz(a) static_cast(sizeof(a) / sizeof((a)[0]))
#define rep(i,n) for(int i=0;i>i;return x-(x>>1);}
ULL LMB_BRANCH_FREE(ULL x) {rep(i,6)x|=x>>(1<<i);return x-(x>>1);}
ULL LMB_BRANCH_FREE2(ULL x) {x|=x>>1;x|=x>>2;x|=x>>4;x|=x>>8;x|=x>>16;x|=x>>32;return x-(x>>1);}
ULL LMB_SHIFT(ULL x) {ULL y=0x8000000000000000;while(y>x)y>>=1;return y;}
ULL LMB_POP(ULL x) {ULL y;do{y=x;x=x&(x-1);}while(x);return y;}
ULL LMB_LOG2(ULL x) {ULL y=0x8000000000000000;return x&y?y:x?1ULL<<static_cast<int>(log2(x)):x;}
double time_it(int iter, int count, T func, U val) {
double elapsed = 1e30;
for (int i = 0; i < iter; ++i) {
clock_t t = clock();
for (int j = 0; j < count; ++j) {
func(val);
}
clock_t s = clock();
elapsed = min(elapsed, static_cast(s - t) / CLOCKS_PER_SEC);
}
return elapsed;
}
int main(int argc, char *argv[]) {
ULL a[] = {0, 1, 0x6666666666666666ULL, 0x7fffffffffffffffULL, 0xffffffffffffffffULL};
ULL (*f[])(ULL) = {LMB_BRANCH_FREE, LMB_BRANCH_FREE2, LMB_SHIFT, LMB_POP, LMB_LOG2};
int iter = 3, count = 10000000;
rep(i, sz(f)) rep(j, sz(a)) {
printf("val=%016llx elapsed=%.6f count=%d ret=%016llx\n",
a[j], time_it(iter, count, f[i], a[j]), count, f[i](a[j]));
}
}
手元のMacで実測した結果。
val=0000000000000000 elapsed=0.173500 count=10000000 ret=0000000000000000
val=0000000000000001 elapsed=0.169051 count=10000000 ret=0000000000000001
val=6666666666666666 elapsed=0.164456 count=10000000 ret=4000000000000000
val=7fffffffffffffff elapsed=0.162661 count=10000000 ret=4000000000000000
val=ffffffffffffffff elapsed=0.164984 count=10000000 ret=8000000000000000
val=0000000000000000 elapsed=0.104226 count=10000000 ret=0000000000000000
val=0000000000000001 elapsed=0.104737 count=10000000 ret=0000000000000001
val=6666666666666666 elapsed=0.104844 count=10000000 ret=4000000000000000
val=7fffffffffffffff elapsed=0.104384 count=10000000 ret=4000000000000000
val=ffffffffffffffff elapsed=0.104266 count=10000000 ret=8000000000000000
val=0000000000000000 elapsed=1.761051 count=10000000 ret=0000000000000000
val=0000000000000001 elapsed=1.732183 count=10000000 ret=0000000000000001
val=6666666666666666 elapsed=0.048205 count=10000000 ret=4000000000000000
val=7fffffffffffffff elapsed=0.048141 count=10000000 ret=4000000000000000
val=ffffffffffffffff elapsed=0.040281 count=10000000 ret=8000000000000000
val=0000000000000000 elapsed=0.049773 count=10000000 ret=0000000000000000
val=0000000000000001 elapsed=0.050759 count=10000000 ret=0000000000000001
val=6666666666666666 elapsed=1.017678 count=10000000 ret=4000000000000000
val=7fffffffffffffff elapsed=1.995849 count=10000000 ret=4000000000000000
val=ffffffffffffffff elapsed=2.025537 count=10000000 ret=8000000000000000
val=0000000000000000 elapsed=0.049091 count=10000000 ret=0000000000000000
val=0000000000000001 elapsed=0.253222 count=10000000 ret=0000000000000001
val=6666666666666666 elapsed=0.233752 count=10000000 ret=4000000000000000
val=7fffffffffffffff elapsed=0.233075 count=10000000 ret=8000000000000000
val=ffffffffffffffff elapsed=0.044856 count=10000000 ret=8000000000000000
グラフにするとこんな感じ。当然ながら、ブランチフリー版が最も安定していた。