9.23.2012

Bitwise Operations in C++

TC++: ビット操作関連の関数

C++ でビット操作の実装。

・最右ビットの分離、最左ビットの分離(=2の冪乗で切り捨て)、1のビットの数を数えるコードと実行例

#include <cmath>
#include <iostream>
#include <ctime>

using namespace std;

// RMB(right-most bit)  : 01011000 => 00001000, 00000000 => 00000000
// LMB(left-most bit)   : 01011000 => 01000000, 00000000 => 00000000
// POP(population count): 01011000 => 3
template <typename T> T RMB(T x){return x&-(unsigned long long)x;}
unsigned int LMB32(unsigned int x){x|=x>>1;x|=x>>2;x|=x>>4;x|=x>>8;x|=x>>16;return x-(x>>1);}
unsigned long long LMB(unsigned long long x){x|=x>>1;x|=x>>2;x|=x>>4;x|=x>>8;x|=x>>16;x|=x>>32;return x-(x>>1);}
int POP32(unsigned x) {
  unsigned y=0x33333333;x-=(x>>1)&0x55555555;
  x=(x&y)+((x>>2)&y);x=(x+(x>>4))&0x0f0f0f0f;x+=x>>8;x+=x>>16;return x&0x3f;}
int POP(unsigned long long x) {
  unsigned long long y=0x3333333333333333ULL;x-=(x>>1)&0x5555555555555555ULL;
  x=(x&y)+((x>>2)&y);x=(x+(x>>4))&0x0f0f0f0f0f0f0f0fULL;x+=x>>8;x+=x>>16;x+=x>>32;return x&0x7f;}

#define sz(a) static_cast<int>(sizeof(a) / sizeof((a)[0]))
#define rep(i,n) for(int i=0;i<n;++i)
#define check32(val, name) \
  rep(i, sz(val)) { \
    printf("%s %08x: RMB=%08x, LMB=%08x, POP=%d\n", name, \
        val[i], RMB(val[i]), LMB32(val[i]), POP32(val[i])); \
  }
#define check64(val, name) \
  rep(i, sz(val)) { \
    printf("%s %016llx: RMB=%016llx, LMB=%016llx, POP=%d\n", name, \
        val[i], RMB(val[i]), LMB(val[i]), POP(val[i])); \
  }

int main(int argc, char *argv[]) {
  int ints[] = {0, 1, 88, 0x7fffffff, 0x80000000, 0xffffffff};
  unsigned uints[] = {0, 1, 88, 0x7fffffff, 0x80000000, 0xffffffff};
  long long lls[] = {0, 1, 88, 0x7fffffff, 0x80000000LL, 0xffffffffLL,
      0x7fffffffffffffffLL, 0x8000000000000000LL, 0xffffffffffffffffLL};
  unsigned long long ulls[] = {0, 1, 88, 0x7fffffff, 0x80000000LL, 0xffffffffLL,
      0x7fffffffffffffffLL, 0x8000000000000000LL, 0xffffffffffffffffLL};

  check32(ints , "int ");
  check32(uints, "uint");
  check64(lls  , "ll  ");
  check64(ulls , "ull ");
  return 0;
}

・実行結果

int  00000000: RMB=00000000, LMB=00000000, POP=0
int  00000001: RMB=00000001, LMB=00000001, POP=1
int  00000058: RMB=00000008, LMB=00000040, POP=3
int  7fffffff: RMB=00000001, LMB=40000000, POP=31
int  80000000: RMB=80000000, LMB=80000000, POP=1
int  ffffffff: RMB=00000001, LMB=80000000, POP=32
uint 00000000: RMB=00000000, LMB=00000000, POP=0
uint 00000001: RMB=00000001, LMB=00000001, POP=1
uint 00000058: RMB=00000008, LMB=00000040, POP=3
uint 7fffffff: RMB=00000001, LMB=40000000, POP=31
uint 80000000: RMB=80000000, LMB=80000000, POP=1
uint ffffffff: RMB=00000001, LMB=80000000, POP=32
ll   0000000000000000: RMB=0000000000000000, LMB=0000000000000000, POP=0
ll   0000000000000001: RMB=0000000000000001, LMB=0000000000000001, POP=1
ll   0000000000000058: RMB=0000000000000008, LMB=0000000000000040, POP=3
ll   000000007fffffff: RMB=0000000000000001, LMB=0000000040000000, POP=31
ll   0000000080000000: RMB=0000000080000000, LMB=0000000080000000, POP=1
ll   00000000ffffffff: RMB=0000000000000001, LMB=0000000080000000, POP=32
ll   7fffffffffffffff: RMB=0000000000000001, LMB=4000000000000000, POP=63
ll   8000000000000000: RMB=8000000000000000, LMB=8000000000000000, POP=1
ll   ffffffffffffffff: RMB=0000000000000001, LMB=8000000000000000, POP=64
ull  0000000000000000: RMB=0000000000000000, LMB=0000000000000000, POP=0
ull  0000000000000001: RMB=0000000000000001, LMB=0000000000000001, POP=1
ull  0000000000000058: RMB=0000000000000008, LMB=0000000000000040, POP=3
ull  000000007fffffff: RMB=0000000000000001, LMB=0000000040000000, POP=31
ull  0000000080000000: RMB=0000000080000000, LMB=0000000080000000, POP=1
ull  00000000ffffffff: RMB=0000000000000001, LMB=0000000080000000, POP=32
ull  7fffffffffffffff: RMB=0000000000000001, LMB=4000000000000000, POP=63
ull  8000000000000000: RMB=8000000000000000, LMB=8000000000000000, POP=1
ull  ffffffffffffffff: RMB=0000000000000001, LMB=8000000000000000, POP=64

・ アルゴリズム実測

LMB(最左のビットを求める計算)については、いくつかの異なるアプローチがある。

1) Divide & Conquer で 左端の1のビットを伝播させる。分岐不要。
2) 左端から1ビットずつ移動してチェック。 大きいビットが立っている場合に有利。
3) 最右の1のビットを消し込んでいく方法。1のビットが少ない場合に有利。
4) log2関数を使う方法。ただし、long long型の大きな数になると精度不足で誤った結果となってしまったので
 最初に一回だけ分岐を行った。

次のようなコードで実装した。

#include <cmath>
#include <iostream>
#include <ctime>

using namespace std;
typedef unsigned long long ULL;
#define sz(a) static_cast(sizeof(a) / sizeof((a)[0]))
#define rep(i,n) for(int i=0;i>i;return x-(x>>1);}

ULL LMB_BRANCH_FREE(ULL x) {rep(i,6)x|=x>>(1<<i);return x-(x>>1);}
ULL LMB_BRANCH_FREE2(ULL x) {x|=x>>1;x|=x>>2;x|=x>>4;x|=x>>8;x|=x>>16;x|=x>>32;return x-(x>>1);}
ULL LMB_SHIFT(ULL x) {ULL y=0x8000000000000000;while(y>x)y>>=1;return y;}
ULL LMB_POP(ULL x) {ULL y;do{y=x;x=x&(x-1);}while(x);return y;}
ULL LMB_LOG2(ULL x) {ULL y=0x8000000000000000;return x&y?y:x?1ULL<<static_cast<int>(log2(x)):x;}

double time_it(int iter, int count, T func, U val) {
  double elapsed = 1e30;
  for (int i = 0; i < iter; ++i) {
    clock_t t = clock();
    for (int j = 0; j < count; ++j) {
      func(val);
    }
    clock_t s = clock();
    elapsed = min(elapsed, static_cast(s - t) / CLOCKS_PER_SEC);
  }
  return elapsed;
}

int main(int argc, char *argv[]) {
  ULL a[] = {0, 1, 0x6666666666666666ULL, 0x7fffffffffffffffULL, 0xffffffffffffffffULL};
  ULL (*f[])(ULL) = {LMB_BRANCH_FREE, LMB_BRANCH_FREE2, LMB_SHIFT, LMB_POP, LMB_LOG2};
  int iter = 3, count = 10000000;
  rep(i, sz(f)) rep(j, sz(a)) {
    printf("val=%016llx  elapsed=%.6f  count=%d  ret=%016llx\n",
        a[j], time_it(iter, count, f[i], a[j]), count, f[i](a[j]));
  }
}

手元のMacで実測した結果。

val=0000000000000000  elapsed=0.173500  count=10000000  ret=0000000000000000
val=0000000000000001  elapsed=0.169051  count=10000000  ret=0000000000000001
val=6666666666666666  elapsed=0.164456  count=10000000  ret=4000000000000000
val=7fffffffffffffff  elapsed=0.162661  count=10000000  ret=4000000000000000
val=ffffffffffffffff  elapsed=0.164984  count=10000000  ret=8000000000000000
val=0000000000000000  elapsed=0.104226  count=10000000  ret=0000000000000000
val=0000000000000001  elapsed=0.104737  count=10000000  ret=0000000000000001
val=6666666666666666  elapsed=0.104844  count=10000000  ret=4000000000000000
val=7fffffffffffffff  elapsed=0.104384  count=10000000  ret=4000000000000000
val=ffffffffffffffff  elapsed=0.104266  count=10000000  ret=8000000000000000
val=0000000000000000  elapsed=1.761051  count=10000000  ret=0000000000000000
val=0000000000000001  elapsed=1.732183  count=10000000  ret=0000000000000001
val=6666666666666666  elapsed=0.048205  count=10000000  ret=4000000000000000
val=7fffffffffffffff  elapsed=0.048141  count=10000000  ret=4000000000000000
val=ffffffffffffffff  elapsed=0.040281  count=10000000  ret=8000000000000000
val=0000000000000000  elapsed=0.049773  count=10000000  ret=0000000000000000
val=0000000000000001  elapsed=0.050759  count=10000000  ret=0000000000000001
val=6666666666666666  elapsed=1.017678  count=10000000  ret=4000000000000000
val=7fffffffffffffff  elapsed=1.995849  count=10000000  ret=4000000000000000
val=ffffffffffffffff  elapsed=2.025537  count=10000000  ret=8000000000000000
val=0000000000000000  elapsed=0.049091  count=10000000  ret=0000000000000000
val=0000000000000001  elapsed=0.253222  count=10000000  ret=0000000000000001
val=6666666666666666  elapsed=0.233752  count=10000000  ret=4000000000000000
val=7fffffffffffffff  elapsed=0.233075  count=10000000  ret=8000000000000000
val=ffffffffffffffff  elapsed=0.044856  count=10000000  ret=8000000000000000

グラフにするとこんな感じ。当然ながら、ブランチフリー版が最も安定していた。
NewImage

0 件のコメント:

コメントを投稿