O(n)で打倒qsortを目指し。勝てないなぁ。
アルゴリズム考えてるときから分かってたけど、メモリ喰いまくり。ワーストケースでn*33bytesも喰う。
あとntzはもっと早く求めることできる気がする。
uint bit_count(uint bits) { bits = (bits & 0x55555555) + (bits >> 1 & 0x55555555); bits = (bits & 0x33333333) + (bits >> 2 & 0x33333333); bits = (bits & 0x0f0f0f0f) + (bits >> 4 & 0x0f0f0f0f); bits = (bits & 0x00ff00ff) + (bits >> 8 & 0x00ff00ff); return (bits & 0x0000ffff) + (bits >>16 & 0x0000ffff); } uint[] sort(uint[] arr){ size_t[33] index, offset; foreach(ref a;arr) for(uint k = a; k; k&=k-1) offset[bit_count(~k&(k-1))]++; foreach(i;0..32) index[i+1] = offset[i] + index[i]; scope uint[] work = new uint[index[32] + arr.length]; foreach(ref a;arr) work[index[bit_count(~a&(a-1))]++] = a; auto o = &*offset; uint mask = uint.max << 1; foreach(ref a;work[0..$-arr.length]){ for(;!*o;o++) mask&=mask-1; (*o)--; auto t = a & mask; work[index[bit_count(~t&(t-1))]++] = a; } arr = work[$-arr.length..$]; return arr; }
ntzを高速化してみた。ライセンス注意。
//License is under LGPL. copy paste from ffmpeg invariant ubyte[256] ff_log2_tab=[ 0,0,1,1,2,2,2,2,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4, 5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5, 6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6, 6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7 ]; //License is under LGPL. copy paste from ffmpeg uint ntz(bool zero=true)(uint x){ static if(!zero) assert(x); else if(!x) return 32; x = x&(-x); int n = 0; if (x & 0xffff0000) { x >>= 16; n += 16; } if (x & 0xff00) { x >>= 8; n += 8; } n += ff_log2_tab[x]; return n; } uint[] sort(uint[] arr){ size_t[33] index, offset; foreach(ref a;arr) for(uint k = a; k; k&=k-1) offset[ntz!(false)(k)]++; foreach(i;0..32) index[i+1] = offset[i] + index[i]; scope uint[] work = new uint[index[32] + arr.length]; foreach(ref a;arr) work[index[ntz(a)]++] = a; auto o = &*offset; uint mask = uint.max << 1; foreach(ref a;work[0..$-arr.length]){ for(;!*o;o++) mask&=mask-1; (*o)--; auto t = a & mask; work[index[ntz(t)]++] = a; } arr = work[$-arr.length..$]; return arr; }