自作ソート続き - 謎's キッチン

メモリがワーストケースでもn*32で良くなった。入力バッファに出力するようにしたので、配列スライスやGCの無いC言語でも実装できるようになった。高速化した。まだ一ヶ所高速化途中の箇所あるけどもういいや。
速度的には、D言語の内部ソート(独自実装のqsort)にも勝てない。STLのsortはおろか、C言語のqsortすら負けてる気ガス。
#そもそもテーブル使ってる時点でキャッシュメモリを汚染s(ry
#CPUにntz命令があれば良いのに。
まぁ書いてて楽しかったので良し。

import std.stdio, std.c.stdlib;

invariant ubyte[152] table=[
  0, 0, 1, 0, 2, 0, 8, 9, 3,10, 0,16,17,11,18, 0, 4, 0,19, 0, 0,12, 0, 0,24,25,20,26, 0, 0, 0,27,
  5, 0, 0, 0, 0,13, 0,28, 0, 0,21, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,29, 0, 0, 0, 0, 0, 0, 0, 0,
  6, 0, 0, 0, 0,14, 0, 0, 0, 0,22, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,30, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  7, 0, 0, 0, 0,15, 0, 0, 0, 0,23, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,31
];

invariant ubyte[256] table2=[
  32,24,25,24,26,24,25,24,27,24,25,24,26,24,25,24,
  28,24,25,24,26,24,25,24,27,24,25,24,26,24,25,24,
  29,24,25,24,26,24,25,24,27,24,25,24,26,24,25,24,
  28,24,25,24,26,24,25,24,27,24,25,24,26,24,25,24,
  30,24,25,24,26,24,25,24,27,24,25,24,26,24,25,24,
  28,24,25,24,26,24,25,24,27,24,25,24,26,24,25,24,
  29,24,25,24,26,24,25,24,27,24,25,24,26,24,25,24,
  28,24,25,24,26,24,25,24,27,24,25,24,26,24,25,24,
  31,24,25,24,26,24,25,24,27,24,25,24,26,24,25,24,
  28,24,25,24,26,24,25,24,27,24,25,24,26,24,25,24,
  29,24,25,24,26,24,25,24,27,24,25,24,26,24,25,24,
  28,24,25,24,26,24,25,24,27,24,25,24,26,24,25,24,
  30,24,25,24,26,24,25,24,27,24,25,24,26,24,25,24,
  28,24,25,24,26,24,25,24,27,24,25,24,26,24,25,24,
  29,24,25,24,26,24,25,24,27,24,25,24,26,24,25,24,
  28,24,25,24,26,24,25,24,27,24,25,24,26,24,25,24,
];

uint ntz(bool zero=true, uint b = 0)(uint x){
  version(D_InlineAsm_X86){
    asm{
      naked;
    };
    static if(!zero)
      assert(x);
    else
      asm{
        test EAX, EAX;
        jne n;
        mov EAX, 32;
        ret;
      };
    n:

    asm{
      mov ECX, EAX;
      neg ECX;
      and ECX, EAX;

    }
    static if(b==0){
      asm{
        movzx EAX, CL;
        mov   AL, byte ptr table[EAX];
        movzx EDX, CH;
        or    AL, byte ptr table[EDX+0x5];
        shr ECX, 16;

        mov  DL, CL;
        or   AL, byte ptr table[EDX+0xa];
        mov  DL, CH;
        or   AL, byte ptr table[EDX+0x17];

        ret;
      };
    }else static if(b==1){
      asm{
        movzx EAX, CH;
        mov    AL, byte ptr table[EAX+0x5];
        shr   ECX, 16;

        movzx EDX, CL;
        or    AL, byte ptr table[EDX+0xa];
        mov   DL, CH;
        or    AL, byte ptr table[EDX+0x17];

        ret;
      };
    }else static if(b==2){
      asm{
        shr   ECX, 16;

        movzx EAX, CL;
        mov    AL, byte ptr table[EAX+0xa];
        movzx EDX, CH;
        or    AL, byte ptr table[EDX+0x17];

        ret;
     };
   }else static if(b==3){
      asm{
        shr   ECX, 24;

        movzx EAX, byte ptr table[ECX+0x17];

        ret;
      };
    }
  }else{
    static if(!zero)
      assert(x);
    else
      if(!x)
        return 32;

    x = x&(-x);

    static if(b==0){
      return table[ (x>>24)      + 0x17] |
             table[((x>>16)&0xff) + 0xa ] |
             table[((x>>8 )&0xff) + 0x5 ] |
             table[  x    &0xff  + 0x0 ];
    }else static if(b==1){
      return table[ (x>>24)      + 0x17] |
             table[((x>>16)&0xff) + 0xa ] |
             table[((x>>8 )&0xff) + 0x5 ];

    }else static if(b==2){
      return table[ (x>>24)      + 0x17] |
             table[((x>>16)&0xff) + 0xa ];

    }else static if(b==3){
      return table[ (x>>24)      + 0x17];
    }
  }
}

uint offset_init(size_t* _offset,uint[] arr){
  version(D_InlineAsm_X86){
    asm{
      naked;

      push EBP;
      mov EBP,ESP;
      push EBX;
      push ESI;
      push EDI;

      mov ESI, 0xc[EBP]; // arr.ptr
      mov EDI, 0x8[EBP]; // arr.ptr + arr.length
      add EDI, EDI;
      add EDI, EDI;
      add EDI, ESI;
      mov EBP, 0x10[EBP]; // _offset
      mov EAX, 0; //len

      start:
      mov EBX, [ESI];
      xor EDX, EDX;
      test BL, BL;
      jz end1;
      start1:
      mov ECX, EBX;
      neg ECX;
      and ECX, EBX;
      mov DL, byte ptr table[ECX];
      inc int ptr [EBP+EDX*4];
      inc EAX;
      xor EBX, ECX;
      test BL, BL;
      jnz start1;
      end1:;

      test BH, BH;
      jz end2;
      start2:
      movzx ECX, BH;
      neg CL;
      and CL, BH;
      mov DL, byte ptr table[ECX + 5];
      inc int ptr [EBP+EDX*4];
      inc EAX;
      xor BH, CL;
      test BH, BH;
      jnz start2;
      end2:;

      shr EBX, 0x10;

      test BL, BL;
      jz end3;
      start3:
      mov ECX, EBX;
      neg ECX;
      and ECX, EBX;
      mov DL, byte ptr table[ECX + 0xa];
      inc int ptr [EBP+EDX*4];
      inc EAX;
      xor EBX, ECX;
      test BL, BL;
      jnz start3;
      end3:;

      test BH, BH;
      jz end4;
      start4:
      movzx ECX, BH;
      neg CL;
      and CL, BH;
      mov DL, byte ptr table[ECX + 0x17];
      inc int ptr [EBP+EDX*4];
      inc EAX;
      xor BH, CL;
      test BH, BH;
      jnz start4;
      end4:;

      add ESI, 4;
      cmp ESI, EDI;
      jb start;

      pop EDI;
      pop ESI;
      pop EBX;
      pop EBP;
      ret 0xc;
    }
  }else{
    uint len;
    foreach(a;arr){
      uint b;
      for(; a & 0xff; a^=b, len++)
        _offset[table[b=(a&(-a))]]++;
      a>>=8;
      for(; a & 0xff; a^=b, len++)
        _offset[table[(b=(a&(-a))) + 5]]++;
      a>>=8;
      for(; a & 0xff; a^=b, len++)
        _offset[table[(b=(a&(-a))) + 0xa]]++;
      a>>=8;
      for(; a; a^=b, len++)
        _offset[table[(b=(a&(-a))) + 0x17]]++;
    }
    return len;
  }
}

uint[] sort(uint[] arr){
  size_t[32] offset;
  uint*[33] index = void;
  size_t len = offset_init(offset.ptr, arr);

  scope uint[] work = new uint[len];

  index[0] = &*work;
  foreach(i;0..31)
    index[i+1] = index[i] + offset[i];
  index[32] = &*arr;

  foreach(ref a;arr)
    *index[ntz(a)]++ = a;
  uint mask = uint.max << 1;
  auto w = &*work;

  foreach(o;offset[0..7]){
    for(;o;o--, w++)
      *index[ntz!(true,0)(*w & mask)]++ = *w;
    mask += mask;
  }
  foreach(o;offset[7..15]){
    for(;o;o--, w++)
      *index[ntz!(true,1)(*w & mask)]++ = *w;
    mask += mask;
  }
  foreach(o;offset[15..23]){
    for(;o;o--, w++)
      *index[ntz!(true,2)(*w & mask)]++ = *w;
    mask += mask;
  }
  foreach(o;offset[23..31]){
    for(;o;o--, w++)
      *index[table2[(*w & mask)>>24]]++ = *w;
    mask += mask;
  }
  for(auto o=offset[31];o;o--, w++)
    *index[32]++ = *w;
  return arr;
}

そういや~~i486に~~bsfあるの忘れてた…orz。書き換えてみた。

~~bsfは遅いなぁ。キャッシュ汚染もないのに…。~~
コンパイラの具合によって速度が変わる感じ。組み込み関数使ったら早くなった。

import std.stdio, std.intrinsic;

uint[] sort(uint[] arr){
  size_t[32] offset;
  uint*[33] index = void;
  size_t len;

  foreach(a;arr)
    for(; a; a&=a-1, len++)
      offset[bsf(a)]++;

  scope uint[] work = new uint[len];

  index[0] = &*work;
  foreach(i;0..31)
    index[i+1] = index[i] + offset[i];
  index[32] = &*arr;

  foreach(ref a;arr)
    if(a)
      *index[bsf(a)]++ = a;
    else
      *index[32]++ = a;

  uint mask = uint.max << 1;
  auto w = &*work;
  foreach(o;offset[0..31]){
    for(;o;o--, w++){
      auto t = *w & mask;
      if(t)
        *index[bsf(t)]++ = *w;
      else
        *index[32]++ = *w;
    }
    mask += mask;
  }
  for(auto o=offset[31];o;o--, w++)
    *index[32]++ = *w;
  return arr;
}

最後の部分、

  auto o=offset[31];
  index[32][0..o] = w[0..o];

に変更可能だけど、遅くなるのは何故かなぁ。

std.c.stdlib.mallocを使ってみた。ようは初期化省略。

import std.stdio, std.intrinsic, std.c.stdlib;

uint[] sort(uint[] arr){
  size_t[32] offset;
  uint*[33] index = void;
  size_t len;

  foreach(a;arr)
    for(; a; a&=a-1, len++)
      offset[bsf(a)]++;

//  scope uint[] work = new uint[len];
  scope uint* work = cast(uint*)malloc(uint.sizeof * len);
  index[0] = work;
//  index[0] = &*work;
  foreach(i;0..31)
    index[i+1] = index[i] + offset[i];
  index[32] = &*arr;

  foreach(ref a;arr)
    if(a)
      *index[bsf(a)]++ = a;
    else
      *index[32]++ = a;

  uint mask = uint.max << 1;
//  auto w = &*work;
  auto w = work;
  foreach(o;offset[0..31]){
    for(;o;o--, w++){
      auto t = *w & mask;
      if(t)
        *index[bsf(t)]++ = *w;
      else
        *index[32]++ = *w;
    }
    mask += mask;
  }
//  auto o=offset[31];
//  index[32][0..o] = w[0..o];
  for(auto o=offset[31];o;o--, w++)
    *index[32]++ = *w;

  free(work);
  return arr;
}