ntzの別の形 - 謎's キッチン

作って使ってみたもののmysortではテーブル引きより遅くなるなぁ(実際のコードはこのとおりでは無いから単純比較はできないけど)。
!!ってどうなってるのか気になったのでアセンブラも調べてみた。

uint ntz(uint x){
  if(!x)
    return 32;

  x = x&(-x);

  uint rv;

  rv = !!(x>>16) << 4;//16
  rv += !!(x>>(8+rv)) << 3;//8
  rv += !!(x>>(4+rv)) << 2;//4
  rv += !!(x>>(2+rv)) << 1;//2
  rv += !!(x>>(1+rv));//1
  return rv;
}

アセンブラはこんな感じ。

   0:	55                   	push   %ebp
   1:	8b ec                	mov    %esp,%ebp
   3:	53                   	push   %ebx
   4:	56                   	push   %esi
   5:	57                   	push   %edi
   6:	89 c3                	mov    %eax,%ebx
   8:	85 db                	test   %ebx,%ebx
   a:	75 0a                	jne    16 <_D7mysort314__T3ntzVb1Vi0Z3ntzFkZk+0x16>
   c:	b8 20 00 00 00       	mov    $0x20,%eax
  11:	5f                   	pop    %edi
  12:	5e                   	pop    %esi
  13:	5b                   	pop    %ebx
  14:	5d                   	pop    %ebp
  15:	c3                   	ret    
  16:	f7 d8                	neg    %eax
  18:	21 c3                	and    %eax,%ebx
  1a:	8b f3                	mov    %ebx,%esi
  1c:	89 f2                	mov    %esi,%edx
  1e:	c1 ea 10             	shr    $0x10,%edx
  21:	f7 da                	neg    %edx
  23:	19 d2                	sbb    %edx,%edx
  25:	f7 da                	neg    %edx
  27:	c1 e2 04             	shl    $0x4,%edx
  2a:	8d 4a 08             	lea    0x8(%edx),%ecx
  2d:	d3 ee                	shr    %cl,%esi
  2f:	f7 de                	neg    %esi
  31:	19 f6                	sbb    %esi,%esi
  33:	f7 de                	neg    %esi
  35:	c1 e6 03             	shl    $0x3,%esi
  38:	01 f2                	add    %esi,%edx
  3a:	8b fb                	mov    %ebx,%edi
  3c:	8d 4a 04             	lea    0x4(%edx),%ecx
  3f:	d3 ef                	shr    %cl,%edi
  41:	f7 df                	neg    %edi
  43:	19 ff                	sbb    %edi,%edi
  45:	f7 df                	neg    %edi
  47:	c1 e7 02             	shl    $0x2,%edi
  4a:	01 fa                	add    %edi,%edx
  4c:	8b c3                	mov    %ebx,%eax
  4e:	8d 4a 02             	lea    0x2(%edx),%ecx
  51:	d3 e8                	shr    %cl,%eax
  53:	f7 d8                	neg    %eax
  55:	19 c0                	sbb    %eax,%eax
  57:	f7 d8                	neg    %eax
  59:	01 c0                	add    %eax,%eax
  5b:	01 c2                	add    %eax,%edx
  5d:	8b f3                	mov    %ebx,%esi
  5f:	8d 4a 01             	lea    0x1(%edx),%ecx
  62:	d3 ee                	shr    %cl,%esi
  64:	f7 de                	neg    %esi
  66:	19 f6                	sbb    %esi,%esi
  68:	f7 de                	neg    %esi
  6a:	01 f2                	add    %esi,%edx
  6c:	8b c2                	mov    %edx,%eax
  6e:	5f                   	pop    %edi
  6f:	5e                   	pop    %esi
  70:	5b                   	pop    %ebx
  71:	5d                   	pop    %ebp
  72:	c3                   	ret    
  73:	90                   	nop

えーと…、neg sbb negがどうなってるんだ、これ。
ついでにC版。

unsigned int ntz(unsigned int x){
  unsigned int rv;

  if(!x)
    return 32;

  x = x&(-x);

  rv = !!(x>>16) << 4;//16
  rv += !!(x>>(8+rv)) << 3;//8
  rv += !!(x>>(4+rv)) << 2;//4
  rv += !!(x>>(2+rv)) << 1;//2
  rv += !!(x>>(1+rv));//1
  return rv;
}

gcc 4.3で-O3 -fomit-frame-pointerなアセンブラ。

   0:	83 ec 24             	sub    $0x24,%esp
   3:	83 7c 24 28 00       	cmpl   $0x0,0x28(%esp)
   8:	75 0d                	jne    17 <ntz+0x17>
   a:	b8 20 00 00 00       	mov    $0x20,%eax
   f:	89 04 24             	mov    %eax,(%esp)
  12:	e9 d3 00 00 00       	jmp    ea <ntz+0xea>
  17:	8b 44 24 28          	mov    0x28(%esp),%eax
  1b:	f7 d8                	neg    %eax
  1d:	21 44 24 28          	and    %eax,0x28(%esp)
  21:	8b 44 24 28          	mov    0x28(%esp),%eax
  25:	c1 e8 10             	shr    $0x10,%eax
  28:	85 c0                	test   %eax,%eax
  2a:	74 0a                	je     36 <ntz+0x36>
  2c:	c7 44 24 04 10 00 00 	movl   $0x10,0x4(%esp)
  33:	00 
  34:	eb 08                	jmp    3e <ntz+0x3e>
  36:	c7 44 24 04 00 00 00 	movl   $0x0,0x4(%esp)
  3d:	00 
  3e:	8b 44 24 04          	mov    0x4(%esp),%eax
  42:	89 44 24 20          	mov    %eax,0x20(%esp)
  46:	8b 44 24 20          	mov    0x20(%esp),%eax
  4a:	83 c0 08             	add    $0x8,%eax
  4d:	89 c1                	mov    %eax,%ecx
  4f:	8b 44 24 28          	mov    0x28(%esp),%eax
  53:	d3 e8                	shr    %cl,%eax
  55:	85 c0                	test   %eax,%eax
  57:	74 0a                	je     63 <ntz+0x63>
  59:	c7 44 24 08 08 00 00 	movl   $0x8,0x8(%esp)
  60:	00 
  61:	eb 08                	jmp    6b <ntz+0x6b>
  63:	c7 44 24 08 00 00 00 	movl   $0x0,0x8(%esp)
  6a:	00 
  6b:	8b 44 24 08          	mov    0x8(%esp),%eax
  6f:	01 44 24 20          	add    %eax,0x20(%esp)
  73:	8b 44 24 20          	mov    0x20(%esp),%eax
  77:	83 c0 04             	add    $0x4,%eax
  7a:	89 c1                	mov    %eax,%ecx
  7c:	8b 44 24 28          	mov    0x28(%esp),%eax
  80:	d3 e8                	shr    %cl,%eax
  82:	85 c0                	test   %eax,%eax
  84:	74 0a                	je     90 <ntz+0x90>
  86:	c7 44 24 0c 04 00 00 	movl   $0x4,0xc(%esp)
  8d:	00 
  8e:	eb 08                	jmp    98 <ntz+0x98>
  90:	c7 44 24 0c 00 00 00 	movl   $0x0,0xc(%esp)
  97:	00 
  98:	8b 44 24 0c          	mov    0xc(%esp),%eax
  9c:	01 44 24 20          	add    %eax,0x20(%esp)
  a0:	8b 44 24 20          	mov    0x20(%esp),%eax
  a4:	83 c0 02             	add    $0x2,%eax
  a7:	89 c1                	mov    %eax,%ecx
  a9:	8b 44 24 28          	mov    0x28(%esp),%eax
  ad:	d3 e8                	shr    %cl,%eax
  af:	85 c0                	test   %eax,%eax
  b1:	74 0a                	je     bd <ntz+0xbd>
  b3:	c7 44 24 10 02 00 00 	movl   $0x2,0x10(%esp)
  ba:	00 
  bb:	eb 08                	jmp    c5 <ntz+0xc5>
  bd:	c7 44 24 10 00 00 00 	movl   $0x0,0x10(%esp)
  c4:	00 
  c5:	8b 44 24 10          	mov    0x10(%esp),%eax
  c9:	01 44 24 20          	add    %eax,0x20(%esp)
  cd:	8b 44 24 20          	mov    0x20(%esp),%eax
  d1:	83 c0 01             	add    $0x1,%eax
  d4:	89 c1                	mov    %eax,%ecx
  d6:	8b 44 24 28          	mov    0x28(%esp),%eax
  da:	d3 e8                	shr    %cl,%eax
  dc:	85 c0                	test   %eax,%eax
  de:	0f 95 c0             	setne  %al
  e1:	0f b6 c0             	movzbl %al,%eax
  e4:	01 44 24 20          	add    %eax,0x20(%esp)
  e8:	eb 03                	jmp    ed <ntz+0xed>
  ea:	8b 04 24             	mov    (%esp),%eax
  ed:	83 c4 24             	add    $0x24,%esp
  f0:	c3                   	ret

ジャンプだらけな件。i686向けになるけど?:使ってcmovがいいのかなぁ。
iccでテストしたくなってきた件。

!!が遅いってことで無くしてみた。

  if(!x)
    return 32;

  x = (~x&(x-1));

  uint rv;

  rv =  (x&0x8000)>>11;//16
  rv |= (x>>(rv+4))&(0x80>>4);//8
  rv |= (x>>(rv+1))&(0x8>>1);//4
  rv |= (x>>rv)&0x2;//2
  rv |= (x>>rv)/*&0x1*/;//1

  return rv;

なるほどな速度。下位のテーブル化もできそう。