lib/Target/X86/README.txt

   1 //===---------------------------------------------------------------------===//
   2 // Random ideas for the X86 backend.
   3 //===---------------------------------------------------------------------===//
   4
   5
   6 //===---------------------------------------------------------------------===//
   7
   8 CodeGen/X86/lea-3.ll:test3 should be a single LEA, not a shift/move.  The X86
   9 backend knows how to three-addressify this shift, but it appears the register
  10 allocator isn't even asking it to do so in this case.  We should investigate
  11 why this isn't happening, it could have significant impact on other important
  12 cases for X86 as well.
  13
  14 //===---------------------------------------------------------------------===//
  15
  16 This should be one DIV/IDIV instruction, not a libcall:
  17
  18 unsigned test(unsigned long long X, unsigned Y) {
  19         return X/Y;
  20 }
  21
  22 This can be done trivially with a custom legalizer.  What about overflow
  23 though?  http://gcc.gnu.org/bugzilla/show_bug.cgi?id=14224
  24
  25 //===---------------------------------------------------------------------===//
  26
  27 Improvements to the multiply -> shift/add algorithm:
  28 http://gcc.gnu.org/ml/gcc-patches/2004-08/msg01590.html
  29
  30 //===---------------------------------------------------------------------===//
  31
  32 Improve code like this (occurs fairly frequently, e.g. in LLVM):
  33 long long foo(int x) { return 1LL << x; }
  34
  35 http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01109.html
  36 http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01128.html
  37 http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01136.html
  38
  39 Another useful one would be  ~0ULL >> X and ~0ULL << X.
  40
  41 One better solution for 1LL << x is:
  42         xorl    %eax, %eax
  43         xorl    %edx, %edx
  44         testb   $32, %cl
  45         sete    %al
  46         setne   %dl
  47         sall    %cl, %eax
  48         sall    %cl, %edx
  49
  50 But that requires good 8-bit subreg support.
  51
  52 Also, this might be better.  It's an extra shift, but it's one instruction
  53 shorter, and doesn't stress 8-bit subreg support.
  54 (From http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01148.html,
  55 but without the unnecessary and.)
  56         movl %ecx, %eax
  57         shrl $5, %eax
  58         movl %eax, %edx
  59         xorl $1, %edx
  60         sall %cl, %eax
  61         sall %cl. %edx
  62
  63 64-bit shifts (in general) expand to really bad code.  Instead of using
  64 cmovs, we should expand to a conditional branch like GCC produces.
  65
  66 //===---------------------------------------------------------------------===//
  67
  68 Compile this:
  69 _Bool f(_Bool a) { return a!=1; }
  70
  71 into:
  72         movzbl  %dil, %eax
  73         xorl    $1, %eax
  74         ret
  75
  76 (Although note that this isn't a legal way to express the code that llvm-gcc
  77 currently generates for that function.)
  78
  79 //===---------------------------------------------------------------------===//
  80
  81 Some isel ideas:
  82
  83 1. Dynamic programming based approach when compile time if not an
  84    issue.
  85 2. Code duplication (addressing mode) during isel.
  86 3. Other ideas from "Register-Sensitive Selection, Duplication, and
  87    Sequencing of Instructions".
  88 4. Scheduling for reduced register pressure.  E.g. "Minimum Register
  89    Instruction Sequence Problem: Revisiting Optimal Code Generation for DAGs"
  90    and other related papers.
  91    http://citeseer.ist.psu.edu/govindarajan01minimum.html
  92
  93 //===---------------------------------------------------------------------===//
  94
  95 Should we promote i16 to i32 to avoid partial register update stalls?
  96
  97 //===---------------------------------------------------------------------===//
  98
  99 Leave any_extend as pseudo instruction and hint to register
 100 allocator. Delay codegen until post register allocation.
 101 Note. any_extend is now turned into an INSERT_SUBREG. We still need to teach
 102 the coalescer how to deal with it though.
 103
 104 //===---------------------------------------------------------------------===//
 105
 106 It appears icc use push for parameter passing. Need to investigate.
 107
 108 //===---------------------------------------------------------------------===//
 109
 110 Only use inc/neg/not instructions on processors where they are faster than
 111 add/sub/xor.  They are slower on the P4 due to only updating some processor
 112 flags.
 113
 114 //===---------------------------------------------------------------------===//
 115
 116 The instruction selector sometimes misses folding a load into a compare.  The
 117 pattern is written as (cmp reg, (load p)).  Because the compare isn't
 118 commutative, it is not matched with the load on both sides.  The dag combiner
 119 should be made smart enough to cannonicalize the load into the RHS of a compare
 120 when it can invert the result of the compare for free.
 121
 122 //===---------------------------------------------------------------------===//
 123
 124 How about intrinsics? An example is:
 125   *res = _mm_mulhi_epu16(*A, _mm_mul_epu32(*B, *C));
 126
 127 compiles to
 128         pmuludq (%eax), %xmm0
 129         movl 8(%esp), %eax
 130         movdqa (%eax), %xmm1
 131         pmulhuw %xmm0, %xmm1
 132
 133 The transformation probably requires a X86 specific pass or a DAG combiner
 134 target specific hook.
 135
 136 //===---------------------------------------------------------------------===//
 137
 138 In many cases, LLVM generates code like this:
 139
 140 _test:
 141         movl 8(%esp), %eax
 142         cmpl %eax, 4(%esp)
 143         setl %al
 144         movzbl %al, %eax
 145         ret
 146
 147 on some processors (which ones?), it is more efficient to do this:
 148
 149 _test:
 150         movl 8(%esp), %ebx
 151         xor  %eax, %eax
 152         cmpl %ebx, 4(%esp)
 153         setl %al
 154         ret
 155
 156 Doing this correctly is tricky though, as the xor clobbers the flags.
 157
 158 //===---------------------------------------------------------------------===//
 159
 160 We should generate bts/btr/etc instructions on targets where they are cheap or
 161 when codesize is important.  e.g., for:
 162
 163 void setbit(int *target, int bit) {
 164     *target |= (1 << bit);
 165 }
 166 void clearbit(int *target, int bit) {
 167     *target &= ~(1 << bit);
 168 }
 169
 170 //===---------------------------------------------------------------------===//
 171
 172 Instead of the following for memset char*, 1, 10:
 173
 174         movl $16843009, 4(%edx)
 175         movl $16843009, (%edx)
 176         movw $257, 8(%edx)
 177
 178 It might be better to generate
 179
 180         movl $16843009, %eax
 181         movl %eax, 4(%edx)
 182         movl %eax, (%edx)
 183         movw al, 8(%edx)
 184
 185 when we can spare a register. It reduces code size.
 186
 187 //===---------------------------------------------------------------------===//
 188
 189 Evaluate what the best way to codegen sdiv X, (2^C) is.  For X/8, we currently
 190 get this:
 191
 192 define i32 @test1(i32 %X) {
 193     %Y = sdiv i32 %X, 8
 194     ret i32 %Y
 195 }
 196
 197 _test1:
 198         movl 4(%esp), %eax
 199         movl %eax, %ecx
 200         sarl $31, %ecx
 201         shrl $29, %ecx
 202         addl %ecx, %eax
 203         sarl $3, %eax
 204         ret
 205
 206 GCC knows several different ways to codegen it, one of which is this:
 207
 208 _test1:
 209         movl    4(%esp), %eax
 210         cmpl    $-1, %eax
 211         leal    7(%eax), %ecx
 212         cmovle  %ecx, %eax
 213         sarl    $3, %eax
 214         ret
 215
 216 which is probably slower, but it's interesting at least :)
 217
 218 //===---------------------------------------------------------------------===//
 219
 220 We are currently lowering large (1MB+) memmove/memcpy to rep/stosl and rep/movsl
 221 We should leave these as libcalls for everything over a much lower threshold,
 222 since libc is hand tuned for medium and large mem ops (avoiding RFO for large
 223 stores, TLB preheating, etc)
 224
 225 //===---------------------------------------------------------------------===//
 226
 227 Optimize this into something reasonable:
 228  x * copysign(1.0, y) * copysign(1.0, z)
 229
 230 //===---------------------------------------------------------------------===//
 231
 232 Optimize copysign(x, *y) to use an integer load from y.
 233
 234 //===---------------------------------------------------------------------===//
 235
 236 %X = weak global int 0
 237
 238 void %foo(int %N) {
 239         %N = cast int %N to uint
 240         %tmp.24 = setgt int %N, 0
 241         br bool %tmp.24, label %no_exit, label %return
 242
 243 no_exit:
 244         %indvar = phi uint [ 0, %entry ], [ %indvar.next, %no_exit ]
 245         %i.0.0 = cast uint %indvar to int
 246         volatile store int %i.0.0, int* %X
 247         %indvar.next = add uint %indvar, 1
 248         %exitcond = seteq uint %indvar.next, %N
 249         br bool %exitcond, label %return, label %no_exit
 250
 251 return:
 252         ret void
 253 }
 254
 255 compiles into:
 256
 257         .text
 258         .align  4
 259         .globl  _foo
 260 _foo:
 261         movl 4(%esp), %eax
 262         cmpl $1, %eax
 263         jl LBB_foo_4    # return
 264 LBB_foo_1:      # no_exit.preheader
 265         xorl %ecx, %ecx
 266 LBB_foo_2:      # no_exit
 267         movl L_X$non_lazy_ptr, %edx
 268         movl %ecx, (%edx)
 269         incl %ecx
 270         cmpl %eax, %ecx
 271         jne LBB_foo_2   # no_exit
 272 LBB_foo_3:      # return.loopexit
 273 LBB_foo_4:      # return
 274         ret
 275
 276 We should hoist "movl L_X$non_lazy_ptr, %edx" out of the loop after
 277 remateralization is implemented. This can be accomplished with 1) a target
 278 dependent LICM pass or 2) makeing SelectDAG represent the whole function.
 279
 280 //===---------------------------------------------------------------------===//
 281
 282 The following tests perform worse with LSR:
 283
 284 lambda, siod, optimizer-eval, ackermann, hash2, nestedloop, strcat, and Treesor.
 285
 286 //===---------------------------------------------------------------------===//
 287
 288 We are generating far worse code than gcc:
 289
 290 volatile short X, Y;
 291
 292 void foo(int N) {
 293   int i;
 294   for (i = 0; i < N; i++) { X = i; Y = i*4; }
 295 }
 296
 297 LBB1_1: # entry.bb_crit_edge
 298         xorl    %ecx, %ecx
 299         xorw    %dx, %dx
 300 LBB1_2: # bb
 301         movl    L_X$non_lazy_ptr, %esi
 302         movw    %cx, (%esi)
 303         movl    L_Y$non_lazy_ptr, %esi
 304         movw    %dx, (%esi)
 305         addw    $4, %dx
 306         incl    %ecx
 307         cmpl    %eax, %ecx
 308         jne     LBB1_2  # bb
 309
 310 vs.
 311
 312         xorl    %edx, %edx
 313         movl    L_X$non_lazy_ptr-"L00000000001$pb"(%ebx), %esi
 314         movl    L_Y$non_lazy_ptr-"L00000000001$pb"(%ebx), %ecx
 315 L4:
 316         movw    %dx, (%esi)
 317         leal    0(,%edx,4), %eax
 318         movw    %ax, (%ecx)
 319         addl    $1, %edx
 320         cmpl    %edx, %edi
 321         jne     L4
 322
 323 This is due to the lack of post regalloc LICM.
 324
 325 //===---------------------------------------------------------------------===//
 326
 327 Teach the coalescer to coalesce vregs of different register classes. e.g. FR32 /
 328 FR64 to VR128.
 329
 330 //===---------------------------------------------------------------------===//
 331
 332 Adding to the list of cmp / test poor codegen issues:
 333
 334 int test(__m128 *A, __m128 *B) {
 335   if (_mm_comige_ss(*A, *B))
 336     return 3;
 337   else
 338     return 4;
 339 }
 340
 341 _test:
 342         movl 8(%esp), %eax
 343         movaps (%eax), %xmm0
 344         movl 4(%esp), %eax
 345         movaps (%eax), %xmm1
 346         comiss %xmm0, %xmm1
 347         setae %al
 348         movzbl %al, %ecx
 349         movl $3, %eax
 350         movl $4, %edx
 351         cmpl $0, %ecx
 352         cmove %edx, %eax
 353         ret
 354
 355 Note the setae, movzbl, cmpl, cmove can be replaced with a single cmovae. There
 356 are a number of issues. 1) We are introducing a setcc between the result of the
 357 intrisic call and select. 2) The intrinsic is expected to produce a i32 value
 358 so a any extend (which becomes a zero extend) is added.
 359
 360 We probably need some kind of target DAG combine hook to fix this.
 361
 362 //===---------------------------------------------------------------------===//
 363
 364 We generate significantly worse code for this than GCC:
 365 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=21150
 366 http://gcc.gnu.org/bugzilla/attachment.cgi?id=8701
 367
 368 There is also one case we do worse on PPC.
 369
 370 //===---------------------------------------------------------------------===//
 371
 372 For this:
 373
 374 int test(int a)
 375 {
 376   return a * 3;
 377 }
 378
 379 We currently emits
 380         imull $3, 4(%esp), %eax
 381
 382 Perhaps this is what we really should generate is? Is imull three or four
 383 cycles? Note: ICC generates this:
 384         movl    4(%esp), %eax
 385         leal    (%eax,%eax,2), %eax
 386
 387 The current instruction priority is based on pattern complexity. The former is
 388 more "complex" because it folds a load so the latter will not be emitted.
 389
 390 Perhaps we should use AddedComplexity to give LEA32r a higher priority? We
 391 should always try to match LEA first since the LEA matching code does some
 392 estimate to determine whether the match is profitable.
 393
 394 However, if we care more about code size, then imull is better. It's two bytes
 395 shorter than movl + leal.
 396
 397 On a Pentium M, both variants have the same characteristics with regard
 398 to throughput; however, the multiplication has a latency of four cycles, as
 399 opposed to two cycles for the movl+lea variant.
 400
 401 //===---------------------------------------------------------------------===//
 402
 403 __builtin_ffs codegen is messy.
 404
 405 int ffs_(unsigned X) { return __builtin_ffs(X); }
 406
 407 llvm produces:
 408 ffs_:
 409         movl    4(%esp), %ecx
 410         bsfl    %ecx, %eax
 411         movl    $32, %edx
 412         cmove   %edx, %eax
 413         incl    %eax
 414         xorl    %edx, %edx
 415         testl   %ecx, %ecx
 416         cmove   %edx, %eax
 417         ret
 418
 419 vs gcc:
 420
 421 _ffs_:
 422         movl    $-1, %edx
 423         bsfl    4(%esp), %eax
 424         cmove   %edx, %eax
 425         addl    $1, %eax
 426         ret
 427
 428 Another example of __builtin_ffs (use predsimplify to eliminate a select):
 429
 430 int foo (unsigned long j) {
 431   if (j)
 432     return __builtin_ffs (j) - 1;
 433   else
 434     return 0;
 435 }
 436
 437 //===---------------------------------------------------------------------===//
 438
 439 It appears gcc place string data with linkonce linkage in
 440 .section __TEXT,__const_coal,coalesced instead of
 441 .section __DATA,__const_coal,coalesced.
 442 Take a look at darwin.h, there are other Darwin assembler directives that we
 443 do not make use of.
 444
 445 //===---------------------------------------------------------------------===//
 446
 447 define i32 @foo(i32* %a, i32 %t) {
 448 entry:
 449         br label %cond_true
 450
 451 cond_true:              ; preds = %cond_true, %entry
 452         %x.0.0 = phi i32 [ 0, %entry ], [ %tmp9, %cond_true ]           ; <i32> [#uses=3]
 453         %t_addr.0.0 = phi i32 [ %t, %entry ], [ %tmp7, %cond_true ]             ; <i32> [#uses=1]
 454         %tmp2 = getelementptr i32* %a, i32 %x.0.0               ; <i32*> [#uses=1]
 455         %tmp3 = load i32* %tmp2         ; <i32> [#uses=1]
 456         %tmp5 = add i32 %t_addr.0.0, %x.0.0             ; <i32> [#uses=1]
 457         %tmp7 = add i32 %tmp5, %tmp3            ; <i32> [#uses=2]
 458         %tmp9 = add i32 %x.0.0, 1               ; <i32> [#uses=2]
 459         %tmp = icmp sgt i32 %tmp9, 39           ; <i1> [#uses=1]
 460         br i1 %tmp, label %bb12, label %cond_true
 461
 462 bb12:           ; preds = %cond_true
 463         ret i32 %tmp7
 464 }
 465 is pessimized by -loop-reduce and -indvars
 466
 467 //===---------------------------------------------------------------------===//
 468
 469 u32 to float conversion improvement:
 470
 471 float uint32_2_float( unsigned u ) {
 472   float fl = (int) (u & 0xffff);
 473   float fh = (int) (u >> 16);
 474   fh *= 0x1.0p16f;
 475   return fh + fl;
 476 }
 477
 478 00000000        subl    $0x04,%esp
 479 00000003        movl    0x08(%esp,1),%eax
 480 00000007        movl    %eax,%ecx
 481 00000009        shrl    $0x10,%ecx
 482 0000000c        cvtsi2ss        %ecx,%xmm0
 483 00000010        andl    $0x0000ffff,%eax
 484 00000015        cvtsi2ss        %eax,%xmm1
 485 00000019        mulss   0x00000078,%xmm0
 486 00000021        addss   %xmm1,%xmm0
 487 00000025        movss   %xmm0,(%esp,1)
 488 0000002a        flds    (%esp,1)
 489 0000002d        addl    $0x04,%esp
 490 00000030        ret
 491
 492 //===---------------------------------------------------------------------===//
 493
 494 When using fastcc abi, align stack slot of argument of type double on 8 byte
 495 boundary to improve performance.
 496
 497 //===---------------------------------------------------------------------===//
 498
 499 Codegen:
 500
 501 int f(int a, int b) {
 502   if (a == 4 || a == 6)
 503     b++;
 504   return b;
 505 }
 506
 507
 508 as:
 509
 510 or eax, 2
 511 cmp eax, 6
 512 jz label
 513
 514 //===---------------------------------------------------------------------===//
 515
 516 GCC's ix86_expand_int_movcc function (in i386.c) has a ton of interesting
 517 simplifications for integer "x cmp y ? a : b".  For example, instead of:
 518
 519 int G;
 520 void f(int X, int Y) {
 521   G = X < 0 ? 14 : 13;
 522 }
 523
 524 compiling to:
 525
 526 _f:
 527         movl $14, %eax
 528         movl $13, %ecx
 529         movl 4(%esp), %edx
 530         testl %edx, %edx
 531         cmovl %eax, %ecx
 532         movl %ecx, _G
 533         ret
 534
 535 it could be:
 536 _f:
 537         movl    4(%esp), %eax
 538         sarl    $31, %eax
 539         notl    %eax
 540         addl    $14, %eax
 541         movl    %eax, _G
 542         ret
 543
 544 etc.
 545
 546 Another is:
 547 int usesbb(unsigned int a, unsigned int b) {
 548        return (a < b ? -1 : 0);
 549 }
 550 to:
 551 _usesbb:
 552         movl    8(%esp), %eax
 553         cmpl    %eax, 4(%esp)
 554         sbbl    %eax, %eax
 555         ret
 556
 557 instead of:
 558 _usesbb:
 559         xorl    %eax, %eax
 560         movl    8(%esp), %ecx
 561         cmpl    %ecx, 4(%esp)
 562         movl    $4294967295, %ecx
 563         cmovb   %ecx, %eax
 564         ret
 565
 566 //===---------------------------------------------------------------------===//
 567
 568 Currently we don't have elimination of redundant stack manipulations. Consider
 569 the code:
 570
 571 int %main() {
 572 entry:
 573         call fastcc void %test1( )
 574         call fastcc void %test2( sbyte* cast (void ()* %test1 to sbyte*) )
 575         ret int 0
 576 }
 577
 578 declare fastcc void %test1()
 579
 580 declare fastcc void %test2(sbyte*)
 581
 582
 583 This currently compiles to:
 584
 585         subl $16, %esp
 586         call _test5
 587         addl $12, %esp
 588         subl $16, %esp
 589         movl $_test5, (%esp)
 590         call _test6
 591         addl $12, %esp
 592
 593 The add\sub pair is really unneeded here.
 594
 595 //===---------------------------------------------------------------------===//
 596
 597 Consider the expansion of:
 598
 599 define i32 @test3(i32 %X) {
 600         %tmp1 = urem i32 %X, 255
 601         ret i32 %tmp1
 602 }
 603
 604 Currently it compiles to:
 605
 606 ...
 607         movl $2155905153, %ecx
 608         movl 8(%esp), %esi
 609         movl %esi, %eax
 610         mull %ecx
 611 ...
 612
 613 This could be "reassociated" into:
 614
 615         movl $2155905153, %eax
 616         movl 8(%esp), %ecx
 617         mull %ecx
 618
 619 to avoid the copy.  In fact, the existing two-address stuff would do this
 620 except that mul isn't a commutative 2-addr instruction.  I guess this has
 621 to be done at isel time based on the #uses to mul?
 622
 623 //===---------------------------------------------------------------------===//
 624
 625 Make sure the instruction which starts a loop does not cross a cacheline
 626 boundary. This requires knowning the exact length of each machine instruction.
 627 That is somewhat complicated, but doable. Example 256.bzip2:
 628
 629 In the new trace, the hot loop has an instruction which crosses a cacheline
 630 boundary.  In addition to potential cache misses, this can't help decoding as I
 631 imagine there has to be some kind of complicated decoder reset and realignment
 632 to grab the bytes from the next cacheline.
 633
 634 532  532 0x3cfc movb     (1809(%esp, %esi), %bl   <<<--- spans 2 64 byte lines
 635 942  942 0x3d03 movl     %dh, (1809(%esp, %esi)
 636 937  937 0x3d0a incl     %esi
 637 3    3   0x3d0b cmpb     %bl, %dl
 638 27   27  0x3d0d jnz      0x000062db <main+11707>
 639
 640 //===---------------------------------------------------------------------===//
 641
 642 In c99 mode, the preprocessor doesn't like assembly comments like #TRUNCATE.
 643
 644 //===---------------------------------------------------------------------===//
 645
 646 This could be a single 16-bit load.
 647
 648 int f(char *p) {
 649     if ((p[0] == 1) & (p[1] == 2)) return 1;
 650     return 0;
 651 }
 652
 653 //===---------------------------------------------------------------------===//
 654
 655 We should inline lrintf and probably other libc functions.
 656
 657 //===---------------------------------------------------------------------===//
 658
 659 Start using the flags more.  For example, compile:
 660
 661 int add_zf(int *x, int y, int a, int b) {
 662      if ((*x += y) == 0)
 663           return a;
 664      else
 665           return b;
 666 }
 667
 668 to:
 669        addl    %esi, (%rdi)
 670        movl    %edx, %eax
 671        cmovne  %ecx, %eax
 672        ret
 673 instead of:
 674
 675 _add_zf:
 676         addl (%rdi), %esi
 677         movl %esi, (%rdi)
 678         testl %esi, %esi
 679         cmove %edx, %ecx
 680         movl %ecx, %eax
 681         ret
 682
 683 and:
 684
 685 int add_zf(int *x, int y, int a, int b) {
 686      if ((*x + y) < 0)
 687           return a;
 688      else
 689           return b;
 690 }
 691
 692 to:
 693
 694 add_zf:
 695         addl    (%rdi), %esi
 696         movl    %edx, %eax
 697         cmovns  %ecx, %eax
 698         ret
 699
 700 instead of:
 701
 702 _add_zf:
 703         addl (%rdi), %esi
 704         testl %esi, %esi
 705         cmovs %edx, %ecx
 706         movl %ecx, %eax
 707         ret
 708
 709 //===---------------------------------------------------------------------===//
 710
 711 These two functions have identical effects:
 712
 713 unsigned int f(unsigned int i, unsigned int n) {++i; if (i == n) ++i; return i;}
 714 unsigned int f2(unsigned int i, unsigned int n) {++i; i += i == n; return i;}
 715
 716 We currently compile them to:
 717
 718 _f:
 719         movl 4(%esp), %eax
 720         movl %eax, %ecx
 721         incl %ecx
 722         movl 8(%esp), %edx
 723         cmpl %edx, %ecx
 724         jne LBB1_2      #UnifiedReturnBlock
 725 LBB1_1: #cond_true
 726         addl $2, %eax
 727         ret
 728 LBB1_2: #UnifiedReturnBlock
 729         movl %ecx, %eax
 730         ret
 731 _f2:
 732         movl 4(%esp), %eax
 733         movl %eax, %ecx
 734         incl %ecx
 735         cmpl 8(%esp), %ecx
 736         sete %cl
 737         movzbl %cl, %ecx
 738         leal 1(%ecx,%eax), %eax
 739         ret
 740
 741 both of which are inferior to GCC's:
 742
 743 _f:
 744         movl    4(%esp), %edx
 745         leal    1(%edx), %eax
 746         addl    $2, %edx
 747         cmpl    8(%esp), %eax
 748         cmove   %edx, %eax
 749         ret
 750 _f2:
 751         movl    4(%esp), %eax
 752         addl    $1, %eax
 753         xorl    %edx, %edx
 754         cmpl    8(%esp), %eax
 755         sete    %dl
 756         addl    %edx, %eax
 757         ret
 758
 759 //===---------------------------------------------------------------------===//
 760
 761 This code:
 762
 763 void test(int X) {
 764   if (X) abort();
 765 }
 766
 767 is currently compiled to:
 768
 769 _test:
 770         subl $12, %esp
 771         cmpl $0, 16(%esp)
 772         jne LBB1_1
 773         addl $12, %esp
 774         ret
 775 LBB1_1:
 776         call L_abort$stub
 777
 778 It would be better to produce:
 779
 780 _test:
 781         subl $12, %esp
 782         cmpl $0, 16(%esp)
 783         jne L_abort$stub
 784         addl $12, %esp
 785         ret
 786
 787 This can be applied to any no-return function call that takes no arguments etc.
 788 Alternatively, the stack save/restore logic could be shrink-wrapped, producing
 789 something like this:
 790
 791 _test:
 792         cmpl $0, 4(%esp)
 793         jne LBB1_1
 794         ret
 795 LBB1_1:
 796         subl $12, %esp
 797         call L_abort$stub
 798
 799 Both are useful in different situations.  Finally, it could be shrink-wrapped
 800 and tail called, like this:
 801
 802 _test:
 803         cmpl $0, 4(%esp)
 804         jne LBB1_1
 805         ret
 806 LBB1_1:
 807         pop %eax   # realign stack.
 808         call L_abort$stub
 809
 810 Though this probably isn't worth it.
 811
 812 //===---------------------------------------------------------------------===//
 813
 814 We need to teach the codegen to convert two-address INC instructions to LEA
 815 when the flags are dead (likewise dec).  For example, on X86-64, compile:
 816
 817 int foo(int A, int B) {
 818   return A+1;
 819 }
 820
 821 to:
 822
 823 _foo:
 824         leal    1(%edi), %eax
 825         ret
 826
 827 instead of:
 828
 829 _foo:
 830         incl %edi
 831         movl %edi, %eax
 832         ret
 833
 834 Another example is:
 835
 836 ;; X's live range extends beyond the shift, so the register allocator
 837 ;; cannot coalesce it with Y.  Because of this, a copy needs to be
 838 ;; emitted before the shift to save the register value before it is
 839 ;; clobbered.  However, this copy is not needed if the register
 840 ;; allocator turns the shift into an LEA.  This also occurs for ADD.
 841
 842 ; Check that the shift gets turned into an LEA.
 843 ; RUN: llvm-as < %s | llc -march=x86 -x86-asm-syntax=intel | \
 844 ; RUN:   not grep {mov E.X, E.X}
 845
 846 @G = external global i32                ; <i32*> [#uses=3]
 847
 848 define i32 @test1(i32 %X, i32 %Y) {
 849         %Z = add i32 %X, %Y             ; <i32> [#uses=1]
 850         volatile store i32 %Y, i32* @G
 851         volatile store i32 %Z, i32* @G
 852         ret i32 %X
 853 }
 854
 855 define i32 @test2(i32 %X) {
 856         %Z = add i32 %X, 1              ; <i32> [#uses=1]
 857         volatile store i32 %Z, i32* @G
 858         ret i32 %X
 859 }
 860
 861 //===---------------------------------------------------------------------===//
 862
 863 Sometimes it is better to codegen subtractions from a constant (e.g. 7-x) with
 864 a neg instead of a sub instruction.  Consider:
 865
 866 int test(char X) { return 7-X; }
 867
 868 we currently produce:
 869 _test:
 870         movl $7, %eax
 871         movsbl 4(%esp), %ecx
 872         subl %ecx, %eax
 873         ret
 874
 875 We would use one fewer register if codegen'd as:
 876
 877         movsbl 4(%esp), %eax
 878         neg %eax
 879         add $7, %eax
 880         ret
 881
 882 Note that this isn't beneficial if the load can be folded into the sub.  In
 883 this case, we want a sub:
 884
 885 int test(int X) { return 7-X; }
 886 _test:
 887         movl $7, %eax
 888         subl 4(%esp), %eax
 889         ret
 890
 891 //===---------------------------------------------------------------------===//
 892
 893 Leaf functions that require one 4-byte spill slot have a prolog like this:
 894
 895 _foo:
 896         pushl   %esi
 897         subl    $4, %esp
 898 ...
 899 and an epilog like this:
 900         addl    $4, %esp
 901         popl    %esi
 902         ret
 903
 904 It would be smaller, and potentially faster, to push eax on entry and to
 905 pop into a dummy register instead of using addl/subl of esp.  Just don't pop
 906 into any return registers :)
 907
 908 //===---------------------------------------------------------------------===//
 909
 910 The X86 backend should fold (branch (or (setcc, setcc))) into multiple
 911 branches.  We generate really poor code for:
 912
 913 double testf(double a) {
 914        return a == 0.0 ? 0.0 : (a > 0.0 ? 1.0 : -1.0);
 915 }
 916
 917 For example, the entry BB is:
 918
 919 _testf:
 920         subl    $20, %esp
 921         pxor    %xmm0, %xmm0
 922         movsd   24(%esp), %xmm1
 923         ucomisd %xmm0, %xmm1
 924         setnp   %al
 925         sete    %cl
 926         testb   %cl, %al
 927         jne     LBB1_5  # UnifiedReturnBlock
 928 LBB1_1: # cond_true
 929
 930
 931 it would be better to replace the last four instructions with:
 932
 933         jp LBB1_1
 934         je LBB1_5
 935 LBB1_1:
 936
 937 We also codegen the inner ?: into a diamond:
 938
 939        cvtss2sd        LCPI1_0(%rip), %xmm2
 940         cvtss2sd        LCPI1_1(%rip), %xmm3
 941         ucomisd %xmm1, %xmm0
 942         ja      LBB1_3  # cond_true
 943 LBB1_2: # cond_true
 944         movapd  %xmm3, %xmm2
 945 LBB1_3: # cond_true
 946         movapd  %xmm2, %xmm0
 947         ret
 948
 949 We should sink the load into xmm3 into the LBB1_2 block.  This should
 950 be pretty easy, and will nuke all the copies.
 951
 952 //===---------------------------------------------------------------------===//
 953
 954 This:
 955         #include <algorithm>
 956         inline std::pair<unsigned, bool> full_add(unsigned a, unsigned b)
 957         { return std::make_pair(a + b, a + b < a); }
 958         bool no_overflow(unsigned a, unsigned b)
 959         { return !full_add(a, b).second; }
 960
 961 Should compile to:
 962
 963
 964         _Z11no_overflowjj:
 965                 addl    %edi, %esi
 966                 setae   %al
 967                 ret
 968
 969 FIXME: That code looks wrong; bool return is normally defined as zext.
 970
 971 on x86-64, not:
 972
 973 __Z11no_overflowjj:
 974         addl    %edi, %esi
 975         cmpl    %edi, %esi
 976         setae   %al
 977         movzbl  %al, %eax
 978         ret
 979
 980
 981 //===---------------------------------------------------------------------===//
 982
 983 Re-materialize MOV32r0 etc. with xor instead of changing them to moves if the
 984 condition register is dead. xor reg reg is shorter than mov reg, #0.
 985
 986 //===---------------------------------------------------------------------===//
 987
 988 We aren't matching RMW instructions aggressively
 989 enough.  Here's a reduced testcase (more in PR1160):
 990
 991 define void @test(i32* %huge_ptr, i32* %target_ptr) {
 992         %A = load i32* %huge_ptr                ; <i32> [#uses=1]
 993         %B = load i32* %target_ptr              ; <i32> [#uses=1]
 994         %C = or i32 %A, %B              ; <i32> [#uses=1]
 995         store i32 %C, i32* %target_ptr
 996         ret void
 997 }
 998
 999 $ llvm-as < t.ll | llc -march=x86-64
1000
1001 _test:
1002         movl (%rdi), %eax
1003         orl (%rsi), %eax
1004         movl %eax, (%rsi)
1005         ret
1006
1007 That should be something like:
1008
1009 _test:
1010         movl (%rdi), %eax
1011         orl %eax, (%rsi)
1012         ret
1013
1014 //===---------------------------------------------------------------------===//
1015
1016 The following code:
1017
1018 bb114.preheader:                ; preds = %cond_next94
1019         %tmp231232 = sext i16 %tmp62 to i32             ; <i32> [#uses=1]
1020         %tmp233 = sub i32 32, %tmp231232                ; <i32> [#uses=1]
1021         %tmp245246 = sext i16 %tmp65 to i32             ; <i32> [#uses=1]
1022         %tmp252253 = sext i16 %tmp68 to i32             ; <i32> [#uses=1]
1023         %tmp254 = sub i32 32, %tmp252253                ; <i32> [#uses=1]
1024         %tmp553554 = bitcast i16* %tmp37 to i8*         ; <i8*> [#uses=2]
1025         %tmp583584 = sext i16 %tmp98 to i32             ; <i32> [#uses=1]
1026         %tmp585 = sub i32 32, %tmp583584                ; <i32> [#uses=1]
1027         %tmp614615 = sext i16 %tmp101 to i32            ; <i32> [#uses=1]
1028         %tmp621622 = sext i16 %tmp104 to i32            ; <i32> [#uses=1]
1029         %tmp623 = sub i32 32, %tmp621622                ; <i32> [#uses=1]
1030         br label %bb114
1031
1032 produces:
1033
1034 LBB3_5: # bb114.preheader
1035         movswl  -68(%ebp), %eax
1036         movl    $32, %ecx
1037         movl    %ecx, -80(%ebp)
1038         subl    %eax, -80(%ebp)
1039         movswl  -52(%ebp), %eax
1040         movl    %ecx, -84(%ebp)
1041         subl    %eax, -84(%ebp)
1042         movswl  -70(%ebp), %eax
1043         movl    %ecx, -88(%ebp)
1044         subl    %eax, -88(%ebp)
1045         movswl  -50(%ebp), %eax
1046         subl    %eax, %ecx
1047         movl    %ecx, -76(%ebp)
1048         movswl  -42(%ebp), %eax
1049         movl    %eax, -92(%ebp)
1050         movswl  -66(%ebp), %eax
1051         movl    %eax, -96(%ebp)
1052         movw    $0, -98(%ebp)
1053
1054 This appears to be bad because the RA is not folding the store to the stack
1055 slot into the movl.  The above instructions could be:
1056         movl    $32, -80(%ebp)
1057 ...
1058         movl    $32, -84(%ebp)
1059 ...
1060 This seems like a cross between remat and spill folding.
1061
1062 This has redundant subtractions of %eax from a stack slot. However, %ecx doesn't
1063 change, so we could simply subtract %eax from %ecx first and then use %ecx (or
1064 vice-versa).
1065
1066 //===---------------------------------------------------------------------===//
1067
1068 This code:
1069
1070         %tmp659 = icmp slt i16 %tmp654, 0               ; <i1> [#uses=1]
1071         br i1 %tmp659, label %cond_true662, label %cond_next715
1072
1073 produces this:
1074
1075         testw   %cx, %cx
1076         movswl  %cx, %esi
1077         jns     LBB4_109        # cond_next715
1078
1079 Shark tells us that using %cx in the testw instruction is sub-optimal. It
1080 suggests using the 32-bit register (which is what ICC uses).
1081
1082 //===---------------------------------------------------------------------===//
1083
1084 We compile this:
1085
1086 void compare (long long foo) {
1087   if (foo < 4294967297LL)
1088     abort();
1089 }
1090
1091 to:
1092
1093 compare:
1094         subl    $4, %esp
1095         cmpl    $0, 8(%esp)
1096         setne   %al
1097         movzbw  %al, %ax
1098         cmpl    $1, 12(%esp)
1099         setg    %cl
1100         movzbw  %cl, %cx
1101         cmove   %ax, %cx
1102         testb   $1, %cl
1103         jne     .LBB1_2 # UnifiedReturnBlock
1104 .LBB1_1:        # ifthen
1105         call    abort
1106 .LBB1_2:        # UnifiedReturnBlock
1107         addl    $4, %esp
1108         ret
1109
1110 (also really horrible code on ppc).  This is due to the expand code for 64-bit
1111 compares.  GCC produces multiple branches, which is much nicer:
1112
1113 compare:
1114         subl    $12, %esp
1115         movl    20(%esp), %edx
1116         movl    16(%esp), %eax
1117         decl    %edx
1118         jle     .L7
1119 .L5:
1120         addl    $12, %esp
1121         ret
1122         .p2align 4,,7
1123 .L7:
1124         jl      .L4
1125         cmpl    $0, %eax
1126         .p2align 4,,8
1127         ja      .L5
1128 .L4:
1129         .p2align 4,,9
1130         call    abort
1131
1132 //===---------------------------------------------------------------------===//
1133
1134 Tail call optimization improvements: Tail call optimization currently
1135 pushes all arguments on the top of the stack (their normal place for
1136 non-tail call optimized calls) that source from the callers arguments
1137 or  that source from a virtual register (also possibly sourcing from
1138 callers arguments).
1139 This is done to prevent overwriting of parameters (see example
1140 below) that might be used later.
1141
1142 example:
1143
1144 int callee(int32, int64);
1145 int caller(int32 arg1, int32 arg2) {
1146   int64 local = arg2 * 2;
1147   return callee(arg2, (int64)local);
1148 }
1149
1150 [arg1]          [!arg2 no longer valid since we moved local onto it]
1151 [arg2]      ->  [(int64)
1152 [RETADDR]        local  ]
1153
1154 Moving arg1 onto the stack slot of callee function would overwrite
1155 arg2 of the caller.
1156
1157 Possible optimizations:
1158
1159
1160  - Analyse the actual parameters of the callee to see which would
1161    overwrite a caller parameter which is used by the callee and only
1162    push them onto the top of the stack.
1163
1164    int callee (int32 arg1, int32 arg2);
1165    int caller (int32 arg1, int32 arg2) {
1166        return callee(arg1,arg2);
1167    }
1168
1169    Here we don't need to write any variables to the top of the stack
1170    since they don't overwrite each other.
1171
1172    int callee (int32 arg1, int32 arg2);
1173    int caller (int32 arg1, int32 arg2) {
1174        return callee(arg2,arg1);
1175    }
1176
1177    Here we need to push the arguments because they overwrite each
1178    other.
1179
1180 //===---------------------------------------------------------------------===//
1181
1182 main ()
1183 {
1184   int i = 0;
1185   unsigned long int z = 0;
1186
1187   do {
1188     z -= 0x00004000;
1189     i++;
1190     if (i > 0x00040000)
1191       abort ();
1192   } while (z > 0);
1193   exit (0);
1194 }
1195
1196 gcc compiles this to:
1197
1198 _main:
1199         subl    $28, %esp
1200         xorl    %eax, %eax
1201         jmp     L2
1202 L3:
1203         cmpl    $262144, %eax
1204         je      L10
1205 L2:
1206         addl    $1, %eax
1207         cmpl    $262145, %eax
1208         jne     L3
1209         call    L_abort$stub
1210 L10:
1211         movl    $0, (%esp)
1212         call    L_exit$stub
1213
1214 llvm:
1215
1216 _main:
1217         subl    $12, %esp
1218         movl    $1, %eax
1219         movl    $16384, %ecx
1220 LBB1_1: # bb
1221         cmpl    $262145, %eax
1222         jge     LBB1_4  # cond_true
1223 LBB1_2: # cond_next
1224         incl    %eax
1225         addl    $4294950912, %ecx
1226         cmpl    $16384, %ecx
1227         jne     LBB1_1  # bb
1228 LBB1_3: # bb11
1229         xorl    %eax, %eax
1230         addl    $12, %esp
1231         ret
1232 LBB1_4: # cond_true
1233         call    L_abort$stub
1234
1235 1. LSR should rewrite the first cmp with induction variable %ecx.
1236 2. DAG combiner should fold
1237         leal    1(%eax), %edx
1238         cmpl    $262145, %edx
1239    =>
1240         cmpl    $262144, %eax
1241
1242 //===---------------------------------------------------------------------===//
1243
1244 define i64 @test(double %X) {
1245         %Y = fptosi double %X to i64
1246         ret i64 %Y
1247 }
1248
1249 compiles to:
1250
1251 _test:
1252         subl    $20, %esp
1253         movsd   24(%esp), %xmm0
1254         movsd   %xmm0, 8(%esp)
1255         fldl    8(%esp)
1256         fisttpll        (%esp)
1257         movl    4(%esp), %edx
1258         movl    (%esp), %eax
1259         addl    $20, %esp
1260         #FP_REG_KILL
1261         ret
1262
1263 This should just fldl directly from the input stack slot.
1264
1265 //===---------------------------------------------------------------------===//
1266
1267 This code:
1268 int foo (int x) { return (x & 65535) | 255; }
1269
1270 Should compile into:
1271
1272 _foo:
1273         movzwl  4(%esp), %eax
1274         orl     $255, %eax
1275         ret
1276
1277 instead of:
1278 _foo:
1279         movl    $255, %eax
1280         orl     4(%esp), %eax
1281         andl    $65535, %eax
1282         ret
1283
1284 //===---------------------------------------------------------------------===//
1285
1286 We're codegen'ing multiply of long longs inefficiently:
1287
1288 unsigned long long LLM(unsigned long long arg1, unsigned long long arg2) {
1289   return arg1 *  arg2;
1290 }
1291
1292 We compile to (fomit-frame-pointer):
1293
1294 _LLM:
1295         pushl   %esi
1296         movl    8(%esp), %ecx
1297         movl    16(%esp), %esi
1298         movl    %esi, %eax
1299         mull    %ecx
1300         imull   12(%esp), %esi
1301         addl    %edx, %esi
1302         imull   20(%esp), %ecx
1303         movl    %esi, %edx
1304         addl    %ecx, %edx
1305         popl    %esi
1306         ret
1307
1308 This looks like a scheduling deficiency and lack of remat of the load from
1309 the argument area.  ICC apparently produces:
1310
1311         movl      8(%esp), %ecx
1312         imull     12(%esp), %ecx
1313         movl      16(%esp), %eax
1314         imull     4(%esp), %eax
1315         addl      %eax, %ecx
1316         movl      4(%esp), %eax
1317         mull      12(%esp)
1318         addl      %ecx, %edx
1319         ret
1320
1321 Note that it remat'd loads from 4(esp) and 12(esp).  See this GCC PR:
1322 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=17236
1323
1324 //===---------------------------------------------------------------------===//
1325
1326 We can fold a store into "zeroing a reg".  Instead of:
1327
1328 xorl    %eax, %eax
1329 movl    %eax, 124(%esp)
1330
1331 we should get:
1332
1333 movl    $0, 124(%esp)
1334
1335 if the flags of the xor are dead.
1336
1337 Likewise, we isel "x<<1" into "add reg,reg".  If reg is spilled, this should
1338 be folded into: shl [mem], 1
1339
1340 //===---------------------------------------------------------------------===//
1341
1342 This testcase misses a read/modify/write opportunity (from PR1425):
1343
1344 void vertical_decompose97iH1(int *b0, int *b1, int *b2, int width){
1345     int i;
1346     for(i=0; i<width; i++)
1347         b1[i] += (1*(b0[i] + b2[i])+0)>>0;
1348 }
1349
1350 We compile it down to:
1351
1352 LBB1_2: # bb
1353         movl    (%esi,%edi,4), %ebx
1354         addl    (%ecx,%edi,4), %ebx
1355         addl    (%edx,%edi,4), %ebx
1356         movl    %ebx, (%ecx,%edi,4)
1357         incl    %edi
1358         cmpl    %eax, %edi
1359         jne     LBB1_2  # bb
1360
1361 the inner loop should add to the memory location (%ecx,%edi,4), saving
1362 a mov.  Something like:
1363
1364         movl    (%esi,%edi,4), %ebx
1365         addl    (%edx,%edi,4), %ebx
1366         addl    %ebx, (%ecx,%edi,4)
1367
1368 Here is another interesting example:
1369
1370 void vertical_compose97iH1(int *b0, int *b1, int *b2, int width){
1371     int i;
1372     for(i=0; i<width; i++)
1373         b1[i] -= (1*(b0[i] + b2[i])+0)>>0;
1374 }
1375
1376 We miss the r/m/w opportunity here by using 2 subs instead of an add+sub[mem]:
1377
1378 LBB9_2: # bb
1379         movl    (%ecx,%edi,4), %ebx
1380         subl    (%esi,%edi,4), %ebx
1381         subl    (%edx,%edi,4), %ebx
1382         movl    %ebx, (%ecx,%edi,4)
1383         incl    %edi
1384         cmpl    %eax, %edi
1385         jne     LBB9_2  # bb
1386
1387 Additionally, LSR should rewrite the exit condition of these loops to use
1388 a stride-4 IV, would would allow all the scales in the loop to go away.
1389 This would result in smaller code and more efficient microops.
1390
1391 //===---------------------------------------------------------------------===//
1392
1393 In SSE mode, we turn abs and neg into a load from the constant pool plus a xor
1394 or and instruction, for example:
1395
1396         xorpd   LCPI1_0, %xmm2
1397
1398 However, if xmm2 gets spilled, we end up with really ugly code like this:
1399
1400         movsd   (%esp), %xmm0
1401         xorpd   LCPI1_0, %xmm0
1402         movsd   %xmm0, (%esp)
1403
1404 Since we 'know' that this is a 'neg', we can actually "fold" the spill into
1405 the neg/abs instruction, turning it into an *integer* operation, like this:
1406
1407         xorl 2147483648, [mem+4]     ## 2147483648 = (1 << 31)
1408
1409 you could also use xorb, but xorl is less likely to lead to a partial register
1410 stall.  Here is a contrived testcase:
1411
1412 double a, b, c;
1413 void test(double *P) {
1414   double X = *P;
1415   a = X;
1416   bar();
1417   X = -X;
1418   b = X;
1419   bar();
1420   c = X;
1421 }
1422
1423 //===---------------------------------------------------------------------===//
1424
1425 handling llvm.memory.barrier on pre SSE2 cpus
1426
1427 should generate:
1428 lock ; mov %esp, %esp
1429
1430 //===---------------------------------------------------------------------===//
1431
1432 The generated code on x86 for checking for signed overflow on a multiply the
1433 obvious way is much longer than it needs to be.
1434
1435 int x(int a, int b) {
1436   long long prod = (long long)a*b;
1437   return  prod > 0x7FFFFFFF || prod < (-0x7FFFFFFF-1);
1438 }
1439
1440 See PR2053 for more details.
1441
1442 //===---------------------------------------------------------------------===//
1443
1444 We should investigate using cdq/ctld (effect: edx = sar eax, 31)
1445 more aggressively; it should cost the same as a move+shift on any modern
1446 processor, but it's a lot shorter. Downside is that it puts more
1447 pressure on register allocation because it has fixed operands.
1448
1449 Example:
1450 int abs(int x) {return x < 0 ? -x : x;}
1451
1452 gcc compiles this to the following when using march/mtune=pentium2/3/4/m/etc.:
1453 abs:
1454         movl    4(%esp), %eax
1455         cltd
1456         xorl    %edx, %eax
1457         subl    %edx, %eax
1458         ret
1459
1460 //===---------------------------------------------------------------------===//
1461
1462 Consider:
1463 int test(unsigned long a, unsigned long b) { return -(a < b); }
1464
1465 We currently compile this to:
1466
1467 define i32 @test(i32 %a, i32 %b) nounwind  {
1468         %tmp3 = icmp ult i32 %a, %b             ; <i1> [#uses=1]
1469         %tmp34 = zext i1 %tmp3 to i32           ; <i32> [#uses=1]
1470         %tmp5 = sub i32 0, %tmp34               ; <i32> [#uses=1]
1471         ret i32 %tmp5
1472 }
1473
1474 and
1475
1476 _test:
1477         movl    8(%esp), %eax
1478         cmpl    %eax, 4(%esp)
1479         setb    %al
1480         movzbl  %al, %eax
1481         negl    %eax
1482         ret
1483
1484 Several deficiencies here.  First, we should instcombine zext+neg into sext:
1485
1486 define i32 @test2(i32 %a, i32 %b) nounwind  {
1487         %tmp3 = icmp ult i32 %a, %b             ; <i1> [#uses=1]
1488         %tmp34 = sext i1 %tmp3 to i32           ; <i32> [#uses=1]
1489         ret i32 %tmp34
1490 }
1491
1492 However, before we can do that, we have to fix the bad codegen that we get for
1493 sext from bool:
1494
1495 _test2:
1496         movl    8(%esp), %eax
1497         cmpl    %eax, 4(%esp)
1498         setb    %al
1499         movzbl  %al, %eax
1500         shll    $31, %eax
1501         sarl    $31, %eax
1502         ret
1503
1504 This code should be at least as good as the code above.  Once this is fixed, we
1505 can optimize this specific case even more to:
1506
1507         movl    8(%esp), %eax
1508         xorl    %ecx, %ecx
1509         cmpl    %eax, 4(%esp)
1510         sbbl    %ecx, %ecx
1511
1512 //===---------------------------------------------------------------------===//
1513
1514 Take the following code (from
1515 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=16541):
1516
1517 extern unsigned char first_one[65536];
1518 int FirstOnet(unsigned long long arg1)
1519 {
1520   if (arg1 >> 48)
1521     return (first_one[arg1 >> 48]);
1522   return 0;
1523 }
1524
1525
1526 The following code is currently generated:
1527 FirstOnet:
1528         movl    8(%esp), %eax
1529         cmpl    $65536, %eax
1530         movl    4(%esp), %ecx
1531         jb      .LBB1_2 # UnifiedReturnBlock
1532 .LBB1_1:        # ifthen
1533         shrl    $16, %eax
1534         movzbl  first_one(%eax), %eax
1535         ret
1536 .LBB1_2:        # UnifiedReturnBlock
1537         xorl    %eax, %eax
1538         ret
1539
1540 There are a few possible improvements here:
1541 1. We should be able to eliminate the dead load into %ecx
1542 2. We could change the "movl 8(%esp), %eax" into
1543    "movzwl 10(%esp), %eax"; this lets us change the cmpl
1544    into a testl, which is shorter, and eliminate the shift.
1545
1546 We could also in theory eliminate the branch by using a conditional
1547 for the address of the load, but that seems unlikely to be worthwhile
1548 in general.
1549
1550 //===---------------------------------------------------------------------===//
1551
1552 We compile this function:
1553
1554 define i32 @foo(i32 %a, i32 %b, i32 %c, i8 zeroext  %d) nounwind  {
1555 entry:
1556         %tmp2 = icmp eq i8 %d, 0                ; <i1> [#uses=1]
1557         br i1 %tmp2, label %bb7, label %bb
1558
1559 bb:             ; preds = %entry
1560         %tmp6 = add i32 %b, %a          ; <i32> [#uses=1]
1561         ret i32 %tmp6
1562
1563 bb7:            ; preds = %entry
1564         %tmp10 = sub i32 %a, %c         ; <i32> [#uses=1]
1565         ret i32 %tmp10
1566 }
1567
1568 to:
1569
1570 _foo:
1571         cmpb    $0, 16(%esp)
1572         movl    12(%esp), %ecx
1573         movl    8(%esp), %eax
1574         movl    4(%esp), %edx
1575         je      LBB1_2  # bb7
1576 LBB1_1: # bb
1577         addl    %edx, %eax
1578         ret
1579 LBB1_2: # bb7
1580         movl    %edx, %eax
1581         subl    %ecx, %eax
1582         ret
1583
1584 The coalescer could coalesce "edx" with "eax" to avoid the movl in LBB1_2
1585 if it commuted the addl in LBB1_1.
1586
1587 //===---------------------------------------------------------------------===//
1588
1589 See rdar://4653682.
1590
1591 From flops:
1592
1593 LBB1_15:        # bb310
1594         cvtss2sd        LCPI1_0, %xmm1
1595         addsd   %xmm1, %xmm0
1596         movsd   176(%esp), %xmm2
1597         mulsd   %xmm0, %xmm2
1598         movapd  %xmm2, %xmm3
1599         mulsd   %xmm3, %xmm3
1600         movapd  %xmm3, %xmm4
1601         mulsd   LCPI1_23, %xmm4
1602         addsd   LCPI1_24, %xmm4
1603         mulsd   %xmm3, %xmm4
1604         addsd   LCPI1_25, %xmm4
1605         mulsd   %xmm3, %xmm4
1606         addsd   LCPI1_26, %xmm4
1607         mulsd   %xmm3, %xmm4
1608         addsd   LCPI1_27, %xmm4
1609         mulsd   %xmm3, %xmm4
1610         addsd   LCPI1_28, %xmm4
1611         mulsd   %xmm3, %xmm4
1612         addsd   %xmm1, %xmm4
1613         mulsd   %xmm2, %xmm4
1614         movsd   152(%esp), %xmm1
1615         addsd   %xmm4, %xmm1
1616         movsd   %xmm1, 152(%esp)
1617         incl    %eax
1618         cmpl    %eax, %esi
1619         jge     LBB1_15 # bb310
1620 LBB1_16:        # bb358.loopexit
1621         movsd   152(%esp), %xmm0
1622         addsd   %xmm0, %xmm0
1623         addsd   LCPI1_22, %xmm0
1624         movsd   %xmm0, 152(%esp)
1625
1626 Rather than spilling the result of the last addsd in the loop, we should have
1627 insert a copy to split the interval (one for the duration of the loop, one
1628 extending to the fall through). The register pressure in the loop isn't high
1629 enough to warrant the spill.
1630
1631 Also check why xmm7 is not used at all in the function.
1632
1633 //===---------------------------------------------------------------------===//
1634
1635 Legalize loses track of the fact that bools are always zero extended when in
1636 memory.  This causes us to compile abort_gzip (from 164.gzip) from:
1637
1638 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
1639 target triple = "i386-apple-darwin8"
1640 @in_exit.4870.b = internal global i1 false              ; <i1*> [#uses=2]
1641 define fastcc void @abort_gzip() noreturn nounwind  {
1642 entry:
1643         %tmp.b.i = load i1* @in_exit.4870.b             ; <i1> [#uses=1]
1644         br i1 %tmp.b.i, label %bb.i, label %bb4.i
1645 bb.i:           ; preds = %entry
1646         tail call void @exit( i32 1 ) noreturn nounwind
1647         unreachable
1648 bb4.i:          ; preds = %entry
1649         store i1 true, i1* @in_exit.4870.b
1650         tail call void @exit( i32 1 ) noreturn nounwind
1651         unreachable
1652 }
1653 declare void @exit(i32) noreturn nounwind
1654
1655 into:
1656
1657 _abort_gzip:
1658         subl    $12, %esp
1659         movb    _in_exit.4870.b, %al
1660         notb    %al
1661         testb   $1, %al
1662         jne     LBB1_2  ## bb4.i
1663 LBB1_1: ## bb.i
1664   ...
1665
1666 //===---------------------------------------------------------------------===//
1667
1668 We compile:
1669
1670 int test(int x, int y) {
1671   return x-y-1;
1672 }
1673
1674 into (-m64):
1675
1676 _test:
1677         decl    %edi
1678         movl    %edi, %eax
1679         subl    %esi, %eax
1680         ret
1681
1682 it would be better to codegen as: x+~y  (notl+addl)
1683
1684 //===---------------------------------------------------------------------===//
1685
1686 This code:
1687
1688 int foo(const char *str,...)
1689 {
1690  __builtin_va_list a; int x;
1691  __builtin_va_start(a,str); x = __builtin_va_arg(a,int); __builtin_va_end(a);
1692  return x;
1693 }
1694
1695 gets compiled into this on x86-64:
1696         subq    $200, %rsp
1697         movaps  %xmm7, 160(%rsp)
1698         movaps  %xmm6, 144(%rsp)
1699         movaps  %xmm5, 128(%rsp)
1700         movaps  %xmm4, 112(%rsp)
1701         movaps  %xmm3, 96(%rsp)
1702         movaps  %xmm2, 80(%rsp)
1703         movaps  %xmm1, 64(%rsp)
1704         movaps  %xmm0, 48(%rsp)
1705         movq    %r9, 40(%rsp)
1706         movq    %r8, 32(%rsp)
1707         movq    %rcx, 24(%rsp)
1708         movq    %rdx, 16(%rsp)
1709         movq    %rsi, 8(%rsp)
1710         leaq    (%rsp), %rax
1711         movq    %rax, 192(%rsp)
1712         leaq    208(%rsp), %rax
1713         movq    %rax, 184(%rsp)
1714         movl    $48, 180(%rsp)
1715         movl    $8, 176(%rsp)
1716         movl    176(%rsp), %eax
1717         cmpl    $47, %eax
1718         jbe     .LBB1_3 # bb
1719 .LBB1_1:        # bb3
1720         movq    184(%rsp), %rcx
1721         leaq    8(%rcx), %rax
1722         movq    %rax, 184(%rsp)
1723 .LBB1_2:        # bb4
1724         movl    (%rcx), %eax
1725         addq    $200, %rsp
1726         ret
1727 .LBB1_3:        # bb
1728         movl    %eax, %ecx
1729         addl    $8, %eax
1730         addq    192(%rsp), %rcx
1731         movl    %eax, 176(%rsp)
1732         jmp     .LBB1_2 # bb4
1733
1734 gcc 4.3 generates:
1735         subq    $96, %rsp
1736 .LCFI0:
1737         leaq    104(%rsp), %rax
1738         movq    %rsi, -80(%rsp)
1739         movl    $8, -120(%rsp)
1740         movq    %rax, -112(%rsp)
1741         leaq    -88(%rsp), %rax
1742         movq    %rax, -104(%rsp)
1743         movl    $8, %eax
1744         cmpl    $48, %eax
1745         jb      .L6
1746         movq    -112(%rsp), %rdx
1747         movl    (%rdx), %eax
1748         addq    $96, %rsp
1749         ret
1750         .p2align 4,,10
1751         .p2align 3
1752 .L6:
1753         mov     %eax, %edx
1754         addq    -104(%rsp), %rdx
1755         addl    $8, %eax
1756         movl    %eax, -120(%rsp)
1757         movl    (%rdx), %eax
1758         addq    $96, %rsp
1759         ret
1760
1761 and it gets compiled into this on x86:
1762         pushl   %ebp
1763         movl    %esp, %ebp
1764         subl    $4, %esp
1765         leal    12(%ebp), %eax
1766         movl    %eax, -4(%ebp)
1767         leal    16(%ebp), %eax
1768         movl    %eax, -4(%ebp)
1769         movl    12(%ebp), %eax
1770         addl    $4, %esp
1771         popl    %ebp
1772         ret
1773
1774 gcc 4.3 generates:
1775         pushl   %ebp
1776         movl    %esp, %ebp
1777         movl    12(%ebp), %eax
1778         popl    %ebp
1779         ret
1780
1781 //===---------------------------------------------------------------------===//
1782
1783 Teach tblgen not to check bitconvert source type in some cases. This allows us
1784 to consolidate the following patterns in X86InstrMMX.td:
1785
1786 def : Pat<(v2i32 (bitconvert (i64 (vector_extract (v2i64 VR128:$src),
1787                                                   (iPTR 0))))),
1788           (v2i32 (MMX_MOVDQ2Qrr VR128:$src))>;
1789 def : Pat<(v4i16 (bitconvert (i64 (vector_extract (v2i64 VR128:$src),
1790                                                   (iPTR 0))))),
1791           (v4i16 (MMX_MOVDQ2Qrr VR128:$src))>;
1792 def : Pat<(v8i8 (bitconvert (i64 (vector_extract (v2i64 VR128:$src),
1793                                                   (iPTR 0))))),
1794           (v8i8 (MMX_MOVDQ2Qrr VR128:$src))>;
1795
1796 There are other cases in various td files.
1797
1798 //===---------------------------------------------------------------------===//
1799
1800 Take something like the following on x86-32:
1801 unsigned a(unsigned long long x, unsigned y) {return x % y;}
1802
1803 We currently generate a libcall, but we really shouldn't: the expansion is
1804 shorter and likely faster than the libcall.  The expected code is something
1805 like the following:
1806
1807         movl    12(%ebp), %eax
1808         movl    16(%ebp), %ecx
1809         xorl    %edx, %edx
1810         divl    %ecx
1811         movl    8(%ebp), %eax
1812         divl    %ecx
1813         movl    %edx, %eax
1814         ret
1815
1816 A similar code sequence works for division.
1817
1818 //===---------------------------------------------------------------------===//
1819
1820 These should compile to the same code, but the later codegen's to useless
1821 instructions on X86. This may be a trivial dag combine (GCC PR7061):
1822
1823 struct s1 { unsigned char a, b; };
1824 unsigned long f1(struct s1 x) {
1825     return x.a + x.b;
1826 }
1827 struct s2 { unsigned a: 8, b: 8; };
1828 unsigned long f2(struct s2 x) {
1829     return x.a + x.b;
1830 }
1831
1832 //===---------------------------------------------------------------------===//
1833