lib/Target/X86/README.txt

   1 //===---------------------------------------------------------------------===//
   2 // Random ideas for the X86 backend.
   3 //===---------------------------------------------------------------------===//
   4
   5 We should add support for the "movbe" instruction, which does a byte-swapping
   6 copy (3-addr bswap + memory support?)  This is available on Atom processors.
   7
   8 //===---------------------------------------------------------------------===//
   9
  10 CodeGen/X86/lea-3.ll:test3 should be a single LEA, not a shift/move.  The X86
  11 backend knows how to three-addressify this shift, but it appears the register
  12 allocator isn't even asking it to do so in this case.  We should investigate
  13 why this isn't happening, it could have significant impact on other important
  14 cases for X86 as well.
  15
  16 //===---------------------------------------------------------------------===//
  17
  18 This should be one DIV/IDIV instruction, not a libcall:
  19
  20 unsigned test(unsigned long long X, unsigned Y) {
  21         return X/Y;
  22 }
  23
  24 This can be done trivially with a custom legalizer.  What about overflow
  25 though?  http://gcc.gnu.org/bugzilla/show_bug.cgi?id=14224
  26
  27 //===---------------------------------------------------------------------===//
  28
  29 Improvements to the multiply -> shift/add algorithm:
  30 http://gcc.gnu.org/ml/gcc-patches/2004-08/msg01590.html
  31
  32 //===---------------------------------------------------------------------===//
  33
  34 Improve code like this (occurs fairly frequently, e.g. in LLVM):
  35 long long foo(int x) { return 1LL << x; }
  36
  37 http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01109.html
  38 http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01128.html
  39 http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01136.html
  40
  41 Another useful one would be  ~0ULL >> X and ~0ULL << X.
  42
  43 One better solution for 1LL << x is:
  44         xorl    %eax, %eax
  45         xorl    %edx, %edx
  46         testb   $32, %cl
  47         sete    %al
  48         setne   %dl
  49         sall    %cl, %eax
  50         sall    %cl, %edx
  51
  52 But that requires good 8-bit subreg support.
  53
  54 Also, this might be better.  It's an extra shift, but it's one instruction
  55 shorter, and doesn't stress 8-bit subreg support.
  56 (From http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01148.html,
  57 but without the unnecessary and.)
  58         movl %ecx, %eax
  59         shrl $5, %eax
  60         movl %eax, %edx
  61         xorl $1, %edx
  62         sall %cl, %eax
  63         sall %cl. %edx
  64
  65 64-bit shifts (in general) expand to really bad code.  Instead of using
  66 cmovs, we should expand to a conditional branch like GCC produces.
  67
  68 //===---------------------------------------------------------------------===//
  69
  70 Compile this:
  71 _Bool f(_Bool a) { return a!=1; }
  72
  73 into:
  74         movzbl  %dil, %eax
  75         xorl    $1, %eax
  76         ret
  77
  78 (Although note that this isn't a legal way to express the code that llvm-gcc
  79 currently generates for that function.)
  80
  81 //===---------------------------------------------------------------------===//
  82
  83 Some isel ideas:
  84
  85 1. Dynamic programming based approach when compile time if not an
  86    issue.
  87 2. Code duplication (addressing mode) during isel.
  88 3. Other ideas from "Register-Sensitive Selection, Duplication, and
  89    Sequencing of Instructions".
  90 4. Scheduling for reduced register pressure.  E.g. "Minimum Register
  91    Instruction Sequence Problem: Revisiting Optimal Code Generation for DAGs"
  92    and other related papers.
  93    http://citeseer.ist.psu.edu/govindarajan01minimum.html
  94
  95 //===---------------------------------------------------------------------===//
  96
  97 Should we promote i16 to i32 to avoid partial register update stalls?
  98
  99 //===---------------------------------------------------------------------===//
 100
 101 Leave any_extend as pseudo instruction and hint to register
 102 allocator. Delay codegen until post register allocation.
 103 Note. any_extend is now turned into an INSERT_SUBREG. We still need to teach
 104 the coalescer how to deal with it though.
 105
 106 //===---------------------------------------------------------------------===//
 107
 108 It appears icc use push for parameter passing. Need to investigate.
 109
 110 //===---------------------------------------------------------------------===//
 111
 112 Only use inc/neg/not instructions on processors where they are faster than
 113 add/sub/xor.  They are slower on the P4 due to only updating some processor
 114 flags.
 115
 116 //===---------------------------------------------------------------------===//
 117
 118 The instruction selector sometimes misses folding a load into a compare.  The
 119 pattern is written as (cmp reg, (load p)).  Because the compare isn't
 120 commutative, it is not matched with the load on both sides.  The dag combiner
 121 should be made smart enough to cannonicalize the load into the RHS of a compare
 122 when it can invert the result of the compare for free.
 123
 124 //===---------------------------------------------------------------------===//
 125
 126 How about intrinsics? An example is:
 127   *res = _mm_mulhi_epu16(*A, _mm_mul_epu32(*B, *C));
 128
 129 compiles to
 130         pmuludq (%eax), %xmm0
 131         movl 8(%esp), %eax
 132         movdqa (%eax), %xmm1
 133         pmulhuw %xmm0, %xmm1
 134
 135 The transformation probably requires a X86 specific pass or a DAG combiner
 136 target specific hook.
 137
 138 //===---------------------------------------------------------------------===//
 139
 140 In many cases, LLVM generates code like this:
 141
 142 _test:
 143         movl 8(%esp), %eax
 144         cmpl %eax, 4(%esp)
 145         setl %al
 146         movzbl %al, %eax
 147         ret
 148
 149 on some processors (which ones?), it is more efficient to do this:
 150
 151 _test:
 152         movl 8(%esp), %ebx
 153         xor  %eax, %eax
 154         cmpl %ebx, 4(%esp)
 155         setl %al
 156         ret
 157
 158 Doing this correctly is tricky though, as the xor clobbers the flags.
 159
 160 //===---------------------------------------------------------------------===//
 161
 162 We should generate bts/btr/etc instructions on targets where they are cheap or
 163 when codesize is important.  e.g., for:
 164
 165 void setbit(int *target, int bit) {
 166     *target |= (1 << bit);
 167 }
 168 void clearbit(int *target, int bit) {
 169     *target &= ~(1 << bit);
 170 }
 171
 172 //===---------------------------------------------------------------------===//
 173
 174 Instead of the following for memset char*, 1, 10:
 175
 176         movl $16843009, 4(%edx)
 177         movl $16843009, (%edx)
 178         movw $257, 8(%edx)
 179
 180 It might be better to generate
 181
 182         movl $16843009, %eax
 183         movl %eax, 4(%edx)
 184         movl %eax, (%edx)
 185         movw al, 8(%edx)
 186
 187 when we can spare a register. It reduces code size.
 188
 189 //===---------------------------------------------------------------------===//
 190
 191 Evaluate what the best way to codegen sdiv X, (2^C) is.  For X/8, we currently
 192 get this:
 193
 194 define i32 @test1(i32 %X) {
 195     %Y = sdiv i32 %X, 8
 196     ret i32 %Y
 197 }
 198
 199 _test1:
 200         movl 4(%esp), %eax
 201         movl %eax, %ecx
 202         sarl $31, %ecx
 203         shrl $29, %ecx
 204         addl %ecx, %eax
 205         sarl $3, %eax
 206         ret
 207
 208 GCC knows several different ways to codegen it, one of which is this:
 209
 210 _test1:
 211         movl    4(%esp), %eax
 212         cmpl    $-1, %eax
 213         leal    7(%eax), %ecx
 214         cmovle  %ecx, %eax
 215         sarl    $3, %eax
 216         ret
 217
 218 which is probably slower, but it's interesting at least :)
 219
 220 //===---------------------------------------------------------------------===//
 221
 222 We are currently lowering large (1MB+) memmove/memcpy to rep/stosl and rep/movsl
 223 We should leave these as libcalls for everything over a much lower threshold,
 224 since libc is hand tuned for medium and large mem ops (avoiding RFO for large
 225 stores, TLB preheating, etc)
 226
 227 //===---------------------------------------------------------------------===//
 228
 229 Optimize this into something reasonable:
 230  x * copysign(1.0, y) * copysign(1.0, z)
 231
 232 //===---------------------------------------------------------------------===//
 233
 234 Optimize copysign(x, *y) to use an integer load from y.
 235
 236 //===---------------------------------------------------------------------===//
 237
 238 The following tests perform worse with LSR:
 239
 240 lambda, siod, optimizer-eval, ackermann, hash2, nestedloop, strcat, and Treesor.
 241
 242 //===---------------------------------------------------------------------===//
 243
 244 Teach the coalescer to coalesce vregs of different register classes. e.g. FR32 /
 245 FR64 to VR128.
 246
 247 //===---------------------------------------------------------------------===//
 248
 249 Adding to the list of cmp / test poor codegen issues:
 250
 251 int test(__m128 *A, __m128 *B) {
 252   if (_mm_comige_ss(*A, *B))
 253     return 3;
 254   else
 255     return 4;
 256 }
 257
 258 _test:
 259         movl 8(%esp), %eax
 260         movaps (%eax), %xmm0
 261         movl 4(%esp), %eax
 262         movaps (%eax), %xmm1
 263         comiss %xmm0, %xmm1
 264         setae %al
 265         movzbl %al, %ecx
 266         movl $3, %eax
 267         movl $4, %edx
 268         cmpl $0, %ecx
 269         cmove %edx, %eax
 270         ret
 271
 272 Note the setae, movzbl, cmpl, cmove can be replaced with a single cmovae. There
 273 are a number of issues. 1) We are introducing a setcc between the result of the
 274 intrisic call and select. 2) The intrinsic is expected to produce a i32 value
 275 so a any extend (which becomes a zero extend) is added.
 276
 277 We probably need some kind of target DAG combine hook to fix this.
 278
 279 //===---------------------------------------------------------------------===//
 280
 281 We generate significantly worse code for this than GCC:
 282 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=21150
 283 http://gcc.gnu.org/bugzilla/attachment.cgi?id=8701
 284
 285 There is also one case we do worse on PPC.
 286
 287 //===---------------------------------------------------------------------===//
 288
 289 For this:
 290
 291 int test(int a)
 292 {
 293   return a * 3;
 294 }
 295
 296 We currently emits
 297         imull $3, 4(%esp), %eax
 298
 299 Perhaps this is what we really should generate is? Is imull three or four
 300 cycles? Note: ICC generates this:
 301         movl    4(%esp), %eax
 302         leal    (%eax,%eax,2), %eax
 303
 304 The current instruction priority is based on pattern complexity. The former is
 305 more "complex" because it folds a load so the latter will not be emitted.
 306
 307 Perhaps we should use AddedComplexity to give LEA32r a higher priority? We
 308 should always try to match LEA first since the LEA matching code does some
 309 estimate to determine whether the match is profitable.
 310
 311 However, if we care more about code size, then imull is better. It's two bytes
 312 shorter than movl + leal.
 313
 314 On a Pentium M, both variants have the same characteristics with regard
 315 to throughput; however, the multiplication has a latency of four cycles, as
 316 opposed to two cycles for the movl+lea variant.
 317
 318 //===---------------------------------------------------------------------===//
 319
 320 __builtin_ffs codegen is messy.
 321
 322 int ffs_(unsigned X) { return __builtin_ffs(X); }
 323
 324 llvm produces:
 325 ffs_:
 326         movl    4(%esp), %ecx
 327         bsfl    %ecx, %eax
 328         movl    $32, %edx
 329         cmove   %edx, %eax
 330         incl    %eax
 331         xorl    %edx, %edx
 332         testl   %ecx, %ecx
 333         cmove   %edx, %eax
 334         ret
 335
 336 vs gcc:
 337
 338 _ffs_:
 339         movl    $-1, %edx
 340         bsfl    4(%esp), %eax
 341         cmove   %edx, %eax
 342         addl    $1, %eax
 343         ret
 344
 345 Another example of __builtin_ffs (use predsimplify to eliminate a select):
 346
 347 int foo (unsigned long j) {
 348   if (j)
 349     return __builtin_ffs (j) - 1;
 350   else
 351     return 0;
 352 }
 353
 354 //===---------------------------------------------------------------------===//
 355
 356 It appears gcc place string data with linkonce linkage in
 357 .section __TEXT,__const_coal,coalesced instead of
 358 .section __DATA,__const_coal,coalesced.
 359 Take a look at darwin.h, there are other Darwin assembler directives that we
 360 do not make use of.
 361
 362 //===---------------------------------------------------------------------===//
 363
 364 define i32 @foo(i32* %a, i32 %t) {
 365 entry:
 366         br label %cond_true
 367
 368 cond_true:              ; preds = %cond_true, %entry
 369         %x.0.0 = phi i32 [ 0, %entry ], [ %tmp9, %cond_true ]           ; <i32> [#uses=3]
 370         %t_addr.0.0 = phi i32 [ %t, %entry ], [ %tmp7, %cond_true ]             ; <i32> [#uses=1]
 371         %tmp2 = getelementptr i32* %a, i32 %x.0.0               ; <i32*> [#uses=1]
 372         %tmp3 = load i32* %tmp2         ; <i32> [#uses=1]
 373         %tmp5 = add i32 %t_addr.0.0, %x.0.0             ; <i32> [#uses=1]
 374         %tmp7 = add i32 %tmp5, %tmp3            ; <i32> [#uses=2]
 375         %tmp9 = add i32 %x.0.0, 1               ; <i32> [#uses=2]
 376         %tmp = icmp sgt i32 %tmp9, 39           ; <i1> [#uses=1]
 377         br i1 %tmp, label %bb12, label %cond_true
 378
 379 bb12:           ; preds = %cond_true
 380         ret i32 %tmp7
 381 }
 382 is pessimized by -loop-reduce and -indvars
 383
 384 //===---------------------------------------------------------------------===//
 385
 386 u32 to float conversion improvement:
 387
 388 float uint32_2_float( unsigned u ) {
 389   float fl = (int) (u & 0xffff);
 390   float fh = (int) (u >> 16);
 391   fh *= 0x1.0p16f;
 392   return fh + fl;
 393 }
 394
 395 00000000        subl    $0x04,%esp
 396 00000003        movl    0x08(%esp,1),%eax
 397 00000007        movl    %eax,%ecx
 398 00000009        shrl    $0x10,%ecx
 399 0000000c        cvtsi2ss        %ecx,%xmm0
 400 00000010        andl    $0x0000ffff,%eax
 401 00000015        cvtsi2ss        %eax,%xmm1
 402 00000019        mulss   0x00000078,%xmm0
 403 00000021        addss   %xmm1,%xmm0
 404 00000025        movss   %xmm0,(%esp,1)
 405 0000002a        flds    (%esp,1)
 406 0000002d        addl    $0x04,%esp
 407 00000030        ret
 408
 409 //===---------------------------------------------------------------------===//
 410
 411 When using fastcc abi, align stack slot of argument of type double on 8 byte
 412 boundary to improve performance.
 413
 414 //===---------------------------------------------------------------------===//
 415
 416 Codegen:
 417
 418 int f(int a, int b) {
 419   if (a == 4 || a == 6)
 420     b++;
 421   return b;
 422 }
 423
 424
 425 as:
 426
 427 or eax, 2
 428 cmp eax, 6
 429 jz label
 430
 431 //===---------------------------------------------------------------------===//
 432
 433 GCC's ix86_expand_int_movcc function (in i386.c) has a ton of interesting
 434 simplifications for integer "x cmp y ? a : b".  For example, instead of:
 435
 436 int G;
 437 void f(int X, int Y) {
 438   G = X < 0 ? 14 : 13;
 439 }
 440
 441 compiling to:
 442
 443 _f:
 444         movl $14, %eax
 445         movl $13, %ecx
 446         movl 4(%esp), %edx
 447         testl %edx, %edx
 448         cmovl %eax, %ecx
 449         movl %ecx, _G
 450         ret
 451
 452 it could be:
 453 _f:
 454         movl    4(%esp), %eax
 455         sarl    $31, %eax
 456         notl    %eax
 457         addl    $14, %eax
 458         movl    %eax, _G
 459         ret
 460
 461 etc.
 462
 463 Another is:
 464 int usesbb(unsigned int a, unsigned int b) {
 465        return (a < b ? -1 : 0);
 466 }
 467 to:
 468 _usesbb:
 469         movl    8(%esp), %eax
 470         cmpl    %eax, 4(%esp)
 471         sbbl    %eax, %eax
 472         ret
 473
 474 instead of:
 475 _usesbb:
 476         xorl    %eax, %eax
 477         movl    8(%esp), %ecx
 478         cmpl    %ecx, 4(%esp)
 479         movl    $4294967295, %ecx
 480         cmovb   %ecx, %eax
 481         ret
 482
 483 //===---------------------------------------------------------------------===//
 484
 485 Currently we don't have elimination of redundant stack manipulations. Consider
 486 the code:
 487
 488 int %main() {
 489 entry:
 490         call fastcc void %test1( )
 491         call fastcc void %test2( sbyte* cast (void ()* %test1 to sbyte*) )
 492         ret int 0
 493 }
 494
 495 declare fastcc void %test1()
 496
 497 declare fastcc void %test2(sbyte*)
 498
 499
 500 This currently compiles to:
 501
 502         subl $16, %esp
 503         call _test5
 504         addl $12, %esp
 505         subl $16, %esp
 506         movl $_test5, (%esp)
 507         call _test6
 508         addl $12, %esp
 509
 510 The add\sub pair is really unneeded here.
 511
 512 //===---------------------------------------------------------------------===//
 513
 514 Consider the expansion of:
 515
 516 define i32 @test3(i32 %X) {
 517         %tmp1 = urem i32 %X, 255
 518         ret i32 %tmp1
 519 }
 520
 521 Currently it compiles to:
 522
 523 ...
 524         movl $2155905153, %ecx
 525         movl 8(%esp), %esi
 526         movl %esi, %eax
 527         mull %ecx
 528 ...
 529
 530 This could be "reassociated" into:
 531
 532         movl $2155905153, %eax
 533         movl 8(%esp), %ecx
 534         mull %ecx
 535
 536 to avoid the copy.  In fact, the existing two-address stuff would do this
 537 except that mul isn't a commutative 2-addr instruction.  I guess this has
 538 to be done at isel time based on the #uses to mul?
 539
 540 //===---------------------------------------------------------------------===//
 541
 542 Make sure the instruction which starts a loop does not cross a cacheline
 543 boundary. This requires knowning the exact length of each machine instruction.
 544 That is somewhat complicated, but doable. Example 256.bzip2:
 545
 546 In the new trace, the hot loop has an instruction which crosses a cacheline
 547 boundary.  In addition to potential cache misses, this can't help decoding as I
 548 imagine there has to be some kind of complicated decoder reset and realignment
 549 to grab the bytes from the next cacheline.
 550
 551 532  532 0x3cfc movb     (1809(%esp, %esi), %bl   <<<--- spans 2 64 byte lines
 552 942  942 0x3d03 movl     %dh, (1809(%esp, %esi)
 553 937  937 0x3d0a incl     %esi
 554 3    3   0x3d0b cmpb     %bl, %dl
 555 27   27  0x3d0d jnz      0x000062db <main+11707>
 556
 557 //===---------------------------------------------------------------------===//
 558
 559 In c99 mode, the preprocessor doesn't like assembly comments like #TRUNCATE.
 560
 561 //===---------------------------------------------------------------------===//
 562
 563 This could be a single 16-bit load.
 564
 565 int f(char *p) {
 566     if ((p[0] == 1) & (p[1] == 2)) return 1;
 567     return 0;
 568 }
 569
 570 //===---------------------------------------------------------------------===//
 571
 572 We should inline lrintf and probably other libc functions.
 573
 574 //===---------------------------------------------------------------------===//
 575
 576 Start using the flags more.  For example, compile:
 577
 578 int add_zf(int *x, int y, int a, int b) {
 579      if ((*x += y) == 0)
 580           return a;
 581      else
 582           return b;
 583 }
 584
 585 to:
 586        addl    %esi, (%rdi)
 587        movl    %edx, %eax
 588        cmovne  %ecx, %eax
 589        ret
 590 instead of:
 591
 592 _add_zf:
 593         addl (%rdi), %esi
 594         movl %esi, (%rdi)
 595         testl %esi, %esi
 596         cmove %edx, %ecx
 597         movl %ecx, %eax
 598         ret
 599
 600 and:
 601
 602 int add_zf(int *x, int y, int a, int b) {
 603      if ((*x + y) < 0)
 604           return a;
 605      else
 606           return b;
 607 }
 608
 609 to:
 610
 611 add_zf:
 612         addl    (%rdi), %esi
 613         movl    %edx, %eax
 614         cmovns  %ecx, %eax
 615         ret
 616
 617 instead of:
 618
 619 _add_zf:
 620         addl (%rdi), %esi
 621         testl %esi, %esi
 622         cmovs %edx, %ecx
 623         movl %ecx, %eax
 624         ret
 625
 626 //===---------------------------------------------------------------------===//
 627
 628 These two functions have identical effects:
 629
 630 unsigned int f(unsigned int i, unsigned int n) {++i; if (i == n) ++i; return i;}
 631 unsigned int f2(unsigned int i, unsigned int n) {++i; i += i == n; return i;}
 632
 633 We currently compile them to:
 634
 635 _f:
 636         movl 4(%esp), %eax
 637         movl %eax, %ecx
 638         incl %ecx
 639         movl 8(%esp), %edx
 640         cmpl %edx, %ecx
 641         jne LBB1_2      #UnifiedReturnBlock
 642 LBB1_1: #cond_true
 643         addl $2, %eax
 644         ret
 645 LBB1_2: #UnifiedReturnBlock
 646         movl %ecx, %eax
 647         ret
 648 _f2:
 649         movl 4(%esp), %eax
 650         movl %eax, %ecx
 651         incl %ecx
 652         cmpl 8(%esp), %ecx
 653         sete %cl
 654         movzbl %cl, %ecx
 655         leal 1(%ecx,%eax), %eax
 656         ret
 657
 658 both of which are inferior to GCC's:
 659
 660 _f:
 661         movl    4(%esp), %edx
 662         leal    1(%edx), %eax
 663         addl    $2, %edx
 664         cmpl    8(%esp), %eax
 665         cmove   %edx, %eax
 666         ret
 667 _f2:
 668         movl    4(%esp), %eax
 669         addl    $1, %eax
 670         xorl    %edx, %edx
 671         cmpl    8(%esp), %eax
 672         sete    %dl
 673         addl    %edx, %eax
 674         ret
 675
 676 //===---------------------------------------------------------------------===//
 677
 678 This code:
 679
 680 void test(int X) {
 681   if (X) abort();
 682 }
 683
 684 is currently compiled to:
 685
 686 _test:
 687         subl $12, %esp
 688         cmpl $0, 16(%esp)
 689         jne LBB1_1
 690         addl $12, %esp
 691         ret
 692 LBB1_1:
 693         call L_abort$stub
 694
 695 It would be better to produce:
 696
 697 _test:
 698         subl $12, %esp
 699         cmpl $0, 16(%esp)
 700         jne L_abort$stub
 701         addl $12, %esp
 702         ret
 703
 704 This can be applied to any no-return function call that takes no arguments etc.
 705 Alternatively, the stack save/restore logic could be shrink-wrapped, producing
 706 something like this:
 707
 708 _test:
 709         cmpl $0, 4(%esp)
 710         jne LBB1_1
 711         ret
 712 LBB1_1:
 713         subl $12, %esp
 714         call L_abort$stub
 715
 716 Both are useful in different situations.  Finally, it could be shrink-wrapped
 717 and tail called, like this:
 718
 719 _test:
 720         cmpl $0, 4(%esp)
 721         jne LBB1_1
 722         ret
 723 LBB1_1:
 724         pop %eax   # realign stack.
 725         call L_abort$stub
 726
 727 Though this probably isn't worth it.
 728
 729 //===---------------------------------------------------------------------===//
 730
 731 We need to teach the codegen to convert two-address INC instructions to LEA
 732 when the flags are dead (likewise dec).  For example, on X86-64, compile:
 733
 734 int foo(int A, int B) {
 735   return A+1;
 736 }
 737
 738 to:
 739
 740 _foo:
 741         leal    1(%edi), %eax
 742         ret
 743
 744 instead of:
 745
 746 _foo:
 747         incl %edi
 748         movl %edi, %eax
 749         ret
 750
 751 Another example is:
 752
 753 ;; X's live range extends beyond the shift, so the register allocator
 754 ;; cannot coalesce it with Y.  Because of this, a copy needs to be
 755 ;; emitted before the shift to save the register value before it is
 756 ;; clobbered.  However, this copy is not needed if the register
 757 ;; allocator turns the shift into an LEA.  This also occurs for ADD.
 758
 759 ; Check that the shift gets turned into an LEA.
 760 ; RUN: llvm-as < %s | llc -march=x86 -x86-asm-syntax=intel | \
 761 ; RUN:   not grep {mov E.X, E.X}
 762
 763 @G = external global i32                ; <i32*> [#uses=3]
 764
 765 define i32 @test1(i32 %X, i32 %Y) {
 766         %Z = add i32 %X, %Y             ; <i32> [#uses=1]
 767         volatile store i32 %Y, i32* @G
 768         volatile store i32 %Z, i32* @G
 769         ret i32 %X
 770 }
 771
 772 define i32 @test2(i32 %X) {
 773         %Z = add i32 %X, 1              ; <i32> [#uses=1]
 774         volatile store i32 %Z, i32* @G
 775         ret i32 %X
 776 }
 777
 778 //===---------------------------------------------------------------------===//
 779
 780 Sometimes it is better to codegen subtractions from a constant (e.g. 7-x) with
 781 a neg instead of a sub instruction.  Consider:
 782
 783 int test(char X) { return 7-X; }
 784
 785 we currently produce:
 786 _test:
 787         movl $7, %eax
 788         movsbl 4(%esp), %ecx
 789         subl %ecx, %eax
 790         ret
 791
 792 We would use one fewer register if codegen'd as:
 793
 794         movsbl 4(%esp), %eax
 795         neg %eax
 796         add $7, %eax
 797         ret
 798
 799 Note that this isn't beneficial if the load can be folded into the sub.  In
 800 this case, we want a sub:
 801
 802 int test(int X) { return 7-X; }
 803 _test:
 804         movl $7, %eax
 805         subl 4(%esp), %eax
 806         ret
 807
 808 //===---------------------------------------------------------------------===//
 809
 810 Leaf functions that require one 4-byte spill slot have a prolog like this:
 811
 812 _foo:
 813         pushl   %esi
 814         subl    $4, %esp
 815 ...
 816 and an epilog like this:
 817         addl    $4, %esp
 818         popl    %esi
 819         ret
 820
 821 It would be smaller, and potentially faster, to push eax on entry and to
 822 pop into a dummy register instead of using addl/subl of esp.  Just don't pop
 823 into any return registers :)
 824
 825 //===---------------------------------------------------------------------===//
 826
 827 The X86 backend should fold (branch (or (setcc, setcc))) into multiple
 828 branches.  We generate really poor code for:
 829
 830 double testf(double a) {
 831        return a == 0.0 ? 0.0 : (a > 0.0 ? 1.0 : -1.0);
 832 }
 833
 834 For example, the entry BB is:
 835
 836 _testf:
 837         subl    $20, %esp
 838         pxor    %xmm0, %xmm0
 839         movsd   24(%esp), %xmm1
 840         ucomisd %xmm0, %xmm1
 841         setnp   %al
 842         sete    %cl
 843         testb   %cl, %al
 844         jne     LBB1_5  # UnifiedReturnBlock
 845 LBB1_1: # cond_true
 846
 847
 848 it would be better to replace the last four instructions with:
 849
 850         jp LBB1_1
 851         je LBB1_5
 852 LBB1_1:
 853
 854 We also codegen the inner ?: into a diamond:
 855
 856        cvtss2sd        LCPI1_0(%rip), %xmm2
 857         cvtss2sd        LCPI1_1(%rip), %xmm3
 858         ucomisd %xmm1, %xmm0
 859         ja      LBB1_3  # cond_true
 860 LBB1_2: # cond_true
 861         movapd  %xmm3, %xmm2
 862 LBB1_3: # cond_true
 863         movapd  %xmm2, %xmm0
 864         ret
 865
 866 We should sink the load into xmm3 into the LBB1_2 block.  This should
 867 be pretty easy, and will nuke all the copies.
 868
 869 //===---------------------------------------------------------------------===//
 870
 871 This:
 872         #include <algorithm>
 873         inline std::pair<unsigned, bool> full_add(unsigned a, unsigned b)
 874         { return std::make_pair(a + b, a + b < a); }
 875         bool no_overflow(unsigned a, unsigned b)
 876         { return !full_add(a, b).second; }
 877
 878 Should compile to:
 879
 880
 881         _Z11no_overflowjj:
 882                 addl    %edi, %esi
 883                 setae   %al
 884                 ret
 885
 886 FIXME: That code looks wrong; bool return is normally defined as zext.
 887
 888 on x86-64, not:
 889
 890 __Z11no_overflowjj:
 891         addl    %edi, %esi
 892         cmpl    %edi, %esi
 893         setae   %al
 894         movzbl  %al, %eax
 895         ret
 896
 897
 898 //===---------------------------------------------------------------------===//
 899
 900 Re-materialize MOV32r0 etc. with xor instead of changing them to moves if the
 901 condition register is dead. xor reg reg is shorter than mov reg, #0.
 902
 903 //===---------------------------------------------------------------------===//
 904
 905 We aren't matching RMW instructions aggressively
 906 enough.  Here's a reduced testcase (more in PR1160):
 907
 908 define void @test(i32* %huge_ptr, i32* %target_ptr) {
 909         %A = load i32* %huge_ptr                ; <i32> [#uses=1]
 910         %B = load i32* %target_ptr              ; <i32> [#uses=1]
 911         %C = or i32 %A, %B              ; <i32> [#uses=1]
 912         store i32 %C, i32* %target_ptr
 913         ret void
 914 }
 915
 916 $ llvm-as < t.ll | llc -march=x86-64
 917
 918 _test:
 919         movl (%rdi), %eax
 920         orl (%rsi), %eax
 921         movl %eax, (%rsi)
 922         ret
 923
 924 That should be something like:
 925
 926 _test:
 927         movl (%rdi), %eax
 928         orl %eax, (%rsi)
 929         ret
 930
 931 //===---------------------------------------------------------------------===//
 932
 933 The following code:
 934
 935 bb114.preheader:                ; preds = %cond_next94
 936         %tmp231232 = sext i16 %tmp62 to i32             ; <i32> [#uses=1]
 937         %tmp233 = sub i32 32, %tmp231232                ; <i32> [#uses=1]
 938         %tmp245246 = sext i16 %tmp65 to i32             ; <i32> [#uses=1]
 939         %tmp252253 = sext i16 %tmp68 to i32             ; <i32> [#uses=1]
 940         %tmp254 = sub i32 32, %tmp252253                ; <i32> [#uses=1]
 941         %tmp553554 = bitcast i16* %tmp37 to i8*         ; <i8*> [#uses=2]
 942         %tmp583584 = sext i16 %tmp98 to i32             ; <i32> [#uses=1]
 943         %tmp585 = sub i32 32, %tmp583584                ; <i32> [#uses=1]
 944         %tmp614615 = sext i16 %tmp101 to i32            ; <i32> [#uses=1]
 945         %tmp621622 = sext i16 %tmp104 to i32            ; <i32> [#uses=1]
 946         %tmp623 = sub i32 32, %tmp621622                ; <i32> [#uses=1]
 947         br label %bb114
 948
 949 produces:
 950
 951 LBB3_5: # bb114.preheader
 952         movswl  -68(%ebp), %eax
 953         movl    $32, %ecx
 954         movl    %ecx, -80(%ebp)
 955         subl    %eax, -80(%ebp)
 956         movswl  -52(%ebp), %eax
 957         movl    %ecx, -84(%ebp)
 958         subl    %eax, -84(%ebp)
 959         movswl  -70(%ebp), %eax
 960         movl    %ecx, -88(%ebp)
 961         subl    %eax, -88(%ebp)
 962         movswl  -50(%ebp), %eax
 963         subl    %eax, %ecx
 964         movl    %ecx, -76(%ebp)
 965         movswl  -42(%ebp), %eax
 966         movl    %eax, -92(%ebp)
 967         movswl  -66(%ebp), %eax
 968         movl    %eax, -96(%ebp)
 969         movw    $0, -98(%ebp)
 970
 971 This appears to be bad because the RA is not folding the store to the stack
 972 slot into the movl.  The above instructions could be:
 973         movl    $32, -80(%ebp)
 974 ...
 975         movl    $32, -84(%ebp)
 976 ...
 977 This seems like a cross between remat and spill folding.
 978
 979 This has redundant subtractions of %eax from a stack slot. However, %ecx doesn't
 980 change, so we could simply subtract %eax from %ecx first and then use %ecx (or
 981 vice-versa).
 982
 983 //===---------------------------------------------------------------------===//
 984
 985 This code:
 986
 987         %tmp659 = icmp slt i16 %tmp654, 0               ; <i1> [#uses=1]
 988         br i1 %tmp659, label %cond_true662, label %cond_next715
 989
 990 produces this:
 991
 992         testw   %cx, %cx
 993         movswl  %cx, %esi
 994         jns     LBB4_109        # cond_next715
 995
 996 Shark tells us that using %cx in the testw instruction is sub-optimal. It
 997 suggests using the 32-bit register (which is what ICC uses).
 998
 999 //===---------------------------------------------------------------------===//
1000
1001 We compile this:
1002
1003 void compare (long long foo) {
1004   if (foo < 4294967297LL)
1005     abort();
1006 }
1007
1008 to:
1009
1010 compare:
1011         subl    $4, %esp
1012         cmpl    $0, 8(%esp)
1013         setne   %al
1014         movzbw  %al, %ax
1015         cmpl    $1, 12(%esp)
1016         setg    %cl
1017         movzbw  %cl, %cx
1018         cmove   %ax, %cx
1019         testb   $1, %cl
1020         jne     .LBB1_2 # UnifiedReturnBlock
1021 .LBB1_1:        # ifthen
1022         call    abort
1023 .LBB1_2:        # UnifiedReturnBlock
1024         addl    $4, %esp
1025         ret
1026
1027 (also really horrible code on ppc).  This is due to the expand code for 64-bit
1028 compares.  GCC produces multiple branches, which is much nicer:
1029
1030 compare:
1031         subl    $12, %esp
1032         movl    20(%esp), %edx
1033         movl    16(%esp), %eax
1034         decl    %edx
1035         jle     .L7
1036 .L5:
1037         addl    $12, %esp
1038         ret
1039         .p2align 4,,7
1040 .L7:
1041         jl      .L4
1042         cmpl    $0, %eax
1043         .p2align 4,,8
1044         ja      .L5
1045 .L4:
1046         .p2align 4,,9
1047         call    abort
1048
1049 //===---------------------------------------------------------------------===//
1050
1051 Tail call optimization improvements: Tail call optimization currently
1052 pushes all arguments on the top of the stack (their normal place for
1053 non-tail call optimized calls) that source from the callers arguments
1054 or  that source from a virtual register (also possibly sourcing from
1055 callers arguments).
1056 This is done to prevent overwriting of parameters (see example
1057 below) that might be used later.
1058
1059 example:
1060
1061 int callee(int32, int64);
1062 int caller(int32 arg1, int32 arg2) {
1063   int64 local = arg2 * 2;
1064   return callee(arg2, (int64)local);
1065 }
1066
1067 [arg1]          [!arg2 no longer valid since we moved local onto it]
1068 [arg2]      ->  [(int64)
1069 [RETADDR]        local  ]
1070
1071 Moving arg1 onto the stack slot of callee function would overwrite
1072 arg2 of the caller.
1073
1074 Possible optimizations:
1075
1076
1077  - Analyse the actual parameters of the callee to see which would
1078    overwrite a caller parameter which is used by the callee and only
1079    push them onto the top of the stack.
1080
1081    int callee (int32 arg1, int32 arg2);
1082    int caller (int32 arg1, int32 arg2) {
1083        return callee(arg1,arg2);
1084    }
1085
1086    Here we don't need to write any variables to the top of the stack
1087    since they don't overwrite each other.
1088
1089    int callee (int32 arg1, int32 arg2);
1090    int caller (int32 arg1, int32 arg2) {
1091        return callee(arg2,arg1);
1092    }
1093
1094    Here we need to push the arguments because they overwrite each
1095    other.
1096
1097 //===---------------------------------------------------------------------===//
1098
1099 main ()
1100 {
1101   int i = 0;
1102   unsigned long int z = 0;
1103
1104   do {
1105     z -= 0x00004000;
1106     i++;
1107     if (i > 0x00040000)
1108       abort ();
1109   } while (z > 0);
1110   exit (0);
1111 }
1112
1113 gcc compiles this to:
1114
1115 _main:
1116         subl    $28, %esp
1117         xorl    %eax, %eax
1118         jmp     L2
1119 L3:
1120         cmpl    $262144, %eax
1121         je      L10
1122 L2:
1123         addl    $1, %eax
1124         cmpl    $262145, %eax
1125         jne     L3
1126         call    L_abort$stub
1127 L10:
1128         movl    $0, (%esp)
1129         call    L_exit$stub
1130
1131 llvm:
1132
1133 _main:
1134         subl    $12, %esp
1135         movl    $1, %eax
1136         movl    $16384, %ecx
1137 LBB1_1: # bb
1138         cmpl    $262145, %eax
1139         jge     LBB1_4  # cond_true
1140 LBB1_2: # cond_next
1141         incl    %eax
1142         addl    $4294950912, %ecx
1143         cmpl    $16384, %ecx
1144         jne     LBB1_1  # bb
1145 LBB1_3: # bb11
1146         xorl    %eax, %eax
1147         addl    $12, %esp
1148         ret
1149 LBB1_4: # cond_true
1150         call    L_abort$stub
1151
1152 1. LSR should rewrite the first cmp with induction variable %ecx.
1153 2. DAG combiner should fold
1154         leal    1(%eax), %edx
1155         cmpl    $262145, %edx
1156    =>
1157         cmpl    $262144, %eax
1158
1159 //===---------------------------------------------------------------------===//
1160
1161 define i64 @test(double %X) {
1162         %Y = fptosi double %X to i64
1163         ret i64 %Y
1164 }
1165
1166 compiles to:
1167
1168 _test:
1169         subl    $20, %esp
1170         movsd   24(%esp), %xmm0
1171         movsd   %xmm0, 8(%esp)
1172         fldl    8(%esp)
1173         fisttpll        (%esp)
1174         movl    4(%esp), %edx
1175         movl    (%esp), %eax
1176         addl    $20, %esp
1177         #FP_REG_KILL
1178         ret
1179
1180 This should just fldl directly from the input stack slot.
1181
1182 //===---------------------------------------------------------------------===//
1183
1184 This code:
1185 int foo (int x) { return (x & 65535) | 255; }
1186
1187 Should compile into:
1188
1189 _foo:
1190         movzwl  4(%esp), %eax
1191         orl     $255, %eax
1192         ret
1193
1194 instead of:
1195 _foo:
1196         movl    $255, %eax
1197         orl     4(%esp), %eax
1198         andl    $65535, %eax
1199         ret
1200
1201 //===---------------------------------------------------------------------===//
1202
1203 We're codegen'ing multiply of long longs inefficiently:
1204
1205 unsigned long long LLM(unsigned long long arg1, unsigned long long arg2) {
1206   return arg1 *  arg2;
1207 }
1208
1209 We compile to (fomit-frame-pointer):
1210
1211 _LLM:
1212         pushl   %esi
1213         movl    8(%esp), %ecx
1214         movl    16(%esp), %esi
1215         movl    %esi, %eax
1216         mull    %ecx
1217         imull   12(%esp), %esi
1218         addl    %edx, %esi
1219         imull   20(%esp), %ecx
1220         movl    %esi, %edx
1221         addl    %ecx, %edx
1222         popl    %esi
1223         ret
1224
1225 This looks like a scheduling deficiency and lack of remat of the load from
1226 the argument area.  ICC apparently produces:
1227
1228         movl      8(%esp), %ecx
1229         imull     12(%esp), %ecx
1230         movl      16(%esp), %eax
1231         imull     4(%esp), %eax
1232         addl      %eax, %ecx
1233         movl      4(%esp), %eax
1234         mull      12(%esp)
1235         addl      %ecx, %edx
1236         ret
1237
1238 Note that it remat'd loads from 4(esp) and 12(esp).  See this GCC PR:
1239 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=17236
1240
1241 //===---------------------------------------------------------------------===//
1242
1243 We can fold a store into "zeroing a reg".  Instead of:
1244
1245 xorl    %eax, %eax
1246 movl    %eax, 124(%esp)
1247
1248 we should get:
1249
1250 movl    $0, 124(%esp)
1251
1252 if the flags of the xor are dead.
1253
1254 Likewise, we isel "x<<1" into "add reg,reg".  If reg is spilled, this should
1255 be folded into: shl [mem], 1
1256
1257 //===---------------------------------------------------------------------===//
1258
1259 This testcase misses a read/modify/write opportunity (from PR1425):
1260
1261 void vertical_decompose97iH1(int *b0, int *b1, int *b2, int width){
1262     int i;
1263     for(i=0; i<width; i++)
1264         b1[i] += (1*(b0[i] + b2[i])+0)>>0;
1265 }
1266
1267 We compile it down to:
1268
1269 LBB1_2: # bb
1270         movl    (%esi,%edi,4), %ebx
1271         addl    (%ecx,%edi,4), %ebx
1272         addl    (%edx,%edi,4), %ebx
1273         movl    %ebx, (%ecx,%edi,4)
1274         incl    %edi
1275         cmpl    %eax, %edi
1276         jne     LBB1_2  # bb
1277
1278 the inner loop should add to the memory location (%ecx,%edi,4), saving
1279 a mov.  Something like:
1280
1281         movl    (%esi,%edi,4), %ebx
1282         addl    (%edx,%edi,4), %ebx
1283         addl    %ebx, (%ecx,%edi,4)
1284
1285 Here is another interesting example:
1286
1287 void vertical_compose97iH1(int *b0, int *b1, int *b2, int width){
1288     int i;
1289     for(i=0; i<width; i++)
1290         b1[i] -= (1*(b0[i] + b2[i])+0)>>0;
1291 }
1292
1293 We miss the r/m/w opportunity here by using 2 subs instead of an add+sub[mem]:
1294
1295 LBB9_2: # bb
1296         movl    (%ecx,%edi,4), %ebx
1297         subl    (%esi,%edi,4), %ebx
1298         subl    (%edx,%edi,4), %ebx
1299         movl    %ebx, (%ecx,%edi,4)
1300         incl    %edi
1301         cmpl    %eax, %edi
1302         jne     LBB9_2  # bb
1303
1304 Additionally, LSR should rewrite the exit condition of these loops to use
1305 a stride-4 IV, would would allow all the scales in the loop to go away.
1306 This would result in smaller code and more efficient microops.
1307
1308 //===---------------------------------------------------------------------===//
1309
1310 In SSE mode, we turn abs and neg into a load from the constant pool plus a xor
1311 or and instruction, for example:
1312
1313         xorpd   LCPI1_0, %xmm2
1314
1315 However, if xmm2 gets spilled, we end up with really ugly code like this:
1316
1317         movsd   (%esp), %xmm0
1318         xorpd   LCPI1_0, %xmm0
1319         movsd   %xmm0, (%esp)
1320
1321 Since we 'know' that this is a 'neg', we can actually "fold" the spill into
1322 the neg/abs instruction, turning it into an *integer* operation, like this:
1323
1324         xorl 2147483648, [mem+4]     ## 2147483648 = (1 << 31)
1325
1326 you could also use xorb, but xorl is less likely to lead to a partial register
1327 stall.  Here is a contrived testcase:
1328
1329 double a, b, c;
1330 void test(double *P) {
1331   double X = *P;
1332   a = X;
1333   bar();
1334   X = -X;
1335   b = X;
1336   bar();
1337   c = X;
1338 }
1339
1340 //===---------------------------------------------------------------------===//
1341
1342 handling llvm.memory.barrier on pre SSE2 cpus
1343
1344 should generate:
1345 lock ; mov %esp, %esp
1346
1347 //===---------------------------------------------------------------------===//
1348
1349 The generated code on x86 for checking for signed overflow on a multiply the
1350 obvious way is much longer than it needs to be.
1351
1352 int x(int a, int b) {
1353   long long prod = (long long)a*b;
1354   return  prod > 0x7FFFFFFF || prod < (-0x7FFFFFFF-1);
1355 }
1356
1357 See PR2053 for more details.
1358
1359 //===---------------------------------------------------------------------===//
1360
1361 We should investigate using cdq/ctld (effect: edx = sar eax, 31)
1362 more aggressively; it should cost the same as a move+shift on any modern
1363 processor, but it's a lot shorter. Downside is that it puts more
1364 pressure on register allocation because it has fixed operands.
1365
1366 Example:
1367 int abs(int x) {return x < 0 ? -x : x;}
1368
1369 gcc compiles this to the following when using march/mtune=pentium2/3/4/m/etc.:
1370 abs:
1371         movl    4(%esp), %eax
1372         cltd
1373         xorl    %edx, %eax
1374         subl    %edx, %eax
1375         ret
1376
1377 //===---------------------------------------------------------------------===//
1378
1379 Consider:
1380 int test(unsigned long a, unsigned long b) { return -(a < b); }
1381
1382 We currently compile this to:
1383
1384 define i32 @test(i32 %a, i32 %b) nounwind  {
1385         %tmp3 = icmp ult i32 %a, %b             ; <i1> [#uses=1]
1386         %tmp34 = zext i1 %tmp3 to i32           ; <i32> [#uses=1]
1387         %tmp5 = sub i32 0, %tmp34               ; <i32> [#uses=1]
1388         ret i32 %tmp5
1389 }
1390
1391 and
1392
1393 _test:
1394         movl    8(%esp), %eax
1395         cmpl    %eax, 4(%esp)
1396         setb    %al
1397         movzbl  %al, %eax
1398         negl    %eax
1399         ret
1400
1401 Several deficiencies here.  First, we should instcombine zext+neg into sext:
1402
1403 define i32 @test2(i32 %a, i32 %b) nounwind  {
1404         %tmp3 = icmp ult i32 %a, %b             ; <i1> [#uses=1]
1405         %tmp34 = sext i1 %tmp3 to i32           ; <i32> [#uses=1]
1406         ret i32 %tmp34
1407 }
1408
1409 However, before we can do that, we have to fix the bad codegen that we get for
1410 sext from bool:
1411
1412 _test2:
1413         movl    8(%esp), %eax
1414         cmpl    %eax, 4(%esp)
1415         setb    %al
1416         movzbl  %al, %eax
1417         shll    $31, %eax
1418         sarl    $31, %eax
1419         ret
1420
1421 This code should be at least as good as the code above.  Once this is fixed, we
1422 can optimize this specific case even more to:
1423
1424         movl    8(%esp), %eax
1425         xorl    %ecx, %ecx
1426         cmpl    %eax, 4(%esp)
1427         sbbl    %ecx, %ecx
1428
1429 //===---------------------------------------------------------------------===//
1430
1431 Take the following code (from
1432 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=16541):
1433
1434 extern unsigned char first_one[65536];
1435 int FirstOnet(unsigned long long arg1)
1436 {
1437   if (arg1 >> 48)
1438     return (first_one[arg1 >> 48]);
1439   return 0;
1440 }
1441
1442
1443 The following code is currently generated:
1444 FirstOnet:
1445         movl    8(%esp), %eax
1446         cmpl    $65536, %eax
1447         movl    4(%esp), %ecx
1448         jb      .LBB1_2 # UnifiedReturnBlock
1449 .LBB1_1:        # ifthen
1450         shrl    $16, %eax
1451         movzbl  first_one(%eax), %eax
1452         ret
1453 .LBB1_2:        # UnifiedReturnBlock
1454         xorl    %eax, %eax
1455         ret
1456
1457 There are a few possible improvements here:
1458 1. We should be able to eliminate the dead load into %ecx
1459 2. We could change the "movl 8(%esp), %eax" into
1460    "movzwl 10(%esp), %eax"; this lets us change the cmpl
1461    into a testl, which is shorter, and eliminate the shift.
1462
1463 We could also in theory eliminate the branch by using a conditional
1464 for the address of the load, but that seems unlikely to be worthwhile
1465 in general.
1466
1467 //===---------------------------------------------------------------------===//
1468
1469 We compile this function:
1470
1471 define i32 @foo(i32 %a, i32 %b, i32 %c, i8 zeroext  %d) nounwind  {
1472 entry:
1473         %tmp2 = icmp eq i8 %d, 0                ; <i1> [#uses=1]
1474         br i1 %tmp2, label %bb7, label %bb
1475
1476 bb:             ; preds = %entry
1477         %tmp6 = add i32 %b, %a          ; <i32> [#uses=1]
1478         ret i32 %tmp6
1479
1480 bb7:            ; preds = %entry
1481         %tmp10 = sub i32 %a, %c         ; <i32> [#uses=1]
1482         ret i32 %tmp10
1483 }
1484
1485 to:
1486
1487 _foo:
1488         cmpb    $0, 16(%esp)
1489         movl    12(%esp), %ecx
1490         movl    8(%esp), %eax
1491         movl    4(%esp), %edx
1492         je      LBB1_2  # bb7
1493 LBB1_1: # bb
1494         addl    %edx, %eax
1495         ret
1496 LBB1_2: # bb7
1497         movl    %edx, %eax
1498         subl    %ecx, %eax
1499         ret
1500
1501 The coalescer could coalesce "edx" with "eax" to avoid the movl in LBB1_2
1502 if it commuted the addl in LBB1_1.
1503
1504 //===---------------------------------------------------------------------===//
1505
1506 See rdar://4653682.
1507
1508 From flops:
1509
1510 LBB1_15:        # bb310
1511         cvtss2sd        LCPI1_0, %xmm1
1512         addsd   %xmm1, %xmm0
1513         movsd   176(%esp), %xmm2
1514         mulsd   %xmm0, %xmm2
1515         movapd  %xmm2, %xmm3
1516         mulsd   %xmm3, %xmm3
1517         movapd  %xmm3, %xmm4
1518         mulsd   LCPI1_23, %xmm4
1519         addsd   LCPI1_24, %xmm4
1520         mulsd   %xmm3, %xmm4
1521         addsd   LCPI1_25, %xmm4
1522         mulsd   %xmm3, %xmm4
1523         addsd   LCPI1_26, %xmm4
1524         mulsd   %xmm3, %xmm4
1525         addsd   LCPI1_27, %xmm4
1526         mulsd   %xmm3, %xmm4
1527         addsd   LCPI1_28, %xmm4
1528         mulsd   %xmm3, %xmm4
1529         addsd   %xmm1, %xmm4
1530         mulsd   %xmm2, %xmm4
1531         movsd   152(%esp), %xmm1
1532         addsd   %xmm4, %xmm1
1533         movsd   %xmm1, 152(%esp)
1534         incl    %eax
1535         cmpl    %eax, %esi
1536         jge     LBB1_15 # bb310
1537 LBB1_16:        # bb358.loopexit
1538         movsd   152(%esp), %xmm0
1539         addsd   %xmm0, %xmm0
1540         addsd   LCPI1_22, %xmm0
1541         movsd   %xmm0, 152(%esp)
1542
1543 Rather than spilling the result of the last addsd in the loop, we should have
1544 insert a copy to split the interval (one for the duration of the loop, one
1545 extending to the fall through). The register pressure in the loop isn't high
1546 enough to warrant the spill.
1547
1548 Also check why xmm7 is not used at all in the function.
1549
1550 //===---------------------------------------------------------------------===//
1551
1552 Legalize loses track of the fact that bools are always zero extended when in
1553 memory.  This causes us to compile abort_gzip (from 164.gzip) from:
1554
1555 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
1556 target triple = "i386-apple-darwin8"
1557 @in_exit.4870.b = internal global i1 false              ; <i1*> [#uses=2]
1558 define fastcc void @abort_gzip() noreturn nounwind  {
1559 entry:
1560         %tmp.b.i = load i1* @in_exit.4870.b             ; <i1> [#uses=1]
1561         br i1 %tmp.b.i, label %bb.i, label %bb4.i
1562 bb.i:           ; preds = %entry
1563         tail call void @exit( i32 1 ) noreturn nounwind
1564         unreachable
1565 bb4.i:          ; preds = %entry
1566         store i1 true, i1* @in_exit.4870.b
1567         tail call void @exit( i32 1 ) noreturn nounwind
1568         unreachable
1569 }
1570 declare void @exit(i32) noreturn nounwind
1571
1572 into:
1573
1574 _abort_gzip:
1575         subl    $12, %esp
1576         movb    _in_exit.4870.b, %al
1577         notb    %al
1578         testb   $1, %al
1579         jne     LBB1_2  ## bb4.i
1580 LBB1_1: ## bb.i
1581   ...
1582
1583 //===---------------------------------------------------------------------===//
1584
1585 We compile:
1586
1587 int test(int x, int y) {
1588   return x-y-1;
1589 }
1590
1591 into (-m64):
1592
1593 _test:
1594         decl    %edi
1595         movl    %edi, %eax
1596         subl    %esi, %eax
1597         ret
1598
1599 it would be better to codegen as: x+~y  (notl+addl)
1600
1601 //===---------------------------------------------------------------------===//
1602
1603 This code:
1604
1605 int foo(const char *str,...)
1606 {
1607  __builtin_va_list a; int x;
1608  __builtin_va_start(a,str); x = __builtin_va_arg(a,int); __builtin_va_end(a);
1609  return x;
1610 }
1611
1612 gets compiled into this on x86-64:
1613         subq    $200, %rsp
1614         movaps  %xmm7, 160(%rsp)
1615         movaps  %xmm6, 144(%rsp)
1616         movaps  %xmm5, 128(%rsp)
1617         movaps  %xmm4, 112(%rsp)
1618         movaps  %xmm3, 96(%rsp)
1619         movaps  %xmm2, 80(%rsp)
1620         movaps  %xmm1, 64(%rsp)
1621         movaps  %xmm0, 48(%rsp)
1622         movq    %r9, 40(%rsp)
1623         movq    %r8, 32(%rsp)
1624         movq    %rcx, 24(%rsp)
1625         movq    %rdx, 16(%rsp)
1626         movq    %rsi, 8(%rsp)
1627         leaq    (%rsp), %rax
1628         movq    %rax, 192(%rsp)
1629         leaq    208(%rsp), %rax
1630         movq    %rax, 184(%rsp)
1631         movl    $48, 180(%rsp)
1632         movl    $8, 176(%rsp)
1633         movl    176(%rsp), %eax
1634         cmpl    $47, %eax
1635         jbe     .LBB1_3 # bb
1636 .LBB1_1:        # bb3
1637         movq    184(%rsp), %rcx
1638         leaq    8(%rcx), %rax
1639         movq    %rax, 184(%rsp)
1640 .LBB1_2:        # bb4
1641         movl    (%rcx), %eax
1642         addq    $200, %rsp
1643         ret
1644 .LBB1_3:        # bb
1645         movl    %eax, %ecx
1646         addl    $8, %eax
1647         addq    192(%rsp), %rcx
1648         movl    %eax, 176(%rsp)
1649         jmp     .LBB1_2 # bb4
1650
1651 gcc 4.3 generates:
1652         subq    $96, %rsp
1653 .LCFI0:
1654         leaq    104(%rsp), %rax
1655         movq    %rsi, -80(%rsp)
1656         movl    $8, -120(%rsp)
1657         movq    %rax, -112(%rsp)
1658         leaq    -88(%rsp), %rax
1659         movq    %rax, -104(%rsp)
1660         movl    $8, %eax
1661         cmpl    $48, %eax
1662         jb      .L6
1663         movq    -112(%rsp), %rdx
1664         movl    (%rdx), %eax
1665         addq    $96, %rsp
1666         ret
1667         .p2align 4,,10
1668         .p2align 3
1669 .L6:
1670         mov     %eax, %edx
1671         addq    -104(%rsp), %rdx
1672         addl    $8, %eax
1673         movl    %eax, -120(%rsp)
1674         movl    (%rdx), %eax
1675         addq    $96, %rsp
1676         ret
1677
1678 and it gets compiled into this on x86:
1679         pushl   %ebp
1680         movl    %esp, %ebp
1681         subl    $4, %esp
1682         leal    12(%ebp), %eax
1683         movl    %eax, -4(%ebp)
1684         leal    16(%ebp), %eax
1685         movl    %eax, -4(%ebp)
1686         movl    12(%ebp), %eax
1687         addl    $4, %esp
1688         popl    %ebp
1689         ret
1690
1691 gcc 4.3 generates:
1692         pushl   %ebp
1693         movl    %esp, %ebp
1694         movl    12(%ebp), %eax
1695         popl    %ebp
1696         ret
1697
1698 //===---------------------------------------------------------------------===//
1699
1700 Teach tblgen not to check bitconvert source type in some cases. This allows us
1701 to consolidate the following patterns in X86InstrMMX.td:
1702
1703 def : Pat<(v2i32 (bitconvert (i64 (vector_extract (v2i64 VR128:$src),
1704                                                   (iPTR 0))))),
1705           (v2i32 (MMX_MOVDQ2Qrr VR128:$src))>;
1706 def : Pat<(v4i16 (bitconvert (i64 (vector_extract (v2i64 VR128:$src),
1707                                                   (iPTR 0))))),
1708           (v4i16 (MMX_MOVDQ2Qrr VR128:$src))>;
1709 def : Pat<(v8i8 (bitconvert (i64 (vector_extract (v2i64 VR128:$src),
1710                                                   (iPTR 0))))),
1711           (v8i8 (MMX_MOVDQ2Qrr VR128:$src))>;
1712
1713 There are other cases in various td files.
1714
1715 //===---------------------------------------------------------------------===//
1716
1717 Take something like the following on x86-32:
1718 unsigned a(unsigned long long x, unsigned y) {return x % y;}
1719
1720 We currently generate a libcall, but we really shouldn't: the expansion is
1721 shorter and likely faster than the libcall.  The expected code is something
1722 like the following:
1723
1724         movl    12(%ebp), %eax
1725         movl    16(%ebp), %ecx
1726         xorl    %edx, %edx
1727         divl    %ecx
1728         movl    8(%ebp), %eax
1729         divl    %ecx
1730         movl    %edx, %eax
1731         ret
1732
1733 A similar code sequence works for division.
1734
1735 //===---------------------------------------------------------------------===//
1736
1737 These should compile to the same code, but the later codegen's to useless
1738 instructions on X86. This may be a trivial dag combine (GCC PR7061):
1739
1740 struct s1 { unsigned char a, b; };
1741 unsigned long f1(struct s1 x) {
1742     return x.a + x.b;
1743 }
1744 struct s2 { unsigned a: 8, b: 8; };
1745 unsigned long f2(struct s2 x) {
1746     return x.a + x.b;
1747 }
1748
1749 //===---------------------------------------------------------------------===//
1750
1751 We currently compile this:
1752
1753 define i32 @func1(i32 %v1, i32 %v2) nounwind {
1754 entry:
1755   %t = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %v1, i32 %v2)
1756   %sum = extractvalue {i32, i1} %t, 0
1757   %obit = extractvalue {i32, i1} %t, 1
1758   br i1 %obit, label %overflow, label %normal
1759 normal:
1760   ret i32 %sum
1761 overflow:
1762   call void @llvm.trap()
1763   unreachable
1764 }
1765 declare {i32, i1} @llvm.sadd.with.overflow.i32(i32, i32)
1766 declare void @llvm.trap()
1767
1768 to:
1769
1770 _func1:
1771         movl    4(%esp), %eax
1772         addl    8(%esp), %eax
1773         jo      LBB1_2  ## overflow
1774 LBB1_1: ## normal
1775         ret
1776 LBB1_2: ## overflow
1777         ud2
1778
1779 it would be nice to produce "into" someday.
1780
1781 //===---------------------------------------------------------------------===//
1782
1783 This code:
1784
1785 void vec_mpys1(int y[], const int x[], int scaler) {
1786 int i;
1787 for (i = 0; i < 150; i++)
1788  y[i] += (((long long)scaler * (long long)x[i]) >> 31);
1789 }
1790
1791 Compiles to this loop with GCC 3.x:
1792
1793 .L5:
1794         movl    %ebx, %eax
1795         imull   (%edi,%ecx,4)
1796         shrdl   $31, %edx, %eax
1797         addl    %eax, (%esi,%ecx,4)
1798         incl    %ecx
1799         cmpl    $149, %ecx
1800         jle     .L5
1801
1802 llvm-gcc compiles it to the much uglier:
1803
1804 LBB1_1: ## bb1
1805         movl    24(%esp), %eax
1806         movl    (%eax,%edi,4), %ebx
1807         movl    %ebx, %ebp
1808         imull   %esi, %ebp
1809         movl    %ebx, %eax
1810         mull    %ecx
1811         addl    %ebp, %edx
1812         sarl    $31, %ebx
1813         imull   %ecx, %ebx
1814         addl    %edx, %ebx
1815         shldl   $1, %eax, %ebx
1816         movl    20(%esp), %eax
1817         addl    %ebx, (%eax,%edi,4)
1818         incl    %edi
1819         cmpl    $150, %edi
1820         jne     LBB1_1  ## bb1
1821
1822 //===---------------------------------------------------------------------===//
1823
1824 Test instructions can be eliminated by using EFLAGS values from arithmetic
1825 instructions. This is currently not done for mul, and, or, xor, neg, shl,
1826 sra, srl, shld, shrd, atomic ops, and others. It is also currently not done
1827 for read-modify-write instructions. It is also current not done if the
1828 OF or CF flags are needed.
1829
1830 The shift operators have the complication that when the shift count is
1831 zero, EFLAGS is not set, so they can only subsume a test instruction if
1832 the shift count is known to be non-zero. Also, using the EFLAGS value
1833 from a shift is apparently very slow on some x86 implementations.
1834
1835 In read-modify-write instructions, the root node in the isel match is
1836 the store, and isel has no way for the use of the EFLAGS result of the
1837 arithmetic to be remapped to the new node.
1838
1839 Add and subtract instructions set OF on signed overflow and CF on unsiged
1840 overflow, while test instructions always clear OF and CF. In order to
1841 replace a test with an add or subtract in a situation where OF or CF is
1842 needed, codegen must be able to prove that the operation cannot see
1843 signed or unsigned overflow, respectively.
1844
1845 //===---------------------------------------------------------------------===//
1846
1847 memcpy/memmove do not lower to SSE copies when possible.  A silly example is:
1848 define <16 x float> @foo(<16 x float> %A) nounwind {
1849         %tmp = alloca <16 x float>, align 16
1850         %tmp2 = alloca <16 x float>, align 16
1851         store <16 x float> %A, <16 x float>* %tmp
1852         %s = bitcast <16 x float>* %tmp to i8*
1853         %s2 = bitcast <16 x float>* %tmp2 to i8*
1854         call void @llvm.memcpy.i64(i8* %s, i8* %s2, i64 64, i32 16)
1855         %R = load <16 x float>* %tmp2
1856         ret <16 x float> %R
1857 }
1858
1859 declare void @llvm.memcpy.i64(i8* nocapture, i8* nocapture, i64, i32) nounwind
1860
1861 which compiles to:
1862
1863 _foo:
1864         subl    $140, %esp
1865         movaps  %xmm3, 112(%esp)
1866         movaps  %xmm2, 96(%esp)
1867         movaps  %xmm1, 80(%esp)
1868         movaps  %xmm0, 64(%esp)
1869         movl    60(%esp), %eax
1870         movl    %eax, 124(%esp)
1871         movl    56(%esp), %eax
1872         movl    %eax, 120(%esp)
1873         movl    52(%esp), %eax
1874         <many many more 32-bit copies>
1875         movaps  (%esp), %xmm0
1876         movaps  16(%esp), %xmm1
1877         movaps  32(%esp), %xmm2
1878         movaps  48(%esp), %xmm3
1879         addl    $140, %esp
1880         ret
1881
1882 On Nehalem, it may even be cheaper to just use movups when unaligned than to
1883 fall back to lower-granularity chunks.
1884
1885 //===---------------------------------------------------------------------===//
1886
1887 Implement processor-specific optimizations for parity with GCC on these
1888 processors.  GCC does two optimizations:
1889
1890 1. ix86_pad_returns inserts a noop before ret instructions if immediately
1891    preceeded by a conditional branch or is the target of a jump.
1892 2. ix86_avoid_jump_misspredicts inserts noops in cases where a 16-byte block of
1893    code contains more than 3 branches.
1894
1895 The first one is done for all AMDs, Core2, and "Generic"
1896 The second one is done for: Atom, Pentium Pro, all AMDs, Pentium 4, Nocona,
1897   Core 2, and "Generic"
1898
1899 //===---------------------------------------------------------------------===//