lib/Target/X86/README.txt

   1 //===---------------------------------------------------------------------===//
   2 // Random ideas for the X86 backend.
   3 //===---------------------------------------------------------------------===//
   4
   5 Missing features:
   6   - Support for SSE4: http://www.intel.com/software/penryn
   7 http://softwarecommunity.intel.com/isn/Downloads/Intel%20SSE4%20Programming%20Reference.pdf
   8   - support for 3DNow!
   9   - weird abis?
  10
  11 //===---------------------------------------------------------------------===//
  12
  13 CodeGen/X86/lea-3.ll:test3 should be a single LEA, not a shift/move.  The X86
  14 backend knows how to three-addressify this shift, but it appears the register
  15 allocator isn't even asking it to do so in this case.  We should investigate
  16 why this isn't happening, it could have significant impact on other important
  17 cases for X86 as well.
  18
  19 //===---------------------------------------------------------------------===//
  20
  21 This should be one DIV/IDIV instruction, not a libcall:
  22
  23 unsigned test(unsigned long long X, unsigned Y) {
  24         return X/Y;
  25 }
  26
  27 This can be done trivially with a custom legalizer.  What about overflow
  28 though?  http://gcc.gnu.org/bugzilla/show_bug.cgi?id=14224
  29
  30 //===---------------------------------------------------------------------===//
  31
  32 Improvements to the multiply -> shift/add algorithm:
  33 http://gcc.gnu.org/ml/gcc-patches/2004-08/msg01590.html
  34
  35 //===---------------------------------------------------------------------===//
  36
  37 Improve code like this (occurs fairly frequently, e.g. in LLVM):
  38 long long foo(int x) { return 1LL << x; }
  39
  40 http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01109.html
  41 http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01128.html
  42 http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01136.html
  43
  44 Another useful one would be  ~0ULL >> X and ~0ULL << X.
  45
  46 One better solution for 1LL << x is:
  47         xorl    %eax, %eax
  48         xorl    %edx, %edx
  49         testb   $32, %cl
  50         sete    %al
  51         setne   %dl
  52         sall    %cl, %eax
  53         sall    %cl, %edx
  54
  55 But that requires good 8-bit subreg support.
  56
  57 Also, this might be better.  It's an extra shift, but it's one instruction
  58 shorter, and doesn't stress 8-bit subreg support.
  59 (From http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01148.html,
  60 but without the unnecessary and.)
  61         movl %ecx, %eax
  62         shrl $5, %eax
  63         movl %eax, %edx
  64         xorl $1, %edx
  65         sall %cl, %eax
  66         sall %cl. %edx
  67
  68 64-bit shifts (in general) expand to really bad code.  Instead of using
  69 cmovs, we should expand to a conditional branch like GCC produces.
  70
  71 //===---------------------------------------------------------------------===//
  72
  73 Compile this:
  74 _Bool f(_Bool a) { return a!=1; }
  75
  76 into:
  77         movzbl  %dil, %eax
  78         xorl    $1, %eax
  79         ret
  80
  81 (Although note that this isn't a legal way to express the code that llvm-gcc
  82 currently generates for that function.)
  83
  84 //===---------------------------------------------------------------------===//
  85
  86 Some isel ideas:
  87
  88 1. Dynamic programming based approach when compile time if not an
  89    issue.
  90 2. Code duplication (addressing mode) during isel.
  91 3. Other ideas from "Register-Sensitive Selection, Duplication, and
  92    Sequencing of Instructions".
  93 4. Scheduling for reduced register pressure.  E.g. "Minimum Register
  94    Instruction Sequence Problem: Revisiting Optimal Code Generation for DAGs"
  95    and other related papers.
  96    http://citeseer.ist.psu.edu/govindarajan01minimum.html
  97
  98 //===---------------------------------------------------------------------===//
  99
 100 Should we promote i16 to i32 to avoid partial register update stalls?
 101
 102 //===---------------------------------------------------------------------===//
 103
 104 Leave any_extend as pseudo instruction and hint to register
 105 allocator. Delay codegen until post register allocation.
 106 Note. any_extend is now turned into an INSERT_SUBREG. We still need to teach
 107 the coalescer how to deal with it though.
 108
 109 //===---------------------------------------------------------------------===//
 110
 111 It appears icc use push for parameter passing. Need to investigate.
 112
 113 //===---------------------------------------------------------------------===//
 114
 115 Only use inc/neg/not instructions on processors where they are faster than
 116 add/sub/xor.  They are slower on the P4 due to only updating some processor
 117 flags.
 118
 119 //===---------------------------------------------------------------------===//
 120
 121 The instruction selector sometimes misses folding a load into a compare.  The
 122 pattern is written as (cmp reg, (load p)).  Because the compare isn't
 123 commutative, it is not matched with the load on both sides.  The dag combiner
 124 should be made smart enough to cannonicalize the load into the RHS of a compare
 125 when it can invert the result of the compare for free.
 126
 127 //===---------------------------------------------------------------------===//
 128
 129 How about intrinsics? An example is:
 130   *res = _mm_mulhi_epu16(*A, _mm_mul_epu32(*B, *C));
 131
 132 compiles to
 133         pmuludq (%eax), %xmm0
 134         movl 8(%esp), %eax
 135         movdqa (%eax), %xmm1
 136         pmulhuw %xmm0, %xmm1
 137
 138 The transformation probably requires a X86 specific pass or a DAG combiner
 139 target specific hook.
 140
 141 //===---------------------------------------------------------------------===//
 142
 143 In many cases, LLVM generates code like this:
 144
 145 _test:
 146         movl 8(%esp), %eax
 147         cmpl %eax, 4(%esp)
 148         setl %al
 149         movzbl %al, %eax
 150         ret
 151
 152 on some processors (which ones?), it is more efficient to do this:
 153
 154 _test:
 155         movl 8(%esp), %ebx
 156         xor  %eax, %eax
 157         cmpl %ebx, 4(%esp)
 158         setl %al
 159         ret
 160
 161 Doing this correctly is tricky though, as the xor clobbers the flags.
 162
 163 //===---------------------------------------------------------------------===//
 164
 165 We should generate bts/btr/etc instructions on targets where they are cheap or
 166 when codesize is important.  e.g., for:
 167
 168 void setbit(int *target, int bit) {
 169     *target |= (1 << bit);
 170 }
 171 void clearbit(int *target, int bit) {
 172     *target &= ~(1 << bit);
 173 }
 174
 175 //===---------------------------------------------------------------------===//
 176
 177 Instead of the following for memset char*, 1, 10:
 178
 179         movl $16843009, 4(%edx)
 180         movl $16843009, (%edx)
 181         movw $257, 8(%edx)
 182
 183 It might be better to generate
 184
 185         movl $16843009, %eax
 186         movl %eax, 4(%edx)
 187         movl %eax, (%edx)
 188         movw al, 8(%edx)
 189
 190 when we can spare a register. It reduces code size.
 191
 192 //===---------------------------------------------------------------------===//
 193
 194 Evaluate what the best way to codegen sdiv X, (2^C) is.  For X/8, we currently
 195 get this:
 196
 197 int %test1(int %X) {
 198         %Y = div int %X, 8
 199         ret int %Y
 200 }
 201
 202 _test1:
 203         movl 4(%esp), %eax
 204         movl %eax, %ecx
 205         sarl $31, %ecx
 206         shrl $29, %ecx
 207         addl %ecx, %eax
 208         sarl $3, %eax
 209         ret
 210
 211 GCC knows several different ways to codegen it, one of which is this:
 212
 213 _test1:
 214         movl    4(%esp), %eax
 215         cmpl    $-1, %eax
 216         leal    7(%eax), %ecx
 217         cmovle  %ecx, %eax
 218         sarl    $3, %eax
 219         ret
 220
 221 which is probably slower, but it's interesting at least :)
 222
 223 //===---------------------------------------------------------------------===//
 224
 225 We are currently lowering large (1MB+) memmove/memcpy to rep/stosl and rep/movsl
 226 We should leave these as libcalls for everything over a much lower threshold,
 227 since libc is hand tuned for medium and large mem ops (avoiding RFO for large
 228 stores, TLB preheating, etc)
 229
 230 //===---------------------------------------------------------------------===//
 231
 232 Optimize this into something reasonable:
 233  x * copysign(1.0, y) * copysign(1.0, z)
 234
 235 //===---------------------------------------------------------------------===//
 236
 237 Optimize copysign(x, *y) to use an integer load from y.
 238
 239 //===---------------------------------------------------------------------===//
 240
 241 %X = weak global int 0
 242
 243 void %foo(int %N) {
 244         %N = cast int %N to uint
 245         %tmp.24 = setgt int %N, 0
 246         br bool %tmp.24, label %no_exit, label %return
 247
 248 no_exit:
 249         %indvar = phi uint [ 0, %entry ], [ %indvar.next, %no_exit ]
 250         %i.0.0 = cast uint %indvar to int
 251         volatile store int %i.0.0, int* %X
 252         %indvar.next = add uint %indvar, 1
 253         %exitcond = seteq uint %indvar.next, %N
 254         br bool %exitcond, label %return, label %no_exit
 255
 256 return:
 257         ret void
 258 }
 259
 260 compiles into:
 261
 262         .text
 263         .align  4
 264         .globl  _foo
 265 _foo:
 266         movl 4(%esp), %eax
 267         cmpl $1, %eax
 268         jl LBB_foo_4    # return
 269 LBB_foo_1:      # no_exit.preheader
 270         xorl %ecx, %ecx
 271 LBB_foo_2:      # no_exit
 272         movl L_X$non_lazy_ptr, %edx
 273         movl %ecx, (%edx)
 274         incl %ecx
 275         cmpl %eax, %ecx
 276         jne LBB_foo_2   # no_exit
 277 LBB_foo_3:      # return.loopexit
 278 LBB_foo_4:      # return
 279         ret
 280
 281 We should hoist "movl L_X$non_lazy_ptr, %edx" out of the loop after
 282 remateralization is implemented. This can be accomplished with 1) a target
 283 dependent LICM pass or 2) makeing SelectDAG represent the whole function.
 284
 285 //===---------------------------------------------------------------------===//
 286
 287 The following tests perform worse with LSR:
 288
 289 lambda, siod, optimizer-eval, ackermann, hash2, nestedloop, strcat, and Treesor.
 290
 291 //===---------------------------------------------------------------------===//
 292
 293 We are generating far worse code than gcc:
 294
 295 volatile short X, Y;
 296
 297 void foo(int N) {
 298   int i;
 299   for (i = 0; i < N; i++) { X = i; Y = i*4; }
 300 }
 301
 302 LBB1_1: # entry.bb_crit_edge
 303         xorl    %ecx, %ecx
 304         xorw    %dx, %dx
 305 LBB1_2: # bb
 306         movl    L_X$non_lazy_ptr, %esi
 307         movw    %cx, (%esi)
 308         movl    L_Y$non_lazy_ptr, %esi
 309         movw    %dx, (%esi)
 310         addw    $4, %dx
 311         incl    %ecx
 312         cmpl    %eax, %ecx
 313         jne     LBB1_2  # bb
 314
 315 vs.
 316
 317         xorl    %edx, %edx
 318         movl    L_X$non_lazy_ptr-"L00000000001$pb"(%ebx), %esi
 319         movl    L_Y$non_lazy_ptr-"L00000000001$pb"(%ebx), %ecx
 320 L4:
 321         movw    %dx, (%esi)
 322         leal    0(,%edx,4), %eax
 323         movw    %ax, (%ecx)
 324         addl    $1, %edx
 325         cmpl    %edx, %edi
 326         jne     L4
 327
 328 This is due to the lack of post regalloc LICM.
 329
 330 //===---------------------------------------------------------------------===//
 331
 332 Teach the coalescer to coalesce vregs of different register classes. e.g. FR32 /
 333 FR64 to VR128.
 334
 335 //===---------------------------------------------------------------------===//
 336
 337 mov $reg, 48(%esp)
 338 ...
 339 leal 48(%esp), %eax
 340 mov %eax, (%esp)
 341 call _foo
 342
 343 Obviously it would have been better for the first mov (or any op) to store
 344 directly %esp[0] if there are no other uses.
 345
 346 //===---------------------------------------------------------------------===//
 347
 348 Adding to the list of cmp / test poor codegen issues:
 349
 350 int test(__m128 *A, __m128 *B) {
 351   if (_mm_comige_ss(*A, *B))
 352     return 3;
 353   else
 354     return 4;
 355 }
 356
 357 _test:
 358         movl 8(%esp), %eax
 359         movaps (%eax), %xmm0
 360         movl 4(%esp), %eax
 361         movaps (%eax), %xmm1
 362         comiss %xmm0, %xmm1
 363         setae %al
 364         movzbl %al, %ecx
 365         movl $3, %eax
 366         movl $4, %edx
 367         cmpl $0, %ecx
 368         cmove %edx, %eax
 369         ret
 370
 371 Note the setae, movzbl, cmpl, cmove can be replaced with a single cmovae. There
 372 are a number of issues. 1) We are introducing a setcc between the result of the
 373 intrisic call and select. 2) The intrinsic is expected to produce a i32 value
 374 so a any extend (which becomes a zero extend) is added.
 375
 376 We probably need some kind of target DAG combine hook to fix this.
 377
 378 //===---------------------------------------------------------------------===//
 379
 380 We generate significantly worse code for this than GCC:
 381 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=21150
 382 http://gcc.gnu.org/bugzilla/attachment.cgi?id=8701
 383
 384 There is also one case we do worse on PPC.
 385
 386 //===---------------------------------------------------------------------===//
 387
 388 If shorter, we should use things like:
 389 movzwl %ax, %eax
 390 instead of:
 391 andl $65535, %EAX
 392
 393 The former can also be used when the two-addressy nature of the 'and' would
 394 require a copy to be inserted (in X86InstrInfo::convertToThreeAddress).
 395
 396 //===---------------------------------------------------------------------===//
 397
 398 Another instruction selector deficiency:
 399
 400 void %bar() {
 401         %tmp = load int (int)** %foo
 402         %tmp = tail call int %tmp( int 3 )
 403         ret void
 404 }
 405
 406 _bar:
 407         subl $12, %esp
 408         movl L_foo$non_lazy_ptr, %eax
 409         movl (%eax), %eax
 410         call *%eax
 411         addl $12, %esp
 412         ret
 413
 414 The current isel scheme will not allow the load to be folded in the call since
 415 the load's chain result is read by the callseq_start.
 416
 417 //===---------------------------------------------------------------------===//
 418
 419 For this:
 420
 421 int test(int a)
 422 {
 423   return a * 3;
 424 }
 425
 426 We currently emits
 427         imull $3, 4(%esp), %eax
 428
 429 Perhaps this is what we really should generate is? Is imull three or four
 430 cycles? Note: ICC generates this:
 431         movl    4(%esp), %eax
 432         leal    (%eax,%eax,2), %eax
 433
 434 The current instruction priority is based on pattern complexity. The former is
 435 more "complex" because it folds a load so the latter will not be emitted.
 436
 437 Perhaps we should use AddedComplexity to give LEA32r a higher priority? We
 438 should always try to match LEA first since the LEA matching code does some
 439 estimate to determine whether the match is profitable.
 440
 441 However, if we care more about code size, then imull is better. It's two bytes
 442 shorter than movl + leal.
 443
 444 //===---------------------------------------------------------------------===//
 445
 446 __builtin_ffs codegen is messy.
 447
 448 int ffs_(unsigned X) { return __builtin_ffs(X); }
 449
 450 llvm produces:
 451 ffs_:
 452         movl    4(%esp), %ecx
 453         bsfl    %ecx, %eax
 454         movl    $32, %edx
 455         cmove   %edx, %eax
 456         incl    %eax
 457         xorl    %edx, %edx
 458         testl   %ecx, %ecx
 459         cmove   %edx, %eax
 460         ret
 461
 462 vs gcc:
 463
 464 _ffs_:
 465         movl    $-1, %edx
 466         bsfl    4(%esp), %eax
 467         cmove   %edx, %eax
 468         addl    $1, %eax
 469         ret
 470
 471 Another example of __builtin_ffs (use predsimplify to eliminate a select):
 472
 473 int foo (unsigned long j) {
 474   if (j)
 475     return __builtin_ffs (j) - 1;
 476   else
 477     return 0;
 478 }
 479
 480 //===---------------------------------------------------------------------===//
 481
 482 It appears gcc place string data with linkonce linkage in
 483 .section __TEXT,__const_coal,coalesced instead of
 484 .section __DATA,__const_coal,coalesced.
 485 Take a look at darwin.h, there are other Darwin assembler directives that we
 486 do not make use of.
 487
 488 //===---------------------------------------------------------------------===//
 489
 490 define i32 @foo(i32* %a, i32 %t) {
 491 entry:
 492         br label %cond_true
 493
 494 cond_true:              ; preds = %cond_true, %entry
 495         %x.0.0 = phi i32 [ 0, %entry ], [ %tmp9, %cond_true ]           ; <i32> [#uses=3]
 496         %t_addr.0.0 = phi i32 [ %t, %entry ], [ %tmp7, %cond_true ]             ; <i32> [#uses=1]
 497         %tmp2 = getelementptr i32* %a, i32 %x.0.0               ; <i32*> [#uses=1]
 498         %tmp3 = load i32* %tmp2         ; <i32> [#uses=1]
 499         %tmp5 = add i32 %t_addr.0.0, %x.0.0             ; <i32> [#uses=1]
 500         %tmp7 = add i32 %tmp5, %tmp3            ; <i32> [#uses=2]
 501         %tmp9 = add i32 %x.0.0, 1               ; <i32> [#uses=2]
 502         %tmp = icmp sgt i32 %tmp9, 39           ; <i1> [#uses=1]
 503         br i1 %tmp, label %bb12, label %cond_true
 504
 505 bb12:           ; preds = %cond_true
 506         ret i32 %tmp7
 507 }
 508 is pessimized by -loop-reduce and -indvars
 509
 510 //===---------------------------------------------------------------------===//
 511
 512 u32 to float conversion improvement:
 513
 514 float uint32_2_float( unsigned u ) {
 515   float fl = (int) (u & 0xffff);
 516   float fh = (int) (u >> 16);
 517   fh *= 0x1.0p16f;
 518   return fh + fl;
 519 }
 520
 521 00000000        subl    $0x04,%esp
 522 00000003        movl    0x08(%esp,1),%eax
 523 00000007        movl    %eax,%ecx
 524 00000009        shrl    $0x10,%ecx
 525 0000000c        cvtsi2ss        %ecx,%xmm0
 526 00000010        andl    $0x0000ffff,%eax
 527 00000015        cvtsi2ss        %eax,%xmm1
 528 00000019        mulss   0x00000078,%xmm0
 529 00000021        addss   %xmm1,%xmm0
 530 00000025        movss   %xmm0,(%esp,1)
 531 0000002a        flds    (%esp,1)
 532 0000002d        addl    $0x04,%esp
 533 00000030        ret
 534
 535 //===---------------------------------------------------------------------===//
 536
 537 When using fastcc abi, align stack slot of argument of type double on 8 byte
 538 boundary to improve performance.
 539
 540 //===---------------------------------------------------------------------===//
 541
 542 Codegen:
 543
 544 int f(int a, int b) {
 545   if (a == 4 || a == 6)
 546     b++;
 547   return b;
 548 }
 549
 550
 551 as:
 552
 553 or eax, 2
 554 cmp eax, 6
 555 jz label
 556
 557 //===---------------------------------------------------------------------===//
 558
 559 GCC's ix86_expand_int_movcc function (in i386.c) has a ton of interesting
 560 simplifications for integer "x cmp y ? a : b".  For example, instead of:
 561
 562 int G;
 563 void f(int X, int Y) {
 564   G = X < 0 ? 14 : 13;
 565 }
 566
 567 compiling to:
 568
 569 _f:
 570         movl $14, %eax
 571         movl $13, %ecx
 572         movl 4(%esp), %edx
 573         testl %edx, %edx
 574         cmovl %eax, %ecx
 575         movl %ecx, _G
 576         ret
 577
 578 it could be:
 579 _f:
 580         movl    4(%esp), %eax
 581         sarl    $31, %eax
 582         notl    %eax
 583         addl    $14, %eax
 584         movl    %eax, _G
 585         ret
 586
 587 etc.
 588
 589 Another is:
 590 int usesbb(unsigned int a, unsigned int b) {
 591        return (a < b ? -1 : 0);
 592 }
 593 to:
 594 _usesbb:
 595         movl    8(%esp), %eax
 596         cmpl    %eax, 4(%esp)
 597         sbbl    %eax, %eax
 598         ret
 599
 600 instead of:
 601 _usesbb:
 602         xorl    %eax, %eax
 603         movl    8(%esp), %ecx
 604         cmpl    %ecx, 4(%esp)
 605         movl    $4294967295, %ecx
 606         cmovb   %ecx, %eax
 607         ret
 608
 609 //===---------------------------------------------------------------------===//
 610
 611 Currently we don't have elimination of redundant stack manipulations. Consider
 612 the code:
 613
 614 int %main() {
 615 entry:
 616         call fastcc void %test1( )
 617         call fastcc void %test2( sbyte* cast (void ()* %test1 to sbyte*) )
 618         ret int 0
 619 }
 620
 621 declare fastcc void %test1()
 622
 623 declare fastcc void %test2(sbyte*)
 624
 625
 626 This currently compiles to:
 627
 628         subl $16, %esp
 629         call _test5
 630         addl $12, %esp
 631         subl $16, %esp
 632         movl $_test5, (%esp)
 633         call _test6
 634         addl $12, %esp
 635
 636 The add\sub pair is really unneeded here.
 637
 638 //===---------------------------------------------------------------------===//
 639
 640 Consider the expansion of:
 641
 642 define i32 @test3(i32 %X) {
 643         %tmp1 = urem i32 %X, 255
 644         ret i32 %tmp1
 645 }
 646
 647 Currently it compiles to:
 648
 649 ...
 650         movl $2155905153, %ecx
 651         movl 8(%esp), %esi
 652         movl %esi, %eax
 653         mull %ecx
 654 ...
 655
 656 This could be "reassociated" into:
 657
 658         movl $2155905153, %eax
 659         movl 8(%esp), %ecx
 660         mull %ecx
 661
 662 to avoid the copy.  In fact, the existing two-address stuff would do this
 663 except that mul isn't a commutative 2-addr instruction.  I guess this has
 664 to be done at isel time based on the #uses to mul?
 665
 666 //===---------------------------------------------------------------------===//
 667
 668 Make sure the instruction which starts a loop does not cross a cacheline
 669 boundary. This requires knowning the exact length of each machine instruction.
 670 That is somewhat complicated, but doable. Example 256.bzip2:
 671
 672 In the new trace, the hot loop has an instruction which crosses a cacheline
 673 boundary.  In addition to potential cache misses, this can't help decoding as I
 674 imagine there has to be some kind of complicated decoder reset and realignment
 675 to grab the bytes from the next cacheline.
 676
 677 532  532 0x3cfc movb     (1809(%esp, %esi), %bl   <<<--- spans 2 64 byte lines
 678 942  942 0x3d03 movl     %dh, (1809(%esp, %esi)
 679 937  937 0x3d0a incl     %esi
 680 3    3   0x3d0b cmpb     %bl, %dl
 681 27   27  0x3d0d jnz      0x000062db <main+11707>
 682
 683 //===---------------------------------------------------------------------===//
 684
 685 In c99 mode, the preprocessor doesn't like assembly comments like #TRUNCATE.
 686
 687 //===---------------------------------------------------------------------===//
 688
 689 This could be a single 16-bit load.
 690
 691 int f(char *p) {
 692     if ((p[0] == 1) & (p[1] == 2)) return 1;
 693     return 0;
 694 }
 695
 696 //===---------------------------------------------------------------------===//
 697
 698 We should inline lrintf and probably other libc functions.
 699
 700 //===---------------------------------------------------------------------===//
 701
 702 Start using the flags more.  For example, compile:
 703
 704 int add_zf(int *x, int y, int a, int b) {
 705      if ((*x += y) == 0)
 706           return a;
 707      else
 708           return b;
 709 }
 710
 711 to:
 712        addl    %esi, (%rdi)
 713        movl    %edx, %eax
 714        cmovne  %ecx, %eax
 715        ret
 716 instead of:
 717
 718 _add_zf:
 719         addl (%rdi), %esi
 720         movl %esi, (%rdi)
 721         testl %esi, %esi
 722         cmove %edx, %ecx
 723         movl %ecx, %eax
 724         ret
 725
 726 and:
 727
 728 int add_zf(int *x, int y, int a, int b) {
 729      if ((*x + y) < 0)
 730           return a;
 731      else
 732           return b;
 733 }
 734
 735 to:
 736
 737 add_zf:
 738         addl    (%rdi), %esi
 739         movl    %edx, %eax
 740         cmovns  %ecx, %eax
 741         ret
 742
 743 instead of:
 744
 745 _add_zf:
 746         addl (%rdi), %esi
 747         testl %esi, %esi
 748         cmovs %edx, %ecx
 749         movl %ecx, %eax
 750         ret
 751
 752 //===---------------------------------------------------------------------===//
 753
 754 These two functions have identical effects:
 755
 756 unsigned int f(unsigned int i, unsigned int n) {++i; if (i == n) ++i; return i;}
 757 unsigned int f2(unsigned int i, unsigned int n) {++i; i += i == n; return i;}
 758
 759 We currently compile them to:
 760
 761 _f:
 762         movl 4(%esp), %eax
 763         movl %eax, %ecx
 764         incl %ecx
 765         movl 8(%esp), %edx
 766         cmpl %edx, %ecx
 767         jne LBB1_2      #UnifiedReturnBlock
 768 LBB1_1: #cond_true
 769         addl $2, %eax
 770         ret
 771 LBB1_2: #UnifiedReturnBlock
 772         movl %ecx, %eax
 773         ret
 774 _f2:
 775         movl 4(%esp), %eax
 776         movl %eax, %ecx
 777         incl %ecx
 778         cmpl 8(%esp), %ecx
 779         sete %cl
 780         movzbl %cl, %ecx
 781         leal 1(%ecx,%eax), %eax
 782         ret
 783
 784 both of which are inferior to GCC's:
 785
 786 _f:
 787         movl    4(%esp), %edx
 788         leal    1(%edx), %eax
 789         addl    $2, %edx
 790         cmpl    8(%esp), %eax
 791         cmove   %edx, %eax
 792         ret
 793 _f2:
 794         movl    4(%esp), %eax
 795         addl    $1, %eax
 796         xorl    %edx, %edx
 797         cmpl    8(%esp), %eax
 798         sete    %dl
 799         addl    %edx, %eax
 800         ret
 801
 802 //===---------------------------------------------------------------------===//
 803
 804 This code:
 805
 806 void test(int X) {
 807   if (X) abort();
 808 }
 809
 810 is currently compiled to:
 811
 812 _test:
 813         subl $12, %esp
 814         cmpl $0, 16(%esp)
 815         jne LBB1_1
 816         addl $12, %esp
 817         ret
 818 LBB1_1:
 819         call L_abort$stub
 820
 821 It would be better to produce:
 822
 823 _test:
 824         subl $12, %esp
 825         cmpl $0, 16(%esp)
 826         jne L_abort$stub
 827         addl $12, %esp
 828         ret
 829
 830 This can be applied to any no-return function call that takes no arguments etc.
 831 Alternatively, the stack save/restore logic could be shrink-wrapped, producing
 832 something like this:
 833
 834 _test:
 835         cmpl $0, 4(%esp)
 836         jne LBB1_1
 837         ret
 838 LBB1_1:
 839         subl $12, %esp
 840         call L_abort$stub
 841
 842 Both are useful in different situations.  Finally, it could be shrink-wrapped
 843 and tail called, like this:
 844
 845 _test:
 846         cmpl $0, 4(%esp)
 847         jne LBB1_1
 848         ret
 849 LBB1_1:
 850         pop %eax   # realign stack.
 851         call L_abort$stub
 852
 853 Though this probably isn't worth it.
 854
 855 //===---------------------------------------------------------------------===//
 856
 857 We need to teach the codegen to convert two-address INC instructions to LEA
 858 when the flags are dead (likewise dec).  For example, on X86-64, compile:
 859
 860 int foo(int A, int B) {
 861   return A+1;
 862 }
 863
 864 to:
 865
 866 _foo:
 867         leal    1(%edi), %eax
 868         ret
 869
 870 instead of:
 871
 872 _foo:
 873         incl %edi
 874         movl %edi, %eax
 875         ret
 876
 877 Another example is:
 878
 879 ;; X's live range extends beyond the shift, so the register allocator
 880 ;; cannot coalesce it with Y.  Because of this, a copy needs to be
 881 ;; emitted before the shift to save the register value before it is
 882 ;; clobbered.  However, this copy is not needed if the register
 883 ;; allocator turns the shift into an LEA.  This also occurs for ADD.
 884
 885 ; Check that the shift gets turned into an LEA.
 886 ; RUN: llvm-as < %s | llc -march=x86 -x86-asm-syntax=intel | \
 887 ; RUN:   not grep {mov E.X, E.X}
 888
 889 @G = external global i32                ; <i32*> [#uses=3]
 890
 891 define i32 @test1(i32 %X, i32 %Y) {
 892         %Z = add i32 %X, %Y             ; <i32> [#uses=1]
 893         volatile store i32 %Y, i32* @G
 894         volatile store i32 %Z, i32* @G
 895         ret i32 %X
 896 }
 897
 898 define i32 @test2(i32 %X) {
 899         %Z = add i32 %X, 1              ; <i32> [#uses=1]
 900         volatile store i32 %Z, i32* @G
 901         ret i32 %X
 902 }
 903
 904 //===---------------------------------------------------------------------===//
 905
 906 Sometimes it is better to codegen subtractions from a constant (e.g. 7-x) with
 907 a neg instead of a sub instruction.  Consider:
 908
 909 int test(char X) { return 7-X; }
 910
 911 we currently produce:
 912 _test:
 913         movl $7, %eax
 914         movsbl 4(%esp), %ecx
 915         subl %ecx, %eax
 916         ret
 917
 918 We would use one fewer register if codegen'd as:
 919
 920         movsbl 4(%esp), %eax
 921         neg %eax
 922         add $7, %eax
 923         ret
 924
 925 Note that this isn't beneficial if the load can be folded into the sub.  In
 926 this case, we want a sub:
 927
 928 int test(int X) { return 7-X; }
 929 _test:
 930         movl $7, %eax
 931         subl 4(%esp), %eax
 932         ret
 933
 934 //===---------------------------------------------------------------------===//
 935
 936 This is a "commutable two-address" register coallescing deficiency:
 937
 938 define <4 x float> @test1(<4 x float> %V) {
 939 entry:
 940         %tmp8 = shufflevector <4 x float> %V, <4 x float> undef,
 941                                         <4 x i32> < i32 3, i32 2, i32 1, i32 0 >
 942         %add = add <4 x float> %tmp8, %V
 943         ret <4 x float> %add
 944 }
 945
 946 this codegens to:
 947
 948 _test1:
 949         pshufd  $27, %xmm0, %xmm1
 950         addps   %xmm0, %xmm1
 951         movaps  %xmm1, %xmm0
 952         ret
 953
 954 instead of:
 955
 956 _test1:
 957         pshufd  $27, %xmm0, %xmm1
 958         addps   %xmm1, %xmm0
 959         ret
 960
 961 //===---------------------------------------------------------------------===//
 962
 963 Leaf functions that require one 4-byte spill slot have a prolog like this:
 964
 965 _foo:
 966         pushl   %esi
 967         subl    $4, %esp
 968 ...
 969 and an epilog like this:
 970         addl    $4, %esp
 971         popl    %esi
 972         ret
 973
 974 It would be smaller, and potentially faster, to push eax on entry and to
 975 pop into a dummy register instead of using addl/subl of esp.  Just don't pop
 976 into any return registers :)
 977
 978 //===---------------------------------------------------------------------===//
 979
 980 The X86 backend should fold (branch (or (setcc, setcc))) into multiple
 981 branches.  We generate really poor code for:
 982
 983 double testf(double a) {
 984        return a == 0.0 ? 0.0 : (a > 0.0 ? 1.0 : -1.0);
 985 }
 986
 987 For example, the entry BB is:
 988
 989 _testf:
 990         subl    $20, %esp
 991         pxor    %xmm0, %xmm0
 992         movsd   24(%esp), %xmm1
 993         ucomisd %xmm0, %xmm1
 994         setnp   %al
 995         sete    %cl
 996         testb   %cl, %al
 997         jne     LBB1_5  # UnifiedReturnBlock
 998 LBB1_1: # cond_true
 999
1000
1001 it would be better to replace the last four instructions with:
1002
1003         jp LBB1_1
1004         je LBB1_5
1005 LBB1_1:
1006
1007 We also codegen the inner ?: into a diamond:
1008
1009        cvtss2sd        LCPI1_0(%rip), %xmm2
1010         cvtss2sd        LCPI1_1(%rip), %xmm3
1011         ucomisd %xmm1, %xmm0
1012         ja      LBB1_3  # cond_true
1013 LBB1_2: # cond_true
1014         movapd  %xmm3, %xmm2
1015 LBB1_3: # cond_true
1016         movapd  %xmm2, %xmm0
1017         ret
1018
1019 We should sink the load into xmm3 into the LBB1_2 block.  This should
1020 be pretty easy, and will nuke all the copies.
1021
1022 //===---------------------------------------------------------------------===//
1023
1024 This:
1025         #include <algorithm>
1026         inline std::pair<unsigned, bool> full_add(unsigned a, unsigned b)
1027         { return std::make_pair(a + b, a + b < a); }
1028         bool no_overflow(unsigned a, unsigned b)
1029         { return !full_add(a, b).second; }
1030
1031 Should compile to:
1032
1033
1034         _Z11no_overflowjj:
1035                 addl    %edi, %esi
1036                 setae   %al
1037                 ret
1038
1039 FIXME: That code looks wrong; bool return is normally defined as zext.
1040
1041 on x86-64, not:
1042
1043 __Z11no_overflowjj:
1044         addl    %edi, %esi
1045         cmpl    %edi, %esi
1046         setae   %al
1047         movzbl  %al, %eax
1048         ret
1049
1050
1051 //===---------------------------------------------------------------------===//
1052
1053 Re-materialize MOV32r0 etc. with xor instead of changing them to moves if the
1054 condition register is dead. xor reg reg is shorter than mov reg, #0.
1055
1056 //===---------------------------------------------------------------------===//
1057
1058 We aren't matching RMW instructions aggressively
1059 enough.  Here's a reduced testcase (more in PR1160):
1060
1061 define void @test(i32* %huge_ptr, i32* %target_ptr) {
1062         %A = load i32* %huge_ptr                ; <i32> [#uses=1]
1063         %B = load i32* %target_ptr              ; <i32> [#uses=1]
1064         %C = or i32 %A, %B              ; <i32> [#uses=1]
1065         store i32 %C, i32* %target_ptr
1066         ret void
1067 }
1068
1069 $ llvm-as < t.ll | llc -march=x86-64
1070
1071 _test:
1072         movl (%rdi), %eax
1073         orl (%rsi), %eax
1074         movl %eax, (%rsi)
1075         ret
1076
1077 That should be something like:
1078
1079 _test:
1080         movl (%rdi), %eax
1081         orl %eax, (%rsi)
1082         ret
1083
1084 //===---------------------------------------------------------------------===//
1085
1086 The following code:
1087
1088 bb114.preheader:                ; preds = %cond_next94
1089         %tmp231232 = sext i16 %tmp62 to i32             ; <i32> [#uses=1]
1090         %tmp233 = sub i32 32, %tmp231232                ; <i32> [#uses=1]
1091         %tmp245246 = sext i16 %tmp65 to i32             ; <i32> [#uses=1]
1092         %tmp252253 = sext i16 %tmp68 to i32             ; <i32> [#uses=1]
1093         %tmp254 = sub i32 32, %tmp252253                ; <i32> [#uses=1]
1094         %tmp553554 = bitcast i16* %tmp37 to i8*         ; <i8*> [#uses=2]
1095         %tmp583584 = sext i16 %tmp98 to i32             ; <i32> [#uses=1]
1096         %tmp585 = sub i32 32, %tmp583584                ; <i32> [#uses=1]
1097         %tmp614615 = sext i16 %tmp101 to i32            ; <i32> [#uses=1]
1098         %tmp621622 = sext i16 %tmp104 to i32            ; <i32> [#uses=1]
1099         %tmp623 = sub i32 32, %tmp621622                ; <i32> [#uses=1]
1100         br label %bb114
1101
1102 produces:
1103
1104 LBB3_5: # bb114.preheader
1105         movswl  -68(%ebp), %eax
1106         movl    $32, %ecx
1107         movl    %ecx, -80(%ebp)
1108         subl    %eax, -80(%ebp)
1109         movswl  -52(%ebp), %eax
1110         movl    %ecx, -84(%ebp)
1111         subl    %eax, -84(%ebp)
1112         movswl  -70(%ebp), %eax
1113         movl    %ecx, -88(%ebp)
1114         subl    %eax, -88(%ebp)
1115         movswl  -50(%ebp), %eax
1116         subl    %eax, %ecx
1117         movl    %ecx, -76(%ebp)
1118         movswl  -42(%ebp), %eax
1119         movl    %eax, -92(%ebp)
1120         movswl  -66(%ebp), %eax
1121         movl    %eax, -96(%ebp)
1122         movw    $0, -98(%ebp)
1123
1124 This appears to be bad because the RA is not folding the store to the stack
1125 slot into the movl.  The above instructions could be:
1126         movl    $32, -80(%ebp)
1127 ...
1128         movl    $32, -84(%ebp)
1129 ...
1130 This seems like a cross between remat and spill folding.
1131
1132 This has redundant subtractions of %eax from a stack slot. However, %ecx doesn't
1133 change, so we could simply subtract %eax from %ecx first and then use %ecx (or
1134 vice-versa).
1135
1136 //===---------------------------------------------------------------------===//
1137
1138 For this code:
1139
1140 cond_next603:           ; preds = %bb493, %cond_true336, %cond_next599
1141         %v.21050.1 = phi i32 [ %v.21050.0, %cond_next599 ], [ %tmp344, %cond_true336 ], [ %v.2, %bb493 ]                ; <i32> [#uses=1]
1142         %maxz.21051.1 = phi i32 [ %maxz.21051.0, %cond_next599 ], [ 0, %cond_true336 ], [ %maxz.2, %bb493 ]             ; <i32> [#uses=2]
1143         %cnt.01055.1 = phi i32 [ %cnt.01055.0, %cond_next599 ], [ 0, %cond_true336 ], [ %cnt.0, %bb493 ]                ; <i32> [#uses=2]
1144         %byteptr.9 = phi i8* [ %byteptr.12, %cond_next599 ], [ %byteptr.0, %cond_true336 ], [ %byteptr.10, %bb493 ]             ; <i8*> [#uses=9]
1145         %bitptr.6 = phi i32 [ %tmp5571104.1, %cond_next599 ], [ %tmp4921049, %cond_true336 ], [ %bitptr.7, %bb493 ]             ; <i32> [#uses=4]
1146         %source.5 = phi i32 [ %tmp602, %cond_next599 ], [ %source.0, %cond_true336 ], [ %source.6, %bb493 ]             ; <i32> [#uses=7]
1147         %tmp606 = getelementptr %struct.const_tables* @tables, i32 0, i32 0, i32 %cnt.01055.1           ; <i8*> [#uses=1]
1148         %tmp607 = load i8* %tmp606, align 1             ; <i8> [#uses=1]
1149
1150 We produce this:
1151
1152 LBB4_70:        # cond_next603
1153         movl    -20(%ebp), %esi
1154         movl    L_tables$non_lazy_ptr-"L4$pb"(%esi), %esi
1155
1156 However, ICC caches this information before the loop and produces this:
1157
1158         movl      88(%esp), %eax                                #481.12
1159
1160 //===---------------------------------------------------------------------===//
1161
1162 This code:
1163
1164         %tmp659 = icmp slt i16 %tmp654, 0               ; <i1> [#uses=1]
1165         br i1 %tmp659, label %cond_true662, label %cond_next715
1166
1167 produces this:
1168
1169         testw   %cx, %cx
1170         movswl  %cx, %esi
1171         jns     LBB4_109        # cond_next715
1172
1173 Shark tells us that using %cx in the testw instruction is sub-optimal. It
1174 suggests using the 32-bit register (which is what ICC uses).
1175
1176 //===---------------------------------------------------------------------===//
1177
1178 We compile this:
1179
1180 void compare (long long foo) {
1181   if (foo < 4294967297LL)
1182     abort();
1183 }
1184
1185 to:
1186
1187 compare:
1188         subl    $4, %esp
1189         cmpl    $0, 8(%esp)
1190         setne   %al
1191         movzbw  %al, %ax
1192         cmpl    $1, 12(%esp)
1193         setg    %cl
1194         movzbw  %cl, %cx
1195         cmove   %ax, %cx
1196         testb   $1, %cl
1197         jne     .LBB1_2 # UnifiedReturnBlock
1198 .LBB1_1:        # ifthen
1199         call    abort
1200 .LBB1_2:        # UnifiedReturnBlock
1201         addl    $4, %esp
1202         ret
1203
1204 (also really horrible code on ppc).  This is due to the expand code for 64-bit
1205 compares.  GCC produces multiple branches, which is much nicer:
1206
1207 compare:
1208         subl    $12, %esp
1209         movl    20(%esp), %edx
1210         movl    16(%esp), %eax
1211         decl    %edx
1212         jle     .L7
1213 .L5:
1214         addl    $12, %esp
1215         ret
1216         .p2align 4,,7
1217 .L7:
1218         jl      .L4
1219         cmpl    $0, %eax
1220         .p2align 4,,8
1221         ja      .L5
1222 .L4:
1223         .p2align 4,,9
1224         call    abort
1225
1226 //===---------------------------------------------------------------------===//
1227
1228 Tail call optimization improvements: Tail call optimization currently
1229 pushes all arguments on the top of the stack (their normal place for
1230 non-tail call optimized calls) that source from the callers arguments
1231 or  that source from a virtual register (also possibly sourcing from
1232 callers arguments).
1233 This is done to prevent overwriting of parameters (see example
1234 below) that might be used later.
1235
1236 example:
1237
1238 int callee(int32, int64);
1239 int caller(int32 arg1, int32 arg2) {
1240   int64 local = arg2 * 2;
1241   return callee(arg2, (int64)local);
1242 }
1243
1244 [arg1]          [!arg2 no longer valid since we moved local onto it]
1245 [arg2]      ->  [(int64)
1246 [RETADDR]        local  ]
1247
1248 Moving arg1 onto the stack slot of callee function would overwrite
1249 arg2 of the caller.
1250
1251 Possible optimizations:
1252
1253
1254  - Analyse the actual parameters of the callee to see which would
1255    overwrite a caller parameter which is used by the callee and only
1256    push them onto the top of the stack.
1257
1258    int callee (int32 arg1, int32 arg2);
1259    int caller (int32 arg1, int32 arg2) {
1260        return callee(arg1,arg2);
1261    }
1262
1263    Here we don't need to write any variables to the top of the stack
1264    since they don't overwrite each other.
1265
1266    int callee (int32 arg1, int32 arg2);
1267    int caller (int32 arg1, int32 arg2) {
1268        return callee(arg2,arg1);
1269    }
1270
1271    Here we need to push the arguments because they overwrite each
1272    other.
1273
1274 //===---------------------------------------------------------------------===//
1275
1276 main ()
1277 {
1278   int i = 0;
1279   unsigned long int z = 0;
1280
1281   do {
1282     z -= 0x00004000;
1283     i++;
1284     if (i > 0x00040000)
1285       abort ();
1286   } while (z > 0);
1287   exit (0);
1288 }
1289
1290 gcc compiles this to:
1291
1292 _main:
1293         subl    $28, %esp
1294         xorl    %eax, %eax
1295         jmp     L2
1296 L3:
1297         cmpl    $262144, %eax
1298         je      L10
1299 L2:
1300         addl    $1, %eax
1301         cmpl    $262145, %eax
1302         jne     L3
1303         call    L_abort$stub
1304 L10:
1305         movl    $0, (%esp)
1306         call    L_exit$stub
1307
1308 llvm:
1309
1310 _main:
1311         subl    $12, %esp
1312         movl    $1, %eax
1313         movl    $16384, %ecx
1314 LBB1_1: # bb
1315         cmpl    $262145, %eax
1316         jge     LBB1_4  # cond_true
1317 LBB1_2: # cond_next
1318         incl    %eax
1319         addl    $4294950912, %ecx
1320         cmpl    $16384, %ecx
1321         jne     LBB1_1  # bb
1322 LBB1_3: # bb11
1323         xorl    %eax, %eax
1324         addl    $12, %esp
1325         ret
1326 LBB1_4: # cond_true
1327         call    L_abort$stub
1328
1329 1. LSR should rewrite the first cmp with induction variable %ecx.
1330 2. DAG combiner should fold
1331         leal    1(%eax), %edx
1332         cmpl    $262145, %edx
1333    =>
1334         cmpl    $262144, %eax
1335
1336 //===---------------------------------------------------------------------===//
1337
1338 define i64 @test(double %X) {
1339         %Y = fptosi double %X to i64
1340         ret i64 %Y
1341 }
1342
1343 compiles to:
1344
1345 _test:
1346         subl    $20, %esp
1347         movsd   24(%esp), %xmm0
1348         movsd   %xmm0, 8(%esp)
1349         fldl    8(%esp)
1350         fisttpll        (%esp)
1351         movl    4(%esp), %edx
1352         movl    (%esp), %eax
1353         addl    $20, %esp
1354         #FP_REG_KILL
1355         ret
1356
1357 This should just fldl directly from the input stack slot.
1358
1359 //===---------------------------------------------------------------------===//
1360
1361 This code:
1362 int foo (int x) { return (x & 65535) | 255; }
1363
1364 Should compile into:
1365
1366 _foo:
1367         movzwl  4(%esp), %eax
1368         orl     $255, %eax
1369         ret
1370
1371 instead of:
1372 _foo:
1373         movl    $255, %eax
1374         orl     4(%esp), %eax
1375         andl    $65535, %eax
1376         ret
1377
1378 //===---------------------------------------------------------------------===//
1379
1380 We're codegen'ing multiply of long longs inefficiently:
1381
1382 unsigned long long LLM(unsigned long long arg1, unsigned long long arg2) {
1383   return arg1 *  arg2;
1384 }
1385
1386 We compile to (fomit-frame-pointer):
1387
1388 _LLM:
1389         pushl   %esi
1390         movl    8(%esp), %ecx
1391         movl    16(%esp), %esi
1392         movl    %esi, %eax
1393         mull    %ecx
1394         imull   12(%esp), %esi
1395         addl    %edx, %esi
1396         imull   20(%esp), %ecx
1397         movl    %esi, %edx
1398         addl    %ecx, %edx
1399         popl    %esi
1400         ret
1401
1402 This looks like a scheduling deficiency and lack of remat of the load from
1403 the argument area.  ICC apparently produces:
1404
1405         movl      8(%esp), %ecx
1406         imull     12(%esp), %ecx
1407         movl      16(%esp), %eax
1408         imull     4(%esp), %eax
1409         addl      %eax, %ecx
1410         movl      4(%esp), %eax
1411         mull      12(%esp)
1412         addl      %ecx, %edx
1413         ret
1414
1415 Note that it remat'd loads from 4(esp) and 12(esp).  See this GCC PR:
1416 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=17236
1417
1418 //===---------------------------------------------------------------------===//
1419
1420 We can fold a store into "zeroing a reg".  Instead of:
1421
1422 xorl    %eax, %eax
1423 movl    %eax, 124(%esp)
1424
1425 we should get:
1426
1427 movl    $0, 124(%esp)
1428
1429 if the flags of the xor are dead.
1430
1431 Likewise, we isel "x<<1" into "add reg,reg".  If reg is spilled, this should
1432 be folded into: shl [mem], 1
1433
1434 //===---------------------------------------------------------------------===//
1435
1436 This testcase misses a read/modify/write opportunity (from PR1425):
1437
1438 void vertical_decompose97iH1(int *b0, int *b1, int *b2, int width){
1439     int i;
1440     for(i=0; i<width; i++)
1441         b1[i] += (1*(b0[i] + b2[i])+0)>>0;
1442 }
1443
1444 We compile it down to:
1445
1446 LBB1_2: # bb
1447         movl    (%esi,%edi,4), %ebx
1448         addl    (%ecx,%edi,4), %ebx
1449         addl    (%edx,%edi,4), %ebx
1450         movl    %ebx, (%ecx,%edi,4)
1451         incl    %edi
1452         cmpl    %eax, %edi
1453         jne     LBB1_2  # bb
1454
1455 the inner loop should add to the memory location (%ecx,%edi,4), saving
1456 a mov.  Something like:
1457
1458         movl    (%esi,%edi,4), %ebx
1459         addl    (%edx,%edi,4), %ebx
1460         addl    %ebx, (%ecx,%edi,4)
1461
1462 Here is another interesting example:
1463
1464 void vertical_compose97iH1(int *b0, int *b1, int *b2, int width){
1465     int i;
1466     for(i=0; i<width; i++)
1467         b1[i] -= (1*(b0[i] + b2[i])+0)>>0;
1468 }
1469
1470 We miss the r/m/w opportunity here by using 2 subs instead of an add+sub[mem]:
1471
1472 LBB9_2: # bb
1473         movl    (%ecx,%edi,4), %ebx
1474         subl    (%esi,%edi,4), %ebx
1475         subl    (%edx,%edi,4), %ebx
1476         movl    %ebx, (%ecx,%edi,4)
1477         incl    %edi
1478         cmpl    %eax, %edi
1479         jne     LBB9_2  # bb
1480
1481 Additionally, LSR should rewrite the exit condition of these loops to use
1482 a stride-4 IV, would would allow all the scales in the loop to go away.
1483 This would result in smaller code and more efficient microops.
1484
1485 //===---------------------------------------------------------------------===//
1486
1487 In SSE mode, we turn abs and neg into a load from the constant pool plus a xor
1488 or and instruction, for example:
1489
1490         xorpd   LCPI1_0, %xmm2
1491
1492 However, if xmm2 gets spilled, we end up with really ugly code like this:
1493
1494         movsd   (%esp), %xmm0
1495         xorpd   LCPI1_0, %xmm0
1496         movsd   %xmm0, (%esp)
1497
1498 Since we 'know' that this is a 'neg', we can actually "fold" the spill into
1499 the neg/abs instruction, turning it into an *integer* operation, like this:
1500
1501         xorl 2147483648, [mem+4]     ## 2147483648 = (1 << 31)
1502
1503 you could also use xorb, but xorl is less likely to lead to a partial register
1504 stall.  Here is a contrived testcase:
1505
1506 double a, b, c;
1507 void test(double *P) {
1508   double X = *P;
1509   a = X;
1510   bar();
1511   X = -X;
1512   b = X;
1513   bar();
1514   c = X;
1515 }
1516
1517 //===---------------------------------------------------------------------===//
1518
1519 handling llvm.memory.barrier on pre SSE2 cpus
1520
1521 should generate:
1522 lock ; mov %esp, %esp
1523
1524 //===---------------------------------------------------------------------===//
1525
1526 The generated code on x86 for checking for signed overflow on a multiply the
1527 obvious way is much longer than it needs to be.
1528
1529 int x(int a, int b) {
1530   long long prod = (long long)a*b;
1531   return  prod > 0x7FFFFFFF || prod < (-0x7FFFFFFF-1);
1532 }
1533
1534 See PR2053 for more details.
1535
1536 //===---------------------------------------------------------------------===//
1537
1538 We should investigate using cdq/ctld (effect: edx = sar eax, 31)
1539 more aggressively; it should cost the same as a move+shift on any modern
1540 processor, but it's a lot shorter. Downside is that it puts more
1541 pressure on register allocation because it has fixed operands.
1542
1543 Example:
1544 int abs(int x) {return x < 0 ? -x : x;}
1545
1546 gcc compiles this to the following when using march/mtune=pentium2/3/4/m/etc.:
1547 abs:
1548         movl    4(%esp), %eax
1549         cltd
1550         xorl    %edx, %eax
1551         subl    %edx, %eax
1552         ret
1553
1554 //===---------------------------------------------------------------------===//
1555
1556 Consider:
1557 int test(unsigned long a, unsigned long b) { return -(a < b); }
1558
1559 We currently compile this to:
1560
1561 define i32 @test(i32 %a, i32 %b) nounwind  {
1562         %tmp3 = icmp ult i32 %a, %b             ; <i1> [#uses=1]
1563         %tmp34 = zext i1 %tmp3 to i32           ; <i32> [#uses=1]
1564         %tmp5 = sub i32 0, %tmp34               ; <i32> [#uses=1]
1565         ret i32 %tmp5
1566 }
1567
1568 and
1569
1570 _test:
1571         movl    8(%esp), %eax
1572         cmpl    %eax, 4(%esp)
1573         setb    %al
1574         movzbl  %al, %eax
1575         negl    %eax
1576         ret
1577
1578 Several deficiencies here.  First, we should instcombine zext+neg into sext:
1579
1580 define i32 @test2(i32 %a, i32 %b) nounwind  {
1581         %tmp3 = icmp ult i32 %a, %b             ; <i1> [#uses=1]
1582         %tmp34 = sext i1 %tmp3 to i32           ; <i32> [#uses=1]
1583         ret i32 %tmp34
1584 }
1585
1586 However, before we can do that, we have to fix the bad codegen that we get for
1587 sext from bool:
1588
1589 _test2:
1590         movl    8(%esp), %eax
1591         cmpl    %eax, 4(%esp)
1592         setb    %al
1593         movzbl  %al, %eax
1594         shll    $31, %eax
1595         sarl    $31, %eax
1596         ret
1597
1598 This code should be at least as good as the code above.  Once this is fixed, we
1599 can optimize this specific case even more to:
1600
1601         movl    8(%esp), %eax
1602         xorl    %ecx, %ecx
1603         cmpl    %eax, 4(%esp)
1604         sbbl    %ecx, %ecx
1605
1606 //===---------------------------------------------------------------------===//