lib/Target/X86/README.txt

   1 //===---------------------------------------------------------------------===//
   2 // Random ideas for the X86 backend.
   3 //===---------------------------------------------------------------------===//
   4
   5 Missing features:
   6   - Support for SSE4: http://www.intel.com/software/penryn
   7 http://softwarecommunity.intel.com/isn/Downloads/Intel%20SSE4%20Programming%20Reference.pdf
   8   - support for 3DNow!
   9   - weird abis?
  10
  11 //===---------------------------------------------------------------------===//
  12
  13 CodeGen/X86/lea-3.ll:test3 should be a single LEA, not a shift/move.  The X86
  14 backend knows how to three-addressify this shift, but it appears the register
  15 allocator isn't even asking it to do so in this case.  We should investigate
  16 why this isn't happening, it could have significant impact on other important
  17 cases for X86 as well.
  18
  19 //===---------------------------------------------------------------------===//
  20
  21 This should be one DIV/IDIV instruction, not a libcall:
  22
  23 unsigned test(unsigned long long X, unsigned Y) {
  24         return X/Y;
  25 }
  26
  27 This can be done trivially with a custom legalizer.  What about overflow
  28 though?  http://gcc.gnu.org/bugzilla/show_bug.cgi?id=14224
  29
  30 //===---------------------------------------------------------------------===//
  31
  32 Improvements to the multiply -> shift/add algorithm:
  33 http://gcc.gnu.org/ml/gcc-patches/2004-08/msg01590.html
  34
  35 //===---------------------------------------------------------------------===//
  36
  37 Improve code like this (occurs fairly frequently, e.g. in LLVM):
  38 long long foo(int x) { return 1LL << x; }
  39
  40 http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01109.html
  41 http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01128.html
  42 http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01136.html
  43
  44 Another useful one would be  ~0ULL >> X and ~0ULL << X.
  45
  46 One better solution for 1LL << x is:
  47         xorl    %eax, %eax
  48         xorl    %edx, %edx
  49         testb   $32, %cl
  50         sete    %al
  51         setne   %dl
  52         sall    %cl, %eax
  53         sall    %cl, %edx
  54
  55 But that requires good 8-bit subreg support.
  56
  57 64-bit shifts (in general) expand to really bad code.  Instead of using
  58 cmovs, we should expand to a conditional branch like GCC produces.
  59
  60 //===---------------------------------------------------------------------===//
  61
  62 Compile this:
  63 _Bool f(_Bool a) { return a!=1; }
  64
  65 into:
  66         movzbl  %dil, %eax
  67         xorl    $1, %eax
  68         ret
  69
  70 //===---------------------------------------------------------------------===//
  71
  72 Some isel ideas:
  73
  74 1. Dynamic programming based approach when compile time if not an
  75    issue.
  76 2. Code duplication (addressing mode) during isel.
  77 3. Other ideas from "Register-Sensitive Selection, Duplication, and
  78    Sequencing of Instructions".
  79 4. Scheduling for reduced register pressure.  E.g. "Minimum Register
  80    Instruction Sequence Problem: Revisiting Optimal Code Generation for DAGs"
  81    and other related papers.
  82    http://citeseer.ist.psu.edu/govindarajan01minimum.html
  83
  84 //===---------------------------------------------------------------------===//
  85
  86 Should we promote i16 to i32 to avoid partial register update stalls?
  87
  88 //===---------------------------------------------------------------------===//
  89
  90 Leave any_extend as pseudo instruction and hint to register
  91 allocator. Delay codegen until post register allocation.
  92 Note. any_extend is now turned into an INSERT_SUBREG. We still need to teach
  93 the coalescer how to deal with it though.
  94
  95 //===---------------------------------------------------------------------===//
  96
  97 Count leading zeros and count trailing zeros:
  98
  99 int clz(int X) { return __builtin_clz(X); }
 100 int ctz(int X) { return __builtin_ctz(X); }
 101
 102 $ gcc t.c -S -o - -O3  -fomit-frame-pointer -masm=intel
 103 clz:
 104         bsr     %eax, DWORD PTR [%esp+4]
 105         xor     %eax, 31
 106         ret
 107 ctz:
 108         bsf     %eax, DWORD PTR [%esp+4]
 109         ret
 110
 111 however, check that these are defined for 0 and 32.  Our intrinsics are, GCC's
 112 aren't.
 113
 114 Another example (use predsimplify to eliminate a select):
 115
 116 int foo (unsigned long j) {
 117   if (j)
 118     return __builtin_ffs (j) - 1;
 119   else
 120     return 0;
 121 }
 122
 123 //===---------------------------------------------------------------------===//
 124
 125 It appears icc use push for parameter passing. Need to investigate.
 126
 127 //===---------------------------------------------------------------------===//
 128
 129 Only use inc/neg/not instructions on processors where they are faster than
 130 add/sub/xor.  They are slower on the P4 due to only updating some processor
 131 flags.
 132
 133 //===---------------------------------------------------------------------===//
 134
 135 The instruction selector sometimes misses folding a load into a compare.  The
 136 pattern is written as (cmp reg, (load p)).  Because the compare isn't
 137 commutative, it is not matched with the load on both sides.  The dag combiner
 138 should be made smart enough to cannonicalize the load into the RHS of a compare
 139 when it can invert the result of the compare for free.
 140
 141 //===---------------------------------------------------------------------===//
 142
 143 How about intrinsics? An example is:
 144   *res = _mm_mulhi_epu16(*A, _mm_mul_epu32(*B, *C));
 145
 146 compiles to
 147         pmuludq (%eax), %xmm0
 148         movl 8(%esp), %eax
 149         movdqa (%eax), %xmm1
 150         pmulhuw %xmm0, %xmm1
 151
 152 The transformation probably requires a X86 specific pass or a DAG combiner
 153 target specific hook.
 154
 155 //===---------------------------------------------------------------------===//
 156
 157 In many cases, LLVM generates code like this:
 158
 159 _test:
 160         movl 8(%esp), %eax
 161         cmpl %eax, 4(%esp)
 162         setl %al
 163         movzbl %al, %eax
 164         ret
 165
 166 on some processors (which ones?), it is more efficient to do this:
 167
 168 _test:
 169         movl 8(%esp), %ebx
 170         xor  %eax, %eax
 171         cmpl %ebx, 4(%esp)
 172         setl %al
 173         ret
 174
 175 Doing this correctly is tricky though, as the xor clobbers the flags.
 176
 177 //===---------------------------------------------------------------------===//
 178
 179 We should generate bts/btr/etc instructions on targets where they are cheap or
 180 when codesize is important.  e.g., for:
 181
 182 void setbit(int *target, int bit) {
 183     *target |= (1 << bit);
 184 }
 185 void clearbit(int *target, int bit) {
 186     *target &= ~(1 << bit);
 187 }
 188
 189 //===---------------------------------------------------------------------===//
 190
 191 Instead of the following for memset char*, 1, 10:
 192
 193         movl $16843009, 4(%edx)
 194         movl $16843009, (%edx)
 195         movw $257, 8(%edx)
 196
 197 It might be better to generate
 198
 199         movl $16843009, %eax
 200         movl %eax, 4(%edx)
 201         movl %eax, (%edx)
 202         movw al, 8(%edx)
 203
 204 when we can spare a register. It reduces code size.
 205
 206 //===---------------------------------------------------------------------===//
 207
 208 Evaluate what the best way to codegen sdiv X, (2^C) is.  For X/8, we currently
 209 get this:
 210
 211 int %test1(int %X) {
 212         %Y = div int %X, 8
 213         ret int %Y
 214 }
 215
 216 _test1:
 217         movl 4(%esp), %eax
 218         movl %eax, %ecx
 219         sarl $31, %ecx
 220         shrl $29, %ecx
 221         addl %ecx, %eax
 222         sarl $3, %eax
 223         ret
 224
 225 GCC knows several different ways to codegen it, one of which is this:
 226
 227 _test1:
 228         movl    4(%esp), %eax
 229         cmpl    $-1, %eax
 230         leal    7(%eax), %ecx
 231         cmovle  %ecx, %eax
 232         sarl    $3, %eax
 233         ret
 234
 235 which is probably slower, but it's interesting at least :)
 236
 237 //===---------------------------------------------------------------------===//
 238
 239 The first BB of this code:
 240
 241 declare bool %foo()
 242 int %bar() {
 243         %V = call bool %foo()
 244         br bool %V, label %T, label %F
 245 T:
 246         ret int 1
 247 F:
 248         call bool %foo()
 249         ret int 12
 250 }
 251
 252 compiles to:
 253
 254 _bar:
 255         subl $12, %esp
 256         call L_foo$stub
 257         xorb $1, %al
 258         testb %al, %al
 259         jne LBB_bar_2   # F
 260
 261 It would be better to emit "cmp %al, 1" than a xor and test.
 262
 263 //===---------------------------------------------------------------------===//
 264
 265 We are currently lowering large (1MB+) memmove/memcpy to rep/stosl and rep/movsl
 266 We should leave these as libcalls for everything over a much lower threshold,
 267 since libc is hand tuned for medium and large mem ops (avoiding RFO for large
 268 stores, TLB preheating, etc)
 269
 270 //===---------------------------------------------------------------------===//
 271
 272 Optimize this into something reasonable:
 273  x * copysign(1.0, y) * copysign(1.0, z)
 274
 275 //===---------------------------------------------------------------------===//
 276
 277 Optimize copysign(x, *y) to use an integer load from y.
 278
 279 //===---------------------------------------------------------------------===//
 280
 281 %X = weak global int 0
 282
 283 void %foo(int %N) {
 284         %N = cast int %N to uint
 285         %tmp.24 = setgt int %N, 0
 286         br bool %tmp.24, label %no_exit, label %return
 287
 288 no_exit:
 289         %indvar = phi uint [ 0, %entry ], [ %indvar.next, %no_exit ]
 290         %i.0.0 = cast uint %indvar to int
 291         volatile store int %i.0.0, int* %X
 292         %indvar.next = add uint %indvar, 1
 293         %exitcond = seteq uint %indvar.next, %N
 294         br bool %exitcond, label %return, label %no_exit
 295
 296 return:
 297         ret void
 298 }
 299
 300 compiles into:
 301
 302         .text
 303         .align  4
 304         .globl  _foo
 305 _foo:
 306         movl 4(%esp), %eax
 307         cmpl $1, %eax
 308         jl LBB_foo_4    # return
 309 LBB_foo_1:      # no_exit.preheader
 310         xorl %ecx, %ecx
 311 LBB_foo_2:      # no_exit
 312         movl L_X$non_lazy_ptr, %edx
 313         movl %ecx, (%edx)
 314         incl %ecx
 315         cmpl %eax, %ecx
 316         jne LBB_foo_2   # no_exit
 317 LBB_foo_3:      # return.loopexit
 318 LBB_foo_4:      # return
 319         ret
 320
 321 We should hoist "movl L_X$non_lazy_ptr, %edx" out of the loop after
 322 remateralization is implemented. This can be accomplished with 1) a target
 323 dependent LICM pass or 2) makeing SelectDAG represent the whole function.
 324
 325 //===---------------------------------------------------------------------===//
 326
 327 The following tests perform worse with LSR:
 328
 329 lambda, siod, optimizer-eval, ackermann, hash2, nestedloop, strcat, and Treesor.
 330
 331 //===---------------------------------------------------------------------===//
 332
 333 We are generating far worse code than gcc:
 334
 335 volatile short X, Y;
 336
 337 void foo(int N) {
 338   int i;
 339   for (i = 0; i < N; i++) { X = i; Y = i*4; }
 340 }
 341
 342 LBB1_1: # entry.bb_crit_edge
 343         xorl    %ecx, %ecx
 344         xorw    %dx, %dx
 345 LBB1_2: # bb
 346         movl    L_X$non_lazy_ptr, %esi
 347         movw    %cx, (%esi)
 348         movl    L_Y$non_lazy_ptr, %esi
 349         movw    %dx, (%esi)
 350         addw    $4, %dx
 351         incl    %ecx
 352         cmpl    %eax, %ecx
 353         jne     LBB1_2  # bb
 354
 355 vs.
 356
 357         xorl    %edx, %edx
 358         movl    L_X$non_lazy_ptr-"L00000000001$pb"(%ebx), %esi
 359         movl    L_Y$non_lazy_ptr-"L00000000001$pb"(%ebx), %ecx
 360 L4:
 361         movw    %dx, (%esi)
 362         leal    0(,%edx,4), %eax
 363         movw    %ax, (%ecx)
 364         addl    $1, %edx
 365         cmpl    %edx, %edi
 366         jne     L4
 367
 368 This is due to the lack of post regalloc LICM.
 369
 370 //===---------------------------------------------------------------------===//
 371
 372 Teach the coalescer to coalesce vregs of different register classes. e.g. FR32 /
 373 FR64 to VR128.
 374
 375 //===---------------------------------------------------------------------===//
 376
 377 mov $reg, 48(%esp)
 378 ...
 379 leal 48(%esp), %eax
 380 mov %eax, (%esp)
 381 call _foo
 382
 383 Obviously it would have been better for the first mov (or any op) to store
 384 directly %esp[0] if there are no other uses.
 385
 386 //===---------------------------------------------------------------------===//
 387
 388 Adding to the list of cmp / test poor codegen issues:
 389
 390 int test(__m128 *A, __m128 *B) {
 391   if (_mm_comige_ss(*A, *B))
 392     return 3;
 393   else
 394     return 4;
 395 }
 396
 397 _test:
 398         movl 8(%esp), %eax
 399         movaps (%eax), %xmm0
 400         movl 4(%esp), %eax
 401         movaps (%eax), %xmm1
 402         comiss %xmm0, %xmm1
 403         setae %al
 404         movzbl %al, %ecx
 405         movl $3, %eax
 406         movl $4, %edx
 407         cmpl $0, %ecx
 408         cmove %edx, %eax
 409         ret
 410
 411 Note the setae, movzbl, cmpl, cmove can be replaced with a single cmovae. There
 412 are a number of issues. 1) We are introducing a setcc between the result of the
 413 intrisic call and select. 2) The intrinsic is expected to produce a i32 value
 414 so a any extend (which becomes a zero extend) is added.
 415
 416 We probably need some kind of target DAG combine hook to fix this.
 417
 418 //===---------------------------------------------------------------------===//
 419
 420 We generate significantly worse code for this than GCC:
 421 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=21150
 422 http://gcc.gnu.org/bugzilla/attachment.cgi?id=8701
 423
 424 There is also one case we do worse on PPC.
 425
 426 //===---------------------------------------------------------------------===//
 427
 428 If shorter, we should use things like:
 429 movzwl %ax, %eax
 430 instead of:
 431 andl $65535, %EAX
 432
 433 The former can also be used when the two-addressy nature of the 'and' would
 434 require a copy to be inserted (in X86InstrInfo::convertToThreeAddress).
 435
 436 //===---------------------------------------------------------------------===//
 437
 438 Consider this:
 439
 440 typedef struct pair { float A, B; } pair;
 441 void pairtest(pair P, float *FP) {
 442         *FP = P.A+P.B;
 443 }
 444
 445 We currently generate this code with llvmgcc4:
 446
 447 _pairtest:
 448         movl 8(%esp), %eax
 449         movl 4(%esp), %ecx
 450         movd %eax, %xmm0
 451         movd %ecx, %xmm1
 452         addss %xmm0, %xmm1
 453         movl 12(%esp), %eax
 454         movss %xmm1, (%eax)
 455         ret
 456
 457 we should be able to generate:
 458 _pairtest:
 459         movss 4(%esp), %xmm0
 460         movl 12(%esp), %eax
 461         addss 8(%esp), %xmm0
 462         movss %xmm0, (%eax)
 463         ret
 464
 465 The issue is that llvmgcc4 is forcing the struct to memory, then passing it as
 466 integer chunks.  It does this so that structs like {short,short} are passed in
 467 a single 32-bit integer stack slot.  We should handle the safe cases above much
 468 nicer, while still handling the hard cases.
 469
 470 While true in general, in this specific case we could do better by promoting
 471 load int + bitcast to float -> load fload.  This basically needs alignment info,
 472 the code is already implemented (but disabled) in dag combine).
 473
 474 //===---------------------------------------------------------------------===//
 475
 476 Another instruction selector deficiency:
 477
 478 void %bar() {
 479         %tmp = load int (int)** %foo
 480         %tmp = tail call int %tmp( int 3 )
 481         ret void
 482 }
 483
 484 _bar:
 485         subl $12, %esp
 486         movl L_foo$non_lazy_ptr, %eax
 487         movl (%eax), %eax
 488         call *%eax
 489         addl $12, %esp
 490         ret
 491
 492 The current isel scheme will not allow the load to be folded in the call since
 493 the load's chain result is read by the callseq_start.
 494
 495 //===---------------------------------------------------------------------===//
 496
 497 For this:
 498
 499 int test(int a)
 500 {
 501   return a * 3;
 502 }
 503
 504 We currently emits
 505         imull $3, 4(%esp), %eax
 506
 507 Perhaps this is what we really should generate is? Is imull three or four
 508 cycles? Note: ICC generates this:
 509         movl    4(%esp), %eax
 510         leal    (%eax,%eax,2), %eax
 511
 512 The current instruction priority is based on pattern complexity. The former is
 513 more "complex" because it folds a load so the latter will not be emitted.
 514
 515 Perhaps we should use AddedComplexity to give LEA32r a higher priority? We
 516 should always try to match LEA first since the LEA matching code does some
 517 estimate to determine whether the match is profitable.
 518
 519 However, if we care more about code size, then imull is better. It's two bytes
 520 shorter than movl + leal.
 521
 522 //===---------------------------------------------------------------------===//
 523
 524 Implement CTTZ, CTLZ with bsf and bsr. GCC produces:
 525
 526 int ctz_(unsigned X) { return __builtin_ctz(X); }
 527 int clz_(unsigned X) { return __builtin_clz(X); }
 528 int ffs_(unsigned X) { return __builtin_ffs(X); }
 529
 530 _ctz_:
 531         bsfl    4(%esp), %eax
 532         ret
 533 _clz_:
 534         bsrl    4(%esp), %eax
 535         xorl    $31, %eax
 536         ret
 537 _ffs_:
 538         movl    $-1, %edx
 539         bsfl    4(%esp), %eax
 540         cmove   %edx, %eax
 541         addl    $1, %eax
 542         ret
 543
 544 //===---------------------------------------------------------------------===//
 545
 546 It appears gcc place string data with linkonce linkage in
 547 .section __TEXT,__const_coal,coalesced instead of
 548 .section __DATA,__const_coal,coalesced.
 549 Take a look at darwin.h, there are other Darwin assembler directives that we
 550 do not make use of.
 551
 552 //===---------------------------------------------------------------------===//
 553
 554 int %foo(int* %a, int %t) {
 555 entry:
 556         br label %cond_true
 557
 558 cond_true:              ; preds = %cond_true, %entry
 559         %x.0.0 = phi int [ 0, %entry ], [ %tmp9, %cond_true ]
 560         %t_addr.0.0 = phi int [ %t, %entry ], [ %tmp7, %cond_true ]
 561         %tmp2 = getelementptr int* %a, int %x.0.0
 562         %tmp3 = load int* %tmp2         ; <int> [#uses=1]
 563         %tmp5 = add int %t_addr.0.0, %x.0.0             ; <int> [#uses=1]
 564         %tmp7 = add int %tmp5, %tmp3            ; <int> [#uses=2]
 565         %tmp9 = add int %x.0.0, 1               ; <int> [#uses=2]
 566         %tmp = setgt int %tmp9, 39              ; <bool> [#uses=1]
 567         br bool %tmp, label %bb12, label %cond_true
 568
 569 bb12:           ; preds = %cond_true
 570         ret int %tmp7
 571 }
 572
 573 is pessimized by -loop-reduce and -indvars
 574
 575 //===---------------------------------------------------------------------===//
 576
 577 u32 to float conversion improvement:
 578
 579 float uint32_2_float( unsigned u ) {
 580   float fl = (int) (u & 0xffff);
 581   float fh = (int) (u >> 16);
 582   fh *= 0x1.0p16f;
 583   return fh + fl;
 584 }
 585
 586 00000000        subl    $0x04,%esp
 587 00000003        movl    0x08(%esp,1),%eax
 588 00000007        movl    %eax,%ecx
 589 00000009        shrl    $0x10,%ecx
 590 0000000c        cvtsi2ss        %ecx,%xmm0
 591 00000010        andl    $0x0000ffff,%eax
 592 00000015        cvtsi2ss        %eax,%xmm1
 593 00000019        mulss   0x00000078,%xmm0
 594 00000021        addss   %xmm1,%xmm0
 595 00000025        movss   %xmm0,(%esp,1)
 596 0000002a        flds    (%esp,1)
 597 0000002d        addl    $0x04,%esp
 598 00000030        ret
 599
 600 //===---------------------------------------------------------------------===//
 601
 602 When using fastcc abi, align stack slot of argument of type double on 8 byte
 603 boundary to improve performance.
 604
 605 //===---------------------------------------------------------------------===//
 606
 607 Codegen:
 608
 609 int f(int a, int b) {
 610   if (a == 4 || a == 6)
 611     b++;
 612   return b;
 613 }
 614
 615
 616 as:
 617
 618 or eax, 2
 619 cmp eax, 6
 620 jz label
 621
 622 //===---------------------------------------------------------------------===//
 623
 624 GCC's ix86_expand_int_movcc function (in i386.c) has a ton of interesting
 625 simplifications for integer "x cmp y ? a : b".  For example, instead of:
 626
 627 int G;
 628 void f(int X, int Y) {
 629   G = X < 0 ? 14 : 13;
 630 }
 631
 632 compiling to:
 633
 634 _f:
 635         movl $14, %eax
 636         movl $13, %ecx
 637         movl 4(%esp), %edx
 638         testl %edx, %edx
 639         cmovl %eax, %ecx
 640         movl %ecx, _G
 641         ret
 642
 643 it could be:
 644 _f:
 645         movl    4(%esp), %eax
 646         sarl    $31, %eax
 647         notl    %eax
 648         addl    $14, %eax
 649         movl    %eax, _G
 650         ret
 651
 652 etc.
 653
 654 //===---------------------------------------------------------------------===//
 655
 656 Currently we don't have elimination of redundant stack manipulations. Consider
 657 the code:
 658
 659 int %main() {
 660 entry:
 661         call fastcc void %test1( )
 662         call fastcc void %test2( sbyte* cast (void ()* %test1 to sbyte*) )
 663         ret int 0
 664 }
 665
 666 declare fastcc void %test1()
 667
 668 declare fastcc void %test2(sbyte*)
 669
 670
 671 This currently compiles to:
 672
 673         subl $16, %esp
 674         call _test5
 675         addl $12, %esp
 676         subl $16, %esp
 677         movl $_test5, (%esp)
 678         call _test6
 679         addl $12, %esp
 680
 681 The add\sub pair is really unneeded here.
 682
 683 //===---------------------------------------------------------------------===//
 684
 685 We currently compile sign_extend_inreg into two shifts:
 686
 687 long foo(long X) {
 688   return (long)(signed char)X;
 689 }
 690
 691 becomes:
 692
 693 _foo:
 694         movl 4(%esp), %eax
 695         shll $24, %eax
 696         sarl $24, %eax
 697         ret
 698
 699 This could be:
 700
 701 _foo:
 702         movsbl  4(%esp),%eax
 703         ret
 704
 705 //===---------------------------------------------------------------------===//
 706
 707 Consider the expansion of:
 708
 709 uint %test3(uint %X) {
 710         %tmp1 = rem uint %X, 255
 711         ret uint %tmp1
 712 }
 713
 714 Currently it compiles to:
 715
 716 ...
 717         movl $2155905153, %ecx
 718         movl 8(%esp), %esi
 719         movl %esi, %eax
 720         mull %ecx
 721 ...
 722
 723 This could be "reassociated" into:
 724
 725         movl $2155905153, %eax
 726         movl 8(%esp), %ecx
 727         mull %ecx
 728
 729 to avoid the copy.  In fact, the existing two-address stuff would do this
 730 except that mul isn't a commutative 2-addr instruction.  I guess this has
 731 to be done at isel time based on the #uses to mul?
 732
 733 //===---------------------------------------------------------------------===//
 734
 735 Make sure the instruction which starts a loop does not cross a cacheline
 736 boundary. This requires knowning the exact length of each machine instruction.
 737 That is somewhat complicated, but doable. Example 256.bzip2:
 738
 739 In the new trace, the hot loop has an instruction which crosses a cacheline
 740 boundary.  In addition to potential cache misses, this can't help decoding as I
 741 imagine there has to be some kind of complicated decoder reset and realignment
 742 to grab the bytes from the next cacheline.
 743
 744 532  532 0x3cfc movb     (1809(%esp, %esi), %bl   <<<--- spans 2 64 byte lines
 745 942  942 0x3d03 movl     %dh, (1809(%esp, %esi)
 746 937  937 0x3d0a incl     %esi
 747 3    3   0x3d0b cmpb     %bl, %dl
 748 27   27  0x3d0d jnz      0x000062db <main+11707>
 749
 750 //===---------------------------------------------------------------------===//
 751
 752 In c99 mode, the preprocessor doesn't like assembly comments like #TRUNCATE.
 753
 754 //===---------------------------------------------------------------------===//
 755
 756 This could be a single 16-bit load.
 757
 758 int f(char *p) {
 759     if ((p[0] == 1) & (p[1] == 2)) return 1;
 760     return 0;
 761 }
 762
 763 //===---------------------------------------------------------------------===//
 764
 765 We should inline lrintf and probably other libc functions.
 766
 767 //===---------------------------------------------------------------------===//
 768
 769 Start using the flags more.  For example, compile:
 770
 771 int add_zf(int *x, int y, int a, int b) {
 772      if ((*x += y) == 0)
 773           return a;
 774      else
 775           return b;
 776 }
 777
 778 to:
 779        addl    %esi, (%rdi)
 780        movl    %edx, %eax
 781        cmovne  %ecx, %eax
 782        ret
 783 instead of:
 784
 785 _add_zf:
 786         addl (%rdi), %esi
 787         movl %esi, (%rdi)
 788         testl %esi, %esi
 789         cmove %edx, %ecx
 790         movl %ecx, %eax
 791         ret
 792
 793 and:
 794
 795 int add_zf(int *x, int y, int a, int b) {
 796      if ((*x + y) < 0)
 797           return a;
 798      else
 799           return b;
 800 }
 801
 802 to:
 803
 804 add_zf:
 805         addl    (%rdi), %esi
 806         movl    %edx, %eax
 807         cmovns  %ecx, %eax
 808         ret
 809
 810 instead of:
 811
 812 _add_zf:
 813         addl (%rdi), %esi
 814         testl %esi, %esi
 815         cmovs %edx, %ecx
 816         movl %ecx, %eax
 817         ret
 818
 819 //===---------------------------------------------------------------------===//
 820
 821 This:
 822 #include <math.h>
 823 int foo(double X) { return isnan(X); }
 824
 825 compiles to (-m64):
 826
 827 _foo:
 828         pxor %xmm1, %xmm1
 829         ucomisd %xmm1, %xmm0
 830         setp %al
 831         movzbl %al, %eax
 832         ret
 833
 834 the pxor is not needed, we could compare the value against itself.
 835
 836 //===---------------------------------------------------------------------===//
 837
 838 These two functions have identical effects:
 839
 840 unsigned int f(unsigned int i, unsigned int n) {++i; if (i == n) ++i; return i;}
 841 unsigned int f2(unsigned int i, unsigned int n) {++i; i += i == n; return i;}
 842
 843 We currently compile them to:
 844
 845 _f:
 846         movl 4(%esp), %eax
 847         movl %eax, %ecx
 848         incl %ecx
 849         movl 8(%esp), %edx
 850         cmpl %edx, %ecx
 851         jne LBB1_2      #UnifiedReturnBlock
 852 LBB1_1: #cond_true
 853         addl $2, %eax
 854         ret
 855 LBB1_2: #UnifiedReturnBlock
 856         movl %ecx, %eax
 857         ret
 858 _f2:
 859         movl 4(%esp), %eax
 860         movl %eax, %ecx
 861         incl %ecx
 862         cmpl 8(%esp), %ecx
 863         sete %cl
 864         movzbl %cl, %ecx
 865         leal 1(%ecx,%eax), %eax
 866         ret
 867
 868 both of which are inferior to GCC's:
 869
 870 _f:
 871         movl    4(%esp), %edx
 872         leal    1(%edx), %eax
 873         addl    $2, %edx
 874         cmpl    8(%esp), %eax
 875         cmove   %edx, %eax
 876         ret
 877 _f2:
 878         movl    4(%esp), %eax
 879         addl    $1, %eax
 880         xorl    %edx, %edx
 881         cmpl    8(%esp), %eax
 882         sete    %dl
 883         addl    %edx, %eax
 884         ret
 885
 886 //===---------------------------------------------------------------------===//
 887
 888 This code:
 889
 890 void test(int X) {
 891   if (X) abort();
 892 }
 893
 894 is currently compiled to:
 895
 896 _test:
 897         subl $12, %esp
 898         cmpl $0, 16(%esp)
 899         jne LBB1_1
 900         addl $12, %esp
 901         ret
 902 LBB1_1:
 903         call L_abort$stub
 904
 905 It would be better to produce:
 906
 907 _test:
 908         subl $12, %esp
 909         cmpl $0, 16(%esp)
 910         jne L_abort$stub
 911         addl $12, %esp
 912         ret
 913
 914 This can be applied to any no-return function call that takes no arguments etc.
 915 Alternatively, the stack save/restore logic could be shrink-wrapped, producing
 916 something like this:
 917
 918 _test:
 919         cmpl $0, 4(%esp)
 920         jne LBB1_1
 921         ret
 922 LBB1_1:
 923         subl $12, %esp
 924         call L_abort$stub
 925
 926 Both are useful in different situations.  Finally, it could be shrink-wrapped
 927 and tail called, like this:
 928
 929 _test:
 930         cmpl $0, 4(%esp)
 931         jne LBB1_1
 932         ret
 933 LBB1_1:
 934         pop %eax   # realign stack.
 935         call L_abort$stub
 936
 937 Though this probably isn't worth it.
 938
 939 //===---------------------------------------------------------------------===//
 940
 941 We need to teach the codegen to convert two-address INC instructions to LEA
 942 when the flags are dead (likewise dec).  For example, on X86-64, compile:
 943
 944 int foo(int A, int B) {
 945   return A+1;
 946 }
 947
 948 to:
 949
 950 _foo:
 951         leal    1(%edi), %eax
 952         ret
 953
 954 instead of:
 955
 956 _foo:
 957         incl %edi
 958         movl %edi, %eax
 959         ret
 960
 961 Another example is:
 962
 963 ;; X's live range extends beyond the shift, so the register allocator
 964 ;; cannot coalesce it with Y.  Because of this, a copy needs to be
 965 ;; emitted before the shift to save the register value before it is
 966 ;; clobbered.  However, this copy is not needed if the register
 967 ;; allocator turns the shift into an LEA.  This also occurs for ADD.
 968
 969 ; Check that the shift gets turned into an LEA.
 970 ; RUN: llvm-upgrade < %s | llvm-as | llc -march=x86 -x86-asm-syntax=intel | \
 971 ; RUN:   not grep {mov E.X, E.X}
 972
 973 %G = external global int
 974
 975 int %test1(int %X, int %Y) {
 976         %Z = add int %X, %Y
 977         volatile store int %Y, int* %G
 978         volatile store int %Z, int* %G
 979         ret int %X
 980 }
 981
 982 int %test2(int %X) {
 983         %Z = add int %X, 1  ;; inc
 984         volatile store int %Z, int* %G
 985         ret int %X
 986 }
 987
 988 //===---------------------------------------------------------------------===//
 989
 990 Sometimes it is better to codegen subtractions from a constant (e.g. 7-x) with
 991 a neg instead of a sub instruction.  Consider:
 992
 993 int test(char X) { return 7-X; }
 994
 995 we currently produce:
 996 _test:
 997         movl $7, %eax
 998         movsbl 4(%esp), %ecx
 999         subl %ecx, %eax
1000         ret
1001
1002 We would use one fewer register if codegen'd as:
1003
1004         movsbl 4(%esp), %eax
1005         neg %eax
1006         add $7, %eax
1007         ret
1008
1009 Note that this isn't beneficial if the load can be folded into the sub.  In
1010 this case, we want a sub:
1011
1012 int test(int X) { return 7-X; }
1013 _test:
1014         movl $7, %eax
1015         subl 4(%esp), %eax
1016         ret
1017
1018 //===---------------------------------------------------------------------===//
1019
1020 For code like:
1021 phi (undef, x)
1022
1023 We get an implicit def on the undef side. If the phi is spilled, we then get:
1024 implicitdef xmm1
1025 store xmm1 -> stack
1026
1027 It should be possible to teach the x86 backend to "fold" the store into the
1028 implicitdef, which just deletes the implicit def.
1029
1030 These instructions should go away:
1031 #IMPLICIT_DEF %xmm1
1032 movaps %xmm1, 192(%esp)
1033 movaps %xmm1, 224(%esp)
1034 movaps %xmm1, 176(%esp)
1035
1036 //===---------------------------------------------------------------------===//
1037
1038 This is a "commutable two-address" register coallescing deficiency:
1039
1040 define <4 x float> @test1(<4 x float> %V) {
1041 entry:
1042         %tmp8 = shufflevector <4 x float> %V, <4 x float> undef,
1043                                         <4 x i32> < i32 3, i32 2, i32 1, i32 0 >
1044         %add = add <4 x float> %tmp8, %V
1045         ret <4 x float> %add
1046 }
1047
1048 this codegens to:
1049
1050 _test1:
1051         pshufd  $27, %xmm0, %xmm1
1052         addps   %xmm0, %xmm1
1053         movaps  %xmm1, %xmm0
1054         ret
1055
1056 instead of:
1057
1058 _test1:
1059         pshufd  $27, %xmm0, %xmm1
1060         addps   %xmm1, %xmm0
1061         ret
1062
1063 //===---------------------------------------------------------------------===//
1064
1065 Leaf functions that require one 4-byte spill slot have a prolog like this:
1066
1067 _foo:
1068         pushl   %esi
1069         subl    $4, %esp
1070 ...
1071 and an epilog like this:
1072         addl    $4, %esp
1073         popl    %esi
1074         ret
1075
1076 It would be smaller, and potentially faster, to push eax on entry and to
1077 pop into a dummy register instead of using addl/subl of esp.  Just don't pop
1078 into any return registers :)
1079
1080 //===---------------------------------------------------------------------===//
1081
1082 The X86 backend should fold (branch (or (setcc, setcc))) into multiple
1083 branches.  We generate really poor code for:
1084
1085 double testf(double a) {
1086        return a == 0.0 ? 0.0 : (a > 0.0 ? 1.0 : -1.0);
1087 }
1088
1089 For example, the entry BB is:
1090
1091 _testf:
1092         subl    $20, %esp
1093         pxor    %xmm0, %xmm0
1094         movsd   24(%esp), %xmm1
1095         ucomisd %xmm0, %xmm1
1096         setnp   %al
1097         sete    %cl
1098         testb   %cl, %al
1099         jne     LBB1_5  # UnifiedReturnBlock
1100 LBB1_1: # cond_true
1101
1102
1103 it would be better to replace the last four instructions with:
1104
1105         jp LBB1_1
1106         je LBB1_5
1107 LBB1_1:
1108
1109 We also codegen the inner ?: into a diamond:
1110
1111        cvtss2sd        LCPI1_0(%rip), %xmm2
1112         cvtss2sd        LCPI1_1(%rip), %xmm3
1113         ucomisd %xmm1, %xmm0
1114         ja      LBB1_3  # cond_true
1115 LBB1_2: # cond_true
1116         movapd  %xmm3, %xmm2
1117 LBB1_3: # cond_true
1118         movapd  %xmm2, %xmm0
1119         ret
1120
1121 We should sink the load into xmm3 into the LBB1_2 block.  This should
1122 be pretty easy, and will nuke all the copies.
1123
1124 //===---------------------------------------------------------------------===//
1125
1126 This:
1127         #include <algorithm>
1128         inline std::pair<unsigned, bool> full_add(unsigned a, unsigned b)
1129         { return std::make_pair(a + b, a + b < a); }
1130         bool no_overflow(unsigned a, unsigned b)
1131         { return !full_add(a, b).second; }
1132
1133 Should compile to:
1134
1135
1136         _Z11no_overflowjj:
1137                 addl    %edi, %esi
1138                 setae   %al
1139                 ret
1140
1141 on x86-64, not:
1142
1143 __Z11no_overflowjj:
1144         addl    %edi, %esi
1145         cmpl    %edi, %esi
1146         setae   %al
1147         movzbl  %al, %eax
1148         ret
1149
1150
1151 //===---------------------------------------------------------------------===//
1152
1153 Re-materialize MOV32r0 etc. with xor instead of changing them to moves if the
1154 condition register is dead. xor reg reg is shorter than mov reg, #0.
1155
1156 //===---------------------------------------------------------------------===//
1157
1158 We aren't matching RMW instructions aggressively
1159 enough.  Here's a reduced testcase (more in PR1160):
1160
1161 define void @test(i32* %huge_ptr, i32* %target_ptr) {
1162         %A = load i32* %huge_ptr                ; <i32> [#uses=1]
1163         %B = load i32* %target_ptr              ; <i32> [#uses=1]
1164         %C = or i32 %A, %B              ; <i32> [#uses=1]
1165         store i32 %C, i32* %target_ptr
1166         ret void
1167 }
1168
1169 $ llvm-as < t.ll | llc -march=x86-64
1170
1171 _test:
1172         movl (%rdi), %eax
1173         orl (%rsi), %eax
1174         movl %eax, (%rsi)
1175         ret
1176
1177 That should be something like:
1178
1179 _test:
1180         movl (%rdi), %eax
1181         orl %eax, (%rsi)
1182         ret
1183
1184 //===---------------------------------------------------------------------===//
1185
1186 The following code:
1187
1188 bb114.preheader:                ; preds = %cond_next94
1189         %tmp231232 = sext i16 %tmp62 to i32             ; <i32> [#uses=1]
1190         %tmp233 = sub i32 32, %tmp231232                ; <i32> [#uses=1]
1191         %tmp245246 = sext i16 %tmp65 to i32             ; <i32> [#uses=1]
1192         %tmp252253 = sext i16 %tmp68 to i32             ; <i32> [#uses=1]
1193         %tmp254 = sub i32 32, %tmp252253                ; <i32> [#uses=1]
1194         %tmp553554 = bitcast i16* %tmp37 to i8*         ; <i8*> [#uses=2]
1195         %tmp583584 = sext i16 %tmp98 to i32             ; <i32> [#uses=1]
1196         %tmp585 = sub i32 32, %tmp583584                ; <i32> [#uses=1]
1197         %tmp614615 = sext i16 %tmp101 to i32            ; <i32> [#uses=1]
1198         %tmp621622 = sext i16 %tmp104 to i32            ; <i32> [#uses=1]
1199         %tmp623 = sub i32 32, %tmp621622                ; <i32> [#uses=1]
1200         br label %bb114
1201
1202 produces:
1203
1204 LBB3_5: # bb114.preheader
1205         movswl  -68(%ebp), %eax
1206         movl    $32, %ecx
1207         movl    %ecx, -80(%ebp)
1208         subl    %eax, -80(%ebp)
1209         movswl  -52(%ebp), %eax
1210         movl    %ecx, -84(%ebp)
1211         subl    %eax, -84(%ebp)
1212         movswl  -70(%ebp), %eax
1213         movl    %ecx, -88(%ebp)
1214         subl    %eax, -88(%ebp)
1215         movswl  -50(%ebp), %eax
1216         subl    %eax, %ecx
1217         movl    %ecx, -76(%ebp)
1218         movswl  -42(%ebp), %eax
1219         movl    %eax, -92(%ebp)
1220         movswl  -66(%ebp), %eax
1221         movl    %eax, -96(%ebp)
1222         movw    $0, -98(%ebp)
1223
1224 This appears to be bad because the RA is not folding the store to the stack
1225 slot into the movl.  The above instructions could be:
1226         movl    $32, -80(%ebp)
1227 ...
1228         movl    $32, -84(%ebp)
1229 ...
1230 This seems like a cross between remat and spill folding.
1231
1232 This has redundant subtractions of %eax from a stack slot. However, %ecx doesn't
1233 change, so we could simply subtract %eax from %ecx first and then use %ecx (or
1234 vice-versa).
1235
1236 //===---------------------------------------------------------------------===//
1237
1238 For this code:
1239
1240 cond_next603:           ; preds = %bb493, %cond_true336, %cond_next599
1241         %v.21050.1 = phi i32 [ %v.21050.0, %cond_next599 ], [ %tmp344, %cond_true336 ], [ %v.2, %bb493 ]                ; <i32> [#uses=1]
1242         %maxz.21051.1 = phi i32 [ %maxz.21051.0, %cond_next599 ], [ 0, %cond_true336 ], [ %maxz.2, %bb493 ]             ; <i32> [#uses=2]
1243         %cnt.01055.1 = phi i32 [ %cnt.01055.0, %cond_next599 ], [ 0, %cond_true336 ], [ %cnt.0, %bb493 ]                ; <i32> [#uses=2]
1244         %byteptr.9 = phi i8* [ %byteptr.12, %cond_next599 ], [ %byteptr.0, %cond_true336 ], [ %byteptr.10, %bb493 ]             ; <i8*> [#uses=9]
1245         %bitptr.6 = phi i32 [ %tmp5571104.1, %cond_next599 ], [ %tmp4921049, %cond_true336 ], [ %bitptr.7, %bb493 ]             ; <i32> [#uses=4]
1246         %source.5 = phi i32 [ %tmp602, %cond_next599 ], [ %source.0, %cond_true336 ], [ %source.6, %bb493 ]             ; <i32> [#uses=7]
1247         %tmp606 = getelementptr %struct.const_tables* @tables, i32 0, i32 0, i32 %cnt.01055.1           ; <i8*> [#uses=1]
1248         %tmp607 = load i8* %tmp606, align 1             ; <i8> [#uses=1]
1249
1250 We produce this:
1251
1252 LBB4_70:        # cond_next603
1253         movl    -20(%ebp), %esi
1254         movl    L_tables$non_lazy_ptr-"L4$pb"(%esi), %esi
1255
1256 However, ICC caches this information before the loop and produces this:
1257
1258         movl      88(%esp), %eax                                #481.12
1259
1260 //===---------------------------------------------------------------------===//
1261
1262 This code:
1263
1264         %tmp659 = icmp slt i16 %tmp654, 0               ; <i1> [#uses=1]
1265         br i1 %tmp659, label %cond_true662, label %cond_next715
1266
1267 produces this:
1268
1269         testw   %cx, %cx
1270         movswl  %cx, %esi
1271         jns     LBB4_109        # cond_next715
1272
1273 Shark tells us that using %cx in the testw instruction is sub-optimal. It
1274 suggests using the 32-bit register (which is what ICC uses).
1275
1276 //===---------------------------------------------------------------------===//
1277
1278 rdar://5506677 - We compile this:
1279
1280 define i32 @foo(double %x) {
1281         %x14 = bitcast double %x to i64         ; <i64> [#uses=1]
1282         %tmp713 = trunc i64 %x14 to i32         ; <i32> [#uses=1]
1283         %tmp8 = and i32 %tmp713, 2147483647             ; <i32> [#uses=1]
1284         ret i32 %tmp8
1285 }
1286
1287 to:
1288
1289 _foo:
1290         subl    $12, %esp
1291         fldl    16(%esp)
1292         fstpl   (%esp)
1293         movl    $2147483647, %eax
1294         andl    (%esp), %eax
1295         addl    $12, %esp
1296         #FP_REG_KILL
1297         ret
1298
1299 It would be much better to eliminate the fldl/fstpl by folding the bitcast
1300 into the load SDNode.  That would give us:
1301
1302 _foo:
1303         movl    $2147483647, %eax
1304         andl    4(%esp), %eax
1305         ret
1306
1307 //===---------------------------------------------------------------------===//
1308
1309 We compile this:
1310
1311 void compare (long long foo) {
1312   if (foo < 4294967297LL)
1313     abort();
1314 }
1315
1316 to:
1317
1318 _compare:
1319         subl    $12, %esp
1320         cmpl    $0, 16(%esp)
1321         setne   %al
1322         movzbw  %al, %ax
1323         cmpl    $1, 20(%esp)
1324         setg    %cl
1325         movzbw  %cl, %cx
1326         cmove   %ax, %cx
1327         movw    %cx, %ax
1328         testb   $1, %al
1329         je      LBB1_2  # cond_true
1330
1331 (also really horrible code on ppc).  This is due to the expand code for 64-bit
1332 compares.  GCC produces multiple branches, which is much nicer:
1333
1334 _compare:
1335         pushl   %ebp
1336         movl    %esp, %ebp
1337         subl    $8, %esp
1338         movl    8(%ebp), %eax
1339         movl    12(%ebp), %edx
1340         subl    $1, %edx
1341         jg     L5
1342 L7:
1343         jl      L4
1344         cmpl    $0, %eax
1345         jbe      L4
1346 L5:
1347
1348 //===---------------------------------------------------------------------===//
1349
1350 Tail call optimization improvements: Tail call optimization currently
1351 pushes all arguments on the top of the stack (their normal place for
1352 non-tail call optimized calls) before moving them to actual stack
1353 slot. This is done to prevent overwriting of parameters (see example
1354 below) that might be used, since the arguments of the callee
1355 overwrites caller's arguments.
1356
1357 example:
1358
1359 int callee(int32, int64);
1360 int caller(int32 arg1, int32 arg2) {
1361   int64 local = arg2 * 2;
1362   return callee(arg2, (int64)local);
1363 }
1364
1365 [arg1]          [!arg2 no longer valid since we moved local onto it]
1366 [arg2]      ->  [(int64)
1367 [RETADDR]        local  ]
1368
1369 Moving arg1 onto the stack slot of callee function would overwrite
1370 arg2 of the caller.
1371
1372 Possible optimizations:
1373
1374  - Only push those arguments to the top of the stack that are actual
1375    parameters of the caller function and have no local value in the
1376    caller.
1377
1378    In the above example local does not need to be pushed onto the top
1379    of the stack as it is definitely not a caller's function
1380    parameter.
1381
1382  - Analyse the actual parameters of the callee to see which would
1383    overwrite a caller parameter which is used by the callee and only
1384    push them onto the top of the stack.
1385
1386    int callee (int32 arg1, int32 arg2);
1387    int caller (int32 arg1, int32 arg2) {
1388        return callee(arg1,arg2);
1389    }
1390
1391    Here we don't need to write any variables to the top of the stack
1392    since they don't overwrite each other.
1393
1394    int callee (int32 arg1, int32 arg2);
1395    int caller (int32 arg1, int32 arg2) {
1396        return callee(arg2,arg1);
1397    }
1398
1399    Here we need to push the arguments because they overwrite each
1400    other.
1401
1402
1403    Code for lowering directly onto callers arguments:
1404 +  SmallVector<std::pair<unsigned, SDOperand>, 8> RegsToPass;
1405 +  SmallVector<SDOperand, 8> MemOpChains;
1406 +
1407 +  SDOperand FramePtr;
1408 +  SDOperand PtrOff;
1409 +  SDOperand FIN;
1410 +  int FI = 0;
1411 +  // Walk the register/memloc assignments, inserting copies/loads.
1412 +  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
1413 +    CCValAssign &VA = ArgLocs[i];
1414 +    SDOperand Arg = Op.getOperand(5+2*VA.getValNo());
1415 +
1416 +    ....
1417 +
1418 +    if (VA.isRegLoc()) {
1419 +      RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
1420 +    } else {
1421 +      assert(VA.isMemLoc());
1422 +      // create frame index
1423 +      int32_t Offset = VA.getLocMemOffset()+FPDiff;
1424 +      uint32_t OpSize = (MVT::getSizeInBits(VA.getLocVT())+7)/8;
1425 +      FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset);
1426 +      FIN = DAG.getFrameIndex(FI, MVT::i32);
1427 +      // store relative to framepointer
1428 +      MemOpChains.push_back(DAG.getStore(Chain, Arg, FIN, NULL, 0));
1429 +    }
1430 +  }
1431 //===---------------------------------------------------------------------===//
1432
1433 main ()
1434 {
1435   int i = 0;
1436   unsigned long int z = 0;
1437
1438   do {
1439     z -= 0x00004000;
1440     i++;
1441     if (i > 0x00040000)
1442       abort ();
1443   } while (z > 0);
1444   exit (0);
1445 }
1446
1447 gcc compiles this to:
1448
1449 _main:
1450         subl    $28, %esp
1451         xorl    %eax, %eax
1452         jmp     L2
1453 L3:
1454         cmpl    $262144, %eax
1455         je      L10
1456 L2:
1457         addl    $1, %eax
1458         cmpl    $262145, %eax
1459         jne     L3
1460         call    L_abort$stub
1461 L10:
1462         movl    $0, (%esp)
1463         call    L_exit$stub
1464
1465 llvm:
1466
1467 _main:
1468         subl    $12, %esp
1469         movl    $1, %eax
1470         movl    $16384, %ecx
1471 LBB1_1: # bb
1472         cmpl    $262145, %eax
1473         jge     LBB1_4  # cond_true
1474 LBB1_2: # cond_next
1475         incl    %eax
1476         addl    $4294950912, %ecx
1477         cmpl    $16384, %ecx
1478         jne     LBB1_1  # bb
1479 LBB1_3: # bb11
1480         xorl    %eax, %eax
1481         addl    $12, %esp
1482         ret
1483 LBB1_4: # cond_true
1484         call    L_abort$stub
1485
1486 1. LSR should rewrite the first cmp with induction variable %ecx.
1487 2. DAG combiner should fold
1488         leal    1(%eax), %edx
1489         cmpl    $262145, %edx
1490    =>
1491         cmpl    $262144, %eax
1492
1493 //===---------------------------------------------------------------------===//