lib/Target/X86/README.txt

   1 //===---------------------------------------------------------------------===//
   2 // Random ideas for the X86 backend.
   3 //===---------------------------------------------------------------------===//
   4
   5 This should be one DIV/IDIV instruction, not a libcall:
   6
   7 unsigned test(unsigned long long X, unsigned Y) {
   8         return X/Y;
   9 }
  10
  11 This can be done trivially with a custom legalizer.  What about overflow
  12 though?  http://gcc.gnu.org/bugzilla/show_bug.cgi?id=14224
  13
  14 //===---------------------------------------------------------------------===//
  15
  16 Improvements to the multiply -> shift/add algorithm:
  17 http://gcc.gnu.org/ml/gcc-patches/2004-08/msg01590.html
  18
  19 //===---------------------------------------------------------------------===//
  20
  21 Improve code like this (occurs fairly frequently, e.g. in LLVM):
  22 long long foo(int x) { return 1LL << x; }
  23
  24 http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01109.html
  25 http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01128.html
  26 http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01136.html
  27
  28 Another useful one would be  ~0ULL >> X and ~0ULL << X.
  29
  30 One better solution for 1LL << x is:
  31         xorl    %eax, %eax
  32         xorl    %edx, %edx
  33         testb   $32, %cl
  34         sete    %al
  35         setne   %dl
  36         sall    %cl, %eax
  37         sall    %cl, %edx
  38
  39 But that requires good 8-bit subreg support.
  40
  41 Also, this might be better.  It's an extra shift, but it's one instruction
  42 shorter, and doesn't stress 8-bit subreg support.
  43 (From http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01148.html,
  44 but without the unnecessary and.)
  45         movl %ecx, %eax
  46         shrl $5, %eax
  47         movl %eax, %edx
  48         xorl $1, %edx
  49         sall %cl, %eax
  50         sall %cl. %edx
  51
  52 64-bit shifts (in general) expand to really bad code.  Instead of using
  53 cmovs, we should expand to a conditional branch like GCC produces.
  54
  55 //===---------------------------------------------------------------------===//
  56
  57 Some isel ideas:
  58
  59 1. Dynamic programming based approach when compile time is not an
  60    issue.
  61 2. Code duplication (addressing mode) during isel.
  62 3. Other ideas from "Register-Sensitive Selection, Duplication, and
  63    Sequencing of Instructions".
  64 4. Scheduling for reduced register pressure.  E.g. "Minimum Register
  65    Instruction Sequence Problem: Revisiting Optimal Code Generation for DAGs"
  66    and other related papers.
  67    http://citeseer.ist.psu.edu/govindarajan01minimum.html
  68
  69 //===---------------------------------------------------------------------===//
  70
  71 Should we promote i16 to i32 to avoid partial register update stalls?
  72
  73 //===---------------------------------------------------------------------===//
  74
  75 Leave any_extend as pseudo instruction and hint to register
  76 allocator. Delay codegen until post register allocation.
  77 Note. any_extend is now turned into an INSERT_SUBREG. We still need to teach
  78 the coalescer how to deal with it though.
  79
  80 //===---------------------------------------------------------------------===//
  81
  82 It appears icc use push for parameter passing. Need to investigate.
  83
  84 //===---------------------------------------------------------------------===//
  85
  86 This:
  87
  88 void foo(void);
  89 void bar(int x, int *P) {
  90   x >>= 2;
  91   if (x)
  92     foo();
  93   *P = x;
  94 }
  95
  96 compiles into:
  97
  98         movq    %rsi, %rbx
  99         movl    %edi, %r14d
 100         sarl    $2, %r14d
 101         testl   %r14d, %r14d
 102         je      LBB0_2
 103
 104 Instead of doing an explicit test, we can use the flags off the sar.  This
 105 occurs in a bigger testcase like this, which is pretty common:
 106
 107 #include <vector>
 108 int test1(std::vector<int> &X) {
 109   int Sum = 0;
 110   for (long i = 0, e = X.size(); i != e; ++i)
 111     X[i] = 0;
 112   return Sum;
 113 }
 114
 115 //===---------------------------------------------------------------------===//
 116
 117 Only use inc/neg/not instructions on processors where they are faster than
 118 add/sub/xor.  They are slower on the P4 due to only updating some processor
 119 flags.
 120
 121 //===---------------------------------------------------------------------===//
 122
 123 The instruction selector sometimes misses folding a load into a compare.  The
 124 pattern is written as (cmp reg, (load p)).  Because the compare isn't
 125 commutative, it is not matched with the load on both sides.  The dag combiner
 126 should be made smart enough to canonicalize the load into the RHS of a compare
 127 when it can invert the result of the compare for free.
 128
 129 //===---------------------------------------------------------------------===//
 130
 131 In many cases, LLVM generates code like this:
 132
 133 _test:
 134         movl 8(%esp), %eax
 135         cmpl %eax, 4(%esp)
 136         setl %al
 137         movzbl %al, %eax
 138         ret
 139
 140 on some processors (which ones?), it is more efficient to do this:
 141
 142 _test:
 143         movl 8(%esp), %ebx
 144         xor  %eax, %eax
 145         cmpl %ebx, 4(%esp)
 146         setl %al
 147         ret
 148
 149 Doing this correctly is tricky though, as the xor clobbers the flags.
 150
 151 //===---------------------------------------------------------------------===//
 152
 153 We should generate bts/btr/etc instructions on targets where they are cheap or
 154 when codesize is important.  e.g., for:
 155
 156 void setbit(int *target, int bit) {
 157     *target |= (1 << bit);
 158 }
 159 void clearbit(int *target, int bit) {
 160     *target &= ~(1 << bit);
 161 }
 162
 163 //===---------------------------------------------------------------------===//
 164
 165 Instead of the following for memset char*, 1, 10:
 166
 167         movl $16843009, 4(%edx)
 168         movl $16843009, (%edx)
 169         movw $257, 8(%edx)
 170
 171 It might be better to generate
 172
 173         movl $16843009, %eax
 174         movl %eax, 4(%edx)
 175         movl %eax, (%edx)
 176         movw al, 8(%edx)
 177
 178 when we can spare a register. It reduces code size.
 179
 180 //===---------------------------------------------------------------------===//
 181
 182 Evaluate what the best way to codegen sdiv X, (2^C) is.  For X/8, we currently
 183 get this:
 184
 185 define i32 @test1(i32 %X) {
 186     %Y = sdiv i32 %X, 8
 187     ret i32 %Y
 188 }
 189
 190 _test1:
 191         movl 4(%esp), %eax
 192         movl %eax, %ecx
 193         sarl $31, %ecx
 194         shrl $29, %ecx
 195         addl %ecx, %eax
 196         sarl $3, %eax
 197         ret
 198
 199 GCC knows several different ways to codegen it, one of which is this:
 200
 201 _test1:
 202         movl    4(%esp), %eax
 203         cmpl    $-1, %eax
 204         leal    7(%eax), %ecx
 205         cmovle  %ecx, %eax
 206         sarl    $3, %eax
 207         ret
 208
 209 which is probably slower, but it's interesting at least :)
 210
 211 //===---------------------------------------------------------------------===//
 212
 213 We are currently lowering large (1MB+) memmove/memcpy to rep/stosl and rep/movsl
 214 We should leave these as libcalls for everything over a much lower threshold,
 215 since libc is hand tuned for medium and large mem ops (avoiding RFO for large
 216 stores, TLB preheating, etc)
 217
 218 //===---------------------------------------------------------------------===//
 219
 220 Optimize this into something reasonable:
 221  x * copysign(1.0, y) * copysign(1.0, z)
 222
 223 //===---------------------------------------------------------------------===//
 224
 225 Optimize copysign(x, *y) to use an integer load from y.
 226
 227 //===---------------------------------------------------------------------===//
 228
 229 The following tests perform worse with LSR:
 230
 231 lambda, siod, optimizer-eval, ackermann, hash2, nestedloop, strcat, and Treesor.
 232
 233 //===---------------------------------------------------------------------===//
 234
 235 Adding to the list of cmp / test poor codegen issues:
 236
 237 int test(__m128 *A, __m128 *B) {
 238   if (_mm_comige_ss(*A, *B))
 239     return 3;
 240   else
 241     return 4;
 242 }
 243
 244 _test:
 245         movl 8(%esp), %eax
 246         movaps (%eax), %xmm0
 247         movl 4(%esp), %eax
 248         movaps (%eax), %xmm1
 249         comiss %xmm0, %xmm1
 250         setae %al
 251         movzbl %al, %ecx
 252         movl $3, %eax
 253         movl $4, %edx
 254         cmpl $0, %ecx
 255         cmove %edx, %eax
 256         ret
 257
 258 Note the setae, movzbl, cmpl, cmove can be replaced with a single cmovae. There
 259 are a number of issues. 1) We are introducing a setcc between the result of the
 260 intrisic call and select. 2) The intrinsic is expected to produce a i32 value
 261 so a any extend (which becomes a zero extend) is added.
 262
 263 We probably need some kind of target DAG combine hook to fix this.
 264
 265 //===---------------------------------------------------------------------===//
 266
 267 We generate significantly worse code for this than GCC:
 268 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=21150
 269 http://gcc.gnu.org/bugzilla/attachment.cgi?id=8701
 270
 271 There is also one case we do worse on PPC.
 272
 273 //===---------------------------------------------------------------------===//
 274
 275 For this:
 276
 277 int test(int a)
 278 {
 279   return a * 3;
 280 }
 281
 282 We currently emits
 283         imull $3, 4(%esp), %eax
 284
 285 Perhaps this is what we really should generate is? Is imull three or four
 286 cycles? Note: ICC generates this:
 287         movl    4(%esp), %eax
 288         leal    (%eax,%eax,2), %eax
 289
 290 The current instruction priority is based on pattern complexity. The former is
 291 more "complex" because it folds a load so the latter will not be emitted.
 292
 293 Perhaps we should use AddedComplexity to give LEA32r a higher priority? We
 294 should always try to match LEA first since the LEA matching code does some
 295 estimate to determine whether the match is profitable.
 296
 297 However, if we care more about code size, then imull is better. It's two bytes
 298 shorter than movl + leal.
 299
 300 On a Pentium M, both variants have the same characteristics with regard
 301 to throughput; however, the multiplication has a latency of four cycles, as
 302 opposed to two cycles for the movl+lea variant.
 303
 304 //===---------------------------------------------------------------------===//
 305
 306 __builtin_ffs codegen is messy.
 307
 308 int ffs_(unsigned X) { return __builtin_ffs(X); }
 309
 310 llvm produces:
 311 ffs_:
 312         movl    4(%esp), %ecx
 313         bsfl    %ecx, %eax
 314         movl    $32, %edx
 315         cmove   %edx, %eax
 316         incl    %eax
 317         xorl    %edx, %edx
 318         testl   %ecx, %ecx
 319         cmove   %edx, %eax
 320         ret
 321
 322 vs gcc:
 323
 324 _ffs_:
 325         movl    $-1, %edx
 326         bsfl    4(%esp), %eax
 327         cmove   %edx, %eax
 328         addl    $1, %eax
 329         ret
 330
 331 Another example of __builtin_ffs (use predsimplify to eliminate a select):
 332
 333 int foo (unsigned long j) {
 334   if (j)
 335     return __builtin_ffs (j) - 1;
 336   else
 337     return 0;
 338 }
 339
 340 //===---------------------------------------------------------------------===//
 341
 342 It appears gcc place string data with linkonce linkage in
 343 .section __TEXT,__const_coal,coalesced instead of
 344 .section __DATA,__const_coal,coalesced.
 345 Take a look at darwin.h, there are other Darwin assembler directives that we
 346 do not make use of.
 347
 348 //===---------------------------------------------------------------------===//
 349
 350 define i32 @foo(i32* %a, i32 %t) {
 351 entry:
 352         br label %cond_true
 353
 354 cond_true:              ; preds = %cond_true, %entry
 355         %x.0.0 = phi i32 [ 0, %entry ], [ %tmp9, %cond_true ]           ; <i32> [#uses=3]
 356         %t_addr.0.0 = phi i32 [ %t, %entry ], [ %tmp7, %cond_true ]             ; <i32> [#uses=1]
 357         %tmp2 = getelementptr i32* %a, i32 %x.0.0               ; <i32*> [#uses=1]
 358         %tmp3 = load i32* %tmp2         ; <i32> [#uses=1]
 359         %tmp5 = add i32 %t_addr.0.0, %x.0.0             ; <i32> [#uses=1]
 360         %tmp7 = add i32 %tmp5, %tmp3            ; <i32> [#uses=2]
 361         %tmp9 = add i32 %x.0.0, 1               ; <i32> [#uses=2]
 362         %tmp = icmp sgt i32 %tmp9, 39           ; <i1> [#uses=1]
 363         br i1 %tmp, label %bb12, label %cond_true
 364
 365 bb12:           ; preds = %cond_true
 366         ret i32 %tmp7
 367 }
 368 is pessimized by -loop-reduce and -indvars
 369
 370 //===---------------------------------------------------------------------===//
 371
 372 u32 to float conversion improvement:
 373
 374 float uint32_2_float( unsigned u ) {
 375   float fl = (int) (u & 0xffff);
 376   float fh = (int) (u >> 16);
 377   fh *= 0x1.0p16f;
 378   return fh + fl;
 379 }
 380
 381 00000000        subl    $0x04,%esp
 382 00000003        movl    0x08(%esp,1),%eax
 383 00000007        movl    %eax,%ecx
 384 00000009        shrl    $0x10,%ecx
 385 0000000c        cvtsi2ss        %ecx,%xmm0
 386 00000010        andl    $0x0000ffff,%eax
 387 00000015        cvtsi2ss        %eax,%xmm1
 388 00000019        mulss   0x00000078,%xmm0
 389 00000021        addss   %xmm1,%xmm0
 390 00000025        movss   %xmm0,(%esp,1)
 391 0000002a        flds    (%esp,1)
 392 0000002d        addl    $0x04,%esp
 393 00000030        ret
 394
 395 //===---------------------------------------------------------------------===//
 396
 397 When using fastcc abi, align stack slot of argument of type double on 8 byte
 398 boundary to improve performance.
 399
 400 //===---------------------------------------------------------------------===//
 401
 402 GCC's ix86_expand_int_movcc function (in i386.c) has a ton of interesting
 403 simplifications for integer "x cmp y ? a : b".
 404
 405 //===---------------------------------------------------------------------===//
 406
 407 Consider the expansion of:
 408
 409 define i32 @test3(i32 %X) {
 410         %tmp1 = urem i32 %X, 255
 411         ret i32 %tmp1
 412 }
 413
 414 Currently it compiles to:
 415
 416 ...
 417         movl $2155905153, %ecx
 418         movl 8(%esp), %esi
 419         movl %esi, %eax
 420         mull %ecx
 421 ...
 422
 423 This could be "reassociated" into:
 424
 425         movl $2155905153, %eax
 426         movl 8(%esp), %ecx
 427         mull %ecx
 428
 429 to avoid the copy.  In fact, the existing two-address stuff would do this
 430 except that mul isn't a commutative 2-addr instruction.  I guess this has
 431 to be done at isel time based on the #uses to mul?
 432
 433 //===---------------------------------------------------------------------===//
 434
 435 Make sure the instruction which starts a loop does not cross a cacheline
 436 boundary. This requires knowning the exact length of each machine instruction.
 437 That is somewhat complicated, but doable. Example 256.bzip2:
 438
 439 In the new trace, the hot loop has an instruction which crosses a cacheline
 440 boundary.  In addition to potential cache misses, this can't help decoding as I
 441 imagine there has to be some kind of complicated decoder reset and realignment
 442 to grab the bytes from the next cacheline.
 443
 444 532  532 0x3cfc movb     (1809(%esp, %esi), %bl   <<<--- spans 2 64 byte lines
 445 942  942 0x3d03 movl     %dh, (1809(%esp, %esi)
 446 937  937 0x3d0a incl     %esi
 447 3    3   0x3d0b cmpb     %bl, %dl
 448 27   27  0x3d0d jnz      0x000062db <main+11707>
 449
 450 //===---------------------------------------------------------------------===//
 451
 452 In c99 mode, the preprocessor doesn't like assembly comments like #TRUNCATE.
 453
 454 //===---------------------------------------------------------------------===//
 455
 456 This could be a single 16-bit load.
 457
 458 int f(char *p) {
 459     if ((p[0] == 1) & (p[1] == 2)) return 1;
 460     return 0;
 461 }
 462
 463 //===---------------------------------------------------------------------===//
 464
 465 We should inline lrintf and probably other libc functions.
 466
 467 //===---------------------------------------------------------------------===//
 468
 469 Use the FLAGS values from arithmetic instructions more.  For example, compile:
 470
 471 int add_zf(int *x, int y, int a, int b) {
 472      if ((*x += y) == 0)
 473           return a;
 474      else
 475           return b;
 476 }
 477
 478 to:
 479        addl    %esi, (%rdi)
 480        movl    %edx, %eax
 481        cmovne  %ecx, %eax
 482        ret
 483 instead of:
 484
 485 _add_zf:
 486         addl (%rdi), %esi
 487         movl %esi, (%rdi)
 488         testl %esi, %esi
 489         cmove %edx, %ecx
 490         movl %ecx, %eax
 491         ret
 492
 493 As another example, compile function f2 in test/CodeGen/X86/cmp-test.ll
 494 without a test instruction.
 495
 496 //===---------------------------------------------------------------------===//
 497
 498 These two functions have identical effects:
 499
 500 unsigned int f(unsigned int i, unsigned int n) {++i; if (i == n) ++i; return i;}
 501 unsigned int f2(unsigned int i, unsigned int n) {++i; i += i == n; return i;}
 502
 503 We currently compile them to:
 504
 505 _f:
 506         movl 4(%esp), %eax
 507         movl %eax, %ecx
 508         incl %ecx
 509         movl 8(%esp), %edx
 510         cmpl %edx, %ecx
 511         jne LBB1_2      #UnifiedReturnBlock
 512 LBB1_1: #cond_true
 513         addl $2, %eax
 514         ret
 515 LBB1_2: #UnifiedReturnBlock
 516         movl %ecx, %eax
 517         ret
 518 _f2:
 519         movl 4(%esp), %eax
 520         movl %eax, %ecx
 521         incl %ecx
 522         cmpl 8(%esp), %ecx
 523         sete %cl
 524         movzbl %cl, %ecx
 525         leal 1(%ecx,%eax), %eax
 526         ret
 527
 528 both of which are inferior to GCC's:
 529
 530 _f:
 531         movl    4(%esp), %edx
 532         leal    1(%edx), %eax
 533         addl    $2, %edx
 534         cmpl    8(%esp), %eax
 535         cmove   %edx, %eax
 536         ret
 537 _f2:
 538         movl    4(%esp), %eax
 539         addl    $1, %eax
 540         xorl    %edx, %edx
 541         cmpl    8(%esp), %eax
 542         sete    %dl
 543         addl    %edx, %eax
 544         ret
 545
 546 //===---------------------------------------------------------------------===//
 547
 548 This code:
 549
 550 void test(int X) {
 551   if (X) abort();
 552 }
 553
 554 is currently compiled to:
 555
 556 _test:
 557         subl $12, %esp
 558         cmpl $0, 16(%esp)
 559         jne LBB1_1
 560         addl $12, %esp
 561         ret
 562 LBB1_1:
 563         call L_abort$stub
 564
 565 It would be better to produce:
 566
 567 _test:
 568         subl $12, %esp
 569         cmpl $0, 16(%esp)
 570         jne L_abort$stub
 571         addl $12, %esp
 572         ret
 573
 574 This can be applied to any no-return function call that takes no arguments etc.
 575 Alternatively, the stack save/restore logic could be shrink-wrapped, producing
 576 something like this:
 577
 578 _test:
 579         cmpl $0, 4(%esp)
 580         jne LBB1_1
 581         ret
 582 LBB1_1:
 583         subl $12, %esp
 584         call L_abort$stub
 585
 586 Both are useful in different situations.  Finally, it could be shrink-wrapped
 587 and tail called, like this:
 588
 589 _test:
 590         cmpl $0, 4(%esp)
 591         jne LBB1_1
 592         ret
 593 LBB1_1:
 594         pop %eax   # realign stack.
 595         call L_abort$stub
 596
 597 Though this probably isn't worth it.
 598
 599 //===---------------------------------------------------------------------===//
 600
 601 Sometimes it is better to codegen subtractions from a constant (e.g. 7-x) with
 602 a neg instead of a sub instruction.  Consider:
 603
 604 int test(char X) { return 7-X; }
 605
 606 we currently produce:
 607 _test:
 608         movl $7, %eax
 609         movsbl 4(%esp), %ecx
 610         subl %ecx, %eax
 611         ret
 612
 613 We would use one fewer register if codegen'd as:
 614
 615         movsbl 4(%esp), %eax
 616         neg %eax
 617         add $7, %eax
 618         ret
 619
 620 Note that this isn't beneficial if the load can be folded into the sub.  In
 621 this case, we want a sub:
 622
 623 int test(int X) { return 7-X; }
 624 _test:
 625         movl $7, %eax
 626         subl 4(%esp), %eax
 627         ret
 628
 629 //===---------------------------------------------------------------------===//
 630
 631 Leaf functions that require one 4-byte spill slot have a prolog like this:
 632
 633 _foo:
 634         pushl   %esi
 635         subl    $4, %esp
 636 ...
 637 and an epilog like this:
 638         addl    $4, %esp
 639         popl    %esi
 640         ret
 641
 642 It would be smaller, and potentially faster, to push eax on entry and to
 643 pop into a dummy register instead of using addl/subl of esp.  Just don't pop
 644 into any return registers :)
 645
 646 //===---------------------------------------------------------------------===//
 647
 648 The X86 backend should fold (branch (or (setcc, setcc))) into multiple
 649 branches.  We generate really poor code for:
 650
 651 double testf(double a) {
 652        return a == 0.0 ? 0.0 : (a > 0.0 ? 1.0 : -1.0);
 653 }
 654
 655 For example, the entry BB is:
 656
 657 _testf:
 658         subl    $20, %esp
 659         pxor    %xmm0, %xmm0
 660         movsd   24(%esp), %xmm1
 661         ucomisd %xmm0, %xmm1
 662         setnp   %al
 663         sete    %cl
 664         testb   %cl, %al
 665         jne     LBB1_5  # UnifiedReturnBlock
 666 LBB1_1: # cond_true
 667
 668
 669 it would be better to replace the last four instructions with:
 670
 671         jp LBB1_1
 672         je LBB1_5
 673 LBB1_1:
 674
 675 We also codegen the inner ?: into a diamond:
 676
 677        cvtss2sd        LCPI1_0(%rip), %xmm2
 678         cvtss2sd        LCPI1_1(%rip), %xmm3
 679         ucomisd %xmm1, %xmm0
 680         ja      LBB1_3  # cond_true
 681 LBB1_2: # cond_true
 682         movapd  %xmm3, %xmm2
 683 LBB1_3: # cond_true
 684         movapd  %xmm2, %xmm0
 685         ret
 686
 687 We should sink the load into xmm3 into the LBB1_2 block.  This should
 688 be pretty easy, and will nuke all the copies.
 689
 690 //===---------------------------------------------------------------------===//
 691
 692 This:
 693         #include <algorithm>
 694         inline std::pair<unsigned, bool> full_add(unsigned a, unsigned b)
 695         { return std::make_pair(a + b, a + b < a); }
 696         bool no_overflow(unsigned a, unsigned b)
 697         { return !full_add(a, b).second; }
 698
 699 Should compile to:
 700         addl    %esi, %edi
 701         setae   %al
 702         movzbl  %al, %eax
 703         ret
 704
 705 on x86-64, instead of the rather stupid-looking:
 706         addl    %esi, %edi
 707         setb    %al
 708         xorb    $1, %al
 709         movzbl  %al, %eax
 710         ret
 711
 712
 713 //===---------------------------------------------------------------------===//
 714
 715 The following code:
 716
 717 bb114.preheader:                ; preds = %cond_next94
 718         %tmp231232 = sext i16 %tmp62 to i32             ; <i32> [#uses=1]
 719         %tmp233 = sub i32 32, %tmp231232                ; <i32> [#uses=1]
 720         %tmp245246 = sext i16 %tmp65 to i32             ; <i32> [#uses=1]
 721         %tmp252253 = sext i16 %tmp68 to i32             ; <i32> [#uses=1]
 722         %tmp254 = sub i32 32, %tmp252253                ; <i32> [#uses=1]
 723         %tmp553554 = bitcast i16* %tmp37 to i8*         ; <i8*> [#uses=2]
 724         %tmp583584 = sext i16 %tmp98 to i32             ; <i32> [#uses=1]
 725         %tmp585 = sub i32 32, %tmp583584                ; <i32> [#uses=1]
 726         %tmp614615 = sext i16 %tmp101 to i32            ; <i32> [#uses=1]
 727         %tmp621622 = sext i16 %tmp104 to i32            ; <i32> [#uses=1]
 728         %tmp623 = sub i32 32, %tmp621622                ; <i32> [#uses=1]
 729         br label %bb114
 730
 731 produces:
 732
 733 LBB3_5: # bb114.preheader
 734         movswl  -68(%ebp), %eax
 735         movl    $32, %ecx
 736         movl    %ecx, -80(%ebp)
 737         subl    %eax, -80(%ebp)
 738         movswl  -52(%ebp), %eax
 739         movl    %ecx, -84(%ebp)
 740         subl    %eax, -84(%ebp)
 741         movswl  -70(%ebp), %eax
 742         movl    %ecx, -88(%ebp)
 743         subl    %eax, -88(%ebp)
 744         movswl  -50(%ebp), %eax
 745         subl    %eax, %ecx
 746         movl    %ecx, -76(%ebp)
 747         movswl  -42(%ebp), %eax
 748         movl    %eax, -92(%ebp)
 749         movswl  -66(%ebp), %eax
 750         movl    %eax, -96(%ebp)
 751         movw    $0, -98(%ebp)
 752
 753 This appears to be bad because the RA is not folding the store to the stack
 754 slot into the movl.  The above instructions could be:
 755         movl    $32, -80(%ebp)
 756 ...
 757         movl    $32, -84(%ebp)
 758 ...
 759 This seems like a cross between remat and spill folding.
 760
 761 This has redundant subtractions of %eax from a stack slot. However, %ecx doesn't
 762 change, so we could simply subtract %eax from %ecx first and then use %ecx (or
 763 vice-versa).
 764
 765 //===---------------------------------------------------------------------===//
 766
 767 This code:
 768
 769         %tmp659 = icmp slt i16 %tmp654, 0               ; <i1> [#uses=1]
 770         br i1 %tmp659, label %cond_true662, label %cond_next715
 771
 772 produces this:
 773
 774         testw   %cx, %cx
 775         movswl  %cx, %esi
 776         jns     LBB4_109        # cond_next715
 777
 778 Shark tells us that using %cx in the testw instruction is sub-optimal. It
 779 suggests using the 32-bit register (which is what ICC uses).
 780
 781 //===---------------------------------------------------------------------===//
 782
 783 We compile this:
 784
 785 void compare (long long foo) {
 786   if (foo < 4294967297LL)
 787     abort();
 788 }
 789
 790 to:
 791
 792 compare:
 793         subl    $4, %esp
 794         cmpl    $0, 8(%esp)
 795         setne   %al
 796         movzbw  %al, %ax
 797         cmpl    $1, 12(%esp)
 798         setg    %cl
 799         movzbw  %cl, %cx
 800         cmove   %ax, %cx
 801         testb   $1, %cl
 802         jne     .LBB1_2 # UnifiedReturnBlock
 803 .LBB1_1:        # ifthen
 804         call    abort
 805 .LBB1_2:        # UnifiedReturnBlock
 806         addl    $4, %esp
 807         ret
 808
 809 (also really horrible code on ppc).  This is due to the expand code for 64-bit
 810 compares.  GCC produces multiple branches, which is much nicer:
 811
 812 compare:
 813         subl    $12, %esp
 814         movl    20(%esp), %edx
 815         movl    16(%esp), %eax
 816         decl    %edx
 817         jle     .L7
 818 .L5:
 819         addl    $12, %esp
 820         ret
 821         .p2align 4,,7
 822 .L7:
 823         jl      .L4
 824         cmpl    $0, %eax
 825         .p2align 4,,8
 826         ja      .L5
 827 .L4:
 828         .p2align 4,,9
 829         call    abort
 830
 831 //===---------------------------------------------------------------------===//
 832
 833 Tail call optimization improvements: Tail call optimization currently
 834 pushes all arguments on the top of the stack (their normal place for
 835 non-tail call optimized calls) that source from the callers arguments
 836 or  that source from a virtual register (also possibly sourcing from
 837 callers arguments).
 838 This is done to prevent overwriting of parameters (see example
 839 below) that might be used later.
 840
 841 example:
 842
 843 int callee(int32, int64);
 844 int caller(int32 arg1, int32 arg2) {
 845   int64 local = arg2 * 2;
 846   return callee(arg2, (int64)local);
 847 }
 848
 849 [arg1]          [!arg2 no longer valid since we moved local onto it]
 850 [arg2]      ->  [(int64)
 851 [RETADDR]        local  ]
 852
 853 Moving arg1 onto the stack slot of callee function would overwrite
 854 arg2 of the caller.
 855
 856 Possible optimizations:
 857
 858
 859  - Analyse the actual parameters of the callee to see which would
 860    overwrite a caller parameter which is used by the callee and only
 861    push them onto the top of the stack.
 862
 863    int callee (int32 arg1, int32 arg2);
 864    int caller (int32 arg1, int32 arg2) {
 865        return callee(arg1,arg2);
 866    }
 867
 868    Here we don't need to write any variables to the top of the stack
 869    since they don't overwrite each other.
 870
 871    int callee (int32 arg1, int32 arg2);
 872    int caller (int32 arg1, int32 arg2) {
 873        return callee(arg2,arg1);
 874    }
 875
 876    Here we need to push the arguments because they overwrite each
 877    other.
 878
 879 //===---------------------------------------------------------------------===//
 880
 881 main ()
 882 {
 883   int i = 0;
 884   unsigned long int z = 0;
 885
 886   do {
 887     z -= 0x00004000;
 888     i++;
 889     if (i > 0x00040000)
 890       abort ();
 891   } while (z > 0);
 892   exit (0);
 893 }
 894
 895 gcc compiles this to:
 896
 897 _main:
 898         subl    $28, %esp
 899         xorl    %eax, %eax
 900         jmp     L2
 901 L3:
 902         cmpl    $262144, %eax
 903         je      L10
 904 L2:
 905         addl    $1, %eax
 906         cmpl    $262145, %eax
 907         jne     L3
 908         call    L_abort$stub
 909 L10:
 910         movl    $0, (%esp)
 911         call    L_exit$stub
 912
 913 llvm:
 914
 915 _main:
 916         subl    $12, %esp
 917         movl    $1, %eax
 918         movl    $16384, %ecx
 919 LBB1_1: # bb
 920         cmpl    $262145, %eax
 921         jge     LBB1_4  # cond_true
 922 LBB1_2: # cond_next
 923         incl    %eax
 924         addl    $4294950912, %ecx
 925         cmpl    $16384, %ecx
 926         jne     LBB1_1  # bb
 927 LBB1_3: # bb11
 928         xorl    %eax, %eax
 929         addl    $12, %esp
 930         ret
 931 LBB1_4: # cond_true
 932         call    L_abort$stub
 933
 934 1. LSR should rewrite the first cmp with induction variable %ecx.
 935 2. DAG combiner should fold
 936         leal    1(%eax), %edx
 937         cmpl    $262145, %edx
 938    =>
 939         cmpl    $262144, %eax
 940
 941 //===---------------------------------------------------------------------===//
 942
 943 define i64 @test(double %X) {
 944         %Y = fptosi double %X to i64
 945         ret i64 %Y
 946 }
 947
 948 compiles to:
 949
 950 _test:
 951         subl    $20, %esp
 952         movsd   24(%esp), %xmm0
 953         movsd   %xmm0, 8(%esp)
 954         fldl    8(%esp)
 955         fisttpll        (%esp)
 956         movl    4(%esp), %edx
 957         movl    (%esp), %eax
 958         addl    $20, %esp
 959         #FP_REG_KILL
 960         ret
 961
 962 This should just fldl directly from the input stack slot.
 963
 964 //===---------------------------------------------------------------------===//
 965
 966 This code:
 967 int foo (int x) { return (x & 65535) | 255; }
 968
 969 Should compile into:
 970
 971 _foo:
 972         movzwl  4(%esp), %eax
 973         orl     $255, %eax
 974         ret
 975
 976 instead of:
 977 _foo:
 978         movl    $65280, %eax
 979         andl    4(%esp), %eax
 980         orl     $255, %eax
 981         ret
 982
 983 //===---------------------------------------------------------------------===//
 984
 985 We're codegen'ing multiply of long longs inefficiently:
 986
 987 unsigned long long LLM(unsigned long long arg1, unsigned long long arg2) {
 988   return arg1 *  arg2;
 989 }
 990
 991 We compile to (fomit-frame-pointer):
 992
 993 _LLM:
 994         pushl   %esi
 995         movl    8(%esp), %ecx
 996         movl    16(%esp), %esi
 997         movl    %esi, %eax
 998         mull    %ecx
 999         imull   12(%esp), %esi
1000         addl    %edx, %esi
1001         imull   20(%esp), %ecx
1002         movl    %esi, %edx
1003         addl    %ecx, %edx
1004         popl    %esi
1005         ret
1006
1007 This looks like a scheduling deficiency and lack of remat of the load from
1008 the argument area.  ICC apparently produces:
1009
1010         movl      8(%esp), %ecx
1011         imull     12(%esp), %ecx
1012         movl      16(%esp), %eax
1013         imull     4(%esp), %eax
1014         addl      %eax, %ecx
1015         movl      4(%esp), %eax
1016         mull      12(%esp)
1017         addl      %ecx, %edx
1018         ret
1019
1020 Note that it remat'd loads from 4(esp) and 12(esp).  See this GCC PR:
1021 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=17236
1022
1023 //===---------------------------------------------------------------------===//
1024
1025 We can fold a store into "zeroing a reg".  Instead of:
1026
1027 xorl    %eax, %eax
1028 movl    %eax, 124(%esp)
1029
1030 we should get:
1031
1032 movl    $0, 124(%esp)
1033
1034 if the flags of the xor are dead.
1035
1036 Likewise, we isel "x<<1" into "add reg,reg".  If reg is spilled, this should
1037 be folded into: shl [mem], 1
1038
1039 //===---------------------------------------------------------------------===//
1040
1041 In SSE mode, we turn abs and neg into a load from the constant pool plus a xor
1042 or and instruction, for example:
1043
1044         xorpd   LCPI1_0, %xmm2
1045
1046 However, if xmm2 gets spilled, we end up with really ugly code like this:
1047
1048         movsd   (%esp), %xmm0
1049         xorpd   LCPI1_0, %xmm0
1050         movsd   %xmm0, (%esp)
1051
1052 Since we 'know' that this is a 'neg', we can actually "fold" the spill into
1053 the neg/abs instruction, turning it into an *integer* operation, like this:
1054
1055         xorl 2147483648, [mem+4]     ## 2147483648 = (1 << 31)
1056
1057 you could also use xorb, but xorl is less likely to lead to a partial register
1058 stall.  Here is a contrived testcase:
1059
1060 double a, b, c;
1061 void test(double *P) {
1062   double X = *P;
1063   a = X;
1064   bar();
1065   X = -X;
1066   b = X;
1067   bar();
1068   c = X;
1069 }
1070
1071 //===---------------------------------------------------------------------===//
1072
1073 The generated code on x86 for checking for signed overflow on a multiply the
1074 obvious way is much longer than it needs to be.
1075
1076 int x(int a, int b) {
1077   long long prod = (long long)a*b;
1078   return  prod > 0x7FFFFFFF || prod < (-0x7FFFFFFF-1);
1079 }
1080
1081 See PR2053 for more details.
1082
1083 //===---------------------------------------------------------------------===//
1084
1085 We should investigate using cdq/ctld (effect: edx = sar eax, 31)
1086 more aggressively; it should cost the same as a move+shift on any modern
1087 processor, but it's a lot shorter. Downside is that it puts more
1088 pressure on register allocation because it has fixed operands.
1089
1090 Example:
1091 int abs(int x) {return x < 0 ? -x : x;}
1092
1093 gcc compiles this to the following when using march/mtune=pentium2/3/4/m/etc.:
1094 abs:
1095         movl    4(%esp), %eax
1096         cltd
1097         xorl    %edx, %eax
1098         subl    %edx, %eax
1099         ret
1100
1101 //===---------------------------------------------------------------------===//
1102
1103 Take the following code (from
1104 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=16541):
1105
1106 extern unsigned char first_one[65536];
1107 int FirstOnet(unsigned long long arg1)
1108 {
1109   if (arg1 >> 48)
1110     return (first_one[arg1 >> 48]);
1111   return 0;
1112 }
1113
1114
1115 The following code is currently generated:
1116 FirstOnet:
1117         movl    8(%esp), %eax
1118         cmpl    $65536, %eax
1119         movl    4(%esp), %ecx
1120         jb      .LBB1_2 # UnifiedReturnBlock
1121 .LBB1_1:        # ifthen
1122         shrl    $16, %eax
1123         movzbl  first_one(%eax), %eax
1124         ret
1125 .LBB1_2:        # UnifiedReturnBlock
1126         xorl    %eax, %eax
1127         ret
1128
1129 We could change the "movl 8(%esp), %eax" into "movzwl 10(%esp), %eax"; this
1130 lets us change the cmpl into a testl, which is shorter, and eliminate the shift.
1131
1132 //===---------------------------------------------------------------------===//
1133
1134 We compile this function:
1135
1136 define i32 @foo(i32 %a, i32 %b, i32 %c, i8 zeroext  %d) nounwind  {
1137 entry:
1138         %tmp2 = icmp eq i8 %d, 0                ; <i1> [#uses=1]
1139         br i1 %tmp2, label %bb7, label %bb
1140
1141 bb:             ; preds = %entry
1142         %tmp6 = add i32 %b, %a          ; <i32> [#uses=1]
1143         ret i32 %tmp6
1144
1145 bb7:            ; preds = %entry
1146         %tmp10 = sub i32 %a, %c         ; <i32> [#uses=1]
1147         ret i32 %tmp10
1148 }
1149
1150 to:
1151
1152 foo:                                    # @foo
1153 # BB#0:                                 # %entry
1154         movl    4(%esp), %ecx
1155         cmpb    $0, 16(%esp)
1156         je      .LBB0_2
1157 # BB#1:                                 # %bb
1158         movl    8(%esp), %eax
1159         addl    %ecx, %eax
1160         ret
1161 .LBB0_2:                                # %bb7
1162         movl    12(%esp), %edx
1163         movl    %ecx, %eax
1164         subl    %edx, %eax
1165         ret
1166
1167 There's an obviously unnecessary movl in .LBB0_2, and we could eliminate a
1168 couple more movls by putting 4(%esp) into %eax instead of %ecx.
1169
1170 //===---------------------------------------------------------------------===//
1171
1172 See rdar://4653682.
1173
1174 From flops:
1175
1176 LBB1_15:        # bb310
1177         cvtss2sd        LCPI1_0, %xmm1
1178         addsd   %xmm1, %xmm0
1179         movsd   176(%esp), %xmm2
1180         mulsd   %xmm0, %xmm2
1181         movapd  %xmm2, %xmm3
1182         mulsd   %xmm3, %xmm3
1183         movapd  %xmm3, %xmm4
1184         mulsd   LCPI1_23, %xmm4
1185         addsd   LCPI1_24, %xmm4
1186         mulsd   %xmm3, %xmm4
1187         addsd   LCPI1_25, %xmm4
1188         mulsd   %xmm3, %xmm4
1189         addsd   LCPI1_26, %xmm4
1190         mulsd   %xmm3, %xmm4
1191         addsd   LCPI1_27, %xmm4
1192         mulsd   %xmm3, %xmm4
1193         addsd   LCPI1_28, %xmm4
1194         mulsd   %xmm3, %xmm4
1195         addsd   %xmm1, %xmm4
1196         mulsd   %xmm2, %xmm4
1197         movsd   152(%esp), %xmm1
1198         addsd   %xmm4, %xmm1
1199         movsd   %xmm1, 152(%esp)
1200         incl    %eax
1201         cmpl    %eax, %esi
1202         jge     LBB1_15 # bb310
1203 LBB1_16:        # bb358.loopexit
1204         movsd   152(%esp), %xmm0
1205         addsd   %xmm0, %xmm0
1206         addsd   LCPI1_22, %xmm0
1207         movsd   %xmm0, 152(%esp)
1208
1209 Rather than spilling the result of the last addsd in the loop, we should have
1210 insert a copy to split the interval (one for the duration of the loop, one
1211 extending to the fall through). The register pressure in the loop isn't high
1212 enough to warrant the spill.
1213
1214 Also check why xmm7 is not used at all in the function.
1215
1216 //===---------------------------------------------------------------------===//
1217
1218 Take the following:
1219
1220 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-S128"
1221 target triple = "i386-apple-darwin8"
1222 @in_exit.4870.b = internal global i1 false              ; <i1*> [#uses=2]
1223 define fastcc void @abort_gzip() noreturn nounwind  {
1224 entry:
1225         %tmp.b.i = load i1* @in_exit.4870.b             ; <i1> [#uses=1]
1226         br i1 %tmp.b.i, label %bb.i, label %bb4.i
1227 bb.i:           ; preds = %entry
1228         tail call void @exit( i32 1 ) noreturn nounwind
1229         unreachable
1230 bb4.i:          ; preds = %entry
1231         store i1 true, i1* @in_exit.4870.b
1232         tail call void @exit( i32 1 ) noreturn nounwind
1233         unreachable
1234 }
1235 declare void @exit(i32) noreturn nounwind
1236
1237 This compiles into:
1238 _abort_gzip:                            ## @abort_gzip
1239 ## BB#0:                                ## %entry
1240         subl    $12, %esp
1241         movb    _in_exit.4870.b, %al
1242         cmpb    $1, %al
1243         jne     LBB0_2
1244
1245 We somehow miss folding the movb into the cmpb.
1246
1247 //===---------------------------------------------------------------------===//
1248
1249 We compile:
1250
1251 int test(int x, int y) {
1252   return x-y-1;
1253 }
1254
1255 into (-m64):
1256
1257 _test:
1258         decl    %edi
1259         movl    %edi, %eax
1260         subl    %esi, %eax
1261         ret
1262
1263 it would be better to codegen as: x+~y  (notl+addl)
1264
1265 //===---------------------------------------------------------------------===//
1266
1267 This code:
1268
1269 int foo(const char *str,...)
1270 {
1271  __builtin_va_list a; int x;
1272  __builtin_va_start(a,str); x = __builtin_va_arg(a,int); __builtin_va_end(a);
1273  return x;
1274 }
1275
1276 gets compiled into this on x86-64:
1277         subq    $200, %rsp
1278         movaps  %xmm7, 160(%rsp)
1279         movaps  %xmm6, 144(%rsp)
1280         movaps  %xmm5, 128(%rsp)
1281         movaps  %xmm4, 112(%rsp)
1282         movaps  %xmm3, 96(%rsp)
1283         movaps  %xmm2, 80(%rsp)
1284         movaps  %xmm1, 64(%rsp)
1285         movaps  %xmm0, 48(%rsp)
1286         movq    %r9, 40(%rsp)
1287         movq    %r8, 32(%rsp)
1288         movq    %rcx, 24(%rsp)
1289         movq    %rdx, 16(%rsp)
1290         movq    %rsi, 8(%rsp)
1291         leaq    (%rsp), %rax
1292         movq    %rax, 192(%rsp)
1293         leaq    208(%rsp), %rax
1294         movq    %rax, 184(%rsp)
1295         movl    $48, 180(%rsp)
1296         movl    $8, 176(%rsp)
1297         movl    176(%rsp), %eax
1298         cmpl    $47, %eax
1299         jbe     .LBB1_3 # bb
1300 .LBB1_1:        # bb3
1301         movq    184(%rsp), %rcx
1302         leaq    8(%rcx), %rax
1303         movq    %rax, 184(%rsp)
1304 .LBB1_2:        # bb4
1305         movl    (%rcx), %eax
1306         addq    $200, %rsp
1307         ret
1308 .LBB1_3:        # bb
1309         movl    %eax, %ecx
1310         addl    $8, %eax
1311         addq    192(%rsp), %rcx
1312         movl    %eax, 176(%rsp)
1313         jmp     .LBB1_2 # bb4
1314
1315 gcc 4.3 generates:
1316         subq    $96, %rsp
1317 .LCFI0:
1318         leaq    104(%rsp), %rax
1319         movq    %rsi, -80(%rsp)
1320         movl    $8, -120(%rsp)
1321         movq    %rax, -112(%rsp)
1322         leaq    -88(%rsp), %rax
1323         movq    %rax, -104(%rsp)
1324         movl    $8, %eax
1325         cmpl    $48, %eax
1326         jb      .L6
1327         movq    -112(%rsp), %rdx
1328         movl    (%rdx), %eax
1329         addq    $96, %rsp
1330         ret
1331         .p2align 4,,10
1332         .p2align 3
1333 .L6:
1334         mov     %eax, %edx
1335         addq    -104(%rsp), %rdx
1336         addl    $8, %eax
1337         movl    %eax, -120(%rsp)
1338         movl    (%rdx), %eax
1339         addq    $96, %rsp
1340         ret
1341
1342 and it gets compiled into this on x86:
1343         pushl   %ebp
1344         movl    %esp, %ebp
1345         subl    $4, %esp
1346         leal    12(%ebp), %eax
1347         movl    %eax, -4(%ebp)
1348         leal    16(%ebp), %eax
1349         movl    %eax, -4(%ebp)
1350         movl    12(%ebp), %eax
1351         addl    $4, %esp
1352         popl    %ebp
1353         ret
1354
1355 gcc 4.3 generates:
1356         pushl   %ebp
1357         movl    %esp, %ebp
1358         movl    12(%ebp), %eax
1359         popl    %ebp
1360         ret
1361
1362 //===---------------------------------------------------------------------===//
1363
1364 Teach tblgen not to check bitconvert source type in some cases. This allows us
1365 to consolidate the following patterns in X86InstrMMX.td:
1366
1367 def : Pat<(v2i32 (bitconvert (i64 (vector_extract (v2i64 VR128:$src),
1368                                                   (iPTR 0))))),
1369           (v2i32 (MMX_MOVDQ2Qrr VR128:$src))>;
1370 def : Pat<(v4i16 (bitconvert (i64 (vector_extract (v2i64 VR128:$src),
1371                                                   (iPTR 0))))),
1372           (v4i16 (MMX_MOVDQ2Qrr VR128:$src))>;
1373 def : Pat<(v8i8 (bitconvert (i64 (vector_extract (v2i64 VR128:$src),
1374                                                   (iPTR 0))))),
1375           (v8i8 (MMX_MOVDQ2Qrr VR128:$src))>;
1376
1377 There are other cases in various td files.
1378
1379 //===---------------------------------------------------------------------===//
1380
1381 Take something like the following on x86-32:
1382 unsigned a(unsigned long long x, unsigned y) {return x % y;}
1383
1384 We currently generate a libcall, but we really shouldn't: the expansion is
1385 shorter and likely faster than the libcall.  The expected code is something
1386 like the following:
1387
1388         movl    12(%ebp), %eax
1389         movl    16(%ebp), %ecx
1390         xorl    %edx, %edx
1391         divl    %ecx
1392         movl    8(%ebp), %eax
1393         divl    %ecx
1394         movl    %edx, %eax
1395         ret
1396
1397 A similar code sequence works for division.
1398
1399 //===---------------------------------------------------------------------===//
1400
1401 These should compile to the same code, but the later codegen's to useless
1402 instructions on X86. This may be a trivial dag combine (GCC PR7061):
1403
1404 struct s1 { unsigned char a, b; };
1405 unsigned long f1(struct s1 x) {
1406     return x.a + x.b;
1407 }
1408 struct s2 { unsigned a: 8, b: 8; };
1409 unsigned long f2(struct s2 x) {
1410     return x.a + x.b;
1411 }
1412
1413 //===---------------------------------------------------------------------===//
1414
1415 We currently compile this:
1416
1417 define i32 @func1(i32 %v1, i32 %v2) nounwind {
1418 entry:
1419   %t = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %v1, i32 %v2)
1420   %sum = extractvalue {i32, i1} %t, 0
1421   %obit = extractvalue {i32, i1} %t, 1
1422   br i1 %obit, label %overflow, label %normal
1423 normal:
1424   ret i32 %sum
1425 overflow:
1426   call void @llvm.trap()
1427   unreachable
1428 }
1429 declare {i32, i1} @llvm.sadd.with.overflow.i32(i32, i32)
1430 declare void @llvm.trap()
1431
1432 to:
1433
1434 _func1:
1435         movl    4(%esp), %eax
1436         addl    8(%esp), %eax
1437         jo      LBB1_2  ## overflow
1438 LBB1_1: ## normal
1439         ret
1440 LBB1_2: ## overflow
1441         ud2
1442
1443 it would be nice to produce "into" someday.
1444
1445 //===---------------------------------------------------------------------===//
1446
1447 This code:
1448
1449 void vec_mpys1(int y[], const int x[], int scaler) {
1450 int i;
1451 for (i = 0; i < 150; i++)
1452  y[i] += (((long long)scaler * (long long)x[i]) >> 31);
1453 }
1454
1455 Compiles to this loop with GCC 3.x:
1456
1457 .L5:
1458         movl    %ebx, %eax
1459         imull   (%edi,%ecx,4)
1460         shrdl   $31, %edx, %eax
1461         addl    %eax, (%esi,%ecx,4)
1462         incl    %ecx
1463         cmpl    $149, %ecx
1464         jle     .L5
1465
1466 llvm-gcc compiles it to the much uglier:
1467
1468 LBB1_1: ## bb1
1469         movl    24(%esp), %eax
1470         movl    (%eax,%edi,4), %ebx
1471         movl    %ebx, %ebp
1472         imull   %esi, %ebp
1473         movl    %ebx, %eax
1474         mull    %ecx
1475         addl    %ebp, %edx
1476         sarl    $31, %ebx
1477         imull   %ecx, %ebx
1478         addl    %edx, %ebx
1479         shldl   $1, %eax, %ebx
1480         movl    20(%esp), %eax
1481         addl    %ebx, (%eax,%edi,4)
1482         incl    %edi
1483         cmpl    $150, %edi
1484         jne     LBB1_1  ## bb1
1485
1486 The issue is that we hoist the cast of "scaler" to long long outside of the
1487 loop, the value comes into the loop as two values, and
1488 RegsForValue::getCopyFromRegs doesn't know how to put an AssertSext on the
1489 constructed BUILD_PAIR which represents the cast value.
1490
1491 This can be handled by making CodeGenPrepare sink the cast.
1492
1493 //===---------------------------------------------------------------------===//
1494
1495 Test instructions can be eliminated by using EFLAGS values from arithmetic
1496 instructions. This is currently not done for mul, and, or, xor, neg, shl,
1497 sra, srl, shld, shrd, atomic ops, and others. It is also currently not done
1498 for read-modify-write instructions. It is also current not done if the
1499 OF or CF flags are needed.
1500
1501 The shift operators have the complication that when the shift count is
1502 zero, EFLAGS is not set, so they can only subsume a test instruction if
1503 the shift count is known to be non-zero. Also, using the EFLAGS value
1504 from a shift is apparently very slow on some x86 implementations.
1505
1506 In read-modify-write instructions, the root node in the isel match is
1507 the store, and isel has no way for the use of the EFLAGS result of the
1508 arithmetic to be remapped to the new node.
1509
1510 Add and subtract instructions set OF on signed overflow and CF on unsiged
1511 overflow, while test instructions always clear OF and CF. In order to
1512 replace a test with an add or subtract in a situation where OF or CF is
1513 needed, codegen must be able to prove that the operation cannot see
1514 signed or unsigned overflow, respectively.
1515
1516 //===---------------------------------------------------------------------===//
1517
1518 memcpy/memmove do not lower to SSE copies when possible.  A silly example is:
1519 define <16 x float> @foo(<16 x float> %A) nounwind {
1520         %tmp = alloca <16 x float>, align 16
1521         %tmp2 = alloca <16 x float>, align 16
1522         store <16 x float> %A, <16 x float>* %tmp
1523         %s = bitcast <16 x float>* %tmp to i8*
1524         %s2 = bitcast <16 x float>* %tmp2 to i8*
1525         call void @llvm.memcpy.i64(i8* %s, i8* %s2, i64 64, i32 16)
1526         %R = load <16 x float>* %tmp2
1527         ret <16 x float> %R
1528 }
1529
1530 declare void @llvm.memcpy.i64(i8* nocapture, i8* nocapture, i64, i32) nounwind
1531
1532 which compiles to:
1533
1534 _foo:
1535         subl    $140, %esp
1536         movaps  %xmm3, 112(%esp)
1537         movaps  %xmm2, 96(%esp)
1538         movaps  %xmm1, 80(%esp)
1539         movaps  %xmm0, 64(%esp)
1540         movl    60(%esp), %eax
1541         movl    %eax, 124(%esp)
1542         movl    56(%esp), %eax
1543         movl    %eax, 120(%esp)
1544         movl    52(%esp), %eax
1545         <many many more 32-bit copies>
1546         movaps  (%esp), %xmm0
1547         movaps  16(%esp), %xmm1
1548         movaps  32(%esp), %xmm2
1549         movaps  48(%esp), %xmm3
1550         addl    $140, %esp
1551         ret
1552
1553 On Nehalem, it may even be cheaper to just use movups when unaligned than to
1554 fall back to lower-granularity chunks.
1555
1556 //===---------------------------------------------------------------------===//
1557
1558 Implement processor-specific optimizations for parity with GCC on these
1559 processors.  GCC does two optimizations:
1560
1561 1. ix86_pad_returns inserts a noop before ret instructions if immediately
1562    preceded by a conditional branch or is the target of a jump.
1563 2. ix86_avoid_jump_misspredicts inserts noops in cases where a 16-byte block of
1564    code contains more than 3 branches.
1565
1566 The first one is done for all AMDs, Core2, and "Generic"
1567 The second one is done for: Atom, Pentium Pro, all AMDs, Pentium 4, Nocona,
1568   Core 2, and "Generic"
1569
1570 //===---------------------------------------------------------------------===//
1571 Testcase:
1572 int x(int a) { return (a&0xf0)>>4; }
1573
1574 Current output:
1575         movl    4(%esp), %eax
1576         shrl    $4, %eax
1577         andl    $15, %eax
1578         ret
1579
1580 Ideal output:
1581         movzbl  4(%esp), %eax
1582         shrl    $4, %eax
1583         ret
1584
1585 //===---------------------------------------------------------------------===//
1586
1587 Re-implement atomic builtins __sync_add_and_fetch() and __sync_sub_and_fetch
1588 properly.
1589
1590 When the return value is not used (i.e. only care about the value in the
1591 memory), x86 does not have to use add to implement these. Instead, it can use
1592 add, sub, inc, dec instructions with the "lock" prefix.
1593
1594 This is currently implemented using a bit of instruction selection trick. The
1595 issue is the target independent pattern produces one output and a chain and we
1596 want to map it into one that just output a chain. The current trick is to select
1597 it into a MERGE_VALUES with the first definition being an implicit_def. The
1598 proper solution is to add new ISD opcodes for the no-output variant. DAG
1599 combiner can then transform the node before it gets to target node selection.
1600
1601 Problem #2 is we are adding a whole bunch of x86 atomic instructions when in
1602 fact these instructions are identical to the non-lock versions. We need a way to
1603 add target specific information to target nodes and have this information
1604 carried over to machine instructions. Asm printer (or JIT) can use this
1605 information to add the "lock" prefix.
1606
1607 //===---------------------------------------------------------------------===//
1608
1609 struct B {
1610   unsigned char y0 : 1;
1611 };
1612
1613 int bar(struct B* a) { return a->y0; }
1614
1615 define i32 @bar(%struct.B* nocapture %a) nounwind readonly optsize {
1616   %1 = getelementptr inbounds %struct.B* %a, i64 0, i32 0
1617   %2 = load i8* %1, align 1
1618   %3 = and i8 %2, 1
1619   %4 = zext i8 %3 to i32
1620   ret i32 %4
1621 }
1622
1623 bar:                                    # @bar
1624 # BB#0:
1625         movb    (%rdi), %al
1626         andb    $1, %al
1627         movzbl  %al, %eax
1628         ret
1629
1630 Missed optimization: should be movl+andl.
1631
1632 //===---------------------------------------------------------------------===//
1633
1634 The x86_64 abi says:
1635
1636 Booleans, when stored in a memory object, are stored as single byte objects the
1637 value of which is always 0 (false) or 1 (true).
1638
1639 We are not using this fact:
1640
1641 int bar(_Bool *a) { return *a; }
1642
1643 define i32 @bar(i8* nocapture %a) nounwind readonly optsize {
1644   %1 = load i8* %a, align 1, !tbaa !0
1645   %tmp = and i8 %1, 1
1646   %2 = zext i8 %tmp to i32
1647   ret i32 %2
1648 }
1649
1650 bar:
1651         movb    (%rdi), %al
1652         andb    $1, %al
1653         movzbl  %al, %eax
1654         ret
1655
1656 GCC produces
1657
1658 bar:
1659         movzbl  (%rdi), %eax
1660         ret
1661
1662 //===---------------------------------------------------------------------===//
1663
1664 Consider the following two functions compiled with clang:
1665 _Bool foo(int *x) { return !(*x & 4); }
1666 unsigned bar(int *x) { return !(*x & 4); }
1667
1668 foo:
1669         movl    4(%esp), %eax
1670         testb   $4, (%eax)
1671         sete    %al
1672         movzbl  %al, %eax
1673         ret
1674
1675 bar:
1676         movl    4(%esp), %eax
1677         movl    (%eax), %eax
1678         shrl    $2, %eax
1679         andl    $1, %eax
1680         xorl    $1, %eax
1681         ret
1682
1683 The second function generates more code even though the two functions are
1684 are functionally identical.
1685
1686 //===---------------------------------------------------------------------===//
1687
1688 Take the following C code:
1689 int f(int a, int b) { return (unsigned char)a == (unsigned char)b; }
1690
1691 We generate the following IR with clang:
1692 define i32 @f(i32 %a, i32 %b) nounwind readnone {
1693 entry:
1694   %tmp = xor i32 %b, %a                           ; <i32> [#uses=1]
1695   %tmp6 = and i32 %tmp, 255                       ; <i32> [#uses=1]
1696   %cmp = icmp eq i32 %tmp6, 0                     ; <i1> [#uses=1]
1697   %conv5 = zext i1 %cmp to i32                    ; <i32> [#uses=1]
1698   ret i32 %conv5
1699 }
1700
1701 And the following x86 code:
1702         xorl    %esi, %edi
1703         testb   $-1, %dil
1704         sete    %al
1705         movzbl  %al, %eax
1706         ret
1707
1708 A cmpb instead of the xorl+testb would be one instruction shorter.
1709
1710 //===---------------------------------------------------------------------===//
1711
1712 Given the following C code:
1713 int f(int a, int b) { return (signed char)a == (signed char)b; }
1714
1715 We generate the following IR with clang:
1716 define i32 @f(i32 %a, i32 %b) nounwind readnone {
1717 entry:
1718   %sext = shl i32 %a, 24                          ; <i32> [#uses=1]
1719   %conv1 = ashr i32 %sext, 24                     ; <i32> [#uses=1]
1720   %sext6 = shl i32 %b, 24                         ; <i32> [#uses=1]
1721   %conv4 = ashr i32 %sext6, 24                    ; <i32> [#uses=1]
1722   %cmp = icmp eq i32 %conv1, %conv4               ; <i1> [#uses=1]
1723   %conv5 = zext i1 %cmp to i32                    ; <i32> [#uses=1]
1724   ret i32 %conv5
1725 }
1726
1727 And the following x86 code:
1728         movsbl  %sil, %eax
1729         movsbl  %dil, %ecx
1730         cmpl    %eax, %ecx
1731         sete    %al
1732         movzbl  %al, %eax
1733         ret
1734
1735
1736 It should be possible to eliminate the sign extensions.
1737
1738 //===---------------------------------------------------------------------===//
1739
1740 LLVM misses a load+store narrowing opportunity in this code:
1741
1742 %struct.bf = type { i64, i16, i16, i32 }
1743
1744 @bfi = external global %struct.bf*                ; <%struct.bf**> [#uses=2]
1745
1746 define void @t1() nounwind ssp {
1747 entry:
1748   %0 = load %struct.bf** @bfi, align 8            ; <%struct.bf*> [#uses=1]
1749   %1 = getelementptr %struct.bf* %0, i64 0, i32 1 ; <i16*> [#uses=1]
1750   %2 = bitcast i16* %1 to i32*                    ; <i32*> [#uses=2]
1751   %3 = load i32* %2, align 1                      ; <i32> [#uses=1]
1752   %4 = and i32 %3, -65537                         ; <i32> [#uses=1]
1753   store i32 %4, i32* %2, align 1
1754   %5 = load %struct.bf** @bfi, align 8            ; <%struct.bf*> [#uses=1]
1755   %6 = getelementptr %struct.bf* %5, i64 0, i32 1 ; <i16*> [#uses=1]
1756   %7 = bitcast i16* %6 to i32*                    ; <i32*> [#uses=2]
1757   %8 = load i32* %7, align 1                      ; <i32> [#uses=1]
1758   %9 = and i32 %8, -131073                        ; <i32> [#uses=1]
1759   store i32 %9, i32* %7, align 1
1760   ret void
1761 }
1762
1763 LLVM currently emits this:
1764
1765   movq  bfi(%rip), %rax
1766   andl  $-65537, 8(%rax)
1767   movq  bfi(%rip), %rax
1768   andl  $-131073, 8(%rax)
1769   ret
1770
1771 It could narrow the loads and stores to emit this:
1772
1773   movq  bfi(%rip), %rax
1774   andb  $-2, 10(%rax)
1775   movq  bfi(%rip), %rax
1776   andb  $-3, 10(%rax)
1777   ret
1778
1779 The trouble is that there is a TokenFactor between the store and the
1780 load, making it non-trivial to determine if there's anything between
1781 the load and the store which would prohibit narrowing.
1782
1783 //===---------------------------------------------------------------------===//
1784
1785 This code:
1786 void foo(unsigned x) {
1787   if (x == 0) bar();
1788   else if (x == 1) qux();
1789 }
1790
1791 currently compiles into:
1792 _foo:
1793         movl    4(%esp), %eax
1794         cmpl    $1, %eax
1795         je      LBB0_3
1796         testl   %eax, %eax
1797         jne     LBB0_4
1798
1799 the testl could be removed:
1800 _foo:
1801         movl    4(%esp), %eax
1802         cmpl    $1, %eax
1803         je      LBB0_3
1804         jb      LBB0_4
1805
1806 0 is the only unsigned number < 1.
1807
1808 //===---------------------------------------------------------------------===//
1809
1810 This code:
1811
1812 %0 = type { i32, i1 }
1813
1814 define i32 @add32carry(i32 %sum, i32 %x) nounwind readnone ssp {
1815 entry:
1816   %uadd = tail call %0 @llvm.uadd.with.overflow.i32(i32 %sum, i32 %x)
1817   %cmp = extractvalue %0 %uadd, 1
1818   %inc = zext i1 %cmp to i32
1819   %add = add i32 %x, %sum
1820   %z.0 = add i32 %add, %inc
1821   ret i32 %z.0
1822 }
1823
1824 declare %0 @llvm.uadd.with.overflow.i32(i32, i32) nounwind readnone
1825
1826 compiles to:
1827
1828 _add32carry:                            ## @add32carry
1829         addl    %esi, %edi
1830         sbbl    %ecx, %ecx
1831         movl    %edi, %eax
1832         subl    %ecx, %eax
1833         ret
1834
1835 But it could be:
1836
1837 _add32carry:
1838         leal    (%rsi,%rdi), %eax
1839         cmpl    %esi, %eax
1840         adcl    $0, %eax
1841         ret
1842
1843 //===---------------------------------------------------------------------===//
1844
1845 The hot loop of 256.bzip2 contains code that looks a bit like this:
1846
1847 int foo(char *P, char *Q, int x, int y) {
1848   if (P[0] != Q[0])
1849      return P[0] < Q[0];
1850   if (P[1] != Q[1])
1851      return P[1] < Q[1];
1852   if (P[2] != Q[2])
1853      return P[2] < Q[2];
1854    return P[3] < Q[3];
1855 }
1856
1857 In the real code, we get a lot more wrong than this.  However, even in this
1858 code we generate:
1859
1860 _foo:                                   ## @foo
1861 ## BB#0:                                ## %entry
1862         movb    (%rsi), %al
1863         movb    (%rdi), %cl
1864         cmpb    %al, %cl
1865         je      LBB0_2
1866 LBB0_1:                                 ## %if.then
1867         cmpb    %al, %cl
1868         jmp     LBB0_5
1869 LBB0_2:                                 ## %if.end
1870         movb    1(%rsi), %al
1871         movb    1(%rdi), %cl
1872         cmpb    %al, %cl
1873         jne     LBB0_1
1874 ## BB#3:                                ## %if.end38
1875         movb    2(%rsi), %al
1876         movb    2(%rdi), %cl
1877         cmpb    %al, %cl
1878         jne     LBB0_1
1879 ## BB#4:                                ## %if.end60
1880         movb    3(%rdi), %al
1881         cmpb    3(%rsi), %al
1882 LBB0_5:                                 ## %if.end60
1883         setl    %al
1884         movzbl  %al, %eax
1885         ret
1886
1887 Note that we generate jumps to LBB0_1 which does a redundant compare.  The
1888 redundant compare also forces the register values to be live, which prevents
1889 folding one of the loads into the compare.  In contrast, GCC 4.2 produces:
1890
1891 _foo:
1892         movzbl  (%rsi), %eax
1893         cmpb    %al, (%rdi)
1894         jne     L10
1895 L12:
1896         movzbl  1(%rsi), %eax
1897         cmpb    %al, 1(%rdi)
1898         jne     L10
1899         movzbl  2(%rsi), %eax
1900         cmpb    %al, 2(%rdi)
1901         jne     L10
1902         movzbl  3(%rdi), %eax
1903         cmpb    3(%rsi), %al
1904 L10:
1905         setl    %al
1906         movzbl  %al, %eax
1907         ret
1908
1909 which is "perfect".
1910
1911 //===---------------------------------------------------------------------===//
1912
1913 For the branch in the following code:
1914 int a();
1915 int b(int x, int y) {
1916   if (x & (1<<(y&7)))
1917     return a();
1918   return y;
1919 }
1920
1921 We currently generate:
1922         movb    %sil, %al
1923         andb    $7, %al
1924         movzbl  %al, %eax
1925         btl     %eax, %edi
1926         jae     .LBB0_2
1927
1928 movl+andl would be shorter than the movb+andb+movzbl sequence.
1929
1930 //===---------------------------------------------------------------------===//
1931
1932 For the following:
1933 struct u1 {
1934     float x, y;
1935 };
1936 float foo(struct u1 u) {
1937     return u.x + u.y;
1938 }
1939
1940 We currently generate:
1941         movdqa  %xmm0, %xmm1
1942         pshufd  $1, %xmm0, %xmm0        # xmm0 = xmm0[1,0,0,0]
1943         addss   %xmm1, %xmm0
1944         ret
1945
1946 We could save an instruction here by commuting the addss.
1947
1948 //===---------------------------------------------------------------------===//
1949
1950 This (from PR9661):
1951
1952 float clamp_float(float a) {
1953         if (a > 1.0f)
1954                 return 1.0f;
1955         else if (a < 0.0f)
1956                 return 0.0f;
1957         else
1958                 return a;
1959 }
1960
1961 Could compile to:
1962
1963 clamp_float:                            # @clamp_float
1964         movss   .LCPI0_0(%rip), %xmm1
1965         minss   %xmm1, %xmm0
1966         pxor    %xmm1, %xmm1
1967         maxss   %xmm1, %xmm0
1968         ret
1969
1970 with -ffast-math.
1971
1972 //===---------------------------------------------------------------------===//
1973
1974 This function (from PR9803):
1975
1976 int clamp2(int a) {
1977         if (a > 5)
1978                 a = 5;
1979         if (a < 0)
1980                 return 0;
1981         return a;
1982 }
1983
1984 Compiles to:
1985
1986 _clamp2:                                ## @clamp2
1987         pushq   %rbp
1988         movq    %rsp, %rbp
1989         cmpl    $5, %edi
1990         movl    $5, %ecx
1991         cmovlel %edi, %ecx
1992         testl   %ecx, %ecx
1993         movl    $0, %eax
1994         cmovnsl %ecx, %eax
1995         popq    %rbp
1996         ret
1997
1998 The move of 0 could be scheduled above the test to make it is xor reg,reg.
1999
2000 //===---------------------------------------------------------------------===//
2001
2002 GCC PR48986.  We currently compile this:
2003
2004 void bar(void);
2005 void yyy(int* p) {
2006     if (__sync_fetch_and_add(p, -1) == 1)
2007       bar();
2008 }
2009
2010 into:
2011         movl    $-1, %eax
2012         lock
2013         xaddl   %eax, (%rdi)
2014         cmpl    $1, %eax
2015         je      LBB0_2
2016
2017 Instead we could generate:
2018
2019         lock
2020         dec %rdi
2021         je LBB0_2
2022
2023 The trick is to match "fetch_and_add(X, -C) == C".
2024
2025 //===---------------------------------------------------------------------===//
2026
2027 unsigned t(unsigned a, unsigned b) {
2028   return a <= b ? 5 : -5;
2029 }
2030
2031 We generate:
2032         movl    $5, %ecx
2033         cmpl    %esi, %edi
2034         movl    $-5, %eax
2035         cmovbel %ecx, %eax
2036
2037 GCC:
2038         cmpl    %edi, %esi
2039         sbbl    %eax, %eax
2040         andl    $-10, %eax
2041         addl    $5, %eax
2042
2043 //===---------------------------------------------------------------------===//