lib/Target/X86/README.txt

   1 //===---------------------------------------------------------------------===//
   2 // Random ideas for the X86 backend.
   3 //===---------------------------------------------------------------------===//
   4
   5 Missing features:
   6   - Support for SSE4: http://www.intel.com/software/penryn
   7 http://softwarecommunity.intel.com/isn/Downloads/Intel%20SSE4%20Programming%20Reference.pdf
   8   - support for 3DNow!
   9   - weird abis?
  10
  11 //===---------------------------------------------------------------------===//
  12
  13 CodeGen/X86/lea-3.ll:test3 should be a single LEA, not a shift/move.  The X86
  14 backend knows how to three-addressify this shift, but it appears the register
  15 allocator isn't even asking it to do so in this case.  We should investigate
  16 why this isn't happening, it could have significant impact on other important
  17 cases for X86 as well.
  18
  19 //===---------------------------------------------------------------------===//
  20
  21 This should be one DIV/IDIV instruction, not a libcall:
  22
  23 unsigned test(unsigned long long X, unsigned Y) {
  24         return X/Y;
  25 }
  26
  27 This can be done trivially with a custom legalizer.  What about overflow
  28 though?  http://gcc.gnu.org/bugzilla/show_bug.cgi?id=14224
  29
  30 //===---------------------------------------------------------------------===//
  31
  32 Improvements to the multiply -> shift/add algorithm:
  33 http://gcc.gnu.org/ml/gcc-patches/2004-08/msg01590.html
  34
  35 //===---------------------------------------------------------------------===//
  36
  37 Improve code like this (occurs fairly frequently, e.g. in LLVM):
  38 long long foo(int x) { return 1LL << x; }
  39
  40 http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01109.html
  41 http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01128.html
  42 http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01136.html
  43
  44 Another useful one would be  ~0ULL >> X and ~0ULL << X.
  45
  46 One better solution for 1LL << x is:
  47         xorl    %eax, %eax
  48         xorl    %edx, %edx
  49         testb   $32, %cl
  50         sete    %al
  51         setne   %dl
  52         sall    %cl, %eax
  53         sall    %cl, %edx
  54
  55 But that requires good 8-bit subreg support.
  56
  57 64-bit shifts (in general) expand to really bad code.  Instead of using
  58 cmovs, we should expand to a conditional branch like GCC produces.
  59
  60 //===---------------------------------------------------------------------===//
  61
  62 Compile this:
  63 _Bool f(_Bool a) { return a!=1; }
  64
  65 into:
  66         movzbl  %dil, %eax
  67         xorl    $1, %eax
  68         ret
  69
  70 //===---------------------------------------------------------------------===//
  71
  72 Some isel ideas:
  73
  74 1. Dynamic programming based approach when compile time if not an
  75    issue.
  76 2. Code duplication (addressing mode) during isel.
  77 3. Other ideas from "Register-Sensitive Selection, Duplication, and
  78    Sequencing of Instructions".
  79 4. Scheduling for reduced register pressure.  E.g. "Minimum Register
  80    Instruction Sequence Problem: Revisiting Optimal Code Generation for DAGs"
  81    and other related papers.
  82    http://citeseer.ist.psu.edu/govindarajan01minimum.html
  83
  84 //===---------------------------------------------------------------------===//
  85
  86 Should we promote i16 to i32 to avoid partial register update stalls?
  87
  88 //===---------------------------------------------------------------------===//
  89
  90 Leave any_extend as pseudo instruction and hint to register
  91 allocator. Delay codegen until post register allocation.
  92 Note. any_extend is now turned into an INSERT_SUBREG. We still need to teach
  93 the coalescer how to deal with it though.
  94
  95 //===---------------------------------------------------------------------===//
  96
  97 Count leading zeros and count trailing zeros:
  98
  99 int clz(int X) { return __builtin_clz(X); }
 100 int ctz(int X) { return __builtin_ctz(X); }
 101
 102 $ gcc t.c -S -o - -O3  -fomit-frame-pointer -masm=intel
 103 clz:
 104         bsr     %eax, DWORD PTR [%esp+4]
 105         xor     %eax, 31
 106         ret
 107 ctz:
 108         bsf     %eax, DWORD PTR [%esp+4]
 109         ret
 110
 111 however, check that these are defined for 0 and 32.  Our intrinsics are, GCC's
 112 aren't.
 113
 114 Another example (use predsimplify to eliminate a select):
 115
 116 int foo (unsigned long j) {
 117   if (j)
 118     return __builtin_ffs (j) - 1;
 119   else
 120     return 0;
 121 }
 122
 123 //===---------------------------------------------------------------------===//
 124
 125 It appears icc use push for parameter passing. Need to investigate.
 126
 127 //===---------------------------------------------------------------------===//
 128
 129 Only use inc/neg/not instructions on processors where they are faster than
 130 add/sub/xor.  They are slower on the P4 due to only updating some processor
 131 flags.
 132
 133 //===---------------------------------------------------------------------===//
 134
 135 The instruction selector sometimes misses folding a load into a compare.  The
 136 pattern is written as (cmp reg, (load p)).  Because the compare isn't
 137 commutative, it is not matched with the load on both sides.  The dag combiner
 138 should be made smart enough to cannonicalize the load into the RHS of a compare
 139 when it can invert the result of the compare for free.
 140
 141 //===---------------------------------------------------------------------===//
 142
 143 How about intrinsics? An example is:
 144   *res = _mm_mulhi_epu16(*A, _mm_mul_epu32(*B, *C));
 145
 146 compiles to
 147         pmuludq (%eax), %xmm0
 148         movl 8(%esp), %eax
 149         movdqa (%eax), %xmm1
 150         pmulhuw %xmm0, %xmm1
 151
 152 The transformation probably requires a X86 specific pass or a DAG combiner
 153 target specific hook.
 154
 155 //===---------------------------------------------------------------------===//
 156
 157 In many cases, LLVM generates code like this:
 158
 159 _test:
 160         movl 8(%esp), %eax
 161         cmpl %eax, 4(%esp)
 162         setl %al
 163         movzbl %al, %eax
 164         ret
 165
 166 on some processors (which ones?), it is more efficient to do this:
 167
 168 _test:
 169         movl 8(%esp), %ebx
 170         xor  %eax, %eax
 171         cmpl %ebx, 4(%esp)
 172         setl %al
 173         ret
 174
 175 Doing this correctly is tricky though, as the xor clobbers the flags.
 176
 177 //===---------------------------------------------------------------------===//
 178
 179 We should generate bts/btr/etc instructions on targets where they are cheap or
 180 when codesize is important.  e.g., for:
 181
 182 void setbit(int *target, int bit) {
 183     *target |= (1 << bit);
 184 }
 185 void clearbit(int *target, int bit) {
 186     *target &= ~(1 << bit);
 187 }
 188
 189 //===---------------------------------------------------------------------===//
 190
 191 Instead of the following for memset char*, 1, 10:
 192
 193         movl $16843009, 4(%edx)
 194         movl $16843009, (%edx)
 195         movw $257, 8(%edx)
 196
 197 It might be better to generate
 198
 199         movl $16843009, %eax
 200         movl %eax, 4(%edx)
 201         movl %eax, (%edx)
 202         movw al, 8(%edx)
 203
 204 when we can spare a register. It reduces code size.
 205
 206 //===---------------------------------------------------------------------===//
 207
 208 Evaluate what the best way to codegen sdiv X, (2^C) is.  For X/8, we currently
 209 get this:
 210
 211 int %test1(int %X) {
 212         %Y = div int %X, 8
 213         ret int %Y
 214 }
 215
 216 _test1:
 217         movl 4(%esp), %eax
 218         movl %eax, %ecx
 219         sarl $31, %ecx
 220         shrl $29, %ecx
 221         addl %ecx, %eax
 222         sarl $3, %eax
 223         ret
 224
 225 GCC knows several different ways to codegen it, one of which is this:
 226
 227 _test1:
 228         movl    4(%esp), %eax
 229         cmpl    $-1, %eax
 230         leal    7(%eax), %ecx
 231         cmovle  %ecx, %eax
 232         sarl    $3, %eax
 233         ret
 234
 235 which is probably slower, but it's interesting at least :)
 236
 237 //===---------------------------------------------------------------------===//
 238
 239 The first BB of this code:
 240
 241 declare bool %foo()
 242 int %bar() {
 243         %V = call bool %foo()
 244         br bool %V, label %T, label %F
 245 T:
 246         ret int 1
 247 F:
 248         call bool %foo()
 249         ret int 12
 250 }
 251
 252 compiles to:
 253
 254 _bar:
 255         subl $12, %esp
 256         call L_foo$stub
 257         xorb $1, %al
 258         testb %al, %al
 259         jne LBB_bar_2   # F
 260
 261 It would be better to emit "cmp %al, 1" than a xor and test.
 262
 263 //===---------------------------------------------------------------------===//
 264
 265 We are currently lowering large (1MB+) memmove/memcpy to rep/stosl and rep/movsl
 266 We should leave these as libcalls for everything over a much lower threshold,
 267 since libc is hand tuned for medium and large mem ops (avoiding RFO for large
 268 stores, TLB preheating, etc)
 269
 270 //===---------------------------------------------------------------------===//
 271
 272 Optimize this into something reasonable:
 273  x * copysign(1.0, y) * copysign(1.0, z)
 274
 275 //===---------------------------------------------------------------------===//
 276
 277 Optimize copysign(x, *y) to use an integer load from y.
 278
 279 //===---------------------------------------------------------------------===//
 280
 281 %X = weak global int 0
 282
 283 void %foo(int %N) {
 284         %N = cast int %N to uint
 285         %tmp.24 = setgt int %N, 0
 286         br bool %tmp.24, label %no_exit, label %return
 287
 288 no_exit:
 289         %indvar = phi uint [ 0, %entry ], [ %indvar.next, %no_exit ]
 290         %i.0.0 = cast uint %indvar to int
 291         volatile store int %i.0.0, int* %X
 292         %indvar.next = add uint %indvar, 1
 293         %exitcond = seteq uint %indvar.next, %N
 294         br bool %exitcond, label %return, label %no_exit
 295
 296 return:
 297         ret void
 298 }
 299
 300 compiles into:
 301
 302         .text
 303         .align  4
 304         .globl  _foo
 305 _foo:
 306         movl 4(%esp), %eax
 307         cmpl $1, %eax
 308         jl LBB_foo_4    # return
 309 LBB_foo_1:      # no_exit.preheader
 310         xorl %ecx, %ecx
 311 LBB_foo_2:      # no_exit
 312         movl L_X$non_lazy_ptr, %edx
 313         movl %ecx, (%edx)
 314         incl %ecx
 315         cmpl %eax, %ecx
 316         jne LBB_foo_2   # no_exit
 317 LBB_foo_3:      # return.loopexit
 318 LBB_foo_4:      # return
 319         ret
 320
 321 We should hoist "movl L_X$non_lazy_ptr, %edx" out of the loop after
 322 remateralization is implemented. This can be accomplished with 1) a target
 323 dependent LICM pass or 2) makeing SelectDAG represent the whole function.
 324
 325 //===---------------------------------------------------------------------===//
 326
 327 The following tests perform worse with LSR:
 328
 329 lambda, siod, optimizer-eval, ackermann, hash2, nestedloop, strcat, and Treesor.
 330
 331 //===---------------------------------------------------------------------===//
 332
 333 We are generating far worse code than gcc:
 334
 335 volatile short X, Y;
 336
 337 void foo(int N) {
 338   int i;
 339   for (i = 0; i < N; i++) { X = i; Y = i*4; }
 340 }
 341
 342 LBB1_1: #bb.preheader
 343         xorl %ecx, %ecx
 344         xorw %dx, %dx
 345 LBB1_2: #bb
 346         movl L_X$non_lazy_ptr, %esi
 347         movw %dx, (%esi)
 348         movw %dx, %si
 349         shlw $2, %si
 350         movl L_Y$non_lazy_ptr, %edi
 351         movw %si, (%edi)
 352         incl %ecx
 353         incw %dx
 354         cmpl %eax, %ecx
 355         jne LBB1_2      #bb
 356
 357 vs.
 358
 359         xorl    %edx, %edx
 360         movl    L_X$non_lazy_ptr-"L00000000001$pb"(%ebx), %esi
 361         movl    L_Y$non_lazy_ptr-"L00000000001$pb"(%ebx), %ecx
 362 L4:
 363         movw    %dx, (%esi)
 364         leal    0(,%edx,4), %eax
 365         movw    %ax, (%ecx)
 366         addl    $1, %edx
 367         cmpl    %edx, %edi
 368         jne     L4
 369
 370 There are 3 issues:
 371
 372 1. Lack of post regalloc LICM.
 373 2. LSR unable to reused IV for a different type (i16 vs. i32) even though
 374    the cast would be free.
 375
 376 //===---------------------------------------------------------------------===//
 377
 378 Teach the coalescer to coalesce vregs of different register classes. e.g. FR32 /
 379 FR64 to VR128.
 380
 381 //===---------------------------------------------------------------------===//
 382
 383 mov $reg, 48(%esp)
 384 ...
 385 leal 48(%esp), %eax
 386 mov %eax, (%esp)
 387 call _foo
 388
 389 Obviously it would have been better for the first mov (or any op) to store
 390 directly %esp[0] if there are no other uses.
 391
 392 //===---------------------------------------------------------------------===//
 393
 394 Adding to the list of cmp / test poor codegen issues:
 395
 396 int test(__m128 *A, __m128 *B) {
 397   if (_mm_comige_ss(*A, *B))
 398     return 3;
 399   else
 400     return 4;
 401 }
 402
 403 _test:
 404         movl 8(%esp), %eax
 405         movaps (%eax), %xmm0
 406         movl 4(%esp), %eax
 407         movaps (%eax), %xmm1
 408         comiss %xmm0, %xmm1
 409         setae %al
 410         movzbl %al, %ecx
 411         movl $3, %eax
 412         movl $4, %edx
 413         cmpl $0, %ecx
 414         cmove %edx, %eax
 415         ret
 416
 417 Note the setae, movzbl, cmpl, cmove can be replaced with a single cmovae. There
 418 are a number of issues. 1) We are introducing a setcc between the result of the
 419 intrisic call and select. 2) The intrinsic is expected to produce a i32 value
 420 so a any extend (which becomes a zero extend) is added.
 421
 422 We probably need some kind of target DAG combine hook to fix this.
 423
 424 //===---------------------------------------------------------------------===//
 425
 426 We generate significantly worse code for this than GCC:
 427 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=21150
 428 http://gcc.gnu.org/bugzilla/attachment.cgi?id=8701
 429
 430 There is also one case we do worse on PPC.
 431
 432 //===---------------------------------------------------------------------===//
 433
 434 If shorter, we should use things like:
 435 movzwl %ax, %eax
 436 instead of:
 437 andl $65535, %EAX
 438
 439 The former can also be used when the two-addressy nature of the 'and' would
 440 require a copy to be inserted (in X86InstrInfo::convertToThreeAddress).
 441
 442 //===---------------------------------------------------------------------===//
 443
 444 Consider this:
 445
 446 typedef struct pair { float A, B; } pair;
 447 void pairtest(pair P, float *FP) {
 448         *FP = P.A+P.B;
 449 }
 450
 451 We currently generate this code with llvmgcc4:
 452
 453 _pairtest:
 454         movl 8(%esp), %eax
 455         movl 4(%esp), %ecx
 456         movd %eax, %xmm0
 457         movd %ecx, %xmm1
 458         addss %xmm0, %xmm1
 459         movl 12(%esp), %eax
 460         movss %xmm1, (%eax)
 461         ret
 462
 463 we should be able to generate:
 464 _pairtest:
 465         movss 4(%esp), %xmm0
 466         movl 12(%esp), %eax
 467         addss 8(%esp), %xmm0
 468         movss %xmm0, (%eax)
 469         ret
 470
 471 The issue is that llvmgcc4 is forcing the struct to memory, then passing it as
 472 integer chunks.  It does this so that structs like {short,short} are passed in
 473 a single 32-bit integer stack slot.  We should handle the safe cases above much
 474 nicer, while still handling the hard cases.
 475
 476 While true in general, in this specific case we could do better by promoting
 477 load int + bitcast to float -> load fload.  This basically needs alignment info,
 478 the code is already implemented (but disabled) in dag combine).
 479
 480 //===---------------------------------------------------------------------===//
 481
 482 Another instruction selector deficiency:
 483
 484 void %bar() {
 485         %tmp = load int (int)** %foo
 486         %tmp = tail call int %tmp( int 3 )
 487         ret void
 488 }
 489
 490 _bar:
 491         subl $12, %esp
 492         movl L_foo$non_lazy_ptr, %eax
 493         movl (%eax), %eax
 494         call *%eax
 495         addl $12, %esp
 496         ret
 497
 498 The current isel scheme will not allow the load to be folded in the call since
 499 the load's chain result is read by the callseq_start.
 500
 501 //===---------------------------------------------------------------------===//
 502
 503 For this:
 504
 505 int test(int a)
 506 {
 507   return a * 3;
 508 }
 509
 510 We currently emits
 511         imull $3, 4(%esp), %eax
 512
 513 Perhaps this is what we really should generate is? Is imull three or four
 514 cycles? Note: ICC generates this:
 515         movl    4(%esp), %eax
 516         leal    (%eax,%eax,2), %eax
 517
 518 The current instruction priority is based on pattern complexity. The former is
 519 more "complex" because it folds a load so the latter will not be emitted.
 520
 521 Perhaps we should use AddedComplexity to give LEA32r a higher priority? We
 522 should always try to match LEA first since the LEA matching code does some
 523 estimate to determine whether the match is profitable.
 524
 525 However, if we care more about code size, then imull is better. It's two bytes
 526 shorter than movl + leal.
 527
 528 //===---------------------------------------------------------------------===//
 529
 530 Implement CTTZ, CTLZ with bsf and bsr. GCC produces:
 531
 532 int ctz_(unsigned X) { return __builtin_ctz(X); }
 533 int clz_(unsigned X) { return __builtin_clz(X); }
 534 int ffs_(unsigned X) { return __builtin_ffs(X); }
 535
 536 _ctz_:
 537         bsfl    4(%esp), %eax
 538         ret
 539 _clz_:
 540         bsrl    4(%esp), %eax
 541         xorl    $31, %eax
 542         ret
 543 _ffs_:
 544         movl    $-1, %edx
 545         bsfl    4(%esp), %eax
 546         cmove   %edx, %eax
 547         addl    $1, %eax
 548         ret
 549
 550 //===---------------------------------------------------------------------===//
 551
 552 It appears gcc place string data with linkonce linkage in
 553 .section __TEXT,__const_coal,coalesced instead of
 554 .section __DATA,__const_coal,coalesced.
 555 Take a look at darwin.h, there are other Darwin assembler directives that we
 556 do not make use of.
 557
 558 //===---------------------------------------------------------------------===//
 559
 560 int %foo(int* %a, int %t) {
 561 entry:
 562         br label %cond_true
 563
 564 cond_true:              ; preds = %cond_true, %entry
 565         %x.0.0 = phi int [ 0, %entry ], [ %tmp9, %cond_true ]
 566         %t_addr.0.0 = phi int [ %t, %entry ], [ %tmp7, %cond_true ]
 567         %tmp2 = getelementptr int* %a, int %x.0.0
 568         %tmp3 = load int* %tmp2         ; <int> [#uses=1]
 569         %tmp5 = add int %t_addr.0.0, %x.0.0             ; <int> [#uses=1]
 570         %tmp7 = add int %tmp5, %tmp3            ; <int> [#uses=2]
 571         %tmp9 = add int %x.0.0, 1               ; <int> [#uses=2]
 572         %tmp = setgt int %tmp9, 39              ; <bool> [#uses=1]
 573         br bool %tmp, label %bb12, label %cond_true
 574
 575 bb12:           ; preds = %cond_true
 576         ret int %tmp7
 577 }
 578
 579 is pessimized by -loop-reduce and -indvars
 580
 581 //===---------------------------------------------------------------------===//
 582
 583 u32 to float conversion improvement:
 584
 585 float uint32_2_float( unsigned u ) {
 586   float fl = (int) (u & 0xffff);
 587   float fh = (int) (u >> 16);
 588   fh *= 0x1.0p16f;
 589   return fh + fl;
 590 }
 591
 592 00000000        subl    $0x04,%esp
 593 00000003        movl    0x08(%esp,1),%eax
 594 00000007        movl    %eax,%ecx
 595 00000009        shrl    $0x10,%ecx
 596 0000000c        cvtsi2ss        %ecx,%xmm0
 597 00000010        andl    $0x0000ffff,%eax
 598 00000015        cvtsi2ss        %eax,%xmm1
 599 00000019        mulss   0x00000078,%xmm0
 600 00000021        addss   %xmm1,%xmm0
 601 00000025        movss   %xmm0,(%esp,1)
 602 0000002a        flds    (%esp,1)
 603 0000002d        addl    $0x04,%esp
 604 00000030        ret
 605
 606 //===---------------------------------------------------------------------===//
 607
 608 When using fastcc abi, align stack slot of argument of type double on 8 byte
 609 boundary to improve performance.
 610
 611 //===---------------------------------------------------------------------===//
 612
 613 Codegen:
 614
 615 int f(int a, int b) {
 616   if (a == 4 || a == 6)
 617     b++;
 618   return b;
 619 }
 620
 621
 622 as:
 623
 624 or eax, 2
 625 cmp eax, 6
 626 jz label
 627
 628 //===---------------------------------------------------------------------===//
 629
 630 GCC's ix86_expand_int_movcc function (in i386.c) has a ton of interesting
 631 simplifications for integer "x cmp y ? a : b".  For example, instead of:
 632
 633 int G;
 634 void f(int X, int Y) {
 635   G = X < 0 ? 14 : 13;
 636 }
 637
 638 compiling to:
 639
 640 _f:
 641         movl $14, %eax
 642         movl $13, %ecx
 643         movl 4(%esp), %edx
 644         testl %edx, %edx
 645         cmovl %eax, %ecx
 646         movl %ecx, _G
 647         ret
 648
 649 it could be:
 650 _f:
 651         movl    4(%esp), %eax
 652         sarl    $31, %eax
 653         notl    %eax
 654         addl    $14, %eax
 655         movl    %eax, _G
 656         ret
 657
 658 etc.
 659
 660 //===---------------------------------------------------------------------===//
 661
 662 Currently we don't have elimination of redundant stack manipulations. Consider
 663 the code:
 664
 665 int %main() {
 666 entry:
 667         call fastcc void %test1( )
 668         call fastcc void %test2( sbyte* cast (void ()* %test1 to sbyte*) )
 669         ret int 0
 670 }
 671
 672 declare fastcc void %test1()
 673
 674 declare fastcc void %test2(sbyte*)
 675
 676
 677 This currently compiles to:
 678
 679         subl $16, %esp
 680         call _test5
 681         addl $12, %esp
 682         subl $16, %esp
 683         movl $_test5, (%esp)
 684         call _test6
 685         addl $12, %esp
 686
 687 The add\sub pair is really unneeded here.
 688
 689 //===---------------------------------------------------------------------===//
 690
 691 We currently compile sign_extend_inreg into two shifts:
 692
 693 long foo(long X) {
 694   return (long)(signed char)X;
 695 }
 696
 697 becomes:
 698
 699 _foo:
 700         movl 4(%esp), %eax
 701         shll $24, %eax
 702         sarl $24, %eax
 703         ret
 704
 705 This could be:
 706
 707 _foo:
 708         movsbl  4(%esp),%eax
 709         ret
 710
 711 //===---------------------------------------------------------------------===//
 712
 713 Consider the expansion of:
 714
 715 uint %test3(uint %X) {
 716         %tmp1 = rem uint %X, 255
 717         ret uint %tmp1
 718 }
 719
 720 Currently it compiles to:
 721
 722 ...
 723         movl $2155905153, %ecx
 724         movl 8(%esp), %esi
 725         movl %esi, %eax
 726         mull %ecx
 727 ...
 728
 729 This could be "reassociated" into:
 730
 731         movl $2155905153, %eax
 732         movl 8(%esp), %ecx
 733         mull %ecx
 734
 735 to avoid the copy.  In fact, the existing two-address stuff would do this
 736 except that mul isn't a commutative 2-addr instruction.  I guess this has
 737 to be done at isel time based on the #uses to mul?
 738
 739 //===---------------------------------------------------------------------===//
 740
 741 Make sure the instruction which starts a loop does not cross a cacheline
 742 boundary. This requires knowning the exact length of each machine instruction.
 743 That is somewhat complicated, but doable. Example 256.bzip2:
 744
 745 In the new trace, the hot loop has an instruction which crosses a cacheline
 746 boundary.  In addition to potential cache misses, this can't help decoding as I
 747 imagine there has to be some kind of complicated decoder reset and realignment
 748 to grab the bytes from the next cacheline.
 749
 750 532  532 0x3cfc movb     (1809(%esp, %esi), %bl   <<<--- spans 2 64 byte lines
 751 942  942 0x3d03 movl     %dh, (1809(%esp, %esi)
 752 937  937 0x3d0a incl     %esi
 753 3    3   0x3d0b cmpb     %bl, %dl
 754 27   27  0x3d0d jnz      0x000062db <main+11707>
 755
 756 //===---------------------------------------------------------------------===//
 757
 758 In c99 mode, the preprocessor doesn't like assembly comments like #TRUNCATE.
 759
 760 //===---------------------------------------------------------------------===//
 761
 762 This could be a single 16-bit load.
 763
 764 int f(char *p) {
 765     if ((p[0] == 1) & (p[1] == 2)) return 1;
 766     return 0;
 767 }
 768
 769 //===---------------------------------------------------------------------===//
 770
 771 We should inline lrintf and probably other libc functions.
 772
 773 //===---------------------------------------------------------------------===//
 774
 775 Start using the flags more.  For example, compile:
 776
 777 int add_zf(int *x, int y, int a, int b) {
 778      if ((*x += y) == 0)
 779           return a;
 780      else
 781           return b;
 782 }
 783
 784 to:
 785        addl    %esi, (%rdi)
 786        movl    %edx, %eax
 787        cmovne  %ecx, %eax
 788        ret
 789 instead of:
 790
 791 _add_zf:
 792         addl (%rdi), %esi
 793         movl %esi, (%rdi)
 794         testl %esi, %esi
 795         cmove %edx, %ecx
 796         movl %ecx, %eax
 797         ret
 798
 799 and:
 800
 801 int add_zf(int *x, int y, int a, int b) {
 802      if ((*x + y) < 0)
 803           return a;
 804      else
 805           return b;
 806 }
 807
 808 to:
 809
 810 add_zf:
 811         addl    (%rdi), %esi
 812         movl    %edx, %eax
 813         cmovns  %ecx, %eax
 814         ret
 815
 816 instead of:
 817
 818 _add_zf:
 819         addl (%rdi), %esi
 820         testl %esi, %esi
 821         cmovs %edx, %ecx
 822         movl %ecx, %eax
 823         ret
 824
 825 //===---------------------------------------------------------------------===//
 826
 827 This:
 828 #include <math.h>
 829 int foo(double X) { return isnan(X); }
 830
 831 compiles to (-m64):
 832
 833 _foo:
 834         pxor %xmm1, %xmm1
 835         ucomisd %xmm1, %xmm0
 836         setp %al
 837         movzbl %al, %eax
 838         ret
 839
 840 the pxor is not needed, we could compare the value against itself.
 841
 842 //===---------------------------------------------------------------------===//
 843
 844 These two functions have identical effects:
 845
 846 unsigned int f(unsigned int i, unsigned int n) {++i; if (i == n) ++i; return i;}
 847 unsigned int f2(unsigned int i, unsigned int n) {++i; i += i == n; return i;}
 848
 849 We currently compile them to:
 850
 851 _f:
 852         movl 4(%esp), %eax
 853         movl %eax, %ecx
 854         incl %ecx
 855         movl 8(%esp), %edx
 856         cmpl %edx, %ecx
 857         jne LBB1_2      #UnifiedReturnBlock
 858 LBB1_1: #cond_true
 859         addl $2, %eax
 860         ret
 861 LBB1_2: #UnifiedReturnBlock
 862         movl %ecx, %eax
 863         ret
 864 _f2:
 865         movl 4(%esp), %eax
 866         movl %eax, %ecx
 867         incl %ecx
 868         cmpl 8(%esp), %ecx
 869         sete %cl
 870         movzbl %cl, %ecx
 871         leal 1(%ecx,%eax), %eax
 872         ret
 873
 874 both of which are inferior to GCC's:
 875
 876 _f:
 877         movl    4(%esp), %edx
 878         leal    1(%edx), %eax
 879         addl    $2, %edx
 880         cmpl    8(%esp), %eax
 881         cmove   %edx, %eax
 882         ret
 883 _f2:
 884         movl    4(%esp), %eax
 885         addl    $1, %eax
 886         xorl    %edx, %edx
 887         cmpl    8(%esp), %eax
 888         sete    %dl
 889         addl    %edx, %eax
 890         ret
 891
 892 //===---------------------------------------------------------------------===//
 893
 894 This code:
 895
 896 void test(int X) {
 897   if (X) abort();
 898 }
 899
 900 is currently compiled to:
 901
 902 _test:
 903         subl $12, %esp
 904         cmpl $0, 16(%esp)
 905         jne LBB1_1
 906         addl $12, %esp
 907         ret
 908 LBB1_1:
 909         call L_abort$stub
 910
 911 It would be better to produce:
 912
 913 _test:
 914         subl $12, %esp
 915         cmpl $0, 16(%esp)
 916         jne L_abort$stub
 917         addl $12, %esp
 918         ret
 919
 920 This can be applied to any no-return function call that takes no arguments etc.
 921 Alternatively, the stack save/restore logic could be shrink-wrapped, producing
 922 something like this:
 923
 924 _test:
 925         cmpl $0, 4(%esp)
 926         jne LBB1_1
 927         ret
 928 LBB1_1:
 929         subl $12, %esp
 930         call L_abort$stub
 931
 932 Both are useful in different situations.  Finally, it could be shrink-wrapped
 933 and tail called, like this:
 934
 935 _test:
 936         cmpl $0, 4(%esp)
 937         jne LBB1_1
 938         ret
 939 LBB1_1:
 940         pop %eax   # realign stack.
 941         call L_abort$stub
 942
 943 Though this probably isn't worth it.
 944
 945 //===---------------------------------------------------------------------===//
 946
 947 We need to teach the codegen to convert two-address INC instructions to LEA
 948 when the flags are dead (likewise dec).  For example, on X86-64, compile:
 949
 950 int foo(int A, int B) {
 951   return A+1;
 952 }
 953
 954 to:
 955
 956 _foo:
 957         leal    1(%edi), %eax
 958         ret
 959
 960 instead of:
 961
 962 _foo:
 963         incl %edi
 964         movl %edi, %eax
 965         ret
 966
 967 Another example is:
 968
 969 ;; X's live range extends beyond the shift, so the register allocator
 970 ;; cannot coalesce it with Y.  Because of this, a copy needs to be
 971 ;; emitted before the shift to save the register value before it is
 972 ;; clobbered.  However, this copy is not needed if the register
 973 ;; allocator turns the shift into an LEA.  This also occurs for ADD.
 974
 975 ; Check that the shift gets turned into an LEA.
 976 ; RUN: llvm-upgrade < %s | llvm-as | llc -march=x86 -x86-asm-syntax=intel | \
 977 ; RUN:   not grep {mov E.X, E.X}
 978
 979 %G = external global int
 980
 981 int %test1(int %X, int %Y) {
 982         %Z = add int %X, %Y
 983         volatile store int %Y, int* %G
 984         volatile store int %Z, int* %G
 985         ret int %X
 986 }
 987
 988 int %test2(int %X) {
 989         %Z = add int %X, 1  ;; inc
 990         volatile store int %Z, int* %G
 991         ret int %X
 992 }
 993
 994 //===---------------------------------------------------------------------===//
 995
 996 Sometimes it is better to codegen subtractions from a constant (e.g. 7-x) with
 997 a neg instead of a sub instruction.  Consider:
 998
 999 int test(char X) { return 7-X; }
1000
1001 we currently produce:
1002 _test:
1003         movl $7, %eax
1004         movsbl 4(%esp), %ecx
1005         subl %ecx, %eax
1006         ret
1007
1008 We would use one fewer register if codegen'd as:
1009
1010         movsbl 4(%esp), %eax
1011         neg %eax
1012         add $7, %eax
1013         ret
1014
1015 Note that this isn't beneficial if the load can be folded into the sub.  In
1016 this case, we want a sub:
1017
1018 int test(int X) { return 7-X; }
1019 _test:
1020         movl $7, %eax
1021         subl 4(%esp), %eax
1022         ret
1023
1024 //===---------------------------------------------------------------------===//
1025
1026 For code like:
1027 phi (undef, x)
1028
1029 We get an implicit def on the undef side. If the phi is spilled, we then get:
1030 implicitdef xmm1
1031 store xmm1 -> stack
1032
1033 It should be possible to teach the x86 backend to "fold" the store into the
1034 implicitdef, which just deletes the implicit def.
1035
1036 These instructions should go away:
1037 #IMPLICIT_DEF %xmm1
1038 movaps %xmm1, 192(%esp)
1039 movaps %xmm1, 224(%esp)
1040 movaps %xmm1, 176(%esp)
1041
1042 //===---------------------------------------------------------------------===//
1043
1044 This is a "commutable two-address" register coallescing deficiency:
1045
1046 define <4 x float> @test1(<4 x float> %V) {
1047 entry:
1048         %tmp8 = shufflevector <4 x float> %V, <4 x float> undef,
1049                                         <4 x i32> < i32 3, i32 2, i32 1, i32 0 >
1050         %add = add <4 x float> %tmp8, %V
1051         ret <4 x float> %add
1052 }
1053
1054 this codegens to:
1055
1056 _test1:
1057         pshufd  $27, %xmm0, %xmm1
1058         addps   %xmm0, %xmm1
1059         movaps  %xmm1, %xmm0
1060         ret
1061
1062 instead of:
1063
1064 _test1:
1065         pshufd  $27, %xmm0, %xmm1
1066         addps   %xmm1, %xmm0
1067         ret
1068
1069 //===---------------------------------------------------------------------===//
1070
1071 Leaf functions that require one 4-byte spill slot have a prolog like this:
1072
1073 _foo:
1074         pushl   %esi
1075         subl    $4, %esp
1076 ...
1077 and an epilog like this:
1078         addl    $4, %esp
1079         popl    %esi
1080         ret
1081
1082 It would be smaller, and potentially faster, to push eax on entry and to
1083 pop into a dummy register instead of using addl/subl of esp.  Just don't pop
1084 into any return registers :)
1085
1086 //===---------------------------------------------------------------------===//
1087
1088 The X86 backend should fold (branch (or (setcc, setcc))) into multiple
1089 branches.  We generate really poor code for:
1090
1091 double testf(double a) {
1092        return a == 0.0 ? 0.0 : (a > 0.0 ? 1.0 : -1.0);
1093 }
1094
1095 For example, the entry BB is:
1096
1097 _testf:
1098         subl    $20, %esp
1099         pxor    %xmm0, %xmm0
1100         movsd   24(%esp), %xmm1
1101         ucomisd %xmm0, %xmm1
1102         setnp   %al
1103         sete    %cl
1104         testb   %cl, %al
1105         jne     LBB1_5  # UnifiedReturnBlock
1106 LBB1_1: # cond_true
1107
1108
1109 it would be better to replace the last four instructions with:
1110
1111         jp LBB1_1
1112         je LBB1_5
1113 LBB1_1:
1114
1115 We also codegen the inner ?: into a diamond:
1116
1117        cvtss2sd        LCPI1_0(%rip), %xmm2
1118         cvtss2sd        LCPI1_1(%rip), %xmm3
1119         ucomisd %xmm1, %xmm0
1120         ja      LBB1_3  # cond_true
1121 LBB1_2: # cond_true
1122         movapd  %xmm3, %xmm2
1123 LBB1_3: # cond_true
1124         movapd  %xmm2, %xmm0
1125         ret
1126
1127 We should sink the load into xmm3 into the LBB1_2 block.  This should
1128 be pretty easy, and will nuke all the copies.
1129
1130 //===---------------------------------------------------------------------===//
1131
1132 This:
1133         #include <algorithm>
1134         inline std::pair<unsigned, bool> full_add(unsigned a, unsigned b)
1135         { return std::make_pair(a + b, a + b < a); }
1136         bool no_overflow(unsigned a, unsigned b)
1137         { return !full_add(a, b).second; }
1138
1139 Should compile to:
1140
1141
1142         _Z11no_overflowjj:
1143                 addl    %edi, %esi
1144                 setae   %al
1145                 ret
1146
1147 on x86-64, not:
1148
1149 __Z11no_overflowjj:
1150         addl    %edi, %esi
1151         cmpl    %edi, %esi
1152         setae   %al
1153         movzbl  %al, %eax
1154         ret
1155
1156
1157 //===---------------------------------------------------------------------===//
1158
1159 Re-materialize MOV32r0 etc. with xor instead of changing them to moves if the
1160 condition register is dead. xor reg reg is shorter than mov reg, #0.
1161
1162 //===---------------------------------------------------------------------===//
1163
1164 We aren't matching RMW instructions aggressively
1165 enough.  Here's a reduced testcase (more in PR1160):
1166
1167 define void @test(i32* %huge_ptr, i32* %target_ptr) {
1168         %A = load i32* %huge_ptr                ; <i32> [#uses=1]
1169         %B = load i32* %target_ptr              ; <i32> [#uses=1]
1170         %C = or i32 %A, %B              ; <i32> [#uses=1]
1171         store i32 %C, i32* %target_ptr
1172         ret void
1173 }
1174
1175 $ llvm-as < t.ll | llc -march=x86-64
1176
1177 _test:
1178         movl (%rdi), %eax
1179         orl (%rsi), %eax
1180         movl %eax, (%rsi)
1181         ret
1182
1183 That should be something like:
1184
1185 _test:
1186         movl (%rdi), %eax
1187         orl %eax, (%rsi)
1188         ret
1189
1190 //===---------------------------------------------------------------------===//
1191
1192 The following code:
1193
1194 bb114.preheader:                ; preds = %cond_next94
1195         %tmp231232 = sext i16 %tmp62 to i32             ; <i32> [#uses=1]
1196         %tmp233 = sub i32 32, %tmp231232                ; <i32> [#uses=1]
1197         %tmp245246 = sext i16 %tmp65 to i32             ; <i32> [#uses=1]
1198         %tmp252253 = sext i16 %tmp68 to i32             ; <i32> [#uses=1]
1199         %tmp254 = sub i32 32, %tmp252253                ; <i32> [#uses=1]
1200         %tmp553554 = bitcast i16* %tmp37 to i8*         ; <i8*> [#uses=2]
1201         %tmp583584 = sext i16 %tmp98 to i32             ; <i32> [#uses=1]
1202         %tmp585 = sub i32 32, %tmp583584                ; <i32> [#uses=1]
1203         %tmp614615 = sext i16 %tmp101 to i32            ; <i32> [#uses=1]
1204         %tmp621622 = sext i16 %tmp104 to i32            ; <i32> [#uses=1]
1205         %tmp623 = sub i32 32, %tmp621622                ; <i32> [#uses=1]
1206         br label %bb114
1207
1208 produces:
1209
1210 LBB3_5: # bb114.preheader
1211         movswl  -68(%ebp), %eax
1212         movl    $32, %ecx
1213         movl    %ecx, -80(%ebp)
1214         subl    %eax, -80(%ebp)
1215         movswl  -52(%ebp), %eax
1216         movl    %ecx, -84(%ebp)
1217         subl    %eax, -84(%ebp)
1218         movswl  -70(%ebp), %eax
1219         movl    %ecx, -88(%ebp)
1220         subl    %eax, -88(%ebp)
1221         movswl  -50(%ebp), %eax
1222         subl    %eax, %ecx
1223         movl    %ecx, -76(%ebp)
1224         movswl  -42(%ebp), %eax
1225         movl    %eax, -92(%ebp)
1226         movswl  -66(%ebp), %eax
1227         movl    %eax, -96(%ebp)
1228         movw    $0, -98(%ebp)
1229
1230 This appears to be bad because the RA is not folding the store to the stack
1231 slot into the movl.  The above instructions could be:
1232         movl    $32, -80(%ebp)
1233 ...
1234         movl    $32, -84(%ebp)
1235 ...
1236 This seems like a cross between remat and spill folding.
1237
1238 This has redundant subtractions of %eax from a stack slot. However, %ecx doesn't
1239 change, so we could simply subtract %eax from %ecx first and then use %ecx (or
1240 vice-versa).
1241
1242 //===---------------------------------------------------------------------===//
1243
1244 For this code:
1245
1246 cond_next603:           ; preds = %bb493, %cond_true336, %cond_next599
1247         %v.21050.1 = phi i32 [ %v.21050.0, %cond_next599 ], [ %tmp344, %cond_true336 ], [ %v.2, %bb493 ]                ; <i32> [#uses=1]
1248         %maxz.21051.1 = phi i32 [ %maxz.21051.0, %cond_next599 ], [ 0, %cond_true336 ], [ %maxz.2, %bb493 ]             ; <i32> [#uses=2]
1249         %cnt.01055.1 = phi i32 [ %cnt.01055.0, %cond_next599 ], [ 0, %cond_true336 ], [ %cnt.0, %bb493 ]                ; <i32> [#uses=2]
1250         %byteptr.9 = phi i8* [ %byteptr.12, %cond_next599 ], [ %byteptr.0, %cond_true336 ], [ %byteptr.10, %bb493 ]             ; <i8*> [#uses=9]
1251         %bitptr.6 = phi i32 [ %tmp5571104.1, %cond_next599 ], [ %tmp4921049, %cond_true336 ], [ %bitptr.7, %bb493 ]             ; <i32> [#uses=4]
1252         %source.5 = phi i32 [ %tmp602, %cond_next599 ], [ %source.0, %cond_true336 ], [ %source.6, %bb493 ]             ; <i32> [#uses=7]
1253         %tmp606 = getelementptr %struct.const_tables* @tables, i32 0, i32 0, i32 %cnt.01055.1           ; <i8*> [#uses=1]
1254         %tmp607 = load i8* %tmp606, align 1             ; <i8> [#uses=1]
1255
1256 We produce this:
1257
1258 LBB4_70:        # cond_next603
1259         movl    -20(%ebp), %esi
1260         movl    L_tables$non_lazy_ptr-"L4$pb"(%esi), %esi
1261
1262 However, ICC caches this information before the loop and produces this:
1263
1264         movl      88(%esp), %eax                                #481.12
1265
1266 //===---------------------------------------------------------------------===//
1267
1268 This code:
1269
1270         %tmp659 = icmp slt i16 %tmp654, 0               ; <i1> [#uses=1]
1271         br i1 %tmp659, label %cond_true662, label %cond_next715
1272
1273 produces this:
1274
1275         testw   %cx, %cx
1276         movswl  %cx, %esi
1277         jns     LBB4_109        # cond_next715
1278
1279 Shark tells us that using %cx in the testw instruction is sub-optimal. It
1280 suggests using the 32-bit register (which is what ICC uses).
1281
1282 //===---------------------------------------------------------------------===//
1283
1284 rdar://5506677 - We compile this:
1285
1286 define i32 @foo(double %x) {
1287         %x14 = bitcast double %x to i64         ; <i64> [#uses=1]
1288         %tmp713 = trunc i64 %x14 to i32         ; <i32> [#uses=1]
1289         %tmp8 = and i32 %tmp713, 2147483647             ; <i32> [#uses=1]
1290         ret i32 %tmp8
1291 }
1292
1293 to:
1294
1295 _foo:
1296         subl    $12, %esp
1297         fldl    16(%esp)
1298         fstpl   (%esp)
1299         movl    $2147483647, %eax
1300         andl    (%esp), %eax
1301         addl    $12, %esp
1302         #FP_REG_KILL
1303         ret
1304
1305 It would be much better to eliminate the fldl/fstpl by folding the bitcast
1306 into the load SDNode.  That would give us:
1307
1308 _foo:
1309         movl    $2147483647, %eax
1310         andl    4(%esp), %eax
1311         ret
1312
1313 //===---------------------------------------------------------------------===//
1314
1315 We compile this:
1316
1317 void compare (long long foo) {
1318   if (foo < 4294967297LL)
1319     abort();
1320 }
1321
1322 to:
1323
1324 _compare:
1325         subl    $12, %esp
1326         cmpl    $0, 16(%esp)
1327         setne   %al
1328         movzbw  %al, %ax
1329         cmpl    $1, 20(%esp)
1330         setg    %cl
1331         movzbw  %cl, %cx
1332         cmove   %ax, %cx
1333         movw    %cx, %ax
1334         testb   $1, %al
1335         je      LBB1_2  # cond_true
1336
1337 (also really horrible code on ppc).  This is due to the expand code for 64-bit
1338 compares.  GCC produces multiple branches, which is much nicer:
1339
1340 _compare:
1341         pushl   %ebp
1342         movl    %esp, %ebp
1343         subl    $8, %esp
1344         movl    8(%ebp), %eax
1345         movl    12(%ebp), %edx
1346         subl    $1, %edx
1347         jg     L5
1348 L7:
1349         jl      L4
1350         cmpl    $0, %eax
1351         jbe      L4
1352 L5:
1353
1354 //===---------------------------------------------------------------------===//
1355
1356 Tail call optimization improvements: Tail call optimization currently
1357 pushes all arguments on the top of the stack (their normal place for
1358 non-tail call optimized calls) before moving them to actual stack
1359 slot. This is done to prevent overwriting of parameters (see example
1360 below) that might be used, since the arguments of the callee
1361 overwrites caller's arguments.
1362
1363 example:
1364
1365 int callee(int32, int64);
1366 int caller(int32 arg1, int32 arg2) {
1367   int64 local = arg2 * 2;
1368   return callee(arg2, (int64)local);
1369 }
1370
1371 [arg1]          [!arg2 no longer valid since we moved local onto it]
1372 [arg2]      ->  [(int64)
1373 [RETADDR]        local  ]
1374
1375 Moving arg1 onto the stack slot of callee function would overwrite
1376 arg2 of the caller.
1377
1378 Possible optimizations:
1379
1380  - Only push those arguments to the top of the stack that are actual
1381    parameters of the caller function and have no local value in the
1382    caller.
1383
1384    In the above example local does not need to be pushed onto the top
1385    of the stack as it is definitely not a caller's function
1386    parameter.
1387
1388  - Analyse the actual parameters of the callee to see which would
1389    overwrite a caller parameter which is used by the callee and only
1390    push them onto the top of the stack.
1391
1392    int callee (int32 arg1, int32 arg2);
1393    int caller (int32 arg1, int32 arg2) {
1394        return callee(arg1,arg2);
1395    }
1396
1397    Here we don't need to write any variables to the top of the stack
1398    since they don't overwrite each other.
1399
1400    int callee (int32 arg1, int32 arg2);
1401    int caller (int32 arg1, int32 arg2) {
1402        return callee(arg2,arg1);
1403    }
1404
1405    Here we need to push the arguments because they overwrite each
1406    other.
1407
1408
1409    Code for lowering directly onto callers arguments:
1410 +  SmallVector<std::pair<unsigned, SDOperand>, 8> RegsToPass;
1411 +  SmallVector<SDOperand, 8> MemOpChains;
1412 +
1413 +  SDOperand FramePtr;
1414 +  SDOperand PtrOff;
1415 +  SDOperand FIN;
1416 +  int FI = 0;
1417 +  // Walk the register/memloc assignments, inserting copies/loads.
1418 +  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
1419 +    CCValAssign &VA = ArgLocs[i];
1420 +    SDOperand Arg = Op.getOperand(5+2*VA.getValNo());
1421 +
1422 +    ....
1423 +
1424 +    if (VA.isRegLoc()) {
1425 +      RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
1426 +    } else {
1427 +      assert(VA.isMemLoc());
1428 +      // create frame index
1429 +      int32_t Offset = VA.getLocMemOffset()+FPDiff;
1430 +      uint32_t OpSize = (MVT::getSizeInBits(VA.getLocVT())+7)/8;
1431 +      FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset);
1432 +      FIN = DAG.getFrameIndex(FI, MVT::i32);
1433 +      // store relative to framepointer
1434 +      MemOpChains.push_back(DAG.getStore(Chain, Arg, FIN, NULL, 0));
1435 +    }
1436 +  }
1437 //===---------------------------------------------------------------------===//