lib/Target/X86/README.txt

   1 //===---------------------------------------------------------------------===//
   2 // Random ideas for the X86 backend.
   3 //===---------------------------------------------------------------------===//
   4
   5 Missing features:
   6   - Support for SSE4: http://www.intel.com/software/penryn
   7 http://softwarecommunity.intel.com/isn/Downloads/Intel%20SSE4%20Programming%20Reference.pdf
   8   - support for 3DNow!
   9   - weird abis?
  10
  11 //===---------------------------------------------------------------------===//
  12
   13 Add MUL2U and MUL2S nodes to represent a multiply that returns both the
  14 Hi and Lo parts (combination of MUL and MULH[SU] into one node).  Add this to
  15 X86, & make the dag combiner produce it when needed.  This will eliminate one
  16 imul from the code generated for:
  17
  18 long long test(long long X, long long Y) { return X*Y; }
  19
  20 by using the EAX result from the mul.  We should add a similar node for
  21 DIVREM.
  22
  23 another case is:
  24
  25 long long test(int X, int Y) { return (long long)X*Y; }
  26
  27 ... which should only be one imul instruction.
  28
  29 or:
  30
  31 unsigned long long int t2(unsigned int a, unsigned int b) {
  32        return (unsigned long long)a * b;
  33 }
  34
  35 ... which should be one mul instruction.
  36
  37
  38 This can be done with a custom expander, but it would be nice to move this to
  39 generic code.
  40
  41 //===---------------------------------------------------------------------===//
  42
  43 CodeGen/X86/lea-3.ll:test3 should be a single LEA, not a shift/move.  The X86
  44 backend knows how to three-addressify this shift, but it appears the register
   45 allocator isn't even asking it to do so in this case.  We should investigate
   46 why this isn't happening; it could have a significant impact on other important
  47 cases for X86 as well.
  48
  49 //===---------------------------------------------------------------------===//
  50
  51 This should be one DIV/IDIV instruction, not a libcall:
  52
  53 unsigned test(unsigned long long X, unsigned Y) {
  54         return X/Y;
  55 }
  56
  57 This can be done trivially with a custom legalizer.  What about overflow
  58 though?  http://gcc.gnu.org/bugzilla/show_bug.cgi?id=14224
  59
  60 //===---------------------------------------------------------------------===//
  61
  62 Improvements to the multiply -> shift/add algorithm:
  63 http://gcc.gnu.org/ml/gcc-patches/2004-08/msg01590.html
  64
  65 //===---------------------------------------------------------------------===//
  66
  67 Improve code like this (occurs fairly frequently, e.g. in LLVM):
  68 long long foo(int x) { return 1LL << x; }
  69
  70 http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01109.html
  71 http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01128.html
  72 http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01136.html
  73
  74 Another useful one would be  ~0ULL >> X and ~0ULL << X.
  75
  76 One better solution for 1LL << x is:
  77         xorl    %eax, %eax
  78         xorl    %edx, %edx
  79         testb   $32, %cl
  80         sete    %al
  81         setne   %dl
  82         sall    %cl, %eax
  83         sall    %cl, %edx
  84
  85 But that requires good 8-bit subreg support.
  86
  87 64-bit shifts (in general) expand to really bad code.  Instead of using
  88 cmovs, we should expand to a conditional branch like GCC produces.
  89
  90 //===---------------------------------------------------------------------===//
  91
  92 Compile this:
  93 _Bool f(_Bool a) { return a!=1; }
  94
  95 into:
  96         movzbl  %dil, %eax
  97         xorl    $1, %eax
  98         ret
  99
 100 //===---------------------------------------------------------------------===//
 101
 102 Some isel ideas:
 103
  104 1. Dynamic programming based approach when compile time is not an
 105    issue.
 106 2. Code duplication (addressing mode) during isel.
 107 3. Other ideas from "Register-Sensitive Selection, Duplication, and
 108    Sequencing of Instructions".
 109 4. Scheduling for reduced register pressure.  E.g. "Minimum Register
 110    Instruction Sequence Problem: Revisiting Optimal Code Generation for DAGs"
 111    and other related papers.
 112    http://citeseer.ist.psu.edu/govindarajan01minimum.html
 113
 114 //===---------------------------------------------------------------------===//
 115
 116 Should we promote i16 to i32 to avoid partial register update stalls?
 117
 118 //===---------------------------------------------------------------------===//
 119
 120 Leave any_extend as pseudo instruction and hint to register
 121 allocator. Delay codegen until post register allocation.
 122
 123 //===---------------------------------------------------------------------===//
 124
 125 Count leading zeros and count trailing zeros:
 126
 127 int clz(int X) { return __builtin_clz(X); }
 128 int ctz(int X) { return __builtin_ctz(X); }
 129
 130 $ gcc t.c -S -o - -O3  -fomit-frame-pointer -masm=intel
 131 clz:
 132         bsr     %eax, DWORD PTR [%esp+4]
 133         xor     %eax, 31
 134         ret
 135 ctz:
 136         bsf     %eax, DWORD PTR [%esp+4]
 137         ret
 138
 139 however, check that these are defined for 0 and 32.  Our intrinsics are, GCC's
 140 aren't.
 141
 142 Another example (use predsimplify to eliminate a select):
 143
 144 int foo (unsigned long j) {
 145   if (j)
 146     return __builtin_ffs (j) - 1;
 147   else
 148     return 0;
 149 }
 150
 151 //===---------------------------------------------------------------------===//
 152
  153 It appears icc uses push for parameter passing. Need to investigate.
 154
 155 //===---------------------------------------------------------------------===//
 156
 157 Only use inc/neg/not instructions on processors where they are faster than
 158 add/sub/xor.  They are slower on the P4 due to only updating some processor
 159 flags.
 160
 161 //===---------------------------------------------------------------------===//
 162
 163 The instruction selector sometimes misses folding a load into a compare.  The
 164 pattern is written as (cmp reg, (load p)).  Because the compare isn't
 165 commutative, it is not matched with the load on both sides.  The dag combiner
  166 should be made smart enough to canonicalize the load into the RHS of a compare
 167 when it can invert the result of the compare for free.
 168
 169 //===---------------------------------------------------------------------===//
 170
 171 How about intrinsics? An example is:
 172   *res = _mm_mulhi_epu16(*A, _mm_mul_epu32(*B, *C));
 173
 174 compiles to
 175         pmuludq (%eax), %xmm0
 176         movl 8(%esp), %eax
 177         movdqa (%eax), %xmm1
 178         pmulhuw %xmm0, %xmm1
 179
 180 The transformation probably requires a X86 specific pass or a DAG combiner
 181 target specific hook.
 182
 183 //===---------------------------------------------------------------------===//
 184
 185 In many cases, LLVM generates code like this:
 186
 187 _test:
 188         movl 8(%esp), %eax
 189         cmpl %eax, 4(%esp)
 190         setl %al
 191         movzbl %al, %eax
 192         ret
 193
 194 on some processors (which ones?), it is more efficient to do this:
 195
 196 _test:
 197         movl 8(%esp), %ebx
 198         xor  %eax, %eax
 199         cmpl %ebx, 4(%esp)
 200         setl %al
 201         ret
 202
 203 Doing this correctly is tricky though, as the xor clobbers the flags.
 204
 205 //===---------------------------------------------------------------------===//
 206
 207 We should generate bts/btr/etc instructions on targets where they are cheap or
 208 when codesize is important.  e.g., for:
 209
 210 void setbit(int *target, int bit) {
 211     *target |= (1 << bit);
 212 }
 213 void clearbit(int *target, int bit) {
 214     *target &= ~(1 << bit);
 215 }
 216
 217 //===---------------------------------------------------------------------===//
 218
 219 Instead of the following for memset char*, 1, 10:
 220
 221         movl $16843009, 4(%edx)
 222         movl $16843009, (%edx)
 223         movw $257, 8(%edx)
 224
 225 It might be better to generate
 226
 227         movl $16843009, %eax
 228         movl %eax, 4(%edx)
 229         movl %eax, (%edx)
  230         movw %ax, 8(%edx)
 231
 232 when we can spare a register. It reduces code size.
 233
 234 //===---------------------------------------------------------------------===//
 235
 236 Evaluate what the best way to codegen sdiv X, (2^C) is.  For X/8, we currently
 237 get this:
 238
 239 int %test1(int %X) {
 240         %Y = div int %X, 8
 241         ret int %Y
 242 }
 243
 244 _test1:
 245         movl 4(%esp), %eax
 246         movl %eax, %ecx
 247         sarl $31, %ecx
 248         shrl $29, %ecx
 249         addl %ecx, %eax
 250         sarl $3, %eax
 251         ret
 252
 253 GCC knows several different ways to codegen it, one of which is this:
 254
 255 _test1:
 256         movl    4(%esp), %eax
 257         cmpl    $-1, %eax
 258         leal    7(%eax), %ecx
 259         cmovle  %ecx, %eax
 260         sarl    $3, %eax
 261         ret
 262
 263 which is probably slower, but it's interesting at least :)
 264
 265 //===---------------------------------------------------------------------===//
 266
 267 The first BB of this code:
 268
 269 declare bool %foo()
 270 int %bar() {
 271         %V = call bool %foo()
 272         br bool %V, label %T, label %F
 273 T:
 274         ret int 1
 275 F:
 276         call bool %foo()
 277         ret int 12
 278 }
 279
 280 compiles to:
 281
 282 _bar:
 283         subl $12, %esp
 284         call L_foo$stub
 285         xorb $1, %al
 286         testb %al, %al
 287         jne LBB_bar_2   # F
 288
 289 It would be better to emit "cmp %al, 1" than a xor and test.
 290
 291 //===---------------------------------------------------------------------===//
 292
 293 Enable X86InstrInfo::convertToThreeAddress().
 294
 295 //===---------------------------------------------------------------------===//
 296
  297 We are currently lowering large (1MB+) memmove/memcpy to rep/stosl and rep/movsl.
 298 We should leave these as libcalls for everything over a much lower threshold,
 299 since libc is hand tuned for medium and large mem ops (avoiding RFO for large
 300 stores, TLB preheating, etc)
 301
 302 //===---------------------------------------------------------------------===//
 303
 304 Optimize this into something reasonable:
 305  x * copysign(1.0, y) * copysign(1.0, z)
 306
 307 //===---------------------------------------------------------------------===//
 308
 309 Optimize copysign(x, *y) to use an integer load from y.
 310
 311 //===---------------------------------------------------------------------===//
 312
 313 %X = weak global int 0
 314
 315 void %foo(int %N) {
 316         %N = cast int %N to uint
 317         %tmp.24 = setgt int %N, 0
 318         br bool %tmp.24, label %no_exit, label %return
 319
 320 no_exit:
 321         %indvar = phi uint [ 0, %entry ], [ %indvar.next, %no_exit ]
 322         %i.0.0 = cast uint %indvar to int
 323         volatile store int %i.0.0, int* %X
 324         %indvar.next = add uint %indvar, 1
 325         %exitcond = seteq uint %indvar.next, %N
 326         br bool %exitcond, label %return, label %no_exit
 327
 328 return:
 329         ret void
 330 }
 331
 332 compiles into:
 333
 334         .text
 335         .align  4
 336         .globl  _foo
 337 _foo:
 338         movl 4(%esp), %eax
 339         cmpl $1, %eax
 340         jl LBB_foo_4    # return
 341 LBB_foo_1:      # no_exit.preheader
 342         xorl %ecx, %ecx
 343 LBB_foo_2:      # no_exit
 344         movl L_X$non_lazy_ptr, %edx
 345         movl %ecx, (%edx)
 346         incl %ecx
 347         cmpl %eax, %ecx
 348         jne LBB_foo_2   # no_exit
 349 LBB_foo_3:      # return.loopexit
 350 LBB_foo_4:      # return
 351         ret
 352
  353 We should hoist "movl L_X$non_lazy_ptr, %edx" out of the loop after
  354 rematerialization is implemented. This can be accomplished with 1) a target
  355 dependent LICM pass or 2) making SelectionDAG represent the whole function.
 356
 357 //===---------------------------------------------------------------------===//
 358
 359 The following tests perform worse with LSR:
 360
  361 lambda, siod, optimizer-eval, ackermann, hash2, nestedloop, strcat, and Treesort.
 362
 363 //===---------------------------------------------------------------------===//
 364
 365 We are generating far worse code than gcc:
 366
 367 volatile short X, Y;
 368
 369 void foo(int N) {
 370   int i;
 371   for (i = 0; i < N; i++) { X = i; Y = i*4; }
 372 }
 373
 374 LBB1_1: #bb.preheader
 375         xorl %ecx, %ecx
 376         xorw %dx, %dx
 377 LBB1_2: #bb
 378         movl L_X$non_lazy_ptr, %esi
 379         movw %dx, (%esi)
 380         movw %dx, %si
 381         shlw $2, %si
 382         movl L_Y$non_lazy_ptr, %edi
 383         movw %si, (%edi)
 384         incl %ecx
 385         incw %dx
 386         cmpl %eax, %ecx
 387         jne LBB1_2      #bb
 388
 389 vs.
 390
 391         xorl    %edx, %edx
 392         movl    L_X$non_lazy_ptr-"L00000000001$pb"(%ebx), %esi
 393         movl    L_Y$non_lazy_ptr-"L00000000001$pb"(%ebx), %ecx
 394 L4:
 395         movw    %dx, (%esi)
 396         leal    0(,%edx,4), %eax
 397         movw    %ax, (%ecx)
 398         addl    $1, %edx
 399         cmpl    %edx, %edi
 400         jne     L4
 401
  402 There are 2 issues:
 403
 404 1. Lack of post regalloc LICM.
  405 2. LSR is unable to reuse the IV for a different type (i16 vs. i32) even though
 406    the cast would be free.
 407
 408 //===---------------------------------------------------------------------===//
 409
 410 Teach the coalescer to coalesce vregs of different register classes. e.g. FR32 /
 411 FR64 to VR128.
 412
 413 //===---------------------------------------------------------------------===//
 414
 415 mov $reg, 48(%esp)
 416 ...
 417 leal 48(%esp), %eax
 418 mov %eax, (%esp)
 419 call _foo
 420
 421 Obviously it would have been better for the first mov (or any op) to store
 422 directly %esp[0] if there are no other uses.
 423
 424 //===---------------------------------------------------------------------===//
 425
 426 Adding to the list of cmp / test poor codegen issues:
 427
 428 int test(__m128 *A, __m128 *B) {
 429   if (_mm_comige_ss(*A, *B))
 430     return 3;
 431   else
 432     return 4;
 433 }
 434
 435 _test:
 436         movl 8(%esp), %eax
 437         movaps (%eax), %xmm0
 438         movl 4(%esp), %eax
 439         movaps (%eax), %xmm1
 440         comiss %xmm0, %xmm1
 441         setae %al
 442         movzbl %al, %ecx
 443         movl $3, %eax
 444         movl $4, %edx
 445         cmpl $0, %ecx
 446         cmove %edx, %eax
 447         ret
 448
  449 Note the setae, movzbl, cmpl, cmove can be replaced with a single cmovae. There
  450 are a number of issues. 1) We are introducing a setcc between the result of the
  451 intrinsic call and select. 2) The intrinsic is expected to produce an i32 value
  452 so an any extend (which becomes a zero extend) is added.
 453
 454 We probably need some kind of target DAG combine hook to fix this.
 455
 456 //===---------------------------------------------------------------------===//
 457
 458 We generate significantly worse code for this than GCC:
 459 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=21150
 460 http://gcc.gnu.org/bugzilla/attachment.cgi?id=8701
 461
 462 There is also one case we do worse on PPC.
 463
 464 //===---------------------------------------------------------------------===//
 465
 466 If shorter, we should use things like:
 467 movzwl %ax, %eax
 468 instead of:
  469 andl $65535, %eax
 470
 471 The former can also be used when the two-addressy nature of the 'and' would
 472 require a copy to be inserted (in X86InstrInfo::convertToThreeAddress).
 473
 474 //===---------------------------------------------------------------------===//
 475
 476 Consider this:
 477
 478 typedef struct pair { float A, B; } pair;
 479 void pairtest(pair P, float *FP) {
 480         *FP = P.A+P.B;
 481 }
 482
 483 We currently generate this code with llvmgcc4:
 484
 485 _pairtest:
 486         movl 8(%esp), %eax
 487         movl 4(%esp), %ecx
 488         movd %eax, %xmm0
 489         movd %ecx, %xmm1
 490         addss %xmm0, %xmm1
 491         movl 12(%esp), %eax
 492         movss %xmm1, (%eax)
 493         ret
 494
 495 we should be able to generate:
 496 _pairtest:
 497         movss 4(%esp), %xmm0
 498         movl 12(%esp), %eax
 499         addss 8(%esp), %xmm0
 500         movss %xmm0, (%eax)
 501         ret
 502
 503 The issue is that llvmgcc4 is forcing the struct to memory, then passing it as
 504 integer chunks.  It does this so that structs like {short,short} are passed in
 505 a single 32-bit integer stack slot.  We should handle the safe cases above much
 506 nicer, while still handling the hard cases.
 507
  508 While true in general, in this specific case we could do better by promoting
  509 load int + bitcast to float -> load float.  This basically needs alignment info;
  510 the code is already implemented (but disabled) in dag combine.
 511
 512 //===---------------------------------------------------------------------===//
 513
 514 Another instruction selector deficiency:
 515
 516 void %bar() {
 517         %tmp = load int (int)** %foo
 518         %tmp = tail call int %tmp( int 3 )
 519         ret void
 520 }
 521
 522 _bar:
 523         subl $12, %esp
 524         movl L_foo$non_lazy_ptr, %eax
 525         movl (%eax), %eax
 526         call *%eax
 527         addl $12, %esp
 528         ret
 529
 530 The current isel scheme will not allow the load to be folded in the call since
 531 the load's chain result is read by the callseq_start.
 532
 533 //===---------------------------------------------------------------------===//
 534
 535 For this:
 536
 537 int test(int a)
 538 {
 539   return a * 3;
 540 }
 541
  542 We currently emit
 543         imull $3, 4(%esp), %eax
 544
  545 Perhaps this is what we really should generate? Is imull three or four
 546 cycles? Note: ICC generates this:
 547         movl    4(%esp), %eax
 548         leal    (%eax,%eax,2), %eax
 549
 550 The current instruction priority is based on pattern complexity. The former is
 551 more "complex" because it folds a load so the latter will not be emitted.
 552
 553 Perhaps we should use AddedComplexity to give LEA32r a higher priority? We
 554 should always try to match LEA first since the LEA matching code does some
 555 estimate to determine whether the match is profitable.
 556
 557 However, if we care more about code size, then imull is better. It's two bytes
 558 shorter than movl + leal.
 559
 560 //===---------------------------------------------------------------------===//
 561
 562 Implement CTTZ, CTLZ with bsf and bsr.
 563
 564 //===---------------------------------------------------------------------===//
 565
  566 It appears gcc places string data with linkonce linkage in
 567 .section __TEXT,__const_coal,coalesced instead of
 568 .section __DATA,__const_coal,coalesced.
 569 Take a look at darwin.h, there are other Darwin assembler directives that we
 570 do not make use of.
 571
 572 //===---------------------------------------------------------------------===//
 573
 574 int %foo(int* %a, int %t) {
 575 entry:
 576         br label %cond_true
 577
 578 cond_true:              ; preds = %cond_true, %entry
 579         %x.0.0 = phi int [ 0, %entry ], [ %tmp9, %cond_true ]
 580         %t_addr.0.0 = phi int [ %t, %entry ], [ %tmp7, %cond_true ]
 581         %tmp2 = getelementptr int* %a, int %x.0.0
 582         %tmp3 = load int* %tmp2         ; <int> [#uses=1]
 583         %tmp5 = add int %t_addr.0.0, %x.0.0             ; <int> [#uses=1]
 584         %tmp7 = add int %tmp5, %tmp3            ; <int> [#uses=2]
 585         %tmp9 = add int %x.0.0, 1               ; <int> [#uses=2]
 586         %tmp = setgt int %tmp9, 39              ; <bool> [#uses=1]
 587         br bool %tmp, label %bb12, label %cond_true
 588
 589 bb12:           ; preds = %cond_true
 590         ret int %tmp7
 591 }
 592
 593 is pessimized by -loop-reduce and -indvars
 594
 595 //===---------------------------------------------------------------------===//
 596
 597 u32 to float conversion improvement:
 598
 599 float uint32_2_float( unsigned u ) {
 600   float fl = (int) (u & 0xffff);
 601   float fh = (int) (u >> 16);
 602   fh *= 0x1.0p16f;
 603   return fh + fl;
 604 }
 605
 606 00000000        subl    $0x04,%esp
 607 00000003        movl    0x08(%esp,1),%eax
 608 00000007        movl    %eax,%ecx
 609 00000009        shrl    $0x10,%ecx
 610 0000000c        cvtsi2ss        %ecx,%xmm0
 611 00000010        andl    $0x0000ffff,%eax
 612 00000015        cvtsi2ss        %eax,%xmm1
 613 00000019        mulss   0x00000078,%xmm0
 614 00000021        addss   %xmm1,%xmm0
 615 00000025        movss   %xmm0,(%esp,1)
 616 0000002a        flds    (%esp,1)
 617 0000002d        addl    $0x04,%esp
 618 00000030        ret
 619
 620 //===---------------------------------------------------------------------===//
 621
 622 When using fastcc abi, align stack slot of argument of type double on 8 byte
 623 boundary to improve performance.
 624
 625 //===---------------------------------------------------------------------===//
 626
 627 Codegen:
 628
 629 int f(int a, int b) {
 630   if (a == 4 || a == 6)
 631     b++;
 632   return b;
 633 }
 634
 635
 636 as:
 637
 638 or eax, 2
 639 cmp eax, 6
 640 jz label
 641
 642 //===---------------------------------------------------------------------===//
 643
 644 GCC's ix86_expand_int_movcc function (in i386.c) has a ton of interesting
 645 simplifications for integer "x cmp y ? a : b".  For example, instead of:
 646
 647 int G;
 648 void f(int X, int Y) {
 649   G = X < 0 ? 14 : 13;
 650 }
 651
 652 compiling to:
 653
 654 _f:
 655         movl $14, %eax
 656         movl $13, %ecx
 657         movl 4(%esp), %edx
 658         testl %edx, %edx
 659         cmovl %eax, %ecx
 660         movl %ecx, _G
 661         ret
 662
 663 it could be:
 664 _f:
 665         movl    4(%esp), %eax
 666         sarl    $31, %eax
 667         notl    %eax
 668         addl    $14, %eax
 669         movl    %eax, _G
 670         ret
 671
 672 etc.
 673
 674 //===---------------------------------------------------------------------===//
 675
 676 Currently we don't have elimination of redundant stack manipulations. Consider
 677 the code:
 678
 679 int %main() {
 680 entry:
 681         call fastcc void %test1( )
 682         call fastcc void %test2( sbyte* cast (void ()* %test1 to sbyte*) )
 683         ret int 0
 684 }
 685
 686 declare fastcc void %test1()
 687
 688 declare fastcc void %test2(sbyte*)
 689
 690
 691 This currently compiles to:
 692
 693         subl $16, %esp
  694         call _test1
  695         addl $12, %esp
  696         subl $16, %esp
  697         movl $_test1, (%esp)
  698         call _test2
 699         addl $12, %esp
 700
 701 The add\sub pair is really unneeded here.
 702
 703 //===---------------------------------------------------------------------===//
 704
 705 We currently compile sign_extend_inreg into two shifts:
 706
 707 long foo(long X) {
 708   return (long)(signed char)X;
 709 }
 710
 711 becomes:
 712
 713 _foo:
 714         movl 4(%esp), %eax
 715         shll $24, %eax
 716         sarl $24, %eax
 717         ret
 718
 719 This could be:
 720
 721 _foo:
 722         movsbl  4(%esp),%eax
 723         ret
 724
 725 //===---------------------------------------------------------------------===//
 726
 727 Consider the expansion of:
 728
 729 uint %test3(uint %X) {
 730         %tmp1 = rem uint %X, 255
 731         ret uint %tmp1
 732 }
 733
 734 Currently it compiles to:
 735
 736 ...
 737         movl $2155905153, %ecx
 738         movl 8(%esp), %esi
 739         movl %esi, %eax
 740         mull %ecx
 741 ...
 742
 743 This could be "reassociated" into:
 744
 745         movl $2155905153, %eax
 746         movl 8(%esp), %ecx
 747         mull %ecx
 748
 749 to avoid the copy.  In fact, the existing two-address stuff would do this
 750 except that mul isn't a commutative 2-addr instruction.  I guess this has
 751 to be done at isel time based on the #uses to mul?
 752
 753 //===---------------------------------------------------------------------===//
 754
 755 Make sure the instruction which starts a loop does not cross a cacheline
  756 boundary. This requires knowing the exact length of each machine instruction.
 757 That is somewhat complicated, but doable. Example 256.bzip2:
 758
 759 In the new trace, the hot loop has an instruction which crosses a cacheline
 760 boundary.  In addition to potential cache misses, this can't help decoding as I
 761 imagine there has to be some kind of complicated decoder reset and realignment
 762 to grab the bytes from the next cacheline.
 763
 764 532  532 0x3cfc movb     (1809(%esp, %esi), %bl   <<<--- spans 2 64 byte lines
 765 942  942 0x3d03 movl     %dh, (1809(%esp, %esi)
 766 937  937 0x3d0a incl     %esi
 767 3    3   0x3d0b cmpb     %bl, %dl
 768 27   27  0x3d0d jnz      0x000062db <main+11707>
 769
 770 //===---------------------------------------------------------------------===//
 771
 772 In c99 mode, the preprocessor doesn't like assembly comments like #TRUNCATE.
 773
 774 //===---------------------------------------------------------------------===//
 775
 776 This could be a single 16-bit load.
 777
 778 int f(char *p) {
 779     if ((p[0] == 1) & (p[1] == 2)) return 1;
 780     return 0;
 781 }
 782
 783 //===---------------------------------------------------------------------===//
 784
 785 We should inline lrintf and probably other libc functions.
 786
 787 //===---------------------------------------------------------------------===//
 788
 789 Start using the flags more.  For example, compile:
 790
 791 int add_zf(int *x, int y, int a, int b) {
 792      if ((*x += y) == 0)
 793           return a;
 794      else
 795           return b;
 796 }
 797
 798 to:
 799        addl    %esi, (%rdi)
 800        movl    %edx, %eax
 801        cmovne  %ecx, %eax
 802        ret
 803 instead of:
 804
 805 _add_zf:
 806         addl (%rdi), %esi
 807         movl %esi, (%rdi)
 808         testl %esi, %esi
 809         cmove %edx, %ecx
 810         movl %ecx, %eax
 811         ret
 812
 813 and:
 814
 815 int add_zf(int *x, int y, int a, int b) {
 816      if ((*x + y) < 0)
 817           return a;
 818      else
 819           return b;
 820 }
 821
 822 to:
 823
 824 add_zf:
 825         addl    (%rdi), %esi
 826         movl    %edx, %eax
 827         cmovns  %ecx, %eax
 828         ret
 829
 830 instead of:
 831
 832 _add_zf:
 833         addl (%rdi), %esi
 834         testl %esi, %esi
 835         cmovs %edx, %ecx
 836         movl %ecx, %eax
 837         ret
 838
 839 //===---------------------------------------------------------------------===//
 840
 841 This:
 842 #include <math.h>
 843 int foo(double X) { return isnan(X); }
 844
 845 compiles to (-m64):
 846
 847 _foo:
 848         pxor %xmm1, %xmm1
 849         ucomisd %xmm1, %xmm0
 850         setp %al
 851         movzbl %al, %eax
 852         ret
 853
 854 the pxor is not needed, we could compare the value against itself.
 855
 856 //===---------------------------------------------------------------------===//
 857
 858 These two functions have identical effects:
 859
 860 unsigned int f(unsigned int i, unsigned int n) {++i; if (i == n) ++i; return i;}
 861 unsigned int f2(unsigned int i, unsigned int n) {++i; i += i == n; return i;}
 862
 863 We currently compile them to:
 864
 865 _f:
 866         movl 4(%esp), %eax
 867         movl %eax, %ecx
 868         incl %ecx
 869         movl 8(%esp), %edx
 870         cmpl %edx, %ecx
 871         jne LBB1_2      #UnifiedReturnBlock
 872 LBB1_1: #cond_true
 873         addl $2, %eax
 874         ret
 875 LBB1_2: #UnifiedReturnBlock
 876         movl %ecx, %eax
 877         ret
 878 _f2:
 879         movl 4(%esp), %eax
 880         movl %eax, %ecx
 881         incl %ecx
 882         cmpl 8(%esp), %ecx
 883         sete %cl
 884         movzbl %cl, %ecx
 885         leal 1(%ecx,%eax), %eax
 886         ret
 887
 888 both of which are inferior to GCC's:
 889
 890 _f:
 891         movl    4(%esp), %edx
 892         leal    1(%edx), %eax
 893         addl    $2, %edx
 894         cmpl    8(%esp), %eax
 895         cmove   %edx, %eax
 896         ret
 897 _f2:
 898         movl    4(%esp), %eax
 899         addl    $1, %eax
 900         xorl    %edx, %edx
 901         cmpl    8(%esp), %eax
 902         sete    %dl
 903         addl    %edx, %eax
 904         ret
 905
 906 //===---------------------------------------------------------------------===//
 907
 908 This code:
 909
 910 void test(int X) {
 911   if (X) abort();
 912 }
 913
 914 is currently compiled to:
 915
 916 _test:
 917         subl $12, %esp
 918         cmpl $0, 16(%esp)
 919         jne LBB1_1
 920         addl $12, %esp
 921         ret
 922 LBB1_1:
 923         call L_abort$stub
 924
 925 It would be better to produce:
 926
 927 _test:
 928         subl $12, %esp
 929         cmpl $0, 16(%esp)
 930         jne L_abort$stub
 931         addl $12, %esp
 932         ret
 933
 934 This can be applied to any no-return function call that takes no arguments etc.
 935 Alternatively, the stack save/restore logic could be shrink-wrapped, producing
 936 something like this:
 937
 938 _test:
 939         cmpl $0, 4(%esp)
 940         jne LBB1_1
 941         ret
 942 LBB1_1:
 943         subl $12, %esp
 944         call L_abort$stub
 945
 946 Both are useful in different situations.  Finally, it could be shrink-wrapped
 947 and tail called, like this:
 948
 949 _test:
 950         cmpl $0, 4(%esp)
 951         jne LBB1_1
 952         ret
 953 LBB1_1:
 954         pop %eax   # realign stack.
 955         call L_abort$stub
 956
 957 Though this probably isn't worth it.
 958
 959 //===---------------------------------------------------------------------===//
 960
 961 We need to teach the codegen to convert two-address INC instructions to LEA
 962 when the flags are dead.  For example, on X86-64, compile:
 963
 964 int foo(int A, int B) {
 965   return A+1;
 966 }
 967
 968 to:
 969
 970 _foo:
 971         leal    1(%edi), %eax
 972         ret
 973
 974 instead of:
 975
 976 _foo:
 977         incl %edi
 978         movl %edi, %eax
 979         ret
 980
 981 Another example is:
 982
 983 ;; X's live range extends beyond the shift, so the register allocator
 984 ;; cannot coalesce it with Y.  Because of this, a copy needs to be
 985 ;; emitted before the shift to save the register value before it is
 986 ;; clobbered.  However, this copy is not needed if the register
 987 ;; allocator turns the shift into an LEA.  This also occurs for ADD.
 988
 989 ; Check that the shift gets turned into an LEA.
 990 ; RUN: llvm-upgrade < %s | llvm-as | llc -march=x86 -x86-asm-syntax=intel | \
 991 ; RUN:   not grep {mov E.X, E.X}
 992
 993 %G = external global int
 994
 995 int %test1(int %X, int %Y) {
 996         %Z = add int %X, %Y
 997         volatile store int %Y, int* %G
 998         volatile store int %Z, int* %G
 999         ret int %X
1000 }
1001
1002 int %test2(int %X) {
1003         %Z = add int %X, 1  ;; inc
1004         volatile store int %Z, int* %G
1005         ret int %X
1006 }
1007
1008 //===---------------------------------------------------------------------===//
1009
1010 This:
1011 #include <xmmintrin.h>
1012 unsigned test(float f) {
1013  return _mm_cvtsi128_si32( (__m128i) _mm_set_ss( f ));
1014 }
1015
1016 Compiles to:
1017 _test:
1018         movss 4(%esp), %xmm0
1019         movd %xmm0, %eax
1020         ret
1021
1022 it should compile to a move from the stack slot directly into eax.  DAGCombine
1023 has this xform, but it is currently disabled until the alignment fields of
1024 the load/store nodes are trustworthy.
1025
1026 //===---------------------------------------------------------------------===//
1027
1028 Sometimes it is better to codegen subtractions from a constant (e.g. 7-x) with
1029 a neg instead of a sub instruction.  Consider:
1030
1031 int test(char X) { return 7-X; }
1032
1033 we currently produce:
1034 _test:
1035         movl $7, %eax
1036         movsbl 4(%esp), %ecx
1037         subl %ecx, %eax
1038         ret
1039
1040 We would use one fewer register if codegen'd as:
1041
1042         movsbl 4(%esp), %eax
1043         neg %eax
1044         add $7, %eax
1045         ret
1046
1047 Note that this isn't beneficial if the load can be folded into the sub.  In
1048 this case, we want a sub:
1049
1050 int test(int X) { return 7-X; }
1051 _test:
1052         movl $7, %eax
1053         subl 4(%esp), %eax
1054         ret
1055
1056 //===---------------------------------------------------------------------===//
1057
1058 For code like:
1059 phi (undef, x)
1060
1061 We get an implicit def on the undef side. If the phi is spilled, we then get:
1062 implicitdef xmm1
1063 store xmm1 -> stack
1064
1065 It should be possible to teach the x86 backend to "fold" the store into the
1066 implicitdef, which just deletes the implicit def.
1067
1068 These instructions should go away:
1069 #IMPLICIT_DEF %xmm1
1070 movaps %xmm1, 192(%esp)
1071 movaps %xmm1, 224(%esp)
1072 movaps %xmm1, 176(%esp)
1073
1074 //===---------------------------------------------------------------------===//
1075
 1076 This is a "commutable two-address" register coalescing deficiency:
1077
1078 define <4 x float> @test1(<4 x float> %V) {
1079 entry:
1080         %tmp8 = shufflevector <4 x float> %V, <4 x float> undef, <4 x i32> < i32 3, i32 2, i32 1, i32 0 >               ; <<4 x float>> [#uses=1]
1081         %add = add <4 x float> %tmp8, %V                ; <<4 x float>> [#uses=1]
1082         ret <4 x float> %add
1083 }
1084
1085 this codegens to:
1086
1087 _test1:
1088         pshufd  $27, %xmm0, %xmm1
1089         addps   %xmm0, %xmm1
1090         movaps  %xmm1, %xmm0
1091         ret
1092
1093 instead of:
1094
1095 _test1:
1096         pshufd  $27, %xmm0, %xmm1
1097         addps   %xmm1, %xmm0
1098         ret
1099