//===---------------------------------------------------------------------===//
// Random ideas for the X86 backend.
//===---------------------------------------------------------------------===//
Add MUL2U and MUL2S nodes to represent a multiply that returns both the
Hi and Lo parts (combination of MUL and MULH[SU] into one node).  Add this to
X86, & make the dag combiner produce it when needed.  This will eliminate one
imul from the code generated for:

long long test(long long X, long long Y) { return X*Y; }
by using the EAX result from the mul.  We should add a similar node for
DIVREM.
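An illustrative sketch (not part of the original note): a DIVREM node would
let code like this use a single idiv, since the instruction already produces
the quotient in EAX and the remainder in EDX:

void divrem(int X, int Y, int *D, int *R) {
  *D = X / Y;   /* quotient ends up in EAX */
  *R = X % Y;   /* remainder ends up in EDX */
}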
Another example is:

long long test(int X, int Y) { return (long long)X*Y; }

... which should only be one imul instruction.

This can be done with a custom expander, but it would be nice to move this to
generic code.
//===---------------------------------------------------------------------===//

This should be one DIV/IDIV instruction, not a libcall:

unsigned test(unsigned long long X, unsigned Y) {
        return X/Y;
}

This can be done trivially with a custom legalizer.  What about overflow
though?  http://gcc.gnu.org/bugzilla/show_bug.cgi?id=14224

//===---------------------------------------------------------------------===//
Improvements to the multiply -> shift/add algorithm:
http://gcc.gnu.org/ml/gcc-patches/2004-08/msg01590.html

//===---------------------------------------------------------------------===//
Improve code like this (occurs fairly frequently, e.g. in LLVM):

long long foo(int x) { return 1LL << x; }

http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01109.html
http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01128.html
http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01136.html

Another useful one would be ~0ULL >> X and ~0ULL << X.
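For illustration (examples added here, not from the original note), these are
the usual variable-mask idioms, valid for 0 <= X < 64:

unsigned long long low_mask(unsigned X)  { return ~0ULL >> X; }  /* top X bits cleared */
unsigned long long high_mask(unsigned X) { return ~0ULL << X; }  /* low X bits cleared */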
One better solution for 1LL << x is:
        xorl    %eax, %eax
        xorl    %edx, %edx
        testb   $32, %cl
        sete    %al
        setne   %dl
        sall    %cl, %eax
        sall    %cl, %edx

But that requires good 8-bit subreg support.
64-bit shifts (in general) expand to really bad code.  Instead of using
cmovs, we should expand to a conditional branch like GCC produces.

//===---------------------------------------------------------------------===//
_Bool f(_Bool a) { return a!=1; }
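A reading of this (truncated) entry: since a _Bool is always 0 or 1, the
compare should fold to a single xor, as in this sketch:

_Bool f(_Bool a) { return a ^ 1; }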
//===---------------------------------------------------------------------===//

Some isel ideas:

1. Dynamic programming based approach when compile time is not an
   issue.
2. Code duplication (addressing mode) during isel.
3. Other ideas from "Register-Sensitive Selection, Duplication, and
   Sequencing of Instructions".
4. Scheduling for reduced register pressure.  E.g. "Minimum Register
   Instruction Sequence Problem: Revisiting Optimal Code Generation for DAGs"
   and other related papers.
   http://citeseer.ist.psu.edu/govindarajan01minimum.html

//===---------------------------------------------------------------------===//
Should we promote i16 to i32 to avoid partial register update stalls?
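For illustration (example added here): a 16-bit op writes only the low half of
the register, so a later full-width read may stall on the partial write:

short f(short a, short b) { return a + b; }   /* addw writes %ax; a later read of %eax can stall */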
//===---------------------------------------------------------------------===//

Leave any_extend as a pseudo instruction and hint to the register
allocator.  Delay codegen until post register allocation.

//===---------------------------------------------------------------------===//
Count leading zeros and count trailing zeros:

int clz(int X) { return __builtin_clz(X); }
int ctz(int X) { return __builtin_ctz(X); }
$ gcc t.c -S -o - -O3 -fomit-frame-pointer -masm=intel
clz:
        bsr %eax, DWORD PTR [%esp+4]
        xor %eax, 31
        ret
ctz:
        bsf %eax, DWORD PTR [%esp+4]
        ret
however, check that these are defined for 0 and 32.  Our intrinsics are, GCC's
aren't.
Another example (use predsimplify to eliminate a select):

int foo (unsigned long j) {
  if (j)
    return __builtin_ffs (j) - 1;
  else
    return 0;
}

//===---------------------------------------------------------------------===//
Use push/pop instructions in prolog/epilog sequences instead of stores off
ESP (certain code size win, perf win on some [which?] processors).
Also, it appears icc uses push for parameter passing.  Need to investigate.

//===---------------------------------------------------------------------===//
Only use inc/neg/not instructions on processors where they are faster than
add/sub/xor.  They are slower on the P4 due to only updating some processor
bits.

//===---------------------------------------------------------------------===//
The instruction selector sometimes misses folding a load into a compare.  The
pattern is written as (cmp reg, (load p)).  Because the compare isn't
commutative, it is not matched with the load on both sides.  The dag combiner
should be made smart enough to canonicalize the load into the RHS of a compare
when it can invert the result of the compare for free.
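A sketch of the idea (example added here, not from the original note):

int f(int x, int *p) { return *p < x; }

gives (setlt (load p), x), with the load on the LHS where the written pattern
cannot match it; rewriting it as (setgt x, (load p)) moves the load to the RHS
at the cost of swapping the condition code.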
//===---------------------------------------------------------------------===//

How about intrinsics? An example is:
  *res = _mm_mulhi_epu16(*A, _mm_mul_epu32(*B, *C));

The loads should fold into the SSE instructions, as in:

	pmuludq (%eax), %xmm0

The transformation probably requires an X86 specific pass or a DAG combiner
target specific hook.
//===---------------------------------------------------------------------===//

In many cases, LLVM generates code like this:

_test:
	movl 8(%esp), %eax
	cmpl %eax, 4(%esp)
	setl %al
	movzbl %al, %eax
	ret

on some processors (which ones?), it is more efficient to do this:

_test:
	movl 8(%esp), %ebx
	xor %eax, %eax
	cmpl %ebx, 4(%esp)
	setl %al
	ret

Doing this correctly is tricky though, as the xor clobbers the flags.
We should generate bts/btr/etc instructions on targets where they are cheap or
when codesize is important.  e.g., for:

void setbit(int *target, int bit) {
   *target |= (1 << bit);
}
void clearbit(int *target, int bit) {
   *target &= ~(1 << bit);
}
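A sketch (added here, not from the original note) of the desired selection for
setbit, assuming the flag result of bts is dead:

	movl 4(%esp), %eax
	movl 8(%esp), %ecx
	btsl %ecx, (%eax)	# set bit %ecx of *target in one instruction
	ret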
//===---------------------------------------------------------------------===//

Instead of the following for memset char*, 1, 10:

	movl $16843009, 4(%edx)
	movl $16843009, (%edx)
	movw $257, 8(%edx)

It might be better to generate

	movl $16843009, %eax
	movl %eax, 4(%edx)
	movl %eax, (%edx)
	movw %ax, 8(%edx)

when we can spare a register.  It reduces code size.

//===---------------------------------------------------------------------===//
Evaluate what the best way to codegen sdiv X, (2^C) is.  For X/8, we currently
generate:

	movl 4(%esp), %eax
	movl %eax, %ecx
	sarl $31, %ecx
	shrl $29, %ecx
	addl %ecx, %eax
	sarl $3, %eax
	ret

GCC knows several different ways to codegen it, one of which is this:

	movl 4(%esp), %eax
	cmpl $-1, %eax
	leal 7(%eax), %ecx
	cmovle %ecx, %eax
	sarl $3, %eax
	ret

which is probably slower, but it's interesting at least :)
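The trick in both sequences (explanation added here): an arithmetic right
shift rounds toward minus infinity, so a bias of 2^C - 1 must be added first
when X is negative to get C's round-toward-zero semantics.  In C (assuming >>
is an arithmetic shift, as on x86):

int sdiv8(int X) {
  int bias = (X >> 31) & 7;   /* 7 if X is negative, 0 otherwise */
  return (X + bias) >> 3;     /* the sar now rounds toward zero */
}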
//===---------------------------------------------------------------------===//

The first BB of this code:

	%V = call bool %foo()
	br bool %V, label %T, label %F

It would be better to emit "cmp %al, 1" than an xor and test.

//===---------------------------------------------------------------------===//
Enable X86InstrInfo::convertToThreeAddress().

//===---------------------------------------------------------------------===//
We are currently lowering large (1MB+) memmove/memcpy to rep/stosl and rep/movsl.
We should leave these as libcalls for everything over a much lower threshold,
since libc is hand tuned for medium and large mem ops (avoiding RFO for large
stores, TLB preheating, etc.)

//===---------------------------------------------------------------------===//
Optimize this into something reasonable:
 x * copysign(1.0, y) * copysign(1.0, z)
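One reasonable target form, sketched in C (added here; it assumes IEEE
doubles, where only the sign bits of y and z matter):

#include <stdint.h>
#include <string.h>

double f(double x, double y, double z) {
  uint64_t xb, yb, zb;
  memcpy(&xb, &x, 8);
  memcpy(&yb, &y, 8);
  memcpy(&zb, &z, 8);
  xb ^= (yb ^ zb) & 0x8000000000000000ULL;  /* flip x's sign iff y and z disagree */
  memcpy(&x, &xb, 8);
  return x;
}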
//===---------------------------------------------------------------------===//

Optimize copysign(x, *y) to use an integer load from y.

//===---------------------------------------------------------------------===//
%X = weak global int 0

void %foo(int %N) {
entry:
	%N = cast int %N to uint
	%tmp.24 = setgt int %N, 0
	br bool %tmp.24, label %no_exit, label %return

no_exit:		; preds = %entry, %no_exit
	%indvar = phi uint [ 0, %entry ], [ %indvar.next, %no_exit ]
	%i.0.0 = cast uint %indvar to int
	volatile store int %i.0.0, int* %X
	%indvar.next = add uint %indvar, 1
	%exitcond = seteq uint %indvar.next, %N
	br bool %exitcond, label %return, label %no_exit
	jl LBB_foo_4	# return
LBB_foo_1:	# no_exit.preheader
	movl L_X$non_lazy_ptr, %edx

	jne LBB_foo_2	# no_exit
LBB_foo_3:	# return.loopexit
We should hoist "movl L_X$non_lazy_ptr, %edx" out of the loop after
rematerialization is implemented.  This can be accomplished with 1) a target
dependent LICM pass or 2) making the SelectionDAG represent the whole function.

//===---------------------------------------------------------------------===//
The following tests perform worse with LSR:

lambda, siod, optimizer-eval, ackermann, hash2, nestedloop, strcat, and Treesor.

//===---------------------------------------------------------------------===//
We are generating far worse code than gcc:

volatile short X, Y;

void foo(int N) {
  int i;
  for (i = 0; i < N; i++) { X = i; Y = i*4; }
}

Our loop keeps reloading the globals:

LBB1_1:	#bb.preheader
	movl L_X$non_lazy_ptr, %esi
	movl L_Y$non_lazy_ptr, %edi

gcc hoists them and uses leal for the multiply:

	movl L_X$non_lazy_ptr-"L00000000001$pb"(%ebx), %esi
	movl L_Y$non_lazy_ptr-"L00000000001$pb"(%ebx), %ecx
	leal 0(,%edx,4), %eax
There are several issues:

1. Lack of post regalloc LICM.
2. Poor sub-regclass support.  That leads to inability to promote the 16-bit
   arithmetic op to 32-bit and to make use of leal.
3. LSR is unable to reuse the IV for a different type (i16 vs. i32) even though
   the cast would be free.

//===---------------------------------------------------------------------===//
Teach the coalescer to coalesce vregs of different register classes, e.g. FR32 /
FR64.

//===---------------------------------------------------------------------===//
Obviously it would have been better for the first mov (or any op) to store
directly to %esp[0] if there are no other uses.

//===---------------------------------------------------------------------===//
Adding to the list of cmp / test poor codegen issues:

int test(__m128 *A, __m128 *B) {
  if (_mm_comige_ss(*A, *B))
    return 3;
  else
    return 4;
}

Note the setae, movzbl, cmpl, cmove can be replaced with a single cmovae.  There
are a number of issues.  1) We are introducing a setcc between the result of the
intrinsic call and select.  2) The intrinsic is expected to produce an i32 value
so an any_extend (which becomes a zero extend) is added.

We probably need some kind of target DAG combine hook to fix this.

//===---------------------------------------------------------------------===//
We generate significantly worse code for this than GCC:
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=21150
http://gcc.gnu.org/bugzilla/attachment.cgi?id=8701

There is also one case we do worse on PPC.

//===---------------------------------------------------------------------===//
If shorter, we should use things like:

movzwl (%esp), %eax

instead of:

movl (%esp), %eax
andl $65535, %eax

The former can also be used when the two-addressy nature of the 'and' would
require a copy to be inserted (in X86InstrInfo::convertToThreeAddress).

//===---------------------------------------------------------------------===//
char foo(int x) { return x; }

SIGN_EXTEND_INREG can be implemented as (sext (trunc)) to take advantage of
the fact that the truncate is free on x86 (just refer to the low subregister).

//===---------------------------------------------------------------------===//
typedef struct pair { float A, B; } pair;
void pairtest(pair P, float *FP) {
        *FP = P.A+P.B;
}

We currently generate this code with llvmgcc4:

we should be able to generate:

The issue is that llvmgcc4 is forcing the struct to memory, then passing it as
integer chunks.  It does this so that structs like {short,short} are passed in
a single 32-bit integer stack slot.  We should handle the safe cases above much
nicer, while still handling the hard cases.

While true in general, in this specific case we could do better by promoting
load int + bitcast to float -> load float.  This basically needs alignment info;
the code is already implemented (but disabled) in the dag combiner.

//===---------------------------------------------------------------------===//
Another instruction selector deficiency:

void %bar() {
	%tmp = load int (int)** %foo
	%tmp = tail call int %tmp( int 3 )
	ret void
}

The generated code loads the pointer with

	movl L_foo$non_lazy_ptr, %eax

rather than folding the load into the call.  The current isel scheme will not
allow the load to be folded in the call since the load's chain result is read
by the callseq_start.

//===---------------------------------------------------------------------===//
Don't forget to find a way to squash noop truncates in the JIT environment.

//===---------------------------------------------------------------------===//

Implement anyext in the same manner as truncate, which would allow them to be
eliminated.

//===---------------------------------------------------------------------===//
How about implementing truncate / anyext as a property of machine instruction
operand? i.e. Print as 32-bit super-class register / 16-bit sub-class register.
Do this for the cases where a truncate / anyext is guaranteed to be eliminated.
For IA32 that is truncate from 32 to 16 and anyext from 16 to 32.

//===---------------------------------------------------------------------===//
We generate:

	imull $3, 4(%esp), %eax

for "return a * 3;".  Perhaps this is what we really should generate?  Is
imull three or four cycles?  Note: ICC generates this:

	movl 4(%esp), %eax
	leal (%eax,%eax,2), %eax

The current instruction priority is based on pattern complexity.  The former is
more "complex" because it folds a load so the latter will not be emitted.

Perhaps we should use AddedComplexity to give LEA32r a higher priority?  We
should always try to match LEA first since the LEA matching code does some
estimate to determine whether the match is profitable.

However, if we care more about code size, then imull is better.  It's two bytes
shorter than movl + leal.

//===---------------------------------------------------------------------===//
Implement CTTZ, CTLZ with bsf and bsr.
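A sketch of the mapping (added here; note that bsf/bsr leave the destination
undefined when the source is zero, so that case needs separate handling):

	bsfl %ecx, %eax		# cttz: index of the lowest set bit of %ecx
	bsrl %ecx, %eax		# ctlz: index of the highest set bit...
	xorl $31, %eax		# ...then 31 - index, which is xor 31 here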
//===---------------------------------------------------------------------===//

It appears gcc places string data with linkonce linkage in
.section __TEXT,__const_coal,coalesced instead of
.section __DATA,__const_coal,coalesced.
Take a look at darwin.h; there are other Darwin assembler directives that we
do not make use of.

//===---------------------------------------------------------------------===//
int %foo(int* %a, int %t) {
entry:
	br label %cond_true

cond_true:		; preds = %cond_true, %entry
	%x.0.0 = phi int [ 0, %entry ], [ %tmp9, %cond_true ]
	%t_addr.0.0 = phi int [ %t, %entry ], [ %tmp7, %cond_true ]
	%tmp2 = getelementptr int* %a, int %x.0.0
	%tmp3 = load int* %tmp2			; <int> [#uses=1]
	%tmp5 = add int %t_addr.0.0, %x.0.0	; <int> [#uses=1]
	%tmp7 = add int %tmp5, %tmp3		; <int> [#uses=2]
	%tmp9 = add int %x.0.0, 1		; <int> [#uses=2]
	%tmp = setgt int %tmp9, 39		; <bool> [#uses=1]
	br bool %tmp, label %bb12, label %cond_true

bb12:		; preds = %cond_true
	ret int %tmp7
}

is pessimized by -loop-reduce and -indvars

//===---------------------------------------------------------------------===//
u32 to float conversion improvement:

float uint32_2_float( unsigned u ) {
  float fl = (int) (u & 0xffff);
  float fh = (int) (u >> 16);
  return fh * 65536.0f + fl;
}

00000000	subl	$0x04,%esp
00000003	movl	0x08(%esp,1),%eax
00000007	movl	%eax,%ecx
00000009	shrl	$0x10,%ecx
0000000c	cvtsi2ss	%ecx,%xmm0
00000010	andl	$0x0000ffff,%eax
00000015	cvtsi2ss	%eax,%xmm1
00000019	mulss	0x00000078,%xmm0
00000021	addss	%xmm1,%xmm0
00000025	movss	%xmm0,(%esp,1)
0000002a	flds	(%esp,1)
0000002d	addl	$0x04,%esp
00000030	ret
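Why the source splits the value (explanation added here): cvtsi2ss is a signed
conversion, so a full u32 can't be converted directly.  Each half fits in 31
bits, so both cvtsi2ss uses are exact; fh*65536.0f needs at most 16
significant bits so it is exact too, and the final add rounds only once,
giving the correctly rounded float(u).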
//===---------------------------------------------------------------------===//

When using the fastcc ABI, align stack slots of arguments of type double on an
8 byte boundary to improve performance.

//===---------------------------------------------------------------------===//
Improve this:

int f(int a, int b) {
  if (a == 4 || a == 6)
    b++;
  return b;
}
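A branch-free check is possible because 4 and 6 differ only in bit 1 (sketch
added here, not from the original note):

int f(int a, int b) {
  if ((a & ~2) == 4)   /* true exactly for a == 4 (100b) and a == 6 (110b) */
    b++;
  return b;
}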
//===---------------------------------------------------------------------===//

GCC's ix86_expand_int_movcc function (in i386.c) has a ton of interesting
simplifications for integer "x cmp y ? a : b".  For example, instead of:

void f(int X, int Y) {

//===---------------------------------------------------------------------===//
Currently we don't have elimination of redundant stack manipulations.  Consider:

int %main() {
entry:
	call fastcc void %test1( )
	call fastcc void %test2( sbyte* cast (void ()* %test1 to sbyte*) )
	ret int 0
}

declare fastcc void %test1()
declare fastcc void %test2(sbyte*)

This currently compiles to:

The add/sub pair is really unneeded here.

//===---------------------------------------------------------------------===//
We currently compile sign_extend_inreg into two shifts:

long foo(long X) {
  return (long)(signed char)X;
}
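What the two forms compute (sketch of the equivalence, added here; assumes >>
is an arithmetic shift on 32-bit long):

/* current expansion: shll $24, %eax ; sarl $24, %eax */
long foo_shifts(long X) { return (X << 24) >> 24; }

/* desired: a single movsbl, i.e. direct sign extension of the low byte */
long foo_movsbl(long X) { return (signed char)X; }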
//===---------------------------------------------------------------------===//

Consider the expansion of:

uint %test3(uint %X) {
	%tmp1 = rem uint %X, 255
	ret uint %tmp1
}

Currently it compiles to:

	movl $2155905153, %ecx

This could be "reassociated" into:

	movl $2155905153, %eax

to avoid the copy.  In fact, the existing two-address stuff would do this
except that mul isn't a commutative 2-addr instruction.  I guess this has
to be done at isel time based on the #uses of the mul?
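For reference (arithmetic added here, not from the note): 2155905153 is
0x80808081 = ceil(2^39 / 255), the magic constant for unsigned division by 255:

unsigned div255(unsigned X) {
  /* high multiply, then shift: (X * 0x80808081) >> 39 == X / 255 */
  return (unsigned)(((unsigned long long)X * 0x80808081ULL) >> 39);
}
unsigned rem255(unsigned X) { return X - div255(X) * 255; }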
//===---------------------------------------------------------------------===//

Make sure the instruction which starts a loop does not cross a cacheline
boundary.  This requires knowing the exact length of each machine instruction.
That is somewhat complicated, but doable.  Example 256.bzip2:

In the new trace, the hot loop has an instruction which crosses a cacheline
boundary.  In addition to potential cache misses, this can't help decoding as I
imagine there has to be some kind of complicated decoder reset and realignment
to grab the bytes from the next cacheline.

532  532 0x3cfc movb     (1809(%esp, %esi), %bl   <<<--- spans 2 64 byte lines
942  942 0x3d03 movl     %dh, (1809(%esp, %esi)
937  937 0x3d0a incl     %esi
3    3   0x3d0b cmpb     %bl, %dl
27   27  0x3d0d jnz      0x000062db <main+11707>

//===---------------------------------------------------------------------===//
In c99 mode, the preprocessor doesn't like assembly comments like #TRUNCATE.

//===---------------------------------------------------------------------===//
This could be a single 16-bit load.

int f(char *p) {
    if ((p[0] == 1) & (p[1] == 2)) return 1;
    return 0;
}
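i.e. (sketch added here; assumes a little-endian target where the unaligned
two-byte access is legal) the pair of byte compares folds to one 16-bit
compare:

#include <string.h>

int f(char *p) {
  unsigned short v;
  memcpy(&v, p, 2);     /* one 16-bit load */
  return v == 0x0201;   /* p[0]==1 in the low byte, p[1]==2 in the high byte */
}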
//===---------------------------------------------------------------------===//

We should inline lrintf and probably other libc functions.

//===---------------------------------------------------------------------===//
Start using the flags more.  For example, compile:

int add_zf(int *x, int y, int a, int b) {
     if ((*x += y) == 0)
          return a;
     else
          return b;
}
and this, which should use the sign flag from the same addl:

int add_zf(int *x, int y, int a, int b) {
    if ((*x += y) < 0)
        return a;
    else
        return b;
}

//===---------------------------------------------------------------------===//
int foo(double X) { return isnan(X); }

The pxor is not needed; we could compare the value against itself.
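i.e. (sketch added here, relying on a NaN being the only value that compares
unequal to itself):

int foo(double X) { return X != X; }   /* one ucomisd of X against itself */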
//===---------------------------------------------------------------------===//

These two functions have identical effects:

unsigned int f(unsigned int i, unsigned int n) {++i; if (i == n) ++i; return i;}
unsigned int f2(unsigned int i, unsigned int n) {++i; i += i == n; return i;}

We currently compile them to:

	jne LBB1_2	#UnifiedReturnBlock
LBB1_2:	#UnifiedReturnBlock

	leal 1(%ecx,%eax), %eax

both of which are inferior to GCC's output.

//===---------------------------------------------------------------------===//
This code:

void test(int X) {
  if (X) abort();
}

is currently compiled to:
It would be better to produce:

This can be applied to any no-return function call that takes no arguments etc.
Alternatively, the stack save/restore logic could be shrink-wrapped, producing
something like this:

Both are useful in different situations.  Finally, it could be shrink-wrapped
and tail called, like this:

	pop %eax	# realign stack.

Though this probably isn't worth it.

//===---------------------------------------------------------------------===//
We need to teach the codegen to convert two-address INC instructions to LEA
when the flags are dead.  For example, on X86-64, compile:

int foo(int A, int B) {
  return A+1;
}
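The point (sketch added here): incl reads and writes its single operand and
clobbers EFLAGS, while leal can write the sum into a fresh register without
touching the flags:

	movl %edi, %eax		# current: copy, then two-address inc
	incl %eax

	leal 1(%edi), %eax	# better: one three-address lea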
//===---------------------------------------------------------------------===//

We use push/pop of stack space around calls in situations where we don't have to.
The call to f below produces:

The stack push/pop could be moved into the prolog/epilog.  We currently emit it
around the call because the function is building a frame pointer, but that by
itself should not be sufficient; only the use of alloca should cause it.
(There are other issues shown by this code, but this is one.)
typedef struct _range_t {
    unsigned char lut[];

    const range_t*const*range;

typedef struct _decode_t decode_t;

extern int f(const decode_t* decode);

int decode_byte (const decode_t* decode) {
  if (decode->swap != 0)