lib/Target/X86/README.txt

   1 //===---------------------------------------------------------------------===//
   2 // Random ideas for the X86 backend.
   3 //===---------------------------------------------------------------------===//
   4
   5 Missing features:
   6   - support for 3DNow!
   7   - weird abis?
   8
   9 //===---------------------------------------------------------------------===//
  10
  11 Add MUL2U and MUL2S nodes to represent a multiply that returns both the
  12 Hi and Lo parts (combination of MUL and MULH[SU] into one node).  Add this to
  13 X86, & make the dag combiner produce it when needed.  This will eliminate one
  14 imul from the code generated for:
  15
  16 long long test(long long X, long long Y) { return X*Y; }
  17
  18 by using the EAX result from the mul.  We should add a similar node for
  19 DIVREM.
  20
  21 another case is:
  22
  23 long long test(int X, int Y) { return (long long)X*Y; }
  24
  25 ... which should only be one imul instruction.
  26
  27 This can be done with a custom expander, but it would be nice to move this to
  28 generic code.
  29
  30 //===---------------------------------------------------------------------===//
  31
  32 CodeGen/X86/lea-3.ll:test3 should be a single LEA, not a shift/move.  The X86
  33 backend knows how to three-addressify this shift, but it appears the register
  34 allocator isn't even asking it to do so in this case.  We should investigate
  35 why this isn't happening, it could have significant impact on other important
  36 cases for X86 as well.
  37
  38 //===---------------------------------------------------------------------===//
  39
  40 This should be one DIV/IDIV instruction, not a libcall:
  41
  42 unsigned test(unsigned long long X, unsigned Y) {
  43         return X/Y;
  44 }
  45
  46 This can be done trivially with a custom legalizer.  What about overflow
  47 though?  http://gcc.gnu.org/bugzilla/show_bug.cgi?id=14224
  48
  49 //===---------------------------------------------------------------------===//
  50
  51 Improvements to the multiply -> shift/add algorithm:
  52 http://gcc.gnu.org/ml/gcc-patches/2004-08/msg01590.html
  53
  54 //===---------------------------------------------------------------------===//
  55
  56 Improve code like this (occurs fairly frequently, e.g. in LLVM):
  57 long long foo(int x) { return 1LL << x; }
  58
  59 http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01109.html
  60 http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01128.html
  61 http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01136.html
  62
  63 Another useful one would be  ~0ULL >> X and ~0ULL << X.
  64
  65 One better solution for 1LL << x is:
  66         xorl    %eax, %eax
  67         xorl    %edx, %edx
  68         testb   $32, %cl
  69         sete    %al
  70         setne   %dl
  71         sall    %cl, %eax
  72         sall    %cl, %edx
  73
  74 But that requires good 8-bit subreg support.
  75
  76 64-bit shifts (in general) expand to really bad code.  Instead of using
  77 cmovs, we should expand to a conditional branch like GCC produces.
  78
  79 //===---------------------------------------------------------------------===//
  80
  81 Compile this:
  82 _Bool f(_Bool a) { return a!=1; }
  83
  84 into:
  85         movzbl  %dil, %eax
  86         xorl    $1, %eax
  87         ret
  88
  89 //===---------------------------------------------------------------------===//
  90
  91 Some isel ideas:
  92
  93 1. Dynamic programming based approach when compile time is not an
  94    issue.
  95 2. Code duplication (addressing mode) during isel.
  96 3. Other ideas from "Register-Sensitive Selection, Duplication, and
  97    Sequencing of Instructions".
  98 4. Scheduling for reduced register pressure.  E.g. "Minimum Register
  99    Instruction Sequence Problem: Revisiting Optimal Code Generation for DAGs"
 100    and other related papers.
 101    http://citeseer.ist.psu.edu/govindarajan01minimum.html
 102
 103 //===---------------------------------------------------------------------===//
 104
 105 Should we promote i16 to i32 to avoid partial register update stalls?
 106
 107 //===---------------------------------------------------------------------===//
 108
 109 Leave any_extend as pseudo instruction and hint to register
 110 allocator. Delay codegen until post register allocation.
 111
 112 //===---------------------------------------------------------------------===//
 113
 114 Count leading zeros and count trailing zeros:
 115
 116 int clz(int X) { return __builtin_clz(X); }
 117 int ctz(int X) { return __builtin_ctz(X); }
 118
 119 $ gcc t.c -S -o - -O3  -fomit-frame-pointer -masm=intel
 120 clz:
 121         bsr     %eax, DWORD PTR [%esp+4]
 122         xor     %eax, 31
 123         ret
 124 ctz:
 125         bsf     %eax, DWORD PTR [%esp+4]
 126         ret
 127
 128 however, check that these are defined for 0 and 32.  Our intrinsics are, GCC's
 129 aren't.
 130
 131 Another example (use predsimplify to eliminate a select):
 132
 133 int foo (unsigned long j) {
 134   if (j)
 135     return __builtin_ffs (j) - 1;
 136   else
 137     return 0;
 138 }
 139
 140 //===---------------------------------------------------------------------===//
 141
 142 Use push/pop instructions in prolog/epilog sequences instead of stores off
 143 ESP (certain code size win, perf win on some [which?] processors).
 144 Also, it appears icc uses push for parameter passing. Need to investigate.
 145
 146 //===---------------------------------------------------------------------===//
 147
 148 Only use inc/neg/not instructions on processors where they are faster than
 149 add/sub/xor.  They are slower on the P4 due to only updating some processor
 150 flags.
 151
 152 //===---------------------------------------------------------------------===//
 153
 154 The instruction selector sometimes misses folding a load into a compare.  The
 155 pattern is written as (cmp reg, (load p)).  Because the compare isn't
 156 commutative, it is not matched with the load on both sides.  The dag combiner
 157 should be made smart enough to canonicalize the load into the RHS of a compare
 158 when it can invert the result of the compare for free.
 159
 160 //===---------------------------------------------------------------------===//
 161
 162 How about intrinsics? An example is:
 163   *res = _mm_mulhi_epu16(*A, _mm_mul_epu32(*B, *C));
 164
 165 compiles to
 166         pmuludq (%eax), %xmm0
 167         movl 8(%esp), %eax
 168         movdqa (%eax), %xmm1
 169         pmulhuw %xmm0, %xmm1
 170
 171 The transformation probably requires a X86 specific pass or a DAG combiner
 172 target specific hook.
 173
 174 //===---------------------------------------------------------------------===//
 175
 176 In many cases, LLVM generates code like this:
 177
 178 _test:
 179         movl 8(%esp), %eax
 180         cmpl %eax, 4(%esp)
 181         setl %al
 182         movzbl %al, %eax
 183         ret
 184
 185 on some processors (which ones?), it is more efficient to do this:
 186
 187 _test:
 188         movl 8(%esp), %ebx
 189         xor  %eax, %eax
 190         cmpl %ebx, 4(%esp)
 191         setl %al
 192         ret
 193
 194 Doing this correctly is tricky though, as the xor clobbers the flags.
 195
 196 //===---------------------------------------------------------------------===//
 197
 198 We should generate bts/btr/etc instructions on targets where they are cheap or
 199 when codesize is important.  e.g., for:
 200
 201 void setbit(int *target, int bit) {
 202     *target |= (1 << bit);
 203 }
 204 void clearbit(int *target, int bit) {
 205     *target &= ~(1 << bit);
 206 }
 207
 208 //===---------------------------------------------------------------------===//
 209
 210 Instead of the following for memset char*, 1, 10:
 211
 212         movl $16843009, 4(%edx)
 213         movl $16843009, (%edx)
 214         movw $257, 8(%edx)
 215
 216 It might be better to generate
 217
 218         movl $16843009, %eax
 219         movl %eax, 4(%edx)
 220         movl %eax, (%edx)
 221         movw %ax, 8(%edx)
 222
 223 when we can spare a register. It reduces code size.
 224
 225 //===---------------------------------------------------------------------===//
 226
 227 Evaluate what the best way to codegen sdiv X, (2^C) is.  For X/8, we currently
 228 get this:
 229
 230 int %test1(int %X) {
 231         %Y = div int %X, 8
 232         ret int %Y
 233 }
 234
 235 _test1:
 236         movl 4(%esp), %eax
 237         movl %eax, %ecx
 238         sarl $31, %ecx
 239         shrl $29, %ecx
 240         addl %ecx, %eax
 241         sarl $3, %eax
 242         ret
 243
 244 GCC knows several different ways to codegen it, one of which is this:
 245
 246 _test1:
 247         movl    4(%esp), %eax
 248         cmpl    $-1, %eax
 249         leal    7(%eax), %ecx
 250         cmovle  %ecx, %eax
 251         sarl    $3, %eax
 252         ret
 253
 254 which is probably slower, but it's interesting at least :)
 255
 256 //===---------------------------------------------------------------------===//
 257
 258 The first BB of this code:
 259
 260 declare bool %foo()
 261 int %bar() {
 262         %V = call bool %foo()
 263         br bool %V, label %T, label %F
 264 T:
 265         ret int 1
 266 F:
 267         call bool %foo()
 268         ret int 12
 269 }
 270
 271 compiles to:
 272
 273 _bar:
 274         subl $12, %esp
 275         call L_foo$stub
 276         xorb $1, %al
 277         testb %al, %al
 278         jne LBB_bar_2   # F
 279
 280 It would be better to emit "cmp %al, 1" than a xor and test.
 281
 282 //===---------------------------------------------------------------------===//
 283
 284 Enable X86InstrInfo::convertToThreeAddress().
 285
 286 //===---------------------------------------------------------------------===//
 287
 288 We are currently lowering large (1MB+) memmove/memcpy to rep/stosl and rep/movsl
 289 We should leave these as libcalls for everything over a much lower threshold,
 290 since libc is hand tuned for medium and large mem ops (avoiding RFO for large
 291 stores, TLB preheating, etc)
 292
 293 //===---------------------------------------------------------------------===//
 294
 295 Optimize this into something reasonable:
 296  x * copysign(1.0, y) * copysign(1.0, z)
 297
 298 //===---------------------------------------------------------------------===//
 299
 300 Optimize copysign(x, *y) to use an integer load from y.
 301
 302 //===---------------------------------------------------------------------===//
 303
 304 %X = weak global int 0
 305
 306 void %foo(int %N) {
 307         %N = cast int %N to uint
 308         %tmp.24 = setgt int %N, 0
 309         br bool %tmp.24, label %no_exit, label %return
 310
 311 no_exit:
 312         %indvar = phi uint [ 0, %entry ], [ %indvar.next, %no_exit ]
 313         %i.0.0 = cast uint %indvar to int
 314         volatile store int %i.0.0, int* %X
 315         %indvar.next = add uint %indvar, 1
 316         %exitcond = seteq uint %indvar.next, %N
 317         br bool %exitcond, label %return, label %no_exit
 318
 319 return:
 320         ret void
 321 }
 322
 323 compiles into:
 324
 325         .text
 326         .align  4
 327         .globl  _foo
 328 _foo:
 329         movl 4(%esp), %eax
 330         cmpl $1, %eax
 331         jl LBB_foo_4    # return
 332 LBB_foo_1:      # no_exit.preheader
 333         xorl %ecx, %ecx
 334 LBB_foo_2:      # no_exit
 335         movl L_X$non_lazy_ptr, %edx
 336         movl %ecx, (%edx)
 337         incl %ecx
 338         cmpl %eax, %ecx
 339         jne LBB_foo_2   # no_exit
 340 LBB_foo_3:      # return.loopexit
 341 LBB_foo_4:      # return
 342         ret
 343
 344 We should hoist "movl L_X$non_lazy_ptr, %edx" out of the loop after
 345 rematerialization is implemented. This can be accomplished with 1) a target
 346 dependent LICM pass or 2) making SelectionDAG represent the whole function.
 347
 348 //===---------------------------------------------------------------------===//
 349
 350 The following tests perform worse with LSR:
 351
 352 lambda, siod, optimizer-eval, ackermann, hash2, nestedloop, strcat, and Treesort.
 353
 354 //===---------------------------------------------------------------------===//
 355
 356 We are generating far worse code than gcc:
 357
 358 volatile short X, Y;
 359
 360 void foo(int N) {
 361   int i;
 362   for (i = 0; i < N; i++) { X = i; Y = i*4; }
 363 }
 364
 365 LBB1_1: #bb.preheader
 366         xorl %ecx, %ecx
 367         xorw %dx, %dx
 368 LBB1_2: #bb
 369         movl L_X$non_lazy_ptr, %esi
 370         movw %dx, (%esi)
 371         movw %dx, %si
 372         shlw $2, %si
 373         movl L_Y$non_lazy_ptr, %edi
 374         movw %si, (%edi)
 375         incl %ecx
 376         incw %dx
 377         cmpl %eax, %ecx
 378         jne LBB1_2      #bb
 379
 380 vs.
 381
 382         xorl    %edx, %edx
 383         movl    L_X$non_lazy_ptr-"L00000000001$pb"(%ebx), %esi
 384         movl    L_Y$non_lazy_ptr-"L00000000001$pb"(%ebx), %ecx
 385 L4:
 386         movw    %dx, (%esi)
 387         leal    0(,%edx,4), %eax
 388         movw    %ax, (%ecx)
 389         addl    $1, %edx
 390         cmpl    %edx, %edi
 391         jne     L4
 392
 393 There are 3 issues:
 394
 395 1. Lack of post regalloc LICM.
 396 2. Poor sub-regclass support. That leads to inability to promote the 16-bit
 397    arithmetic op to 32-bit and making use of leal.
 398 3. LSR is unable to reuse the IV for a different type (i16 vs. i32) even though
 399    the cast would be free.
 400
 401 //===---------------------------------------------------------------------===//
 402
 403 Teach the coalescer to coalesce vregs of different register classes. e.g. FR32 /
 404 FR64 to VR128.
 405
 406 //===---------------------------------------------------------------------===//
 407
 408 mov $reg, 48(%esp)
 409 ...
 410 leal 48(%esp), %eax
 411 mov %eax, (%esp)
 412 call _foo
 413
 414 Obviously it would have been better for the first mov (or any op) to store
 415 directly to %esp[0] if there are no other uses.
 416
 417 //===---------------------------------------------------------------------===//
 418
 419 Adding to the list of cmp / test poor codegen issues:
 420
 421 int test(__m128 *A, __m128 *B) {
 422   if (_mm_comige_ss(*A, *B))
 423     return 3;
 424   else
 425     return 4;
 426 }
 427
 428 _test:
 429         movl 8(%esp), %eax
 430         movaps (%eax), %xmm0
 431         movl 4(%esp), %eax
 432         movaps (%eax), %xmm1
 433         comiss %xmm0, %xmm1
 434         setae %al
 435         movzbl %al, %ecx
 436         movl $3, %eax
 437         movl $4, %edx
 438         cmpl $0, %ecx
 439         cmove %edx, %eax
 440         ret
 441
 442 Note the setae, movzbl, cmpl, cmove can be replaced with a single cmovae. There
 443 are a number of issues. 1) We are introducing a setcc between the result of the
 444 intrinsic call and select. 2) The intrinsic is expected to produce an i32 value
 445 so an any extend (which becomes a zero extend) is added.
 446
 447 We probably need some kind of target DAG combine hook to fix this.
 448
 449 //===---------------------------------------------------------------------===//
 450
 451 We generate significantly worse code for this than GCC:
 452 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=21150
 453 http://gcc.gnu.org/bugzilla/attachment.cgi?id=8701
 454
 455 There is also one case we do worse on PPC.
 456
 457 //===---------------------------------------------------------------------===//
 458
 459 If shorter, we should use things like:
 460 movzwl %ax, %eax
 461 instead of:
 462 andl $65535, %EAX
 463
 464 The former can also be used when the two-addressy nature of the 'and' would
 465 require a copy to be inserted (in X86InstrInfo::convertToThreeAddress).
 466
 467 //===---------------------------------------------------------------------===//
 468
 469 Bad codegen:
 470
 471 char foo(int x) { return x; }
 472
 473 _foo:
 474         movl 4(%esp), %eax
 475         shll $24, %eax
 476         sarl $24, %eax
 477         ret
 478
 479 SIGN_EXTEND_INREG can be implemented as (sext (trunc)) to take advantage of
 480 sub-registers.
 481
 482 //===---------------------------------------------------------------------===//
 483
 484 Consider this:
 485
 486 typedef struct pair { float A, B; } pair;
 487 void pairtest(pair P, float *FP) {
 488         *FP = P.A+P.B;
 489 }
 490
 491 We currently generate this code with llvmgcc4:
 492
 493 _pairtest:
 494         movl 8(%esp), %eax
 495         movl 4(%esp), %ecx
 496         movd %eax, %xmm0
 497         movd %ecx, %xmm1
 498         addss %xmm0, %xmm1
 499         movl 12(%esp), %eax
 500         movss %xmm1, (%eax)
 501         ret
 502
 503 we should be able to generate:
 504 _pairtest:
 505         movss 4(%esp), %xmm0
 506         movl 12(%esp), %eax
 507         addss 8(%esp), %xmm0
 508         movss %xmm0, (%eax)
 509         ret
 510
 511 The issue is that llvmgcc4 is forcing the struct to memory, then passing it as
 512 integer chunks.  It does this so that structs like {short,short} are passed in
 513 a single 32-bit integer stack slot.  We should handle the safe cases above much
 514 nicer, while still handling the hard cases.
 515
 516 While true in general, in this specific case we could do better by promoting
 517 load int + bitcast to float -> load float.  This basically needs alignment info,
 518 the code is already implemented (but disabled) in dag combine.
 519
 520 //===---------------------------------------------------------------------===//
 521
 522 Another instruction selector deficiency:
 523
 524 void %bar() {
 525         %tmp = load int (int)** %foo
 526         %tmp = tail call int %tmp( int 3 )
 527         ret void
 528 }
 529
 530 _bar:
 531         subl $12, %esp
 532         movl L_foo$non_lazy_ptr, %eax
 533         movl (%eax), %eax
 534         call *%eax
 535         addl $12, %esp
 536         ret
 537
 538 The current isel scheme will not allow the load to be folded in the call since
 539 the load's chain result is read by the callseq_start.
 540
 541 //===---------------------------------------------------------------------===//
 542
 543 Don't forget to find a way to squash noop truncates in the JIT environment.
 544
 545 //===---------------------------------------------------------------------===//
 546
 547 Implement anyext in the same manner as truncate that would allow them to be
 548 eliminated.
 549
 550 //===---------------------------------------------------------------------===//
 551
 552 How about implementing truncate / anyext as a property of machine instruction
 553 operand? i.e. Print as 32-bit super-class register / 16-bit sub-class register.
 554 Do this for the cases where a truncate / anyext is guaranteed to be eliminated.
 555 For IA32 that is truncate from 32 to 16 and anyext from 16 to 32.
 556
 557 //===---------------------------------------------------------------------===//
 558
 559 For this:
 560
 561 int test(int a)
 562 {
 563   return a * 3;
 564 }
 565
 566 We currently emit
 567         imull $3, 4(%esp), %eax
 568
 569 Perhaps this is what we really should generate? Is imull three or four
 570 cycles? Note: ICC generates this:
 571         movl    4(%esp), %eax
 572         leal    (%eax,%eax,2), %eax
 573
 574 The current instruction priority is based on pattern complexity. The former is
 575 more "complex" because it folds a load so the latter will not be emitted.
 576
 577 Perhaps we should use AddedComplexity to give LEA32r a higher priority? We
 578 should always try to match LEA first since the LEA matching code does some
 579 estimate to determine whether the match is profitable.
 580
 581 However, if we care more about code size, then imull is better. It's two bytes
 582 shorter than movl + leal.
 583
 584 //===---------------------------------------------------------------------===//
 585
 586 Implement CTTZ, CTLZ with bsf and bsr.
 587
 588 //===---------------------------------------------------------------------===//
 589
 590 It appears gcc places string data with linkonce linkage in
 591 .section __TEXT,__const_coal,coalesced instead of
 592 .section __DATA,__const_coal,coalesced.
 593 Take a look at darwin.h, there are other Darwin assembler directives that we
 594 do not make use of.
 595
 596 //===---------------------------------------------------------------------===//
 597
 598 int %foo(int* %a, int %t) {
 599 entry:
 600         br label %cond_true
 601
 602 cond_true:              ; preds = %cond_true, %entry
 603         %x.0.0 = phi int [ 0, %entry ], [ %tmp9, %cond_true ]
 604         %t_addr.0.0 = phi int [ %t, %entry ], [ %tmp7, %cond_true ]
 605         %tmp2 = getelementptr int* %a, int %x.0.0
 606         %tmp3 = load int* %tmp2         ; <int> [#uses=1]
 607         %tmp5 = add int %t_addr.0.0, %x.0.0             ; <int> [#uses=1]
 608         %tmp7 = add int %tmp5, %tmp3            ; <int> [#uses=2]
 609         %tmp9 = add int %x.0.0, 1               ; <int> [#uses=2]
 610         %tmp = setgt int %tmp9, 39              ; <bool> [#uses=1]
 611         br bool %tmp, label %bb12, label %cond_true
 612
 613 bb12:           ; preds = %cond_true
 614         ret int %tmp7
 615 }
 616
 617 is pessimized by -loop-reduce and -indvars
 618
 619 //===---------------------------------------------------------------------===//
 620
 621 u32 to float conversion improvement:
 622
 623 float uint32_2_float( unsigned u ) {
 624   float fl = (int) (u & 0xffff);
 625   float fh = (int) (u >> 16);
 626   fh *= 0x1.0p16f;
 627   return fh + fl;
 628 }
 629
 630 00000000        subl    $0x04,%esp
 631 00000003        movl    0x08(%esp,1),%eax
 632 00000007        movl    %eax,%ecx
 633 00000009        shrl    $0x10,%ecx
 634 0000000c        cvtsi2ss        %ecx,%xmm0
 635 00000010        andl    $0x0000ffff,%eax
 636 00000015        cvtsi2ss        %eax,%xmm1
 637 00000019        mulss   0x00000078,%xmm0
 638 00000021        addss   %xmm1,%xmm0
 639 00000025        movss   %xmm0,(%esp,1)
 640 0000002a        flds    (%esp,1)
 641 0000002d        addl    $0x04,%esp
 642 00000030        ret
 643
 644 //===---------------------------------------------------------------------===//
 645
 646 When using fastcc abi, align stack slot of argument of type double on 8 byte
 647 boundary to improve performance.
 648
 649 //===---------------------------------------------------------------------===//
 650
 651 Codegen:
 652
 653 int f(int a, int b) {
 654   if (a == 4 || a == 6)
 655     b++;
 656   return b;
 657 }
 658
 659
 660 as:
 661
 662 or eax, 2
 663 cmp eax, 6
 664 jz label
 665
 666 //===---------------------------------------------------------------------===//
 667
 668 GCC's ix86_expand_int_movcc function (in i386.c) has a ton of interesting
 669 simplifications for integer "x cmp y ? a : b".  For example, instead of:
 670
 671 int G;
 672 void f(int X, int Y) {
 673   G = X < 0 ? 14 : 13;
 674 }
 675
 676 compiling to:
 677
 678 _f:
 679         movl $14, %eax
 680         movl $13, %ecx
 681         movl 4(%esp), %edx
 682         testl %edx, %edx
 683         cmovl %eax, %ecx
 684         movl %ecx, _G
 685         ret
 686
 687 it could be:
 688 _f:
 689         movl    4(%esp), %eax
 690         sarl    $31, %eax
 691         notl    %eax
 692         addl    $14, %eax
 693         movl    %eax, _G
 694         ret
 695
 696 etc.
 697
 698 //===---------------------------------------------------------------------===//
 699
 700 Currently we don't have elimination of redundant stack manipulations. Consider
 701 the code:
 702
 703 int %main() {
 704 entry:
 705         call fastcc void %test1( )
 706         call fastcc void %test2( sbyte* cast (void ()* %test1 to sbyte*) )
 707         ret int 0
 708 }
 709
 710 declare fastcc void %test1()
 711
 712 declare fastcc void %test2(sbyte*)
 713
 714
 715 This currently compiles to:
 716
 717         subl $16, %esp
 718         call _test5
 719         addl $12, %esp
 720         subl $16, %esp
 721         movl $_test5, (%esp)
 722         call _test6
 723         addl $12, %esp
 724
 725 The add\sub pair is really unneeded here.
 726
 727 //===---------------------------------------------------------------------===//
 728
 729 We currently compile sign_extend_inreg into two shifts:
 730
 731 long foo(long X) {
 732   return (long)(signed char)X;
 733 }
 734
 735 becomes:
 736
 737 _foo:
 738         movl 4(%esp), %eax
 739         shll $24, %eax
 740         sarl $24, %eax
 741         ret
 742
 743 This could be:
 744
 745 _foo:
 746         movsbl  4(%esp),%eax
 747         ret
 748
 749 //===---------------------------------------------------------------------===//
 750
 751 Consider the expansion of:
 752
 753 uint %test3(uint %X) {
 754         %tmp1 = rem uint %X, 255
 755         ret uint %tmp1
 756 }
 757
 758 Currently it compiles to:
 759
 760 ...
 761         movl $2155905153, %ecx
 762         movl 8(%esp), %esi
 763         movl %esi, %eax
 764         mull %ecx
 765 ...
 766
 767 This could be "reassociated" into:
 768
 769         movl $2155905153, %eax
 770         movl 8(%esp), %ecx
 771         mull %ecx
 772
 773 to avoid the copy.  In fact, the existing two-address stuff would do this
 774 except that mul isn't a commutative 2-addr instruction.  I guess this has
 775 to be done at isel time based on the #uses to mul?
 776
 777 //===---------------------------------------------------------------------===//
 778
 779 Make sure the instruction which starts a loop does not cross a cacheline
 780 boundary. This requires knowing the exact length of each machine instruction.
 781 That is somewhat complicated, but doable. Example 256.bzip2:
 782
 783 In the new trace, the hot loop has an instruction which crosses a cacheline
 784 boundary.  In addition to potential cache misses, this can't help decoding as I
 785 imagine there has to be some kind of complicated decoder reset and realignment
 786 to grab the bytes from the next cacheline.
 787
 788 532  532 0x3cfc movb     (1809(%esp, %esi), %bl   <<<--- spans 2 64 byte lines
 789 942  942 0x3d03 movl     %dh, (1809(%esp, %esi)
 790 937  937 0x3d0a incl     %esi
 791 3    3   0x3d0b cmpb     %bl, %dl
 792 27   27  0x3d0d jnz      0x000062db <main+11707>
 793
 794 //===---------------------------------------------------------------------===//
 795
 796 In c99 mode, the preprocessor doesn't like assembly comments like #TRUNCATE.
 797
 798 //===---------------------------------------------------------------------===//
 799
 800 This could be a single 16-bit load.
 801
 802 int f(char *p) {
 803     if ((p[0] == 1) & (p[1] == 2)) return 1;
 804     return 0;
 805 }
 806
 807 //===---------------------------------------------------------------------===//
 808
 809 We should inline lrintf and probably other libc functions.
 810
 811 //===---------------------------------------------------------------------===//
 812
 813 Start using the flags more.  For example, compile:
 814
 815 int add_zf(int *x, int y, int a, int b) {
 816      if ((*x += y) == 0)
 817           return a;
 818      else
 819           return b;
 820 }
 821
 822 to:
 823        addl    %esi, (%rdi)
 824        movl    %edx, %eax
 825        cmovne  %ecx, %eax
 826        ret
 827 instead of:
 828
 829 _add_zf:
 830         addl (%rdi), %esi
 831         movl %esi, (%rdi)
 832         testl %esi, %esi
 833         cmove %edx, %ecx
 834         movl %ecx, %eax
 835         ret
 836
 837 and:
 838
 839 int add_zf(int *x, int y, int a, int b) {
 840      if ((*x + y) < 0)
 841           return a;
 842      else
 843           return b;
 844 }
 845
 846 to:
 847
 848 add_zf:
 849         addl    (%rdi), %esi
 850         movl    %edx, %eax
 851         cmovns  %ecx, %eax
 852         ret
 853
 854 instead of:
 855
 856 _add_zf:
 857         addl (%rdi), %esi
 858         testl %esi, %esi
 859         cmovs %edx, %ecx
 860         movl %ecx, %eax
 861         ret
 862
 863 //===---------------------------------------------------------------------===//
 864
 865 This:
 866 #include <math.h>
 867 int foo(double X) { return isnan(X); }
 868
 869 compiles to (-m64):
 870
 871 _foo:
 872         pxor %xmm1, %xmm1
 873         ucomisd %xmm1, %xmm0
 874         setp %al
 875         movzbl %al, %eax
 876         ret
 877
 878 the pxor is not needed, we could compare the value against itself.
 879
 880 //===---------------------------------------------------------------------===//
 881
 882 These two functions have identical effects:
 883
 884 unsigned int f(unsigned int i, unsigned int n) {++i; if (i == n) ++i; return i;}
 885 unsigned int f2(unsigned int i, unsigned int n) {++i; i += i == n; return i;}
 886
 887 We currently compile them to:
 888
 889 _f:
 890         movl 4(%esp), %eax
 891         movl %eax, %ecx
 892         incl %ecx
 893         movl 8(%esp), %edx
 894         cmpl %edx, %ecx
 895         jne LBB1_2      #UnifiedReturnBlock
 896 LBB1_1: #cond_true
 897         addl $2, %eax
 898         ret
 899 LBB1_2: #UnifiedReturnBlock
 900         movl %ecx, %eax
 901         ret
 902 _f2:
 903         movl 4(%esp), %eax
 904         movl %eax, %ecx
 905         incl %ecx
 906         cmpl 8(%esp), %ecx
 907         sete %cl
 908         movzbl %cl, %ecx
 909         leal 1(%ecx,%eax), %eax
 910         ret
 911
 912 both of which are inferior to GCC's:
 913
 914 _f:
 915         movl    4(%esp), %edx
 916         leal    1(%edx), %eax
 917         addl    $2, %edx
 918         cmpl    8(%esp), %eax
 919         cmove   %edx, %eax
 920         ret
 921 _f2:
 922         movl    4(%esp), %eax
 923         addl    $1, %eax
 924         xorl    %edx, %edx
 925         cmpl    8(%esp), %eax
 926         sete    %dl
 927         addl    %edx, %eax
 928         ret
 929
 930 //===---------------------------------------------------------------------===//
 931
 932 This code:
 933
 934 void test(int X) {
 935   if (X) abort();
 936 }
 937
 938 is currently compiled to:
 939
 940 _test:
 941         subl $12, %esp
 942         cmpl $0, 16(%esp)
 943         jne LBB1_1
 944         addl $12, %esp
 945         ret
 946 LBB1_1:
 947         call L_abort$stub
 948
 949 It would be better to produce:
 950
 951 _test:
 952         subl $12, %esp
 953         cmpl $0, 16(%esp)
 954         jne L_abort$stub
 955         addl $12, %esp
 956         ret
 957
 958 This can be applied to any no-return function call that takes no arguments etc.
 959 Alternatively, the stack save/restore logic could be shrink-wrapped, producing
 960 something like this:
 961
 962 _test:
 963         cmpl $0, 4(%esp)
 964         jne LBB1_1
 965         ret
 966 LBB1_1:
 967         subl $12, %esp
 968         call L_abort$stub
 969
 970 Both are useful in different situations.  Finally, it could be shrink-wrapped
 971 and tail called, like this:
 972
 973 _test:
 974         cmpl $0, 4(%esp)
 975         jne LBB1_1
 976         ret
 977 LBB1_1:
 978         pop %eax   # realign stack.
 979         call L_abort$stub
 980
 981 Though this probably isn't worth it.
 982
 983 //===---------------------------------------------------------------------===//
 984
 985 We need to teach the codegen to convert two-address INC instructions to LEA
 986 when the flags are dead.  For example, on X86-64, compile:
 987
 988 int foo(int A, int B) {
 989   return A+1;
 990 }
 991
 992 to:
 993
 994 _foo:
 995         leal    1(%edi), %eax
 996         ret
 997
 998 instead of:
 999
1000 _foo:
1001         incl %edi
1002         movl %edi, %eax
1003         ret
1004
1005 //===---------------------------------------------------------------------===//
1006
1007 We use push/pop of stack space around calls in situations where we don't have to.
1008 Call to f below produces:
1009         subl $16, %esp      <<<<<
1010         movl %eax, (%esp)
1011         call L_f$stub
1012         addl $16, %esp     <<<<<
1013 The stack push/pop can be moved into the prolog/epilog.  It does this because it's
1014 building the frame pointer, but this should not be sufficient, only the use of alloca
1015 should cause it to do this.
1016 (There are other issues shown by this code, but this is one.)
1017
1018 typedef struct _range_t {
1019     float fbias;
1020     float fscale;
1021     int ibias;
1022     int iscale;
1023     int ishift;
1024     unsigned char lut[];
1025 } range_t;
1026
1027 struct _decode_t {
1028     int type:4;
1029     int unit:4;
1030     int alpha:8;
1031     int N:8;
1032     int bpc:8;
1033     int bpp:16;
1034     int skip:8;
1035     int swap:8;
1036     const range_t*const*range;
1037 };
1038
1039 typedef struct _decode_t decode_t;
1040
1041 extern int f(const decode_t* decode);
1042
1043 int decode_byte (const decode_t* decode) {
1044   if (decode->swap != 0)
1045     return f(decode);
1046   return 0;
1047 }
1048
1049
1050 //===---------------------------------------------------------------------===//
1051