lib/Target/X86/README.txt

   1 //===---------------------------------------------------------------------===//
   2 // Random ideas for the X86 backend.
   3 //===---------------------------------------------------------------------===//
   4
   5 Missing features:
   6   - Support for SSE4: http://www.intel.com/software/penryn
   7 http://softwarecommunity.intel.com/isn/Downloads/Intel%20SSE4%20Programming%20Reference.pdf
   8   - support for 3DNow!
   9   - weird abis?
  10
  11 //===---------------------------------------------------------------------===//
  12
  13 Add a MUL2U and MUL2S nodes to represent a multiply that returns both the
  14 Hi and Lo parts (combination of MUL and MULH[SU] into one node).  Add this to
  15 X86, & make the dag combiner produce it when needed.  This will eliminate one
  16 imul from the code generated for:
  17
  18 long long test(long long X, long long Y) { return X*Y; }
  19
  20 by using the EAX result from the mul.  We should add a similar node for
  21 DIVREM.
  22
  23 another case is:
  24
  25 long long test(int X, int Y) { return (long long)X*Y; }
  26
  27 ... which should only be one imul instruction.
  28
  29 This can be done with a custom expander, but it would be nice to move this to
  30 generic code.
  31
  32 //===---------------------------------------------------------------------===//
  33
  34 CodeGen/X86/lea-3.ll:test3 should be a single LEA, not a shift/move.  The X86
  35 backend knows how to three-addressify this shift, but it appears the register
  36 allocator isn't even asking it to do so in this case.  We should investigate
  37 why this isn't happening, it could have significant impact on other important
  38 cases for X86 as well.
  39
  40 //===---------------------------------------------------------------------===//
  41
  42 This should be one DIV/IDIV instruction, not a libcall:
  43
  44 unsigned test(unsigned long long X, unsigned Y) {
  45         return X/Y;
  46 }
  47
  48 This can be done trivially with a custom legalizer.  What about overflow
  49 though?  http://gcc.gnu.org/bugzilla/show_bug.cgi?id=14224
  50
  51 //===---------------------------------------------------------------------===//
  52
  53 Improvements to the multiply -> shift/add algorithm:
  54 http://gcc.gnu.org/ml/gcc-patches/2004-08/msg01590.html
  55
  56 //===---------------------------------------------------------------------===//
  57
  58 Improve code like this (occurs fairly frequently, e.g. in LLVM):
  59 long long foo(int x) { return 1LL << x; }
  60
  61 http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01109.html
  62 http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01128.html
  63 http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01136.html
  64
  65 Another useful one would be  ~0ULL >> X and ~0ULL << X.
  66
  67 One better solution for 1LL << x is:
  68         xorl    %eax, %eax
  69         xorl    %edx, %edx
  70         testb   $32, %cl
  71         sete    %al
  72         setne   %dl
  73         sall    %cl, %eax
  74         sall    %cl, %edx
  75
  76 But that requires good 8-bit subreg support.
  77
  78 64-bit shifts (in general) expand to really bad code.  Instead of using
  79 cmovs, we should expand to a conditional branch like GCC produces.
  80
  81 //===---------------------------------------------------------------------===//
  82
  83 Compile this:
  84 _Bool f(_Bool a) { return a!=1; }
  85
  86 into:
  87         movzbl  %dil, %eax
  88         xorl    $1, %eax
  89         ret
  90
  91 //===---------------------------------------------------------------------===//
  92
  93 Some isel ideas:
  94
  95 1. Dynamic programming based approach when compile time if not an
  96    issue.
  97 2. Code duplication (addressing mode) during isel.
  98 3. Other ideas from "Register-Sensitive Selection, Duplication, and
  99    Sequencing of Instructions".
 100 4. Scheduling for reduced register pressure.  E.g. "Minimum Register
 101    Instruction Sequence Problem: Revisiting Optimal Code Generation for DAGs"
 102    and other related papers.
 103    http://citeseer.ist.psu.edu/govindarajan01minimum.html
 104
 105 //===---------------------------------------------------------------------===//
 106
 107 Should we promote i16 to i32 to avoid partial register update stalls?
 108
 109 //===---------------------------------------------------------------------===//
 110
 111 Leave any_extend as pseudo instruction and hint to register
 112 allocator. Delay codegen until post register allocation.
 113
 114 //===---------------------------------------------------------------------===//
 115
 116 Count leading zeros and count trailing zeros:
 117
 118 int clz(int X) { return __builtin_clz(X); }
 119 int ctz(int X) { return __builtin_ctz(X); }
 120
 121 $ gcc t.c -S -o - -O3  -fomit-frame-pointer -masm=intel
 122 clz:
 123         bsr     %eax, DWORD PTR [%esp+4]
 124         xor     %eax, 31
 125         ret
 126 ctz:
 127         bsf     %eax, DWORD PTR [%esp+4]
 128         ret
 129
 130 however, check that these are defined for 0 and 32.  Our intrinsics are, GCC's
 131 aren't.
 132
 133 Another example (use predsimplify to eliminate a select):
 134
 135 int foo (unsigned long j) {
 136   if (j)
 137     return __builtin_ffs (j) - 1;
 138   else
 139     return 0;
 140 }
 141
 142 //===---------------------------------------------------------------------===//
 143
 144 Use push/pop instructions in prolog/epilog sequences instead of stores off
 145 ESP (certain code size win, perf win on some [which?] processors).
 146 Also, it appears icc use push for parameter passing. Need to investigate.
 147
 148 //===---------------------------------------------------------------------===//
 149
 150 Only use inc/neg/not instructions on processors where they are faster than
 151 add/sub/xor.  They are slower on the P4 due to only updating some processor
 152 flags.
 153
 154 //===---------------------------------------------------------------------===//
 155
 156 The instruction selector sometimes misses folding a load into a compare.  The
 157 pattern is written as (cmp reg, (load p)).  Because the compare isn't
 158 commutative, it is not matched with the load on both sides.  The dag combiner
 159 should be made smart enough to cannonicalize the load into the RHS of a compare
 160 when it can invert the result of the compare for free.
 161
 162 //===---------------------------------------------------------------------===//
 163
 164 How about intrinsics? An example is:
 165   *res = _mm_mulhi_epu16(*A, _mm_mul_epu32(*B, *C));
 166
 167 compiles to
 168         pmuludq (%eax), %xmm0
 169         movl 8(%esp), %eax
 170         movdqa (%eax), %xmm1
 171         pmulhuw %xmm0, %xmm1
 172
 173 The transformation probably requires a X86 specific pass or a DAG combiner
 174 target specific hook.
 175
 176 //===---------------------------------------------------------------------===//
 177
 178 In many cases, LLVM generates code like this:
 179
 180 _test:
 181         movl 8(%esp), %eax
 182         cmpl %eax, 4(%esp)
 183         setl %al
 184         movzbl %al, %eax
 185         ret
 186
 187 on some processors (which ones?), it is more efficient to do this:
 188
 189 _test:
 190         movl 8(%esp), %ebx
 191         xor  %eax, %eax
 192         cmpl %ebx, 4(%esp)
 193         setl %al
 194         ret
 195
 196 Doing this correctly is tricky though, as the xor clobbers the flags.
 197
 198 //===---------------------------------------------------------------------===//
 199
 200 We should generate bts/btr/etc instructions on targets where they are cheap or
 201 when codesize is important.  e.g., for:
 202
 203 void setbit(int *target, int bit) {
 204     *target |= (1 << bit);
 205 }
 206 void clearbit(int *target, int bit) {
 207     *target &= ~(1 << bit);
 208 }
 209
 210 //===---------------------------------------------------------------------===//
 211
 212 Instead of the following for memset char*, 1, 10:
 213
 214         movl $16843009, 4(%edx)
 215         movl $16843009, (%edx)
 216         movw $257, 8(%edx)
 217
 218 It might be better to generate
 219
 220         movl $16843009, %eax
 221         movl %eax, 4(%edx)
 222         movl %eax, (%edx)
 223         movw al, 8(%edx)
 224
 225 when we can spare a register. It reduces code size.
 226
 227 //===---------------------------------------------------------------------===//
 228
 229 Evaluate what the best way to codegen sdiv X, (2^C) is.  For X/8, we currently
 230 get this:
 231
 232 int %test1(int %X) {
 233         %Y = div int %X, 8
 234         ret int %Y
 235 }
 236
 237 _test1:
 238         movl 4(%esp), %eax
 239         movl %eax, %ecx
 240         sarl $31, %ecx
 241         shrl $29, %ecx
 242         addl %ecx, %eax
 243         sarl $3, %eax
 244         ret
 245
 246 GCC knows several different ways to codegen it, one of which is this:
 247
 248 _test1:
 249         movl    4(%esp), %eax
 250         cmpl    $-1, %eax
 251         leal    7(%eax), %ecx
 252         cmovle  %ecx, %eax
 253         sarl    $3, %eax
 254         ret
 255
 256 which is probably slower, but it's interesting at least :)
 257
 258 //===---------------------------------------------------------------------===//
 259
 260 The first BB of this code:
 261
 262 declare bool %foo()
 263 int %bar() {
 264         %V = call bool %foo()
 265         br bool %V, label %T, label %F
 266 T:
 267         ret int 1
 268 F:
 269         call bool %foo()
 270         ret int 12
 271 }
 272
 273 compiles to:
 274
 275 _bar:
 276         subl $12, %esp
 277         call L_foo$stub
 278         xorb $1, %al
 279         testb %al, %al
 280         jne LBB_bar_2   # F
 281
 282 It would be better to emit "cmp %al, 1" than a xor and test.
 283
 284 //===---------------------------------------------------------------------===//
 285
 286 Enable X86InstrInfo::convertToThreeAddress().
 287
 288 //===---------------------------------------------------------------------===//
 289
 290 We are currently lowering large (1MB+) memmove/memcpy to rep/stosl and rep/movsl
 291 We should leave these as libcalls for everything over a much lower threshold,
 292 since libc is hand tuned for medium and large mem ops (avoiding RFO for large
 293 stores, TLB preheating, etc)
 294
 295 //===---------------------------------------------------------------------===//
 296
 297 Optimize this into something reasonable:
 298  x * copysign(1.0, y) * copysign(1.0, z)
 299
 300 //===---------------------------------------------------------------------===//
 301
 302 Optimize copysign(x, *y) to use an integer load from y.
 303
 304 //===---------------------------------------------------------------------===//
 305
 306 %X = weak global int 0
 307
 308 void %foo(int %N) {
 309         %N = cast int %N to uint
 310         %tmp.24 = setgt int %N, 0
 311         br bool %tmp.24, label %no_exit, label %return
 312
 313 no_exit:
 314         %indvar = phi uint [ 0, %entry ], [ %indvar.next, %no_exit ]
 315         %i.0.0 = cast uint %indvar to int
 316         volatile store int %i.0.0, int* %X
 317         %indvar.next = add uint %indvar, 1
 318         %exitcond = seteq uint %indvar.next, %N
 319         br bool %exitcond, label %return, label %no_exit
 320
 321 return:
 322         ret void
 323 }
 324
 325 compiles into:
 326
 327         .text
 328         .align  4
 329         .globl  _foo
 330 _foo:
 331         movl 4(%esp), %eax
 332         cmpl $1, %eax
 333         jl LBB_foo_4    # return
 334 LBB_foo_1:      # no_exit.preheader
 335         xorl %ecx, %ecx
 336 LBB_foo_2:      # no_exit
 337         movl L_X$non_lazy_ptr, %edx
 338         movl %ecx, (%edx)
 339         incl %ecx
 340         cmpl %eax, %ecx
 341         jne LBB_foo_2   # no_exit
 342 LBB_foo_3:      # return.loopexit
 343 LBB_foo_4:      # return
 344         ret
 345
 346 We should hoist "movl L_X$non_lazy_ptr, %edx" out of the loop after
 347 remateralization is implemented. This can be accomplished with 1) a target
 348 dependent LICM pass or 2) makeing SelectDAG represent the whole function.
 349
 350 //===---------------------------------------------------------------------===//
 351
 352 The following tests perform worse with LSR:
 353
 354 lambda, siod, optimizer-eval, ackermann, hash2, nestedloop, strcat, and Treesor.
 355
 356 //===---------------------------------------------------------------------===//
 357
 358 We are generating far worse code than gcc:
 359
 360 volatile short X, Y;
 361
 362 void foo(int N) {
 363   int i;
 364   for (i = 0; i < N; i++) { X = i; Y = i*4; }
 365 }
 366
 367 LBB1_1: #bb.preheader
 368         xorl %ecx, %ecx
 369         xorw %dx, %dx
 370 LBB1_2: #bb
 371         movl L_X$non_lazy_ptr, %esi
 372         movw %dx, (%esi)
 373         movw %dx, %si
 374         shlw $2, %si
 375         movl L_Y$non_lazy_ptr, %edi
 376         movw %si, (%edi)
 377         incl %ecx
 378         incw %dx
 379         cmpl %eax, %ecx
 380         jne LBB1_2      #bb
 381
 382 vs.
 383
 384         xorl    %edx, %edx
 385         movl    L_X$non_lazy_ptr-"L00000000001$pb"(%ebx), %esi
 386         movl    L_Y$non_lazy_ptr-"L00000000001$pb"(%ebx), %ecx
 387 L4:
 388         movw    %dx, (%esi)
 389         leal    0(,%edx,4), %eax
 390         movw    %ax, (%ecx)
 391         addl    $1, %edx
 392         cmpl    %edx, %edi
 393         jne     L4
 394
 395 There are 3 issues:
 396
 397 1. Lack of post regalloc LICM.
 398 2. Poor sub-regclass support. That leads to inability to promote the 16-bit
 399    arithmetic op to 32-bit and making use of leal.
 400 3. LSR unable to reused IV for a different type (i16 vs. i32) even though
 401    the cast would be free.
 402
 403 //===---------------------------------------------------------------------===//
 404
 405 Teach the coalescer to coalesce vregs of different register classes. e.g. FR32 /
 406 FR64 to VR128.
 407
 408 //===---------------------------------------------------------------------===//
 409
 410 mov $reg, 48(%esp)
 411 ...
 412 leal 48(%esp), %eax
 413 mov %eax, (%esp)
 414 call _foo
 415
 416 Obviously it would have been better for the first mov (or any op) to store
 417 directly %esp[0] if there are no other uses.
 418
 419 //===---------------------------------------------------------------------===//
 420
 421 Adding to the list of cmp / test poor codegen issues:
 422
 423 int test(__m128 *A, __m128 *B) {
 424   if (_mm_comige_ss(*A, *B))
 425     return 3;
 426   else
 427     return 4;
 428 }
 429
 430 _test:
 431         movl 8(%esp), %eax
 432         movaps (%eax), %xmm0
 433         movl 4(%esp), %eax
 434         movaps (%eax), %xmm1
 435         comiss %xmm0, %xmm1
 436         setae %al
 437         movzbl %al, %ecx
 438         movl $3, %eax
 439         movl $4, %edx
 440         cmpl $0, %ecx
 441         cmove %edx, %eax
 442         ret
 443
 444 Note the setae, movzbl, cmpl, cmove can be replaced with a single cmovae. There
 445 are a number of issues. 1) We are introducing a setcc between the result of the
 446 intrisic call and select. 2) The intrinsic is expected to produce a i32 value
 447 so a any extend (which becomes a zero extend) is added.
 448
 449 We probably need some kind of target DAG combine hook to fix this.
 450
 451 //===---------------------------------------------------------------------===//
 452
 453 We generate significantly worse code for this than GCC:
 454 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=21150
 455 http://gcc.gnu.org/bugzilla/attachment.cgi?id=8701
 456
 457 There is also one case we do worse on PPC.
 458
 459 //===---------------------------------------------------------------------===//
 460
 461 If shorter, we should use things like:
 462 movzwl %ax, %eax
 463 instead of:
 464 andl $65535, %EAX
 465
 466 The former can also be used when the two-addressy nature of the 'and' would
 467 require a copy to be inserted (in X86InstrInfo::convertToThreeAddress).
 468
 469 //===---------------------------------------------------------------------===//
 470
 471 Bad codegen:
 472
 473 char foo(int x) { return x; }
 474
 475 _foo:
 476         movl 4(%esp), %eax
 477         shll $24, %eax
 478         sarl $24, %eax
 479         ret
 480
 481 SIGN_EXTEND_INREG can be implemented as (sext (trunc)) to take advantage of
 482 sub-registers.
 483
 484 //===---------------------------------------------------------------------===//
 485
 486 Consider this:
 487
 488 typedef struct pair { float A, B; } pair;
 489 void pairtest(pair P, float *FP) {
 490         *FP = P.A+P.B;
 491 }
 492
 493 We currently generate this code with llvmgcc4:
 494
 495 _pairtest:
 496         movl 8(%esp), %eax
 497         movl 4(%esp), %ecx
 498         movd %eax, %xmm0
 499         movd %ecx, %xmm1
 500         addss %xmm0, %xmm1
 501         movl 12(%esp), %eax
 502         movss %xmm1, (%eax)
 503         ret
 504
 505 we should be able to generate:
 506 _pairtest:
 507         movss 4(%esp), %xmm0
 508         movl 12(%esp), %eax
 509         addss 8(%esp), %xmm0
 510         movss %xmm0, (%eax)
 511         ret
 512
 513 The issue is that llvmgcc4 is forcing the struct to memory, then passing it as
 514 integer chunks.  It does this so that structs like {short,short} are passed in
 515 a single 32-bit integer stack slot.  We should handle the safe cases above much
 516 nicer, while still handling the hard cases.
 517
 518 While true in general, in this specific case we could do better by promoting
 519 load int + bitcast to float -> load fload.  This basically needs alignment info,
 520 the code is already implemented (but disabled) in dag combine).
 521
 522 //===---------------------------------------------------------------------===//
 523
 524 Another instruction selector deficiency:
 525
 526 void %bar() {
 527         %tmp = load int (int)** %foo
 528         %tmp = tail call int %tmp( int 3 )
 529         ret void
 530 }
 531
 532 _bar:
 533         subl $12, %esp
 534         movl L_foo$non_lazy_ptr, %eax
 535         movl (%eax), %eax
 536         call *%eax
 537         addl $12, %esp
 538         ret
 539
 540 The current isel scheme will not allow the load to be folded in the call since
 541 the load's chain result is read by the callseq_start.
 542
 543 //===---------------------------------------------------------------------===//
 544
 545 Don't forget to find a way to squash noop truncates in the JIT environment.
 546
 547 //===---------------------------------------------------------------------===//
 548
 549 Implement anyext in the same manner as truncate that would allow them to be
 550 eliminated.
 551
 552 //===---------------------------------------------------------------------===//
 553
 554 How about implementing truncate / anyext as a property of machine instruction
 555 operand? i.e. Print as 32-bit super-class register / 16-bit sub-class register.
 556 Do this for the cases where a truncate / anyext is guaranteed to be eliminated.
 557 For IA32 that is truncate from 32 to 16 and anyext from 16 to 32.
 558
 559 //===---------------------------------------------------------------------===//
 560
 561 For this:
 562
 563 int test(int a)
 564 {
 565   return a * 3;
 566 }
 567
 568 We currently emits
 569         imull $3, 4(%esp), %eax
 570
 571 Perhaps this is what we really should generate is? Is imull three or four
 572 cycles? Note: ICC generates this:
 573         movl    4(%esp), %eax
 574         leal    (%eax,%eax,2), %eax
 575
 576 The current instruction priority is based on pattern complexity. The former is
 577 more "complex" because it folds a load so the latter will not be emitted.
 578
 579 Perhaps we should use AddedComplexity to give LEA32r a higher priority? We
 580 should always try to match LEA first since the LEA matching code does some
 581 estimate to determine whether the match is profitable.
 582
 583 However, if we care more about code size, then imull is better. It's two bytes
 584 shorter than movl + leal.
 585
 586 //===---------------------------------------------------------------------===//
 587
 588 Implement CTTZ, CTLZ with bsf and bsr.
 589
 590 //===---------------------------------------------------------------------===//
 591
 592 It appears gcc place string data with linkonce linkage in
 593 .section __TEXT,__const_coal,coalesced instead of
 594 .section __DATA,__const_coal,coalesced.
 595 Take a look at darwin.h, there are other Darwin assembler directives that we
 596 do not make use of.
 597
 598 //===---------------------------------------------------------------------===//
 599
 600 int %foo(int* %a, int %t) {
 601 entry:
 602         br label %cond_true
 603
 604 cond_true:              ; preds = %cond_true, %entry
 605         %x.0.0 = phi int [ 0, %entry ], [ %tmp9, %cond_true ]
 606         %t_addr.0.0 = phi int [ %t, %entry ], [ %tmp7, %cond_true ]
 607         %tmp2 = getelementptr int* %a, int %x.0.0
 608         %tmp3 = load int* %tmp2         ; <int> [#uses=1]
 609         %tmp5 = add int %t_addr.0.0, %x.0.0             ; <int> [#uses=1]
 610         %tmp7 = add int %tmp5, %tmp3            ; <int> [#uses=2]
 611         %tmp9 = add int %x.0.0, 1               ; <int> [#uses=2]
 612         %tmp = setgt int %tmp9, 39              ; <bool> [#uses=1]
 613         br bool %tmp, label %bb12, label %cond_true
 614
 615 bb12:           ; preds = %cond_true
 616         ret int %tmp7
 617 }
 618
 619 is pessimized by -loop-reduce and -indvars
 620
 621 //===---------------------------------------------------------------------===//
 622
 623 u32 to float conversion improvement:
 624
 625 float uint32_2_float( unsigned u ) {
 626   float fl = (int) (u & 0xffff);
 627   float fh = (int) (u >> 16);
 628   fh *= 0x1.0p16f;
 629   return fh + fl;
 630 }
 631
 632 00000000        subl    $0x04,%esp
 633 00000003        movl    0x08(%esp,1),%eax
 634 00000007        movl    %eax,%ecx
 635 00000009        shrl    $0x10,%ecx
 636 0000000c        cvtsi2ss        %ecx,%xmm0
 637 00000010        andl    $0x0000ffff,%eax
 638 00000015        cvtsi2ss        %eax,%xmm1
 639 00000019        mulss   0x00000078,%xmm0
 640 00000021        addss   %xmm1,%xmm0
 641 00000025        movss   %xmm0,(%esp,1)
 642 0000002a        flds    (%esp,1)
 643 0000002d        addl    $0x04,%esp
 644 00000030        ret
 645
 646 //===---------------------------------------------------------------------===//
 647
 648 When using fastcc abi, align stack slot of argument of type double on 8 byte
 649 boundary to improve performance.
 650
 651 //===---------------------------------------------------------------------===//
 652
 653 Codegen:
 654
 655 int f(int a, int b) {
 656   if (a == 4 || a == 6)
 657     b++;
 658   return b;
 659 }
 660
 661
 662 as:
 663
 664 or eax, 2
 665 cmp eax, 6
 666 jz label
 667
 668 //===---------------------------------------------------------------------===//
 669
 670 GCC's ix86_expand_int_movcc function (in i386.c) has a ton of interesting
 671 simplifications for integer "x cmp y ? a : b".  For example, instead of:
 672
 673 int G;
 674 void f(int X, int Y) {
 675   G = X < 0 ? 14 : 13;
 676 }
 677
 678 compiling to:
 679
 680 _f:
 681         movl $14, %eax
 682         movl $13, %ecx
 683         movl 4(%esp), %edx
 684         testl %edx, %edx
 685         cmovl %eax, %ecx
 686         movl %ecx, _G
 687         ret
 688
 689 it could be:
 690 _f:
 691         movl    4(%esp), %eax
 692         sarl    $31, %eax
 693         notl    %eax
 694         addl    $14, %eax
 695         movl    %eax, _G
 696         ret
 697
 698 etc.
 699
 700 //===---------------------------------------------------------------------===//
 701
 702 Currently we don't have elimination of redundant stack manipulations. Consider
 703 the code:
 704
 705 int %main() {
 706 entry:
 707         call fastcc void %test1( )
 708         call fastcc void %test2( sbyte* cast (void ()* %test1 to sbyte*) )
 709         ret int 0
 710 }
 711
 712 declare fastcc void %test1()
 713
 714 declare fastcc void %test2(sbyte*)
 715
 716
 717 This currently compiles to:
 718
 719         subl $16, %esp
 720         call _test5
 721         addl $12, %esp
 722         subl $16, %esp
 723         movl $_test5, (%esp)
 724         call _test6
 725         addl $12, %esp
 726
 727 The add\sub pair is really unneeded here.
 728
 729 //===---------------------------------------------------------------------===//
 730
 731 We currently compile sign_extend_inreg into two shifts:
 732
 733 long foo(long X) {
 734   return (long)(signed char)X;
 735 }
 736
 737 becomes:
 738
 739 _foo:
 740         movl 4(%esp), %eax
 741         shll $24, %eax
 742         sarl $24, %eax
 743         ret
 744
 745 This could be:
 746
 747 _foo:
 748         movsbl  4(%esp),%eax
 749         ret
 750
 751 //===---------------------------------------------------------------------===//
 752
 753 Consider the expansion of:
 754
 755 uint %test3(uint %X) {
 756         %tmp1 = rem uint %X, 255
 757         ret uint %tmp1
 758 }
 759
 760 Currently it compiles to:
 761
 762 ...
 763         movl $2155905153, %ecx
 764         movl 8(%esp), %esi
 765         movl %esi, %eax
 766         mull %ecx
 767 ...
 768
 769 This could be "reassociated" into:
 770
 771         movl $2155905153, %eax
 772         movl 8(%esp), %ecx
 773         mull %ecx
 774
 775 to avoid the copy.  In fact, the existing two-address stuff would do this
 776 except that mul isn't a commutative 2-addr instruction.  I guess this has
 777 to be done at isel time based on the #uses to mul?
 778
 779 //===---------------------------------------------------------------------===//
 780
 781 Make sure the instruction which starts a loop does not cross a cacheline
 782 boundary. This requires knowning the exact length of each machine instruction.
 783 That is somewhat complicated, but doable. Example 256.bzip2:
 784
 785 In the new trace, the hot loop has an instruction which crosses a cacheline
 786 boundary.  In addition to potential cache misses, this can't help decoding as I
 787 imagine there has to be some kind of complicated decoder reset and realignment
 788 to grab the bytes from the next cacheline.
 789
 790 532  532 0x3cfc movb     (1809(%esp, %esi), %bl   <<<--- spans 2 64 byte lines
 791 942  942 0x3d03 movl     %dh, (1809(%esp, %esi)
 792 937  937 0x3d0a incl     %esi
 793 3    3   0x3d0b cmpb     %bl, %dl
 794 27   27  0x3d0d jnz      0x000062db <main+11707>
 795
 796 //===---------------------------------------------------------------------===//
 797
 798 In c99 mode, the preprocessor doesn't like assembly comments like #TRUNCATE.
 799
 800 //===---------------------------------------------------------------------===//
 801
 802 This could be a single 16-bit load.
 803
 804 int f(char *p) {
 805     if ((p[0] == 1) & (p[1] == 2)) return 1;
 806     return 0;
 807 }
 808
 809 //===---------------------------------------------------------------------===//
 810
 811 We should inline lrintf and probably other libc functions.
 812
 813 //===---------------------------------------------------------------------===//
 814
 815 Start using the flags more.  For example, compile:
 816
 817 int add_zf(int *x, int y, int a, int b) {
 818      if ((*x += y) == 0)
 819           return a;
 820      else
 821           return b;
 822 }
 823
 824 to:
 825        addl    %esi, (%rdi)
 826        movl    %edx, %eax
 827        cmovne  %ecx, %eax
 828        ret
 829 instead of:
 830
 831 _add_zf:
 832         addl (%rdi), %esi
 833         movl %esi, (%rdi)
 834         testl %esi, %esi
 835         cmove %edx, %ecx
 836         movl %ecx, %eax
 837         ret
 838
 839 and:
 840
 841 int add_zf(int *x, int y, int a, int b) {
 842      if ((*x + y) < 0)
 843           return a;
 844      else
 845           return b;
 846 }
 847
 848 to:
 849
 850 add_zf:
 851         addl    (%rdi), %esi
 852         movl    %edx, %eax
 853         cmovns  %ecx, %eax
 854         ret
 855
 856 instead of:
 857
 858 _add_zf:
 859         addl (%rdi), %esi
 860         testl %esi, %esi
 861         cmovs %edx, %ecx
 862         movl %ecx, %eax
 863         ret
 864
 865 //===---------------------------------------------------------------------===//
 866
 867 This:
 868 #include <math.h>
 869 int foo(double X) { return isnan(X); }
 870
 871 compiles to (-m64):
 872
 873 _foo:
 874         pxor %xmm1, %xmm1
 875         ucomisd %xmm1, %xmm0
 876         setp %al
 877         movzbl %al, %eax
 878         ret
 879
 880 the pxor is not needed, we could compare the value against itself.
 881
 882 //===---------------------------------------------------------------------===//
 883
 884 These two functions have identical effects:
 885
 886 unsigned int f(unsigned int i, unsigned int n) {++i; if (i == n) ++i; return i;}
 887 unsigned int f2(unsigned int i, unsigned int n) {++i; i += i == n; return i;}
 888
 889 We currently compile them to:
 890
 891 _f:
 892         movl 4(%esp), %eax
 893         movl %eax, %ecx
 894         incl %ecx
 895         movl 8(%esp), %edx
 896         cmpl %edx, %ecx
 897         jne LBB1_2      #UnifiedReturnBlock
 898 LBB1_1: #cond_true
 899         addl $2, %eax
 900         ret
 901 LBB1_2: #UnifiedReturnBlock
 902         movl %ecx, %eax
 903         ret
 904 _f2:
 905         movl 4(%esp), %eax
 906         movl %eax, %ecx
 907         incl %ecx
 908         cmpl 8(%esp), %ecx
 909         sete %cl
 910         movzbl %cl, %ecx
 911         leal 1(%ecx,%eax), %eax
 912         ret
 913
 914 both of which are inferior to GCC's:
 915
 916 _f:
 917         movl    4(%esp), %edx
 918         leal    1(%edx), %eax
 919         addl    $2, %edx
 920         cmpl    8(%esp), %eax
 921         cmove   %edx, %eax
 922         ret
 923 _f2:
 924         movl    4(%esp), %eax
 925         addl    $1, %eax
 926         xorl    %edx, %edx
 927         cmpl    8(%esp), %eax
 928         sete    %dl
 929         addl    %edx, %eax
 930         ret
 931
 932 //===---------------------------------------------------------------------===//
 933
 934 This code:
 935
 936 void test(int X) {
 937   if (X) abort();
 938 }
 939
 940 is currently compiled to:
 941
 942 _test:
 943         subl $12, %esp
 944         cmpl $0, 16(%esp)
 945         jne LBB1_1
 946         addl $12, %esp
 947         ret
 948 LBB1_1:
 949         call L_abort$stub
 950
 951 It would be better to produce:
 952
 953 _test:
 954         subl $12, %esp
 955         cmpl $0, 16(%esp)
 956         jne L_abort$stub
 957         addl $12, %esp
 958         ret
 959
 960 This can be applied to any no-return function call that takes no arguments etc.
 961 Alternatively, the stack save/restore logic could be shrink-wrapped, producing
 962 something like this:
 963
 964 _test:
 965         cmpl $0, 4(%esp)
 966         jne LBB1_1
 967         ret
 968 LBB1_1:
 969         subl $12, %esp
 970         call L_abort$stub
 971
 972 Both are useful in different situations.  Finally, it could be shrink-wrapped
 973 and tail called, like this:
 974
 975 _test:
 976         cmpl $0, 4(%esp)
 977         jne LBB1_1
 978         ret
 979 LBB1_1:
 980         pop %eax   # realign stack.
 981         call L_abort$stub
 982
 983 Though this probably isn't worth it.
 984
 985 //===---------------------------------------------------------------------===//
 986
 987 We need to teach the codegen to convert two-address INC instructions to LEA
 988 when the flags are dead.  For example, on X86-64, compile:
 989
 990 int foo(int A, int B) {
 991   return A+1;
 992 }
 993
 994 to:
 995
 996 _foo:
 997         leal    1(%edi), %eax
 998         ret
 999
1000 instead of:
1001
1002 _foo:
1003         incl %edi
1004         movl %edi, %eax
1005         ret
1006
1007 //===---------------------------------------------------------------------===//
1008
1009 We use push/pop of stack space around calls in situations where we don't have to.
1010 Call to f below produces:
1011         subl $16, %esp      <<<<<
1012         movl %eax, (%esp)
1013         call L_f$stub
1014         addl $16, %esp     <<<<<
1015 The stack push/pop can be moved into the prolog/epilog.  It does this because it's
1016 building the frame pointer, but this should not be sufficient, only the use of alloca
1017 should cause it to do this.
1018 (There are other issues shown by this code, but this is one.)
1019
1020 typedef struct _range_t {
1021     float fbias;
1022     float fscale;
1023     int ibias;
1024     int iscale;
1025     int ishift;
1026     unsigned char lut[];
1027 } range_t;
1028
1029 struct _decode_t {
1030     int type:4;
1031     int unit:4;
1032     int alpha:8;
1033     int N:8;
1034     int bpc:8;
1035     int bpp:16;
1036     int skip:8;
1037     int swap:8;
1038     const range_t*const*range;
1039 };
1040
1041 typedef struct _decode_t decode_t;
1042
1043 extern int f(const decode_t* decode);
1044
1045 int decode_byte (const decode_t* decode) {
1046   if (decode->swap != 0)
1047     return f(decode);
1048   return 0;
1049 }
1050
1051
1052 //===---------------------------------------------------------------------===//
1053
1054 This:
1055 #include <xmmintrin.h>
1056 unsigned test(float f) {
1057  return _mm_cvtsi128_si32( (__m128i) _mm_set_ss( f ));
1058 }
1059
1060 Compiles to:
1061 _test:
1062         movss 4(%esp), %xmm0
1063         movd %xmm0, %eax
1064         ret
1065
1066 it should compile to a move from the stack slot directly into eax.  DAGCombine
1067 has this xform, but it is currently disabled until the alignment fields of
1068 the load/store nodes are trustworthy.
1069
1070
1071