//===---------------------------------------------------------------------===//
// Random ideas for the X86 backend.
//===---------------------------------------------------------------------===//

Add MUL2U and MUL2S nodes to represent a multiply that returns both the
Hi and Lo parts (a combination of MUL and MULH[SU] in one node). Add these to
X86, and make the dag combiner produce them when needed. This will eliminate one
imul from the code generated for:

long long test(long long X, long long Y) { return X*Y; }

by using the EAX result from the mul. We should add a similar node for

long long test(int X, int Y) { return (long long)X*Y; }

... which should only be one imul instruction.

This can be done with a custom expander, but it would be nice to move this to

//===---------------------------------------------------------------------===//

CodeGen/X86/lea-3.ll:test3 should be a single LEA, not a shift/move. The X86
backend knows how to three-addressify this shift, but it appears the register
allocator isn't even asking it to do so in this case. We should investigate
why this isn't happening; it could have a significant impact on other important
cases for X86 as well.

//===---------------------------------------------------------------------===//

This should be one DIV/IDIV instruction, not a libcall:

unsigned test(unsigned long long X, unsigned Y) {

This can be done trivially with a custom legalizer. What about overflow
though? http://gcc.gnu.org/bugzilla/show_bug.cgi?id=14224

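Presumably the body of test is just the 64/32 division; a sketch of the
function together with the overflow concern (an illustrative example, not
necessarily the original test case):

unsigned test(unsigned long long X, unsigned Y) {
  /* x86 divl raises #DE when the quotient does not fit in 32 bits
     (e.g. X = 1ULL << 40, Y = 1), so the single-instruction lowering is
     only safe when the quotient is known or assumed to fit. */
  return X / Y;
}
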
//===---------------------------------------------------------------------===//

Improvements to the multiply -> shift/add algorithm:
http://gcc.gnu.org/ml/gcc-patches/2004-08/msg01590.html

//===---------------------------------------------------------------------===//

Improve code like this (occurs fairly frequently, e.g. in LLVM):

long long foo(int x) { return 1LL << x; }

http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01109.html
http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01128.html
http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01136.html

Another useful one would be ~0ULL >> X and ~0ULL << X.

One better solution for 1LL << x is:

But that requires good 8-bit subreg support.

64-bit shifts (in general) expand to really bad code. Instead of using
cmovs, we should expand to a conditional branch like GCC produces.

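A sketch of the branch-based expansion for a variable 64-bit left shift,
operating on 32-bit halves (a minimal illustration, assuming 0 <= n < 64):

void shl64(unsigned *lo, unsigned *hi, unsigned n) {
  if (n >= 32) {        /* the low word moves entirely into the high word */
    *hi = *lo << (n - 32);
    *lo = 0;
  } else if (n != 0) {
    *hi = (*hi << n) | (*lo >> (32 - n));
    *lo = *lo << n;
  }
}
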
//===---------------------------------------------------------------------===//

_Bool f(_Bool a) { return a!=1; }

//===---------------------------------------------------------------------===//

1. Dynamic programming based approach when compile time is not an issue.
2. Code duplication (addressing mode) during isel.
3. Other ideas from "Register-Sensitive Selection, Duplication, and
   Sequencing of Instructions".
4. Scheduling for reduced register pressure. E.g. "Minimum Register
   Instruction Sequence Problem: Revisiting Optimal Code Generation for DAGs"
   and other related papers.
   http://citeseer.ist.psu.edu/govindarajan01minimum.html

//===---------------------------------------------------------------------===//

Should we promote i16 to i32 to avoid partial register update stalls?

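A minimal illustration (hypothetical example) of where the choice matters:
16-bit arithmetic that, if selected as i16 operations, repeatedly writes %ax
and can stall later full-register reads on some processors.

short sum16(short *p, int n) {
  short s = 0;
  int i;
  for (i = 0; i < n; i++)
    s += p[i];        /* i16 adds; candidates for promotion to i32 */
  return s;
}
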
//===---------------------------------------------------------------------===//

Leave any_extend as a pseudo instruction and hint to the register
allocator. Delay codegen until after register allocation.

//===---------------------------------------------------------------------===//

Count leading zeros and count trailing zeros:

int clz(int X) { return __builtin_clz(X); }
int ctz(int X) { return __builtin_ctz(X); }

$ gcc t.c -S -o - -O3 -fomit-frame-pointer -masm=intel

        bsr %eax, DWORD PTR [%esp+4]

        bsf %eax, DWORD PTR [%esp+4]

however, check that these are defined for 0 and 32. Our intrinsics are, GCC's
aren't.

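A sketch of the definedness issue: GCC documents __builtin_clz/__builtin_ctz
as undefined for a zero input (bsf/bsr leave the destination undefined there),
so a guard is needed if a value defined at 0 is required:

int clz_defined(unsigned X) { return X ? __builtin_clz(X) : 32; }
int ctz_defined(unsigned X) { return X ? __builtin_ctz(X) : 32; }
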
Another example (use predsimplify to eliminate a select):

int foo (unsigned long j) {
  if (j)
    return __builtin_ffs (j) - 1;
  else
    return 0;
}

//===---------------------------------------------------------------------===//

Use push/pop instructions in prolog/epilog sequences instead of stores off
ESP (certain code size win, perf win on some [which?] processors).
Also, it appears icc uses push for parameter passing. Need to investigate.

//===---------------------------------------------------------------------===//

Only use inc/neg/not instructions on processors where they are faster than
add/sub/xor. They are slower on the P4 due to only updating some of the
processor flags.

//===---------------------------------------------------------------------===//

The instruction selector sometimes misses folding a load into a compare. The
pattern is written as (cmp reg, (load p)). Because the compare isn't
commutative, it is not matched with the load on both sides. The dag combiner
should be made smart enough to canonicalize the load into the RHS of a compare
when it can invert the result of the compare for free.

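A small example of the pattern (illustrative only):

int f(int x, int *p) { return x < *p; }  /* load naturally on the RHS: cmp x, [p]; setl */
int g(int *p, int x) { return *p < x; }  /* load on the LHS; same as x > *p, so it can
                                            still fold as cmp x, [p] with the condition swapped */
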
//===---------------------------------------------------------------------===//

How about intrinsics? An example is:

  *res = _mm_mulhi_epu16(*A, _mm_mul_epu32(*B, *C));

        pmuludq (%eax), %xmm0

The transformation probably requires an X86-specific pass or a target-specific
DAG combiner hook.

//===---------------------------------------------------------------------===//

In many cases, LLVM generates code like this:

on some processors (which ones?), it is more efficient to do this:

Doing this correctly is tricky though, as the xor clobbers the flags.

//===---------------------------------------------------------------------===//

We should generate bts/btr/etc instructions on targets where they are cheap or
when codesize is important. e.g., for:

void setbit(int *target, int bit) {
  *target |= (1 << bit);
}
void clearbit(int *target, int bit) {
  *target &= ~(1 << bit);
}

//===---------------------------------------------------------------------===//

Instead of the following for memset char*, 1, 10:

        movl $16843009, 4(%edx)
        movl $16843009, (%edx)

It might be better to generate

when we can spare a register. It reduces code size.

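For reference, the call being discussed is presumably something like the
following (the wrapper is hypothetical); 10 bytes of 0x01 explain the
0x01010101 (= 16843009) word stores:

#include <string.h>

void set_ones(char *p) { memset(p, 1, 10); }
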
//===---------------------------------------------------------------------===//

Evaluate the best way to codegen sdiv X, (2^C). For X/8, we currently
generate:

GCC knows several different ways to codegen it, one of which is this:

which is probably slower, but it's interesting at least :)

//===---------------------------------------------------------------------===//

The first BB of this code:

        %V = call bool %foo()
        br bool %V, label %T, label %F

It would be better to emit "cmp %al, 1" than an xor and test.

//===---------------------------------------------------------------------===//

Enable X86InstrInfo::convertToThreeAddress().

//===---------------------------------------------------------------------===//

We are currently lowering large (1MB+) memmove/memcpy to rep/stosl and rep/movsl.
We should leave these as libcalls for everything over a much lower threshold,
since libc is hand tuned for medium and large mem ops (avoiding RFO for large
stores, TLB preheating, etc.).

//===---------------------------------------------------------------------===//

Optimize this into something reasonable:
  x * copysign(1.0, y) * copysign(1.0, z)

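The identity behind it: copysign(1.0, y) * copysign(1.0, z) is always +/-1.0,
so only the xor of the sign bits of y and z matters. A minimal sketch of the
source pattern (assuming the usual libm copysign):

#include <math.h>

double f(double x, double y, double z) {
  /* could become: flip x's sign bit iff signbit(y) != signbit(z) */
  return x * copysign(1.0, y) * copysign(1.0, z);
}
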
//===---------------------------------------------------------------------===//

Optimize copysign(x, *y) to use an integer load from y.

//===---------------------------------------------------------------------===//

%X = weak global int 0

        %N = cast int %N to uint
        %tmp.24 = setgt int %N, 0
        br bool %tmp.24, label %no_exit, label %return

        %indvar = phi uint [ 0, %entry ], [ %indvar.next, %no_exit ]
        %i.0.0 = cast uint %indvar to int
        volatile store int %i.0.0, int* %X
        %indvar.next = add uint %indvar, 1
        %exitcond = seteq uint %indvar.next, %N
        br bool %exitcond, label %return, label %no_exit

        jl LBB_foo_4    # return
LBB_foo_1:      # no_exit.preheader

        movl L_X$non_lazy_ptr, %edx

        jne LBB_foo_2   # no_exit
LBB_foo_3:      # return.loopexit

We should hoist "movl L_X$non_lazy_ptr, %edx" out of the loop after
rematerialization is implemented. This can be accomplished with 1) a target
dependent LICM pass or 2) making the SelectionDAG represent the whole function.

//===---------------------------------------------------------------------===//

The following tests perform worse with LSR:

lambda, siod, optimizer-eval, ackermann, hash2, nestedloop, strcat, and Treesor.

//===---------------------------------------------------------------------===//

We are generating far worse code than gcc:

        for (i = 0; i < N; i++) { X = i; Y = i*4; }

LBB1_1: #bb.preheader

        movl L_X$non_lazy_ptr, %esi

        movl L_Y$non_lazy_ptr, %edi

        movl L_X$non_lazy_ptr-"L00000000001$pb"(%ebx), %esi
        movl L_Y$non_lazy_ptr-"L00000000001$pb"(%ebx), %ecx

        leal 0(,%edx,4), %eax

1. Lack of post-regalloc LICM.
2. Poor sub-regclass support. That leads to an inability to promote the 16-bit
   arithmetic op to 32-bit and make use of leal.
3. LSR is unable to reuse the IV for a different type (i16 vs. i32) even though
   the cast would be free.

//===---------------------------------------------------------------------===//

Teach the coalescer to coalesce vregs of different register classes. e.g. FR32 /

//===---------------------------------------------------------------------===//

Obviously it would have been better for the first mov (or any op) to store
directly to 0(%esp) if there are no other uses.

//===---------------------------------------------------------------------===//

Adding to the list of cmp / test poor codegen issues:

int test(__m128 *A, __m128 *B) {
  if (_mm_comige_ss(*A, *B))

Note the setae, movzbl, cmpl, cmove can be replaced with a single cmovae. There
are a number of issues. 1) We are introducing a setcc between the result of the
intrinsic call and the select. 2) The intrinsic is expected to produce an i32
value, so an any_extend (which becomes a zero extend) is added.

We probably need some kind of target DAG combine hook to fix this.

//===---------------------------------------------------------------------===//

We generate significantly worse code for this than GCC:
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=21150
http://gcc.gnu.org/bugzilla/attachment.cgi?id=8701

There is also one case we do worse on PPC.

//===---------------------------------------------------------------------===//

If shorter, we should use things like:

        movzwl %ax, %eax

instead of:

        andl $65535, %eax

The former can also be used when the two-addressy nature of the 'and' would
require a copy to be inserted (in X86InstrInfo::convertToThreeAddress).

//===---------------------------------------------------------------------===//

char foo(int x) { return x; }

SIGN_EXTEND_INREG can be implemented as (sext (trunc)) to take advantage of

//===---------------------------------------------------------------------===//

typedef struct pair { float A, B; } pair;
void pairtest(pair P, float *FP) {

We currently generate this code with llvmgcc4:

we should be able to generate:

The issue is that llvmgcc4 is forcing the struct to memory, then passing it as
integer chunks. It does this so that structs like {short,short} are passed in
a single 32-bit integer stack slot. We should handle the safe cases above much
more cleanly, while still handling the hard cases.

While true in general, in this specific case we could do better by promoting
load int + bitcast to float -> load float. This basically needs alignment info;
the code is already implemented (but disabled) in the dag combiner.

//===---------------------------------------------------------------------===//

Another instruction selector deficiency:

        %tmp = load int (int)** %foo
        %tmp = tail call int %tmp( int 3 )

        movl L_foo$non_lazy_ptr, %eax

The current isel scheme will not allow the load to be folded into the call since
the load's chain result is read by the callseq_start.

//===---------------------------------------------------------------------===//

Don't forget to find a way to squash noop truncates in the JIT environment.

//===---------------------------------------------------------------------===//

Implement anyext in the same manner as truncate, which would allow them to be
eliminated.

//===---------------------------------------------------------------------===//

How about implementing truncate / anyext as a property of a machine instruction
operand? i.e. print as a 32-bit super-class register / 16-bit sub-class register.
Do this for the cases where a truncate / anyext is guaranteed to be eliminated.
For IA32 that is truncate from 32 to 16 and anyext from 16 to 32.

//===---------------------------------------------------------------------===//

        imull $3, 4(%esp), %eax

Perhaps this is what we should really generate. Is imull three or four
cycles? Note: ICC generates this:

        leal (%eax,%eax,2), %eax

The current instruction priority is based on pattern complexity. The former is
more "complex" because it folds a load, so the latter will not be emitted.

Perhaps we should use AddedComplexity to give LEA32r a higher priority? We
should always try to match LEA first since the LEA matching code does some
estimation to determine whether the match is profitable.

However, if we care more about code size, then imull is better. It's two bytes
shorter than movl + leal.

//===---------------------------------------------------------------------===//

Implement CTTZ, CTLZ with bsf and bsr.

//===---------------------------------------------------------------------===//

It appears gcc places string data with linkonce linkage in
.section __TEXT,__const_coal,coalesced instead of
.section __DATA,__const_coal,coalesced.
Take a look at darwin.h; there are other Darwin assembler directives that we
do not make use of.

//===---------------------------------------------------------------------===//

int %foo(int* %a, int %t) {

cond_true:              ; preds = %cond_true, %entry
        %x.0.0 = phi int [ 0, %entry ], [ %tmp9, %cond_true ]
        %t_addr.0.0 = phi int [ %t, %entry ], [ %tmp7, %cond_true ]
        %tmp2 = getelementptr int* %a, int %x.0.0
        %tmp3 = load int* %tmp2                 ; <int> [#uses=1]
        %tmp5 = add int %t_addr.0.0, %x.0.0     ; <int> [#uses=1]
        %tmp7 = add int %tmp5, %tmp3            ; <int> [#uses=2]
        %tmp9 = add int %x.0.0, 1               ; <int> [#uses=2]
        %tmp = setgt int %tmp9, 39              ; <bool> [#uses=1]
        br bool %tmp, label %bb12, label %cond_true

bb12:           ; preds = %cond_true

is pessimized by -loop-reduce and -indvars

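Roughly the C source this loop corresponds to (an approximate reconstruction,
for reference only):

int foo(int *a, int t) {
  int x;
  for (x = 0; x < 40; x++)
    t += x + a[x];
  return t;
}
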
//===---------------------------------------------------------------------===//

u32 to float conversion improvement:

float uint32_2_float( unsigned u ) {
  float fl = (int) (u & 0xffff);
  float fh = (int) (u >> 16);

00000000        subl    $0x04,%esp
00000003        movl    0x08(%esp,1),%eax
00000007        movl    %eax,%ecx
00000009        shrl    $0x10,%ecx
0000000c        cvtsi2ss        %ecx,%xmm0
00000010        andl    $0x0000ffff,%eax
00000015        cvtsi2ss        %eax,%xmm1
00000019        mulss   0x00000078,%xmm0
00000021        addss   %xmm1,%xmm0
00000025        movss   %xmm0,(%esp,1)
0000002a        flds    (%esp,1)
0000002d        addl    $0x04,%esp

//===---------------------------------------------------------------------===//

When using the fastcc ABI, align the stack slot of a double argument on an
8-byte boundary to improve performance.

//===---------------------------------------------------------------------===//

int f(int a, int b) {
  if (a == 4 || a == 6)

//===---------------------------------------------------------------------===//

GCC's ix86_expand_int_movcc function (in i386.c) has a ton of interesting
simplifications for integer "x cmp y ? a : b". For example, instead of:

void f(int X, int Y) {

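As a self-contained illustration of the kind of rewrite meant (a hypothetical
example, separate from f above): when both arms are constants, the select can
often be folded into the setcc result with a shift and add instead of a branch
or cmov.

int sel(int x, int y) {
  return x < y ? 5 : 1;   /* equivalent to ((x < y) << 2) + 1 */
}
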
//===---------------------------------------------------------------------===//

Currently we don't have elimination of redundant stack manipulations. Consider
the code:

        call fastcc void %test1( )
        call fastcc void %test2( sbyte* cast (void ()* %test1 to sbyte*) )

declare fastcc void %test1()

declare fastcc void %test2(sbyte*)

This currently compiles to:

The add/sub pair is really unneeded here.

//===---------------------------------------------------------------------===//

We currently compile sign_extend_inreg into two shifts:

        return (long)(signed char)X;

//===---------------------------------------------------------------------===//

Consider the expansion of:

uint %test3(uint %X) {
        %tmp1 = rem uint %X, 255

Currently it compiles to:

        movl $2155905153, %ecx

This could be "reassociated" into:

        movl $2155905153, %eax

to avoid the copy. In fact, the existing two-address stuff would do this
except that mul isn't a commutative 2-addr instruction. I guess this has
to be done at isel time based on the #uses of the mul?

//===---------------------------------------------------------------------===//

Make sure the instruction which starts a loop does not cross a cacheline
boundary. This requires knowing the exact length of each machine instruction.
That is somewhat complicated, but doable. Example 256.bzip2:

In the new trace, the hot loop has an instruction which crosses a cacheline
boundary. In addition to potential cache misses, this can't help decoding as I
imagine there has to be some kind of complicated decoder reset and realignment
to grab the bytes from the next cacheline.

532  532 0x3cfc movb     (1809(%esp, %esi), %bl   <<<--- spans 2 64 byte lines
942  942 0x3d03 movl     %dh, (1809(%esp, %esi)
937  937 0x3d0a incl     %esi
3    3   0x3d0b cmpb     %bl, %dl
27   27  0x3d0d jnz      0x000062db <main+11707>

//===---------------------------------------------------------------------===//

In C99 mode, the preprocessor doesn't like assembly comments like #TRUNCATE.

//===---------------------------------------------------------------------===//

This could be a single 16-bit load.

  if ((p[0] == 1) & (p[1] == 2)) return 1;

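A sketch of the combined form, assuming p points at bytes and an unaligned
16-bit load is acceptable: on little-endian x86 the two byte compares become
one 16-bit compare against 0x0201.

#include <string.h>

int both_match(const unsigned char *p) {
  unsigned short v;
  memcpy(&v, p, sizeof v);   /* the single 16-bit load */
  return v == 0x0201;
}
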
//===---------------------------------------------------------------------===//

We should inline lrintf and probably other libc functions.

//===---------------------------------------------------------------------===//

Start using the flags more. For example, compile:

int add_zf(int *x, int y, int a, int b) {

int add_zf(int *x, int y, int a, int b) {

//===---------------------------------------------------------------------===//

int foo(double X) { return isnan(X); }

the pxor is not needed; we could compare the value against itself.

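The self-compare form of the same check (a NaN is the only value that is not
equal to itself), which needs no constant and no pxor:

int foo2(double X) { return X != X; }
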
//===---------------------------------------------------------------------===//

These two functions have identical effects:

unsigned int f(unsigned int i, unsigned int n) {++i; if (i == n) ++i; return i;}
unsigned int f2(unsigned int i, unsigned int n) {++i; i += i == n; return i;}

We currently compile them to:

        jne LBB1_2      #UnifiedReturnBlock

LBB1_2: #UnifiedReturnBlock

        leal 1(%ecx,%eax), %eax

both of which are inferior to GCC's:

//===---------------------------------------------------------------------===//

is currently compiled to:

It would be better to produce:

This can be applied to any no-return function call that takes no arguments etc.
Alternatively, the stack save/restore logic could be shrink-wrapped, producing
something like this:

Both are useful in different situations. Finally, it could be shrink-wrapped
and tail called, like this:

        pop %eax   # realign stack.

Though this probably isn't worth it.

//===---------------------------------------------------------------------===//

We need to teach the codegen to convert two-address INC instructions to LEA
when the flags are dead. For example, on X86-64, compile:

int foo(int A, int B) {

//===---------------------------------------------------------------------===//

We use push/pop of stack space around calls in situations where we don't have
to. The call to f below produces:

        subl $16, %esp      <<<<<

        addl $16, %esp      <<<<<

The stack push/pop can be moved into the prolog/epilog. It does this because
it's building the frame pointer, but this should not be sufficient; only the
use of alloca should cause it to do this.
(There are other issues shown by this code, but this is one.)

typedef struct _range_t {

    unsigned char lut[];

    const range_t*const*range;

typedef struct _decode_t decode_t;

extern int f(const decode_t* decode);

int decode_byte (const decode_t* decode) {
  if (decode->swap != 0)