lib/Target/X86/README.txt

   1 //===---------------------------------------------------------------------===//
   2 // Random ideas for the X86 backend.
   3 //===---------------------------------------------------------------------===//
   4
   5 Add MUL2U and MUL2S nodes to represent a multiply that returns both the
   6 Hi and Lo parts (combination of MUL and MULH[SU] into one node).  Add this to
   7 X86, & make the dag combiner produce it when needed.  This will eliminate one
   8 imul from the code generated for:
   9
  10 long long test(long long X, long long Y) { return X*Y; }
  11
  12 by using the EAX result from the mul.  We should add a similar node for
  13 DIVREM.
  14
  15 another case is:
  16
  17 long long test(int X, int Y) { return (long long)X*Y; }
  18
  19 ... which should only be one imul instruction.
  20
  21 This can be done with a custom expander, but it would be nice to move this to
  22 generic code.
  23
  24 //===---------------------------------------------------------------------===//
  25
  26 This should be one DIV/IDIV instruction, not a libcall:
  27
  28 unsigned test(unsigned long long X, unsigned Y) {
  29         return X/Y;
  30 }
  31
  32 This can be done trivially with a custom legalizer.  What about overflow
  33 though?  http://gcc.gnu.org/bugzilla/show_bug.cgi?id=14224
  34
  35 //===---------------------------------------------------------------------===//
  36
  37 Improvements to the multiply -> shift/add algorithm:
  38 http://gcc.gnu.org/ml/gcc-patches/2004-08/msg01590.html
  39
  40 //===---------------------------------------------------------------------===//
  41
  42 Improve code like this (occurs fairly frequently, e.g. in LLVM):
  43 long long foo(int x) { return 1LL << x; }
  44
  45 http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01109.html
  46 http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01128.html
  47 http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01136.html
  48
  49 Another useful one would be  ~0ULL >> X and ~0ULL << X.
  50
  51 One better solution for 1LL << x is:
  52         xorl    %eax, %eax
  53         xorl    %edx, %edx
  54         testb   $32, %cl
  55         sete    %al
  56         setne   %dl
  57         sall    %cl, %eax
  58         sall    %cl, %edx
  59
  60 But that requires good 8-bit subreg support.
  61
  62 64-bit shifts (in general) expand to really bad code.  Instead of using
  63 cmovs, we should expand to a conditional branch like GCC produces.
  64
  65 //===---------------------------------------------------------------------===//
  66
  67 Compile this:
  68 _Bool f(_Bool a) { return a!=1; }
  69
  70 into:
  71         movzbl  %dil, %eax
  72         xorl    $1, %eax
  73         ret
  74
  75 //===---------------------------------------------------------------------===//
  76
  77 Some isel ideas:
  78
  79 1. Dynamic programming based approach when compile time is not an
  80    issue.
  81 2. Code duplication (addressing mode) during isel.
  82 3. Other ideas from "Register-Sensitive Selection, Duplication, and
  83    Sequencing of Instructions".
  84 4. Scheduling for reduced register pressure.  E.g. "Minimum Register
  85    Instruction Sequence Problem: Revisiting Optimal Code Generation for DAGs"
  86    and other related papers.
  87    http://citeseer.ist.psu.edu/govindarajan01minimum.html
  88
  89 //===---------------------------------------------------------------------===//
  90
  91 Should we promote i16 to i32 to avoid partial register update stalls?
  92
  93 //===---------------------------------------------------------------------===//
  94
  95 Leave any_extend as pseudo instruction and hint to register
  96 allocator. Delay codegen until post register allocation.
  97
  98 //===---------------------------------------------------------------------===//
  99
 100 Count leading zeros and count trailing zeros:
 101
 102 int clz(int X) { return __builtin_clz(X); }
 103 int ctz(int X) { return __builtin_ctz(X); }
 104
 105 $ gcc t.c -S -o - -O3  -fomit-frame-pointer -masm=intel
 106 clz:
 107         bsr     %eax, DWORD PTR [%esp+4]
 108         xor     %eax, 31
 109         ret
 110 ctz:
 111         bsf     %eax, DWORD PTR [%esp+4]
 112         ret
 113
 114 however, check that these are defined for 0 and 32.  Our intrinsics are, GCC's
 115 aren't.
 116
 117 //===---------------------------------------------------------------------===//
 118
 119 Use push/pop instructions in prolog/epilog sequences instead of stores off
 120 ESP (certain code size win, perf win on some [which?] processors).
 121 Also, it appears icc uses push for parameter passing. Need to investigate.
 122
 123 //===---------------------------------------------------------------------===//
 124
 125 Only use inc/neg/not instructions on processors where they are faster than
 126 add/sub/xor.  They are slower on the P4 due to only updating some processor
 127 flags.
 128
 129 //===---------------------------------------------------------------------===//
 130
 131 The instruction selector sometimes misses folding a load into a compare.  The
 132 pattern is written as (cmp reg, (load p)).  Because the compare isn't
 133 commutative, it is not matched with the load on both sides.  The dag combiner
 134 should be made smart enough to canonicalize the load into the RHS of a compare
 135 when it can invert the result of the compare for free.
 136
 137 //===---------------------------------------------------------------------===//
 138
 139 How about intrinsics? An example is:
 140   *res = _mm_mulhi_epu16(*A, _mm_mul_epu32(*B, *C));
 141
 142 compiles to
 143         pmuludq (%eax), %xmm0
 144         movl 8(%esp), %eax
 145         movdqa (%eax), %xmm1
 146         pmulhuw %xmm0, %xmm1
 147
 148 The transformation probably requires an X86-specific pass or a DAG combiner
 149 target specific hook.
 150
 151 //===---------------------------------------------------------------------===//
 152
 153 In many cases, LLVM generates code like this:
 154
 155 _test:
 156         movl 8(%esp), %eax
 157         cmpl %eax, 4(%esp)
 158         setl %al
 159         movzbl %al, %eax
 160         ret
 161
 162 on some processors (which ones?), it is more efficient to do this:
 163
 164 _test:
 165         movl 8(%esp), %ebx
 166         xor  %eax, %eax
 167         cmpl %ebx, 4(%esp)
 168         setl %al
 169         ret
 170
 171 Doing this correctly is tricky though, as the xor clobbers the flags.
 172
 173 //===---------------------------------------------------------------------===//
 174
 175 We should generate bts/btr/etc instructions on targets where they are cheap or
 176 when codesize is important.  e.g., for:
 177
 178 void setbit(int *target, int bit) {
 179     *target |= (1 << bit);
 180 }
 181 void clearbit(int *target, int bit) {
 182     *target &= ~(1 << bit);
 183 }
 184
 185 //===---------------------------------------------------------------------===//
 186
 187 Instead of the following for memset char*, 1, 10:
 188
 189         movl $16843009, 4(%edx)
 190         movl $16843009, (%edx)
 191         movw $257, 8(%edx)
 192
 193 It might be better to generate
 194
 195         movl $16843009, %eax
 196         movl %eax, 4(%edx)
 197         movl %eax, (%edx)
 198         movw %ax, 8(%edx)
 199
 200 when we can spare a register. It reduces code size.
 201
 202 //===---------------------------------------------------------------------===//
 203
 204 Evaluate what the best way to codegen sdiv X, (2^C) is.  For X/8, we currently
 205 get this:
 206
 207 int %test1(int %X) {
 208         %Y = div int %X, 8
 209         ret int %Y
 210 }
 211
 212 _test1:
 213         movl 4(%esp), %eax
 214         movl %eax, %ecx
 215         sarl $31, %ecx
 216         shrl $29, %ecx
 217         addl %ecx, %eax
 218         sarl $3, %eax
 219         ret
 220
 221 GCC knows several different ways to codegen it, one of which is this:
 222
 223 _test1:
 224         movl    4(%esp), %eax
 225         cmpl    $-1, %eax
 226         leal    7(%eax), %ecx
 227         cmovle  %ecx, %eax
 228         sarl    $3, %eax
 229         ret
 230
 231 which is probably slower, but it's interesting at least :)
 232
 233 //===---------------------------------------------------------------------===//
 234
 235 The first BB of this code:
 236
 237 declare bool %foo()
 238 int %bar() {
 239         %V = call bool %foo()
 240         br bool %V, label %T, label %F
 241 T:
 242         ret int 1
 243 F:
 244         call bool %foo()
 245         ret int 12
 246 }
 247
 248 compiles to:
 249
 250 _bar:
 251         subl $12, %esp
 252         call L_foo$stub
 253         xorb $1, %al
 254         testb %al, %al
 255         jne LBB_bar_2   # F
 256
 257 It would be better to emit "cmp %al, 1" than a xor and test.
 258
 259 //===---------------------------------------------------------------------===//
 260
 261 Enable X86InstrInfo::convertToThreeAddress().
 262
 263 //===---------------------------------------------------------------------===//
 264
 265 We are currently lowering large (1MB+) memmove/memcpy to rep/stosl and rep/movsl.
 266 We should leave these as libcalls for everything over a much lower threshold,
 267 since libc is hand tuned for medium and large mem ops (avoiding RFO for large
 268 stores, TLB preheating, etc)
 269
 270 //===---------------------------------------------------------------------===//
 271
 272 Optimize this into something reasonable:
 273  x * copysign(1.0, y) * copysign(1.0, z)
 274
 275 //===---------------------------------------------------------------------===//
 276
 277 Optimize copysign(x, *y) to use an integer load from y.
 278
 279 //===---------------------------------------------------------------------===//
 280
 281 %X = weak global int 0
 282
 283 void %foo(int %N) {
 284         %N = cast int %N to uint
 285         %tmp.24 = setgt int %N, 0
 286         br bool %tmp.24, label %no_exit, label %return
 287
 288 no_exit:
 289         %indvar = phi uint [ 0, %entry ], [ %indvar.next, %no_exit ]
 290         %i.0.0 = cast uint %indvar to int
 291         volatile store int %i.0.0, int* %X
 292         %indvar.next = add uint %indvar, 1
 293         %exitcond = seteq uint %indvar.next, %N
 294         br bool %exitcond, label %return, label %no_exit
 295
 296 return:
 297         ret void
 298 }
 299
 300 compiles into:
 301
 302         .text
 303         .align  4
 304         .globl  _foo
 305 _foo:
 306         movl 4(%esp), %eax
 307         cmpl $1, %eax
 308         jl LBB_foo_4    # return
 309 LBB_foo_1:      # no_exit.preheader
 310         xorl %ecx, %ecx
 311 LBB_foo_2:      # no_exit
 312         movl L_X$non_lazy_ptr, %edx
 313         movl %ecx, (%edx)
 314         incl %ecx
 315         cmpl %eax, %ecx
 316         jne LBB_foo_2   # no_exit
 317 LBB_foo_3:      # return.loopexit
 318 LBB_foo_4:      # return
 319         ret
 320
 321 We should hoist "movl L_X$non_lazy_ptr, %edx" out of the loop after
 322 rematerialization is implemented. This can be accomplished with 1) a target
 323 dependent LICM pass or 2) making SelectionDAG represent the whole function.
 324
 325 //===---------------------------------------------------------------------===//
 326
 327 The following tests perform worse with LSR:
 328
 329 lambda, siod, optimizer-eval, ackermann, hash2, nestedloop, strcat, and Treesort.
 330
 331 //===---------------------------------------------------------------------===//
 332
 333 Teach the coalescer to coalesce vregs of different register classes. e.g. FR32 /
 334 FR64 to VR128.
 335
 336 //===---------------------------------------------------------------------===//
 337
 338 mov $reg, 48(%esp)
 339 ...
 340 leal 48(%esp), %eax
 341 mov %eax, (%esp)
 342 call _foo
 343
 344 Obviously it would have been better for the first mov (or any op) to store
 345 directly to %esp[0] if there are no other uses.
 346
 347 //===---------------------------------------------------------------------===//
 348
 349 Adding to the list of cmp / test poor codegen issues:
 350
 351 int test(__m128 *A, __m128 *B) {
 352   if (_mm_comige_ss(*A, *B))
 353     return 3;
 354   else
 355     return 4;
 356 }
 357
 358 _test:
 359         movl 8(%esp), %eax
 360         movaps (%eax), %xmm0
 361         movl 4(%esp), %eax
 362         movaps (%eax), %xmm1
 363         comiss %xmm0, %xmm1
 364         setae %al
 365         movzbl %al, %ecx
 366         movl $3, %eax
 367         movl $4, %edx
 368         cmpl $0, %ecx
 369         cmove %edx, %eax
 370         ret
 371
 372 Note the setae, movzbl, cmpl, cmove can be replaced with a single cmovae. There
 373 are a number of issues. 1) We are introducing a setcc between the result of the
 374 intrinsic call and select. 2) The intrinsic is expected to produce an i32 value
 375 so an any extend (which becomes a zero extend) is added.
 376
 377 We probably need some kind of target DAG combine hook to fix this.
 378
 379 //===---------------------------------------------------------------------===//
 380
 381 We generate significantly worse code for this than GCC:
 382 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=21150
 383 http://gcc.gnu.org/bugzilla/attachment.cgi?id=8701
 384
 385 There is also one case we do worse on PPC.
 386
 387 //===---------------------------------------------------------------------===//
 388
 389 If shorter, we should use things like:
 390 movzwl %ax, %eax
 391 instead of:
 392 andl $65535, %EAX
 393
 394 The former can also be used when the two-addressy nature of the 'and' would
 395 require a copy to be inserted (in X86InstrInfo::convertToThreeAddress).
 396
 397 //===---------------------------------------------------------------------===//
 398
 399 Bad codegen:
 400
 401 char foo(int x) { return x; }
 402
 403 _foo:
 404         movl 4(%esp), %eax
 405         shll $24, %eax
 406         sarl $24, %eax
 407         ret
 408
 409 SIGN_EXTEND_INREG can be implemented as (sext (trunc)) to take advantage of
 410 sub-registers.
 411
 412 //===---------------------------------------------------------------------===//
 413
 414 Consider this:
 415
 416 typedef struct pair { float A, B; } pair;
 417 void pairtest(pair P, float *FP) {
 418         *FP = P.A+P.B;
 419 }
 420
 421 We currently generate this code with llvmgcc4:
 422
 423 _pairtest:
 424         subl $12, %esp
 425         movl 20(%esp), %eax
 426         movl %eax, 4(%esp)
 427         movl 16(%esp), %eax
 428         movl %eax, (%esp)
 429         movss (%esp), %xmm0
 430         addss 4(%esp), %xmm0
 431         movl 24(%esp), %eax
 432         movss %xmm0, (%eax)
 433         addl $12, %esp
 434         ret
 435
 436 we should be able to generate:
 437 _pairtest:
 438         movss 4(%esp), %xmm0
 439         movl 12(%esp), %eax
 440         addss 8(%esp), %xmm0
 441         movss %xmm0, (%eax)
 442         ret
 443
 444 The issue is that llvmgcc4 is forcing the struct to memory, then passing it as
 445 integer chunks.  It does this so that structs like {short,short} are passed in
 446 a single 32-bit integer stack slot.  We should handle the safe cases above much
 447 nicer, while still handling the hard cases.
 448
 449 //===---------------------------------------------------------------------===//
 450
 451 Another instruction selector deficiency:
 452
 453 void %bar() {
 454         %tmp = load int (int)** %foo
 455         %tmp = tail call int %tmp( int 3 )
 456         ret void
 457 }
 458
 459 _bar:
 460         subl $12, %esp
 461         movl L_foo$non_lazy_ptr, %eax
 462         movl (%eax), %eax
 463         call *%eax
 464         addl $12, %esp
 465         ret
 466
 467 The current isel scheme will not allow the load to be folded in the call since
 468 the load's chain result is read by the callseq_start.
 469
 470 //===---------------------------------------------------------------------===//
 471
 472 Don't forget to find a way to squash noop truncates in the JIT environment.
 473
 474 //===---------------------------------------------------------------------===//
 475
 476 Implement anyext in the same manner as truncate that would allow them to be
 477 eliminated.
 478
 479 //===---------------------------------------------------------------------===//
 480
 481 How about implementing truncate / anyext as a property of machine instruction
 482 operand? i.e. Print as 32-bit super-class register / 16-bit sub-class register.
 483 Do this for the cases where a truncate / anyext is guaranteed to be eliminated.
 484 For IA32 that is truncate from 32 to 16 and anyext from 16 to 32.
 485
 486 //===---------------------------------------------------------------------===//
 487
 488 For this:
 489
 490 int test(int a)
 491 {
 492   return a * 3;
 493 }
 494
 495 We currently emit
 496         imull $3, 4(%esp), %eax
 497
 498 Perhaps this is what we really should generate? Is imull three or four
 499 cycles? Note: ICC generates this:
 500         movl    4(%esp), %eax
 501         leal    (%eax,%eax,2), %eax
 502
 503 The current instruction priority is based on pattern complexity. The former is
 504 more "complex" because it folds a load so the latter will not be emitted.
 505
 506 Perhaps we should use AddedComplexity to give LEA32r a higher priority? We
 507 should always try to match LEA first since the LEA matching code does some
 508 estimate to determine whether the match is profitable.
 509
 510 However, if we care more about code size, then imull is better. It's two bytes
 511 shorter than movl + leal.
 512
 513 //===---------------------------------------------------------------------===//
 514
 515 Implement CTTZ, CTLZ with bsf and bsr.
 516
 517 //===---------------------------------------------------------------------===//
 518
 519 It appears gcc places string data with linkonce linkage in
 520 .section __TEXT,__const_coal,coalesced instead of
 521 .section __DATA,__const_coal,coalesced.
 522 Take a look at darwin.h, there are other Darwin assembler directives that we
 523 do not make use of.
 524
 525 //===---------------------------------------------------------------------===//
 526
 527 We should handle __attribute__ ((__visibility__ ("hidden"))).
 528
 529 //===---------------------------------------------------------------------===//
 530
 531 int %foo(int* %a, int %t) {
 532 entry:
 533         br label %cond_true
 534
 535 cond_true:              ; preds = %cond_true, %entry
 536         %x.0.0 = phi int [ 0, %entry ], [ %tmp9, %cond_true ]
 537         %t_addr.0.0 = phi int [ %t, %entry ], [ %tmp7, %cond_true ]
 538         %tmp2 = getelementptr int* %a, int %x.0.0
 539         %tmp3 = load int* %tmp2         ; <int> [#uses=1]
 540         %tmp5 = add int %t_addr.0.0, %x.0.0             ; <int> [#uses=1]
 541         %tmp7 = add int %tmp5, %tmp3            ; <int> [#uses=2]
 542         %tmp9 = add int %x.0.0, 1               ; <int> [#uses=2]
 543         %tmp = setgt int %tmp9, 39              ; <bool> [#uses=1]
 544         br bool %tmp, label %bb12, label %cond_true
 545
 546 bb12:           ; preds = %cond_true
 547         ret int %tmp7
 548 }
 549
 550 is pessimized by -loop-reduce and -indvars
 551
 552 //===---------------------------------------------------------------------===//
 553
 554 u32 to float conversion improvement:
 555
 556 float uint32_2_float( unsigned u ) {
 557   float fl = (int) (u & 0xffff);
 558   float fh = (int) (u >> 16);
 559   fh *= 0x1.0p16f;
 560   return fh + fl;
 561 }
 562
 563 00000000        subl    $0x04,%esp
 564 00000003        movl    0x08(%esp,1),%eax
 565 00000007        movl    %eax,%ecx
 566 00000009        shrl    $0x10,%ecx
 567 0000000c        cvtsi2ss        %ecx,%xmm0
 568 00000010        andl    $0x0000ffff,%eax
 569 00000015        cvtsi2ss        %eax,%xmm1
 570 00000019        mulss   0x00000078,%xmm0
 571 00000021        addss   %xmm1,%xmm0
 572 00000025        movss   %xmm0,(%esp,1)
 573 0000002a        flds    (%esp,1)
 574 0000002d        addl    $0x04,%esp
 575 00000030        ret
 576
 577 //===---------------------------------------------------------------------===//
 578
 579 When using fastcc abi, align stack slot of argument of type double on 8 byte
 580 boundary to improve performance.
 581
 582 //===---------------------------------------------------------------------===//
 583
 584 Codegen:
 585
 586 int f(int a, int b) {
 587   if (a == 4 || a == 6)
 588     b++;
 589   return b;
 590 }
 591
 592
 593 as:
 594
 595 or eax, 2
 596 cmp eax, 6
 597 jz label
 598
 599 //===---------------------------------------------------------------------===//
 600
 601 GCC's ix86_expand_int_movcc function (in i386.c) has a ton of interesting
 602 simplifications for integer "x cmp y ? a : b".  For example, instead of:
 603
 604 int G;
 605 void f(int X, int Y) {
 606   G = X < 0 ? 14 : 13;
 607 }
 608
 609 compiling to:
 610
 611 _f:
 612         movl $14, %eax
 613         movl $13, %ecx
 614         movl 4(%esp), %edx
 615         testl %edx, %edx
 616         cmovl %eax, %ecx
 617         movl %ecx, _G
 618         ret
 619
 620 it could be:
 621 _f:
 622         movl    4(%esp), %eax
 623         sarl    $31, %eax
 624         notl    %eax
 625         addl    $14, %eax
 626         movl    %eax, _G
 627         ret
 628
 629 etc.
 630
 631 //===---------------------------------------------------------------------===//
 632
 633 Currently we don't have elimination of redundant stack manipulations. Consider
 634 the code:
 635
 636 int %main() {
 637 entry:
 638         call fastcc void %test1( )
 639         call fastcc void %test2( sbyte* cast (void ()* %test1 to sbyte*) )
 640         ret int 0
 641 }
 642
 643 declare fastcc void %test1()
 644
 645 declare fastcc void %test2(sbyte*)
 646
 647
 648 This currently compiles to:
 649
 650         subl $16, %esp
 651         call _test5
 652         addl $12, %esp
 653         subl $16, %esp
 654         movl $_test5, (%esp)
 655         call _test6
 656         addl $12, %esp
 657
 658 The add\sub pair is really unneeded here.
 659
 660 //===---------------------------------------------------------------------===//
 661
 662 We generate really bad code in some cases due to lowering SETCC/SELECT at
 663 legalize time, which prevents the post-legalize dag combine pass from
 664 understanding the code.  As a silly example, this prevents us from folding
 665 stuff like this:
 666
 667 bool %test(ulong %x) {
 668   %tmp = setlt ulong %x, 4294967296
 669   ret bool %tmp
 670 }
 671
 672 into x.h == 0
 673
 674 //===---------------------------------------------------------------------===//
 675
 676 We currently compile sign_extend_inreg into two shifts:
 677
 678 long foo(long X) {
 679   return (long)(signed char)X;
 680 }
 681
 682 becomes:
 683
 684 _foo:
 685         movl 4(%esp), %eax
 686         shll $24, %eax
 687         sarl $24, %eax
 688         ret
 689
 690 This could be:
 691
 692 _foo:
 693         movsbl  4(%esp),%eax
 694         ret
 695
 696 //===---------------------------------------------------------------------===//
 697
 698 Consider the expansion of:
 699
 700 uint %test3(uint %X) {
 701         %tmp1 = rem uint %X, 255
 702         ret uint %tmp1
 703 }
 704
 705 Currently it compiles to:
 706
 707 ...
 708         movl $2155905153, %ecx
 709         movl 8(%esp), %esi
 710         movl %esi, %eax
 711         mull %ecx
 712 ...
 713
 714 This could be "reassociated" into:
 715
 716         movl $2155905153, %eax
 717         movl 8(%esp), %ecx
 718         mull %ecx
 719
 720 to avoid the copy.  In fact, the existing two-address stuff would do this
 721 except that mul isn't a commutative 2-addr instruction.  I guess this has
 722 to be done at isel time based on the #uses to mul?
 723