//===---------------------------------------------------------------------===//
// Random ideas for the X86 backend.
//===---------------------------------------------------------------------===//

Add MUL2U and MUL2S nodes to represent a multiply that returns both the
Hi and Lo parts (combination of MUL and MULH[SU] into one node). Add this to
X86, and make the dag combiner produce it when needed. This will eliminate one
imul from the code generated for:

long long test(long long X, long long Y) { return X*Y; }

by using the EAX result from the mul. We should add a similar node for
DIV.

Similarly, for:

long long test(int X, int Y) { return (long long)X*Y; }

... which should only be one imul instruction.

This can be done with a custom expander, but it would be nice to move this to
generic code.

//===---------------------------------------------------------------------===//

This should be one DIV/IDIV instruction, not a libcall:

unsigned test(unsigned long long X, unsigned Y) {
        return X/Y;
}

This can be done trivially with a custom legalizer. What about overflow
though? http://gcc.gnu.org/bugzilla/show_bug.cgi?id=14224
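
A small standalone illustration of the overflow concern (the function name
and test value here are invented for illustration):

#include <stdio.h>

/* The C expression computes the quotient in 64 bits and truncates to 32
   bits, but a single x86 DIV with a 64-bit dividend faults (#DE) whenever
   the quotient does not fit in 32 bits. */
unsigned div64_32(unsigned long long X, unsigned Y) { return X / Y; }

int main(void) {
  /* quotient is 2^40: well-defined in C, but one bare DIV would trap */
  printf("%u\n", div64_32(1ULL << 40, 1));
  return 0;
}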

//===---------------------------------------------------------------------===//

Improvements to the multiply -> shift/add algorithm:
http://gcc.gnu.org/ml/gcc-patches/2004-08/msg01590.html
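
For reference, the kind of decomposition the algorithm produces, written out
in C (function names are made up for illustration):

int mul9(int x)          { return x * 9; }        /* candidate multiply */
int mul9_expanded(int x) { return (x << 3) + x; } /* one shift + one add */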

//===---------------------------------------------------------------------===//

Improve code like this (occurs fairly frequently, e.g. in LLVM):

long long foo(int x) { return 1LL << x; }

http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01109.html
http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01128.html
http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01136.html

Another useful one would be ~0ULL >> X and ~0ULL << X.

One better solution for 1LL << x is:
        xorl    %eax, %eax
        xorl    %edx, %edx
        testb   $32, %cl
        sete    %al
        setne   %dl
        sall    %cl, %eax
        sall    %cl, %edx

But that requires good 8-bit subreg support.

//===---------------------------------------------------------------------===//

_Bool f(_Bool a) { return a!=1; }

//===---------------------------------------------------------------------===//

Some ideas for instruction selection:

1. Dynamic programming based approach when compile time is not an
   issue.
2. Code duplication (addressing mode) during isel.
3. Other ideas from "Register-Sensitive Selection, Duplication, and
   Sequencing of Instructions".
4. Scheduling for reduced register pressure. E.g. "Minimum Register
   Instruction Sequence Problem: Revisiting Optimal Code Generation for DAGs"
   and other related papers.
   http://citeseer.ist.psu.edu/govindarajan01minimum.html

//===---------------------------------------------------------------------===//

Should we promote i16 to i32 to avoid partial register update stalls?
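
For reference, the kind of source that produces 16-bit partial-register
writes (the function name is made up):

short add16(short a, short b) {
  /* a 16-bit add here writes only %ax, leaving the upper bits of %eax
     live and creating a partial-register dependence on some processors */
  return a + b;
}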

//===---------------------------------------------------------------------===//

Leave any_extend as a pseudo instruction and hint to the register
allocator. Delay codegen until post-register allocation.

//===---------------------------------------------------------------------===//

Count leading zeros and count trailing zeros:

int clz(int X) { return __builtin_clz(X); }
int ctz(int X) { return __builtin_ctz(X); }

$ gcc t.c -S -o - -O3 -fomit-frame-pointer -masm=intel
clz:
        bsr %eax, DWORD PTR [%esp+4]
        xor %eax, 31
        ret
ctz:
        bsf %eax, DWORD PTR [%esp+4]
        ret

however, check that these are defined for 0 and 32. Our intrinsics are, GCC's
aren't.
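
An illustration of the definedness difference (the wrapper name is invented):
GCC's builtins are undefined at zero because bsr/bsf leave the destination
undefined there, so a fully-defined version needs a guard:

int clz_defined(unsigned X) { return X ? __builtin_clz(X) : 32; }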

//===---------------------------------------------------------------------===//

Use push/pop instructions in prolog/epilog sequences instead of stores off
ESP (certain code size win, perf win on some [which?] processors).
Also, it appears icc uses push for parameter passing. Need to investigate.

//===---------------------------------------------------------------------===//

Only use inc/neg/not instructions on processors where they are faster than
add/sub/xor. They are slower on the P4 due to only updating some processor
flags.

//===---------------------------------------------------------------------===//

The instruction selector sometimes misses folding a load into a compare. The
pattern is written as (cmp reg, (load p)). Because the compare isn't
commutative, it is not matched with the load on both sides. The dag combiner
should be made smart enough to canonicalize the load into the RHS of a compare
when it can invert the result of the compare for free.
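
A source-level example of the miss (the function name is made up): here the
load naturally lands on the LHS of the compare, so moving it to the RHS means
the predicate must flip from 'lt' to 'gt':

int cmp_load(int x, int *p) { return *p < x; }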

//===---------------------------------------------------------------------===//

How about intrinsics? An example is:
  *res = _mm_mulhi_epu16(*A, _mm_mul_epu32(*B, *C));

compiles to (in part):
        pmuludq (%eax), %xmm0

The transformation probably requires an X86-specific pass or a DAG combiner
target-specific hook.

//===---------------------------------------------------------------------===//

In many cases, LLVM generates code like this:

        cmpl %eax, 4(%esp)
        setl %al
        movzbl %al, %eax

on some processors (which ones?), it is more efficient to do this:

        xorl %eax, %eax
        cmpl %ecx, 4(%esp)
        setl %al

Doing this correctly is tricky though, as the xor clobbers the flags, so it
has to be scheduled before the compare.

//===---------------------------------------------------------------------===//

We should generate bts/btr/etc instructions on targets where they are cheap or
when codesize is important. e.g., for:

void setbit(int *target, int bit) {
  *target |= (1 << bit);
}
void clearbit(int *target, int bit) {
  *target &= ~(1 << bit);
}

//===---------------------------------------------------------------------===//

Instead of the following for memset char*, 1, 10:

        movl $16843009, 4(%edx)
        movl $16843009, (%edx)
        movw $257, 8(%edx)

It might be better to generate

        movl $16843009, %eax
        movl %eax, 4(%edx)
        movl %eax, (%edx)
        movw %ax, 8(%edx)

when we can spare a register. It reduces code size.

//===---------------------------------------------------------------------===//

Evaluate what the best way to codegen sdiv X, (2^C) is. For X/8, we currently
generate the usual shift-with-sign-fixup sequence. GCC knows several different
ways to codegen it; one of them is probably slower, but it's interesting at
least :)
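
For reference, a C sketch of the standard fixup (assuming arithmetic right
shift on signed int, which the generated code relies on anyway): a negative
dividend needs 7 added before the shift so the result rounds toward zero:

int sdiv8(int x) {
  return (x + ((x >> 31) & 7)) >> 3;  /* == x/8 for all 32-bit x */
}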

//===---------------------------------------------------------------------===//

Should generate min/max for stuff like:

void minf(float a, float b, float *X) {
  *X = a <= b ? a : b;
}

Make use of floating point min / max instructions. Perhaps introduce ISD::FMIN
and ISD::FMAX node types?

//===---------------------------------------------------------------------===//

The first BB of this code:

        %V = call bool %foo()
        br bool %V, label %T, label %F

currently branches on the call result with a xor and a test. It would be
better to emit "cmp %al, 1" than a xor and test.

//===---------------------------------------------------------------------===//

Enable X86InstrInfo::convertToThreeAddress().

//===---------------------------------------------------------------------===//

We are currently lowering large (1MB+) memmove/memcpy to rep/stosl and
rep/movsl. We should leave these as libcalls for everything over a much lower
threshold, since libc is hand tuned for medium and large mem ops (avoiding RFO
for large stores, TLB preheating, etc.).
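
A minimal sketch of the proposed policy, assuming a made-up cutoff (the real
threshold would have to be measured per processor):

enum MemOpLowering { INLINE_REP_MOVS, LIBCALL };

enum MemOpLowering chooseMemcpyLowering(unsigned long size) {
  if (size <= 128)            /* hypothetical cutoff, not a measured one */
    return INLINE_REP_MOVS;   /* small copies: inlining avoids call overhead */
  return LIBCALL;             /* libc is hand tuned for medium/large ops */
}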

//===---------------------------------------------------------------------===//

Optimize this into something reasonable:
  x * copysign(1.0, y) * copysign(1.0, z)
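
"Something reasonable" here could be a pair of integer sign-bit xors, since
multiplying by copysign(1.0, y) only flips the sign of x when y is negative.
A C sketch of the target code for the double case (the helper name is
invented; apart from signaling-NaN quieting this matches the multiplies):

#include <stdint.h>
#include <string.h>

double xmul(double x, double y, double z) {
  uint64_t xb, yb, zb;
  memcpy(&xb, &x, 8);
  memcpy(&yb, &y, 8);
  memcpy(&zb, &z, 8);
  xb ^= (yb ^ zb) & 0x8000000000000000ULL;  /* xor in the two sign bits */
  memcpy(&x, &xb, 8);
  return x;
}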

//===---------------------------------------------------------------------===//

Optimize copysign(x, *y) to use an integer load from y.

//===---------------------------------------------------------------------===//

%X = weak global int 0

void %foo(int %N) {
entry:
        %N = cast int %N to uint
        %tmp.24 = setgt int %N, 0
        br bool %tmp.24, label %no_exit, label %return

no_exit:
        %indvar = phi uint [ 0, %entry ], [ %indvar.next, %no_exit ]
        %i.0.0 = cast uint %indvar to int
        volatile store int %i.0.0, int* %X
        %indvar.next = add uint %indvar, 1
        %exitcond = seteq uint %indvar.next, %N
        br bool %exitcond, label %return, label %no_exit

return:
        ret void
}

compiles into a loop like this:

        jl LBB_foo_4    # return
LBB_foo_1:      # no_exit.preheader
LBB_foo_2:      # no_exit
        movl L_X$non_lazy_ptr, %edx
        ...
        jne LBB_foo_2   # no_exit
LBB_foo_3:      # return.loopexit

We should hoist "movl L_X$non_lazy_ptr, %edx" out of the loop after
rematerialization is implemented. This can be accomplished with 1) a target
dependent LICM pass or 2) making SelectionDAG represent the whole function.

//===---------------------------------------------------------------------===//

The following tests perform worse with LSR:

lambda, siod, optimizer-eval, ackermann, hash2, nestedloop, strcat, and
Treesort.

//===---------------------------------------------------------------------===//

Teach the coalescer to coalesce vregs of different register classes, e.g.
FR32 / FR64 to VR128.

//===---------------------------------------------------------------------===//

Obviously it would have been better for the first mov (or any op) to store
directly to %esp[0] if there are no other uses.

//===---------------------------------------------------------------------===//

Adding to the list of cmp / test poor codegen issues:

int test(__m128 *A, __m128 *B) {
  if (_mm_comige_ss(*A, *B))
    return 3;
  else
    return 4;
}

Note the setae, movzbl, cmpl, cmove can be replaced with a single cmovae.
There are a number of issues. 1) We are introducing a setcc between the result
of the intrinsic call and select. 2) The intrinsic is expected to produce an
i32 value so an any_extend (which becomes a zero extend) is added.

We probably need some kind of target DAG combine hook to fix this.

//===---------------------------------------------------------------------===//

We generate significantly worse code for this than GCC:
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=21150
http://gcc.gnu.org/bugzilla/attachment.cgi?id=8701

There is also one case we do worse on PPC.

//===---------------------------------------------------------------------===//

If shorter, we should use things like:

        movzwl %ax, %eax

instead of:

        andl $65535, %eax

The former can also be used when the two-addressy nature of the 'and' would
require a copy to be inserted (in X86InstrInfo::convertToThreeAddress).

//===---------------------------------------------------------------------===//

char foo(int x) { return x; }

SIGN_EXTEND_INREG can be implemented as (sext (trunc)) to take advantage of
sub-registers.
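
In C terms, the equivalence being proposed (cast chain spelled out; 'signed
char' used to avoid plain-char signedness issues):

int sext_inreg_i8(int x) {
  /* sign_extend_inreg(x, i8): (x << 24) >> 24 == sext(trunc x to i8) */
  return (int)(signed char)x;
}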

//===---------------------------------------------------------------------===//

typedef struct pair { float A, B; } pair;
void pairtest(pair P, float *FP) {
        *FP = P.A+P.B;
}

We currently generate poor code for this with llvmgcc4; we should be able to
generate much better code. The issue is that llvmgcc4 is forcing the struct to
memory, then passing it as integer chunks. It does this so that structs like
{short,short} are passed in a single 32-bit integer stack slot. We should
handle the safe cases above much nicer, while still handling the hard cases.

//===---------------------------------------------------------------------===//

Another instruction selector deficiency:

void %bar() {
        %tmp = load int (int)** %foo
        %tmp = tail call int %tmp( int 3 )
        ret void
}

_bar:
        ...
        movl L_foo$non_lazy_ptr, %eax
        ...

The current isel scheme will not allow the load to be folded into the call
since the load's chain result is read by the callseq_start.

//===---------------------------------------------------------------------===//

Don't forget to find a way to squash noop truncates in the JIT environment.

//===---------------------------------------------------------------------===//

Implement anyext in the same manner as truncate. That would allow them to be
eliminated.

//===---------------------------------------------------------------------===//

How about implementing truncate / anyext as a property of a machine
instruction operand? i.e. Print as 32-bit super-class register / 16-bit
sub-class register. Do this for the cases where a truncate / anyext is
guaranteed to be eliminated. For IA32 that is truncate from 32 to 16 and
anyext from 16 to 32.

//===---------------------------------------------------------------------===//

For this:

int test(int a) { return a * 3; }

we currently generate:

        imull $3, 4(%esp), %eax

Perhaps this is what we really should generate. Is imull three or four
cycles? Note: ICC generates this:

        movl 4(%esp), %eax
        leal (%eax,%eax,2), %eax

The current instruction priority is based on pattern complexity. The former is
more "complex" because it folds a load so the latter will not be emitted.

Perhaps we should use AddedComplexity to give LEA32r a higher priority? We
should always try to match LEA first since the LEA matching code does some
estimate to determine whether the match is profitable.

However, if we care more about code size, then imull is better. It's two bytes
shorter than movl + leal.

//===---------------------------------------------------------------------===//

Implement CTTZ, CTLZ with bsf and bsr.

//===---------------------------------------------------------------------===//

It appears gcc places string data with linkonce linkage in
.section __TEXT,__const_coal,coalesced instead of
.section __DATA,__const_coal,coalesced.
Take a look at darwin.h; there are other Darwin assembler directives that we
do not make use of.

//===---------------------------------------------------------------------===//

We should handle __attribute__ ((__visibility__ ("hidden"))).

//===---------------------------------------------------------------------===//

int %foo(int* %a, int %t) {
entry:
        br label %cond_true

cond_true:              ; preds = %cond_true, %entry
        %x.0.0 = phi int [ 0, %entry ], [ %tmp9, %cond_true ]       ; <int> [#uses=3]
        %t_addr.0.0 = phi int [ %t, %entry ], [ %tmp7, %cond_true ] ; <int> [#uses=1]
        %tmp2 = getelementptr int* %a, int %x.0.0                   ; <int*> [#uses=1]
        %tmp3 = load int* %tmp2                                     ; <int> [#uses=1]
        %tmp5 = add int %t_addr.0.0, %x.0.0                         ; <int> [#uses=1]
        %tmp7 = add int %tmp5, %tmp3                                ; <int> [#uses=2]
        %tmp9 = add int %x.0.0, 1                                   ; <int> [#uses=2]
        %tmp = setgt int %tmp9, 39                                  ; <bool> [#uses=1]
        br bool %tmp, label %bb12, label %cond_true

bb12:           ; preds = %cond_true
        ret int %tmp7
}

is pessimized by -loop-reduce and -indvars

//===---------------------------------------------------------------------===//

Use cpuid to auto-detect CPU features such as SSE, SSE2, and SSE3.

//===---------------------------------------------------------------------===//

u32 to float conversion improvement:

float uint32_2_float( unsigned u ) {
  float fl = (int) (u & 0xffff);
  float fh = (int) (u >> 16);
  return fh*65536.0f + fl;
}

00000000        subl    $0x04,%esp
00000003        movl    0x08(%esp,1),%eax
00000007        movl    %eax,%ecx
00000009        shrl    $0x10,%ecx
0000000c        cvtsi2ss        %ecx,%xmm0
00000010        andl    $0x0000ffff,%eax
00000015        cvtsi2ss        %eax,%xmm1
00000019        mulss   0x00000078,%xmm0
00000021        addss   %xmm1,%xmm0
00000025        movss   %xmm0,(%esp,1)
0000002a        flds    (%esp,1)
0000002d        addl    $0x04,%esp
00000030        ret

//===---------------------------------------------------------------------===//

When using the fastcc ABI, align stack slots of arguments of type double on an
8 byte boundary to improve performance.

//===---------------------------------------------------------------------===//

Compile:

int f(int a, int b) {
  if (a == 4 || a == 6)
    b = b * b;
  return b;
}

If we aren't going to optimize this directly, we should lower the switch
better. We currently compile it to a binary sequence of compares, including:

        jmp LBB1_2      #UnifiedReturnBlock
        ...
        jne LBB1_2      #UnifiedReturnBlock
        ...
LBB1_3:
        ...
LBB1_2:         #UnifiedReturnBlock

In the code above, the 'if' is turned into a 'switch' at the mid-level. It
looks like the 'lower to branches' mode could be improved a little here. In
particular, the fall-through to LBB1_3 doesn't need a branch. It would also be
nice to eliminate the redundant "cmp 6", maybe by lowering to a linear
sequence of compares (instead of a binary sequence) when there are fewer than
a certain number of cases.
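
Incidentally, for this particular pair of case values there is a source-level
trick the lowering could aim for (a sketch only; it relies on 4 and 6
differing in exactly one bit):

int f2(int a, int b) {
  if ((a | 2) == 6)   /* true exactly when a == 4 or a == 6 */
    b = b * b;
  return b;
}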

//===---------------------------------------------------------------------===//

int %test(ulong *%tmp) {
        %tmp = load ulong* %tmp                         ; <ulong> [#uses=1]
        %tmp.mask = shr ulong %tmp, ubyte 50            ; <ulong> [#uses=1]
        %tmp.mask = cast ulong %tmp.mask to ubyte       ; <ubyte> [#uses=1]
        %tmp2 = and ubyte %tmp.mask, 3                  ; <ubyte> [#uses=1]
        %tmp2 = cast ubyte %tmp2 to int                 ; <int> [#uses=1]
        ret int %tmp2
}

currently compiles with a redundant 8-bit truncate in the output:

        # TRUNCATE movb %al, %al

Propagating the zext through the and saves a movzbl, and saves a truncate if
it doesn't get coalesced right. This is a simple DAGCombine.
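
A quick sanity check of the identity behind the combine (zero-extending a
masked truncate equals masking the wide value, whenever the mask fits in the
narrow type):

#include <assert.h>
#include <stdint.h>

int main(void) {
  uint64_t x = 0x123456789abcdef0ULL;
  uint8_t narrow = (uint8_t)(x >> 50) & 3;  /* trunc, and, then zext */
  uint64_t wide = (x >> 50) & 3;            /* and on the wide value */
  assert((uint64_t)narrow == wide);
  return 0;
}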

//===---------------------------------------------------------------------===//

GCC's ix86_expand_int_movcc function (in i386.c) has a ton of interesting
simplifications for integer "x cmp y ? a : b".
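
One classic example of the kind of simplification involved (a sketch, not
necessarily the exact case the original note showed): a select of -1 vs 0 on
a sign test needs no branch or cmov at all, just an arithmetic shift:

int sign_select(int x) {
  return x < 0 ? -1 : 0;   /* equivalent to x >> 31 with arithmetic shift */
}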

//===---------------------------------------------------------------------===//