lib/Target/X86/README.txt

   1 //===---------------------------------------------------------------------===//
   2 // Random ideas for the X86 backend.
   3 //===---------------------------------------------------------------------===//
   4
   5 Add a MUL2U and MUL2S nodes to represent a multiply that returns both the
   6 Hi and Lo parts (combination of MUL and MULH[SU] into one node).  Add this to
   7 X86, & make the dag combiner produce it when needed.  This will eliminate one
   8 imul from the code generated for:
   9
  10 long long test(long long X, long long Y) { return X*Y; }
  11
  12 by using the EAX result from the mul.  We should add a similar node for
  13 DIVREM.
  14
  15 another case is:
  16
  17 long long test(int X, int Y) { return (long long)X*Y; }
  18
  19 ... which should only be one imul instruction.
  20
  21 //===---------------------------------------------------------------------===//
  22
  23 This should be one DIV/IDIV instruction, not a libcall:
  24
  25 unsigned test(unsigned long long X, unsigned Y) {
  26         return X/Y;
  27 }
  28
  29 This can be done trivially with a custom legalizer.  What about overflow
  30 though?  http://gcc.gnu.org/bugzilla/show_bug.cgi?id=14224
  31
  32 //===---------------------------------------------------------------------===//
  33
  34 Improvements to the multiply -> shift/add algorithm:
  35 http://gcc.gnu.org/ml/gcc-patches/2004-08/msg01590.html
  36
  37 //===---------------------------------------------------------------------===//
  38
  39 Improve code like this (occurs fairly frequently, e.g. in LLVM):
  40 long long foo(int x) { return 1LL << x; }
  41
  42 http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01109.html
  43 http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01128.html
  44 http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01136.html
  45
  46 Another useful one would be  ~0ULL >> X and ~0ULL << X.
  47
  48 //===---------------------------------------------------------------------===//
  49
  50 Compile this:
  51 _Bool f(_Bool a) { return a!=1; }
  52
  53 into:
  54         movzbl  %dil, %eax
  55         xorl    $1, %eax
  56         ret
  57
  58 //===---------------------------------------------------------------------===//
  59
  60 Some isel ideas:
  61
  62 1. Dynamic programming based approach when compile time if not an
  63    issue.
  64 2. Code duplication (addressing mode) during isel.
  65 3. Other ideas from "Register-Sensitive Selection, Duplication, and
  66    Sequencing of Instructions".
  67 4. Scheduling for reduced register pressure.  E.g. "Minimum Register
  68    Instruction Sequence Problem: Revisiting Optimal Code Generation for DAGs"
  69    and other related papers.
  70    http://citeseer.ist.psu.edu/govindarajan01minimum.html
  71
  72 //===---------------------------------------------------------------------===//
  73
  74 Should we promote i16 to i32 to avoid partial register update stalls?
  75
  76 //===---------------------------------------------------------------------===//
  77
  78 Leave any_extend as pseudo instruction and hint to register
  79 allocator. Delay codegen until post register allocation.
  80
  81 //===---------------------------------------------------------------------===//
  82
  83 Count leading zeros and count trailing zeros:
  84
  85 int clz(int X) { return __builtin_clz(X); }
  86 int ctz(int X) { return __builtin_ctz(X); }
  87
  88 $ gcc t.c -S -o - -O3  -fomit-frame-pointer -masm=intel
  89 clz:
  90         bsr     %eax, DWORD PTR [%esp+4]
  91         xor     %eax, 31
  92         ret
  93 ctz:
  94         bsf     %eax, DWORD PTR [%esp+4]
  95         ret
  96
  97 however, check that these are defined for 0 and 32.  Our intrinsics are, GCC's
  98 aren't.
  99
 100 //===---------------------------------------------------------------------===//
 101
 102 Use push/pop instructions in prolog/epilog sequences instead of stores off
 103 ESP (certain code size win, perf win on some [which?] processors).
 104 Also, it appears icc use push for parameter passing. Need to investigate.
 105
 106 //===---------------------------------------------------------------------===//
 107
 108 Only use inc/neg/not instructions on processors where they are faster than
 109 add/sub/xor.  They are slower on the P4 due to only updating some processor
 110 flags.
 111
 112 //===---------------------------------------------------------------------===//
 113
 114 The instruction selector sometimes misses folding a load into a compare.  The
 115 pattern is written as (cmp reg, (load p)).  Because the compare isn't
 116 commutative, it is not matched with the load on both sides.  The dag combiner
 117 should be made smart enough to cannonicalize the load into the RHS of a compare
 118 when it can invert the result of the compare for free.
 119
 120 //===---------------------------------------------------------------------===//
 121
 122 How about intrinsics? An example is:
 123   *res = _mm_mulhi_epu16(*A, _mm_mul_epu32(*B, *C));
 124
 125 compiles to
 126         pmuludq (%eax), %xmm0
 127         movl 8(%esp), %eax
 128         movdqa (%eax), %xmm1
 129         pmulhuw %xmm0, %xmm1
 130
 131 The transformation probably requires a X86 specific pass or a DAG combiner
 132 target specific hook.
 133
 134 //===---------------------------------------------------------------------===//
 135
 136 In many cases, LLVM generates code like this:
 137
 138 _test:
 139         movl 8(%esp), %eax
 140         cmpl %eax, 4(%esp)
 141         setl %al
 142         movzbl %al, %eax
 143         ret
 144
 145 on some processors (which ones?), it is more efficient to do this:
 146
 147 _test:
 148         movl 8(%esp), %ebx
 149         xor  %eax, %eax
 150         cmpl %ebx, 4(%esp)
 151         setl %al
 152         ret
 153
 154 Doing this correctly is tricky though, as the xor clobbers the flags.
 155
 156 //===---------------------------------------------------------------------===//
 157
 158 We should generate bts/btr/etc instructions on targets where they are cheap or
 159 when codesize is important.  e.g., for:
 160
 161 void setbit(int *target, int bit) {
 162     *target |= (1 << bit);
 163 }
 164 void clearbit(int *target, int bit) {
 165     *target &= ~(1 << bit);
 166 }
 167
 168 //===---------------------------------------------------------------------===//
 169
 170 Instead of the following for memset char*, 1, 10:
 171
 172         movl $16843009, 4(%edx)
 173         movl $16843009, (%edx)
 174         movw $257, 8(%edx)
 175
 176 It might be better to generate
 177
 178         movl $16843009, %eax
 179         movl %eax, 4(%edx)
 180         movl %eax, (%edx)
 181         movw al, 8(%edx)
 182
 183 when we can spare a register. It reduces code size.
 184
 185 //===---------------------------------------------------------------------===//
 186
 187 Evaluate what the best way to codegen sdiv X, (2^C) is.  For X/8, we currently
 188 get this:
 189
 190 int %test1(int %X) {
 191         %Y = div int %X, 8
 192         ret int %Y
 193 }
 194
 195 _test1:
 196         movl 4(%esp), %eax
 197         movl %eax, %ecx
 198         sarl $31, %ecx
 199         shrl $29, %ecx
 200         addl %ecx, %eax
 201         sarl $3, %eax
 202         ret
 203
 204 GCC knows several different ways to codegen it, one of which is this:
 205
 206 _test1:
 207         movl    4(%esp), %eax
 208         cmpl    $-1, %eax
 209         leal    7(%eax), %ecx
 210         cmovle  %ecx, %eax
 211         sarl    $3, %eax
 212         ret
 213
 214 which is probably slower, but it's interesting at least :)
 215
 216 //===---------------------------------------------------------------------===//
 217
 218 Should generate min/max for stuff like:
 219
 220 void minf(float a, float b, float *X) {
 221   *X = a <= b ? a : b;
 222 }
 223
 224 Make use of floating point min / max instructions. Perhaps introduce ISD::FMIN
 225 and ISD::FMAX node types?
 226
 227 //===---------------------------------------------------------------------===//
 228
 229 The first BB of this code:
 230
 231 declare bool %foo()
 232 int %bar() {
 233         %V = call bool %foo()
 234         br bool %V, label %T, label %F
 235 T:
 236         ret int 1
 237 F:
 238         call bool %foo()
 239         ret int 12
 240 }
 241
 242 compiles to:
 243
 244 _bar:
 245         subl $12, %esp
 246         call L_foo$stub
 247         xorb $1, %al
 248         testb %al, %al
 249         jne LBB_bar_2   # F
 250
 251 It would be better to emit "cmp %al, 1" than a xor and test.
 252
 253 //===---------------------------------------------------------------------===//
 254
 255 Enable X86InstrInfo::convertToThreeAddress().
 256
 257 //===---------------------------------------------------------------------===//
 258
 259 We are currently lowering large (1MB+) memmove/memcpy to rep/stosl and rep/movsl
 260 We should leave these as libcalls for everything over a much lower threshold,
 261 since libc is hand tuned for medium and large mem ops (avoiding RFO for large
 262 stores, TLB preheating, etc)
 263
 264 //===---------------------------------------------------------------------===//
 265
 266 Optimize this into something reasonable:
 267  x * copysign(1.0, y) * copysign(1.0, z)
 268
 269 //===---------------------------------------------------------------------===//
 270
 271 Optimize copysign(x, *y) to use an integer load from y.
 272
 273 //===---------------------------------------------------------------------===//
 274
 275 %X = weak global int 0
 276
 277 void %foo(int %N) {
 278         %N = cast int %N to uint
 279         %tmp.24 = setgt int %N, 0
 280         br bool %tmp.24, label %no_exit, label %return
 281
 282 no_exit:
 283         %indvar = phi uint [ 0, %entry ], [ %indvar.next, %no_exit ]
 284         %i.0.0 = cast uint %indvar to int
 285         volatile store int %i.0.0, int* %X
 286         %indvar.next = add uint %indvar, 1
 287         %exitcond = seteq uint %indvar.next, %N
 288         br bool %exitcond, label %return, label %no_exit
 289
 290 return:
 291         ret void
 292 }
 293
 294 compiles into:
 295
 296         .text
 297         .align  4
 298         .globl  _foo
 299 _foo:
 300         movl 4(%esp), %eax
 301         cmpl $1, %eax
 302         jl LBB_foo_4    # return
 303 LBB_foo_1:      # no_exit.preheader
 304         xorl %ecx, %ecx
 305 LBB_foo_2:      # no_exit
 306         movl L_X$non_lazy_ptr, %edx
 307         movl %ecx, (%edx)
 308         incl %ecx
 309         cmpl %eax, %ecx
 310         jne LBB_foo_2   # no_exit
 311 LBB_foo_3:      # return.loopexit
 312 LBB_foo_4:      # return
 313         ret
 314
 315 We should hoist "movl L_X$non_lazy_ptr, %edx" out of the loop after
 316 remateralization is implemented. This can be accomplished with 1) a target
 317 dependent LICM pass or 2) makeing SelectDAG represent the whole function.
 318
 319 //===---------------------------------------------------------------------===//
 320
 321 The following tests perform worse with LSR:
 322
 323 lambda, siod, optimizer-eval, ackermann, hash2, nestedloop, strcat, and Treesor.
 324
 325 //===---------------------------------------------------------------------===//
 326
 327 Teach the coalescer to coalesce vregs of different register classes. e.g. FR32 /
 328 FR64 to VR128.
 329
 330 //===---------------------------------------------------------------------===//
 331
 332 mov $reg, 48(%esp)
 333 ...
 334 leal 48(%esp), %eax
 335 mov %eax, (%esp)
 336 call _foo
 337
 338 Obviously it would have been better for the first mov (or any op) to store
 339 directly %esp[0] if there are no other uses.
 340
 341 //===---------------------------------------------------------------------===//
 342
 343 Adding to the list of cmp / test poor codegen issues:
 344
 345 int test(__m128 *A, __m128 *B) {
 346   if (_mm_comige_ss(*A, *B))
 347     return 3;
 348   else
 349     return 4;
 350 }
 351
 352 _test:
 353         movl 8(%esp), %eax
 354         movaps (%eax), %xmm0
 355         movl 4(%esp), %eax
 356         movaps (%eax), %xmm1
 357         comiss %xmm0, %xmm1
 358         setae %al
 359         movzbl %al, %ecx
 360         movl $3, %eax
 361         movl $4, %edx
 362         cmpl $0, %ecx
 363         cmove %edx, %eax
 364         ret
 365
 366 Note the setae, movzbl, cmpl, cmove can be replaced with a single cmovae. There
 367 are a number of issues. 1) We are introducing a setcc between the result of the
 368 intrisic call and select. 2) The intrinsic is expected to produce a i32 value
 369 so a any extend (which becomes a zero extend) is added.
 370
 371 We probably need some kind of target DAG combine hook to fix this.
 372
 373 //===---------------------------------------------------------------------===//
 374
 375 We generate significantly worse code for this than GCC:
 376 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=21150
 377 http://gcc.gnu.org/bugzilla/attachment.cgi?id=8701
 378
 379 There is also one case we do worse on PPC.
 380
 381 //===---------------------------------------------------------------------===//
 382
 383 If shorter, we should use things like:
 384 movzwl %ax, %eax
 385 instead of:
 386 andl $65535, %EAX
 387
 388 The former can also be used when the two-addressy nature of the 'and' would
 389 require a copy to be inserted (in X86InstrInfo::convertToThreeAddress).
 390
 391 //===---------------------------------------------------------------------===//
 392
 393 Bad codegen:
 394
 395 char foo(int x) { return x; }
 396
 397 _foo:
 398         movl 4(%esp), %eax
 399         shll $24, %eax
 400         sarl $24, %eax
 401         ret
 402
 403 SIGN_EXTEND_INREG can be implemented as (sext (trunc)) to take advantage of
 404 sub-registers.
 405
 406 //===---------------------------------------------------------------------===//
 407
 408 Consider this:
 409
 410 typedef struct pair { float A, B; } pair;
 411 void pairtest(pair P, float *FP) {
 412         *FP = P.A+P.B;
 413 }
 414
 415 We currently generate this code with llvmgcc4:
 416
 417 _pairtest:
 418         subl $12, %esp
 419         movl 20(%esp), %eax
 420         movl %eax, 4(%esp)
 421         movl 16(%esp), %eax
 422         movl %eax, (%esp)
 423         movss (%esp), %xmm0
 424         addss 4(%esp), %xmm0
 425         movl 24(%esp), %eax
 426         movss %xmm0, (%eax)
 427         addl $12, %esp
 428         ret
 429
 430 we should be able to generate:
 431 _pairtest:
 432         movss 4(%esp), %xmm0
 433         movl 12(%esp), %eax
 434         addss 8(%esp), %xmm0
 435         movss %xmm0, (%eax)
 436         ret
 437
 438 The issue is that llvmgcc4 is forcing the struct to memory, then passing it as
 439 integer chunks.  It does this so that structs like {short,short} are passed in
 440 a single 32-bit integer stack slot.  We should handle the safe cases above much
 441 nicer, while still handling the hard cases.
 442
 443 //===---------------------------------------------------------------------===//
 444
 445 Another instruction selector deficiency:
 446
 447 void %bar() {
 448         %tmp = load int (int)** %foo
 449         %tmp = tail call int %tmp( int 3 )
 450         ret void
 451 }
 452
 453 _bar:
 454         subl $12, %esp
 455         movl L_foo$non_lazy_ptr, %eax
 456         movl (%eax), %eax
 457         call *%eax
 458         addl $12, %esp
 459         ret
 460
 461 The current isel scheme will not allow the load to be folded in the call since
 462 the load's chain result is read by the callseq_start.
 463
 464 //===---------------------------------------------------------------------===//
 465
 466 Don't forget to find a way to squash noop truncates in the JIT environment.
 467
 468 //===---------------------------------------------------------------------===//
 469
 470 Implement anyext in the same manner as truncate that would allow them to be
 471 eliminated.
 472
 473 //===---------------------------------------------------------------------===//
 474
 475 How about implementing truncate / anyext as a property of machine instruction
 476 operand? i.e. Print as 32-bit super-class register / 16-bit sub-class register.
 477 Do this for the cases where a truncate / anyext is guaranteed to be eliminated.
 478 For IA32 that is truncate from 32 to 16 and anyext from 16 to 32.
 479
 480 //===---------------------------------------------------------------------===//
 481
 482 For this:
 483
 484 int test(int a)
 485 {
 486   return a * 3;
 487 }
 488
 489 We currently emits
 490         imull $3, 4(%esp), %eax
 491
 492 Perhaps this is what we really should generate is? Is imull three or four
 493 cycles? Note: ICC generates this:
 494         movl    4(%esp), %eax
 495         leal    (%eax,%eax,2), %eax
 496
 497 The current instruction priority is based on pattern complexity. The former is
 498 more "complex" because it folds a load so the latter will not be emitted.
 499
 500 Perhaps we should use AddedComplexity to give LEA32r a higher priority? We
 501 should always try to match LEA first since the LEA matching code does some
 502 estimate to determine whether the match is profitable.
 503
 504 However, if we care more about code size, then imull is better. It's two bytes
 505 shorter than movl + leal.
 506
 507 //===---------------------------------------------------------------------===//
 508
 509 Implement CTTZ, CTLZ with bsf and bsr.
 510
 511 //===---------------------------------------------------------------------===//
 512
 513 It appears gcc place string data with linkonce linkage in
 514 .section __TEXT,__const_coal,coalesced instead of
 515 .section __DATA,__const_coal,coalesced.
 516 Take a look at darwin.h, there are other Darwin assembler directives that we
 517 do not make use of.
 518
 519 //===---------------------------------------------------------------------===//
 520
 521 We should handle __attribute__ ((__visibility__ ("hidden"))).
 522
 523 //===---------------------------------------------------------------------===//
 524
 525 int %foo(int* %a, int %t) {
 526 entry:
 527         br label %cond_true
 528
 529 cond_true:              ; preds = %cond_true, %entry
 530         %x.0.0 = phi int [ 0, %entry ], [ %tmp9, %cond_true ]           ; <int> [#uses=3]
 531         %t_addr.0.0 = phi int [ %t, %entry ], [ %tmp7, %cond_true ]             ; <int> [#uses=1]
 532         %tmp2 = getelementptr int* %a, int %x.0.0               ; <int*> [#uses=1]
 533         %tmp3 = load int* %tmp2         ; <int> [#uses=1]
 534         %tmp5 = add int %t_addr.0.0, %x.0.0             ; <int> [#uses=1]
 535         %tmp7 = add int %tmp5, %tmp3            ; <int> [#uses=2]
 536         %tmp9 = add int %x.0.0, 1               ; <int> [#uses=2]
 537         %tmp = setgt int %tmp9, 39              ; <bool> [#uses=1]
 538         br bool %tmp, label %bb12, label %cond_true
 539
 540 bb12:           ; preds = %cond_true
 541         ret int %tmp7
 542 }
 543
 544 is pessimized by -loop-reduce and -indvars
 545
 546 //===---------------------------------------------------------------------===//
 547
 548 Use cpuid to auto-detect CPU features such as SSE, SSE2, and SSE3.
 549
 550 //===---------------------------------------------------------------------===//
 551
 552 u32 to float conversion improvement:
 553
 554 float uint32_2_float( unsigned u ) {
 555   float fl = (int) (u & 0xffff);
 556   float fh = (int) (u >> 16);
 557   fh *= 0x1.0p16f;
 558   return fh + fl;
 559 }
 560
 561 00000000        subl    $0x04,%esp
 562 00000003        movl    0x08(%esp,1),%eax
 563 00000007        movl    %eax,%ecx
 564 00000009        shrl    $0x10,%ecx
 565 0000000c        cvtsi2ss        %ecx,%xmm0
 566 00000010        andl    $0x0000ffff,%eax
 567 00000015        cvtsi2ss        %eax,%xmm1
 568 00000019        mulss   0x00000078,%xmm0
 569 00000021        addss   %xmm1,%xmm0
 570 00000025        movss   %xmm0,(%esp,1)
 571 0000002a        flds    (%esp,1)
 572 0000002d        addl    $0x04,%esp
 573 00000030        ret
 574
 575 //===---------------------------------------------------------------------===//
 576
 577 When using fastcc abi, align stack slot of argument of type double on 8 byte
 578 boundary to improve performance.
 579
 580 //===---------------------------------------------------------------------===//
 581
 582 Codegen:
 583
 584 int f(int a, int b) {
 585   if (a == 4 || a == 6)
 586     b++;
 587   return b;
 588 }
 589
 590
 591 as:
 592
 593 or eax, 2
 594 cmp eax, 6
 595 jz label
 596
 597 //===---------------------------------------------------------------------===//
 598
 599 Compile:
 600 int %test(ulong *%tmp) {
 601         %tmp = load ulong* %tmp         ; <ulong> [#uses=1]
 602         %tmp.mask = shr ulong %tmp, ubyte 50            ; <ulong> [#uses=1]
 603         %tmp.mask = cast ulong %tmp.mask to ubyte               ; <ubyte> [#uses=1]
 604         %tmp2 = and ubyte %tmp.mask, 3          ; <ubyte> [#uses=1]
 605         %tmp2 = cast ubyte %tmp2 to int         ; <int> [#uses=1]
 606         ret int %tmp2
 607 }
 608
 609 to:
 610
 611 _test:
 612         movl 4(%esp), %eax
 613         movl 4(%eax), %eax
 614         shrl $18, %eax
 615         andl $3, %eax
 616         ret
 617
 618 instead of:
 619
 620 _test:
 621         movl 4(%esp), %eax
 622         movl 4(%eax), %eax
 623         shrl $18, %eax
 624         # TRUNCATE movb %al, %al
 625         andb $3, %al
 626         movzbl %al, %eax
 627         ret
 628
 629 This saves a movzbl, and saves a truncate if it doesn't get coallesced right.
 630 This is a simple DAGCombine to propagate the zext through the and.