lib/Target/X86/README.txt

   1 //===---------------------------------------------------------------------===//
   2 // Random ideas for the X86 backend.
   3 //===---------------------------------------------------------------------===//
   4
   5 Add MUL2U and MUL2S nodes to represent a multiply that returns both the
   6 Hi and Lo parts (combination of MUL and MULH[SU] into one node).  Add this to
   7 X86, & make the dag combiner produce it when needed.  This will eliminate one
   8 imul from the code generated for:
   9
  10 long long test(long long X, long long Y) { return X*Y; }
  11
  12 by using the EAX result from the mul.  We should add a similar node for
  13 DIVREM.
  14
  15 Another case is:
  16
  17 long long test(int X, int Y) { return (long long)X*Y; }
  18
  19 ... which should only be one imul instruction.
  20
  21 //===---------------------------------------------------------------------===//
  22
  23 This should be one DIV/IDIV instruction, not a libcall:
  24
  25 unsigned test(unsigned long long X, unsigned Y) {
  26         return X/Y;
  27 }
  28
  29 This can be done trivially with a custom legalizer.  What about overflow
  30 though?  http://gcc.gnu.org/bugzilla/show_bug.cgi?id=14224
  31
  32 //===---------------------------------------------------------------------===//
  33
  34 Improvements to the multiply -> shift/add algorithm:
  35 http://gcc.gnu.org/ml/gcc-patches/2004-08/msg01590.html
  36
  37 //===---------------------------------------------------------------------===//
  38
  39 Improve code like this (occurs fairly frequently, e.g. in LLVM):
  40 long long foo(int x) { return 1LL << x; }
  41
  42 http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01109.html
  43 http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01128.html
  44 http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01136.html
  45
  46 Another useful one would be  ~0ULL >> X and ~0ULL << X.
  47
  48 //===---------------------------------------------------------------------===//
  49
  50 Compile this:
  51 _Bool f(_Bool a) { return a!=1; }
  52
  53 into:
  54         movzbl  %dil, %eax
  55         xorl    $1, %eax
  56         ret
  57
  58 //===---------------------------------------------------------------------===//
  59
  60 Some isel ideas:
  61
  62 1. Dynamic programming based approach when compile time is not an
  63    issue.
  64 2. Code duplication (addressing mode) during isel.
  65 3. Other ideas from "Register-Sensitive Selection, Duplication, and
  66    Sequencing of Instructions".
  67 4. Scheduling for reduced register pressure.  E.g. "Minimum Register
  68    Instruction Sequence Problem: Revisiting Optimal Code Generation for DAGs"
  69    and other related papers.
  70    http://citeseer.ist.psu.edu/govindarajan01minimum.html
  71
  72 //===---------------------------------------------------------------------===//
  73
  74 Should we promote i16 to i32 to avoid partial register update stalls?
  75
  76 //===---------------------------------------------------------------------===//
  77
  78 Leave any_extend as pseudo instruction and hint to register
  79 allocator. Delay codegen until post register allocation.
  80
  81 //===---------------------------------------------------------------------===//
  82
  83 Model X86 EFLAGS as a real register to avoid redundant cmp / test. e.g.
  84
  85         cmpl $1, %eax
  86         setg %al
  87         testb %al, %al  # unnecessary
  88         jne .BB7
  89
  90 //===---------------------------------------------------------------------===//
  91
  92 Count leading zeros and count trailing zeros:
  93
  94 int clz(int X) { return __builtin_clz(X); }
  95 int ctz(int X) { return __builtin_ctz(X); }
  96
  97 $ gcc t.c -S -o - -O3  -fomit-frame-pointer -masm=intel
  98 clz:
  99         bsr     %eax, DWORD PTR [%esp+4]
 100         xor     %eax, 31
 101         ret
 102 ctz:
 103         bsf     %eax, DWORD PTR [%esp+4]
 104         ret
 105
 106 however, check that these are defined for 0 and 32.  Our intrinsics are, GCC's
 107 aren't.
 108
 109 //===---------------------------------------------------------------------===//
 110
 111 Use push/pop instructions in prolog/epilog sequences instead of stores off
 112 ESP (certain code size win, perf win on some [which?] processors).
 113 Also, it appears icc uses push for parameter passing. Need to investigate.
 114
 115 //===---------------------------------------------------------------------===//
 116
 117 Only use inc/neg/not instructions on processors where they are faster than
 118 add/sub/xor.  They are slower on the P4 due to only updating some processor
 119 flags.
 120
 121 //===---------------------------------------------------------------------===//
 122
 123 The instruction selector sometimes misses folding a load into a compare.  The
 124 pattern is written as (cmp reg, (load p)).  Because the compare isn't
 125 commutative, it is not matched with the load on both sides.  The dag combiner
 126 should be made smart enough to canonicalize the load into the RHS of a compare
 127 when it can invert the result of the compare for free.
 128
 129 How about intrinsics? An example is:
 130   *res = _mm_mulhi_epu16(*A, _mm_mul_epu32(*B, *C));
 131
 132 compiles to
 133         pmuludq (%eax), %xmm0
 134         movl 8(%esp), %eax
 135         movdqa (%eax), %xmm1
 136         pmulhuw %xmm0, %xmm1
 137
 138 The transformation probably requires a X86 specific pass or a DAG combiner
 139 target specific hook.
 140
 141 //===---------------------------------------------------------------------===//
 142
 143 When compiled with unsafemath enabled, "main" should enable SSE DAZ mode and
 144 other fast SSE modes.
 145
 146 //===---------------------------------------------------------------------===//
 147
 148 Think about doing i64 math in SSE regs.
 149
 150 //===---------------------------------------------------------------------===//
 151
 152 The DAG Isel doesn't fold the loads into the adds in this testcase.  The
 153 pattern selector does.  This is because the chain value of the load gets
 154 selected first, and the loads aren't checking to see if they are only used by
 155 an add.
 156
 157 .ll:
 158
 159 int %test(int* %x, int* %y, int* %z) {
 160         %X = load int* %x
 161         %Y = load int* %y
 162         %Z = load int* %z
 163         %a = add int %X, %Y
 164         %b = add int %a, %Z
 165         ret int %b
 166 }
 167
 168 dag isel:
 169
 170 _test:
 171         movl 4(%esp), %eax
 172         movl (%eax), %eax
 173         movl 8(%esp), %ecx
 174         movl (%ecx), %ecx
 175         addl %ecx, %eax
 176         movl 12(%esp), %ecx
 177         movl (%ecx), %ecx
 178         addl %ecx, %eax
 179         ret
 180
 181 pattern isel:
 182
 183 _test:
 184         movl 12(%esp), %ecx
 185         movl 4(%esp), %edx
 186         movl 8(%esp), %eax
 187         movl (%eax), %eax
 188         addl (%edx), %eax
 189         addl (%ecx), %eax
 190         ret
 191
 192 This is bad for register pressure, though the dag isel is producing a
 193 better schedule. :)
 194
 195 //===---------------------------------------------------------------------===//
 196
 197 This testcase should have no SSE instructions in it, and only one load from
 198 a constant pool:
 199
 200 double %test3(bool %B) {
 201         %C = select bool %B, double 123.412, double 523.01123123
 202         ret double %C
 203 }
 204
 205 Currently, the select is being lowered, which prevents the dag combiner from
 206 turning 'select (load CPI1), (load CPI2)' -> 'load (select CPI1, CPI2)'
 207
 208 The pattern isel got this one right.
 209
 210 //===---------------------------------------------------------------------===//
 211
 212 SSE doesn't have [mem] op= reg instructions.  If we have an SSE instruction
 213 like this:
 214
 215   X += y
 216
 217 and the register allocator decides to spill X, it is cheaper to emit this as:
 218
 219 Y += [xslot]
 220 store Y -> [xslot]
 221
 222 than as:
 223
 224 tmp = [xslot]
 225 tmp += y
 226 store tmp -> [xslot]
 227
 228 ..and this uses one fewer register (so this should be done at load folding
 229 time, not at spiller time).  *Note* however that this can only be done
 230 if Y is dead.  Here's a testcase:
 231
 232 %.str_3 = external global [15 x sbyte]          ; <[15 x sbyte]*> [#uses=0]
 233 implementation   ; Functions:
 234 declare void %printf(int, ...)
 235 void %main() {
 236 build_tree.exit:
 237         br label %no_exit.i7
 238 no_exit.i7:             ; preds = %no_exit.i7, %build_tree.exit
 239         %tmp.0.1.0.i9 = phi double [ 0.000000e+00, %build_tree.exit ], [ %tmp.34.i18, %no_exit.i7 ]      ; <double> [#uses=1]
 240         %tmp.0.0.0.i10 = phi double [ 0.000000e+00, %build_tree.exit ], [ %tmp.28.i16, %no_exit.i7 ]     ; <double> [#uses=1]
 241         %tmp.28.i16 = add double %tmp.0.0.0.i10, 0.000000e+00
 242         %tmp.34.i18 = add double %tmp.0.1.0.i9, 0.000000e+00
 243         br bool false, label %Compute_Tree.exit23, label %no_exit.i7
 244 Compute_Tree.exit23:            ; preds = %no_exit.i7
 245         tail call void (int, ...)* %printf( int 0 )
 246         store double %tmp.34.i18, double* null
 247         ret void
 248 }
 249
 250 We currently emit:
 251
 252 .BBmain_1:
 253         xorpd %XMM1, %XMM1
 254         addsd %XMM0, %XMM1
 255 ***     movsd %XMM2, QWORD PTR [%ESP + 8]
 256 ***     addsd %XMM2, %XMM1
 257 ***     movsd QWORD PTR [%ESP + 8], %XMM2
 258         jmp .BBmain_1   # no_exit.i7
 259
 260 This is a bugpoint reduced testcase, which is why the testcase doesn't make
 261 much sense (e.g. it's an infinite loop). :)
 262
 263 //===---------------------------------------------------------------------===//
 264
 265 In many cases, LLVM generates code like this:
 266
 267 _test:
 268         movl 8(%esp), %eax
 269         cmpl %eax, 4(%esp)
 270         setl %al
 271         movzbl %al, %eax
 272         ret
 273
 274 on some processors (which ones?), it is more efficient to do this:
 275
 276 _test:
 277         movl 8(%esp), %ebx
 278         xor %eax, %eax
 279         cmpl %ebx, 4(%esp)
 280         setl %al
 281         ret
 282
 283 Doing this correctly is tricky though, as the xor clobbers the flags.
 284
 285 //===---------------------------------------------------------------------===//
 286
 287 We should generate 'test' instead of 'cmp' in various cases, e.g.:
 288
 289 bool %test(int %X) {
 290         %Y = shl int %X, ubyte 1
 291         %C = seteq int %Y, 0
 292         ret bool %C
 293 }
 294 bool %test(int %X) {
 295         %Y = and int %X, 8
 296         %C = seteq int %Y, 0
 297         ret bool %C
 298 }
 299
 300 This may just be a matter of using 'test' to write bigger patterns for X86cmp.
 301
 302 An important case is comparison against zero:
 303
 304 if (X == 0) ...
 305
 306 instead of:
 307
 308         cmpl $0, %eax
 309         je LBB4_2       #cond_next
 310
 311 use:
 312         test %eax, %eax
 313         jz LBB4_2
 314
 315 which is smaller.
 316
 317 //===---------------------------------------------------------------------===//
 318
 319 SSE should implement 'select_cc' using 'emulated conditional moves' that use
 320 pcmp/pand/pandn/por to do a selection instead of a conditional branch:
 321
 322 double %X(double %Y, double %Z, double %A, double %B) {
 323         %C = setlt double %A, %B
 324         %z = add double %Z, 0.0    ;; select operand is not a load
 325         %D = select bool %C, double %Y, double %z
 326         ret double %D
 327 }
 328
 329 We currently emit:
 330
 331 _X:
 332         subl $12, %esp
 333         xorpd %xmm0, %xmm0
 334         addsd 24(%esp), %xmm0
 335         movsd 32(%esp), %xmm1
 336         movsd 16(%esp), %xmm2
 337         ucomisd 40(%esp), %xmm1
 338         jb LBB_X_2
 339 LBB_X_1:
 340         movsd %xmm0, %xmm2
 341 LBB_X_2:
 342         movsd %xmm2, (%esp)
 343         fldl (%esp)
 344         addl $12, %esp
 345         ret
 346
 347 //===---------------------------------------------------------------------===//
 348
 349 We should generate bts/btr/etc instructions on targets where they are cheap or
 350 when codesize is important.  e.g., for:
 351
 352 void setbit(int *target, int bit) {
 353     *target |= (1 << bit);
 354 }
 355 void clearbit(int *target, int bit) {
 356     *target &= ~(1 << bit);
 357 }
 358
 359 //===---------------------------------------------------------------------===//
 360
 361 Instead of the following for memset char*, 1, 10:
 362
 363         movl $16843009, 4(%edx)
 364         movl $16843009, (%edx)
 365         movw $257, 8(%edx)
 366
 367 It might be better to generate
 368
 369         movl $16843009, %eax
 370         movl %eax, 4(%edx)
 371         movl %eax, (%edx)
 372         movw %ax, 8(%edx)
 373
 374 when we can spare a register. It reduces code size.
 375
 376 //===---------------------------------------------------------------------===//
 377
 378 It's not clear whether we should use pxor or xorps / xorpd to clear XMM
 379 registers. The choice may depend on subtarget information. We should do some
 380 more experiments on different x86 machines.
 381
 382 //===---------------------------------------------------------------------===//
 383
 384 Evaluate what the best way to codegen sdiv X, (2^C) is.  For X/8, we currently
 385 get this:
 386
 387 int %test1(int %X) {
 388         %Y = div int %X, 8
 389         ret int %Y
 390 }
 391
 392 _test1:
 393         movl 4(%esp), %eax
 394         movl %eax, %ecx
 395         sarl $31, %ecx
 396         shrl $29, %ecx
 397         addl %ecx, %eax
 398         sarl $3, %eax
 399         ret
 400
 401 GCC knows several different ways to codegen it, one of which is this:
 402
 403 _test1:
 404         movl    4(%esp), %eax
 405         cmpl    $-1, %eax
 406         leal    7(%eax), %ecx
 407         cmovle  %ecx, %eax
 408         sarl    $3, %eax
 409         ret
 410
 411 which is probably slower, but it's interesting at least :)
 412
 413 //===---------------------------------------------------------------------===//
 414
 415 Currently the x86 codegen isn't very good at mixing SSE and FPStack
 416 code:
 417
 418 unsigned int foo(double x) { return x; }
 419
 420 foo:
 421         subl $20, %esp
 422         movsd 24(%esp), %xmm0
 423         movsd %xmm0, 8(%esp)
 424         fldl 8(%esp)
 425         fisttpll (%esp)
 426         movl (%esp), %eax
 427         addl $20, %esp
 428         ret
 429
 430 This will be solved when we go to a dynamic programming based isel.
 431
 432 //===---------------------------------------------------------------------===//
 433
 434 Should generate min/max for stuff like:
 435
 436 void minf(float a, float b, float *X) {
 437   *X = a <= b ? a : b;
 438 }
 439
 440 Make use of floating point min / max instructions. Perhaps introduce ISD::FMIN
 441 and ISD::FMAX node types?
 442
 443 //===---------------------------------------------------------------------===//
 444
 445 The first BB of this code:
 446
 447 declare bool %foo()
 448 int %bar() {
 449         %V = call bool %foo()
 450         br bool %V, label %T, label %F
 451 T:
 452         ret int 1
 453 F:
 454         call bool %foo()
 455         ret int 12
 456 }
 457
 458 compiles to:
 459
 460 _bar:
 461         subl $12, %esp
 462         call L_foo$stub
 463         xorb $1, %al
 464         testb %al, %al
 465         jne LBB_bar_2   # F
 466
 467 It would be better to emit "cmp %al, 1" than a xor and test.
 468
 469 //===---------------------------------------------------------------------===//
 470
 471 Enable X86InstrInfo::convertToThreeAddress().
 472
 473 //===---------------------------------------------------------------------===//
 474
 475 Investigate whether it is better to codegen the following
 476
 477         %tmp.1 = mul int %x, 9
 478 to
 479
 480         movl    4(%esp), %eax
 481         leal    (%eax,%eax,8), %eax
 482
 483 as opposed to what llc is currently generating:
 484
 485         imull $9, 4(%esp), %eax
 486
 487 Currently the load folding imull has a higher complexity than the LEA32 pattern.
 488
 489 //===---------------------------------------------------------------------===//
 490
 491 We are currently lowering large (1MB+) memmove/memcpy to rep/stosl and rep/movsl.
 492 We should leave these as libcalls for everything over a much lower threshold,
 493 since libc is hand tuned for medium and large mem ops (avoiding RFO for large
 494 stores, TLB preheating, etc).
 495
 496 //===---------------------------------------------------------------------===//
 497
 498 Lower memcpy / memset to a series of SSE 128 bit move instructions when it's
 499 feasible.
 500
 501 //===---------------------------------------------------------------------===//
 502
 503 Teach the coalescer to commute 2-addr instructions, allowing us to eliminate
 504 the reg-reg copy in this example:
 505
 506 float foo(int *x, float *y, unsigned c) {
 507   float res = 0.0;
 508   unsigned i;
 509   for (i = 0; i < c; i++) {
 510     float xx = (float)x[i];
 511     xx = xx * y[i];
 512     xx += res;
 513     res = xx;
 514   }
 515   return res;
 516 }
 517
 518 LBB_foo_3:      # no_exit
 519         cvtsi2ss %XMM0, DWORD PTR [%EDX + 4*%ESI]
 520         mulss %XMM0, DWORD PTR [%EAX + 4*%ESI]
 521         addss %XMM0, %XMM1
 522         inc %ESI
 523         cmp %ESI, %ECX
 524 ****    movaps %XMM1, %XMM0
 525         jb LBB_foo_3    # no_exit
 526
 527 //===---------------------------------------------------------------------===//
 528
 529 Codegen:
 530   if (copysign(1.0, x) == copysign(1.0, y))
 531 into:
 532   if (x^y & mask)
 533 when using SSE.
 534
 535 //===---------------------------------------------------------------------===//
 536
 537 Optimize this into something reasonable:
 538  x * copysign(1.0, y) * copysign(1.0, z)
 539
 540 //===---------------------------------------------------------------------===//
 541
 542 Optimize copysign(x, *y) to use an integer load from y.
 543
 544 //===---------------------------------------------------------------------===//
 545
 546 %X = weak global int 0
 547
 548 void %foo(int %N) {
 549         %N = cast int %N to uint
 550         %tmp.24 = setgt int %N, 0
 551         br bool %tmp.24, label %no_exit, label %return
 552
 553 no_exit:
 554         %indvar = phi uint [ 0, %entry ], [ %indvar.next, %no_exit ]
 555         %i.0.0 = cast uint %indvar to int
 556         volatile store int %i.0.0, int* %X
 557         %indvar.next = add uint %indvar, 1
 558         %exitcond = seteq uint %indvar.next, %N
 559         br bool %exitcond, label %return, label %no_exit
 560
 561 return:
 562         ret void
 563 }
 564
 565 compiles into:
 566
 567         .text
 568         .align  4
 569         .globl  _foo
 570 _foo:
 571         movl 4(%esp), %eax
 572         cmpl $1, %eax
 573         jl LBB_foo_4    # return
 574 LBB_foo_1:      # no_exit.preheader
 575         xorl %ecx, %ecx
 576 LBB_foo_2:      # no_exit
 577         movl L_X$non_lazy_ptr, %edx
 578         movl %ecx, (%edx)
 579         incl %ecx
 580         cmpl %eax, %ecx
 581         jne LBB_foo_2   # no_exit
 582 LBB_foo_3:      # return.loopexit
 583 LBB_foo_4:      # return
 584         ret
 585
 586 We should hoist "movl L_X$non_lazy_ptr, %edx" out of the loop after
 587 rematerialization is implemented. This can be accomplished with 1) a target
 588 dependent LICM pass or 2) making SelectionDAG represent the whole function.
 589
 590 //===---------------------------------------------------------------------===//
 591
 592 The following tests perform worse with LSR:
 593
 594 lambda, siod, optimizer-eval, ackermann, hash2, nestedloop, strcat, and Treesort.
 595
 596 //===---------------------------------------------------------------------===//
 597
 598 Teach the coalescer to coalesce vregs of different register classes. e.g. FR32 /
 599 FR64 to VR128.
 600
 601 //===---------------------------------------------------------------------===//
 602
 603 mov $reg, 48(%esp)
 604 ...
 605 leal 48(%esp), %eax
 606 mov %eax, (%esp)
 607 call _foo
 608
 609 Obviously it would have been better for the first mov (or any op) to store
 610 directly to %esp[0] if there are no other uses.
 611
 612 //===---------------------------------------------------------------------===//
 613
 614 Use movhps to update upper 64-bits of a v4sf value. Also movlps on lower half
 615 of a v4sf value.
 616
 617 //===---------------------------------------------------------------------===//
 618
 619 Better codegen for vector_shuffles like this { x, 0, 0, 0 } or { x, 0, x, 0}.
 620 Perhaps use pxor / xorp* to clear a XMM register first?
 621
 622 //===---------------------------------------------------------------------===//
 623
 624 Better codegen for:
 625
 626 void f(float a, float b, vector float * out) { *out = (vector float){ a, 0.0, 0.0, b}; }
 627 void f(float a, float b, vector float * out) { *out = (vector float){ a, b, 0.0, 0}; }
 628
 629 For the latter we generate:
 630
 631 _f:
 632         pxor %xmm0, %xmm0
 633         movss 8(%esp), %xmm1
 634         movaps %xmm0, %xmm2
 635         unpcklps %xmm1, %xmm2
 636         movss 4(%esp), %xmm1
 637         unpcklps %xmm0, %xmm1
 638         unpcklps %xmm2, %xmm1
 639         movl 12(%esp), %eax
 640         movaps %xmm1, (%eax)
 641         ret
 642
 643 This seems like it should use shufps, one for each of a & b.
 644
 645 //===---------------------------------------------------------------------===//
 646
 647 Adding to the list of cmp / test poor codegen issues:
 648
 649 int test(__m128 *A, __m128 *B) {
 650   if (_mm_comige_ss(*A, *B))
 651     return 3;
 652   else
 653     return 4;
 654 }
 655
 656 _test:
 657         movl 8(%esp), %eax
 658         movaps (%eax), %xmm0
 659         movl 4(%esp), %eax
 660         movaps (%eax), %xmm1
 661         comiss %xmm0, %xmm1
 662         setae %al
 663         movzbl %al, %ecx
 664         movl $3, %eax
 665         movl $4, %edx
 666         cmpl $0, %ecx
 667         cmove %edx, %eax
 668         ret
 669
 670 Note the setae, movzbl, cmpl, cmove can be replaced with a single cmovae. There
 671 are a number of issues. 1) We are introducing a setcc between the result of the
 672 intrinsic call and select. 2) The intrinsic is expected to produce an i32 value
 673 so an any extend (which becomes a zero extend) is added.
 674
 675 We probably need some kind of target DAG combine hook to fix this.
 676
 677 //===---------------------------------------------------------------------===//
 678
 679 How to decide when to use the "floating point version" of logical ops? Here are
 680 some code fragments:
 681
 682         movaps LCPI5_5, %xmm2
 683         divps %xmm1, %xmm2
 684         mulps %xmm2, %xmm3
 685         mulps 8656(%ecx), %xmm3
 686         addps 8672(%ecx), %xmm3
 687         andps LCPI5_6, %xmm2
 688         andps LCPI5_1, %xmm3
 689         por %xmm2, %xmm3
 690         movdqa %xmm3, (%edi)
 691
 692         movaps LCPI5_5, %xmm1
 693         divps %xmm0, %xmm1
 694         mulps %xmm1, %xmm3
 695         mulps 8656(%ecx), %xmm3
 696         addps 8672(%ecx), %xmm3
 697         andps LCPI5_6, %xmm1
 698         andps LCPI5_1, %xmm3
 699         orps %xmm1, %xmm3
 700         movaps %xmm3, 112(%esp)
 701         movaps %xmm3, (%ebx)
 702
 703 Due to some minor source change, the latter case ended up using orps and movaps
 704 instead of por and movdqa. Does it matter?
 705
 706 //===---------------------------------------------------------------------===//
 707
 708 Use movddup to splat a v2f64 directly from a memory source. e.g.
 709
 710 #include <emmintrin.h>
 711
 712 void test(__m128d *r, double A) {
 713   *r = _mm_set1_pd(A);
 714 }
 715
 716 llc:
 717
 718 _test:
 719         movsd 8(%esp), %xmm0
 720         unpcklpd %xmm0, %xmm0
 721         movl 4(%esp), %eax
 722         movapd %xmm0, (%eax)
 723         ret
 724
 725 icc:
 726
 727 _test:
 728         movl 4(%esp), %eax
 729         movddup 8(%esp), %xmm0
 730         movapd %xmm0, (%eax)
 731         ret
 732
 733 //===---------------------------------------------------------------------===//
 734
 735 X86RegisterInfo::copyRegToReg() returns X86::MOVAPSrr for VR128. Is it possible
 736 to choose between movaps, movapd, and movdqa based on types of source and
 737 destination?
 738
 739 How about andps, andpd, and pand? Do we really care about the type of the packed
 740 elements? If not, why not always use the "ps" variants which are likely to be
 741 shorter?
 742
 743 //===---------------------------------------------------------------------===//
 744
 745 We are emitting bad code for this:
 746
 747 float %test(float* %V, int %I, int %D, float %V) {
 748 entry:
 749         %tmp = seteq int %D, 0
 750         br bool %tmp, label %cond_true, label %cond_false23
 751
 752 cond_true:
 753         %tmp3 = getelementptr float* %V, int %I
 754         %tmp = load float* %tmp3
 755         %tmp5 = setgt float %tmp, %V
 756         %tmp6 = tail call bool %llvm.isunordered.f32( float %tmp, float %V )
 757         %tmp7 = or bool %tmp5, %tmp6
 758         br bool %tmp7, label %UnifiedReturnBlock, label %cond_next
 759
 760 cond_next:
 761         %tmp10 = add int %I, 1
 762         %tmp12 = getelementptr float* %V, int %tmp10
 763         %tmp13 = load float* %tmp12
 764         %tmp15 = setle float %tmp13, %V
 765         %tmp16 = tail call bool %llvm.isunordered.f32( float %tmp13, float %V )
 766         %tmp17 = or bool %tmp15, %tmp16
 767         %retval = select bool %tmp17, float 0.000000e+00, float 1.000000e+00
 768         ret float %retval
 769
 770 cond_false23:
 771         %tmp28 = tail call float %foo( float* %V, int %I, int %D, float %V )
 772         ret float %tmp28
 773
 774 UnifiedReturnBlock:             ; preds = %cond_true
 775         ret float 0.000000e+00
 776 }
 777
 778 declare bool %llvm.isunordered.f32(float, float)
 779
 780 declare float %foo(float*, int, int, float)
 781
 782
 783 It exposes a known load folding problem:
 784
 785         movss (%edx,%ecx,4), %xmm1
 786         ucomiss %xmm1, %xmm0
 787
 788 As well as this:
 789
 790 LBB_test_2:     # cond_next
 791         movss LCPI1_0, %xmm2
 792         pxor %xmm3, %xmm3
 793         ucomiss %xmm0, %xmm1
 794         jbe LBB_test_6  # cond_next
 795 LBB_test_5:     # cond_next
 796         movaps %xmm2, %xmm3
 797 LBB_test_6:     # cond_next
 798         movss %xmm3, 40(%esp)
 799         flds 40(%esp)
 800         addl $44, %esp
 801         ret
 802
 803 Clearly it's unnecessary to clear %xmm3. It's also not clear why we are emitting
 804 three moves (movss, movaps, movss).
 805
 806 //===---------------------------------------------------------------------===//
 807
 808 External test Nurbs exposed some problems. Look for
 809 __ZN15Nurbs_SSE_Cubic17TessellateSurfaceE, bb cond_next140. This is what icc
 810 emits:
 811
 812         movaps    (%edx), %xmm2                                 #59.21
 813         movaps    (%edx), %xmm5                                 #60.21
 814         movaps    (%edx), %xmm4                                 #61.21
 815         movaps    (%edx), %xmm3                                 #62.21
 816         movl      40(%ecx), %ebp                                #69.49
 817         shufps    $0, %xmm2, %xmm5                              #60.21
 818         movl      100(%esp), %ebx                               #69.20
 819         movl      (%ebx), %edi                                  #69.20
 820         imull     %ebp, %edi                                    #69.49
 821         addl      (%eax), %edi                                  #70.33
 822         shufps    $85, %xmm2, %xmm4                             #61.21
 823         shufps    $170, %xmm2, %xmm3                            #62.21
 824         shufps    $255, %xmm2, %xmm2                            #63.21
 825         lea       (%ebp,%ebp,2), %ebx                           #69.49
 826         negl      %ebx                                          #69.49
 827         lea       -3(%edi,%ebx), %ebx                           #70.33
 828         shll      $4, %ebx                                      #68.37
 829         addl      32(%ecx), %ebx                                #68.37
 830         testb     $15, %bl                                      #91.13
 831         jne       L_B1.24       # Prob 5%                       #91.13
 832
 833 This is the llvm code after instruction scheduling:
 834
 835 cond_next140 (0xa910740, LLVM BB @0xa90beb0):
 836         %reg1078 = MOV32ri -3
 837         %reg1079 = ADD32rm %reg1078, %reg1068, 1, %NOREG, 0
 838         %reg1037 = MOV32rm %reg1024, 1, %NOREG, 40
 839         %reg1080 = IMUL32rr %reg1079, %reg1037
 840         %reg1081 = MOV32rm %reg1058, 1, %NOREG, 0
 841         %reg1038 = LEA32r %reg1081, 1, %reg1080, -3
 842         %reg1036 = MOV32rm %reg1024, 1, %NOREG, 32
 843         %reg1082 = SHL32ri %reg1038, 4
 844         %reg1039 = ADD32rr %reg1036, %reg1082
 845         %reg1083 = MOVAPSrm %reg1059, 1, %NOREG, 0
 846         %reg1034 = SHUFPSrr %reg1083, %reg1083, 170
 847         %reg1032 = SHUFPSrr %reg1083, %reg1083, 0
 848         %reg1035 = SHUFPSrr %reg1083, %reg1083, 255
 849         %reg1033 = SHUFPSrr %reg1083, %reg1083, 85
 850         %reg1040 = MOV32rr %reg1039
 851         %reg1084 = AND32ri8 %reg1039, 15
 852         CMP32ri8 %reg1084, 0
 853         JE mbb<cond_next204,0xa914d30>
 854
 855 Still ok. After register allocation:
 856
 857 cond_next140 (0xa910740, LLVM BB @0xa90beb0):
 858         %EAX = MOV32ri -3
 859         %EDX = MOV32rm <fi#3>, 1, %NOREG, 0
 860         ADD32rm %EAX<def&use>, %EDX, 1, %NOREG, 0
 861         %EDX = MOV32rm <fi#7>, 1, %NOREG, 0
 862         %EDX = MOV32rm %EDX, 1, %NOREG, 40
 863         IMUL32rr %EAX<def&use>, %EDX
 864         %ESI = MOV32rm <fi#5>, 1, %NOREG, 0
 865         %ESI = MOV32rm %ESI, 1, %NOREG, 0
 866         MOV32mr <fi#4>, 1, %NOREG, 0, %ESI
 867         %EAX = LEA32r %ESI, 1, %EAX, -3
 868         %ESI = MOV32rm <fi#7>, 1, %NOREG, 0
 869         %ESI = MOV32rm %ESI, 1, %NOREG, 32
 870         %EDI = MOV32rr %EAX
 871         SHL32ri %EDI<def&use>, 4
 872         ADD32rr %EDI<def&use>, %ESI
 873         %XMM0 = MOVAPSrm %ECX, 1, %NOREG, 0
 874         %XMM1 = MOVAPSrr %XMM0
 875         SHUFPSrr %XMM1<def&use>, %XMM1, 170
 876         %XMM2 = MOVAPSrr %XMM0
 877         SHUFPSrr %XMM2<def&use>, %XMM2, 0
 878         %XMM3 = MOVAPSrr %XMM0
 879         SHUFPSrr %XMM3<def&use>, %XMM3, 255
 880         SHUFPSrr %XMM0<def&use>, %XMM0, 85
 881         %EBX = MOV32rr %EDI
 882         AND32ri8 %EBX<def&use>, 15
 883         CMP32ri8 %EBX, 0
 884         JE mbb<cond_next204,0xa914d30>
 885
 886 This looks really bad. The problem is that shufps is a destructive opcode. Since
 887 the same value appears as operand two in more than one shufps op, it results
 888 in a number of copies. Note icc also suffers from the same problem. Either the
 889 instruction selector should select pshufd or the register allocator should make
 890 the two-address to three-address transformation.
 891
 892 It also exposes some other problems. See MOV32ri -3 and the spills.
 893
 894 //===---------------------------------------------------------------------===//
 895
 896 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=25500
 897
 898 LLVM is producing bad code.
 899
 900 LBB_main_4:     # cond_true44
 901         addps %xmm1, %xmm2
 902         subps %xmm3, %xmm2
 903         movaps (%ecx), %xmm4
 904         movaps %xmm2, %xmm1
 905         addps %xmm4, %xmm1
 906         addl $16, %ecx
 907         incl %edx
 908         cmpl $262144, %edx
 909         movaps %xmm3, %xmm2
 910         movaps %xmm4, %xmm3
 911         jne LBB_main_4  # cond_true44
 912
 913 There are two problems. 1) No need for two loop induction variables. We can
 914 compare against 262144 * 16. 2) Known register coalescer issue. We should
 915 be able to eliminate one of the movaps:
 916
 917         addps %xmm2, %xmm1    <=== Commute!
 918         subps %xmm3, %xmm1
 919         movaps (%ecx), %xmm4
 920         movaps %xmm1, %xmm1   <=== Eliminate!
 921         addps %xmm4, %xmm1
 922         addl $16, %ecx
 923         incl %edx
 924         cmpl $262144, %edx
 925         movaps %xmm3, %xmm2
 926         movaps %xmm4, %xmm3
 927         jne LBB_main_4  # cond_true44
 928
 929 //===---------------------------------------------------------------------===//
 930
 931 Consider:
 932
 933 __m128 test(float a) {
 934   return _mm_set_ps(0.0, 0.0, 0.0, a*a);
 935 }
 936
 937 This compiles into:
 938
 939 movss 4(%esp), %xmm1
 940 mulss %xmm1, %xmm1
 941 xorps %xmm0, %xmm0
 942 movss %xmm1, %xmm0
 943 ret
 944
 945 Because movss from memory zeros the top 3 elements and mulss doesn't modify
 946 them, the top elements of xmm1 are already zero'd.  We could compile this to:
 947
 948 movss 4(%esp), %xmm0
 949 mulss %xmm0, %xmm0
 950 ret
 951
 952 //===---------------------------------------------------------------------===//
 953
 954 Here's a sick and twisted idea.  Consider code like this:
 955
 956 __m128 test(__m128 a) {
 957   float b = *(float*)&a;
 958   ...
 959   return _mm_set_ps(0.0, 0.0, 0.0, b);
 960 }
 961
 962 This might compile to this code:
 963
 964 movaps c(%esp), %xmm1
 965 xorps %xmm0, %xmm0
 966 movss %xmm1, %xmm0
 967 ret
 968
 969 Now consider if the ... code caused xmm1 to get spilled.  This might produce
 970 this code:
 971
 972 movaps c(%esp), %xmm1
 973 movaps %xmm1, c2(%esp)
 974 ...
 975
 976 xorps %xmm0, %xmm0
 977 movaps c2(%esp), %xmm1
 978 movss %xmm1, %xmm0
 979 ret
 980
 981 However, since the reload is only used by these instructions, we could
 982 "fold" it into the uses, producing something like this:
 983
 984 movaps c(%esp), %xmm1
 985 movaps %xmm1, c2(%esp)
 986 ...
 987
 988 movss c2(%esp), %xmm0
 989 ret
 990
 991 ... saving two instructions.
 992
 993 The basic idea is that a reload from a spill slot can, if only one 4-byte
 994 chunk is used, bring in 3 zeros plus the one element instead of 4 elements.
 995 This can be used to simplify a variety of shuffle operations, where the
 996 elements are fixed zeros.
 997
 998 //===---------------------------------------------------------------------===//
 999
1000 We generate significantly worse code for this than GCC:
1001 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=21150
1002 http://gcc.gnu.org/bugzilla/attachment.cgi?id=8701
1003
1004 There is also one case we do worse on PPC.
1005
1006 //===---------------------------------------------------------------------===//
1007
1008 For this:
1009
1010 #include <emmintrin.h>
1011 void test(__m128d *r, __m128d *A, double B) {
1012   *r = _mm_loadl_pd(*A, &B);
1013 }
1014
1015 We generate:
1016
1017         subl $12, %esp
1018         movsd 24(%esp), %xmm0
1019         movsd %xmm0, (%esp)
1020         movl 20(%esp), %eax
1021         movapd (%eax), %xmm0
1022         movlpd (%esp), %xmm0
1023         movl 16(%esp), %eax
1024         movapd %xmm0, (%eax)
1025         addl $12, %esp
1026         ret
1027
1028 icc generates:
1029
1030         movl      4(%esp), %edx                                 #3.6
1031         movl      8(%esp), %eax                                 #3.6
1032         movapd    (%eax), %xmm0                                 #4.22
1033         movlpd    12(%esp), %xmm0                               #4.8
1034         movapd    %xmm0, (%edx)                                 #4.3
1035         ret                                                     #5.1
1036
1037 So icc is smart enough to know that B is already in memory, so it doesn't
1038 load it and store it back to the stack.
1039
1040 //===---------------------------------------------------------------------===//
1041
1042 __m128d test1( __m128d A, __m128d B) {
1043   return _mm_shuffle_pd(A, B, 0x3);
1044 }
1045
1046 compiles to
1047
1048 shufpd $3, %xmm1, %xmm0
1049
1050 Perhaps it's better to use unpckhpd instead?
1051
1052 unpckhpd %xmm1, %xmm0
1053
1054 Don't know if unpckhpd is faster. But it is shorter.
1055
1056 //===---------------------------------------------------------------------===//
1057
1058 If shorter, we should use things like:
1059 movzwl %ax, %eax
1060 instead of:
1061 andl $65535, %eax
1062
1063 The former can also be used when the two-addressy nature of the 'and' would
1064 require a copy to be inserted (in X86InstrInfo::convertToThreeAddress).
1065
1066 //===---------------------------------------------------------------------===//
1067
1068 This code generates ugly code, probably due to costs being off or something:
1069
1070 void %test(float* %P, <4 x float>* %P2 ) {
1071         %xFloat0.688 = load float* %P
1072         %loadVector37.712 = load <4 x float>* %P2
1073         %inFloat3.713 = insertelement <4 x float> %loadVector37.712, float 0.000000e+00, uint 3
1074         store <4 x float> %inFloat3.713, <4 x float>* %P2
1075         ret void
1076 }
1077
1078 Generates:
1079
1080 _test:
1081         pxor %xmm0, %xmm0
1082         movd %xmm0, %eax        ;; EAX = 0!
1083         movl 8(%esp), %ecx
1084         movaps (%ecx), %xmm0
1085         pinsrw $6, %eax, %xmm0
1086         shrl $16, %eax          ;; EAX = 0 again!
1087         pinsrw $7, %eax, %xmm0
1088         movaps %xmm0, (%ecx)
1089         ret
1090
1091 It would be better to generate:
1092
1093 _test:
1094         movl 8(%esp), %ecx
1095         movaps (%ecx), %xmm0
1096         xor %eax, %eax
1097         pinsrw $6, %eax, %xmm0
1098         pinsrw $7, %eax, %xmm0
1099         movaps %xmm0, (%ecx)
1100         ret
1101
1102 or use pxor (to make a zero vector) and shuffle (to insert it).
1103
1104 //===---------------------------------------------------------------------===//
1105
1106 Bad codegen:
1107
1108 char foo(int x) { return x; }
1109
1110 _foo:
1111         movl 4(%esp), %eax
1112         shll $24, %eax
1113         sarl $24, %eax
1114         ret
1115
1116 //===---------------------------------------------------------------------===//
1117
1118 Some useful information in the Apple Altivec / SSE Migration Guide:
1119
1120 http://developer.apple.com/documentation/Performance/Conceptual/
1121 Accelerate_sse_migration/index.html
1122
1123 e.g. SSE select using and, andnot, or. Various SSE compare translations.