lib/Target/X86/README-SSE.txt

   1 //===---------------------------------------------------------------------===//
   2 // Random ideas for the X86 backend: SSE-specific stuff.
   3 //===---------------------------------------------------------------------===//
   4
   5 //===---------------------------------------------------------------------===//
   6
   7 There are serious issues folding loads into "scalar sse" intrinsics.  For
   8 example, this:
   9
  10 float minss4( float x, float *y ) {
  11   return _mm_cvtss_f32(_mm_min_ss(_mm_set_ss(x),_mm_set_ss(*y)));
  12 }
  13
  14 compiles to:
  15
  16 _minss4:
  17         subl $4, %esp
  18         movl 12(%esp), %eax
  19 ***     movss 8(%esp), %xmm0
  20 ***     movss (%eax), %xmm1
  21 ***     minss %xmm1, %xmm0
  22         movss %xmm0, (%esp)
  23         flds (%esp)
  24         addl $4, %esp
  25         ret
  26
  27 Each operand of the minss is a load.  At least one should be folded!
  28
  29 //===---------------------------------------------------------------------===//
  30
  31 Expand libm rounding functions inline:  Significant speedups possible.
  32 http://gcc.gnu.org/ml/gcc-patches/2006-10/msg00909.html
  33
  34 //===---------------------------------------------------------------------===//
  35
  36 When compiled with unsafemath enabled, "main" should enable SSE DAZ mode and
  37 other fast SSE modes.
  38
  39 //===---------------------------------------------------------------------===//
  40
  41 Think about doing i64 math in SSE regs.
  42
  43 //===---------------------------------------------------------------------===//
  44
  45 This testcase should have no SSE instructions in it, and only one load from
  46 a constant pool:
  47
  48 double %test3(bool %B) {
  49         %C = select bool %B, double 123.412, double 523.01123123
  50         ret double %C
  51 }
  52
  53 Currently, the select is being lowered, which prevents the dag combiner from
  54 turning 'select (load CPI1), (load CPI2)' -> 'load (select CPI1, CPI2)'
  55
  56 The pattern isel got this one right.
  57
  58 //===---------------------------------------------------------------------===//
  59
  60 SSE doesn't have [mem] op= reg instructions.  If we have an SSE instruction
  61 like this:
  62
  63   X += y
  64
  65 and the register allocator decides to spill X, it is cheaper to emit this as:
  66
  67 Y += [xslot]
  68 store Y -> [xslot]
  69
  70 than as:
  71
  72 tmp = [xslot]
  73 tmp += y
  74 store tmp -> [xslot]
  75
  76 ..and this uses one fewer register (so this should be done at load folding
  77 time, not at spiller time).  *Note* however that this can only be done
  78 if Y is dead.  Here's a testcase:
  79
  80 %.str_3 = external global [15 x sbyte]          ; <[15 x sbyte]*> [#uses=0]
  81 implementation   ; Functions:
  82 declare void %printf(int, ...)
  83 void %main() {
  84 build_tree.exit:
  85         br label %no_exit.i7
  86 no_exit.i7:             ; preds = %no_exit.i7, %build_tree.exit
  87         %tmp.0.1.0.i9 = phi double [ 0.000000e+00, %build_tree.exit ], [ %tmp.34.i18, %no_exit.i7 ]      ; <double> [#uses=1]
  88         %tmp.0.0.0.i10 = phi double [ 0.000000e+00, %build_tree.exit ], [ %tmp.28.i16, %no_exit.i7 ]     ; <double> [#uses=1]
  89         %tmp.28.i16 = add double %tmp.0.0.0.i10, 0.000000e+00
  90         %tmp.34.i18 = add double %tmp.0.1.0.i9, 0.000000e+00
  91         br bool false, label %Compute_Tree.exit23, label %no_exit.i7
  92 Compute_Tree.exit23:            ; preds = %no_exit.i7
  93         tail call void (int, ...)* %printf( int 0 )
  94         store double %tmp.34.i18, double* null
  95         ret void
  96 }
  97
  98 We currently emit:
  99
 100 .BBmain_1:
 101         xorpd %XMM1, %XMM1
 102         addsd %XMM0, %XMM1
 103 ***     movsd %XMM2, QWORD PTR [%ESP + 8]
 104 ***     addsd %XMM2, %XMM1
 105 ***     movsd QWORD PTR [%ESP + 8], %XMM2
 106         jmp .BBmain_1   # no_exit.i7
 107
 108 This is a bugpoint reduced testcase, which is why the testcase doesn't make
 109 much sense (e.g. it's an infinite loop). :)
 110
 111 //===---------------------------------------------------------------------===//
 112
 113 SSE should implement 'select_cc' using 'emulated conditional moves' that use
 114 pcmp/pand/pandn/por to do a selection instead of a conditional branch:
 115
 116 double %X(double %Y, double %Z, double %A, double %B) {
 117         %C = setlt double %A, %B
 118         %z = add double %Z, 0.0    ;; select operand is not a load
 119         %D = select bool %C, double %Y, double %z
 120         ret double %D
 121 }
 122
 123 We currently emit:
 124
 125 _X:
 126         subl $12, %esp
 127         xorpd %xmm0, %xmm0
 128         addsd 24(%esp), %xmm0
 129         movsd 32(%esp), %xmm1
 130         movsd 16(%esp), %xmm2
 131         ucomisd 40(%esp), %xmm1
 132         jb LBB_X_2
 133 LBB_X_1:
 134         movsd %xmm0, %xmm2
 135 LBB_X_2:
 136         movsd %xmm2, (%esp)
 137         fldl (%esp)
 138         addl $12, %esp
 139         ret
 140
 141 //===---------------------------------------------------------------------===//
 142
 143 It's not clear whether we should use pxor or xorps / xorpd to clear XMM
 144 registers. The choice may depend on subtarget information. We should do some
 145 more experiments on different x86 machines.
 146
 147 //===---------------------------------------------------------------------===//
 148
 149 Currently the x86 codegen isn't very good at mixing SSE and FPStack
 150 code:
 151
 152 unsigned int foo(double x) { return x; }
 153
 154 foo:
 155         subl $20, %esp
 156         movsd 24(%esp), %xmm0
 157         movsd %xmm0, 8(%esp)
 158         fldl 8(%esp)
 159         fisttpll (%esp)
 160         movl (%esp), %eax
 161         addl $20, %esp
 162         ret
 163
 164 This will be solved when we go to a dynamic programming based isel.
 165
 166 //===---------------------------------------------------------------------===//
 167
 168 Should generate min/max for stuff like:
 169
 170 void minf(float a, float b, float *X) {
 171   *X = a <= b ? a : b;
 172 }
 173
 174 Make use of floating point min / max instructions. Perhaps introduce ISD::FMIN
 175 and ISD::FMAX node types?
 176
 177 //===---------------------------------------------------------------------===//
 178
 179 Lower memcpy / memset to a series of SSE 128 bit move instructions when it's
 180 feasible.
 181
 182 //===---------------------------------------------------------------------===//
 183
 184 Teach the coalescer to commute 2-addr instructions, allowing us to eliminate
 185 the reg-reg copy in this example:
 186
 187 float foo(int *x, float *y, unsigned c) {
 188   float res = 0.0;
 189   unsigned i;
 190   for (i = 0; i < c; i++) {
 191     float xx = (float)x[i];
 192     xx = xx * y[i];
 193     xx += res;
 194     res = xx;
 195   }
 196   return res;
 197 }
 198
 199 LBB_foo_3:      # no_exit
 200         cvtsi2ss %XMM0, DWORD PTR [%EDX + 4*%ESI]
 201         mulss %XMM0, DWORD PTR [%EAX + 4*%ESI]
 202         addss %XMM0, %XMM1
 203         inc %ESI
 204         cmp %ESI, %ECX
 205 ****    movaps %XMM1, %XMM0
 206         jb LBB_foo_3    # no_exit
 207
 208 //===---------------------------------------------------------------------===//
 209
 210 Codegen:
 211   if (copysign(1.0, x) == copysign(1.0, y))
 212 into:
 213   if (x^y & mask)
 214 when using SSE.
 215
 216 //===---------------------------------------------------------------------===//
 217
 218 Use movhps to update upper 64-bits of a v4sf value. Also movlps on lower half
 219 of a v4sf value.
 220
 221 //===---------------------------------------------------------------------===//
 222
 223 Better codegen for vector_shuffles like this { x, 0, 0, 0 } or { x, 0, x, 0}.
 224 Perhaps use pxor / xorp* to clear an XMM register first?
 225
 226 //===---------------------------------------------------------------------===//
 227
 228 Better codegen for:
 229
 230 void f(float a, float b, vector float * out) { *out = (vector float){ a, 0.0, 0.0, b}; }
 231 void f(float a, float b, vector float * out) { *out = (vector float){ a, b, 0.0, 0}; }
 232
 233 For the latter we generate:
 234
 235 _f:
 236         pxor %xmm0, %xmm0
 237         movss 8(%esp), %xmm1
 238         movaps %xmm0, %xmm2
 239         unpcklps %xmm1, %xmm2
 240         movss 4(%esp), %xmm1
 241         unpcklps %xmm0, %xmm1
 242         unpcklps %xmm2, %xmm1
 243         movl 12(%esp), %eax
 244         movaps %xmm1, (%eax)
 245         ret
 246
 247 This seems like it should use shufps, one for each of a & b.
 248
 249 //===---------------------------------------------------------------------===//
 250
 251 How to decide when to use the "floating point version" of logical ops? Here are
 252 some code fragments:
 253
 254         movaps LCPI5_5, %xmm2
 255         divps %xmm1, %xmm2
 256         mulps %xmm2, %xmm3
 257         mulps 8656(%ecx), %xmm3
 258         addps 8672(%ecx), %xmm3
 259         andps LCPI5_6, %xmm2
 260         andps LCPI5_1, %xmm3
 261         por %xmm2, %xmm3
 262         movdqa %xmm3, (%edi)
 263
 264         movaps LCPI5_5, %xmm1
 265         divps %xmm0, %xmm1
 266         mulps %xmm1, %xmm3
 267         mulps 8656(%ecx), %xmm3
 268         addps 8672(%ecx), %xmm3
 269         andps LCPI5_6, %xmm1
 270         andps LCPI5_1, %xmm3
 271         orps %xmm1, %xmm3
 272         movaps %xmm3, 112(%esp)
 273         movaps %xmm3, (%ebx)
 274
 275 Due to some minor source change, the latter case ended up using orps and movaps
 276 instead of por and movdqa. Does it matter?
 277
 278 //===---------------------------------------------------------------------===//
 279
 280 X86RegisterInfo::copyRegToReg() returns X86::MOVAPSrr for VR128. Is it possible
 281 to choose between movaps, movapd, and movdqa based on types of source and
 282 destination?
 283
 284 How about andps, andpd, and pand? Do we really care about the type of the packed
 285 elements? If not, why not always use the "ps" variants, which are likely to be
 286 shorter?
 287
 288 //===---------------------------------------------------------------------===//
 289
 290 External test Nurbs exposed some problems. Look for
 291 __ZN15Nurbs_SSE_Cubic17TessellateSurfaceE, bb cond_next140. This is what icc
 292 emits:
 293
 294         movaps    (%edx), %xmm2                                 #59.21
 295         movaps    (%edx), %xmm5                                 #60.21
 296         movaps    (%edx), %xmm4                                 #61.21
 297         movaps    (%edx), %xmm3                                 #62.21
 298         movl      40(%ecx), %ebp                                #69.49
 299         shufps    $0, %xmm2, %xmm5                              #60.21
 300         movl      100(%esp), %ebx                               #69.20
 301         movl      (%ebx), %edi                                  #69.20
 302         imull     %ebp, %edi                                    #69.49
 303         addl      (%eax), %edi                                  #70.33
 304         shufps    $85, %xmm2, %xmm4                             #61.21
 305         shufps    $170, %xmm2, %xmm3                            #62.21
 306         shufps    $255, %xmm2, %xmm2                            #63.21
 307         lea       (%ebp,%ebp,2), %ebx                           #69.49
 308         negl      %ebx                                          #69.49
 309         lea       -3(%edi,%ebx), %ebx                           #70.33
 310         shll      $4, %ebx                                      #68.37
 311         addl      32(%ecx), %ebx                                #68.37
 312         testb     $15, %bl                                      #91.13
 313         jne       L_B1.24       # Prob 5%                       #91.13
 314
 315 This is the llvm code after instruction scheduling:
 316
 317 cond_next140 (0xa910740, LLVM BB @0xa90beb0):
 318         %reg1078 = MOV32ri -3
 319         %reg1079 = ADD32rm %reg1078, %reg1068, 1, %NOREG, 0
 320         %reg1037 = MOV32rm %reg1024, 1, %NOREG, 40
 321         %reg1080 = IMUL32rr %reg1079, %reg1037
 322         %reg1081 = MOV32rm %reg1058, 1, %NOREG, 0
 323         %reg1038 = LEA32r %reg1081, 1, %reg1080, -3
 324         %reg1036 = MOV32rm %reg1024, 1, %NOREG, 32
 325         %reg1082 = SHL32ri %reg1038, 4
 326         %reg1039 = ADD32rr %reg1036, %reg1082
 327         %reg1083 = MOVAPSrm %reg1059, 1, %NOREG, 0
 328         %reg1034 = SHUFPSrr %reg1083, %reg1083, 170
 329         %reg1032 = SHUFPSrr %reg1083, %reg1083, 0
 330         %reg1035 = SHUFPSrr %reg1083, %reg1083, 255
 331         %reg1033 = SHUFPSrr %reg1083, %reg1083, 85
 332         %reg1040 = MOV32rr %reg1039
 333         %reg1084 = AND32ri8 %reg1039, 15
 334         CMP32ri8 %reg1084, 0
 335         JE mbb<cond_next204,0xa914d30>
 336
 337 Still ok. After register allocation:
 338
 339 cond_next140 (0xa910740, LLVM BB @0xa90beb0):
 340         %EAX = MOV32ri -3
 341         %EDX = MOV32rm <fi#3>, 1, %NOREG, 0
 342         ADD32rm %EAX<def&use>, %EDX, 1, %NOREG, 0
 343         %EDX = MOV32rm <fi#7>, 1, %NOREG, 0
 344         %EDX = MOV32rm %EDX, 1, %NOREG, 40
 345         IMUL32rr %EAX<def&use>, %EDX
 346         %ESI = MOV32rm <fi#5>, 1, %NOREG, 0
 347         %ESI = MOV32rm %ESI, 1, %NOREG, 0
 348         MOV32mr <fi#4>, 1, %NOREG, 0, %ESI
 349         %EAX = LEA32r %ESI, 1, %EAX, -3
 350         %ESI = MOV32rm <fi#7>, 1, %NOREG, 0
 351         %ESI = MOV32rm %ESI, 1, %NOREG, 32
 352         %EDI = MOV32rr %EAX
 353         SHL32ri %EDI<def&use>, 4
 354         ADD32rr %EDI<def&use>, %ESI
 355         %XMM0 = MOVAPSrm %ECX, 1, %NOREG, 0
 356         %XMM1 = MOVAPSrr %XMM0
 357         SHUFPSrr %XMM1<def&use>, %XMM1, 170
 358         %XMM2 = MOVAPSrr %XMM0
 359         SHUFPSrr %XMM2<def&use>, %XMM2, 0
 360         %XMM3 = MOVAPSrr %XMM0
 361         SHUFPSrr %XMM3<def&use>, %XMM3, 255
 362         SHUFPSrr %XMM0<def&use>, %XMM0, 85
 363         %EBX = MOV32rr %EDI
 364         AND32ri8 %EBX<def&use>, 15
 365         CMP32ri8 %EBX, 0
 366         JE mbb<cond_next204,0xa914d30>
 367
 368 This looks really bad. The problem is that shufps is a destructive opcode. Since
 369 the same register appears as operand two in more than one shufps op, this results
 370 in a number of copies. Note icc also suffers from the same problem. Either the
 371 instruction selector should select pshufd, or the register allocator can make the
 372 two-address to three-address transformation.
 373
 374 It also exposes some other problems. See MOV32ri -3 and the spills.
 375
 376 //===---------------------------------------------------------------------===//
 377
 378 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=25500
 379
 380 LLVM is producing bad code.
 381
 382 LBB_main_4:     # cond_true44
 383         addps %xmm1, %xmm2
 384         subps %xmm3, %xmm2
 385         movaps (%ecx), %xmm4
 386         movaps %xmm2, %xmm1
 387         addps %xmm4, %xmm1
 388         addl $16, %ecx
 389         incl %edx
 390         cmpl $262144, %edx
 391         movaps %xmm3, %xmm2
 392         movaps %xmm4, %xmm3
 393         jne LBB_main_4  # cond_true44
 394
 395 There are two problems. 1) No need for two loop induction variables. We can
 396 compare against 262144 * 16. 2) Known register coalescer issue. We should
 397 be able to eliminate one of the movaps:
 398
 399         addps %xmm2, %xmm1    <=== Commute!
 400         subps %xmm3, %xmm1
 401         movaps (%ecx), %xmm4
 402         movaps %xmm1, %xmm1   <=== Eliminate!
 403         addps %xmm4, %xmm1
 404         addl $16, %ecx
 405         incl %edx
 406         cmpl $262144, %edx
 407         movaps %xmm3, %xmm2
 408         movaps %xmm4, %xmm3
 409         jne LBB_main_4  # cond_true44
 410
 411 //===---------------------------------------------------------------------===//
 412
 413 Consider:
 414
 415 __m128 test(float a) {
 416   return _mm_set_ps(0.0, 0.0, 0.0, a*a);
 417 }
 418
 419 This compiles into:
 420
 421 movss 4(%esp), %xmm1
 422 mulss %xmm1, %xmm1
 423 xorps %xmm0, %xmm0
 424 movss %xmm1, %xmm0
 425 ret
 426
 427 Because mulss doesn't modify the top 3 elements, the top elements of
 428 xmm1 are already zero'd.  We could compile this to:
 429
 430 movss 4(%esp), %xmm0
 431 mulss %xmm0, %xmm0
 432 ret
 433
 434 //===---------------------------------------------------------------------===//
 435
 436 Here's a sick and twisted idea.  Consider code like this:
 437
 438 __m128 test(__m128 a) {
 439   float b = *(float*)&A;
 440   ...
 441   return _mm_set_ps(0.0, 0.0, 0.0, b);
 442 }
 443
 444 This might compile to this code:
 445
 446 movaps c(%esp), %xmm1
 447 xorps %xmm0, %xmm0
 448 movss %xmm1, %xmm0
 449 ret
 450
 451 Now consider if the ... code caused xmm1 to get spilled.  This might produce
 452 this code:
 453
 454 movaps c(%esp), %xmm1
 455 movaps %xmm1, c2(%esp)
 456 ...
 457
 458 xorps %xmm0, %xmm0
 459 movaps c2(%esp), %xmm1
 460 movss %xmm1, %xmm0
 461 ret
 462
 463 However, since the reload is only used by these instructions, we could
 464 "fold" it into the uses, producing something like this:
 465
 466 movaps c(%esp), %xmm1
 467 movaps %xmm1, c2(%esp)
 468 ...
 469
 470 movss c2(%esp), %xmm0
 471 ret
 472
 473 ... saving two instructions.
 474
 475 The basic idea is that a reload from a spill slot can, if only one 4-byte
 476 chunk is used, bring in 3 zeros and the one element instead of 4 elements.
 477 This can be used to simplify a variety of shuffle operations, where the
 478 elements are fixed zeros.
 479
 480 //===---------------------------------------------------------------------===//
 481
 482 For this:
 483
 484 #include <emmintrin.h>
 485 void test(__m128d *r, __m128d *A, double B) {
 486   *r = _mm_loadl_pd(*A, &B);
 487 }
 488
 489 We generate:
 490
 491         subl $12, %esp
 492         movsd 24(%esp), %xmm0
 493         movsd %xmm0, (%esp)
 494         movl 20(%esp), %eax
 495         movapd (%eax), %xmm0
 496         movlpd (%esp), %xmm0
 497         movl 16(%esp), %eax
 498         movapd %xmm0, (%eax)
 499         addl $12, %esp
 500         ret
 501
 502 icc generates:
 503
 504         movl      4(%esp), %edx                                 #3.6
 505         movl      8(%esp), %eax                                 #3.6
 506         movapd    (%eax), %xmm0                                 #4.22
 507         movlpd    12(%esp), %xmm0                               #4.8
 508         movapd    %xmm0, (%edx)                                 #4.3
 509         ret                                                     #5.1
 510
 511 So icc is smart enough to know that B is in memory so it doesn't load it and
 512 store it back to the stack.
 513
 514 //===---------------------------------------------------------------------===//
 515
 516 __m128d test1( __m128d A, __m128d B) {
 517   return _mm_shuffle_pd(A, B, 0x3);
 518 }
 519
 520 compiles to
 521
 522 shufpd $3, %xmm1, %xmm0
 523
 524 Perhaps it's better to use unpckhpd instead?
 525
 526 unpckhpd %xmm1, %xmm0
 527
 528 Don't know if unpckhpd is faster. But it is shorter.
 529
 530 //===---------------------------------------------------------------------===//
 531
 532 This code generates ugly code, probably due to costs being off or something:
 533
 534 void %test(float* %P, <4 x float>* %P2 ) {
 535         %xFloat0.688 = load float* %P
 536         %loadVector37.712 = load <4 x float>* %P2
 537         %inFloat3.713 = insertelement <4 x float> %loadVector37.712, float 0.000000e+00, uint 3
 538         store <4 x float> %inFloat3.713, <4 x float>* %P2
 539         ret void
 540 }
 541
 542 Generates:
 543
 544 _test:
 545         pxor %xmm0, %xmm0
 546         movd %xmm0, %eax        ;; EAX = 0!
 547         movl 8(%esp), %ecx
 548         movaps (%ecx), %xmm0
 549         pinsrw $6, %eax, %xmm0
 550         shrl $16, %eax          ;; EAX = 0 again!
 551         pinsrw $7, %eax, %xmm0
 552         movaps %xmm0, (%ecx)
 553         ret
 554
 555 It would be better to generate:
 556
 557 _test:
 558         movl 8(%esp), %ecx
 559         movaps (%ecx), %xmm0
 560         xor %eax, %eax
 561         pinsrw $6, %eax, %xmm0
 562         pinsrw $7, %eax, %xmm0
 563         movaps %xmm0, (%ecx)
 564         ret
 565
 566 or use pxor (to make a zero vector) and shuffle (to insert it).
 567
 568 //===---------------------------------------------------------------------===//
 569
 570 Some useful information in the Apple Altivec / SSE Migration Guide:
 571
 572 http://developer.apple.com/documentation/Performance/Conceptual/
 573 Accelerate_sse_migration/index.html
 574
 575 e.g. SSE select using and, andnot, or. Various SSE compare translations.
 576
 577 //===---------------------------------------------------------------------===//
 578
 579 Add hooks to commute some CMPP operations.
 580
 581 //===---------------------------------------------------------------------===//
 582
 583 Implement some missing insert/extract element operations without going through
 584 the stack.  Testcase here:
 585 CodeGen/X86/vec_ins_extract.ll
 586 corresponds to this C code:
 587
 588 typedef float vectorfloat __attribute__((vector_size(16)));
 589 void test(vectorfloat *F, float f) {
 590   vectorfloat G = *F + *F;
 591   *((float*)&G) = f;
 592   *F = G + G;
 593 }
 594 void test2(vectorfloat *F, float f) {
 595   vectorfloat G = *F + *F;
 596   ((float*)&G)[2] = f;
 597   *F = G + G;
 598 }
 599 void test3(vectorfloat *F, float *f) {
 600   vectorfloat G = *F + *F;
 601   *f = ((float*)&G)[2];
 602 }
 603 void test4(vectorfloat *F, float *f) {
 604   vectorfloat G = *F + *F;
 605   *f = *((float*)&G);
 606 }
 607
 608 //===---------------------------------------------------------------------===//
 609
 610 Apply the same transformation that merged four float loads into a single 128-bit
 611 load to loads from the constant pool.