lib/Target/X86/README-SSE.txt

   1 //===---------------------------------------------------------------------===//
   2 // Random ideas for the X86 backend: SSE-specific stuff.
   3 //===---------------------------------------------------------------------===//
   4
   5 //===---------------------------------------------------------------------===//
   6
   7 There are serious issues folding loads into "scalar sse" intrinsics.  For
   8 example, this:
   9
  10 float minss4( float x, float *y ) {
  11   return _mm_cvtss_f32(_mm_min_ss(_mm_set_ss(x),_mm_set_ss(*y)));
  12 }
  13
  14 compiles to:
  15
  16 _minss4:
  17         subl $4, %esp
  18         movl 12(%esp), %eax
  19 ***     movss 8(%esp), %xmm0
  20 ***     movss (%eax), %xmm1
  21 ***     minss %xmm1, %xmm0
  22         movss %xmm0, (%esp)
  23         flds (%esp)
  24         addl $4, %esp
  25         ret
  26
  27 Each operand of the minss is a load.  At least one should be folded!
  28
  29 //===---------------------------------------------------------------------===//
  30
  31 When compiled with unsafemath enabled, "main" should enable SSE DAZ mode and
  32 other fast SSE modes.
  33
  34 //===---------------------------------------------------------------------===//
  35
  36 Think about doing i64 math in SSE regs.
  37
  38 //===---------------------------------------------------------------------===//
  39
  40 This testcase should have no SSE instructions in it, and only one load from
  41 a constant pool:
  42
  43 double %test3(bool %B) {
  44         %C = select bool %B, double 123.412, double 523.01123123
  45         ret double %C
  46 }
  47
  48 Currently, the select is being lowered, which prevents the dag combiner from
  49 turning 'select (load CPI1), (load CPI2)' -> 'load (select CPI1, CPI2)'
  50
  51 The pattern isel got this one right.
  52
  53 //===---------------------------------------------------------------------===//
  54
  55 SSE doesn't have [mem] op= reg instructions.  If we have an SSE instruction
  56 like this:
  57
  58   X += y
  59
  60 and the register allocator decides to spill X, it is cheaper to emit this as:
  61
  62 Y += [xslot]
  63 store Y -> [xslot]
  64
  65 than as:
  66
  67 tmp = [xslot]
  68 tmp += y
  69 store tmp -> [xslot]
  70
  71 ..and this uses one fewer register (so this should be done at load folding
  72 time, not at spiller time).  *Note* however that this can only be done
  73 if Y is dead.  Here's a testcase:
  74
  75 %.str_3 = external global [15 x sbyte]          ; <[15 x sbyte]*> [#uses=0]
  76 implementation   ; Functions:
  77 declare void %printf(int, ...)
  78 void %main() {
  79 build_tree.exit:
  80         br label %no_exit.i7
  81 no_exit.i7:             ; preds = %no_exit.i7, %build_tree.exit
  82         %tmp.0.1.0.i9 = phi double [ 0.000000e+00, %build_tree.exit ], [ %tmp.34.i18, %no_exit.i7 ]      ; <double> [#uses=1]
  83         %tmp.0.0.0.i10 = phi double [ 0.000000e+00, %build_tree.exit ], [ %tmp.28.i16, %no_exit.i7 ]     ; <double> [#uses=1]
  84         %tmp.28.i16 = add double %tmp.0.0.0.i10, 0.000000e+00
  85         %tmp.34.i18 = add double %tmp.0.1.0.i9, 0.000000e+00
  86         br bool false, label %Compute_Tree.exit23, label %no_exit.i7
  87 Compute_Tree.exit23:            ; preds = %no_exit.i7
  88         tail call void (int, ...)* %printf( int 0 )
  89         store double %tmp.34.i18, double* null
  90         ret void
  91 }
  92
  93 We currently emit:
  94
  95 .BBmain_1:
  96         xorpd %XMM1, %XMM1
  97         addsd %XMM0, %XMM1
  98 ***     movsd %XMM2, QWORD PTR [%ESP + 8]
  99 ***     addsd %XMM2, %XMM1
 100 ***     movsd QWORD PTR [%ESP + 8], %XMM2
 101         jmp .BBmain_1   # no_exit.i7
 102
 103 This is a bugpoint reduced testcase, which is why the testcase doesn't make
  104 much sense (e.g. it's an infinite loop). :)
 105
 106 //===---------------------------------------------------------------------===//
 107
 108 SSE should implement 'select_cc' using 'emulated conditional moves' that use
 109 pcmp/pand/pandn/por to do a selection instead of a conditional branch:
 110
 111 double %X(double %Y, double %Z, double %A, double %B) {
 112         %C = setlt double %A, %B
 113         %z = add double %Z, 0.0    ;; select operand is not a load
 114         %D = select bool %C, double %Y, double %z
 115         ret double %D
 116 }
 117
 118 We currently emit:
 119
 120 _X:
 121         subl $12, %esp
 122         xorpd %xmm0, %xmm0
 123         addsd 24(%esp), %xmm0
 124         movsd 32(%esp), %xmm1
 125         movsd 16(%esp), %xmm2
 126         ucomisd 40(%esp), %xmm1
 127         jb LBB_X_2
 128 LBB_X_1:
 129         movsd %xmm0, %xmm2
 130 LBB_X_2:
 131         movsd %xmm2, (%esp)
 132         fldl (%esp)
 133         addl $12, %esp
 134         ret
 135
 136 //===---------------------------------------------------------------------===//
 137
 138 It's not clear whether we should use pxor or xorps / xorpd to clear XMM
 139 registers. The choice may depend on subtarget information. We should do some
 140 more experiments on different x86 machines.
 141
 142 //===---------------------------------------------------------------------===//
 143
 144 Currently the x86 codegen isn't very good at mixing SSE and FPStack
 145 code:
 146
 147 unsigned int foo(double x) { return x; }
 148
 149 foo:
 150         subl $20, %esp
 151         movsd 24(%esp), %xmm0
 152         movsd %xmm0, 8(%esp)
 153         fldl 8(%esp)
 154         fisttpll (%esp)
 155         movl (%esp), %eax
 156         addl $20, %esp
 157         ret
 158
 159 This will be solved when we go to a dynamic programming based isel.
 160
 161 //===---------------------------------------------------------------------===//
 162
 163 Should generate min/max for stuff like:
 164
 165 void minf(float a, float b, float *X) {
 166   *X = a <= b ? a : b;
 167 }
 168
 169 Make use of floating point min / max instructions. Perhaps introduce ISD::FMIN
 170 and ISD::FMAX node types?
 171
 172 //===---------------------------------------------------------------------===//
 173
 174 Lower memcpy / memset to a series of SSE 128 bit move instructions when it's
 175 feasible.
 176
 177 //===---------------------------------------------------------------------===//
 178
 179 Teach the coalescer to commute 2-addr instructions, allowing us to eliminate
 180 the reg-reg copy in this example:
 181
 182 float foo(int *x, float *y, unsigned c) {
 183   float res = 0.0;
 184   unsigned i;
 185   for (i = 0; i < c; i++) {
 186     float xx = (float)x[i];
 187     xx = xx * y[i];
 188     xx += res;
 189     res = xx;
 190   }
 191   return res;
 192 }
 193
 194 LBB_foo_3:      # no_exit
 195         cvtsi2ss %XMM0, DWORD PTR [%EDX + 4*%ESI]
 196         mulss %XMM0, DWORD PTR [%EAX + 4*%ESI]
 197         addss %XMM0, %XMM1
 198         inc %ESI
 199         cmp %ESI, %ECX
 200 ****    movaps %XMM1, %XMM0
 201         jb LBB_foo_3    # no_exit
 202
 203 //===---------------------------------------------------------------------===//
 204
 205 Codegen:
 206   if (copysign(1.0, x) == copysign(1.0, y))
 207 into:
 208   if (x^y & mask)
 209 when using SSE.
 210
 211 //===---------------------------------------------------------------------===//
 212
 213 Use movhps to update upper 64-bits of a v4sf value. Also movlps on lower half
 214 of a v4sf value.
 215
 216 //===---------------------------------------------------------------------===//
 217
 218 Better codegen for vector_shuffles like this { x, 0, 0, 0 } or { x, 0, x, 0}.
 219 Perhaps use pxor / xorp* to clear a XMM register first?
 220
 221 //===---------------------------------------------------------------------===//
 222
 223 Better codegen for:
 224
 225 void f(float a, float b, vector float * out) { *out = (vector float){ a, 0.0, 0.0, b}; }
 226 void f(float a, float b, vector float * out) { *out = (vector float){ a, b, 0.0, 0}; }
 227
  228 For the latter we generate:
 229
 230 _f:
 231         pxor %xmm0, %xmm0
 232         movss 8(%esp), %xmm1
 233         movaps %xmm0, %xmm2
 234         unpcklps %xmm1, %xmm2
 235         movss 4(%esp), %xmm1
 236         unpcklps %xmm0, %xmm1
 237         unpcklps %xmm2, %xmm1
 238         movl 12(%esp), %eax
 239         movaps %xmm1, (%eax)
 240         ret
 241
 242 This seems like it should use shufps, one for each of a & b.
 243
 244 //===---------------------------------------------------------------------===//
 245
 246 How to decide when to use the "floating point version" of logical ops? Here are
 247 some code fragments:
 248
 249         movaps LCPI5_5, %xmm2
 250         divps %xmm1, %xmm2
 251         mulps %xmm2, %xmm3
 252         mulps 8656(%ecx), %xmm3
 253         addps 8672(%ecx), %xmm3
 254         andps LCPI5_6, %xmm2
 255         andps LCPI5_1, %xmm3
 256         por %xmm2, %xmm3
 257         movdqa %xmm3, (%edi)
 258
 259         movaps LCPI5_5, %xmm1
 260         divps %xmm0, %xmm1
 261         mulps %xmm1, %xmm3
 262         mulps 8656(%ecx), %xmm3
 263         addps 8672(%ecx), %xmm3
 264         andps LCPI5_6, %xmm1
 265         andps LCPI5_1, %xmm3
 266         orps %xmm1, %xmm3
 267         movaps %xmm3, 112(%esp)
 268         movaps %xmm3, (%ebx)
 269
  270 Due to some minor source change, the latter case ended up using orps and movaps
 271 instead of por and movdqa. Does it matter?
 272
 273 //===---------------------------------------------------------------------===//
 274
 275 X86RegisterInfo::copyRegToReg() returns X86::MOVAPSrr for VR128. Is it possible
 276 to choose between movaps, movapd, and movdqa based on types of source and
 277 destination?
 278
 279 How about andps, andpd, and pand? Do we really care about the type of the packed
 280 elements? If not, why not always use the "ps" variants which are likely to be
  281 shorter?
 282
 283 //===---------------------------------------------------------------------===//
 284
 285 External test Nurbs exposed some problems. Look for
 286 __ZN15Nurbs_SSE_Cubic17TessellateSurfaceE, bb cond_next140. This is what icc
 287 emits:
 288
 289         movaps    (%edx), %xmm2                                 #59.21
 290         movaps    (%edx), %xmm5                                 #60.21
 291         movaps    (%edx), %xmm4                                 #61.21
 292         movaps    (%edx), %xmm3                                 #62.21
 293         movl      40(%ecx), %ebp                                #69.49
 294         shufps    $0, %xmm2, %xmm5                              #60.21
 295         movl      100(%esp), %ebx                               #69.20
 296         movl      (%ebx), %edi                                  #69.20
 297         imull     %ebp, %edi                                    #69.49
 298         addl      (%eax), %edi                                  #70.33
 299         shufps    $85, %xmm2, %xmm4                             #61.21
 300         shufps    $170, %xmm2, %xmm3                            #62.21
 301         shufps    $255, %xmm2, %xmm2                            #63.21
 302         lea       (%ebp,%ebp,2), %ebx                           #69.49
 303         negl      %ebx                                          #69.49
 304         lea       -3(%edi,%ebx), %ebx                           #70.33
 305         shll      $4, %ebx                                      #68.37
 306         addl      32(%ecx), %ebx                                #68.37
 307         testb     $15, %bl                                      #91.13
 308         jne       L_B1.24       # Prob 5%                       #91.13
 309
 310 This is the llvm code after instruction scheduling:
 311
 312 cond_next140 (0xa910740, LLVM BB @0xa90beb0):
 313         %reg1078 = MOV32ri -3
 314         %reg1079 = ADD32rm %reg1078, %reg1068, 1, %NOREG, 0
 315         %reg1037 = MOV32rm %reg1024, 1, %NOREG, 40
 316         %reg1080 = IMUL32rr %reg1079, %reg1037
 317         %reg1081 = MOV32rm %reg1058, 1, %NOREG, 0
 318         %reg1038 = LEA32r %reg1081, 1, %reg1080, -3
 319         %reg1036 = MOV32rm %reg1024, 1, %NOREG, 32
 320         %reg1082 = SHL32ri %reg1038, 4
 321         %reg1039 = ADD32rr %reg1036, %reg1082
 322         %reg1083 = MOVAPSrm %reg1059, 1, %NOREG, 0
 323         %reg1034 = SHUFPSrr %reg1083, %reg1083, 170
 324         %reg1032 = SHUFPSrr %reg1083, %reg1083, 0
 325         %reg1035 = SHUFPSrr %reg1083, %reg1083, 255
 326         %reg1033 = SHUFPSrr %reg1083, %reg1083, 85
 327         %reg1040 = MOV32rr %reg1039
 328         %reg1084 = AND32ri8 %reg1039, 15
 329         CMP32ri8 %reg1084, 0
 330         JE mbb<cond_next204,0xa914d30>
 331
 332 Still ok. After register allocation:
 333
 334 cond_next140 (0xa910740, LLVM BB @0xa90beb0):
 335         %EAX = MOV32ri -3
 336         %EDX = MOV32rm <fi#3>, 1, %NOREG, 0
 337         ADD32rm %EAX<def&use>, %EDX, 1, %NOREG, 0
 338         %EDX = MOV32rm <fi#7>, 1, %NOREG, 0
 339         %EDX = MOV32rm %EDX, 1, %NOREG, 40
 340         IMUL32rr %EAX<def&use>, %EDX
 341         %ESI = MOV32rm <fi#5>, 1, %NOREG, 0
 342         %ESI = MOV32rm %ESI, 1, %NOREG, 0
 343         MOV32mr <fi#4>, 1, %NOREG, 0, %ESI
 344         %EAX = LEA32r %ESI, 1, %EAX, -3
 345         %ESI = MOV32rm <fi#7>, 1, %NOREG, 0
 346         %ESI = MOV32rm %ESI, 1, %NOREG, 32
 347         %EDI = MOV32rr %EAX
 348         SHL32ri %EDI<def&use>, 4
 349         ADD32rr %EDI<def&use>, %ESI
 350         %XMM0 = MOVAPSrm %ECX, 1, %NOREG, 0
 351         %XMM1 = MOVAPSrr %XMM0
 352         SHUFPSrr %XMM1<def&use>, %XMM1, 170
 353         %XMM2 = MOVAPSrr %XMM0
 354         SHUFPSrr %XMM2<def&use>, %XMM2, 0
 355         %XMM3 = MOVAPSrr %XMM0
 356         SHUFPSrr %XMM3<def&use>, %XMM3, 255
 357         SHUFPSrr %XMM0<def&use>, %XMM0, 85
 358         %EBX = MOV32rr %EDI
 359         AND32ri8 %EBX<def&use>, 15
 360         CMP32ri8 %EBX, 0
 361         JE mbb<cond_next204,0xa914d30>
 362
  363 This looks really bad. The problem is that shufps is a destructive opcode: since
  364 the same register appears as operand two in more than one shufps op, a number of
  365 copies result. Note icc also suffers from the same problem. Either the instruction
  366 selector should select pshufd, or the register allocator can make the two-address
  367 to three-address transformation.
 368
 369 It also exposes some other problems. See MOV32ri -3 and the spills.
 370
 371 //===---------------------------------------------------------------------===//
 372
 373 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=25500
 374
 375 LLVM is producing bad code.
 376
 377 LBB_main_4:     # cond_true44
 378         addps %xmm1, %xmm2
 379         subps %xmm3, %xmm2
 380         movaps (%ecx), %xmm4
 381         movaps %xmm2, %xmm1
 382         addps %xmm4, %xmm1
 383         addl $16, %ecx
 384         incl %edx
 385         cmpl $262144, %edx
 386         movaps %xmm3, %xmm2
 387         movaps %xmm4, %xmm3
 388         jne LBB_main_4  # cond_true44
 389
  390 There are two problems. 1) No need for two loop induction variables. We can
  391 compare against 262144 * 16. 2) Known register coalescer issue. We should
  392 be able to eliminate one of the movaps:
 393
 394         addps %xmm2, %xmm1    <=== Commute!
 395         subps %xmm3, %xmm1
 396         movaps (%ecx), %xmm4
 397         movaps %xmm1, %xmm1   <=== Eliminate!
 398         addps %xmm4, %xmm1
 399         addl $16, %ecx
 400         incl %edx
 401         cmpl $262144, %edx
 402         movaps %xmm3, %xmm2
 403         movaps %xmm4, %xmm3
 404         jne LBB_main_4  # cond_true44
 405
 406 //===---------------------------------------------------------------------===//
 407
 408 Consider:
 409
 410 __m128 test(float a) {
 411   return _mm_set_ps(0.0, 0.0, 0.0, a*a);
 412 }
 413
 414 This compiles into:
 415
 416 movss 4(%esp), %xmm1
 417 mulss %xmm1, %xmm1
 418 xorps %xmm0, %xmm0
 419 movss %xmm1, %xmm0
 420 ret
 421
 422 Because mulss doesn't modify the top 3 elements, the top elements of
 423 xmm1 are already zero'd.  We could compile this to:
 424
 425 movss 4(%esp), %xmm0
 426 mulss %xmm0, %xmm0
 427 ret
 428
 429 //===---------------------------------------------------------------------===//
 430
 431 Here's a sick and twisted idea.  Consider code like this:
 432
 433 __m128 test(__m128 a) {
 434   float b = *(float*)&A;
 435   ...
 436   return _mm_set_ps(0.0, 0.0, 0.0, b);
 437 }
 438
 439 This might compile to this code:
 440
 441 movaps c(%esp), %xmm1
 442 xorps %xmm0, %xmm0
 443 movss %xmm1, %xmm0
 444 ret
 445
 446 Now consider if the ... code caused xmm1 to get spilled.  This might produce
 447 this code:
 448
 449 movaps c(%esp), %xmm1
 450 movaps %xmm1, c2(%esp)
 451 ...
 452
 453 xorps %xmm0, %xmm0
 454 movaps c2(%esp), %xmm1
 455 movss %xmm1, %xmm0
 456 ret
 457
 458 However, since the reload is only used by these instructions, we could
 459 "fold" it into the uses, producing something like this:
 460
 461 movaps c(%esp), %xmm1
 462 movaps %xmm1, c2(%esp)
 463 ...
 464
 465 movss c2(%esp), %xmm0
 466 ret
 467
 468 ... saving two instructions.
 469
 470 The basic idea is that a reload from a spill slot, can, if only one 4-byte
  471 chunk is used, bring in 3 zeros and the one element instead of 4 elements.
 472 This can be used to simplify a variety of shuffle operations, where the
 473 elements are fixed zeros.
 474
 475 //===---------------------------------------------------------------------===//
 476
 477 For this:
 478
 479 #include <emmintrin.h>
 480 void test(__m128d *r, __m128d *A, double B) {
 481   *r = _mm_loadl_pd(*A, &B);
 482 }
 483
  484 We generate:
 485
 486         subl $12, %esp
 487         movsd 24(%esp), %xmm0
 488         movsd %xmm0, (%esp)
 489         movl 20(%esp), %eax
 490         movapd (%eax), %xmm0
 491         movlpd (%esp), %xmm0
 492         movl 16(%esp), %eax
 493         movapd %xmm0, (%eax)
 494         addl $12, %esp
 495         ret
 496
 497 icc generates:
 498
 499         movl      4(%esp), %edx                                 #3.6
 500         movl      8(%esp), %eax                                 #3.6
 501         movapd    (%eax), %xmm0                                 #4.22
 502         movlpd    12(%esp), %xmm0                               #4.8
 503         movapd    %xmm0, (%edx)                                 #4.3
 504         ret                                                     #5.1
 505
 506 So icc is smart enough to know that B is in memory so it doesn't load it and
 507 store it back to stack.
 508
 509 //===---------------------------------------------------------------------===//
 510
 511 __m128d test1( __m128d A, __m128d B) {
 512   return _mm_shuffle_pd(A, B, 0x3);
 513 }
 514
 515 compiles to
 516
 517 shufpd $3, %xmm1, %xmm0
 518
 519 Perhaps it's better to use unpckhpd instead?
 520
 521 unpckhpd %xmm1, %xmm0
 522
 523 Don't know if unpckhpd is faster. But it is shorter.
 524
 525 //===---------------------------------------------------------------------===//
 526
 527 This code generates ugly code, probably due to costs being off or something:
 528
 529 void %test(float* %P, <4 x float>* %P2 ) {
 530         %xFloat0.688 = load float* %P
 531         %loadVector37.712 = load <4 x float>* %P2
 532         %inFloat3.713 = insertelement <4 x float> %loadVector37.712, float 0.000000e+00, uint 3
 533         store <4 x float> %inFloat3.713, <4 x float>* %P2
 534         ret void
 535 }
 536
 537 Generates:
 538
 539 _test:
 540         pxor %xmm0, %xmm0
 541         movd %xmm0, %eax        ;; EAX = 0!
 542         movl 8(%esp), %ecx
 543         movaps (%ecx), %xmm0
 544         pinsrw $6, %eax, %xmm0
 545         shrl $16, %eax          ;; EAX = 0 again!
 546         pinsrw $7, %eax, %xmm0
 547         movaps %xmm0, (%ecx)
 548         ret
 549
 550 It would be better to generate:
 551
 552 _test:
 553         movl 8(%esp), %ecx
 554         movaps (%ecx), %xmm0
 555         xor %eax, %eax
 556         pinsrw $6, %eax, %xmm0
 557         pinsrw $7, %eax, %xmm0
 558         movaps %xmm0, (%ecx)
 559         ret
 560
 561 or use pxor (to make a zero vector) and shuffle (to insert it).
 562
 563 //===---------------------------------------------------------------------===//
 564
 565 Some useful information in the Apple Altivec / SSE Migration Guide:
 566
 567 http://developer.apple.com/documentation/Performance/Conceptual/
 568 Accelerate_sse_migration/index.html
 569
 570 e.g. SSE select using and, andnot, or. Various SSE compare translations.
 571
 572 //===---------------------------------------------------------------------===//
 573
 574 Add hooks to commute some CMPP operations.
 575
 576 //===---------------------------------------------------------------------===//
 577
 578 Implement some missing insert/extract element operations without going through
 579 the stack.  Testcase here:
 580 CodeGen/X86/vec_ins_extract.ll
 581 corresponds to this C code:
 582
 583 typedef float vectorfloat __attribute__((vector_size(16)));
 584 void test(vectorfloat *F, float f) {
 585   vectorfloat G = *F + *F;
 586   *((float*)&G) = f;
 587   *F = G + G;
 588 }
 589 void test2(vectorfloat *F, float f) {
 590   vectorfloat G = *F + *F;
 591   ((float*)&G)[2] = f;
 592   *F = G + G;
 593 }
 594 void test3(vectorfloat *F, float *f) {
 595   vectorfloat G = *F + *F;
 596   *f = ((float*)&G)[2];
 597 }
 598 void test4(vectorfloat *F, float *f) {
 599   vectorfloat G = *F + *F;
 600   *f = *((float*)&G);
 601 }
 602
 603 //===---------------------------------------------------------------------===//
 604
  605 Apply the same transformation that merged four float loads into a single 128-bit load
 606 to loads from constant pool.