lib/Target/X86/README-SSE.txt

   1 //===---------------------------------------------------------------------===//
   2 // Random ideas for the X86 backend: SSE-specific stuff.
   3 //===---------------------------------------------------------------------===//
   4
   5 - Consider eliminating the unaligned SSE load intrinsics, replacing them with
   6   unaligned LLVM load instructions.
   7
   8 //===---------------------------------------------------------------------===//
   9
  10 Expand libm rounding functions inline:  Significant speedups possible.
  11 http://gcc.gnu.org/ml/gcc-patches/2006-10/msg00909.html
  12
  13 //===---------------------------------------------------------------------===//
  14
  15 When compiled with unsafemath enabled, "main" should enable SSE DAZ mode and
  16 other fast SSE modes.
  17
  18 //===---------------------------------------------------------------------===//
  19
  20 Think about doing i64 math in SSE regs.
  21
  22 //===---------------------------------------------------------------------===//
  23
  24 This testcase should have no SSE instructions in it, and only one load from
  25 a constant pool:
  26
  27 double %test3(bool %B) {
  28         %C = select bool %B, double 123.412, double 523.01123123
  29         ret double %C
  30 }
  31
  32 Currently, the select is being lowered, which prevents the dag combiner from
  33 turning 'select (load CPI1), (load CPI2)' -> 'load (select CPI1, CPI2)'
  34
  35 The pattern isel got this one right.
  36
  37 //===---------------------------------------------------------------------===//
  38
  39 SSE doesn't have [mem] op= reg instructions.  If we have an SSE instruction
  40 like this:
  41
  42   X += y
  43
  44 and the register allocator decides to spill X, it is cheaper to emit this as:
  45
  46 Y += [xslot]
  47 store Y -> [xslot]
  48
  49 than as:
  50
  51 tmp = [xslot]
  52 tmp += y
  53 store tmp -> [xslot]
  54
  55 ..and this uses one fewer register (so this should be done at load folding
  56 time, not at spiller time).  *Note* however that this can only be done
  57 if Y is dead.  Here's a testcase:
  58
  59 @.str_3 = external global [15 x i8]             ; <[15 x i8]*> [#uses=0]
  60 declare void @printf(i32, ...)
  61 define void @main() {
  62 build_tree.exit:
  63         br label %no_exit.i7
  64
  65 no_exit.i7:             ; preds = %no_exit.i7, %build_tree.exit
  66         %tmp.0.1.0.i9 = phi double [ 0.000000e+00, %build_tree.exit ], [ %tmp.34.i18, %no_exit.i7 ]             ; <double> [#uses=1]
  67         %tmp.0.0.0.i10 = phi double [ 0.000000e+00, %build_tree.exit ], [ %tmp.28.i16, %no_exit.i7 ]            ; <double> [#uses=1]
  68         %tmp.28.i16 = add double %tmp.0.0.0.i10, 0.000000e+00           ; <double> [#uses=1]
  69         %tmp.34.i18 = add double %tmp.0.1.0.i9, 0.000000e+00            ; <double> [#uses=2]
  70         br i1 false, label %Compute_Tree.exit23, label %no_exit.i7
  71
  72 Compute_Tree.exit23:            ; preds = %no_exit.i7
  73         tail call void (i32, ...)* @printf( i32 0 )
  74         store double %tmp.34.i18, double* null
  75         ret void
  76 }
  77
  78 We currently emit:
  79
  80 .BBmain_1:
  81         xorpd %XMM1, %XMM1
  82         addsd %XMM0, %XMM1
  83 ***     movsd %XMM2, QWORD PTR [%ESP + 8]
  84 ***     addsd %XMM2, %XMM1
  85 ***     movsd QWORD PTR [%ESP + 8], %XMM2
  86         jmp .BBmain_1   # no_exit.i7
  87
  88 This is a bugpoint reduced testcase, which is why the testcase doesn't make
  89 much sense (e.g. it's an infinite loop). :)
  90
  91 //===---------------------------------------------------------------------===//
  92
  93 SSE should implement 'select_cc' using 'emulated conditional moves' that use
  94 pcmp/pand/pandn/por to do a selection instead of a conditional branch:
  95
  96 double %X(double %Y, double %Z, double %A, double %B) {
  97         %C = setlt double %A, %B
  98         %z = add double %Z, 0.0    ;; select operand is not a load
  99         %D = select bool %C, double %Y, double %z
 100         ret double %D
 101 }
 102
 103 We currently emit:
 104
 105 _X:
 106         subl $12, %esp
 107         xorpd %xmm0, %xmm0
 108         addsd 24(%esp), %xmm0
 109         movsd 32(%esp), %xmm1
 110         movsd 16(%esp), %xmm2
 111         ucomisd 40(%esp), %xmm1
 112         jb LBB_X_2
 113 LBB_X_1:
 114         movsd %xmm0, %xmm2
 115 LBB_X_2:
 116         movsd %xmm2, (%esp)
 117         fldl (%esp)
 118         addl $12, %esp
 119         ret
 120
 121 //===---------------------------------------------------------------------===//
 122
 123 It's not clear whether we should use pxor or xorps / xorpd to clear XMM
 124 registers. The choice may depend on subtarget information. We should do some
 125 more experiments on different x86 machines.
 126
 127 //===---------------------------------------------------------------------===//
 128
 129 Lower memcpy / memset to a series of SSE 128 bit move instructions when it's
 130 feasible.
 131
 132 //===---------------------------------------------------------------------===//
 133
 134 Teach the coalescer to commute 2-addr instructions, allowing us to eliminate
 135 the reg-reg copy in this example:
 136
 137 float foo(int *x, float *y, unsigned c) {
 138   float res = 0.0;
 139   unsigned i;
 140   for (i = 0; i < c; i++) {
 141     float xx = (float)x[i];
 142     xx = xx * y[i];
 143     xx += res;
 144     res = xx;
 145   }
 146   return res;
 147 }
 148
 149 LBB_foo_3:      # no_exit
 150         cvtsi2ss %XMM0, DWORD PTR [%EDX + 4*%ESI]
 151         mulss %XMM0, DWORD PTR [%EAX + 4*%ESI]
 152         addss %XMM0, %XMM1
 153         inc %ESI
 154         cmp %ESI, %ECX
 155 ****    movaps %XMM1, %XMM0
 156         jb LBB_foo_3    # no_exit
 157
 158 //===---------------------------------------------------------------------===//
 159
 160 Codegen:
 161   if (copysign(1.0, x) == copysign(1.0, y))
 162 into:
 163   if (x^y & mask)
 164 when using SSE.
 165
 166 //===---------------------------------------------------------------------===//
 167
 168 Use movhps to update upper 64-bits of a v4sf value. Also movlps on lower half
 169 of a v4sf value.
 170
 171 //===---------------------------------------------------------------------===//
 172
 173 Better codegen for vector_shuffles like this { x, 0, 0, 0 } or { x, 0, x, 0}.
 174 Perhaps use pxor / xorp* to clear a XMM register first?
 175
 176 //===---------------------------------------------------------------------===//
 177
 178 How to decide when to use the "floating point version" of logical ops? Here are
 179 some code fragments:
 180
 181         movaps LCPI5_5, %xmm2
 182         divps %xmm1, %xmm2
 183         mulps %xmm2, %xmm3
 184         mulps 8656(%ecx), %xmm3
 185         addps 8672(%ecx), %xmm3
 186         andps LCPI5_6, %xmm2
 187         andps LCPI5_1, %xmm3
 188         por %xmm2, %xmm3
 189         movdqa %xmm3, (%edi)
 190
 191         movaps LCPI5_5, %xmm1
 192         divps %xmm0, %xmm1
 193         mulps %xmm1, %xmm3
 194         mulps 8656(%ecx), %xmm3
 195         addps 8672(%ecx), %xmm3
 196         andps LCPI5_6, %xmm1
 197         andps LCPI5_1, %xmm3
 198         orps %xmm1, %xmm3
 199         movaps %xmm3, 112(%esp)
 200         movaps %xmm3, (%ebx)
 201
 202 Due to some minor source change, the latter case ended up using orps and movaps
 203 instead of por and movdqa. Does it matter?
 204
 205 //===---------------------------------------------------------------------===//
 206
 207 X86RegisterInfo::copyRegToReg() returns X86::MOVAPSrr for VR128. Is it possible
 208 to choose between movaps, movapd, and movdqa based on types of source and
 209 destination?
 210
 211 How about andps, andpd, and pand? Do we really care about the type of the packed
 212 elements? If not, why not always use the "ps" variants which are likely to be
 213 shorter.
 214
 215 //===---------------------------------------------------------------------===//
 216
 217 External test Nurbs exposed some problems. Look for
 218 __ZN15Nurbs_SSE_Cubic17TessellateSurfaceE, bb cond_next140. This is what icc
 219 emits:
 220
 221         movaps    (%edx), %xmm2                                 #59.21
 222         movaps    (%edx), %xmm5                                 #60.21
 223         movaps    (%edx), %xmm4                                 #61.21
 224         movaps    (%edx), %xmm3                                 #62.21
 225         movl      40(%ecx), %ebp                                #69.49
 226         shufps    $0, %xmm2, %xmm5                              #60.21
 227         movl      100(%esp), %ebx                               #69.20
 228         movl      (%ebx), %edi                                  #69.20
 229         imull     %ebp, %edi                                    #69.49
 230         addl      (%eax), %edi                                  #70.33
 231         shufps    $85, %xmm2, %xmm4                             #61.21
 232         shufps    $170, %xmm2, %xmm3                            #62.21
 233         shufps    $255, %xmm2, %xmm2                            #63.21
 234         lea       (%ebp,%ebp,2), %ebx                           #69.49
 235         negl      %ebx                                          #69.49
 236         lea       -3(%edi,%ebx), %ebx                           #70.33
 237         shll      $4, %ebx                                      #68.37
 238         addl      32(%ecx), %ebx                                #68.37
 239         testb     $15, %bl                                      #91.13
 240         jne       L_B1.24       # Prob 5%                       #91.13
 241
 242 This is the llvm code after instruction scheduling:
 243
 244 cond_next140 (0xa910740, LLVM BB @0xa90beb0):
 245         %reg1078 = MOV32ri -3
 246         %reg1079 = ADD32rm %reg1078, %reg1068, 1, %NOREG, 0
 247         %reg1037 = MOV32rm %reg1024, 1, %NOREG, 40
 248         %reg1080 = IMUL32rr %reg1079, %reg1037
 249         %reg1081 = MOV32rm %reg1058, 1, %NOREG, 0
 250         %reg1038 = LEA32r %reg1081, 1, %reg1080, -3
 251         %reg1036 = MOV32rm %reg1024, 1, %NOREG, 32
 252         %reg1082 = SHL32ri %reg1038, 4
 253         %reg1039 = ADD32rr %reg1036, %reg1082
 254         %reg1083 = MOVAPSrm %reg1059, 1, %NOREG, 0
 255         %reg1034 = SHUFPSrr %reg1083, %reg1083, 170
 256         %reg1032 = SHUFPSrr %reg1083, %reg1083, 0
 257         %reg1035 = SHUFPSrr %reg1083, %reg1083, 255
 258         %reg1033 = SHUFPSrr %reg1083, %reg1083, 85
 259         %reg1040 = MOV32rr %reg1039
 260         %reg1084 = AND32ri8 %reg1039, 15
 261         CMP32ri8 %reg1084, 0
 262         JE mbb<cond_next204,0xa914d30>
 263
 264 Still ok. After register allocation:
 265
 266 cond_next140 (0xa910740, LLVM BB @0xa90beb0):
 267         %EAX = MOV32ri -3
 268         %EDX = MOV32rm <fi#3>, 1, %NOREG, 0
 269         ADD32rm %EAX<def&use>, %EDX, 1, %NOREG, 0
 270         %EDX = MOV32rm <fi#7>, 1, %NOREG, 0
 271         %EDX = MOV32rm %EDX, 1, %NOREG, 40
 272         IMUL32rr %EAX<def&use>, %EDX
 273         %ESI = MOV32rm <fi#5>, 1, %NOREG, 0
 274         %ESI = MOV32rm %ESI, 1, %NOREG, 0
 275         MOV32mr <fi#4>, 1, %NOREG, 0, %ESI
 276         %EAX = LEA32r %ESI, 1, %EAX, -3
 277         %ESI = MOV32rm <fi#7>, 1, %NOREG, 0
 278         %ESI = MOV32rm %ESI, 1, %NOREG, 32
 279         %EDI = MOV32rr %EAX
 280         SHL32ri %EDI<def&use>, 4
 281         ADD32rr %EDI<def&use>, %ESI
 282         %XMM0 = MOVAPSrm %ECX, 1, %NOREG, 0
 283         %XMM1 = MOVAPSrr %XMM0
 284         SHUFPSrr %XMM1<def&use>, %XMM1, 170
 285         %XMM2 = MOVAPSrr %XMM0
 286         SHUFPSrr %XMM2<def&use>, %XMM2, 0
 287         %XMM3 = MOVAPSrr %XMM0
 288         SHUFPSrr %XMM3<def&use>, %XMM3, 255
 289         SHUFPSrr %XMM0<def&use>, %XMM0, 85
 290         %EBX = MOV32rr %EDI
 291         AND32ri8 %EBX<def&use>, 15
 292         CMP32ri8 %EBX, 0
 293         JE mbb<cond_next204,0xa914d30>
 294
 295 This looks really bad. The problem is that shufps is a destructive opcode. Since
 296 the same value appears as operand two in more than one shufps op, it results in
 297 a number of copies. Note icc also suffers from the same problem. Either the
 298 instruction selector should select pshufd, or the register allocator could make
 299 the two-address to three-address transformation.
 300
 301 It also exposes some other problems. See MOV32ri -3 and the spills.
 302
 303 //===---------------------------------------------------------------------===//
 304
 305 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=25500
 306
 307 LLVM is producing bad code.
 308
 309 LBB_main_4:     # cond_true44
 310         addps %xmm1, %xmm2
 311         subps %xmm3, %xmm2
 312         movaps (%ecx), %xmm4
 313         movaps %xmm2, %xmm1
 314         addps %xmm4, %xmm1
 315         addl $16, %ecx
 316         incl %edx
 317         cmpl $262144, %edx
 318         movaps %xmm3, %xmm2
 319         movaps %xmm4, %xmm3
 320         jne LBB_main_4  # cond_true44
 321
 322 There are two problems. 1) No need for two loop induction variables. We can
 323 compare against 262144 * 16. 2) Known register coalescer issue. We should
 324 be able to eliminate one of the movaps:
 325
 326         addps %xmm2, %xmm1    <=== Commute!
 327         subps %xmm3, %xmm1
 328         movaps (%ecx), %xmm4
 329         movaps %xmm1, %xmm1   <=== Eliminate!
 330         addps %xmm4, %xmm1
 331         addl $16, %ecx
 332         incl %edx
 333         cmpl $262144, %edx
 334         movaps %xmm3, %xmm2
 335         movaps %xmm4, %xmm3
 336         jne LBB_main_4  # cond_true44
 337
 338 //===---------------------------------------------------------------------===//
 339
 340 Consider:
 341
 342 __m128 test(float a) {
 343   return _mm_set_ps(0.0, 0.0, 0.0, a*a);
 344 }
 345
 346 This compiles into:
 347
 348 movss 4(%esp), %xmm1
 349 mulss %xmm1, %xmm1
 350 xorps %xmm0, %xmm0
 351 movss %xmm1, %xmm0
 352 ret
 353
 354 Because mulss doesn't modify the top 3 elements, the top elements of
 355 xmm1 are already zero'd.  We could compile this to:
 356
 357 movss 4(%esp), %xmm0
 358 mulss %xmm0, %xmm0
 359 ret
 360
 361 //===---------------------------------------------------------------------===//
 362
 363 Here's a sick and twisted idea.  Consider code like this:
 364
 365 __m128 test(__m128 a) {
 366   float b = *(float*)&A;
 367   ...
 368   return _mm_set_ps(0.0, 0.0, 0.0, b);
 369 }
 370
 371 This might compile to this code:
 372
 373 movaps c(%esp), %xmm1
 374 xorps %xmm0, %xmm0
 375 movss %xmm1, %xmm0
 376 ret
 377
 378 Now consider if the ... code caused xmm1 to get spilled.  This might produce
 379 this code:
 380
 381 movaps c(%esp), %xmm1
 382 movaps %xmm1, c2(%esp)
 383 ...
 384
 385 xorps %xmm0, %xmm0
 386 movaps c2(%esp), %xmm1
 387 movss %xmm1, %xmm0
 388 ret
 389
 390 However, since the reload is only used by these instructions, we could
 391 "fold" it into the uses, producing something like this:
 392
 393 movaps c(%esp), %xmm1
 394 movaps %xmm1, c2(%esp)
 395 ...
 396
 397 movss c2(%esp), %xmm0
 398 ret
 399
 400 ... saving two instructions.
 401
 402 The basic idea is that a reload from a spill slot, can, if only one 4-byte
 403 chunk is used, bring in 3 zeros and the one element instead of 4 elements.
 404 This can be used to simplify a variety of shuffle operations, where the
 405 elements are fixed zeros.
 406
 407 //===---------------------------------------------------------------------===//
 408
 409 For this:
 410
 411 #include <emmintrin.h>
 412 void test(__m128d *r, __m128d *A, double B) {
 413   *r = _mm_loadl_pd(*A, &B);
 414 }
 415
 416 We generate:
 417
 418         subl $12, %esp
 419         movsd 24(%esp), %xmm0
 420         movsd %xmm0, (%esp)
 421         movl 20(%esp), %eax
 422         movapd (%eax), %xmm0
 423         movlpd (%esp), %xmm0
 424         movl 16(%esp), %eax
 425         movapd %xmm0, (%eax)
 426         addl $12, %esp
 427         ret
 428
 429 icc generates:
 430
 431         movl      4(%esp), %edx                                 #3.6
 432         movl      8(%esp), %eax                                 #3.6
 433         movapd    (%eax), %xmm0                                 #4.22
 434         movlpd    12(%esp), %xmm0                               #4.8
 435         movapd    %xmm0, (%edx)                                 #4.3
 436         ret                                                     #5.1
 437
 438 So icc is smart enough to know that B is in memory so it doesn't load it and
 439 store it back to stack.
 440
 441 This should be fixed by eliminating the llvm.x86.sse2.loadl.pd intrinsic,
 442 lowering it to a load+insertelement instead.  We already match the load+shuffle
 443 as movlpd, so this should be easy.  We already get optimal code for:
 444
 445 define void @test2(<2 x double>* %r, <2 x double>* %A, double %B) {
 446 entry:
 447         %tmp2 = load <2 x double>* %A, align 16
 448         %tmp8 = insertelement <2 x double> %tmp2, double %B, i32 0
 449         store <2 x double> %tmp8, <2 x double>* %r, align 16
 450         ret void
 451 }
 452
 453 //===---------------------------------------------------------------------===//
 454
 455 __m128d test1( __m128d A, __m128d B) {
 456   return _mm_shuffle_pd(A, B, 0x3);
 457 }
 458
 459 compiles to
 460
 461 shufpd $3, %xmm1, %xmm0
 462
 463 Perhaps it's better to use unpckhpd instead?
 464
 465 unpckhpd %xmm1, %xmm0
 466
 467 Don't know if unpckhpd is faster. But it is shorter.
 468
 469 //===---------------------------------------------------------------------===//
 470
 471 This code generates ugly code, probably due to costs being off or something:
 472
 473 define void @test(float* %P, <4 x float>* %P2 ) {
 474         %xFloat0.688 = load float* %P
 475         %tmp = load <4 x float>* %P2
 476         %inFloat3.713 = insertelement <4 x float> %tmp, float 0.0, i32 3
 477         store <4 x float> %inFloat3.713, <4 x float>* %P2
 478         ret void
 479 }
 480
 481 Generates:
 482
 483 _test:
 484         movl    8(%esp), %eax
 485         movaps  (%eax), %xmm0
 486         pxor    %xmm1, %xmm1
 487         movaps  %xmm0, %xmm2
 488         shufps  $50, %xmm1, %xmm2
 489         shufps  $132, %xmm2, %xmm0
 490         movaps  %xmm0, (%eax)
 491         ret
 492
 493 Would it be better to generate:
 494
 495 _test:
 496         movl 8(%esp), %ecx
 497         movaps (%ecx), %xmm0
 498         xor %eax, %eax
 499         pinsrw $6, %eax, %xmm0
 500         pinsrw $7, %eax, %xmm0
 501         movaps %xmm0, (%ecx)
 502         ret
 503
 504 ?
 505
 506 //===---------------------------------------------------------------------===//
 507
 508 Some useful information in the Apple Altivec / SSE Migration Guide:
 509
 510 http://developer.apple.com/documentation/Performance/Conceptual/
 511 Accelerate_sse_migration/index.html
 512
 513 e.g. SSE select using and, andnot, or. Various SSE compare translations.
 514
 515 //===---------------------------------------------------------------------===//
 516
 517 Add hooks to commute some CMPP operations.
 518
 519 //===---------------------------------------------------------------------===//
 520
 521 Apply the same transformation that merged four float into a single 128-bit load
 522 to loads from constant pool.
 523
 524 //===---------------------------------------------------------------------===//
 525
 526 Floating point max / min are commutable when -enable-unsafe-fp-math is
 527 specified. We should turn int_x86_sse_max_ss and X86ISD::FMIN etc. into other
 528 nodes which are selected to max / min instructions that are marked commutable.
 529
 530 //===---------------------------------------------------------------------===//
 531
 532 We should compile this:
 533 #include <xmmintrin.h>
 534 typedef union {
 535   int i[4];
 536   float f[4];
 537   __m128 v;
 538 } vector4_t;
 539 void swizzle (const void *a, vector4_t * b, vector4_t * c) {
 540   b->v = _mm_loadl_pi (b->v, (__m64 *) a);
 541   c->v = _mm_loadl_pi (c->v, ((__m64 *) a) + 1);
 542 }
 543
 544 to:
 545
 546 _swizzle:
 547         movl    4(%esp), %eax
 548         movl    8(%esp), %edx
 549         movl    12(%esp), %ecx
 550         movlps  (%eax), %xmm0
 551         movlps  %xmm0, (%edx)
 552         movlps  8(%eax), %xmm0
 553         movlps  %xmm0, (%ecx)
 554         ret
 555
 556 not:
 557
 558 swizzle:
 559         movl 8(%esp), %eax
 560         movaps (%eax), %xmm0
 561         movl 4(%esp), %ecx
 562         movlps (%ecx), %xmm0
 563         movaps %xmm0, (%eax)
 564         movl 12(%esp), %eax
 565         movaps (%eax), %xmm0
 566         movlps 8(%ecx), %xmm0
 567         movaps %xmm0, (%eax)
 568         ret
 569
 570 //===---------------------------------------------------------------------===//
 571
 572 These functions should produce the same code:
 573
 574 #include <emmintrin.h>
 575
 576 typedef long long __m128i __attribute__ ((__vector_size__ (16)));
 577
 578 int foo(__m128i* val) {
 579   return __builtin_ia32_vec_ext_v4si(*val, 1);
 580 }
 581 int bar(__m128i* val) {
 582   union vs {
 583     __m128i *_v;
 584     int* _s;
 585   } v = {val};
 586   return v._s[1];
 587 }
 588
 589 We currently produce (with -m64):
 590
 591 _foo:
 592         pshufd $1, (%rdi), %xmm0
 593         movd %xmm0, %eax
 594         ret
 595 _bar:
 596         movl 4(%rdi), %eax
 597         ret
 598
 599 //===---------------------------------------------------------------------===//
 600
 601 We should materialize vector constants like "all ones" and "signbit" with
 602 code like:
 603
 604      cmpeqps xmm1, xmm1   ; xmm1 = all-ones
 605
 606 and:
 607      cmpeqps xmm1, xmm1   ; xmm1 = all-ones
 608      psrlq   xmm1, 31     ; xmm1 = all 100000000000...
 609
 610 instead of using a load from the constant pool.  The latter is important for
 611 ABS/NEG/copysign etc.
 612
 613 //===---------------------------------------------------------------------===//
 614
 615 "converting 64-bit constant pool entry to 32-bit not necessarily beneficial"
 616 http://llvm.org/PR1264
 617
 618 For this test case:
 619
 620 define double @foo(double %x) {
 621         %y = mul double %x, 5.000000e-01
 622         ret double %y
 623 }
 624
 625 llc -march=x86-64 currently produces a 32-bit constant pool entry and this code:
 626
 627         cvtss2sd .LCPI1_0(%rip), %xmm1
 628         mulsd %xmm1, %xmm0
 629
 630 instead of just using a 64-bit constant pool entry with this:
 631
 632         mulsd .LCPI1_0(%rip), %xmm0
 633
 634 This is due to the code in ExpandConstantFP in LegalizeDAG.cpp. It notices that
 635 x86-64 indeed has an instruction to load a 32-bit float from memory and convert
 636 it into a 64-bit float in a register, however it doesn't notice that this isn't
 637 beneficial because it prevents the load from being folded into the multiply.
 638
 639 //===---------------------------------------------------------------------===//
 640
 641 These functions:
 642
 643 #include <xmmintrin.h>
 644 __m128i a;
 645 void x(unsigned short n) {
 646   a = _mm_slli_epi32 (a, n);
 647 }
 648 void y(unsigned n) {
 649   a = _mm_slli_epi32 (a, n);
 650 }
 651
 652 compile to ( -O3 -static -fomit-frame-pointer):
 653 _x:
 654         movzwl  4(%esp), %eax
 655         movd    %eax, %xmm0
 656         movaps  _a, %xmm1
 657         pslld   %xmm0, %xmm1
 658         movaps  %xmm1, _a
 659         ret
 660 _y:
 661         movd    4(%esp), %xmm0
 662         movaps  _a, %xmm1
 663         pslld   %xmm0, %xmm1
 664         movaps  %xmm1, _a
 665         ret
 666
 667 "y" looks good, but "x" does silly movzwl stuff around into a GPR.  It seems
 668 like movd would be sufficient in both cases as the value is already zero
 669 extended in the 32-bit stack slot IIRC.  For signed short, it should also be
 670 safe, as a really-signed value would be undefined for pslld.
 671
 672
 673 //===---------------------------------------------------------------------===//
 674
 675 #include <math.h>
 676 int t1(double d) { return signbit(d); }
 677
 678 This currently compiles to:
 679         subl    $12, %esp
 680         movsd   16(%esp), %xmm0
 681         movsd   %xmm0, (%esp)
 682         movl    4(%esp), %eax
 683         shrl    $31, %eax
 684         addl    $12, %esp
 685         ret
 686
 687 We should use movmskp{s|d} instead.
 688
 689 //===---------------------------------------------------------------------===//
 690
 691 CodeGen/X86/vec_align.ll tests whether we can turn 4 scalar loads into a single
 692 (aligned) vector load.  This functionality has a couple of problems.
 693
 694 1. The code to infer alignment from loads of globals is in the X86 backend,
 695    not the dag combiner.  This is because dagcombine2 needs to be able to see
 696    through the X86ISD::Wrapper node, which DAGCombine can't really do.
 697 2. The code for turning 4 x load into a single vector load is target
 698    independent and should be moved to the dag combiner.
 699 3. The code for turning 4 x load into a vector load can only handle a direct
 700    load from a global or a direct load from the stack.  It should be generalized
 701    to handle any load from P, P+4, P+8, P+12, where P can be anything.
 702 4. The alignment inference code cannot handle loads from globals in non-static
 703    mode because it doesn't look through the extra dyld stub load.  If you try
 704    vec_align.ll without -relocation-model=static, you'll see what I mean.
 705
 706 //===---------------------------------------------------------------------===//
 707
 708 We should lower store(fneg(load p), q) into an integer load+xor+store, which
 709 eliminates a constant pool load.  For example, consider:
 710
 711 define i64 @ccosf(float %z.0, float %z.1) nounwind readonly  {
 712 entry:
 713         %tmp6 = sub float -0.000000e+00, %z.1           ; <float> [#uses=1]
 714         %tmp20 = tail call i64 @ccoshf( float %tmp6, float %z.0 ) nounwind readonly             ; <i64> [#uses=1]
 715         ret i64 %tmp20
 716 }
 717
 718 This currently compiles to:
 719
 720 LCPI1_0:                                        #  <4 x float>
 721         .long   2147483648      # float -0
 722         .long   2147483648      # float -0
 723         .long   2147483648      # float -0
 724         .long   2147483648      # float -0
 725 _ccosf:
 726         subl    $12, %esp
 727         movss   16(%esp), %xmm0
 728         movss   %xmm0, 4(%esp)
 729         movss   20(%esp), %xmm0
 730         xorps   LCPI1_0, %xmm0
 731         movss   %xmm0, (%esp)
 732         call    L_ccoshf$stub
 733         addl    $12, %esp
 734         ret
 735
 736 Note the load into xmm0, then xor (to negate), then store.  In PIC mode,
 737 this code computes the pic base and does two loads to do the constant pool
 738 load, so the improvement is much bigger.
 739
 740 The tricky part about this xform is that the argument load/store isn't exposed
 741 until post-legalize, and at that point, the fneg has been custom expanded into
 742 an X86 fxor.  This means that we need to handle this case in the x86 backend
 743 instead of in target independent code.
 744
 745 //===---------------------------------------------------------------------===//
 746
 747 Non-SSE4 insert into 16 x i8 is atrociously bad.
 748
 749 //===---------------------------------------------------------------------===//
 750
 751 <2 x i64> extract is substantially worse than <2 x f64>, even if the destination
 752 is memory.
 753
 754 //===---------------------------------------------------------------------===//
 755
 756 SSE4 extract-to-mem ops aren't being pattern matched because of the AssertZext
 757 sitting between the truncate and the extract.
 758
 759 //===---------------------------------------------------------------------===//
 760
 761 INSERTPS can match any insert (extract, imm1), imm2 for 4 x float, and insert
 762 any number of 0.0 simultaneously.  Currently we only use it for simple
 763 insertions.
 764
 765 See comments in LowerINSERT_VECTOR_ELT_SSE4.
 766
 767 //===---------------------------------------------------------------------===//
 768
 769 On a random note, SSE2 should declare insert/extract of 2 x f64 as legal, not
 770 Custom.  All combinations of insert/extract reg-reg, reg-mem, and mem-reg are
 771 legal, it'll just take a few extra patterns written in the .td file.
 772
 773 Note: this is not a code quality issue; the custom lowered code happens to be
 774 right, but we shouldn't have to custom lower anything.  This is probably related
 775 to <2 x i64> ops being so bad.
 776