1 //===---------------------------------------------------------------------===//
2 // Random ideas for the X86 backend: SSE-specific stuff.
3 //===---------------------------------------------------------------------===//
5 - Consider eliminating the unaligned SSE load intrinsics, replacing them with
6 unaligned LLVM load instructions.
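A minimal sketch of the idea (the helper name is illustrative): the unaligned
load intrinsic could instead be represented as an ordinary LLVM load of
<4 x float> with alignment 1, which generic optimizations (CSE, DSE, alias
analysis) can see through like any other load.

#include <xmmintrin.h>

/* Illustrative only: _mm_loadu_ps currently turns into an unaligned-load
   intrinsic call in IR; the proposal is to emit the equivalent
   "load <4 x float>, align 1" instead. */
__m128 load_unaligned(const float *p) {
  return _mm_loadu_ps(p);
}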
8 //===---------------------------------------------------------------------===//
10 Expand libm rounding functions inline: Significant speedups possible.
11 http://gcc.gnu.org/ml/gcc-patches/2006-10/msg00909.html
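As a hedged sketch of the kind of inline expansion meant here (not the exact
sequence the gcc patch uses), floor() can be open-coded as a truncating
convert plus a correction; a real expansion must also handle |x| >= 2^52 and
NaN, which this ignores:

/* floor via cvttsd2si/cvtsi2sd plus a fix-up, instead of a libm call */
static double floor_inline(double x) {
  double t = (double)(long long)x;   /* cvttsd2si + cvtsi2sd */
  return (t > x) ? t - 1.0 : t;      /* truncation rounded toward zero */
}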
13 //===---------------------------------------------------------------------===//
When compiled with unsafe math enabled, "main" should enable SSE DAZ mode and
other fast SSE modes.
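A minimal sketch, in source form, of what such a prologue could do (the exact
code the backend would emit is an assumption; 0x0040 is the architectural DAZ
bit of MXCSR):

#include <xmmintrin.h>

static void enable_fast_sse_modes(void) {
  _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);   /* FTZ: flush denormal results */
  _mm_setcsr(_mm_getcsr() | 0x0040);            /* DAZ: treat denormal inputs as zero */
}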
18 //===---------------------------------------------------------------------===//
20 Think about doing i64 math in SSE regs on x86-32.
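A minimal sketch of the idea (the helper is hypothetical): a pair of i64
additions done as one paddq in an XMM register instead of add/adc sequences
in 32-bit GPRs.

#include <emmintrin.h>

void add_two_i64(const long long *a, const long long *b, long long *out) {
  __m128i va = _mm_loadu_si128((const __m128i *)a);
  __m128i vb = _mm_loadu_si128((const __m128i *)b);
  _mm_storeu_si128((__m128i *)out, _mm_add_epi64(va, vb));
}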
22 //===---------------------------------------------------------------------===//
This testcase should have no SSE instructions in it, and only one load from
a constant pool:
double %test3(bool %B) {
        %C = select bool %B, double 123.412, double 523.01123123
        ret double %C
}
32 Currently, the select is being lowered, which prevents the dag combiner from
33 turning 'select (load CPI1), (load CPI2)' -> 'load (select CPI1, CPI2)'
35 The pattern isel got this one right.
37 //===---------------------------------------------------------------------===//
39 SSE should implement 'select_cc' using 'emulated conditional moves' that use
40 pcmp/pand/pandn/por to do a selection instead of a conditional branch:
double %X(double %Y, double %Z, double %A, double %B) {
        %C = setlt double %A, %B
        %z = fadd double %Z, 0.0    ;; select operand is not a load
        %D = select bool %C, double %Y, double %z
        ret double %D
}
We currently emit a compare and a conditional branch; an excerpt of the
current code:

	ucomisd 40(%esp), %xmm1
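A minimal intrinsics sketch of the emulated conditional move (the helper is
hypothetical; shown for packed doubles, but the scalar case is the same idea):
the compare produces an all-ones/all-zeros mask, and and/andnot/or select
without branching.

#include <emmintrin.h>

static __m128d select_lt(__m128d A, __m128d B, __m128d Y, __m128d Z) {
  __m128d mask = _mm_cmplt_pd(A, B);              /* cmpltpd: mask where A < B */
  return _mm_or_pd(_mm_and_pd(mask, Y),           /* andpd                     */
                   _mm_andnot_pd(mask, Z));       /* andnpd + orpd             */
}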
67 //===---------------------------------------------------------------------===//
Lower memcpy / memset to a series of SSE 128-bit move instructions when it's
feasible.
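A minimal sketch of what the lowering could emit for a small fixed-size case
(the helper, the 64-byte size, and the unaligned stores are illustrative
assumptions):

#include <emmintrin.h>

static void memset64(void *p, char c) {
  __m128i v = _mm_set1_epi8(c);
  __m128i *q = (__m128i *)p;
  _mm_storeu_si128(q + 0, v);
  _mm_storeu_si128(q + 1, v);
  _mm_storeu_si128(q + 2, v);
  _mm_storeu_si128(q + 3, v);
}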
72 //===---------------------------------------------------------------------===//
Codegen:
  if (copysign(1.0, x) == copysign(1.0, y))
into:
  if (x^y & mask)
when using SSE.
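A minimal C sketch of the intended transform (the helper is hypothetical,
assuming IEEE-754 doubles): xor the bit patterns and test the sign bit instead
of materializing and comparing copysign(1.0, ...) values.

#include <string.h>

static int same_sign(double x, double y) {
  unsigned long long xi, yi;
  memcpy(&xi, &x, sizeof xi);
  memcpy(&yi, &y, sizeof yi);
  return ((xi ^ yi) >> 63) == 0;   /* signs equal iff xor clears the sign bit */
}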
80 //===---------------------------------------------------------------------===//
Use movhps to update the upper 64 bits of a v4sf value. Also movlps on the
lower half of a v4sf value.
85 //===---------------------------------------------------------------------===//
Better codegen for vector_shuffles like this { x, 0, 0, 0 } or { x, 0, x, 0 }.
Perhaps use pxor / xorp* to clear an XMM register first?
90 //===---------------------------------------------------------------------===//
X86RegisterInfo::copyRegToReg() returns X86::MOVAPSrr for VR128. Is it possible
to choose between movaps, movapd, and movdqa based on the types of the source
and destination?

How about andps, andpd, and pand? Do we really care about the type of the packed
elements? If not, why not always use the "ps" variants, which are likely to be
shorter?
100 //===---------------------------------------------------------------------===//
External test Nurbs exposed some problems. Look for
__ZN15Nurbs_SSE_Cubic17TessellateSurfaceE, bb cond_next140. This is what icc
generates:
106 movaps (%edx), %xmm2 #59.21
107 movaps (%edx), %xmm5 #60.21
108 movaps (%edx), %xmm4 #61.21
109 movaps (%edx), %xmm3 #62.21
110 movl 40(%ecx), %ebp #69.49
111 shufps $0, %xmm2, %xmm5 #60.21
112 movl 100(%esp), %ebx #69.20
113 movl (%ebx), %edi #69.20
114 imull %ebp, %edi #69.49
115 addl (%eax), %edi #70.33
116 shufps $85, %xmm2, %xmm4 #61.21
117 shufps $170, %xmm2, %xmm3 #62.21
118 shufps $255, %xmm2, %xmm2 #63.21
119 lea (%ebp,%ebp,2), %ebx #69.49
121 lea -3(%edi,%ebx), %ebx #70.33
123 addl 32(%ecx), %ebx #68.37
124 testb $15, %bl #91.13
125 jne L_B1.24 # Prob 5% #91.13
127 This is the llvm code after instruction scheduling:
129 cond_next140 (0xa910740, LLVM BB @0xa90beb0):
130 %reg1078 = MOV32ri -3
131 %reg1079 = ADD32rm %reg1078, %reg1068, 1, %NOREG, 0
132 %reg1037 = MOV32rm %reg1024, 1, %NOREG, 40
133 %reg1080 = IMUL32rr %reg1079, %reg1037
134 %reg1081 = MOV32rm %reg1058, 1, %NOREG, 0
135 %reg1038 = LEA32r %reg1081, 1, %reg1080, -3
136 %reg1036 = MOV32rm %reg1024, 1, %NOREG, 32
137 %reg1082 = SHL32ri %reg1038, 4
138 %reg1039 = ADD32rr %reg1036, %reg1082
139 %reg1083 = MOVAPSrm %reg1059, 1, %NOREG, 0
140 %reg1034 = SHUFPSrr %reg1083, %reg1083, 170
141 %reg1032 = SHUFPSrr %reg1083, %reg1083, 0
142 %reg1035 = SHUFPSrr %reg1083, %reg1083, 255
143 %reg1033 = SHUFPSrr %reg1083, %reg1083, 85
144 %reg1040 = MOV32rr %reg1039
145 %reg1084 = AND32ri8 %reg1039, 15
147 JE mbb<cond_next204,0xa914d30>
149 Still ok. After register allocation:
151 cond_next140 (0xa910740, LLVM BB @0xa90beb0):
153 %EDX = MOV32rm <fi#3>, 1, %NOREG, 0
154 ADD32rm %EAX<def&use>, %EDX, 1, %NOREG, 0
155 %EDX = MOV32rm <fi#7>, 1, %NOREG, 0
156 %EDX = MOV32rm %EDX, 1, %NOREG, 40
157 IMUL32rr %EAX<def&use>, %EDX
158 %ESI = MOV32rm <fi#5>, 1, %NOREG, 0
159 %ESI = MOV32rm %ESI, 1, %NOREG, 0
160 MOV32mr <fi#4>, 1, %NOREG, 0, %ESI
161 %EAX = LEA32r %ESI, 1, %EAX, -3
162 %ESI = MOV32rm <fi#7>, 1, %NOREG, 0
163 %ESI = MOV32rm %ESI, 1, %NOREG, 32
165 SHL32ri %EDI<def&use>, 4
166 ADD32rr %EDI<def&use>, %ESI
167 %XMM0 = MOVAPSrm %ECX, 1, %NOREG, 0
168 %XMM1 = MOVAPSrr %XMM0
169 SHUFPSrr %XMM1<def&use>, %XMM1, 170
170 %XMM2 = MOVAPSrr %XMM0
171 SHUFPSrr %XMM2<def&use>, %XMM2, 0
172 %XMM3 = MOVAPSrr %XMM0
173 SHUFPSrr %XMM3<def&use>, %XMM3, 255
174 SHUFPSrr %XMM0<def&use>, %XMM0, 85
176 AND32ri8 %EBX<def&use>, 15
178 JE mbb<cond_next204,0xa914d30>
This looks really bad. The problem is that shufps is a destructive opcode:
since the same value appears as operand two of more than one shufps, a number
of copies are needed. Note that icc suffers from the same problem. Either the
instruction selector should select pshufd instead, or the register allocator
could perform the two-address to three-address transformation.
186 It also exposes some other problems. See MOV32ri -3 and the spills.
188 //===---------------------------------------------------------------------===//
__m128 test(float a) {
  return _mm_set_ps(0.0, 0.0, 0.0, a*a);
}
Because mulss doesn't modify the top 3 elements, the top elements of
xmm1 are already zero'd. We could compile this to:

	movss	4(%esp), %xmm0
	mulss	%xmm0, %xmm0
	ret
211 //===---------------------------------------------------------------------===//
213 Here's a sick and twisted idea. Consider code like this:
__m128 test(__m128 a) {
  float b = *(float*)&a;
  ...
  return _mm_set_ps(0.0, 0.0, 0.0, b);
}
221 This might compile to this code:
223 movaps c(%esp), %xmm1
Now consider if the ... code caused xmm1 to get spilled. This might produce:
231 movaps c(%esp), %xmm1
232 movaps %xmm1, c2(%esp)
236 movaps c2(%esp), %xmm1
240 However, since the reload is only used by these instructions, we could
241 "fold" it into the uses, producing something like this:
243 movaps c(%esp), %xmm1
244 movaps %xmm1, c2(%esp)
247 movss c2(%esp), %xmm0
250 ... saving two instructions.
The basic idea is that a reload from a spill slot can, if only one 4-byte
chunk is used, bring in three zeros and the one used element instead of all
four elements. This can be used to simplify a variety of shuffle operations,
where some of the elements are known zeros.
257 //===---------------------------------------------------------------------===//
259 This code generates ugly code, probably due to costs being off or something:
261 define void @test(float* %P, <4 x float>* %P2 ) {
262 %xFloat0.688 = load float* %P
263 %tmp = load <4 x float>* %P2
264 %inFloat3.713 = insertelement <4 x float> %tmp, float 0.0, i32 3
265 store <4 x float> %inFloat3.713, <4 x float>* %P2
276 shufps $50, %xmm1, %xmm2
277 shufps $132, %xmm2, %xmm0
281 Would it be better to generate:
287 pinsrw $6, %eax, %xmm0
288 pinsrw $7, %eax, %xmm0
294 //===---------------------------------------------------------------------===//
296 Some useful information in the Apple Altivec / SSE Migration Guide:
298 http://developer.apple.com/documentation/Performance/Conceptual/
299 Accelerate_sse_migration/index.html
301 e.g. SSE select using and, andnot, or. Various SSE compare translations.
303 //===---------------------------------------------------------------------===//
305 Add hooks to commute some CMPP operations.
307 //===---------------------------------------------------------------------===//
Apply the same transformation that merges four scalar float loads into a single
128-bit load to loads from the constant pool.
312 //===---------------------------------------------------------------------===//
314 Floating point max / min are commutable when -enable-unsafe-fp-path is
315 specified. We should turn int_x86_sse_max_ss and X86ISD::FMIN etc. into other
316 nodes which are selected to max / min instructions that are marked commutable.
318 //===---------------------------------------------------------------------===//
We should materialize vector constants like "all ones" and "signbit" with
code like:

     cmpeqps xmm1, xmm1   ; xmm1 = all-ones

and:

     cmpeqps xmm1, xmm1   ; xmm1 = all-ones
     pslld   xmm1, 31     ; xmm1 = 0x80000000 in each element (the signbit mask)

instead of using a load from the constant pool. The latter is important for
ABS/NEG/copysign etc.
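The same materializations written with intrinsics (the helper is hypothetical,
and uses the integer pcmpeqd/pslld forms rather than the ps forms, but the idea
is identical): no constant-pool load is needed for either mask.

#include <emmintrin.h>

static void make_masks(__m128i *all_ones, __m128i *sign_bits) {
  __m128i x = _mm_setzero_si128();        /* any register will do          */
  __m128i ones = _mm_cmpeq_epi32(x, x);   /* 0xFFFFFFFF in every lane      */
  *all_ones = ones;
  *sign_bits = _mm_slli_epi32(ones, 31);  /* 0x80000000 in every lane      */
}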
332 //===---------------------------------------------------------------------===//
#include <emmintrin.h>

__m128i a;

void x(unsigned short n) {
  a = _mm_slli_epi32 (a, n);
}
void y(unsigned n) {
  a = _mm_slli_epi32 (a, n);
}
345 compile to ( -O3 -static -fomit-frame-pointer):
360 "y" looks good, but "x" does silly movzwl stuff around into a GPR. It seems
361 like movd would be sufficient in both cases as the value is already zero
362 extended in the 32-bit stack slot IIRC. For signed short, it should also be
363 save, as a really-signed value would be undefined for pslld.
366 //===---------------------------------------------------------------------===//
369 int t1(double d) { return signbit(d); }
371 This currently compiles to:
373 movsd 16(%esp), %xmm0
380 We should use movmskp{s|d} instead.
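A minimal sketch of the movmskpd-based approach (the helper is hypothetical):
the sign bits go straight into a GPR, with no store to the stack and reload of
the high word.

#include <emmintrin.h>

static int signbit_sse2(double d) {
  return _mm_movemask_pd(_mm_set_sd(d)) & 1;   /* bit 0 = sign of lane 0 */
}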
382 //===---------------------------------------------------------------------===//
384 CodeGen/X86/vec_align.ll tests whether we can turn 4 scalar loads into a single
385 (aligned) vector load. This functionality has a couple of problems.
387 1. The code to infer alignment from loads of globals is in the X86 backend,
388 not the dag combiner. This is because dagcombine2 needs to be able to see
389 through the X86ISD::Wrapper node, which DAGCombine can't really do.
390 2. The code for turning 4 x load into a single vector load is target
391 independent and should be moved to the dag combiner.
392 3. The code for turning 4 x load into a vector load can only handle a direct
393 load from a global or a direct load from the stack. It should be generalized
394 to handle any load from P, P+4, P+8, P+12, where P can be anything.
395 4. The alignment inference code cannot handle loads from globals in non-static
396 mode because it doesn't look through the extra dyld stub load. If you try
397 vec_align.ll without -relocation-model=static, you'll see what I mean.
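As a source-level illustration of the merging described above (the helper is
hypothetical), these are the four adjacent loads from P, P+4, P+8, P+12 that
item 3 would like handled in general:

#include <xmmintrin.h>

static __m128 load4(const float *P) {
  return _mm_set_ps(P[3], P[2], P[1], P[0]);   /* ideally one movaps/movups */
}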
399 //===---------------------------------------------------------------------===//
401 We should lower store(fneg(load p), q) into an integer load+xor+store, which
402 eliminates a constant pool load. For example, consider:
define i64 @ccosf(float %z.0, float %z.1) nounwind readonly {
entry:
 %tmp6 = fsub float -0.000000e+00, %z.1         ; <float> [#uses=1]
 %tmp20 = tail call i64 @ccoshf( float %tmp6, float %z.0 ) nounwind readonly
 ret i64 %tmp20
}

declare i64 @ccoshf(float %z.0, float %z.1) nounwind readonly
412 This currently compiles to:
414 LCPI1_0: # <4 x float>
415 .long 2147483648 # float -0
416 .long 2147483648 # float -0
417 .long 2147483648 # float -0
418 .long 2147483648 # float -0
421 movss 16(%esp), %xmm0
423 movss 20(%esp), %xmm0
430 Note the load into xmm0, then xor (to negate), then store. In PIC mode,
431 this code computes the pic base and does two loads to do the constant pool
432 load, so the improvement is much bigger.
434 The tricky part about this xform is that the argument load/store isn't exposed
435 until post-legalize, and at that point, the fneg has been custom expanded into
436 an X86 fxor. This means that we need to handle this case in the x86 backend
437 instead of in target independent code.
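A minimal C sketch of the target pattern (the helper is hypothetical): the
negation becomes an integer load, xor of the sign bit, and integer store, with
no XMM register or constant-pool mask involved.

#include <string.h>

static void store_fneg(const float *p, float *q) {
  unsigned int bits;
  memcpy(&bits, p, sizeof bits);   /* integer load  */
  bits ^= 0x80000000u;             /* flip sign bit */
  memcpy(q, &bits, sizeof bits);   /* integer store */
}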
439 //===---------------------------------------------------------------------===//
441 Non-SSE4 insert into 16 x i8 is atrociously bad.
443 //===---------------------------------------------------------------------===//
445 <2 x i64> extract is substantially worse than <2 x f64>, even if the destination
448 //===---------------------------------------------------------------------===//
450 SSE4 extract-to-mem ops aren't being pattern matched because of the AssertZext
451 sitting between the truncate and the extract.
453 //===---------------------------------------------------------------------===//
INSERTPS can match any insert (extract, imm1), imm2 for 4 x float, and insert
any number of 0.0 operands simultaneously. Currently we only use it for simple
insertions.
459 See comments in LowerINSERT_VECTOR_ELT_SSE4.
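A minimal intrinsics sketch of what a single INSERTPS can express (the helper
and the lane choices are arbitrary, for illustration): bits 7:6 of the
immediate pick the source lane, bits 5:4 the destination lane, and bits 3:0
zero additional lanes at the same time.

#include <smmintrin.h>

static __m128 insert_and_zero(__m128 a, __m128 b) {
  /* put b[2] into a[0] and zero a[3] in one instruction */
  return _mm_insert_ps(a, b, (2 << 6) | (0 << 4) | 0x8);
}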
461 //===---------------------------------------------------------------------===//
On a random note, SSE2 should declare insert/extract of 2 x f64 as legal, not
Custom. All combinations of insert/extract reg-reg, reg-mem, and mem-reg are
legal; it'll just take a few extra patterns written in the .td file.
467 Note: this is not a code quality issue; the custom lowered code happens to be
468 right, but we shouldn't have to custom lower anything. This is probably related
469 to <2 x i64> ops being so bad.
471 //===---------------------------------------------------------------------===//
473 'select' on vectors and scalars could be a whole lot better. We currently
474 lower them to conditional branches. On x86-64 for example, we compile this:
476 double test(double a, double b, double c, double d) { return a<b ? c : d; }
For unpredictable branches, the latter is much more efficient. This should
just be a matter of having scalar SSE map to SELECT_CC and custom expanding
or iseling it.
502 //===---------------------------------------------------------------------===//
LLVM currently generates stack realignment code when it is not actually
needed. The problem is that we need to know about stack alignment too early,
before RA runs.

At that point we don't know whether there will be vector spills or not.
The stack realignment logic is overly conservative here, but otherwise we can
produce unaligned loads/stores.

Fixing this will require some huge RA changes.

Testcase:
515 #include <emmintrin.h>
517 typedef short vSInt16 __attribute__ ((__vector_size__ (16)));
static const vSInt16 a = {- 22725, - 12873, - 22725, - 12873, - 22725, - 12873,
                          - 22725, - 12873};

vSInt16 madd(vSInt16 b)
{
    return _mm_madd_epi16(a, b);
}
527 Generated code (x86-32, linux):
532 movaps .LCPI1_0, %xmm1
538 //===---------------------------------------------------------------------===//
#include <emmintrin.h>
__m128 foo2 (float x) {
 return _mm_set_ps (0, 0, x, 0);
}
546 In x86-32 mode, we generate this spiffy code:
550 pshufd $81, %xmm0, %xmm0
553 in x86-64 mode, we generate this code, which could be better:
558 pshufd $81, %xmm1, %xmm0
561 In sse4 mode, we could use insertps to make both better.
563 Here's another testcase that could use insertps [mem]:
#include <xmmintrin.h>

extern float x2, x3;

__m128 foo1 (float x1, float x4) {
 return _mm_set_ps (x2, x1, x3, x4);
}
571 gcc mainline compiles it to:
574 insertps $0x10, x2(%rip), %xmm0
575 insertps $0x10, x3(%rip), %xmm1
581 //===---------------------------------------------------------------------===//
583 We compile vector multiply-by-constant into poor code:
585 define <4 x i32> @f(<4 x i32> %i) nounwind {
586 %A = mul <4 x i32> %i, < i32 10, i32 10, i32 10, i32 10 >
590 On targets without SSE4.1, this compiles into:
592 LCPI1_0: ## <4 x i32>
601 pshufd $3, %xmm0, %xmm1
603 imull LCPI1_0+12, %eax
605 pshufd $1, %xmm0, %xmm2
607 imull LCPI1_0+4, %eax
609 punpckldq %xmm1, %xmm2
615 imull LCPI1_0+8, %eax
617 punpckldq %xmm0, %xmm1
619 punpckldq %xmm2, %xmm0
622 It would be better to synthesize integer vector multiplication by constants
623 using shifts and adds, pslld and paddd here. And even on targets with SSE4.1,
624 simple cases such as multiplication by powers of two would be better as
625 vector shifts than as multiplications.
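A minimal sketch of the shift-and-add synthesis for this particular constant
(the helper is hypothetical): v*10 = (v << 3) + (v << 1), i.e. two pslld and
one paddd, with no constant-pool load and no scalarized imuls.

#include <emmintrin.h>

static __m128i mul_by_10(__m128i v) {
  return _mm_add_epi32(_mm_slli_epi32(v, 3), _mm_slli_epi32(v, 1));
}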
627 //===---------------------------------------------------------------------===//
634 return _mm_set_epi8 (1, 0, 0, 0, 0, 0, 0, 0, 0, x, 0, 1, 0, 0, 0, 0);
640 pinsrw $2, %eax, %xmm0
642 pinsrw $3, %eax, %xmm0
644 pinsrw $7, %eax, %xmm0
650 movzbl 16(%esp), %eax
652 pinsrw $3, %eax, %xmm0
667 With SSE4, it should be
668 movdqa .LC0(%rip), %xmm0
669 pinsrb $6, %edi, %xmm0
671 //===---------------------------------------------------------------------===//
We should transform a shuffle of two vectors of constants into a single vector
of constants. Insertelement of a constant into a vector of constants should
likewise result in a vector of constants. e.g. 2008-06-25-VecISelBug.ll.
677 We compiled it to something horrible:
681 .long 1065353216 ## float 1
685 LCPI1_0: ## <4 x float>
687 .long 1065353216 ## float 1
689 .long 1065353216 ## float 1
695 movhps LCPI1_0, %xmm0
698 shufps $2, %xmm1, %xmm2
699 shufps $132, %xmm2, %xmm0
702 //===---------------------------------------------------------------------===//
float foo(unsigned char x) {
  return x;
}
711 compiles to (x86-32):
define float @foo(i8 zeroext %x) nounwind {
	%tmp12 = uitofp i8 %x to float		; <float> [#uses=1]
	ret float %tmp12
}
729 We should be able to use:
	cvtsi2ss 8(%esp), %xmm0
731 since we know the stack slot is already zext'd.
733 //===---------------------------------------------------------------------===//
Consider using movlps instead of movsd to implement (scalar_to_vector (loadf64))
when code size is critical. movlps is slower than movsd on core2 but it's one
byte shorter.
739 //===---------------------------------------------------------------------===//
We should use a dynamic programming based approach to tell when using FPStack
operations is cheaper than SSE. SciMark montecarlo contains code like this:

double MonteCarlo_num_flops(int Num_samples) {
    return ((double) Num_samples)* 4.0;
}
749 In fpstack mode, this compiles into:
752 .long 1082130432 ## float 4.000000e+00
753 _MonteCarlo_num_flops:
762 in SSE mode, it compiles into significantly slower code:
764 _MonteCarlo_num_flops:
766 cvtsi2sd 16(%esp), %xmm0
There are also other cases in scimark where using fpstack is better: it is
cheaper to do fld1 than to load from a constant pool, for example, so
"load, add 1.0, store" is better done on the fp stack, etc.
777 //===---------------------------------------------------------------------===//
779 The X86 backend should be able to if-convert SSE comparisons like "ucomisd" to
780 "cmpsd". For example, this code:
782 double d1(double x) { return x == x ? x : x + x; }
794 Also, the 'ret's should be shared. This is PR6032.
796 //===---------------------------------------------------------------------===//
These should compile into the same code (PR6214). Perhaps instcombine should
canonicalize the former into the latter?
define float @foo(float %x) nounwind {
  %t = bitcast float %x to i32
  %s = and i32 %t, 2147483647
  %d = bitcast i32 %s to float
  ret float %d
}

declare float @fabsf(float %n)
define float @bar(float %x) nounwind {
  %d = call float @fabsf(float %x)
  ret float %d
}
814 //===---------------------------------------------------------------------===//
816 This IR (from PR6194):
818 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
819 target triple = "x86_64-apple-darwin10.0.0"
821 %0 = type { double, double }
822 %struct.float3 = type { float, float, float }
define void @test(%0, %struct.float3* nocapture %res) nounwind noinline ssp {
entry:
 %tmp18 = extractvalue %0 %0, 0                  ; <double> [#uses=1]
827 %tmp19 = bitcast double %tmp18 to i64 ; <i64> [#uses=1]
828 %tmp20 = zext i64 %tmp19 to i128 ; <i128> [#uses=1]
829 %tmp10 = lshr i128 %tmp20, 32 ; <i128> [#uses=1]
830 %tmp11 = trunc i128 %tmp10 to i32 ; <i32> [#uses=1]
831 %tmp12 = bitcast i32 %tmp11 to float ; <float> [#uses=1]
832 %tmp5 = getelementptr inbounds %struct.float3* %res, i64 0, i32 1 ; <float*> [#uses=1]
 store float %tmp12, float* %tmp5
 ret void
}
845 This would be better kept in the SSE unit by treating XMM0 as a 4xfloat and
846 doing a shuffle from v[1] to v[0] then a float store.
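A minimal intrinsics sketch of the suggested approach (the helper is
hypothetical): shuffle lane 1 down to lane 0 and store a single float, never
leaving the SSE unit.

#include <xmmintrin.h>

static void store_lane1(__m128 v, float *out) {
  _mm_store_ss(out, _mm_shuffle_ps(v, v, 1));   /* shufps imm 1: lane 1 -> lane 0 */
}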
848 //===---------------------------------------------------------------------===//
850 On SSE4 machines, we compile this code:
852 define <2 x float> @test2(<2 x float> %Q, <2 x float> %R,
853 <2 x float> *%P) nounwind {
854 %Z = fadd <2 x float> %Q, %R
856 store <2 x float> %Z, <2 x float> *%P
864 insertps $0, %xmm2, %xmm2
865 insertps $16, %xmm3, %xmm2
866 insertps $0, %xmm0, %xmm3
867 insertps $16, %xmm1, %xmm3
871 pshufd $1, %xmm3, %xmm1
872 ## kill: XMM1<def> XMM1<kill>
875 The insertps's of $0 are pointless complex copies.
877 //===---------------------------------------------------------------------===//