lib/Target/X86/README-SSE.txt

   1 //===---------------------------------------------------------------------===//
   2 // Random ideas for the X86 backend: SSE-specific stuff.
   3 //===---------------------------------------------------------------------===//
   4
   5 - Consider eliminating the unaligned SSE load intrinsics, replacing them with
   6   unaligned LLVM load instructions.
   7
   8 //===---------------------------------------------------------------------===//
   9
  10 Expand libm rounding functions inline:  Significant speedups possible.
  11 http://gcc.gnu.org/ml/gcc-patches/2006-10/msg00909.html
  12
  13 //===---------------------------------------------------------------------===//
  14
  15 When compiled with unsafemath enabled, "main" should enable SSE DAZ mode and
  16 other fast SSE modes.
  17
  18 //===---------------------------------------------------------------------===//
  19
  20 Think about doing i64 math in SSE regs on x86-32.
  21
  22 //===---------------------------------------------------------------------===//
  23
  24 This testcase should have no SSE instructions in it, and only one load from
  25 a constant pool:
  26
  27 double %test3(bool %B) {
  28         %C = select bool %B, double 123.412, double 523.01123123
  29         ret double %C
  30 }
  31
  32 Currently, the select is being lowered, which prevents the dag combiner from
  33 turning 'select (load CPI1), (load CPI2)' -> 'load (select CPI1, CPI2)'
  34
  35 The pattern isel got this one right.
  36
  37 //===---------------------------------------------------------------------===//
  38
  39 SSE should implement 'select_cc' using 'emulated conditional moves' that use
  40 pcmp/pand/pandn/por to do a selection instead of a conditional branch:
  41
  42 double %X(double %Y, double %Z, double %A, double %B) {
  43         %C = setlt double %A, %B
  44         %z = fadd double %Z, 0.0    ;; select operand is not a load
  45         %D = select bool %C, double %Y, double %z
  46         ret double %D
  47 }
  48
  49 We currently emit:
  50
  51 _X:
  52         subl $12, %esp
  53         xorpd %xmm0, %xmm0
  54         addsd 24(%esp), %xmm0
  55         movsd 32(%esp), %xmm1
  56         movsd 16(%esp), %xmm2
  57         ucomisd 40(%esp), %xmm1
  58         jb LBB_X_2
  59 LBB_X_1:
  60         movsd %xmm0, %xmm2
  61 LBB_X_2:
  62         movsd %xmm2, (%esp)
  63         fldl (%esp)
  64         addl $12, %esp
  65         ret
  66
  67 //===---------------------------------------------------------------------===//
  68
  69 Lower memcpy / memset to a series of SSE 128 bit move instructions when it's
  70 feasible.
  71
  72 //===---------------------------------------------------------------------===//
  73
  74 Codegen:
  75   if (copysign(1.0, x) == copysign(1.0, y))
  76 into:
  77   if (x^y & mask)
  78 when using SSE.
  79
  80 //===---------------------------------------------------------------------===//
  81
  82 Use movhps to update upper 64-bits of a v4sf value. Also movlps on lower half
  83 of a v4sf value.
  84
  85 //===---------------------------------------------------------------------===//
  86
  87 Better codegen for vector_shuffles like this { x, 0, 0, 0 } or { x, 0, x, 0}.
  88 Perhaps use pxor / xorp* to clear a XMM register first?
  89
  90 //===---------------------------------------------------------------------===//
  91
  92 External test Nurbs exposed some problems. Look for
  93 __ZN15Nurbs_SSE_Cubic17TessellateSurfaceE, bb cond_next140. This is what icc
  94 emits:
  95
  96         movaps    (%edx), %xmm2                                 #59.21
  97         movaps    (%edx), %xmm5                                 #60.21
  98         movaps    (%edx), %xmm4                                 #61.21
  99         movaps    (%edx), %xmm3                                 #62.21
 100         movl      40(%ecx), %ebp                                #69.49
 101         shufps    $0, %xmm2, %xmm5                              #60.21
 102         movl      100(%esp), %ebx                               #69.20
 103         movl      (%ebx), %edi                                  #69.20
 104         imull     %ebp, %edi                                    #69.49
 105         addl      (%eax), %edi                                  #70.33
 106         shufps    $85, %xmm2, %xmm4                             #61.21
 107         shufps    $170, %xmm2, %xmm3                            #62.21
 108         shufps    $255, %xmm2, %xmm2                            #63.21
 109         lea       (%ebp,%ebp,2), %ebx                           #69.49
 110         negl      %ebx                                          #69.49
 111         lea       -3(%edi,%ebx), %ebx                           #70.33
 112         shll      $4, %ebx                                      #68.37
 113         addl      32(%ecx), %ebx                                #68.37
 114         testb     $15, %bl                                      #91.13
 115         jne       L_B1.24       # Prob 5%                       #91.13
 116
 117 This is the llvm code after instruction scheduling:
 118
 119 cond_next140 (0xa910740, LLVM BB @0xa90beb0):
 120         %reg1078 = MOV32ri -3
 121         %reg1079 = ADD32rm %reg1078, %reg1068, 1, %NOREG, 0
 122         %reg1037 = MOV32rm %reg1024, 1, %NOREG, 40
 123         %reg1080 = IMUL32rr %reg1079, %reg1037
 124         %reg1081 = MOV32rm %reg1058, 1, %NOREG, 0
 125         %reg1038 = LEA32r %reg1081, 1, %reg1080, -3
 126         %reg1036 = MOV32rm %reg1024, 1, %NOREG, 32
 127         %reg1082 = SHL32ri %reg1038, 4
 128         %reg1039 = ADD32rr %reg1036, %reg1082
 129         %reg1083 = MOVAPSrm %reg1059, 1, %NOREG, 0
 130         %reg1034 = SHUFPSrr %reg1083, %reg1083, 170
 131         %reg1032 = SHUFPSrr %reg1083, %reg1083, 0
 132         %reg1035 = SHUFPSrr %reg1083, %reg1083, 255
 133         %reg1033 = SHUFPSrr %reg1083, %reg1083, 85
 134         %reg1040 = MOV32rr %reg1039
 135         %reg1084 = AND32ri8 %reg1039, 15
 136         CMP32ri8 %reg1084, 0
 137         JE mbb<cond_next204,0xa914d30>
 138
 139 Still ok. After register allocation:
 140
 141 cond_next140 (0xa910740, LLVM BB @0xa90beb0):
 142         %EAX = MOV32ri -3
 143         %EDX = MOV32rm <fi#3>, 1, %NOREG, 0
 144         ADD32rm %EAX<def&use>, %EDX, 1, %NOREG, 0
 145         %EDX = MOV32rm <fi#7>, 1, %NOREG, 0
 146         %EDX = MOV32rm %EDX, 1, %NOREG, 40
 147         IMUL32rr %EAX<def&use>, %EDX
 148         %ESI = MOV32rm <fi#5>, 1, %NOREG, 0
 149         %ESI = MOV32rm %ESI, 1, %NOREG, 0
 150         MOV32mr <fi#4>, 1, %NOREG, 0, %ESI
 151         %EAX = LEA32r %ESI, 1, %EAX, -3
 152         %ESI = MOV32rm <fi#7>, 1, %NOREG, 0
 153         %ESI = MOV32rm %ESI, 1, %NOREG, 32
 154         %EDI = MOV32rr %EAX
 155         SHL32ri %EDI<def&use>, 4
 156         ADD32rr %EDI<def&use>, %ESI
 157         %XMM0 = MOVAPSrm %ECX, 1, %NOREG, 0
 158         %XMM1 = MOVAPSrr %XMM0
 159         SHUFPSrr %XMM1<def&use>, %XMM1, 170
 160         %XMM2 = MOVAPSrr %XMM0
 161         SHUFPSrr %XMM2<def&use>, %XMM2, 0
 162         %XMM3 = MOVAPSrr %XMM0
 163         SHUFPSrr %XMM3<def&use>, %XMM3, 255
 164         SHUFPSrr %XMM0<def&use>, %XMM0, 85
 165         %EBX = MOV32rr %EDI
 166         AND32ri8 %EBX<def&use>, 15
 167         CMP32ri8 %EBX, 0
 168         JE mbb<cond_next204,0xa914d30>
 169
 170 This looks really bad. The problem is shufps is a destructive opcode. Since it
 171 appears as operand two in more than one shufps ops. It resulted in a number of
 172 copies. Note icc also suffers from the same problem. Either the instruction
 173 selector should select pshufd or The register allocator can made the two-address
 174 to three-address transformation.
 175
 176 It also exposes some other problems. See MOV32ri -3 and the spills.
 177
 178 //===---------------------------------------------------------------------===//
 179
 180 Consider:
 181
 182 __m128 test(float a) {
 183   return _mm_set_ps(0.0, 0.0, 0.0, a*a);
 184 }
 185
 186 This compiles into:
 187
 188 movss 4(%esp), %xmm1
 189 mulss %xmm1, %xmm1
 190 xorps %xmm0, %xmm0
 191 movss %xmm1, %xmm0
 192 ret
 193
 194 Because mulss doesn't modify the top 3 elements, the top elements of
 195 xmm1 are already zero'd.  We could compile this to:
 196
 197 movss 4(%esp), %xmm0
 198 mulss %xmm0, %xmm0
 199 ret
 200
 201 //===---------------------------------------------------------------------===//
 202
 203 Here's a sick and twisted idea.  Consider code like this:
 204
 205 __m128 test(__m128 a) {
 206   float b = *(float*)&A;
 207   ...
 208   return _mm_set_ps(0.0, 0.0, 0.0, b);
 209 }
 210
 211 This might compile to this code:
 212
 213 movaps c(%esp), %xmm1
 214 xorps %xmm0, %xmm0
 215 movss %xmm1, %xmm0
 216 ret
 217
 218 Now consider if the ... code caused xmm1 to get spilled.  This might produce
 219 this code:
 220
 221 movaps c(%esp), %xmm1
 222 movaps %xmm1, c2(%esp)
 223 ...
 224
 225 xorps %xmm0, %xmm0
 226 movaps c2(%esp), %xmm1
 227 movss %xmm1, %xmm0
 228 ret
 229
 230 However, since the reload is only used by these instructions, we could
 231 "fold" it into the uses, producing something like this:
 232
 233 movaps c(%esp), %xmm1
 234 movaps %xmm1, c2(%esp)
 235 ...
 236
 237 movss c2(%esp), %xmm0
 238 ret
 239
 240 ... saving two instructions.
 241
 242 The basic idea is that a reload from a spill slot, can, if only one 4-byte
 243 chunk is used, bring in 3 zeros the one element instead of 4 elements.
 244 This can be used to simplify a variety of shuffle operations, where the
 245 elements are fixed zeros.
 246
 247 //===---------------------------------------------------------------------===//
 248
 249 This code generates ugly code, probably due to costs being off or something:
 250
 251 define void @test(float* %P, <4 x float>* %P2 ) {
 252         %xFloat0.688 = load float* %P
 253         %tmp = load <4 x float>* %P2
 254         %inFloat3.713 = insertelement <4 x float> %tmp, float 0.0, i32 3
 255         store <4 x float> %inFloat3.713, <4 x float>* %P2
 256         ret void
 257 }
 258
 259 Generates:
 260
 261 _test:
 262         movl    8(%esp), %eax
 263         movaps  (%eax), %xmm0
 264         pxor    %xmm1, %xmm1
 265         movaps  %xmm0, %xmm2
 266         shufps  $50, %xmm1, %xmm2
 267         shufps  $132, %xmm2, %xmm0
 268         movaps  %xmm0, (%eax)
 269         ret
 270
 271 Would it be better to generate:
 272
 273 _test:
 274         movl 8(%esp), %ecx
 275         movaps (%ecx), %xmm0
 276         xor %eax, %eax
 277         pinsrw $6, %eax, %xmm0
 278         pinsrw $7, %eax, %xmm0
 279         movaps %xmm0, (%ecx)
 280         ret
 281
 282 ?
 283
 284 //===---------------------------------------------------------------------===//
 285
 286 Some useful information in the Apple Altivec / SSE Migration Guide:
 287
 288 http://developer.apple.com/documentation/Performance/Conceptual/
 289 Accelerate_sse_migration/index.html
 290
 291 e.g. SSE select using and, andnot, or. Various SSE compare translations.
 292
 293 //===---------------------------------------------------------------------===//
 294
 295 Add hooks to commute some CMPP operations.
 296
 297 //===---------------------------------------------------------------------===//
 298
 299 Apply the same transformation that merged four float into a single 128-bit load
 300 to loads from constant pool.
 301
 302 //===---------------------------------------------------------------------===//
 303
 304 Floating point max / min are commutable when -enable-unsafe-fp-path is
 305 specified. We should turn int_x86_sse_max_ss and X86ISD::FMIN etc. into other
 306 nodes which are selected to max / min instructions that are marked commutable.
 307
 308 //===---------------------------------------------------------------------===//
 309
 310 We should materialize vector constants like "all ones" and "signbit" with
 311 code like:
 312
 313      cmpeqps xmm1, xmm1   ; xmm1 = all-ones
 314
 315 and:
 316      cmpeqps xmm1, xmm1   ; xmm1 = all-ones
 317      psrlq   xmm1, 31     ; xmm1 = all 100000000000...
 318
 319 instead of using a load from the constant pool.  The later is important for
 320 ABS/NEG/copysign etc.
 321
 322 //===---------------------------------------------------------------------===//
 323
 324 These functions:
 325
 326 #include <xmmintrin.h>
 327 __m128i a;
 328 void x(unsigned short n) {
 329   a = _mm_slli_epi32 (a, n);
 330 }
 331 void y(unsigned n) {
 332   a = _mm_slli_epi32 (a, n);
 333 }
 334
 335 compile to ( -O3 -static -fomit-frame-pointer):
 336 _x:
 337         movzwl  4(%esp), %eax
 338         movd    %eax, %xmm0
 339         movaps  _a, %xmm1
 340         pslld   %xmm0, %xmm1
 341         movaps  %xmm1, _a
 342         ret
 343 _y:
 344         movd    4(%esp), %xmm0
 345         movaps  _a, %xmm1
 346         pslld   %xmm0, %xmm1
 347         movaps  %xmm1, _a
 348         ret
 349
 350 "y" looks good, but "x" does silly movzwl stuff around into a GPR.  It seems
 351 like movd would be sufficient in both cases as the value is already zero
 352 extended in the 32-bit stack slot IIRC.  For signed short, it should also be
 353 save, as a really-signed value would be undefined for pslld.
 354
 355
 356 //===---------------------------------------------------------------------===//
 357
 358 #include <math.h>
 359 int t1(double d) { return signbit(d); }
 360
 361 This currently compiles to:
 362         subl    $12, %esp
 363         movsd   16(%esp), %xmm0
 364         movsd   %xmm0, (%esp)
 365         movl    4(%esp), %eax
 366         shrl    $31, %eax
 367         addl    $12, %esp
 368         ret
 369
 370 We should use movmskp{s|d} instead.
 371
 372 //===---------------------------------------------------------------------===//
 373
 374 CodeGen/X86/vec_align.ll tests whether we can turn 4 scalar loads into a single
 375 (aligned) vector load.  This functionality has a couple of problems.
 376
 377 1. The code to infer alignment from loads of globals is in the X86 backend,
 378    not the dag combiner.  This is because dagcombine2 needs to be able to see
 379    through the X86ISD::Wrapper node, which DAGCombine can't really do.
 380 2. The code for turning 4 x load into a single vector load is target
 381    independent and should be moved to the dag combiner.
 382 3. The code for turning 4 x load into a vector load can only handle a direct
 383    load from a global or a direct load from the stack.  It should be generalized
 384    to handle any load from P, P+4, P+8, P+12, where P can be anything.
 385 4. The alignment inference code cannot handle loads from globals in non-static
 386    mode because it doesn't look through the extra dyld stub load.  If you try
 387    vec_align.ll without -relocation-model=static, you'll see what I mean.
 388
 389 //===---------------------------------------------------------------------===//
 390
 391 We should lower store(fneg(load p), q) into an integer load+xor+store, which
 392 eliminates a constant pool load.  For example, consider:
 393
 394 define i64 @ccosf(float %z.0, float %z.1) nounwind readonly  {
 395 entry:
 396  %tmp6 = fsub float -0.000000e+00, %z.1         ; <float> [#uses=1]
 397  %tmp20 = tail call i64 @ccoshf( float %tmp6, float %z.0 ) nounwind readonly
 398  ret i64 %tmp20
 399 }
 400 declare i64 @ccoshf(float %z.0, float %z.1) nounwind readonly
 401
 402 This currently compiles to:
 403
 404 LCPI1_0:                                        #  <4 x float>
 405         .long   2147483648      # float -0
 406         .long   2147483648      # float -0
 407         .long   2147483648      # float -0
 408         .long   2147483648      # float -0
 409 _ccosf:
 410         subl    $12, %esp
 411         movss   16(%esp), %xmm0
 412         movss   %xmm0, 4(%esp)
 413         movss   20(%esp), %xmm0
 414         xorps   LCPI1_0, %xmm0
 415         movss   %xmm0, (%esp)
 416         call    L_ccoshf$stub
 417         addl    $12, %esp
 418         ret
 419
 420 Note the load into xmm0, then xor (to negate), then store.  In PIC mode,
 421 this code computes the pic base and does two loads to do the constant pool
 422 load, so the improvement is much bigger.
 423
 424 The tricky part about this xform is that the argument load/store isn't exposed
 425 until post-legalize, and at that point, the fneg has been custom expanded into
 426 an X86 fxor.  This means that we need to handle this case in the x86 backend
 427 instead of in target independent code.
 428
 429 //===---------------------------------------------------------------------===//
 430
 431 Non-SSE4 insert into 16 x i8 is atrociously bad.
 432
 433 //===---------------------------------------------------------------------===//
 434
 435 <2 x i64> extract is substantially worse than <2 x f64>, even if the destination
 436 is memory.
 437
 438 //===---------------------------------------------------------------------===//
 439
 440 SSE4 extract-to-mem ops aren't being pattern matched because of the AssertZext
 441 sitting between the truncate and the extract.
 442
 443 //===---------------------------------------------------------------------===//
 444
 445 INSERTPS can match any insert (extract, imm1), imm2 for 4 x float, and insert
 446 any number of 0.0 simultaneously.  Currently we only use it for simple
 447 insertions.
 448
 449 See comments in LowerINSERT_VECTOR_ELT_SSE4.
 450
 451 //===---------------------------------------------------------------------===//
 452
 453 On a random note, SSE2 should declare insert/extract of 2 x f64 as legal, not
 454 Custom.  All combinations of insert/extract reg-reg, reg-mem, and mem-reg are
 455 legal, it'll just take a few extra patterns written in the .td file.
 456
 457 Note: this is not a code quality issue; the custom lowered code happens to be
 458 right, but we shouldn't have to custom lower anything.  This is probably related
 459 to <2 x i64> ops being so bad.
 460
 461 //===---------------------------------------------------------------------===//
 462
 463 'select' on vectors and scalars could be a whole lot better.  We currently
 464 lower them to conditional branches.  On x86-64 for example, we compile this:
 465
 466 double test(double a, double b, double c, double d) { return a<b ? c : d; }
 467
 468 to:
 469
 470 _test:
 471         ucomisd %xmm0, %xmm1
 472         ja      LBB1_2  # entry
 473 LBB1_1: # entry
 474         movapd  %xmm3, %xmm2
 475 LBB1_2: # entry
 476         movapd  %xmm2, %xmm0
 477         ret
 478
 479 instead of:
 480
 481 _test:
 482         cmpltsd %xmm1, %xmm0
 483         andpd   %xmm0, %xmm2
 484         andnpd  %xmm3, %xmm0
 485         orpd    %xmm2, %xmm0
 486         ret
 487
 488 For unpredictable branches, the later is much more efficient.  This should
 489 just be a matter of having scalar sse map to SELECT_CC and custom expanding
 490 or iseling it.
 491
 492 //===---------------------------------------------------------------------===//
 493
 494 LLVM currently generates stack realignment code, when it is not necessary
 495 needed. The problem is that we need to know about stack alignment too early,
 496 before RA runs.
 497
 498 At that point we don't know, whether there will be vector spill, or not.
 499 Stack realignment logic is overly conservative here, but otherwise we can
 500 produce unaligned loads/stores.
 501
 502 Fixing this will require some huge RA changes.
 503
 504 Testcase:
 505 #include <emmintrin.h>
 506
 507 typedef short vSInt16 __attribute__ ((__vector_size__ (16)));
 508
 509 static const vSInt16 a = {- 22725, - 12873, - 22725, - 12873, - 22725, - 12873,
 510 - 22725, - 12873};;
 511
 512 vSInt16 madd(vSInt16 b)
 513 {
 514     return _mm_madd_epi16(a, b);
 515 }
 516
 517 Generated code (x86-32, linux):
 518 madd:
 519         pushl   %ebp
 520         movl    %esp, %ebp
 521         andl    $-16, %esp
 522         movaps  .LCPI1_0, %xmm1
 523         pmaddwd %xmm1, %xmm0
 524         movl    %ebp, %esp
 525         popl    %ebp
 526         ret
 527
 528 //===---------------------------------------------------------------------===//
 529
 530 Consider:
 531 #include <emmintrin.h>
 532 __m128 foo2 (float x) {
 533  return _mm_set_ps (0, 0, x, 0);
 534 }
 535
 536 In x86-32 mode, we generate this spiffy code:
 537
 538 _foo2:
 539         movss   4(%esp), %xmm0
 540         pshufd  $81, %xmm0, %xmm0
 541         ret
 542
 543 in x86-64 mode, we generate this code, which could be better:
 544
 545 _foo2:
 546         xorps   %xmm1, %xmm1
 547         movss   %xmm0, %xmm1
 548         pshufd  $81, %xmm1, %xmm0
 549         ret
 550
 551 In sse4 mode, we could use insertps to make both better.
 552
 553 Here's another testcase that could use insertps [mem]:
 554
 555 #include <xmmintrin.h>
 556 extern float x2, x3;
 557 __m128 foo1 (float x1, float x4) {
 558  return _mm_set_ps (x2, x1, x3, x4);
 559 }
 560
 561 gcc mainline compiles it to:
 562
 563 foo1:
 564        insertps        $0x10, x2(%rip), %xmm0
 565        insertps        $0x10, x3(%rip), %xmm1
 566        movaps  %xmm1, %xmm2
 567        movlhps %xmm0, %xmm2
 568        movaps  %xmm2, %xmm0
 569        ret
 570
 571 //===---------------------------------------------------------------------===//
 572
 573 We compile vector multiply-by-constant into poor code:
 574
 575 define <4 x i32> @f(<4 x i32> %i) nounwind  {
 576         %A = mul <4 x i32> %i, < i32 10, i32 10, i32 10, i32 10 >
 577         ret <4 x i32> %A
 578 }
 579
 580 On targets without SSE4.1, this compiles into:
 581
 582 LCPI1_0:                                        ##  <4 x i32>
 583         .long   10
 584         .long   10
 585         .long   10
 586         .long   10
 587         .text
 588         .align  4,0x90
 589         .globl  _f
 590 _f:
 591         pshufd  $3, %xmm0, %xmm1
 592         movd    %xmm1, %eax
 593         imull   LCPI1_0+12, %eax
 594         movd    %eax, %xmm1
 595         pshufd  $1, %xmm0, %xmm2
 596         movd    %xmm2, %eax
 597         imull   LCPI1_0+4, %eax
 598         movd    %eax, %xmm2
 599         punpckldq       %xmm1, %xmm2
 600         movd    %xmm0, %eax
 601         imull   LCPI1_0, %eax
 602         movd    %eax, %xmm1
 603         movhlps %xmm0, %xmm0
 604         movd    %xmm0, %eax
 605         imull   LCPI1_0+8, %eax
 606         movd    %eax, %xmm0
 607         punpckldq       %xmm0, %xmm1
 608         movaps  %xmm1, %xmm0
 609         punpckldq       %xmm2, %xmm0
 610         ret
 611
 612 It would be better to synthesize integer vector multiplication by constants
 613 using shifts and adds, pslld and paddd here. And even on targets with SSE4.1,
 614 simple cases such as multiplication by powers of two would be better as
 615 vector shifts than as multiplications.
 616
 617 //===---------------------------------------------------------------------===//
 618
 619 We compile this:
 620
 621 __m128i
 622 foo2 (char x)
 623 {
 624   return _mm_set_epi8 (1, 0, 0, 0, 0, 0, 0, 0, 0, x, 0, 1, 0, 0, 0, 0);
 625 }
 626
 627 into:
 628         movl    $1, %eax
 629         xorps   %xmm0, %xmm0
 630         pinsrw  $2, %eax, %xmm0
 631         movzbl  4(%esp), %eax
 632         pinsrw  $3, %eax, %xmm0
 633         movl    $256, %eax
 634         pinsrw  $7, %eax, %xmm0
 635         ret
 636
 637
 638 gcc-4.2:
 639         subl    $12, %esp
 640         movzbl  16(%esp), %eax
 641         movdqa  LC0, %xmm0
 642         pinsrw  $3, %eax, %xmm0
 643         addl    $12, %esp
 644         ret
 645         .const
 646         .align 4
 647 LC0:
 648         .word   0
 649         .word   0
 650         .word   1
 651         .word   0
 652         .word   0
 653         .word   0
 654         .word   0
 655         .word   256
 656
 657 With SSE4, it should be
 658       movdqa  .LC0(%rip), %xmm0
 659       pinsrb  $6, %edi, %xmm0
 660
 661 //===---------------------------------------------------------------------===//
 662
 663 We should transform a shuffle of two vectors of constants into a single vector
 664 of constants. Also, insertelement of a constant into a vector of constants
 665 should also result in a vector of constants. e.g. 2008-06-25-VecISelBug.ll.
 666
 667 We compiled it to something horrible:
 668
 669         .align  4
 670 LCPI1_1:                                        ##  float
 671         .long   1065353216      ## float 1
 672         .const
 673
 674         .align  4
 675 LCPI1_0:                                        ##  <4 x float>
 676         .space  4
 677         .long   1065353216      ## float 1
 678         .space  4
 679         .long   1065353216      ## float 1
 680         .text
 681         .align  4,0x90
 682         .globl  _t
 683 _t:
 684         xorps   %xmm0, %xmm0
 685         movhps  LCPI1_0, %xmm0
 686         movss   LCPI1_1, %xmm1
 687         movaps  %xmm0, %xmm2
 688         shufps  $2, %xmm1, %xmm2
 689         shufps  $132, %xmm2, %xmm0
 690         movaps  %xmm0, 0
 691
 692 //===---------------------------------------------------------------------===//
 693 rdar://5907648
 694
 695 This function:
 696
 697 float foo(unsigned char x) {
 698   return x;
 699 }
 700
 701 compiles to (x86-32):
 702
 703 define float @foo(i8 zeroext  %x) nounwind  {
 704         %tmp12 = uitofp i8 %x to float          ; <float> [#uses=1]
 705         ret float %tmp12
 706 }
 707
 708 compiles to:
 709
 710 _foo:
 711         subl    $4, %esp
 712         movzbl  8(%esp), %eax
 713         cvtsi2ss        %eax, %xmm0
 714         movss   %xmm0, (%esp)
 715         flds    (%esp)
 716         addl    $4, %esp
 717         ret
 718
 719 We should be able to use:
 720   cvtsi2ss 8($esp), %xmm0
 721 since we know the stack slot is already zext'd.
 722
 723 //===---------------------------------------------------------------------===//
 724
 725 Consider using movlps instead of movsd to implement (scalar_to_vector (loadf64))
 726 when code size is critical. movlps is slower than movsd on core2 but it's one
 727 byte shorter.
 728
 729 //===---------------------------------------------------------------------===//
 730
 731 We should use a dynamic programming based approach to tell when using FPStack
 732 operations is cheaper than SSE.  SciMark montecarlo contains code like this
 733 for example:
 734
 735 double MonteCarlo_num_flops(int Num_samples) {
 736     return ((double) Num_samples)* 4.0;
 737 }
 738
 739 In fpstack mode, this compiles into:
 740
 741 LCPI1_0:
 742         .long   1082130432      ## float 4.000000e+00
 743 _MonteCarlo_num_flops:
 744         subl    $4, %esp
 745         movl    8(%esp), %eax
 746         movl    %eax, (%esp)
 747         fildl   (%esp)
 748         fmuls   LCPI1_0
 749         addl    $4, %esp
 750         ret
 751
 752 in SSE mode, it compiles into significantly slower code:
 753
 754 _MonteCarlo_num_flops:
 755         subl    $12, %esp
 756         cvtsi2sd        16(%esp), %xmm0
 757         mulsd   LCPI1_0, %xmm0
 758         movsd   %xmm0, (%esp)
 759         fldl    (%esp)
 760         addl    $12, %esp
 761         ret
 762
 763 There are also other cases in scimark where using fpstack is better, it is
 764 cheaper to do fld1 than load from a constant pool for example, so
 765 "load, add 1.0, store" is better done in the fp stack, etc.
 766
 767 //===---------------------------------------------------------------------===//
 768
 769 The X86 backend should be able to if-convert SSE comparisons like "ucomisd" to
 770 "cmpsd".  For example, this code:
 771
 772 double d1(double x) { return x == x ? x : x + x; }
 773
 774 Compiles into:
 775
 776 _d1:
 777         ucomisd %xmm0, %xmm0
 778         jnp     LBB1_2
 779         addsd   %xmm0, %xmm0
 780         ret
 781 LBB1_2:
 782         ret
 783
 784 Also, the 'ret's should be shared.  This is PR6032.
 785
 786 //===---------------------------------------------------------------------===//
 787
 788 These should compile into the same code (PR6214): Perhaps instcombine should
 789 canonicalize the former into the later?
 790
 791 define float @foo(float %x) nounwind {
 792   %t = bitcast float %x to i32
 793   %s = and i32 %t, 2147483647
 794   %d = bitcast i32 %s to float
 795   ret float %d
 796 }
 797
 798 declare float @fabsf(float %n)
 799 define float @bar(float %x) nounwind {
 800   %d = call float @fabsf(float %x)
 801   ret float %d
 802 }
 803
 804 //===---------------------------------------------------------------------===//
 805
 806 This IR (from PR6194):
 807
 808 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
 809 target triple = "x86_64-apple-darwin10.0.0"
 810
 811 %0 = type { double, double }
 812 %struct.float3 = type { float, float, float }
 813
 814 define void @test(%0, %struct.float3* nocapture %res) nounwind noinline ssp {
 815 entry:
 816   %tmp18 = extractvalue %0 %0, 0                  ; <double> [#uses=1]
 817   %tmp19 = bitcast double %tmp18 to i64           ; <i64> [#uses=1]
 818   %tmp20 = zext i64 %tmp19 to i128                ; <i128> [#uses=1]
 819   %tmp10 = lshr i128 %tmp20, 32                   ; <i128> [#uses=1]
 820   %tmp11 = trunc i128 %tmp10 to i32               ; <i32> [#uses=1]
 821   %tmp12 = bitcast i32 %tmp11 to float            ; <float> [#uses=1]
 822   %tmp5 = getelementptr inbounds %struct.float3* %res, i64 0, i32 1 ; <float*> [#uses=1]
 823   store float %tmp12, float* %tmp5
 824   ret void
 825 }
 826
 827 Compiles to:
 828
 829 _test:                                  ## @test
 830         movd    %xmm0, %rax
 831         shrq    $32, %rax
 832         movl    %eax, 4(%rdi)
 833         ret
 834
 835 This would be better kept in the SSE unit by treating XMM0 as a 4xfloat and
 836 doing a shuffle from v[1] to v[0] then a float store.
 837
 838 //===---------------------------------------------------------------------===//
 839
 840 On SSE4 machines, we compile this code:
 841
 842 define <2 x float> @test2(<2 x float> %Q, <2 x float> %R,
 843        <2 x float> *%P) nounwind {
 844   %Z = fadd <2 x float> %Q, %R
 845
 846   store <2 x float> %Z, <2 x float> *%P
 847   ret <2 x float> %Z
 848 }
 849
 850 into:
 851
 852 _test2:                                 ## @test2
 853 ## BB#0:
 854         insertps        $0, %xmm2, %xmm2
 855         insertps        $16, %xmm3, %xmm2
 856         insertps        $0, %xmm0, %xmm3
 857         insertps        $16, %xmm1, %xmm3
 858         addps   %xmm2, %xmm3
 859         movq    %xmm3, (%rdi)
 860         movaps  %xmm3, %xmm0
 861         pshufd  $1, %xmm3, %xmm1
 862                                         ## kill: XMM1<def> XMM1<kill>
 863         ret
 864
 865 The insertps's of $0 are pointless complex copies.
 866
 867 //===---------------------------------------------------------------------===//
 868
 869