lib/Target/X86/README-SSE.txt

   1 //===---------------------------------------------------------------------===//
   2 // Random ideas for the X86 backend: SSE-specific stuff.
   3 //===---------------------------------------------------------------------===//
   4
   5 - Consider eliminating the unaligned SSE load intrinsics, replacing them with
   6   unaligned LLVM load instructions.
   7
   8 //===---------------------------------------------------------------------===//
   9
  10 Expand libm rounding functions inline:  Significant speedups possible.
  11 http://gcc.gnu.org/ml/gcc-patches/2006-10/msg00909.html
  12
  13 //===---------------------------------------------------------------------===//
  14
  15 When compiled with unsafemath enabled, "main" should enable SSE DAZ mode and
  16 other fast SSE modes.
  17
  18 //===---------------------------------------------------------------------===//
  19
  20 Think about doing i64 math in SSE regs on x86-32.
  21
  22 //===---------------------------------------------------------------------===//
  23
  24 This testcase should have no SSE instructions in it, and only one load from
  25 a constant pool:
  26
  27 double %test3(bool %B) {
  28         %C = select bool %B, double 123.412, double 523.01123123
  29         ret double %C
  30 }
  31
  32 Currently, the select is being lowered, which prevents the dag combiner from
  33 turning 'select (load CPI1), (load CPI2)' -> 'load (select CPI1, CPI2)'
  34
  35 The pattern isel got this one right.
  36
  37 //===---------------------------------------------------------------------===//
  38
  39 SSE should implement 'select_cc' using 'emulated conditional moves' that use
  40 pcmp/pand/pandn/por to do a selection instead of a conditional branch:
  41
  42 double %X(double %Y, double %Z, double %A, double %B) {
  43         %C = setlt double %A, %B
  44         %z = fadd double %Z, 0.0    ;; select operand is not a load
  45         %D = select bool %C, double %Y, double %z
  46         ret double %D
  47 }
  48
  49 We currently emit:
  50
  51 _X:
  52         subl $12, %esp
  53         xorpd %xmm0, %xmm0
  54         addsd 24(%esp), %xmm0
  55         movsd 32(%esp), %xmm1
  56         movsd 16(%esp), %xmm2
  57         ucomisd 40(%esp), %xmm1
  58         jb LBB_X_2
  59 LBB_X_1:
  60         movsd %xmm0, %xmm2
  61 LBB_X_2:
  62         movsd %xmm2, (%esp)
  63         fldl (%esp)
  64         addl $12, %esp
  65         ret
  66
  67 //===---------------------------------------------------------------------===//
  68
  69 It's not clear whether we should use pxor or xorps / xorpd to clear XMM
  70 registers. The choice may depend on subtarget information. We should do some
  71 more experiments on different x86 machines.
  72
  73 //===---------------------------------------------------------------------===//
  74
  75 Lower memcpy / memset to a series of SSE 128 bit move instructions when it's
  76 feasible.
  77
  78 //===---------------------------------------------------------------------===//
  79
  80 Codegen:
  81   if (copysign(1.0, x) == copysign(1.0, y))
  82 into:
  83   if (x^y & mask)
  84 when using SSE.
  85
  86 //===---------------------------------------------------------------------===//
  87
  88 Use movhps to update upper 64-bits of a v4sf value. Also movlps on lower half
  89 of a v4sf value.
  90
  91 //===---------------------------------------------------------------------===//
  92
  93 Better codegen for vector_shuffles like this { x, 0, 0, 0 } or { x, 0, x, 0}.
  94 Perhaps use pxor / xorp* to clear a XMM register first?
  95
  96 //===---------------------------------------------------------------------===//
  97
  98 How to decide when to use the "floating point version" of logical ops? Here are
  99 some code fragments:
 100
 101         movaps LCPI5_5, %xmm2
 102         divps %xmm1, %xmm2
 103         mulps %xmm2, %xmm3
 104         mulps 8656(%ecx), %xmm3
 105         addps 8672(%ecx), %xmm3
 106         andps LCPI5_6, %xmm2
 107         andps LCPI5_1, %xmm3
 108         por %xmm2, %xmm3
 109         movdqa %xmm3, (%edi)
 110
 111         movaps LCPI5_5, %xmm1
 112         divps %xmm0, %xmm1
 113         mulps %xmm1, %xmm3
 114         mulps 8656(%ecx), %xmm3
 115         addps 8672(%ecx), %xmm3
 116         andps LCPI5_6, %xmm1
 117         andps LCPI5_1, %xmm3
 118         orps %xmm1, %xmm3
 119         movaps %xmm3, 112(%esp)
 120         movaps %xmm3, (%ebx)
 121
 122 Due to some minor source change, the later case ended up using orps and movaps
 123 instead of por and movdqa. Does it matter?
 124
 125 //===---------------------------------------------------------------------===//
 126
 127 X86RegisterInfo::copyRegToReg() returns X86::MOVAPSrr for VR128. Is it possible
 128 to choose between movaps, movapd, and movdqa based on types of source and
 129 destination?
 130
 131 How about andps, andpd, and pand? Do we really care about the type of the packed
 132 elements? If not, why not always use the "ps" variants which are likely to be
 133 shorter.
 134
 135 //===---------------------------------------------------------------------===//
 136
 137 External test Nurbs exposed some problems. Look for
 138 __ZN15Nurbs_SSE_Cubic17TessellateSurfaceE, bb cond_next140. This is what icc
 139 emits:
 140
 141         movaps    (%edx), %xmm2                                 #59.21
 142         movaps    (%edx), %xmm5                                 #60.21
 143         movaps    (%edx), %xmm4                                 #61.21
 144         movaps    (%edx), %xmm3                                 #62.21
 145         movl      40(%ecx), %ebp                                #69.49
 146         shufps    $0, %xmm2, %xmm5                              #60.21
 147         movl      100(%esp), %ebx                               #69.20
 148         movl      (%ebx), %edi                                  #69.20
 149         imull     %ebp, %edi                                    #69.49
 150         addl      (%eax), %edi                                  #70.33
 151         shufps    $85, %xmm2, %xmm4                             #61.21
 152         shufps    $170, %xmm2, %xmm3                            #62.21
 153         shufps    $255, %xmm2, %xmm2                            #63.21
 154         lea       (%ebp,%ebp,2), %ebx                           #69.49
 155         negl      %ebx                                          #69.49
 156         lea       -3(%edi,%ebx), %ebx                           #70.33
 157         shll      $4, %ebx                                      #68.37
 158         addl      32(%ecx), %ebx                                #68.37
 159         testb     $15, %bl                                      #91.13
 160         jne       L_B1.24       # Prob 5%                       #91.13
 161
 162 This is the llvm code after instruction scheduling:
 163
 164 cond_next140 (0xa910740, LLVM BB @0xa90beb0):
 165         %reg1078 = MOV32ri -3
 166         %reg1079 = ADD32rm %reg1078, %reg1068, 1, %NOREG, 0
 167         %reg1037 = MOV32rm %reg1024, 1, %NOREG, 40
 168         %reg1080 = IMUL32rr %reg1079, %reg1037
 169         %reg1081 = MOV32rm %reg1058, 1, %NOREG, 0
 170         %reg1038 = LEA32r %reg1081, 1, %reg1080, -3
 171         %reg1036 = MOV32rm %reg1024, 1, %NOREG, 32
 172         %reg1082 = SHL32ri %reg1038, 4
 173         %reg1039 = ADD32rr %reg1036, %reg1082
 174         %reg1083 = MOVAPSrm %reg1059, 1, %NOREG, 0
 175         %reg1034 = SHUFPSrr %reg1083, %reg1083, 170
 176         %reg1032 = SHUFPSrr %reg1083, %reg1083, 0
 177         %reg1035 = SHUFPSrr %reg1083, %reg1083, 255
 178         %reg1033 = SHUFPSrr %reg1083, %reg1083, 85
 179         %reg1040 = MOV32rr %reg1039
 180         %reg1084 = AND32ri8 %reg1039, 15
 181         CMP32ri8 %reg1084, 0
 182         JE mbb<cond_next204,0xa914d30>
 183
 184 Still ok. After register allocation:
 185
 186 cond_next140 (0xa910740, LLVM BB @0xa90beb0):
 187         %EAX = MOV32ri -3
 188         %EDX = MOV32rm <fi#3>, 1, %NOREG, 0
 189         ADD32rm %EAX<def&use>, %EDX, 1, %NOREG, 0
 190         %EDX = MOV32rm <fi#7>, 1, %NOREG, 0
 191         %EDX = MOV32rm %EDX, 1, %NOREG, 40
 192         IMUL32rr %EAX<def&use>, %EDX
 193         %ESI = MOV32rm <fi#5>, 1, %NOREG, 0
 194         %ESI = MOV32rm %ESI, 1, %NOREG, 0
 195         MOV32mr <fi#4>, 1, %NOREG, 0, %ESI
 196         %EAX = LEA32r %ESI, 1, %EAX, -3
 197         %ESI = MOV32rm <fi#7>, 1, %NOREG, 0
 198         %ESI = MOV32rm %ESI, 1, %NOREG, 32
 199         %EDI = MOV32rr %EAX
 200         SHL32ri %EDI<def&use>, 4
 201         ADD32rr %EDI<def&use>, %ESI
 202         %XMM0 = MOVAPSrm %ECX, 1, %NOREG, 0
 203         %XMM1 = MOVAPSrr %XMM0
 204         SHUFPSrr %XMM1<def&use>, %XMM1, 170
 205         %XMM2 = MOVAPSrr %XMM0
 206         SHUFPSrr %XMM2<def&use>, %XMM2, 0
 207         %XMM3 = MOVAPSrr %XMM0
 208         SHUFPSrr %XMM3<def&use>, %XMM3, 255
 209         SHUFPSrr %XMM0<def&use>, %XMM0, 85
 210         %EBX = MOV32rr %EDI
 211         AND32ri8 %EBX<def&use>, 15
 212         CMP32ri8 %EBX, 0
 213         JE mbb<cond_next204,0xa914d30>
 214
 215 This looks really bad. The problem is shufps is a destructive opcode. Since it
 216 appears as operand two in more than one shufps ops. It resulted in a number of
 217 copies. Note icc also suffers from the same problem. Either the instruction
 218 selector should select pshufd or The register allocator can made the two-address
 219 to three-address transformation.
 220
 221 It also exposes some other problems. See MOV32ri -3 and the spills.
 222
 223 //===---------------------------------------------------------------------===//
 224
 225 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=25500
 226
 227 LLVM is producing bad code.
 228
 229 LBB_main_4:     # cond_true44
 230         addps %xmm1, %xmm2
 231         subps %xmm3, %xmm2
 232         movaps (%ecx), %xmm4
 233         movaps %xmm2, %xmm1
 234         addps %xmm4, %xmm1
 235         addl $16, %ecx
 236         incl %edx
 237         cmpl $262144, %edx
 238         movaps %xmm3, %xmm2
 239         movaps %xmm4, %xmm3
 240         jne LBB_main_4  # cond_true44
 241
 242 There are two problems. 1) No need to two loop induction variables. We can
 243 compare against 262144 * 16. 2) Known register coalescer issue. We should
 244 be able eliminate one of the movaps:
 245
 246         addps %xmm2, %xmm1    <=== Commute!
 247         subps %xmm3, %xmm1
 248         movaps (%ecx), %xmm4
 249         movaps %xmm1, %xmm1   <=== Eliminate!
 250         addps %xmm4, %xmm1
 251         addl $16, %ecx
 252         incl %edx
 253         cmpl $262144, %edx
 254         movaps %xmm3, %xmm2
 255         movaps %xmm4, %xmm3
 256         jne LBB_main_4  # cond_true44
 257
 258 //===---------------------------------------------------------------------===//
 259
 260 Consider:
 261
 262 __m128 test(float a) {
 263   return _mm_set_ps(0.0, 0.0, 0.0, a*a);
 264 }
 265
 266 This compiles into:
 267
 268 movss 4(%esp), %xmm1
 269 mulss %xmm1, %xmm1
 270 xorps %xmm0, %xmm0
 271 movss %xmm1, %xmm0
 272 ret
 273
 274 Because mulss doesn't modify the top 3 elements, the top elements of
 275 xmm1 are already zero'd.  We could compile this to:
 276
 277 movss 4(%esp), %xmm0
 278 mulss %xmm0, %xmm0
 279 ret
 280
 281 //===---------------------------------------------------------------------===//
 282
 283 Here's a sick and twisted idea.  Consider code like this:
 284
 285 __m128 test(__m128 a) {
 286   float b = *(float*)&A;
 287   ...
 288   return _mm_set_ps(0.0, 0.0, 0.0, b);
 289 }
 290
 291 This might compile to this code:
 292
 293 movaps c(%esp), %xmm1
 294 xorps %xmm0, %xmm0
 295 movss %xmm1, %xmm0
 296 ret
 297
 298 Now consider if the ... code caused xmm1 to get spilled.  This might produce
 299 this code:
 300
 301 movaps c(%esp), %xmm1
 302 movaps %xmm1, c2(%esp)
 303 ...
 304
 305 xorps %xmm0, %xmm0
 306 movaps c2(%esp), %xmm1
 307 movss %xmm1, %xmm0
 308 ret
 309
 310 However, since the reload is only used by these instructions, we could
 311 "fold" it into the uses, producing something like this:
 312
 313 movaps c(%esp), %xmm1
 314 movaps %xmm1, c2(%esp)
 315 ...
 316
 317 movss c2(%esp), %xmm0
 318 ret
 319
 320 ... saving two instructions.
 321
 322 The basic idea is that a reload from a spill slot, can, if only one 4-byte
 323 chunk is used, bring in 3 zeros the one element instead of 4 elements.
 324 This can be used to simplify a variety of shuffle operations, where the
 325 elements are fixed zeros.
 326
 327 //===---------------------------------------------------------------------===//
 328
 329 __m128d test1( __m128d A, __m128d B) {
 330   return _mm_shuffle_pd(A, B, 0x3);
 331 }
 332
 333 compiles to
 334
 335 shufpd $3, %xmm1, %xmm0
 336
 337 Perhaps it's better to use unpckhpd instead?
 338
 339 unpckhpd %xmm1, %xmm0
 340
 341 Don't know if unpckhpd is faster. But it is shorter.
 342
 343 //===---------------------------------------------------------------------===//
 344
 345 This code generates ugly code, probably due to costs being off or something:
 346
 347 define void @test(float* %P, <4 x float>* %P2 ) {
 348         %xFloat0.688 = load float* %P
 349         %tmp = load <4 x float>* %P2
 350         %inFloat3.713 = insertelement <4 x float> %tmp, float 0.0, i32 3
 351         store <4 x float> %inFloat3.713, <4 x float>* %P2
 352         ret void
 353 }
 354
 355 Generates:
 356
 357 _test:
 358         movl    8(%esp), %eax
 359         movaps  (%eax), %xmm0
 360         pxor    %xmm1, %xmm1
 361         movaps  %xmm0, %xmm2
 362         shufps  $50, %xmm1, %xmm2
 363         shufps  $132, %xmm2, %xmm0
 364         movaps  %xmm0, (%eax)
 365         ret
 366
 367 Would it be better to generate:
 368
 369 _test:
 370         movl 8(%esp), %ecx
 371         movaps (%ecx), %xmm0
 372         xor %eax, %eax
 373         pinsrw $6, %eax, %xmm0
 374         pinsrw $7, %eax, %xmm0
 375         movaps %xmm0, (%ecx)
 376         ret
 377
 378 ?
 379
 380 //===---------------------------------------------------------------------===//
 381
 382 Some useful information in the Apple Altivec / SSE Migration Guide:
 383
 384 http://developer.apple.com/documentation/Performance/Conceptual/
 385 Accelerate_sse_migration/index.html
 386
 387 e.g. SSE select using and, andnot, or. Various SSE compare translations.
 388
 389 //===---------------------------------------------------------------------===//
 390
 391 Add hooks to commute some CMPP operations.
 392
 393 //===---------------------------------------------------------------------===//
 394
 395 Apply the same transformation that merged four float into a single 128-bit load
 396 to loads from constant pool.
 397
 398 //===---------------------------------------------------------------------===//
 399
 400 Floating point max / min are commutable when -enable-unsafe-fp-path is
 401 specified. We should turn int_x86_sse_max_ss and X86ISD::FMIN etc. into other
 402 nodes which are selected to max / min instructions that are marked commutable.
 403
 404 //===---------------------------------------------------------------------===//
 405
 406 We should materialize vector constants like "all ones" and "signbit" with
 407 code like:
 408
 409      cmpeqps xmm1, xmm1   ; xmm1 = all-ones
 410
 411 and:
 412      cmpeqps xmm1, xmm1   ; xmm1 = all-ones
 413      psrlq   xmm1, 31     ; xmm1 = all 100000000000...
 414
 415 instead of using a load from the constant pool.  The later is important for
 416 ABS/NEG/copysign etc.
 417
 418 //===---------------------------------------------------------------------===//
 419
 420 These functions:
 421
 422 #include <xmmintrin.h>
 423 __m128i a;
 424 void x(unsigned short n) {
 425   a = _mm_slli_epi32 (a, n);
 426 }
 427 void y(unsigned n) {
 428   a = _mm_slli_epi32 (a, n);
 429 }
 430
 431 compile to ( -O3 -static -fomit-frame-pointer):
 432 _x:
 433         movzwl  4(%esp), %eax
 434         movd    %eax, %xmm0
 435         movaps  _a, %xmm1
 436         pslld   %xmm0, %xmm1
 437         movaps  %xmm1, _a
 438         ret
 439 _y:
 440         movd    4(%esp), %xmm0
 441         movaps  _a, %xmm1
 442         pslld   %xmm0, %xmm1
 443         movaps  %xmm1, _a
 444         ret
 445
 446 "y" looks good, but "x" does silly movzwl stuff around into a GPR.  It seems
 447 like movd would be sufficient in both cases as the value is already zero
 448 extended in the 32-bit stack slot IIRC.  For signed short, it should also be
 449 save, as a really-signed value would be undefined for pslld.
 450
 451
 452 //===---------------------------------------------------------------------===//
 453
 454 #include <math.h>
 455 int t1(double d) { return signbit(d); }
 456
 457 This currently compiles to:
 458         subl    $12, %esp
 459         movsd   16(%esp), %xmm0
 460         movsd   %xmm0, (%esp)
 461         movl    4(%esp), %eax
 462         shrl    $31, %eax
 463         addl    $12, %esp
 464         ret
 465
 466 We should use movmskp{s|d} instead.
 467
 468 //===---------------------------------------------------------------------===//
 469
 470 CodeGen/X86/vec_align.ll tests whether we can turn 4 scalar loads into a single
 471 (aligned) vector load.  This functionality has a couple of problems.
 472
 473 1. The code to infer alignment from loads of globals is in the X86 backend,
 474    not the dag combiner.  This is because dagcombine2 needs to be able to see
 475    through the X86ISD::Wrapper node, which DAGCombine can't really do.
 476 2. The code for turning 4 x load into a single vector load is target
 477    independent and should be moved to the dag combiner.
 478 3. The code for turning 4 x load into a vector load can only handle a direct
 479    load from a global or a direct load from the stack.  It should be generalized
 480    to handle any load from P, P+4, P+8, P+12, where P can be anything.
 481 4. The alignment inference code cannot handle loads from globals in non-static
 482    mode because it doesn't look through the extra dyld stub load.  If you try
 483    vec_align.ll without -relocation-model=static, you'll see what I mean.
 484
 485 //===---------------------------------------------------------------------===//
 486
 487 We should lower store(fneg(load p), q) into an integer load+xor+store, which
 488 eliminates a constant pool load.  For example, consider:
 489
 490 define i64 @ccosf(float %z.0, float %z.1) nounwind readonly  {
 491 entry:
 492  %tmp6 = fsub float -0.000000e+00, %z.1         ; <float> [#uses=1]
 493  %tmp20 = tail call i64 @ccoshf( float %tmp6, float %z.0 ) nounwind readonly
 494  ret i64 %tmp20
 495 }
 496
 497 This currently compiles to:
 498
 499 LCPI1_0:                                        #  <4 x float>
 500         .long   2147483648      # float -0
 501         .long   2147483648      # float -0
 502         .long   2147483648      # float -0
 503         .long   2147483648      # float -0
 504 _ccosf:
 505         subl    $12, %esp
 506         movss   16(%esp), %xmm0
 507         movss   %xmm0, 4(%esp)
 508         movss   20(%esp), %xmm0
 509         xorps   LCPI1_0, %xmm0
 510         movss   %xmm0, (%esp)
 511         call    L_ccoshf$stub
 512         addl    $12, %esp
 513         ret
 514
 515 Note the load into xmm0, then xor (to negate), then store.  In PIC mode,
 516 this code computes the pic base and does two loads to do the constant pool
 517 load, so the improvement is much bigger.
 518
 519 The tricky part about this xform is that the argument load/store isn't exposed
 520 until post-legalize, and at that point, the fneg has been custom expanded into
 521 an X86 fxor.  This means that we need to handle this case in the x86 backend
 522 instead of in target independent code.
 523
 524 //===---------------------------------------------------------------------===//
 525
 526 Non-SSE4 insert into 16 x i8 is atrociously bad.
 527
 528 //===---------------------------------------------------------------------===//
 529
 530 <2 x i64> extract is substantially worse than <2 x f64>, even if the destination
 531 is memory.
 532
 533 //===---------------------------------------------------------------------===//
 534
 535 SSE4 extract-to-mem ops aren't being pattern matched because of the AssertZext
 536 sitting between the truncate and the extract.
 537
 538 //===---------------------------------------------------------------------===//
 539
 540 INSERTPS can match any insert (extract, imm1), imm2 for 4 x float, and insert
 541 any number of 0.0 simultaneously.  Currently we only use it for simple
 542 insertions.
 543
 544 See comments in LowerINSERT_VECTOR_ELT_SSE4.
 545
 546 //===---------------------------------------------------------------------===//
 547
 548 On a random note, SSE2 should declare insert/extract of 2 x f64 as legal, not
 549 Custom.  All combinations of insert/extract reg-reg, reg-mem, and mem-reg are
 550 legal, it'll just take a few extra patterns written in the .td file.
 551
 552 Note: this is not a code quality issue; the custom lowered code happens to be
 553 right, but we shouldn't have to custom lower anything.  This is probably related
 554 to <2 x i64> ops being so bad.
 555
 556 //===---------------------------------------------------------------------===//
 557
 558 'select' on vectors and scalars could be a whole lot better.  We currently
 559 lower them to conditional branches.  On x86-64 for example, we compile this:
 560
 561 double test(double a, double b, double c, double d) { return a<b ? c : d; }
 562
 563 to:
 564
 565 _test:
 566         ucomisd %xmm0, %xmm1
 567         ja      LBB1_2  # entry
 568 LBB1_1: # entry
 569         movapd  %xmm3, %xmm2
 570 LBB1_2: # entry
 571         movapd  %xmm2, %xmm0
 572         ret
 573
 574 instead of:
 575
 576 _test:
 577         cmpltsd %xmm1, %xmm0
 578         andpd   %xmm0, %xmm2
 579         andnpd  %xmm3, %xmm0
 580         orpd    %xmm2, %xmm0
 581         ret
 582
 583 For unpredictable branches, the later is much more efficient.  This should
 584 just be a matter of having scalar sse map to SELECT_CC and custom expanding
 585 or iseling it.
 586
 587 //===---------------------------------------------------------------------===//
 588
 589 LLVM currently generates stack realignment code, when it is not necessary
 590 needed. The problem is that we need to know about stack alignment too early,
 591 before RA runs.
 592
 593 At that point we don't know, whether there will be vector spill, or not.
 594 Stack realignment logic is overly conservative here, but otherwise we can
 595 produce unaligned loads/stores.
 596
 597 Fixing this will require some huge RA changes.
 598
 599 Testcase:
 600 #include <emmintrin.h>
 601
 602 typedef short vSInt16 __attribute__ ((__vector_size__ (16)));
 603
 604 static const vSInt16 a = {- 22725, - 12873, - 22725, - 12873, - 22725, - 12873,
 605 - 22725, - 12873};;
 606
 607 vSInt16 madd(vSInt16 b)
 608 {
 609     return _mm_madd_epi16(a, b);
 610 }
 611
 612 Generated code (x86-32, linux):
 613 madd:
 614         pushl   %ebp
 615         movl    %esp, %ebp
 616         andl    $-16, %esp
 617         movaps  .LCPI1_0, %xmm1
 618         pmaddwd %xmm1, %xmm0
 619         movl    %ebp, %esp
 620         popl    %ebp
 621         ret
 622
 623 //===---------------------------------------------------------------------===//
 624
 625 Consider:
 626 #include <emmintrin.h>
 627 __m128 foo2 (float x) {
 628  return _mm_set_ps (0, 0, x, 0);
 629 }
 630
 631 In x86-32 mode, we generate this spiffy code:
 632
 633 _foo2:
 634         movss   4(%esp), %xmm0
 635         pshufd  $81, %xmm0, %xmm0
 636         ret
 637
 638 in x86-64 mode, we generate this code, which could be better:
 639
 640 _foo2:
 641         xorps   %xmm1, %xmm1
 642         movss   %xmm0, %xmm1
 643         pshufd  $81, %xmm1, %xmm0
 644         ret
 645
 646 In sse4 mode, we could use insertps to make both better.
 647
 648 Here's another testcase that could use insertps [mem]:
 649
 650 #include <xmmintrin.h>
 651 extern float x2, x3;
 652 __m128 foo1 (float x1, float x4) {
 653  return _mm_set_ps (x2, x1, x3, x4);
 654 }
 655
 656 gcc mainline compiles it to:
 657
 658 foo1:
 659        insertps        $0x10, x2(%rip), %xmm0
 660        insertps        $0x10, x3(%rip), %xmm1
 661        movaps  %xmm1, %xmm2
 662        movlhps %xmm0, %xmm2
 663        movaps  %xmm2, %xmm0
 664        ret
 665
 666 //===---------------------------------------------------------------------===//
 667
 668 We compile vector multiply-by-constant into poor code:
 669
 670 define <4 x i32> @f(<4 x i32> %i) nounwind  {
 671         %A = mul <4 x i32> %i, < i32 10, i32 10, i32 10, i32 10 >
 672         ret <4 x i32> %A
 673 }
 674
 675 On targets without SSE4.1, this compiles into:
 676
 677 LCPI1_0:                                        ##  <4 x i32>
 678         .long   10
 679         .long   10
 680         .long   10
 681         .long   10
 682         .text
 683         .align  4,0x90
 684         .globl  _f
 685 _f:
 686         pshufd  $3, %xmm0, %xmm1
 687         movd    %xmm1, %eax
 688         imull   LCPI1_0+12, %eax
 689         movd    %eax, %xmm1
 690         pshufd  $1, %xmm0, %xmm2
 691         movd    %xmm2, %eax
 692         imull   LCPI1_0+4, %eax
 693         movd    %eax, %xmm2
 694         punpckldq       %xmm1, %xmm2
 695         movd    %xmm0, %eax
 696         imull   LCPI1_0, %eax
 697         movd    %eax, %xmm1
 698         movhlps %xmm0, %xmm0
 699         movd    %xmm0, %eax
 700         imull   LCPI1_0+8, %eax
 701         movd    %eax, %xmm0
 702         punpckldq       %xmm0, %xmm1
 703         movaps  %xmm1, %xmm0
 704         punpckldq       %xmm2, %xmm0
 705         ret
 706
 707 It would be better to synthesize integer vector multiplication by constants
 708 using shifts and adds, pslld and paddd here. And even on targets with SSE4.1,
 709 simple cases such as multiplication by powers of two would be better as
 710 vector shifts than as multiplications.
 711
 712 //===---------------------------------------------------------------------===//
 713
 714 We compile this:
 715
 716 __m128i
 717 foo2 (char x)
 718 {
 719   return _mm_set_epi8 (1, 0, 0, 0, 0, 0, 0, 0, 0, x, 0, 1, 0, 0, 0, 0);
 720 }
 721
 722 into:
 723         movl    $1, %eax
 724         xorps   %xmm0, %xmm0
 725         pinsrw  $2, %eax, %xmm0
 726         movzbl  4(%esp), %eax
 727         pinsrw  $3, %eax, %xmm0
 728         movl    $256, %eax
 729         pinsrw  $7, %eax, %xmm0
 730         ret
 731
 732
 733 gcc-4.2:
 734         subl    $12, %esp
 735         movzbl  16(%esp), %eax
 736         movdqa  LC0, %xmm0
 737         pinsrw  $3, %eax, %xmm0
 738         addl    $12, %esp
 739         ret
 740         .const
 741         .align 4
 742 LC0:
 743         .word   0
 744         .word   0
 745         .word   1
 746         .word   0
 747         .word   0
 748         .word   0
 749         .word   0
 750         .word   256
 751
 752 With SSE4, it should be
 753       movdqa  .LC0(%rip), %xmm0
 754       pinsrb  $6, %edi, %xmm0
 755
 756 //===---------------------------------------------------------------------===//
 757
 758 We should transform a shuffle of two vectors of constants into a single vector
 759 of constants. Also, insertelement of a constant into a vector of constants
 760 should also result in a vector of constants. e.g. 2008-06-25-VecISelBug.ll.
 761
 762 We compiled it to something horrible:
 763
 764         .align  4
 765 LCPI1_1:                                        ##  float
 766         .long   1065353216      ## float 1
 767         .const
 768
 769         .align  4
 770 LCPI1_0:                                        ##  <4 x float>
 771         .space  4
 772         .long   1065353216      ## float 1
 773         .space  4
 774         .long   1065353216      ## float 1
 775         .text
 776         .align  4,0x90
 777         .globl  _t
 778 _t:
 779         xorps   %xmm0, %xmm0
 780         movhps  LCPI1_0, %xmm0
 781         movss   LCPI1_1, %xmm1
 782         movaps  %xmm0, %xmm2
 783         shufps  $2, %xmm1, %xmm2
 784         shufps  $132, %xmm2, %xmm0
 785         movaps  %xmm0, 0
 786
 787 //===---------------------------------------------------------------------===//
 788 rdar://5907648
 789
 790 This function:
 791
 792 float foo(unsigned char x) {
 793   return x;
 794 }
 795
 796 compiles to (x86-32):
 797
 798 define float @foo(i8 zeroext  %x) nounwind  {
 799         %tmp12 = uitofp i8 %x to float          ; <float> [#uses=1]
 800         ret float %tmp12
 801 }
 802
 803 compiles to:
 804
 805 _foo:
 806         subl    $4, %esp
 807         movzbl  8(%esp), %eax
 808         cvtsi2ss        %eax, %xmm0
 809         movss   %xmm0, (%esp)
 810         flds    (%esp)
 811         addl    $4, %esp
 812         ret
 813
 814 We should be able to use:
 815   cvtsi2ss 8($esp), %xmm0
 816 since we know the stack slot is already zext'd.
 817
 818 //===---------------------------------------------------------------------===//
 819
 820 Consider using movlps instead of movsd to implement (scalar_to_vector (loadf64))
 821 when code size is critical. movlps is slower than movsd on core2 but it's one
 822 byte shorter.
 823
 824 //===---------------------------------------------------------------------===//
 825
 826 We should use a dynamic programming based approach to tell when using FPStack
 827 operations is cheaper than SSE.  SciMark montecarlo contains code like this
 828 for example:
 829
 830 double MonteCarlo_num_flops(int Num_samples) {
 831     return ((double) Num_samples)* 4.0;
 832 }
 833
 834 In fpstack mode, this compiles into:
 835
 836 LCPI1_0:
 837         .long   1082130432      ## float 4.000000e+00
 838 _MonteCarlo_num_flops:
 839         subl    $4, %esp
 840         movl    8(%esp), %eax
 841         movl    %eax, (%esp)
 842         fildl   (%esp)
 843         fmuls   LCPI1_0
 844         addl    $4, %esp
 845         ret
 846
 847 in SSE mode, it compiles into significantly slower code:
 848
 849 _MonteCarlo_num_flops:
 850         subl    $12, %esp
 851         cvtsi2sd        16(%esp), %xmm0
 852         mulsd   LCPI1_0, %xmm0
 853         movsd   %xmm0, (%esp)
 854         fldl    (%esp)
 855         addl    $12, %esp
 856         ret
 857
 858 There are also other cases in scimark where using fpstack is better, it is
 859 cheaper to do fld1 than load from a constant pool for example, so
 860 "load, add 1.0, store" is better done in the fp stack, etc.
 861
 862 //===---------------------------------------------------------------------===//
 863
 864 The X86 backend should be able to if-convert SSE comparisons like "ucomisd" to
 865 "cmpsd".  For example, this code:
 866
 867 double d1(double x) { return x == x ? x : x + x; }
 868
 869 Compiles into:
 870
 871 _d1:
 872         ucomisd %xmm0, %xmm0
 873         jnp     LBB1_2
 874         addsd   %xmm0, %xmm0
 875         ret
 876 LBB1_2:
 877         ret
 878
 879 Also, the 'ret's should be shared.  This is PR6032.
 880
 881 //===---------------------------------------------------------------------===//
 882
 883 These should compile into the same code (PR6214): Perhaps instcombine should
 884 canonicalize the former into the later?
 885
 886 define float @foo(float %x) nounwind {
 887   %t = bitcast float %x to i32
 888   %s = and i32 %t, 2147483647
 889   %d = bitcast i32 %s to float
 890   ret float %d
 891 }
 892
 893 declare float @fabsf(float %n)
 894 define float @bar(float %x) nounwind {
 895   %d = call float @fabsf(float %x)
 896   ret float %d
 897 }
 898
 899 //===---------------------------------------------------------------------===//
 900
 901 This IR (from PR6194):
 902
 903 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
 904 target triple = "x86_64-apple-darwin10.0.0"
 905
 906 %0 = type { double, double }
 907 %struct.float3 = type { float, float, float }
 908
 909 define void @test(%0, %struct.float3* nocapture %res) nounwind noinline ssp {
 910 entry:
 911   %tmp18 = extractvalue %0 %0, 0                  ; <double> [#uses=1]
 912   %tmp19 = bitcast double %tmp18 to i64           ; <i64> [#uses=1]
 913   %tmp20 = zext i64 %tmp19 to i128                ; <i128> [#uses=1]
 914   %tmp10 = lshr i128 %tmp20, 32                   ; <i128> [#uses=1]
 915   %tmp11 = trunc i128 %tmp10 to i32               ; <i32> [#uses=1]
 916   %tmp12 = bitcast i32 %tmp11 to float            ; <float> [#uses=1]
 917   %tmp5 = getelementptr inbounds %struct.float3* %res, i64 0, i32 1 ; <float*> [#uses=1]
 918   store float %tmp12, float* %tmp5
 919   ret void
 920 }
 921
 922 Compiles to:
 923
 924 _test:                                  ## @test
 925         movd    %xmm0, %rax
 926         shrq    $32, %rax
 927         movl    %eax, 4(%rdi)
 928         ret
 929
 930 This would be better kept in the SSE unit by treating XMM0 as a 4xfloat and
 931 doing a shuffle from v[1] to v[0] then a float store.
 932
 933 //===---------------------------------------------------------------------===//