lib/Target/X86/README-SSE.txt

   1 //===---------------------------------------------------------------------===//
   2 // Random ideas for the X86 backend: SSE-specific stuff.
   3 //===---------------------------------------------------------------------===//
   4
   5 //===---------------------------------------------------------------------===//
   6
   7 SSE Variable shift can be custom lowered to something like this, which uses a
   8 small table + unaligned load + shuffle instead of going through memory.
   9
  10 __m128i_shift_right:
  11         .byte     0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15
  12         .byte    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
  13
  14 ...
  15 __m128i shift_right(__m128i value, unsigned long offset) {
  16   return _mm_shuffle_epi8(value,
  17                _mm_loadu_si128((__m128 *) (___m128i_shift_right + offset)));
  18 }
  19
  20 //===---------------------------------------------------------------------===//
  21
  22 SSE has instructions for doing operations on complex numbers, we should pattern
  23 match them.  Compiling this:
  24
  25 _Complex float f32(_Complex float A, _Complex float B) {
  26   return A+B;
  27 }
  28
  29 into:
  30
  31 _f32:
  32         movdqa  %xmm0, %xmm2
  33         addss   %xmm1, %xmm2
  34         pshufd  $16, %xmm2, %xmm2
  35         pshufd  $1, %xmm1, %xmm1
  36         pshufd  $1, %xmm0, %xmm0
  37         addss   %xmm1, %xmm0
  38         pshufd  $16, %xmm0, %xmm1
  39         movdqa  %xmm2, %xmm0
  40         unpcklps        %xmm1, %xmm0
  41         ret
  42
  43 seems silly.
  44
  45
  46 //===---------------------------------------------------------------------===//
  47
  48 Expand libm rounding functions inline:  Significant speedups possible.
  49 http://gcc.gnu.org/ml/gcc-patches/2006-10/msg00909.html
  50
  51 //===---------------------------------------------------------------------===//
  52
  53 When compiled with unsafemath enabled, "main" should enable SSE DAZ mode and
  54 other fast SSE modes.
  55
  56 //===---------------------------------------------------------------------===//
  57
  58 Think about doing i64 math in SSE regs on x86-32.
  59
  60 //===---------------------------------------------------------------------===//
  61
  62 This testcase should have no SSE instructions in it, and only one load from
  63 a constant pool:
  64
  65 double %test3(bool %B) {
  66         %C = select bool %B, double 123.412, double 523.01123123
  67         ret double %C
  68 }
  69
  70 Currently, the select is being lowered, which prevents the dag combiner from
  71 turning 'select (load CPI1), (load CPI2)' -> 'load (select CPI1, CPI2)'
  72
  73 The pattern isel got this one right.
  74
  75 //===---------------------------------------------------------------------===//
  76
  77 SSE should implement 'select_cc' using 'emulated conditional moves' that use
  78 pcmp/pand/pandn/por to do a selection instead of a conditional branch:
  79
  80 double %X(double %Y, double %Z, double %A, double %B) {
  81         %C = setlt double %A, %B
  82         %z = fadd double %Z, 0.0    ;; select operand is not a load
  83         %D = select bool %C, double %Y, double %z
  84         ret double %D
  85 }
  86
  87 We currently emit:
  88
  89 _X:
  90         subl $12, %esp
  91         xorpd %xmm0, %xmm0
  92         addsd 24(%esp), %xmm0
  93         movsd 32(%esp), %xmm1
  94         movsd 16(%esp), %xmm2
  95         ucomisd 40(%esp), %xmm1
  96         jb LBB_X_2
  97 LBB_X_1:
  98         movsd %xmm0, %xmm2
  99 LBB_X_2:
 100         movsd %xmm2, (%esp)
 101         fldl (%esp)
 102         addl $12, %esp
 103         ret
 104
 105 //===---------------------------------------------------------------------===//
 106
 107 Lower memcpy / memset to a series of SSE 128 bit move instructions when it's
 108 feasible.
 109
 110 //===---------------------------------------------------------------------===//
 111
 112 Codegen:
 113   if (copysign(1.0, x) == copysign(1.0, y))
 114 into:
 115   if (x^y & mask)
 116 when using SSE.
 117
 118 //===---------------------------------------------------------------------===//
 119
 120 Use movhps to update upper 64-bits of a v4sf value. Also movlps on lower half
 121 of a v4sf value.
 122
 123 //===---------------------------------------------------------------------===//
 124
 125 Better codegen for vector_shuffles like this { x, 0, 0, 0 } or { x, 0, x, 0}.
 126 Perhaps use pxor / xorp* to clear a XMM register first?
 127
 128 //===---------------------------------------------------------------------===//
 129
 130 External test Nurbs exposed some problems. Look for
 131 __ZN15Nurbs_SSE_Cubic17TessellateSurfaceE, bb cond_next140. This is what icc
 132 emits:
 133
 134         movaps    (%edx), %xmm2                                 #59.21
 135         movaps    (%edx), %xmm5                                 #60.21
 136         movaps    (%edx), %xmm4                                 #61.21
 137         movaps    (%edx), %xmm3                                 #62.21
 138         movl      40(%ecx), %ebp                                #69.49
 139         shufps    $0, %xmm2, %xmm5                              #60.21
 140         movl      100(%esp), %ebx                               #69.20
 141         movl      (%ebx), %edi                                  #69.20
 142         imull     %ebp, %edi                                    #69.49
 143         addl      (%eax), %edi                                  #70.33
 144         shufps    $85, %xmm2, %xmm4                             #61.21
 145         shufps    $170, %xmm2, %xmm3                            #62.21
 146         shufps    $255, %xmm2, %xmm2                            #63.21
 147         lea       (%ebp,%ebp,2), %ebx                           #69.49
 148         negl      %ebx                                          #69.49
 149         lea       -3(%edi,%ebx), %ebx                           #70.33
 150         shll      $4, %ebx                                      #68.37
 151         addl      32(%ecx), %ebx                                #68.37
 152         testb     $15, %bl                                      #91.13
 153         jne       L_B1.24       # Prob 5%                       #91.13
 154
 155 This is the llvm code after instruction scheduling:
 156
 157 cond_next140 (0xa910740, LLVM BB @0xa90beb0):
 158         %reg1078 = MOV32ri -3
 159         %reg1079 = ADD32rm %reg1078, %reg1068, 1, %NOREG, 0
 160         %reg1037 = MOV32rm %reg1024, 1, %NOREG, 40
 161         %reg1080 = IMUL32rr %reg1079, %reg1037
 162         %reg1081 = MOV32rm %reg1058, 1, %NOREG, 0
 163         %reg1038 = LEA32r %reg1081, 1, %reg1080, -3
 164         %reg1036 = MOV32rm %reg1024, 1, %NOREG, 32
 165         %reg1082 = SHL32ri %reg1038, 4
 166         %reg1039 = ADD32rr %reg1036, %reg1082
 167         %reg1083 = MOVAPSrm %reg1059, 1, %NOREG, 0
 168         %reg1034 = SHUFPSrr %reg1083, %reg1083, 170
 169         %reg1032 = SHUFPSrr %reg1083, %reg1083, 0
 170         %reg1035 = SHUFPSrr %reg1083, %reg1083, 255
 171         %reg1033 = SHUFPSrr %reg1083, %reg1083, 85
 172         %reg1040 = MOV32rr %reg1039
 173         %reg1084 = AND32ri8 %reg1039, 15
 174         CMP32ri8 %reg1084, 0
 175         JE mbb<cond_next204,0xa914d30>
 176
 177 Still ok. After register allocation:
 178
 179 cond_next140 (0xa910740, LLVM BB @0xa90beb0):
 180         %EAX = MOV32ri -3
 181         %EDX = MOV32rm <fi#3>, 1, %NOREG, 0
 182         ADD32rm %EAX<def&use>, %EDX, 1, %NOREG, 0
 183         %EDX = MOV32rm <fi#7>, 1, %NOREG, 0
 184         %EDX = MOV32rm %EDX, 1, %NOREG, 40
 185         IMUL32rr %EAX<def&use>, %EDX
 186         %ESI = MOV32rm <fi#5>, 1, %NOREG, 0
 187         %ESI = MOV32rm %ESI, 1, %NOREG, 0
 188         MOV32mr <fi#4>, 1, %NOREG, 0, %ESI
 189         %EAX = LEA32r %ESI, 1, %EAX, -3
 190         %ESI = MOV32rm <fi#7>, 1, %NOREG, 0
 191         %ESI = MOV32rm %ESI, 1, %NOREG, 32
 192         %EDI = MOV32rr %EAX
 193         SHL32ri %EDI<def&use>, 4
 194         ADD32rr %EDI<def&use>, %ESI
 195         %XMM0 = MOVAPSrm %ECX, 1, %NOREG, 0
 196         %XMM1 = MOVAPSrr %XMM0
 197         SHUFPSrr %XMM1<def&use>, %XMM1, 170
 198         %XMM2 = MOVAPSrr %XMM0
 199         SHUFPSrr %XMM2<def&use>, %XMM2, 0
 200         %XMM3 = MOVAPSrr %XMM0
 201         SHUFPSrr %XMM3<def&use>, %XMM3, 255
 202         SHUFPSrr %XMM0<def&use>, %XMM0, 85
 203         %EBX = MOV32rr %EDI
 204         AND32ri8 %EBX<def&use>, 15
 205         CMP32ri8 %EBX, 0
 206         JE mbb<cond_next204,0xa914d30>
 207
 208 This looks really bad. The problem is shufps is a destructive opcode. Since it
 209 appears as operand two in more than one shufps ops. It resulted in a number of
 210 copies. Note icc also suffers from the same problem. Either the instruction
 211 selector should select pshufd or The register allocator can made the two-address
 212 to three-address transformation.
 213
 214 It also exposes some other problems. See MOV32ri -3 and the spills.
 215
 216 //===---------------------------------------------------------------------===//
 217
 218 Consider:
 219
 220 __m128 test(float a) {
 221   return _mm_set_ps(0.0, 0.0, 0.0, a*a);
 222 }
 223
 224 This compiles into:
 225
 226 movss 4(%esp), %xmm1
 227 mulss %xmm1, %xmm1
 228 xorps %xmm0, %xmm0
 229 movss %xmm1, %xmm0
 230 ret
 231
 232 Because mulss doesn't modify the top 3 elements, the top elements of
 233 xmm1 are already zero'd.  We could compile this to:
 234
 235 movss 4(%esp), %xmm0
 236 mulss %xmm0, %xmm0
 237 ret
 238
 239 //===---------------------------------------------------------------------===//
 240
 241 Here's a sick and twisted idea.  Consider code like this:
 242
 243 __m128 test(__m128 a) {
 244   float b = *(float*)&A;
 245   ...
 246   return _mm_set_ps(0.0, 0.0, 0.0, b);
 247 }
 248
 249 This might compile to this code:
 250
 251 movaps c(%esp), %xmm1
 252 xorps %xmm0, %xmm0
 253 movss %xmm1, %xmm0
 254 ret
 255
 256 Now consider if the ... code caused xmm1 to get spilled.  This might produce
 257 this code:
 258
 259 movaps c(%esp), %xmm1
 260 movaps %xmm1, c2(%esp)
 261 ...
 262
 263 xorps %xmm0, %xmm0
 264 movaps c2(%esp), %xmm1
 265 movss %xmm1, %xmm0
 266 ret
 267
 268 However, since the reload is only used by these instructions, we could
 269 "fold" it into the uses, producing something like this:
 270
 271 movaps c(%esp), %xmm1
 272 movaps %xmm1, c2(%esp)
 273 ...
 274
 275 movss c2(%esp), %xmm0
 276 ret
 277
 278 ... saving two instructions.
 279
 280 The basic idea is that a reload from a spill slot, can, if only one 4-byte
 281 chunk is used, bring in 3 zeros the one element instead of 4 elements.
 282 This can be used to simplify a variety of shuffle operations, where the
 283 elements are fixed zeros.
 284
 285 //===---------------------------------------------------------------------===//
 286
 287 This code generates ugly code, probably due to costs being off or something:
 288
 289 define void @test(float* %P, <4 x float>* %P2 ) {
 290         %xFloat0.688 = load float* %P
 291         %tmp = load <4 x float>* %P2
 292         %inFloat3.713 = insertelement <4 x float> %tmp, float 0.0, i32 3
 293         store <4 x float> %inFloat3.713, <4 x float>* %P2
 294         ret void
 295 }
 296
 297 Generates:
 298
 299 _test:
 300         movl    8(%esp), %eax
 301         movaps  (%eax), %xmm0
 302         pxor    %xmm1, %xmm1
 303         movaps  %xmm0, %xmm2
 304         shufps  $50, %xmm1, %xmm2
 305         shufps  $132, %xmm2, %xmm0
 306         movaps  %xmm0, (%eax)
 307         ret
 308
 309 Would it be better to generate:
 310
 311 _test:
 312         movl 8(%esp), %ecx
 313         movaps (%ecx), %xmm0
 314         xor %eax, %eax
 315         pinsrw $6, %eax, %xmm0
 316         pinsrw $7, %eax, %xmm0
 317         movaps %xmm0, (%ecx)
 318         ret
 319
 320 ?
 321
 322 //===---------------------------------------------------------------------===//
 323
 324 Some useful information in the Apple Altivec / SSE Migration Guide:
 325
 326 http://developer.apple.com/documentation/Performance/Conceptual/
 327 Accelerate_sse_migration/index.html
 328
 329 e.g. SSE select using and, andnot, or. Various SSE compare translations.
 330
 331 //===---------------------------------------------------------------------===//
 332
 333 Add hooks to commute some CMPP operations.
 334
 335 //===---------------------------------------------------------------------===//
 336
 337 Apply the same transformation that merged four float into a single 128-bit load
 338 to loads from constant pool.
 339
 340 //===---------------------------------------------------------------------===//
 341
 342 Floating point max / min are commutable when -enable-unsafe-fp-path is
 343 specified. We should turn int_x86_sse_max_ss and X86ISD::FMIN etc. into other
 344 nodes which are selected to max / min instructions that are marked commutable.
 345
 346 //===---------------------------------------------------------------------===//
 347
 348 We should materialize vector constants like "all ones" and "signbit" with
 349 code like:
 350
 351      cmpeqps xmm1, xmm1   ; xmm1 = all-ones
 352
 353 and:
 354      cmpeqps xmm1, xmm1   ; xmm1 = all-ones
 355      psrlq   xmm1, 31     ; xmm1 = all 100000000000...
 356
 357 instead of using a load from the constant pool.  The later is important for
 358 ABS/NEG/copysign etc.
 359
 360 //===---------------------------------------------------------------------===//
 361
 362 These functions:
 363
 364 #include <xmmintrin.h>
 365 __m128i a;
 366 void x(unsigned short n) {
 367   a = _mm_slli_epi32 (a, n);
 368 }
 369 void y(unsigned n) {
 370   a = _mm_slli_epi32 (a, n);
 371 }
 372
 373 compile to ( -O3 -static -fomit-frame-pointer):
 374 _x:
 375         movzwl  4(%esp), %eax
 376         movd    %eax, %xmm0
 377         movaps  _a, %xmm1
 378         pslld   %xmm0, %xmm1
 379         movaps  %xmm1, _a
 380         ret
 381 _y:
 382         movd    4(%esp), %xmm0
 383         movaps  _a, %xmm1
 384         pslld   %xmm0, %xmm1
 385         movaps  %xmm1, _a
 386         ret
 387
 388 "y" looks good, but "x" does silly movzwl stuff around into a GPR.  It seems
 389 like movd would be sufficient in both cases as the value is already zero
 390 extended in the 32-bit stack slot IIRC.  For signed short, it should also be
 391 save, as a really-signed value would be undefined for pslld.
 392
 393
 394 //===---------------------------------------------------------------------===//
 395
 396 #include <math.h>
 397 int t1(double d) { return signbit(d); }
 398
 399 This currently compiles to:
 400         subl    $12, %esp
 401         movsd   16(%esp), %xmm0
 402         movsd   %xmm0, (%esp)
 403         movl    4(%esp), %eax
 404         shrl    $31, %eax
 405         addl    $12, %esp
 406         ret
 407
 408 We should use movmskp{s|d} instead.
 409
 410 //===---------------------------------------------------------------------===//
 411
 412 CodeGen/X86/vec_align.ll tests whether we can turn 4 scalar loads into a single
 413 (aligned) vector load.  This functionality has a couple of problems.
 414
 415 1. The code to infer alignment from loads of globals is in the X86 backend,
 416    not the dag combiner.  This is because dagcombine2 needs to be able to see
 417    through the X86ISD::Wrapper node, which DAGCombine can't really do.
 418 2. The code for turning 4 x load into a single vector load is target
 419    independent and should be moved to the dag combiner.
 420 3. The code for turning 4 x load into a vector load can only handle a direct
 421    load from a global or a direct load from the stack.  It should be generalized
 422    to handle any load from P, P+4, P+8, P+12, where P can be anything.
 423 4. The alignment inference code cannot handle loads from globals in non-static
 424    mode because it doesn't look through the extra dyld stub load.  If you try
 425    vec_align.ll without -relocation-model=static, you'll see what I mean.
 426
 427 //===---------------------------------------------------------------------===//
 428
 429 We should lower store(fneg(load p), q) into an integer load+xor+store, which
 430 eliminates a constant pool load.  For example, consider:
 431
 432 define i64 @ccosf(float %z.0, float %z.1) nounwind readonly  {
 433 entry:
 434  %tmp6 = fsub float -0.000000e+00, %z.1         ; <float> [#uses=1]
 435  %tmp20 = tail call i64 @ccoshf( float %tmp6, float %z.0 ) nounwind readonly
 436  ret i64 %tmp20
 437 }
 438 declare i64 @ccoshf(float %z.0, float %z.1) nounwind readonly
 439
 440 This currently compiles to:
 441
 442 LCPI1_0:                                        #  <4 x float>
 443         .long   2147483648      # float -0
 444         .long   2147483648      # float -0
 445         .long   2147483648      # float -0
 446         .long   2147483648      # float -0
 447 _ccosf:
 448         subl    $12, %esp
 449         movss   16(%esp), %xmm0
 450         movss   %xmm0, 4(%esp)
 451         movss   20(%esp), %xmm0
 452         xorps   LCPI1_0, %xmm0
 453         movss   %xmm0, (%esp)
 454         call    L_ccoshf$stub
 455         addl    $12, %esp
 456         ret
 457
 458 Note the load into xmm0, then xor (to negate), then store.  In PIC mode,
 459 this code computes the pic base and does two loads to do the constant pool
 460 load, so the improvement is much bigger.
 461
 462 The tricky part about this xform is that the argument load/store isn't exposed
 463 until post-legalize, and at that point, the fneg has been custom expanded into
 464 an X86 fxor.  This means that we need to handle this case in the x86 backend
 465 instead of in target independent code.
 466
 467 //===---------------------------------------------------------------------===//
 468
 469 Non-SSE4 insert into 16 x i8 is atrociously bad.
 470
 471 //===---------------------------------------------------------------------===//
 472
 473 <2 x i64> extract is substantially worse than <2 x f64>, even if the destination
 474 is memory.
 475
 476 //===---------------------------------------------------------------------===//
 477
 478 SSE4 extract-to-mem ops aren't being pattern matched because of the AssertZext
 479 sitting between the truncate and the extract.
 480
 481 //===---------------------------------------------------------------------===//
 482
 483 INSERTPS can match any insert (extract, imm1), imm2 for 4 x float, and insert
 484 any number of 0.0 simultaneously.  Currently we only use it for simple
 485 insertions.
 486
 487 See comments in LowerINSERT_VECTOR_ELT_SSE4.
 488
 489 //===---------------------------------------------------------------------===//
 490
 491 On a random note, SSE2 should declare insert/extract of 2 x f64 as legal, not
 492 Custom.  All combinations of insert/extract reg-reg, reg-mem, and mem-reg are
 493 legal, it'll just take a few extra patterns written in the .td file.
 494
 495 Note: this is not a code quality issue; the custom lowered code happens to be
 496 right, but we shouldn't have to custom lower anything.  This is probably related
 497 to <2 x i64> ops being so bad.
 498
 499 //===---------------------------------------------------------------------===//
 500
 501 'select' on vectors and scalars could be a whole lot better.  We currently
 502 lower them to conditional branches.  On x86-64 for example, we compile this:
 503
 504 double test(double a, double b, double c, double d) { return a<b ? c : d; }
 505
 506 to:
 507
 508 _test:
 509         ucomisd %xmm0, %xmm1
 510         ja      LBB1_2  # entry
 511 LBB1_1: # entry
 512         movapd  %xmm3, %xmm2
 513 LBB1_2: # entry
 514         movapd  %xmm2, %xmm0
 515         ret
 516
 517 instead of:
 518
 519 _test:
 520         cmpltsd %xmm1, %xmm0
 521         andpd   %xmm0, %xmm2
 522         andnpd  %xmm3, %xmm0
 523         orpd    %xmm2, %xmm0
 524         ret
 525
 526 For unpredictable branches, the later is much more efficient.  This should
 527 just be a matter of having scalar sse map to SELECT_CC and custom expanding
 528 or iseling it.
 529
 530 //===---------------------------------------------------------------------===//
 531
 532 LLVM currently generates stack realignment code, when it is not necessary
 533 needed. The problem is that we need to know about stack alignment too early,
 534 before RA runs.
 535
 536 At that point we don't know, whether there will be vector spill, or not.
 537 Stack realignment logic is overly conservative here, but otherwise we can
 538 produce unaligned loads/stores.
 539
 540 Fixing this will require some huge RA changes.
 541
 542 Testcase:
 543 #include <emmintrin.h>
 544
 545 typedef short vSInt16 __attribute__ ((__vector_size__ (16)));
 546
 547 static const vSInt16 a = {- 22725, - 12873, - 22725, - 12873, - 22725, - 12873,
 548 - 22725, - 12873};;
 549
 550 vSInt16 madd(vSInt16 b)
 551 {
 552     return _mm_madd_epi16(a, b);
 553 }
 554
 555 Generated code (x86-32, linux):
 556 madd:
 557         pushl   %ebp
 558         movl    %esp, %ebp
 559         andl    $-16, %esp
 560         movaps  .LCPI1_0, %xmm1
 561         pmaddwd %xmm1, %xmm0
 562         movl    %ebp, %esp
 563         popl    %ebp
 564         ret
 565
 566 //===---------------------------------------------------------------------===//
 567
 568 Consider:
 569 #include <emmintrin.h>
 570 __m128 foo2 (float x) {
 571  return _mm_set_ps (0, 0, x, 0);
 572 }
 573
 574 In x86-32 mode, we generate this spiffy code:
 575
 576 _foo2:
 577         movss   4(%esp), %xmm0
 578         pshufd  $81, %xmm0, %xmm0
 579         ret
 580
 581 in x86-64 mode, we generate this code, which could be better:
 582
 583 _foo2:
 584         xorps   %xmm1, %xmm1
 585         movss   %xmm0, %xmm1
 586         pshufd  $81, %xmm1, %xmm0
 587         ret
 588
 589 In sse4 mode, we could use insertps to make both better.
 590
 591 Here's another testcase that could use insertps [mem]:
 592
 593 #include <xmmintrin.h>
 594 extern float x2, x3;
 595 __m128 foo1 (float x1, float x4) {
 596  return _mm_set_ps (x2, x1, x3, x4);
 597 }
 598
 599 gcc mainline compiles it to:
 600
 601 foo1:
 602        insertps        $0x10, x2(%rip), %xmm0
 603        insertps        $0x10, x3(%rip), %xmm1
 604        movaps  %xmm1, %xmm2
 605        movlhps %xmm0, %xmm2
 606        movaps  %xmm2, %xmm0
 607        ret
 608
 609 //===---------------------------------------------------------------------===//
 610
 611 We compile vector multiply-by-constant into poor code:
 612
 613 define <4 x i32> @f(<4 x i32> %i) nounwind  {
 614         %A = mul <4 x i32> %i, < i32 10, i32 10, i32 10, i32 10 >
 615         ret <4 x i32> %A
 616 }
 617
 618 On targets without SSE4.1, this compiles into:
 619
 620 LCPI1_0:                                        ##  <4 x i32>
 621         .long   10
 622         .long   10
 623         .long   10
 624         .long   10
 625         .text
 626         .align  4,0x90
 627         .globl  _f
 628 _f:
 629         pshufd  $3, %xmm0, %xmm1
 630         movd    %xmm1, %eax
 631         imull   LCPI1_0+12, %eax
 632         movd    %eax, %xmm1
 633         pshufd  $1, %xmm0, %xmm2
 634         movd    %xmm2, %eax
 635         imull   LCPI1_0+4, %eax
 636         movd    %eax, %xmm2
 637         punpckldq       %xmm1, %xmm2
 638         movd    %xmm0, %eax
 639         imull   LCPI1_0, %eax
 640         movd    %eax, %xmm1
 641         movhlps %xmm0, %xmm0
 642         movd    %xmm0, %eax
 643         imull   LCPI1_0+8, %eax
 644         movd    %eax, %xmm0
 645         punpckldq       %xmm0, %xmm1
 646         movaps  %xmm1, %xmm0
 647         punpckldq       %xmm2, %xmm0
 648         ret
 649
 650 It would be better to synthesize integer vector multiplication by constants
 651 using shifts and adds, pslld and paddd here. And even on targets with SSE4.1,
 652 simple cases such as multiplication by powers of two would be better as
 653 vector shifts than as multiplications.
 654
 655 //===---------------------------------------------------------------------===//
 656
 657 We compile this:
 658
 659 __m128i
 660 foo2 (char x)
 661 {
 662   return _mm_set_epi8 (1, 0, 0, 0, 0, 0, 0, 0, 0, x, 0, 1, 0, 0, 0, 0);
 663 }
 664
 665 into:
 666         movl    $1, %eax
 667         xorps   %xmm0, %xmm0
 668         pinsrw  $2, %eax, %xmm0
 669         movzbl  4(%esp), %eax
 670         pinsrw  $3, %eax, %xmm0
 671         movl    $256, %eax
 672         pinsrw  $7, %eax, %xmm0
 673         ret
 674
 675
 676 gcc-4.2:
 677         subl    $12, %esp
 678         movzbl  16(%esp), %eax
 679         movdqa  LC0, %xmm0
 680         pinsrw  $3, %eax, %xmm0
 681         addl    $12, %esp
 682         ret
 683         .const
 684         .align 4
 685 LC0:
 686         .word   0
 687         .word   0
 688         .word   1
 689         .word   0
 690         .word   0
 691         .word   0
 692         .word   0
 693         .word   256
 694
 695 With SSE4, it should be
 696       movdqa  .LC0(%rip), %xmm0
 697       pinsrb  $6, %edi, %xmm0
 698
 699 //===---------------------------------------------------------------------===//
 700
 701 We should transform a shuffle of two vectors of constants into a single vector
 702 of constants. Also, insertelement of a constant into a vector of constants
 703 should also result in a vector of constants. e.g. 2008-06-25-VecISelBug.ll.
 704
 705 We compiled it to something horrible:
 706
 707         .align  4
 708 LCPI1_1:                                        ##  float
 709         .long   1065353216      ## float 1
 710         .const
 711
 712         .align  4
 713 LCPI1_0:                                        ##  <4 x float>
 714         .space  4
 715         .long   1065353216      ## float 1
 716         .space  4
 717         .long   1065353216      ## float 1
 718         .text
 719         .align  4,0x90
 720         .globl  _t
 721 _t:
 722         xorps   %xmm0, %xmm0
 723         movhps  LCPI1_0, %xmm0
 724         movss   LCPI1_1, %xmm1
 725         movaps  %xmm0, %xmm2
 726         shufps  $2, %xmm1, %xmm2
 727         shufps  $132, %xmm2, %xmm0
 728         movaps  %xmm0, 0
 729
 730 //===---------------------------------------------------------------------===//
 731 rdar://5907648
 732
 733 This function:
 734
 735 float foo(unsigned char x) {
 736   return x;
 737 }
 738
 739 compiles to (x86-32):
 740
 741 define float @foo(i8 zeroext  %x) nounwind  {
 742         %tmp12 = uitofp i8 %x to float          ; <float> [#uses=1]
 743         ret float %tmp12
 744 }
 745
 746 compiles to:
 747
 748 _foo:
 749         subl    $4, %esp
 750         movzbl  8(%esp), %eax
 751         cvtsi2ss        %eax, %xmm0
 752         movss   %xmm0, (%esp)
 753         flds    (%esp)
 754         addl    $4, %esp
 755         ret
 756
 757 We should be able to use:
 758   cvtsi2ss 8($esp), %xmm0
 759 since we know the stack slot is already zext'd.
 760
 761 //===---------------------------------------------------------------------===//
 762
 763 Consider using movlps instead of movsd to implement (scalar_to_vector (loadf64))
 764 when code size is critical. movlps is slower than movsd on core2 but it's one
 765 byte shorter.
 766
 767 //===---------------------------------------------------------------------===//
 768
 769 We should use a dynamic programming based approach to tell when using FPStack
 770 operations is cheaper than SSE.  SciMark montecarlo contains code like this
 771 for example:
 772
 773 double MonteCarlo_num_flops(int Num_samples) {
 774     return ((double) Num_samples)* 4.0;
 775 }
 776
 777 In fpstack mode, this compiles into:
 778
 779 LCPI1_0:
 780         .long   1082130432      ## float 4.000000e+00
 781 _MonteCarlo_num_flops:
 782         subl    $4, %esp
 783         movl    8(%esp), %eax
 784         movl    %eax, (%esp)
 785         fildl   (%esp)
 786         fmuls   LCPI1_0
 787         addl    $4, %esp
 788         ret
 789
 790 in SSE mode, it compiles into significantly slower code:
 791
 792 _MonteCarlo_num_flops:
 793         subl    $12, %esp
 794         cvtsi2sd        16(%esp), %xmm0
 795         mulsd   LCPI1_0, %xmm0
 796         movsd   %xmm0, (%esp)
 797         fldl    (%esp)
 798         addl    $12, %esp
 799         ret
 800
 801 There are also other cases in scimark where using fpstack is better, it is
 802 cheaper to do fld1 than load from a constant pool for example, so
 803 "load, add 1.0, store" is better done in the fp stack, etc.
 804
 805 //===---------------------------------------------------------------------===//
 806
 807 The X86 backend should be able to if-convert SSE comparisons like "ucomisd" to
 808 "cmpsd".  For example, this code:
 809
 810 double d1(double x) { return x == x ? x : x + x; }
 811
 812 Compiles into:
 813
 814 _d1:
 815         ucomisd %xmm0, %xmm0
 816         jnp     LBB1_2
 817         addsd   %xmm0, %xmm0
 818         ret
 819 LBB1_2:
 820         ret
 821
 822 Also, the 'ret's should be shared.  This is PR6032.
 823
 824 //===---------------------------------------------------------------------===//
 825
 826 These should compile into the same code (PR6214): Perhaps instcombine should
 827 canonicalize the former into the later?
 828
 829 define float @foo(float %x) nounwind {
 830   %t = bitcast float %x to i32
 831   %s = and i32 %t, 2147483647
 832   %d = bitcast i32 %s to float
 833   ret float %d
 834 }
 835
 836 declare float @fabsf(float %n)
 837 define float @bar(float %x) nounwind {
 838   %d = call float @fabsf(float %x)
 839   ret float %d
 840 }
 841
 842 //===---------------------------------------------------------------------===//
 843
 844 This IR (from PR6194):
 845
 846 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
 847 target triple = "x86_64-apple-darwin10.0.0"
 848
 849 %0 = type { double, double }
 850 %struct.float3 = type { float, float, float }
 851
 852 define void @test(%0, %struct.float3* nocapture %res) nounwind noinline ssp {
 853 entry:
 854   %tmp18 = extractvalue %0 %0, 0                  ; <double> [#uses=1]
 855   %tmp19 = bitcast double %tmp18 to i64           ; <i64> [#uses=1]
 856   %tmp20 = zext i64 %tmp19 to i128                ; <i128> [#uses=1]
 857   %tmp10 = lshr i128 %tmp20, 32                   ; <i128> [#uses=1]
 858   %tmp11 = trunc i128 %tmp10 to i32               ; <i32> [#uses=1]
 859   %tmp12 = bitcast i32 %tmp11 to float            ; <float> [#uses=1]
 860   %tmp5 = getelementptr inbounds %struct.float3* %res, i64 0, i32 1 ; <float*> [#uses=1]
 861   store float %tmp12, float* %tmp5
 862   ret void
 863 }
 864
 865 Compiles to:
 866
 867 _test:                                  ## @test
 868         movd    %xmm0, %rax
 869         shrq    $32, %rax
 870         movl    %eax, 4(%rdi)
 871         ret
 872
 873 This would be better kept in the SSE unit by treating XMM0 as a 4xfloat and
 874 doing a shuffle from v[1] to v[0] then a float store.
 875
 876 //===---------------------------------------------------------------------===//
 877
 878 On SSE4 machines, we compile this code:
 879
 880 define <2 x float> @test2(<2 x float> %Q, <2 x float> %R,
 881        <2 x float> *%P) nounwind {
 882   %Z = fadd <2 x float> %Q, %R
 883
 884   store <2 x float> %Z, <2 x float> *%P
 885   ret <2 x float> %Z
 886 }
 887
 888 into:
 889
 890 _test2:                                 ## @test2
 891 ## BB#0:
 892         insertps        $0, %xmm2, %xmm2
 893         insertps        $16, %xmm3, %xmm2
 894         insertps        $0, %xmm0, %xmm3
 895         insertps        $16, %xmm1, %xmm3
 896         addps   %xmm2, %xmm3
 897         movq    %xmm3, (%rdi)
 898         movaps  %xmm3, %xmm0
 899         pshufd  $1, %xmm3, %xmm1
 900                                         ## kill: XMM1<def> XMM1<kill>
 901         ret
 902
 903 The insertps's of $0 are pointless complex copies.
 904
 905 //===---------------------------------------------------------------------===//
 906
 907