lib/Target/X86/README-SSE.txt

   1 //===---------------------------------------------------------------------===//
   2 // Random ideas for the X86 backend: SSE-specific stuff.
   3 //===---------------------------------------------------------------------===//
   4
   5 - Consider eliminating the unaligned SSE load intrinsics, replacing them with
   6   unaligned LLVM load instructions.
   7
   8 //===---------------------------------------------------------------------===//
   9
  10 Expand libm rounding functions inline:  Significant speedups possible.
  11 http://gcc.gnu.org/ml/gcc-patches/2006-10/msg00909.html
  12
  13 //===---------------------------------------------------------------------===//
  14
  15 When compiled with unsafemath enabled, "main" should enable SSE DAZ mode and
  16 other fast SSE modes.
  17
  18 //===---------------------------------------------------------------------===//
  19
  20 Think about doing i64 math in SSE regs.
  21
  22 //===---------------------------------------------------------------------===//
  23
  24 This testcase should have no SSE instructions in it, and only one load from
  25 a constant pool:
  26
  27 double %test3(bool %B) {
  28         %C = select bool %B, double 123.412, double 523.01123123
  29         ret double %C
  30 }
  31
  32 Currently, the select is being lowered, which prevents the dag combiner from
  33 turning 'select (load CPI1), (load CPI2)' -> 'load (select CPI1, CPI2)'
  34
  35 The pattern isel got this one right.
  36
  37 //===---------------------------------------------------------------------===//
  38
  39 SSE doesn't have [mem] op= reg instructions.  If we have an SSE instruction
  40 like this:
  41
  42   X += y
  43
  44 and the register allocator decides to spill X, it is cheaper to emit this as:
  45
  46 Y += [xslot]
  47 store Y -> [xslot]
  48
  49 than as:
  50
  51 tmp = [xslot]
  52 tmp += y
  53 store tmp -> [xslot]
  54
  55 ..and this uses one fewer register (so this should be done at load folding
  56 time, not at spiller time).  *Note* however that this can only be done
  57 if Y is dead.  Here's a testcase:
  58
  59 @.str_3 = external global [15 x i8]             ; <[15 x i8]*> [#uses=0]
  60 declare void @printf(i32, ...)
  61 define void @main() {
  62 build_tree.exit:
  63         br label %no_exit.i7
  64
  65 no_exit.i7:             ; preds = %no_exit.i7, %build_tree.exit
  66         %tmp.0.1.0.i9 = phi double [ 0.000000e+00, %build_tree.exit ], [ %tmp.34.i18, %no_exit.i7 ]             ; <double> [#uses=1]
  67         %tmp.0.0.0.i10 = phi double [ 0.000000e+00, %build_tree.exit ], [ %tmp.28.i16, %no_exit.i7 ]            ; <double> [#uses=1]
  68         %tmp.28.i16 = add double %tmp.0.0.0.i10, 0.000000e+00           ; <double> [#uses=1]
  69         %tmp.34.i18 = add double %tmp.0.1.0.i9, 0.000000e+00            ; <double> [#uses=2]
  70         br i1 false, label %Compute_Tree.exit23, label %no_exit.i7
  71
  72 Compute_Tree.exit23:            ; preds = %no_exit.i7
  73         tail call void (i32, ...)* @printf( i32 0 )
  74         store double %tmp.34.i18, double* null
  75         ret void
  76 }
  77
  78 We currently emit:
  79
  80 .BBmain_1:
  81         xorpd %XMM1, %XMM1
  82         addsd %XMM0, %XMM1
  83 ***     movsd %XMM2, QWORD PTR [%ESP + 8]
  84 ***     addsd %XMM2, %XMM1
  85 ***     movsd QWORD PTR [%ESP + 8], %XMM2
  86         jmp .BBmain_1   # no_exit.i7
  87
  88 This is a bugpoint reduced testcase, which is why the testcase doesn't make
  89 much sense (e.g. its an infinite loop). :)
  90
  91 //===---------------------------------------------------------------------===//
  92
  93 SSE should implement 'select_cc' using 'emulated conditional moves' that use
  94 pcmp/pand/pandn/por to do a selection instead of a conditional branch:
  95
  96 double %X(double %Y, double %Z, double %A, double %B) {
  97         %C = setlt double %A, %B
  98         %z = add double %Z, 0.0    ;; select operand is not a load
  99         %D = select bool %C, double %Y, double %z
 100         ret double %D
 101 }
 102
 103 We currently emit:
 104
 105 _X:
 106         subl $12, %esp
 107         xorpd %xmm0, %xmm0
 108         addsd 24(%esp), %xmm0
 109         movsd 32(%esp), %xmm1
 110         movsd 16(%esp), %xmm2
 111         ucomisd 40(%esp), %xmm1
 112         jb LBB_X_2
 113 LBB_X_1:
 114         movsd %xmm0, %xmm2
 115 LBB_X_2:
 116         movsd %xmm2, (%esp)
 117         fldl (%esp)
 118         addl $12, %esp
 119         ret
 120
 121 //===---------------------------------------------------------------------===//
 122
 123 It's not clear whether we should use pxor or xorps / xorpd to clear XMM
 124 registers. The choice may depend on subtarget information. We should do some
 125 more experiments on different x86 machines.
 126
 127 //===---------------------------------------------------------------------===//
 128
 129 Lower memcpy / memset to a series of SSE 128 bit move instructions when it's
 130 feasible.
 131
 132 //===---------------------------------------------------------------------===//
 133
 134 Codegen:
 135   if (copysign(1.0, x) == copysign(1.0, y))
 136 into:
 137   if (x^y & mask)
 138 when using SSE.
 139
 140 //===---------------------------------------------------------------------===//
 141
 142 Use movhps to update upper 64-bits of a v4sf value. Also movlps on lower half
 143 of a v4sf value.
 144
 145 //===---------------------------------------------------------------------===//
 146
 147 Better codegen for vector_shuffles like this { x, 0, 0, 0 } or { x, 0, x, 0}.
 148 Perhaps use pxor / xorp* to clear a XMM register first?
 149
 150 //===---------------------------------------------------------------------===//
 151
 152 How to decide when to use the "floating point version" of logical ops? Here are
 153 some code fragments:
 154
 155         movaps LCPI5_5, %xmm2
 156         divps %xmm1, %xmm2
 157         mulps %xmm2, %xmm3
 158         mulps 8656(%ecx), %xmm3
 159         addps 8672(%ecx), %xmm3
 160         andps LCPI5_6, %xmm2
 161         andps LCPI5_1, %xmm3
 162         por %xmm2, %xmm3
 163         movdqa %xmm3, (%edi)
 164
 165         movaps LCPI5_5, %xmm1
 166         divps %xmm0, %xmm1
 167         mulps %xmm1, %xmm3
 168         mulps 8656(%ecx), %xmm3
 169         addps 8672(%ecx), %xmm3
 170         andps LCPI5_6, %xmm1
 171         andps LCPI5_1, %xmm3
 172         orps %xmm1, %xmm3
 173         movaps %xmm3, 112(%esp)
 174         movaps %xmm3, (%ebx)
 175
 176 Due to some minor source change, the later case ended up using orps and movaps
 177 instead of por and movdqa. Does it matter?
 178
 179 //===---------------------------------------------------------------------===//
 180
 181 X86RegisterInfo::copyRegToReg() returns X86::MOVAPSrr for VR128. Is it possible
 182 to choose between movaps, movapd, and movdqa based on types of source and
 183 destination?
 184
 185 How about andps, andpd, and pand? Do we really care about the type of the packed
 186 elements? If not, why not always use the "ps" variants which are likely to be
 187 shorter.
 188
 189 //===---------------------------------------------------------------------===//
 190
 191 External test Nurbs exposed some problems. Look for
 192 __ZN15Nurbs_SSE_Cubic17TessellateSurfaceE, bb cond_next140. This is what icc
 193 emits:
 194
 195         movaps    (%edx), %xmm2                                 #59.21
 196         movaps    (%edx), %xmm5                                 #60.21
 197         movaps    (%edx), %xmm4                                 #61.21
 198         movaps    (%edx), %xmm3                                 #62.21
 199         movl      40(%ecx), %ebp                                #69.49
 200         shufps    $0, %xmm2, %xmm5                              #60.21
 201         movl      100(%esp), %ebx                               #69.20
 202         movl      (%ebx), %edi                                  #69.20
 203         imull     %ebp, %edi                                    #69.49
 204         addl      (%eax), %edi                                  #70.33
 205         shufps    $85, %xmm2, %xmm4                             #61.21
 206         shufps    $170, %xmm2, %xmm3                            #62.21
 207         shufps    $255, %xmm2, %xmm2                            #63.21
 208         lea       (%ebp,%ebp,2), %ebx                           #69.49
 209         negl      %ebx                                          #69.49
 210         lea       -3(%edi,%ebx), %ebx                           #70.33
 211         shll      $4, %ebx                                      #68.37
 212         addl      32(%ecx), %ebx                                #68.37
 213         testb     $15, %bl                                      #91.13
 214         jne       L_B1.24       # Prob 5%                       #91.13
 215
 216 This is the llvm code after instruction scheduling:
 217
 218 cond_next140 (0xa910740, LLVM BB @0xa90beb0):
 219         %reg1078 = MOV32ri -3
 220         %reg1079 = ADD32rm %reg1078, %reg1068, 1, %NOREG, 0
 221         %reg1037 = MOV32rm %reg1024, 1, %NOREG, 40
 222         %reg1080 = IMUL32rr %reg1079, %reg1037
 223         %reg1081 = MOV32rm %reg1058, 1, %NOREG, 0
 224         %reg1038 = LEA32r %reg1081, 1, %reg1080, -3
 225         %reg1036 = MOV32rm %reg1024, 1, %NOREG, 32
 226         %reg1082 = SHL32ri %reg1038, 4
 227         %reg1039 = ADD32rr %reg1036, %reg1082
 228         %reg1083 = MOVAPSrm %reg1059, 1, %NOREG, 0
 229         %reg1034 = SHUFPSrr %reg1083, %reg1083, 170
 230         %reg1032 = SHUFPSrr %reg1083, %reg1083, 0
 231         %reg1035 = SHUFPSrr %reg1083, %reg1083, 255
 232         %reg1033 = SHUFPSrr %reg1083, %reg1083, 85
 233         %reg1040 = MOV32rr %reg1039
 234         %reg1084 = AND32ri8 %reg1039, 15
 235         CMP32ri8 %reg1084, 0
 236         JE mbb<cond_next204,0xa914d30>
 237
 238 Still ok. After register allocation:
 239
 240 cond_next140 (0xa910740, LLVM BB @0xa90beb0):
 241         %EAX = MOV32ri -3
 242         %EDX = MOV32rm <fi#3>, 1, %NOREG, 0
 243         ADD32rm %EAX<def&use>, %EDX, 1, %NOREG, 0
 244         %EDX = MOV32rm <fi#7>, 1, %NOREG, 0
 245         %EDX = MOV32rm %EDX, 1, %NOREG, 40
 246         IMUL32rr %EAX<def&use>, %EDX
 247         %ESI = MOV32rm <fi#5>, 1, %NOREG, 0
 248         %ESI = MOV32rm %ESI, 1, %NOREG, 0
 249         MOV32mr <fi#4>, 1, %NOREG, 0, %ESI
 250         %EAX = LEA32r %ESI, 1, %EAX, -3
 251         %ESI = MOV32rm <fi#7>, 1, %NOREG, 0
 252         %ESI = MOV32rm %ESI, 1, %NOREG, 32
 253         %EDI = MOV32rr %EAX
 254         SHL32ri %EDI<def&use>, 4
 255         ADD32rr %EDI<def&use>, %ESI
 256         %XMM0 = MOVAPSrm %ECX, 1, %NOREG, 0
 257         %XMM1 = MOVAPSrr %XMM0
 258         SHUFPSrr %XMM1<def&use>, %XMM1, 170
 259         %XMM2 = MOVAPSrr %XMM0
 260         SHUFPSrr %XMM2<def&use>, %XMM2, 0
 261         %XMM3 = MOVAPSrr %XMM0
 262         SHUFPSrr %XMM3<def&use>, %XMM3, 255
 263         SHUFPSrr %XMM0<def&use>, %XMM0, 85
 264         %EBX = MOV32rr %EDI
 265         AND32ri8 %EBX<def&use>, 15
 266         CMP32ri8 %EBX, 0
 267         JE mbb<cond_next204,0xa914d30>
 268
 269 This looks really bad. The problem is shufps is a destructive opcode. Since it
 270 appears as operand two in more than one shufps ops. It resulted in a number of
 271 copies. Note icc also suffers from the same problem. Either the instruction
 272 selector should select pshufd or The register allocator can made the two-address
 273 to three-address transformation.
 274
 275 It also exposes some other problems. See MOV32ri -3 and the spills.
 276
 277 //===---------------------------------------------------------------------===//
 278
 279 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=25500
 280
 281 LLVM is producing bad code.
 282
 283 LBB_main_4:     # cond_true44
 284         addps %xmm1, %xmm2
 285         subps %xmm3, %xmm2
 286         movaps (%ecx), %xmm4
 287         movaps %xmm2, %xmm1
 288         addps %xmm4, %xmm1
 289         addl $16, %ecx
 290         incl %edx
 291         cmpl $262144, %edx
 292         movaps %xmm3, %xmm2
 293         movaps %xmm4, %xmm3
 294         jne LBB_main_4  # cond_true44
 295
 296 There are two problems. 1) No need to two loop induction variables. We can
 297 compare against 262144 * 16. 2) Known register coalescer issue. We should
 298 be able eliminate one of the movaps:
 299
 300         addps %xmm2, %xmm1    <=== Commute!
 301         subps %xmm3, %xmm1
 302         movaps (%ecx), %xmm4
 303         movaps %xmm1, %xmm1   <=== Eliminate!
 304         addps %xmm4, %xmm1
 305         addl $16, %ecx
 306         incl %edx
 307         cmpl $262144, %edx
 308         movaps %xmm3, %xmm2
 309         movaps %xmm4, %xmm3
 310         jne LBB_main_4  # cond_true44
 311
 312 //===---------------------------------------------------------------------===//
 313
 314 Consider:
 315
 316 __m128 test(float a) {
 317   return _mm_set_ps(0.0, 0.0, 0.0, a*a);
 318 }
 319
 320 This compiles into:
 321
 322 movss 4(%esp), %xmm1
 323 mulss %xmm1, %xmm1
 324 xorps %xmm0, %xmm0
 325 movss %xmm1, %xmm0
 326 ret
 327
 328 Because mulss doesn't modify the top 3 elements, the top elements of
 329 xmm1 are already zero'd.  We could compile this to:
 330
 331 movss 4(%esp), %xmm0
 332 mulss %xmm0, %xmm0
 333 ret
 334
 335 //===---------------------------------------------------------------------===//
 336
 337 Here's a sick and twisted idea.  Consider code like this:
 338
 339 __m128 test(__m128 a) {
 340   float b = *(float*)&A;
 341   ...
 342   return _mm_set_ps(0.0, 0.0, 0.0, b);
 343 }
 344
 345 This might compile to this code:
 346
 347 movaps c(%esp), %xmm1
 348 xorps %xmm0, %xmm0
 349 movss %xmm1, %xmm0
 350 ret
 351
 352 Now consider if the ... code caused xmm1 to get spilled.  This might produce
 353 this code:
 354
 355 movaps c(%esp), %xmm1
 356 movaps %xmm1, c2(%esp)
 357 ...
 358
 359 xorps %xmm0, %xmm0
 360 movaps c2(%esp), %xmm1
 361 movss %xmm1, %xmm0
 362 ret
 363
 364 However, since the reload is only used by these instructions, we could
 365 "fold" it into the uses, producing something like this:
 366
 367 movaps c(%esp), %xmm1
 368 movaps %xmm1, c2(%esp)
 369 ...
 370
 371 movss c2(%esp), %xmm0
 372 ret
 373
 374 ... saving two instructions.
 375
 376 The basic idea is that a reload from a spill slot, can, if only one 4-byte
 377 chunk is used, bring in 3 zeros the the one element instead of 4 elements.
 378 This can be used to simplify a variety of shuffle operations, where the
 379 elements are fixed zeros.
 380
 381 //===---------------------------------------------------------------------===//
 382
 383 For this:
 384
 385 #include <emmintrin.h>
 386 void test(__m128d *r, __m128d *A, double B) {
 387   *r = _mm_loadl_pd(*A, &B);
 388 }
 389
 390 We generates:
 391
 392         subl $12, %esp
 393         movsd 24(%esp), %xmm0
 394         movsd %xmm0, (%esp)
 395         movl 20(%esp), %eax
 396         movapd (%eax), %xmm0
 397         movlpd (%esp), %xmm0
 398         movl 16(%esp), %eax
 399         movapd %xmm0, (%eax)
 400         addl $12, %esp
 401         ret
 402
 403 icc generates:
 404
 405         movl      4(%esp), %edx                                 #3.6
 406         movl      8(%esp), %eax                                 #3.6
 407         movapd    (%eax), %xmm0                                 #4.22
 408         movlpd    12(%esp), %xmm0                               #4.8
 409         movapd    %xmm0, (%edx)                                 #4.3
 410         ret                                                     #5.1
 411
 412 So icc is smart enough to know that B is in memory so it doesn't load it and
 413 store it back to stack.
 414
 415 This should be fixed by eliminating the llvm.x86.sse2.loadl.pd intrinsic,
 416 lowering it to a load+insertelement instead.  Already match the load+shuffle
 417 as movlpd, so this should be easy.  We already get optimal code for:
 418
 419 define void @test2(<2 x double>* %r, <2 x double>* %A, double %B) {
 420 entry:
 421         %tmp2 = load <2 x double>* %A, align 16
 422         %tmp8 = insertelement <2 x double> %tmp2, double %B, i32 0
 423         store <2 x double> %tmp8, <2 x double>* %r, align 16
 424         ret void
 425 }
 426
 427 //===---------------------------------------------------------------------===//
 428
 429 __m128d test1( __m128d A, __m128d B) {
 430   return _mm_shuffle_pd(A, B, 0x3);
 431 }
 432
 433 compiles to
 434
 435 shufpd $3, %xmm1, %xmm0
 436
 437 Perhaps it's better to use unpckhpd instead?
 438
 439 unpckhpd %xmm1, %xmm0
 440
 441 Don't know if unpckhpd is faster. But it is shorter.
 442
 443 //===---------------------------------------------------------------------===//
 444
 445 This code generates ugly code, probably due to costs being off or something:
 446
 447 define void @test(float* %P, <4 x float>* %P2 ) {
 448         %xFloat0.688 = load float* %P
 449         %tmp = load <4 x float>* %P2
 450         %inFloat3.713 = insertelement <4 x float> %tmp, float 0.0, i32 3
 451         store <4 x float> %inFloat3.713, <4 x float>* %P2
 452         ret void
 453 }
 454
 455 Generates:
 456
 457 _test:
 458         movl    8(%esp), %eax
 459         movaps  (%eax), %xmm0
 460         pxor    %xmm1, %xmm1
 461         movaps  %xmm0, %xmm2
 462         shufps  $50, %xmm1, %xmm2
 463         shufps  $132, %xmm2, %xmm0
 464         movaps  %xmm0, (%eax)
 465         ret
 466
 467 Would it be better to generate:
 468
 469 _test:
 470         movl 8(%esp), %ecx
 471         movaps (%ecx), %xmm0
 472         xor %eax, %eax
 473         pinsrw $6, %eax, %xmm0
 474         pinsrw $7, %eax, %xmm0
 475         movaps %xmm0, (%ecx)
 476         ret
 477
 478 ?
 479
 480 //===---------------------------------------------------------------------===//
 481
 482 Some useful information in the Apple Altivec / SSE Migration Guide:
 483
 484 http://developer.apple.com/documentation/Performance/Conceptual/
 485 Accelerate_sse_migration/index.html
 486
 487 e.g. SSE select using and, andnot, or. Various SSE compare translations.
 488
 489 //===---------------------------------------------------------------------===//
 490
 491 Add hooks to commute some CMPP operations.
 492
 493 //===---------------------------------------------------------------------===//
 494
 495 Apply the same transformation that merged four float into a single 128-bit load
 496 to loads from constant pool.
 497
 498 //===---------------------------------------------------------------------===//
 499
 500 Floating point max / min are commutable when -enable-unsafe-fp-path is
 501 specified. We should turn int_x86_sse_max_ss and X86ISD::FMIN etc. into other
 502 nodes which are selected to max / min instructions that are marked commutable.
 503
 504 //===---------------------------------------------------------------------===//
 505
 506 We should compile this:
 507 #include <xmmintrin.h>
 508 typedef union {
 509   int i[4];
 510   float f[4];
 511   __m128 v;
 512 } vector4_t;
 513 void swizzle (const void *a, vector4_t * b, vector4_t * c) {
 514   b->v = _mm_loadl_pi (b->v, (__m64 *) a);
 515   c->v = _mm_loadl_pi (c->v, ((__m64 *) a) + 1);
 516 }
 517
 518 to:
 519
 520 _swizzle:
 521         movl    4(%esp), %eax
 522         movl    8(%esp), %edx
 523         movl    12(%esp), %ecx
 524         movlps  (%eax), %xmm0
 525         movlps  %xmm0, (%edx)
 526         movlps  8(%eax), %xmm0
 527         movlps  %xmm0, (%ecx)
 528         ret
 529
 530 not:
 531
 532 swizzle:
 533         movl 8(%esp), %eax
 534         movaps (%eax), %xmm0
 535         movl 4(%esp), %ecx
 536         movlps (%ecx), %xmm0
 537         movaps %xmm0, (%eax)
 538         movl 12(%esp), %eax
 539         movaps (%eax), %xmm0
 540         movlps 8(%ecx), %xmm0
 541         movaps %xmm0, (%eax)
 542         ret
 543
 544 //===---------------------------------------------------------------------===//
 545
 546 These functions should produce the same code:
 547
 548 #include <emmintrin.h>
 549
 550 typedef long long __m128i __attribute__ ((__vector_size__ (16)));
 551
 552 int foo(__m128i* val) {
 553   return __builtin_ia32_vec_ext_v4si(*val, 1);
 554 }
 555 int bar(__m128i* val) {
 556   union vs {
 557     __m128i *_v;
 558     int* _s;
 559   } v = {val};
 560   return v._s[1];
 561 }
 562
 563 We currently produce (with -m64):
 564
 565 _foo:
 566         pshufd $1, (%rdi), %xmm0
 567         movd %xmm0, %eax
 568         ret
 569 _bar:
 570         movl 4(%rdi), %eax
 571         ret
 572
 573 //===---------------------------------------------------------------------===//
 574
 575 We should materialize vector constants like "all ones" and "signbit" with
 576 code like:
 577
 578      cmpeqps xmm1, xmm1   ; xmm1 = all-ones
 579
 580 and:
 581      cmpeqps xmm1, xmm1   ; xmm1 = all-ones
 582      psrlq   xmm1, 31     ; xmm1 = all 100000000000...
 583
 584 instead of using a load from the constant pool.  The later is important for
 585 ABS/NEG/copysign etc.
 586
 587 //===---------------------------------------------------------------------===//
 588
 589 These functions:
 590
 591 #include <xmmintrin.h>
 592 __m128i a;
 593 void x(unsigned short n) {
 594   a = _mm_slli_epi32 (a, n);
 595 }
 596 void y(unsigned n) {
 597   a = _mm_slli_epi32 (a, n);
 598 }
 599
 600 compile to ( -O3 -static -fomit-frame-pointer):
 601 _x:
 602         movzwl  4(%esp), %eax
 603         movd    %eax, %xmm0
 604         movaps  _a, %xmm1
 605         pslld   %xmm0, %xmm1
 606         movaps  %xmm1, _a
 607         ret
 608 _y:
 609         movd    4(%esp), %xmm0
 610         movaps  _a, %xmm1
 611         pslld   %xmm0, %xmm1
 612         movaps  %xmm1, _a
 613         ret
 614
 615 "y" looks good, but "x" does silly movzwl stuff around into a GPR.  It seems
 616 like movd would be sufficient in both cases as the value is already zero
 617 extended in the 32-bit stack slot IIRC.  For signed short, it should also be
 618 save, as a really-signed value would be undefined for pslld.
 619
 620
 621 //===---------------------------------------------------------------------===//
 622
 623 #include <math.h>
 624 int t1(double d) { return signbit(d); }
 625
 626 This currently compiles to:
 627         subl    $12, %esp
 628         movsd   16(%esp), %xmm0
 629         movsd   %xmm0, (%esp)
 630         movl    4(%esp), %eax
 631         shrl    $31, %eax
 632         addl    $12, %esp
 633         ret
 634
 635 We should use movmskp{s|d} instead.
 636
 637 //===---------------------------------------------------------------------===//
 638
 639 CodeGen/X86/vec_align.ll tests whether we can turn 4 scalar loads into a single
 640 (aligned) vector load.  This functionality has a couple of problems.
 641
 642 1. The code to infer alignment from loads of globals is in the X86 backend,
 643    not the dag combiner.  This is because dagcombine2 needs to be able to see
 644    through the X86ISD::Wrapper node, which DAGCombine can't really do.
 645 2. The code for turning 4 x load into a single vector load is target
 646    independent and should be moved to the dag combiner.
 647 3. The code for turning 4 x load into a vector load can only handle a direct
 648    load from a global or a direct load from the stack.  It should be generalized
 649    to handle any load from P, P+4, P+8, P+12, where P can be anything.
 650 4. The alignment inference code cannot handle loads from globals in non-static
 651    mode because it doesn't look through the extra dyld stub load.  If you try
 652    vec_align.ll without -relocation-model=static, you'll see what I mean.
 653
 654 //===---------------------------------------------------------------------===//
 655
 656 We should lower store(fneg(load p), q) into an integer load+xor+store, which
 657 eliminates a constant pool load.  For example, consider:
 658
 659 define i64 @ccosf(float %z.0, float %z.1) nounwind readonly  {
 660 entry:
 661         %tmp6 = sub float -0.000000e+00, %z.1           ; <float> [#uses=1]
 662         %tmp20 = tail call i64 @ccoshf( float %tmp6, float %z.0 ) nounwind readonly             ; <i64> [#uses=1]
 663         ret i64 %tmp20
 664 }
 665
 666 This currently compiles to:
 667
 668 LCPI1_0:                                        #  <4 x float>
 669         .long   2147483648      # float -0
 670         .long   2147483648      # float -0
 671         .long   2147483648      # float -0
 672         .long   2147483648      # float -0
 673 _ccosf:
 674         subl    $12, %esp
 675         movss   16(%esp), %xmm0
 676         movss   %xmm0, 4(%esp)
 677         movss   20(%esp), %xmm0
 678         xorps   LCPI1_0, %xmm0
 679         movss   %xmm0, (%esp)
 680         call    L_ccoshf$stub
 681         addl    $12, %esp
 682         ret
 683
 684 Note the load into xmm0, then xor (to negate), then store.  In PIC mode,
 685 this code computes the pic base and does two loads to do the constant pool
 686 load, so the improvement is much bigger.
 687
 688 The tricky part about this xform is that the argument load/store isn't exposed
 689 until post-legalize, and at that point, the fneg has been custom expanded into
 690 an X86 fxor.  This means that we need to handle this case in the x86 backend
 691 instead of in target independent code.
 692
 693 //===---------------------------------------------------------------------===//
 694
 695 Non-SSE4 insert into 16 x i8 is atrociously bad.
 696
 697 //===---------------------------------------------------------------------===//
 698
 699 <2 x i64> extract is substantially worse than <2 x f64>, even if the destination
 700 is memory.
 701
 702 //===---------------------------------------------------------------------===//
 703
 704 SSE4 extract-to-mem ops aren't being pattern matched because of the AssertZext
 705 sitting between the truncate and the extract.
 706
 707 //===---------------------------------------------------------------------===//
 708
 709 INSERTPS can match any insert (extract, imm1), imm2 for 4 x float, and insert
 710 any number of 0.0 simultaneously.  Currently we only use it for simple
 711 insertions.
 712
 713 See comments in LowerINSERT_VECTOR_ELT_SSE4.
 714
 715 //===---------------------------------------------------------------------===//
 716
 717 On a random note, SSE2 should declare insert/extract of 2 x f64 as legal, not
 718 Custom.  All combinations of insert/extract reg-reg, reg-mem, and mem-reg are
 719 legal, it'll just take a few extra patterns written in the .td file.
 720
 721 Note: this is not a code quality issue; the custom lowered code happens to be
 722 right, but we shouldn't have to custom lower anything.  This is probably related
 723 to <2 x i64> ops being so bad.
 724
 725 //===---------------------------------------------------------------------===//
 726
 727 'select' on vectors and scalars could be a whole lot better.  We currently
 728 lower them to conditional branches.  On x86-64 for example, we compile this:
 729
 730 double test(double a, double b, double c, double d) { return a<b ? c : d; }
 731
 732 to:
 733
 734 _test:
 735         ucomisd %xmm0, %xmm1
 736         ja      LBB1_2  # entry
 737 LBB1_1: # entry
 738         movapd  %xmm3, %xmm2
 739 LBB1_2: # entry
 740         movapd  %xmm2, %xmm0
 741         ret
 742
 743 instead of:
 744
 745 _test:
 746         cmpltsd %xmm1, %xmm0
 747         andpd   %xmm0, %xmm2
 748         andnpd  %xmm3, %xmm0
 749         orpd    %xmm2, %xmm0
 750         ret
 751
 752 For unpredictable branches, the later is much more efficient.  This should
 753 just be a matter of having scalar sse map to SELECT_CC and custom expanding
 754 or iseling it.