lib/Target/X86/README-SSE.txt

   1 //===---------------------------------------------------------------------===//
   2 // Random ideas for the X86 backend: SSE-specific stuff.
   3 //===---------------------------------------------------------------------===//
   4
   5 - Consider eliminating the unaligned SSE load intrinsics, replacing them with
   6   unaligned LLVM load instructions.
   7
   8 //===---------------------------------------------------------------------===//
   9
  10 Expand libm rounding functions inline:  Significant speedups possible.
  11 http://gcc.gnu.org/ml/gcc-patches/2006-10/msg00909.html
  12
  13 //===---------------------------------------------------------------------===//
  14
  15 When compiled with unsafemath enabled, "main" should enable SSE DAZ mode and
  16 other fast SSE modes.
  17
  18 //===---------------------------------------------------------------------===//
  19
  20 Think about doing i64 math in SSE regs.
  21
  22 //===---------------------------------------------------------------------===//
  23
  24 This testcase should have no SSE instructions in it, and only one load from
  25 a constant pool:
  26
  27 double %test3(bool %B) {
  28         %C = select bool %B, double 123.412, double 523.01123123
  29         ret double %C
  30 }
  31
  32 Currently, the select is being lowered, which prevents the dag combiner from
  33 turning 'select (load CPI1), (load CPI2)' -> 'load (select CPI1, CPI2)'
  34
  35 The pattern isel got this one right.
  36
  37 //===---------------------------------------------------------------------===//
  38
  39 SSE doesn't have [mem] op= reg instructions.  If we have an SSE instruction
  40 like this:
  41
  42   X += y
  43
  44 and the register allocator decides to spill X, it is cheaper to emit this as:
  45
  46 Y += [xslot]
  47 store Y -> [xslot]
  48
  49 than as:
  50
  51 tmp = [xslot]
  52 tmp += y
  53 store tmp -> [xslot]
  54
  55 ..and this uses one fewer register (so this should be done at load folding
  56 time, not at spiller time).  *Note* however that this can only be done
  57 if Y is dead.  Here's a testcase:
  58
  59 %.str_3 = external global [15 x sbyte]          ; <[15 x sbyte]*> [#uses=0]
  60 implementation   ; Functions:
  61 declare void %printf(int, ...)
  62 void %main() {
  63 build_tree.exit:
  64         br label %no_exit.i7
  65 no_exit.i7:             ; preds = %no_exit.i7, %build_tree.exit
  66         %tmp.0.1.0.i9 = phi double [ 0.000000e+00, %build_tree.exit ], [ %tmp.34.i18, %no_exit.i7 ]      ; <double> [#uses=1]
  67         %tmp.0.0.0.i10 = phi double [ 0.000000e+00, %build_tree.exit ], [ %tmp.28.i16, %no_exit.i7 ]     ; <double> [#uses=1]
  68         %tmp.28.i16 = add double %tmp.0.0.0.i10, 0.000000e+00
  69         %tmp.34.i18 = add double %tmp.0.1.0.i9, 0.000000e+00
  70         br bool false, label %Compute_Tree.exit23, label %no_exit.i7
  71 Compute_Tree.exit23:            ; preds = %no_exit.i7
  72         tail call void (int, ...)* %printf( int 0 )
  73         store double %tmp.34.i18, double* null
  74         ret void
  75 }
  76
  77 We currently emit:
  78
  79 .BBmain_1:
  80         xorpd %XMM1, %XMM1
  81         addsd %XMM0, %XMM1
  82 ***     movsd %XMM2, QWORD PTR [%ESP + 8]
  83 ***     addsd %XMM2, %XMM1
  84 ***     movsd QWORD PTR [%ESP + 8], %XMM2
  85         jmp .BBmain_1   # no_exit.i7
  86
  87 This is a bugpoint reduced testcase, which is why the testcase doesn't make
  88 much sense (e.g. its an infinite loop). :)
  89
  90 //===---------------------------------------------------------------------===//
  91
  92 SSE should implement 'select_cc' using 'emulated conditional moves' that use
  93 pcmp/pand/pandn/por to do a selection instead of a conditional branch:
  94
  95 double %X(double %Y, double %Z, double %A, double %B) {
  96         %C = setlt double %A, %B
  97         %z = add double %Z, 0.0    ;; select operand is not a load
  98         %D = select bool %C, double %Y, double %z
  99         ret double %D
 100 }
 101
 102 We currently emit:
 103
 104 _X:
 105         subl $12, %esp
 106         xorpd %xmm0, %xmm0
 107         addsd 24(%esp), %xmm0
 108         movsd 32(%esp), %xmm1
 109         movsd 16(%esp), %xmm2
 110         ucomisd 40(%esp), %xmm1
 111         jb LBB_X_2
 112 LBB_X_1:
 113         movsd %xmm0, %xmm2
 114 LBB_X_2:
 115         movsd %xmm2, (%esp)
 116         fldl (%esp)
 117         addl $12, %esp
 118         ret
 119
 120 //===---------------------------------------------------------------------===//
 121
 122 It's not clear whether we should use pxor or xorps / xorpd to clear XMM
 123 registers. The choice may depend on subtarget information. We should do some
 124 more experiments on different x86 machines.
 125
 126 //===---------------------------------------------------------------------===//
 127
 128 Currently the x86 codegen isn't very good at mixing SSE and FPStack
 129 code:
 130
 131 unsigned int foo(double x) { return x; }
 132
 133 foo:
 134         subl $20, %esp
 135         movsd 24(%esp), %xmm0
 136         movsd %xmm0, 8(%esp)
 137         fldl 8(%esp)
 138         fisttpll (%esp)
 139         movl (%esp), %eax
 140         addl $20, %esp
 141         ret
 142
 143 This will be solved when we go to a dynamic programming based isel.
 144
 145 //===---------------------------------------------------------------------===//
 146
 147 Lower memcpy / memset to a series of SSE 128 bit move instructions when it's
 148 feasible.
 149
 150 //===---------------------------------------------------------------------===//
 151
 152 Teach the coalescer to commute 2-addr instructions, allowing us to eliminate
 153 the reg-reg copy in this example:
 154
 155 float foo(int *x, float *y, unsigned c) {
 156   float res = 0.0;
 157   unsigned i;
 158   for (i = 0; i < c; i++) {
 159     float xx = (float)x[i];
 160     xx = xx * y[i];
 161     xx += res;
 162     res = xx;
 163   }
 164   return res;
 165 }
 166
 167 LBB_foo_3:      # no_exit
 168         cvtsi2ss %XMM0, DWORD PTR [%EDX + 4*%ESI]
 169         mulss %XMM0, DWORD PTR [%EAX + 4*%ESI]
 170         addss %XMM0, %XMM1
 171         inc %ESI
 172         cmp %ESI, %ECX
 173 ****    movaps %XMM1, %XMM0
 174         jb LBB_foo_3    # no_exit
 175
 176 //===---------------------------------------------------------------------===//
 177
 178 Codegen:
 179   if (copysign(1.0, x) == copysign(1.0, y))
 180 into:
 181   if (x^y & mask)
 182 when using SSE.
 183
 184 //===---------------------------------------------------------------------===//
 185
 186 Use movhps to update upper 64-bits of a v4sf value. Also movlps on lower half
 187 of a v4sf value.
 188
 189 //===---------------------------------------------------------------------===//
 190
 191 Better codegen for vector_shuffles like this { x, 0, 0, 0 } or { x, 0, x, 0}.
 192 Perhaps use pxor / xorp* to clear a XMM register first?
 193
 194 //===---------------------------------------------------------------------===//
 195
 196 How to decide when to use the "floating point version" of logical ops? Here are
 197 some code fragments:
 198
 199         movaps LCPI5_5, %xmm2
 200         divps %xmm1, %xmm2
 201         mulps %xmm2, %xmm3
 202         mulps 8656(%ecx), %xmm3
 203         addps 8672(%ecx), %xmm3
 204         andps LCPI5_6, %xmm2
 205         andps LCPI5_1, %xmm3
 206         por %xmm2, %xmm3
 207         movdqa %xmm3, (%edi)
 208
 209         movaps LCPI5_5, %xmm1
 210         divps %xmm0, %xmm1
 211         mulps %xmm1, %xmm3
 212         mulps 8656(%ecx), %xmm3
 213         addps 8672(%ecx), %xmm3
 214         andps LCPI5_6, %xmm1
 215         andps LCPI5_1, %xmm3
 216         orps %xmm1, %xmm3
 217         movaps %xmm3, 112(%esp)
 218         movaps %xmm3, (%ebx)
 219
 220 Due to some minor source change, the later case ended up using orps and movaps
 221 instead of por and movdqa. Does it matter?
 222
 223 //===---------------------------------------------------------------------===//
 224
 225 X86RegisterInfo::copyRegToReg() returns X86::MOVAPSrr for VR128. Is it possible
 226 to choose between movaps, movapd, and movdqa based on types of source and
 227 destination?
 228
 229 How about andps, andpd, and pand? Do we really care about the type of the packed
 230 elements? If not, why not always use the "ps" variants which are likely to be
 231 shorter.
 232
 233 //===---------------------------------------------------------------------===//
 234
 235 External test Nurbs exposed some problems. Look for
 236 __ZN15Nurbs_SSE_Cubic17TessellateSurfaceE, bb cond_next140. This is what icc
 237 emits:
 238
 239         movaps    (%edx), %xmm2                                 #59.21
 240         movaps    (%edx), %xmm5                                 #60.21
 241         movaps    (%edx), %xmm4                                 #61.21
 242         movaps    (%edx), %xmm3                                 #62.21
 243         movl      40(%ecx), %ebp                                #69.49
 244         shufps    $0, %xmm2, %xmm5                              #60.21
 245         movl      100(%esp), %ebx                               #69.20
 246         movl      (%ebx), %edi                                  #69.20
 247         imull     %ebp, %edi                                    #69.49
 248         addl      (%eax), %edi                                  #70.33
 249         shufps    $85, %xmm2, %xmm4                             #61.21
 250         shufps    $170, %xmm2, %xmm3                            #62.21
 251         shufps    $255, %xmm2, %xmm2                            #63.21
 252         lea       (%ebp,%ebp,2), %ebx                           #69.49
 253         negl      %ebx                                          #69.49
 254         lea       -3(%edi,%ebx), %ebx                           #70.33
 255         shll      $4, %ebx                                      #68.37
 256         addl      32(%ecx), %ebx                                #68.37
 257         testb     $15, %bl                                      #91.13
 258         jne       L_B1.24       # Prob 5%                       #91.13
 259
 260 This is the llvm code after instruction scheduling:
 261
 262 cond_next140 (0xa910740, LLVM BB @0xa90beb0):
 263         %reg1078 = MOV32ri -3
 264         %reg1079 = ADD32rm %reg1078, %reg1068, 1, %NOREG, 0
 265         %reg1037 = MOV32rm %reg1024, 1, %NOREG, 40
 266         %reg1080 = IMUL32rr %reg1079, %reg1037
 267         %reg1081 = MOV32rm %reg1058, 1, %NOREG, 0
 268         %reg1038 = LEA32r %reg1081, 1, %reg1080, -3
 269         %reg1036 = MOV32rm %reg1024, 1, %NOREG, 32
 270         %reg1082 = SHL32ri %reg1038, 4
 271         %reg1039 = ADD32rr %reg1036, %reg1082
 272         %reg1083 = MOVAPSrm %reg1059, 1, %NOREG, 0
 273         %reg1034 = SHUFPSrr %reg1083, %reg1083, 170
 274         %reg1032 = SHUFPSrr %reg1083, %reg1083, 0
 275         %reg1035 = SHUFPSrr %reg1083, %reg1083, 255
 276         %reg1033 = SHUFPSrr %reg1083, %reg1083, 85
 277         %reg1040 = MOV32rr %reg1039
 278         %reg1084 = AND32ri8 %reg1039, 15
 279         CMP32ri8 %reg1084, 0
 280         JE mbb<cond_next204,0xa914d30>
 281
 282 Still ok. After register allocation:
 283
 284 cond_next140 (0xa910740, LLVM BB @0xa90beb0):
 285         %EAX = MOV32ri -3
 286         %EDX = MOV32rm <fi#3>, 1, %NOREG, 0
 287         ADD32rm %EAX<def&use>, %EDX, 1, %NOREG, 0
 288         %EDX = MOV32rm <fi#7>, 1, %NOREG, 0
 289         %EDX = MOV32rm %EDX, 1, %NOREG, 40
 290         IMUL32rr %EAX<def&use>, %EDX
 291         %ESI = MOV32rm <fi#5>, 1, %NOREG, 0
 292         %ESI = MOV32rm %ESI, 1, %NOREG, 0
 293         MOV32mr <fi#4>, 1, %NOREG, 0, %ESI
 294         %EAX = LEA32r %ESI, 1, %EAX, -3
 295         %ESI = MOV32rm <fi#7>, 1, %NOREG, 0
 296         %ESI = MOV32rm %ESI, 1, %NOREG, 32
 297         %EDI = MOV32rr %EAX
 298         SHL32ri %EDI<def&use>, 4
 299         ADD32rr %EDI<def&use>, %ESI
 300         %XMM0 = MOVAPSrm %ECX, 1, %NOREG, 0
 301         %XMM1 = MOVAPSrr %XMM0
 302         SHUFPSrr %XMM1<def&use>, %XMM1, 170
 303         %XMM2 = MOVAPSrr %XMM0
 304         SHUFPSrr %XMM2<def&use>, %XMM2, 0
 305         %XMM3 = MOVAPSrr %XMM0
 306         SHUFPSrr %XMM3<def&use>, %XMM3, 255
 307         SHUFPSrr %XMM0<def&use>, %XMM0, 85
 308         %EBX = MOV32rr %EDI
 309         AND32ri8 %EBX<def&use>, 15
 310         CMP32ri8 %EBX, 0
 311         JE mbb<cond_next204,0xa914d30>
 312
 313 This looks really bad. The problem is shufps is a destructive opcode. Since it
 314 appears as operand two in more than one shufps ops. It resulted in a number of
 315 copies. Note icc also suffers from the same problem. Either the instruction
 316 selector should select pshufd or The register allocator can made the two-address
 317 to three-address transformation.
 318
 319 It also exposes some other problems. See MOV32ri -3 and the spills.
 320
 321 //===---------------------------------------------------------------------===//
 322
 323 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=25500
 324
 325 LLVM is producing bad code.
 326
 327 LBB_main_4:     # cond_true44
 328         addps %xmm1, %xmm2
 329         subps %xmm3, %xmm2
 330         movaps (%ecx), %xmm4
 331         movaps %xmm2, %xmm1
 332         addps %xmm4, %xmm1
 333         addl $16, %ecx
 334         incl %edx
 335         cmpl $262144, %edx
 336         movaps %xmm3, %xmm2
 337         movaps %xmm4, %xmm3
 338         jne LBB_main_4  # cond_true44
 339
 340 There are two problems. 1) No need to two loop induction variables. We can
 341 compare against 262144 * 16. 2) Known register coalescer issue. We should
 342 be able eliminate one of the movaps:
 343
 344         addps %xmm2, %xmm1    <=== Commute!
 345         subps %xmm3, %xmm1
 346         movaps (%ecx), %xmm4
 347         movaps %xmm1, %xmm1   <=== Eliminate!
 348         addps %xmm4, %xmm1
 349         addl $16, %ecx
 350         incl %edx
 351         cmpl $262144, %edx
 352         movaps %xmm3, %xmm2
 353         movaps %xmm4, %xmm3
 354         jne LBB_main_4  # cond_true44
 355
 356 //===---------------------------------------------------------------------===//
 357
 358 Consider:
 359
 360 __m128 test(float a) {
 361   return _mm_set_ps(0.0, 0.0, 0.0, a*a);
 362 }
 363
 364 This compiles into:
 365
 366 movss 4(%esp), %xmm1
 367 mulss %xmm1, %xmm1
 368 xorps %xmm0, %xmm0
 369 movss %xmm1, %xmm0
 370 ret
 371
 372 Because mulss doesn't modify the top 3 elements, the top elements of
 373 xmm1 are already zero'd.  We could compile this to:
 374
 375 movss 4(%esp), %xmm0
 376 mulss %xmm0, %xmm0
 377 ret
 378
 379 //===---------------------------------------------------------------------===//
 380
 381 Here's a sick and twisted idea.  Consider code like this:
 382
 383 __m128 test(__m128 a) {
 384   float b = *(float*)&A;
 385   ...
 386   return _mm_set_ps(0.0, 0.0, 0.0, b);
 387 }
 388
 389 This might compile to this code:
 390
 391 movaps c(%esp), %xmm1
 392 xorps %xmm0, %xmm0
 393 movss %xmm1, %xmm0
 394 ret
 395
 396 Now consider if the ... code caused xmm1 to get spilled.  This might produce
 397 this code:
 398
 399 movaps c(%esp), %xmm1
 400 movaps %xmm1, c2(%esp)
 401 ...
 402
 403 xorps %xmm0, %xmm0
 404 movaps c2(%esp), %xmm1
 405 movss %xmm1, %xmm0
 406 ret
 407
 408 However, since the reload is only used by these instructions, we could
 409 "fold" it into the uses, producing something like this:
 410
 411 movaps c(%esp), %xmm1
 412 movaps %xmm1, c2(%esp)
 413 ...
 414
 415 movss c2(%esp), %xmm0
 416 ret
 417
 418 ... saving two instructions.
 419
 420 The basic idea is that a reload from a spill slot, can, if only one 4-byte
 421 chunk is used, bring in 3 zeros the the one element instead of 4 elements.
 422 This can be used to simplify a variety of shuffle operations, where the
 423 elements are fixed zeros.
 424
 425 //===---------------------------------------------------------------------===//
 426
 427 For this:
 428
 429 #include <emmintrin.h>
 430 void test(__m128d *r, __m128d *A, double B) {
 431   *r = _mm_loadl_pd(*A, &B);
 432 }
 433
 434 We generates:
 435
 436         subl $12, %esp
 437         movsd 24(%esp), %xmm0
 438         movsd %xmm0, (%esp)
 439         movl 20(%esp), %eax
 440         movapd (%eax), %xmm0
 441         movlpd (%esp), %xmm0
 442         movl 16(%esp), %eax
 443         movapd %xmm0, (%eax)
 444         addl $12, %esp
 445         ret
 446
 447 icc generates:
 448
 449         movl      4(%esp), %edx                                 #3.6
 450         movl      8(%esp), %eax                                 #3.6
 451         movapd    (%eax), %xmm0                                 #4.22
 452         movlpd    12(%esp), %xmm0                               #4.8
 453         movapd    %xmm0, (%edx)                                 #4.3
 454         ret                                                     #5.1
 455
 456 So icc is smart enough to know that B is in memory so it doesn't load it and
 457 store it back to stack.
 458
 459 This should be fixed by eliminating the llvm.x86.sse2.loadl.pd intrinsic,
 460 lowering it to a load+insertelement instead.  Already match the load+shuffle
 461 as movlpd, so this should be easy.  We already get optimal code for:
 462
 463 define void @test2(<2 x double>* %r, <2 x double>* %A, double %B) {
 464 entry:
 465         %tmp2 = load <2 x double>* %A, align 16
 466         %tmp8 = insertelement <2 x double> %tmp2, double %B, i32 0
 467         store <2 x double> %tmp8, <2 x double>* %r, align 16
 468         ret void
 469 }
 470
 471 //===---------------------------------------------------------------------===//
 472
 473 __m128d test1( __m128d A, __m128d B) {
 474   return _mm_shuffle_pd(A, B, 0x3);
 475 }
 476
 477 compiles to
 478
 479 shufpd $3, %xmm1, %xmm0
 480
 481 Perhaps it's better to use unpckhpd instead?
 482
 483 unpckhpd %xmm1, %xmm0
 484
 485 Don't know if unpckhpd is faster. But it is shorter.
 486
 487 //===---------------------------------------------------------------------===//
 488
 489 This code generates ugly code, probably due to costs being off or something:
 490
 491 define void @test(float* %P, <4 x float>* %P2 ) {
 492         %xFloat0.688 = load float* %P
 493         %tmp = load <4 x float>* %P2
 494         %inFloat3.713 = insertelement <4 x float> %tmp, float 0.0, i32 3
 495         store <4 x float> %inFloat3.713, <4 x float>* %P2
 496         ret void
 497 }
 498
 499 Generates:
 500
 501 _test:
 502         movl    8(%esp), %eax
 503         movaps  (%eax), %xmm0
 504         pxor    %xmm1, %xmm1
 505         movaps  %xmm0, %xmm2
 506         shufps  $50, %xmm1, %xmm2
 507         shufps  $132, %xmm2, %xmm0
 508         movaps  %xmm0, (%eax)
 509         ret
 510
 511 Would it be better to generate:
 512
 513 _test:
 514         movl 8(%esp), %ecx
 515         movaps (%ecx), %xmm0
 516         xor %eax, %eax
 517         pinsrw $6, %eax, %xmm0
 518         pinsrw $7, %eax, %xmm0
 519         movaps %xmm0, (%ecx)
 520         ret
 521
 522 ?
 523
 524 //===---------------------------------------------------------------------===//
 525
 526 Some useful information in the Apple Altivec / SSE Migration Guide:
 527
 528 http://developer.apple.com/documentation/Performance/Conceptual/
 529 Accelerate_sse_migration/index.html
 530
 531 e.g. SSE select using and, andnot, or. Various SSE compare translations.
 532
 533 //===---------------------------------------------------------------------===//
 534
 535 Add hooks to commute some CMPP operations.
 536
 537 //===---------------------------------------------------------------------===//
 538
 539 Apply the same transformation that merged four float into a single 128-bit load
 540 to loads from constant pool.
 541
 542 //===---------------------------------------------------------------------===//
 543
 544 Floating point max / min are commutable when -enable-unsafe-fp-path is
 545 specified. We should turn int_x86_sse_max_ss and X86ISD::FMIN etc. into other
 546 nodes which are selected to max / min instructions that are marked commutable.
 547
 548 //===---------------------------------------------------------------------===//
 549
 550 We should compile this:
 551 #include <xmmintrin.h>
 552 typedef union {
 553   int i[4];
 554   float f[4];
 555   __m128 v;
 556 } vector4_t;
 557 void swizzle (const void *a, vector4_t * b, vector4_t * c) {
 558   b->v = _mm_loadl_pi (b->v, (__m64 *) a);
 559   c->v = _mm_loadl_pi (c->v, ((__m64 *) a) + 1);
 560 }
 561
 562 to:
 563
 564 _swizzle:
 565         movl    4(%esp), %eax
 566         movl    8(%esp), %edx
 567         movl    12(%esp), %ecx
 568         movlps  (%eax), %xmm0
 569         movlps  %xmm0, (%edx)
 570         movlps  8(%eax), %xmm0
 571         movlps  %xmm0, (%ecx)
 572         ret
 573
 574 not:
 575
 576 swizzle:
 577         movl 8(%esp), %eax
 578         movaps (%eax), %xmm0
 579         movl 4(%esp), %ecx
 580         movlps (%ecx), %xmm0
 581         movaps %xmm0, (%eax)
 582         movl 12(%esp), %eax
 583         movaps (%eax), %xmm0
 584         movlps 8(%ecx), %xmm0
 585         movaps %xmm0, (%eax)
 586         ret
 587
 588 //===---------------------------------------------------------------------===//
 589
 590 These functions should produce the same code:
 591
 592 #include <emmintrin.h>
 593
 594 typedef long long __m128i __attribute__ ((__vector_size__ (16)));
 595
 596 int foo(__m128i* val) {
 597   return __builtin_ia32_vec_ext_v4si(*val, 1);
 598 }
 599 int bar(__m128i* val) {
 600   union vs {
 601     __m128i *_v;
 602     int* _s;
 603   } v = {val};
 604   return v._s[1];
 605 }
 606
 607 We currently produce (with -m64):
 608
 609 _foo:
 610         pshufd $1, (%rdi), %xmm0
 611         movd %xmm0, %eax
 612         ret
 613 _bar:
 614         movl 4(%rdi), %eax
 615         ret
 616
 617 //===---------------------------------------------------------------------===//
 618
 619 We should materialize vector constants like "all ones" and "signbit" with
 620 code like:
 621
 622      cmpeqps xmm1, xmm1   ; xmm1 = all-ones
 623
 624 and:
 625      cmpeqps xmm1, xmm1   ; xmm1 = all-ones
 626      psrlq   xmm1, 31     ; xmm1 = all 100000000000...
 627
 628 instead of using a load from the constant pool.  The later is important for
 629 ABS/NEG/copysign etc.
 630
 631 //===---------------------------------------------------------------------===//
 632
 633 "converting 64-bit constant pool entry to 32-bit not necessarily beneficial"
 634 http://llvm.org/PR1264
 635
 636 For this test case:
 637
 638 define double @foo(double %x) {
 639         %y = mul double %x, 5.000000e-01
 640         ret double %y
 641 }
 642
 643 llc -march=x86-64 currently produces a 32-bit constant pool entry and this code:
 644
 645         cvtss2sd .LCPI1_0(%rip), %xmm1
 646         mulsd %xmm1, %xmm0
 647
 648 instead of just using a 64-bit constant pool entry with this:
 649
 650         mulsd .LCPI1_0(%rip), %xmm0
 651
 652 This is due to the code in ExpandConstantFP in LegalizeDAG.cpp. It notices that
 653 x86-64 indeed has an instruction to load a 32-bit float from memory and convert
 654 it into a 64-bit float in a register, however it doesn't notice that this isn't
 655 beneficial because it prevents the load from being folded into the multiply.
 656
 657 //===---------------------------------------------------------------------===//
 658
 659 These functions:
 660
 661 #include <xmmintrin.h>
 662 __m128i a;
 663 void x(unsigned short n) {
 664   a = _mm_slli_epi32 (a, n);
 665 }
 666 void y(unsigned n) {
 667   a = _mm_slli_epi32 (a, n);
 668 }
 669
 670 compile to ( -O3 -static -fomit-frame-pointer):
 671 _x:
 672         movzwl  4(%esp), %eax
 673         movd    %eax, %xmm0
 674         movaps  _a, %xmm1
 675         pslld   %xmm0, %xmm1
 676         movaps  %xmm1, _a
 677         ret
 678 _y:
 679         movd    4(%esp), %xmm0
 680         movaps  _a, %xmm1
 681         pslld   %xmm0, %xmm1
 682         movaps  %xmm1, _a
 683         ret
 684
 685 "y" looks good, but "x" does silly movzwl stuff around into a GPR.  It seems
 686 like movd would be sufficient in both cases as the value is already zero
 687 extended in the 32-bit stack slot IIRC.  For signed short, it should also be
 688 save, as a really-signed value would be undefined for pslld.
 689
 690
 691 //===---------------------------------------------------------------------===//
 692
 693 #include <math.h>
 694 int t1(double d) { return signbit(d); }
 695
 696 This currently compiles to:
 697         subl    $12, %esp
 698         movsd   16(%esp), %xmm0
 699         movsd   %xmm0, (%esp)
 700         movl    4(%esp), %eax
 701         shrl    $31, %eax
 702         addl    $12, %esp
 703         ret
 704
 705 We should use movmskp{s|d} instead.
 706
 707 //===---------------------------------------------------------------------===//
 708
 709 CodeGen/X86/vec_align.ll tests whether we can turn 4 scalar loads into a single
 710 (aligned) vector load.  This functionality has a couple of problems.
 711
 712 1. The code to infer alignment from loads of globals is in the X86 backend,
 713    not the dag combiner.  This is because dagcombine2 needs to be able to see
 714    through the X86ISD::Wrapper node, which DAGCombine can't really do.
 715 2. The code for turning 4 x load into a single vector load is target
 716    independent and should be moved to the dag combiner.
 717 3. The code for turning 4 x load into a vector load can only handle a direct
 718    load from a global or a direct load from the stack.  It should be generalized
 719    to handle any load from P, P+4, P+8, P+12, where P can be anything.
 720 4. The alignment inference code cannot handle loads from globals in non-static
 721    mode because it doesn't look through the extra dyld stub load.  If you try
 722    vec_align.ll without -relocation-model=static, you'll see what I mean.
 723
 724 //===---------------------------------------------------------------------===//
 725
 726 We should lower store(fneg(load p), q) into an integer load+xor+store, which
 727 eliminates a constant pool load.  For example, consider:
 728
 729 define i64 @ccosf(float %z.0, float %z.1) nounwind readonly  {
 730 entry:
 731         %tmp6 = sub float -0.000000e+00, %z.1           ; <float> [#uses=1]
 732         %tmp20 = tail call i64 @ccoshf( float %tmp6, float %z.0 ) nounwind readonly             ; <i64> [#uses=1]
 733         ret i64 %tmp20
 734 }
 735
 736 This currently compiles to:
 737
 738 LCPI1_0:                                        #  <4 x float>
 739         .long   2147483648      # float -0
 740         .long   2147483648      # float -0
 741         .long   2147483648      # float -0
 742         .long   2147483648      # float -0
 743 _ccosf:
 744         subl    $12, %esp
 745         movss   16(%esp), %xmm0
 746         movss   %xmm0, 4(%esp)
 747         movss   20(%esp), %xmm0
 748         xorps   LCPI1_0, %xmm0
 749         movss   %xmm0, (%esp)
 750         call    L_ccoshf$stub
 751         addl    $12, %esp
 752         ret
 753
 754 Note the load into xmm0, then xor (to negate), then store.  In PIC mode,
 755 this code computes the pic base and does two loads to do the constant pool
 756 load, so the improvement is much bigger.
 757
 758 The tricky part about this xform is that the argument load/store isn't exposed
 759 until post-legalize, and at that point, the fneg has been custom expanded into
 760 an X86 fxor.  This means that we need to handle this case in the x86 backend
 761 instead of in target independent code.
 762
 763 //===---------------------------------------------------------------------===//
 764
 765 Non-SSE4 insert into 16 x i8 is atrociously bad.
 766
 767 //===---------------------------------------------------------------------===//
 768
 769 <2 x i64> extract is substantially worse than <2 x f64>, even if the destination
 770 is memory.
 771
 772 //===---------------------------------------------------------------------===//
 773
 774 SSE4 extract-to-mem ops aren't being pattern matched because of the AssertZext
 775 sitting between the truncate and the extract.
 776
 777 //===---------------------------------------------------------------------===//
 778
 779 INSERTPS can match any insert (extract, imm1), imm2 for 4 x float, and insert
 780 any number of 0.0 simultaneously.  Currently we only use it for simple
 781 insertions.
 782
 783 See comments in LowerINSERT_VECTOR_ELT_SSE4.
 784
 785 //===---------------------------------------------------------------------===//
 786
 787 On a random note, SSE2 should declare insert/extract of 2 x f64 as legal, not
 788 Custom.  All combinations of insert/extract reg-reg, reg-mem, and mem-reg are
 789 legal, it'll just take a few extra patterns written in the .td file.
 790
 791 Note: this is not a code quality issue; the custom lowered code happens to be
 792 right, but we shouldn't have to custom lower anything.  This is probably related
 793 to <2 x i64> ops being so bad.
 794