Add patterns for the x86 popcnt instruction.

[oota-llvm.git] / lib / Target / X86 / README-SSE.txt
diff --git a/lib/Target/X86/README-SSE.txt b/lib/Target/X86/README-SSE.txt

index 1a5d9045b05ad2fd04d294889897cf9b3c8f9227..b2116e03b148ea222532c956e9dd78f49fea3a06 100644 (file)
--- a/lib/Target/X86/README-SSE.txt
+++ b/lib/Target/X86/README-SSE.txt
@@ -2,8 +2,65 @@
  // Random ideas for the X86 backend: SSE-specific stuff.
  //===---------------------------------------------------------------------===//
  
  // Random ideas for the X86 backend: SSE-specific stuff.
  //===---------------------------------------------------------------------===//
  
-- Consider eliminating the unaligned SSE load intrinsics, replacing them with
-  unaligned LLVM load instructions.
+//===---------------------------------------------------------------------===//
+
+SSE Variable shift can be custom lowered to something like this, which uses a
+small table + unaligned load + shuffle instead of going through memory.
+
+__m128i_shift_right:
+       .byte     0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15
+       .byte    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
+
+...
+__m128i shift_right(__m128i value, unsigned long offset) {
+  return _mm_shuffle_epi8(value,
+               _mm_loadu_si128((__m128i *) (__m128i_shift_right + offset)));
+}
+
+//===---------------------------------------------------------------------===//
+
+SSE has instructions for doing operations on complex numbers; we should pattern
+match them.  For example, this should turn into a horizontal add:
+
+typedef float __attribute__((vector_size(16))) v4f32;
+float f32(v4f32 A) {
+  return A[0]+A[1]+A[2]+A[3];
+}
+
+Instead we get this:
+
+_f32:                                   ## @f32
+       pshufd  $1, %xmm0, %xmm1        ## xmm1 = xmm0[1,0,0,0]
+       addss   %xmm0, %xmm1
+       pshufd  $3, %xmm0, %xmm2        ## xmm2 = xmm0[3,0,0,0]
+       movhlps %xmm0, %xmm0            ## xmm0 = xmm0[1,1]
+       movaps  %xmm0, %xmm3
+       addss   %xmm1, %xmm3
+       movdqa  %xmm2, %xmm0
+       addss   %xmm3, %xmm0
+       ret
+
+Also, there are cases where some simple local SLP would improve codegen a bit.
+Compiling this:
+
+_Complex float f32(_Complex float A, _Complex float B) {
+  return A+B;
+}
+
+into:
+
+_f32:                                   ## @f32
+       movdqa  %xmm0, %xmm2
+       addss   %xmm1, %xmm2
+       pshufd  $1, %xmm1, %xmm1        ## xmm1 = xmm1[1,0,0,0]
+       pshufd  $1, %xmm0, %xmm3        ## xmm3 = xmm0[1,0,0,0]
+       addss   %xmm1, %xmm3
+       movaps  %xmm2, %xmm0
+       unpcklps        %xmm3, %xmm0    ## xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+       ret
+
+seems silly when it could just be one addps.
+
  
  //===---------------------------------------------------------------------===//
  
  
  //===---------------------------------------------------------------------===//
  
@@ -17,7 +74,7 @@ other fast SSE modes.
  
  //===---------------------------------------------------------------------===//
  
  
  //===---------------------------------------------------------------------===//
  
-Think about doing i64 math in SSE regs.
+Think about doing i64 math in SSE regs on x86-32.
  
  //===---------------------------------------------------------------------===//
  
  
  //===---------------------------------------------------------------------===//
  
@@ -36,68 +93,12 @@ The pattern isel got this one right.
  
  //===---------------------------------------------------------------------===//
  
  
  //===---------------------------------------------------------------------===//
  
-SSE doesn't have [mem] op= reg instructions.  If we have an SSE instruction
-like this:
-
-  X += y
-
-and the register allocator decides to spill X, it is cheaper to emit this as:
-
-Y += [xslot]
-store Y -> [xslot]
-
-than as:
-
-tmp = [xslot]
-tmp += y
-store tmp -> [xslot]
-
-..and this uses one fewer register (so this should be done at load folding
-time, not at spiller time).  *Note* however that this can only be done
-if Y is dead.  Here's a testcase:
-
-@.str_3 = external global [15 x i8]
-declare void @printf(i32, ...)
-define void @main() {
-build_tree.exit:
-       br label %no_exit.i7
-
-no_exit.i7:            ; preds = %no_exit.i7, %build_tree.exit
-       %tmp.0.1.0.i9 = phi double [ 0.000000e+00, %build_tree.exit ],
-                                   [ %tmp.34.i18, %no_exit.i7 ]
-       %tmp.0.0.0.i10 = phi double [ 0.000000e+00, %build_tree.exit ],
-                                    [ %tmp.28.i16, %no_exit.i7 ]
-       %tmp.28.i16 = add double %tmp.0.0.0.i10, 0.000000e+00
-       %tmp.34.i18 = add double %tmp.0.1.0.i9, 0.000000e+00
-       br i1 false, label %Compute_Tree.exit23, label %no_exit.i7
-
-Compute_Tree.exit23:           ; preds = %no_exit.i7
-       tail call void (i32, ...)* @printf( i32 0 )
-       store double %tmp.34.i18, double* null
-       ret void
-}
-
-We currently emit:
-
-.BBmain_1:
-        xorpd %XMM1, %XMM1
-        addsd %XMM0, %XMM1
-***     movsd %XMM2, QWORD PTR [%ESP + 8]
-***     addsd %XMM2, %XMM1
-***     movsd QWORD PTR [%ESP + 8], %XMM2
-        jmp .BBmain_1   # no_exit.i7
-
-This is a bugpoint reduced testcase, which is why the testcase doesn't make
-much sense (e.g. its an infinite loop). :)
-
-//===---------------------------------------------------------------------===//
-
  SSE should implement 'select_cc' using 'emulated conditional moves' that use
  pcmp/pand/pandn/por to do a selection instead of a conditional branch:
  
  double %X(double %Y, double %Z, double %A, double %B) {
          %C = setlt double %A, %B
  SSE should implement 'select_cc' using 'emulated conditional moves' that use
  pcmp/pand/pandn/por to do a selection instead of a conditional branch:
  
  double %X(double %Y, double %Z, double %A, double %B) {
          %C = setlt double %A, %B
-        %z = add double %Z, 0.0    ;; select operand is not a load
+        %z = fadd double %Z, 0.0    ;; select operand is not a load
          %D = select bool %C, double %Y, double %z
          ret double %D
  }
          %D = select bool %C, double %Y, double %z
          ret double %D
  }
@@ -122,12 +123,6 @@ LBB_X_2:
  
  //===---------------------------------------------------------------------===//
  
  
  //===---------------------------------------------------------------------===//
  
-It's not clear whether we should use pxor or xorps / xorpd to clear XMM
-registers. The choice may depend on subtarget information. We should do some
-more experiments on different x86 machines.
-
-//===---------------------------------------------------------------------===//
-
  Lower memcpy / memset to a series of SSE 128 bit move instructions when it's
  feasible.
  
  Lower memcpy / memset to a series of SSE 128 bit move instructions when it's
  feasible.
  
@@ -151,45 +146,6 @@ Perhaps use pxor / xorp* to clear a XMM register first?
  
  //===---------------------------------------------------------------------===//
  
  
  //===---------------------------------------------------------------------===//
  
-How to decide when to use the "floating point version" of logical ops? Here are
-some code fragments:
-
-       movaps LCPI5_5, %xmm2
-       divps %xmm1, %xmm2
-       mulps %xmm2, %xmm3
-       mulps 8656(%ecx), %xmm3
-       addps 8672(%ecx), %xmm3
-       andps LCPI5_6, %xmm2
-       andps LCPI5_1, %xmm3
-       por %xmm2, %xmm3
-       movdqa %xmm3, (%edi)
-
-       movaps LCPI5_5, %xmm1
-       divps %xmm0, %xmm1
-       mulps %xmm1, %xmm3
-       mulps 8656(%ecx), %xmm3
-       addps 8672(%ecx), %xmm3
-       andps LCPI5_6, %xmm1
-       andps LCPI5_1, %xmm3
-       orps %xmm1, %xmm3
-       movaps %xmm3, 112(%esp)
-       movaps %xmm3, (%ebx)
-
-Due to some minor source change, the later case ended up using orps and movaps
-instead of por and movdqa. Does it matter?
-
-//===---------------------------------------------------------------------===//
-
-X86RegisterInfo::copyRegToReg() returns X86::MOVAPSrr for VR128. Is it possible
-to choose between movaps, movapd, and movdqa based on types of source and
-destination?
-
-How about andps, andpd, and pand? Do we really care about the type of the packed
-elements? If not, why not always use the "ps" variants which are likely to be
-shorter.
-
-//===---------------------------------------------------------------------===//
-
  External test Nurbs exposed some problems. Look for
  __ZN15Nurbs_SSE_Cubic17TessellateSurfaceE, bb cond_next140. This is what icc
  emits:
  External test Nurbs exposed some problems. Look for
  __ZN15Nurbs_SSE_Cubic17TessellateSurfaceE, bb cond_next140. This is what icc
  emits:
@@ -278,41 +234,6 @@ It also exposes some other problems. See MOV32ri -3 and the spills.
  
  //===---------------------------------------------------------------------===//
  
  
  //===---------------------------------------------------------------------===//
  
-http://gcc.gnu.org/bugzilla/show_bug.cgi?id=25500
-
-LLVM is producing bad code.
-
-LBB_main_4:    # cond_true44
-       addps %xmm1, %xmm2
-       subps %xmm3, %xmm2
-       movaps (%ecx), %xmm4
-       movaps %xmm2, %xmm1
-       addps %xmm4, %xmm1
-       addl $16, %ecx
-       incl %edx
-       cmpl $262144, %edx
-       movaps %xmm3, %xmm2
-       movaps %xmm4, %xmm3
-       jne LBB_main_4  # cond_true44
-
-There are two problems. 1) No need to two loop induction variables. We can
-compare against 262144 * 16. 2) Known register coalescer issue. We should
-be able eliminate one of the movaps:
-
-       addps %xmm2, %xmm1    <=== Commute!
-       subps %xmm3, %xmm1
-       movaps (%ecx), %xmm4
-       movaps %xmm1, %xmm1   <=== Eliminate!
-       addps %xmm4, %xmm1
-       addl $16, %ecx
-       incl %edx
-       cmpl $262144, %edx
-       movaps %xmm3, %xmm2
-       movaps %xmm4, %xmm3
-       jne LBB_main_4  # cond_true44
-
-//===---------------------------------------------------------------------===//
-
  Consider:
  
  __m128 test(float a) {
  Consider:
  
  __m128 test(float a) {
@@ -376,128 +297,12 @@ ret
  ... saving two instructions.
  
  The basic idea is that a reload from a spill slot, can, if only one 4-byte 
  ... saving two instructions.
  
  The basic idea is that a reload from a spill slot, can, if only one 4-byte 
-chunk is used, bring in 3 zeros the the one element instead of 4 elements.
+chunk is used, bring in 3 zeros and the one element instead of 4 elements.
  This can be used to simplify a variety of shuffle operations, where the
  elements are fixed zeros.
  
  //===---------------------------------------------------------------------===//
  
  This can be used to simplify a variety of shuffle operations, where the
  elements are fixed zeros.
  
  //===---------------------------------------------------------------------===//
  
-For this:
-
-#include <emmintrin.h>
-void test(__m128d *r, __m128d *A, double B) {
-  *r = _mm_loadl_pd(*A, &B);
-}
-
-We generates:
-
-       subl $12, %esp
-       movsd 24(%esp), %xmm0
-       movsd %xmm0, (%esp)
-       movl 20(%esp), %eax
-       movapd (%eax), %xmm0
-       movlpd (%esp), %xmm0
-       movl 16(%esp), %eax
-       movapd %xmm0, (%eax)
-       addl $12, %esp
-       ret
-
-icc generates:
-
-        movl      4(%esp), %edx                                 #3.6
-        movl      8(%esp), %eax                                 #3.6
-        movapd    (%eax), %xmm0                                 #4.22
-        movlpd    12(%esp), %xmm0                               #4.8
-        movapd    %xmm0, (%edx)                                 #4.3
-        ret                                                     #5.1
-
-So icc is smart enough to know that B is in memory so it doesn't load it and
-store it back to stack.
-
-This should be fixed by eliminating the llvm.x86.sse2.loadl.pd intrinsic, 
-lowering it to a load+insertelement instead.  Already match the load+shuffle 
-as movlpd, so this should be easy.  We already get optimal code for:
-
-define void @test2(<2 x double>* %r, <2 x double>* %A, double %B) {
-entry:
-       %tmp2 = load <2 x double>* %A, align 16
-       %tmp8 = insertelement <2 x double> %tmp2, double %B, i32 0
-       store <2 x double> %tmp8, <2 x double>* %r, align 16
-       ret void
-}
-
-//===---------------------------------------------------------------------===//
-
-Consider (PR2108):
-
-#include <xmmintrin.h>
-__m128i doload64(unsigned long long x) { return _mm_loadl_epi64(&x);}
-__m128i doload64_2(unsigned long long *x) { return _mm_loadl_epi64(x);}
-
-These are very similar routines, but we generate significantly worse code for
-the first one on x86-32:
-
-_doload64:
-       subl    $12, %esp
-       movl    20(%esp), %eax
-       movl    %eax, 4(%esp)
-       movl    16(%esp), %eax
-       movl    %eax, (%esp)
-       movsd   (%esp), %xmm0
-       addl    $12, %esp
-       ret
-_doload64_2:
-       movl    4(%esp), %eax
-       movsd   (%eax), %xmm0
-       ret
-
-The problem is that the argument lowering logic splits the i64 argument into
-2x i32 loads early, the f64 insert doesn't match.  Here's a reduced testcase:
-
-define fastcc double @doload64(i64 %x) nounwind  {
-entry:
-       %tmp717 = bitcast i64 %x to double              ; <double> [#uses=1]
-       ret double %tmp717
-}
-
-compiles to:
-
-_doload64:
-       subl    $12, %esp
-       movl    20(%esp), %eax
-       movl    %eax, 4(%esp)
-       movl    16(%esp), %eax
-       movl    %eax, (%esp)
-       movsd   (%esp), %xmm0
-       addl    $12, %esp
-       ret
-
-instead of movsd from the stack.  This is actually not too bad to implement. The
-best way to do this is to implement a dag combine that turns 
-bitconvert(build_pair(load a, load b)) into one load of the right type.  The
-only trick to this is writing the predicate that determines that a/b are at the
-right offset from each other.  For the enterprising hacker, InferAlignment is a
-helpful place to start poking if interested.
-
-
-//===---------------------------------------------------------------------===//
-
-__m128d test1( __m128d A, __m128d B) {
-  return _mm_shuffle_pd(A, B, 0x3);
-}
-
-compiles to
-
-shufpd $3, %xmm1, %xmm0
-
-Perhaps it's better to use unpckhpd instead?
-
-unpckhpd %xmm1, %xmm0
-
-Don't know if unpckhpd is faster. But it is shorter.
-
-//===---------------------------------------------------------------------===//
-
  This code generates ugly code, probably due to costs being off or something:
  
  define void @test(float* %P, <4 x float>* %P2 ) {
  This code generates ugly code, probably due to costs being off or something:
  
  define void @test(float* %P, <4 x float>* %P2 ) {
@@ -559,75 +364,6 @@ nodes which are selected to max / min instructions that are marked commutable.
  
  //===---------------------------------------------------------------------===//
  
  
  //===---------------------------------------------------------------------===//
  
-We should compile this:
-#include <xmmintrin.h>
-typedef union {
-  int i[4];
-  float f[4];
-  __m128 v;
-} vector4_t;
-void swizzle (const void *a, vector4_t * b, vector4_t * c) {
-  b->v = _mm_loadl_pi (b->v, (__m64 *) a);
-  c->v = _mm_loadl_pi (c->v, ((__m64 *) a) + 1);
-}
-
-to:
-
-_swizzle:
-        movl    4(%esp), %eax
-        movl    8(%esp), %edx
-        movl    12(%esp), %ecx
-        movlps  (%eax), %xmm0
-        movlps  %xmm0, (%edx)
-        movlps  8(%eax), %xmm0
-        movlps  %xmm0, (%ecx)
-        ret
-
-not:
-
-swizzle:
-        movl 8(%esp), %eax
-        movaps (%eax), %xmm0
-        movl 4(%esp), %ecx
-        movlps (%ecx), %xmm0
-        movaps %xmm0, (%eax)
-        movl 12(%esp), %eax
-        movaps (%eax), %xmm0
-        movlps 8(%ecx), %xmm0
-        movaps %xmm0, (%eax)
-        ret
-
-//===---------------------------------------------------------------------===//
-
-These functions should produce the same code:
-
-#include <emmintrin.h>
-
-typedef long long __m128i __attribute__ ((__vector_size__ (16)));
-
-int foo(__m128i* val) {
-  return __builtin_ia32_vec_ext_v4si(*val, 1);
-}
-int bar(__m128i* val) {
-  union vs {
-    __m128i *_v;
-    int* _s;
-  } v = {val};
-  return v._s[1];
-}
-
-We currently produce (with -m64):
-
-_foo:
-        pshufd $1, (%rdi), %xmm0
-        movd %xmm0, %eax
-        ret
-_bar:
-        movl 4(%rdi), %eax
-        ret
-
-//===---------------------------------------------------------------------===//
-
  We should materialize vector constants like "all ones" and "signbit" with 
  code like:
  
  We should materialize vector constants like "all ones" and "signbit" with 
  code like:
  
@@ -714,10 +450,11 @@ eliminates a constant pool load.  For example, consider:
  
  define i64 @ccosf(float %z.0, float %z.1) nounwind readonly  {
  entry:
  
  define i64 @ccosf(float %z.0, float %z.1) nounwind readonly  {
  entry:
- %tmp6 = sub float -0.000000e+00, %z.1         ; <float> [#uses=1]
+ %tmp6 = fsub float -0.000000e+00, %z.1                ; <float> [#uses=1]
   %tmp20 = tail call i64 @ccoshf( float %tmp6, float %z.0 ) nounwind readonly
   ret i64 %tmp20
  }
   %tmp20 = tail call i64 @ccoshf( float %tmp6, float %z.0 ) nounwind readonly
   ret i64 %tmp20
  }
+declare i64 @ccoshf(float %z.0, float %z.1) nounwind readonly
  
  This currently compiles to:
  
  
  This currently compiles to:
  
@@ -811,31 +548,6 @@ or iseling it.
  
  //===---------------------------------------------------------------------===//
  
  
  //===---------------------------------------------------------------------===//
  
-Take the following code:
-
-#include <xmmintrin.h>
-__m128i doload64(short x) {return _mm_set_epi16(x,x,x,x,x,x,x,x);}
-
-LLVM currently generates the following on x86:
-doload64:
-        movzwl  4(%esp), %eax
-        movd    %eax, %xmm0
-        punpcklwd       %xmm0, %xmm0
-        pshufd  $0, %xmm0, %xmm0
-        ret
-
-gcc's generated code:
-doload64:
-        movd    4(%esp), %xmm0
-        punpcklwd       %xmm0, %xmm0
-        pshufd  $0, %xmm0, %xmm0
-        ret
-
-LLVM should be able to generate the same thing as gcc.  This looks like it is
-just a matter of matching (scalar_to_vector (load x)) to movd.
-
-//===---------------------------------------------------------------------===//
-
  LLVM currently generates stack realignment code, when it is not necessary
  needed. The problem is that we need to know about stack alignment too early,
  before RA runs.
  LLVM currently generates stack realignment code, when it is not necessary
  needed. The problem is that we need to know about stack alignment too early,
  before RA runs.
@@ -872,4 +584,343 @@ madd:
  
  //===---------------------------------------------------------------------===//
  
  
  //===---------------------------------------------------------------------===//
  
+Consider:
+#include <emmintrin.h> 
+__m128 foo2 (float x) {
+ return _mm_set_ps (0, 0, x, 0);
+}
+
+In x86-32 mode, we generate this spiffy code:
+
+_foo2:
+       movss   4(%esp), %xmm0
+       pshufd  $81, %xmm0, %xmm0
+       ret
+
+In x86-64 mode, we generate this code, which could be better:
+
+_foo2:
+       xorps   %xmm1, %xmm1
+       movss   %xmm0, %xmm1
+       pshufd  $81, %xmm1, %xmm0
+       ret
+
+In sse4 mode, we could use insertps to make both better.
+
+Here's another testcase that could use insertps [mem]:
+
+#include <xmmintrin.h>
+extern float x2, x3;
+__m128 foo1 (float x1, float x4) {
+ return _mm_set_ps (x2, x1, x3, x4);
+}
+
+gcc mainline compiles it to:
+
+foo1:
+       insertps        $0x10, x2(%rip), %xmm0
+       insertps        $0x10, x3(%rip), %xmm1
+       movaps  %xmm1, %xmm2
+       movlhps %xmm0, %xmm2
+       movaps  %xmm2, %xmm0
+       ret
+
+//===---------------------------------------------------------------------===//
+
+We compile vector multiply-by-constant into poor code:
+
+define <4 x i32> @f(<4 x i32> %i) nounwind  {
+       %A = mul <4 x i32> %i, < i32 10, i32 10, i32 10, i32 10 >
+       ret <4 x i32> %A
+}
+
+On targets without SSE4.1, this compiles into:
+
+LCPI1_0:                                       ##  <4 x i32>
+       .long   10
+       .long   10
+       .long   10
+       .long   10
+       .text
+       .align  4,0x90
+       .globl  _f
+_f:
+       pshufd  $3, %xmm0, %xmm1
+       movd    %xmm1, %eax
+       imull   LCPI1_0+12, %eax
+       movd    %eax, %xmm1
+       pshufd  $1, %xmm0, %xmm2
+       movd    %xmm2, %eax
+       imull   LCPI1_0+4, %eax
+       movd    %eax, %xmm2
+       punpckldq       %xmm1, %xmm2
+       movd    %xmm0, %eax
+       imull   LCPI1_0, %eax
+       movd    %eax, %xmm1
+       movhlps %xmm0, %xmm0
+       movd    %xmm0, %eax
+       imull   LCPI1_0+8, %eax
+       movd    %eax, %xmm0
+       punpckldq       %xmm0, %xmm1
+       movaps  %xmm1, %xmm0
+       punpckldq       %xmm2, %xmm0
+       ret
+
+It would be better to synthesize integer vector multiplication by constants
+using shifts and adds, pslld and paddd here. And even on targets with SSE4.1,
+simple cases such as multiplication by powers of two would be better as
+vector shifts than as multiplications.
+
+//===---------------------------------------------------------------------===//
+
+We compile this:
+
+__m128i
+foo2 (char x)
+{
+  return _mm_set_epi8 (1, 0, 0, 0, 0, 0, 0, 0, 0, x, 0, 1, 0, 0, 0, 0);
+}
+
+into:
+       movl    $1, %eax
+       xorps   %xmm0, %xmm0
+       pinsrw  $2, %eax, %xmm0
+       movzbl  4(%esp), %eax
+       pinsrw  $3, %eax, %xmm0
+       movl    $256, %eax
+       pinsrw  $7, %eax, %xmm0
+       ret
+
+
+gcc-4.2:
+       subl    $12, %esp
+       movzbl  16(%esp), %eax
+       movdqa  LC0, %xmm0
+       pinsrw  $3, %eax, %xmm0
+       addl    $12, %esp
+       ret
+       .const
+       .align 4
+LC0:
+       .word   0
+       .word   0
+       .word   1
+       .word   0
+       .word   0
+       .word   0
+       .word   0
+       .word   256
+
+With SSE4, it should be
+      movdqa  .LC0(%rip), %xmm0
+      pinsrb  $6, %edi, %xmm0
+
+//===---------------------------------------------------------------------===//
+
+We should transform a shuffle of two vectors of constants into a single vector
+of constants. Likewise, insertelement of a constant into a vector of constants
+should result in a vector of constants (e.g. 2008-06-25-VecISelBug.ll).
+
+We compiled it to something horrible:
+
+       .align  4
+LCPI1_1:                                       ##  float
+       .long   1065353216      ## float 1
+       .const
+
+       .align  4
+LCPI1_0:                                       ##  <4 x float>
+       .space  4
+       .long   1065353216      ## float 1
+       .space  4
+       .long   1065353216      ## float 1
+       .text
+       .align  4,0x90
+       .globl  _t
+_t:
+       xorps   %xmm0, %xmm0
+       movhps  LCPI1_0, %xmm0
+       movss   LCPI1_1, %xmm1
+       movaps  %xmm0, %xmm2
+       shufps  $2, %xmm1, %xmm2
+       shufps  $132, %xmm2, %xmm0
+       movaps  %xmm0, 0
+
+//===---------------------------------------------------------------------===//
+rdar://5907648
+
+This function:
+
+float foo(unsigned char x) {
+  return x;
+}
+
+compiles to this IR:
+
+define float @foo(i8 zeroext  %x) nounwind  {
+       %tmp12 = uitofp i8 %x to float          ; <float> [#uses=1]
+       ret float %tmp12
+}
+
+which compiles to (x86-32):
+
+_foo:
+       subl    $4, %esp
+       movzbl  8(%esp), %eax
+       cvtsi2ss        %eax, %xmm0
+       movss   %xmm0, (%esp)
+       flds    (%esp)
+       addl    $4, %esp
+       ret
+
+We should be able to use:
+  cvtsi2ss 8(%esp), %xmm0
+since we know the stack slot is already zext'd.
+
+//===---------------------------------------------------------------------===//
+
+Consider using movlps instead of movsd to implement (scalar_to_vector (loadf64))
+when code size is critical. movlps is slower than movsd on core2 but it's one
+byte shorter.
+
+//===---------------------------------------------------------------------===//
+
+We should use a dynamic programming based approach to tell when using FPStack
+operations is cheaper than SSE.  SciMark montecarlo contains code like this
+for example:
+
+double MonteCarlo_num_flops(int Num_samples) {
+    return ((double) Num_samples)* 4.0;
+}
+
+In fpstack mode, this compiles into:
+
+LCPI1_0:                                       
+       .long   1082130432      ## float 4.000000e+00
+_MonteCarlo_num_flops:
+       subl    $4, %esp
+       movl    8(%esp), %eax
+       movl    %eax, (%esp)
+       fildl   (%esp)
+       fmuls   LCPI1_0
+       addl    $4, %esp
+       ret
+        
+in SSE mode, it compiles into significantly slower code:
+
+_MonteCarlo_num_flops:
+       subl    $12, %esp
+       cvtsi2sd        16(%esp), %xmm0
+       mulsd   LCPI1_0, %xmm0
+       movsd   %xmm0, (%esp)
+       fldl    (%esp)
+       addl    $12, %esp
+       ret
+
+There are also other cases in scimark where using fpstack is better.  For
+example, it is cheaper to do fld1 than to load from a constant pool, so
+"load, add 1.0, store" is better done in the fp stack, etc.
+
+//===---------------------------------------------------------------------===//
+
+The X86 backend should be able to if-convert SSE comparisons like "ucomisd" to
+"cmpsd".  For example, this code:
+
+double d1(double x) { return x == x ? x : x + x; }
+
+Compiles into:
+
+_d1:
+       ucomisd %xmm0, %xmm0
+       jnp     LBB1_2
+       addsd   %xmm0, %xmm0
+       ret
+LBB1_2:
+       ret
+
+Also, the 'ret's should be shared.  This is PR6032.
+
+//===---------------------------------------------------------------------===//
+
+These should compile into the same code (PR6214).  Perhaps instcombine should
+canonicalize the former into the latter?
+
+define float @foo(float %x) nounwind {
+  %t = bitcast float %x to i32
+  %s = and i32 %t, 2147483647
+  %d = bitcast i32 %s to float
+  ret float %d
+}
+
+declare float @fabsf(float %n)
+define float @bar(float %x) nounwind {
+  %d = call float @fabsf(float %x)
+  ret float %d
+}
+
+//===---------------------------------------------------------------------===//
+
+This IR (from PR6194):
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
+target triple = "x86_64-apple-darwin10.0.0"
+
+%0 = type { double, double }
+%struct.float3 = type { float, float, float }
+
+define void @test(%0, %struct.float3* nocapture %res) nounwind noinline ssp {
+entry:
+  %tmp18 = extractvalue %0 %0, 0                  ; <double> [#uses=1]
+  %tmp19 = bitcast double %tmp18 to i64           ; <i64> [#uses=1]
+  %tmp20 = zext i64 %tmp19 to i128                ; <i128> [#uses=1]
+  %tmp10 = lshr i128 %tmp20, 32                   ; <i128> [#uses=1]
+  %tmp11 = trunc i128 %tmp10 to i32               ; <i32> [#uses=1]
+  %tmp12 = bitcast i32 %tmp11 to float            ; <float> [#uses=1]
+  %tmp5 = getelementptr inbounds %struct.float3* %res, i64 0, i32 1 ; <float*> [#uses=1]
+  store float %tmp12, float* %tmp5
+  ret void
+}
+
+Compiles to:
+
+_test:                                  ## @test
+       movd    %xmm0, %rax
+       shrq    $32, %rax
+       movl    %eax, 4(%rdi)
+       ret
+
+This would be better kept in the SSE unit by treating XMM0 as a 4xfloat and
+doing a shuffle from v[1] to v[0] then a float store.
+
+//===---------------------------------------------------------------------===//
+
+On SSE4 machines, we compile this code:
+
+define <2 x float> @test2(<2 x float> %Q, <2 x float> %R,
+       <2 x float> *%P) nounwind {
+  %Z = fadd <2 x float> %Q, %R
+
+  store <2 x float> %Z, <2 x float> *%P
+  ret <2 x float> %Z
+}
+
+into:
+
+_test2:                                 ## @test2
+## BB#0:
+       insertps        $0, %xmm2, %xmm2
+       insertps        $16, %xmm3, %xmm2
+       insertps        $0, %xmm0, %xmm3
+       insertps        $16, %xmm1, %xmm3
+       addps   %xmm2, %xmm3
+       movq    %xmm3, (%rdi)
+       movaps  %xmm3, %xmm0
+       pshufd  $1, %xmm3, %xmm1
+                                        ## kill: XMM1<def> XMM1<kill>
+       ret
+
+The insertps's of $0 are pointless complex copies.
+
+//===---------------------------------------------------------------------===//
+