done

[oota-llvm.git] / lib / Target / X86 / README-SSE.txt
diff --git a/lib/Target/X86/README-SSE.txt b/lib/Target/X86/README-SSE.txt

index 4bfebc156127e15067b9d85787d60c86c4a9d4b2..08dcc278a1b894ea4849b1c010ef68d2b97b8d99 100644 (file)
--- a/lib/Target/X86/README-SSE.txt
+++ b/lib/Target/X86/README-SSE.txt
@@ -18,11 +18,6 @@ Think about doing i64 math in SSE regs.
  
  //===---------------------------------------------------------------------===//
  
-Bitcast to<->from SSE registers should use movd/movq instead of going through
-the stack.  Testcase here: CodeGen/X86/bitcast.ll
-
-//===---------------------------------------------------------------------===//
-
  This testcase should have no SSE instructions in it, and only one load from
  a constant pool:
  
@@ -538,5 +533,42 @@ nodes which are selected to max / min instructions that are marked commutable.
  
  //===---------------------------------------------------------------------===//
  
-Add MOVDI2SSrr and MOVDSS2DIrr to X86RegisterInfo::foldMemoryOperand() once the
-recent X86 JIT regressions have been identified and fixed.
+We should compile this (each _mm_loadl_pi replaces only the low 64 bits, so a 64-bit movlps load/store pair suffices):
+#include <xmmintrin.h>
+typedef union {
+  int i[4];
+  float f[4];
+  __m128 v;
+} vector4_t;
+void swizzle (const void *a, vector4_t * b, vector4_t * c) {
+  b->v = _mm_loadl_pi (b->v, (__m64 *) a);
+  c->v = _mm_loadl_pi (c->v, ((__m64 *) a) + 1);
+}
+
+to:
+
+_swizzle:
+        movl    4(%esp), %eax
+        movl    8(%esp), %edx
+        movl    12(%esp), %ecx
+        movlps  (%eax), %xmm0
+        movlps  %xmm0, (%edx)
+        movlps  8(%eax), %xmm0
+        movlps  %xmm0, (%ecx)
+        ret
+
+rather than the longer sequence with full 128-bit movaps loads and stores:
+
+swizzle:
+        movl 8(%esp), %eax
+        movaps (%eax), %xmm0
+        movl 4(%esp), %ecx
+        movlps (%ecx), %xmm0
+        movaps %xmm0, (%eax)
+        movl 12(%esp), %eax
+        movaps (%eax), %xmm0
+        movlps 8(%ecx), %xmm0
+        movaps %xmm0, (%eax)
+        ret
+
+//===---------------------------------------------------------------------===//