done

[oota-llvm.git] / lib / Target / X86 / README-SSE.txt
diff --git a/lib/Target/X86/README-SSE.txt b/lib/Target/X86/README-SSE.txt

index e019afa966fc7603301269febe4db3810b5eb20d..08dcc278a1b894ea4849b1c010ef68d2b97b8d99 100644 (file)
--- a/lib/Target/X86/README-SSE.txt
+++ b/lib/Target/X86/README-SSE.txt
@@ -4,6 +4,11 @@
  
  //===---------------------------------------------------------------------===//
  
+Expand libm rounding functions inline:  Significant speedups possible.
+http://gcc.gnu.org/ml/gcc-patches/2006-10/msg00909.html
+
+//===---------------------------------------------------------------------===//
+
  When compiled with unsafemath enabled, "main" should enable SSE DAZ mode and
  other fast SSE modes.
  
@@ -136,17 +141,6 @@ This will be solved when we go to a dynamic programming based isel.
  
  //===---------------------------------------------------------------------===//
  
-Should generate min/max for stuff like:
-
-void minf(float a, float b, float *X) {
-  *X = a <= b ? a : b;
-}
-
-Make use of floating point min / max instructions. Perhaps introduce ISD::FMIN
-and ISD::FMAX node types?
-
-//===---------------------------------------------------------------------===//
-
  Lower memcpy / memset to a series of SSE 128 bit move instructions when it's
  feasible.
  
@@ -196,29 +190,6 @@ Perhaps use pxor / xorp* to clear a XMM register first?
  
  //===---------------------------------------------------------------------===//
  
-Better codegen for:
-
-void f(float a, float b, vector float * out) { *out = (vector float){ a, 0.0, 0.0, b}; }
-void f(float a, float b, vector float * out) { *out = (vector float){ a, b, 0.0, 0}; }
-
-For the later we generate:
-
-_f:
-        pxor %xmm0, %xmm0
-        movss 8(%esp), %xmm1
-        movaps %xmm0, %xmm2
-        unpcklps %xmm1, %xmm2
-        movss 4(%esp), %xmm1
-        unpcklps %xmm0, %xmm1
-        unpcklps %xmm2, %xmm1
-        movl 12(%esp), %eax
-        movaps %xmm1, (%eax)
-        ret
-
-This seems like it should use shufps, one for each of a & b.
-
-//===---------------------------------------------------------------------===//
-
  How to decide when to use the "floating point version" of logical ops? Here are
  some code fragments:
  
@@ -551,32 +522,53 @@ Add hooks to commute some CMPP operations.
  
  //===---------------------------------------------------------------------===//
  
-Implement some missing insert/extract element operations without going through
-the stack.  Testcase here:
-CodeGen/X86/vec_ins_extract.ll
-corresponds to this C code:
+Apply the same transformation that merged four float loads into a single
+128-bit load to loads from the constant pool.
  
-typedef float vectorfloat __attribute__((vector_size(16)));
-void test(vectorfloat *F, float f) {
-  vectorfloat G = *F + *F;
-  *((float*)&G) = f;
-  *F = G + G;
-}
-void test2(vectorfloat *F, float f) {
-  vectorfloat G = *F + *F;
-  ((float*)&G)[2] = f;
-  *F = G + G;
-}
-void test3(vectorfloat *F, float *f) {
-  vectorfloat G = *F + *F;
-  *f = ((float*)&G)[2];
-}
-void test4(vectorfloat *F, float *f) {
-  vectorfloat G = *F + *F;
-  *f = *((float*)&G);
-}
+//===---------------------------------------------------------------------===//
+
+Floating point max / min are commutable when -enable-unsafe-fp-math is
+specified. We should turn int_x86_sse_max_ss and X86ISD::FMIN etc. into other
+nodes which are selected to max / min instructions that are marked commutable.
  
  //===---------------------------------------------------------------------===//
  
-Apply the same transformation that merged four float into a single 128-bit load
-to loads from constant pool.
+We should compile this:
+#include <xmmintrin.h>
+typedef union {
+  int i[4];
+  float f[4];
+  __m128 v;
+} vector4_t;
+void swizzle (const void *a, vector4_t * b, vector4_t * c) {
+  b->v = _mm_loadl_pi (b->v, (__m64 *) a);
+  c->v = _mm_loadl_pi (c->v, ((__m64 *) a) + 1);
+}
+
+to:
+
+_swizzle:
+        movl    4(%esp), %eax
+        movl    8(%esp), %edx
+        movl    12(%esp), %ecx
+        movlps  (%eax), %xmm0
+        movlps  %xmm0, (%edx)
+        movlps  8(%eax), %xmm0
+        movlps  %xmm0, (%ecx)
+        ret
+
+not:
+
+swizzle:
+        movl 8(%esp), %eax
+        movaps (%eax), %xmm0
+        movl 4(%esp), %ecx
+        movlps (%ecx), %xmm0
+        movaps %xmm0, (%eax)
+        movl 12(%esp), %eax
+        movaps (%eax), %xmm0
+        movlps 8(%ecx), %xmm0
+        movaps %xmm0, (%eax)
+        ret
+
+//===---------------------------------------------------------------------===//