Turn on vzeroupper insertion on call boundaries for AVX; it works as far as I know...

[oota-llvm.git] / lib / Target / X86 / README-SSE.txt
diff --git a/lib/Target/X86/README-SSE.txt b/lib/Target/X86/README-SSE.txt

index 2a8506fd4f8c5147d2783278a387540d00a0493f..7d901afae47499b401dc409f664922e6fc1da4ca 100644 (file)
--- a/lib/Target/X86/README-SSE.txt
+++ b/lib/Target/X86/README-SSE.txt
@@ -2,8 +2,65 @@
  // Random ideas for the X86 backend: SSE-specific stuff.
  //===---------------------------------------------------------------------===//
  
-- Consider eliminating the unaligned SSE load intrinsics, replacing them with
-  unaligned LLVM load instructions.
+//===---------------------------------------------------------------------===//
+
+SSE Variable shift can be custom lowered to something like this, which uses a
+small table + unaligned load + shuffle instead of going through memory.
+
+__m128i_shift_right:
+       .byte     0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15
+       .byte    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
+
+...
+__m128i shift_right(__m128i value, unsigned long offset) {
+  return _mm_shuffle_epi8(value,
+               _mm_loadu_si128((__m128i *) (__m128i_shift_right + offset)));
+}
+
+//===---------------------------------------------------------------------===//
+
+SSE has instructions for doing operations on complex numbers; we should pattern
+match them. For example, this should turn into a horizontal add:
+
+typedef float __attribute__((vector_size(16))) v4f32;
+float f32(v4f32 A) {
+  return A[0]+A[1]+A[2]+A[3];
+}
+
+Instead we get this:
+
+_f32:                                   ## @f32
+       pshufd  $1, %xmm0, %xmm1        ## xmm1 = xmm0[1,0,0,0]
+       addss   %xmm0, %xmm1
+       pshufd  $3, %xmm0, %xmm2        ## xmm2 = xmm0[3,0,0,0]
+       movhlps %xmm0, %xmm0            ## xmm0 = xmm0[1,1]
+       movaps  %xmm0, %xmm3
+       addss   %xmm1, %xmm3
+       movdqa  %xmm2, %xmm0
+       addss   %xmm3, %xmm0
+       ret
+
+Also, there are cases where some simple local SLP would improve codegen a bit.
+Compiling this:
+
+_Complex float f32(_Complex float A, _Complex float B) {
+  return A+B;
+}
+
+into:
+
+_f32:                                   ## @f32
+       movdqa  %xmm0, %xmm2
+       addss   %xmm1, %xmm2
+       pshufd  $1, %xmm1, %xmm1        ## xmm1 = xmm1[1,0,0,0]
+       pshufd  $1, %xmm0, %xmm3        ## xmm3 = xmm0[1,0,0,0]
+       addss   %xmm1, %xmm3
+       movaps  %xmm2, %xmm0
+       unpcklps        %xmm3, %xmm0    ## xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+       ret
+
+seems silly when it could just be one addps.
+
  
  //===---------------------------------------------------------------------===//
  
@@ -89,16 +146,6 @@ Perhaps use pxor / xorp* to clear a XMM register first?
  
  //===---------------------------------------------------------------------===//
  
-X86RegisterInfo::copyRegToReg() returns X86::MOVAPSrr for VR128. Is it possible
-to choose between movaps, movapd, and movdqa based on types of source and
-destination?
-
-How about andps, andpd, and pand? Do we really care about the type of the packed
-elements? If not, why not always use the "ps" variants which are likely to be
-shorter.
-
-//===---------------------------------------------------------------------===//
-
  External test Nurbs exposed some problems. Look for
  __ZN15Nurbs_SSE_Cubic17TessellateSurfaceE, bb cond_next140. This is what icc
  emits:
@@ -815,7 +862,7 @@ define float @bar(float %x) nounwind {
  
  This IR (from PR6194):
  
-target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
  target triple = "x86_64-apple-darwin10.0.0"
  
  %0 = type { double, double }
@@ -876,4 +923,15 @@ The insertps's of $0 are pointless complex copies.
  
  //===---------------------------------------------------------------------===//
  
+If SSE4.1 is available we should inline rounding functions instead of emitting
+a libcall.
+
+floor: roundsd $0x01, %xmm, %xmm
+ceil:  roundsd $0x02, %xmm, %xmm
  
+and likewise for the single precision versions.
+
+Currently, SelectionDAGBuilder doesn't turn calls to these functions into the
+corresponding nodes and some targets (including X86) aren't ready for them.
+
+//===---------------------------------------------------------------------===//