X-Git-Url: http://demsky.eecs.uci.edu/git/?a=blobdiff_plain;f=lib%2FTarget%2FX86%2FREADME-SSE.txt;h=496b704ee85fb16878eb8585a77121b78497a674;hb=b8f0d89d0584e37e205c04ed5753f57a23365403;hp=7269fa2964583579690203add2888b45ad55967f;hpb=d4083d01d275de9d30b0e23a03d8bd1984c8ddab;p=oota-llvm.git

diff --git a/lib/Target/X86/README-SSE.txt b/lib/Target/X86/README-SSE.txt
index 7269fa29645..496b704ee85 100644
--- a/lib/Target/X86/README-SSE.txt
+++ b/lib/Target/X86/README-SSE.txt
@@ -2,8 +2,65 @@
 // Random ideas for the X86 backend: SSE-specific stuff.
 //===---------------------------------------------------------------------===//
 
-- Consider eliminating the unaligned SSE load intrinsics, replacing them with
-  unaligned LLVM load instructions.
+//===---------------------------------------------------------------------===//
+
+SSE Variable shift can be custom lowered to something like this, which uses a
+small table + unaligned load + shuffle instead of going through memory.
+
+__m128i_shift_right:
+	.byte	  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15
+	.byte	 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
+
+...
+__m128i shift_right(__m128i value, unsigned long offset) {
+  return _mm_shuffle_epi8(value,
+               _mm_loadu_si128((__m128 *) (___m128i_shift_right + offset)));
+}
+
+//===---------------------------------------------------------------------===//
+
+SSE has instructions for doing operations on complex numbers, we should pattern
+match them.   For example, this should turn into a horizontal add:
+
+typedef float __attribute__((vector_size(16))) v4f32;
+float f32(v4f32 A) {
+  return A[0]+A[1]+A[2]+A[3];
+}
+
+Instead we get this:
+
+_f32:                                   ## @f32
+	pshufd	$1, %xmm0, %xmm1        ## xmm1 = xmm0[1,0,0,0]
+	addss	%xmm0, %xmm1
+	pshufd	$3, %xmm0, %xmm2        ## xmm2 = xmm0[3,0,0,0]
+	movhlps	%xmm0, %xmm0            ## xmm0 = xmm0[1,1]
+	movaps	%xmm0, %xmm3
+	addss	%xmm1, %xmm3
+	movdqa	%xmm2, %xmm0
+	addss	%xmm3, %xmm0
+	ret
+
+Also, there are cases where some simple local SLP would improve codegen a bit.
+compiling this:
+
+_Complex float f32(_Complex float A, _Complex float B) {
+  return A+B;
+}
+
+into:
+
+_f32:                                   ## @f32
+	movdqa	%xmm0, %xmm2
+	addss	%xmm1, %xmm2
+	pshufd	$1, %xmm1, %xmm1        ## xmm1 = xmm1[1,0,0,0]
+	pshufd	$1, %xmm0, %xmm3        ## xmm3 = xmm0[1,0,0,0]
+	addss	%xmm1, %xmm3
+	movaps	%xmm2, %xmm0
+	unpcklps	%xmm3, %xmm0    ## xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+	ret
+
+seems silly when it could just be one addps.
+
 
 //===---------------------------------------------------------------------===//
 
@@ -17,7 +74,7 @@ other fast SSE modes.
 
 //===---------------------------------------------------------------------===//
 
-Think about doing i64 math in SSE regs.
+Think about doing i64 math in SSE regs on x86-32.
 
 //===---------------------------------------------------------------------===//
 
@@ -36,68 +93,12 @@ The pattern isel got this one right.
 
 //===---------------------------------------------------------------------===//
 
-SSE doesn't have [mem] op= reg instructions.  If we have an SSE instruction
-like this:
-
-  X += y
-
-and the register allocator decides to spill X, it is cheaper to emit this as:
-
-Y += [xslot]
-store Y -> [xslot]
-
-than as:
-
-tmp = [xslot]
-tmp += y
-store tmp -> [xslot]
-
-..and this uses one fewer register (so this should be done at load folding
-time, not at spiller time).  *Note* however that this can only be done
-if Y is dead.  Here's a testcase:
-
-@.str_3 = external global [15 x i8]
-declare void @printf(i32, ...)
-define void @main() {
-build_tree.exit:
-	br label %no_exit.i7
-
-no_exit.i7:		; preds = %no_exit.i7, %build_tree.exit
-	%tmp.0.1.0.i9 = phi double [ 0.000000e+00, %build_tree.exit ],
-                                   [ %tmp.34.i18, %no_exit.i7 ]
-	%tmp.0.0.0.i10 = phi double [ 0.000000e+00, %build_tree.exit ],
-                                    [ %tmp.28.i16, %no_exit.i7 ]
-	%tmp.28.i16 = add double %tmp.0.0.0.i10, 0.000000e+00
-	%tmp.34.i18 = add double %tmp.0.1.0.i9, 0.000000e+00
-	br i1 false, label %Compute_Tree.exit23, label %no_exit.i7
-
-Compute_Tree.exit23:		; preds = %no_exit.i7
-	tail call void (i32, ...)* @printf( i32 0 )
-	store double %tmp.34.i18, double* null
-	ret void
-}
-
-We currently emit:
-
-.BBmain_1:
-        xorpd %XMM1, %XMM1
-        addsd %XMM0, %XMM1
-***     movsd %XMM2, QWORD PTR [%ESP + 8]
-***     addsd %XMM2, %XMM1
-***     movsd QWORD PTR [%ESP + 8], %XMM2
-        jmp .BBmain_1   # no_exit.i7
-
-This is a bugpoint reduced testcase, which is why the testcase doesn't make
-much sense (e.g. its an infinite loop). :)
-
-//===---------------------------------------------------------------------===//
-
 SSE should implement 'select_cc' using 'emulated conditional moves' that use
 pcmp/pand/pandn/por to do a selection instead of a conditional branch:
 
 double %X(double %Y, double %Z, double %A, double %B) {
         %C = setlt double %A, %B
-        %z = add double %Z, 0.0    ;; select operand is not a load
+        %z = fadd double %Z, 0.0    ;; select operand is not a load
         %D = select bool %C, double %Y, double %z
         ret double %D
 }
@@ -122,12 +123,6 @@ LBB_X_2:
 
 //===---------------------------------------------------------------------===//
 
-It's not clear whether we should use pxor or xorps / xorpd to clear XMM
-registers. The choice may depend on subtarget information. We should do some
-more experiments on different x86 machines.
-
-//===---------------------------------------------------------------------===//
-
 Lower memcpy / memset to a series of SSE 128 bit move instructions when it's
 feasible.
 
@@ -151,45 +146,6 @@ Perhaps use pxor / xorp* to clear a XMM register first?
 
 //===---------------------------------------------------------------------===//
 
-How to decide when to use the "floating point version" of logical ops? Here are
-some code fragments:
-
-	movaps LCPI5_5, %xmm2
-	divps %xmm1, %xmm2
-	mulps %xmm2, %xmm3
-	mulps 8656(%ecx), %xmm3
-	addps 8672(%ecx), %xmm3
-	andps LCPI5_6, %xmm2
-	andps LCPI5_1, %xmm3
-	por %xmm2, %xmm3
-	movdqa %xmm3, (%edi)
-
-	movaps LCPI5_5, %xmm1
-	divps %xmm0, %xmm1
-	mulps %xmm1, %xmm3
-	mulps 8656(%ecx), %xmm3
-	addps 8672(%ecx), %xmm3
-	andps LCPI5_6, %xmm1
-	andps LCPI5_1, %xmm3
-	orps %xmm1, %xmm3
-	movaps %xmm3, 112(%esp)
-	movaps %xmm3, (%ebx)
-
-Due to some minor source change, the later case ended up using orps and movaps
-instead of por and movdqa. Does it matter?
-
-//===---------------------------------------------------------------------===//
-
-X86RegisterInfo::copyRegToReg() returns X86::MOVAPSrr for VR128. Is it possible
-to choose between movaps, movapd, and movdqa based on types of source and
-destination?
-
-How about andps, andpd, and pand? Do we really care about the type of the packed
-elements? If not, why not always use the "ps" variants which are likely to be
-shorter.
-
-//===---------------------------------------------------------------------===//
-
 External test Nurbs exposed some problems. Look for
 __ZN15Nurbs_SSE_Cubic17TessellateSurfaceE, bb cond_next140. This is what icc
 emits:
@@ -278,41 +234,6 @@ It also exposes some other problems. See MOV32ri -3 and the spills.
 
 //===---------------------------------------------------------------------===//
 
-http://gcc.gnu.org/bugzilla/show_bug.cgi?id=25500
-
-LLVM is producing bad code.
-
-LBB_main_4:	# cond_true44
-	addps %xmm1, %xmm2
-	subps %xmm3, %xmm2
-	movaps (%ecx), %xmm4
-	movaps %xmm2, %xmm1
-	addps %xmm4, %xmm1
-	addl $16, %ecx
-	incl %edx
-	cmpl $262144, %edx
-	movaps %xmm3, %xmm2
-	movaps %xmm4, %xmm3
-	jne LBB_main_4	# cond_true44
-
-There are two problems. 1) No need to two loop induction variables. We can
-compare against 262144 * 16. 2) Known register coalescer issue. We should
-be able eliminate one of the movaps:
-
-	addps %xmm2, %xmm1    <=== Commute!
-	subps %xmm3, %xmm1
-	movaps (%ecx), %xmm4
-	movaps %xmm1, %xmm1   <=== Eliminate!
-	addps %xmm4, %xmm1
-	addl $16, %ecx
-	incl %edx
-	cmpl $262144, %edx
-	movaps %xmm3, %xmm2
-	movaps %xmm4, %xmm3
-	jne LBB_main_4	# cond_true44
-
-//===---------------------------------------------------------------------===//
-
 Consider:
 
 __m128 test(float a) {
@@ -376,74 +297,12 @@ ret
 ... saving two instructions.
 
 The basic idea is that a reload from a spill slot, can, if only one 4-byte 
-chunk is used, bring in 3 zeros the the one element instead of 4 elements.
+chunk is used, bring in 3 zeros the one element instead of 4 elements.
 This can be used to simplify a variety of shuffle operations, where the
 elements are fixed zeros.
 
 //===---------------------------------------------------------------------===//
 
-For this:
-
-#include <emmintrin.h>
-void test(__m128d *r, __m128d *A, double B) {
-  *r = _mm_loadl_pd(*A, &B);
-}
-
-We generates:
-
-	subl $12, %esp
-	movsd 24(%esp), %xmm0
-	movsd %xmm0, (%esp)
-	movl 20(%esp), %eax
-	movapd (%eax), %xmm0
-	movlpd (%esp), %xmm0
-	movl 16(%esp), %eax
-	movapd %xmm0, (%eax)
-	addl $12, %esp
-	ret
-
-icc generates:
-
-        movl      4(%esp), %edx                                 #3.6
-        movl      8(%esp), %eax                                 #3.6
-        movapd    (%eax), %xmm0                                 #4.22
-        movlpd    12(%esp), %xmm0                               #4.8
-        movapd    %xmm0, (%edx)                                 #4.3
-        ret                                                     #5.1
-
-So icc is smart enough to know that B is in memory so it doesn't load it and
-store it back to stack.
-
-This should be fixed by eliminating the llvm.x86.sse2.loadl.pd intrinsic, 
-lowering it to a load+insertelement instead.  Already match the load+shuffle 
-as movlpd, so this should be easy.  We already get optimal code for:
-
-define void @test2(<2 x double>* %r, <2 x double>* %A, double %B) {
-entry:
-	%tmp2 = load <2 x double>* %A, align 16
-	%tmp8 = insertelement <2 x double> %tmp2, double %B, i32 0
-	store <2 x double> %tmp8, <2 x double>* %r, align 16
-	ret void
-}
-
-//===---------------------------------------------------------------------===//
-
-__m128d test1( __m128d A, __m128d B) {
-  return _mm_shuffle_pd(A, B, 0x3);
-}
-
-compiles to
-
-shufpd $3, %xmm1, %xmm0
-
-Perhaps it's better to use unpckhpd instead?
-
-unpckhpd %xmm1, %xmm0
-
-Don't know if unpckhpd is faster. But it is shorter.
-
-//===---------------------------------------------------------------------===//
-
 This code generates ugly code, probably due to costs being off or something:
 
 define void @test(float* %P, <4 x float>* %P2 ) {
@@ -505,46 +364,6 @@ nodes which are selected to max / min instructions that are marked commutable.
 
 //===---------------------------------------------------------------------===//
 
-We should compile this:
-#include <xmmintrin.h>
-typedef union {
-  int i[4];
-  float f[4];
-  __m128 v;
-} vector4_t;
-void swizzle (const void *a, vector4_t * b, vector4_t * c) {
-  b->v = _mm_loadl_pi (b->v, (__m64 *) a);
-  c->v = _mm_loadl_pi (c->v, ((__m64 *) a) + 1);
-}
-
-to:
-
-_swizzle:
-        movl    4(%esp), %eax
-        movl    8(%esp), %edx
-        movl    12(%esp), %ecx
-        movlps  (%eax), %xmm0
-        movlps  %xmm0, (%edx)
-        movlps  8(%eax), %xmm0
-        movlps  %xmm0, (%ecx)
-        ret
-
-not:
-
-swizzle:
-        movl 8(%esp), %eax
-        movaps (%eax), %xmm0
-        movl 4(%esp), %ecx
-        movlps (%ecx), %xmm0
-        movaps %xmm0, (%eax)
-        movl 12(%esp), %eax
-        movaps (%eax), %xmm0
-        movlps 8(%ecx), %xmm0
-        movaps %xmm0, (%eax)
-        ret
-
-//===---------------------------------------------------------------------===//
-
 We should materialize vector constants like "all ones" and "signbit" with 
 code like:
 
@@ -631,10 +450,11 @@ eliminates a constant pool load.  For example, consider:
 
 define i64 @ccosf(float %z.0, float %z.1) nounwind readonly  {
 entry:
- %tmp6 = sub float -0.000000e+00, %z.1		; <float> [#uses=1]
+ %tmp6 = fsub float -0.000000e+00, %z.1		; <float> [#uses=1]
  %tmp20 = tail call i64 @ccoshf( float %tmp6, float %z.0 ) nounwind readonly
  ret i64 %tmp20
 }
+declare i64 @ccoshf(float %z.0, float %z.1) nounwind readonly
 
 This currently compiles to:
 
@@ -814,7 +634,7 @@ define <4 x i32> @f(<4 x i32> %i) nounwind  {
 	ret <4 x i32> %A
 }
 
-Compiles into:
+On targets without SSE4.1, this compiles into:
 
 LCPI1_0:					##  <4 x i32>
 	.long	10
@@ -846,6 +666,11 @@ _f:
 	punpckldq	%xmm2, %xmm0
 	ret
 
+It would be better to synthesize integer vector multiplication by constants
+using shifts and adds, pslld and paddd here. And even on targets with SSE4.1,
+simple cases such as multiplication by powers of two would be better as
+vector shifts than as multiplications.
+
 //===---------------------------------------------------------------------===//
 
 We compile this:
@@ -889,3 +714,251 @@ LC0:
 With SSE4, it should be
       movdqa  .LC0(%rip), %xmm0
       pinsrb  $6, %edi, %xmm0
+
+//===---------------------------------------------------------------------===//
+
+We should transform a shuffle of two vectors of constants into a single vector
+of constants. Also, insertelement of a constant into a vector of constants
+should also result in a vector of constants. e.g. 2008-06-25-VecISelBug.ll.
+
+We compiled it to something horrible:
+
+	.align	4
+LCPI1_1:					##  float
+	.long	1065353216	## float 1
+	.const
+
+	.align	4
+LCPI1_0:					##  <4 x float>
+	.space	4
+	.long	1065353216	## float 1
+	.space	4
+	.long	1065353216	## float 1
+	.text
+	.align	4,0x90
+	.globl	_t
+_t:
+	xorps	%xmm0, %xmm0
+	movhps	LCPI1_0, %xmm0
+	movss	LCPI1_1, %xmm1
+	movaps	%xmm0, %xmm2
+	shufps	$2, %xmm1, %xmm2
+	shufps	$132, %xmm2, %xmm0
+	movaps	%xmm0, 0
+
+//===---------------------------------------------------------------------===//
+rdar://5907648
+
+This function:
+
+float foo(unsigned char x) {
+  return x;
+}
+
+compiles to (x86-32):
+
+define float @foo(i8 zeroext  %x) nounwind  {
+	%tmp12 = uitofp i8 %x to float		; <float> [#uses=1]
+	ret float %tmp12
+}
+
+compiles to:
+
+_foo:
+	subl	$4, %esp
+	movzbl	8(%esp), %eax
+	cvtsi2ss	%eax, %xmm0
+	movss	%xmm0, (%esp)
+	flds	(%esp)
+	addl	$4, %esp
+	ret
+
+We should be able to use:
+  cvtsi2ss 8($esp), %xmm0
+since we know the stack slot is already zext'd.
+
+//===---------------------------------------------------------------------===//
+
+Consider using movlps instead of movsd to implement (scalar_to_vector (loadf64))
+when code size is critical. movlps is slower than movsd on core2 but it's one
+byte shorter.
+
+//===---------------------------------------------------------------------===//
+
+We should use a dynamic programming based approach to tell when using FPStack
+operations is cheaper than SSE.  SciMark montecarlo contains code like this
+for example:
+
+double MonteCarlo_num_flops(int Num_samples) {
+    return ((double) Num_samples)* 4.0;
+}
+
+In fpstack mode, this compiles into:
+
+LCPI1_0:					
+	.long	1082130432	## float 4.000000e+00
+_MonteCarlo_num_flops:
+	subl	$4, %esp
+	movl	8(%esp), %eax
+	movl	%eax, (%esp)
+	fildl	(%esp)
+	fmuls	LCPI1_0
+	addl	$4, %esp
+	ret
+        
+in SSE mode, it compiles into significantly slower code:
+
+_MonteCarlo_num_flops:
+	subl	$12, %esp
+	cvtsi2sd	16(%esp), %xmm0
+	mulsd	LCPI1_0, %xmm0
+	movsd	%xmm0, (%esp)
+	fldl	(%esp)
+	addl	$12, %esp
+	ret
+
+There are also other cases in scimark where using fpstack is better, it is
+cheaper to do fld1 than load from a constant pool for example, so
+"load, add 1.0, store" is better done in the fp stack, etc.
+
+//===---------------------------------------------------------------------===//
+
+The X86 backend should be able to if-convert SSE comparisons like "ucomisd" to
+"cmpsd".  For example, this code:
+
+double d1(double x) { return x == x ? x : x + x; }
+
+Compiles into:
+
+_d1:
+	ucomisd	%xmm0, %xmm0
+	jnp	LBB1_2
+	addsd	%xmm0, %xmm0
+	ret
+LBB1_2:
+	ret
+
+Also, the 'ret's should be shared.  This is PR6032.
+
+//===---------------------------------------------------------------------===//
+
+These should compile into the same code (PR6214): Perhaps instcombine should
+canonicalize the former into the later?
+
+define float @foo(float %x) nounwind {
+  %t = bitcast float %x to i32
+  %s = and i32 %t, 2147483647
+  %d = bitcast i32 %s to float
+  ret float %d
+}
+
+declare float @fabsf(float %n)
+define float @bar(float %x) nounwind {
+  %d = call float @fabsf(float %x)
+  ret float %d
+}
+
+//===---------------------------------------------------------------------===//
+
+This IR (from PR6194):
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-darwin10.0.0"
+
+%0 = type { double, double }
+%struct.float3 = type { float, float, float }
+
+define void @test(%0, %struct.float3* nocapture %res) nounwind noinline ssp {
+entry:
+  %tmp18 = extractvalue %0 %0, 0                  ; <double> [#uses=1]
+  %tmp19 = bitcast double %tmp18 to i64           ; <i64> [#uses=1]
+  %tmp20 = zext i64 %tmp19 to i128                ; <i128> [#uses=1]
+  %tmp10 = lshr i128 %tmp20, 32                   ; <i128> [#uses=1]
+  %tmp11 = trunc i128 %tmp10 to i32               ; <i32> [#uses=1]
+  %tmp12 = bitcast i32 %tmp11 to float            ; <float> [#uses=1]
+  %tmp5 = getelementptr inbounds %struct.float3* %res, i64 0, i32 1 ; <float*> [#uses=1]
+  store float %tmp12, float* %tmp5
+  ret void
+}
+
+Compiles to:
+
+_test:                                  ## @test
+	movd	%xmm0, %rax
+	shrq	$32, %rax
+	movl	%eax, 4(%rdi)
+	ret
+
+This would be better kept in the SSE unit by treating XMM0 as a 4xfloat and
+doing a shuffle from v[1] to v[0] then a float store.
+
+//===---------------------------------------------------------------------===//
+
+On SSE4 machines, we compile this code:
+
+define <2 x float> @test2(<2 x float> %Q, <2 x float> %R,
+       <2 x float> *%P) nounwind {
+  %Z = fadd <2 x float> %Q, %R
+
+  store <2 x float> %Z, <2 x float> *%P
+  ret <2 x float> %Z
+}
+
+into:
+
+_test2:                                 ## @test2
+## BB#0:
+	insertps	$0, %xmm2, %xmm2
+	insertps	$16, %xmm3, %xmm2
+	insertps	$0, %xmm0, %xmm3
+	insertps	$16, %xmm1, %xmm3
+	addps	%xmm2, %xmm3
+	movq	%xmm3, (%rdi)
+	movaps	%xmm3, %xmm0
+	pshufd	$1, %xmm3, %xmm1
+                                        ## kill: XMM1<def> XMM1<kill>
+	ret
+
+The insertps's of $0 are pointless complex copies.
+
+//===---------------------------------------------------------------------===//
+
+[UNSAFE FP]
+
+void foo(double, double, double);
+void norm(double x, double y, double z) {
+  double scale = __builtin_sqrt(x*x + y*y + z*z);
+  foo(x/scale, y/scale, z/scale);
+}
+
+We currently generate an sqrtsd and 3 divsd instructions. This is bad, fp div is
+slow and not pipelined. In -ffast-math mode we could compute "1.0/scale" first
+and emit 3 mulsd in place of the divs. This can be done as a target-independent
+transform.
+
+If we're dealing with floats instead of doubles we could even replace the sqrtss
+and inversion with an rsqrtss instruction, which computes 1/sqrt faster at the
+cost of reduced accuracy.
+
+//===---------------------------------------------------------------------===//
+
+This function should be matched to haddpd when the appropriate CPU is enabled:
+
+#include <x86intrin.h>
+double f (__m128d p) {
+  return p[0] + p[1];
+}
+
+similarly, v[0]-v[1] should match to hsubpd, and {v[0]-v[1], w[0]-w[1]} should
+turn into hsubpd also.
+
+//===---------------------------------------------------------------------===//
+
+define <2 x i32> @foo(<2 x double> %in) {
+  %x = fptosi <2 x double> %in to <2 x i32>
+  ret <2 x i32> %x
+}
+
+Should compile into cvttpd2dq instead of being scalarized into 2 cvttsd2si.
+
+//===---------------------------------------------------------------------===//