diff --git a/lib/Target/X86/README.txt b/lib/Target/X86/README.txt
index d4545a6fcfd..e67fab10904 100644
--- a/lib/Target/X86/README.txt
+++ b/lib/Target/X86/README.txt
@@ -895,6 +895,24 @@ compare:
 
 //===---------------------------------------------------------------------===//
 
+Linux is missing some basic tail call support:
+
+#include <math.h>
+double foo(double a) { return sin(a); }
+
+This compiles into the following on x86-64 Linux (but not Darwin):
+foo:
+        subq    $8, %rsp
+        call    sin
+        addq    $8, %rsp
+        ret
+vs:
+
+foo:
+        jmp     sin
+
+//===---------------------------------------------------------------------===//
+
 Tail call optimization improvements: Tail call optimization currently
 pushes all arguments on the top of the stack (their normal place for
 non-tail call optimized calls) that source from the caller's arguments
@@ -1103,57 +1121,6 @@ be folded into:
   shl [mem], 1
 
 //===---------------------------------------------------------------------===//
 
-This testcase misses a read/modify/write opportunity (from PR1425):
-
-void vertical_decompose97iH1(int *b0, int *b1, int *b2, int width){
-    int i;
-    for(i=0; i<width; i++)
-        b1[i] += (b0[i] + b2[i])>>0;
-}
-
-We compile it down to:
-
-LBB1_2: # bb
-        movl    (%esi,%edi,4), %ebx
-        addl    (%ecx,%edi,4), %ebx
-        addl    (%edx,%edi,4), %ebx
-        movl    %ebx, (%ecx,%edi,4)
-        incl    %edi
-        cmpl    %eax, %edi
-        jne     LBB1_2  # bb
-
-the inner loop should add to the memory location (%ecx,%edi,4), saving
-a mov.  Something like:
-
-        movl    (%esi,%edi,4), %ebx
-        addl    (%edx,%edi,4), %ebx
-        addl    %ebx, (%ecx,%edi,4)
-
-Here is another interesting example:
-
-void vertical_compose97iH1(int *b0, int *b1, int *b2, int width){
-    int i;
-    for(i=0; i<width; i++)
-        b1[i] -= (b0[i] + b2[i])>>0;
-}
-
-We miss the r/m/w opportunity here by using 2 subs instead of an add+sub[mem]:
-
-LBB9_2: # bb
-        movl    (%ecx,%edi,4), %ebx
-        subl    (%esi,%edi,4), %ebx
-        subl    (%edx,%edi,4), %ebx
-        movl    %ebx, (%ecx,%edi,4)
-        incl    %edi
-        cmpl    %eax, %edi
-        jne     LBB9_2  # bb
-
-Additionally, LSR should rewrite the exit condition of these loops to use
-a stride-4 IV, which would allow all the scales in the loop to go away.
-This would result in smaller code and more efficient microops.
-
-//===---------------------------------------------------------------------===//
-
 In SSE mode, we turn abs and neg into a load from the constant pool plus a xor
 or and instruction, for example:
 
@@ -1186,13 +1153,6 @@ void test(double *P) {
 
 //===---------------------------------------------------------------------===//
 
-handling llvm.memory.barrier on pre SSE2 cpus
-
-should generate:
-lock ; mov %esp, %esp
-
-//===---------------------------------------------------------------------===//
-
 The generated code on x86 for checking for signed overflow on a multiply the
 obvious way is much longer than it needs to be.
 
@@ -1301,15 +1261,8 @@ FirstOnet:
         xorl    %eax, %eax
         ret
 
-There are a few possible improvements here:
-1. We should be able to eliminate the dead load into %ecx
-2. We could change the "movl 8(%esp), %eax" into
-   "movzwl 10(%esp), %eax"; this lets us change the cmpl
-   into a testl, which is shorter, and eliminate the shift.
-
-We could also in theory eliminate the branch by using a conditional
-for the address of the load, but that seems unlikely to be worthwhile
-in general.
+We could change the "movl 8(%esp), %eax" into "movzwl 10(%esp), %eax"; this
+lets us change the cmpl into a testl, which is shorter, and eliminate the shift.
 
 //===---------------------------------------------------------------------===//
 
@@ -1331,22 +1284,23 @@ bb7:		; preds = %entry
 
 to:
 
-_foo:
+foo:                                    # @foo
+# BB#0:                                 # %entry
+        movl    4(%esp), %ecx
         cmpb    $0, 16(%esp)
-        movl    12(%esp), %ecx
+        je      .LBB0_2
+# BB#1:                                 # %bb
         movl    8(%esp), %eax
-        movl    4(%esp), %edx
-        je      LBB1_2  # bb7
-LBB1_1: # bb
-        addl    %edx, %eax
+        addl    %ecx, %eax
         ret
-LBB1_2: # bb7
-        movl    %edx, %eax
-        subl    %ecx, %eax
+.LBB0_2:                                # %bb7
+        movl    12(%esp), %edx
+        movl    %ecx, %eax
+        subl    %edx, %eax
         ret
 
-The coalescer could coalesce "edx" with "eax" to avoid the movl in LBB1_2
-if it commuted the addl in LBB1_1.
+There's an obviously unnecessary movl in .LBB0_2, and we could eliminate a
+couple more movls by putting 4(%esp) into %eax instead of %ecx.
 
 //===---------------------------------------------------------------------===//
 
@@ -1396,8 +1350,7 @@ Also check why xmm7 is not used at all in the function.
 
 //===---------------------------------------------------------------------===//
 
-Legalize loses track of the fact that bools are always zero extended when in
-memory. This causes us to compile abort_gzip (from 164.gzip) from:
+Take the following IR:
 
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
 target triple = "i386-apple-darwin8"
@@ -1416,16 +1369,15 @@ bb4.i:		; preds = %entry
 }
 declare void @exit(i32) noreturn nounwind
 
-into:
-
-_abort_gzip:
+This compiles into:
+_abort_gzip:                            ## @abort_gzip
+## BB#0:                                ## %entry
         subl    $12, %esp
         movb    _in_exit.4870.b, %al
-        notb    %al
-        testb   $1, %al
-        jne     LBB1_2  ## bb4.i
-LBB1_1: ## bb.i
-        ...
+        cmpb    $1, %al
+        jne     LBB0_2
+
+We somehow miss folding the movb into the cmpb.
 
 //===---------------------------------------------------------------------===//
 
@@ -1929,3 +1881,100 @@ The code produced by gcc is 3 bytes shorter.  This sort of construct often
 shows up with bitfields.
 
 //===---------------------------------------------------------------------===//
+
+Take the following C code:
+int f(int a, int b) { return (unsigned char)a == (unsigned char)b; }
+
+We generate the following IR with clang:
+define i32 @f(i32 %a, i32 %b) nounwind readnone {
+entry:
+  %tmp = xor i32 %b, %a                           ; <i32> [#uses=1]
+  %tmp6 = and i32 %tmp, 255                       ; <i32> [#uses=1]
+  %cmp = icmp eq i32 %tmp6, 0                     ; <i1> [#uses=1]
+  %conv5 = zext i1 %cmp to i32                    ; <i32> [#uses=1]
+  ret i32 %conv5
+}
+
+And the following x86 code:
+        xorl    %esi, %edi
+        testb   $-1, %dil
+        sete    %al
+        movzbl  %al, %eax
+        ret
+
+A cmpb instead of the xorl+testb would be one instruction shorter.
+
+//===---------------------------------------------------------------------===//
+
+Given the following C code:
+int f(int a, int b) { return (signed char)a == (signed char)b; }
+
+We generate the following IR with clang:
+define i32 @f(i32 %a, i32 %b) nounwind readnone {
+entry:
+  %sext = shl i32 %a, 24                          ; <i32> [#uses=1]
+  %conv1 = ashr i32 %sext, 24                     ; <i32> [#uses=1]
+  %sext6 = shl i32 %b, 24                         ; <i32> [#uses=1]
+  %conv4 = ashr i32 %sext6, 24                    ; <i32> [#uses=1]
+  %cmp = icmp eq i32 %conv1, %conv4               ; <i1> [#uses=1]
+  %conv5 = zext i1 %cmp to i32                    ; <i32> [#uses=1]
+  ret i32 %conv5
+}
+
+And the following x86 code:
+        movsbl  %sil, %eax
+        movsbl  %dil, %ecx
+        cmpl    %eax, %ecx
+        sete    %al
+        movzbl  %al, %eax
+        ret
+
+It should be possible to eliminate the sign extensions.
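
A quick way to see that the sign extensions are unnecessary: two ints convert
to equal signed chars exactly when their low bytes are equal, so the
signed-char comparison could use the same cmpb-style sequence as the
unsigned-char case above. A minimal standalone check of that claim (a sketch,
assuming 8-bit chars and the usual truncating int-to-signed-char conversion;
the helper names are just for illustration):

#include <assert.h>

/* What the IR above computes: compare after sign-extending the low bytes. */
static int with_sext(int a, int b) {
  return (signed char)a == (signed char)b;
}

/* What a cmpb would compute: compare just the low 8 bits. */
static int low_byte_only(int a, int b) {
  return (unsigned char)a == (unsigned char)b;
}

int main(void) {
  for (int a = -512; a <= 512; ++a)
    for (int b = -512; b <= 512; ++b)
      assert(with_sext(a, b) == low_byte_only(a, b));
  return 0;
}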
+
+//===---------------------------------------------------------------------===//
+
+LLVM misses a load+store narrowing opportunity in this code:
+
+%struct.bf = type { i64, i16, i16, i32 }
+
+@bfi = external global %struct.bf*                ; <%struct.bf**> [#uses=2]
+
+define void @t1() nounwind ssp {
+entry:
+  %0 = load %struct.bf** @bfi, align 8            ; <%struct.bf*> [#uses=1]
+  %1 = getelementptr %struct.bf* %0, i64 0, i32 1 ; <i16*> [#uses=1]
+  %2 = bitcast i16* %1 to i32*                    ; <i32*> [#uses=2]
+  %3 = load i32* %2, align 1                      ; <i32> [#uses=1]
+  %4 = and i32 %3, -65537                         ; <i32> [#uses=1]
+  store i32 %4, i32* %2, align 1
+  %5 = load %struct.bf** @bfi, align 8            ; <%struct.bf*> [#uses=1]
+  %6 = getelementptr %struct.bf* %5, i64 0, i32 1 ; <i16*> [#uses=1]
+  %7 = bitcast i16* %6 to i32*                    ; <i32*> [#uses=2]
+  %8 = load i32* %7, align 1                      ; <i32> [#uses=1]
+  %9 = and i32 %8, -131073                        ; <i32> [#uses=1]
+  store i32 %9, i32* %7, align 1
+  ret void
+}
+
+LLVM currently emits this:
+
+  movq  bfi(%rip), %rax
+  andl  $-65537, 8(%rax)
+  movq  bfi(%rip), %rax
+  andl  $-131073, 8(%rax)
+  ret
+
+It could narrow the loads and stores to emit this:
+
+  movq  bfi(%rip), %rax
+  andb  $-2, 10(%rax)
+  movq  bfi(%rip), %rax
+  andb  $-3, 10(%rax)
+  ret
+
+The trouble is that there is a TokenFactor between the store and the
+load, making it non-trivial to determine if there's anything between
+the load and the store which would prohibit narrowing.
+
+//===---------------------------------------------------------------------===//
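
To see concretely why the narrowed form in the note above is equivalent to the
wide form, note that bit 16 of the 32-bit word at offset 8 is bit 0 of the
byte at offset 10, and bit 17 is bit 1 of that same byte. A small standalone
check of that bit-level claim (a sketch, assuming the little-endian layout x86
uses; the four-byte buffer below just stands in for the bytes at offset 8):

#include <assert.h>
#include <stdint.h>
#include <string.h>

int main(void) {
  /* Wide form: 32-bit load, andl $-65537 (0xfffeffff), 32-bit store. */
  unsigned char wide[4] = {0xff, 0xff, 0xff, 0xff};
  uint32_t w;
  memcpy(&w, wide, sizeof w);
  w &= 0xfffeffffu;                 /* clears bit 16 only */
  memcpy(wide, &w, sizeof w);

  /* Narrow form: andb $-2 on the byte at offset 10 (index 2 here). */
  unsigned char narrow[4] = {0xff, 0xff, 0xff, 0xff};
  narrow[2] &= 0xfe;                /* clears bit 0 of that byte only */

  assert(memcmp(wide, narrow, sizeof wide) == 0);
  return 0;
}

The bit arithmetic is the easy part; as the note says, the hard part for the
compiler is proving, across the intervening TokenFactor, that nothing between
the load and the store touches those bytes.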