X-Git-Url: http://demsky.eecs.uci.edu/git/?a=blobdiff_plain;f=lib%2FTarget%2FREADME.txt;h=c64d7e0e6e61d9093f667aabd9aa5617f9603c48;hb=fd9d976d74ef0c0276c97e303066afc935e9b908;hp=623c86f3555f761e0165d0341a10e1274c6281cc;hpb=65844fbd8496cc7981758f61a6699cd07bfe32c7;p=oota-llvm.git diff --git a/lib/Target/README.txt b/lib/Target/README.txt index 623c86f3555..c64d7e0e6e6 100644 --- a/lib/Target/README.txt +++ b/lib/Target/README.txt @@ -397,32 +397,32 @@ followed by an uncond branch to an exit block. ; This testcase is due to tail-duplication not wanting to copy the return ; instruction into the terminating blocks because there was other code ; optimized out of the function after the taildup happened. -;RUN: llvm-upgrade < %s | llvm-as | opt -tailcallelim | llvm-dis | not grep call +; RUN: llvm-as < %s | opt -tailcallelim | llvm-dis | not grep call -int %t4(int %a) { +define i32 @t4(i32 %a) { entry: - %tmp.1 = and int %a, 1 - %tmp.2 = cast int %tmp.1 to bool - br bool %tmp.2, label %then.0, label %else.0 - -then.0: - %tmp.5 = add int %a, -1 - %tmp.3 = call int %t4( int %tmp.5 ) - br label %return - -else.0: - %tmp.7 = setne int %a, 0 - br bool %tmp.7, label %then.1, label %return - -then.1: - %tmp.11 = add int %a, -2 - %tmp.9 = call int %t4( int %tmp.11 ) - br label %return - -return: - %result.0 = phi int [ 0, %else.0 ], [ %tmp.3, %then.0 ], + %tmp.1 = and i32 %a, 1 ; [#uses=1] + %tmp.2 = icmp ne i32 %tmp.1, 0 ; [#uses=1] + br i1 %tmp.2, label %then.0, label %else.0 + +then.0: ; preds = %entry + %tmp.5 = add i32 %a, -1 ; [#uses=1] + %tmp.3 = call i32 @t4( i32 %tmp.5 ) ; [#uses=1] + br label %return + +else.0: ; preds = %entry + %tmp.7 = icmp ne i32 %a, 0 ; [#uses=1] + br i1 %tmp.7, label %then.1, label %return + +then.1: ; preds = %else.0 + %tmp.11 = add i32 %a, -2 ; [#uses=1] + %tmp.9 = call i32 @t4( i32 %tmp.11 ) ; [#uses=1] + br label %return + +return: ; preds = %then.1, %else.0, %then.0 + %result.0 = phi i32 [ 0, %else.0 ], [ %tmp.3, %then.0 ], [ %tmp.9, %then.1 ] - ret int %result.0 + ret i32 %result.0 } //===---------------------------------------------------------------------===// @@ -446,21 +446,19 @@ long long fib(const long long n) { Argument promotion should promote arguments for recursive functions, like this: -; RUN: llvm-upgrade < %s | llvm-as | opt -argpromotion | llvm-dis | grep x.val +; RUN: llvm-as < %s | opt -argpromotion | llvm-dis | grep x.val -implementation ; Functions: - -internal int %foo(int* %x) { +define internal i32 @foo(i32* %x) { entry: - %tmp = load int* %x - %tmp.foo = call int %foo(int *%x) - ret int %tmp.foo + %tmp = load i32* %x ; [#uses=0] + %tmp.foo = call i32 @foo( i32* %x ) ; [#uses=1] + ret i32 %tmp.foo } -int %bar(int* %x) { +define i32 @bar(i32* %x) { entry: - %tmp3 = call int %foo( int* %x) ; [#uses=1] - ret int %tmp3 + %tmp3 = call i32 @foo( i32* %x ) ; [#uses=1] + ret i32 %tmp3 } //===---------------------------------------------------------------------===// @@ -529,16 +527,22 @@ We should turn things like "load+fabs+store" and "load+fneg+store" into the corresponding integer operations. 
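(A rough sketch of the idea, purely illustrative and not how the optimizer is
actually written; the helper names below are made up.  On IEEE-754 doubles,
fneg is just an xor of the sign bit and fabs is an and that clears it, so the
whole load/op/store sequence can stay in the integer domain.)

#include <stdint.h>
#include <string.h>

/* Illustrative only: fneg/fabs on a double done as integer sign-bit ops. */
double fneg_as_int(double x) {
  uint64_t bits;
  memcpy(&bits, &x, sizeof bits);   /* reinterpret the double's bits */
  bits ^= 1ULL << 63;               /* toggle the sign bit == fneg */
  memcpy(&x, &bits, sizeof x);
  return x;
}

double fabs_as_int(double x) {
  uint64_t bits;
  memcpy(&bits, &x, sizeof bits);
  bits &= ~(1ULL << 63);            /* clear the sign bit == fabs */
  memcpy(&x, &bits, sizeof x);
  return x;
}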
On a yonah, this loop: double a[256]; - for (b = 0; b < 10000000; b++) - for (i = 0; i < 256; i++) - a[i] = -a[i]; +void foo() { + int i, b; + for (b = 0; b < 10000000; b++) + for (i = 0; i < 256; i++) + a[i] = -a[i]; +} is twice as slow as this loop: long long a[256]; - for (b = 0; b < 10000000; b++) - for (i = 0; i < 256; i++) - a[i] ^= (1ULL << 63); +void foo() { + int i, b; + for (b = 0; b < 10000000; b++) + for (i = 0; i < 256; i++) + a[i] ^= (1ULL << 63); +} and I suspect other processors are similar. On X86 in particular this is a big win because doing this with integers allows the use of read/modify/write @@ -621,3 +625,158 @@ The fact that the caller does not modify byval arg is not enough, we need to know that it doesn't modify G either. This is very tricky. //===---------------------------------------------------------------------===// + +We should add an FRINT node to the DAG to model targets that have legal +implementations of ceil/floor/rint. + +//===---------------------------------------------------------------------===// + +This GCC bug: http://gcc.gnu.org/bugzilla/show_bug.cgi?id=34043 +contains a testcase that compiles down to: + + %struct.XMM128 = type { <4 x float> } +.. + %src = alloca %struct.XMM128 +.. + %tmp6263 = bitcast %struct.XMM128* %src to <2 x i64>* + %tmp65 = getelementptr %struct.XMM128* %src, i32 0, i32 0 + store <2 x i64> %tmp5899, <2 x i64>* %tmp6263, align 16 + %tmp66 = load <4 x float>* %tmp65, align 16 + %tmp71 = add <4 x float> %tmp66, %tmp66 + +If the mid-level optimizer turned the bitcast of pointer + store of tmp5899 +into a bitcast of the vector value and a store to the pointer, then the +store->load could be easily removed. + +//===---------------------------------------------------------------------===// + +Consider: + +int test() { + long long input[8] = {1,1,1,1,1,1,1,1}; + foo(input); +} + +We currently compile this into a memcpy from a global array since the +initializer is fairly large and not memset'able. This is good, but the memcpy +gets lowered to load/stores in the code generator. This is also ok, except +that the codegen lowering for memcpy doesn't handle the case when the source +is a constant global. This gives us atrocious code like this: + + call "L1$pb" +"L1$pb": + popl %eax + movl _C.0.1444-"L1$pb"+32(%eax), %ecx + movl %ecx, 40(%esp) + movl _C.0.1444-"L1$pb"+20(%eax), %ecx + movl %ecx, 28(%esp) + movl _C.0.1444-"L1$pb"+36(%eax), %ecx + movl %ecx, 44(%esp) + movl _C.0.1444-"L1$pb"+44(%eax), %ecx + movl %ecx, 52(%esp) + movl _C.0.1444-"L1$pb"+40(%eax), %ecx + movl %ecx, 48(%esp) + movl _C.0.1444-"L1$pb"+12(%eax), %ecx + movl %ecx, 20(%esp) + movl _C.0.1444-"L1$pb"+4(%eax), %ecx +... + +instead of: + movl $1, 16(%esp) + movl $0, 20(%esp) + movl $1, 24(%esp) + movl $0, 28(%esp) + movl $1, 32(%esp) + movl $0, 36(%esp) + ... + +//===---------------------------------------------------------------------===// + +http://llvm.org/PR717: + +The following code should compile into "ret int undef". Instead, LLVM +produces "ret int 0": + +int f() { + int x = 4; + int y; + if (x == 3) y = 0; + return y; +} + +//===---------------------------------------------------------------------===// + +The loop unroller should partially unroll loops (instead of peeling them) +when code growth isn't too bad and when an unroll count allows simplification +of some code within the loop. 
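(As a sketch of the difference, not the unroller's actual output: peeling
copies the first iteration(s) out in front of the loop, while partial
unrolling keeps a single loop whose body runs several consecutive iterations,
which is what lets per-iteration tests like the '&1' in the example below
fold away.)

/* Hypothetical loop, partially unrolled by 2; assume the trip count is even. */
void unroll_by_2_sketch(int *a, int n) {
  int i;
  for (i = 0; i < n; i += 2) {
    a[i]   *= 3;    /* iteration i   */
    a[i+1] *= 3;    /* iteration i+1 */
  }
}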
One trivial example is: + +#include +int main() { + int nRet = 17; + int nLoop; + for ( nLoop = 0; nLoop < 1000; nLoop++ ) { + if ( nLoop & 1 ) + nRet += 2; + else + nRet -= 1; + } + return nRet; +} + +Unrolling by 2 would eliminate the '&1' in both copies, leading to a net +reduction in code size. The resultant code would then also be suitable for +exit value computation. + +//===---------------------------------------------------------------------===// + +We miss a bunch of rotate opportunities on various targets, including ppc, x86, +etc. On X86, we miss a bunch of 'rotate by variable' cases because the rotate +matching code in dag combine doesn't look through truncates aggressively +enough. Here are some testcases reduces from GCC PR17886: + +unsigned long long f(unsigned long long x, int y) { + return (x << y) | (x >> 64-y); +} +unsigned f2(unsigned x, int y){ + return (x << y) | (x >> 32-y); +} +unsigned long long f3(unsigned long long x){ + int y = 9; + return (x << y) | (x >> 64-y); +} +unsigned f4(unsigned x){ + int y = 10; + return (x << y) | (x >> 32-y); +} +unsigned long long f5(unsigned long long x, unsigned long long y) { + return (x << 8) | ((y >> 48) & 0xffull); +} +unsigned long long f6(unsigned long long x, unsigned long long y, int z) { + switch(z) { + case 1: + return (x << 8) | ((y >> 48) & 0xffull); + case 2: + return (x << 16) | ((y >> 40) & 0xffffull); + case 3: + return (x << 24) | ((y >> 32) & 0xffffffull); + case 4: + return (x << 32) | ((y >> 24) & 0xffffffffull); + default: + return (x << 40) | ((y >> 16) & 0xffffffffffull); + } +} + +On X86-64, we only handle f3/f4 right. On x86-32, several of these +generate truly horrible code, instead of using shld and friends. On +ARM, we end up with calls to L___lshrdi3/L___ashldi3 in f, which is +badness. PPC64 misses f, f5 and f6. CellSPU aborts in isel. + +//===---------------------------------------------------------------------===// + +We do a number of simplifications in simplify libcalls to strength reduce +standard library functions, but we don't currently merge them together. For +example, it is useful to merge memcpy(a,b,strlen(b)) -> strcpy. This can only +be done safely if "b" isn't modified between the strlen and memcpy of course. + +//===---------------------------------------------------------------------===// +
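(To make the memcpy/strlen example above concrete, here is a source-level
sketch of the intended merge.  This is illustrative only and the function name
is made up; note that the copy has to include the nul terminator for strcpy to
be an exact replacement.)

#include <string.h>

void copy_string(char *a, const char *b) {
  /* after the individual libcall simplifications: */
  memcpy(a, b, strlen(b) + 1);      /* +1 copies the terminating nul too */

  /* what a merged simplification could produce, provided nothing can
     modify b between the strlen and the memcpy: */
  /* strcpy(a, b); */
}

//===---------------------------------------------------------------------===//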