X-Git-Url: http://demsky.eecs.uci.edu/git/?a=blobdiff_plain;f=lib%2FTarget%2FREADME.txt;h=c64d7e0e6e61d9093f667aabd9aa5617f9603c48;hb=fd9d976d74ef0c0276c97e303066afc935e9b908;hp=623c86f3555f761e0165d0341a10e1274c6281cc;hpb=65844fbd8496cc7981758f61a6699cd07bfe32c7;p=oota-llvm.git diff --git a/lib/Target/README.txt b/lib/Target/README.txt index 623c86f3555..c64d7e0e6e6 100644 --- a/lib/Target/README.txt +++ b/lib/Target/README.txt @@ -397,32 +397,32 @@ followed by an uncond branch to an exit block. ; This testcase is due to tail-duplication not wanting to copy the return ; instruction into the terminating blocks because there was other code ; optimized out of the function after the taildup happened. -;RUN: llvm-upgrade < %s | llvm-as | opt -tailcallelim | llvm-dis | not grep call +; RUN: llvm-as < %s | opt -tailcallelim | llvm-dis | not grep call -int %t4(int %a) { +define i32 @t4(i32 %a) { entry: - %tmp.1 = and int %a, 1 - %tmp.2 = cast int %tmp.1 to bool - br bool %tmp.2, label %then.0, label %else.0 - -then.0: - %tmp.5 = add int %a, -1 - %tmp.3 = call int %t4( int %tmp.5 ) - br label %return - -else.0: - %tmp.7 = setne int %a, 0 - br bool %tmp.7, label %then.1, label %return - -then.1: - %tmp.11 = add int %a, -2 - %tmp.9 = call int %t4( int %tmp.11 ) - br label %return - -return: - %result.0 = phi int [ 0, %else.0 ], [ %tmp.3, %then.0 ], + %tmp.1 = and i32 %a, 1 ; [#uses=1] + %tmp.2 = icmp ne i32 %tmp.1, 0 ; [#uses=1] + br i1 %tmp.2, label %then.0, label %else.0 + +then.0: ; preds = %entry + %tmp.5 = add i32 %a, -1 ; [#uses=1] + %tmp.3 = call i32 @t4( i32 %tmp.5 ) ; [#uses=1] + br label %return + +else.0: ; preds = %entry + %tmp.7 = icmp ne i32 %a, 0 ; [#uses=1] + br i1 %tmp.7, label %then.1, label %return + +then.1: ; preds = %else.0 + %tmp.11 = add i32 %a, -2 ; [#uses=1] + %tmp.9 = call i32 @t4( i32 %tmp.11 ) ; [#uses=1] + br label %return + +return: ; preds = %then.1, %else.0, %then.0 + %result.0 = phi i32 [ 0, %else.0 ], [ %tmp.3, %then.0 ], [ %tmp.9, %then.1 ] - ret int %result.0 + ret i32 %result.0 } //===---------------------------------------------------------------------===// @@ -446,21 +446,19 @@ long long fib(const long long n) { Argument promotion should promote arguments for recursive functions, like this: -; RUN: llvm-upgrade < %s | llvm-as | opt -argpromotion | llvm-dis | grep x.val +; RUN: llvm-as < %s | opt -argpromotion | llvm-dis | grep x.val -implementation ; Functions: - -internal int %foo(int* %x) { +define internal i32 @foo(i32* %x) { entry: - %tmp = load int* %x - %tmp.foo = call int %foo(int *%x) - ret int %tmp.foo + %tmp = load i32* %x ; [#uses=0] + %tmp.foo = call i32 @foo( i32* %x ) ; [#uses=1] + ret i32 %tmp.foo } -int %bar(int* %x) { +define i32 @bar(i32* %x) { entry: - %tmp3 = call int %foo( int* %x) ; [#uses=1] - ret int %tmp3 + %tmp3 = call i32 @foo( i32* %x ) ; [#uses=1] + ret i32 %tmp3 } //===---------------------------------------------------------------------===// @@ -529,16 +527,22 @@ We should turn things like "load+fabs+store" and "load+fneg+store" into the corresponding integer operations. 
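(A rough sketch of the idea, purely illustrative and not how the optimizer is
actually written; the helper names below are made up.  On IEEE-754 doubles,
fneg is just an xor of the sign bit and fabs is an and that clears it, so the
whole load/op/store sequence can stay in the integer domain.)

#include <stdint.h>
#include <string.h>

/* Illustrative only: fneg/fabs on a double done as integer sign-bit ops. */
double fneg_as_int(double x) {
  uint64_t bits;
  memcpy(&bits, &x, sizeof bits);   /* reinterpret the double's bits */
  bits ^= 1ULL << 63;               /* toggle the sign bit == fneg */
  memcpy(&x, &bits, sizeof x);
  return x;
}

double fabs_as_int(double x) {
  uint64_t bits;
  memcpy(&bits, &x, sizeof bits);
  bits &= ~(1ULL << 63);            /* clear the sign bit == fabs */
  memcpy(&x, &bits, sizeof x);
  return x;
}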
On a yonah, this loop: double a[256]; - for (b = 0; b < 10000000; b++) - for (i = 0; i < 256; i++) - a[i] = -a[i]; +void foo() { + int i, b; + for (b = 0; b < 10000000; b++) + for (i = 0; i < 256; i++) + a[i] = -a[i]; +} is twice as slow as this loop: long long a[256]; - for (b = 0; b < 10000000; b++) - for (i = 0; i < 256; i++) - a[i] ^= (1ULL << 63); +void foo() { + int i, b; + for (b = 0; b < 10000000; b++) + for (i = 0; i < 256; i++) + a[i] ^= (1ULL << 63); +} and I suspect other processors are similar. On X86 in particular this is a big win because doing this with integers allows the use of read/modify/write @@ -621,3 +625,158 @@ The fact that the caller does not modify byval arg is not enough, we need to know that it doesn't modify G either. This is very tricky. //===---------------------------------------------------------------------===// + +We should add an FRINT node to the DAG to model targets that have legal +implementations of ceil/floor/rint. + +//===---------------------------------------------------------------------===// + +This GCC bug: http://gcc.gnu.org/bugzilla/show_bug.cgi?id=34043 +contains a testcase that compiles down to: + + %struct.XMM128 = type { <4 x float> } +.. + %src = alloca %struct.XMM128 +.. + %tmp6263 = bitcast %struct.XMM128* %src to <2 x i64>* + %tmp65 = getelementptr %struct.XMM128* %src, i32 0, i32 0 + store <2 x i64> %tmp5899, <2 x i64>* %tmp6263, align 16 + %tmp66 = load <4 x float>* %tmp65, align 16 + %tmp71 = add <4 x float> %tmp66, %tmp66 + +If the mid-level optimizer turned the bitcast of pointer + store of tmp5899 +into a bitcast of the vector value and a store to the pointer, then the +store->load could be easily removed. + +//===---------------------------------------------------------------------===// + +Consider: + +int test() { + long long input[8] = {1,1,1,1,1,1,1,1}; + foo(input); +} + +We currently compile this into a memcpy from a global array since the +initializer is fairly large and not memset'able. This is good, but the memcpy +gets lowered to load/stores in the code generator. This is also ok, except +that the codegen lowering for memcpy doesn't handle the case when the source +is a constant global. This gives us atrocious code like this: + + call "L1$pb" +"L1$pb": + popl %eax + movl _C.0.1444-"L1$pb"+32(%eax), %ecx + movl %ecx, 40(%esp) + movl _C.0.1444-"L1$pb"+20(%eax), %ecx + movl %ecx, 28(%esp) + movl _C.0.1444-"L1$pb"+36(%eax), %ecx + movl %ecx, 44(%esp) + movl _C.0.1444-"L1$pb"+44(%eax), %ecx + movl %ecx, 52(%esp) + movl _C.0.1444-"L1$pb"+40(%eax), %ecx + movl %ecx, 48(%esp) + movl _C.0.1444-"L1$pb"+12(%eax), %ecx + movl %ecx, 20(%esp) + movl _C.0.1444-"L1$pb"+4(%eax), %ecx +... + +instead of: + movl $1, 16(%esp) + movl $0, 20(%esp) + movl $1, 24(%esp) + movl $0, 28(%esp) + movl $1, 32(%esp) + movl $0, 36(%esp) + ... + +//===---------------------------------------------------------------------===// + +http://llvm.org/PR717: + +The following code should compile into "ret int undef". Instead, LLVM +produces "ret int 0": + +int f() { + int x = 4; + int y; + if (x == 3) y = 0; + return y; +} + +//===---------------------------------------------------------------------===// + +The loop unroller should partially unroll loops (instead of peeling them) +when code growth isn't too bad and when an unroll count allows simplification +of some code within the loop. 
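(As a sketch of the difference, not the unroller's actual output: peeling
copies the first iteration(s) out in front of the loop, while partial
unrolling keeps a single loop whose body runs several consecutive iterations,
which is what lets per-iteration tests like the '&1' in the example below
fold away.)

/* Hypothetical loop, partially unrolled by 2; assume the trip count is even. */
void unroll_by_2_sketch(int *a, int n) {
  int i;
  for (i = 0; i < n; i += 2) {
    a[i]   *= 3;    /* iteration i   */
    a[i+1] *= 3;    /* iteration i+1 */
  }
}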
One trivial example is: + +#include +int main() { + int nRet = 17; + int nLoop; + for ( nLoop = 0; nLoop < 1000; nLoop++ ) { + if ( nLoop & 1 ) + nRet += 2; + else + nRet -= 1; + } + return nRet; +} + +Unrolling by 2 would eliminate the '&1' in both copies, leading to a net +reduction in code size. The resultant code would then also be suitable for +exit value computation. + +//===---------------------------------------------------------------------===// + +We miss a bunch of rotate opportunities on various targets, including ppc, x86, +etc. On X86, we miss a bunch of 'rotate by variable' cases because the rotate +matching code in dag combine doesn't look through truncates aggressively +enough. Here are some testcases reduces from GCC PR17886: + +unsigned long long f(unsigned long long x, int y) { + return (x << y) | (x >> 64-y); +} +unsigned f2(unsigned x, int y){ + return (x << y) | (x >> 32-y); +} +unsigned long long f3(unsigned long long x){ + int y = 9; + return (x << y) | (x >> 64-y); +} +unsigned f4(unsigned x){ + int y = 10; + return (x << y) | (x >> 32-y); +} +unsigned long long f5(unsigned long long x, unsigned long long y) { + return (x << 8) | ((y >> 48) & 0xffull); +} +unsigned long long f6(unsigned long long x, unsigned long long y, int z) { + switch(z) { + case 1: + return (x << 8) | ((y >> 48) & 0xffull); + case 2: + return (x << 16) | ((y >> 40) & 0xffffull); + case 3: + return (x << 24) | ((y >> 32) & 0xffffffull); + case 4: + return (x << 32) | ((y >> 24) & 0xffffffffull); + default: + return (x << 40) | ((y >> 16) & 0xffffffffffull); + } +} + +On X86-64, we only handle f3/f4 right. On x86-32, several of these +generate truly horrible code, instead of using shld and friends. On +ARM, we end up with calls to L___lshrdi3/L___ashldi3 in f, which is +badness. PPC64 misses f, f5 and f6. CellSPU aborts in isel. + +//===---------------------------------------------------------------------===// + +We do a number of simplifications in simplify libcalls to strength reduce +standard library functions, but we don't currently merge them together. For +example, it is useful to merge memcpy(a,b,strlen(b)) -> strcpy. This can only +be done safely if "b" isn't modified between the strlen and memcpy of course. + +//===---------------------------------------------------------------------===// +
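(To make the memcpy/strlen example above concrete, here is a source-level
sketch of the intended merge.  This is illustrative only and the function name
is made up; note that the copy has to include the nul terminator for strcpy to
be an exact replacement.)

#include <string.h>

void copy_string(char *a, const char *b) {
  /* after the individual libcall simplifications: */
  memcpy(a, b, strlen(b) + 1);      /* +1 copies the terminating nul too */

  /* what a merged simplification could produce, provided nothing can
     modify b between the strlen and the memcpy: */
  /* strcpy(a, b); */
}

//===---------------------------------------------------------------------===//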