diff --git a/lib/Target/README.txt b/lib/Target/README.txt
index 20f4898b2a4..ef63b9f913f 100644
--- a/lib/Target/README.txt
+++ b/lib/Target/README.txt
@@ -2,6 +2,13 @@ Target Independent Opportunities:
 //===---------------------------------------------------------------------===//
 
+We should make the various targets' "IMPLICIT_DEF" instructions be a single
+target-independent opcode, like TargetInstrInfo::INLINEASM. This would allow
+us to eliminate the TargetInstrDesc::isImplicitDef() method, and would avoid
+having to define it for every target for every register class.
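+
+As a sketch of the payoff (hypothetical code, assuming the new opcode is
+added to the same target-independent enum as INLINEASM), every client could
+then use a plain opcode comparison instead of a per-target hook:
+
+  bool isImplicitDefMI(const MachineInstr &MI) {
+    // One target-independent check replaces TargetInstrDesc::isImplicitDef().
+    return MI.getOpcode() == TargetInstrInfo::IMPLICIT_DEF;
+  }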
+
+//===---------------------------------------------------------------------===//
+
 With the recent changes to make the implicit def/use set explicit in
 machineinstrs, we should change the target descriptions for 'call'
 instructions so that the .td files don't list all the call-clobbered
 registers as implicit
@@ -131,13 +138,6 @@ v4sf example(float *P) {
 //===---------------------------------------------------------------------===//
 
-We should constant fold vector type casts at the LLVM level, regardless of the
-cast. Currently we cannot fold some casts because we don't have TargetData
-information in the constant folder, so we don't know the endianness of the
-target!
-
-//===---------------------------------------------------------------------===//
-
 Add support for conditional increments, and other related patterns.  Instead
 of:
 
@@ -397,30 +397,234 @@ followed by an uncond branch to an exit block.
 ; This testcase is due to tail-duplication not wanting to copy the return
 ; instruction into the terminating blocks because there was other code
 ; optimized out of the function after the taildup happened.
-;RUN: llvm-upgrade < %s | llvm-as | opt -tailcallelim | llvm-dis | not grep call
+; RUN: llvm-as < %s | opt -tailcallelim | llvm-dis | not grep call
 
-int %t4(int %a) {
+define i32 @t4(i32 %a) {
 entry:
-        %tmp.1 = and int %a, 1
-        %tmp.2 = cast int %tmp.1 to bool
-        br bool %tmp.2, label %then.0, label %else.0
-
-then.0:
-        %tmp.5 = add int %a, -1
-        %tmp.3 = call int %t4( int %tmp.5 )
-        br label %return
-
-else.0:
-        %tmp.7 = setne int %a, 0
-        br bool %tmp.7, label %then.1, label %return
-
-then.1:
-        %tmp.11 = add int %a, -2
-        %tmp.9 = call int %t4( int %tmp.11 )
-        br label %return
-
-return:
-        %result.0 = phi int [ 0, %else.0 ], [ %tmp.3, %then.0 ],
+        %tmp.1 = and i32 %a, 1          ; <i32> [#uses=1]
+        %tmp.2 = icmp ne i32 %tmp.1, 0          ; <i1> [#uses=1]
+        br i1 %tmp.2, label %then.0, label %else.0
+
+then.0:         ; preds = %entry
+        %tmp.5 = add i32 %a, -1         ; <i32> [#uses=1]
+        %tmp.3 = call i32 @t4( i32 %tmp.5 )             ; <i32> [#uses=1]
+        br label %return
+
+else.0:         ; preds = %entry
+        %tmp.7 = icmp ne i32 %a, 0              ; <i1> [#uses=1]
+        br i1 %tmp.7, label %then.1, label %return
+
+then.1:         ; preds = %else.0
+        %tmp.11 = add i32 %a, -2                ; <i32> [#uses=1]
+        %tmp.9 = call i32 @t4( i32 %tmp.11 )            ; <i32> [#uses=1]
+        br label %return
+
+return:         ; preds = %then.1, %else.0, %then.0
+        %result.0 = phi i32 [ 0, %else.0 ], [ %tmp.3, %then.0 ],
                         [ %tmp.9, %then.1 ]
-        ret int %result.0
+        ret i32 %result.0
+}
+
+//===---------------------------------------------------------------------===//
+
+Tail recursion elimination is not transforming this function because it is
+returning n, which fails the isDynamicConstant check in the accumulator
+recursion checks.
+
+long long fib(const long long n) {
+  switch(n) {
+    case 0:
+    case 1:
+      return n;
+    default:
+      return fib(n-1) + fib(n-2);
+  }
+}
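+
+For reference, here is the accumulator transform applied by hand (a sketch;
+fib_acc is a made-up helper, not compiler output). The invariant is
+fib_acc(n, acc) == fib(n) + acc, so "return n" becomes an accumulator update
+and the fib(n-1) call in tail position becomes the loop:
+
+long long fib_acc(long long n, long long acc) {
+  for (;;) {
+    switch(n) {
+      case 0:
+      case 1:
+        return n + acc;           // base case folds into the accumulator
+      default:
+        acc += fib_acc(n-2, 0);   // the non-tail call stays a real call
+        n -= 1;                   // the tail call becomes iteration
+    }
+  }
+}
+
+long long fib2(long long n) { return fib_acc(n, 0); }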
+
+//===---------------------------------------------------------------------===//
+
+Argument promotion should promote arguments for recursive functions, like
+this:
+
+; RUN: llvm-as < %s | opt -argpromotion | llvm-dis | grep x.val
+
+define internal i32 @foo(i32* %x) {
+entry:
+        %tmp = load i32* %x             ; <i32> [#uses=0]
+        %tmp.foo = call i32 @foo( i32* %x )             ; <i32> [#uses=1]
+        ret i32 %tmp.foo
+}
+
+define i32 @bar(i32* %x) {
+entry:
+        %tmp3 = call i32 @foo( i32* %x )                ; <i32> [#uses=1]
+        ret i32 %tmp3
+}
+
+//===---------------------------------------------------------------------===//
+
+"basicaa" should know how to look through "or" instructions that act like
+"add" instructions. For example, in this code the x*4+1 is turned into
+x*4 | 1, and basicaa can't analyze the array subscript, leading to
+duplicated loads in the generated code:
+
+void test(int X, int Y, int a[]) {
+  int i;
+  for (i = 2; i < 1000; i += 4) {
+    a[i+0] = a[i-1+0]*a[i-2+0];
+    a[i+1] = a[i-1+1]*a[i-2+1];
+    a[i+2] = a[i-1+2]*a[i-2+2];
+    a[i+3] = a[i-1+3]*a[i-2+3];
+  }
+}
+
+//===---------------------------------------------------------------------===//
+
+We should investigate an instruction sinking pass. Consider this silly
+example in pic mode:
+
+#include <assert.h>
+void foo(int x) {
+  assert(x);
+  //...
+}
+
+we compile this to:
+
+_foo:
+        subl    $28, %esp
+        call    "L1$pb"
+"L1$pb":
+        popl    %eax
+        cmpl    $0, 32(%esp)
+        je      LBB1_2  # cond_true
+LBB1_1: # return
+        # ...
+        addl    $28, %esp
+        ret
+LBB1_2: # cond_true
+...
+
+The PIC base computation (call+popl) is only used on one path through the
+code, but is currently always computed in the entry block. It would be
+better to sink the picbase computation down into the block for the
+assertion, as that is the only block that uses it. This happens for a lot
+of code with early outs.
+
+Another example is loads of arguments, which are usually emitted into the
+entry block on targets like x86. If they are not used on all paths through
+a function, they should be sunk into the paths that actually use them.
+
+In this case, whole-function-isel would also handle this.
+
+//===---------------------------------------------------------------------===//
+
+Investigate lowering of sparse switch statements into perfect hash tables:
+http://burtleburtle.net/bob/hash/perfect.html
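+
+As a sketch of the lowering (hypothetical case values; for a real case set
+the hash function and tables would come out of a perfect-hash generator like
+the one described at the URL above), the switch becomes a hash, a table
+lookup, and one guarding compare:
+
+int lookup(unsigned x) {
+  /* switch (x) { case 7: ... case 38: ... case 100: ... case 1001: ... } */
+  static const unsigned key[4]  = { 100, 1001, 38, 7 };  /* key i hashes to i */
+  static const int      body[4] = { 1, 2, 3, 4 };        /* per-case results */
+  unsigned slot = x & 3;     /* happens to be a perfect hash for these keys */
+  if (key[slot] == x)        /* single compare rejects all non-case values */
+    return body[slot];
+  return 0;                  /* default */
+}
+
+In a real lowering the table entries would be jump targets rather than
+return values, but the shape is the same.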
+
+//===---------------------------------------------------------------------===//
+
+We should turn things like "load+fabs+store" and "load+fneg+store" into the
+corresponding integer operations. On a Yonah, this loop:
+
+double a[256];
+void foo() {
+  int i, b;
+  for (b = 0; b < 10000000; b++)
+    for (i = 0; i < 256; i++)
+      a[i] = -a[i];
+}
+
+is twice as slow as this loop:
+
+long long a[256];
+void foo() {
+  int i, b;
+  for (b = 0; b < 10000000; b++)
+    for (i = 0; i < 256; i++)
+      a[i] ^= (1ULL << 63);
+}
+
+and I suspect other processors are similar. On X86 in particular this is a
+big win because doing this with integers allows the use of read/modify/write
+instructions.
+
+//===---------------------------------------------------------------------===//
+
+DAG Combiner should try to combine small loads into larger loads when
+profitable. For example, we compile this C++ example:
+
+struct THotKey { short Key; bool Control; bool Shift; bool Alt; };
+extern THotKey m_HotKey;
+THotKey GetHotKey () { return m_HotKey; }
+
+into (-O3 -fno-exceptions -static -fomit-frame-pointer):
+
+__Z9GetHotKeyv:
+        pushl   %esi
+        movl    8(%esp), %eax
+        movb    _m_HotKey+3, %cl
+        movb    _m_HotKey+4, %dl
+        movb    _m_HotKey+2, %ch
+        movw    _m_HotKey, %si
+        movw    %si, (%eax)
+        movb    %ch, 2(%eax)
+        movb    %cl, 3(%eax)
+        movb    %dl, 4(%eax)
+        popl    %esi
+        ret     $4
+
+GCC produces:
+
+__Z9GetHotKeyv:
+        movl    _m_HotKey, %edx
+        movl    4(%esp), %eax
+        movl    %edx, (%eax)
+        movzwl  _m_HotKey+4, %edx
+        movw    %dx, 4(%eax)
+        ret     $4
+
+The LLVM IR contains the needed alignment info, so we should be able to
+merge the loads and stores into 4-byte loads:
+
+        %struct.THotKey = type { i16, i8, i8, i8 }
+define void @_Z9GetHotKeyv(%struct.THotKey* sret %agg.result) nounwind {
+...
+        %tmp2 = load i16* getelementptr (@m_HotKey, i32 0, i32 0), align 8
+        %tmp5 = load i8* getelementptr (@m_HotKey, i32 0, i32 1), align 2
+        %tmp8 = load i8* getelementptr (@m_HotKey, i32 0, i32 2), align 1
+        %tmp11 = load i8* getelementptr (@m_HotKey, i32 0, i32 3), align 2
+
+Alternatively, we should use a small amount of base-offset alias analysis
+so that the scheduler doesn't need to hold all the loads in registers at
+once.
+
+//===---------------------------------------------------------------------===//
+
+We should extend parameter attributes to capture more information about
+pointer parameters for alias analysis. Some ideas:
+
+1. Add a "nocapture" attribute, which indicates that the callee does not
+   store the address of the parameter into a global or any other memory
+   location visible to the caller. This can be used to make basicaa and
+   other analyses more powerful. It is true for things like memcpy and
+   strcat, for structs passed by value, most C++ references, etc.
+2. Generalize readonly to be set on parameters. This provides mod/ref
+   info for the function that basicaa and others can use. It can also be
+   used by the inliner to avoid inserting a memcpy for byval arguments
+   when the function is inlined.
+
+These attributes can be inferred by various analysis passes such as the
+globalsmodrefaa pass. Note that getting #2 right is actually really tricky.
+Consider this code:
+
+struct S;  S G;
+void caller(S byvalarg) { G.field = 1; ... }
+void callee() { caller(G); }
+
+The fact that the caller does not modify the byval argument is not enough;
+we need to know that it doesn't modify G either. This is very tricky.
+
+//===---------------------------------------------------------------------===//
+
+We should add an FRINT node to the DAG to model targets that have legal
+implementations of ceil/floor/rint.
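+
+As a trivial example of what this would improve (illustrative code, not from
+the source), each rint call below currently ends up as a libcall; with an
+FRINT node, targets with a legal rounding instruction could select it
+directly:
+
+#include <math.h>
+void round_all(double *a, int n) {
+  int i;
+  for (i = 0; i < n; i++)
+    a[i] = rint(a[i]);   /* would match a target's FRINT pattern */
+}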