X-Git-Url: http://demsky.eecs.uci.edu/git/?a=blobdiff_plain;f=lib%2FTarget%2FREADME.txt;h=97546c4db595db64db3b9a49bec8058a860a2a26;hb=a8c58d572b294c90bde4b78a0121d73db78590db;hp=4e4668698ad2b09fc44ed2990843ac56a6fe8b91;hpb=9fee08fce78a9fedc7c7c1c090bcad5f92f7113e;p=oota-llvm.git diff --git a/lib/Target/README.txt b/lib/Target/README.txt index 4e4668698ad..97546c4db59 100644 --- a/lib/Target/README.txt +++ b/lib/Target/README.txt @@ -125,8 +125,7 @@ specific vector types are target dependent. //===---------------------------------------------------------------------===// -We should add 'unaligned load/store' nodes, and produce them from code like -this: +We should produce an unaligned load from code like this: v4sf example(float *P) { return (v4sf){P[0], P[1], P[2], P[3] }; @@ -167,52 +166,14 @@ if anyone cared enough about sincos. //===---------------------------------------------------------------------===// -Scalar Repl cannot currently promote this testcase to 'ret long cst': - - %struct.X = type { i32, i32 } - %struct.Y = type { %struct.X } - -define i64 @bar() { - %retval = alloca %struct.Y, align 8 - %tmp12 = getelementptr %struct.Y* %retval, i32 0, i32 0, i32 0 - store i32 0, i32* %tmp12 - %tmp15 = getelementptr %struct.Y* %retval, i32 0, i32 0, i32 1 - store i32 1, i32* %tmp15 - %retval.upgrd.1 = bitcast %struct.Y* %retval to i64* - %retval.upgrd.2 = load i64* %retval.upgrd.1 - ret i64 %retval.upgrd.2 -} - -it should be extended to do so. - -//===---------------------------------------------------------------------===// - --scalarrepl should promote this to be a vector scalar. - - %struct..0anon = type { <4 x float> } - -define void @test1(<4 x float> %V, float* %P) { - %u = alloca %struct..0anon, align 16 - %tmp = getelementptr %struct..0anon* %u, i32 0, i32 0 - store <4 x float> %V, <4 x float>* %tmp - %tmp1 = bitcast %struct..0anon* %u to [4 x float]* - %tmp.upgrd.1 = getelementptr [4 x float]* %tmp1, i32 0, i32 1 - %tmp.upgrd.2 = load float* %tmp.upgrd.1 - %tmp3 = mul float %tmp.upgrd.2, 2.000000e+00 - store float %tmp3, float* %P - ret void -} - -//===---------------------------------------------------------------------===// - Turn this into a single byte store with no load (the other 3 bytes are unmodified): -void %test(uint* %P) { - %tmp = load uint* %P - %tmp14 = or uint %tmp, 3305111552 - %tmp15 = and uint %tmp14, 3321888767 - store uint %tmp15, uint* %P +define void @test(i32* %P) { + %tmp = load i32* %P + %tmp14 = or i32 %tmp, 3305111552 + %tmp15 = and i32 %tmp14, 3321888767 + store i32 %tmp15, i32* %P ret void } @@ -236,13 +197,6 @@ _bar: addic r3,r3,-1 //===---------------------------------------------------------------------===// -Legalize should lower ctlz like this: - ctlz(x) = popcnt((x-1) & ~x) - -on targets that have popcnt but not ctlz. itanium, what else? - -//===---------------------------------------------------------------------===// - quantum_sigma_x in 462.libquantum contains the following loop: for(i=0; isize; i++) @@ -374,11 +328,6 @@ when it is declared U32. //===---------------------------------------------------------------------===// -Promote for i32 bswap can use i64 bswap + shr. Useful on targets with 64-bit -regs and bswap, like itanium. - -//===---------------------------------------------------------------------===// - LSR should know what GPR types a target has. This code: volatile short X, Y; // globals @@ -634,32 +583,6 @@ once. //===---------------------------------------------------------------------===// -We should extend parameter attributes to capture more information about -pointer parameters for alias analysis. Some ideas: - -1. Add a "nocapture" attribute, which indicates that the callee does not store - the address of the parameter into a global or any other memory location - visible to the callee. This can be used to make basicaa and other analyses - more powerful. It is true for things like memcpy, strcat, and many other - things, including structs passed by value, most C++ references, etc. -2. Generalize readonly to be set on parameters. This is important mod/ref - info for the function, which is important for basicaa and others. It can - also be used by the inliner to avoid inserting a memcpy for byval - arguments when the function is inlined. - -These functions can be inferred by various analysis passes such as the -globalsmodrefaa pass. Note that getting #2 right is actually really tricky. -Consider this code: - -struct S; S G; -void caller(S byvalarg) { G.field = 1; ... } -void callee() { caller(G); } - -The fact that the caller does not modify byval arg is not enough, we need -to know that it doesn't modify G either. This is very tricky. - -//===---------------------------------------------------------------------===// - We should add an FRINT node to the DAG to model targets that have legal implementations of ceil/floor/rint. @@ -814,16 +737,6 @@ be done safely if "b" isn't modified between the strlen and memcpy of course. //===---------------------------------------------------------------------===// -We should be able to evaluate this loop: - -int test(int x_offs) { - while (x_offs > 4) - x_offs -= 4; - return x_offs; -} - -//===---------------------------------------------------------------------===// - Reassociate should turn things like: int factorial(int X) { @@ -909,23 +822,6 @@ multiply hi's into a comparison against the mullo. //===---------------------------------------------------------------------===// -SROA is not promoting the union on the stack in this example, we should end -up with no allocas. - -union vec2d { - double e[2]; - double v __attribute__((vector_size(16))); -}; -typedef union vec2d vec2d; - -static vec2d a={{1,2}}, b={{3,4}}; - -vec2d foo () { - return (vec2d){ .v = a.v + b.v * (vec2d){{5,5}}.v }; -} - -//===---------------------------------------------------------------------===// - Better mod/ref analysis for scanf would allow us to eliminate the vtable and a bunch of other stuff from this example (see PR1604): @@ -1215,16 +1111,6 @@ optimized with "clang -emit-llvm-bc | opt -std-compile-opts". //===---------------------------------------------------------------------===// -We would like to do the following transform in the instcombiner: - - -X/C -> X/-C - -However, this isn't valid if (-X) overflows. We can implement this when we -have the concept of a "C signed subtraction" operator that which is undefined -on overflow. - -//===---------------------------------------------------------------------===// - This was noticed in the entryblock for grokdeclarator in 403.gcc: %tmp = icmp eq i32 %decl_context, 4 @@ -1651,3 +1537,143 @@ globalopt to remove the "stored only" global. //===---------------------------------------------------------------------===// +This code: + +define inreg i32 @foo(i8* inreg %p) nounwind { + %tmp0 = load i8* %p + %tmp1 = ashr i8 %tmp0, 5 + %tmp2 = sext i8 %tmp1 to i32 + ret i32 %tmp2 +} + +could be dagcombine'd to a sign-extending load with a shift. +For example, on x86 this currently gets this: + + movb (%eax), %al + sarb $5, %al + movsbl %al, %eax + +while it could get this: + + movsbl (%eax), %eax + sarl $5, %eax + +//===---------------------------------------------------------------------===// + +GCC PR31029: + +int test(int x) { return 1-x == x; } // --> return false +int test2(int x) { return 2-x == x; } // --> return x == 1 ? + +Always foldable for odd constants, what is the rule for even? + +//===---------------------------------------------------------------------===// + +PR 3381: GEP to field of size 0 inside a struct could be turned into GEP +for next field in struct (which is at same address). + +For example: store of float into { {{}}, float } could be turned into a store to +the float directly. + +//===---------------------------------------------------------------------===// + +#include +double foo(double a) { return sin(a); } + +This compiles into this on x86-64 Linux: +foo: + subq $8, %rsp + call sin + addq $8, %rsp + ret +vs: + +foo: + jmp sin + +//===---------------------------------------------------------------------===// + +The arg promotion pass should make use of nocapture to make its alias analysis +stuff much more precise. + +//===---------------------------------------------------------------------===// + +The following functions should be optimized to use a select instead of a +branch (from gcc PR40072): + +char char_int(int m) {if(m>7) return 0; return m;} +int int_char(char m) {if(m>7) return 0; return m;} + +//===---------------------------------------------------------------------===// + +Instcombine should replace the load with a constant in: + + static const char x[4] = {'a', 'b', 'c', 'd'}; + + unsigned int y(void) { + return *(unsigned int *)x; + } + +It currently only does this transformation when the size of the constant +is the same as the size of the integer (so, try x[5]) and the last byte +is a null (making it a C string). There's no need for these restrictions. + +//===---------------------------------------------------------------------===// + +InstCombine's "turn load from constant into constant" optimization should be +more aggressive in the presence of bitcasts. For example, because of unions, +this code: + +union vec2d { + double e[2]; + double v __attribute__((vector_size(16))); +}; +typedef union vec2d vec2d; + +static vec2d a={{1,2}}, b={{3,4}}; + +vec2d foo () { + return (vec2d){ .v = a.v + b.v * (vec2d){{5,5}}.v }; +} + +Compiles into: + +@a = internal constant %0 { [2 x double] + [double 1.000000e+00, double 2.000000e+00] }, align 16 +@b = internal constant %0 { [2 x double] + [double 3.000000e+00, double 4.000000e+00] }, align 16 +... +define void @foo(%struct.vec2d* noalias nocapture sret %agg.result) nounwind { +entry: + %0 = load <2 x double>* getelementptr (%struct.vec2d* + bitcast (%0* @a to %struct.vec2d*), i32 0, i32 0), align 16 + %1 = load <2 x double>* getelementptr (%struct.vec2d* + bitcast (%0* @b to %struct.vec2d*), i32 0, i32 0), align 16 + + +Instcombine should be able to optimize away the loads (and thus the globals). + + +//===---------------------------------------------------------------------===// + +I saw this constant expression in real code after llvm-g++ -O2: + +declare extern_weak i32 @0(i64) + +define void @foo() { + br i1 icmp eq (i32 zext (i1 icmp ne (i32 (i64)* @0, i32 (i64)* null) to i32), +i32 0), label %cond_true, label %cond_false +cond_true: + ret void +cond_false: + ret void +} + +That branch expression should be reduced to: + + i1 icmp eq (i32 (i64)* @0, i32 (i64)* null) + +It's probably not a perf issue, I just happened to see it while examining +something else and didn't want to forget about it. + +//===---------------------------------------------------------------------===//