X-Git-Url: http://demsky.eecs.uci.edu/git/?a=blobdiff_plain;f=lib%2FTarget%2FREADME.txt;h=9dd2b365c03ee8fd18e07add7865ab4735c250c6;hb=356aed540cda52727440d37dbbbe22cf140b7066;hp=78dcb12581c699ff3271d73fce612b6d8ca5b386;hpb=e46a686dc237735831cd4ce410a8cbe396ec08ab;p=oota-llvm.git

diff --git a/lib/Target/README.txt b/lib/Target/README.txt
index 78dcb12581c..9dd2b365c03 100644
--- a/lib/Target/README.txt
+++ b/lib/Target/README.txt
@@ -125,8 +125,7 @@ specific vector types are target dependent.

 //===---------------------------------------------------------------------===//

-We should add 'unaligned load/store' nodes, and produce them from code like
-this:
+We should produce an unaligned load from code like this:

 v4sf example(float *P) { return (v4sf){P[0], P[1], P[2], P[3] }; }

@@ -167,52 +166,14 @@ if anyone cared enough about sincos.

 //===---------------------------------------------------------------------===//

-Scalar Repl cannot currently promote this testcase to 'ret long cst':
-
-  %struct.X = type { i32, i32 }
-  %struct.Y = type { %struct.X }
-
-define i64 @bar() {
-  %retval = alloca %struct.Y, align 8
-  %tmp12 = getelementptr %struct.Y* %retval, i32 0, i32 0, i32 0
-  store i32 0, i32* %tmp12
-  %tmp15 = getelementptr %struct.Y* %retval, i32 0, i32 0, i32 1
-  store i32 1, i32* %tmp15
-  %retval.upgrd.1 = bitcast %struct.Y* %retval to i64*
-  %retval.upgrd.2 = load i64* %retval.upgrd.1
-  ret i64 %retval.upgrd.2
-}
-
-it should be extended to do so.
-
-//===---------------------------------------------------------------------===//
-
--scalarrepl should promote this to be a vector scalar.
-
-  %struct..0anon = type { <4 x float> }
-
-define void @test1(<4 x float> %V, float* %P) {
-  %u = alloca %struct..0anon, align 16
-  %tmp = getelementptr %struct..0anon* %u, i32 0, i32 0
-  store <4 x float> %V, <4 x float>* %tmp
-  %tmp1 = bitcast %struct..0anon* %u to [4 x float]*
-  %tmp.upgrd.1 = getelementptr [4 x float]* %tmp1, i32 0, i32 1
-  %tmp.upgrd.2 = load float* %tmp.upgrd.1
-  %tmp3 = mul float %tmp.upgrd.2, 2.000000e+00
-  store float %tmp3, float* %P
-  ret void
-}
-
-//===---------------------------------------------------------------------===//
-
 Turn this into a single byte store with no load (the other 3 bytes are
 unmodified):

-void %test(uint* %P) {
-  %tmp = load uint* %P
-  %tmp14 = or uint %tmp, 3305111552
-  %tmp15 = and uint %tmp14, 3321888767
-  store uint %tmp15, uint* %P
+define void @test(i32* %P) {
+  %tmp = load i32* %P
+  %tmp14 = or i32 %tmp, 3305111552
+  %tmp15 = and i32 %tmp14, 3321888767
+  store i32 %tmp15, i32* %P
   ret void
 }

@@ -236,13 +197,6 @@ _bar: addic r3,r3,-1

 //===---------------------------------------------------------------------===//

-Legalize should lower ctlz like this:
-  ctlz(x) = popcnt((x-1) & ~x)
-
-on targets that have popcnt but not ctlz. itanium, what else?
-
-//===---------------------------------------------------------------------===//
-
 quantum_sigma_x in 462.libquantum contains the following loop:

       for(i=0; i<reg->size; i++)
@@ -374,11 +328,6 @@ when it is declared U32.

 //===---------------------------------------------------------------------===//

-Promote for i32 bswap can use i64 bswap + shr. Useful on targets with 64-bit
-regs and bswap, like itanium.
-
-//===---------------------------------------------------------------------===//
-
 LSR should know what GPR types a target has. This code:

 volatile short X, Y; // globals
@@ -634,32 +583,6 @@ once.

 //===---------------------------------------------------------------------===//

-We should extend parameter attributes to capture more information about
-pointer parameters for alias analysis. Some ideas:
-
-1. Add a "nocapture" attribute, which indicates that the callee does not store
-   the address of the parameter into a global or any other memory location
-   visible to the callee. This can be used to make basicaa and other analyses
-   more powerful. It is true for things like memcpy, strcat, and many other
-   things, including structs passed by value, most C++ references, etc.
-2. Generalize readonly to be set on parameters. This is important mod/ref
-   info for the function, which is important for basicaa and others. It can
-   also be used by the inliner to avoid inserting a memcpy for byval
-   arguments when the function is inlined.
-
-These functions can be inferred by various analysis passes such as the
-globalsmodrefaa pass. Note that getting #2 right is actually really tricky.
-Consider this code:
-
-struct S; S G;
-void caller(S byvalarg) { G.field = 1; ... }
-void callee() { caller(G); }
-
-The fact that the caller does not modify byval arg is not enough, we need
-to know that it doesn't modify G either. This is very tricky.
-
-//===---------------------------------------------------------------------===//
-
 We should add an FRINT node to the DAG to model targets that have legal
 implementations of ceil/floor/rint.

@@ -814,16 +737,6 @@ be done safely if "b" isn't modified between the strlen and memcpy of course.

 //===---------------------------------------------------------------------===//

-We should be able to evaluate this loop:
-
-int test(int x_offs) {
-  while (x_offs > 4)
-     x_offs -= 4;
-  return x_offs;
-}
-
-//===---------------------------------------------------------------------===//
-
 Reassociate should turn things like:

 int factorial(int X) {
@@ -909,23 +822,6 @@ multiply hi's into a comparison against the mullo.

 //===---------------------------------------------------------------------===//

-SROA is not promoting the union on the stack in this example, we should end
-up with no allocas.
-
-union vec2d {
-  double e[2];
-  double v __attribute__((vector_size(16)));
-};
-typedef union vec2d vec2d;
-
-static vec2d a={{1,2}}, b={{3,4}};
-
-vec2d foo () {
-  return (vec2d){ .v = a.v + b.v * (vec2d){{5,5}}.v };
-}
-
-//===---------------------------------------------------------------------===//
-
 Better mod/ref analysis for scanf would allow us to eliminate the vtable and a
 bunch of other stuff from this example (see PR1604):

@@ -1215,16 +1111,6 @@ optimized with "clang -emit-llvm-bc | opt -std-compile-opts".

 //===---------------------------------------------------------------------===//

-We would like to do the following transform in the instcombiner:
-
-  -X/C -> X/-C
-
-However, this isn't valid if (-X) overflows. We can implement this when we
-have the concept of a "C signed subtraction" operator that which is undefined
-on overflow.
-
-//===---------------------------------------------------------------------===//
-
 This was noticed in the entryblock for grokdeclarator in 403.gcc:

         %tmp = icmp eq i32 %decl_context, 4
@@ -1689,3 +1575,106 @@ for next field in struct (which is at same address). For example:
 store of float into { {{}}, float } could be turned into a store to the
 float directly.

+//===---------------------------------------------------------------------===//
+
+#include <math.h>
+double foo(double a) { return sin(a); }
+
+This compiles into this on x86-64 Linux:
+foo:
+	subq	$8, %rsp
+	call	sin
+	addq	$8, %rsp
+	ret
+vs:
+
+foo:
+	jmp	sin
+
+//===---------------------------------------------------------------------===//
+
+The arg promotion pass should make use of nocapture to make its alias analysis
+stuff much more precise.
+
+//===---------------------------------------------------------------------===//
+
+The following functions should be optimized to use a select instead of a
+branch (from gcc PR40072):
+
+char char_int(int m) {if(m>7) return 0; return m;}
+int int_char(char m) {if(m>7) return 0; return m;}
+
+//===---------------------------------------------------------------------===//
+
+Instcombine should replace the load with a constant in:
+
+  static const char x[4] = {'a', 'b', 'c', 'd'};
+
+  unsigned int y(void) {
+    return *(unsigned int *)x;
+  }
+
+It currently only does this transformation when the size of the constant
+is the same as the size of the integer (so, try x[5]) and the last byte
+is a null (making it a C string). There's no need for these restrictions.
+
+//===---------------------------------------------------------------------===//
+
+InstCombine's "turn load from constant into constant" optimization should be
+more aggressive in the presence of bitcasts. For example, because of unions,
+this code:
+
+union vec2d {
+  double e[2];
+  double v __attribute__((vector_size(16)));
+};
+typedef union vec2d vec2d;
+
+static vec2d a={{1,2}}, b={{3,4}};
+
+vec2d foo () {
+  return (vec2d){ .v = a.v + b.v * (vec2d){{5,5}}.v };
+}
+
+Compiles into:
+
+@a = internal constant %0 { [2 x double]
+  [double 1.000000e+00, double 2.000000e+00] }, align 16
+@b = internal constant %0 { [2 x double]
+  [double 3.000000e+00, double 4.000000e+00] }, align 16
+...
+define void @foo(%struct.vec2d* noalias nocapture sret %agg.result) nounwind {
+entry:
+  %0 = load <2 x double>* getelementptr (%struct.vec2d*
+           bitcast (%0* @a to %struct.vec2d*), i32 0, i32 0), align 16
+  %1 = load <2 x double>* getelementptr (%struct.vec2d*
+           bitcast (%0* @b to %struct.vec2d*), i32 0, i32 0), align 16
+
+
+Instcombine should be able to optimize away the loads (and thus the globals).
+
+See also PR4973
+
+//===---------------------------------------------------------------------===//
+
+I saw this constant expression in real code after llvm-g++ -O2:
+
+declare extern_weak i32 @0(i64)
+
+define void @foo() {
+  br i1 icmp eq (i32 zext (i1 icmp ne (i32 (i64)* @0, i32 (i64)* null) to i32),
+i32 0), label %cond_true, label %cond_false
+cond_true:
+  ret void
+cond_false:
+  ret void
+}
+
+That branch expression should be reduced to:
+
+  i1 icmp eq (i32 (i64)* @0, i32 (i64)* null)
+
+It's probably not a perf issue, I just happened to see it while examining
+something else and didn't want to forget about it.
+
+//===---------------------------------------------------------------------===//
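
A few of the entries touched by this patch are easier to follow with a
concrete sketch, so three small C illustrations follow. They are editorial
sketches only, with invented helper names, and are not part of the patch or
of the README itself.

For the single-byte-store entry, the constants are 3305111552 = 0xC5000000
and 3321888767 = 0xC5FFFFFF, so the or/and pair forces the most significant
byte to 0xC5 and leaves the other three bytes untouched. A minimal check of
that equivalence, assuming a little-endian byte order for the b[3] index:

#include <assert.h>
#include <stdint.h>
#include <string.h>

/* What the IR in the entry computes: (x | 0xC5000000) & 0xC5FFFFFF. */
static uint32_t via_masks(uint32_t x) {
  return (x | 0xC5000000u) & 0xC5FFFFFFu;
}

/* What the desired codegen would do: store the single byte 0xC5 over the
   most significant byte and leave the rest alone. b[3] is the most
   significant byte only on a little-endian target. */
static uint32_t via_byte_store(uint32_t x) {
  unsigned char b[sizeof x];
  memcpy(b, &x, sizeof x);
  b[3] = 0xC5;                                /* the one-byte store */
  memcpy(&x, b, sizeof x);
  return x;
}

int main(void) {
  uint32_t probe = 0;
  for (int i = 0; i < 100000; i++) {
    probe = probe * 1664525u + 1013904223u;   /* simple LCG test values */
    assert(via_masks(probe) == via_byte_store(probe));
  }
  return 0;
}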
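
For the instcombiner note about -X/C -> X/-C that this patch removes, the
problem case is X = INT_MIN: the negation wraps, so the two forms disagree.
A small sketch of the counterexample, where neg_wrap is an invented helper
that mimics a wrapping 'sub 0, x' and the conversion back to int is assumed
to wrap in two's complement:

#include <limits.h>
#include <stdio.h>

/* Wrapping negation done in unsigned arithmetic, like 'sub i32 0, %x' with
   no guarantee against signed overflow. */
static int neg_wrap(int x) {
  return (int)(0u - (unsigned)x);
}

int main(void) {
  int x = INT_MIN;
  int c = 2;
  printf("-X/C (wrapping) = %d\n", neg_wrap(x) / c);   /* INT_MIN / 2  */
  printf(" X/-C           = %d\n", x / -c);            /* INT_MIN / -2 */
  return 0;
}

The two printed values have opposite signs, which is why the entry asks for a
subtraction that is undefined on overflow before the rewrite can be enabled.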
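
For the 'load from constant string' entry added above, the folded result on a
little-endian target would simply be the constant 0x64636261, the bytes 'a',
'b', 'c', 'd' from low address to high. A quick check, assuming a
little-endian host with a 4-byte unsigned int (as the entry itself does) and
using memcpy instead of the aliasing cast in the entry:

#include <stdio.h>
#include <string.h>

static const char x[4] = {'a', 'b', 'c', 'd'};

int main(void) {
  unsigned int y;
  memcpy(&y, x, sizeof y);     /* same four bytes the entry's cast would load */
  printf("0x%08x\n", y);       /* prints 0x64636261 on a little-endian host */
  return 0;
}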