X-Git-Url: http://demsky.eecs.uci.edu/git/?a=blobdiff_plain;f=lib%2FTarget%2FREADME.txt;h=aad621f440ac7c3f424b371eacbbf39539ac01b5;hb=5ca124691bc81ed013593151c500d8104f7068dd;hp=559835e8b54c09dfd206a38e54f72b3a333e8c0d;hpb=58bb61ae949a3b7197e6c014dcd469dc2fbbd020;p=oota-llvm.git
diff --git a/lib/Target/README.txt b/lib/Target/README.txt
index 559835e8b54..aad621f440a 100644
--- a/lib/Target/README.txt
+++ b/lib/Target/README.txt
@@ -220,7 +220,20 @@ so cool to turn it into something like:
 
 ... which would only do one 32-bit XOR per loop iteration instead of two.
 It would also be nice to recognize the reg->size doesn't alias reg->node[i], but
-alas...
+alas.
+
+//===---------------------------------------------------------------------===//
+
+This should be optimized to one 'and' and one 'or', from PR4216:
+
+define i32 @test_bitfield(i32 %bf.prev.low) nounwind ssp {
+entry:
+  %bf.prev.lo.cleared10 = or i32 %bf.prev.low, 32962 ; [#uses=1]
+  %0 = and i32 %bf.prev.low, -65536 ; [#uses=1]
+  %1 = and i32 %bf.prev.lo.cleared10, 40186 ; [#uses=1]
+  %2 = or i32 %1, %0 ; [#uses=1]
+  ret i32 %2
+}
 
 //===---------------------------------------------------------------------===//
 
@@ -326,6 +339,8 @@ we don't have whole-function selection dags. On x86, this means we use one
 extra register for the function when effective_addr2 is declared as U64 than
 when it is declared U32.
 
+PHI Slicing could be extended to do this.
+
 //===---------------------------------------------------------------------===//
 
 LSR should know what GPR types a target has. This code:
@@ -337,24 +352,22 @@ void foo(int N) {
   for (i = 0; i < N; i++) { X = i; Y = i*4; }
 }
 
-produces two identical IV's (after promotion) on PPC/ARM:
+produces two nearly identical IVs (after promotion) on PPC/ARM:
+
+LBB1_2:
+  ldr r3, LCPI1_0
+  ldr r3, [r3]
+  strh r2, [r3]
+  ldr r3, LCPI1_1
+  ldr r3, [r3]
+  strh r1, [r3]
+  add r1, r1, #4
+  add r2, r2, #1  <- [0,+,1]
+  sub r0, r0, #1  <- [0,-,1]
+  cmp r0, #0
+  bne LBB1_2
 
-LBB1_1: @bb.preheader
-  mov r3, #0
-  mov r2, r3
-  mov r1, r3
-LBB1_2: @bb
-  ldr r12, LCPI1_0
-  ldr r12, [r12]
-  strh r2, [r12]
-  ldr r12, LCPI1_1
-  ldr r12, [r12]
-  strh r3, [r12]
-  add r1, r1, #1  <- [0,+,1]
-  add r3, r3, #4
-  add r2, r2, #1  <- [0,+,1]
-  cmp r1, r0
-  bne LBB1_2 @bb
+LSR should reuse the "+" IV for the exit test.
 
 //===---------------------------------------------------------------------===//
 
@@ -395,22 +408,6 @@ return: ; preds = %then.1, %else.0, %then.0
 
 //===---------------------------------------------------------------------===//
 
-Tail recursion elimination is not transforming this function, because it is
-returning n, which fails the isDynamicConstant check in the accumulator
-recursion checks.
-
-long long fib(const long long n) {
-  switch(n) {
-    case 0:
-    case 1:
-      return n;
-    default:
-      return fib(n-1) + fib(n-2);
-  }
-}
-
-//===---------------------------------------------------------------------===//
-
 Tail recursion elimination should handle:
 
 int pow2m1(int n) {
@@ -588,25 +585,6 @@ implementations of ceil/floor/rint.
 
 //===---------------------------------------------------------------------===//
 
-This GCC bug: http://gcc.gnu.org/bugzilla/show_bug.cgi?id=34043
-contains a testcase that compiles down to:
-
-  %struct.XMM128 = type { <4 x float> }
-..
-  %src = alloca %struct.XMM128
-..
-  %tmp6263 = bitcast %struct.XMM128* %src to <2 x i64>*
-  %tmp65 = getelementptr %struct.XMM128* %src, i32 0, i32 0
-  store <2 x i64> %tmp5899, <2 x i64>* %tmp6263, align 16
-  %tmp66 = load <4 x float>* %tmp65, align 16
-  %tmp71 = add <4 x float> %tmp66, %tmp66
-
-If the mid-level optimizer turned the bitcast of pointer + store of tmp5899
-into a bitcast of the vector value and a store to the pointer, then the
-store->load could be easily removed.
-
-//===---------------------------------------------------------------------===//
-
 Consider:
 
 int test() {
@@ -1111,16 +1089,6 @@ optimized with "clang -emit-llvm-bc | opt -std-compile-opts".
 
 //===---------------------------------------------------------------------===//
 
-We would like to do the following transform in the instcombiner:
-
-  -X/C -> X/-C
-
-However, this isn't valid if (-X) overflows. We can implement this when we
-have the concept of a "C signed subtraction" operator that which is undefined
-on overflow.
-
-//===---------------------------------------------------------------------===//
-
 This was noticed in the entryblock for grokdeclarator in 403.gcc:
 
   %tmp = icmp eq i32 %decl_context, 4
@@ -1247,6 +1215,40 @@ GCC PR33344 is a similar case.
 
 //===---------------------------------------------------------------------===//
 
+[PHI TRANSLATE INDEXED GEPs] PR5313
+
+Load redundancy elimination for a simple loop. This loop:
+
+void append_text(const char* text,unsigned char * const io) {
+  while(*text)
+    *io=*text++;
+}
+
+Compiles to have a fully redundant load in the loop (%2):
+
+define void @append_text(i8* nocapture %text, i8* nocapture %io) nounwind {
+entry:
+  %0 = load i8* %text, align 1 ; [#uses=1]
+  %1 = icmp eq i8 %0, 0 ; [#uses=1]
+  br i1 %1, label %return, label %bb
+
+bb: ; preds = %bb, %entry
+  %indvar = phi i32 [ 0, %entry ], [ %tmp, %bb ] ; [#uses=2]
+  %text_addr.04 = getelementptr i8* %text, i32 %indvar ; [#uses=1]
+  %2 = load i8* %text_addr.04, align 1 ; [#uses=1]
+  store i8 %2, i8* %io, align 1
+  %tmp = add i32 %indvar, 1 ; [#uses=2]
+  %scevgep = getelementptr i8* %text, i32 %tmp ; [#uses=1]
+  %3 = load i8* %scevgep, align 1 ; [#uses=1]
+  %4 = icmp eq i8 %3, 0 ; [#uses=1]
+  br i1 %4, label %return, label %bb
+
+return: ; preds = %bb, %entry
+  ret void
+}
+
+//===---------------------------------------------------------------------===//
+
 There are many load PRE testcases in testsuite/gcc.dg/tree-ssa/loadpre* in the
 GCC testsuite. There are many pre testcases as ssa-pre-*.c
@@ -1299,6 +1301,8 @@ http://gcc.gnu.org/bugzilla/show_bug.cgi?id=35287 [LPRE crit edge splitting]
 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=34677 (licm does this, LPRE crit edge)
 llvm-gcc t2.c -S -o - -O0 -emit-llvm | llvm-as | opt -mem2reg -simplifycfg -gvn | llvm-dis
 
+http://gcc.gnu.org/bugzilla/show_bug.cgi?id=16799 [BITCAST PHI TRANS]
+
 //===---------------------------------------------------------------------===//
 
 Type based alias analysis:
@@ -1306,31 +1310,25 @@ http://gcc.gnu.org/bugzilla/show_bug.cgi?id=14705
 
 //===---------------------------------------------------------------------===//
 
-When GVN/PRE finds a store of float* to a must aliases pointer when expecting
-an int*, it should turn it into a bitcast. This is a nice generalization of
-the SROA hack that would apply to other cases, e.g.:
-
-int foo(int C, int *P, float X) {
-  if (C) {
-    bar();
-    *P = 42;
-  } else
-    *(float*)P = X;
-
-  return *P;
-}
-
-
-One example (that requires crazy phi translation) is:
-http://gcc.gnu.org/bugzilla/show_bug.cgi?id=16799 [BITCAST PHI TRANS]
-
-//===---------------------------------------------------------------------===//
-
 A/B get pinned to the stack because we turn an if/then into a select instead
 of PRE'ing the load/store. This may be fixable in instcombine:
 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=37892
 
+struct X { int i; };
+int foo (int x) {
+  struct X a;
+  struct X b;
+  struct X *p;
+  a.i = 1;
+  b.i = 2;
+  if (x)
+    p = &a;
+  else
+    p = &b;
+  return p->i;
+}
+//===---------------------------------------------------------------------===//
 
 Interesting missed case because of control flow flattening (should be 2 loads):
 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=26629
@@ -1616,74 +1614,123 @@ int int_char(char m) {if(m>7) return 0; return m;}
 
 //===---------------------------------------------------------------------===//
 
-Instcombine should replace the load with a constant in:
+int func(int a, int b) { if (a & 0x80) b |= 0x80; else b &= ~0x80; return b; }
 
-  static const char x[4] = {'a', 'b', 'c', 'd'};
-
-  unsigned int y(void) {
-    return *(unsigned int *)x;
-  }
+Generates this:
 
-It currently only does this transformation when the size of the constant
-is the same as the size of the integer (so, try x[5]) and the last byte
-is a null (making it a C string). There's no need for these restrictions.
+define i32 @func(i32 %a, i32 %b) nounwind readnone ssp {
+entry:
+  %0 = and i32 %a, 128 ; [#uses=1]
+  %1 = icmp eq i32 %0, 0 ; [#uses=1]
+  %2 = or i32 %b, 128 ; [#uses=1]
+  %3 = and i32 %b, -129 ; [#uses=1]
+  %b_addr.0 = select i1 %1, i32 %3, i32 %2 ; [#uses=1]
+  ret i32 %b_addr.0
+}
 
-//===---------------------------------------------------------------------===//
+However, it's functionally equivalent to:
 
-InstCombine's "turn load from constant into constant" optimization should be
-more aggressive in the presence of bitcasts. For example, because of unions,
-this code:
+  b = (b & ~0x80) | (a & 0x80);
 
-union vec2d {
-  double e[2];
-  double v __attribute__((vector_size(16)));
-};
-typedef union vec2d vec2d;
+Which generates this:
 
-static vec2d a={{1,2}}, b={{3,4}};
-
-vec2d foo () {
-  return (vec2d){ .v = a.v + b.v * (vec2d){{5,5}}.v };
+define i32 @func(i32 %a, i32 %b) nounwind readnone ssp {
+entry:
+  %0 = and i32 %b, -129 ; [#uses=1]
+  %1 = and i32 %a, 128 ; [#uses=1]
+  %2 = or i32 %0, %1 ; [#uses=1]
+  ret i32 %2
 }
 
-Compiles into:
+This can be generalized for other forms:
 
-@a = internal constant %0 { [2 x double]
-  [double 1.000000e+00, double 2.000000e+00] }, align 16
-@b = internal constant %0 { [2 x double]
-  [double 3.000000e+00, double 4.000000e+00] }, align 16
-...
-define void @foo(%struct.vec2d* noalias nocapture sret %agg.result) nounwind {
-entry:
-  %0 = load <2 x double>* getelementptr (%struct.vec2d*
-    bitcast (%0* @a to %struct.vec2d*), i32 0, i32 0), align 16
-  %1 = load <2 x double>* getelementptr (%struct.vec2d*
-    bitcast (%0* @b to %struct.vec2d*), i32 0, i32 0), align 16
+  b = (b & ~0x80) | (a & 0x40) << 1;
+
+//===---------------------------------------------------------------------===//
+
+These two functions produce different code. They shouldn't:
+#include <stdint.h>
+
+uint8_t p1(uint8_t b, uint8_t a) {
+  b = (b & ~0xc0) | (a & 0xc0);
+  return (b);
+}
+
+uint8_t p2(uint8_t b, uint8_t a) {
+  b = (b & ~0x40) | (a & 0x40);
+  b = (b & ~0x80) | (a & 0x80);
+  return (b);
+}
 
-Instcombine should be able to optimize away the loads (and thus the globals).
 
+define zeroext i8 @p1(i8 zeroext %b, i8 zeroext %a) nounwind readnone ssp {
+entry:
+  %0 = and i8 %b, 63 ; [#uses=1]
+  %1 = and i8 %a, -64 ; [#uses=1]
+  %2 = or i8 %1, %0 ; [#uses=1]
+  ret i8 %2
+}
+define zeroext i8 @p2(i8 zeroext %b, i8 zeroext %a) nounwind readnone ssp {
+entry:
+  %0 = and i8 %b, 63 ; [#uses=1]
+  %.masked = and i8 %a, 64 ; [#uses=1]
+  %1 = and i8 %a, -128 ; [#uses=1]
+  %2 = or i8 %1, %0 ; [#uses=1]
+  %3 = or i8 %2, %.masked ; [#uses=1]
+  ret i8 %3
+}
 
 //===---------------------------------------------------------------------===//
 
-I saw this constant expression in real code after llvm-g++ -O2:
+IPSCCP does not currently propagate argument dependent constants through
+functions where it does not know all of the callers. This includes functions
+with normal external linkage as well as templates, C99 inline functions etc.
+Specifically, it does nothing to:
 
-declare extern_weak i32 @0(i64)
+define i32 @test(i32 %x, i32 %y, i32 %z) nounwind {
+entry:
+  %0 = add nsw i32 %y, %z
+  %1 = mul i32 %0, %x
+  %2 = mul i32 %y, %z
+  %3 = add nsw i32 %1, %2
+  ret i32 %3
+}
 
-define void @foo() {
-  br i1 icmp eq (i32 zext (i1 icmp ne (i32 (i64)* @0, i32 (i64)* null) to i32),
-i32 0), label %cond_true, label %cond_false
-cond_true:
-  ret void
-cond_false:
-  ret void
+define i32 @test2() nounwind {
+entry:
+  %0 = call i32 @test(i32 1, i32 2, i32 4) nounwind
+  ret i32 %0
 }
 
-That branch expression should be reduced to:
+It would be interesting to extend IPSCCP to be able to handle simple cases like
+this, where all of the arguments to a call are constant. Because IPSCCP runs
+before inlining, trivial templates and inline functions are not yet inlined.
+The results for a function + set of constant arguments should be memoized in a
+map.
+
+//===---------------------------------------------------------------------===//
+
+The libcall constant folding stuff should be moved out of SimplifyLibcalls into
+libanalysis' constantfolding logic. This would allow IPSCCP to be able to
+handle simple things like this:
 
-  i1 icmp eq (i32 (i64)* @0, i32 (i64)* null)
+static int foo(const char *X) { return strlen(X); }
+int bar() { return foo("abcd"); }
+
+//===---------------------------------------------------------------------===//
+
+InstCombine should use SimplifyDemandedBits to remove the or instruction:
+
+define i1 @test(i8 %x, i8 %y) {
+  %A = or i8 %x, 1
+  %B = icmp ugt i8 %A, 3
+  ret i1 %B
+}
 
-It's probably not a perf issue, I just happened to see it while examining
-something else and didn't want to forget about it.
+Currently instcombine calls SimplifyDemandedBits with either all bits or just
+the sign bit, if the comparison is obviously a sign test. In this case, we only
+need all but the bottom two bits from %A, and if we gave that mask to SDB it
+would delete the or instruction for us.
 
 //===---------------------------------------------------------------------===//
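
For the IPSCCP note added above: the result to aim for is @test2 collapsing to a
constant return, since (2+4)*1 + 2*4 = 14. A hand-derived sketch of that end
state (illustrative only, not actual opt output):

define i32 @test2() nounwind {
entry:
  ; @test(1, 2, 4) evaluated at compile time: (2+4)*1 + 2*4 = 14
  ret i32 14
}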
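
Likewise for the libcall constant folding note: once strlen("abcd") folds to 4,
@bar should constant-fold all the way down. A hand-derived sketch of the desired
result (not actual output):

define i32 @bar() nounwind {
entry:
  ; foo("abcd") -> strlen("abcd") -> 4
  ret i32 4
}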
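
For the SimplifyDemandedBits note: the 'ugt 3' test only looks at the top six
bits of %A, so the 'or i8 %x, 1' contributes nothing and should disappear. A
hand-derived sketch of the expected simplification (not actual instcombine
output):

define i1 @test(i8 %x, i8 %y) {
  ; bit 0 is not demanded by the comparison, so the 'or' folds away
  %B = icmp ugt i8 %x, 3
  ret i1 %B
}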