diff --git a/lib/Target/README.txt b/lib/Target/README.txt
index 0096b0f9f42..23572cd6bc5 100644
--- a/lib/Target/README.txt
+++ b/lib/Target/README.txt
@@ -317,11 +317,6 @@ this construct.

//===---------------------------------------------------------------------===//

-Instcombine misses several of these cases (see the testcase in the patch):
-http://gcc.gnu.org/ml/gcc-patches/2006-10/msg01519.html
-
-//===---------------------------------------------------------------------===//
-
viterbi speeds up *significantly* if the various "history" related copy loops
are turned into memcpy calls at the source level.  We need a "loops to memcpy"
pass.

@@ -631,6 +626,25 @@ implementations of ceil/floor/rint.

//===---------------------------------------------------------------------===//

This GCC bug: http://gcc.gnu.org/bugzilla/show_bug.cgi?id=34043
contains a testcase that compiles down to:

	%struct.XMM128 = type { <4 x float> }
..
	%src = alloca %struct.XMM128
..
	%tmp6263 = bitcast %struct.XMM128* %src to <2 x i64>*
	%tmp65 = getelementptr %struct.XMM128* %src, i32 0, i32 0
	store <2 x i64> %tmp5899, <2 x i64>* %tmp6263, align 16
	%tmp66 = load <4 x float>* %tmp65, align 16
	%tmp71 = add <4 x float> %tmp66, %tmp66

If the mid-level optimizer turned the bitcast of pointer + store of tmp5899
into a bitcast of the vector value and a store to the pointer, then the
store->load could be easily removed.

//===---------------------------------------------------------------------===//

Consider:

int test() {

@@ -672,3 +686,101 @@ instead of:
  ...

//===---------------------------------------------------------------------===//

http://llvm.org/PR717:

The following code should compile into "ret int undef".  Instead, LLVM
produces "ret int 0":

int f() {
  int x = 4;
  int y;
  if (x == 3) y = 0;
  return y;
}

//===---------------------------------------------------------------------===//

The loop unroller should partially unroll loops (instead of peeling them)
when code growth isn't too bad and when an unroll count allows simplification
of some code within the loop.  One trivial example is:

#include <stdio.h>
int main() {
    int nRet = 17;
    int nLoop;
    for ( nLoop = 0; nLoop < 1000; nLoop++ ) {
        if ( nLoop & 1 )
            nRet += 2;
        else
            nRet -= 1;
    }
    return nRet;
}

Unrolling by 2 would eliminate the '&1' in both copies, leading to a net
reduction in code size.  The resultant code would then also be suitable for
exit value computation.

//===---------------------------------------------------------------------===//

We miss a bunch of rotate opportunities on various targets, including ppc,
x86, etc.  On X86 in particular, we miss many 'rotate by variable' cases
because the rotate matching code in dag combine doesn't look through truncates
aggressively enough.
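
For reference, the rotate-by-variable idiom we ultimately want to turn into a
single rotate instruction can be written in C as the sketch below (the
function name is made up; the '& 31' masking keeps both shift counts well
defined when y is 0, whereas the testcases that follow use the plain unmasked
form):

unsigned rotl32(unsigned x, int y) {
  /* Both shift counts stay in [0,31] thanks to the masking, so y == 0 is
     well defined; ideally this whole expression becomes one rotate. */
  return (x << (y & 31)) | (x >> ((32 - y) & 31));
}
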
Here are some testcases reduced from GCC PR17886:

unsigned long long f(unsigned long long x, int y) {
  return (x << y) | (x >> 64-y);
}
unsigned f2(unsigned x, int y){
  return (x << y) | (x >> 32-y);
}
unsigned long long f3(unsigned long long x){
  int y = 9;
  return (x << y) | (x >> 64-y);
}
unsigned f4(unsigned x){
  int y = 10;
  return (x << y) | (x >> 32-y);
}
unsigned long long f5(unsigned long long x, unsigned long long y) {
  return (x << 8) | ((y >> 48) & 0xffull);
}
unsigned long long f6(unsigned long long x, unsigned long long y, int z) {
  switch(z) {
  case 1:
    return (x << 8) | ((y >> 48) & 0xffull);
  case 2:
    return (x << 16) | ((y >> 40) & 0xffffull);
  case 3:
    return (x << 24) | ((y >> 32) & 0xffffffull);
  case 4:
    return (x << 32) | ((y >> 24) & 0xffffffffull);
  default:
    return (x << 40) | ((y >> 16) & 0xffffffffffull);
  }
}

On X86-64, we only handle f3/f4 right.  On x86-32, several of these
generate truly horrible code, instead of using shld and friends.  On
ARM, we end up with calls to L___lshrdi3/L___ashldi3 in f, which is
badness.  PPC64 misses f, f5 and f6.  CellSPU aborts in isel.

//===---------------------------------------------------------------------===//

We do a number of simplifications in simplify-libcalls to strength reduce
standard library functions, but we don't currently merge them together.  For
example, it is useful to merge memcpy(a,b,strlen(b)) -> strcpy.  This can only
be done safely if "b" isn't modified between the strlen and the memcpy, of
course.

//===---------------------------------------------------------------------===//

We should be able to evaluate this loop at compile time (a sketch of one
possible closed form follows after the separator below):

int test(int x_offs) {
  while (x_offs > 4)
    x_offs -= 4;
  return x_offs;
}

//===---------------------------------------------------------------------===//
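
As a concrete target for the note above about evaluating the x_offs loop, here
is one closed form it could be folded to (a sketch only; the function name is
made up, and the '& 3' is equivalent to '% 4' because x_offs - 1 is positive
on that path):

int test_closed_form(int x_offs) {
  /* The loop leaves values <= 4 untouched; otherwise it reduces x_offs
     into the range [1,4] while preserving its residue mod 4. */
  if (x_offs > 4)
    return ((x_offs - 1) & 3) + 1;
  return x_offs;
}

//===---------------------------------------------------------------------===//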