X-Git-Url: http://demsky.eecs.uci.edu/git/?a=blobdiff_plain;f=lib%2FTarget%2FREADME.txt;h=7e9888cc13e80b48466cb5cca8997819eb07cb92;hb=6902c8db2603287335a7e46257b4530674983116;hp=99e1252dc8aed1584e585bbaccff02842ffa1253;hpb=ed0318eab83cac5250353d8b70bca684ce247d70;p=oota-llvm.git diff --git a/lib/Target/README.txt b/lib/Target/README.txt index 99e1252dc8a..7e9888cc13e 100644 --- a/lib/Target/README.txt +++ b/lib/Target/README.txt @@ -2,22 +2,6 @@ Target Independent Opportunities: //===---------------------------------------------------------------------===// -With the recent changes to make the implicit def/use set explicit in -machineinstrs, we should change the target descriptions for 'call' instructions -so that the .td files don't list all the call-clobbered registers as implicit -defs. Instead, these should be added by the code generator (e.g. on the dag). - -This has a number of uses: - -1. PPC32/64 and X86 32/64 can avoid having multiple copies of call instructions - for their different impdef sets. -2. Targets with multiple calling convs (e.g. x86) which have different clobber - sets don't need copies of call instructions. -3. 'Interprocedural register allocation' can be done to reduce the clobber sets - of calls. - -//===---------------------------------------------------------------------===// - We should recognized various "overflow detection" idioms and translate them into llvm.uadd.with.overflow and similar intrinsics. Here is a multiply idiom: @@ -109,44 +93,6 @@ This requires reassociating to forms of expressions that are already available, something that reassoc doesn't think about yet. 
-//===---------------------------------------------------------------------===// - -This function: (derived from GCC PR19988) -double foo(double x, double y) { - return ((x + 0.1234 * y) * (x + -0.1234 * y)); -} - -compiles to: -_foo: - movapd %xmm1, %xmm2 - mulsd LCPI1_1(%rip), %xmm1 - mulsd LCPI1_0(%rip), %xmm2 - addsd %xmm0, %xmm1 - addsd %xmm0, %xmm2 - movapd %xmm1, %xmm0 - mulsd %xmm2, %xmm0 - ret - -Reassociate should be able to turn it into: - -double foo(double x, double y) { - return ((x + 0.1234 * y) * (x - 0.1234 * y)); -} - -Which allows the multiply by constant to be CSE'd, producing: - -_foo: - mulsd LCPI1_0(%rip), %xmm1 - movapd %xmm1, %xmm2 - addsd %xmm0, %xmm2 - subsd %xmm1, %xmm0 - mulsd %xmm2, %xmm0 - ret - -This doesn't need -ffast-math support at all. This is particularly bad because -the llvm-gcc frontend is canonicalizing the later into the former, but clang -doesn't have this problem. - //===---------------------------------------------------------------------===// These two functions should generate the same code on big-endian systems: @@ -160,7 +106,7 @@ for 1,2,4,8 bytes. //===---------------------------------------------------------------------===// It would be nice to revert this patch: -http://lists.cs.uiuc.edu/pipermail/llvm-commits/Week-of-Mon-20060213/031986.html +http://lists.llvm.org/pipermail/llvm-commits/Week-of-Mon-20060213/031986.html And teach the dag combiner enough to simplify the code expanded before legalize. It seems plausible that this knowledge would let it simplify other @@ -168,7 +114,7 @@ stuff too. //===---------------------------------------------------------------------===// -For vector types, TargetData.cpp::getTypeInfo() returns alignment that is equal +For vector types, DataLayout.cpp::getTypeInfo() returns alignment that is equal to the type size. It works but can be overly conservative as the alignment of specific vector types are target dependent. 
@@ -278,22 +224,7 @@ unsigned countbits_slow(unsigned v) { c += v & 1; return c; } -unsigned countbits_fast(unsigned v){ - unsigned c; - for (c = 0; v; c++) - v &= v - 1; // clear the least significant bit set - return c; -} -BITBOARD = unsigned long long -int PopCnt(register BITBOARD a) { - register int c=0; - while(a) { - c++; - a &= a - 1; - } - return c; -} unsigned int popcount(unsigned int input) { unsigned int count = 0; for (unsigned int i = 0; i < 4 * 8; i++) @@ -802,7 +733,7 @@ f (unsigned long a, unsigned long b, unsigned long c) return ((a & (c - 1)) != 0) | ((b & (c - 1)) != 0); } Both should combine to ((a|b) & (c-1)) != 0. Currently not optimized with -"clang -emit-llvm-bc | opt -std-compile-opts". +"clang -emit-llvm-bc | opt -O3". //===---------------------------------------------------------------------===// @@ -815,7 +746,7 @@ void clear_pmd_range(unsigned long start, unsigned long end) } The expression should optimize to something like "!((start|end)&~PMD_MASK). Currently not optimized with "clang --emit-llvm-bc | opt -std-compile-opts". +-emit-llvm-bc | opt -O3". //===---------------------------------------------------------------------===// @@ -834,7 +765,7 @@ int f(int x, int y) return (abs(x)) >= 0; } This should optimize to x == INT_MIN. (With -fwrapv.) Currently not -optimized with "clang -emit-llvm-bc | opt -std-compile-opts". +optimized with "clang -emit-llvm-bc | opt -O3". //===---------------------------------------------------------------------===// @@ -870,99 +801,119 @@ rshift_gt (unsigned int a) bar (); } -void neg_eq_cst(unsigned int a) { -if (-a == 123) -bar(); -} - All should simplify to a single comparison. All of these are currently not optimized with "clang -emit-llvm-bc | opt --std-compile-opts". +-O3". //===---------------------------------------------------------------------===// From GCC Bug 32605: int c(int* x) {return (char*)x+2 == (char*)x;} Should combine to 0. 
Currently not optimized with "clang --emit-llvm-bc | opt -std-compile-opts" (although llc can optimize it). +-emit-llvm-bc | opt -O3" (although llc can optimize it). //===---------------------------------------------------------------------===// int a(unsigned b) {return ((b << 31) | (b << 30)) >> 31;} Should be combined to "((b >> 1) | b) & 1". Currently not optimized -with "clang -emit-llvm-bc | opt -std-compile-opts". +with "clang -emit-llvm-bc | opt -O3". //===---------------------------------------------------------------------===// unsigned a(unsigned x, unsigned y) { return x | (y & 1) | (y & 2);} Should combine to "x | (y & 3)". Currently not optimized with "clang --emit-llvm-bc | opt -std-compile-opts". +-emit-llvm-bc | opt -O3". //===---------------------------------------------------------------------===// int a(int a, int b, int c) {return (~a & c) | ((c|a) & b);} Should fold to "(~a & c) | (a & b)". Currently not optimized with -"clang -emit-llvm-bc | opt -std-compile-opts". +"clang -emit-llvm-bc | opt -O3". //===---------------------------------------------------------------------===// int a(int a,int b) {return (~(a|b))|a;} Should fold to "a|~b". Currently not optimized with "clang --emit-llvm-bc | opt -std-compile-opts". +-emit-llvm-bc | opt -O3". //===---------------------------------------------------------------------===// int a(int a, int b) {return (a&&b) || (a&&!b);} Should fold to "a". Currently not optimized with "clang -emit-llvm-bc -| opt -std-compile-opts". +| opt -O3". //===---------------------------------------------------------------------===// int a(int a, int b, int c) {return (a&&b) || (!a&&c);} Should fold to "a ? b : c", or at least something sane. Currently not -optimized with "clang -emit-llvm-bc | opt -std-compile-opts". +optimized with "clang -emit-llvm-bc | opt -O3". 
//===---------------------------------------------------------------------===// int a(int a, int b, int c) {return (a&&b) || (a&&c) || (a&&b&&c);} Should fold to a && (b || c). Currently not optimized with "clang --emit-llvm-bc | opt -std-compile-opts". +-emit-llvm-bc | opt -O3". //===---------------------------------------------------------------------===// int a(int x) {return x | ((x & 8) ^ 8);} Should combine to x | 8. Currently not optimized with "clang --emit-llvm-bc | opt -std-compile-opts". +-emit-llvm-bc | opt -O3". //===---------------------------------------------------------------------===// int a(int x) {return x ^ ((x & 8) ^ 8);} Should also combine to x | 8. Currently not optimized with "clang --emit-llvm-bc | opt -std-compile-opts". +-emit-llvm-bc | opt -O3". //===---------------------------------------------------------------------===// int a(int x) {return ((x | -9) ^ 8) & x;} Should combine to x & -9. Currently not optimized with "clang --emit-llvm-bc | opt -std-compile-opts". +-emit-llvm-bc | opt -O3". //===---------------------------------------------------------------------===// unsigned a(unsigned a) {return a * 0x11111111 >> 28 & 1;} Should combine to "a * 0x88888888 >> 31". Currently not optimized -with "clang -emit-llvm-bc | opt -std-compile-opts". +with "clang -emit-llvm-bc | opt -O3". //===---------------------------------------------------------------------===// unsigned a(char* x) {if ((*x & 32) == 0) return b();} There's an unnecessary zext in the generated code with "clang --emit-llvm-bc | opt -std-compile-opts". +-emit-llvm-bc | opt -O3". //===---------------------------------------------------------------------===// unsigned a(unsigned long long x) {return 40 * (x >> 1);} Should combine to "20 * (((unsigned)x) & -2)". Currently not -optimized with "clang -emit-llvm-bc | opt -std-compile-opts". +optimized with "clang -emit-llvm-bc | opt -O3". 
+ +//===---------------------------------------------------------------------===// + +int g(int x) { return (x - 10) < 0; } +Should combine to "x <= 9" (the sub has nsw). Currently not +optimized with "clang -emit-llvm-bc | opt -O3". + +//===---------------------------------------------------------------------===// + +int g(int x) { return (x + 10) < 0; } +Should combine to "x < -10" (the add has nsw). Currently not +optimized with "clang -emit-llvm-bc | opt -O3". + +//===---------------------------------------------------------------------===// + +int f(int i, int j) { return i < j + 1; } +int g(int i, int j) { return j > i - 1; } +Should combine to "i <= j" (the add/sub has nsw). Currently not +optimized with "clang -emit-llvm-bc | opt -O3". + +//===---------------------------------------------------------------------===// + +unsigned f(unsigned x) { return ((x & 7) + 1) & 15; } +The & 15 part should be optimized away because it doesn't change the result. +Currently not optimized with "clang -emit-llvm-bc | opt -O3". //===---------------------------------------------------------------------===// @@ -1174,7 +1125,7 @@ There are many load PRE testcases in testsuite/gcc.dg/tree-ssa/loadpre* in the GCC testsuite, ones we don't get yet are (checked through loadpre25): [CRIT EDGE BREAKING] -loadpre3.c predcom-4.c +predcom-4.c [PRE OF READONLY CALL] loadpre5.c @@ -1317,7 +1268,8 @@ int foo (void) { .. else if (strchr ("<>", *intel_parser.op_string) -Those should be turned into a switch. +Those should be turned into a switch. SimplifyLibCalls only gets the second +case. //===---------------------------------------------------------------------===// @@ -1767,7 +1719,6 @@ case it choses instead to keep the max operation obvious.
//===---------------------------------------------------------------------===// -Switch lowering generates less than ideal code for the following switch: define void @a(i32 %x) nounwind { entry: switch i32 %x, label %if.end [ @@ -1788,19 +1739,15 @@ declare void @foo() Generated code on x86-64 (other platforms give similar results): a: cmpl $5, %edi - ja .LBB0_2 - movl %edi, %eax - movl $47, %ecx - btq %rax, %rcx - jb .LBB0_3 + ja .LBB0_2 + cmpl $4, %edi + jne .LBB0_3 .LBB0_2: ret .LBB0_3: jmp foo # TAILCALL -The movl+movl+btq+jb could be simplified to a cmpl+jne. - -Or, if we wanted to be really clever, we could simplify the whole thing to +If we wanted to be really clever, we could simplify the whole thing to something like the following, which eliminates a branch: xorl $1, %edi cmpl $4, %edi @@ -1897,44 +1844,6 @@ we remove checking in code like //===---------------------------------------------------------------------===// -This code (from Benchmarks/Dhrystone/dry.c): - -define i32 @Func1(i32, i32) nounwind readnone optsize ssp { -entry: - %sext = shl i32 %0, 24 - %conv = ashr i32 %sext, 24 - %sext6 = shl i32 %1, 24 - %conv4 = ashr i32 %sext6, 24 - %cmp = icmp eq i32 %conv, %conv4 - %. = select i1 %cmp, i32 10000, i32 0 - ret i32 %. -} - -Should be simplified into something like: - -define i32 @Func1(i32, i32) nounwind readnone optsize ssp { -entry: - %sext = shl i32 %0, 24 - %conv = and i32 %sext, 0xFF000000 - %sext6 = shl i32 %1, 24 - %conv4 = and i32 %sext6, 0xFF000000 - %cmp = icmp eq i32 %conv, %conv4 - %. = select i1 %cmp, i32 10000, i32 0 - ret i32 %. -} - -and then to: - -define i32 @Func1(i32, i32) nounwind readnone optsize ssp { -entry: - %conv = and i32 %0, 0xFF - %conv4 = and i32 %1, 0xFF - %cmp = icmp eq i32 %conv, %conv4 - %. = select i1 %cmp, i32 10000, i32 0 - ret i32 %.
-} -//===---------------------------------------------------------------------===// - clang -O3 currently compiles this code int g(unsigned int a) { @@ -2259,52 +2168,6 @@ icmp transform. //===---------------------------------------------------------------------===// -We should optimize this: - - %tmp = load i16* %arrayidx, align 4, !tbaa !0 - %A = trunc i16 %tmp to i8 - %cmp = icmp eq i8 %A, 127 - %B.mask = and i16 %tmp, -256 - %cmp7 = icmp eq i16 %B.mask, 17664 - %or.cond = and i1 %cmp, %cmp7 - br i1 %or.cond, label %land.lhs.true9, label %if.end - -into: - - %tmp = load i16* %arrayidx, align 4, !tbaa !0 - %0 = icmp eq i16 %tmp, 17791 - br i1 %0, label %land.lhs.true9, label %if.end - -with this patch: -Index: InstCombine/InstCombineCompares.cpp -=================================================================== ---- InstCombine/InstCombineCompares.cpp (revision 129500) -+++ InstCombine/InstCombineCompares.cpp (working copy) -@@ -2506,6 +2506,18 @@ - return &I; - } - } -+ -+ // Transform "icmp eq (trunc X), cst" to "icmp (and X, mask), cst" -+ if (Op0->hasOneUse() && match(Op0, m_Trunc(m_Value(A))) && -+ isa(Op1)) { -+ APInt MaskV = APInt::getLowBitsSet(A->getType()->getPrimitiveSizeInBits(), -+ Op0->getType()->getPrimitiveSizeInBits()); -+ Value *Mask = -+ Builder->CreateAnd(A, ConstantInt::get(A->getContext(), MaskV)); -+ return new ICmpInst(I.getPredicate(), Mask, -+ ConstantExpr::getZExt(cast(Op1), -+ Mask->getType())); -+ } - } - - { - - -Not having this is blocking resolving PR6627. - -//===---------------------------------------------------------------------===// - This code: typedef struct { @@ -2351,4 +2214,66 @@ The two or/and's should be merged into one each. //===---------------------------------------------------------------------===// +Machine level code hoisting can be useful in some cases. 
For example, PR9408 +is about: + +typedef union { + void (*f1)(int); + void (*f2)(long); +} funcs; + +void foo(funcs f, int which) { + int a = 5; + if (which) { + f.f1(a); + } else { + f.f2(a); + } +} + +which we compile to: + +foo: # @foo +# BB#0: # %entry + pushq %rbp + movq %rsp, %rbp + testl %esi, %esi + movq %rdi, %rax + je .LBB0_2 +# BB#1: # %if.then + movl $5, %edi + callq *%rax + popq %rbp + ret +.LBB0_2: # %if.else + movl $5, %edi + callq *%rax + popq %rbp + ret + +Note that bb1 and bb2 are the same. This doesn't happen at the IR level +because one call is passing an i32 and the other is passing an i64. + +//===---------------------------------------------------------------------===// +I see this sort of pattern in 176.gcc in a few places (e.g. the start of +store_bit_field). The rem should be replaced with a multiply and subtract: + + %3 = sdiv i32 %A, %B + %4 = srem i32 %A, %B + +Similarly for udiv/urem. Note that this shouldn't be done on X86 or ARM, +which can do this in a single operation (instruction or libcall). It is +probably best to do this in the code generator. + +//===---------------------------------------------------------------------===// + +unsigned foo(unsigned x, unsigned y) { return (x & y) == 0 || x == 0; } +should fold to (x & y) == 0. + +//===---------------------------------------------------------------------===// + +unsigned foo(unsigned x, unsigned y) { return x > y && x != 0; } +should fold to x > y. + +//===---------------------------------------------------------------------===//