diff --git a/lib/Target/README.txt b/lib/Target/README.txt
index 5032b2bbbb5..1f69ffb09c0 100644
--- a/lib/Target/README.txt
+++ b/lib/Target/README.txt
@@ -870,11 +870,6 @@ rshift_gt (unsigned int a)
     bar ();
 }
 
-void neg_eq_cst(unsigned int a) {
-if (-a == 123)
-bar();
-}
-
 All should simplify to a single comparison.  All of these are currently
 not optimized with "clang -emit-llvm-bc | opt -std-compile-opts".
 
@@ -1767,7 +1762,6 @@ case it chooses instead to keep the max operation obvious.
 
 //===---------------------------------------------------------------------===//
 
-Switch lowering generates less than ideal code for the following switch:
 define void @a(i32 %x) nounwind {
 entry:
   switch i32 %x, label %if.end [
@@ -1788,19 +1782,15 @@ declare void @foo()
 Generated code on x86-64 (other platforms give similar results):
 a:
         cmpl    $5, %edi
-        ja      .LBB0_2
-        movl    %edi, %eax
-        movl    $47, %ecx
-        btq     %rax, %rcx
-        jb      .LBB0_3
+        ja      LBB2_2
+        cmpl    $4, %edi
+        jne     LBB2_3
 .LBB0_2:
         ret
 .LBB0_3:
         jmp     foo  # TAILCALL
 
-The movl+movl+btq+jb could be simplified to a cmpl+jne.
-
-Or, if we wanted to be really clever, we could simplify the whole thing to
+If we wanted to be really clever, we could simplify the whole thing to
 something like the following, which eliminates a branch:
         xorl    $1, %edi
         cmpl    $4, %edi
@@ -2259,78 +2249,112 @@ icmp transform.
 
 //===---------------------------------------------------------------------===//
 
-These functions:
-int foo(int *X) {
-  if ((*X & 255) == 47)
-    bar();
+This code:
+
+typedef struct {
+int f1:1;
+int f2:1;
+int f3:1;
+int f4:29;
+} t1;
+
+typedef struct {
+int f1:1;
+int f2:1;
+int f3:30;
+} t2;
+
+t1 s1;
+t2 s2;
+
+void func1(void)
+{
+s1.f1 = s2.f1;
+s1.f2 = s2.f2;
 }
-int foo2(int X) {
-  if ((X & 255) == 47)
-    bar();
+
+Compiles into this IR (on x86-64 at least):
+
+%struct.t1 = type { i8, [3 x i8] }
+@s2 = global %struct.t1 zeroinitializer, align 4
+@s1 = global %struct.t1 zeroinitializer, align 4
+define void @func1() nounwind ssp noredzone {
+entry:
+  %0 = load i32* bitcast (%struct.t1* @s2 to i32*), align 4
+  %bf.val.sext5 = and i32 %0, 1
+  %1 = load i32* bitcast (%struct.t1* @s1 to i32*), align 4
+  %2 = and i32 %1, -4
+  %3 = or i32 %2, %bf.val.sext5
+  %bf.val.sext26 = and i32 %0, 2
+  %4 = or i32 %3, %bf.val.sext26
+  store i32 %4, i32* bitcast (%struct.t1* @s1 to i32*), align 4
+  ret void
 }
 
-codegen to:
+The two ors, and the two ands of %0, should each be merged into one.
 
-  movzbl  (%rdi), %eax
-  cmpl    $47, %eax
-  jne     LBB0_2
+//===---------------------------------------------------------------------===//
 
-and:
-  movzbl  %dil, %eax
-  cmpl    $47, %eax
-  jne     LBB1_2
+Machine-level code hoisting can be useful in some cases.  For example, PR9408
+is about:
 
-If a dag combine shrunk the compare to a byte compare, then we'd fold the load
-in the first example, and eliminate the movzbl in the second, saving a register.
-This can be a target independent dag combine that works on ISD::SETCC, it would
-catch this before the legalize ops pass.
 
+typedef union {
+  void (*f1)(int);
+  void (*f2)(long);
+} funcs;
 
-//===---------------------------------------------------------------------===//
 
+void foo(funcs f, int which) {
+  int a = 5;
+  if (which) {
+    f.f1(a);
+  } else {
+    f.f2(a);
+  }
+}
 
-We should optimize this:
+which we compile to:
 
-  %tmp = load i16* %arrayidx, align 4, !tbaa !0
-  %A = trunc i16 %tmp to i8
-  %cmp = icmp eq i8 %A, 127
-  %B.mask = and i16 %tmp, -256
-  %cmp7 = icmp eq i16 %B.mask, 17664
-  %or.cond = and i1 %cmp, %cmp7
-  br i1 %or.cond, label %land.lhs.true9, label %if.end
+foo:                            # @foo
+# BB#0:                         # %entry
+        pushq   %rbp
+        movq    %rsp, %rbp
+        testl   %esi, %esi
+        movq    %rdi, %rax
+        je      .LBB0_2
+# BB#1:                         # %if.then
+        movl    $5, %edi
+        callq   *%rax
+        popq    %rbp
+        ret
+.LBB0_2:                        # %if.else
+        movl    $5, %edi
+        callq   *%rax
+        popq    %rbp
+        ret
 
-into:
+Note that bb1 and bb2 are the same.  This doesn't happen at the IR level
+because one call is passing an i32 and the other is passing an i64.
 
-  %tmp = load i16* %arrayidx, align 4, !tbaa !0
-  %0 = icmp eq i16 %tmp, 17791
-  br i1 %0, label %land.lhs.true9, label %if.end
-
-with this patch:
-Index: InstCombine/InstCombineCompares.cpp
-===================================================================
---- InstCombine/InstCombineCompares.cpp (revision 129500)
-+++ InstCombine/InstCombineCompares.cpp (working copy)
-@@ -2506,6 +2506,18 @@
-         return &I;
-       }
-     }
-+
-+  // Transform "icmp eq (trunc X), cst" to "icmp (and X, mask), cst"
-+  if (Op0->hasOneUse() && match(Op0, m_Trunc(m_Value(A))) &&
-+      isa<ConstantInt>(Op1)) {
-+    APInt MaskV = APInt::getLowBitsSet(A->getType()->getPrimitiveSizeInBits(),
-+                                       Op0->getType()->getPrimitiveSizeInBits());
-+    Value *Mask =
-+      Builder->CreateAnd(A, ConstantInt::get(A->getContext(), MaskV));
-+    return new ICmpInst(I.getPredicate(), Mask,
-+                        ConstantExpr::getZExt(cast<ConstantInt>(Op1),
-+                                              Mask->getType()));
-+  }
- }
-
- {
+//===---------------------------------------------------------------------===//
+
+I see this sort of pattern in 176.gcc in a few places (e.g. the start of
+store_bit_field).  The rem should be replaced with a multiply and subtract:
+  %3 = sdiv i32 %A, %B
+  %4 = srem i32 %A, %B
 
-but we can't do that until the dag combine above is added.  Not having this
-is blocking resolving PR6627.
+Similarly for udiv/urem.  Note that this shouldn't be done on X86 or ARM,
+which can do this in a single operation (instruction or libcall).  It is
+probably best to do this in the code generator.
 
 //===---------------------------------------------------------------------===//
+
+unsigned foo(unsigned x, unsigned y) { return (x & y) == 0 || x == 0; }
+should fold to (x & y) == 0.
+
+//===---------------------------------------------------------------------===//
+
+unsigned foo(unsigned x, unsigned y) { return x > y && x != 0; }
+should fold to x > y.
+
+//===---------------------------------------------------------------------===//
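
To make the srem note above concrete, the rewrite amounts to the following
source-level transformation.  This is a minimal C sketch; the function and
variable names are illustrative, not taken from 176.gcc:

  /* When both the quotient and the remainder of the same operands are
     needed, recompute the remainder from the quotient with a multiply
     and a subtract instead of issuing a second division.  */
  int div_and_rem(int a, int b, int *rem) {
    int q = a / b;       /* corresponds to %3 = sdiv i32 %A, %B */
    *rem = a - q * b;    /* replaces %4 = srem i32 %A, %B       */
    return q;
  }

As the note says, this only pays off on targets where the division does not
already produce the remainder, which is why it is best done in the code
generator.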