X-Git-Url: http://demsky.eecs.uci.edu/git/?a=blobdiff_plain;f=lib%2FTarget%2FREADME.txt;h=1f69ffb09c0a4a04298f9e84b91acd2b3c06b1ec;hb=0fe9a92b33269d005253a2ed47d55dba48929c48;hp=7d90795aa3b062438af920787dcb478c945eba4b;hpb=27a2a13b6275e19c086a263e475a8ac6d017005f;p=oota-llvm.git diff --git a/lib/Target/README.txt b/lib/Target/README.txt index 7d90795aa3b..1f69ffb09c0 100644 --- a/lib/Target/README.txt +++ b/lib/Target/README.txt @@ -2,29 +2,6 @@ Target Independent Opportunities: //===---------------------------------------------------------------------===// -Dead argument elimination should be enhanced to handle cases when an argument is -dead to an externally visible function. Though the argument can't be removed -from the externally visible function, the caller doesn't need to pass it in. -For example in this testcase: - - void foo(int X) __attribute__((noinline)); - void foo(int X) { sideeffect(); } - void bar(int A) { foo(A+1); } - -We compile bar to: - -define void @bar(i32 %A) nounwind ssp { - %0 = add nsw i32 %A, 1 ; [#uses=1] - tail call void @foo(i32 %0) nounwind noinline ssp - ret void -} - -The add is dead, we could pass in 'i32 undef' instead. This occurs for C++ -templates etc, which usually have linkonce_odr/weak_odr linkage, not internal -linkage. - -//===---------------------------------------------------------------------===// - With the recent changes to make the implicit def/use set explicit in machineinstrs, we should change the target descriptions for 'call' instructions so that the .td files don't list all the call-clobbered registers as implicit @@ -277,6 +254,20 @@ unsigned long reverse(unsigned v) { //===---------------------------------------------------------------------===// +[LOOP DELETION] + +We don't delete this output free loop, because trip count analysis doesn't +realize that it is finite (if it were infinite, it would be undefined). Not +having this blocks Loop Idiom from matching strlen and friends. + +void foo(char *C) { + int x = 0; + while (*C) + ++x,++C; +} + +//===---------------------------------------------------------------------===// + [LOOP RECOGNITION] These idioms should be recognized as popcount (see PR1488): @@ -310,6 +301,16 @@ unsigned int popcount(unsigned int input) { return count; } +This should be recognized as CLZ: rdar://8459039 + +unsigned clz_a(unsigned a) { + int i; + for (i=0;i<32;i++) + if (a & (1<<(31-i))) + return i; + return 32; +} + This sort of thing should be added to the loop idiom pass. //===---------------------------------------------------------------------===// @@ -391,34 +392,6 @@ PHI Slicing could be extended to do this. //===---------------------------------------------------------------------===// -LSR should know what GPR types a target has from TargetData. This code: - -volatile short X, Y; // globals - -void foo(int N) { - int i; - for (i = 0; i < N; i++) { X = i; Y = i*4; } -} - -produces two near identical IV's (after promotion) on PPC/ARM: - -LBB1_2: - ldr r3, LCPI1_0 - ldr r3, [r3] - strh r2, [r3] - ldr r3, LCPI1_1 - ldr r3, [r3] - strh r1, [r3] - add r1, r1, #4 - add r2, r2, #1 <- [0,+,1] - sub r0, r0, #1 <- [0,-,1] - cmp r0, #0 - bne LBB1_2 - -LSR should reuse the "+" IV for the exit test. - -//===---------------------------------------------------------------------===// - Tail call elim should be more aggressive, checking to see if the call is followed by an uncond branch to an exit block. @@ -756,18 +729,6 @@ codegen badness or something else (haven't investigated). //===---------------------------------------------------------------------===// -We miss some instcombines for stuff like this: -void bar (void); -void foo (unsigned int a) { - /* This one is equivalent to a >= (3 << 2). */ - if ((a >> 2) >= 3) - bar (); -} - -A few other related ones are in GCC PR14753. - -//===---------------------------------------------------------------------===// - Divisibility by constant can be simplified (according to GCC PR12849) from being a mulhi to being a mul lo (cheaper). Testcase: @@ -908,6 +869,7 @@ rshift_gt (unsigned int a) if ((a >> 2) > 5) bar (); } + All should simplify to a single comparison. All of these are currently not optimized with "clang -emit-llvm-bc | opt -std-compile-opts". @@ -1285,26 +1247,6 @@ SingleSource/Benchmarks/Misc/dt.c //===---------------------------------------------------------------------===// -A/B get pinned to the stack because we turn an if/then into a select instead -of PRE'ing the load/store. This may be fixable in instcombine: -http://gcc.gnu.org/bugzilla/show_bug.cgi?id=37892 - -struct X { int i; }; -int foo (int x) { - struct X a; - struct X b; - struct X *p; - a.i = 1; - b.i = 2; - if (x) - p = &a; - else - p = &b; - return p->i; -} - -//===---------------------------------------------------------------------===// - Interesting missed case because of control flow flattening (should be 2 loads): http://gcc.gnu.org/bugzilla/show_bug.cgi?id=26629 With: llvm-gcc t2.c -S -o - -O0 -emit-llvm | llvm-as | @@ -1350,6 +1292,21 @@ codegen. //===---------------------------------------------------------------------===// +simplifylibcalls should turn these snprintf idioms into memcpy (GCC PR47917) + +char buf1[6], buf2[6], buf3[4], buf4[4]; +int i; + +int foo (void) { + int ret = snprintf (buf1, sizeof buf1, "abcde"); + ret += snprintf (buf2, sizeof buf2, "abcdef") * 16; + ret += snprintf (buf3, sizeof buf3, "%s", i++ < 6 ? "abc" : "def") * 256; + ret += snprintf (buf4, sizeof buf4, "%s", i++ > 10 ? "abcde" : "defgh")*4096; + return ret; +} + +//===---------------------------------------------------------------------===// + "gas" uses this idiom: else if (strchr ("+-/*%|&^:[]()~", *intel_parser.op_string)) .. @@ -1627,21 +1584,6 @@ int bar() { return foo("abcd"); } //===---------------------------------------------------------------------===// -InstCombine should use SimplifyDemandedBits to remove the or instruction: - -define i1 @test(i8 %x, i8 %y) { - %A = or i8 %x, 1 - %B = icmp ugt i8 %A, 3 - ret i1 %B -} - -Currently instcombine calls SimplifyDemandedBits with either all bits or just -the sign bit, if the comparison is obviously a sign test. In this case, we only -need all but the bottom two bits from %A, and if we gave that mask to SDB it -would delete the or instruction for us. - -//===---------------------------------------------------------------------===// - functionattrs doesn't know much about memcpy/memset. This function should be marked readnone rather than readonly, since it only twiddles local memory, but functionattrs doesn't handle memset/memcpy/memmove aggressively: @@ -1820,51 +1762,6 @@ case it choses instead to keep the max operation obvious. //===---------------------------------------------------------------------===// -Take the following testcase on x86-64 (similar testcases exist for all targets -with addc/adde): - -define void @a(i64* nocapture %s, i64* nocapture %t, i64 %a, i64 %b, -i64 %c) nounwind { -entry: - %0 = zext i64 %a to i128 ; [#uses=1] - %1 = zext i64 %b to i128 ; [#uses=1] - %2 = add i128 %1, %0 ; [#uses=2] - %3 = zext i64 %c to i128 ; [#uses=1] - %4 = shl i128 %3, 64 ; [#uses=1] - %5 = add i128 %4, %2 ; [#uses=1] - %6 = lshr i128 %5, 64 ; [#uses=1] - %7 = trunc i128 %6 to i64 ; [#uses=1] - store i64 %7, i64* %s, align 8 - %8 = trunc i128 %2 to i64 ; [#uses=1] - store i64 %8, i64* %t, align 8 - ret void -} - -Generated code: - addq %rcx, %rdx - movl $0, %eax - adcq $0, %rax - addq %r8, %rax - movq %rax, (%rdi) - movq %rdx, (%rsi) - ret - -Expected code: - addq %rcx, %rdx - adcq $0, %r8 - movq %r8, (%rdi) - movq %rdx, (%rsi) - ret - -The generated SelectionDAG has an ADD of an ADDE, where both operands of the -ADDE are zero. Replacing one of the operands of the ADDE with the other operand -of the ADD, and replacing the ADD with the ADDE, should give the desired result. - -(That said, we are doing a lot better than gcc on this testcase. :) ) - -//===---------------------------------------------------------------------===// - -Switch lowering generates less than ideal code for the following switch: define void @a(i32 %x) nounwind { entry: switch i32 %x, label %if.end [ @@ -1885,19 +1782,15 @@ declare void @foo() Generated code on x86-64 (other platforms give similar results): a: cmpl $5, %edi - ja .LBB0_2 - movl %edi, %eax - movl $47, %ecx - btq %rax, %rcx - jb .LBB0_3 + ja LBB2_2 + cmpl $4, %edi + jne LBB2_3 .LBB0_2: ret .LBB0_3: jmp foo # TAILCALL -The movl+movl+btq+jb could be simplified to a cmpl+jne. - -Or, if we wanted to be really clever, we could simplify the whole thing to +If we wanted to be really clever, we could simplify the whole thing to something like the following, which eliminates a branch: xorl $1, %edi cmpl $4, %edi @@ -1905,43 +1798,6 @@ something like the following, which eliminates a branch: ret .LBB0_2: jmp foo # TAILCALL -//===---------------------------------------------------------------------===// -Given a branch where the two target blocks are identical ("ret i32 %b" in -both), simplifycfg will simplify them away. But not so for a switch statement: - -define i32 @f(i32 %a, i32 %b) nounwind readnone { -entry: - switch i32 %a, label %bb3 [ - i32 4, label %bb - i32 6, label %bb - ] - -bb: ; preds = %entry, %entry - ret i32 %b - -bb3: ; preds = %entry - ret i32 %b -} -//===---------------------------------------------------------------------===// - -clang -O3 fails to devirtualize this virtual inheritance case: (GCC PR45875) -Looks related to PR3100 - -struct c1 {}; -struct c10 : c1{ - virtual void foo (); -}; -struct c11 : c10, c1{ - virtual void f6 (); -}; -struct c28 : virtual c11{ - void f6 (); -}; -void check_c28 () { - c28 obj; - c11 *ptr = &obj; - ptr->f6 (); -} //===---------------------------------------------------------------------===// @@ -2109,33 +1965,13 @@ aggressively as malloc though. //===---------------------------------------------------------------------===// -clang -03 currently compiles this code +clang -O3 doesn't optimize this: void f1(int* begin, int* end) { std::fill(begin, end, 0); } -into - -define void @_Z2f1PiS_(i32* %begin, i32* %end) nounwind { -entry: - %cmp7.i.i = icmp eq i32* %begin, %end - br i1 %cmp7.i.i, label %_ZSt4fillIPiiEvT_S1_RKT0_.exit, label %for.body.i.i - -for.body.i.i: ; preds = %entry, %for.body.i.i - %indvar.i.i = phi i64 [ %tmp, %for.body.i.i ], [ 0, %entry ] - %tmp = add i64 %indvar.i.i, 1 - %ptrincdec.i.i = getelementptr i32* %begin, i64 %tmp - %__first.addr.08.i.i = getelementptr i32* %begin, i64 %indvar.i.i - store i32 0, i32* %__first.addr.08.i.i, align 4, !tbaa !0 - %cmp.i.i = icmp eq i32* %ptrincdec.i.i, %end - br i1 %cmp.i.i, label %_ZSt4fillIPiiEvT_S1_RKT0_.exit, label %for.body.i.i - -_ZSt4fillIPiiEvT_S1_RKT0_.exit: ; preds = %for.body.i.i, %entry - ret void -} - -It should compile it to a memset. +into a memset. This is PR8942. //===---------------------------------------------------------------------===// @@ -2190,43 +2026,335 @@ _ZNSt12_Vector_baseIiSaIiEEC2EmRKS0_.exit.i.i: ; preds = %cond.true.i.i.i.i br label %_ZNSt6vectorIiSaIiEEC1EmRKiRKS0_.exit This is just the handling the construction of the vector. Most surprising here -is the fact that all three null stores in %entry are dead, but not eliminated. +is the fact that all three null stores in %entry are dead (because we do no +cross-block DSE). + Also surprising is that %conv isn't simplified to 0 in %....exit.thread.i.i. +This is a because the client of LazyValueInfo doesn't simplify all instruction +operands, just selected ones. //===---------------------------------------------------------------------===// clang -O3 -fno-exceptions currently compiles this code: -void f(int N) { - std::vector v(N); - for (int k = 0; k < N; ++k) - v[k] = 0; +void f(char* a, int n) { + __builtin_memset(a, 0, n); + for (int i = 0; i < n; ++i) + a[i] = 0; +} + +into: + +define void @_Z1fPci(i8* nocapture %a, i32 %n) nounwind { +entry: + %conv = sext i32 %n to i64 + tail call void @llvm.memset.p0i8.i64(i8* %a, i8 0, i64 %conv, i32 1, i1 false) + %cmp8 = icmp sgt i32 %n, 0 + br i1 %cmp8, label %for.body.lr.ph, label %for.end + +for.body.lr.ph: ; preds = %entry + %tmp10 = add i32 %n, -1 + %tmp11 = zext i32 %tmp10 to i64 + %tmp12 = add i64 %tmp11, 1 + call void @llvm.memset.p0i8.i64(i8* %a, i8 0, i64 %tmp12, i32 1, i1 false) + ret void + +for.end: ; preds = %entry + ret void +} + +This shouldn't need the ((zext (%n - 1)) + 1) game, and it should ideally fold +the two memset's together. + +The issue with the addition only occurs in 64-bit mode, and appears to be at +least partially caused by Scalar Evolution not keeping its cache updated: it +returns the "wrong" result immediately after indvars runs, but figures out the +expected result if it is run from scratch on IR resulting from running indvars. + +//===---------------------------------------------------------------------===// +clang -O3 -fno-exceptions currently compiles this code: + +struct S { + unsigned short m1, m2; + unsigned char m3, m4; +}; + +void f(int N) { + std::vector v(N); extern void sink(void*); sink(&v); } -into almost the same as the previous note, but replace its final BB with: +into poor code for zero-initializing 'v' when N is >0. The problem is that +S is only 6 bytes, but each element is 8 byte-aligned. We generate a loop and +4 stores on each iteration. If the struct were 8 bytes, this gets turned into +a memset. -for.body.lr.ph: ; preds = %cond.true.i.i.i.i - %mul.i.i.i.i.i = shl i64 %conv, 2 - %call3.i.i.i.i.i = call noalias i8* @_Znwm(i64 %mul.i.i.i.i.i) nounwind - %0 = bitcast i8* %call3.i.i.i.i.i to i32* - store i32* %0, i32** %v8.sub, align 8, !tbaa !0 - %add.ptr.i.i.i = getelementptr inbounds i32* %0, i64 %conv - store i32* %add.ptr.i.i.i, i32** %tmp4.i.i.i.i.i, align 8, !tbaa !0 - call void @llvm.memset.p0i8.i64(i8* %call3.i.i.i.i.i, i8 0, i64 %mul.i.i.i.i.i, i32 4, i1 false) - store i32* %add.ptr.i.i.i, i32** %tmp3.i.i.i.i.i, align 8, !tbaa !0 - %tmp18 = add i32 %N, -1 - %tmp19 = zext i32 %tmp18 to i64 - %tmp20 = shl i64 %tmp19, 2 - %tmp21 = add i64 %tmp20, 4 - call void @llvm.memset.p0i8.i64(i8* %call3.i.i.i.i.i, i8 0, i64 %tmp21, i32 4, i1 false) - br label %for.end - -First off, why (((zext %N - 1) << 2) + 4) instead of the ((sext %N) << 2) done -previously? (or better yet, re-use that one?) - -Then, the really painful one is the second memset, of the same memory, to the -same value. +In order to handle this we have to: + A) Teach clang to generate metadata for memsets of structs that have holes in + them. + B) Teach clang to use such a memset for zero init of this struct (since it has + a hole), instead of doing elementwise zeroing. + +//===---------------------------------------------------------------------===// + +clang -O3 currently compiles this code: + +extern const int magic; +double f() { return 0.0 * magic; } + +into + +@magic = external constant i32 + +define double @_Z1fv() nounwind readnone { +entry: + %tmp = load i32* @magic, align 4, !tbaa !0 + %conv = sitofp i32 %tmp to double + %mul = fmul double %conv, 0.000000e+00 + ret double %mul +} + +We should be able to fold away this fmul to 0.0. More generally, fmul(x,0.0) +can be folded to 0.0 if we can prove that the LHS is not -0.0, not a NaN, and +not an INF. The CannotBeNegativeZero predicate in value tracking should be +extended to support general "fpclassify" operations that can return +yes/no/unknown for each of these predicates. + +In this predicate, we know that uitofp is trivially never NaN or -0.0, and +we know that it isn't +/-Inf if the floating point type has enough exponent bits +to represent the largest integer value as < inf. + +//===---------------------------------------------------------------------===// + +When optimizing a transformation that can change the sign of 0.0 (such as the +0.0*val -> 0.0 transformation above), it might be provable that the sign of the +expression doesn't matter. For example, by the above rules, we can't transform +fmul(sitofp(x), 0.0) into 0.0, because x might be -1 and the result of the +expression is defined to be -0.0. + +If we look at the uses of the fmul for example, we might be able to prove that +all uses don't care about the sign of zero. For example, if we have: + + fadd(fmul(sitofp(x), 0.0), 2.0) + +Since we know that x+2.0 doesn't care about the sign of any zeros in X, we can +transform the fmul to 0.0, and then the fadd to 2.0. + +//===---------------------------------------------------------------------===// + +We should enhance memcpy/memcpy/memset to allow a metadata node on them +indicating that some bytes of the transfer are undefined. This is useful for +frontends like clang when lowering struct copies, when some elements of the +struct are undefined. Consider something like this: + +struct x { + char a; + int b[4]; +}; +void foo(struct x*P); +struct x testfunc() { + struct x V1, V2; + foo(&V1); + V2 = V1; + + return V2; +} + +We currently compile this to: +$ clang t.c -S -o - -O0 -emit-llvm | opt -scalarrepl -S + + +%struct.x = type { i8, [4 x i32] } + +define void @testfunc(%struct.x* sret %agg.result) nounwind ssp { +entry: + %V1 = alloca %struct.x, align 4 + call void @foo(%struct.x* %V1) + %tmp1 = bitcast %struct.x* %V1 to i8* + %0 = bitcast %struct.x* %V1 to i160* + %srcval1 = load i160* %0, align 4 + %tmp2 = bitcast %struct.x* %agg.result to i8* + %1 = bitcast %struct.x* %agg.result to i160* + store i160 %srcval1, i160* %1, align 4 + ret void +} + +This happens because SRoA sees that the temp alloca has is being memcpy'd into +and out of and it has holes and it has to be conservative. If we knew about the +holes, then this could be much much better. + +Having information about these holes would also improve memcpy (etc) lowering at +llc time when it gets inlined, because we can use smaller transfers. This also +avoids partial register stalls in some important cases. + +//===---------------------------------------------------------------------===// + +We don't fold (icmp (add) (add)) unless the two adds only have a single use. +There are a lot of cases that we're refusing to fold in (e.g.) 256.bzip2, for +example: + + %indvar.next90 = add i64 %indvar89, 1 ;; Has 2 uses + %tmp96 = add i64 %tmp95, 1 ;; Has 1 use + %exitcond97 = icmp eq i64 %indvar.next90, %tmp96 + +We don't fold this because we don't want to introduce an overlapped live range +of the ivar. However if we can make this more aggressive without causing +performance issues in two ways: + +1. If *either* the LHS or RHS has a single use, we can definitely do the + transformation. In the overlapping liverange case we're trading one register + use for one fewer operation, which is a reasonable trade. Before doing this + we should verify that the llc output actually shrinks for some benchmarks. +2. If both ops have multiple uses, we can still fold it if the operations are + both sinkable to *after* the icmp (e.g. in a subsequent block) which doesn't + increase register pressure. + +There are a ton of icmp's we aren't simplifying because of the reg pressure +concern. Care is warranted here though because many of these are induction +variables and other cases that matter a lot to performance, like the above. +Here's a blob of code that you can drop into the bottom of visitICmp to see some +missed cases: + + { Value *A, *B, *C, *D; + if (match(Op0, m_Add(m_Value(A), m_Value(B))) && + match(Op1, m_Add(m_Value(C), m_Value(D))) && + (A == C || A == D || B == C || B == D)) { + errs() << "OP0 = " << *Op0 << " U=" << Op0->getNumUses() << "\n"; + errs() << "OP1 = " << *Op1 << " U=" << Op1->getNumUses() << "\n"; + errs() << "CMP = " << I << "\n\n"; + } + } + +//===---------------------------------------------------------------------===// + +define i1 @test1(i32 %x) nounwind { + %and = and i32 %x, 3 + %cmp = icmp ult i32 %and, 2 + ret i1 %cmp +} + +Can be folded to (x & 2) == 0. + +define i1 @test2(i32 %x) nounwind { + %and = and i32 %x, 3 + %cmp = icmp ugt i32 %and, 1 + ret i1 %cmp +} + +Can be folded to (x & 2) != 0. + +SimplifyDemandedBits shrinks the "and" constant to 2 but instcombine misses the +icmp transform. + +//===---------------------------------------------------------------------===// + +This code: + +typedef struct { +int f1:1; +int f2:1; +int f3:1; +int f4:29; +} t1; + +typedef struct { +int f1:1; +int f2:1; +int f3:30; +} t2; + +t1 s1; +t2 s2; + +void func1(void) +{ +s1.f1 = s2.f1; +s1.f2 = s2.f2; +} + +Compiles into this IR (on x86-64 at least): + +%struct.t1 = type { i8, [3 x i8] } +@s2 = global %struct.t1 zeroinitializer, align 4 +@s1 = global %struct.t1 zeroinitializer, align 4 +define void @func1() nounwind ssp noredzone { +entry: + %0 = load i32* bitcast (%struct.t1* @s2 to i32*), align 4 + %bf.val.sext5 = and i32 %0, 1 + %1 = load i32* bitcast (%struct.t1* @s1 to i32*), align 4 + %2 = and i32 %1, -4 + %3 = or i32 %2, %bf.val.sext5 + %bf.val.sext26 = and i32 %0, 2 + %4 = or i32 %3, %bf.val.sext26 + store i32 %4, i32* bitcast (%struct.t1* @s1 to i32*), align 4 + ret void +} + +The two or/and's should be merged into one each. + +//===---------------------------------------------------------------------===// + +Machine level code hoisting can be useful in some cases. For example, PR9408 +is about: + +typedef union { + void (*f1)(int); + void (*f2)(long); +} funcs; + +void foo(funcs f, int which) { + int a = 5; + if (which) { + f.f1(a); + } else { + f.f2(a); + } +} + +which we compile to: + +foo: # @foo +# BB#0: # %entry + pushq %rbp + movq %rsp, %rbp + testl %esi, %esi + movq %rdi, %rax + je .LBB0_2 +# BB#1: # %if.then + movl $5, %edi + callq *%rax + popq %rbp + ret +.LBB0_2: # %if.else + movl $5, %edi + callq *%rax + popq %rbp + ret + +Note that bb1 and bb2 are the same. This doesn't happen at the IR level +because one call is passing an i32 and the other is passing an i64. + +//===---------------------------------------------------------------------===// + +I see this sort of pattern in 176.gcc in a few places (e.g. the start of +store_bit_field). The rem should be replaced with a multiply and subtract: + + %3 = sdiv i32 %A, %B + %4 = srem i32 %A, %B + +Similarly for udiv/urem. Note that this shouldn't be done on X86 or ARM, +which can do this in a single operation (instruction or libcall). It is +probably best to do this in the code generator. + +//===---------------------------------------------------------------------===// + +unsigned foo(unsigned x, unsigned y) { return (x & y) == 0 || x == 0; } +should fold to (x & y) == 0. + +//===---------------------------------------------------------------------===// + +unsigned foo(unsigned x, unsigned y) { return x > y && x != 0; } +should fold to x > y. //===---------------------------------------------------------------------===//