X-Git-Url: http://demsky.eecs.uci.edu/git/?a=blobdiff_plain;f=lib%2FTarget%2FREADME.txt;h=1f69ffb09c0a4a04298f9e84b91acd2b3c06b1ec;hb=ac027144e8e22563c9bb057598c710aac57c072f;hp=7362926875133831ea3364be1ea8e85e3ec9dacb;hpb=fd51e95171bd945f722dacc2b85a7866ad617242;p=oota-llvm.git diff --git a/lib/Target/README.txt b/lib/Target/README.txt index 73629268751..1f69ffb09c0 100644 --- a/lib/Target/README.txt +++ b/lib/Target/README.txt @@ -254,6 +254,20 @@ unsigned long reverse(unsigned v) { //===---------------------------------------------------------------------===// +[LOOP DELETION] + +We don't delete this output free loop, because trip count analysis doesn't +realize that it is finite (if it were infinite, it would be undefined). Not +having this blocks Loop Idiom from matching strlen and friends. + +void foo(char *C) { + int x = 0; + while (*C) + ++x,++C; +} + +//===---------------------------------------------------------------------===// + [LOOP RECOGNITION] These idioms should be recognized as popcount (see PR1488): @@ -287,6 +301,16 @@ unsigned int popcount(unsigned int input) { return count; } +This should be recognized as CLZ: rdar://8459039 + +unsigned clz_a(unsigned a) { + int i; + for (i=0;i<32;i++) + if (a & (1<<(31-i))) + return i; + return 32; +} + This sort of thing should be added to the loop idiom pass. //===---------------------------------------------------------------------===// @@ -368,34 +392,6 @@ PHI Slicing could be extended to do this. //===---------------------------------------------------------------------===// -LSR should know what GPR types a target has from TargetData. This code: - -volatile short X, Y; // globals - -void foo(int N) { - int i; - for (i = 0; i < N; i++) { X = i; Y = i*4; } -} - -produces two near identical IV's (after promotion) on PPC/ARM: - -LBB1_2: - ldr r3, LCPI1_0 - ldr r3, [r3] - strh r2, [r3] - ldr r3, LCPI1_1 - ldr r3, [r3] - strh r1, [r3] - add r1, r1, #4 - add r2, r2, #1 <- [0,+,1] - sub r0, r0, #1 <- [0,-,1] - cmp r0, #0 - bne LBB1_2 - -LSR should reuse the "+" IV for the exit test. - -//===---------------------------------------------------------------------===// - Tail call elim should be more aggressive, checking to see if the call is followed by an uncond branch to an exit block. @@ -733,18 +729,6 @@ codegen badness or something else (haven't investigated). //===---------------------------------------------------------------------===// -We miss some instcombines for stuff like this: -void bar (void); -void foo (unsigned int a) { - /* This one is equivalent to a >= (3 << 2). */ - if ((a >> 2) >= 3) - bar (); -} - -A few other related ones are in GCC PR14753. - -//===---------------------------------------------------------------------===// - Divisibility by constant can be simplified (according to GCC PR12849) from being a mulhi to being a mul lo (cheaper). Testcase: @@ -885,6 +869,7 @@ rshift_gt (unsigned int a) if ((a >> 2) > 5) bar (); } + All should simplify to a single comparison. All of these are currently not optimized with "clang -emit-llvm-bc | opt -std-compile-opts". @@ -1307,6 +1292,21 @@ codegen. //===---------------------------------------------------------------------===// +simplifylibcalls should turn these snprintf idioms into memcpy (GCC PR47917) + +char buf1[6], buf2[6], buf3[4], buf4[4]; +int i; + +int foo (void) { + int ret = snprintf (buf1, sizeof buf1, "abcde"); + ret += snprintf (buf2, sizeof buf2, "abcdef") * 16; + ret += snprintf (buf3, sizeof buf3, "%s", i++ < 6 ? "abc" : "def") * 256; + ret += snprintf (buf4, sizeof buf4, "%s", i++ > 10 ? "abcde" : "defgh")*4096; + return ret; +} + +//===---------------------------------------------------------------------===// + "gas" uses this idiom: else if (strchr ("+-/*%|&^:[]()~", *intel_parser.op_string)) .. @@ -1762,44 +1762,6 @@ case it choses instead to keep the max operation obvious. //===---------------------------------------------------------------------===// -Take the following testcase on x86-64 (similar testcases exist for all targets -with addc/adde): - -define void @a(i64* nocapture %s, i64* nocapture %t, i64 %a, i64 %b, -i64 %c) nounwind { -entry: - %0 = zext i64 %a to i128 ; [#uses=1] - %1 = zext i64 %b to i128 ; [#uses=1] - %2 = add i128 %1, %0 ; [#uses=2] - %3 = zext i64 %c to i128 ; [#uses=1] - %4 = shl i128 %3, 64 ; [#uses=1] - %5 = add i128 %4, %2 ; [#uses=1] - %6 = lshr i128 %5, 64 ; [#uses=1] - %7 = trunc i128 %6 to i64 ; [#uses=1] - store i64 %7, i64* %s, align 8 - %8 = trunc i128 %2 to i64 ; [#uses=1] - store i64 %8, i64* %t, align 8 - ret void -} - -Generated code: - addq %rcx, %rdx - sbbq %rax, %rax - subq %rax, %r8 - movq %r8, (%rdi) - movq %rdx, (%rsi) - ret - -Expected code: - addq %rcx, %rdx - adcq $0, %r8 - movq %r8, (%rdi) - movq %rdx, (%rsi) - ret - -//===---------------------------------------------------------------------===// - -Switch lowering generates less than ideal code for the following switch: define void @a(i32 %x) nounwind { entry: switch i32 %x, label %if.end [ @@ -1820,19 +1782,15 @@ declare void @foo() Generated code on x86-64 (other platforms give similar results): a: cmpl $5, %edi - ja .LBB0_2 - movl %edi, %eax - movl $47, %ecx - btq %rax, %rcx - jb .LBB0_3 + ja LBB2_2 + cmpl $4, %edi + jne LBB2_3 .LBB0_2: ret .LBB0_3: jmp foo # TAILCALL -The movl+movl+btq+jb could be simplified to a cmpl+jne. - -Or, if we wanted to be really clever, we could simplify the whole thing to +If we wanted to be really clever, we could simplify the whole thing to something like the following, which eliminates a branch: xorl $1, %edi cmpl $4, %edi @@ -2106,11 +2064,12 @@ for.end: ; preds = %entry } This shouldn't need the ((zext (%n - 1)) + 1) game, and it should ideally fold -the two memset's together. The issue with %n seems to stem from poor handling -of the original loop. +the two memset's together. -To simplify this, we need SCEV to know that "n != 0" because of the dominating -conditional. That would turn the second memset into a simple memset of 'n'. +The issue with the addition only occurs in 64-bit mode, and appears to be at +least partially caused by Scalar Evolution not keeping its cache updated: it +returns the "wrong" result immediately after indvars runs, but figures out the +expected result if it is run from scratch on IR resulting from running indvars. //===---------------------------------------------------------------------===// @@ -2230,3 +2189,172 @@ llc time when it gets inlined, because we can use smaller transfers. This also avoids partial register stalls in some important cases. //===---------------------------------------------------------------------===// + +We don't fold (icmp (add) (add)) unless the two adds only have a single use. +There are a lot of cases that we're refusing to fold in (e.g.) 256.bzip2, for +example: + + %indvar.next90 = add i64 %indvar89, 1 ;; Has 2 uses + %tmp96 = add i64 %tmp95, 1 ;; Has 1 use + %exitcond97 = icmp eq i64 %indvar.next90, %tmp96 + +We don't fold this because we don't want to introduce an overlapped live range +of the ivar. However if we can make this more aggressive without causing +performance issues in two ways: + +1. If *either* the LHS or RHS has a single use, we can definitely do the + transformation. In the overlapping liverange case we're trading one register + use for one fewer operation, which is a reasonable trade. Before doing this + we should verify that the llc output actually shrinks for some benchmarks. +2. If both ops have multiple uses, we can still fold it if the operations are + both sinkable to *after* the icmp (e.g. in a subsequent block) which doesn't + increase register pressure. + +There are a ton of icmp's we aren't simplifying because of the reg pressure +concern. Care is warranted here though because many of these are induction +variables and other cases that matter a lot to performance, like the above. +Here's a blob of code that you can drop into the bottom of visitICmp to see some +missed cases: + + { Value *A, *B, *C, *D; + if (match(Op0, m_Add(m_Value(A), m_Value(B))) && + match(Op1, m_Add(m_Value(C), m_Value(D))) && + (A == C || A == D || B == C || B == D)) { + errs() << "OP0 = " << *Op0 << " U=" << Op0->getNumUses() << "\n"; + errs() << "OP1 = " << *Op1 << " U=" << Op1->getNumUses() << "\n"; + errs() << "CMP = " << I << "\n\n"; + } + } + +//===---------------------------------------------------------------------===// + +define i1 @test1(i32 %x) nounwind { + %and = and i32 %x, 3 + %cmp = icmp ult i32 %and, 2 + ret i1 %cmp +} + +Can be folded to (x & 2) == 0. + +define i1 @test2(i32 %x) nounwind { + %and = and i32 %x, 3 + %cmp = icmp ugt i32 %and, 1 + ret i1 %cmp +} + +Can be folded to (x & 2) != 0. + +SimplifyDemandedBits shrinks the "and" constant to 2 but instcombine misses the +icmp transform. + +//===---------------------------------------------------------------------===// + +This code: + +typedef struct { +int f1:1; +int f2:1; +int f3:1; +int f4:29; +} t1; + +typedef struct { +int f1:1; +int f2:1; +int f3:30; +} t2; + +t1 s1; +t2 s2; + +void func1(void) +{ +s1.f1 = s2.f1; +s1.f2 = s2.f2; +} + +Compiles into this IR (on x86-64 at least): + +%struct.t1 = type { i8, [3 x i8] } +@s2 = global %struct.t1 zeroinitializer, align 4 +@s1 = global %struct.t1 zeroinitializer, align 4 +define void @func1() nounwind ssp noredzone { +entry: + %0 = load i32* bitcast (%struct.t1* @s2 to i32*), align 4 + %bf.val.sext5 = and i32 %0, 1 + %1 = load i32* bitcast (%struct.t1* @s1 to i32*), align 4 + %2 = and i32 %1, -4 + %3 = or i32 %2, %bf.val.sext5 + %bf.val.sext26 = and i32 %0, 2 + %4 = or i32 %3, %bf.val.sext26 + store i32 %4, i32* bitcast (%struct.t1* @s1 to i32*), align 4 + ret void +} + +The two or/and's should be merged into one each. + +//===---------------------------------------------------------------------===// + +Machine level code hoisting can be useful in some cases. For example, PR9408 +is about: + +typedef union { + void (*f1)(int); + void (*f2)(long); +} funcs; + +void foo(funcs f, int which) { + int a = 5; + if (which) { + f.f1(a); + } else { + f.f2(a); + } +} + +which we compile to: + +foo: # @foo +# BB#0: # %entry + pushq %rbp + movq %rsp, %rbp + testl %esi, %esi + movq %rdi, %rax + je .LBB0_2 +# BB#1: # %if.then + movl $5, %edi + callq *%rax + popq %rbp + ret +.LBB0_2: # %if.else + movl $5, %edi + callq *%rax + popq %rbp + ret + +Note that bb1 and bb2 are the same. This doesn't happen at the IR level +because one call is passing an i32 and the other is passing an i64. + +//===---------------------------------------------------------------------===// + +I see this sort of pattern in 176.gcc in a few places (e.g. the start of +store_bit_field). The rem should be replaced with a multiply and subtract: + + %3 = sdiv i32 %A, %B + %4 = srem i32 %A, %B + +Similarly for udiv/urem. Note that this shouldn't be done on X86 or ARM, +which can do this in a single operation (instruction or libcall). It is +probably best to do this in the code generator. + +//===---------------------------------------------------------------------===// + +unsigned foo(unsigned x, unsigned y) { return (x & y) == 0 || x == 0; } +should fold to (x & y) == 0. + +//===---------------------------------------------------------------------===// + +unsigned foo(unsigned x, unsigned y) { return x > y && x != 0; } +should fold to x > y. + +//===---------------------------------------------------------------------===//