X-Git-Url: http://demsky.eecs.uci.edu/git/?a=blobdiff_plain;f=lib%2FTarget%2FX86%2FREADME.txt;h=b4285a0718799c379eb2a281f159d2c1a42784ed;hb=6c8afad198688649ba7fc024bd5521d6b77a7ad5;hp=a7a477d2d5b833b5509af047cf454fd7fb865b1c;hpb=c1e4ce6044a60a0e61bed0a719ad9deec60af0eb;p=oota-llvm.git diff --git a/lib/Target/X86/README.txt b/lib/Target/X86/README.txt index a7a477d2d5b..b4285a07187 100644 --- a/lib/Target/X86/README.txt +++ b/lib/Target/X86/README.txt @@ -2,19 +2,6 @@ // Random ideas for the X86 backend. //===---------------------------------------------------------------------===// -We should add support for the "movbe" instruction, which does a byte-swapping -copy (3-addr bswap + memory support?) This is available on Atom processors. - -//===---------------------------------------------------------------------===// - -CodeGen/X86/lea-3.ll:test3 should be a single LEA, not a shift/move. The X86 -backend knows how to three-addressify this shift, but it appears the register -allocator isn't even asking it to do so in this case. We should investigate -why this isn't happening, it could have significant impact on other important -cases for X86 as well. - -//===---------------------------------------------------------------------===// - This should be one DIV/IDIV instruction, not a libcall: unsigned test(unsigned long long X, unsigned Y) { @@ -69,7 +56,7 @@ cmovs, we should expand to a conditional branch like GCC produces. Some isel ideas: -1. Dynamic programming based approach when compile time if not an +1. Dynamic programming based approach when compile time is not an issue. 2. Code duplication (addressing mode) during isel. 3. Other ideas from "Register-Sensitive Selection, Duplication, and @@ -710,23 +697,17 @@ This: { return !full_add(a, b).second; } Should compile to: + addl %esi, %edi + setae %al + movzbl %al, %eax + ret - - _Z11no_overflowjj: - addl %edi, %esi - setae %al - ret - -FIXME: That code looks wrong; bool return is normally defined as zext. 
- -on x86-64, not: - -__Z11no_overflowjj: - addl %edi, %esi - cmpl %edi, %esi - setae %al - movzbl %al, %eax - ret +on x86-64, instead of the rather stupid-looking: + addl %esi, %edi + setb %al + xorb $1, %al + movzbl %al, %eax + ret //===---------------------------------------------------------------------===// @@ -994,10 +975,10 @@ _foo: instead of: _foo: - movl $255, %eax - orl 4(%esp), %eax - andl $65535, %eax - ret + movl $65280, %eax + andl 4(%esp), %eax + orl $255, %eax + ret //===---------------------------------------------------------------------===// @@ -1236,7 +1217,7 @@ Also check why xmm7 is not used at all in the function. Take the following: -target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128" +target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-S128" target triple = "i386-apple-darwin8" @in_exit.4870.b = internal global i1 false ; [#uses=2] define fastcc void @abort_gzip() noreturn nounwind { @@ -1578,7 +1559,7 @@ Implement processor-specific optimizations for parity with GCC on these processors. GCC does two optimizations: 1. ix86_pad_returns inserts a noop before ret instructions if immediately - preceeded by a conditional branch or is the target of a jump. + preceded by a conditional branch or is the target of a jump. 2. ix86_avoid_jump_misspredicts inserts noops in cases where a 16-byte block of code contains more than 3 branches. 
@@ -1586,43 +1567,6 @@ The first one is done for all AMDs, Core2, and "Generic" The second one is done for: Atom, Pentium Pro, all AMDs, Pentium 4, Nocona, Core 2, and "Generic" -//===---------------------------------------------------------------------===// - -Testcase: -int a(int x) { return (x & 127) > 31; } - -Current output: - movl 4(%esp), %eax - andl $127, %eax - cmpl $31, %eax - seta %al - movzbl %al, %eax - ret - -Ideal output: - xorl %eax, %eax - testl $96, 4(%esp) - setne %al - ret - -This should definitely be done in instcombine, canonicalizing the range -condition into a != condition. We get this IR: - -define i32 @a(i32 %x) nounwind readnone { -entry: - %0 = and i32 %x, 127 ; [#uses=1] - %1 = icmp ugt i32 %0, 31 ; [#uses=1] - %2 = zext i1 %1 to i32 ; [#uses=1] - ret i32 %2 -} - -Instcombine prefers to strength reduce relational comparisons to equality -comparisons when possible, this should be another case of that. This could -be handled pretty easily in InstCombiner::visitICmpInstWithInstAndIntCst, but it -looks like InstCombiner::visitICmpInstWithInstAndIntCst should really already -be redesigned to use ComputeMaskedBits and friends. - - //===---------------------------------------------------------------------===// Testcase: int x(int a) { return (a&0xf0)>>4; } @@ -1662,28 +1606,61 @@ information to add the "lock" prefix. 
//===---------------------------------------------------------------------===// -_Bool bar(int *x) { return *x & 1; } +struct B { + unsigned char y0 : 1; +}; -define zeroext i1 @bar(i32* nocapture %x) nounwind readonly { -entry: - %tmp1 = load i32* %x ; [#uses=1] - %and = and i32 %tmp1, 1 ; [#uses=1] - %tobool = icmp ne i32 %and, 0 ; [#uses=1] - ret i1 %tobool +int bar(struct B* a) { return a->y0; } + +define i32 @bar(%struct.B* nocapture %a) nounwind readonly optsize { + %1 = getelementptr inbounds %struct.B* %a, i64 0, i32 0 + %2 = load i8* %1, align 1 + %3 = and i8 %2, 1 + %4 = zext i8 %3 to i32 + ret i32 %4 } -bar: # @bar -# BB#0: # %entry - movl 4(%esp), %eax - movb (%eax), %al - andb $1, %al - movzbl %al, %eax - ret +bar: # @bar +# BB#0: + movb (%rdi), %al + andb $1, %al + movzbl %al, %eax + ret Missed optimization: should be movl+andl. //===---------------------------------------------------------------------===// +The x86_64 abi says: + +Booleans, when stored in a memory object, are stored as single byte objects the +value of which is always 0 (false) or 1 (true). + +We are not using this fact: + +int bar(_Bool *a) { return *a; } + +define i32 @bar(i8* nocapture %a) nounwind readonly optsize { + %1 = load i8* %a, align 1, !tbaa !0 + %tmp = and i8 %1, 1 + %2 = zext i8 %tmp to i32 + ret i32 %2 +} + +bar: + movb (%rdi), %al + andb $1, %al + movzbl %al, %eax + ret + +GCC produces + +bar: + movzbl (%rdi), %eax + ret + +//===---------------------------------------------------------------------===// + Consider the following two functions compiled with clang: _Bool foo(int *x) { return !(*x & 4); } unsigned bar(int *x) { return !(*x & 4); } @@ -1708,26 +1685,6 @@ are functionally identical. 
//===---------------------------------------------------------------------===// -Take the following C code: -int x(int y) { return (y & 63) << 14; } - -Code produced by gcc: - andl $63, %edi - sall $14, %edi - movl %edi, %eax - ret - -Code produced by clang: - shll $14, %edi - movl %edi, %eax - andl $1032192, %eax - ret - -The code produced by gcc is 3 bytes shorter. This sort of construct often -shows up with bitfields. - -//===---------------------------------------------------------------------===// - Take the following C code: int f(int a, int b) { return (unsigned char)a == (unsigned char)b; } @@ -1885,39 +1842,202 @@ _add32carry: //===---------------------------------------------------------------------===// -This: -char t(char c) { - return c/3; +The hot loop of 256.bzip2 contains code that looks a bit like this: + +int foo(char *P, char *Q, int x, int y) { + if (P[0] != Q[0]) + return P[0] < Q[0]; + if (P[1] != Q[1]) + return P[1] < Q[1]; + if (P[2] != Q[2]) + return P[2] < Q[2]; + return P[3] < Q[3]; } -Compiles to: $clang t.c -S -o - -O3 -mkernel -fomit-frame-pointer +In the real code, we get a lot more wrong than this. However, even in this +code we generate: -_t: ## @t - movslq %edi, %rax - imulq $-1431655765, %rax, %rcx ## imm = 0xFFFFFFFFAAAAAAAB - shrq $32, %rcx - addl %ecx, %eax - movl %eax, %ecx - shrl $31, %ecx - shrl %eax - addl %ecx, %eax - movsbl %al, %eax +_foo: ## @foo +## BB#0: ## %entry + movb (%rsi), %al + movb (%rdi), %cl + cmpb %al, %cl + je LBB0_2 +LBB0_1: ## %if.then + cmpb %al, %cl + jmp LBB0_5 +LBB0_2: ## %if.end + movb 1(%rsi), %al + movb 1(%rdi), %cl + cmpb %al, %cl + jne LBB0_1 +## BB#3: ## %if.end38 + movb 2(%rsi), %al + movb 2(%rdi), %cl + cmpb %al, %cl + jne LBB0_1 +## BB#4: ## %if.end60 + movb 3(%rdi), %al + cmpb 3(%rsi), %al +LBB0_5: ## %if.end60 + setl %al + movzbl %al, %eax ret -GCC gets: +Note that we generate jumps to LBB0_1 which does a redundant compare. 
The +redundant compare also forces the register values to be live, which prevents +folding one of the loads into the compare. In contrast, GCC 4.2 produces: -_t: - movl $86, %eax - imulb %dil - shrw $8, %ax - sarb $7, %dil - subb %dil, %al - movsbl %al,%eax +_foo: + movzbl (%rsi), %eax + cmpb %al, (%rdi) + jne L10 +L12: + movzbl 1(%rsi), %eax + cmpb %al, 1(%rdi) + jne L10 + movzbl 2(%rsi), %eax + cmpb %al, 2(%rdi) + jne L10 + movzbl 3(%rdi), %eax + cmpb 3(%rsi), %al +L10: + setl %al + movzbl %al, %eax + ret + +which is "perfect". + +//===---------------------------------------------------------------------===// + +For the branch in the following code: +int a(); +int b(int x, int y) { + if (x & (1<<(y&7))) + return a(); + return y; +} + +We currently generate: + movb %sil, %al + andb $7, %al + movzbl %al, %eax + btl %eax, %edi + jae .LBB0_2 + +movl+andl would be shorter than the movb+andb+movzbl sequence. + +//===---------------------------------------------------------------------===// + +For the following: +struct u1 { + float x, y; +}; +float foo(struct u1 u) { + return u.x + u.y; +} + +We currently generate: + movdqa %xmm0, %xmm1 + pshufd $1, %xmm0, %xmm0 # xmm0 = xmm0[1,0,0,0] + addss %xmm1, %xmm0 ret -which is nicer. This also happens for int, not just char. +We could save an instruction here by commuting the addss. //===---------------------------------------------------------------------===// +This (from PR9661): +float clamp_float(float a) { + if (a > 1.0f) + return 1.0f; + else if (a < 0.0f) + return 0.0f; + else + return a; +} +Could compile to: + +clamp_float: # @clamp_float + movss .LCPI0_0(%rip), %xmm1 + minss %xmm1, %xmm0 + pxor %xmm1, %xmm1 + maxss %xmm1, %xmm0 + ret + +with -ffast-math. 
+ +//===---------------------------------------------------------------------===// + +This function (from PR9803): + +int clamp2(int a) { + if (a > 5) + a = 5; + if (a < 0) + return 0; + return a; +} + +Compiles to: + +_clamp2: ## @clamp2 + pushq %rbp + movq %rsp, %rbp + cmpl $5, %edi + movl $5, %ecx + cmovlel %edi, %ecx + testl %ecx, %ecx + movl $0, %eax + cmovnsl %ecx, %eax + popq %rbp + ret + +The move of 0 could be scheduled above the test to make it an xor reg,reg. + +//===---------------------------------------------------------------------===// + +GCC PR48986. We currently compile this: + +void bar(void); +void yyy(int* p) { + if (__sync_fetch_and_add(p, -1) == 1) + bar(); +} + +into: + movl $-1, %eax + lock + xaddl %eax, (%rdi) + cmpl $1, %eax + je LBB0_2 + +Instead we could generate: + + lock + dec %rdi + je LBB0_2 + +The trick is to match "fetch_and_add(X, -C) == C". + +//===---------------------------------------------------------------------===// + +unsigned t(unsigned a, unsigned b) { + return a <= b ? 5 : -5; +} + +We generate: + movl $5, %ecx + cmpl %esi, %edi + movl $-5, %eax + cmovbel %ecx, %eax + +GCC: + cmpl %edi, %esi + sbbl %eax, %eax + andl $-10, %eax + addl $5, %eax + +//===---------------------------------------------------------------------===//