X-Git-Url: http://demsky.eecs.uci.edu/git/?a=blobdiff_plain;f=lib%2FTarget%2FX86%2FREADME.txt;h=52d3c01076de2f69d27112c8e1e8c6caf5897bcc;hb=3922da8ae8fab29de6416eeeebf21208b1491557;hp=ea3014e7b92763838173abec4d1effc745553d2a;hpb=ce3a022b8c07339b31aadb56288a8e02c76b40e3;p=oota-llvm.git diff --git a/lib/Target/X86/README.txt b/lib/Target/X86/README.txt index ea3014e7b92..52d3c01076d 100644 --- a/lib/Target/X86/README.txt +++ b/lib/Target/X86/README.txt @@ -2,11 +2,6 @@ // Random ideas for the X86 backend. //===---------------------------------------------------------------------===// -We should add support for the "movbe" instruction, which does a byte-swapping -copy (3-addr bswap + memory support?) This is available on Atom processors. - -//===---------------------------------------------------------------------===// - This should be one DIV/IDIV instruction, not a libcall: unsigned test(unsigned long long X, unsigned Y) { @@ -61,7 +56,7 @@ cmovs, we should expand to a conditional branch like GCC produces. Some isel ideas: -1. Dynamic programming based approach when compile time if not an +1. Dynamic programming based approach when compile time is not an issue. 2. Code duplication (addressing mode) during isel. 3. Other ideas from "Register-Sensitive Selection, Duplication, and @@ -128,7 +123,7 @@ flags. The instruction selector sometimes misses folding a load into a compare. The pattern is written as (cmp reg, (load p)). Because the compare isn't commutative, it is not matched with the load on both sides. The dag combiner -should be made smart enough to cannonicalize the load into the RHS of a compare +should be made smart enough to canonicalize the load into the RHS of a compare when it can invert the result of the compare for free. //===---------------------------------------------------------------------===// @@ -1222,7 +1217,7 @@ Also check why xmm7 is not used at all in the function. Take the following: -target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128" +target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-S128" target triple = "i386-apple-darwin8" @in_exit.4870.b = internal global i1 false ; [#uses=2] define fastcc void @abort_gzip() noreturn nounwind { @@ -1449,54 +1444,6 @@ it would be nice to produce "into" someday. //===---------------------------------------------------------------------===// -This code: - -void vec_mpys1(int y[], const int x[], int scaler) { -int i; -for (i = 0; i < 150; i++) - y[i] += (((long long)scaler * (long long)x[i]) >> 31); -} - -Compiles to this loop with GCC 3.x: - -.L5: - movl %ebx, %eax - imull (%edi,%ecx,4) - shrdl $31, %edx, %eax - addl %eax, (%esi,%ecx,4) - incl %ecx - cmpl $149, %ecx - jle .L5 - -llvm-gcc compiles it to the much uglier: - -LBB1_1: ## bb1 - movl 24(%esp), %eax - movl (%eax,%edi,4), %ebx - movl %ebx, %ebp - imull %esi, %ebp - movl %ebx, %eax - mull %ecx - addl %ebp, %edx - sarl $31, %ebx - imull %ecx, %ebx - addl %edx, %ebx - shldl $1, %eax, %ebx - movl 20(%esp), %eax - addl %ebx, (%eax,%edi,4) - incl %edi - cmpl $150, %edi - jne LBB1_1 ## bb1 - -The issue is that we hoist the cast of "scaler" to long long outside of the -loop, the value comes into the loop as two values, and -RegsForValue::getCopyFromRegs doesn't know how to put an AssertSext on the -constructed BUILD_PAIR which represents the cast value. 
-
-This can be handled by making CodeGenPrepare sink the cast.
-
-//===---------------------------------------------------------------------===//
-
 Test instructions can be eliminated by using EFLAGS values from arithmetic
 instructions. This is currently not done for mul, and, or, xor, neg, shl,
 sra, srl, shld, shrd, atomic ops, and others. It is also currently not done
@@ -1572,43 +1519,6 @@ The first one is done for all AMDs, Core2, and "Generic"
 The second one is done for: Atom, Pentium Pro, all AMDs, Pentium 4, Nocona,
 Core 2, and "Generic"
 
-//===---------------------------------------------------------------------===//
-
-Testcase:
-int a(int x) { return (x & 127) > 31; }
-
-Current output:
-	movl	4(%esp), %eax
-	andl	$127, %eax
-	cmpl	$31, %eax
-	seta	%al
-	movzbl	%al, %eax
-	ret
-
-Ideal output:
-	xorl	%eax, %eax
-	testl	$96, 4(%esp)
-	setne	%al
-	ret
-
-This should definitely be done in instcombine, canonicalizing the range
-condition into a != condition. We get this IR:
-
-define i32 @a(i32 %x) nounwind readnone {
-entry:
- %0 = and i32 %x, 127 ; [#uses=1]
- %1 = icmp ugt i32 %0, 31 ; [#uses=1]
- %2 = zext i1 %1 to i32 ; [#uses=1]
- ret i32 %2
-}
-
-Instcombine prefers to strength reduce relational comparisons to equality
-comparisons when possible, this should be another case of that. This could
-be handled pretty easily in InstCombiner::visitICmpInstWithInstAndIntCst, but it
-looks like InstCombiner::visitICmpInstWithInstAndIntCst should really already
-be redesigned to use ComputeMaskedBits and friends.
-
- 
 //===---------------------------------------------------------------------===//
 Testcase: int x(int a) { return (a&0xf0)>>4; }
@@ -2040,3 +1950,46 @@ _clamp2:                                ## @clamp2
 The move of 0 could be scheduled above the test to make it an xor reg,reg.
 
 //===---------------------------------------------------------------------===//
+
+GCC PR48986. We currently compile this:
+
+void bar(void);
+void yyy(int* p) {
+    if (__sync_fetch_and_add(p, -1) == 1)
+      bar();
+}
+
+into:
+	movl	$-1, %eax
+	lock
+	xaddl	%eax, (%rdi)
+	cmpl	$1, %eax
+	je	LBB0_2
+
+Instead we could generate:
+
+	lock
+	decl	(%rdi)
+	je	LBB0_2
+
+The trick is to match "fetch_and_add(X, -C) == C".
+
+//===---------------------------------------------------------------------===//
+
+unsigned t(unsigned a, unsigned b) {
+  return a <= b ? 5 : -5;
+}
+
+We generate:
+	movl	$5, %ecx
+	cmpl	%esi, %edi
+	movl	$-5, %eax
+	cmovbel	%ecx, %eax
+
+GCC:
+	cmpl	%edi, %esi
+	sbbl	%eax, %eax
+	andl	$-10, %eax
+	addl	$5, %eax
+
+//===---------------------------------------------------------------------===//
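
A small C sketch of the equivalences the two entries above rely on, using the
same __sync builtins as the testcase (the helper names hits_zero and sel5 are
made up for illustration and are not part of the patch):

/* "fetch_and_add(p, -1) == 1" asks whether the *old* value was 1, which is
   the same as asking whether the *new* value is 0.  Written that way, the
   ZF set by a lock'ed decrement of the memory operand can feed the branch
   directly, with no xadd/cmp needed.  */
int hits_zero(int *p) {
  return __sync_sub_and_fetch(p, 1) == 0;
}

/* GCC's sbb/and/add sequence for "a <= b ? 5 : -5" works because
   -(unsigned)(b < a) is all-ones exactly when a > b; masking -10 and then
   adding 5 yields -5 or 5 with no cmov and no branch.  */
unsigned sel5(unsigned a, unsigned b) {
  unsigned mask = 0u - (unsigned)(b < a);  /* 0xffffffff iff a > b */
  return (mask & (unsigned)-10) + 5u;      /* -5 iff a > b, else 5 */
}

//===---------------------------------------------------------------------===//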