diff --git a/lib/Target/X86/README.txt b/lib/Target/X86/README.txt
index d4545a6fcfd..e67fab10904 100644
--- a/lib/Target/X86/README.txt
+++ b/lib/Target/X86/README.txt
@@ -895,6 +895,24 @@ compare:
 
 //===---------------------------------------------------------------------===//
 
+Linux is missing some basic tail call support:
+
+#include <math.h>
+double foo(double a) { return sin(a); }
+
+This compiles into the following on x86-64 Linux (but not Darwin):
+foo:
+        subq    $8, %rsp
+        call    sin
+        addq    $8, %rsp
+        ret
+vs:
+
+foo:
+        jmp     sin
+
+//===---------------------------------------------------------------------===//
+
 Tail call optimization improvements: Tail call optimization currently
 pushes all arguments on the top of the stack (their normal place for
 non-tail call optimized calls) that source from the caller's arguments
@@ -1103,57 +1121,6 @@ be folded into:
   shl [mem], 1
 
 //===---------------------------------------------------------------------===//
 
-This testcase misses a read/modify/write opportunity (from PR1425):
-
-void vertical_decompose97iH1(int *b0, int *b1, int *b2, int width){
-    int i;
-    for(i=0; i<width; i++)
-        b1[i] += (b0[i] + b2[i])>>0;
-}
-
-We compile it down to:
-
-LBB1_2: # bb
-        movl    (%esi,%edi,4), %ebx
-        addl    (%ecx,%edi,4), %ebx
-        addl    (%edx,%edi,4), %ebx
-        movl    %ebx, (%ecx,%edi,4)
-        incl    %edi
-        cmpl    %eax, %edi
-        jne     LBB1_2  # bb
-
-the inner loop should add to the memory location (%ecx,%edi,4), saving
-a mov.  Something like:
-
-        movl    (%esi,%edi,4), %ebx
-        addl    (%edx,%edi,4), %ebx
-        addl    %ebx, (%ecx,%edi,4)
-
-Here is another interesting example:
-
-void vertical_compose97iH1(int *b0, int *b1, int *b2, int width){
-    int i;
-    for(i=0; i<width; i++)
-        b1[i] -= (b0[i] + b2[i])>>0;
-}
-
-We miss the r/m/w opportunity here by using 2 subs instead of an add+sub[mem]:
-
-LBB9_2: # bb
-        movl    (%ecx,%edi,4), %ebx
-        subl    (%esi,%edi,4), %ebx
-        subl    (%edx,%edi,4), %ebx
-        movl    %ebx, (%ecx,%edi,4)
-        incl    %edi
-        cmpl    %eax, %edi
-        jne     LBB9_2  # bb
-
-Additionally, LSR should rewrite the exit condition of these loops to use
-a stride-4 IV, which would allow all the scales in the loop to go away.
-This would result in smaller code and more efficient microops.
-
-//===---------------------------------------------------------------------===//
-
 In SSE mode, we turn abs and neg into a load from the constant pool plus a xor
 or and instruction, for example:
 
@@ -1186,13 +1153,6 @@ void test(double *P) {
 
 //===---------------------------------------------------------------------===//
 
-handling llvm.memory.barrier on pre SSE2 cpus
-
-should generate:
-lock ; mov %esp, %esp
-
-//===---------------------------------------------------------------------===//
-
 The generated code on x86 for checking for signed overflow on a multiply the
 obvious way is much longer than it needs to be.
 
@@ -1301,15 +1261,8 @@ FirstOnet:
         xorl    %eax, %eax
         ret
 
-There are a few possible improvements here:
-1. We should be able to eliminate the dead load into %ecx
-2. We could change the "movl 8(%esp), %eax" into
-   "movzwl 10(%esp), %eax"; this lets us change the cmpl
-   into a testl, which is shorter, and eliminate the shift.
-
-We could also in theory eliminate the branch by using a conditional
-for the address of the load, but that seems unlikely to be worthwhile
-in general.
+We could change the "movl 8(%esp), %eax" into "movzwl 10(%esp), %eax"; this
+lets us change the cmpl into a testl, which is shorter, and eliminate the shift.
 
 //===---------------------------------------------------------------------===//
 
@@ -1331,22 +1284,23 @@ bb7:		; preds = %entry
 
 to:
 
-_foo:
+foo:                                    # @foo
+# BB#0:                                 # %entry
+        movl    4(%esp), %ecx
         cmpb    $0, 16(%esp)
-        movl    12(%esp), %ecx
+        je      .LBB0_2
+# BB#1:                                 # %bb
         movl    8(%esp), %eax
-        movl    4(%esp), %edx
-        je      LBB1_2  # bb7
-LBB1_1: # bb
-        addl    %edx, %eax
+        addl    %ecx, %eax
         ret
-LBB1_2: # bb7
-        movl    %edx, %eax
-        subl    %ecx, %eax
+.LBB0_2:                                # %bb7
+        movl    12(%esp), %edx
+        movl    %ecx, %eax
+        subl    %edx, %eax
         ret
 
-The coalescer could coalesce "edx" with "eax" to avoid the movl in LBB1_2
-if it commuted the addl in LBB1_1.
+There's an obviously unnecessary movl in .LBB0_2, and we could eliminate a
+couple more movls by putting 4(%esp) into %eax instead of %ecx.
 
 //===---------------------------------------------------------------------===//
 
@@ -1396,8 +1350,7 @@ Also check why xmm7 is not used at all in the function.
 
 //===---------------------------------------------------------------------===//
 
-Legalize loses track of the fact that bools are always zero extended when in
-memory. This causes us to compile abort_gzip (from 164.gzip) from:
+Take the following IR:
 
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
 target triple = "i386-apple-darwin8"
@@ -1416,16 +1369,15 @@ bb4.i:		; preds = %entry
 }
 declare void @exit(i32) noreturn nounwind
 
-into:
-
-_abort_gzip:
+This compiles into:
+_abort_gzip:                            ## @abort_gzip
+## BB#0:                                ## %entry
         subl    $12, %esp
         movb    _in_exit.4870.b, %al
-        notb    %al
-        testb   $1, %al
-        jne     LBB1_2  ## bb4.i
-LBB1_1: ## bb.i
-        ...
+        cmpb    $1, %al
+        jne     LBB0_2
+
+We somehow miss folding the movb into the cmpb.
 
 //===---------------------------------------------------------------------===//
 
@@ -1929,3 +1881,100 @@ The code produced by gcc is 3 bytes shorter.  This sort of construct often
 shows up with bitfields.
 
 //===---------------------------------------------------------------------===//
+
+Take the following C code:
+int f(int a, int b) { return (unsigned char)a == (unsigned char)b; }
+
+We generate the following IR with clang:
+define i32 @f(i32 %a, i32 %b) nounwind readnone {
+entry:
+  %tmp = xor i32 %b, %a                           ; <i32> [#uses=1]
+  %tmp6 = and i32 %tmp, 255                       ; <i32> [#uses=1]
+  %cmp = icmp eq i32 %tmp6, 0                     ; <i1> [#uses=1]
+  %conv5 = zext i1 %cmp to i32                    ; <i32> [#uses=1]
+  ret i32 %conv5
+}
+
+And the following x86 code:
+        xorl    %esi, %edi
+        testb   $-1, %dil
+        sete    %al
+        movzbl  %al, %eax
+        ret
+
+A cmpb instead of the xorl+testb would be one instruction shorter.
+
+//===---------------------------------------------------------------------===//
+
+Given the following C code:
+int f(int a, int b) { return (signed char)a == (signed char)b; }
+
+We generate the following IR with clang:
+define i32 @f(i32 %a, i32 %b) nounwind readnone {
+entry:
+  %sext = shl i32 %a, 24                          ; <i32> [#uses=1]
+  %conv1 = ashr i32 %sext, 24                     ; <i32> [#uses=1]
+  %sext6 = shl i32 %b, 24                         ; <i32> [#uses=1]
+  %conv4 = ashr i32 %sext6, 24                    ; <i32> [#uses=1]
+  %cmp = icmp eq i32 %conv1, %conv4               ; <i1> [#uses=1]
+  %conv5 = zext i1 %cmp to i32                    ; <i32> [#uses=1]
+  ret i32 %conv5
+}
+
+And the following x86 code:
+        movsbl  %sil, %eax
+        movsbl  %dil, %ecx
+        cmpl    %eax, %ecx
+        sete    %al
+        movzbl  %al, %eax
+        ret
+
+It should be possible to eliminate the sign extensions.
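
A quick way to see that the sign extensions are unnecessary: two ints convert
to equal signed chars exactly when their low bytes are equal, so the
signed-char comparison could use the same cmpb-style sequence as the
unsigned-char case above. A minimal standalone check of that claim (a sketch,
assuming 8-bit chars and the usual truncating int-to-signed-char conversion;
the helper names are just for illustration):

#include <assert.h>

/* What the IR above computes: compare after sign-extending the low bytes. */
static int with_sext(int a, int b) {
  return (signed char)a == (signed char)b;
}

/* What a cmpb would compute: compare just the low 8 bits. */
static int low_byte_only(int a, int b) {
  return (unsigned char)a == (unsigned char)b;
}

int main(void) {
  for (int a = -512; a <= 512; ++a)
    for (int b = -512; b <= 512; ++b)
      assert(with_sext(a, b) == low_byte_only(a, b));
  return 0;
}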
+
+//===---------------------------------------------------------------------===//
+
+LLVM misses a load+store narrowing opportunity in this code:
+
+%struct.bf = type { i64, i16, i16, i32 }
+
+@bfi = external global %struct.bf*                ; <%struct.bf**> [#uses=2]
+
+define void @t1() nounwind ssp {
+entry:
+  %0 = load %struct.bf** @bfi, align 8            ; <%struct.bf*> [#uses=1]
+  %1 = getelementptr %struct.bf* %0, i64 0, i32 1 ; <i16*> [#uses=1]
+  %2 = bitcast i16* %1 to i32*                    ; <i32*> [#uses=2]
+  %3 = load i32* %2, align 1                      ; <i32> [#uses=1]
+  %4 = and i32 %3, -65537                         ; <i32> [#uses=1]
+  store i32 %4, i32* %2, align 1
+  %5 = load %struct.bf** @bfi, align 8            ; <%struct.bf*> [#uses=1]
+  %6 = getelementptr %struct.bf* %5, i64 0, i32 1 ; <i16*> [#uses=1]
+  %7 = bitcast i16* %6 to i32*                    ; <i32*> [#uses=2]
+  %8 = load i32* %7, align 1                      ; <i32> [#uses=1]
+  %9 = and i32 %8, -131073                        ; <i32> [#uses=1]
+  store i32 %9, i32* %7, align 1
+  ret void
+}
+
+LLVM currently emits this:
+
+  movq  bfi(%rip), %rax
+  andl  $-65537, 8(%rax)
+  movq  bfi(%rip), %rax
+  andl  $-131073, 8(%rax)
+  ret
+
+It could narrow the loads and stores to emit this:
+
+  movq  bfi(%rip), %rax
+  andb  $-2, 10(%rax)
+  movq  bfi(%rip), %rax
+  andb  $-3, 10(%rax)
+  ret
+
+The trouble is that there is a TokenFactor between the store and the
+load, making it non-trivial to determine if there's anything between
+the load and the store which would prohibit narrowing.
+
+//===---------------------------------------------------------------------===//
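
To see concretely why the narrowed form in the note above is equivalent to the
wide form, note that bit 16 of the 32-bit word at offset 8 is bit 0 of the
byte at offset 10, and bit 17 is bit 1 of that same byte. A small standalone
check of that bit-level claim (a sketch, assuming the little-endian layout x86
uses; the four-byte buffer below just stands in for the bytes at offset 8):

#include <assert.h>
#include <stdint.h>
#include <string.h>

int main(void) {
  /* Wide form: 32-bit load, andl $-65537 (0xfffeffff), 32-bit store. */
  unsigned char wide[4] = {0xff, 0xff, 0xff, 0xff};
  uint32_t w;
  memcpy(&w, wide, sizeof w);
  w &= 0xfffeffffu;                 /* clears bit 16 only */
  memcpy(wide, &w, sizeof w);

  /* Narrow form: andb $-2 on the byte at offset 10 (index 2 here). */
  unsigned char narrow[4] = {0xff, 0xff, 0xff, 0xff};
  narrow[2] &= 0xfe;                /* clears bit 0 of that byte only */

  assert(memcmp(wide, narrow, sizeof wide) == 0);
  return 0;
}

The bit arithmetic is the easy part; as the note says, the hard part for the
compiler is proving, across the intervening TokenFactor, that nothing between
the load and the store touches those bytes.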