diff --git a/lib/Target/X86/README.txt b/lib/Target/X86/README.txt
index 4b9a9dd0299..073e2dacef1 100644
--- a/lib/Target/X86/README.txt
+++ b/lib/Target/X86/README.txt
@@ -2,6 +2,14 @@
// Random ideas for the X86 backend.
//===---------------------------------------------------------------------===//

+Missing features:
+  - Support for SSE4: http://www.intel.com/software/penryn
+http://softwarecommunity.intel.com/isn/Downloads/Intel%20SSE4%20Programming%20Reference.pdf
+  - support for 3DNow!
+  - weird abis?
+
+//===---------------------------------------------------------------------===//
+
Add MUL2U and MUL2S nodes to represent a multiply that returns both the Hi
and Lo parts (combination of MUL and MULH[SU] into one node). Add this to
X86, & make the dag combiner produce it when needed. This will eliminate one
@@ -18,11 +26,28 @@ long long test(int X, int Y) { return (long long)X*Y; }

... which should only be one imul instruction.

+or:
+
+unsigned long long int t2(unsigned int a, unsigned int b) {
+  return (unsigned long long)a * b;
+}
+
+... which should be one mul instruction.
+
+
This can be done with a custom expander, but it would be nice to move this to
generic code.

//===---------------------------------------------------------------------===//

+CodeGen/X86/lea-3.ll:test3 should be a single LEA, not a shift/move. The X86
+backend knows how to three-addressify this shift, but it appears the register
+allocator isn't even asking it to do so in this case. We should investigate
+why this isn't happening; it could have a significant impact on other important
+cases for X86 as well.
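+
+As a rough illustration (a hand-written reduction, not the actual contents of
+lea-3.ll, and assuming the x86-64 convention with the argument in %rdi), the
+pattern is a shift whose source and result live in different registers:
+
+long long test3(long long a) {
+  return a << 3;   /* wants a single "leaq (,%rdi,8), %rax", not a
+                      "movq %rdi, %rax" followed by "shlq $3, %rax" */
+}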
+
+//===---------------------------------------------------------------------===//
+
This should be one DIV/IDIV instruction, not a libcall:

unsigned test(unsigned long long X, unsigned Y) {
@@ -125,9 +150,7 @@ int foo (unsigned long j) {

//===---------------------------------------------------------------------===//

-Use push/pop instructions in prolog/epilog sequences instead of stores off
-ESP (certain code size win, perf win on some [which?] processors).
-Also, it appears icc use push for parameter passing. Need to investigate.
+It appears icc uses push for parameter passing. Need to investigate.

//===---------------------------------------------------------------------===//

@@ -339,6 +362,51 @@ lambda, siod, optimizer-eval, ackermann, hash2, nestedloop, strcat, and
Treesor.

//===---------------------------------------------------------------------===//

+We are generating far worse code than gcc:
+
+volatile short X, Y;
+
+void foo(int N) {
+  int i;
+  for (i = 0; i < N; i++) { X = i; Y = i*4; }
+}
+
+LBB1_1: #bb.preheader
+        xorl %ecx, %ecx
+        xorw %dx, %dx
+LBB1_2: #bb
+        movl L_X$non_lazy_ptr, %esi
+        movw %dx, (%esi)
+        movw %dx, %si
+        shlw $2, %si
+        movl L_Y$non_lazy_ptr, %edi
+        movw %si, (%edi)
+        incl %ecx
+        incw %dx
+        cmpl %eax, %ecx
+        jne LBB1_2 #bb
+
+vs.
+
+        xorl %edx, %edx
+        movl L_X$non_lazy_ptr-"L00000000001$pb"(%ebx), %esi
+        movl L_Y$non_lazy_ptr-"L00000000001$pb"(%ebx), %ecx
+L4:
+        movw %dx, (%esi)
+        leal 0(,%edx,4), %eax
+        movw %ax, (%ecx)
+        addl $1, %edx
+        cmpl %edx, %edi
+        jne L4
+
+There are 3 issues:
+
+1. Lack of post regalloc LICM.
+2. LSR unable to reuse an IV for a different type (i16 vs. i32) even though
+   the cast would be free.
+
+//===---------------------------------------------------------------------===//
+
Teach the coalescer to coalesce vregs of different register classes. e.g. FR32 /
FR64 to VR128.

@@ -405,21 +473,6 @@ require a copy to be inserted (in X86InstrInfo::convertToThreeAddress).

//===---------------------------------------------------------------------===//

-Bad codegen:
-
-char foo(int x) { return x; }
-
-_foo:
-        movl 4(%esp), %eax
-        shll $24, %eax
-        sarl $24, %eax
-        ret
-
-SIGN_EXTEND_INREG can be implemented as (sext (trunc)) to take advantage of
-sub-registers.
-
-//===---------------------------------------------------------------------===//
-
Consider this:

typedef struct pair { float A, B; } pair;
@@ -430,16 +483,13 @@ void pairtest(pair P, float *FP) {
We currently generate this code with llvmgcc4:

_pairtest:
-        subl $12, %esp
-        movl 20(%esp), %eax
-        movl %eax, 4(%esp)
-        movl 16(%esp), %eax
-        movl %eax, (%esp)
-        movss (%esp), %xmm0
-        addss 4(%esp), %xmm0
-        movl 24(%esp), %eax
-        movss %xmm0, (%eax)
-        addl $12, %esp
+        movl 8(%esp), %eax
+        movl 4(%esp), %ecx
+        movd %eax, %xmm0
+        movd %ecx, %xmm1
+        addss %xmm0, %xmm1
+        movl 12(%esp), %eax
+        movss %xmm1, (%eax)
        ret

we should be able to generate:
@@ -455,6 +505,10 @@ integer chunks. It does this so that structs like {short,short} are passed in
a single 32-bit integer stack slot. We should handle the safe cases above much
nicer, while still handling the hard cases.

+While true in general, in this specific case we could do better by promoting
+load int + bitcast to float -> load float. This basically needs alignment info;
+the code is already implemented (but disabled) in the dag combiner.
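+
+As a sketch of that transform (hand-written IR, not compiler output; the names
+are made up), the idea is to turn
+
+  %i = load i32* %P              ; load the argument word as an integer
+  %f = bitcast i32 %i to float
+
+into
+
+  %Q = bitcast i32* %P to float*
+  %f = load float* %Q            ; load the same bits directly as a float
+
+which, as noted above, the dag combiner should only do once it can trust the
+alignment information on the load.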
+
//===---------------------------------------------------------------------===//

Another instruction selector deficiency:

@@ -533,10 +587,6 @@ do not make use of.

//===---------------------------------------------------------------------===//

-We should handle __attribute__ ((__visibility__ ("hidden"))).
-
-//===---------------------------------------------------------------------===//
-
int %foo(int* %a, int %t) {
entry:
        br label %cond_true
@@ -668,20 +718,6 @@ The add\sub pair is really unneeded here.

//===---------------------------------------------------------------------===//

-We generate really bad code in some cases due to lowering SETCC/SELECT at
-legalize time, which prevents the post-legalize dag combine pass from
-understanding the code. As a silly example, this prevents us from folding
-stuff like this:
-
-bool %test(ulong %x) {
-  %tmp = setlt ulong %x, 4294967296
-  ret bool %tmp
-}
-
-into x.h == 0
-
-//===---------------------------------------------------------------------===//
-
We currently compile sign_extend_inreg into two shifts:

long foo(long X) {
@@ -750,3 +786,330 @@ to grab the bytes from the next cacheline.

//===---------------------------------------------------------------------===//

In c99 mode, the preprocessor doesn't like assembly comments like #TRUNCATE.
+
+//===---------------------------------------------------------------------===//
+
+This could be a single 16-bit load.
+
+int f(char *p) {
+  if ((p[0] == 1) & (p[1] == 2)) return 1;
+  return 0;
+}
+
+//===---------------------------------------------------------------------===//
+
+We should inline lrintf and probably other libc functions.
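+
+For example (a sketch of the kind of call we would like to lower inline on SSE
+targets; the function and variable names here are made up):
+
+#include <math.h>
+long f(float x) {
+  return lrintf(x);   /* could be a single cvtss2si, which also rounds using
+                         the current rounding mode, instead of a libc call */
+}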
+
+//===---------------------------------------------------------------------===//
+
+Start using the flags more. For example, compile:
+
+int add_zf(int *x, int y, int a, int b) {
+  if ((*x += y) == 0)
+    return a;
+  else
+    return b;
+}
+
+to:
+        addl %esi, (%rdi)
+        movl %edx, %eax
+        cmovne %ecx, %eax
+        ret
+instead of:
+
+_add_zf:
+        addl (%rdi), %esi
+        movl %esi, (%rdi)
+        testl %esi, %esi
+        cmove %edx, %ecx
+        movl %ecx, %eax
+        ret
+
+and:
+
+int add_zf(int *x, int y, int a, int b) {
+  if ((*x + y) < 0)
+    return a;
+  else
+    return b;
+}
+
+to:
+
+add_zf:
+        addl (%rdi), %esi
+        movl %edx, %eax
+        cmovns %ecx, %eax
+        ret
+
+instead of:
+
+_add_zf:
+        addl (%rdi), %esi
+        testl %esi, %esi
+        cmovs %edx, %ecx
+        movl %ecx, %eax
+        ret
+
+//===---------------------------------------------------------------------===//
+
+This:
+#include <math.h>
+int foo(double X) { return isnan(X); }
+
+compiles to (-m64):
+
+_foo:
+        pxor %xmm1, %xmm1
+        ucomisd %xmm1, %xmm0
+        setp %al
+        movzbl %al, %eax
+        ret
+
+the pxor is not needed; we could compare the value against itself.
+
+//===---------------------------------------------------------------------===//
+
+These two functions have identical effects:
+
+unsigned int f(unsigned int i, unsigned int n) {++i; if (i == n) ++i; return i;}
+unsigned int f2(unsigned int i, unsigned int n) {++i; i += i == n; return i;}
+
+We currently compile them to:
+
+_f:
+        movl 4(%esp), %eax
+        movl %eax, %ecx
+        incl %ecx
+        movl 8(%esp), %edx
+        cmpl %edx, %ecx
+        jne LBB1_2 #UnifiedReturnBlock
+LBB1_1: #cond_true
+        addl $2, %eax
+        ret
+LBB1_2: #UnifiedReturnBlock
+        movl %ecx, %eax
+        ret
+_f2:
+        movl 4(%esp), %eax
+        movl %eax, %ecx
+        incl %ecx
+        cmpl 8(%esp), %ecx
+        sete %cl
+        movzbl %cl, %ecx
+        leal 1(%ecx,%eax), %eax
+        ret
+
+both of which are inferior to GCC's:
+
+_f:
+        movl 4(%esp), %edx
+        leal 1(%edx), %eax
+        addl $2, %edx
+        cmpl 8(%esp), %eax
+        cmove %edx, %eax
+        ret
+_f2:
+        movl 4(%esp), %eax
+        addl $1, %eax
+        xorl %edx, %edx
+        cmpl 8(%esp), %eax
+        sete %dl
+        addl %edx, %eax
+        ret
+
+//===---------------------------------------------------------------------===//
+
+This code:
+
+void test(int X) {
+  if (X) abort();
+}
+
+is currently compiled to:
+
+_test:
+        subl $12, %esp
+        cmpl $0, 16(%esp)
+        jne LBB1_1
+        addl $12, %esp
+        ret
+LBB1_1:
+        call L_abort$stub
+
+It would be better to produce:
+
+_test:
+        subl $12, %esp
+        cmpl $0, 16(%esp)
+        jne L_abort$stub
+        addl $12, %esp
+        ret
+
+This can be applied to any no-return function call that takes no arguments etc.
+Alternatively, the stack save/restore logic could be shrink-wrapped, producing
+something like this:
+
+_test:
+        cmpl $0, 4(%esp)
+        jne LBB1_1
+        ret
+LBB1_1:
+        subl $12, %esp
+        call L_abort$stub
+
+Both are useful in different situations. Finally, it could be shrink-wrapped
+and tail called, like this:
+
+_test:
+        cmpl $0, 4(%esp)
+        jne LBB1_1
+        ret
+LBB1_1:
+        pop %eax   # realign stack.
+        call L_abort$stub
+
+Though this probably isn't worth it.
+
+//===---------------------------------------------------------------------===//
+
+We need to teach the codegen to convert two-address INC instructions to LEA
+when the flags are dead. For example, on X86-64, compile:
+
+int foo(int A, int B) {
+  return A+1;
+}
+
+to:
+
+_foo:
+        leal 1(%edi), %eax
+        ret
+
+instead of:
+
+_foo:
+        incl %edi
+        movl %edi, %eax
+        ret
+
+Another example is:
+
+;; X's live range extends beyond the shift, so the register allocator
+;; cannot coalesce it with Y. Because of this, a copy needs to be
+;; emitted before the shift to save the register value before it is
+;; clobbered. However, this copy is not needed if the register
+;; allocator turns the shift into an LEA. This also occurs for ADD.
+
+; Check that the shift gets turned into an LEA.
+; RUN: llvm-upgrade < %s | llvm-as | llc -march=x86 -x86-asm-syntax=intel | \
+; RUN:   not grep {mov E.X, E.X}
+
+%G = external global int
+
+int %test1(int %X, int %Y) {
+        %Z = add int %X, %Y
+        volatile store int %Y, int* %G
+        volatile store int %Z, int* %G
+        ret int %X
+}
+
+int %test2(int %X) {
+        %Z = add int %X, 1  ;; inc
+        volatile store int %Z, int* %G
+        ret int %X
+}
+
+//===---------------------------------------------------------------------===//
+
+This:
+#include <emmintrin.h>
+unsigned test(float f) {
+ return _mm_cvtsi128_si32( (__m128i) _mm_set_ss( f ));
+}
+
+Compiles to:
+_test:
+        movss 4(%esp), %xmm0
+        movd %xmm0, %eax
+        ret
+
+it should compile to a move from the stack slot directly into eax. DAGCombine
+has this xform, but it is currently disabled until the alignment fields of
+the load/store nodes are trustworthy.
+
+//===---------------------------------------------------------------------===//
+
+Sometimes it is better to codegen subtractions from a constant (e.g. 7-x) with
+a neg instead of a sub instruction. Consider:
+
+int test(char X) { return 7-X; }
+
+we currently produce:
+_test:
+        movl $7, %eax
+        movsbl 4(%esp), %ecx
+        subl %ecx, %eax
+        ret
+
+We would use one fewer register if codegen'd as:
+
+        movsbl 4(%esp), %eax
+        neg %eax
+        add $7, %eax
+        ret
+
+Note that this isn't beneficial if the load can be folded into the sub. In
+this case, we want a sub:
+
+int test(int X) { return 7-X; }
+_test:
+        movl $7, %eax
+        subl 4(%esp), %eax
+        ret
+
+//===---------------------------------------------------------------------===//
+
+For code like:
+phi (undef, x)
+
+We get an implicit def on the undef side. If the phi is spilled, we then get:
+implicitdef xmm1
+store xmm1 -> stack
+
+It should be possible to teach the x86 backend to "fold" the store into the
+implicitdef, which just deletes the implicit def.
+
+These instructions should go away:
+#IMPLICIT_DEF %xmm1
+movaps %xmm1, 192(%esp)
+movaps %xmm1, 224(%esp)
+movaps %xmm1, 176(%esp)
+
+//===---------------------------------------------------------------------===//
+
+This is a "commutable two-address" register coalescing deficiency:
+
+define <4 x float> @test1(<4 x float> %V) {
+entry:
+        %tmp8 = shufflevector <4 x float> %V, <4 x float> undef, <4 x i32> < i32 3, i32 2, i32 1, i32 0 >  ; <<4 x float>> [#uses=1]
+        %add = add <4 x float> %tmp8, %V  ; <<4 x float>> [#uses=1]
+        ret <4 x float> %add
+}
+
+this codegens to:
+
+_test1:
+        pshufd $27, %xmm0, %xmm1
+        addps %xmm0, %xmm1
+        movaps %xmm1, %xmm0
+        ret
+
+instead of:
+
+_test1:
+        pshufd $27, %xmm0, %xmm1
+        addps %xmm1, %xmm0
+        ret