diff --git a/lib/Target/X86/README.txt b/lib/Target/X86/README.txt
index 0bf8c4f8b4d..073e2dacef1 100644
--- a/lib/Target/X86/README.txt
+++ b/lib/Target/X86/README.txt
@@ -2,6 +2,14 @@
 // Random ideas for the X86 backend.
 //===---------------------------------------------------------------------===//
 
+Missing features:
+  - Support for SSE4: http://www.intel.com/software/penryn
+    http://softwarecommunity.intel.com/isn/Downloads/Intel%20SSE4%20Programming%20Reference.pdf
+  - support for 3DNow!
+  - weird ABIs?
+
+//===---------------------------------------------------------------------===//
+
 Add MUL2U and MUL2S nodes to represent a multiply that returns both the Hi
 and Lo parts (combination of MUL and MULH[SU] into one node).  Add this to
 X86, & make the dag combiner produce it when needed.  This will eliminate one
@@ -18,6 +26,15 @@ long long test(int X, int Y) { return (long long)X*Y; }
 
 ... which should only be one imul instruction.
 
+or:
+
+unsigned long long int t2(unsigned int a, unsigned int b) {
+  return (unsigned long long)a * b;
+}
+
+... which should be one mul instruction.
+
 This can be done with a custom expander, but it would be nice to move this to
 generic code.
 
@@ -133,9 +150,7 @@ int foo (unsigned long j) {
 
 //===---------------------------------------------------------------------===//
 
-Use push/pop instructions in prolog/epilog sequences instead of stores off
-ESP (certain code size win, perf win on some [which?] processors).
-Also, it appears icc use push for parameter passing. Need to investigate.
+It appears icc uses push for parameter passing. Need to investigate.
 
 //===---------------------------------------------------------------------===//
 
@@ -387,9 +402,7 @@ L4:
-There are 3 issues:
+There are 2 issues:
 
 1. Lack of post regalloc LICM.
-2. Poor sub-regclass support. That leads to inability to promote the 16-bit
-   arithmetic op to 32-bit and making use of leal.
-3. LSR unable to reused IV for a different type (i16 vs. i32) even though
+2. LSR is unable to reuse the IV for a different type (i16 vs. i32) even though
    the cast would be free.
 
 //===---------------------------------------------------------------------===//
 
@@ -460,21 +473,6 @@ require a copy to be inserted (in X86InstrInfo::convertToThreeAddress).
 
 //===---------------------------------------------------------------------===//
 
-Bad codegen:
-
-char foo(int x) { return x; }
-
-_foo:
-        movl 4(%esp), %eax
-        shll $24, %eax
-        sarl $24, %eax
-        ret
-
-SIGN_EXTEND_INREG can be implemented as (sext (trunc)) to take advantage of
-sub-registers.
-
-//===---------------------------------------------------------------------===//
-
 Consider this:
 
 typedef struct pair { float A, B; } pair;
 
@@ -996,46 +994,122 @@
 _foo:
         movl %edi, %eax
         ret
 
+Another example is:
+
+;; X's live range extends beyond the shift, so the register allocator
+;; cannot coalesce it with Y.  Because of this, a copy needs to be
+;; emitted before the shift to save the register value before it is
+;; clobbered.  However, this copy is not needed if the register
+;; allocator turns the shift into an LEA.  This also occurs for ADD.
+
+; Check that the shift gets turned into an LEA.
+; RUN: llvm-upgrade < %s | llvm-as | llc -march=x86 -x86-asm-syntax=intel | \
+; RUN:   not grep {mov E.X, E.X}
+
+%G = external global int
+
+int %test1(int %X, int %Y) {
+        %Z = add int %X, %Y
+        volatile store int %Y, int* %G
+        volatile store int %Z, int* %G
+        ret int %X
+}
+
+int %test2(int %X) {
+        %Z = add int %X, 1    ;; inc
+        volatile store int %Z, int* %G
+        ret int %X
+}
+
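+For reference, a rough C equivalent of test1 (a sketch only, not part of
+the original test; the IR above is the actual test case, and the volatile
+global G mirrors %G):
+
+volatile int G;
+
+int test1(int X, int Y) {
+  int Z = X + Y;  /* should become an LEA: X and Y are both live after */
+  G = Y;          /* volatile store keeps Y live across the add        */
+  G = Z;
+  return X;       /* X live past the add forces a copy without LEA     */
+}
+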
 //===---------------------------------------------------------------------===//
 
-We use push/pop of stack space around calls in situations where we don't have to.
-Call to f below produces:
-        subl $16, %esp      <<<<<
-        movl %eax, (%esp)
-        call L_f$stub
-        addl $16, %esp      <<<<<
-The stack push/pop can be moved into the prolog/epilog.  It does this because it's
-building the frame pointer, but this should not be sufficient, only the use of alloca
-should cause it to do this.
-(There are other issues shown by this code, but this is one.)
-
-typedef struct _range_t {
-    float fbias;
-    float fscale;
-    int ibias;
-    int iscale;
-    int ishift;
-    unsigned char lut[];
-} range_t;
-
-struct _decode_t {
-    int type:4;
-    int unit:4;
-    int alpha:8;
-    int N:8;
-    int bpc:8;
-    int bpp:16;
-    int skip:8;
-    int swap:8;
-    const range_t*const*range;
-};
-
-typedef struct _decode_t decode_t;
-
-extern int f(const decode_t* decode);
-
-int decode_byte (const decode_t* decode) {
-  if (decode->swap != 0)
-    return f(decode);
-  return 0;
+This:
+#include <emmintrin.h>
+unsigned test(float f) {
+  return _mm_cvtsi128_si32( (__m128i) _mm_set_ss( f ));
 }
+
+Compiles to:
+_test:
+        movss 4(%esp), %xmm0
+        movd %xmm0, %eax
+        ret
+
+It should compile to a move from the stack slot directly into eax.  DAGCombine
+has this xform, but it is currently disabled until the alignment fields of
+the load/store nodes are trustworthy.
+
+//===---------------------------------------------------------------------===//
+
+Sometimes it is better to codegen subtractions from a constant (e.g. 7-x) with
+a neg followed by an add instead of a sub instruction.  Consider:
+
+int test(char X) { return 7-X; }
+
+we currently produce:
+_test:
+        movl $7, %eax
+        movsbl 4(%esp), %ecx
+        subl %ecx, %eax
+        ret
+
+We would use one fewer register if codegen'd as:
+
+        movsbl 4(%esp), %eax
+        neg %eax
+        add $7, %eax
+        ret
+
+Note that this isn't beneficial if the load can be folded into the sub.  In
+this case, we want a sub:
+
+int test(int X) { return 7-X; }
+_test:
+        movl $7, %eax
+        subl 4(%esp), %eax
+        ret
+
+//===---------------------------------------------------------------------===//
+
+For code like:
+phi (undef, x)
+
+We get an implicit def on the undef side.  If the phi is spilled, we then get:
+implicitdef xmm1
+store xmm1 -> stack
+
+It should be possible to teach the x86 backend to "fold" the store into the
+implicitdef, which just deletes the implicit def.
+
+These instructions should go away:
+#IMPLICIT_DEF %xmm1
+movaps %xmm1, 192(%esp)
+movaps %xmm1, 224(%esp)
+movaps %xmm1, 176(%esp)
+
+//===---------------------------------------------------------------------===//
+
+This is a "commutable two-address" register coalescing deficiency:
+
+define <4 x float> @test1(<4 x float> %V) {
+entry:
+        %tmp8 = shufflevector <4 x float> %V, <4 x float> undef, <4 x i32> < i32 3, i32 2, i32 1, i32 0 >  ; <<4 x float>> [#uses=1]
+        %add = add <4 x float> %tmp8, %V                ; <<4 x float>> [#uses=1]
+        ret <4 x float> %add
+}
+
+this codegens to:
+
+_test1:
+        pshufd  $27, %xmm0, %xmm1
+        addps   %xmm0, %xmm1
+        movaps  %xmm1, %xmm0
+        ret
+
+instead of:
+
+_test1:
+        pshufd  $27, %xmm0, %xmm1
+        addps   %xmm1, %xmm0
+        ret
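+
+For experimenting with the coalescer, a C sketch of the same pattern (not
+from the original entry; _MM_SHUFFLE(0, 1, 2, 3) encodes the same lane
+reversal as the pshufd $27 above):
+
+#include <xmmintrin.h>
+
+/* R = V with its four lanes reversed.  Since addps is commutable, the
+   allocator could emit "addps %xmm1, %xmm0" and return in %xmm0
+   directly, instead of adding into %xmm1 and copying it back. */
+__m128 test1(__m128 V) {
+  __m128 R = _mm_shuffle_ps(V, V, _MM_SHUFFLE(0, 1, 2, 3));
+  return _mm_add_ps(R, V);
+}
+
+The fix is the same in either form: because the add commutes, swapping
+its operands makes the trailing movaps unnecessary.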