X-Git-Url: http://demsky.eecs.uci.edu/git/?a=blobdiff_plain;f=lib%2FTarget%2FX86%2FREADME.txt;h=073e2dacef183b26bab7ac32332d4a19f2702471;hb=c59e52108bbfca50b23c5d10706484d4b012c344;hp=9ab17dc3ee93be1f23d199b3addea8d0d548186b;hpb=0258011bb9d9ebee2a32af7c04d80c21b7869a66;p=oota-llvm.git

diff --git a/lib/Target/X86/README.txt b/lib/Target/X86/README.txt
index 9ab17dc3ee9..073e2dacef1 100644
--- a/lib/Target/X86/README.txt
+++ b/lib/Target/X86/README.txt
@@ -26,6 +26,15 @@ long long test(int X, int Y) { return (long long)X*Y; }
 
 ... which should only be one imul instruction.
 
+or:
+
+unsigned long long int t2(unsigned int a, unsigned int b) {
+       return (unsigned long long)a * b;
+}
+
+... which should be one mul instruction.
+
+
 This can be done with a custom expander, but it would be nice to move this to
 generic code.
 
@@ -141,9 +150,7 @@ int foo (unsigned long j) {
 
 //===---------------------------------------------------------------------===//
 
-Use push/pop instructions in prolog/epilog sequences instead of stores off 
-ESP (certain code size win, perf win on some [which?] processors).
-Also, it appears icc use push for parameter passing. Need to investigate.
+It appears icc use push for parameter passing. Need to investigate.
 
 //===---------------------------------------------------------------------===//
 
@@ -395,9 +402,7 @@ L4:
 There are 3 issues:
 
 1. Lack of post regalloc LICM.
-2. Poor sub-regclass support. That leads to inability to promote the 16-bit
-   arithmetic op to 32-bit and making use of leal.
-3. LSR unable to reused IV for a different type (i16 vs. i32) even though
+2. LSR unable to reused IV for a different type (i16 vs. i32) even though
    the cast would be free.
 
 //===---------------------------------------------------------------------===//
@@ -468,21 +473,6 @@ require a copy to be inserted (in X86InstrInfo::convertToThreeAddress).
 
 //===---------------------------------------------------------------------===//
 
-Bad codegen:
-
-char foo(int x) { return x; }
-
-_foo:
-	movl 4(%esp), %eax
-	shll $24, %eax
-	sarl $24, %eax
-	ret
-
-SIGN_EXTEND_INREG can be implemented as (sext (trunc)) to take advantage of 
-sub-registers.
-
-//===---------------------------------------------------------------------===//
-
 Consider this:
 
 typedef struct pair { float A, B; } pair;
@@ -1031,51 +1021,6 @@ int %test2(int %X) {
         ret int %X
 }
 
-//===---------------------------------------------------------------------===//
-
-We use push/pop of stack space around calls in situations where we don't have to.
-Call to f below produces:
-        subl $16, %esp      <<<<<
-        movl %eax, (%esp)
-        call L_f$stub
-        addl $16, %esp     <<<<<
-The stack push/pop can be moved into the prolog/epilog.  It does this because it's
-building the frame pointer, but this should not be sufficient, only the use of alloca
-should cause it to do this.
-(There are other issues shown by this code, but this is one.)
-
-typedef struct _range_t {
-    float fbias;
-    float fscale;
-    int ibias;
-    int iscale;
-    int ishift;
-    unsigned char lut[];
-} range_t;
-
-struct _decode_t {
-    int type:4;
-    int unit:4;
-    int alpha:8;
-    int N:8;
-    int bpc:8;
-    int bpp:16;
-    int skip:8;
-    int swap:8;
-    const range_t*const*range;
-};
-
-typedef struct _decode_t decode_t;
-
-extern int f(const decode_t* decode);
-
-int decode_byte (const decode_t* decode) {
-  if (decode->swap != 0)
-    return f(decode);
-  return 0;
-}
-
-
 //===---------------------------------------------------------------------===//
 
 This:
@@ -1094,5 +1039,77 @@ it should compile to a move from the stack slot directly into eax.  DAGCombine
 has this xform, but it is currently disabled until the alignment fields of 
 the load/store nodes are trustworthy.
 
+//===---------------------------------------------------------------------===//
+
+Sometimes it is better to codegen subtractions from a constant (e.g. 7-x) with
+a neg instead of a sub instruction.  Consider:
+
+int test(char X) { return 7-X; }
+
+we currently produce:
+_test:
+        movl $7, %eax
+        movsbl 4(%esp), %ecx
+        subl %ecx, %eax
+        ret
+
+We would use one fewer register if codegen'd as:
 
+        movsbl 4(%esp), %eax
+	neg %eax
+        add $7, %eax
+        ret
+
+Note that this isn't beneficial if the load can be folded into the sub.  In
+this case, we want a sub:
+
+int test(int X) { return 7-X; }
+_test:
+        movl $7, %eax
+        subl 4(%esp), %eax
+        ret
+
+//===---------------------------------------------------------------------===//
+
+For code like:
+phi (undef, x)
+
+We get an implicit def on the undef side. If the phi is spilled, we then get:
+implicitdef xmm1
+store xmm1 -> stack
+
+It should be possible to teach the x86 backend to "fold" the store into the
+implicitdef, which just deletes the implicit def.
+
+These instructions should go away:
+#IMPLICIT_DEF %xmm1 
+movaps %xmm1, 192(%esp) 
+movaps %xmm1, 224(%esp) 
+movaps %xmm1, 176(%esp)
+
+//===---------------------------------------------------------------------===//
+
+This is a "commutable two-address" register coallescing deficiency:
+
+define <4 x float> @test1(<4 x float> %V) {
+entry:
+        %tmp8 = shufflevector <4 x float> %V, <4 x float> undef, <4 x i32> < i32 3, i32 2, i32 1, i32 0 >               ; <<4 x float>> [#uses=1]
+        %add = add <4 x float> %tmp8, %V                ; <<4 x float>> [#uses=1]
+        ret <4 x float> %add
+}
+
+this codegens to:
+
+_test1:
+        pshufd  $27, %xmm0, %xmm1
+        addps   %xmm0, %xmm1
+        movaps  %xmm1, %xmm0
+        ret
+
+instead of:
+
+_test1:
+        pshufd  $27, %xmm0, %xmm1
+        addps   %xmm1, %xmm0
+        ret