Increase efficiency of sign_extend_inreg by using subregisters for truncation. As...

[oota-llvm.git] / lib / Target / X86 / README.txt
diff --git a/lib/Target/X86/README.txt b/lib/Target/X86/README.txt

index 758133c04451dff29ff017ef32ba5d41e78641cd..073e2dacef183b26bab7ac32332d4a19f2702471 100644 (file)
--- a/lib/Target/X86/README.txt
+++ b/lib/Target/X86/README.txt
@@ -2,6 +2,14 @@
  // Random ideas for the X86 backend.
  //===---------------------------------------------------------------------===//
  
+Missing features:
+  - Support for SSE4: http://www.intel.com/software/penryn
+http://softwarecommunity.intel.com/isn/Downloads/Intel%20SSE4%20Programming%20Reference.pdf
+  - support for 3DNow!
+  - weird abis?
+
+//===---------------------------------------------------------------------===//
+
  Add a MUL2U and MUL2S nodes to represent a multiply that returns both the
  Hi and Lo parts (combination of MUL and MULH[SU] into one node).  Add this to
  X86, & make the dag combiner produce it when needed.  This will eliminate one
@@ -18,11 +26,28 @@ long long test(int X, int Y) { return (long long)X*Y; }
  
  ... which should only be one imul instruction.
  
+or:
+
+unsigned long long int t2(unsigned int a, unsigned int b) {
+       return (unsigned long long)a * b;
+}
+
+... which should be one mul instruction.
+
+
  This can be done with a custom expander, but it would be nice to move this to
  generic code.
  
  //===---------------------------------------------------------------------===//
  
+CodeGen/X86/lea-3.ll:test3 should be a single LEA, not a shift/move.  The X86
+backend knows how to three-addressify this shift, but it appears the register
+allocator isn't even asking it to do so in this case.  We should investigate
+why this isn't happening, it could have significant impact on other important
+cases for X86 as well.
+
+//===---------------------------------------------------------------------===//
+
  This should be one DIV/IDIV instruction, not a libcall:
  
  unsigned test(unsigned long long X, unsigned Y) {
@@ -125,9 +150,7 @@ int foo (unsigned long j) {
  
  //===---------------------------------------------------------------------===//
  
-Use push/pop instructions in prolog/epilog sequences instead of stores off 
-ESP (certain code size win, perf win on some [which?] processors).
-Also, it appears icc use push for parameter passing. Need to investigate.
+It appears icc use push for parameter passing. Need to investigate.
  
  //===---------------------------------------------------------------------===//
  
@@ -379,9 +402,7 @@ L4:
  There are 3 issues:
  
  1. Lack of post regalloc LICM.
-2. Poor sub-regclass support. That leads to inability to promote the 16-bit
-   arithmetic op to 32-bit and making use of leal.
-3. LSR unable to reused IV for a different type (i16 vs. i32) even though
+2. LSR unable to reused IV for a different type (i16 vs. i32) even though
     the cast would be free.
  
  //===---------------------------------------------------------------------===//
@@ -452,21 +473,6 @@ require a copy to be inserted (in X86InstrInfo::convertToThreeAddress).
  
  //===---------------------------------------------------------------------===//
  
-Bad codegen:
-
-char foo(int x) { return x; }
-
-_foo:
-       movl 4(%esp), %eax
-       shll $24, %eax
-       sarl $24, %eax
-       ret
-
-SIGN_EXTEND_INREG can be implemented as (sext (trunc)) to take advantage of 
-sub-registers.
-
-//===---------------------------------------------------------------------===//
-
  Consider this:
  
  typedef struct pair { float A, B; } pair;
@@ -988,4 +994,122 @@ _foo:
          movl %edi, %eax
          ret
  
+Another example is:
+
+;; X's live range extends beyond the shift, so the register allocator
+;; cannot coalesce it with Y.  Because of this, a copy needs to be
+;; emitted before the shift to save the register value before it is
+;; clobbered.  However, this copy is not needed if the register
+;; allocator turns the shift into an LEA.  This also occurs for ADD.
+
+; Check that the shift gets turned into an LEA.
+; RUN: llvm-upgrade < %s | llvm-as | llc -march=x86 -x86-asm-syntax=intel | \
+; RUN:   not grep {mov E.X, E.X}
+
+%G = external global int
+
+int %test1(int %X, int %Y) {
+        %Z = add int %X, %Y
+        volatile store int %Y, int* %G
+        volatile store int %Z, int* %G
+        ret int %X
+}
+
+int %test2(int %X) {
+        %Z = add int %X, 1  ;; inc
+        volatile store int %Z, int* %G
+        ret int %X
+}
+
+//===---------------------------------------------------------------------===//
+
+This:
+#include <xmmintrin.h>
+unsigned test(float f) {
+ return _mm_cvtsi128_si32( (__m128i) _mm_set_ss( f ));
+}
+
+Compiles to:
+_test:
+        movss 4(%esp), %xmm0
+        movd %xmm0, %eax
+        ret
+
+it should compile to a move from the stack slot directly into eax.  DAGCombine
+has this xform, but it is currently disabled until the alignment fields of 
+the load/store nodes are trustworthy.
+
+//===---------------------------------------------------------------------===//
+
+Sometimes it is better to codegen subtractions from a constant (e.g. 7-x) with
+a neg instead of a sub instruction.  Consider:
+
+int test(char X) { return 7-X; }
+
+we currently produce:
+_test:
+        movl $7, %eax
+        movsbl 4(%esp), %ecx
+        subl %ecx, %eax
+        ret
+
+We would use one fewer register if codegen'd as:
+
+        movsbl 4(%esp), %eax
+       neg %eax
+        add $7, %eax
+        ret
+
+Note that this isn't beneficial if the load can be folded into the sub.  In
+this case, we want a sub:
+
+int test(int X) { return 7-X; }
+_test:
+        movl $7, %eax
+        subl 4(%esp), %eax
+        ret
+
+//===---------------------------------------------------------------------===//
+
+For code like:
+phi (undef, x)
+
+We get an implicit def on the undef side. If the phi is spilled, we then get:
+implicitdef xmm1
+store xmm1 -> stack
+
+It should be possible to teach the x86 backend to "fold" the store into the
+implicitdef, which just deletes the implicit def.
+
+These instructions should go away:
+#IMPLICIT_DEF %xmm1 
+movaps %xmm1, 192(%esp) 
+movaps %xmm1, 224(%esp) 
+movaps %xmm1, 176(%esp)
+
  //===---------------------------------------------------------------------===//
+
+This is a "commutable two-address" register coallescing deficiency:
+
+define <4 x float> @test1(<4 x float> %V) {
+entry:
+        %tmp8 = shufflevector <4 x float> %V, <4 x float> undef, <4 x i32> < i32 3, i32 2, i32 1, i32 0 >               ; <<4 x float>> [#uses=1]
+        %add = add <4 x float> %tmp8, %V                ; <<4 x float>> [#uses=1]
+        ret <4 x float> %add
+}
+
+this codegens to:
+
+_test1:
+        pshufd  $27, %xmm0, %xmm1
+        addps   %xmm0, %xmm1
+        movaps  %xmm1, %xmm0
+        ret
+
+instead of:
+
+_test1:
+        pshufd  $27, %xmm0, %xmm1
+        addps   %xmm1, %xmm0
+        ret
+