Turn off the old way of handling debug information in the code generator. Use

[oota-llvm.git] / lib / Target / X86 / README.txt
diff --git a/lib/Target/X86/README.txt b/lib/Target/X86/README.txt

index 2374659a519a3bd5270cc5eae2d4298cc481ed81..2b867f4d8dc4428a5ab812e27de461bbd8ea6a8b 100644 (file)
--- a/lib/Target/X86/README.txt
+++ b/lib/Target/X86/README.txt
@@ -369,37 +369,6 @@ There is also one case we do worse on PPC.
  
  //===---------------------------------------------------------------------===//
  
-If shorter, we should use things like:
-movzwl %ax, %eax
-instead of:
-andl $65535, %EAX
-
-The former can also be used when the two-addressy nature of the 'and' would
-require a copy to be inserted (in X86InstrInfo::convertToThreeAddress).
-
-//===---------------------------------------------------------------------===//
-
-Another instruction selector deficiency:
-
-void %bar() {
-       %tmp = load int (int)** %foo
-       %tmp = tail call int %tmp( int 3 )
-       ret void
-}
-
-_bar:
-       subl $12, %esp
-       movl L_foo$non_lazy_ptr, %eax
-       movl (%eax), %eax
-       call *%eax
-       addl $12, %esp
-       ret
-
-The current isel scheme will not allow the load to be folded in the call since
-the load's chain result is read by the callseq_start.
-
-//===---------------------------------------------------------------------===//
-
  For this:
  
  int test(int a)
@@ -425,6 +394,10 @@ estimate to determine whether the match is profitable.
  However, if we care more about code size, then imull is better. It's two bytes
  shorter than movl + leal.
  
+On a Pentium M, both variants have the same characteristics with regard
+to throughput; however, the multiplication has a latency of four cycles, as
+opposed to two cycles for the movl+lea variant.
+
  //===---------------------------------------------------------------------===//
  
  __builtin_ffs codegen is messy.
@@ -659,9 +632,9 @@ imagine there has to be some kind of complicated decoder reset and realignment
  to grab the bytes from the next cacheline.
  
  532  532 0x3cfc movb     (1809(%esp, %esi), %bl   <<<--- spans 2 64 byte lines
-942  942 0x3d03 movl     %dh, (1809(%esp, %esi)                                                                          
-937  937 0x3d0a incl     %esi                           
-3    3   0x3d0b cmpb     %bl, %dl                                               
+942  942 0x3d03 movl     %dh, (1809(%esp, %esi)
+937  937 0x3d0a incl     %esi
+3    3   0x3d0b cmpb     %bl, %dl
  27   27  0x3d0d jnz      0x000062db <main+11707>
  
  //===---------------------------------------------------------------------===//
@@ -1707,3 +1680,185 @@ _test:
         ret
  
  it would be better to codegen as: x+~y  (notl+addl)
+
+//===---------------------------------------------------------------------===//
+
+This code:
+
+int foo(const char *str,...)
+{
+ __builtin_va_list a; int x;
+ __builtin_va_start(a,str); x = __builtin_va_arg(a,int); __builtin_va_end(a);
+ return x;
+}
+
+gets compiled into this on x86-64:
+       subq    $200, %rsp
+        movaps  %xmm7, 160(%rsp)
+        movaps  %xmm6, 144(%rsp)
+        movaps  %xmm5, 128(%rsp)
+        movaps  %xmm4, 112(%rsp)
+        movaps  %xmm3, 96(%rsp)
+        movaps  %xmm2, 80(%rsp)
+        movaps  %xmm1, 64(%rsp)
+        movaps  %xmm0, 48(%rsp)
+        movq    %r9, 40(%rsp)
+        movq    %r8, 32(%rsp)
+        movq    %rcx, 24(%rsp)
+        movq    %rdx, 16(%rsp)
+        movq    %rsi, 8(%rsp)
+        leaq    (%rsp), %rax
+        movq    %rax, 192(%rsp)
+        leaq    208(%rsp), %rax
+        movq    %rax, 184(%rsp)
+        movl    $48, 180(%rsp)
+        movl    $8, 176(%rsp)
+        movl    176(%rsp), %eax
+        cmpl    $47, %eax
+        jbe     .LBB1_3 # bb
+.LBB1_1:        # bb3
+        movq    184(%rsp), %rcx
+        leaq    8(%rcx), %rax
+        movq    %rax, 184(%rsp)
+.LBB1_2:        # bb4
+        movl    (%rcx), %eax
+        addq    $200, %rsp
+        ret
+.LBB1_3:        # bb
+        movl    %eax, %ecx
+        addl    $8, %eax
+        addq    192(%rsp), %rcx
+        movl    %eax, 176(%rsp)
+        jmp     .LBB1_2 # bb4
+
+gcc 4.3 generates:
+       subq    $96, %rsp
+.LCFI0:
+        leaq    104(%rsp), %rax
+        movq    %rsi, -80(%rsp)
+        movl    $8, -120(%rsp)
+        movq    %rax, -112(%rsp)
+        leaq    -88(%rsp), %rax
+        movq    %rax, -104(%rsp)
+        movl    $8, %eax
+        cmpl    $48, %eax
+        jb      .L6
+        movq    -112(%rsp), %rdx
+        movl    (%rdx), %eax
+        addq    $96, %rsp
+        ret
+        .p2align 4,,10
+        .p2align 3
+.L6:
+        mov     %eax, %edx
+        addq    -104(%rsp), %rdx
+        addl    $8, %eax
+        movl    %eax, -120(%rsp)
+        movl    (%rdx), %eax
+        addq    $96, %rsp
+        ret
+
+and it gets compiled into this on x86:
+       pushl   %ebp
+        movl    %esp, %ebp
+        subl    $4, %esp
+        leal    12(%ebp), %eax
+        movl    %eax, -4(%ebp)
+        leal    16(%ebp), %eax
+        movl    %eax, -4(%ebp)
+        movl    12(%ebp), %eax
+        addl    $4, %esp
+        popl    %ebp
+        ret
+
+gcc 4.3 generates:
+       pushl   %ebp
+        movl    %esp, %ebp
+        movl    12(%ebp), %eax
+        popl    %ebp
+        ret
+
+//===---------------------------------------------------------------------===//
+
+Teach tblgen not to check bitconvert source type in some cases. This allows us
+to consolidate the following patterns in X86InstrMMX.td:
+
+def : Pat<(v2i32 (bitconvert (i64 (vector_extract (v2i64 VR128:$src),
+                                                  (iPTR 0))))),
+          (v2i32 (MMX_MOVDQ2Qrr VR128:$src))>;
+def : Pat<(v4i16 (bitconvert (i64 (vector_extract (v2i64 VR128:$src),
+                                                  (iPTR 0))))),
+          (v4i16 (MMX_MOVDQ2Qrr VR128:$src))>;
+def : Pat<(v8i8 (bitconvert (i64 (vector_extract (v2i64 VR128:$src),
+                                                  (iPTR 0))))),
+          (v8i8 (MMX_MOVDQ2Qrr VR128:$src))>;
+
+There are other cases in various td files.
+
+//===---------------------------------------------------------------------===//
+
+Take something like the following on x86-32:
+unsigned a(unsigned long long x, unsigned y) {return x % y;}
+
+We currently generate a libcall, but we really shouldn't: the expansion is
+shorter and likely faster than the libcall.  The expected code is something
+like the following:
+
+       movl    12(%ebp), %eax
+       movl    16(%ebp), %ecx
+       xorl    %edx, %edx
+       divl    %ecx
+       movl    8(%ebp), %eax
+       divl    %ecx
+       movl    %edx, %eax
+       ret
+
+A similar code sequence works for division.
+
+//===---------------------------------------------------------------------===//
+
+These should compile to the same code, but the later codegen's to useless
+instructions on X86. This may be a trivial dag combine (GCC PR7061):
+
+struct s1 { unsigned char a, b; };
+unsigned long f1(struct s1 x) {
+    return x.a + x.b;
+}
+struct s2 { unsigned a: 8, b: 8; };
+unsigned long f2(struct s2 x) {
+    return x.a + x.b;
+}
+
+//===---------------------------------------------------------------------===//
+
+We currently compile this:
+
+define i32 @func1(i32 %v1, i32 %v2) nounwind {
+entry:
+  %t = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %v1, i32 %v2)
+  %sum = extractvalue {i32, i1} %t, 0
+  %obit = extractvalue {i32, i1} %t, 1
+  br i1 %obit, label %overflow, label %normal
+normal:
+  ret i32 %sum
+overflow:
+  call void @llvm.trap()
+  unreachable
+}
+declare {i32, i1} @llvm.sadd.with.overflow.i32(i32, i32)
+declare void @llvm.trap()
+
+to:
+
+_func1:
+       movl    4(%esp), %eax
+       addl    8(%esp), %eax
+       jo      LBB1_2  ## overflow
+LBB1_1:        ## normal
+       ret
+LBB1_2:        ## overflow
+       ud2
+
+it would be nice to produce "into" someday.
+
+//===---------------------------------------------------------------------===//