Reinitialize the ivars in the subtarget.

[oota-llvm.git] / lib / Target / X86 / README.txt
diff --git a/lib/Target/X86/README.txt b/lib/Target/X86/README.txt

index a7a477d2d5b833b5509af047cf454fd7fb865b1c..b4285a0718799c379eb2a281f159d2c1a42784ed 100644 (file)
--- a/lib/Target/X86/README.txt
+++ b/lib/Target/X86/README.txt
@@ -2,19 +2,6 @@
  // Random ideas for the X86 backend.
  //===---------------------------------------------------------------------===//
  
-We should add support for the "movbe" instruction, which does a byte-swapping
-copy (3-addr bswap + memory support?)  This is available on Atom processors.
-
-//===---------------------------------------------------------------------===//
-
-CodeGen/X86/lea-3.ll:test3 should be a single LEA, not a shift/move.  The X86
-backend knows how to three-addressify this shift, but it appears the register
-allocator isn't even asking it to do so in this case.  We should investigate
-why this isn't happening, it could have significant impact on other important
-cases for X86 as well.
-
-//===---------------------------------------------------------------------===//
-
  This should be one DIV/IDIV instruction, not a libcall:
  
  unsigned test(unsigned long long X, unsigned Y) {
@@ -69,7 +56,7 @@ cmovs, we should expand to a conditional branch like GCC produces.
  
  Some isel ideas:
  
-1. Dynamic programming based approach when compile time if not an
+1. Dynamic programming based approach when compile time is not an
     issue.
  2. Code duplication (addressing mode) during isel.
  3. Other ideas from "Register-Sensitive Selection, Duplication, and
@@ -710,23 +697,17 @@ This:
          { return !full_add(a, b).second; }
  
  Should compile to:
+       addl    %esi, %edi
+       setae   %al
+       movzbl  %al, %eax
+       ret
  
-
-        _Z11no_overflowjj:
-                addl    %edi, %esi
-                setae   %al
-                ret
-
-FIXME: That code looks wrong; bool return is normally defined as zext.
-
-on x86-64, not:
-
-__Z11no_overflowjj:
-        addl    %edi, %esi
-        cmpl    %edi, %esi
-        setae   %al
-        movzbl  %al, %eax
-        ret
+on x86-64, instead of the rather stupid-looking:
+       addl    %esi, %edi
+       setb    %al
+       xorb    $1, %al
+       movzbl  %al, %eax
+       ret
  
  
  //===---------------------------------------------------------------------===//
@@ -994,10 +975,10 @@ _foo:
  
  instead of:
  _foo:
-        movl    $255, %eax
-        orl     4(%esp), %eax
-        andl    $65535, %eax
-        ret
+       movl    $65280, %eax
+       andl    4(%esp), %eax
+       orl     $255, %eax
+       ret
  
  //===---------------------------------------------------------------------===//
  
@@ -1236,7 +1217,7 @@ Also check why xmm7 is not used at all in the function.
  
  Take the following:
  
-target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-S128"
  target triple = "i386-apple-darwin8"
  @in_exit.4870.b = internal global i1 false             ; <i1*> [#uses=2]
  define fastcc void @abort_gzip() noreturn nounwind  {
@@ -1578,7 +1559,7 @@ Implement processor-specific optimizations for parity with GCC on these
  processors.  GCC does two optimizations:
  
  1. ix86_pad_returns inserts a noop before ret instructions if immediately
-   preceeded by a conditional branch or is the target of a jump.
+   preceded by a conditional branch or is the target of a jump.
  2. ix86_avoid_jump_misspredicts inserts noops in cases where a 16-byte block of
     code contains more than 3 branches.
     
@@ -1586,43 +1567,6 @@ The first one is done for all AMDs, Core2, and "Generic"
  The second one is done for: Atom, Pentium Pro, all AMDs, Pentium 4, Nocona,
    Core 2, and "Generic"
  
-//===---------------------------------------------------------------------===//
-
-Testcase:
-int a(int x) { return (x & 127) > 31; }
-
-Current output:
-       movl    4(%esp), %eax
-       andl    $127, %eax
-       cmpl    $31, %eax
-       seta    %al
-       movzbl  %al, %eax
-       ret
-
-Ideal output:
-       xorl    %eax, %eax
-       testl   $96, 4(%esp)
-       setne   %al
-       ret
-
-This should definitely be done in instcombine, canonicalizing the range
-condition into a != condition.  We get this IR:
-
-define i32 @a(i32 %x) nounwind readnone {
-entry:
-       %0 = and i32 %x, 127            ; <i32> [#uses=1]
-       %1 = icmp ugt i32 %0, 31                ; <i1> [#uses=1]
-       %2 = zext i1 %1 to i32          ; <i32> [#uses=1]
-       ret i32 %2
-}
-
-Instcombine prefers to strength reduce relational comparisons to equality
-comparisons when possible, this should be another case of that.  This could
-be handled pretty easily in InstCombiner::visitICmpInstWithInstAndIntCst, but it
-looks like InstCombiner::visitICmpInstWithInstAndIntCst should really already
-be redesigned to use ComputeMaskedBits and friends.
-
-
  //===---------------------------------------------------------------------===//
  Testcase:
  int x(int a) { return (a&0xf0)>>4; }
@@ -1662,28 +1606,61 @@ information to add the "lock" prefix.
  
  //===---------------------------------------------------------------------===//
  
-_Bool bar(int *x) { return *x & 1; }
+struct B {
+  unsigned char y0 : 1;
+};
  
-define zeroext i1 @bar(i32* nocapture %x) nounwind readonly {
-entry:
-  %tmp1 = load i32* %x                            ; <i32> [#uses=1]
-  %and = and i32 %tmp1, 1                         ; <i32> [#uses=1]
-  %tobool = icmp ne i32 %and, 0                   ; <i1> [#uses=1]
-  ret i1 %tobool
+int bar(struct B* a) { return a->y0; }
+
+define i32 @bar(%struct.B* nocapture %a) nounwind readonly optsize {
+  %1 = getelementptr inbounds %struct.B* %a, i64 0, i32 0
+  %2 = load i8* %1, align 1
+  %3 = and i8 %2, 1
+  %4 = zext i8 %3 to i32
+  ret i32 %4
  }
  
-bar:                                                        # @bar
-# BB#0:                                                     # %entry
-       movl    4(%esp), %eax
-       movb    (%eax), %al
-       andb    $1, %al
-       movzbl  %al, %eax
-       ret
+bar:                                    # @bar
+# BB#0:
+        movb    (%rdi), %al
+        andb    $1, %al
+        movzbl  %al, %eax
+        ret
  
  Missed optimization: should be movl+andl.
  
  //===---------------------------------------------------------------------===//
  
+The x86_64 abi says:
+
+Booleans, when stored in a memory object, are stored as single byte objects the
+value of which is always 0 (false) or 1 (true).
+
+We are not using this fact:
+
+int bar(_Bool *a) { return *a; }
+
+define i32 @bar(i8* nocapture %a) nounwind readonly optsize {
+  %1 = load i8* %a, align 1, !tbaa !0
+  %tmp = and i8 %1, 1
+  %2 = zext i8 %tmp to i32
+  ret i32 %2
+}
+
+bar:
+        movb    (%rdi), %al
+        andb    $1, %al
+        movzbl  %al, %eax
+        ret
+
+GCC produces
+
+bar:
+        movzbl  (%rdi), %eax
+        ret
+
+//===---------------------------------------------------------------------===//
+
  Consider the following two functions compiled with clang:
  _Bool foo(int *x) { return !(*x & 4); }
  unsigned bar(int *x) { return !(*x & 4); }
@@ -1708,26 +1685,6 @@ are functionally identical.
  
  //===---------------------------------------------------------------------===//
  
-Take the following C code:
-int x(int y) { return (y & 63) << 14; }
-
-Code produced by gcc:
-       andl    $63, %edi
-       sall    $14, %edi
-       movl    %edi, %eax
-       ret
-
-Code produced by clang:
-       shll    $14, %edi
-       movl    %edi, %eax
-       andl    $1032192, %eax
-       ret
-
-The code produced by gcc is 3 bytes shorter.  This sort of construct often
-shows up with bitfields.
-
-//===---------------------------------------------------------------------===//
-
  Take the following C code:
  int f(int a, int b) { return (unsigned char)a == (unsigned char)b; }
  
@@ -1885,39 +1842,202 @@ _add32carry:
  
  //===---------------------------------------------------------------------===//
  
-This:
-char t(char c) {
-  return c/3;
+The hot loop of 256.bzip2 contains code that looks a bit like this:
+
+int foo(char *P, char *Q, int x, int y) {
+  if (P[0] != Q[0])
+     return P[0] < Q[0];
+  if (P[1] != Q[1])
+     return P[1] < Q[1];
+  if (P[2] != Q[2])
+     return P[2] < Q[2];
+   return P[3] < Q[3];
  }
  
-Compiles to: $clang t.c -S -o - -O3 -mkernel -fomit-frame-pointer
+In the real code, we get a lot more wrong than this.  However, even in this
+code we generate:
  
-_t:                                     ## @t
-       movslq  %edi, %rax
-       imulq   $-1431655765, %rax, %rcx ## imm = 0xFFFFFFFFAAAAAAAB
-       shrq    $32, %rcx
-       addl    %ecx, %eax
-       movl    %eax, %ecx
-       shrl    $31, %ecx
-       shrl    %eax
-       addl    %ecx, %eax
-       movsbl  %al, %eax
+_foo:                                   ## @foo
+## BB#0:                                ## %entry
+       movb    (%rsi), %al
+       movb    (%rdi), %cl
+       cmpb    %al, %cl
+       je      LBB0_2
+LBB0_1:                                 ## %if.then
+       cmpb    %al, %cl
+       jmp     LBB0_5
+LBB0_2:                                 ## %if.end
+       movb    1(%rsi), %al
+       movb    1(%rdi), %cl
+       cmpb    %al, %cl
+       jne     LBB0_1
+## BB#3:                                ## %if.end38
+       movb    2(%rsi), %al
+       movb    2(%rdi), %cl
+       cmpb    %al, %cl
+       jne     LBB0_1
+## BB#4:                                ## %if.end60
+       movb    3(%rdi), %al
+       cmpb    3(%rsi), %al
+LBB0_5:                                 ## %if.end60
+       setl    %al
+       movzbl  %al, %eax
         ret
  
-GCC gets:
+Note that we generate jumps to LBB0_1 which does a redundant compare.  The
+redundant compare also forces the register values to be live, which prevents
+folding one of the loads into the compare.  In contrast, GCC 4.2 produces:
  
-_t:
-       movl    $86, %eax
-       imulb   %dil
-       shrw    $8, %ax
-       sarb    $7, %dil
-       subb    %dil, %al
-       movsbl  %al,%eax
+_foo:
+       movzbl  (%rsi), %eax
+       cmpb    %al, (%rdi)
+       jne     L10
+L12:
+       movzbl  1(%rsi), %eax
+       cmpb    %al, 1(%rdi)
+       jne     L10
+       movzbl  2(%rsi), %eax
+       cmpb    %al, 2(%rdi)
+       jne     L10
+       movzbl  3(%rdi), %eax
+       cmpb    3(%rsi), %al
+L10:
+       setl    %al
+       movzbl  %al, %eax
+       ret
+
+which is "perfect".
+
+//===---------------------------------------------------------------------===//
+
+For the branch in the following code:
+int a();
+int b(int x, int y) {
+  if (x & (1<<(y&7)))
+    return a();
+  return y;
+}
+
+We currently generate:
+       movb    %sil, %al
+       andb    $7, %al
+       movzbl  %al, %eax
+       btl     %eax, %edi
+       jae     .LBB0_2
+
+movl+andl would be shorter than the movb+andb+movzbl sequence.
+
+//===---------------------------------------------------------------------===//
+
+For the following:
+struct u1 {
+    float x, y;
+};
+float foo(struct u1 u) {
+    return u.x + u.y;
+}
+
+We currently generate:
+       movdqa  %xmm0, %xmm1
+       pshufd  $1, %xmm0, %xmm0        # xmm0 = xmm0[1,0,0,0]
+       addss   %xmm1, %xmm0
         ret
  
-which is nicer.  This also happens for int, not just char.
+We could save an instruction here by commuting the addss.
  
  //===---------------------------------------------------------------------===//
  
+This (from PR9661):
  
+float clamp_float(float a) {
+        if (a > 1.0f)
+                return 1.0f;
+        else if (a < 0.0f)
+                return 0.0f;
+        else
+                return a;
+}
  
+Could compile to:
+
+clamp_float:                            # @clamp_float
+        movss   .LCPI0_0(%rip), %xmm1
+        minss   %xmm1, %xmm0
+        pxor    %xmm1, %xmm1
+        maxss   %xmm1, %xmm0
+        ret
+
+with -ffast-math.
+
+//===---------------------------------------------------------------------===//
+
+This function (from PR9803):
+
+int clamp2(int a) {
+        if (a > 5)
+                a = 5;
+        if (a < 0) 
+                return 0;
+        return a;
+}
+
+Compiles to:
+
+_clamp2:                                ## @clamp2
+        pushq   %rbp
+        movq    %rsp, %rbp
+        cmpl    $5, %edi
+        movl    $5, %ecx
+        cmovlel %edi, %ecx
+        testl   %ecx, %ecx
+        movl    $0, %eax
+        cmovnsl %ecx, %eax
+        popq    %rbp
+        ret
+
+The move of 0 could be scheduled above the test to make it is xor reg,reg.
+
+//===---------------------------------------------------------------------===//
+
+GCC PR48986.  We currently compile this:
+
+void bar(void);
+void yyy(int* p) {
+    if (__sync_fetch_and_add(p, -1) == 1)
+      bar();
+}
+
+into:
+       movl    $-1, %eax
+       lock
+       xaddl   %eax, (%rdi)
+       cmpl    $1, %eax
+       je      LBB0_2
+
+Instead we could generate:
+
+       lock
+       dec %rdi
+       je LBB0_2
+
+The trick is to match "fetch_and_add(X, -C) == C".
+
+//===---------------------------------------------------------------------===//
+
+unsigned t(unsigned a, unsigned b) {
+  return a <= b ? 5 : -5;
+}
+
+We generate:
+       movl    $5, %ecx
+       cmpl    %esi, %edi
+       movl    $-5, %eax
+       cmovbel %ecx, %eax
+
+GCC:
+       cmpl    %edi, %esi
+       sbbl    %eax, %eax
+       andl    $-10, %eax
+       addl    $5, %eax
+
+//===---------------------------------------------------------------------===//