X-Git-Url: http://demsky.eecs.uci.edu/git/?a=blobdiff_plain;f=lib%2FTarget%2FREADME.txt;h=7e9888cc13e80b48466cb5cca8997819eb07cb92;hb=6902c8db2603287335a7e46257b4530674983116;hp=99e1252dc8aed1584e585bbaccff02842ffa1253;hpb=ed0318eab83cac5250353d8b70bca684ce247d70;p=oota-llvm.git

diff --git a/lib/Target/README.txt b/lib/Target/README.txt
index 99e1252dc8a..7e9888cc13e 100644
--- a/lib/Target/README.txt
+++ b/lib/Target/README.txt
@@ -2,22 +2,6 @@ Target Independent Opportunities:
 
 //===---------------------------------------------------------------------===//
 
-With the recent changes to make the implicit def/use set explicit in
-machineinstrs, we should change the target descriptions for 'call' instructions
-so that the .td files don't list all the call-clobbered registers as implicit
-defs.  Instead, these should be added by the code generator (e.g. on the dag).
-
-This has a number of uses:
-
-1. PPC32/64 and X86 32/64 can avoid having multiple copies of call instructions
-   for their different impdef sets.
-2. Targets with multiple calling convs (e.g. x86) which have different clobber
-   sets don't need copies of call instructions.
-3. 'Interprocedural register allocation' can be done to reduce the clobber sets
-   of calls.
-
-//===---------------------------------------------------------------------===//
-
 We should recognized various "overflow detection" idioms and translate them into
 llvm.uadd.with.overflow and similar intrinsics.  Here is a multiply idiom:
 
@@ -109,44 +93,6 @@ This requires reassociating to forms of expressions that are already available,
 something that reassoc doesn't think about yet.
 
 
-//===---------------------------------------------------------------------===//
-
-This function: (derived from GCC PR19988)
-double foo(double x, double y) {
-  return ((x + 0.1234 * y) * (x + -0.1234 * y));
-}
-
-compiles to:
-_foo:
-	movapd	%xmm1, %xmm2
-	mulsd	LCPI1_1(%rip), %xmm1
-	mulsd	LCPI1_0(%rip), %xmm2
-	addsd	%xmm0, %xmm1
-	addsd	%xmm0, %xmm2
-	movapd	%xmm1, %xmm0
-	mulsd	%xmm2, %xmm0
-	ret
-
-Reassociate should be able to turn it into:
-
-double foo(double x, double y) {
-  return ((x + 0.1234 * y) * (x - 0.1234 * y));
-}
-
-Which allows the multiply by constant to be CSE'd, producing:
-
-_foo:
-	mulsd	LCPI1_0(%rip), %xmm1
-	movapd	%xmm1, %xmm2
-	addsd	%xmm0, %xmm2
-	subsd	%xmm1, %xmm0
-	mulsd	%xmm2, %xmm0
-	ret
-
-This doesn't need -ffast-math support at all.  This is particularly bad because
-the llvm-gcc frontend is canonicalizing the later into the former, but clang
-doesn't have this problem.
-
 //===---------------------------------------------------------------------===//
 
 These two functions should generate the same code on big-endian systems:
@@ -160,7 +106,7 @@ for 1,2,4,8 bytes.
 //===---------------------------------------------------------------------===//
 
 It would be nice to revert this patch:
-http://lists.cs.uiuc.edu/pipermail/llvm-commits/Week-of-Mon-20060213/031986.html
+http://lists.llvm.org/pipermail/llvm-commits/Week-of-Mon-20060213/031986.html
 
 And teach the dag combiner enough to simplify the code expanded before 
 legalize.  It seems plausible that this knowledge would let it simplify other
@@ -168,7 +114,7 @@ stuff too.
 
 //===---------------------------------------------------------------------===//
 
-For vector types, TargetData.cpp::getTypeInfo() returns alignment that is equal
+For vector types, DataLayout.cpp::getTypeInfo() returns alignment that is equal
 to the type size. It works but can be overly conservative as the alignment of
 specific vector types are target dependent.
 
@@ -278,22 +224,7 @@ unsigned countbits_slow(unsigned v) {
     c += v & 1;
   return c;
 }
-unsigned countbits_fast(unsigned v){
-  unsigned c;
-  for (c = 0; v; c++)
-    v &= v - 1; // clear the least significant bit set
-  return c;
-}
 
-BITBOARD = unsigned long long
-int PopCnt(register BITBOARD a) {
-  register int c=0;
-  while(a) {
-    c++;
-    a &= a - 1;
-  }
-  return c;
-}
 unsigned int popcount(unsigned int input) {
   unsigned int count = 0;
   for (unsigned int i =  0; i < 4 * 8; i++)
@@ -802,7 +733,7 @@ f (unsigned long a, unsigned long b, unsigned long c)
   return ((a & (c - 1)) != 0) | ((b & (c - 1)) != 0);
 }
 Both should combine to ((a|b) & (c-1)) != 0.  Currently not optimized with
-"clang -emit-llvm-bc | opt -std-compile-opts".
+"clang -emit-llvm-bc | opt -O3".
 
 //===---------------------------------------------------------------------===//
 
@@ -815,7 +746,7 @@ void clear_pmd_range(unsigned long start, unsigned long end)
 }
 The expression should optimize to something like
 "!((start|end)&~PMD_MASK). Currently not optimized with "clang
--emit-llvm-bc | opt -std-compile-opts".
+-emit-llvm-bc | opt -O3".
 
 //===---------------------------------------------------------------------===//
 
@@ -834,7 +765,7 @@ int f(int x, int y)
  return (abs(x)) >= 0;
 }
 This should optimize to x == INT_MIN. (With -fwrapv.)  Currently not
-optimized with "clang -emit-llvm-bc | opt -std-compile-opts".
+optimized with "clang -emit-llvm-bc | opt -O3".
 
 //===---------------------------------------------------------------------===//
 
@@ -870,99 +801,119 @@ rshift_gt (unsigned int a)
    bar ();
 }
 
-void neg_eq_cst(unsigned int a) {
-if (-a == 123)
-bar();
-}
-
 All should simplify to a single comparison.  All of these are
 currently not optimized with "clang -emit-llvm-bc | opt
--std-compile-opts".
+-O3".
 
 //===---------------------------------------------------------------------===//
 
 From GCC Bug 32605:
 int c(int* x) {return (char*)x+2 == (char*)x;}
 Should combine to 0.  Currently not optimized with "clang
--emit-llvm-bc | opt -std-compile-opts" (although llc can optimize it).
+-emit-llvm-bc | opt -O3" (although llc can optimize it).
 
 //===---------------------------------------------------------------------===//
 
 int a(unsigned b) {return ((b << 31) | (b << 30)) >> 31;}
 Should be combined to  "((b >> 1) | b) & 1".  Currently not optimized
-with "clang -emit-llvm-bc | opt -std-compile-opts".
+with "clang -emit-llvm-bc | opt -O3".
 
 //===---------------------------------------------------------------------===//
 
 unsigned a(unsigned x, unsigned y) { return x | (y & 1) | (y & 2);}
 Should combine to "x | (y & 3)".  Currently not optimized with "clang
--emit-llvm-bc | opt -std-compile-opts".
+-emit-llvm-bc | opt -O3".
 
 //===---------------------------------------------------------------------===//
 
 int a(int a, int b, int c) {return (~a & c) | ((c|a) & b);}
 Should fold to "(~a & c) | (a & b)".  Currently not optimized with
-"clang -emit-llvm-bc | opt -std-compile-opts".
+"clang -emit-llvm-bc | opt -O3".
 
 //===---------------------------------------------------------------------===//
 
 int a(int a,int b) {return (~(a|b))|a;}
 Should fold to "a|~b".  Currently not optimized with "clang
--emit-llvm-bc | opt -std-compile-opts".
+-emit-llvm-bc | opt -O3".
 
 //===---------------------------------------------------------------------===//
 
 int a(int a, int b) {return (a&&b) || (a&&!b);}
 Should fold to "a".  Currently not optimized with "clang -emit-llvm-bc
-| opt -std-compile-opts".
+| opt -O3".
 
 //===---------------------------------------------------------------------===//
 
 int a(int a, int b, int c) {return (a&&b) || (!a&&c);}
 Should fold to "a ? b : c", or at least something sane.  Currently not
-optimized with "clang -emit-llvm-bc | opt -std-compile-opts".
+optimized with "clang -emit-llvm-bc | opt -O3".
 
 //===---------------------------------------------------------------------===//
 
 int a(int a, int b, int c) {return (a&&b) || (a&&c) || (a&&b&&c);}
 Should fold to a && (b || c).  Currently not optimized with "clang
--emit-llvm-bc | opt -std-compile-opts".
+-emit-llvm-bc | opt -O3".
 
 //===---------------------------------------------------------------------===//
 
 int a(int x) {return x | ((x & 8) ^ 8);}
 Should combine to x | 8.  Currently not optimized with "clang
--emit-llvm-bc | opt -std-compile-opts".
+-emit-llvm-bc | opt -O3".
 
 //===---------------------------------------------------------------------===//
 
 int a(int x) {return x ^ ((x & 8) ^ 8);}
 Should also combine to x | 8.  Currently not optimized with "clang
--emit-llvm-bc | opt -std-compile-opts".
+-emit-llvm-bc | opt -O3".
 
 //===---------------------------------------------------------------------===//
 
 int a(int x) {return ((x | -9) ^ 8) & x;}
 Should combine to x & -9.  Currently not optimized with "clang
--emit-llvm-bc | opt -std-compile-opts".
+-emit-llvm-bc | opt -O3".
 
 //===---------------------------------------------------------------------===//
 
 unsigned a(unsigned a) {return a * 0x11111111 >> 28 & 1;}
 Should combine to "a * 0x88888888 >> 31".  Currently not optimized
-with "clang -emit-llvm-bc | opt -std-compile-opts".
+with "clang -emit-llvm-bc | opt -O3".
 
 //===---------------------------------------------------------------------===//
 
 unsigned a(char* x) {if ((*x & 32) == 0) return b();}
 There's an unnecessary zext in the generated code with "clang
--emit-llvm-bc | opt -std-compile-opts".
+-emit-llvm-bc | opt -O3".
 
 //===---------------------------------------------------------------------===//
 
 unsigned a(unsigned long long x) {return 40 * (x >> 1);}
 Should combine to "20 * (((unsigned)x) & -2)".  Currently not
-optimized with "clang -emit-llvm-bc | opt -std-compile-opts".
+optimized with "clang -emit-llvm-bc | opt -O3".
+
+//===---------------------------------------------------------------------===//
+
+int g(int x) { return (x - 10) < 0; }
+Should combine to "x <= 9" (the sub has nsw).  Currently not
+optimized with "clang -emit-llvm-bc | opt -O3".
+
+//===---------------------------------------------------------------------===//
+
+int g(int x) { return (x + 10) < 0; }
+Should combine to "x < -10" (the add has nsw).  Currently not
+optimized with "clang -emit-llvm-bc | opt -O3".
+
+//===---------------------------------------------------------------------===//
+
+int f(int i, int j) { return i < j + 1; }
+int g(int i, int j) { return j > i - 1; }
+Should combine to "i <= j" (the add/sub has nsw).  Currently not
+optimized with "clang -emit-llvm-bc | opt -O3".
+
+//===---------------------------------------------------------------------===//
+
+unsigned f(unsigned x) { return ((x & 7) + 1) & 15; }
+The & 15 part should be optimized away, it doesn't change the result. Currently
+not optimized with "clang -emit-llvm-bc | opt -O3".
 
 //===---------------------------------------------------------------------===//
 
@@ -1174,7 +1125,7 @@ There are many load PRE testcases in testsuite/gcc.dg/tree-ssa/loadpre* in the
 GCC testsuite, ones we don't get yet are (checked through loadpre25):
 
 [CRIT EDGE BREAKING]
-loadpre3.c predcom-4.c
+predcom-4.c
 
 [PRE OF READONLY CALL]
 loadpre5.c
@@ -1317,7 +1268,8 @@ int foo (void) {
 ..
   else if (strchr ("<>", *intel_parser.op_string)
 
-Those should be turned into a switch.
+Those should be turned into a switch.  SimplifyLibCalls only gets the second
+case.
 
 //===---------------------------------------------------------------------===//
 
@@ -1767,7 +1719,6 @@ case it choses instead to keep the max operation obvious.
 
 //===---------------------------------------------------------------------===//
 
-Switch lowering generates less than ideal code for the following switch:
 define void @a(i32 %x) nounwind {
 entry:
   switch i32 %x, label %if.end [
@@ -1788,19 +1739,15 @@ declare void @foo()
 Generated code on x86-64 (other platforms give similar results):
 a:
 	cmpl	$5, %edi
-	ja	.LBB0_2
-	movl	%edi, %eax
-	movl	$47, %ecx
-	btq	%rax, %rcx
-	jb	.LBB0_3
+	ja	LBB2_2
+	cmpl	$4, %edi
+	jne	LBB2_3
 .LBB0_2:
 	ret
 .LBB0_3:
 	jmp	foo  # TAILCALL
 
-The movl+movl+btq+jb could be simplified to a cmpl+jne.
-
-Or, if we wanted to be really clever, we could simplify the whole thing to
+If we wanted to be really clever, we could simplify the whole thing to
 something like the following, which eliminates a branch:
 	xorl    $1, %edi
 	cmpl	$4, %edi
@@ -1897,44 +1844,6 @@ we remove checking in code like
 
 //===---------------------------------------------------------------------===//
 
-This code (from Benchmarks/Dhrystone/dry.c):
-
-define i32 @Func1(i32, i32) nounwind readnone optsize ssp {
-entry:
-  %sext = shl i32 %0, 24
-  %conv = ashr i32 %sext, 24
-  %sext6 = shl i32 %1, 24
-  %conv4 = ashr i32 %sext6, 24
-  %cmp = icmp eq i32 %conv, %conv4
-  %. = select i1 %cmp, i32 10000, i32 0
-  ret i32 %.
-}
-
-Should be simplified into something like:
-
-define i32 @Func1(i32, i32) nounwind readnone optsize ssp {
-entry:
-  %sext = shl i32 %0, 24
-  %conv = and i32 %sext, 0xFF000000
-  %sext6 = shl i32 %1, 24
-  %conv4 = and i32 %sext6, 0xFF000000
-  %cmp = icmp eq i32 %conv, %conv4
-  %. = select i1 %cmp, i32 10000, i32 0
-  ret i32 %.
-}
-
-and then to:
-
-define i32 @Func1(i32, i32) nounwind readnone optsize ssp {
-entry:
-  %conv = and i32 %0, 0xFF
-  %conv4 = and i32 %1, 0xFF
-  %cmp = icmp eq i32 %conv, %conv4
-  %. = select i1 %cmp, i32 10000, i32 0
-  ret i32 %.
-}
-//===---------------------------------------------------------------------===//
-
 clang -O3 currently compiles this code
 
 int g(unsigned int a) {
@@ -2259,52 +2168,6 @@ icmp transform.
 
 //===---------------------------------------------------------------------===//
 
-We should optimize this:
-
-  %tmp = load i16* %arrayidx, align 4, !tbaa !0
-  %A = trunc i16 %tmp to i8
-  %cmp = icmp eq i8 %A, 127
-  %B.mask = and i16 %tmp, -256
-  %cmp7 = icmp eq i16 %B.mask, 17664
-  %or.cond = and i1 %cmp, %cmp7
-  br i1 %or.cond, label %land.lhs.true9, label %if.end
-
-into:
-
-  %tmp = load i16* %arrayidx, align 4, !tbaa !0
-  %0 = icmp eq i16 %tmp, 17791
-  br i1 %0, label %land.lhs.true9, label %if.end
-
-with this patch:
-Index: InstCombine/InstCombineCompares.cpp
-===================================================================
---- InstCombine/InstCombineCompares.cpp	(revision 129500)
-+++ InstCombine/InstCombineCompares.cpp	(working copy)
-@@ -2506,6 +2506,18 @@
-         return &I;
-       }
-     }
-+    
-+    // Transform "icmp eq (trunc X), cst" to "icmp (and X, mask), cst"
-+    if (Op0->hasOneUse() && match(Op0, m_Trunc(m_Value(A))) &&
-+        isa<ConstantInt>(Op1)) {
-+      APInt MaskV = APInt::getLowBitsSet(A->getType()->getPrimitiveSizeInBits(),
-+                                      Op0->getType()->getPrimitiveSizeInBits());
-+      Value *Mask =
-+        Builder->CreateAnd(A, ConstantInt::get(A->getContext(), MaskV));
-+      return new ICmpInst(I.getPredicate(), Mask,
-+                          ConstantExpr::getZExt(cast<ConstantInt>(Op1),
-+                                                Mask->getType()));
-+    }
-   }
-   
-   {
-
-
-Not having this is blocking resolving PR6627.
-
-//===---------------------------------------------------------------------===//
-
 This code:
 
 typedef struct {
@@ -2351,4 +2214,66 @@ The two or/and's should be merged into one each.
 
 //===---------------------------------------------------------------------===//
 
+Machine level code hoisting can be useful in some cases.  For example, PR9408
+is about:
+
+typedef union {
+ void (*f1)(int);
+ void (*f2)(long);
+} funcs;
+
+void foo(funcs f, int which) {
+ int a = 5;
+ if (which) {
+   f.f1(a);
+ } else {
+   f.f2(a);
+ }
+}
+
+which we compile to:
+
+foo:                                    # @foo
+# BB#0:                                 # %entry
+       pushq   %rbp
+       movq    %rsp, %rbp
+       testl   %esi, %esi
+       movq    %rdi, %rax
+       je      .LBB0_2
+# BB#1:                                 # %if.then
+       movl    $5, %edi
+       callq   *%rax
+       popq    %rbp
+       ret
+.LBB0_2:                                # %if.else
+       movl    $5, %edi
+       callq   *%rax
+       popq    %rbp
+       ret
+
+Note that bb1 and bb2 are the same.  This doesn't happen at the IR level
+because one call is passing an i32 and the other is passing an i64.
+
+//===---------------------------------------------------------------------===//
 
+I see this sort of pattern in 176.gcc in a few places (e.g. the start of
+store_bit_field).  The rem should be replaced with a multiply and subtract:
+
+  %3 = sdiv i32 %A, %B
+  %4 = srem i32 %A, %B
+
+Similarly for udiv/urem.  Note that this shouldn't be done on X86 or ARM,
+which can do this in a single operation (instruction or libcall).  It is
+probably best to do this in the code generator.
+
+//===---------------------------------------------------------------------===//
+
+unsigned foo(unsigned x, unsigned y) { return (x & y) == 0 || x == 0; }
+should fold to (x & y) == 0.
+
+//===---------------------------------------------------------------------===//
+
+unsigned foo(unsigned x, unsigned y) { return x > y && x != 0; }
+should fold to x > y.
+
+//===---------------------------------------------------------------------===//