Add NVExtFrm to represent NEON Vector Extract instructions, which use Inst{11-8}

[oota-llvm.git] / lib / Target / README.txt
diff --git a/lib/Target/README.txt b/lib/Target/README.txt

index 2d8a687ebebb5c29b3885141f8f86ecf7e4dd528..052a5756da9faafdd4b97653c36fadb00cc76048 100644 (file)
--- a/lib/Target/README.txt
+++ b/lib/Target/README.txt
@@ -106,7 +106,17 @@ Shrink: (setlt (loadi32 P), 0) -> (setlt (loadi8 Phi), 0)
  
  //===---------------------------------------------------------------------===//
  
-Reassociate should turn: X*X*X*X -> t=(X*X) (t*t) to eliminate a multiply.
+Reassociate should turn things like:
+
+int factorial(int X) {
+ return X*X*X*X*X*X*X*X;
+}
+
+into llvm.powi calls, allowing the code generator to produce balanced
+multiplication trees.
+
+First, the intrinsic needs to be extended to support integers, and second the
+code generator needs to be enhanced to lower these to multiplication trees.
  
  //===---------------------------------------------------------------------===//
  
@@ -119,7 +129,71 @@ int foo(int z, int n) {
    return bar(z, n) + bar(2*z, 2*n);
  }
  
-Reassociate should handle the example in GCC PR16157.
+This is blocked on not handling X*X*X -> powi(X, 3) (see note above).  The issue
+is that we end up getting t = 2*X  s = t*t   and don't turn this into 4*X*X,
+which is the same number of multiplies and is canonical, because the 2*X has
+multiple uses.  Here's a simple example:
+
+define i32 @test15(i32 %X1) {
+  %B = mul i32 %X1, 47   ; X1*47
+  %C = mul i32 %B, %B
+  ret i32 %C
+}
+
+
+//===---------------------------------------------------------------------===//
+
+Reassociate should handle the example in GCC PR16157:
+
+extern int a0, a1, a2, a3, a4; extern int b0, b1, b2, b3, b4; 
+void f () {  /* this can be optimized to four additions... */ 
+        b4 = a4 + a3 + a2 + a1 + a0; 
+        b3 = a3 + a2 + a1 + a0; 
+        b2 = a2 + a1 + a0; 
+        b1 = a1 + a0; 
+} 
+
+This requires reassociating to forms of expressions that are already available,
+something that reassoc doesn't think about yet.
+
+
+//===---------------------------------------------------------------------===//
+
+This function: (derived from GCC PR19988)
+double foo(double x, double y) {
+  return ((x + 0.1234 * y) * (x + -0.1234 * y));
+}
+
+compiles to:
+_foo:
+       movapd  %xmm1, %xmm2
+       mulsd   LCPI1_1(%rip), %xmm1
+       mulsd   LCPI1_0(%rip), %xmm2
+       addsd   %xmm0, %xmm1
+       addsd   %xmm0, %xmm2
+       movapd  %xmm1, %xmm0
+       mulsd   %xmm2, %xmm0
+       ret
+
+Reassociate should be able to turn it into:
+
+double foo(double x, double y) {
+  return ((x + 0.1234 * y) * (x - 0.1234 * y));
+}
+
+Which allows the multiply by constant to be CSE'd, producing:
+
+_foo:
+       mulsd   LCPI1_0(%rip), %xmm1
+       movapd  %xmm1, %xmm2
+       addsd   %xmm0, %xmm2
+       subsd   %xmm1, %xmm0
+       mulsd   %xmm2, %xmm0
+       ret
+
+This doesn't need -ffast-math support at all.  This is particularly bad because
+the llvm-gcc frontend is canonicalizing the latter into the former, but clang
+doesn't have this problem.
  
  //===---------------------------------------------------------------------===//
  
@@ -202,24 +276,6 @@ define void @test(i32* %P) {
  
  //===---------------------------------------------------------------------===//
  
-dag/inst combine "clz(x)>>5 -> x==0" for 32-bit x.
-
-Compile:
-
-int bar(int x)
-{
-  int t = __builtin_clz(x);
-  return -(t>>5);
-}
-
-to:
-
-_bar:   addic r3,r3,-1
-        subfe r3,r3,r3
-        blr
-
-//===---------------------------------------------------------------------===//
-
  quantum_sigma_x in 462.libquantum contains the following loop:
  
        for(i=0; i<reg->size; i++)
@@ -247,19 +303,6 @@ this requires TBAA.
  
  //===---------------------------------------------------------------------===//
  
-This should be optimized to one 'and' and one 'or', from PR4216:
-
-define i32 @test_bitfield(i32 %bf.prev.low) nounwind ssp {
-entry:
-  %bf.prev.lo.cleared10 = or i32 %bf.prev.low, 32962 ; <i32> [#uses=1]
-  %0 = and i32 %bf.prev.low, -65536               ; <i32> [#uses=1]
-  %1 = and i32 %bf.prev.lo.cleared10, 40186       ; <i32> [#uses=1]
-  %2 = or i32 %1, %0                              ; <i32> [#uses=1]
-  ret i32 %2
-}
-
-//===---------------------------------------------------------------------===//
-
  This isn't recognized as bswap by instcombine (yes, it really is bswap):
  
  unsigned long reverse(unsigned v) {
@@ -272,6 +315,8 @@ unsigned long reverse(unsigned v) {
  
  //===---------------------------------------------------------------------===//
  
+[LOOP RECOGNITION]
+
  These idioms should be recognized as popcount (see PR1488):
  
  unsigned countbits_slow(unsigned v) {
@@ -334,12 +379,36 @@ this construct.
  
  //===---------------------------------------------------------------------===//
  
+[LOOP RECOGNITION]
+
  viterbi speeds up *significantly* if the various "history" related copy loops
  are turned into memcpy calls at the source level.  We need a "loops to memcpy"
  pass.
  
  //===---------------------------------------------------------------------===//
  
+[LOOP OPTIMIZATION]
+
+SingleSource/Benchmarks/Misc/dt.c shows several interesting optimization
+opportunities in its double_array_divs_variable function: it needs loop
+interchange, memory promotion (which LICM already does), vectorization and
+variable trip count loop unrolling (since it has a constant trip count). ICC
+apparently produces this very nice code with -ffast-math:
+
+..B1.70:                        # Preds ..B1.70 ..B1.69
+       mulpd     %xmm0, %xmm1                                  #108.2
+       mulpd     %xmm0, %xmm1                                  #108.2
+       mulpd     %xmm0, %xmm1                                  #108.2
+       mulpd     %xmm0, %xmm1                                  #108.2
+       addl      $8, %edx                                      #
+       cmpl      $131072, %edx                                 #108.2
+       jb        ..B1.70       # Prob 99%                      #108.2
+
+It would be better to count down to zero, but this is a lot better than what we
+do.
+
+//===---------------------------------------------------------------------===//
+
  Consider:
  
  typedef unsigned U32;
@@ -721,47 +790,6 @@ be done safely if "b" isn't modified between the strlen and memcpy of course.
  
  //===---------------------------------------------------------------------===//
  
-Reassociate should turn things like:
-
-int factorial(int X) {
- return X*X*X*X*X*X*X*X;
-}
-
-into llvm.powi calls, allowing the code generator to produce balanced
-multiplication trees.
-
-//===---------------------------------------------------------------------===//
-
-We generate a horrible  libcall for llvm.powi.  For example, we compile:
-
-#include <cmath>
-double f(double a) { return std::pow(a, 4); }
-
-into:
-
-__Z1fd:
-       subl    $12, %esp
-       movsd   16(%esp), %xmm0
-       movsd   %xmm0, (%esp)
-       movl    $4, 8(%esp)
-       call    L___powidf2$stub
-       addl    $12, %esp
-       ret
-
-GCC produces:
-
-__Z1fd:
-       subl    $12, %esp
-       movsd   16(%esp), %xmm0
-       mulsd   %xmm0, %xmm0
-       mulsd   %xmm0, %xmm0
-       movsd   %xmm0, (%esp)
-       fldl    (%esp)
-       addl    $12, %esp
-       ret
-
-//===---------------------------------------------------------------------===//
-
  We compile this program: (from GCC PR11680)
  http://gcc.gnu.org/bugzilla/attachment.cgi?id=4487
  
@@ -801,8 +829,21 @@ void bar(unsigned n) {
      true();
  }
  
-I think this basically amounts to a dag combine to simplify comparisons against
-multiply hi's into a comparison against the mullo.
+This is equivalent to the following, where 2863311531 is the multiplicative
+inverse of 3, and 1431655766 is ((2^32)-1)/3+1:
+void bar(unsigned n) {
+  if (n * 2863311531U < 1431655766U)
+    true();
+}
+
+The same transformation can work with an even modulo with the addition of a
+rotate: rotate the result of the multiply to the right by the number of bits
+which need to be zero for the condition to be true, and shrink the compare RHS
+by the same amount.  Unless the target supports rotates, though, that
+transformation probably isn't worthwhile.
+
+The transformation can also easily be made to work with non-zero equality
+comparisons: just transform, for example, "n % 3 == 1" to "(n-1) % 3 == 0".
  
  //===---------------------------------------------------------------------===//
  
@@ -823,20 +864,6 @@ int main() {
  
  //===---------------------------------------------------------------------===//
  
-Instcombine will merge comparisons like (x >= 10) && (x < 20) by producing (x -
-10) u< 10, but only when the comparisons have matching sign.
-
-This could be converted with a similar technique. (PR1941)
-
-define i1 @test(i8 %x) {
-  %A = icmp uge i8 %x, 5
-  %B = icmp slt i8 %x, 20
-  %C = and i1 %A, %B
-  ret i1 %C
-}
-
-//===---------------------------------------------------------------------===//
-
  These functions perform the same computation, but produce different assembly.
  
  define i8 @select(i8 %x) readnone nounwind {
@@ -884,18 +911,6 @@ The expression should optimize to something like
  
  //===---------------------------------------------------------------------===//
  
-From GCC Bug 15241:
-unsigned int
-foo (unsigned int a, unsigned int b)
-{
- if (a <= 7 && b <= 7)
-   baz ();
-}
-Should combine to "(a|b) <= 7".  Currently not optimized with "clang
--emit-llvm-bc | opt -std-compile-opts".
-
-//===---------------------------------------------------------------------===//
-
  From GCC Bug 3756:
  int
  pn (int n)
@@ -907,19 +922,6 @@ Should combine to (n >> 31) | 1.  Currently not optimized with "clang
  
  //===---------------------------------------------------------------------===//
  
-From GCC Bug 28685:
-int test(int a, int b)
-{
- int lt = a < b;
- int eq = a == b;
-
- return (lt || eq);
-}
-Should combine to "a <= b".  Currently not optimized with "clang
--emit-llvm-bc | opt -std-compile-opts | llc".
-
-//===---------------------------------------------------------------------===//
-
  void a(int variable)
  {
   if (variable == 4 || variable == 6)
@@ -993,12 +995,6 @@ Should combine to 0.  Currently not optimized with "clang
  
  //===---------------------------------------------------------------------===//
  
-int a(unsigned char* b) {return *b > 99;}
-There's an unnecessary zext in the generated code with "clang
--emit-llvm-bc | opt -std-compile-opts".
-
-//===---------------------------------------------------------------------===//
-
  int a(unsigned b) {return ((b << 31) | (b << 30)) >> 31;}
  Should be combined to  "((b >> 1) | b) & 1".  Currently not optimized
  with "clang -emit-llvm-bc | opt -std-compile-opts".
@@ -1011,12 +1007,6 @@ Should combine to "x | (y & 3)".  Currently not optimized with "clang
  
  //===---------------------------------------------------------------------===//
  
-unsigned a(unsigned a) {return ((a | 1) & 3) | (a & -4);}
-Should combine to "a | 1".  Currently not optimized with "clang
--emit-llvm-bc | opt -std-compile-opts".
-
-//===---------------------------------------------------------------------===//
-
  int a(int a, int b, int c) {return (~a & c) | ((c|a) & b);}
  Should fold to "(~a & c) | (a & b)".  Currently not optimized with
  "clang -emit-llvm-bc | opt -std-compile-opts".
@@ -1275,9 +1265,16 @@ store->load.
  
  //===---------------------------------------------------------------------===//
  
+[ALIAS ANALYSIS]
+
  Type based alias analysis:
  http://gcc.gnu.org/bugzilla/show_bug.cgi?id=14705
  
+We should do better analysis of posix_memalign.  At the least it should
+no-capture its pointer argument; at best, we should know that the out-value
+result doesn't point to anything (like malloc).  One example of this is in
+SingleSource/Benchmarks/Misc/dt.c
+
  //===---------------------------------------------------------------------===//
  
  A/B get pinned to the stack because we turn an if/then into a select instead
@@ -1704,3 +1701,145 @@ need all but the bottom two bits from %A, and if we gave that mask to SDB it
  would delete the or instruction for us.
  
  //===---------------------------------------------------------------------===//
+
+functionattrs doesn't know much about memcpy/memset.  This function should be
+marked readnone rather than readonly, since it only twiddles local memory, but
+functionattrs doesn't handle memset/memcpy/memmove aggressively:
+
+struct X { int *p; int *q; };
+int foo() {
+ int i = 0, j = 1;
+ struct X x, y;
+ int **p;
+ y.p = &i;
+ x.q = &j;
+ p = __builtin_memcpy (&x, &y, sizeof (int *));
+ return **p;
+}
+
+//===---------------------------------------------------------------------===//
+
+Missed instcombine transformation:
+define i1 @a(i32 %x) nounwind readnone {
+entry:
+  %cmp = icmp eq i32 %x, 30
+  %sub = add i32 %x, -30
+  %cmp2 = icmp ugt i32 %sub, 9
+  %or = or i1 %cmp, %cmp2
+  ret i1 %or
+}
+This should be optimized to a single compare.  Testcase derived from gcc.
+
+//===---------------------------------------------------------------------===//
+
+Missed instcombine transformation:
+void b();
+void a(int x) { if (((1<<x)&8)==0) b(); }
+
+The shift should be optimized out.  Testcase derived from gcc.
+
+//===---------------------------------------------------------------------===//
+
+Missed instcombine or reassociate transformation:
+int a(int a, int b) { return (a==12)&(b>47)&(b<58); }
+
+The sgt and slt should be combined into a single comparison. Testcase derived
+from gcc.
+
+//===---------------------------------------------------------------------===//
+
+Missed instcombine transformation:
+define i32 @a(i32 %x) nounwind readnone {
+entry:
+  %rem = srem i32 %x, 32
+  %shl = shl i32 1, %rem
+  ret i32 %shl
+}
+
+The srem can be transformed to an and because if x is negative, the shift is
+undefined. Testcase derived from gcc.
+
+//===---------------------------------------------------------------------===//
+
+Missed instcombine/dagcombine transformation:
+define i32 @a(i32 %x, i32 %y) nounwind readnone {
+entry:
+  %mul = mul i32 %y, -8
+  %sub = sub i32 %x, %mul
+  ret i32 %sub
+}
+
+Should compile to something like x+y*8, but currently compiles to an
+inefficient result.  Testcase derived from gcc.
+
+//===---------------------------------------------------------------------===//
+
+Missed instcombine/dagcombine transformation:
+define void @lshift_lt(i8 zeroext %a) nounwind {
+entry:
+  %conv = zext i8 %a to i32
+  %shl = shl i32 %conv, 3
+  %cmp = icmp ult i32 %shl, 33
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:
+  tail call void @bar() nounwind
+  ret void
+
+if.end:
+  ret void
+}
+declare void @bar() nounwind
+
+The shift should be eliminated.  Testcase derived from gcc.
+
+//===---------------------------------------------------------------------===//
+
+These compile into different code, one gets recognized as a switch and the
+other doesn't due to phase ordering issues (PR6212):
+
+int test1(int mainType, int subType) {
+  if (mainType == 7)
+    subType = 4;
+  else if (mainType == 9)
+    subType = 6;
+  else if (mainType == 11)
+    subType = 9;
+  return subType;
+}
+
+int test2(int mainType, int subType) {
+  if (mainType == 7)
+    subType = 4;
+  if (mainType == 9)
+    subType = 6;
+  if (mainType == 11)
+    subType = 9;
+  return subType;
+}
+
+//===---------------------------------------------------------------------===//
+
+The following test case (from PR6576):
+
+define i32 @mul(i32 %a, i32 %b) nounwind readnone {
+entry:
+ %cond1 = icmp eq i32 %b, 0                      ; <i1> [#uses=1]
+ br i1 %cond1, label %exit, label %bb.nph
+bb.nph:                                           ; preds = %entry
+ %tmp = mul i32 %b, %a                           ; <i32> [#uses=1]
+ ret i32 %tmp
+exit:                                             ; preds = %entry
+ ret i32 0
+}
+
+could be reduced to:
+
+define i32 @mul(i32 %a, i32 %b) nounwind readnone {
+entry:
+ %tmp = mul i32 %b, %a
+ ret i32 %tmp
+}
+
+//===---------------------------------------------------------------------===//
+