Add NVExtFrm to represent NEON Vector Extract Instructions, that uses Inst{11-8}

[oota-llvm.git] / lib / Target / README.txt
diff --git a/lib/Target/README.txt b/lib/Target/README.txt

index 6c6290a70a1a1f32a8c67c3e62da883c279a6863..052a5756da9faafdd4b97653c36fadb00cc76048 100644 (file)
--- a/lib/Target/README.txt
+++ b/lib/Target/README.txt
@@ -156,6 +156,45 @@ void f () {  /* this can be optimized to four additions... */
  This requires reassociating to forms of expressions that are already available,
  something that reassoc doesn't think about yet.
  
+
+//===---------------------------------------------------------------------===//
+
+This function: (derived from GCC PR19988)
+double foo(double x, double y) {
+  return ((x + 0.1234 * y) * (x + -0.1234 * y));
+}
+
+compiles to:
+_foo:
+       movapd  %xmm1, %xmm2
+       mulsd   LCPI1_1(%rip), %xmm1
+       mulsd   LCPI1_0(%rip), %xmm2
+       addsd   %xmm0, %xmm1
+       addsd   %xmm0, %xmm2
+       movapd  %xmm1, %xmm0
+       mulsd   %xmm2, %xmm0
+       ret
+
+Reassociate should be able to turn it into:
+
+double foo(double x, double y) {
+  return ((x + 0.1234 * y) * (x - 0.1234 * y));
+}
+
+Which allows the multiply by constant to be CSE'd, producing:
+
+_foo:
+       mulsd   LCPI1_0(%rip), %xmm1
+       movapd  %xmm1, %xmm2
+       addsd   %xmm0, %xmm2
+       subsd   %xmm1, %xmm0
+       mulsd   %xmm2, %xmm0
+       ret
+
+This doesn't need -ffast-math support at all.  This is particularly bad because
+the llvm-gcc frontend is canonicalizing the later into the former, but clang
+doesn't have this problem.
+
  //===---------------------------------------------------------------------===//
  
  These two functions should generate the same code on big-endian systems:
@@ -237,24 +276,6 @@ define void @test(i32* %P) {
  
  //===---------------------------------------------------------------------===//
  
-dag/inst combine "clz(x)>>5 -> x==0" for 32-bit x.
-
-Compile:
-
-int bar(int x)
-{
-  int t = __builtin_clz(x);
-  return -(t>>5);
-}
-
-to:
-
-_bar:   addic r3,r3,-1
-        subfe r3,r3,r3
-        blr
-
-//===---------------------------------------------------------------------===//
-
  quantum_sigma_x in 462.libquantum contains the following loop:
  
        for(i=0; i<reg->size; i++)
@@ -294,6 +315,8 @@ unsigned long reverse(unsigned v) {
  
  //===---------------------------------------------------------------------===//
  
+[LOOP RECOGNITION]
+
  These idioms should be recognized as popcount (see PR1488):
  
  unsigned countbits_slow(unsigned v) {
@@ -1728,22 +1751,95 @@ from gcc.
  Missed instcombine transformation:
  define i32 @a(i32 %x) nounwind readnone {
  entry:
-  %shr = lshr i32 %x, 5                           ; <i32> [#uses=1]
-  %xor = xor i32 %shr, 67108864                   ; <i32> [#uses=1]
-  %sub = add i32 %xor, -67108864                  ; <i32> [#uses=1]
+  %rem = srem i32 %x, 32
+  %shl = shl i32 1, %rem
+  ret i32 %shl
+}
+
+The srem can be transformed to an and because if x is negative, the shift is
+undefined. Testcase derived from gcc.
+
+//===---------------------------------------------------------------------===//
+
+Missed instcombine/dagcombine transformation:
+define i32 @a(i32 %x, i32 %y) nounwind readnone {
+entry:
+  %mul = mul i32 %y, -8
+  %sub = sub i32 %x, %mul
    ret i32 %sub
  }
  
-This function is equivalent to "ashr i32 %x, 5".  Testcase derived from gcc.
+Should compile to something like x+y*8, but currently compiles to an
+inefficient result.  Testcase derived from gcc.
+
+//===---------------------------------------------------------------------===//
+
+Missed instcombine/dagcombine transformation:
+define void @lshift_lt(i8 zeroext %a) nounwind {
+entry:
+  %conv = zext i8 %a to i32
+  %shl = shl i32 %conv, 3
+  %cmp = icmp ult i32 %shl, 33
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:
+  tail call void @bar() nounwind
+  ret void
+
+if.end:
+  ret void
+}
+declare void @bar() nounwind
+
+The shift should be eliminated.  Testcase derived from gcc.
+
+//===---------------------------------------------------------------------===//
+
+These compile into different code, one gets recognized as a switch and the
+other doesn't due to phase ordering issues (PR6212):
+
+int test1(int mainType, int subType) {
+  if (mainType == 7)
+    subType = 4;
+  else if (mainType == 9)
+    subType = 6;
+  else if (mainType == 11)
+    subType = 9;
+  return subType;
+}
+
+int test2(int mainType, int subType) {
+  if (mainType == 7)
+    subType = 4;
+  if (mainType == 9)
+    subType = 6;
+  if (mainType == 11)
+    subType = 9;
+  return subType;
+}
  
  //===---------------------------------------------------------------------===//
  
-isSafeToLoadUnconditionally should allow a GEP of a global/alloca with constant
-indicies within the bounds of the allocated object. Reduced example:
+The following test case (from PR6576):
  
-const int a[] = {3,6};
-int b(int y) { int* x = y ? &a[0] : &a[1]; return *x; }
+define i32 @mul(i32 %a, i32 %b) nounwind readnone {
+entry:
+ %cond1 = icmp eq i32 %b, 0                      ; <i1> [#uses=1]
+ br i1 %cond1, label %exit, label %bb.nph
+bb.nph:                                           ; preds = %entry
+ %tmp = mul i32 %b, %a                           ; <i32> [#uses=1]
+ ret i32 %tmp
+exit:                                             ; preds = %entry
+ ret i32 0
+}
  
-All the loads should be eliminated.  Testcase derived from gcc.
+could be reduced to:
+
+define i32 @mul(i32 %a, i32 %b) nounwind readnone {
+entry:
+ %tmp = mul i32 %b, %a
+ ret i32 %tmp
+}
  
  //===---------------------------------------------------------------------===//
+