//===---------------------------------------------------------------------===//

Dead argument elimination should be enhanced to handle cases when an argument is
dead to an externally visible function. Though the argument can't be removed
from the externally visible function, the caller doesn't need to pass it in.
For example in this testcase:

  void foo(int X) __attribute__((noinline));
  void foo(int X) { sideeffect(); }
  void bar(int A) { foo(A+1); }

We compile bar to:

define void @bar(i32 %A) nounwind ssp {
  %0 = add nsw i32 %A, 1                          ; <i32> [#uses=1]
  tail call void @foo(i32 %0) nounwind noinline ssp
  ret void
}

The add is dead; we could pass in 'i32 undef' instead. This occurs for C++
templates etc., which usually have linkonce_odr/weak_odr linkage, not internal
linkage.
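
With this enhancement, bar would compile to something like the following (a
sketch of the desired result, not what we currently emit):

define void @bar(i32 %A) nounwind ssp {
  tail call void @foo(i32 undef) nounwind noinline ssp
  ret void
}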

//===---------------------------------------------------------------------===//

With the recent changes to make the implicit def/use set explicit in
machineinstrs, we should change the target descriptions for 'call' instructions
so that the .td files don't list all the call-clobbered registers as implicit
defs.
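
Today each call definition spells its clobbers out directly, roughly like this
(an abbreviated TableGen sketch, not the exact X86 definition):

let Defs = [EAX, ECX, EDX, EFLAGS], Uses = [ESP] in
def CALLpcrel32 : Ii32<0xE8, RawFrm, (outs), (ins i32imm_pcrel:$dst),
                       "call\t$dst", []>;

The clobber set would presumably come from the calling convention instead.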

//===---------------------------------------------------------------------===//

InstCombine should use SimplifyDemandedBits to remove the or instruction:

define i1 @test(i8 %x, i8 %y) {
  %A = or i8 %x, 1
  %B = icmp ugt i8 %A, 3
  ret i1 %B
}

Currently instcombine calls SimplifyDemandedBits with either all bits or just
the sign bit, if the comparison is obviously a sign test. In this case, we only
need all but the bottom two bits from %A, and if we gave that mask to SDB it
would delete the or instruction for us.
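
Since the or only changes bit 0, which that mask does not demand, the expected
result is simply:

define i1 @test(i8 %x, i8 %y) {
  %B = icmp ugt i8 %x, 3
  ret i1 %B
}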

//===---------------------------------------------------------------------===//

functionattrs doesn't know much about memcpy/memset. This function should be
marked readnone rather than readonly, since it only twiddles local memory, but
functionattrs doesn't handle memset/memcpy/memmove aggressively:
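
For example, something along these lines (an illustrative stand-in, since the
original testcase isn't reproduced here):

int foo(void) {
  int buf[4];
  __builtin_memset(buf, 0, sizeof(buf));
  return buf[0];
}

foo only writes memory that never escapes, so it could be marked readnone.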

//===---------------------------------------------------------------------===//

clang -O3 currently compiles this code:

void f1(int* begin, int* end) {
std::fill(begin, end, 0);
}

into:

define void @_Z2f1PiS_(i32* %begin, i32* %end) nounwind {
entry:
  %cmp7.i.i = icmp eq i32* %begin, %end
  br i1 %cmp7.i.i, label %_ZSt4fillIPiiEvT_S1_RKT0_.exit, label %for.body.i.i

for.body.i.i:                           ; preds = %entry, %for.body.i.i
  %indvar.i.i = phi i64 [ %tmp, %for.body.i.i ], [ 0, %entry ]
  %tmp = add i64 %indvar.i.i, 1
  %ptrincdec.i.i = getelementptr i32* %begin, i64 %tmp
  %__first.addr.08.i.i = getelementptr i32* %begin, i64 %indvar.i.i
  store i32 0, i32* %__first.addr.08.i.i, align 4, !tbaa !0
  %cmp.i.i = icmp eq i32* %ptrincdec.i.i, %end
  br i1 %cmp.i.i, label %_ZSt4fillIPiiEvT_S1_RKT0_.exit, label %for.body.i.i

_ZSt4fillIPiiEvT_S1_RKT0_.exit:         ; preds = %for.body.i.i, %entry
  ret void
}

It should compile this into a memset; this is PR8942.
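
The desired output is a single memset over the byte range (a sketch; the exact
IR the optimizer would produce may differ):

define void @_Z2f1PiS_(i32* %begin, i32* %end) nounwind {
entry:
  %begin.i8 = bitcast i32* %begin to i8*
  %begin.int = ptrtoint i32* %begin to i64
  %end.int = ptrtoint i32* %end to i64
  %len = sub i64 %end.int, %begin.int   ; byte count; zero when begin == end
  call void @llvm.memset.p0i8.i64(i8* %begin.i8, i8 0, i64 %len, i32 4, i1 false)
  ret void
}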

//===---------------------------------------------------------------------===//

clang -O3 -fno-exceptions currently compiles this code:

void f(int N) {
  std::vector<int> v(N);

  extern void sink(void*); sink(&v);
}

into

  br label %_ZNSt6vectorIiSaIiEEC1EmRKiRKS0_.exit

This is just the handling of the construction of the vector. Most surprising
here is the fact that all three null stores in %entry are dead but not
eliminated (because we do no cross-block DSE).

Also surprising is that %conv isn't simplified to 0 in %....exit.thread.i.i.
This is because the client of LazyValueInfo doesn't simplify all instruction
operands, just selected ones.

//===---------------------------------------------------------------------===//

clang -O3 -fno-exceptions currently compiles this code:

void f(char* a, int n) {
  __builtin_memset(a, 0, n);
  for (int i = 0; i < n; ++i)
    a[i] = 0;
}

into:

define void @_Z1fPci(i8* nocapture %a, i32 %n) nounwind {
entry:
  %conv = sext i32 %n to i64
  tail call void @llvm.memset.p0i8.i64(i8* %a, i8 0, i64 %conv, i32 1, i1 false)
  %cmp8 = icmp sgt i32 %n, 0
  br i1 %cmp8, label %for.body.lr.ph, label %for.end

for.body.lr.ph:                         ; preds = %entry
  %tmp10 = add i32 %n, -1
  %tmp11 = zext i32 %tmp10 to i64
  %tmp12 = add i64 %tmp11, 1
  call void @llvm.memset.p0i8.i64(i8* %a, i8 0, i64 %tmp12, i32 1, i1 false)
  ret void

for.end:                                ; preds = %entry
  ret void
}

This shouldn't need the ((zext (%n - 1)) + 1) game, and it should ideally fold
the two memsets together. The issue with %n seems to stem from poor handling
of the original loop.

To simplify this, we need SCEV to know that "n != 0" because of the dominating
conditional. That would turn the second memset into a simple memset of 'n'.
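
Once SCEV knows n is nonzero there, the second memset covers exactly %conv
bytes and the whole function should reduce to the first one (a sketch of the
desired output):

define void @_Z1fPci(i8* nocapture %a, i32 %n) nounwind {
entry:
  %conv = sext i32 %n to i64
  tail call void @llvm.memset.p0i8.i64(i8* %a, i8 0, i64 %conv, i32 1, i1 false)
  ret void
}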

//===---------------------------------------------------------------------===//

clang -O3 -fno-exceptions currently compiles this code:

struct S {
  unsigned short m1, m2;
  unsigned char m3, m4;
};

void f(int N) {
  std::vector<S> v(N);
  extern void sink(void*); sink(&v);
}

into poor code for zero-initializing 'v' when N is >0. The problem is that
S is only 6 bytes, but each element is 8 byte-aligned. We generate a loop and
4 stores on each iteration. If the struct were 8 bytes, this gets turned into
a memset.

In order to handle this we have to:
  A) Teach clang to generate metadata for memsets of structs that have holes in
     them.
  B) Teach clang to use such a memset for zero init of this struct (since it
     has a hole), instead of doing elementwise zeroing.
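
For (A), the annotation might look something like this (the !undef.bytes name
and operand form are invented for illustration, no such metadata exists yet;
%elt stands for a pointer to the current element):

  ; Zero-init one 8-byte element slot of 'v'; bytes [6,8) are S's tail
  ; padding, so the metadata records that they may be left undefined.
  call void @llvm.memset.p0i8.i64(i8* %elt, i8 0, i64 8, i32 8, i1 false), !undef.bytes !0
  !0 = metadata !{i64 6, i64 8}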

//===---------------------------------------------------------------------===//

clang -O3 currently compiles this code:

extern const int magic;
double f() { return 0.0 * magic; }

into:

@magic = external constant i32

define double @_Z1fv() nounwind readnone {
entry:
  %tmp = load i32* @magic, align 4, !tbaa !0
  %conv = sitofp i32 %tmp to double
  %mul = fmul double %conv, 0.000000e+00
  ret double %mul
}

We should be able to fold away this fmul to 0.0. More generally, fmul(x, 0.0)
can be folded to 0.0 if we can prove that the LHS is not -0.0, not a NaN, and
not an INF. The CannotBeNegativeZero predicate in value tracking should be
extended to support general "fpclassify" operations that can return
yes/no/unknown for each of these predicates.

In this predicate, we know that uitofp is trivially never NaN or -0.0, and
we know that it isn't +/-Inf if the floating point type has enough exponent
bits to represent the largest integer value as < inf.
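
For example, uitofp from i32 to double always produces a nonnegative, finite
value (i32's largest value is far below double's maximum exponent), so a fold
like this would be safe (a sketch):

define double @g(i32 %x) nounwind readnone {
entry:
  %conv = uitofp i32 %x to double        ; never NaN, -0.0, or +/-Inf
  %mul = fmul double %conv, 0.000000e+00 ; always +0.0, so foldable to 0.0
  ret double %mul
}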

//===---------------------------------------------------------------------===//

When optimizing a transformation that can change the sign of 0.0 (such as the
0.0*val -> 0.0 transformation above), it might be provable that the sign of the
expression doesn't matter. For example, by the above rules, we can't transform
fmul(sitofp(x), 0.0) into 0.0, because x might be -1 and the result of the
expression is defined to be -0.0.

If we look at the uses of the fmul, for example, we might be able to prove that
all uses don't care about the sign of zero. For example, if we have:

  fadd(fmul(sitofp(x), 0.0), 2.0)

Since we know that x+2.0 doesn't care about the sign of any zeros in x, we can
transform the fmul to 0.0, and then the fadd to 2.0.
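
In IR terms (a sketch of the same example):

  %conv = sitofp i32 %x to double
  %mul = fmul double %conv, 0.000000e+00
  %add = fadd double %mul, 2.000000e+00

%mul is always +0.0 or -0.0, and fadd of either zero with 2.0 yields 2.0, so
%add folds to the constant 2.0.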

//===---------------------------------------------------------------------===//

We should enhance memcpy/memmove/memset to allow a metadata node on them
indicating that some bytes of the transfer are undefined. This is useful for
frontends like clang when lowering struct copies where some elements of the
struct are undefined. Consider something like this:

struct x {
  char a;
  int b[4];
};
void foo(struct x *P);
struct x testfunc() {
  struct x V1, V2;
  foo(&V1);
  V2 = V1;

  return V2;
}

We currently compile this to:

$ clang t.c -S -o - -O0 -emit-llvm | opt -scalarrepl -S

%struct.x = type { i8, [4 x i32] }

define void @testfunc(%struct.x* sret %agg.result) nounwind ssp {
entry:
  %V1 = alloca %struct.x, align 4
  call void @foo(%struct.x* %V1)
  %tmp1 = bitcast %struct.x* %V1 to i8*
  %0 = bitcast %struct.x* %V1 to i160*
  %srcval1 = load i160* %0, align 4
  %tmp2 = bitcast %struct.x* %agg.result to i8*
  %1 = bitcast %struct.x* %agg.result to i160*
  store i160 %srcval1, i160* %1, align 4
  ret void
}

This happens because SRoA sees that the temp alloca is being memcpy'd into
and out of, and since it has holes it has to be conservative. If we knew about
the holes, then this could be much better.

Having information about these holes would also improve memcpy (etc.) lowering
at llc time when it gets inlined, because we can use smaller transfers. This
also avoids partial register stalls in some important cases.
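
If the copy were a memcpy, the hole information might be attached like this
(the !undef.bytes name and shape are invented for illustration; no such
metadata exists yet):

  ; 'struct x' is 20 bytes; bytes [1,4) are padding after 'a' and may be
  ; garbage, so the copy of those bytes is not meaningful.
  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %tmp2, i8* %tmp1, i64 20, i32 4, i1 false), !undef.bytes !1
  !1 = metadata !{i64 1, i64 4}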

//===---------------------------------------------------------------------===//

We miss an optimization when lowering divide by some constants. For example:

  int test(int x) { return x/10; }

We produce:

_test:                                  ## @test
## BB#0:                                ## %entry
	movslq	%edi, %rax
	imulq	$1717986919, %rax, %rax ## imm = 0x66666667
	movq	%rax, %rcx
	shrq	$63, %rcx
**	shrq	$32, %rax
**	sarl	$2, %eax
	addl	%ecx, %eax
	ret

The two starred instructions could be replaced with a single "sarq $34, %rax".
This occurs in 186.crafty very frequently.
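
For the record, 1717986919 = 0x66666667 = ceil(2^34/10), so the multiply plus
an arithmetic shift right by 34 computes x/10, and the shrq $63/addl pair adds
the sign bit to round toward zero for negative x. The expected sequence is then
(a sketch):

_test:                                  ## @test
## BB#0:                                ## %entry
	movslq	%edi, %rax
	imulq	$1717986919, %rax, %rax ## imm = 0x66666667
	movq	%rax, %rcx
	shrq	$63, %rcx
	sarq	$34, %rax
	addl	%ecx, %eax
	ret
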
//===---------------------------------------------------------------------===//