-for.body.lr.ph: ; preds = %cond.true.i.i.i.i
- %mul.i.i.i.i.i = shl i64 %conv, 2
- %call3.i.i.i.i.i = call noalias i8* @_Znwm(i64 %mul.i.i.i.i.i) nounwind
- %0 = bitcast i8* %call3.i.i.i.i.i to i32*
- store i32* %0, i32** %v8.sub, align 8, !tbaa !0
- %add.ptr.i.i.i = getelementptr inbounds i32* %0, i64 %conv
- store i32* %add.ptr.i.i.i, i32** %tmp4.i.i.i.i.i, align 8, !tbaa !0
- call void @llvm.memset.p0i8.i64(i8* %call3.i.i.i.i.i, i8 0, i64 %mul.i.i.i.i.i, i32 4, i1 false)
- store i32* %add.ptr.i.i.i, i32** %tmp3.i.i.i.i.i, align 8, !tbaa !0
- %tmp18 = add i32 %N, -1
- %tmp19 = zext i32 %tmp18 to i64
- %tmp20 = shl i64 %tmp19, 2
- %tmp21 = add i64 %tmp20, 4
- call void @llvm.memset.p0i8.i64(i8* %call3.i.i.i.i.i, i8 0, i64 %tmp21, i32 4, i1 false)
- br label %for.end
-
-First off, why (((zext %N - 1) << 2) + 4) instead of the ((sext %N) << 2) done
-previously? (or better yet, re-use that one?)
-
-Then, the really painful one is the second memset, of the same memory, to the
-same value.
+In order to handle this we have to:
+ A) Teach clang to generate metadata for memsets of structs that have holes in
+ them.
+ B) Teach clang to use such a memset for zero init of this struct (since it has
+ a hole), instead of doing elementwise zeroing.
+
+//===---------------------------------------------------------------------===//
+
+clang -O3 currently compiles this code:
+
+extern const int magic;
+double f() { return 0.0 * magic; }
+
+into
+
+@magic = external constant i32
+
+define double @_Z1fv() nounwind readnone {
+entry:
+ %tmp = load i32* @magic, align 4, !tbaa !0
+ %conv = sitofp i32 %tmp to double
+ %mul = fmul double %conv, 0.000000e+00
+ ret double %mul
+}
+
+We should be able to fold away this fmul to 0.0. More generally, fmul(x,0.0)
+can be folded to 0.0 if we can prove that the LHS is not -0.0, not a NaN, and
+not an INF. The CannotBeNegativeZero predicate in value tracking should be
+extended to support general "fpclassify" operations that can return
+yes/no/unknown for each of these predicates.
+
+In this predicate, we know that uitofp is trivially never NaN or -0.0, and
+we know that it isn't +/-Inf if the floating point type has enough exponent bits
+to represent the largest integer value as < inf.
+
+//===---------------------------------------------------------------------===//
+
+When optimizing a transformation that can change the sign of 0.0 (such as the
+0.0*val -> 0.0 transformation above), it might be provable that the sign of the
+expression doesn't matter. For example, by the above rules, we can't transform
+fmul(sitofp(x), 0.0) into 0.0, because x might be -1 and the result of the
+expression is defined to be -0.0.
+
+If we look at the uses of the fmul for example, we might be able to prove that
+all uses don't care about the sign of zero. For example, if we have:
+
+ fadd(fmul(sitofp(x), 0.0), 2.0)
+
+Since we know that x+2.0 doesn't care about the sign of any zeros in X, we can
+transform the fmul to 0.0, and then the fadd to 2.0.
+
+//===---------------------------------------------------------------------===//
+
+We should enhance memcpy/memcpy/memset to allow a metadata node on them
+indicating that some bytes of the transfer are undefined. This is useful for
+frontends like clang when lowering struct copies, when some elements of the
+struct are undefined. Consider something like this:
+
+struct x {
+ char a;
+ int b[4];
+};
+void foo(struct x*P);
+struct x testfunc() {
+ struct x V1, V2;
+ foo(&V1);
+ V2 = V1;
+
+ return V2;
+}
+
+We currently compile this to:
+$ clang t.c -S -o - -O0 -emit-llvm | opt -scalarrepl -S
+
+
+%struct.x = type { i8, [4 x i32] }
+
+define void @testfunc(%struct.x* sret %agg.result) nounwind ssp {
+entry:
+ %V1 = alloca %struct.x, align 4
+ call void @foo(%struct.x* %V1)
+ %tmp1 = bitcast %struct.x* %V1 to i8*
+ %0 = bitcast %struct.x* %V1 to i160*
+ %srcval1 = load i160* %0, align 4
+ %tmp2 = bitcast %struct.x* %agg.result to i8*
+ %1 = bitcast %struct.x* %agg.result to i160*
+ store i160 %srcval1, i160* %1, align 4
+ ret void
+}
+
+This happens because SRoA sees that the temp alloca has is being memcpy'd into
+and out of and it has holes and it has to be conservative. If we knew about the
+holes, then this could be much much better.
+
+Having information about these holes would also improve memcpy (etc) lowering at
+llc time when it gets inlined, because we can use smaller transfers. This also
+avoids partial register stalls in some important cases.
+
+//===---------------------------------------------------------------------===//
+
+We don't fold (icmp (add) (add)) unless the two adds only have a single use.
+There are a lot of cases that we're refusing to fold in (e.g.) 256.bzip2, for
+example:
+
+ %indvar.next90 = add i64 %indvar89, 1 ;; Has 2 uses
+ %tmp96 = add i64 %tmp95, 1 ;; Has 1 use
+ %exitcond97 = icmp eq i64 %indvar.next90, %tmp96
+
+We don't fold this because we don't want to introduce an overlapped live range
+of the ivar. However if we can make this more aggressive without causing
+performance issues in two ways:
+
+1. If *either* the LHS or RHS has a single use, we can definitely do the
+ transformation. In the overlapping liverange case we're trading one register
+ use for one fewer operation, which is a reasonable trade. Before doing this
+ we should verify that the llc output actually shrinks for some benchmarks.
+2. If both ops have multiple uses, we can still fold it if the operations are
+ both sinkable to *after* the icmp (e.g. in a subsequent block) which doesn't
+ increase register pressure.
+
+There are a ton of icmp's we aren't simplifying because of the reg pressure
+concern. Care is warranted here though because many of these are induction
+variables and other cases that matter a lot to performance, like the above.
+Here's a blob of code that you can drop into the bottom of visitICmp to see some
+missed cases:
+
+ { Value *A, *B, *C, *D;
+ if (match(Op0, m_Add(m_Value(A), m_Value(B))) &&
+ match(Op1, m_Add(m_Value(C), m_Value(D))) &&
+ (A == C || A == D || B == C || B == D)) {
+ errs() << "OP0 = " << *Op0 << " U=" << Op0->getNumUses() << "\n";
+ errs() << "OP1 = " << *Op1 << " U=" << Op1->getNumUses() << "\n";
+ errs() << "CMP = " << I << "\n\n";
+ }
+ }
+
+//===---------------------------------------------------------------------===//
+
+define i1 @test1(i32 %x) nounwind {
+ %and = and i32 %x, 3
+ %cmp = icmp ult i32 %and, 2
+ ret i1 %cmp
+}
+
+Can be folded to (x & 2) == 0.
+
+define i1 @test2(i32 %x) nounwind {
+ %and = and i32 %x, 3
+ %cmp = icmp ugt i32 %and, 1
+ ret i1 %cmp
+}
+
+Can be folded to (x & 2) != 0.
+
+SimplifyDemandedBits shrinks the "and" constant to 2 but instcombine misses the
+icmp transform.
+
+//===---------------------------------------------------------------------===//
+
+This code:
+
+typedef struct {
+int f1:1;
+int f2:1;
+int f3:1;
+int f4:29;
+} t1;
+
+typedef struct {
+int f1:1;
+int f2:1;
+int f3:30;
+} t2;
+
+t1 s1;
+t2 s2;
+
+void func1(void)
+{
+s1.f1 = s2.f1;
+s1.f2 = s2.f2;
+}
+
+Compiles into this IR (on x86-64 at least):
+
+%struct.t1 = type { i8, [3 x i8] }
+@s2 = global %struct.t1 zeroinitializer, align 4
+@s1 = global %struct.t1 zeroinitializer, align 4
+define void @func1() nounwind ssp noredzone {
+entry:
+ %0 = load i32* bitcast (%struct.t1* @s2 to i32*), align 4
+ %bf.val.sext5 = and i32 %0, 1
+ %1 = load i32* bitcast (%struct.t1* @s1 to i32*), align 4
+ %2 = and i32 %1, -4
+ %3 = or i32 %2, %bf.val.sext5
+ %bf.val.sext26 = and i32 %0, 2
+ %4 = or i32 %3, %bf.val.sext26
+ store i32 %4, i32* bitcast (%struct.t1* @s1 to i32*), align 4
+ ret void
+}
+
+The two or/and's should be merged into one each.
+
+//===---------------------------------------------------------------------===//
+
+Machine level code hoisting can be useful in some cases. For example, PR9408
+is about:
+
+typedef union {
+ void (*f1)(int);
+ void (*f2)(long);
+} funcs;
+
+void foo(funcs f, int which) {
+ int a = 5;
+ if (which) {
+ f.f1(a);
+ } else {
+ f.f2(a);
+ }
+}
+
+which we compile to:
+
+foo: # @foo
+# BB#0: # %entry
+ pushq %rbp
+ movq %rsp, %rbp
+ testl %esi, %esi
+ movq %rdi, %rax
+ je .LBB0_2
+# BB#1: # %if.then
+ movl $5, %edi
+ callq *%rax
+ popq %rbp
+ ret
+.LBB0_2: # %if.else
+ movl $5, %edi
+ callq *%rax
+ popq %rbp
+ ret
+
+Note that bb1 and bb2 are the same. This doesn't happen at the IR level
+because one call is passing an i32 and the other is passing an i64.
+
+//===---------------------------------------------------------------------===//
+
+I see this sort of pattern in 176.gcc in a few places (e.g. the start of
+store_bit_field). The rem should be replaced with a multiply and subtract:
+
+ %3 = sdiv i32 %A, %B
+ %4 = srem i32 %A, %B
+
+Similarly for udiv/urem. Note that this shouldn't be done on X86 or ARM,
+which can do this in a single operation (instruction or libcall). It is
+probably best to do this in the code generator.
+
+//===---------------------------------------------------------------------===//
+
+unsigned foo(unsigned x, unsigned y) { return (x & y) == 0 || x == 0; }
+should fold to (x & y) == 0.
+
+//===---------------------------------------------------------------------===//
+
+unsigned foo(unsigned x, unsigned y) { return x > y && x != 0; }
+should fold to x > y.