if (Index->getMemoryVT() != MemVT)
break;
+ // We do not allow under-aligned stores in order to prevent
+ // overriding stores. NOTE: this is a bad hack. Alignment SHOULD
+ // be irrelevant here; what MATTERS is that we not move memory
+ // operations that potentially overlap past each-other.
+ if (Index->getAlignment() < MemVT.getStoreSize())
+ break;
+
// We found a potential memory operand to merge.
StoreNodes.push_back(MemOpLink(Index, Ptr.Offset, Seq++));
if (StoreNodes.size() < 2)
return false;
- // Sort the memory operands according to their distance from the base pointer.
+ // Sort the memory operands according to their distance from the
+ // base pointer. As a secondary criteria: make sure stores coming
+ // later in the code come first in the list. This is important for
+ // the non-UseAA case, because we're merging stores into the FINAL
+ // store along a chain which potentially contains aliasing stores.
+ // Thus, if there are multiple stores to the same address, the last
+ // one can be considered for merging but not the others.
std::sort(StoreNodes.begin(), StoreNodes.end(),
[](MemOpLink LHS, MemOpLink RHS) {
return LHS.OffsetFromBase < RHS.OffsetFromBase ||
(LHS.OffsetFromBase == RHS.OffsetFromBase &&
- LHS.SequenceNum > RHS.SequenceNum);
+ LHS.SequenceNum < RHS.SequenceNum);
});
// Scan the memory operations on the chain and find the first non-consecutive
ret void
}
-;; On x86, even unaligned copies can be merged to vector ops.
+;; On x86, even unaligned copies should be merged to vector ops.
+;; TODO: however, this cannot happen at the moment, due to brokenness
+;; in MergeConsecutiveStores. See UseAA FIXME in DAGCombiner.cpp
+;; visitSTORE.
+
; CHECK-LABEL: merge_loads_no_align:
; load:
-; CHECK: vmovups
+; CHECK-NOT: vmovups ;; TODO
; store:
-; CHECK: vmovups
+; CHECK-NOT: vmovups ;; TODO
; CHECK: ret
define void @merge_loads_no_align(i32 %count, %struct.B* noalias nocapture %q, %struct.B* noalias nocapture %p) nounwind uwtable noinline ssp {
%a1 = icmp sgt i32 %count, 0
}
+;; TODO: FAST *should* be:
+;; movups (%rdi), %xmm0
+;; movups %xmm0, 40(%rdi)
+;; ..but is not currently. See the UseAA FIXME in DAGCombiner.cpp
+;; visitSTORE.
+
define void @merge_vec_load_and_stores(i64 *%ptr) {
; FAST-LABEL: merge_vec_load_and_stores:
; FAST: # BB#0:
-; FAST-NEXT: movups (%rdi), %xmm0
-; FAST-NEXT: movups %xmm0, 40(%rdi)
+; FAST-NEXT: movq (%rdi), %rax
+; FAST-NEXT: movq 8(%rdi), %rcx
+; FAST-NEXT: movq %rax, 40(%rdi)
+; FAST-NEXT: movq %rcx, 48(%rdi)
; FAST-NEXT: retq
;
; SLOW-LABEL: merge_vec_load_and_stores:
@e = common global %structTy zeroinitializer, align 4
-; CHECK-LABEL: f
-define void @f() {
-entry:
+;; Ensure that MergeConsecutiveStores doesn't incorrectly reorder
+;; store operations. The first test stores in increasing address
+;; order, the second in decreasing -- but in both cases should have
+;; the same result in memory in the end.
-; CHECK: movabsq $528280977409, %rax
+; CHECK-LABEL: redundant_stores_merging:
+; CHECK: movl $123, e+8(%rip)
+; CHECK: movabsq $1958505086977, %rax
; CHECK: movq %rax, e+4(%rip)
-; CHECK: movl $456, e+8(%rip)
-
+define void @redundant_stores_merging() {
+entry:
store i32 1, i32* getelementptr inbounds (%structTy, %structTy* @e, i64 0, i32 1), align 4
store i32 123, i32* getelementptr inbounds (%structTy, %structTy* @e, i64 0, i32 2), align 4
store i32 456, i32* getelementptr inbounds (%structTy, %structTy* @e, i64 0, i32 2), align 4
ret void
}
+;; This variant tests PR25154.
+; CHECK-LABEL: redundant_stores_merging_reverse:
+; CHECK: movl $123, e+8(%rip)
+; CHECK: movabsq $1958505086977, %rax
+; CHECK: movq %rax, e+4(%rip)
+define void @redundant_stores_merging_reverse() {
+entry:
+ store i32 123, i32* getelementptr inbounds (%structTy, %structTy* @e, i64 0, i32 2), align 4
+ store i32 456, i32* getelementptr inbounds (%structTy, %structTy* @e, i64 0, i32 2), align 4
+ store i32 1, i32* getelementptr inbounds (%structTy, %structTy* @e, i64 0, i32 1), align 4
+ ret void
+}
+
+@b = common global [8 x i8] zeroinitializer, align 2
+
+;; The 2-byte store to offset 3 overlaps the 2-byte store to offset 2;
+;; these must not be reordered in MergeConsecutiveStores such that the
+;; store to 3 comes first (e.g. by merging the stores to 0 and 2 into
+;; a movl, after the store to 3).
+
+;; CHECK-LABEL: overlapping_stores_merging:
+;; CHECK: movw $0, b+2(%rip)
+;; CHECK: movw $2, b+3(%rip)
+;; CHECK: movw $1, b(%rip)
+define void @overlapping_stores_merging() {
+entry:
+ store i16 0, i16* bitcast (i8* getelementptr inbounds ([8 x i8], [8 x i8]* @b, i64 0, i64 2) to i16*), align 2
+ store i16 2, i16* bitcast (i8* getelementptr inbounds ([8 x i8], [8 x i8]* @b, i64 0, i64 3) to i16*), align 1
+ store i16 1, i16* bitcast (i8* getelementptr inbounds ([8 x i8], [8 x i8]* @b, i64 0, i64 0) to i16*), align 2
+ ret void
+}