From 74615f55487cfa257486d606a404f608a264de0c Mon Sep 17 00:00:00 2001
From: James Y Knight <jyknight@google.com>
Date: Mon, 2 Nov 2015 18:48:08 +0000
Subject: [PATCH] Fix two issues in MergeConsecutiveStores:

1) PR25154. This is basically a repeat of PR18102, which was fixed in
r200201, and broken again by r234430. The latter changed which of the
store nodes was merged into from the first to the last. Thus, we now
also need to prefer merging a later store at a given address into the
target node, instead of an earlier one.

2) While investigating that, I also realized I'd introduced a bug in
r236850. There, I removed a check for alignment -- not realizing that
nothing except the alignment check was ensuring that none of the stores
were overlapping! This is a really bogus way to ensure there's no
aliased stores.

A better solution to both of these issues is likely to always use the
code added in the 'if (UseAA)' branches which rearrange the chain based
on a more principled analysis. I'll look into whether that can be used
always, but in the interest of getting things back to working, I think a
minimal change makes sense.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@251816 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/CodeGen/SelectionDAG/DAGCombiner.cpp    | 17 +++++++-
 test/CodeGen/X86/MergeConsecutiveStores.ll  | 10 +++--
 test/CodeGen/X86/dag-merge-fast-accesses.ll | 12 +++++-
 test/CodeGen/X86/stores-merging.ll          | 46 ++++++++++++++++++---
 4 files changed, 72 insertions(+), 13 deletions(-)

diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 31f0df99d47..a790282fada 100644
--- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -11165,6 +11165,13 @@ void DAGCombiner::getStoreMergeAndAliasCandidates(
     if (Index->getMemoryVT() != MemVT)
       break;
 
+    // We do not allow under-aligned stores in order to prevent
+    // overriding stores. NOTE: this is a bad hack. Alignment SHOULD
+    // be irrelevant here; what MATTERS is that we not move memory
+    // operations that potentially overlap past each-other.
+    if (Index->getAlignment() < MemVT.getStoreSize())
+      break;
+
     // We found a potential memory operand to merge.
     StoreNodes.push_back(MemOpLink(Index, Ptr.Offset, Seq++));
 
@@ -11249,12 +11256,18 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode* St) {
   if (StoreNodes.size() < 2)
     return false;
 
-  // Sort the memory operands according to their distance from the base pointer.
+  // Sort the memory operands according to their distance from the
+  // base pointer.  As a secondary criteria: make sure stores coming
+  // later in the code come first in the list. This is important for
+  // the non-UseAA case, because we're merging stores into the FINAL
+  // store along a chain which potentially contains aliasing stores.
+  // Thus, if there are multiple stores to the same address, the last
+  // one can be considered for merging but not the others.
   std::sort(StoreNodes.begin(), StoreNodes.end(),
             [](MemOpLink LHS, MemOpLink RHS) {
     return LHS.OffsetFromBase < RHS.OffsetFromBase ||
            (LHS.OffsetFromBase == RHS.OffsetFromBase &&
-            LHS.SequenceNum > RHS.SequenceNum);
+            LHS.SequenceNum < RHS.SequenceNum);
   });
 
   // Scan the memory operations on the chain and find the first non-consecutive
diff --git a/test/CodeGen/X86/MergeConsecutiveStores.ll b/test/CodeGen/X86/MergeConsecutiveStores.ll
index 6670bbe0277..70af4184e8a 100644
--- a/test/CodeGen/X86/MergeConsecutiveStores.ll
+++ b/test/CodeGen/X86/MergeConsecutiveStores.ll
@@ -288,12 +288,16 @@ block4:                                       ; preds = %4, %.lr.ph
   ret void
 }
 
-;; On x86, even unaligned copies can be merged to vector ops.
+;; On x86, even unaligned copies should be merged to vector ops.
+;; TODO: however, this cannot happen at the moment, due to brokenness
+;; in MergeConsecutiveStores. See UseAA FIXME in DAGCombiner.cpp
+;; visitSTORE.
+
 ; CHECK-LABEL: merge_loads_no_align:
 ;  load:
-; CHECK: vmovups
+; CHECK-NOT: vmovups ;; TODO
 ;  store:
-; CHECK: vmovups
+; CHECK-NOT: vmovups ;; TODO
 ; CHECK: ret
 define void @merge_loads_no_align(i32 %count, %struct.B* noalias nocapture %q, %struct.B* noalias nocapture %p) nounwind uwtable noinline ssp {
   %a1 = icmp sgt i32 %count, 0
diff --git a/test/CodeGen/X86/dag-merge-fast-accesses.ll b/test/CodeGen/X86/dag-merge-fast-accesses.ll
index c5bb34013ad..49631ffa77d 100644
--- a/test/CodeGen/X86/dag-merge-fast-accesses.ll
+++ b/test/CodeGen/X86/dag-merge-fast-accesses.ll
@@ -50,11 +50,19 @@ define void @merge_vec_element_store(<4 x double> %v, double* %ptr) {
 }
 
 
+;; TODO: FAST *should* be:
+;;    movups (%rdi), %xmm0
+;;    movups %xmm0, 40(%rdi)
+;; ..but is not currently. See the UseAA FIXME in DAGCombiner.cpp
+;; visitSTORE.
+
 define void @merge_vec_load_and_stores(i64 *%ptr) {
 ; FAST-LABEL: merge_vec_load_and_stores:
 ; FAST:       # BB#0:
-; FAST-NEXT:    movups (%rdi), %xmm0
-; FAST-NEXT:    movups %xmm0, 40(%rdi)
+; FAST-NEXT:    movq (%rdi), %rax
+; FAST-NEXT:    movq 8(%rdi), %rcx
+; FAST-NEXT:    movq %rax, 40(%rdi)
+; FAST-NEXT:    movq %rcx, 48(%rdi)
 ; FAST-NEXT:    retq
 ;
 ; SLOW-LABEL: merge_vec_load_and_stores:
diff --git a/test/CodeGen/X86/stores-merging.ll b/test/CodeGen/X86/stores-merging.ll
index d6daa573b4a..9e479bd71b9 100644
--- a/test/CodeGen/X86/stores-merging.ll
+++ b/test/CodeGen/X86/stores-merging.ll
@@ -7,17 +7,51 @@ target triple = "x86_64-unknown-linux-gnu"
 
 @e = common global %structTy zeroinitializer, align 4
 
-; CHECK-LABEL: f
-define void @f() {
-entry:
+;; Ensure that MergeConsecutiveStores doesn't incorrectly reorder
+;; store operations.  The first test stores in increasing address
+;; order, the second in decreasing -- but in both cases should have
+;; the same result in memory in the end.
 
-; CHECK:   movabsq	$528280977409, %rax
+; CHECK-LABEL: redundant_stores_merging:
+; CHECK:   movl    $123, e+8(%rip)
+; CHECK:   movabsq $1958505086977, %rax
 ; CHECK:   movq    %rax, e+4(%rip)
-; CHECK:   movl    $456, e+8(%rip)
-
+define void @redundant_stores_merging() {
+entry:
   store i32 1, i32* getelementptr inbounds (%structTy, %structTy* @e, i64 0, i32 1), align 4
   store i32 123, i32* getelementptr inbounds (%structTy, %structTy* @e, i64 0, i32 2), align 4
   store i32 456, i32* getelementptr inbounds (%structTy, %structTy* @e, i64 0, i32 2), align 4
   ret void
 }
 
+;; This variant tests PR25154.
+; CHECK-LABEL: redundant_stores_merging_reverse:
+; CHECK:   movl    $123, e+8(%rip)
+; CHECK:   movabsq $1958505086977, %rax
+; CHECK:   movq    %rax, e+4(%rip)
+define void @redundant_stores_merging_reverse() {
+entry:
+  store i32 123, i32* getelementptr inbounds (%structTy, %structTy* @e, i64 0, i32 2), align 4
+  store i32 456, i32* getelementptr inbounds (%structTy, %structTy* @e, i64 0, i32 2), align 4
+  store i32 1, i32* getelementptr inbounds (%structTy, %structTy* @e, i64 0, i32 1), align 4
+  ret void
+}
+
+@b = common global [8 x i8] zeroinitializer, align 2
+
+;; The 2-byte store to offset 3 overlaps the 2-byte store to offset 2;
+;; these must not be reordered in MergeConsecutiveStores such that the
+;; store to 3 comes first (e.g. by merging the stores to 0 and 2 into
+;; a movl, after the store to 3).
+
+;; CHECK-LABEL: overlapping_stores_merging:
+;; CHECK:  movw    $0, b+2(%rip)
+;; CHECK:  movw    $2, b+3(%rip)
+;; CHECK:  movw    $1, b(%rip)
+define void @overlapping_stores_merging() {
+entry:
+  store i16 0, i16* bitcast (i8* getelementptr inbounds ([8 x i8], [8 x i8]* @b, i64 0, i64 2) to i16*), align 2
+  store i16 2, i16* bitcast (i8* getelementptr inbounds ([8 x i8], [8 x i8]* @b, i64 0, i64 3) to i16*), align 1
+  store i16 1, i16* bitcast (i8* getelementptr inbounds ([8 x i8], [8 x i8]* @b, i64 0, i64 0) to i16*), align 2
+  ret void
+}
-- 
2.34.1