/// that dominated values can succeed in their lookup.
ScopedHTType AvailableValues;
- /// \brief A scoped hash table of the current values of loads.
+ /// A scoped hash table of the current values of previously encountered
+ /// memory locations.
///
- /// This allows us to get efficient access to dominating loads when we have
- /// a fully redundant load. In addition to the most recent load, we keep
- /// track of a generation count of the read, which is compared against the
- /// current generation count. The current generation count is incremented
+ /// This allows us to get efficient access to dominating loads or stores when
+ /// we have a fully redundant load. In addition to the most recent load, we
+ /// keep track of a generation count of the read, which is compared against
+ /// the current generation count. The current generation count is incremented
/// after every possibly writing memory operation, which ensures that we only
- /// CSE loads with other loads that have no intervening store.
+ /// CSE loads with other loads that have no intervening store. Ordering
+ /// events (such as fences or atomic instructions) increment the generation
+ /// count as well; essentially, we model these as writes to all possible
+ /// locations. Note that atomic and/or volatile loads and stores can be
+ /// present in the table; it is the responsibility of the consumer to
+ /// inspect the atomicity/volatility if needed.
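+ ///
+ /// For example, given the following sequence (illustrative only):
+ ///   %a = load i32, i32* %p    ; recorded at generation G
+ ///   store i32 0, i32* %q      ; may write memory, generation becomes G+1
+ ///   %b = load i32, i32* %p    ; lookup finds %a at stale generation G,
+ ///                             ; so %b is not CSE'd with %a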
struct LoadValue {
Value *Data;
unsigned Generation;
int MatchingId;
- LoadValue() : Data(nullptr), Generation(0), MatchingId(-1) {}
- LoadValue(Value *Data, unsigned Generation, unsigned MatchingId)
- : Data(Data), Generation(Generation), MatchingId(MatchingId) {}
+ bool IsAtomic;
+ LoadValue()
+ : Data(nullptr), Generation(0), MatchingId(-1), IsAtomic(false) {}
+ LoadValue(Value *Data, unsigned Generation, unsigned MatchingId,
+ bool IsAtomic)
+ : Data(Data), Generation(Generation), MatchingId(MatchingId),
+ IsAtomic(IsAtomic) {}
};
typedef RecyclingAllocator<BumpPtrAllocator,
ScopedHashTableVal<Value *, LoadValue>>
}
return Inst->isAtomic();
}
+ bool isAtomic() const {
+ if (IsTargetMemInst) {
+ assert(Info.IsSimple && "need to refine IsSimple in TTI");
+ return false;
+ }
+ return Inst->isAtomic();
+ }
+ bool isUnordered() const {
+ if (IsTargetMemInst) {
+ assert(Info.IsSimple && "need to refine IsSimple in TTI");
+ return true;
+ }
+ if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) {
+ return LI->isUnordered();
+ } else if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
+ return SI->isUnordered();
+ }
+ // Conservative answer
+ return !Inst->isAtomic();
+ }
+
+ bool isVolatile() const {
+ if (IsTargetMemInst) {
+ assert(Info.IsSimple && "need to refine IsSimple in TTI");
+ return false;
+ }
+ if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) {
+ return LI->isVolatile();
+ } else if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
+ return SI->isVolatile();
+ }
+ // Conservative answer
+ return true;
+ }
+
bool isMatchingMemLoc(const ParseMemoryInst &Inst) const {
return (getPointerOperand() == Inst.getPointerOperand() &&
getMatchingId() == Inst.getMatchingId());
ParseMemoryInst MemInst(Inst, TTI);
- // If this is a non-volatile load, process it.
+ // If this is a load (volatile or not), process it.
if (MemInst.isValid() && MemInst.isLoad()) {
- // Ignore volatile or ordered loads.
- if (!MemInst.isSimple()) {
+ // (Conservatively) we can't peek past the ordering implied by this
+ // operation, but we can add this load to our set of available values.
+ if (MemInst.isVolatile() || !MemInst.isUnordered()) {
LastStore = nullptr;
- // Don't CSE across synchronization boundaries.
- if (Inst->mayWriteToMemory())
- ++CurrentGeneration;
- continue;
+ ++CurrentGeneration;
}
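+ // Note that the load itself is still recorded below as an available
+ // value: forwarding from an atomic or volatile load to a later simple
+ // load of the same location is legal; we just can't CSE or DSE across
+ // the ordering the earlier load implies.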
// If we have an available version of this load, and if it is the right
// generation, replace this instruction.
LoadValue InVal = AvailableLoads.lookup(MemInst.getPointerOperand());
if (InVal.Data != nullptr && InVal.Generation == CurrentGeneration &&
- InVal.MatchingId == MemInst.getMatchingId()) {
+ InVal.MatchingId == MemInst.getMatchingId() &&
+ // We don't yet handle removing loads with ordering of any kind.
+ !MemInst.isVolatile() && MemInst.isUnordered() &&
+ // We can't replace an atomic load with one which isn't also atomic.
+ InVal.IsAtomic >= MemInst.isAtomic()) {
Value *Op = getOrCreateResult(InVal.Data, Inst->getType());
if (Op != nullptr) {
DEBUG(dbgs() << "EarlyCSE CSE LOAD: " << *Inst
// Otherwise, remember that we have this instruction.
AvailableLoads.insert(
MemInst.getPointerOperand(),
- LoadValue(Inst, CurrentGeneration, MemInst.getMatchingId()));
+ LoadValue(Inst, CurrentGeneration, MemInst.getMatchingId(),
+ MemInst.isAtomic()));
LastStore = nullptr;
continue;
}
if (MemInst.isValid() && MemInst.isStore()) {
// We do a trivial form of DSE if there are two stores to the same
- // location with no intervening loads. Delete the earlier store.
+ // location with no intervening loads. Delete the earlier store. Note
+ // that we can delete an earlier simple store even if the following one
+ // is an ordered, volatile, or atomic store.
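+ // For example, in:
+ //   store i32 0, i32* %p
+ //   store atomic i32 1, i32* %p unordered, align 4
+ // the first store is dead and can be deleted.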
if (LastStore) {
ParseMemoryInst LastStoreMemInst(LastStore, TTI);
+ assert(LastStoreMemInst.isSimple() && "Violated invariant");
if (LastStoreMemInst.isMatchingMemLoc(MemInst)) {
DEBUG(dbgs() << "EarlyCSE DEAD STORE: " << *LastStore
<< " due to: " << *Inst << '\n');
// the store.
AvailableLoads.insert(
MemInst.getPointerOperand(),
- LoadValue(Inst, CurrentGeneration, MemInst.getMatchingId()));
+ LoadValue(Inst, CurrentGeneration, MemInst.getMatchingId(),
+ MemInst.isAtomic()));
// Remember that this was the last normal store we saw for DSE.
+ // Note that we can't delete an earlier atomic or volatile store in
+ // favor of a later one which isn't. We could in principle remove an
+ // earlier unordered store if the later one is also unordered.
if (MemInst.isSimple())
LastStore = Inst;
+ else
+ LastStore = nullptr;
}
}
}
--- /dev/null
+; RUN: opt < %s -S -early-cse | FileCheck %s
+
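+; Can't CSE a simple load across a seq_cst atomic load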
+; CHECK-LABEL: @test12(
+define i32 @test12(i1 %B, i32* %P1, i32* %P2) {
+ %load0 = load i32, i32* %P1
+ %1 = load atomic i32, i32* %P2 seq_cst, align 4
+ %load1 = load i32, i32* %P1
+ %sel = select i1 %B, i32 %load0, i32 %load1
+ ret i32 %sel
+ ; CHECK: load i32, i32* %P1
+ ; CHECK: load i32, i32* %P1
+}
+
+; CHECK-LABEL: @test13(
+; atomic to non-atomic forwarding is legal
+define i32 @test13(i1 %B, i32* %P1) {
+ %a = load atomic i32, i32* %P1 seq_cst, align 4
+ %b = load i32, i32* %P1
+ %res = sub i32 %a, %b
+ ret i32 %res
+ ; CHECK: load atomic i32, i32* %P1
+ ; CHECK: ret i32 0
+}
+
+; CHECK-LABEL: @test14(
+; atomic to unordered atomic forwarding is legal
+define i32 @test14(i1 %B, i32* %P1) {
+ %a = load atomic i32, i32* %P1 seq_cst, align 4
+ %b = load atomic i32, i32* %P1 unordered, align 4
+ %res = sub i32 %a, %b
+ ret i32 %res
+ ; CHECK: load atomic i32, i32* %P1
+ ; CHECK: ret i32 0
+}
+
+; CHECK-LABEL: @test15(
+; Implementation restriction: we can't forward to a load with
+; ordering stronger than unordered
+define i32 @test15(i1 %B, i32* %P1, i32* %P2) {
+ %a = load atomic i32, i32* %P1 seq_cst, align 4
+ %b = load atomic i32, i32* %P1 seq_cst, align 4
+ %res = sub i32 %a, %b
+ ret i32 %res
+ ; CHECK: load atomic i32, i32* %P1
+ ; CHECK: load atomic i32, i32* %P1
+}
+
+; CHECK-LABEL: @test16(
+; forwarding non-atomic to atomic is wrong! (However,
+; it would be legal to use the later value in place of the
+; former in this particular example. We just don't
+; do that right now.)
+define i32 @test16(i1 %B, i32* %P1, i32* %P2) {
+ %a = load i32, i32* %P1, align 4
+ %b = load atomic i32, i32* %P1 unordered, align 4
+ %res = sub i32 %a, %b
+ ret i32 %res
+ ; CHECK: load i32, i32* %P1
+ ; CHECK: load atomic i32, i32* %P1
+}
+
+; Can't DSE across a seq_cst store (it acts as a full fence here)
+define void @test17(i1 %B, i32* %P1, i32* %P2) {
+; CHECK-LABEL: @test17
+; CHECK: store
+; CHECK: store atomic
+; CHECK: store
+ store i32 0, i32* %P1, align 4
+ store atomic i32 0, i32* %P2 seq_cst, align 4
+ store i32 0, i32* %P1, align 4
+ ret void
+}
+
+; Can't remove a volatile load
+define i32 @test18(i1 %B, i32* %P1, i32* %P2) {
+ %a = load i32, i32* %P1, align 4
+ %b = load volatile i32, i32* %P1, align 4
+ %res = sub i32 %a, %b
+ ret i32 %res
+ ; CHECK-LABEL: @test18
+ ; CHECK: load i32, i32* %P1
+ ; CHECK: load volatile i32, i32* %P1
+}
+
+; Can't DSE a volatile store
+define void @test19(i1 %B, i32* %P1, i32* %P2) {
+; CHECK-LABEL: @test19
+; CHECK: store volatile
+; CHECK: store
+ store volatile i32 0, i32* %P1, align 4
+ store i32 3, i32* %P1, align 4
+ ret void
+}
+
+; Can value forward from volatile loads
+define i32 @test20(i1 %B, i32* %P1, i32* %P2) {
+ %a = load volatile i32, i32* %P1, align 4
+ %b = load i32, i32* %P1, align 4
+ %res = sub i32 %a, %b
+ ret i32 %res
+ ; CHECK-LABEL: @test20
+ ; CHECK: load volatile i32, i32* %P1
+ ; CHECK: ret i32 0
+}
+
+; It would be legal to DSE a non-volatile store in favor of a volatile one;
+; currently this is a missed optimization
+define void @test21(i1 %B, i32* %P1, i32* %P2) {
+; CHECK-LABEL: @test21
+; CHECK: store
+; CHECK: store volatile
+ store i32 0, i32* %P1, align 4
+ store volatile i32 3, i32* %P1, align 4
+ ret void
+}
+
+; Can DSE a normal store in favor of an unordered one
+define void @test22(i1 %B, i32* %P1, i32* %P2) {
+; CHECK-LABEL: @test22
+; CHECK-NEXT: store atomic
+ store i32 0, i32* %P1, align 4
+ store atomic i32 3, i32* %P1 unordered, align 4
+ ret void
+}