From 4b4923953554f6bd467b09b540963e38c5446c82 Mon Sep 17 00:00:00 2001
From: Amaury Sechet
Date: Tue, 5 Jan 2016 20:17:48 +0000
Subject: [PATCH] Implement load to store => memcpy in MemCpyOpt for aggregates

Summary:
Most of the toolchain is able to optimize scalar and memcpy-like operations
efficiently, while it isn't as good with aggregates. In order to improve
support for aggregates, we try to change aggregate manipulations into either
scalar or memcpy-like ones whenever possible, without losing information.
This is one such opportunity.

Reviewers: craig.topper, spatel, dexonsmith, Prazek, chandlerc

Subscribers: llvm-commits

Differential Revision: http://reviews.llvm.org/D15894

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@256868 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Scalar/MemCpyOptimizer.cpp | 84 ++++++++++++++++++++---
 test/Transforms/MemCpyOpt/fca2memcpy.ll   | 47 +++++++++++++
 2 files changed, 120 insertions(+), 11 deletions(-)
 create mode 100644 test/Transforms/MemCpyOpt/fca2memcpy.ll

diff --git a/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/lib/Transforms/Scalar/MemCpyOptimizer.cpp
index 0333bf2284e..94725db56b8 100644
--- a/lib/Transforms/Scalar/MemCpyOptimizer.cpp
+++ b/lib/Transforms/Scalar/MemCpyOptimizer.cpp
@@ -481,6 +481,17 @@ Instruction *MemCpyOpt::tryMergingIntoMemset(Instruction *StartInst,
   return AMemSet;
 }
 
+static unsigned findCommonAlignment(const DataLayout &DL, const StoreInst *SI,
+                                    const LoadInst *LI) {
+  unsigned StoreAlign = SI->getAlignment();
+  if (!StoreAlign)
+    StoreAlign = DL.getABITypeAlignment(SI->getOperand(0)->getType());
+  unsigned LoadAlign = LI->getAlignment();
+  if (!LoadAlign)
+    LoadAlign = DL.getABITypeAlignment(LI->getType());
+
+  return std::min(StoreAlign, LoadAlign);
+}
 
 bool MemCpyOpt::processStore(StoreInst *SI, BasicBlock::iterator &BBI) {
   if (!SI->isSimple()) return false;
@@ -496,12 +507,70 @@ bool MemCpyOpt::processStore(StoreInst *SI, BasicBlock::iterator &BBI) {
 
   const DataLayout &DL = SI->getModule()->getDataLayout();
 
-  // Detect cases where we're performing call slot forwarding, but
-  // happen to be using a load-store pair to implement it, rather than
-  // a memcpy.
+  // Load to store forwarding can be interpreted as a memcpy.
   if (LoadInst *LI = dyn_cast<LoadInst>(SI->getOperand(0))) {
     if (LI->isSimple() && LI->hasOneUse() &&
        LI->getParent() == SI->getParent()) {
+
+      auto *T = LI->getType();
+      if (T->isAggregateType()) {
+        AliasAnalysis &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
+        MemoryLocation LoadLoc = MemoryLocation::get(LI);
+
+        // We use alias analysis to check if an instruction may store to
+        // the memory we load from between the load and the store. If
+        // such an instruction is found, we store it in AI.
+        Instruction *AI = nullptr;
+        for (BasicBlock::iterator I = ++LI->getIterator(), E = SI->getIterator();
+             I != E; ++I) {
+          if (AA.getModRefInfo(&*I, LoadLoc) & MRI_Mod) {
+            AI = &*I;
+            break;
+          }
+        }
+
+        // If no aliasing instruction is found, then we can promote the
+        // load/store pair to a memcpy at the store location.
+        if (!AI) {
+          // If we load from memory that may alias the memory we store to,
+          // memmove must be used to preserve semantics. If not, memcpy can
+          // be used.
+          bool UseMemMove = false;
+          if (!AA.isNoAlias(MemoryLocation::get(SI), LoadLoc))
+            UseMemMove = true;
+
+          unsigned Align = findCommonAlignment(DL, SI, LI);
+          uint64_t Size = DL.getTypeStoreSize(T);
+
+          IRBuilder<> Builder(SI);
+          Instruction *M;
+          if (UseMemMove)
+            M = Builder.CreateMemMove(SI->getPointerOperand(),
+                                      LI->getPointerOperand(), Size,
+                                      Align, SI->isVolatile());
+          else
+            M = Builder.CreateMemCpy(SI->getPointerOperand(),
+                                     LI->getPointerOperand(), Size,
+                                     Align, SI->isVolatile());
+
+          DEBUG(dbgs() << "Promoting " << *LI << " to " << *SI
+                       << " => " << *M << "\n");
+
+          MD->removeInstruction(SI);
+          SI->eraseFromParent();
+          MD->removeInstruction(LI);
+          LI->eraseFromParent();
+          ++NumMemCpyInstr;
+
+          // Make sure we do not invalidate the iterator.
+          BBI = M->getIterator();
+          return true;
+        }
+      }
+
+      // Detect cases where we're performing call slot forwarding, but
+      // happen to be using a load-store pair to implement it, rather than
+      // a memcpy.
       MemDepResult ldep = MD->getDependency(LI);
       CallInst *C = nullptr;
       if (ldep.isClobber() && !isa<MemCpyInst>(ldep.getInst()))
@@ -522,18 +591,11 @@ bool MemCpyOpt::processStore(StoreInst *SI, BasicBlock::iterator &BBI) {
       }
 
       if (C) {
-        unsigned storeAlign = SI->getAlignment();
-        if (!storeAlign)
-          storeAlign = DL.getABITypeAlignment(SI->getOperand(0)->getType());
-        unsigned loadAlign = LI->getAlignment();
-        if (!loadAlign)
-          loadAlign = DL.getABITypeAlignment(LI->getType());
-
         bool changed = performCallSlotOptzn(
             LI, SI->getPointerOperand()->stripPointerCasts(),
             LI->getPointerOperand()->stripPointerCasts(),
             DL.getTypeStoreSize(SI->getOperand(0)->getType()),
-            std::min(storeAlign, loadAlign), C);
+            findCommonAlignment(DL, SI, LI), C);
         if (changed) {
           MD->removeInstruction(SI);
           SI->eraseFromParent();
diff --git a/test/Transforms/MemCpyOpt/fca2memcpy.ll b/test/Transforms/MemCpyOpt/fca2memcpy.ll
new file mode 100644
index 00000000000..61ec7f1d914
--- /dev/null
+++ b/test/Transforms/MemCpyOpt/fca2memcpy.ll
@@ -0,0 +1,47 @@
+; RUN: opt -memcpyopt -S < %s | FileCheck %s
+
+target datalayout = "e-i64:64-f80:128-n8:16:32:64"
+target triple = "x86_64-unknown-linux-gnu"
+
+%S = type { i8*, i32 }
+
+define void @copy(%S* %src, %S* %dst) {
+; CHECK-LABEL: copy
+; CHECK: call void @llvm.memmove.p0i8.p0i8.i64
+; CHECK-NEXT: ret void
+  %1 = load %S, %S* %src
+  store %S %1, %S* %dst
+  ret void
+}
+
+define void @noaliassrc(%S* noalias %src, %S* %dst) {
+; CHECK-LABEL: noaliassrc
+; CHECK: call void @llvm.memcpy.p0i8.p0i8.i64
+; CHECK-NEXT: ret void
+  %1 = load %S, %S* %src
+  store %S %1, %S* %dst
+  ret void
+}
+
+define void @noaliasdst(%S* %src, %S* noalias %dst) {
+; CHECK-LABEL: noaliasdst
+; CHECK: call void @llvm.memcpy.p0i8.p0i8.i64
+; CHECK-NEXT: ret void
+  %1 = load %S, %S* %src
+  store %S %1, %S* %dst
+  ret void
+}
+
+define void @copyalias(%S* %src, %S* %dst) {
+; CHECK-LABEL: copyalias
+; CHECK-NEXT: [[LOAD:%[a-z0-9\.]+]] = load %S, %S* %src
+; CHECK-NOT: load
+; CHECK: call void @llvm.memmove.p0i8.p0i8.i64
+; CHECK-NEXT: store %S [[LOAD]], %S* %dst
+; CHECK-NEXT: ret void
+  %1 = load %S, %S* %src
+  %2 = load %S, %S* %src
+  store %S %1, %S* %dst
+  store %S %2, %S* %dst
+  ret void
+}
--
2.34.1
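
For illustration, a minimal sketch (not part of the commit) of the rewrite the
new code path performs, using the %S type from the test above. An aggregate
load/store pair such as

    %1 = load %S, %S* %src
    store %S %1, %S* %dst

is promoted to a single intrinsic call at the store's position. Assuming the
test's datalayout (store size 16 bytes, ABI alignment 8) and the five-operand
intrinsic signature in use at this revision, the result when %src and %dst may
alias is roughly:

    ; size/alignment below assume the test's datalayout; value names are
    ; illustrative, not what IRBuilder would necessarily pick
    %2 = bitcast %S* %dst to i8*
    %3 = bitcast %S* %src to i8*
    call void @llvm.memmove.p0i8.p0i8.i64(i8* %2, i8* %3, i64 16, i32 8, i1 false)

When alias analysis proves the two locations disjoint (e.g. via a noalias
argument), @llvm.memcpy.p0i8.p0i8.i64 is emitted instead, as the noaliassrc
and noaliasdst tests check.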