When performing return slot optimization, remember to inform memdep when we're removi...

[oota-llvm.git] / lib / Transforms / Scalar / GVN.cpp
diff --git a/lib/Transforms/Scalar/GVN.cpp b/lib/Transforms/Scalar/GVN.cpp

index a4f78fe4573c4291a960d2490e887b64039295f8..481956f6b4ce039dbb5307cc558372ce9e4c5d06 100644 (file)
--- a/lib/Transforms/Scalar/GVN.cpp
+++ b/lib/Transforms/Scalar/GVN.cpp
@@ -34,6 +34,7 @@
  #include "llvm/Analysis/MemoryDependenceAnalysis.h"
  #include "llvm/Support/CFG.h"
  #include "llvm/Support/Compiler.h"
+#include "llvm/Target/TargetData.h"
  using namespace llvm;
  
  //===----------------------------------------------------------------------===//
@@ -721,8 +722,10 @@ namespace {
        AU.addRequired<DominatorTree>();
        AU.addRequired<MemoryDependenceAnalysis>();
        AU.addRequired<AliasAnalysis>();
+      AU.addRequired<TargetData>();
        AU.addPreserved<AliasAnalysis>();
        AU.addPreserved<MemoryDependenceAnalysis>();
+      AU.addPreserved<TargetData>();
      }
    
      // Helper fuctions
@@ -738,7 +741,8 @@ namespace {
                              SmallVector<Instruction*, 4>& toErase);
      bool processNonLocalLoad(LoadInst* L,
                               SmallVector<Instruction*, 4>& toErase);
-    bool processMemCpy(MemCpyInst* M, SmallVector<Instruction*, 4>& toErase);
+    bool processMemCpy(MemCpyInst* M, MemCpyInst* MDep,
+                       SmallVector<Instruction*, 4>& toErase);
      bool performReturnSlotOptzn(MemCpyInst* cpy, CallInst* C,
                                  SmallVector<Instruction*, 4>& toErase);
      Value *GetValueForBlock(BasicBlock *BB, LoadInst* orig,
@@ -1051,57 +1055,91 @@ bool GVN::processLoad(LoadInst* L,
    return deletedLoad;
  }
  
+/// isReturnSlotOptznProfitable - Determine if performing a return slot 
+/// fusion with the slot dest is profitable
+static bool isReturnSlotOptznProfitable(Value* dest, MemCpyInst* cpy) {
+  // We currently consider it profitable if dest is otherwise dead.
+  SmallVector<User*, 8> useList(dest->use_begin(), dest->use_end());
+  while (!useList.empty()) {
+    User* UI = useList.back();
+    
+    if (isa<GetElementPtrInst>(UI) || isa<BitCastInst>(UI)) {
+      useList.pop_back();
+      for (User::use_iterator I = UI->use_begin(), E = UI->use_end();
+           I != E; ++I)
+        useList.push_back(*I);
+    } else if (UI == cpy)
+      useList.pop_back();
+    else
+      return false;
+  }
+  
+  return true;
+}
+
  /// performReturnSlotOptzn - takes a memcpy and a call that it depends on,
  /// and checks for the possibility of a return slot optimization by having
  /// the call write its result directly into the callees return parameter
  /// rather than using memcpy
  bool GVN::performReturnSlotOptzn(MemCpyInst* cpy, CallInst* C,
                                   SmallVector<Instruction*, 4>& toErase) {
-  // Check that we're copying to an argument...
+  // Deliberately get the source and destination with bitcasts stripped away,
+  // because we'll need to do type comparisons based on the underlying type.
    Value* cpyDest = cpy->getDest();
-  if (!isa<Argument>(cpyDest))
+  Value* cpySrc = cpy->getSource();
+  CallSite CS = CallSite::get(C);
+  
+  // Since this is a return slot optimization, we need to make sure that
+  // the value being copied is, in fact, in a return slot.  We also need to
+  // check that the return slot parameter is marked noalias, so that we can
+  // be sure that changing it will not cause unexpected behavior changes due
+  // to it being accessed through a global or another parameter.
+  if (CS.arg_size() == 0 ||
+      cpySrc != CS.getArgument(0) ||
+      !CS.paramHasAttr(1, ParamAttr::NoAlias | ParamAttr::StructRet))
      return false;
    
-  // And that the argument is the return slot
-  Argument* sretArg = cast<Argument>(cpyDest);
-  if (!sretArg->hasStructRetAttr())
+  // Check that something sneaky is not happening involving casting
+  // return slot types around.
+  if (CS.getArgument(0)->getType() != cpyDest->getType())
      return false;
+  // sret --> pointer
+  const PointerType* PT = cast<PointerType>(cpyDest->getType()); 
    
-  // Make sure the return slot is otherwise dead
-  std::set<User*> useList(sretArg->use_begin(), sretArg->use_end());
-  while (!useList.empty()) {
-    User* UI = *useList.begin();
-    
-    if (isa<GetElementPtrInst>(UI) || isa<BitCastInst>(UI)) {
-      useList.insert(UI->use_begin(), UI->use_end());
-      useList.erase(UI);
-    } else if (UI == cpy)
-      useList.erase(UI);
-    else
-      return false;
-  }
+  // We can only perform the transformation if the size of the memcpy
+  // is constant and equal to the size of the structure.
+  ConstantInt* cpyLength = dyn_cast<ConstantInt>(cpy->getLength());
+  if (!cpyLength)
+    return false;
    
-  // Make sure the call cannot modify the return slot in some unpredicted way
+  TargetData& TD = getAnalysis<TargetData>();
+  if (TD.getTypeStoreSize(PT->getElementType()) != cpyLength->getZExtValue())
+    return false;
+  
+  // We only perform the transformation if it will be profitable. 
+  if (!isReturnSlotOptznProfitable(cpyDest, cpy))
+    return false;
+  
+  // In addition to knowing that the call does not access the return slot
+  // in some unexpected manner, which we derive from the noalias attribute,
+  // we also need to know that it does not sneakily modify the destination
+  // slot in the caller.  We don't have parameter attributes to go by
+  // for this one, so we just rely on AA to figure it out for us.
    AliasAnalysis& AA = getAnalysis<AliasAnalysis>();
-  if (AA.getModRefInfo(C, cpy->getRawDest(), ~0UL) != AliasAnalysis::NoModRef)
+  if (AA.getModRefInfo(C, cpy->getRawDest(), cpyLength->getZExtValue()) !=
+      AliasAnalysis::NoModRef)
      return false;
    
-  // If all checks passed, then we can perform the transformation
-  CallSite CS = CallSite::get(C);
-  for (unsigned i = 0; i < CS.arg_size(); ++i) {
-    if (CS.paramHasAttr(i+1, ParamAttr::StructRet)) {
-      if (CS.getArgument(i)->getType() != cpyDest->getType())
-        return false;
-      
-      CS.setArgument(i, cpyDest);
-      break;
-    }
-  }
+  // If all the checks have passed, then we're alright to do the transformation.
+  CS.setArgument(0, cpyDest);
    
+  // Drop any cached information about the call, because we may have changed
+  // its dependence information by changing its parameter.
    MemoryDependenceAnalysis& MD = getAnalysis<MemoryDependenceAnalysis>();
    MD.dropInstruction(C);
    
    // Remove the memcpy
+  MD.removeInstruction(cpy);
    toErase.push_back(cpy);
    
    return true;
@@ -1111,25 +1149,10 @@ bool GVN::performReturnSlotOptzn(MemCpyInst* cpy, CallInst* C,
  /// copies X to Y, and memcpy B which copies Y to Z, then we can rewrite B to be
  /// a memcpy from X to Z (or potentially a memmove, depending on circumstances).
  ///  This allows later passes to remove the first memcpy altogether.
-bool GVN::processMemCpy(MemCpyInst* M,
+bool GVN::processMemCpy(MemCpyInst* M, MemCpyInst* MDep,
                          SmallVector<Instruction*, 4>& toErase) {
-  MemoryDependenceAnalysis& MD = getAnalysis<MemoryDependenceAnalysis>();
-  
-  // First, we have to check that the dependency is another memcpy
-  Instruction* dep = MD.getDependency(M);
-  if  (dep == MemoryDependenceAnalysis::None ||
-       dep == MemoryDependenceAnalysis::NonLocal)
-    return false;
-  else if (!isa<MemCpyInst>(dep)) {
-    if (CallInst* C = dyn_cast<CallInst>(dep))
-      return performReturnSlotOptzn(M, C, toErase);
-    else
-      return false;
-  }
-  
    // We can only transforms memcpy's where the dest of one is the source of the
    // other
-  MemCpyInst* MDep = cast<MemCpyInst>(dep);
    if (M->getSource() != MDep->getDest())
      return false;
    
@@ -1160,11 +1183,9 @@ bool GVN::processMemCpy(MemCpyInst* M,
      return false;
    
    // If all checks passed, then we can transform these memcpy's
-  bool is32bit = M->getIntrinsicID() == Intrinsic::memcpy_i32;
-  Function* MemMoveFun = Intrinsic::getDeclaration(
+  Function* MemCpyFun = Intrinsic::getDeclaration(
                                   M->getParent()->getParent()->getParent(),
-                                 is32bit ? Intrinsic::memcpy_i32 : 
-                                           Intrinsic::memcpy_i64);
+                                 M->getIntrinsicID());
      
    std::vector<Value*> args;
    args.push_back(M->getRawDest());
@@ -1172,8 +1193,9 @@ bool GVN::processMemCpy(MemCpyInst* M,
    args.push_back(M->getLength());
    args.push_back(M->getAlignment());
    
-  CallInst* C = new CallInst(MemMoveFun, args.begin(), args.end(), "", M);
+  CallInst* C = new CallInst(MemCpyFun, args.begin(), args.end(), "", M);
    
+  MemoryDependenceAnalysis& MD = getAnalysis<MemoryDependenceAnalysis>();
    if (MD.getDependency(C) == MDep) {
      MD.dropInstruction(M);
      toErase.push_back(M);
@@ -1194,7 +1216,20 @@ bool GVN::processInstruction(Instruction* I,
    if (LoadInst* L = dyn_cast<LoadInst>(I)) {
      return processLoad(L, lastSeenLoad, toErase);
    } else if (MemCpyInst* M = dyn_cast<MemCpyInst>(I)) {
-    return processMemCpy(M, toErase);
+    MemoryDependenceAnalysis& MD = getAnalysis<MemoryDependenceAnalysis>();
+
+    // The are two possible optimizations we can do for memcpy:
+    //   a) memcpy-memcpy xform which exposes redundance for DSE
+    //   b) call-memcpy xform for sret return slot optimization
+    Instruction* dep = MD.getDependency(M);
+    if (dep == MemoryDependenceAnalysis::None ||
+        dep == MemoryDependenceAnalysis::NonLocal)
+      return false;
+    if (MemCpyInst *MemCpy = dyn_cast<MemCpyInst>(dep))
+      return processMemCpy(M, MemCpy, toErase);
+    if (CallInst* C = dyn_cast<CallInst>(dep))
+      return performReturnSlotOptzn(M, C, toErase);
+    return false;
    }
    
    unsigned num = VN.lookup_or_add(I);