When performing return slot optimization, remember to inform memdep when we're removi...

[oota-llvm.git] / lib / Transforms / Scalar / GVN.cpp
diff --git a/lib/Transforms/Scalar/GVN.cpp b/lib/Transforms/Scalar/GVN.cpp

index 3478b3108030901cfe829c461a1ce322cfc290f4..481956f6b4ce039dbb5307cc558372ce9e4c5d06 100644 (file)
--- a/lib/Transforms/Scalar/GVN.cpp
+++ b/lib/Transforms/Scalar/GVN.cpp
@@ -21,6 +21,7 @@
  #include "llvm/Function.h"
  #include "llvm/IntrinsicInst.h"
  #include "llvm/Instructions.h"
+#include "llvm/ParameterAttributes.h"
  #include "llvm/Value.h"
  #include "llvm/ADT/BitVector.h"
  #include "llvm/ADT/DenseMap.h"
@@ -33,6 +34,7 @@
  #include "llvm/Analysis/MemoryDependenceAnalysis.h"
  #include "llvm/Support/CFG.h"
  #include "llvm/Support/Compiler.h"
+#include "llvm/Target/TargetData.h"
  using namespace llvm;
  
  //===----------------------------------------------------------------------===//
@@ -720,8 +722,10 @@ namespace {
        AU.addRequired<DominatorTree>();
        AU.addRequired<MemoryDependenceAnalysis>();
        AU.addRequired<AliasAnalysis>();
+      AU.addRequired<TargetData>();
        AU.addPreserved<AliasAnalysis>();
        AU.addPreserved<MemoryDependenceAnalysis>();
+      AU.addPreserved<TargetData>();
      }
    
      // Helper fuctions
@@ -737,7 +741,10 @@ namespace {
                              SmallVector<Instruction*, 4>& toErase);
      bool processNonLocalLoad(LoadInst* L,
                               SmallVector<Instruction*, 4>& toErase);
-    bool processMemCpy(MemCpyInst* M, SmallVector<Instruction*, 4>& toErase);
+    bool processMemCpy(MemCpyInst* M, MemCpyInst* MDep,
+                       SmallVector<Instruction*, 4>& toErase);
+    bool performReturnSlotOptzn(MemCpyInst* cpy, CallInst* C,
+                                SmallVector<Instruction*, 4>& toErase);
      Value *GetValueForBlock(BasicBlock *BB, LoadInst* orig,
                              DenseMap<BasicBlock*, Value*> &Phis,
                              bool top_level = false);
@@ -857,6 +864,8 @@ Value *GVN::GetValueForBlock(BasicBlock *BB, LoadInst* orig,
      
      PN->addIncoming(val, *PI);
    }
+  AliasAnalysis& AA = getAnalysis<AliasAnalysis>();
+  AA.copyValue(orig, PN);
    
    // Attempt to collapse PHI nodes that are trivially redundant
    Value* v = CollapsePhi(PN);
@@ -1012,89 +1021,190 @@ bool GVN::processLoad(LoadInst* L,
        dep = MD.getDependency(L, dep);
      }
    }
-  
+
+  if (dep != MemoryDependenceAnalysis::None &&
+      dep != MemoryDependenceAnalysis::NonLocal &&
+      isa<AllocationInst>(dep)) {
+    // Check that this load is actually from the
+    // allocation we found
+    Value* v = L->getOperand(0);
+    while (true) {
+      if (BitCastInst *BC = dyn_cast<BitCastInst>(v))
+        v = BC->getOperand(0);
+      else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(v))
+        v = GEP->getOperand(0);
+      else
+        break;
+    }
+    if (v == dep) {
+      // If this load depends directly on an allocation, there isn't
+      // anything stored there; therefore, we can optimize this load
+      // to undef.
+      MD.removeInstruction(L);
+
+      L->replaceAllUsesWith(UndefValue::get(L->getType()));
+      toErase.push_back(L);
+      deletedLoad = true;
+      NumGVNLoad++;
+    }
+  }
+
    if (!deletedLoad)
      last = L;
    
    return deletedLoad;
  }
  
+/// isReturnSlotOptznProfitable - Return true if fusing the return slot with
+/// dest is profitable: every use of dest, through GEPs and bitcasts, is cpy.
+static bool isReturnSlotOptznProfitable(Value* dest, MemCpyInst* cpy) {
+  // We currently consider it profitable if dest is otherwise dead.
+  SmallVector<User*, 8> useList(dest->use_begin(), dest->use_end());
+  while (!useList.empty()) {
+    User* UI = useList.back();
+    // GEPs and bitcasts are looked through: their own users get checked.
+    if (isa<GetElementPtrInst>(UI) || isa<BitCastInst>(UI)) {
+      useList.pop_back();
+      for (User::use_iterator I = UI->use_begin(), E = UI->use_end();
+           I != E; ++I)
+        useList.push_back(*I);
+    } else if (UI == cpy)
+      useList.pop_back();
+    else
+      return false;
+  }
+  
+  return true;
+}
+
+/// performReturnSlotOptzn - takes a memcpy and a call that it depends on,
+/// and checks for the possibility of a return slot optimization by having
+/// the call write its result directly into the memcpy's destination, which
+/// makes the memcpy dead.  Returns true if the call was rewritten.
+bool GVN::performReturnSlotOptzn(MemCpyInst* cpy, CallInst* C,
+                                 SmallVector<Instruction*, 4>& toErase) {
+  // Deliberately get the source and destination with bitcasts stripped away,
+  // because we'll need to do type comparisons based on the underlying type.
+  Value* cpyDest = cpy->getDest();
+  Value* cpySrc = cpy->getSource();
+  CallSite CS = CallSite::get(C);
+  
+  // Since this is a return slot optimization, the value being copied must be
+  // the call's first argument, and that parameter (attribute index 1) must be
+  // marked sret and noalias so that retargeting it cannot change behavior via
+  // a global or another parameter.  NOTE(review): confirm paramHasAttr needs
+  // every bit of the combined mask to be set, rather than any one of them.
+  if (CS.arg_size() == 0 ||
+      cpySrc != CS.getArgument(0) ||
+      !CS.paramHasAttr(1, ParamAttr::NoAlias | ParamAttr::StructRet))
+    return false;
+  
+  // Check that something sneaky is not happening involving casting
+  // return slot types around.
+  if (CS.getArgument(0)->getType() != cpyDest->getType())
+    return false;
+  // sret --> pointer
+  const PointerType* PT = cast<PointerType>(cpyDest->getType()); 
+  
+  // We can only perform the transformation if the size of the memcpy
+  // is constant and equal to the size of the structure.
+  ConstantInt* cpyLength = dyn_cast<ConstantInt>(cpy->getLength());
+  if (!cpyLength)
+    return false;
+  
+  TargetData& TD = getAnalysis<TargetData>();
+  if (TD.getTypeStoreSize(PT->getElementType()) != cpyLength->getZExtValue())
+    return false;
+  
+  // We only perform the transformation if it will be profitable. 
+  if (!isReturnSlotOptznProfitable(cpyDest, cpy))
+    return false;
+  
+  // In addition to knowing that the call does not access the return slot
+  // in some unexpected manner, which we derive from the noalias attribute,
+  // we also need to know that it does not sneakily modify the destination
+  // slot in the caller.  We don't have parameter attributes to go by
+  // for this one, so we just rely on AA to figure it out for us.
+  AliasAnalysis& AA = getAnalysis<AliasAnalysis>();
+  if (AA.getModRefInfo(C, cpy->getRawDest(), cpyLength->getZExtValue()) !=
+      AliasAnalysis::NoModRef)
+    return false;
+  
+  // If all the checks have passed, then we're alright to do the transformation.
+  CS.setArgument(0, cpyDest);
+  
+  // Drop any cached information about the call, because we may have changed
+  // its dependence information by changing its parameter.
+  MemoryDependenceAnalysis& MD = getAnalysis<MemoryDependenceAnalysis>();
+  MD.dropInstruction(C);
+  
+  // Remove the memcpy
+  MD.removeInstruction(cpy);
+  toErase.push_back(cpy);
+  
+  return true;
+}
+
  /// processMemCpy - perform simplication of memcpy's.  If we have memcpy A which
  /// copies X to Y, and memcpy B which copies Y to Z, then we can rewrite B to be
  /// a memcpy from X to Z (or potentially a memmove, depending on circumstances).
  ///  This allows later passes to remove the first memcpy altogether.
-bool GVN::processMemCpy(MemCpyInst* M,
+bool GVN::processMemCpy(MemCpyInst* M, MemCpyInst* MDep,
                          SmallVector<Instruction*, 4>& toErase) {
-  MemoryDependenceAnalysis& MD = getAnalysis<MemoryDependenceAnalysis>();
-  
-  // First, we have to check that the dependency is another memcpy
-  Instruction* dep = MD.getDependency(M);
-  if  (dep == MemoryDependenceAnalysis::None ||
-       dep == MemoryDependenceAnalysis::NonLocal ||
-       !isa<MemCpyInst>(dep))
-    return false;
-  
    // We can only transforms memcpy's where the dest of one is the source of the
    // other
-  MemCpyInst* MDep = cast<MemCpyInst>(dep);
    if (M->getSource() != MDep->getDest())
      return false;
    
    // Second, the length of the memcpy's must be the same, or the preceeding one
    // must be larger than the following one.
-  Value* DepLength = MDep->getLength();
-  uint64_t CpySize = ~0UL;
-  uint64_t DepSize = ~0UL;
-  if (isa<ConstantInt>(DepLength)) {
-    if (isa<ConstantInt>(M->getLength())) {
-      if (cast<ConstantInt>(DepLength)->getLimitedValue() <
-          cast<ConstantInt>(M->getLength())->getLimitedValue()) {
-        return false;
-      } else {
-        CpySize = cast<ConstantInt>(M->getLength())->getLimitedValue();
-        DepSize = cast<ConstantInt>(DepLength)->getLimitedValue();
-      }
-    } else {
-      return false;
-    }
-  } else {
+  ConstantInt* C1 = dyn_cast<ConstantInt>(MDep->getLength());
+  ConstantInt* C2 = dyn_cast<ConstantInt>(M->getLength());
+  if (!C1 || !C2)
+    return false;
+  
+  uint64_t CpySize = C1->getValue().getZExtValue();
+  uint64_t DepSize = C2->getValue().getZExtValue();
+  
+  if (DepSize < CpySize)
      return false;
-  }
    
    // Finally, we have to make sure that the dest of the second does not
    // alias the source of the first
    AliasAnalysis& AA = getAnalysis<AliasAnalysis>();
    if (AA.alias(M->getRawDest(), CpySize, MDep->getRawSource(), DepSize) !=
-      AliasAnalysis::NoAlias) {
-    // If they don't, we can still make the transformation by first turning M
-    // into a memmove rather than a memcpy.
-    bool is32bit = M->getIntrinsicID() == Intrinsic::memcpy_i32;
-    Function* MemMoveFun = Intrinsic::getDeclaration(
+      AliasAnalysis::NoAlias)
+    return false;
+  else if (AA.alias(M->getRawDest(), CpySize, M->getRawSource(), CpySize) !=
+           AliasAnalysis::NoAlias)
+    return false;
+  else if (AA.alias(MDep->getRawDest(), DepSize, MDep->getRawSource(), DepSize)
+           != AliasAnalysis::NoAlias)
+    return false;
+  
+  // If all checks passed, then we can transform these memcpy's
+  Function* MemCpyFun = Intrinsic::getDeclaration(
                                   M->getParent()->getParent()->getParent(),
-                                 is32bit ? Intrinsic::memmove_i32 : 
-                                           Intrinsic::memmove_i64);
-    
-    std::vector<Value*> args;
-    args.push_back(M->getRawDest());
-    args.push_back(MDep->getRawSource());
-    args.push_back(M->getLength());
-    args.push_back(M->getAlignment());
-                                           
-    new CallInst(MemMoveFun, args.begin(), args.end(), "", M);
+                                 M->getIntrinsicID());
      
-    MD.removeInstruction(M);
+  std::vector<Value*> args;
+  args.push_back(M->getRawDest());
+  args.push_back(MDep->getRawSource());
+  args.push_back(M->getLength());
+  args.push_back(M->getAlignment());
+  
+  CallInst* C = new CallInst(MemCpyFun, args.begin(), args.end(), "", M);
+  
+  MemoryDependenceAnalysis& MD = getAnalysis<MemoryDependenceAnalysis>();
+  if (MD.getDependency(C) == MDep) {
+    MD.dropInstruction(M);
      toErase.push_back(M);
-    
      return true;
+  } else {
+    MD.removeInstruction(C);
+    toErase.push_back(C);
+    return false;
    }
-  
-  // If all checks passed, then we can transform these memcpy's
-  M->setSource(MDep->getRawSource());
-  
-  // Reset dependence information for the memcpy
-  MD.removeInstruction(M);
-  
-  return true;
  }
  
  /// processInstruction - When calculating availability, handle an instruction
@@ -1106,7 +1216,20 @@ bool GVN::processInstruction(Instruction* I,
    if (LoadInst* L = dyn_cast<LoadInst>(I)) {
      return processLoad(L, lastSeenLoad, toErase);
    } else if (MemCpyInst* M = dyn_cast<MemCpyInst>(I)) {
-    return processMemCpy(M, toErase);
+    MemoryDependenceAnalysis& MD = getAnalysis<MemoryDependenceAnalysis>();
+
+    // There are two possible optimizations we can do for memcpy:
+    //   a) memcpy-memcpy xform which exposes redundancy for DSE
+    //   b) call-memcpy xform for sret return slot optimization
+    Instruction* dep = MD.getDependency(M);
+    if (dep == MemoryDependenceAnalysis::None ||
+        dep == MemoryDependenceAnalysis::NonLocal)
+      return false;
+    if (MemCpyInst *MemCpy = dyn_cast<MemCpyInst>(dep))
+      return processMemCpy(M, MemCpy, toErase);
+    if (CallInst* C = dyn_cast<CallInst>(dep))
+      return performReturnSlotOptzn(M, C, toErase);
+    return false;
    }
    
    unsigned num = VN.lookup_or_add(I);
@@ -1216,8 +1339,9 @@ bool GVN::iterateOnFunction(Function &F) {
        ++BI;
  
        for (SmallVector<Instruction*, 4>::iterator I = toErase.begin(),
-           E = toErase.end(); I != E; ++I)
+           E = toErase.end(); I != E; ++I) {
          (*I)->eraseFromParent();
+      }
  
        toErase.clear();
      }