More effective breakdown of memcpy into repeated load/store. These are now

author Dale Johannesen <dalej@apple.com>

Thu, 17 May 2007 21:31:21 +0000 (21:31 +0000)

committer Dale Johannesen <dalej@apple.com>

Thu, 17 May 2007 21:31:21 +0000 (21:31 +0000)
author Dale Johannesen <dalej@apple.com>
Thu, 17 May 2007 21:31:21 +0000 (21:31 +0000)
committer Dale Johannesen <dalej@apple.com>
Thu, 17 May 2007 21:31:21 +0000 (21:31 +0000)
diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp

index d059db1acfe64453e26eaedf4cb88c726911c70b..d30a288d1988359d9cf1da701c4aa7d578fb8b05 100644 (file)
--- a/lib/Target/ARM/ARMISelLowering.cpp
+++ b/lib/Target/ARM/ARMISelLowering.cpp
@@ -187,7 +187,7 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
  
    // Expand mem operations genericly.
    setOperationAction(ISD::MEMSET          , MVT::Other, Expand);
-  setOperationAction(ISD::MEMCPY          , MVT::Other, Expand);
+  setOperationAction(ISD::MEMCPY          , MVT::Other, Custom);
    setOperationAction(ISD::MEMMOVE         , MVT::Other, Expand);
    
    // Use the default implementation.
@@ -255,6 +255,8 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
  
    setSchedulingPreference(SchedulingForRegPressure);
    computeRegisterProperties();
+
+  maxStoresPerMemcpy = 1;   //// temporary - rewrite interface to use type
  }
  
  
@@ -1328,6 +1330,78 @@ static SDOperand LowerSRx(SDOperand Op, SelectionDAG &DAG,
    return DAG.getNode(ISD::BUILD_PAIR, MVT::i64, Lo, Hi);
  }
  
+SDOperand ARMTargetLowering::LowerMEMCPY(SDOperand Op, SelectionDAG &DAG) {
+  SDOperand Chain = Op.getOperand(0);
+  SDOperand Dest = Op.getOperand(1);
+  SDOperand Src = Op.getOperand(2);
+  SDOperand Count = Op.getOperand(3);
+  unsigned Align =
+    (unsigned)cast<ConstantSDNode>(Op.getOperand(4))->getValue();
+  if (Align == 0) Align = 1;
+
+  ConstantSDNode *I = dyn_cast<ConstantSDNode>(Count);
+  // Just call memcpy if:
+  // not 4-byte aligned
+  // size is unknown
+  // size is >= the threshold.
+  if ((Align & 3) != 0 || 
+       !I ||
+       I->getValue() >= 64 ||
+       (I->getValue() & 3) != 0) {
+    MVT::ValueType IntPtr = getPointerTy();
+    TargetLowering::ArgListTy Args;
+    TargetLowering::ArgListEntry Entry;
+    Entry.Ty = getTargetData()->getIntPtrType();
+    Entry.Node = Op.getOperand(1); Args.push_back(Entry);
+    Entry.Node = Op.getOperand(2); Args.push_back(Entry);
+    Entry.Node = Op.getOperand(3); Args.push_back(Entry);
+    std::pair<SDOperand,SDOperand> CallResult =
+      LowerCallTo(Chain, Type::VoidTy, false, false, CallingConv::C, false,
+                  DAG.getExternalSymbol("memcpy", IntPtr), Args, DAG);
+    return CallResult.second;
+  }
+
+  // Otherwise do repeated 4-byte loads and stores.  To be improved.
+  assert((I->getValue() & 3) == 0);
+  assert((Align & 3) == 0);
+  unsigned NumMemOps = I->getValue() >> 2;
+  unsigned EmittedNumMemOps = 0;
+  unsigned SrcOff = 0, DstOff = 0;
+  MVT::ValueType VT = MVT::i32;
+  unsigned VTSize = 4;
+  const int MAX_LOADS_IN_LDM = 6;
+  SDOperand LoadChains[MAX_LOADS_IN_LDM];
+  SDOperand Loads[MAX_LOADS_IN_LDM];
+
+  // Emit up to 4 loads, then a TokenFactor barrier, then the same
+  // number of stores.  The loads and stores will get combined into
+  // ldm/stm later on.
+  while(EmittedNumMemOps < NumMemOps) {
+    unsigned int i;
+    for (i=0; i<MAX_LOADS_IN_LDM && EmittedNumMemOps+i < NumMemOps; i++) {
+      Loads[i] = DAG.getLoad(VT, Chain,
+                             DAG.getNode(ISD::ADD, VT, Src, 
+                                         DAG.getConstant(SrcOff, VT)),
+                             NULL, 0);
+      LoadChains[i] = Loads[i].getValue(1);
+      SrcOff += VTSize;
+    }
+
+    Chain = DAG.getNode(ISD::TokenFactor, MVT::Other, &LoadChains[0], i);
+
+    for (i=0; i<MAX_LOADS_IN_LDM && EmittedNumMemOps+i < NumMemOps; i++) {
+      Chain = DAG.getStore(Chain, Loads[i],
+                           DAG.getNode(ISD::ADD, VT, Dest, 
+                                       DAG.getConstant(DstOff, VT)),
+                           NULL, 0);
+      DstOff += VTSize;
+    }
+    EmittedNumMemOps += i;
+  }
+
+  return Chain;
+}
+
  SDOperand ARMTargetLowering::LowerOperation(SDOperand Op, SelectionDAG &DAG) {
    switch (Op.getOpcode()) {
    default: assert(0 && "Don't know how to custom lower this!"); abort();
@@ -1358,6 +1432,7 @@ SDOperand ARMTargetLowering::LowerOperation(SDOperand Op, SelectionDAG &DAG) {
    case ISD::RETURNADDR:    break;
    case ISD::FRAMEADDR:     break;
    case ISD::GLOBAL_OFFSET_TABLE: return LowerGLOBAL_OFFSET_TABLE(Op, DAG);
+  case ISD::MEMCPY:        return LowerMEMCPY(Op, DAG);
    }
    return SDOperand();
  }
diff --git a/lib/Target/ARM/ARMISelLowering.h b/lib/Target/ARM/ARMISelLowering.h

index 8e9ef889894bfbbdbca30601ab211bb1b59525a9..b0c5e8948945010c32f5d53a1bed4ca3c6961fea 100644 (file)
--- a/lib/Target/ARM/ARMISelLowering.h
+++ b/lib/Target/ARM/ARMISelLowering.h
@@ -135,6 +135,7 @@ namespace llvm {
      SDOperand LowerGLOBAL_OFFSET_TABLE(SDOperand Op, SelectionDAG &DAG);
      SDOperand LowerFORMAL_ARGUMENTS(SDOperand Op, SelectionDAG &DAG);
      SDOperand LowerBR_JT(SDOperand Op, SelectionDAG &DAG);
+    SDOperand LowerMEMCPY(SDOperand Op, SelectionDAG &DAG);
    };
  }
author	Dale Johannesen <dalej@apple.com>
	Thu, 17 May 2007 21:31:21 +0000 (21:31 +0000)
committer	Dale Johannesen <dalej@apple.com>
	Thu, 17 May 2007 21:31:21 +0000 (21:31 +0000)
lib/Target/ARM/ARMISelLowering.cpp		patch \| blob \| history
lib/Target/ARM/ARMISelLowering.h		patch \| blob \| history