const APFloat& Val) {
assert(MVT::isFloatingPoint(VT) && "Can only convert between FP types");
- // Anything can be extended to ppc long double.
- if (VT == MVT::ppcf128)
- return true;
-
- // PPC long double cannot be shrunk to anything though.
- if (&Val.getSemantics() == &APFloat::PPCDoubleDouble)
+ // PPC long double cannot be converted to or from any other type.
+ if (VT == MVT::ppcf128 ||
+ &Val.getSemantics() == &APFloat::PPCDoubleDouble)
return false;
// APFloat::convert modifies in place, so make a copy.
switch (Result) {
default: break;
case ISD::SETUO : Result = ISD::SETFALSE; break; // SETUGT & SETULT
+ case ISD::SETOEQ: // SETEQ & SETU[LG]E
case ISD::SETUEQ: Result = ISD::SETEQ ; break; // SETUGE & SETULE
case ISD::SETOLT: Result = ISD::SETULT ; break; // SETULT & SETNE
case ISD::SETOGT: Result = ISD::SETUGT ; break; // SETUGT & SETNE
/// AddNodeIDValueTypes - Value type lists are intern'd so we can represent them
/// solely with their pointer.
-void AddNodeIDValueTypes(FoldingSetNodeID &ID, SDVTList VTList) {
+static void AddNodeIDValueTypes(FoldingSetNodeID &ID, SDVTList VTList) {
ID.AddPointer(VTList.VTs);
}
static void AddNodeIDNode(FoldingSetNodeID &ID,
unsigned short OpC, SDVTList VTList,
- const SDOperand *OpList, unsigned N) {
+ SDOperandPtr OpList, unsigned N) {
AddNodeIDOpcode(ID, OpC);
AddNodeIDValueTypes(ID, VTList);
AddNodeIDOperands(ID, OpList, N);
KnownZero = KnownZeroOut;
return;
}
+ case ISD::MUL: {
+ APInt Mask2 = APInt::getAllOnesValue(BitWidth);
+ ComputeMaskedBits(Op.getOperand(1), Mask2, KnownZero, KnownOne, Depth+1);
+ ComputeMaskedBits(Op.getOperand(0), Mask2, KnownZero2, KnownOne2, Depth+1);
+ assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?");
+ assert((KnownZero2 & KnownOne2) == 0 && "Bits known to be one AND zero?");
+
+ // If low bits are zero in either operand, output low known-0 bits.
+ // Also compute a conservative estimate for high known-0 bits.
+ // More trickiness is possible, but this is sufficient for the
+ // interesting case of alignment computation.
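+ // E.g. a multiple of 8 (three trailing zero bits) times a multiple of 4
+ // (two trailing zero bits) is a multiple of 32, so at least five trailing
+ // bits of the product are known zero.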
+ KnownOne.clear();
+ unsigned TrailZ = KnownZero.countTrailingOnes() +
+ KnownZero2.countTrailingOnes();
+ unsigned LeadZ = std::max(KnownZero.countLeadingOnes() +
+ KnownZero2.countLeadingOnes(),
+ BitWidth) - BitWidth;
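+ // The leading estimate follows from an m-bit value times an n-bit value
+ // fitting in m+n bits: the product has at least
+ // (LHS leading zeros) + (RHS leading zeros) - BitWidth leading zeros,
+ // clamped at zero.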
+
+ TrailZ = std::min(TrailZ, BitWidth);
+ LeadZ = std::min(LeadZ, BitWidth);
+ KnownZero = APInt::getLowBitsSet(BitWidth, TrailZ) |
+ APInt::getHighBitsSet(BitWidth, LeadZ);
+ KnownZero &= Mask;
+ return;
+ }
+ case ISD::UDIV: {
+ // For the purposes of computing leading zeros we can conservatively
+ // treat a udiv as a logical right shift by the power of 2 known to
+ // be no greater than the denominator.
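+ // E.g. dividing by a value known to be at least 16 gives the result at
+ // least four more leading zero bits than the numerator has.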
+ APInt AllOnes = APInt::getAllOnesValue(BitWidth);
+ ComputeMaskedBits(Op.getOperand(0),
+ AllOnes, KnownZero2, KnownOne2, Depth+1);
+ unsigned LeadZ = KnownZero2.countLeadingOnes();
+
+ KnownOne2.clear();
+ KnownZero2.clear();
+ ComputeMaskedBits(Op.getOperand(1),
+ AllOnes, KnownZero2, KnownOne2, Depth+1);
+ unsigned RHSUnknownLeadingOnes = KnownOne2.countLeadingZeros();
+ if (RHSUnknownLeadingOnes != BitWidth)
+ LeadZ = std::min(BitWidth,
+ LeadZ + BitWidth - RHSUnknownLeadingOnes - 1);
+
+ KnownZero = APInt::getHighBitsSet(BitWidth, LeadZ) & Mask;
+ return;
+ }
case ISD::SELECT:
ComputeMaskedBits(Op.getOperand(2), Mask, KnownZero, KnownOne, Depth+1);
ComputeMaskedBits(Op.getOperand(1), Mask, KnownZero2, KnownOne2, Depth+1);
KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - 1);
return;
+ case ISD::SUB: {
+ if (ConstantSDNode *CLHS = dyn_cast<ConstantSDNode>(Op.getOperand(0))) {
+ // We know that the top bits of C-X are clear if X contains fewer bits
+ // than C (i.e. no wrap-around can happen). For example, 20-X is
+ // positive if we can prove that X is >= 0 and < 16.
+ if (CLHS->getAPIntValue().isNonNegative()) {
+ unsigned NLZ = (CLHS->getAPIntValue()+1).countLeadingZeros();
+ // NLZ can't be BitWidth here: that would require C+1 == 0, i.e. C == -1,
+ // which contradicts isNonNegative() above.
+ APInt MaskV = APInt::getHighBitsSet(BitWidth, NLZ+1);
+ ComputeMaskedBits(Op.getOperand(1), MaskV, KnownZero2, KnownOne2,
+ Depth+1);
+
+ // If all of the MaskV bits are known to be zero, then we know the
+ // output top bits are zero, because we now know that the output is in
+ // the range [0, C].
+ if ((KnownZero2 & MaskV) == MaskV) {
+ unsigned NLZ2 = CLHS->getAPIntValue().countLeadingZeros();
+ // Top bits known zero.
+ KnownZero = APInt::getHighBitsSet(BitWidth, NLZ2) & Mask;
+ }
+ }
+ }
+ }
+ // fall through
case ISD::ADD: {
- // If either the LHS or the RHS are Zero, the result is zero.
- ComputeMaskedBits(Op.getOperand(1), Mask, KnownZero, KnownOne, Depth+1);
- ComputeMaskedBits(Op.getOperand(0), Mask, KnownZero2, KnownOne2, Depth+1);
- assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?");
- assert((KnownZero2 & KnownOne2) == 0 && "Bits known to be one AND zero?");
-
// Output low known-0 bits: the sum has at least as many trailing zero bits
// as both operands have in common. For example, 8+(X<<3) is known to have
// the low 3 bits clear.
- unsigned KnownZeroOut = std::min(KnownZero.countTrailingOnes(),
- KnownZero2.countTrailingOnes());
-
- KnownZero = APInt::getLowBitsSet(BitWidth, KnownZeroOut);
- KnownOne = APInt(BitWidth, 0);
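+ // Carries in an addition propagate only upward, so the low bits of the
+ // sum depend only on the low bits of the operands; narrow the mask to
+ // the contiguous low bits that are demanded.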
+ APInt Mask2 = APInt::getLowBitsSet(BitWidth, Mask.countTrailingOnes());
+ ComputeMaskedBits(Op.getOperand(0), Mask2, KnownZero2, KnownOne2, Depth+1);
+ assert((KnownZero2 & KnownOne2) == 0 && "Bits known to be one AND zero?");
+ unsigned KnownZeroOut = KnownZero2.countTrailingOnes();
+
+ ComputeMaskedBits(Op.getOperand(1), Mask2, KnownZero2, KnownOne2, Depth+1);
+ assert((KnownZero2 & KnownOne2) == 0 && "Bits known to be one AND zero?");
+ KnownZeroOut = std::min(KnownZeroOut,
+ KnownZero2.countTrailingOnes());
+
+ KnownZero |= APInt::getLowBitsSet(BitWidth, KnownZeroOut);
return;
}
- case ISD::SUB: {
- ConstantSDNode *CLHS = dyn_cast<ConstantSDNode>(Op.getOperand(0));
- if (!CLHS) return;
-
- // We know that the top bits of C-X are clear if X contains less bits
- // than C (i.e. no wrap-around can happen). For example, 20-X is
- // positive if we can prove that X is >= 0 and < 16.
- if (CLHS->getAPIntValue().isNonNegative()) {
- unsigned NLZ = (CLHS->getAPIntValue()+1).countLeadingZeros();
- // NLZ can't be BitWidth with no sign bit
- APInt MaskV = APInt::getHighBitsSet(BitWidth, NLZ+1);
- ComputeMaskedBits(Op.getOperand(1), MaskV, KnownZero, KnownOne, Depth+1);
-
- // If all of the MaskV bits are known to be zero, then we know the output
- // top bits are zero, because we now know that the output is from [0-C].
- if ((KnownZero & MaskV) == MaskV) {
- unsigned NLZ2 = CLHS->getAPIntValue().countLeadingZeros();
- // Top bits known zero.
- KnownZero = APInt::getHighBitsSet(BitWidth, NLZ2) & Mask;
- KnownOne = APInt(BitWidth, 0); // No one bits known.
- } else {
- KnownZero = KnownOne = APInt(BitWidth, 0); // Otherwise, nothing known.
+ case ISD::SREM:
+ if (ConstantSDNode *Rem = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
+ APInt RA = Rem->getAPIntValue();
+ if (RA.isPowerOf2() || (-RA).isPowerOf2()) {
+ APInt LowBits = RA.isStrictlyPositive() ? (RA - 1) : ~RA;
+ APInt Mask2 = LowBits | APInt::getSignBit(BitWidth);
+ ComputeMaskedBits(Op.getOperand(0), Mask2, KnownZero2, KnownOne2, Depth+1);
+
+ // The sign of a remainder matches the sign of the first operand, except
+ // that a zero remainder is always positive, so the dividend's sign bit
+ // only carries over when the remainder is known to be nonzero.
+ if (KnownZero2[BitWidth-1] || ((KnownZero2 & LowBits) == LowBits)) {
+ // Non-negative dividend, or a remainder known to be zero.
+ KnownZero2 |= ~LowBits;
+ KnownOne2 &= LowBits;
+ } else if (KnownOne2[BitWidth-1] && (KnownOne2 & LowBits) != 0) {
+ // Negative dividend with a provably nonzero remainder.
+ KnownOne2 |= ~LowBits;
+ } else {
+ // The remainder may be zero; make no claim about the sign bit.
+ KnownOne2 &= LowBits;
+ }
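+ // E.g. X srem 8 lies in (-8, 8): a known-non-negative X pins the result
+ // to [0, 8), so every bit above the low three is known zero.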
+
+ KnownZero |= KnownZero2 & Mask;
+ KnownOne |= KnownOne2 & Mask;
+
+ assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?");
}
}
return;
+ case ISD::UREM: {
+ if (ConstantSDNode *Rem = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
+ APInt RA = Rem->getAPIntValue();
+ if (RA.isPowerOf2()) {
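+ // X urem 2^k keeps just the low k bits of X; e.g. X urem 8 keeps bits
+ // [2:0] and all higher bits are known zero.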
+ APInt LowBits = (RA - 1);
+ APInt Mask2 = LowBits & Mask;
+ KnownZero |= ~LowBits & Mask;
+ ComputeMaskedBits(Op.getOperand(0), Mask2, KnownZero, KnownOne, Depth+1);
+ assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?");
+ break;
+ }
+ }
+
+ // Since the result is less than or equal to either operand, any leading
+ // zero bits in either operand must also exist in the result.
+ APInt AllOnes = APInt::getAllOnesValue(BitWidth);
+ ComputeMaskedBits(Op.getOperand(0), AllOnes, KnownZero, KnownOne,
+ Depth+1);
+ ComputeMaskedBits(Op.getOperand(1), AllOnes, KnownZero2, KnownOne2,
+ Depth+1);
+
+ uint32_t Leaders = std::max(KnownZero.countLeadingOnes(),
+ KnownZero2.countLeadingOnes());
+ KnownOne.clear();
+ KnownZero = APInt::getHighBitsSet(BitWidth, Leaders) & Mask;
+ return;
}
default:
// Allow the target to implement this method for its nodes.
assert(MVT::isInteger(VT) && "Invalid VT!");
unsigned VTBits = MVT::getSizeInBits(VT);
unsigned Tmp, Tmp2;
+ unsigned FirstAnswer = 1;
if (Depth == 6)
return 1; // Limit search depth.
case ISD::AND:
case ISD::OR:
case ISD::XOR: // NOT is handled here.
- // Logical binary ops preserve the number of sign bits.
+ // At worst, logical binary ops preserve the smaller of the operands'
+ // sign-bit counts; the generic code below may be able to prove more.
Tmp = ComputeNumSignBits(Op.getOperand(0), Depth+1);
- if (Tmp == 1) return 1; // Early out.
- Tmp2 = ComputeNumSignBits(Op.getOperand(1), Depth+1);
- return std::min(Tmp, Tmp2);
+ if (Tmp != 1) {
+ Tmp2 = ComputeNumSignBits(Op.getOperand(1), Depth+1);
+ FirstAnswer = std::min(Tmp, Tmp2);
+ // We computed what we know about the sign bits as our first
+ // answer. Now proceed to the generic code that uses
+ // ComputeMaskedBits, and pick whichever answer is better.
+ }
+ break;
case ISD::SELECT:
- Tmp = ComputeNumSignBits(Op.getOperand(0), Depth+1);
+ Tmp = ComputeNumSignBits(Op.getOperand(1), Depth+1);
if (Tmp == 1) return 1; // Early out.
- Tmp2 = ComputeNumSignBits(Op.getOperand(1), Depth+1);
+ Tmp2 = ComputeNumSignBits(Op.getOperand(2), Depth+1);
return std::min(Tmp, Tmp2);
case ISD::SETCC:
Op.getOpcode() == ISD::INTRINSIC_W_CHAIN ||
Op.getOpcode() == ISD::INTRINSIC_VOID) {
unsigned NumBits = TLI.ComputeNumSignBitsForTargetNode(Op, Depth);
- if (NumBits > 1) return NumBits;
+ if (NumBits > 1) FirstAnswer = std::max(FirstAnswer, NumBits);
}
// Finally, if we can prove that the top bits of the result are 0's or 1's,
Mask = KnownOne;
} else {
// Nothing known.
- return 1;
+ return FirstAnswer;
}
// Okay, we know that the sign bit in Mask is set. Use CLZ to determine
Mask <<= Mask.getBitWidth()-VTBits;
// Return # leading zeros. We use 'min' here in case Val was zero before
// shifting. We don't want to return '64' for an i32 value of 0.
- return std::min(VTBits, Mask.countLeadingZeros());
+ return std::max(FirstAnswer, std::min(VTBits, Mask.countLeadingZeros()));
}
}
+/// getShuffleScalarElt - Returns the scalar element that will make up the ith
+/// element of the result of the vector shuffle.
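+/// For example, with two 4-element source vectors, a shuffle mask element
+/// of 5 selects element 1 of the second source operand.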
+SDOperand SelectionDAG::getShuffleScalarElt(const SDNode *N, unsigned Idx) {
+ MVT::ValueType VT = N->getValueType(0);
+ SDOperand PermMask = N->getOperand(2);
+ unsigned NumElems = PermMask.getNumOperands();
+ SDOperand V = (Idx < NumElems) ? N->getOperand(0) : N->getOperand(1);
+ Idx %= NumElems;
+
+ if (V.getOpcode() == ISD::BIT_CONVERT) {
+ V = V.getOperand(0);
+ if (MVT::getVectorNumElements(V.getValueType()) != NumElems)
+ return SDOperand();
+ }
+ if (V.getOpcode() == ISD::SCALAR_TO_VECTOR)
+ return (Idx == 0) ? V.getOperand(0)
+ : getNode(ISD::UNDEF, MVT::getVectorElementType(VT));
+ if (V.getOpcode() == ISD::BUILD_VECTOR)
+ return V.getOperand(Idx);
+ if (V.getOpcode() == ISD::VECTOR_SHUFFLE) {
+ SDOperand Elt = PermMask.getOperand(Idx);
+ if (Elt.getOpcode() == ISD::UNDEF)
+ return getNode(ISD::UNDEF, MVT::getVectorElementType(VT));
+ return getShuffleScalarElt(V.Val, cast<ConstantSDNode>(Elt)->getValue());
+ }
+ return SDOperand();
+}
+
+
/// getNode - Gets or creates the specified node.
///
SDOperand SelectionDAG::getNode(unsigned Opcode, MVT::ValueType VT) {
/// operand.
static SDOperand getMemsetValue(SDOperand Value, MVT::ValueType VT,
SelectionDAG &DAG) {
- MVT::ValueType CurVT = VT;
+ unsigned NumBits = MVT::isVector(VT) ?
+ MVT::getSizeInBits(MVT::getVectorElementType(VT)) : MVT::getSizeInBits(VT);
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Value)) {
- uint64_t Val = C->getValue() & 255;
+ APInt Val = APInt(NumBits, C->getValue() & 255);
unsigned Shift = 8;
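+ // Replicate the low byte across the value by repeated doubling, e.g.
+ // 0xAB -> 0xABAB -> 0xABABABAB for a 32-bit value.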
- while (CurVT != MVT::i8) {
+ for (unsigned i = NumBits; i > 8; i >>= 1) {
Val = (Val << Shift) | Val;
Shift <<= 1;
- CurVT = (MVT::ValueType)((unsigned)CurVT - 1);
- }
- return DAG.getConstant(Val, VT);
- } else {
- Value = DAG.getNode(ISD::ZERO_EXTEND, VT, Value);
- unsigned Shift = 8;
- while (CurVT != MVT::i8) {
- Value =
- DAG.getNode(ISD::OR, VT,
- DAG.getNode(ISD::SHL, VT, Value,
- DAG.getConstant(Shift, MVT::i8)), Value);
- Shift <<= 1;
- CurVT = (MVT::ValueType)((unsigned)CurVT - 1);
}
+ if (MVT::isInteger(VT))
+ return DAG.getConstant(Val, VT);
+ return DAG.getConstantFP(APFloat(Val), VT);
+ }
- return Value;
+ Value = DAG.getNode(ISD::ZERO_EXTEND, VT, Value);
+ unsigned Shift = 8;
+ for (unsigned i = NumBits; i > 8; i >>= 1) {
+ Value = DAG.getNode(ISD::OR, VT,
+ DAG.getNode(ISD::SHL, VT, Value,
+ DAG.getConstant(Shift, MVT::i8)), Value);
+ Shift <<= 1;
}
+
+ return Value;
}
/// getMemsetStringVal - Similar to getMemsetValue, except this is used only
/// when a memcpy is turned into a memset because the source is a constant
/// string pointer.
-static SDOperand getMemsetStringVal(MVT::ValueType VT,
- SelectionDAG &DAG,
+static SDOperand getMemsetStringVal(MVT::ValueType VT, SelectionDAG &DAG,
const TargetLowering &TLI,
std::string &Str, unsigned Offset) {
+ assert(!MVT::isVector(VT) && "Can't handle vector type here!");
+ unsigned NumBits = MVT::getSizeInBits(VT);
+ unsigned MSB = NumBits / 8;
uint64_t Val = 0;
- unsigned MSB = MVT::getSizeInBits(VT) / 8;
if (TLI.isLittleEndian())
Offset = Offset + MSB - 1;
for (unsigned i = 0; i != MSB; ++i) {
}
/// getMemBasePlusOffset - Returns base and offset node for the
+/// given base pointer and byte offset, combined into a single ADD node.
static SDOperand getMemBasePlusOffset(SDOperand Base, unsigned Offset,
SelectionDAG &DAG) {
MVT::ValueType VT = Base.getValueType();
return DAG.getNode(ISD::ADD, VT, Base, DAG.getConstant(Offset, VT));
}
+/// isMemSrcFromString - Returns true if memcpy source is a string constant.
+///
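+/// Recognizes a plain GlobalAddress, or a GlobalAddress plus a constant
+/// offset (e.g. a source such as str+4 into a constant string); the
+/// constant offset is folded into SrcOff.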
+static bool isMemSrcFromString(SDOperand Src, std::string &Str,
+ uint64_t &SrcOff) {
+ unsigned SrcDelta = 0;
+ GlobalAddressSDNode *G = NULL;
+ if (Src.getOpcode() == ISD::GlobalAddress)
+ G = cast<GlobalAddressSDNode>(Src);
+ else if (Src.getOpcode() == ISD::ADD &&
+ Src.getOperand(0).getOpcode() == ISD::GlobalAddress &&
+ Src.getOperand(1).getOpcode() == ISD::Constant) {
+ G = cast<GlobalAddressSDNode>(Src.getOperand(0));
+ SrcDelta = cast<ConstantSDNode>(Src.getOperand(1))->getValue();
+ }
+ if (!G)
+ return false;
+
+ GlobalVariable *GV = dyn_cast<GlobalVariable>(G->getGlobal());
+ if (GV && GV->isConstant()) {
+ Str = GV->getStringValue(false);
+ if (!Str.empty()) {
+ SrcOff += SrcDelta;
+ return true;
+ }
+ }
+
+ return false;
+}
+
/// MeetsMaxMemopRequirement - Determines if the number of memory ops required
/// to replace the memset / memcpy is below the threshold. It also returns the
/// types of the sequence of memory ops to perform memset / memcpy.
-static bool MeetsMaxMemopRequirement(std::vector<MVT::ValueType> &MemOps,
- unsigned Limit, uint64_t Size,
- unsigned Align,
- const TargetLowering &TLI) {
- MVT::ValueType VT;
-
- if (TLI.allowsUnalignedMemoryAccesses()) {
- VT = MVT::i64;
- } else {
- switch (Align & 7) {
- case 0:
- VT = MVT::i64;
- break;
- case 4:
- VT = MVT::i32;
- break;
- case 2:
- VT = MVT::i16;
- break;
- default:
- VT = MVT::i8;
- break;
+static
+bool MeetsMaxMemopRequirement(std::vector<MVT::ValueType> &MemOps,
+ SDOperand Dst, SDOperand Src,
+ unsigned Limit, uint64_t Size, unsigned &Align,
+ SelectionDAG &DAG,
+ const TargetLowering &TLI) {
+ bool AllowUnalign = TLI.allowsUnalignedMemoryAccesses();
+
+ std::string Str;
+ uint64_t SrcOff = 0;
+ bool isSrcStr = isMemSrcFromString(Src, Str, SrcOff);
+ bool isSrcConst = isa<ConstantSDNode>(Src);
+ MVT::ValueType VT = TLI.getOptimalMemOpType(Size, Align, isSrcConst, isSrcStr);
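+ // A result of MVT::iAny means the target has no preference; the code
+ // below then picks a type from the known alignment. VT may also be reset
+ // to MVT::iAny further down when the preferred type would require more
+ // destination alignment than is available.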
+ if (VT != MVT::iAny) {
+ unsigned NewAlign = (unsigned)
+ TLI.getTargetData()->getABITypeAlignment(MVT::getTypeForValueType(VT));
+ // If source is a string constant, this will require an unaligned load.
+ if (NewAlign > Align && (isSrcConst || AllowUnalign)) {
+ if (Dst.getOpcode() != ISD::FrameIndex) {
+ // Can't change destination alignment. It requires an unaligned store.
+ if (AllowUnalign)
+ VT = MVT::iAny;
+ } else {
+ int FI = cast<FrameIndexSDNode>(Dst)->getIndex();
+ MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
+ if (MFI->isFixedObjectIndex(FI)) {
+ // Can't change destination alignment. It requires an unaligned store.
+ if (AllowUnalign)
+ VT = MVT::iAny;
+ } else {
+ // Give the stack frame object a larger alignment.
+ MFI->setObjectAlignment(FI, NewAlign);
+ Align = NewAlign;
+ }
+ }
}
}
- MVT::ValueType LVT = MVT::i64;
- while (!TLI.isTypeLegal(LVT))
- LVT = (MVT::ValueType)((unsigned)LVT - 1);
- assert(MVT::isInteger(LVT));
+ if (VT == MVT::iAny) {
+ if (AllowUnalign) {
+ VT = MVT::i64;
+ } else {
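+ // Pick the widest integer type the alignment guarantees: Align & 7 is
+ // the alignment mod 8, so 0 permits i64, 4 permits i32, 2 permits i16.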
+ switch (Align & 7) {
+ case 0: VT = MVT::i64; break;
+ case 4: VT = MVT::i32; break;
+ case 2: VT = MVT::i16; break;
+ default: VT = MVT::i8; break;
+ }
+ }
- if (VT > LVT)
- VT = LVT;
+ MVT::ValueType LVT = MVT::i64;
+ while (!TLI.isTypeLegal(LVT))
+ LVT = (MVT::ValueType)((unsigned)LVT - 1);
+ assert(MVT::isInteger(LVT));
+
+ if (VT > LVT)
+ VT = LVT;
+ }
unsigned NumMemOps = 0;
while (Size != 0) {
unsigned VTSize = MVT::getSizeInBits(VT) / 8;
while (VTSize > Size) {
- VT = (MVT::ValueType)((unsigned)VT - 1);
- VTSize >>= 1;
+ // For now, only use non-vector loads / stores for the left-over pieces.
+ if (MVT::isVector(VT)) {
+ VT = MVT::i64;
+ while (!TLI.isTypeLegal(VT))
+ VT = (MVT::ValueType)((unsigned)VT - 1);
+ VTSize = MVT::getSizeInBits(VT) / 8;
+ } else {
+ VT = (MVT::ValueType)((unsigned)VT - 1);
+ VTSize >>= 1;
+ }
}
- assert(MVT::isInteger(VT));
if (++NumMemOps > Limit)
return false;
static SDOperand getMemcpyLoadsAndStores(SelectionDAG &DAG,
SDOperand Chain, SDOperand Dst,
SDOperand Src, uint64_t Size,
- unsigned Align,
- bool AlwaysInline,
- const Value *DstSV, uint64_t DstOff,
- const Value *SrcSV, uint64_t SrcOff) {
+ unsigned Align, bool AlwaysInline,
+ const Value *DstSV, uint64_t DstSVOff,
+ const Value *SrcSV, uint64_t SrcSVOff){
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- // Expand memcpy to a series of store ops if the size operand falls below
- // a certain threshold.
+ // Expand memcpy to a series of load and store ops if the size operand falls
+ // below a certain threshold.
std::vector<MVT::ValueType> MemOps;
uint64_t Limit = -1;
if (!AlwaysInline)
Limit = TLI.getMaxStoresPerMemcpy();
- if (!MeetsMaxMemopRequirement(MemOps, Limit, Size, Align, TLI))
+ unsigned DstAlign = Align; // Destination alignment can change.
+ if (!MeetsMaxMemopRequirement(MemOps, Dst, Src, Limit, Size, DstAlign,
+ DAG, TLI))
return SDOperand();
- SmallVector<SDOperand, 8> OutChains;
-
- unsigned NumMemOps = MemOps.size();
- unsigned SrcDelta = 0;
- GlobalAddressSDNode *G = NULL;
std::string Str;
- bool CopyFromStr = false;
-
- if (Src.getOpcode() == ISD::GlobalAddress)
- G = cast<GlobalAddressSDNode>(Src);
- else if (Src.getOpcode() == ISD::ADD &&
- Src.getOperand(0).getOpcode() == ISD::GlobalAddress &&
- Src.getOperand(1).getOpcode() == ISD::Constant) {
- G = cast<GlobalAddressSDNode>(Src.getOperand(0));
- SrcDelta = cast<ConstantSDNode>(Src.getOperand(1))->getValue();
- }
- if (G) {
- GlobalVariable *GV = dyn_cast<GlobalVariable>(G->getGlobal());
- if (GV && GV->isConstant()) {
- Str = GV->getStringValue(false);
- if (!Str.empty()) {
- CopyFromStr = true;
- SrcOff += SrcDelta;
- }
- }
- }
+ uint64_t SrcOff = 0, DstOff = 0;
+ bool CopyFromStr = isMemSrcFromString(Src, Str, SrcOff);
+ SmallVector<SDOperand, 8> OutChains;
+ unsigned NumMemOps = MemOps.size();
for (unsigned i = 0; i < NumMemOps; i++) {
MVT::ValueType VT = MemOps[i];
unsigned VTSize = MVT::getSizeInBits(VT) / 8;
SDOperand Value, Store;
- if (CopyFromStr) {
+ if (CopyFromStr && !MVT::isVector(VT)) {
+ // It's unlikely a store of a vector immediate can be done in a single
+ // instruction. It would require a load from a constant pool first.
+ // FIXME: Handle cases where store of vector immediate is done in a
+ // single instruction.
Value = getMemsetStringVal(VT, DAG, TLI, Str, SrcOff);
- Store =
- DAG.getStore(Chain, Value,
- getMemBasePlusOffset(Dst, DstOff, DAG),
- DstSV, DstOff);
+ Store = DAG.getStore(Chain, Value,
+ getMemBasePlusOffset(Dst, DstOff, DAG),
+ DstSV, DstSVOff + DstOff);
} else {
Value = DAG.getLoad(VT, Chain,
getMemBasePlusOffset(Src, SrcOff, DAG),
- SrcSV, SrcOff, false, Align);
- Store =
- DAG.getStore(Chain, Value,
- getMemBasePlusOffset(Dst, DstOff, DAG),
- DstSV, DstOff, false, Align);
+ SrcSV, SrcSVOff + SrcOff, false, Align);
+ Store = DAG.getStore(Chain, Value,
+ getMemBasePlusOffset(Dst, DstOff, DAG),
+ DstSV, DstSVOff + DstOff, false, DstAlign);
}
OutChains.push_back(Store);
SrcOff += VTSize;
&OutChains[0], OutChains.size());
}
+static SDOperand getMemmoveLoadsAndStores(SelectionDAG &DAG,
+ SDOperand Chain, SDOperand Dst,
+ SDOperand Src, uint64_t Size,
+ unsigned Align, bool AlwaysInline,
+ const Value *DstSV, uint64_t DstSVOff,
+ const Value *SrcSV, uint64_t SrcSVOff){
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+
+ // Expand memmove to a series of load and store ops if the size operand falls
+ // below a certain threshold.
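+ // All of the load operations are emitted before any of the stores, so
+ // that an overlapping source and destination behave as memmove requires.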
+ std::vector<MVT::ValueType> MemOps;
+ uint64_t Limit = -1;
+ if (!AlwaysInline)
+ Limit = TLI.getMaxStoresPerMemmove();
+ unsigned DstAlign = Align; // Destination alignment can change.
+ if (!MeetsMaxMemopRequirement(MemOps, Dst, Src, Limit, Size, DstAlign,
+ DAG, TLI))
+ return SDOperand();
+
+ uint64_t SrcOff = 0, DstOff = 0;
+
+ SmallVector<SDOperand, 8> LoadValues;
+ SmallVector<SDOperand, 8> LoadChains;
+ SmallVector<SDOperand, 8> OutChains;
+ unsigned NumMemOps = MemOps.size();
+ for (unsigned i = 0; i < NumMemOps; i++) {
+ MVT::ValueType VT = MemOps[i];
+ unsigned VTSize = MVT::getSizeInBits(VT) / 8;
+ SDOperand Value;
+
+ Value = DAG.getLoad(VT, Chain,
+ getMemBasePlusOffset(Src, SrcOff, DAG),
+ SrcSV, SrcSVOff + SrcOff, false, Align);
+ LoadValues.push_back(Value);
+ LoadChains.push_back(Value.getValue(1));
+ SrcOff += VTSize;
+ }
+ Chain = DAG.getNode(ISD::TokenFactor, MVT::Other,
+ &LoadChains[0], LoadChains.size());
+ for (unsigned i = 0; i < NumMemOps; i++) {
+ MVT::ValueType VT = MemOps[i];
+ unsigned VTSize = MVT::getSizeInBits(VT) / 8;
+ SDOperand Store;
+
+ Store = DAG.getStore(Chain, LoadValues[i],
+ getMemBasePlusOffset(Dst, DstOff, DAG),
+ DstSV, DstSVOff + DstOff, false, DstAlign);
+ OutChains.push_back(Store);
+ DstOff += VTSize;
+ }
+
+ return DAG.getNode(ISD::TokenFactor, MVT::Other,
+ &OutChains[0], OutChains.size());
+}
+
static SDOperand getMemsetStores(SelectionDAG &DAG,
SDOperand Chain, SDOperand Dst,
SDOperand Src, uint64_t Size,
unsigned Align,
- const Value *DstSV, uint64_t DstOff) {
+ const Value *DstSV, uint64_t DstSVOff) {
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
// Expand memset to a series of load/store ops if the size operand
// falls below a certain threshold.
std::vector<MVT::ValueType> MemOps;
- if (!MeetsMaxMemopRequirement(MemOps, TLI.getMaxStoresPerMemset(),
- Size, Align, TLI))
+ if (!MeetsMaxMemopRequirement(MemOps, Dst, Src, TLI.getMaxStoresPerMemset(),
+ Size, Align, DAG, TLI))
return SDOperand();
SmallVector<SDOperand, 8> OutChains;
+ uint64_t DstOff = 0;
unsigned NumMemOps = MemOps.size();
for (unsigned i = 0; i < NumMemOps; i++) {
SDOperand Value = getMemsetValue(Src, VT, DAG);
SDOperand Store = DAG.getStore(Chain, Value,
getMemBasePlusOffset(Dst, DstOff, DAG),
- DstSV, DstOff);
+ DstSV, DstSVOff + DstOff);
OutChains.push_back(Store);
DstOff += VTSize;
}
SDOperand SelectionDAG::getMemcpy(SDOperand Chain, SDOperand Dst,
SDOperand Src, SDOperand Size,
unsigned Align, bool AlwaysInline,
- const Value *DstSV, uint64_t DstOff,
- const Value *SrcSV, uint64_t SrcOff) {
+ const Value *DstSV, uint64_t DstSVOff,
+ const Value *SrcSV, uint64_t SrcSVOff) {
// Check to see if we should lower the memcpy to loads and stores first.
// For cases within the target-specified limits, this is the best choice.
SDOperand Result =
getMemcpyLoadsAndStores(*this, Chain, Dst, Src, ConstantSize->getValue(),
- Align, false, DstSV, DstOff, SrcSV, SrcOff);
+ Align, false, DstSV, DstSVOff, SrcSV, SrcSVOff);
if (Result.Val)
return Result;
}
SDOperand Result =
TLI.EmitTargetCodeForMemcpy(*this, Chain, Dst, Src, Size, Align,
AlwaysInline,
- DstSV, DstOff, SrcSV, SrcOff);
+ DstSV, DstSVOff, SrcSV, SrcSVOff);
if (Result.Val)
return Result;
assert(ConstantSize && "AlwaysInline requires a constant size!");
return getMemcpyLoadsAndStores(*this, Chain, Dst, Src,
ConstantSize->getValue(), Align, true,
- DstSV, DstOff, SrcSV, SrcOff);
+ DstSV, DstSVOff, SrcSV, SrcSVOff);
}
// Emit a library call.
SDOperand SelectionDAG::getMemmove(SDOperand Chain, SDOperand Dst,
SDOperand Src, SDOperand Size,
unsigned Align,
- const Value *DstSV, uint64_t DstOff,
- const Value *SrcSV, uint64_t SrcOff) {
+ const Value *DstSV, uint64_t DstSVOff,
+ const Value *SrcSV, uint64_t SrcSVOff) {
+
+ // Check to see if we should lower the memmove to loads and stores first.
+ // For cases within the target-specified limits, this is the best choice.
+ ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
+ if (ConstantSize) {
+ // Memmove with size zero? Just return the original chain.
+ if (ConstantSize->isNullValue())
+ return Chain;
- // TODO: Optimize small memmove cases with simple loads and stores,
- // ensuring that all loads precede all stores. This can cause severe
- // register pressure, so targets should be careful with the size limit.
+ SDOperand Result =
+ getMemmoveLoadsAndStores(*this, Chain, Dst, Src, ConstantSize->getValue(),
+ Align, false, DstSV, DstSVOff, SrcSV, SrcSVOff);
+ if (Result.Val)
+ return Result;
+ }
// Then check to see if we should lower the memmove with target-specific
// code. If the target chooses to do this, this is the next best.
SDOperand Result =
TLI.EmitTargetCodeForMemmove(*this, Chain, Dst, Src, Size, Align,
- DstSV, DstOff, SrcSV, SrcOff);
+ DstSV, DstSVOff, SrcSV, SrcSVOff);
if (Result.Val)
return Result;
SDOperand SelectionDAG::getMemset(SDOperand Chain, SDOperand Dst,
SDOperand Src, SDOperand Size,
unsigned Align,
- const Value *DstSV, uint64_t DstOff) {
+ const Value *DstSV, uint64_t DstSVOff) {
// Check to see if we should lower the memset to stores first.
// For cases within the target-specified limits, this is the best choice.
SDOperand Result =
getMemsetStores(*this, Chain, Dst, Src, ConstantSize->getValue(), Align,
- DstSV, DstOff);
+ DstSV, DstSVOff);
if (Result.Val)
return Result;
}
// code. If the target chooses to do this, this is the next best.
SDOperand Result =
TLI.EmitTargetCodeForMemset(*this, Chain, Dst, Src, Size, Align,
- DstSV, DstOff);
+ DstSV, DstSVOff);
if (Result.Val)
return Result;
SDOperand SelectionDAG::getAtomic(unsigned Opcode, SDOperand Chain,
SDOperand Ptr, SDOperand Val,
MVT::ValueType VT) {
- assert((Opcode == ISD::ATOMIC_LAS || Opcode == ISD::ATOMIC_SWAP)
+ assert(( Opcode == ISD::ATOMIC_LAS || Opcode == ISD::ATOMIC_LSS
+ || Opcode == ISD::ATOMIC_SWAP || Opcode == ISD::ATOMIC_LOAD_AND
+ || Opcode == ISD::ATOMIC_LOAD_OR || Opcode == ISD::ATOMIC_LOAD_XOR
+ || Opcode == ISD::ATOMIC_LOAD_MIN || Opcode == ISD::ATOMIC_LOAD_MAX
+ || Opcode == ISD::ATOMIC_LOAD_UMIN || Opcode == ISD::ATOMIC_LOAD_UMAX)
&& "Invalid Atomic Op");
SDVTList VTs = getVTList(Val.getValueType(), MVT::Other);
FoldingSetNodeID ID;
}
bool Indexed = AM != ISD::UNINDEXED;
- assert(Indexed || Offset.getOpcode() == ISD::UNDEF &&
+ assert((Indexed || Offset.getOpcode() == ISD::UNDEF) &&
"Unindexed load with an offset!");
SDVTList VTs = Indexed ?
if (SDNode *Existing = FindModifiedNodeSlot(N, Ops, NumOps, InsertPos))
return SDOperand(Existing, InN.ResNo);
- // Nope it doesn't. Remove the node from it's current place in the maps.
+ // Nope it doesn't. Remove the node from its current place in the maps.
if (InsertPos)
RemoveNodeFromCSEMaps(N);
/// opcode, types, and operands to the specified value. This should only be
/// used by the SelectionDAG class.
void SDNode::MorphNodeTo(unsigned Opc, SDVTList L,
- const SDOperand *Ops, unsigned NumOps) {
+ SDOperandPtr Ops, unsigned NumOps) {
NodeType = Opc;
ValueList = L.VTs;
NumValues = L.NumVTs;
RemoveNodeFromCSEMaps(N);
- N->MorphNodeTo(ISD::BUILTIN_OP_END+TargetOpc, VTs, 0, 0);
+ N->MorphNodeTo(ISD::BUILTIN_OP_END+TargetOpc, VTs, SDOperandPtr(), 0);
CSEMap.InsertNode(N, IP);
return N;
HandleSDNode::~HandleSDNode() {
SDVTList VTs = { 0, 0 };
- MorphNodeTo(ISD::HANDLENODE, VTs, 0, 0); // Drops operand uses.
+ MorphNodeTo(ISD::HANDLENODE, VTs, SDOperandPtr(), 0); // Drops operand uses.
}
GlobalAddressSDNode::GlobalAddressSDNode(bool isTarget, const GlobalValue *GA,
case ISD::MEMBARRIER: return "MemBarrier";
case ISD::ATOMIC_LCS: return "AtomicLCS";
case ISD::ATOMIC_LAS: return "AtomicLAS";
- case ISD::ATOMIC_SWAP: return "AtomicSWAP";
+ case ISD::ATOMIC_LSS: return "AtomicLSS";
+ case ISD::ATOMIC_LOAD_AND: return "AtomicLoadAnd";
+ case ISD::ATOMIC_LOAD_OR: return "AtomicLoadOr";
+ case ISD::ATOMIC_LOAD_XOR: return "AtomicLoadXor";
+ case ISD::ATOMIC_LOAD_MIN: return "AtomicLoadMin";
+ case ISD::ATOMIC_LOAD_MAX: return "AtomicLoadMax";
+ case ISD::ATOMIC_LOAD_UMIN: return "AtomicLoadUMin";
+ case ISD::ATOMIC_LOAD_UMAX: return "AtomicLoadUMax";
+ case ISD::ATOMIC_SWAP: return "AtomicSWAP";
case ISD::PCMARKER: return "PCMarker";
case ISD::READCYCLECOUNTER: return "ReadCycleCounter";
case ISD::SRCVALUE: return "SrcValue";
case ISD::FGETSIGN: return "fgetsign";
case ISD::SETCC: return "setcc";
+ case ISD::VSETCC: return "vsetcc";
case ISD::SELECT: return "select";
case ISD::SELECT_CC: return "select_cc";
case ISD::INSERT_VECTOR_ELT: return "insert_vector_elt";