Add patterns for the x86 popcnt instruction.

[oota-llvm.git] / lib / Target / X86 / X86ISelLowering.cpp
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp

index f698ceceac52f3ea6bdf1da47887e29863fdc860..478bf71c686c6d7b732cc94e7a36846a13de90ba 100644 (file)
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -104,8 +104,8 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
      setLibcallName(RTLIB::FPTOUINT_F32_I64, "_ftol2");
      setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::X86_StdCall);
      setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::X86_StdCall);
-    setLibcallCallingConv(RTLIB::FPTOUINT_F64_I64, CallingConv::X86_StdCall);
-    setLibcallCallingConv(RTLIB::FPTOUINT_F32_I64, CallingConv::X86_StdCall);
+    setLibcallCallingConv(RTLIB::FPTOUINT_F64_I64, CallingConv::C);
+    setLibcallCallingConv(RTLIB::FPTOUINT_F32_I64, CallingConv::C);
    }
  
    if (Subtarget->isTargetDarwin()) {
@@ -226,12 +226,12 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
  
    // TODO: when we have SSE, these could be more efficient, by using movd/movq.
    if (!X86ScalarSSEf64) {
-    setOperationAction(ISD::BIT_CONVERT      , MVT::f32  , Expand);
-    setOperationAction(ISD::BIT_CONVERT      , MVT::i32  , Expand);
+    setOperationAction(ISD::BITCAST        , MVT::f32  , Expand);
+    setOperationAction(ISD::BITCAST        , MVT::i32  , Expand);
      if (Subtarget->is64Bit()) {
-      setOperationAction(ISD::BIT_CONVERT    , MVT::f64  , Expand);
+      setOperationAction(ISD::BITCAST      , MVT::f64  , Expand);
        // Without SSE, i64->f64 goes through memory.
-      setOperationAction(ISD::BIT_CONVERT    , MVT::i64  , Expand);
+      setOperationAction(ISD::BITCAST      , MVT::i64  , Expand);
      }
    }
  
@@ -285,21 +285,27 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
    setOperationAction(ISD::FREM             , MVT::f80  , Expand);
    setOperationAction(ISD::FLT_ROUNDS_      , MVT::i32  , Custom);
  
-  setOperationAction(ISD::CTPOP            , MVT::i8   , Expand);
    setOperationAction(ISD::CTTZ             , MVT::i8   , Custom);
    setOperationAction(ISD::CTLZ             , MVT::i8   , Custom);
-  setOperationAction(ISD::CTPOP            , MVT::i16  , Expand);
    setOperationAction(ISD::CTTZ             , MVT::i16  , Custom);
    setOperationAction(ISD::CTLZ             , MVT::i16  , Custom);
-  setOperationAction(ISD::CTPOP            , MVT::i32  , Expand);
    setOperationAction(ISD::CTTZ             , MVT::i32  , Custom);
    setOperationAction(ISD::CTLZ             , MVT::i32  , Custom);
    if (Subtarget->is64Bit()) {
-    setOperationAction(ISD::CTPOP          , MVT::i64  , Expand);
      setOperationAction(ISD::CTTZ           , MVT::i64  , Custom);
      setOperationAction(ISD::CTLZ           , MVT::i64  , Custom);
    }
  
+  if (Subtarget->hasPOPCNT()) {
+    setOperationAction(ISD::CTPOP          , MVT::i8   , Promote);
+  } else {
+    setOperationAction(ISD::CTPOP          , MVT::i8   , Expand);
+    setOperationAction(ISD::CTPOP          , MVT::i16  , Expand);
+    setOperationAction(ISD::CTPOP          , MVT::i32  , Expand);
+    if (Subtarget->is64Bit())
+      setOperationAction(ISD::CTPOP        , MVT::i64  , Expand);
+  }
+
    setOperationAction(ISD::READCYCLECOUNTER , MVT::i64  , Custom);
    setOperationAction(ISD::BSWAP            , MVT::i16  , Expand);
  
@@ -521,13 +527,12 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
      setOperationAction(ISD::UNDEF,     MVT::f80, Expand);
      setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
      {
-      bool ignored;
-      APFloat TmpFlt(+0.0);
-      TmpFlt.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven,
-                     &ignored);
+      APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended);
        addLegalFPImmediate(TmpFlt);  // FLD0
        TmpFlt.changeSign();
        addLegalFPImmediate(TmpFlt);  // FLD0/FCHS
+
+      bool ignored;
        APFloat TmpFlt2(+1.0);
        TmpFlt2.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven,
                        &ignored);
@@ -654,10 +659,10 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
    setOperationAction(ISD::SELECT,             MVT::v4i16, Expand);
    setOperationAction(ISD::SELECT,             MVT::v2i32, Expand);
    setOperationAction(ISD::SELECT,             MVT::v1i64, Expand);
-  setOperationAction(ISD::BIT_CONVERT,        MVT::v8i8,  Expand);
-  setOperationAction(ISD::BIT_CONVERT,        MVT::v4i16, Expand);
-  setOperationAction(ISD::BIT_CONVERT,        MVT::v2i32, Expand);
-  setOperationAction(ISD::BIT_CONVERT,        MVT::v1i64, Expand);
+  setOperationAction(ISD::BITCAST,            MVT::v8i8,  Expand);
+  setOperationAction(ISD::BITCAST,            MVT::v4i16, Expand);
+  setOperationAction(ISD::BITCAST,            MVT::v2i32, Expand);
+  setOperationAction(ISD::BITCAST,            MVT::v1i64, Expand);
  
    if (!UseSoftFloat && Subtarget->hasSSE1()) {
      addRegisterClass(MVT::v4f32, X86::VR128RegisterClass);
@@ -1102,16 +1107,6 @@ unsigned X86TargetLowering::getJumpTableEncoding() const {
    return TargetLowering::getJumpTableEncoding();
  }
  
-/// getPICBaseSymbol - Return the X86-32 PIC base.
-MCSymbol *
-X86TargetLowering::getPICBaseSymbol(const MachineFunction *MF,
-                                    MCContext &Ctx) const {
-  const MCAsmInfo &MAI = *getTargetMachine().getMCAsmInfo();
-  return Ctx.GetOrCreateSymbol(Twine(MAI.getPrivateGlobalPrefix())+
-                               Twine(MF->getFunctionNumber())+"$pb");
-}
-
-
  const MCExpr *
  X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
                                               const MachineBasicBlock *MBB,
@@ -1146,7 +1141,7 @@ getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
      return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);
  
    // Otherwise, the reference is relative to the PIC base.
-  return MCSymbolRefExpr::Create(getPICBaseSymbol(MF, Ctx), Ctx);
+  return MCSymbolRefExpr::Create(MF->getPICBaseSymbol(), Ctx);
  }
  
  /// getFunctionAlignment - Return the Log2 alignment of this function.
@@ -1182,7 +1177,9 @@ X86TargetLowering::findRepresentativeClass(EVT VT) const{
  unsigned
  X86TargetLowering::getRegPressureLimit(const TargetRegisterClass *RC,
                                         MachineFunction &MF) const {
-  unsigned FPDiff = RegInfo->hasFP(MF) ? 1 : 0;
+  const TargetFrameInfo *TFI = MF.getTarget().getFrameInfo();
+
+  unsigned FPDiff = TFI->hasFP(MF) ? 1 : 0;
    switch (RC->getID()) {
    default:
      return 0;
@@ -1301,13 +1298,13 @@ X86TargetLowering::LowerReturn(SDValue Chain,
      if (Subtarget->is64Bit()) {
        if (ValVT == MVT::x86mmx) {
          if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
-          ValToCopy = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i64, ValToCopy);
+          ValToCopy = DAG.getNode(ISD::BITCAST, dl, MVT::i64, ValToCopy);
            ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
                                    ValToCopy);
            // If we don't have SSE2 available, convert to v4f32 so the generated
            // register is legal.
            if (!Subtarget->hasSSE2())
-            ValToCopy = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v4f32,ValToCopy);
+            ValToCopy = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32,ValToCopy);
          }
        }
      }
@@ -1346,6 +1343,28 @@ X86TargetLowering::LowerReturn(SDValue Chain,
                       MVT::Other, &RetOps[0], RetOps.size());
  }
  
+bool X86TargetLowering::isUsedByReturnOnly(SDNode *N) const {  // true iff N's sole result only feeds a return
+  if (N->getNumValues() != 1)  // N must produce exactly one value
+    return false;
+  if (!N->hasNUsesOfValue(1, 0))  // ...and result #0 must have exactly one use
+    return false;
+
+  SDNode *Copy = *N->use_begin();  // the single user of N
+  if (Copy->getOpcode() != ISD::CopyToReg &&  // only a CopyToReg or an FP_EXTEND
+      Copy->getOpcode() != ISD::FP_EXTEND)    // is accepted between N and the return
+    return false;
+
+  bool HasRet = false;  // stays false when Copy has no users at all
+  for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end();
+       UI != UE; ++UI) {
+    if (UI->getOpcode() != X86ISD::RET_FLAG)  // any non-return user disqualifies N
+      return false;
+    HasRet = true;
+  }
+
+  return HasRet;  // every user of Copy was RET_FLAG, and there was at least one
+}
+
  /// LowerCallResult - Lower the result values of a call into the
  /// appropriate copies out of appropriate physical registers.
  ///
@@ -1414,7 +1433,7 @@ X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
                                     MVT::i64, InFlag).getValue(1);
          Val = Chain.getValue(0);
        }
-      Val = DAG.getNode(ISD::BIT_CONVERT, dl, CopyVT, Val);
+      Val = DAG.getNode(ISD::BITCAST, dl, CopyVT, Val);
      } else {
        Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(),
                                   CopyVT, InFlag).getValue(1);
@@ -1457,30 +1476,6 @@ ArgsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins) {
    return Ins[0].Flags.isSRet();
  }
  
-/// CCAssignFnForNode - Selects the correct CCAssignFn for a the
-/// given CallingConvention value.
-CCAssignFn *X86TargetLowering::CCAssignFnForNode(CallingConv::ID CC) const {
-  if (Subtarget->is64Bit()) {
-    if (CC == CallingConv::GHC)
-      return CC_X86_64_GHC;
-    else if (Subtarget->isTargetWin64())
-      return CC_X86_Win64_C;
-    else
-      return CC_X86_64_C;
-  }
-
-  if (CC == CallingConv::X86_FastCall)
-    return CC_X86_32_FastCall;
-  else if (CC == CallingConv::X86_ThisCall)
-    return CC_X86_32_ThisCall;
-  else if (CC == CallingConv::Fast)
-    return CC_X86_32_FastCC;
-  else if (CC == CallingConv::GHC)
-    return CC_X86_32_GHC;
-  else
-    return CC_X86_32_C;
-}
-
  /// CreateCopyOfByValArgument - Make a copy of an aggregate at address specified
  /// by "Src" to address "Dst" with size and alignment information specified by
  /// the specific parameter attribute. The copy will be passed as a byval
@@ -1576,7 +1571,7 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain,
    SmallVector<CCValAssign, 16> ArgLocs;
    CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
                   ArgLocs, *DAG.getContext());
-  CCInfo.AnalyzeFormalArguments(Ins, CCAssignFnForNode(CallConv));
+  CCInfo.AnalyzeFormalArguments(Ins, CC_X86);
  
    unsigned LastVal = ~0U;
    SDValue ArgValue;
@@ -1621,7 +1616,7 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain,
          ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
                                 DAG.getValueType(VA.getValVT()));
        else if (VA.getLocInfo() == CCValAssign::BCvt)
-        ArgValue = DAG.getNode(ISD::BIT_CONVERT, dl, VA.getValVT(), ArgValue);
+        ArgValue = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), ArgValue);
  
        if (VA.isExtInLoc()) {
          // Handle MMX values passed in XMM regs.
@@ -1895,7 +1890,7 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee,
    SmallVector<CCValAssign, 16> ArgLocs;
    CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
                   ArgLocs, *DAG.getContext());
-  CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForNode(CallConv));
+  CCInfo.AnalyzeCallOperands(Outs, CC_X86);
  
    // Get a count of how many bytes are to be pushed on the stack.
    unsigned NumBytes = CCInfo.getNextStackOffset();
@@ -1954,14 +1949,14 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee,
      case CCValAssign::AExt:
        if (RegVT.isVector() && RegVT.getSizeInBits() == 128) {
          // Special case: passing MMX values in XMM registers.
-        Arg = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i64, Arg);
+        Arg = DAG.getNode(ISD::BITCAST, dl, MVT::i64, Arg);
          Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
          Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
        } else
          Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
        break;
      case CCValAssign::BCvt:
-      Arg = DAG.getNode(ISD::BIT_CONVERT, dl, RegVT, Arg);
+      Arg = DAG.getNode(ISD::BITCAST, dl, RegVT, Arg);
        break;
      case CCValAssign::Indirect: {
        // Store the argument.
@@ -2174,8 +2169,8 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee,
    } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
      unsigned char OpFlags = 0;
  
-    // On ELF targets, in either X86-64 or X86-32 mode, direct calls to external
-    // symbols should go through the PLT.
+    // On ELF targets, in either X86-64 or X86-32 mode, direct calls to
+    // external symbols should go through the PLT.
      if (Subtarget->isTargetELF() &&
          getTargetMachine().getRelocationModel() == Reloc::PIC_) {
        OpFlags = X86II::MO_PLT;
@@ -2478,7 +2473,7 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
      SmallVector<CCValAssign, 16> ArgLocs;
      CCState CCInfo(CalleeCC, isVarArg, getTargetMachine(),
                     ArgLocs, *DAG.getContext());
-    CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForNode(CalleeCC));
+    CCInfo.AnalyzeCallOperands(Outs, CC_X86);
      if (CCInfo.getNextStackOffset()) {
        MachineFunction &MF = DAG.getMachineFunction();
        if (MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn())
@@ -2533,7 +2528,7 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
    }
  
    // An stdcall caller is expected to clean up its arguments; the callee
-  // isn't going to do that.   PR 8461.
+  // isn't going to do that.
    if (!CCMatch && CallerCC==CallingConv::X86_StdCall)
      return false;
  
@@ -3533,7 +3528,7 @@ static SDValue getZeroVector(EVT VT, bool HasSSE2, SelectionDAG &DAG,
      SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
      Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8f32, Ops, 8);
    }
-  return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Vec);
+  return DAG.getNode(ISD::BITCAST, dl, VT, Vec);
  }
  
  /// getOnesVector - Returns a vector of specified type with all bits set.
@@ -3546,7 +3541,7 @@ static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, DebugLoc dl) {
    SDValue Cst = DAG.getTargetConstant(~0U, MVT::i32);
    SDValue Vec;
    Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
-  return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Vec);
+  return DAG.getNode(ISD::BITCAST, dl, VT, Vec);
  }
  
  
@@ -3631,9 +3626,9 @@ static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG) {
  
    // Perform the splat.
    int SplatMask[4] = { EltNo, EltNo, EltNo, EltNo };
-  V1 = DAG.getNode(ISD::BIT_CONVERT, dl, PVT, V1);
+  V1 = DAG.getNode(ISD::BITCAST, dl, PVT, V1);
    V1 = DAG.getVectorShuffle(PVT, dl, V1, DAG.getUNDEF(PVT), &SplatMask[0]);
-  return DAG.getNode(ISD::BIT_CONVERT, dl, VT, V1);
+  return DAG.getNode(ISD::BITCAST, dl, VT, V1);
  }
  
  /// getShuffleVectorZeroOrUndef - Return a vector_shuffle of the specified
@@ -3757,7 +3752,7 @@ SDValue getShuffleScalarElt(SDNode *N, int Index, SelectionDAG &DAG,
    }
  
    // Actual nodes that may contain scalar elements
-  if (Opcode == ISD::BIT_CONVERT) {
+  if (Opcode == ISD::BITCAST) {
      V = V.getOperand(0);
      EVT SrcVT = V.getValueType();
      unsigned NumElems = VT.getVectorNumElements();
@@ -3946,7 +3941,7 @@ static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
      }
    }
  
-  return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, V);
+  return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V);
  }
  
  /// LowerBuildVectorv8i16 - Custom lower build_vector of v8i16.
@@ -3987,8 +3982,8 @@ static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp,
                           const TargetLowering &TLI, DebugLoc dl) {
    EVT ShVT = MVT::v2i64;
    unsigned Opc = isLeft ? X86ISD::VSHL : X86ISD::VSRL;
-  SrcOp = DAG.getNode(ISD::BIT_CONVERT, dl, ShVT, SrcOp);
-  return DAG.getNode(ISD::BIT_CONVERT, dl, VT,
+  SrcOp = DAG.getNode(ISD::BITCAST, dl, ShVT, SrcOp);
+  return DAG.getNode(ISD::BITCAST, dl, VT,
                       DAG.getNode(Opc, dl, ShVT, SrcOp,
                               DAG.getConstant(NumBits, TLI.getShiftAmountTy())));
  }
@@ -4055,8 +4050,8 @@ X86TargetLowering::LowerAsSplatVectorLoad(SDValue SrcOp, EVT VT, DebugLoc dl,
                               LD->getPointerInfo().getWithOffset(StartOffset),
                               false, false, 0);
      // Canonicalize it to a v4i32 shuffle.
-    V1 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v4i32, V1);
-    return DAG.getNode(ISD::BIT_CONVERT, dl, VT,
+    V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, V1);
+    return DAG.getNode(ISD::BITCAST, dl, VT,
                         DAG.getVectorShuffle(MVT::v4i32, dl, V1,
                                              DAG.getUNDEF(MVT::v4i32),&Mask[0]));
    }
@@ -4124,7 +4119,7 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts,
      SDValue ResNode = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys,
                                                Ops, 2, MVT::i32,
                                                LDBase->getMemOperand());
-    return DAG.getNode(ISD::BIT_CONVERT, DL, VT, ResNode);
+    return DAG.getNode(ISD::BITCAST, DL, VT, ResNode);
    }
    return SDValue();
  }
@@ -4216,7 +4211,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
                                        DAG.getUNDEF(Item.getValueType()),
                                        &Mask[0]);
          }
-        return DAG.getNode(ISD::BIT_CONVERT, dl, Op.getValueType(), Item);
+        return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Item);
        }
      }
  
@@ -4240,7 +4235,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
          Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MiddleVT, Item);
          Item = getShuffleVectorZeroOrUndef(Item, 0, true,
                                             Subtarget->hasSSE2(), DAG);
-        return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Item);
+        return DAG.getNode(ISD::BITCAST, dl, VT, Item);
        }
      }
  
@@ -4433,21 +4428,21 @@ X86TargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const {
    assert(ResVT == MVT::v2i64 || ResVT == MVT::v4i32 ||
           ResVT == MVT::v8i16 || ResVT == MVT::v16i8);
    int Mask[2];
-  SDValue InVec = DAG.getNode(ISD::BIT_CONVERT,dl, MVT::v1i64, Op.getOperand(0));
+  SDValue InVec = DAG.getNode(ISD::BITCAST,dl, MVT::v1i64, Op.getOperand(0));
    SDValue VecOp = DAG.getNode(X86ISD::MOVQ2DQ, dl, MVT::v2i64, InVec);
    InVec = Op.getOperand(1);
    if (InVec.getOpcode() == ISD::SCALAR_TO_VECTOR) {
      unsigned NumElts = ResVT.getVectorNumElements();
-    VecOp = DAG.getNode(ISD::BIT_CONVERT, dl, ResVT, VecOp);
+    VecOp = DAG.getNode(ISD::BITCAST, dl, ResVT, VecOp);
      VecOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ResVT, VecOp,
                         InVec.getOperand(0), DAG.getIntPtrConstant(NumElts/2+1));
    } else {
-    InVec = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v1i64, InVec);
+    InVec = DAG.getNode(ISD::BITCAST, dl, MVT::v1i64, InVec);
      SDValue VecOp2 = DAG.getNode(X86ISD::MOVQ2DQ, dl, MVT::v2i64, InVec);
      Mask[0] = 0; Mask[1] = 2;
      VecOp = DAG.getVectorShuffle(MVT::v2i64, dl, VecOp, VecOp2, Mask);
    }
-  return DAG.getNode(ISD::BIT_CONVERT, dl, ResVT, VecOp);
+  return DAG.getNode(ISD::BITCAST, dl, ResVT, VecOp);
  }
  
  // v8i16 shuffles - Prefer shuffles in the following order:
@@ -4529,9 +4524,9 @@ X86TargetLowering::LowerVECTOR_SHUFFLEv8i16(SDValue Op,
      MaskV.push_back(BestLoQuad < 0 ? 0 : BestLoQuad);
      MaskV.push_back(BestHiQuad < 0 ? 1 : BestHiQuad);
      NewV = DAG.getVectorShuffle(MVT::v2i64, dl,
-                  DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, V1),
-                  DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, V2), &MaskV[0]);
-    NewV = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, NewV);
+                  DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V1),
+                  DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V2), &MaskV[0]);
+    NewV = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, NewV);
  
      // Rewrite the MaskVals and assign NewV to V1 if NewV now contains all the
      // source words for the shuffle, to aid later transformations.
@@ -4600,12 +4595,12 @@ X86TargetLowering::LowerVECTOR_SHUFFLEv8i16(SDValue Op,
        pshufbMask.push_back(DAG.getConstant(EltIdx,   MVT::i8));
        pshufbMask.push_back(DAG.getConstant(EltIdx+1, MVT::i8));
      }
-    V1 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, V1);
+    V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V1);
      V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1,
                       DAG.getNode(ISD::BUILD_VECTOR, dl,
                                   MVT::v16i8, &pshufbMask[0], 16));
      if (!TwoInputs)
-      return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, V1);
+      return DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1);
  
      // Calculate the shuffle mask for the second input, shuffle it, and
      // OR it with the first shuffled input.
@@ -4620,12 +4615,12 @@ X86TargetLowering::LowerVECTOR_SHUFFLEv8i16(SDValue Op,
        pshufbMask.push_back(DAG.getConstant(EltIdx - 16, MVT::i8));
        pshufbMask.push_back(DAG.getConstant(EltIdx - 15, MVT::i8));
      }
-    V2 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, V2);
+    V2 = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V2);
      V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2,
                       DAG.getNode(ISD::BUILD_VECTOR, dl,
                                   MVT::v16i8, &pshufbMask[0], 16));
      V1 = DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2);
-    return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, V1);
+    return DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1);
    }
  
    // If BestLoQuad >= 0, generate a pshuflw to put the low elements in order,
@@ -4792,8 +4787,8 @@ SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp,
    // No SSSE3 - Calculate in place words and then fix all out of place words
    // With 0-16 extracts & inserts.  Worst case is 16 bytes out of order from
    // the 16 different words that comprise the two doublequadword input vectors.
-  V1 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, V1);
-  V2 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, V2);
+  V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1);
+  V2 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V2);
    SDValue NewV = V2Only ? V2 : V1;
    for (int i = 0; i != 8; ++i) {
      int Elt0 = MaskVals[i*2];
@@ -4855,7 +4850,7 @@ SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp,
      NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt,
                         DAG.getIntPtrConstant(i));
    }
-  return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, NewV);
+  return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, NewV);
  }
  
  /// RewriteAsNarrowerShuffle - Try rewriting v8i16 and v16i8 shuffles as 4 wide
@@ -4899,8 +4894,8 @@ SDValue RewriteAsNarrowerShuffle(ShuffleVectorSDNode *SVOp,
        MaskVec.push_back(StartIdx / Scale);
    }
  
-  V1 = DAG.getNode(ISD::BIT_CONVERT, dl, NewVT, V1);
-  V2 = DAG.getNode(ISD::BIT_CONVERT, dl, NewVT, V2);
+  V1 = DAG.getNode(ISD::BITCAST, dl, NewVT, V1);
+  V2 = DAG.getNode(ISD::BITCAST, dl, NewVT, V2);
    return DAG.getVectorShuffle(NewVT, dl, V1, V2, &MaskVec[0]);
  }
  
@@ -4917,13 +4912,13 @@ static SDValue getVZextMovL(EVT VT, EVT OpVT,
        // movssrr and movsdrr do not clear top bits. Try to use movd, movq
        // instead.
        MVT ExtVT = (OpVT == MVT::v2f64) ? MVT::i64 : MVT::i32;
-      if ((ExtVT.SimpleTy != MVT::i64 || Subtarget->is64Bit()) &&
+      if ((ExtVT != MVT::i64 || Subtarget->is64Bit()) &&
            SrcOp.getOpcode() == ISD::SCALAR_TO_VECTOR &&
-          SrcOp.getOperand(0).getOpcode() == ISD::BIT_CONVERT &&
+          SrcOp.getOperand(0).getOpcode() == ISD::BITCAST &&
            SrcOp.getOperand(0).getOperand(0).getValueType() == ExtVT) {
          // PR2108
          OpVT = (OpVT == MVT::v2f64) ? MVT::v2i64 : MVT::v4i32;
-        return DAG.getNode(ISD::BIT_CONVERT, dl, VT,
+        return DAG.getNode(ISD::BITCAST, dl, VT,
                             DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT,
                                         DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
                                                     OpVT,
@@ -4933,9 +4928,9 @@ static SDValue getVZextMovL(EVT VT, EVT OpVT,
      }
    }
  
-  return DAG.getNode(ISD::BIT_CONVERT, dl, VT,
+  return DAG.getNode(ISD::BITCAST, dl, VT,
                       DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT,
-                                 DAG.getNode(ISD::BIT_CONVERT, dl,
+                                 DAG.getNode(ISD::BITCAST, dl,
                                               OpVT, SrcOp)));
  }
  
@@ -5089,7 +5084,7 @@ LowerVECTOR_SHUFFLE_4wide(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
  }
  
  static bool MayFoldVectorLoad(SDValue V) {
-  if (V.hasOneUse() && V.getOpcode() == ISD::BIT_CONVERT)
+  if (V.hasOneUse() && V.getOpcode() == ISD::BITCAST)
      V = V.getOperand(0);
    if (V.hasOneUse() && V.getOpcode() == ISD::SCALAR_TO_VECTOR)
      V = V.getOperand(0);
@@ -5106,7 +5101,7 @@ static bool MayFoldVectorLoad(SDValue V) {
  // one use. Remove this version after this bug get fixed.
  // rdar://8434668, PR8156
  static bool RelaxedMayFoldVectorLoad(SDValue V) {
-  if (V.hasOneUse() && V.getOpcode() == ISD::BIT_CONVERT)
+  if (V.hasOneUse() && V.getOpcode() == ISD::BITCAST)
      V = V.getOperand(0);
    if (V.hasOneUse() && V.getOpcode() == ISD::SCALAR_TO_VECTOR)
      V = V.getOperand(0);
@@ -5144,7 +5139,7 @@ bool CanXFormVExtractWithShuffleIntoLoad(SDValue V, SelectionDAG &DAG,
    // If the bit convert changed the number of elements, it is unsafe
    // to examine the mask.
    bool HasShuffleIntoBitcast = false;
-  if (V.getOpcode() == ISD::BIT_CONVERT) {
+  if (V.getOpcode() == ISD::BITCAST) {
      EVT SrcVT = V.getOperand(0).getValueType();
      if (SrcVT.getVectorNumElements() != VT.getVectorNumElements())
        return false;
@@ -5159,7 +5154,7 @@ bool CanXFormVExtractWithShuffleIntoLoad(SDValue V, SelectionDAG &DAG,
    V = (Idx < (int)NumElems) ? V.getOperand(0) : V.getOperand(1);
  
    // Skip one more bit_convert if necessary
-  if (V.getOpcode() == ISD::BIT_CONVERT)
+  if (V.getOpcode() == ISD::BITCAST)
      V = V.getOperand(0);
  
    if (ISD::isNormalLoad(V.getNode())) {
@@ -5196,8 +5191,8 @@ SDValue getMOVDDup(SDValue &Op, DebugLoc &dl, SDValue V1, SelectionDAG &DAG) {
    EVT VT = Op.getValueType();
  
    // Canonizalize to v2f64.
-  V1 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2f64, V1);
-  return DAG.getNode(ISD::BIT_CONVERT, dl, VT,
+  V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, V1);
+  return DAG.getNode(ISD::BITCAST, dl, VT,
                       getTargetShuffleNode(X86ISD::MOVDDUP, dl, MVT::v2f64,
                                            V1, DAG));
  }
@@ -5351,7 +5346,7 @@ SDValue NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG,
    if (VT == MVT::v8i16 || VT == MVT::v16i8) {
      SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, dl);
      if (NewOp.getNode())
-      return DAG.getNode(ISD::BIT_CONVERT, dl, VT, NewOp);
+      return DAG.getNode(ISD::BITCAST, dl, VT, NewOp);
    } else if ((VT == MVT::v4i32 || (VT == MVT::v4f32 && Subtarget->hasSSE2()))) {
      // FIXME: Figure out a cleaner way to do this.
      // Try to make use of movq to zero out the top part.
@@ -5661,7 +5656,7 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op,
      if (Idx == 0)
        return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
                           DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
-                                     DAG.getNode(ISD::BIT_CONVERT, dl,
+                                     DAG.getNode(ISD::BITCAST, dl,
                                                   MVT::v4i32,
                                                   Op.getOperand(0)),
                                       Op.getOperand(1)));
@@ -5682,14 +5677,14 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op,
      if ((User->getOpcode() != ISD::STORE ||
           (isa<ConstantSDNode>(Op.getOperand(1)) &&
            cast<ConstantSDNode>(Op.getOperand(1))->isNullValue())) &&
-        (User->getOpcode() != ISD::BIT_CONVERT ||
+        (User->getOpcode() != ISD::BITCAST ||
           User->getValueType(0) != MVT::i32))
        return SDValue();
      SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
-                                  DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v4i32,
+                                  DAG.getNode(ISD::BITCAST, dl, MVT::v4i32,
                                                Op.getOperand(0)),
                                                Op.getOperand(1));
-    return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::f32, Extract);
+    return DAG.getNode(ISD::BITCAST, dl, MVT::f32, Extract);
    } else if (VT == MVT::i32) {
      // ExtractPS works with constant index.
      if (isa<ConstantSDNode>(Op.getOperand(1)))
@@ -5720,7 +5715,7 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
      if (Idx == 0)
        return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
                           DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
-                                     DAG.getNode(ISD::BIT_CONVERT, dl,
+                                     DAG.getNode(ISD::BITCAST, dl,
                                                   MVT::v4i32, Vec),
                                       Op.getOperand(1)));
      // Transform it so it match pextrw which produces a 32-bit result.
@@ -5851,7 +5846,7 @@ X86TargetLowering::LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const {
    SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
    assert(Op.getValueType().getSimpleVT().getSizeInBits() == 128 &&
           "Expected an SSE type!");
-  return DAG.getNode(ISD::BIT_CONVERT, dl, Op.getValueType(),
+  return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(),
                       DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,AnyExt));
  }
  
@@ -5918,12 +5913,11 @@ SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
    Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
  
    // With PIC, the address is actually $g + Offset.
-  if (OpFlag) {
+  if (OpFlag)
      Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
                           DAG.getNode(X86ISD::GlobalBaseReg,
                                       DebugLoc(), getPointerTy()),
                           Result);
-  }
  
    return Result;
  }
@@ -6423,7 +6417,7 @@ SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op,
                                MachinePointerInfo::getConstantPool(),
                                false, false, 16);
    SDValue Unpck2 = getUnpackl(DAG, dl, MVT::v4i32, Unpck1, CLod0);
-  SDValue XR2F = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2f64, Unpck2);
+  SDValue XR2F = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Unpck2);
    SDValue CLod1 = DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
                                MachinePointerInfo::getConstantPool(),
                                false, false, 16);
@@ -6453,19 +6447,19 @@ SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op,
                                           DAG.getIntPtrConstant(0)));
  
    Load = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
-                     DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2f64, Load),
+                     DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Load),
                       DAG.getIntPtrConstant(0));
  
    // Or the load with the bias.
    SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64,
-                           DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64,
+                           DAG.getNode(ISD::BITCAST, dl, MVT::v2i64,
                                         DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
                                                     MVT::v2f64, Load)),
-                           DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64,
+                           DAG.getNode(ISD::BITCAST, dl, MVT::v2i64,
                                         DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
                                                     MVT::v2f64, Bias)));
    Or = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
-                   DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2f64, Or),
+                   DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Or),
                     DAG.getIntPtrConstant(0));
  
    // Subtract the bias.
@@ -6723,11 +6717,11 @@ SDValue X86TargetLowering::LowerFNEG(SDValue Op, SelectionDAG &DAG) const {
                               MachinePointerInfo::getConstantPool(),
                               false, false, 16);
    if (VT.isVector()) {
-    return DAG.getNode(ISD::BIT_CONVERT, dl, VT,
+    return DAG.getNode(ISD::BITCAST, dl, VT,
                         DAG.getNode(ISD::XOR, dl, MVT::v2i64,
-                    DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64,
+                    DAG.getNode(ISD::BITCAST, dl, MVT::v2i64,
                                  Op.getOperand(0)),
-                    DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, Mask)));
+                    DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Mask)));
    } else {
      return DAG.getNode(X86ISD::FXOR, dl, VT, Op.getOperand(0), Mask);
    }
@@ -6779,7 +6773,7 @@ SDValue X86TargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
      SignBit = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, SignBit);
      SignBit = DAG.getNode(X86ISD::FSRL, dl, MVT::v2f64, SignBit,
                            DAG.getConstant(32, MVT::i32));
-    SignBit = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v4f32, SignBit);
+    SignBit = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, SignBit);
      SignBit = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, SignBit,
                            DAG.getIntPtrConstant(0));
    }
@@ -7928,7 +7922,7 @@ X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const
      }
  
      EVT VT = Op.getValueType();
-    ShAmt = DAG.getNode(ISD::BIT_CONVERT, dl, VT, ShAmt);
+    ShAmt = DAG.getNode(ISD::BITCAST, dl, VT, ShAmt);
      return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
                         DAG.getConstant(NewIntNo, MVT::i32),
                         Op.getOperand(1), ShAmt);
@@ -8362,7 +8356,7 @@ SDValue X86TargetLowering::LowerSHL(SDValue Op, SelectionDAG &DAG) const {
                                   false, false, 16);
  
      Op = DAG.getNode(ISD::ADD, dl, VT, Op, Addend);
-    Op = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v4f32, Op);
+    Op = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, Op);
      Op = DAG.getNode(ISD::FP_TO_SINT, dl, VT, Op);
      return DAG.getNode(ISD::MUL, dl, VT, Op, R);
    }
@@ -8583,16 +8577,16 @@ SDValue X86TargetLowering::LowerREADCYCLECOUNTER(SDValue Op,
    return DAG.getMergeValues(Ops, 2, dl);
  }
  
-SDValue X86TargetLowering::LowerBIT_CONVERT(SDValue Op,
+SDValue X86TargetLowering::LowerBITCAST(SDValue Op,
                                              SelectionDAG &DAG) const {
    EVT SrcVT = Op.getOperand(0).getValueType();
    EVT DstVT = Op.getValueType();
    assert((Subtarget->is64Bit() && !Subtarget->hasSSE2() &&
            Subtarget->hasMMX() && !DisableMMX) &&
-         "Unexpected custom BIT_CONVERT");
+         "Unexpected custom BITCAST");
    assert((DstVT == MVT::i64 ||
            (DstVT.isVector() && DstVT.getSizeInBits()==64)) &&
-         "Unexpected custom BIT_CONVERT");
+         "Unexpected custom BITCAST");
    // i64 <=> MMX conversions are Legal.
    if (SrcVT==MVT::i64 && DstVT.isVector())
      return Op;
@@ -8675,7 +8669,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
    case ISD::SMULO:
    case ISD::UMULO:              return LowerXALUO(Op, DAG);
    case ISD::READCYCLECOUNTER:   return LowerREADCYCLECOUNTER(Op, DAG);
-  case ISD::BIT_CONVERT:        return LowerBIT_CONVERT(Op, DAG);
+  case ISD::BITCAST:            return LowerBITCAST(Op, DAG);
    }
  }
  
@@ -9438,15 +9432,12 @@ X86TargetLowering::EmitAtomicMinMaxWithCustomInserter(MachineInstr *mInstr,
  MachineBasicBlock *
  X86TargetLowering::EmitPCMP(MachineInstr *MI, MachineBasicBlock *BB,
                              unsigned numArgs, bool memArg) const {
-
    assert((Subtarget->hasSSE42() || Subtarget->hasAVX()) &&
           "Target must have SSE4.2 or AVX features enabled");
  
    DebugLoc dl = MI->getDebugLoc();
    const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
-
    unsigned Opc;
-
    if (!Subtarget->hasAVX()) {
      if (memArg)
        Opc = numArgs == 3 ? X86::PCMPISTRM128rm : X86::PCMPESTRM128rm;
@@ -9459,20 +9450,59 @@ X86TargetLowering::EmitPCMP(MachineInstr *MI, MachineBasicBlock *BB,
        Opc = numArgs == 3 ? X86::VPCMPISTRM128rr : X86::VPCMPESTRM128rr;
    }
  
-  MachineInstrBuilder MIB = BuildMI(BB, dl, TII->get(Opc));
-
+  MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc));
    for (unsigned i = 0; i < numArgs; ++i) {
      MachineOperand &Op = MI->getOperand(i+1);
-
      if (!(Op.isReg() && Op.isImplicit()))
        MIB.addOperand(Op);
    }
-
-  BuildMI(BB, dl, TII->get(X86::MOVAPSrr), MI->getOperand(0).getReg())
+  BuildMI(*BB, MI, dl, TII->get(X86::MOVAPSrr), MI->getOperand(0).getReg())
      .addReg(X86::XMM0);
  
    MI->eraseFromParent();
+  return BB;
+}
  
+MachineBasicBlock *
+X86TargetLowering::EmitMonitor(MachineInstr *MI, MachineBasicBlock *BB) const {
+  DebugLoc dl = MI->getDebugLoc();
+  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  // Expand the pseudo MI: build its replacement sequence in BB, then erase it.
+  // Address into RAX/EAX (computed with LEA), other two args into ECX, EDX.
+  unsigned MemOpc = Subtarget->is64Bit() ? X86::LEA64r : X86::LEA32r;
+  unsigned MemReg = Subtarget->is64Bit() ? X86::RAX : X86::EAX;
+  MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(MemOpc), MemReg);
+  for (int i = 0; i < X86::AddrNumOperands; ++i)
+    MIB.addOperand(MI->getOperand(i));
+  // The two value operands come right after the address operands of MI.
+  unsigned ValOps = X86::AddrNumOperands;
+  BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::ECX)
+    .addReg(MI->getOperand(ValOps).getReg());
+  BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EDX)
+    .addReg(MI->getOperand(ValOps+1).getReg());
+
+  // Inputs now sit in fixed registers; MONITORrrr is built with no operands.
+  BuildMI(*BB, MI, dl, TII->get(X86::MONITORrrr));
+  
+  MI->eraseFromParent(); // The pseudo is gone now.
+  return BB; // Same block is returned; no new basic block is created here.
+}
+
+MachineBasicBlock *
+X86TargetLowering::EmitMwait(MachineInstr *MI, MachineBasicBlock *BB) const {
+  DebugLoc dl = MI->getDebugLoc();
+  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  // Expand the pseudo MI: build its replacement sequence in BB, then erase it.
+  // First arg (operand 0) in ECX, the second (operand 1) in EAX.
+  BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::ECX)
+    .addReg(MI->getOperand(0).getReg());
+  BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EAX)
+    .addReg(MI->getOperand(1).getReg());
+    
+  // Inputs now sit in fixed registers; MWAITrr is built with no operands.
+  BuildMI(*BB, MI, dl, TII->get(X86::MWAITrr));
+  
+  MI->eraseFromParent(); // The pseudo is gone now.
    return BB;
  }
  
@@ -10075,6 +10105,12 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
    case X86::VPCMPESTRM128MEM:
      return EmitPCMP(MI, BB, 5, true /* in mem */);
  
+    // Thread synchronization.
+  case X86::MONITOR:
+    return EmitMonitor(MI, BB);  
+  case X86::MWAIT:
+    return EmitMwait(MI, BB);
+
      // Atomic Lowering.
    case X86::ATOMAND32:
      return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND32rr,
@@ -11210,13 +11246,13 @@ static SDValue PerformBTCombine(SDNode *N,
  
  static SDValue PerformVZEXT_MOVLCombine(SDNode *N, SelectionDAG &DAG) {
    SDValue Op = N->getOperand(0);
-  if (Op.getOpcode() == ISD::BIT_CONVERT)
+  if (Op.getOpcode() == ISD::BITCAST)  // Look through a bitcast on the operand.
      Op = Op.getOperand(0);
    EVT VT = N->getValueType(0), OpVT = Op.getValueType();
    if (Op.getOpcode() == X86ISD::VZEXT_LOAD &&
        VT.getVectorElementType().getSizeInBits() ==
        OpVT.getVectorElementType().getSizeInBits()) {
-    return DAG.getNode(ISD::BIT_CONVERT, N->getDebugLoc(), VT, Op);
+    return DAG.getNode(ISD::BITCAST, N->getDebugLoc(), VT, Op);  // VZEXT_LOAD with matching element size: return it bitcast to N's type.
    }
    return SDValue();
  }
@@ -11428,13 +11464,13 @@ static bool LowerToBSwap(CallInst *CI) {
  
  bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
    InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue());
-  std::vector<InlineAsm::ConstraintInfo> Constraints = IA->ParseConstraints();
+  InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints();
  
    std::string AsmStr = IA->getAsmString();
  
    // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a"
    SmallVector<StringRef, 4> AsmPieces;
-  SplitString(AsmStr, AsmPieces, "\n");  // ; as separator?
+  SplitString(AsmStr, AsmPieces, ";\n");
  
    switch (AsmPieces.size()) {
    default: return false;
@@ -11475,6 +11511,35 @@ bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
      }
      break;
    case 3:
+    if (CI->getType()->isIntegerTy(32) &&
+        IA->getConstraintString().compare(0, 5, "=r,0,") == 0) {
+      SmallVector<StringRef, 4> Words;
+      SplitString(AsmPieces[0], Words, " \t,");
+      if (Words.size() == 3 && Words[0] == "rorw" && Words[1] == "$$8" &&
+          Words[2] == "${0:w}") {
+        Words.clear();
+        SplitString(AsmPieces[1], Words, " \t,");
+        if (Words.size() == 3 && Words[0] == "rorl" && Words[1] == "$$16" &&
+            Words[2] == "$0") {
+          Words.clear();
+          SplitString(AsmPieces[2], Words, " \t,");
+          if (Words.size() == 3 && Words[0] == "rorw" && Words[1] == "$$8" &&
+              Words[2] == "${0:w}") {
+            AsmPieces.clear();
+            const std::string &Constraints = IA->getConstraintString();
+            SplitString(StringRef(Constraints).substr(5), AsmPieces, ",");
+            std::sort(AsmPieces.begin(), AsmPieces.end());
+            if (AsmPieces.size() == 4 &&
+                AsmPieces[0] == "~{cc}" &&
+                AsmPieces[1] == "~{dirflag}" &&
+                AsmPieces[2] == "~{flags}" &&
+                AsmPieces[3] == "~{fpsr}") {
+              return LowerToBSwap(CI);
+            }
+          }
+        }
+      }
+    }
      if (CI->getType()->isIntegerTy(64) &&
          Constraints.size() >= 2 &&
          Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" &&
@@ -11508,18 +11573,32 @@ X86TargetLowering::ConstraintType
  X86TargetLowering::getConstraintType(const std::string &Constraint) const {
    if (Constraint.size() == 1) {
      switch (Constraint[0]) {
-    case 'A':
-      return C_Register;
-    case 'f':
-    case 'r':
      case 'R':
-    case 'l':
      case 'q':
      case 'Q':
-    case 'x':
+    case 'f':
+    case 't':
+    case 'u':
      case 'y':
+    case 'x':
      case 'Y':
        return C_RegisterClass;
+    case 'a':
+    case 'b':
+    case 'c':
+    case 'd':
+    case 'S':
+    case 'D':
+    case 'A':
+      return C_Register;
+    case 'I':
+    case 'J':
+    case 'K':
+    case 'L':
+    case 'M':
+    case 'N':
+    case 'G':
+    case 'C':
      case 'e':
      case 'Z':
        return C_Other;
@@ -11530,30 +11609,106 @@ X86TargetLowering::getConstraintType(const std::string &Constraint) const {
    return TargetLowering::getConstraintType(Constraint);
  }
  
-/// Examine constraint type and operand type and determine a weight value,
-/// where: -1 = invalid match, and 0 = so-so match to 3 = good match.
+/// Examine constraint type and operand type and determine a weight value.
  /// This object must already have been set up with the operand type
  /// and the current alternative constraint selected.
-int X86TargetLowering::getSingleConstraintMatchWeight(
+TargetLowering::ConstraintWeight
+  X86TargetLowering::getSingleConstraintMatchWeight(
      AsmOperandInfo &info, const char *constraint) const {
-  int weight = -1;
+  ConstraintWeight weight = CW_Invalid;  // Stays CW_Invalid unless a case below accepts the operand.
    Value *CallOperandVal = info.CallOperandVal;
      // If we don't have a value, we can't do a match,
      // but allow it at the lowest weight.
    if (CallOperandVal == NULL)
-    return 0;
+    return CW_Default;
+  const Type *type = CallOperandVal->getType();
    // Look at the constraint type.
    switch (*constraint) {
    default:
-    return TargetLowering::getSingleConstraintMatchWeight(info, constraint);
+    weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);  // NOTE(review): no break here, so control falls through into the integer-register cases below and an integer operand overrides the base-class weight - confirm this is intended.
+  case 'R':
+  case 'q':
+  case 'Q':
+  case 'a':
+  case 'b':
+  case 'c':
+  case 'd':
+  case 'S':
+  case 'D':
+  case 'A':  // All of the above: CW_SpecificReg when the operand is an integer.
+    if (CallOperandVal->getType()->isIntegerTy())
+      weight = CW_SpecificReg;
+    break;
+  case 'f':
+  case 't':
+  case 'u':  // 'f', 't', 'u': CW_SpecificReg when the operand is floating point.
+      if (type->isFloatingPointTy())
+        weight = CW_SpecificReg;
+      break;
+  case 'y':  // Requires an x86_mmx operand with MMX available and not disabled.
+      if (type->isX86_MMXTy() && !DisableMMX && Subtarget->hasMMX())
+        weight = CW_SpecificReg;
+      break;
+  case 'x':
+  case 'Y':  // 'x', 'Y': CW_Register for a 128-bit operand when SSE1 is available.
+    if ((type->getPrimitiveSizeInBits() == 128) && Subtarget->hasSSE1())
+      weight = CW_Register;
      break;
    case 'I':
      if (ConstantInt *C = dyn_cast<ConstantInt>(info.CallOperandVal)) {
        if (C->getZExtValue() <= 31)
-        weight = 3;
+        weight = CW_Constant;  // Integer constant whose zero-extended value is <= 31.
+    }
+    break;
+  case 'J':  // Integer constant whose zero-extended value is <= 63.
+    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
+      if (C->getZExtValue() <= 63)
+        weight = CW_Constant;
+    }
+    break;
+  case 'K':  // Integer constant whose sign-extended value is in [-0x80, 0x7f].
+    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
+      if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f))
+        weight = CW_Constant;
+    }
+    break;
+  case 'L':  // Integer constant equal to 0xff or 0xffff.
+    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
+      if ((C->getZExtValue() == 0xff) || (C->getZExtValue() == 0xffff))
+        weight = CW_Constant;
+    }
+    break;
+  case 'M':  // Integer constant whose zero-extended value is <= 3.
+    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
+      if (C->getZExtValue() <= 3)
+        weight = CW_Constant;
+    }
+    break;
+  case 'N':  // Integer constant whose zero-extended value is <= 0xff.
+    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
+      if (C->getZExtValue() <= 0xff)
+        weight = CW_Constant;
+    }
+    break;
+  case 'G':
+  case 'C':  // 'G', 'C': any ConstantFP is accepted; its value is not inspected here.
+    if (dyn_cast<ConstantFP>(CallOperandVal)) {
+      weight = CW_Constant;
+    }
+    break;
+  case 'e':  // Integer constant whose sign-extended value fits in 32 bits.
+    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
+      if ((C->getSExtValue() >= -0x80000000LL) &&
+          (C->getSExtValue() <= 0x7fffffffLL))
+        weight = CW_Constant;
+    }
+    break;
+  case 'Z':  // Integer constant whose zero-extended value is <= 0xffffffff.
+    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
+      if (C->getZExtValue() <= 0xffffffff)
+        weight = CW_Constant;
      }
      break;
-  // etc.
    }
    return weight;
  }