X86InstrInfoSSE.td declares PINSRW as having type v8i16,

[oota-llvm.git] / lib / Target / X86 / X86ISelLowering.cpp
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp

index 6f710ecbe15f45b1d2fc3a2de009746a4613e128..972773176043d902a67eef720b8da660298070ce 100644 (file)
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -16,6 +16,7 @@
  #include "X86.h"
  #include "X86InstrBuilder.h"
  #include "X86ISelLowering.h"
+#include "X86MCTargetExpr.h"
  #include "X86TargetMachine.h"
  #include "X86TargetObjectFile.h"
  #include "llvm/CallingConv.h"
@@ -36,7 +37,6 @@
  #include "llvm/CodeGen/PseudoSourceValue.h"
  #include "llvm/MC/MCAsmInfo.h"
  #include "llvm/MC/MCContext.h"
-#include "llvm/MC/MCExpr.h"
  #include "llvm/MC/MCSymbol.h"
  #include "llvm/ADT/BitVector.h"
  #include "llvm/ADT/SmallSet.h"
@@ -73,15 +73,16 @@ static TargetLoweringObjectFile *createTLOF(X86TargetMachine &TM) {
    case X86Subtarget::isDarwin:
      if (TM.getSubtarget<X86Subtarget>().is64Bit())
        return new X8664_MachoTargetObjectFile();
-    return new X8632_MachoTargetObjectFile();
+    return new TargetLoweringObjectFileMachO();
    case X86Subtarget::isELF:
-    return new TargetLoweringObjectFileELF();
+   if (TM.getSubtarget<X86Subtarget>().is64Bit())
+     return new X8664_ELFTargetObjectFile(TM);
+    return new X8632_ELFTargetObjectFile(TM);
    case X86Subtarget::isMingw:
    case X86Subtarget::isCygwin:
    case X86Subtarget::isWindows:
      return new TargetLoweringObjectFileCOFF();
    }
-
  }
  
  X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
@@ -989,6 +990,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
    setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
    setTargetDAGCombine(ISD::BUILD_VECTOR);
    setTargetDAGCombine(ISD::SELECT);
+  setTargetDAGCombine(ISD::AND);
    setTargetDAGCombine(ISD::SHL);
    setTargetDAGCombine(ISD::SRA);
    setTargetDAGCombine(ISD::SRL);
@@ -1001,19 +1003,6 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
  
    computeRegisterProperties();
  
-  // Divide and reminder operations have no vector equivalent and can
-  // trap. Do a custom widening for these operations in which we never
-  // generate more divides/remainder than the original vector width.
-  for (unsigned VT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
-       VT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++VT) {
-    if (!isTypeLegal((MVT::SimpleValueType)VT)) {
-      setOperationAction(ISD::SDIV, (MVT::SimpleValueType) VT, Custom);
-      setOperationAction(ISD::UDIV, (MVT::SimpleValueType) VT, Custom);
-      setOperationAction(ISD::SREM, (MVT::SimpleValueType) VT, Custom);
-      setOperationAction(ISD::UREM, (MVT::SimpleValueType) VT, Custom);
-    }
-  }
-
    // FIXME: These should be based on subtarget info. Plus, the values should
    // be smaller when we are in optimizing for size mode.
    maxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
@@ -1130,10 +1119,8 @@ X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
           Subtarget->isPICStyleGOT());
    // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
    // entries.
-
-  // FIXME: @GOTOFF should be a property of MCSymbolRefExpr not in the MCSymbol.
-  std::string Name = MBB->getSymbol(Ctx)->getName() + "@GOTOFF";
-  return MCSymbolRefExpr::Create(Ctx.GetOrCreateSymbol(StringRef(Name)), Ctx);
+  return X86MCTargetExpr::Create(MBB->getSymbol(Ctx),
+                                 X86MCTargetExpr::GOTOFF, Ctx);
  }
  
  /// getPICJumpTableRelocaBase - Returns relocation base for the given PIC
@@ -1195,13 +1182,11 @@ X86TargetLowering::LowerReturn(SDValue Chain,
                   RVLocs, *DAG.getContext());
    CCInfo.AnalyzeReturn(Outs, RetCC_X86);
  
-  // If this is the first return lowered for this function, add the regs to the
-  // liveout set for the function.
-  if (DAG.getMachineFunction().getRegInfo().liveout_empty()) {
-    for (unsigned i = 0; i != RVLocs.size(); ++i)
-      if (RVLocs[i].isRegLoc())
-        DAG.getMachineFunction().getRegInfo().addLiveOut(RVLocs[i].getLocReg());
-  }
+  // Add the regs to the liveout set for the function.
+  MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
+  for (unsigned i = 0; i != RVLocs.size(); ++i)
+    if (RVLocs[i].isRegLoc() && !MRI.isLiveOut(RVLocs[i].getLocReg()))
+      MRI.addLiveOut(RVLocs[i].getLocReg());
  
    SDValue Flag;
  
@@ -1254,7 +1239,7 @@ X86TargetLowering::LowerReturn(SDValue Chain,
      X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
      unsigned Reg = FuncInfo->getSRetReturnReg();
      if (!Reg) {
-      Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(MVT::i64));
+      Reg = MRI.createVirtualRegister(getRegClassFor(MVT::i64));
        FuncInfo->setSRetReturnReg(Reg);
      }
      SDValue Val = DAG.getCopyFromReg(Chain, dl, Reg, getPointerTy());
@@ -1263,7 +1248,7 @@ X86TargetLowering::LowerReturn(SDValue Chain,
      Flag = Chain.getValue(1);
  
      // RAX now acts like a return value.
-    MF.getRegInfo().addLiveOut(X86::RAX);
+    MRI.addLiveOut(X86::RAX);
    }
  
    RetOps[0] = Chain;  // Update chain.
@@ -1393,7 +1378,7 @@ bool X86TargetLowering::IsCalleePop(bool IsVarArg, CallingConv::ID CallingConv){
    case CallingConv::X86_FastCall:
      return !Subtarget->is64Bit();
    case CallingConv::Fast:
-    return PerformTailCallOpt;
+    return GuaranteedTailCallOpt;
    }
  }
  
@@ -1415,18 +1400,6 @@ CCAssignFn *X86TargetLowering::CCAssignFnForNode(CallingConv::ID CC) const {
      return CC_X86_32_C;
  }
  
-/// NameDecorationForCallConv - Selects the appropriate decoration to
-/// apply to a MachineFunction containing a given calling convention.
-NameDecorationStyle
-X86TargetLowering::NameDecorationForCallConv(CallingConv::ID CallConv) {
-  if (CallConv == CallingConv::X86_FastCall)
-    return FastCall;
-  else if (CallConv == CallingConv::X86_StdCall)
-    return StdCall;
-  return None;
-}
-
-
  /// CreateCopyOfByValArgument - Make a copy of an aggregate at address specified
  /// by "Src" to address "Dst" with size and alignment information specified by
  /// the specific parameter attribute. The copy will be passed as a byval
@@ -1443,7 +1416,7 @@ CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain,
  /// FuncIsMadeTailCallSafe - Return true if the function is being made into
  /// a tailcall target by changing its ABI.
  static bool FuncIsMadeTailCallSafe(CallingConv::ID CC) {
-  return PerformTailCallOpt && CC == CallingConv::Fast;
+  return GuaranteedTailCallOpt && CC == CallingConv::Fast;
  }
  
  SDValue
@@ -1480,7 +1453,8 @@ X86TargetLowering::LowerMemArgument(SDValue Chain,
                                      VA.getLocMemOffset(), isImmutable, false);
      SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
      return DAG.getLoad(ValVT, dl, Chain, FIN,
-                       PseudoSourceValue::getFixedStack(FI), 0);
+                       PseudoSourceValue::getFixedStack(FI), 0,
+                       false, false, 0);
    }
  }
  
@@ -1502,9 +1476,6 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain,
        Fn->getName() == "main")
      FuncInfo->setForceFramePointer(true);
  
-  // Decorate the function name.
-  FuncInfo->setDecorationStyle(NameDecorationForCallConv(CallConv));
-
    MachineFrameInfo *MFI = MF.getFrameInfo();
    bool Is64Bit = Subtarget->is64Bit();
    bool IsWin64 = Subtarget->isTargetWin64();
@@ -1577,7 +1548,8 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain,
  
      // If value is passed via pointer - do a load.
      if (VA.getLocInfo() == CCValAssign::Indirect)
-      ArgValue = DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue, NULL, 0);
+      ArgValue = DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue, NULL, 0,
+                             false, false, 0);
  
      InVals.push_back(ArgValue);
    }
@@ -1672,7 +1644,7 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain,
          SDValue Store =
            DAG.getStore(Val.getValue(1), dl, Val, FIN,
                         PseudoSourceValue::getFixedStack(RegSaveFrameIndex),
-                       Offset);
+                       Offset, false, false, 0);
          MemOps.push_back(Store);
          Offset += 8;
        }
@@ -1741,7 +1713,8 @@ X86TargetLowering::LowerMemOpCallTo(SDValue Chain,
      return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl);
    }
    return DAG.getStore(Chain, dl, Arg, PtrOff,
-                      PseudoSourceValue::getStack(), LocMemOffset);
+                      PseudoSourceValue::getStack(), LocMemOffset,
+                      false, false, 0);
  }
  
  /// EmitTailCallLoadRetAddr - Emit a load of return address if tail call
@@ -1751,14 +1724,12 @@ X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG,
                                             SDValue &OutRetAddr, SDValue Chain,
                                             bool IsTailCall, bool Is64Bit,
                                             int FPDiff, DebugLoc dl) {
-  if (!IsTailCall || FPDiff==0) return Chain;
-
    // Adjust the Return address stack slot.
    EVT VT = getPointerTy();
    OutRetAddr = getReturnAddressFrameIndex(DAG);
  
    // Load the "old" Return address.
-  OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, NULL, 0);
+  OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, NULL, 0, false, false, 0);
    return SDValue(OutRetAddr.getNode(), 1);
  }
  
@@ -1773,11 +1744,12 @@ EmitTailCallStoreRetAddr(SelectionDAG & DAG, MachineFunction &MF,
    // Calculate the new stack slot for the return address.
    int SlotSize = Is64Bit ? 8 : 4;
    int NewReturnAddrFI =
-    MF.getFrameInfo()->CreateFixedObject(SlotSize, FPDiff-SlotSize, true,false);
+    MF.getFrameInfo()->CreateFixedObject(SlotSize, FPDiff-SlotSize, false, false);
    EVT VT = Is64Bit ? MVT::i64 : MVT::i32;
    SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, VT);
    Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,
-                       PseudoSourceValue::getFixedStack(NewReturnAddrFI), 0);
+                       PseudoSourceValue::getFixedStack(NewReturnAddrFI), 0,
+                       false, false, 0);
    return Chain;
  }
  
@@ -1792,12 +1764,22 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee,
    MachineFunction &MF = DAG.getMachineFunction();
    bool Is64Bit        = Subtarget->is64Bit();
    bool IsStructRet    = CallIsStructReturn(Outs);
+  bool IsSibcall      = false;
  
-  if (isTailCall)
+  if (isTailCall) {
      // Check if it's really possible to do a tail call.
      isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv, isVarArg,
                                                     Outs, Ins, DAG);
  
+    // Sibcalls are automatically detected tailcalls which do not require
+    // ABI changes.
+    if (!GuaranteedTailCallOpt && isTailCall)
+      IsSibcall = true;
+
+    if (isTailCall)
+      ++NumTailCalls;
+  }
+
    assert(!(isVarArg && CallConv == CallingConv::Fast) &&
           "Var args not supported with calling convention fastcc");
  
@@ -1809,17 +1791,15 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee,
  
    // Get a count of how many bytes are to be pushed on the stack.
    unsigned NumBytes = CCInfo.getNextStackOffset();
-  if (FuncIsMadeTailCallSafe(CallConv))
-    NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
-  else if (isTailCall && !PerformTailCallOpt)
+  if (IsSibcall)
      // This is a sibcall. The memory operands are available in caller's
      // own caller's stack.
      NumBytes = 0;
+  else if (GuaranteedTailCallOpt && CallConv == CallingConv::Fast)
+    NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
  
    int FPDiff = 0;
-  if (isTailCall) {
-    ++NumTailCalls;
-
+  if (isTailCall && !IsSibcall) {
      // Lower arguments at fp - stackoffset + fpdiff.
      unsigned NumBytesCallerPushed =
        MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn();
@@ -1831,12 +1811,14 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee,
        MF.getInfo<X86MachineFunctionInfo>()->setTCReturnAddrDelta(FPDiff);
    }
  
-  Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true));
+  if (!IsSibcall)
+    Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true));
  
    SDValue RetAddrFrIdx;
    // Load return address for tail calls.
-  Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall, Is64Bit,
-                                  FPDiff, dl);
+  if (isTailCall && FPDiff)
+    Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
+                                    Is64Bit, FPDiff, dl);
  
    SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
    SmallVector<SDValue, 8> MemOpChains;
@@ -1878,7 +1860,8 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee,
        SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
        int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
        Chain = DAG.getStore(Chain, dl, Arg, SpillSlot,
-                           PseudoSourceValue::getFixedStack(FI), 0);
+                           PseudoSourceValue::getFixedStack(FI), 0,
+                           false, false, 0);
        Arg = SpillSlot;
        break;
      }
@@ -1886,15 +1869,12 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee,
  
      if (VA.isRegLoc()) {
        RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
-    } else {
-      if (!isTailCall || (isTailCall && isByVal)) {
-        assert(VA.isMemLoc());
-        if (StackPtr.getNode() == 0)
-          StackPtr = DAG.getCopyFromReg(Chain, dl, X86StackPtr, getPointerTy());
-
-        MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
-                                               dl, DAG, VA, Flags));
-      }
+    } else if (!IsSibcall && (!isTailCall || isByVal)) {
+      assert(VA.isMemLoc());
+      if (StackPtr.getNode() == 0)
+        StackPtr = DAG.getCopyFromReg(Chain, dl, X86StackPtr, getPointerTy());
+      MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
+                                             dl, DAG, VA, Flags));
      }
    }
  
@@ -1914,7 +1894,6 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee,
        InFlag = Chain.getValue(1);
      }
  
-
    if (Subtarget->isPICStyleGOT()) {
      // ELF / PIC requires GOT in the EBX register before function calls via PLT
      // GOT pointer.
@@ -1984,7 +1963,7 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee,
      int FI = 0;
      // Do not flag preceding copytoreg stuff together with the following stuff.
      InFlag = SDValue();
-    if (PerformTailCallOpt) {
+    if (GuaranteedTailCallOpt) {
        for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
          CCValAssign &VA = ArgLocs[i];
          if (VA.isRegLoc())
@@ -2013,7 +1992,8 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee,
            // Store relative to framepointer.
            MemOpChains2.push_back(
              DAG.getStore(ArgChain, dl, Arg, FIN,
-                         PseudoSourceValue::getFixedStack(FI), 0));
+                         PseudoSourceValue::getFixedStack(FI), 0,
+                         false, false, 0));
          }
        }
      }
@@ -2096,21 +2076,22 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee,
    }
  
    if (isTailCall && !WasGlobalOrExternal) {
-    unsigned Opc = Is64Bit ? X86::R11 : X86::EAX;
-
+    // Force the address into a (call preserved) caller-saved register since
+    // tailcall must happen after callee-saved registers are popped.
+    // FIXME: Give it a special register class that contains caller-saved
+    // register instead?
+    unsigned TCReg = Is64Bit ? X86::R11 : X86::EAX;
      Chain = DAG.getCopyToReg(Chain,  dl,
-                             DAG.getRegister(Opc, getPointerTy()),
+                             DAG.getRegister(TCReg, getPointerTy()),
                               Callee,InFlag);
-    Callee = DAG.getRegister(Opc, getPointerTy());
-    // Add register as live out.
-    MF.getRegInfo().addLiveOut(Opc);
+    Callee = DAG.getRegister(TCReg, getPointerTy());
    }
  
    // Returns a chain & a flag for retval copy to use.
    SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Flag);
    SmallVector<SDValue, 8> Ops;
  
-  if (isTailCall) {
+  if (!IsSibcall && isTailCall) {
      Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true),
                             DAG.getIntPtrConstant(0, true), InFlag);
      InFlag = Chain.getValue(1);
@@ -2171,7 +2152,7 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee,
    if (IsCalleePop(isVarArg, CallConv))
      NumBytesForCalleeToPush = NumBytes;    // Callee pops everything
    else if (!Is64Bit && CallConv != CallingConv::Fast && IsStructRet)
-    // If this is is a call to a struct-return function, the callee
+    // If this is a call to a struct-return function, the callee
      // pops the hidden struct pointer, so we have to push it back.
      // This is common for Darwin/X86, Linux & Mingw32 targets.
      NumBytesForCalleeToPush = 4;
@@ -2179,12 +2160,14 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee,
      NumBytesForCalleeToPush = 0;  // Callee pops nothing.
  
    // Returns a flag for retval copy to use.
-  Chain = DAG.getCALLSEQ_END(Chain,
-                             DAG.getIntPtrConstant(NumBytes, true),
-                             DAG.getIntPtrConstant(NumBytesForCalleeToPush,
-                                                   true),
-                             InFlag);
-  InFlag = Chain.getValue(1);
+  if (!IsSibcall) {
+    Chain = DAG.getCALLSEQ_END(Chain,
+                               DAG.getIntPtrConstant(NumBytes, true),
+                               DAG.getIntPtrConstant(NumBytesForCalleeToPush,
+                                                     true),
+                               InFlag);
+    InFlag = Chain.getValue(1);
+  }
  
    // Handle result values, copying them out of physregs into vregs that we
    // return.
@@ -2246,6 +2229,50 @@ unsigned X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize,
    return Offset;
  }
  
+/// MatchingStackOffset - Return true if the given stack call argument is
+/// already available in the same position (relatively) of the caller's
+/// incoming argument stack, i.e. it resolves to a fixed frame object at Offset.
+static
+bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
+                         MachineFrameInfo *MFI, const MachineRegisterInfo *MRI,
+                         const X86InstrInfo *TII) {
+  int FI;  // Frame index the argument is traced back to.
+  if (Arg.getOpcode() == ISD::CopyFromReg) {  // Case 1: register copy.
+    unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
+    if (!VR || TargetRegisterInfo::isPhysicalRegister(VR))
+      return false;  // Only a virtual register is traced to its def below.
+    MachineInstr *Def = MRI->getVRegDef(VR);  // Instruction defining VR.
+    if (!Def)
+      return false;
+    if (!Flags.isByVal()) {  // Non-byval: def must load from a stack slot.
+      if (!TII->isLoadFromStackSlot(Def, FI))  // FI comes from this call.
+        return false;
+    } else {  // Byval: def must be an LEA whose operand 1 is a frame index.
+      unsigned Opcode = Def->getOpcode();
+      if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r) &&
+          Def->getOperand(1).isFI()) {
+        FI = Def->getOperand(1).getIndex();
+        if (MFI->getObjectSize(FI) != Flags.getByValSize())
+          return false;  // Frame object size must equal the byval size.
+      } else
+        return false;
+    }
+  } else {  // Case 2: Arg must be a load whose base pointer is a frame index.
+    LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg);
+    if (!Ld)
+      return false;
+    SDValue Ptr = Ld->getBasePtr();
+    FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
+    if (!FINode)
+      return false;
+    FI = FINode->getIndex();
+  }
+
+  if (!MFI->isFixedObjectIndex(FI))
+    return false;  // Only fixed frame objects qualify.
+  return Offset == MFI->getObjectOffset(FI);  // Must sit exactly at Offset.
+}
+
  /// IsEligibleForTailCallOptimization - Check whether the call is eligible
  /// for tail call optimization. Targets which want to do tail call
  /// optimization should implement this function.
@@ -2262,14 +2289,13 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
  
    // If -tailcallopt is specified, make fastcc functions tail-callable.
    const Function *CallerF = DAG.getMachineFunction().getFunction();
-  if (PerformTailCallOpt) {
+  if (GuaranteedTailCallOpt) {
      if (CalleeCC == CallingConv::Fast &&
          CallerF->getCallingConv() == CalleeCC)
        return true;
      return false;
    }
  
-
    // Look for obvious safe cases to perform tail call optimization that does not
    // require ABI changes. This is what gcc calls sibcall.
  
@@ -2297,49 +2323,26 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
        // Check if the arguments are already laid out in the right way as
        // the caller's fixed stack objects.
        MachineFrameInfo *MFI = MF.getFrameInfo();
+      const MachineRegisterInfo *MRI = &MF.getRegInfo();
+      const X86InstrInfo *TII =
+        ((X86TargetMachine&)getTargetMachine()).getInstrInfo();
        for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
          CCValAssign &VA = ArgLocs[i];
          EVT RegVT = VA.getLocVT();
          SDValue Arg = Outs[i].Val;
          ISD::ArgFlagsTy Flags = Outs[i].Flags;
-        if (Flags.isByVal())
-          return false; // TODO
          if (VA.getLocInfo() == CCValAssign::Indirect)
            return false;
          if (!VA.isRegLoc()) {
-          LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg);
-          if (!Ld)
-            return false;
-          SDValue Ptr = Ld->getBasePtr();
-          FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
-          if (!FINode)
-            return false;
-          int FI = FINode->getIndex();
-          if (!MFI->isFixedObjectIndex(FI))
-            return false;
-          if (VA.getLocMemOffset() != MFI->getObjectOffset(FI))
+          if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
+                                   MFI, MRI, TII))
              return false;
          }
        }
      }
    }
  
-  // If the caller does not return a value, then this is obviously safe.
-  // This is one case where it's safe to perform this optimization even
-  // if the return types do not match.
-  const Type *CallerRetTy = CallerF->getReturnType();
-  if (CallerRetTy->isVoidTy())
-    return true;
-
-  // If the return types match, then it's safe.
-  // Don't tail call optimize recursive call.
-  GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
-  if (!G) return false;  // FIXME: common external symbols?
-  if (const Function *CalleeF = dyn_cast<Function>(G->getGlobal())) {
-    const Type *CalleeRetTy = CalleeF->getReturnType();
-    return CallerRetTy == CalleeRetTy;
-  }
-  return false;
+  return true;
  }
  
  FastISel *
@@ -2374,7 +2377,7 @@ SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) {
      // Set up a frame object for the return address.
      uint64_t SlotSize = TD->getPointerSize();
      ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(SlotSize, -SlotSize,
-                                                           true, false);
+                                                           false, false);
      FuncInfo->setRAIndex(ReturnAddrIndex);
    }
  
@@ -3569,7 +3572,8 @@ X86TargetLowering::LowerAsSplatVectorLoad(SDValue SrcOp, EVT VT, DebugLoc dl,
      int EltNo = (Offset - StartOffset) >> 2;
      int Mask[4] = { EltNo, EltNo, EltNo, EltNo };
      EVT VT = (PVT == MVT::i32) ? MVT::v4i32 : MVT::v4f32;
-    SDValue V1 = DAG.getLoad(VT, dl, Chain, Ptr,LD->getSrcValue(),0);
+    SDValue V1 = DAG.getLoad(VT, dl, Chain, Ptr,LD->getSrcValue(),0,
+                             false, false, 0);
      // Canonicalize it to a v4i32 shuffle.
      V1 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v4i32, V1);
      return DAG.getNode(ISD::BIT_CONVERT, dl, VT,
@@ -4813,8 +4817,16 @@ X86TargetLowering::LowerINSERT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG){
  
    if ((EltVT.getSizeInBits() == 8 || EltVT.getSizeInBits() == 16) &&
        isa<ConstantSDNode>(N2)) {
-    unsigned Opc = (EltVT.getSizeInBits() == 8) ? X86ISD::PINSRB
-                                                : X86ISD::PINSRW;
+    unsigned Opc;
+    if (VT == MVT::v8i16)
+      Opc = X86ISD::PINSRW;
+    else if (VT == MVT::v4i16)
+      Opc = X86ISD::MMX_PINSRW;
+    else if (VT == MVT::v16i8)
+      Opc = X86ISD::PINSRB;
+    else
+      Opc = X86ISD::PINSRB;
+
      // Transform it so it match pinsr{b,w} which expects a GR32 as its second
      // argument.
      if (N1.getValueType() != MVT::i32)
@@ -4865,7 +4877,8 @@ X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) {
        N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
      if (N2.getValueType() != MVT::i32)
        N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue());
-    return DAG.getNode(X86ISD::PINSRW, dl, VT, N0, N1, N2);
+    return DAG.getNode(VT == MVT::v8i16 ? X86ISD::PINSRW : X86ISD::MMX_PINSRW,
+                       dl, VT, N0, N1, N2);
    }
    return SDValue();
  }
@@ -5068,7 +5081,7 @@ X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV, DebugLoc dl,
    // load.
    if (isGlobalStubReference(OpFlags))
      Result = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Result,
-                         PseudoSourceValue::getGOT(), 0);
+                         PseudoSourceValue::getGOT(), 0, false, false, 0);
  
    // If there was a non-zero offset that we didn't fold, create an explicit
    // addition for it.
@@ -5148,7 +5161,7 @@ static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
                                               MVT::i32));
  
    SDValue ThreadPointer = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Base,
-                                      NULL, 0);
+                                      NULL, 0, false, false, 0);
  
    unsigned char OperandFlags = 0;
    // Most TLS accesses are not RIP relative, even on x86-64.  One exception is
@@ -5173,7 +5186,7 @@ static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
  
    if (model == TLSModel::InitialExec)
      Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
-                         PseudoSourceValue::getGOT(), 0);
+                         PseudoSourceValue::getGOT(), 0, false, false, 0);
  
    // The address of the thread local variable is the add of the thread
    // pointer with the offset of the variable.
@@ -5241,7 +5254,7 @@ SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) {
  
    SDValue AndNode = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
                                  DAG.getConstant(VTBits, MVT::i8));
-  SDValue Cond = DAG.getNode(X86ISD::CMP, dl, VT,
+  SDValue Cond = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
                               AndNode, DAG.getConstant(0, MVT::i8));
  
    SDValue Hi, Lo;
@@ -5290,7 +5303,8 @@ SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) {
    SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
    SDValue Chain = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
                                 StackSlot,
-                               PseudoSourceValue::getFixedStack(SSFI), 0);
+                               PseudoSourceValue::getFixedStack(SSFI), 0,
+                               false, false, 0);
    return BuildFILD(Op, SrcVT, Chain, StackSlot, DAG);
  }
  
@@ -5325,7 +5339,8 @@ SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain,
      };
      Chain = DAG.getNode(X86ISD::FST, dl, Tys, Ops, array_lengthof(Ops));
      Result = DAG.getLoad(Op.getValueType(), dl, Chain, StackSlot,
-                         PseudoSourceValue::getFixedStack(SSFI), 0);
+                         PseudoSourceValue::getFixedStack(SSFI), 0,
+                         false, false, 0);
    }
  
    return Result;
@@ -5398,12 +5413,12 @@ SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG) {
    SDValue Unpck1 = getUnpackl(DAG, dl, MVT::v4i32, XR1, XR2);
    SDValue CLod0 = DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
                                PseudoSourceValue::getConstantPool(), 0,
-                              false, 16);
+                              false, false, 16);
    SDValue Unpck2 = getUnpackl(DAG, dl, MVT::v4i32, Unpck1, CLod0);
    SDValue XR2F = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2f64, Unpck2);
    SDValue CLod1 = DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
                                PseudoSourceValue::getConstantPool(), 0,
-                              false, 16);
+                              false, false, 16);
    SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
  
    // Add the halves; easiest way is to swap them into another reg first.
@@ -5490,9 +5505,9 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) {
    SDValue OffsetSlot = DAG.getNode(ISD::ADD, dl,
                                     getPointerTy(), StackSlot, WordOff);
    SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
-                                StackSlot, NULL, 0);
+                                StackSlot, NULL, 0, false, false, 0);
    SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, MVT::i32),
-                                OffsetSlot, NULL, 0);
+                                OffsetSlot, NULL, 0, false, false, 0);
    return BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG);
  }
  
@@ -5540,7 +5555,8 @@ FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, bool IsSigned) {
    if (isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType())) {
      assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
      Chain = DAG.getStore(Chain, dl, Value, StackSlot,
-                         PseudoSourceValue::getFixedStack(SSFI), 0);
+                         PseudoSourceValue::getFixedStack(SSFI), 0,
+                         false, false, 0);
      SDVTList Tys = DAG.getVTList(Op.getOperand(0).getValueType(), MVT::Other);
      SDValue Ops[] = {
        Chain, StackSlot, DAG.getValueType(Op.getOperand(0).getValueType())
@@ -5574,7 +5590,7 @@ SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) {
  
    // Load the result.
    return DAG.getLoad(Op.getValueType(), Op.getDebugLoc(),
-                     FIST, StackSlot, NULL, 0);
+                     FIST, StackSlot, NULL, 0, false, false, 0);
  }
  
  SDValue X86TargetLowering::LowerFP_TO_UINT(SDValue Op, SelectionDAG &DAG) {
@@ -5584,7 +5600,7 @@ SDValue X86TargetLowering::LowerFP_TO_UINT(SDValue Op, SelectionDAG &DAG) {
  
    // Load the result.
    return DAG.getLoad(Op.getValueType(), Op.getDebugLoc(),
-                     FIST, StackSlot, NULL, 0);
+                     FIST, StackSlot, NULL, 0, false, false, 0);
  }
  
  SDValue X86TargetLowering::LowerFABS(SDValue Op, SelectionDAG &DAG) {
@@ -5609,8 +5625,8 @@ SDValue X86TargetLowering::LowerFABS(SDValue Op, SelectionDAG &DAG) {
    Constant *C = ConstantVector::get(CV);
    SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
    SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
-                               PseudoSourceValue::getConstantPool(), 0,
-                               false, 16);
+                             PseudoSourceValue::getConstantPool(), 0,
+                             false, false, 16);
    return DAG.getNode(X86ISD::FAND, dl, VT, Op.getOperand(0), Mask);
  }
  
@@ -5636,8 +5652,8 @@ SDValue X86TargetLowering::LowerFNEG(SDValue Op, SelectionDAG &DAG) {
    Constant *C = ConstantVector::get(CV);
    SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
    SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
-                               PseudoSourceValue::getConstantPool(), 0,
-                               false, 16);
+                             PseudoSourceValue::getConstantPool(), 0,
+                             false, false, 16);
    if (VT.isVector()) {
      return DAG.getNode(ISD::BIT_CONVERT, dl, VT,
                         DAG.getNode(ISD::XOR, dl, MVT::v2i64,
@@ -5685,8 +5701,8 @@ SDValue X86TargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
    Constant *C = ConstantVector::get(CV);
    SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
    SDValue Mask1 = DAG.getLoad(SrcVT, dl, DAG.getEntryNode(), CPIdx,
-                                PseudoSourceValue::getConstantPool(), 0,
-                                false, 16);
+                              PseudoSourceValue::getConstantPool(), 0,
+                              false, false, 16);
    SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, SrcVT, Op1, Mask1);
  
    // Shift sign bit right or left if the two operands have different types.
@@ -5714,8 +5730,8 @@ SDValue X86TargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
    C = ConstantVector::get(CV);
    CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
    SDValue Mask2 = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
-                                PseudoSourceValue::getConstantPool(), 0,
-                                false, 16);
+                              PseudoSourceValue::getConstantPool(), 0,
+                              false, false, 16);
    SDValue Val = DAG.getNode(X86ISD::FAND, dl, VT, Op0, Mask2);
  
    // Or the value with the sign bit.
@@ -6623,7 +6639,8 @@ SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) {
      // vastart just stores the address of the VarArgsFrameIndex slot into the
      // memory location argument.
      SDValue FR = DAG.getFrameIndex(VarArgsFrameIndex, getPointerTy());
-    return DAG.getStore(Op.getOperand(0), dl, FR, Op.getOperand(1), SV, 0);
+    return DAG.getStore(Op.getOperand(0), dl, FR, Op.getOperand(1), SV, 0,
+                        false, false, 0);
    }
  
    // __va_list_tag:
@@ -6635,8 +6652,8 @@ SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) {
    SDValue FIN = Op.getOperand(1);
    // Store gp_offset
    SDValue Store = DAG.getStore(Op.getOperand(0), dl,
-                                 DAG.getConstant(VarArgsGPOffset, MVT::i32),
-                                 FIN, SV, 0);
+                               DAG.getConstant(VarArgsGPOffset, MVT::i32),
+                               FIN, SV, 0, false, false, 0);
    MemOps.push_back(Store);
  
    // Store fp_offset
@@ -6644,21 +6661,23 @@ SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) {
                      FIN, DAG.getIntPtrConstant(4));
    Store = DAG.getStore(Op.getOperand(0), dl,
                         DAG.getConstant(VarArgsFPOffset, MVT::i32),
-                       FIN, SV, 0);
+                       FIN, SV, 0, false, false, 0);
    MemOps.push_back(Store);
  
    // Store ptr to overflow_arg_area
    FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(),
                      FIN, DAG.getIntPtrConstant(4));
    SDValue OVFIN = DAG.getFrameIndex(VarArgsFrameIndex, getPointerTy());
-  Store = DAG.getStore(Op.getOperand(0), dl, OVFIN, FIN, SV, 0);
+  Store = DAG.getStore(Op.getOperand(0), dl, OVFIN, FIN, SV, 0,
+                       false, false, 0);
    MemOps.push_back(Store);
  
    // Store ptr to reg_save_area.
    FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(),
                      FIN, DAG.getIntPtrConstant(8));
    SDValue RSFIN = DAG.getFrameIndex(RegSaveFrameIndex, getPointerTy());
-  Store = DAG.getStore(Op.getOperand(0), dl, RSFIN, FIN, SV, 0);
+  Store = DAG.getStore(Op.getOperand(0), dl, RSFIN, FIN, SV, 0,
+                       false, false, 0);
    MemOps.push_back(Store);
    return DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
                       &MemOps[0], MemOps.size());
@@ -6944,13 +6963,13 @@ SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) {
      return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(),
                         DAG.getNode(ISD::ADD, dl, getPointerTy(),
                                     FrameAddr, Offset),
-                       NULL, 0);
+                       NULL, 0, false, false, 0);
    }
  
    // Just load the return address.
    SDValue RetAddrFI = getReturnAddressFrameIndex(DAG);
    return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(),
-                     RetAddrFI, NULL, 0);
+                     RetAddrFI, NULL, 0, false, false, 0);
  }
  
  SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) {
@@ -6962,7 +6981,8 @@ SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) {
    unsigned FrameReg = Subtarget->is64Bit() ? X86::RBP : X86::EBP;
    SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
    while (Depth--)
-    FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr, NULL, 0);
+    FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr, NULL, 0,
+                            false, false, 0);
    return FrameAddr;
  }
  
@@ -6986,7 +7006,7 @@ SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG)
    SDValue StoreAddr = DAG.getNode(ISD::SUB, dl, getPointerTy(), Frame,
                                    DAG.getIntPtrConstant(-TD->getPointerSize()));
    StoreAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(), StoreAddr, Offset);
-  Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, NULL, 0);
+  Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, NULL, 0, false, false, 0);
    Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr);
    MF.getRegInfo().addLiveOut(StoreAddrReg);
  
@@ -7005,16 +7025,12 @@ SDValue X86TargetLowering::LowerTRAMPOLINE(SDValue Op,
  
    const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
  
-  const X86InstrInfo *TII =
-    ((X86TargetMachine&)getTargetMachine()).getInstrInfo();
-
    if (Subtarget->is64Bit()) {
      SDValue OutChains[6];
  
      // Large code-model.
-
-    const unsigned char JMP64r  = TII->getBaseOpcodeFor(X86::JMP64r);
-    const unsigned char MOV64ri = TII->getBaseOpcodeFor(X86::MOV64ri);
+    const unsigned char JMP64r  = 0xFF; // 64-bit jmp through register opcode.
+    const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode.
  
      const unsigned char N86R10 = RegInfo->getX86RegNum(X86::R10);
      const unsigned char N86R11 = RegInfo->getX86RegNum(X86::R11);
@@ -7025,11 +7041,12 @@ SDValue X86TargetLowering::LowerTRAMPOLINE(SDValue Op,
      unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11
      SDValue Addr = Trmp;
      OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16),
-                                Addr, TrmpAddr, 0);
+                                Addr, TrmpAddr, 0, false, false, 0);
  
      Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
                         DAG.getConstant(2, MVT::i64));
-    OutChains[1] = DAG.getStore(Root, dl, FPtr, Addr, TrmpAddr, 2, false, 2);
+    OutChains[1] = DAG.getStore(Root, dl, FPtr, Addr, TrmpAddr, 2,
+                                false, false, 2);
  
      // Load the 'nest' parameter value into R10.
      // R10 is specified in X86CallingConv.td
@@ -7037,24 +7054,25 @@ SDValue X86TargetLowering::LowerTRAMPOLINE(SDValue Op,
      Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
                         DAG.getConstant(10, MVT::i64));
      OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16),
-                                Addr, TrmpAddr, 10);
+                                Addr, TrmpAddr, 10, false, false, 0);
  
      Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
                         DAG.getConstant(12, MVT::i64));
-    OutChains[3] = DAG.getStore(Root, dl, Nest, Addr, TrmpAddr, 12, false, 2);
+    OutChains[3] = DAG.getStore(Root, dl, Nest, Addr, TrmpAddr, 12,
+                                false, false, 2);
  
      // Jump to the nested function.
      OpCode = (JMP64r << 8) | REX_WB; // jmpq *...
      Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
                         DAG.getConstant(20, MVT::i64));
      OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16),
-                                Addr, TrmpAddr, 20);
+                                Addr, TrmpAddr, 20, false, false, 0);
  
      unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11
      Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
                         DAG.getConstant(22, MVT::i64));
      OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, MVT::i8), Addr,
-                                TrmpAddr, 22);
+                                TrmpAddr, 22, false, false, 0);
  
      SDValue Ops[] =
        { Trmp, DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 6) };
@@ -7109,25 +7127,28 @@ SDValue X86TargetLowering::LowerTRAMPOLINE(SDValue Op,
                         DAG.getConstant(10, MVT::i32));
      Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr);
  
-    const unsigned char MOV32ri = TII->getBaseOpcodeFor(X86::MOV32ri);
+    // This is storing the opcode for MOV32ri.
+    const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte.
      const unsigned char N86Reg = RegInfo->getX86RegNum(NestReg);
      OutChains[0] = DAG.getStore(Root, dl,
                                  DAG.getConstant(MOV32ri|N86Reg, MVT::i8),
-                                Trmp, TrmpAddr, 0);
+                                Trmp, TrmpAddr, 0, false, false, 0);
  
      Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
                         DAG.getConstant(1, MVT::i32));
-    OutChains[1] = DAG.getStore(Root, dl, Nest, Addr, TrmpAddr, 1, false, 1);
+    OutChains[1] = DAG.getStore(Root, dl, Nest, Addr, TrmpAddr, 1,
+                                false, false, 1);
  
-    const unsigned char JMP = TII->getBaseOpcodeFor(X86::JMP);
+    const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode.
      Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
                         DAG.getConstant(5, MVT::i32));
      OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(JMP, MVT::i8), Addr,
-                                TrmpAddr, 5, false, 1);
+                                TrmpAddr, 5, false, false, 1);
  
      Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
                         DAG.getConstant(6, MVT::i32));
-    OutChains[3] = DAG.getStore(Root, dl, Disp, Addr, TrmpAddr, 6, false, 1);
+    OutChains[3] = DAG.getStore(Root, dl, Disp, Addr, TrmpAddr, 6,
+                                false, false, 1);
  
      SDValue Ops[] =
        { Trmp, DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 4) };
@@ -7170,7 +7191,8 @@ SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op, SelectionDAG &DAG) {
                                DAG.getEntryNode(), StackSlot);
  
    // Load FP Control Word from stack slot
-  SDValue CWD = DAG.getLoad(MVT::i16, dl, Chain, StackSlot, NULL, 0);
+  SDValue CWD = DAG.getLoad(MVT::i16, dl, Chain, StackSlot, NULL, 0,
+                            false, false, 0);
  
    // Transform as necessary
    SDValue CWD1 =
@@ -7534,7 +7556,8 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
      if (FIST.getNode() != 0) {
        EVT VT = N->getValueType(0);
        // Return a load from the stack slot.
-      Results.push_back(DAG.getLoad(VT, dl, FIST, StackSlot, NULL, 0));
+      Results.push_back(DAG.getLoad(VT, dl, FIST, StackSlot, NULL, 0,
+                                    false, false, 0));
      }
      return;
    }
@@ -7552,14 +7575,6 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
      Results.push_back(edx.getValue(1));
      return;
    }
-  case ISD::SDIV:
-  case ISD::UDIV:
-  case ISD::SREM:
-  case ISD::UREM: {
-    EVT WidenVT = getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
-    Results.push_back(DAG.UnrollVectorOp(N, WidenVT.getVectorNumElements()));
-    return;
-  }
    case ISD::ATOMIC_CMP_SWAP: {
      EVT T = N->getValueType(0);
      assert (T == MVT::i64 && "Only know how to expand i64 Cmp and Swap");
@@ -7657,6 +7672,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
    case X86ISD::INSERTPS:           return "X86ISD::INSERTPS";
    case X86ISD::PINSRB:             return "X86ISD::PINSRB";
    case X86ISD::PINSRW:             return "X86ISD::PINSRW";
+  case X86ISD::MMX_PINSRW:         return "X86ISD::MMX_PINSRW";
    case X86ISD::PSHUFB:             return "X86ISD::PSHUFB";
    case X86ISD::FMAX:               return "X86ISD::FMAX";
    case X86ISD::FMIN:               return "X86ISD::FMIN";
@@ -7758,7 +7774,7 @@ bool X86TargetLowering::isLegalAddressingMode(const AddrMode &AM,
  
  
  bool X86TargetLowering::isTruncateFree(const Type *Ty1, const Type *Ty2) const {
-  if (!Ty1->isInteger() || !Ty2->isInteger())
+  if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
      return false;
    unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
    unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
@@ -7779,7 +7795,7 @@ bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
  
  bool X86TargetLowering::isZExtFree(const Type *Ty1, const Type *Ty2) const {
    // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
-  return Ty1->isInteger(32) && Ty2->isInteger(64) && Subtarget->is64Bit();
+  return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget->is64Bit();
  }
  
  bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
@@ -7935,7 +7951,7 @@ X86TargetLowering::EmitAtomicBitwiseWithCustomInserter(MachineInstr *bInstr,
    MIB.addReg(EAXreg);
  
    // insert branch
-  BuildMI(newMBB, dl, TII->get(X86::JNE)).addMBB(newMBB);
+  BuildMI(newMBB, dl, TII->get(X86::JNE_4)).addMBB(newMBB);
  
    F->DeleteMachineInstr(bInstr);   // The pseudo instruction is gone now.
    return nextMBB;
@@ -8092,7 +8108,7 @@ X86TargetLowering::EmitAtomicBit6432WithCustomInserter(MachineInstr *bInstr,
    MIB.addReg(X86::EDX);
  
    // insert branch
-  BuildMI(newMBB, dl, TII->get(X86::JNE)).addMBB(newMBB);
+  BuildMI(newMBB, dl, TII->get(X86::JNE_4)).addMBB(newMBB);
  
    F->DeleteMachineInstr(bInstr);   // The pseudo instruction is gone now.
    return nextMBB;
@@ -8195,7 +8211,7 @@ X86TargetLowering::EmitAtomicMinMaxWithCustomInserter(MachineInstr *mInstr,
    MIB.addReg(X86::EAX);
  
    // insert branch
-  BuildMI(newMBB, dl, TII->get(X86::JNE)).addMBB(newMBB);
+  BuildMI(newMBB, dl, TII->get(X86::JNE_4)).addMBB(newMBB);
  
    F->DeleteMachineInstr(mInstr);   // The pseudo instruction is gone now.
    return nextMBB;
@@ -8277,7 +8293,7 @@ X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter(
    if (!Subtarget->isTargetWin64()) {
      // If %al is 0, branch around the XMM save block.
      BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg);
-    BuildMI(MBB, DL, TII->get(X86::JE)).addMBB(EndMBB);
+    BuildMI(MBB, DL, TII->get(X86::JE_4)).addMBB(EndMBB);
      MBB->addSuccessor(EndMBB);
    }
  
@@ -8763,10 +8779,11 @@ static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
      if (DAG.InferPtrAlignment(LD->getBasePtr()) >= 16)
        return DAG.getLoad(VT, dl, LD->getChain(), LD->getBasePtr(),
                           LD->getSrcValue(), LD->getSrcValueOffset(),
-                         LD->isVolatile());
+                         LD->isVolatile(), LD->isNonTemporal(), 0);
      return DAG.getLoad(VT, dl, LD->getChain(), LD->getBasePtr(),
                         LD->getSrcValue(), LD->getSrcValueOffset(),
-                       LD->isVolatile(), LD->getAlignment());
+                       LD->isVolatile(), LD->isNonTemporal(),
+                       LD->getAlignment());
    } else if (NumElems == 4 && LastLoadedElt == 1) {
      SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other);
      SDValue Ops[] = { LD->getChain(), LD->getBasePtr() };
@@ -8786,10 +8803,9 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
    SDValue RHS = N->getOperand(2);
  
    // If we have SSE[12] support, try to form min/max nodes. SSE min/max
-  // instructions have the peculiarity that if either operand is a NaN,
-  // they chose what we call the RHS operand (and as such are not symmetric).
-  // It happens that this matches the semantics of the common C idiom
-  // x<y?x:y and related forms, so we can recognize these cases.
+  // instructions match the semantics of the common C idiom x<y?x:y but not
+  // x<=y?x:y, because of how they handle negative zero (which can be
+  // ignored in unsafe-math mode).
    if (Subtarget->hasSSE2() &&
        (LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64) &&
        Cond.getOpcode() == ISD::SETCC) {
@@ -8801,33 +8817,14 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
        switch (CC) {
        default: break;
        case ISD::SETULT:
-        // This can be a min if we can prove that at least one of the operands
-        // is not a nan.
-        if (!FiniteOnlyFPMath()) {
-          if (DAG.isKnownNeverNaN(RHS)) {
-            // Put the potential NaN in the RHS so that SSE will preserve it.
-            std::swap(LHS, RHS);
-          } else if (!DAG.isKnownNeverNaN(LHS))
-            break;
-        }
+        if (!UnsafeFPMath) break;
          Opcode = X86ISD::FMIN;
          break;
        case ISD::SETOLE:
-        // This can be a min if we can prove that at least one of the operands
-        // is not a nan.
-        if (!FiniteOnlyFPMath()) {
-          if (DAG.isKnownNeverNaN(LHS)) {
-            // Put the potential NaN in the RHS so that SSE will preserve it.
-            std::swap(LHS, RHS);
-          } else if (!DAG.isKnownNeverNaN(RHS))
-            break;
-        }
+        if (!UnsafeFPMath) break;
          Opcode = X86ISD::FMIN;
          break;
        case ISD::SETULE:
-        // This can be a min, but if either operand is a NaN we need it to
-        // preserve the original LHS.
-        std::swap(LHS, RHS);
        case ISD::SETOLT:
        case ISD::SETLT:
        case ISD::SETLE:
@@ -8835,33 +8832,14 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
          break;
  
        case ISD::SETOGE:
-        // This can be a max if we can prove that at least one of the operands
-        // is not a nan.
-        if (!FiniteOnlyFPMath()) {
-          if (DAG.isKnownNeverNaN(LHS)) {
-            // Put the potential NaN in the RHS so that SSE will preserve it.
-            std::swap(LHS, RHS);
-          } else if (!DAG.isKnownNeverNaN(RHS))
-            break;
-        }
+        if (!UnsafeFPMath) break;
          Opcode = X86ISD::FMAX;
          break;
        case ISD::SETUGT:
-        // This can be a max if we can prove that at least one of the operands
-        // is not a nan.
-        if (!FiniteOnlyFPMath()) {
-          if (DAG.isKnownNeverNaN(RHS)) {
-            // Put the potential NaN in the RHS so that SSE will preserve it.
-            std::swap(LHS, RHS);
-          } else if (!DAG.isKnownNeverNaN(LHS))
-            break;
-        }
+        if (!UnsafeFPMath) break;
          Opcode = X86ISD::FMAX;
          break;
        case ISD::SETUGE:
-        // This can be a max, but if either operand is a NaN we need it to
-        // preserve the original LHS.
-        std::swap(LHS, RHS);
        case ISD::SETOGT:
        case ISD::SETGT:
        case ISD::SETGE:
@@ -8873,33 +8851,14 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
        switch (CC) {
        default: break;
        case ISD::SETOGE:
-        // This can be a min if we can prove that at least one of the operands
-        // is not a nan.
-        if (!FiniteOnlyFPMath()) {
-          if (DAG.isKnownNeverNaN(RHS)) {
-            // Put the potential NaN in the RHS so that SSE will preserve it.
-            std::swap(LHS, RHS);
-          } else if (!DAG.isKnownNeverNaN(LHS))
-            break;
-        }
+        if (!UnsafeFPMath) break;
          Opcode = X86ISD::FMIN;
          break;
        case ISD::SETUGT:
-        // This can be a min if we can prove that at least one of the operands
-        // is not a nan.
-        if (!FiniteOnlyFPMath()) {
-          if (DAG.isKnownNeverNaN(LHS)) {
-            // Put the potential NaN in the RHS so that SSE will preserve it.
-            std::swap(LHS, RHS);
-          } else if (!DAG.isKnownNeverNaN(RHS))
-            break;
-        }
+        if (!UnsafeFPMath) break;
          Opcode = X86ISD::FMIN;
          break;
        case ISD::SETUGE:
-        // This can be a min, but if either operand is a NaN we need it to
-        // preserve the original LHS.
-        std::swap(LHS, RHS);
        case ISD::SETOGT:
        case ISD::SETGT:
        case ISD::SETGE:
@@ -8907,33 +8866,14 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
          break;
  
        case ISD::SETULT:
-        // This can be a max if we can prove that at least one of the operands
-        // is not a nan.
-        if (!FiniteOnlyFPMath()) {
-          if (DAG.isKnownNeverNaN(LHS)) {
-            // Put the potential NaN in the RHS so that SSE will preserve it.
-            std::swap(LHS, RHS);
-          } else if (!DAG.isKnownNeverNaN(RHS))
-            break;
-        }
+        if (!UnsafeFPMath) break;
          Opcode = X86ISD::FMAX;
          break;
        case ISD::SETOLE:
-        // This can be a max if we can prove that at least one of the operands
-        // is not a nan.
-        if (!FiniteOnlyFPMath()) {
-          if (DAG.isKnownNeverNaN(RHS)) {
-            // Put the potential NaN in the RHS so that SSE will preserve it.
-            std::swap(LHS, RHS);
-          } else if (!DAG.isKnownNeverNaN(LHS))
-            break;
-        }
+        if (!UnsafeFPMath) break;
          Opcode = X86ISD::FMAX;
          break;
        case ISD::SETULE:
-        // This can be a max, but if either operand is a NaN we need it to
-        // preserve the original LHS.
-        std::swap(LHS, RHS);
        case ISD::SETOLT:
        case ISD::SETLT:
        case ISD::SETLE:
@@ -9151,6 +9091,53 @@ static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG,
    return SDValue();
  }
  
+/// PerformANDCombine - Target DAG combine for ISD::AND nodes with an integer
+/// vector type. Matches (and x, (build_vector c1,c2,c3,c4)) whose constant mask
+/// has a single use; if the complemented build_vector node has any uses, returns
+/// (and (xor x, (build_vector -1,-1,-1,-1)), (build_vector ~c1,~c2,~c3,~c4)). Otherwise returns an empty SDValue.
+static SDValue PerformANDCombine(SDNode *N, SelectionDAG &DAG,
+                                 TargetLowering::DAGCombinerInfo &DCI) {  // DCI is not referenced in this body.
+  EVT VT = N->getValueType(0);
+  if (!VT.isVector() || !VT.isInteger())  // Only integer vector results are handled.
+    return SDValue();
+
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  if (N0.getOpcode() == ISD::XOR || !N1.hasOneUse())  // Bail if LHS is an XOR or the mask does not have exactly one use.
+    return SDValue();
+
+  if (N1.getOpcode() == ISD::BUILD_VECTOR) {
+    unsigned NumElts = VT.getVectorNumElements();
+    EVT EltVT = VT.getVectorElementType();
+    SmallVector<SDValue, 8> Mask;  // Elements of the complemented mask.
+    Mask.reserve(NumElts);
+    for (unsigned i = 0; i != NumElts; ++i) {
+      SDValue Arg = N1.getOperand(i);
+      if (Arg.getOpcode() == ISD::UNDEF) {  // Undef elements are copied through unchanged.
+        Mask.push_back(Arg);
+        continue;
+      }
+      ConstantSDNode *C = dyn_cast<ConstantSDNode>(Arg);
+      if (!C) return SDValue();  // Any non-constant element aborts the combine.
+      Mask.push_back(DAG.getConstant(~C->getAPIntValue(), EltVT));  // Bitwise complement of this element.
+    }
+    N1 = DAG.getNode(ISD::BUILD_VECTOR, N1.getDebugLoc(), VT,  // From here on N1 is the complemented mask.
+                     &Mask[0], NumElts);
+    if (!N1.use_empty()) {  // Rewrite only when the complemented mask node has users.
+      unsigned Bits = EltVT.getSizeInBits();
+      Mask.clear();  // Reuse the buffer to build the all-ones vector.
+      for (unsigned i = 0; i != NumElts; ++i)
+        Mask.push_back(DAG.getConstant(APInt::getAllOnesValue(Bits), EltVT));
+      SDValue NewMask = DAG.getNode(ISD::BUILD_VECTOR, N->getDebugLoc(),
+                                    VT, &Mask[0], NumElts);
+      return DAG.getNode(ISD::AND, N->getDebugLoc(), VT,  // NOTE(review): this builds (~x & ~c),
+                         DAG.getNode(ISD::XOR, N->getDebugLoc(), VT,  // which does not look equivalent to
+                                     N0, NewMask), N1);  // the original (x & c) -- confirm before relying on it.
+    }
+  }
+
+  return SDValue();  // No match (or complemented mask unused): leave the node alone.
+}
  
  /// PerformMulCombine - Optimize a single multiply with constant into two
  /// in order to implement it with two cheaper instructions, e.g.
@@ -9299,7 +9286,7 @@ static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG,
        }
      } else if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT) {
         if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(InVec.getOperand(2))) {
-         unsigned SplatIdx = cast<ShuffleVectorSDNode>(ShAmtOp)->getSplatIndex();
+         unsigned SplatIdx= cast<ShuffleVectorSDNode>(ShAmtOp)->getSplatIndex();
           if (C->getZExtValue() == SplatIdx)
             BaseShAmt = InVec.getOperand(1);
         }
@@ -9485,7 +9472,7 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
        SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(),
                                    Ld->getBasePtr(), Ld->getSrcValue(),
                                    Ld->getSrcValueOffset(), Ld->isVolatile(),
-                                  Ld->getAlignment());
+                                  Ld->isNonTemporal(), Ld->getAlignment());
        SDValue NewChain = NewLd.getValue(1);
        if (TokenFactorIndex != -1) {
          Ops.push_back(NewChain);
@@ -9494,7 +9481,8 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
        }
        return DAG.getStore(NewChain, StDL, NewLd, St->getBasePtr(),
                            St->getSrcValue(), St->getSrcValueOffset(),
-                          St->isVolatile(), St->getAlignment());
+                          St->isVolatile(), St->isNonTemporal(),
+                          St->getAlignment());
      }
  
      // Otherwise, lower to two pairs of 32-bit loads / stores.
@@ -9504,10 +9492,11 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
  
      SDValue LoLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), LoAddr,
                                 Ld->getSrcValue(), Ld->getSrcValueOffset(),
-                               Ld->isVolatile(), Ld->getAlignment());
+                               Ld->isVolatile(), Ld->isNonTemporal(),
+                               Ld->getAlignment());
      SDValue HiLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), HiAddr,
                                 Ld->getSrcValue(), Ld->getSrcValueOffset()+4,
-                               Ld->isVolatile(),
+                               Ld->isVolatile(), Ld->isNonTemporal(),
                                 MinAlign(Ld->getAlignment(), 4));
  
      SDValue NewChain = LoLd.getValue(1);
@@ -9524,11 +9513,13 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
  
      SDValue LoSt = DAG.getStore(NewChain, StDL, LoLd, LoAddr,
                                  St->getSrcValue(), St->getSrcValueOffset(),
-                                St->isVolatile(), St->getAlignment());
+                                St->isVolatile(), St->isNonTemporal(),
+                                St->getAlignment());
      SDValue HiSt = DAG.getStore(NewChain, StDL, HiLd, HiAddr,
                                  St->getSrcValue(),
                                  St->getSrcValueOffset() + 4,
                                  St->isVolatile(),
+                                St->isNonTemporal(),
                                  MinAlign(St->getAlignment(), 4));
      return DAG.getNode(ISD::TokenFactor, StDL, MVT::Other, LoSt, HiSt);
    }
@@ -9680,6 +9671,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
    case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, *this);
    case ISD::SELECT:         return PerformSELECTCombine(N, DAG, Subtarget);
    case X86ISD::CMOV:        return PerformCMOVCombine(N, DAG, DCI);
+  case ISD::AND:            return PerformANDCombine(N, DAG, DCI);
    case ISD::MUL:            return PerformMulCombine(N, DAG, DCI);
    case ISD::SHL:
    case ISD::SRA:
@@ -9711,7 +9703,7 @@ static bool LowerToBSwap(CallInst *CI) {
    // Verify this is a simple bswap.
    if (CI->getNumOperands() != 2 ||
        CI->getType() != CI->getOperand(1)->getType() ||
-      !CI->getType()->isInteger())
+      !CI->getType()->isIntegerTy())
      return false;
  
    const IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
@@ -9760,7 +9752,7 @@ bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
        return LowerToBSwap(CI);
      }
      // rorw $$8, ${0:w}  -->  llvm.bswap.i16
-    if (CI->getType()->isInteger(16) &&
+    if (CI->getType()->isIntegerTy(16) &&
          AsmPieces.size() == 3 &&
          AsmPieces[0] == "rorw" &&
          AsmPieces[1] == "$$8," &&
@@ -9770,7 +9762,7 @@ bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
      }
      break;
    case 3:
-    if (CI->getType()->isInteger(64) &&
+    if (CI->getType()->isIntegerTy(64) &&
          Constraints.size() >= 2 &&
          Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" &&
          Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") {