Two types of instructions have register lists:

[oota-llvm.git] / lib / Target / CellSPU / SPUISelLowering.cpp
diff --git a/lib/Target/CellSPU/SPUISelLowering.cpp b/lib/Target/CellSPU/SPUISelLowering.cpp

index 2042a93e919be6d3fee017c320deedd8f36bb60d..b5f0e055a9427ea38f6bdeb6a51523c79103fec1 100644 (file)
--- a/lib/Target/CellSPU/SPUISelLowering.cpp
+++ b/lib/Target/CellSPU/SPUISelLowering.cpp
@@ -15,24 +15,25 @@
  #include "SPUISelLowering.h"
  #include "SPUTargetMachine.h"
  #include "SPUFrameInfo.h"
-#include "llvm/ADT/APInt.h"
-#include "llvm/ADT/VectorExtras.h"
+#include "SPUMachineFunction.h"
+#include "llvm/Constants.h"
+#include "llvm/Function.h"
+#include "llvm/Intrinsics.h"
  #include "llvm/CallingConv.h"
+#include "llvm/Type.h"
  #include "llvm/CodeGen/CallingConvLower.h"
  #include "llvm/CodeGen/MachineFrameInfo.h"
  #include "llvm/CodeGen/MachineFunction.h"
  #include "llvm/CodeGen/MachineInstrBuilder.h"
  #include "llvm/CodeGen/MachineRegisterInfo.h"
  #include "llvm/CodeGen/SelectionDAG.h"
-#include "llvm/Constants.h"
-#include "llvm/Function.h"
-#include "llvm/Intrinsics.h"
+#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/ADT/VectorExtras.h"
  #include "llvm/Support/Debug.h"
  #include "llvm/Support/ErrorHandling.h"
  #include "llvm/Support/MathExtras.h"
  #include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetOptions.h"
-
  #include <map>
  
  using namespace llvm;
@@ -41,10 +42,10 @@ using namespace llvm;
  namespace {
    std::map<unsigned, const char *> node_names;
  
-  //! MVT mapping to useful data for Cell SPU
+  //! EVT mapping to useful data for Cell SPU
    struct valtype_map_s {
-    const MVT   valtype;
-    const int   prefslot_byte;
+    EVT   valtype;
+    int   prefslot_byte;
    };
  
    const valtype_map_s valtype_map[] = {
@@ -60,7 +61,7 @@ namespace {
  
    const size_t n_valtype_map = sizeof(valtype_map) / sizeof(valtype_map[0]);
  
-  const valtype_map_s *getValueTypeMapEntry(MVT VT) {
+  const valtype_map_s *getValueTypeMapEntry(EVT VT) {
      const valtype_map_s *retval = 0;
  
      for (size_t i = 0; i < n_valtype_map; ++i) {
@@ -72,11 +73,8 @@ namespace {
  
  #ifndef NDEBUG
      if (retval == 0) {
-      std::string msg;
-      raw_string_ostream Msg(msg);
-      Msg << "getValueTypeMapEntry returns NULL for "
-           << VT.getMVTString();
-      llvm_report_error(Msg.str());
+      report_fatal_error("getValueTypeMapEntry returns NULL for " +
+                         Twine(VT.getEVTString()));
      }
  #endif
  
@@ -92,7 +90,7 @@ namespace {
  
    SDValue
    ExpandLibCall(RTLIB::Libcall LC, SDValue Op, SelectionDAG &DAG,
-                bool isSigned, SDValue &Hi, SPUTargetLowering &TLI) {
+                bool isSigned, SDValue &Hi, const SPUTargetLowering &TLI) {
      // The input chain to this libcall is the entry node of the function.
      // Legalizing the call will automatically add the previous call to the
      // dependence.
@@ -101,8 +99,8 @@ namespace {
      TargetLowering::ArgListTy Args;
      TargetLowering::ArgListEntry Entry;
      for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
-      MVT ArgVT = Op.getOperand(i).getValueType();
-      const Type *ArgTy = ArgVT.getTypeForMVT(*DAG.getContext());
+      EVT ArgVT = Op.getOperand(i).getValueType();
+      const Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
        Entry.Node = Op.getOperand(i);
        Entry.Ty = ArgTy;
        Entry.isSExt = isSigned;
@@ -114,20 +112,20 @@ namespace {
  
      // Splice the libcall in wherever FindInputOutputChains tells us to.
      const Type *RetTy =
-                 Op.getNode()->getValueType(0).getTypeForMVT(*DAG.getContext());
+                Op.getNode()->getValueType(0).getTypeForEVT(*DAG.getContext());
      std::pair<SDValue, SDValue> CallInfo =
              TLI.LowerCallTo(InChain, RetTy, isSigned, !isSigned, false, false,
-                            0, CallingConv::C, false, Callee, Args, DAG,
-                            Op.getDebugLoc());
+                            0, TLI.getLibcallCallingConv(LC), false,
+                            /*isReturnValueUsed=*/true,
+                            Callee, Args, DAG, Op.getDebugLoc());
  
      return CallInfo.first;
    }
  }
  
  SPUTargetLowering::SPUTargetLowering(SPUTargetMachine &TM)
-  : TargetLowering(TM),
-    SPUTM(TM)
-{
+  : TargetLowering(TM, new TargetLoweringObjectFileELF()),
+    SPUTM(TM) {
    // Fold away setcc operations if possible.
    setPow2DivIsCheap();
  
@@ -155,6 +153,13 @@ SPUTargetLowering::SPUTargetLowering(SPUTargetMachine &TM)
    setLoadExtAction(ISD::EXTLOAD,  MVT::f32, Expand);
    setLoadExtAction(ISD::EXTLOAD,  MVT::f64, Expand);
  
+  setTruncStoreAction(MVT::i128, MVT::i64, Expand);
+  setTruncStoreAction(MVT::i128, MVT::i32, Expand);
+  setTruncStoreAction(MVT::i128, MVT::i16, Expand);
+  setTruncStoreAction(MVT::i128, MVT::i8, Expand);
+
+  setTruncStoreAction(MVT::f64, MVT::f32, Expand);
+
    // SPU constant load actions are custom lowered:
    setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
    setOperationAction(ISD::ConstantFP, MVT::f64, Custom);
@@ -162,7 +167,7 @@ SPUTargetLowering::SPUTargetLowering(SPUTargetMachine &TM)
    // SPU's loads and stores have to be custom lowered:
    for (unsigned sctype = (unsigned) MVT::i8; sctype < (unsigned) MVT::i128;
         ++sctype) {
-    MVT VT = (MVT::SimpleValueType)sctype;
+    MVT::SimpleValueType VT = (MVT::SimpleValueType)sctype;
  
      setOperationAction(ISD::LOAD,   VT, Custom);
      setOperationAction(ISD::STORE,  VT, Custom);
@@ -171,20 +176,20 @@ SPUTargetLowering::SPUTargetLowering(SPUTargetMachine &TM)
      setLoadExtAction(ISD::SEXTLOAD, VT, Custom);
  
      for (unsigned stype = sctype - 1; stype >= (unsigned) MVT::i8; --stype) {
-      MVT StoreVT = (MVT::SimpleValueType) stype;
+      MVT::SimpleValueType StoreVT = (MVT::SimpleValueType) stype;
        setTruncStoreAction(VT, StoreVT, Expand);
      }
    }
  
    for (unsigned sctype = (unsigned) MVT::f32; sctype < (unsigned) MVT::f64;
         ++sctype) {
-    MVT VT = (MVT::SimpleValueType) sctype;
+    MVT::SimpleValueType VT = (MVT::SimpleValueType) sctype;
  
      setOperationAction(ISD::LOAD,   VT, Custom);
      setOperationAction(ISD::STORE,  VT, Custom);
  
      for (unsigned stype = sctype - 1; stype >= (unsigned) MVT::f32; --stype) {
-      MVT StoreVT = (MVT::SimpleValueType) stype;
+      MVT::SimpleValueType StoreVT = (MVT::SimpleValueType) stype;
        setTruncStoreAction(VT, StoreVT, Expand);
      }
    }
@@ -203,11 +208,37 @@ SPUTargetLowering::SPUTargetLowering(SPUTargetMachine &TM)
    // SPU has no intrinsics for these particular operations:
    setOperationAction(ISD::MEMBARRIER, MVT::Other, Expand);
  
-  // SPU has no SREM/UREM instructions
-  setOperationAction(ISD::SREM, MVT::i32, Expand);
-  setOperationAction(ISD::UREM, MVT::i32, Expand);
-  setOperationAction(ISD::SREM, MVT::i64, Expand);
-  setOperationAction(ISD::UREM, MVT::i64, Expand);
+  // SPU has no division/remainder instructions
+  setOperationAction(ISD::SREM,    MVT::i8,   Expand);
+  setOperationAction(ISD::UREM,    MVT::i8,   Expand);
+  setOperationAction(ISD::SDIV,    MVT::i8,   Expand);
+  setOperationAction(ISD::UDIV,    MVT::i8,   Expand);
+  setOperationAction(ISD::SDIVREM, MVT::i8,   Expand);
+  setOperationAction(ISD::UDIVREM, MVT::i8,   Expand);
+  setOperationAction(ISD::SREM,    MVT::i16,  Expand);
+  setOperationAction(ISD::UREM,    MVT::i16,  Expand);
+  setOperationAction(ISD::SDIV,    MVT::i16,  Expand);
+  setOperationAction(ISD::UDIV,    MVT::i16,  Expand);
+  setOperationAction(ISD::SDIVREM, MVT::i16,  Expand);
+  setOperationAction(ISD::UDIVREM, MVT::i16,  Expand);
+  setOperationAction(ISD::SREM,    MVT::i32,  Expand);
+  setOperationAction(ISD::UREM,    MVT::i32,  Expand);
+  setOperationAction(ISD::SDIV,    MVT::i32,  Expand);
+  setOperationAction(ISD::UDIV,    MVT::i32,  Expand);
+  setOperationAction(ISD::SDIVREM, MVT::i32,  Expand);
+  setOperationAction(ISD::UDIVREM, MVT::i32,  Expand);
+  setOperationAction(ISD::SREM,    MVT::i64,  Expand);
+  setOperationAction(ISD::UREM,    MVT::i64,  Expand);
+  setOperationAction(ISD::SDIV,    MVT::i64,  Expand);
+  setOperationAction(ISD::UDIV,    MVT::i64,  Expand);
+  setOperationAction(ISD::SDIVREM, MVT::i64,  Expand);
+  setOperationAction(ISD::UDIVREM, MVT::i64,  Expand);
+  setOperationAction(ISD::SREM,    MVT::i128, Expand);
+  setOperationAction(ISD::UREM,    MVT::i128, Expand);
+  setOperationAction(ISD::SDIV,    MVT::i128, Expand);
+  setOperationAction(ISD::UDIV,    MVT::i128, Expand);
+  setOperationAction(ISD::SDIVREM, MVT::i128, Expand);
+  setOperationAction(ISD::UDIVREM, MVT::i128, Expand);
  
    // We don't support sin/cos/sqrt/fmod
    setOperationAction(ISD::FSIN , MVT::f64, Expand);
@@ -287,11 +318,19 @@ SPUTargetLowering::SPUTargetLowering(SPUTargetMachine &TM)
    setOperationAction(ISD::CTPOP, MVT::i16,   Custom);
    setOperationAction(ISD::CTPOP, MVT::i32,   Custom);
    setOperationAction(ISD::CTPOP, MVT::i64,   Custom);
+  setOperationAction(ISD::CTPOP, MVT::i128,  Expand);
  
+  setOperationAction(ISD::CTTZ , MVT::i8,    Expand);
+  setOperationAction(ISD::CTTZ , MVT::i16,   Expand);
    setOperationAction(ISD::CTTZ , MVT::i32,   Expand);
    setOperationAction(ISD::CTTZ , MVT::i64,   Expand);
+  setOperationAction(ISD::CTTZ , MVT::i128,  Expand);
  
+  setOperationAction(ISD::CTLZ , MVT::i8,    Promote);
+  setOperationAction(ISD::CTLZ , MVT::i16,   Promote);
    setOperationAction(ISD::CTLZ , MVT::i32,   Legal);
+  setOperationAction(ISD::CTLZ , MVT::i64,   Expand);
+  setOperationAction(ISD::CTLZ , MVT::i128,  Expand);
  
    // SPU has a version of select that implements (a&~c)|(b&c), just like
    // select ought to work:
@@ -309,10 +348,21 @@ SPUTargetLowering::SPUTargetLowering(SPUTargetMachine &TM)
    // Custom lower i128 -> i64 truncates
    setOperationAction(ISD::TRUNCATE, MVT::i64, Custom);
  
+  // Custom lower i32/i64 -> i128 sign extend
+  setOperationAction(ISD::SIGN_EXTEND, MVT::i128, Custom);
+
+  setOperationAction(ISD::FP_TO_SINT, MVT::i8, Promote);
+  setOperationAction(ISD::FP_TO_UINT, MVT::i8, Promote);
+  setOperationAction(ISD::FP_TO_SINT, MVT::i16, Promote);
+  setOperationAction(ISD::FP_TO_UINT, MVT::i16, Promote);
    // SPU has a legal FP -> signed INT instruction for f32, but for f64, need
    // to expand to a libcall, hence the custom lowering:
    setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
    setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
+  setOperationAction(ISD::FP_TO_SINT, MVT::i64, Expand);
+  setOperationAction(ISD::FP_TO_UINT, MVT::i64, Expand);
+  setOperationAction(ISD::FP_TO_SINT, MVT::i128, Expand);
+  setOperationAction(ISD::FP_TO_UINT, MVT::i128, Expand);
  
    // FDIV on SPU requires custom lowering
    setOperationAction(ISD::FDIV, MVT::f64, Expand);      // to libcall
@@ -335,24 +385,17 @@ SPUTargetLowering::SPUTargetLowering(SPUTargetMachine &TM)
    // We cannot sextinreg(i1).  Expand to shifts.
    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
  
-  // Support label based line numbers.
-  setOperationAction(ISD::DBG_STOPPOINT, MVT::Other, Expand);
-  setOperationAction(ISD::DEBUG_LOC, MVT::Other, Expand);
-
    // We want to legalize GlobalAddress and ConstantPool nodes into the
    // appropriate instructions to materialize the address.
    for (unsigned sctype = (unsigned) MVT::i8; sctype < (unsigned) MVT::f128;
         ++sctype) {
-    MVT VT = (MVT::SimpleValueType)sctype;
+    MVT::SimpleValueType VT = (MVT::SimpleValueType)sctype;
  
      setOperationAction(ISD::GlobalAddress,  VT, Custom);
      setOperationAction(ISD::ConstantPool,   VT, Custom);
      setOperationAction(ISD::JumpTable,      VT, Custom);
    }
  
-  // RET must be custom lowered, to meet ABI requirements
-  setOperationAction(ISD::RET,           MVT::Other, Custom);
-
    // VASTART needs to be custom lowered to use the VarArgsFrameIndex
    setOperationAction(ISD::VASTART           , MVT::Other, Custom);
  
@@ -384,12 +427,9 @@ SPUTargetLowering::SPUTargetLowering(SPUTargetMachine &TM)
    addRegisterClass(MVT::v4f32, SPU::VECREGRegisterClass);
    addRegisterClass(MVT::v2f64, SPU::VECREGRegisterClass);
  
-  // "Odd size" vector classes that we're willing to support:
-  addRegisterClass(MVT::v2i32, SPU::VECREGRegisterClass);
-
    for (unsigned i = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
         i <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++i) {
-    MVT VT = (MVT::SimpleValueType)i;
+    MVT::SimpleValueType VT = (MVT::SimpleValueType)i;
  
      // add/sub are legal for all supported vector VT's.
      setOperationAction(ISD::ADD,     VT, Legal);
@@ -443,7 +483,7 @@ SPUTargetLowering::SPUTargetLowering(SPUTargetMachine &TM)
    // Set pre-RA register scheduler default to BURR, which produces slightly
    // better code than the default (could also be TDRR, but TargetLowering.h
    // needs a mod to support that model):
-  setSchedulingPreference(SchedulingForRegPressure);
+  setSchedulingPreference(Sched::RegPressure);
  }
  
  const char *
@@ -465,9 +505,6 @@ SPUTargetLowering::getTargetNodeName(unsigned Opcode) const
      node_names[(unsigned) SPUISD::VEC2PREFSLOT] = "SPUISD::VEC2PREFSLOT";
      node_names[(unsigned) SPUISD::SHLQUAD_L_BITS] = "SPUISD::SHLQUAD_L_BITS";
      node_names[(unsigned) SPUISD::SHLQUAD_L_BYTES] = "SPUISD::SHLQUAD_L_BYTES";
-    node_names[(unsigned) SPUISD::VEC_SHL] = "SPUISD::VEC_SHL";
-    node_names[(unsigned) SPUISD::VEC_SRL] = "SPUISD::VEC_SRL";
-    node_names[(unsigned) SPUISD::VEC_SRA] = "SPUISD::VEC_SRA";
      node_names[(unsigned) SPUISD::VEC_ROTL] = "SPUISD::VEC_ROTL";
      node_names[(unsigned) SPUISD::VEC_ROTR] = "SPUISD::VEC_ROTR";
      node_names[(unsigned) SPUISD::ROTBYTES_LEFT] = "SPUISD::ROTBYTES_LEFT";
@@ -494,9 +531,11 @@ unsigned SPUTargetLowering::getFunctionAlignment(const Function *) const {
  // Return the Cell SPU's SETCC result type
  //===----------------------------------------------------------------------===//
  
-MVT SPUTargetLowering::getSetCCResultType(MVT VT) const {
+MVT::SimpleValueType SPUTargetLowering::getSetCCResultType(EVT VT) const {
    // i16 and i32 are valid SETCC result types
-  return ((VT == MVT::i8 || VT == MVT::i16 || VT == MVT::i32) ? VT : MVT::i32);
+  return ((VT == MVT::i8 || VT == MVT::i16 || VT == MVT::i32) ?
+    VT.getSimpleVT().SimpleTy :
+    MVT::i32);
  }
  
  //===----------------------------------------------------------------------===//
@@ -529,9 +568,9 @@ static SDValue
  LowerLOAD(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) {
    LoadSDNode *LN = cast<LoadSDNode>(Op);
    SDValue the_chain = LN->getChain();
-  MVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
-  MVT InVT = LN->getMemoryVT();
-  MVT OutVT = Op.getValueType();
+  EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
+  EVT InVT = LN->getMemoryVT();
+  EVT OutVT = Op.getValueType();
    ISD::LoadExtType ExtType = LN->getExtensionType();
    unsigned alignment = LN->getAlignment();
    const valtype_map_s *vtm = getValueTypeMapEntry(InVT);
@@ -624,8 +663,8 @@ LowerLOAD(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) {
  
      // Re-emit as a v16i8 vector load
      result = DAG.getLoad(MVT::v16i8, dl, the_chain, basePtr,
-                         LN->getSrcValue(), LN->getSrcValueOffset(),
-                         LN->isVolatile(), 16);
+                         LN->getPointerInfo(),
+                         LN->isVolatile(), LN->isNonTemporal(), 16);
  
      // Update the chain
      the_chain = result.getValue(1);
@@ -636,7 +675,8 @@ LowerLOAD(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) {
  
      // Convert the loaded v16i8 vector to the appropriate vector type
      // specified by the operand:
-    MVT vecVT = MVT::getVectorVT(InVT, (128 / InVT.getSizeInBits()));
+    EVT vecVT = EVT::getVectorVT(*DAG.getContext(), 
+                                 InVT, (128 / InVT.getSizeInBits()));
      result = DAG.getNode(SPUISD::VEC2PREFSLOT, dl, InVT,
                           DAG.getNode(ISD::BIT_CONVERT, dl, vecVT, result));
  
@@ -670,12 +710,9 @@ LowerLOAD(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) {
    case ISD::POST_DEC:
    case ISD::LAST_INDEXED_MODE:
      {
-      std::string msg;
-      raw_string_ostream Msg(msg);
-      Msg << "LowerLOAD: Got a LoadSDNode with an addr mode other than "
-            "UNINDEXED\n";
-      Msg << (unsigned) LN->getAddressingMode();
-      llvm_report_error(Msg.str());
+      report_fatal_error("LowerLOAD: Got a LoadSDNode with an addr mode other "
+                         "than UNINDEXED\n" +
+                         Twine((unsigned)LN->getAddressingMode()));
        /*NOTREACHED*/
      }
    }
@@ -693,17 +730,17 @@ static SDValue
  LowerSTORE(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) {
    StoreSDNode *SN = cast<StoreSDNode>(Op);
    SDValue Value = SN->getValue();
-  MVT VT = Value.getValueType();
-  MVT StVT = (!SN->isTruncatingStore() ? VT : SN->getMemoryVT());
-  MVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
+  EVT VT = Value.getValueType();
+  EVT StVT = (!SN->isTruncatingStore() ? VT : SN->getMemoryVT());
+  EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
    DebugLoc dl = Op.getDebugLoc();
    unsigned alignment = SN->getAlignment();
  
    switch (SN->getAddressingMode()) {
    case ISD::UNINDEXED: {
      // The vector type we really want to load from the 16-byte chunk.
-    MVT vecVT = MVT::getVectorVT(VT, (128 / VT.getSizeInBits())),
-        stVecVT = MVT::getVectorVT(StVT, (128 / StVT.getSizeInBits()));
+    EVT vecVT = EVT::getVectorVT(*DAG.getContext(),
+                                 VT, (128 / VT.getSizeInBits()));
  
      SDValue alignLoadVec;
      SDValue basePtr = SN->getBasePtr();
@@ -712,7 +749,6 @@ LowerSTORE(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) {
  
      if (alignment == 16) {
        ConstantSDNode *CN;
-
        // Special cases for a known aligned load to simplify the base pointer
        // and insertion byte:
        if (basePtr.getOpcode() == ISD::ADD
@@ -736,6 +772,9 @@ LowerSTORE(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) {
          insertEltOffs = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT,
                                      basePtr,
                                      DAG.getConstant(0, PtrVT));
+        basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT,
+                                    basePtr,
+                                    DAG.getConstant(0, PtrVT));
        }
      } else {
        // Unaligned load: must be more pessimistic about addressing modes:
@@ -772,10 +811,10 @@ LowerSTORE(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) {
                                    DAG.getConstant(0, PtrVT));
      }
  
-    // Re-emit as a v16i8 vector load
-    alignLoadVec = DAG.getLoad(MVT::v16i8, dl, the_chain, basePtr,
-                               SN->getSrcValue(), SN->getSrcValueOffset(),
-                               SN->isVolatile(), 16);
+    // Load the memory to which to store.
+    alignLoadVec = DAG.getLoad(vecVT, dl, the_chain, basePtr,
+                               SN->getPointerInfo(),
+                               SN->isVolatile(), SN->isNonTemporal(), 16);
  
      // Update the chain
      the_chain = alignLoadVec.getValue(1);
@@ -798,16 +837,16 @@ LowerSTORE(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) {
      // to the stack pointer, which is always aligned.
  #if !defined(NDEBUG)
        if (DebugFlag && isCurrentDebugType(DEBUG_TYPE)) {
-        cerr << "CellSPU LowerSTORE: basePtr = ";
+        errs() << "CellSPU LowerSTORE: basePtr = ";
          basePtr.getNode()->dump(&DAG);
-        cerr << "\n";
+        errs() << "\n";
        }
  #endif
  
-    SDValue insertEltOp =
-            DAG.getNode(SPUISD::SHUFFLE_MASK, dl, vecVT, insertEltOffs);
-    SDValue vectorizeOp =
-            DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, vecVT, theValue);
+    SDValue insertEltOp = DAG.getNode(SPUISD::SHUFFLE_MASK, dl, vecVT,
+                                      insertEltOffs);
+    SDValue vectorizeOp = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, vecVT, 
+                                      theValue);
  
      result = DAG.getNode(SPUISD::SHUFB, dl, vecVT,
                           vectorizeOp, alignLoadVec,
@@ -815,17 +854,18 @@ LowerSTORE(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) {
                                       MVT::v4i32, insertEltOp));
  
      result = DAG.getStore(the_chain, dl, result, basePtr,
-                          LN->getSrcValue(), LN->getSrcValueOffset(),
-                          LN->isVolatile(), LN->getAlignment());
+                          LN->getPointerInfo(),
+                          LN->isVolatile(), LN->isNonTemporal(),
+                          LN->getAlignment());
  
  #if 0 && !defined(NDEBUG)
      if (DebugFlag && isCurrentDebugType(DEBUG_TYPE)) {
        const SDValue &currentRoot = DAG.getRoot();
  
        DAG.setRoot(result);
-      cerr << "------- CellSPU:LowerStore result:\n";
+      errs() << "------- CellSPU:LowerStore result:\n";
        DAG.dump();
-      cerr << "-------\n";
+      errs() << "-------\n";
        DAG.setRoot(currentRoot);
      }
  #endif
@@ -839,12 +879,9 @@ LowerSTORE(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) {
    case ISD::POST_DEC:
    case ISD::LAST_INDEXED_MODE:
      {
-      std::string msg;
-      raw_string_ostream Msg(msg);
-      Msg << "LowerLOAD: Got a LoadSDNode with an addr mode other than "
-            "UNINDEXED\n";
-      Msg << (unsigned) SN->getAddressingMode();
-      llvm_report_error(Msg.str());
+      report_fatal_error("LowerSTORE: Got a StoreSDNode with an addr mode "
+                         "other than UNINDEXED\n" +
+                         Twine((unsigned)SN->getAddressingMode()));
        /*NOTREACHED*/
      }
    }
@@ -853,11 +890,11 @@ LowerSTORE(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) {
  }
  
  //! Generate the address of a constant pool entry.
-SDValue
+static SDValue
  LowerConstantPool(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) {
-  MVT PtrVT = Op.getValueType();
+  EVT PtrVT = Op.getValueType();
    ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
-  Constant *C = CP->getConstVal();
+  const Constant *C = CP->getConstVal();
    SDValue CPI = DAG.getTargetConstantPool(C, PtrVT, CP->getAlignment());
    SDValue Zero = DAG.getConstant(0, PtrVT);
    const TargetMachine &TM = DAG.getTarget();
@@ -888,7 +925,7 @@ SPU::LowerConstantPool(SDValue Op, SelectionDAG &DAG, const SPUTargetMachine &TM
  
  static SDValue
  LowerJumpTable(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) {
-  MVT PtrVT = Op.getValueType();
+  EVT PtrVT = Op.getValueType();
    JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
    SDValue JTI = DAG.getTargetJumpTable(JT->getIndex(), PtrVT);
    SDValue Zero = DAG.getConstant(0, PtrVT);
@@ -913,10 +950,11 @@ LowerJumpTable(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) {
  
  static SDValue
  LowerGlobalAddress(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) {
-  MVT PtrVT = Op.getValueType();
+  EVT PtrVT = Op.getValueType();
    GlobalAddressSDNode *GSDN = cast<GlobalAddressSDNode>(Op);
-  GlobalValue *GV = GSDN->getGlobal();
-  SDValue GA = DAG.getTargetGlobalAddress(GV, PtrVT, GSDN->getOffset());
+  const GlobalValue *GV = GSDN->getGlobal();
+  SDValue GA = DAG.getTargetGlobalAddress(GV, Op.getDebugLoc(),
+                                          PtrVT, GSDN->getOffset());
    const TargetMachine &TM = DAG.getTarget();
    SDValue Zero = DAG.getConstant(0, PtrVT);
    // FIXME there is no actual debug info here
@@ -931,7 +969,7 @@ LowerGlobalAddress(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) {
        return DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, Hi, Lo);
      }
    } else {
-    llvm_report_error("LowerGlobalAddress: Relocation model other than static"
+    report_fatal_error("LowerGlobalAddress: Relocation model other than static "
                        "not supported.");
      /*NOTREACHED*/
    }
@@ -942,7 +980,7 @@ LowerGlobalAddress(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) {
  //! Custom lower double precision floating point constants
  static SDValue
  LowerConstantFP(SDValue Op, SelectionDAG &DAG) {
-  MVT VT = Op.getValueType();
+  EVT VT = Op.getValueType();
    // FIXME there is no actual debug info here
    DebugLoc dl = Op.getDebugLoc();
  
@@ -962,44 +1000,46 @@ LowerConstantFP(SDValue Op, SelectionDAG &DAG) {
    return SDValue();
  }
  
-static SDValue
-LowerFORMAL_ARGUMENTS(SDValue Op, SelectionDAG &DAG, int &VarArgsFrameIndex)
-{
+SDValue
+SPUTargetLowering::LowerFormalArguments(SDValue Chain,
+                                        CallingConv::ID CallConv, bool isVarArg,
+                                        const SmallVectorImpl<ISD::InputArg>
+                                          &Ins,
+                                        DebugLoc dl, SelectionDAG &DAG,
+                                        SmallVectorImpl<SDValue> &InVals)
+                                          const {
+
    MachineFunction &MF = DAG.getMachineFunction();
    MachineFrameInfo *MFI = MF.getFrameInfo();
    MachineRegisterInfo &RegInfo = MF.getRegInfo();
-  SmallVector<SDValue, 48> ArgValues;
-  SDValue Root = Op.getOperand(0);
-  bool isVarArg = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue() != 0;
-  DebugLoc dl = Op.getDebugLoc();
-
-  const unsigned *ArgRegs = SPURegisterInfo::getArgRegs();
-  const unsigned NumArgRegs = SPURegisterInfo::getNumArgRegs();
+  SPUFunctionInfo *FuncInfo = MF.getInfo<SPUFunctionInfo>();
  
    unsigned ArgOffset = SPUFrameInfo::minStackSize();
    unsigned ArgRegIdx = 0;
    unsigned StackSlotSize = SPUFrameInfo::stackSlotSize();
  
-  MVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
+  EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
+
+  SmallVector<CCValAssign, 16> ArgLocs;
+  CCState CCInfo(CallConv, isVarArg, getTargetMachine(), ArgLocs,
+                 *DAG.getContext());
+  // FIXME: allow for other calling conventions
+  CCInfo.AnalyzeFormalArguments(Ins, CCC_SPU);
  
    // Add DAG nodes to load the arguments or copy them out of registers.
-  for (unsigned ArgNo = 0, e = Op.getNode()->getNumValues() - 1;
-       ArgNo != e; ++ArgNo) {
-    MVT ObjectVT = Op.getValue(ArgNo).getValueType();
+  for (unsigned ArgNo = 0, e = Ins.size(); ArgNo != e; ++ArgNo) {
+    EVT ObjectVT = Ins[ArgNo].VT;
      unsigned ObjSize = ObjectVT.getSizeInBits()/8;
      SDValue ArgVal;
+    CCValAssign &VA = ArgLocs[ArgNo];
  
-    if (ArgRegIdx < NumArgRegs) {
+    if (VA.isRegLoc()) {
        const TargetRegisterClass *ArgRegClass;
  
-      switch (ObjectVT.getSimpleVT()) {
-      default: {
-        std::string msg;
-        raw_string_ostream Msg(msg);
-        Msg << "LowerFORMAL_ARGUMENTS Unhandled argument type: "
-             << ObjectVT.getMVTString();
-        llvm_report_error(Msg.str());
-      }
+      switch (ObjectVT.getSimpleVT().SimpleTy) {
+      default:
+        report_fatal_error("LowerFormalArguments Unhandled argument type: " +
+                           Twine(ObjectVT.getEVTString()));
        case MVT::i8:
          ArgRegClass = &SPU::R8CRegClass;
          break;
@@ -1032,53 +1072,69 @@ LowerFORMAL_ARGUMENTS(SDValue Op, SelectionDAG &DAG, int &VarArgsFrameIndex)
        }
  
        unsigned VReg = RegInfo.createVirtualRegister(ArgRegClass);
-      RegInfo.addLiveIn(ArgRegs[ArgRegIdx], VReg);
-      ArgVal = DAG.getCopyFromReg(Root, dl, VReg, ObjectVT);
+      RegInfo.addLiveIn(VA.getLocReg(), VReg);
+      ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, ObjectVT);
        ++ArgRegIdx;
      } else {
        // We need to load the argument to a virtual register if we determined
        // above that we ran out of physical registers of the appropriate type
        // or we're forced to do vararg
-      int FI = MFI->CreateFixedObject(ObjSize, ArgOffset);
+      int FI = MFI->CreateFixedObject(ObjSize, ArgOffset, true);
        SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
-      ArgVal = DAG.getLoad(ObjectVT, dl, Root, FIN, NULL, 0);
+      ArgVal = DAG.getLoad(ObjectVT, dl, Chain, FIN, MachinePointerInfo(),
+                           false, false, 0);
        ArgOffset += StackSlotSize;
      }
  
-    ArgValues.push_back(ArgVal);
+    InVals.push_back(ArgVal);
      // Update the chain
-    Root = ArgVal.getOperand(0);
+    Chain = ArgVal.getOperand(0);
    }
  
    // vararg handling:
    if (isVarArg) {
-    // unsigned int ptr_size = PtrVT.getSizeInBits() / 8;
+    // FIXME: we should be able to query the argument registers from 
+    //        tablegen generated code. 
+    static const unsigned ArgRegs[] = {
+      SPU::R3,  SPU::R4,  SPU::R5,  SPU::R6,  SPU::R7,  SPU::R8,  SPU::R9,
+      SPU::R10, SPU::R11, SPU::R12, SPU::R13, SPU::R14, SPU::R15, SPU::R16,
+      SPU::R17, SPU::R18, SPU::R19, SPU::R20, SPU::R21, SPU::R22, SPU::R23,
+      SPU::R24, SPU::R25, SPU::R26, SPU::R27, SPU::R28, SPU::R29, SPU::R30,
+      SPU::R31, SPU::R32, SPU::R33, SPU::R34, SPU::R35, SPU::R36, SPU::R37,
+      SPU::R38, SPU::R39, SPU::R40, SPU::R41, SPU::R42, SPU::R43, SPU::R44,
+      SPU::R45, SPU::R46, SPU::R47, SPU::R48, SPU::R49, SPU::R50, SPU::R51,
+      SPU::R52, SPU::R53, SPU::R54, SPU::R55, SPU::R56, SPU::R57, SPU::R58,
+      SPU::R59, SPU::R60, SPU::R61, SPU::R62, SPU::R63, SPU::R64, SPU::R65,
+      SPU::R66, SPU::R67, SPU::R68, SPU::R69, SPU::R70, SPU::R71, SPU::R72,
+      SPU::R73, SPU::R74, SPU::R75, SPU::R76, SPU::R77, SPU::R78, SPU::R79
+    };
+    // Derive the count from ArgRegs so it cannot drift from the table above
+    const unsigned NumArgRegs = sizeof(ArgRegs) / sizeof(ArgRegs[0]);
+
      // We will spill (79-3)+1 registers to the stack
      SmallVector<SDValue, 79-3+1> MemOps;
  
      // Create the frame slot
-
      for (; ArgRegIdx != NumArgRegs; ++ArgRegIdx) {
-      VarArgsFrameIndex = MFI->CreateFixedObject(StackSlotSize, ArgOffset);
-      SDValue FIN = DAG.getFrameIndex(VarArgsFrameIndex, PtrVT);
-      SDValue ArgVal = DAG.getRegister(ArgRegs[ArgRegIdx], MVT::v16i8);
-      SDValue Store = DAG.getStore(Root, dl, ArgVal, FIN, NULL, 0);
-      Root = Store.getOperand(0);
+      FuncInfo->setVarArgsFrameIndex(
+        MFI->CreateFixedObject(StackSlotSize, ArgOffset, true));
+      SDValue FIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
+      unsigned VReg = MF.addLiveIn(ArgRegs[ArgRegIdx], &SPU::R32CRegClass);
+      SDValue ArgVal = DAG.getRegister(VReg, MVT::v16i8);
+      SDValue Store = DAG.getStore(Chain, dl, ArgVal, FIN, MachinePointerInfo(),
+                                   false, false, 0);
+      Chain = Store.getOperand(0);
        MemOps.push_back(Store);
  
        // Increment address by stack slot size for the next stored argument
        ArgOffset += StackSlotSize;
      }
      if (!MemOps.empty())
-      Root = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
-                         &MemOps[0], MemOps.size());
+      Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+                          &MemOps[0], MemOps.size());
    }
  
-  ArgValues.push_back(Root);
-
-  // Return the new list of results.
-  return DAG.getNode(ISD::MERGE_VALUES, dl, Op.getNode()->getVTList(),
-                     &ArgValues[0], ArgValues.size());
+  return Chain;
  }
  
  /// isLSAAddress - Return the immediate to use if the specified
@@ -1095,24 +1151,33 @@ static SDNode *isLSAAddress(SDValue Op, SelectionDAG &DAG) {
    return DAG.getConstant((int)C->getZExtValue() >> 2, MVT::i32).getNode();
  }
  
-static SDValue
-LowerCALL(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) {
-  CallSDNode *TheCall = cast<CallSDNode>(Op.getNode());
-  SDValue Chain = TheCall->getChain();
-  SDValue Callee    = TheCall->getCallee();
-  unsigned NumOps     = TheCall->getNumArgs();
+SDValue
+SPUTargetLowering::LowerCall(SDValue Chain, SDValue Callee,
+                             CallingConv::ID CallConv, bool isVarArg,
+                             bool &isTailCall,
+                             const SmallVectorImpl<ISD::OutputArg> &Outs,
+                             const SmallVectorImpl<SDValue> &OutVals,
+                             const SmallVectorImpl<ISD::InputArg> &Ins,
+                             DebugLoc dl, SelectionDAG &DAG,
+                             SmallVectorImpl<SDValue> &InVals) const {
+  // CellSPU target does not yet support tail call optimization.
+  isTailCall = false;
+
+  const SPUSubtarget *ST = SPUTM.getSubtargetImpl();
+  unsigned NumOps     = Outs.size();
    unsigned StackSlotSize = SPUFrameInfo::stackSlotSize();
-  const unsigned *ArgRegs = SPURegisterInfo::getArgRegs();
-  const unsigned NumArgRegs = SPURegisterInfo::getNumArgRegs();
-  DebugLoc dl = TheCall->getDebugLoc();
  
-  // Handy pointer type
-  MVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
+  SmallVector<CCValAssign, 16> ArgLocs;
+  CCState CCInfo(CallConv, isVarArg, getTargetMachine(), ArgLocs,
+                 *DAG.getContext()); 
+  // FIXME: allow for other calling conventions
+  CCInfo.AnalyzeCallOperands(Outs, CCC_SPU);
+  
+  const unsigned NumArgRegs = ArgLocs.size();
  
-  // Accumulate how many bytes are to be pushed on the stack, including the
-  // linkage area, and parameter passing area.  According to the SPU ABI,
-  // we minimally need space for [LR] and [SP]
-  unsigned NumStackBytes = SPUFrameInfo::minStackSize();
+
+  // Handy pointer type
+  EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
  
    // Set up a copy of the stack pointer for use loading and storing any
    // arguments that may not fit in the registers available for argument
@@ -1129,37 +1194,24 @@ LowerCALL(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) {
    // And the arguments passed on the stack
    SmallVector<SDValue, 8> MemOpChains;
  
-  for (unsigned i = 0; i != NumOps; ++i) {
-    SDValue Arg = TheCall->getArg(i);
+  for (; ArgRegIdx != NumOps; ++ArgRegIdx) {
+    SDValue Arg = OutVals[ArgRegIdx];
+    CCValAssign &VA = ArgLocs[ArgRegIdx];
  
      // PtrOff will be used to store the current argument to the stack if a
      // register cannot be found for it.
      SDValue PtrOff = DAG.getConstant(ArgOffset, StackPtr.getValueType());
      PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, PtrOff);
  
-    switch (Arg.getValueType().getSimpleVT()) {
+    switch (Arg.getValueType().getSimpleVT().SimpleTy) {
      default: llvm_unreachable("Unexpected ValueType for argument!");
      case MVT::i8:
      case MVT::i16:
      case MVT::i32:
      case MVT::i64:
      case MVT::i128:
-      if (ArgRegIdx != NumArgRegs) {
-        RegsToPass.push_back(std::make_pair(ArgRegs[ArgRegIdx++], Arg));
-      } else {
-        MemOpChains.push_back(DAG.getStore(Chain, dl, Arg, PtrOff, NULL, 0));
-        ArgOffset += StackSlotSize;
-      }
-      break;
      case MVT::f32:
      case MVT::f64:
-      if (ArgRegIdx != NumArgRegs) {
-        RegsToPass.push_back(std::make_pair(ArgRegs[ArgRegIdx++], Arg));
-      } else {
-        MemOpChains.push_back(DAG.getStore(Chain, dl, Arg, PtrOff, NULL, 0));
-        ArgOffset += StackSlotSize;
-      }
-      break;
      case MVT::v2i64:
      case MVT::v2f64:
      case MVT::v4f32:
@@ -1167,17 +1219,23 @@ LowerCALL(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) {
      case MVT::v8i16:
      case MVT::v16i8:
        if (ArgRegIdx != NumArgRegs) {
-        RegsToPass.push_back(std::make_pair(ArgRegs[ArgRegIdx++], Arg));
+        RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
        } else {
-        MemOpChains.push_back(DAG.getStore(Chain, dl, Arg, PtrOff, NULL, 0));
+        MemOpChains.push_back(DAG.getStore(Chain, dl, Arg, PtrOff,
+                                           MachinePointerInfo(),
+                                           false, false, 0));
          ArgOffset += StackSlotSize;
        }
        break;
      }
    }
  
-  // Update number of stack bytes actually used, insert a call sequence start
-  NumStackBytes = (ArgOffset - SPUFrameInfo::minStackSize());
+  // Accumulate how many bytes are to be pushed on the stack, including the
+  // linkage area, and parameter passing area.  According to the SPU ABI,
+  // we minimally need space for [LR] and [SP].
+  unsigned NumStackBytes = ArgOffset - SPUFrameInfo::minStackSize();
+
+  // Insert a call sequence start
    Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumStackBytes,
                                                              true));
  
@@ -1203,10 +1261,10 @@ LowerCALL(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) {
    // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
    // node so that legalize doesn't hack it.
    if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
-    GlobalValue *GV = G->getGlobal();
-    MVT CalleeVT = Callee.getValueType();
+    const GlobalValue *GV = G->getGlobal();
+    EVT CalleeVT = Callee.getValueType();
      SDValue Zero = DAG.getConstant(0, PtrVT);
-    SDValue GA = DAG.getTargetGlobalAddress(GV, CalleeVT);
+    SDValue GA = DAG.getTargetGlobalAddress(GV, dl, CalleeVT);
  
      if (!ST->usingLargeMem()) {
        // Turn calls to targets that are defined (i.e., have bodies) into BRSL
@@ -1228,7 +1286,7 @@ LowerCALL(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) {
        Callee = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, GA, Zero);
      }
    } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
-    MVT CalleeVT = Callee.getValueType();
+    EVT CalleeVT = Callee.getValueType();
      SDValue Zero = DAG.getConstant(0, PtrVT);
      SDValue ExtSym = DAG.getTargetExternalSymbol(S->getSymbol(),
          Callee.getValueType());
@@ -1262,82 +1320,45 @@ LowerCALL(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) {
  
    Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumStackBytes, true),
                               DAG.getIntPtrConstant(0, true), InFlag);
-  if (TheCall->getValueType(0) != MVT::Other)
+  if (!Ins.empty())
      InFlag = Chain.getValue(1);
  
-  SDValue ResultVals[3];
-  unsigned NumResults = 0;
-
-  // If the call has results, copy the values out of the ret val registers.
-  switch (TheCall->getValueType(0).getSimpleVT()) {
-  default: llvm_unreachable("Unexpected ret value!");
-  case MVT::Other: break;
-  case MVT::i32:
-    if (TheCall->getValueType(1) == MVT::i32) {
-      Chain = DAG.getCopyFromReg(Chain, dl, SPU::R4,
-                                 MVT::i32, InFlag).getValue(1);
-      ResultVals[0] = Chain.getValue(0);
-      Chain = DAG.getCopyFromReg(Chain, dl, SPU::R3, MVT::i32,
-                                 Chain.getValue(2)).getValue(1);
-      ResultVals[1] = Chain.getValue(0);
-      NumResults = 2;
-    } else {
-      Chain = DAG.getCopyFromReg(Chain, dl, SPU::R3, MVT::i32,
-                                 InFlag).getValue(1);
-      ResultVals[0] = Chain.getValue(0);
-      NumResults = 1;
-    }
-    break;
-  case MVT::i64:
-    Chain = DAG.getCopyFromReg(Chain, dl, SPU::R3, MVT::i64,
-                               InFlag).getValue(1);
-    ResultVals[0] = Chain.getValue(0);
-    NumResults = 1;
-    break;
-  case MVT::i128:
-    Chain = DAG.getCopyFromReg(Chain, dl, SPU::R3, MVT::i128,
-                               InFlag).getValue(1);
-    ResultVals[0] = Chain.getValue(0);
-    NumResults = 1;
-    break;
-  case MVT::f32:
-  case MVT::f64:
-    Chain = DAG.getCopyFromReg(Chain, dl, SPU::R3, TheCall->getValueType(0),
-                               InFlag).getValue(1);
-    ResultVals[0] = Chain.getValue(0);
-    NumResults = 1;
-    break;
-  case MVT::v2f64:
-  case MVT::v2i64:
-  case MVT::v4f32:
-  case MVT::v4i32:
-  case MVT::v8i16:
-  case MVT::v16i8:
-    Chain = DAG.getCopyFromReg(Chain, dl, SPU::R3, TheCall->getValueType(0),
-                                   InFlag).getValue(1);
-    ResultVals[0] = Chain.getValue(0);
-    NumResults = 1;
-    break;
-  }
-
    // If the function returns void, just return the chain.
-  if (NumResults == 0)
+  if (Ins.empty())
      return Chain;
  
-  // Otherwise, merge everything together with a MERGE_VALUES node.
-  ResultVals[NumResults++] = Chain;
-  SDValue Res = DAG.getMergeValues(ResultVals, NumResults, dl);
-  return Res.getValue(Op.getResNo());
+  // Now handle the return value(s)
+  SmallVector<CCValAssign, 16> RVLocs;
+  CCState CCRetInfo(CallConv, isVarArg, getTargetMachine(),
+                    RVLocs, *DAG.getContext());
+  CCRetInfo.AnalyzeCallResult(Ins, CCC_SPU);
+
+
+  // If the call has results, copy the values out of the ret val registers.
+  for (unsigned i = 0; i != RVLocs.size(); ++i) {
+    CCValAssign VA = RVLocs[i];
+    
+    SDValue Val = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), VA.getLocVT(),
+                                     InFlag);
+    Chain = Val.getValue(1);
+    InFlag = Val.getValue(2);
+    InVals.push_back(Val);
+   }
+
+  return Chain;
  }
  
-static SDValue
-LowerRET(SDValue Op, SelectionDAG &DAG, TargetMachine &TM) {
+SDValue
+SPUTargetLowering::LowerReturn(SDValue Chain,
+                               CallingConv::ID CallConv, bool isVarArg,
+                               const SmallVectorImpl<ISD::OutputArg> &Outs,
+                               const SmallVectorImpl<SDValue> &OutVals,
+                               DebugLoc dl, SelectionDAG &DAG) const {
+
    SmallVector<CCValAssign, 16> RVLocs;
-  unsigned CC = DAG.getMachineFunction().getFunction()->getCallingConv();
-  bool isVarArg = DAG.getMachineFunction().getFunction()->isVarArg();
-  DebugLoc dl = Op.getDebugLoc();
-  CCState CCInfo(CC, isVarArg, TM, RVLocs, DAG.getContext());
-  CCInfo.AnalyzeReturn(Op.getNode(), RetCC_SPU);
+  CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
+                 RVLocs, *DAG.getContext());
+  CCInfo.AnalyzeReturn(Outs, RetCC_SPU);
  
    // If this is the first return lowered for this function, add the regs to the
    // liveout set for the function.
@@ -1346,7 +1367,6 @@ LowerRET(SDValue Op, SelectionDAG &DAG, TargetMachine &TM) {
        DAG.getMachineFunction().getRegInfo().addLiveOut(RVLocs[i].getLocReg());
    }
  
-  SDValue Chain = Op.getOperand(0);
    SDValue Flag;
  
    // Copy the result values into the output registers.
@@ -1354,7 +1374,7 @@ LowerRET(SDValue Op, SelectionDAG &DAG, TargetMachine &TM) {
      CCValAssign &VA = RVLocs[i];
      assert(VA.isRegLoc() && "Can only return in registers!");
      Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
-                             Op.getOperand(i*2+1), Flag);
+                             OutVals[i], Flag);
      Flag = Chain.getValue(1);
    }
  
@@ -1395,7 +1415,7 @@ getVecImm(SDNode *N) {
  /// and the value fits into an unsigned 18-bit constant, and if so, return the
  /// constant
  SDValue SPU::get_vec_u18imm(SDNode *N, SelectionDAG &DAG,
-                              MVT ValueType) {
+                              EVT ValueType) {
    if (ConstantSDNode *CN = getVecImm(N)) {
      uint64_t Value = CN->getZExtValue();
      if (ValueType == MVT::i64) {
@@ -1417,7 +1437,7 @@ SDValue SPU::get_vec_u18imm(SDNode *N, SelectionDAG &DAG,
  /// and the value fits into a signed 16-bit constant, and if so, return the
  /// constant
  SDValue SPU::get_vec_i16imm(SDNode *N, SelectionDAG &DAG,
-                              MVT ValueType) {
+                              EVT ValueType) {
    if (ConstantSDNode *CN = getVecImm(N)) {
      int64_t Value = CN->getSExtValue();
      if (ValueType == MVT::i64) {
@@ -1440,7 +1460,7 @@ SDValue SPU::get_vec_i16imm(SDNode *N, SelectionDAG &DAG,
  /// and the value fits into a signed 10-bit constant, and if so, return the
  /// constant
  SDValue SPU::get_vec_i10imm(SDNode *N, SelectionDAG &DAG,
-                              MVT ValueType) {
+                              EVT ValueType) {
    if (ConstantSDNode *CN = getVecImm(N)) {
      int64_t Value = CN->getSExtValue();
      if (ValueType == MVT::i64) {
@@ -1451,7 +1471,7 @@ SDValue SPU::get_vec_i10imm(SDNode *N, SelectionDAG &DAG,
          return SDValue();
        Value = Value >> 32;
      }
-    if (isS10Constant(Value))
+    if (isInt<10>(Value))
        return DAG.getTargetConstant(Value, ValueType);
    }
  
@@ -1466,7 +1486,7 @@ SDValue SPU::get_vec_i10imm(SDNode *N, SelectionDAG &DAG,
  /// constant vectors. Thus, we test to see if the upper and lower bytes are the
  /// same value.
  SDValue SPU::get_vec_i8imm(SDNode *N, SelectionDAG &DAG,
-                             MVT ValueType) {
+                             EVT ValueType) {
    if (ConstantSDNode *CN = getVecImm(N)) {
      int Value = (int) CN->getZExtValue();
      if (ValueType == MVT::i16
@@ -1485,7 +1505,7 @@ SDValue SPU::get_vec_i8imm(SDNode *N, SelectionDAG &DAG,
  /// and the value fits into a signed 16-bit constant, and if so, return the
  /// constant
  SDValue SPU::get_ILHUvec_imm(SDNode *N, SelectionDAG &DAG,
-                               MVT ValueType) {
+                               EVT ValueType) {
    if (ConstantSDNode *CN = getVecImm(N)) {
      uint64_t Value = CN->getZExtValue();
      if ((ValueType == MVT::i32
@@ -1516,10 +1536,10 @@ SDValue SPU::get_v2i64_imm(SDNode *N, SelectionDAG &DAG) {
  }
  
  //! Lower a BUILD_VECTOR instruction creatively:
-SDValue
+static SDValue
  LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) {
-  MVT VT = Op.getValueType();
-  MVT EltVT = VT.getVectorElementType();
+  EVT VT = Op.getValueType();
+  EVT EltVT = VT.getVectorElementType();
    DebugLoc dl = Op.getDebugLoc();
    BuildVectorSDNode *BCN = dyn_cast<BuildVectorSDNode>(Op.getNode());
    assert(BCN != 0 && "Expected BuildVectorSDNode in SPU LowerBUILD_VECTOR");
@@ -1539,15 +1559,11 @@ LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) {
  
    uint64_t SplatBits = APSplatBits.getZExtValue();
  
-  switch (VT.getSimpleVT()) {
-  default: {
-    std::string msg;
-    raw_string_ostream Msg(msg);
-    Msg << "CellSPU: Unhandled VT in LowerBUILD_VECTOR, VT = "
-         << VT.getMVTString();
-    llvm_report_error(Msg.str());
+  switch (VT.getSimpleVT().SimpleTy) {
+  default:
+    report_fatal_error("CellSPU: Unhandled VT in LowerBUILD_VECTOR, VT = " +
+                       Twine(VT.getEVTString()));
      /*NOTREACHED*/
-  }
    case MVT::v4f32: {
      uint32_t Value32 = uint32_t(SplatBits);
      assert(SplatBitSize == 32
@@ -1589,10 +1605,6 @@ LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) {
      SDValue T = DAG.getConstant(unsigned(SplatBits), VT.getVectorElementType());
      return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, T, T, T, T);
    }
-  case MVT::v2i32: {
-    SDValue T = DAG.getConstant(unsigned(SplatBits), VT.getVectorElementType());
-    return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, T, T);
-  }
    case MVT::v2i64: {
      return SPU::LowerV2I64Splat(VT, DAG, SplatBits, dl);
    }
@@ -1604,7 +1616,7 @@ LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) {
  /*!
   */
  SDValue
-SPU::LowerV2I64Splat(MVT OpVT, SelectionDAG& DAG, uint64_t SplatVal,
+SPU::LowerV2I64Splat(EVT OpVT, SelectionDAG& DAG, uint64_t SplatVal,
                       DebugLoc dl) {
    uint32_t upper = uint32_t(SplatVal >> 32);
    uint32_t lower = uint32_t(SplatVal);
@@ -1716,27 +1728,33 @@ static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) {
  
    // If we have a single element being moved from V1 to V2, this can be handled
    // using the C*[DX] compute mask instructions, but the vector elements have
-  // to be monotonically increasing with one exception element.
-  MVT VecVT = V1.getValueType();
-  MVT EltVT = VecVT.getVectorElementType();
+  // to be monotonically increasing with one exception element, and the source
+  // slot of the element to move must be the same as the destination.
+  EVT VecVT = V1.getValueType();
+  EVT EltVT = VecVT.getVectorElementType();
    unsigned EltsFromV2 = 0;
-  unsigned V2Elt = 0;
+  unsigned V2EltOffset = 0;
    unsigned V2EltIdx0 = 0;
    unsigned CurrElt = 0;
    unsigned MaxElts = VecVT.getVectorNumElements();
    unsigned PrevElt = 0;
-  unsigned V0Elt = 0;
    bool monotonic = true;
    bool rotate = true;
+  int rotamt=0;
+  EVT maskVT;             // which of the c?d instructions to use
  
    if (EltVT == MVT::i8) {
      V2EltIdx0 = 16;
+    maskVT = MVT::v16i8; 
    } else if (EltVT == MVT::i16) {
      V2EltIdx0 = 8;
+    maskVT = MVT::v8i16;
    } else if (EltVT == MVT::i32 || EltVT == MVT::f32) {
      V2EltIdx0 = 4;
+    maskVT = MVT::v4i32;
    } else if (EltVT == MVT::i64 || EltVT == MVT::f64) {
      V2EltIdx0 = 2;
+    maskVT = MVT::v2i64;
    } else
      llvm_unreachable("Unhandled vector type in LowerVECTOR_SHUFFLE");
  
@@ -1748,9 +1766,13 @@ static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) {
  
      if (monotonic) {
        if (SrcElt >= V2EltIdx0) {
-        if (1 >= (++EltsFromV2)) {
-          V2Elt = (V2EltIdx0 - SrcElt) << 2;
-        }
+        // TODO: optimize for the monotonic case when several consecutive
+        // elements are taken from V2. Do we ever get such a case?
+        if (EltsFromV2 == 0 && CurrElt == (SrcElt - V2EltIdx0))
+          V2EltOffset = (SrcElt - V2EltIdx0) * (EltVT.getSizeInBits()/8);
+        else
+          monotonic = false;
+        ++EltsFromV2;
        } else if (CurrElt != SrcElt) {
          monotonic = false;
        }
@@ -1762,14 +1784,13 @@ static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) {
        if (PrevElt > 0 && SrcElt < MaxElts) {
          if ((PrevElt == SrcElt - 1)
              || (PrevElt == MaxElts - 1 && SrcElt == 0)) {
+          rotamt = SrcElt-i;
            PrevElt = SrcElt;
-          if (SrcElt == 0)
-            V0Elt = i;
          } else {
            rotate = false;
          }
-      } else if (PrevElt == 0) {
-        // First time through, need to keep track of previous element
+      } else if (i == 0 || (PrevElt==0 && SrcElt==1)) {
+        // First time or after a "wrap around"
          PrevElt = SrcElt;
        } else {
          // This isn't a rotation, takes elements from vector 2
@@ -1780,24 +1801,23 @@ static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) {
  
    if (EltsFromV2 == 1 && monotonic) {
      // Compute mask and shuffle
-    MachineFunction &MF = DAG.getMachineFunction();
-    MachineRegisterInfo &RegInfo = MF.getRegInfo();
-    unsigned VReg = RegInfo.createVirtualRegister(&SPU::R32CRegClass);
-    MVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
-    // Initialize temporary register to 0
-    SDValue InitTempReg =
-      DAG.getCopyToReg(DAG.getEntryNode(), dl, VReg, DAG.getConstant(0, PtrVT));
-    // Copy register's contents as index in SHUFFLE_MASK:
-    SDValue ShufMaskOp =
-      DAG.getNode(SPUISD::SHUFFLE_MASK, dl, MVT::v4i32,
-                  DAG.getTargetConstant(V2Elt, MVT::i32),
-                  DAG.getCopyFromReg(InitTempReg, dl, VReg, PtrVT));
+    EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
+
+    // As SHUFFLE_MASK becomes a c?d instruction, feed it an address
+    // R1 ($sp) is used only because its low bits are guaranteed to be zero
+    SDValue Pointer = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT,
+                                DAG.getRegister(SPU::R1, PtrVT),
+                                DAG.getConstant(V2EltOffset, MVT::i32));
+    SDValue ShufMaskOp = DAG.getNode(SPUISD::SHUFFLE_MASK, dl, 
+                                     maskVT, Pointer);
+
      // Use shuffle mask in SHUFB synthetic instruction:
      return DAG.getNode(SPUISD::SHUFB, dl, V1.getValueType(), V2, V1,
                         ShufMaskOp);
    } else if (rotate) {
-    int rotamt = (MaxElts - V0Elt) * EltVT.getSizeInBits()/8;
-
+    if (rotamt < 0)
+      rotamt +=MaxElts;
+    rotamt *= EltVT.getSizeInBits()/8;
      return DAG.getNode(SPUISD::ROTBYTES_LEFT, dl, V1.getValueType(),
                         V1, DAG.getConstant(rotamt, MVT::i16));
    } else {
@@ -1812,7 +1832,6 @@ static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) {
        for (unsigned j = 0; j < BytesPerElement; ++j)
          ResultMask.push_back(DAG.getConstant(SrcElt*BytesPerElement+j,MVT::i8));
      }
-
      SDValue VPermMask = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i8,
                                      &ResultMask[0], ResultMask.size());
      return DAG.getNode(SPUISD::SHUFB, dl, V1.getValueType(), V1, V2, VPermMask);
@@ -1829,11 +1848,11 @@ static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) {
  
      ConstantSDNode *CN = cast<ConstantSDNode>(Op0.getNode());
      SmallVector<SDValue, 16> ConstVecValues;
-    MVT VT;
+    EVT VT;
      size_t n_copies;
  
      // Create a constant vector:
-    switch (Op.getValueType().getSimpleVT()) {
+    switch (Op.getValueType().getSimpleVT().SimpleTy) {
      default: llvm_unreachable("Unexpected constant value type in "
                                "LowerSCALAR_TO_VECTOR");
      case MVT::v16i8: n_copies = 16; VT = MVT::i8; break;
@@ -1852,7 +1871,7 @@ static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) {
                         &ConstVecValues[0], ConstVecValues.size());
    } else {
      // Otherwise, copy the value from one register to another:
-    switch (Op0.getValueType().getSimpleVT()) {
+    switch (Op0.getValueType().getSimpleVT().SimpleTy) {
      default: llvm_unreachable("Unexpected value type in LowerSCALAR_TO_VECTOR");
      case MVT::i8:
      case MVT::i16:
@@ -1868,7 +1887,7 @@ static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) {
  }
  
  static SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) {
-  MVT VT = Op.getValueType();
+  EVT VT = Op.getValueType();
    SDValue N = Op.getOperand(0);
    SDValue Elt = Op.getOperand(1);
    DebugLoc dl = Op.getDebugLoc();
@@ -1897,7 +1916,7 @@ static SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) {
      int prefslot_begin = -1, prefslot_end = -1;
      int elt_byte = EltNo * VT.getSizeInBits() / 8;
  
-    switch (VT.getSimpleVT()) {
+    switch (VT.getSimpleVT().SimpleTy) {
      default:
        assert(false && "Invalid value type!");
      case MVT::i8: {
@@ -1923,7 +1942,9 @@ static SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) {
      assert(prefslot_begin != -1 && prefslot_end != -1 &&
             "LowerEXTRACT_VECTOR_ELT: preferred slots uninitialized");
  
-    unsigned int ShufBytes[16];
+    unsigned int ShufBytes[16] = {
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+    };
      for (int i = 0; i < 16; ++i) {
        // zero fill uppper part of preferred slot, don't care about the
        // other slots:
@@ -1959,9 +1980,9 @@ static SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) {
    } else {
      // Variable index: Rotate the requested element into slot 0, then replicate
      // slot 0 across the vector
-    MVT VecVT = N.getValueType();
-    if (!VecVT.isSimple() || !VecVT.isVector() || !VecVT.is128BitVector()) {
-      llvm_report_error("LowerEXTRACT_VECTOR_ELT: Must have a simple, 128-bit"
+    EVT VecVT = N.getValueType();
+    if (!VecVT.isSimple() || !VecVT.isVector()) {
+      report_fatal_error("LowerEXTRACT_VECTOR_ELT: Must have a simple, 128-bit"
                          "vector type!");
      }
  
@@ -1987,9 +2008,9 @@ static SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) {
      // consistency with the notion of a unified register set)
      SDValue replicate;
  
-    switch (VT.getSimpleVT()) {
+    switch (VT.getSimpleVT().SimpleTy) {
      default:
-      llvm_report_error("LowerEXTRACT_VECTOR_ELT(varable): Unhandled vector"
+      report_fatal_error("LowerEXTRACT_VECTOR_ELT(varable): Unhandled vector"
                          "type");
        /*NOTREACHED*/
      case MVT::i8: {
@@ -2034,17 +2055,26 @@ static SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) {
    SDValue ValOp = Op.getOperand(1);
    SDValue IdxOp = Op.getOperand(2);
    DebugLoc dl = Op.getDebugLoc();
-  MVT VT = Op.getValueType();
+  EVT VT = Op.getValueType();
+  EVT eltVT = ValOp.getValueType();
  
-  ConstantSDNode *CN = cast<ConstantSDNode>(IdxOp);
-  assert(CN != 0 && "LowerINSERT_VECTOR_ELT: Index is not constant!");
+  // use 0 when the lane to insert to is 'undef'
+  int64_t Offset=0;
+  if (IdxOp.getOpcode() != ISD::UNDEF) {
+    ConstantSDNode *CN = cast<ConstantSDNode>(IdxOp);
+    assert(CN != 0 && "LowerINSERT_VECTOR_ELT: Index is not constant!");
+    Offset = (CN->getSExtValue()) * eltVT.getSizeInBits()/8;
+  }
  
-  MVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
+  EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
    // Use $sp ($1) because it's always 16-byte aligned and it's available:
    SDValue Pointer = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT,
                                  DAG.getRegister(SPU::R1, PtrVT),
-                                DAG.getConstant(CN->getSExtValue(), PtrVT));
-  SDValue ShufMask = DAG.getNode(SPUISD::SHUFFLE_MASK, dl, VT, Pointer);
+                                DAG.getConstant(Offset, PtrVT));
+  // widen the mask when dealing with half vectors
+  EVT maskVT = EVT::getVectorVT(*(DAG.getContext()), VT.getVectorElementType(), 
+                                128/ VT.getVectorElementType().getSizeInBits());
+  SDValue ShufMask = DAG.getNode(SPUISD::SHUFFLE_MASK, dl, maskVT, Pointer);
  
    SDValue result =
      DAG.getNode(SPUISD::SHUFB, dl, VT,
@@ -2060,7 +2090,7 @@ static SDValue LowerI8Math(SDValue Op, SelectionDAG &DAG, unsigned Opc,
  {
    SDValue N0 = Op.getOperand(0);      // Everything has at least one operand
    DebugLoc dl = Op.getDebugLoc();
-  MVT ShiftVT = TLI.getShiftAmountTy();
+  EVT ShiftVT = TLI.getShiftAmountTy();
  
    assert(Op.getValueType() == MVT::i8);
    switch (Opc) {
@@ -2091,7 +2121,7 @@ static SDValue LowerI8Math(SDValue Op, SelectionDAG &DAG, unsigned Opc,
    case ISD::ROTR:
    case ISD::ROTL: {
      SDValue N1 = Op.getOperand(1);
-    MVT N1VT = N1.getValueType();
+    EVT N1VT = N1.getValueType();
  
      N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, N0);
      if (!N1VT.bitsEq(ShiftVT)) {
@@ -2114,7 +2144,7 @@ static SDValue LowerI8Math(SDValue Op, SelectionDAG &DAG, unsigned Opc,
    case ISD::SRL:
    case ISD::SHL: {
      SDValue N1 = Op.getOperand(1);
-    MVT N1VT = N1.getValueType();
+    EVT N1VT = N1.getValueType();
  
      N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, N0);
      if (!N1VT.bitsEq(ShiftVT)) {
@@ -2131,7 +2161,7 @@ static SDValue LowerI8Math(SDValue Op, SelectionDAG &DAG, unsigned Opc,
    }
    case ISD::SRA: {
      SDValue N1 = Op.getOperand(1);
-    MVT N1VT = N1.getValueType();
+    EVT N1VT = N1.getValueType();
  
      N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i16, N0);
      if (!N1VT.bitsEq(ShiftVT)) {
@@ -2164,7 +2194,7 @@ static SDValue
  LowerByteImmed(SDValue Op, SelectionDAG &DAG) {
    SDValue ConstVec;
    SDValue Arg;
-  MVT VT = Op.getValueType();
+  EVT VT = Op.getValueType();
    DebugLoc dl = Op.getDebugLoc();
  
    ConstVec = Op.getOperand(0);
@@ -2215,11 +2245,12 @@ LowerByteImmed(SDValue Op, SelectionDAG &DAG) {
    ones per byte, which then have to be accumulated.
  */
  static SDValue LowerCTPOP(SDValue Op, SelectionDAG &DAG) {
-  MVT VT = Op.getValueType();
-  MVT vecVT = MVT::getVectorVT(VT, (128 / VT.getSizeInBits()));
+  EVT VT = Op.getValueType();
+  EVT vecVT = EVT::getVectorVT(*DAG.getContext(), 
+                               VT, (128 / VT.getSizeInBits()));
    DebugLoc dl = Op.getDebugLoc();
  
-  switch (VT.getSimpleVT()) {
+  switch (VT.getSimpleVT().SimpleTy) {
    default:
      assert(false && "Invalid value type!");
    case MVT::i8: {
@@ -2324,10 +2355,10 @@ static SDValue LowerCTPOP(SDValue Op, SelectionDAG &DAG) {
   All conversions to i64 are expanded to a libcall.
   */
  static SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG,
-                              SPUTargetLowering &TLI) {
-  MVT OpVT = Op.getValueType();
+                              const SPUTargetLowering &TLI) {
+  EVT OpVT = Op.getValueType();
    SDValue Op0 = Op.getOperand(0);
-  MVT Op0VT = Op0.getValueType();
+  EVT Op0VT = Op0.getValueType();
  
    if ((OpVT == MVT::i32 && Op0VT == MVT::f64)
        || OpVT == MVT::i64) {
@@ -2350,10 +2381,10 @@ static SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG,
   All conversions from i64 are expanded to a libcall.
   */
  static SDValue LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG,
-                              SPUTargetLowering &TLI) {
-  MVT OpVT = Op.getValueType();
+                              const SPUTargetLowering &TLI) {
+  EVT OpVT = Op.getValueType();
    SDValue Op0 = Op.getOperand(0);
-  MVT Op0VT = Op0.getValueType();
+  EVT Op0VT = Op0.getValueType();
  
    if ((OpVT == MVT::f64 && Op0VT == MVT::i32)
        || Op0VT == MVT::i64) {
@@ -2382,12 +2413,12 @@ static SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG,
  
    SDValue lhs = Op.getOperand(0);
    SDValue rhs = Op.getOperand(1);
-  MVT lhsVT = lhs.getValueType();
+  EVT lhsVT = lhs.getValueType();
    assert(lhsVT == MVT::f64 && "LowerSETCC: type other than MVT::64\n");
  
-  MVT ccResultVT = TLI.getSetCCResultType(lhs.getValueType());
+  EVT ccResultVT = TLI.getSetCCResultType(lhs.getValueType());
    APInt ccResultOnes = APInt::getAllOnesValue(ccResultVT.getSizeInBits());
-  MVT IntVT(MVT::i64);
+  EVT IntVT(MVT::i64);
  
    // Take advantage of the fact that (truncate (sra arg, 32)) is efficiently
    // selected to a NOP:
@@ -2471,7 +2502,7 @@ static SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG,
    case ISD::SETONE:
      compareOp = ISD::SETNE; break;
    default:
-    llvm_report_error("CellSPU ISel Select: unimplemented f64 condition");
+    report_fatal_error("CellSPU ISel Select: unimplemented f64 condition");
    }
  
    SDValue result =
@@ -2508,7 +2539,7 @@ static SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG,
  
  static SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG,
                                const TargetLowering &TLI) {
-  MVT VT = Op.getValueType();
+  EVT VT = Op.getValueType();
    SDValue lhs = Op.getOperand(0);
    SDValue rhs = Op.getOperand(1);
    SDValue trueval = Op.getOperand(2);
@@ -2537,16 +2568,17 @@ static SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG,
  static SDValue LowerTRUNCATE(SDValue Op, SelectionDAG &DAG)
  {
    // Type to truncate to
-  MVT VT = Op.getValueType();
-  MVT::SimpleValueType simpleVT = VT.getSimpleVT();
-  MVT VecVT = MVT::getVectorVT(VT, (128 / VT.getSizeInBits()));
+  EVT VT = Op.getValueType();
+  MVT simpleVT = VT.getSimpleVT();
+  EVT VecVT = EVT::getVectorVT(*DAG.getContext(), 
+                               VT, (128 / VT.getSizeInBits()));
    DebugLoc dl = Op.getDebugLoc();
  
    // Type to truncate from
    SDValue Op0 = Op.getOperand(0);
-  MVT Op0VT = Op0.getValueType();
+  EVT Op0VT = Op0.getValueType();
  
-  if (Op0VT.getSimpleVT() == MVT::i128 && simpleVT == MVT::i64) {
+  if (Op0VT == MVT::i128 && simpleVT == MVT::i64) {
      // Create shuffle mask, least significant doubleword of quadword
      unsigned maskHigh = 0x08090a0b;
      unsigned maskLow = 0x0c0d0e0f;
@@ -2566,23 +2598,81 @@ static SDValue LowerTRUNCATE(SDValue Op, SelectionDAG &DAG)
    return SDValue();             // Leave the truncate unmolested
  }
  
+/*!
+ * Emit the instruction sequence for i64/i32 -> i128 sign extend. The basic
+ * algorithm is to duplicate the sign bit using rotmai to generate at
+ * least one byte full of sign bits. Then propagate the "sign-byte" into
+ * the leftmost words and the i64/i32 into the rightmost words using shufb.
+ *
+ * @param Op The sign-extend node; operand 0 is the i64/i32 value to extend
+ * @param DAG The current DAG
+ * @return The i128 BIT_CONVERT node that tops the emitted sequence
+ */
+static SDValue LowerSIGN_EXTEND(SDValue Op, SelectionDAG &DAG)
+{
+  DebugLoc dl = Op.getDebugLoc();
+
+  // Type to extend to
+  MVT OpVT = Op.getValueType().getSimpleVT();
+
+  // Type to extend from
+  SDValue Op0 = Op.getOperand(0);
+  MVT Op0VT = Op0.getValueType().getSimpleVT();
+
+  // The type to extend to needs to be an i128 and
+  // the type to extend from needs to be i64 or i32.
+  assert((OpVT == MVT::i128 && (Op0VT == MVT::i64 || Op0VT == MVT::i32)) &&
+          "LowerSIGN_EXTEND: input and/or output operand have wrong size");
+
+  // Create shuffle mask (four 32-bit control words, passed to SHUFB below)
+  unsigned mask1 = 0x10101010; // byte 0 - 3 and 4 - 7
+  unsigned mask2 = Op0VT == MVT::i64 ? 0x00010203 : 0x10101010; // byte  8 - 11
+  unsigned mask3 = Op0VT == MVT::i64 ? 0x04050607 : 0x00010203; // byte 12 - 15
+  SDValue shufMask = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
+                                 DAG.getConstant(mask1, MVT::i32),
+                                 DAG.getConstant(mask1, MVT::i32),
+                                 DAG.getConstant(mask2, MVT::i32),
+                                 DAG.getConstant(mask3, MVT::i32));
+
+  // Word wise arithmetic right shift (by 31) to generate at least one byte
+  // that contains sign bits.
+  MVT mvt = Op0VT == MVT::i64 ? MVT::v2i64 : MVT::v4i32;
+  SDValue sraVal = DAG.getNode(ISD::SRA,
+                 dl,
+                 mvt,
+                 DAG.getNode(SPUISD::PREFSLOT2VEC, dl, mvt, Op0, Op0),
+                 DAG.getConstant(31, MVT::i32));
+
+  // Reinterpret as an i128 via a copy to GPRC (SHUFB requires it). Lowered away.
+  SDValue extended = SDValue(DAG.getMachineNode(TargetOpcode::COPY_TO_REGCLASS, 
+                                        dl, Op0VT, Op0,
+                                        DAG.getTargetConstant(
+                                                  SPU::GPRCRegClass.getID(), 
+                                                  MVT::i32)), 0);
+  // Shuffle bytes - Copy the sign bits into the upper bytes (64 bits for i64,
+  // 96 bits for i32) and the input value into the remaining low bytes.
+  SDValue extShuffle = DAG.getNode(SPUISD::SHUFB, dl, mvt,
+        extended, sraVal, shufMask);
+  return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i128, extShuffle);
+}
+
  //! Custom (target-specific) lowering entry point
  /*!
    This is where LLVM's DAG selection process calls to do target-specific
    lowering of nodes.
   */
  SDValue
-SPUTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG)
+SPUTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const
  {
    unsigned Opc = (unsigned) Op.getOpcode();
-  MVT VT = Op.getValueType();
+  EVT VT = Op.getValueType();
  
    switch (Opc) {
    default: {
  #ifndef NDEBUG
-    cerr << "SPUTargetLowering::LowerOperation(): need to lower this!\n";
-    cerr << "Op.getOpcode() = " << Opc << "\n";
-    cerr << "*Op.getNode():\n";
+    errs() << "SPUTargetLowering::LowerOperation(): need to lower this!\n";
+    errs() << "Op.getOpcode() = " << Opc << "\n";
+    errs() << "*Op.getNode():\n";
      Op.getNode()->dump();
  #endif
      llvm_unreachable(0);
@@ -2602,12 +2692,6 @@ SPUTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG)
      return LowerJumpTable(Op, DAG, SPUTM.getSubtargetImpl());
    case ISD::ConstantFP:
      return LowerConstantFP(Op, DAG);
-  case ISD::FORMAL_ARGUMENTS:
-    return LowerFORMAL_ARGUMENTS(Op, DAG, VarArgsFrameIndex);
-  case ISD::CALL:
-    return LowerCALL(Op, DAG, SPUTM.getSubtargetImpl());
-  case ISD::RET:
-    return LowerRET(Op, DAG, getTargetMachine());
  
    // i8, i64 math ops:
    case ISD::ADD:
@@ -2664,6 +2748,9 @@ SPUTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG)
  
    case ISD::TRUNCATE:
      return LowerTRUNCATE(Op, DAG);
+
+  case ISD::SIGN_EXTEND:
+    return LowerSIGN_EXTEND(Op, DAG);
    }
  
    return SDValue();
@@ -2671,17 +2758,17 @@ SPUTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG)
  
  void SPUTargetLowering::ReplaceNodeResults(SDNode *N,
                                             SmallVectorImpl<SDValue>&Results,
-                                           SelectionDAG &DAG)
+                                           SelectionDAG &DAG) const
  {
  #if 0
    unsigned Opc = (unsigned) N->getOpcode();
-  MVT OpVT = N->getValueType(0);
+  EVT OpVT = N->getValueType(0);
  
    switch (Opc) {
    default: {
-    cerr << "SPUTargetLowering::ReplaceNodeResults(): need to fix this!\n";
-    cerr << "Op.getOpcode() = " << Opc << "\n";
-    cerr << "*Op.getNode():\n";
+    errs() << "SPUTargetLowering::ReplaceNodeResults(): need to fix this!\n";
+    errs() << "Op.getOpcode() = " << Opc << "\n";
+    errs() << "*Op.getNode():\n";
      N->dump();
      abort();
      /*NOTREACHED*/
@@ -2705,8 +2792,8 @@ SPUTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const
    const SPUSubtarget *ST = SPUTM.getSubtargetImpl();
    SelectionDAG &DAG = DCI.DAG;
    SDValue Op0 = N->getOperand(0);       // everything has at least one operand
-  MVT NodeVT = N->getValueType(0);      // The node's value type
-  MVT Op0VT = Op0.getValueType();       // The first operand's result
+  EVT NodeVT = N->getValueType(0);      // The node's value type
+  EVT Op0VT = Op0.getValueType();       // The first operand's result
    SDValue Result;                       // Initially, empty result
    DebugLoc dl = N->getDebugLoc();
  
@@ -2735,7 +2822,7 @@ SPUTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const
  
  #if !defined(NDEBUG)
            if (DebugFlag && isCurrentDebugType(DEBUG_TYPE)) {
-            cerr << "\n"
+            errs() << "\n"
                   << "Replace: (add (SPUindirect <arg>, <arg>), 0)\n"
                   << "With:    (SPUindirect <arg>, <arg>)\n";
            }
@@ -2751,7 +2838,7 @@ SPUTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const
  
  #if !defined(NDEBUG)
            if (DebugFlag && isCurrentDebugType(DEBUG_TYPE)) {
-            cerr << "\n"
+            errs() << "\n"
                   << "Replace: (add (SPUindirect <arg>, " << CN1->getSExtValue()
                   << "), " << CN0->getSExtValue() << ")\n"
                   << "With:    (SPUindirect <arg>, "
@@ -2775,11 +2862,11 @@ SPUTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const
        // Types must match, however...
  #if !defined(NDEBUG)
        if (DebugFlag && isCurrentDebugType(DEBUG_TYPE)) {
-        cerr << "\nReplace: ";
+        errs() << "\nReplace: ";
          N->dump(&DAG);
-        cerr << "\nWith:    ";
+        errs() << "\nWith:    ";
          Op0.getNode()->dump(&DAG);
-        cerr << "\n";
+        errs() << "\n";
        }
  #endif
  
@@ -2790,15 +2877,15 @@ SPUTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const
    case SPUISD::IndirectAddr: {
      if (!ST->usingLargeMem() && Op0.getOpcode() == SPUISD::AFormAddr) {
        ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N->getOperand(1));
-      if (CN != 0 && CN->getZExtValue() == 0) {
+      if (CN != 0 && CN->isNullValue()) {
          // (SPUindirect (SPUaform <addr>, 0), 0) ->
          // (SPUaform <addr>, 0)
  
-        DEBUG(cerr << "Replace: ");
+        DEBUG(errs() << "Replace: ");
          DEBUG(N->dump(&DAG));
-        DEBUG(cerr << "\nWith:    ");
+        DEBUG(errs() << "\nWith:    ");
          DEBUG(Op0.getNode()->dump(&DAG));
-        DEBUG(cerr << "\n");
+        DEBUG(errs() << "\n");
  
          return Op0;
        }
@@ -2811,7 +2898,7 @@ SPUTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const
  
  #if !defined(NDEBUG)
            if (DebugFlag && isCurrentDebugType(DEBUG_TYPE)) {
-            cerr << "\n"
+            errs() << "\n"
                   << "Replace: (SPUindirect (add <arg>, <arg>), 0)\n"
                   << "With:    (SPUindirect <arg>, <arg>)\n";
            }
@@ -2826,9 +2913,6 @@ SPUTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const
    }
    case SPUISD::SHLQUAD_L_BITS:
    case SPUISD::SHLQUAD_L_BYTES:
-  case SPUISD::VEC_SHL:
-  case SPUISD::VEC_SRL:
-  case SPUISD::VEC_SRA:
    case SPUISD::ROTBYTES_LEFT: {
      SDValue Op1 = N->getOperand(1);
  
@@ -2873,11 +2957,11 @@ SPUTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const
    // Otherwise, return unchanged.
  #ifndef NDEBUG
    if (Result.getNode()) {
-    DEBUG(cerr << "\nReplace.SPU: ");
+    DEBUG(errs() << "\nReplace.SPU: ");
      DEBUG(N->dump(&DAG));
-    DEBUG(cerr << "\nWith:        ");
+    DEBUG(errs() << "\nWith:        ");
      DEBUG(Result.getNode()->dump(&DAG));
-    DEBUG(cerr << "\n");
+    DEBUG(errs() << "\n");
    }
  #endif
  
@@ -2906,9 +2990,41 @@ SPUTargetLowering::getConstraintType(const std::string &ConstraintLetter) const
    return TargetLowering::getConstraintType(ConstraintLetter);
  }
  
+/// Examine constraint type and operand type and determine a weight value.
+/// This object must already have been set up with the operand type
+/// and the current alternative constraint selected.
+TargetLowering::ConstraintWeight
+SPUTargetLowering::getSingleConstraintMatchWeight(
+    AsmOperandInfo &info, const char *constraint) const {
+  ConstraintWeight weight = CW_Invalid;
+  Value *CallOperandVal = info.CallOperandVal;
+    // If we don't have a value, we can't do a match,
+    // but allow it at the lowest weight.
+  if (CallOperandVal == NULL)
+    return CW_Default;
+  // Look at the constraint type.
+  switch (*constraint) {
+  default:
+    weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
+    break;\r
+    //FIXME: Seems like the supported constraint letters were just copied
+    // from PPC, as the following doesn't correspond to the GCC docs.
+    // I'm leaving it so until someone adds the corresponding lowering support.
+  case 'b':
+  case 'r':
+  case 'f':
+  case 'd':
+  case 'v':
+  case 'y':
+    weight = CW_Register;
+    break;
+  }
+  return weight;
+}
+
  std::pair<unsigned, const TargetRegisterClass*>
  SPUTargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
-                                                MVT VT) const
+                                                EVT VT) const
  {
    if (Constraint.size() == 1) {
      // GCC RS6000 Constraint Letters
@@ -2956,9 +3072,6 @@ SPUTargetLowering::computeMaskedBitsForTargetNode(const SDValue Op,
    case SPUISD::VEC2PREFSLOT:
    case SPUISD::SHLQUAD_L_BITS:
    case SPUISD::SHLQUAD_L_BYTES:
-  case SPUISD::VEC_SHL:
-  case SPUISD::VEC_SRL:
-  case SPUISD::VEC_SRA:
    case SPUISD::VEC_ROTL:
    case SPUISD::VEC_ROTR:
    case SPUISD::ROTBYTES_LEFT:
@@ -2976,7 +3089,7 @@ SPUTargetLowering::ComputeNumSignBitsForTargetNode(SDValue Op,
      return 1;
  
    case ISD::SETCC: {
-    MVT VT = Op.getValueType();
+    EVT VT = Op.getValueType();
  
      if (VT != MVT::i8 && VT != MVT::i16 && VT != MVT::i32) {
        VT = MVT::i32;
@@ -2990,12 +3103,10 @@ SPUTargetLowering::ComputeNumSignBitsForTargetNode(SDValue Op,
  void
  SPUTargetLowering::LowerAsmOperandForConstraint(SDValue Op,
                                                  char ConstraintLetter,
-                                                bool hasMemory,
                                                  std::vector<SDValue> &Ops,
                                                  SelectionDAG &DAG) const {
    // Default, for the time being, to the base class handler
-  TargetLowering::LowerAsmOperandForConstraint(Op, ConstraintLetter, hasMemory,
-                                               Ops, DAG);
+  TargetLowering::LowerAsmOperandForConstraint(Op, ConstraintLetter, Ops, DAG);
  }
  
  /// isLegalAddressImmediate - Return true if the integer value can be used
@@ -3015,3 +3126,29 @@ SPUTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
    // The SPU target isn't yet aware of offsets.
    return false;
  }
+
+// Can we compare against Imm directly, without first writing it to a register?
+bool SPUTargetLowering::isLegalICmpImmediate(int64_t Imm) const {
+  // ceqi, cgti, etc. all take an s10 operand, hence the isInt<10> check.
+  return isInt<10>(Imm);
+}
+
+bool 
+SPUTargetLowering::isLegalAddressingMode(const AddrMode &AM, 
+                                         const Type * ) const{
+
+  // A-form: 18bit absolute address (a global only: no reg, scale or offset).
+  if (AM.BaseGV && !AM.HasBaseReg && AM.Scale == 0 && AM.BaseOffs == 0)
+    return true;
+ 
+  // D-form: reg + 14bit offset (no global, no scale; offset must pass isInt<14>)
+  if (AM.BaseGV ==0 && AM.HasBaseReg && AM.Scale == 0 && isInt<14>(AM.BaseOffs))
+    return true;
+
+  // X-form: reg+reg (no global, Scale == 1, zero offset)
+  if (AM.BaseGV == 0 && AM.HasBaseReg && AM.Scale == 1 && AM.BaseOffs ==0)
+    return true;
+
+  return false;
+}
+