X-Git-Url: http://demsky.eecs.uci.edu/git/?a=blobdiff_plain;ds=sidebyside;f=lib%2FTarget%2FCellSPU%2FSPUISelLowering.cpp;h=50ba00f63675302c94f8b4922e2b742b9384c66e;hb=e53a600f065075731d0aeb9dc8f4f3d75f5a05f8;hp=184f0b23fa54bfd009401b0fe595ef8b143335f4;hpb=aaffa05d0a652dd3eae76a941d02d6b0469fa821;p=oota-llvm.git diff --git a/lib/Target/CellSPU/SPUISelLowering.cpp b/lib/Target/CellSPU/SPUISelLowering.cpp index 184f0b23fa5..50ba00f6367 100644 --- a/lib/Target/CellSPU/SPUISelLowering.cpp +++ b/lib/Target/CellSPU/SPUISelLowering.cpp @@ -1,5 +1,5 @@ -//===-- SPUISelLowering.cpp - Cell SPU DAG Lowering Implementation --------===// // +//===-- SPUISelLowering.cpp - Cell SPU DAG Lowering Implementation --------===// // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source @@ -15,20 +15,23 @@ #include "SPUISelLowering.h" #include "SPUTargetMachine.h" #include "SPUFrameInfo.h" -#include "llvm/ADT/VectorExtras.h" +#include "llvm/Constants.h" +#include "llvm/Function.h" +#include "llvm/Intrinsics.h" +#include "llvm/CallingConv.h" #include "llvm/CodeGen/CallingConvLower.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/SelectionDAG.h" -#include "llvm/Constants.h" -#include "llvm/Function.h" -#include "llvm/Intrinsics.h" +#include "llvm/Target/TargetLoweringObjectFile.h" +#include "llvm/Target/TargetOptions.h" +#include "llvm/ADT/VectorExtras.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MathExtras.h" -#include "llvm/Target/TargetOptions.h" - +#include "llvm/Support/raw_ostream.h" #include using namespace llvm; @@ -68,52 +71,61 @@ namespace { #ifndef NDEBUG if (retval == 0) { - cerr << "getValueTypeMapEntry returns NULL for " - << VT.getMVTString() - << "\n"; - abort(); + std::string msg; + raw_string_ostream Msg(msg); + Msg << "getValueTypeMapEntry returns NULL for " + << VT.getMVTString(); + llvm_report_error(Msg.str()); } #endif return retval; } - //! Predicate that returns true if operand is a memory target + //! Expand a library call into an actual call DAG node /*! - \arg Op Operand to test - \return true if the operand is a memory target (i.e., global - address, external symbol, constant pool) or an A-form - address. + \note + This code is taken from SelectionDAGLegalize, since it is not exposed as + part of the LLVM SelectionDAG API. */ - bool isMemoryOperand(const SDValue &Op) - { - const unsigned Opc = Op.getOpcode(); - return (Opc == ISD::GlobalAddress - || Opc == ISD::GlobalTLSAddress - || Opc == ISD::JumpTable - || Opc == ISD::ConstantPool - || Opc == ISD::ExternalSymbol - || Opc == ISD::TargetGlobalAddress - || Opc == ISD::TargetGlobalTLSAddress - || Opc == ISD::TargetJumpTable - || Opc == ISD::TargetConstantPool - || Opc == ISD::TargetExternalSymbol - || Opc == SPUISD::AFormAddr); - } - - //! Predicate that returns true if the operand is an indirect target - bool isIndirectOperand(const SDValue &Op) - { - const unsigned Opc = Op.getOpcode(); - return (Opc == ISD::Register - || Opc == SPUISD::LDRESULT); + + SDValue + ExpandLibCall(RTLIB::Libcall LC, SDValue Op, SelectionDAG &DAG, + bool isSigned, SDValue &Hi, SPUTargetLowering &TLI) { + // The input chain to this libcall is the entry node of the function. + // Legalizing the call will automatically add the previous call to the + // dependence. 
+ SDValue InChain = DAG.getEntryNode(); + + TargetLowering::ArgListTy Args; + TargetLowering::ArgListEntry Entry; + for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) { + MVT ArgVT = Op.getOperand(i).getValueType(); + const Type *ArgTy = ArgVT.getTypeForMVT(*DAG.getContext()); + Entry.Node = Op.getOperand(i); + Entry.Ty = ArgTy; + Entry.isSExt = isSigned; + Entry.isZExt = !isSigned; + Args.push_back(Entry); + } + SDValue Callee = DAG.getExternalSymbol(TLI.getLibcallName(LC), + TLI.getPointerTy()); + + // Splice the libcall in wherever FindInputOutputChains tells us to. + const Type *RetTy = + Op.getNode()->getValueType(0).getTypeForMVT(*DAG.getContext()); + std::pair CallInfo = + TLI.LowerCallTo(InChain, RetTy, isSigned, !isSigned, false, false, + 0, CallingConv::C, false, Callee, Args, DAG, + Op.getDebugLoc()); + + return CallInfo.first; } } SPUTargetLowering::SPUTargetLowering(SPUTargetMachine &TM) - : TargetLowering(TM), - SPUTM(TM) -{ + : TargetLowering(TM, new TargetLoweringObjectFileELF()), + SPUTM(TM) { // Fold away setcc operations if possible. setPow2DivIsCheap(); @@ -121,6 +133,9 @@ SPUTargetLowering::SPUTargetLowering(SPUTargetMachine &TM) setUseUnderscoreSetJmp(true); setUseUnderscoreLongJmp(true); + // Set RTLIB libcall names as used by SPU: + setLibcallName(RTLIB::DIV_F64, "__fast_divdf3"); + // Set up the SPU's register classes: addRegisterClass(MVT::i8, SPU::R8CRegisterClass); addRegisterClass(MVT::i16, SPU::R16CRegisterClass); @@ -130,43 +145,54 @@ SPUTargetLowering::SPUTargetLowering(SPUTargetMachine &TM) addRegisterClass(MVT::f64, SPU::R64FPRegisterClass); addRegisterClass(MVT::i128, SPU::GPRCRegisterClass); - // Initialize libcalls: - setLibcallName(RTLIB::MUL_I64, "__muldi3"); - // SPU has no sign or zero extended loads for i1, i8, i16: setLoadExtAction(ISD::EXTLOAD, MVT::i1, Promote); setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote); setLoadExtAction(ISD::ZEXTLOAD, MVT::i1, Promote); - setLoadExtAction(ISD::EXTLOAD, MVT::i8, Custom); - setLoadExtAction(ISD::SEXTLOAD, MVT::i8, Custom); - setLoadExtAction(ISD::ZEXTLOAD, MVT::i8, Custom); - setTruncStoreAction(MVT::i8, MVT::i8, Custom); - setTruncStoreAction(MVT::i16, MVT::i8, Custom); - setTruncStoreAction(MVT::i32, MVT::i8, Custom); - setTruncStoreAction(MVT::i64, MVT::i8, Custom); - setTruncStoreAction(MVT::i128, MVT::i8, Custom); + setLoadExtAction(ISD::EXTLOAD, MVT::f32, Expand); + setLoadExtAction(ISD::EXTLOAD, MVT::f64, Expand); + + setTruncStoreAction(MVT::i128, MVT::i64, Expand); + setTruncStoreAction(MVT::i128, MVT::i32, Expand); + setTruncStoreAction(MVT::i128, MVT::i16, Expand); + setTruncStoreAction(MVT::i128, MVT::i8, Expand); - setLoadExtAction(ISD::EXTLOAD, MVT::i16, Custom); - setLoadExtAction(ISD::SEXTLOAD, MVT::i16, Custom); - setLoadExtAction(ISD::ZEXTLOAD, MVT::i16, Custom); + setTruncStoreAction(MVT::f64, MVT::f32, Expand); // SPU constant load actions are custom lowered: - setOperationAction(ISD::Constant, MVT::i64, Custom); setOperationAction(ISD::ConstantFP, MVT::f32, Legal); setOperationAction(ISD::ConstantFP, MVT::f64, Custom); // SPU's loads and stores have to be custom lowered: - for (unsigned sctype = (unsigned) MVT::i8; sctype < (unsigned) MVT::f128; + for (unsigned sctype = (unsigned) MVT::i8; sctype < (unsigned) MVT::i128; ++sctype) { MVT VT = (MVT::SimpleValueType)sctype; - setOperationAction(ISD::LOAD, VT, Custom); - setOperationAction(ISD::STORE, VT, Custom); + setOperationAction(ISD::LOAD, VT, Custom); + setOperationAction(ISD::STORE, VT, Custom); + 
setLoadExtAction(ISD::EXTLOAD, VT, Custom); + setLoadExtAction(ISD::ZEXTLOAD, VT, Custom); + setLoadExtAction(ISD::SEXTLOAD, VT, Custom); + + for (unsigned stype = sctype - 1; stype >= (unsigned) MVT::i8; --stype) { + MVT StoreVT = (MVT::SimpleValueType) stype; + setTruncStoreAction(VT, StoreVT, Expand); + } } - // Custom lower BRCOND for i8 to "promote" the result to i16 - setOperationAction(ISD::BRCOND, MVT::Other, Custom); + for (unsigned sctype = (unsigned) MVT::f32; sctype < (unsigned) MVT::f64; + ++sctype) { + MVT VT = (MVT::SimpleValueType) sctype; + + setOperationAction(ISD::LOAD, VT, Custom); + setOperationAction(ISD::STORE, VT, Custom); + + for (unsigned stype = sctype - 1; stype >= (unsigned) MVT::f32; --stype) { + MVT StoreVT = (MVT::SimpleValueType) stype; + setTruncStoreAction(VT, StoreVT, Expand); + } + } // Expand the jumptable branches setOperationAction(ISD::BR_JT, MVT::Other, Expand); @@ -177,18 +203,42 @@ SPUTargetLowering::SPUTargetLowering(SPUTargetMachine &TM) setOperationAction(ISD::SELECT_CC, MVT::i8, Custom); setOperationAction(ISD::SELECT_CC, MVT::i16, Custom); setOperationAction(ISD::SELECT_CC, MVT::i32, Custom); -#if 0 setOperationAction(ISD::SELECT_CC, MVT::i64, Custom); -#endif // SPU has no intrinsics for these particular operations: setOperationAction(ISD::MEMBARRIER, MVT::Other, Expand); - // PowerPC has no SREM/UREM instructions - setOperationAction(ISD::SREM, MVT::i32, Expand); - setOperationAction(ISD::UREM, MVT::i32, Expand); - setOperationAction(ISD::SREM, MVT::i64, Expand); - setOperationAction(ISD::UREM, MVT::i64, Expand); + // SPU has no division/remainder instructions + setOperationAction(ISD::SREM, MVT::i8, Expand); + setOperationAction(ISD::UREM, MVT::i8, Expand); + setOperationAction(ISD::SDIV, MVT::i8, Expand); + setOperationAction(ISD::UDIV, MVT::i8, Expand); + setOperationAction(ISD::SDIVREM, MVT::i8, Expand); + setOperationAction(ISD::UDIVREM, MVT::i8, Expand); + setOperationAction(ISD::SREM, MVT::i16, Expand); + setOperationAction(ISD::UREM, MVT::i16, Expand); + setOperationAction(ISD::SDIV, MVT::i16, Expand); + setOperationAction(ISD::UDIV, MVT::i16, Expand); + setOperationAction(ISD::SDIVREM, MVT::i16, Expand); + setOperationAction(ISD::UDIVREM, MVT::i16, Expand); + setOperationAction(ISD::SREM, MVT::i32, Expand); + setOperationAction(ISD::UREM, MVT::i32, Expand); + setOperationAction(ISD::SDIV, MVT::i32, Expand); + setOperationAction(ISD::UDIV, MVT::i32, Expand); + setOperationAction(ISD::SDIVREM, MVT::i32, Expand); + setOperationAction(ISD::UDIVREM, MVT::i32, Expand); + setOperationAction(ISD::SREM, MVT::i64, Expand); + setOperationAction(ISD::UREM, MVT::i64, Expand); + setOperationAction(ISD::SDIV, MVT::i64, Expand); + setOperationAction(ISD::UDIV, MVT::i64, Expand); + setOperationAction(ISD::SDIVREM, MVT::i64, Expand); + setOperationAction(ISD::UDIVREM, MVT::i64, Expand); + setOperationAction(ISD::SREM, MVT::i128, Expand); + setOperationAction(ISD::UREM, MVT::i128, Expand); + setOperationAction(ISD::SDIV, MVT::i128, Expand); + setOperationAction(ISD::UDIV, MVT::i128, Expand); + setOperationAction(ISD::SDIVREM, MVT::i128, Expand); + setOperationAction(ISD::UDIVREM, MVT::i128, Expand); // We don't support sin/cos/sqrt/fmod setOperationAction(ISD::FSIN , MVT::f64, Expand); @@ -198,7 +248,8 @@ SPUTargetLowering::SPUTargetLowering(SPUTargetMachine &TM) setOperationAction(ISD::FCOS , MVT::f32, Expand); setOperationAction(ISD::FREM , MVT::f32, Expand); - // If we're enabling GP optimizations, use hardware square root + // Expand fsqrt to 
the appropriate libcall (NOTE: should use h/w fsqrt + // for f32!) setOperationAction(ISD::FSQRT, MVT::f64, Expand); setOperationAction(ISD::FSQRT, MVT::f32, Expand); @@ -223,26 +274,40 @@ SPUTargetLowering::SPUTargetLowering(SPUTargetMachine &TM) setOperationAction(ISD::SRL, MVT::i8, Custom); setOperationAction(ISD::SRA, MVT::i8, Custom); - // SPU needs custom lowering for shift left/right for i64 - setOperationAction(ISD::SHL, MVT::i64, Custom); - setOperationAction(ISD::SRL, MVT::i64, Custom); - setOperationAction(ISD::SRA, MVT::i64, Custom); + // Make these operations legal and handle them during instruction selection: + setOperationAction(ISD::SHL, MVT::i64, Legal); + setOperationAction(ISD::SRL, MVT::i64, Legal); + setOperationAction(ISD::SRA, MVT::i64, Legal); // Custom lower i8, i32 and i64 multiplications setOperationAction(ISD::MUL, MVT::i8, Custom); - setOperationAction(ISD::MUL, MVT::i32, Custom); - setOperationAction(ISD::MUL, MVT::i64, Expand); // libcall - - // SMUL_LOHI, UMUL_LOHI - setOperationAction(ISD::SMUL_LOHI, MVT::i32, Custom); - setOperationAction(ISD::UMUL_LOHI, MVT::i32, Custom); - setOperationAction(ISD::SMUL_LOHI, MVT::i64, Custom); - setOperationAction(ISD::UMUL_LOHI, MVT::i64, Custom); + setOperationAction(ISD::MUL, MVT::i32, Legal); + setOperationAction(ISD::MUL, MVT::i64, Legal); + + // Expand double-width multiplication + // FIXME: It would probably be reasonable to support some of these operations + setOperationAction(ISD::UMUL_LOHI, MVT::i8, Expand); + setOperationAction(ISD::SMUL_LOHI, MVT::i8, Expand); + setOperationAction(ISD::MULHU, MVT::i8, Expand); + setOperationAction(ISD::MULHS, MVT::i8, Expand); + setOperationAction(ISD::UMUL_LOHI, MVT::i16, Expand); + setOperationAction(ISD::SMUL_LOHI, MVT::i16, Expand); + setOperationAction(ISD::MULHU, MVT::i16, Expand); + setOperationAction(ISD::MULHS, MVT::i16, Expand); + setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand); + setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand); + setOperationAction(ISD::MULHU, MVT::i32, Expand); + setOperationAction(ISD::MULHS, MVT::i32, Expand); + setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand); + setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand); + setOperationAction(ISD::MULHU, MVT::i64, Expand); + setOperationAction(ISD::MULHS, MVT::i64, Expand); // Need to custom handle (some) common i8, i64 math ops - setOperationAction(ISD::ADD, MVT::i64, Custom); + setOperationAction(ISD::ADD, MVT::i8, Custom); + setOperationAction(ISD::ADD, MVT::i64, Legal); setOperationAction(ISD::SUB, MVT::i8, Custom); - setOperationAction(ISD::SUB, MVT::i64, Custom); + setOperationAction(ISD::SUB, MVT::i64, Legal); // SPU does not have BSWAP. It does have i32 support CTLZ. // CTPOP has to be custom lowered. 
@@ -253,47 +318,59 @@ SPUTargetLowering::SPUTargetLowering(SPUTargetMachine &TM) setOperationAction(ISD::CTPOP, MVT::i16, Custom); setOperationAction(ISD::CTPOP, MVT::i32, Custom); setOperationAction(ISD::CTPOP, MVT::i64, Custom); + setOperationAction(ISD::CTPOP, MVT::i128, Expand); + setOperationAction(ISD::CTTZ , MVT::i8, Expand); + setOperationAction(ISD::CTTZ , MVT::i16, Expand); setOperationAction(ISD::CTTZ , MVT::i32, Expand); setOperationAction(ISD::CTTZ , MVT::i64, Expand); + setOperationAction(ISD::CTTZ , MVT::i128, Expand); + setOperationAction(ISD::CTLZ , MVT::i8, Promote); + setOperationAction(ISD::CTLZ , MVT::i16, Promote); setOperationAction(ISD::CTLZ , MVT::i32, Legal); + setOperationAction(ISD::CTLZ , MVT::i64, Expand); + setOperationAction(ISD::CTLZ , MVT::i128, Expand); // SPU has a version of select that implements (a&~c)|(b&c), just like // select ought to work: setOperationAction(ISD::SELECT, MVT::i8, Legal); setOperationAction(ISD::SELECT, MVT::i16, Legal); setOperationAction(ISD::SELECT, MVT::i32, Legal); - setOperationAction(ISD::SELECT, MVT::i64, Expand); + setOperationAction(ISD::SELECT, MVT::i64, Legal); setOperationAction(ISD::SETCC, MVT::i8, Legal); setOperationAction(ISD::SETCC, MVT::i16, Legal); setOperationAction(ISD::SETCC, MVT::i32, Legal); - setOperationAction(ISD::SETCC, MVT::i64, Expand); - - // Zero extension and sign extension for i64 have to be - // custom legalized - setOperationAction(ISD::ZERO_EXTEND, MVT::i64, Custom); - setOperationAction(ISD::SIGN_EXTEND, MVT::i64, Custom); - setOperationAction(ISD::ANY_EXTEND, MVT::i64, Custom); - - // SPU has a legal FP -> signed INT instruction - setOperationAction(ISD::FP_TO_SINT, MVT::i32, Legal); - setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom); - setOperationAction(ISD::FP_TO_UINT, MVT::i32, Legal); - setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom); + setOperationAction(ISD::SETCC, MVT::i64, Legal); + setOperationAction(ISD::SETCC, MVT::f64, Custom); + + // Custom lower i128 -> i64 truncates + setOperationAction(ISD::TRUNCATE, MVT::i64, Custom); + + setOperationAction(ISD::FP_TO_SINT, MVT::i8, Promote); + setOperationAction(ISD::FP_TO_UINT, MVT::i8, Promote); + setOperationAction(ISD::FP_TO_SINT, MVT::i16, Promote); + setOperationAction(ISD::FP_TO_UINT, MVT::i16, Promote); + // SPU has a legal FP -> signed INT instruction for f32, but for f64, need + // to expand to a libcall, hence the custom lowering: + setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom); + setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom); + setOperationAction(ISD::FP_TO_SINT, MVT::i64, Expand); + setOperationAction(ISD::FP_TO_UINT, MVT::i64, Expand); + setOperationAction(ISD::FP_TO_SINT, MVT::i128, Expand); + setOperationAction(ISD::FP_TO_UINT, MVT::i128, Expand); // FDIV on SPU requires custom lowering - setOperationAction(ISD::FDIV, MVT::f32, Custom); - //setOperationAction(ISD::FDIV, MVT::f64, Custom); + setOperationAction(ISD::FDIV, MVT::f64, Expand); // to libcall - // SPU has [U|S]INT_TO_FP - setOperationAction(ISD::SINT_TO_FP, MVT::i32, Legal); + // SPU has [U|S]INT_TO_FP for f32->i32, but not for f64->i32, f64->i64: + setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom); setOperationAction(ISD::SINT_TO_FP, MVT::i16, Promote); - setOperationAction(ISD::SINT_TO_FP, MVT::i8, Promote); - setOperationAction(ISD::UINT_TO_FP, MVT::i32, Legal); + setOperationAction(ISD::SINT_TO_FP, MVT::i8, Promote); + setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom); setOperationAction(ISD::UINT_TO_FP, MVT::i16, Promote); - 
setOperationAction(ISD::UINT_TO_FP, MVT::i8, Promote); + setOperationAction(ISD::UINT_TO_FP, MVT::i8, Promote); setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom); setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom); @@ -315,9 +392,9 @@ SPUTargetLowering::SPUTargetLowering(SPUTargetMachine &TM) ++sctype) { MVT VT = (MVT::SimpleValueType)sctype; - setOperationAction(ISD::GlobalAddress, VT, Custom); - setOperationAction(ISD::ConstantPool, VT, Custom); - setOperationAction(ISD::JumpTable, VT, Custom); + setOperationAction(ISD::GlobalAddress, VT, Custom); + setOperationAction(ISD::ConstantPool, VT, Custom); + setOperationAction(ISD::JumpTable, VT, Custom); } // RET must be custom lowered, to meet ABI requirements @@ -354,29 +431,31 @@ SPUTargetLowering::SPUTargetLowering(SPUTargetMachine &TM) addRegisterClass(MVT::v4f32, SPU::VECREGRegisterClass); addRegisterClass(MVT::v2f64, SPU::VECREGRegisterClass); + // "Odd size" vector classes that we're willing to support: + addRegisterClass(MVT::v2i32, SPU::VECREGRegisterClass); + for (unsigned i = (unsigned)MVT::FIRST_VECTOR_VALUETYPE; i <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++i) { MVT VT = (MVT::SimpleValueType)i; // add/sub are legal for all supported vector VT's. - setOperationAction(ISD::ADD , VT, Legal); - setOperationAction(ISD::SUB , VT, Legal); + setOperationAction(ISD::ADD, VT, Legal); + setOperationAction(ISD::SUB, VT, Legal); // mul has to be custom lowered. - setOperationAction(ISD::MUL , VT, Custom); + setOperationAction(ISD::MUL, VT, Legal); - setOperationAction(ISD::AND , VT, Legal); - setOperationAction(ISD::OR , VT, Legal); - setOperationAction(ISD::XOR , VT, Legal); - setOperationAction(ISD::LOAD , VT, Legal); - setOperationAction(ISD::SELECT, VT, Legal); - setOperationAction(ISD::STORE, VT, Legal); + setOperationAction(ISD::AND, VT, Legal); + setOperationAction(ISD::OR, VT, Legal); + setOperationAction(ISD::XOR, VT, Legal); + setOperationAction(ISD::LOAD, VT, Legal); + setOperationAction(ISD::SELECT, VT, Legal); + setOperationAction(ISD::STORE, VT, Legal); // These operations need to be expanded: - setOperationAction(ISD::SDIV, VT, Expand); - setOperationAction(ISD::SREM, VT, Expand); - setOperationAction(ISD::UDIV, VT, Expand); - setOperationAction(ISD::UREM, VT, Expand); - setOperationAction(ISD::FDIV, VT, Custom); + setOperationAction(ISD::SDIV, VT, Expand); + setOperationAction(ISD::SREM, VT, Expand); + setOperationAction(ISD::UDIV, VT, Expand); + setOperationAction(ISD::UREM, VT, Expand); // Custom lower build_vector, constant pool spills, insert and // extract vector elements: @@ -388,14 +467,15 @@ SPUTargetLowering::SPUTargetLowering(SPUTargetMachine &TM) setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); } - setOperationAction(ISD::MUL, MVT::v16i8, Custom); setOperationAction(ISD::AND, MVT::v16i8, Custom); setOperationAction(ISD::OR, MVT::v16i8, Custom); setOperationAction(ISD::XOR, MVT::v16i8, Custom); setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4f32, Custom); + setOperationAction(ISD::FDIV, MVT::v4f32, Legal); + setShiftAmountType(MVT::i32); - setBooleanContents(ZeroOrOneBooleanContent); + setBooleanContents(ZeroOrNegativeOneBooleanContent); setStackPointerRegisterToSaveRestore(SPU::R1); @@ -407,8 +487,10 @@ SPUTargetLowering::SPUTargetLowering(SPUTargetMachine &TM) computeRegisterProperties(); - // Set other properties: - setSchedulingPreference(SchedulingForLatency); + // Set pre-RA register scheduler default to BURR, which produces slightly + // better code than the default (could also be TDRR, but 
TargetLowering.h + // needs a mod to support that model): + setSchedulingPreference(SchedulingForRegPressure); } const char * @@ -426,18 +508,8 @@ SPUTargetLowering::getTargetNodeName(unsigned Opcode) const node_names[(unsigned) SPUISD::SHUFB] = "SPUISD::SHUFB"; node_names[(unsigned) SPUISD::SHUFFLE_MASK] = "SPUISD::SHUFFLE_MASK"; node_names[(unsigned) SPUISD::CNTB] = "SPUISD::CNTB"; - node_names[(unsigned) SPUISD::PROMOTE_SCALAR] = "SPUISD::PROMOTE_SCALAR"; + node_names[(unsigned) SPUISD::PREFSLOT2VEC] = "SPUISD::PREFSLOT2VEC"; node_names[(unsigned) SPUISD::VEC2PREFSLOT] = "SPUISD::VEC2PREFSLOT"; - node_names[(unsigned) SPUISD::VEC2PREFSLOT_CHAINED] - = "SPUISD::VEC2PREFSLOT_CHAINED"; - node_names[(unsigned) SPUISD::EXTRACT_I1_ZEXT] = "SPUISD::EXTRACT_I1_ZEXT"; - node_names[(unsigned) SPUISD::EXTRACT_I1_SEXT] = "SPUISD::EXTRACT_I1_SEXT"; - node_names[(unsigned) SPUISD::EXTRACT_I8_ZEXT] = "SPUISD::EXTRACT_I8_ZEXT"; - node_names[(unsigned) SPUISD::EXTRACT_I8_SEXT] = "SPUISD::EXTRACT_I8_SEXT"; - node_names[(unsigned) SPUISD::MPY] = "SPUISD::MPY"; - node_names[(unsigned) SPUISD::MPYU] = "SPUISD::MPYU"; - node_names[(unsigned) SPUISD::MPYH] = "SPUISD::MPYH"; - node_names[(unsigned) SPUISD::MPYHH] = "SPUISD::MPYHH"; node_names[(unsigned) SPUISD::SHLQUAD_L_BITS] = "SPUISD::SHLQUAD_L_BITS"; node_names[(unsigned) SPUISD::SHLQUAD_L_BYTES] = "SPUISD::SHLQUAD_L_BYTES"; node_names[(unsigned) SPUISD::VEC_SHL] = "SPUISD::VEC_SHL"; @@ -445,24 +517,14 @@ SPUTargetLowering::getTargetNodeName(unsigned Opcode) const node_names[(unsigned) SPUISD::VEC_SRA] = "SPUISD::VEC_SRA"; node_names[(unsigned) SPUISD::VEC_ROTL] = "SPUISD::VEC_ROTL"; node_names[(unsigned) SPUISD::VEC_ROTR] = "SPUISD::VEC_ROTR"; - node_names[(unsigned) SPUISD::ROTQUAD_RZ_BYTES] = - "SPUISD::ROTQUAD_RZ_BYTES"; - node_names[(unsigned) SPUISD::ROTQUAD_RZ_BITS] = - "SPUISD::ROTQUAD_RZ_BITS"; node_names[(unsigned) SPUISD::ROTBYTES_LEFT] = "SPUISD::ROTBYTES_LEFT"; - node_names[(unsigned) SPUISD::ROTBYTES_LEFT_CHAINED] = - "SPUISD::ROTBYTES_LEFT_CHAINED"; node_names[(unsigned) SPUISD::ROTBYTES_LEFT_BITS] = - "SPUISD::ROTBYTES_LEFT_BITS"; + "SPUISD::ROTBYTES_LEFT_BITS"; node_names[(unsigned) SPUISD::SELECT_MASK] = "SPUISD::SELECT_MASK"; node_names[(unsigned) SPUISD::SELB] = "SPUISD::SELB"; - node_names[(unsigned) SPUISD::ADD_EXTENDED] = "SPUISD::ADD_EXTENDED"; - node_names[(unsigned) SPUISD::CARRY_GENERATE] = "SPUISD::CARRY_GENERATE"; - node_names[(unsigned) SPUISD::SUB_EXTENDED] = "SPUISD::SUB_EXTENDED"; - node_names[(unsigned) SPUISD::BORROW_GENERATE] = "SPUISD::BORROW_GENERATE"; - node_names[(unsigned) SPUISD::FPInterp] = "SPUISD::FPInterp"; - node_names[(unsigned) SPUISD::FPRecipEst] = "SPUISD::FPRecipEst"; - node_names[(unsigned) SPUISD::SEXT32TO64] = "SPUISD::SEXT32TO64"; + node_names[(unsigned) SPUISD::ADD64_MARKER] = "SPUISD::ADD64_MARKER"; + node_names[(unsigned) SPUISD::SUB64_MARKER] = "SPUISD::SUB64_MARKER"; + node_names[(unsigned) SPUISD::MUL64_MARKER] = "SPUISD::MUL64_MARKER"; } std::map::iterator i = node_names.find(Opcode); @@ -470,9 +532,18 @@ SPUTargetLowering::getTargetNodeName(unsigned Opcode) const return ((i != node_names.end()) ? i->second : 0); } -MVT SPUTargetLowering::getSetCCResultType(const SDValue &Op) const { - MVT VT = Op.getValueType(); - return (VT.isInteger() ? VT : MVT(MVT::i32)); +/// getFunctionAlignment - Return the Log2 alignment of this function. 
+unsigned SPUTargetLowering::getFunctionAlignment(const Function *) const { + return 3; +} + +//===----------------------------------------------------------------------===// +// Return the Cell SPU's SETCC result type +//===----------------------------------------------------------------------===// + +MVT SPUTargetLowering::getSetCCResultType(MVT VT) const { + // i16 and i32 are valid SETCC result types + return ((VT == MVT::i8 || VT == MVT::i16 || VT == MVT::i32) ? VT : MVT::i32); } //===----------------------------------------------------------------------===// @@ -485,195 +556,158 @@ MVT SPUTargetLowering::getSetCCResultType(const SDValue &Op) const { // LowerOperation implementation //===----------------------------------------------------------------------===// -/// Aligned load common code for CellSPU -/*! - \param[in] Op The SelectionDAG load or store operand - \param[in] DAG The selection DAG - \param[in] ST CellSPU subtarget information structure - \param[in,out] alignment Caller initializes this to the load or store node's - value from getAlignment(), may be updated while generating the aligned load - \param[in,out] alignOffs Aligned offset; set by AlignedLoad to the aligned - offset (divisible by 16, modulo 16 == 0) - \param[in,out] prefSlotOffs Preferred slot offset; set by AlignedLoad to the - offset of the preferred slot (modulo 16 != 0) - \param[in,out] VT Caller initializes this value type to the the load or store - node's loaded or stored value type; may be updated if an i1-extended load or - store. - \param[out] was16aligned true if the base pointer had 16-byte alignment, - otherwise false. Can help to determine if the chunk needs to be rotated. - - Both load and store lowering load a block of data aligned on a 16-byte - boundary. This is the common aligned load code shared between both. - */ -static SDValue -AlignedLoad(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST, - LSBaseSDNode *LSN, - unsigned &alignment, int &alignOffs, int &prefSlotOffs, - MVT &VT, bool &was16aligned) -{ - MVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(); - const valtype_map_s *vtm = getValueTypeMapEntry(VT); - SDValue basePtr = LSN->getBasePtr(); - SDValue chain = LSN->getChain(); - - if (basePtr.getOpcode() == ISD::ADD) { - SDValue Op1 = basePtr.getNode()->getOperand(1); - - if (Op1.getOpcode() == ISD::Constant - || Op1.getOpcode() == ISD::TargetConstant) { - const ConstantSDNode *CN = cast(basePtr.getOperand(1)); - - alignOffs = (int) CN->getZExtValue(); - prefSlotOffs = (int) (alignOffs & 0xf); - - // Adjust the rotation amount to ensure that the final result ends up in - // the preferred slot: - prefSlotOffs -= vtm->prefslot_byte; - basePtr = basePtr.getOperand(0); - - // Loading from memory, can we adjust alignment? 
- if (basePtr.getOpcode() == SPUISD::AFormAddr) { - SDValue APtr = basePtr.getOperand(0); - if (APtr.getOpcode() == ISD::TargetGlobalAddress) { - GlobalAddressSDNode *GSDN = cast(APtr); - alignment = GSDN->getGlobal()->getAlignment(); - } - } - } else { - alignOffs = 0; - prefSlotOffs = -vtm->prefslot_byte; - } - } else if (basePtr.getOpcode() == ISD::FrameIndex) { - FrameIndexSDNode *FIN = cast(basePtr); - alignOffs = int(FIN->getIndex() * SPUFrameInfo::stackSlotSize()); - prefSlotOffs = (int) (alignOffs & 0xf); - prefSlotOffs -= vtm->prefslot_byte; - basePtr = DAG.getRegister(SPU::R1, VT); - } else { - alignOffs = 0; - prefSlotOffs = -vtm->prefslot_byte; - } - - if (alignment == 16) { - // Realign the base pointer as a D-Form address: - if (!isMemoryOperand(basePtr) || (alignOffs & ~0xf) != 0) { - basePtr = DAG.getNode(ISD::ADD, PtrVT, - basePtr, - DAG.getConstant((alignOffs & ~0xf), PtrVT)); - } - - // Emit the vector load: - was16aligned = true; - return DAG.getLoad(MVT::v16i8, chain, basePtr, - LSN->getSrcValue(), LSN->getSrcValueOffset(), - LSN->isVolatile(), 16); - } - - // Unaligned load or we're using the "large memory" model, which means that - // we have to be very pessimistic: - if (isMemoryOperand(basePtr) || isIndirectOperand(basePtr)) { - basePtr = DAG.getNode(SPUISD::IndirectAddr, PtrVT, basePtr, - DAG.getConstant(0, PtrVT)); - } - - // Add the offset - basePtr = DAG.getNode(ISD::ADD, PtrVT, basePtr, - DAG.getConstant((alignOffs & ~0xf), PtrVT)); - was16aligned = false; - return DAG.getLoad(MVT::v16i8, chain, basePtr, - LSN->getSrcValue(), LSN->getSrcValueOffset(), - LSN->isVolatile(), 16); -} - /// Custom lower loads for CellSPU /*! All CellSPU loads and stores are aligned to 16-byte boundaries, so for elements within a 16-byte block, we have to rotate to extract the requested element. - */ + + For extending loads, we also want to ensure that the following sequence is + emitted, e.g. 
for MVT::f32 extending load to MVT::f64: + +\verbatim +%1 v16i8,ch = load +%2 v16i8,ch = rotate %1 +%3 v4f8, ch = bitconvert %2 +%4 f32 = vec2perfslot %3 +%5 f64 = fp_extend %4 +\endverbatim +*/ static SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) { LoadSDNode *LN = cast(Op); SDValue the_chain = LN->getChain(); - MVT VT = LN->getMemoryVT(); - MVT OpVT = Op.getNode()->getValueType(0); + MVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(); + MVT InVT = LN->getMemoryVT(); + MVT OutVT = Op.getValueType(); ISD::LoadExtType ExtType = LN->getExtensionType(); unsigned alignment = LN->getAlignment(); - SDValue Ops[8]; + const valtype_map_s *vtm = getValueTypeMapEntry(InVT); + DebugLoc dl = Op.getDebugLoc(); switch (LN->getAddressingMode()) { case ISD::UNINDEXED: { - int offset, rotamt; - bool was16aligned; - SDValue result = - AlignedLoad(Op, DAG, ST, LN,alignment, offset, rotamt, VT, was16aligned); - - if (result.getNode() == 0) - return result; - - the_chain = result.getValue(1); - // Rotate the chunk if necessary - if (rotamt < 0) - rotamt += 16; - if (rotamt != 0 || !was16aligned) { - SDVTList vecvts = DAG.getVTList(MVT::v16i8, MVT::Other); - - Ops[0] = the_chain; - Ops[1] = result; - if (was16aligned) { - Ops[2] = DAG.getConstant(rotamt, MVT::i16); + SDValue result; + SDValue basePtr = LN->getBasePtr(); + SDValue rotate; + + if (alignment == 16) { + ConstantSDNode *CN; + + // Special cases for a known aligned load to simplify the base pointer + // and the rotation amount: + if (basePtr.getOpcode() == ISD::ADD + && (CN = dyn_cast (basePtr.getOperand(1))) != 0) { + // Known offset into basePtr + int64_t offset = CN->getSExtValue(); + int64_t rotamt = int64_t((offset & 0xf) - vtm->prefslot_byte); + + if (rotamt < 0) + rotamt += 16; + + rotate = DAG.getConstant(rotamt, MVT::i16); + + // Simplify the base pointer for this case: + basePtr = basePtr.getOperand(0); + if ((offset & ~0xf) > 0) { + basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, + basePtr, + DAG.getConstant((offset & ~0xf), PtrVT)); + } + } else if ((basePtr.getOpcode() == SPUISD::AFormAddr) + || (basePtr.getOpcode() == SPUISD::IndirectAddr + && basePtr.getOperand(0).getOpcode() == SPUISD::Hi + && basePtr.getOperand(1).getOpcode() == SPUISD::Lo)) { + // Plain aligned a-form address: rotate into preferred slot + // Same for (SPUindirect (SPUhi ...), (SPUlo ...)) + int64_t rotamt = -vtm->prefslot_byte; + if (rotamt < 0) + rotamt += 16; + rotate = DAG.getConstant(rotamt, MVT::i16); } else { - MVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(); - LoadSDNode *LN1 = cast(result); - Ops[2] = DAG.getNode(ISD::ADD, PtrVT, LN1->getBasePtr(), + // Offset the rotate amount by the basePtr and the preferred slot + // byte offset + int64_t rotamt = -vtm->prefslot_byte; + if (rotamt < 0) + rotamt += 16; + rotate = DAG.getNode(ISD::ADD, dl, PtrVT, + basePtr, DAG.getConstant(rotamt, PtrVT)); } + } else { + // Unaligned load: must be more pessimistic about addressing modes: + if (basePtr.getOpcode() == ISD::ADD) { + MachineFunction &MF = DAG.getMachineFunction(); + MachineRegisterInfo &RegInfo = MF.getRegInfo(); + unsigned VReg = RegInfo.createVirtualRegister(&SPU::R32CRegClass); + SDValue Flag; + + SDValue Op0 = basePtr.getOperand(0); + SDValue Op1 = basePtr.getOperand(1); + + if (isa(Op1)) { + // Convert the (add , ) to an indirect address contained + // in a register. Note that this is done because we need to avoid + // creating a 0(reg) d-form address due to the SPU's block loads. 
+ basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, Op0, Op1); + the_chain = DAG.getCopyToReg(the_chain, dl, VReg, basePtr, Flag); + basePtr = DAG.getCopyFromReg(the_chain, dl, VReg, PtrVT); + } else { + // Convert the (add , ) to an indirect address, which + // will likely be lowered as a reg(reg) x-form address. + basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, Op0, Op1); + } + } else { + basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, + basePtr, + DAG.getConstant(0, PtrVT)); + } - result = DAG.getNode(SPUISD::ROTBYTES_LEFT_CHAINED, vecvts, Ops, 3); - the_chain = result.getValue(1); + // Offset the rotate amount by the basePtr and the preferred slot + // byte offset + rotate = DAG.getNode(ISD::ADD, dl, PtrVT, + basePtr, + DAG.getConstant(-vtm->prefslot_byte, PtrVT)); } - if (VT == OpVT || ExtType == ISD::EXTLOAD) { - SDVTList scalarvts; - MVT vecVT = MVT::v16i8; + // Re-emit as a v16i8 vector load + result = DAG.getLoad(MVT::v16i8, dl, the_chain, basePtr, + LN->getSrcValue(), LN->getSrcValueOffset(), + LN->isVolatile(), 16); - // Convert the loaded v16i8 vector to the appropriate vector type - // specified by the operand: - if (OpVT == VT) { - if (VT != MVT::i1) - vecVT = MVT::getVectorVT(VT, (128 / VT.getSizeInBits())); - } else - vecVT = MVT::getVectorVT(OpVT, (128 / OpVT.getSizeInBits())); + // Update the chain + the_chain = result.getValue(1); - Ops[0] = the_chain; - Ops[1] = DAG.getNode(ISD::BIT_CONVERT, vecVT, result); - scalarvts = DAG.getVTList((OpVT == VT ? VT : OpVT), MVT::Other); - result = DAG.getNode(SPUISD::VEC2PREFSLOT_CHAINED, scalarvts, Ops, 2); - the_chain = result.getValue(1); - } else { - // Handle the sign and zero-extending loads for i1 and i8: - unsigned NewOpC; + // Rotate into the preferred slot: + result = DAG.getNode(SPUISD::ROTBYTES_LEFT, dl, MVT::v16i8, + result.getValue(0), rotate); - if (ExtType == ISD::SEXTLOAD) { - NewOpC = (OpVT == MVT::i1 - ? SPUISD::EXTRACT_I1_SEXT - : SPUISD::EXTRACT_I8_SEXT); - } else { - assert(ExtType == ISD::ZEXTLOAD); - NewOpC = (OpVT == MVT::i1 - ? 
SPUISD::EXTRACT_I1_ZEXT - : SPUISD::EXTRACT_I8_ZEXT); - } + // Convert the loaded v16i8 vector to the appropriate vector type + // specified by the operand: + MVT vecVT = MVT::getVectorVT(InVT, (128 / InVT.getSizeInBits())); + result = DAG.getNode(SPUISD::VEC2PREFSLOT, dl, InVT, + DAG.getNode(ISD::BIT_CONVERT, dl, vecVT, result)); - result = DAG.getNode(NewOpC, OpVT, result); + // Handle extending loads by extending the scalar result: + if (ExtType == ISD::SEXTLOAD) { + result = DAG.getNode(ISD::SIGN_EXTEND, dl, OutVT, result); + } else if (ExtType == ISD::ZEXTLOAD) { + result = DAG.getNode(ISD::ZERO_EXTEND, dl, OutVT, result); + } else if (ExtType == ISD::EXTLOAD) { + unsigned NewOpc = ISD::ANY_EXTEND; + + if (OutVT.isFloatingPoint()) + NewOpc = ISD::FP_EXTEND; + + result = DAG.getNode(NewOpc, dl, OutVT, result); } - SDVTList retvts = DAG.getVTList(OpVT, MVT::Other); + SDVTList retvts = DAG.getVTList(OutVT, MVT::Other); SDValue retops[2] = { result, the_chain }; - result = DAG.getNode(SPUISD::LDRESULT, retvts, + result = DAG.getNode(SPUISD::LDRESULT, dl, retvts, retops, sizeof(retops) / sizeof(retops[0])); return result; } @@ -682,11 +716,15 @@ LowerLOAD(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) { case ISD::POST_INC: case ISD::POST_DEC: case ISD::LAST_INDEXED_MODE: - cerr << "LowerLOAD: Got a LoadSDNode with an addr mode other than " + { + std::string msg; + raw_string_ostream Msg(msg); + Msg << "LowerLOAD: Got a LoadSDNode with an addr mode other than " "UNINDEXED\n"; - cerr << (unsigned) LN->getAddressingMode() << "\n"; - abort(); - /*NOTREACHED*/ + Msg << (unsigned) LN->getAddressingMode(); + llvm_report_error(Msg.str()); + /*NOTREACHED*/ + } } return SDValue(); @@ -705,27 +743,91 @@ LowerSTORE(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) { MVT VT = Value.getValueType(); MVT StVT = (!SN->isTruncatingStore() ? VT : SN->getMemoryVT()); MVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(); + DebugLoc dl = Op.getDebugLoc(); unsigned alignment = SN->getAlignment(); switch (SN->getAddressingMode()) { case ISD::UNINDEXED: { - int chunk_offset, slot_offset; - bool was16aligned; - // The vector type we really want to load from the 16-byte chunk. 
MVT vecVT = MVT::getVectorVT(VT, (128 / VT.getSizeInBits())), stVecVT = MVT::getVectorVT(StVT, (128 / StVT.getSizeInBits())); - SDValue alignLoadVec = - AlignedLoad(Op, DAG, ST, SN, alignment, - chunk_offset, slot_offset, VT, was16aligned); + SDValue alignLoadVec; + SDValue basePtr = SN->getBasePtr(); + SDValue the_chain = SN->getChain(); + SDValue insertEltOffs; + + if (alignment == 16) { + ConstantSDNode *CN; + + // Special cases for a known aligned load to simplify the base pointer + // and insertion byte: + if (basePtr.getOpcode() == ISD::ADD + && (CN = dyn_cast(basePtr.getOperand(1))) != 0) { + // Known offset into basePtr + int64_t offset = CN->getSExtValue(); + + // Simplify the base pointer for this case: + basePtr = basePtr.getOperand(0); + insertEltOffs = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, + basePtr, + DAG.getConstant((offset & 0xf), PtrVT)); + + if ((offset & ~0xf) > 0) { + basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, + basePtr, + DAG.getConstant((offset & ~0xf), PtrVT)); + } + } else { + // Otherwise, assume it's at byte 0 of basePtr + insertEltOffs = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, + basePtr, + DAG.getConstant(0, PtrVT)); + } + } else { + // Unaligned load: must be more pessimistic about addressing modes: + if (basePtr.getOpcode() == ISD::ADD) { + MachineFunction &MF = DAG.getMachineFunction(); + MachineRegisterInfo &RegInfo = MF.getRegInfo(); + unsigned VReg = RegInfo.createVirtualRegister(&SPU::R32CRegClass); + SDValue Flag; + + SDValue Op0 = basePtr.getOperand(0); + SDValue Op1 = basePtr.getOperand(1); + + if (isa(Op1)) { + // Convert the (add , ) to an indirect address contained + // in a register. Note that this is done because we need to avoid + // creating a 0(reg) d-form address due to the SPU's block loads. + basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, Op0, Op1); + the_chain = DAG.getCopyToReg(the_chain, dl, VReg, basePtr, Flag); + basePtr = DAG.getCopyFromReg(the_chain, dl, VReg, PtrVT); + } else { + // Convert the (add , ) to an indirect address, which + // will likely be lowered as a reg(reg) x-form address. + basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, Op0, Op1); + } + } else { + basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, + basePtr, + DAG.getConstant(0, PtrVT)); + } + + // Insertion point is solely determined by basePtr's contents + insertEltOffs = DAG.getNode(ISD::ADD, dl, PtrVT, + basePtr, + DAG.getConstant(0, PtrVT)); + } - if (alignLoadVec.getNode() == 0) - return alignLoadVec; + // Re-emit as a v16i8 vector load + alignLoadVec = DAG.getLoad(MVT::v16i8, dl, the_chain, basePtr, + SN->getSrcValue(), SN->getSrcValueOffset(), + SN->isVolatile(), 16); + + // Update the chain + the_chain = alignLoadVec.getValue(1); LoadSDNode *LN = cast(alignLoadVec); - SDValue basePtr = LN->getBasePtr(); - SDValue the_chain = alignLoadVec.getValue(1); SDValue theValue = SN->getValue(); SDValue result; @@ -737,40 +839,33 @@ LowerSTORE(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) { theValue = theValue.getOperand(0); } - chunk_offset &= 0xf; - - SDValue insertEltOffs = DAG.getConstant(chunk_offset, PtrVT); - SDValue insertEltPtr; - // If the base pointer is already a D-form address, then just create // a new D-form address with a slot offset and the orignal base pointer. // Otherwise generate a D-form address with the slot offset relative // to the stack pointer, which is always aligned. 
- DEBUG(cerr << "CellSPU LowerSTORE: basePtr = "); - DEBUG(basePtr.getNode()->dump(&DAG)); - DEBUG(cerr << "\n"); - - if (basePtr.getOpcode() == SPUISD::IndirectAddr || - (basePtr.getOpcode() == ISD::ADD - && basePtr.getOperand(0).getOpcode() == SPUISD::IndirectAddr)) { - insertEltPtr = basePtr; - } else { - insertEltPtr = DAG.getNode(ISD::ADD, PtrVT, basePtr, insertEltOffs); - } +#if !defined(NDEBUG) + if (DebugFlag && isCurrentDebugType(DEBUG_TYPE)) { + cerr << "CellSPU LowerSTORE: basePtr = "; + basePtr.getNode()->dump(&DAG); + cerr << "\n"; + } +#endif SDValue insertEltOp = - DAG.getNode(SPUISD::SHUFFLE_MASK, stVecVT, insertEltPtr); + DAG.getNode(SPUISD::SHUFFLE_MASK, dl, vecVT, insertEltOffs); SDValue vectorizeOp = - DAG.getNode(ISD::SCALAR_TO_VECTOR, vecVT, theValue); + DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, vecVT, theValue); - result = DAG.getNode(SPUISD::SHUFB, vecVT, vectorizeOp, alignLoadVec, - DAG.getNode(ISD::BIT_CONVERT, vecVT, insertEltOp)); + result = DAG.getNode(SPUISD::SHUFB, dl, vecVT, + vectorizeOp, alignLoadVec, + DAG.getNode(ISD::BIT_CONVERT, dl, + MVT::v4i32, insertEltOp)); - result = DAG.getStore(the_chain, result, basePtr, + result = DAG.getStore(the_chain, dl, result, basePtr, LN->getSrcValue(), LN->getSrcValueOffset(), LN->isVolatile(), LN->getAlignment()); -#if 0 && defined(NDEBUG) +#if 0 && !defined(NDEBUG) if (DebugFlag && isCurrentDebugType(DEBUG_TYPE)) { const SDValue ¤tRoot = DAG.getRoot(); @@ -781,7 +876,7 @@ LowerSTORE(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) { DAG.setRoot(currentRoot); } #endif - + return result; /*UNREACHED*/ } @@ -790,18 +885,22 @@ LowerSTORE(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) { case ISD::POST_INC: case ISD::POST_DEC: case ISD::LAST_INDEXED_MODE: - cerr << "LowerLOAD: Got a LoadSDNode with an addr mode other than " + { + std::string msg; + raw_string_ostream Msg(msg); + Msg << "LowerLOAD: Got a LoadSDNode with an addr mode other than " "UNINDEXED\n"; - cerr << (unsigned) SN->getAddressingMode() << "\n"; - abort(); - /*NOTREACHED*/ + Msg << (unsigned) SN->getAddressingMode(); + llvm_report_error(Msg.str()); + /*NOTREACHED*/ + } } return SDValue(); } -/// Generate the address of a constant pool entry. -static SDValue +//! Generate the address of a constant pool entry. +SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) { MVT PtrVT = Op.getValueType(); ConstantPoolSDNode *CP = cast(Op); @@ -809,24 +908,31 @@ LowerConstantPool(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) { SDValue CPI = DAG.getTargetConstantPool(C, PtrVT, CP->getAlignment()); SDValue Zero = DAG.getConstant(0, PtrVT); const TargetMachine &TM = DAG.getTarget(); + // FIXME there is no actual debug info here + DebugLoc dl = Op.getDebugLoc(); if (TM.getRelocationModel() == Reloc::Static) { if (!ST->usingLargeMem()) { // Just return the SDValue with the constant pool address in it. 
- return DAG.getNode(SPUISD::AFormAddr, PtrVT, CPI, Zero); + return DAG.getNode(SPUISD::AFormAddr, dl, PtrVT, CPI, Zero); } else { - SDValue Hi = DAG.getNode(SPUISD::Hi, PtrVT, CPI, Zero); - SDValue Lo = DAG.getNode(SPUISD::Lo, PtrVT, CPI, Zero); - return DAG.getNode(SPUISD::IndirectAddr, PtrVT, Hi, Lo); + SDValue Hi = DAG.getNode(SPUISD::Hi, dl, PtrVT, CPI, Zero); + SDValue Lo = DAG.getNode(SPUISD::Lo, dl, PtrVT, CPI, Zero); + return DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, Hi, Lo); } } - assert(0 && - "LowerConstantPool: Relocation model other than static" - " not supported."); + llvm_unreachable("LowerConstantPool: Relocation model other than static" + " not supported."); return SDValue(); } +//! Alternate entry point for generating the address of a constant pool entry +SDValue +SPU::LowerConstantPool(SDValue Op, SelectionDAG &DAG, const SPUTargetMachine &TM) { + return ::LowerConstantPool(Op, DAG, TM.getSubtargetImpl()); +} + static SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) { MVT PtrVT = Op.getValueType(); @@ -834,19 +940,21 @@ LowerJumpTable(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) { SDValue JTI = DAG.getTargetJumpTable(JT->getIndex(), PtrVT); SDValue Zero = DAG.getConstant(0, PtrVT); const TargetMachine &TM = DAG.getTarget(); + // FIXME there is no actual debug info here + DebugLoc dl = Op.getDebugLoc(); if (TM.getRelocationModel() == Reloc::Static) { if (!ST->usingLargeMem()) { - return DAG.getNode(SPUISD::AFormAddr, PtrVT, JTI, Zero); + return DAG.getNode(SPUISD::AFormAddr, dl, PtrVT, JTI, Zero); } else { - SDValue Hi = DAG.getNode(SPUISD::Hi, PtrVT, JTI, Zero); - SDValue Lo = DAG.getNode(SPUISD::Lo, PtrVT, JTI, Zero); - return DAG.getNode(SPUISD::IndirectAddr, PtrVT, Hi, Lo); + SDValue Hi = DAG.getNode(SPUISD::Hi, dl, PtrVT, JTI, Zero); + SDValue Lo = DAG.getNode(SPUISD::Lo, dl, PtrVT, JTI, Zero); + return DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, Hi, Lo); } } - assert(0 && - "LowerJumpTable: Relocation model other than static not supported."); + llvm_unreachable("LowerJumpTable: Relocation model other than static" + " not supported."); return SDValue(); } @@ -858,44 +966,20 @@ LowerGlobalAddress(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) { SDValue GA = DAG.getTargetGlobalAddress(GV, PtrVT, GSDN->getOffset()); const TargetMachine &TM = DAG.getTarget(); SDValue Zero = DAG.getConstant(0, PtrVT); + // FIXME there is no actual debug info here + DebugLoc dl = Op.getDebugLoc(); if (TM.getRelocationModel() == Reloc::Static) { if (!ST->usingLargeMem()) { - return DAG.getNode(SPUISD::AFormAddr, PtrVT, GA, Zero); + return DAG.getNode(SPUISD::AFormAddr, dl, PtrVT, GA, Zero); } else { - SDValue Hi = DAG.getNode(SPUISD::Hi, PtrVT, GA, Zero); - SDValue Lo = DAG.getNode(SPUISD::Lo, PtrVT, GA, Zero); - return DAG.getNode(SPUISD::IndirectAddr, PtrVT, Hi, Lo); + SDValue Hi = DAG.getNode(SPUISD::Hi, dl, PtrVT, GA, Zero); + SDValue Lo = DAG.getNode(SPUISD::Lo, dl, PtrVT, GA, Zero); + return DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, Hi, Lo); } } else { - cerr << "LowerGlobalAddress: Relocation model other than static not " - << "supported.\n"; - abort(); - /*NOTREACHED*/ - } - - return SDValue(); -} - -//! Custom lower i64 integer constants -/*! - This code inserts all of the necessary juggling that needs to occur to load - a 64-bit constant into a register. 
- */ -static SDValue -LowerConstant(SDValue Op, SelectionDAG &DAG) { - MVT VT = Op.getValueType(); - ConstantSDNode *CN = cast(Op.getNode()); - - if (VT == MVT::i64) { - SDValue T = DAG.getConstant(CN->getZExtValue(), MVT::i64); - return DAG.getNode(SPUISD::VEC2PREFSLOT, VT, - DAG.getNode(ISD::BUILD_VECTOR, MVT::v2i64, T, T)); - } else { - cerr << "LowerConstant: unhandled constant type " - << VT.getMVTString() - << "\n"; - abort(); + llvm_report_error("LowerGlobalAddress: Relocation model other than static" + "not supported."); /*NOTREACHED*/ } @@ -906,38 +990,25 @@ LowerConstant(SDValue Op, SelectionDAG &DAG) { static SDValue LowerConstantFP(SDValue Op, SelectionDAG &DAG) { MVT VT = Op.getValueType(); - ConstantFPSDNode *FP = cast(Op.getNode()); - - assert((FP != 0) && - "LowerConstantFP: Node is not ConstantFPSDNode"); + // FIXME there is no actual debug info here + DebugLoc dl = Op.getDebugLoc(); if (VT == MVT::f64) { + ConstantFPSDNode *FP = cast(Op.getNode()); + + assert((FP != 0) && + "LowerConstantFP: Node is not ConstantFPSDNode"); + uint64_t dbits = DoubleToBits(FP->getValueAPF().convertToDouble()); - return DAG.getNode(ISD::BIT_CONVERT, VT, - LowerConstant(DAG.getConstant(dbits, MVT::i64), DAG)); + SDValue T = DAG.getConstant(dbits, MVT::i64); + SDValue Tvec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i64, T, T); + return DAG.getNode(SPUISD::VEC2PREFSLOT, dl, VT, + DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2f64, Tvec)); } return SDValue(); } -//! Lower MVT::i8 brcond to a promoted type (MVT::i32, MVT::i16) -static SDValue -LowerBRCOND(SDValue Op, SelectionDAG &DAG) -{ - SDValue Cond = Op.getOperand(1); - MVT CondVT = Cond.getValueType(); - MVT CondNVT; - - if (CondVT == MVT::i8) { - CondNVT = MVT::i16; - return DAG.getNode(ISD::BRCOND, Op.getValueType(), - Op.getOperand(0), - DAG.getNode(ISD::ZERO_EXTEND, CondNVT, Op.getOperand(1)), - Op.getOperand(2)); - } else - return SDValue(); // Unchanged -} - static SDValue LowerFORMAL_ARGUMENTS(SDValue Op, SelectionDAG &DAG, int &VarArgsFrameIndex) { @@ -947,6 +1018,7 @@ LowerFORMAL_ARGUMENTS(SDValue Op, SelectionDAG &DAG, int &VarArgsFrameIndex) SmallVector ArgValues; SDValue Root = Op.getOperand(0); bool isVarArg = cast(Op.getOperand(2))->getZExtValue() != 0; + DebugLoc dl = Op.getDebugLoc(); const unsigned *ArgRegs = SPURegisterInfo::getArgRegs(); const unsigned NumArgRegs = SPURegisterInfo::getNumArgRegs(); @@ -969,10 +1041,11 @@ LowerFORMAL_ARGUMENTS(SDValue Op, SelectionDAG &DAG, int &VarArgsFrameIndex) switch (ObjectVT.getSimpleVT()) { default: { - cerr << "LowerFORMAL_ARGUMENTS Unhandled argument type: " - << ObjectVT.getMVTString() - << "\n"; - abort(); + std::string msg; + raw_string_ostream Msg(msg); + Msg << "LowerFORMAL_ARGUMENTS Unhandled argument type: " + << ObjectVT.getMVTString(); + llvm_report_error(Msg.str()); } case MVT::i8: ArgRegClass = &SPU::R8CRegClass; @@ -986,6 +1059,9 @@ LowerFORMAL_ARGUMENTS(SDValue Op, SelectionDAG &DAG, int &VarArgsFrameIndex) case MVT::i64: ArgRegClass = &SPU::R64CRegClass; break; + case MVT::i128: + ArgRegClass = &SPU::GPRCRegClass; + break; case MVT::f32: ArgRegClass = &SPU::R32FPRegClass; break; @@ -1004,7 +1080,7 @@ LowerFORMAL_ARGUMENTS(SDValue Op, SelectionDAG &DAG, int &VarArgsFrameIndex) unsigned VReg = RegInfo.createVirtualRegister(ArgRegClass); RegInfo.addLiveIn(ArgRegs[ArgRegIdx], VReg); - ArgVal = DAG.getCopyFromReg(Root, VReg, ObjectVT); + ArgVal = DAG.getCopyFromReg(Root, dl, VReg, ObjectVT); ++ArgRegIdx; } else { // We need to load the argument to a virtual register if we 
determined @@ -1012,7 +1088,7 @@ LowerFORMAL_ARGUMENTS(SDValue Op, SelectionDAG &DAG, int &VarArgsFrameIndex) // or we're forced to do vararg int FI = MFI->CreateFixedObject(ObjSize, ArgOffset); SDValue FIN = DAG.getFrameIndex(FI, PtrVT); - ArgVal = DAG.getLoad(ObjectVT, Root, FIN, NULL, 0); + ArgVal = DAG.getLoad(ObjectVT, dl, Root, FIN, NULL, 0); ArgOffset += StackSlotSize; } @@ -1033,7 +1109,7 @@ LowerFORMAL_ARGUMENTS(SDValue Op, SelectionDAG &DAG, int &VarArgsFrameIndex) VarArgsFrameIndex = MFI->CreateFixedObject(StackSlotSize, ArgOffset); SDValue FIN = DAG.getFrameIndex(VarArgsFrameIndex, PtrVT); SDValue ArgVal = DAG.getRegister(ArgRegs[ArgRegIdx], MVT::v16i8); - SDValue Store = DAG.getStore(Root, ArgVal, FIN, NULL, 0); + SDValue Store = DAG.getStore(Root, dl, ArgVal, FIN, NULL, 0); Root = Store.getOperand(0); MemOps.push_back(Store); @@ -1041,13 +1117,14 @@ LowerFORMAL_ARGUMENTS(SDValue Op, SelectionDAG &DAG, int &VarArgsFrameIndex) ArgOffset += StackSlotSize; } if (!MemOps.empty()) - Root = DAG.getNode(ISD::TokenFactor,MVT::Other,&MemOps[0],MemOps.size()); + Root = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, + &MemOps[0], MemOps.size()); } ArgValues.push_back(Root); // Return the new list of results. - return DAG.getNode(ISD::MERGE_VALUES, Op.getNode()->getVTList(), + return DAG.getNode(ISD::MERGE_VALUES, dl, Op.getNode()->getVTList(), &ArgValues[0], ArgValues.size()); } @@ -1065,8 +1142,7 @@ static SDNode *isLSAAddress(SDValue Op, SelectionDAG &DAG) { return DAG.getConstant((int)C->getZExtValue() >> 2, MVT::i32).getNode(); } -static -SDValue +static SDValue LowerCALL(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) { CallSDNode *TheCall = cast(Op.getNode()); SDValue Chain = TheCall->getChain(); @@ -1075,6 +1151,7 @@ LowerCALL(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) { unsigned StackSlotSize = SPUFrameInfo::stackSlotSize(); const unsigned *ArgRegs = SPURegisterInfo::getArgRegs(); const unsigned NumArgRegs = SPURegisterInfo::getNumArgRegs(); + DebugLoc dl = TheCall->getDebugLoc(); // Handy pointer type MVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(); @@ -1105,17 +1182,19 @@ LowerCALL(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) { // PtrOff will be used to store the current argument to the stack if a // register cannot be found for it. 
SDValue PtrOff = DAG.getConstant(ArgOffset, StackPtr.getValueType()); - PtrOff = DAG.getNode(ISD::ADD, PtrVT, StackPtr, PtrOff); + PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, PtrOff); switch (Arg.getValueType().getSimpleVT()) { - default: assert(0 && "Unexpected ValueType for argument!"); + default: llvm_unreachable("Unexpected ValueType for argument!"); + case MVT::i8: + case MVT::i16: case MVT::i32: case MVT::i64: case MVT::i128: if (ArgRegIdx != NumArgRegs) { RegsToPass.push_back(std::make_pair(ArgRegs[ArgRegIdx++], Arg)); } else { - MemOpChains.push_back(DAG.getStore(Chain, Arg, PtrOff, NULL, 0)); + MemOpChains.push_back(DAG.getStore(Chain, dl, Arg, PtrOff, NULL, 0)); ArgOffset += StackSlotSize; } break; @@ -1124,10 +1203,12 @@ LowerCALL(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) { if (ArgRegIdx != NumArgRegs) { RegsToPass.push_back(std::make_pair(ArgRegs[ArgRegIdx++], Arg)); } else { - MemOpChains.push_back(DAG.getStore(Chain, Arg, PtrOff, NULL, 0)); + MemOpChains.push_back(DAG.getStore(Chain, dl, Arg, PtrOff, NULL, 0)); ArgOffset += StackSlotSize; } break; + case MVT::v2i64: + case MVT::v2f64: case MVT::v4f32: case MVT::v4i32: case MVT::v8i16: @@ -1135,7 +1216,7 @@ LowerCALL(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) { if (ArgRegIdx != NumArgRegs) { RegsToPass.push_back(std::make_pair(ArgRegs[ArgRegIdx++], Arg)); } else { - MemOpChains.push_back(DAG.getStore(Chain, Arg, PtrOff, NULL, 0)); + MemOpChains.push_back(DAG.getStore(Chain, dl, Arg, PtrOff, NULL, 0)); ArgOffset += StackSlotSize; } break; @@ -1149,7 +1230,7 @@ LowerCALL(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) { if (!MemOpChains.empty()) { // Adjust the stack pointer for the stack arguments. - Chain = DAG.getNode(ISD::TokenFactor, MVT::Other, + Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &MemOpChains[0], MemOpChains.size()); } @@ -1157,8 +1238,8 @@ LowerCALL(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) { // and flag operands which copy the outgoing args into the appropriate regs. SDValue InFlag; for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { - Chain = DAG.getCopyToReg(Chain, RegsToPass[i].first, RegsToPass[i].second, - InFlag); + Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first, + RegsToPass[i].second, InFlag); InFlag = Chain.getValue(1); } @@ -1184,18 +1265,27 @@ LowerCALL(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) { // This may be an unsafe assumption for JIT and really large compilation // units. 
if (GV->isDeclaration()) { - Callee = DAG.getNode(SPUISD::AFormAddr, CalleeVT, GA, Zero); + Callee = DAG.getNode(SPUISD::AFormAddr, dl, CalleeVT, GA, Zero); } else { - Callee = DAG.getNode(SPUISD::PCRelAddr, CalleeVT, GA, Zero); + Callee = DAG.getNode(SPUISD::PCRelAddr, dl, CalleeVT, GA, Zero); } } else { // "Large memory" mode: Turn all calls into indirect calls with a X-form // address pairs: - Callee = DAG.getNode(SPUISD::IndirectAddr, PtrVT, GA, Zero); + Callee = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, GA, Zero); } - } else if (ExternalSymbolSDNode *S = dyn_cast(Callee)) - Callee = DAG.getExternalSymbol(S->getSymbol(), Callee.getValueType()); - else if (SDNode *Dest = isLSAAddress(Callee, DAG)) { + } else if (ExternalSymbolSDNode *S = dyn_cast(Callee)) { + MVT CalleeVT = Callee.getValueType(); + SDValue Zero = DAG.getConstant(0, PtrVT); + SDValue ExtSym = DAG.getTargetExternalSymbol(S->getSymbol(), + Callee.getValueType()); + + if (!ST->usingLargeMem()) { + Callee = DAG.getNode(SPUISD::AFormAddr, dl, CalleeVT, ExtSym, Zero); + } else { + Callee = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, ExtSym, Zero); + } + } else if (SDNode *Dest = isLSAAddress(Callee, DAG)) { // If this is an absolute destination address that appears to be a legal // local store address, use the munged value. Callee = SDValue(Dest, 0); @@ -1213,7 +1303,7 @@ LowerCALL(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) { if (InFlag.getNode()) Ops.push_back(InFlag); // Returns a chain and a flag for retval copy to use. - Chain = DAG.getNode(CallOpc, DAG.getVTList(MVT::Other, MVT::Flag), + Chain = DAG.getNode(CallOpc, dl, DAG.getVTList(MVT::Other, MVT::Flag), &Ops[0], Ops.size()); InFlag = Chain.getValue(1); @@ -1227,40 +1317,50 @@ LowerCALL(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) { // If the call has results, copy the values out of the ret val registers. 
switch (TheCall->getValueType(0).getSimpleVT()) { - default: assert(0 && "Unexpected ret value!"); + default: llvm_unreachable("Unexpected ret value!"); case MVT::Other: break; case MVT::i32: if (TheCall->getValueType(1) == MVT::i32) { - Chain = DAG.getCopyFromReg(Chain, SPU::R4, MVT::i32, InFlag).getValue(1); + Chain = DAG.getCopyFromReg(Chain, dl, SPU::R4, + MVT::i32, InFlag).getValue(1); ResultVals[0] = Chain.getValue(0); - Chain = DAG.getCopyFromReg(Chain, SPU::R3, MVT::i32, + Chain = DAG.getCopyFromReg(Chain, dl, SPU::R3, MVT::i32, Chain.getValue(2)).getValue(1); ResultVals[1] = Chain.getValue(0); NumResults = 2; } else { - Chain = DAG.getCopyFromReg(Chain, SPU::R3, MVT::i32, InFlag).getValue(1); + Chain = DAG.getCopyFromReg(Chain, dl, SPU::R3, MVT::i32, + InFlag).getValue(1); ResultVals[0] = Chain.getValue(0); NumResults = 1; } break; case MVT::i64: - Chain = DAG.getCopyFromReg(Chain, SPU::R3, MVT::i64, InFlag).getValue(1); + Chain = DAG.getCopyFromReg(Chain, dl, SPU::R3, MVT::i64, + InFlag).getValue(1); + ResultVals[0] = Chain.getValue(0); + NumResults = 1; + break; + case MVT::i128: + Chain = DAG.getCopyFromReg(Chain, dl, SPU::R3, MVT::i128, + InFlag).getValue(1); ResultVals[0] = Chain.getValue(0); NumResults = 1; break; case MVT::f32: case MVT::f64: - Chain = DAG.getCopyFromReg(Chain, SPU::R3, TheCall->getValueType(0), + Chain = DAG.getCopyFromReg(Chain, dl, SPU::R3, TheCall->getValueType(0), InFlag).getValue(1); ResultVals[0] = Chain.getValue(0); NumResults = 1; break; case MVT::v2f64: + case MVT::v2i64: case MVT::v4f32: case MVT::v4i32: case MVT::v8i16: case MVT::v16i8: - Chain = DAG.getCopyFromReg(Chain, SPU::R3, TheCall->getValueType(0), + Chain = DAG.getCopyFromReg(Chain, dl, SPU::R3, TheCall->getValueType(0), InFlag).getValue(1); ResultVals[0] = Chain.getValue(0); NumResults = 1; @@ -1273,7 +1373,7 @@ LowerCALL(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) { // Otherwise, merge everything together with a MERGE_VALUES node. 
ResultVals[NumResults++] = Chain; - SDValue Res = DAG.getMergeValues(ResultVals, NumResults); + SDValue Res = DAG.getMergeValues(ResultVals, NumResults, dl); return Res.getValue(Op.getResNo()); } @@ -1282,7 +1382,8 @@ LowerRET(SDValue Op, SelectionDAG &DAG, TargetMachine &TM) { SmallVector RVLocs; unsigned CC = DAG.getMachineFunction().getFunction()->getCallingConv(); bool isVarArg = DAG.getMachineFunction().getFunction()->isVarArg(); - CCState CCInfo(CC, isVarArg, TM, RVLocs); + DebugLoc dl = Op.getDebugLoc(); + CCState CCInfo(CC, isVarArg, TM, RVLocs, *DAG.getContext()); CCInfo.AnalyzeReturn(Op.getNode(), RetCC_SPU); // If this is the first return lowered for this function, add the regs to the @@ -1299,14 +1400,15 @@ LowerRET(SDValue Op, SelectionDAG &DAG, TargetMachine &TM) { for (unsigned i = 0; i != RVLocs.size(); ++i) { CCValAssign &VA = RVLocs[i]; assert(VA.isRegLoc() && "Can only return in registers!"); - Chain = DAG.getCopyToReg(Chain, VA.getLocReg(), Op.getOperand(i*2+1), Flag); + Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), + Op.getOperand(i*2+1), Flag); Flag = Chain.getValue(1); } if (Flag.getNode()) - return DAG.getNode(SPUISD::RET_FLAG, MVT::Other, Chain, Flag); + return DAG.getNode(SPUISD::RET_FLAG, dl, MVT::Other, Chain, Flag); else - return DAG.getNode(SPUISD::RET_FLAG, MVT::Other, Chain); + return DAG.getNode(SPUISD::RET_FLAG, dl, MVT::Other, Chain); } @@ -1333,7 +1435,7 @@ getVecImm(SDNode *N) { } } - return 0; // All UNDEF: use implicit def.; not Constant node + return 0; } /// get_vec_i18imm - Test if this vector is a vector filled with the same value @@ -1460,269 +1562,182 @@ SDValue SPU::get_v2i64_imm(SDNode *N, SelectionDAG &DAG) { return SDValue(); } -// If this is a vector of constants or undefs, get the bits. A bit in -// UndefBits is set if the corresponding element of the vector is an -// ISD::UNDEF value. For undefs, the corresponding VectorBits values are -// zero. Return true if this is not an array of constants, false if it is. -// -static bool GetConstantBuildVectorBits(SDNode *BV, uint64_t VectorBits[2], - uint64_t UndefBits[2]) { - // Start with zero'd results. - VectorBits[0] = VectorBits[1] = UndefBits[0] = UndefBits[1] = 0; - - unsigned EltBitSize = BV->getOperand(0).getValueType().getSizeInBits(); - for (unsigned i = 0, e = BV->getNumOperands(); i != e; ++i) { - SDValue OpVal = BV->getOperand(i); - - unsigned PartNo = i >= e/2; // In the upper 128 bits? - unsigned SlotNo = e/2 - (i & (e/2-1))-1; // Which subpiece of the uint64_t. - - uint64_t EltBits = 0; - if (OpVal.getOpcode() == ISD::UNDEF) { - uint64_t EltUndefBits = ~0ULL >> (64-EltBitSize); - UndefBits[PartNo] |= EltUndefBits << (SlotNo*EltBitSize); - continue; - } else if (ConstantSDNode *CN = dyn_cast(OpVal)) { - EltBits = CN->getZExtValue() & (~0ULL >> (64-EltBitSize)); - } else if (ConstantFPSDNode *CN = dyn_cast(OpVal)) { - const APFloat &apf = CN->getValueAPF(); - EltBits = (CN->getValueType(0) == MVT::f32 - ? FloatToBits(apf.convertToFloat()) - : DoubleToBits(apf.convertToDouble())); - } else { - // Nonconstant element. - return true; - } - - VectorBits[PartNo] |= EltBits << (SlotNo*EltBitSize); - } +//! 
Lower a BUILD_VECTOR instruction creatively: +SDValue +LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) { + MVT VT = Op.getValueType(); + MVT EltVT = VT.getVectorElementType(); + DebugLoc dl = Op.getDebugLoc(); + BuildVectorSDNode *BCN = dyn_cast(Op.getNode()); + assert(BCN != 0 && "Expected BuildVectorSDNode in SPU LowerBUILD_VECTOR"); + unsigned minSplatBits = EltVT.getSizeInBits(); - //printf("%llx %llx %llx %llx\n", - // VectorBits[0], VectorBits[1], UndefBits[0], UndefBits[1]); - return false; -} + if (minSplatBits < 16) + minSplatBits = 16; -/// If this is a splat (repetition) of a value across the whole vector, return -/// the smallest size that splats it. For example, "0x01010101010101..." is a -/// splat of 0x01, 0x0101, and 0x01010101. We return SplatBits = 0x01 and -/// SplatSize = 1 byte. -static bool isConstantSplat(const uint64_t Bits128[2], - const uint64_t Undef128[2], - int MinSplatBits, - uint64_t &SplatBits, uint64_t &SplatUndef, - int &SplatSize) { - // Don't let undefs prevent splats from matching. See if the top 64-bits are - // the same as the lower 64-bits, ignoring undefs. - uint64_t Bits64 = Bits128[0] | Bits128[1]; - uint64_t Undef64 = Undef128[0] & Undef128[1]; - uint32_t Bits32 = uint32_t(Bits64) | uint32_t(Bits64 >> 32); - uint32_t Undef32 = uint32_t(Undef64) & uint32_t(Undef64 >> 32); - uint16_t Bits16 = uint16_t(Bits32) | uint16_t(Bits32 >> 16); - uint16_t Undef16 = uint16_t(Undef32) & uint16_t(Undef32 >> 16); - - if ((Bits128[0] & ~Undef128[1]) == (Bits128[1] & ~Undef128[0])) { - if (MinSplatBits < 64) { - - // Check that the top 32-bits are the same as the lower 32-bits, ignoring - // undefs. - if ((Bits64 & (~Undef64 >> 32)) == ((Bits64 >> 32) & ~Undef64)) { - if (MinSplatBits < 32) { - - // If the top 16-bits are different than the lower 16-bits, ignoring - // undefs, we have an i32 splat. - if ((Bits32 & (~Undef32 >> 16)) == ((Bits32 >> 16) & ~Undef32)) { - if (MinSplatBits < 16) { - // If the top 8-bits are different than the lower 8-bits, ignoring - // undefs, we have an i16 splat. - if ((Bits16 & (uint16_t(~Undef16) >> 8)) - == ((Bits16 >> 8) & ~Undef16)) { - // Otherwise, we have an 8-bit splat. - SplatBits = uint8_t(Bits16) | uint8_t(Bits16 >> 8); - SplatUndef = uint8_t(Undef16) & uint8_t(Undef16 >> 8); - SplatSize = 1; - return true; - } - } else { - SplatBits = Bits16; - SplatUndef = Undef16; - SplatSize = 2; - return true; - } - } - } else { - SplatBits = Bits32; - SplatUndef = Undef32; - SplatSize = 4; - return true; - } - } - } else { - SplatBits = Bits128[0]; - SplatUndef = Undef128[0]; - SplatSize = 8; - return true; - } - } + APInt APSplatBits, APSplatUndef; + unsigned SplatBitSize; + bool HasAnyUndefs; - return false; // Can't be a splat if two pieces don't match. -} + if (!BCN->isConstantSplat(APSplatBits, APSplatUndef, SplatBitSize, + HasAnyUndefs, minSplatBits) + || minSplatBits < SplatBitSize) + return SDValue(); // Wasn't a constant vector or splat exceeded min -// If this is a case we can't handle, return null and let the default -// expansion code take care of it. If we CAN select this case, and if it -// selects to a single instruction, return Op. Otherwise, if we can codegen -// this case more efficiently than a constant pool load, lower it to the -// sequence of ops that should be used. -static SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) { - MVT VT = Op.getValueType(); - // If this is a vector of constants or undefs, get the bits. 
A bit in - // UndefBits is set if the corresponding element of the vector is an - // ISD::UNDEF value. For undefs, the corresponding VectorBits values are - // zero. - uint64_t VectorBits[2]; - uint64_t UndefBits[2]; - uint64_t SplatBits, SplatUndef; - int SplatSize; - if (GetConstantBuildVectorBits(Op.getNode(), VectorBits, UndefBits) - || !isConstantSplat(VectorBits, UndefBits, - VT.getVectorElementType().getSizeInBits(), - SplatBits, SplatUndef, SplatSize)) - return SDValue(); // Not a constant vector, not a splat. + uint64_t SplatBits = APSplatBits.getZExtValue(); switch (VT.getSimpleVT()) { - default: + default: { + std::string msg; + raw_string_ostream Msg(msg); + Msg << "CellSPU: Unhandled VT in LowerBUILD_VECTOR, VT = " + << VT.getMVTString(); + llvm_report_error(Msg.str()); + /*NOTREACHED*/ + } case MVT::v4f32: { - uint32_t Value32 = SplatBits; - assert(SplatSize == 4 + uint32_t Value32 = uint32_t(SplatBits); + assert(SplatBitSize == 32 && "LowerBUILD_VECTOR: Unexpected floating point vector element."); // NOTE: pretend the constant is an integer. LLVM won't load FP constants SDValue T = DAG.getConstant(Value32, MVT::i32); - return DAG.getNode(ISD::BIT_CONVERT, MVT::v4f32, - DAG.getNode(ISD::BUILD_VECTOR, MVT::v4i32, T, T, T, T)); + return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v4f32, + DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, T,T,T,T)); break; } case MVT::v2f64: { - uint64_t f64val = SplatBits; - assert(SplatSize == 8 + uint64_t f64val = uint64_t(SplatBits); + assert(SplatBitSize == 64 && "LowerBUILD_VECTOR: 64-bit float vector size > 8 bytes."); // NOTE: pretend the constant is an integer. LLVM won't load FP constants SDValue T = DAG.getConstant(f64val, MVT::i64); - return DAG.getNode(ISD::BIT_CONVERT, MVT::v2f64, - DAG.getNode(ISD::BUILD_VECTOR, MVT::v2i64, T, T)); + return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2f64, + DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i64, T, T)); break; } case MVT::v16i8: { // 8-bit constants have to be expanded to 16-bits - unsigned short Value16 = SplatBits | (SplatBits << 8); - SDValue Ops[8]; - for (int i = 0; i < 8; ++i) - Ops[i] = DAG.getConstant(Value16, MVT::i16); - return DAG.getNode(ISD::BIT_CONVERT, VT, - DAG.getNode(ISD::BUILD_VECTOR, MVT::v8i16, Ops, 8)); + unsigned short Value16 = SplatBits /* | (SplatBits << 8) */; + SmallVector Ops; + + Ops.assign(8, DAG.getConstant(Value16, MVT::i16)); + return DAG.getNode(ISD::BIT_CONVERT, dl, VT, + DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i16, &Ops[0], Ops.size())); } case MVT::v8i16: { - unsigned short Value16; - if (SplatSize == 2) - Value16 = (unsigned short) (SplatBits & 0xffff); - else - Value16 = (unsigned short) (SplatBits | (SplatBits << 8)); - SDValue T = DAG.getConstant(Value16, VT.getVectorElementType()); - SDValue Ops[8]; - for (int i = 0; i < 8; ++i) Ops[i] = T; - return DAG.getNode(ISD::BUILD_VECTOR, VT, Ops, 8); + unsigned short Value16 = SplatBits; + SDValue T = DAG.getConstant(Value16, EltVT); + SmallVector Ops; + + Ops.assign(8, T); + return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &Ops[0], Ops.size()); } case MVT::v4i32: { - unsigned int Value = SplatBits; - SDValue T = DAG.getConstant(Value, VT.getVectorElementType()); - return DAG.getNode(ISD::BUILD_VECTOR, VT, T, T, T, T); + SDValue T = DAG.getConstant(unsigned(SplatBits), VT.getVectorElementType()); + return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, T, T, T, T); + } + case MVT::v2i32: { + SDValue T = DAG.getConstant(unsigned(SplatBits), VT.getVectorElementType()); + return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, T, T); } 
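Before the remaining cases, a minimal sketch of the splat query driving this switch; the vector value is illustrative, and per the isConstantSplat contract the reported unit is the smallest one no narrower than the requested minimum:

    // For <4 x i32> <0x01010101, 0x01010101, 0x01010101, 0x01010101>:
    APInt SplatBits, SplatUndef;
    unsigned SplatBitSize;
    bool HasAnyUndefs;
    if (BCN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize,
                             HasAnyUndefs, 16 /* minSplatBits */)) {
      // Reports SplatBits == 0x0101 with SplatBitSize == 16; the guard at
      // the top of LowerBUILD_VECTOR then rejects any splat whose unit is
      // wider than the minimum it asked for.
    }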
case MVT::v2i64: { - uint64_t val = SplatBits; - uint32_t upper = uint32_t(val >> 32); - uint32_t lower = uint32_t(val); - - if (upper == lower) { - // Magic constant that can be matched by IL, ILA, et. al. - SDValue Val = DAG.getTargetConstant(val, MVT::i64); - return DAG.getNode(ISD::BUILD_VECTOR, VT, Val, Val); - } else { - SDValue LO32; - SDValue HI32; - SmallVector ShufBytes; - SDValue Result; - bool upper_special, lower_special; - - // NOTE: This code creates common-case shuffle masks that can be easily - // detected as common expressions. It is not attempting to create highly - // specialized masks to replace any and all 0's, 0xff's and 0x80's. - - // Detect if the upper or lower half is a special shuffle mask pattern: - upper_special = (upper == 0||upper == 0xffffffff||upper == 0x80000000); - lower_special = (lower == 0||lower == 0xffffffff||lower == 0x80000000); - - // Create lower vector if not a special pattern - if (!lower_special) { - SDValue LO32C = DAG.getConstant(lower, MVT::i32); - LO32 = DAG.getNode(ISD::BIT_CONVERT, VT, - DAG.getNode(ISD::BUILD_VECTOR, MVT::v4i32, - LO32C, LO32C, LO32C, LO32C)); - } + return SPU::LowerV2I64Splat(VT, DAG, SplatBits, dl); + } + } - // Create upper vector if not a special pattern - if (!upper_special) { - SDValue HI32C = DAG.getConstant(upper, MVT::i32); - HI32 = DAG.getNode(ISD::BIT_CONVERT, VT, - DAG.getNode(ISD::BUILD_VECTOR, MVT::v4i32, - HI32C, HI32C, HI32C, HI32C)); - } + return SDValue(); +} - // If either upper or lower are special, then the two input operands are - // the same (basically, one of them is a "don't care") - if (lower_special) - LO32 = HI32; - if (upper_special) - HI32 = LO32; - if (lower_special && upper_special) { - // Unhappy situation... both upper and lower are special, so punt with - // a target constant: - SDValue Zero = DAG.getConstant(0, MVT::i32); - HI32 = LO32 = DAG.getNode(ISD::BUILD_VECTOR, MVT::v4i32, Zero, Zero, - Zero, Zero); - } +/*! + */ +SDValue +SPU::LowerV2I64Splat(MVT OpVT, SelectionDAG& DAG, uint64_t SplatVal, + DebugLoc dl) { + uint32_t upper = uint32_t(SplatVal >> 32); + uint32_t lower = uint32_t(SplatVal); + + if (upper == lower) { + // Magic constant that can be matched by IL, ILA, et. al. + SDValue Val = DAG.getTargetConstant(upper, MVT::i32); + return DAG.getNode(ISD::BIT_CONVERT, dl, OpVT, + DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, + Val, Val, Val, Val)); + } else { + bool upper_special, lower_special; - for (int i = 0; i < 4; ++i) { - uint64_t val = 0; - for (int j = 0; j < 4; ++j) { - SDValue V; - bool process_upper, process_lower; - val <<= 8; - process_upper = (upper_special && (i & 1) == 0); - process_lower = (lower_special && (i & 1) == 1); - - if (process_upper || process_lower) { - if ((process_upper && upper == 0) - || (process_lower && lower == 0)) - val |= 0x80; - else if ((process_upper && upper == 0xffffffff) - || (process_lower && lower == 0xffffffff)) - val |= 0xc0; - else if ((process_upper && upper == 0x80000000) - || (process_lower && lower == 0x80000000)) - val |= (j == 0 ? 0xe0 : 0x80); - } else - val |= i * 4 + j + ((i & 1) * 16); - } + // NOTE: This code creates common-case shuffle masks that can be easily + // detected as common expressions. It is not attempting to create highly + // specialized masks to replace any and all 0's, 0xff's and 0x80's. 
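For reference, the shufb control-byte encoding that motivates the magic values used below (per the CBEA instruction set; only the special ranges matter here):

    // shufb result byte for a control byte c:
    //   0x00..0x1f           : byte c of the concatenated (rA:rB) input
    //   0x80..0xbf (10xxxxxx): constant 0x00
    //   0xc0..0xdf (110xxxxx): constant 0xff
    //   0xe0..0xff (111xxxxx): constant 0x80 (a lone sign bit)
    // Hence a mask word producing four zero bytes needs no register input:
    SDValue ZeroWord = DAG.getConstant(0x80808080, MVT::i32);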
+ + // Detect if the upper or lower half is a special shuffle mask pattern: + upper_special = (upper == 0 || upper == 0xffffffff || upper == 0x80000000); + lower_special = (lower == 0 || lower == 0xffffffff || lower == 0x80000000); + + // Both upper and lower are special, lower to a constant pool load: + if (lower_special && upper_special) { + SDValue SplatValCN = DAG.getConstant(SplatVal, MVT::i64); + return DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i64, + SplatValCN, SplatValCN); + } + + SDValue LO32; + SDValue HI32; + SmallVector ShufBytes; + SDValue Result; + + // Create lower vector if not a special pattern + if (!lower_special) { + SDValue LO32C = DAG.getConstant(lower, MVT::i32); + LO32 = DAG.getNode(ISD::BIT_CONVERT, dl, OpVT, + DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, + LO32C, LO32C, LO32C, LO32C)); + } + + // Create upper vector if not a special pattern + if (!upper_special) { + SDValue HI32C = DAG.getConstant(upper, MVT::i32); + HI32 = DAG.getNode(ISD::BIT_CONVERT, dl, OpVT, + DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, + HI32C, HI32C, HI32C, HI32C)); + } - ShufBytes.push_back(DAG.getConstant(val, MVT::i32)); + // If either upper or lower are special, then the two input operands are + // the same (basically, one of them is a "don't care") + if (lower_special) + LO32 = HI32; + if (upper_special) + HI32 = LO32; + + for (int i = 0; i < 4; ++i) { + uint64_t val = 0; + for (int j = 0; j < 4; ++j) { + SDValue V; + bool process_upper, process_lower; + val <<= 8; + process_upper = (upper_special && (i & 1) == 0); + process_lower = (lower_special && (i & 1) == 1); + + if (process_upper || process_lower) { + if ((process_upper && upper == 0) + || (process_lower && lower == 0)) + val |= 0x80; + else if ((process_upper && upper == 0xffffffff) + || (process_lower && lower == 0xffffffff)) + val |= 0xc0; + else if ((process_upper && upper == 0x80000000) + || (process_lower && lower == 0x80000000)) + val |= (j == 0 ? 0xe0 : 0x80); + } else + val |= i * 4 + j + ((i & 1) * 16); } - return DAG.getNode(SPUISD::SHUFB, VT, HI32, LO32, - DAG.getNode(ISD::BUILD_VECTOR, MVT::v4i32, - &ShufBytes[0], ShufBytes.size())); + ShufBytes.push_back(DAG.getConstant(val, MVT::i32)); } - } - } - return SDValue(); + return DAG.getNode(SPUISD::SHUFB, dl, OpVT, HI32, LO32, + DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, + &ShufBytes[0], ShufBytes.size())); + } } /// LowerVECTOR_SHUFFLE - Lower a vector shuffle (V1, V2, V3) to something on @@ -1739,47 +1754,75 @@ static SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) { /// \note /// SPUISD::SHUFB is eventually selected as Cell's shufb instructions. static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) { + const ShuffleVectorSDNode *SVN = cast(Op); SDValue V1 = Op.getOperand(0); SDValue V2 = Op.getOperand(1); - SDValue PermMask = Op.getOperand(2); + DebugLoc dl = Op.getDebugLoc(); if (V2.getOpcode() == ISD::UNDEF) V2 = V1; // If we have a single element being moved from V1 to V2, this can be handled // using the C*[DX] compute mask instructions, but the vector elements have // to be monotonically increasing with one exception element. 
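Three illustrative v4i32 masks for the classification performed below; indices 4..7 name elements of V2, matching V2EltIdx0 == 4:

    int InsertLike[4] = {0, 1, 6, 3}; // monotonic 0,1,_,3 plus exactly one
                                      // element of V2: the insert (C*D) path
    int RotateLike[4] = {1, 2, 3, 0}; // consecutive with a single wrap:
                                      // lowered to SPUISD::ROTBYTES_LEFT
    int General[4]    = {2, 0, 1, 3}; // neither: the full SHUFB mask path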
- MVT EltVT = V1.getValueType().getVectorElementType(); + MVT VecVT = V1.getValueType(); + MVT EltVT = VecVT.getVectorElementType(); unsigned EltsFromV2 = 0; unsigned V2Elt = 0; unsigned V2EltIdx0 = 0; unsigned CurrElt = 0; + unsigned MaxElts = VecVT.getVectorNumElements(); + unsigned PrevElt = 0; + unsigned V0Elt = 0; bool monotonic = true; - if (EltVT == MVT::i8) + bool rotate = true; + + if (EltVT == MVT::i8) { V2EltIdx0 = 16; - else if (EltVT == MVT::i16) + } else if (EltVT == MVT::i16) { V2EltIdx0 = 8; - else if (EltVT == MVT::i32) + } else if (EltVT == MVT::i32 || EltVT == MVT::f32) { V2EltIdx0 = 4; - else - assert(0 && "Unhandled vector type in LowerVECTOR_SHUFFLE"); + } else if (EltVT == MVT::i64 || EltVT == MVT::f64) { + V2EltIdx0 = 2; + } else + llvm_unreachable("Unhandled vector type in LowerVECTOR_SHUFFLE"); - for (unsigned i = 0, e = PermMask.getNumOperands(); - EltsFromV2 <= 1 && monotonic && i != e; - ++i) { - unsigned SrcElt; - if (PermMask.getOperand(i).getOpcode() == ISD::UNDEF) - SrcElt = 0; - else - SrcElt = cast(PermMask.getOperand(i))->getZExtValue(); + for (unsigned i = 0; i != MaxElts; ++i) { + if (SVN->getMaskElt(i) < 0) + continue; + + unsigned SrcElt = SVN->getMaskElt(i); + + if (monotonic) { + if (SrcElt >= V2EltIdx0) { + if (1 >= (++EltsFromV2)) { + V2Elt = (V2EltIdx0 - SrcElt) << 2; + } + } else if (CurrElt != SrcElt) { + monotonic = false; + } - if (SrcElt >= V2EltIdx0) { - ++EltsFromV2; - V2Elt = (V2EltIdx0 - SrcElt) << 2; - } else if (CurrElt != SrcElt) { - monotonic = false; + ++CurrElt; } - ++CurrElt; + if (rotate) { + if (PrevElt > 0 && SrcElt < MaxElts) { + if ((PrevElt == SrcElt - 1) + || (PrevElt == MaxElts - 1 && SrcElt == 0)) { + PrevElt = SrcElt; + if (SrcElt == 0) + V0Elt = i; + } else { + rotate = false; + } + } else if (PrevElt == 0) { + // First time through, need to keep track of previous element + PrevElt = SrcElt; + } else { + // This isn't a rotation, takes elements from vector 2 + rotate = false; + } + } } if (EltsFromV2 == 1 && monotonic) { @@ -1790,41 +1833,42 @@ static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) { MVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(); // Initialize temporary register to 0 SDValue InitTempReg = - DAG.getCopyToReg(DAG.getEntryNode(), VReg, DAG.getConstant(0, PtrVT)); + DAG.getCopyToReg(DAG.getEntryNode(), dl, VReg, DAG.getConstant(0, PtrVT)); // Copy register's contents as index in SHUFFLE_MASK: SDValue ShufMaskOp = - DAG.getNode(SPUISD::SHUFFLE_MASK, V1.getValueType(), + DAG.getNode(SPUISD::SHUFFLE_MASK, dl, MVT::v4i32, DAG.getTargetConstant(V2Elt, MVT::i32), - DAG.getCopyFromReg(InitTempReg, VReg, PtrVT)); + DAG.getCopyFromReg(InitTempReg, dl, VReg, PtrVT)); // Use shuffle mask in SHUFB synthetic instruction: - return DAG.getNode(SPUISD::SHUFB, V1.getValueType(), V2, V1, ShufMaskOp); + return DAG.getNode(SPUISD::SHUFB, dl, V1.getValueType(), V2, V1, + ShufMaskOp); + } else if (rotate) { + int rotamt = (MaxElts - V0Elt) * EltVT.getSizeInBits()/8; + + return DAG.getNode(SPUISD::ROTBYTES_LEFT, dl, V1.getValueType(), + V1, DAG.getConstant(rotamt, MVT::i16)); } else { // Convert the SHUFFLE_VECTOR mask's input element units to the // actual bytes. 
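A sketch of the expansion that follows, for v4i32 where BytesPerElement == 4; mask element 5 (element 1 of V2) contributes bytes 20..23 of the byte-level mask:

    for (unsigned j = 0; j < 4; ++j)
      ResultMask.push_back(DAG.getConstant(5 * 4 + j, MVT::i8)); // 20..23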
unsigned BytesPerElement = EltVT.getSizeInBits()/8; SmallVector ResultMask; - for (unsigned i = 0, e = PermMask.getNumOperands(); i != e; ++i) { - unsigned SrcElt; - if (PermMask.getOperand(i).getOpcode() == ISD::UNDEF) - SrcElt = 0; - else - SrcElt = cast(PermMask.getOperand(i))->getZExtValue(); - - for (unsigned j = 0; j < BytesPerElement; ++j) { - ResultMask.push_back(DAG.getConstant(SrcElt*BytesPerElement+j, - MVT::i8)); - } + for (unsigned i = 0, e = MaxElts; i != e; ++i) { + unsigned SrcElt = SVN->getMaskElt(i) < 0 ? 0 : SVN->getMaskElt(i); + + for (unsigned j = 0; j < BytesPerElement; ++j) + ResultMask.push_back(DAG.getConstant(SrcElt*BytesPerElement+j,MVT::i8)); } - SDValue VPermMask = DAG.getNode(ISD::BUILD_VECTOR, MVT::v16i8, - &ResultMask[0], ResultMask.size()); - return DAG.getNode(SPUISD::SHUFB, V1.getValueType(), V1, V2, VPermMask); + SDValue VPermMask = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i8, + &ResultMask[0], ResultMask.size()); + return DAG.getNode(SPUISD::SHUFB, dl, V1.getValueType(), V1, V2, VPermMask); } } static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) { SDValue Op0 = Op.getOperand(0); // Op0 = the scalar + DebugLoc dl = Op.getDebugLoc(); if (Op0.getNode()->getOpcode() == ISD::Constant) { // For a constant, build the appropriate constant vector, which will @@ -1837,8 +1881,8 @@ static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) { // Create a constant vector: switch (Op.getValueType().getSimpleVT()) { - default: assert(0 && "Unexpected constant value type in " - "LowerSCALAR_TO_VECTOR"); + default: llvm_unreachable("Unexpected constant value type in " + "LowerSCALAR_TO_VECTOR"); case MVT::v16i8: n_copies = 16; VT = MVT::i8; break; case MVT::v8i16: n_copies = 8; VT = MVT::i16; break; case MVT::v4i32: n_copies = 4; VT = MVT::i32; break; @@ -1851,214 +1895,30 @@ static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) { for (size_t j = 0; j < n_copies; ++j) ConstVecValues.push_back(CValue); - return DAG.getNode(ISD::BUILD_VECTOR, Op.getValueType(), + return DAG.getNode(ISD::BUILD_VECTOR, dl, Op.getValueType(), &ConstVecValues[0], ConstVecValues.size()); } else { // Otherwise, copy the value from one register to another: switch (Op0.getValueType().getSimpleVT()) { - default: assert(0 && "Unexpected value type in LowerSCALAR_TO_VECTOR"); + default: llvm_unreachable("Unexpected value type in LowerSCALAR_TO_VECTOR"); case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64: case MVT::f32: case MVT::f64: - return DAG.getNode(SPUISD::PROMOTE_SCALAR, Op.getValueType(), Op0, Op0); + return DAG.getNode(SPUISD::PREFSLOT2VEC, dl, Op.getValueType(), Op0, Op0); } } return SDValue(); } -static SDValue LowerVectorMUL(SDValue Op, SelectionDAG &DAG) { - switch (Op.getValueType().getSimpleVT()) { - default: - cerr << "CellSPU: Unknown vector multiplication, got " - << Op.getValueType().getMVTString() - << "\n"; - abort(); - /*NOTREACHED*/ - - case MVT::v4i32: { - SDValue rA = Op.getOperand(0); - SDValue rB = Op.getOperand(1); - SDValue HiProd1 = DAG.getNode(SPUISD::MPYH, MVT::v4i32, rA, rB); - SDValue HiProd2 = DAG.getNode(SPUISD::MPYH, MVT::v4i32, rB, rA); - SDValue LoProd = DAG.getNode(SPUISD::MPYU, MVT::v4i32, rA, rB); - SDValue Residual1 = DAG.getNode(ISD::ADD, MVT::v4i32, LoProd, HiProd1); - - return DAG.getNode(ISD::ADD, MVT::v4i32, Residual1, HiProd2); - break; - } - - // Multiply two v8i16 vectors (pipeline friendly version): - // a) multiply lower halves, mask off upper 16-bit of 32-bit product - // b) multiply upper halves, 
rotate left by 16 bits (inserts 16 lower zeroes) - // c) Use SELB to select upper and lower halves from the intermediate results - // - // NOTE: We really want to move the SELECT_MASK to earlier to actually get the - // dual-issue. This code does manage to do this, even if it's a little on - // the wacky side - case MVT::v8i16: { - MachineFunction &MF = DAG.getMachineFunction(); - MachineRegisterInfo &RegInfo = MF.getRegInfo(); - SDValue Chain = Op.getOperand(0); - SDValue rA = Op.getOperand(0); - SDValue rB = Op.getOperand(1); - unsigned FSMBIreg = RegInfo.createVirtualRegister(&SPU::VECREGRegClass); - unsigned HiProdReg = RegInfo.createVirtualRegister(&SPU::VECREGRegClass); - - SDValue FSMBOp = - DAG.getCopyToReg(Chain, FSMBIreg, - DAG.getNode(SPUISD::SELECT_MASK, MVT::v8i16, - DAG.getConstant(0xcccc, MVT::i16))); - - SDValue HHProd = - DAG.getCopyToReg(FSMBOp, HiProdReg, - DAG.getNode(SPUISD::MPYHH, MVT::v8i16, rA, rB)); - - SDValue HHProd_v4i32 = - DAG.getNode(ISD::BIT_CONVERT, MVT::v4i32, - DAG.getCopyFromReg(HHProd, HiProdReg, MVT::v4i32)); - - return DAG.getNode(SPUISD::SELB, MVT::v8i16, - DAG.getNode(SPUISD::MPY, MVT::v8i16, rA, rB), - DAG.getNode(ISD::BIT_CONVERT, Op.getValueType(), - DAG.getNode(SPUISD::VEC_SHL, MVT::v4i32, - HHProd_v4i32, - DAG.getConstant(16, MVT::i16))), - DAG.getCopyFromReg(FSMBOp, FSMBIreg, MVT::v4i32)); - } - - // This M00sE is N@stI! (apologies to Monty Python) - // - // SPU doesn't know how to do any 8-bit multiplication, so the solution - // is to break it all apart, sign extend, and reassemble the various - // intermediate products. - case MVT::v16i8: { - SDValue rA = Op.getOperand(0); - SDValue rB = Op.getOperand(1); - SDValue c8 = DAG.getConstant(8, MVT::i32); - SDValue c16 = DAG.getConstant(16, MVT::i32); - - SDValue LLProd = - DAG.getNode(SPUISD::MPY, MVT::v8i16, - DAG.getNode(ISD::BIT_CONVERT, MVT::v8i16, rA), - DAG.getNode(ISD::BIT_CONVERT, MVT::v8i16, rB)); - - SDValue rALH = DAG.getNode(SPUISD::VEC_SRA, MVT::v8i16, rA, c8); - - SDValue rBLH = DAG.getNode(SPUISD::VEC_SRA, MVT::v8i16, rB, c8); - - SDValue LHProd = - DAG.getNode(SPUISD::VEC_SHL, MVT::v8i16, - DAG.getNode(SPUISD::MPY, MVT::v8i16, rALH, rBLH), c8); - - SDValue FSMBmask = DAG.getNode(SPUISD::SELECT_MASK, MVT::v8i16, - DAG.getConstant(0x2222, MVT::i16)); - - SDValue LoProdParts = - DAG.getNode(ISD::BIT_CONVERT, MVT::v4i32, - DAG.getNode(SPUISD::SELB, MVT::v8i16, - LLProd, LHProd, FSMBmask)); - - SDValue LoProdMask = DAG.getConstant(0xffff, MVT::i32); - - SDValue LoProd = - DAG.getNode(ISD::AND, MVT::v4i32, - LoProdParts, - DAG.getNode(ISD::BUILD_VECTOR, MVT::v4i32, - LoProdMask, LoProdMask, - LoProdMask, LoProdMask)); - - SDValue rAH = - DAG.getNode(SPUISD::VEC_SRA, MVT::v4i32, - DAG.getNode(ISD::BIT_CONVERT, MVT::v4i32, rA), c16); - - SDValue rBH = - DAG.getNode(SPUISD::VEC_SRA, MVT::v4i32, - DAG.getNode(ISD::BIT_CONVERT, MVT::v4i32, rB), c16); - - SDValue HLProd = - DAG.getNode(SPUISD::MPY, MVT::v8i16, - DAG.getNode(ISD::BIT_CONVERT, MVT::v8i16, rAH), - DAG.getNode(ISD::BIT_CONVERT, MVT::v8i16, rBH)); - - SDValue HHProd_1 = - DAG.getNode(SPUISD::MPY, MVT::v8i16, - DAG.getNode(ISD::BIT_CONVERT, MVT::v8i16, - DAG.getNode(SPUISD::VEC_SRA, - MVT::v4i32, rAH, c8)), - DAG.getNode(ISD::BIT_CONVERT, MVT::v8i16, - DAG.getNode(SPUISD::VEC_SRA, - MVT::v4i32, rBH, c8))); - - SDValue HHProd = - DAG.getNode(SPUISD::SELB, MVT::v8i16, - HLProd, - DAG.getNode(SPUISD::VEC_SHL, MVT::v8i16, HHProd_1, c8), - FSMBmask); - - SDValue HiProd = - DAG.getNode(SPUISD::VEC_SHL, MVT::v4i32, HHProd, c16); - - 
return DAG.getNode(ISD::BIT_CONVERT, MVT::v16i8, - DAG.getNode(ISD::OR, MVT::v4i32, - LoProd, HiProd)); - } - } - - return SDValue(); -} - -static SDValue LowerFDIVf32(SDValue Op, SelectionDAG &DAG) { - MachineFunction &MF = DAG.getMachineFunction(); - MachineRegisterInfo &RegInfo = MF.getRegInfo(); - - SDValue A = Op.getOperand(0); - SDValue B = Op.getOperand(1); - MVT VT = Op.getValueType(); - - unsigned VRegBR, VRegC; - - if (VT == MVT::f32) { - VRegBR = RegInfo.createVirtualRegister(&SPU::R32FPRegClass); - VRegC = RegInfo.createVirtualRegister(&SPU::R32FPRegClass); - } else { - VRegBR = RegInfo.createVirtualRegister(&SPU::VECREGRegClass); - VRegC = RegInfo.createVirtualRegister(&SPU::VECREGRegClass); - } - // TODO: make sure we're feeding FPInterp the right arguments - // Right now: fi B, frest(B) - - // Computes BRcpl = - // (Floating Interpolate (FP Reciprocal Estimate B)) - SDValue BRcpl = - DAG.getCopyToReg(DAG.getEntryNode(), VRegBR, - DAG.getNode(SPUISD::FPInterp, VT, B, - DAG.getNode(SPUISD::FPRecipEst, VT, B))); - - // Computes A * BRcpl and stores in a temporary register - SDValue AxBRcpl = - DAG.getCopyToReg(BRcpl, VRegC, - DAG.getNode(ISD::FMUL, VT, A, - DAG.getCopyFromReg(BRcpl, VRegBR, VT))); - // What's the Chain variable do? It's magic! - // TODO: set Chain = Op(0).getEntryNode() - - return DAG.getNode(ISD::FADD, VT, - DAG.getCopyFromReg(AxBRcpl, VRegC, VT), - DAG.getNode(ISD::FMUL, VT, - DAG.getCopyFromReg(AxBRcpl, VRegBR, VT), - DAG.getNode(ISD::FSUB, VT, A, - DAG.getNode(ISD::FMUL, VT, B, - DAG.getCopyFromReg(AxBRcpl, VRegC, VT))))); -} - static SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) { MVT VT = Op.getValueType(); SDValue N = Op.getOperand(0); SDValue Elt = Op.getOperand(1); + DebugLoc dl = Op.getDebugLoc(); SDValue retval; if (ConstantSDNode *C = dyn_cast(Elt)) { @@ -2067,17 +1927,17 @@ static SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) { // sanity checks: if (VT == MVT::i8 && EltNo >= 16) - assert(0 && "SPU LowerEXTRACT_VECTOR_ELT: i8 extraction slot > 15"); + llvm_unreachable("SPU LowerEXTRACT_VECTOR_ELT: i8 extraction slot > 15"); else if (VT == MVT::i16 && EltNo >= 8) - assert(0 && "SPU LowerEXTRACT_VECTOR_ELT: i16 extraction slot > 7"); + llvm_unreachable("SPU LowerEXTRACT_VECTOR_ELT: i16 extraction slot > 7"); else if (VT == MVT::i32 && EltNo >= 4) - assert(0 && "SPU LowerEXTRACT_VECTOR_ELT: i32 extraction slot > 4"); + llvm_unreachable("SPU LowerEXTRACT_VECTOR_ELT: i32 extraction slot > 4"); else if (VT == MVT::i64 && EltNo >= 2) - assert(0 && "SPU LowerEXTRACT_VECTOR_ELT: i64 extraction slot > 2"); + llvm_unreachable("SPU LowerEXTRACT_VECTOR_ELT: i64 extraction slot > 2"); if (EltNo == 0 && (VT == MVT::i32 || VT == MVT::i64)) { // i32 and i64: Element 0 is the preferred slot - return DAG.getNode(SPUISD::VEC2PREFSLOT, VT, N); + return DAG.getNode(SPUISD::VEC2PREFSLOT, dl, VT, N); } // Need to generate shuffle mask and extract: @@ -2128,7 +1988,7 @@ static SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) { SDValue ShufMask[4]; for (unsigned i = 0; i < sizeof(ShufMask)/sizeof(ShufMask[0]); ++i) { - unsigned bidx = i / 4; + unsigned bidx = i * 4; unsigned int bits = ((ShufBytes[bidx] << 24) | (ShufBytes[bidx+1] << 16) | (ShufBytes[bidx+2] << 8) | @@ -2136,25 +1996,25 @@ static SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) { ShufMask[i] = DAG.getConstant(bits, MVT::i32); } - SDValue ShufMaskVec = DAG.getNode(ISD::BUILD_VECTOR, MVT::v4i32, - &ShufMask[0], - sizeof(ShufMask) / 
sizeof(ShufMask[0])); + SDValue ShufMaskVec = + DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, + &ShufMask[0], sizeof(ShufMask)/sizeof(ShufMask[0])); - retval = DAG.getNode(SPUISD::VEC2PREFSLOT, VT, - DAG.getNode(SPUISD::SHUFB, N.getValueType(), + retval = DAG.getNode(SPUISD::VEC2PREFSLOT, dl, VT, + DAG.getNode(SPUISD::SHUFB, dl, N.getValueType(), N, N, ShufMaskVec)); } else { // Variable index: Rotate the requested element into slot 0, then replicate // slot 0 across the vector MVT VecVT = N.getValueType(); if (!VecVT.isSimple() || !VecVT.isVector() || !VecVT.is128BitVector()) { - cerr << "LowerEXTRACT_VECTOR_ELT: Must have a simple, 128-bit vector type!\n"; - abort(); + llvm_report_error("LowerEXTRACT_VECTOR_ELT: Must have a simple, 128-bit" + "vector type!"); } // Make life easier by making sure the index is zero-extended to i32 if (Elt.getValueType() != MVT::i32) - Elt = DAG.getNode(ISD::ZERO_EXTEND, MVT::i32, Elt); + Elt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Elt); // Scale the index to a bit/byte shift quantity APInt scaleFactor = @@ -2164,11 +2024,11 @@ static SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) { if (scaleShift > 0) { // Scale the shift factor: - Elt = DAG.getNode(ISD::SHL, MVT::i32, Elt, - DAG.getConstant(scaleShift, MVT::i32)); + Elt = DAG.getNode(ISD::SHL, dl, MVT::i32, Elt, + DAG.getConstant(scaleShift, MVT::i32)); } - vecShift = DAG.getNode(SPUISD::SHLQUAD_L_BYTES, VecVT, N, Elt); + vecShift = DAG.getNode(SPUISD::SHLQUAD_L_BYTES, dl, VecVT, N, Elt); // Replicate the bytes starting at byte 0 across the entire vector (for // consistency with the notion of a unified register set) @@ -2176,40 +2036,41 @@ static SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) { switch (VT.getSimpleVT()) { default: - cerr << "LowerEXTRACT_VECTOR_ELT(varable): Unhandled vector type\n"; - abort(); + llvm_report_error("LowerEXTRACT_VECTOR_ELT(varable): Unhandled vector" + "type"); /*NOTREACHED*/ case MVT::i8: { SDValue factor = DAG.getConstant(0x00000000, MVT::i32); - replicate = DAG.getNode(ISD::BUILD_VECTOR, MVT::v4i32, factor, factor, - factor, factor); + replicate = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, + factor, factor, factor, factor); break; } case MVT::i16: { SDValue factor = DAG.getConstant(0x00010001, MVT::i32); - replicate = DAG.getNode(ISD::BUILD_VECTOR, MVT::v4i32, factor, factor, - factor, factor); + replicate = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, + factor, factor, factor, factor); break; } case MVT::i32: case MVT::f32: { SDValue factor = DAG.getConstant(0x00010203, MVT::i32); - replicate = DAG.getNode(ISD::BUILD_VECTOR, MVT::v4i32, factor, factor, - factor, factor); + replicate = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, + factor, factor, factor, factor); break; } case MVT::i64: case MVT::f64: { SDValue loFactor = DAG.getConstant(0x00010203, MVT::i32); SDValue hiFactor = DAG.getConstant(0x04050607, MVT::i32); - replicate = DAG.getNode(ISD::BUILD_VECTOR, MVT::v4i32, loFactor, hiFactor, - loFactor, hiFactor); + replicate = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, + loFactor, hiFactor, loFactor, hiFactor); break; } } - retval = DAG.getNode(SPUISD::VEC2PREFSLOT, VT, - DAG.getNode(SPUISD::SHUFB, VecVT, vecShift, vecShift, replicate)); + retval = DAG.getNode(SPUISD::VEC2PREFSLOT, dl, VT, + DAG.getNode(SPUISD::SHUFB, dl, VecVT, + vecShift, vecShift, replicate)); } return retval; @@ -2219,326 +2080,126 @@ static SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) { SDValue VecOp = Op.getOperand(0); SDValue 
ValOp = Op.getOperand(1); SDValue IdxOp = Op.getOperand(2); + DebugLoc dl = Op.getDebugLoc(); MVT VT = Op.getValueType(); ConstantSDNode *CN = cast(IdxOp); assert(CN != 0 && "LowerINSERT_VECTOR_ELT: Index is not constant!"); MVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(); - // Use $2 because it's always 16-byte aligned and it's available: - SDValue PtrBase = DAG.getRegister(SPU::R2, PtrVT); + // Use $sp ($1) because it's always 16-byte aligned and it's available: + SDValue Pointer = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, + DAG.getRegister(SPU::R1, PtrVT), + DAG.getConstant(CN->getSExtValue(), PtrVT)); + SDValue ShufMask = DAG.getNode(SPUISD::SHUFFLE_MASK, dl, VT, Pointer); SDValue result = - DAG.getNode(SPUISD::SHUFB, VT, - DAG.getNode(ISD::SCALAR_TO_VECTOR, VT, ValOp), + DAG.getNode(SPUISD::SHUFB, dl, VT, + DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, ValOp), VecOp, - DAG.getNode(SPUISD::SHUFFLE_MASK, VT, - DAG.getNode(ISD::ADD, PtrVT, - PtrBase, - DAG.getConstant(CN->getZExtValue(), - PtrVT)))); + DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v4i32, ShufMask)); return result; } -static SDValue LowerI8Math(SDValue Op, SelectionDAG &DAG, unsigned Opc) +static SDValue LowerI8Math(SDValue Op, SelectionDAG &DAG, unsigned Opc, + const TargetLowering &TLI) { SDValue N0 = Op.getOperand(0); // Everything has at least one operand + DebugLoc dl = Op.getDebugLoc(); + MVT ShiftVT = TLI.getShiftAmountTy(); assert(Op.getValueType() == MVT::i8); switch (Opc) { default: - assert(0 && "Unhandled i8 math operator"); + llvm_unreachable("Unhandled i8 math operator"); /*NOTREACHED*/ break; + case ISD::ADD: { + // 8-bit addition: Promote the arguments up to 16-bits and truncate + // the result: + SDValue N1 = Op.getOperand(1); + N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i16, N0); + N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i16, N1); + return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, + DAG.getNode(Opc, dl, MVT::i16, N0, N1)); + + } + case ISD::SUB: { // 8-bit subtraction: Promote the arguments up to 16-bits and truncate // the result: SDValue N1 = Op.getOperand(1); - N0 = (N0.getOpcode() != ISD::Constant - ? DAG.getNode(ISD::SIGN_EXTEND, MVT::i16, N0) - : DAG.getConstant(cast(N0)->getZExtValue(), - MVT::i16)); - N1 = (N1.getOpcode() != ISD::Constant - ? DAG.getNode(ISD::SIGN_EXTEND, MVT::i16, N1) - : DAG.getConstant(cast(N1)->getZExtValue(), - MVT::i16)); - return DAG.getNode(ISD::TRUNCATE, MVT::i8, - DAG.getNode(Opc, MVT::i16, N0, N1)); + N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i16, N0); + N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i16, N1); + return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, + DAG.getNode(Opc, dl, MVT::i16, N0, N1)); } case ISD::ROTR: case ISD::ROTL: { SDValue N1 = Op.getOperand(1); - unsigned N1Opc; - N0 = (N0.getOpcode() != ISD::Constant - ? DAG.getNode(ISD::ZERO_EXTEND, MVT::i16, N0) - : DAG.getConstant(cast(N0)->getZExtValue(), - MVT::i16)); - N1Opc = N1.getValueType().bitsLT(MVT::i32) - ? ISD::ZERO_EXTEND - : ISD::TRUNCATE; - N1 = (N1.getOpcode() != ISD::Constant - ? DAG.getNode(N1Opc, MVT::i32, N1) - : DAG.getConstant(cast(N1)->getZExtValue(), - MVT::i32)); + MVT N1VT = N1.getValueType(); + + N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, N0); + if (!N1VT.bitsEq(ShiftVT)) { + unsigned N1Opc = N1.getValueType().bitsLT(ShiftVT) + ? 
ISD::ZERO_EXTEND + : ISD::TRUNCATE; + N1 = DAG.getNode(N1Opc, dl, ShiftVT, N1); + } + + // Replicate lower 8-bits into upper 8: SDValue ExpandArg = - DAG.getNode(ISD::OR, MVT::i16, N0, - DAG.getNode(ISD::SHL, MVT::i16, + DAG.getNode(ISD::OR, dl, MVT::i16, N0, + DAG.getNode(ISD::SHL, dl, MVT::i16, N0, DAG.getConstant(8, MVT::i32))); - return DAG.getNode(ISD::TRUNCATE, MVT::i8, - DAG.getNode(Opc, MVT::i16, ExpandArg, N1)); + + // Truncate back down to i8 + return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, + DAG.getNode(Opc, dl, MVT::i16, ExpandArg, N1)); } case ISD::SRL: case ISD::SHL: { SDValue N1 = Op.getOperand(1); - unsigned N1Opc; - N0 = (N0.getOpcode() != ISD::Constant - ? DAG.getNode(ISD::ZERO_EXTEND, MVT::i16, N0) - : DAG.getConstant(cast(N0)->getZExtValue(), - MVT::i16)); - N1Opc = N1.getValueType().bitsLT(MVT::i16) - ? ISD::ZERO_EXTEND - : ISD::TRUNCATE; - N1 = (N1.getOpcode() != ISD::Constant - ? DAG.getNode(N1Opc, MVT::i16, N1) - : DAG.getConstant(cast(N1)->getZExtValue(), - MVT::i16)); - return DAG.getNode(ISD::TRUNCATE, MVT::i8, - DAG.getNode(Opc, MVT::i16, N0, N1)); - } - case ISD::SRA: { - SDValue N1 = Op.getOperand(1); - unsigned N1Opc; - N0 = (N0.getOpcode() != ISD::Constant - ? DAG.getNode(ISD::SIGN_EXTEND, MVT::i16, N0) - : DAG.getConstant(cast(N0)->getZExtValue(), - MVT::i16)); - N1Opc = N1.getValueType().bitsLT(MVT::i16) - ? ISD::SIGN_EXTEND - : ISD::TRUNCATE; - N1 = (N1.getOpcode() != ISD::Constant - ? DAG.getNode(N1Opc, MVT::i16, N1) - : DAG.getConstant(cast(N1)->getZExtValue(), - MVT::i16)); - return DAG.getNode(ISD::TRUNCATE, MVT::i8, - DAG.getNode(Opc, MVT::i16, N0, N1)); - } - case ISD::MUL: { - SDValue N1 = Op.getOperand(1); - unsigned N1Opc; - N0 = (N0.getOpcode() != ISD::Constant - ? DAG.getNode(ISD::SIGN_EXTEND, MVT::i16, N0) - : DAG.getConstant(cast(N0)->getZExtValue(), - MVT::i16)); - N1Opc = N1.getValueType().bitsLT(MVT::i16) ? ISD::SIGN_EXTEND : ISD::TRUNCATE; - N1 = (N1.getOpcode() != ISD::Constant - ? 
DAG.getNode(N1Opc, MVT::i16, N1) - : DAG.getConstant(cast(N1)->getZExtValue(), - MVT::i16)); - return DAG.getNode(ISD::TRUNCATE, MVT::i8, - DAG.getNode(Opc, MVT::i16, N0, N1)); - break; - } - } + MVT N1VT = N1.getValueType(); - return SDValue(); -} + N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, N0); + if (!N1VT.bitsEq(ShiftVT)) { + unsigned N1Opc = ISD::ZERO_EXTEND; -static SDValue LowerI64Math(SDValue Op, SelectionDAG &DAG, unsigned Opc) -{ - MVT VT = Op.getValueType(); - MVT VecVT = MVT::getVectorVT(VT, (128 / VT.getSizeInBits())); - - SDValue Op0 = Op.getOperand(0); + if (N1.getValueType().bitsGT(ShiftVT)) + N1Opc = ISD::TRUNCATE; - switch (Opc) { - case ISD::ZERO_EXTEND: - case ISD::SIGN_EXTEND: - case ISD::ANY_EXTEND: { - MVT Op0VT = Op0.getValueType(); - MVT Op0VecVT = MVT::getVectorVT(Op0VT, (128 / Op0VT.getSizeInBits())); - - assert(Op0VT == MVT::i32 - && "CellSPU: Zero/sign extending something other than i32"); - - DEBUG(cerr << "CellSPU.LowerI64Math: lowering zero/sign/any extend\n"); - - SDValue PromoteScalar = - DAG.getNode(SPUISD::PROMOTE_SCALAR, Op0VecVT, Op0); - - if (Opc != ISD::SIGN_EXTEND) { - // Use a shuffle to zero extend the i32 to i64 directly: - SDValue shufMask = - DAG.getNode(ISD::BUILD_VECTOR, Op0VecVT, - DAG.getConstant(0x80808080, MVT::i32), - DAG.getConstant(0x00010203, MVT::i32), - DAG.getConstant(0x80808080, MVT::i32), - DAG.getConstant(0x08090a0b, MVT::i32)); - SDValue zextShuffle = - DAG.getNode(SPUISD::SHUFB, Op0VecVT, - PromoteScalar, PromoteScalar, shufMask); - - return DAG.getNode(SPUISD::VEC2PREFSLOT, VT, - DAG.getNode(ISD::BIT_CONVERT, VecVT, zextShuffle)); - } else { - // SPU has no "rotate quadword and replicate bit 0" (i.e. rotate/shift - // right and propagate the sign bit) instruction. - SDValue RotQuad = - DAG.getNode(SPUISD::ROTQUAD_RZ_BYTES, Op0VecVT, - PromoteScalar, DAG.getConstant(4, MVT::i32)); - SDValue SignQuad = - DAG.getNode(SPUISD::VEC_SRA, Op0VecVT, - PromoteScalar, DAG.getConstant(32, MVT::i32)); - SDValue SelMask = - DAG.getNode(SPUISD::SELECT_MASK, Op0VecVT, - DAG.getConstant(0xf0f0, MVT::i16)); - SDValue CombineQuad = - DAG.getNode(SPUISD::SELB, Op0VecVT, - SignQuad, RotQuad, SelMask); - - return DAG.getNode(SPUISD::VEC2PREFSLOT, VT, - DAG.getNode(ISD::BIT_CONVERT, VecVT, CombineQuad)); + N1 = DAG.getNode(N1Opc, dl, ShiftVT, N1); } - } - case ISD::ADD: { - // Turn operands into vectors to satisfy type checking (shufb works on - // vectors) - SDValue Op0 = - DAG.getNode(SPUISD::PROMOTE_SCALAR, MVT::v2i64, Op.getOperand(0)); - SDValue Op1 = - DAG.getNode(SPUISD::PROMOTE_SCALAR, MVT::v2i64, Op.getOperand(1)); - SmallVector ShufBytes; - - // Create the shuffle mask for "rotating" the borrow up one register slot - // once the borrow is generated. 
- ShufBytes.push_back(DAG.getConstant(0x04050607, MVT::i32)); - ShufBytes.push_back(DAG.getConstant(0x80808080, MVT::i32)); - ShufBytes.push_back(DAG.getConstant(0x0c0d0e0f, MVT::i32)); - ShufBytes.push_back(DAG.getConstant(0x80808080, MVT::i32)); - - SDValue CarryGen = - DAG.getNode(SPUISD::CARRY_GENERATE, MVT::v2i64, Op0, Op1); - SDValue ShiftedCarry = - DAG.getNode(SPUISD::SHUFB, MVT::v2i64, - CarryGen, CarryGen, - DAG.getNode(ISD::BUILD_VECTOR, MVT::v4i32, - &ShufBytes[0], ShufBytes.size())); - - return DAG.getNode(SPUISD::VEC2PREFSLOT, MVT::i64, - DAG.getNode(SPUISD::ADD_EXTENDED, MVT::v2i64, - Op0, Op1, ShiftedCarry)); + return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, + DAG.getNode(Opc, dl, MVT::i16, N0, N1)); } + case ISD::SRA: { + SDValue N1 = Op.getOperand(1); + MVT N1VT = N1.getValueType(); - case ISD::SUB: { - // Turn operands into vectors to satisfy type checking (shufb works on - // vectors) - SDValue Op0 = - DAG.getNode(SPUISD::PROMOTE_SCALAR, MVT::v2i64, Op.getOperand(0)); - SDValue Op1 = - DAG.getNode(SPUISD::PROMOTE_SCALAR, MVT::v2i64, Op.getOperand(1)); - SmallVector ShufBytes; - - // Create the shuffle mask for "rotating" the borrow up one register slot - // once the borrow is generated. - ShufBytes.push_back(DAG.getConstant(0x04050607, MVT::i32)); - ShufBytes.push_back(DAG.getConstant(0xc0c0c0c0, MVT::i32)); - ShufBytes.push_back(DAG.getConstant(0x0c0d0e0f, MVT::i32)); - ShufBytes.push_back(DAG.getConstant(0xc0c0c0c0, MVT::i32)); - - SDValue BorrowGen = - DAG.getNode(SPUISD::BORROW_GENERATE, MVT::v2i64, Op0, Op1); - SDValue ShiftedBorrow = - DAG.getNode(SPUISD::SHUFB, MVT::v2i64, - BorrowGen, BorrowGen, - DAG.getNode(ISD::BUILD_VECTOR, MVT::v4i32, - &ShufBytes[0], ShufBytes.size())); + N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i16, N0); + if (!N1VT.bitsEq(ShiftVT)) { + unsigned N1Opc = ISD::SIGN_EXTEND; - return DAG.getNode(SPUISD::VEC2PREFSLOT, MVT::i64, - DAG.getNode(SPUISD::SUB_EXTENDED, MVT::v2i64, - Op0, Op1, ShiftedBorrow)); - } + if (N1VT.bitsGT(ShiftVT)) + N1Opc = ISD::TRUNCATE; + N1 = DAG.getNode(N1Opc, dl, ShiftVT, N1); + } - case ISD::SHL: { - SDValue ShiftAmt = Op.getOperand(1); - MVT ShiftAmtVT = ShiftAmt.getValueType(); - SDValue Op0Vec = DAG.getNode(SPUISD::PROMOTE_SCALAR, VecVT, Op0); - SDValue MaskLower = - DAG.getNode(SPUISD::SELB, VecVT, - Op0Vec, - DAG.getConstant(0, VecVT), - DAG.getNode(SPUISD::SELECT_MASK, VecVT, - DAG.getConstant(0xff00ULL, MVT::i16))); - SDValue ShiftAmtBytes = - DAG.getNode(ISD::SRL, ShiftAmtVT, - ShiftAmt, - DAG.getConstant(3, ShiftAmtVT)); - SDValue ShiftAmtBits = - DAG.getNode(ISD::AND, ShiftAmtVT, - ShiftAmt, - DAG.getConstant(7, ShiftAmtVT)); - - return DAG.getNode(SPUISD::VEC2PREFSLOT, VT, - DAG.getNode(SPUISD::SHLQUAD_L_BITS, VecVT, - DAG.getNode(SPUISD::SHLQUAD_L_BYTES, VecVT, - MaskLower, ShiftAmtBytes), - ShiftAmtBits)); - } - - case ISD::SRL: { - MVT VT = Op.getValueType(); - SDValue ShiftAmt = Op.getOperand(1); - MVT ShiftAmtVT = ShiftAmt.getValueType(); - SDValue ShiftAmtBytes = - DAG.getNode(ISD::SRL, ShiftAmtVT, - ShiftAmt, - DAG.getConstant(3, ShiftAmtVT)); - SDValue ShiftAmtBits = - DAG.getNode(ISD::AND, ShiftAmtVT, - ShiftAmt, - DAG.getConstant(7, ShiftAmtVT)); - - return DAG.getNode(SPUISD::ROTQUAD_RZ_BITS, VT, - DAG.getNode(SPUISD::ROTQUAD_RZ_BYTES, VT, - Op0, ShiftAmtBytes), - ShiftAmtBits); + return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, + DAG.getNode(Opc, dl, MVT::i16, N0, N1)); } + case ISD::MUL: { + SDValue N1 = Op.getOperand(1); - case ISD::SRA: { - // Promote Op0 to vector - SDValue Op0 = - 
DAG.getNode(SPUISD::PROMOTE_SCALAR, MVT::v2i64, Op.getOperand(0)); - SDValue ShiftAmt = Op.getOperand(1); - MVT ShiftVT = ShiftAmt.getValueType(); - - // Negate variable shift amounts - if (!isa(ShiftAmt)) { - ShiftAmt = DAG.getNode(ISD::SUB, ShiftVT, - DAG.getConstant(0, ShiftVT), ShiftAmt); - } - - SDValue UpperHalfSign = - DAG.getNode(SPUISD::VEC2PREFSLOT, MVT::i32, - DAG.getNode(ISD::BIT_CONVERT, MVT::v4i32, - DAG.getNode(SPUISD::VEC_SRA, MVT::v2i64, - Op0, DAG.getConstant(31, MVT::i32)))); - SDValue UpperHalfSignMask = - DAG.getNode(SPUISD::SELECT_MASK, MVT::v2i64, UpperHalfSign); - SDValue UpperLowerMask = - DAG.getNode(SPUISD::SELECT_MASK, MVT::v2i64, - DAG.getConstant(0xff00, MVT::i16)); - SDValue UpperLowerSelect = - DAG.getNode(SPUISD::SELB, MVT::v2i64, - UpperHalfSignMask, Op0, UpperLowerMask); - SDValue RotateLeftBytes = - DAG.getNode(SPUISD::ROTBYTES_LEFT_BITS, MVT::v2i64, - UpperLowerSelect, ShiftAmt); - SDValue RotateLeftBits = - DAG.getNode(SPUISD::ROTBYTES_LEFT, MVT::v2i64, - RotateLeftBytes, ShiftAmt); - - return DAG.getNode(SPUISD::VEC2PREFSLOT, MVT::i64, - RotateLeftBits); + N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i16, N0); + N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i16, N1); + return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, + DAG.getNode(Opc, dl, MVT::i16, N0, N1)); + break; } } @@ -2551,6 +2212,7 @@ LowerByteImmed(SDValue Op, SelectionDAG &DAG) { SDValue ConstVec; SDValue Arg; MVT VT = Op.getValueType(); + DebugLoc dl = Op.getDebugLoc(); ConstVec = Op.getOperand(0); Arg = Op.getOperand(1); @@ -2567,58 +2229,32 @@ LowerByteImmed(SDValue Op, SelectionDAG &DAG) { } if (ConstVec.getNode()->getOpcode() == ISD::BUILD_VECTOR) { - uint64_t VectorBits[2]; - uint64_t UndefBits[2]; - uint64_t SplatBits, SplatUndef; - int SplatSize; - - if (!GetConstantBuildVectorBits(ConstVec.getNode(), VectorBits, UndefBits) - && isConstantSplat(VectorBits, UndefBits, - VT.getVectorElementType().getSizeInBits(), - SplatBits, SplatUndef, SplatSize)) { - SDValue tcVec[16]; + BuildVectorSDNode *BCN = dyn_cast(ConstVec.getNode()); + assert(BCN != 0 && "Expected BuildVectorSDNode in SPU LowerByteImmed"); + + APInt APSplatBits, APSplatUndef; + unsigned SplatBitSize; + bool HasAnyUndefs; + unsigned minSplatBits = VT.getVectorElementType().getSizeInBits(); + + if (BCN->isConstantSplat(APSplatBits, APSplatUndef, SplatBitSize, + HasAnyUndefs, minSplatBits) + && minSplatBits <= SplatBitSize) { + uint64_t SplatBits = APSplatBits.getZExtValue(); SDValue tc = DAG.getTargetConstant(SplatBits & 0xff, MVT::i8); - const size_t tcVecSize = sizeof(tcVec) / sizeof(tcVec[0]); - // Turn the BUILD_VECTOR into a set of target constants: - for (size_t i = 0; i < tcVecSize; ++i) - tcVec[i] = tc; - - return DAG.getNode(Op.getNode()->getOpcode(), VT, Arg, - DAG.getNode(ISD::BUILD_VECTOR, VT, tcVec, tcVecSize)); + SmallVector tcVec; + tcVec.assign(16, tc); + return DAG.getNode(Op.getNode()->getOpcode(), dl, VT, Arg, + DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &tcVec[0], tcVec.size())); } } + // These operations (AND, OR, XOR) are legal, they just couldn't be custom // lowered. Return the operation, rather than a null SDValue. return Op; } -//! 
Lower i32 multiplication -static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG, MVT VT, - unsigned Opc) { - switch (VT.getSimpleVT()) { - default: - cerr << "CellSPU: Unknown LowerMUL value type, got " - << Op.getValueType().getMVTString() - << "\n"; - abort(); - /*NOTREACHED*/ - - case MVT::i32: { - SDValue rA = Op.getOperand(0); - SDValue rB = Op.getOperand(1); - - return DAG.getNode(ISD::ADD, MVT::i32, - DAG.getNode(ISD::ADD, MVT::i32, - DAG.getNode(SPUISD::MPYH, MVT::i32, rA, rB), - DAG.getNode(SPUISD::MPYH, MVT::i32, rB, rA)), - DAG.getNode(SPUISD::MPYU, MVT::i32, rA, rB)); - } - } - - return SDValue(); -} - //! Custom lowering for CTPOP (count population) /*! Custom lowering code that counts the number ones in the input @@ -2628,6 +2264,7 @@ static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG, MVT VT, static SDValue LowerCTPOP(SDValue Op, SelectionDAG &DAG) { MVT VT = Op.getValueType(); MVT vecVT = MVT::getVectorVT(VT, (128 / VT.getSizeInBits())); + DebugLoc dl = Op.getDebugLoc(); switch (VT.getSimpleVT()) { default: @@ -2636,10 +2273,10 @@ static SDValue LowerCTPOP(SDValue Op, SelectionDAG &DAG) { SDValue N = Op.getOperand(0); SDValue Elt0 = DAG.getConstant(0, MVT::i32); - SDValue Promote = DAG.getNode(SPUISD::PROMOTE_SCALAR, vecVT, N, N); - SDValue CNTB = DAG.getNode(SPUISD::CNTB, vecVT, Promote); + SDValue Promote = DAG.getNode(SPUISD::PREFSLOT2VEC, dl, vecVT, N, N); + SDValue CNTB = DAG.getNode(SPUISD::CNTB, dl, vecVT, Promote); - return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, MVT::i8, CNTB, Elt0); + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i8, CNTB, Elt0); } case MVT::i16: { @@ -2653,22 +2290,22 @@ static SDValue LowerCTPOP(SDValue Op, SelectionDAG &DAG) { SDValue Mask0 = DAG.getConstant(0x0f, MVT::i16); SDValue Shift1 = DAG.getConstant(8, MVT::i32); - SDValue Promote = DAG.getNode(SPUISD::PROMOTE_SCALAR, vecVT, N, N); - SDValue CNTB = DAG.getNode(SPUISD::CNTB, vecVT, Promote); + SDValue Promote = DAG.getNode(SPUISD::PREFSLOT2VEC, dl, vecVT, N, N); + SDValue CNTB = DAG.getNode(SPUISD::CNTB, dl, vecVT, Promote); // CNTB_result becomes the chain to which all of the virtual registers // CNTB_reg, SUM1_reg become associated: SDValue CNTB_result = - DAG.getNode(ISD::EXTRACT_VECTOR_ELT, MVT::i16, CNTB, Elt0); + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, CNTB, Elt0); SDValue CNTB_rescopy = - DAG.getCopyToReg(CNTB_result, CNTB_reg, CNTB_result); + DAG.getCopyToReg(CNTB_result, dl, CNTB_reg, CNTB_result); - SDValue Tmp1 = DAG.getCopyFromReg(CNTB_rescopy, CNTB_reg, MVT::i16); + SDValue Tmp1 = DAG.getCopyFromReg(CNTB_rescopy, dl, CNTB_reg, MVT::i16); - return DAG.getNode(ISD::AND, MVT::i16, - DAG.getNode(ISD::ADD, MVT::i16, - DAG.getNode(ISD::SRL, MVT::i16, + return DAG.getNode(ISD::AND, dl, MVT::i16, + DAG.getNode(ISD::ADD, dl, MVT::i16, + DAG.getNode(ISD::SRL, dl, MVT::i16, Tmp1, Shift1), Tmp1), Mask0); @@ -2687,37 +2324,38 @@ static SDValue LowerCTPOP(SDValue Op, SelectionDAG &DAG) { SDValue Shift1 = DAG.getConstant(16, MVT::i32); SDValue Shift2 = DAG.getConstant(8, MVT::i32); - SDValue Promote = DAG.getNode(SPUISD::PROMOTE_SCALAR, vecVT, N, N); - SDValue CNTB = DAG.getNode(SPUISD::CNTB, vecVT, Promote); + SDValue Promote = DAG.getNode(SPUISD::PREFSLOT2VEC, dl, vecVT, N, N); + SDValue CNTB = DAG.getNode(SPUISD::CNTB, dl, vecVT, Promote); // CNTB_result becomes the chain to which all of the virtual registers // CNTB_reg, SUM1_reg become associated: SDValue CNTB_result = - DAG.getNode(ISD::EXTRACT_VECTOR_ELT, MVT::i32, CNTB, Elt0); + 
DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, CNTB, Elt0);
     SDValue CNTB_rescopy =
-      DAG.getCopyToReg(CNTB_result, CNTB_reg, CNTB_result);
+      DAG.getCopyToReg(CNTB_result, dl, CNTB_reg, CNTB_result);
     SDValue Comp1 =
-      DAG.getNode(ISD::SRL, MVT::i32,
-                  DAG.getCopyFromReg(CNTB_rescopy, CNTB_reg, MVT::i32), Shift1);
+      DAG.getNode(ISD::SRL, dl, MVT::i32,
+                  DAG.getCopyFromReg(CNTB_rescopy, dl, CNTB_reg, MVT::i32),
+                  Shift1);
     SDValue Sum1 =
-      DAG.getNode(ISD::ADD, MVT::i32,
-                  Comp1, DAG.getCopyFromReg(CNTB_rescopy, CNTB_reg, MVT::i32));
+      DAG.getNode(ISD::ADD, dl, MVT::i32, Comp1,
+                  DAG.getCopyFromReg(CNTB_rescopy, dl, CNTB_reg, MVT::i32));
     SDValue Sum1_rescopy =
-      DAG.getCopyToReg(CNTB_result, SUM1_reg, Sum1);
+      DAG.getCopyToReg(CNTB_result, dl, SUM1_reg, Sum1);
     SDValue Comp2 =
-      DAG.getNode(ISD::SRL, MVT::i32,
-                  DAG.getCopyFromReg(Sum1_rescopy, SUM1_reg, MVT::i32),
+      DAG.getNode(ISD::SRL, dl, MVT::i32,
+                  DAG.getCopyFromReg(Sum1_rescopy, dl, SUM1_reg, MVT::i32),
                   Shift2);
     SDValue Sum2 =
-      DAG.getNode(ISD::ADD, MVT::i32, Comp2,
-                  DAG.getCopyFromReg(Sum1_rescopy, SUM1_reg, MVT::i32));
+      DAG.getNode(ISD::ADD, dl, MVT::i32, Comp2,
+                  DAG.getCopyFromReg(Sum1_rescopy, dl, SUM1_reg, MVT::i32));
-    return DAG.getNode(ISD::AND, MVT::i32, Sum2, Mask0);
+    return DAG.getNode(ISD::AND, dl, MVT::i32, Sum2, Mask0);
   }
   case MVT::i64:
@@ -2727,6 +2365,182 @@ static SDValue LowerCTPOP(SDValue Op, SelectionDAG &DAG) {
   return SDValue();
 }
+//! Lower ISD::FP_TO_SINT, ISD::FP_TO_UINT for i32
+/*!
+ f32->i32 passes through unchanged, whereas f64->i32 expands to a libcall.
+ All conversions to i64 are expanded to a libcall.
+ */
+static SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG,
+                              SPUTargetLowering &TLI) {
+  MVT OpVT = Op.getValueType();
+  SDValue Op0 = Op.getOperand(0);
+  MVT Op0VT = Op0.getValueType();
+
+  if ((OpVT == MVT::i32 && Op0VT == MVT::f64)
+      || OpVT == MVT::i64) {
+    // Convert f32 / f64 to i32 / i64 via libcall.
+    RTLIB::Libcall LC =
+      (Op.getOpcode() == ISD::FP_TO_SINT)
+       ? RTLIB::getFPTOSINT(Op0VT, OpVT)
+       : RTLIB::getFPTOUINT(Op0VT, OpVT);
+    assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpected fp-to-int conversion!");
+    SDValue Dummy;
+    return ExpandLibCall(LC, Op, DAG, false, Dummy, TLI);
+  }
+
+  return Op;
+}
+
+//! Lower ISD::SINT_TO_FP, ISD::UINT_TO_FP for i32
+/*!
+ i32->f32 passes through unchanged, whereas i32->f64 is expanded to a libcall.
+ All conversions from i64 are expanded to a libcall.
+ */
+static SDValue LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG,
+                              SPUTargetLowering &TLI) {
+  MVT OpVT = Op.getValueType();
+  SDValue Op0 = Op.getOperand(0);
+  MVT Op0VT = Op0.getValueType();
+
+  if ((OpVT == MVT::f64 && Op0VT == MVT::i32)
+      || Op0VT == MVT::i64) {
+    // Convert i32, i64 to f64 via libcall:
+    RTLIB::Libcall LC =
+      (Op.getOpcode() == ISD::SINT_TO_FP)
+       ? RTLIB::getSINTTOFP(Op0VT, OpVT)
+       : RTLIB::getUINTTOFP(Op0VT, OpVT);
+    assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpected int-to-fp conversion!");
+    SDValue Dummy;
+    return ExpandLibCall(LC, Op, DAG, false, Dummy, TLI);
+  }
+
+  return Op;
+}
+
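A quick sketch of the RTLIB mapping both helpers above rely on; the symbol name is the one conventionally used by libgcc/compiler-rt for this conversion, shown only for orientation:

    RTLIB::Libcall LC = RTLIB::getFPTOSINT(MVT::f64, MVT::i32);
    // LC == RTLIB::FPTOSINT_F64_I32, whose default libcall name is
    // "__fixdfsi"; ExpandLibCall then emits an ordinary SPU call to it.

+//! Lower ISD::SETCC
+/*!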
+ This handles MVT::f64 (double floating point) condition lowering
+ */
+static SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG,
+                          const TargetLowering &TLI) {
+  CondCodeSDNode *CC = dyn_cast<CondCodeSDNode>(Op.getOperand(2));
+  DebugLoc dl = Op.getDebugLoc();
+  assert(CC != 0 && "LowerSETCC: CondCodeSDNode should not be null here!\n");
+
+  SDValue lhs = Op.getOperand(0);
+  SDValue rhs = Op.getOperand(1);
+  MVT lhsVT = lhs.getValueType();
+  assert(lhsVT == MVT::f64 && "LowerSETCC: type other than MVT::f64\n");
+
+  MVT ccResultVT = TLI.getSetCCResultType(lhs.getValueType());
+  APInt ccResultOnes = APInt::getAllOnesValue(ccResultVT.getSizeInBits());
+  MVT IntVT(MVT::i64);
+
+  // Take advantage of the fact that (truncate (sra arg, 32)) is efficiently
+  // selected to a NOP:
+  SDValue i64lhs = DAG.getNode(ISD::BIT_CONVERT, dl, IntVT, lhs);
+  SDValue lhsHi32 =
+          DAG.getNode(ISD::TRUNCATE, dl, MVT::i32,
+                      DAG.getNode(ISD::SRL, dl, IntVT,
+                                  i64lhs, DAG.getConstant(32, MVT::i32)));
+  SDValue lhsHi32abs =
+          DAG.getNode(ISD::AND, dl, MVT::i32,
+                      lhsHi32, DAG.getConstant(0x7fffffff, MVT::i32));
+  SDValue lhsLo32 =
+          DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, i64lhs);
+
+  // SETO and SETUO only use the lhs operand:
+  if (CC->get() == ISD::SETO) {
+    // Evaluates to true if Op0 is not [SQ]NaN - lowers to the inverse of
+    // SETUO
+    APInt ccResultAllOnes = APInt::getAllOnesValue(ccResultVT.getSizeInBits());
+    return DAG.getNode(ISD::XOR, dl, ccResultVT,
+                       DAG.getSetCC(dl, ccResultVT,
+                                    lhs, DAG.getConstantFP(0.0, lhsVT),
+                                    ISD::SETUO),
+                       DAG.getConstant(ccResultAllOnes, ccResultVT));
+  } else if (CC->get() == ISD::SETUO) {
+    // Evaluates to true if Op0 is [SQ]NaN
+    return DAG.getNode(ISD::AND, dl, ccResultVT,
+                       DAG.getSetCC(dl, ccResultVT,
+                                    lhsHi32abs,
+                                    DAG.getConstant(0x7ff00000, MVT::i32),
+                                    ISD::SETGE),
+                       DAG.getSetCC(dl, ccResultVT,
+                                    lhsLo32,
+                                    DAG.getConstant(0, MVT::i32),
+                                    ISD::SETGT));
+  }
+
+  SDValue i64rhs = DAG.getNode(ISD::BIT_CONVERT, dl, IntVT, rhs);
+  SDValue rhsHi32 =
+          DAG.getNode(ISD::TRUNCATE, dl, MVT::i32,
+                      DAG.getNode(ISD::SRL, dl, IntVT,
+                                  i64rhs, DAG.getConstant(32, MVT::i32)));
+
+  // If a value is negative, subtract from the sign magnitude constant:
+  SDValue signMag2TC = DAG.getConstant(0x8000000000000000ULL, IntVT);
+
+  // Convert the sign-magnitude representation into 2's complement:
+  SDValue lhsSelectMask = DAG.getNode(ISD::SRA, dl, ccResultVT,
+                                      lhsHi32, DAG.getConstant(31, MVT::i32));
+  SDValue lhsSignMag2TC = DAG.getNode(ISD::SUB, dl, IntVT, signMag2TC, i64lhs);
+  SDValue lhsSelect =
+          DAG.getNode(ISD::SELECT, dl, IntVT,
+                      lhsSelectMask, lhsSignMag2TC, i64lhs);
+
+  SDValue rhsSelectMask = DAG.getNode(ISD::SRA, dl, ccResultVT,
+                                      rhsHi32, DAG.getConstant(31, MVT::i32));
+  SDValue rhsSignMag2TC = DAG.getNode(ISD::SUB, dl, IntVT, signMag2TC, i64rhs);
+  SDValue rhsSelect =
+          DAG.getNode(ISD::SELECT, dl, IntVT,
+                      rhsSelectMask, rhsSignMag2TC, i64rhs);
+
+  unsigned compareOp;
+
+  switch (CC->get()) {
+  case ISD::SETOEQ:
+  case ISD::SETUEQ:
+    compareOp = ISD::SETEQ; break;
+  case ISD::SETOGT:
+  case ISD::SETUGT:
+    compareOp = ISD::SETGT; break;
+  case ISD::SETOGE:
+  case ISD::SETUGE:
+    compareOp = ISD::SETGE; break;
+  case ISD::SETOLT:
+  case ISD::SETULT:
+    compareOp = ISD::SETLT; break;
+  case ISD::SETOLE:
+  case ISD::SETULE:
+    compareOp = ISD::SETLE; break;
+  case ISD::SETUNE:
+  case ISD::SETONE:
+    compareOp = ISD::SETNE; break;
+  default:
+    llvm_report_error("CellSPU ISel Select: unimplemented f64 condition");
+  }
+
+  SDValue result =
+          DAG.getSetCC(dl, ccResultVT, lhsSelect, rhsSelect,
+
 //! Lower ISD::SELECT_CC
 /*!
   ISD::SELECT_CC can (generally) be implemented directly on the SPU using the
@@ -2739,20 +2553,64 @@ static SDValue LowerCTPOP(SDValue Op, SelectionDAG &DAG) {
   assumption, given the simplistic uses so far.
  */
-static SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) {
+static SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG,
+                              const TargetLowering &TLI) {
   MVT VT = Op.getValueType();
   SDValue lhs = Op.getOperand(0);
   SDValue rhs = Op.getOperand(1);
   SDValue trueval = Op.getOperand(2);
   SDValue falseval = Op.getOperand(3);
   SDValue condition = Op.getOperand(4);
+  DebugLoc dl = Op.getDebugLoc();
+
+  // NOTE: SELB's arguments: $rA, $rB, $mask
+  //
+  // SELB selects bits from $rA where bits in $mask are 0, bits from $rB
+  // where bits in $mask are 1. CCond will be inverted, having 1s where the
+  // condition was true and 0s where the condition was false. Hence, the
+  // arguments to SELB get reversed.
 
   // Note: Really should be ISD::SELECT instead of SPUISD::SELB, but LLVM's
   // legalizer insists on combining SETCC/SELECT into SELECT_CC, so we end up
   // with another "cannot select select_cc" assert:
-  SDValue compare = DAG.getNode(ISD::SETCC, VT, lhs, rhs, condition);
-  return DAG.getNode(SPUISD::SELB, VT, trueval, falseval, compare);
+  SDValue compare = DAG.getNode(ISD::SETCC, dl,
+                                TLI.getSetCCResultType(Op.getValueType()),
+                                lhs, rhs, condition);
+  return DAG.getNode(SPUISD::SELB, dl, VT, falseval, trueval, compare);
+}
+
+//! Custom lower ISD::TRUNCATE
+static SDValue LowerTRUNCATE(SDValue Op, SelectionDAG &DAG)
+{
+  // Type to truncate to
+  MVT VT = Op.getValueType();
+  MVT::SimpleValueType simpleVT = VT.getSimpleVT();
+  MVT VecVT = MVT::getVectorVT(VT, (128 / VT.getSizeInBits()));
+  DebugLoc dl = Op.getDebugLoc();
+
+  // Type to truncate from
+  SDValue Op0 = Op.getOperand(0);
+  MVT Op0VT = Op0.getValueType();
+
+  if (Op0VT.getSimpleVT() == MVT::i128 && simpleVT == MVT::i64) {
+    // Create shuffle mask, least significant doubleword of quadword
+    unsigned maskHigh = 0x08090a0b;
+    unsigned maskLow = 0x0c0d0e0f;
+    // Use a shuffle to perform the truncation
+    SDValue shufMask = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
+                                   DAG.getConstant(maskHigh, MVT::i32),
+                                   DAG.getConstant(maskLow, MVT::i32),
+                                   DAG.getConstant(maskHigh, MVT::i32),
+                                   DAG.getConstant(maskLow, MVT::i32));
+
+    SDValue truncShuffle = DAG.getNode(SPUISD::SHUFB, dl, VecVT,
+                                       Op0, Op0, shufMask);
+
+    return DAG.getNode(SPUISD::VEC2PREFSLOT, dl, VT, truncShuffle);
+  }
+
+  return SDValue();             // Leave the truncate unmolested
+}
+
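The magic numbers in LowerTRUNCATE follow from the SPU's big-endian 16-byte registers: an i128's least significant doubleword sits in bytes 8..15, and each selector byte of a SHUFB mask names the source byte to copy into that result position (the mask repeats so both doublewords of the shuffled quadword hold the same value, and VEC2PREFSLOT then reads the preferred slot). A sketch of the byte gather, an editor's illustration under a simplified in-range-selectors-only model of SHUFB, not part of this patch:

#include <cassert>
#include <cstdint>

int main() {
  // Big-endian bytes of a sample i128: byte 0 is most significant.
  uint8_t quad[16];
  for (int i = 0; i != 16; ++i)
    quad[i] = static_cast<uint8_t>(i * 0x11);   // 0x00, 0x11, ..., 0xff

  // The two mask words from LowerTRUNCATE, spelled out as byte selectors:
  const uint8_t mask[8] = { 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f };

  uint64_t result = 0;
  for (int i = 0; i != 8; ++i)
    result = (result << 8) | quad[mask[i]];     // gather bytes 8..15

  // Same value as reading bytes 8..15 directly as a big-endian u64:
  uint64_t expect = 0;
  for (int i = 8; i != 16; ++i)
    expect = (expect << 8) | quad[i];

  assert(result == expect);
  return 0;
}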
 //! Custom (target-specific) lowering entry point
@@ -2768,13 +2626,16 @@ SPUTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG)
   switch (Opc) {
   default: {
+#ifndef NDEBUG
     cerr << "SPUTargetLowering::LowerOperation(): need to lower this!\n";
     cerr << "Op.getOpcode() = " << Opc << "\n";
     cerr << "*Op.getNode():\n";
     Op.getNode()->dump();
-    abort();
+#endif
+    llvm_unreachable(0);
   }
   case ISD::LOAD:
+  case ISD::EXTLOAD:
   case ISD::SEXTLOAD:
   case ISD::ZEXTLOAD:
     return LowerLOAD(Op, DAG, SPUTM.getSubtargetImpl());
@@ -2786,12 +2647,8 @@ SPUTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG)
     return LowerGlobalAddress(Op, DAG, SPUTM.getSubtargetImpl());
   case ISD::JumpTable:
     return LowerJumpTable(Op, DAG, SPUTM.getSubtargetImpl());
-  case ISD::Constant:
-    return LowerConstant(Op, DAG);
   case ISD::ConstantFP:
     return LowerConstantFP(Op, DAG);
-  case ISD::BRCOND:
-    return LowerBRCOND(Op, DAG);
   case ISD::FORMAL_ARGUMENTS:
     return LowerFORMAL_ARGUMENTS(Op, DAG, VarArgsFrameIndex);
   case ISD::CALL:
@@ -2799,11 +2656,7 @@ SPUTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG)
   case ISD::RET:
     return LowerRET(Op, DAG, getTargetMachine());
 
-  // i8, i64 math ops:
-  case ISD::ZERO_EXTEND:
-  case ISD::SIGN_EXTEND:
-  case ISD::ANY_EXTEND:
   case ISD::ADD:
   case ISD::SUB:
   case ISD::ROTR:
@@ -2812,12 +2665,18 @@ SPUTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG)
   case ISD::SHL:
   case ISD::SRA: {
     if (VT == MVT::i8)
-      return LowerI8Math(Op, DAG, Opc);
-    else if (VT == MVT::i64)
-      return LowerI64Math(Op, DAG, Opc);
+      return LowerI8Math(Op, DAG, Opc, *this);
     break;
   }
 
+  case ISD::FP_TO_SINT:
+  case ISD::FP_TO_UINT:
+    return LowerFP_TO_INT(Op, DAG, *this);
+
+  case ISD::SINT_TO_FP:
+  case ISD::UINT_TO_FP:
+    return LowerINT_TO_FP(Op, DAG, *this);
+
   // Vector-related lowering.
   case ISD::BUILD_VECTOR:
     return LowerBUILD_VECTOR(Op, DAG);
@@ -2838,29 +2697,20 @@ SPUTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG)
 
   // Vector and i8 multiply:
   case ISD::MUL:
-    if (VT.isVector())
-      return LowerVectorMUL(Op, DAG);
-    else if (VT == MVT::i8)
-      return LowerI8Math(Op, DAG, Opc);
-    else
-      return LowerMUL(Op, DAG, VT, Opc);
-
-  case ISD::FDIV:
-    if (VT == MVT::f32 || VT == MVT::v4f32)
-      return LowerFDIVf32(Op, DAG);
-#if 0
-    // This is probably a libcall
-    else if (Op.getValueType() == MVT::f64)
-      return LowerFDIVf64(Op, DAG);
-#endif
-    else
-      assert(0 && "Calling FDIV on unsupported MVT");
+    if (VT == MVT::i8)
+      return LowerI8Math(Op, DAG, Opc, *this);
 
   case ISD::CTPOP:
     return LowerCTPOP(Op, DAG);
 
   case ISD::SELECT_CC:
-    return LowerSELECT_CC(Op, DAG);
+    return LowerSELECT_CC(Op, DAG, *this);
+
+  case ISD::SETCC:
+    return LowerSETCC(Op, DAG, *this);
+
+  case ISD::TRUNCATE:
+    return LowerTRUNCATE(Op, DAG);
   }
 
   return SDValue();
@@ -2901,53 +2751,64 @@ SPUTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const
 #endif
   const SPUSubtarget *ST = SPUTM.getSubtargetImpl();
   SelectionDAG &DAG = DCI.DAG;
-  SDValue Op0 = N->getOperand(0);       // everything has at least one operand
-  SDValue Result;                       // Initially, NULL result
+  SDValue Op0 = N->getOperand(0);       // everything has at least one operand
+  MVT NodeVT = N->getValueType(0);      // The node's value type
+  MVT Op0VT = Op0.getValueType();       // The first operand's result type
+  SDValue Result;                       // Initially, empty result
+  DebugLoc dl = N->getDebugLoc();
 
   switch (N->getOpcode()) {
   default: break;
   case ISD::ADD: {
     SDValue Op1 = N->getOperand(1);
 
-    if (isa<ConstantSDNode>(Op1) && Op0.getOpcode() == SPUISD::IndirectAddr) {
-      SDValue Op01 = Op0.getOperand(1);
-      if (Op01.getOpcode() == ISD::Constant
-          || Op01.getOpcode() == ISD::TargetConstant) {
-        // (add <const>, (SPUindirect <arg>, <const>)) ->
-        // (SPUindirect <arg>, <const + const>)
-        ConstantSDNode *CN0 = cast<ConstantSDNode>(Op1);
-        ConstantSDNode *CN1 = cast<ConstantSDNode>(Op01);
-        SDValue combinedConst =
-          DAG.getConstant(CN0->getZExtValue() + CN1->getZExtValue(),
-                          Op0.getValueType());
-
-        DEBUG(cerr << "Replace: (add " << CN0->getZExtValue() << ", "
-                   << "(SPUindirect <arg>, " << CN1->getZExtValue() << "))\n");
-        DEBUG(cerr << "With:    (SPUindirect <arg>, "
-                   << CN0->getZExtValue() + CN1->getZExtValue() << ")\n");
-        return DAG.getNode(SPUISD::IndirectAddr, Op0.getValueType(),
-                           Op0.getOperand(0), combinedConst);
+    if (Op0.getOpcode() == SPUISD::IndirectAddr
+        || Op1.getOpcode() == SPUISD::IndirectAddr) {
+      // Normalize the operands to reduce repeated code
+      SDValue IndirectArg = Op0, AddArg = Op1;
+
+      if (Op1.getOpcode() == SPUISD::IndirectAddr) {
+        IndirectArg = Op1;
+        AddArg = Op0;
       }
-    } else if (isa<ConstantSDNode>(Op0)
-               && Op1.getOpcode() == SPUISD::IndirectAddr) {
-      SDValue Op11 = Op1.getOperand(1);
-      if (Op11.getOpcode() == ISD::Constant
-          || Op11.getOpcode() == ISD::TargetConstant) {
-        // (add (SPUindirect <arg>, <const>), <const>) ->
-        // (SPUindirect <arg>, <const + const>)
-        ConstantSDNode *CN0 = cast<ConstantSDNode>(Op0);
-        ConstantSDNode *CN1 = cast<ConstantSDNode>(Op11);
-        SDValue combinedConst =
-          DAG.getConstant(CN0->getZExtValue() + CN1->getZExtValue(),
-                          Op0.getValueType());
-
-        DEBUG(cerr << "Replace: (add " << CN0->getZExtValue() << ", "
-                   << "(SPUindirect <arg>, " << CN1->getZExtValue() << "))\n");
-        DEBUG(cerr << "With:    (SPUindirect <arg>, "
-                   << CN0->getZExtValue() + CN1->getZExtValue() << ")\n");
-
-        return DAG.getNode(SPUISD::IndirectAddr, Op1.getValueType(),
-                           Op1.getOperand(0), combinedConst);
+
+      if (isa<ConstantSDNode>(AddArg)) {
+        ConstantSDNode *CN0 = cast<ConstantSDNode>(AddArg);
+        SDValue IndOp1 = IndirectArg.getOperand(1);
+
+        if (CN0->isNullValue()) {
+          // (add (SPUindirect <arg>, <val>), 0) ->
+          // (SPUindirect <arg>, <val>)
+
+#if !defined(NDEBUG)
+          if (DebugFlag && isCurrentDebugType(DEBUG_TYPE)) {
+            cerr << "\n"
+                 << "Replace: (add (SPUindirect <arg>, <val>), 0)\n"
+                 << "With:    (SPUindirect <arg>, <val>)\n";
+          }
+#endif
+
+          return IndirectArg;
+        } else if (isa<ConstantSDNode>(IndOp1)) {
+          // (add (SPUindirect <arg>, <val>), <const>) ->
+          // (SPUindirect <arg>, <val + const>)
+          ConstantSDNode *CN1 = cast<ConstantSDNode>(IndOp1);
+          int64_t combinedConst = CN0->getSExtValue() + CN1->getSExtValue();
+          SDValue combinedValue = DAG.getConstant(combinedConst, Op0VT);
+
+#if !defined(NDEBUG)
+          if (DebugFlag && isCurrentDebugType(DEBUG_TYPE)) {
+            cerr << "\n"
+                 << "Replace: (add (SPUindirect <arg>, " << CN1->getSExtValue()
+                 << "), " << CN0->getSExtValue() << ")\n"
+                 << "With:    (SPUindirect <arg>, "
+                 << combinedConst << ")\n";
+          }
+#endif
+
+          return DAG.getNode(SPUISD::IndirectAddr, dl, Op0VT,
+                             IndirectArg, combinedValue);
+        }
+      }
     }
     break;
   }
@@ -2955,16 +2816,19 @@
   case ISD::SIGN_EXTEND:
   case ISD::ZERO_EXTEND:
   case ISD::ANY_EXTEND: {
-    if (Op0.getOpcode() == SPUISD::VEC2PREFSLOT &&
-        N->getValueType(0) == Op0.getValueType()) {
+    if (Op0.getOpcode() == SPUISD::VEC2PREFSLOT && NodeVT == Op0VT) {
       // (any_extend (SPUextract_elt0 <arg>)) ->
       // (SPUextract_elt0 <arg>)
       // Types must match, however...
-      DEBUG(cerr << "Replace: ");
-      DEBUG(N->dump(&DAG));
-      DEBUG(cerr << "\nWith:    ");
-      DEBUG(Op0.getNode()->dump(&DAG));
-      DEBUG(cerr << "\n");
+#if !defined(NDEBUG)
+      if (DebugFlag && isCurrentDebugType(DEBUG_TYPE)) {
+        cerr << "\nReplace: ";
+        N->dump(&DAG);
+        cerr << "\nWith:    ";
+        Op0.getNode()->dump(&DAG);
+        cerr << "\n";
+      }
+#endif
 
       return Op0;
     }
     break;
   }
   case SPUISD::IndirectAddr: {
     if (!ST->usingLargeMem() && Op0.getOpcode() == SPUISD::AFormAddr) {
-      ConstantSDNode *CN = cast<ConstantSDNode>(N->getOperand(1));
-      if (CN->getZExtValue() == 0) {
+      ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N->getOperand(1));
+      if (CN != 0 && CN->getZExtValue() == 0) {
         // (SPUindirect (SPUaform <addr>, 0), 0) ->
         // (SPUaform <addr>, 0)
@@ -2985,6 +2849,25 @@ SPUTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const
 
         return Op0;
       }
+    } else if (Op0.getOpcode() == ISD::ADD) {
+      SDValue Op1 = N->getOperand(1);
+      if (ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(Op1)) {
+        // (SPUindirect (add <arg>, <arg>), 0) ->
+        // (SPUindirect <arg>, <arg>)
+        if (CN1->isNullValue()) {
+
+#if !defined(NDEBUG)
+          if (DebugFlag && isCurrentDebugType(DEBUG_TYPE)) {
+            cerr << "\n"
+                 << "Replace: (SPUindirect (add <arg>, <arg>), 0)\n"
+                 << "With:    (SPUindirect <arg>, <arg>)\n";
+          }
+#endif
+
+          return DAG.getNode(SPUISD::IndirectAddr, dl, Op0VT,
+                             Op0.getOperand(0), Op0.getOperand(1));
+        }
+      }
     }
     break;
   }
@@ -2993,41 +2876,38 @@
   case SPUISD::SHLQUAD_L_BITS:
   case SPUISD::SHLQUAD_L_BYTES:
   case SPUISD::VEC_SHL:
   case SPUISD::VEC_SRL:
   case SPUISD::VEC_SRA:
-  case SPUISD::ROTQUAD_RZ_BYTES:
-  case SPUISD::ROTQUAD_RZ_BITS: {
+  case SPUISD::ROTBYTES_LEFT: {
     SDValue Op1 = N->getOperand(1);
 
-    if (isa<ConstantSDNode>(Op1)) {
-      // Kill degenerate vector shifts:
-      ConstantSDNode *CN = cast<ConstantSDNode>(Op1);
-
-      if (CN->getZExtValue() == 0) {
+    // Kill degenerate vector shifts:
+    if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Op1)) {
+      if (CN->isNullValue()) {
        Result = Op0;
      }
    }
    break;
  }
-  case SPUISD::PROMOTE_SCALAR: {
+  case SPUISD::PREFSLOT2VEC: {
    switch (Op0.getOpcode()) {
    default:
      break;
    case ISD::ANY_EXTEND:
    case ISD::ZERO_EXTEND:
    case ISD::SIGN_EXTEND: {
-      // (SPUpromote_scalar (any|sign|zero_extend (SPUextract_elt0 <arg>))) ->
+      // (SPUprefslot2vec (any|zero|sign_extend (SPUvec2prefslot <arg>))) ->
       // <arg>
-      // but only if the SPUpromote_scalar and <arg> types match.
+      // but only if the SPUprefslot2vec and <arg> types match.
       SDValue Op00 = Op0.getOperand(0);
       if (Op00.getOpcode() == SPUISD::VEC2PREFSLOT) {
         SDValue Op000 = Op00.getOperand(0);
-        if (Op000.getValueType() == N->getValueType(0)) {
+        if (Op000.getValueType() == NodeVT) {
           Result = Op000;
         }
       }
       break;
     }
     case SPUISD::VEC2PREFSLOT: {
-      // (SPUpromote_scalar (SPUextract_elt0 <arg>)) ->
+      // (SPUprefslot2vec (SPUvec2prefslot <arg>)) ->
       // <arg>
       Result = Op0.getOperand(0);
       break;
@@ -3036,8 +2916,9 @@ SPUTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const
     break;
   }
   }
+
   // Otherwise, return unchanged.
-#ifdef NDEBUG
+#ifndef NDEBUG
   if (Result.getNode()) {
     DEBUG(cerr << "\nReplace.SPU: ");
     DEBUG(N->dump(&DAG));
@@ -3107,51 +2988,19 @@ SPUTargetLowering::computeMaskedBitsForTargetNode(const SDValue Op,
                                                   const SelectionDAG &DAG,
                                                   unsigned Depth ) const {
 #if 0
-  const uint64_t uint64_sizebits = sizeof(uint64_t) * 8;
-#endif
+  const uint64_t uint64_sizebits = sizeof(uint64_t) * CHAR_BIT;
 
   switch (Op.getOpcode()) {
   default:
     // KnownZero = KnownOne = APInt(Mask.getBitWidth(), 0);
     break;
-
-#if 0
   case CALL:
   case SHUFB:
   case SHUFFLE_MASK:
   case CNTB:
-#endif
-
-  case SPUISD::PROMOTE_SCALAR: {
-    SDValue Op0 = Op.getOperand(0);
-    MVT Op0VT = Op0.getValueType();
-    unsigned Op0VTBits = Op0VT.getSizeInBits();
-    uint64_t InMask = Op0VT.getIntegerVTBitMask();
-    KnownZero |= APInt(Op0VTBits, ~InMask, false);
-    KnownOne |= APInt(Op0VTBits, InMask, false);
-    break;
-  }
-
+  case SPUISD::PREFSLOT2VEC:
   case SPUISD::LDRESULT:
   case SPUISD::VEC2PREFSLOT:
-  case SPUISD::VEC2PREFSLOT_CHAINED: {
-    MVT OpVT = Op.getValueType();
-    unsigned OpVTBits = OpVT.getSizeInBits();
-    uint64_t InMask = OpVT.getIntegerVTBitMask();
-    KnownZero |= APInt(OpVTBits, ~InMask, false);
-    KnownOne |= APInt(OpVTBits, InMask, false);
-    break;
-  }
-
-#if 0
-  case EXTRACT_I1_ZEXT:
-  case EXTRACT_I1_SEXT:
-  case EXTRACT_I8_ZEXT:
-  case EXTRACT_I8_SEXT:
-  case MPY:
-  case MPYU:
-  case MPYH:
-  case MPYHH:
   case SPUISD::SHLQUAD_L_BITS:
   case SPUISD::SHLQUAD_L_BYTES:
   case SPUISD::VEC_SHL:
@@ -3159,16 +3008,28 @@ SPUTargetLowering::computeMaskedBitsForTargetNode(const SDValue Op,
   case SPUISD::VEC_SRL:
   case SPUISD::VEC_SRA:
   case SPUISD::VEC_ROTL:
   case SPUISD::VEC_ROTR:
-  case SPUISD::ROTQUAD_RZ_BYTES:
-  case SPUISD::ROTQUAD_RZ_BITS:
   case SPUISD::ROTBYTES_LEFT:
-  case SPUISD::ROTBYTES_LEFT_CHAINED:
   case SPUISD::SELECT_MASK:
   case SPUISD::SELB:
-  case SPUISD::FPInterp:
-  case SPUISD::FPRecipEst:
-  case SPUISD::SEXT32TO64:
+  }
 #endif
+}
+
+unsigned
+SPUTargetLowering::ComputeNumSignBitsForTargetNode(SDValue Op,
+                                                   unsigned Depth) const {
+  switch (Op.getOpcode()) {
+  default:
+    return 1;
+
+  case ISD::SETCC: {
+    MVT VT = Op.getValueType();
+
+    if (VT != MVT::i8 && VT != MVT::i16 && VT != MVT::i32) {
+      VT = MVT::i32;
+    }
+    return VT.getSizeInBits();
+  }
+  }
+}
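ComputeNumSignBitsForTargetNode reports VT.getSizeInBits() sign bits for a SETCC because the lowered comparisons produce all-zeros or all-ones masks, so every bit is a copy of the sign bit. A small standalone model of the quantity being tracked (editor's sketch, not part of this patch):

#include <cassert>
#include <cstdint>

// Count how many high-order bits are copies of the sign bit (always >= 1);
// this is the quantity ComputeNumSignBits propagates through the DAG.
static unsigned numSignBits(int32_t v) {
  uint32_t u = static_cast<uint32_t>(v);
  uint32_t sign = u >> 31;
  unsigned n = 1;
  while (n < 32 && ((u >> (31 - n)) & 1) == sign)
    ++n;
  return n;
}

int main() {
  int32_t ctrue = -1, cfalse = 0;      // mask-style i32 setcc results
  assert(numSignBits(ctrue) == 32);    // all bits replicate the sign bit
  assert(numSignBits(cfalse) == 32);
  assert(numSignBits(5) == 29);        // ordinary values report fewer
  return 0;
}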