X-Git-Url: http://demsky.eecs.uci.edu/git/?a=blobdiff_plain;f=lib%2FTarget%2FCellSPU%2FSPUISelLowering.cpp;h=9354f7327be643bef6e89f6c8d28a74692a2390f;hb=b78ca423844f19f4a838abb49b4b4fa7ae499707;hp=7bac3f5552abbaf28d043e846fe038330b9d328e;hpb=0b4ab0cfe028bfab50a711316bdda89918c776b9;p=oota-llvm.git diff --git a/lib/Target/CellSPU/SPUISelLowering.cpp b/lib/Target/CellSPU/SPUISelLowering.cpp index 7bac3f5552a..9354f7327be 100644 --- a/lib/Target/CellSPU/SPUISelLowering.cpp +++ b/lib/Target/CellSPU/SPUISelLowering.cpp @@ -1,4 +1,3 @@ -// //===-- SPUISelLowering.cpp - Cell SPU DAG Lowering Implementation --------===// // The LLVM Compiler Infrastructure // @@ -11,15 +10,15 @@ // //===----------------------------------------------------------------------===// -#include "SPURegisterNames.h" #include "SPUISelLowering.h" #include "SPUTargetMachine.h" -#include "SPUFrameInfo.h" +#include "SPUFrameLowering.h" #include "SPUMachineFunction.h" #include "llvm/Constants.h" #include "llvm/Function.h" #include "llvm/Intrinsics.h" #include "llvm/CallingConv.h" +#include "llvm/Type.h" #include "llvm/CodeGen/CallingConvLower.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" @@ -28,7 +27,6 @@ #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" #include "llvm/Target/TargetOptions.h" -#include "llvm/ADT/VectorExtras.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MathExtras.h" @@ -41,41 +39,12 @@ using namespace llvm; namespace { std::map node_names; - //! EVT mapping to useful data for Cell SPU - struct valtype_map_s { - EVT valtype; - int prefslot_byte; - }; - - const valtype_map_s valtype_map[] = { - { MVT::i1, 3 }, - { MVT::i8, 3 }, - { MVT::i16, 2 }, - { MVT::i32, 0 }, - { MVT::f32, 0 }, - { MVT::i64, 0 }, - { MVT::f64, 0 }, - { MVT::i128, 0 } - }; - - const size_t n_valtype_map = sizeof(valtype_map) / sizeof(valtype_map[0]); - - const valtype_map_s *getValueTypeMapEntry(EVT VT) { - const valtype_map_s *retval = 0; - - for (size_t i = 0; i < n_valtype_map; ++i) { - if (valtype_map[i].valtype == VT) { - retval = valtype_map + i; - break; - } - } - -#ifndef NDEBUG - if (retval == 0) { - report_fatal_error("getValueTypeMapEntry returns NULL for " + - Twine(VT.getEVTString())); - } -#endif + // Byte offset of the preferred slot (counted from the MSB) + int prefslotOffset(EVT VT) { + int retval=0; + if (VT==MVT::i1) retval=3; + if (VT==MVT::i8) retval=3; + if (VT==MVT::i16) retval=2; return retval; } @@ -99,7 +68,7 @@ namespace { TargetLowering::ArgListEntry Entry; for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) { EVT ArgVT = Op.getOperand(i).getValueType(); - const Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); + Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); Entry.Node = Op.getOperand(i); Entry.Ty = ArgTy; Entry.isSExt = isSigned; @@ -110,12 +79,13 @@ namespace { TLI.getPointerTy()); // Splice the libcall in wherever FindInputOutputChains tells us to. 
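// --- Illustrative sketch (editorial, not part of the patch) -----------------
// The prefslotOffset() helper added above collapses the old valtype_map table
// into a rule: sub-word scalars sit at the end of the 32-bit preferred slot,
// so i1/i8 land at byte 3, i16 at byte 2, and 32-bit-or-wider types at byte 0.
// A standalone model of that rule, checked against the removed table:

#include <cassert>

static int modelPrefslotOffset(unsigned bitWidth) {
  if (bitWidth <= 8)  return 3;   // i1, i8: last byte of the preferred slot
  if (bitWidth == 16) return 2;   // i16: last two bytes
  return 0;                       // i32/f32/i64/f64/i128: byte 0
}

int main() {
  assert(modelPrefslotOffset(1) == 3 && modelPrefslotOffset(8) == 3);
  assert(modelPrefslotOffset(16) == 2);
  assert(modelPrefslotOffset(32) == 0 && modelPrefslotOffset(128) == 0);
  return 0;
}
// ----------------------------------------------------------------------------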
- const Type *RetTy = + Type *RetTy = Op.getNode()->getValueType(0).getTypeForEVT(*DAG.getContext()); std::pair CallInfo = TLI.LowerCallTo(InChain, RetTy, isSigned, !isSigned, false, false, - 0, TLI.getLibcallCallingConv(LC), false, - /*isReturnValueUsed=*/true, + 0, TLI.getLibcallCallingConv(LC), + /*isTailCall=*/false, + /*doesNotRet=*/false, /*isReturnValueUsed=*/true, Callee, Args, DAG, Op.getDebugLoc()); return CallInfo.first; @@ -125,8 +95,6 @@ namespace { SPUTargetLowering::SPUTargetLowering(SPUTargetMachine &TM) : TargetLowering(TM, new TargetLoweringObjectFileELF()), SPUTM(TM) { - // Fold away setcc operations if possible. - setPow2DivIsCheap(); // Use _setjmp/_longjmp instead of setjmp/longjmp. setUseUnderscoreSetJmp(true); @@ -206,6 +174,7 @@ SPUTargetLowering::SPUTargetLowering(SPUTargetMachine &TM) // SPU has no intrinsics for these particular operations: setOperationAction(ISD::MEMBARRIER, MVT::Other, Expand); + setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Expand); // SPU has no division/remainder instructions setOperationAction(ISD::SREM, MVT::i8, Expand); @@ -252,6 +221,9 @@ SPUTargetLowering::SPUTargetLowering(SPUTargetMachine &TM) setOperationAction(ISD::FSQRT, MVT::f64, Expand); setOperationAction(ISD::FSQRT, MVT::f32, Expand); + setOperationAction(ISD::FMA, MVT::f64, Expand); + setOperationAction(ISD::FMA, MVT::f32, Expand); + setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand); setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand); @@ -324,12 +296,22 @@ SPUTargetLowering::SPUTargetLowering(SPUTargetMachine &TM) setOperationAction(ISD::CTTZ , MVT::i32, Expand); setOperationAction(ISD::CTTZ , MVT::i64, Expand); setOperationAction(ISD::CTTZ , MVT::i128, Expand); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i8, Expand); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16, Expand); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Expand); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Expand); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i128, Expand); setOperationAction(ISD::CTLZ , MVT::i8, Promote); setOperationAction(ISD::CTLZ , MVT::i16, Promote); setOperationAction(ISD::CTLZ , MVT::i32, Legal); setOperationAction(ISD::CTLZ , MVT::i64, Expand); setOperationAction(ISD::CTLZ , MVT::i128, Expand); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8, Expand); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16, Expand); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Expand); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Expand); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i128, Expand); // SPU has a version of select that implements (a&~c)|(b&c), just like // select ought to work: @@ -376,10 +358,10 @@ SPUTargetLowering::SPUTargetLowering(SPUTargetMachine &TM) setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom); setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom); - setOperationAction(ISD::BIT_CONVERT, MVT::i32, Legal); - setOperationAction(ISD::BIT_CONVERT, MVT::f32, Legal); - setOperationAction(ISD::BIT_CONVERT, MVT::i64, Legal); - setOperationAction(ISD::BIT_CONVERT, MVT::f64, Legal); + setOperationAction(ISD::BITCAST, MVT::i32, Legal); + setOperationAction(ISD::BITCAST, MVT::f32, Legal); + setOperationAction(ISD::BITCAST, MVT::i64, Legal); + setOperationAction(ISD::BITCAST, MVT::f64, Legal); // We cannot sextinreg(i1). Expand to shifts. 
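// --- Illustrative sketch (editorial, not part of the patch) -----------------
// The constructor hunks above form one large legalization table. A minimal
// model of what setOperationAction records and how the legalizer consults it
// (the names ActionTable/Action are hypothetical, not LLVM API):

#include <map>
#include <utility>

enum class Action { Legal, Promote, Expand, Custom };

struct ActionTable {
  std::map<std::pair<int, int>, Action> table;   // (opcode, type) -> action
  void set(int opc, int vt, Action a) { table[{opc, vt}] = a; }
  Action get(int opc, int vt) const {
    auto it = table.find({opc, vt});
    return it == table.end() ? Action::Legal : it->second;
  }
};

int main() {
  ActionTable t;
  t.set(/*ISD::FSQRT*/ 40, /*MVT::f64*/ 13, Action::Expand);
  // Legal: select as-is; Promote: widen the type; Expand: rewrite into
  // simpler nodes; Custom: call back into SPUTargetLowering::LowerOperation.
  return t.get(40, 13) == Action::Expand ? 0 : 1;
}
// ----------------------------------------------------------------------------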
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand); @@ -430,6 +412,9 @@ SPUTargetLowering::SPUTargetLowering(SPUTargetMachine &TM) i <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++i) { MVT::SimpleValueType VT = (MVT::SimpleValueType)i; + // Set operation actions to legal types only. + if (!isTypeLegal(VT)) continue; + // add/sub are legal for all supported vector VT's. setOperationAction(ISD::ADD, VT, Legal); setOperationAction(ISD::SUB, VT, Legal); @@ -439,9 +424,9 @@ SPUTargetLowering::SPUTargetLowering(SPUTargetMachine &TM) setOperationAction(ISD::AND, VT, Legal); setOperationAction(ISD::OR, VT, Legal); setOperationAction(ISD::XOR, VT, Legal); - setOperationAction(ISD::LOAD, VT, Legal); + setOperationAction(ISD::LOAD, VT, Custom); setOperationAction(ISD::SELECT, VT, Legal); - setOperationAction(ISD::STORE, VT, Legal); + setOperationAction(ISD::STORE, VT, Custom); // These operations need to be expanded: setOperationAction(ISD::SDIV, VT, Expand); @@ -449,6 +434,13 @@ SPUTargetLowering::SPUTargetLowering(SPUTargetMachine &TM) setOperationAction(ISD::UDIV, VT, Expand); setOperationAction(ISD::UREM, VT, Expand); + // Expand all trunc stores + for (unsigned j = (unsigned)MVT::FIRST_VECTOR_VALUETYPE; + j <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++j) { + MVT::SimpleValueType TargetVT = (MVT::SimpleValueType)j; + setTruncStoreAction(VT, TargetVT, Expand); + } + // Custom lower build_vector, constant pool spills, insert and // extract vector elements: setOperationAction(ISD::BUILD_VECTOR, VT, Custom); @@ -459,6 +451,8 @@ SPUTargetLowering::SPUTargetLowering(SPUTargetMachine &TM) setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); } + setOperationAction(ISD::SHL, MVT::v2i64, Expand); + setOperationAction(ISD::AND, MVT::v16i8, Custom); setOperationAction(ISD::OR, MVT::v16i8, Custom); setOperationAction(ISD::XOR, MVT::v16i8, Custom); @@ -466,8 +460,8 @@ SPUTargetLowering::SPUTargetLowering(SPUTargetMachine &TM) setOperationAction(ISD::FDIV, MVT::v4f32, Legal); - setShiftAmountType(MVT::i32); setBooleanContents(ZeroOrNegativeOneBooleanContent); + setBooleanVectorContents(ZeroOrNegativeOneBooleanContent); // FIXME: Is this correct? setStackPointerRegisterToSaveRestore(SPU::R1); @@ -477,6 +471,8 @@ SPUTargetLowering::SPUTargetLowering(SPUTargetMachine &TM) setTargetDAGCombine(ISD::SIGN_EXTEND); setTargetDAGCombine(ISD::ANY_EXTEND); + setMinFunctionAlignment(3); + computeRegisterProperties(); // Set pre-RA register scheduler default to BURR, which produces slightly @@ -502,8 +498,8 @@ SPUTargetLowering::getTargetNodeName(unsigned Opcode) const node_names[(unsigned) SPUISD::CNTB] = "SPUISD::CNTB"; node_names[(unsigned) SPUISD::PREFSLOT2VEC] = "SPUISD::PREFSLOT2VEC"; node_names[(unsigned) SPUISD::VEC2PREFSLOT] = "SPUISD::VEC2PREFSLOT"; - node_names[(unsigned) SPUISD::SHLQUAD_L_BITS] = "SPUISD::SHLQUAD_L_BITS"; - node_names[(unsigned) SPUISD::SHLQUAD_L_BYTES] = "SPUISD::SHLQUAD_L_BYTES"; + node_names[(unsigned) SPUISD::SHL_BITS] = "SPUISD::SHL_BITS"; + node_names[(unsigned) SPUISD::SHL_BYTES] = "SPUISD::SHL_BYTES"; node_names[(unsigned) SPUISD::VEC_ROTL] = "SPUISD::VEC_ROTL"; node_names[(unsigned) SPUISD::VEC_ROTR] = "SPUISD::VEC_ROTR"; node_names[(unsigned) SPUISD::ROTBYTES_LEFT] = "SPUISD::ROTBYTES_LEFT"; @@ -521,20 +517,25 @@ SPUTargetLowering::getTargetNodeName(unsigned Opcode) const return ((i != node_names.end()) ? i->second : 0); } -/// getFunctionAlignment - Return the Log2 alignment of this function. 
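// --- Illustrative sketch (editorial, not part of the patch) -----------------
// setMinFunctionAlignment(3), added above, replaces the getFunctionAlignment()
// hook removed just below; both express alignment as a log2 value, so 3 means
// 1 << 3 = 8-byte aligned function entry points:

#include <cassert>
#include <cstdint>

static uint64_t alignTo(uint64_t addr, unsigned log2Align) {
  uint64_t mask = (uint64_t(1) << log2Align) - 1;
  return (addr + mask) & ~mask;    // round up to the next aligned boundary
}

int main() {
  assert(alignTo(0x1001, 3) == 0x1008);
  assert(alignTo(0x1008, 3) == 0x1008);  // already aligned
  return 0;
}
// ----------------------------------------------------------------------------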
-unsigned SPUTargetLowering::getFunctionAlignment(const Function *) const { - return 3; -} - //===----------------------------------------------------------------------===// // Return the Cell SPU's SETCC result type //===----------------------------------------------------------------------===// -MVT::SimpleValueType SPUTargetLowering::getSetCCResultType(EVT VT) const { - // i16 and i32 are valid SETCC result types - return ((VT == MVT::i8 || VT == MVT::i16 || VT == MVT::i32) ? - VT.getSimpleVT().SimpleTy : - MVT::i32); +EVT SPUTargetLowering::getSetCCResultType(EVT VT) const { + // i8, i16 and i32 are valid SETCC result types + MVT::SimpleValueType retval; + + switch(VT.getSimpleVT().SimpleTy){ + case MVT::i1: + case MVT::i8: + retval = MVT::i8; break; + case MVT::i16: + retval = MVT::i16; break; + case MVT::i32: + default: + retval = MVT::i32; + } + return retval; } //===----------------------------------------------------------------------===// @@ -572,113 +573,175 @@ LowerLOAD(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) { EVT OutVT = Op.getValueType(); ISD::LoadExtType ExtType = LN->getExtensionType(); unsigned alignment = LN->getAlignment(); - const valtype_map_s *vtm = getValueTypeMapEntry(InVT); + int pso = prefslotOffset(InVT); DebugLoc dl = Op.getDebugLoc(); - - switch (LN->getAddressingMode()) { - case ISD::UNINDEXED: { - SDValue result; - SDValue basePtr = LN->getBasePtr(); - SDValue rotate; - - if (alignment == 16) { - ConstantSDNode *CN; - - // Special cases for a known aligned load to simplify the base pointer - // and the rotation amount: - if (basePtr.getOpcode() == ISD::ADD - && (CN = dyn_cast (basePtr.getOperand(1))) != 0) { - // Known offset into basePtr - int64_t offset = CN->getSExtValue(); - int64_t rotamt = int64_t((offset & 0xf) - vtm->prefslot_byte); - - if (rotamt < 0) - rotamt += 16; - - rotate = DAG.getConstant(rotamt, MVT::i16); - - // Simplify the base pointer for this case: - basePtr = basePtr.getOperand(0); - if ((offset & ~0xf) > 0) { - basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, - basePtr, - DAG.getConstant((offset & ~0xf), PtrVT)); - } - } else if ((basePtr.getOpcode() == SPUISD::AFormAddr) - || (basePtr.getOpcode() == SPUISD::IndirectAddr - && basePtr.getOperand(0).getOpcode() == SPUISD::Hi - && basePtr.getOperand(1).getOpcode() == SPUISD::Lo)) { - // Plain aligned a-form address: rotate into preferred slot - // Same for (SPUindirect (SPUhi ...), (SPUlo ...)) - int64_t rotamt = -vtm->prefslot_byte; - if (rotamt < 0) - rotamt += 16; - rotate = DAG.getConstant(rotamt, MVT::i16); - } else { - // Offset the rotate amount by the basePtr and the preferred slot - // byte offset - int64_t rotamt = -vtm->prefslot_byte; - if (rotamt < 0) - rotamt += 16; - rotate = DAG.getNode(ISD::ADD, dl, PtrVT, - basePtr, - DAG.getConstant(rotamt, PtrVT)); - } - } else { - // Unaligned load: must be more pessimistic about addressing modes: - if (basePtr.getOpcode() == ISD::ADD) { - MachineFunction &MF = DAG.getMachineFunction(); - MachineRegisterInfo &RegInfo = MF.getRegInfo(); - unsigned VReg = RegInfo.createVirtualRegister(&SPU::R32CRegClass); - SDValue Flag; - - SDValue Op0 = basePtr.getOperand(0); - SDValue Op1 = basePtr.getOperand(1); - - if (isa(Op1)) { - // Convert the (add , ) to an indirect address contained - // in a register. Note that this is done because we need to avoid - // creating a 0(reg) d-form address due to the SPU's block loads. 
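// --- Illustrative sketch (editorial, not part of the patch) -----------------
// For an aligned load, the rotate amount computed above combines the offset
// within the 16-byte quadword with the preferred-slot byte offset (pso),
// wrapped into [0, 16). The same arithmetic, standalone:

#include <cassert>
#include <cstdint>

static int64_t loadRotateAmount(int64_t offset, int pso) {
  int64_t rotamt = (offset & 0xf) - pso;
  if (rotamt < 0)
    rotamt += 16;
  return rotamt;
}

int main() {
  assert(loadRotateAmount(4, 0) == 4);   // i32 at offset 4: rotate left 4
  assert(loadRotateAmount(0, 2) == 14);  // i16 at offset 0: rotate left 14
  return 0;
}
// ----------------------------------------------------------------------------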
- basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, Op0, Op1); - the_chain = DAG.getCopyToReg(the_chain, dl, VReg, basePtr, Flag); - basePtr = DAG.getCopyFromReg(the_chain, dl, VReg, PtrVT); - } else { - // Convert the (add , ) to an indirect address, which - // will likely be lowered as a reg(reg) x-form address. - basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, Op0, Op1); - } - } else { + EVT vecVT = InVT.isVector()? InVT: EVT::getVectorVT(*DAG.getContext(), InVT, + (128 / InVT.getSizeInBits())); + + // two sanity checks + assert( LN->getAddressingMode() == ISD::UNINDEXED + && "we should get only UNINDEXED adresses"); + // clean aligned loads can be selected as-is + if (InVT.getSizeInBits() == 128 && (alignment%16) == 0) + return SDValue(); + + // Get pointerinfos to the memory chunk(s) that contain the data to load + uint64_t mpi_offset = LN->getPointerInfo().Offset; + mpi_offset -= mpi_offset%16; + MachinePointerInfo lowMemPtr(LN->getPointerInfo().V, mpi_offset); + MachinePointerInfo highMemPtr(LN->getPointerInfo().V, mpi_offset+16); + + SDValue result; + SDValue basePtr = LN->getBasePtr(); + SDValue rotate; + + if ((alignment%16) == 0) { + ConstantSDNode *CN; + + // Special cases for a known aligned load to simplify the base pointer + // and the rotation amount: + if (basePtr.getOpcode() == ISD::ADD + && (CN = dyn_cast (basePtr.getOperand(1))) != 0) { + // Known offset into basePtr + int64_t offset = CN->getSExtValue(); + int64_t rotamt = int64_t((offset & 0xf) - pso); + + if (rotamt < 0) + rotamt += 16; + + rotate = DAG.getConstant(rotamt, MVT::i16); + + // Simplify the base pointer for this case: + basePtr = basePtr.getOperand(0); + if ((offset & ~0xf) > 0) { basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, basePtr, - DAG.getConstant(0, PtrVT)); + DAG.getConstant((offset & ~0xf), PtrVT)); } - + } else if ((basePtr.getOpcode() == SPUISD::AFormAddr) + || (basePtr.getOpcode() == SPUISD::IndirectAddr + && basePtr.getOperand(0).getOpcode() == SPUISD::Hi + && basePtr.getOperand(1).getOpcode() == SPUISD::Lo)) { + // Plain aligned a-form address: rotate into preferred slot + // Same for (SPUindirect (SPUhi ...), (SPUlo ...)) + int64_t rotamt = -pso; + if (rotamt < 0) + rotamt += 16; + rotate = DAG.getConstant(rotamt, MVT::i16); + } else { // Offset the rotate amount by the basePtr and the preferred slot // byte offset + int64_t rotamt = -pso; + if (rotamt < 0) + rotamt += 16; rotate = DAG.getNode(ISD::ADD, dl, PtrVT, basePtr, - DAG.getConstant(-vtm->prefslot_byte, PtrVT)); + DAG.getConstant(rotamt, PtrVT)); } + } else { + // Unaligned load: must be more pessimistic about addressing modes: + if (basePtr.getOpcode() == ISD::ADD) { + MachineFunction &MF = DAG.getMachineFunction(); + MachineRegisterInfo &RegInfo = MF.getRegInfo(); + unsigned VReg = RegInfo.createVirtualRegister(&SPU::R32CRegClass); + SDValue Flag; + + SDValue Op0 = basePtr.getOperand(0); + SDValue Op1 = basePtr.getOperand(1); + + if (isa(Op1)) { + // Convert the (add , ) to an indirect address contained + // in a register. Note that this is done because we need to avoid + // creating a 0(reg) d-form address due to the SPU's block loads. + basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, Op0, Op1); + the_chain = DAG.getCopyToReg(the_chain, dl, VReg, basePtr, Flag); + basePtr = DAG.getCopyFromReg(the_chain, dl, VReg, PtrVT); + } else { + // Convert the (add , ) to an indirect address, which + // will likely be lowered as a reg(reg) x-form address. 
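// --- Illustrative sketch (editorial, not part of the patch) -----------------
// lowMemPtr/highMemPtr above describe the two 16-byte chunks that may hold a
// misaligned value: the access offset rounded down to a multiple of 16, and
// the chunk 16 bytes above it. The address arithmetic, standalone:

#include <cassert>
#include <cstdint>

struct ChunkPair { uint64_t low, high; };

static ChunkPair chunksFor(uint64_t byteOffset) {
  uint64_t low = byteOffset - byteOffset % 16;   // round down to 16
  return { low, low + 16 };
}

int main() {
  ChunkPair c = chunksFor(0x2c);                 // e.g. an i64 at offset 0x2c
  assert(c.low == 0x20 && c.high == 0x30);
  return 0;
}
// ----------------------------------------------------------------------------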
+ basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, Op0, Op1); + } + } else { + basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, + basePtr, + DAG.getConstant(0, PtrVT)); + } - // Re-emit as a v16i8 vector load - result = DAG.getLoad(MVT::v16i8, dl, the_chain, basePtr, - LN->getSrcValue(), LN->getSrcValueOffset(), - LN->isVolatile(), LN->isNonTemporal(), 16); + // Offset the rotate amount by the basePtr and the preferred slot + // byte offset + rotate = DAG.getNode(ISD::ADD, dl, PtrVT, + basePtr, + DAG.getConstant(-pso, PtrVT)); + } + + // Do the load as a i128 to allow possible shifting + SDValue low = DAG.getLoad(MVT::i128, dl, the_chain, basePtr, + lowMemPtr, + LN->isVolatile(), LN->isNonTemporal(), false, 16); + // When the size is not greater than alignment we get all data with just + // one load + if (alignment >= InVT.getSizeInBits()/8) { // Update the chain - the_chain = result.getValue(1); + the_chain = low.getValue(1); // Rotate into the preferred slot: - result = DAG.getNode(SPUISD::ROTBYTES_LEFT, dl, MVT::v16i8, - result.getValue(0), rotate); + result = DAG.getNode(SPUISD::ROTBYTES_LEFT, dl, MVT::i128, + low.getValue(0), rotate); // Convert the loaded v16i8 vector to the appropriate vector type // specified by the operand: - EVT vecVT = EVT::getVectorVT(*DAG.getContext(), + EVT vecVT = EVT::getVectorVT(*DAG.getContext(), InVT, (128 / InVT.getSizeInBits())); result = DAG.getNode(SPUISD::VEC2PREFSLOT, dl, InVT, - DAG.getNode(ISD::BIT_CONVERT, dl, vecVT, result)); + DAG.getNode(ISD::BITCAST, dl, vecVT, result)); + } + // When alignment is less than the size, we might need (known only at + // run-time) two loads + // TODO: if the memory address is composed only from constants, we have + // extra kowledge, and might avoid the second load + else { + // storage position offset from lower 16 byte aligned memory chunk + SDValue offset = DAG.getNode(ISD::AND, dl, MVT::i32, + basePtr, DAG.getConstant( 0xf, MVT::i32 ) ); + // get a registerfull of ones. (this implementation is a workaround: LLVM + // cannot handle 128 bit signed int constants) + SDValue ones = DAG.getConstant(-1, MVT::v4i32 ); + ones = DAG.getNode(ISD::BITCAST, dl, MVT::i128, ones); + + SDValue high = DAG.getLoad(MVT::i128, dl, the_chain, + DAG.getNode(ISD::ADD, dl, PtrVT, + basePtr, + DAG.getConstant(16, PtrVT)), + highMemPtr, + LN->isVolatile(), LN->isNonTemporal(), false, + 16); + + the_chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, low.getValue(1), + high.getValue(1)); + + // Shift the (possible) high part right to compensate the misalignemnt. + // if there is no highpart (i.e. value is i64 and offset is 4), this + // will zero out the high value. 
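// --- Illustrative sketch (editorial, not part of the patch) -----------------
// The SRL_BYTES/SHL_BYTES sequence just below merges the two quadwords: the
// low chunk is shifted left by `offset` bytes, the high chunk right by
// 16 - offset, and the results are OR'd. A byte-level model (offset < 16):

#include <cassert>

static void mergeQuads(const unsigned char low[16],
                       const unsigned char high[16],
                       unsigned offset, unsigned char out[16]) {
  for (unsigned i = 0; i < 16; ++i) {
    unsigned src = offset + i;
    out[i] = (src < 16) ? low[src] : high[src - 16];  // cross into high chunk
  }
}

int main() {
  unsigned char lo[16], hi[16], out[16];
  for (int i = 0; i < 16; ++i) { lo[i] = i; hi[i] = 16 + i; }
  mergeQuads(lo, hi, 12, out);
  assert(out[0] == 12 && out[4] == 16);  // bytes 12..15, then 16..19
  return 0;
}
// ----------------------------------------------------------------------------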
+ high = DAG.getNode(SPUISD::SRL_BYTES, dl, MVT::i128, high, + DAG.getNode(ISD::SUB, dl, MVT::i32, + DAG.getConstant( 16, MVT::i32), + offset + )); + + // Shift the low similarly + // TODO: add SPUISD::SHL_BYTES + low = DAG.getNode(SPUISD::SHL_BYTES, dl, MVT::i128, low, offset ); + + // Merge the two parts + result = DAG.getNode(ISD::BITCAST, dl, vecVT, + DAG.getNode(ISD::OR, dl, MVT::i128, low, high)); + + if (!InVT.isVector()) { + result = DAG.getNode(SPUISD::VEC2PREFSLOT, dl, InVT, result ); + } + } // Handle extending loads by extending the scalar result: if (ExtType == ISD::SEXTLOAD) { result = DAG.getNode(ISD::SIGN_EXTEND, dl, OutVT, result); @@ -702,21 +765,6 @@ LowerLOAD(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) { result = DAG.getNode(SPUISD::LDRESULT, dl, retvts, retops, sizeof(retops) / sizeof(retops[0])); return result; - } - case ISD::PRE_INC: - case ISD::PRE_DEC: - case ISD::POST_INC: - case ISD::POST_DEC: - case ISD::LAST_INDEXED_MODE: - { - report_fatal_error("LowerLOAD: Got a LoadSDNode with an addr mode other " - "than UNINDEXED\n" + - Twine((unsigned)LN->getAddressingMode())); - /*NOTREACHED*/ - } - } - - return SDValue(); } /// Custom lower stores for CellSPU @@ -734,93 +782,104 @@ LowerSTORE(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) { EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(); DebugLoc dl = Op.getDebugLoc(); unsigned alignment = SN->getAlignment(); + SDValue result; + EVT vecVT = StVT.isVector()? StVT: EVT::getVectorVT(*DAG.getContext(), StVT, + (128 / StVT.getSizeInBits())); + // Get pointerinfos to the memory chunk(s) that contain the data to load + uint64_t mpi_offset = SN->getPointerInfo().Offset; + mpi_offset -= mpi_offset%16; + MachinePointerInfo lowMemPtr(SN->getPointerInfo().V, mpi_offset); + MachinePointerInfo highMemPtr(SN->getPointerInfo().V, mpi_offset+16); + + + // two sanity checks + assert( SN->getAddressingMode() == ISD::UNINDEXED + && "we should get only UNINDEXED adresses"); + // clean aligned loads can be selected as-is + if (StVT.getSizeInBits() == 128 && (alignment%16) == 0) + return SDValue(); + + SDValue alignLoadVec; + SDValue basePtr = SN->getBasePtr(); + SDValue the_chain = SN->getChain(); + SDValue insertEltOffs; + + if ((alignment%16) == 0) { + ConstantSDNode *CN; + // Special cases for a known aligned load to simplify the base pointer + // and insertion byte: + if (basePtr.getOpcode() == ISD::ADD + && (CN = dyn_cast(basePtr.getOperand(1))) != 0) { + // Known offset into basePtr + int64_t offset = CN->getSExtValue(); + + // Simplify the base pointer for this case: + basePtr = basePtr.getOperand(0); + insertEltOffs = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, + basePtr, + DAG.getConstant((offset & 0xf), PtrVT)); - switch (SN->getAddressingMode()) { - case ISD::UNINDEXED: { - // The vector type we really want to load from the 16-byte chunk. 
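// --- Illustrative sketch (editorial, not part of the patch) -----------------
// Once the value is rotated into its slot, extending loads above simply
// extend the scalar result. The SEXTLOAD/ZEXTLOAD distinction in plain C++:

#include <cassert>
#include <cstdint>

int main() {
  uint8_t loaded = 0xF0;                     // byte fetched from memory
  int32_t  sext = int32_t(int8_t(loaded));   // ISD::SEXTLOAD: replicate sign
  uint32_t zext = uint32_t(loaded);          // ISD::ZEXTLOAD: fill with zeros
  assert(sext == -16);
  assert(zext == 0xF0u);
  return 0;
}
// ----------------------------------------------------------------------------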
- EVT vecVT = EVT::getVectorVT(*DAG.getContext(), - VT, (128 / VT.getSizeInBits())); - - SDValue alignLoadVec; - SDValue basePtr = SN->getBasePtr(); - SDValue the_chain = SN->getChain(); - SDValue insertEltOffs; - - if (alignment == 16) { - ConstantSDNode *CN; - // Special cases for a known aligned load to simplify the base pointer - // and insertion byte: - if (basePtr.getOpcode() == ISD::ADD - && (CN = dyn_cast(basePtr.getOperand(1))) != 0) { - // Known offset into basePtr - int64_t offset = CN->getSExtValue(); - - // Simplify the base pointer for this case: - basePtr = basePtr.getOperand(0); - insertEltOffs = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, - basePtr, - DAG.getConstant((offset & 0xf), PtrVT)); - - if ((offset & ~0xf) > 0) { - basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, - basePtr, - DAG.getConstant((offset & ~0xf), PtrVT)); - } - } else { - // Otherwise, assume it's at byte 0 of basePtr - insertEltOffs = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, - basePtr, - DAG.getConstant(0, PtrVT)); - basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, - basePtr, - DAG.getConstant(0, PtrVT)); - } - } else { - // Unaligned load: must be more pessimistic about addressing modes: - if (basePtr.getOpcode() == ISD::ADD) { - MachineFunction &MF = DAG.getMachineFunction(); - MachineRegisterInfo &RegInfo = MF.getRegInfo(); - unsigned VReg = RegInfo.createVirtualRegister(&SPU::R32CRegClass); - SDValue Flag; - - SDValue Op0 = basePtr.getOperand(0); - SDValue Op1 = basePtr.getOperand(1); - - if (isa(Op1)) { - // Convert the (add , ) to an indirect address contained - // in a register. Note that this is done because we need to avoid - // creating a 0(reg) d-form address due to the SPU's block loads. - basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, Op0, Op1); - the_chain = DAG.getCopyToReg(the_chain, dl, VReg, basePtr, Flag); - basePtr = DAG.getCopyFromReg(the_chain, dl, VReg, PtrVT); - } else { - // Convert the (add , ) to an indirect address, which - // will likely be lowered as a reg(reg) x-form address. - basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, Op0, Op1); - } - } else { + if ((offset & ~0xf) > 0) { basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, basePtr, - DAG.getConstant(0, PtrVT)); + DAG.getConstant((offset & ~0xf), PtrVT)); } - - // Insertion point is solely determined by basePtr's contents - insertEltOffs = DAG.getNode(ISD::ADD, dl, PtrVT, + } else { + // Otherwise, assume it's at byte 0 of basePtr + insertEltOffs = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, + basePtr, + DAG.getConstant(0, PtrVT)); + basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, basePtr, DAG.getConstant(0, PtrVT)); } + } else { + // Unaligned load: must be more pessimistic about addressing modes: + if (basePtr.getOpcode() == ISD::ADD) { + MachineFunction &MF = DAG.getMachineFunction(); + MachineRegisterInfo &RegInfo = MF.getRegInfo(); + unsigned VReg = RegInfo.createVirtualRegister(&SPU::R32CRegClass); + SDValue Flag; + + SDValue Op0 = basePtr.getOperand(0); + SDValue Op1 = basePtr.getOperand(1); + + if (isa(Op1)) { + // Convert the (add , ) to an indirect address contained + // in a register. Note that this is done because we need to avoid + // creating a 0(reg) d-form address due to the SPU's block loads. 
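// --- Illustrative sketch (editorial, not part of the patch) -----------------
// Scalar stores on the SPU are read-modify-write on the containing quadword:
// load the 16-byte chunk, splice the value in at its byte offset, write the
// chunk back. A byte-array model of the sequence being lowered here
// (assumes the value fits in one chunk, i.e. byteOffs + size <= 16):

#include <cassert>
#include <cstring>

static void storeScalar(unsigned char quad[16], unsigned byteOffs,
                        const void *val, unsigned size) {
  std::memcpy(quad + byteOffs, val, size);   // load -> insert -> store
}

int main() {
  unsigned char quad[16] = {0};
  unsigned v = 0xAABBCCDDu;
  storeScalar(quad, 4, &v, sizeof v);        // an i32 store at offset 4
  assert(std::memcmp(quad + 4, &v, 4) == 0);
  assert(quad[0] == 0 && quad[8] == 0);      // neighbours untouched
  return 0;
}
// ----------------------------------------------------------------------------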
+ basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, Op0, Op1); + the_chain = DAG.getCopyToReg(the_chain, dl, VReg, basePtr, Flag); + basePtr = DAG.getCopyFromReg(the_chain, dl, VReg, PtrVT); + } else { + // Convert the (add , ) to an indirect address, which + // will likely be lowered as a reg(reg) x-form address. + basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, Op0, Op1); + } + } else { + basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, + basePtr, + DAG.getConstant(0, PtrVT)); + } + + // Insertion point is solely determined by basePtr's contents + insertEltOffs = DAG.getNode(ISD::ADD, dl, PtrVT, + basePtr, + DAG.getConstant(0, PtrVT)); + } - // Load the memory to which to store. - alignLoadVec = DAG.getLoad(vecVT, dl, the_chain, basePtr, - SN->getSrcValue(), SN->getSrcValueOffset(), - SN->isVolatile(), SN->isNonTemporal(), 16); + // Load the lower part of the memory to which to store. + SDValue low = DAG.getLoad(vecVT, dl, the_chain, basePtr, + lowMemPtr, SN->isVolatile(), SN->isNonTemporal(), + false, 16); + // if we don't need to store over the 16 byte boundary, one store suffices + if (alignment >= StVT.getSizeInBits()/8) { // Update the chain - the_chain = alignLoadVec.getValue(1); + the_chain = low.getValue(1); - LoadSDNode *LN = cast(alignLoadVec); + LoadSDNode *LN = cast(low); SDValue theValue = SN->getValue(); - SDValue result; if (StVT != VT && (theValue.getOpcode() == ISD::AssertZext @@ -844,48 +903,115 @@ LowerSTORE(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) { SDValue insertEltOp = DAG.getNode(SPUISD::SHUFFLE_MASK, dl, vecVT, insertEltOffs); - SDValue vectorizeOp = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, vecVT, + SDValue vectorizeOp = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, vecVT, theValue); result = DAG.getNode(SPUISD::SHUFB, dl, vecVT, - vectorizeOp, alignLoadVec, - DAG.getNode(ISD::BIT_CONVERT, dl, + vectorizeOp, low, + DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, insertEltOp)); result = DAG.getStore(the_chain, dl, result, basePtr, - LN->getSrcValue(), LN->getSrcValueOffset(), + lowMemPtr, LN->isVolatile(), LN->isNonTemporal(), - LN->getAlignment()); - -#if 0 && !defined(NDEBUG) - if (DebugFlag && isCurrentDebugType(DEBUG_TYPE)) { - const SDValue ¤tRoot = DAG.getRoot(); - - DAG.setRoot(result); - errs() << "------- CellSPU:LowerStore result:\n"; - DAG.dump(); - errs() << "-------\n"; - DAG.setRoot(currentRoot); - } -#endif - - return result; - /*UNREACHED*/ - } - case ISD::PRE_INC: - case ISD::PRE_DEC: - case ISD::POST_INC: - case ISD::POST_DEC: - case ISD::LAST_INDEXED_MODE: - { - report_fatal_error("LowerLOAD: Got a LoadSDNode with an addr mode other " - "than UNINDEXED\n" + - Twine((unsigned)SN->getAddressingMode())); - /*NOTREACHED*/ - } + 16); + + } + // do the store when it might cross the 16 byte memory access boundary. + else { + // TODO issue a warning if SN->isVolatile()== true? This is likely not + // what the user wanted. 
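// --- Illustrative sketch (editorial, not part of the patch) -----------------
// The SHUFFLE_MASK + SHUFB pair above implements the splice: each result
// byte selects either a byte of the vectorized value or a byte of the old
// quadword. A simplified model of SHUFB's per-byte selection (ignoring the
// special selector codes that produce constant 0x00/0xFF/0x80 bytes):

#include <cassert>

static void shufb(const unsigned char a[16], const unsigned char b[16],
                  const unsigned char mask[16], unsigned char out[16]) {
  for (int i = 0; i < 16; ++i)
    out[i] = (mask[i] < 16) ? a[mask[i]] : b[mask[i] - 16];
}

int main() {
  unsigned char a[16], b[16], mask[16], out[16];
  for (int i = 0; i < 16; ++i) { a[i] = 0xA0 + i; b[i] = 0xB0 + i; mask[i] = 16 + i; }
  mask[4] = 0; mask[5] = 1; mask[6] = 2; mask[7] = 3;  // insert a[0..3] at byte 4
  shufb(a, b, mask, out);
  assert(out[3] == 0xB3 && out[4] == 0xA0 && out[8] == 0xB8);
  return 0;
}
// ----------------------------------------------------------------------------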
+ + // address offset from nearest lower 16byte alinged address + SDValue offset = DAG.getNode(ISD::AND, dl, MVT::i32, + SN->getBasePtr(), + DAG.getConstant(0xf, MVT::i32)); + // 16 - offset + SDValue offset_compl = DAG.getNode(ISD::SUB, dl, MVT::i32, + DAG.getConstant( 16, MVT::i32), + offset); + // 16 - sizeof(Value) + SDValue surplus = DAG.getNode(ISD::SUB, dl, MVT::i32, + DAG.getConstant( 16, MVT::i32), + DAG.getConstant( VT.getSizeInBits()/8, + MVT::i32)); + // get a registerfull of ones + SDValue ones = DAG.getConstant(-1, MVT::v4i32); + ones = DAG.getNode(ISD::BITCAST, dl, MVT::i128, ones); + + // Create the 128 bit masks that have ones where the data to store is + // located. + SDValue lowmask, himask; + // if the value to store don't fill up the an entire 128 bits, zero + // out the last bits of the mask so that only the value we want to store + // is masked. + // this is e.g. in the case of store i32, align 2 + if (!VT.isVector()){ + Value = DAG.getNode(SPUISD::PREFSLOT2VEC, dl, vecVT, Value); + lowmask = DAG.getNode(SPUISD::SRL_BYTES, dl, MVT::i128, ones, surplus); + lowmask = DAG.getNode(SPUISD::SHL_BYTES, dl, MVT::i128, lowmask, + surplus); + Value = DAG.getNode(ISD::BITCAST, dl, MVT::i128, Value); + Value = DAG.getNode(ISD::AND, dl, MVT::i128, Value, lowmask); + + } + else { + lowmask = ones; + Value = DAG.getNode(ISD::BITCAST, dl, MVT::i128, Value); + } + // this will zero, if there are no data that goes to the high quad + himask = DAG.getNode(SPUISD::SHL_BYTES, dl, MVT::i128, lowmask, + offset_compl); + lowmask = DAG.getNode(SPUISD::SRL_BYTES, dl, MVT::i128, lowmask, + offset); + + // Load in the old data and zero out the parts that will be overwritten with + // the new data to store. + SDValue hi = DAG.getLoad(MVT::i128, dl, the_chain, + DAG.getNode(ISD::ADD, dl, PtrVT, basePtr, + DAG.getConstant( 16, PtrVT)), + highMemPtr, + SN->isVolatile(), SN->isNonTemporal(), + false, 16); + the_chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, low.getValue(1), + hi.getValue(1)); + + low = DAG.getNode(ISD::AND, dl, MVT::i128, + DAG.getNode( ISD::BITCAST, dl, MVT::i128, low), + DAG.getNode( ISD::XOR, dl, MVT::i128, lowmask, ones)); + hi = DAG.getNode(ISD::AND, dl, MVT::i128, + DAG.getNode( ISD::BITCAST, dl, MVT::i128, hi), + DAG.getNode( ISD::XOR, dl, MVT::i128, himask, ones)); + + // Shift the Value to store into place. rlow contains the parts that go to + // the lower memory chunk, rhi has the parts that go to the upper one. + SDValue rlow = DAG.getNode(SPUISD::SRL_BYTES, dl, MVT::i128, Value, offset); + rlow = DAG.getNode(ISD::AND, dl, MVT::i128, rlow, lowmask); + SDValue rhi = DAG.getNode(SPUISD::SHL_BYTES, dl, MVT::i128, Value, + offset_compl); + + // Merge the old data and the new data and store the results + // Need to convert vectors here to integer as 'OR'ing floats assert + rlow = DAG.getNode(ISD::OR, dl, MVT::i128, + DAG.getNode(ISD::BITCAST, dl, MVT::i128, low), + DAG.getNode(ISD::BITCAST, dl, MVT::i128, rlow)); + rhi = DAG.getNode(ISD::OR, dl, MVT::i128, + DAG.getNode(ISD::BITCAST, dl, MVT::i128, hi), + DAG.getNode(ISD::BITCAST, dl, MVT::i128, rhi)); + + low = DAG.getStore(the_chain, dl, rlow, basePtr, + lowMemPtr, + SN->isVolatile(), SN->isNonTemporal(), 16); + hi = DAG.getStore(the_chain, dl, rhi, + DAG.getNode(ISD::ADD, dl, PtrVT, basePtr, + DAG.getConstant( 16, PtrVT)), + highMemPtr, + SN->isVolatile(), SN->isNonTemporal(), 16); + result = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, low.getValue(0), + hi.getValue(0)); } - return SDValue(); + return result; } //! 
Generate the address of a constant pool entry. @@ -913,7 +1039,6 @@ LowerConstantPool(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) { llvm_unreachable("LowerConstantPool: Relocation model other than static" " not supported."); - return SDValue(); } //! Alternate entry point for generating the address of a constant pool entry @@ -944,7 +1069,6 @@ LowerJumpTable(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) { llvm_unreachable("LowerJumpTable: Relocation model other than static" " not supported."); - return SDValue(); } static SDValue @@ -972,8 +1096,6 @@ LowerGlobalAddress(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) { "not supported."); /*NOTREACHED*/ } - - return SDValue(); } //! Custom lower double precision floating point constants @@ -993,7 +1115,7 @@ LowerConstantFP(SDValue Op, SelectionDAG &DAG) { SDValue T = DAG.getConstant(dbits, MVT::i64); SDValue Tvec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i64, T, T); return DAG.getNode(SPUISD::VEC2PREFSLOT, dl, VT, - DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2f64, Tvec)); + DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Tvec)); } return SDValue(); @@ -1013,15 +1135,15 @@ SPUTargetLowering::LowerFormalArguments(SDValue Chain, MachineRegisterInfo &RegInfo = MF.getRegInfo(); SPUFunctionInfo *FuncInfo = MF.getInfo(); - unsigned ArgOffset = SPUFrameInfo::minStackSize(); + unsigned ArgOffset = SPUFrameLowering::minStackSize(); unsigned ArgRegIdx = 0; - unsigned StackSlotSize = SPUFrameInfo::stackSlotSize(); + unsigned StackSlotSize = SPUFrameLowering::stackSlotSize(); EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(); SmallVector ArgLocs; - CCState CCInfo(CallConv, isVarArg, getTargetMachine(), ArgLocs, - *DAG.getContext()); + CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), + getTargetMachine(), ArgLocs, *DAG.getContext()); // FIXME: allow for other calling conventions CCInfo.AnalyzeFormalArguments(Ins, CCC_SPU); @@ -1080,7 +1202,8 @@ SPUTargetLowering::LowerFormalArguments(SDValue Chain, // or we're forced to do vararg int FI = MFI->CreateFixedObject(ObjSize, ArgOffset, true); SDValue FIN = DAG.getFrameIndex(FI, PtrVT); - ArgVal = DAG.getLoad(ObjectVT, dl, Chain, FIN, NULL, 0, false, false, 0); + ArgVal = DAG.getLoad(ObjectVT, dl, Chain, FIN, MachinePointerInfo(), + false, false, false, 0); ArgOffset += StackSlotSize; } @@ -1091,9 +1214,9 @@ SPUTargetLowering::LowerFormalArguments(SDValue Chain, // vararg handling: if (isVarArg) { - // FIXME: we should be able to query the argument registers from - // tablegen generated code. - static const unsigned ArgRegs[] = { + // FIXME: we should be able to query the argument registers from + // tablegen generated code. 
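// --- Illustrative sketch (editorial, not part of the patch) -----------------
// The array just below enumerates the SPU argument registers R3..R79, and
// the vararg path spills whatever remains of them to fixed stack slots.
// The hard-coded NumArgRegs = 77 is exactly that register count:

#include <cassert>

int main() {
  const unsigned firstArgReg = 3, lastArgReg = 79;   // R3 .. R79
  assert(lastArgReg - firstArgReg + 1 == 77);        // "(79-3)+1 registers"
  return 0;
}
// ----------------------------------------------------------------------------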
+ static const uint16_t ArgRegs[] = { SPU::R3, SPU::R4, SPU::R5, SPU::R6, SPU::R7, SPU::R8, SPU::R9, SPU::R10, SPU::R11, SPU::R12, SPU::R13, SPU::R14, SPU::R15, SPU::R16, SPU::R17, SPU::R18, SPU::R19, SPU::R20, SPU::R21, SPU::R22, SPU::R23, @@ -1107,7 +1230,7 @@ SPUTargetLowering::LowerFormalArguments(SDValue Chain, SPU::R73, SPU::R74, SPU::R75, SPU::R76, SPU::R77, SPU::R78, SPU::R79 }; // size of ArgRegs array - unsigned NumArgRegs = 77; + const unsigned NumArgRegs = 77; // We will spill (79-3)+1 registers to the stack SmallVector MemOps; @@ -1117,9 +1240,9 @@ SPUTargetLowering::LowerFormalArguments(SDValue Chain, FuncInfo->setVarArgsFrameIndex( MFI->CreateFixedObject(StackSlotSize, ArgOffset, true)); SDValue FIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT); - unsigned VReg = MF.addLiveIn(ArgRegs[ArgRegIdx], &SPU::R32CRegClass); + unsigned VReg = MF.addLiveIn(ArgRegs[ArgRegIdx], &SPU::VECREGRegClass); SDValue ArgVal = DAG.getRegister(VReg, MVT::v16i8); - SDValue Store = DAG.getStore(Chain, dl, ArgVal, FIN, NULL, 0, + SDValue Store = DAG.getStore(Chain, dl, ArgVal, FIN, MachinePointerInfo(), false, false, 0); Chain = Store.getOperand(0); MemOps.push_back(Store); @@ -1152,7 +1275,7 @@ static SDNode *isLSAAddress(SDValue Op, SelectionDAG &DAG) { SDValue SPUTargetLowering::LowerCall(SDValue Chain, SDValue Callee, CallingConv::ID CallConv, bool isVarArg, - bool &isTailCall, + bool doesNotRet, bool &isTailCall, const SmallVectorImpl &Outs, const SmallVectorImpl &OutVals, const SmallVectorImpl &Ins, @@ -1163,14 +1286,14 @@ SPUTargetLowering::LowerCall(SDValue Chain, SDValue Callee, const SPUSubtarget *ST = SPUTM.getSubtargetImpl(); unsigned NumOps = Outs.size(); - unsigned StackSlotSize = SPUFrameInfo::stackSlotSize(); + unsigned StackSlotSize = SPUFrameLowering::stackSlotSize(); SmallVector ArgLocs; - CCState CCInfo(CallConv, isVarArg, getTargetMachine(), ArgLocs, - *DAG.getContext()); + CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), + getTargetMachine(), ArgLocs, *DAG.getContext()); // FIXME: allow for other calling conventions CCInfo.AnalyzeCallOperands(Outs, CCC_SPU); - + const unsigned NumArgRegs = ArgLocs.size(); @@ -1184,7 +1307,7 @@ SPUTargetLowering::LowerCall(SDValue Chain, SDValue Callee, // Figure out which arguments are going to go in registers, and which in // memory. - unsigned ArgOffset = SPUFrameInfo::minStackSize(); // Just below [LR] + unsigned ArgOffset = SPUFrameLowering::minStackSize(); // Just below [LR] unsigned ArgRegIdx = 0; // Keep track of registers passing arguments @@ -1219,7 +1342,8 @@ SPUTargetLowering::LowerCall(SDValue Chain, SDValue Callee, if (ArgRegIdx != NumArgRegs) { RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg)); } else { - MemOpChains.push_back(DAG.getStore(Chain, dl, Arg, PtrOff, NULL, 0, + MemOpChains.push_back(DAG.getStore(Chain, dl, Arg, PtrOff, + MachinePointerInfo(), false, false, 0)); ArgOffset += StackSlotSize; } @@ -1230,7 +1354,7 @@ SPUTargetLowering::LowerCall(SDValue Chain, SDValue Callee, // Accumulate how many bytes are to be pushed on the stack, including the // linkage area, and parameter passing area. According to the SPU ABI, // we minimally need space for [LR] and [SP]. 
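// --- Illustrative sketch (editorial, not part of the patch) -----------------
// Argument slots start just above the linkage area and advance by one
// 16-byte slot per stack-passed argument; the subtraction below recovers
// how many bytes the call pushes. Assuming a 32-byte minimal frame (the
// actual value comes from SPUFrameLowering::minStackSize()):

#include <cassert>

int main() {
  const unsigned minStackSize  = 32;   // assumed linkage area: [LR] + [SP]
  const unsigned stackSlotSize = 16;   // one quadword per slot
  unsigned argOffset = minStackSize;
  for (int i = 0; i < 3; ++i)          // three arguments spilled to the stack
    argOffset += stackSlotSize;
  assert(argOffset - minStackSize == 48);   // NumStackBytes
  return 0;
}
// ----------------------------------------------------------------------------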
- unsigned NumStackBytes = ArgOffset - SPUFrameInfo::minStackSize(); + unsigned NumStackBytes = ArgOffset - SPUFrameLowering::minStackSize(); // Insert a call sequence start Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumStackBytes, @@ -1311,7 +1435,7 @@ SPUTargetLowering::LowerCall(SDValue Chain, SDValue Callee, if (InFlag.getNode()) Ops.push_back(InFlag); // Returns a chain and a flag for retval copy to use. - Chain = DAG.getNode(CallOpc, dl, DAG.getVTList(MVT::Other, MVT::Flag), + Chain = DAG.getNode(CallOpc, dl, DAG.getVTList(MVT::Other, MVT::Glue), &Ops[0], Ops.size()); InFlag = Chain.getValue(1); @@ -1326,15 +1450,15 @@ SPUTargetLowering::LowerCall(SDValue Chain, SDValue Callee, // Now handle the return value(s) SmallVector RVLocs; - CCState CCRetInfo(CallConv, isVarArg, getTargetMachine(), - RVLocs, *DAG.getContext()); + CCState CCRetInfo(CallConv, isVarArg, DAG.getMachineFunction(), + getTargetMachine(), RVLocs, *DAG.getContext()); CCRetInfo.AnalyzeCallResult(Ins, CCC_SPU); // If the call has results, copy the values out of the ret val registers. for (unsigned i = 0; i != RVLocs.size(); ++i) { CCValAssign VA = RVLocs[i]; - + SDValue Val = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), VA.getLocVT(), InFlag); Chain = Val.getValue(1); @@ -1353,8 +1477,8 @@ SPUTargetLowering::LowerReturn(SDValue Chain, DebugLoc dl, SelectionDAG &DAG) const { SmallVector RVLocs; - CCState CCInfo(CallConv, isVarArg, getTargetMachine(), - RVLocs, *DAG.getContext()); + CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), + getTargetMachine(), RVLocs, *DAG.getContext()); CCInfo.AnalyzeReturn(Outs, RetCC_SPU); // If this is the first return lowered for this function, add the regs to the @@ -1567,9 +1691,8 @@ LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) { && "LowerBUILD_VECTOR: Unexpected floating point vector element."); // NOTE: pretend the constant is an integer. LLVM won't load FP constants SDValue T = DAG.getConstant(Value32, MVT::i32); - return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v4f32, + return DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, T,T,T,T)); - break; } case MVT::v2f64: { uint64_t f64val = uint64_t(SplatBits); @@ -1577,9 +1700,8 @@ LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) { && "LowerBUILD_VECTOR: 64-bit float vector size > 8 bytes."); // NOTE: pretend the constant is an integer. LLVM won't load FP constants SDValue T = DAG.getConstant(f64val, MVT::i64); - return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2f64, + return DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i64, T, T)); - break; } case MVT::v16i8: { // 8-bit constants have to be expanded to 16-bits @@ -1587,7 +1709,7 @@ LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) { SmallVector Ops; Ops.assign(8, DAG.getConstant(Value16, MVT::i16)); - return DAG.getNode(ISD::BIT_CONVERT, dl, VT, + return DAG.getNode(ISD::BITCAST, dl, VT, DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i16, &Ops[0], Ops.size())); } case MVT::v8i16: { @@ -1606,8 +1728,6 @@ LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) { return SPU::LowerV2I64Splat(VT, DAG, SplatBits, dl); } } - - return SDValue(); } /*! @@ -1621,7 +1741,7 @@ SPU::LowerV2I64Splat(EVT OpVT, SelectionDAG& DAG, uint64_t SplatVal, if (upper == lower) { // Magic constant that can be matched by IL, ILA, et. al. 
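// --- Illustrative sketch (editorial, not part of the patch) -----------------
// LowerBUILD_VECTOR above rebuilds an 8-bit splat as a v8i16 splat whose
// halfword repeats the byte, then bitcasts back to v16i8. The halfword
// pattern it computes:

#include <cassert>
#include <cstdint>

int main() {
  uint8_t  value8  = 0xAB;                           // the byte being splatted
  uint16_t value16 = uint16_t((value8 << 8) | value8);
  assert(value16 == 0xABAB);                         // both bytes of each i16
  return 0;
}
// ----------------------------------------------------------------------------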
SDValue Val = DAG.getTargetConstant(upper, MVT::i32); - return DAG.getNode(ISD::BIT_CONVERT, dl, OpVT, + return DAG.getNode(ISD::BITCAST, dl, OpVT, DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Val, Val, Val, Val)); } else { @@ -1637,9 +1757,11 @@ SPU::LowerV2I64Splat(EVT OpVT, SelectionDAG& DAG, uint64_t SplatVal, // Both upper and lower are special, lower to a constant pool load: if (lower_special && upper_special) { - SDValue SplatValCN = DAG.getConstant(SplatVal, MVT::i64); - return DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i64, - SplatValCN, SplatValCN); + SDValue UpperVal = DAG.getConstant(upper, MVT::i32); + SDValue LowerVal = DAG.getConstant(lower, MVT::i32); + SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, + UpperVal, LowerVal, UpperVal, LowerVal); + return DAG.getNode(ISD::BITCAST, dl, OpVT, BV); } SDValue LO32; @@ -1650,7 +1772,7 @@ SPU::LowerV2I64Splat(EVT OpVT, SelectionDAG& DAG, uint64_t SplatVal, // Create lower vector if not a special pattern if (!lower_special) { SDValue LO32C = DAG.getConstant(lower, MVT::i32); - LO32 = DAG.getNode(ISD::BIT_CONVERT, dl, OpVT, + LO32 = DAG.getNode(ISD::BITCAST, dl, OpVT, DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, LO32C, LO32C, LO32C, LO32C)); } @@ -1658,7 +1780,7 @@ SPU::LowerV2I64Splat(EVT OpVT, SelectionDAG& DAG, uint64_t SplatVal, // Create upper vector if not a special pattern if (!upper_special) { SDValue HI32C = DAG.getConstant(upper, MVT::i32); - HI32 = DAG.getNode(ISD::BIT_CONVERT, dl, OpVT, + HI32 = DAG.getNode(ISD::BITCAST, dl, OpVT, DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, HI32C, HI32C, HI32C, HI32C)); } @@ -1737,12 +1859,12 @@ static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) { unsigned PrevElt = 0; bool monotonic = true; bool rotate = true; - int rotamt; + int rotamt=0; EVT maskVT; // which of the c?d instructions to use if (EltVT == MVT::i8) { V2EltIdx0 = 16; - maskVT = MVT::v16i8; + maskVT = MVT::v16i8; } else if (EltVT == MVT::i16) { V2EltIdx0 = 8; maskVT = MVT::v8i16; @@ -1758,7 +1880,7 @@ static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) { for (unsigned i = 0; i != MaxElts; ++i) { if (SVN->getMaskElt(i) < 0) continue; - + unsigned SrcElt = SVN->getMaskElt(i); if (monotonic) { @@ -1781,13 +1903,13 @@ static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) { if (PrevElt > 0 && SrcElt < MaxElts) { if ((PrevElt == SrcElt - 1) || (PrevElt == MaxElts - 1 && SrcElt == 0)) { - rotamt = SrcElt-i; PrevElt = SrcElt; } else { rotate = false; } } else if (i == 0 || (PrevElt==0 && SrcElt==1)) { // First time or after a "wrap around" + rotamt = SrcElt-i; PrevElt = SrcElt; } else { // This isn't a rotation, takes elements from vector 2 @@ -1805,7 +1927,7 @@ static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) { SDValue Pointer = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, DAG.getRegister(SPU::R1, PtrVT), DAG.getConstant(V2EltOffset, MVT::i32)); - SDValue ShufMaskOp = DAG.getNode(SPUISD::SHUFFLE_MASK, dl, + SDValue ShufMaskOp = DAG.getNode(SPUISD::SHUFFLE_MASK, dl, maskVT, Pointer); // Use shuffle mask in SHUFB synthetic instruction: @@ -1879,8 +2001,6 @@ static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) { return DAG.getNode(SPUISD::PREFSLOT2VEC, dl, Op.getValueType(), Op0, Op0); } } - - return SDValue(); } static SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) { @@ -1914,8 +2034,7 @@ static SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) { int elt_byte = EltNo * VT.getSizeInBits() / 8; switch 
(VT.getSimpleVT().SimpleTy) { - default: - assert(false && "Invalid value type!"); + default: llvm_unreachable("Invalid value type!"); case MVT::i8: { prefslot_begin = prefslot_end = 3; break; @@ -1999,7 +2118,7 @@ static SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) { DAG.getConstant(scaleShift, MVT::i32)); } - vecShift = DAG.getNode(SPUISD::SHLQUAD_L_BYTES, dl, VecVT, N, Elt); + vecShift = DAG.getNode(SPUISD::SHL_BYTES, dl, VecVT, N, Elt); // Replicate the bytes starting at byte 0 across the entire vector (for // consistency with the notion of a unified register set) @@ -2069,7 +2188,7 @@ static SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) { DAG.getRegister(SPU::R1, PtrVT), DAG.getConstant(Offset, PtrVT)); // widen the mask when dealing with half vectors - EVT maskVT = EVT::getVectorVT(*(DAG.getContext()), VT.getVectorElementType(), + EVT maskVT = EVT::getVectorVT(*(DAG.getContext()), VT.getVectorElementType(), 128/ VT.getVectorElementType().getSizeInBits()); SDValue ShufMask = DAG.getNode(SPUISD::SHUFFLE_MASK, dl, maskVT, Pointer); @@ -2077,7 +2196,7 @@ static SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) { DAG.getNode(SPUISD::SHUFB, dl, VT, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, ValOp), VecOp, - DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v4i32, ShufMask)); + DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, ShufMask)); return result; } @@ -2087,14 +2206,12 @@ static SDValue LowerI8Math(SDValue Op, SelectionDAG &DAG, unsigned Opc, { SDValue N0 = Op.getOperand(0); // Everything has at least one operand DebugLoc dl = Op.getDebugLoc(); - EVT ShiftVT = TLI.getShiftAmountTy(); + EVT ShiftVT = TLI.getShiftAmountTy(N0.getValueType()); assert(Op.getValueType() == MVT::i8); switch (Opc) { default: llvm_unreachable("Unhandled i8 math operator"); - /*NOTREACHED*/ - break; case ISD::ADD: { // 8-bit addition: Promote the arguments up to 16-bits and truncate // the result: @@ -2179,11 +2296,8 @@ static SDValue LowerI8Math(SDValue Op, SelectionDAG &DAG, unsigned Opc, N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i16, N1); return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, DAG.getNode(Opc, dl, MVT::i16, N0, N1)); - break; } } - - return SDValue(); } //! 
Lower byte immediate operations for v16i8 vectors: @@ -2197,12 +2311,12 @@ LowerByteImmed(SDValue Op, SelectionDAG &DAG) { ConstVec = Op.getOperand(0); Arg = Op.getOperand(1); if (ConstVec.getNode()->getOpcode() != ISD::BUILD_VECTOR) { - if (ConstVec.getNode()->getOpcode() == ISD::BIT_CONVERT) { + if (ConstVec.getNode()->getOpcode() == ISD::BITCAST) { ConstVec = ConstVec.getOperand(0); } else { ConstVec = Op.getOperand(1); Arg = Op.getOperand(0); - if (ConstVec.getNode()->getOpcode() == ISD::BIT_CONVERT) { + if (ConstVec.getNode()->getOpcode() == ISD::BITCAST) { ConstVec = ConstVec.getOperand(0); } } @@ -2243,13 +2357,12 @@ LowerByteImmed(SDValue Op, SelectionDAG &DAG) { */ static SDValue LowerCTPOP(SDValue Op, SelectionDAG &DAG) { EVT VT = Op.getValueType(); - EVT vecVT = EVT::getVectorVT(*DAG.getContext(), + EVT vecVT = EVT::getVectorVT(*DAG.getContext(), VT, (128 / VT.getSizeInBits())); DebugLoc dl = Op.getDebugLoc(); switch (VT.getSimpleVT().SimpleTy) { - default: - assert(false && "Invalid value type!"); + default: llvm_unreachable("Invalid value type!"); case MVT::i8: { SDValue N = Op.getOperand(0); SDValue Elt0 = DAG.getConstant(0, MVT::i32); @@ -2419,7 +2532,7 @@ static SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG, // Take advantage of the fact that (truncate (sra arg, 32)) is efficiently // selected to a NOP: - SDValue i64lhs = DAG.getNode(ISD::BIT_CONVERT, dl, IntVT, lhs); + SDValue i64lhs = DAG.getNode(ISD::BITCAST, dl, IntVT, lhs); SDValue lhsHi32 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, DAG.getNode(ISD::SRL, dl, IntVT, @@ -2453,7 +2566,7 @@ static SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG, ISD::SETGT)); } - SDValue i64rhs = DAG.getNode(ISD::BIT_CONVERT, dl, IntVT, rhs); + SDValue i64rhs = DAG.getNode(ISD::BITCAST, dl, IntVT, rhs); SDValue rhsHi32 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, DAG.getNode(ISD::SRL, dl, IntVT, @@ -2567,7 +2680,7 @@ static SDValue LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) // Type to truncate to EVT VT = Op.getValueType(); MVT simpleVT = VT.getSimpleVT(); - EVT VecVT = EVT::getVectorVT(*DAG.getContext(), + EVT VecVT = EVT::getVectorVT(*DAG.getContext(), VT, (128 / VT.getSizeInBits())); DebugLoc dl = Op.getDebugLoc(); @@ -2575,7 +2688,7 @@ static SDValue LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) SDValue Op0 = Op.getOperand(0); EVT Op0VT = Op0.getValueType(); - if (Op0VT.getSimpleVT() == MVT::i128 && simpleVT == MVT::i64) { + if (Op0VT == MVT::i128 && simpleVT == MVT::i64) { // Create shuffle mask, least significant doubleword of quadword unsigned maskHigh = 0x08090a0b; unsigned maskLow = 0x0c0d0e0f; @@ -2616,10 +2729,17 @@ static SDValue LowerSIGN_EXTEND(SDValue Op, SelectionDAG &DAG) SDValue Op0 = Op.getOperand(0); MVT Op0VT = Op0.getValueType().getSimpleVT(); + // extend i8 & i16 via i32 + if (Op0VT == MVT::i8 || Op0VT == MVT::i16) { + Op0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i32, Op0); + Op0VT = MVT::i32; + } + // The type to extend to needs to be a i128 and // the type to extend from needs to be i64 or i32. assert((OpVT == MVT::i128 && (Op0VT == MVT::i64 || Op0VT == MVT::i32)) && "LowerSIGN_EXTEND: input and/or output operand have wrong size"); + (void)OpVT; // Create shuffle mask unsigned mask1 = 0x10101010; // byte 0 - 3 and 4 - 7 @@ -2640,12 +2760,17 @@ static SDValue LowerSIGN_EXTEND(SDValue Op, SelectionDAG &DAG) DAG.getNode(SPUISD::PREFSLOT2VEC, dl, mvt, Op0, Op0), DAG.getConstant(31, MVT::i32)); + // reinterpret as a i128 (SHUFB requires it). This gets lowered away. 
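// --- Illustrative sketch (editorial, not part of the patch) -----------------
// LowerTRUNCATE above selects the least-significant doubleword of the
// big-endian i128 with the shuffle mask bytes 0x08090a0b / 0x0c0d0e0f,
// i.e. source bytes 8..15. The same selection done directly:

#include <cassert>

int main() {
  unsigned char quad[16];
  for (int i = 0; i < 16; ++i) quad[i] = i;          // big-endian i128 bytes
  const unsigned char mask[8] =                      // 0x08090a0b, 0x0c0d0e0f
      { 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f };
  unsigned char low64[8];
  for (int i = 0; i < 8; ++i) low64[i] = quad[mask[i]];
  assert(low64[0] == 8 && low64[7] == 15);           // the low 64 bits
  return 0;
}
// ----------------------------------------------------------------------------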
+ SDValue extended = SDValue(DAG.getMachineNode(TargetOpcode::COPY_TO_REGCLASS, + dl, Op0VT, Op0, + DAG.getTargetConstant( + SPU::GPRCRegClass.getID(), + MVT::i32)), 0); // Shuffle bytes - Copy the sign bits into the upper 64 bits // and the input value into the lower 64 bits. SDValue extShuffle = DAG.getNode(SPUISD::SHUFB, dl, mvt, - DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i128, Op0), sraVal, shufMask); - - return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i128, extShuffle); + extended, sraVal, shufMask); + return DAG.getNode(ISD::BITCAST, dl, MVT::i128, extShuffle); } //! Custom (target-specific) lowering entry point @@ -2903,8 +3028,8 @@ SPUTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const } break; } - case SPUISD::SHLQUAD_L_BITS: - case SPUISD::SHLQUAD_L_BYTES: + case SPUISD::SHL_BITS: + case SPUISD::SHL_BYTES: case SPUISD::ROTBYTES_LEFT: { SDValue Op1 = N->getOperand(1); @@ -2982,6 +3107,38 @@ SPUTargetLowering::getConstraintType(const std::string &ConstraintLetter) const return TargetLowering::getConstraintType(ConstraintLetter); } +/// Examine constraint type and operand type and determine a weight value. +/// This object must already have been set up with the operand type +/// and the current alternative constraint selected. +TargetLowering::ConstraintWeight +SPUTargetLowering::getSingleConstraintMatchWeight( + AsmOperandInfo &info, const char *constraint) const { + ConstraintWeight weight = CW_Invalid; + Value *CallOperandVal = info.CallOperandVal; + // If we don't have a value, we can't do a match, + // but allow it at the lowest weight. + if (CallOperandVal == NULL) + return CW_Default; + // Look at the constraint type. + switch (*constraint) { + default: + weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint); + break; + //FIXME: Seems like the supported constraint letters were just copied + // from PPC, as the following doesn't correspond to the GCC docs. + // I'm leaving it so until someone adds the corresponding lowering support. + case 'b': + case 'r': + case 'f': + case 'd': + case 'v': + case 'y': + weight = CW_Register; + break; + } + return weight; +} + std::pair SPUTargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint, EVT VT) const @@ -3062,17 +3219,17 @@ SPUTargetLowering::ComputeNumSignBitsForTargetNode(SDValue Op, // LowerAsmOperandForConstraint void SPUTargetLowering::LowerAsmOperandForConstraint(SDValue Op, - char ConstraintLetter, + std::string &Constraint, std::vector &Ops, SelectionDAG &DAG) const { // Default, for the time being, to the base class handler - TargetLowering::LowerAsmOperandForConstraint(Op, ConstraintLetter, Ops, DAG); + TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG); } /// isLegalAddressImmediate - Return true if the integer value can be used /// as the offset of the target addressing mode. bool SPUTargetLowering::isLegalAddressImmediate(int64_t V, - const Type *Ty) const { + Type *Ty) const { // SPU's addresses are 256K: return (V > -(1 << 18) && V < (1 << 18) - 1); } @@ -3086,3 +3243,28 @@ SPUTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const { // The SPU target isn't yet aware of offsets. return false; } + +// can we compare to Imm without writing it into a register? +bool SPUTargetLowering::isLegalICmpImmediate(int64_t Imm) const { + //ceqi, cgti, etc. all take s10 operand + return isInt<10>(Imm); +} + +bool +SPUTargetLowering::isLegalAddressingMode(const AddrMode &AM, + Type * ) const{ + + // A-form: 18bit absolute address. 
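// --- Illustrative sketch (editorial, not part of the patch) -----------------
// isLegalICmpImmediate above accepts what ceqi/cgti can encode: a signed
// 10-bit immediate. isInt<10>(Imm) spelled out as an explicit range check:

#include <cassert>
#include <cstdint>

static bool isSignedNBit(int64_t v, unsigned n) {
  int64_t lo = -(int64_t(1) << (n - 1));        // -512 for n == 10
  int64_t hi =  (int64_t(1) << (n - 1)) - 1;    //  511 for n == 10
  return v >= lo && v <= hi;
}

int main() {
  assert(isSignedNBit(-512, 10) && isSignedNBit(511, 10));
  assert(!isSignedNBit(512, 10) && !isSignedNBit(-513, 10));
  return 0;
}
// ----------------------------------------------------------------------------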
+  if (AM.BaseGV && !AM.HasBaseReg && AM.Scale == 0 && AM.BaseOffs == 0)
+    return true;
+
+  // D-form: reg + 14-bit offset
+  if (AM.BaseGV == 0 && AM.HasBaseReg && AM.Scale == 0 && isInt<14>(AM.BaseOffs))
+    return true;
+
+  // X-form: reg+reg
+  if (AM.BaseGV == 0 && AM.HasBaseReg && AM.Scale == 1 && AM.BaseOffs == 0)
+    return true;
+
+  return false;
+}
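// --- Illustrative sketch (editorial, not part of the patch) -----------------
// The three addressing modes accepted above, restated as a standalone
// predicate over the same (BaseGV, HasBaseReg, Scale, BaseOffs) tuple:

#include <cassert>
#include <cstdint>

struct AddrModeModel {
  bool hasGV;        // stands in for AM.BaseGV != 0
  bool hasBaseReg;   // AM.HasBaseReg
  int scale;         // AM.Scale
  int64_t offs;      // AM.BaseOffs
};

static bool isInt14(int64_t v) { return v >= -8192 && v <= 8191; }

static bool legalSPUAddress(const AddrModeModel &am) {
  if (am.hasGV && !am.hasBaseReg && am.scale == 0 && am.offs == 0)
    return true;                                  // A-form: 18-bit absolute
  if (!am.hasGV && am.hasBaseReg && am.scale == 0 && isInt14(am.offs))
    return true;                                  // D-form: reg + d14
  if (!am.hasGV && am.hasBaseReg && am.scale == 1 && am.offs == 0)
    return true;                                  // X-form: reg + reg
  return false;
}

int main() {
  assert(legalSPUAddress({false, true, 0, 64}));     // D-form
  assert(!legalSPUAddress({false, true, 0, 9000}));  // offset exceeds 14 bits
  return 0;
}
// ----------------------------------------------------------------------------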