From: Kalle Raiskila
Date: Fri, 12 Nov 2010 10:14:03 +0000 (+0000)
Subject: Fix memory access lowering on SPU, adding support for the case where
 alignment < value size
X-Git-Url: http://demsky.eecs.uci.edu/git/?a=commitdiff_plain;h=7ea1ab5f41299563eb648aed159cfaff09e774d8;p=oota-llvm.git

Fix memory access lowering on SPU, adding support for the case where
alignment < value size.
---
diff --git a/lib/Target/CellSPU/SPUISelLowering.cpp b/lib/Target/CellSPU/SPUISelLowering.cpp
--- a/lib/Target/CellSPU/SPUISelLowering.cpp
+++ b/lib/Target/CellSPU/SPUISelLowering.cpp
     node_names;
 
-  //! EVT mapping to useful data for Cell SPU
-  struct valtype_map_s {
-    EVT   valtype;
-    int   prefslot_byte;
-  };
-
-  const valtype_map_s valtype_map[] = {
-    { MVT::i1,   3 },
-    { MVT::i8,   3 },
-    { MVT::i16,  2 },
-    { MVT::i32,  0 },
-    { MVT::f32,  0 },
-    { MVT::i64,  0 },
-    { MVT::f64,  0 },
-    { MVT::i128, 0 }
-  };
-
-  const size_t n_valtype_map = sizeof(valtype_map) / sizeof(valtype_map[0]);
-
-  const valtype_map_s *getValueTypeMapEntry(EVT VT) {
-    const valtype_map_s *retval = 0;
-
-    for (size_t i = 0; i < n_valtype_map; ++i) {
-      if (valtype_map[i].valtype == VT) {
-        retval = valtype_map + i;
-        break;
-      }
-    }
-
-#ifndef NDEBUG
-    if (retval == 0) {
-      report_fatal_error("getValueTypeMapEntry returns NULL for " +
-                         Twine(VT.getEVTString()));
-    }
-#endif
+  // Byte offset of the preferred slot (counted from the MSB)
+  int prefslotOffset(EVT VT) {
+    int retval=0;
+    if (VT==MVT::i1) retval=3;
+    if (VT==MVT::i8) retval=3;
+    if (VT==MVT::i16) retval=2;
 
     return retval;
   }
@@ -440,9 +411,9 @@ SPUTargetLowering::SPUTargetLowering(SPUTargetMachine &TM)
     setOperationAction(ISD::AND, VT, Legal);
     setOperationAction(ISD::OR, VT, Legal);
     setOperationAction(ISD::XOR, VT, Legal);
-    setOperationAction(ISD::LOAD, VT, Legal);
+    setOperationAction(ISD::LOAD, VT, Custom);
     setOperationAction(ISD::SELECT, VT, Legal);
-    setOperationAction(ISD::STORE, VT, Legal);
+    setOperationAction(ISD::STORE, VT, Custom);
 
     // These operations need to be expanded:
     setOperationAction(ISD::SDIV, VT, Expand);
@@ -503,8 +474,8 @@ SPUTargetLowering::getTargetNodeName(unsigned Opcode) const
     node_names[(unsigned) SPUISD::CNTB] = "SPUISD::CNTB";
     node_names[(unsigned) SPUISD::PREFSLOT2VEC] = "SPUISD::PREFSLOT2VEC";
     node_names[(unsigned) SPUISD::VEC2PREFSLOT] = "SPUISD::VEC2PREFSLOT";
-    node_names[(unsigned) SPUISD::SHLQUAD_L_BITS] = "SPUISD::SHLQUAD_L_BITS";
-    node_names[(unsigned) SPUISD::SHLQUAD_L_BYTES] = "SPUISD::SHLQUAD_L_BYTES";
+    node_names[(unsigned) SPUISD::SHL_BITS] = "SPUISD::SHL_BITS";
+    node_names[(unsigned) SPUISD::SHL_BYTES] = "SPUISD::SHL_BYTES";
     node_names[(unsigned) SPUISD::VEC_ROTL] = "SPUISD::VEC_ROTL";
     node_names[(unsigned) SPUISD::VEC_ROTR] = "SPUISD::VEC_ROTR";
     node_names[(unsigned) SPUISD::ROTBYTES_LEFT] = "SPUISD::ROTBYTES_LEFT";
@@ -573,11 +544,26 @@ LowerLOAD(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) {
   EVT OutVT = Op.getValueType();
   ISD::LoadExtType ExtType = LN->getExtensionType();
   unsigned alignment = LN->getAlignment();
-  const valtype_map_s *vtm = getValueTypeMapEntry(InVT);
+  int pso = prefslotOffset(InVT);
   DebugLoc dl = Op.getDebugLoc();
+  EVT vecVT = InVT.isVector()? InVT: EVT::getVectorVT(*DAG.getContext(), InVT,
+                                       (128 / InVT.getSizeInBits()));
+
+  // two sanity checks
+  assert( LN->getAddressingMode() == ISD::UNINDEXED
+          && "we should get only UNINDEXED addresses");
+  // clean aligned loads can be selected as-is
+  if (InVT.getSizeInBits() == 128 && alignment == 16)
+    return SDValue();
+
+  // Get PointerInfos to the memory chunk(s) that contain the data to load
+  uint64_t mpi_offset = LN->getPointerInfo().Offset;
+  mpi_offset -= mpi_offset%16;
+  MachinePointerInfo lowMemPtr( LN->getPointerInfo().V, mpi_offset);
+  MachinePointerInfo highMemPtr( LN->getPointerInfo().V, mpi_offset+16);
+
+
-  switch (LN->getAddressingMode()) {
-  case ISD::UNINDEXED: {
   SDValue result;
   SDValue basePtr = LN->getBasePtr();
   SDValue rotate;
@@ -591,7 +577,7 @@ LowerLOAD(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) {
         && (CN = dyn_cast<ConstantSDNode>(basePtr.getOperand(1))) != 0) {
       // Known offset into basePtr
       int64_t offset = CN->getSExtValue();
-      int64_t rotamt = int64_t((offset & 0xf) - vtm->prefslot_byte);
+      int64_t rotamt = int64_t((offset & 0xf) - pso);
 
       if (rotamt < 0)
         rotamt += 16;
@@ -611,14 +597,14 @@ LowerLOAD(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) {
                && basePtr.getOperand(1).getOpcode() == SPUISD::Lo)) {
       // Plain aligned a-form address: rotate into preferred slot
       // Same for (SPUindirect (SPUhi ...), (SPUlo ...))
-      int64_t rotamt = -vtm->prefslot_byte;
+      int64_t rotamt = -pso;
       if (rotamt < 0)
         rotamt += 16;
       rotate = DAG.getConstant(rotamt, MVT::i16);
     } else {
       // Offset the rotate amount by the basePtr and the preferred slot
       // byte offset
-      int64_t rotamt = -vtm->prefslot_byte;
+      int64_t rotamt = -pso;
       if (rotamt < 0)
         rotamt += 16;
       rotate = DAG.getNode(ISD::ADD, dl, PtrVT,
@@ -658,20 +644,23 @@ LowerLOAD(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) {
     // byte offset
     rotate = DAG.getNode(ISD::ADD, dl, PtrVT,
                          basePtr,
-                         DAG.getConstant(-vtm->prefslot_byte, PtrVT));
+                         DAG.getConstant(-pso, PtrVT));
   }
 
-  // Re-emit as a v16i8 vector load
-  result = DAG.getLoad(MVT::v16i8, dl, the_chain, basePtr,
-                       LN->getPointerInfo(),
+  // Do the load as an i128 to allow possible shifting
+  SDValue low = DAG.getLoad(MVT::i128, dl, the_chain, basePtr,
+                       lowMemPtr,
                        LN->isVolatile(), LN->isNonTemporal(), 16);
-
+
+  // When the size is not greater than alignment we get all data with just
+  // one load
+  if (alignment >= InVT.getSizeInBits()/8) {
   // Update the chain
-  the_chain = result.getValue(1);
+  the_chain = low.getValue(1);
 
   // Rotate into the preferred slot:
-  result = DAG.getNode(SPUISD::ROTBYTES_LEFT, dl, MVT::v16i8,
-                       result.getValue(0), rotate);
+  result = DAG.getNode(SPUISD::ROTBYTES_LEFT, dl, MVT::i128,
+                       low.getValue(0), rotate);
 
   // Convert the loaded v16i8 vector to the appropriate vector type
   // specified by the operand:
@@ -679,7 +668,56 @@ LowerLOAD(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) {
                                InVT, (128 / InVT.getSizeInBits()));
   result = DAG.getNode(SPUISD::VEC2PREFSLOT, dl, InVT,
                        DAG.getNode(ISD::BIT_CONVERT, dl, vecVT, result));
+  }
+  // When alignment is less than the size, we might need (known only at
+  // run-time) two loads
+  // TODO: if the memory address is composed only from constants, we have
+  // extra knowledge, and might avoid the second load
+  else {
+    // storage position offset from lower 16 byte aligned memory chunk
+    SDValue offset = DAG.getNode( ISD::AND, dl, MVT::i32,
+                                  basePtr, DAG.getConstant( 0xf, MVT::i32 ) );
+    // 16 - offset
+    SDValue offset_compl = DAG.getNode( ISD::SUB, dl, MVT::i32,
+                                        DAG.getConstant( 16, MVT::i32),
+                                        offset );
+    // get a register full of ones. (this implementation is a workaround: LLVM
+    // cannot handle 128 bit signed int constants)
+    SDValue ones = DAG.getConstant( -1, MVT::v4i32 );
+    ones = DAG.getNode( ISD::BIT_CONVERT, dl, MVT::i128, ones);
+
+    SDValue high = DAG.getLoad(MVT::i128, dl, the_chain,
+                               DAG.getNode(ISD::ADD, dl, PtrVT,
+                                           basePtr,
+                                           DAG.getConstant(16, PtrVT)),
+                               highMemPtr,
+                               LN->isVolatile(), LN->isNonTemporal(), 16);
+
+    the_chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, low.getValue(1),
+                            high.getValue(1));
+
+    // Shift the (possible) high part right to compensate for the misalignment.
+    // If there is no high part (i.e. value is i64 and offset is 4), this
+    // will zero out the high value.
+    high = DAG.getNode( SPUISD::SRL_BYTES, dl, MVT::i128, high,
+                        DAG.getNode( ISD::SUB, dl, MVT::i32,
+                                     DAG.getConstant( 16, MVT::i32),
+                                     offset
+                                   ));
+
+    // Shift the low similarly
+    // TODO: add SPUISD::SHL_BYTES
+    low = DAG.getNode( SPUISD::SHL_BYTES, dl, MVT::i128, low, offset );
+
+    // Merge the two parts
+    result = DAG.getNode( ISD::BIT_CONVERT, dl, vecVT,
+                          DAG.getNode(ISD::OR, dl, MVT::i128, low, high));
+
+    if (!InVT.isVector()) {
+      result = DAG.getNode( SPUISD::VEC2PREFSLOT, dl, InVT, result );
+    }
+  }
   // Handle extending loads by extending the scalar result:
   if (ExtType == ISD::SEXTLOAD) {
     result = DAG.getNode(ISD::SIGN_EXTEND, dl, OutVT, result);
@@ -703,21 +741,6 @@ LowerLOAD(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) {
   result = DAG.getNode(SPUISD::LDRESULT, dl, retvts,
                        retops, sizeof(retops) / sizeof(retops[0]));
   return result;
-  }
-  case ISD::PRE_INC:
-  case ISD::PRE_DEC:
-  case ISD::POST_INC:
-  case ISD::POST_DEC:
-  case ISD::LAST_INDEXED_MODE:
-    {
-      report_fatal_error("LowerLOAD: Got a LoadSDNode with an addr mode other "
-                         "than UNINDEXED\n" +
-                         Twine((unsigned)LN->getAddressingMode()));
-      /*NOTREACHED*/
-    }
-  }
-
-  return SDValue();
 }
 
 /// Custom lower stores for CellSPU
@@ -735,12 +758,24 @@ LowerSTORE(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) {
   EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
   DebugLoc dl = Op.getDebugLoc();
   unsigned alignment = SN->getAlignment();
+  SDValue result;
+  EVT vecVT = StVT.isVector()? StVT: EVT::getVectorVT(*DAG.getContext(), StVT,
+                                        (128 / StVT.getSizeInBits()));
+  // Get PointerInfos to the memory chunk(s) that contain the data to store
+  uint64_t mpi_offset = SN->getPointerInfo().Offset;
+  mpi_offset -= mpi_offset%16;
+  MachinePointerInfo lowMemPtr( SN->getPointerInfo().V, mpi_offset);
+  MachinePointerInfo highMemPtr( SN->getPointerInfo().V, mpi_offset+16);
+
+
+  // two sanity checks
+  assert( SN->getAddressingMode() == ISD::UNINDEXED
+          && "we should get only UNINDEXED addresses");
+  // clean aligned stores can be selected as-is
+  if (StVT.getSizeInBits() == 128 && alignment == 16)
+    return SDValue();
+
-  switch (SN->getAddressingMode()) {
-  case ISD::UNINDEXED: {
-    // The vector type we really want to load from the 16-byte chunk.
-    EVT vecVT = EVT::getVectorVT(*DAG.getContext(),
-                                 VT, (128 / VT.getSizeInBits()));
 
   SDValue alignLoadVec;
   SDValue basePtr = SN->getBasePtr();
@@ -811,17 +846,17 @@ LowerSTORE(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) {
                           DAG.getConstant(0, PtrVT));
   }
 
-  // Load the memory to which to store.
-  alignLoadVec = DAG.getLoad(vecVT, dl, the_chain, basePtr,
-                             SN->getPointerInfo(),
-                             SN->isVolatile(), SN->isNonTemporal(), 16);
+  // Load the lower part of the memory to which to store.
+  SDValue low = DAG.getLoad(vecVT, dl, the_chain, basePtr,
+                            lowMemPtr, SN->isVolatile(), SN->isNonTemporal(), 16);
 
+  // if we don't need to store over the 16 byte boundary, one store suffices
+  if (alignment >= StVT.getSizeInBits()/8) {
   // Update the chain
-  the_chain = alignLoadVec.getValue(1);
+  the_chain = low.getValue(1);
 
-  LoadSDNode *LN = cast<LoadSDNode>(alignLoadVec);
+  LoadSDNode *LN = cast<LoadSDNode>(low);
   SDValue theValue = SN->getValue();
-  SDValue result;
 
   if (StVT != VT
       && (theValue.getOpcode() == ISD::AssertZext
@@ -849,14 +884,14 @@ LowerSTORE(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) {
                               theValue);
 
   result = DAG.getNode(SPUISD::SHUFB, dl, vecVT,
-                       vectorizeOp, alignLoadVec,
+                       vectorizeOp, low,
                        DAG.getNode(ISD::BIT_CONVERT, dl,
                                    MVT::v4i32, insertEltOp));
 
   result = DAG.getStore(the_chain, dl, result, basePtr,
-                        LN->getPointerInfo(),
+                        lowMemPtr,
                         LN->isVolatile(), LN->isNonTemporal(),
-                        LN->getAlignment());
+                        16);
 
 #if 0 && !defined(NDEBUG)
   if (DebugFlag && isCurrentDebugType(DEBUG_TYPE)) {
@@ -869,24 +904,106 @@ LowerSTORE(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) {
     DAG.setRoot(currentRoot);
   }
 #endif
-
-  return result;
-  /*UNREACHED*/
-  }
-  case ISD::PRE_INC:
-  case ISD::PRE_DEC:
-  case ISD::POST_INC:
-  case ISD::POST_DEC:
-  case ISD::LAST_INDEXED_MODE:
-    {
-      report_fatal_error("LowerLOAD: Got a LoadSDNode with an addr mode other "
-                         "than UNINDEXED\n" +
-                         Twine((unsigned)SN->getAddressingMode()));
-      /*NOTREACHED*/
-    }
   }
+  // do the store when it might cross the 16 byte memory access boundary.
+  else {
+    // TODO issue a warning if SN->isVolatile()== true? This is likely not
+    // what the user wanted.
+
+    // address offset from nearest lower 16 byte aligned address
+    SDValue offset = DAG.getNode(ISD::AND, dl, MVT::i32,
+                                 SN->getBasePtr(),
+                                 DAG.getConstant(0xf, MVT::i32));
+    // 16 - offset
+    SDValue offset_compl = DAG.getNode(ISD::SUB, dl, MVT::i32,
+                                       DAG.getConstant( 16, MVT::i32),
+                                       offset);
+    SDValue hi_shift = DAG.getNode(ISD::SUB, dl, MVT::i32,
+                                   DAG.getConstant( VT.getSizeInBits()/8,
+                                                    MVT::i32),
+                                   offset_compl);
+    // 16 - sizeof(Value)
+    SDValue surplus = DAG.getNode(ISD::SUB, dl, MVT::i32,
+                                  DAG.getConstant( 16, MVT::i32),
+                                  DAG.getConstant( VT.getSizeInBits()/8,
+                                                   MVT::i32));
+    // get a register full of ones
+    SDValue ones = DAG.getConstant(-1, MVT::v4i32);
+    ones = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i128, ones);
+
+    // Create the 128 bit masks that have ones where the data to store is
+    // located.
+    SDValue lowmask, himask;
+    // if the value to store doesn't fill up an entire 128 bits, zero
+    // out the last bits of the mask so that only the value we want to store
+    // is masked.
+    // this is e.g. the case of a store i32, align 2
+    if (!VT.isVector()){
+      Value = DAG.getNode(SPUISD::PREFSLOT2VEC, dl, vecVT, Value);
+      lowmask = DAG.getNode(SPUISD::SRL_BYTES, dl, MVT::i128, ones, surplus);
+      lowmask = DAG.getNode(SPUISD::SHL_BYTES, dl, MVT::i128, lowmask,
+                            surplus);
+      Value = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i128, Value);
+      Value = DAG.getNode(ISD::AND, dl, MVT::i128, Value, lowmask);
+
+    }
+    else {
+      lowmask = ones;
+      Value = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i128, Value);
+    }
+    // this will be zero if no data goes to the high quad
+    himask = DAG.getNode(SPUISD::SHL_BYTES, dl, MVT::i128, lowmask,
+                         offset_compl);
+    lowmask = DAG.getNode(SPUISD::SRL_BYTES, dl, MVT::i128, lowmask,
+                          offset);
+
+    // Load in the old data and zero out the parts that will be overwritten with
+    // the new data to store.
+ SDValue hi = DAG.getLoad(MVT::i128, dl, the_chain, + DAG.getNode(ISD::ADD, dl, PtrVT, basePtr, + DAG.getConstant( 16, PtrVT)), + highMemPtr, + SN->isVolatile(), SN->isNonTemporal(), 16); + the_chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, low.getValue(1), + hi.getValue(1)); + + low = DAG.getNode(ISD::AND, dl, MVT::i128, + DAG.getNode( ISD::BIT_CONVERT, dl, MVT::i128, low), + DAG.getNode( ISD::XOR, dl, MVT::i128, lowmask, ones)); + hi = DAG.getNode(ISD::AND, dl, MVT::i128, + DAG.getNode( ISD::BIT_CONVERT, dl, MVT::i128, hi), + DAG.getNode( ISD::XOR, dl, MVT::i128, himask, ones)); + + // Shift the Value to store into place. rlow contains the parts that go to + // the lower memory chunk, rhi has the parts that go to the upper one. + SDValue rlow = DAG.getNode(SPUISD::SRL_BYTES, dl, MVT::i128, Value, offset); + rlow = DAG.getNode(ISD::AND, dl, MVT::i128, rlow, lowmask); + SDValue rhi = DAG.getNode(SPUISD::SHL_BYTES, dl, MVT::i128, Value, + offset_compl); + + // Merge the old data and the new data and store the results + // Need to convert vectors here to integer as 'OR'ing floats assert + rlow = DAG.getNode(ISD::OR, dl, MVT::i128, + DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i128, low), + DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i128, rlow)); + rhi = DAG.getNode(ISD::OR, dl, MVT::i128, + DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i128, hi), + DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i128, rhi)); + + low = DAG.getStore(the_chain, dl, rlow, basePtr, + lowMemPtr, + SN->isVolatile(), SN->isNonTemporal(), 16); + hi = DAG.getStore(the_chain, dl, rhi, + DAG.getNode(ISD::ADD, dl, PtrVT, basePtr, + DAG.getConstant( 16, PtrVT)), + highMemPtr, + SN->isVolatile(), SN->isNonTemporal(), 16); + result = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, low.getValue(0), + hi.getValue(0)); + } + + return result; - return SDValue(); } //! Generate the address of a constant pool entry. @@ -2002,7 +2119,7 @@ static SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) { DAG.getConstant(scaleShift, MVT::i32)); } - vecShift = DAG.getNode(SPUISD::SHLQUAD_L_BYTES, dl, VecVT, N, Elt); + vecShift = DAG.getNode(SPUISD::SHL_BYTES, dl, VecVT, N, Elt); // Replicate the bytes starting at byte 0 across the entire vector (for // consistency with the notion of a unified register set) @@ -2911,8 +3028,8 @@ SPUTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const } break; } - case SPUISD::SHLQUAD_L_BITS: - case SPUISD::SHLQUAD_L_BYTES: + case SPUISD::SHL_BITS: + case SPUISD::SHL_BYTES: case SPUISD::ROTBYTES_LEFT: { SDValue Op1 = N->getOperand(1); diff --git a/lib/Target/CellSPU/SPUISelLowering.h b/lib/Target/CellSPU/SPUISelLowering.h index 41d08267586..82f10270db3 100644 --- a/lib/Target/CellSPU/SPUISelLowering.h +++ b/lib/Target/CellSPU/SPUISelLowering.h @@ -41,8 +41,9 @@ namespace llvm { CNTB, ///< Count leading ones in bytes PREFSLOT2VEC, ///< Promote scalar->vector VEC2PREFSLOT, ///< Extract element 0 - SHLQUAD_L_BITS, ///< Rotate quad left, by bits - SHLQUAD_L_BYTES, ///< Rotate quad left, by bytes + SHL_BITS, ///< Shift quad left, by bits + SHL_BYTES, ///< Shift quad left, by bytes + SRL_BYTES, ///< Shift quad right, by bytes. Insert zeros. 
VEC_ROTL, ///< Vector rotate left VEC_ROTR, ///< Vector rotate right ROTBYTES_LEFT, ///< Rotate bytes (loads -> ROTQBYI) diff --git a/lib/Target/CellSPU/SPUInstrInfo.td b/lib/Target/CellSPU/SPUInstrInfo.td index ff327eb09e2..50f688a5797 100644 --- a/lib/Target/CellSPU/SPUInstrInfo.td +++ b/lib/Target/CellSPU/SPUInstrInfo.td @@ -2369,10 +2369,13 @@ class ROTQBYInst pattern>: RRForm<0b00111011100, OOL, IOL, "rotqby\t$rT, $rA, $rB", RotateShift, pattern>; -class ROTQBYVecInst: - ROTQBYInst<(outs VECREG:$rT), (ins VECREG:$rA, R32C:$rB), - [(set (vectype VECREG:$rT), - (SPUrotbytes_left (vectype VECREG:$rA), R32C:$rB))]>; +class ROTQBYGenInst: + ROTQBYInst<(outs rc:$rT), (ins rc:$rA, R32C:$rB), + [(set (type rc:$rT), + (SPUrotbytes_left (type rc:$rA), R32C:$rB))]>; + +class ROTQBYVecInst: + ROTQBYGenInst; multiclass RotateQuadLeftByBytes { @@ -2382,6 +2385,7 @@ multiclass RotateQuadLeftByBytes def v4f32: ROTQBYVecInst; def v2i64: ROTQBYVecInst; def v2f64: ROTQBYVecInst; + def i128: ROTQBYGenInst; } defm ROTQBY: RotateQuadLeftByBytes; @@ -2394,10 +2398,13 @@ class ROTQBYIInst pattern>: RI7Form<0b00111111100, OOL, IOL, "rotqbyi\t$rT, $rA, $val", RotateShift, pattern>; +class ROTQBYIGenInst: + ROTQBYIInst<(outs rclass:$rT), (ins rclass:$rA, u7imm:$val), + [(set (type rclass:$rT), + (SPUrotbytes_left (type rclass:$rA), (i16 uimm7:$val)))]>; + class ROTQBYIVecInst: - ROTQBYIInst<(outs VECREG:$rT), (ins VECREG:$rA, u7imm:$val), - [(set (vectype VECREG:$rT), - (SPUrotbytes_left (vectype VECREG:$rA), (i16 uimm7:$val)))]>; + ROTQBYIGenInst; multiclass RotateQuadByBytesImm { @@ -2407,6 +2414,7 @@ multiclass RotateQuadByBytesImm def v4f32: ROTQBYIVecInst; def v2i64: ROTQBYIVecInst; def vfi64: ROTQBYIVecInst; + def i128: ROTQBYIGenInst; } defm ROTQBYI: RotateQuadByBytesImm; @@ -2661,6 +2669,10 @@ multiclass RotateQuadBytes defm ROTQMBY : RotateQuadBytes; +def : Pat<(SPUsrl_bytes GPRC:$rA, R32C:$rB), + (ROTQMBYr128 GPRC:$rA, + (SFIr32 R32C:$rB, 0))>; + class ROTQMBYIInst pattern>: RI7Form<0b10111111100, OOL, IOL, "rotqmbyi\t$rT, $rA, $val", RotateShift, pattern>; @@ -2749,6 +2761,11 @@ multiclass RotateMaskQuadByBits defm ROTQMBI: RotateMaskQuadByBits; +def : Pat<(srl GPRC:$rA, R32C:$rB), + (ROTQMBIr128 GPRC:$rA, + (SFIr32 R32C:$rB, 0))>; + + //-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~ // Rotate quad and mask by bits, immediate //-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~ diff --git a/lib/Target/CellSPU/SPUNodes.td b/lib/Target/CellSPU/SPUNodes.td index 647da3051d3..dc3dc8ce891 100644 --- a/lib/Target/CellSPU/SPUNodes.td +++ b/lib/Target/CellSPU/SPUNodes.td @@ -83,10 +83,6 @@ def SPUcntb : SDNode<"SPUISD::CNTB", SDTIntUnaryOp>; // SPUISelLowering.h): def SPUshuffle: SDNode<"SPUISD::SHUFB", SDT_SPUshuffle, []>; -// Shift left quadword by bits and bytes -def SPUshlquad_l_bits: SDNode<"SPUISD::SHLQUAD_L_BITS", SPUvecshift_type, []>; -def SPUshlquad_l_bytes: SDNode<"SPUISD::SHLQUAD_L_BYTES", SPUvecshift_type, []>; - // Vector shifts (ISD::SHL,SRL,SRA are for _integers_ only): def SPUvec_shl: SDNode<"ISD::SHL", SPUvecshift_type, []>; def SPUvec_srl: SDNode<"ISD::SRL", SPUvecshift_type, []>; @@ -105,6 +101,12 @@ def SPUrotbytes_left: SDNode<"SPUISD::ROTBYTES_LEFT", def SPUrotbytes_left_bits : SDNode<"SPUISD::ROTBYTES_LEFT_BITS", SPUvecshift_type>; +// Shift entire quad left by bytes/bits. 
+// Zeros are shifted in on the right.
+// SHL_BITS is the same as SHL for i128, but ISD::SHL is not implemented for i128
+def SPUshlquad_l_bytes: SDNode<"SPUISD::SHL_BYTES", SPUvecshift_type, []>;
+def SPUshlquad_l_bits: SDNode<"SPUISD::SHL_BITS", SPUvecshift_type, []>;
+def SPUsrl_bytes: SDNode<"SPUISD::SRL_BYTES", SPUvecshift_type, []>;
+
 // SPU form select mask for bytes, immediate
 def SPUselmask: SDNode<"SPUISD::SELECT_MASK", SPUselmask_type, []>;
 
diff --git a/test/CodeGen/CellSPU/arg_ret.ll b/test/CodeGen/CellSPU/arg_ret.ll
index 6f07458d006..7410b724d6f 100644
--- a/test/CodeGen/CellSPU/arg_ret.ll
+++ b/test/CodeGen/CellSPU/arg_ret.ll
@@ -26,7 +26,7 @@ define ccc i32 @test_regs_and_stack( %paramstruct %prm, i32 %stackprm )
 define ccc %paramstruct @test_return( i32 %param, %paramstruct %prm )
 {
-;CHECK: lqd $75, 80($sp)
+;CHECK: lqd {{\$[0-9]+}}, 80($sp)
 ;CHECK-NOT: ori {{\$[0-9]+, \$[0-9]+, 0}}
 ;CHECK: lr $3, $4
   ret %paramstruct %prm
diff --git a/test/CodeGen/CellSPU/loads.ll b/test/CodeGen/CellSPU/loads.ll
index d40217dacfe..03d7ad1153a 100644
--- a/test/CodeGen/CellSPU/loads.ll
+++ b/test/CodeGen/CellSPU/loads.ll
@@ -38,3 +38,15 @@ define <4 x float> @load_undef(){
   %val = load <4 x float>* undef
   ret <4 x float> %val
 }
+
+;Check that 'misaligned' loads that may span two memory chunks
+;have two loads. Don't check for the bit manipulation, as that
+;might change with improved algorithms or scheduling
+define i32 @load_misaligned( i32* %ptr ){
+;CHECK: load_misaligned
+;CHECK: lqd
+;CHECK: lqd
+;CHECK: bi $lr
+  %rv = load i32* %ptr, align 2
+  ret i32 %rv
+}
diff --git a/test/CodeGen/CellSPU/stores.ll b/test/CodeGen/CellSPU/stores.ll
index 05f44f4be04..efc915ca269 100644
--- a/test/CodeGen/CellSPU/stores.ll
+++ b/test/CodeGen/CellSPU/stores.ll
@@ -14,6 +14,7 @@
 ; RUN: grep iohl    %t1.s | count 8
 ; RUN: grep shufb   %t1.s | count 15
 ; RUN: grep frds    %t1.s | count 1
+; RUN: llc < %s -march=cellspu | FileCheck %s
 
 ; ModuleID = 'stores.bc'
 target datalayout = "E-p:32:32:128-f64:64:128-f32:32:128-i64:32:128-i32:32:128-i16:16:128-i8:8:128-i1:8:128-a0:0:128-v128:128:128-s0:128:128"
@@ -149,3 +150,15 @@ entry:
   store float %conv, float* %dest
   ret float %conv
 }
+
+;Check stores that might span two 16 byte memory blocks
+define void @store_misaligned( i32 %val, i32* %ptr) {
+;CHECK: store_misaligned
+;CHECK: lqd
+;CHECK: lqd
+;CHECK: stqd
+;CHECK: stqd
+;CHECK: bi $lr
+  store i32 %val, i32*%ptr, align 2
+  ret void
+}
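
For reference, a minimal C++ sketch (not part of the patch) of the byte juggling that the new two-load path in LowerLOAD performs: both 16-byte chunks covering the access are loaded with aligned loads, the low chunk is shifted left by the address offset, the high chunk is shifted right by the offset's complement, and the two are OR'ed together. Quad, shlBytes, srlBytes and mergeMisalignedLoad are illustrative names only; the real lowering builds the equivalent SelectionDAG nodes (SPUISD::SHL_BYTES, SPUISD::SRL_BYTES, ISD::OR) on MVT::i128 values.

#include <array>
#include <cstdint>

// Byte 0 is the most-significant byte of the quadword, as on SPU.
using Quad = std::array<uint8_t, 16>;

// Models SPUISD::SHL_BYTES: shift the quad left by n bytes, filling with zeros.
static Quad shlBytes(const Quad &q, unsigned n) {
  Quad r{};
  for (unsigned i = 0; i + n < 16; ++i)
    r[i] = q[i + n];
  return r;
}

// Models SPUISD::SRL_BYTES: shift the quad right by n bytes, filling with zeros.
static Quad srlBytes(const Quad &q, unsigned n) {
  Quad r{};
  for (unsigned i = 0; i + n < 16; ++i)
    r[i + n] = q[i];
  return r;
}

// 'low' and 'high' are the two aligned 16-byte chunks covering the access and
// 'offset' is the load address modulo 16 (basePtr & 0xf in the patch). The
// loaded bytes end up left-justified in the result, ready for VEC2PREFSLOT.
// With offset == 0 the right shift by 16 zeroes out the high chunk entirely.
Quad mergeMisalignedLoad(const Quad &low, const Quad &high, unsigned offset) {
  Quad l = shlBytes(low, offset);         // low  = SHL_BYTES(low, offset)
  Quad h = srlBytes(high, 16u - offset);  // high = SRL_BYTES(high, 16 - offset)
  Quad r;                                 // result = OR(low, high)
  for (unsigned i = 0; i < 16; ++i)
    r[i] = static_cast<uint8_t>(l[i] | h[i]);
  return r;
}

The store path in LowerSTORE is the dual read-modify-write: both quads are loaded, the bytes covered by the value are cleared using the lowmask/himask built from a register full of ones, the shifted value is OR'ed in, and both quads are written back, which is why the new stores.ll test expects two lqd and two stqd instructions.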