-static SDValue LowerVectorMUL(SDValue Op, SelectionDAG &DAG) {
- switch (Op.getValueType().getSimpleVT()) {
- default:
- cerr << "CellSPU: Unknown vector multiplication, got "
- << Op.getValueType().getMVTString()
- << "\n";
- abort();
- /*NOTREACHED*/
-
- case MVT::v4i32: {
- SDValue rA = Op.getOperand(0);
- SDValue rB = Op.getOperand(1);
- SDValue HiProd1 = DAG.getNode(SPUISD::MPYH, MVT::v4i32, rA, rB);
- SDValue HiProd2 = DAG.getNode(SPUISD::MPYH, MVT::v4i32, rB, rA);
- SDValue LoProd = DAG.getNode(SPUISD::MPYU, MVT::v4i32, rA, rB);
- SDValue Residual1 = DAG.getNode(ISD::ADD, MVT::v4i32, LoProd, HiProd1);
-
- return DAG.getNode(ISD::ADD, MVT::v4i32, Residual1, HiProd2);
- break;
- }
-
- // Multiply two v8i16 vectors (pipeline friendly version):
- // a) multiply lower halves, mask off upper 16-bit of 32-bit product
- // b) multiply upper halves, rotate left by 16 bits (inserts 16 lower zeroes)
- // c) Use SELB to select upper and lower halves from the intermediate results
- //
- // NOTE: We really want to move the SELECT_MASK to earlier to actually get the
- // dual-issue. This code does manage to do this, even if it's a little on
- // the wacky side
- case MVT::v8i16: {
- MachineFunction &MF = DAG.getMachineFunction();
- MachineRegisterInfo &RegInfo = MF.getRegInfo();
- SDValue Chain = Op.getOperand(0);
- SDValue rA = Op.getOperand(0);
- SDValue rB = Op.getOperand(1);
- unsigned FSMBIreg = RegInfo.createVirtualRegister(&SPU::VECREGRegClass);
- unsigned HiProdReg = RegInfo.createVirtualRegister(&SPU::VECREGRegClass);
-
- SDValue FSMBOp =
- DAG.getCopyToReg(Chain, FSMBIreg,
- DAG.getNode(SPUISD::SELECT_MASK, MVT::v8i16,
- DAG.getConstant(0xcccc, MVT::i16)));
-
- SDValue HHProd =
- DAG.getCopyToReg(FSMBOp, HiProdReg,
- DAG.getNode(SPUISD::MPYHH, MVT::v8i16, rA, rB));
-
- SDValue HHProd_v4i32 =
- DAG.getNode(ISD::BIT_CONVERT, MVT::v4i32,
- DAG.getCopyFromReg(HHProd, HiProdReg, MVT::v4i32));
-
- return DAG.getNode(SPUISD::SELB, MVT::v8i16,
- DAG.getNode(SPUISD::MPY, MVT::v8i16, rA, rB),
- DAG.getNode(ISD::BIT_CONVERT, Op.getValueType(),
- DAG.getNode(SPUISD::VEC_SHL, MVT::v4i32,
- HHProd_v4i32,
- DAG.getConstant(16, MVT::i16))),
- DAG.getCopyFromReg(FSMBOp, FSMBIreg, MVT::v4i32));
- }
-
- // This M00sE is N@stI! (apologies to Monty Python)
- //
- // SPU doesn't know how to do any 8-bit multiplication, so the solution
- // is to break it all apart, sign extend, and reassemble the various
- // intermediate products.
- case MVT::v16i8: {
- SDValue rA = Op.getOperand(0);
- SDValue rB = Op.getOperand(1);
- SDValue c8 = DAG.getConstant(8, MVT::i32);
- SDValue c16 = DAG.getConstant(16, MVT::i32);
-
- SDValue LLProd =
- DAG.getNode(SPUISD::MPY, MVT::v8i16,
- DAG.getNode(ISD::BIT_CONVERT, MVT::v8i16, rA),
- DAG.getNode(ISD::BIT_CONVERT, MVT::v8i16, rB));
-
- SDValue rALH = DAG.getNode(SPUISD::VEC_SRA, MVT::v8i16, rA, c8);
-
- SDValue rBLH = DAG.getNode(SPUISD::VEC_SRA, MVT::v8i16, rB, c8);
-
- SDValue LHProd =
- DAG.getNode(SPUISD::VEC_SHL, MVT::v8i16,
- DAG.getNode(SPUISD::MPY, MVT::v8i16, rALH, rBLH), c8);
-
- SDValue FSMBmask = DAG.getNode(SPUISD::SELECT_MASK, MVT::v8i16,
- DAG.getConstant(0x2222, MVT::i16));
-
- SDValue LoProdParts =
- DAG.getNode(ISD::BIT_CONVERT, MVT::v4i32,
- DAG.getNode(SPUISD::SELB, MVT::v8i16,
- LLProd, LHProd, FSMBmask));
-
- SDValue LoProdMask = DAG.getConstant(0xffff, MVT::i32);
-
- SDValue LoProd =
- DAG.getNode(ISD::AND, MVT::v4i32,
- LoProdParts,
- DAG.getNode(ISD::BUILD_VECTOR, MVT::v4i32,
- LoProdMask, LoProdMask,
- LoProdMask, LoProdMask));
-
- SDValue rAH =
- DAG.getNode(SPUISD::VEC_SRA, MVT::v4i32,
- DAG.getNode(ISD::BIT_CONVERT, MVT::v4i32, rA), c16);
-
- SDValue rBH =
- DAG.getNode(SPUISD::VEC_SRA, MVT::v4i32,
- DAG.getNode(ISD::BIT_CONVERT, MVT::v4i32, rB), c16);
-
- SDValue HLProd =
- DAG.getNode(SPUISD::MPY, MVT::v8i16,
- DAG.getNode(ISD::BIT_CONVERT, MVT::v8i16, rAH),
- DAG.getNode(ISD::BIT_CONVERT, MVT::v8i16, rBH));
-
- SDValue HHProd_1 =
- DAG.getNode(SPUISD::MPY, MVT::v8i16,
- DAG.getNode(ISD::BIT_CONVERT, MVT::v8i16,
- DAG.getNode(SPUISD::VEC_SRA,
- MVT::v4i32, rAH, c8)),
- DAG.getNode(ISD::BIT_CONVERT, MVT::v8i16,
- DAG.getNode(SPUISD::VEC_SRA,
- MVT::v4i32, rBH, c8)));
-
- SDValue HHProd =
- DAG.getNode(SPUISD::SELB, MVT::v8i16,
- HLProd,
- DAG.getNode(SPUISD::VEC_SHL, MVT::v8i16, HHProd_1, c8),
- FSMBmask);
-
- SDValue HiProd =
- DAG.getNode(SPUISD::VEC_SHL, MVT::v4i32, HHProd, c16);
-
- return DAG.getNode(ISD::BIT_CONVERT, MVT::v16i8,
- DAG.getNode(ISD::OR, MVT::v4i32,
- LoProd, HiProd));
- }
- }
-
- return SDValue();
-}
-
-static SDValue LowerFDIVf32(SDValue Op, SelectionDAG &DAG) {
- MachineFunction &MF = DAG.getMachineFunction();
- MachineRegisterInfo &RegInfo = MF.getRegInfo();
-
- SDValue A = Op.getOperand(0);
- SDValue B = Op.getOperand(1);
- MVT VT = Op.getValueType();
-
- unsigned VRegBR, VRegC;
-
- if (VT == MVT::f32) {
- VRegBR = RegInfo.createVirtualRegister(&SPU::R32FPRegClass);
- VRegC = RegInfo.createVirtualRegister(&SPU::R32FPRegClass);
- } else {
- VRegBR = RegInfo.createVirtualRegister(&SPU::VECREGRegClass);
- VRegC = RegInfo.createVirtualRegister(&SPU::VECREGRegClass);
- }
- // TODO: make sure we're feeding FPInterp the right arguments
- // Right now: fi B, frest(B)
-
- // Computes BRcpl =
- // (Floating Interpolate (FP Reciprocal Estimate B))
- SDValue BRcpl =
- DAG.getCopyToReg(DAG.getEntryNode(), VRegBR,
- DAG.getNode(SPUISD::FPInterp, VT, B,
- DAG.getNode(SPUISD::FPRecipEst, VT, B)));
-
- // Computes A * BRcpl and stores in a temporary register
- SDValue AxBRcpl =
- DAG.getCopyToReg(BRcpl, VRegC,
- DAG.getNode(ISD::FMUL, VT, A,
- DAG.getCopyFromReg(BRcpl, VRegBR, VT)));
- // What's the Chain variable do? It's magic!
- // TODO: set Chain = Op(0).getEntryNode()
-
- return DAG.getNode(ISD::FADD, VT,
- DAG.getCopyFromReg(AxBRcpl, VRegC, VT),
- DAG.getNode(ISD::FMUL, VT,
- DAG.getCopyFromReg(AxBRcpl, VRegBR, VT),
- DAG.getNode(ISD::FSUB, VT, A,
- DAG.getNode(ISD::FMUL, VT, B,
- DAG.getCopyFromReg(AxBRcpl, VRegC, VT)))));
-}
-