From 11e15b38e965731e5bfff6c73d8d269196e5048c Mon Sep 17 00:00:00 2001
From: Evan Cheng <evan.cheng@apple.com>
Date: Mon, 3 Apr 2006 20:53:28 +0000
Subject: [PATCH] - More efficient extract_vector_elt with shuffle and movss,
 movsd, movd, etc. - Some bug fixes and naming inconsistency fixes.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@27377 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86ISelLowering.cpp |  62 +++++++++++-
 lib/Target/X86/X86InstrInfo.cpp    |   6 +-
 lib/Target/X86/X86InstrInfo.td     |   1 +
 lib/Target/X86/X86InstrSSE.td      | 152 +++++++++++++++++------------
 4 files changed, 151 insertions(+), 70 deletions(-)

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index bfbae93396e..693189b6765 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -280,6 +280,7 @@ X86TargetLowering::X86TargetLowering(TargetMachine &TM)
     setOperationAction(ISD::LOAD, MVT::v4f32, Legal);
     setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom);
     setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom);
+    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
   }
 
   if (Subtarget->hasSSE2()) {
@@ -316,7 +317,9 @@ X86TargetLowering::X86TargetLowering(TargetMachine &TM)
     setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i16, Custom);
     setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i32, Custom);
     setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i64, Custom);
+    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Custom);
     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i16, Custom);
+    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom);
     setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
   }
 
@@ -1484,11 +1487,20 @@ bool X86::isSHUFPMask(SDNode *N) {
     // Dest { 2, 1 } <= shuffle( Dest { 1, 0 }, Src { 3, 2 }
     // Expect bit 0 == 1, bit1 == 2
     SDOperand Bit0 = N->getOperand(0);
+    if (Bit0.getOpcode() != ISD::UNDEF) {
+      assert(isa<ConstantSDNode>(Bit0) && "Invalid VECTOR_SHUFFLE mask!");
+      if (cast<ConstantSDNode>(Bit0)->getValue() != 1)
+        return false;
+    }
+
     SDOperand Bit1 = N->getOperand(1);
-    assert(isa<ConstantSDNode>(Bit0) && isa<ConstantSDNode>(Bit1) &&
-           "Invalid VECTOR_SHUFFLE mask!");
-    return (cast<ConstantSDNode>(Bit0)->getValue() == 1 &&
-            cast<ConstantSDNode>(Bit1)->getValue() == 2);
+    if (Bit1.getOpcode() != ISD::UNDEF) {
+      assert(isa<ConstantSDNode>(Bit1) && "Invalid VECTOR_SHUFFLE mask!");
+      if (cast<ConstantSDNode>(Bit1)->getValue() != 2)
+        return false;
+    }
+
+    return true;
   }
 
   if (NumElems != 4) return false;
@@ -2660,15 +2672,55 @@ SDOperand X86TargetLowering::LowerOperation(SDOperand Op, SelectionDAG &DAG) {
     return SDOperand();
   }
   case ISD::EXTRACT_VECTOR_ELT: {
-    // Transform it so it match pextrw which produces a 32-bit result.
+    if (!isa<ConstantSDNode>(Op.getOperand(1)))
+      return SDOperand();
+
     MVT::ValueType VT = Op.getValueType();
     if (MVT::getSizeInBits(VT) == 16) {
+      // Transform it so it matches pextrw, which produces a 32-bit result.
      MVT::ValueType EVT = (MVT::ValueType)(VT+1);
      SDOperand Extract = DAG.getNode(X86ISD::PEXTRW, EVT,
                                      Op.getOperand(0), Op.getOperand(1));
      SDOperand Assert = DAG.getNode(ISD::AssertZext, EVT, Extract,
                                     DAG.getValueType(VT));
      return DAG.getNode(ISD::TRUNCATE, VT, Assert);
+    } else if (MVT::getSizeInBits(VT) == 32) {
+      SDOperand Vec = Op.getOperand(0);
+      unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getValue();
+      if (Idx == 0)
+        return Op;
+
+      // TODO: if Idx == 2, we can use unpckhps
+      // SHUFPS the element to the lowest double word, then movss.
+      MVT::ValueType MaskVT = MVT::getIntVectorWithNumElements(4);
+      SDOperand IdxNode = DAG.getConstant((Idx < 2) ? Idx : Idx+4,
+                                          MVT::getVectorBaseType(MaskVT));
+      std::vector<SDOperand> IdxVec;
+      IdxVec.push_back(DAG.getConstant(Idx, MVT::getVectorBaseType(MaskVT)));
+      IdxVec.push_back(DAG.getNode(ISD::UNDEF, MVT::getVectorBaseType(MaskVT)));
+      IdxVec.push_back(DAG.getNode(ISD::UNDEF, MVT::getVectorBaseType(MaskVT)));
+      IdxVec.push_back(DAG.getNode(ISD::UNDEF, MVT::getVectorBaseType(MaskVT)));
+      SDOperand Mask = DAG.getNode(ISD::BUILD_VECTOR, MaskVT, IdxVec);
+      Vec = DAG.getNode(ISD::VECTOR_SHUFFLE, Vec.getValueType(),
+                        Vec, Vec, Mask);
+      return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, VT, Vec,
+                         DAG.getConstant(0, MVT::i32));
+    } else if (MVT::getSizeInBits(VT) == 64) {
+      SDOperand Vec = Op.getOperand(0);
+      unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getValue();
+      if (Idx == 0)
+        return Op;
+
+      // UNPCKHPD the element to the lowest double word, then movsd.
+      MVT::ValueType MaskVT = MVT::getIntVectorWithNumElements(2);
+      std::vector<SDOperand> IdxVec;
+      IdxVec.push_back(DAG.getConstant(1, MVT::getVectorBaseType(MaskVT)));
+      IdxVec.push_back(DAG.getNode(ISD::UNDEF, MVT::getVectorBaseType(MaskVT)));
+      SDOperand Mask = DAG.getNode(ISD::BUILD_VECTOR, MaskVT, IdxVec);
+      Vec = DAG.getNode(ISD::VECTOR_SHUFFLE, Vec.getValueType(),
+                        Vec, DAG.getNode(ISD::UNDEF, Vec.getValueType()), Mask);
+      return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, VT, Vec,
+                         DAG.getConstant(0, MVT::i32));
     }
 
     return SDOperand();
diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp
index 20e1cd0ecd7..5c4ab1bdf36 100644
--- a/lib/Target/X86/X86InstrInfo.cpp
+++ b/lib/Target/X86/X86InstrInfo.cpp
@@ -31,8 +31,10 @@ bool X86InstrInfo::isMoveInstr(const MachineInstr& MI,
       oc == X86::FpMOV || oc == X86::MOVSSrr || oc == X86::MOVSDrr ||
       oc == X86::FsMOVAPSrr || oc == X86::FsMOVAPDrr ||
       oc == X86::MOVAPSrr || oc == X86::MOVAPDrr ||
-      oc == X86::MOVSS128rr || oc == X86::MOVSD128rr ||
-      oc == X86::MOVD128rr || oc == X86::MOVQ128rr) {
+      oc == X86::MOVSS2PSrr || oc == X86::MOVSD2PDrr ||
+      oc == X86::MOVPS2SSrr || oc == X86::MOVPD2SDrr ||
+      oc == X86::MOVDI2PDIrr || oc == X86::MOVQI2PQIrr ||
+      oc == X86::MOVPDI2DIrr) {
     assert(MI.getNumOperands() == 2 &&
            MI.getOperand(0).isRegister() &&
            MI.getOperand(1).isRegister() &&
diff --git a/lib/Target/X86/X86InstrInfo.td b/lib/Target/X86/X86InstrInfo.td
index 7dcdbd82ea7..172f33e9019 100644
--- a/lib/Target/X86/X86InstrInfo.td
+++ b/lib/Target/X86/X86InstrInfo.td
@@ -284,6 +284,7 @@ def i16immZExt8 : PatLeaf<(i16 imm), [{
 def loadi8  : PatFrag<(ops node:$ptr), (i8  (load node:$ptr))>;
 def loadi16 : PatFrag<(ops node:$ptr), (i16 (load node:$ptr))>;
 def loadi32 : PatFrag<(ops node:$ptr), (i32 (load node:$ptr))>;
+def loadi64 : PatFrag<(ops node:$ptr), (i64 (load node:$ptr))>;
 def loadf32 : PatFrag<(ops node:$ptr), (f32 (load node:$ptr))>;
 def loadf64 : PatFrag<(ops node:$ptr), (f64 (load node:$ptr))>;
 
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td
index beec5e2d81c..5fc0004d511 100644
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -226,24 +226,6 @@ def MOVSDmr : SDI<0x11, MRMDestMem, (ops f64mem:$dst, FR64:$src),
                   "movsd {$src, $dst|$dst, $src}",
                   [(store FR64:$src, addr:$dst)]>;
 
-// FR32 / FR64 to 128-bit vector conversion.
-def MOVSS128rr : SSI<0x10, MRMSrcReg, (ops VR128:$dst, FR32:$src),
-                     "movss {$src, $dst|$dst, $src}",
-                     [(set VR128:$dst,
-                       (v4f32 (scalar_to_vector FR32:$src)))]>;
-def MOVSS128rm : SSI<0x10, MRMSrcMem, (ops VR128:$dst, f32mem:$src),
-                     "movss {$src, $dst|$dst, $src}",
-                     [(set VR128:$dst,
-                       (v4f32 (scalar_to_vector (loadf32 addr:$src))))]>;
-def MOVSD128rr : SDI<0x10, MRMSrcReg, (ops VR128:$dst, FR64:$src),
-                     "movsd {$src, $dst|$dst, $src}",
-                     [(set VR128:$dst,
-                       (v2f64 (scalar_to_vector FR64:$src)))]>;
-def MOVSD128rm : SDI<0x10, MRMSrcMem, (ops VR128:$dst, f64mem:$src),
-                     "movsd {$src, $dst|$dst, $src}",
-                     [(set VR128:$dst,
-                       (v2f64 (scalar_to_vector (loadf64 addr:$src))))]>;
-
 // Arithmetic instructions
 let isTwoAddress = 1 in {
 let isCommutable = 1 in {
@@ -1122,18 +1104,6 @@ def HSUBPDrm : S3D_Intrm<0x7C, "hsubpd {$src2, $dst|$dst, $src2}",
 
 //===----------------------------------------------------------------------===//
 // Move Instructions
-def MOVD128rr : PDI<0x6E, MRMSrcReg, (ops VR128:$dst, R32:$src),
-                    "movd {$src, $dst|$dst, $src}",
-                    [(set VR128:$dst,
-                      (v4i32 (scalar_to_vector R32:$src)))]>;
-def MOVD128rm : PDI<0x6E, MRMSrcMem, (ops VR128:$dst, i32mem:$src),
-                    "movd {$src, $dst|$dst, $src}",
-                    [(set VR128:$dst,
-                      (v4i32 (scalar_to_vector (loadi32 addr:$src))))]>;
-
-def MOVD128mr : PDI<0x7E, MRMDestMem, (ops i32mem:$dst, VR128:$src),
-                    "movd {$src, $dst|$dst, $src}", []>;
-
 def MOVDQArr : PDI<0x6F, MRMSrcReg, (ops VR128:$dst, VR128:$src),
                    "movdqa {$src, $dst|$dst, $src}", []>;
 def MOVDQArm : PDI<0x6F, MRMSrcMem, (ops VR128:$dst, i128mem:$src),
@@ -1143,18 +1113,6 @@ def MOVDQAmr : PDI<0x7F, MRMDestMem, (ops i128mem:$dst, VR128:$src),
                    "movdqa {$src, $dst|$dst, $src}",
                    [(store (v4i32 VR128:$src), addr:$dst)]>;
 
-// SSE2 instructions with XS prefix
-def MOVQ128rr : I<0x7E, MRMSrcReg, (ops VR128:$dst, VR64:$src),
-                  "movq {$src, $dst|$dst, $src}",
-                  [(set VR128:$dst,
-                    (v2i64 (scalar_to_vector VR64:$src)))]>, XS,
-                Requires<[HasSSE2]>;
-def MOVQ128rm : I<0x7E, MRMSrcMem, (ops VR128:$dst, i64mem:$src),
-                  "movq {$src, $dst|$dst, $src}", []>, XS,
-                Requires<[HasSSE2]>;
-def MOVQ128mr : PDI<0xD6, MRMSrcMem, (ops i64mem:$dst, VR128:$src),
-                    "movq {$src, $dst|$dst, $src}", []>;
-
 // 128-bit Integer Arithmetic
 let isTwoAddress = 1 in {
 let isCommutable = 1 in {
@@ -1549,32 +1507,102 @@ def V_SETALLONES : PDI<0x76, MRMInitReg, (ops VR128:$dst),
                        "pcmpeqd $dst, $dst",
                        [(set VR128:$dst, (v2f64 immAllOnesV))]>;
 
-// Scalar to 128-bit vector with zero extension.
+// FR32 / FR64 to 128-bit vector conversion.
+def MOVSS2PSrr : SSI<0x10, MRMSrcReg, (ops VR128:$dst, FR32:$src),
+                     "movss {$src, $dst|$dst, $src}",
+                     [(set VR128:$dst,
+                       (v4f32 (scalar_to_vector FR32:$src)))]>;
+def MOVSS2PSrm : SSI<0x10, MRMSrcMem, (ops VR128:$dst, f32mem:$src),
+                     "movss {$src, $dst|$dst, $src}",
+                     [(set VR128:$dst,
+                       (v4f32 (scalar_to_vector (loadf32 addr:$src))))]>;
+def MOVSD2PDrr : SDI<0x10, MRMSrcReg, (ops VR128:$dst, FR64:$src),
+                     "movsd {$src, $dst|$dst, $src}",
+                     [(set VR128:$dst,
+                       (v2f64 (scalar_to_vector FR64:$src)))]>;
+def MOVSD2PDrm : SDI<0x10, MRMSrcMem, (ops VR128:$dst, f64mem:$src),
+                     "movsd {$src, $dst|$dst, $src}",
+                     [(set VR128:$dst,
+                       (v2f64 (scalar_to_vector (loadf64 addr:$src))))]>;
+
+def MOVDI2PDIrr : PDI<0x6E, MRMSrcReg, (ops VR128:$dst, R32:$src),
+                      "movd {$src, $dst|$dst, $src}",
+                      [(set VR128:$dst,
+                        (v4i32 (scalar_to_vector R32:$src)))]>;
+def MOVDI2PDIrm : PDI<0x6E, MRMSrcMem, (ops VR128:$dst, i32mem:$src),
+                      "movd {$src, $dst|$dst, $src}",
+                      [(set VR128:$dst,
+                        (v4i32 (scalar_to_vector (loadi32 addr:$src))))]>;
+// SSE2 instructions with XS prefix
+def MOVQI2PQIrr : I<0x7E, MRMSrcReg, (ops VR128:$dst, VR64:$src),
+                    "movq {$src, $dst|$dst, $src}",
+                    [(set VR128:$dst,
+                      (v2i64 (scalar_to_vector VR64:$src)))]>, XS,
+                  Requires<[HasSSE2]>;
+def MOVQI2PQIrm : I<0x7E, MRMSrcMem, (ops VR128:$dst, i64mem:$src),
+                    "movq {$src, $dst|$dst, $src}",
+                    [(set VR128:$dst,
+                      (v2i64 (scalar_to_vector (loadi64 addr:$src))))]>, XS,
+                  Requires<[HasSSE2]>;
+
+// FIXME: may not be able to eliminate this movss with coalescing since the
+// src and dest register classes are different. We really want to write this
+// pattern like this:
+// def : Pat<(f32 (vector_extract (v4f32 VR128:$src), (i32 0))),
+//           (f32 FR32:$src)>;
+def MOVPS2SSrr : SSI<0x10, MRMSrcReg, (ops FR32:$dst, VR128:$src),
+                     "movss {$src, $dst|$dst, $src}",
+                     [(set FR32:$dst, (vector_extract (v4f32 VR128:$src),
+                                       (i32 0)))]>;
+def MOVPS2SSmr : SSI<0x10, MRMDestMem, (ops f32mem:$dst, VR128:$src),
+                     "movss {$src, $dst|$dst, $src}",
+                     [(store (f32 (vector_extract (v4f32 VR128:$src),
+                                   (i32 0))), addr:$dst)]>;
+def MOVPD2SDrr : SDI<0x10, MRMSrcReg, (ops FR64:$dst, VR128:$src),
+                     "movsd {$src, $dst|$dst, $src}",
+                     [(set FR64:$dst, (vector_extract (v2f64 VR128:$src),
+                                       (i32 0)))]>;
+def MOVPD2SDmr : SDI<0x10, MRMDestMem, (ops f64mem:$dst, VR128:$src),
+                     "movsd {$src, $dst|$dst, $src}",
+                     [(store (f64 (vector_extract (v2f64 VR128:$src),
+                                   (i32 0))), addr:$dst)]>;
+def MOVPDI2DIrr : PDI<0x7E, MRMSrcReg, (ops R32:$dst, VR128:$src),
+                      "movd {$src, $dst|$dst, $src}",
+                      [(set R32:$dst, (vector_extract (v4i32 VR128:$src),
+                                       (i32 0)))]>;
+def MOVPDI2DImr : PDI<0x7E, MRMDestMem, (ops i32mem:$dst, VR128:$src),
+                      "movd {$src, $dst|$dst, $src}",
+                      [(store (i32 (vector_extract (v4i32 VR128:$src),
+                                    (i32 0))), addr:$dst)]>;
+
+// Move to the lower bits of a VR128, leaving the upper bits alone.
 // Three operand (but two address) aliases.
 let isTwoAddress = 1 in {
-def MOVZSS128rr : SSI<0x10, MRMSrcReg, (ops VR128:$dst, VR128:$src1, FR32:$src2),
+def MOVLSS2PSrr : SSI<0x10, MRMSrcReg, (ops VR128:$dst, VR128:$src1, FR32:$src2),
                       "movss {$src2, $dst|$dst, $src2}", []>;
-def MOVZSD128rr : SDI<0x10, MRMSrcReg, (ops VR128:$dst, VR128:$src1, FR64:$src2),
+def MOVLSD2PDrr : SDI<0x10, MRMSrcReg, (ops VR128:$dst, VR128:$src1, FR64:$src2),
                       "movsd {$src2, $dst|$dst, $src2}", []>;
-def MOVZD128rr : PDI<0x6E, MRMSrcReg, (ops VR128:$dst, VR128:$src1, R32:$src2),
+def MOVLDI2PDIrr : PDI<0x6E, MRMSrcReg, (ops VR128:$dst, VR128:$src1, R32:$src2),
                       "movd {$src2, $dst|$dst, $src2}", []>;
-def MOVZQ128rr : I<0x7E, MRMSrcReg, (ops VR128:$dst, VR128:$src1, VR64:$src2),
-                   "movq {$src2, $dst|$dst, $src2}", []>;
 }
 
+// Move to the lower bits of a VR128, zeroing the upper bits.
 // Loading from memory automatically zeroing upper bits.
-def MOVZSS128rm : SSI<0x10, MRMSrcMem, (ops VR128:$dst, f32mem:$src),
+def MOVZSS2PSrm : SSI<0x10, MRMSrcMem, (ops VR128:$dst, f32mem:$src),
                       "movss {$src, $dst|$dst, $src}",
                       [(set VR128:$dst,
                         (v4f32 (X86zexts2vec (loadf32 addr:$src))))]>;
-def MOVZSD128rm : SDI<0x10, MRMSrcMem, (ops VR128:$dst, f64mem:$src),
+def MOVZSD2PDrm : SDI<0x10, MRMSrcMem, (ops VR128:$dst, f64mem:$src),
                       "movsd {$src, $dst|$dst, $src}",
                       [(set VR128:$dst,
                         (v2f64 (X86zexts2vec (loadf64 addr:$src))))]>;
-def MOVZD128rm : PDI<0x6E, MRMSrcMem, (ops VR128:$dst, i32mem:$src),
-                     "movd {$src, $dst|$dst, $src}",
-                     [(set VR128:$dst,
-                       (v4i32 (X86zexts2vec (loadi32 addr:$src))))]>;
+def MOVZDI2PDIrm : PDI<0x6E, MRMSrcMem, (ops VR128:$dst, i32mem:$src),
+                       "movd {$src, $dst|$dst, $src}",
+                       [(set VR128:$dst,
+                         (v4i32 (X86zexts2vec (loadi32 addr:$src))))]>;
+def MOVZQI2PQIrm : PDI<0x7E, MRMSrcMem, (ops VR128:$dst, i64mem:$src),
+                       "movd {$src, $dst|$dst, $src}",
+                       [(set VR128:$dst,
+                         (v2i64 (X86zexts2vec (loadi64 addr:$src))))]>;
 
 //===----------------------------------------------------------------------===//
 // Non-Instruction Patterns
@@ -1621,9 +1649,9 @@ def : Pat<(store (v2i64 VR128:$src), addr:$dst),
 
 // Scalar to v8i16 / v16i8. The source may be a R32, but only the lower 8 or
 // 16-bits matter.
-def : Pat<(v8i16 (X86s2vec R32:$src)), (MOVD128rr R32:$src)>,
+def : Pat<(v8i16 (X86s2vec R32:$src)), (MOVDI2PDIrr R32:$src)>,
       Requires<[HasSSE2]>;
-def : Pat<(v16i8 (X86s2vec R32:$src)), (MOVD128rr R32:$src)>,
+def : Pat<(v16i8 (X86s2vec R32:$src)), (MOVDI2PDIrr R32:$src)>,
       Requires<[HasSSE2]>;
 
 // bit_convert
@@ -1659,17 +1687,15 @@ def : Pat<(v4f32 (bitconvert (v4i32 VR128:$src))), (v4f32 VR128:$src)>,
 
 // Zeroing a VR128 then do a MOVS* to the lower bits.
 def : Pat<(v2f64 (X86zexts2vec FR64:$src)),
-          (MOVZSD128rr (V_SET0_PD), FR64:$src)>, Requires<[HasSSE2]>;
+          (MOVLSD2PDrr (V_SET0_PD), FR64:$src)>, Requires<[HasSSE2]>;
 def : Pat<(v4f32 (X86zexts2vec FR32:$src)),
-          (MOVZSS128rr (V_SET0_PS), FR32:$src)>, Requires<[HasSSE2]>;
-def : Pat<(v2i64 (X86zexts2vec VR64:$src)),
-          (MOVZQ128rr (V_SET0_PI), VR64:$src)>, Requires<[HasSSE2]>;
+          (MOVLSS2PSrr (V_SET0_PS), FR32:$src)>, Requires<[HasSSE2]>;
 def : Pat<(v4i32 (X86zexts2vec R32:$src)),
-          (MOVZD128rr (V_SET0_PI), R32:$src)>, Requires<[HasSSE2]>;
+          (MOVLDI2PDIrr (V_SET0_PI), R32:$src)>, Requires<[HasSSE2]>;
 def : Pat<(v8i16 (X86zexts2vec R16:$src)),
-          (MOVZD128rr (V_SET0_PI), (MOVZX32rr16 R16:$src))>, Requires<[HasSSE2]>;
+          (MOVLDI2PDIrr (V_SET0_PI), (MOVZX32rr16 R16:$src))>, Requires<[HasSSE2]>;
 def : Pat<(v16i8 (X86zexts2vec R8:$src)),
-          (MOVZD128rr (V_SET0_PI), (MOVZX32rr8 R8:$src))>, Requires<[HasSSE2]>;
+          (MOVLDI2PDIrr (V_SET0_PI), (MOVZX32rr8 R8:$src))>, Requires<[HasSSE2]>;
 
 // Splat v2f64 / v2i64
 def : Pat<(vector_shuffle (v2f64 VR128:$src), (undef), SSE_splat_mask:$sm),
-- 
2.34.1
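
Editor's note: the following is an illustrative sketch of roughly what the new
EXTRACT_VECTOR_ELT lowering and the X86zexts2vec patterns are meant to produce,
expressed with SSE/SSE2 compiler intrinsics. It is not part of the commit; the
function names are invented, and lane 2 of a v4f32 stands in for any non-zero
index.

#include <xmmintrin.h>   // SSE:  __m128, _mm_shuffle_ps, _mm_cvtss_f32, _mm_set_ss
#include <emmintrin.h>   // SSE2: __m128d, _mm_unpackhi_pd, _mm_cvtsd_f64

// v4f32 extract, Idx != 0: "SHUFPS the element to the lowest double word,
// then movss."  shufps rotates the wanted lane into lane 0; movss reads lane 0.
float extract_f32_lane2(__m128 v) {
  __m128 t = _mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 2, 1, 2));  // lane 0 <- old lane 2
  return _mm_cvtss_f32(t);                                   // movss
}

// v2f64 extract, Idx == 1: "UNPCKHPD the element to the lowest double word,
// then movsd."
double extract_f64_hi(__m128d v) {
  __m128d t = _mm_unpackhi_pd(v, v);   // both lanes <- old lane 1
  return _mm_cvtsd_f64(t);             // movsd
}

// The X86zexts2vec patterns (V_SET0 + MOVL*): scalar to vector with the upper
// lanes zeroed, i.e. the semantics of _mm_set_ss.
__m128 zext_scalar_to_v4f32(float x) {
  return _mm_set_ss(x);                // xorps + movss under the patched patterns
}

Extracting element 0 needs no shuffle at all: with the new MOVPS2SSrr /
MOVPD2SDrr / MOVPDI2DIrr patterns above, it selects to a single movss, movsd,
or movd.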