From: Elena Demikhovsky
Date: Thu, 2 Feb 2012 09:10:43 +0000 (+0000)
Subject: Optimization for SIGN_EXTEND operation on AVX.
X-Git-Url: http://demsky.eecs.uci.edu/git/?a=commitdiff_plain;h=dcabc7bca9b81c384d307cbb7d28b29451e263f2;p=oota-llvm.git

Optimization for SIGN_EXTEND operation on AVX.
Special handling was added for the v4i32 -> v4i64 and v8i16 -> v8i32
extensions.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@149600 91177308-0d34-0410-b5e6-96231b3b80d8
---

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 76b21aca00e..7b8221a1392 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -1221,6 +1221,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
   setTargetDAGCombine(ISD::LOAD);
   setTargetDAGCombine(ISD::STORE);
   setTargetDAGCombine(ISD::ZERO_EXTEND);
+  setTargetDAGCombine(ISD::SIGN_EXTEND);
   setTargetDAGCombine(ISD::TRUNCATE);
   setTargetDAGCombine(ISD::SINT_TO_FP);
   if (Subtarget->is64Bit())
@@ -14641,6 +14642,55 @@ static SDValue PerformVZEXT_MOVLCombine(SDNode *N, SelectionDAG &DAG) {
   return SDValue();
 }
 
+static SDValue PerformSExtCombine(SDNode *N, SelectionDAG &DAG,
+                                  TargetLowering::DAGCombinerInfo &DCI,
+                                  const X86Subtarget *Subtarget) {
+  if (!DCI.isBeforeLegalizeOps())
+    return SDValue();
+
+  if (!Subtarget->hasAVX()) return SDValue();
+
+  // Optimize vectors in AVX mode:
+  // sign-extend v8i16 to v8i32 and v4i32 to v4i64.
+  //
+  // Divide the input vector into two parts
+  // (for v4i32 the shuffle masks are {0, 1, -1, -1} and {2, 3, -1, -1}),
+  // use the vpmovsx instructions to extend each half
+  // (v4i32 -> v2i64; v8i16 -> v4i32),
+  // then concatenate the results back to the original VT.
+
+  EVT VT = N->getValueType(0);
+  SDValue Op = N->getOperand(0);
+  EVT OpVT = Op.getValueType();
+  DebugLoc dl = N->getDebugLoc();
+
+  if (((VT == MVT::v4i64) && (OpVT == MVT::v4i32)) ||
+      ((VT == MVT::v8i32) && (OpVT == MVT::v8i16))) {
+
+    unsigned NumElems = OpVT.getVectorNumElements();
+    SmallVector<int, 8> ShufMask1(NumElems, -1); // low half: {0..N/2-1, undef...}
+    for (unsigned i = 0; i < NumElems/2; ++i) ShufMask1[i] = i;
+
+    SDValue OpLo = DAG.getVectorShuffle(OpVT, dl, Op, DAG.getUNDEF(OpVT),
+                                        ShufMask1.data());
+
+    SmallVector<int, 8> ShufMask2(NumElems, -1); // high half: {N/2..N-1, undef...}
+    for (unsigned i = 0; i < NumElems/2; ++i) ShufMask2[i] = i + NumElems/2;
+
+    SDValue OpHi = DAG.getVectorShuffle(OpVT, dl, Op, DAG.getUNDEF(OpVT),
+                                        ShufMask2.data());
+
+    EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), VT.getScalarType(),
+                                  VT.getVectorNumElements()/2);
+
+    OpLo = DAG.getNode(X86ISD::VSEXT_MOVL, dl, HalfVT, OpLo);
+    OpHi = DAG.getNode(X86ISD::VSEXT_MOVL, dl, HalfVT, OpHi);
+
+    return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
+  }
+  return SDValue();
+}
+
 static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG,
                                   const X86Subtarget *Subtarget) {
   // (i32 zext (and (i8 x86isd::setcc_carry), 1)) ->
@@ -14886,6 +14936,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
   case X86ISD::BT:          return PerformBTCombine(N, DAG, DCI);
   case X86ISD::VZEXT_MOVL:  return PerformVZEXT_MOVLCombine(N, DAG);
   case ISD::ZERO_EXTEND:    return PerformZExtCombine(N, DAG, Subtarget);
+  case ISD::SIGN_EXTEND:    return PerformSExtCombine(N, DAG, DCI, Subtarget);
   case ISD::TRUNCATE:       return PerformTruncateCombine(N, DAG, DCI);
   case X86ISD::SETCC:       return PerformSETCCCombine(N, DAG);
   case X86ISD::SHUFP:       // Handle all target specific shuffles
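For intuition, the DAG the combine builds for the v8i16 -> v8i32 case has the
same shape as the following intrinsics sketch. This is an illustration only,
not code from the patch: the function name is made up, _mm_srli_si128 stands in
for the {4, 5, 6, 7, -1, -1, -1, -1} high-half shuffle, and it assumes a
compiler with AVX enabled (e.g. -mavx).

#include <immintrin.h>

// Sketch: split the 128-bit source, sign-extend each half with vpmovsxwd,
// and concatenate the two halves into one 256-bit result.
static inline __m256i sext_v8i16_to_v8i32(__m128i v) {
  __m128i Lo = _mm_cvtepi16_epi32(v);                     // elements 0..3
  __m128i Hi = _mm_cvtepi16_epi32(_mm_srli_si128(v, 8));  // elements 4..7
  return _mm256_insertf128_si256(_mm256_castsi128_si256(Lo), Hi, 1);
}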
diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h
index 1a3892193b5..9689bcd5863 100644
--- a/lib/Target/X86/X86ISelLowering.h
+++ b/lib/Target/X86/X86ISelLowering.h
@@ -219,6 +219,9 @@ namespace llvm {
       // VZEXT_MOVL - Vector move low and zero extend.
       VZEXT_MOVL,
 
+      // VSEXT_MOVL - Vector move low and sign extend.
+      VSEXT_MOVL,
+
       // VSHL, VSRL - 128-bit vector logical left / right shift
       VSHLDQ, VSRLDQ,
 
diff --git a/lib/Target/X86/X86InstrFragmentsSIMD.td b/lib/Target/X86/X86InstrFragmentsSIMD.td
index 98ed34ad738..f239509a9c5 100644
--- a/lib/Target/X86/X86InstrFragmentsSIMD.td
+++ b/lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -71,6 +71,9 @@ def X86insrtps : SDNode<"X86ISD::INSERTPS",
                                       SDTCisVT<2, v4f32>, SDTCisPtrTy<3>]>>;
 def X86vzmovl  : SDNode<"X86ISD::VZEXT_MOVL",
                         SDTypeProfile<1, 1, [SDTCisSameAs<0,1>]>>;
+def X86vsmovl  : SDNode<"X86ISD::VSEXT_MOVL",
+                        SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisInt<1>, SDTCisInt<0>]>>;
+
 def X86vzload  : SDNode<"X86ISD::VZEXT_LOAD", SDTLoad,
                         [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
 def X86vshldq  : SDNode<"X86ISD::VSHLDQ", SDTIntShiftOp>;
 
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td
index 8f6df304c7b..34478a026d2 100644
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -5478,6 +5478,16 @@ let Predicates = [HasSSE41] in {
             (PMOVZXDQrm addr:$src)>;
 }
 
+let Predicates = [HasAVX] in {
+def : Pat<(v2i64 (X86vsmovl (v4i32 VR128:$src))), (VPMOVSXDQrr VR128:$src)>;
+def : Pat<(v4i32 (X86vsmovl (v8i16 VR128:$src))), (VPMOVSXWDrr VR128:$src)>;
+}
+
+let Predicates = [HasSSE41] in {
+def : Pat<(v2i64 (X86vsmovl (v4i32 VR128:$src))), (PMOVSXDQrr VR128:$src)>;
+def : Pat<(v4i32 (X86vsmovl (v8i16 VR128:$src))), (PMOVSXWDrr VR128:$src)>;
+}
+
 multiclass SS41I_binop_rm_int4<bits<8> opc, string OpcodeStr, Intrinsic IntId> {
   def rr : SS48I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
 
diff --git a/test/CodeGen/X86/avx-sext.ll b/test/CodeGen/X86/avx-sext.ll
new file mode 100644
--- /dev/null
+++ b/test/CodeGen/X86/avx-sext.ll
@@ -0,0 +1,17 @@
+; RUN: llc < %s -march=x86-64 -mcpu=corei7-avx -mattr=+avx | FileCheck %s
+
+define <8 x i32> @sext_8i16_to_8i32(<8 x i16> %A) nounwind uwtable readnone ssp {
+;CHECK: sext_8i16_to_8i32
+;CHECK: vpmovsxwd
+
+  %B = sext <8 x i16> %A to <8 x i32>
+  ret <8 x i32> %B
+}
+
+define <4 x i64> @sext_4i32_to_4i64(<4 x i32> %A) nounwind uwtable readnone ssp {
+;CHECK: sext_4i32_to_4i64
+;CHECK: vpmovsxdq
+
+  %B = sext <4 x i32> %A to <4 x i64>
+  ret <4 x i64> %B
+}
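The v4i32 -> v4i64 path mirrors the sketch shown earlier; again a hypothetical
intrinsics illustration, not code from the patch, where vpmovsxdq corresponds
to _mm_cvtepi32_epi64:

#include <immintrin.h>

// Sketch: the same split/extend/concat shape for v4i32 -> v4i64.
static inline __m256i sext_v4i32_to_v4i64(__m128i v) {
  __m128i Lo = _mm_cvtepi32_epi64(v);                     // elements 0..1
  __m128i Hi = _mm_cvtepi32_epi64(_mm_srli_si128(v, 8));  // elements 2..3
  return _mm256_insertf128_si256(_mm256_castsi128_si256(Lo), Hi, 1);
}

Note that the CHECK lines above only assert that a vpmovsxwd or vpmovsxdq
instruction appears in the output; they do not pin down the surrounding
shuffle and concatenation sequence.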