From: Elena Demikhovsky
Date: Wed, 1 Feb 2012 07:56:44 +0000 (+0000)
Subject: Optimization for "truncate" operation on AVX.
X-Git-Url: http://demsky.eecs.uci.edu/git/?a=commitdiff_plain;h=3ae98150e3e3c4770644a33cf38fe42a582509bb;p=oota-llvm.git

Optimization for "truncate" operation on AVX.

Truncating v4i64 -> v4i32 and v8i32 -> v8i16 may be done with a set of shuffles.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@149485 91177308-0d34-0410-b5e6-96231b3b80d8
---

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 2ce883a1700..f84d2864056 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -1218,6 +1218,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
   setTargetDAGCombine(ISD::LOAD);
   setTargetDAGCombine(ISD::STORE);
   setTargetDAGCombine(ISD::ZERO_EXTEND);
+  setTargetDAGCombine(ISD::TRUNCATE);
   setTargetDAGCombine(ISD::SINT_TO_FP);
   if (Subtarget->is64Bit())
     setTargetDAGCombine(ISD::MUL);
@@ -12911,6 +12912,104 @@ static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
   return EltsFromConsecutiveLoads(VT, Elts, dl, DAG);
 }
 
+
+/// PerformTruncateCombine - Convert a vector truncate into a sequence of
+/// vector shuffles. This is done only before operation legalization, on AVX
+/// targets, when truncating a 256-bit vector to a 128-bit vector (v4i64 ->
+/// v4i32 or v8i32 -> v8i16); otherwise an empty SDValue is returned.
+SDValue X86TargetLowering::PerformTruncateCombine(SDNode *N, SelectionDAG &DAG,
+                                                  DAGCombinerInfo &DCI) const {
+  if (!DCI.isBeforeLegalizeOps())
+    return SDValue();
+
+  if (!Subtarget->hasAVX()) return SDValue();
+
+  EVT VT = N->getValueType(0);
+  SDValue Op = N->getOperand(0);
+  EVT OpVT = Op.getValueType();
+  DebugLoc dl = N->getDebugLoc();
+
+  if ((VT == MVT::v4i32) && (OpVT == MVT::v4i64)) {
+
+    SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i64, Op,
+                          DAG.getIntPtrConstant(0));
+
+    SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i64, Op,
+                          DAG.getIntPtrConstant(2));
+
+    OpLo = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, OpLo);
+    OpHi = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, OpHi);
+
+    // PSHUFD: pack the low i32 of each i64 into elements 0 and 1.
+    SmallVector<int,4> ShufMask1;
+    ShufMask1.push_back(0);
+    ShufMask1.push_back(2);
+    ShufMask1.push_back(0);
+    ShufMask1.push_back(0);
+
+    OpLo = DAG.getVectorShuffle(VT, dl, OpLo, DAG.getUNDEF(VT),
+                                ShufMask1.data());
+    OpHi = DAG.getVectorShuffle(VT, dl, OpHi, DAG.getUNDEF(VT),
+                                ShufMask1.data());
+
+    // MOVLHPS: concatenate the low 64 bits of OpLo and OpHi.
+    SmallVector<int,4> ShufMask2;
+    ShufMask2.push_back(0);
+    ShufMask2.push_back(1);
+    ShufMask2.push_back(4);
+    ShufMask2.push_back(5);
+
+    return DAG.getVectorShuffle(VT, dl, OpLo, OpHi, ShufMask2.data());
+  }
+  if ((VT == MVT::v8i16) && (OpVT == MVT::v8i32)) {
+
+    SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i32, Op,
+                          DAG.getIntPtrConstant(0));
+
+    SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i32, Op,
+                          DAG.getIntPtrConstant(4));
+
+    OpLo = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, OpLo);
+    OpHi = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, OpHi);
+
+    // PSHUFB: pack the low 2 bytes of each i32 into bytes 0-7; rest is undef.
+    SmallVector<int,16> ShufMask1;
+    ShufMask1.push_back(0x0);
+    ShufMask1.push_back(0x1);
+    ShufMask1.push_back(0x4);
+    ShufMask1.push_back(0x5);
+    ShufMask1.push_back(0x8);
+    ShufMask1.push_back(0x9);
+    ShufMask1.push_back(0xc);
+    ShufMask1.push_back(0xd);
+    for (unsigned i=0; i<8; ++i)
+      ShufMask1.push_back(-1);
+
+    OpLo = DAG.getVectorShuffle(MVT::v16i8, dl, OpLo,
+                                DAG.getUNDEF(MVT::v16i8),
+                                ShufMask1.data());
+    OpHi = DAG.getVectorShuffle(MVT::v16i8, dl, OpHi,
+                                DAG.getUNDEF(MVT::v16i8),
+                                ShufMask1.data());
+
+    OpLo = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, OpLo);
+    OpHi = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, OpHi);
+
+    // MOVLHPS: concatenate the low 64 bits of OpLo and OpHi.
+    SmallVector<int,4> ShufMask2;
+    ShufMask2.push_back(0);
+    ShufMask2.push_back(1);
+    ShufMask2.push_back(4);
+    ShufMask2.push_back(5);
+
+    SDValue res = DAG.getVectorShuffle(MVT::v4i32, dl, OpLo, OpHi, ShufMask2.data());
+    return DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, res);
+
+  }
+
+  return SDValue();
+}
+
 /// PerformEXTRACT_VECTOR_ELTCombine - Detect vector gather/scatter index
 /// generation and convert it from being a bunch of shuffles and extracts
 /// to a simple store and scalar loads to extract the elements.
@@ -14771,6 +14870,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
   case X86ISD::BT:          return PerformBTCombine(N, DAG, DCI);
   case X86ISD::VZEXT_MOVL:  return PerformVZEXT_MOVLCombine(N, DAG);
   case ISD::ZERO_EXTEND:    return PerformZExtCombine(N, DAG, Subtarget);
+  case ISD::TRUNCATE:       return PerformTruncateCombine(N, DAG, DCI);
   case X86ISD::SETCC:       return PerformSETCCCombine(N, DAG);
   case X86ISD::SHUFP:       // Handle all target specific shuffles
   case X86ISD::PALIGN:
diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h
index 253946bf08e..1a3892193b5 100644
--- a/lib/Target/X86/X86ISelLowering.h
+++ b/lib/Target/X86/X86ISelLowering.h
@@ -839,6 +839,7 @@ namespace llvm {
     SDValue LowerMEMBARRIER(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerATOMIC_FENCE(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const;
+    SDValue PerformTruncateCombine(SDNode* N, SelectionDAG &DAG, DAGCombinerInfo &DCI) const;
 
     // Utility functions to help LowerVECTOR_SHUFFLE
     SDValue LowerVECTOR_SHUFFLEv8i16(SDValue Op, SelectionDAG &DAG) const;
diff --git a/test/CodeGen/X86/avx-trunc.ll b/test/CodeGen/X86/avx-trunc.ll
new file mode 100755
index 00000000000..dd0e268c3cf
--- /dev/null
+++ b/test/CodeGen/X86/avx-trunc.ll
@@ -0,0 +1,15 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck %s
+; Check that 256-bit to 128-bit vector truncates are lowered to shuffles on AVX.
+
+define <4 x i32> @trunc_64_32(<4 x i64> %A) nounwind uwtable readnone ssp {
+; CHECK: trunc_64_32
+; CHECK: pshufd
+  %B = trunc <4 x i64> %A to <4 x i32>
+  ret <4 x i32> %B
+}
+define <8 x i16> @trunc_32_16(<8 x i32> %A) nounwind uwtable readnone ssp {
+; CHECK: trunc_32_16
+; CHECK: pshufb
+  %B = trunc <8 x i32> %A to <8 x i16>
+  ret <8 x i16> %B
+}