From: Evan Cheng Date: Mon, 24 Mar 2008 21:52:23 +0000 (+0000) Subject: - SSE4.1 extractfps extracts a f32 into a gr32 register. Very useful! Not. Fix the... X-Git-Url: http://demsky.eecs.uci.edu/git/?a=commitdiff_plain;h=62a3f1538cf50f0373c2a5eeb440d6288604f969;p=oota-llvm.git - SSE4.1 extractfps extracts a f32 into a gr32 register. Very useful! Not. Fix the instruction specification and teaches lowering code to use it only when the only use is a store instruction. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@48746 91177308-0d34-0410-b5e6-96231b3b80d8 --- diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index dd2d7849d77..1d72e1f6c64 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -699,7 +699,7 @@ X86TargetLowering::X86TargetLowering(TargetMachine &TM) setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i8, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i16, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Legal); - setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Legal); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom); if (Subtarget->is64Bit()) { setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i64, Legal); @@ -3718,6 +3718,19 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT_SSE4(SDOperand Op, SDOperand Assert = DAG.getNode(ISD::AssertZext, MVT::i32, Extract, DAG.getValueType(VT)); return DAG.getNode(ISD::TRUNCATE, VT, Assert); + } else if (VT == MVT::f32) { + // EXTRACTPS outputs to a GPR32 register which will require a movd to copy + // the result back to FR32 register. It's only worth matching if the + // result has a single use which is a store. 
+ if (!Op.hasOneUse()) + return SDOperand(); + SDNode *User = *Op.Val->use_begin(); + if (User->getOpcode() != ISD::STORE) + return SDOperand(); + SDOperand Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, MVT::i32, + DAG.getNode(ISD::BIT_CONVERT, MVT::v4i32, Op.getOperand(0)), + Op.getOperand(1)); + return DAG.getNode(ISD::BIT_CONVERT, MVT::f32, Extract); } return SDOperand(); } @@ -3728,8 +3741,11 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDOperand Op, SelectionDAG &DAG) { if (!isa<ConstantSDNode>(Op.getOperand(1))) return SDOperand(); - if (Subtarget->hasSSE41()) - return LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG); + if (Subtarget->hasSSE41()) { + SDOperand Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG); + if (Res.Val) + return Res; + } MVT::ValueType VT = Op.getValueType(); // TODO: handle v16i8. diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td index 787414b10f0..9a3b2f67b1c 100644 --- a/lib/Target/X86/X86InstrSSE.td +++ b/lib/Target/X86/X86InstrSSE.td @@ -3380,19 +3380,22 @@ multiclass SS41I_extract32<bits<8> opc, string OpcodeStr> { defm PEXTRD : SS41I_extract32<0x16, "pextrd">; -/// SS41I_extractf32 - SSE 4.1 extract 32 bits to fp reg or memory destination +/// SS41I_extractf32 - SSE 4.1 extract 32 bits fp value to int reg or memory +/// destination multiclass SS41I_extractf32<bits<8> opc, string OpcodeStr> { - def rr : SS4AIi8, OpSize; + [/*(set GR32:$dst, + (extractelt (bc_v4i32 (v4f32 VR128:$src1)), imm:$src2))*/]>, + OpSize; def mr : SS4AIi8, OpSize; } diff --git a/test/CodeGen/X86/vec_extract-sse4.ll b/test/CodeGen/X86/vec_extract-sse4.ll new file mode 100644 index 00000000000..1ef5e8803ef --- /dev/null +++ b/test/CodeGen/X86/vec_extract-sse4.ll @@ -0,0 +1,30 @@ +; RUN: llvm-as < %s | llc -march=x86 -mattr=+sse41 -o %t -f +; RUN: grep extractps %t | count 1 +; RUN: grep pextrd %t | count 2 +; RUN: grep pshufd %t | count 1 + +define void @t1(float* %R, <4 x float>* %P1) { + %X = load <4 x float>* %P1 + %tmp = extractelement <4 x float> %X, i32 3 + store float %tmp, 
float* %R + ret void +} + +define float @t2(<4 x float>* %P1) { + %X = load <4 x float>* %P1 + %tmp = extractelement <4 x float> %X, i32 2 + ret float %tmp +} + +define void @t3(i32* %R, <4 x i32>* %P1) { + %X = load <4 x i32>* %P1 + %tmp = extractelement <4 x i32> %X, i32 3 + store i32 %tmp, i32* %R + ret void +} + +define i32 @t4(<4 x i32>* %P1) { + %X = load <4 x i32>* %P1 + %tmp = extractelement <4 x i32> %X, i32 3 + ret i32 %tmp +}