From 125a76c50274eeadb8e0e503f77f8c583fd5c5c3 Mon Sep 17 00:00:00 2001
From: Elena Demikhovsky
Date: Mon, 4 May 2015 12:40:50 +0000
Subject: [PATCH] AVX-512: added calling convention for i1 vectors in 32-bit
 mode.

Fixed some bugs in extend/truncate for AVX-512 target.
Removed VBROADCASTM (masked broadcast) node, since it is not used any more.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@236420 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86CallingConv.td        | 12 +++-
 lib/Target/X86/X86ISelLowering.cpp      | 76 ++++++++++++-------
 lib/Target/X86/X86ISelLowering.h        |  2 -
 lib/Target/X86/X86InstrAVX512.td        | 33 ++++++-----
 lib/Target/X86/X86InstrFragmentsSIMD.td |  1 -
 test/CodeGen/X86/avx512-calling-conv.ll | 43 ++++++++++++++
 6 files changed, 105 insertions(+), 62 deletions(-)

(An illustrative IR sketch of the new 32-bit i1-vector calling convention
follows the patch, after the version trailer.)

diff --git a/lib/Target/X86/X86CallingConv.td b/lib/Target/X86/X86CallingConv.td
index 8ce911079f4..203dc3efa3e 100644
--- a/lib/Target/X86/X86CallingConv.td
+++ b/lib/Target/X86/X86CallingConv.td
@@ -268,7 +268,7 @@ def CC_X86_64_C : CallingConv<[
             CCIfSubtarget<"hasSSE2()",
               CCPromoteToType<x86mmx>>>>,
 
-  // Boolean vectors of AVX-512 are returned in SIMD registers.
+  // Boolean vectors of AVX-512 are passed in SIMD registers.
   // The call from AVX to AVX-512 function should work,
   // since the boolean types in AVX/AVX2 are promoted by default.
   CCIfType<[v2i1],  CCPromoteToType<v2i64>>,
@@ -472,6 +472,16 @@ def CC_X86_32_Common : CallingConv<[
   // Long doubles get slots whose size depends on the subtarget.
   CCIfType<[f80], CCAssignToStack<0, 4>>,
 
+  // Boolean vectors of AVX-512 are passed in SIMD registers.
+  // The call from AVX to AVX-512 function should work,
+  // since the boolean types in AVX/AVX2 are promoted by default.
+  CCIfType<[v2i1],  CCPromoteToType<v2i64>>,
+  CCIfType<[v4i1],  CCPromoteToType<v4i32>>,
+  CCIfType<[v8i1],  CCPromoteToType<v8i16>>,
+  CCIfType<[v16i1], CCPromoteToType<v16i8>>,
+  CCIfType<[v32i1], CCPromoteToType<v32i8>>,
+  CCIfType<[v64i1], CCPromoteToType<v64i8>>,
+
   // The first 4 SSE vector arguments are passed in XMM registers.
   CCIfNotVarArg<CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
                 CCAssignToReg<[XMM0, XMM1, XMM2, XMM3]>>>,
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index acacff946e4..49f0a8a73bf 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -1312,6 +1312,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     setOperationAction(ISD::TRUNCATE,    MVT::v16i16, Custom);
     setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom);
     setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom);
+    setOperationAction(ISD::ANY_EXTEND,  MVT::v16i32, Custom);
+    setOperationAction(ISD::ANY_EXTEND,  MVT::v8i64, Custom);
     setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom);
     setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom);
     setOperationAction(ISD::SIGN_EXTEND, MVT::v16i8, Custom);
@@ -2078,7 +2080,7 @@ X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
   // Copy all of the result registers out of their specified physreg.
   for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
     CCValAssign &VA = RVLocs[i];
-    EVT CopyVT = VA.getValVT();
+    EVT CopyVT = VA.getLocVT();
 
     // If this is x86-64, and we disabled SSE, we can't return FP values
     if ((CopyVT == MVT::f32 || CopyVT == MVT::f64) &&
@@ -2088,15 +2090,18 @@ X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
 
     // If we prefer to use the value in xmm registers, copy it out as f80 and
     // use a truncate to move it from fp stack reg to xmm reg.
+    bool RoundAfterCopy = false;
     if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) &&
-        isScalarFPTypeInSSEReg(VA.getValVT()))
+        isScalarFPTypeInSSEReg(VA.getValVT())) {
       CopyVT = MVT::f80;
+      RoundAfterCopy = (CopyVT != VA.getLocVT());
+    }
 
     Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(),
                                CopyVT, InFlag).getValue(1);
     SDValue Val = Chain.getValue(0);
 
-    if (CopyVT != VA.getValVT())
+    if (RoundAfterCopy)
       Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
                         // This truncation won't change the value.
                         DAG.getIntPtrConstant(1, dl));
@@ -2825,7 +2830,9 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
       Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
       break;
     case CCValAssign::AExt:
-      if (RegVT.is128BitVector()) {
+      if (Arg.getValueType().getScalarType() == MVT::i1)
+        Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
+      else if (RegVT.is128BitVector()) {
         // Special case: passing MMX values in XMM registers.
         Arg = DAG.getNode(ISD::BITCAST, dl, MVT::i64, Arg);
         Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
@@ -11969,6 +11976,9 @@ static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
   MVT InVT = In.getSimpleValueType();
   SDLoc dl(Op);
 
+  if (VT.is512BitVector() || InVT.getScalarType() == MVT::i1)
+    return DAG.getNode(ISD::ZERO_EXTEND, dl, VT, In);
+
   // Optimize vectors in AVX mode:
   //
   //   v8i16 -> v8i32
@@ -12018,22 +12028,17 @@ static SDValue LowerZERO_EXTEND_AVX512(SDValue Op,
   if (VT.is512BitVector() && InVT.getVectorElementType() != MVT::i1)
     return DAG.getNode(X86ISD::VZEXT, DL, VT, In);
 
-  EVT ExtVT = (NumElts == 8)? MVT::v8i64 : MVT::v16i32;
-  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
-
   // Now we have only mask extension
   assert(InVT.getVectorElementType() == MVT::i1);
-  SDValue Cst = DAG.getTargetConstant(1, DL, ExtVT.getScalarType());
-  const Constant *C = cast<ConstantSDNode>(Cst)->getConstantIntValue();
-  SDValue CP = DAG.getConstantPool(C, TLI.getPointerTy());
-  unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
-  SDValue Ld = DAG.getLoad(Cst.getValueType(), DL, DAG.getEntryNode(), CP,
-                           MachinePointerInfo::getConstantPool(),
-                           false, false, false, Alignment);
-
-  SDValue Brcst = DAG.getNode(X86ISD::VBROADCASTM, DL, ExtVT, In, Ld);
+  MVT ExtVT = NumElts == 8 ? MVT::v8i64 : MVT::v16i32;
+  SDValue One =
+      DAG.getConstant(APInt(ExtVT.getScalarSizeInBits(), 1), DL, ExtVT);
+  SDValue Zero =
+      DAG.getConstant(APInt::getNullValue(ExtVT.getScalarSizeInBits()), DL, ExtVT);
+
+  SDValue V = DAG.getNode(ISD::VSELECT, DL, ExtVT, In, One, Zero);
   if (VT.is512BitVector())
-    return Brcst;
-  return DAG.getNode(X86ISD::VTRUNC, DL, VT, Brcst);
+    return V;
+  return DAG.getNode(X86ISD::VTRUNC, DL, VT, V);
 }
 
 static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget *Subtarget,
@@ -12115,14 +12120,8 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
       InVT = ExtVT;
     }
 
-    SDValue Cst = DAG.getTargetConstant(1, DL, InVT.getVectorElementType());
-    const Constant *C = cast<ConstantSDNode>(Cst)->getConstantIntValue();
-    SDValue CP = DAG.getConstantPool(C, getPointerTy());
-    unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
-    SDValue Ld = DAG.getLoad(Cst.getValueType(), DL, DAG.getEntryNode(), CP,
-                             MachinePointerInfo::getConstantPool(),
-                             false, false, false, Alignment);
-    SDValue OneV = DAG.getNode(X86ISD::VBROADCAST, DL, InVT, Ld);
+    SDValue OneV =
+        DAG.getConstant(APInt::getSignBit(InVT.getScalarSizeInBits()), DL, InVT);
     SDValue And = DAG.getNode(ISD::AND, DL, InVT, OneV, In);
     return DAG.getNode(X86ISD::TESTM, DL, VT, And, And);
   }
@@ -13802,22 +13801,18 @@ static SDValue LowerSIGN_EXTEND_AVX512(SDValue Op, const X86Subtarget *Subtarget
     return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
   }
 
-  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   assert (InVT.getVectorElementType() == MVT::i1 && "Unexpected vector type");
-
-  MVT ExtVT = (NumElts == 8) ? MVT::v8i64 : MVT::v16i32;
-  Constant *C = ConstantInt::get(*DAG.getContext(),
-    APInt::getAllOnesValue(ExtVT.getScalarType().getSizeInBits()));
-
-  SDValue CP = DAG.getConstantPool(C, TLI.getPointerTy());
-  unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
-  SDValue Ld = DAG.getLoad(ExtVT.getScalarType(), dl, DAG.getEntryNode(), CP,
-                           MachinePointerInfo::getConstantPool(),
-                           false, false, false, Alignment);
-  SDValue Brcst = DAG.getNode(X86ISD::VBROADCASTM, dl, ExtVT, In, Ld);
+  MVT ExtVT = NumElts == 8 ? MVT::v8i64 : MVT::v16i32;
+  SDValue NegOne =
+      DAG.getConstant(APInt::getAllOnesValue(ExtVT.getScalarSizeInBits()), dl,
+                      ExtVT);
+  SDValue Zero =
+      DAG.getConstant(APInt::getNullValue(ExtVT.getScalarSizeInBits()), dl, ExtVT);
+
+  SDValue V = DAG.getNode(ISD::VSELECT, dl, ExtVT, In, NegOne, Zero);
   if (VT.is512BitVector())
-    return Brcst;
-  return DAG.getNode(X86ISD::VTRUNC, dl, VT, Brcst);
+    return V;
+  return DAG.getNode(X86ISD::VTRUNC, dl, VT, V);
 }
 
 static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget *Subtarget,
@@ -17860,7 +17855,6 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
   case X86ISD::UNPCKL:      return "X86ISD::UNPCKL";
   case X86ISD::UNPCKH:      return "X86ISD::UNPCKH";
   case X86ISD::VBROADCAST:  return "X86ISD::VBROADCAST";
-  case X86ISD::VBROADCASTM: return "X86ISD::VBROADCASTM";
   case X86ISD::VEXTRACT:    return "X86ISD::VEXTRACT";
   case X86ISD::VPERMILPI:   return "X86ISD::VPERMILPI";
   case X86ISD::VPERM2X128:  return "X86ISD::VPERM2X128";
diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h
index 027a3ca8a88..0d24c6a7a56 100644
--- a/lib/Target/X86/X86ISelLowering.h
+++ b/lib/Target/X86/X86ISelLowering.h
@@ -371,8 +371,6 @@ namespace llvm {
       VPERMI,
       VPERM2X128,
       VBROADCAST,
-      // masked broadcast
-      VBROADCASTM,
       // Insert/Extract vector element
       VINSERT,
       VEXTRACT,
diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td
index 1fbea4b82be..41f4584c2a4 100644
--- a/lib/Target/X86/X86InstrAVX512.td
+++ b/lib/Target/X86/X86InstrAVX512.td
@@ -802,12 +802,8 @@ def : Pat <(v8i64 (X86vzext VK8WM:$mask)),
 
 def : Pat<(v16i32 (X86VBroadcast (i32 GR32:$src))),
           (VPBROADCASTDrZr GR32:$src)>;
-def : Pat<(v16i32 (X86VBroadcastm VK16WM:$mask, (i32 GR32:$src))),
-          (VPBROADCASTDrZrkz VK16WM:$mask, GR32:$src)>;
 def : Pat<(v8i64 (X86VBroadcast (i64 GR64:$src))),
           (VPBROADCASTQrZr GR64:$src)>;
-def : Pat<(v8i64 (X86VBroadcastm VK8WM:$mask, (i64 GR64:$src))),
-          (VPBROADCASTQrZrkz VK8WM:$mask, GR64:$src)>;
 
 def : Pat<(v16i32 (int_x86_avx512_pbroadcastd_i32_512 (i32 GR32:$src))),
           (VPBROADCASTDrZr GR32:$src)>;
@@ -829,24 +825,33 @@ multiclass avx512_int_broadcast_rm<bits<8> opc, string OpcodeStr,
                   !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                   [(set DstRC:$dst,
                     (OpVT (X86VBroadcast (SrcVT VR128X:$src))))]>, EVEX;
-  def krr : AVX5128I, EVEX, EVEX_K;
+  def rrkz : AVX5128I,
-                      EVEX, EVEX_KZ;
+                      []>, EVEX, EVEX_KZ;
   let mayLoad = 1 in {
   def rm : AVX5128I, EVEX;
-  def krm : AVX5128I, EVEX, EVEX_K;
+  def rmkz : AVX5128I,
+             [(set DstRC:$dst, (OpVT (vselect KRC:$mask,
+               (X86VBroadcast (ld_frag addr:$src)),
+               (OpVT (bitconvert (v16i32 immAllZerosV))))))]>, EVEX, EVEX_KZ;
   }
 }
@@ -907,12 +912,6 @@ def : Pat<(v8f64 (X86VBroadcast FR64X:$src)),
           (VBROADCASTSDZr (COPY_TO_REGCLASS FR64X:$src, VR128X))>;
 
-let Predicates = [HasAVX512] in {
-def : Pat<(v8i32 (X86VBroadcastm (v8i1 VK8WM:$mask), (loadi32 addr:$src))),
-          (EXTRACT_SUBREG
-            (v16i32 (VPBROADCASTDZkrm (COPY_TO_REGCLASS VK8WM:$mask, VK16WM),
-                                      addr:$src)), sub_ymm)>;
-}
 //===----------------------------------------------------------------------===//
 // AVX-512 BROADCAST MASK TO VECTOR REGISTER
 //---
diff --git a/lib/Target/X86/X86InstrFragmentsSIMD.td b/lib/Target/X86/X86InstrFragmentsSIMD.td
index 0a9f9d88251..0bf9d1d80f7 100644
--- a/lib/Target/X86/X86InstrFragmentsSIMD.td
+++ b/lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -273,7 +273,6 @@ def X86VPermiv3 : SDNode<"X86ISD::VPERMIV3", SDTShuff3Op>;
 def X86VPerm2x128 : SDNode<"X86ISD::VPERM2X128", SDTShuff3OpI>;
 def X86VBroadcast : SDNode<"X86ISD::VBROADCAST", SDTVBroadcast>;
SDNode<"X86ISD::VBROADCAST", SDTVBroadcast>; -def X86VBroadcastm : SDNode<"X86ISD::VBROADCASTM", SDTVBroadcastm>; def X86Vinsert : SDNode<"X86ISD::VINSERT", SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>, SDTCisPtrTy<3>]>, []>; def X86Vextract : SDNode<"X86ISD::VEXTRACT", SDTypeProfile<1, 2, diff --git a/test/CodeGen/X86/avx512-calling-conv.ll b/test/CodeGen/X86/avx512-calling-conv.ll index cd09218810e..d18fd7e175e 100644 --- a/test/CodeGen/X86/avx512-calling-conv.ll +++ b/test/CodeGen/X86/avx512-calling-conv.ll @@ -1,5 +1,6 @@ ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s --check-prefix=KNL ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck %s --check-prefix=SKX +; RUN: llc < %s -mtriple=i686-apple-darwin -mcpu=knl | FileCheck %s --check-prefix=KNL ; KNL-LABEL: test1 ; KNL: vxorps @@ -42,3 +43,45 @@ define <4 x i1> @test4(<4 x i1>%a, <4 x i1>%b) { ret <4 x i1> %c } +; SKX-LABEL: test5 +; SKX: vpcmpgtd +; SKX: vpmovm2w +; SKX: call +; SKX: vpmovzxwd +declare <8 x i1> @func8xi1(<8 x i1> %a) +define <8 x i32> @test5(<8 x i32>%a, <8 x i32>%b) { + %cmpRes = icmp sgt <8 x i32>%a, %b + %resi = call <8 x i1> @func8xi1(<8 x i1> %cmpRes) + %res = sext <8 x i1>%resi to <8 x i32> + ret <8 x i32> %res +} + +declare <16 x i1> @func16xi1(<16 x i1> %a) + +; KNL-LABEL: test6 +; KNL: vpbroadcastd +; KNL: vpmovdb +; KNL: call +; KNL: vpmovzxbd +; KNL: vpslld $31, %zmm +; KNL: vpsrad $31, %zmm +define <16 x i32> @test6(<16 x i32>%a, <16 x i32>%b) { + %cmpRes = icmp sgt <16 x i32>%a, %b + %resi = call <16 x i1> @func16xi1(<16 x i1> %cmpRes) + %res = sext <16 x i1>%resi to <16 x i32> + ret <16 x i32> %res +} + +declare <4 x i1> @func4xi1(<4 x i1> %a) +; SKX-LABEL: test7 +; SKX: vpmovm2d +; SKX: call +; SKX: vpslld $31, %xmm +; SKX: vpsrad $31, %xmm + +define <4 x i32> @test7(<4 x i32>%a, <4 x i32>%b) { + %cmpRes = icmp sgt <4 x i32>%a, %b + %resi = call <4 x i1> @func4xi1(<4 x i1> %cmpRes) + %res = sext <4 x i1>%resi to <4 x i32> + ret <4 x i32> %res +} -- 2.34.1