From: Asaf Badouh Date: Wed, 18 Nov 2015 09:42:45 +0000 (+0000) Subject: [X86][AVX512CD] add mask broadcast intrinsics X-Git-Url: http://demsky.eecs.uci.edu/git/?a=commitdiff_plain;h=51f7c584fa6862cf0f2b9d0d50de529c623b0841;p=oota-llvm.git [X86][AVX512CD] add mask broadcast intrinsics Differential Revision: http://reviews.llvm.org/D14573 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@253450 91177308-0d34-0410-b5e6-96231b3b80d8 --- diff --git a/include/llvm/IR/IntrinsicsX86.td b/include/llvm/IR/IntrinsicsX86.td index 2b5577cdad9..df3f9931476 100644 --- a/include/llvm/IR/IntrinsicsX86.td +++ b/include/llvm/IR/IntrinsicsX86.td @@ -4955,6 +4955,24 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". def int_x86_avx512_pbroadcastq_i64_512 : Intrinsic<[llvm_v8i64_ty], [llvm_i64_ty], [IntrNoMem]>; + def int_x86_avx512_broadcastmw_512 : + GCCBuiltin<"__builtin_ia32_broadcastmw512">, + Intrinsic<[llvm_v16i32_ty], [llvm_i16_ty], [IntrNoMem]>; + def int_x86_avx512_broadcastmw_256 : + GCCBuiltin<"__builtin_ia32_broadcastmw256">, + Intrinsic<[llvm_v8i32_ty], [llvm_i16_ty], [IntrNoMem]>; + def int_x86_avx512_broadcastmw_128 : + GCCBuiltin<"__builtin_ia32_broadcastmw128">, + Intrinsic<[llvm_v4i32_ty], [llvm_i16_ty], [IntrNoMem]>; + def int_x86_avx512_broadcastmb_512 : + GCCBuiltin<"__builtin_ia32_broadcastmb512">, + Intrinsic<[llvm_v8i64_ty], [llvm_i8_ty], [IntrNoMem]>; + def int_x86_avx512_broadcastmb_256 : + GCCBuiltin<"__builtin_ia32_broadcastmb256">, + Intrinsic<[llvm_v4i64_ty], [llvm_i8_ty], [IntrNoMem]>; + def int_x86_avx512_broadcastmb_128 : + GCCBuiltin<"__builtin_ia32_broadcastmb128">, + Intrinsic<[llvm_v2i64_ty], [llvm_i8_ty], [IntrNoMem]>; } // Vector sign and zero extend diff --git a/lib/Target/X86/Disassembler/X86Disassembler.cpp b/lib/Target/X86/Disassembler/X86Disassembler.cpp index 63929b189ee..ce8fcf16466 100644 --- a/lib/Target/X86/Disassembler/X86Disassembler.cpp +++ b/lib/Target/X86/Disassembler/X86Disassembler.cpp @@ -833,8 +833,12 @@ static bool translateRM(MCInst &mcInst, const OperandSpecifier &operand, case TYPE_XMM256: case TYPE_XMM512: case TYPE_VK1: + case TYPE_VK2: + case TYPE_VK4: case TYPE_VK8: case TYPE_VK16: + case TYPE_VK32: + case TYPE_VK64: case TYPE_DEBUGREG: case TYPE_CONTROLREG: case TYPE_BNDR: diff --git a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp index a6073159f2d..6140c5ac1e7 100644 --- a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp +++ b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp @@ -1488,8 +1488,12 @@ static int readModRM(struct InternalInstruction* insn) { case TYPE_XMM: \ return prefix##_XMM0 + index; \ case TYPE_VK1: \ + case TYPE_VK2: \ + case TYPE_VK4: \ case TYPE_VK8: \ case TYPE_VK16: \ + case TYPE_VK32: \ + case TYPE_VK64: \ if (index > 7) \ *valid = 0; \ return prefix##_K0 + index; \ diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 2be35511f18..31401f2fb8f 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -16463,6 +16463,12 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget DataToCompress), Mask, PassThru, Subtarget, DAG); } + case BROADCASTM: { + SDValue Mask = Op.getOperand(1); + MVT MaskVT = MVT::getVectorVT(MVT::i1, Mask.getSimpleValueType().getSizeInBits()); + Mask = DAG.getBitcast(MaskVT, Mask); + return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Mask); + } case BLEND: { SDValue Mask = Op.getOperand(3); MVT VT = Op.getSimpleValueType(); @@ -20108,6 +20114,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::UNPCKL: return "X86ISD::UNPCKL"; case X86ISD::UNPCKH: return "X86ISD::UNPCKH"; case X86ISD::VBROADCAST: return "X86ISD::VBROADCAST"; + case X86ISD::VBROADCASTM: return "X86ISD::VBROADCASTM"; case X86ISD::SUBV_BROADCAST: return "X86ISD::SUBV_BROADCAST"; case X86ISD::VEXTRACT: return "X86ISD::VEXTRACT"; case X86ISD::VPERMILPV: return "X86ISD::VPERMILPV"; diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h index c800f16489b..c1b6328c712 100644 --- a/lib/Target/X86/X86ISelLowering.h +++ b/lib/Target/X86/X86ISelLowering.h @@ -403,6 +403,8 @@ namespace llvm { VFPCLASS, // Broadcast scalar to vector VBROADCAST, + // Broadcast mask to vector + VBROADCASTM, // Broadcast subvector to vector SUBV_BROADCAST, // Insert/Extract vector element diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td index 0069dd00d6f..96da043c2ff 100644 --- a/lib/Target/X86/X86InstrAVX512.td +++ b/lib/Target/X86/X86InstrAVX512.td @@ -1113,30 +1113,27 @@ def : Pat<(v8f64 (X86VBroadcast FR64X:$src)), //===----------------------------------------------------------------------===// // AVX-512 BROADCAST MASK TO VECTOR REGISTER //--- - -multiclass avx512_mask_broadcast opc, string OpcodeStr, - RegisterClass KRC> { -let Predicates = [HasCDI] in -def Zrr : AVX512XS8I, EVEX, EVEX_V512; - -let Predicates = [HasCDI, HasVLX] in { -def Z128rr : AVX512XS8I, EVEX, EVEX_V128; -def Z256rr : AVX512XS8I opc, string OpcodeStr, + X86VectorVTInfo _, RegisterClass KRC> { + def rr : AVX512XS8I, EVEX, EVEX_V256; + [(set _.RC:$dst, (_.VT (X86VBroadcastm KRC:$src)))]>, EVEX; } + +multiclass avx512_mask_broadcast opc, string OpcodeStr, + AVX512VLVectorVTInfo VTInfo, RegisterClass KRC> { + let Predicates = [HasCDI] in + defm Z : avx512_mask_broadcastm, EVEX_V512; + let Predicates = [HasCDI, HasVLX] in { + defm Z256 : avx512_mask_broadcastm, EVEX_V256; + defm Z128 : avx512_mask_broadcastm, EVEX_V128; + } } -let Predicates = [HasCDI] in { defm VPBROADCASTMW2D : avx512_mask_broadcast<0x3A, "vpbroadcastmw2d", - VK16>; + avx512vl_i32_info, VK16>; defm VPBROADCASTMB2Q : avx512_mask_broadcast<0x2A, "vpbroadcastmb2q", - VK8>, VEX_W; -} + avx512vl_i64_info, VK8>, VEX_W; //===----------------------------------------------------------------------===// // -- VPERM2I - 3 source operands form -- diff --git a/lib/Target/X86/X86InstrFragmentsSIMD.td b/lib/Target/X86/X86InstrFragmentsSIMD.td index 90710bfdfc0..68891fa18db 100644 --- a/lib/Target/X86/X86InstrFragmentsSIMD.td +++ b/lib/Target/X86/X86InstrFragmentsSIMD.td @@ -295,7 +295,8 @@ def SDTFPUnaryOpImmRound: SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>, SDTCisInt<2>, SDTCisInt<3>]>; def SDTVBroadcast : SDTypeProfile<1, 1, [SDTCisVec<0>]>; -def SDTVBroadcastm : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>]>; +def SDTVBroadcastm : SDTypeProfile<1, 1, [SDTCisVec<0>, + SDTCisInt<0>, SDTCisInt<1>]>; def SDTBlend : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>, SDTCisSameAs<1,2>, SDTCisVT<3, i8>]>; @@ -387,6 +388,7 @@ def X86SubV32x2Broadcast : SDNode<"X86ISD::SUBV_BROADCAST", SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>]>, []>; def X86VBroadcast : SDNode<"X86ISD::VBROADCAST", SDTVBroadcast>; +def X86VBroadcastm : SDNode<"X86ISD::VBROADCASTM", SDTVBroadcastm>; def X86Vinsert : SDNode<"X86ISD::VINSERT", SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>, SDTCisPtrTy<3>]>, []>; def X86Vextract : SDNode<"X86ISD::VEXTRACT", SDTypeProfile<1, 2, diff --git a/lib/Target/X86/X86IntrinsicsInfo.h b/lib/Target/X86/X86IntrinsicsInfo.h index 337c46c56c8..b4df8dae219 100644 --- a/lib/Target/X86/X86IntrinsicsInfo.h +++ b/lib/Target/X86/X86IntrinsicsInfo.h @@ -30,7 +30,7 @@ enum IntrinsicType { COMPRESS_EXPAND_IN_REG, COMPRESS_TO_MEM, TRUNCATE_TO_MEM_VI8, TRUNCATE_TO_MEM_VI16, TRUNCATE_TO_MEM_VI32, EXPAND_FROM_MEM, BLEND, INSERT_SUBVEC, - TERLOG_OP_MASK, TERLOG_OP_MASKZ + TERLOG_OP_MASK, TERLOG_OP_MASKZ, BROADCASTM }; struct IntrinsicData { @@ -315,6 +315,12 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx2_psubus_b, INTR_TYPE_2OP, X86ISD::SUBUS, 0), X86_INTRINSIC_DATA(avx2_psubus_w, INTR_TYPE_2OP, X86ISD::SUBUS, 0), X86_INTRINSIC_DATA(avx2_vperm2i128, INTR_TYPE_3OP, X86ISD::VPERM2X128, 0), + X86_INTRINSIC_DATA(avx512_broadcastmb_128, BROADCASTM, X86ISD::VBROADCASTM, 0), + X86_INTRINSIC_DATA(avx512_broadcastmb_256, BROADCASTM, X86ISD::VBROADCASTM, 0), + X86_INTRINSIC_DATA(avx512_broadcastmb_512, BROADCASTM, X86ISD::VBROADCASTM, 0), + X86_INTRINSIC_DATA(avx512_broadcastmw_128, BROADCASTM, X86ISD::VBROADCASTM, 0), + X86_INTRINSIC_DATA(avx512_broadcastmw_256, BROADCASTM, X86ISD::VBROADCASTM, 0), + X86_INTRINSIC_DATA(avx512_broadcastmw_512, BROADCASTM, X86ISD::VBROADCASTM, 0), X86_INTRINSIC_DATA(avx512_cvtsi2sd32, INTR_TYPE_3OP, X86ISD::SINT_TO_FP_RND, 0), X86_INTRINSIC_DATA(avx512_cvtsi2sd64, INTR_TYPE_3OP, X86ISD::SINT_TO_FP_RND, 0), X86_INTRINSIC_DATA(avx512_cvtsi2ss32, INTR_TYPE_3OP, X86ISD::SINT_TO_FP_RND, 0), diff --git a/test/CodeGen/X86/avx512cd-intrinsics.ll b/test/CodeGen/X86/avx512cd-intrinsics.ll new file mode 100644 index 00000000000..29f17bbc019 --- /dev/null +++ b/test/CodeGen/X86/avx512cd-intrinsics.ll @@ -0,0 +1,18 @@ +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512cd | FileCheck %s + +define <16 x i32> @test_x86_vbroadcastmw_512(i16 %a0) { + ; CHECK: test_x86_vbroadcastmw_512 + ; CHECK: vpbroadcastmw2d %k0, %zmm0 + %res = call <16 x i32> @llvm.x86.avx512.broadcastmw.512(i16 %a0) ; + ret <16 x i32> %res +} +declare <16 x i32> @llvm.x86.avx512.broadcastmw.512(i16) + +define <8 x i64> @test_x86_broadcastmb_512(i8 %a0) { + ; CHECK: test_x86_broadcastmb_512 + ; CHECK: vpbroadcastmb2q %k0, %zmm0 + %res = call <8 x i64> @llvm.x86.avx512.broadcastmb.512(i8 %a0) ; + ret <8 x i64> %res +} +declare <8 x i64> @llvm.x86.avx512.broadcastmb.512(i8) + diff --git a/test/CodeGen/X86/avx512cdvl-intrinsics.ll b/test/CodeGen/X86/avx512cdvl-intrinsics.ll index b82782c6253..14e91e1a876 100644 --- a/test/CodeGen/X86/avx512cdvl-intrinsics.ll +++ b/test/CodeGen/X86/avx512cdvl-intrinsics.ll @@ -146,3 +146,34 @@ define <4 x i64>@test_int_x86_avx512_mask_vpconflict_q_256(<4 x i64> %x0, <4 x i ret <4 x i64> %res2 } +define <8 x i32> @test_x86_vbroadcastmw_256(i16 %a0) { + ; CHECK: test_x86_vbroadcastmw_256 + ; CHECK: vpbroadcastmw2d %k0, %ymm0 + %res = call <8 x i32> @llvm.x86.avx512.broadcastmw.256(i16 %a0) ; + ret <8 x i32> %res +} +declare <8 x i32> @llvm.x86.avx512.broadcastmw.256(i16) + +define <4 x i32> @test_x86_vbroadcastmw_128(i16 %a0) { + ; CHECK: test_x86_vbroadcastmw_128 + ; CHECK: vpbroadcastmw2d %k0, %xmm0 + %res = call <4 x i32> @llvm.x86.avx512.broadcastmw.128(i16 %a0) ; + ret <4 x i32> %res +} +declare <4 x i32> @llvm.x86.avx512.broadcastmw.128(i16) + +define <4 x i64> @test_x86_broadcastmb_256(i8 %a0) { + ; CHECK: test_x86_broadcastmb_256 + ; CHECK: vpbroadcastmb2q %k0, %ymm0 + %res = call <4 x i64> @llvm.x86.avx512.broadcastmb.256(i8 %a0) ; + ret <4 x i64> %res +} +declare <4 x i64> @llvm.x86.avx512.broadcastmb.256(i8) + +define <2 x i64> @test_x86_broadcastmb_128(i8 %a0) { + ; CHECK: test_x86_broadcastmb_128 + ; CHECK: vpbroadcastmb2q %k0, %xmm0 + %res = call <2 x i64> @llvm.x86.avx512.broadcastmb.128(i8 %a0) ; + ret <2 x i64> %res +} +declare <2 x i64> @llvm.x86.avx512.broadcastmb.128(i8)