From 92bfb547700550fcdb668862533e4952a8d74969 Mon Sep 17 00:00:00 2001
From: Elena Demikhovsky
Date: Mon, 26 Aug 2013 12:45:35 +0000
Subject: [PATCH] AVX-512: Added shuffle instructions - VPSHUFD, VPERMILPS,
 VMOVDDUP, VMOVLHPS, VMOVHLPS, VSHUFPS, VALIGN single and double forms.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@189215 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86ISelLowering.cpp |  68 +++++++-------
 lib/Target/X86/X86InstrAVX512.td   | 139 ++++++++++++++++++++++++++++-
 lib/Target/X86/X86InstrSSE.td      |   4 +-
 test/CodeGen/X86/avx512-shuffle.ll |  62 +++++++++++++
 4 files changed, 239 insertions(+), 34 deletions(-)

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index a00f848d2af..6a7ca7d5b48 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -3600,7 +3600,7 @@ static bool isPALIGNRMask(ArrayRef<int> Mask, MVT VT,
     return false;
 
   unsigned NumElts = VT.getVectorNumElements();
-  unsigned NumLanes = VT.getSizeInBits()/128;
+  unsigned NumLanes = VT.is512BitVector() ? 1: VT.getSizeInBits()/128;
   unsigned NumLaneElts = NumElts/NumLanes;
 
   // Do not handle 64-bit element shuffles with palignr.
@@ -3683,10 +3683,7 @@ static void CommuteVectorShuffleMask(SmallVectorImpl<int> &Mask,
 /// specifies a shuffle of elements that is suitable for input to 128/256-bit
 /// SHUFPS and SHUFPD. If Commuted is true, then it checks for sources to be
 /// reverse of what x86 shuffles want.
-static bool isSHUFPMask(ArrayRef<int> Mask, MVT VT, bool HasFp256,
-                        bool Commuted = false) {
-  if (!HasFp256 && VT.is256BitVector())
-    return false;
+static bool isSHUFPMask(ArrayRef<int> Mask, MVT VT, bool Commuted = false) {
 
   unsigned NumElems = VT.getVectorNumElements();
   unsigned NumLanes = VT.getSizeInBits()/128;
@@ -3695,6 +3692,10 @@ static bool isSHUFPMask(ArrayRef<int> Mask, MVT VT, bool HasFp256,
   if (NumLaneElems != 2 && NumLaneElems != 4)
     return false;
 
+  unsigned EltSize = VT.getVectorElementType().getSizeInBits();
+  bool symetricMaskRequired =
+    (VT.getSizeInBits() >= 256) && (EltSize == 32);
+
   // VSHUFPSY divides the resulting vector into 4 chunks.
   // The sources are also splitted into 4 chunks, and each destination
   // chunk must come from a different source chunk.
@@ -3714,6 +3715,7 @@ static bool isSHUFPMask(ArrayRef<int> Mask, MVT VT, bool HasFp256,
   //
   //  DST  =>  Y3..Y2,  X3..X2,  Y1..Y0,  X1..X0
   //
+  SmallVector<int, 4> MaskVal(NumLaneElems, -1);
   unsigned HalfLaneElems = NumLaneElems/2;
   for (unsigned l = 0; l != NumElems; l += NumLaneElems) {
     for (unsigned i = 0; i != NumLaneElems; ++i) {
@@ -3724,9 +3726,13 @@ static bool isSHUFPMask(ArrayRef<int> Mask, MVT VT, bool HasFp256,
       // For VSHUFPSY, the mask of the second half must be the same as the
       // first but with the appropriate offsets. This works in the same way as
       // VPERMILPS works with masks.
-      if (NumElems != 8 || l == 0 || Mask[i] < 0)
+      if (!symetricMaskRequired || Idx < 0)
+        continue;
+      if (MaskVal[i] < 0) {
+        MaskVal[i] = Idx - l;
         continue;
-      if (!isUndefOrEqual(Idx, Mask[i]+l))
+      }
+      if ((signed)(Idx - l) != MaskVal[i])
         return false;
     }
   }
@@ -4158,31 +4164,32 @@ static bool isPermImmMask(ArrayRef<int> Mask, MVT VT, unsigned& Imm8) {
 /// to the same elements of the low, but to the higher half of the source.
 /// In VPERMILPD the two lanes could be shuffled independently of each other
 /// with the same restriction that lanes can't be crossed. Also handles PSHUFDY.
-static bool isVPERMILPMask(ArrayRef<int> Mask, MVT VT, bool HasFp256) {
-  if (!HasFp256)
+static bool isVPERMILPMask(ArrayRef<int> Mask, MVT VT) {
+  unsigned EltSize = VT.getVectorElementType().getSizeInBits();
+  if (VT.getSizeInBits() < 256 || EltSize < 32)
     return false;
-
+  bool symetricMaskRequired = (EltSize == 32);
   unsigned NumElts = VT.getVectorNumElements();
-  // Only match 256-bit with 32/64-bit types
-  if (!VT.is256BitVector() || (NumElts != 4 && NumElts != 8))
-    return false;
 
   unsigned NumLanes = VT.getSizeInBits()/128;
   unsigned LaneSize = NumElts/NumLanes;
+  // 2 or 4 elements in one lane
+
+  SmallVector<int, 4> ExpectedMaskVal(LaneSize, -1);
   for (unsigned l = 0; l != NumElts; l += LaneSize) {
     for (unsigned i = 0; i != LaneSize; ++i) {
       if (!isUndefOrInRange(Mask[i+l], l, l+LaneSize))
         return false;
-      if (NumElts != 8 || l == 0)
-        continue;
-      // VPERMILPS handling
-      if (Mask[i] < 0)
-        continue;
-      if (!isUndefOrEqual(Mask[i+l], Mask[i]+l))
-        return false;
+      if (symetricMaskRequired) {
+        if (ExpectedMaskVal[i] < 0 && Mask[i+l] >= 0) {
+          ExpectedMaskVal[i] = Mask[i+l] - l;
+          continue;
+        }
+        if (!isUndefOrEqual(Mask[i+l], ExpectedMaskVal[i]+l))
+          return false;
+      }
     }
   }
-
   return true;
 }
@@ -4431,10 +4438,11 @@ static unsigned getShufflePSHUFLWImmediate(ShuffleVectorSDNode *N) {
 /// the specified VECTOR_SHUFFLE mask with the PALIGNR instruction.
 static unsigned getShufflePALIGNRImmediate(ShuffleVectorSDNode *SVOp) {
   MVT VT = SVOp->getSimpleValueType(0);
-  unsigned EltSize = VT.getVectorElementType().getSizeInBits() >> 3;
+  unsigned EltSize = VT.is512BitVector() ? 1 :
+    VT.getVectorElementType().getSizeInBits() >> 3;
   unsigned NumElts = VT.getVectorNumElements();
 
-  unsigned NumLanes = VT.getSizeInBits()/128;
+  unsigned NumLanes = VT.is512BitVector() ? 1 : VT.getSizeInBits()/128;
   unsigned NumLaneElts = NumElts/NumLanes;
 
   int Val = 0;
@@ -7407,7 +7415,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
   }
 
   // Normalize the node to match x86 shuffle ops if needed
-  if (!V2IsUndef && (isSHUFPMask(M, VT, HasFp256, /* Commuted */ true)))
+  if (!V2IsUndef && (isSHUFPMask(M, VT, /* Commuted */ true)))
     return CommuteVectorShuffle(SVOp, DAG);
 
   // The checks below are all present in isShuffleMaskLegal, but they are
@@ -7430,7 +7438,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
                                 getShufflePSHUFLWImmediate(SVOp), DAG);
 
-  if (isSHUFPMask(M, VT, HasFp256))
+  if (isSHUFPMask(M, VT))
     return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V1, V2,
                                 getShuffleSHUFImmediate(SVOp), DAG);
 
@@ -7449,8 +7457,8 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
     return getTargetShuffleNode(X86ISD::MOVDDUP, dl, VT, V1, DAG);
 
   // Handle VPERMILPS/D* permutations
-  if (isVPERMILPMask(M, VT, HasFp256)) {
-    if (HasInt256 && VT == MVT::v8i32)
+  if (isVPERMILPMask(M, VT)) {
+    if ((HasInt256 && VT == MVT::v8i32) || VT == MVT::v16i32)
       return getTargetShuffleNode(X86ISD::PSHUFD, dl, VT, V1,
                                   getShuffleSHUFImmediate(SVOp), DAG);
     return getTargetShuffleNode(X86ISD::VPERMILP, dl, VT, V1,
@@ -13621,7 +13629,7 @@ X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M,
   return (SVT.getVectorNumElements() == 2 ||
           ShuffleVectorSDNode::isSplatMask(&M[0], VT) ||
          isMOVLMask(M, SVT) ||
-          isSHUFPMask(M, SVT, Subtarget->hasFp256()) ||
+          isSHUFPMask(M, SVT) ||
          isPSHUFDMask(M, SVT) ||
          isPSHUFHWMask(M, SVT, Subtarget->hasInt256()) ||
          isPSHUFLWMask(M, SVT, Subtarget->hasInt256()) ||
@@ -13646,8 +13654,8 @@ X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask,
   if (NumElts == 4 && SVT.is128BitVector()) {
     return (isMOVLMask(Mask, SVT) ||
             isCommutedMOVLMask(Mask, SVT, true) ||
-            isSHUFPMask(Mask, SVT, Subtarget->hasFp256()) ||
-            isSHUFPMask(Mask, SVT, Subtarget->hasFp256(), /* Commuted */ true));
+            isSHUFPMask(Mask, SVT) ||
+            isSHUFPMask(Mask, SVT, /* Commuted */ true));
   }
   return false;
 }
diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td
index 17be5df3948..cf4a0f56eb8 100644
--- a/lib/Target/X86/X86InstrAVX512.td
+++ b/lib/Target/X86/X86InstrAVX512.td
@@ -1621,6 +1621,45 @@ defm VPUNPCKHDQZ : avx512_unpack_int<0x6A, "vpunpckhdq", X86Unpckh, v16i32,
 defm VPUNPCKHQDQZ : avx512_unpack_int<0x6D, "vpunpckhqdq", X86Unpckh, v8i64,
                                       VR512, memopv8i64, i512mem>, EVEX_V512,
                                       VEX_W, EVEX_CD8<64, CD8VF>;
+//===----------------------------------------------------------------------===//
+// AVX-512 - PSHUFD
+//
+
+multiclass avx512_pshuf_imm<bits<8> opc, string OpcodeStr, RegisterClass RC,
+                      SDNode OpNode, PatFrag mem_frag,
+                      X86MemOperand x86memop, ValueType OpVT> {
+  def ri : AVX512Ii8<opc, MRMSrcReg, (outs RC:$dst),
+                     (ins RC:$src1, i8imm:$src2),
+                     !strconcat(OpcodeStr,
+                         "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                     [(set RC:$dst,
+                       (OpVT (OpNode RC:$src1, (i8 imm:$src2))))]>,
+            EVEX;
+  def mi : AVX512Ii8<opc, MRMSrcMem, (outs RC:$dst),
+                     (ins x86memop:$src1, i8imm:$src2),
+                     !strconcat(OpcodeStr,
+                         "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                     [(set RC:$dst,
+                       (OpVT (OpNode (mem_frag addr:$src1),
+                         (i8 imm:$src2))))]>, EVEX;
+}
+
+defm VPSHUFDZ : avx512_pshuf_imm<0x70, "vpshufd", VR512, X86PShufd, memopv16i32,
+                      i512mem, v16i32>, OpSize, EVEX_V512, EVEX_CD8<32, CD8VF>;
+
+let ExeDomain = SSEPackedSingle in
+defm VPERMILPSZ : avx512_pshuf_imm<0x04, "vpermilps", VR512, X86VPermilp,
+                      memopv16f32, i512mem, v16f32>, OpSize, TA, EVEX_V512,
+                      EVEX_CD8<32, CD8VF>;
+let ExeDomain = SSEPackedDouble in
+defm VPERMILPDZ : avx512_pshuf_imm<0x05, "vpermilpd", VR512, X86VPermilp,
+                      memopv8f64, i512mem, v8f64>, OpSize, TA, EVEX_V512,
+                      VEX_W, EVEX_CD8<32, CD8VF>;
+
+def : Pat<(v16i32 (X86VPermilp VR512:$src1, (i8 imm:$imm))),
+          (VPERMILPSZri VR512:$src1, imm:$imm)>;
+def : Pat<(v8i64 (X86VPermilp VR512:$src1, (i8 imm:$imm))),
+          (VPERMILPDZri VR512:$src1, imm:$imm)>;
 
 //===----------------------------------------------------------------------===//
 // AVX-512  Logical Instructions
@@ -1774,8 +1813,8 @@ multiclass avx512_vptest<bits<8> opc, string OpcodeStr, RegisterClass KRC,
 defm VPTESTMDZ  : avx512_vptest<0x27, "vptestmd", VK16, VR512, f512mem,
                               memopv16i32, X86testm, v16i32>, EVEX_V512,
                               EVEX_CD8<32, CD8VF>;
-defm VPTESTMQZ  : avx512_vptest<0x27, "vptestmq", VK8, VR512, f512mem, memopv8i64,
-                              X86testm, v8i64>, EVEX_V512, VEX_W,
+defm VPTESTMQZ  : avx512_vptest<0x27, "vptestmq", VK8, VR512, f512mem,
+                              memopv8i64, X86testm, v8i64>, EVEX_V512, VEX_W,
                               EVEX_CD8<64, CD8VF>;
 
 //===----------------------------------------------------------------------===//
@@ -1914,3 +1953,99 @@ defm VPSRAVDZ : avx512_var_shift<0x46, "vpsravd", sra, VR512, v16i32,
 defm VPSRAVQZ : avx512_var_shift<0x46, "vpsravq", sra, VR512, v8i64,
                                  i512mem, memopv8i64>, EVEX_V512, VEX_W,
                                  EVEX_CD8<64, CD8VF>;
+
+//===----------------------------------------------------------------------===//
+// AVX-512 - MOVDDUP
+//===----------------------------------------------------------------------===//
+
+multiclass avx512_movddup<string OpcodeStr, RegisterClass RC, ValueType VT,
+                          X86MemOperand x86memop, PatFrag memop_frag> {
+def rr  : AVX512PDI<0x12, MRMSrcReg, (outs RC:$dst), (ins RC:$src),
+                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+                    [(set RC:$dst, (VT (X86Movddup RC:$src)))]>, EVEX;
+def rm  : AVX512PDI<0x12, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
+                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+                    [(set RC:$dst,
+                      (VT (X86Movddup (memop_frag addr:$src))))]>, EVEX;
+}
+
+defm VMOVDDUPZ : avx512_movddup<"vmovddup", VR512, v8f64, f512mem, memopv8f64>,
+                 VEX_W, EVEX_V512, EVEX_CD8<64, CD8VF>;
+def : Pat<(X86Movddup (v8f64 (scalar_to_vector (loadf64 addr:$src)))),
+          (VMOVDDUPZrm addr:$src)>;
+
+def VMOVLHPSZrr : AVX512PSI<0x16, MRMSrcReg, (outs VR128X:$dst),
+                       (ins VR128X:$src1, VR128X:$src2),
+                       "vmovlhps{z}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                       [(set VR128X:$dst, (v4f32 (X86Movlhps VR128X:$src1, VR128X:$src2)))],
+                       IIC_SSE_MOV_LH>, EVEX_4V;
+def VMOVHLPSZrr : AVX512PSI<0x12, MRMSrcReg, (outs VR128X:$dst),
+                       (ins VR128X:$src1, VR128X:$src2),
+                       "vmovhlps{z}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                       [(set VR128X:$dst, (v4f32 (X86Movhlps VR128X:$src1, VR128X:$src2)))],
+                       IIC_SSE_MOV_LH>, EVEX_4V;
+
+// MOVLHPS patterns
+def : Pat<(v4i32 (X86Movlhps VR128X:$src1, VR128X:$src2)),
+          (VMOVLHPSZrr VR128X:$src1, VR128X:$src2)>;
+def : Pat<(v2i64 (X86Movlhps VR128X:$src1, VR128X:$src2)),
+          (VMOVLHPSZrr (v2i64 VR128X:$src1), VR128X:$src2)>;
+
+// MOVHLPS patterns
+def : Pat<(v4i32 (X86Movhlps VR128X:$src1, VR128X:$src2)),
+          (VMOVHLPSZrr VR128X:$src1, VR128X:$src2)>;
+//===----------------------------------------------------------------------===//
+// VSHUFPS - VSHUFPD Operations
+
+multiclass avx512_shufp<RegisterClass RC, X86MemOperand x86memop,
+                        ValueType vt, string OpcodeStr, PatFrag mem_frag,
+                        Domain d> {
+  def rmi : AVX512PIi8<0xC6, MRMSrcMem, (outs RC:$dst),
+                   (ins RC:$src1, x86memop:$src2, i8imm:$src3),
+                   !strconcat(OpcodeStr,
+                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+                   [(set RC:$dst, (vt (X86Shufp RC:$src1, (mem_frag addr:$src2),
+                                       (i8 imm:$src3))))], d, IIC_SSE_SHUFP>,
+                   EVEX_4V, TB, Sched<[WriteShuffleLd, ReadAfterLd]>;
+  def rri : AVX512PIi8<0xC6, MRMSrcReg, (outs RC:$dst),
+                   (ins RC:$src1, RC:$src2, i8imm:$src3),
+                   !strconcat(OpcodeStr,
+                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+                   [(set RC:$dst, (vt (X86Shufp RC:$src1, RC:$src2,
+                                       (i8 imm:$src3))))], d, IIC_SSE_SHUFP>,
+                   EVEX_4V, TB, Sched<[WriteShuffle]>;
+}
+
+defm VSHUFPSZ  : avx512_shufp<VR512, f512mem, v16f32, "vshufps", memopv16f32,
+                  SSEPackedSingle>, EVEX_V512, EVEX_CD8<32, CD8VF>;
+defm VSHUFPDZ  : avx512_shufp<VR512, f512mem, v8f64, "vshufpd", memopv8f64,
+                  SSEPackedDouble>, OpSize, VEX_W, EVEX_V512, EVEX_CD8<64, CD8VF>;
+
+
+multiclass avx512_alignr<string OpcodeStr, RegisterClass RC,
+                         X86MemOperand x86memop> {
+  def rri : AVX512AIi8<0x03, MRMSrcReg, (outs RC:$dst),
+                      (ins RC:$src1, RC:$src2, i8imm:$src3),
+                      !strconcat(OpcodeStr,
+                      "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+                      []>, EVEX_4V;
+  def rmi : AVX512AIi8<0x03, MRMSrcMem, (outs RC:$dst),
+                      (ins RC:$src1, x86memop:$src2, i8imm:$src3),
+                      !strconcat(OpcodeStr,
+                      "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+                      []>, EVEX_4V;
+}
+defm VALIGND : avx512_alignr<"valignd", VR512, i512mem>,
+               EVEX_V512, EVEX_CD8<32, CD8VF>;
+defm VALIGNQ : avx512_alignr<"valignq", VR512, i512mem>,
+               VEX_W, EVEX_V512, EVEX_CD8<64, CD8VF>;
+
+def : Pat<(v16f32 (X86PAlignr VR512:$src1, VR512:$src2, (i8 imm:$imm))),
+          (VALIGNDrri VR512:$src2, VR512:$src1, imm:$imm)>;
+def : Pat<(v8f64 (X86PAlignr VR512:$src1, VR512:$src2, (i8 imm:$imm))),
+          (VALIGNQrri VR512:$src2, VR512:$src1, imm:$imm)>;
+def : Pat<(v16i32 (X86PAlignr VR512:$src1, VR512:$src2, (i8 imm:$imm))),
+          (VALIGNDrri VR512:$src2, VR512:$src1, imm:$imm)>;
+def : Pat<(v8i64 (X86PAlignr VR512:$src1, VR512:$src2, (i8 imm:$imm))),
+          (VALIGNQrri VR512:$src2, VR512:$src1, imm:$imm)>;
+
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td
index 4eaba38e520..9b27e27e8de 100644
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -1327,7 +1327,7 @@ let Predicates = [UseSSE2] in {
 // SSE 1 & 2 - Move Low to High and High to Low packed FP Instructions
 //===----------------------------------------------------------------------===//
 
-let AddedComplexity = 20 in {
+let AddedComplexity = 20, Predicates = [UseAVX] in {
   def VMOVLHPSrr : VPSI<0x16, MRMSrcReg, (outs VR128:$dst),
                                        (ins VR128:$src1, VR128:$src2),
                      "movlhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
@@ -1358,7 +1358,7 @@ let Constraints = "$src1 = $dst", AddedComplexity = 20 in {
                                      IIC_SSE_MOV_LH>, Sched<[WriteShuffle]>;
 }
 
-let Predicates = [HasAVX] in {
+let Predicates = [UseAVX] in {
   // MOVLHPS patterns
   def : Pat<(v4i32 (X86Movlhps VR128:$src1, VR128:$src2)),
             (VMOVLHPSrr VR128:$src1, VR128:$src2)>;
diff --git a/test/CodeGen/X86/avx512-shuffle.ll b/test/CodeGen/X86/avx512-shuffle.ll
index df9106eef3f..9495c65e9fd 100644
--- a/test/CodeGen/X86/avx512-shuffle.ll
+++ b/test/CodeGen/X86/avx512-shuffle.ll
@@ -106,6 +106,53 @@ define <16 x i32> @test11(<16 x i32> %a, <16 x i32>* %b) nounwind {
   ret <16 x i32> %d
 }
 
+; CHECK-LABEL: test12
+; CHECK: vmovlhpsz %xmm
+; CHECK: ret
+define <4 x i32> @test12(<4 x i32> %a, <4 x i32> %b) nounwind {
+  %c = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+  ret <4 x i32> %c
+}
+
+; CHECK-LABEL: test13
+; CHECK: vpermilps $-79, %zmm
+; CHECK: ret
+define <16 x float> @test13(<16 x float> %a) {
+  %b = shufflevector <16 x float> %a, <16 x float> undef, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
+  ret <16 x float> %b
+}
+
+; CHECK-LABEL: test14
+; CHECK: vpermilpd $-53, %zmm
+; CHECK: ret
+define <8 x double> @test14(<8 x double> %a) {
+  %b = shufflevector <8 x double> %a, <8 x double> undef, <8 x i32> <i32 1, i32 1, i32 2, i32 3, i32 4, i32 4, i32 7, i32 7>
+  ret <8 x double> %b
+}
+
+; CHECK-LABEL: test15
+; CHECK: vpshufd $-79, %zmm
+; CHECK: ret
+define <16 x i32> @test15(<16 x i32> %a) {
+  %b = shufflevector <16 x i32> %a, <16 x i32> undef, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
+  ret <16 x i32> %b
+}
+; CHECK-LABEL: test16
+; CHECK: valignq $2, %zmm0, %zmm1
+; CHECK: ret
+define <8 x double> @test16(<8 x double> %a, <8 x double> %b) nounwind {
+  %c = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9>
+  ret <8 x double> %c
+}
+
+; CHECK-LABEL: test17
+; CHECK: vshufpd $19, %zmm1, %zmm0
+; CHECK: ret
+define <8 x double> @test17(<8 x double> %a, <8 x double> %b) nounwind {
+  %c = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 1, i32 9, i32 2, i32 10, i32 5, i32 12, i32 6, i32 14>
+  ret <8 x double> %c
+}
+
 ; CHECK-LABEL: test18
 ; CHECK: vpunpckhdq %zmm
 ; CHECK: ret
@@ -138,3 +185,18 @@ define <16 x float> @test21(<16 x float> %a, <16 x float> %c) {
   ret <16 x float> %b
 }
 
+; CHECK-LABEL: test22
+; CHECK: vmovhlpsz %xmm
+; CHECK: ret
+define <4 x i32> @test22(<4 x i32> %a, <4 x i32> %b) nounwind {
+  %c = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 6, i32 7, i32 2, i32 3>
+  ret <4 x i32> %c
+}
+
+; CHECK-LABEL: @test23
+; CHECK: vshufps $-112, %zmm
+; CHECK: ret
+define <16 x float> @test23(<16 x float> %a, <16 x float> %c) {
+  %b = shufflevector <16 x float> %a, <16 x float> %c, <16 x i32> <i32 0, i32 0, i32 17, i32 18, i32 4, i32 4, i32 21, i32 22, i32 8, i32 8, i32 25, i32 26, i32 12, i32 12, i32 29, i32 30>
+  ret <16 x float> %b
+}
\ No newline at end of file
-- 
2.34.1