From 6292eceea07d142c10aff86362fdaa12ef7878ea Mon Sep 17 00:00:00 2001
From: Bruno Cardoso Lopes
Date: Thu, 25 Aug 2011 21:40:37 +0000
Subject: [PATCH] Add support for AVX 256-bit version of MOVDDUP!

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@138588 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86ISelLowering.cpp | 33 ++++++++++++++++++++++++++++++
 lib/Target/X86/X86InstrSSE.td      | 14 +++++++++++++
 test/CodeGen/X86/avx-vmovddup.ll   | 14 +++++++++++++
 3 files changed, 61 insertions(+)
 create mode 100644 test/CodeGen/X86/avx-vmovddup.ll

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 80f3e943b54..89aaff53cfa 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -3560,6 +3560,13 @@ static bool isUNPCKL_v_undef_Mask(const SmallVectorImpl<int> &Mask, EVT VT) {
   if (NumElems != 2 && NumElems != 4 && NumElems != 8 && NumElems != 16)
     return false;
 
+  // For 256-bit i64/f64, use MOVDDUPY instead, so reject the matching pattern
+  // FIXME: Need a better way to get rid of this, there's no latency difference
+  // between UNPCKLPD and MOVDDUP, the latter should always be checked first
+  // and the former later. We should also remove the "_undef" special mask.
+  if (NumElems == 4 && VT.getSizeInBits() == 256)
+    return false;
+
   // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
   // independently on 128-bit lanes.
   unsigned NumLanes = VT.getSizeInBits() / 128;
@@ -3913,6 +3920,28 @@ bool X86::isMOVSLDUPMask(ShuffleVectorSDNode *N,
   return true;
 }
 
+/// isMOVDDUPYMask - Return true if the specified VECTOR_SHUFFLE operand
+/// specifies a shuffle of elements that is suitable for input to 256-bit
+/// version of MOVDDUP.
+static bool isMOVDDUPYMask(ShuffleVectorSDNode *N,
+                           const X86Subtarget *Subtarget) {
+  EVT VT = N->getValueType(0);
+  int NumElts = VT.getVectorNumElements();
+  bool V2IsUndef = N->getOperand(1).getOpcode() == ISD::UNDEF;
+
+  if (!Subtarget->hasAVX() || VT.getSizeInBits() != 256 ||
+      !V2IsUndef || NumElts != 4)
+    return false;
+
+  for (int i = 0; i != NumElts/2; ++i)
+    if (!isUndefOrEqual(N->getMaskElt(i), 0))
+      return false;
+  for (int i = NumElts/2; i != NumElts; ++i)
+    if (!isUndefOrEqual(N->getMaskElt(i), NumElts/2))
+      return false;
+
+  return true;
+}
+
 /// isMOVDDUPMask - Return true if the specified VECTOR_SHUFFLE operand
 /// specifies a shuffle of elements that is suitable for input to 128-bit
 /// version of MOVDDUP.
@@ -6691,6 +6720,10 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
   // supported in the AVX instruction set.
   //
 
+  // Handle VMOVDDUPY permutations
+  if (isMOVDDUPYMask(SVOp, Subtarget))
+    return getTargetShuffleNode(X86ISD::MOVDDUP, dl, VT, V1, DAG);
+
   // Handle VPERMILPS* permutations
   if (isVPERMILPSMask(M, VT, Subtarget))
     return getTargetShuffleNode(getVPERMILOpcode(VT), dl, VT, V1,
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td
index 5d647b06d89..6bafc2d6c7a 100644
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -4010,6 +4010,20 @@ let Predicates = [HasAVX] in {
   def : Pat<(X86Movddup (bc_v2f64
                          (v2i64 (scalar_to_vector (loadi64 addr:$src))))),
             (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>;
+
+  // 256-bit version
+  def : Pat<(X86Movddup (memopv4f64 addr:$src)),
+            (VMOVDDUPYrm addr:$src)>;
+  def : Pat<(X86Movddup (memopv4i64 addr:$src)),
+            (VMOVDDUPYrm addr:$src)>;
+  def : Pat<(X86Movddup (v4f64 (scalar_to_vector (loadf64 addr:$src)))),
+            (VMOVDDUPYrm addr:$src)>;
+  def : Pat<(X86Movddup (v4i64 (scalar_to_vector (loadi64 addr:$src)))),
+            (VMOVDDUPYrm addr:$src)>;
+  def : Pat<(X86Movddup (v4f64 VR256:$src)),
+            (VMOVDDUPYrr VR256:$src)>;
+  def : Pat<(X86Movddup (v4i64 VR256:$src)),
+            (VMOVDDUPYrr VR256:$src)>;
 }
 
 //===---------------------------------------------------------------------===//
diff --git a/test/CodeGen/X86/avx-vmovddup.ll b/test/CodeGen/X86/avx-vmovddup.ll
new file mode 100644
index 00000000000..1c56fe2b1a0
--- /dev/null
+++ b/test/CodeGen/X86/avx-vmovddup.ll
@@ -0,0 +1,14 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck %s
+
+; CHECK: vmovddup %ymm
+define <4 x i64> @A(<4 x i64> %a) {
+  %c = shufflevector <4 x i64> %a, <4 x i64> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+  ret <4 x i64> %c
+}
+
+; CHECK: vmovddup (%
+define <4 x i64> @B(<4 x i64>* %ptr) {
+  %a = load <4 x i64>* %ptr
+  %c = shufflevector <4 x i64> %a, <4 x i64> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+  ret <4 x i64> %c
+}
-- 
2.34.1
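
Note on the new matcher (illustrative, not part of the commit): for a 256-bit
vector, isMOVDDUPYMask only accepts the four-element mask <0, 0, 2, 2> (undef
positions allowed) with an undef second operand, i.e. a duplicate of the
even-indexed element within each 128-bit lane, which is exactly what VMOVDDUPY
computes. A minimal IR sketch of what does and does not match (the function
names are hypothetical):

define <4 x i64> @even_dup(<4 x i64> %a) {
  ; First half of the mask is 0 and second half is NumElts/2 == 2, so
  ; isMOVDDUPYMask fires and this lowers to a single vmovddup.
  %c = shufflevector <4 x i64> %a, <4 x i64> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
  ret <4 x i64> %c
}

define <4 x i64> @odd_dup(<4 x i64> %a) {
  ; Odd elements are requested; isMOVDDUPYMask rejects this mask and the
  ; shuffle falls through to the other AVX shuffle lowering paths instead.
  %c = shufflevector <4 x i64> %a, <4 x i64> undef, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
  ret <4 x i64> %c
}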
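
The new TableGen patterns also cover the v4f64 forms, which avx-vmovddup.ll
above does not exercise. A sketch of an analogous floating-point test in the
same style (this extra file and the @C/@D names are hypothetical, not part of
the commit):

; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck %s

; CHECK: vmovddup %ymm
define <4 x double> @C(<4 x double> %a) {
  %c = shufflevector <4 x double> %a, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
  ret <4 x double> %c
}

; CHECK: vmovddup (%
define <4 x double> @D(<4 x double>* %ptr) {
  %a = load <4 x double>* %ptr
  %c = shufflevector <4 x double> %a, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
  ret <4 x double> %c
}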