From: Nadav Rotem Date: Sun, 8 Apr 2012 12:54:54 +0000 (+0000) Subject: AVX2: Build splat vectors by broadcasting a scalar from the constant pool. X-Git-Url: http://demsky.eecs.uci.edu/git/?a=commitdiff_plain;h=9d68b06bc5ef1157d198d52e3f6829c721d72552;p=oota-llvm.git AVX2: Build splat vectors by broadcasting a scalar from the constant pool. Previously we used three instructions to broadcast an immediate value into a vector register. On Sandybridge we continue to load the broadcasted value from the constant pool. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@154284 91177308-0d34-0410-b5e6-96231b3b80d8 --- diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index e8c0d4fcfcf..e3d98a2aa54 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -4852,41 +4852,41 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl &Elts, return SDValue(); } -/// isVectorBroadcast - Check if the node chain is suitable to be xformed to -/// a vbroadcast node. We support two patterns: -/// 1. A splat BUILD_VECTOR which uses a single scalar load. +/// LowerVectorBroadcast - Attempt to use the vbroadcast instruction +/// to generate a splat value for the following cases: +/// 1. A splat BUILD_VECTOR which uses a single scalar load, or a constant. /// 2. A splat shuffle which uses a scalar_to_vector node which comes from -/// a scalar load. -/// The scalar load node is returned when a pattern is found, +/// a scalar load, or a constant. +/// The VBROADCAST node is returned when a pattern is found, /// or SDValue() otherwise. -static SDValue isVectorBroadcast(SDValue &Op, const X86Subtarget *Subtarget) { +static SDValue LowerVectorBroadcast(SDValue &Op, const X86Subtarget *Subtarget, + DebugLoc &dl, SelectionDAG &DAG) { if (!Subtarget->hasAVX()) return SDValue(); EVT VT = Op.getValueType(); - SDValue V = Op; - if (V.hasOneUse() && V.getOpcode() == ISD::BITCAST) - V = V.getOperand(0); - - //A suspected load to be broadcasted. SDValue Ld; + bool ConstSplatVal; - switch (V.getOpcode()) { + switch (Op.getOpcode()) { default: // Unknown pattern found. return SDValue(); case ISD::BUILD_VECTOR: { // The BUILD_VECTOR node must be a splat. - if (!isSplatVector(V.getNode())) + if (!isSplatVector(Op.getNode())) return SDValue(); - Ld = V.getOperand(0); + Ld = Op.getOperand(0); + ConstSplatVal = (Ld.getOpcode() == ISD::Constant || + Ld.getOpcode() == ISD::ConstantFP); // The suspected load node has several users. Make sure that all // of its users are from the BUILD_VECTOR node. - if (!Ld->hasNUsesOfValue(VT.getVectorNumElements(), 0)) + // Constants may have multiple users. + if (!ConstSplatVal && !Ld->hasNUsesOfValue(VT.getVectorNumElements(), 0)) return SDValue(); break; } @@ -4904,15 +4904,57 @@ static SDValue isVectorBroadcast(SDValue &Op, const X86Subtarget *Subtarget) { return SDValue(); Ld = Sc.getOperand(0); + ConstSplatVal = (Ld.getOpcode() == ISD::Constant || + Ld.getOpcode() == ISD::ConstantFP); // The scalar_to_vector node and the suspected // load node must have exactly one user. - if (!Sc.hasOneUse() || !Ld.hasOneUse()) + // Constants may have multiple users. + if (!ConstSplatVal && (!Sc.hasOneUse() || !Ld.hasOneUse())) return SDValue(); break; } } + bool Is256 = VT.getSizeInBits() == 256; + bool Is128 = VT.getSizeInBits() == 128; + + // Handle the broadcasting a single constant scalar from the constant pool + // into a vector. On Sandybridge it is still better to load a constant vector + // from the constant pool and not to broadcast it from a scalar. + if (ConstSplatVal && Subtarget->hasAVX2()) { + EVT CVT = Ld.getValueType(); + assert(!CVT.isVector() && "Must not broadcast a vector type"); + unsigned ScalarSize = CVT.getSizeInBits(); + + if ((Is256 && (ScalarSize == 32 || ScalarSize == 64)) || + (Is128 && (ScalarSize == 32))) { + + // This is the type of the load operation for the constant that we save + // in the constant pool. We can't load float values from the constant pool + // because the DAG has to be legal at this stage. + MVT LdTy = (ScalarSize == 32 ? MVT::i32 : MVT::i64); + + const Constant *C = 0; + if (ConstantSDNode *CI = dyn_cast(Ld)) + C = CI->getConstantIntValue(); + else if (ConstantFPSDNode *CF = dyn_cast(Ld)) + C = CF->getConstantFPValue(); + + assert(C && "Invalid constant type"); + + SDValue CP = DAG.getConstantPool(C, LdTy); + unsigned Alignment = cast(CP)->getAlignment(); + Ld = DAG.getLoad(LdTy, dl, DAG.getEntryNode(), CP, + MachinePointerInfo::getConstantPool(), + false, false, false, Alignment); + + // Bitcast the loaded constant back to the requested type. + Ld = DAG.getNode(ISD::BITCAST, dl, CVT, Ld); + return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld); + } + } + // The scalar source must be a normal load. if (!ISD::isNormalLoad(Ld.getNode())) return SDValue(); @@ -4921,28 +4963,26 @@ static SDValue isVectorBroadcast(SDValue &Op, const X86Subtarget *Subtarget) { if (Ld->hasAnyUseOfValue(1)) return SDValue(); - bool Is256 = VT.getSizeInBits() == 256; - bool Is128 = VT.getSizeInBits() == 128; unsigned ScalarSize = Ld.getValueType().getSizeInBits(); // VBroadcast to YMM if (Is256 && (ScalarSize == 32 || ScalarSize == 64)) - return Ld; + return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld); // VBroadcast to XMM if (Is128 && (ScalarSize == 32)) - return Ld; + return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld); // The integer check is needed for the 64-bit into 128-bit so it doesn't match // double since there is vbroadcastsd xmm if (Subtarget->hasAVX2() && Ld.getValueType().isInteger()) { // VBroadcast to YMM if (Is256 && (ScalarSize == 8 || ScalarSize == 16)) - return Ld; + return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld); // VBroadcast to XMM if (Is128 && (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)) - return Ld; + return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld); } // Unsupported broadcast. @@ -4977,9 +5017,9 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { return getOnesVector(VT, Subtarget->hasAVX2(), DAG, dl); } - SDValue LD = isVectorBroadcast(Op, Subtarget); - if (LD.getNode()) - return DAG.getNode(X86ISD::VBROADCAST, dl, VT, LD); + SDValue Broadcast = LowerVectorBroadcast(Op, Subtarget, dl, DAG); + if (Broadcast.getNode()) + return Broadcast; unsigned EVTBits = ExtVT.getSizeInBits(); @@ -6205,9 +6245,9 @@ SDValue NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG, int Size = VT.getSizeInBits(); // Use vbroadcast whenever the splat comes from a foldable load - SDValue LD = isVectorBroadcast(Op, Subtarget); - if (LD.getNode()) - return DAG.getNode(X86ISD::VBROADCAST, dl, VT, LD); + SDValue Broadcast = LowerVectorBroadcast(Op, Subtarget, dl, DAG); + if (Broadcast.getNode()) + return Broadcast; // Handle splats by matching through known shuffle masks if ((Size == 128 && NumElem <= 4) || diff --git a/test/CodeGen/X86/avx-vbroadcast.ll b/test/CodeGen/X86/avx-vbroadcast.ll index 5bf9f4f2e2f..148ae7329f4 100644 --- a/test/CodeGen/X86/avx-vbroadcast.ll +++ b/test/CodeGen/X86/avx-vbroadcast.ll @@ -47,7 +47,7 @@ entry: ;;;; 128-bit versions ; CHECK: vbroadcastss (% -define <4 x float> @E(float* %ptr) nounwind uwtable readnone ssp { +define <4 x float> @e(float* %ptr) nounwind uwtable readnone ssp { entry: %q = load float* %ptr, align 4 %vecinit.i = insertelement <4 x float> undef, float %q, i32 0 @@ -57,6 +57,19 @@ entry: ret <4 x float> %vecinit6.i } + +; CHECK: _e2 +; CHECK-NOT: vbroadcastss +; CHECK: ret +define <4 x float> @_e2(float* %ptr) nounwind uwtable readnone ssp { + %vecinit.i = insertelement <4 x float> undef, float 0xbf80000000000000, i32 0 + %vecinit2.i = insertelement <4 x float> %vecinit.i, float 0xbf80000000000000, i32 1 + %vecinit4.i = insertelement <4 x float> %vecinit2.i, float 0xbf80000000000000, i32 2 + %vecinit6.i = insertelement <4 x float> %vecinit4.i, float 0xbf80000000000000, i32 3 + ret <4 x float> %vecinit6.i +} + + ; CHECK: vbroadcastss (% define <4 x i32> @F(i32* %ptr) nounwind uwtable readnone ssp { entry: @@ -71,7 +84,7 @@ entry: ; Unsupported vbroadcasts ; CHECK: _G -; CHECK-NOT: vbroadcastsd (% +; CHECK-NOT: broadcast (% ; CHECK: ret define <2 x i64> @G(i64* %ptr) nounwind uwtable readnone ssp { entry: @@ -82,7 +95,7 @@ entry: } ; CHECK: _H -; CHECK-NOT: vbroadcastss +; CHECK-NOT: broadcast ; CHECK: ret define <4 x i32> @H(<4 x i32> %a) { %x = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> @@ -90,7 +103,7 @@ define <4 x i32> @H(<4 x i32> %a) { } ; CHECK: _I -; CHECK-NOT: vbroadcastsd (% +; CHECK-NOT: broadcast (% ; CHECK: ret define <2 x double> @I(double* %ptr) nounwind uwtable readnone ssp { entry: diff --git a/test/CodeGen/X86/avx2-vbroadcast.ll b/test/CodeGen/X86/avx2-vbroadcast.ll index fbabb151588..314466175e1 100644 --- a/test/CodeGen/X86/avx2-vbroadcast.ll +++ b/test/CodeGen/X86/avx2-vbroadcast.ll @@ -150,3 +150,24 @@ entry: %vecinit2.i = insertelement <2 x double> %vecinit.i, double %q, i32 1 ret <2 x double> %vecinit2.i } + +; CHECK: V111 +; CHECK: vpbroadcastd +; CHECK: ret +define <8 x i32> @V111(<8 x i32> %in) nounwind uwtable readnone ssp { +entry: + %g = add <8 x i32> %in, + ret <8 x i32> %g +} + +; CHECK: _e2 +; CHECK: vbroadcastss +; CHECK: ret +define <4 x float> @_e2(float* %ptr) nounwind uwtable readnone ssp { + %vecinit.i = insertelement <4 x float> undef, float 0xbf80000000000000, i32 0 + %vecinit2.i = insertelement <4 x float> %vecinit.i, float 0xbf80000000000000, i32 1 + %vecinit4.i = insertelement <4 x float> %vecinit2.i, float 0xbf80000000000000, i32 2 + %vecinit6.i = insertelement <4 x float> %vecinit4.i, float 0xbf80000000000000, i32 3 + ret <4 x float> %vecinit6.i +} +