From ab18d0e7cbfb2639a68b7282ba1ac7e0a51dbb24 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim
Date: Wed, 18 Mar 2015 22:18:51 +0000
Subject: [PATCH] [X86][SSE] Avoid scalarization of v2i64 vector shifts
 (REAPPLIED)

Fixed broken tests.

Differential Revision: http://reviews.llvm.org/D8416

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@232682 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86ISelLowering.cpp           | 37 +++++++++++++-------
 test/Analysis/CostModel/X86/testshiftlshr.ll | 16 ++++-----
 test/Analysis/CostModel/X86/testshiftshl.ll  | 16 ++++-----
 test/CodeGen/X86/vshift-4.ll                 |  9 +++--
 test/CodeGen/X86/x86-shifts.ll               | 13 ++++---
 5 files changed, 56 insertions(+), 35 deletions(-)

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index f4017da11ab..50c50259334 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -5906,7 +5906,7 @@ static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op,
   return DAG.getNode(ISD::OR, dl, ResVT, V1, V2);
 }
 
-static SDValue LowerCONCAT_VECTORS(SDValue Op, 
+static SDValue LowerCONCAT_VECTORS(SDValue Op,
                                    const X86Subtarget *Subtarget,
                                    SelectionDAG &DAG) {
   MVT VT = Op.getSimpleValueType();
@@ -13255,11 +13255,11 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
       // If we have AVX, we can use a variable vector select (VBLENDV) instead
       // of 3 logic instructions for size savings and potentially speed.
       // Unfortunately, there is no scalar form of VBLENDV.
-      
+
       // If either operand is a constant, don't try this. We can expect to
       // optimize away at least one of the logic instructions later in that
      // case, so that sequence would be faster than a variable blend.
-      
+
       // BLENDV was introduced with SSE 4.1, but the 2 register form implicitly
       // uses XMM0 as the selection register. That may need just as many
       // instructions as the AND/ANDN/OR sequence due to register moves, so
@@ -13267,10 +13267,10 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
 
       if (Subtarget->hasAVX() &&
          !isa<ConstantFPSDNode>(Op1) && !isa<ConstantFPSDNode>(Op2)) {
-        
+
         // Convert to vectors, do a VSELECT, and convert back to scalar.
         // All of the conversions should be optimized away.
-        
+
         EVT VecVT = VT == MVT::f32 ? MVT::v4f32 : MVT::v2f64;
         SDValue VOp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op1);
         SDValue VOp2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op2);
@@ -13278,9 +13278,9 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
 
         EVT VCmpVT = VT == MVT::f32 ? MVT::v4i32 : MVT::v2i64;
         VCmp = DAG.getNode(ISD::BITCAST, DL, VCmpVT, VCmp);
-        
+
         SDValue VSel = DAG.getNode(ISD::VSELECT, DL, VecVT, VCmp, VOp1, VOp2);
-        
+
         return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, VSel,
                            DAG.getIntPtrConstant(0));
       }
@@ -16189,6 +16189,17 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget,
     return Op;
   }
 
+  // 2i64 vector logical shifts can efficiently avoid scalarization - do the
+  // shifts per-lane and then shuffle the partial results back together.
+  if (VT == MVT::v2i64 && Op.getOpcode() != ISD::SRA) {
+    // Splat the shift amounts so the scalar shifts above will catch it.
+    SDValue Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {0, 0});
+    SDValue Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {1, 1});
+    SDValue R0 = DAG.getNode(Op->getOpcode(), dl, VT, R, Amt0);
+    SDValue R1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Amt1);
+    return DAG.getVectorShuffle(VT, dl, R0, R1, {0, 3});
+  }
+
   // If possible, lower this packed shift into a vector multiply instead of
   // expanding it into a sequence of scalar shifts.
   // Do this only if the vector shift count is a constant build_vector.
@@ -21960,7 +21971,7 @@ static SDValue VectorZextCombine(SDNode *N, SelectionDAG &DAG,
   // an and with a mask.
   // We'd like to try to combine that into a shuffle with zero
   // plus a bitcast, removing the and.
-  if (N0.getOpcode() != ISD::BITCAST || 
+  if (N0.getOpcode() != ISD::BITCAST ||
       N0.getOperand(0).getOpcode() != ISD::VECTOR_SHUFFLE)
     return SDValue();
 
@@ -21990,7 +22001,7 @@ static SDValue VectorZextCombine(SDNode *N, SelectionDAG &DAG,
   unsigned ResSize = N1.getValueType().getScalarSizeInBits();
 
   // Make sure the splat matches the mask we expect
-  if (SplatBitSize > ResSize || 
+  if (SplatBitSize > ResSize ||
       (SplatValue + 1).exactLogBase2() != (int)SrcSize)
     return SDValue();
 
@@ -22948,7 +22959,7 @@ static SDValue PerformFANDCombine(SDNode *N, SelectionDAG &DAG) {
   if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
     if (C->getValueAPF().isPosZero())
       return N->getOperand(1);
-  
+
   return SDValue();
 }
 
@@ -23222,7 +23233,7 @@ static SDValue PerformISDSETCCCombine(SDNode *N, SelectionDAG &DAG,
       return DAG.getConstant(1, VT);
     if (CC == ISD::SETEQ || CC == ISD::SETGE)
       return DAG.getNOT(DL, LHS.getOperand(0), VT);
-    
+
     assert((CC == ISD::SETNE || CC == ISD::SETLT) &&
            "Unexpected condition code!");
     return LHS.getOperand(0);
@@ -23264,7 +23275,7 @@ static SDValue PerformINSERTPSCombine(SDNode *N, SelectionDAG &DAG,
   // countS and just gets an f32 from that address.
   unsigned DestIndex =
       cast<ConstantSDNode>(N->getOperand(2))->getZExtValue() >> 6;
-  
+
   Ld = NarrowVectorLoadToElement(cast<LoadSDNode>(Ld), DestIndex, DAG);
 
   // Create this as a scalar to vector to match the instruction pattern.
@@ -23288,7 +23299,7 @@ static SDValue PerformBLENDICombine(SDNode *N, SelectionDAG &DAG) {
   // pattern-matching possibilities related to scalar math ops in SSE/AVX.
   // x86InstrInfo knows how to commute this back after instruction selection
   // if it would help register allocation.
-  
+
   // TODO: If optimizing for size or a processor that doesn't suffer from
   // partial register update stalls, this should be transformed into a MOVSD
   // instruction because a MOVSD is 1-2 bytes smaller than a BLENDPD.
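
[Editorial sketch, not part of the patch] To make the new v2i64 path concrete, here is a minimal standalone C++ sketch of the instruction pattern it produces, written with SSE2 intrinsics rather than the SelectionDAG API; the function name shl_v2i64_nonuniform is hypothetical. PSLLQ shifts both lanes by the count in the low 64 bits of its second operand, so shifting once per lane amount and then keeping lane 0 of one result and lane 1 of the other matches the {0, 3} shuffle in the hunk above.

  #include <emmintrin.h>

  // Shift each 64-bit lane of Val left by the corresponding lane of Amt
  // (assumes both counts are < 64, as the IR shift semantics require).
  static inline __m128i shl_v2i64_nonuniform(__m128i Val, __m128i Amt) {
    // PSLLQ reads its count from the low 64 bits, so lane 0's count can be
    // used as-is; move lane 1's count down for the second shift.
    __m128i Amt1 = _mm_shuffle_epi32(Amt, _MM_SHUFFLE(1, 0, 3, 2));
    __m128i R0 = _mm_sll_epi64(Val, Amt);   // both lanes shifted by Amt[0]
    __m128i R1 = _mm_sll_epi64(Val, Amt1);  // both lanes shifted by Amt[1]
    // Keep lane 0 of R0 and lane 1 of R1 - the {0, 3} shuffle - via MOVSD
    // on the double-precision view of the registers.
    return _mm_castpd_si128(
        _mm_move_sd(_mm_castsi128_pd(R1), _mm_castsi128_pd(R0)));
  }

The DAG code expresses the same idea at the node level: splat each shift amount so the existing splat-shift lowering selects a single PSLLQ/PSRLQ per lane, then recombine the two partial results with one shuffle instead of extracting and shifting scalars.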
diff --git a/test/Analysis/CostModel/X86/testshiftlshr.ll b/test/Analysis/CostModel/X86/testshiftlshr.ll
index 7bc8d89e4ad..78bf0a60830 100644
--- a/test/Analysis/CostModel/X86/testshiftlshr.ll
+++ b/test/Analysis/CostModel/X86/testshiftlshr.ll
@@ -7,7 +7,7 @@ entry:
   ; SSE2: shift2i16
   ; SSE2: cost of 20 {{.*}} lshr
   ; SSE2-CODEGEN: shift2i16
-  ; SSE2-CODEGEN: shrq %cl
+  ; SSE2-CODEGEN: psrlq
 
   %0 = lshr %shifttype %a , %b
   ret %shifttype %0
@@ -67,7 +67,7 @@ entry:
   ; SSE2: shift2i32
   ; SSE2: cost of 20 {{.*}} lshr
   ; SSE2-CODEGEN: shift2i32
-  ; SSE2-CODEGEN: shrq %cl
+  ; SSE2-CODEGEN: psrlq
 
   %0 = lshr %shifttype2i32 %a , %b
   ret %shifttype2i32 %0
@@ -127,7 +127,7 @@ entry:
   ; SSE2: shift2i64
   ; SSE2: cost of 20 {{.*}} lshr
   ; SSE2-CODEGEN: shift2i64
-  ; SSE2-CODEGEN: shrq %cl
+  ; SSE2-CODEGEN: psrlq
 
   %0 = lshr %shifttype2i64 %a , %b
   ret %shifttype2i64 %0
@@ -139,7 +139,7 @@ entry:
   ; SSE2: shift4i64
   ; SSE2: cost of 40 {{.*}} lshr
   ; SSE2-CODEGEN: shift4i64
-  ; SSE2-CODEGEN: shrq %cl
+  ; SSE2-CODEGEN: psrlq
 
   %0 = lshr %shifttype4i64 %a , %b
   ret %shifttype4i64 %0
@@ -151,7 +151,7 @@ entry:
   ; SSE2: shift8i64
   ; SSE2: cost of 80 {{.*}} lshr
   ; SSE2-CODEGEN: shift8i64
-  ; SSE2-CODEGEN: shrq %cl
+  ; SSE2-CODEGEN: psrlq
 
   %0 = lshr %shifttype8i64 %a , %b
   ret %shifttype8i64 %0
@@ -163,7 +163,7 @@ entry:
   ; SSE2: shift16i64
   ; SSE2: cost of 160 {{.*}} lshr
   ; SSE2-CODEGEN: shift16i64
-  ; SSE2-CODEGEN: shrq %cl
+  ; SSE2-CODEGEN: psrlq
 
   %0 = lshr %shifttype16i64 %a , %b
   ret %shifttype16i64 %0
@@ -175,7 +175,7 @@ entry:
   ; SSE2: shift32i64
   ; SSE2: cost of 320 {{.*}} lshr
   ; SSE2-CODEGEN: shift32i64
-  ; SSE2-CODEGEN: shrq %cl
+  ; SSE2-CODEGEN: psrlq
 
   %0 = lshr %shifttype32i64 %a , %b
   ret %shifttype32i64 %0
@@ -187,7 +187,7 @@ entry:
   ; SSE2: shift2i8
   ; SSE2: cost of 20 {{.*}} lshr
   ; SSE2-CODEGEN: shift2i8
-  ; SSE2-CODEGEN: shrq %cl
+  ; SSE2-CODEGEN: psrlq
 
   %0 = lshr %shifttype2i8 %a , %b
   ret %shifttype2i8 %0
diff --git a/test/Analysis/CostModel/X86/testshiftshl.ll b/test/Analysis/CostModel/X86/testshiftshl.ll
index 40effd02922..c36e0f5dfdf 100644
--- a/test/Analysis/CostModel/X86/testshiftshl.ll
+++ b/test/Analysis/CostModel/X86/testshiftshl.ll
@@ -7,7 +7,7 @@ entry:
   ; SSE2: shift2i16
   ; SSE2: cost of 20 {{.*}} shl
   ; SSE2-CODEGEN: shift2i16
-  ; SSE2-CODEGEN: shlq %cl
+  ; SSE2-CODEGEN: psllq
 
   %0 = shl %shifttype %a , %b
   ret %shifttype %0
@@ -67,7 +67,7 @@ entry:
   ; SSE2: shift2i32
   ; SSE2: cost of 20 {{.*}} shl
   ; SSE2-CODEGEN: shift2i32
-  ; SSE2-CODEGEN: shlq %cl
+  ; SSE2-CODEGEN: psllq
 
   %0 = shl %shifttype2i32 %a , %b
   ret %shifttype2i32 %0
@@ -127,7 +127,7 @@ entry:
   ; SSE2: shift2i64
   ; SSE2: cost of 20 {{.*}} shl
   ; SSE2-CODEGEN: shift2i64
-  ; SSE2-CODEGEN: shlq %cl
+  ; SSE2-CODEGEN: psllq
 
   %0 = shl %shifttype2i64 %a , %b
   ret %shifttype2i64 %0
@@ -139,7 +139,7 @@ entry:
   ; SSE2: shift4i64
   ; SSE2: cost of 40 {{.*}} shl
   ; SSE2-CODEGEN: shift4i64
-  ; SSE2-CODEGEN: shlq %cl
+  ; SSE2-CODEGEN: psllq
 
   %0 = shl %shifttype4i64 %a , %b
   ret %shifttype4i64 %0
@@ -151,7 +151,7 @@ entry:
   ; SSE2: shift8i64
   ; SSE2: cost of 80 {{.*}} shl
   ; SSE2-CODEGEN: shift8i64
-  ; SSE2-CODEGEN: shlq %cl
+  ; SSE2-CODEGEN: psllq
 
   %0 = shl %shifttype8i64 %a , %b
   ret %shifttype8i64 %0
@@ -163,7 +163,7 @@ entry:
   ; SSE2: shift16i64
   ; SSE2: cost of 160 {{.*}} shl
   ; SSE2-CODEGEN: shift16i64
-  ; SSE2-CODEGEN: shlq %cl
+  ; SSE2-CODEGEN: psllq
 
   %0 = shl %shifttype16i64 %a , %b
   ret %shifttype16i64 %0
@@ -175,7 +175,7 @@ entry:
   ; SSE2: shift32i64
   ; SSE2: cost of 320 {{.*}} shl
   ; SSE2-CODEGEN: shift32i64
-  ; SSE2-CODEGEN: shlq %cl
+  ; SSE2-CODEGEN: psllq
 
   %0 = shl %shifttype32i64 %a , %b
   ret %shifttype32i64 %0
@@ -187,7 +187,7 @@ entry:
   ; SSE2: shift2i8
   ; SSE2: cost of 20 {{.*}} shl
   ; SSE2-CODEGEN: shift2i8
-  ; SSE2-CODEGEN: shlq %cl
+  ; SSE2-CODEGEN: psllq
 
   %0 = shl %shifttype2i8 %a , %b
   ret %shifttype2i8 %0
diff --git a/test/CodeGen/X86/vshift-4.ll b/test/CodeGen/X86/vshift-4.ll
index cda9bc893a2..4ab5db88982 100644
--- a/test/CodeGen/X86/vshift-4.ll
+++ b/test/CodeGen/X86/vshift-4.ll
@@ -13,11 +13,16 @@ entry:
   ret void
 }
 
-; shift1b can't use a packed shift
+; shift1b can't use a packed shift but can shift lanes separately and shuffle back together
 define void @shift1b(<2 x i64> %val, <2 x i64>* %dst, <2 x i64> %sh) nounwind {
 entry:
 ; CHECK-LABEL: shift1b:
-; CHECK: shll
+; CHECK: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; CHECK-NEXT: movdqa %xmm0, %xmm3
+; CHECK-NEXT: psllq %xmm2, %xmm3
+; CHECK-NEXT: movq {{.*#+}} xmm1 = xmm1[0],zero
+; CHECK-NEXT: psllq %xmm1, %xmm0
+; CHECK-NEXT: movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1]
   %shamt = shufflevector <2 x i64> %sh, <2 x i64> undef, <2 x i32> <i32 0, i32 1>
   %shl = shl <2 x i64> %val, %shamt
   store <2 x i64> %shl, <2 x i64>* %dst
diff --git a/test/CodeGen/X86/x86-shifts.ll b/test/CodeGen/X86/x86-shifts.ll
index a10134e4bb6..20505cb2af8 100644
--- a/test/CodeGen/X86/x86-shifts.ll
+++ b/test/CodeGen/X86/x86-shifts.ll
@@ -118,10 +118,15 @@ entry:
 
 define <2 x i64> @shr2_nosplat(<2 x i64> %A) nounwind {
 entry:
-; CHECK: shr2_nosplat
-; CHECK-NOT: psrlq
-; CHECK-NOT: psrlq
-; CHECK: ret
+; CHECK-LABEL: shr2_nosplat
+; CHECK: movdqa %xmm1, %xmm2
+; CHECK-NEXT: psrlq $8, %xmm2
+; CHECK-NEXT: movdqa %xmm1, %xmm0
+; CHECK-NEXT: psrlq $1, %xmm0
+; CHECK-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
+; CHECK-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
+; CHECK-NEXT: xorpd %xmm1, %xmm0
+; CHECK-NEXT: ret
   %B = lshr <2 x i64> %A, < i64 8, i64 1>
   %C = lshr <2 x i64> %A, < i64 1, i64 0>
   %K = xor <2 x i64> %B, %C
-- 
2.34.1
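
[Editorial sketch, not part of the patch] The shr2_nosplat expectations above follow the same per-lane recipe, but with constant shift amounts, so the lowering can use PSRLQ with immediate counts before blending the lanes. As a rough standalone companion to the earlier sketch (again SSE2 intrinsics; the function name is made up), the test's first lshr by <8, 1> comes down to:

  #include <emmintrin.h>

  // Logical right shift of a <2 x i64> by the constant per-lane amounts <8, 1>.
  static inline __m128i lshr_v2i64_by_8_1(__m128i A) {
    __m128i B = _mm_srli_epi64(A, 8);  // both lanes >> 8
    __m128i C = _mm_srli_epi64(A, 1);  // both lanes >> 1
    // Lane 0 from B, lane 1 from C, blended with MOVSD as in the CHECK lines.
    return _mm_castpd_si128(
        _mm_move_sd(_mm_castsi128_pd(C), _mm_castsi128_pd(B)));
  }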