From: Andrea Di Biagio Date: Thu, 10 Jul 2014 18:04:55 +0000 (+0000) Subject: [DAG] Further improve the logic in DAGCombiner that folds a pair of shuffles into... X-Git-Url: http://demsky.eecs.uci.edu/git/?a=commitdiff_plain;h=3996e5290a60e24e02381830b02f1105980428db;p=oota-llvm.git [DAG] Further improve the logic in DAGCombiner that folds a pair of shuffles into a single shuffle if the resulting mask is legal. This patch teaches the DAGCombiner how to fold shuffles according to the following new rules: 1. shuffle(shuffle(x, y), undef) -> x 2. shuffle(shuffle(x, y), undef) -> y 3. shuffle(shuffle(x, y), undef) -> shuffle(x, undef) 4. shuffle(shuffle(x, y), undef) -> shuffle(y, undef) The backend avoids to combine shuffles according to rules 3. and 4. if the resulting shuffle does not have a legal mask. This is to avoid introducing illegal shuffles that are potentially expanded into a sub-optimal sequence of target specific dag nodes during vector legalization. Added test case combine-vec-shuffle-2.ll to verify that we correctly triggers the new rules when combining shuffles. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@212748 91177308-0d34-0410-b5e6-96231b3b80d8 --- diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index f6077b07d8d..7c42e4d3d21 100644 --- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -10677,17 +10677,12 @@ SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) { } // If this shuffle node is simply a swizzle of another shuffle node, - // and it reverses the swizzle of the previous shuffle then we can - // optimize shuffle(shuffle(x, undef), undef) -> x. + // then try to simplify it. if (N0.getOpcode() == ISD::VECTOR_SHUFFLE && Level < AfterLegalizeDAG && N1.getOpcode() == ISD::UNDEF) { ShuffleVectorSDNode *OtherSV = cast(N0); - // Shuffle nodes can only reverse shuffles with a single non-undef value. - if (N0.getOperand(1).getOpcode() != ISD::UNDEF) - return SDValue(); - // The incoming shuffle must be of the same type as the result of the // current shuffle. assert(OtherSV->getOperand(0).getValueType() == VT && @@ -10704,27 +10699,69 @@ SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) { Idx = OtherSV->getMaskElt(Idx); Mask.push_back(Idx); } + + bool CommuteOperands = false; + if (N0.getOperand(1).getOpcode() != ISD::UNDEF) { + // To be valid, the combine shuffle mask should only reference elements + // from one of the two vectors in input to the inner shufflevector. + bool IsValidMask = true; + for (unsigned i = 0; i != NumElts && IsValidMask; ++i) + // See if the combined mask only reference undefs or elements coming + // from the first shufflevector operand. + IsValidMask = Mask[i] < 0 || (unsigned)Mask[i] < NumElts; + + if (!IsValidMask) { + IsValidMask = true; + for (unsigned i = 0; i != NumElts && IsValidMask; ++i) + // Check that all the elements come from the second shuffle operand. + IsValidMask = Mask[i] < 0 || (unsigned)Mask[i] >= NumElts; + CommuteOperands = IsValidMask; + } + + // Early exit if the combined shuffle mask is not valid. + if (!IsValidMask) + return SDValue(); + } + // See if this pair of shuffles can be safely folded according to either + // of the following rules: + // shuffle(shuffle(x, y), undef) -> x + // shuffle(shuffle(x, undef), undef) -> x + // shuffle(shuffle(x, y), undef) -> y bool IsIdentityMask = true; + unsigned BaseMaskIndex = CommuteOperands ? NumElts : 0; for (unsigned i = 0; i != NumElts && IsIdentityMask; ++i) { // Skip Undefs. if (Mask[i] < 0) continue; // The combined shuffle must map each index to itself. - IsIdentityMask = (unsigned)Mask[i] == i; - } - - if (IsIdentityMask) - // optimize shuffle(shuffle(x, undef), undef) -> x. + IsIdentityMask = (unsigned)Mask[i] == i + BaseMaskIndex; + } + + if (IsIdentityMask) { + if (CommuteOperands) + // optimize shuffle(shuffle(x, y), undef) -> y. + return OtherSV->getOperand(1); + + // optimize shuffle(shuffle(x, undef), undef) -> x + // optimize shuffle(shuffle(x, y), undef) -> x return OtherSV->getOperand(0); + } // It may still be beneficial to combine the two shuffles if the // resulting shuffle is legal. - // shuffle(shuffle(x, undef, M1), undef, M2) -> shuffle(x, undef, M3). - if (TLI.isShuffleMaskLegal(Mask, VT)) - return DAG.getVectorShuffle(VT, SDLoc(N), N0->getOperand(0), N1, + if (TLI.isShuffleMaskLegal(Mask, VT)) { + if (!CommuteOperands) + // shuffle(shuffle(x, undef, M1), undef, M2) -> shuffle(x, undef, M3). + // shuffle(shuffle(x, y, M1), undef, M2) -> shuffle(x, undef, M3) + return DAG.getVectorShuffle(VT, SDLoc(N), N0->getOperand(0), N1, + &Mask[0]); + + // shuffle(shuffle(x, y, M1), undef, M2) -> shuffle(undef, y, M3) + return DAG.getVectorShuffle(VT, SDLoc(N), N1, N0->getOperand(1), &Mask[0]); + } } return SDValue(); diff --git a/test/CodeGen/X86/combine-vec-shuffle-2.ll b/test/CodeGen/X86/combine-vec-shuffle-2.ll new file mode 100644 index 00000000000..7ab7f809125 --- /dev/null +++ b/test/CodeGen/X86/combine-vec-shuffle-2.ll @@ -0,0 +1,164 @@ +; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 | FileCheck %s + +; Check that DAGCombiner correctly folds the following pairs of shuffles +; using the following rules: +; 1. shuffle(shuffle(x, y), undef) -> x +; 2. shuffle(shuffle(x, y), undef) -> y +; 3. shuffle(shuffle(x, y), undef) -> shuffle(x, undef) +; 4. shuffle(shuffle(x, y), undef) -> shuffle(undef, y) +; +; Rules 3. and 4. are used only if the resulting shuffle mask is legal. + +define <4 x i32> @test1(<4 x i32> %A, <4 x i32> %B) { + %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> + %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> + ret <4 x i32> %2 +} +; CHECK-LABEL: test1 +; Mask: [3,0,0,1] +; CHECK: pshufd $67 +; CHECK-NEXT: ret + + +define <4 x i32> @test2(<4 x i32> %A, <4 x i32> %B) { + %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> + %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> + ret <4 x i32> %2 +} +; CHECK-LABEL: test2 +; Mask: [2,0,0,3] +; CHECK: pshufd $-62 +; CHECK-NEXT: ret + + +define <4 x i32> @test3(<4 x i32> %A, <4 x i32> %B) { + %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> + %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> + ret <4 x i32> %2 +} +; CHECK-LABEL: test3 +; Mask: [2,0,0,3] +; CHECK: pshufd $-62 +; CHECK-NEXT: ret + + +define <4 x i32> @test4(<4 x i32> %A, <4 x i32> %B) { + %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> + %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> + ret <4 x i32> %2 +} +; CHECK-LABEL: test4 +; Mask: [0,0,0,1] +; CHECK: pshufd $64 +; CHECK-NEXT: ret + + +define <4 x i32> @test5(<4 x i32> %A, <4 x i32> %B) { + %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> + %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> + ret <4 x i32> %2 +} +; CHECK-LABEL: test5 +; Mask: [1,1] +; CHECK: movhlps +; CHECK-NEXT: ret + + +define <4 x i32> @test6(<4 x i32> %A, <4 x i32> %B) { + %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> + %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> + ret <4 x i32> %2 +} +; CHECK-LABEL: test6 +; Mask: [2,0,0,0] +; CHECK: pshufd $2 +; CHECK-NEXT: ret + + +define <4 x i32> @test7(<4 x i32> %A, <4 x i32> %B) { + %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> + %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> + ret <4 x i32> %2 +} +; CHECK-LABEL: test7 +; Mask: [0,2,0,2] +; CHECK: pshufd $-120 +; CHECK-NEXT: ret + + +define <4 x i32> @test8(<4 x i32> %A, <4 x i32> %B) { + %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> + %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> + ret <4 x i32> %2 +} +; CHECK-LABEL: test8 +; Mask: [1,0,3,0] +; CHECK: pshufd $49 +; CHECK-NEXT: ret + + +define <4 x i32> @test9(<4 x i32> %A, <4 x i32> %B) { + %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> + %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> + ret <4 x i32> %2 +} +; CHECK-LABEL: test9 +; Mask: [1,3,0,2] +; CHECK: pshufd $-115 +; CHECK-NEXT: ret + + +define <4 x i32> @test10(<4 x i32> %A, <4 x i32> %B) { + %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> + %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> + ret <4 x i32> %2 +} +; CHECK-LABEL: test10 +; Mask: [1,0,1,0] +; CHECK: pshufd $17 +; CHECK-NEXT: ret + + +define <4 x i32> @test11(<4 x i32> %A, <4 x i32> %B) { + %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> + %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> + ret <4 x i32> %2 +} +; CHECK-LABEL: test11 +; Mask: [1,0,2,1] +; CHECK: pshufd $97 +; CHECK-NEXT: ret + + +define <4 x i32> @test12(<4 x i32> %A, <4 x i32> %B) { + %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> + %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> + ret <4 x i32> %2 +} +; CHECK-LABEL: test12 +; Mask: [0,0,0,0] +; CHECK: pshufd $0 +; CHECK-NEXT: ret + + +; The following pair of shuffles is folded into vector %A. +define <4 x i32> @test13(<4 x i32> %A, <4 x i32> %B) { + %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> + %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> + ret <4 x i32> %2 +} +; CHECK-LABEL: test13 +; CHECK-NOT: pshufd +; CHECK: ret + + +; The following pair of shuffles is folded into vector %B. +define <4 x i32> @test14(<4 x i32> %A, <4 x i32> %B) { + %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> + %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> + ret <4 x i32> %2 +} +; CHECK-LABEL: test14 +; CHECK-NOT: pshufd +; CHECK: ret + diff --git a/test/CodeGen/X86/widen_shuffle-1.ll b/test/CodeGen/X86/widen_shuffle-1.ll index 803402b1f1f..a355b75fafc 100644 --- a/test/CodeGen/X86/widen_shuffle-1.ll +++ b/test/CodeGen/X86/widen_shuffle-1.ll @@ -33,7 +33,9 @@ entry: define void @shuf3(<4 x float> %tmp10, <4 x float> %vecinit15, <4 x float>* %dst) nounwind { entry: ; CHECK-LABEL: shuf3: -; CHECK: shufps +; CHECK-NOT: movlhps +; CHECK-NOT: shufps +; CHECK: pshufd %shuffle.i.i.i12 = shufflevector <4 x float> %tmp10, <4 x float> %vecinit15, <4 x i32> %tmp25.i.i = shufflevector <4 x float> %shuffle.i.i.i12, <4 x float> undef, <3 x i32> %tmp1.i.i = shufflevector <3 x float> %tmp25.i.i, <3 x float> zeroinitializer, <4 x i32>