From 37f645cb34590ccb20717a914240a0b435b8103c Mon Sep 17 00:00:00 2001
From: Andrea Di Biagio
Date: Sat, 15 Nov 2014 22:56:25 +0000
Subject: [PATCH] [DAG] Improved target independent vector shuffle folding logic.

This patch teaches the DAGCombiner how to combine shuffles according to rules:
  shuffle(shuffle(A, Undef, M0), B, M1) -> shuffle(B, A, M2)
  shuffle(shuffle(A, B, M0), B, M1) -> shuffle(B, A, M2)
  shuffle(shuffle(A, B, M0), A, M1) -> shuffle(B, A, M2)

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@222090 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/CodeGen/SelectionDAG/DAGCombiner.cpp     | 20 ++++
 test/CodeGen/X86/vector-shuffle-combining.ll | 64 ++++++--------
 2 files changed, 40 insertions(+), 44 deletions(-)

diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 530ced6e532..342a4278adf 100644
--- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -11239,6 +11239,26 @@ SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) {
         return DAG.getVectorShuffle(VT, SDLoc(N), SV0, N1, &Mask[0]);
       return DAG.getVectorShuffle(VT, SDLoc(N), SV0, SV1, &Mask[0]);
     }
+
+    // Compute the commuted shuffle mask.
+    for (unsigned i = 0; i != NumElts; ++i) {
+      int idx = Mask[i];
+      if (idx < 0)
+        continue;
+      else if (idx < (int)NumElts)
+        Mask[i] = idx + NumElts;
+      else
+        Mask[i] = idx - NumElts;
+    }
+
+    if (TLI.isShuffleMaskLegal(Mask, VT)) {
+      if (IsSV1Undef)
+        // shuffle(shuffle(A, Undef, M0), B, M1) -> shuffle(B, A, M2)
+        return DAG.getVectorShuffle(VT, SDLoc(N), N1, SV0, &Mask[0]);
+      // shuffle(shuffle(A, B, M0), B, M1) -> shuffle(B, A, M2)
+      // shuffle(shuffle(A, B, M0), A, M1) -> shuffle(B, A, M2)
+      return DAG.getVectorShuffle(VT, SDLoc(N), SV1, SV0, &Mask[0]);
+    }
   }
 
   return SDValue();
diff --git a/test/CodeGen/X86/vector-shuffle-combining.ll b/test/CodeGen/X86/vector-shuffle-combining.ll
index 898e011ca8f..22a67492110 100644
--- a/test/CodeGen/X86/vector-shuffle-combining.ll
+++ b/test/CodeGen/X86/vector-shuffle-combining.ll
@@ -1146,18 +1146,14 @@ define <4 x float> @combine_test1(<4 x float> %a, <4 x float> %b) {
 define <4 x float> @combine_test2(<4 x float> %a, <4 x float> %b) {
 ; SSE2-LABEL: combine_test2:
 ; SSE2: # BB#0:
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,3]
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
-; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[3,0]
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
+; SSE2-NEXT: movss %xmm0, %xmm1
+; SSE2-NEXT: movaps %xmm1, %xmm0
 ; SSE2-NEXT: retq
 ;
 ; SSSE3-LABEL: combine_test2:
 ; SSSE3: # BB#0:
-; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,3]
-; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
-; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[3,0]
-; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
+; SSSE3-NEXT: movss %xmm0, %xmm1
+; SSSE3-NEXT: movaps %xmm1, %xmm0
 ; SSSE3-NEXT: retq
 ;
 ; SSE41-LABEL: combine_test2:
@@ -1268,18 +1264,14 @@ define <4 x i32> @combine_test6(<4 x i32> %a, <4 x i32> %b) {
 define <4 x i32> @combine_test7(<4 x i32> %a, <4 x i32> %b) {
 ; SSE2-LABEL: combine_test7:
 ; SSE2: # BB#0:
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,3]
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
-; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[3,0]
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
+; SSE2-NEXT: movss %xmm0, %xmm1
+; SSE2-NEXT: movaps %xmm1, %xmm0
 ; SSE2-NEXT: retq
 ;
 ; SSSE3-LABEL: combine_test7:
 ; SSSE3: # BB#0:
-; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,3]
-; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
-; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[3,0]
-; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
+; SSSE3-NEXT: movss %xmm0, %xmm1
+; SSSE3-NEXT: movaps %xmm1, %xmm0
 ; SSSE3-NEXT: retq
 ;
 ; SSE41-LABEL: combine_test7:
@@ -1385,14 +1377,12 @@ define <4 x float> @combine_test12(<4 x float> %a, <4 x float> %b) {
 ; SSE2-LABEL: combine_test12:
 ; SSE2: # BB#0:
 ; SSE2-NEXT: movss %xmm0, %xmm1
-; SSE2-NEXT: movss %xmm0, %xmm1
 ; SSE2-NEXT: movaps %xmm1, %xmm0
 ; SSE2-NEXT: retq
 ;
 ; SSSE3-LABEL: combine_test12:
 ; SSSE3: # BB#0:
 ; SSSE3-NEXT: movss %xmm0, %xmm1
-; SSSE3-NEXT: movss %xmm0, %xmm1
 ; SSSE3-NEXT: movaps %xmm1, %xmm0
 ; SSSE3-NEXT: retq
 ;
@@ -1486,14 +1476,12 @@ define <4 x i32> @combine_test17(<4 x i32> %a, <4 x i32> %b) {
 ; SSE2-LABEL: combine_test17:
 ; SSE2: # BB#0:
 ; SSE2-NEXT: movss %xmm0, %xmm1
-; SSE2-NEXT: movss %xmm0, %xmm1
 ; SSE2-NEXT: movaps %xmm1, %xmm0
 ; SSE2-NEXT: retq
 ;
 ; SSSE3-LABEL: combine_test17:
 ; SSSE3: # BB#0:
 ; SSSE3-NEXT: movss %xmm0, %xmm1
-; SSSE3-NEXT: movss %xmm0, %xmm1
 ; SSSE3-NEXT: movaps %xmm1, %xmm0
 ; SSSE3-NEXT: retq
 ;
@@ -1700,30 +1688,24 @@ define <4 x float> @combine_test4b(<4 x float> %a, <4 x float> %b) {
 define <4 x i8> @combine_test1c(<4 x i8>* %a, <4 x i8>* %b) {
 ; SSE2-LABEL: combine_test1c:
 ; SSE2: # BB#0:
-; SSE2-NEXT: movd (%rdi), %xmm0
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
-; SSE2-NEXT: movd (%rsi), %xmm1
+; SSE2-NEXT: movd (%rdi), %xmm1
 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,3]
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
-; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[3,0]
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
+; SSE2-NEXT: movd (%rsi), %xmm0
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSE2-NEXT: movss %xmm1, %xmm0
 ; SSE2-NEXT: retq
 ;
 ; SSSE3-LABEL: combine_test1c:
 ; SSSE3: # BB#0:
-; SSSE3-NEXT: movd (%rdi), %xmm0
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
-; SSSE3-NEXT: movd (%rsi), %xmm1
+; SSSE3-NEXT: movd (%rdi), %xmm1
 ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
 ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,3]
-; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
-; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[3,0]
-; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
+; SSSE3-NEXT: movd (%rsi), %xmm0
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSSE3-NEXT: movss %xmm1, %xmm0
 ; SSSE3-NEXT: retq
 ;
 ; SSE41-LABEL: combine_test1c:
@@ -1984,19 +1966,13 @@ define <4 x float> @combine_blend_02(<4 x float> %a, <4 x float> %b) {
 define <4 x float> @combine_blend_123(<4 x float> %a, <4 x float> %b) {
 ; SSE2-LABEL: combine_blend_123:
 ; SSE2: # BB#0:
-; SSE2-NEXT: movaps %xmm1, %xmm2
-; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm0[0,0]
-; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm0[2,3]
-; SSE2-NEXT: movsd %xmm2, %xmm1
+; SSE2-NEXT: movss %xmm0, %xmm1
 ; SSE2-NEXT: movaps %xmm1, %xmm0
 ; SSE2-NEXT: retq
 ;
 ; SSSE3-LABEL: combine_blend_123:
 ; SSSE3: # BB#0:
-; SSSE3-NEXT: movaps %xmm1, %xmm2
-; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm0[0,0]
-; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm0[2,3]
-; SSSE3-NEXT: movsd %xmm2, %xmm1
+; SSSE3-NEXT: movss %xmm0, %xmm1
 ; SSSE3-NEXT: movaps %xmm1, %xmm0
 ; SSSE3-NEXT: retq
 ;
-- 
2.34.1
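
Supplementary note (not part of the patch): the standalone C++ sketch below mirrors the "Compute the commuted shuffle mask" loop added to visitVECTOR_SHUFFLE. The helper names applyShuffle and commuteMask are invented for this illustration; only the index-rewriting rule comes from the patch (lanes below NumElts move up by NumElts, lanes at or above NumElts move down, undef lanes are left alone). The assertion checks that shuffle(A, B, Mask) equals shuffle(B, A, commuteMask(Mask)) on a 4-element example, which is the equivalence that lets the combiner emit the operands in swapped order whenever the commuted mask is legal for the target.

#include <cassert>
#include <vector>

// Toy model of a two-operand vector shuffle: lane i of the result comes from
// A if Mask[i] < NumElts, from B otherwise; a negative index means "undef"
// and is simply left as 0 here.
static std::vector<int> applyShuffle(const std::vector<int> &A,
                                     const std::vector<int> &B,
                                     const std::vector<int> &Mask) {
  const int NumElts = static_cast<int>(A.size());
  std::vector<int> Result(NumElts, 0);
  for (int i = 0; i != NumElts; ++i) {
    const int Idx = Mask[i];
    if (Idx < 0)
      continue; // undef lane
    Result[i] = Idx < NumElts ? A[Idx] : B[Idx - NumElts];
  }
  return Result;
}

// The same rewrite performed by the loop added in visitVECTOR_SHUFFLE:
// indices that pointed into the first operand now point into the second,
// and vice versa, while undef lanes are left untouched.
static std::vector<int> commuteMask(std::vector<int> Mask) {
  const int NumElts = static_cast<int>(Mask.size());
  for (int i = 0; i != NumElts; ++i) {
    const int Idx = Mask[i];
    if (Idx < 0)
      continue;
    Mask[i] = Idx < NumElts ? Idx + NumElts : Idx - NumElts;
  }
  return Mask;
}

int main() {
  const std::vector<int> A = {10, 11, 12, 13};
  const std::vector<int> B = {20, 21, 22, 23};
  const std::vector<int> Mask = {0, 5, 2, 7}; // selects <A0, B1, A2, B3>

  // shuffle(A, B, Mask) and shuffle(B, A, commuteMask(Mask)) are equivalent,
  // which is what allows the combiner to swap the operand order when only
  // the commuted mask is legal for the target.
  assert(applyShuffle(A, B, Mask) == applyShuffle(B, A, commuteMask(Mask)));
  return 0;
}

Any C++11 compiler can build this (for example, clang++ -std=c++11); the assertion passing demonstrates the operand-swap equivalence the new combine relies on.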