From 4e60da755a221f080a25079abfcc021f7bf14f1c Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Fri, 3 Apr 2015 10:02:21 +0000 Subject: [PATCH] [DAGCombiner] Combine shuffles of BUILD_VECTOR and SCALAR_TO_VECTOR This patch attempts to fold the shuffling of 'scalar source' inputs - BUILD_VECTOR and SCALAR_TO_VECTOR nodes - if the shuffle node is the only user. This folds away a lot of unnecessary shuffle nodes, and allows quite a bit of constant folding that was being missed. Differential Revision: http://reviews.llvm.org/D8516 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@234004 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 37 ++++++++ test/CodeGen/AArch64/arm64-neon-copy.ll | 2 +- test/CodeGen/AArch64/arm64-vshuffle.ll | 95 ++++++--------------- test/CodeGen/PowerPC/vperm-lowering.ll | 57 +++---------- test/CodeGen/X86/mmx-bitcast.ll | 3 +- test/CodeGen/X86/sse41.ll | 27 +++--- test/CodeGen/X86/vec_insert-5.ll | 2 +- test/CodeGen/X86/vec_insert-mmx.ll | 2 +- test/CodeGen/X86/vec_zero_cse.ll | 2 +- test/CodeGen/X86/vector-shuffle-128-v16.ll | 99 +++++++--------------- test/CodeGen/X86/vector-shuffle-128-v8.ll | 38 ++++----- 11 files changed, 140 insertions(+), 224 deletions(-) diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 19950f5dad1..70f6185964c 100644 --- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -11980,6 +11980,43 @@ SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) { return V; } + // Attempt to combine a shuffle of 2 inputs of 'scalar sources' - + // BUILD_VECTOR or SCALAR_TO_VECTOR into a single BUILD_VECTOR. + if (Level < AfterLegalizeVectorOps && TLI.isTypeLegal(VT)) { + SmallVector Ops; + for (int M : SVN->getMask()) { + SDValue Op = DAG.getUNDEF(VT.getScalarType()); + if (M >= 0) { + int Idx = M % NumElts; + SDValue &S = (M < (int)NumElts ? N0 : N1); + if (S.getOpcode() == ISD::BUILD_VECTOR && S.hasOneUse()) { + Op = S.getOperand(Idx); + } else if (S.getOpcode() == ISD::SCALAR_TO_VECTOR && S.hasOneUse()) { + if (Idx == 0) + Op = S.getOperand(0); + } else { + // Operand can't be combined - bail out. + break; + } + } + Ops.push_back(Op); + } + if (Ops.size() == VT.getVectorNumElements()) { + // BUILD_VECTOR requires all inputs to be of the same type, find the + // maximum type and extend them all. + EVT SVT = VT.getScalarType(); + if (SVT.isInteger()) + for (SDValue &Op : Ops) + SVT = (SVT.bitsLT(Op.getValueType()) ? Op.getValueType() : SVT); + if (SVT != VT.getScalarType()) + for (SDValue &Op : Ops) + Op = TLI.isZExtFree(Op.getValueType(), SVT) + ? DAG.getZExtOrTrunc(Op, SDLoc(N), SVT) + : DAG.getSExtOrTrunc(Op, SDLoc(N), SVT); + return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(N), VT, Ops); + } + } + // If this shuffle only has a single input that is a bitcasted shuffle, // attempt to merge the 2 shuffles and suitably bitcast the inputs/output // back to their original types. diff --git a/test/CodeGen/AArch64/arm64-neon-copy.ll b/test/CodeGen/AArch64/arm64-neon-copy.ll index 4a92c3d49c1..b74a40626ce 100644 --- a/test/CodeGen/AArch64/arm64-neon-copy.ll +++ b/test/CodeGen/AArch64/arm64-neon-copy.ll @@ -1086,7 +1086,7 @@ define <2 x i32> @test_concat_diff_v1i32_v1i32(i32 %a, i32 %b) { ; CHECK-LABEL: test_concat_diff_v1i32_v1i32: ; CHECK: sqabs s{{[0-9]+}}, s{{[0-9]+}} ; CHECK: sqabs s{{[0-9]+}}, s{{[0-9]+}} -; CHECK-NEXT: zip1 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s +; CHECK: ins {{v[0-9]+}}.s[1], w{{[0-9]+}} entry: %c = tail call i32 @llvm.aarch64.neon.sqabs.i32(i32 %a) %d = insertelement <2 x i32> undef, i32 %c, i32 0 diff --git a/test/CodeGen/AArch64/arm64-vshuffle.ll b/test/CodeGen/AArch64/arm64-vshuffle.ll index 75e0d80e7f4..15ea21b7638 100644 --- a/test/CodeGen/AArch64/arm64-vshuffle.ll +++ b/test/CodeGen/AArch64/arm64-vshuffle.ll @@ -1,22 +1,8 @@ ; RUN: llc < %s -mtriple=arm64-apple-ios7.0 -mcpu=cyclone | FileCheck %s -; The mask: -; CHECK: lCPI0_0: -; CHECK: .byte 2 ; 0x2 -; CHECK: .byte 255 ; 0xff -; CHECK: .byte 6 ; 0x6 -; CHECK: .byte 255 ; 0xff -; The second vector is legalized to undef and the elements of the first vector -; are used instead. -; CHECK: .byte 2 ; 0x2 -; CHECK: .byte 4 ; 0x4 -; CHECK: .byte 6 ; 0x6 -; CHECK: .byte 0 ; 0x0 ; CHECK: test1 -; CHECK: ldr d[[REG0:[0-9]+]], [{{.*}}, lCPI0_0 -; CHECK: movi.8h v[[REG1:[0-9]+]], #0x1, lsl #8 -; CHECK: tbl.8b v{{[0-9]+}}, { v[[REG1]] }, v[[REG0]] +; CHECK: movi d[[REG0:[0-9]+]], #0000000000000000 define <8 x i1> @test1() { entry: %Shuff = shufflevector <8 x i1> @test2() { bb: %Shuff = shufflevector <8 x i1> zeroinitializer, @@ -51,28 +35,8 @@ bb: ret <8 x i1> %Shuff } -; CHECK: lCPI2_0: -; CHECK: .byte 2 ; 0x2 -; CHECK: .byte 255 ; 0xff -; CHECK: .byte 6 ; 0x6 -; CHECK: .byte 255 ; 0xff -; CHECK: .byte 10 ; 0xa -; CHECK: .byte 12 ; 0xc -; CHECK: .byte 14 ; 0xe -; CHECK: .byte 0 ; 0x0 -; CHECK: .byte 2 ; 0x2 -; CHECK: .byte 255 ; 0xff -; CHECK: .byte 6 ; 0x6 -; CHECK: .byte 255 ; 0xff -; CHECK: .byte 10 ; 0xa -; CHECK: .byte 12 ; 0xc -; CHECK: .byte 14 ; 0xe -; CHECK: .byte 0 ; 0x0 ; CHECK: test3 -; CHECK: adrp x[[REG3:[0-9]+]], lCPI2_0@PAGE -; CHECK: ldr q[[REG0:[0-9]+]], [x[[REG3]], lCPI2_0@PAGEOFF] -; CHECK: ldr q[[REG1:[0-9]+]], [x[[REG3]], lCPI2_1@PAGEOFF] -; CHECK: tbl.16b v{{[0-9]+}}, { v[[REG1]] }, v[[REG0]] +; CHECK: movi.4s v{{[0-9]+}}, #0x1 define <16 x i1> @test3(i1* %ptr, i32 %v) { bb: %Shuff = shufflevector <16 x i1> , <16 x i1> undef, @@ -81,29 +45,26 @@ bb: i32 14, i32 0> ret <16 x i1> %Shuff } -; CHECK: lCPI3_1: +; CHECK: lCPI3_0: +; CHECK: .byte 0 ; 0x0 +; CHECK: .byte 0 ; 0x0 ; CHECK: .byte 0 ; 0x0 ; CHECK: .byte 1 ; 0x1 -; CHECK: .byte 2 ; 0x2 -; CHECK: .byte 18 ; 0x12 -; CHECK: .byte 4 ; 0x4 -; CHECK: .byte 5 ; 0x5 -; CHECK: .byte 6 ; 0x6 -; CHECK: .byte 7 ; 0x7 -; CHECK: .byte 8 ; 0x8 -; CHECK: .byte 31 ; 0x1f -; CHECK: .byte 10 ; 0xa -; CHECK: .byte 30 ; 0x1e -; CHECK: .byte 12 ; 0xc -; CHECK: .byte 13 ; 0xd -; CHECK: .byte 14 ; 0xe -; CHECK: .byte 15 ; 0xf +; CHECK: .byte 0 ; 0x0 +; CHECK: .byte 0 ; 0x0 +; CHECK: .byte 0 ; 0x0 +; CHECK: .byte 0 ; 0x0 +; CHECK: .byte 0 ; 0x0 +; CHECK: .byte 0 ; 0x0 +; CHECK: .byte 0 ; 0x0 +; CHECK: .byte 0 ; 0x0 +; CHECK: .byte 0 ; 0x0 +; CHECK: .byte 0 ; 0x0 +; CHECK: .byte 0 ; 0x0 +; CHECK: .byte 0 ; 0x0 ; CHECK: _test4: -; CHECK: ldr q[[REG1:[0-9]+]] -; CHECK: movi.2d v[[REG0:[0-9]+]], #0000000000000000 -; CHECK: adrp x[[REG3:[0-9]+]], lCPI3_1@PAGE -; CHECK: ldr q[[REG2:[0-9]+]], [x[[REG3]], lCPI3_1@PAGEOFF] -; CHECK: tbl.16b v{{[0-9]+}}, { v[[REG0]], v[[REG1]] }, v[[REG2]] +; CHECK: adrp x[[REG3:[0-9]+]], lCPI3_0@PAGE +; CHECK: ldr q[[REG2:[0-9]+]], [x[[REG3]], lCPI3_0@PAGEOFF] define <16 x i1> @test4(i1* %ptr, i32 %v) { bb: %Shuff = shufflevector <16 x i1> zeroinitializer, diff --git a/test/CodeGen/PowerPC/vperm-lowering.ll b/test/CodeGen/PowerPC/vperm-lowering.ll index d55d26c959b..c78ffdd0c07 100644 --- a/test/CodeGen/PowerPC/vperm-lowering.ll +++ b/test/CodeGen/PowerPC/vperm-lowering.ll @@ -9,58 +9,23 @@ define <16 x i8> @foo() nounwind ssp { } ; CHECK: .LCPI0_0: -; CHECK: .byte 31 -; CHECK: .byte 26 -; CHECK: .byte 21 -; CHECK: .byte 16 -; CHECK: .byte 11 -; CHECK: .byte 6 -; CHECK: .byte 1 -; CHECK: .byte 28 -; CHECK: .byte 23 -; CHECK: .byte 18 -; CHECK: .byte 13 -; CHECK: .byte 8 -; CHECK: .byte 3 -; CHECK: .byte 30 -; CHECK: .byte 25 -; CHECK: .byte 20 -; CHECK: .LCPI0_1: ; CHECK: .byte 0 -; CHECK: .byte 1 -; CHECK: .byte 2 -; CHECK: .byte 3 -; CHECK: .byte 4 ; CHECK: .byte 5 -; CHECK: .byte 6 -; CHECK: .byte 7 -; CHECK: .byte 8 -; CHECK: .byte 9 ; CHECK: .byte 10 -; CHECK: .byte 11 -; CHECK: .byte 12 -; CHECK: .byte 13 -; CHECK: .byte 14 ; CHECK: .byte 15 -; CHECK: .LCPI0_2: -; CHECK: .byte 16 -; CHECK: .byte 17 -; CHECK: .byte 18 -; CHECK: .byte 19 ; CHECK: .byte 20 -; CHECK: .byte 21 -; CHECK: .byte 22 -; CHECK: .byte 23 -; CHECK: .byte 24 ; CHECK: .byte 25 -; CHECK: .byte 26 -; CHECK: .byte 27 -; CHECK: .byte 28 -; CHECK: .byte 29 ; CHECK: .byte 30 -; CHECK: .byte 31 +; CHECK: .byte 3 +; CHECK: .byte 8 +; CHECK: .byte 13 +; CHECK: .byte 18 +; CHECK: .byte 23 +; CHECK: .byte 28 +; CHECK: .byte 1 +; CHECK: .byte 6 +; CHECK: .byte 11 ; CHECK: foo: -; CHECK: addis [[REG1:[0-9]+]], 2, .LCPI0_2@toc@ha -; CHECK: addi [[REG2:[0-9]+]], [[REG1]], .LCPI0_2@toc@l +; CHECK: addis [[REG1:[0-9]+]], 2, .LCPI0_0@toc@ha +; CHECK: addi [[REG2:[0-9]+]], [[REG1]], .LCPI0_0@toc@l ; CHECK: lvx [[REG3:[0-9]+]], 0, [[REG2]] -; CHECK: vperm {{[0-9]+}}, [[REG3]], {{[0-9]+}}, {{[0-9]+}} diff --git a/test/CodeGen/X86/mmx-bitcast.ll b/test/CodeGen/X86/mmx-bitcast.ll index 4aa10a98d45..00c803917f7 100644 --- a/test/CodeGen/X86/mmx-bitcast.ll +++ b/test/CodeGen/X86/mmx-bitcast.ll @@ -75,8 +75,7 @@ define i64 @t5(i32 %a, i32 %b) nounwind readnone { ; CHECK-NEXT: movd ; CHECK-NEXT: movd ; CHECK-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,1,3] -; CHECK-NEXT: movd %xmm0, %rax +; CHECK-NEXT: movd %xmm1, %rax ; CHECK-NEXT: retq %v0 = insertelement <2 x i32> undef, i32 %a, i32 0 %v1 = insertelement <2 x i32> %v0, i32 %b, i32 1 diff --git a/test/CodeGen/X86/sse41.ll b/test/CodeGen/X86/sse41.ll index ca13392ffe8..3bde991c337 100644 --- a/test/CodeGen/X86/sse41.ll +++ b/test/CodeGen/X86/sse41.ll @@ -1026,29 +1026,24 @@ define <4 x float> @pr20087(<4 x float> %a, <4 x float> *%ptr) { } ; Edge case for insertps where we end up with a shuffle with mask=<0, 7, -1, -1> -define void @insertps_pr20411(i32* noalias nocapture %RET) #1 { +define void @insertps_pr20411(<4 x i32> %shuffle109, <4 x i32> %shuffle116, i32* noalias nocapture %RET) #1 { ; X32-LABEL: insertps_pr20411: ; X32: ## BB#0: ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: pshufd {{.*#+}} xmm0 = mem[2,3,0,1] -; X32-NEXT: pshufd {{.*#+}} xmm1 = mem[3,1,2,3] -; X32-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7] +; X32-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; X32-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7] ; X32-NEXT: movdqu %xmm1, (%eax) ; X32-NEXT: retl ; -; X64-LABEL: insertps_pr20411: -; X64: ## BB#0: -; X64-NEXT: pshufd {{.*#+}} xmm0 = mem[2,3,0,1] -; X64-NEXT: pshufd {{.*#+}} xmm1 = mem[3,1,2,3] -; X64-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7] -; X64-NEXT: movdqu %xmm1, (%rdi) +; X64-LABEL: insertps_pr20411: +; X64: ## BB#0: +; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; X64-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7] +; X64-NEXT: movdqu %xmm1, (%rdi) ; X64-NEXT: retq - %gather_load = shufflevector <8 x i32> , <8 x i32> undef, <8 x i32> - %shuffle109 = shufflevector <4 x i32> , <4 x i32> undef, <4 x i32> ; 4 5 6 7 - %shuffle116 = shufflevector <8 x i32> %gather_load, <8 x i32> undef, <4 x i32> ; 3 x x x - %shuffle117 = shufflevector <4 x i32> %shuffle109, <4 x i32> %shuffle116, <4 x i32> ; 3 7 x x - %ptrcast = bitcast i32* %RET to <4 x i32>* - store <4 x i32> %shuffle117, <4 x i32>* %ptrcast, align 4 + %shuffle117 = shufflevector <4 x i32> %shuffle109, <4 x i32> %shuffle116, <4 x i32> + %ptrcast = bitcast i32* %RET to <4 x i32>* + store <4 x i32> %shuffle117, <4 x i32>* %ptrcast, align 4 ret void } diff --git a/test/CodeGen/X86/vec_insert-5.ll b/test/CodeGen/X86/vec_insert-5.ll index 0f8951529dd..26ac2a23e31 100644 --- a/test/CodeGen/X86/vec_insert-5.ll +++ b/test/CodeGen/X86/vec_insert-5.ll @@ -8,7 +8,7 @@ define void @t1(i32 %a, x86_mmx* %P) nounwind { ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx ; CHECK-NEXT: shll $12, %ecx ; CHECK-NEXT: movd %ecx, %xmm0 -; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,0,1] +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,1,1] ; CHECK-NEXT: movlpd %xmm0, (%eax) ; CHECK-NEXT: retl %tmp12 = shl i32 %a, 12 diff --git a/test/CodeGen/X86/vec_insert-mmx.ll b/test/CodeGen/X86/vec_insert-mmx.ll index 447f97ab8e2..c77bffcbd37 100644 --- a/test/CodeGen/X86/vec_insert-mmx.ll +++ b/test/CodeGen/X86/vec_insert-mmx.ll @@ -6,7 +6,7 @@ define x86_mmx @t0(i32 %A) nounwind { ; X86-32-LABEL: t0: ; X86-32: ## BB#0: ; X86-32: movd {{[0-9]+}}(%esp), %xmm0 -; X86-32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,0,1] +; X86-32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,1,1] ; X86-32-NEXT: movlpd %xmm0, (%esp) ; X86-32-NEXT: movq (%esp), %mm0 ; X86-32-NEXT: addl $12, %esp diff --git a/test/CodeGen/X86/vec_zero_cse.ll b/test/CodeGen/X86/vec_zero_cse.ll index afde0ed534b..66c64d32522 100644 --- a/test/CodeGen/X86/vec_zero_cse.ll +++ b/test/CodeGen/X86/vec_zero_cse.ll @@ -17,7 +17,7 @@ define void @test1() { define void @test2() { ;CHECK-LABEL: @test2 -;CHECK: pshufd +;CHECK: pcmpeqd store <1 x i64> < i64 -1 >, <1 x i64>* @M1 store <2 x i32> < i32 -1, i32 -1 >, <2 x i32>* @M2 ret void diff --git a/test/CodeGen/X86/vector-shuffle-128-v16.ll b/test/CodeGen/X86/vector-shuffle-128-v16.ll index 01b8972e13d..60cfec688fd 100644 --- a/test/CodeGen/X86/vector-shuffle-128-v16.ll +++ b/test/CodeGen/X86/vector-shuffle-128-v16.ll @@ -634,28 +634,16 @@ define <16 x i8> @PR20540(<8 x i8> %a) { } define <16 x i8> @shuffle_v16i8_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz(i8 %i) { -; SSE2-LABEL: shuffle_v16i8_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz: -; SSE2: # BB#0: -; SSE2-NEXT: movzbl %dil, %eax -; SSE2-NEXT: movd %eax, %xmm0 -; SSE2-NEXT: retq -; -; SSSE3-LABEL: shuffle_v16i8_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz: -; SSSE3: # BB#0: -; SSSE3-NEXT: movd %edi, %xmm0 -; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSSE3-NEXT: retq -; -; SSE41-LABEL: shuffle_v16i8_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz: -; SSE41: # BB#0: -; SSE41-NEXT: movd %edi, %xmm0 -; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE41-NEXT: retq +; SSE-LABEL: shuffle_v16i8_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz: +; SSE: # BB#0: +; SSE-NEXT: movzbl %dil, %eax +; SSE-NEXT: movd %eax, %xmm0 +; SSE-NEXT: retq ; ; AVX-LABEL: shuffle_v16i8_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz: ; AVX: # BB#0: -; AVX-NEXT: vmovd %edi, %xmm0 -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX-NEXT: movzbl %dil, %eax +; AVX-NEXT: vmovd %eax, %xmm0 ; AVX-NEXT: retq %a = insertelement <16 x i8> undef, i8 %i, i32 0 %shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32> @@ -663,29 +651,18 @@ define <16 x i8> @shuffle_v16i8_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz( } define <16 x i8> @shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz(i8 %i) { -; SSE2-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz: -; SSE2: # BB#0: -; SSE2-NEXT: movzbl %dil, %eax -; SSE2-NEXT: movd %eax, %xmm0 -; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10] -; SSE2-NEXT: retq -; -; SSSE3-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz: -; SSSE3: # BB#0: -; SSSE3-NEXT: movd %edi, %xmm0 -; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSSE3-NEXT: retq -; -; SSE41-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz: -; SSE41: # BB#0: -; SSE41-NEXT: movd %edi, %xmm0 -; SSE41-NEXT: pshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE41-NEXT: retq -; +; SSE-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz: +; SSE: # BB#0: +; SSE-NEXT: shll $8, %edi +; SSE-NEXT: pxor %xmm0, %xmm0 +; SSE-NEXT: pinsrw $2, %edi, %xmm0 +; SSE-NEXT: retq + ; AVX-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz: ; AVX: # BB#0: -; AVX-NEXT: vmovd %edi, %xmm0 -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX-NEXT: shll $8, %edi +; AVX-NEXT: vpxor %xmm0, %xmm0 +; AVX-NEXT: vpinsrw $2, %edi, %xmm0 ; AVX-NEXT: retq %a = insertelement <16 x i8> undef, i8 %i, i32 0 %shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32> @@ -695,14 +672,16 @@ define <16 x i8> @shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz( define <16 x i8> @shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16(i8 %i) { ; SSE-LABEL: shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16: ; SSE: # BB#0: -; SSE-NEXT: movd %edi, %xmm0 -; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0] +; SSE-NEXT: shll $8, %edi +; SSE-NEXT: pxor %xmm0, %xmm0 +; SSE-NEXT: pinsrw $7, %edi, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16: ; AVX: # BB#0: -; AVX-NEXT: vmovd %edi, %xmm0 -; AVX-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0] +; AVX-NEXT: shll $8, %edi +; AVX-NEXT: vpxor %xmm0, %xmm0 +; AVX-NEXT: vpinsrw $7, %edi, %xmm0 ; AVX-NEXT: retq %a = insertelement <16 x i8> undef, i8 %i, i32 0 %shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32> @@ -710,32 +689,18 @@ define <16 x i8> @shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16( } define <16 x i8> @shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz(i8 %i) { -; SSE2-LABEL: shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz: -; SSE2: # BB#0: -; SSE2-NEXT: movzbl %dil, %eax -; SSE2-NEXT: movd %eax, %xmm0 -; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; SSE2-NEXT: retq -; -; SSSE3-LABEL: shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz: -; SSSE3: # BB#0: -; SSSE3-NEXT: movd %edi, %xmm0 -; SSSE3-NEXT: pslld $24, %xmm0 -; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSSE3-NEXT: retq -; -; SSE41-LABEL: shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz: -; SSE41: # BB#0: -; SSE41-NEXT: movd %edi, %xmm0 -; SSE41-NEXT: pslld $24, %xmm0 -; SSE41-NEXT: pshufb {{.*#+}} xmm0 = zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE41-NEXT: retq +; SSE-LABEL: shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz: +; SSE: # BB#0: +; SSE-NEXT: movzbl %dil, %eax +; SSE-NEXT: pxor %xmm0, %xmm0 +; SSE-NEXT: pinsrw $1, %eax, %xmm0 +; SSE-NEXT: retq ; ; AVX-LABEL: shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz: ; AVX: # BB#0: -; AVX-NEXT: vmovd %edi, %xmm0 -; AVX-NEXT: vpslld $24, %xmm0, %xmm0 -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX-NEXT: movzbl %dil, %eax +; AVX-NEXT: vpxor %xmm0, %xmm0 +; AVX-NEXT: vpinsrw $1, %eax, %xmm0 ; AVX-NEXT: retq %a = insertelement <16 x i8> undef, i8 %i, i32 3 %shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32> diff --git a/test/CodeGen/X86/vector-shuffle-128-v8.ll b/test/CodeGen/X86/vector-shuffle-128-v8.ll index eb77c38e0b7..4007f0b2b13 100644 --- a/test/CodeGen/X86/vector-shuffle-128-v8.ll +++ b/test/CodeGen/X86/vector-shuffle-128-v8.ll @@ -1384,16 +1384,14 @@ define <8 x i16> @shuffle_v8i16_8zzzzzzz(i16 %i) { define <8 x i16> @shuffle_v8i16_z8zzzzzz(i16 %i) { ; SSE-LABEL: shuffle_v8i16_z8zzzzzz: ; SSE: # BB#0: -; SSE-NEXT: movzwl %di, %eax -; SSE-NEXT: movd %eax, %xmm0 -; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; SSE-NEXT: pxor %xmm0, %xmm0 +; SSE-NEXT: pinsrw $1, %edi, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: shuffle_v8i16_z8zzzzzz: ; AVX: # BB#0: -; AVX-NEXT: movzwl %di, %eax -; AVX-NEXT: vmovd %eax, %xmm0 -; AVX-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; AVX-NEXT: vpxor %xmm0, %xmm0 +; AVX-NEXT: vpinsrw $1, %edi, %xmm0 ; AVX-NEXT: retq %a = insertelement <8 x i16> undef, i16 %i, i32 0 %shuffle = shufflevector <8 x i16> zeroinitializer, <8 x i16> %a, <8 x i32> @@ -1403,16 +1401,14 @@ define <8 x i16> @shuffle_v8i16_z8zzzzzz(i16 %i) { define <8 x i16> @shuffle_v8i16_zzzzz8zz(i16 %i) { ; SSE-LABEL: shuffle_v8i16_zzzzz8zz: ; SSE: # BB#0: -; SSE-NEXT: movzwl %di, %eax -; SSE-NEXT: movd %eax, %xmm0 -; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] +; SSE-NEXT: pxor %xmm0, %xmm0 +; SSE-NEXT: pinsrw $5, %edi, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: shuffle_v8i16_zzzzz8zz: ; AVX: # BB#0: -; AVX-NEXT: movzwl %di, %eax -; AVX-NEXT: vmovd %eax, %xmm0 -; AVX-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] +; AVX-NEXT: vpxor %xmm0, %xmm0 +; AVX-NEXT: vpinsrw $5, %edi, %xmm0 ; AVX-NEXT: retq %a = insertelement <8 x i16> undef, i16 %i, i32 0 %shuffle = shufflevector <8 x i16> zeroinitializer, <8 x i16> %a, <8 x i32> @@ -1422,14 +1418,14 @@ define <8 x i16> @shuffle_v8i16_zzzzz8zz(i16 %i) { define <8 x i16> @shuffle_v8i16_zuuzuuz8(i16 %i) { ; SSE-LABEL: shuffle_v8i16_zuuzuuz8: ; SSE: # BB#0: -; SSE-NEXT: movd %edi, %xmm0 -; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1] +; SSE-NEXT: pxor %xmm0, %xmm0 +; SSE-NEXT: pinsrw $7, %edi, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: shuffle_v8i16_zuuzuuz8: ; AVX: # BB#0: -; AVX-NEXT: vmovd %edi, %xmm0 -; AVX-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1] +; AVX-NEXT: vpxor %xmm0, %xmm0 +; AVX-NEXT: vpinsrw $7, %edi, %xmm0 ; AVX-NEXT: retq %a = insertelement <8 x i16> undef, i16 %i, i32 0 %shuffle = shufflevector <8 x i16> zeroinitializer, <8 x i16> %a, <8 x i32> @@ -1439,16 +1435,14 @@ define <8 x i16> @shuffle_v8i16_zuuzuuz8(i16 %i) { define <8 x i16> @shuffle_v8i16_zzBzzzzz(i16 %i) { ; SSE-LABEL: shuffle_v8i16_zzBzzzzz: ; SSE: # BB#0: -; SSE-NEXT: movzwl %di, %eax -; SSE-NEXT: movd %eax, %xmm0 -; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11] +; SSE-NEXT: pxor %xmm0, %xmm0 +; SSE-NEXT: pinsrw $2, %edi, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: shuffle_v8i16_zzBzzzzz: ; AVX: # BB#0: -; AVX-NEXT: movzwl %di, %eax -; AVX-NEXT: vmovd %eax, %xmm0 -; AVX-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11] +; AVX-NEXT: vpxor %xmm0, %xmm0 +; AVX-NEXT: vpinsrw $2, %edi, %xmm0 ; AVX-NEXT: retq %a = insertelement <8 x i16> undef, i16 %i, i32 3 %shuffle = shufflevector <8 x i16> zeroinitializer, <8 x i16> %a, <8 x i32> -- 2.34.1