From 72f0d9515e304395c2874c8750c69272d424e4ae Mon Sep 17 00:00:00 2001
From: Chandler Carruth
Date: Thu, 18 Sep 2014 09:00:25 +0000
Subject: [PATCH] [x86] Use PALIGNR for v4i32 and v2i64 blends when
 appropriate.

There is no purpose in using it for single-input shuffles as pshufd is
just as fast and doesn't tie the two operands. This removes a
substantial amount of wrong-domain blend operations in SSSE3 mode. It
also completes the usage of PALIGNR for integer shuffles and addresses
one of the test cases Quentin hit with the new vector shuffle lowering.

There is still the question of whether and when to use this for
floating point shuffles. It is faster than shufps or shufpd but in the
integer domain. I don't yet really have a good heuristic here for when
to use this instruction for floating point vectors.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@218038 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86ISelLowering.cpp        |  12 ++
 test/CodeGen/X86/vector-shuffle-128-v2.ll |  83 +++++++--
 test/CodeGen/X86/vector-shuffle-128-v4.ll | 194 ++++++++++++++++++++++
 test/CodeGen/X86/vector-shuffle-128-v8.ll |  14 +-
 test/CodeGen/X86/vector-shuffle-256-v4.ll |   8 +-
 5 files changed, 288 insertions(+), 23 deletions(-)

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 8721d8ffe50..8343a856eda 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -7439,6 +7439,12 @@ static SDValue lowerV2I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
           lowerVectorShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask, DAG))
     return Blend;
 
+  // Try to use rotation instructions if available.
+  if (Subtarget->hasSSSE3())
+    if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
+            DL, MVT::v2i64, V1, V2, Mask, DAG))
+      return Rotate;
+
   // We implement this with SHUFPD which is pretty lame because it will likely
   // incur 2 cycles of stall for integer vectors on Nehalem and older chips.
   // However, all the alternatives are still more cycles and newer chips don't
@@ -7732,6 +7738,12 @@ static SDValue lowerV4I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
           lowerVectorShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask, DAG))
     return Blend;
 
+  // Try to use rotation instructions if available.
+  if (Subtarget->hasSSSE3())
+    if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
+            DL, MVT::v4i32, V1, V2, Mask, DAG))
+      return Rotate;
+
   // We implement this with SHUFPS because it can blend from two vectors.
   // Because we're going to eventually use SHUFPS, we use SHUFPS even to build
   // up the inputs, bypassing domain shift penalties that we would encur if we
diff --git a/test/CodeGen/X86/vector-shuffle-128-v2.ll b/test/CodeGen/X86/vector-shuffle-128-v2.ll
index 621e9a7db80..06673936586 100644
--- a/test/CodeGen/X86/vector-shuffle-128-v2.ll
+++ b/test/CodeGen/X86/vector-shuffle-128-v2.ll
@@ -222,17 +222,46 @@ define <2 x i64> @shuffle_v2i64_03_copy(<2 x i64> %nonce, <2 x i64> %a, <2 x i64
   ret <2 x i64> %shuffle
 }
 define <2 x i64> @shuffle_v2i64_12(<2 x i64> %a, <2 x i64> %b) {
-; ALL-LABEL: @shuffle_v2i64_12
-; ALL: shufpd {{.*}} # xmm0 = xmm0[1],xmm1[0]
-; ALL-NEXT: retq
+; SSE2-LABEL: @shuffle_v2i64_12
+; SSE2: shufpd {{.*}} # xmm0 = xmm0[1],xmm1[0]
+; SSE2-NEXT: retq
+;
+; SSE3-LABEL: @shuffle_v2i64_12
+; SSE3: shufpd {{.*}} # xmm0 = xmm0[1],xmm1[0]
+; SSE3-NEXT: retq
+;
+; SSSE3-LABEL: @shuffle_v2i64_12
+; SSSE3: palignr {{.*}} # xmm1 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
+; SSSE3-NEXT: movdqa %xmm1, %xmm0
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: @shuffle_v2i64_12
+; SSE41: palignr {{.*}} # xmm1 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: retq
   %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 2>
   ret <2 x i64> %shuffle
 }
 define <2 x i64> @shuffle_v2i64_12_copy(<2 x i64> %nonce, <2 x i64> %a, <2 x i64> %b) {
-; ALL-LABEL: @shuffle_v2i64_12_copy
-; ALL: shufpd {{.*}} # xmm1 = xmm1[1],xmm2[0]
-; ALL-NEXT: movapd %xmm1, %xmm0
-; ALL-NEXT: retq
+; SSE2-LABEL: @shuffle_v2i64_12_copy
+; SSE2: shufpd {{.*}} # xmm1 = xmm1[1],xmm2[0]
+; SSE2-NEXT: movapd %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE3-LABEL: @shuffle_v2i64_12_copy
+; SSE3: shufpd {{.*}} # xmm1 = xmm1[1],xmm2[0]
+; SSE3-NEXT: movapd %xmm1, %xmm0
+; SSE3-NEXT: retq
+;
+; SSSE3-LABEL: @shuffle_v2i64_12_copy
+; SSSE3: palignr {{.*}} # xmm2 = xmm1[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7]
+; SSSE3-NEXT: movdqa %xmm2, %xmm0
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: @shuffle_v2i64_12_copy
+; SSE41: palignr {{.*}} # xmm2 = xmm1[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7]
+; SSE41-NEXT: movdqa %xmm2, %xmm0
+; SSE41-NEXT: retq
   %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 2>
   ret <2 x i64> %shuffle
 }
@@ -314,18 +343,42 @@ define <2 x i64> @shuffle_v2i64_21_copy(<2 x i64> %nonce, <2 x i64> %a, <2 x i64
   ret <2 x i64> %shuffle
 }
 define <2 x i64> @shuffle_v2i64_30(<2 x i64> %a, <2 x i64> %b) {
-; ALL-LABEL: @shuffle_v2i64_30
-; ALL: shufpd {{.*}} # xmm1 = xmm1[1],xmm0[0]
-; ALL-NEXT: movapd %xmm1, %xmm0
-; ALL-NEXT: retq
+; SSE2-LABEL: @shuffle_v2i64_30
+; SSE2: shufpd {{.*}} # xmm1 = xmm1[1],xmm0[0]
+; SSE2-NEXT: movapd %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE3-LABEL: @shuffle_v2i64_30
+; SSE3: shufpd {{.*}} # xmm1 = xmm1[1],xmm0[0]
+; SSE3-NEXT: movapd %xmm1, %xmm0
+; SSE3-NEXT: retq
+;
+; SSSE3-LABEL: @shuffle_v2i64_30
+; SSSE3: palignr {{.*}} # xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
+; SSSE3-NEXT: retq
   %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 3, i32 0>
   ret <2 x i64> %shuffle
 }
 define <2 x i64> @shuffle_v2i64_30_copy(<2 x i64> %nonce, <2 x i64> %a, <2 x i64> %b) {
-; ALL-LABEL: @shuffle_v2i64_30_copy
-; ALL: shufpd {{.*}} # xmm2 = xmm2[1],xmm1[0]
-; ALL-NEXT: movapd %xmm2, %xmm0
-; ALL-NEXT: retq
+; SSE2-LABEL: @shuffle_v2i64_30_copy
+; SSE2: shufpd {{.*}} # xmm2 = xmm2[1],xmm1[0]
+; SSE2-NEXT: movapd %xmm2, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE3-LABEL: @shuffle_v2i64_30_copy
+; SSE3: shufpd {{.*}} # xmm2 = xmm2[1],xmm1[0]
+; SSE3-NEXT: movapd %xmm2, %xmm0
+; SSE3-NEXT: retq
+;
+; SSSE3-LABEL: @shuffle_v2i64_30_copy
+; SSSE3: palignr {{.*}} # xmm1 = xmm2[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
+; SSSE3-NEXT: movdqa %xmm1, %xmm0
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: @shuffle_v2i64_30_copy
+; SSE41: palignr {{.*}} # xmm1 = xmm2[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: retq
   %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 3, i32 0>
   ret <2 x i64> %shuffle
 }
diff --git a/test/CodeGen/X86/vector-shuffle-128-v4.ll b/test/CodeGen/X86/vector-shuffle-128-v4.ll
index 31205c93f5a..077780416dc 100644
--- a/test/CodeGen/X86/vector-shuffle-128-v4.ll
+++ b/test/CodeGen/X86/vector-shuffle-128-v4.ll
@@ -571,3 +571,197 @@ define <4 x i32> @shuffle_v4i32_z6zz(i32 %i) {
   %shuffle = shufflevector <4 x i32> zeroinitializer, <4 x i32> %a, <4 x i32>
   ret <4 x i32> %shuffle
 }
+
+define <4 x i32> @shuffle_v4i32_7012(<4 x i32> %a, <4 x i32> %b) {
+; SSE2-LABEL: @shuffle_v4i32_7012
+; SSE2: # BB#0:
+; SSE2-NEXT: shufps {{.*}} # xmm1 = xmm1[3,0],xmm0[0,0]
+; SSE2-NEXT: shufps {{.*}} # xmm1 = xmm1[0,2],xmm0[1,2]
+; SSE2-NEXT: movaps %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE3-LABEL: @shuffle_v4i32_7012
+; SSE3: # BB#0:
+; SSE3-NEXT: shufps {{.*}} # xmm1 = xmm1[3,0],xmm0[0,0]
+; SSE3-NEXT: shufps {{.*}} # xmm1 = xmm1[0,2],xmm0[1,2]
+; SSE3-NEXT: movaps %xmm1, %xmm0
+; SSE3-NEXT: retq
+;
+; SSSE3-LABEL: @shuffle_v4i32_7012
+; SSSE3: # BB#0:
+; SSSE3-NEXT: palignr $12, {{.*}} # xmm0 = xmm1[12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: @shuffle_v4i32_7012
+; SSE41: # BB#0:
+; SSE41-NEXT: palignr $12, {{.*}} # xmm0 = xmm1[12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11]
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: @shuffle_v4i32_7012
+; AVX1: # BB#0:
+; AVX1-NEXT: vpalignr $12, {{.*}} # xmm0 = xmm1[12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11]
+; AVX1-NEXT: retq
+  %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 7, i32 0, i32 1, i32 2>
+  ret <4 x i32> %shuffle
+}
+
+define <4 x i32> @shuffle_v4i32_6701(<4 x i32> %a, <4 x i32> %b) {
+; SSE2-LABEL: @shuffle_v4i32_6701
+; SSE2: # BB#0:
+; SSE2-NEXT: shufpd {{.*}} # xmm1 = xmm1[1],xmm0[0]
+; SSE2-NEXT: movapd %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE3-LABEL: @shuffle_v4i32_6701
+; SSE3: # BB#0:
+; SSE3-NEXT: shufpd {{.*}} # xmm1 = xmm1[1],xmm0[0]
+; SSE3-NEXT: movapd %xmm1, %xmm0
+; SSE3-NEXT: retq
+;
+; SSSE3-LABEL: @shuffle_v4i32_6701
+; SSSE3: # BB#0:
+; SSSE3-NEXT: palignr $8, {{.*}} # xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: @shuffle_v4i32_6701
+; SSE41: # BB#0:
+; SSE41-NEXT: palignr $8, {{.*}} # xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: @shuffle_v4i32_6701
+; AVX1: # BB#0:
+; AVX1-NEXT: vpalignr $8, {{.*}} # xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
+; AVX1-NEXT: retq
+  %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
+  ret <4 x i32> %shuffle
+}
+
+define <4 x i32> @shuffle_v4i32_5670(<4 x i32> %a, <4 x i32> %b) {
+; SSE2-LABEL: @shuffle_v4i32_5670
+; SSE2: # BB#0:
+; SSE2-NEXT: shufps {{.*}} # xmm0 = xmm0[0,0],xmm1[3,0]
+; SSE2-NEXT: shufps {{.*}} # xmm1 = xmm1[1,2],xmm0[2,0]
+; SSE2-NEXT: movaps %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE3-LABEL: @shuffle_v4i32_5670
+; SSE3: # BB#0:
+; SSE3-NEXT: shufps {{.*}} # xmm0 = xmm0[0,0],xmm1[3,0]
+; SSE3-NEXT: shufps {{.*}} # xmm1 = xmm1[1,2],xmm0[2,0]
+; SSE3-NEXT: movaps %xmm1, %xmm0
+; SSE3-NEXT: retq
+;
+; SSSE3-LABEL: @shuffle_v4i32_5670
+; SSSE3: # BB#0:
+; SSSE3-NEXT: palignr $4, {{.*}} # xmm0 = xmm1[4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: @shuffle_v4i32_5670
+; SSE41: # BB#0:
+; SSE41-NEXT: palignr $4, {{.*}} # xmm0 = xmm1[4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3]
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: @shuffle_v4i32_5670
+; AVX1: # BB#0:
+; AVX1-NEXT: vpalignr $4, {{.*}} # xmm0 = xmm1[4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3]
+; AVX1-NEXT: retq
+  %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 5, i32 6, i32 7, i32 0>
+  ret <4 x i32> %shuffle
+}
+
+define <4 x i32> @shuffle_v4i32_1234(<4 x i32> %a, <4 x i32> %b) {
+; SSE2-LABEL: @shuffle_v4i32_1234
+; SSE2: # BB#0:
+; SSE2-NEXT: shufps {{.*}} # xmm1 = xmm1[0,0],xmm0[3,0]
+; SSE2-NEXT: shufps {{.*}} # xmm0 = xmm0[1,2],xmm1[2,0]
+; SSE2-NEXT: retq
+;
+; SSE3-LABEL: @shuffle_v4i32_1234
+; SSE3: # BB#0:
+; SSE3-NEXT: shufps {{.*}} # xmm1 = xmm1[0,0],xmm0[3,0]
+; SSE3-NEXT: shufps {{.*}} # xmm0 = xmm0[1,2],xmm1[2,0]
+; SSE3-NEXT: retq
+;
+; SSSE3-LABEL: @shuffle_v4i32_1234
+; SSSE3: # BB#0:
+; SSSE3-NEXT: palignr $4, {{.*}} # xmm1 = xmm0[4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3]
+; SSSE3-NEXT: movdqa %xmm1, %xmm0
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: @shuffle_v4i32_1234
+; SSE41: # BB#0:
+; SSE41-NEXT: palignr $4, {{.*}} # xmm1 = xmm0[4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3]
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: @shuffle_v4i32_1234
+; AVX1: # BB#0:
+; AVX1-NEXT: vpalignr $4, {{.*}} # xmm0 = xmm0[4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3]
+; AVX1-NEXT: retq
+  %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 2, i32 3, i32 4>
+  ret <4 x i32> %shuffle
+}
+
+define <4 x i32> @shuffle_v4i32_2345(<4 x i32> %a, <4 x i32> %b) {
+; SSE2-LABEL: @shuffle_v4i32_2345
+; SSE2: # BB#0:
+; SSE2-NEXT: shufpd {{.*}} # xmm0 = xmm0[1],xmm1[0]
+; SSE2-NEXT: retq
+;
+; SSE3-LABEL: @shuffle_v4i32_2345
+; SSE3: # BB#0:
+; SSE3-NEXT: shufpd {{.*}} # xmm0 = xmm0[1],xmm1[0]
+; SSE3-NEXT: retq
+;
+; SSSE3-LABEL: @shuffle_v4i32_2345
+; SSSE3: # BB#0:
+; SSSE3-NEXT: palignr $8, {{.*}} # xmm1 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
+; SSSE3-NEXT: movdqa %xmm1, %xmm0
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: @shuffle_v4i32_2345
+; SSE41: # BB#0:
+; SSE41-NEXT: palignr $8, {{.*}} # xmm1 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: @shuffle_v4i32_2345
+; AVX1: # BB#0:
+; AVX1-NEXT: vpalignr $8, {{.*}} # xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
+; AVX1-NEXT: retq
+  %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+  ret <4 x i32> %shuffle
+}
+
+define <4 x i32> @shuffle_v4i32_3456(<4 x i32> %a, <4 x i32> %b) {
+; SSE2-LABEL: @shuffle_v4i32_3456
+; SSE2: # BB#0:
+; SSE2-NEXT: shufps {{.*}} # xmm0 = xmm0[3,0],xmm1[0,0]
+; SSE2-NEXT: shufps {{.*}} # xmm0 = xmm0[0,2],xmm1[1,2]
+; SSE2-NEXT: retq
+;
+; SSE3-LABEL: @shuffle_v4i32_3456
+; SSE3: # BB#0:
+; SSE3-NEXT: shufps {{.*}} # xmm0 = xmm0[3,0],xmm1[0,0]
+; SSE3-NEXT: shufps {{.*}} # xmm0 = xmm0[0,2],xmm1[1,2]
+; SSE3-NEXT: retq
+;
+; SSSE3-LABEL: @shuffle_v4i32_3456
+; SSSE3: # BB#0:
+; SSSE3-NEXT: palignr $12, {{.*}} # xmm1 = xmm0[12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11]
+; SSSE3-NEXT: movdqa %xmm1, %xmm0
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: @shuffle_v4i32_3456
+; SSE41: # BB#0:
+; SSE41-NEXT: palignr $12, {{.*}} # xmm1 = xmm0[12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11]
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: @shuffle_v4i32_3456
+; AVX1: # BB#0:
+; AVX1-NEXT: vpalignr $12, {{.*}} # xmm0 = xmm0[12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11]
+; AVX1-NEXT: retq
+  %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+  ret <4 x i32> %shuffle
+}
diff --git a/test/CodeGen/X86/vector-shuffle-128-v8.ll b/test/CodeGen/X86/vector-shuffle-128-v8.ll
index 92a2282f0ed..b61e282404c 100644
--- a/test/CodeGen/X86/vector-shuffle-128-v8.ll
+++ b/test/CodeGen/X86/vector-shuffle-128-v8.ll
@@ -21,10 +21,16 @@ define <8 x i16> @shuffle_v8i16_67452301(<8 x i16> %a, <8 x i16> %b) {
   ret <8 x i16> %shuffle
 }
 define <8 x i16> @shuffle_v8i16_456789AB(<8 x i16> %a, <8 x i16> %b) {
-; ALL-LABEL: @shuffle_v8i16_456789AB
-; ALL: # BB#0:
-; ALL: shufpd {{.*}} # xmm0 = xmm0[1],xmm1[0]
-; ALL-NEXT: retq
+; SSE2-LABEL: @shuffle_v8i16_456789AB
+; SSE2: # BB#0:
+; SSE2-NEXT: shufpd {{.*}} # xmm0 = xmm0[1],xmm1[0]
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: @shuffle_v8i16_456789AB
+; SSSE3: # BB#0:
+; SSSE3-NEXT: palignr {{.*}} # xmm1 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
+; SSSE3-NEXT: movdqa %xmm1, %xmm0
+; SSSE3-NEXT: retq
   %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
   ret <8 x i16> %shuffle
 }
diff --git a/test/CodeGen/X86/vector-shuffle-256-v4.ll b/test/CodeGen/X86/vector-shuffle-256-v4.ll
index 757ef8bf176..982542b59b2 100644
--- a/test/CodeGen/X86/vector-shuffle-256-v4.ll
+++ b/test/CodeGen/X86/vector-shuffle-256-v4.ll
@@ -29,7 +29,7 @@ define <4 x i64> @shuffle_v4i64_0112(<4 x i64> %a, <4 x i64> %b) {
 ; AVX1-LABEL: @shuffle_v4i64_0112
 ; AVX1: # BB#0:
 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vshufpd {{.*}} # xmm1 = xmm0[1],xmm1[0]
+; AVX1-NEXT: vpalignr {{.*}} # xmm1 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT: retq
   %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 1, i32 1, i32 2>
@@ -75,7 +75,7 @@ define <4 x i64> @shuffle_v4i64_3330(<4 x i64> %a, <4 x i64> %b) {
 ; AVX1-LABEL: @shuffle_v4i64_3330
 ; AVX1: # BB#0:
 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vshufpd {{.*}} # xmm0 = xmm1[1],xmm0[0]
+; AVX1-NEXT: vpalignr {{.*}} # xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
 ; AVX1-NEXT: vpshufd {{.*}} # xmm1 = xmm1[2,3,2,3]
 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; AVX1-NEXT: retq
@@ -303,7 +303,7 @@ define <4 x i64> @shuffle_v4i64_0412(<4 x i64> %a, <4 x i64> %b) {
 ; AVX1-LABEL: @shuffle_v4i64_0412
 ; AVX1: # BB#0:
 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vshufpd {{.*}} # xmm2 = xmm0[1],xmm2[0]
+; AVX1-NEXT: vpalignr {{.*}} # xmm2 = xmm0[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7]
 ; AVX1-NEXT: vpshufd {{.*}} # xmm1 = xmm1[0,1,0,1]
 ; AVX1-NEXT: vpblendw {{.*}} # xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
@@ -315,7 +315,7 @@ define <4 x i64> @shuffle_v4i64_4012(<4 x i64> %a, <4 x i64> %b) {
 ; AVX1-LABEL: @shuffle_v4i64_4012
 ; AVX1: # BB#0:
 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vshufpd {{.*}} # xmm2 = xmm0[1],xmm2[0]
+; AVX1-NEXT: vpalignr {{.*}} # xmm2 = xmm0[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7]
 ; AVX1-NEXT: vpshufd {{.*}} # xmm0 = xmm0[0,1,0,1]
 ; AVX1-NEXT: vpblendw {{.*}} # xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-- 
2.34.1
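
For readers skimming the new check lines: PALIGNR concatenates its two 128-bit sources and shifts the pair right by a byte immediate, which is why a two-input lane rotation such as @shuffle_v2i64_12 above becomes a single integer-domain instruction instead of a SHUFPD. The following sketch is not part of the patch; it only illustrates that behaviour with the SSSE3 intrinsic, and the helper name is invented for the example:

    // Illustration only (not from the patch): the byte rotation the new
    // lowering selects for shuffle_v2i64_12, i.e. result = { a[1], b[0] }.
    #include <tmmintrin.h>  // SSSE3 intrinsics; build with -mssse3

    static inline __m128i rotate_v2i64_12(__m128i a, __m128i b) {
      // Concatenate b:a and shift right by 8 bytes -- one palignr, with no
      // integer/FP domain crossing as there would be with shufpd.
      return _mm_alignr_epi8(b, a, 8);
    }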