From: Chandler Carruth Date: Thu, 2 Oct 2014 07:56:47 +0000 (+0000) Subject: [x86] Merge the third combining test into the generic one and add proper X-Git-Url: http://demsky.eecs.uci.edu/git/?a=commitdiff_plain;h=cd912001b4419a20bab21709c7aabc68599c5c46;p=oota-llvm.git [x86] Merge the third combining test into the generic one and add proper checks for all the ISA variants. If the SSE2 checks here terrify you, good. This is (in large part) the kind of amazingly bad code that is holding LLVM back when vectorizing on older ISAs. At the same time, these tests seem increasingly dubious to me. There are a very large number of tests and it isn't clear that they are systematically covering a specific set of functionality. Anyways, I don't want to reduce testing during the transition, I just want to consolidate it to where it is easier to manage. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@218860 91177308-0d34-0410-b5e6-96231b3b80d8 --- diff --git a/test/CodeGen/X86/combine-vec-shuffle-3.ll b/test/CodeGen/X86/combine-vec-shuffle-3.ll deleted file mode 100644 index bd2d34ca189..00000000000 --- a/test/CodeGen/X86/combine-vec-shuffle-3.ll +++ /dev/null @@ -1,380 +0,0 @@ -; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 | FileCheck %s - -define <4 x float> @test1(<4 x float> %a, <4 x float> %b) { - %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> - %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> - ret <4 x float> %2 -} -; CHECK-LABEL: test1 -; Mask: [0,1,2,3] -; CHECK: movaps -; CHECK: ret - -define <4 x float> @test2(<4 x float> %a, <4 x float> %b) { - %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> - %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> - ret <4 x float> %2 -} -; CHECK-LABEL: test2 -; Mask: [0,5,6,7] -; CHECK: movss -; CHECK: ret - -define <4 x float> @test3(<4 x float> %a, <4 x float> %b) { - %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> - %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> - ret <4 x float> %2 -} -; CHECK-LABEL: test3 -; Mask: [0,1,4,5] -; CHECK: movlhps -; CHECK: ret - -define <4 x float> @test4(<4 x float> %a, <4 x float> %b) { - %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> - %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> - ret <4 x float> %2 -} -; CHECK-LABEL: test4 -; Mask: [6,7,2,3] -; CHECK: movhlps -; CHECK-NEXT: ret - -define <4 x float> @test5(<4 x float> %a, <4 x float> %b) { - %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> - %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> - ret <4 x float> %2 -} -; CHECK-LABEL: test5 -; Mask: [4,1,6,7] -; CHECK: blendps $13 -; CHECK: ret - - -define <4 x i32> @test6(<4 x i32> %a, <4 x i32> %b) { - %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> - %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> - ret <4 x i32> %2 -} -; CHECK-LABEL: test6 -; Mask: [4,5,6,7] -; CHECK: movaps -; CHECK: ret - -define <4 x i32> @test7(<4 x i32> %a, <4 x i32> %b) { - %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> - %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> - ret <4 x i32> %2 -} -; CHECK-LABEL: test7 -; Mask: [0,5,6,7] -; CHECK: movss -; CHECK: ret - -define <4 x i32> @test8(<4 x i32> %a, <4 x i32> %b) { - %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> - %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> - ret <4 x i32> %2 -} -; CHECK-LABEL: test8 -; Mask: [0,1,4,5] -; CHECK: movlhps -; CHECK: ret - -define <4 x i32> @test9(<4 x i32> %a, <4 x i32> %b) { - %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> - %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> - ret <4 x i32> %2 -} -; CHECK-LABEL: test9 -; Mask: [6,7,2,3] -; CHECK: movhlps -; CHECK-NEXT: ret - -define <4 x i32> @test10(<4 x i32> %a, <4 x i32> %b) { - %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> - %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> - ret <4 x i32> %2 -} -; CHECK-LABEL: test10 -; Mask: [4,1,6,7] -; CHECK: blendps -; CHECK: ret - -define <4 x float> @test11(<4 x float> %a, <4 x float> %b) { - %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> - %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> - ret <4 x float> %2 -} -; CHECK-LABEL: test11 -; Mask: [0,1,2,3] -; CHECK-NOT: movaps -; CHECK-NOT: blendps -; CHECK: ret - -define <4 x float> @test12(<4 x float> %a, <4 x float> %b) { - %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> - %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> - ret <4 x float> %2 -} -; CHECK-LABEL: test12 -; Mask: [0,5,6,7] -; CHECK: movss -; CHECK: ret - -define <4 x float> @test13(<4 x float> %a, <4 x float> %b) { - %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> - %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> - ret <4 x float> %2 -} -; CHECK-LABEL: test13 -; Mask: [0,1,4,5] -; CHECK: movlhps -; CHECK: ret - -define <4 x float> @test14(<4 x float> %a, <4 x float> %b) { - %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> - %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> - ret <4 x float> %2 -} -; CHECK-LABEL: test14 -; Mask: [6,7,2,3] -; CHECK: movhlps -; CHECK: ret - -define <4 x float> @test15(<4 x float> %a, <4 x float> %b) { - %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> - %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> - ret <4 x float> %2 -} -; CHECK-LABEL: test15 -; Mask: [4,1,6,7] -; CHECK: blendps $13 -; CHECK: ret - -define <4 x i32> @test16(<4 x i32> %a, <4 x i32> %b) { - %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> - %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> - ret <4 x i32> %2 -} -; CHECK-LABEL: test16 -; Mask: [0,1,2,3] -; CHECK-NOT: movaps -; CHECK-NOT: blendps -; CHECK: ret - -define <4 x i32> @test17(<4 x i32> %a, <4 x i32> %b) { - %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> - %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> - ret <4 x i32> %2 -} -; CHECK-LABEL: test17 -; Mask: [0,5,6,7] -; CHECK: movss -; CHECK: ret - -define <4 x i32> @test18(<4 x i32> %a, <4 x i32> %b) { - %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> - %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> - ret <4 x i32> %2 -} -; CHECK-LABEL: test18 -; Mask: [0,1,4,5] -; CHECK: movlhps -; CHECK: ret - -define <4 x i32> @test19(<4 x i32> %a, <4 x i32> %b) { - %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> - %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> - ret <4 x i32> %2 -} -; CHECK-LABEL: test19 -; Mask: [6,7,2,3] -; CHECK: movhlps -; CHECK: ret - -define <4 x i32> @test20(<4 x i32> %a, <4 x i32> %b) { - %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> - %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> - ret <4 x i32> %2 -} -; CHECK-LABEL: test20 -; Mask: [4,1,6,7] -; CHECK: blendps $13 -; CHECK: ret - -; Check some negative cases. -define <4 x float> @test1b(<4 x float> %a, <4 x float> %b) { - %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> - %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> - ret <4 x float> %2 -} -; CHECK-LABEL: test1b -; CHECK: shufps -; CHECK: shufps -; CHECK: ret - -define <4 x float> @test2b(<4 x float> %a, <4 x float> %b) { - %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> - %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> - ret <4 x float> %2 -} -; CHECK-LABEL: test2b -; CHECK: shufps -; CHECK: pshufd -; CHECK: ret - -define <4 x float> @test3b(<4 x float> %a, <4 x float> %b) { - %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> - %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> - ret <4 x float> %2 -} -; CHECK-LABEL: test3b -; CHECK: shufps -; CHECK: shufps -; CHECK: ret - -define <4 x float> @test4b(<4 x float> %a, <4 x float> %b) { - %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> - %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> - ret <4 x float> %2 -} -; CHECK-LABEL: test4b -; CHECK: shufps -; CHECK: shufps -; CHECK: ret - - -; Verify that we correctly fold shuffles even when we use illegal vector types. -define <4 x i8> @test1c(<4 x i8>* %a, <4 x i8>* %b) { - %A = load <4 x i8>* %a - %B = load <4 x i8>* %b - %1 = shufflevector <4 x i8> %A, <4 x i8> %B, <4 x i32> - %2 = shufflevector <4 x i8> %1, <4 x i8> %B, <4 x i32> - ret <4 x i8> %2 -} -; CHECK-LABEL: test1c -; Mask: [0,5,6,7] -; CHECK: movss -; CHECK-NEXT: ret - -define <4 x i8> @test2c(<4 x i8>* %a, <4 x i8>* %b) { - %A = load <4 x i8>* %a - %B = load <4 x i8>* %b - %1 = shufflevector <4 x i8> %A, <4 x i8> %B, <4 x i32> - %2 = shufflevector <4 x i8> %1, <4 x i8> %B, <4 x i32> - ret <4 x i8> %2 -} -; CHECK-LABEL: test2c -; Mask: [0,1,4,5] -; CHECK: movlhps -; CHECK-NEXT: ret - -define <4 x i8> @test3c(<4 x i8>* %a, <4 x i8>* %b) { - %A = load <4 x i8>* %a - %B = load <4 x i8>* %b - %1 = shufflevector <4 x i8> %A, <4 x i8> %B, <4 x i32> - %2 = shufflevector <4 x i8> %1, <4 x i8> %B, <4 x i32> - ret <4 x i8> %2 -} -; CHECK-LABEL: test3c -; Mask: [6,7,2,3] -; CHECK: movhlps -; CHECK-NEXT: ret - -define <4 x i8> @test4c(<4 x i8>* %a, <4 x i8>* %b) { - %A = load <4 x i8>* %a - %B = load <4 x i8>* %b - %1 = shufflevector <4 x i8> %A, <4 x i8> %B, <4 x i32> - %2 = shufflevector <4 x i8> %1, <4 x i8> %B, <4 x i32> - ret <4 x i8> %2 -} -; CHECK-LABEL: test4c -; Mask: [4,1,6,7] -; CHECK: blendps $13 -; CHECK: ret - -; The following test cases are generated from this C++ code -; -;__m128 blend_01(__m128 a, __m128 b) -;{ -; __m128 s = a; -; s = _mm_blend_ps( s, b, 1<<0 ); -; s = _mm_blend_ps( s, b, 1<<1 ); -; return s; -;} -; -;__m128 blend_02(__m128 a, __m128 b) -;{ -; __m128 s = a; -; s = _mm_blend_ps( s, b, 1<<0 ); -; s = _mm_blend_ps( s, b, 1<<2 ); -; return s; -;} -; -;__m128 blend_123(__m128 a, __m128 b) -;{ -; __m128 s = a; -; s = _mm_blend_ps( s, b, 1<<1 ); -; s = _mm_blend_ps( s, b, 1<<2 ); -; s = _mm_blend_ps( s, b, 1<<3 ); -; return s; -;} - -; Ideally, we should collapse the following shuffles into a single one. - -define <4 x float> @blend_01(<4 x float> %a, <4 x float> %b) { - %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> - %shuffle6 = shufflevector <4 x float> %shuffle, <4 x float> %b, <4 x i32> - ret <4 x float> %shuffle6 -} -; CHECK-LABEL: blend_01 -; CHECK: movsd -; CHECK-NEXT: ret - -define <4 x float> @blend_02(<4 x float> %a, <4 x float> %b) { - %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> - %shuffle6 = shufflevector <4 x float> %shuffle, <4 x float> %b, <4 x i32> - ret <4 x float> %shuffle6 -} -; CHECK-LABEL: blend_02 -; CHECK: blendps $5 -; CHECK-NEXT: ret - -define <4 x float> @blend_123(<4 x float> %a, <4 x float> %b) { - %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> - %shuffle6 = shufflevector <4 x float> %shuffle, <4 x float> %b, <4 x i32> - %shuffle12 = shufflevector <4 x float> %shuffle6, <4 x float> %b, <4 x i32> - ret <4 x float> %shuffle12 -} -; CHECK-LABEL: blend_123 -; CHECK: movss -; CHECK: ret - -define <4 x i32> @test_movhl_1(<4 x i32> %a, <4 x i32> %b) { - %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> - %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> - ret <4 x i32> %2 -} -; CHECK-LABEL: test_movhl_1 -; CHECK: movhlps -; CHECK-NEXT: ret - -define <4 x i32> @test_movhl_2(<4 x i32> %a, <4 x i32> %b) { - %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> - %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> - ret <4 x i32> %2 -} -; CHECK-LABEL: test_movhl_2 -; CHECK: movhlps -; CHECK-NEXT: ret - -define <4 x i32> @test_movhl_3(<4 x i32> %a, <4 x i32> %b) { - %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> - %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> - ret <4 x i32> %2 -} -; CHECK-LABEL: test_movhl_3 -; CHECK: movhlps -; CHECK-NEXT: ret - diff --git a/test/CodeGen/X86/vector-shuffle-combining.ll b/test/CodeGen/X86/vector-shuffle-combining.ll index d8e6cf2b8c1..dd0961769e7 100644 --- a/test/CodeGen/X86/vector-shuffle-combining.ll +++ b/test/CodeGen/X86/vector-shuffle-combining.ll @@ -1097,3 +1097,1004 @@ define <4 x i32> @combine_nested_undef_test28(<4 x i32> %A, <4 x i32> %B) { %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> ret <4 x i32> %2 } + +define <4 x float> @combine_test1(<4 x float> %a, <4 x float> %b) { +; SSE2-LABEL: combine_test1: +; SSE2: # BB#0: +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[0,2] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,0,3,1] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: combine_test1: +; SSSE3: # BB#0: +; SSSE3-NEXT: movaps %xmm1, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: combine_test1: +; SSE41: # BB#0: +; SSE41-NEXT: movaps %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: combine_test1: +; AVX: # BB#0: +; AVX-NEXT: vmovaps %xmm1, %xmm0 +; AVX-NEXT: retq + %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> + %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> + ret <4 x float> %2 +} + +define <4 x float> @combine_test2(<4 x float> %a, <4 x float> %b) { +; SSE2-LABEL: combine_test2: +; SSE2: # BB#0: +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[3,0] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: combine_test2: +; SSSE3: # BB#0: +; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,3] +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] +; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[3,0] +; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: combine_test2: +; SSE41: # BB#0: +; SSE41-NEXT: movss %xmm0, %xmm1 +; SSE41-NEXT: movaps %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: combine_test2: +; AVX: # BB#0: +; AVX-NEXT: vmovss %xmm0, %xmm1, %xmm0 +; AVX-NEXT: retq + %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> + %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> + ret <4 x float> %2 +} + +define <4 x float> @combine_test3(<4 x float> %a, <4 x float> %b) { +; SSE-LABEL: combine_test3: +; SSE: # BB#0: +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE-NEXT: retq +; +; AVX-LABEL: combine_test3: +; AVX: # BB#0: +; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX-NEXT: retq + %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> + %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> + ret <4 x float> %2 +} + +define <4 x float> @combine_test4(<4 x float> %a, <4 x float> %b) { +; SSE-LABEL: combine_test4: +; SSE: # BB#0: +; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; SSE-NEXT: retq +; +; AVX-LABEL: combine_test4: +; AVX: # BB#0: +; AVX-NEXT: vmovhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX-NEXT: retq + %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> + %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> + ret <4 x float> %2 +} + +define <4 x float> @combine_test5(<4 x float> %a, <4 x float> %b) { +; SSE2-LABEL: combine_test5: +; SSE2: # BB#0: +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[0,2] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,0,3,1] +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm0[2,0] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: combine_test5: +; SSSE3: # BB#0: +; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[0,2] +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,0,3,1] +; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm0[2,0] +; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: combine_test5: +; SSE41: # BB#0: +; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] +; SSE41-NEXT: retq +; +; AVX-LABEL: combine_test5: +; AVX: # BB#0: +; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] +; AVX-NEXT: retq + %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> + %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> + ret <4 x float> %2 +} + +define <4 x i32> @combine_test6(<4 x i32> %a, <4 x i32> %b) { +; SSE2-LABEL: combine_test6: +; SSE2: # BB#0: +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[0,2] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,0,3,1] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: combine_test6: +; SSSE3: # BB#0: +; SSSE3-NEXT: movaps %xmm1, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: combine_test6: +; SSE41: # BB#0: +; SSE41-NEXT: movaps %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: combine_test6: +; AVX: # BB#0: +; AVX-NEXT: vmovaps %xmm1, %xmm0 +; AVX-NEXT: retq + %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> + %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> + ret <4 x i32> %2 +} + +define <4 x i32> @combine_test7(<4 x i32> %a, <4 x i32> %b) { +; SSE2-LABEL: combine_test7: +; SSE2: # BB#0: +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[3,0] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: combine_test7: +; SSSE3: # BB#0: +; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,3] +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] +; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[3,0] +; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: combine_test7: +; SSE41: # BB#0: +; SSE41-NEXT: movss %xmm0, %xmm1 +; SSE41-NEXT: movaps %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: combine_test7: +; AVX: # BB#0: +; AVX-NEXT: vmovss %xmm0, %xmm1, %xmm0 +; AVX-NEXT: retq + %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> + %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> + ret <4 x i32> %2 +} + +define <4 x i32> @combine_test8(<4 x i32> %a, <4 x i32> %b) { +; SSE-LABEL: combine_test8: +; SSE: # BB#0: +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE-NEXT: retq +; +; AVX-LABEL: combine_test8: +; AVX: # BB#0: +; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX-NEXT: retq + %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> + %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> + ret <4 x i32> %2 +} + +define <4 x i32> @combine_test9(<4 x i32> %a, <4 x i32> %b) { +; SSE-LABEL: combine_test9: +; SSE: # BB#0: +; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; SSE-NEXT: retq +; +; AVX-LABEL: combine_test9: +; AVX: # BB#0: +; AVX-NEXT: vmovhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX-NEXT: retq + %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> + %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> + ret <4 x i32> %2 +} + +define <4 x i32> @combine_test10(<4 x i32> %a, <4 x i32> %b) { +; SSE2-LABEL: combine_test10: +; SSE2: # BB#0: +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[0,2] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,0,3,1] +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm0[2,0] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: combine_test10: +; SSSE3: # BB#0: +; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[0,2] +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,0,3,1] +; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm0[2,0] +; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: combine_test10: +; SSE41: # BB#0: +; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] +; SSE41-NEXT: retq +; +; AVX1-LABEL: combine_test10: +; AVX1: # BB#0: +; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_test10: +; AVX2: # BB#0: +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] +; AVX2-NEXT: retq + %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> + %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> + ret <4 x i32> %2 +} + +define <4 x float> @combine_test11(<4 x float> %a, <4 x float> %b) { +; ALL-LABEL: combine_test11: +; ALL: # BB#0: +; ALL-NEXT: retq + %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> + %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> + ret <4 x float> %2 +} + +define <4 x float> @combine_test12(<4 x float> %a, <4 x float> %b) { +; SSE2-LABEL: combine_test12: +; SSE2: # BB#0: +; SSE2-NEXT: movss %xmm0, %xmm1 +; SSE2-NEXT: movss %xmm0, %xmm1 +; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: combine_test12: +; SSSE3: # BB#0: +; SSSE3-NEXT: movss %xmm0, %xmm1 +; SSSE3-NEXT: movss %xmm0, %xmm1 +; SSSE3-NEXT: movaps %xmm1, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: combine_test12: +; SSE41: # BB#0: +; SSE41-NEXT: movss %xmm0, %xmm1 +; SSE41-NEXT: movaps %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: combine_test12: +; AVX: # BB#0: +; AVX-NEXT: vmovss %xmm0, %xmm1, %xmm0 +; AVX-NEXT: retq + %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> + %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> + ret <4 x float> %2 +} + +define <4 x float> @combine_test13(<4 x float> %a, <4 x float> %b) { +; SSE-LABEL: combine_test13: +; SSE: # BB#0: +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE-NEXT: retq +; +; AVX-LABEL: combine_test13: +; AVX: # BB#0: +; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX-NEXT: retq + %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> + %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> + ret <4 x float> %2 +} + +define <4 x float> @combine_test14(<4 x float> %a, <4 x float> %b) { +; SSE-LABEL: combine_test14: +; SSE: # BB#0: +; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1] +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: combine_test14: +; AVX: # BB#0: +; AVX-NEXT: vmovhlps {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; AVX-NEXT: retq + %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> + %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> + ret <4 x float> %2 +} + +define <4 x float> @combine_test15(<4 x float> %a, <4 x float> %b) { +; SSE2-LABEL: combine_test15: +; SSE2: # BB#0: +; SSE2-NEXT: movaps %xmm0, %xmm2 +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm1[0,0] +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[2,3] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm2[0,0] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[2,3] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: combine_test15: +; SSSE3: # BB#0: +; SSSE3-NEXT: movaps %xmm0, %xmm2 +; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm1[0,0] +; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[2,3] +; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm2[0,0] +; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[2,3] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: combine_test15: +; SSE41: # BB#0: +; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] +; SSE41-NEXT: retq +; +; AVX-LABEL: combine_test15: +; AVX: # BB#0: +; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] +; AVX-NEXT: retq + %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> + %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> + ret <4 x float> %2 +} + +define <4 x i32> @combine_test16(<4 x i32> %a, <4 x i32> %b) { +; ALL-LABEL: combine_test16: +; ALL: # BB#0: +; ALL-NEXT: retq + %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> + %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> + ret <4 x i32> %2 +} + +define <4 x i32> @combine_test17(<4 x i32> %a, <4 x i32> %b) { +; SSE2-LABEL: combine_test17: +; SSE2: # BB#0: +; SSE2-NEXT: movss %xmm0, %xmm1 +; SSE2-NEXT: movss %xmm0, %xmm1 +; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: combine_test17: +; SSSE3: # BB#0: +; SSSE3-NEXT: movss %xmm0, %xmm1 +; SSSE3-NEXT: movss %xmm0, %xmm1 +; SSSE3-NEXT: movaps %xmm1, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: combine_test17: +; SSE41: # BB#0: +; SSE41-NEXT: movss %xmm0, %xmm1 +; SSE41-NEXT: movaps %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: combine_test17: +; AVX: # BB#0: +; AVX-NEXT: vmovss %xmm0, %xmm1, %xmm0 +; AVX-NEXT: retq + %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> + %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> + ret <4 x i32> %2 +} + +define <4 x i32> @combine_test18(<4 x i32> %a, <4 x i32> %b) { +; SSE-LABEL: combine_test18: +; SSE: # BB#0: +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE-NEXT: retq +; +; AVX-LABEL: combine_test18: +; AVX: # BB#0: +; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX-NEXT: retq + %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> + %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> + ret <4 x i32> %2 +} + +define <4 x i32> @combine_test19(<4 x i32> %a, <4 x i32> %b) { +; SSE-LABEL: combine_test19: +; SSE: # BB#0: +; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1] +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: combine_test19: +; AVX: # BB#0: +; AVX-NEXT: vmovhlps {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; AVX-NEXT: retq + %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> + %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> + ret <4 x i32> %2 +} + +define <4 x i32> @combine_test20(<4 x i32> %a, <4 x i32> %b) { +; SSE2-LABEL: combine_test20: +; SSE2: # BB#0: +; SSE2-NEXT: movaps %xmm0, %xmm2 +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm1[0,0] +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[2,3] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm2[0,0] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[2,3] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: combine_test20: +; SSSE3: # BB#0: +; SSSE3-NEXT: movaps %xmm0, %xmm2 +; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm1[0,0] +; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[2,3] +; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm2[0,0] +; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[2,3] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: combine_test20: +; SSE41: # BB#0: +; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] +; SSE41-NEXT: retq +; +; AVX1-LABEL: combine_test20: +; AVX1: # BB#0: +; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_test20: +; AVX2: # BB#0: +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] +; AVX2-NEXT: retq + %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> + %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> + ret <4 x i32> %2 +} + + +; Check some negative cases. +; FIXME: Do any of these really make sense? Are they redundant with the above tests? + +define <4 x float> @combine_test1b(<4 x float> %a, <4 x float> %b) { +; SSE2-LABEL: combine_test1b: +; SSE2: # BB#0: +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[0,2] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,0,3,1] +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm0[0,0] +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,0] +; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: combine_test1b: +; SSSE3: # BB#0: +; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[0,2] +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,0,3,1] +; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm0[0,0] +; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,0] +; SSSE3-NEXT: movaps %xmm1, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: combine_test1b: +; SSE41: # BB#0: +; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3] +; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm0[0,0] +; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,0] +; SSE41-NEXT: movaps %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: combine_test1b: +; AVX: # BB#0: +; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3] +; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,0],xmm0[0,0] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm1[2,0],xmm0[2,0] +; AVX-NEXT: retq + %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> + %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> + ret <4 x float> %2 +} + +define <4 x float> @combine_test2b(<4 x float> %a, <4 x float> %b) { +; SSE2-LABEL: combine_test2b: +; SSE2: # BB#0: +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[0,2] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,0,3,1] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[1,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: combine_test2b: +; SSSE3: # BB#0: +; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[0,2] +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,0,3,1] +; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[1,1] +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: combine_test2b: +; SSE41: # BB#0: +; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3] +; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[1,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] +; SSE41-NEXT: retq +; +; AVX-LABEL: combine_test2b: +; AVX: # BB#0: +; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[1,1] +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] +; AVX-NEXT: retq + %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> + %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> + ret <4 x float> %2 +} + +define <4 x float> @combine_test3b(<4 x float> %a, <4 x float> %b) { +; SSE-LABEL: combine_test3b: +; SSE: # BB#0: +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm0[3,0] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm2[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] +; SSE-NEXT: retq +; +; AVX-LABEL: combine_test3b: +; AVX: # BB#0: +; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm1[2,0],xmm0[3,0] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0],xmm2[0,2] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[3,3] +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] +; AVX-NEXT: retq + %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> + %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> + ret <4 x float> %2 +} + +define <4 x float> @combine_test4b(<4 x float> %a, <4 x float> %b) { +; SSE2-LABEL: combine_test4b: +; SSE2: # BB#0: +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[0,2] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,0,3,1] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[3,0] +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[0,2] +; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: combine_test4b: +; SSSE3: # BB#0: +; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[0,2] +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,0,3,1] +; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[3,0] +; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[0,2] +; SSSE3-NEXT: movaps %xmm1, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: combine_test4b: +; SSE41: # BB#0: +; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3] +; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[3,0] +; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[0,2] +; SSE41-NEXT: movaps %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: combine_test4b: +; AVX: # BB#0: +; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[3,0] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm1[1,1],xmm0[0,2] +; AVX-NEXT: retq + %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> + %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> + ret <4 x float> %2 +} + + +; Verify that we correctly fold shuffles even when we use illegal vector types. + +define <4 x i8> @combine_test1c(<4 x i8>* %a, <4 x i8>* %b) { +; SSE2-LABEL: combine_test1c: +; SSE2: # BB#0: +; SSE2-NEXT: movl (%rdi), %eax +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: pextrw $1, %xmm0, %ecx +; SSE2-NEXT: pinsrw $0, %eax, %xmm0 +; SSE2-NEXT: movzbl %ah, %eax +; SSE2-NEXT: pinsrw $2, %eax, %xmm0 +; SSE2-NEXT: pinsrw $4, %ecx, %xmm0 +; SSE2-NEXT: shrl $8, %ecx +; SSE2-NEXT: pinsrw $6, %ecx, %xmm0 +; SSE2-NEXT: movl (%rsi), %eax +; SSE2-NEXT: movd %eax, %xmm1 +; SSE2-NEXT: pextrw $1, %xmm1, %ecx +; SSE2-NEXT: pinsrw $0, %eax, %xmm1 +; SSE2-NEXT: movzbl %ah, %eax +; SSE2-NEXT: pinsrw $2, %eax, %xmm1 +; SSE2-NEXT: pinsrw $4, %ecx, %xmm1 +; SSE2-NEXT: shrl $8, %ecx +; SSE2-NEXT: pinsrw $6, %ecx, %xmm1 +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[3,0] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: combine_test1c: +; SSSE3: # BB#0: +; SSSE3-NEXT: movd (%rdi), %xmm0 +; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [0,128,128,128,1,128,128,128,2,128,128,128,3,128,128,128] +; SSSE3-NEXT: pshufb %xmm1, %xmm0 +; SSSE3-NEXT: movd (%rsi), %xmm2 +; SSSE3-NEXT: pshufb %xmm1, %xmm2 +; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[1,3] +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] +; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm0[3,0] +; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0,2] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: combine_test1c: +; SSE41: # BB#0: +; SSE41-NEXT: pmovzxbd (%rdi), %xmm1 +; SSE41-NEXT: pmovzxbd (%rsi), %xmm0 +; SSE41-NEXT: movss %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: combine_test1c: +; AVX: # BB#0: +; AVX-NEXT: vpmovzxbd (%rdi), %xmm0 +; AVX-NEXT: vpmovzxbd (%rsi), %xmm1 +; AVX-NEXT: vmovss %xmm0, %xmm1, %xmm0 +; AVX-NEXT: retq + %A = load <4 x i8>* %a + %B = load <4 x i8>* %b + %1 = shufflevector <4 x i8> %A, <4 x i8> %B, <4 x i32> + %2 = shufflevector <4 x i8> %1, <4 x i8> %B, <4 x i32> + ret <4 x i8> %2 +} + +define <4 x i8> @combine_test2c(<4 x i8>* %a, <4 x i8>* %b) { +; SSE2-LABEL: combine_test2c: +; SSE2: # BB#0: +; SSE2-NEXT: movl (%rdi), %eax +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: pextrw $1, %xmm0, %ecx +; SSE2-NEXT: pinsrw $0, %eax, %xmm0 +; SSE2-NEXT: movzbl %ah, %eax +; SSE2-NEXT: pinsrw $2, %eax, %xmm0 +; SSE2-NEXT: pinsrw $4, %ecx, %xmm0 +; SSE2-NEXT: shrl $8, %ecx +; SSE2-NEXT: pinsrw $6, %ecx, %xmm0 +; SSE2-NEXT: movl (%rsi), %eax +; SSE2-NEXT: movd %eax, %xmm1 +; SSE2-NEXT: pextrw $1, %xmm1, %ecx +; SSE2-NEXT: pinsrw $0, %eax, %xmm1 +; SSE2-NEXT: movzbl %ah, %eax +; SSE2-NEXT: pinsrw $2, %eax, %xmm1 +; SSE2-NEXT: pinsrw $4, %ecx, %xmm1 +; SSE2-NEXT: shrl $8, %ecx +; SSE2-NEXT: pinsrw $6, %ecx, %xmm1 +; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: combine_test2c: +; SSSE3: # BB#0: +; SSSE3-NEXT: movd (%rdi), %xmm0 +; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [0,128,128,128,1,128,128,128,2,128,128,128,3,128,128,128] +; SSSE3-NEXT: pshufb %xmm1, %xmm0 +; SSSE3-NEXT: movd (%rsi), %xmm2 +; SSSE3-NEXT: pshufb %xmm1, %xmm2 +; SSSE3-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: combine_test2c: +; SSE41: # BB#0: +; SSE41-NEXT: pmovzxbd (%rdi), %xmm0 +; SSE41-NEXT: pmovzxbd (%rsi), %xmm1 +; SSE41-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE41-NEXT: retq +; +; AVX-LABEL: combine_test2c: +; AVX: # BB#0: +; AVX-NEXT: vpmovzxbd (%rdi), %xmm0 +; AVX-NEXT: vpmovzxbd (%rsi), %xmm1 +; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX-NEXT: retq + %A = load <4 x i8>* %a + %B = load <4 x i8>* %b + %1 = shufflevector <4 x i8> %A, <4 x i8> %B, <4 x i32> + %2 = shufflevector <4 x i8> %1, <4 x i8> %B, <4 x i32> + ret <4 x i8> %2 +} + +define <4 x i8> @combine_test3c(<4 x i8>* %a, <4 x i8>* %b) { +; SSE2-LABEL: combine_test3c: +; SSE2: # BB#0: +; SSE2-NEXT: movl (%rdi), %eax +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: pextrw $1, %xmm0, %ecx +; SSE2-NEXT: pinsrw $0, %eax, %xmm0 +; SSE2-NEXT: movzbl %ah, %eax +; SSE2-NEXT: pinsrw $2, %eax, %xmm0 +; SSE2-NEXT: pinsrw $4, %ecx, %xmm0 +; SSE2-NEXT: shrl $8, %ecx +; SSE2-NEXT: pinsrw $6, %ecx, %xmm0 +; SSE2-NEXT: movl (%rsi), %eax +; SSE2-NEXT: movd %eax, %xmm1 +; SSE2-NEXT: pextrw $1, %xmm1, %ecx +; SSE2-NEXT: pinsrw $0, %eax, %xmm1 +; SSE2-NEXT: movzbl %ah, %eax +; SSE2-NEXT: pinsrw $2, %eax, %xmm1 +; SSE2-NEXT: pinsrw $4, %ecx, %xmm1 +; SSE2-NEXT: shrl $8, %ecx +; SSE2-NEXT: pinsrw $6, %ecx, %xmm1 +; SSE2-NEXT: movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: combine_test3c: +; SSSE3: # BB#0: +; SSSE3-NEXT: movd (%rdi), %xmm0 +; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [0,128,128,128,1,128,128,128,2,128,128,128,3,128,128,128] +; SSSE3-NEXT: pshufb %xmm1, %xmm0 +; SSSE3-NEXT: movd (%rsi), %xmm2 +; SSSE3-NEXT: pshufb %xmm1, %xmm2 +; SSSE3-NEXT: movhlps {{.*#+}} xmm0 = xmm2[1],xmm0[1] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: combine_test3c: +; SSE41: # BB#0: +; SSE41-NEXT: pmovzxbd (%rdi), %xmm0 +; SSE41-NEXT: pmovzxbd (%rsi), %xmm1 +; SSE41-NEXT: movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; SSE41-NEXT: retq +; +; AVX-LABEL: combine_test3c: +; AVX: # BB#0: +; AVX-NEXT: vpmovzxbd (%rdi), %xmm0 +; AVX-NEXT: vpmovzxbd (%rsi), %xmm1 +; AVX-NEXT: vmovhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX-NEXT: retq + %A = load <4 x i8>* %a + %B = load <4 x i8>* %b + %1 = shufflevector <4 x i8> %A, <4 x i8> %B, <4 x i32> + %2 = shufflevector <4 x i8> %1, <4 x i8> %B, <4 x i32> + ret <4 x i8> %2 +} + +define <4 x i8> @combine_test4c(<4 x i8>* %a, <4 x i8>* %b) { +; SSE2-LABEL: combine_test4c: +; SSE2: # BB#0: +; SSE2-NEXT: movl (%rdi), %eax +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: pextrw $1, %xmm0, %ecx +; SSE2-NEXT: pinsrw $0, %eax, %xmm0 +; SSE2-NEXT: movzbl %ah, %eax +; SSE2-NEXT: pinsrw $2, %eax, %xmm0 +; SSE2-NEXT: pinsrw $4, %ecx, %xmm0 +; SSE2-NEXT: shrl $8, %ecx +; SSE2-NEXT: pinsrw $6, %ecx, %xmm0 +; SSE2-NEXT: movl (%rsi), %eax +; SSE2-NEXT: movd %eax, %xmm1 +; SSE2-NEXT: pextrw $1, %xmm1, %ecx +; SSE2-NEXT: pinsrw $0, %eax, %xmm1 +; SSE2-NEXT: movzbl %ah, %eax +; SSE2-NEXT: pinsrw $2, %eax, %xmm1 +; SSE2-NEXT: pinsrw $4, %ecx, %xmm1 +; SSE2-NEXT: shrl $8, %ecx +; SSE2-NEXT: pinsrw $6, %ecx, %xmm1 +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[0,2] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,0,3,1] +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm0[2,0] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: combine_test4c: +; SSSE3: # BB#0: +; SSSE3-NEXT: movd (%rdi), %xmm0 +; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [0,128,128,128,1,128,128,128,2,128,128,128,3,128,128,128] +; SSSE3-NEXT: pshufb %xmm1, %xmm0 +; SSSE3-NEXT: movd (%rsi), %xmm2 +; SSSE3-NEXT: pshufb %xmm1, %xmm2 +; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm2[0,2] +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,0,3,1] +; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,0],xmm0[2,0] +; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: combine_test4c: +; SSE41: # BB#0: +; SSE41-NEXT: pmovzxbd (%rdi), %xmm0 +; SSE41-NEXT: pmovzxbd (%rsi), %xmm1 +; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] +; SSE41-NEXT: retq +; +; AVX1-LABEL: combine_test4c: +; AVX1: # BB#0: +; AVX1-NEXT: vpmovzxbd (%rdi), %xmm0 +; AVX1-NEXT: vpmovzxbd (%rsi), %xmm1 +; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_test4c: +; AVX2: # BB#0: +; AVX2-NEXT: vpmovzxbd (%rdi), %xmm0 +; AVX2-NEXT: vpmovzxbd (%rsi), %xmm1 +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] +; AVX2-NEXT: retq + %A = load <4 x i8>* %a + %B = load <4 x i8>* %b + %1 = shufflevector <4 x i8> %A, <4 x i8> %B, <4 x i32> + %2 = shufflevector <4 x i8> %1, <4 x i8> %B, <4 x i32> + ret <4 x i8> %2 +} + + +; The following test cases are generated from this C++ code +; +;__m128 blend_01(__m128 a, __m128 b) +;{ +; __m128 s = a; +; s = _mm_blend_ps( s, b, 1<<0 ); +; s = _mm_blend_ps( s, b, 1<<1 ); +; return s; +;} +; +;__m128 blend_02(__m128 a, __m128 b) +;{ +; __m128 s = a; +; s = _mm_blend_ps( s, b, 1<<0 ); +; s = _mm_blend_ps( s, b, 1<<2 ); +; return s; +;} +; +;__m128 blend_123(__m128 a, __m128 b) +;{ +; __m128 s = a; +; s = _mm_blend_ps( s, b, 1<<1 ); +; s = _mm_blend_ps( s, b, 1<<2 ); +; s = _mm_blend_ps( s, b, 1<<3 ); +; return s; +;} + +; Ideally, we should collapse the following shuffles into a single one. + +define <4 x float> @combine_blend_01(<4 x float> %a, <4 x float> %b) { +; SSE2-LABEL: combine_blend_01: +; SSE2: # BB#0: +; SSE2-NEXT: movss %xmm1, %xmm0 +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm0[0,0] +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3] +; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: combine_blend_01: +; SSSE3: # BB#0: +; SSSE3-NEXT: movss %xmm1, %xmm0 +; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm0[0,0] +; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3] +; SSSE3-NEXT: movaps %xmm1, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: combine_blend_01: +; SSE41: # BB#0: +; SSE41-NEXT: movsd %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: combine_blend_01: +; AVX: # BB#0: +; AVX-NEXT: vmovsd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq + %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> + %shuffle6 = shufflevector <4 x float> %shuffle, <4 x float> %b, <4 x i32> + ret <4 x float> %shuffle6 +} + +define <4 x float> @combine_blend_02(<4 x float> %a, <4 x float> %b) { +; SSE2-LABEL: combine_blend_02: +; SSE2: # BB#0: +; SSE2-NEXT: movss %xmm1, %xmm0 +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[3,0] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: combine_blend_02: +; SSSE3: # BB#0: +; SSSE3-NEXT: movss %xmm1, %xmm0 +; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[3,0] +; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: combine_blend_02: +; SSE41: # BB#0: +; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3] +; SSE41-NEXT: retq +; +; AVX-LABEL: combine_blend_02: +; AVX: # BB#0: +; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3] +; AVX-NEXT: retq + %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> + %shuffle6 = shufflevector <4 x float> %shuffle, <4 x float> %b, <4 x i32> + ret <4 x float> %shuffle6 +} + +define <4 x float> @combine_blend_123(<4 x float> %a, <4 x float> %b) { +; SSE2-LABEL: combine_blend_123: +; SSE2: # BB#0: +; SSE2-NEXT: movaps %xmm1, %xmm2 +; SSE2-NEXT: movss %xmm0, %xmm2 +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3] +; SSE2-NEXT: movaps %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: combine_blend_123: +; SSSE3: # BB#0: +; SSSE3-NEXT: movaps %xmm1, %xmm2 +; SSSE3-NEXT: movss %xmm0, %xmm2 +; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3] +; SSSE3-NEXT: movaps %xmm2, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: combine_blend_123: +; SSE41: # BB#0: +; SSE41-NEXT: movss %xmm0, %xmm1 +; SSE41-NEXT: movaps %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: combine_blend_123: +; AVX: # BB#0: +; AVX-NEXT: vmovss %xmm0, %xmm1, %xmm0 +; AVX-NEXT: retq + %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> + %shuffle6 = shufflevector <4 x float> %shuffle, <4 x float> %b, <4 x i32> + %shuffle12 = shufflevector <4 x float> %shuffle6, <4 x float> %b, <4 x i32> + ret <4 x float> %shuffle12 +} + +define <4 x i32> @combine_test_movhl_1(<4 x i32> %a, <4 x i32> %b) { +; SSE-LABEL: combine_test_movhl_1: +; SSE: # BB#0: +; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; SSE-NEXT: retq +; +; AVX-LABEL: combine_test_movhl_1: +; AVX: # BB#0: +; AVX-NEXT: vmovhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX-NEXT: retq + %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> + %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> + ret <4 x i32> %2 +} + +define <4 x i32> @combine_test_movhl_2(<4 x i32> %a, <4 x i32> %b) { +; SSE-LABEL: combine_test_movhl_2: +; SSE: # BB#0: +; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; SSE-NEXT: retq +; +; AVX-LABEL: combine_test_movhl_2: +; AVX: # BB#0: +; AVX-NEXT: vmovhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX-NEXT: retq + %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> + %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> + ret <4 x i32> %2 +} + +define <4 x i32> @combine_test_movhl_3(<4 x i32> %a, <4 x i32> %b) { +; SSE-LABEL: combine_test_movhl_3: +; SSE: # BB#0: +; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; SSE-NEXT: retq +; +; AVX-LABEL: combine_test_movhl_3: +; AVX: # BB#0: +; AVX-NEXT: vmovhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX-NEXT: retq + %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> + %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> + ret <4 x i32> %2 +}