From f159de96bd89d69dd9b9bb61639dbd9c1ee24415 Mon Sep 17 00:00:00 2001 From: Chandler Carruth Date: Fri, 3 Oct 2014 22:43:17 +0000 Subject: [PATCH] [x86] Add a really preposterous number of patterns for matching all of the various ways in which blends can be used to do vector element insertion for lowering with the scalar math instruction forms that effectively re-blend with the high elements after performing the operation. This then allows me to bail on the element insertion lowering path when we have SSE4.1 and are going to be doing a normal blend, which in turn restores the last of the blends lost from the new vector shuffle lowering when I got it to prioritize insertion in other cases (for example when we don't *have* a blend instruction). Without the patterns, using blends here would have regressed sse-scalar-fp-arith.ll *completely* with the new vector shuffle lowering. For completeness, I've added RUN-lines with the new lowering here. This is somewhat superfluous as I'm about to flip the default, but hey, it shows that this actually significantly changed behavior. The patterns I've added are just ridiculously repetitive. Suggestions on making them better are very much welcome. In particular, handling the commuted form of the v2f64 patterns is somewhat obnoxious. 
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@219033 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/X86/X86ISelLowering.cpp | 5 + lib/Target/X86/X86InstrSSE.td | 194 +++++++++++++++++++++- test/CodeGen/X86/sse-scalar-fp-arith.ll | 3 + test/CodeGen/X86/vector-shuffle-128-v2.ll | 88 ++++++++-- test/CodeGen/X86/vector-shuffle-256-v4.ll | 8 +- test/CodeGen/X86/vector-shuffle-512-v8.ll | 26 +-- 6 files changed, 284 insertions(+), 40 deletions(-) diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index b6d134ff0fb..fa48b23ec0c 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -7830,6 +7830,11 @@ static SDValue lowerVectorShuffleAsElementInsertion( V1Mask[V2Index] = -1; if (!isNoopShuffleMask(V1Mask)) return SDValue(); + // This is essentially a special case blend operation, but if we have + // general purpose blend operations, they are always faster. Bail and let + // the rest of the lowering handle these as blends. + if (Subtarget->hasSSE41()) + return SDValue(); // Otherwise, use MOVSD or MOVSS. assert((EltVT == MVT::f32 || EltVT == MVT::f64) && diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td index a2d97456405..03ce09f5f07 100644 --- a/lib/Target/X86/X86InstrSSE.td +++ b/lib/Target/X86/X86InstrSSE.td @@ -3125,7 +3125,6 @@ let Predicates = [UseSSE1] in { let Predicates = [UseSSE2] in { // SSE2 patterns to select scalar double-precision fp arithmetic instructions - def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fadd (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), FR64:$src))))), @@ -3145,10 +3144,10 @@ let Predicates = [UseSSE2] in { } let Predicates = [UseSSE41] in { - // If the subtarget has SSE4.1 but not AVX, the vector insert - // instruction is lowered into a X86insertps rather than a X86Movss. 
- // When selecting SSE scalar single-precision fp arithmetic instructions, - // make sure that we correctly match the X86insertps. + // If the subtarget has SSE4.1 but not AVX, the vector insert instruction is + // lowered into a X86insertps or a X86Blendi rather than a X86Movss. When + // selecting SSE scalar single-precision fp arithmetic instructions, make + // sure that we correctly match them. def : Pat<(v4f32 (X86insertps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fadd (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), @@ -3166,6 +3165,57 @@ let Predicates = [UseSSE41] in { (fdiv (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), FR32:$src))), (iPTR 0))), (DIVSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; + + def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fadd + (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), + FR32:$src))), (i8 1))), + (ADDSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; + def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fsub + (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), + FR32:$src))), (i8 1))), + (SUBSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; + def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fmul + (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), + FR32:$src))), (i8 1))), + (MULSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; + def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fdiv + (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), + FR32:$src))), (i8 1))), + (DIVSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; + + def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fadd + (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), + FR64:$src))), (i8 1))), + (ADDSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; + def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fsub + (f64 (vector_extract (v2f64 
VR128:$dst), (iPTR 0))), + FR64:$src))), (i8 1))), + (SUBSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; + def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fmul + (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), + FR64:$src))), (i8 1))), + (MULSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; + def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fdiv + (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), + FR64:$src))), (i8 1))), + (DIVSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; + + def : Pat<(v2f64 (X86Blendi (v2f64 (scalar_to_vector (fadd + (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), + FR64:$src))), (v2f64 VR128:$dst), (i8 2))), + (ADDSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; + def : Pat<(v2f64 (X86Blendi (v2f64 (scalar_to_vector (fsub + (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), + FR64:$src))), (v2f64 VR128:$dst), (i8 2))), + (SUBSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; + def : Pat<(v2f64 (X86Blendi (v2f64 (scalar_to_vector (fmul + (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), + FR64:$src))), (v2f64 VR128:$dst), (i8 2))), + (MULSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; + def : Pat<(v2f64 (X86Blendi (v2f64 (scalar_to_vector (fdiv + (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), + FR64:$src))), (v2f64 VR128:$dst), (i8 2))), + (DIVSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; } let Predicates = [HasAVX] in { @@ -3204,6 +3254,57 @@ let Predicates = [HasAVX] in { (fdiv (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), FR32:$src))), (iPTR 0))), (VDIVSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; + + def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fadd + (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), + FR32:$src))), (i8 1))), + (VADDSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; + def : Pat<(v4f32 (X86Blendi (v4f32 
VR128:$dst), (v4f32 (scalar_to_vector (fsub + (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), + FR32:$src))), (i8 1))), + (VSUBSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; + def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fmul + (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), + FR32:$src))), (i8 1))), + (VMULSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; + def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fdiv + (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), + FR32:$src))), (i8 1))), + (VDIVSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; + + def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fadd + (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), + FR64:$src))), (i8 1))), + (VADDSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; + def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fsub + (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), + FR64:$src))), (i8 1))), + (VSUBSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; + def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fmul + (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), + FR64:$src))), (i8 1))), + (VMULSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; + def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fdiv + (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), + FR64:$src))), (i8 1))), + (VDIVSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; + + def : Pat<(v2f64 (X86Blendi (v2f64 (scalar_to_vector (fadd + (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), + FR64:$src))), (v2f64 VR128:$dst), (i8 2))), + (VADDSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; + def : Pat<(v2f64 (X86Blendi (v2f64 (scalar_to_vector (fsub + (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), + FR64:$src))), (v2f64 VR128:$dst), (i8 2))), + (VSUBSDrr_Int v2f64:$dst, 
(COPY_TO_REGCLASS FR64:$src, VR128))>; + def : Pat<(v2f64 (X86Blendi (v2f64 (scalar_to_vector (fmul + (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), + FR64:$src))), (v2f64 VR128:$dst), (i8 2))), + (VMULSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; + def : Pat<(v2f64 (X86Blendi (v2f64 (scalar_to_vector (fdiv + (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), + FR64:$src))), (v2f64 VR128:$dst), (i8 2))), + (VDIVSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; } // Patterns used to select SSE scalar fp arithmetic instructions from @@ -3258,6 +3359,49 @@ let Predicates = [UseSSE2] in { (DIVSDrr_Int v2f64:$dst, v2f64:$src)>; } +let Predicates = [UseSSE41] in { + // With SSE4.1 we may see these operations using X86Blendi rather than + // X86Movs{s,d}. + def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), + (fadd (v4f32 VR128:$dst), (v4f32 VR128:$src)), (i8 1))), + (ADDSSrr_Int v4f32:$dst, v4f32:$src)>; + def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), + (fsub (v4f32 VR128:$dst), (v4f32 VR128:$src)), (i8 1))), + (SUBSSrr_Int v4f32:$dst, v4f32:$src)>; + def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), + (fmul (v4f32 VR128:$dst), (v4f32 VR128:$src)), (i8 1))), + (MULSSrr_Int v4f32:$dst, v4f32:$src)>; + def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), + (fdiv (v4f32 VR128:$dst), (v4f32 VR128:$src)), (i8 1))), + (DIVSSrr_Int v4f32:$dst, v4f32:$src)>; + + def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), + (fadd (v2f64 VR128:$dst), (v2f64 VR128:$src)), (i8 1))), + (ADDSDrr_Int v2f64:$dst, v2f64:$src)>; + def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), + (fsub (v2f64 VR128:$dst), (v2f64 VR128:$src)), (i8 1))), + (SUBSDrr_Int v2f64:$dst, v2f64:$src)>; + def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), + (fmul (v2f64 VR128:$dst), (v2f64 VR128:$src)), (i8 1))), + (MULSDrr_Int v2f64:$dst, v2f64:$src)>; + def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), + (fdiv (v2f64 VR128:$dst), (v2f64 VR128:$src)), (i8 1))), + (DIVSDrr_Int v2f64:$dst, v2f64:$src)>; + + 
def : Pat<(v2f64 (X86Blendi (fadd (v2f64 VR128:$dst), (v2f64 VR128:$src)), + (v2f64 VR128:$dst), (i8 2))), + (ADDSDrr_Int v2f64:$dst, v2f64:$src)>; + def : Pat<(v2f64 (X86Blendi (fsub (v2f64 VR128:$dst), (v2f64 VR128:$src)), + (v2f64 VR128:$dst), (i8 2))), + (SUBSDrr_Int v2f64:$dst, v2f64:$src)>; + def : Pat<(v2f64 (X86Blendi (fmul (v2f64 VR128:$dst), (v2f64 VR128:$src)), + (v2f64 VR128:$dst), (i8 2))), + (MULSDrr_Int v2f64:$dst, v2f64:$src)>; + def : Pat<(v2f64 (X86Blendi (fdiv (v2f64 VR128:$dst), (v2f64 VR128:$src)), + (v2f64 VR128:$dst), (i8 2))), + (DIVSDrr_Int v2f64:$dst, v2f64:$src)>; +} + let Predicates = [HasAVX] in { // The following patterns select AVX Scalar single/double precision fp // arithmetic instructions from a packed single precision fp instruction @@ -3287,6 +3431,46 @@ let Predicates = [HasAVX] in { def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), (fdiv (v2f64 VR128:$dst), (v2f64 VR128:$src)))), (VDIVSDrr_Int v2f64:$dst, v2f64:$src)>; + + // Also handle X86Blendi-based patterns. 
+ def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), + (fadd (v4f32 VR128:$dst), (v4f32 VR128:$src)), (i8 1))), + (VADDSSrr_Int v4f32:$dst, v4f32:$src)>; + def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), + (fsub (v4f32 VR128:$dst), (v4f32 VR128:$src)), (i8 1))), + (VSUBSSrr_Int v4f32:$dst, v4f32:$src)>; + def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), + (fmul (v4f32 VR128:$dst), (v4f32 VR128:$src)), (i8 1))), + (VMULSSrr_Int v4f32:$dst, v4f32:$src)>; + def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), + (fdiv (v4f32 VR128:$dst), (v4f32 VR128:$src)), (i8 1))), + (VDIVSSrr_Int v4f32:$dst, v4f32:$src)>; + + def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), + (fadd (v2f64 VR128:$dst), (v2f64 VR128:$src)), (i8 1))), + (VADDSDrr_Int v2f64:$dst, v2f64:$src)>; + def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), + (fsub (v2f64 VR128:$dst), (v2f64 VR128:$src)), (i8 1))), + (VSUBSDrr_Int v2f64:$dst, v2f64:$src)>; + def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), + (fmul (v2f64 VR128:$dst), (v2f64 VR128:$src)), (i8 1))), + (VMULSDrr_Int v2f64:$dst, v2f64:$src)>; + def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), + (fdiv (v2f64 VR128:$dst), (v2f64 VR128:$src)), (i8 1))), + (VDIVSDrr_Int v2f64:$dst, v2f64:$src)>; + + def : Pat<(v2f64 (X86Blendi (fadd (v2f64 VR128:$dst), (v2f64 VR128:$src)), + (v2f64 VR128:$dst), (i8 2))), + (VADDSDrr_Int v2f64:$dst, v2f64:$src)>; + def : Pat<(v2f64 (X86Blendi (fsub (v2f64 VR128:$dst), (v2f64 VR128:$src)), + (v2f64 VR128:$dst), (i8 2))), + (VSUBSDrr_Int v2f64:$dst, v2f64:$src)>; + def : Pat<(v2f64 (X86Blendi (fmul (v2f64 VR128:$dst), (v2f64 VR128:$src)), + (v2f64 VR128:$dst), (i8 2))), + (VMULSDrr_Int v2f64:$dst, v2f64:$src)>; + def : Pat<(v2f64 (X86Blendi (fdiv (v2f64 VR128:$dst), (v2f64 VR128:$src)), + (v2f64 VR128:$dst), (i8 2))), + (VDIVSDrr_Int v2f64:$dst, v2f64:$src)>; } /// Unop Arithmetic diff --git a/test/CodeGen/X86/sse-scalar-fp-arith.ll b/test/CodeGen/X86/sse-scalar-fp-arith.ll index b122ef67544..415a4f12b2c 100644 --- 
a/test/CodeGen/X86/sse-scalar-fp-arith.ll +++ b/test/CodeGen/X86/sse-scalar-fp-arith.ll @@ -1,6 +1,9 @@ ; RUN: llc -mcpu=x86-64 -mattr=+sse2 < %s | FileCheck --check-prefix=SSE --check-prefix=SSE2 %s +; RUN: llc -mcpu=x86-64 -mattr=+sse2 < %s -x86-experimental-vector-shuffle-lowering | FileCheck --check-prefix=SSE --check-prefix=SSE2 %s ; RUN: llc -mcpu=x86-64 -mattr=+sse4.1 < %s | FileCheck --check-prefix=SSE --check-prefix=SSE41 %s +; RUN: llc -mcpu=x86-64 -mattr=+sse4.1 < %s -x86-experimental-vector-shuffle-lowering | FileCheck --check-prefix=SSE --check-prefix=SSE41 %s ; RUN: llc -mcpu=x86-64 -mattr=+avx < %s | FileCheck --check-prefix=AVX %s +; RUN: llc -mcpu=x86-64 -mattr=+avx < %s -x86-experimental-vector-shuffle-lowering | FileCheck --check-prefix=AVX %s target triple = "x86_64-unknown-unknown" diff --git a/test/CodeGen/X86/vector-shuffle-128-v2.ll b/test/CodeGen/X86/vector-shuffle-128-v2.ll index aa837f15e57..59041367bba 100644 --- a/test/CodeGen/X86/vector-shuffle-128-v2.ll +++ b/test/CodeGen/X86/vector-shuffle-128-v2.ll @@ -211,28 +211,61 @@ define <2 x double> @shuffle_v2f64_33(<2 x double> %a, <2 x double> %b) { ret <2 x double> %shuffle } define <2 x double> @shuffle_v2f64_03(<2 x double> %a, <2 x double> %b) { -; SSE-LABEL: shuffle_v2f64_03: -; SSE: # BB#0: -; SSE-NEXT: movsd %xmm0, %xmm1 -; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: retq +; SSE2-LABEL: shuffle_v2f64_03: +; SSE2: # BB#0: +; SSE2-NEXT: movsd %xmm0, %xmm1 +; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE3-LABEL: shuffle_v2f64_03: +; SSE3: # BB#0: +; SSE3-NEXT: movsd %xmm0, %xmm1 +; SSE3-NEXT: movaps %xmm1, %xmm0 +; SSE3-NEXT: retq +; +; SSSE3-LABEL: shuffle_v2f64_03: +; SSSE3: # BB#0: +; SSSE3-NEXT: movsd %xmm0, %xmm1 +; SSSE3-NEXT: movaps %xmm1, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v2f64_03: +; SSE41: # BB#0: +; SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1] +; SSE41-NEXT: retq ; ; AVX-LABEL: shuffle_v2f64_03: ; AVX: # BB#0: -; AVX-NEXT: vmovsd %xmm0, 
%xmm1, %xmm0 +; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1] ; AVX-NEXT: retq %shuffle = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> ret <2 x double> %shuffle } define <2 x double> @shuffle_v2f64_21(<2 x double> %a, <2 x double> %b) { -; SSE-LABEL: shuffle_v2f64_21: -; SSE: # BB#0: -; SSE-NEXT: movsd %xmm1, %xmm0 -; SSE-NEXT: retq +; SSE2-LABEL: shuffle_v2f64_21: +; SSE2: # BB#0: +; SSE2-NEXT: movsd %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE3-LABEL: shuffle_v2f64_21: +; SSE3: # BB#0: +; SSE3-NEXT: movsd %xmm1, %xmm0 +; SSE3-NEXT: retq +; +; SSSE3-LABEL: shuffle_v2f64_21: +; SSSE3: # BB#0: +; SSSE3-NEXT: movsd %xmm1, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v2f64_21: +; SSE41: # BB#0: +; SSE41-NEXT: blendpd {{.*#+}} xmm1 = xmm1[0],xmm0[1] +; SSE41-NEXT: movapd %xmm1, %xmm0 +; SSE41-NEXT: retq ; ; AVX-LABEL: shuffle_v2f64_21: ; AVX: # BB#0: -; AVX-NEXT: vmovsd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; AVX-NEXT: retq %shuffle = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> ret <2 x double> %shuffle @@ -753,16 +786,35 @@ define <2 x double> @shuffle_v2f64_z0(<2 x double> %a) { } define <2 x double> @shuffle_v2f64_z1(<2 x double> %a) { -; SSE-LABEL: shuffle_v2f64_z1: -; SSE: # BB#0: -; SSE-NEXT: xorps %xmm1, %xmm1 -; SSE-NEXT: movsd %xmm1, %xmm0 -; SSE-NEXT: retq +; SSE2-LABEL: shuffle_v2f64_z1: +; SSE2: # BB#0: +; SSE2-NEXT: xorps %xmm1, %xmm1 +; SSE2-NEXT: movsd %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE3-LABEL: shuffle_v2f64_z1: +; SSE3: # BB#0: +; SSE3-NEXT: xorps %xmm1, %xmm1 +; SSE3-NEXT: movsd %xmm1, %xmm0 +; SSE3-NEXT: retq +; +; SSSE3-LABEL: shuffle_v2f64_z1: +; SSSE3: # BB#0: +; SSSE3-NEXT: xorps %xmm1, %xmm1 +; SSSE3-NEXT: movsd %xmm1, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v2f64_z1: +; SSE41: # BB#0: +; SSE41-NEXT: xorpd %xmm1, %xmm1 +; SSE41-NEXT: blendpd {{.*#+}} xmm1 = xmm1[0],xmm0[1] +; SSE41-NEXT: movapd %xmm1, %xmm0 +; SSE41-NEXT: retq ; ; AVX-LABEL: 
shuffle_v2f64_z1: ; AVX: # BB#0: -; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vmovsd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vxorpd %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; AVX-NEXT: retq %shuffle = shufflevector <2 x double> %a, <2 x double> zeroinitializer, <2 x i32> ret <2 x double> %shuffle diff --git a/test/CodeGen/X86/vector-shuffle-256-v4.ll b/test/CodeGen/X86/vector-shuffle-256-v4.ll index 32ee62fa985..7899a52a741 100644 --- a/test/CodeGen/X86/vector-shuffle-256-v4.ll +++ b/test/CodeGen/X86/vector-shuffle-256-v4.ll @@ -55,7 +55,7 @@ define <4 x double> @shuffle_v4f64_0300(<4 x double> %a, <4 x double> %b) { ; AVX1: # BB#0: ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX1-NEXT: vpermilpd {{.*#+}} ymm1 = ymm1[0,1,2,2] -; AVX1-NEXT: vmovsd %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3] ; AVX1-NEXT: retq ; ; AVX2-LABEL: shuffle_v4f64_0300: @@ -382,7 +382,7 @@ define <4 x i64> @shuffle_v4i64_0300(<4 x i64> %a, <4 x i64> %b) { ; AVX1: # BB#0: ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX1-NEXT: vpermilpd {{.*#+}} ymm1 = ymm1[0,1,2,2] -; AVX1-NEXT: vmovsd %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3] ; AVX1-NEXT: retq ; ; AVX2-LABEL: shuffle_v4i64_0300: @@ -518,7 +518,7 @@ define <4 x i64> @shuffle_v4i64_4012(<4 x i64> %a, <4 x i64> %b) { ; AVX1-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1],xmm2[0] ; AVX1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0,0] ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-NEXT: vmovsd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3] ; AVX1-NEXT: retq ; ; AVX2-LABEL: shuffle_v4i64_4012: @@ -654,7 +654,7 @@ define <4 x i64> @stress_test1(<4 x i64> %a, <4 x i64> %b) { ; AVX1-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,2,2] -; AVX1-NEXT: vmovsd %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vblendpd {{.*#+}} 
ymm0 = ymm0[0],ymm1[1,2,3] ; AVX1-NEXT: retq ; ; AVX2-LABEL: stress_test1: diff --git a/test/CodeGen/X86/vector-shuffle-512-v8.ll b/test/CodeGen/X86/vector-shuffle-512-v8.ll index 662b9832611..2f02f2fc08f 100644 --- a/test/CodeGen/X86/vector-shuffle-512-v8.ll +++ b/test/CodeGen/X86/vector-shuffle-512-v8.ll @@ -91,7 +91,7 @@ define <8 x double> @shuffle_v8f64_70000000(<8 x double> %a, <8 x double> %b) { ; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm1 ; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,1,2,3] ; ALL-NEXT: vbroadcastsd %xmm0, %ymm0 -; ALL-NEXT: vmovsd %xmm1, %xmm0, %xmm1 +; ALL-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0],ymm0[1,2,3] ; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> @@ -275,12 +275,12 @@ define <8 x double> @shuffle_v8f64_08192a3b(<8 x double> %a, <8 x double> %b) { define <8 x double> @shuffle_v8f64_08991abb(<8 x double> %a, <8 x double> %b) { ; ALL-LABEL: shuffle_v8f64_08991abb: ; ALL: # BB#0: -; ALL-NEXT: vpermpd {{.*#+}} ymm2 = ymm1[0,0,1,1] -; ALL-NEXT: vmovsd %xmm0, %xmm2, %xmm2 -; ALL-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,2,2] -; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3] -; ALL-NEXT: vmovsd %xmm0, %xmm1, %xmm0 -; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm2, %zmm0 +; ALL-NEXT: vpermilpd {{.*#+}} ymm2 = ymm0[1,0,2,2] +; ALL-NEXT: vpermpd {{.*#+}} ymm3 = ymm1[0,2,3,3] +; ALL-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0],ymm3[1,2,3] +; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,0,1,1] +; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3] +; ALL-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle @@ -303,11 +303,11 @@ define <8 x double> @shuffle_v8f64_091b2d3f(<8 x double> %a, <8 x double> %b) { define <8 x double> @shuffle_v8f64_09ab1def(<8 x double> %a, <8 x double> %b) { ; ALL-LABEL: shuffle_v8f64_09ab1def: ; ALL: # BB#0: -; ALL-NEXT: vmovsd %xmm0, %xmm1, %xmm2 -; 
ALL-NEXT: vextractf64x4 $1, %zmm1, %ymm1 -; ALL-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,2,2] -; ALL-NEXT: vmovsd %xmm0, %xmm1, %xmm0 -; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm2, %zmm0 +; ALL-NEXT: vextractf64x4 $1, %zmm1, %ymm2 +; ALL-NEXT: vpermilpd {{.*#+}} ymm3 = ymm0[1,0,2,2] +; ALL-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0],ymm2[1,2,3] +; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3] +; ALL-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> ret <8 x double> %shuffle @@ -721,7 +721,7 @@ define <8 x double> @shuffle_v8f64_f511235a(<8 x double> %a, <8 x double> %b) { ; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3] ; ALL-NEXT: vextractf64x4 $1, %zmm1, %ymm1 ; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,1,2,3] -; ALL-NEXT: vmovsd %xmm1, %xmm0, %xmm0 +; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3] ; ALL-NEXT: vinsertf64x4 $1, %ymm3, %zmm0, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> -- 2.34.1