From 29720a4bad5f6ca271843f31cf3f03865ccb73cb Mon Sep 17 00:00:00 2001 From: Chandler Carruth Date: Sun, 21 Sep 2014 11:17:55 +0000 Subject: [PATCH] [x86] Teach the new vector shuffle lowering of v4f64 to prefer a direct VBLENDPD over using VSHUFPD. While the 256-bit variant of VBLENDPD slows down to the same speed as VSHUFPD on Sandy Bridge CPUs, it has twice the reciprocal throughput on Ivy Bridge CPUs much like it does everywhere for 128-bits. There isn't a downside, so just eagerly use this instruction when it suffices. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@218208 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/X86/X86ISelLowering.cpp | 5 +++++ test/CodeGen/X86/vector-shuffle-256-v4.ll | 4 ++-- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 36c8a95c105..a3871339a49 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -7237,6 +7237,7 @@ static SDValue lowerVectorShuffleAsBlend(SDLoc DL, MVT VT, SDValue V1, switch (VT.SimpleTy) { case MVT::v2f64: case MVT::v4f32: + case MVT::v4f64: return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2, DAG.getConstant(BlendMask, MVT::i8)); @@ -9229,6 +9230,10 @@ static SDValue lowerV4F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, if (isShuffleEquivalent(Mask, 5, 1, 7, 3)) return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4f64, V2, V1); + if (SDValue Blend = + lowerVectorShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask, DAG)) + return Blend; + // Check if the blend happens to exactly fit that of SHUFPD. 
if (Mask[0] < 4 && (Mask[1] == -1 || Mask[1] >= 4) && Mask[2] < 4 && (Mask[3] == -1 || Mask[3] >= 4)) { diff --git a/test/CodeGen/X86/vector-shuffle-256-v4.ll b/test/CodeGen/X86/vector-shuffle-256-v4.ll index b2338217407..56bbce61ae6 100644 --- a/test/CodeGen/X86/vector-shuffle-256-v4.ll +++ b/test/CodeGen/X86/vector-shuffle-256-v4.ll @@ -359,7 +359,7 @@ define <4 x double> @shuffle_v4f64_5163(<4 x double> %a, <4 x double> %b) { define <4 x double> @shuffle_v4f64_0527(<4 x double> %a, <4 x double> %b) { ; ALL-LABEL: @shuffle_v4f64_0527 ; ALL: # BB#0: -; ALL-NEXT: vshufpd {{.*}} # ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3] +; ALL-NEXT: vblendpd {{.*}} # ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3] ; ALL-NEXT: retq %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7> ret <4 x double> %shuffle @@ -368,7 +368,7 @@ define <4 x double> @shuffle_v4f64_0527(<4 x double> %a, <4 x double> %b) { define <4 x double> @shuffle_v4f64_4163(<4 x double> %a, <4 x double> %b) { ; ALL-LABEL: @shuffle_v4f64_4163 ; ALL: # BB#0: -; ALL-NEXT: vshufpd {{.*}} # ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3] +; ALL-NEXT: vblendpd {{.*}} # ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3] ; ALL-NEXT: retq %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3> ret <4 x double> %shuffle -- 2.34.1