From 7ea151449d5d932c92aa09b917e7e5801455947d Mon Sep 17 00:00:00 2001
From: Sanjay Patel
Date: Tue, 31 Mar 2015 16:32:11 +0000
Subject: [PATCH] [X86, AVX] try to lowerVectorShuffleAsElementInsertion() for all 256-bit vector sub-types

I suggested this change in D7898
(http://llvm.org/viewvc/llvm-project?view=revision&revision=231354).

It improves the v4i64 case, although not optimally. This AVX codegen:

  vmovq    {{.*#+}} xmm0 = mem[0],zero
  vxorpd   %ymm1, %ymm1, %ymm1
  vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]

Becomes:

  vmovsd   {{.*#+}} xmm0 = mem[0],zero

Unfortunately, this doesn't completely solve PR22685. There are still at least
two problems here:

  1. We're not handling v32i8 / v16i16.
  2. We're not getting the FP / int domains right for instruction selection.

But since this patch alone appears to do no harm, reduces code duplication, and
helps v4i64, I'm submitting it ahead of fixing the above.

Differential Revision: http://reviews.llvm.org/D8341

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@233704 91177308-0d34-0410-b5e6-96231b3b80d8
---
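As a standalone illustration of the check this patch hoists into
lower256BitVectorShuffle(): a shuffle qualifies for
lowerVectorShuffleAsElementInsertion() when exactly one mask element selects
from V2 and that element lands in lane 0 of the result. The minimal C++ sketch
below mirrors that mask test outside the LLVM tree; the helper name and the
plain std::vector mask are illustrative stand-ins, not LLVM API.

  #include <algorithm>
  #include <cassert>
  #include <vector>

  // Returns true when exactly one mask element reads from the second input
  // vector (indices >= NumElts) and that element is lane 0 of the result.
  // This is the same condition the patch tests before calling
  // lowerVectorShuffleAsElementInsertion().
  static bool isSingleV2ElementInsertionAtLaneZero(const std::vector<int> &Mask,
                                                   int NumElts) {
    assert((int)Mask.size() == NumElts && "mask size must match element count");
    int NumV2Elements = std::count_if(Mask.begin(), Mask.end(),
                                      [NumElts](int M) { return M >= NumElts; });
    return NumV2Elements == 1 && Mask[0] >= NumElts;
  }

  int main() {
    // v4f64/v4i64: lane 0 comes from V2, lanes 1-3 from V1 -> qualifies.
    assert(isSingleV2ElementInsertionAtLaneZero({4, 1, 2, 3}, 4));
    // v8f32: same shape at eight elements, no hard-coded count -> qualifies.
    assert(isSingleV2ElementInsertionAtLaneZero({8, 1, 2, 3, 4, 5, 6, 7}, 8));
    // Two lanes read from V2 -> not a single-element insertion.
    assert(!isSingleV2ElementInsertionAtLaneZero({4, 5, 2, 3}, 4));
    return 0;
  }

The only functional change from the per-type copies removed below is that the
hard-coded 4 and 8 element counts become VT.getVectorNumElements(), which is
what lets one copy of the check serve every 256-bit sub-type.
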
 lib/Target/X86/X86ISelLowering.cpp        | 30 +++++++++--------------
 test/CodeGen/X86/2012-1-10-buildvector.ll | 13 +++++-----
 test/CodeGen/X86/vector-shuffle-256-v4.ll | 12 ++-------
 test/CodeGen/X86/vector-shuffle-256-v8.ll |  8 +++---
 4 files changed, 25 insertions(+), 38 deletions(-)

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 16e291b1443..84b71429f33 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -9281,15 +9281,6 @@ static SDValue lowerV4F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
   if (isShuffleEquivalent(V1, V2, Mask, {5, 1, 7, 3}))
     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4f64, V2, V1);

-  // If we have a single input to the zero element, insert that into V1 if we
-  // can do so cheaply.
-  int NumV2Elements =
-      std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; });
-  if (NumV2Elements == 1 && Mask[0] >= 4)
-    if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
-            DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
-      return Insertion;
-
   if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask,
                                                 Subtarget, DAG))
     return Blend;
@@ -9432,15 +9423,6 @@ static SDValue lowerV8F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
   ArrayRef<int> Mask = SVOp->getMask();
   assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");

-  // If we have a single input to the zero element, insert that into V1 if we
-  // can do so cheaply.
-  int NumV2Elements =
-      std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 8; });
-  if (NumV2Elements == 1 && Mask[0] >= 8)
-    if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
-            DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
-      return Insertion;
-
   if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask,
                                                 Subtarget, DAG))
     return Blend;
@@ -9811,6 +9793,18 @@ static SDValue lower256BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2,
   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
   ArrayRef<int> Mask = SVOp->getMask();

+  // If we have a single input to the zero element, insert that into V1 if we
+  // can do so cheaply.
+  int NumElts = VT.getVectorNumElements();
+  int NumV2Elements = std::count_if(Mask.begin(), Mask.end(), [NumElts](int M) {
+    return M >= NumElts;
+  });
+
+  if (NumV2Elements == 1 && Mask[0] >= NumElts)
+    if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
+            DL, VT, V1, V2, Mask, Subtarget, DAG))
+      return Insertion;
+
   // There is a really nice hard cut-over between AVX1 and AVX2 that means we can
   // check for those subtargets here and avoid much of the subtarget querying in
   // the per-vector-type lowering routines. With AVX1 we have essentially *zero*
diff --git a/test/CodeGen/X86/2012-1-10-buildvector.ll b/test/CodeGen/X86/2012-1-10-buildvector.ll
index a9b8cc6c62a..ece28bfb6c5 100644
--- a/test/CodeGen/X86/2012-1-10-buildvector.ll
+++ b/test/CodeGen/X86/2012-1-10-buildvector.ll
@@ -3,25 +3,26 @@
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f80:128:128-v64:64:64-v128:128:128-a0:0:64-f80:32:32-n8:16:32-S32"
 target triple = "i686-pc-win32"

-;CHECK-LABEL: bad_cast:
+; CHECK-LABEL: bad_cast:
 define void @bad_cast() {
 entry:
   %vext.i = shufflevector <2 x i64> undef, <2 x i64> undef, <3 x i32>
   %vecinit8.i = shufflevector <3 x i64> zeroinitializer, <3 x i64> %vext.i, <3 x i32>
   store <3 x i64> %vecinit8.i, <3 x i64>* undef, align 32
-;CHECK: ret
+; CHECK: ret
   ret void
 }

-;CHECK-LABEL: bad_insert:
+; CHECK-LABEL: bad_insert:
 define void @bad_insert(i32 %t) {
 entry:
-;CHECK: vxorps %ymm1, %ymm1, %ymm1
-;CHECK-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7]
+; CHECK: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: vmovaps %ymm0
+; CHECK: ret
+
   %v2 = insertelement <8 x i32> zeroinitializer, i32 %t, i32 0
   store <8 x i32> %v2, <8 x i32> addrspace(1)* undef, align 32
-;CHECK: ret
   ret void
 }
diff --git a/test/CodeGen/X86/vector-shuffle-256-v4.ll b/test/CodeGen/X86/vector-shuffle-256-v4.ll
index 8aca67c0bdb..2c9af163f1e 100644
--- a/test/CodeGen/X86/vector-shuffle-256-v4.ll
+++ b/test/CodeGen/X86/vector-shuffle-256-v4.ll
@@ -813,15 +813,11 @@ define <4 x i64> @insert_reg_and_zero_v4i64(i64 %a) {
 ; AVX1-LABEL: insert_reg_and_zero_v4i64:
 ; AVX1:       # BB#0:
 ; AVX1-NEXT:    vmovq %rdi, %xmm0
-; AVX1-NEXT:    vxorpd %ymm1, %ymm1, %ymm1
-; AVX1-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: insert_reg_and_zero_v4i64:
 ; AVX2:       # BB#0:
 ; AVX2-NEXT:    vmovq %rdi, %xmm0
-; AVX2-NEXT:    vpxor %ymm1, %ymm1, %ymm1
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
 ; AVX2-NEXT:    retq
   %v = insertelement <4 x i64> undef, i64 %a, i64 0
   %shuffle = shufflevector <4 x i64> %v, <4 x i64> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
@@ -831,16 +827,12 @@ define <4 x i64> @insert_reg_and_zero_v4i64(i64 %a) {
 define <4 x i64> @insert_mem_and_zero_v4i64(i64* %ptr) {
 ; AVX1-LABEL: insert_mem_and_zero_v4i64:
 ; AVX1:       # BB#0:
-; AVX1-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
-; AVX1-NEXT:    vxorpd %ymm1, %ymm1, %ymm1
-; AVX1-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
+; AVX1-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: insert_mem_and_zero_v4i64:
 ; AVX2:       # BB#0:
-; AVX2-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
-; AVX2-NEXT:    vpxor %ymm1, %ymm1, %ymm1
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
+; AVX2-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
 ; AVX2-NEXT:    retq
   %a = load i64, i64* %ptr
   %v = insertelement <4 x i64> undef, i64 %a, i64 0
diff --git a/test/CodeGen/X86/vector-shuffle-256-v8.ll b/test/CodeGen/X86/vector-shuffle-256-v8.ll
index 417423a462e..a318e8a17c3 100644
--- a/test/CodeGen/X86/vector-shuffle-256-v8.ll
+++ b/test/CodeGen/X86/vector-shuffle-256-v8.ll
@@ -133,8 +133,8 @@ define <8 x float> @shuffle_v8f32_70000000(<8 x float> %a, <8 x float> %b) {
 ; AVX2:       # BB#0:
 ; AVX2-NEXT:    movl $7, %eax
 ; AVX2-NEXT:    vmovd %eax, %xmm1
-; AVX2-NEXT:    vpxor %ymm2, %ymm2, %ymm2
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3,4,5,6,7]
+; AVX2-NEXT:    vxorps %ymm2, %ymm2, %ymm2
+; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3,4,5,6,7]
 ; AVX2-NEXT:    vpermps %ymm0, %ymm1, %ymm0
 ; AVX2-NEXT:    retq
   %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
@@ -962,8 +962,8 @@ define <8 x i32> @shuffle_v8i32_70000000(<8 x i32> %a, <8 x i32> %b) {
 ; AVX2:       # BB#0:
 ; AVX2-NEXT:    movl $7, %eax
 ; AVX2-NEXT:    vmovd %eax, %xmm1
-; AVX2-NEXT:    vpxor %ymm2, %ymm2, %ymm2
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3,4,5,6,7]
+; AVX2-NEXT:    vxorps %ymm2, %ymm2, %ymm2
+; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3,4,5,6,7]
 ; AVX2-NEXT:    vpermd %ymm0, %ymm1, %ymm0
 ; AVX2-NEXT:    retq
   %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
-- 
2.34.1