I suggested this change in D7898 (http://llvm.org/viewvc/llvm-project?view=revision&revision=231354)
It improves the v4i64 case, although not optimally. This AVX codegen:
vmovq {{.*#+}} xmm0 = mem[0],zero
vxorpd %ymm1, %ymm1, %ymm1
vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
Becomes:
vmovsd {{.*#+}} xmm0 = mem[0],zero
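The IR pattern in question inserts a scalar into element 0 and zeroes the remaining elements, as in the insert_mem_and_zero_v4i64 test updated below:

define <4 x i64> @insert_mem_and_zero_v4i64(i64* %ptr) {
  %a = load i64, i64* %ptr
  %v = insertelement <4 x i64> undef, i64 %a, i64 0
  %shuffle = shufflevector <4 x i64> %v, <4 x i64> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
  ret <4 x i64> %shuffle
}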
Unfortunately, this doesn't completely solve PR22685. There are still at least two underlying problems:
1. We're not handling v32i8 / v16i16 (see the sketch after this list).
2. We're not getting the FP / int domains right for instruction selection (e.g., the AVX2 v4i64 case below now uses the FP-domain vmovsd where an integer-domain vmovq load would do).
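As an illustration of the first problem, a reduced case along these lines (hypothetical; the function name and exact mask are mine, not taken from the test suite) still misses the single-instruction lowering for the integer word type:

define <16 x i16> @insert_and_zero_v16i16(i16 %t) {
  ; Insert %t into element 0; all other elements are zero.
  %v = insertelement <16 x i16> undef, i16 %t, i32 0
  %shuffle = shufflevector <16 x i16> %v, <16 x i16> zeroinitializer, <16 x i32> <i32 0, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  ret <16 x i16> %shuffle
}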
But since this patch alone appears to do no harm, reduces code duplication, and helps the v4i64 case, I'm submitting it ahead of fixing the problems above.
Differential Revision: http://reviews.llvm.org/D8341
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@233704 91177308-0d34-0410-b5e6-96231b3b80d8
  if (isShuffleEquivalent(V1, V2, Mask, {5, 1, 7, 3}))
    return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4f64, V2, V1);
-  // If we have a single input to the zero element, insert that into V1 if we
-  // can do so cheaply.
-  int NumV2Elements =
-      std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; });
-  if (NumV2Elements == 1 && Mask[0] >= 4)
-    if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
-            DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
-      return Insertion;
-
  if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask,
                                                Subtarget, DAG))
    return Blend;
  ArrayRef<int> Mask = SVOp->getMask();
  assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
-  // If we have a single input to the zero element, insert that into V1 if we
-  // can do so cheaply.
-  int NumV2Elements =
-      std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 8; });
-  if (NumV2Elements == 1 && Mask[0] >= 8)
-    if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
-            DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
-      return Insertion;
-
  if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask,
                                                Subtarget, DAG))
    return Blend;
  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
  ArrayRef<int> Mask = SVOp->getMask();
+  // If we have a single input to the zero element, insert that into V1 if we
+  // can do so cheaply.
+  int NumElts = VT.getVectorNumElements();
+  int NumV2Elements = std::count_if(Mask.begin(), Mask.end(), [NumElts](int M) {
+    return M >= NumElts;
+  });
+
+  if (NumV2Elements == 1 && Mask[0] >= NumElts)
+    if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
+            DL, VT, V1, V2, Mask, Subtarget, DAG))
+      return Insertion;
+
  // There is a really nice hard cut-over between AVX1 and AVX2 that means we can
  // check for those subtargets here and avoid much of the subtarget querying in
  // the per-vector-type lowering routines. With AVX1 we have essentially *zero*
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f80:128:128-v64:64:64-v128:128:128-a0:0:64-f80:32:32-n8:16:32-S32"
target triple = "i686-pc-win32"
-;CHECK-LABEL: bad_cast:
+; CHECK-LABEL: bad_cast:
define void @bad_cast() {
entry:
%vext.i = shufflevector <2 x i64> undef, <2 x i64> undef, <3 x i32> <i32 0, i32 1, i32 undef>
%vecinit8.i = shufflevector <3 x i64> zeroinitializer, <3 x i64> %vext.i, <3 x i32> <i32 0, i32 3, i32 4>
store <3 x i64> %vecinit8.i, <3 x i64>* undef, align 32
-;CHECK: ret
+; CHECK: ret
ret void
}
-;CHECK-LABEL: bad_insert:
+; CHECK-LABEL: bad_insert:
define void @bad_insert(i32 %t) {
entry:
-;CHECK: vxorps %ymm1, %ymm1, %ymm1
-;CHECK-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7]
+; CHECK: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: vmovaps %ymm0
+; CHECK: ret
+
%v2 = insertelement <8 x i32> zeroinitializer, i32 %t, i32 0
store <8 x i32> %v2, <8 x i32> addrspace(1)* undef, align 32
-;CHECK: ret
ret void
}
; AVX1-LABEL: insert_reg_and_zero_v4i64:
; AVX1: # BB#0:
; AVX1-NEXT: vmovq %rdi, %xmm0
-; AVX1-NEXT: vxorpd %ymm1, %ymm1, %ymm1
-; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
; AVX1-NEXT: retq
;
; AVX2-LABEL: insert_reg_and_zero_v4i64:
; AVX2: # BB#0:
; AVX2-NEXT: vmovq %rdi, %xmm0
-; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
; AVX2-NEXT: retq
%v = insertelement <4 x i64> undef, i64 %a, i64 0
%shuffle = shufflevector <4 x i64> %v, <4 x i64> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
define <4 x i64> @insert_mem_and_zero_v4i64(i64* %ptr) {
; AVX1-LABEL: insert_mem_and_zero_v4i64:
; AVX1: # BB#0:
-; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
-; AVX1-NEXT: vxorpd %ymm1, %ymm1, %ymm1
-; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
+; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX1-NEXT: retq
;
; AVX2-LABEL: insert_mem_and_zero_v4i64:
; AVX2: # BB#0:
-; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
-; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
+; AVX2-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX2-NEXT: retq
%a = load i64, i64* %ptr
%v = insertelement <4 x i64> undef, i64 %a, i64 0
; AVX2: # BB#0:
; AVX2-NEXT: movl $7, %eax
; AVX2-NEXT: vmovd %eax, %xmm1
-; AVX2-NEXT: vpxor %ymm2, %ymm2, %ymm2
-; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3,4,5,6,7]
+; AVX2-NEXT: vxorps %ymm2, %ymm2, %ymm2
+; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3,4,5,6,7]
; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
; AVX2: # BB#0:
; AVX2-NEXT: movl $7, %eax
; AVX2-NEXT: vmovd %eax, %xmm1
-; AVX2-NEXT: vpxor %ymm2, %ymm2, %ymm2
-; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3,4,5,6,7]
+; AVX2-NEXT: vxorps %ymm2, %ymm2, %ymm2
+; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3,4,5,6,7]
; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>