From ab18d0e7cbfb2639a68b7282ba1ac7e0a51dbb24 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim
Date: Wed, 18 Mar 2015 22:18:51 +0000
Subject: [PATCH] [X86][SSE] Avoid scalarization of v2i64 vector shifts
 (REAPPLIED)

Fixed broken tests.

Differential Revision: http://reviews.llvm.org/D8416

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@232682 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86ISelLowering.cpp           | 37 +++++++++++++-------
 test/Analysis/CostModel/X86/testshiftlshr.ll | 16 ++++-----
 test/Analysis/CostModel/X86/testshiftshl.ll  | 16 ++++-----
 test/CodeGen/X86/vshift-4.ll                 |  9 +++--
 test/CodeGen/X86/x86-shifts.ll               | 13 ++++---
 5 files changed, 56 insertions(+), 35 deletions(-)

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index f4017da11ab..50c50259334 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -5906,7 +5906,7 @@ static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op,
   return DAG.getNode(ISD::OR, dl, ResVT, V1, V2);
 }
 
-static SDValue LowerCONCAT_VECTORS(SDValue Op, 
+static SDValue LowerCONCAT_VECTORS(SDValue Op,
                                    const X86Subtarget *Subtarget,
                                    SelectionDAG &DAG) {
   MVT VT = Op.getSimpleValueType();
@@ -13255,11 +13255,11 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
       // If we have AVX, we can use a variable vector select (VBLENDV) instead
       // of 3 logic instructions for size savings and potentially speed.
       // Unfortunately, there is no scalar form of VBLENDV.
-      
+
       // If either operand is a constant, don't try this. We can expect to
       // optimize away at least one of the logic instructions later in that
      // case, so that sequence would be faster than a variable blend.
-      
+
       // BLENDV was introduced with SSE 4.1, but the 2 register form implicitly
       // uses XMM0 as the selection register. That may need just as many
       // instructions as the AND/ANDN/OR sequence due to register moves, so
@@ -13267,10 +13267,10 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
 
       if (Subtarget->hasAVX() &&
          !isa<ConstantFPSDNode>(Op1) && !isa<ConstantFPSDNode>(Op2)) {
-        
+
         // Convert to vectors, do a VSELECT, and convert back to scalar.
         // All of the conversions should be optimized away.
-        
+
         EVT VecVT = VT == MVT::f32 ? MVT::v4f32 : MVT::v2f64;
         SDValue VOp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op1);
         SDValue VOp2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op2);
@@ -13278,9 +13278,9 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
 
         EVT VCmpVT = VT == MVT::f32 ? MVT::v4i32 : MVT::v2i64;
         VCmp = DAG.getNode(ISD::BITCAST, DL, VCmpVT, VCmp);
-        
+
         SDValue VSel = DAG.getNode(ISD::VSELECT, DL, VecVT, VCmp, VOp1, VOp2);
-        
+
         return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, VSel,
                            DAG.getIntPtrConstant(0));
       }
@@ -16189,6 +16189,17 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget,
     return Op;
   }
 
+  // 2i64 vector logical shifts can efficiently avoid scalarization - do the
+  // shifts per-lane and then shuffle the partial results back together.
+  if (VT == MVT::v2i64 && Op.getOpcode() != ISD::SRA) {
+    // Splat the shift amounts so the scalar shifts above will catch it.
+    SDValue Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {0, 0});
+    SDValue Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {1, 1});
+    SDValue R0 = DAG.getNode(Op->getOpcode(), dl, VT, R, Amt0);
+    SDValue R1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Amt1);
+    return DAG.getVectorShuffle(VT, dl, R0, R1, {0, 3});
+  }
+
   // If possible, lower this packed shift into a vector multiply instead of
   // expanding it into a sequence of scalar shifts.
   // Do this only if the vector shift count is a constant build_vector.
@@ -21960,7 +21971,7 @@ static SDValue VectorZextCombine(SDNode *N, SelectionDAG &DAG,
   // an and with a mask.
   // We'd like to try to combine that into a shuffle with zero
   // plus a bitcast, removing the and.
-  if (N0.getOpcode() != ISD::BITCAST || 
+  if (N0.getOpcode() != ISD::BITCAST ||
       N0.getOperand(0).getOpcode() != ISD::VECTOR_SHUFFLE)
     return SDValue();
 
@@ -21990,7 +22001,7 @@ static SDValue VectorZextCombine(SDNode *N, SelectionDAG &DAG,
   unsigned ResSize = N1.getValueType().getScalarSizeInBits();
 
   // Make sure the splat matches the mask we expect
-  if (SplatBitSize > ResSize || 
+  if (SplatBitSize > ResSize ||
       (SplatValue + 1).exactLogBase2() != (int)SrcSize)
     return SDValue();
 
@@ -22948,7 +22959,7 @@ static SDValue PerformFANDCombine(SDNode *N, SelectionDAG &DAG) {
   if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
     if (C->getValueAPF().isPosZero())
       return N->getOperand(1);
-  
+
   return SDValue();
 }
 
@@ -23222,7 +23233,7 @@ static SDValue PerformISDSETCCCombine(SDNode *N, SelectionDAG &DAG,
       return DAG.getConstant(1, VT);
     if (CC == ISD::SETEQ || CC == ISD::SETGE)
       return DAG.getNOT(DL, LHS.getOperand(0), VT);
-    
+
     assert((CC == ISD::SETNE || CC == ISD::SETLT) &&
            "Unexpected condition code!");
     return LHS.getOperand(0);
@@ -23264,7 +23275,7 @@ static SDValue PerformINSERTPSCombine(SDNode *N, SelectionDAG &DAG,
   // countS and just gets an f32 from that address.
   unsigned DestIndex =
       cast<ConstantSDNode>(N->getOperand(2))->getZExtValue() >> 6;
-  
+
   Ld = NarrowVectorLoadToElement(cast<LoadSDNode>(Ld), DestIndex, DAG);
 
   // Create this as a scalar to vector to match the instruction pattern.
@@ -23288,7 +23299,7 @@ static SDValue PerformBLENDICombine(SDNode *N, SelectionDAG &DAG) {
   // pattern-matching possibilities related to scalar math ops in SSE/AVX.
   // x86InstrInfo knows how to commute this back after instruction selection
   // if it would help register allocation.
-  
+
   // TODO: If optimizing for size or a processor that doesn't suffer from
   // partial register update stalls, this should be transformed into a MOVSD
   // instruction because a MOVSD is 1-2 bytes smaller than a BLENDPD.
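
[Editorial sketch, not part of the patch] To make the new v2i64 path concrete, here is a minimal standalone C++ sketch of the instruction pattern it produces, written with SSE2 intrinsics rather than the SelectionDAG API; the function name shl_v2i64_nonuniform is hypothetical. PSLLQ shifts both lanes by the count in the low 64 bits of its second operand, so shifting once per lane amount and then keeping lane 0 of one result and lane 1 of the other matches the {0, 3} shuffle in the hunk above.

  #include <emmintrin.h>

  // Shift each 64-bit lane of Val left by the corresponding lane of Amt
  // (assumes both counts are < 64, as the IR shift semantics require).
  static inline __m128i shl_v2i64_nonuniform(__m128i Val, __m128i Amt) {
    // PSLLQ reads its count from the low 64 bits, so lane 0's count can be
    // used as-is; move lane 1's count down for the second shift.
    __m128i Amt1 = _mm_shuffle_epi32(Amt, _MM_SHUFFLE(1, 0, 3, 2));
    __m128i R0 = _mm_sll_epi64(Val, Amt);   // both lanes shifted by Amt[0]
    __m128i R1 = _mm_sll_epi64(Val, Amt1);  // both lanes shifted by Amt[1]
    // Keep lane 0 of R0 and lane 1 of R1 - the {0, 3} shuffle - via MOVSD
    // on the double-precision view of the registers.
    return _mm_castpd_si128(
        _mm_move_sd(_mm_castsi128_pd(R1), _mm_castsi128_pd(R0)));
  }

The DAG code expresses the same idea at the node level: splat each shift amount so the existing splat-shift lowering selects a single PSLLQ/PSRLQ per lane, then recombine the two partial results with one shuffle instead of extracting and shifting scalars.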
diff --git a/test/Analysis/CostModel/X86/testshiftlshr.ll b/test/Analysis/CostModel/X86/testshiftlshr.ll
index 7bc8d89e4ad..78bf0a60830 100644
--- a/test/Analysis/CostModel/X86/testshiftlshr.ll
+++ b/test/Analysis/CostModel/X86/testshiftlshr.ll
@@ -7,7 +7,7 @@ entry:
   ; SSE2: shift2i16
   ; SSE2: cost of 20 {{.*}} lshr
   ; SSE2-CODEGEN: shift2i16
-  ; SSE2-CODEGEN: shrq %cl
+  ; SSE2-CODEGEN: psrlq
 
   %0 = lshr %shifttype %a , %b
   ret %shifttype %0
@@ -67,7 +67,7 @@ entry:
   ; SSE2: shift2i32
   ; SSE2: cost of 20 {{.*}} lshr
   ; SSE2-CODEGEN: shift2i32
-  ; SSE2-CODEGEN: shrq %cl
+  ; SSE2-CODEGEN: psrlq
 
   %0 = lshr %shifttype2i32 %a , %b
   ret %shifttype2i32 %0
@@ -127,7 +127,7 @@ entry:
   ; SSE2: shift2i64
   ; SSE2: cost of 20 {{.*}} lshr
   ; SSE2-CODEGEN: shift2i64
-  ; SSE2-CODEGEN: shrq %cl
+  ; SSE2-CODEGEN: psrlq
 
   %0 = lshr %shifttype2i64 %a , %b
   ret %shifttype2i64 %0
@@ -139,7 +139,7 @@ entry:
   ; SSE2: shift4i64
   ; SSE2: cost of 40 {{.*}} lshr
   ; SSE2-CODEGEN: shift4i64
-  ; SSE2-CODEGEN: shrq %cl
+  ; SSE2-CODEGEN: psrlq
 
   %0 = lshr %shifttype4i64 %a , %b
   ret %shifttype4i64 %0
@@ -151,7 +151,7 @@ entry:
   ; SSE2: shift8i64
   ; SSE2: cost of 80 {{.*}} lshr
   ; SSE2-CODEGEN: shift8i64
-  ; SSE2-CODEGEN: shrq %cl
+  ; SSE2-CODEGEN: psrlq
 
   %0 = lshr %shifttype8i64 %a , %b
   ret %shifttype8i64 %0
@@ -163,7 +163,7 @@ entry:
   ; SSE2: shift16i64
   ; SSE2: cost of 160 {{.*}} lshr
   ; SSE2-CODEGEN: shift16i64
-  ; SSE2-CODEGEN: shrq %cl
+  ; SSE2-CODEGEN: psrlq
 
   %0 = lshr %shifttype16i64 %a , %b
   ret %shifttype16i64 %0
@@ -175,7 +175,7 @@ entry:
   ; SSE2: shift32i64
   ; SSE2: cost of 320 {{.*}} lshr
   ; SSE2-CODEGEN: shift32i64
-  ; SSE2-CODEGEN: shrq %cl
+  ; SSE2-CODEGEN: psrlq
 
   %0 = lshr %shifttype32i64 %a , %b
   ret %shifttype32i64 %0
@@ -187,7 +187,7 @@ entry:
   ; SSE2: shift2i8
   ; SSE2: cost of 20 {{.*}} lshr
   ; SSE2-CODEGEN: shift2i8
-  ; SSE2-CODEGEN: shrq %cl
+  ; SSE2-CODEGEN: psrlq
 
   %0 = lshr %shifttype2i8 %a , %b
   ret %shifttype2i8 %0
diff --git a/test/Analysis/CostModel/X86/testshiftshl.ll b/test/Analysis/CostModel/X86/testshiftshl.ll
index 40effd02922..c36e0f5dfdf 100644
--- a/test/Analysis/CostModel/X86/testshiftshl.ll
+++ b/test/Analysis/CostModel/X86/testshiftshl.ll
@@ -7,7 +7,7 @@ entry:
   ; SSE2: shift2i16
   ; SSE2: cost of 20 {{.*}} shl
   ; SSE2-CODEGEN: shift2i16
-  ; SSE2-CODEGEN: shlq %cl
+  ; SSE2-CODEGEN: psllq
 
   %0 = shl %shifttype %a , %b
   ret %shifttype %0
@@ -67,7 +67,7 @@ entry:
   ; SSE2: shift2i32
   ; SSE2: cost of 20 {{.*}} shl
   ; SSE2-CODEGEN: shift2i32
-  ; SSE2-CODEGEN: shlq %cl
+  ; SSE2-CODEGEN: psllq
 
   %0 = shl %shifttype2i32 %a , %b
   ret %shifttype2i32 %0
@@ -127,7 +127,7 @@ entry:
   ; SSE2: shift2i64
   ; SSE2: cost of 20 {{.*}} shl
   ; SSE2-CODEGEN: shift2i64
-  ; SSE2-CODEGEN: shlq %cl
+  ; SSE2-CODEGEN: psllq
 
   %0 = shl %shifttype2i64 %a , %b
   ret %shifttype2i64 %0
@@ -139,7 +139,7 @@ entry:
   ; SSE2: shift4i64
   ; SSE2: cost of 40 {{.*}} shl
   ; SSE2-CODEGEN: shift4i64
-  ; SSE2-CODEGEN: shlq %cl
+  ; SSE2-CODEGEN: psllq
 
   %0 = shl %shifttype4i64 %a , %b
   ret %shifttype4i64 %0
@@ -151,7 +151,7 @@ entry:
   ; SSE2: shift8i64
   ; SSE2: cost of 80 {{.*}} shl
   ; SSE2-CODEGEN: shift8i64
-  ; SSE2-CODEGEN: shlq %cl
+  ; SSE2-CODEGEN: psllq
 
   %0 = shl %shifttype8i64 %a , %b
   ret %shifttype8i64 %0
@@ -163,7 +163,7 @@ entry:
   ; SSE2: shift16i64
   ; SSE2: cost of 160 {{.*}} shl
   ; SSE2-CODEGEN: shift16i64
-  ; SSE2-CODEGEN: shlq %cl
+  ; SSE2-CODEGEN: psllq
 
   %0 = shl %shifttype16i64 %a , %b
   ret %shifttype16i64 %0
@@ -175,7 +175,7 @@ entry:
   ; SSE2: shift32i64
   ; SSE2: cost of 320 {{.*}} shl
   ; SSE2-CODEGEN: shift32i64
-  ; SSE2-CODEGEN: shlq %cl
+  ; SSE2-CODEGEN: psllq
 
   %0 = shl %shifttype32i64 %a , %b
   ret %shifttype32i64 %0
@@ -187,7 +187,7 @@ entry:
   ; SSE2: shift2i8
   ; SSE2: cost of 20 {{.*}} shl
   ; SSE2-CODEGEN: shift2i8
-  ; SSE2-CODEGEN: shlq %cl
+  ; SSE2-CODEGEN: psllq
 
   %0 = shl %shifttype2i8 %a , %b
   ret %shifttype2i8 %0
diff --git a/test/CodeGen/X86/vshift-4.ll b/test/CodeGen/X86/vshift-4.ll
index cda9bc893a2..4ab5db88982 100644
--- a/test/CodeGen/X86/vshift-4.ll
+++ b/test/CodeGen/X86/vshift-4.ll
@@ -13,11 +13,16 @@ entry:
   ret void
 }
 
-; shift1b can't use a packed shift
+; shift1b can't use a packed shift but can shift lanes separately and shuffle back together
 define void @shift1b(<2 x i64> %val, <2 x i64>* %dst, <2 x i64> %sh) nounwind {
 entry:
 ; CHECK-LABEL: shift1b:
-; CHECK: shll
+; CHECK: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; CHECK-NEXT: movdqa %xmm0, %xmm3
+; CHECK-NEXT: psllq %xmm2, %xmm3
+; CHECK-NEXT: movq {{.*#+}} xmm1 = xmm1[0],zero
+; CHECK-NEXT: psllq %xmm1, %xmm0
+; CHECK-NEXT: movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1]
   %shamt = shufflevector <2 x i64> %sh, <2 x i64> undef, <2 x i32> <i32 0, i32 1>
   %shl = shl <2 x i64> %val, %shamt
   store <2 x i64> %shl, <2 x i64>* %dst
diff --git a/test/CodeGen/X86/x86-shifts.ll b/test/CodeGen/X86/x86-shifts.ll
index a10134e4bb6..20505cb2af8 100644
--- a/test/CodeGen/X86/x86-shifts.ll
+++ b/test/CodeGen/X86/x86-shifts.ll
@@ -118,10 +118,15 @@ entry:
 
 define <2 x i64> @shr2_nosplat(<2 x i64> %A) nounwind {
 entry:
-; CHECK: shr2_nosplat
-; CHECK-NOT: psrlq
-; CHECK-NOT: psrlq
-; CHECK: ret
+; CHECK-LABEL: shr2_nosplat
+; CHECK: movdqa %xmm1, %xmm2
+; CHECK-NEXT: psrlq $8, %xmm2
+; CHECK-NEXT: movdqa %xmm1, %xmm0
+; CHECK-NEXT: psrlq $1, %xmm0
+; CHECK-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
+; CHECK-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
+; CHECK-NEXT: xorpd %xmm1, %xmm0
+; CHECK-NEXT: ret
   %B = lshr <2 x i64> %A, < i64 8, i64 1>
   %C = lshr <2 x i64> %A, < i64 1, i64 0>
   %K = xor <2 x i64> %B, %C
-- 
2.34.1
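
[Editorial sketch, not part of the patch] The shr2_nosplat expectations above follow the same per-lane recipe, but with constant shift amounts, so the lowering can use PSRLQ with immediate counts before blending the lanes. As a rough standalone companion to the earlier sketch (again SSE2 intrinsics; the function name is made up), the test's first lshr by <8, 1> comes down to:

  #include <emmintrin.h>

  // Logical right shift of a <2 x i64> by the constant per-lane amounts <8, 1>.
  static inline __m128i lshr_v2i64_by_8_1(__m128i A) {
    __m128i B = _mm_srli_epi64(A, 8);  // both lanes >> 8
    __m128i C = _mm_srli_epi64(A, 1);  // both lanes >> 1
    // Lane 0 from B, lane 1 from C, blended with MOVSD as in the CHECK lines.
    return _mm_castpd_si128(
        _mm_move_sd(_mm_castsi128_pd(C), _mm_castsi128_pd(B)));
  }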