X86: use vpsllvd (& friends) for 16-bit shifts on Haswell

author Tim Northover <tnorthover@apple.com>

Tue, 18 Feb 2014 11:15:32 +0000 (11:15 +0000)

committer Tim Northover <tnorthover@apple.com>

Tue, 18 Feb 2014 11:15:32 +0000 (11:15 +0000)
author Tim Northover <tnorthover@apple.com>
Tue, 18 Feb 2014 11:15:32 +0000 (11:15 +0000)
committer Tim Northover <tnorthover@apple.com>
Tue, 18 Feb 2014 11:15:32 +0000 (11:15 +0000)
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp

index e35aa8fa20826516eefc8175e9da4b5275d268b9..b1d734e932b5159fcaf657319eb8db6f3a7e5655 100644 (file)
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -13161,6 +13161,7 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget,
      Op = DAG.getNode(ISD::FP_TO_SINT, dl, VT, Op);
      return DAG.getNode(ISD::MUL, dl, VT, Op, R);
    }
+
    if (VT == MVT::v16i8 && Op->getOpcode() == ISD::SHL) {
      assert(Subtarget->hasSSE2() && "Need SSE2 for pslli/pcmpeq.");
  
@@ -13204,6 +13205,19 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget,
      return R;
    }
  
+  // It's worth extending once and using the v8i32 shifts for 16-bit types, but
+  // the extra overheads to get from v16i8 to v8i32 make the existing SSE
+  // solution better.
+  if (Subtarget->hasInt256() && VT == MVT::v8i16) {
+    MVT NewVT = VT == MVT::v8i16 ? MVT::v8i32 : MVT::v16i16;
+    unsigned ExtOpc =
+        Op.getOpcode() == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
+    R = DAG.getNode(ExtOpc, dl, NewVT, R);
+    Amt = DAG.getNode(ISD::ANY_EXTEND, dl, NewVT, Amt);
+    return DAG.getNode(ISD::TRUNCATE, dl, VT,
+                       DAG.getNode(Op.getOpcode(), dl, NewVT, R, Amt));
+    }
+
    // Decompose 256-bit shifts into smaller 128-bit shifts.
    if (VT.is256BitVector()) {
      unsigned NumElems = VT.getVectorNumElements();
diff --git a/test/CodeGen/X86/avx2-shift.ll b/test/CodeGen/X86/avx2-shift.ll

index 7fdbaaa39cbe699cca86a80d8b2f2a59fd214617..025d52ede0f46c37633f027bfd0fc85910fabaf4 100644 (file)
--- a/test/CodeGen/X86/avx2-shift.ll
+++ b/test/CodeGen/X86/avx2-shift.ll
@@ -266,3 +266,36 @@ define <8 x i32> @sext_v8i32(<8 x i32> %a) nounwind {
    %c = sext <8 x i16> %b to <8 x i32>
    ret <8 x i32> %c
  }
+
+; Per-element variable left shift of <8 x i16>. The checks expect both the
+; shift amount (%xmm1) and the value (%xmm0) to be widened to a ymm register
+; with vpmovzxwd, shifted with the 32-bit variable shift vpsllvd, and then
+; followed by vpshufb + vpermq (presumably the narrowing back to 16-bit
+; lanes).
+define <8 x i16> @variable_shl16(<8 x i16> %lhs, <8  x i16> %rhs) {
+; CHECK-LABEL: variable_shl16:
+; CHECK-DAG: vpmovzxwd %xmm1, [[AMT:%ymm[0-9]+]]
+; CHECK-DAG: vpmovzxwd %xmm0, [[LHS:%ymm[0-9]+]]
+; CHECK: vpsllvd [[AMT]], [[LHS]], {{%ymm[0-9]+}}
+; CHECK: vpshufb
+; CHECK: vpermq
+  %res = shl <8 x i16> %lhs, %rhs
+  ret <8 x i16> %res
+}
+
+; Per-element variable arithmetic right shift of <8 x i16>. The checks expect
+; the shift amount (%xmm1) to be zero-extended with vpmovzxwd while the value
+; (%xmm0) is sign-extended with vpmovsxwd, then shifted with the 32-bit
+; variable shift vpsravd and followed by vpshufb + vpermq (presumably the
+; narrowing back to 16-bit lanes).
+define <8 x i16> @variable_ashr16(<8 x i16> %lhs, <8  x i16> %rhs) {
+; CHECK-LABEL: variable_ashr16:
+; CHECK-DAG: vpmovzxwd %xmm1, [[AMT:%ymm[0-9]+]]
+; CHECK-DAG: vpmovsxwd %xmm0, [[LHS:%ymm[0-9]+]]
+; CHECK: vpsravd [[AMT]], [[LHS]], {{%ymm[0-9]+}}
+; CHECK: vpshufb
+; CHECK: vpermq
+  %res = ashr <8 x i16> %lhs, %rhs
+  ret <8 x i16> %res
+}
+
+; Per-element variable logical right shift of <8 x i16>. The checks expect
+; both the shift amount (%xmm1) and the value (%xmm0) to be zero-extended to a
+; ymm register with vpmovzxwd, shifted with the 32-bit variable shift vpsrlvd,
+; and then followed by vpshufb + vpermq (presumably the narrowing back to
+; 16-bit lanes).
+define <8 x i16> @variable_lshr16(<8 x i16> %lhs, <8  x i16> %rhs) {
+; CHECK-LABEL: variable_lshr16:
+; CHECK-DAG: vpmovzxwd %xmm1, [[AMT:%ymm[0-9]+]]
+; CHECK-DAG: vpmovzxwd %xmm0, [[LHS:%ymm[0-9]+]]
+; CHECK: vpsrlvd [[AMT]], [[LHS]], {{%ymm[0-9]+}}
+; CHECK: vpshufb
+; CHECK: vpermq
+  %res = lshr <8 x i16> %lhs, %rhs
+  ret <8 x i16> %res
+}
+\ No newline at end of file
author	Tim Northover <tnorthover@apple.com>
	Tue, 18 Feb 2014 11:15:32 +0000 (11:15 +0000)
committer	Tim Northover <tnorthover@apple.com>
	Tue, 18 Feb 2014 11:15:32 +0000 (11:15 +0000)
lib/Target/X86/X86ISelLowering.cpp		patch \| blob \| history
test/CodeGen/X86/avx2-shift.ll		patch \| blob \| history