Fix PR11266: combine a vector (shl V, 1) with a splat shift amount into (add V, V) on X86.

author Nadav Rotem <nadav.rotem@intel.com>

Sun, 30 Oct 2011 13:24:22 +0000 (13:24 +0000)

committer Nadav Rotem <nadav.rotem@intel.com>

Sun, 30 Oct 2011 13:24:22 +0000 (13:24 +0000)
author Nadav Rotem <nadav.rotem@intel.com>
Sun, 30 Oct 2011 13:24:22 +0000 (13:24 +0000)
committer Nadav Rotem <nadav.rotem@intel.com>
Sun, 30 Oct 2011 13:24:22 +0000 (13:24 +0000)
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp

index 2ec08140db51db92b29083ae404d10a8d0bfe7cf..1af24497ba72b613699277400b6bf1e95f2489df 100644 (file)
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -13042,7 +13042,8 @@ static SDValue PerformSHLCombine(SDNode *N, SelectionDAG &DAG) {
  
    // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2))
    // since the result of setcc_c is all zero's or all ones.
-  if (N1C && N0.getOpcode() == ISD::AND &&
+  if (VT.isInteger() && !VT.isVector() &&
+      N1C && N0.getOpcode() == ISD::AND &&
        N0.getOperand(1).getOpcode() == ISD::Constant) {
      SDValue N00 = N0.getOperand(0);
      if (N00.getOpcode() == X86ISD::SETCC_CARRY ||
@@ -13058,6 +13059,22 @@ static SDValue PerformSHLCombine(SDNode *N, SelectionDAG &DAG) {
      }
    }
  
+
+  // Hardware support for vector shifts is sparse, which forces us to scalarize
+  // the vector operations in many cases. Also, on Sandy Bridge, ADD is faster
+  // than SHL.
+  // (shl V, 1) -> add V,V
+  if (isSplatVector(N1.getNode())) {
+    assert(N0.getValueType().isVector() && "Invalid vector shift type");
+    ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1->getOperand(0));
+    // We shift all of the values by one. In many cases we do not have
+    // hardware support for this operation. This is better expressed as an ADD
+    // of two values.
+    if (N1C && (1 == N1C->getZExtValue())) {
+      return DAG.getNode(ISD::ADD, N->getDebugLoc(), VT, N0, N0);
+    }
+  }
+
    return SDValue();
  }
  
@@ -13066,9 +13083,10 @@ static SDValue PerformSHLCombine(SDNode *N, SelectionDAG &DAG) {
  static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG,
                                     const X86Subtarget *Subtarget) {
    EVT VT = N->getValueType(0);
-  if (!VT.isVector() && VT.isInteger() &&
-      N->getOpcode() == ISD::SHL)
-    return PerformSHLCombine(N, DAG);
+  if (N->getOpcode() == ISD::SHL) {
+    SDValue V = PerformSHLCombine(N, DAG);
+    if (V.getNode()) return V;
+  }
  
    // On X86 with SSE2 support, we can transform this to a vector shift if
    // all elements are shifted by the same amount.  We can't do this in legalize
diff --git a/test/CodeGen/X86/2011-10-30-padd.ll b/test/CodeGen/X86/2011-10-30-padd.ll

new file mode 100644 (file)

index 0000000..180ca15
--- /dev/null
+++ b/test/CodeGen/X86/2011-10-30-padd.ll
@@ -0,0 +1,20 @@
+; RUN: llc < %s -march=x86 -mcpu=corei7 | FileCheck %s
+
+;CHECK: addXX_test
+;CHECK: padd
+;CHECK: ret
+
+
+define <16 x i8> @addXX_test(<16 x i8> %a) {
+      %b = add <16 x i8> %a, %a
+      ret <16 x i8> %b
+}
+
+;CHECK: instcombine_test
+;CHECK: padd
+;CHECK: ret
+define <16 x i8> @instcombine_test(<16 x i8> %a) {
+  %b = shl <16 x i8> %a, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+  ret <16 x i8> %b
+}
+
diff --git a/test/CodeGen/X86/x86-shifts.ll b/test/CodeGen/X86/x86-shifts.ll

index 1cb07aa0824f918378797d94a3553640708d3323..5a91b090472d56a2fde754fbfbe042b71194f926 100644 (file)
--- a/test/CodeGen/X86/x86-shifts.ll
+++ b/test/CodeGen/X86/x86-shifts.ll
@@ -6,8 +6,9 @@
  define <4 x i32> @shl4(<4 x i32> %A) nounwind {
  entry:
  ; CHECK:      shl4
+; CHECK:      padd
  ; CHECK:      pslld
-; CHECK-NEXT: pslld
+; CHECK:      ret
    %B = shl <4 x i32> %A,  < i32 2, i32 2, i32 2, i32 2>
    %C = shl <4 x i32> %A,  < i32 1, i32 1, i32 1, i32 1>
    %K = xor <4 x i32> %B, %C
@@ -19,6 +20,7 @@ entry:
  ; CHECK:      shr4
  ; CHECK:      psrld
  ; CHECK-NEXT: psrld
+; CHECK:      ret
    %B = lshr <4 x i32> %A,  < i32 2, i32 2, i32 2, i32 2>
    %C = lshr <4 x i32> %A,  < i32 1, i32 1, i32 1, i32 1>
    %K = xor <4 x i32> %B, %C
@@ -30,6 +32,7 @@ entry:
  ; CHECK:      sra4
  ; CHECK:      psrad
  ; CHECK-NEXT: psrad
+; CHECK:      ret
    %B = ashr <4 x i32> %A,  < i32 2, i32 2, i32 2, i32 2>
    %C = ashr <4 x i32> %A,  < i32 1, i32 1, i32 1, i32 1>
    %K = xor <4 x i32> %B, %C
@@ -41,6 +44,7 @@ entry:
  ; CHECK:      shl2
  ; CHECK:      psllq
  ; CHECK-NEXT: psllq
+; CHECK:      ret
    %B = shl <2 x i64> %A,  < i64 2, i64 2>
    %C = shl <2 x i64> %A,  < i64 9, i64 9>
    %K = xor <2 x i64> %B, %C
@@ -52,6 +56,7 @@ entry:
  ; CHECK:      shr2
  ; CHECK:      psrlq
  ; CHECK-NEXT: psrlq
+; CHECK:      ret
    %B = lshr <2 x i64> %A,  < i64 8, i64 8>
    %C = lshr <2 x i64> %A,  < i64 1, i64 1>
    %K = xor <2 x i64> %B, %C
@@ -62,8 +67,9 @@ entry:
  define <8 x i16> @shl8(<8 x i16> %A) nounwind {
  entry:
  ; CHECK:      shl8
+; CHECK:      padd
  ; CHECK:      psllw
-; CHECK-NEXT: psllw
+; CHECK:      ret
    %B = shl <8 x i16> %A,  < i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2>
    %C = shl <8 x i16> %A,  < i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
    %K = xor <8 x i16> %B, %C
@@ -75,6 +81,7 @@ entry:
  ; CHECK:      shr8
  ; CHECK:      psrlw
  ; CHECK-NEXT: psrlw
+; CHECK:      ret
    %B = lshr <8 x i16> %A,  < i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2>
    %C = lshr <8 x i16> %A,  < i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
    %K = xor <8 x i16> %B, %C
@@ -86,6 +93,7 @@ entry:
  ; CHECK:      sra8
  ; CHECK:      psraw
  ; CHECK-NEXT: psraw
+; CHECK:      ret
    %B = ashr <8 x i16> %A,  < i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2>
    %C = ashr <8 x i16> %A,  < i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
    %K = xor <8 x i16> %B, %C
@@ -100,6 +108,7 @@ entry:
  ; CHECK: sll8_nosplat
  ; CHECK-NOT: psll
  ; CHECK-NOT: psll
+; CHECK:      ret
    %B = shl <8 x i16> %A,  < i16 1, i16 2, i16 3, i16 6, i16 2, i16 2, i16 2, i16 2>
    %C = shl <8 x i16> %A,  < i16 9, i16 7, i16 5, i16 1, i16 4, i16 1, i16 1, i16 1>
    %K = xor <8 x i16> %B, %C
@@ -112,6 +121,7 @@ entry:
  ; CHECK: shr2_nosplat
  ; CHECK-NOT:  psrlq
  ; CHECK-NOT:  psrlq
+; CHECK:      ret
    %B = lshr <2 x i64> %A,  < i64 8, i64 1>
    %C = lshr <2 x i64> %A,  < i64 1, i64 0>
    %K = xor <2 x i64> %B, %C
@@ -125,6 +135,7 @@ define <2 x i32> @shl2_other(<2 x i32> %A) nounwind {
  entry:
  ; CHECK: shl2_other
  ; CHECK: psllq
+; CHECK: ret
    %B = shl <2 x i32> %A,  < i32 2, i32 2>
    %C = shl <2 x i32> %A,  < i32 9, i32 9>
    %K = xor <2 x i32> %B, %C
@@ -135,6 +146,7 @@ define <2 x i32> @shr2_other(<2 x i32> %A) nounwind {
  entry:
  ; CHECK: shr2_other
  ; CHECK: psrlq
+; CHECK: ret
    %B = lshr <2 x i32> %A,  < i32 8, i32 8>
    %C = lshr <2 x i32> %A,  < i32 1, i32 1>
    %K = xor <2 x i32> %B, %C
author	Nadav Rotem <nadav.rotem@intel.com>
	Sun, 30 Oct 2011 13:24:22 +0000 (13:24 +0000)
committer	Nadav Rotem <nadav.rotem@intel.com>
	Sun, 30 Oct 2011 13:24:22 +0000 (13:24 +0000)
lib/Target/X86/X86ISelLowering.cpp		patch | blob | history
test/CodeGen/X86/2011-10-30-padd.ll	[new file with mode: 0644]	patch | blob
test/CodeGen/X86/x86-shifts.ll		patch | blob | history