Look for SSE AND instructions of this form: (and x, (build_vector c1,c2,c3,c4)).

author Evan Cheng <evan.cheng@apple.com>

Tue, 16 Feb 2010 21:09:44 +0000 (21:09 +0000)

committer Evan Cheng <evan.cheng@apple.com>

Tue, 16 Feb 2010 21:09:44 +0000 (21:09 +0000)
author Evan Cheng <evan.cheng@apple.com>
Tue, 16 Feb 2010 21:09:44 +0000 (21:09 +0000)
committer Evan Cheng <evan.cheng@apple.com>
Tue, 16 Feb 2010 21:09:44 +0000 (21:09 +0000)
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp

index 9974d8c997dadd85eaf7809dfffd48db2fe438bb..4c40fe1803e92ef22a45dff38c1c11f19655e55d 100644 (file)
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -990,6 +990,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
    setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
    setTargetDAGCombine(ISD::BUILD_VECTOR);
    setTargetDAGCombine(ISD::SELECT);
+  setTargetDAGCombine(ISD::AND);
    setTargetDAGCombine(ISD::SHL);
    setTargetDAGCombine(ISD::SRA);
    setTargetDAGCombine(ISD::SRL);
@@ -9157,6 +9158,53 @@ static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG,
    return SDValue();
  }
  
+/// PerformANDCombine - Look for SSE AND instructions of this form:
+/// (and x, (build_vector c1,c2,c3,c4)). If there exists a use of a build_vector
+/// that's the bitwise complement of the mask, then transform the node to
+/// (and (xor x, (build_vector -1,-1,-1,-1)), (build_vector ~c1,~c2,~c3,~c4)).
+static SDValue PerformANDCombine(SDNode *N, SelectionDAG &DAG,
+                                 TargetLowering::DAGCombinerInfo &DCI) {
+  EVT VT = N->getValueType(0);
+  if (!VT.isVector() || !VT.isInteger())
+    return SDValue();
+
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  if (N0.getOpcode() == ISD::XOR || !N1.hasOneUse())
+    return SDValue();
+
+  if (N1.getOpcode() == ISD::BUILD_VECTOR) {
+    unsigned NumElts = VT.getVectorNumElements();
+    EVT EltVT = VT.getVectorElementType();
+    SmallVector<SDValue, 8> Mask;
+    Mask.reserve(NumElts);
+    for (unsigned i = 0; i != NumElts; ++i) {
+      SDValue Arg = N1.getOperand(i);
+      if (Arg.getOpcode() == ISD::UNDEF) {
+        Mask.push_back(Arg);
+        continue;
+      }
+      ConstantSDNode *C = dyn_cast<ConstantSDNode>(Arg);
+      if (!C) return SDValue();
+      Mask.push_back(DAG.getConstant(~C->getAPIntValue(), EltVT));
+    }
+    N1 = DAG.getNode(ISD::BUILD_VECTOR, N1.getDebugLoc(), VT,
+                     &Mask[0], NumElts);
+    if (!N1.use_empty()) {
+      unsigned Bits = EltVT.getSizeInBits();
+      Mask.clear();
+      for (unsigned i = 0; i != NumElts; ++i)
+        Mask.push_back(DAG.getConstant(APInt::getAllOnesValue(Bits), EltVT));
+      SDValue NewMask = DAG.getNode(ISD::BUILD_VECTOR, N->getDebugLoc(),
+                                    VT, &Mask[0], NumElts);
+      return DAG.getNode(ISD::AND, N->getDebugLoc(), VT,
+                         DAG.getNode(ISD::XOR, N->getDebugLoc(), VT,
+                                     N0, NewMask), N1);
+    }
+  }
+
+  return SDValue();
+}
  
  /// PerformMulCombine - Optimize a single multiply with constant into two
  /// in order to implement it with two cheaper instructions, e.g.
@@ -9305,7 +9353,7 @@ static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG,
        }
      } else if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT) {
         if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(InVec.getOperand(2))) {
-         unsigned SplatIdx = cast<ShuffleVectorSDNode>(ShAmtOp)->getSplatIndex();
+         unsigned SplatIdx= cast<ShuffleVectorSDNode>(ShAmtOp)->getSplatIndex();
           if (C->getZExtValue() == SplatIdx)
             BaseShAmt = InVec.getOperand(1);
         }
@@ -9690,6 +9738,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
    case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, *this);
    case ISD::SELECT:         return PerformSELECTCombine(N, DAG, Subtarget);
    case X86ISD::CMOV:        return PerformCMOVCombine(N, DAG, DCI);
+  case ISD::AND:            return PerformANDCombine(N, DAG, DCI);
    case ISD::MUL:            return PerformMulCombine(N, DAG, DCI);
    case ISD::SHL:
    case ISD::SRA:
diff --git a/test/CodeGen/X86/lsr-reuse-trunc.ll b/test/CodeGen/X86/lsr-reuse-trunc.ll

index d1d714491faab2a43894a7e6785e93edf2d2fada..a663a220e62d3fc0a930e16b3b3c90d06e908cba 100644 (file)
--- a/test/CodeGen/X86/lsr-reuse-trunc.ll
+++ b/test/CodeGen/X86/lsr-reuse-trunc.ll
@@ -1,10 +1,19 @@
-; RUN: llc < %s -march=x86-64 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin | FileCheck %s
  
  ; Full strength reduction wouldn't reduce register pressure, so LSR should
  ; stick with indexing here.
  
-; CHECK: movaps        (%rsi,%rax,4), %xmm3
-; CHECK: movaps        %xmm3, (%rdi,%rax,4)
+; Also checks that andps and andnps share the same constant pool. Previously
+; llvm would codegen two andps, one using 0x80000000, the other 0x7fffffff.
+; rdar://7323335
+
+; CHECK: movaps LCPI1_0
+; CHECK: movaps LCPI1_1
+; CHECK-NOT: movaps LCPI1_2
+; CHECK: movaps (%rsi,%rax,4), %xmm2
+; CHECK: andps
+; CHECK: andnps
+; CHECK: movaps %xmm2, (%rdi,%rax,4)
  ; CHECK: addq  $4, %rax
  ; CHECK: cmpl  %eax, (%rdx)
  ; CHECK-NEXT: jg
diff --git a/test/CodeGen/X86/sink-hoist.ll b/test/CodeGen/X86/sink-hoist.ll

index 01d73736d6c22568ee71e583cd38ac13f6f1d535..e1d0fe76657d9d3e89ce8d1e9fa2bfa38926931b 100644 (file)
--- a/test/CodeGen/X86/sink-hoist.ll
+++ b/test/CodeGen/X86/sink-hoist.ll
@@ -63,7 +63,6 @@ entry:
  ; CHECK: vv:
  ; CHECK: LCPI4_0(%rip), %xmm0
  ; CHECK: LCPI4_1(%rip), %xmm1
-; CHECK: LCPI4_2(%rip), %xmm2
  ; CHECK: align
  ; CHECK-NOT: LCPI
  ; CHECK: ret
author	Evan Cheng <evan.cheng@apple.com>
	Tue, 16 Feb 2010 21:09:44 +0000 (21:09 +0000)
committer	Evan Cheng <evan.cheng@apple.com>
	Tue, 16 Feb 2010 21:09:44 +0000 (21:09 +0000)
lib/Target/X86/X86ISelLowering.cpp		patch \| blob \| history
test/CodeGen/X86/lsr-reuse-trunc.ll		patch \| blob \| history
test/CodeGen/X86/sink-hoist.ll		patch \| blob \| history