Added an optimization that narrows load / op / store sequences where the 'op' is a bit twiddling...

author Evan Cheng <evan.cheng@apple.com>

Thu, 28 May 2009 00:35:15 +0000 (00:35 +0000)

committer Evan Cheng <evan.cheng@apple.com>

Thu, 28 May 2009 00:35:15 +0000 (00:35 +0000)
author Evan Cheng <evan.cheng@apple.com>
Thu, 28 May 2009 00:35:15 +0000 (00:35 +0000)
committer Evan Cheng <evan.cheng@apple.com>
Thu, 28 May 2009 00:35:15 +0000 (00:35 +0000)
diff --git a/include/llvm/Target/TargetLowering.h b/include/llvm/Target/TargetLowering.h

index 0576e3e1a8bda337cbf4125410d086b77a7898ca..dc66e55a1ff66aa5a3c83f531665a1aa4e326010 100644 (file)
--- a/include/llvm/Target/TargetLowering.h
+++ b/include/llvm/Target/TargetLowering.h
@@ -1420,6 +1420,13 @@ public:
      return false;
    }
  
+  /// isNarrowingProfitable - Return true if it's profitable to narrow
+  /// operations of type VT1 to VT2. e.g. on x86, it's profitable to narrow
+  /// from i32 to i8 but not from i32 to i16.
+  /// This default implementation ignores both types and reports every
+  /// narrowing as profitable.
+  virtual bool isNarrowingProfitable(MVT VT1, MVT VT2) const {
+    return true;
+  }
+
    //===--------------------------------------------------------------------===//
    // Div utility functions
    //
diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp

index a866cb5629e8eb8ca8f17197ffab83d8473785a9..6a47aa52a2a95629f9b528e6af6da32689370bf2 100644 (file)
--- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -41,6 +41,7 @@ using namespace llvm;
  STATISTIC(NodesCombined   , "Number of dag nodes combined");
  STATISTIC(PreIndexedNodes , "Number of pre-indexed nodes created");
  STATISTIC(PostIndexedNodes, "Number of post-indexed nodes created");
+STATISTIC(OpsNarrowed     , "Number of load/op/store narrowed");
  
  namespace {
    static cl::opt<bool>
@@ -222,6 +223,7 @@ namespace {
      SDValue BuildUDIV(SDNode *N);
      SDNode *MatchRotate(SDValue LHS, SDValue RHS, DebugLoc DL);
      SDValue ReduceLoadWidth(SDNode *N);
+    SDValue ReduceLoadOpStoreWidth(SDNode *N);
  
      SDValue GetDemandedBits(SDValue V, const APInt &Mask);
  
@@ -4900,6 +4902,96 @@ SDValue DAGCombiner::visitLOAD(SDNode *N) {
    return SDValue();
  }
  
+
+/// ReduceLoadOpStoreWidth - Look for sequence of load / op / store where op is
+/// one of 'or', 'xor', and 'and' of immediates. If 'op' is only touching some
+/// of the loaded bits, try narrowing the load and store if it would end up
+/// being a win for performance or code size.
+SDValue DAGCombiner::ReduceLoadOpStoreWidth(SDNode *N) {
+  StoreSDNode *ST  = cast<StoreSDNode>(N);
+  SDValue Chain = ST->getChain();
+  SDValue Value = ST->getValue();
+  SDValue Ptr   = ST->getBasePtr();
+  MVT VT = Value.getValueType();
+
+  if (ST->isTruncatingStore() || VT.isVector() || !Value.hasOneUse())
+    return SDValue(0, 0);
+
+  unsigned Opc = Value.getOpcode();
+  if ((Opc != ISD::OR && Opc != ISD::XOR && Opc != ISD::AND) ||
+      Value.getOperand(1).getOpcode() != ISD::Constant)
+    return SDValue(0, 0);
+
+  SDValue N0 = Value.getOperand(0);
+  if (ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse()) {
+    LoadSDNode *LD = cast<LoadSDNode>(N0);
+    if (LD->getBasePtr() != Ptr/* || Chain != N0.getValue(1)*/)
+      return SDValue(0, 0);
+
+    // Find the type to narrow it the load / op / store to.
+    SDValue N1 = Value.getOperand(1);
+    unsigned BitWidth = N1.getValueSizeInBits();
+    APInt Imm = cast<ConstantSDNode>(N1)->getAPIntValue();
+    if (Opc == ISD::AND)
+      Imm ^= APInt::getAllOnesValue(BitWidth);
+    unsigned ShAmt = Imm.countTrailingZeros();
+    unsigned MSB = BitWidth - Imm.countLeadingZeros() - 1;
+    unsigned NewBW = NextPowerOf2(MSB - ShAmt);
+    MVT NewVT = MVT::getIntegerVT(NewBW);
+    while (NewBW < BitWidth &&
+           !(TLI.isTypeLegal(NewVT) &&
+             TLI.isOperationLegalOrCustom(Opc, NewVT) &&
+             TLI.isNarrowingProfitable(VT, NewVT))) {
+      NewBW = NextPowerOf2(NewBW);
+      NewVT = MVT::getIntegerVT(NewBW);
+    }
+    if (NewBW == BitWidth)
+      return SDValue(0, 0);
+
+    // If the lsb changed does not start at the type bitwidth boundary,
+    // start at the previous one.
+    if (ShAmt % NewBW)
+      ShAmt = (((ShAmt + NewBW - 1) / NewBW) * NewBW) - NewBW;
+    APInt Mask = APInt::getBitsSet(BitWidth, ShAmt, ShAmt + NewBW);
+    if ((Imm & Mask) == Imm) {
+      APInt NewImm = (Imm & Mask).lshr(ShAmt).trunc(NewBW);
+      if (Opc == ISD::AND)
+        NewImm ^= APInt::getAllOnesValue(NewBW);
+      uint64_t PtrOff = ShAmt / 8;
+      // For big endian targets, we need to adjust the offset to the pointer to
+      // load the correct bytes.
+      if (TLI.isBigEndian())
+        PtrOff = (BitWidth - NewBW) / 8 - PtrOff;
+
+      unsigned NewAlign = MinAlign(LD->getAlignment(), PtrOff);
+      SDValue NewPtr = DAG.getNode(ISD::ADD, LD->getDebugLoc(),
+                                   Ptr.getValueType(), Ptr,
+                                   DAG.getConstant(PtrOff, Ptr.getValueType()));
+      SDValue NewLD = DAG.getLoad(NewVT, N0.getDebugLoc(),
+                                  LD->getChain(), NewPtr,
+                                  LD->getSrcValue(), LD->getSrcValueOffset(),
+                                  LD->isVolatile(), NewAlign);
+      SDValue NewVal = DAG.getNode(Opc, Value.getDebugLoc(), NewVT, NewLD,
+                                   DAG.getConstant(NewImm, NewVT));
+      SDValue NewST = DAG.getStore(Chain, N->getDebugLoc(),
+                                   NewVal, NewPtr,
+                                   ST->getSrcValue(), ST->getSrcValueOffset(),
+                                   ST->isVolatile(), NewAlign);
+
+      AddToWorkList(NewPtr.getNode());
+      AddToWorkList(NewLD.getNode());
+      AddToWorkList(NewVal.getNode());
+      WorkListRemover DeadNodes(*this);
+      DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), NewLD.getValue(1),
+                                    &DeadNodes);
+      ++OpsNarrowed;
+      return NewST;
+    }
+  }
+
+  return SDValue(0, 0);
+}
+
  SDValue DAGCombiner::visitSTORE(SDNode *N) {
    StoreSDNode *ST  = cast<StoreSDNode>(N);
    SDValue Chain = ST->getChain();
@@ -5086,7 +5178,7 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) {
                               ST->isVolatile(), ST->getAlignment());
    }
  
-  return SDValue();
+  return ReduceLoadOpStoreWidth(N);
  }
  
  SDValue DAGCombiner::visitINSERT_VECTOR_ELT(SDNode *N) {
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp

index b89eef0fb214493210c7bf6f20f55a0d02d0f370..0136f90ec435c52d0c81355d3e0b92217956dd7d 100644 (file)
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -6877,6 +6877,11 @@ bool X86TargetLowering::isZExtFree(MVT VT1, MVT VT2) const {
    return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget->is64Bit();
  }
  
+/// isNarrowingProfitable - Report whether narrowing an operation from VT1 to
+/// VT2 is considered worthwhile on x86. Only the i32 -> i16 case is rejected.
+bool X86TargetLowering::isNarrowingProfitable(MVT VT1, MVT VT2) const {
+  // i16 instructions are longer (0x66 prefix) and potentially slower.
+  const bool IsI32ToI16 = VT1 == MVT::i32 && VT2 == MVT::i16;
+  return !IsI32ToI16;
+}
+
  /// isShuffleMaskLegal - Targets can use this to indicate that they only
  /// support *some* VECTOR_SHUFFLE operations, those with specific masks.
  /// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h

index badbd2462feb656c398af38d596613e060f433a5..550f8bdf9b64b6e0c6c20cf61731edf3278c37df 100644 (file)
--- a/lib/Target/X86/X86ISelLowering.h
+++ b/lib/Target/X86/X86ISelLowering.h
@@ -466,6 +466,11 @@ namespace llvm {
      virtual bool isZExtFree(const Type *Ty1, const Type *Ty2) const;
      virtual bool isZExtFree(MVT VT1, MVT VT2) const;
  
+    /// isNarrowingProfitable - Return true if it's profitable to narrow
+    /// operations of type VT1 to VT2. e.g. on x86, it's profitable to narrow
+    /// from i32 to i8 but not from i32 to i16.
+    /// The definition in X86ISelLowering.cpp returns false only for the
+    /// i32 -> i16 case.
+    virtual bool isNarrowingProfitable(MVT VT1, MVT VT2) const;
+
      /// isShuffleMaskLegal - Targets can use this to indicate that they only
      /// support *some* VECTOR_SHUFFLE operations, those with specific masks.
      /// By default, if a target supports the VECTOR_SHUFFLE node, all mask
diff --git a/test/CodeGen/X86/narrow_op-1.ll b/test/CodeGen/X86/narrow_op-1.ll

new file mode 100644 (file)

index 0000000..0ee11b4
--- /dev/null
+++ b/test/CodeGen/X86/narrow_op-1.ll
@@ -0,0 +1,23 @@
+; Check load / or / store narrowing on x86-64. The output is expected to
+; contain exactly one 'orb' and exactly one 'orl', the 'orl' carrying the
+; 16842752 immediate used in @t2.
+; RUN: llvm-as < %s | llc -march=x86-64 | grep orb | count 1
+; RUN: llvm-as < %s | llc -march=x86-64 | grep orb | grep 1
+; RUN: llvm-as < %s | llc -march=x86-64 | grep orl | count 1
+; RUN: llvm-as < %s | llc -march=x86-64 | grep orl | grep 16842752
+
+       %struct.bf = type { i64, i16, i16, i32 }
+@bfi = common global %struct.bf zeroinitializer, align 16
+
+; Sets a single bit of the loaded i32 (65536 == 1 << 16), so the changed bits
+; fit in one byte.
+define void @t1() nounwind optsize ssp {
+entry:
+       %0 = load i32* bitcast (i16* getelementptr (%struct.bf* @bfi, i32 0, i32 1) to i32*), align 8
+       %1 = or i32 %0, 65536
+       store i32 %1, i32* bitcast (i16* getelementptr (%struct.bf* @bfi, i32 0, i32 1) to i32*), align 8
+       ret void
+}
+
+; Sets two bits of the loaded i32 (16842752 == 0x01010000, bits 16 and 24),
+; which do not fit in one byte; the checks above expect a 32-bit 'orl' here.
+define void @t2() nounwind optsize ssp {
+entry:
+       %0 = load i32* bitcast (i16* getelementptr (%struct.bf* @bfi, i32 0, i32 1) to i32*), align 8
+       %1 = or i32 %0, 16842752
+       store i32 %1, i32* bitcast (i16* getelementptr (%struct.bf* @bfi, i32 0, i32 1) to i32*), align 8
+       ret void
+}
diff --git a/test/CodeGen/X86/narrow_op-2.ll b/test/CodeGen/X86/narrow_op-2.ll

new file mode 100644 (file)

index 0000000..b441794
--- /dev/null
+++ b/test/CodeGen/X86/narrow_op-2.ll
@@ -0,0 +1,23 @@
+; Check load / and / store narrowing on x86-64. The output is expected to
+; contain exactly two 'andb' instructions, one with immediate 254 and one
+; with immediate 253.
+; RUN: llvm-as < %s | llc -march=x86-64 | grep andb | count 2
+; RUN: llvm-as < %s | llc -march=x86-64 | grep andb | grep 254
+; RUN: llvm-as < %s | llc -march=x86-64 | grep andb | grep 253
+
+       %struct.bf = type { i64, i16, i16, i32 }
+@bfi = external global %struct.bf*
+
+; Two read-modify-write sequences on the same i32 location, each clearing a
+; single bit: -65537 == ~(1 << 16) and -131073 == ~(1 << 17).
+define void @t1() nounwind ssp {
+entry:
+       %0 = load %struct.bf** @bfi, align 8
+       %1 = getelementptr %struct.bf* %0, i64 0, i32 1
+       %2 = bitcast i16* %1 to i32*
+       %3 = load i32* %2, align 1
+       %4 = and i32 %3, -65537
+       store i32 %4, i32* %2, align 1
+       %5 = load %struct.bf** @bfi, align 8
+       %6 = getelementptr %struct.bf* %5, i64 0, i32 1
+       %7 = bitcast i16* %6 to i32*
+       %8 = load i32* %7, align 1
+       %9 = and i32 %8, -131073
+       store i32 %9, i32* %7, align 1
+       ret void
+}
author	Evan Cheng <evan.cheng@apple.com>
	Thu, 28 May 2009 00:35:15 +0000 (00:35 +0000)
committer	Evan Cheng <evan.cheng@apple.com>
	Thu, 28 May 2009 00:35:15 +0000 (00:35 +0000)
include/llvm/Target/TargetLowering.h		patch \| blob \| history
lib/CodeGen/SelectionDAG/DAGCombiner.cpp		patch \| blob \| history
lib/Target/X86/X86ISelLowering.cpp		patch \| blob \| history
lib/Target/X86/X86ISelLowering.h		patch \| blob \| history
test/CodeGen/X86/narrow_op-1.ll	[new file with mode: 0644]	patch \| blob
test/CodeGen/X86/narrow_op-2.ll	[new file with mode: 0644]	patch \| blob