Add target hook to prevent folding some bitcasted loads.

author Matt Arsenault <Matthew.Arsenault@amd.com>

Fri, 15 Nov 2013 04:42:23 +0000 (04:42 +0000)

committer Matt Arsenault <Matthew.Arsenault@amd.com>

Fri, 15 Nov 2013 04:42:23 +0000 (04:42 +0000)
author Matt Arsenault <Matthew.Arsenault@amd.com>
Fri, 15 Nov 2013 04:42:23 +0000 (04:42 +0000)
committer Matt Arsenault <Matthew.Arsenault@amd.com>
Fri, 15 Nov 2013 04:42:23 +0000 (04:42 +0000)
diff --git a/include/llvm/Target/TargetLowering.h b/include/llvm/Target/TargetLowering.h

index 2649d26cb7d898348ef9bd944c6ef83e92014d73..5ab04f794452e1106201bc2a327edaf7b33fb95f 100644 (file)
--- a/include/llvm/Target/TargetLowering.h
+++ b/include/llvm/Target/TargetLowering.h
@@ -204,6 +204,17 @@ public:
      return PredictableSelectIsExpensive;
    }
  
+  /// isLoadBitCastBeneficial() - Return true if the following transform
+  /// is beneficial.
+  /// fold (conv (load x)) -> (load (conv*)x)
+  /// On architectures that don't natively support some vector loads efficiently,
+  /// casting the load to a smaller vector of larger types and loading
+  /// is more efficient, however, this can be undone by optimizations in
+  /// dag combiner.
+  virtual bool isLoadBitCastBeneficial(EVT /* Load */, EVT /* Bitcast */) const {
+    return true;
+  }
+
    /// Return the ValueType of the result of SETCC operations.  Also used to
    /// obtain the target's preferred type for the condition operand of SELECT and
    /// BRCOND nodes.  In the case of BRCOND the argument passed is MVT::Other
diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp

index 90cd1d3ee28ff7ff66ae94edf1a774ecc34386b7..78543a4113d5f90e73fe85c0ffa50e48a977de8b 100644 (file)
--- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -5768,7 +5768,8 @@ SDValue DAGCombiner::visitBITCAST(SDNode *N) {
    if (ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse() &&
        // Do not change the width of a volatile load.
        !cast<LoadSDNode>(N0)->isVolatile() &&
-      (!LegalOperations || TLI.isOperationLegal(ISD::LOAD, VT))) {
+      (!LegalOperations || TLI.isOperationLegal(ISD::LOAD, VT)) &&
+      TLI.isLoadBitCastBeneficial(N0.getValueType(), VT)) {
      LoadSDNode *LN0 = cast<LoadSDNode>(N0);
      unsigned Align = TLI.getDataLayout()->
        getABITypeAlignment(VT.getTypeForEVT(*DAG.getContext()));
diff --git a/lib/Target/R600/AMDGPUISelLowering.cpp b/lib/Target/R600/AMDGPUISelLowering.cpp

index 51ad217fc8793b983588e1f418102d89a69f0c13..fdabea51695cab06dc76e8d9d86d44057d51afcf 100644 (file)
--- a/lib/Target/R600/AMDGPUISelLowering.cpp
+++ b/lib/Target/R600/AMDGPUISelLowering.cpp
@@ -196,6 +196,18 @@ MVT AMDGPUTargetLowering::getVectorIdxTy() const {
    return MVT::i32;
  }
  
+bool AMDGPUTargetLowering::isLoadBitCastBeneficial(EVT LoadTy,
+                                                   EVT CastTy) const {
+  if (LoadTy.getSizeInBits() != CastTy.getSizeInBits())
+    return true;
+
+  unsigned LScalarSize = LoadTy.getScalarType().getSizeInBits();
+  unsigned CastScalarSize = CastTy.getScalarType().getSizeInBits();
+
+  return ((LScalarSize <= CastScalarSize) ||
+          (CastScalarSize >= 32) ||
+          (LScalarSize < 32));
+}
  
  //===---------------------------------------------------------------------===//
  // Target Properties
diff --git a/lib/Target/R600/AMDGPUISelLowering.h b/lib/Target/R600/AMDGPUISelLowering.h

index dacb086dd5dfde8cfc2057d36fc67ca48a0922c2..2dfd3cf492a9ef35f026b0f5e633838db70ebf19 100644 (file)
--- a/lib/Target/R600/AMDGPUISelLowering.h
+++ b/lib/Target/R600/AMDGPUISelLowering.h
@@ -77,6 +77,7 @@ public:
    virtual bool isFAbsFree(EVT VT) const;
    virtual bool isFNegFree(EVT VT) const;
    virtual MVT getVectorIdxTy() const;
+  virtual bool isLoadBitCastBeneficial(EVT, EVT) const LLVM_OVERRIDE;
    virtual SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv,
                                bool isVarArg,
                                const SmallVectorImpl<ISD::OutputArg> &Outs,
diff --git a/test/CodeGen/R600/combine_vloads.ll b/test/CodeGen/R600/combine_vloads.ll

new file mode 100644 (file)

index 0000000..f8ec712
--- /dev/null
+++ b/test/CodeGen/R600/combine_vloads.ll
@@ -0,0 +1,42 @@
+; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG %s
+
+;
+; kernel void combine_vloads(global char8* src, global char8* result) {
+;   for (int i = 0; i < 1024; ++i)
+;     result[i] = src[0] + src[1] + src[2] + src[3];
+; }
+;
+
+
+; 128-bit loads instead of many 8-bit
+; EG-LABEL: @combine_vloads:
+; EG: VTX_READ_128
+; EG: VTX_READ_128
+define void @combine_vloads(<8 x i8> addrspace(1)* nocapture %src, <8 x i8> addrspace(1)* nocapture %result) nounwind {
+entry:
+  br label %for.body
+
+for.exit:                                         ; preds = %for.body
+  ret void
+
+for.body:                                         ; preds = %for.body, %entry
+  %i.01 = phi i32 [ 0, %entry ], [ %tmp19, %for.body ]
+  %arrayidx_v4 = bitcast <8 x i8> addrspace(1)* %src to <32 x i8> addrspace(1)*
+  %0 = bitcast <32 x i8> addrspace(1)* %arrayidx_v4 to <8 x i32> addrspace(1)*
+  %vecload2 = load <8 x i32> addrspace(1)* %0, align 32
+  %1 = bitcast <8 x i32> %vecload2 to <32 x i8>
+  %tmp5 = shufflevector <32 x i8> %1, <32 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %tmp8 = shufflevector <32 x i8> %1, <32 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %tmp9 = add nsw <8 x i8> %tmp5, %tmp8
+  %tmp12 = shufflevector <32 x i8> %1, <32 x i8> undef, <8 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+  %tmp13 = add nsw <8 x i8> %tmp9, %tmp12
+  %tmp16 = shufflevector <32 x i8> %1, <32 x i8> undef, <8 x i32> <i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+  %tmp17 = add nsw <8 x i8> %tmp13, %tmp16
+  %scevgep = getelementptr <8 x i8> addrspace(1)* %result, i32 %i.01
+  %2 = bitcast <8 x i8> %tmp17 to <2 x i32>
+  %3 = bitcast <8 x i8> addrspace(1)* %scevgep to <2 x i32> addrspace(1)*
+  store <2 x i32> %2, <2 x i32> addrspace(1)* %3, align 8
+  %tmp19 = add nsw i32 %i.01, 1
+  %exitcond = icmp eq i32 %tmp19, 1024
+  br i1 %exitcond, label %for.exit, label %for.body
+}
author	Matt Arsenault <Matthew.Arsenault@amd.com>
	Fri, 15 Nov 2013 04:42:23 +0000 (04:42 +0000)
committer	Matt Arsenault <Matthew.Arsenault@amd.com>
	Fri, 15 Nov 2013 04:42:23 +0000 (04:42 +0000)
include/llvm/Target/TargetLowering.h		patch \| blob \| history
lib/CodeGen/SelectionDAG/DAGCombiner.cpp		patch \| blob \| history
lib/Target/R600/AMDGPUISelLowering.cpp		patch \| blob \| history
lib/Target/R600/AMDGPUISelLowering.h		patch \| blob \| history
test/CodeGen/R600/combine_vloads.ll	[new file with mode: 0644]	patch \| blob