DAGCombiner: Combine extract_vector_elt from build_vector

author Matt Arsenault <Matthew.Arsenault@amd.com>

Mon, 12 Oct 2015 23:59:50 +0000 (23:59 +0000)

committer Matt Arsenault <Matthew.Arsenault@amd.com>

Mon, 12 Oct 2015 23:59:50 +0000 (23:59 +0000)
author Matt Arsenault <Matthew.Arsenault@amd.com>
Mon, 12 Oct 2015 23:59:50 +0000 (23:59 +0000)
committer Matt Arsenault <Matthew.Arsenault@amd.com>
Mon, 12 Oct 2015 23:59:50 +0000 (23:59 +0000)
diff --git a/include/llvm/Target/TargetLowering.h b/include/llvm/Target/TargetLowering.h

index ae83833fd6373e80161a2111c20f638be70b9961..10194e37fef4815f1e23c3d1f4cbbcf4614e8276 100644 (file)
--- a/include/llvm/Target/TargetLowering.h
+++ b/include/llvm/Target/TargetLowering.h
@@ -1720,6 +1720,12 @@ public:
      return false;
    }
  
+  // Return true if it is profitable to use a scalar input to a BUILD_VECTOR
+  // even if the vector itself has multiple uses.
+  virtual bool aggressivelyPreferBuildVectorSources(EVT VecVT) const {
+    return false;
+  }
+
    //===--------------------------------------------------------------------===//
    // Runtime Library hooks
    //
diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp

index 8f745a149ac93be41c1cc48d874d5467cf5d8069..a26f378b49aaee0ece7421a8d35b7dfa7b5df92e 100644 (file)
--- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -11886,7 +11886,24 @@ SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) {
    }
  
    SDValue EltNo = N->getOperand(1);
-  bool ConstEltNo = isa<ConstantSDNode>(EltNo);
+  ConstantSDNode *ConstEltNo = dyn_cast<ConstantSDNode>(EltNo);
+
+  // extract_vector_elt (build_vector x, y), 1 -> y
+  if (ConstEltNo &&
+      InVec.getOpcode() == ISD::BUILD_VECTOR &&
+      TLI.isTypeLegal(VT) &&
+      (InVec.hasOneUse() ||
+       TLI.aggressivelyPreferBuildVectorSources(VT))) {
+    SDValue Elt = InVec.getOperand(ConstEltNo->getZExtValue());
+    EVT InEltVT = Elt.getValueType();
+
+    // Sometimes build_vector's scalar input types do not match result type.
+    if (NVT == InEltVT)
+      return Elt;
+
+    // TODO: If the build_vector implicitly converts its operands, it may be
+    // useful to emit a truncate when truncation is free.
+  }
  
    // Transform: (EXTRACT_VECTOR_ELT( VECTOR_SHUFFLE )) -> EXTRACT_VECTOR_ELT.
    // We only perform this optimization before the op legalization phase because
@@ -11894,13 +11911,11 @@ SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) {
    // patterns. For example on AVX, extracting elements from a wide vector
    // without using extract_subvector. However, if we can find an underlying
    // scalar value, then we can always use that.
-  if (InVec.getOpcode() == ISD::VECTOR_SHUFFLE
-      && ConstEltNo) {
-    int Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
+  if (ConstEltNo && InVec.getOpcode() == ISD::VECTOR_SHUFFLE) {
      int NumElem = VT.getVectorNumElements();
      ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(InVec);
      // Find the new index to extract from.
-    int OrigElt = SVOp->getMaskElt(Elt);
+    int OrigElt = SVOp->getMaskElt(ConstEltNo->getZExtValue());
  
      // Extracting an undef index is undef.
      if (OrigElt == -1)
diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp

index 8300431ec01d0a06508f82e6e29ade728f9e63aa..a8af7ec75f0482ed8cceec8b8f66430194e4cb7a 100644 (file)
--- a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -533,6 +533,18 @@ bool AMDGPUTargetLowering:: storeOfVectorConstantIsCheap(EVT MemVT,
    return true;
  }
  
+bool AMDGPUTargetLowering::aggressivelyPreferBuildVectorSources(EVT VecVT) const {
+  // There are few operations which truly have vector input operands. Any vector
+  // operation is going to involve operations on each component, and a
+  // build_vector will be a copy per element, so it always makes sense to use a
+  // build_vector input in place of the extracted element to avoid a copy into a
+  // super register.
+  //
+  // We should probably only do this if all users are extracts, but this
+  // should be the common case.
+  return true;
+}
+
  bool AMDGPUTargetLowering::isTruncateFree(EVT Source, EVT Dest) const {
    // Truncate is just accessing a subregister.
    return Dest.bitsLT(Source) && (Dest.getSizeInBits() % 32 == 0);
diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.h b/lib/Target/AMDGPU/AMDGPUISelLowering.h

index 3f5b1f59e068202ba494d887c2a8ced82a97ee4a..1e060c4d70877a3621be8866a316a00abb22f8f2 100644 (file)
--- a/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -138,6 +138,7 @@ public:
    bool storeOfVectorConstantIsCheap(EVT MemVT,
                                      unsigned NumElem,
                                      unsigned AS) const override;
+  bool aggressivelyPreferBuildVectorSources(EVT VecVT) const override;
    bool isCheapToSpeculateCttz() const override;
    bool isCheapToSpeculateCtlz() const override;
  
diff --git a/test/CodeGen/AArch64/fold-constants.ll b/test/CodeGen/AArch64/fold-constants.ll

index 2dd0d1245930b61afa9ff3459ca86afe8fddc779..3f70f0a7e9f9e003c4bc6a4112156406f9f15ba0 100644 (file)
--- a/test/CodeGen/AArch64/fold-constants.ll
+++ b/test/CodeGen/AArch64/fold-constants.ll
@@ -3,9 +3,6 @@
  define i64 @dotests_616() {
  ; CHECK-LABEL: dotests_616
  ; CHECK:       movi d0, #0000000000000000
-; CHECK-NEXT:  umov w8, v0.b[2]
-; CHECK-NEXT:  sbfx w8, w8, #0, #1
-; CHECK-NEXT:  fmov s0, w8
  ; CHECK-NEXT:  fmov x0, d0
  ; CHECK-NEXT:  ret
  entry:
diff --git a/test/CodeGen/AMDGPU/ds_read2.ll b/test/CodeGen/AMDGPU/ds_read2.ll

index ec04f8b1acd6a66eeead8b150eaf1d030c65309c..579f989faeb5e3aa1e0806077c53afa236a48e23 100644 (file)
--- a/test/CodeGen/AMDGPU/ds_read2.ll
+++ b/test/CodeGen/AMDGPU/ds_read2.ll
@@ -216,10 +216,8 @@ define void @read2_ptr_is_subreg_arg_offset_f32(float addrspace(1)* %out, <2 x f
    ret void
  }
  
-; We should be able to merge in this case, but probably not worth the effort.
-; SI-NOT: ds_read2_b32
-; SI: ds_read_b32
-; SI: ds_read_b32
+; SI-LABEL: {{^}}read2_ptr_is_subreg_f32:
+; SI: ds_read2_b32 {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset1:8{{$}}
  ; SI: s_endpgm
  define void @read2_ptr_is_subreg_f32(float addrspace(1)* %out) #0 {
    %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
diff --git a/test/CodeGen/AMDGPU/fceil64.ll b/test/CodeGen/AMDGPU/fceil64.ll

index e8c34f0141e406d87660db13d87eddc1c2fb0e51..c8ef5b101c4d0e16ed788dcedbfbcb9c48691911 100644 (file)
--- a/test/CodeGen/AMDGPU/fceil64.ll
+++ b/test/CodeGen/AMDGPU/fceil64.ll
@@ -17,12 +17,12 @@ declare <16 x double> @llvm.ceil.v16f64(<16 x double>) nounwind readnone
  ; SI: s_lshr_b64
  ; SI: s_not_b64
  ; SI: s_and_b64
-; SI: cmp_gt_i32
-; SI: cndmask_b32
-; SI: cndmask_b32
-; SI: cmp_lt_i32
-; SI: cndmask_b32
-; SI: cndmask_b32
+; SI-DAG: cmp_gt_i32
+; SI-DAG: cndmask_b32
+; SI-DAG: cndmask_b32
+; SI-DAG: cmp_lt_i32
+; SI-DAG: cndmask_b32
+; SI-DAG: cndmask_b32
  ; SI-DAG: v_cmp_lt_f64
  ; SI-DAG: v_cmp_lg_f64
  ; SI: s_and_b64
diff --git a/test/CodeGen/AMDGPU/ftrunc.f64.ll b/test/CodeGen/AMDGPU/ftrunc.f64.ll

index 6618d8b5e57e311ca264420904a6cacda6d7599c..83a8ad8901d2167c774b4d9798fe7932f8e56acb 100644 (file)
--- a/test/CodeGen/AMDGPU/ftrunc.f64.ll
+++ b/test/CodeGen/AMDGPU/ftrunc.f64.ll
@@ -29,12 +29,12 @@ define void @v_ftrunc_f64(double addrspace(1)* %out, double addrspace(1)* %in) {
  ; SI: s_lshr_b64
  ; SI: s_not_b64
  ; SI: s_and_b64
-; SI: cmp_gt_i32
-; SI: cndmask_b32
-; SI: cndmask_b32
-; SI: cmp_lt_i32
-; SI: cndmask_b32
-; SI: cndmask_b32
+; SI-DAG: cmp_gt_i32
+; SI-DAG: cndmask_b32
+; SI-DAG: cndmask_b32
+; SI-DAG: cmp_lt_i32
+; SI-DAG: cndmask_b32
+; SI-DAG: cndmask_b32
  ; SI: s_endpgm
  define void @ftrunc_f64(double addrspace(1)* %out, double %x) {
    %y = call double @llvm.trunc.f64(double %x) nounwind readnone
diff --git a/test/CodeGen/AMDGPU/gep-address-space.ll b/test/CodeGen/AMDGPU/gep-address-space.ll

index 471b0f6b13e78ca0641a0588afa29dba9b87a80e..f5ab390ce686db6dd8b6690225fa8b89e4910599 100644 (file)
--- a/test/CodeGen/AMDGPU/gep-address-space.ll
+++ b/test/CodeGen/AMDGPU/gep-address-space.ll
@@ -11,24 +11,35 @@ define void @use_gep_address_space([1024 x i32] addrspace(3)* %array) nounwind {
    ret void
  }
  
-define void @use_gep_address_space_large_offset([1024 x i32] addrspace(3)* %array) nounwind {
  ; CHECK-LABEL: {{^}}use_gep_address_space_large_offset:
  ; The LDS offset will be 65536 bytes, which is larger than the size of LDS on
  ; SI, which is why it is being OR'd with the base pointer.
  ; SI: s_or_b32
  ; CI: s_add_i32
  ; CHECK: ds_write_b32
+define void @use_gep_address_space_large_offset([1024 x i32] addrspace(3)* %array) nounwind {
    %p = getelementptr [1024 x i32], [1024 x i32] addrspace(3)* %array, i16 0, i16 16384
    store i32 99, i32 addrspace(3)* %p
    ret void
  }
  
-define void @gep_as_vector_v4(<4 x [1024 x i32] addrspace(3)*> %array) nounwind {
  ; CHECK-LABEL: {{^}}gep_as_vector_v4:
-; CHECK: s_add_i32
-; CHECK: s_add_i32
-; CHECK: s_add_i32
-; CHECK: s_add_i32
+; SI: s_add_i32
+; SI: s_add_i32
+; SI: s_add_i32
+; SI: s_add_i32
+
+; CHECK-DAG: v_mov_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}
+; CHECK-DAG: v_mov_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}
+; CHECK-DAG: v_mov_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}
+; CHECK-DAG: v_mov_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}
+
+; CI-DAG: ds_write_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:64
+; CI-DAG: ds_write_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:64
+; CI-DAG: ds_write_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:64
+; CI-DAG: ds_write_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:64
+; CHECK: s_endpgm
+define void @gep_as_vector_v4(<4 x [1024 x i32] addrspace(3)*> %array) nounwind {
    %p = getelementptr [1024 x i32], <4 x [1024 x i32] addrspace(3)*> %array, <4 x i16> zeroinitializer, <4 x i16> <i16 16, i16 16, i16 16, i16 16>
    %p0 = extractelement <4 x i32 addrspace(3)*> %p, i32 0
    %p1 = extractelement <4 x i32 addrspace(3)*> %p, i32 1
@@ -41,10 +52,15 @@ define void @gep_as_vector_v4(<4 x [1024 x i32] addrspace(3)*> %array) nounwind
    ret void
  }
  
-define void @gep_as_vector_v2(<2 x [1024 x i32] addrspace(3)*> %array) nounwind {
  ; CHECK-LABEL: {{^}}gep_as_vector_v2:
-; CHECK: s_add_i32
-; CHECK: s_add_i32
+; SI: s_add_i32
+; SI: s_add_i32
+; CHECK-DAG: v_mov_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}
+; CHECK-DAG: v_mov_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}
+; CI-DAG: ds_write_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:64
+; CI-DAG: ds_write_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:64
+; CHECK: s_endpgm
+define void @gep_as_vector_v2(<2 x [1024 x i32] addrspace(3)*> %array) nounwind {
    %p = getelementptr [1024 x i32], <2 x [1024 x i32] addrspace(3)*> %array, <2 x i16> zeroinitializer, <2 x i16> <i16 16, i16 16>
    %p0 = extractelement <2 x i32 addrspace(3)*> %p, i32 0
    %p1 = extractelement <2 x i32 addrspace(3)*> %p, i32 1
author	Matt Arsenault <Matthew.Arsenault@amd.com>
	Mon, 12 Oct 2015 23:59:50 +0000 (23:59 +0000)
committer	Matt Arsenault <Matthew.Arsenault@amd.com>
	Mon, 12 Oct 2015 23:59:50 +0000 (23:59 +0000)
include/llvm/Target/TargetLowering.h		patch \| blob \| history
lib/CodeGen/SelectionDAG/DAGCombiner.cpp		patch \| blob \| history
lib/Target/AMDGPU/AMDGPUISelLowering.cpp		patch \| blob \| history
lib/Target/AMDGPU/AMDGPUISelLowering.h		patch \| blob \| history
test/CodeGen/AArch64/fold-constants.ll		patch \| blob \| history
test/CodeGen/AMDGPU/ds_read2.ll		patch \| blob \| history
test/CodeGen/AMDGPU/fceil64.ll		patch \| blob \| history
test/CodeGen/AMDGPU/ftrunc.f64.ll		patch \| blob \| history
test/CodeGen/AMDGPU/gep-address-space.ll		patch \| blob \| history