Legalize vector truncates by parts rather than just splitting.

author Jim Grosbach <grosbach@apple.com>

Sun, 21 Apr 2013 23:47:41 +0000 (23:47 +0000)

committer Jim Grosbach <grosbach@apple.com>

Sun, 21 Apr 2013 23:47:41 +0000 (23:47 +0000)
author Jim Grosbach <grosbach@apple.com>
Sun, 21 Apr 2013 23:47:41 +0000 (23:47 +0000)
committer Jim Grosbach <grosbach@apple.com>
Sun, 21 Apr 2013 23:47:41 +0000 (23:47 +0000)
diff --git a/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/lib/CodeGen/SelectionDAG/LegalizeTypes.h

index adc0e9d86e5a345e122cdf21e9f4362f49ed91fd..1c4274a910896715e5042355f968aa74f4e2d6de 100644 (file)
--- a/lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ b/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -581,6 +581,7 @@ private:
    SDValue SplitVecOp_EXTRACT_VECTOR_ELT(SDNode *N);
    SDValue SplitVecOp_STORE(StoreSDNode *N, unsigned OpNo);
    SDValue SplitVecOp_CONCAT_VECTORS(SDNode *N);
+  SDValue SplitVecOp_TRUNCATE(SDNode *N);
    SDValue SplitVecOp_VSETCC(SDNode *N);
    SDValue SplitVecOp_FP_ROUND(SDNode *N);
  
diff --git a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp

index bd8b6ac0c7dbcf5d66ec541ca4f796b62a3a986a..04c6bfd0c23a4c079df0e542bb0d8281ee53c1dd 100644 (file)
--- a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -1046,6 +1046,7 @@ bool DAGTypeLegalizer::SplitVectorOperand(SDNode *N, unsigned OpNo) {
      case ISD::EXTRACT_SUBVECTOR: Res = SplitVecOp_EXTRACT_SUBVECTOR(N); break;
      case ISD::EXTRACT_VECTOR_ELT:Res = SplitVecOp_EXTRACT_VECTOR_ELT(N); break;
      case ISD::CONCAT_VECTORS:    Res = SplitVecOp_CONCAT_VECTORS(N); break;
+    case ISD::TRUNCATE:          Res = SplitVecOp_TRUNCATE(N); break;
      case ISD::FP_ROUND:          Res = SplitVecOp_FP_ROUND(N); break;
      case ISD::STORE:
        Res = SplitVecOp_STORE(cast<StoreSDNode>(N), OpNo);
@@ -1062,7 +1063,6 @@ bool DAGTypeLegalizer::SplitVectorOperand(SDNode *N, unsigned OpNo) {
      case ISD::SINT_TO_FP:
      case ISD::UINT_TO_FP:
      case ISD::FTRUNC:
-    case ISD::TRUNCATE:
      case ISD::SIGN_EXTEND:
      case ISD::ZERO_EXTEND:
      case ISD::ANY_EXTEND:
@@ -1293,6 +1293,66 @@ SDValue DAGTypeLegalizer::SplitVecOp_CONCAT_VECTORS(SDNode *N) {
                       &Elts[0], Elts.size());
  }
  
+SDValue DAGTypeLegalizer::SplitVecOp_TRUNCATE(SDNode *N) {
+  // The result type is legal, but the input type is illegal.  If splitting
+  // ends up with the result type of each half still being legal, just
+  // do that.  If, however, that would result in an illegal result type,
+  // we can try to get more clever with power-two vectors. Specifically,
+  // split the input type, but also widen the result element size, then
+  // concatenate the halves and truncate again.  For example, consider a target
+  // where v8i8 is legal and v8i32 is not (ARM, which doesn't have 256-bit
+  // vectors). To perform a "%res = v8i8 trunc v8i32 %in" we do:
+  //   %inlo = v4i32 extract_subvector %in, 0
+  //   %inhi = v4i32 extract_subvector %in, 4
+  //   %lo16 = v4i16 trunc v4i32 %inlo
+  //   %hi16 = v4i16 trunc v4i32 %inhi
+  //   %in16 = v8i16 concat_vectors v4i16 %lo16, v4i16 %hi16
+  //   %res = v8i8 trunc v8i16 %in16
+  //
+  // Without this transform, the original truncate would end up being
+  // scalarized, which is pretty much always a last resort.
+  SDValue InVec = N->getOperand(0);
+  EVT InVT = InVec->getValueType(0);
+  EVT OutVT = N->getValueType(0);
+  unsigned NumElements = OutVT.getVectorNumElements();
+  // Widening should already have made this a power-two vector if we're trying
+  // to split it at all. assert() that it is at least even, just in case.
+  assert(!(NumElements & 1) && "Splitting vector, but not in half!");
+
+  unsigned InElementSize = InVT.getVectorElementType().getSizeInBits();
+  unsigned OutElementSize = OutVT.getVectorElementType().getSizeInBits();
+
+  // If the input elements are at most twice the width of the result elements,
+  // just use the normal splitting. Our trick only works if there's room
+  // to split more than once.
+  if (InElementSize <= OutElementSize * 2)
+    return SplitVecOp_UnaryOp(N);
+  DebugLoc DL = N->getDebugLoc();
+
+  // Extract the halves of the input via extract_subvector.
+  EVT SplitVT = EVT::getVectorVT(*DAG.getContext(),
+                                 InVT.getVectorElementType(), NumElements/2);
+  SDValue InLoVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SplitVT, InVec,
+                                DAG.getIntPtrConstant(0));
+  SDValue InHiVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SplitVT, InVec,
+                                DAG.getIntPtrConstant(NumElements/2));
+  // Truncate them to 1/2 the element size.
+  EVT HalfElementVT = EVT::getIntegerVT(*DAG.getContext(), InElementSize/2);
+  EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), HalfElementVT,
+                                NumElements/2);
+  SDValue HalfLo = DAG.getNode(ISD::TRUNCATE, DL, HalfVT, InLoVec);
+  SDValue HalfHi = DAG.getNode(ISD::TRUNCATE, DL, HalfVT, InHiVec);
+  // Concatenate them to get the full intermediate truncation result.
+  EVT InterVT = EVT::getVectorVT(*DAG.getContext(), HalfElementVT, NumElements);
+  SDValue InterVec = DAG.getNode(ISD::CONCAT_VECTORS, DL, InterVT, HalfLo,
+                                 HalfHi);
+  // Now finish up by truncating all the way down to the original result
+  // type. This should normally be something that ends up being legal directly,
+  // but in theory if a target has very wide vectors and an annoyingly
+  // restricted set of legal types, this split can chain to build things up.
+  return DAG.getNode(ISD::TRUNCATE, DL, OutVT, InterVec);
+}
+
  SDValue DAGTypeLegalizer::SplitVecOp_VSETCC(SDNode *N) {
    assert(N->getValueType(0).isVector() &&
           N->getOperand(0).getValueType().isVector() &&
diff --git a/lib/Target/ARM/ARMTargetTransformInfo.cpp b/lib/Target/ARM/ARMTargetTransformInfo.cpp

index 1019b972e9575255f7a5def6265511140ea244d5..3149f19b787bc39f656216adc023f9152dc64d1a 100644 (file)
--- a/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ b/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -223,9 +223,9 @@ unsigned ARMTTI::getCastInstrCost(unsigned Opcode, Type *Dst,
      { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 6 },
      { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 6 },
  
-    // Operations that we legalize using load/stores to the stack.
-    { ISD::TRUNCATE,    MVT::v16i8, MVT::v16i32, 4*1 + 16*2 + 2*1 },
-    { ISD::TRUNCATE,    MVT::v8i8, MVT::v8i32, 2*1 + 8*2 + 1 },
+    // Operations that we legalize using splitting.
+    { ISD::TRUNCATE,    MVT::v16i8, MVT::v16i32, 6 },
+    { ISD::TRUNCATE,    MVT::v8i8, MVT::v8i32, 3 },
  
      // Vector float <-> i32 conversions.
      { ISD::SINT_TO_FP,  MVT::v4f32, MVT::v4i32, 1 },
diff --git a/test/Analysis/CostModel/ARM/cast.ll b/test/Analysis/CostModel/ARM/cast.ll

index ba9d84cf3e23ed248d12e2649b90633f2f5dbb52..0cdd61cac4f7f6a3ef7c0645eb4d76f8bd70a551 100644 (file)
--- a/test/Analysis/CostModel/ARM/cast.ll
+++ b/test/Analysis/CostModel/ARM/cast.ll
@@ -175,9 +175,9 @@ define i32 @casts() {
    %rext_5 = zext <4 x i16> undef to <4 x i64>
  
    ; Vector cast cost of instructions lowering the cast to the stack.
-  ; CHECK: cost of 19 {{.*}} trunc
+  ; CHECK: cost of 3 {{.*}} trunc
    %r74 = trunc <8 x i32> undef to <8 x i8>
-  ; CHECK: cost of 38 {{.*}} trunc
+  ; CHECK: cost of 6 {{.*}} trunc
    %r75 = trunc <16 x i32> undef to <16 x i8>
  
    ; Floating point truncation costs.
diff --git a/test/CodeGen/ARM/vcvt-cost.ll b/test/CodeGen/ARM/vcvt-cost.ll

index 04619b9c6cf6e0a5f73d7f136d1c2048e6cadbf4..0d45c40b8814e134dc02db81600ce50673a9ec4a 100644 (file)
--- a/test/CodeGen/ARM/vcvt-cost.ll
+++ b/test/CodeGen/ARM/vcvt-cost.ll
@@ -32,29 +32,22 @@ define void @func_cvt1(%TA0_5* %loadaddr, %TA1_5* %storeaddr) {
    store %TA1_5 %r, %TA1_5* %storeaddr
    ret void
  }
-;; We currently estimate the cost of this instruction as expensive. If lowering
-;; is improved the cost needs to change.
+
  %T0_51 = type <8 x i32>
  %T1_51 = type <8 x i8>
  ; CHECK: func_cvt51:
  define void @func_cvt51(%T0_51* %loadaddr, %T1_51* %storeaddr) {
-; CHECK: strb
-; CHECK: strb
-; CHECK: strb
-; CHECK: strb
-; CHECK: strb
-; CHECK: strb
-; CHECK: strb
-; CHECK: strb
+; CHECK: vmovn.i32
+; CHECK: vmovn.i32
+; CHECK: vmovn.i16
    %v0 = load %T0_51* %loadaddr
  ; COST: func_cvt51
-; COST: cost of 19 {{.*}} trunc
+; COST: cost of 3 {{.*}} trunc
    %r = trunc %T0_51 %v0 to %T1_51
    store %T1_51 %r, %T1_51* %storeaddr
    ret void
  }
-;; We currently estimate the cost of this instruction as expensive. If lowering
-;; is improved the cost needs to change.
+
  %TT0_5 = type <16 x i8>
  %TT1_5 = type <16 x i32>
  ; CHECK: func_cvt52:
@@ -87,31 +80,20 @@ define void @func_cvt12(%TTA0_5* %loadaddr, %TTA1_5* %storeaddr) {
    store %TTA1_5 %r, %TTA1_5* %storeaddr
    ret void
  }
-;; We currently estimate the cost of this instruction as expensive. If lowering
-;; is improved the cost needs to change.
+
  %TT0_51 = type <16 x i32>
  %TT1_51 = type <16 x i8>
  ; CHECK: func_cvt512:
  define void @func_cvt512(%TT0_51* %loadaddr, %TT1_51* %storeaddr) {
-; CHECK: strb
-; CHECK: strb
-; CHECK: strb
-; CHECK: strb
-; CHECK: strb
-; CHECK: strb
-; CHECK: strb
-; CHECK: strb
-; CHECK: strb
-; CHECK: strb
-; CHECK: strb
-; CHECK: strb
-; CHECK: strb
-; CHECK: strb
-; CHECK: strb
-; CHECK: strb
+; CHECK: vmovn.i32
+; CHECK: vmovn.i32
+; CHECK: vmovn.i32
+; CHECK: vmovn.i32
+; CHECK: vmovn.i16
+; CHECK: vmovn.i16
    %v0 = load %TT0_51* %loadaddr
  ; COST: func_cvt512
-; COST: cost of 38 {{.*}} trunc
+; COST: cost of 6 {{.*}} trunc
    %r = trunc %TT0_51 %v0 to %TT1_51
    store %TT1_51 %r, %TT1_51* %storeaddr
    ret void
author	Jim Grosbach <grosbach@apple.com>
	Sun, 21 Apr 2013 23:47:41 +0000 (23:47 +0000)
committer	Jim Grosbach <grosbach@apple.com>
	Sun, 21 Apr 2013 23:47:41 +0000 (23:47 +0000)
lib/CodeGen/SelectionDAG/LegalizeTypes.h		patch \| blob \| history
lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp		patch \| blob \| history
lib/Target/ARM/ARMTargetTransformInfo.cpp		patch \| blob \| history
test/Analysis/CostModel/ARM/cast.ll		patch \| blob \| history
test/CodeGen/ARM/vcvt-cost.ll		patch \| blob \| history