return DAG.getNode(ISD::BITCAST, dl, VT, BV);
}
+/// \brief Target-specific dag combine xforms for ARMISD::BUILD_VECTOR.
+static SDValue
+PerformARMBUILD_VECTORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
+ // ARMISD::BUILD_VECTOR is introduced when legalizing ISD::BUILD_VECTOR.
+ // At that time, we may have inserted bitcasts from integer to float.
+ // If these bitcasts have survived DAGCombine, change the lowering of this
+ // BUILD_VECTOR into something more vector friendly, i.e., a lowering that
+ // does not force the use of floating point types.
+
+ // Make sure we can change the type of the vector.
+ // This is possible iff:
+ // 1. The vector is only used in a bitcast to an integer type. I.e.,
+ // 1.1. Vector is used only once.
+ // 1.2. Use is a bit convert to an integer type.
+ // 2. The size of its operands is 32 bits (64-bit operands are not legal).
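+ // For example (illustrative), a DAG of the form
+ //   (v8i8 (bitcast (v2f32 ARMISD::BUILD_VECTOR (f32 (bitcast i32 A)),
+ //                                               (f32 (bitcast i32 B)))))
+ // where A and B are arbitrary i32 values, satisfies these conditions and
+ // can be rewritten to build a v2i32 vector directly.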
+ EVT VT = N->getValueType(0);
+ EVT EltVT = VT.getVectorElementType();
+
+ // Check 1.1. and 2.
+ if (EltVT.getSizeInBits() != 32 || !N->hasOneUse())
+ return SDValue();
+
+ // By construction, the input type must be float.
+ assert(EltVT == MVT::f32 && "Unexpected type!");
+
+ // Check 1.2.
+ SDNode *Use = *N->use_begin();
+ if (Use->getOpcode() != ISD::BITCAST ||
+ Use->getValueType(0).isFloatingPoint())
+ return SDValue();
+
+ // Check profitability.
+ // The model is: if more than half of the relevant operands are bitcast from
+ // i32, turn the build_vector into a sequence of insert_vector_elt.
+ // Relevant operands are all operands that are not statically
+ // (i.e., at compile time) bitcasted.
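+ // For example (illustrative), in a 4-element build_vector with one undef
+ // operand and two operands bitcast from i32, NumOfRelevantElts ends up as 3
+ // and NumOfBitCastedElts as 2, so the rewrite is considered profitable.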
+ unsigned NumOfBitCastedElts = 0;
+ unsigned NumElts = VT.getVectorNumElements();
+ unsigned NumOfRelevantElts = NumElts;
+ for (unsigned Idx = 0; Idx < NumElts; ++Idx) {
+ SDValue Elt = N->getOperand(Idx);
+ if (Elt->getOpcode() == ISD::BITCAST) {
+ // Assume only bit cast to i32 will go away.
+ if (Elt->getOperand(0).getValueType() == MVT::i32)
+ ++NumOfBitCastedElts;
+ } else if (Elt.getOpcode() == ISD::UNDEF || isa<ConstantSDNode>(Elt))
+ // Undefs and constants are bitcast statically (at compile time), thus do
+ // not count them as relevant operands.
+ --NumOfRelevantElts;
+ }
+
+ // Bail out unless more than half of the relevant elements require a
+ // non-free bitcast; otherwise the rewrite is not profitable.
+ if (NumOfBitCastedElts <= NumOfRelevantElts / 2)
+ return SDValue();
+
+ SelectionDAG &DAG = DCI.DAG;
+ // Create the new vector type.
+ EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts);
+ // Check if the type is legal.
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ if (!TLI.isTypeLegal(VecVT))
+ return SDValue();
+
+ // Combine:
+ // ARMISD::BUILD_VECTOR E1, E2, ..., EN.
+ // => BITCAST INSERT_VECTOR_ELT
+ // (INSERT_VECTOR_ELT (...), (BITCAST EN-1), N-1),
+ // (BITCAST EN), N.
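+ // Start from an undef vector of the new integer type; lanes whose operand
+ // is already a bitcast from i32 reuse the original i32 value, while the
+ // remaining lanes get a new bitcast that DAGCombine can try to fold.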
+ SDValue Vec = DAG.getUNDEF(VecVT);
+ SDLoc dl(N);
+ for (unsigned Idx = 0; Idx < NumElts; ++Idx) {
+ SDValue V = N->getOperand(Idx);
+ if (V.getOpcode() == ISD::UNDEF)
+ continue;
+ if (V.getOpcode() == ISD::BITCAST &&
+ V->getOperand(0).getValueType() == MVT::i32)
+ // Fold the obvious case: reuse the i32 value that was bitcast to f32.
+ V = V.getOperand(0);
+ else {
+ V = DAG.getNode(ISD::BITCAST, SDLoc(V), MVT::i32, V);
+ // Make the DAGCombiner fold the bitcasts.
+ DCI.AddToWorklist(V.getNode());
+ }
+ SDValue LaneIdx = DAG.getConstant(Idx, MVT::i32);
+ Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VecVT, Vec, V, LaneIdx);
+ }
+ Vec = DAG.getNode(ISD::BITCAST, dl, VT, Vec);
+ // Make the DAGCombiner fold the bitcasts.
+ DCI.AddToWorklist(Vec.getNode());
+ return Vec;
+}
+
/// PerformInsertEltCombine - Target-specific dag combine xforms for
/// ISD::INSERT_VECTOR_ELT.
static SDValue PerformInsertEltCombine(SDNode *N,
case ARMISD::VLD3DUP:
case ARMISD::VLD4DUP:
return CombineBaseUpdate(N, DCI);
+ case ARMISD::BUILD_VECTOR:
+ return PerformARMBUILD_VECTORCombine(N, DCI);
case ISD::INTRINSIC_VOID:
case ISD::INTRINSIC_W_CHAIN:
switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
store <16 x i8> %v1, <16 x i8>* %storeaddr
ret void
}
+
+; <rdar://problem/14170854>.
+; vldr cannot handle unaligned loads.
+; Fall back to vld1.32, which can handle unaligned accesses, instead of using
+; general purpose loads followed by a costly sequence of instructions to build
+; the vector register.
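+; The CHECK lines below verify that both i32 lane loads target the same
+; d-register, which is then fed directly to vmull.u8.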
+; CHECK: t3
+; CHECK: vld1.32 {[[REG:d[0-9]+]][0]}
+; CHECK: vld1.32 {[[REG]][1]}
+; CHECK: vmull.u8 q{{[0-9]+}}, [[REG]], [[REG]]
+define <8 x i16> @t3(i8 zeroext %xf, i8* nocapture %sp0, i8* nocapture %sp1, i32* nocapture %outp) {
+entry:
+ %pix_sp0.0.cast = bitcast i8* %sp0 to i32*
+ %pix_sp0.0.copyload = load i32* %pix_sp0.0.cast, align 1
+ %pix_sp1.0.cast = bitcast i8* %sp1 to i32*
+ %pix_sp1.0.copyload = load i32* %pix_sp1.0.cast, align 1
+ %vecinit = insertelement <2 x i32> undef, i32 %pix_sp0.0.copyload, i32 0
+ %vecinit1 = insertelement <2 x i32> %vecinit, i32 %pix_sp1.0.copyload, i32 1
+ %0 = bitcast <2 x i32> %vecinit1 to <8 x i8>
+ %vmull.i = tail call <8 x i16> @llvm.arm.neon.vmullu.v8i16(<8 x i8> %0, <8 x i8> %0)
+ ret <8 x i16> %vmull.i
+}
+
+; Function Attrs: nounwind readnone
+declare <8 x i16> @llvm.arm.neon.vmullu.v8i16(<8 x i8>, <8 x i8>)