From ad69a64d4aac849d14e51af0bb547a542d4b50bf Mon Sep 17 00:00:00 2001
From: Chad Rosier
Date: Thu, 3 Sep 2015 18:13:57 +0000
Subject: [PATCH] [AArch64] Improve ISel using across lane addition reduction.

In vectorized add reduction code, the final "reduce" step is sub-optimal.
This change will combine:
      ext  v1.16b, v0.16b, v0.16b, #8
      add  v0.4s, v1.4s, v0.4s
      dup  v1.4s, v0.s[1]
      add  v0.4s, v1.4s, v0.4s
into
      addv s0, v0.4s

PR21371
http://reviews.llvm.org/D12325
Patch by Jun Bum Lim!

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@246790 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/AArch64/AArch64ISelLowering.cpp | 99 ++++++++++++++++++++++
 test/CodeGen/AArch64/aarch64-addv.ll       | 53 ++++++++++++
 2 files changed, 152 insertions(+)
 create mode 100644 test/CodeGen/AArch64/aarch64-addv.ll

diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp
index 20ed0016a79..b7f56472a4e 100644
--- a/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -495,6 +495,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
   setTargetDAGCombine(ISD::INTRINSIC_VOID);
   setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN);
   setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
+  setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
 
   MaxStoresPerMemset = MaxStoresPerMemsetOptSize = 8;
   MaxStoresPerMemcpy = MaxStoresPerMemcpyOptSize = 4;
@@ -8584,6 +8585,102 @@ static SDValue performPostLD1Combine(SDNode *N,
   return SDValue();
 }
 
+/// Target-specific DAG combine for the across vector reduction.
+/// This function specifically handles the final clean-up step of a vector
+/// reduction produced by the LoopVectorizer. It is the log2-shuffle pattern,
+/// consisting of log2(NumVectorElements) steps, and in each step 2^(s)
+/// elements are reduced, where s is an induction variable running from 0
+/// to log2(NumVectorElements).
+/// For example,
+///   %1 = vector_shuffle %0, <2,3,u,u>
+///   %2 = add %0, %1
+///   %3 = vector_shuffle %2, <1,u,u,u>
+///   %4 = add %2, %3
+///   %5 = extract_vector_elt %4, 0
+/// becomes:
+///   %0 = uaddv %0
+///   %1 = extract_vector_elt %0, 0
+///
+/// FIXME: Currently this function is implemented and tested specifically
+/// for the add reduction. We could also support other types of across lane
+/// reduction available in AArch64, including SMAXV, SMINV, UMAXV, UMINV,
+/// SADDLV, UADDLV, FMAXNMV, FMAXV, FMINNMV, FMINV.
+static SDValue
+performAcrossLaneReductionCombine(SDNode *N, SelectionDAG &DAG,
+                                  const AArch64Subtarget *Subtarget) {
+  if (!Subtarget->hasNEON())
+    return SDValue();
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+
+  // Check if the input vector is fed by the operator we want to handle.
+  // We specifically check only ADD for now.
+  if (N0->getOpcode() != ISD::ADD)
+    return SDValue();
+
+  // The vector extract idx must be constant zero because we only expect the
+  // final result of the reduction to be placed in lane 0.
+  if (!isa<ConstantSDNode>(N1) || cast<ConstantSDNode>(N1)->getZExtValue())
+    return SDValue();
+
+  EVT EltTy = N0.getValueType().getVectorElementType();
+  if (EltTy != MVT::i32 && EltTy != MVT::i16 && EltTy != MVT::i8)
+    return SDValue();
+
+  int NumVecElts = N0.getValueType().getVectorNumElements();
+  if (NumVecElts != 4 && NumVecElts != 8 && NumVecElts != 16)
+    return SDValue();
+
+  int NumExpectedSteps = APInt(8, NumVecElts).logBase2();
+  SDValue PreOp = N0;
+  // Iterate over each step of the across vector reduction.
+  for (int CurStep = 0; CurStep != NumExpectedSteps; ++CurStep) {
+    // We specifically check ADD for now.
+    if (PreOp.getOpcode() != ISD::ADD)
+      return SDValue();
+    SDValue CurOp = PreOp.getOperand(0);
+    SDValue Shuffle = PreOp.getOperand(1);
+    if (Shuffle.getOpcode() != ISD::VECTOR_SHUFFLE) {
+      // Try to swap the 1st and 2nd operand as add is commutative.
+      CurOp = PreOp.getOperand(1);
+      Shuffle = PreOp.getOperand(0);
+      if (Shuffle.getOpcode() != ISD::VECTOR_SHUFFLE)
+        return SDValue();
+    }
+    // Check if it forms one step of the across vector reduction.
+    // E.g.,
+    //   %cur = add %1, %0
+    //   %shuffle = vector_shuffle %cur, <2, 3, u, u>
+    //   %pre = add %cur, %shuffle
+    if (Shuffle.getOperand(0) != CurOp)
+      return SDValue();
+
+    int NumMaskElts = 1 << CurStep;
+    ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Shuffle)->getMask();
+    // Check mask values in each step.
+    // We expect the shuffle mask in each step to follow a specific pattern
+    // denoted here by the form <M, U>, where M is a sequence of integers
+    // starting from NumMaskElts and increasing by 1, and the number of
+    // integers in M should be NumMaskElts. U is a sequence of UNDEFs and
+    // the number of UNDEFs in U should be NumVecElts - NumMaskElts.
+    // E.g., for <8 x i16>, the mask values in each step should be:
+    //   step 0 : <1,u,u,u,u,u,u,u>
+    //   step 1 : <2,3,u,u,u,u,u,u>
+    //   step 2 : <4,5,6,7,u,u,u,u>
+    for (int i = 0; i < NumVecElts; ++i)
+      if ((i < NumMaskElts && Mask[i] != (NumMaskElts + i)) ||
+          (i >= NumMaskElts && !(Mask[i] < 0)))
+        return SDValue();
+
+    PreOp = CurOp;
+  }
+  SDLoc DL(N);
+  return DAG.getNode(
+      ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0),
+      DAG.getNode(AArch64ISD::UADDV, DL, PreOp.getSimpleValueType(), PreOp),
+      DAG.getConstant(0, DL, MVT::i64));
+}
+
 /// Target-specific DAG combine function for NEON load/store intrinsics
 /// to merge base address updates.
 static SDValue performNEONPostLDSTCombine(SDNode *N,
@@ -9178,6 +9275,8 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
     return performNVCASTCombine(N);
   case ISD::INSERT_VECTOR_ELT:
     return performPostLD1Combine(N, DCI, true);
+  case ISD::EXTRACT_VECTOR_ELT:
+    return performAcrossLaneReductionCombine(N, DAG, Subtarget);
   case ISD::INTRINSIC_VOID:
   case ISD::INTRINSIC_W_CHAIN:
     switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
diff --git a/test/CodeGen/AArch64/aarch64-addv.ll b/test/CodeGen/AArch64/aarch64-addv.ll
new file mode 100644
index 00000000000..3be3c556d27
--- /dev/null
+++ b/test/CodeGen/AArch64/aarch64-addv.ll
@@ -0,0 +1,53 @@
+; RUN: llc -march=aarch64 < %s | FileCheck %s
+
+define i8 @f_v16i8(<16 x i8>* %arr) {
+; CHECK-LABEL: f_v16i8
+; CHECK: addv {{b[0-9]+}}, {{v[0-9]+}}.16b
+  %bin.rdx = load <16 x i8>, <16 x i8>* %arr
+  %rdx.shuf0 = shufflevector <16 x i8> %bin.rdx, <16 x i8> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx0 = add <16 x i8> %bin.rdx, %rdx.shuf0
+  %rdx.shuf = shufflevector <16 x i8> %bin.rdx0, <16 x i8> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx11 = add <16 x i8> %bin.rdx0, %rdx.shuf
+  %rdx.shuf12 = shufflevector <16 x i8> %bin.rdx11, <16 x i8> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx13 = add <16 x i8> %bin.rdx11, %rdx.shuf12
+  %rdx.shuf13 = shufflevector <16 x i8> %bin.rdx13, <16 x i8> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx14 = add <16 x i8> %bin.rdx13, %rdx.shuf13
+  %r = extractelement <16 x i8> %bin.rdx14, i32 0
+  ret i8 %r
+}
+
+define i16 @f_v8i16(<8 x i16>* %arr) {
+; CHECK-LABEL: f_v8i16
+; CHECK: addv {{h[0-9]+}}, {{v[0-9]+}}.8h
+  %bin.rdx = load <8 x i16>, <8 x i16>* %arr
+  %rdx.shuf = shufflevector <8 x i16> %bin.rdx, <8 x i16> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx11 = add <8 x i16> %bin.rdx, %rdx.shuf
+  %rdx.shuf12 = shufflevector <8 x i16> %bin.rdx11, <8 x i16> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx13 = add <8 x i16> %bin.rdx11, %rdx.shuf12
+  %rdx.shuf13 = shufflevector <8 x i16> %bin.rdx13, <8 x i16> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx14 = add <8 x i16> %bin.rdx13, %rdx.shuf13
+  %r = extractelement <8 x i16> %bin.rdx14, i32 0
+  ret i16 %r
+}
+
+define i32 @f_v4i32(<4 x i32>* %arr) {
+; CHECK-LABEL: f_v4i32
+; CHECK: addv {{s[0-9]+}}, {{v[0-9]+}}.4s
+  %bin.rdx = load <4 x i32>, <4 x i32>* %arr
+  %rdx.shuf = shufflevector <4 x i32> %bin.rdx, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+  %bin.rdx11 = add <4 x i32> %bin.rdx, %rdx.shuf
+  %rdx.shuf12 = shufflevector <4 x i32> %bin.rdx11, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+  %bin.rdx13 = add <4 x i32> %bin.rdx11, %rdx.shuf12
+  %r = extractelement <4 x i32> %bin.rdx13, i32 0
+  ret i32 %r
+}
+
+define i64 @f_v2i64(<2 x i64>* %arr) {
+; CHECK-LABEL: f_v2i64
+; CHECK-NOT: addv
+  %bin.rdx = load <2 x i64>, <2 x i64>* %arr
+  %rdx.shuf0 = shufflevector <2 x i64> %bin.rdx, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
+  %bin.rdx0 = add <2 x i64> %bin.rdx, %rdx.shuf0
+  %r = extractelement <2 x i64> %bin.rdx0, i32 0
+  ret i64 %r
+}
-- 
2.34.1
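
Note (not part of the patch above): the mask check at the heart of performAcrossLaneReductionCombine can be illustrated in isolation. The following standalone C++ sketch is an assumption-laden illustration, not LLVM code; the helper name matchesStepMask and the sample masks are invented for the example. It encodes UNDEF lanes as -1, mirroring how ISD shuffle masks represent undef, and checks the <M, U> pattern described in the function's comments. It builds with any C++ compiler and needs no LLVM headers.

  // Standalone illustration of the <M, U> shuffle-mask pattern the combine
  // expects at each reduction step. UNDEF lanes are encoded as -1.
  #include <cstdio>
  #include <vector>

  // True if Mask has NumMaskElts values starting at NumMaskElts and
  // increasing by 1, followed by NumVecElts - NumMaskElts undef (-1) lanes,
  // where NumMaskElts == 1 << CurStep.
  static bool matchesStepMask(const std::vector<int> &Mask, int NumVecElts,
                              int CurStep) {
    int NumMaskElts = 1 << CurStep;
    if (static_cast<int>(Mask.size()) != NumVecElts)
      return false;
    for (int i = 0; i < NumVecElts; ++i) {
      bool Bad =
          (i < NumMaskElts) ? (Mask[i] != NumMaskElts + i) : (Mask[i] >= 0);
      if (Bad)
        return false;
    }
    return true;
  }

  int main() {
    // A <4 x i32> reduction uses <2,3,u,u> at step 1 and <1,u,u,u> at step 0,
    // matching the example in the patch's function comment.
    std::vector<int> Step1 = {2, 3, -1, -1};
    std::vector<int> Step0 = {1, -1, -1, -1};
    std::printf("step 1 mask ok: %d\n", matchesStepMask(Step1, 4, 1));
    std::printf("step 0 mask ok: %d\n", matchesStepMask(Step0, 4, 0));
    return 0;
  }

Both checks print 1, which is the condition under which the combine walks one more step toward rewriting the whole chain as a single UADDV.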