From ad69a64d4aac849d14e51af0bb547a542d4b50bf Mon Sep 17 00:00:00 2001
From: Chad Rosier
Date: Thu, 3 Sep 2015 18:13:57 +0000
Subject: [PATCH] [AArch64] Improve ISel using across lane addition reduction.

In vectorized add reduction code, the final "reduce" step is sub-optimal.
This change will combine:
      ext  v1.16b, v0.16b, v0.16b, #8
      add  v0.4s, v1.4s, v0.4s
      dup  v1.4s, v0.s[1]
      add  v0.4s, v1.4s, v0.4s
into
      addv s0, v0.4s

PR21371
http://reviews.llvm.org/D12325
Patch by Jun Bum Lim!

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@246790 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/AArch64/AArch64ISelLowering.cpp | 99 ++++++++++++++++++++++
 test/CodeGen/AArch64/aarch64-addv.ll       | 53 ++++++++++++
 2 files changed, 152 insertions(+)
 create mode 100644 test/CodeGen/AArch64/aarch64-addv.ll

diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp
index 20ed0016a79..b7f56472a4e 100644
--- a/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -495,6 +495,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
   setTargetDAGCombine(ISD::INTRINSIC_VOID);
   setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN);
   setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
+  setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
 
   MaxStoresPerMemset = MaxStoresPerMemsetOptSize = 8;
   MaxStoresPerMemcpy = MaxStoresPerMemcpyOptSize = 4;
@@ -8584,6 +8585,102 @@ static SDValue performPostLD1Combine(SDNode *N,
   return SDValue();
 }
 
+/// Target-specific DAG combine for the across vector reduction.
+/// This function specifically handles the final clean-up step of a vector
+/// reduction produced by the LoopVectorizer. It is the log2-shuffle pattern,
+/// consisting of log2(NumVectorElements) steps, and in each step 2^(s)
+/// elements are reduced, where s is an induction variable running from 0
+/// to log2(NumVectorElements).
+/// For example,
+///   %1 = vector_shuffle %0, <2,3,u,u>
+///   %2 = add %0, %1
+///   %3 = vector_shuffle %2, <1,u,u,u>
+///   %4 = add %2, %3
+///   %5 = extract_vector_elt %4, 0
+/// becomes:
+///   %0 = uaddv %0
+///   %1 = extract_vector_elt %0, 0
+///
+/// FIXME: Currently this function is implemented and tested specifically
+/// for the add reduction. We could also support other types of across lane
+/// reduction available in AArch64, including SMAXV, SMINV, UMAXV, UMINV,
+/// SADDLV, UADDLV, FMAXNMV, FMAXV, FMINNMV, FMINV.
+static SDValue
+performAcrossLaneReductionCombine(SDNode *N, SelectionDAG &DAG,
+                                  const AArch64Subtarget *Subtarget) {
+  if (!Subtarget->hasNEON())
+    return SDValue();
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+
+  // Check if the input vector is fed by the operator we want to handle.
+  // We specifically check only ADD for now.
+  if (N0->getOpcode() != ISD::ADD)
+    return SDValue();
+
+  // The vector extract idx must be constant zero because we only expect the
+  // final result of the reduction to be placed in lane 0.
+  if (!isa<ConstantSDNode>(N1) || cast<ConstantSDNode>(N1)->getZExtValue())
+    return SDValue();
+
+  EVT EltTy = N0.getValueType().getVectorElementType();
+  if (EltTy != MVT::i32 && EltTy != MVT::i16 && EltTy != MVT::i8)
+    return SDValue();
+
+  int NumVecElts = N0.getValueType().getVectorNumElements();
+  if (NumVecElts != 4 && NumVecElts != 8 && NumVecElts != 16)
+    return SDValue();
+
+  int NumExpectedSteps = APInt(8, NumVecElts).logBase2();
+  SDValue PreOp = N0;
+  // Iterate over each step of the across vector reduction.
+  for (int CurStep = 0; CurStep != NumExpectedSteps; ++CurStep) {
+    // We specifically check ADD for now.
+    if (PreOp.getOpcode() != ISD::ADD)
+      return SDValue();
+    SDValue CurOp = PreOp.getOperand(0);
+    SDValue Shuffle = PreOp.getOperand(1);
+    if (Shuffle.getOpcode() != ISD::VECTOR_SHUFFLE) {
+      // Try to swap the 1st and 2nd operand as add is commutative.
+      CurOp = PreOp.getOperand(1);
+      Shuffle = PreOp.getOperand(0);
+      if (Shuffle.getOpcode() != ISD::VECTOR_SHUFFLE)
+        return SDValue();
+    }
+    // Check if it forms one step of the across vector reduction.
+    // E.g.,
+    //   %cur = add %1, %0
+    //   %shuffle = vector_shuffle %cur, <2, 3, u, u>
+    //   %pre = add %cur, %shuffle
+    if (Shuffle.getOperand(0) != CurOp)
+      return SDValue();
+
+    int NumMaskElts = 1 << CurStep;
+    ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Shuffle)->getMask();
+    // Check mask values in each step.
+    // We expect the shuffle mask in each step to follow a specific pattern
+    // denoted here by the form <M, U>, where M is a sequence of integers
+    // starting from NumMaskElts and increasing by 1, and the number of
+    // integers in M should be NumMaskElts. U is a sequence of UNDEFs and
+    // the number of UNDEFs in U should be NumVecElts - NumMaskElts.
+    // E.g., for <8 x i16>, the mask values in each step should be:
+    //   step 0 : <1,u,u,u,u,u,u,u>
+    //   step 1 : <2,3,u,u,u,u,u,u>
+    //   step 2 : <4,5,6,7,u,u,u,u>
+    for (int i = 0; i < NumVecElts; ++i)
+      if ((i < NumMaskElts && Mask[i] != (NumMaskElts + i)) ||
+          (i >= NumMaskElts && !(Mask[i] < 0)))
+        return SDValue();
+
+    PreOp = CurOp;
+  }
+  SDLoc DL(N);
+  return DAG.getNode(
+      ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0),
+      DAG.getNode(AArch64ISD::UADDV, DL, PreOp.getSimpleValueType(), PreOp),
+      DAG.getConstant(0, DL, MVT::i64));
+}
+
 /// Target-specific DAG combine function for NEON load/store intrinsics
 /// to merge base address updates.
 static SDValue performNEONPostLDSTCombine(SDNode *N,
@@ -9178,6 +9275,8 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
     return performNVCASTCombine(N);
   case ISD::INSERT_VECTOR_ELT:
     return performPostLD1Combine(N, DCI, true);
+  case ISD::EXTRACT_VECTOR_ELT:
+    return performAcrossLaneReductionCombine(N, DAG, Subtarget);
   case ISD::INTRINSIC_VOID:
   case ISD::INTRINSIC_W_CHAIN:
     switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
diff --git a/test/CodeGen/AArch64/aarch64-addv.ll b/test/CodeGen/AArch64/aarch64-addv.ll
new file mode 100644
index 00000000000..3be3c556d27
--- /dev/null
+++ b/test/CodeGen/AArch64/aarch64-addv.ll
@@ -0,0 +1,53 @@
+; RUN: llc -march=aarch64 < %s | FileCheck %s
+
+define i8 @f_v16i8(<16 x i8>* %arr) {
+; CHECK-LABEL: f_v16i8
+; CHECK: addv {{b[0-9]+}}, {{v[0-9]+}}.16b
+  %bin.rdx = load <16 x i8>, <16 x i8>* %arr
+  %rdx.shuf0 = shufflevector <16 x i8> %bin.rdx, <16 x i8> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx0 = add <16 x i8> %bin.rdx, %rdx.shuf0
+  %rdx.shuf = shufflevector <16 x i8> %bin.rdx0, <16 x i8> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx11 = add <16 x i8> %bin.rdx0, %rdx.shuf
+  %rdx.shuf12 = shufflevector <16 x i8> %bin.rdx11, <16 x i8> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx13 = add <16 x i8> %bin.rdx11, %rdx.shuf12
+  %rdx.shuf13 = shufflevector <16 x i8> %bin.rdx13, <16 x i8> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx14 = add <16 x i8> %bin.rdx13, %rdx.shuf13
+  %r = extractelement <16 x i8> %bin.rdx14, i32 0
+  ret i8 %r
+}
+
+define i16 @f_v8i16(<8 x i16>* %arr) {
+; CHECK-LABEL: f_v8i16
+; CHECK: addv {{h[0-9]+}}, {{v[0-9]+}}.8h
+  %bin.rdx = load <8 x i16>, <8 x i16>* %arr
+  %rdx.shuf = shufflevector <8 x i16> %bin.rdx, <8 x i16> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx11 = add <8 x i16> %bin.rdx, %rdx.shuf
+  %rdx.shuf12 = shufflevector <8 x i16> %bin.rdx11, <8 x i16> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx13 = add <8 x i16> %bin.rdx11, %rdx.shuf12
+  %rdx.shuf13 = shufflevector <8 x i16> %bin.rdx13, <8 x i16> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx14 = add <8 x i16> %bin.rdx13, %rdx.shuf13
+  %r = extractelement <8 x i16> %bin.rdx14, i32 0
+  ret i16 %r
+}
+
+define i32 @f_v4i32(<4 x i32>* %arr) {
+; CHECK-LABEL: f_v4i32
+; CHECK: addv {{s[0-9]+}}, {{v[0-9]+}}.4s
+  %bin.rdx = load <4 x i32>, <4 x i32>* %arr
+  %rdx.shuf = shufflevector <4 x i32> %bin.rdx, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+  %bin.rdx11 = add <4 x i32> %bin.rdx, %rdx.shuf
+  %rdx.shuf12 = shufflevector <4 x i32> %bin.rdx11, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+  %bin.rdx13 = add <4 x i32> %bin.rdx11, %rdx.shuf12
+  %r = extractelement <4 x i32> %bin.rdx13, i32 0
+  ret i32 %r
+}
+
+define i64 @f_v2i64(<2 x i64>* %arr) {
+; CHECK-LABEL: f_v2i64
+; CHECK-NOT: addv
+  %bin.rdx = load <2 x i64>, <2 x i64>* %arr
+  %rdx.shuf0 = shufflevector <2 x i64> %bin.rdx, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
+  %bin.rdx0 = add <2 x i64> %bin.rdx, %rdx.shuf0
+  %r = extractelement <2 x i64> %bin.rdx0, i32 0
+  ret i64 %r
+}
-- 
2.34.1
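
Note (not part of the patch above): the mask check at the heart of performAcrossLaneReductionCombine can be illustrated in isolation. The following standalone C++ sketch is an assumption-laden illustration, not LLVM code; the helper name matchesStepMask and the sample masks are invented for the example. It encodes UNDEF lanes as -1, mirroring how ISD shuffle masks represent undef, and checks the <M, U> pattern described in the function's comments. It builds with any C++ compiler and needs no LLVM headers.

  // Standalone illustration of the <M, U> shuffle-mask pattern the combine
  // expects at each reduction step. UNDEF lanes are encoded as -1.
  #include <cstdio>
  #include <vector>

  // True if Mask has NumMaskElts values starting at NumMaskElts and
  // increasing by 1, followed by NumVecElts - NumMaskElts undef (-1) lanes,
  // where NumMaskElts == 1 << CurStep.
  static bool matchesStepMask(const std::vector<int> &Mask, int NumVecElts,
                              int CurStep) {
    int NumMaskElts = 1 << CurStep;
    if (static_cast<int>(Mask.size()) != NumVecElts)
      return false;
    for (int i = 0; i < NumVecElts; ++i) {
      bool Bad =
          (i < NumMaskElts) ? (Mask[i] != NumMaskElts + i) : (Mask[i] >= 0);
      if (Bad)
        return false;
    }
    return true;
  }

  int main() {
    // A <4 x i32> reduction uses <2,3,u,u> at step 1 and <1,u,u,u> at step 0,
    // matching the example in the patch's function comment.
    std::vector<int> Step1 = {2, 3, -1, -1};
    std::vector<int> Step0 = {1, -1, -1, -1};
    std::printf("step 1 mask ok: %d\n", matchesStepMask(Step1, 4, 1));
    std::printf("step 0 mask ok: %d\n", matchesStepMask(Step0, 4, 0));
    return 0;
  }

Both checks print 1, which is the condition under which the combine walks one more step toward rewriting the whole chain as a single UADDV.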