From: Jun Bum Lim Date: Mon, 14 Sep 2015 16:19:52 +0000 (+0000) Subject: Improve ISel using across lane min/max reduction X-Git-Url: http://demsky.eecs.uci.edu/git/?a=commitdiff_plain;h=5ffe2bacea665e69eebcd3389ebbc5d4521d680d;p=oota-llvm.git Improve ISel using across lane min/max reduction In vectorized integer min/max reduction code, the final "reduce" step is sub-optimal. In AArch64, this change wll combine : %svn0 = vector_shuffle %0, undef<2,3,u,u> %smax0 = smax %0, svn0 %svn3 = vector_shuffle %smax0, undef<1,u,u,u> %sc = setcc %smax0, %svn3, gt %n0 = extract_vector_elt %sc, #0 %n1 = extract_vector_elt %smax0, #0 %n2 = extract_vector_elt $smax0, #1 %result = select %n0, %n1, n2 becomes : %1 = smaxv %0 %result = extract_vector_elt %1, 0 This change extends r246790. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@247575 91177308-0d34-0410-b5e6-96231b3b80d8 --- diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp index 3717297ae87..3c1251e2695 100644 --- a/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -8585,68 +8585,43 @@ static SDValue performPostLD1Combine(SDNode *N, return SDValue(); } -/// Target-specific DAG combine for the across vector reduction. -/// This function specifically handles the final clean-up step of a vector -/// reduction produced by the LoopVectorizer. It is the log2-shuffle pattern, -/// consisting of log2(NumVectorElements) steps and, in each step, 2^(s) -/// elements are reduced, where s is an induction variable from 0 -/// to log2(NumVectorElements). -/// For example, -/// %1 = vector_shuffle %0, <2,3,u,u> -/// %2 = add %0, %1 -/// %3 = vector_shuffle %2, <1,u,u,u> -/// %4 = add %2, %3 -/// %5 = extract_vector_elt %4, 0 -/// becomes : -/// %0 = uaddv %0 -/// %1 = extract_vector_elt %0, 0 -/// -/// FIXME: Currently this function is implemented and tested specifically -/// for the add reduction. We could also support other types of across lane -/// reduction available in AArch64, including SMAXV, SMINV, UMAXV, UMINV, -/// SADDLV, UADDLV, FMAXNMV, FMAXV, FMINNMV, FMINV. -static SDValue -performAcrossLaneReductionCombine(SDNode *N, SelectionDAG &DAG, - const AArch64Subtarget *Subtarget) { - if (!Subtarget->hasNEON()) +/// This function handles the log2-shuffle pattern produced by the +/// LoopVectorizer for the across vector reduction. It consists of +/// log2(NumVectorElements) steps and, in each step, 2^(s) elements +/// are reduced, where s is an induction variable from 0 to +/// log2(NumVectorElements). +static SDValue tryMatchAcrossLaneShuffleForReduction(SDNode *N, SDValue OpV, + unsigned Op, + SelectionDAG &DAG) { + EVT VTy = OpV->getOperand(0).getValueType(); + if (!VTy.isVector()) return SDValue(); - SDValue N0 = N->getOperand(0); - SDValue N1 = N->getOperand(1); - // Check if the input vector is fed by the operator we want to handle. - // We specifically check only ADD for now. - if (N0->getOpcode() != ISD::ADD) - return SDValue(); - - // The vector extract idx must constant zero because we only expect the final - // result of the reduction is placed in lane 0. - if (!isa(N1) || cast(N1)->getZExtValue()) - return SDValue(); - - EVT EltTy = N0.getValueType().getVectorElementType(); - if (EltTy != MVT::i32 && EltTy != MVT::i16 && EltTy != MVT::i8) - return SDValue(); - - int NumVecElts = N0.getValueType().getVectorNumElements(); + int NumVecElts = VTy.getVectorNumElements(); if (NumVecElts != 4 && NumVecElts != 8 && NumVecElts != 16) return SDValue(); int NumExpectedSteps = APInt(8, NumVecElts).logBase2(); - SDValue PreOp = N0; + SDValue PreOp = OpV; // Iterate over each step of the across vector reduction. for (int CurStep = 0; CurStep != NumExpectedSteps; ++CurStep) { - // We specifically check ADD for now. - if (PreOp.getOpcode() != ISD::ADD) - return SDValue(); SDValue CurOp = PreOp.getOperand(0); SDValue Shuffle = PreOp.getOperand(1); if (Shuffle.getOpcode() != ISD::VECTOR_SHUFFLE) { - // Try to swap the 1st and 2nd operand as add is commutative. + // Try to swap the 1st and 2nd operand as add and min/max instructions + // are commutative. CurOp = PreOp.getOperand(1); Shuffle = PreOp.getOperand(0); if (Shuffle.getOpcode() != ISD::VECTOR_SHUFFLE) return SDValue(); } + + // Check if the input vector is fed by the operator we want to handle, + // except the last step; the very first input vector is not necessarily + // the same operator we are handling. + if (CurOp.getOpcode() != Op && (CurStep != (NumExpectedSteps - 1))) + return SDValue(); + // Check if it forms one step of the across vector reduction. // E.g., // %cur = add %1, %0 @@ -8674,11 +8649,169 @@ performAcrossLaneReductionCombine(SDNode *N, SelectionDAG &DAG, PreOp = CurOp; } + unsigned Opcode; + switch (Op) { + default: + llvm_unreachable("Unexpected operator for across vector reduction"); + case ISD::ADD: + Opcode = AArch64ISD::UADDV; + break; + case ISD::SMAX: + Opcode = AArch64ISD::SMAXV; + break; + case ISD::UMAX: + Opcode = AArch64ISD::UMAXV; + break; + case ISD::SMIN: + Opcode = AArch64ISD::SMINV; + break; + case ISD::UMIN: + Opcode = AArch64ISD::UMINV; + break; + } SDLoc DL(N); - return DAG.getNode( - ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0), - DAG.getNode(AArch64ISD::UADDV, DL, PreOp.getSimpleValueType(), PreOp), - DAG.getConstant(0, DL, MVT::i64)); + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0), + DAG.getNode(Opcode, DL, PreOp.getSimpleValueType(), PreOp), + DAG.getConstant(0, DL, MVT::i64)); +} + +/// Target-specific DAG combine for the across vector min/max reductions. +/// This function specifically handles the final clean-up step of the vector +/// min/max reductions produced by the LoopVectorizer. It is the log2-shuffle +/// pattern, which narrows down and finds the final min/max value from all +/// elements of the vector. +/// For example, for a <16 x i8> vector : +/// svn0 = vector_shuffle %0, undef<8,9,10,11,12,13,14,15,u,u,u,u,u,u,u,u> +/// %smax0 = smax %arr, svn0 +/// %svn1 = vector_shuffle %smax0, undef<4,5,6,7,u,u,u,u,u,u,u,u,u,u,u,u> +/// %smax1 = smax %smax0, %svn1 +/// %svn2 = vector_shuffle %smax1, undef<2,3,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +/// %smax2 = smax %smax1, svn2 +/// %svn3 = vector_shuffle %smax2, undef<1,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +/// %sc = setcc %smax2, %svn3, gt +/// %n0 = extract_vector_elt %sc, #0 +/// %n1 = extract_vector_elt %smax2, #0 +/// %n2 = extract_vector_elt $smax2, #1 +/// %result = select %n0, %n1, n2 +/// becomes : +/// %1 = smaxv %0 +/// %result = extract_vector_elt %1, 0 +/// FIXME: Currently this function matches only SMAXV, UMAXV, SMINV, and UMINV. +/// We could also support other types of across lane reduction available +/// in AArch64, including FMAXNMV, FMAXV, FMINNMV, and FMINV. +static SDValue +performAcrossLaneMinMaxReductionCombine(SDNode *N, SelectionDAG &DAG, + const AArch64Subtarget *Subtarget) { + if (!Subtarget->hasNEON()) + return SDValue(); + + SDValue N0 = N->getOperand(0); + SDValue IfTrue = N->getOperand(1); + SDValue IfFalse = N->getOperand(2); + + // Check if the SELECT merges up the final result of the min/max + // from a vector. + if (N0.getOpcode() != ISD::EXTRACT_VECTOR_ELT || + IfTrue.getOpcode() != ISD::EXTRACT_VECTOR_ELT || + IfFalse.getOpcode() != ISD::EXTRACT_VECTOR_ELT) + return SDValue(); + + // Expect N0 is fed by SETCC. + SDValue SetCC = N0.getOperand(0); + EVT SetCCVT = SetCC.getValueType(); + if (SetCC.getOpcode() != ISD::SETCC || !SetCCVT.isVector() || + SetCCVT.getVectorElementType() != MVT::i1) + return SDValue(); + + SDValue VectorOp = SetCC.getOperand(0); + unsigned Op = VectorOp->getOpcode(); + // Check if the input vector is fed by the operator we want to handle. + if (Op != ISD::SMAX && Op != ISD::UMAX && Op != ISD::SMIN && Op != ISD::UMIN) + return SDValue(); + + EVT VTy = VectorOp.getValueType(); + if (!VTy.isVector()) + return SDValue(); + + EVT EltTy = VTy.getVectorElementType(); + if (EltTy != MVT::i32 && EltTy != MVT::i16 && EltTy != MVT::i8) + return SDValue(); + + // Check if extracting from the same vector. + // For example, + // %sc = setcc %vector, %svn1, gt + // %n0 = extract_vector_elt %sc, #0 + // %n1 = extract_vector_elt %vector, #0 + // %n2 = extract_vector_elt $vector, #1 + if (!(VectorOp == IfTrue->getOperand(0) && + VectorOp == IfFalse->getOperand(0))) + return SDValue(); + + // Check if the condition code is matched with the operator type. + ISD::CondCode CC = cast(SetCC->getOperand(2))->get(); + if ((Op == ISD::SMAX && CC != ISD::SETGT && CC != ISD::SETGE) || + (Op == ISD::UMAX && CC != ISD::SETUGT && CC != ISD::SETUGE) || + (Op == ISD::SMIN && CC != ISD::SETLT && CC != ISD::SETLE) || + (Op == ISD::UMIN && CC != ISD::SETULT && CC != ISD::SETULE)) + return SDValue(); + + // Expect to check only lane 0 from the vector SETCC. + if (!isa(N0.getOperand(1)) || + cast(N0.getOperand(1))->getZExtValue() != 0) + return SDValue(); + + // Expect to extract the true value from lane 0. + if (!isa(IfTrue.getOperand(1)) || + cast(IfTrue.getOperand(1))->getZExtValue() != 0) + return SDValue(); + + // Expect to extract the false value from lane 1. + if (!isa(IfFalse.getOperand(1)) || + cast(IfFalse.getOperand(1))->getZExtValue() != 1) + return SDValue(); + + return tryMatchAcrossLaneShuffleForReduction(N, SetCC, Op, DAG); +} + +/// Target-specific DAG combine for the across vector add reduction. +/// This function specifically handles the final clean-up step of the vector +/// add reduction produced by the LoopVectorizer. It is the log2-shuffle +/// pattern, which adds all elements of a vector together. +/// For example, for a <4 x i32> vector : +/// %1 = vector_shuffle %0, <2,3,u,u> +/// %2 = add %0, %1 +/// %3 = vector_shuffle %2, <1,u,u,u> +/// %4 = add %2, %3 +/// %result = extract_vector_elt %4, 0 +/// becomes : +/// %0 = uaddv %0 +/// %result = extract_vector_elt %0, 0 +static SDValue +performAcrossLaneAddReductionCombine(SDNode *N, SelectionDAG &DAG, + const AArch64Subtarget *Subtarget) { + if (!Subtarget->hasNEON()) + return SDValue(); + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + + // Check if the input vector is fed by the ADD. + if (N0->getOpcode() != ISD::ADD) + return SDValue(); + + // The vector extract idx must constant zero because we only expect the final + // result of the reduction is placed in lane 0. + if (!isa(N1) || cast(N1)->getZExtValue() != 0) + return SDValue(); + + EVT VTy = N0.getValueType(); + if (!VTy.isVector()) + return SDValue(); + + EVT EltTy = VTy.getVectorElementType(); + if (EltTy != MVT::i32 && EltTy != MVT::i16 && EltTy != MVT::i8) + return SDValue(); + + return tryMatchAcrossLaneShuffleForReduction(N, N0, ISD::ADD, DAG); } /// Target-specific DAG combine function for NEON load/store intrinsics @@ -9259,8 +9392,12 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N, return performBitcastCombine(N, DCI, DAG); case ISD::CONCAT_VECTORS: return performConcatVectorsCombine(N, DCI, DAG); - case ISD::SELECT: - return performSelectCombine(N, DCI); + case ISD::SELECT: { + SDValue RV = performSelectCombine(N, DCI); + if (!RV.getNode()) + RV = performAcrossLaneMinMaxReductionCombine(N, DAG, Subtarget); + return RV; + } case ISD::VSELECT: return performVSelectCombine(N, DCI.DAG); case ISD::STORE: @@ -9276,7 +9413,7 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N, case ISD::INSERT_VECTOR_ELT: return performPostLD1Combine(N, DCI, true); case ISD::EXTRACT_VECTOR_ELT: - return performAcrossLaneReductionCombine(N, DAG, Subtarget); + return performAcrossLaneAddReductionCombine(N, DAG, Subtarget); case ISD::INTRINSIC_VOID: case ISD::INTRINSIC_W_CHAIN: switch (cast(N->getOperand(1))->getZExtValue()) { diff --git a/test/CodeGen/AArch64/aarch64-addv.ll b/test/CodeGen/AArch64/aarch64-addv.ll index a5bb6351749..be8430e8ba3 100644 --- a/test/CodeGen/AArch64/aarch64-addv.ll +++ b/test/CodeGen/AArch64/aarch64-addv.ll @@ -1,7 +1,7 @@ ; RUN: llc -march=aarch64 -aarch64-neon-syntax=generic < %s | FileCheck %s -define i8 @f_v16i8(<16 x i8>* %arr) { -; CHECK-LABEL: f_v16i8 +define i8 @add_B(<16 x i8>* %arr) { +; CHECK-LABEL: add_B ; CHECK: addv {{b[0-9]+}}, {{v[0-9]+}}.16b %bin.rdx = load <16 x i8>, <16 x i8>* %arr %rdx.shuf0 = shufflevector <16 x i8> %bin.rdx, <16 x i8> undef, <16 x i32> @@ -16,8 +16,8 @@ define i8 @f_v16i8(<16 x i8>* %arr) { ret i8 %r } -define i16 @f_v8i16(<8 x i16>* %arr) { -; CHECK-LABEL: f_v8i16 +define i16 @add_H(<8 x i16>* %arr) { +; CHECK-LABEL: add_H ; CHECK: addv {{h[0-9]+}}, {{v[0-9]+}}.8h %bin.rdx = load <8 x i16>, <8 x i16>* %arr %rdx.shuf = shufflevector <8 x i16> %bin.rdx, <8 x i16> undef, <8 x i32> @@ -30,8 +30,8 @@ define i16 @f_v8i16(<8 x i16>* %arr) { ret i16 %r } -define i32 @f_v4i32( <4 x i32>* %arr) { -; CHECK-LABEL: f_v4i32 +define i32 @add_S( <4 x i32>* %arr) { +; CHECK-LABEL: add_S ; CHECK: addv {{s[0-9]+}}, {{v[0-9]+}}.4s %bin.rdx = load <4 x i32>, <4 x i32>* %arr %rdx.shuf = shufflevector <4 x i32> %bin.rdx, <4 x i32> undef, <4 x i32> @@ -42,8 +42,8 @@ define i32 @f_v4i32( <4 x i32>* %arr) { ret i32 %r } -define i64 @f_v2i64(<2 x i64>* %arr) { -; CHECK-LABEL: f_v2i64 +define i64 @add_D(<2 x i64>* %arr) { +; CHECK-LABEL: add_D ; CHECK-NOT: addv %bin.rdx = load <2 x i64>, <2 x i64>* %arr %rdx.shuf0 = shufflevector <2 x i64> %bin.rdx, <2 x i64> undef, <2 x i32> diff --git a/test/CodeGen/AArch64/aarch64-minmaxv.ll b/test/CodeGen/AArch64/aarch64-minmaxv.ll new file mode 100644 index 00000000000..42c76e443e5 --- /dev/null +++ b/test/CodeGen/AArch64/aarch64-minmaxv.ll @@ -0,0 +1,287 @@ +; RUN: llc -march=aarch64 -aarch64-neon-syntax=generic < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" +target triple = "aarch64-linu--gnu" + +; CHECK-LABEL: smax_B +; CHECK: smaxv {{b[0-9]+}}, {{v[0-9]+}}.16b +define i8 @smax_B(<16 x i8>* nocapture readonly %arr) { + %arr.load = load <16 x i8>, <16 x i8>* %arr + %rdx.shuf = shufflevector <16 x i8> %arr.load, <16 x i8> undef, <16 x i32> + %rdx.minmax.cmp22 = icmp sgt <16 x i8> %arr.load, %rdx.shuf + %rdx.minmax.select23 = select <16 x i1> %rdx.minmax.cmp22, <16 x i8> %arr.load, <16 x i8> %rdx.shuf + %rdx.shuf24 = shufflevector <16 x i8> %rdx.minmax.select23, <16 x i8> undef, <16 x i32> + %rdx.minmax.cmp25 = icmp sgt <16 x i8> %rdx.minmax.select23, %rdx.shuf24 + %rdx.minmax.select26 = select <16 x i1> %rdx.minmax.cmp25, <16 x i8> %rdx.minmax.select23, <16 x i8> %rdx.shuf24 + %rdx.shuf27 = shufflevector <16 x i8> %rdx.minmax.select26, <16 x i8> undef, <16 x i32> + %rdx.minmax.cmp28 = icmp sgt <16 x i8> %rdx.minmax.select26, %rdx.shuf27 + %rdx.minmax.select29 = select <16 x i1> %rdx.minmax.cmp28, <16 x i8> %rdx.minmax.select26, <16 x i8> %rdx.shuf27 + %rdx.shuf30 = shufflevector <16 x i8> %rdx.minmax.select29, <16 x i8> undef, <16 x i32> + %rdx.minmax.cmp31 = icmp sgt <16 x i8> %rdx.minmax.select29, %rdx.shuf30 + %rdx.minmax.cmp31.elt = extractelement <16 x i1> %rdx.minmax.cmp31, i32 0 + %rdx.minmax.select29.elt = extractelement <16 x i8> %rdx.minmax.select29, i32 0 + %rdx.shuf30.elt = extractelement <16 x i8> %rdx.minmax.select29, i32 1 + %r = select i1 %rdx.minmax.cmp31.elt, i8 %rdx.minmax.select29.elt, i8 %rdx.shuf30.elt + ret i8 %r +} + +; CHECK-LABEL: smax_H +; CHECK: smaxv {{h[0-9]+}}, {{v[0-9]+}}.8h +define i16 @smax_H(<8 x i16>* nocapture readonly %arr) { + %rdx.minmax.select = load <8 x i16>, <8 x i16>* %arr + %rdx.shuf = shufflevector <8 x i16> %rdx.minmax.select, <8 x i16> undef, <8 x i32> + %rdx.minmax.cmp23 = icmp sgt <8 x i16> %rdx.minmax.select, %rdx.shuf + %rdx.minmax.select24 = select <8 x i1> %rdx.minmax.cmp23, <8 x i16> %rdx.minmax.select, <8 x i16> %rdx.shuf + %rdx.shuf25 = shufflevector <8 x i16> %rdx.minmax.select24, <8 x i16> undef, <8 x i32> + %rdx.minmax.cmp26 = icmp sgt <8 x i16> %rdx.minmax.select24, %rdx.shuf25 + %rdx.minmax.select27 = select <8 x i1> %rdx.minmax.cmp26, <8 x i16> %rdx.minmax.select24, <8 x i16> %rdx.shuf25 + %rdx.shuf28 = shufflevector <8 x i16> %rdx.minmax.select27, <8 x i16> undef, <8 x i32> + %rdx.minmax.cmp29 = icmp sgt <8 x i16> %rdx.minmax.select27, %rdx.shuf28 + %rdx.minmax.cmp29.elt = extractelement <8 x i1> %rdx.minmax.cmp29, i32 0 + %rdx.minmax.select27.elt = extractelement <8 x i16> %rdx.minmax.select27, i32 0 + %rdx.shuf28.elt = extractelement <8 x i16> %rdx.minmax.select27, i32 1 + %r = select i1 %rdx.minmax.cmp29.elt, i16 %rdx.minmax.select27.elt, i16 %rdx.shuf28.elt + ret i16 %r +} + +; CHECK-LABEL: smax_S +; CHECK: smaxv {{s[0-9]+}}, {{v[0-9]+}}.4s +define i32 @smax_S(<4 x i32> * nocapture readonly %arr) { + %rdx.minmax.select = load <4 x i32>, <4 x i32>* %arr + %rdx.shuf = shufflevector <4 x i32> %rdx.minmax.select, <4 x i32> undef, <4 x i32> + %rdx.minmax.cmp18 = icmp sgt <4 x i32> %rdx.minmax.select, %rdx.shuf + %rdx.minmax.select19 = select <4 x i1> %rdx.minmax.cmp18, <4 x i32> %rdx.minmax.select, <4 x i32> %rdx.shuf + %rdx.shuf20 = shufflevector <4 x i32> %rdx.minmax.select19, <4 x i32> undef, <4 x i32> + %rdx.minmax.cmp21 = icmp sgt <4 x i32> %rdx.minmax.select19, %rdx.shuf20 + %rdx.minmax.cmp21.elt = extractelement <4 x i1> %rdx.minmax.cmp21, i32 0 + %rdx.minmax.select19.elt = extractelement <4 x i32> %rdx.minmax.select19, i32 0 + %rdx.shuf20.elt = extractelement <4 x i32> %rdx.minmax.select19, i32 1 + %r = select i1 %rdx.minmax.cmp21.elt, i32 %rdx.minmax.select19.elt, i32 %rdx.shuf20.elt + ret i32 %r +} + +; CHECK-LABEL: smax_D +; CHECK-NOT: smaxv +define i64 @smax_D(<2 x i64>* nocapture readonly %arr) { + %rdx.minmax.select = load <2 x i64>, <2 x i64>* %arr + %rdx.shuf = shufflevector <2 x i64> %rdx.minmax.select, <2 x i64> undef, <2 x i32> + %rdx.minmax.cmp18 = icmp sgt <2 x i64> %rdx.minmax.select, %rdx.shuf + %rdx.minmax.cmp18.elt = extractelement <2 x i1> %rdx.minmax.cmp18, i32 0 + %rdx.minmax.select.elt = extractelement <2 x i64> %rdx.minmax.select, i32 0 + %rdx.shuf.elt = extractelement <2 x i64> %rdx.minmax.select, i32 1 + %r = select i1 %rdx.minmax.cmp18.elt, i64 %rdx.minmax.select.elt, i64 %rdx.shuf.elt + ret i64 %r +} + + +; CHECK-LABEL: umax_B +; CHECK: umaxv {{b[0-9]+}}, {{v[0-9]+}}.16b +define i8 @umax_B(<16 x i8>* nocapture readonly %arr) { + %rdx.minmax.select = load <16 x i8>, <16 x i8>* %arr + %rdx.shuf = shufflevector <16 x i8> %rdx.minmax.select, <16 x i8> undef, <16 x i32> + %rdx.minmax.cmp22 = icmp ugt <16 x i8> %rdx.minmax.select, %rdx.shuf + %rdx.minmax.select23 = select <16 x i1> %rdx.minmax.cmp22, <16 x i8> %rdx.minmax.select, <16 x i8> %rdx.shuf + %rdx.shuf24 = shufflevector <16 x i8> %rdx.minmax.select23, <16 x i8> undef, <16 x i32> + %rdx.minmax.cmp25 = icmp ugt <16 x i8> %rdx.minmax.select23, %rdx.shuf24 + %rdx.minmax.select26 = select <16 x i1> %rdx.minmax.cmp25, <16 x i8> %rdx.minmax.select23, <16 x i8> %rdx.shuf24 + %rdx.shuf27 = shufflevector <16 x i8> %rdx.minmax.select26, <16 x i8> undef, <16 x i32> + %rdx.minmax.cmp28 = icmp ugt <16 x i8> %rdx.minmax.select26, %rdx.shuf27 + %rdx.minmax.select29 = select <16 x i1> %rdx.minmax.cmp28, <16 x i8> %rdx.minmax.select26, <16 x i8> %rdx.shuf27 + %rdx.shuf30 = shufflevector <16 x i8> %rdx.minmax.select29, <16 x i8> undef, <16 x i32> + %rdx.minmax.cmp31 = icmp ugt <16 x i8> %rdx.minmax.select29, %rdx.shuf30 + %rdx.minmax.cmp31.elt = extractelement <16 x i1> %rdx.minmax.cmp31, i32 0 + %rdx.minmax.select29.elt = extractelement <16 x i8> %rdx.minmax.select29, i32 0 + %rdx.shuf30.elt = extractelement <16 x i8> %rdx.minmax.select29, i32 1 + %r = select i1 %rdx.minmax.cmp31.elt, i8 %rdx.minmax.select29.elt, i8 %rdx.shuf30.elt + ret i8 %r +} + +; CHECK-LABEL: umax_H +; CHECK: umaxv {{h[0-9]+}}, {{v[0-9]+}}.8h +define i16 @umax_H(<8 x i16>* nocapture readonly %arr) { + %rdx.minmax.select = load <8 x i16>, <8 x i16>* %arr + %rdx.shuf = shufflevector <8 x i16> %rdx.minmax.select, <8 x i16> undef, <8 x i32> + %rdx.minmax.cmp23 = icmp ugt <8 x i16> %rdx.minmax.select, %rdx.shuf + %rdx.minmax.select24 = select <8 x i1> %rdx.minmax.cmp23, <8 x i16> %rdx.minmax.select, <8 x i16> %rdx.shuf + %rdx.shuf25 = shufflevector <8 x i16> %rdx.minmax.select24, <8 x i16> undef, <8 x i32> + %rdx.minmax.cmp26 = icmp ugt <8 x i16> %rdx.minmax.select24, %rdx.shuf25 + %rdx.minmax.select27 = select <8 x i1> %rdx.minmax.cmp26, <8 x i16> %rdx.minmax.select24, <8 x i16> %rdx.shuf25 + %rdx.shuf28 = shufflevector <8 x i16> %rdx.minmax.select27, <8 x i16> undef, <8 x i32> + %rdx.minmax.cmp29 = icmp ugt <8 x i16> %rdx.minmax.select27, %rdx.shuf28 + %rdx.minmax.cmp29.elt = extractelement <8 x i1> %rdx.minmax.cmp29, i32 0 + %rdx.minmax.select27.elt = extractelement <8 x i16> %rdx.minmax.select27, i32 0 + %rdx.shuf28.elt = extractelement <8 x i16> %rdx.minmax.select27, i32 1 + %r = select i1 %rdx.minmax.cmp29.elt, i16 %rdx.minmax.select27.elt, i16 %rdx.shuf28.elt + ret i16 %r +} + +; CHECK-LABEL: umax_S +; CHECK: umaxv {{s[0-9]+}}, {{v[0-9]+}}.4s +define i32 @umax_S(<4 x i32>* nocapture readonly %arr) { + %rdx.minmax.select = load <4 x i32>, <4 x i32>* %arr + %rdx.shuf = shufflevector <4 x i32> %rdx.minmax.select, <4 x i32> undef, <4 x i32> + %rdx.minmax.cmp18 = icmp ugt <4 x i32> %rdx.minmax.select, %rdx.shuf + %rdx.minmax.select19 = select <4 x i1> %rdx.minmax.cmp18, <4 x i32> %rdx.minmax.select, <4 x i32> %rdx.shuf + %rdx.shuf20 = shufflevector <4 x i32> %rdx.minmax.select19, <4 x i32> undef, <4 x i32> + %rdx.minmax.cmp21 = icmp ugt <4 x i32> %rdx.minmax.select19, %rdx.shuf20 + %rdx.minmax.cmp21.elt = extractelement <4 x i1> %rdx.minmax.cmp21, i32 0 + %rdx.minmax.select19.elt = extractelement <4 x i32> %rdx.minmax.select19, i32 0 + %rdx.shuf20.elt = extractelement <4 x i32> %rdx.minmax.select19, i32 1 + %r = select i1 %rdx.minmax.cmp21.elt, i32 %rdx.minmax.select19.elt, i32 %rdx.shuf20.elt + ret i32 %r +} + +; CHECK-LABEL: umax_D +; CHECK-NOT: umaxv +define i64 @umax_D(<2 x i64>* nocapture readonly %arr) { + %rdx.minmax.select = load <2 x i64>, <2 x i64>* %arr + %rdx.shuf = shufflevector <2 x i64> %rdx.minmax.select, <2 x i64> undef, <2 x i32> + %rdx.minmax.cmp18 = icmp ugt <2 x i64> %rdx.minmax.select, %rdx.shuf + %rdx.minmax.cmp18.elt = extractelement <2 x i1> %rdx.minmax.cmp18, i32 0 + %rdx.minmax.select.elt = extractelement <2 x i64> %rdx.minmax.select, i32 0 + %rdx.shuf.elt = extractelement <2 x i64> %rdx.minmax.select, i32 1 + %r = select i1 %rdx.minmax.cmp18.elt, i64 %rdx.minmax.select.elt, i64 %rdx.shuf.elt + ret i64 %r +} + + +; CHECK-LABEL: smin_B +; CHECK: sminv {{b[0-9]+}}, {{v[0-9]+}}.16b +define i8 @smin_B(<16 x i8>* nocapture readonly %arr) { + %rdx.minmax.select = load <16 x i8>, <16 x i8>* %arr + %rdx.shuf = shufflevector <16 x i8> %rdx.minmax.select, <16 x i8> undef, <16 x i32> + %rdx.minmax.cmp22 = icmp slt <16 x i8> %rdx.minmax.select, %rdx.shuf + %rdx.minmax.select23 = select <16 x i1> %rdx.minmax.cmp22, <16 x i8> %rdx.minmax.select, <16 x i8> %rdx.shuf + %rdx.shuf24 = shufflevector <16 x i8> %rdx.minmax.select23, <16 x i8> undef, <16 x i32> + %rdx.minmax.cmp25 = icmp slt <16 x i8> %rdx.minmax.select23, %rdx.shuf24 + %rdx.minmax.select26 = select <16 x i1> %rdx.minmax.cmp25, <16 x i8> %rdx.minmax.select23, <16 x i8> %rdx.shuf24 + %rdx.shuf27 = shufflevector <16 x i8> %rdx.minmax.select26, <16 x i8> undef, <16 x i32> + %rdx.minmax.cmp28 = icmp slt <16 x i8> %rdx.minmax.select26, %rdx.shuf27 + %rdx.minmax.select29 = select <16 x i1> %rdx.minmax.cmp28, <16 x i8> %rdx.minmax.select26, <16 x i8> %rdx.shuf27 + %rdx.shuf30 = shufflevector <16 x i8> %rdx.minmax.select29, <16 x i8> undef, <16 x i32> + %rdx.minmax.cmp31 = icmp slt <16 x i8> %rdx.minmax.select29, %rdx.shuf30 + %rdx.minmax.cmp31.elt = extractelement <16 x i1> %rdx.minmax.cmp31, i32 0 + %rdx.minmax.select29.elt = extractelement <16 x i8> %rdx.minmax.select29, i32 0 + %rdx.shuf30.elt = extractelement <16 x i8> %rdx.minmax.select29, i32 1 + %r = select i1 %rdx.minmax.cmp31.elt, i8 %rdx.minmax.select29.elt, i8 %rdx.shuf30.elt + ret i8 %r +} + +; CHECK-LABEL: smin_H +; CHECK: sminv {{h[0-9]+}}, {{v[0-9]+}}.8h +define i16 @smin_H(<8 x i16>* nocapture readonly %arr) { + %rdx.minmax.select = load <8 x i16>, <8 x i16>* %arr + %rdx.shuf = shufflevector <8 x i16> %rdx.minmax.select, <8 x i16> undef, <8 x i32> + %rdx.minmax.cmp23 = icmp slt <8 x i16> %rdx.minmax.select, %rdx.shuf + %rdx.minmax.select24 = select <8 x i1> %rdx.minmax.cmp23, <8 x i16> %rdx.minmax.select, <8 x i16> %rdx.shuf + %rdx.shuf25 = shufflevector <8 x i16> %rdx.minmax.select24, <8 x i16> undef, <8 x i32> + %rdx.minmax.cmp26 = icmp slt <8 x i16> %rdx.minmax.select24, %rdx.shuf25 + %rdx.minmax.select27 = select <8 x i1> %rdx.minmax.cmp26, <8 x i16> %rdx.minmax.select24, <8 x i16> %rdx.shuf25 + %rdx.shuf28 = shufflevector <8 x i16> %rdx.minmax.select27, <8 x i16> undef, <8 x i32> + %rdx.minmax.cmp29 = icmp slt <8 x i16> %rdx.minmax.select27, %rdx.shuf28 + %rdx.minmax.cmp29.elt = extractelement <8 x i1> %rdx.minmax.cmp29, i32 0 + %rdx.minmax.select27.elt = extractelement <8 x i16> %rdx.minmax.select27, i32 0 + %rdx.shuf28.elt = extractelement <8 x i16> %rdx.minmax.select27, i32 1 + %r = select i1 %rdx.minmax.cmp29.elt, i16 %rdx.minmax.select27.elt, i16 %rdx.shuf28.elt + ret i16 %r +} + +; CHECK-LABEL: smin_S +; CHECK: sminv {{s[0-9]+}}, {{v[0-9]+}}.4s +define i32 @smin_S(<4 x i32>* nocapture readonly %arr) { + %rdx.minmax.select = load <4 x i32>, <4 x i32>* %arr + %rdx.shuf = shufflevector <4 x i32> %rdx.minmax.select, <4 x i32> undef, <4 x i32> + %rdx.minmax.cmp18 = icmp slt <4 x i32> %rdx.minmax.select, %rdx.shuf + %rdx.minmax.select19 = select <4 x i1> %rdx.minmax.cmp18, <4 x i32> %rdx.minmax.select, <4 x i32> %rdx.shuf + %rdx.shuf20 = shufflevector <4 x i32> %rdx.minmax.select19, <4 x i32> undef, <4 x i32> + %rdx.minmax.cmp21 = icmp slt <4 x i32> %rdx.minmax.select19, %rdx.shuf20 + %rdx.minmax.cmp21.elt = extractelement <4 x i1> %rdx.minmax.cmp21, i32 0 + %rdx.minmax.select19.elt = extractelement <4 x i32> %rdx.minmax.select19, i32 0 + %rdx.shuf20.elt = extractelement <4 x i32> %rdx.minmax.select19, i32 1 + %r = select i1 %rdx.minmax.cmp21.elt, i32 %rdx.minmax.select19.elt, i32 %rdx.shuf20.elt + ret i32 %r +} + +; CHECK-LABEL: smin_D +; CHECK-NOT: sminv +define i64 @smin_D(<2 x i64>* nocapture readonly %arr) { + %rdx.minmax.select = load <2 x i64>, <2 x i64>* %arr + %rdx.shuf = shufflevector <2 x i64> %rdx.minmax.select, <2 x i64> undef, <2 x i32> + %rdx.minmax.cmp18 = icmp slt <2 x i64> %rdx.minmax.select, %rdx.shuf + %rdx.minmax.cmp18.elt = extractelement <2 x i1> %rdx.minmax.cmp18, i32 0 + %rdx.minmax.select.elt = extractelement <2 x i64> %rdx.minmax.select, i32 0 + %rdx.shuf.elt = extractelement <2 x i64> %rdx.minmax.select, i32 1 + %r = select i1 %rdx.minmax.cmp18.elt, i64 %rdx.minmax.select.elt, i64 %rdx.shuf.elt + ret i64 %r +} + + +; CHECK-LABEL: umin_B +; CHECK: uminv {{b[0-9]+}}, {{v[0-9]+}}.16b +define i8 @umin_B(<16 x i8>* nocapture readonly %arr) { + %rdx.minmax.select = load <16 x i8>, <16 x i8>* %arr + %rdx.shuf = shufflevector <16 x i8> %rdx.minmax.select, <16 x i8> undef, <16 x i32> + %rdx.minmax.cmp22 = icmp ult <16 x i8> %rdx.minmax.select, %rdx.shuf + %rdx.minmax.select23 = select <16 x i1> %rdx.minmax.cmp22, <16 x i8> %rdx.minmax.select, <16 x i8> %rdx.shuf + %rdx.shuf24 = shufflevector <16 x i8> %rdx.minmax.select23, <16 x i8> undef, <16 x i32> + %rdx.minmax.cmp25 = icmp ult <16 x i8> %rdx.minmax.select23, %rdx.shuf24 + %rdx.minmax.select26 = select <16 x i1> %rdx.minmax.cmp25, <16 x i8> %rdx.minmax.select23, <16 x i8> %rdx.shuf24 + %rdx.shuf27 = shufflevector <16 x i8> %rdx.minmax.select26, <16 x i8> undef, <16 x i32> + %rdx.minmax.cmp28 = icmp ult <16 x i8> %rdx.minmax.select26, %rdx.shuf27 + %rdx.minmax.select29 = select <16 x i1> %rdx.minmax.cmp28, <16 x i8> %rdx.minmax.select26, <16 x i8> %rdx.shuf27 + %rdx.shuf30 = shufflevector <16 x i8> %rdx.minmax.select29, <16 x i8> undef, <16 x i32> + %rdx.minmax.cmp31 = icmp ult <16 x i8> %rdx.minmax.select29, %rdx.shuf30 + %rdx.minmax.cmp31.elt = extractelement <16 x i1> %rdx.minmax.cmp31, i32 0 + %rdx.minmax.select29.elt = extractelement <16 x i8> %rdx.minmax.select29, i32 0 + %rdx.shuf30.elt = extractelement <16 x i8> %rdx.minmax.select29, i32 1 + %r = select i1 %rdx.minmax.cmp31.elt, i8 %rdx.minmax.select29.elt, i8 %rdx.shuf30.elt + ret i8 %r +} + +; CHECK-LABEL: umin_H +; CHECK: uminv {{h[0-9]+}}, {{v[0-9]+}}.8h +define i16 @umin_H(<8 x i16>* nocapture readonly %arr) { + %rdx.minmax.select = load <8 x i16>, <8 x i16>* %arr + %rdx.shuf = shufflevector <8 x i16> %rdx.minmax.select, <8 x i16> undef, <8 x i32> + %rdx.minmax.cmp23 = icmp ult <8 x i16> %rdx.minmax.select, %rdx.shuf + %rdx.minmax.select24 = select <8 x i1> %rdx.minmax.cmp23, <8 x i16> %rdx.minmax.select, <8 x i16> %rdx.shuf + %rdx.shuf25 = shufflevector <8 x i16> %rdx.minmax.select24, <8 x i16> undef, <8 x i32> + %rdx.minmax.cmp26 = icmp ult <8 x i16> %rdx.minmax.select24, %rdx.shuf25 + %rdx.minmax.select27 = select <8 x i1> %rdx.minmax.cmp26, <8 x i16> %rdx.minmax.select24, <8 x i16> %rdx.shuf25 + %rdx.shuf28 = shufflevector <8 x i16> %rdx.minmax.select27, <8 x i16> undef, <8 x i32> + %rdx.minmax.cmp29 = icmp ult <8 x i16> %rdx.minmax.select27, %rdx.shuf28 + %rdx.minmax.cmp29.elt = extractelement <8 x i1> %rdx.minmax.cmp29, i32 0 + %rdx.minmax.select27.elt = extractelement <8 x i16> %rdx.minmax.select27, i32 0 + %rdx.shuf28.elt = extractelement <8 x i16> %rdx.minmax.select27, i32 1 + %r = select i1 %rdx.minmax.cmp29.elt, i16 %rdx.minmax.select27.elt, i16 %rdx.shuf28.elt + ret i16 %r +} + +; CHECK-LABEL: umin_S +; CHECK: uminv {{s[0-9]+}}, {{v[0-9]+}}.4s +define i32 @umin_S(<4 x i32>* nocapture readonly %arr) { + %rdx.minmax.select = load <4 x i32>, <4 x i32>* %arr + %rdx.shuf = shufflevector <4 x i32> %rdx.minmax.select, <4 x i32> undef, <4 x i32> + %rdx.minmax.cmp18 = icmp ult <4 x i32> %rdx.minmax.select, %rdx.shuf + %rdx.minmax.select19 = select <4 x i1> %rdx.minmax.cmp18, <4 x i32> %rdx.minmax.select, <4 x i32> %rdx.shuf + %rdx.shuf20 = shufflevector <4 x i32> %rdx.minmax.select19, <4 x i32> undef, <4 x i32> + %rdx.minmax.cmp21 = icmp ult <4 x i32> %rdx.minmax.select19, %rdx.shuf20 + %rdx.minmax.cmp21.elt = extractelement <4 x i1> %rdx.minmax.cmp21, i32 0 + %rdx.minmax.select19.elt = extractelement <4 x i32> %rdx.minmax.select19, i32 0 + %rdx.shuf20.elt = extractelement <4 x i32> %rdx.minmax.select19, i32 1 + %r = select i1 %rdx.minmax.cmp21.elt, i32 %rdx.minmax.select19.elt, i32 %rdx.shuf20.elt + ret i32 %r +} + +; CHECK-LABEL: umin_D +; CHECK-NOT: uminv +define i64 @umin_D(<2 x i64>* nocapture readonly %arr) { + %rdx.minmax.select = load <2 x i64>, <2 x i64>* %arr + %rdx.shuf = shufflevector <2 x i64> %rdx.minmax.select, <2 x i64> undef, <2 x i32> + %rdx.minmax.cmp18 = icmp ult <2 x i64> %rdx.minmax.select, %rdx.shuf + %rdx.minmax.cmp18.elt = extractelement <2 x i1> %rdx.minmax.cmp18, i32 0 + %rdx.minmax.select.elt = extractelement <2 x i64> %rdx.minmax.select, i32 0 + %rdx.shuf.elt = extractelement <2 x i64> %rdx.minmax.select, i32 1 + %r = select i1 %rdx.minmax.cmp18.elt, i64 %rdx.minmax.select.elt, i64 %rdx.shuf.elt + ret i64 %r +}