From b5b07c36862e772571fcbfa459fa9780615ec42b Mon Sep 17 00:00:00 2001
From: Cong Hou
Date: Tue, 24 Nov 2015 05:44:19 +0000
Subject: [PATCH] [X86][SSE] Detect AVG pattern during instruction combine for SSE2/AVX2/AVX512BW.

This patch detects the AVG pattern in vectorized code, which is simply
c = (a + b + 1) / 2, where a, b, and c have the same type which are vectors of
either unsigned i8 or unsigned i16. In the IR, i8/i16 will be promoted to
i32 before any arithmetic operations. The following IR shows such an example:

%1 = zext <N x i8> %a to <N x i32>
%2 = zext <N x i8> %b to <N x i32>
%3 = add nuw nsw <N x i32> %1, <i32 1 x N>
%4 = add nuw nsw <N x i32> %3, %2
%5 = lshr <N x i32> %4, <i32 1 x N>
%6 = trunc <N x i32> %5 to <N x i8>

and with this patch it will be converted to an X86ISD::AVG instruction.

The pattern recognition is done when combining instructions just before type
legalization during instruction selection. We do it here because after type
legalization, it is much more difficult to do pattern recognition based
on many instructions that are doing type conversions. Therefore, for
target-specific instructions (like X86ISD::AVG), we need to take care of type
legalization by ourselves. However, as X86ISD::AVG behaves similarly to
ISD::ADD, I am wondering if there is a way to legalize operands and result
types of X86ISD::AVG together with ISD::ADD. It seems that the current design
doesn't support this idea.

Tests are added for SSE2, AVX2, and AVX512BW and both i8 and i16 types of
various vector sizes.

Differential revision: http://reviews.llvm.org/D14761

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@253952 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86ISelLowering.cpp | 168 ++++++++
 lib/Target/X86/X86InstrSSE.td      |   8 +-
 lib/Target/X86/X86IntrinsicsInfo.h |   4 +
 test/CodeGen/X86/avg.ll            | 627 +++++++++++++++++++++++++++++
 4 files changed, 803 insertions(+), 4 deletions(-)
 create mode 100644 test/CodeGen/X86/avg.ll

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 22935c31aba..e06e6fbcf52 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -1779,6 +1779,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
   setTargetDAGCombine(ISD::MLOAD);
   setTargetDAGCombine(ISD::STORE);
   setTargetDAGCombine(ISD::MSTORE);
+  setTargetDAGCombine(ISD::TRUNCATE);
   setTargetDAGCombine(ISD::ZERO_EXTEND);
   setTargetDAGCombine(ISD::ANY_EXTEND);
   setTargetDAGCombine(ISD::SIGN_EXTEND);
@@ -19853,6 +19854,36 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
   switch (N->getOpcode()) {
   default:
     llvm_unreachable("Do not know how to custom type legalize this operation!");
+  case X86ISD::AVG: {
+    // Legalize X86ISD::AVG by widening both operands to a full vector register.
+    assert(Subtarget->hasSSE2() && "Requires at least SSE2!");
+
+    auto InVT = N->getValueType(0);
+    auto InVTSize = InVT.getSizeInBits();
+    const unsigned RegSize =
+        (InVTSize > 128) ? ((InVTSize > 256) ? 512 : 256) : 128;
+    assert((Subtarget->hasBWI() || RegSize < 512) &&
+           "512-bit vector requires AVX512BW");
+    assert((Subtarget->hasAVX2() || RegSize < 256) &&
+           "256-bit vector requires AVX2");
+
+    auto ElemVT = InVT.getVectorElementType();
+    auto RegVT = EVT::getVectorVT(*DAG.getContext(), ElemVT,
+                                  RegSize / ElemVT.getSizeInBits());
+    assert(RegSize % InVT.getSizeInBits() == 0);
+    unsigned NumConcat = RegSize / InVT.getSizeInBits();
+
+    SmallVector<SDValue, 16> Ops(NumConcat, DAG.getUNDEF(InVT));
+    Ops[0] = N->getOperand(0);
+    SDValue InVec0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Ops);
+    Ops[0] = N->getOperand(1);
+    SDValue InVec1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Ops);
+
+    SDValue Res = DAG.getNode(X86ISD::AVG, dl, RegVT, InVec0, InVec1);
+    Results.push_back(DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, InVT, Res,
+                                  DAG.getIntPtrConstant(0, dl)));
+    return;
+  }
   // We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32.
   case X86ISD::FMINC:
   case X86ISD::FMIN:
@@ -25347,6 +25378,132 @@ static SDValue PerformXorCombine(SDNode *N, SelectionDAG &DAG,
   return SDValue();
 }
 
+/// This function detects the AVG pattern between vectors of unsigned i8/i16,
+/// which is c = (a + b + 1) / 2, and replaces this operation with the efficient
+/// X86ISD::AVG instruction. Returns an empty SDValue if nothing is matched.
+static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG,
+                                const X86Subtarget *Subtarget, SDLoc DL) {
+  if (!Subtarget->hasSSE2() || !VT.isVector() || !VT.isSimple())
+    return SDValue();
+  EVT InVT = In.getValueType();
+  unsigned NumElems = VT.getVectorNumElements();
+
+  EVT ScalarVT = VT.getVectorElementType();
+  if (!((ScalarVT == MVT::i8 || ScalarVT == MVT::i16) &&
+        isPowerOf2_32(NumElems)))
+    return SDValue();
+
+  // InScalarVT is the intermediate type in AVG pattern and it should be greater
+  // than the original input type (i8/i16).
+  EVT InScalarVT = InVT.getVectorElementType();
+  if (InScalarVT.getSizeInBits() <= ScalarVT.getSizeInBits())
+    return SDValue();
+
+  if (Subtarget->hasBWI()) {
+    if (VT.getSizeInBits() > 512)
+      return SDValue();
+  } else if (Subtarget->hasAVX2()) {
+    if (VT.getSizeInBits() > 256)
+      return SDValue();
+  } else {
+    if (VT.getSizeInBits() > 128)
+      return SDValue();
+  }
+
+  // Detect the following pattern:
+  //
+  //   %1 = zext <N x i8> %a to <N x i32>
+  //   %2 = zext <N x i8> %b to <N x i32>
+  //   %3 = add nuw nsw <N x i32> %1, <i32 1 x N>
+  //   %4 = add nuw nsw <N x i32> %3, %2
+  //   %5 = lshr <N x i32> %4, <i32 1 x N>
+  //   %6 = trunc <N x i32> %5 to <N x i8>
+  //
+  // In AVX512, the last instruction can also be a trunc store.
+
+  if (In.getOpcode() != ISD::SRL)
+    return SDValue();
+
+  // A lambda checking the given SDValue is a constant vector and each element
+  // is in the range [Min, Max].
+  auto IsConstVectorInRange = [](SDValue V, unsigned Min, unsigned Max) {
+    BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(V);
+    if (!BV || !BV->isConstant())
+      return false;
+    for (unsigned i = 0, e = V.getNumOperands(); i < e; i++) {
+      ConstantSDNode *C = dyn_cast<ConstantSDNode>(V.getOperand(i));
+      if (!C)
+        return false;
+      uint64_t Val = C->getZExtValue();
+      if (Val < Min || Val > Max)
+        return false;
+    }
+    return true;
+  };
+
+  // Check if each element of the vector is logically right-shifted by one.
+  auto LHS = In.getOperand(0);
+  auto RHS = In.getOperand(1);
+  if (!IsConstVectorInRange(RHS, 1, 1))
+    return SDValue();
+  if (LHS.getOpcode() != ISD::ADD)
+    return SDValue();
+
+  // Detect a pattern of a + b + 1 where the order doesn't matter.
+  SDValue Operands[3];
+  Operands[0] = LHS.getOperand(0);
+  Operands[1] = LHS.getOperand(1);
+
+  // Take care of the case when one of the operands is a constant vector whose
+  // element is in the range [1, 256] (i8) or [1, 65536] (i16).
+  if (IsConstVectorInRange(Operands[1], 1, ScalarVT == MVT::i8 ? 256 : 65536) &&
+      Operands[0].getOpcode() == ISD::ZERO_EXTEND &&
+      Operands[0].getOperand(0).getValueType() == VT) {
+    // The pattern is detected. Subtract one from the constant vector, then
+    // demote it and emit X86ISD::AVG instruction.
+    SDValue One = DAG.getConstant(1, DL, InScalarVT);
+    SDValue Ones = DAG.getNode(ISD::BUILD_VECTOR, DL, InVT,
+                               SmallVector<SDValue, 8>(NumElems, One));
+    Operands[1] = DAG.getNode(ISD::SUB, DL, InVT, Operands[1], Ones);
+    Operands[1] = DAG.getNode(ISD::TRUNCATE, DL, VT, Operands[1]);
+    return DAG.getNode(X86ISD::AVG, DL, VT, Operands[0].getOperand(0),
+                       Operands[1]);
+  }
+
+  if (Operands[0].getOpcode() == ISD::ADD)
+    std::swap(Operands[0], Operands[1]);
+  else if (Operands[1].getOpcode() != ISD::ADD)
+    return SDValue();
+  Operands[2] = Operands[1].getOperand(0);
+  Operands[1] = Operands[1].getOperand(1);
+
+  // Now we have three operands of two additions. Check that one of them is a
+  // constant vector with ones, and the other two are promoted from i8/i16.
+  for (int i = 0; i < 3; ++i) {
+    if (!IsConstVectorInRange(Operands[i], 1, 1))
+      continue;
+    std::swap(Operands[i], Operands[2]);
+
+    // Check if Operands[0] and Operands[1] are results of type promotion.
+    for (int j = 0; j < 2; ++j)
+      if (Operands[j].getOpcode() != ISD::ZERO_EXTEND ||
+          Operands[j].getOperand(0).getValueType() != VT)
+        return SDValue();
+
+    // The pattern is detected, emit X86ISD::AVG instruction.
+    return DAG.getNode(X86ISD::AVG, DL, VT, Operands[0].getOperand(0),
+                       Operands[1].getOperand(0));
+  }
+
+  return SDValue();
+}
+
+static SDValue PerformTRUNCATECombine(SDNode *N, SelectionDAG &DAG,
+                                      const X86Subtarget *Subtarget) {
+  return detectAVGPattern(N->getOperand(0), N->getValueType(0), DAG, Subtarget,
+                          SDLoc(N));
+}
+
 /// PerformLOADCombine - Do target-specific dag combines on LOAD nodes.
 static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG,
                                   TargetLowering::DAGCombinerInfo &DCI,
@@ -25611,6 +25768,16 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
   // First, pack all of the elements in one place. Next, store to memory
   // in fewer chunks.
   if (St->isTruncatingStore() && VT.isVector()) {
+    // Check if we can detect an AVG pattern from the truncation. If yes,
+    // replace the trunc store by a normal store with the result of X86ISD::AVG
+    // instruction.
+    SDValue Avg =
+        detectAVGPattern(St->getValue(), St->getMemoryVT(), DAG, Subtarget, dl);
+    if (Avg.getNode())
+      return DAG.getStore(St->getChain(), dl, Avg, St->getBasePtr(),
+                          St->getPointerInfo(), St->isVolatile(),
+                          St->isNonTemporal(), St->getAlignment());
+
     const TargetLowering &TLI = DAG.getTargetLoweringInfo();
     unsigned NumElems = VT.getVectorNumElements();
     assert(StVT != VT && "Cannot truncate to the same type");
@@ -26873,6 +27040,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
   case ISD::UINT_TO_FP: return PerformUINT_TO_FPCombine(N, DAG, Subtarget);
   case ISD::FADD: return PerformFADDCombine(N, DAG, Subtarget);
   case ISD::FSUB: return PerformFSUBCombine(N, DAG, Subtarget);
+  case ISD::TRUNCATE: return PerformTRUNCATECombine(N, DAG, Subtarget);
   case X86ISD::FXOR:
   case X86ISD::FOR: return PerformFORCombine(N, DAG, Subtarget);
   case X86ISD::FMIN:
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td
index fd829e1b189..dd3ab69fc05 100644
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -4046,6 +4046,10 @@ defm PMAXUB : PDI_binop_all<0xDE, "pmaxub", umax, v16i8, v32i8,
                             SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>;
 defm PMAXSW : PDI_binop_all<0xEE, "pmaxsw", smax, v8i16, v16i16,
                             SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>;
+defm PAVGB : PDI_binop_all<0xE0, "pavgb", X86avg, v16i8, v32i8,
+                           SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>;
+defm PAVGW : PDI_binop_all<0xE3, "pavgw", X86avg, v8i16, v16i16,
+                           SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>;
 
 // Intrinsic forms
 defm PSUBSB : PDI_binop_all_int<0xE8, "psubsb", int_x86_sse2_psubs_b,
@@ -4062,10 +4066,6 @@ defm PADDUSW : PDI_binop_all_int<0xDD, "paddusw", int_x86_sse2_paddus_w,
                                  int_x86_avx2_paddus_w, SSE_INTALU_ITINS_P, 1>;
 defm PMADDWD : PDI_binop_all_int<0xF5, "pmaddwd", int_x86_sse2_pmadd_wd,
                                  int_x86_avx2_pmadd_wd, SSE_PMADD, 1>;
-defm PAVGB : PDI_binop_all_int<0xE0, "pavgb", int_x86_sse2_pavg_b,
-                               int_x86_avx2_pavg_b, SSE_INTALU_ITINS_P, 1>;
-defm PAVGW : PDI_binop_all_int<0xE3, "pavgw", int_x86_sse2_pavg_w,
-                               int_x86_avx2_pavg_w, SSE_INTALU_ITINS_P, 1>;
 defm PSADBW : PDI_binop_all_int<0xF6, "psadbw", int_x86_sse2_psad_bw,
                                 int_x86_avx2_psad_bw, SSE_PMADD, 1>;
 
diff --git a/lib/Target/X86/X86IntrinsicsInfo.h b/lib/Target/X86/X86IntrinsicsInfo.h
index 4bdb5b9146e..0b77d480bc0 100644
--- a/lib/Target/X86/X86IntrinsicsInfo.h
+++ b/lib/Target/X86/X86IntrinsicsInfo.h
@@ -250,6 +250,8 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
   X86_INTRINSIC_DATA(avx2_packsswb, INTR_TYPE_2OP, X86ISD::PACKSS, 0),
   X86_INTRINSIC_DATA(avx2_packusdw, INTR_TYPE_2OP, X86ISD::PACKUS, 0),
   X86_INTRINSIC_DATA(avx2_packuswb, INTR_TYPE_2OP, X86ISD::PACKUS, 0),
+  X86_INTRINSIC_DATA(avx2_pavg_b, INTR_TYPE_2OP, X86ISD::AVG, 0),
+  X86_INTRINSIC_DATA(avx2_pavg_w, INTR_TYPE_2OP, X86ISD::AVG, 0),
   X86_INTRINSIC_DATA(avx2_phadd_d, INTR_TYPE_2OP, X86ISD::HADD, 0),
   X86_INTRINSIC_DATA(avx2_phadd_w, INTR_TYPE_2OP, X86ISD::HADD, 0),
   X86_INTRINSIC_DATA(avx2_phsub_d, INTR_TYPE_2OP, X86ISD::HSUB, 0),
@@ -1699,6 +1701,8 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
   X86_INTRINSIC_DATA(sse2_packssdw_128, INTR_TYPE_2OP, X86ISD::PACKSS, 0),
   X86_INTRINSIC_DATA(sse2_packsswb_128, INTR_TYPE_2OP, X86ISD::PACKSS, 0),
   X86_INTRINSIC_DATA(sse2_packuswb_128, INTR_TYPE_2OP, X86ISD::PACKUS, 0),
+  X86_INTRINSIC_DATA(sse2_pavg_b, INTR_TYPE_2OP, X86ISD::AVG, 0),
+  X86_INTRINSIC_DATA(sse2_pavg_w, INTR_TYPE_2OP, X86ISD::AVG, 0),
   X86_INTRINSIC_DATA(sse2_pmaxs_w, INTR_TYPE_2OP, ISD::SMAX, 0),
   X86_INTRINSIC_DATA(sse2_pmaxu_b, INTR_TYPE_2OP, ISD::UMAX, 0),
   X86_INTRINSIC_DATA(sse2_pmins_w, INTR_TYPE_2OP, ISD::SMIN, 0),
diff --git a/test/CodeGen/X86/avg.ll b/test/CodeGen/X86/avg.ll
new file mode 100644
index 00000000000..ce2bf0fdad1
--- /dev/null
+++ 
b/test/CodeGen/X86/avg.ll @@ -0,0 +1,627 @@ +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse2 | FileCheck %s --check-prefix=SSE2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx2 | FileCheck %s --check-prefix=AVX2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx512bw | FileCheck %s --check-prefix=AVX2 --check-prefix=AVX512BW + +define void @avg_v4i8(<4 x i8>* %a, <4 x i8>* %b) { +; SSE2-LABEL: avg_v4i8 +; SSE2: # BB#0: +; SSE2-NEXT: movd (%rdi), %xmm0 # xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: movd (%rsi), %xmm1 # xmm1 = mem[0],zero,zero,zero +; SSE2-NEXT: pavgb %xmm0, %xmm1 +; SSE2-NEXT: movd %xmm1, (%rax) +; SSE2-NEXT: retq +; +; AVX2-LABEL: avg_v4i8 +; AVX2: # BB#0: +; AVX2-NEXT: vmovd (%rdi), %xmm0 +; AVX2-NEXT: vmovd (%rsi), %xmm1 +; AVX2-NEXT: vpavgb %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vmovd %xmm0, (%rax) +; AVX2-NEXT: retq +; + %1 = load <4 x i8>, <4 x i8>* %a + %2 = load <4 x i8>, <4 x i8>* %b + %3 = zext <4 x i8> %1 to <4 x i32> + %4 = zext <4 x i8> %2 to <4 x i32> + %5 = add nuw nsw <4 x i32> %3, + %6 = add nuw nsw <4 x i32> %5, %4 + %7 = lshr <4 x i32> %6, + %8 = trunc <4 x i32> %7 to <4 x i8> + store <4 x i8> %8, <4 x i8>* undef, align 4 + ret void +} + +define void @avg_v8i8(<8 x i8>* %a, <8 x i8>* %b) { +; SSE2-LABEL: avg_v8i8 +; SSE2: # BB#0: +; SSE2-NEXT: movq (%rdi), %xmm0 # xmm0 = mem[0],zero +; SSE2-NEXT: movq (%rsi), %xmm1 # xmm1 = mem[0],zero +; SSE2-NEXT: pavgb %xmm0, %xmm1 +; SSE2-NEXT: movq %xmm1, (%rax) +; SSE2-NEXT: retq +; +; AVX2-LABEL: avg_v8i8 +; AVX2: # BB#0: +; AVX2-NEXT: vmovq (%rdi), %xmm0 +; AVX2-NEXT: vmovq (%rsi), %xmm1 +; AVX2-NEXT: vpavgb %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vmovq %xmm0, (%rax) +; AVX2-NEXT: retq +; + %1 = load <8 x i8>, <8 x i8>* %a + %2 = load <8 x i8>, <8 x i8>* %b + %3 = zext <8 x i8> %1 to <8 x i32> + %4 = zext <8 x i8> %2 to <8 x i32> + %5 = add nuw nsw <8 x i32> %3, + %6 = add nuw nsw <8 x i32> %5, %4 + %7 = lshr <8 x i32> %6, + %8 
= trunc <8 x i32> %7 to <8 x i8> + store <8 x i8> %8, <8 x i8>* undef, align 4 + ret void +} + +define void @avg_v16i8(<16 x i8>* %a, <16 x i8>* %b) { +; SSE2-LABEL: avg_v16i8 +; SSE2: # BB#0: +; SSE2-NEXT: movdqa (%rsi), %xmm0 +; SSE2-NEXT: pavgb (%rdi), %xmm0 +; SSE2-NEXT: movdqu %xmm0, (%rax) +; SSE2-NEXT: retq +; +; AVX2-LABEL: avg_v16i8 +; AVX2: # BB#0: +; AVX2-NEXT: vmovdqa (%rsi), %xmm0 +; AVX2-NEXT: vpavgb (%rdi), %xmm0, %xmm0 +; AVX2-NEXT: vmovdqu %xmm0, (%rax) +; AVX2-NEXT: retq +; + %1 = load <16 x i8>, <16 x i8>* %a + %2 = load <16 x i8>, <16 x i8>* %b + %3 = zext <16 x i8> %1 to <16 x i32> + %4 = zext <16 x i8> %2 to <16 x i32> + %5 = add nuw nsw <16 x i32> %3, + %6 = add nuw nsw <16 x i32> %5, %4 + %7 = lshr <16 x i32> %6, + %8 = trunc <16 x i32> %7 to <16 x i8> + store <16 x i8> %8, <16 x i8>* undef, align 4 + ret void +} + +define void @avg_v32i8(<32 x i8>* %a, <32 x i8>* %b) { +; AVX2-LABEL: avg_v32i8 +; AVX2: # BB#0: +; AVX2-NEXT: vmovdqa (%rsi), %ymm0 +; AVX2-NEXT: vpavgb (%rdi), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqu %ymm0, (%rax) +; + %1 = load <32 x i8>, <32 x i8>* %a + %2 = load <32 x i8>, <32 x i8>* %b + %3 = zext <32 x i8> %1 to <32 x i32> + %4 = zext <32 x i8> %2 to <32 x i32> + %5 = add nuw nsw <32 x i32> %3, + %6 = add nuw nsw <32 x i32> %5, %4 + %7 = lshr <32 x i32> %6, + %8 = trunc <32 x i32> %7 to <32 x i8> + store <32 x i8> %8, <32 x i8>* undef, align 4 + ret void +} + +define void @avg_v64i8(<64 x i8>* %a, <64 x i8>* %b) { +; AVX512BW-LABEL: avg_v64i8 +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vmovdqu8 (%rsi), %zmm0 +; AVX512BW-NEXT: vpavgb (%rdi), %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqu8 %zmm0, (%rax) +; AVX512BW-NEXT: retq +; + %1 = load <64 x i8>, <64 x i8>* %a + %2 = load <64 x i8>, <64 x i8>* %b + %3 = zext <64 x i8> %1 to <64 x i32> + %4 = zext <64 x i8> %2 to <64 x i32> + %5 = add nuw nsw <64 x i32> %3, + %6 = add nuw nsw <64 x i32> %5, %4 + %7 = lshr <64 x i32> %6, + %8 = trunc <64 x i32> %7 to <64 x i8> + store <64 x i8> %8, <64 x 
i8>* undef, align 4 + ret void +} + +define void @avg_v4i16(<4 x i16>* %a, <4 x i16>* %b) { +; SSE2-LABEL: avg_v4i16 +; SSE2: # BB#0: +; SSE2-NEXT: movq (%rdi), %xmm0 # xmm0 = mem[0],zero +; SSE2-NEXT: movq (%rsi), %xmm1 # xmm1 = mem[0],zero +; SSE2-NEXT: pavgw %xmm0, %xmm1 +; SSE2-NEXT: movq %xmm1, (%rax) +; SSE2-NEXT: retq +; +; AVX2-LABEL: avg_v4i16 +; AVX2: # BB#0: +; AVX2-NEXT: vmovq (%rdi), %xmm0 +; AVX2-NEXT: vmovq (%rsi), %xmm1 +; AVX2-NEXT: vpavgw %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vmovq %xmm0, (%rax) +; AVX2-NEXT: retq +; + %1 = load <4 x i16>, <4 x i16>* %a + %2 = load <4 x i16>, <4 x i16>* %b + %3 = zext <4 x i16> %1 to <4 x i32> + %4 = zext <4 x i16> %2 to <4 x i32> + %5 = add nuw nsw <4 x i32> %3, + %6 = add nuw nsw <4 x i32> %5, %4 + %7 = lshr <4 x i32> %6, + %8 = trunc <4 x i32> %7 to <4 x i16> + store <4 x i16> %8, <4 x i16>* undef, align 4 + ret void +} + +define void @avg_v8i16(<8 x i16>* %a, <8 x i16>* %b) { +; SSE2-LABEL: avg_v8i16 +; SSE2: # BB#0: +; SSE2-NEXT: movdqa (%rsi), %xmm0 +; SSE2-NEXT: pavgw (%rdi), %xmm0 +; SSE2-NEXT: movdqu %xmm0, (%rax) +; SSE2-NEXT: retq +; +; AVX2-LABEL: avg_v8i16 +; AVX2: # BB#0: +; AVX2-NEXT: vmovdqa (%rsi), %xmm0 +; AVX2-NEXT: vpavgw (%rdi), %xmm0, %xmm0 +; AVX2-NEXT: vmovdqu %xmm0, (%rax) +; AVX2-NEXT: retq +; + %1 = load <8 x i16>, <8 x i16>* %a + %2 = load <8 x i16>, <8 x i16>* %b + %3 = zext <8 x i16> %1 to <8 x i32> + %4 = zext <8 x i16> %2 to <8 x i32> + %5 = add nuw nsw <8 x i32> %3, + %6 = add nuw nsw <8 x i32> %5, %4 + %7 = lshr <8 x i32> %6, + %8 = trunc <8 x i32> %7 to <8 x i16> + store <8 x i16> %8, <8 x i16>* undef, align 4 + ret void +} + +define void @avg_v16i16(<16 x i16>* %a, <16 x i16>* %b) { +; AVX2-LABEL: avg_v16i16 +; AVX2: # BB#0: +; AVX2-NEXT: vmovdqa (%rsi), %ymm0 +; AVX2-NEXT: vpavgw (%rdi), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqu %ymm0, (%rax) +; + %1 = load <16 x i16>, <16 x i16>* %a + %2 = load <16 x i16>, <16 x i16>* %b + %3 = zext <16 x i16> %1 to <16 x i32> + %4 = zext <16 x i16> %2 
to <16 x i32> + %5 = add nuw nsw <16 x i32> %3, + %6 = add nuw nsw <16 x i32> %5, %4 + %7 = lshr <16 x i32> %6, + %8 = trunc <16 x i32> %7 to <16 x i16> + store <16 x i16> %8, <16 x i16>* undef, align 4 + ret void +} + +define void @avg_v32i16(<32 x i16>* %a, <32 x i16>* %b) { +; AVX512BW-LABEL: avg_v32i16 +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vmovdqu16 (%rsi), %zmm0 +; AVX512BW-NEXT: vpavgw (%rdi), %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqu16 %zmm0, (%rax) +; AVX512BW-NEXT: retq +; + %1 = load <32 x i16>, <32 x i16>* %a + %2 = load <32 x i16>, <32 x i16>* %b + %3 = zext <32 x i16> %1 to <32 x i32> + %4 = zext <32 x i16> %2 to <32 x i32> + %5 = add nuw nsw <32 x i32> %3, + %6 = add nuw nsw <32 x i32> %5, %4 + %7 = lshr <32 x i32> %6, + %8 = trunc <32 x i32> %7 to <32 x i16> + store <32 x i16> %8, <32 x i16>* undef, align 4 + ret void +} + +define void @avg_v4i8_2(<4 x i8>* %a, <4 x i8>* %b) { +; SSE2-LABEL: avg_v4i8_2 +; SSE2: # BB#0: +; SSE2-NEXT: movd (%rdi), %xmm0 +; SSE2-NEXT: movd (%rsi), %xmm1 +; SSE2-NEXT: pavgb %xmm0, %xmm1 +; SSE2-NEXT: movd %xmm1, (%rax) +; SSE2-NEXT: retq +; +; AVX2-LABEL: avg_v4i8_2 +; AVX2: # BB#0: +; AVX2-NEXT: vmovd (%rdi), %xmm0 +; AVX2-NEXT: vmovd (%rsi), %xmm1 +; AVX2-NEXT: vpavgb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovd %xmm0, (%rax) +; AVX2-NEXT: retq +; + %1 = load <4 x i8>, <4 x i8>* %a + %2 = load <4 x i8>, <4 x i8>* %b + %3 = zext <4 x i8> %1 to <4 x i32> + %4 = zext <4 x i8> %2 to <4 x i32> + %5 = add nuw nsw <4 x i32> %3, %4 + %6 = add nuw nsw <4 x i32> %5, + %7 = lshr <4 x i32> %6, + %8 = trunc <4 x i32> %7 to <4 x i8> + store <4 x i8> %8, <4 x i8>* undef, align 4 + ret void +} + +define void @avg_v8i8_2(<8 x i8>* %a, <8 x i8>* %b) { +; SSE2-LABEL: avg_v8i8_2 +; SSE2: # BB#0: +; SSE2-NEXT: movq (%rdi), %xmm0 # xmm0 = mem[0],zero +; SSE2-NEXT: movq (%rsi), %xmm1 # xmm1 = mem[0],zero +; SSE2-NEXT: pavgb %xmm0, %xmm1 +; SSE2-NEXT: movq %xmm1, (%rax) +; SSE2-NEXT: retq +; +; AVX2-LABEL: avg_v8i8_2 +; AVX2: # BB#0: +; AVX2-NEXT: 
vmovq (%rdi), %xmm0 +; AVX2-NEXT: vmovq (%rsi), %xmm1 +; AVX2-NEXT: vpavgb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovq %xmm0, (%rax) +; AVX2-NEXT: retq +; + %1 = load <8 x i8>, <8 x i8>* %a + %2 = load <8 x i8>, <8 x i8>* %b + %3 = zext <8 x i8> %1 to <8 x i32> + %4 = zext <8 x i8> %2 to <8 x i32> + %5 = add nuw nsw <8 x i32> %3, %4 + %6 = add nuw nsw <8 x i32> %5, + %7 = lshr <8 x i32> %6, + %8 = trunc <8 x i32> %7 to <8 x i8> + store <8 x i8> %8, <8 x i8>* undef, align 4 + ret void +} + +define void @avg_v16i8_2(<16 x i8>* %a, <16 x i8>* %b) { +; SSE2-LABEL: avg_v16i8_2 +; SSE2: # BB#0: +; SSE2-NEXT: movdqa (%rdi), %xmm0 +; SSE2-NEXT: pavgb (%rsi), %xmm0 +; SSE2-NEXT: movdqu %xmm0, (%rax) +; SSE2-NEXT: retq +; +; AVX2-LABEL: avg_v16i8_2 +; AVX2: # BB#0: +; AVX2-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-NEXT: vpavgb (%rsi), %xmm0, %xmm0 +; AVX2-NEXT: vmovdqu %xmm0, (%rax) +; AVX2-NEXT: retq +; + %1 = load <16 x i8>, <16 x i8>* %a + %2 = load <16 x i8>, <16 x i8>* %b + %3 = zext <16 x i8> %1 to <16 x i32> + %4 = zext <16 x i8> %2 to <16 x i32> + %5 = add nuw nsw <16 x i32> %3, %4 + %6 = add nuw nsw <16 x i32> %5, + %7 = lshr <16 x i32> %6, + %8 = trunc <16 x i32> %7 to <16 x i8> + store <16 x i8> %8, <16 x i8>* undef, align 4 + ret void +} + +define void @avg_v32i8_2(<32 x i8>* %a, <32 x i8>* %b) { +; AVX2-LABEL: avg_v32i8_2 +; AVX2: # BB#0: +; AVX2-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-NEXT: vpavgb (%rsi), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqu %ymm0, (%rax) +; + %1 = load <32 x i8>, <32 x i8>* %a + %2 = load <32 x i8>, <32 x i8>* %b + %3 = zext <32 x i8> %1 to <32 x i32> + %4 = zext <32 x i8> %2 to <32 x i32> + %5 = add nuw nsw <32 x i32> %3, %4 + %6 = add nuw nsw <32 x i32> %5, + %7 = lshr <32 x i32> %6, + %8 = trunc <32 x i32> %7 to <32 x i8> + store <32 x i8> %8, <32 x i8>* undef, align 4 + ret void +} + +define void @avg_v64i8_2(<64 x i8>* %a, <64 x i8>* %b) { +; AVX512BW-LABEL: avg_v64i8_2 +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vmovdqu8 (%rsi), %zmm0 +; AVX512BW-NEXT: 
vpavgb %zmm0, %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqu8 %zmm0, (%rax) +; AVX512BW-NEXT: retq +; + %1 = load <64 x i8>, <64 x i8>* %a + %2 = load <64 x i8>, <64 x i8>* %b + %3 = zext <64 x i8> %1 to <64 x i32> + %4 = zext <64 x i8> %2 to <64 x i32> + %5 = add nuw nsw <64 x i32> %4, %4 + %6 = add nuw nsw <64 x i32> %5, + %7 = lshr <64 x i32> %6, + %8 = trunc <64 x i32> %7 to <64 x i8> + store <64 x i8> %8, <64 x i8>* undef, align 4 + ret void +} + + +define void @avg_v4i16_2(<4 x i16>* %a, <4 x i16>* %b) { +; SSE2-LABEL: avg_v4i16_2 +; SSE2: # BB#0: +; SSE2-NEXT: movq (%rdi), %xmm0 # xmm0 = mem[0],zero +; SSE2-NEXT: movq (%rsi), %xmm1 # xmm1 = mem[0],zero +; SSE2-NEXT: pavgw %xmm0, %xmm1 +; SSE2-NEXT: movq %xmm1, (%rax) +; SSE2-NEXT: retq +; +; AVX2-LABEL: avg_v4i16_2 +; AVX2: # BB#0: +; AVX2-NEXT: vmovq (%rdi), %xmm0 +; AVX2-NEXT: vmovq (%rsi), %xmm1 +; AVX2-NEXT: vpavgw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovq %xmm0, (%rax) +; AVX2-NEXT: retq +; + %1 = load <4 x i16>, <4 x i16>* %a + %2 = load <4 x i16>, <4 x i16>* %b + %3 = zext <4 x i16> %1 to <4 x i32> + %4 = zext <4 x i16> %2 to <4 x i32> + %5 = add nuw nsw <4 x i32> %3, %4 + %6 = add nuw nsw <4 x i32> %5, + %7 = lshr <4 x i32> %6, + %8 = trunc <4 x i32> %7 to <4 x i16> + store <4 x i16> %8, <4 x i16>* undef, align 4 + ret void +} + +define void @avg_v8i16_2(<8 x i16>* %a, <8 x i16>* %b) { +; SSE2-LABEL: avg_v8i16_2 +; SSE2: # BB#0: +; SSE2-NEXT: movdqa (%rdi), %xmm0 +; SSE2-NEXT: pavgw (%rsi), %xmm0 +; SSE2-NEXT: movdqu %xmm0, (%rax) +; SSE2-NEXT: retq +; +; AVX2-LABEL: avg_v8i16_2 +; AVX2: # BB#0: +; AVX2-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-NEXT: vpavgw (%rsi), %xmm0, %xmm0 +; AVX2-NEXT: vmovdqu %xmm0, (%rax) +; AVX2-NEXT: retq +; + %1 = load <8 x i16>, <8 x i16>* %a + %2 = load <8 x i16>, <8 x i16>* %b + %3 = zext <8 x i16> %1 to <8 x i32> + %4 = zext <8 x i16> %2 to <8 x i32> + %5 = add nuw nsw <8 x i32> %3, %4 + %6 = add nuw nsw <8 x i32> %5, + %7 = lshr <8 x i32> %6, + %8 = trunc <8 x i32> %7 to <8 x i16> + 
store <8 x i16> %8, <8 x i16>* undef, align 4 + ret void +} + +define void @avg_v16i16_2(<16 x i16>* %a, <16 x i16>* %b) { +; AVX2-LABEL: avg_v16i16_2 +; AVX2: # BB#0: +; AVX2-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-NEXT: vpavgw (%rsi), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqu %ymm0, (%rax) +; + %1 = load <16 x i16>, <16 x i16>* %a + %2 = load <16 x i16>, <16 x i16>* %b + %3 = zext <16 x i16> %1 to <16 x i32> + %4 = zext <16 x i16> %2 to <16 x i32> + %5 = add nuw nsw <16 x i32> %3, %4 + %6 = add nuw nsw <16 x i32> %5, + %7 = lshr <16 x i32> %6, + %8 = trunc <16 x i32> %7 to <16 x i16> + store <16 x i16> %8, <16 x i16>* undef, align 4 + ret void +} + +define void @avg_v32i16_2(<32 x i16>* %a, <32 x i16>* %b) { +; AVX512BW-LABEL: avg_v32i16_2 +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vmovdqu16 (%rdi), %zmm0 +; AVX512BW-NEXT: vpavgw (%rsi), %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqu16 %zmm0, (%rax) +; AVX512BW-NEXT: retq +; + %1 = load <32 x i16>, <32 x i16>* %a + %2 = load <32 x i16>, <32 x i16>* %b + %3 = zext <32 x i16> %1 to <32 x i32> + %4 = zext <32 x i16> %2 to <32 x i32> + %5 = add nuw nsw <32 x i32> %3, %4 + %6 = add nuw nsw <32 x i32> %5, + %7 = lshr <32 x i32> %6, + %8 = trunc <32 x i32> %7 to <32 x i16> + store <32 x i16> %8, <32 x i16>* undef, align 4 + ret void +} + +define void @avg_v4i8_const(<4 x i8>* %a) { +; SSE2-LABEL: avg_v4i8_const +; SSE2: # BB#0: +; SSE2-NEXT: movd (%rdi), %xmm0 # xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: pavgb {{.*}}, %xmm0 +; SSE2-NEXT: movd %xmm0, (%rax) +; SSE2-NEXT: retq +; +; AVX2-LABEL: avg_v4i8_const +; AVX2: # BB#0: +; AVX2-NEXT: vmovd (%rdi), %xmm0 +; AVX2-NEXT: vpavgb {{.*}}, %xmm0, %xmm0 +; AVX2-NEXT: vmovd %xmm0, (%rax) +; AVX2-NEXT: retq +; + %1 = load <4 x i8>, <4 x i8>* %a + %2 = zext <4 x i8> %1 to <4 x i32> + %3 = add nuw nsw <4 x i32> %2, + %4 = lshr <4 x i32> %3, + %5 = trunc <4 x i32> %4 to <4 x i8> + store <4 x i8> %5, <4 x i8>* undef, align 4 + ret void +} + +define void @avg_v8i8_const(<8 x i8>* %a) { +; SSE2-LABEL: 
avg_v8i8_const +; SSE2: # BB#0: +; SSE2-NEXT: movq (%rdi), %xmm0 # xmm0 = mem[0],zero +; SSE2-NEXT: pavgb {{.*}}, %xmm0 +; SSE2-NEXT: movq %xmm0, (%rax) +; SSE2-NEXT: retq +; +; AVX2-LABEL: avg_v8i8_const +; AVX2: # BB#0: +; AVX2-NEXT: vmovq (%rdi), %xmm0 +; AVX2-NEXT: vpavgb {{.*}}, %xmm0, %xmm0 +; AVX2-NEXT: vmovq %xmm0, (%rax) +; AVX2-NEXT: retq +; + %1 = load <8 x i8>, <8 x i8>* %a + %2 = zext <8 x i8> %1 to <8 x i32> + %3 = add nuw nsw <8 x i32> %2, + %4 = lshr <8 x i32> %3, + %5 = trunc <8 x i32> %4 to <8 x i8> + store <8 x i8> %5, <8 x i8>* undef, align 4 + ret void +} + +define void @avg_v16i8_const(<16 x i8>* %a) { +; SSE2-LABEL: avg_v16i8_const +; SSE2: # BB#0: +; SSE2-NEXT: movdqa (%rdi), %xmm0 +; SSE2-NEXT: pavgb {{.*}}, %xmm0 +; SSE2-NEXT: movdqu %xmm0, (%rax) +; SSE2-NEXT: retq +; +; AVX2-LABEL: avg_v16i8_const +; AVX2: # BB#0: +; AVX2-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-NEXT: vpavgb {{.*}}, %xmm0, %xmm0 +; AVX2-NEXT: vmovdqu %xmm0, (%rax) +; AVX2-NEXT: retq +; + %1 = load <16 x i8>, <16 x i8>* %a + %2 = zext <16 x i8> %1 to <16 x i32> + %3 = add nuw nsw <16 x i32> %2, + %4 = lshr <16 x i32> %3, + %5 = trunc <16 x i32> %4 to <16 x i8> + store <16 x i8> %5, <16 x i8>* undef, align 4 + ret void +} + +define void @avg_v32i8_const(<32 x i8>* %a) { +; AVX2-LABEL: avg_v32i8_const +; AVX2: # BB#0: +; AVX2-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-NEXT: vpavgb {{.*}}, %ymm0, %ymm0 +; AVX2-NEXT: vmovdqu %ymm0, (%rax) +; + %1 = load <32 x i8>, <32 x i8>* %a + %2 = zext <32 x i8> %1 to <32 x i32> + %3 = add nuw nsw <32 x i32> %2, + %4 = lshr <32 x i32> %3, + %5 = trunc <32 x i32> %4 to <32 x i8> + store <32 x i8> %5, <32 x i8>* undef, align 4 + ret void +} + +define void @avg_v64i8_const(<64 x i8>* %a) { +; AVX512BW-LABEL: avg_v64i8_const +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vmovdqu8 (%rdi), %zmm0 +; AVX512BW-NEXT: vpavgb {{.*}}, %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqu8 %zmm0, (%rax) +; AVX512BW-NEXT: retq +; + %1 = load <64 x i8>, <64 x i8>* %a + %2 = zext <64 x i8> 
%1 to <64 x i32> + %3 = add nuw nsw <64 x i32> %2, + %4 = lshr <64 x i32> %3, + %5 = trunc <64 x i32> %4 to <64 x i8> + store <64 x i8> %5, <64 x i8>* undef, align 4 + ret void +} + +define void @avg_v4i16_const(<4 x i16>* %a) { +; SSE2-LABEL: avg_v4i16_const +; SSE2: # BB#0: +; SSE2-NEXT: movq (%rdi), %xmm0 +; SSE2-NEXT: pavgw {{.*}}, %xmm0 +; SSE2-NEXT: movq %xmm0, (%rax) +; SSE2-NEXT: retq +; +; AVX2-LABEL: avg_v4i16_const +; AVX2: # BB#0: +; AVX2-NEXT: vmovq (%rdi), %xmm0 +; AVX2-NEXT: vpavgw {{.*}}, %xmm0, %xmm0 +; AVX2-NEXT: vmovq %xmm0, (%rax) +; AVX2-NEXT: retq +; + %1 = load <4 x i16>, <4 x i16>* %a + %2 = zext <4 x i16> %1 to <4 x i32> + %3 = add nuw nsw <4 x i32> %2, + %4 = lshr <4 x i32> %3, + %5 = trunc <4 x i32> %4 to <4 x i16> + store <4 x i16> %5, <4 x i16>* undef, align 4 + ret void +} + +define void @avg_v8i16_const(<8 x i16>* %a) { +; SSE2-LABEL: avg_v8i16_const +; SSE2: # BB#0: +; SSE2-NEXT: movdqa (%rdi), %xmm0 +; SSE2-NEXT: pavgw {{.*}}, %xmm0 +; SSE2-NEXT: movdqu %xmm0, (%rax) +; SSE2-NEXT: retq +; +; AVX2-LABEL: avg_v8i16_const +; AVX2: # BB#0: +; AVX2-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-NEXT: vpavgw {{.*}}, %xmm0, %xmm0 +; AVX2-NEXT: vmovdqu %xmm0, (%rax) +; AVX2-NEXT: retq +; + %1 = load <8 x i16>, <8 x i16>* %a + %2 = zext <8 x i16> %1 to <8 x i32> + %3 = add nuw nsw <8 x i32> %2, + %4 = lshr <8 x i32> %3, + %5 = trunc <8 x i32> %4 to <8 x i16> + store <8 x i16> %5, <8 x i16>* undef, align 4 + ret void +} + +define void @avg_v16i16_const(<16 x i16>* %a) { +; AVX2-LABEL: avg_v16i16_const +; AVX2: # BB#0: +; AVX2-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-NEXT: vpavgw {{.*}}, %ymm0, %ymm0 +; AVX2-NEXT: vmovdqu %ymm0, (%rax) +; + %1 = load <16 x i16>, <16 x i16>* %a + %2 = zext <16 x i16> %1 to <16 x i32> + %3 = add nuw nsw <16 x i32> %2, + %4 = lshr <16 x i32> %3, + %5 = trunc <16 x i32> %4 to <16 x i16> + store <16 x i16> %5, <16 x i16>* undef, align 4 + ret void +} + +define void @avg_v32i16_const(<32 x i16>* %a) { +; AVX512BW-LABEL: 
avg_v32i16_const +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vmovdqu16 (%rdi), %zmm0 +; AVX512BW-NEXT: vpavgw {{.*}}, %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqu16 %zmm0, (%rax) +; + %1 = load <32 x i16>, <32 x i16>* %a + %2 = zext <32 x i16> %1 to <32 x i32> + %3 = add nuw nsw <32 x i32> %2, + %4 = lshr <32 x i32> %3, + %5 = trunc <32 x i32> %4 to <32 x i16> + store <32 x i16> %5, <32 x i16>* undef, align 4 + ret void +} -- 2.34.1