From: Ulrich Weigand Date: Tue, 31 Mar 2015 12:56:33 +0000 (+0000) Subject: [SystemZ] Use POPCNT instruction on z196 X-Git-Url: http://demsky.eecs.uci.edu/git/?a=commitdiff_plain;h=ee849734207320ee844a80c065ba2848561694e0;p=oota-llvm.git [SystemZ] Use POPCNT instruction on z196 We already exploit a number of instructions specific to z196, but not yet POPCNT. Add support for the population-count facility, MC support for the POPCNT instruction, CodeGen support for using POPCNT, and implement the getPopcntSupport TargetTransformInfo hook. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@233689 91177308-0d34-0410-b5e6-96231b3b80d8 --- diff --git a/lib/Target/SystemZ/SystemZISelLowering.cpp b/lib/Target/SystemZ/SystemZISelLowering.cpp index 18d4c02e779..e0cb376d11d 100644 --- a/lib/Target/SystemZ/SystemZISelLowering.cpp +++ b/lib/Target/SystemZ/SystemZISelLowering.cpp @@ -163,8 +163,13 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &tm, // available, or if the operand is constant. setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom); + // Use POPCNT on z196 and above. + if (Subtarget.hasPopulationCount()) + setOperationAction(ISD::CTPOP, VT, Custom); + else + setOperationAction(ISD::CTPOP, VT, Expand); + // No special instructions for these. - setOperationAction(ISD::CTPOP, VT, Expand); setOperationAction(ISD::CTTZ, VT, Expand); setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Expand); setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Expand); @@ -2304,6 +2309,45 @@ SDValue SystemZTargetLowering::lowerOR(SDValue Op, SelectionDAG &DAG) const { MVT::i64, HighOp, Low32); } +SDValue SystemZTargetLowering::lowerCTPOP(SDValue Op, + SelectionDAG &DAG) const { + EVT VT = Op.getValueType(); + int64_t OrigBitSize = VT.getSizeInBits(); + SDLoc DL(Op); + + // Get the known-zero mask for the operand. + Op = Op.getOperand(0); + APInt KnownZero, KnownOne; + DAG.computeKnownBits(Op, KnownZero, KnownOne); + uint64_t Mask = ~KnownZero.getZExtValue(); + + // Skip known-zero high parts of the operand. + int64_t BitSize = OrigBitSize; + while ((Mask & ((((uint64_t)1 << (BitSize / 2)) - 1) << (BitSize / 2))) == 0) + BitSize = BitSize / 2; + + // The POPCNT instruction counts the number of bits in each byte. + Op = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, Op); + Op = DAG.getNode(SystemZISD::POPCNT, DL, MVT::i64, Op); + Op = DAG.getNode(ISD::TRUNCATE, DL, VT, Op); + + // Add up per-byte counts in a binary tree. All bits of Op at + // position larger than BitSize remain zero throughout. + for (int64_t I = BitSize / 2; I >= 8; I = I / 2) { + SDValue Tmp = DAG.getNode(ISD::SHL, DL, VT, Op, DAG.getConstant(I, VT)); + if (BitSize != OrigBitSize) + Tmp = DAG.getNode(ISD::AND, DL, VT, Tmp, + DAG.getConstant(((uint64_t)1 << BitSize) - 1, VT)); + Op = DAG.getNode(ISD::ADD, DL, VT, Op, Tmp); + } + + // Extract overall result from high byte. + if (BitSize > 8) + Op = DAG.getNode(ISD::SRL, DL, VT, Op, DAG.getConstant(BitSize - 8, VT)); + + return Op; +} + // Op is an atomic load. Lower it into a normal volatile load. SDValue SystemZTargetLowering::lowerATOMIC_LOAD(SDValue Op, SelectionDAG &DAG) const { @@ -2554,6 +2598,8 @@ SDValue SystemZTargetLowering::LowerOperation(SDValue Op, return lowerUDIVREM(Op, DAG); case ISD::OR: return lowerOR(Op, DAG); + case ISD::CTPOP: + return lowerCTPOP(Op, DAG); case ISD::ATOMIC_SWAP: return lowerATOMIC_LOAD_OP(Op, DAG, SystemZISD::ATOMIC_SWAPW); case ISD::ATOMIC_STORE: diff --git a/lib/Target/SystemZ/SystemZISelLowering.h b/lib/Target/SystemZ/SystemZISelLowering.h index 15add448d19..dda4f4544ed 100644 --- a/lib/Target/SystemZ/SystemZISelLowering.h +++ b/lib/Target/SystemZ/SystemZISelLowering.h @@ -87,6 +87,9 @@ enum { // the number of the register. EXTRACT_ACCESS, + // Count number of bits set in operand 0 per byte. + POPCNT, + // Wrappers around the ISD opcodes of the same name. The output and // first input operands are GR128s. The trailing numbers are the // widths of the second operand in bits. @@ -304,6 +307,7 @@ private: SDValue lowerUDIVREM(SDValue Op, SelectionDAG &DAG) const; SDValue lowerBITCAST(SDValue Op, SelectionDAG &DAG) const; SDValue lowerOR(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerCTPOP(SDValue Op, SelectionDAG &DAG) const; SDValue lowerATOMIC_LOAD(SDValue Op, SelectionDAG &DAG) const; SDValue lowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) const; SDValue lowerATOMIC_LOAD_OP(SDValue Op, SelectionDAG &DAG, diff --git a/lib/Target/SystemZ/SystemZInstrInfo.td b/lib/Target/SystemZ/SystemZInstrInfo.td index a7f774791d4..31db301d95e 100644 --- a/lib/Target/SystemZ/SystemZInstrInfo.td +++ b/lib/Target/SystemZ/SystemZInstrInfo.td @@ -1382,6 +1382,13 @@ let Defs = [CC] in { def : Pat<(ctlz GR64:$src), (EXTRACT_SUBREG (FLOGR GR64:$src), subreg_h64)>; +// Population count. Counts bits set per byte. +let Predicates = [FeaturePopulationCount], Defs = [CC] in { + def POPCNT : InstRRE<0xB9E1, (outs GR64:$R1), (ins GR64:$R2), + "popcnt\t$R1, $R2", + [(set GR64:$R1, (z_popcnt GR64:$R2))]>; +} + // Use subregs to populate the "don't care" bits in a 32-bit to 64-bit anyext. def : Pat<(i64 (anyext GR32:$src)), (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR32:$src, subreg_l32)>; diff --git a/lib/Target/SystemZ/SystemZOperators.td b/lib/Target/SystemZ/SystemZOperators.td index 51ac5daad54..0cb476d2122 100644 --- a/lib/Target/SystemZ/SystemZOperators.td +++ b/lib/Target/SystemZ/SystemZOperators.td @@ -121,6 +121,7 @@ def z_select_ccmask : SDNode<"SystemZISD::SELECT_CCMASK", SDT_ZSelectCCMask, def z_adjdynalloc : SDNode<"SystemZISD::ADJDYNALLOC", SDT_ZAdjDynAlloc>; def z_extract_access : SDNode<"SystemZISD::EXTRACT_ACCESS", SDT_ZExtractAccess>; +def z_popcnt : SDNode<"SystemZISD::POPCNT", SDTIntUnaryOp>; def z_umul_lohi64 : SDNode<"SystemZISD::UMUL_LOHI64", SDT_ZGR128Binary64>; def z_sdivrem32 : SDNode<"SystemZISD::SDIVREM32", SDT_ZGR128Binary32>; def z_sdivrem64 : SDNode<"SystemZISD::SDIVREM64", SDT_ZGR128Binary64>; diff --git a/lib/Target/SystemZ/SystemZProcessors.td b/lib/Target/SystemZ/SystemZProcessors.td index 1594854ab2c..b8bbd5677e0 100644 --- a/lib/Target/SystemZ/SystemZProcessors.td +++ b/lib/Target/SystemZ/SystemZProcessors.td @@ -39,6 +39,11 @@ def FeatureFPExtension : SystemZFeature< "Assume that the floating-point extension facility is installed" >; +def FeaturePopulationCount : SystemZFeature< + "population-count", "PopulationCount", + "Assume that the population-count facility is installed" +>; + def FeatureFastSerialization : SystemZFeature< "fast-serialization", "FastSerialization", "Assume that the fast-serialization facility is installed" @@ -54,9 +59,9 @@ def : Processor<"generic", NoItineraries, []>; def : Processor<"z10", NoItineraries, []>; def : Processor<"z196", NoItineraries, [FeatureDistinctOps, FeatureLoadStoreOnCond, FeatureHighWord, - FeatureFPExtension, FeatureFastSerialization, - FeatureInterlockedAccess1]>; + FeatureFPExtension, FeaturePopulationCount, + FeatureFastSerialization, FeatureInterlockedAccess1]>; def : Processor<"zEC12", NoItineraries, [FeatureDistinctOps, FeatureLoadStoreOnCond, FeatureHighWord, - FeatureFPExtension, FeatureFastSerialization, - FeatureInterlockedAccess1]>; + FeatureFPExtension, FeaturePopulationCount, + FeatureFastSerialization, FeatureInterlockedAccess1]>; diff --git a/lib/Target/SystemZ/SystemZSubtarget.cpp b/lib/Target/SystemZ/SystemZSubtarget.cpp index 4b5c23cefc9..0999b45c9d6 100644 --- a/lib/Target/SystemZ/SystemZSubtarget.cpp +++ b/lib/Target/SystemZ/SystemZSubtarget.cpp @@ -38,7 +38,8 @@ SystemZSubtarget::SystemZSubtarget(const std::string &TT, const TargetMachine &TM) : SystemZGenSubtargetInfo(TT, CPU, FS), HasDistinctOps(false), HasLoadStoreOnCond(false), HasHighWord(false), HasFPExtension(false), - HasFastSerialization(false), HasInterlockedAccess1(false), + HasPopulationCount(false), HasFastSerialization(false), + HasInterlockedAccess1(false), TargetTriple(TT), InstrInfo(initializeSubtargetDependencies(CPU, FS)), TLInfo(TM, *this), TSInfo(*TM.getDataLayout()), FrameLowering() {} diff --git a/lib/Target/SystemZ/SystemZSubtarget.h b/lib/Target/SystemZ/SystemZSubtarget.h index 99cb1ad3045..b3e7a3512cd 100644 --- a/lib/Target/SystemZ/SystemZSubtarget.h +++ b/lib/Target/SystemZ/SystemZSubtarget.h @@ -38,6 +38,7 @@ protected: bool HasLoadStoreOnCond; bool HasHighWord; bool HasFPExtension; + bool HasPopulationCount; bool HasFastSerialization; bool HasInterlockedAccess1; @@ -86,6 +87,9 @@ public: // Return true if the target has the floating-point extension facility. bool hasFPExtension() const { return HasFPExtension; } + // Return true if the target has the population-count facility. + bool hasPopulationCount() const { return HasPopulationCount; } + // Return true if the target has the fast-serialization facility. bool hasFastSerialization() const { return HasFastSerialization; } diff --git a/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp b/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp index 244bd0379a6..3337f6388bd 100644 --- a/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp +++ b/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp @@ -229,3 +229,12 @@ unsigned SystemZTTIImpl::getIntImmCost(Intrinsic::ID IID, unsigned Idx, } return SystemZTTIImpl::getIntImmCost(Imm, Ty); } + +TargetTransformInfo::PopcntSupportKind +SystemZTTIImpl::getPopcntSupport(unsigned TyWidth) { + assert(isPowerOf2_32(TyWidth) && "Type width must be power of 2"); + if (ST->hasPopulationCount() && TyWidth <= 64) + return TTI::PSK_FastHardware; + return TTI::PSK_Software; +} + diff --git a/lib/Target/SystemZ/SystemZTargetTransformInfo.h b/lib/Target/SystemZ/SystemZTargetTransformInfo.h index d96515adce4..d4989130679 100644 --- a/lib/Target/SystemZ/SystemZTargetTransformInfo.h +++ b/lib/Target/SystemZ/SystemZTargetTransformInfo.h @@ -60,6 +60,8 @@ public: unsigned getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm, Type *Ty); + TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth); + /// @} }; diff --git a/test/CodeGen/SystemZ/ctpop-01.ll b/test/CodeGen/SystemZ/ctpop-01.ll new file mode 100644 index 00000000000..ad80f9f2151 --- /dev/null +++ b/test/CodeGen/SystemZ/ctpop-01.ll @@ -0,0 +1,96 @@ +; Test population-count instruction +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z196 | FileCheck %s + +declare i32 @llvm.ctpop.i32(i32 %a) +declare i64 @llvm.ctpop.i64(i64 %a) + +define i32 @f1(i32 %a) { +; CHECK-LABEL: f1: +; CHECK: popcnt %r0, %r2 +; CHECK: sllk %r1, %r0, 16 +; CHECK: ar %r1, %r0 +; CHECK: sllk %r2, %r1, 8 +; CHECK: ar %r2, %r1 +; CHECK: srl %r2, 24 +; CHECK: br %r14 + + %popcnt = call i32 @llvm.ctpop.i32(i32 %a) + ret i32 %popcnt +} + +define i32 @f2(i32 %a) { +; CHECK-LABEL: f2: +; CHECK: llhr %r0, %r2 +; CHECK: popcnt %r0, %r0 +; CHECK: risblg %r2, %r0, 16, 151, 8 +; CHECK: ar %r2, %r0 +; CHECK: srl %r2, 8 +; CHECK: br %r14 + %and = and i32 %a, 65535 + %popcnt = call i32 @llvm.ctpop.i32(i32 %and) + ret i32 %popcnt +} + +define i32 @f3(i32 %a) { +; CHECK-LABEL: f3: +; CHECK: llcr %r0, %r2 +; CHECK: popcnt %r2, %r0 +; CHECK: br %r14 + %and = and i32 %a, 255 + %popcnt = call i32 @llvm.ctpop.i32(i32 %and) + ret i32 %popcnt +} + +define i64 @f4(i64 %a) { +; CHECK-LABEL: f4: +; CHECK: popcnt %r0, %r2 +; CHECK: sllg %r1, %r0, 32 +; CHECK: agr %r1, %r0 +; CHECK: sllg %r0, %r1, 16 +; CHECK: agr %r0, %r1 +; CHECK: sllg %r1, %r0, 8 +; CHECK: agr %r1, %r0 +; CHECK: srlg %r2, %r1, 56 +; CHECK: br %r14 + %popcnt = call i64 @llvm.ctpop.i64(i64 %a) + ret i64 %popcnt +} + +define i64 @f5(i64 %a) { +; CHECK-LABEL: f5: +; CHECK: llgfr %r0, %r2 +; CHECK: popcnt %r0, %r0 +; CHECK: sllg %r1, %r0, 16 +; CHECK: algfr %r0, %r1 +; CHECK: sllg %r1, %r0, 8 +; CHECK: algfr %r0, %r1 +; CHECK: srlg %r2, %r0, 24 + %and = and i64 %a, 4294967295 + %popcnt = call i64 @llvm.ctpop.i64(i64 %and) + ret i64 %popcnt +} + +define i64 @f6(i64 %a) { +; CHECK-LABEL: f6: +; CHECK: llghr %r0, %r2 +; CHECK: popcnt %r0, %r0 +; CHECK: risbg %r1, %r0, 48, 183, 8 +; CHECK: agr %r1, %r0 +; CHECK: srlg %r2, %r1, 8 +; CHECK: br %r14 + %and = and i64 %a, 65535 + %popcnt = call i64 @llvm.ctpop.i64(i64 %and) + ret i64 %popcnt +} + +define i64 @f7(i64 %a) { +; CHECK-LABEL: f7: +; CHECK: llgcr %r0, %r2 +; CHECK: popcnt %r2, %r0 +; CHECK: br %r14 + %and = and i64 %a, 255 + %popcnt = call i64 @llvm.ctpop.i64(i64 %and) + ret i64 %popcnt +} + diff --git a/test/MC/Disassembler/SystemZ/insns.txt b/test/MC/Disassembler/SystemZ/insns.txt index 54a3c5b1d6a..df032dc53f9 100644 --- a/test/MC/Disassembler/SystemZ/insns.txt +++ b/test/MC/Disassembler/SystemZ/insns.txt @@ -6334,6 +6334,18 @@ # CHECK: pfd 15, 0 0xe3 0xf0 0x00 0x00 0x00 0x36 +# CHECK: popcnt %r0, %r0 +0xb9 0xe1 0x00 0x00 + +# CHECK: popcnt %r0, %r15 +0xb9 0xe1 0x00 0x0f + +# CHECK: popcnt %r15, %r0 +0xb9 0xe1 0x00 0xf0 + +# CHECK: popcnt %r7, %r8 +0xb9 0xe1 0x00 0x78 + # CHECK: risbg %r0, %r0, 0, 0, 0 0xec 0x00 0x00 0x00 0x00 0x55 diff --git a/test/MC/SystemZ/insn-bad.s b/test/MC/SystemZ/insn-bad.s index a08cb34da83..0410a411167 100644 --- a/test/MC/SystemZ/insn-bad.s +++ b/test/MC/SystemZ/insn-bad.s @@ -2666,6 +2666,11 @@ pfdrl 1, 1 pfdrl 1, 0x100000000 +#CHECK: error: {{(instruction requires: population-count)?}} +#CHECK: popcnt %r0, %r0 + + popcnt %r0, %r0 + #CHECK: error: invalid operand #CHECK: risbg %r0,%r0,0,0,-1 #CHECK: error: invalid operand diff --git a/test/MC/SystemZ/insn-good-z196.s b/test/MC/SystemZ/insn-good-z196.s index db5ecdd238c..36bea38f5fe 100644 --- a/test/MC/SystemZ/insn-good-z196.s +++ b/test/MC/SystemZ/insn-good-z196.s @@ -1021,6 +1021,16 @@ ork %r15,%r0,%r0 ork %r7,%r8,%r9 +#CHECK: popcnt %r0, %r0 # encoding: [0xb9,0xe1,0x00,0x00] +#CHECK: popcnt %r0, %r15 # encoding: [0xb9,0xe1,0x00,0x0f] +#CHECK: popcnt %r15, %r0 # encoding: [0xb9,0xe1,0x00,0xf0] +#CHECK: popcnt %r7, %r8 # encoding: [0xb9,0xe1,0x00,0x78] + + popcnt %r0,%r0 + popcnt %r0,%r15 + popcnt %r15,%r0 + popcnt %r7,%r8 + #CHECK: risbhg %r0, %r0, 0, 0, 0 # encoding: [0xec,0x00,0x00,0x00,0x00,0x5d] #CHECK: risbhg %r0, %r0, 0, 0, 63 # encoding: [0xec,0x00,0x00,0x00,0x3f,0x5d] #CHECK: risbhg %r0, %r0, 0, 255, 0 # encoding: [0xec,0x00,0x00,0xff,0x00,0x5d]