From ee849734207320ee844a80c065ba2848561694e0 Mon Sep 17 00:00:00 2001
From: Ulrich Weigand <ulrich.weigand@de.ibm.com>
Date: Tue, 31 Mar 2015 12:56:33 +0000
Subject: [PATCH] [SystemZ] Use POPCNT instruction on z196

We already exploit a number of instructions specific to z196,
but not yet POPCNT.  Add support for the population-count
facility, MC support for the POPCNT instruction, CodeGen
support for using POPCNT, and implement the getPopcntSupport
TargetTransformInfo hook.



git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@233689 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/SystemZ/SystemZISelLowering.cpp    | 48 +++++++++-
 lib/Target/SystemZ/SystemZISelLowering.h      |  4 +
 lib/Target/SystemZ/SystemZInstrInfo.td        |  7 ++
 lib/Target/SystemZ/SystemZOperators.td        |  1 +
 lib/Target/SystemZ/SystemZProcessors.td       | 13 ++-
 lib/Target/SystemZ/SystemZSubtarget.cpp       |  3 +-
 lib/Target/SystemZ/SystemZSubtarget.h         |  4 +
 .../SystemZ/SystemZTargetTransformInfo.cpp    |  9 ++
 .../SystemZ/SystemZTargetTransformInfo.h      |  2 +
 test/CodeGen/SystemZ/ctpop-01.ll              | 96 +++++++++++++++++++
 test/MC/Disassembler/SystemZ/insns.txt        | 12 +++
 test/MC/SystemZ/insn-bad.s                    |  5 +
 test/MC/SystemZ/insn-good-z196.s              | 10 ++
 13 files changed, 208 insertions(+), 6 deletions(-)
 create mode 100644 test/CodeGen/SystemZ/ctpop-01.ll

diff --git a/lib/Target/SystemZ/SystemZISelLowering.cpp b/lib/Target/SystemZ/SystemZISelLowering.cpp
index 18d4c02e779..e0cb376d11d 100644
--- a/lib/Target/SystemZ/SystemZISelLowering.cpp
+++ b/lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -163,8 +163,13 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &tm,
       // available, or if the operand is constant.
       setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
 
+      // Use POPCNT on z196 and above.
+      if (Subtarget.hasPopulationCount())
+        setOperationAction(ISD::CTPOP, VT, Custom);
+      else
+        setOperationAction(ISD::CTPOP, VT, Expand);
+
       // No special instructions for these.
-      setOperationAction(ISD::CTPOP,           VT, Expand);
       setOperationAction(ISD::CTTZ,            VT, Expand);
       setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Expand);
       setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Expand);
@@ -2304,6 +2309,45 @@ SDValue SystemZTargetLowering::lowerOR(SDValue Op, SelectionDAG &DAG) const {
                                    MVT::i64, HighOp, Low32);
 }
 
+SDValue SystemZTargetLowering::lowerCTPOP(SDValue Op,
+                                          SelectionDAG &DAG) const {
+  EVT VT = Op.getValueType();
+  int64_t OrigBitSize = VT.getSizeInBits();
+  SDLoc DL(Op);
+
+  // Get the known-zero mask for the operand.
+  Op = Op.getOperand(0);
+  APInt KnownZero, KnownOne;
+  DAG.computeKnownBits(Op, KnownZero, KnownOne);
+  uint64_t Mask = ~KnownZero.getZExtValue();
+
+  // Skip known-zero high parts of the operand.
+  int64_t BitSize = OrigBitSize;
+  while ((Mask & ((((uint64_t)1 << (BitSize / 2)) - 1) << (BitSize / 2))) == 0)
+    BitSize = BitSize / 2;
+
+  // The POPCNT instruction counts the number of bits in each byte.
+  Op = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, Op);
+  Op = DAG.getNode(SystemZISD::POPCNT, DL, MVT::i64, Op);
+  Op = DAG.getNode(ISD::TRUNCATE, DL, VT, Op);
+
+  // Add up per-byte counts in a binary tree.  All bits of Op at
+  // position larger than BitSize remain zero throughout.
+  for (int64_t I = BitSize / 2; I >= 8; I = I / 2) {
+    SDValue Tmp = DAG.getNode(ISD::SHL, DL, VT, Op, DAG.getConstant(I, VT));
+    if (BitSize != OrigBitSize)
+      Tmp = DAG.getNode(ISD::AND, DL, VT, Tmp,
+                        DAG.getConstant(((uint64_t)1 << BitSize) - 1, VT));
+    Op = DAG.getNode(ISD::ADD, DL, VT, Op, Tmp);
+  }
+
+  // Extract overall result from high byte.
+  if (BitSize > 8)
+    Op = DAG.getNode(ISD::SRL, DL, VT, Op, DAG.getConstant(BitSize - 8, VT));
+
+  return Op;
+}
+
 // Op is an atomic load.  Lower it into a normal volatile load.
 SDValue SystemZTargetLowering::lowerATOMIC_LOAD(SDValue Op,
                                                 SelectionDAG &DAG) const {
@@ -2554,6 +2598,8 @@ SDValue SystemZTargetLowering::LowerOperation(SDValue Op,
     return lowerUDIVREM(Op, DAG);
   case ISD::OR:
     return lowerOR(Op, DAG);
+  case ISD::CTPOP:
+    return lowerCTPOP(Op, DAG);
   case ISD::ATOMIC_SWAP:
     return lowerATOMIC_LOAD_OP(Op, DAG, SystemZISD::ATOMIC_SWAPW);
   case ISD::ATOMIC_STORE:
diff --git a/lib/Target/SystemZ/SystemZISelLowering.h b/lib/Target/SystemZ/SystemZISelLowering.h
index 15add448d19..dda4f4544ed 100644
--- a/lib/Target/SystemZ/SystemZISelLowering.h
+++ b/lib/Target/SystemZ/SystemZISelLowering.h
@@ -87,6 +87,9 @@ enum {
   // the number of the register.
   EXTRACT_ACCESS,
 
+  // Count number of bits set in operand 0 per byte.
+  POPCNT,
+
   // Wrappers around the ISD opcodes of the same name.  The output and
   // first input operands are GR128s.  The trailing numbers are the
   // widths of the second operand in bits.
@@ -304,6 +307,7 @@ private:
   SDValue lowerUDIVREM(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerBITCAST(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerOR(SDValue Op, SelectionDAG &DAG) const;
+  SDValue lowerCTPOP(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerATOMIC_LOAD(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerATOMIC_LOAD_OP(SDValue Op, SelectionDAG &DAG,
diff --git a/lib/Target/SystemZ/SystemZInstrInfo.td b/lib/Target/SystemZ/SystemZInstrInfo.td
index a7f774791d4..31db301d95e 100644
--- a/lib/Target/SystemZ/SystemZInstrInfo.td
+++ b/lib/Target/SystemZ/SystemZInstrInfo.td
@@ -1382,6 +1382,13 @@ let Defs = [CC] in {
 def : Pat<(ctlz GR64:$src),
           (EXTRACT_SUBREG (FLOGR GR64:$src), subreg_h64)>;
 
+// Population count.  Counts bits set per byte.
+let Predicates = [FeaturePopulationCount], Defs = [CC] in {
+  def POPCNT : InstRRE<0xB9E1, (outs GR64:$R1), (ins GR64:$R2),
+                       "popcnt\t$R1, $R2",
+                       [(set GR64:$R1, (z_popcnt GR64:$R2))]>;
+}
+
 // Use subregs to populate the "don't care" bits in a 32-bit to 64-bit anyext.
 def : Pat<(i64 (anyext GR32:$src)),
           (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR32:$src, subreg_l32)>;
diff --git a/lib/Target/SystemZ/SystemZOperators.td b/lib/Target/SystemZ/SystemZOperators.td
index 51ac5daad54..0cb476d2122 100644
--- a/lib/Target/SystemZ/SystemZOperators.td
+++ b/lib/Target/SystemZ/SystemZOperators.td
@@ -121,6 +121,7 @@ def z_select_ccmask     : SDNode<"SystemZISD::SELECT_CCMASK", SDT_ZSelectCCMask,
 def z_adjdynalloc       : SDNode<"SystemZISD::ADJDYNALLOC", SDT_ZAdjDynAlloc>;
 def z_extract_access    : SDNode<"SystemZISD::EXTRACT_ACCESS",
                                  SDT_ZExtractAccess>;
+def z_popcnt            : SDNode<"SystemZISD::POPCNT", SDTIntUnaryOp>;
 def z_umul_lohi64       : SDNode<"SystemZISD::UMUL_LOHI64", SDT_ZGR128Binary64>;
 def z_sdivrem32         : SDNode<"SystemZISD::SDIVREM32", SDT_ZGR128Binary32>;
 def z_sdivrem64         : SDNode<"SystemZISD::SDIVREM64", SDT_ZGR128Binary64>;
diff --git a/lib/Target/SystemZ/SystemZProcessors.td b/lib/Target/SystemZ/SystemZProcessors.td
index 1594854ab2c..b8bbd5677e0 100644
--- a/lib/Target/SystemZ/SystemZProcessors.td
+++ b/lib/Target/SystemZ/SystemZProcessors.td
@@ -39,6 +39,11 @@ def FeatureFPExtension : SystemZFeature<
   "Assume that the floating-point extension facility is installed"
 >;
 
+def FeaturePopulationCount : SystemZFeature<
+  "population-count", "PopulationCount",
+  "Assume that the population-count facility is installed"
+>;
+
 def FeatureFastSerialization : SystemZFeature<
   "fast-serialization", "FastSerialization",
   "Assume that the fast-serialization facility is installed"
@@ -54,9 +59,9 @@ def : Processor<"generic", NoItineraries, []>;
 def : Processor<"z10", NoItineraries, []>;
 def : Processor<"z196", NoItineraries,
                 [FeatureDistinctOps, FeatureLoadStoreOnCond, FeatureHighWord,
-                 FeatureFPExtension, FeatureFastSerialization,
-                 FeatureInterlockedAccess1]>;
+                 FeatureFPExtension, FeaturePopulationCount,
+                 FeatureFastSerialization, FeatureInterlockedAccess1]>;
 def : Processor<"zEC12", NoItineraries,
                 [FeatureDistinctOps, FeatureLoadStoreOnCond, FeatureHighWord,
-                 FeatureFPExtension, FeatureFastSerialization,
-                 FeatureInterlockedAccess1]>;
+                 FeatureFPExtension, FeaturePopulationCount,
+                 FeatureFastSerialization, FeatureInterlockedAccess1]>;
diff --git a/lib/Target/SystemZ/SystemZSubtarget.cpp b/lib/Target/SystemZ/SystemZSubtarget.cpp
index 4b5c23cefc9..0999b45c9d6 100644
--- a/lib/Target/SystemZ/SystemZSubtarget.cpp
+++ b/lib/Target/SystemZ/SystemZSubtarget.cpp
@@ -38,7 +38,8 @@ SystemZSubtarget::SystemZSubtarget(const std::string &TT,
                                    const TargetMachine &TM)
     : SystemZGenSubtargetInfo(TT, CPU, FS), HasDistinctOps(false),
       HasLoadStoreOnCond(false), HasHighWord(false), HasFPExtension(false),
-      HasFastSerialization(false), HasInterlockedAccess1(false),
+      HasPopulationCount(false), HasFastSerialization(false),
+      HasInterlockedAccess1(false),
       TargetTriple(TT), InstrInfo(initializeSubtargetDependencies(CPU, FS)),
       TLInfo(TM, *this), TSInfo(*TM.getDataLayout()), FrameLowering() {}
 
diff --git a/lib/Target/SystemZ/SystemZSubtarget.h b/lib/Target/SystemZ/SystemZSubtarget.h
index 99cb1ad3045..b3e7a3512cd 100644
--- a/lib/Target/SystemZ/SystemZSubtarget.h
+++ b/lib/Target/SystemZ/SystemZSubtarget.h
@@ -38,6 +38,7 @@ protected:
   bool HasLoadStoreOnCond;
   bool HasHighWord;
   bool HasFPExtension;
+  bool HasPopulationCount;
   bool HasFastSerialization;
   bool HasInterlockedAccess1;
 
@@ -86,6 +87,9 @@ public:
   // Return true if the target has the floating-point extension facility.
   bool hasFPExtension() const { return HasFPExtension; }
 
+  // Return true if the target has the population-count facility.
+  bool hasPopulationCount() const { return HasPopulationCount; }
+
   // Return true if the target has the fast-serialization facility.
   bool hasFastSerialization() const { return HasFastSerialization; }
 
diff --git a/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp b/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
index 244bd0379a6..3337f6388bd 100644
--- a/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
+++ b/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
@@ -229,3 +229,12 @@ unsigned SystemZTTIImpl::getIntImmCost(Intrinsic::ID IID, unsigned Idx,
   }
   return SystemZTTIImpl::getIntImmCost(Imm, Ty);
 }
+
+TargetTransformInfo::PopcntSupportKind
+SystemZTTIImpl::getPopcntSupport(unsigned TyWidth) {
+  assert(isPowerOf2_32(TyWidth) && "Type width must be power of 2");
+  if (ST->hasPopulationCount() && TyWidth <= 64)
+    return TTI::PSK_FastHardware;
+  return TTI::PSK_Software;
+}
+
diff --git a/lib/Target/SystemZ/SystemZTargetTransformInfo.h b/lib/Target/SystemZ/SystemZTargetTransformInfo.h
index d96515adce4..d4989130679 100644
--- a/lib/Target/SystemZ/SystemZTargetTransformInfo.h
+++ b/lib/Target/SystemZ/SystemZTargetTransformInfo.h
@@ -60,6 +60,8 @@ public:
   unsigned getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
                          Type *Ty);
 
+  TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth);
+
   /// @}
 };
 
diff --git a/test/CodeGen/SystemZ/ctpop-01.ll b/test/CodeGen/SystemZ/ctpop-01.ll
new file mode 100644
index 00000000000..ad80f9f2151
--- /dev/null
+++ b/test/CodeGen/SystemZ/ctpop-01.ll
@@ -0,0 +1,96 @@
+; Test population-count instruction
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z196 | FileCheck %s
+
+declare i32 @llvm.ctpop.i32(i32 %a)
+declare i64 @llvm.ctpop.i64(i64 %a)
+
+define i32 @f1(i32 %a) {
+; CHECK-LABEL: f1:
+; CHECK: popcnt  %r0, %r2
+; CHECK: sllk    %r1, %r0, 16
+; CHECK: ar      %r1, %r0
+; CHECK: sllk    %r2, %r1, 8
+; CHECK: ar      %r2, %r1
+; CHECK: srl     %r2, 24
+; CHECK: br      %r14
+
+  %popcnt = call i32 @llvm.ctpop.i32(i32 %a)
+  ret i32 %popcnt
+}
+
+define i32 @f2(i32 %a) {
+; CHECK-LABEL: f2:
+; CHECK: llhr    %r0, %r2
+; CHECK: popcnt  %r0, %r0
+; CHECK: risblg  %r2, %r0, 16, 151, 8
+; CHECK: ar      %r2, %r0
+; CHECK: srl     %r2, 8
+; CHECK: br      %r14
+  %and = and i32 %a, 65535
+  %popcnt = call i32 @llvm.ctpop.i32(i32 %and)
+  ret i32 %popcnt
+}
+
+define i32 @f3(i32 %a) {
+; CHECK-LABEL: f3:
+; CHECK: llcr    %r0, %r2
+; CHECK: popcnt  %r2, %r0
+; CHECK: br      %r14
+  %and = and i32 %a, 255
+  %popcnt = call i32 @llvm.ctpop.i32(i32 %and)
+  ret i32 %popcnt
+}
+
+define i64 @f4(i64 %a) {
+; CHECK-LABEL: f4:
+; CHECK: popcnt  %r0, %r2
+; CHECK: sllg    %r1, %r0, 32
+; CHECK: agr     %r1, %r0
+; CHECK: sllg    %r0, %r1, 16
+; CHECK: agr     %r0, %r1
+; CHECK: sllg    %r1, %r0, 8
+; CHECK: agr     %r1, %r0
+; CHECK: srlg    %r2, %r1, 56
+; CHECK: br      %r14
+  %popcnt = call i64 @llvm.ctpop.i64(i64 %a)
+  ret i64 %popcnt
+}
+
+define i64 @f5(i64 %a) {
+; CHECK-LABEL: f5:
+; CHECK: llgfr   %r0, %r2
+; CHECK: popcnt  %r0, %r0
+; CHECK: sllg    %r1, %r0, 16
+; CHECK: algfr   %r0, %r1
+; CHECK: sllg    %r1, %r0, 8
+; CHECK: algfr   %r0, %r1
+; CHECK: srlg    %r2, %r0, 24
+  %and = and i64 %a, 4294967295
+  %popcnt = call i64 @llvm.ctpop.i64(i64 %and)
+  ret i64 %popcnt
+}
+
+define i64 @f6(i64 %a) {
+; CHECK-LABEL: f6:
+; CHECK: llghr   %r0, %r2
+; CHECK: popcnt  %r0, %r0
+; CHECK: risbg   %r1, %r0, 48, 183, 8
+; CHECK: agr     %r1, %r0
+; CHECK: srlg    %r2, %r1, 8
+; CHECK: br      %r14
+  %and = and i64 %a, 65535
+  %popcnt = call i64 @llvm.ctpop.i64(i64 %and)
+  ret i64 %popcnt
+}
+
+define i64 @f7(i64 %a) {
+; CHECK-LABEL: f7:
+; CHECK: llgcr   %r0, %r2
+; CHECK: popcnt  %r2, %r0
+; CHECK: br      %r14
+  %and = and i64 %a, 255
+  %popcnt = call i64 @llvm.ctpop.i64(i64 %and)
+  ret i64 %popcnt
+}
+
diff --git a/test/MC/Disassembler/SystemZ/insns.txt b/test/MC/Disassembler/SystemZ/insns.txt
index 54a3c5b1d6a..df032dc53f9 100644
--- a/test/MC/Disassembler/SystemZ/insns.txt
+++ b/test/MC/Disassembler/SystemZ/insns.txt
@@ -6334,6 +6334,18 @@
 # CHECK: pfd 15, 0
 0xe3 0xf0 0x00 0x00 0x00 0x36
 
+# CHECK: popcnt %r0, %r0
+0xb9 0xe1 0x00 0x00
+
+# CHECK: popcnt %r0, %r15
+0xb9 0xe1 0x00 0x0f
+
+# CHECK: popcnt %r15, %r0
+0xb9 0xe1 0x00 0xf0
+
+# CHECK: popcnt %r7, %r8
+0xb9 0xe1 0x00 0x78
+
 # CHECK: risbg %r0, %r0, 0, 0, 0
 0xec 0x00 0x00 0x00 0x00 0x55
 
diff --git a/test/MC/SystemZ/insn-bad.s b/test/MC/SystemZ/insn-bad.s
index a08cb34da83..0410a411167 100644
--- a/test/MC/SystemZ/insn-bad.s
+++ b/test/MC/SystemZ/insn-bad.s
@@ -2666,6 +2666,11 @@
 	pfdrl	1, 1
 	pfdrl	1, 0x100000000
 
+#CHECK: error: {{(instruction requires: population-count)?}}
+#CHECK: popcnt	%r0, %r0
+
+	popcnt	%r0, %r0
+
 #CHECK: error: invalid operand
 #CHECK: risbg	%r0,%r0,0,0,-1
 #CHECK: error: invalid operand
diff --git a/test/MC/SystemZ/insn-good-z196.s b/test/MC/SystemZ/insn-good-z196.s
index db5ecdd238c..36bea38f5fe 100644
--- a/test/MC/SystemZ/insn-good-z196.s
+++ b/test/MC/SystemZ/insn-good-z196.s
@@ -1021,6 +1021,16 @@
 	ork	%r15,%r0,%r0
 	ork	%r7,%r8,%r9
 
+#CHECK: popcnt	%r0, %r0                # encoding: [0xb9,0xe1,0x00,0x00]
+#CHECK: popcnt	%r0, %r15               # encoding: [0xb9,0xe1,0x00,0x0f]
+#CHECK: popcnt	%r15, %r0               # encoding: [0xb9,0xe1,0x00,0xf0]
+#CHECK: popcnt	%r7, %r8                # encoding: [0xb9,0xe1,0x00,0x78]
+
+	popcnt	%r0,%r0
+	popcnt	%r0,%r15
+	popcnt	%r15,%r0
+	popcnt	%r7,%r8
+
 #CHECK: risbhg	%r0, %r0, 0, 0, 0       # encoding: [0xec,0x00,0x00,0x00,0x00,0x5d]
 #CHECK: risbhg	%r0, %r0, 0, 0, 63      # encoding: [0xec,0x00,0x00,0x00,0x3f,0x5d]
 #CHECK: risbhg	%r0, %r0, 0, 255, 0     # encoding: [0xec,0x00,0x00,0xff,0x00,0x5d]
-- 
2.34.1