From 331ba2739d484b670000bd59b170fe1e993786d2 Mon Sep 17 00:00:00 2001
From: Justin Holewinski <jholewinski@nvidia.com>
Date: Fri, 28 Jun 2013 17:58:07 +0000
Subject: [PATCH] [NVPTX] Add support for cttz/ctlz/ctpop

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@185176 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/NVPTX/NVPTXISelLowering.cpp | 16 +++++++
 lib/Target/NVPTX/NVPTXInstrInfo.td     | 58 ++++++++++++++++++++++++++
 test/CodeGen/NVPTX/ctlz.ll             | 44 +++++++++++++++++++
 test/CodeGen/NVPTX/ctpop.ll            | 25 +++++++++++
 test/CodeGen/NVPTX/cttz.ll             | 45 ++++++++++++++++++++
 5 files changed, 188 insertions(+)
 create mode 100644 test/CodeGen/NVPTX/ctlz.ll
 create mode 100644 test/CodeGen/NVPTX/ctpop.ll
 create mode 100644 test/CodeGen/NVPTX/cttz.ll

diff --git a/lib/Target/NVPTX/NVPTXISelLowering.cpp b/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 338fe7c155f..8877d131eae 100644
--- a/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -216,6 +216,22 @@ NVPTXTargetLowering::NVPTXTargetLowering(NVPTXTargetMachine &TM)
   // Custom handling for i8 intrinsics
   setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i8, Custom);
 
+  setOperationAction(ISD::CTLZ, MVT::i16, Legal);
+  setOperationAction(ISD::CTLZ, MVT::i32, Legal);
+  setOperationAction(ISD::CTLZ, MVT::i64, Legal);
+  setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16, Legal);
+  setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Legal);
+  setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Legal);
+  setOperationAction(ISD::CTTZ, MVT::i16, Expand);
+  setOperationAction(ISD::CTTZ, MVT::i32, Expand);
+  setOperationAction(ISD::CTTZ, MVT::i64, Expand);
+  setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16, Expand);
+  setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Expand);
+  setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Expand);
+  setOperationAction(ISD::CTPOP, MVT::i16, Legal);
+  setOperationAction(ISD::CTPOP, MVT::i32, Legal);
+  setOperationAction(ISD::CTPOP, MVT::i64, Legal);
+
   // Now deduce the information based on the above mentioned
   // actions
   computeRegisterProperties();
diff --git a/lib/Target/NVPTX/NVPTXInstrInfo.td b/lib/Target/NVPTX/NVPTXInstrInfo.td
index 32193641f21..553a6ba703d 100644
--- a/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -2406,6 +2406,64 @@ def F64toV2F32 : NVPTXInst<(outs Float32Regs:$d1, Float32Regs:$d2),
                            "mov.b64\t{{$d1, $d2}}, $s;",
                           []>;
 
+// Count leading zeros
+def CLZr32 : NVPTXInst<(outs Int32Regs:$d), (ins Int32Regs:$a),
+                       "clz.b32\t$d, $a;",
+                       []>;
+def CLZr64 : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
+                       "clz.b64\t$d, $a;",
+                       []>;
+
+// 32-bit has a direct PTX instruction
+def : Pat<(ctlz Int32Regs:$a),
+          (CLZr32 Int32Regs:$a)>;
+def : Pat<(ctlz_zero_undef Int32Regs:$a),
+          (CLZr32 Int32Regs:$a)>;
+
+// For 64-bit, the result in PTX is actually 32-bit so we zero-extend
+// to 64-bit to match the LLVM semantics
+def : Pat<(ctlz Int64Regs:$a),
+          (CVT_u64_u32 (CLZr64 Int64Regs:$a), CvtNONE)>;
+def : Pat<(ctlz_zero_undef Int64Regs:$a),
+          (CVT_u64_u32 (CLZr64 Int64Regs:$a), CvtNONE)>;
+
+// For 16-bit, we zero-extend to 32-bit, then trunc the result back
+// to 16-bits (ctlz of a 16-bit value is guaranteed to require less
+// than 16 bits to store). We also need to subtract 16 because the
+// high-order 16 zeros were counted.
+def : Pat<(ctlz Int16Regs:$a),
+          (SUBi16ri (CVT_u16_u32 (CLZr32
+            (CVT_u32_u16 Int16Regs:$a, CvtNONE)),
+           CvtNONE), 16)>;
+def : Pat<(ctlz_zero_undef Int16Regs:$a),
+          (SUBi16ri (CVT_u16_u32 (CLZr32
+            (CVT_u32_u16 Int16Regs:$a, CvtNONE)),
+           CvtNONE), 16)>;
+
+// Population count
+def POPCr32 : NVPTXInst<(outs Int32Regs:$d), (ins Int32Regs:$a),
+                        "popc.b32\t$d, $a;",
+                        []>;
+def POPCr64 : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
+                        "popc.b64\t$d, $a;",
+                        []>;
+
+// 32-bit has a direct PTX instruction
+def : Pat<(ctpop Int32Regs:$a),
+          (POPCr32 Int32Regs:$a)>;
+
+// For 64-bit, the result in PTX is actually 32-bit so we zero-extend
+// to 64-bit to match the LLVM semantics
+def : Pat<(ctpop Int64Regs:$a),
+          (CVT_u64_u32 (POPCr64 Int64Regs:$a), CvtNONE)>;
+
+// For 16-bit, we zero-extend to 32-bit, then trunc the result back
+// to 16-bits (ctpop of a 16-bit value is guaranteed to require less
+// than 16 bits to store)
+def : Pat<(ctpop Int16Regs:$a),
+          (CVT_u16_u32 (POPCr32 (CVT_u32_u16 Int16Regs:$a, CvtNONE)),
+           CvtNONE)>;
+
 // fround f64 -> f32
 def : Pat<(f32 (fround Float64Regs:$a)),
           (CVT_f32_f64 Float64Regs:$a, CvtRN_FTZ)>, Requires<[doF32FTZ]>;
diff --git a/test/CodeGen/NVPTX/ctlz.ll b/test/CodeGen/NVPTX/ctlz.ll
new file mode 100644
index 00000000000..bed15a9f6a5
--- /dev/null
+++ b/test/CodeGen/NVPTX/ctlz.ll
@@ -0,0 +1,44 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
+
+declare i16 @llvm.ctlz.i16(i16, i1) readnone
+declare i32 @llvm.ctlz.i32(i32, i1) readnone
+declare i64 @llvm.ctlz.i64(i64, i1) readnone
+
+define i32 @myctpop(i32 %a) {
+; CHECK: clz.b32
+  %val = call i32 @llvm.ctlz.i32(i32 %a, i1 false) readnone
+  ret i32 %val
+}
+
+define i16 @myctpop16(i16 %a) {
+; CHECK: clz.b32
+  %val = call i16 @llvm.ctlz.i16(i16 %a, i1 false) readnone
+  ret i16 %val
+}
+
+define i64 @myctpop64(i64 %a) {
+; CHECK: clz.b64
+  %val = call i64 @llvm.ctlz.i64(i64 %a, i1 false) readnone
+  ret i64 %val
+}
+
+
+define i32 @myctpop_2(i32 %a) {
+; CHECK: clz.b32
+  %val = call i32 @llvm.ctlz.i32(i32 %a, i1 true) readnone
+  ret i32 %val
+}
+
+define i16 @myctpop16_2(i16 %a) {
+; CHECK: clz.b32
+  %val = call i16 @llvm.ctlz.i16(i16 %a, i1 true) readnone
+  ret i16 %val
+}
+
+define i64 @myctpop64_2(i64 %a) {
+; CHECK: clz.b64
+  %val = call i64 @llvm.ctlz.i64(i64 %a, i1 true) readnone
+  ret i64 %val
+}
diff --git a/test/CodeGen/NVPTX/ctpop.ll b/test/CodeGen/NVPTX/ctpop.ll
new file mode 100644
index 00000000000..89477acae9f
--- /dev/null
+++ b/test/CodeGen/NVPTX/ctpop.ll
@@ -0,0 +1,25 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
+
+define i32 @myctpop(i32 %a) {
+; CHECK: popc.b32
+  %val = tail call i32 @llvm.ctpop.i32(i32 %a)
+  ret i32 %val
+}
+
+define i16 @myctpop16(i16 %a) {
+; CHECK: popc.b32
+  %val = tail call i16 @llvm.ctpop.i16(i16 %a)
+  ret i16 %val
+}
+
+define i64 @myctpop64(i64 %a) {
+; CHECK: popc.b64
+  %val = tail call i64 @llvm.ctpop.i64(i64 %a)
+  ret i64 %val
+}
+
+declare i16 @llvm.ctpop.i16(i16)
+declare i32 @llvm.ctpop.i32(i32)
+declare i64 @llvm.ctpop.i64(i64)
\ No newline at end of file
diff --git a/test/CodeGen/NVPTX/cttz.ll b/test/CodeGen/NVPTX/cttz.ll
new file mode 100644
index 00000000000..124ba9d1e9a
--- /dev/null
+++ b/test/CodeGen/NVPTX/cttz.ll
@@ -0,0 +1,45 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
+
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
+
+declare i16 @llvm.cttz.i16(i16, i1) readnone
+declare i32 @llvm.cttz.i32(i32, i1) readnone
+declare i64 @llvm.cttz.i64(i64, i1) readnone
+
+define i32 @myctpop(i32 %a) {
+; CHECK: popc.b32
+  %val = call i32 @llvm.cttz.i32(i32 %a, i1 false) readnone
+  ret i32 %val
+}
+
+define i16 @myctpop16(i16 %a) {
+; CHECK: popc.b32
+  %val = call i16 @llvm.cttz.i16(i16 %a, i1 false) readnone
+  ret i16 %val
+}
+
+define i64 @myctpop64(i64 %a) {
+; CHECK: popc.b64
+  %val = call i64 @llvm.cttz.i64(i64 %a, i1 false) readnone
+  ret i64 %val
+}
+
+
+define i32 @myctpop_2(i32 %a) {
+; CHECK: popc.b32
+  %val = call i32 @llvm.cttz.i32(i32 %a, i1 true) readnone
+  ret i32 %val
+}
+
+define i16 @myctpop16_2(i16 %a) {
+; CHECK: popc.b32
+  %val = call i16 @llvm.cttz.i16(i16 %a, i1 true) readnone
+  ret i16 %val
+}
+
+define i64 @myctpop64_2(i64 %a) {
+; CHECK: popc.b64
+  %val = call i64 @llvm.cttz.i64(i64 %a, i1 true) readnone
+  ret i64 %val
+}
-- 
2.34.1