netfilter: xt_qtaguid: 1st pass at tracking tag based data resources
authorJP Abgrall <jpa@google.com>
Fri, 9 Sep 2011 08:55:24 +0000 (01:55 -0700)
committerJP Abgrall <jpa@google.com>
Fri, 9 Sep 2011 22:46:22 +0000 (15:46 -0700)
* Added global resource tracking based on tags.
 - Can be put into passive mode via
    /sys/modules/xt_qtaguid/params/tag_tracking_passive
 - The number of socket tags per UID is now limited
 - Adding /dev/xt_qtaguid that each process should open before starting
to tag sockets. A later change will make it a "must".
 - A process should not create new tags unless it has the dev open.
  A later change will make it a must.
 - On qtaguid_resources release, the process' matching socket tag info
  is deleted.
* Support run-time debug mask via /sys/modules parameter "debug_mask".
* split module into prettyprinting code, includes, main.
* Removed ptrdiff_t usage which didn't work in all cases.

Change-Id: I4a21d3bea55d23c1c3747253904e2a79f7d555d9
Signed-off-by: JP Abgrall <jpa@google.com>
net/netfilter/Makefile
net/netfilter/xt_qtaguid.c
net/netfilter/xt_qtaguid_internal.h [new file with mode: 0644]
net/netfilter/xt_qtaguid_print.c [new file with mode: 0644]
net/netfilter/xt_qtaguid_print.h [new file with mode: 0644]

index 8b658ef785018a5efb5ae7e916b4681d181c67f1..6d917176c3b8e477b52a0fc25fc74c9054f27ec9 100644 (file)
@@ -95,7 +95,7 @@ obj-$(CONFIG_NETFILTER_XT_MATCH_OWNER) += xt_owner.o
 obj-$(CONFIG_NETFILTER_XT_MATCH_PHYSDEV) += xt_physdev.o
 obj-$(CONFIG_NETFILTER_XT_MATCH_PKTTYPE) += xt_pkttype.o
 obj-$(CONFIG_NETFILTER_XT_MATCH_POLICY) += xt_policy.o
-obj-$(CONFIG_NETFILTER_XT_MATCH_QTAGUID) += xt_qtaguid.o
+obj-$(CONFIG_NETFILTER_XT_MATCH_QTAGUID) += xt_qtaguid_print.o xt_qtaguid.o
 obj-$(CONFIG_NETFILTER_XT_MATCH_QUOTA) += xt_quota.o
 obj-$(CONFIG_NETFILTER_XT_MATCH_QUOTA2) += xt_quota2.o
 obj-$(CONFIG_NETFILTER_XT_MATCH_RATEEST) += xt_rateest.o
index b145f5a4b45d2d6e27e1dd7f8ef27b9b18d98c7e..dff2ff3fd7cc8ce76e00c060e5f500dd92a2eb75 100644 (file)
@@ -8,36 +8,11 @@
  * published by the Free Software Foundation.
  */
 
-/* #define DEBUG */
-/* #define IDEBUG */
-/* #define MDEBUG */
-/* #define RDEBUG */
-/* #define CDEBUG */
-
-/* Iface handling */
-#ifdef IDEBUG
-#define IF_DEBUG(...) pr_debug(__VA_ARGS__)
-#else
-#define IF_DEBUG(...) no_printk(__VA_ARGS__)
-#endif
-/* Iptable Matching */
-#ifdef MDEBUG
-#define MT_DEBUG(...) pr_debug(__VA_ARGS__)
-#else
-#define MT_DEBUG(...) no_printk(__VA_ARGS__)
-#endif
-/* Red-black tree handling */
-#ifdef RDEBUG
-#define RB_DEBUG(...) pr_debug(__VA_ARGS__)
-#else
-#define RB_DEBUG(...) no_printk(__VA_ARGS__)
-#endif
-/* procfs ctrl/stats handling */
-#ifdef CDEBUG
-#define CT_DEBUG(...) pr_debug(__VA_ARGS__)
-#else
-#define CT_DEBUG(...) no_printk(__VA_ARGS__)
-#endif
+/*
+ * There are run-time debug flags enabled via the debug_mask module param, or
+ * via the DEFAULT_DEBUG_MASK. See xt_qtaguid_internal.h.
+ */
+#define DEBUG
 
 #include <linux/file.h>
 #include <linux/inetdevice.h>
@@ -52,6 +27,9 @@
 #include <net/udp.h>
 
 #include <linux/netfilter/xt_socket.h>
+#include "xt_qtaguid_internal.h"
+#include "xt_qtaguid_print.h"
+
 /*
  * We only use the xt_socket funcs within a similar context to avoid unexpected
  * return values.
@@ -92,202 +70,159 @@ module_param_named(stats_readall_gid, proc_stats_readall_gid, uint,
 module_param_named(ctrl_write_gid, proc_ctrl_write_gid, uint,
                   S_IRUGO | S_IWUSR);
 
+/*
+ * Limit the number of active tags (via socket tags) for a given UID.
+ * Multiple processes could share the UID.
+ */
+static int max_sock_tags = DEFAULT_MAX_SOCK_TAGS;
+module_param(max_sock_tags, int, S_IRUGO | S_IWUSR);
+
 /*
  * After the kernel has initiallized this module, it is still possible
- * to make it passive:
- *  - do not register it via iptables.
- *   the matching code will not be invoked.
- *  - set passive to 0
- *   the iface stats handling will not be act on notifications.
+ * to make it passive.
+ * Setting passive to Y:
+ *  - the iface stats handling will not act on notifications.
+ *  - iptables matches will never match.
+ *  - ctrl commands silently succeed.
+ *  - stats are always empty.
  * This is mostly usefull when a bug is suspected.
  */
 static bool module_passive;
 module_param_named(passive, module_passive, bool, S_IRUGO | S_IWUSR);
 
-/*---------------------------------------------------------------------------*/
 /*
- * Tags:
- *
- * They represent what the data usage counters will be tracked against.
- * By default a tag is just based on the UID.
- * The UID is used as the base for policying, and can not be ignored.
- * So a tag will always at least represent a UID (uid_tag).
- *
- * A tag can be augmented with an "accounting tag" which is associated
- * with a UID.
- * User space can set the acct_tag portion of the tag which is then used
- * with sockets: all data belong to that socket will be counted against the
- * tag. The policing is then based on the tag's uid_tag portion,
- * and stats are collected for the acct_tag portion seperately.
- *
- * There could be
- * a:  {acct_tag=1, uid_tag=10003}
- * b:  {acct_tag=2, uid_tag=10003}
- * c:  {acct_tag=3, uid_tag=10003}
- * d:  {acct_tag=0, uid_tag=10003}
- * (a, b, and c represent tags associated with specific sockets.
- * d is for the totals for that uid, including all untagged traffic.
- * Typically d is used with policing/quota rules.
- *
- * We want tag_t big enough to distinguish uid_t and acct_tag.
- * It might become a struct if needed.
- * Nothing should be using it as an int.
+ * Control how qtaguid data is tracked per proc/uid.
+ * Setting tag_tracking_passive to Y:
+ *  - don't create proc specific structs to track tags
+ *  - don't check that active tag stats exceed some limits.
+ *  - don't clean up socket tags on process exits.
+ * This is mostly usefull when a bug is suspected.
  */
-typedef uint64_t tag_t;  /* Only used via accessors */
+static bool qtu_proc_handling_passive;
+module_param_named(tag_tracking_passive, qtu_proc_handling_passive, bool,
+                  S_IRUGO | S_IWUSR);
+
 
+#define QTU_DEV_NAME "xt_qtaguid"
+
+uint debug_mask = DEFAULT_DEBUG_MASK;
+module_param(debug_mask, uint, S_IRUGO | S_IWUSR);
+
+/*---------------------------------------------------------------------------*/
 static const char *iface_stat_procdirname = "iface_stat";
 static struct proc_dir_entry *iface_stat_procdir;
 
-
 /*
- * For now we only track 2 sets of counters.
- * The default set is 0.
- * Userspace can activate another set for a given uid being tracked.
+ * Ordering of locks:
+ *  outer locks:
+ *    iface_stat_list_lock
+ *    sock_tag_list_lock
+ *  inner locks:
+ *    uid_tag_data_tree_lock
+ *    tag_counter_set_list_lock
+ * Notice how sock_tag_list_lock is held sometimes when uid_tag_data_tree_lock
+ * is acquired.
+ *
+ * Call tree with all lock holders as of 2011-09-06:
+ *
+ *   qtaguid_ctrl_parse()
+ *     ctrl_cmd_delete()
+ *       sock_tag_list_lock
+ *       tag_counter_set_list_lock
+ *       iface_stat_list_lock
+ *         iface_entry->tag_stat_list_lock
+ *       uid_tag_data_tree_lock
+ *     ctrl_cmd_counter_set()
+ *       tag_counter_set_list_lock
+ *     ctrl_cmd_tag()
+ *       sock_tag_list_lock
+ *         get_tag_ref()
+ *           uid_tag_data_tree_lock
+ *       uid_tag_data_tree_lock
+ *     ctrl_cmd_untag()
+ *       sock_tag_list_lock
+ *         uid_tag_data_tree_lock
+ *
+ *   qtaguid_mt()
+ *     account_for_uid()
+ *       if_tag_stat_update()
+ *     get_sock_stat()
+ *       sock_tag_list_lock
+ *        iface_entry->tag_stat_list_lock
+ *        tag_stat_update()
+ *          get_active_counter_set()
+ *            tag_counter_set_list_lock
+ *
+ *   iface_netdev_event_handler()
+ *     iface_stat_create()
+ *       iface_stat_list_lock
+ *     iface_stat_update()
+ *       iface_stat_list_lock
+ *
+ *   iface_inet6addr_event_handler()
+ *     iface_stat_create_ipv6()
+ *       iface_stat_list_lock
+ *     iface_stat_update()
+ *       iface_stat_list_lock
+ *
+ *   iface_inetaddr_event_handler()
+ *     iface_stat_create()
+ *       iface_stat_list_lock
+ *     iface_stat_update()
+ *       iface_stat_list_lock
+ *
+ *   qtaguid_ctrl_proc_read()
+ *     sock_tag_list_lock
+ *     sock_tag_list_lock
+ *     uid_tag_data_tree_lock
+ *     iface_stat_list_lock
+ *
+ *   qtaguid_stats_proc_read()
+ *     iface_stat_list_lock
+ *       iface_entry->tag_stat_list_lock
+ *
+ *   qtudev_open()
+ *     uid_tag_data_tree_lock
+ *
+ *   qtud_dev_release()
+ *     sock_tag_list_lock
+ *       uid_tag_data_tree_lock
  */
-#define IFS_MAX_COUNTER_SETS 2
-
-enum ifs_tx_rx {
-       IFS_TX,
-       IFS_RX,
-       IFS_MAX_DIRECTIONS
-};
-
-/* For now, TCP, UDP, the rest */
-enum ifs_proto {
-       IFS_TCP,
-       IFS_UDP,
-       IFS_PROTO_OTHER,
-       IFS_MAX_PROTOS
-};
-
-struct byte_packet_counters {
-       uint64_t bytes;
-       uint64_t packets;
-};
-
-struct data_counters {
-       struct byte_packet_counters bpc[IFS_MAX_COUNTER_SETS][IFS_MAX_DIRECTIONS][IFS_MAX_PROTOS];
-};
-
-/* Generic tag based node used as a base for rb_tree ops. */
-struct tag_node {
-       struct rb_node node;
-       tag_t tag;
-};
-
-struct tag_stat {
-       struct tag_node tn;
-       struct data_counters counters;
-       /*
-        * If this tag is acct_tag based, we need to count against the
-        * matching parent uid_tag.
-        */
-       struct data_counters *parent_counters;
-};
-
-struct iface_stat {
-       struct list_head list;
-       char *ifname;
-       uint64_t rx_bytes;
-       uint64_t rx_packets;
-       uint64_t tx_bytes;
-       uint64_t tx_packets;
-       bool active;
-       struct proc_dir_entry *proc_ptr;
-
-       struct rb_root tag_stat_tree;
-       spinlock_t tag_stat_list_lock;
-};
-
 static LIST_HEAD(iface_stat_list);
 static DEFINE_SPINLOCK(iface_stat_list_lock);
 
-/* This is needed to create proc_dir_entries from atomic context. */
-struct iface_stat_work {
-       struct work_struct iface_work;
-       struct iface_stat *iface_entry;
-};
-
-/*
- * Track tag that this socket is transferring data for, and not necessarily
- * the uid that owns the socket.
- * This is the tag against which tag_stat.counters will be billed.
- */
-struct sock_tag {
-       struct rb_node sock_node;
-       struct sock *sk;  /* Only used as a number, never dereferenced */
-       /* The socket is needed for sockfd_put() */
-       struct socket *socket;
-
-       tag_t tag;
-};
-
-struct qtaguid_event_counts {
-       /* Various successful events */
-       atomic64_t sockets_tagged;
-       atomic64_t sockets_untagged;
-       atomic64_t counter_set_changes;
-       atomic64_t delete_cmds;
-       atomic64_t iface_events;  /* Number of NETDEV_* events handled */
-       /*
-        * match_found_sk_*: numbers related to the netfilter matching
-        * function finding a sock for the sk_buff.
-        */
-       atomic64_t match_found_sk;   /* An sk was already in the sk_buff. */
-       /* The connection tracker had the sk. */
-       atomic64_t match_found_sk_in_ct;
-       /*
-        * No sk could be found. No apparent owner. Could happen with
-        * unsolicited traffic.
-        */
-       atomic64_t match_found_sk_none;
-};
-static struct qtaguid_event_counts qtu_events;
-
 static struct rb_root sock_tag_tree = RB_ROOT;
 static DEFINE_SPINLOCK(sock_tag_list_lock);
 
-/* Track the set active_set for the given tag. */
-struct tag_counter_set {
-       struct tag_node tn;
-       int active_set;
-};
-
 static struct rb_root tag_counter_set_tree = RB_ROOT;
 static DEFINE_SPINLOCK(tag_counter_set_list_lock);
 
-static bool qtaguid_mt(const struct sk_buff *skb, struct xt_action_param *par);
+static struct rb_root uid_tag_data_tree = RB_ROOT;
+static DEFINE_SPINLOCK(uid_tag_data_tree_lock);
 
+static struct rb_root proc_qtu_data_tree = RB_ROOT;
+/* No proc_qtu_data_tree_lock; use uid_tag_data_tree_lock */
+
+static struct qtaguid_event_counts qtu_events;
 /*----------------------------------------------*/
-static inline int tag_compare(tag_t t1, tag_t t2)
+static bool can_manipulate_uids(void)
 {
-       return t1 < t2 ? -1 : t1 == t2 ? 0 : 1;
+       /* root pwnd */
+       return unlikely(!current_fsuid()) || unlikely(!proc_ctrl_write_gid)
+               || in_egroup_p(proc_ctrl_write_gid);
 }
 
-static inline tag_t combine_atag_with_uid(tag_t acct_tag, uid_t uid)
-{
-       return acct_tag | uid;
-}
-static inline tag_t make_tag_from_uid(uid_t uid)
-{
-       return uid;
-}
-static inline uid_t get_uid_from_tag(tag_t tag)
-{
-       return tag & 0xFFFFFFFFULL;
-}
-static inline tag_t get_utag_from_tag(tag_t tag)
-{
-       return tag & 0xFFFFFFFFULL;
-}
-static inline tag_t get_atag_from_tag(tag_t tag)
+static bool can_impersonate_uid(uid_t uid)
 {
-       return tag & ~0xFFFFFFFFULL;
+       return uid == current_fsuid() || can_manipulate_uids();
 }
 
-static inline bool valid_atag(tag_t tag)
+static bool can_read_other_uid_stats(uid_t uid)
 {
-       return !(tag & 0xFFFFFFFFULL);
+       /* root pwnd */
+       return unlikely(!current_fsuid()) || uid == current_fsuid()
+               || unlikely(!proc_stats_readall_gid)
+               || in_egroup_p(proc_stats_readall_gid);
 }
 
 static inline void dc_add_byte_packets(struct data_counters *counters, int set,
@@ -324,12 +259,13 @@ static struct tag_node *tag_node_tree_search(struct rb_root *root, tag_t tag)
 
        while (node) {
                struct tag_node *data = rb_entry(node, struct tag_node, node);
-               int result = tag_compare(tag, data->tag);
-               RB_DEBUG("qtaguid: tag_node_tree_search(): tag=0x%llx"
-                        " (uid=%d)\n",
-                        data->tag,
-                        get_uid_from_tag(data->tag));
-
+               int result;
+               RB_DEBUG("qtaguid: tag_node_tree_search(0x%llx): "
+                        " node=%p data=%p\n", tag, node, data);
+               result = tag_compare(tag, data->tag);
+               RB_DEBUG("qtaguid: tag_node_tree_search(0x%llx): "
+                        " data.tag=0x%llx (uid=%u) res=%d\n",
+                        tag, data->tag, get_uid_from_tag(data->tag), result);
                if (result < 0)
                        node = node->rb_left;
                else if (result > 0)
@@ -349,8 +285,8 @@ static void tag_node_tree_insert(struct tag_node *data, struct rb_root *root)
                struct tag_node *this = rb_entry(*new, struct tag_node,
                                                 node);
                int result = tag_compare(data->tag, this->tag);
-               RB_DEBUG("qtaguid: tag_node_tree_insert(): tag=0x%llx"
-                        " (uid=%d)\n",
+               RB_DEBUG("qtaguid: %s(): tag=0x%llx"
+                        " (uid=%u)\n", __func__,
                         this->tag,
                         get_uid_from_tag(this->tag));
                parent = *new;
@@ -396,6 +332,19 @@ static struct tag_counter_set *tag_counter_set_tree_search(struct rb_root *root,
 
 }
 
+static void tag_ref_tree_insert(struct tag_ref *data, struct rb_root *root)
+{
+       tag_node_tree_insert(&data->tn, root);
+}
+
+static struct tag_ref *tag_ref_tree_search(struct rb_root *root, tag_t tag)
+{
+       struct tag_node *node = tag_node_tree_search(root, tag);
+       if (!node)
+               return NULL;
+       return rb_entry(&node->node, struct tag_ref, tn.node);
+}
+
 static struct sock_tag *sock_tag_tree_search(struct rb_root *root,
                                             const struct sock *sk)
 {
@@ -404,10 +353,9 @@ static struct sock_tag *sock_tag_tree_search(struct rb_root *root,
        while (node) {
                struct sock_tag *data = rb_entry(node, struct sock_tag,
                                                 sock_node);
-               ptrdiff_t result = sk - data->sk;
-               if (result < 0)
+               if (sk < data->sk)
                        node = node->rb_left;
-               else if (result > 0)
+               else if (sk > data->sk)
                        node = node->rb_right;
                else
                        return data;
@@ -423,11 +371,10 @@ static void sock_tag_tree_insert(struct sock_tag *data, struct rb_root *root)
        while (*new) {
                struct sock_tag *this = rb_entry(*new, struct sock_tag,
                                                 sock_node);
-               ptrdiff_t result = data->sk - this->sk;
                parent = *new;
-               if (result < 0)
+               if (data->sk < this->sk)
                        new = &((*new)->rb_left);
-               else if (result > 0)
+               else if (data->sk > this->sk)
                        new = &((*new)->rb_right);
                else
                        BUG();
@@ -438,6 +385,292 @@ static void sock_tag_tree_insert(struct sock_tag *data, struct rb_root *root)
        rb_insert_color(&data->sock_node, root);
 }
 
+static void sock_tag_tree_erase(struct rb_root *st_to_free_tree)
+{
+       struct rb_node *node;
+       struct sock_tag *st_entry;
+
+       node = rb_first(st_to_free_tree);
+       while (node) {
+               st_entry = rb_entry(node, struct sock_tag, sock_node);
+               node = rb_next(node);
+               CT_DEBUG("qtaguid: %s(): "
+                        "erase st: sk=%p tag=0x%llx (uid=%u)\n", __func__,
+                        st_entry->sk,
+                        st_entry->tag,
+                        get_uid_from_tag(st_entry->tag));
+               rb_erase(&st_entry->sock_node, st_to_free_tree);
+               sockfd_put(st_entry->socket);
+               kfree(st_entry);
+       }
+}
+
+static struct proc_qtu_data *proc_qtu_data_tree_search(struct rb_root *root,
+                                                      const pid_t pid)
+{
+       struct rb_node *node = root->rb_node;
+
+       while (node) {
+               struct proc_qtu_data *data = rb_entry(node,
+                                                     struct proc_qtu_data,
+                                                     node);
+               if (pid < data->pid)
+                       node = node->rb_left;
+               else if (pid > data->pid)
+                       node = node->rb_right;
+               else
+                       return data;
+       }
+       return NULL;
+}
+
+static void proc_qtu_data_tree_insert(struct proc_qtu_data *data,
+                                     struct rb_root *root)
+{
+       struct rb_node **new = &(root->rb_node), *parent = NULL;
+
+       /* Figure out where to put new node */
+       while (*new) {
+               struct proc_qtu_data *this = rb_entry(*new,
+                                                     struct proc_qtu_data,
+                                                     node);
+               parent = *new;
+               if (data->pid < this->pid)
+                       new = &((*new)->rb_left);
+               else if (data->pid > this->pid)
+                       new = &((*new)->rb_right);
+               else
+                       BUG();
+       }
+
+       /* Add new node and rebalance tree. */
+       rb_link_node(&data->node, parent, new);
+       rb_insert_color(&data->node, root);
+}
+
+static void uid_tag_data_tree_insert(struct uid_tag_data *data,
+                                    struct rb_root *root)
+{
+       struct rb_node **new = &(root->rb_node), *parent = NULL;
+
+       /* Figure out where to put new node */
+       while (*new) {
+               struct uid_tag_data *this = rb_entry(*new,
+                                                    struct uid_tag_data,
+                                                    node);
+               parent = *new;
+               if (data->uid < this->uid)
+                       new = &((*new)->rb_left);
+               else if (data->uid > this->uid)
+                       new = &((*new)->rb_right);
+               else
+                       BUG();
+       }
+
+       /* Add new node and rebalance tree. */
+       rb_link_node(&data->node, parent, new);
+       rb_insert_color(&data->node, root);
+}
+
+static struct uid_tag_data *uid_tag_data_tree_search(struct rb_root *root,
+                                                    uid_t uid)
+{
+       struct rb_node *node = root->rb_node;
+
+       while (node) {
+               struct uid_tag_data *data = rb_entry(node,
+                                                    struct uid_tag_data,
+                                                    node);
+               if (uid < data->uid)
+                       node = node->rb_left;
+               else if (uid > data->uid)
+                       node = node->rb_right;
+               else
+                       return data;
+       }
+       return NULL;
+}
+
+/*
+ * Allocates a new uid_tag_data struct if needed.
+ * Returns a pointer to the found or allocated uid_tag_data.
+ * Returns a PTR_ERR on failures, and lock is not held.
+ * If found is not NULL:
+ *   sets *found to true if not allocated.
+ *   sets *found to false if allocated.
+ */
+struct uid_tag_data *get_uid_data(uid_t uid, bool *found_res)
+{
+       struct uid_tag_data *utd_entry;
+
+       /* Look for top level uid_tag_data for the UID */
+       utd_entry = uid_tag_data_tree_search(&uid_tag_data_tree, uid);
+       DR_DEBUG("qtaguid: get_uid_data(%u) utd=%p\n", uid, utd_entry);
+
+       if (found_res)
+               *found_res = utd_entry;
+       if (utd_entry)
+               return utd_entry;
+
+       utd_entry = kzalloc(sizeof(*utd_entry), GFP_ATOMIC);
+       if (!utd_entry) {
+               pr_err("qtaguid: get_uid_data(%u): "
+                      "tag data alloc failed\n", uid);
+               return ERR_PTR(-ENOMEM);
+       }
+
+       utd_entry->uid = uid;
+       utd_entry->tag_ref_tree = RB_ROOT;
+       uid_tag_data_tree_insert(utd_entry, &uid_tag_data_tree);
+       DR_DEBUG("qtaguid: get_uid_data(%u) new utd=%p\n", uid, utd_entry);
+       return utd_entry;
+}
+
+/* Never returns NULL. Either PTR_ERR or a valid ptr. */
+static struct tag_ref *new_tag_ref(tag_t new_tag,
+                                  struct uid_tag_data *utd_entry)
+{
+       struct tag_ref *tr_entry;
+       int res;
+
+       if (utd_entry->num_active_tags + 1 > max_sock_tags) {
+               pr_info("qtaguid: new_tag_ref(0x%llx): "
+                       "tag ref alloc quota exceeded. max=%d\n",
+                       new_tag, max_sock_tags);
+               res = -EMFILE;
+               goto err_res;
+
+       }
+
+       tr_entry = kzalloc(sizeof(*tr_entry), GFP_ATOMIC);
+       if (!tr_entry) {
+               pr_err("qtaguid: new_tag_ref(0x%llx): "
+                      "tag ref alloc failed\n",
+                      new_tag);
+               res = -ENOMEM;
+               goto err_res;
+       }
+       tr_entry->tn.tag = new_tag;
+       /* tr_entry->num_sock_tags  handled by caller */
+       utd_entry->num_active_tags++;
+       tag_ref_tree_insert(tr_entry, &utd_entry->tag_ref_tree);
+       DR_DEBUG("qtaguid: new_tag_ref(0x%llx): "
+                " inserted new tag ref\n",
+                new_tag);
+       return tr_entry;
+
+err_res:
+       return ERR_PTR(res);
+}
+
+static struct tag_ref *lookup_tag_ref(tag_t full_tag,
+                                     struct uid_tag_data **utd_res)
+{
+       struct uid_tag_data *utd_entry;
+       struct tag_ref *tr_entry;
+       bool found_utd;
+       uid_t uid = get_uid_from_tag(full_tag);
+
+       DR_DEBUG("qtaguid: lookup_tag_ref(tag=0x%llx (uid=%u))\n",
+                full_tag, uid);
+
+       utd_entry = get_uid_data(uid, &found_utd);
+       if (IS_ERR_OR_NULL(utd_entry)) {
+               if (utd_res)
+                       *utd_res = utd_entry;
+               return NULL;
+       }
+
+       tr_entry = tag_ref_tree_search(&utd_entry->tag_ref_tree, full_tag);
+       if (utd_res)
+               *utd_res = utd_entry;
+       DR_DEBUG("qtaguid: lookup_tag_ref(0x%llx) utd_entry=%p tr_entry=%p\n",
+                full_tag, utd_entry, tr_entry);
+       return tr_entry;
+}
+
+/* Never returns NULL. Either PTR_ERR or a valid ptr. */
+static struct tag_ref *get_tag_ref(tag_t full_tag,
+                                  struct uid_tag_data **utd_res)
+{
+       struct uid_tag_data *utd_entry;
+       struct tag_ref *tr_entry;
+
+       DR_DEBUG("qtaguid: get_tag_ref(0x%llx)\n",
+                full_tag);
+       spin_lock_bh(&uid_tag_data_tree_lock);
+       tr_entry = lookup_tag_ref(full_tag, &utd_entry);
+       BUG_ON(IS_ERR_OR_NULL(utd_entry));
+       if (!tr_entry)
+               tr_entry = new_tag_ref(full_tag, utd_entry);
+
+       spin_unlock_bh(&uid_tag_data_tree_lock);
+       if (utd_res)
+               *utd_res = utd_entry;
+       DR_DEBUG("qtaguid: get_tag_ref(0x%llx) utd=%p tr=%p\n",
+                full_tag, utd_entry, tr_entry);
+       return tr_entry;
+}
+
+/* Checks and maybe frees the UID Tag Data entry */
+static void put_utd_entry(struct uid_tag_data *utd_entry)
+{
+       /* Are we done with the UID tag data entry? */
+       if (RB_EMPTY_ROOT(&utd_entry->tag_ref_tree)) {
+               DR_DEBUG("qtaguid: %s(): "
+                        "erase utd_entry=%p uid=%u "
+                        "by pid=%u tgid=%u uid=%u\n", __func__,
+                        utd_entry, utd_entry->uid,
+                        current->pid, current->tgid, current_fsuid());
+               BUG_ON(utd_entry->num_active_tags);
+               rb_erase(&utd_entry->node, &uid_tag_data_tree);
+               kfree(utd_entry);
+       } else {
+               DR_DEBUG("qtaguid: %s(): "
+                        "utd_entry=%p still has %d tags\n", __func__,
+                        utd_entry, utd_entry->num_active_tags);
+               BUG_ON(!utd_entry->num_active_tags);
+       }
+}
+
+/*
+ * If no sock_tags are using this tag_ref,
+ * decrements refcount of utd_entry, removes tr_entry
+ * from utd_entry->tag_ref_tree and frees.
+ */
+static void free_tag_ref_from_utd_entry(struct tag_ref *tr_entry,
+                                       struct uid_tag_data *utd_entry)
+{
+       DR_DEBUG("qtaguid: %s(): %p tag=0x%llx (uid=%u)\n", __func__,
+                tr_entry, tr_entry->tn.tag,
+                get_uid_from_tag(tr_entry->tn.tag));
+       if (!tr_entry->num_sock_tags) {
+               BUG_ON(!utd_entry->num_active_tags);
+               utd_entry->num_active_tags--;
+               rb_erase(&tr_entry->tn.node, &utd_entry->tag_ref_tree);
+               DR_DEBUG("qtaguid: %s(): erased %p\n", __func__, tr_entry);
+               kfree(tr_entry);
+       }
+}
+
+static void put_tag_ref_tree(tag_t full_tag, struct uid_tag_data *utd_entry)
+{
+       struct rb_node *node;
+       struct tag_ref *tr_entry;
+       tag_t acct_tag;
+
+       DR_DEBUG("qtaguid: %s(tag=0x%llx (uid=%u))\n", __func__,
+                full_tag, get_uid_from_tag(full_tag));
+       acct_tag = get_atag_from_tag(full_tag);
+       node = rb_first(&utd_entry->tag_ref_tree);
+       while (node) {
+               tr_entry = rb_entry(node, struct tag_ref, tn.node);
+               node = rb_next(node);
+               if (!acct_tag || tr_entry->tn.tag == full_tag)
+                       free_tag_ref_from_utd_entry(tr_entry, utd_entry);
+       }
+}
+
 static int read_proc_u64(char *page, char **start, off_t off,
                        int count, int *eof, void *data)
 {
@@ -843,8 +1076,8 @@ static struct tag_stat *create_if_tag_stat(struct iface_stat *iface_entry,
                                           tag_t tag)
 {
        struct tag_stat *new_tag_stat_entry = NULL;
-       IF_DEBUG("qtaguid: iface_stat: create_if_tag_stat(): ife=%p tag=0x%llx"
-                " (uid=%u)\n",
+       IF_DEBUG("qtaguid: iface_stat: %s(): ife=%p tag=0x%llx"
+                " (uid=%u)\n", __func__,
                 iface_entry, tag, get_uid_from_tag(tag));
        new_tag_stat_entry = kzalloc(sizeof(*new_tag_stat_entry), GFP_ATOMIC);
        if (!new_tag_stat_entry) {
@@ -894,9 +1127,9 @@ static void if_tag_stat_update(const char *ifname, uid_t uid,
                acct_tag = get_atag_from_tag(tag);
                uid_tag = get_utag_from_tag(tag);
        } else {
-               uid_tag = make_tag_from_uid(uid);
-               acct_tag = 0;
+               acct_tag = make_atag_from_value(0);
                tag = combine_atag_with_uid(acct_tag, uid);
+               uid_tag = make_tag_from_uid(uid);
        }
        MT_DEBUG("qtaguid: iface_stat: stat_update(): "
                 " looking for tag=0x%llx (uid=%u) in ife=%p\n",
@@ -1289,22 +1522,23 @@ static int qtaguid_ctrl_proc_read(char *page, char **num_items_returned,
        char *outp = page;
        int len;
        uid_t uid;
-       struct sock_tag *sock_tag_entry;
        struct rb_node *node;
+       struct sock_tag *sock_tag_entry;
        int item_index = 0;
+       int indent_level = 0;
+       long f_count;
 
        if (unlikely(module_passive)) {
                *eof = 1;
                return 0;
        }
 
-       /* TODO: support skipping num_items_returned on entry. */
-       CT_DEBUG("qtaguid: proc ctrl page=%p off=%ld char_count=%d *eof=%d\n",
-               page, items_to_skip, char_count, *eof);
-
        if (*eof)
                return 0;
 
+       CT_DEBUG("qtaguid: proc ctrl page=%p off=%ld char_count=%d *eof=%d\n",
+               page, items_to_skip, char_count, *eof);
+
        spin_lock_bh(&sock_tag_list_lock);
        for (node = rb_first(&sock_tag_tree);
             node;
@@ -1313,14 +1547,21 @@ static int qtaguid_ctrl_proc_read(char *page, char **num_items_returned,
                        continue;
                sock_tag_entry = rb_entry(node, struct sock_tag, sock_node);
                uid = get_uid_from_tag(sock_tag_entry->tag);
-               CT_DEBUG("qtaguid: proc_read(): sk=%p tag=0x%llx (uid=%u)\n",
+               CT_DEBUG("qtaguid: proc_read(): sk=%p tag=0x%llx (uid=%u) "
+                        "pid=%u\n",
                         sock_tag_entry->sk,
                         sock_tag_entry->tag,
-                        uid
+                        uid,
+                        sock_tag_entry->pid
                        );
+               f_count = atomic_long_read(
+                       &sock_tag_entry->socket->file->f_count);
                len = snprintf(outp, char_count,
-                              "sock=%p tag=0x%llx (uid=%u)\n",
-                              sock_tag_entry->sk, sock_tag_entry->tag, uid);
+                              "sock=%p tag=0x%llx (uid=%u) pid=%u "
+                              "f_count=%lu\n",
+                              sock_tag_entry->sk,
+                              sock_tag_entry->tag, uid,
+                              sock_tag_entry->pid, f_count);
                if (len >= char_count) {
                        spin_unlock_bh(&sock_tag_list_lock);
                        *outp = '\0';
@@ -1359,28 +1600,31 @@ static int qtaguid_ctrl_proc_read(char *page, char **num_items_returned,
                (*num_items_returned)++;
        }
 
-       *eof = 1;
-       return outp - page;
-}
+#ifdef CDEBUG
+       /* Count the following as part of the last item_index */
+       if (item_index > items_to_skip) {
+               CT_DEBUG("qtaguid: proc ctrl state debug {\n");
+               spin_lock_bh(&sock_tag_list_lock);
+               prdebug_sock_tag_tree(indent_level, &sock_tag_tree);
+               spin_unlock_bh(&sock_tag_list_lock);
 
-static bool can_manipulate_uids(void)
-{
-       /* root pwnd */
-       return unlikely(!current_fsuid()) || unlikely(!proc_ctrl_write_gid)
-               || in_egroup_p(proc_ctrl_write_gid);
-}
+               spin_lock_bh(&uid_tag_data_tree_lock);
+               prdebug_uid_tag_data_tree(indent_level, &uid_tag_data_tree);
+               prdebug_proc_qtu_data_tree(indent_level, &proc_qtu_data_tree);
+               spin_unlock_bh(&uid_tag_data_tree_lock);
 
-static bool can_impersonate_uid(uid_t uid)
-{
-       return uid == current_fsuid() || can_manipulate_uids();
-}
+               spin_lock_bh(&iface_stat_list_lock);
+               prdebug_iface_stat_list(indent_level, &iface_stat_list);
+               spin_unlock_bh(&iface_stat_list_lock);
 
-static bool can_read_other_uid_stats(uid_t uid)
-{
-       /* root pwnd */
-       return unlikely(!current_fsuid()) || uid == current_fsuid()
-               || unlikely(!proc_stats_readall_gid)
-               || in_egroup_p(proc_stats_readall_gid);
+               CT_DEBUG("qtaguid: proc ctrl state debug }\n");
+
+
+       }
+#endif
+
+       *eof = 1;
+       return outp - page;
 }
 
 /*
@@ -1401,6 +1645,8 @@ static int ctrl_cmd_delete(const char *input)
        struct rb_root st_to_free_tree = RB_ROOT;
        struct tag_stat *ts_entry;
        struct tag_counter_set *tcs_entry;
+       struct tag_ref *tr_entry;
+       struct uid_tag_data *utd_entry;
 
        argc = sscanf(input, "%c %llu %u", &cmd, &acct_tag, &uid);
        CT_DEBUG("qtaguid: ctrl_delete(%s): argc=%d cmd=%c "
@@ -1419,12 +1665,17 @@ static int ctrl_cmd_delete(const char *input)
                uid = current_fsuid();
        } else if (!can_impersonate_uid(uid)) {
                pr_info("qtaguid: ctrl_delete(%s): "
-                       "insufficient priv from pid=%u uid=%u\n",
-                       input, current->pid, current_fsuid());
+                       "insufficient priv from pid=%u tgid=%u uid=%u\n",
+                       input, current->pid, current->tgid, current_fsuid());
                res = -EPERM;
                goto err;
        }
 
+       tag = combine_atag_with_uid(acct_tag, uid);
+       CT_DEBUG("qtaguid: ctrl_delete(): "
+                "looking for tag=0x%llx (uid=%u)\n",
+                tag, uid);
+
        /* Delete socket tags */
        spin_lock_bh(&sock_tag_list_lock);
        node = rb_first(&sock_tag_tree);
@@ -1435,32 +1686,25 @@ static int ctrl_cmd_delete(const char *input)
                if (entry_uid != uid)
                        continue;
 
+               CT_DEBUG("qtaguid: ctrl_delete(): st tag=0x%llx (uid=%u)\n",
+                        st_entry->tag, entry_uid);
+
                if (!acct_tag || st_entry->tag == tag) {
                        rb_erase(&st_entry->sock_node, &sock_tag_tree);
                        /* Can't sockfd_put() within spinlock, do it later. */
                        sock_tag_tree_insert(st_entry, &st_to_free_tree);
+                       tr_entry = lookup_tag_ref(st_entry->tag, NULL);
+                       BUG_ON(tr_entry->num_sock_tags <= 0);
+                       tr_entry->num_sock_tags--;
                }
        }
        spin_unlock_bh(&sock_tag_list_lock);
 
-       node = rb_first(&st_to_free_tree);
-       while (node) {
-               st_entry = rb_entry(node, struct sock_tag, sock_node);
-               node = rb_next(node);
-               CT_DEBUG("qtaguid: ctrl_delete(): "
-                        "erase st: sk=%p tag=0x%llx (uid=%u)\n",
-                        st_entry->sk,
-                        st_entry->tag,
-                        entry_uid);
-               rb_erase(&st_entry->sock_node, &st_to_free_tree);
-               sockfd_put(st_entry->socket);
-               kfree(st_entry);
-       }
-
-       tag = combine_atag_with_uid(acct_tag, uid);
+       sock_tag_tree_erase(&st_to_free_tree);
 
        /* Delete tag counter-sets */
        spin_lock_bh(&tag_counter_set_list_lock);
+       /* Counter sets are only on the uid tag, not full tag */
        tcs_entry = tag_counter_set_tree_search(&tag_counter_set_tree, tag);
        if (tcs_entry) {
                CT_DEBUG("qtaguid: ctrl_delete(): "
@@ -1485,6 +1729,11 @@ static int ctrl_cmd_delete(const char *input)
                        ts_entry = rb_entry(node, struct tag_stat, tn.node);
                        entry_uid = get_uid_from_tag(ts_entry->tn.tag);
                        node = rb_next(node);
+
+                       CT_DEBUG("qtaguid: ctrl_delete(): "
+                                "ts tag=0x%llx (uid=%u)\n",
+                                ts_entry->tn.tag, entry_uid);
+
                        if (entry_uid != uid)
                                continue;
                        if (!acct_tag || ts_entry->tn.tag == tag) {
@@ -1501,6 +1750,30 @@ static int ctrl_cmd_delete(const char *input)
                spin_unlock_bh(&iface_entry->tag_stat_list_lock);
        }
        spin_unlock_bh(&iface_stat_list_lock);
+
+       /* Cleanup the uid_tag_data */
+       spin_lock_bh(&uid_tag_data_tree_lock);
+       node = rb_first(&uid_tag_data_tree);
+       while (node) {
+               utd_entry = rb_entry(node, struct uid_tag_data, node);
+               entry_uid = utd_entry->uid;
+               node = rb_next(node);
+
+               CT_DEBUG("qtaguid: ctrl_delete(): "
+                        "utd uid=%u\n",
+                        entry_uid);
+
+               if (entry_uid != uid)
+                       continue;
+               /*
+                * Go over the tag_refs, and those that don't have
+                * sock_tags using them are freed.
+                */
+               put_tag_ref_tree(tag, utd_entry);
+               put_utd_entry(utd_entry);
+       }
+       spin_unlock_bh(&uid_tag_data_tree_lock);
+
        atomic64_inc(&qtu_events.delete_cmds);
        res = 0;
 
@@ -1533,8 +1806,8 @@ static int ctrl_cmd_counter_set(const char *input)
        }
        if (!can_manipulate_uids()) {
                pr_info("qtaguid: ctrl_counterset(%s): "
-                       "insufficient priv from pid=%u uid=%u\n",
-                       input, current->pid, current_fsuid());
+                       "insufficient priv from pid=%u tgid=%u uid=%u\n",
+                       input, current->pid, current->tgid, current_fsuid());
                res = -EPERM;
                goto err;
        }
@@ -1572,11 +1845,14 @@ static int ctrl_cmd_tag(const char *input)
        char cmd;
        int sock_fd = 0;
        uid_t uid = 0;
-       tag_t acct_tag = 0;
+       tag_t acct_tag = make_atag_from_value(0);
+       tag_t full_tag;
        struct socket *el_socket;
-       int refcnt = -1;
        int res, argc;
        struct sock_tag *sock_tag_entry;
+       struct tag_ref *tag_ref_entry;
+       struct uid_tag_data *uid_tag_data_entry;
+       struct proc_qtu_data *pqd_entry;
 
        /* Unassigned args will get defaulted later. */
        argc = sscanf(input, "%c %d %llu %u", &cmd, &sock_fd, &acct_tag, &uid);
@@ -1593,44 +1869,66 @@ static int ctrl_cmd_tag(const char *input)
                        " sock_fd=%d err=%d\n", input, sock_fd, res);
                goto err;
        }
-       refcnt = atomic_read(&el_socket->file->f_count);
-       CT_DEBUG("qtaguid: ctrl_tag(%s): socket->...->f_count=%d\n",
-                input, refcnt);
+       CT_DEBUG("qtaguid: ctrl_tag(%s): socket->...->f_count=%ld ->sk=%p\n",
+                input, atomic_long_read(&el_socket->file->f_count),
+                el_socket->sk);
        if (argc < 3) {
-               acct_tag = 0;
+               acct_tag = make_atag_from_value(0);
        } else if (!valid_atag(acct_tag)) {
                pr_info("qtaguid: ctrl_tag(%s): invalid tag\n", input);
                res = -EINVAL;
                goto err_put;
        }
        CT_DEBUG("qtaguid: ctrl_tag(%s): "
-                "uid=%u euid=%u fsuid=%u "
+                "pid=%u tgid=%u uid=%u euid=%u fsuid=%u "
                 "in_group=%d in_egroup=%d\n",
-                input, current_uid(), current_euid(), current_fsuid(),
-                in_group_p(proc_stats_readall_gid),
-                in_egroup_p(proc_stats_readall_gid));
+                input, current->pid, current->tgid, current_uid(),
+                current_euid(), current_fsuid(),
+                in_group_p(proc_ctrl_write_gid),
+                in_egroup_p(proc_ctrl_write_gid));
        if (argc < 4) {
                uid = current_fsuid();
        } else if (!can_impersonate_uid(uid)) {
                pr_info("qtaguid: ctrl_tag(%s): "
-                       "insufficient priv from pid=%u uid=%u\n",
-                       input, current->pid, current_fsuid());
+                       "insufficient priv from pid=%u tgid=%u uid=%u\n",
+                       input, current->pid, current->tgid, current_fsuid());
                res = -EPERM;
                goto err_put;
        }
+       full_tag = combine_atag_with_uid(acct_tag, uid);
 
        spin_lock_bh(&sock_tag_list_lock);
        sock_tag_entry = get_sock_stat_nl(el_socket->sk);
+       tag_ref_entry = get_tag_ref(full_tag, &uid_tag_data_entry);
+       if (IS_ERR(tag_ref_entry)) {
+               res = PTR_ERR(tag_ref_entry);
+               spin_unlock_bh(&sock_tag_list_lock);
+               goto err_put;
+       }
+       tag_ref_entry->num_sock_tags++;
        if (sock_tag_entry) {
+               struct tag_ref *prev_tag_ref_entry;
+
+               CT_DEBUG("qtaguid: ctrl_tag(%s): retag for sk=%p "
+                        "st@%p ...->f_count=%ld\n",
+                        input, el_socket->sk, sock_tag_entry,
+                        atomic_long_read(&el_socket->file->f_count));
                /*
                 * This is a re-tagging, so release the sock_fd that was
                 * locked at the time of the 1st tagging.
+                * There is still the ref from this call's sockfd_lookup() so
+                * it can be done within the spinlock.
                 */
                sockfd_put(sock_tag_entry->socket);
-               refcnt--;
-               sock_tag_entry->tag = combine_atag_with_uid(acct_tag,
-                                                           uid);
+               prev_tag_ref_entry = lookup_tag_ref(sock_tag_entry->tag,
+                                                   &uid_tag_data_entry);
+               BUG_ON(IS_ERR_OR_NULL(prev_tag_ref_entry));
+               BUG_ON(prev_tag_ref_entry->num_sock_tags <= 0);
+               prev_tag_ref_entry->num_sock_tags--;
+               sock_tag_entry->tag = full_tag;
        } else {
+               CT_DEBUG("qtaguid: ctrl_tag(%s): newtag for sk=%p\n",
+                        input, el_socket->sk);
                sock_tag_entry = kzalloc(sizeof(*sock_tag_entry),
                                         GFP_ATOMIC);
                if (!sock_tag_entry) {
@@ -1639,29 +1937,47 @@ static int ctrl_cmd_tag(const char *input)
                               input);
                        spin_unlock_bh(&sock_tag_list_lock);
                        res = -ENOMEM;
-                       goto err_put;
+                       goto err_tag_unref_put;
                }
                sock_tag_entry->sk = el_socket->sk;
                sock_tag_entry->socket = el_socket;
+               sock_tag_entry->pid = current->tgid;
                sock_tag_entry->tag = combine_atag_with_uid(acct_tag,
                                                            uid);
+               spin_lock_bh(&uid_tag_data_tree_lock);
+               pqd_entry = proc_qtu_data_tree_search(
+                       &proc_qtu_data_tree, current->tgid);
+               /* TODO: remove if() test, do BUG_ON() */
+               WARN_ON(IS_ERR_OR_NULL(pqd_entry));
+               if (!IS_ERR_OR_NULL(pqd_entry)) {
+                       list_add(&sock_tag_entry->list,
+                                &pqd_entry->sock_tag_list);
+               }
+               spin_unlock_bh(&uid_tag_data_tree_lock);
+
                sock_tag_tree_insert(sock_tag_entry, &sock_tag_tree);
                atomic64_inc(&qtu_events.sockets_tagged);
        }
        spin_unlock_bh(&sock_tag_list_lock);
        /* We keep the ref to the socket (file) until it is untagged */
-       CT_DEBUG("qtaguid: ctrl_tag(%s): done. socket->...->f_count=%d\n",
-                input,
-                el_socket ? atomic_read(&el_socket->file->f_count) : -1);
+       CT_DEBUG("qtaguid: ctrl_tag(%s): done st@%p ...->f_count=%ld\n",
+                input, sock_tag_entry,
+                atomic_long_read(&el_socket->file->f_count));
        return 0;
 
+err_tag_unref_put:
+       BUG_ON(tag_ref_entry->num_sock_tags <= 0);
+       tag_ref_entry->num_sock_tags--;
+       free_tag_ref_from_utd_entry(tag_ref_entry, uid_tag_data_entry);
 err_put:
+       CT_DEBUG("qtaguid: ctrl_tag(%s): done. ...->f_count=%ld\n",
+                input, atomic_long_read(&el_socket->file->f_count) - 1);
        /* Release the sock_fd that was grabbed by sockfd_lookup(). */
        sockfd_put(el_socket);
-       refcnt--;
+       return res;
+
 err:
-       CT_DEBUG("qtaguid: ctrl_tag(%s): done. socket->...->f_count=%d\n",
-                input, refcnt);
+       CT_DEBUG("qtaguid: ctrl_tag(%s): done.\n", input);
        return res;
 }
 
@@ -1670,9 +1986,11 @@ static int ctrl_cmd_untag(const char *input)
        char cmd;
        int sock_fd = 0;
        struct socket *el_socket;
-       int refcnt = -1;
        int res, argc;
        struct sock_tag *sock_tag_entry;
+       struct tag_ref *tag_ref_entry;
+       struct uid_tag_data *utd_entry;
+       struct proc_qtu_data *pqd_entry;
 
        argc = sscanf(input, "%c %d", &cmd, &sock_fd);
        CT_DEBUG("qtaguid: ctrl_untag(%s): argc=%d cmd=%c sock_fd=%d\n",
@@ -1687,9 +2005,9 @@ static int ctrl_cmd_untag(const char *input)
                        " sock_fd=%d err=%d\n", input, sock_fd, res);
                goto err;
        }
-       refcnt = atomic_read(&el_socket->file->f_count);
-       CT_DEBUG("qtaguid: ctrl_untag(%s): socket->...->f_count=%d\n",
-                input, refcnt);
+       CT_DEBUG("qtaguid: ctrl_untag(%s): socket->...->f_count=%ld ->sk=%p\n",
+                input, atomic_long_read(&el_socket->file->f_count),
+                el_socket->sk);
        spin_lock_bh(&sock_tag_list_lock);
        sock_tag_entry = get_sock_stat_nl(el_socket->sk);
        if (!sock_tag_entry) {
@@ -1703,28 +2021,47 @@ static int ctrl_cmd_untag(const char *input)
         */
        rb_erase(&sock_tag_entry->sock_node, &sock_tag_tree);
 
+       tag_ref_entry = lookup_tag_ref(sock_tag_entry->tag, &utd_entry);
+       BUG_ON(!tag_ref_entry);
+       BUG_ON(tag_ref_entry->num_sock_tags <= 0);
+       spin_lock_bh(&uid_tag_data_tree_lock);
+       pqd_entry = proc_qtu_data_tree_search(
+               &proc_qtu_data_tree, current->tgid);
+       /* TODO: remove if() test, do BUG_ON() */
+       WARN_ON(IS_ERR_OR_NULL(pqd_entry));
+       if (!IS_ERR_OR_NULL(pqd_entry))
+               list_del(&sock_tag_entry->list);
+       spin_unlock_bh(&uid_tag_data_tree_lock);
+       /*
+        * We don't free tag_ref from the utd_entry here,
+        * only during a cmd_delete().
+        */
+       tag_ref_entry->num_sock_tags--;
+       spin_unlock_bh(&sock_tag_list_lock);
        /*
         * Release the sock_fd that was grabbed at tag time,
         * and once more for the sockfd_lookup() here.
         */
        sockfd_put(sock_tag_entry->socket);
-       spin_unlock_bh(&sock_tag_list_lock);
+       CT_DEBUG("qtaguid: ctrl_untag(%s): done. st@%p ...->f_count=%ld\n",
+                input, sock_tag_entry,
+                atomic_long_read(&el_socket->file->f_count) - 1);
        sockfd_put(el_socket);
-       refcnt -= 2;
+
        kfree(sock_tag_entry);
        atomic64_inc(&qtu_events.sockets_untagged);
-       CT_DEBUG("qtaguid: ctrl_untag(%s): done. socket->...->f_count=%d\n",
-                input, refcnt);
 
        return 0;
 
 err_put:
+       CT_DEBUG("qtaguid: ctrl_untag(%s): done. socket->...->f_count=%ld\n",
+                input, atomic_long_read(&el_socket->file->f_count) - 1);
        /* Release the sock_fd that was grabbed by sockfd_lookup(). */
        sockfd_put(el_socket);
-       refcnt--;
+       return res;
+
 err:
-       CT_DEBUG("qtaguid: ctrl_untag(%s): done. socket->...->f_count=%d\n",
-                input, refcnt);
+       CT_DEBUG("qtaguid: ctrl_untag(%s): done.\n", input);
        return res;
 }
 
@@ -1816,11 +2153,11 @@ static int pp_stats_line(struct proc_print_info *ppi, int cnt_set)
 
                if (!can_read_other_uid_stats(stat_uid)) {
                        CT_DEBUG("qtaguid: stats line: "
-                                "%s 0x%llx %u: "
-                                "insufficient priv from pid=%u uid=%u\n",
+                                "%s 0x%llx %u: insufficient priv "
+                                "from pid=%u tgid=%u uid=%u\n",
                                 ppi->iface_entry->ifname,
                                 get_atag_from_tag(tag), stat_uid,
-                                current->pid, current_fsuid());
+                                current->pid, current->tgid, current_fsuid());
                        return 0;
                }
                if (ppi->item_index++ < ppi->items_to_skip)
@@ -1903,7 +2240,7 @@ static int qtaguid_stats_proc_read(char *page, char **num_items_returned,
        if (unlikely(module_passive)) {
                len = pp_stats_line(&ppi, 0);
                /* The header should always be shorter than the buffer. */
-               WARN_ON(len >= ppi.char_count);
+               BUG_ON(len >= ppi.char_count);
                (*num_items_returned)++;
                *eof = 1;
                return len;
@@ -1952,6 +2289,163 @@ static int qtaguid_stats_proc_read(char *page, char **num_items_returned,
        return ppi.outp - page;
 }
 
+/*------------------------------------------*/
+static int qtudev_open(struct inode *inode, struct file *file)
+{
+       struct uid_tag_data *utd_entry;
+       struct proc_qtu_data  *pqd_entry;
+       struct proc_qtu_data  *new_pqd_entry = 0;
+       int res;
+       bool utd_entry_found;
+
+       if (unlikely(qtu_proc_handling_passive))
+               return 0;
+
+       DR_DEBUG("qtaguid: qtudev_open(): pid=%u tgid=%u uid=%u\n",
+                current->pid, current->tgid, current_fsuid());
+
+       spin_lock_bh(&uid_tag_data_tree_lock);
+
+       /* Look for existing uid data, or alloc one. */
+       utd_entry = get_uid_data(current_fsuid(), &utd_entry_found);
+       if (IS_ERR_OR_NULL(utd_entry)) {
+               res = PTR_ERR(utd_entry);
+               goto err;
+       }
+
+       /* Look for existing PID based proc_data */
+       pqd_entry = proc_qtu_data_tree_search(&proc_qtu_data_tree,
+                                             current->tgid);
+       if (pqd_entry) {
+               pr_err("qtaguid: qtudev_open(): %u/%u %u "
+                      "%s already opened\n",
+                      current->pid, current->tgid, current_fsuid(),
+                      QTU_DEV_NAME);
+               res = -EBUSY;
+               goto err_unlock_free_utd;
+       }
+
+       new_pqd_entry = kzalloc(sizeof(*new_pqd_entry), GFP_ATOMIC);
+       if (!new_pqd_entry) {
+               pr_err("qtaguid: qtudev_open(): %u/%u %u: "
+                      "proc data alloc failed\n",
+                      current->pid, current->tgid, current_fsuid());
+               res = -ENOMEM;
+               goto err_unlock_free_utd;
+       }
+       new_pqd_entry->pid = current->tgid;
+       INIT_LIST_HEAD(&new_pqd_entry->sock_tag_list);
+       new_pqd_entry->parent_tag_data = utd_entry;
+
+       proc_qtu_data_tree_insert(new_pqd_entry,
+                                 &proc_qtu_data_tree);
+
+       spin_unlock_bh(&uid_tag_data_tree_lock);
+       DR_DEBUG("qtaguid: tracking data for uid=%u\n", current_fsuid());
+       file->private_data = new_pqd_entry;
+       return 0;
+
+err_unlock_free_utd:
+       if (!utd_entry_found) {
+               rb_erase(&utd_entry->node, &uid_tag_data_tree);
+               kfree(utd_entry);
+       }
+       spin_unlock_bh(&uid_tag_data_tree_lock);
+err:
+       return res;
+}
+
+static int qtudev_release(struct inode *inode, struct file *file)
+{
+       struct proc_qtu_data  *pqd_entry = file->private_data;
+       struct uid_tag_data  *utd_entry = pqd_entry->parent_tag_data;
+       struct sock_tag *st_entry;
+       struct rb_root st_to_free_tree = RB_ROOT;
+       struct list_head *entry, *next;
+       struct tag_ref *tr;
+
+       if (unlikely(qtu_proc_handling_passive))
+               return 0;
+
+       /*
+        * Do not trust the current->pid, it might just be a kworker cleaning
+        * up after a dead proc.
+        */
+       DR_DEBUG("qtaguid: qtudev_release(): "
+                "pid=%u tgid=%u uid=%u "
+                "pqd_entry=%p->pid=%u utd_entry=%p->active_tags=%d\n",
+                current->pid, current->tgid, pqd_entry->parent_tag_data->uid,
+                pqd_entry, pqd_entry->pid, utd_entry,
+                utd_entry->num_active_tags);
+
+       spin_lock_bh(&sock_tag_list_lock);
+       spin_lock_bh(&uid_tag_data_tree_lock);
+
+       /*
+        * If this proc didn't actually tag anything for itself, or has already
+        * willingly cleaned up itself ...
+        */
+       put_utd_entry(utd_entry);
+
+       list_for_each_safe(entry, next, &pqd_entry->sock_tag_list) {
+               st_entry = list_entry(entry, struct sock_tag, list);
+               DR_DEBUG("qtaguid: %s(): "
+                        "erase sock_tag=%p->sk=%p pid=%u tgid=%u uid=%u\n",
+                        __func__,
+                        st_entry, st_entry->sk,
+                        current->pid, current->tgid,
+                        pqd_entry->parent_tag_data->uid);
+
+               utd_entry = uid_tag_data_tree_search(
+                       &uid_tag_data_tree,
+                       get_uid_from_tag(st_entry->tag));
+               BUG_ON(IS_ERR_OR_NULL(utd_entry));
+               DR_DEBUG("qtaguid: %s(): "
+                        "looking for tag=0x%llx in utd_entry=%p\n", __func__,
+                        st_entry->tag, utd_entry);
+               tr = tag_ref_tree_search(&utd_entry->tag_ref_tree,
+                                        st_entry->tag);
+               BUG_ON(!tr);
+               BUG_ON(tr->num_sock_tags <= 0);
+               tr->num_sock_tags--;
+               free_tag_ref_from_utd_entry(tr, utd_entry);
+
+               rb_erase(&st_entry->sock_node, &sock_tag_tree);
+               list_del(&st_entry->list);
+               /* Can't sockfd_put() within spinlock, do it later. */
+               sock_tag_tree_insert(st_entry, &st_to_free_tree);
+
+               /* Do not put_utd_entry(utd_entry) someone elses utd_entry */
+       }
+
+       rb_erase(&pqd_entry->node, &proc_qtu_data_tree);
+       kfree(pqd_entry);
+       file->private_data = NULL;
+
+       spin_unlock_bh(&uid_tag_data_tree_lock);
+       spin_unlock_bh(&sock_tag_list_lock);
+
+
+       sock_tag_tree_erase(&st_to_free_tree);
+
+
+       return 0;
+}
+
+/*------------------------------------------*/
+static const struct file_operations qtudev_fops = {
+       .owner = THIS_MODULE,
+       .open = qtudev_open,
+       .release = qtudev_release,
+};
+
+static struct miscdevice qtu_device = {
+       .minor = MISC_DYNAMIC_MINOR,
+       .name = QTU_DEV_NAME,
+       .fops = &qtudev_fops,
+       /* How sad it doesn't allow for defaults: .mode = S_IRUGO | S_IWUSR */
+};
+
 /*------------------------------------------*/
 static int __init qtaguid_proc_register(struct proc_dir_entry **res_procdir)
 {
@@ -2014,7 +2508,8 @@ static int __init qtaguid_mt_init(void)
 {
        if (qtaguid_proc_register(&xt_qtaguid_procdir)
            || iface_stat_init(xt_qtaguid_procdir)
-           || xt_register_match(&qtaguid_mt_reg))
+           || xt_register_match(&qtaguid_mt_reg)
+           || misc_register(&qtu_device))
                return -1;
        return 0;
 }
diff --git a/net/netfilter/xt_qtaguid_internal.h b/net/netfilter/xt_qtaguid_internal.h
new file mode 100644 (file)
index 0000000..752e196
--- /dev/null
@@ -0,0 +1,305 @@
+/*
+ * Kernel iptables module to track stats for packets based on user tags.
+ *
+ * (C) 2011 Google, Inc
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#ifndef __XT_QTAGUID_INTERNAL_H__
+#define __XT_QTAGUID_INTERNAL_H__
+
+#include <linux/types.h>
+#include <linux/rbtree.h>
+#include <linux/spinlock_types.h>
+#include <linux/workqueue.h>
+
+/* Define/comment out these *DEBUG to compile in/out the pr_debug calls. */
+/* Iface handling */
+#define IDEBUG
+/* Iptable Matching. Per packet. */
+#define MDEBUG
+/* Red-black tree handling. Per packet. */
+#define RDEBUG
+/* procfs ctrl/stats handling */
+#define CDEBUG
+/* dev and resource tracking */
+#define DDEBUG
+
+/* E.g (IDEBUG_MASK | CDEBUG_MASK | DDEBUG_MASK) */
+#define DEFAULT_DEBUG_MASK 0
+
+
+#define IDEBUG_MASK (1<<0)
+#define MDEBUG_MASK (1<<1)
+#define RDEBUG_MASK (1<<2)
+#define CDEBUG_MASK (1<<3)
+#define DDEBUG_MASK (1<<4)
+
+#define MSK_DEBUG(mask, ...) do {                       \
+               if (unlikely(debug_mask & (mask)))      \
+                       pr_debug(__VA_ARGS__);          \
+       } while (0)
+#ifdef IDEBUG
+#define IF_DEBUG(...) MSK_DEBUG(IDEBUG_MASK, __VA_ARGS__)
+#else
+#define IF_DEBUG(...) no_printk(__VA_ARGS__)
+#endif
+#ifdef MDEBUG
+#define MT_DEBUG(...) MSK_DEBUG(MDEBUG_MASK, __VA_ARGS__)
+#else
+#define MT_DEBUG(...) no_printk(__VA_ARGS__)
+#endif
+#ifdef RDEBUG
+#define RB_DEBUG(...) MSK_DEBUG(RDEBUG_MASK, __VA_ARGS__)
+#else
+#define RB_DEBUG(...) no_printk(__VA_ARGS__)
+#endif
+#ifdef CDEBUG
+#define CT_DEBUG(...) MSK_DEBUG(CDEBUG_MASK, __VA_ARGS__)
+#else
+#define CT_DEBUG(...) no_printk(__VA_ARGS__)
+#endif
+#ifdef DDEBUG
+#define DR_DEBUG(...) MSK_DEBUG(DDEBUG_MASK, __VA_ARGS__)
+#else
+#define DR_DEBUG(...) no_printk(__VA_ARGS__)
+#endif
+
+extern uint debug_mask;
+
+/*---------------------------------------------------------------------------*/
+/*
+ * Tags:
+ *
+ * They represent what the data usage counters will be tracked against.
+ * By default a tag is just based on the UID.
+ * The UID is used as the base for policing, and can not be ignored.
+ * So a tag will always at least represent a UID (uid_tag).
+ *
+ * A tag can be augmented with an "accounting tag" which is associated
+ * with a UID.
+ * User space can set the acct_tag portion of the tag which is then used
+ * with sockets: all data belonging to that socket will be counted against the
+ * tag. The policing is then based on the tag's uid_tag portion,
+ * and stats are collected for the acct_tag portion separately.
+ *
+ * There could be
+ * a:  {acct_tag=1, uid_tag=10003}
+ * b:  {acct_tag=2, uid_tag=10003}
+ * c:  {acct_tag=3, uid_tag=10003}
+ * d:  {acct_tag=0, uid_tag=10003}
+ * a, b, and c represent tags associated with specific sockets.
+ * d is for the totals for that uid, including all untagged traffic.
+ * Typically d is used with policing/quota rules.
+ *
+ * We want tag_t big enough to distinguish uid_t and acct_tag.
+ * It might become a struct if needed.
+ * Nothing should be using it as an int.
+ */
+typedef uint64_t tag_t;  /* Only used via accessors */
+
+#define TAG_UID_MASK 0xFFFFFFFFULL
+#define TAG_ACCT_MASK (~0xFFFFFFFFULL)
+
+static inline int tag_compare(tag_t t1, tag_t t2)
+{
+       return t1 < t2 ? -1 : t1 == t2 ? 0 : 1;
+}
+
+static inline tag_t combine_atag_with_uid(tag_t acct_tag, uid_t uid)
+{
+       return acct_tag | uid;
+}
+static inline tag_t make_tag_from_uid(uid_t uid)
+{
+       return uid;
+}
+static inline uid_t get_uid_from_tag(tag_t tag)
+{
+       return tag & TAG_UID_MASK;
+}
+static inline tag_t get_utag_from_tag(tag_t tag)
+{
+       return tag & TAG_UID_MASK;
+}
+static inline tag_t get_atag_from_tag(tag_t tag)
+{
+       return tag & TAG_ACCT_MASK;
+}
+
+static inline bool valid_atag(tag_t tag)
+{
+       return !(tag & TAG_UID_MASK);
+}
+static inline tag_t make_atag_from_value(uint32_t value)
+{
+       return (uint64_t)value << 32;
+}
+/*---------------------------------------------------------------------------*/
+
+/*
+ * Maximum number of socket tags that a UID is allowed to have active.
+ * Multiple processes belonging to the same UID contribute towards this limit.
+ * Special UIDs that can impersonate a UID also contribute (e.g. download
+ * manager, ...)
+ */
+#define DEFAULT_MAX_SOCK_TAGS 1024
+
+/*
+ * For now we only track 2 sets of counters.
+ * The default set is 0.
+ * Userspace can activate another set for a given uid being tracked.
+ */
+#define IFS_MAX_COUNTER_SETS 2
+
+enum ifs_tx_rx {
+       IFS_TX,
+       IFS_RX,
+       IFS_MAX_DIRECTIONS
+};
+
+/* For now, TCP, UDP, the rest */
+enum ifs_proto {
+       IFS_TCP,
+       IFS_UDP,
+       IFS_PROTO_OTHER,
+       IFS_MAX_PROTOS
+};
+
+struct byte_packet_counters {
+       uint64_t bytes;
+       uint64_t packets;
+};
+
+struct data_counters {
+       struct byte_packet_counters bpc[IFS_MAX_COUNTER_SETS][IFS_MAX_DIRECTIONS][IFS_MAX_PROTOS];
+};
+
+/* Generic X based nodes used as a base for rb_tree ops */
+struct tag_node {
+       struct rb_node node;
+       tag_t tag;
+};
+
+struct tag_stat {
+       struct tag_node tn;
+       struct data_counters counters;
+       /*
+        * If this tag is acct_tag based, we need to count against the
+        * matching parent uid_tag.
+        */
+       struct data_counters *parent_counters;
+};
+
+struct iface_stat {
+       struct list_head list;  /* in iface_stat_list */
+       char *ifname;
+       uint64_t rx_bytes;
+       uint64_t rx_packets;
+       uint64_t tx_bytes;
+       uint64_t tx_packets;
+       bool active;
+       struct proc_dir_entry *proc_ptr;
+
+       struct rb_root tag_stat_tree;
+       spinlock_t tag_stat_list_lock;
+};
+
+/* This is needed to create proc_dir_entries from atomic context. */
+struct iface_stat_work {
+       struct work_struct iface_work;
+       struct iface_stat *iface_entry;
+};
+
+/*
+ * Track tag that this socket is transferring data for, and not necessarily
+ * the uid that owns the socket.
+ * This is the tag against which tag_stat.counters will be billed.
+ * These structs need to be looked up by sock and pid.
+ */
+struct sock_tag {
+       struct rb_node sock_node;
+       struct sock *sk;  /* Only used as a number, never dereferenced */
+       /* The socket is needed for sockfd_put() */
+       struct socket *socket;
+       /* Used to associate with a given pid */
+       struct list_head list;   /* in proc_qtu_data.sock_tag_list */
+       pid_t pid;
+
+       tag_t tag;
+};
+
+struct qtaguid_event_counts {
+       /* Various successful events */
+       atomic64_t sockets_tagged;
+       atomic64_t sockets_untagged;
+       atomic64_t counter_set_changes;
+       atomic64_t delete_cmds;
+       atomic64_t iface_events;  /* Number of NETDEV_* events handled */
+       /*
+        * match_found_sk_*: numbers related to the netfilter matching
+        * function finding a sock for the sk_buff.
+        */
+       atomic64_t match_found_sk;   /* An sk was already in the sk_buff. */
+       /* The connection tracker had the sk. */
+       atomic64_t match_found_sk_in_ct;
+       /*
+        * No sk could be found. No apparent owner. Could happen with
+        * unsolicited traffic.
+        */
+       atomic64_t match_found_sk_none;
+};
+
+/* Track the set active_set for the given tag. */
+struct tag_counter_set {
+       struct tag_node tn;
+       int active_set;
+};
+
+/*----------------------------------------------*/
+/*
+ * The qtu uid data is used to track resources that are created directly or
+ * indirectly by processes (uid tracked).
+ * It is shared by the processes with the same uid.
+ * Some of the resource will be counted to prevent further rogue allocations,
+ * some will need freeing once the owner process (uid) exits.
+ */
+struct uid_tag_data {
+       struct rb_node node;
+       uid_t uid;
+
+       /*
+        * For the uid, how many accounting tags have been set.
+        */
+       int num_active_tags;
+       struct rb_root tag_ref_tree;
+       /* No tag_node_tree_lock; use uid_tag_data_tree_lock */
+};
+
+struct tag_ref {
+       struct tag_node tn;
+
+       /*
+        * This tracks the number of active sockets that have a tag on them
+        * which matches this tag_ref.tn.tag.
+        * A tag ref can live on after the sockets are untagged.
+        * A tag ref can only be removed during a tag delete command.
+        */
+       int num_sock_tags;
+};
+
+struct proc_qtu_data {
+       struct rb_node node;
+       pid_t pid;
+
+       struct uid_tag_data *parent_tag_data;
+
+       /* Tracks the sock_tags that need freeing upon this proc's death */
+       struct list_head sock_tag_list;
+       /* No spinlock_t sock_tag_list_lock; use the global one. */
+};
+
+/*----------------------------------------------*/
+#endif  /* ifndef __XT_QTAGUID_INTERNAL_H__ */
diff --git a/net/netfilter/xt_qtaguid_print.c b/net/netfilter/xt_qtaguid_print.c
new file mode 100644 (file)
index 0000000..3d05447
--- /dev/null
@@ -0,0 +1,397 @@
+/*
+ * Pretty printing Support for iptables xt_qtaguid module.
+ *
+ * (C) 2011 Google, Inc
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+/*
+ * There are run-time debug flags enabled via the debug_mask module param, or
+ * via the DEFAULT_DEBUG_MASK. See xt_qtaguid_internal.h.
+ */
+#define DEBUG
+
+#include <linux/fs.h>
+#include <linux/gfp.h>
+#include <linux/net.h>
+#include <linux/rbtree.h>
+#include <linux/slab.h>
+#include <linux/spinlock_types.h>
+
+
+#include "xt_qtaguid_internal.h"
+#include "xt_qtaguid_print.h"
+
+char *pp_tag_t(tag_t *tag)
+{
+       if (!tag)
+               return kasprintf(GFP_ATOMIC, "tag_t@null{}");
+       return kasprintf(GFP_ATOMIC,
+                        "tag_t@%p{tag=0x%llx, uid=%u}",
+                        tag, *tag, get_uid_from_tag(*tag));
+}
+
+char *pp_data_counters(struct data_counters *dc, bool showValues)
+{
+       if (!dc)
+               return kasprintf(GFP_ATOMIC, "data_counters@null{}");
+       if (showValues)
+               return kasprintf(
+                       GFP_ATOMIC, "data_counters@%p{"
+                       "set0{"
+                       "rx{"
+                       "tcp{b=%llu, p=%llu}, "
+                       "udp{b=%llu, p=%llu},"
+                       "other{b=%llu, p=%llu}}, "
+                       "tx{"
+                       "tcp{b=%llu, p=%llu}, "
+                       "udp{b=%llu, p=%llu},"
+                       "other{b=%llu, p=%llu}}}, "
+                       "set1{"
+                       "rx{"
+                       "tcp{b=%llu, p=%llu}, "
+                       "udp{b=%llu, p=%llu},"
+                       "other{b=%llu, p=%llu}}, "
+                       "tx{"
+                       "tcp{b=%llu, p=%llu}, "
+                       "udp{b=%llu, p=%llu},"
+                       "other{b=%llu, p=%llu}}}}",
+                       dc,
+                       dc->bpc[0][IFS_RX][IFS_TCP].bytes,
+                       dc->bpc[0][IFS_RX][IFS_TCP].packets,
+                       dc->bpc[0][IFS_RX][IFS_UDP].bytes,
+                       dc->bpc[0][IFS_RX][IFS_UDP].packets,
+                       dc->bpc[0][IFS_RX][IFS_PROTO_OTHER].bytes,
+                       dc->bpc[0][IFS_RX][IFS_PROTO_OTHER].packets,
+                       dc->bpc[0][IFS_TX][IFS_TCP].bytes,
+                       dc->bpc[0][IFS_TX][IFS_TCP].packets,
+                       dc->bpc[0][IFS_TX][IFS_UDP].bytes,
+                       dc->bpc[0][IFS_TX][IFS_UDP].packets,
+                       dc->bpc[0][IFS_TX][IFS_PROTO_OTHER].bytes,
+                       dc->bpc[0][IFS_TX][IFS_PROTO_OTHER].packets,
+                       dc->bpc[1][IFS_RX][IFS_TCP].bytes,
+                       dc->bpc[1][IFS_RX][IFS_TCP].packets,
+                       dc->bpc[1][IFS_RX][IFS_UDP].bytes,
+                       dc->bpc[1][IFS_RX][IFS_UDP].packets,
+                       dc->bpc[1][IFS_RX][IFS_PROTO_OTHER].bytes,
+                       dc->bpc[1][IFS_RX][IFS_PROTO_OTHER].packets,
+                       dc->bpc[1][IFS_TX][IFS_TCP].bytes,
+                       dc->bpc[1][IFS_TX][IFS_TCP].packets,
+                       dc->bpc[1][IFS_TX][IFS_UDP].bytes,
+                       dc->bpc[1][IFS_TX][IFS_UDP].packets,
+                       dc->bpc[1][IFS_TX][IFS_PROTO_OTHER].bytes,
+                       dc->bpc[1][IFS_TX][IFS_PROTO_OTHER].packets);
+       else
+               return kasprintf(GFP_ATOMIC, "data_counters@%p{...}", dc);
+}
+
+char *pp_tag_node(struct tag_node *tn)
+{
+       char *tag_str;
+       char *res;
+
+       if (!tn)
+               return kasprintf(GFP_ATOMIC, "tag_node@null{}");
+       tag_str = pp_tag_t(&tn->tag);
+       res = kasprintf(GFP_ATOMIC,
+                       "tag_node@%p{tag=%s}",
+                       tn, tag_str);
+       kfree(tag_str);
+       return res;
+}
+
+char *pp_tag_ref(struct tag_ref *tr)
+{
+       char *tn_str;
+       char *res;
+
+       if (!tr)
+               return kasprintf(GFP_ATOMIC, "tag_ref@null{}");
+       tn_str = pp_tag_node(&tr->tn);
+       res = kasprintf(GFP_ATOMIC,
+                       "tag_ref@%p{%s, num_sock_tags=%d}",
+                       tr, tn_str, tr->num_sock_tags);
+       kfree(tn_str);
+       return res;
+}
+
+char *pp_tag_stat(struct tag_stat *ts)
+{
+       char *tn_str;
+       char *counters_str;
+       char *parent_counters_str;
+       char *res;
+
+       if (!ts)
+               return kasprintf(GFP_ATOMIC, "tag_stat@null{}");
+       tn_str = pp_tag_node(&ts->tn);
+       counters_str = pp_data_counters(&ts->counters, true);
+       parent_counters_str = pp_data_counters(ts->parent_counters, false);
+       res = kasprintf(GFP_ATOMIC,
+                       "tag_stat@%p{%s, counters=%s, parent_counters=%s}",
+                       ts, tn_str, counters_str, parent_counters_str);
+       kfree(tn_str);
+       kfree(counters_str);
+       kfree(parent_counters_str);
+       return res;
+}
+
+char *pp_iface_stat(struct iface_stat *is)
+{
+       if (!is)
+               return kasprintf(GFP_ATOMIC, "iface_stat@null{}");
+       return kasprintf(GFP_ATOMIC, "iface_stat@%p{"
+                        "list=list_head{...}, "
+                        "ifname=%s, "
+                        "rx_bytes=%llu, "
+                        "rx_packets=%llu, "
+                        "tx_bytes=%llu, "
+                        "tx_packets=%llu, "
+                        "active=%d, "
+                        "proc_ptr=%p, "
+                        "tag_stat_tree=rb_root{...}}",
+                        is,
+                        is->ifname,
+                        is->rx_bytes,
+                        is->rx_packets,
+                        is->tx_bytes,
+                        is->tx_packets,
+                        is->active,
+                        is->proc_ptr);
+}
+
+char *pp_sock_tag(struct sock_tag *st)
+{
+       char *tag_str;
+       char *res;
+
+       if (!st)
+               return kasprintf(GFP_ATOMIC, "sock_tag@null{}");
+       tag_str = pp_tag_t(&st->tag);
+       res = kasprintf(GFP_ATOMIC, "sock_tag@%p{"
+                       "sock_node=rb_node{...}, "
+                       "sk=%p socket=%p (f_count=%lu), list=list_head{...}, "
+                       "pid=%u, tag=%s}",
+                       st, st->sk, st->socket, atomic_long_read(
+                               &st->socket->file->f_count),
+                       st->pid, tag_str);
+       kfree(tag_str);
+       return res;
+}
+
+char *pp_uid_tag_data(struct uid_tag_data *utd)
+{
+       char *res;
+
+       if (!utd)
+               return kasprintf(GFP_ATOMIC, "uid_tag_data@null{}");
+       res = kasprintf(GFP_ATOMIC, "uid_tag_data@%p{"
+                       "uid=%u, num_active_acct_tags=%d, "
+                       "tag_node_tree=rb_root{...}, "
+                       "proc_qtu_data_tree=rb_root{...}}",
+                       utd, utd->uid,
+                       utd->num_active_tags);
+       return res;
+}
+
+char *pp_proc_qtu_data(struct proc_qtu_data *pqd)
+{
+       char *parent_tag_data_str;
+       char *res;
+
+       if (!pqd)
+               return kasprintf(GFP_ATOMIC, "proc_qtu_data@null{}");
+       parent_tag_data_str = pp_uid_tag_data(pqd->parent_tag_data);
+       res = kasprintf(GFP_ATOMIC, "proc_qtu_data@%p{"
+                       "node=rb_node{...}, pid=%u, "
+                       "parent_tag_data=%s, "
+                       "sock_tag_list=list_head{...}}",
+                       pqd, pqd->pid, parent_tag_data_str
+               );
+       kfree(parent_tag_data_str);
+       return res;
+}
+
+/*------------------------------------------*/
+void prdebug_sock_tag_tree(int indent_level,
+                          struct rb_root *sock_tag_tree)
+{
+       struct rb_node *node;
+       struct sock_tag *sock_tag_entry;
+       char *str;
+
+       str = "sock_tag_tree=rb_root{";
+       CT_DEBUG("%*d: %s\n", indent_level*2, indent_level, str);
+       indent_level++;
+       for (node = rb_first(sock_tag_tree);
+            node;
+            node = rb_next(node)) {
+               sock_tag_entry = rb_entry(node, struct sock_tag, sock_node);
+               str = pp_sock_tag(sock_tag_entry);
+               CT_DEBUG("%*d: %s,\n", indent_level*2, indent_level, str);
+               kfree(str);
+       }
+       indent_level--;
+       str = "}";
+       CT_DEBUG("%*d: %s\n", indent_level*2, indent_level, str);
+}
+
+void prdebug_sock_tag_list(int indent_level,
+                          struct list_head *sock_tag_list)
+{
+       struct sock_tag *sock_tag_entry;
+       char *str;
+
+       str = "sock_tag_list=list_head{";
+       CT_DEBUG("%*d: %s\n", indent_level*2, indent_level, str);
+       indent_level++;
+       list_for_each_entry(sock_tag_entry, sock_tag_list, list) {
+               str = pp_sock_tag(sock_tag_entry);
+               CT_DEBUG("%*d: %s,\n", indent_level*2, indent_level, str);
+               kfree(str);
+       }
+       indent_level--;
+       str = "}";
+       CT_DEBUG("%*d: %s\n", indent_level*2, indent_level, str);
+}
+
+void prdebug_proc_qtu_data_tree(int indent_level,
+                               struct rb_root *proc_qtu_data_tree)
+{
+       char *str;
+       struct rb_node *node;
+       struct proc_qtu_data *proc_qtu_data_entry;
+
+       str = "proc_qtu_data_tree=rb_root{";
+       CT_DEBUG("%*d: %s\n", indent_level*2, indent_level, str);
+       indent_level++;
+       for (node = rb_first(proc_qtu_data_tree);
+            node;
+            node = rb_next(node)) {
+               proc_qtu_data_entry = rb_entry(node,
+                                              struct proc_qtu_data,
+                                              node);
+               str = pp_proc_qtu_data(proc_qtu_data_entry);
+               CT_DEBUG("%*d: %s,\n", indent_level*2, indent_level,
+                        str);
+               kfree(str);
+               indent_level++;
+               prdebug_sock_tag_list(indent_level,
+                                     &proc_qtu_data_entry->sock_tag_list);
+               indent_level--;
+
+       }
+       indent_level--;
+       str = "}";
+       CT_DEBUG("%*d: %s\n", indent_level*2, indent_level, str);
+}
+
+void prdebug_tag_ref_tree(int indent_level, struct rb_root *tag_ref_tree)
+{
+       char *str;
+       struct rb_node *node;
+       struct tag_ref *tag_ref_entry;
+
+       str = "tag_ref_tree{";
+       CT_DEBUG("%*d: %s\n", indent_level*2, indent_level, str);
+       indent_level++;
+       for (node = rb_first(tag_ref_tree);
+            node;
+            node = rb_next(node)) {
+               tag_ref_entry = rb_entry(node,
+                                        struct tag_ref,
+                                        tn.node);
+               str = pp_tag_ref(tag_ref_entry);
+               CT_DEBUG("%*d: %s,\n", indent_level*2, indent_level,
+                        str);
+               kfree(str);
+       }
+       indent_level--;
+       str = "}";
+       CT_DEBUG("%*d: %s\n", indent_level*2, indent_level, str);
+}
+
+void prdebug_uid_tag_data_tree(int indent_level,
+                              struct rb_root *uid_tag_data_tree)
+{
+       char *str;
+       struct rb_node *node;
+       struct uid_tag_data *uid_tag_data_entry;
+
+       str = "uid_tag_data_tree=rb_root{";
+       CT_DEBUG("%*d: %s\n", indent_level*2, indent_level, str);
+       indent_level++;
+       for (node = rb_first(uid_tag_data_tree);
+            node;
+            node = rb_next(node)) {
+               uid_tag_data_entry = rb_entry(node, struct uid_tag_data,
+                                             node);
+               str = pp_uid_tag_data(uid_tag_data_entry);
+               CT_DEBUG("%*d: %s,\n", indent_level*2, indent_level, str);
+               kfree(str);
+               if (!RB_EMPTY_ROOT(&uid_tag_data_entry->tag_ref_tree)) {
+                       indent_level++;
+                       prdebug_tag_ref_tree(indent_level,
+                                            &uid_tag_data_entry->tag_ref_tree);
+                       indent_level--;
+               }
+       }
+       indent_level--;
+       str = "}";
+       CT_DEBUG("%*d: %s\n", indent_level*2, indent_level, str);
+}
+
+void prdebug_tag_stat_tree(int indent_level,
+                                 struct rb_root *tag_stat_tree)
+{
+       char *str;
+       struct rb_node *node;
+       struct tag_stat *ts_entry;
+
+       str = "tag_stat_tree{";
+       CT_DEBUG("%*d: %s\n", indent_level*2, indent_level, str);
+       indent_level++;
+       for (node = rb_first(tag_stat_tree);
+            node;
+            node = rb_next(node)) {
+               ts_entry = rb_entry(node, struct tag_stat, tn.node);
+               str = pp_tag_stat(ts_entry);
+               CT_DEBUG("%*d: %s\n", indent_level*2, indent_level,
+                        str);
+               kfree(str);
+       }
+       indent_level--;
+       str = "}";
+       CT_DEBUG("%*d: %s\n", indent_level*2, indent_level, str);
+}
+
+void prdebug_iface_stat_list(int indent_level,
+                            struct list_head *iface_stat_list)
+{
+       char *str;
+       struct iface_stat *iface_entry;
+
+       str = "iface_stat_list=list_head{";
+       CT_DEBUG("%*d: %s\n", indent_level*2, indent_level, str);
+       indent_level++;
+       list_for_each_entry(iface_entry, iface_stat_list, list) {
+               str = pp_iface_stat(iface_entry);
+               CT_DEBUG("%*d: %s\n", indent_level*2, indent_level, str);
+               kfree(str);
+
+               spin_lock_bh(&iface_entry->tag_stat_list_lock);
+               if (!RB_EMPTY_ROOT(&iface_entry->tag_stat_tree)) {
+                       indent_level++;
+                       prdebug_tag_stat_tree(indent_level,
+                                             &iface_entry->tag_stat_tree);
+                       indent_level--;
+               }
+               spin_unlock_bh(&iface_entry->tag_stat_list_lock);
+       }
+       indent_level--;
+       str = "}";
+       CT_DEBUG("%*d: %s\n", indent_level*2, indent_level, str);
+}
diff --git a/net/netfilter/xt_qtaguid_print.h b/net/netfilter/xt_qtaguid_print.h
new file mode 100644 (file)
index 0000000..e26020c
--- /dev/null
@@ -0,0 +1,39 @@
+/*
+ * Pretty printing Support for iptables xt_qtaguid module.
+ *
+ * (C) 2011 Google, Inc
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#ifndef __XT_QTAGUID_PRINT_H__
+#define __XT_QTAGUID_PRINT_H__
+
+#include "xt_qtaguid_internal.h"
+
+char *pp_tag_t(tag_t *tag);
+char *pp_data_counters(struct data_counters *dc, bool showValues);
+char *pp_tag_node(struct tag_node *tn);
+char *pp_tag_ref(struct tag_ref *tr);
+char *pp_tag_stat(struct tag_stat *ts);
+char *pp_iface_stat(struct iface_stat *is);
+char *pp_sock_tag(struct sock_tag *st);
+char *pp_uid_tag_data(struct uid_tag_data *qtd);
+char *pp_proc_qtu_data(struct proc_qtu_data *pqd);
+
+/*------------------------------------------*/
+void prdebug_sock_tag_list(int indent_level,
+                          struct list_head *sock_tag_list);
+void prdebug_sock_tag_tree(int indent_level,
+                          struct rb_root *sock_tag_tree);
+void prdebug_proc_qtu_data_tree(int indent_level,
+                               struct rb_root *proc_qtu_data_tree);
+void prdebug_tag_ref_tree(int indent_level, struct rb_root *tag_ref_tree);
+void prdebug_uid_tag_data_tree(int indent_level,
+                              struct rb_root *uid_tag_data_tree);
+void prdebug_tag_stat_tree(int indent_level,
+                          struct rb_root *tag_stat_tree);
+void prdebug_iface_stat_list(int indent_level,
+                            struct list_head *iface_stat_list);
+#endif  /* ifndef __XT_QTAGUID_PRINT_H__ */