xfs: use per-filesystem radix trees for dquot lookup
author: Christoph Hellwig <hch@infradead.org>
Tue, 13 Mar 2012 08:52:35 +0000 (08:52 +0000)
committer: Ben Myers <bpm@sgi.com>
Wed, 14 Mar 2012 16:09:06 +0000 (11:09 -0500)
Replace the global hash tables for looking up in-memory dquot structures
with per-filesystem radix trees to allow scaling to a large number of
in-memory dquot structures.

Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Ben Myers <bpm@sgi.com>
fs/xfs/xfs_dquot.c
fs/xfs/xfs_dquot.h
fs/xfs/xfs_qm.c
fs/xfs/xfs_qm.h
fs/xfs/xfs_quota_priv.h
fs/xfs/xfs_trace.h

index fec1a3d78e9fb5ab9f231fe7e24f21dad35a9138..49456e555cfa920353a4472f9c36f8015ad4f58b 100644 (file)
@@ -43,7 +43,7 @@
  * Lock order:
  *
  * ip->i_lock
- *   qh->qh_lock
+ *   qi->qi_tree_lock
  *     qi->qi_dqlist_lock
  *       dquot->q_qlock (xfs_dqlock() and friends)
  *         dquot->q_flush (xfs_dqflock() and friends)
@@ -601,60 +601,6 @@ error0:
        return error;
 }
 
-/*
- * Lookup a dquot in the incore dquot hashtable. We keep two separate
- * hashtables for user and group dquots; and, these are global tables
- * inside the XQM, not per-filesystem tables.
- * The hash chain must be locked by caller, and it is left locked
- * on return. Returning dquot is locked.
- */
-STATIC int
-xfs_qm_dqlookup(
-       xfs_mount_t             *mp,
-       xfs_dqid_t              id,
-       xfs_dqhash_t            *qh,
-       xfs_dquot_t             **O_dqpp)
-{
-       xfs_dquot_t             *dqp;
-
-       ASSERT(mutex_is_locked(&qh->qh_lock));
-
-       /*
-        * Traverse the hashchain looking for a match
-        */
-       list_for_each_entry(dqp, &qh->qh_list, q_hashlist) {
-               /*
-                * We already have the hashlock. We don't need the
-                * dqlock to look at the id field of the dquot, since the
-                * id can't be modified without the hashlock anyway.
-                */
-               if (be32_to_cpu(dqp->q_core.d_id) != id || dqp->q_mount != mp)
-                       continue;
-
-               trace_xfs_dqlookup_found(dqp);
-
-               xfs_dqlock(dqp);
-               if (dqp->dq_flags & XFS_DQ_FREEING) {
-                       *O_dqpp = NULL;
-                       xfs_dqunlock(dqp);
-                       return -1;
-               }
-
-               dqp->q_nrefs++;
-
-               /*
-                * move the dquot to the front of the hashchain
-                */
-               list_move(&dqp->q_hashlist, &qh->qh_list);
-               trace_xfs_dqlookup_done(dqp);
-               *O_dqpp = dqp;
-               return 0;
-       }
-
-       *O_dqpp = NULL;
-       return 1;
-}
-
 /*
  * Given the file system, inode OR id, and type (UDQUOT/GDQUOT), return a
  * a locked dquot, doing an allocation (if requested) as needed.
@@ -672,10 +618,10 @@ xfs_qm_dqget(
        uint            flags,    /* DQALLOC, DQSUSER, DQREPAIR, DOWARN */
        xfs_dquot_t     **O_dqpp) /* OUT : locked incore dquot */
 {
-       xfs_dquot_t     *dqp, *dqp1;
-       xfs_dqhash_t    *h;
-       uint            version;
-       int             error;
+       struct xfs_quotainfo    *qi = mp->m_quotainfo;
+       struct radix_tree_root *tree = XFS_DQUOT_TREE(qi, type);
+       struct xfs_dquot        *dqp;
+       int                     error;
 
        ASSERT(XFS_IS_QUOTA_RUNNING(mp));
        if ((! XFS_IS_UQUOTA_ON(mp) && type == XFS_DQ_USER) ||
@@ -683,7 +629,6 @@ xfs_qm_dqget(
            (! XFS_IS_GQUOTA_ON(mp) && type == XFS_DQ_GROUP)) {
                return (ESRCH);
        }
-       h = XFS_DQ_HASH(mp, id, type);
 
 #ifdef DEBUG
        if (xfs_do_dqerror) {
@@ -704,34 +649,28 @@ xfs_qm_dqget(
 #endif
 
 restart:
-       mutex_lock(&h->qh_lock);
+       mutex_lock(&qi->qi_tree_lock);
+       dqp = radix_tree_lookup(tree, id);
+       if (dqp) {
+               xfs_dqlock(dqp);
+               if (dqp->dq_flags & XFS_DQ_FREEING) {
+                       xfs_dqunlock(dqp);
+                       mutex_unlock(&qi->qi_tree_lock);
+                       trace_xfs_dqget_freeing(dqp);
+                       delay(1);
+                       goto restart;
+               }
 
-       /*
-        * Look in the cache (hashtable).
-        * The chain is kept locked during lookup.
-        */
-       switch (xfs_qm_dqlookup(mp, id, h, O_dqpp)) {
-       case -1:
-               XFS_STATS_INC(xs_qm_dquot_dups);
-               mutex_unlock(&h->qh_lock);
-               delay(1);
-               goto restart;
-       case 0:
+               dqp->q_nrefs++;
+               mutex_unlock(&qi->qi_tree_lock);
+
+               trace_xfs_dqget_hit(dqp);
                XFS_STATS_INC(xs_qm_dqcachehits);
-               /*
-                * The dquot was found, moved to the front of the chain,
-                * taken off the freelist if it was on it, and locked
-                * at this point. Just unlock the hashchain and return.
-                */
-               ASSERT(*O_dqpp);
-               ASSERT(XFS_DQ_IS_LOCKED(*O_dqpp));
-               mutex_unlock(&h->qh_lock);
-               trace_xfs_dqget_hit(*O_dqpp);
-               return 0;       /* success */
-       default:
-               XFS_STATS_INC(xs_qm_dqcachemisses);
-               break;
+               *O_dqpp = dqp;
+               return 0;
        }
+       mutex_unlock(&qi->qi_tree_lock);
+       XFS_STATS_INC(xs_qm_dqcachemisses);
 
        /*
         * Dquot cache miss. We don't want to keep the inode lock across
@@ -742,12 +681,6 @@ restart:
         */
        if (ip)
                xfs_iunlock(ip, XFS_ILOCK_EXCL);
-       /*
-        * Save the hashchain version stamp, and unlock the chain, so that
-        * we don't keep the lock across a disk read
-        */
-       version = h->qh_version;
-       mutex_unlock(&h->qh_lock);
 
        error = xfs_qm_dqread(mp, id, type, flags, &dqp);
 
@@ -757,15 +690,14 @@ restart:
        if (error)
                return error;
 
-       /*
-        * Dquot lock comes after hashlock in the lock ordering
-        */
        if (ip) {
                /*
                 * A dquot could be attached to this inode by now, since
                 * we had dropped the ilock.
                 */
                if (xfs_this_quota_on(mp, type)) {
+                       struct xfs_dquot        *dqp1;
+
                        dqp1 = xfs_inode_dquot(ip, type);
                        if (dqp1) {
                                xfs_qm_dqdestroy(dqp);
@@ -780,51 +712,27 @@ restart:
                }
        }
 
-       /*
-        * Hashlock comes after ilock in lock order
-        */
-       mutex_lock(&h->qh_lock);
-       if (version != h->qh_version) {
-               xfs_dquot_t *tmpdqp;
+       mutex_lock(&qi->qi_tree_lock);
+       error = -radix_tree_insert(tree, id, dqp);
+       if (unlikely(error)) {
+               WARN_ON(error != EEXIST);
+
                /*
-                * Now, see if somebody else put the dquot in the
-                * hashtable before us. This can happen because we didn't
-                * keep the hashchain lock. We don't have to worry about
-                * lock order between the two dquots here since dqp isn't
-                * on any findable lists yet.
+                * Duplicate found. Just throw away the new dquot and start
+                * over.
                 */
-               switch (xfs_qm_dqlookup(mp, id, h, &tmpdqp)) {
-               case 0:
-               case -1:
-                       /*
-                        * Duplicate found, either in cache or on its way out.
-                        * Just throw away the new dquot and start over.
-                        */
-                       if (tmpdqp)
-                               xfs_qm_dqput(tmpdqp);
-                       mutex_unlock(&h->qh_lock);
-                       xfs_qm_dqdestroy(dqp);
-                       XFS_STATS_INC(xs_qm_dquot_dups);
-                       goto restart;
-               default:
-                       break;
-               }
+               mutex_unlock(&qi->qi_tree_lock);
+               trace_xfs_dqget_dup(dqp);
+               xfs_qm_dqdestroy(dqp);
+               XFS_STATS_INC(xs_qm_dquot_dups);
+               goto restart;
        }
 
-       /*
-        * Put the dquot at the beginning of the hash-chain and mp's list
-        * LOCK ORDER: hashlock, freelistlock, mplistlock, udqlock, gdqlock ..
-        */
-       ASSERT(mutex_is_locked(&h->qh_lock));
-       dqp->q_hash = h;
-       list_add(&dqp->q_hashlist, &h->qh_list);
-       h->qh_version++;
-
        /*
         * Attach this dquot to this filesystem's list of all dquots,
         * kept inside the mount structure in m_quotainfo field
         */
-       mutex_lock(&mp->m_quotainfo->qi_dqlist_lock);
+       mutex_lock(&qi->qi_dqlist_lock);
 
        /*
         * We return a locked dquot to the caller, with a reference taken
@@ -832,10 +740,11 @@ restart:
        xfs_dqlock(dqp);
        dqp->q_nrefs = 1;
 
-       list_add(&dqp->q_mplist, &mp->m_quotainfo->qi_dqlist);
-       mp->m_quotainfo->qi_dquots++;
-       mutex_unlock(&mp->m_quotainfo->qi_dqlist_lock);
-       mutex_unlock(&h->qh_lock);
+       list_add(&dqp->q_mplist, &qi->qi_dqlist);
+       qi->qi_dquots++;
+       mutex_unlock(&qi->qi_dqlist_lock);
+       mutex_unlock(&qi->qi_tree_lock);
+
  dqret:
        ASSERT((ip == NULL) || xfs_isilocked(ip, XFS_ILOCK_EXCL));
        trace_xfs_dqget_miss(dqp);
@@ -1117,7 +1026,6 @@ xfs_qm_dqpurge(
        struct xfs_dquot        *dqp)
 {
        struct xfs_mount        *mp = dqp->q_mount;
-       struct xfs_dqhash       *qh = dqp->q_hash;
        struct xfs_quotainfo    *qi = mp->m_quotainfo;
 
        xfs_dqlock(dqp);
@@ -1164,10 +1072,10 @@ xfs_qm_dqpurge(
        xfs_dqfunlock(dqp);
        xfs_dqunlock(dqp);
 
-       mutex_lock(&qh->qh_lock);
-       list_del_init(&dqp->q_hashlist);
-       qh->qh_version++;
-       mutex_unlock(&qh->qh_lock);
+       mutex_lock(&qi->qi_tree_lock);
+       radix_tree_delete(XFS_DQUOT_TREE(qi, dqp->q_core.d_flags),
+                         be32_to_cpu(dqp->q_core.d_id));
+       mutex_unlock(&qi->qi_tree_lock);
 
        mutex_lock(&qi->qi_dqlist_lock);
        list_del_init(&dqp->q_mplist);
index f291c25e5992630d1efa55fb303767fe854e298b..4061f1731271584af5d80ca82d77319c15bbf2b6 100644 (file)
  * when quotas are off.
  */
 
-/*
- * The hash chain headers (hash buckets)
- */
-typedef struct xfs_dqhash {
-       struct list_head  qh_list;
-       struct mutex      qh_lock;
-       uint              qh_version;   /* ever increasing version */
-       uint              qh_nelems;    /* number of dquots on the list */
-} xfs_dqhash_t;
-
 struct xfs_mount;
 struct xfs_trans;
 
@@ -49,8 +39,6 @@ typedef struct xfs_dquot {
        uint             dq_flags;      /* various flags (XFS_DQ_*) */
        struct list_head q_lru;         /* global free list of dquots */
        struct list_head q_mplist;      /* mount's list of dquots */
-       struct list_head q_hashlist;    /* gloabl hash list of dquots */
-       xfs_dqhash_t    *q_hash;        /* the hashchain header */
        struct xfs_mount*q_mount;       /* filesystem this relates to */
        struct xfs_trans*q_transp;      /* trans this belongs to currently */
        uint             q_nrefs;       /* # active refs from inodes */
index a2579e1d687fc3924b56fcd18187435479e6373c..bb884e701cd9ba407a09cebaa27cd453380f84aa 100644 (file)
@@ -54,9 +54,6 @@ struct xfs_qm *xfs_Gqm;
 kmem_zone_t    *qm_dqzone;
 kmem_zone_t    *qm_dqtrxzone;
 
-STATIC void    xfs_qm_list_init(xfs_dqlist_t *, char *, int);
-STATIC void    xfs_qm_list_destroy(xfs_dqlist_t *);
-
 STATIC int     xfs_qm_init_quotainos(xfs_mount_t *);
 STATIC int     xfs_qm_init_quotainfo(xfs_mount_t *);
 STATIC int     xfs_qm_shake(struct shrinker *, struct shrink_control *);
@@ -68,37 +65,9 @@ STATIC int   xfs_qm_shake(struct shrinker *, struct shrink_control *);
 STATIC struct xfs_qm *
 xfs_Gqm_init(void)
 {
-       xfs_dqhash_t    *udqhash, *gdqhash;
        xfs_qm_t        *xqm;
-       size_t          hsize;
-       uint            i;
-
-       /*
-        * Initialize the dquot hash tables.
-        */
-       udqhash = kmem_zalloc_greedy(&hsize,
-                                    XFS_QM_HASHSIZE_LOW * sizeof(xfs_dqhash_t),
-                                    XFS_QM_HASHSIZE_HIGH * sizeof(xfs_dqhash_t));
-       if (!udqhash)
-               goto out;
-
-       gdqhash = kmem_zalloc_large(hsize);
-       if (!gdqhash)
-               goto out_free_udqhash;
-
-       hsize /= sizeof(xfs_dqhash_t);
 
        xqm = kmem_zalloc(sizeof(xfs_qm_t), KM_SLEEP);
-       xqm->qm_dqhashmask = hsize - 1;
-       xqm->qm_usr_dqhtable = udqhash;
-       xqm->qm_grp_dqhtable = gdqhash;
-       ASSERT(xqm->qm_usr_dqhtable != NULL);
-       ASSERT(xqm->qm_grp_dqhtable != NULL);
-
-       for (i = 0; i < hsize; i++) {
-               xfs_qm_list_init(&(xqm->qm_usr_dqhtable[i]), "uxdqh", i);
-               xfs_qm_list_init(&(xqm->qm_grp_dqhtable[i]), "gxdqh", i);
-       }
 
        /*
         * dquot zone. we register our own low-memory callback.
@@ -122,11 +91,6 @@ xfs_Gqm_init(void)
 
        xqm->qm_nrefs = 0;
        return xqm;
-
- out_free_udqhash:
-       kmem_free_large(udqhash);
- out:
-       return NULL;
 }
 
 /*
@@ -136,22 +100,9 @@ STATIC void
 xfs_qm_destroy(
        struct xfs_qm   *xqm)
 {
-       int             hsize, i;
-
        ASSERT(xqm != NULL);
        ASSERT(xqm->qm_nrefs == 0);
 
-       hsize = xqm->qm_dqhashmask + 1;
-       for (i = 0; i < hsize; i++) {
-               xfs_qm_list_destroy(&(xqm->qm_usr_dqhtable[i]));
-               xfs_qm_list_destroy(&(xqm->qm_grp_dqhtable[i]));
-       }
-       kmem_free_large(xqm->qm_usr_dqhtable);
-       kmem_free_large(xqm->qm_grp_dqhtable);
-       xqm->qm_usr_dqhtable = NULL;
-       xqm->qm_grp_dqhtable = NULL;
-       xqm->qm_dqhashmask = 0;
-
        kmem_free(xqm);
 }
 
@@ -761,14 +712,6 @@ xfs_qm_dqdetach(
        }
 }
 
-/*
- * The hash chains and the mplist use the same xfs_dqhash structure as
- * their list head, but we can take the mplist qh_lock and one of the
- * hash qh_locks at the same time without any problem as they aren't
- * related.
- */
-static struct lock_class_key xfs_quota_mplist_class;
-
 /*
  * This initializes all the quota information that's kept in the
  * mount structure
@@ -802,9 +745,12 @@ xfs_qm_init_quotainfo(
                return error;
        }
 
+       INIT_RADIX_TREE(&qinf->qi_uquota_tree, GFP_NOFS);
+       INIT_RADIX_TREE(&qinf->qi_gquota_tree, GFP_NOFS);
+       mutex_init(&qinf->qi_tree_lock);
+
        INIT_LIST_HEAD(&qinf->qi_dqlist);
        mutex_init(&qinf->qi_dqlist_lock);
-       lockdep_set_class(&qinf->qi_dqlist_lock, &xfs_quota_mplist_class);
 
        INIT_LIST_HEAD(&qinf->qi_lru_list);
        qinf->qi_lru_count = 0;
@@ -924,30 +870,6 @@ xfs_qm_destroy_quotainfo(
        mp->m_quotainfo = NULL;
 }
 
-
-
-/* ------------------- PRIVATE STATIC FUNCTIONS ----------------------- */
-
-/* ARGSUSED */
-STATIC void
-xfs_qm_list_init(
-       xfs_dqlist_t    *list,
-       char            *str,
-       int             n)
-{
-       mutex_init(&list->qh_lock);
-       INIT_LIST_HEAD(&list->qh_list);
-       list->qh_version = 0;
-       list->qh_nelems = 0;
-}
-
-STATIC void
-xfs_qm_list_destroy(
-       xfs_dqlist_t    *list)
-{
-       mutex_destroy(&(list->qh_lock));
-}
-
 /*
  * Create an inode and return with a reference already taken, but unlocked
  * This is how we create quota inodes
@@ -1592,10 +1514,10 @@ xfs_qm_dqfree_one(
        struct xfs_mount        *mp = dqp->q_mount;
        struct xfs_quotainfo    *qi = mp->m_quotainfo;
 
-       mutex_lock(&dqp->q_hash->qh_lock);
-       list_del_init(&dqp->q_hashlist);
-       dqp->q_hash->qh_version++;
-       mutex_unlock(&dqp->q_hash->qh_lock);
+       mutex_lock(&qi->qi_tree_lock);
+       radix_tree_delete(XFS_DQUOT_TREE(qi, dqp->q_core.d_flags),
+                         be32_to_cpu(dqp->q_core.d_id));
+       mutex_unlock(&qi->qi_tree_lock);
 
        mutex_lock(&qi->qi_dqlist_lock);
        list_del_init(&dqp->q_mplist);
@@ -1634,7 +1556,6 @@ xfs_qm_dqreclaim_one(
                return;
        }
 
-       ASSERT(dqp->q_hash);
        ASSERT(!list_empty(&dqp->q_mplist));
 
        /*
index c236bba9bfab9c8eef903d2a5999cdbc585800c9..8f4b117823ccfe7602095fc02bd98ed402cf8e5f 100644 (file)
@@ -30,12 +30,6 @@ extern struct xfs_qm *xfs_Gqm;
 extern kmem_zone_t     *qm_dqzone;
 extern kmem_zone_t     *qm_dqtrxzone;
 
-/*
- * Dquot hashtable constants/threshold values.
- */
-#define XFS_QM_HASHSIZE_LOW            (PAGE_SIZE / sizeof(xfs_dqhash_t))
-#define XFS_QM_HASHSIZE_HIGH           ((PAGE_SIZE * 4) / sizeof(xfs_dqhash_t))
-
 /*
  * This defines the unit of allocation of dquots.
  * Currently, it is just one file system block, and a 4K blk contains 30
@@ -47,15 +41,10 @@ extern kmem_zone_t  *qm_dqtrxzone;
  */
 #define XFS_DQUOT_CLUSTER_SIZE_FSB     (xfs_filblks_t)1
 
-typedef xfs_dqhash_t   xfs_dqlist_t;
-
 /*
  * Quota Manager (global) structure. Lives only in core.
  */
 typedef struct xfs_qm {
-       xfs_dqlist_t    *qm_usr_dqhtable;/* udquot hash table */
-       xfs_dqlist_t    *qm_grp_dqhtable;/* gdquot hash table */
-       uint             qm_dqhashmask;  /* # buckets in dq hashtab - 1 */
        uint             qm_nrefs;       /* file systems with quota on */
        kmem_zone_t     *qm_dqzone;      /* dquot mem-alloc zone */
        kmem_zone_t     *qm_dqtrxzone;   /* t_dqinfo of transactions */
@@ -66,6 +55,9 @@ typedef struct xfs_qm {
  * The mount structure keeps a pointer to this.
  */
 typedef struct xfs_quotainfo {
+       struct radix_tree_root qi_uquota_tree;
+       struct radix_tree_root qi_gquota_tree;
+       struct mutex qi_tree_lock;
        xfs_inode_t     *qi_uquotaip;    /* user quota inode */
        xfs_inode_t     *qi_gquotaip;    /* group quota inode */
        struct list_head qi_lru_list;
@@ -94,6 +86,11 @@ typedef struct xfs_quotainfo {
        struct shrinker  qi_shrinker;
 } xfs_quotainfo_t;
 
+#define XFS_DQUOT_TREE(qi, type) \
+       ((type & XFS_DQ_USER) ? \
+        &((qi)->qi_uquota_tree) : \
+        &((qi)->qi_gquota_tree))
+
 
 extern void    xfs_trans_mod_dquot(xfs_trans_t *, xfs_dquot_t *, uint, long);
 extern int     xfs_trans_reserve_quota_bydquots(xfs_trans_t *, xfs_mount_t *,
index 94a3d927d716c6ac6075323340affe7110f7d81a..6d86219d93da2c55b98eb26c190b2ec478e31bcd 100644 (file)
  */
 #define XFS_DQITER_MAP_SIZE    10
 
-/*
- * Hash into a bucket in the dquot hash table, based on <mp, id>.
- */
-#define XFS_DQ_HASHVAL(mp, id) (((__psunsigned_t)(mp) + \
-                                (__psunsigned_t)(id)) & \
-                               (xfs_Gqm->qm_dqhashmask - 1))
-#define XFS_DQ_HASH(mp, id, type)   (type == XFS_DQ_USER ? \
-                                    (xfs_Gqm->qm_usr_dqhtable + \
-                                     XFS_DQ_HASHVAL(mp, id)) : \
-                                    (xfs_Gqm->qm_grp_dqhtable + \
-                                     XFS_DQ_HASHVAL(mp, id)))
 #define XFS_IS_DQUOT_UNINITIALIZED(dqp) ( \
        !dqp->q_core.d_blk_hardlimit && \
        !dqp->q_core.d_blk_softlimit && \
index ceaf6fe67e414f083f7760b9b1ff4724075dd67b..75eb54af4d581e7f4cc9270f0abb1c72238195ac 100644 (file)
@@ -741,10 +741,10 @@ DEFINE_DQUOT_EVENT(xfs_dqalloc);
 DEFINE_DQUOT_EVENT(xfs_dqtobp_read);
 DEFINE_DQUOT_EVENT(xfs_dqread);
 DEFINE_DQUOT_EVENT(xfs_dqread_fail);
-DEFINE_DQUOT_EVENT(xfs_dqlookup_found);
-DEFINE_DQUOT_EVENT(xfs_dqlookup_done);
 DEFINE_DQUOT_EVENT(xfs_dqget_hit);
 DEFINE_DQUOT_EVENT(xfs_dqget_miss);
+DEFINE_DQUOT_EVENT(xfs_dqget_freeing);
+DEFINE_DQUOT_EVENT(xfs_dqget_dup);
 DEFINE_DQUOT_EVENT(xfs_dqput);
 DEFINE_DQUOT_EVENT(xfs_dqput_wait);
 DEFINE_DQUOT_EVENT(xfs_dqput_free);