Merge branch 'for-3.6/core' of git://git.kernel.dk/linux-block

author Linus Torvalds <torvalds@linux-foundation.org>

Wed, 1 Aug 2012 16:02:41 +0000 (09:02 -0700)

committer Linus Torvalds <torvalds@linux-foundation.org>

Wed, 1 Aug 2012 16:02:41 +0000 (09:02 -0700)
author Linus Torvalds <torvalds@linux-foundation.org>
Wed, 1 Aug 2012 16:02:41 +0000 (09:02 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Wed, 1 Aug 2012 16:02:41 +0000 (09:02 -0700)
diff --git a/Documentation/block/queue-sysfs.txt b/Documentation/block/queue-sysfs.txt

index d8147b336c354e203addd40bb85bcc0abbeeded6..6518a55273e7094f62f84a5d83467fd96b26fd26 100644 (file)
--- a/Documentation/block/queue-sysfs.txt
+++ b/Documentation/block/queue-sysfs.txt
@@ -38,6 +38,13 @@ read or write requests. Note that the total allocated number may be twice
  this amount, since it applies only to reads or writes (not the accumulated
  sum).
  
+To avoid priority inversion through request starvation, a request
+queue maintains a separate request pool per each cgroup when
+CONFIG_BLK_CGROUP is enabled, and this parameter applies to each such
+per-block-cgroup request pool.  IOW, if there are N block cgroups,
+each request queue may have up to N request pools, each independently
+regulated by nr_requests.
+
  read_ahead_kb (RW)
  ------------------
  Maximum number of kilobytes to read-ahead for filesystems on this block
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c

index e7dee617358e810aec57ff25162ff95604fabb3c..f3b44a65fc7ad5f127bee8bcbadf5b486a7e5c71 100644 (file)
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -31,27 +31,6 @@ EXPORT_SYMBOL_GPL(blkcg_root);
  
  static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS];
  
-struct blkcg *cgroup_to_blkcg(struct cgroup *cgroup)
-{
-       return container_of(cgroup_subsys_state(cgroup, blkio_subsys_id),
-                           struct blkcg, css);
-}
-EXPORT_SYMBOL_GPL(cgroup_to_blkcg);
-
-static struct blkcg *task_blkcg(struct task_struct *tsk)
-{
-       return container_of(task_subsys_state(tsk, blkio_subsys_id),
-                           struct blkcg, css);
-}
-
-struct blkcg *bio_blkcg(struct bio *bio)
-{
-       if (bio && bio->bi_css)
-               return container_of(bio->bi_css, struct blkcg, css);
-       return task_blkcg(current);
-}
-EXPORT_SYMBOL_GPL(bio_blkcg);
-
  static bool blkcg_policy_enabled(struct request_queue *q,
                                  const struct blkcg_policy *pol)
  {
@@ -84,6 +63,7 @@ static void blkg_free(struct blkcg_gq *blkg)
                 kfree(pd);
         }
  
+       blk_exit_rl(&blkg->rl);
         kfree(blkg);
  }
  
@@ -91,16 +71,18 @@ static void blkg_free(struct blkcg_gq *blkg)
   * blkg_alloc - allocate a blkg
   * @blkcg: block cgroup the new blkg is associated with
   * @q: request_queue the new blkg is associated with
+ * @gfp_mask: allocation mask to use
   *
   * Allocate a new blkg assocating @blkcg and @q.
   */
-static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q)
+static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q,
+                                  gfp_t gfp_mask)
  {
         struct blkcg_gq *blkg;
         int i;
  
         /* alloc and init base part */
-       blkg = kzalloc_node(sizeof(*blkg), GFP_ATOMIC, q->node);
+       blkg = kzalloc_node(sizeof(*blkg), gfp_mask, q->node);
         if (!blkg)
                 return NULL;
  
@@ -109,6 +91,13 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q)
         blkg->blkcg = blkcg;
         blkg->refcnt = 1;
  
+       /* root blkg uses @q->root_rl, init rl only for !root blkgs */
+       if (blkcg != &blkcg_root) {
+               if (blk_init_rl(&blkg->rl, q, gfp_mask))
+                       goto err_free;
+               blkg->rl.blkg = blkg;
+       }
+
         for (i = 0; i < BLKCG_MAX_POLS; i++) {
                 struct blkcg_policy *pol = blkcg_policy[i];
                 struct blkg_policy_data *pd;
@@ -117,11 +106,9 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q)
                         continue;
  
                 /* alloc per-policy data and attach it to blkg */
-               pd = kzalloc_node(pol->pd_size, GFP_ATOMIC, q->node);
-               if (!pd) {
-                       blkg_free(blkg);
-                       return NULL;
-               }
+               pd = kzalloc_node(pol->pd_size, gfp_mask, q->node);
+               if (!pd)
+                       goto err_free;
  
                 blkg->pd[i] = pd;
                 pd->blkg = blkg;
@@ -132,6 +119,10 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q)
         }
  
         return blkg;
+
+err_free:
+       blkg_free(blkg);
+       return NULL;
  }
  
  static struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg,
@@ -175,9 +166,13 @@ struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, struct request_queue *q)
  }
  EXPORT_SYMBOL_GPL(blkg_lookup);
  
+/*
+ * If @new_blkg is %NULL, this function tries to allocate a new one as
+ * necessary using %GFP_ATOMIC.  @new_blkg is always consumed on return.
+ */
  static struct blkcg_gq *__blkg_lookup_create(struct blkcg *blkcg,
-                                            struct request_queue *q)
-       __releases(q->queue_lock) __acquires(q->queue_lock)
+                                            struct request_queue *q,
+                                            struct blkcg_gq *new_blkg)
  {
         struct blkcg_gq *blkg;
         int ret;
@@ -189,24 +184,26 @@ static struct blkcg_gq *__blkg_lookup_create(struct blkcg *blkcg,
         blkg = __blkg_lookup(blkcg, q);
         if (blkg) {
                 rcu_assign_pointer(blkcg->blkg_hint, blkg);
-               return blkg;
+               goto out_free;
         }
  
         /* blkg holds a reference to blkcg */
-       if (!css_tryget(&blkcg->css))
-               return ERR_PTR(-EINVAL);
+       if (!css_tryget(&blkcg->css)) {
+               blkg = ERR_PTR(-EINVAL);
+               goto out_free;
+       }
  
         /* allocate */
-       ret = -ENOMEM;
-       blkg = blkg_alloc(blkcg, q);
-       if (unlikely(!blkg))
-               goto err_put;
+       if (!new_blkg) {
+               new_blkg = blkg_alloc(blkcg, q, GFP_ATOMIC);
+               if (unlikely(!new_blkg)) {
+                       blkg = ERR_PTR(-ENOMEM);
+                       goto out_put;
+               }
+       }
+       blkg = new_blkg;
  
         /* insert */
-       ret = radix_tree_preload(GFP_ATOMIC);
-       if (ret)
-               goto err_free;
-
         spin_lock(&blkcg->lock);
         ret = radix_tree_insert(&blkcg->blkg_tree, q->id, blkg);
         if (likely(!ret)) {
@@ -215,15 +212,15 @@ static struct blkcg_gq *__blkg_lookup_create(struct blkcg *blkcg,
         }
         spin_unlock(&blkcg->lock);
  
-       radix_tree_preload_end();
-
         if (!ret)
                 return blkg;
-err_free:
-       blkg_free(blkg);
-err_put:
+
+       blkg = ERR_PTR(ret);
+out_put:
         css_put(&blkcg->css);
-       return ERR_PTR(ret);
+out_free:
+       blkg_free(new_blkg);
+       return blkg;
  }
  
  struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg,
@@ -235,7 +232,7 @@ struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg,
          */
         if (unlikely(blk_queue_bypass(q)))
                 return ERR_PTR(blk_queue_dead(q) ? -EINVAL : -EBUSY);
-       return __blkg_lookup_create(blkcg, q);
+       return __blkg_lookup_create(blkcg, q, NULL);
  }
  EXPORT_SYMBOL_GPL(blkg_lookup_create);
  
@@ -313,6 +310,38 @@ void __blkg_release(struct blkcg_gq *blkg)
  }
  EXPORT_SYMBOL_GPL(__blkg_release);
  
+/*
+ * The next function used by blk_queue_for_each_rl().  It's a bit tricky
+ * because the root blkg uses @q->root_rl instead of its own rl.
+ */
+struct request_list *__blk_queue_next_rl(struct request_list *rl,
+                                        struct request_queue *q)
+{
+       struct list_head *ent;
+       struct blkcg_gq *blkg;
+
+       /*
+        * Determine the current blkg list_head.  The first entry is
+        * root_rl which is off @q->blkg_list and mapped to the head.
+        */
+       if (rl == &q->root_rl) {
+               ent = &q->blkg_list;
+       } else {
+               blkg = container_of(rl, struct blkcg_gq, rl);
+               ent = &blkg->q_node;
+       }
+
+       /* walk to the next list_head, skip root blkcg */
+       ent = ent->next;
+       if (ent == &q->root_blkg->q_node)
+               ent = ent->next;
+       if (ent == &q->blkg_list)
+               return NULL;
+
+       blkg = container_of(ent, struct blkcg_gq, q_node);
+       return &blkg->rl;
+}
+
  static int blkcg_reset_stats(struct cgroup *cgroup, struct cftype *cftype,
                              u64 val)
  {
@@ -734,24 +763,36 @@ int blkcg_activate_policy(struct request_queue *q,
         struct blkcg_gq *blkg;
         struct blkg_policy_data *pd, *n;
         int cnt = 0, ret;
+       bool preloaded;
  
         if (blkcg_policy_enabled(q, pol))
                 return 0;
  
+       /* preallocations for root blkg */
+       blkg = blkg_alloc(&blkcg_root, q, GFP_KERNEL);
+       if (!blkg)
+               return -ENOMEM;
+
+       preloaded = !radix_tree_preload(GFP_KERNEL);
+
         blk_queue_bypass_start(q);
  
         /* make sure the root blkg exists and count the existing blkgs */
         spin_lock_irq(q->queue_lock);
  
         rcu_read_lock();
-       blkg = __blkg_lookup_create(&blkcg_root, q);
+       blkg = __blkg_lookup_create(&blkcg_root, q, blkg);
         rcu_read_unlock();
  
+       if (preloaded)
+               radix_tree_preload_end();
+
         if (IS_ERR(blkg)) {
                 ret = PTR_ERR(blkg);
                 goto out_unlock;
         }
         q->root_blkg = blkg;
+       q->root_rl.blkg = blkg;
  
         list_for_each_entry(blkg, &q->blkg_list, q_node)
                 cnt++;
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h

index 8ac457ce7783847522c1340c008d3c7789f3a1b2..24597309e23d38700a6ca2ae80cd9aab71aa3474 100644 (file)
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -17,6 +17,7 @@
  #include <linux/u64_stats_sync.h>
  #include <linux/seq_file.h>
  #include <linux/radix-tree.h>
+#include <linux/blkdev.h>
  
  /* Max limits for throttle policy */
  #define THROTL_IOPS_MAX                UINT_MAX
@@ -93,6 +94,8 @@ struct blkcg_gq {
         struct list_head                q_node;
         struct hlist_node               blkcg_node;
         struct blkcg                    *blkcg;
+       /* request allocation list for this blkcg-q pair */
+       struct request_list             rl;
         /* reference count */
         int                             refcnt;
  
@@ -120,8 +123,6 @@ struct blkcg_policy {
  
  extern struct blkcg blkcg_root;
  
-struct blkcg *cgroup_to_blkcg(struct cgroup *cgroup);
-struct blkcg *bio_blkcg(struct bio *bio);
  struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, struct request_queue *q);
  struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg,
                                     struct request_queue *q);
@@ -160,6 +161,25 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
  void blkg_conf_finish(struct blkg_conf_ctx *ctx);
  
  
+static inline struct blkcg *cgroup_to_blkcg(struct cgroup *cgroup)
+{
+       return container_of(cgroup_subsys_state(cgroup, blkio_subsys_id),
+                           struct blkcg, css);
+}
+
+static inline struct blkcg *task_blkcg(struct task_struct *tsk)
+{
+       return container_of(task_subsys_state(tsk, blkio_subsys_id),
+                           struct blkcg, css);
+}
+
+static inline struct blkcg *bio_blkcg(struct bio *bio)
+{
+       if (bio && bio->bi_css)
+               return container_of(bio->bi_css, struct blkcg, css);
+       return task_blkcg(current);
+}
+
  /**
   * blkg_to_pdata - get policy private data
   * @blkg: blkg of interest
@@ -233,6 +253,95 @@ static inline void blkg_put(struct blkcg_gq *blkg)
                 __blkg_release(blkg);
  }
  
+/**
+ * blk_get_rl - get request_list to use
+ * @q: request_queue of interest
+ * @bio: bio which will be attached to the allocated request (may be %NULL)
+ *
+ * The caller wants to allocate a request from @q to use for @bio.  Find
+ * the request_list to use and obtain a reference on it.  Should be called
+ * under queue_lock.  This function is guaranteed to return non-%NULL
+ * request_list.
+ */
+static inline struct request_list *blk_get_rl(struct request_queue *q,
+                                             struct bio *bio)
+{
+       struct blkcg *blkcg;
+       struct blkcg_gq *blkg;
+
+       rcu_read_lock();
+
+       blkcg = bio_blkcg(bio);
+
+       /* bypass blkg lookup and use @q->root_rl directly for root */
+       if (blkcg == &blkcg_root)
+               goto root_rl;
+
+       /*
+        * Try to use blkg->rl.  blkg lookup may fail under memory pressure
+        * or if either the blkcg or queue is going away.  Fall back to
+        * root_rl in such cases.
+        */
+       blkg = blkg_lookup_create(blkcg, q);
+       if (unlikely(IS_ERR(blkg)))
+               goto root_rl;
+
+       blkg_get(blkg);
+       rcu_read_unlock();
+       return &blkg->rl;
+root_rl:
+       rcu_read_unlock();
+       return &q->root_rl;
+}
+
+/**
+ * blk_put_rl - put request_list
+ * @rl: request_list to put
+ *
+ * Put the reference acquired by blk_get_rl().  Should be called under
+ * queue_lock.
+ */
+static inline void blk_put_rl(struct request_list *rl)
+{
+       /* root_rl may not have blkg set */
+       if (rl->blkg && rl->blkg->blkcg != &blkcg_root)
+               blkg_put(rl->blkg);
+}
+
+/**
+ * blk_rq_set_rl - associate a request with a request_list
+ * @rq: request of interest
+ * @rl: target request_list
+ *
+ * Associate @rq with @rl so that accounting and freeing can know the
+ * request_list @rq came from.
+ */
+static inline void blk_rq_set_rl(struct request *rq, struct request_list *rl)
+{
+       rq->rl = rl;
+}
+
+/**
+ * blk_rq_rl - return the request_list a request came from
+ * @rq: request of interest
+ *
+ * Return the request_list @rq is allocated from.
+ */
+static inline struct request_list *blk_rq_rl(struct request *rq)
+{
+       return rq->rl;
+}
+
+struct request_list *__blk_queue_next_rl(struct request_list *rl,
+                                        struct request_queue *q);
+/**
+ * blk_queue_for_each_rl - iterate through all request_lists of a request_queue
+ *
+ * Should be used under queue_lock.
+ */
+#define blk_queue_for_each_rl(rl, q)   \
+       for ((rl) = &(q)->root_rl; (rl); (rl) = __blk_queue_next_rl((rl), (q)))
+
  /**
   * blkg_stat_add - add a value to a blkg_stat
   * @stat: target blkg_stat
@@ -351,6 +460,7 @@ static inline void blkg_rwstat_reset(struct blkg_rwstat *rwstat)
  #else  /* CONFIG_BLK_CGROUP */
  
  struct cgroup;
+struct blkcg;
  
  struct blkg_policy_data {
  };
@@ -361,8 +471,6 @@ struct blkcg_gq {
  struct blkcg_policy {
  };
  
-static inline struct blkcg *cgroup_to_blkcg(struct cgroup *cgroup) { return NULL; }
-static inline struct blkcg *bio_blkcg(struct bio *bio) { return NULL; }
  static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, void *key) { return NULL; }
  static inline int blkcg_init_queue(struct request_queue *q) { return 0; }
  static inline void blkcg_drain_queue(struct request_queue *q) { }
@@ -374,6 +482,9 @@ static inline int blkcg_activate_policy(struct request_queue *q,
  static inline void blkcg_deactivate_policy(struct request_queue *q,
                                            const struct blkcg_policy *pol) { }
  
+static inline struct blkcg *cgroup_to_blkcg(struct cgroup *cgroup) { return NULL; }
+static inline struct blkcg *bio_blkcg(struct bio *bio) { return NULL; }
+
  static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg,
                                                   struct blkcg_policy *pol) { return NULL; }
  static inline struct blkcg_gq *pd_to_blkg(struct blkg_policy_data *pd) { return NULL; }
@@ -381,5 +492,14 @@ static inline char *blkg_path(struct blkcg_gq *blkg) { return NULL; }
  static inline void blkg_get(struct blkcg_gq *blkg) { }
  static inline void blkg_put(struct blkcg_gq *blkg) { }
  
+static inline struct request_list *blk_get_rl(struct request_queue *q,
+                                             struct bio *bio) { return &q->root_rl; }
+static inline void blk_put_rl(struct request_list *rl) { }
+static inline void blk_rq_set_rl(struct request *rq, struct request_list *rl) { }
+static inline struct request_list *blk_rq_rl(struct request *rq) { return &rq->q->root_rl; }
+
+#define blk_queue_for_each_rl(rl, q)   \
+       for ((rl) = &(q)->root_rl; (rl); (rl) = NULL)
+
  #endif /* CONFIG_BLK_CGROUP */
  #endif /* _BLK_CGROUP_H */
diff --git a/block/blk-core.c b/block/blk-core.c

index 93eb3e4f88ce78affc79c5ca6d8b7c7b0759be5a..dd134d834d589468fe1794c71a8325920ee5ac34 100644 (file)
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -387,7 +387,7 @@ void blk_drain_queue(struct request_queue *q, bool drain_all)
                 if (!list_empty(&q->queue_head) && q->request_fn)
                         __blk_run_queue(q);
  
-               drain |= q->rq.elvpriv;
+               drain |= q->nr_rqs_elvpriv;
  
                 /*
                  * Unfortunately, requests are queued at and tracked from
@@ -397,7 +397,7 @@ void blk_drain_queue(struct request_queue *q, bool drain_all)
                 if (drain_all) {
                         drain |= !list_empty(&q->queue_head);
                         for (i = 0; i < 2; i++) {
-                               drain |= q->rq.count[i];
+                               drain |= q->nr_rqs[i];
                                 drain |= q->in_flight[i];
                                 drain |= !list_empty(&q->flush_queue[i]);
                         }
@@ -416,9 +416,14 @@ void blk_drain_queue(struct request_queue *q, bool drain_all)
          * left with hung waiters. We need to wake up those waiters.
          */
         if (q->request_fn) {
+               struct request_list *rl;
+
                 spin_lock_irq(q->queue_lock);
-               for (i = 0; i < ARRAY_SIZE(q->rq.wait); i++)
-                       wake_up_all(&q->rq.wait[i]);
+
+               blk_queue_for_each_rl(rl, q)
+                       for (i = 0; i < ARRAY_SIZE(rl->wait); i++)
+                               wake_up_all(&rl->wait[i]);
+
                 spin_unlock_irq(q->queue_lock);
         }
  }
@@ -517,28 +522,33 @@ void blk_cleanup_queue(struct request_queue *q)
  }
  EXPORT_SYMBOL(blk_cleanup_queue);
  
-static int blk_init_free_list(struct request_queue *q)
+int blk_init_rl(struct request_list *rl, struct request_queue *q,
+               gfp_t gfp_mask)
  {
-       struct request_list *rl = &q->rq;
-
         if (unlikely(rl->rq_pool))
                 return 0;
  
+       rl->q = q;
         rl->count[BLK_RW_SYNC] = rl->count[BLK_RW_ASYNC] = 0;
         rl->starved[BLK_RW_SYNC] = rl->starved[BLK_RW_ASYNC] = 0;
-       rl->elvpriv = 0;
         init_waitqueue_head(&rl->wait[BLK_RW_SYNC]);
         init_waitqueue_head(&rl->wait[BLK_RW_ASYNC]);
  
         rl->rq_pool = mempool_create_node(BLKDEV_MIN_RQ, mempool_alloc_slab,
-                               mempool_free_slab, request_cachep, q->node);
-
+                                         mempool_free_slab, request_cachep,
+                                         gfp_mask, q->node);
         if (!rl->rq_pool)
                 return -ENOMEM;
  
         return 0;
  }
  
+void blk_exit_rl(struct request_list *rl)
+{
+       if (rl->rq_pool)
+               mempool_destroy(rl->rq_pool);
+}
+
  struct request_queue *blk_alloc_queue(gfp_t gfp_mask)
  {
         return blk_alloc_queue_node(gfp_mask, -1);
@@ -680,7 +690,7 @@ blk_init_allocated_queue(struct request_queue *q, request_fn_proc *rfn,
         if (!q)
                 return NULL;
  
-       if (blk_init_free_list(q))
+       if (blk_init_rl(&q->root_rl, q, GFP_KERNEL))
                 return NULL;
  
         q->request_fn           = rfn;
@@ -722,15 +732,15 @@ bool blk_get_queue(struct request_queue *q)
  }
  EXPORT_SYMBOL(blk_get_queue);
  
-static inline void blk_free_request(struct request_queue *q, struct request *rq)
+static inline void blk_free_request(struct request_list *rl, struct request *rq)
  {
         if (rq->cmd_flags & REQ_ELVPRIV) {
-               elv_put_request(q, rq);
+               elv_put_request(rl->q, rq);
                 if (rq->elv.icq)
                         put_io_context(rq->elv.icq->ioc);
         }
  
-       mempool_free(rq, q->rq.rq_pool);
+       mempool_free(rq, rl->rq_pool);
  }
  
  /*
@@ -767,18 +777,23 @@ static void ioc_set_batching(struct request_queue *q, struct io_context *ioc)
         ioc->last_waited = jiffies;
  }
  
-static void __freed_request(struct request_queue *q, int sync)
+static void __freed_request(struct request_list *rl, int sync)
  {
-       struct request_list *rl = &q->rq;
+       struct request_queue *q = rl->q;
  
-       if (rl->count[sync] < queue_congestion_off_threshold(q))
+       /*
+        * bdi isn't aware of blkcg yet.  As all async IOs end up in root
+        * blkcg anyway, just use root blkcg state.
+        */
+       if (rl == &q->root_rl &&
+           rl->count[sync] < queue_congestion_off_threshold(q))
                 blk_clear_queue_congested(q, sync);
  
         if (rl->count[sync] + 1 <= q->nr_requests) {
                 if (waitqueue_active(&rl->wait[sync]))
                         wake_up(&rl->wait[sync]);
  
-               blk_clear_queue_full(q, sync);
+               blk_clear_rl_full(rl, sync);
         }
  }
  
@@ -786,19 +801,20 @@ static void __freed_request(struct request_queue *q, int sync)
   * A request has just been released.  Account for it, update the full and
   * congestion status, wake up any waiters.   Called under q->queue_lock.
   */
-static void freed_request(struct request_queue *q, unsigned int flags)
+static void freed_request(struct request_list *rl, unsigned int flags)
  {
-       struct request_list *rl = &q->rq;
+       struct request_queue *q = rl->q;
         int sync = rw_is_sync(flags);
  
+       q->nr_rqs[sync]--;
         rl->count[sync]--;
         if (flags & REQ_ELVPRIV)
-               rl->elvpriv--;
+               q->nr_rqs_elvpriv--;
  
-       __freed_request(q, sync);
+       __freed_request(rl, sync);
  
         if (unlikely(rl->starved[sync ^ 1]))
-               __freed_request(q, sync ^ 1);
+               __freed_request(rl, sync ^ 1);
  }
  
  /*
@@ -837,8 +853,8 @@ static struct io_context *rq_ioc(struct bio *bio)
  }
  
  /**
- * get_request - get a free request
- * @q: request_queue to allocate request from
+ * __get_request - get a free request
+ * @rl: request list to allocate from
   * @rw_flags: RW and SYNC flags
   * @bio: bio to allocate request for (can be %NULL)
   * @gfp_mask: allocation mask
@@ -850,20 +866,16 @@ static struct io_context *rq_ioc(struct bio *bio)
   * Returns %NULL on failure, with @q->queue_lock held.
   * Returns !%NULL on success, with @q->queue_lock *not held*.
   */
-static struct request *get_request(struct request_queue *q, int rw_flags,
-                                  struct bio *bio, gfp_t gfp_mask)
+static struct request *__get_request(struct request_list *rl, int rw_flags,
+                                    struct bio *bio, gfp_t gfp_mask)
  {
+       struct request_queue *q = rl->q;
         struct request *rq;
-       struct request_list *rl = &q->rq;
-       struct elevator_type *et;
-       struct io_context *ioc;
+       struct elevator_type *et = q->elevator->type;
+       struct io_context *ioc = rq_ioc(bio);
         struct io_cq *icq = NULL;
         const bool is_sync = rw_is_sync(rw_flags) != 0;
-       bool retried = false;
         int may_queue;
-retry:
-       et = q->elevator->type;
-       ioc = rq_ioc(bio);
  
         if (unlikely(blk_queue_dead(q)))
                 return NULL;
@@ -874,29 +886,15 @@ retry:
  
         if (rl->count[is_sync]+1 >= queue_congestion_on_threshold(q)) {
                 if (rl->count[is_sync]+1 >= q->nr_requests) {
-                       /*
-                        * We want ioc to record batching state.  If it's
-                        * not already there, creating a new one requires
-                        * dropping queue_lock, which in turn requires
-                        * retesting conditions to avoid queue hang.
-                        */
-                       if (!ioc && !retried) {
-                               spin_unlock_irq(q->queue_lock);
-                               create_io_context(gfp_mask, q->node);
-                               spin_lock_irq(q->queue_lock);
-                               retried = true;
-                               goto retry;
-                       }
-
                         /*
                          * The queue will fill after this allocation, so set
                          * it as full, and mark this process as "batching".
                          * This process will be allowed to complete a batch of
                          * requests, others will be blocked.
                          */
-                       if (!blk_queue_full(q, is_sync)) {
+                       if (!blk_rl_full(rl, is_sync)) {
                                 ioc_set_batching(q, ioc);
-                               blk_set_queue_full(q, is_sync);
+                               blk_set_rl_full(rl, is_sync);
                         } else {
                                 if (may_queue != ELV_MQUEUE_MUST
                                                 && !ioc_batching(q, ioc)) {
@@ -909,7 +907,12 @@ retry:
                                 }
                         }
                 }
-               blk_set_queue_congested(q, is_sync);
+               /*
+                * bdi isn't aware of blkcg yet.  As all async IOs end up
+                * in root blkcg anyway, just use root blkcg state.
+                */
+               if (rl == &q->root_rl)
+                       blk_set_queue_congested(q, is_sync);
         }
  
         /*
@@ -920,6 +923,7 @@ retry:
         if (rl->count[is_sync] >= (3 * q->nr_requests / 2))
                 return NULL;
  
+       q->nr_rqs[is_sync]++;
         rl->count[is_sync]++;
         rl->starved[is_sync] = 0;
  
@@ -935,7 +939,7 @@ retry:
          */
         if (blk_rq_should_init_elevator(bio) && !blk_queue_bypass(q)) {
                 rw_flags |= REQ_ELVPRIV;
-               rl->elvpriv++;
+               q->nr_rqs_elvpriv++;
                 if (et->icq_cache && ioc)
                         icq = ioc_lookup_icq(ioc, q);
         }
@@ -945,22 +949,19 @@ retry:
         spin_unlock_irq(q->queue_lock);
  
         /* allocate and init request */
-       rq = mempool_alloc(q->rq.rq_pool, gfp_mask);
+       rq = mempool_alloc(rl->rq_pool, gfp_mask);
         if (!rq)
                 goto fail_alloc;
  
         blk_rq_init(q, rq);
+       blk_rq_set_rl(rq, rl);
         rq->cmd_flags = rw_flags | REQ_ALLOCED;
  
         /* init elvpriv */
         if (rw_flags & REQ_ELVPRIV) {
                 if (unlikely(et->icq_cache && !icq)) {
-                       create_io_context(gfp_mask, q->node);
-                       ioc = rq_ioc(bio);
-                       if (!ioc)
-                               goto fail_elvpriv;
-
-                       icq = ioc_create_icq(ioc, q, gfp_mask);
+                       if (ioc)
+                               icq = ioc_create_icq(ioc, q, gfp_mask);
                         if (!icq)
                                 goto fail_elvpriv;
                 }
@@ -1000,7 +1001,7 @@ fail_elvpriv:
         rq->elv.icq = NULL;
  
         spin_lock_irq(q->queue_lock);
-       rl->elvpriv--;
+       q->nr_rqs_elvpriv--;
         spin_unlock_irq(q->queue_lock);
         goto out;
  
@@ -1013,7 +1014,7 @@ fail_alloc:
          * queue, but this is pretty rare.
          */
         spin_lock_irq(q->queue_lock);
-       freed_request(q, rw_flags);
+       freed_request(rl, rw_flags);
  
         /*
          * in the very unlikely event that allocation failed and no
@@ -1029,56 +1030,58 @@ rq_starved:
  }
  
  /**
- * get_request_wait - get a free request with retry
+ * get_request - get a free request
   * @q: request_queue to allocate request from
   * @rw_flags: RW and SYNC flags
   * @bio: bio to allocate request for (can be %NULL)
+ * @gfp_mask: allocation mask
   *
- * Get a free request from @q.  This function keeps retrying under memory
- * pressure and fails iff @q is dead.
+ * Get a free request from @q.  If %__GFP_WAIT is set in @gfp_mask, this
+ * function keeps retrying under memory pressure and fails iff @q is dead.
   *
   * Must be callled with @q->queue_lock held and,
   * Returns %NULL on failure, with @q->queue_lock held.
   * Returns !%NULL on success, with @q->queue_lock *not held*.
   */
-static struct request *get_request_wait(struct request_queue *q, int rw_flags,
-                                       struct bio *bio)
+static struct request *get_request(struct request_queue *q, int rw_flags,
+                                  struct bio *bio, gfp_t gfp_mask)
  {
         const bool is_sync = rw_is_sync(rw_flags) != 0;
+       DEFINE_WAIT(wait);
+       struct request_list *rl;
         struct request *rq;
  
-       rq = get_request(q, rw_flags, bio, GFP_NOIO);
-       while (!rq) {
-               DEFINE_WAIT(wait);
-               struct request_list *rl = &q->rq;
-
-               if (unlikely(blk_queue_dead(q)))
-                       return NULL;
+       rl = blk_get_rl(q, bio);        /* transferred to @rq on success */
+retry:
+       rq = __get_request(rl, rw_flags, bio, gfp_mask);
+       if (rq)
+               return rq;
  
-               prepare_to_wait_exclusive(&rl->wait[is_sync], &wait,
-                               TASK_UNINTERRUPTIBLE);
+       if (!(gfp_mask & __GFP_WAIT) || unlikely(blk_queue_dead(q))) {
+               blk_put_rl(rl);
+               return NULL;
+       }
  
-               trace_block_sleeprq(q, bio, rw_flags & 1);
+       /* wait on @rl and retry */
+       prepare_to_wait_exclusive(&rl->wait[is_sync], &wait,
+                                 TASK_UNINTERRUPTIBLE);
  
-               spin_unlock_irq(q->queue_lock);
-               io_schedule();
+       trace_block_sleeprq(q, bio, rw_flags & 1);
  
-               /*
-                * After sleeping, we become a "batching" process and
-                * will be able to allocate at least one request, and
-                * up to a big batch of them for a small period time.
-                * See ioc_batching, ioc_set_batching
-                */
-               create_io_context(GFP_NOIO, q->node);
-               ioc_set_batching(q, current->io_context);
+       spin_unlock_irq(q->queue_lock);
+       io_schedule();
  
-               spin_lock_irq(q->queue_lock);
-               finish_wait(&rl->wait[is_sync], &wait);
+       /*
+        * After sleeping, we become a "batching" process and will be able
+        * to allocate at least one request, and up to a big batch of them
+        * for a small period time.  See ioc_batching, ioc_set_batching
+        */
+       ioc_set_batching(q, current->io_context);
  
-               rq = get_request(q, rw_flags, bio, GFP_NOIO);
-       };
+       spin_lock_irq(q->queue_lock);
+       finish_wait(&rl->wait[is_sync], &wait);
  
-       return rq;
+       goto retry;
  }
  
  struct request *blk_get_request(struct request_queue *q, int rw, gfp_t gfp_mask)
@@ -1087,11 +1090,11 @@ struct request *blk_get_request(struct request_queue *q, int rw, gfp_t gfp_mask)
  
         BUG_ON(rw != READ && rw != WRITE);
  
+       /* create ioc upfront */
+       create_io_context(gfp_mask, q->node);
+
         spin_lock_irq(q->queue_lock);
-       if (gfp_mask & __GFP_WAIT)
-               rq = get_request_wait(q, rw, NULL);
-       else
-               rq = get_request(q, rw, NULL, gfp_mask);
+       rq = get_request(q, rw, NULL, gfp_mask);
         if (!rq)
                 spin_unlock_irq(q->queue_lock);
         /* q->queue_lock is unlocked at this point */
@@ -1248,12 +1251,14 @@ void __blk_put_request(struct request_queue *q, struct request *req)
          */
         if (req->cmd_flags & REQ_ALLOCED) {
                 unsigned int flags = req->cmd_flags;
+               struct request_list *rl = blk_rq_rl(req);
  
                 BUG_ON(!list_empty(&req->queuelist));
                 BUG_ON(!hlist_unhashed(&req->hash));
  
-               blk_free_request(q, req);
-               freed_request(q, flags);
+               blk_free_request(rl, req);
+               freed_request(rl, flags);
+               blk_put_rl(rl);
         }
  }
  EXPORT_SYMBOL_GPL(__blk_put_request);
@@ -1481,7 +1486,7 @@ get_rq:
          * Grab a free request. This is might sleep but can not fail.
          * Returns with the queue unlocked.
          */
-       req = get_request_wait(q, rw_flags, bio);
+       req = get_request(q, rw_flags, bio, GFP_NOIO);
         if (unlikely(!req)) {
                 bio_endio(bio, -ENODEV);        /* @q is dead */
                 goto out_unlock;
@@ -1702,6 +1707,14 @@ generic_make_request_checks(struct bio *bio)
                 goto end_io;
         }
  
+       /*
+        * Various block parts want %current->io_context and lazy ioc
+        * allocation ends up trading a lot of pain for a small amount of
+        * memory.  Just allocate it upfront.  This may fail and block
+        * layer knows how to live with it.
+        */
+       create_io_context(GFP_ATOMIC, q->node);
+
         if (blk_throtl_bio(q, bio))
                 return false;   /* throttled, will be resubmitted later */
  
diff --git a/block/blk-ioc.c b/block/blk-ioc.c

index 893b8007c657e8bd0ca93d5ba61e9e7d02aa892b..fab4cdd3f7bbb8f5d8bda98c3a8771a1ac61bc46 100644 (file)
--- a/block/blk-ioc.c
+++ b/block/blk-ioc.c
@@ -244,6 +244,7 @@ int create_task_io_context(struct task_struct *task, gfp_t gfp_flags, int node)
  
         /* initialize */
         atomic_long_set(&ioc->refcount, 1);
+       atomic_set(&ioc->nr_tasks, 1);
         atomic_set(&ioc->active_ref, 1);
         spin_lock_init(&ioc->lock);
         INIT_RADIX_TREE(&ioc->icq_tree, GFP_ATOMIC | __GFP_HIGH);
diff --git a/block/blk-settings.c b/block/blk-settings.c

index d3234fc494adcd48ff1dd69c9feb117e93e333c0..565a6786032f59e40cee28bf4bde4ffb451cffa8 100644 (file)
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -143,8 +143,7 @@ void blk_set_stacking_limits(struct queue_limits *lim)
         lim->discard_zeroes_data = 1;
         lim->max_segments = USHRT_MAX;
         lim->max_hw_sectors = UINT_MAX;
-
-       lim->max_sectors = BLK_DEF_MAX_SECTORS;
+       lim->max_sectors = UINT_MAX;
  }
  EXPORT_SYMBOL(blk_set_stacking_limits);
  
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c

index aa41b47c22d2e89525a5bd3cfb9501e67634bff7..9628b291f96057a42cbf6a5492bd7480fe7e93da 100644 (file)
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -40,7 +40,7 @@ static ssize_t queue_requests_show(struct request_queue *q, char *page)
  static ssize_t
  queue_requests_store(struct request_queue *q, const char *page, size_t count)
  {
-       struct request_list *rl = &q->rq;
+       struct request_list *rl;
         unsigned long nr;
         int ret;
  
@@ -55,6 +55,9 @@ queue_requests_store(struct request_queue *q, const char *page, size_t count)
         q->nr_requests = nr;
         blk_queue_congestion_threshold(q);
  
+       /* congestion isn't cgroup aware and follows root blkcg for now */
+       rl = &q->root_rl;
+
         if (rl->count[BLK_RW_SYNC] >= queue_congestion_on_threshold(q))
                 blk_set_queue_congested(q, BLK_RW_SYNC);
         else if (rl->count[BLK_RW_SYNC] < queue_congestion_off_threshold(q))
@@ -65,19 +68,22 @@ queue_requests_store(struct request_queue *q, const char *page, size_t count)
         else if (rl->count[BLK_RW_ASYNC] < queue_congestion_off_threshold(q))
                 blk_clear_queue_congested(q, BLK_RW_ASYNC);
  
-       if (rl->count[BLK_RW_SYNC] >= q->nr_requests) {
-               blk_set_queue_full(q, BLK_RW_SYNC);
-       } else {
-               blk_clear_queue_full(q, BLK_RW_SYNC);
-               wake_up(&rl->wait[BLK_RW_SYNC]);
+       blk_queue_for_each_rl(rl, q) {
+               if (rl->count[BLK_RW_SYNC] >= q->nr_requests) {
+                       blk_set_rl_full(rl, BLK_RW_SYNC);
+               } else {
+                       blk_clear_rl_full(rl, BLK_RW_SYNC);
+                       wake_up(&rl->wait[BLK_RW_SYNC]);
+               }
+
+               if (rl->count[BLK_RW_ASYNC] >= q->nr_requests) {
+                       blk_set_rl_full(rl, BLK_RW_ASYNC);
+               } else {
+                       blk_clear_rl_full(rl, BLK_RW_ASYNC);
+                       wake_up(&rl->wait[BLK_RW_ASYNC]);
+               }
         }
  
-       if (rl->count[BLK_RW_ASYNC] >= q->nr_requests) {
-               blk_set_queue_full(q, BLK_RW_ASYNC);
-       } else {
-               blk_clear_queue_full(q, BLK_RW_ASYNC);
-               wake_up(&rl->wait[BLK_RW_ASYNC]);
-       }
         spin_unlock_irq(q->queue_lock);
         return ret;
  }
@@ -476,7 +482,6 @@ static void blk_release_queue(struct kobject *kobj)
  {
         struct request_queue *q =
                 container_of(kobj, struct request_queue, kobj);
-       struct request_list *rl = &q->rq;
  
         blk_sync_queue(q);
  
@@ -489,8 +494,7 @@ static void blk_release_queue(struct kobject *kobj)
                 elevator_exit(q->elevator);
         }
  
-       if (rl->rq_pool)
-               mempool_destroy(rl->rq_pool);
+       blk_exit_rl(&q->root_rl);
  
         if (q->queue_tags)
                 __blk_queue_free_tags(q);
diff --git a/block/blk-throttle.c b/block/blk-throttle.c

index 5b0659512047208efdcc3db7d714bc72dfd456f5..e287c19908c8a31d3c4d29b1586921066032afa6 100644 (file)
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -1123,9 +1123,6 @@ bool blk_throtl_bio(struct request_queue *q, struct bio *bio)
                 goto out;
         }
  
-       /* bio_associate_current() needs ioc, try creating */
-       create_io_context(GFP_ATOMIC, q->node);
-
         /*
          * A throtl_grp pointer retrieved under rcu can be used to access
          * basic fields like stats and io rates. If a group has no rules,
diff --git a/block/blk.h b/block/blk.h

index 85f6ae42f7d3f698e9e82c75064f428065953e70..2a0ea32d249fdaa9694e0249e435777565535d70 100644 (file)
--- a/block/blk.h
+++ b/block/blk.h
@@ -18,6 +18,9 @@ static inline void __blk_get_queue(struct request_queue *q)
         kobject_get(&q->kobj);
  }
  
+int blk_init_rl(struct request_list *rl, struct request_queue *q,
+               gfp_t gfp_mask);
+void blk_exit_rl(struct request_list *rl);
  void init_request_from_bio(struct request *req, struct bio *bio);
  void blk_rq_bio_prep(struct request_queue *q, struct request *rq,
                         struct bio *bio);
@@ -33,7 +36,6 @@ bool __blk_end_bidi_request(struct request *rq, int error,
  void blk_rq_timed_out_timer(unsigned long data);
  void blk_delete_timer(struct request *);
  void blk_add_timer(struct request *);
-void __generic_unplug_device(struct request_queue *);
  
  /*
   * Internal atomic flags for request handling
diff --git a/block/bsg-lib.c b/block/bsg-lib.c

index 7ad49c88f6b197a04c66e05aab9facacd2781af4..deee61fbb7419005886234b47a68b73b5833d20e 100644 (file)
--- a/block/bsg-lib.c
+++ b/block/bsg-lib.c
@@ -243,56 +243,3 @@ int bsg_setup_queue(struct device *dev, struct request_queue *q,
         return 0;
  }
  EXPORT_SYMBOL_GPL(bsg_setup_queue);
-
-/**
- * bsg_remove_queue - Deletes the bsg dev from the q
- * @q: the request_queue that is to be torn down.
- *
- * Notes:
- *   Before unregistering the queue empty any requests that are blocked
- */
-void bsg_remove_queue(struct request_queue *q)
-{
-       struct request *req; /* block request */
-       int counts; /* totals for request_list count and starved */
-
-       if (!q)
-               return;
-
-       /* Stop taking in new requests */
-       spin_lock_irq(q->queue_lock);
-       blk_stop_queue(q);
-
-       /* drain all requests in the queue */
-       while (1) {
-               /* need the lock to fetch a request
-                * this may fetch the same reqeust as the previous pass
-                */
-               req = blk_fetch_request(q);
-               /* save requests in use and starved */
-               counts = q->rq.count[0] + q->rq.count[1] +
-                        q->rq.starved[0] + q->rq.starved[1];
-               spin_unlock_irq(q->queue_lock);
-               /* any requests still outstanding? */
-               if (counts == 0)
-                       break;
-
-               /* This may be the same req as the previous iteration,
-                * always send the blk_end_request_all after a prefetch.
-                * It is not okay to not end the request because the
-                * prefetch started the request.
-                */
-               if (req) {
-                       /* return -ENXIO to indicate that this queue is
-                        * going away
-                        */
-                       req->errors = -ENXIO;
-                       blk_end_request_all(req, -ENXIO);
-               }
-
-               msleep(200); /* allow bsg to possibly finish */
-               spin_lock_irq(q->queue_lock);
-       }
-       bsg_unregister_queue(q);
-}
-EXPORT_SYMBOL_GPL(bsg_remove_queue);
diff --git a/block/genhd.c b/block/genhd.c

index 9cf5583c90ffa75c2b010020fb22fc8ddf55e82a..cac7366957c376cedb2341753520e6f32516572c 100644 (file)
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -154,7 +154,7 @@ struct hd_struct *disk_part_iter_next(struct disk_part_iter *piter)
                 part = rcu_dereference(ptbl->part[piter->idx]);
                 if (!part)
                         continue;
-               if (!part->nr_sects &&
+               if (!part_nr_sects_read(part) &&
                     !(piter->flags & DISK_PITER_INCL_EMPTY) &&
                     !(piter->flags & DISK_PITER_INCL_EMPTY_PART0 &&
                       piter->idx == 0))
@@ -191,7 +191,7 @@ EXPORT_SYMBOL_GPL(disk_part_iter_exit);
  static inline int sector_in_part(struct hd_struct *part, sector_t sector)
  {
         return part->start_sect <= sector &&
-               sector < part->start_sect + part->nr_sects;
+               sector < part->start_sect + part_nr_sects_read(part);
  }
  
  /**
@@ -769,8 +769,8 @@ void __init printk_all_partitions(void)
  
                         printk("%s%s %10llu %s %s", is_part0 ? "" : "  ",
                                bdevt_str(part_devt(part), devt_buf),
-                              (unsigned long long)part->nr_sects >> 1,
-                              disk_name(disk, part->partno, name_buf),
+                              (unsigned long long)part_nr_sects_read(part) >> 1
+                              , disk_name(disk, part->partno, name_buf),
                                uuid_buf);
                         if (is_part0) {
                                 if (disk->driverfs_dev != NULL &&
@@ -862,7 +862,7 @@ static int show_partition(struct seq_file *seqf, void *v)
         while ((part = disk_part_iter_next(&piter)))
                 seq_printf(seqf, "%4d  %7d %10llu %s\n",
                            MAJOR(part_devt(part)), MINOR(part_devt(part)),
-                          (unsigned long long)part->nr_sects >> 1,
+                          (unsigned long long)part_nr_sects_read(part) >> 1,
                            disk_name(sgp, part->partno, buf));
         disk_part_iter_exit(&piter);
  
@@ -1268,6 +1268,16 @@ struct gendisk *alloc_disk_node(int minors, int node_id)
                 }
                 disk->part_tbl->part[0] = &disk->part0;
  
+               /*
+                * set_capacity() and get_capacity() currently don't use
+                * seqcounter to read/update the part0->nr_sects. Still init
+                * the counter as we can read the sectors in IO submission
+                * path using sequence counters.
+                *
+                * TODO: Ideally set_capacity() and get_capacity() should be
+                * converted to make use of bd_mutex and sequence counters.
+                */
+               seqcount_init(&disk->part0.nr_sects_seq);
                 hd_ref_init(&disk->part0);
  
                 disk->minors = minors;
diff --git a/block/ioctl.c b/block/ioctl.c

index ba15b2dbfb98ea55911109543f35889ea7b615da..4476e0e85d1687c08b31f81e7f633ae802b14814 100644 (file)
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -13,7 +13,7 @@ static int blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg __user
  {
         struct block_device *bdevp;
         struct gendisk *disk;
-       struct hd_struct *part;
+       struct hd_struct *part, *lpart;
         struct blkpg_ioctl_arg a;
         struct blkpg_partition p;
         struct disk_part_iter piter;
@@ -36,8 +36,8 @@ static int blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg __user
                 case BLKPG_ADD_PARTITION:
                         start = p.start >> 9;
                         length = p.length >> 9;
-                       /* check for fit in a hd_struct */ 
-                       if (sizeof(sector_t) == sizeof(long) && 
+                       /* check for fit in a hd_struct */
+                       if (sizeof(sector_t) == sizeof(long) &&
                             sizeof(long long) > sizeof(long)) {
                                 long pstart = start, plength = length;
                                 if (pstart != start || plength != length
@@ -91,6 +91,59 @@ static int blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg __user
                         mutex_unlock(&bdevp->bd_mutex);
                         bdput(bdevp);
  
+                       return 0;
+               case BLKPG_RESIZE_PARTITION:
+                       start = p.start >> 9;
+                       /* new partition length; p.length is in bytes, length in sectors */
+                       length = p.length >> 9;
+                       /* check for fit in a hd_struct */
+                       if (sizeof(sector_t) == sizeof(long) &&
+                           sizeof(long long) > sizeof(long)) {
+                               long pstart = start, plength = length;
+                               if (pstart != start || plength != length
+                                   || pstart < 0 || plength < 0)
+                                       return -EINVAL;
+                       }
+                       part = disk_get_part(disk, partno);
+                       if (!part)
+                               return -ENXIO;
+                       bdevp = bdget(part_devt(part));
+                       if (!bdevp) {
+                               disk_put_part(part);
+                               return -ENOMEM;
+                       }
+                       mutex_lock(&bdevp->bd_mutex);
+                       mutex_lock_nested(&bdev->bd_mutex, 1);
+                       if (start != part->start_sect) {
+                               mutex_unlock(&bdevp->bd_mutex);
+                               mutex_unlock(&bdev->bd_mutex);
+                               bdput(bdevp);
+                               disk_put_part(part);
+                               return -EINVAL;
+                       }
+                       /* overlap? */
+                       disk_part_iter_init(&piter, disk,
+                                           DISK_PITER_INCL_EMPTY);
+                       while ((lpart = disk_part_iter_next(&piter))) {
+                               if (lpart->partno != partno &&
+                                  !(start + length <= lpart->start_sect ||
+                                  start >= lpart->start_sect + lpart->nr_sects)
+                                  ) {
+                                       disk_part_iter_exit(&piter);
+                                       mutex_unlock(&bdevp->bd_mutex);
+                                       mutex_unlock(&bdev->bd_mutex);
+                                       bdput(bdevp);
+                                       disk_put_part(part);
+                                       return -EBUSY;
+                               }
+                       }
+                       disk_part_iter_exit(&piter);
+                       part_nr_sects_write(part, (sector_t)length);
+                       i_size_write(bdevp->bd_inode, p.length);
+                       mutex_unlock(&bdevp->bd_mutex);
+                       mutex_unlock(&bdev->bd_mutex);
+                       bdput(bdevp);
+                       disk_put_part(part);
                         return 0;
                 default:
                         return -EINVAL;
diff --git a/block/partition-generic.c b/block/partition-generic.c

index 6df5d6928a440c53b44f158ea99b98d7d9a48660..f1d14519cc040424e79fe0f35f446e4d4e7a123d 100644 (file)
--- a/block/partition-generic.c
+++ b/block/partition-generic.c
@@ -84,7 +84,7 @@ ssize_t part_size_show(struct device *dev,
                        struct device_attribute *attr, char *buf)
  {
         struct hd_struct *p = dev_to_part(dev);
-       return sprintf(buf, "%llu\n",(unsigned long long)p->nr_sects);
+       return sprintf(buf, "%llu\n",(unsigned long long)part_nr_sects_read(p));
  }
  
  static ssize_t part_ro_show(struct device *dev,
@@ -294,6 +294,8 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno,
                 err = -ENOMEM;
                 goto out_free;
         }
+
+       seqcount_init(&p->nr_sects_seq);
         pdev = part_to_dev(p);
  
         p->start_sect = start;
diff --git a/drivers/scsi/scsi_transport_fc.c b/drivers/scsi/scsi_transport_fc.c

index 2d1e68db9b3ffb5828482cdda3d797e3fa1d6677..e894ca7b54c0c895cd92cf15134f0b6196d71c43 100644 (file)
--- a/drivers/scsi/scsi_transport_fc.c
+++ b/drivers/scsi/scsi_transport_fc.c
@@ -4146,45 +4146,7 @@ fc_bsg_rportadd(struct Scsi_Host *shost, struct fc_rport *rport)
  static void
  fc_bsg_remove(struct request_queue *q)
  {
-       struct request *req; /* block request */
-       int counts; /* totals for request_list count and starved */
-
         if (q) {
-               /* Stop taking in new requests */
-               spin_lock_irq(q->queue_lock);
-               blk_stop_queue(q);
-
-               /* drain all requests in the queue */
-               while (1) {
-                       /* need the lock to fetch a request
-                        * this may fetch the same reqeust as the previous pass
-                        */
-                       req = blk_fetch_request(q);
-                       /* save requests in use and starved */
-                       counts = q->rq.count[0] + q->rq.count[1] +
-                               q->rq.starved[0] + q->rq.starved[1];
-                       spin_unlock_irq(q->queue_lock);
-                       /* any requests still outstanding? */
-                       if (counts == 0)
-                               break;
-
-                       /* This may be the same req as the previous iteration,
-                        * always send the blk_end_request_all after a prefetch.
-                        * It is not okay to not end the request because the
-                        * prefetch started the request.
-                        */
-                       if (req) {
-                               /* return -ENXIO to indicate that this queue is
-                                * going away
-                                */
-                               req->errors = -ENXIO;
-                               blk_end_request_all(req, -ENXIO);
-                       }
-
-                       msleep(200); /* allow bsg to possibly finish */
-                       spin_lock_irq(q->queue_lock);
-               }
-
                 bsg_unregister_queue(q);
                 blk_cleanup_queue(q);
         }
diff --git a/drivers/scsi/scsi_transport_iscsi.c b/drivers/scsi/scsi_transport_iscsi.c

index 09809d06eccb3eba2b63b91767d410454d1854fa..fa1dfaa83e32986061586c4fcb2f6f8e9e23eaf9 100644 (file)
--- a/drivers/scsi/scsi_transport_iscsi.c
+++ b/drivers/scsi/scsi_transport_iscsi.c
@@ -575,7 +575,7 @@ static int iscsi_remove_host(struct transport_container *tc,
         struct iscsi_cls_host *ihost = shost->shost_data;
  
         if (ihost->bsg_q) {
-               bsg_remove_queue(ihost->bsg_q);
+               bsg_unregister_queue(ihost->bsg_q);
                 blk_cleanup_queue(ihost->bsg_q);
         }
         return 0;
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h

index 07954b05b86cc3ac9c4f72aa97584b551914631e..3816ce8a08fc44aea59438e0ca1c7d1a897aaa7e 100644 (file)
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -46,16 +46,23 @@ struct blkcg_gq;
  struct request;
  typedef void (rq_end_io_fn)(struct request *, int);
  
+#define BLK_RL_SYNCFULL                (1U << 0)
+#define BLK_RL_ASYNCFULL       (1U << 1)
+
  struct request_list {
+       struct request_queue    *q;     /* the queue this rl belongs to */
+#ifdef CONFIG_BLK_CGROUP
+       struct blkcg_gq         *blkg;  /* blkg this request pool belongs to */
+#endif
         /*
          * count[], starved[], and wait[] are indexed by
          * BLK_RW_SYNC/BLK_RW_ASYNC
          */
-       int count[2];
-       int starved[2];
-       int elvpriv;
-       mempool_t *rq_pool;
-       wait_queue_head_t wait[2];
+       int                     count[2];
+       int                     starved[2];
+       mempool_t               *rq_pool;
+       wait_queue_head_t       wait[2];
+       unsigned int            flags;
  };
  
  /*
@@ -138,6 +145,7 @@ struct request {
         struct hd_struct *part;
         unsigned long start_time;
  #ifdef CONFIG_BLK_CGROUP
+       struct request_list *rl;                /* rl this rq is alloced from */
         unsigned long long start_time_ns;
         unsigned long long io_start_time_ns;    /* when passed to hardware */
  #endif
@@ -282,11 +290,16 @@ struct request_queue {
         struct list_head        queue_head;
         struct request          *last_merge;
         struct elevator_queue   *elevator;
+       int                     nr_rqs[2];      /* # allocated [a]sync rqs */
+       int                     nr_rqs_elvpriv; /* # allocated rqs w/ elvpriv */
  
         /*
-        * the queue request freelist, one for reads and one for writes
+        * If blkcg is not used, @q->root_rl serves all requests.  If blkcg
+        * is used, root blkg allocates from @q->root_rl and all other
+        * blkgs from their own blkg->rl.  Which one to use should be
+        * determined using bio_request_list().
          */
-       struct request_list     rq;
+       struct request_list     root_rl;
  
         request_fn_proc         *request_fn;
         make_request_fn         *make_request_fn;
@@ -561,27 +574,25 @@ static inline bool rq_is_sync(struct request *rq)
         return rw_is_sync(rq->cmd_flags);
  }
  
-static inline int blk_queue_full(struct request_queue *q, int sync)
+static inline bool blk_rl_full(struct request_list *rl, bool sync)
  {
-       if (sync)
-               return test_bit(QUEUE_FLAG_SYNCFULL, &q->queue_flags);
-       return test_bit(QUEUE_FLAG_ASYNCFULL, &q->queue_flags);
+       unsigned int flag = sync ? BLK_RL_SYNCFULL : BLK_RL_ASYNCFULL;
+
+       return rl->flags & flag;
  }
  
-static inline void blk_set_queue_full(struct request_queue *q, int sync)
+static inline void blk_set_rl_full(struct request_list *rl, bool sync)
  {
-       if (sync)
-               queue_flag_set(QUEUE_FLAG_SYNCFULL, q);
-       else
-               queue_flag_set(QUEUE_FLAG_ASYNCFULL, q);
+       unsigned int flag = sync ? BLK_RL_SYNCFULL : BLK_RL_ASYNCFULL;
+
+       rl->flags |= flag;
  }
  
-static inline void blk_clear_queue_full(struct request_queue *q, int sync)
+static inline void blk_clear_rl_full(struct request_list *rl, bool sync)
  {
-       if (sync)
-               queue_flag_clear(QUEUE_FLAG_SYNCFULL, q);
-       else
-               queue_flag_clear(QUEUE_FLAG_ASYNCFULL, q);
+       unsigned int flag = sync ? BLK_RL_SYNCFULL : BLK_RL_ASYNCFULL;
+
+       rl->flags &= ~flag;
  }
  
  
diff --git a/include/linux/blkpg.h b/include/linux/blkpg.h

index faf8a45af210053c535537c85cb25b1af228c7e0..a8519446c111ab02fb71e79e6b31fea1df666223 100644 (file)
--- a/include/linux/blkpg.h
+++ b/include/linux/blkpg.h
@@ -40,6 +40,7 @@ struct blkpg_ioctl_arg {
  /* The subfunctions (for the op field) */
  #define BLKPG_ADD_PARTITION    1
  #define BLKPG_DEL_PARTITION    2
+#define BLKPG_RESIZE_PARTITION 3
  
  /* Sizes of name fields. Unused at present. */
  #define BLKPG_DEVNAMELTH       64
diff --git a/include/linux/bsg-lib.h b/include/linux/bsg-lib.h

index f55ab8cdc10630f75131f96324278fd56cc00e81..4d0fb3df2f4adaa584e13e963b7bc6ffa02a4e26 100644 (file)
--- a/include/linux/bsg-lib.h
+++ b/include/linux/bsg-lib.h
@@ -67,7 +67,6 @@ void bsg_job_done(struct bsg_job *job, int result,
  int bsg_setup_queue(struct device *dev, struct request_queue *q, char *name,
                     bsg_job_fn *job_fn, int dd_job_size);
  void bsg_request_fn(struct request_queue *q);
-void bsg_remove_queue(struct request_queue *q);
  void bsg_goose_queue(struct request_queue *q);
  
  #endif
diff --git a/include/linux/genhd.h b/include/linux/genhd.h

index ae0aaa9d42faacf9d250ae509a93947fc1d06424..4f440b3e89fe7fd7e47ead6c6ed911f173205431 100644 (file)
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -97,7 +97,13 @@ struct partition_meta_info {
  
  struct hd_struct {
         sector_t start_sect;
+       /*
+        * nr_sects is protected by sequence counter. One might extend a
+        * partition while IO is happening to it and update of nr_sects
+        * can be non-atomic on 32bit machines with 64bit sector_t.
+        */
         sector_t nr_sects;
+       seqcount_t nr_sects_seq;
         sector_t alignment_offset;
         unsigned int discard_alignment;
         struct device __dev;
@@ -647,6 +653,57 @@ static inline void hd_struct_put(struct hd_struct *part)
                 __delete_partition(part);
  }
  
+/*
+ * Any access of part->nr_sects which is not protected by partition
+ * bd_mutex or gendisk bdev bd_mutex, should be done using this
+ * accessor function.
+ *
+ * Code written along the lines of i_size_read() and i_size_write().
+ * CONFIG_PREEMPT case optimizes the case of UP kernel with preemption
+ * on.
+ */
+static inline sector_t part_nr_sects_read(struct hd_struct *part)
+{
+#if BITS_PER_LONG==32 && defined(CONFIG_LBDAF) && defined(CONFIG_SMP)
+       sector_t nr_sects;
+       unsigned seq;
+       do {
+               seq = read_seqcount_begin(&part->nr_sects_seq);
+               nr_sects = part->nr_sects;
+       } while (read_seqcount_retry(&part->nr_sects_seq, seq));
+       return nr_sects;
+#elif BITS_PER_LONG==32 && defined(CONFIG_LBDAF) && defined(CONFIG_PREEMPT)
+       sector_t nr_sects;
+
+       preempt_disable();
+       nr_sects = part->nr_sects;
+       preempt_enable();
+       return nr_sects;
+#else
+       return part->nr_sects;
+#endif
+}
+
+/*
+ * Should be called with mutex lock held (typically bd_mutex) of partition
+ * to provide mutual exclusion among writers, otherwise seqcount might be
+ * left in wrong state leaving the readers spinning infinitely.
+ */
+static inline void part_nr_sects_write(struct hd_struct *part, sector_t size)
+{
+#if BITS_PER_LONG==32 && defined(CONFIG_LBDAF) && defined(CONFIG_SMP)
+       write_seqcount_begin(&part->nr_sects_seq);
+       part->nr_sects = size;
+       write_seqcount_end(&part->nr_sects_seq);
+#elif BITS_PER_LONG==32 && defined(CONFIG_LBDAF) && defined(CONFIG_PREEMPT)
+       preempt_disable();
+       part->nr_sects = size;
+       preempt_enable();
+#else
+       part->nr_sects = size;
+#endif
+}
+
  #else /* CONFIG_BLOCK */
  
  static inline void printk_all_partitions(void) { }
diff --git a/include/linux/mempool.h b/include/linux/mempool.h

index 7c08052e332111a7aa14d3f4bd7f0390b2a390c9..39ed62ab5b8a38ef3aafa3729767dc7d7c7587ef 100644 (file)
--- a/include/linux/mempool.h
+++ b/include/linux/mempool.h
@@ -26,7 +26,8 @@ typedef struct mempool_s {
  extern mempool_t *mempool_create(int min_nr, mempool_alloc_t *alloc_fn,
                         mempool_free_t *free_fn, void *pool_data);
  extern mempool_t *mempool_create_node(int min_nr, mempool_alloc_t *alloc_fn,
-                       mempool_free_t *free_fn, void *pool_data, int nid);
+                       mempool_free_t *free_fn, void *pool_data,
+                       gfp_t gfp_mask, int nid);
  
  extern int mempool_resize(mempool_t *pool, int new_min_nr, gfp_t gfp_mask);
  extern void mempool_destroy(mempool_t *pool);
diff --git a/mm/mempool.c b/mm/mempool.c

index d9049811f3521bc690ff1535831b87dc3b682fe4..54990476c049b2fa60c5e740a0533ea70df1f856 100644 (file)
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -63,19 +63,21 @@ EXPORT_SYMBOL(mempool_destroy);
  mempool_t *mempool_create(int min_nr, mempool_alloc_t *alloc_fn,
                                 mempool_free_t *free_fn, void *pool_data)
  {
-       return  mempool_create_node(min_nr,alloc_fn,free_fn, pool_data,-1);
+       return mempool_create_node(min_nr,alloc_fn,free_fn, pool_data,
+                                  GFP_KERNEL, NUMA_NO_NODE);
  }
  EXPORT_SYMBOL(mempool_create);
  
  mempool_t *mempool_create_node(int min_nr, mempool_alloc_t *alloc_fn,
-                       mempool_free_t *free_fn, void *pool_data, int node_id)
+                              mempool_free_t *free_fn, void *pool_data,
+                              gfp_t gfp_mask, int node_id)
  {
         mempool_t *pool;
-       pool = kmalloc_node(sizeof(*pool), GFP_KERNEL | __GFP_ZERO, node_id);
+       pool = kmalloc_node(sizeof(*pool), gfp_mask | __GFP_ZERO, node_id);
         if (!pool)
                 return NULL;
         pool->elements = kmalloc_node(min_nr * sizeof(void *),
-                                       GFP_KERNEL, node_id);
+                                     gfp_mask, node_id);
         if (!pool->elements) {
                 kfree(pool);
                 return NULL;
@@ -93,7 +95,7 @@ mempool_t *mempool_create_node(int min_nr, mempool_alloc_t *alloc_fn,
         while (pool->curr_nr < pool->min_nr) {
                 void *element;
  
-               element = pool->alloc(GFP_KERNEL, pool->pool_data);
+               element = pool->alloc(gfp_mask, pool->pool_data);
                 if (unlikely(!element)) {
                         mempool_destroy(pool);
                         return NULL;
author	Linus Torvalds <torvalds@linux-foundation.org>
	Wed, 1 Aug 2012 16:02:41 +0000 (09:02 -0700)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Wed, 1 Aug 2012 16:02:41 +0000 (09:02 -0700)
Documentation/block/queue-sysfs.txt		patch \| blob \| history
block/blk-cgroup.c		patch \| blob \| history
block/blk-cgroup.h		patch \| blob \| history
block/blk-core.c		patch \| blob \| history
block/blk-ioc.c		patch \| blob \| history
block/blk-settings.c		patch \| blob \| history
block/blk-sysfs.c		patch \| blob \| history
block/blk-throttle.c		patch \| blob \| history
block/blk.h		patch \| blob \| history
block/bsg-lib.c		patch \| blob \| history
block/genhd.c		patch \| blob \| history
block/ioctl.c		patch \| blob \| history
block/partition-generic.c		patch \| blob \| history
drivers/scsi/scsi_transport_fc.c		patch \| blob \| history
drivers/scsi/scsi_transport_iscsi.c		patch \| blob \| history
include/linux/blkdev.h		patch \| blob \| history
include/linux/blkpg.h		patch \| blob \| history
include/linux/bsg-lib.h		patch \| blob \| history
include/linux/genhd.h		patch \| blob \| history
include/linux/mempool.h		patch \| blob \| history
mm/mempool.c		patch \| blob \| history