#include <linux/fault-inject.h>
#include <linux/list_sort.h>
#include <linux/delay.h>
+#include <linux/ratelimit.h>
#define CREATE_TRACE_POINTS
#include <trace/events/block.h>
#include "blk.h"
+#include "blk-cgroup.h"
EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_remap);
EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_remap);
*
* This function does not cancel any asynchronous activity arising
* out of elevator or throttling code. That would require elevaotor_exit()
- * and blk_throtl_exit() to be called with queue lock initialized.
+ * and blkcg_exit_queue() to be called with queue lock initialized.
*
*/
void blk_sync_queue(struct request_queue *q)
*/
void blk_drain_queue(struct request_queue *q, bool drain_all)
{
+ int i;
+
while (true) {
bool drain = false;
- int i;
spin_lock_irq(q->queue_lock);
- elv_drain_elevator(q);
- if (drain_all)
- blk_throtl_drain(q);
+ /*
+ * The caller might be trying to drain @q before its
+ * elevator is initialized.
+ */
+ if (q->elevator)
+ elv_drain_elevator(q);
+
+ blkcg_drain_queue(q);
/*
* This function might be called on a queue which failed
- * driver init after queue creation. Some drivers
- * (e.g. fd) get unhappy in such cases. Kick queue iff
- * dispatch queue has something on it.
+ * driver init after queue creation or is not yet fully
+ * active yet. Some drivers (e.g. fd and loop) get unhappy
+ * in such cases. Kick queue iff dispatch queue has
+ * something on it and @q has request_fn set.
*/
- if (!list_empty(&q->queue_head))
+ if (!list_empty(&q->queue_head) && q->request_fn)
__blk_run_queue(q);
- drain |= q->rq.elvpriv;
+ drain |= q->nr_rqs_elvpriv;
/*
* Unfortunately, requests are queued at and tracked from
if (drain_all) {
drain |= !list_empty(&q->queue_head);
for (i = 0; i < 2; i++) {
- drain |= q->rq.count[i];
+ drain |= q->nr_rqs[i];
drain |= q->in_flight[i];
drain |= !list_empty(&q->flush_queue[i]);
}
break;
msleep(10);
}
+
+ /*
+ * With queue marked dead, any woken up waiter will fail the
+ * allocation path, so the wakeup chaining is lost and we're
+ * left with hung waiters. We need to wake up those waiters.
+ */
+ if (q->request_fn) {
+ spin_lock_irq(q->queue_lock);
+ for (i = 0; i < ARRAY_SIZE(q->rq.wait); i++)
+ wake_up_all(&q->rq.wait[i]);
+ spin_unlock_irq(q->queue_lock);
+ }
}
+/**
+ * blk_queue_bypass_start - enter queue bypass mode
+ * @q: queue of interest
+ *
+ * In bypass mode, only the dispatch FIFO queue of @q is used. This
+ * function makes @q enter bypass mode and drains all requests which were
+ * throttled or issued before. On return, it's guaranteed that no request
+ * is being throttled or has ELVPRIV set and blk_queue_bypass() %true
+ * inside queue or RCU read lock.
+ */
+void blk_queue_bypass_start(struct request_queue *q)
+{
+ bool drain;
+
+ spin_lock_irq(q->queue_lock);
+ drain = !q->bypass_depth++;
+ queue_flag_set(QUEUE_FLAG_BYPASS, q);
+ spin_unlock_irq(q->queue_lock);
+
+ if (drain) {
+ blk_drain_queue(q, false);
+ /* ensure blk_queue_bypass() is %true inside RCU read lock */
+ synchronize_rcu();
+ }
+}
+EXPORT_SYMBOL_GPL(blk_queue_bypass_start);
+
+/**
+ * blk_queue_bypass_end - leave queue bypass mode
+ * @q: queue of interest
+ *
+ * Leave bypass mode and restore the normal queueing behavior.
+ */
+void blk_queue_bypass_end(struct request_queue *q)
+{
+ spin_lock_irq(q->queue_lock);
+ if (!--q->bypass_depth)
+ queue_flag_clear(QUEUE_FLAG_BYPASS, q);
+ WARN_ON_ONCE(q->bypass_depth < 0);
+ spin_unlock_irq(q->queue_lock);
+}
+EXPORT_SYMBOL_GPL(blk_queue_bypass_end);
+
/**
* blk_cleanup_queue - shutdown a request queue
* @q: request queue to shutdown
/* mark @q DEAD, no new request or merges will be allowed afterwards */
mutex_lock(&q->sysfs_lock);
queue_flag_set_unlocked(QUEUE_FLAG_DEAD, q);
-
spin_lock_irq(lock);
+
+ /*
+ * Dead queue is permanently in bypass mode till released. Note
+ * that, unlike blk_queue_bypass_start(), we aren't performing
+ * synchronize_rcu() after entering bypass mode to avoid the delay
+ * as some drivers create and destroy a lot of queues while
+ * probing. This is still safe because blk_release_queue() will be
+ * called only after the queue refcnt drops to zero and nothing,
+ * RCU or not, would be traversing the queue by then.
+ */
+ q->bypass_depth++;
+ queue_flag_set(QUEUE_FLAG_BYPASS, q);
+
queue_flag_set(QUEUE_FLAG_NOMERGES, q);
queue_flag_set(QUEUE_FLAG_NOXMERGES, q);
queue_flag_set(QUEUE_FLAG_DEAD, q);
-
- if (q->queue_lock != &q->__queue_lock)
- q->queue_lock = &q->__queue_lock;
-
spin_unlock_irq(lock);
mutex_unlock(&q->sysfs_lock);
- /*
- * Drain all requests queued before DEAD marking. The caller might
- * be trying to tear down @q before its elevator is initialized, in
- * which case we don't want to call into draining.
- */
- if (q->elevator)
- blk_drain_queue(q, true);
+ /* drain all requests queued before DEAD marking */
+ blk_drain_queue(q, true);
/* @q won't process any more request, flush async actions */
del_timer_sync(&q->backing_dev_info.laptop_mode_wb_timer);
blk_sync_queue(q);
+ spin_lock_irq(lock);
+ if (q->queue_lock != &q->__queue_lock)
+ q->queue_lock = &q->__queue_lock;
+ spin_unlock_irq(lock);
+
/* @q is and will stay empty, shutdown and put */
blk_put_queue(q);
}
EXPORT_SYMBOL(blk_cleanup_queue);
-static int blk_init_free_list(struct request_queue *q)
+int blk_init_rl(struct request_list *rl, struct request_queue *q,
+ gfp_t gfp_mask)
{
- struct request_list *rl = &q->rq;
-
if (unlikely(rl->rq_pool))
return 0;
+ rl->q = q;
rl->count[BLK_RW_SYNC] = rl->count[BLK_RW_ASYNC] = 0;
rl->starved[BLK_RW_SYNC] = rl->starved[BLK_RW_ASYNC] = 0;
- rl->elvpriv = 0;
init_waitqueue_head(&rl->wait[BLK_RW_SYNC]);
init_waitqueue_head(&rl->wait[BLK_RW_ASYNC]);
rl->rq_pool = mempool_create_node(BLKDEV_MIN_RQ, mempool_alloc_slab,
- mempool_free_slab, request_cachep, q->node);
-
+ mempool_free_slab, request_cachep,
+ gfp_mask, q->node);
if (!rl->rq_pool)
return -ENOMEM;
return 0;
}
+void blk_exit_rl(struct request_list *rl)
+{
+ if (rl->rq_pool)
+ mempool_destroy(rl->rq_pool);
+}
+
struct request_queue *blk_alloc_queue(gfp_t gfp_mask)
{
return blk_alloc_queue_node(gfp_mask, -1);
if (err)
goto fail_id;
- if (blk_throtl_init(q))
- goto fail_id;
-
setup_timer(&q->backing_dev_info.laptop_mode_wb_timer,
laptop_mode_timer_fn, (unsigned long) q);
setup_timer(&q->timeout, blk_rq_timed_out_timer, (unsigned long) q);
+ INIT_LIST_HEAD(&q->queue_head);
INIT_LIST_HEAD(&q->timeout_list);
INIT_LIST_HEAD(&q->icq_list);
+#ifdef CONFIG_BLK_CGROUP
+ INIT_LIST_HEAD(&q->blkg_list);
+#endif
INIT_LIST_HEAD(&q->flush_queue[0]);
INIT_LIST_HEAD(&q->flush_queue[1]);
INIT_LIST_HEAD(&q->flush_data_in_flight);
*/
q->queue_lock = &q->__queue_lock;
+ /*
+ * A queue starts its life with bypass turned on to avoid
+ * unnecessary bypass on/off overhead and nasty surprises during
+ * init. The initial bypass will be finished at the end of
+ * blk_init_allocated_queue().
+ */
+ q->bypass_depth = 1;
+ __set_bit(QUEUE_FLAG_BYPASS, &q->queue_flags);
+
+ if (blkcg_init_queue(q))
+ goto fail_id;
+
return q;
fail_id:
if (!q)
return NULL;
- if (blk_init_free_list(q))
+ if (blk_init_rl(&q->rq, q, GFP_KERNEL))
return NULL;
q->request_fn = rfn;
q->sg_reserved_size = INT_MAX;
- /*
- * all done
- */
- if (!elevator_init(q, NULL)) {
- blk_queue_congestion_threshold(q);
- return q;
- }
+ /* init elevator */
+ if (elevator_init(q, NULL))
+ return NULL;
- return NULL;
+ blk_queue_congestion_threshold(q);
+
+ /* all done, end the initial bypass */
+ blk_queue_bypass_end(q);
+ return q;
}
EXPORT_SYMBOL(blk_init_allocated_queue);
}
EXPORT_SYMBOL(blk_get_queue);
-static inline void blk_free_request(struct request_queue *q, struct request *rq)
+static inline void blk_free_request(struct request_list *rl, struct request *rq)
{
if (rq->cmd_flags & REQ_ELVPRIV) {
- elv_put_request(q, rq);
+ elv_put_request(rl->q, rq);
if (rq->elv.icq)
put_io_context(rq->elv.icq->ioc);
}
- mempool_free(rq, q->rq.rq_pool);
-}
-
-static struct request *
-blk_alloc_request(struct request_queue *q, struct io_cq *icq,
- unsigned int flags, gfp_t gfp_mask)
-{
- struct request *rq = mempool_alloc(q->rq.rq_pool, gfp_mask);
-
- if (!rq)
- return NULL;
-
- blk_rq_init(q, rq);
-
- rq->cmd_flags = flags | REQ_ALLOCED;
-
- if (flags & REQ_ELVPRIV) {
- rq->elv.icq = icq;
- if (unlikely(elv_set_request(q, rq, gfp_mask))) {
- mempool_free(rq, q->rq.rq_pool);
- return NULL;
- }
- /* @rq->elv.icq holds on to io_context until @rq is freed */
- if (icq)
- get_io_context(icq->ioc);
- }
-
- return rq;
+ mempool_free(rq, rl->rq_pool);
}
/*
ioc->last_waited = jiffies;
}
-static void __freed_request(struct request_queue *q, int sync)
+static void __freed_request(struct request_list *rl, int sync)
{
- struct request_list *rl = &q->rq;
+ struct request_queue *q = rl->q;
if (rl->count[sync] < queue_congestion_off_threshold(q))
blk_clear_queue_congested(q, sync);
if (waitqueue_active(&rl->wait[sync]))
wake_up(&rl->wait[sync]);
- blk_clear_queue_full(q, sync);
+ blk_clear_rl_full(rl, sync);
}
}
* A request has just been released. Account for it, update the full and
* congestion status, wake up any waiters. Called under q->queue_lock.
*/
-static void freed_request(struct request_queue *q, unsigned int flags)
+static void freed_request(struct request_list *rl, unsigned int flags)
{
- struct request_list *rl = &q->rq;
+ struct request_queue *q = rl->q;
int sync = rw_is_sync(flags);
+ q->nr_rqs[sync]--;
rl->count[sync]--;
if (flags & REQ_ELVPRIV)
- rl->elvpriv--;
+ q->nr_rqs_elvpriv--;
- __freed_request(q, sync);
+ __freed_request(rl, sync);
if (unlikely(rl->starved[sync ^ 1]))
- __freed_request(q, sync ^ 1);
+ __freed_request(rl, sync ^ 1);
}
/*
}
/**
- * get_request - get a free request
- * @q: request_queue to allocate request from
+ * rq_ioc - determine io_context for request allocation
+ * @bio: request being allocated is for this bio (can be %NULL)
+ *
+ * Determine io_context to use for request allocation for @bio. May return
+ * %NULL if %current->io_context doesn't exist.
+ */
+static struct io_context *rq_ioc(struct bio *bio)
+{
+#ifdef CONFIG_BLK_CGROUP
+ if (bio && bio->bi_ioc)
+ return bio->bi_ioc;
+#endif
+ return current->io_context;
+}
+
+/**
+ * __get_request - get a free request
+ * @rl: request list to allocate from
* @rw_flags: RW and SYNC flags
* @bio: bio to allocate request for (can be %NULL)
* @gfp_mask: allocation mask
* Returns %NULL on failure, with @q->queue_lock held.
* Returns !%NULL on success, with @q->queue_lock *not held*.
*/
-static struct request *get_request(struct request_queue *q, int rw_flags,
- struct bio *bio, gfp_t gfp_mask)
+static struct request *__get_request(struct request_list *rl, int rw_flags,
+ struct bio *bio, gfp_t gfp_mask)
{
- struct request *rq = NULL;
- struct request_list *rl = &q->rq;
- struct elevator_type *et;
- struct io_context *ioc;
+ struct request_queue *q = rl->q;
+ struct request *rq;
+ struct elevator_type *et = q->elevator->type;
+ struct io_context *ioc = rq_ioc(bio);
struct io_cq *icq = NULL;
const bool is_sync = rw_is_sync(rw_flags) != 0;
- bool retried = false;
int may_queue;
-retry:
- et = q->elevator->type;
- ioc = current->io_context;
if (unlikely(blk_queue_dead(q)))
return NULL;
if (rl->count[is_sync]+1 >= queue_congestion_on_threshold(q)) {
if (rl->count[is_sync]+1 >= q->nr_requests) {
- /*
- * We want ioc to record batching state. If it's
- * not already there, creating a new one requires
- * dropping queue_lock, which in turn requires
- * retesting conditions to avoid queue hang.
- */
- if (!ioc && !retried) {
- spin_unlock_irq(q->queue_lock);
- create_io_context(current, gfp_mask, q->node);
- spin_lock_irq(q->queue_lock);
- retried = true;
- goto retry;
- }
-
/*
* The queue will fill after this allocation, so set
* it as full, and mark this process as "batching".
* This process will be allowed to complete a batch of
* requests, others will be blocked.
*/
- if (!blk_queue_full(q, is_sync)) {
+ if (!blk_rl_full(rl, is_sync)) {
ioc_set_batching(q, ioc);
- blk_set_queue_full(q, is_sync);
+ blk_set_rl_full(rl, is_sync);
} else {
if (may_queue != ELV_MQUEUE_MUST
&& !ioc_batching(q, ioc)) {
* process is not a "batcher", and not
* exempted by the IO scheduler
*/
- goto out;
+ return NULL;
}
}
}
* allocated with any setting of ->nr_requests
*/
if (rl->count[is_sync] >= (3 * q->nr_requests / 2))
- goto out;
+ return NULL;
+ q->nr_rqs[is_sync]++;
rl->count[is_sync]++;
rl->starved[is_sync] = 0;
* Also, lookup icq while holding queue_lock. If it doesn't exist,
* it will be created after releasing queue_lock.
*/
- if (blk_rq_should_init_elevator(bio) &&
- !test_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags)) {
+ if (blk_rq_should_init_elevator(bio) && !blk_queue_bypass(q)) {
rw_flags |= REQ_ELVPRIV;
- rl->elvpriv++;
+ q->nr_rqs_elvpriv++;
if (et->icq_cache && ioc)
icq = ioc_lookup_icq(ioc, q);
}
rw_flags |= REQ_IO_STAT;
spin_unlock_irq(q->queue_lock);
- /* create icq if missing */
- if ((rw_flags & REQ_ELVPRIV) && unlikely(et->icq_cache && !icq)) {
- icq = ioc_create_icq(q, gfp_mask);
- if (!icq)
- goto fail_icq;
- }
-
- rq = blk_alloc_request(q, icq, rw_flags, gfp_mask);
+ /* allocate and init request */
+ rq = mempool_alloc(rl->rq_pool, gfp_mask);
+ if (!rq)
+ goto fail_alloc;
-fail_icq:
- if (unlikely(!rq)) {
- /*
- * Allocation failed presumably due to memory. Undo anything
- * we might have messed up.
- *
- * Allocating task should really be put onto the front of the
- * wait queue, but this is pretty rare.
- */
- spin_lock_irq(q->queue_lock);
- freed_request(q, rw_flags);
+ blk_rq_init(q, rq);
+ rq->cmd_flags = rw_flags | REQ_ALLOCED;
+
+ /* init elvpriv */
+ if (rw_flags & REQ_ELVPRIV) {
+ if (unlikely(et->icq_cache && !icq)) {
+ if (ioc)
+ icq = ioc_create_icq(ioc, q, gfp_mask);
+ if (!icq)
+ goto fail_elvpriv;
+ }
- /*
- * in the very unlikely event that allocation failed and no
- * requests for this direction was pending, mark us starved
- * so that freeing of a request in the other direction will
- * notice us. another possible fix would be to split the
- * rq mempool into READ and WRITE
- */
-rq_starved:
- if (unlikely(rl->count[is_sync] == 0))
- rl->starved[is_sync] = 1;
+ rq->elv.icq = icq;
+ if (unlikely(elv_set_request(q, rq, bio, gfp_mask)))
+ goto fail_elvpriv;
- goto out;
+ /* @rq->elv.icq holds io_context until @rq is freed */
+ if (icq)
+ get_io_context(icq->ioc);
}
-
+out:
/*
* ioc may be NULL here, and ioc_batching will be false. That's
* OK, if the queue is under the request limit then requests need
ioc->nr_batch_requests--;
trace_block_getrq(q, bio, rw_flags & 1);
-out:
return rq;
+
+fail_elvpriv:
+ /*
+ * elvpriv init failed. ioc, icq and elvpriv aren't mempool backed
+ * and may fail indefinitely under memory pressure and thus
+ * shouldn't stall IO. Treat this request as !elvpriv. This will
+ * disturb iosched and blkcg but weird is bettern than dead.
+ */
+ printk_ratelimited(KERN_WARNING "%s: request aux data allocation failed, iosched may be disturbed\n",
+ dev_name(q->backing_dev_info.dev));
+
+ rq->cmd_flags &= ~REQ_ELVPRIV;
+ rq->elv.icq = NULL;
+
+ spin_lock_irq(q->queue_lock);
+ q->nr_rqs_elvpriv--;
+ spin_unlock_irq(q->queue_lock);
+ goto out;
+
+fail_alloc:
+ /*
+ * Allocation failed presumably due to memory. Undo anything we
+ * might have messed up.
+ *
+ * Allocating task should really be put onto the front of the wait
+ * queue, but this is pretty rare.
+ */
+ spin_lock_irq(q->queue_lock);
+ freed_request(rl, rw_flags);
+
+ /*
+ * in the very unlikely event that allocation failed and no
+ * requests for this direction was pending, mark us starved so that
+ * freeing of a request in the other direction will notice
+ * us. another possible fix would be to split the rq mempool into
+ * READ and WRITE
+ */
+rq_starved:
+ if (unlikely(rl->count[is_sync] == 0))
+ rl->starved[is_sync] = 1;
+ return NULL;
}
/**
- * get_request_wait - get a free request with retry
+ * get_request - get a free request
* @q: request_queue to allocate request from
* @rw_flags: RW and SYNC flags
* @bio: bio to allocate request for (can be %NULL)
+ * @gfp_mask: allocation mask
*
- * Get a free request from @q. This function keeps retrying under memory
- * pressure and fails iff @q is dead.
+ * Get a free request from @q. If %__GFP_WAIT is set in @gfp_mask, this
+ * function keeps retrying under memory pressure and fails iff @q is dead.
*
* Must be callled with @q->queue_lock held and,
* Returns %NULL on failure, with @q->queue_lock held.
* Returns !%NULL on success, with @q->queue_lock *not held*.
*/
-static struct request *get_request_wait(struct request_queue *q, int rw_flags,
- struct bio *bio)
+static struct request *get_request(struct request_queue *q, int rw_flags,
+ struct bio *bio, gfp_t gfp_mask)
{
const bool is_sync = rw_is_sync(rw_flags) != 0;
+ DEFINE_WAIT(wait);
+ struct request_list *rl = &q->rq;
struct request *rq;
+retry:
+ rq = __get_request(&q->rq, rw_flags, bio, gfp_mask);
+ if (rq)
+ return rq;
- rq = get_request(q, rw_flags, bio, GFP_NOIO);
- while (!rq) {
- DEFINE_WAIT(wait);
- struct request_list *rl = &q->rq;
-
- if (unlikely(blk_queue_dead(q)))
- return NULL;
-
- prepare_to_wait_exclusive(&rl->wait[is_sync], &wait,
- TASK_UNINTERRUPTIBLE);
+ if (!(gfp_mask & __GFP_WAIT) || unlikely(blk_queue_dead(q)))
+ return NULL;
- trace_block_sleeprq(q, bio, rw_flags & 1);
+ /* wait on @rl and retry */
+ prepare_to_wait_exclusive(&rl->wait[is_sync], &wait,
+ TASK_UNINTERRUPTIBLE);
- spin_unlock_irq(q->queue_lock);
- io_schedule();
+ trace_block_sleeprq(q, bio, rw_flags & 1);
- /*
- * After sleeping, we become a "batching" process and
- * will be able to allocate at least one request, and
- * up to a big batch of them for a small period time.
- * See ioc_batching, ioc_set_batching
- */
- create_io_context(current, GFP_NOIO, q->node);
- ioc_set_batching(q, current->io_context);
+ spin_unlock_irq(q->queue_lock);
+ io_schedule();
- spin_lock_irq(q->queue_lock);
- finish_wait(&rl->wait[is_sync], &wait);
+ /*
+ * After sleeping, we become a "batching" process and will be able
+ * to allocate at least one request, and up to a big batch of them
+ * for a small period time. See ioc_batching, ioc_set_batching
+ */
+ ioc_set_batching(q, current->io_context);
- rq = get_request(q, rw_flags, bio, GFP_NOIO);
- };
+ spin_lock_irq(q->queue_lock);
+ finish_wait(&rl->wait[is_sync], &wait);
- return rq;
+ goto retry;
}
struct request *blk_get_request(struct request_queue *q, int rw, gfp_t gfp_mask)
BUG_ON(rw != READ && rw != WRITE);
+ /* create ioc upfront */
+ create_io_context(gfp_mask, q->node);
+
spin_lock_irq(q->queue_lock);
- if (gfp_mask & __GFP_WAIT)
- rq = get_request_wait(q, rw, NULL);
- else
- rq = get_request(q, rw, NULL, gfp_mask);
+ rq = get_request(q, rw, NULL, gfp_mask);
if (!rq)
spin_unlock_irq(q->queue_lock);
/* q->queue_lock is unlocked at this point */
BUG_ON(!list_empty(&req->queuelist));
BUG_ON(!hlist_unhashed(&req->hash));
- blk_free_request(q, req);
- freed_request(q, flags);
+ blk_free_request(&q->rq, req);
+ freed_request(&q->rq, flags);
}
}
EXPORT_SYMBOL_GPL(__blk_put_request);
* Grab a free request. This is might sleep but can not fail.
* Returns with the queue unlocked.
*/
- req = get_request_wait(q, rw_flags, bio);
+ req = get_request(q, rw_flags, bio, GFP_NOIO);
if (unlikely(!req)) {
bio_endio(bio, -ENODEV); /* @q is dead */
goto out_unlock;
goto end_io;
}
+ /*
+ * Various block parts want %current->io_context and lazy ioc
+ * allocation ends up trading a lot of pain for a small amount of
+ * memory. Just allocate it upfront. This may fail and block
+ * layer knows how to live with it.
+ */
+ create_io_context(GFP_ATOMIC, q->node);
+
if (blk_throtl_bio(q, bio))
return false; /* throttled, will be resubmitted later */