#include "blk-mq.h"
#include "blk-mq-tag.h"
+ /*
+ * Acquire a tag with __GFP_WAIT and immediately put it back. The tag
+ * itself is discarded, so presumably the point is only to wait until a
+ * tag can be had (NOTE(review): confirm blk_mq_get_tag() sleeps here).
+ * 'zero' stands in for the last_tag cache argument of both calls.
+ */
- void blk_mq_wait_for_tags(struct blk_mq_tags *tags, struct blk_mq_hw_ctx *hctx,
- bool reserved)
+ void blk_mq_wait_for_tags(struct blk_mq_hw_ctx *hctx, bool reserved)
{
int tag, zero = 0;
- tag = blk_mq_get_tag(tags, hctx, &zero, __GFP_WAIT, reserved);
- blk_mq_put_tag(tags, tag, &zero);
+ tag = blk_mq_get_tag(hctx, &zero, __GFP_WAIT, reserved);
+ blk_mq_put_tag(hctx, tag, &zero);
}
static bool bt_has_free_tags(struct blk_mq_bitmap_tags *bt)
int i;
for (i = 0; i < bt->map_nr; i++) {
- struct blk_mq_bitmap *bm = &bt->map[i];
+ struct blk_align_bitmap *bm = &bt->map[i];
int ret;
ret = find_first_zero_bit(&bm->word, bm->depth);
return bt_has_free_tags(&tags->bitmap_tags);
}
-static int __bt_get_word(struct blk_mq_bitmap *bm, unsigned int last_tag)
+/*
+ * Advance a wait-queue index by one, wrapping back to 0 after the last
+ * of the BT_WAIT_QUEUES slots.
+ */
+static inline void bt_index_inc(unsigned int *index)
+{
+	const unsigned int next = *index + 1;
+
+	/* BT_WAIT_QUEUES is a power of two, so masking wraps the index. */
+	*index = next & (BT_WAIT_QUEUES - 1);
+}
+
+/*
+ * Mark @hctx as an active user of its tag map (hctx->tags).  Whoever
+ * flips BLK_MQ_S_TAG_ACTIVE from clear to set also bumps the map's
+ * active_queues count.  Always returns true.
+ */
+bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx)
+{
+	/* Skip the atomic read-modify-write when the bit is already set. */
+	if (test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state))
+		return true;
+
+	/* Only the caller that actually sets the bit counts the queue. */
+	if (!test_and_set_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state))
+		atomic_inc(&hctx->tags->active_queues);
+
+	return true;
+}
+
+/*
+ * If a previously busy queue goes inactive, potential waiters could now
+ * be allowed to queue. Wake them up and check.
+ *
+ * Does nothing unless BLK_MQ_S_TAG_ACTIVE was set on @hctx.  Otherwise
+ * the bit is cleared, the tag map's active_queues count is dropped and
+ * every wait queue of the non-reserved bitmap that has waiters is woken.
+ */
+void __blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx)
+{
+	struct blk_mq_tags *tags = hctx->tags;
+	struct blk_mq_bitmap_tags *bt;
+	/* unsigned to match bt->wake_index and bt_index_inc()'s parameter */
+	unsigned int wake_index;
+	int i;
+
+	if (!test_and_clear_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state))
+		return;
+
+	atomic_dec(&tags->active_queues);
+
+	/*
+	 * Will only throttle depth on non-reserved tags
+	 */
+	bt = &tags->bitmap_tags;
+
+	/* Visit all BT_WAIT_QUEUES slots once, starting at the current one. */
+	wake_index = bt->wake_index;
+	for (i = 0; i < BT_WAIT_QUEUES; i++) {
+		struct bt_wait_state *bs = &bt->bs[wake_index];
+
+		if (waitqueue_active(&bs->wait))
+			wake_up(&bs->wait);
+
+		bt_index_inc(&wake_index);
+	}
+}
+
+/*
+ * For shared tag users, we track the number of currently active users
+ * and attempt to provide a fair share of the tag depth for each of them.
+ *
+ * Returns true when @hctx may take another tag from @bt, false when it
+ * already has its share of requests in flight (hctx->nr_active).
+ */
+static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx,
+				  struct blk_mq_bitmap_tags *bt)
+{
+	unsigned int nr_users, fair_share;
+
+	/* No hctx, unshared tags, or a queue not marked active: no limit. */
+	if (!hctx)
+		return true;
+	if (!(hctx->flags & BLK_MQ_F_TAG_SHARED))
+		return true;
+	if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state))
+		return true;
+
+	/*
+	 * Don't try dividing an ant
+	 */
+	if (bt->depth == 1)
+		return true;
+
+	nr_users = atomic_read(&hctx->tags->active_queues);
+	if (nr_users == 0)
+		return true;
+
+	/*
+	 * Depth divided by the active users, rounded up, but never fewer
+	 * than four tags.
+	 */
+	fair_share = (bt->depth + nr_users - 1) / nr_users;
+	if (fair_share < 4U)
+		fair_share = 4U;
+
+	return atomic_read(&hctx->nr_active) < fair_share;
+}
+
+static int __bt_get_word(struct blk_align_bitmap *bm, unsigned int last_tag)
{
int tag, org_last_tag, end;
* multiple users will tend to stick to different cachelines, at least
* until the map is exhausted.
*/
- static int __bt_get(struct blk_mq_bitmap_tags *bt, unsigned int *tag_cache)
+ static int __bt_get(struct blk_mq_hw_ctx *hctx, struct blk_mq_bitmap_tags *bt,
+ unsigned int *tag_cache)
{
unsigned int last_tag, org_last_tag;
int index, i, tag;
+ if (!hctx_may_queue(hctx, bt))
+ return -1;
+
last_tag = org_last_tag = *tag_cache;
index = TAG_TO_INDEX(bt, last_tag);
return tag;
}
- static inline void bt_index_inc(unsigned int *index)
- {
- *index = (*index + 1) & (BT_WAIT_QUEUES - 1);
- }
-
static struct bt_wait_state *bt_wait_ptr(struct blk_mq_bitmap_tags *bt,
struct blk_mq_hw_ctx *hctx)
{
DEFINE_WAIT(wait);
int tag;
- tag = __bt_get(bt, last_tag);
+ tag = __bt_get(hctx, bt, last_tag);
if (tag != -1)
return tag;
was_empty = list_empty(&wait.task_list);
prepare_to_wait(&bs->wait, &wait, TASK_UNINTERRUPTIBLE);
- tag = __bt_get(bt, last_tag);
+ tag = __bt_get(hctx, bt, last_tag);
if (tag != -1)
break;
return tag;
}
+ /*
+ * Allocate a tag from hctx->tags. Non-reserved requests go through
+ * __blk_mq_get_tag() using the caller's @last_tag cache; reserved ones
+ * go through __blk_mq_get_reserved_tag(). The tag map is now taken from
+ * @hctx instead of being passed in separately.
+ */
- unsigned int blk_mq_get_tag(struct blk_mq_tags *tags,
- struct blk_mq_hw_ctx *hctx, unsigned int *last_tag,
+ unsigned int blk_mq_get_tag(struct blk_mq_hw_ctx *hctx, unsigned int *last_tag,
gfp_t gfp, bool reserved)
{
if (!reserved)
- return __blk_mq_get_tag(tags, hctx, last_tag, gfp);
+ return __blk_mq_get_tag(hctx->tags, hctx, last_tag, gfp);
- return __blk_mq_get_reserved_tag(tags, gfp);
+ return __blk_mq_get_reserved_tag(hctx->tags, gfp);
}
static struct bt_wait_state *bt_wake_ptr(struct blk_mq_bitmap_tags *bt)
bt_clear_tag(&tags->breserved_tags, tag);
}
- void blk_mq_put_tag(struct blk_mq_tags *tags, unsigned int tag,
+ void blk_mq_put_tag(struct blk_mq_hw_ctx *hctx, unsigned int tag,
unsigned int *last_tag)
{
+ struct blk_mq_tags *tags = hctx->tags;
+
if (tag >= tags->nr_reserved_tags) {
const int real_tag = tag - tags->nr_reserved_tags;
int i;
for (i = 0; i < bt->map_nr; i++) {
- struct blk_mq_bitmap *bm = &bt->map[i];
+ struct blk_align_bitmap *bm = &bt->map[i];
int bit = 0;
do {
unsigned int i, used;
for (i = 0, used = 0; i < bt->map_nr; i++) {
- struct blk_mq_bitmap *bm = &bt->map[i];
+ struct blk_align_bitmap *bm = &bt->map[i];
used += bitmap_weight(&bm->word, bm->depth);
}
}
nr = ALIGN(depth, tags_per_word) / tags_per_word;
- bt->map = kzalloc_node(nr * sizeof(struct blk_mq_bitmap),
+ bt->map = kzalloc_node(nr * sizeof(struct blk_align_bitmap),
GFP_KERNEL, node);
if (!bt->map)
return -ENOMEM;
res = bt_unused_tags(&tags->breserved_tags);
page += sprintf(page, "nr_free=%u, nr_reserved=%u\n", free, res);
+ page += sprintf(page, "active_queues=%u\n", atomic_read(&tags->active_queues));
return page - orig_page;
}
#ifndef INT_BLK_MQ_TAG_H
#define INT_BLK_MQ_TAG_H
+#include "blk-mq.h"
+
enum {
BT_WAIT_QUEUES = 8,
BT_WAIT_BATCH = 8,
#define TAG_TO_INDEX(bt, tag) ((tag) >> (bt)->bits_per_word)
#define TAG_TO_BIT(bt, tag) ((tag) & ((1 << (bt)->bits_per_word) - 1))
-struct blk_mq_bitmap {
- unsigned long word;
- unsigned long depth;
-} ____cacheline_aligned_in_smp;
-
struct blk_mq_bitmap_tags {
unsigned int depth;
unsigned int wake_cnt;
unsigned int bits_per_word;
unsigned int map_nr;
- struct blk_mq_bitmap *map;
+ struct blk_align_bitmap *map;
unsigned int wake_index;
struct bt_wait_state *bs;
unsigned int nr_tags;
unsigned int nr_reserved_tags;
+ atomic_t active_queues;
+
struct blk_mq_bitmap_tags bitmap_tags;
struct blk_mq_bitmap_tags breserved_tags;
extern struct blk_mq_tags *blk_mq_init_tags(unsigned int nr_tags, unsigned int reserved_tags, int node);
extern void blk_mq_free_tags(struct blk_mq_tags *tags);
- extern unsigned int blk_mq_get_tag(struct blk_mq_tags *tags, struct blk_mq_hw_ctx *hctx, unsigned int *last_tag, gfp_t gfp, bool reserved);
- extern void blk_mq_wait_for_tags(struct blk_mq_tags *tags, struct blk_mq_hw_ctx *hctx, bool reserved);
- extern void blk_mq_put_tag(struct blk_mq_tags *tags, unsigned int tag, unsigned int *last_tag);
+ extern unsigned int blk_mq_get_tag(struct blk_mq_hw_ctx *hctx, unsigned int *last_tag, gfp_t gfp, bool reserved);
+ extern void blk_mq_wait_for_tags(struct blk_mq_hw_ctx *hctx, bool reserved);
+ extern void blk_mq_put_tag(struct blk_mq_hw_ctx *hctx, unsigned int tag, unsigned int *last_tag);
extern void blk_mq_tag_busy_iter(struct blk_mq_tags *tags, void (*fn)(void *data, unsigned long *), void *data);
extern bool blk_mq_has_free_tags(struct blk_mq_tags *tags);
extern ssize_t blk_mq_tag_sysfs_show(struct blk_mq_tags *tags, char *page);
BLK_MQ_TAG_MAX = BLK_MQ_TAG_FAIL - 1,
};
+ extern bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *);
+ extern void __blk_mq_tag_idle(struct blk_mq_hw_ctx *);
+
+/*
+ * Note @hctx as busy for tag accounting.  Only hardware queues flagged
+ * BLK_MQ_F_TAG_SHARED are tracked; for any other queue this returns
+ * false without touching any state.
+ */
+static inline bool blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx)
+{
+	if (hctx->flags & BLK_MQ_F_TAG_SHARED)
+		return __blk_mq_tag_busy(hctx);
+
+	return false;
+}
+
+/*
+ * Note @hctx as idle for tag accounting.  A no-op unless the hardware
+ * queue is flagged BLK_MQ_F_TAG_SHARED.
+ */
+static inline void blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx)
+{
+	if (hctx->flags & BLK_MQ_F_TAG_SHARED)
+		__blk_mq_tag_idle(hctx);
+}
+
#endif
{
unsigned int i;
- for (i = 0; i < hctx->nr_ctx_map; i++)
- if (hctx->ctx_map[i])
+ for (i = 0; i < hctx->ctx_map.map_size; i++)
+ if (hctx->ctx_map.map[i].word)
return true;
return false;
}
+/*
+ * Return the word of @hctx's pending map that holds the bit for @ctx,
+ * selected by ctx->index_hw divided by the map's bits_per_word.
+ */
+static inline struct blk_align_bitmap *get_bm(struct blk_mq_hw_ctx *hctx,
+					      struct blk_mq_ctx *ctx)
+{
+	struct blk_mq_ctxmap *pending = &hctx->ctx_map;
+
+	return &pending->map[ctx->index_hw / pending->bits_per_word];
+}
+
+/*
+ * Bit position of @ctx inside the word returned by get_bm().  The mask
+ * matches get_bm()'s division only while bits_per_word is a power of
+ * two (blk_mq_alloc_bitmap() sets it to 8).
+ */
+#define CTX_TO_BIT(hctx, ctx) \
+ ((ctx)->index_hw & ((hctx)->ctx_map.bits_per_word - 1))
+
/*
* Mark this ctx as having pending work in this hardware queue
*/
static void blk_mq_hctx_mark_pending(struct blk_mq_hw_ctx *hctx,
struct blk_mq_ctx *ctx)
{
- if (!test_bit(ctx->index_hw, hctx->ctx_map))
- set_bit(ctx->index_hw, hctx->ctx_map);
+ struct blk_align_bitmap *bm = get_bm(hctx, ctx); /* word holding this ctx's bit */
+
+ /* set_bit() is only issued when the bit is not already set */
+ if (!test_bit(CTX_TO_BIT(hctx, ctx), &bm->word))
+ set_bit(CTX_TO_BIT(hctx, ctx), &bm->word);
+}
+
+/*
+ * Clear the pending bit of @ctx in @hctx's pending map.
+ */
+static void blk_mq_hctx_clear_pending(struct blk_mq_hw_ctx *hctx,
+				      struct blk_mq_ctx *ctx)
+{
+	clear_bit(CTX_TO_BIT(hctx, ctx), &get_bm(hctx, ctx)->word);
}
static struct request *__blk_mq_alloc_request(struct blk_mq_hw_ctx *hctx,
struct request *rq;
unsigned int tag;
- tag = blk_mq_get_tag(hctx->tags, hctx, &ctx->last_tag, gfp, reserved);
+ tag = blk_mq_get_tag(hctx, &ctx->last_tag, gfp, reserved);
if (tag != BLK_MQ_TAG_FAIL) {
rq = hctx->tags->rqs[tag];
+
+ rq->cmd_flags = 0;
+ if (blk_mq_tag_busy(hctx)) {
+ rq->cmd_flags = REQ_MQ_INFLIGHT;
+ atomic_inc(&hctx->nr_active);
+ }
+
rq->tag = tag;
return rq;
}
/* csd/requeue_work/fifo_time is initialized before use */
rq->q = q;
rq->mq_ctx = ctx;
- rq->cmd_flags = rw_flags;
+ rq->cmd_flags |= rw_flags;
rq->cmd_type = 0;
/* do not touch atomic flags, it needs atomic ops against the timer */
rq->cpu = -1;
break;
}
- blk_mq_wait_for_tags(hctx->tags, hctx, reserved);
+ blk_mq_wait_for_tags(hctx, reserved);
} while (1);
return rq;
const int tag = rq->tag;
struct request_queue *q = rq->q;
+ if (rq->cmd_flags & REQ_MQ_INFLIGHT)
+ atomic_dec(&hctx->nr_active);
+
clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
- blk_mq_put_tag(hctx->tags, tag, &ctx->last_tag);
+ blk_mq_put_tag(hctx, tag, &ctx->last_tag);
blk_mq_queue_exit(q);
}
queue_for_each_hw_ctx(q, hctx, i)
blk_mq_hw_ctx_check_timeout(hctx, &next, &next_set);
- if (next_set)
- mod_timer(&q->timeout, round_jiffies_up(next));
+ if (next_set) {
+ next = blk_rq_timeout(round_jiffies_up(next));
+ mod_timer(&q->timeout, next);
+ } else {
+ queue_for_each_hw_ctx(q, hctx, i)
+ blk_mq_tag_idle(hctx);
+ }
}
/*
return false;
}
+/*
+ * Process software queues that have been marked busy, splicing their
+ * queued requests onto the tail of the caller's @list.
+ */
+static void flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list)
+{
+	int i;
+
+	for (i = 0; i < hctx->ctx_map.map_size; i++) {
+		struct blk_align_bitmap *bm = &hctx->ctx_map.map[i];
+		unsigned int base, bit;
+
+		/* No software queue in this word has pending work. */
+		if (!bm->word)
+			continue;
+
+		/* Index into hctx->ctxs[] of this word's bit 0. */
+		base = i * hctx->ctx_map.bits_per_word;
+
+		for (bit = find_next_bit(&bm->word, bm->depth, 0);
+		     bit < bm->depth;
+		     bit = find_next_bit(&bm->word, bm->depth, bit + 1)) {
+			struct blk_mq_ctx *ctx = hctx->ctxs[base + bit];
+
+			/* Drop the pending mark, then drain under ctx->lock. */
+			clear_bit(bit, &bm->word);
+			spin_lock(&ctx->lock);
+			list_splice_tail_init(&ctx->rq_list, list);
+			spin_unlock(&ctx->lock);
+		}
+	}
+}
+
/*
* Run this hardware queue, pulling any software queues mapped to it in.
* Note that this function currently has various problems around ordering
static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
{
struct request_queue *q = hctx->queue;
- struct blk_mq_ctx *ctx;
struct request *rq;
LIST_HEAD(rq_list);
- int bit, queued;
+ int queued;
WARN_ON(!cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask));
/*
* Touch any software queue that has pending entries.
*/
- for_each_set_bit(bit, hctx->ctx_map, hctx->nr_ctx) {
- clear_bit(bit, hctx->ctx_map);
- ctx = hctx->ctxs[bit];
-
- spin_lock(&ctx->lock);
- list_splice_tail_init(&ctx->rq_list, &rq_list);
- spin_unlock(&ctx->lock);
- }
+ flush_busy_ctxs(hctx, &rq_list);
/*
* If we have previous entries on our dispatch list, grab them
spin_unlock(&hctx->lock);
}
- /*
- * Delete and return all entries from our dispatch list
- */
- queued = 0;
-
/*
* Now process all the entries, sending them to the driver.
*/
+ queued = 0;
while (!list_empty(&rq_list)) {
int ret;
}
if (!(hctx->flags & BLK_MQ_F_SHOULD_MERGE)) {
- init_request_from_bio(rq, bio);
-
+ blk_mq_bio_to_request(rq, bio);
spin_lock(&ctx->lock);
insert_rq:
__blk_mq_insert_request(hctx, rq, false);
spin_unlock(&ctx->lock);
- blk_account_io_start(rq, 1);
} else {
spin_lock(&ctx->lock);
if (!blk_mq_attempt_merge(q, ctx, bio)) {
- init_request_from_bio(rq, bio);
+ blk_mq_bio_to_request(rq, bio);
goto insert_rq;
}
spin_lock(&ctx->lock);
if (!list_empty(&ctx->rq_list)) {
list_splice_init(&ctx->rq_list, &tmp);
- clear_bit(ctx->index_hw, hctx->ctx_map);
+ blk_mq_hctx_clear_pending(hctx, ctx);
}
spin_unlock(&ctx->lock);
return NULL;
}
+/*
+ * Release the word array allocated by blk_mq_alloc_bitmap() and reset
+ * @bitmap to empty, so that no dangling pointer or stale map_size is
+ * left behind for code that walks the map or frees it again.
+ */
+static void blk_mq_free_bitmap(struct blk_mq_ctxmap *bitmap)
+{
+	kfree(bitmap->map);
+	bitmap->map = NULL;
+	bitmap->map_size = 0;
+}
+
+/*
+ * Allocate a pending map for @bitmap on NUMA node @node with one bit per
+ * possible CPU id (nr_cpu_ids), spread over words that each carry
+ * bits_per_word (8) bits.  The last word's depth is trimmed to whatever
+ * remains.
+ *
+ * Returns 0 on success or -ENOMEM if the word array cannot be allocated.
+ */
+static int blk_mq_alloc_bitmap(struct blk_mq_ctxmap *bitmap, int node)
+{
+	unsigned int bpw = 8, nr_words, remaining, i;
+
+	bitmap->bits_per_word = bpw;
+
+	nr_words = ALIGN(nr_cpu_ids, bpw) / bpw;
+	bitmap->map = kzalloc_node(nr_words * sizeof(struct blk_align_bitmap),
+				   GFP_KERNEL, node);
+	if (!bitmap->map)
+		return -ENOMEM;
+
+	bitmap->map_size = nr_words;
+
+	/* Hand out the CPU ids word by word until none remain. */
+	for (i = 0, remaining = nr_cpu_ids; i < nr_words; i++) {
+		unsigned int chunk = min(remaining, bpw);
+
+		bitmap->map[i].depth = chunk;
+		remaining -= chunk;
+	}
+
+	return 0;
+}
+
static int blk_mq_init_hw_queues(struct request_queue *q,
struct blk_mq_tag_set *set)
{
* Initialize hardware queues
*/
queue_for_each_hw_ctx(q, hctx, i) {
- unsigned int num_maps;
int node;
node = hctx->numa_node;
if (!hctx->ctxs)
break;
- num_maps = ALIGN(nr_cpu_ids, BITS_PER_LONG) / BITS_PER_LONG;
- hctx->ctx_map = kzalloc_node(num_maps * sizeof(unsigned long),
- GFP_KERNEL, node);
- if (!hctx->ctx_map)
+ if (blk_mq_alloc_bitmap(&hctx->ctx_map, node))
break;
- hctx->nr_ctx_map = num_maps;
hctx->nr_ctx = 0;
if (set->ops->init_hctx &&
blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier);
kfree(hctx->ctxs);
- kfree(hctx->ctx_map);
+ blk_mq_free_bitmap(&hctx->ctx_map);
}
return 1;
}
}
+/*
+ * Re-evaluate whether @set is shared by more than one request queue and
+ * set or clear BLK_MQ_F_TAG_SHARED on every hardware queue of every
+ * queue on set->tag_list accordingly.  Each queue is frozen while its
+ * flags are changed.  The callers in this file hold set->tag_list_lock.
+ */
+static void blk_mq_update_tag_set_depth(struct blk_mq_tag_set *set)
+{
+	/* next == prev holds for an empty list and for a single entry. */
+	const bool shared = set->tag_list.next != set->tag_list.prev;
+	struct blk_mq_hw_ctx *hctx;
+	struct request_queue *q;
+	int i;
+
+	list_for_each_entry(q, &set->tag_list, tag_set_list) {
+		blk_mq_freeze_queue(q);
+
+		queue_for_each_hw_ctx(q, hctx, i) {
+			if (shared)
+				hctx->flags |= BLK_MQ_F_TAG_SHARED;
+			else
+				hctx->flags &= ~BLK_MQ_F_TAG_SHARED;
+		}
+
+		blk_mq_unfreeze_queue(q);
+	}
+}
+
+/*
+ * Unlink @q from its tag set's queue list and refresh the shared flag
+ * of the queues that remain.  @q stays frozen for the whole operation.
+ */
+static void blk_mq_del_queue_tag_set(struct request_queue *q)
+{
+	struct blk_mq_tag_set *tag_set = q->tag_set;
+
+	blk_mq_freeze_queue(q);
+
+	/* List membership and the flag update are covered by tag_list_lock. */
+	mutex_lock(&tag_set->tag_list_lock);
+	list_del_init(&q->tag_set_list);
+	blk_mq_update_tag_set_depth(tag_set);
+	mutex_unlock(&tag_set->tag_list_lock);
+
+	blk_mq_unfreeze_queue(q);
+}
+
+ /*
+ * Attach @q to @set: record the set in q->tag_set, append the queue to
+ * set->tag_list and refresh the shared flag of every queue on that
+ * list. The list update and the refresh run under set->tag_list_lock.
+ */
+ static void blk_mq_add_queue_tag_set(struct blk_mq_tag_set *set,
+ struct request_queue *q)
+ {
+ q->tag_set = set;
+
+ mutex_lock(&set->tag_list_lock);
+ list_add_tail(&q->tag_set_list, &set->tag_list);
+ blk_mq_update_tag_set_depth(set);
+ mutex_unlock(&set->tag_list_lock);
+ }
+
struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
{
struct blk_mq_hw_ctx **hctxs;
if (!zalloc_cpumask_var(&hctxs[i]->cpumask, GFP_KERNEL))
goto err_hctxs;
+ atomic_set(&hctxs[i]->nr_active, 0);
hctxs[i]->numa_node = NUMA_NO_NODE;
hctxs[i]->queue_num = i;
}
list_add_tail(&q->all_q_node, &all_q_list);
mutex_unlock(&all_q_mutex);
+ blk_mq_add_queue_tag_set(set, q);
+
return q;
err_flush_rq:
struct blk_mq_hw_ctx *hctx;
int i;
+ blk_mq_del_queue_tag_set(q);
+
queue_for_each_hw_ctx(q, hctx, i) {
- kfree(hctx->ctx_map);
kfree(hctx->ctxs);
blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier);
if (q->mq_ops->exit_hctx)
goto out_unwind;
}
+ mutex_init(&set->tag_list_lock);
+ INIT_LIST_HEAD(&set->tag_list);
+
return 0;
out_unwind:
void (*notify)(void *data, unsigned long action, unsigned int cpu);
};
+/*
+ * Map of software queues with pending work for one hardware queue
+ * (embedded as blk_mq_hw_ctx.ctx_map), kept as an array of
+ * blk_align_bitmap words.
+ */
+struct blk_mq_ctxmap {
+ unsigned int map_size; /* number of entries in map */
+ unsigned int bits_per_word; /* ctx bits tracked by each map entry */
+ struct blk_align_bitmap *map; /* array of map_size bitmap words */
+};
+
struct blk_mq_hw_ctx {
struct {
spinlock_t lock;
void *driver_data;
- unsigned int nr_ctx_map;
- unsigned long *ctx_map;
+ struct blk_mq_ctxmap ctx_map;
+
unsigned int nr_ctx;
struct blk_mq_ctx **ctxs;
unsigned int numa_node;
unsigned int cmd_size; /* per-request extra data */
+ atomic_t nr_active;
+
struct blk_mq_cpu_notifier cpu_notifier;
struct kobject kobj;
};
void *driver_data;
struct blk_mq_tags **tags;
+
+ struct mutex tag_list_lock;
+ struct list_head tag_list;
};
typedef int (queue_rq_fn)(struct blk_mq_hw_ctx *, struct request *);
BLK_MQ_F_SHOULD_MERGE = 1 << 0,
BLK_MQ_F_SHOULD_SORT = 1 << 1,
+ BLK_MQ_F_TAG_SHARED = 1 << 2,
BLK_MQ_S_STOPPED = 0,
+ BLK_MQ_S_TAG_ACTIVE = 1,
BLK_MQ_MAX_DEPTH = 2048,