blk-mq: Exit queue on alloc failure
[firefly-linux-kernel-4.4.55.git] / block / blk-mq.c
1 /*
2  * Block multiqueue core code
3  *
4  * Copyright (C) 2013-2014 Jens Axboe
5  * Copyright (C) 2013-2014 Christoph Hellwig
6  */
7 #include <linux/kernel.h>
8 #include <linux/module.h>
9 #include <linux/backing-dev.h>
10 #include <linux/bio.h>
11 #include <linux/blkdev.h>
12 #include <linux/mm.h>
13 #include <linux/init.h>
14 #include <linux/slab.h>
15 #include <linux/workqueue.h>
16 #include <linux/smp.h>
17 #include <linux/llist.h>
18 #include <linux/list_sort.h>
19 #include <linux/cpu.h>
20 #include <linux/cache.h>
21 #include <linux/sched/sysctl.h>
22 #include <linux/delay.h>
23 #include <linux/crash_dump.h>
24
25 #include <trace/events/block.h>
26
27 #include <linux/blk-mq.h>
28 #include "blk.h"
29 #include "blk-mq.h"
30 #include "blk-mq-tag.h"
31
32 static DEFINE_MUTEX(all_q_mutex);
33 static LIST_HEAD(all_q_list);
34
35 static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx);
36
37 /*
38  * Check if any of the ctx's have pending work in this hardware queue
39  */
40 static bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx)
41 {
42         unsigned int i;
43
44         for (i = 0; i < hctx->ctx_map.map_size; i++)
45                 if (hctx->ctx_map.map[i].word)
46                         return true;
47
48         return false;
49 }
50
51 static inline struct blk_align_bitmap *get_bm(struct blk_mq_hw_ctx *hctx,
52                                               struct blk_mq_ctx *ctx)
53 {
54         return &hctx->ctx_map.map[ctx->index_hw / hctx->ctx_map.bits_per_word];
55 }
56
57 #define CTX_TO_BIT(hctx, ctx)   \
58         ((ctx)->index_hw & ((hctx)->ctx_map.bits_per_word - 1))
59
60 /*
61  * Mark this ctx as having pending work in this hardware queue
62  */
63 static void blk_mq_hctx_mark_pending(struct blk_mq_hw_ctx *hctx,
64                                      struct blk_mq_ctx *ctx)
65 {
66         struct blk_align_bitmap *bm = get_bm(hctx, ctx);
67
68         if (!test_bit(CTX_TO_BIT(hctx, ctx), &bm->word))
69                 set_bit(CTX_TO_BIT(hctx, ctx), &bm->word);
70 }
71
72 static void blk_mq_hctx_clear_pending(struct blk_mq_hw_ctx *hctx,
73                                       struct blk_mq_ctx *ctx)
74 {
75         struct blk_align_bitmap *bm = get_bm(hctx, ctx);
76
77         clear_bit(CTX_TO_BIT(hctx, ctx), &bm->word);
78 }
79
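/*
 * Grab a reference on the queue's mq_usage_counter. If the queue is
 * frozen, wait until it is unfrozen or marked dying. Returns 0 on
 * success, -ENODEV if the queue is dying, or the error from an
 * interrupted wait.
 */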
80 static int blk_mq_queue_enter(struct request_queue *q)
81 {
82         while (true) {
83                 int ret;
84
85                 if (percpu_ref_tryget_live(&q->mq_usage_counter))
86                         return 0;
87
88                 ret = wait_event_interruptible(q->mq_freeze_wq,
89                                 !q->mq_freeze_depth || blk_queue_dying(q));
90                 if (blk_queue_dying(q))
91                         return -ENODEV;
92                 if (ret)
93                         return ret;
94         }
95 }
96
97 static void blk_mq_queue_exit(struct request_queue *q)
98 {
99         percpu_ref_put(&q->mq_usage_counter);
100 }
101
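/*
 * percpu_ref release callback: the last reference on mq_usage_counter is
 * gone, so wake up everyone waiting on mq_freeze_wq.
 */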
102 static void blk_mq_usage_counter_release(struct percpu_ref *ref)
103 {
104         struct request_queue *q =
105                 container_of(ref, struct request_queue, mq_usage_counter);
106
107         wake_up_all(&q->mq_freeze_wq);
108 }
109
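/*
 * Bump the freeze depth. On the first freeze, kill the usage counter so
 * that new blk_mq_queue_enter() callers block, and run the queues to
 * flush out pending work.
 */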
110 static void blk_mq_freeze_queue_start(struct request_queue *q)
111 {
112         bool freeze;
113
114         spin_lock_irq(q->queue_lock);
115         freeze = !q->mq_freeze_depth++;
116         spin_unlock_irq(q->queue_lock);
117
118         if (freeze) {
119                 percpu_ref_kill(&q->mq_usage_counter);
120                 blk_mq_run_queues(q, false);
121         }
122 }
123
124 static void blk_mq_freeze_queue_wait(struct request_queue *q)
125 {
126         wait_event(q->mq_freeze_wq, percpu_ref_is_zero(&q->mq_usage_counter));
127 }
128
129 /*
130  * Guarantee no request is in use, so we can change any data structure of
131  * the queue afterward.
132  */
133 void blk_mq_freeze_queue(struct request_queue *q)
134 {
135         blk_mq_freeze_queue_start(q);
136         blk_mq_freeze_queue_wait(q);
137 }
138
139 static void blk_mq_unfreeze_queue(struct request_queue *q)
140 {
141         bool wake;
142
143         spin_lock_irq(q->queue_lock);
144         wake = !--q->mq_freeze_depth;
145         WARN_ON_ONCE(q->mq_freeze_depth < 0);
146         spin_unlock_irq(q->queue_lock);
147         if (wake) {
148                 percpu_ref_reinit(&q->mq_usage_counter);
149                 wake_up_all(&q->mq_freeze_wq);
150         }
151 }
152
153 bool blk_mq_can_queue(struct blk_mq_hw_ctx *hctx)
154 {
155         return blk_mq_has_free_tags(hctx->tags);
156 }
157 EXPORT_SYMBOL(blk_mq_can_queue);
158
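/*
 * Set up a request that has just been allocated a tag, resetting the
 * fields a previous user may have left behind. The atomic flags and the
 * tag itself are left untouched.
 */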
159 static void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx,
160                                struct request *rq, unsigned int rw_flags)
161 {
162         if (blk_queue_io_stat(q))
163                 rw_flags |= REQ_IO_STAT;
164
165         INIT_LIST_HEAD(&rq->queuelist);
166         /* csd/requeue_work/fifo_time is initialized before use */
167         rq->q = q;
168         rq->mq_ctx = ctx;
169         rq->cmd_flags |= rw_flags;
170         /* do not touch atomic flags, it needs atomic ops against the timer */
171         rq->cpu = -1;
172         INIT_HLIST_NODE(&rq->hash);
173         RB_CLEAR_NODE(&rq->rb_node);
174         rq->rq_disk = NULL;
175         rq->part = NULL;
176         rq->start_time = jiffies;
177 #ifdef CONFIG_BLK_CGROUP
178         rq->rl = NULL;
179         set_start_time_ns(rq);
180         rq->io_start_time_ns = 0;
181 #endif
182         rq->nr_phys_segments = 0;
183 #if defined(CONFIG_BLK_DEV_INTEGRITY)
184         rq->nr_integrity_segments = 0;
185 #endif
186         rq->special = NULL;
187         /* tag was already set */
188         rq->errors = 0;
189
190         rq->cmd = rq->__cmd;
191
192         rq->extra_len = 0;
193         rq->sense_len = 0;
194         rq->resid_len = 0;
195         rq->sense = NULL;
196
197         INIT_LIST_HEAD(&rq->timeout_list);
198         rq->timeout = 0;
199
200         rq->end_io = NULL;
201         rq->end_io_data = NULL;
202         rq->next_rq = NULL;
203
204         ctx->rq_dispatched[rw_is_sync(rw_flags)]++;
205 }
206
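/*
 * Try to get a tag for the given allocation data. On success, return the
 * preallocated request associated with that tag, fully initialized;
 * return NULL if no tag could be obtained.
 */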
207 static struct request *
208 __blk_mq_alloc_request(struct blk_mq_alloc_data *data, int rw)
209 {
210         struct request *rq;
211         unsigned int tag;
212
213         tag = blk_mq_get_tag(data);
214         if (tag != BLK_MQ_TAG_FAIL) {
215                 rq = data->hctx->tags->rqs[tag];
216
217                 if (blk_mq_tag_busy(data->hctx)) {
218                         rq->cmd_flags = REQ_MQ_INFLIGHT;
219                         atomic_inc(&data->hctx->nr_active);
220                 }
221
222                 rq->tag = tag;
223                 blk_mq_rq_ctx_init(data->q, data->ctx, rq, rw);
224                 return rq;
225         }
226
227         return NULL;
228 }
229
230 struct request *blk_mq_alloc_request(struct request_queue *q, int rw, gfp_t gfp,
231                 bool reserved)
232 {
233         struct blk_mq_ctx *ctx;
234         struct blk_mq_hw_ctx *hctx;
235         struct request *rq;
236         struct blk_mq_alloc_data alloc_data;
237         int ret;
238
239         ret = blk_mq_queue_enter(q);
240         if (ret)
241                 return ERR_PTR(ret);
242
243         ctx = blk_mq_get_ctx(q);
244         hctx = q->mq_ops->map_queue(q, ctx->cpu);
245         blk_mq_set_alloc_data(&alloc_data, q, gfp & ~__GFP_WAIT,
246                         reserved, ctx, hctx);
247
248         rq = __blk_mq_alloc_request(&alloc_data, rw);
249         if (!rq && (gfp & __GFP_WAIT)) {
250                 __blk_mq_run_hw_queue(hctx);
251                 blk_mq_put_ctx(ctx);
252
253                 ctx = blk_mq_get_ctx(q);
254                 hctx = q->mq_ops->map_queue(q, ctx->cpu);
255                 blk_mq_set_alloc_data(&alloc_data, q, gfp, reserved, ctx,
256                                 hctx);
257                 rq = __blk_mq_alloc_request(&alloc_data, rw);
258                 ctx = alloc_data.ctx;
259         }
260         blk_mq_put_ctx(ctx);
261         if (!rq) {
262                 blk_mq_queue_exit(q);
263                 return ERR_PTR(-EWOULDBLOCK);
264         }
265         return rq;
266 }
267 EXPORT_SYMBOL(blk_mq_alloc_request);
268
269 static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx,
270                                   struct blk_mq_ctx *ctx, struct request *rq)
271 {
272         const int tag = rq->tag;
273         struct request_queue *q = rq->q;
274
275         if (rq->cmd_flags & REQ_MQ_INFLIGHT)
276                 atomic_dec(&hctx->nr_active);
277         rq->cmd_flags = 0;
278
279         clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
280         blk_mq_put_tag(hctx, tag, &ctx->last_tag);
281         blk_mq_queue_exit(q);
282 }
283
284 void blk_mq_free_hctx_request(struct blk_mq_hw_ctx *hctx, struct request *rq)
285 {
286         struct blk_mq_ctx *ctx = rq->mq_ctx;
287
288         ctx->rq_completed[rq_is_sync(rq)]++;
289         __blk_mq_free_request(hctx, ctx, rq);
290
291 }
292 EXPORT_SYMBOL_GPL(blk_mq_free_hctx_request);
293
294 void blk_mq_free_request(struct request *rq)
295 {
296         struct blk_mq_hw_ctx *hctx;
297         struct request_queue *q = rq->q;
298
299         hctx = q->mq_ops->map_queue(q, rq->mq_ctx->cpu);
300         blk_mq_free_hctx_request(hctx, rq);
301 }
302 EXPORT_SYMBOL_GPL(blk_mq_free_request);
303
304 inline void __blk_mq_end_request(struct request *rq, int error)
305 {
306         blk_account_io_done(rq);
307
308         if (rq->end_io) {
309                 rq->end_io(rq, error);
310         } else {
311                 if (unlikely(blk_bidi_rq(rq)))
312                         blk_mq_free_request(rq->next_rq);
313                 blk_mq_free_request(rq);
314         }
315 }
316 EXPORT_SYMBOL(__blk_mq_end_request);
317
318 void blk_mq_end_request(struct request *rq, int error)
319 {
320         if (blk_update_request(rq, error, blk_rq_bytes(rq)))
321                 BUG();
322         __blk_mq_end_request(rq, error);
323 }
324 EXPORT_SYMBOL(blk_mq_end_request);
325
326 static void __blk_mq_complete_request_remote(void *data)
327 {
328         struct request *rq = data;
329
330         rq->q->softirq_done_fn(rq);
331 }
332
333 static void blk_mq_ipi_complete_request(struct request *rq)
334 {
335         struct blk_mq_ctx *ctx = rq->mq_ctx;
336         bool shared = false;
337         int cpu;
338
339         if (!test_bit(QUEUE_FLAG_SAME_COMP, &rq->q->queue_flags)) {
340                 rq->q->softirq_done_fn(rq);
341                 return;
342         }
343
344         cpu = get_cpu();
345         if (!test_bit(QUEUE_FLAG_SAME_FORCE, &rq->q->queue_flags))
346                 shared = cpus_share_cache(cpu, ctx->cpu);
347
348         if (cpu != ctx->cpu && !shared && cpu_online(ctx->cpu)) {
349                 rq->csd.func = __blk_mq_complete_request_remote;
350                 rq->csd.info = rq;
351                 rq->csd.flags = 0;
352                 smp_call_function_single_async(ctx->cpu, &rq->csd);
353         } else {
354                 rq->q->softirq_done_fn(rq);
355         }
356         put_cpu();
357 }
358
359 void __blk_mq_complete_request(struct request *rq)
360 {
361         struct request_queue *q = rq->q;
362
363         if (!q->softirq_done_fn)
364                 blk_mq_end_request(rq, rq->errors);
365         else
366                 blk_mq_ipi_complete_request(rq);
367 }
368
369 /**
370  * blk_mq_complete_request - end I/O on a request
371  * @rq:         the request being processed
372  *
373  * Description:
374  *      Ends all I/O on a request. It does not handle partial completions.
375  *      The actual completion happens out-of-order, through a IPI handler.
376  **/
377 void blk_mq_complete_request(struct request *rq)
378 {
379         struct request_queue *q = rq->q;
380
381         if (unlikely(blk_should_fake_timeout(q)))
382                 return;
383         if (!blk_mark_rq_complete(rq))
384                 __blk_mq_complete_request(rq);
385 }
386 EXPORT_SYMBOL(blk_mq_complete_request);
387
388 void blk_mq_start_request(struct request *rq)
389 {
390         struct request_queue *q = rq->q;
391
392         trace_block_rq_issue(q, rq);
393
394         rq->resid_len = blk_rq_bytes(rq);
395         if (unlikely(blk_bidi_rq(rq)))
396                 rq->next_rq->resid_len = blk_rq_bytes(rq->next_rq);
397
398         blk_add_timer(rq);
399
400         /*
401          * Ensure that ->deadline is visible before we set the started
402          * flag and clear the completed flag.
403          */
404         smp_mb__before_atomic();
405
406         /*
407          * Mark us as started and clear complete. Complete might have been
408          * set if requeue raced with timeout, which then marked it as
409          * complete. So be sure to clear complete again when we start
410          * the request, otherwise we'll ignore the completion event.
411          */
412         if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags))
413                 set_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
414         if (test_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags))
415                 clear_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags);
416
417         if (q->dma_drain_size && blk_rq_bytes(rq)) {
418                 /*
419                  * Make sure space for the drain appears.  We know we can do
420                  * this because max_hw_segments has been adjusted to be one
421                  * fewer than the device can handle.
422                  */
423                 rq->nr_phys_segments++;
424         }
425 }
426 EXPORT_SYMBOL(blk_mq_start_request);
427
428 static void __blk_mq_requeue_request(struct request *rq)
429 {
430         struct request_queue *q = rq->q;
431
432         trace_block_rq_requeue(q, rq);
433
434         if (test_and_clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) {
435                 if (q->dma_drain_size && blk_rq_bytes(rq))
436                         rq->nr_phys_segments--;
437         }
438 }
439
440 void blk_mq_requeue_request(struct request *rq)
441 {
442         __blk_mq_requeue_request(rq);
443
444         BUG_ON(blk_queued_rq(rq));
445         blk_mq_add_to_requeue_list(rq, true);
446 }
447 EXPORT_SYMBOL(blk_mq_requeue_request);
448
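/*
 * Work handler that drains q->requeue_list: requests flagged with
 * REQ_SOFTBARRIER are re-inserted at the head, everything else at the
 * tail, and the (possibly stopped) hardware queues are then kicked.
 */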
449 static void blk_mq_requeue_work(struct work_struct *work)
450 {
451         struct request_queue *q =
452                 container_of(work, struct request_queue, requeue_work);
453         LIST_HEAD(rq_list);
454         struct request *rq, *next;
455         unsigned long flags;
456
457         spin_lock_irqsave(&q->requeue_lock, flags);
458         list_splice_init(&q->requeue_list, &rq_list);
459         spin_unlock_irqrestore(&q->requeue_lock, flags);
460
461         list_for_each_entry_safe(rq, next, &rq_list, queuelist) {
462                 if (!(rq->cmd_flags & REQ_SOFTBARRIER))
463                         continue;
464
465                 rq->cmd_flags &= ~REQ_SOFTBARRIER;
466                 list_del_init(&rq->queuelist);
467                 blk_mq_insert_request(rq, true, false, false);
468         }
469
470         while (!list_empty(&rq_list)) {
471                 rq = list_entry(rq_list.next, struct request, queuelist);
472                 list_del_init(&rq->queuelist);
473                 blk_mq_insert_request(rq, false, false, false);
474         }
475
476         /*
477          * Use the start variant of queue running here, so that running
478          * the requeue work will kick stopped queues.
479          */
480         blk_mq_start_hw_queues(q);
481 }
482
483 void blk_mq_add_to_requeue_list(struct request *rq, bool at_head)
484 {
485         struct request_queue *q = rq->q;
486         unsigned long flags;
487
488         /*
489          * We abuse this flag that is otherwise used by the I/O scheduler to
490          * request head insertion from the workqueue.
491          */
492         BUG_ON(rq->cmd_flags & REQ_SOFTBARRIER);
493
494         spin_lock_irqsave(&q->requeue_lock, flags);
495         if (at_head) {
496                 rq->cmd_flags |= REQ_SOFTBARRIER;
497                 list_add(&rq->queuelist, &q->requeue_list);
498         } else {
499                 list_add_tail(&rq->queuelist, &q->requeue_list);
500         }
501         spin_unlock_irqrestore(&q->requeue_lock, flags);
502 }
503 EXPORT_SYMBOL(blk_mq_add_to_requeue_list);
504
505 void blk_mq_kick_requeue_list(struct request_queue *q)
506 {
507         kblockd_schedule_work(&q->requeue_work);
508 }
509 EXPORT_SYMBOL(blk_mq_kick_requeue_list);
510
511 static inline bool is_flush_request(struct request *rq,
512                 struct blk_flush_queue *fq, unsigned int tag)
513 {
514         return ((rq->cmd_flags & REQ_FLUSH_SEQ) &&
515                         fq->flush_rq->tag == tag);
516 }
517
518 struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag)
519 {
520         struct request *rq = tags->rqs[tag];
521         /* mq_ctx of flush rq is always cloned from the corresponding req */
522         struct blk_flush_queue *fq = blk_get_flush_queue(rq->q, rq->mq_ctx);
523
524         if (!is_flush_request(rq, fq, tag))
525                 return rq;
526
527         return fq->flush_rq;
528 }
529 EXPORT_SYMBOL(blk_mq_tag_to_rq);
530
531 struct blk_mq_timeout_data {
532         unsigned long next;
533         unsigned int next_set;
534 };
535
536 void blk_mq_rq_timed_out(struct request *req, bool reserved)
537 {
538         struct blk_mq_ops *ops = req->q->mq_ops;
539         enum blk_eh_timer_return ret = BLK_EH_RESET_TIMER;
540
541         /*
542          * We know that complete is set at this point. If STARTED isn't set
543          * anymore, then the request isn't active and the "timeout" should
544          * just be ignored. This can happen due to the bitflag ordering.
545          * Timeout first checks if STARTED is set, and if it is, assumes
546          * the request is active. But if we race with completion, then
547          * both flags will get cleared. So check here again, and ignore
548          * a timeout event with a request that isn't active.
549          */
550         if (!test_bit(REQ_ATOM_STARTED, &req->atomic_flags))
551                 return;
552
553         if (ops->timeout)
554                 ret = ops->timeout(req, reserved);
555
556         switch (ret) {
557         case BLK_EH_HANDLED:
558                 __blk_mq_complete_request(req);
559                 break;
560         case BLK_EH_RESET_TIMER:
561                 blk_add_timer(req);
562                 blk_clear_rq_complete(req);
563                 break;
564         case BLK_EH_NOT_HANDLED:
565                 break;
566         default:
567                 printk(KERN_ERR "block: bad eh return: %d\n", ret);
568                 break;
569         }
570 }
571
572 static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx,
573                 struct request *rq, void *priv, bool reserved)
574 {
575         struct blk_mq_timeout_data *data = priv;
576
577         if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags))
578                 return;
579
580         if (time_after_eq(jiffies, rq->deadline)) {
581                 if (!blk_mark_rq_complete(rq))
582                         blk_mq_rq_timed_out(rq, reserved);
583         } else if (!data->next_set || time_after(data->next, rq->deadline)) {
584                 data->next = rq->deadline;
585                 data->next_set = 1;
586         }
587 }
588
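/*
 * Per-queue timer callback: check every started request on every mapped
 * hardware queue for an expired deadline, and either re-arm the timer
 * for the earliest pending deadline or let the tags go idle.
 */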
589 static void blk_mq_rq_timer(unsigned long priv)
590 {
591         struct request_queue *q = (struct request_queue *)priv;
592         struct blk_mq_timeout_data data = {
593                 .next           = 0,
594                 .next_set       = 0,
595         };
596         struct blk_mq_hw_ctx *hctx;
597         int i;
598
599         queue_for_each_hw_ctx(q, hctx, i) {
600                 /*
601                  * If no software queues are currently mapped to this
602                  * hardware queue, there's nothing to check
603                  */
604                 if (!blk_mq_hw_queue_mapped(hctx))
605                         continue;
606
607                 blk_mq_tag_busy_iter(hctx, blk_mq_check_expired, &data);
608         }
609
610         if (data.next_set) {
611                 data.next = blk_rq_timeout(round_jiffies_up(data.next));
612                 mod_timer(&q->timeout, data.next);
613         } else {
614                 queue_for_each_hw_ctx(q, hctx, i)
615                         blk_mq_tag_idle(hctx);
616         }
617 }
618
619 /*
620  * Reverse check our software queue for entries that we could potentially
621  * merge with. Currently includes a hand-wavy stop count of 8, to not spend
622  * too much time checking for merges.
623  */
624 static bool blk_mq_attempt_merge(struct request_queue *q,
625                                  struct blk_mq_ctx *ctx, struct bio *bio)
626 {
627         struct request *rq;
628         int checked = 8;
629
630         list_for_each_entry_reverse(rq, &ctx->rq_list, queuelist) {
631                 int el_ret;
632
633                 if (!checked--)
634                         break;
635
636                 if (!blk_rq_merge_ok(rq, bio))
637                         continue;
638
639                 el_ret = blk_try_merge(rq, bio);
640                 if (el_ret == ELEVATOR_BACK_MERGE) {
641                         if (bio_attempt_back_merge(q, rq, bio)) {
642                                 ctx->rq_merged++;
643                                 return true;
644                         }
645                         break;
646                 } else if (el_ret == ELEVATOR_FRONT_MERGE) {
647                         if (bio_attempt_front_merge(q, rq, bio)) {
648                                 ctx->rq_merged++;
649                                 return true;
650                         }
651                         break;
652                 }
653         }
654
655         return false;
656 }
657
658 /*
659  * Process software queues that have been marked busy, splicing them
660  * to the for-dispatch list.
661  */
662 static void flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list)
663 {
664         struct blk_mq_ctx *ctx;
665         int i;
666
667         for (i = 0; i < hctx->ctx_map.map_size; i++) {
668                 struct blk_align_bitmap *bm = &hctx->ctx_map.map[i];
669                 unsigned int off, bit;
670
671                 if (!bm->word)
672                         continue;
673
674                 bit = 0;
675                 off = i * hctx->ctx_map.bits_per_word;
676                 do {
677                         bit = find_next_bit(&bm->word, bm->depth, bit);
678                         if (bit >= bm->depth)
679                                 break;
680
681                         ctx = hctx->ctxs[bit + off];
682                         clear_bit(bit, &bm->word);
683                         spin_lock(&ctx->lock);
684                         list_splice_tail_init(&ctx->rq_list, list);
685                         spin_unlock(&ctx->lock);
686
687                         bit++;
688                 } while (1);
689         }
690 }
691
692 /*
693  * Run this hardware queue, pulling any software queues mapped to it in.
694  * Note that this function currently has various problems around ordering
695  * of IO. In particular, we'd like FIFO behaviour on handling existing
696  * items on the hctx->dispatch list. Ignore that for now.
697  */
698 static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
699 {
700         struct request_queue *q = hctx->queue;
701         struct request *rq;
702         LIST_HEAD(rq_list);
703         LIST_HEAD(driver_list);
704         struct list_head *dptr;
705         int queued;
706
707         WARN_ON(!cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask));
708
709         if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->state)))
710                 return;
711
712         hctx->run++;
713
714         /*
715          * Touch any software queue that has pending entries.
716          */
717         flush_busy_ctxs(hctx, &rq_list);
718
719         /*
720          * If we have previous entries on our dispatch list, grab them
721          * and stuff them at the front for more fair dispatch.
722          */
723         if (!list_empty_careful(&hctx->dispatch)) {
724                 spin_lock(&hctx->lock);
725                 if (!list_empty(&hctx->dispatch))
726                         list_splice_init(&hctx->dispatch, &rq_list);
727                 spin_unlock(&hctx->lock);
728         }
729
730         /*
731          * Start off with dptr being NULL, so we start the first request
732          * immediately, even if we have more pending.
733          */
734         dptr = NULL;
735
736         /*
737          * Now process all the entries, sending them to the driver.
738          */
739         queued = 0;
740         while (!list_empty(&rq_list)) {
741                 struct blk_mq_queue_data bd;
742                 int ret;
743
744                 rq = list_first_entry(&rq_list, struct request, queuelist);
745                 list_del_init(&rq->queuelist);
746
747                 bd.rq = rq;
748                 bd.list = dptr;
749                 bd.last = list_empty(&rq_list);
750
751                 ret = q->mq_ops->queue_rq(hctx, &bd);
752                 switch (ret) {
753                 case BLK_MQ_RQ_QUEUE_OK:
754                         queued++;
755                         continue;
756                 case BLK_MQ_RQ_QUEUE_BUSY:
757                         list_add(&rq->queuelist, &rq_list);
758                         __blk_mq_requeue_request(rq);
759                         break;
760                 default:
761                         pr_err("blk-mq: bad return on queue: %d\n", ret);
762                 case BLK_MQ_RQ_QUEUE_ERROR:
763                         rq->errors = -EIO;
764                         blk_mq_end_request(rq, rq->errors);
765                         break;
766                 }
767
768                 if (ret == BLK_MQ_RQ_QUEUE_BUSY)
769                         break;
770
771                 /*
772                  * We've done the first request. If we have more than 1
773                  * left in the list, set dptr to defer issue.
774                  */
775                 if (!dptr && rq_list.next != rq_list.prev)
776                         dptr = &driver_list;
777         }
778
779         if (!queued)
780                 hctx->dispatched[0]++;
781         else if (queued < (1 << (BLK_MQ_MAX_DISPATCH_ORDER - 1)))
782                 hctx->dispatched[ilog2(queued) + 1]++;
783
784         /*
785          * Any items that need requeuing? Stuff them into hctx->dispatch;
786          * that is where we will continue on the next queue run.
787          */
788         if (!list_empty(&rq_list)) {
789                 spin_lock(&hctx->lock);
790                 list_splice(&rq_list, &hctx->dispatch);
791                 spin_unlock(&hctx->lock);
792         }
793 }
794
795 /*
796  * It'd be great if the workqueue API had a way to pass
797  * in a mask and had some smarts for more clever placement.
798  * For now we just round-robin here, switching for every
799  * BLK_MQ_CPU_WORK_BATCH queued items.
800  */
801 static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx)
802 {
803         if (hctx->queue->nr_hw_queues == 1)
804                 return WORK_CPU_UNBOUND;
805
806         if (--hctx->next_cpu_batch <= 0) {
807                 int cpu = hctx->next_cpu, next_cpu;
808
809                 next_cpu = cpumask_next(hctx->next_cpu, hctx->cpumask);
810                 if (next_cpu >= nr_cpu_ids)
811                         next_cpu = cpumask_first(hctx->cpumask);
812
813                 hctx->next_cpu = next_cpu;
814                 hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;
815
816                 return cpu;
817         }
818
819         return hctx->next_cpu;
820 }
821
822 void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
823 {
824         if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->state) ||
825             !blk_mq_hw_queue_mapped(hctx)))
826                 return;
827
828         if (!async) {
829                 int cpu = get_cpu();
830                 if (cpumask_test_cpu(cpu, hctx->cpumask)) {
831                         __blk_mq_run_hw_queue(hctx);
832                         put_cpu();
833                         return;
834                 }
835
836                 put_cpu();
837         }
838
839         kblockd_schedule_delayed_work_on(blk_mq_hctx_next_cpu(hctx),
840                         &hctx->run_work, 0);
841 }
842
843 void blk_mq_run_queues(struct request_queue *q, bool async)
844 {
845         struct blk_mq_hw_ctx *hctx;
846         int i;
847
848         queue_for_each_hw_ctx(q, hctx, i) {
849                 if ((!blk_mq_hctx_has_pending(hctx) &&
850                     list_empty_careful(&hctx->dispatch)) ||
851                     test_bit(BLK_MQ_S_STOPPED, &hctx->state))
852                         continue;
853
854                 blk_mq_run_hw_queue(hctx, async);
855         }
856 }
857 EXPORT_SYMBOL(blk_mq_run_queues);
858
859 void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx)
860 {
861         cancel_delayed_work(&hctx->run_work);
862         cancel_delayed_work(&hctx->delay_work);
863         set_bit(BLK_MQ_S_STOPPED, &hctx->state);
864 }
865 EXPORT_SYMBOL(blk_mq_stop_hw_queue);
866
867 void blk_mq_stop_hw_queues(struct request_queue *q)
868 {
869         struct blk_mq_hw_ctx *hctx;
870         int i;
871
872         queue_for_each_hw_ctx(q, hctx, i)
873                 blk_mq_stop_hw_queue(hctx);
874 }
875 EXPORT_SYMBOL(blk_mq_stop_hw_queues);
876
877 void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx)
878 {
879         clear_bit(BLK_MQ_S_STOPPED, &hctx->state);
880
881         blk_mq_run_hw_queue(hctx, false);
882 }
883 EXPORT_SYMBOL(blk_mq_start_hw_queue);
884
885 void blk_mq_start_hw_queues(struct request_queue *q)
886 {
887         struct blk_mq_hw_ctx *hctx;
888         int i;
889
890         queue_for_each_hw_ctx(q, hctx, i)
891                 blk_mq_start_hw_queue(hctx);
892 }
893 EXPORT_SYMBOL(blk_mq_start_hw_queues);
894
895
896 void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async)
897 {
898         struct blk_mq_hw_ctx *hctx;
899         int i;
900
901         queue_for_each_hw_ctx(q, hctx, i) {
902                 if (!test_bit(BLK_MQ_S_STOPPED, &hctx->state))
903                         continue;
904
905                 clear_bit(BLK_MQ_S_STOPPED, &hctx->state);
906                 blk_mq_run_hw_queue(hctx, async);
907         }
908 }
909 EXPORT_SYMBOL(blk_mq_start_stopped_hw_queues);
910
911 static void blk_mq_run_work_fn(struct work_struct *work)
912 {
913         struct blk_mq_hw_ctx *hctx;
914
915         hctx = container_of(work, struct blk_mq_hw_ctx, run_work.work);
916
917         __blk_mq_run_hw_queue(hctx);
918 }
919
920 static void blk_mq_delay_work_fn(struct work_struct *work)
921 {
922         struct blk_mq_hw_ctx *hctx;
923
924         hctx = container_of(work, struct blk_mq_hw_ctx, delay_work.work);
925
926         if (test_and_clear_bit(BLK_MQ_S_STOPPED, &hctx->state))
927                 __blk_mq_run_hw_queue(hctx);
928 }
929
930 void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs)
931 {
932         if (unlikely(!blk_mq_hw_queue_mapped(hctx)))
933                 return;
934
935         kblockd_schedule_delayed_work_on(blk_mq_hctx_next_cpu(hctx),
936                         &hctx->delay_work, msecs_to_jiffies(msecs));
937 }
938 EXPORT_SYMBOL(blk_mq_delay_queue);
939
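/*
 * Add a request to the software queue's list and mark the ctx as having
 * pending work. The caller must hold ctx->lock.
 */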
940 static void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx,
941                                     struct request *rq, bool at_head)
942 {
943         struct blk_mq_ctx *ctx = rq->mq_ctx;
944
945         trace_block_rq_insert(hctx->queue, rq);
946
947         if (at_head)
948                 list_add(&rq->queuelist, &ctx->rq_list);
949         else
950                 list_add_tail(&rq->queuelist, &ctx->rq_list);
951
952         blk_mq_hctx_mark_pending(hctx, ctx);
953 }
954
955 void blk_mq_insert_request(struct request *rq, bool at_head, bool run_queue,
956                 bool async)
957 {
958         struct request_queue *q = rq->q;
959         struct blk_mq_hw_ctx *hctx;
960         struct blk_mq_ctx *ctx = rq->mq_ctx, *current_ctx;
961
962         current_ctx = blk_mq_get_ctx(q);
963         if (!cpu_online(ctx->cpu))
964                 rq->mq_ctx = ctx = current_ctx;
965
966         hctx = q->mq_ops->map_queue(q, ctx->cpu);
967
968         spin_lock(&ctx->lock);
969         __blk_mq_insert_request(hctx, rq, at_head);
970         spin_unlock(&ctx->lock);
971
972         if (run_queue)
973                 blk_mq_run_hw_queue(hctx, async);
974
975         blk_mq_put_ctx(current_ctx);
976 }
977
978 static void blk_mq_insert_requests(struct request_queue *q,
979                                      struct blk_mq_ctx *ctx,
980                                      struct list_head *list,
981                                      int depth,
982                                      bool from_schedule)
983
984 {
985         struct blk_mq_hw_ctx *hctx;
986         struct blk_mq_ctx *current_ctx;
987
988         trace_block_unplug(q, depth, !from_schedule);
989
990         current_ctx = blk_mq_get_ctx(q);
991
992         if (!cpu_online(ctx->cpu))
993                 ctx = current_ctx;
994         hctx = q->mq_ops->map_queue(q, ctx->cpu);
995
996         /*
997          * Preemption doesn't flush the plug list, so it's possible that
998          * ctx->cpu is offline now.
999          */
1000         spin_lock(&ctx->lock);
1001         while (!list_empty(list)) {
1002                 struct request *rq;
1003
1004                 rq = list_first_entry(list, struct request, queuelist);
1005                 list_del_init(&rq->queuelist);
1006                 rq->mq_ctx = ctx;
1007                 __blk_mq_insert_request(hctx, rq, false);
1008         }
1009         spin_unlock(&ctx->lock);
1010
1011         blk_mq_run_hw_queue(hctx, from_schedule);
1012         blk_mq_put_ctx(current_ctx);
1013 }
1014
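/* list_sort() comparator: order plugged requests by ctx, then by sector. */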
1015 static int plug_ctx_cmp(void *priv, struct list_head *a, struct list_head *b)
1016 {
1017         struct request *rqa = container_of(a, struct request, queuelist);
1018         struct request *rqb = container_of(b, struct request, queuelist);
1019
1020         return !(rqa->mq_ctx < rqb->mq_ctx ||
1021                  (rqa->mq_ctx == rqb->mq_ctx &&
1022                   blk_rq_pos(rqa) < blk_rq_pos(rqb)));
1023 }
1024
1025 void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule)
1026 {
1027         struct blk_mq_ctx *this_ctx;
1028         struct request_queue *this_q;
1029         struct request *rq;
1030         LIST_HEAD(list);
1031         LIST_HEAD(ctx_list);
1032         unsigned int depth;
1033
1034         list_splice_init(&plug->mq_list, &list);
1035
1036         list_sort(NULL, &list, plug_ctx_cmp);
1037
1038         this_q = NULL;
1039         this_ctx = NULL;
1040         depth = 0;
1041
1042         while (!list_empty(&list)) {
1043                 rq = list_entry_rq(list.next);
1044                 list_del_init(&rq->queuelist);
1045                 BUG_ON(!rq->q);
1046                 if (rq->mq_ctx != this_ctx) {
1047                         if (this_ctx) {
1048                                 blk_mq_insert_requests(this_q, this_ctx,
1049                                                         &ctx_list, depth,
1050                                                         from_schedule);
1051                         }
1052
1053                         this_ctx = rq->mq_ctx;
1054                         this_q = rq->q;
1055                         depth = 0;
1056                 }
1057
1058                 depth++;
1059                 list_add_tail(&rq->queuelist, &ctx_list);
1060         }
1061
1062         /*
1063          * If 'this_ctx' is set, we know we have entries to complete
1064          * on 'ctx_list'. Do those.
1065          */
1066         if (this_ctx) {
1067                 blk_mq_insert_requests(this_q, this_ctx, &ctx_list, depth,
1068                                        from_schedule);
1069         }
1070 }
1071
1072 static void blk_mq_bio_to_request(struct request *rq, struct bio *bio)
1073 {
1074         init_request_from_bio(rq, bio);
1075
1076         if (blk_do_io_stat(rq))
1077                 blk_account_io_start(rq, 1);
1078 }
1079
1080 static inline bool hctx_allow_merges(struct blk_mq_hw_ctx *hctx)
1081 {
1082         return (hctx->flags & BLK_MQ_F_SHOULD_MERGE) &&
1083                 !blk_queue_nomerges(hctx->queue);
1084 }
1085
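/*
 * Either merge the bio into an existing request on the software queue,
 * or turn it into a request and insert it. Returns true if the bio was
 * merged (and the passed-in request freed), false if it was inserted.
 */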
1086 static inline bool blk_mq_merge_queue_io(struct blk_mq_hw_ctx *hctx,
1087                                          struct blk_mq_ctx *ctx,
1088                                          struct request *rq, struct bio *bio)
1089 {
1090         if (!hctx_allow_merges(hctx)) {
1091                 blk_mq_bio_to_request(rq, bio);
1092                 spin_lock(&ctx->lock);
1093 insert_rq:
1094                 __blk_mq_insert_request(hctx, rq, false);
1095                 spin_unlock(&ctx->lock);
1096                 return false;
1097         } else {
1098                 struct request_queue *q = hctx->queue;
1099
1100                 spin_lock(&ctx->lock);
1101                 if (!blk_mq_attempt_merge(q, ctx, bio)) {
1102                         blk_mq_bio_to_request(rq, bio);
1103                         goto insert_rq;
1104                 }
1105
1106                 spin_unlock(&ctx->lock);
1107                 __blk_mq_free_request(hctx, ctx, rq);
1108                 return true;
1109         }
1110 }
1111
1112 struct blk_map_ctx {
1113         struct blk_mq_hw_ctx *hctx;
1114         struct blk_mq_ctx *ctx;
1115 };
1116
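/*
 * Map a bio to a hardware context and allocate a request for it. The
 * first attempt is atomic; if it fails, the hardware queue is run and
 * the allocation retried, this time allowing the tag wait to sleep.
 */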
1117 static struct request *blk_mq_map_request(struct request_queue *q,
1118                                           struct bio *bio,
1119                                           struct blk_map_ctx *data)
1120 {
1121         struct blk_mq_hw_ctx *hctx;
1122         struct blk_mq_ctx *ctx;
1123         struct request *rq;
1124         int rw = bio_data_dir(bio);
1125         struct blk_mq_alloc_data alloc_data;
1126
1127         if (unlikely(blk_mq_queue_enter(q))) {
1128                 bio_endio(bio, -EIO);
1129                 return NULL;
1130         }
1131
1132         ctx = blk_mq_get_ctx(q);
1133         hctx = q->mq_ops->map_queue(q, ctx->cpu);
1134
1135         if (rw_is_sync(bio->bi_rw))
1136                 rw |= REQ_SYNC;
1137
1138         trace_block_getrq(q, bio, rw);
1139         blk_mq_set_alloc_data(&alloc_data, q, GFP_ATOMIC, false, ctx,
1140                         hctx);
1141         rq = __blk_mq_alloc_request(&alloc_data, rw);
1142         if (unlikely(!rq)) {
1143                 __blk_mq_run_hw_queue(hctx);
1144                 blk_mq_put_ctx(ctx);
1145                 trace_block_sleeprq(q, bio, rw);
1146
1147                 ctx = blk_mq_get_ctx(q);
1148                 hctx = q->mq_ops->map_queue(q, ctx->cpu);
1149                 blk_mq_set_alloc_data(&alloc_data, q,
1150                                 __GFP_WAIT|GFP_ATOMIC, false, ctx, hctx);
1151                 rq = __blk_mq_alloc_request(&alloc_data, rw);
1152                 ctx = alloc_data.ctx;
1153                 hctx = alloc_data.hctx;
1154         }
1155
1156         hctx->queued++;
1157         data->hctx = hctx;
1158         data->ctx = ctx;
1159         return rq;
1160 }
1161
1162 /*
1163  * Multiple hardware queue variant. This will not use per-process plugs,
1164  * but will attempt to bypass the hctx queueing if we can go straight to
1165  * hardware for SYNC IO.
1166  */
1167 static void blk_mq_make_request(struct request_queue *q, struct bio *bio)
1168 {
1169         const int is_sync = rw_is_sync(bio->bi_rw);
1170         const int is_flush_fua = bio->bi_rw & (REQ_FLUSH | REQ_FUA);
1171         struct blk_map_ctx data;
1172         struct request *rq;
1173
1174         blk_queue_bounce(q, &bio);
1175
1176         if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) {
1177                 bio_endio(bio, -EIO);
1178                 return;
1179         }
1180
1181         rq = blk_mq_map_request(q, bio, &data);
1182         if (unlikely(!rq))
1183                 return;
1184
1185         if (unlikely(is_flush_fua)) {
1186                 blk_mq_bio_to_request(rq, bio);
1187                 blk_insert_flush(rq);
1188                 goto run_queue;
1189         }
1190
1191         /*
1192          * If the driver supports deferred issue based on 'last', then
1193          * queue it up like normal, since we can potentially save some
1194          * CPU this way.
1195          */
1196         if (is_sync && !(data.hctx->flags & BLK_MQ_F_DEFER_ISSUE)) {
1197                 struct blk_mq_queue_data bd = {
1198                         .rq = rq,
1199                         .list = NULL,
1200                         .last = 1
1201                 };
1202                 int ret;
1203
1204                 blk_mq_bio_to_request(rq, bio);
1205
1206                 /*
1207                  * For a successful queue, we are done. For an error, kill the
1208                  * request. For anything else (busy), just add it to our list
1209                  * as we previously would have done.
1210                  */
1211                 ret = q->mq_ops->queue_rq(data.hctx, &bd);
1212                 if (ret == BLK_MQ_RQ_QUEUE_OK)
1213                         goto done;
1214                 else {
1215                         __blk_mq_requeue_request(rq);
1216
1217                         if (ret == BLK_MQ_RQ_QUEUE_ERROR) {
1218                                 rq->errors = -EIO;
1219                                 blk_mq_end_request(rq, rq->errors);
1220                                 goto done;
1221                         }
1222                 }
1223         }
1224
1225         if (!blk_mq_merge_queue_io(data.hctx, data.ctx, rq, bio)) {
1226                 /*
1227                  * For a SYNC request, send it to the hardware immediately. For
1228                  * an ASYNC request, just ensure that we run it later on. The
1229                  * latter allows for merging opportunities and more efficient
1230                  * dispatching.
1231                  */
1232 run_queue:
1233                 blk_mq_run_hw_queue(data.hctx, !is_sync || is_flush_fua);
1234         }
1235 done:
1236         blk_mq_put_ctx(data.ctx);
1237 }
1238
1239 /*
1240  * Single hardware queue variant. This will attempt to use any per-process
1241  * plug for merging and IO deferral.
1242  */
1243 static void blk_sq_make_request(struct request_queue *q, struct bio *bio)
1244 {
1245         const int is_sync = rw_is_sync(bio->bi_rw);
1246         const int is_flush_fua = bio->bi_rw & (REQ_FLUSH | REQ_FUA);
1247         unsigned int use_plug, request_count = 0;
1248         struct blk_map_ctx data;
1249         struct request *rq;
1250
1251         /*
1252          * Only use the per-process plug for async, non-flush IO. Sync and
1253          * flush/FUA requests go straight to the hardware queue.
1254          */
1255         use_plug = !is_flush_fua && !is_sync;
1256
1257         blk_queue_bounce(q, &bio);
1258
1259         if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) {
1260                 bio_endio(bio, -EIO);
1261                 return;
1262         }
1263
1264         if (use_plug && !blk_queue_nomerges(q) &&
1265             blk_attempt_plug_merge(q, bio, &request_count))
1266                 return;
1267
1268         rq = blk_mq_map_request(q, bio, &data);
1269         if (unlikely(!rq))
1270                 return;
1271
1272         if (unlikely(is_flush_fua)) {
1273                 blk_mq_bio_to_request(rq, bio);
1274                 blk_insert_flush(rq);
1275                 goto run_queue;
1276         }
1277
1278         /*
1279          * If a task plug exists, utilize it to temporarily store requests
1280          * until the task is either done or scheduled away. The plug list
1281          * is completely lockless.
1282          */
1283         if (use_plug) {
1284                 struct blk_plug *plug = current->plug;
1285
1286                 if (plug) {
1287                         blk_mq_bio_to_request(rq, bio);
1288                         if (list_empty(&plug->mq_list))
1289                                 trace_block_plug(q);
1290                         else if (request_count >= BLK_MAX_REQUEST_COUNT) {
1291                                 blk_flush_plug_list(plug, false);
1292                                 trace_block_plug(q);
1293                         }
1294                         list_add_tail(&rq->queuelist, &plug->mq_list);
1295                         blk_mq_put_ctx(data.ctx);
1296                         return;
1297                 }
1298         }
1299
1300         if (!blk_mq_merge_queue_io(data.hctx, data.ctx, rq, bio)) {
1301                 /*
1302                  * For a SYNC request, send it to the hardware immediately. For
1303                  * an ASYNC request, just ensure that we run it later on. The
1304                  * latter allows for merging opportunities and more efficient
1305                  * dispatching.
1306                  */
1307 run_queue:
1308                 blk_mq_run_hw_queue(data.hctx, !is_sync || is_flush_fua);
1309         }
1310
1311         blk_mq_put_ctx(data.ctx);
1312 }
1313
1314 /*
1315  * Default mapping to a software queue, since we use one per CPU.
1316  */
1317 struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *q, const int cpu)
1318 {
1319         return q->queue_hw_ctx[q->mq_map[cpu]];
1320 }
1321 EXPORT_SYMBOL(blk_mq_map_queue);
1322
1323 static void blk_mq_free_rq_map(struct blk_mq_tag_set *set,
1324                 struct blk_mq_tags *tags, unsigned int hctx_idx)
1325 {
1326         struct page *page;
1327
1328         if (tags->rqs && set->ops->exit_request) {
1329                 int i;
1330
1331                 for (i = 0; i < tags->nr_tags; i++) {
1332                         if (!tags->rqs[i])
1333                                 continue;
1334                         set->ops->exit_request(set->driver_data, tags->rqs[i],
1335                                                 hctx_idx, i);
1336                         tags->rqs[i] = NULL;
1337                 }
1338         }
1339
1340         while (!list_empty(&tags->page_list)) {
1341                 page = list_first_entry(&tags->page_list, struct page, lru);
1342                 list_del_init(&page->lru);
1343                 __free_pages(page, page->private);
1344         }
1345
1346         kfree(tags->rqs);
1347
1348         blk_mq_free_tags(tags);
1349 }
1350
1351 static size_t order_to_size(unsigned int order)
1352 {
1353         return (size_t)PAGE_SIZE << order;
1354 }
1355
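/*
 * Allocate the tags and preallocated requests for one hardware queue.
 * Requests are carved out of higher-order page allocations; each slot is
 * sizeof(struct request) plus the driver payload, rounded up to the
 * cache line size.
 */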
1356 static struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set,
1357                 unsigned int hctx_idx)
1358 {
1359         struct blk_mq_tags *tags;
1360         unsigned int i, j, entries_per_page, max_order = 4;
1361         size_t rq_size, left;
1362
1363         tags = blk_mq_init_tags(set->queue_depth, set->reserved_tags,
1364                                 set->numa_node);
1365         if (!tags)
1366                 return NULL;
1367
1368         INIT_LIST_HEAD(&tags->page_list);
1369
1370         tags->rqs = kzalloc_node(set->queue_depth * sizeof(struct request *),
1371                                  GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY,
1372                                  set->numa_node);
1373         if (!tags->rqs) {
1374                 blk_mq_free_tags(tags);
1375                 return NULL;
1376         }
1377
1378         /*
1379          * rq_size is the size of the request plus driver payload, rounded
1380          * to the cacheline size
1381          */
1382         rq_size = round_up(sizeof(struct request) + set->cmd_size,
1383                                 cache_line_size());
1384         left = rq_size * set->queue_depth;
1385
1386         for (i = 0; i < set->queue_depth; ) {
1387                 int this_order = max_order;
1388                 struct page *page;
1389                 int to_do;
1390                 void *p;
1391
1392                 while (left < order_to_size(this_order - 1) && this_order)
1393                         this_order--;
1394
1395                 do {
1396                         page = alloc_pages_node(set->numa_node,
1397                                 GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY,
1398                                 this_order);
1399                         if (page)
1400                                 break;
1401                         if (!this_order--)
1402                                 break;
1403                         if (order_to_size(this_order) < rq_size)
1404                                 break;
1405                 } while (1);
1406
1407                 if (!page)
1408                         goto fail;
1409
1410                 page->private = this_order;
1411                 list_add_tail(&page->lru, &tags->page_list);
1412
1413                 p = page_address(page);
1414                 entries_per_page = order_to_size(this_order) / rq_size;
1415                 to_do = min(entries_per_page, set->queue_depth - i);
1416                 left -= to_do * rq_size;
1417                 for (j = 0; j < to_do; j++) {
1418                         tags->rqs[i] = p;
1419                         tags->rqs[i]->atomic_flags = 0;
1420                         tags->rqs[i]->cmd_flags = 0;
1421                         if (set->ops->init_request) {
1422                                 if (set->ops->init_request(set->driver_data,
1423                                                 tags->rqs[i], hctx_idx, i,
1424                                                 set->numa_node)) {
1425                                         tags->rqs[i] = NULL;
1426                                         goto fail;
1427                                 }
1428                         }
1429
1430                         p += rq_size;
1431                         i++;
1432                 }
1433         }
1434
1435         return tags;
1436
1437 fail:
1438         blk_mq_free_rq_map(set, tags, hctx_idx);
1439         return NULL;
1440 }
1441
1442 static void blk_mq_free_bitmap(struct blk_mq_ctxmap *bitmap)
1443 {
1444         kfree(bitmap->map);
1445 }
1446
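/*
 * Allocate the ctx_map used to mark software queues with pending work:
 * one bit per possible CPU, split into words of bits_per_word bits.
 */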
1447 static int blk_mq_alloc_bitmap(struct blk_mq_ctxmap *bitmap, int node)
1448 {
1449         unsigned int bpw = 8, total, num_maps, i;
1450
1451         bitmap->bits_per_word = bpw;
1452
1453         num_maps = ALIGN(nr_cpu_ids, bpw) / bpw;
1454         bitmap->map = kzalloc_node(num_maps * sizeof(struct blk_align_bitmap),
1455                                         GFP_KERNEL, node);
1456         if (!bitmap->map)
1457                 return -ENOMEM;
1458
1459         bitmap->map_size = num_maps;
1460
1461         total = nr_cpu_ids;
1462         for (i = 0; i < num_maps; i++) {
1463                 bitmap->map[i].depth = min(total, bitmap->bits_per_word);
1464                 total -= bitmap->map[i].depth;
1465         }
1466
1467         return 0;
1468 }
1469
1470 static int blk_mq_hctx_cpu_offline(struct blk_mq_hw_ctx *hctx, int cpu)
1471 {
1472         struct request_queue *q = hctx->queue;
1473         struct blk_mq_ctx *ctx;
1474         LIST_HEAD(tmp);
1475
1476         /*
1477          * Move ctx entries to new CPU, if this one is going away.
1478          */
1479         ctx = __blk_mq_get_ctx(q, cpu);
1480
1481         spin_lock(&ctx->lock);
1482         if (!list_empty(&ctx->rq_list)) {
1483                 list_splice_init(&ctx->rq_list, &tmp);
1484                 blk_mq_hctx_clear_pending(hctx, ctx);
1485         }
1486         spin_unlock(&ctx->lock);
1487
1488         if (list_empty(&tmp))
1489                 return NOTIFY_OK;
1490
1491         ctx = blk_mq_get_ctx(q);
1492         spin_lock(&ctx->lock);
1493
1494         while (!list_empty(&tmp)) {
1495                 struct request *rq;
1496
1497                 rq = list_first_entry(&tmp, struct request, queuelist);
1498                 rq->mq_ctx = ctx;
1499                 list_move_tail(&rq->queuelist, &ctx->rq_list);
1500         }
1501
1502         hctx = q->mq_ops->map_queue(q, ctx->cpu);
1503         blk_mq_hctx_mark_pending(hctx, ctx);
1504
1505         spin_unlock(&ctx->lock);
1506
1507         blk_mq_run_hw_queue(hctx, true);
1508         blk_mq_put_ctx(ctx);
1509         return NOTIFY_OK;
1510 }
1511
1512 static int blk_mq_hctx_cpu_online(struct blk_mq_hw_ctx *hctx, int cpu)
1513 {
1514         struct request_queue *q = hctx->queue;
1515         struct blk_mq_tag_set *set = q->tag_set;
1516
1517         if (set->tags[hctx->queue_num])
1518                 return NOTIFY_OK;
1519
1520         set->tags[hctx->queue_num] = blk_mq_init_rq_map(set, hctx->queue_num);
1521         if (!set->tags[hctx->queue_num])
1522                 return NOTIFY_STOP;
1523
1524         hctx->tags = set->tags[hctx->queue_num];
1525         return NOTIFY_OK;
1526 }
1527
1528 static int blk_mq_hctx_notify(void *data, unsigned long action,
1529                               unsigned int cpu)
1530 {
1531         struct blk_mq_hw_ctx *hctx = data;
1532
1533         if (action == CPU_DEAD || action == CPU_DEAD_FROZEN)
1534                 return blk_mq_hctx_cpu_offline(hctx, cpu);
1535         else if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN)
1536                 return blk_mq_hctx_cpu_online(hctx, cpu);
1537
1538         return NOTIFY_OK;
1539 }
1540
1541 static void blk_mq_exit_hctx(struct request_queue *q,
1542                 struct blk_mq_tag_set *set,
1543                 struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
1544 {
1545         unsigned flush_start_tag = set->queue_depth;
1546
1547         blk_mq_tag_idle(hctx);
1548
1549         if (set->ops->exit_request)
1550                 set->ops->exit_request(set->driver_data,
1551                                        hctx->fq->flush_rq, hctx_idx,
1552                                        flush_start_tag + hctx_idx);
1553
1554         if (set->ops->exit_hctx)
1555                 set->ops->exit_hctx(hctx, hctx_idx);
1556
1557         blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier);
1558         blk_free_flush_queue(hctx->fq);
1559         kfree(hctx->ctxs);
1560         blk_mq_free_bitmap(&hctx->ctx_map);
1561 }
1562
1563 static void blk_mq_exit_hw_queues(struct request_queue *q,
1564                 struct blk_mq_tag_set *set, int nr_queue)
1565 {
1566         struct blk_mq_hw_ctx *hctx;
1567         unsigned int i;
1568
1569         queue_for_each_hw_ctx(q, hctx, i) {
1570                 if (i == nr_queue)
1571                         break;
1572                 blk_mq_exit_hctx(q, set, hctx, i);
1573         }
1574 }
1575
1576 static void blk_mq_free_hw_queues(struct request_queue *q,
1577                 struct blk_mq_tag_set *set)
1578 {
1579         struct blk_mq_hw_ctx *hctx;
1580         unsigned int i;
1581
1582         queue_for_each_hw_ctx(q, hctx, i) {
1583                 free_cpumask_var(hctx->cpumask);
1584                 kfree(hctx);
1585         }
1586 }
1587
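/*
 * Set up a single hardware context: work items, dispatch list, CPU
 * notifier, software queue map, flush queue and the driver's per-hctx
 * and flush request state. Everything is unwound on failure.
 */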
1588 static int blk_mq_init_hctx(struct request_queue *q,
1589                 struct blk_mq_tag_set *set,
1590                 struct blk_mq_hw_ctx *hctx, unsigned hctx_idx)
1591 {
1592         int node;
1593         unsigned flush_start_tag = set->queue_depth;
1594
1595         node = hctx->numa_node;
1596         if (node == NUMA_NO_NODE)
1597                 node = hctx->numa_node = set->numa_node;
1598
1599         INIT_DELAYED_WORK(&hctx->run_work, blk_mq_run_work_fn);
1600         INIT_DELAYED_WORK(&hctx->delay_work, blk_mq_delay_work_fn);
1601         spin_lock_init(&hctx->lock);
1602         INIT_LIST_HEAD(&hctx->dispatch);
1603         hctx->queue = q;
1604         hctx->queue_num = hctx_idx;
1605         hctx->flags = set->flags;
1606         hctx->cmd_size = set->cmd_size;
1607
1608         blk_mq_init_cpu_notifier(&hctx->cpu_notifier,
1609                                         blk_mq_hctx_notify, hctx);
1610         blk_mq_register_cpu_notifier(&hctx->cpu_notifier);
1611
1612         hctx->tags = set->tags[hctx_idx];
1613
1614         /*
1615          * Allocate space for all possible cpus to avoid allocation at
1616          * runtime
1617          */
1618         hctx->ctxs = kmalloc_node(nr_cpu_ids * sizeof(void *),
1619                                         GFP_KERNEL, node);
1620         if (!hctx->ctxs)
1621                 goto unregister_cpu_notifier;
1622
1623         if (blk_mq_alloc_bitmap(&hctx->ctx_map, node))
1624                 goto free_ctxs;
1625
1626         hctx->nr_ctx = 0;
1627
1628         if (set->ops->init_hctx &&
1629             set->ops->init_hctx(hctx, set->driver_data, hctx_idx))
1630                 goto free_bitmap;
1631
1632         hctx->fq = blk_alloc_flush_queue(q, hctx->numa_node, set->cmd_size);
1633         if (!hctx->fq)
1634                 goto exit_hctx;
1635
1636         if (set->ops->init_request &&
1637             set->ops->init_request(set->driver_data,
1638                                    hctx->fq->flush_rq, hctx_idx,
1639                                    flush_start_tag + hctx_idx, node))
1640                 goto free_fq;
1641
1642         return 0;
1643
1644  free_fq:
1645         blk_free_flush_queue(hctx->fq);
1646  exit_hctx:
1647         if (set->ops->exit_hctx)
1648                 set->ops->exit_hctx(hctx, hctx_idx);
1649  free_bitmap:
1650         blk_mq_free_bitmap(&hctx->ctx_map);
1651  free_ctxs:
1652         kfree(hctx->ctxs);
1653  unregister_cpu_notifier:
1654         blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier);
1655
1656         return -1;
1657 }
1658
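/*
 * Initialize all hardware queues; on failure, tear down the queues that
 * were already set up and return non-zero.
 */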
1659 static int blk_mq_init_hw_queues(struct request_queue *q,
1660                 struct blk_mq_tag_set *set)
1661 {
1662         struct blk_mq_hw_ctx *hctx;
1663         unsigned int i;
1664
1665         /*
1666          * Initialize hardware queues
1667          */
1668         queue_for_each_hw_ctx(q, hctx, i) {
1669                 if (blk_mq_init_hctx(q, set, hctx, i))
1670                         break;
1671         }
1672
1673         if (i == q->nr_hw_queues)
1674                 return 0;
1675
1676         /*
1677          * Init failed
1678          */
1679         blk_mq_exit_hw_queues(q, set, i);
1680
1681         return 1;
1682 }
1683
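/*
 * Initialize the per-CPU software contexts and map each online CPU to
 * its hardware queue via mq_ops->map_queue.
 */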
1684 static void blk_mq_init_cpu_queues(struct request_queue *q,
1685                                    unsigned int nr_hw_queues)
1686 {
1687         unsigned int i;
1688
1689         for_each_possible_cpu(i) {
1690                 struct blk_mq_ctx *__ctx = per_cpu_ptr(q->queue_ctx, i);
1691                 struct blk_mq_hw_ctx *hctx;
1692
1693                 memset(__ctx, 0, sizeof(*__ctx));
1694                 __ctx->cpu = i;
1695                 spin_lock_init(&__ctx->lock);
1696                 INIT_LIST_HEAD(&__ctx->rq_list);
1697                 __ctx->queue = q;
1698
1699                 /* If the cpu isn't online, it is mapped to the first hctx */
1700                 if (!cpu_online(i))
1701                         continue;
1702
1703                 hctx = q->mq_ops->map_queue(q, i);
1704                 cpumask_set_cpu(i, hctx->cpumask);
1705                 hctx->nr_ctx++;
1706
1707                 /*
1708                  * Set local node, IFF we have more than one hw queue. If
1709                  * not, we remain on the home node of the device
1710                  */
1711                 if (nr_hw_queues > 1 && hctx->numa_node == NUMA_NO_NODE)
1712                         hctx->numa_node = cpu_to_node(i);
1713         }
1714 }
1715
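/*
 * Rebuild the software to hardware queue mapping: reset every hctx's
 * cpumask and ctx count, assign each online CPU's ctx to its hctx, free
 * the tag maps of hardware queues that end up with no software queues,
 * and reset the round-robin state used for running the queues.
 */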
1716 static void blk_mq_map_swqueue(struct request_queue *q)
1717 {
1718         unsigned int i;
1719         struct blk_mq_hw_ctx *hctx;
1720         struct blk_mq_ctx *ctx;
1721
1722         queue_for_each_hw_ctx(q, hctx, i) {
1723                 cpumask_clear(hctx->cpumask);
1724                 hctx->nr_ctx = 0;
1725         }
1726
1727         /*
1728          * Map software to hardware queues
1729          */
1730         queue_for_each_ctx(q, ctx, i) {
1731                 /* If the cpu isn't online, it is mapped to the first hctx */
1732                 if (!cpu_online(i))
1733                         continue;
1734
1735                 hctx = q->mq_ops->map_queue(q, i);
1736                 cpumask_set_cpu(i, hctx->cpumask);
1737                 ctx->index_hw = hctx->nr_ctx;
1738                 hctx->ctxs[hctx->nr_ctx++] = ctx;
1739         }
1740
1741         queue_for_each_hw_ctx(q, hctx, i) {
1742                 /*
1743                  * If no software queues are mapped to this hardware queue,
1744                  * disable it and free the request entries.
1745                  */
1746                 if (!hctx->nr_ctx) {
1747                         struct blk_mq_tag_set *set = q->tag_set;
1748
1749                         if (set->tags[i]) {
1750                                 blk_mq_free_rq_map(set, set->tags[i], i);
1751                                 set->tags[i] = NULL;
1752                                 hctx->tags = NULL;
1753                         }
1754                         continue;
1755                 }
1756
1757                 /*
1758                  * Initialize batch roundrobin counts
1759                  */
1760                 hctx->next_cpu = cpumask_first(hctx->cpumask);
1761                 hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;
1762         }
1763 }
1764
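/*
 * Update the BLK_MQ_F_TAG_SHARED flag on every hardware queue of every
 * request queue attached to this tag set, freezing each queue while the
 * flag is changed.
 */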
1765 static void blk_mq_update_tag_set_depth(struct blk_mq_tag_set *set)
1766 {
1767         struct blk_mq_hw_ctx *hctx;
1768         struct request_queue *q;
1769         bool shared;
1770         int i;
1771
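        /* Tags are shared iff more than one queue is attached to the set. */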
1772         if (set->tag_list.next == set->tag_list.prev)
1773                 shared = false;
1774         else
1775                 shared = true;
1776
1777         list_for_each_entry(q, &set->tag_list, tag_set_list) {
1778                 blk_mq_freeze_queue(q);
1779
1780                 queue_for_each_hw_ctx(q, hctx, i) {
1781                         if (shared)
1782                                 hctx->flags |= BLK_MQ_F_TAG_SHARED;
1783                         else
1784                                 hctx->flags &= ~BLK_MQ_F_TAG_SHARED;
1785                 }
1786                 blk_mq_unfreeze_queue(q);
1787         }
1788 }
1789
1790 static void blk_mq_del_queue_tag_set(struct request_queue *q)
1791 {
1792         struct blk_mq_tag_set *set = q->tag_set;
1793
1794         mutex_lock(&set->tag_list_lock);
1795         list_del_init(&q->tag_set_list);
1796         blk_mq_update_tag_set_depth(set);
1797         mutex_unlock(&set->tag_list_lock);
1798 }
1799
1800 static void blk_mq_add_queue_tag_set(struct blk_mq_tag_set *set,
1801                                      struct request_queue *q)
1802 {
1803         q->tag_set = set;
1804
1805         mutex_lock(&set->tag_list_lock);
1806         list_add_tail(&q->tag_set_list, &set->tag_list);
1807         blk_mq_update_tag_set_depth(set);
1808         mutex_unlock(&set->tag_list_lock);
1809 }
1810
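/*
 * Allocate and initialize a request queue for the given tag set: per-CPU
 * software contexts, hardware contexts, the CPU to queue map and the
 * request_queue itself, then wire up the make_request and timeout
 * handling and build the software/hardware queue mapping. Returns
 * ERR_PTR(-ENOMEM) on failure.
 */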
1811 struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
1812 {
1813         struct blk_mq_hw_ctx **hctxs;
1814         struct blk_mq_ctx __percpu *ctx;
1815         struct request_queue *q;
1816         unsigned int *map;
1817         int i;
1818
1819         ctx = alloc_percpu(struct blk_mq_ctx);
1820         if (!ctx)
1821                 return ERR_PTR(-ENOMEM);
1822
1823         hctxs = kmalloc_node(set->nr_hw_queues * sizeof(*hctxs), GFP_KERNEL,
1824                         set->numa_node);
1825
1826         if (!hctxs)
1827                 goto err_percpu;
1828
1829         map = blk_mq_make_queue_map(set);
1830         if (!map)
1831                 goto err_map;
1832
1833         for (i = 0; i < set->nr_hw_queues; i++) {
1834                 int node = blk_mq_hw_queue_to_node(map, i);
1835
1836                 hctxs[i] = kzalloc_node(sizeof(struct blk_mq_hw_ctx),
1837                                         GFP_KERNEL, node);
1838                 if (!hctxs[i])
1839                         goto err_hctxs;
1840
1841                 if (!zalloc_cpumask_var_node(&hctxs[i]->cpumask, GFP_KERNEL,
1842                                                 node))
1843                         goto err_hctxs;
1844
1845                 atomic_set(&hctxs[i]->nr_active, 0);
1846                 hctxs[i]->numa_node = node;
1847                 hctxs[i]->queue_num = i;
1848         }
1849
1850         q = blk_alloc_queue_node(GFP_KERNEL, set->numa_node);
1851         if (!q)
1852                 goto err_hctxs;
1853
1854         /*
1855          * Init percpu_ref in atomic mode so that it's faster to shutdown.
1856          * See blk_register_queue() for details.
1857          */
1858         if (percpu_ref_init(&q->mq_usage_counter, blk_mq_usage_counter_release,
1859                             PERCPU_REF_INIT_ATOMIC, GFP_KERNEL))
1860                 goto err_map;
1861
1862         setup_timer(&q->timeout, blk_mq_rq_timer, (unsigned long) q);
1863         blk_queue_rq_timeout(q, 30000);
1864
1865         q->nr_queues = nr_cpu_ids;
1866         q->nr_hw_queues = set->nr_hw_queues;
1867         q->mq_map = map;
1868
1869         q->queue_ctx = ctx;
1870         q->queue_hw_ctx = hctxs;
1871
1872         q->mq_ops = set->ops;
1873         q->queue_flags |= QUEUE_FLAG_MQ_DEFAULT;
1874
1875         if (!(set->flags & BLK_MQ_F_SG_MERGE))
1876                 q->queue_flags |= 1 << QUEUE_FLAG_NO_SG_MERGE;
1877
1878         q->sg_reserved_size = INT_MAX;
1879
1880         INIT_WORK(&q->requeue_work, blk_mq_requeue_work);
1881         INIT_LIST_HEAD(&q->requeue_list);
1882         spin_lock_init(&q->requeue_lock);
1883
1884         if (q->nr_hw_queues > 1)
1885                 blk_queue_make_request(q, blk_mq_make_request);
1886         else
1887                 blk_queue_make_request(q, blk_sq_make_request);
1888
1889         if (set->timeout)
1890                 blk_queue_rq_timeout(q, set->timeout);
1891
1892         /*
1893          * Do this after blk_queue_make_request() overrides it...
1894          */
1895         q->nr_requests = set->queue_depth;
1896
1897         if (set->ops->complete)
1898                 blk_queue_softirq_done(q, set->ops->complete);
1899
1900         blk_mq_init_cpu_queues(q, set->nr_hw_queues);
1901
1902         if (blk_mq_init_hw_queues(q, set))
1903                 goto err_hw;
1904
1905         mutex_lock(&all_q_mutex);
1906         list_add_tail(&q->all_q_node, &all_q_list);
1907         mutex_unlock(&all_q_mutex);
1908
1909         blk_mq_add_queue_tag_set(set, q);
1910
1911         blk_mq_map_swqueue(q);
1912
1913         return q;
1914
1915 err_hw:
1916         blk_cleanup_queue(q);
1917 err_hctxs:
1918         kfree(map);
1919         for (i = 0; i < set->nr_hw_queues; i++) {
1920                 if (!hctxs[i])
1921                         break;
1922                 free_cpumask_var(hctxs[i]->cpumask);
1923                 kfree(hctxs[i]);
1924         }
1925 err_map:
1926         kfree(hctxs);
1927 err_percpu:
1928         free_percpu(ctx);
1929         return ERR_PTR(-ENOMEM);
1930 }
1931 EXPORT_SYMBOL(blk_mq_init_queue);
1932
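/*
 * Counterpart of blk_mq_init_queue(): detach the queue from its tag set,
 * tear down and free the hardware queues, release the usage counter and
 * free the per-CPU contexts and mappings.
 */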
1933 void blk_mq_free_queue(struct request_queue *q)
1934 {
1935         struct blk_mq_tag_set   *set = q->tag_set;
1936
1937         blk_mq_del_queue_tag_set(q);
1938
1939         blk_mq_exit_hw_queues(q, set, set->nr_hw_queues);
1940         blk_mq_free_hw_queues(q, set);
1941
1942         percpu_ref_exit(&q->mq_usage_counter);
1943
1944         free_percpu(q->queue_ctx);
1945         kfree(q->queue_hw_ctx);
1946         kfree(q->mq_map);
1947
1948         q->queue_ctx = NULL;
1949         q->queue_hw_ctx = NULL;
1950         q->mq_map = NULL;
1951
1952         mutex_lock(&all_q_mutex);
1953         list_del_init(&q->all_q_node);
1954         mutex_unlock(&all_q_mutex);
1955 }
1956
1957 /* Basically redo blk_mq_init_queue with the queue frozen */
1958 static void blk_mq_queue_reinit(struct request_queue *q)
1959 {
1960         WARN_ON_ONCE(!q->mq_freeze_depth);
1961
1962         blk_mq_sysfs_unregister(q);
1963
1964         blk_mq_update_queue_map(q->mq_map, q->nr_hw_queues);
1965
1966         /*
1967          * Redo blk_mq_init_cpu_queues and blk_mq_init_hw_queues. FIXME: maybe
1968          * we should change hctx numa_node according to the new topology (this
1969          * involves freeing and re-allocating memory, so is it worth doing?)
1970          */
1971
1972         blk_mq_map_swqueue(q);
1973
1974         blk_mq_sysfs_register(q);
1975 }
1976
1977 static int blk_mq_queue_reinit_notify(struct notifier_block *nb,
1978                                       unsigned long action, void *hcpu)
1979 {
1980         struct request_queue *q;
1981
1982         /*
1983          * Before new mappings are established, a hot-added CPU might already
1984          * start handling requests. This doesn't break anything, as we map
1985          * offline CPUs to the first hardware queue. We will re-init the queue
1986          * below to get optimal settings.
1987          */
1988         if (action != CPU_DEAD && action != CPU_DEAD_FROZEN &&
1989             action != CPU_ONLINE && action != CPU_ONLINE_FROZEN)
1990                 return NOTIFY_OK;
1991
1992         mutex_lock(&all_q_mutex);
1993
1994         /*
1995          * We need to freeze and reinit all existing queues.  Freezing
1996          * involves a synchronous wait for an RCU grace period, and doing it
1997          * one by one may take a long time.  Start freezing all queues in
1998          * one swoop and then wait for the completions so that freezing can
1999          * take place in parallel.
2000          */
2001         list_for_each_entry(q, &all_q_list, all_q_node)
2002                 blk_mq_freeze_queue_start(q);
2003         list_for_each_entry(q, &all_q_list, all_q_node)
2004                 blk_mq_freeze_queue_wait(q);
2005
2006         list_for_each_entry(q, &all_q_list, all_q_node)
2007                 blk_mq_queue_reinit(q);
2008
2009         list_for_each_entry(q, &all_q_list, all_q_node)
2010                 blk_mq_unfreeze_queue(q);
2011
2012         mutex_unlock(&all_q_mutex);
2013         return NOTIFY_OK;
2014 }
2015
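/*
 * Allocate a tag and request map for every hardware queue at the current
 * set->queue_depth, unwinding the already allocated maps on failure.
 */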
2016 static int __blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
2017 {
2018         int i;
2019
2020         for (i = 0; i < set->nr_hw_queues; i++) {
2021                 set->tags[i] = blk_mq_init_rq_map(set, i);
2022                 if (!set->tags[i])
2023                         goto out_unwind;
2024         }
2025
2026         return 0;
2027
2028 out_unwind:
2029         while (--i >= 0)
2030                 blk_mq_free_rq_map(set, set->tags[i], i);
2031
2032         return -ENOMEM;
2033 }
2034
2035 /*
2036  * Allocate the request maps associated with this tag_set. Note that this
2037  * may reduce the depth asked for, if memory is tight. set->queue_depth
2038  * will be updated to reflect the allocated depth.
2039  */
2040 static int blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
2041 {
2042         unsigned int depth;
2043         int err;
2044
2045         depth = set->queue_depth;
2046         do {
2047                 err = __blk_mq_alloc_rq_maps(set);
2048                 if (!err)
2049                         break;
2050
2051                 set->queue_depth >>= 1;
2052                 if (set->queue_depth < set->reserved_tags + BLK_MQ_TAG_MIN) {
2053                         err = -ENOMEM;
2054                         break;
2055                 }
2056         } while (set->queue_depth);
2057
2058         if (!set->queue_depth || err) {
2059                 pr_err("blk-mq: failed to allocate request map\n");
2060                 return -ENOMEM;
2061         }
2062
2063         if (depth != set->queue_depth)
2064                 pr_info("blk-mq: reduced tag depth (%u -> %u)\n",
2065                                                 depth, set->queue_depth);
2066
2067         return 0;
2068 }
2069
2070 /*
2071  * Alloc a tag set to be associated with one or more request queues.
2072  * May fail with EINVAL for various error conditions. May adjust the
2073  * requested depth down, if it is too large. In that case, the
2074  * adjusted value will be stored in set->queue_depth.
2075  */
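/*
 * Illustrative sketch of a typical caller; the "mydrv" names below are
 * hypothetical and not part of this file:
 *
 *	static struct blk_mq_ops mydrv_mq_ops = {
 *		.queue_rq	= mydrv_queue_rq,
 *		.map_queue	= blk_mq_map_queue,
 *	};
 *
 *	set->ops		= &mydrv_mq_ops;
 *	set->nr_hw_queues	= 1;
 *	set->queue_depth	= 64;
 *	set->numa_node		= NUMA_NO_NODE;
 *	set->cmd_size		= sizeof(struct mydrv_cmd);
 *
 *	ret = blk_mq_alloc_tag_set(set);
 *	if (ret)
 *		return ret;
 *	q = blk_mq_init_queue(set);
 *	if (IS_ERR(q)) {
 *		blk_mq_free_tag_set(set);
 *		return PTR_ERR(q);
 *	}
 */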
2076 int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
2077 {
2078         BUILD_BUG_ON(BLK_MQ_MAX_DEPTH > 1 << BLK_MQ_UNIQUE_TAG_BITS);
2079
2080         if (!set->nr_hw_queues)
2081                 return -EINVAL;
2082         if (!set->queue_depth)
2083                 return -EINVAL;
2084         if (set->queue_depth < set->reserved_tags + BLK_MQ_TAG_MIN)
2085                 return -EINVAL;
2086
2087         if (!set->ops->queue_rq || !set->ops->map_queue)
2088                 return -EINVAL;
2089
2090         if (set->queue_depth > BLK_MQ_MAX_DEPTH) {
2091                 pr_info("blk-mq: reduced tag depth to %u\n",
2092                         BLK_MQ_MAX_DEPTH);
2093                 set->queue_depth = BLK_MQ_MAX_DEPTH;
2094         }
2095
2096         /*
2097          * If a crashdump is active, then we are potentially in a very
2098          * memory constrained environment. Limit us to 1 queue and
2099          * 64 tags to prevent using too much memory.
2100          */
2101         if (is_kdump_kernel()) {
2102                 set->nr_hw_queues = 1;
2103                 set->queue_depth = min(64U, set->queue_depth);
2104         }
2105
2106         set->tags = kmalloc_node(set->nr_hw_queues *
2107                                  sizeof(struct blk_mq_tags *),
2108                                  GFP_KERNEL, set->numa_node);
2109         if (!set->tags)
2110                 return -ENOMEM;
2111
2112         if (blk_mq_alloc_rq_maps(set))
2113                 goto enomem;
2114
2115         mutex_init(&set->tag_list_lock);
2116         INIT_LIST_HEAD(&set->tag_list);
2117
2118         return 0;
2119 enomem:
2120         kfree(set->tags);
2121         set->tags = NULL;
2122         return -ENOMEM;
2123 }
2124 EXPORT_SYMBOL(blk_mq_alloc_tag_set);
2125
2126 void blk_mq_free_tag_set(struct blk_mq_tag_set *set)
2127 {
2128         int i;
2129
2130         for (i = 0; i < set->nr_hw_queues; i++) {
2131                 if (set->tags[i])
2132                         blk_mq_free_rq_map(set, set->tags[i], i);
2133         }
2134
2135         kfree(set->tags);
2136         set->tags = NULL;
2137 }
2138 EXPORT_SYMBOL(blk_mq_free_tag_set);
2139
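/*
 * Change the number of requests (tag depth) available on every hardware
 * queue, e.g. when nr_requests is adjusted at runtime. The new value may
 * not exceed the depth the tag set was allocated with.
 */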
2140 int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr)
2141 {
2142         struct blk_mq_tag_set *set = q->tag_set;
2143         struct blk_mq_hw_ctx *hctx;
2144         int i, ret;
2145
2146         if (!set || nr > set->queue_depth)
2147                 return -EINVAL;
2148
2149         ret = 0;
2150         queue_for_each_hw_ctx(q, hctx, i) {
2151                 ret = blk_mq_tag_update_depth(hctx->tags, nr);
2152                 if (ret)
2153                         break;
2154         }
2155
2156         if (!ret)
2157                 q->nr_requests = nr;
2158
2159         return ret;
2160 }
2161
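/*
 * Hold all_q_mutex so the CPU hotplug notifier (blk_mq_queue_reinit_notify)
 * cannot rebuild queue mappings while the caller is working.
 */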
2162 void blk_mq_disable_hotplug(void)
2163 {
2164         mutex_lock(&all_q_mutex);
2165 }
2166
2167 void blk_mq_enable_hotplug(void)
2168 {
2169         mutex_unlock(&all_q_mutex);
2170 }
2171
2172 static int __init blk_mq_init(void)
2173 {
2174         blk_mq_cpu_init();
2175
2176         hotcpu_notifier(blk_mq_queue_reinit_notify, 0);
2177
2178         return 0;
2179 }
2180 subsys_initcall(blk_mq_init);