blk-mq: Exit queue on alloc failure
[firefly-linux-kernel-4.4.55.git] / block / blk-mq.c
1 /*
2  * Block multiqueue core code
3  *
4  * Copyright (C) 2013-2014 Jens Axboe
5  * Copyright (C) 2013-2014 Christoph Hellwig
6  */
7 #include <linux/kernel.h>
8 #include <linux/module.h>
9 #include <linux/backing-dev.h>
10 #include <linux/bio.h>
11 #include <linux/blkdev.h>
12 #include <linux/mm.h>
13 #include <linux/init.h>
14 #include <linux/slab.h>
15 #include <linux/workqueue.h>
16 #include <linux/smp.h>
17 #include <linux/llist.h>
18 #include <linux/list_sort.h>
19 #include <linux/cpu.h>
20 #include <linux/cache.h>
21 #include <linux/sched/sysctl.h>
22 #include <linux/delay.h>
23 #include <linux/crash_dump.h>
24
25 #include <trace/events/block.h>
26
27 #include <linux/blk-mq.h>
28 #include "blk.h"
29 #include "blk-mq.h"
30 #include "blk-mq-tag.h"
31
32 static DEFINE_MUTEX(all_q_mutex);
33 static LIST_HEAD(all_q_list);
34
35 static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx);
36
37 /*
38  * Check if any of the ctx's have pending work in this hardware queue
39  */
40 static bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx)
41 {
42         unsigned int i;
43
44         for (i = 0; i < hctx->ctx_map.map_size; i++)
45                 if (hctx->ctx_map.map[i].word)
46                         return true;
47
48         return false;
49 }
50
51 static inline struct blk_align_bitmap *get_bm(struct blk_mq_hw_ctx *hctx,
52                                               struct blk_mq_ctx *ctx)
53 {
54         return &hctx->ctx_map.map[ctx->index_hw / hctx->ctx_map.bits_per_word];
55 }
56
57 #define CTX_TO_BIT(hctx, ctx)   \
58         ((ctx)->index_hw & ((hctx)->ctx_map.bits_per_word - 1))
59
60 /*
61  * Mark this ctx as having pending work in this hardware queue
62  */
63 static void blk_mq_hctx_mark_pending(struct blk_mq_hw_ctx *hctx,
64                                      struct blk_mq_ctx *ctx)
65 {
66         struct blk_align_bitmap *bm = get_bm(hctx, ctx);
67
68         if (!test_bit(CTX_TO_BIT(hctx, ctx), &bm->word))
69                 set_bit(CTX_TO_BIT(hctx, ctx), &bm->word);
70 }
71
72 static void blk_mq_hctx_clear_pending(struct blk_mq_hw_ctx *hctx,
73                                       struct blk_mq_ctx *ctx)
74 {
75         struct blk_align_bitmap *bm = get_bm(hctx, ctx);
76
77         clear_bit(CTX_TO_BIT(hctx, ctx), &bm->word);
78 }
79
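/*
 * Grab a reference on the queue's mq_usage_counter. If the queue is
 * frozen, wait until it is unfrozen or marked dying. Returns 0 on
 * success, -ENODEV if the queue is dying, or the error from an
 * interrupted wait.
 */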
80 static int blk_mq_queue_enter(struct request_queue *q)
81 {
82         while (true) {
83                 int ret;
84
85                 if (percpu_ref_tryget_live(&q->mq_usage_counter))
86                         return 0;
87
88                 ret = wait_event_interruptible(q->mq_freeze_wq,
89                                 !q->mq_freeze_depth || blk_queue_dying(q));
90                 if (blk_queue_dying(q))
91                         return -ENODEV;
92                 if (ret)
93                         return ret;
94         }
95 }
96
97 static void blk_mq_queue_exit(struct request_queue *q)
98 {
99         percpu_ref_put(&q->mq_usage_counter);
100 }
101
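/*
 * percpu_ref release callback: the last reference on mq_usage_counter is
 * gone, so wake up everyone waiting on mq_freeze_wq.
 */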
102 static void blk_mq_usage_counter_release(struct percpu_ref *ref)
103 {
104         struct request_queue *q =
105                 container_of(ref, struct request_queue, mq_usage_counter);
106
107         wake_up_all(&q->mq_freeze_wq);
108 }
109
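/*
 * Bump the freeze depth. On the first freeze, kill the usage counter so
 * that new blk_mq_queue_enter() callers block, and run the queues to
 * flush out pending work.
 */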
110 static void blk_mq_freeze_queue_start(struct request_queue *q)
111 {
112         bool freeze;
113
114         spin_lock_irq(q->queue_lock);
115         freeze = !q->mq_freeze_depth++;
116         spin_unlock_irq(q->queue_lock);
117
118         if (freeze) {
119                 percpu_ref_kill(&q->mq_usage_counter);
120                 blk_mq_run_queues(q, false);
121         }
122 }
123
124 static void blk_mq_freeze_queue_wait(struct request_queue *q)
125 {
126         wait_event(q->mq_freeze_wq, percpu_ref_is_zero(&q->mq_usage_counter));
127 }
128
129 /*
130  * Guarantee no request is in use, so we can change any data structure of
131  * the queue afterward.
132  */
133 void blk_mq_freeze_queue(struct request_queue *q)
134 {
135         blk_mq_freeze_queue_start(q);
136         blk_mq_freeze_queue_wait(q);
137 }
138
139 static void blk_mq_unfreeze_queue(struct request_queue *q)
140 {
141         bool wake;
142
143         spin_lock_irq(q->queue_lock);
144         wake = !--q->mq_freeze_depth;
145         WARN_ON_ONCE(q->mq_freeze_depth < 0);
146         spin_unlock_irq(q->queue_lock);
147         if (wake) {
148                 percpu_ref_reinit(&q->mq_usage_counter);
149                 wake_up_all(&q->mq_freeze_wq);
150         }
151 }
152
153 bool blk_mq_can_queue(struct blk_mq_hw_ctx *hctx)
154 {
155         return blk_mq_has_free_tags(hctx->tags);
156 }
157 EXPORT_SYMBOL(blk_mq_can_queue);
158
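/*
 * Set up a request that has just been allocated a tag, resetting the
 * fields a previous user may have left behind. The atomic flags and the
 * tag itself are left untouched.
 */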
159 static void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx,
160                                struct request *rq, unsigned int rw_flags)
161 {
162         if (blk_queue_io_stat(q))
163                 rw_flags |= REQ_IO_STAT;
164
165         INIT_LIST_HEAD(&rq->queuelist);
166         /* csd/requeue_work/fifo_time is initialized before use */
167         rq->q = q;
168         rq->mq_ctx = ctx;
169         rq->cmd_flags |= rw_flags;
170         /* do not touch atomic flags, it needs atomic ops against the timer */
171         rq->cpu = -1;
172         INIT_HLIST_NODE(&rq->hash);
173         RB_CLEAR_NODE(&rq->rb_node);
174         rq->rq_disk = NULL;
175         rq->part = NULL;
176         rq->start_time = jiffies;
177 #ifdef CONFIG_BLK_CGROUP
178         rq->rl = NULL;
179         set_start_time_ns(rq);
180         rq->io_start_time_ns = 0;
181 #endif
182         rq->nr_phys_segments = 0;
183 #if defined(CONFIG_BLK_DEV_INTEGRITY)
184         rq->nr_integrity_segments = 0;
185 #endif
186         rq->special = NULL;
187         /* tag was already set */
188         rq->errors = 0;
189
190         rq->cmd = rq->__cmd;
191
192         rq->extra_len = 0;
193         rq->sense_len = 0;
194         rq->resid_len = 0;
195         rq->sense = NULL;
196
197         INIT_LIST_HEAD(&rq->timeout_list);
198         rq->timeout = 0;
199
200         rq->end_io = NULL;
201         rq->end_io_data = NULL;
202         rq->next_rq = NULL;
203
204         ctx->rq_dispatched[rw_is_sync(rw_flags)]++;
205 }
206
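/*
 * Try to get a tag for the given allocation data. On success, return the
 * preallocated request associated with that tag, fully initialized;
 * return NULL if no tag could be obtained.
 */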
207 static struct request *
208 __blk_mq_alloc_request(struct blk_mq_alloc_data *data, int rw)
209 {
210         struct request *rq;
211         unsigned int tag;
212
213         tag = blk_mq_get_tag(data);
214         if (tag != BLK_MQ_TAG_FAIL) {
215                 rq = data->hctx->tags->rqs[tag];
216
217                 if (blk_mq_tag_busy(data->hctx)) {
218                         rq->cmd_flags = REQ_MQ_INFLIGHT;
219                         atomic_inc(&data->hctx->nr_active);
220                 }
221
222                 rq->tag = tag;
223                 blk_mq_rq_ctx_init(data->q, data->ctx, rq, rw);
224                 return rq;
225         }
226
227         return NULL;
228 }
229
230 struct request *blk_mq_alloc_request(struct request_queue *q, int rw, gfp_t gfp,
231                 bool reserved)
232 {
233         struct blk_mq_ctx *ctx;
234         struct blk_mq_hw_ctx *hctx;
235         struct request *rq;
236         struct blk_mq_alloc_data alloc_data;
237         int ret;
238
239         ret = blk_mq_queue_enter(q);
240         if (ret)
241                 return ERR_PTR(ret);
242
243         ctx = blk_mq_get_ctx(q);
244         hctx = q->mq_ops->map_queue(q, ctx->cpu);
245         blk_mq_set_alloc_data(&alloc_data, q, gfp & ~__GFP_WAIT,
246                         reserved, ctx, hctx);
247
248         rq = __blk_mq_alloc_request(&alloc_data, rw);
249         if (!rq && (gfp & __GFP_WAIT)) {
250                 __blk_mq_run_hw_queue(hctx);
251                 blk_mq_put_ctx(ctx);
252
253                 ctx = blk_mq_get_ctx(q);
254                 hctx = q->mq_ops->map_queue(q, ctx->cpu);
255                 blk_mq_set_alloc_data(&alloc_data, q, gfp, reserved, ctx,
256                                 hctx);
257                 rq = __blk_mq_alloc_request(&alloc_data, rw);
258                 ctx = alloc_data.ctx;
259         }
260         blk_mq_put_ctx(ctx);
261         if (!rq) {
262                 blk_mq_queue_exit(q);
263                 return ERR_PTR(-EWOULDBLOCK);
264         }
265         return rq;
266 }
267 EXPORT_SYMBOL(blk_mq_alloc_request);
268
269 static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx,
270                                   struct blk_mq_ctx *ctx, struct request *rq)
271 {
272         const int tag = rq->tag;
273         struct request_queue *q = rq->q;
274
275         if (rq->cmd_flags & REQ_MQ_INFLIGHT)
276                 atomic_dec(&hctx->nr_active);
277         rq->cmd_flags = 0;
278
279         clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
280         blk_mq_put_tag(hctx, tag, &ctx->last_tag);
281         blk_mq_queue_exit(q);
282 }
283
284 void blk_mq_free_hctx_request(struct blk_mq_hw_ctx *hctx, struct request *rq)
285 {
286         struct blk_mq_ctx *ctx = rq->mq_ctx;
287
288         ctx->rq_completed[rq_is_sync(rq)]++;
289         __blk_mq_free_request(hctx, ctx, rq);
290
291 }
292 EXPORT_SYMBOL_GPL(blk_mq_free_hctx_request);
293
294 void blk_mq_free_request(struct request *rq)
295 {
296         struct blk_mq_hw_ctx *hctx;
297         struct request_queue *q = rq->q;
298
299         hctx = q->mq_ops->map_queue(q, rq->mq_ctx->cpu);
300         blk_mq_free_hctx_request(hctx, rq);
301 }
302 EXPORT_SYMBOL_GPL(blk_mq_free_request);
303
304 inline void __blk_mq_end_request(struct request *rq, int error)
305 {
306         blk_account_io_done(rq);
307
308         if (rq->end_io) {
309                 rq->end_io(rq, error);
310         } else {
311                 if (unlikely(blk_bidi_rq(rq)))
312                         blk_mq_free_request(rq->next_rq);
313                 blk_mq_free_request(rq);
314         }
315 }
316 EXPORT_SYMBOL(__blk_mq_end_request);
317
318 void blk_mq_end_request(struct request *rq, int error)
319 {
320         if (blk_update_request(rq, error, blk_rq_bytes(rq)))
321                 BUG();
322         __blk_mq_end_request(rq, error);
323 }
324 EXPORT_SYMBOL(blk_mq_end_request);
325
326 static void __blk_mq_complete_request_remote(void *data)
327 {
328         struct request *rq = data;
329
330         rq->q->softirq_done_fn(rq);
331 }
332
333 static void blk_mq_ipi_complete_request(struct request *rq)
334 {
335         struct blk_mq_ctx *ctx = rq->mq_ctx;
336         bool shared = false;
337         int cpu;
338
339         if (!test_bit(QUEUE_FLAG_SAME_COMP, &rq->q->queue_flags)) {
340                 rq->q->softirq_done_fn(rq);
341                 return;
342         }
343
344         cpu = get_cpu();
345         if (!test_bit(QUEUE_FLAG_SAME_FORCE, &rq->q->queue_flags))
346                 shared = cpus_share_cache(cpu, ctx->cpu);
347
348         if (cpu != ctx->cpu && !shared && cpu_online(ctx->cpu)) {
349                 rq->csd.func = __blk_mq_complete_request_remote;
350                 rq->csd.info = rq;
351                 rq->csd.flags = 0;
352                 smp_call_function_single_async(ctx->cpu, &rq->csd);
353         } else {
354                 rq->q->softirq_done_fn(rq);
355         }
356         put_cpu();
357 }
358
359 void __blk_mq_complete_request(struct request *rq)
360 {
361         struct request_queue *q = rq->q;
362
363         if (!q->softirq_done_fn)
364                 blk_mq_end_request(rq, rq->errors);
365         else
366                 blk_mq_ipi_complete_request(rq);
367 }
368
369 /**
370  * blk_mq_complete_request - end I/O on a request
371  * @rq:         the request being processed
372  *
373  * Description:
374  *      Ends all I/O on a request. It does not handle partial completions.
375  *      The actual completion happens out-of-order, through a IPI handler.
376  **/
377 void blk_mq_complete_request(struct request *rq)
378 {
379         struct request_queue *q = rq->q;
380
381         if (unlikely(blk_should_fake_timeout(q)))
382                 return;
383         if (!blk_mark_rq_complete(rq))
384                 __blk_mq_complete_request(rq);
385 }
386 EXPORT_SYMBOL(blk_mq_complete_request);
387
388 void blk_mq_start_request(struct request *rq)
389 {
390         struct request_queue *q = rq->q;
391
392         trace_block_rq_issue(q, rq);
393
394         rq->resid_len = blk_rq_bytes(rq);
395         if (unlikely(blk_bidi_rq(rq)))
396                 rq->next_rq->resid_len = blk_rq_bytes(rq->next_rq);
397
398         blk_add_timer(rq);
399
400         /*
401          * Ensure that ->deadline is visible before we set the started
402          * flag and clear the completed flag.
403          */
404         smp_mb__before_atomic();
405
406         /*
407          * Mark us as started and clear complete. Complete might have been
408          * set if requeue raced with timeout, which then marked it as
409          * complete. So be sure to clear complete again when we start
410          * the request, otherwise we'll ignore the completion event.
411          */
412         if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags))
413                 set_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
414         if (test_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags))
415                 clear_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags);
416
417         if (q->dma_drain_size && blk_rq_bytes(rq)) {
418                 /*
419                  * Make sure space for the drain appears.  We know we can do
420                  * this because max_hw_segments has been adjusted to be one
421                  * fewer than the device can handle.
422                  */
423                 rq->nr_phys_segments++;
424         }
425 }
426 EXPORT_SYMBOL(blk_mq_start_request);
427
428 static void __blk_mq_requeue_request(struct request *rq)
429 {
430         struct request_queue *q = rq->q;
431
432         trace_block_rq_requeue(q, rq);
433
434         if (test_and_clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) {
435                 if (q->dma_drain_size && blk_rq_bytes(rq))
436                         rq->nr_phys_segments--;
437         }
438 }
439
440 void blk_mq_requeue_request(struct request *rq)
441 {
442         __blk_mq_requeue_request(rq);
443
444         BUG_ON(blk_queued_rq(rq));
445         blk_mq_add_to_requeue_list(rq, true);
446 }
447 EXPORT_SYMBOL(blk_mq_requeue_request);
448
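/*
 * Work handler that drains q->requeue_list: requests flagged with
 * REQ_SOFTBARRIER are re-inserted at the head, everything else at the
 * tail, and the (possibly stopped) hardware queues are then kicked.
 */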
449 static void blk_mq_requeue_work(struct work_struct *work)
450 {
451         struct request_queue *q =
452                 container_of(work, struct request_queue, requeue_work);
453         LIST_HEAD(rq_list);
454         struct request *rq, *next;
455         unsigned long flags;
456
457         spin_lock_irqsave(&q->requeue_lock, flags);
458         list_splice_init(&q->requeue_list, &rq_list);
459         spin_unlock_irqrestore(&q->requeue_lock, flags);
460
461         list_for_each_entry_safe(rq, next, &rq_list, queuelist) {
462                 if (!(rq->cmd_flags & REQ_SOFTBARRIER))
463                         continue;
464
465                 rq->cmd_flags &= ~REQ_SOFTBARRIER;
466                 list_del_init(&rq->queuelist);
467                 blk_mq_insert_request(rq, true, false, false);
468         }
469
470         while (!list_empty(&rq_list)) {
471                 rq = list_entry(rq_list.next, struct request, queuelist);
472                 list_del_init(&rq->queuelist);
473                 blk_mq_insert_request(rq, false, false, false);
474         }
475
476         /*
477          * Use the start variant of queue running here, so that running
478          * the requeue work will kick stopped queues.
479          */
480         blk_mq_start_hw_queues(q);
481 }
482
483 void blk_mq_add_to_requeue_list(struct request *rq, bool at_head)
484 {
485         struct request_queue *q = rq->q;
486         unsigned long flags;
487
488         /*
489          * We abuse this flag that is otherwise used by the I/O scheduler to
490          * request head insertion from the workqueue.
491          */
492         BUG_ON(rq->cmd_flags & REQ_SOFTBARRIER);
493
494         spin_lock_irqsave(&q->requeue_lock, flags);
495         if (at_head) {
496                 rq->cmd_flags |= REQ_SOFTBARRIER;
497                 list_add(&rq->queuelist, &q->requeue_list);
498         } else {
499                 list_add_tail(&rq->queuelist, &q->requeue_list);
500         }
501         spin_unlock_irqrestore(&q->requeue_lock, flags);
502 }
503 EXPORT_SYMBOL(blk_mq_add_to_requeue_list);
504
505 void blk_mq_kick_requeue_list(struct request_queue *q)
506 {
507         kblockd_schedule_work(&q->requeue_work);
508 }
509 EXPORT_SYMBOL(blk_mq_kick_requeue_list);
510
511 static inline bool is_flush_request(struct request *rq,
512                 struct blk_flush_queue *fq, unsigned int tag)
513 {
514         return ((rq->cmd_flags & REQ_FLUSH_SEQ) &&
515                         fq->flush_rq->tag == tag);
516 }
517
518 struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag)
519 {
520         struct request *rq = tags->rqs[tag];
521         /* mq_ctx of flush rq is always cloned from the corresponding req */
522         struct blk_flush_queue *fq = blk_get_flush_queue(rq->q, rq->mq_ctx);
523
524         if (!is_flush_request(rq, fq, tag))
525                 return rq;
526
527         return fq->flush_rq;
528 }
529 EXPORT_SYMBOL(blk_mq_tag_to_rq);
530
531 struct blk_mq_timeout_data {
532         unsigned long next;
533         unsigned int next_set;
534 };
535
536 void blk_mq_rq_timed_out(struct request *req, bool reserved)
537 {
538         struct blk_mq_ops *ops = req->q->mq_ops;
539         enum blk_eh_timer_return ret = BLK_EH_RESET_TIMER;
540
541         /*
542          * We know that complete is set at this point. If STARTED isn't set
543          * anymore, then the request isn't active and the "timeout" should
544          * just be ignored. This can happen due to the bitflag ordering.
545          * Timeout first checks if STARTED is set, and if it is, assumes
546          * the request is active. But if we race with completion, then
547          * both flags will get cleared. So check here again, and ignore
548          * a timeout event with a request that isn't active.
549          */
550         if (!test_bit(REQ_ATOM_STARTED, &req->atomic_flags))
551                 return;
552
553         if (ops->timeout)
554                 ret = ops->timeout(req, reserved);
555
556         switch (ret) {
557         case BLK_EH_HANDLED:
558                 __blk_mq_complete_request(req);
559                 break;
560         case BLK_EH_RESET_TIMER:
561                 blk_add_timer(req);
562                 blk_clear_rq_complete(req);
563                 break;
564         case BLK_EH_NOT_HANDLED:
565                 break;
566         default:
567                 printk(KERN_ERR "block: bad eh return: %d\n", ret);
568                 break;
569         }
570 }
571
572 static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx,
573                 struct request *rq, void *priv, bool reserved)
574 {
575         struct blk_mq_timeout_data *data = priv;
576
577         if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags))
578                 return;
579
580         if (time_after_eq(jiffies, rq->deadline)) {
581                 if (!blk_mark_rq_complete(rq))
582                         blk_mq_rq_timed_out(rq, reserved);
583         } else if (!data->next_set || time_after(data->next, rq->deadline)) {
584                 data->next = rq->deadline;
585                 data->next_set = 1;
586         }
587 }
588
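/*
 * Per-queue timer callback: check every started request on every mapped
 * hardware queue for an expired deadline, and either re-arm the timer
 * for the earliest pending deadline or let the tags go idle.
 */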
589 static void blk_mq_rq_timer(unsigned long priv)
590 {
591         struct request_queue *q = (struct request_queue *)priv;
592         struct blk_mq_timeout_data data = {
593                 .next           = 0,
594                 .next_set       = 0,
595         };
596         struct blk_mq_hw_ctx *hctx;
597         int i;
598
599         queue_for_each_hw_ctx(q, hctx, i) {
600                 /*
601                  * If no software queues are currently mapped to this
602                  * hardware queue, there's nothing to check
603                  */
604                 if (!blk_mq_hw_queue_mapped(hctx))
605                         continue;
606
607                 blk_mq_tag_busy_iter(hctx, blk_mq_check_expired, &data);
608         }
609
610         if (data.next_set) {
611                 data.next = blk_rq_timeout(round_jiffies_up(data.next));
612                 mod_timer(&q->timeout, data.next);
613         } else {
614                 queue_for_each_hw_ctx(q, hctx, i)
615                         blk_mq_tag_idle(hctx);
616         }
617 }
618
619 /*
620  * Reverse check our software queue for entries that we could potentially
621  * merge with. Currently includes a hand-wavy stop count of 8, to not spend
622  * too much time checking for merges.
623  */
624 static bool blk_mq_attempt_merge(struct request_queue *q,
625                                  struct blk_mq_ctx *ctx, struct bio *bio)
626 {
627         struct request *rq;
628         int checked = 8;
629
630         list_for_each_entry_reverse(rq, &ctx->rq_list, queuelist) {
631                 int el_ret;
632
633                 if (!checked--)
634                         break;
635
636                 if (!blk_rq_merge_ok(rq, bio))
637                         continue;
638
639                 el_ret = blk_try_merge(rq, bio);
640                 if (el_ret == ELEVATOR_BACK_MERGE) {
641                         if (bio_attempt_back_merge(q, rq, bio)) {
642                                 ctx->rq_merged++;
643                                 return true;
644                         }
645                         break;
646                 } else if (el_ret == ELEVATOR_FRONT_MERGE) {
647                         if (bio_attempt_front_merge(q, rq, bio)) {
648                                 ctx->rq_merged++;
649                                 return true;
650                         }
651                         break;
652                 }
653         }
654
655         return false;
656 }
657
658 /*
659  * Process software queues that have been marked busy, splicing them
660  * to the for-dispatch list.
661  */
662 static void flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list)
663 {
664         struct blk_mq_ctx *ctx;
665         int i;
666
667         for (i = 0; i < hctx->ctx_map.map_size; i++) {
668                 struct blk_align_bitmap *bm = &hctx->ctx_map.map[i];
669                 unsigned int off, bit;
670
671                 if (!bm->word)
672                         continue;
673
674                 bit = 0;
675                 off = i * hctx->ctx_map.bits_per_word;
676                 do {
677                         bit = find_next_bit(&bm->word, bm->depth, bit);
678                         if (bit >= bm->depth)
679                                 break;
680
681                         ctx = hctx->ctxs[bit + off];
682                         clear_bit(bit, &bm->word);
683                         spin_lock(&ctx->lock);
684                         list_splice_tail_init(&ctx->rq_list, list);
685                         spin_unlock(&ctx->lock);
686
687                         bit++;
688                 } while (1);
689         }
690 }
691
692 /*
693  * Run this hardware queue, pulling any software queues mapped to it in.
694  * Note that this function currently has various problems around ordering
695  * of IO. In particular, we'd like FIFO behaviour on handling existing
696  * items on the hctx->dispatch list. Ignore that for now.
697  */
698 static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
699 {
700         struct request_queue *q = hctx->queue;
701         struct request *rq;
702         LIST_HEAD(rq_list);
703         LIST_HEAD(driver_list);
704         struct list_head *dptr;
705         int queued;
706
707         WARN_ON(!cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask));
708
709         if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->state)))
710                 return;
711
712         hctx->run++;
713
714         /*
715          * Touch any software queue that has pending entries.
716          */
717         flush_busy_ctxs(hctx, &rq_list);
718
719         /*
720          * If we have previous entries on our dispatch list, grab them
721          * and stuff them at the front for more fair dispatch.
722          */
723         if (!list_empty_careful(&hctx->dispatch)) {
724                 spin_lock(&hctx->lock);
725                 if (!list_empty(&hctx->dispatch))
726                         list_splice_init(&hctx->dispatch, &rq_list);
727                 spin_unlock(&hctx->lock);
728         }
729
730         /*
731          * Start off with dptr being NULL, so we start the first request
732          * immediately, even if we have more pending.
733          */
734         dptr = NULL;
735
736         /*
737          * Now process all the entries, sending them to the driver.
738          */
739         queued = 0;
740         while (!list_empty(&rq_list)) {
741                 struct blk_mq_queue_data bd;
742                 int ret;
743
744                 rq = list_first_entry(&rq_list, struct request, queuelist);
745                 list_del_init(&rq->queuelist);
746
747                 bd.rq = rq;
748                 bd.list = dptr;
749                 bd.last = list_empty(&rq_list);
750
751                 ret = q->mq_ops->queue_rq(hctx, &bd);
752                 switch (ret) {
753                 case BLK_MQ_RQ_QUEUE_OK:
754                         queued++;
755                         continue;
756                 case BLK_MQ_RQ_QUEUE_BUSY:
757                         list_add(&rq->queuelist, &rq_list);
758                         __blk_mq_requeue_request(rq);
759                         break;
760                 default:
761                         pr_err("blk-mq: bad return on queue: %d\n", ret);
762                 case BLK_MQ_RQ_QUEUE_ERROR:
763                         rq->errors = -EIO;
764                         blk_mq_end_request(rq, rq->errors);
765                         break;
766                 }
767
768                 if (ret == BLK_MQ_RQ_QUEUE_BUSY)
769                         break;
770
771                 /*
772                  * We've done the first request. If we have more than 1
773                  * left in the list, set dptr to defer issue.
774                  */
775                 if (!dptr && rq_list.next != rq_list.prev)
776                         dptr = &driver_list;
777         }
778
779         if (!queued)
780                 hctx->dispatched[0]++;
781         else if (queued < (1 << (BLK_MQ_MAX_DISPATCH_ORDER - 1)))
782                 hctx->dispatched[ilog2(queued) + 1]++;
783
784         /*
785          * Any items that need requeuing? Stuff them into hctx->dispatch;
786          * that is where we will continue on the next queue run.
787          */
788         if (!list_empty(&rq_list)) {
789                 spin_lock(&hctx->lock);
790                 list_splice(&rq_list, &hctx->dispatch);
791                 spin_unlock(&hctx->lock);
792         }
793 }
794
795 /*
796  * It'd be great if the workqueue API had a way to pass
797  * in a mask and had some smarts for more clever placement.
798  * For now we just round-robin here, switching for every
799  * BLK_MQ_CPU_WORK_BATCH queued items.
800  */
801 static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx)
802 {
803         if (hctx->queue->nr_hw_queues == 1)
804                 return WORK_CPU_UNBOUND;
805
806         if (--hctx->next_cpu_batch <= 0) {
807                 int cpu = hctx->next_cpu, next_cpu;
808
809                 next_cpu = cpumask_next(hctx->next_cpu, hctx->cpumask);
810                 if (next_cpu >= nr_cpu_ids)
811                         next_cpu = cpumask_first(hctx->cpumask);
812
813                 hctx->next_cpu = next_cpu;
814                 hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;
815
816                 return cpu;
817         }
818
819         return hctx->next_cpu;
820 }
821
822 void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
823 {
824         if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->state) ||
825             !blk_mq_hw_queue_mapped(hctx)))
826                 return;
827
828         if (!async) {
829                 int cpu = get_cpu();
830                 if (cpumask_test_cpu(cpu, hctx->cpumask)) {
831                         __blk_mq_run_hw_queue(hctx);
832                         put_cpu();
833                         return;
834                 }
835
836                 put_cpu();
837         }
838
839         kblockd_schedule_delayed_work_on(blk_mq_hctx_next_cpu(hctx),
840                         &hctx->run_work, 0);
841 }
842
843 void blk_mq_run_queues(struct request_queue *q, bool async)
844 {
845         struct blk_mq_hw_ctx *hctx;
846         int i;
847
848         queue_for_each_hw_ctx(q, hctx, i) {
849                 if ((!blk_mq_hctx_has_pending(hctx) &&
850                     list_empty_careful(&hctx->dispatch)) ||
851                     test_bit(BLK_MQ_S_STOPPED, &hctx->state))
852                         continue;
853
854                 blk_mq_run_hw_queue(hctx, async);
855         }
856 }
857 EXPORT_SYMBOL(blk_mq_run_queues);
858
859 void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx)
860 {
861         cancel_delayed_work(&hctx->run_work);
862         cancel_delayed_work(&hctx->delay_work);
863         set_bit(BLK_MQ_S_STOPPED, &hctx->state);
864 }
865 EXPORT_SYMBOL(blk_mq_stop_hw_queue);
866
867 void blk_mq_stop_hw_queues(struct request_queue *q)
868 {
869         struct blk_mq_hw_ctx *hctx;
870         int i;
871
872         queue_for_each_hw_ctx(q, hctx, i)
873                 blk_mq_stop_hw_queue(hctx);
874 }
875 EXPORT_SYMBOL(blk_mq_stop_hw_queues);
876
877 void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx)
878 {
879         clear_bit(BLK_MQ_S_STOPPED, &hctx->state);
880
881         blk_mq_run_hw_queue(hctx, false);
882 }
883 EXPORT_SYMBOL(blk_mq_start_hw_queue);
884
885 void blk_mq_start_hw_queues(struct request_queue *q)
886 {
887         struct blk_mq_hw_ctx *hctx;
888         int i;
889
890         queue_for_each_hw_ctx(q, hctx, i)
891                 blk_mq_start_hw_queue(hctx);
892 }
893 EXPORT_SYMBOL(blk_mq_start_hw_queues);
894
895
896 void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async)
897 {
898         struct blk_mq_hw_ctx *hctx;
899         int i;
900
901         queue_for_each_hw_ctx(q, hctx, i) {
902                 if (!test_bit(BLK_MQ_S_STOPPED, &hctx->state))
903                         continue;
904
905                 clear_bit(BLK_MQ_S_STOPPED, &hctx->state);
906                 blk_mq_run_hw_queue(hctx, async);
907         }
908 }
909 EXPORT_SYMBOL(blk_mq_start_stopped_hw_queues);
910
911 static void blk_mq_run_work_fn(struct work_struct *work)
912 {
913         struct blk_mq_hw_ctx *hctx;
914
915         hctx = container_of(work, struct blk_mq_hw_ctx, run_work.work);
916
917         __blk_mq_run_hw_queue(hctx);
918 }
919
920 static void blk_mq_delay_work_fn(struct work_struct *work)
921 {
922         struct blk_mq_hw_ctx *hctx;
923
924         hctx = container_of(work, struct blk_mq_hw_ctx, delay_work.work);
925
926         if (test_and_clear_bit(BLK_MQ_S_STOPPED, &hctx->state))
927                 __blk_mq_run_hw_queue(hctx);
928 }
929
930 void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs)
931 {
932         if (unlikely(!blk_mq_hw_queue_mapped(hctx)))
933                 return;
934
935         kblockd_schedule_delayed_work_on(blk_mq_hctx_next_cpu(hctx),
936                         &hctx->delay_work, msecs_to_jiffies(msecs));
937 }
938 EXPORT_SYMBOL(blk_mq_delay_queue);
939
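/*
 * Add a request to the software queue's list and mark the ctx as having
 * pending work. The caller must hold ctx->lock.
 */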
940 static void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx,
941                                     struct request *rq, bool at_head)
942 {
943         struct blk_mq_ctx *ctx = rq->mq_ctx;
944
945         trace_block_rq_insert(hctx->queue, rq);
946
947         if (at_head)
948                 list_add(&rq->queuelist, &ctx->rq_list);
949         else
950                 list_add_tail(&rq->queuelist, &ctx->rq_list);
951
952         blk_mq_hctx_mark_pending(hctx, ctx);
953 }
954
955 void blk_mq_insert_request(struct request *rq, bool at_head, bool run_queue,
956                 bool async)
957 {
958         struct request_queue *q = rq->q;
959         struct blk_mq_hw_ctx *hctx;
960         struct blk_mq_ctx *ctx = rq->mq_ctx, *current_ctx;
961
962         current_ctx = blk_mq_get_ctx(q);
963         if (!cpu_online(ctx->cpu))
964                 rq->mq_ctx = ctx = current_ctx;
965
966         hctx = q->mq_ops->map_queue(q, ctx->cpu);
967
968         spin_lock(&ctx->lock);
969         __blk_mq_insert_request(hctx, rq, at_head);
970         spin_unlock(&ctx->lock);
971
972         if (run_queue)
973                 blk_mq_run_hw_queue(hctx, async);
974
975         blk_mq_put_ctx(current_ctx);
976 }
977
978 static void blk_mq_insert_requests(struct request_queue *q,
979                                      struct blk_mq_ctx *ctx,
980                                      struct list_head *list,
981                                      int depth,
982                                      bool from_schedule)
983
984 {
985         struct blk_mq_hw_ctx *hctx;
986         struct blk_mq_ctx *current_ctx;
987
988         trace_block_unplug(q, depth, !from_schedule);
989
990         current_ctx = blk_mq_get_ctx(q);
991
992         if (!cpu_online(ctx->cpu))
993                 ctx = current_ctx;
994         hctx = q->mq_ops->map_queue(q, ctx->cpu);
995
996         /*
997          * Preemption doesn't flush the plug list, so it's possible that
998          * ctx->cpu is offline now.
999          */
1000         spin_lock(&ctx->lock);
1001         while (!list_empty(list)) {
1002                 struct request *rq;
1003
1004                 rq = list_first_entry(list, struct request, queuelist);
1005                 list_del_init(&rq->queuelist);
1006                 rq->mq_ctx = ctx;
1007                 __blk_mq_insert_request(hctx, rq, false);
1008         }
1009         spin_unlock(&ctx->lock);
1010
1011         blk_mq_run_hw_queue(hctx, from_schedule);
1012         blk_mq_put_ctx(current_ctx);
1013 }
1014
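/* list_sort() comparator: order plugged requests by ctx, then by sector. */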
1015 static int plug_ctx_cmp(void *priv, struct list_head *a, struct list_head *b)
1016 {
1017         struct request *rqa = container_of(a, struct request, queuelist);
1018         struct request *rqb = container_of(b, struct request, queuelist);
1019
1020         return !(rqa->mq_ctx < rqb->mq_ctx ||
1021                  (rqa->mq_ctx == rqb->mq_ctx &&
1022                   blk_rq_pos(rqa) < blk_rq_pos(rqb)));
1023 }
1024
1025 void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule)
1026 {
1027         struct blk_mq_ctx *this_ctx;
1028         struct request_queue *this_q;
1029         struct request *rq;
1030         LIST_HEAD(list);
1031         LIST_HEAD(ctx_list);
1032         unsigned int depth;
1033
1034         list_splice_init(&plug->mq_list, &list);
1035
1036         list_sort(NULL, &list, plug_ctx_cmp);
1037
1038         this_q = NULL;
1039         this_ctx = NULL;
1040         depth = 0;
1041
1042         while (!list_empty(&list)) {
1043                 rq = list_entry_rq(list.next);
1044                 list_del_init(&rq->queuelist);
1045                 BUG_ON(!rq->q);
1046                 if (rq->mq_ctx != this_ctx) {
1047                         if (this_ctx) {
1048                                 blk_mq_insert_requests(this_q, this_ctx,
1049                                                         &ctx_list, depth,
1050                                                         from_schedule);
1051                         }
1052
1053                         this_ctx = rq->mq_ctx;
1054                         this_q = rq->q;
1055                         depth = 0;
1056                 }
1057
1058                 depth++;
1059                 list_add_tail(&rq->queuelist, &ctx_list);
1060         }
1061
1062         /*
1063          * If 'this_ctx' is set, we know we have entries to complete
1064          * on 'ctx_list'. Do those.
1065          */
1066         if (this_ctx) {
1067                 blk_mq_insert_requests(this_q, this_ctx, &ctx_list, depth,
1068                                        from_schedule);
1069         }
1070 }
1071
1072 static void blk_mq_bio_to_request(struct request *rq, struct bio *bio)
1073 {
1074         init_request_from_bio(rq, bio);
1075
1076         if (blk_do_io_stat(rq))
1077                 blk_account_io_start(rq, 1);
1078 }
1079
1080 static inline bool hctx_allow_merges(struct blk_mq_hw_ctx *hctx)
1081 {
1082         return (hctx->flags & BLK_MQ_F_SHOULD_MERGE) &&
1083                 !blk_queue_nomerges(hctx->queue);
1084 }
1085
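/*
 * Either merge the bio into an existing request on the software queue,
 * or turn it into a request and insert it. Returns true if the bio was
 * merged (and the passed-in request freed), false if it was inserted.
 */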
1086 static inline bool blk_mq_merge_queue_io(struct blk_mq_hw_ctx *hctx,
1087                                          struct blk_mq_ctx *ctx,
1088                                          struct request *rq, struct bio *bio)
1089 {
1090         if (!hctx_allow_merges(hctx)) {
1091                 blk_mq_bio_to_request(rq, bio);
1092                 spin_lock(&ctx->lock);
1093 insert_rq:
1094                 __blk_mq_insert_request(hctx, rq, false);
1095                 spin_unlock(&ctx->lock);
1096                 return false;
1097         } else {
1098                 struct request_queue *q = hctx->queue;
1099
1100                 spin_lock(&ctx->lock);
1101                 if (!blk_mq_attempt_merge(q, ctx, bio)) {
1102                         blk_mq_bio_to_request(rq, bio);
1103                         goto insert_rq;
1104                 }
1105
1106                 spin_unlock(&ctx->lock);
1107                 __blk_mq_free_request(hctx, ctx, rq);
1108                 return true;
1109         }
1110 }
1111
1112 struct blk_map_ctx {
1113         struct blk_mq_hw_ctx *hctx;
1114         struct blk_mq_ctx *ctx;
1115 };
1116
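/*
 * Map a bio to a hardware context and allocate a request for it. The
 * first attempt is atomic; if it fails, the hardware queue is run and
 * the allocation retried, this time allowing the tag wait to sleep.
 */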
1117 static struct request *blk_mq_map_request(struct request_queue *q,
1118                                           struct bio *bio,
1119                                           struct blk_map_ctx *data)
1120 {
1121         struct blk_mq_hw_ctx *hctx;
1122         struct blk_mq_ctx *ctx;
1123         struct request *rq;
1124         int rw = bio_data_dir(bio);
1125         struct blk_mq_alloc_data alloc_data;
1126
1127         if (unlikely(blk_mq_queue_enter(q))) {
1128                 bio_endio(bio, -EIO);
1129                 return NULL;
1130         }
1131
1132         ctx = blk_mq_get_ctx(q);
1133         hctx = q->mq_ops->map_queue(q, ctx->cpu);
1134
1135         if (rw_is_sync(bio->bi_rw))
1136                 rw |= REQ_SYNC;
1137
1138         trace_block_getrq(q, bio, rw);
1139         blk_mq_set_alloc_data(&alloc_data, q, GFP_ATOMIC, false, ctx,
1140                         hctx);
1141         rq = __blk_mq_alloc_request(&alloc_data, rw);
1142         if (unlikely(!rq)) {
1143                 __blk_mq_run_hw_queue(hctx);
1144                 blk_mq_put_ctx(ctx);
1145                 trace_block_sleeprq(q, bio, rw);
1146
1147                 ctx = blk_mq_get_ctx(q);
1148                 hctx = q->mq_ops->map_queue(q, ctx->cpu);
1149                 blk_mq_set_alloc_data(&alloc_data, q,
1150                                 __GFP_WAIT|GFP_ATOMIC, false, ctx, hctx);
1151                 rq = __blk_mq_alloc_request(&alloc_data, rw);
1152                 ctx = alloc_data.ctx;
1153                 hctx = alloc_data.hctx;
1154         }
1155
1156         hctx->queued++;
1157         data->hctx = hctx;
1158         data->ctx = ctx;
1159         return rq;
1160 }
1161
1162 /*
1163  * Multiple hardware queue variant. This will not use per-process plugs,
1164  * but will attempt to bypass the hctx queueing if we can go straight to
1165  * hardware for SYNC IO.
1166  */
1167 static void blk_mq_make_request(struct request_queue *q, struct bio *bio)
1168 {
1169         const int is_sync = rw_is_sync(bio->bi_rw);
1170         const int is_flush_fua = bio->bi_rw & (REQ_FLUSH | REQ_FUA);
1171         struct blk_map_ctx data;
1172         struct request *rq;
1173
1174         blk_queue_bounce(q, &bio);
1175
1176         if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) {
1177                 bio_endio(bio, -EIO);
1178                 return;
1179         }
1180
1181         rq = blk_mq_map_request(q, bio, &data);
1182         if (unlikely(!rq))
1183                 return;
1184
1185         if (unlikely(is_flush_fua)) {
1186                 blk_mq_bio_to_request(rq, bio);
1187                 blk_insert_flush(rq);
1188                 goto run_queue;
1189         }
1190
1191         /*
1192          * If the driver supports deferred issue based on 'last', then
1193          * queue it up like normal, since we can potentially save some
1194          * CPU this way.
1195          */
1196         if (is_sync && !(data.hctx->flags & BLK_MQ_F_DEFER_ISSUE)) {
1197                 struct blk_mq_queue_data bd = {
1198                         .rq = rq,
1199                         .list = NULL,
1200                         .last = 1
1201                 };
1202                 int ret;
1203
1204                 blk_mq_bio_to_request(rq, bio);
1205
1206                 /*
1207                  * For a successful queue, we are done. For an error, kill the
1208                  * request. For anything else (busy), just add it to our list
1209                  * as we previously would have done.
1210                  */
1211                 ret = q->mq_ops->queue_rq(data.hctx, &bd);
1212                 if (ret == BLK_MQ_RQ_QUEUE_OK)
1213                         goto done;
1214                 else {
1215                         __blk_mq_requeue_request(rq);
1216
1217                         if (ret == BLK_MQ_RQ_QUEUE_ERROR) {
1218                                 rq->errors = -EIO;
1219                                 blk_mq_end_request(rq, rq->errors);
1220                                 goto done;
1221                         }
1222                 }
1223         }
1224
1225         if (!blk_mq_merge_queue_io(data.hctx, data.ctx, rq, bio)) {
1226                 /*
1227                  * For a SYNC request, send it to the hardware immediately. For
1228                  * an ASYNC request, just ensure that we run it later on. The
1229                  * latter allows for merging opportunities and more efficient
1230                  * dispatching.
1231                  */
1232 run_queue:
1233                 blk_mq_run_hw_queue(data.hctx, !is_sync || is_flush_fua);
1234         }
1235 done:
1236         blk_mq_put_ctx(data.ctx);
1237 }
1238
1239 /*
1240  * Single hardware queue variant. This will attempt to use any per-process
1241  * plug for merging and IO deferral.
1242  */
1243 static void blk_sq_make_request(struct request_queue *q, struct bio *bio)
1244 {
1245         const int is_sync = rw_is_sync(bio->bi_rw);
1246         const int is_flush_fua = bio->bi_rw & (REQ_FLUSH | REQ_FUA);
1247         unsigned int use_plug, request_count = 0;
1248         struct blk_map_ctx data;
1249         struct request *rq;
1250
1251         /*
1252          * Only use the per-process plug for async, non-flush IO. Sync and
1253          * flush/FUA requests go straight to the hardware queue.
1254          */
1255         use_plug = !is_flush_fua && !is_sync;
1256
1257         blk_queue_bounce(q, &bio);
1258
1259         if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) {
1260                 bio_endio(bio, -EIO);
1261                 return;
1262         }
1263
1264         if (use_plug && !blk_queue_nomerges(q) &&
1265             blk_attempt_plug_merge(q, bio, &request_count))
1266                 return;
1267
1268         rq = blk_mq_map_request(q, bio, &data);
1269         if (unlikely(!rq))
1270                 return;
1271
1272         if (unlikely(is_flush_fua)) {
1273                 blk_mq_bio_to_request(rq, bio);
1274                 blk_insert_flush(rq);
1275                 goto run_queue;
1276         }
1277
1278         /*
1279          * If a task plug exists, utilize it to temporarily store requests
1280          * until the task is either done or scheduled away. The plug list
1281          * is completely lockless.
1282          */
1283         if (use_plug) {
1284                 struct blk_plug *plug = current->plug;
1285
1286                 if (plug) {
1287                         blk_mq_bio_to_request(rq, bio);
1288                         if (list_empty(&plug->mq_list))
1289                                 trace_block_plug(q);
1290                         else if (request_count >= BLK_MAX_REQUEST_COUNT) {
1291                                 blk_flush_plug_list(plug, false);
1292                                 trace_block_plug(q);
1293                         }
1294                         list_add_tail(&rq->queuelist, &plug->mq_list);
1295                         blk_mq_put_ctx(data.ctx);
1296                         return;
1297                 }
1298         }
1299
1300         if (!blk_mq_merge_queue_io(data.hctx, data.ctx, rq, bio)) {
1301                 /*
1302                  * For a SYNC request, send it to the hardware immediately. For
1303                  * an ASYNC request, just ensure that we run it later on. The
1304                  * latter allows for merging opportunities and more efficient
1305                  * dispatching.
1306                  */
1307 run_queue:
1308                 blk_mq_run_hw_queue(data.hctx, !is_sync || is_flush_fua);
1309         }
1310
1311         blk_mq_put_ctx(data.ctx);
1312 }
1313
1314 /*
1315  * Default mapping to a software queue, since we use one per CPU.
1316  */
1317 struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *q, const int cpu)
1318 {
1319         return q->queue_hw_ctx[q->mq_map[cpu]];
1320 }
1321 EXPORT_SYMBOL(blk_mq_map_queue);
1322
1323 static void blk_mq_free_rq_map(struct blk_mq_tag_set *set,
1324                 struct blk_mq_tags *tags, unsigned int hctx_idx)
1325 {
1326         struct page *page;
1327
1328         if (tags->rqs && set->ops->exit_request) {
1329                 int i;
1330
1331                 for (i = 0; i < tags->nr_tags; i++) {
1332                         if (!tags->rqs[i])
1333                                 continue;
1334                         set->ops->exit_request(set->driver_data, tags->rqs[i],
1335                                                 hctx_idx, i);
1336                         tags->rqs[i] = NULL;
1337                 }
1338         }
1339
1340         while (!list_empty(&tags->page_list)) {
1341                 page = list_first_entry(&tags->page_list, struct page, lru);
1342                 list_del_init(&page->lru);
1343                 __free_pages(page, page->private);
1344         }
1345
1346         kfree(tags->rqs);
1347
1348         blk_mq_free_tags(tags);
1349 }
1350
1351 static size_t order_to_size(unsigned int order)
1352 {
1353         return (size_t)PAGE_SIZE << order;
1354 }
1355
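/*
 * Allocate the tags and preallocated requests for one hardware queue.
 * Requests are carved out of higher-order page allocations; each slot is
 * sizeof(struct request) plus the driver payload, rounded up to the
 * cache line size.
 */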
1356 static struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set,
1357                 unsigned int hctx_idx)
1358 {
1359         struct blk_mq_tags *tags;
1360         unsigned int i, j, entries_per_page, max_order = 4;
1361         size_t rq_size, left;
1362
1363         tags = blk_mq_init_tags(set->queue_depth, set->reserved_tags,
1364                                 set->numa_node);
1365         if (!tags)
1366                 return NULL;
1367
1368         INIT_LIST_HEAD(&tags->page_list);
1369
1370         tags->rqs = kzalloc_node(set->queue_depth * sizeof(struct request *),
1371                                  GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY,
1372                                  set->numa_node);
1373         if (!tags->rqs) {
1374                 blk_mq_free_tags(tags);
1375                 return NULL;
1376         }
1377
1378         /*
1379          * rq_size is the size of the request plus driver payload, rounded
1380          * to the cacheline size
1381          */
1382         rq_size = round_up(sizeof(struct request) + set->cmd_size,
1383                                 cache_line_size());
1384         left = rq_size * set->queue_depth;
1385
1386         for (i = 0; i < set->queue_depth; ) {
1387                 int this_order = max_order;
1388                 struct page *page;
1389                 int to_do;
1390                 void *p;
1391
1392                 while (left < order_to_size(this_order - 1) && this_order)
1393                         this_order--;
1394
1395                 do {
1396                         page = alloc_pages_node(set->numa_node,
1397                                 GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY,
1398                                 this_order);
1399                         if (page)
1400                                 break;
1401                         if (!this_order--)
1402                                 break;
1403                         if (order_to_size(this_order) < rq_size)
1404                                 break;
1405                 } while (1);
1406
1407                 if (!page)
1408                         goto fail;
1409
1410                 page->private = this_order;
1411                 list_add_tail(&page->lru, &tags->page_list);
1412
1413                 p = page_address(page);
1414                 entries_per_page = order_to_size(this_order) / rq_size;
1415                 to_do = min(entries_per_page, set->queue_depth - i);
1416                 left -= to_do * rq_size;
1417                 for (j = 0; j < to_do; j++) {
1418                         tags->rqs[i] = p;
1419                         tags->rqs[i]->atomic_flags = 0;
1420                         tags->rqs[i]->cmd_flags = 0;
1421                         if (set->ops->init_request) {
1422                                 if (set->ops->init_request(set->driver_data,
1423                                                 tags->rqs[i], hctx_idx, i,
1424                                                 set->numa_node)) {
1425                                         tags->rqs[i] = NULL;
1426                                         goto fail;
1427                                 }
1428                         }
1429
1430                         p += rq_size;
1431                         i++;
1432                 }
1433         }
1434
1435         return tags;
1436
1437 fail:
1438         blk_mq_free_rq_map(set, tags, hctx_idx);
1439         return NULL;
1440 }
1441
1442 static void blk_mq_free_bitmap(struct blk_mq_ctxmap *bitmap)
1443 {
1444         kfree(bitmap->map);
1445 }
1446
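/*
 * Allocate the ctx_map used to mark software queues with pending work:
 * one bit per possible CPU, split into words of bits_per_word bits.
 */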
1447 static int blk_mq_alloc_bitmap(struct blk_mq_ctxmap *bitmap, int node)
1448 {
1449         unsigned int bpw = 8, total, num_maps, i;
1450
1451         bitmap->bits_per_word = bpw;
1452
1453         num_maps = ALIGN(nr_cpu_ids, bpw) / bpw;
1454         bitmap->map = kzalloc_node(num_maps * sizeof(struct blk_align_bitmap),
1455                                         GFP_KERNEL, node);
1456         if (!bitmap->map)
1457                 return -ENOMEM;
1458
1459         bitmap->map_size = num_maps;
1460
1461         total = nr_cpu_ids;
1462         for (i = 0; i < num_maps; i++) {
1463                 bitmap->map[i].depth = min(total, bitmap->bits_per_word);
1464                 total -= bitmap->map[i].depth;
1465         }
1466
1467         return 0;
1468 }
1469
1470 static int blk_mq_hctx_cpu_offline(struct blk_mq_hw_ctx *hctx, int cpu)
1471 {
1472         struct request_queue *q = hctx->queue;
1473         struct blk_mq_ctx *ctx;
1474         LIST_HEAD(tmp);
1475
1476         /*
1477          * Move ctx entries to new CPU, if this one is going away.
1478          */
1479         ctx = __blk_mq_get_ctx(q, cpu);
1480
1481         spin_lock(&ctx->lock);
1482         if (!list_empty(&ctx->rq_list)) {
1483                 list_splice_init(&ctx->rq_list, &tmp);
1484                 blk_mq_hctx_clear_pending(hctx, ctx);
1485         }
1486         spin_unlock(&ctx->lock);
1487
1488         if (list_empty(&tmp))
1489                 return NOTIFY_OK;
1490
1491         ctx = blk_mq_get_ctx(q);
1492         spin_lock(&ctx->lock);
1493
1494         while (!list_empty(&tmp)) {
1495                 struct request *rq;
1496
1497                 rq = list_first_entry(&tmp, struct request, queuelist);
1498                 rq->mq_ctx = ctx;
1499                 list_move_tail(&rq->queuelist, &ctx->rq_list);
1500         }
1501
1502         hctx = q->mq_ops->map_queue(q, ctx->cpu);
1503         blk_mq_hctx_mark_pending(hctx, ctx);
1504
1505         spin_unlock(&ctx->lock);
1506
1507         blk_mq_run_hw_queue(hctx, true);
1508         blk_mq_put_ctx(ctx);
1509         return NOTIFY_OK;
1510 }
1511
1512 static int blk_mq_hctx_cpu_online(struct blk_mq_hw_ctx *hctx, int cpu)
1513 {
1514         struct request_queue *q = hctx->queue;
1515         struct blk_mq_tag_set *set = q->tag_set;
1516
1517         if (set->tags[hctx->queue_num])
1518                 return NOTIFY_OK;
1519
1520         set->tags[hctx->queue_num] = blk_mq_init_rq_map(set, hctx->queue_num);
1521         if (!set->tags[hctx->queue_num])
1522                 return NOTIFY_STOP;
1523
1524         hctx->tags = set->tags[hctx->queue_num];
1525         return NOTIFY_OK;
1526 }
1527
1528 static int blk_mq_hctx_notify(void *data, unsigned long action,
1529                               unsigned int cpu)
1530 {
1531         struct blk_mq_hw_ctx *hctx = data;
1532
1533         if (action == CPU_DEAD || action == CPU_DEAD_FROZEN)
1534                 return blk_mq_hctx_cpu_offline(hctx, cpu);
1535         else if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN)
1536                 return blk_mq_hctx_cpu_online(hctx, cpu);
1537
1538         return NOTIFY_OK;
1539 }
1540
1541 static void blk_mq_exit_hctx(struct request_queue *q,
1542                 struct blk_mq_tag_set *set,
1543                 struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
1544 {
1545         unsigned flush_start_tag = set->queue_depth;
1546
1547         blk_mq_tag_idle(hctx);
1548
1549         if (set->ops->exit_request)
1550                 set->ops->exit_request(set->driver_data,
1551                                        hctx->fq->flush_rq, hctx_idx,
1552                                        flush_start_tag + hctx_idx);
1553
1554         if (set->ops->exit_hctx)
1555                 set->ops->exit_hctx(hctx, hctx_idx);
1556
1557         blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier);
1558         blk_free_flush_queue(hctx->fq);
1559         kfree(hctx->ctxs);
1560         blk_mq_free_bitmap(&hctx->ctx_map);
1561 }
1562
1563 static void blk_mq_exit_hw_queues(struct request_queue *q,
1564                 struct blk_mq_tag_set *set, int nr_queue)
1565 {
1566         struct blk_mq_hw_ctx *hctx;
1567         unsigned int i;
1568
1569         queue_for_each_hw_ctx(q, hctx, i) {
1570                 if (i == nr_queue)
1571                         break;
1572                 blk_mq_exit_hctx(q, set, hctx, i);
1573         }
1574 }
1575
1576 static void blk_mq_free_hw_queues(struct request_queue *q,
1577                 struct blk_mq_tag_set *set)
1578 {
1579         struct blk_mq_hw_ctx *hctx;
1580         unsigned int i;
1581
1582         queue_for_each_hw_ctx(q, hctx, i) {
1583                 free_cpumask_var(hctx->cpumask);
1584                 kfree(hctx);
1585         }
1586 }
1587
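/*
 * Set up a single hardware context: work items, dispatch list, CPU
 * notifier, software queue map, flush queue and the driver's per-hctx
 * and flush request state. Everything is unwound on failure.
 */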
1588 static int blk_mq_init_hctx(struct request_queue *q,
1589                 struct blk_mq_tag_set *set,
1590                 struct blk_mq_hw_ctx *hctx, unsigned hctx_idx)
1591 {
1592         int node;
1593         unsigned flush_start_tag = set->queue_depth;
1594
1595         node = hctx->numa_node;
1596         if (node == NUMA_NO_NODE)
1597                 node = hctx->numa_node = set->numa_node;
1598
1599         INIT_DELAYED_WORK(&hctx->run_work, blk_mq_run_work_fn);
1600         INIT_DELAYED_WORK(&hctx->delay_work, blk_mq_delay_work_fn);
1601         spin_lock_init(&hctx->lock);
1602         INIT_LIST_HEAD(&hctx->dispatch);
1603         hctx->queue = q;
1604         hctx->queue_num = hctx_idx;
1605         hctx->flags = set->flags;
1606         hctx->cmd_size = set->cmd_size;
1607
1608         blk_mq_init_cpu_notifier(&hctx->cpu_notifier,
1609                                         blk_mq_hctx_notify, hctx);
1610         blk_mq_register_cpu_notifier(&hctx->cpu_notifier);
1611
1612         hctx->tags = set->tags[hctx_idx];
1613
1614         /*
1615          * Allocate space for all possible cpus to avoid allocation at
1616          * runtime
1617          */
1618         hctx->ctxs = kmalloc_node(nr_cpu_ids * sizeof(void *),
1619                                         GFP_KERNEL, node);
1620         if (!hctx->ctxs)
1621                 goto unregister_cpu_notifier;
1622
1623         if (blk_mq_alloc_bitmap(&hctx->ctx_map, node))
1624                 goto free_ctxs;
1625
1626         hctx->nr_ctx = 0;
1627
1628         if (set->ops->init_hctx &&
1629             set->ops->init_hctx(hctx, set->driver_data, hctx_idx))
1630                 goto free_bitmap;
1631
1632         hctx->fq = blk_alloc_flush_queue(q, hctx->numa_node, set->cmd_size);
1633         if (!hctx->fq)
1634                 goto exit_hctx;
1635
1636         if (set->ops->init_request &&
1637             set->ops->init_request(set->driver_data,
1638                                    hctx->fq->flush_rq, hctx_idx,
1639                                    flush_start_tag + hctx_idx, node))
1640                 goto free_fq;
1641
1642         return 0;
1643
1644  free_fq:
1645         blk_free_flush_queue(hctx->fq);
1646  exit_hctx:
1647         if (set->ops->exit_hctx)
1648                 set->ops->exit_hctx(hctx, hctx_idx);
1649  free_bitmap:
1650         blk_mq_free_bitmap(&hctx->ctx_map);
1651  free_ctxs:
1652         kfree(hctx->ctxs);
1653  unregister_cpu_notifier:
1654         blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier);
1655
1656         return -1;
1657 }
1658
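/*
 * Initialize all hardware queues; on failure, tear down the queues that
 * were already set up and return non-zero.
 */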
1659 static int blk_mq_init_hw_queues(struct request_queue *q,
1660                 struct blk_mq_tag_set *set)
1661 {
1662         struct blk_mq_hw_ctx *hctx;
1663         unsigned int i;
1664
1665         /*
1666          * Initialize hardware queues
1667          */
1668         queue_for_each_hw_ctx(q, hctx, i) {
1669                 if (blk_mq_init_hctx(q, set, hctx, i))
1670                         break;
1671         }
1672
1673         if (i == q->nr_hw_queues)
1674                 return 0;
1675
1676         /*
1677          * Init failed
1678          */
1679         blk_mq_exit_hw_queues(q, set, i);
1680
1681         return 1;
1682 }
1683
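/*
 * Initialize the per-CPU software contexts and map each online CPU to
 * its hardware queue via mq_ops->map_queue.
 */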
1684 static void blk_mq_init_cpu_queues(struct request_queue *q,
1685                                    unsigned int nr_hw_queues)
1686 {
1687         unsigned int i;
1688
1689         for_each_possible_cpu(i) {
1690                 struct blk_mq_ctx *__ctx = per_cpu_ptr(q->queue_ctx, i);
1691                 struct blk_mq_hw_ctx *hctx;
1692
1693                 memset(__ctx, 0, sizeof(*__ctx));
1694                 __ctx->cpu = i;
1695                 spin_lock_init(&__ctx->lock);
1696                 INIT_LIST_HEAD(&__ctx->rq_list);
1697                 __ctx->queue = q;
1698
1699                 /* If the cpu isn't online, it is mapped to the first hctx */
1700                 if (!cpu_online(i))
1701                         continue;
1702
1703                 hctx = q->mq_ops->map_queue(q, i);
1704                 cpumask_set_cpu(i, hctx->cpumask);
1705                 hctx->nr_ctx++;
1706
1707                 /*
1708                  * Set local node, IFF we have more than one hw queue. If
1709                  * not, we remain on the home node of the device
1710                  */
1711                 if (nr_hw_queues > 1 && hctx->numa_node == NUMA_NO_NODE)
1712                         hctx->numa_node = cpu_to_node(i);
1713         }
1714 }
1715
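/*
 * Rebuild the software to hardware queue mapping: reset every hctx's
 * cpumask and ctx count, assign each online CPU's ctx to its hctx, free
 * the tag maps of hardware queues that end up with no software queues,
 * and reset the round-robin state used for running the queues.
 */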
1716 static void blk_mq_map_swqueue(struct request_queue *q)
1717 {
1718         unsigned int i;
1719         struct blk_mq_hw_ctx *hctx;
1720         struct blk_mq_ctx *ctx;
1721
1722         queue_for_each_hw_ctx(q, hctx, i) {
1723                 cpumask_clear(hctx->cpumask);
1724                 hctx->nr_ctx = 0;
1725         }
1726
1727         /*
1728          * Map software to hardware queues
1729          */
1730         queue_for_each_ctx(q, ctx, i) {
1731                 /* If the cpu isn't online, it is mapped to the first hctx */
1732                 if (!cpu_online(i))
1733                         continue;
1734
1735                 hctx = q->mq_ops->map_queue(q, i);
1736                 cpumask_set_cpu(i, hctx->cpumask);
1737                 ctx->index_hw = hctx->nr_ctx;
1738                 hctx->ctxs[hctx->nr_ctx++] = ctx;
1739         }
1740
1741         queue_for_each_hw_ctx(q, hctx, i) {
1742                 /*
1743                  * If no software queues are mapped to this hardware queue,
1744                  * disable it and free the request entries.
1745                  */
1746                 if (!hctx->nr_ctx) {
1747                         struct blk_mq_tag_set *set = q->tag_set;
1748
1749                         if (set->tags[i]) {
1750                                 blk_mq_free_rq_map(set, set->tags[i], i);
1751                                 set->tags[i] = NULL;
1752                                 hctx->tags = NULL;
1753                         }
1754                         continue;
1755                 }
1756
1757                 /*
1758                  * Initialize batch roundrobin counts
1759                  */
1760                 hctx->next_cpu = cpumask_first(hctx->cpumask);
1761                 hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;
1762         }
1763 }
1764
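/*
 * Update the BLK_MQ_F_TAG_SHARED flag on every hardware queue of every
 * request queue attached to this tag set, freezing each queue while the
 * flag is changed.
 */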
1765 static void blk_mq_update_tag_set_depth(struct blk_mq_tag_set *set)
1766 {
1767         struct blk_mq_hw_ctx *hctx;
1768         struct request_queue *q;
1769         bool shared;
1770         int i;
1771
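        /* Tags are shared iff more than one queue is attached to the set. */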
1772         if (set->tag_list.next == set->tag_list.prev)
1773                 shared = false;
1774         else
1775                 shared = true;
1776
1777         list_for_each_entry(q, &set->tag_list, tag_set_list) {
1778                 blk_mq_freeze_queue(q);
1779
1780                 queue_for_each_hw_ctx(q, hctx, i) {
1781                         if (shared)
1782                                 hctx->flags |= BLK_MQ_F_TAG_SHARED;
1783                         else
1784                                 hctx->flags &= ~BLK_MQ_F_TAG_SHARED;
1785                 }
1786                 blk_mq_unfreeze_queue(q);
1787         }
1788 }
1789
1790 static void blk_mq_del_queue_tag_set(struct request_queue *q)
1791 {
1792         struct blk_mq_tag_set *set = q->tag_set;
1793
1794         mutex_lock(&set->tag_list_lock);
1795         list_del_init(&q->tag_set_list);
1796         blk_mq_update_tag_set_depth(set);
1797         mutex_unlock(&set->tag_list_lock);
1798 }
1799
1800 static void blk_mq_add_queue_tag_set(struct blk_mq_tag_set *set,
1801                                      struct request_queue *q)
1802 {
1803         q->tag_set = set;
1804
1805         mutex_lock(&set->tag_list_lock);
1806         list_add_tail(&q->tag_set_list, &set->tag_list);
1807         blk_mq_update_tag_set_depth(set);
1808         mutex_unlock(&set->tag_list_lock);
1809 }
1810
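/*
 * Allocate and initialize a request queue for the given tag set: per-CPU
 * software contexts, hardware contexts, the CPU to queue map and the
 * request_queue itself, then wire up the make_request and timeout
 * handling and build the software/hardware queue mapping. Returns
 * ERR_PTR(-ENOMEM) on failure.
 */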
1811 struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
1812 {
1813         struct blk_mq_hw_ctx **hctxs;
1814         struct blk_mq_ctx __percpu *ctx;
1815         struct request_queue *q;
1816         unsigned int *map;
1817         int i;
1818
1819         ctx = alloc_percpu(struct blk_mq_ctx);
1820         if (!ctx)
1821                 return ERR_PTR(-ENOMEM);
1822
1823         hctxs = kmalloc_node(set->nr_hw_queues * sizeof(*hctxs), GFP_KERNEL,
1824                         set->numa_node);
1825
1826         if (!hctxs)
1827                 goto err_percpu;
1828
1829         map = blk_mq_make_queue_map(set);
1830         if (!map)
1831                 goto err_map;
1832
1833         for (i = 0; i < set->nr_hw_queues; i++) {
1834                 int node = blk_mq_hw_queue_to_node(map, i);
1835
1836                 hctxs[i] = kzalloc_node(sizeof(struct blk_mq_hw_ctx),
1837                                         GFP_KERNEL, node);
1838                 if (!hctxs[i])
1839                         goto err_hctxs;
1840
1841                 if (!zalloc_cpumask_var_node(&hctxs[i]->cpumask, GFP_KERNEL,
1842                                                 node))
1843                         goto err_hctxs;
1844
1845                 atomic_set(&hctxs[i]->nr_active, 0);
1846                 hctxs[i]->numa_node = node;
1847                 hctxs[i]->queue_num = i;
1848         }
1849
1850         q = blk_alloc_queue_node(GFP_KERNEL, set->numa_node);
1851         if (!q)
1852                 goto err_hctxs;
1853
1854         /*
1855          * Init percpu_ref in atomic mode so that it's faster to shutdown.
1856          * See blk_register_queue() for details.
1857          */
1858         if (percpu_ref_init(&q->mq_usage_counter, blk_mq_usage_counter_release,
1859                             PERCPU_REF_INIT_ATOMIC, GFP_KERNEL))
1860                 goto err_map;
1861
1862         setup_timer(&q->timeout, blk_mq_rq_timer, (unsigned long) q);
1863         blk_queue_rq_timeout(q, 30000);
1864
1865         q->nr_queues = nr_cpu_ids;
1866         q->nr_hw_queues = set->nr_hw_queues;
1867         q->mq_map = map;
1868
1869         q->queue_ctx = ctx;
1870         q->queue_hw_ctx = hctxs;
1871
1872         q->mq_ops = set->ops;
1873         q->queue_flags |= QUEUE_FLAG_MQ_DEFAULT;
1874
1875         if (!(set->flags & BLK_MQ_F_SG_MERGE))
1876                 q->queue_flags |= 1 << QUEUE_FLAG_NO_SG_MERGE;
1877
1878         q->sg_reserved_size = INT_MAX;
1879
1880         INIT_WORK(&q->requeue_work, blk_mq_requeue_work);
1881         INIT_LIST_HEAD(&q->requeue_list);
1882         spin_lock_init(&q->requeue_lock);
1883
1884         if (q->nr_hw_queues > 1)
1885                 blk_queue_make_request(q, blk_mq_make_request);
1886         else
1887                 blk_queue_make_request(q, blk_sq_make_request);
1888
1889         if (set->timeout)
1890                 blk_queue_rq_timeout(q, set->timeout);
1891
1892         /*
1893          * Do this after blk_queue_make_request() overrides it...
1894          */
1895         q->nr_requests = set->queue_depth;
1896
1897         if (set->ops->complete)
1898                 blk_queue_softirq_done(q, set->ops->complete);
1899
1900         blk_mq_init_cpu_queues(q, set->nr_hw_queues);
1901
1902         if (blk_mq_init_hw_queues(q, set))
1903                 goto err_hw;
1904
1905         mutex_lock(&all_q_mutex);
1906         list_add_tail(&q->all_q_node, &all_q_list);
1907         mutex_unlock(&all_q_mutex);
1908
1909         blk_mq_add_queue_tag_set(set, q);
1910
1911         blk_mq_map_swqueue(q);
1912
1913         return q;
1914
1915 err_hw:
1916         blk_cleanup_queue(q);
1917 err_hctxs:
1918         kfree(map);
1919         for (i = 0; i < set->nr_hw_queues; i++) {
1920                 if (!hctxs[i])
1921                         break;
1922                 free_cpumask_var(hctxs[i]->cpumask);
1923                 kfree(hctxs[i]);
1924         }
1925 err_map:
1926         kfree(hctxs);
1927 err_percpu:
1928         free_percpu(ctx);
1929         return ERR_PTR(-ENOMEM);
1930 }
1931 EXPORT_SYMBOL(blk_mq_init_queue);
1932
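/*
 * Counterpart of blk_mq_init_queue(): detach the queue from its tag set,
 * tear down and free the hardware queues, release the usage counter and
 * free the per-CPU contexts and mappings.
 */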
1933 void blk_mq_free_queue(struct request_queue *q)
1934 {
1935         struct blk_mq_tag_set   *set = q->tag_set;
1936
1937         blk_mq_del_queue_tag_set(q);
1938
1939         blk_mq_exit_hw_queues(q, set, set->nr_hw_queues);
1940         blk_mq_free_hw_queues(q, set);
1941
1942         percpu_ref_exit(&q->mq_usage_counter);
1943
1944         free_percpu(q->queue_ctx);
1945         kfree(q->queue_hw_ctx);
1946         kfree(q->mq_map);
1947
1948         q->queue_ctx = NULL;
1949         q->queue_hw_ctx = NULL;
1950         q->mq_map = NULL;
1951
1952         mutex_lock(&all_q_mutex);
1953         list_del_init(&q->all_q_node);
1954         mutex_unlock(&all_q_mutex);
1955 }
1956
1957 /* Basically redo blk_mq_init_queue with the queue frozen */
1958 static void blk_mq_queue_reinit(struct request_queue *q)
1959 {
1960         WARN_ON_ONCE(!q->mq_freeze_depth);
1961
1962         blk_mq_sysfs_unregister(q);
1963
1964         blk_mq_update_queue_map(q->mq_map, q->nr_hw_queues);
1965
1966         /*
1967          * Redo blk_mq_init_cpu_queues and blk_mq_init_hw_queues. FIXME: maybe
1968          * we should change hctx numa_node according to the new topology (this
1969          * involves freeing and re-allocating memory, so is it worth doing?)
1970          */
1971
1972         blk_mq_map_swqueue(q);
1973
1974         blk_mq_sysfs_register(q);
1975 }
1976
1977 static int blk_mq_queue_reinit_notify(struct notifier_block *nb,
1978                                       unsigned long action, void *hcpu)
1979 {
1980         struct request_queue *q;
1981
1982         /*
1983          * Before new mappings are established, a hot-added CPU might already
1984          * start handling requests. This doesn't break anything, as we map
1985          * offline CPUs to the first hardware queue. We will re-init the queue
1986          * below to get optimal settings.
1987          */
1988         if (action != CPU_DEAD && action != CPU_DEAD_FROZEN &&
1989             action != CPU_ONLINE && action != CPU_ONLINE_FROZEN)
1990                 return NOTIFY_OK;
1991
1992         mutex_lock(&all_q_mutex);
1993
1994         /*
1995          * We need to freeze and reinit all existing queues.  Freezing
1996          * involves a synchronous wait for an RCU grace period, and doing it
1997          * one by one may take a long time.  Start freezing all queues in
1998          * one swoop and then wait for the completions so that freezing can
1999          * take place in parallel.
2000          */
2001         list_for_each_entry(q, &all_q_list, all_q_node)
2002                 blk_mq_freeze_queue_start(q);
2003         list_for_each_entry(q, &all_q_list, all_q_node)
2004                 blk_mq_freeze_queue_wait(q);
2005
2006         list_for_each_entry(q, &all_q_list, all_q_node)
2007                 blk_mq_queue_reinit(q);
2008
2009         list_for_each_entry(q, &all_q_list, all_q_node)
2010                 blk_mq_unfreeze_queue(q);
2011
2012         mutex_unlock(&all_q_mutex);
2013         return NOTIFY_OK;
2014 }
2015
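/*
 * Allocate a tag and request map for every hardware queue at the current
 * set->queue_depth, unwinding the already allocated maps on failure.
 */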
2016 static int __blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
2017 {
2018         int i;
2019
2020         for (i = 0; i < set->nr_hw_queues; i++) {
2021                 set->tags[i] = blk_mq_init_rq_map(set, i);
2022                 if (!set->tags[i])
2023                         goto out_unwind;
2024         }
2025
2026         return 0;
2027
2028 out_unwind:
2029         while (--i >= 0)
2030                 blk_mq_free_rq_map(set, set->tags[i], i);
2031
2032         return -ENOMEM;
2033 }
2034
2035 /*
2036  * Allocate the request maps associated with this tag_set. Note that this
2037  * may reduce the depth asked for, if memory is tight. set->queue_depth
2038  * will be updated to reflect the allocated depth.
2039  */
2040 static int blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
2041 {
2042         unsigned int depth;
2043         int err;
2044
2045         depth = set->queue_depth;
2046         do {
2047                 err = __blk_mq_alloc_rq_maps(set);
2048                 if (!err)
2049                         break;
2050
2051                 set->queue_depth >>= 1;
2052                 if (set->queue_depth < set->reserved_tags + BLK_MQ_TAG_MIN) {
2053                         err = -ENOMEM;
2054                         break;
2055                 }
2056         } while (set->queue_depth);
2057
2058         if (!set->queue_depth || err) {
2059                 pr_err("blk-mq: failed to allocate request map\n");
2060                 return -ENOMEM;
2061         }
2062
2063         if (depth != set->queue_depth)
2064                 pr_info("blk-mq: reduced tag depth (%u -> %u)\n",
2065                                                 depth, set->queue_depth);
2066
2067         return 0;
2068 }
2069
2070 /*
2071  * Alloc a tag set to be associated with one or more request queues.
2072  * May fail with EINVAL for various error conditions. May adjust the
2073  * requested depth down, if it is too large. In that case, the
2074  * adjusted value will be stored in set->queue_depth.
2075  */
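/*
 * Illustrative sketch of a typical caller; the "mydrv" names below are
 * hypothetical and not part of this file:
 *
 *	static struct blk_mq_ops mydrv_mq_ops = {
 *		.queue_rq	= mydrv_queue_rq,
 *		.map_queue	= blk_mq_map_queue,
 *	};
 *
 *	set->ops		= &mydrv_mq_ops;
 *	set->nr_hw_queues	= 1;
 *	set->queue_depth	= 64;
 *	set->numa_node		= NUMA_NO_NODE;
 *	set->cmd_size		= sizeof(struct mydrv_cmd);
 *
 *	ret = blk_mq_alloc_tag_set(set);
 *	if (ret)
 *		return ret;
 *	q = blk_mq_init_queue(set);
 *	if (IS_ERR(q)) {
 *		blk_mq_free_tag_set(set);
 *		return PTR_ERR(q);
 *	}
 */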
2076 int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
2077 {
2078         BUILD_BUG_ON(BLK_MQ_MAX_DEPTH > 1 << BLK_MQ_UNIQUE_TAG_BITS);
2079
2080         if (!set->nr_hw_queues)
2081                 return -EINVAL;
2082         if (!set->queue_depth)
2083                 return -EINVAL;
2084         if (set->queue_depth < set->reserved_tags + BLK_MQ_TAG_MIN)
2085                 return -EINVAL;
2086
2087         if (!set->ops->queue_rq || !set->ops->map_queue)
2088                 return -EINVAL;
2089
2090         if (set->queue_depth > BLK_MQ_MAX_DEPTH) {
2091                 pr_info("blk-mq: reduced tag depth to %u\n",
2092                         BLK_MQ_MAX_DEPTH);
2093                 set->queue_depth = BLK_MQ_MAX_DEPTH;
2094         }
2095
2096         /*
2097          * If a crashdump is active, then we are potentially in a very
2098          * memory constrained environment. Limit us to 1 queue and
2099          * 64 tags to prevent using too much memory.
2100          */
2101         if (is_kdump_kernel()) {
2102                 set->nr_hw_queues = 1;
2103                 set->queue_depth = min(64U, set->queue_depth);
2104         }
2105
2106         set->tags = kmalloc_node(set->nr_hw_queues *
2107                                  sizeof(struct blk_mq_tags *),
2108                                  GFP_KERNEL, set->numa_node);
2109         if (!set->tags)
2110                 return -ENOMEM;
2111
2112         if (blk_mq_alloc_rq_maps(set))
2113                 goto enomem;
2114
2115         mutex_init(&set->tag_list_lock);
2116         INIT_LIST_HEAD(&set->tag_list);
2117
2118         return 0;
2119 enomem:
2120         kfree(set->tags);
2121         set->tags = NULL;
2122         return -ENOMEM;
2123 }
2124 EXPORT_SYMBOL(blk_mq_alloc_tag_set);
2125
2126 void blk_mq_free_tag_set(struct blk_mq_tag_set *set)
2127 {
2128         int i;
2129
2130         for (i = 0; i < set->nr_hw_queues; i++) {
2131                 if (set->tags[i])
2132                         blk_mq_free_rq_map(set, set->tags[i], i);
2133         }
2134
2135         kfree(set->tags);
2136         set->tags = NULL;
2137 }
2138 EXPORT_SYMBOL(blk_mq_free_tag_set);
2139
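/*
 * Change the number of requests (tag depth) available on every hardware
 * queue, e.g. when nr_requests is adjusted at runtime. The new value may
 * not exceed the depth the tag set was allocated with.
 */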
2140 int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr)
2141 {
2142         struct blk_mq_tag_set *set = q->tag_set;
2143         struct blk_mq_hw_ctx *hctx;
2144         int i, ret;
2145
2146         if (!set || nr > set->queue_depth)
2147                 return -EINVAL;
2148
2149         ret = 0;
2150         queue_for_each_hw_ctx(q, hctx, i) {
2151                 ret = blk_mq_tag_update_depth(hctx->tags, nr);
2152                 if (ret)
2153                         break;
2154         }
2155
2156         if (!ret)
2157                 q->nr_requests = nr;
2158
2159         return ret;
2160 }
2161
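/*
 * Hold all_q_mutex so the CPU hotplug notifier (blk_mq_queue_reinit_notify)
 * cannot rebuild queue mappings while the caller is working.
 */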
2162 void blk_mq_disable_hotplug(void)
2163 {
2164         mutex_lock(&all_q_mutex);
2165 }
2166
2167 void blk_mq_enable_hotplug(void)
2168 {
2169         mutex_unlock(&all_q_mutex);
2170 }
2171
2172 static int __init blk_mq_init(void)
2173 {
2174         blk_mq_cpu_init();
2175
2176         hotcpu_notifier(blk_mq_queue_reinit_notify, 0);
2177
2178         return 0;
2179 }
2180 subsys_initcall(blk_mq_init);