xen-blkback-porting
[firefly-linux-kernel-4.4.55.git] drivers/xen/blkback/blkback.c
/******************************************************************************
 * arch/xen/drivers/blkif/backend/main.c
 *
 * Back-end of the driver for virtual block devices. This portion of the
 * driver exports a 'unified' block-device interface that can be accessed
 * by any operating system that implements a compatible front end. A
 * reference front-end implementation can be found in:
 *  arch/xen/drivers/blkif/frontend
 *
 * Copyright (c) 2003-2004, Keir Fraser & Steve Hand
 * Copyright (c) 2005, Christopher Clark
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License version 2
 * as published by the Free Software Foundation; or, when distributed
 * separately from the Linux kernel or incorporated into other
 * software packages, subject to the following license:
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this source file (the "Software"), to deal in the Software without
 * restriction, including without limitation the rights to use, copy, modify,
 * merge, publish, distribute, sublicense, and/or sell copies of the Software,
 * and to permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include <linux/spinlock.h>
#include <linux/kthread.h>
#include <linux/list.h>
#include <linux/delay.h>
#include <linux/freezer.h>
#include <xen/balloon.h>
#include <xen/events.h>
#include <xen/page.h>
#include <asm/xen/hypervisor.h>
#include <asm/xen/hypercall.h>
#include "common.h"

/*
 * These are rather arbitrary. They are fairly large because adjacent requests
 * pulled from a communication ring are quite likely to end up being part of
 * the same scatter/gather request at the disc.
 *
 * ** TRY INCREASING 'blkif_reqs' IF WRITE SPEEDS SEEM TOO LOW **
 *
 * This will increase the chances of being able to write whole tracks.
 * 64 should be enough to keep us competitive with Linux.
 */
static int blkif_reqs = 64;
module_param_named(reqs, blkif_reqs, int, 0);
MODULE_PARM_DESC(reqs, "Number of blkback requests to allocate");
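/*
 * Note: with a permission value of 0 the 'reqs' parameter is not exported
 * via sysfs, so it can only be set at load time (for example as a module
 * option along the lines of "reqs=128"; the exact module name to use when
 * loading depends on how this backend is built).
 */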

/* Run-time switchable: /sys/module/blkback/parameters/ */
static int log_stats = 0;
static int debug_lvl = 0;
module_param(log_stats, int, 0644);
module_param(debug_lvl, int, 0644);

/*
 * Each outstanding request that we've passed to the lower device layers has a
 * 'pending_req' allocated to it. Each bio that completes decrements the
 * pendcnt towards zero. When it hits zero, the specified domain has a
 * response queued for it, with the saved 'id' passed back.
 */
typedef struct {
        blkif_t       *blkif;
        u64            id;
        int            nr_pages;
        atomic_t       pendcnt;
        unsigned short operation;
        int            status;
        struct list_head free_list;
} pending_req_t;

static pending_req_t *pending_reqs;
static struct list_head pending_free;
static DEFINE_SPINLOCK(pending_free_lock);
static DECLARE_WAIT_QUEUE_HEAD(pending_free_wq);

#define BLKBACK_INVALID_HANDLE (~0)

static struct page **pending_pages;
static grant_handle_t *pending_grant_handles;

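/*
 * pending_pages[] and pending_grant_handles[] are flat arrays with
 * BLKIF_MAX_SEGMENTS_PER_REQUEST slots per pending_req; vaddr_pagenr()
 * below turns a (request, segment) pair into the corresponding index.
 */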
static inline int vaddr_pagenr(pending_req_t *req, int seg)
{
        return (req - pending_reqs) * BLKIF_MAX_SEGMENTS_PER_REQUEST + seg;
}

static inline unsigned long vaddr(pending_req_t *req, int seg)
{
        unsigned long pfn = page_to_pfn(pending_pages[vaddr_pagenr(req, seg)]);
        return (unsigned long)pfn_to_kaddr(pfn);
}

#define pending_handle(_req, _seg) \
        (pending_grant_handles[vaddr_pagenr(_req, _seg)])


static int do_block_io_op(blkif_t *blkif);
static void dispatch_rw_block_io(blkif_t *blkif,
                                 struct blkif_request *req,
                                 pending_req_t *pending_req);
static void make_response(blkif_t *blkif, u64 id,
                          unsigned short op, int st);

/******************************************************************
 * misc small helpers
 */
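/*
 * alloc_req() hands out a pending_req from the free list without blocking
 * and returns NULL when the pool is exhausted; free_req() returns one and
 * wakes pending_free_wq waiters only on the empty -> non-empty transition.
 */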
static pending_req_t* alloc_req(void)
{
        pending_req_t *req = NULL;
        unsigned long flags;

        spin_lock_irqsave(&pending_free_lock, flags);
        if (!list_empty(&pending_free)) {
                req = list_entry(pending_free.next, pending_req_t, free_list);
                list_del(&req->free_list);
        }
        spin_unlock_irqrestore(&pending_free_lock, flags);
        return req;
}

static void free_req(pending_req_t *req)
{
        unsigned long flags;
        int was_empty;

        spin_lock_irqsave(&pending_free_lock, flags);
        was_empty = list_empty(&pending_free);
        list_add(&req->free_list, &pending_free);
        spin_unlock_irqrestore(&pending_free_lock, flags);
        if (was_empty)
                wake_up(&pending_free_wq);
}

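/*
 * blkif->plug caches a reference to the request queue of the last block
 * device we submitted I/O to, so that one unplug can cover a batch of
 * requests; unplug_queue() kicks that queue and drops the reference.
 */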
static void unplug_queue(blkif_t *blkif)
{
        if (blkif->plug == NULL)
                return;
        if (blkif->plug->unplug_fn)
                blkif->plug->unplug_fn(blkif->plug);
        blk_put_queue(blkif->plug);
        blkif->plug = NULL;
}

static void plug_queue(blkif_t *blkif, struct block_device *bdev)
{
        struct request_queue *q = bdev_get_queue(bdev);

        if (q == blkif->plug)
                return;
        unplug_queue(blkif);
        blk_get_queue(q);
        blkif->plug = q;
}

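/*
 * Unmap every grant still held by a completed request; invalid handles are
 * skipped, and all unmap operations are batched into one hypercall.
 */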
static void fast_flush_area(pending_req_t *req)
{
        struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST];
        unsigned int i, invcount = 0;
        grant_handle_t handle;
        int ret;

        for (i = 0; i < req->nr_pages; i++) {
                handle = pending_handle(req, i);
                if (handle == BLKBACK_INVALID_HANDLE)
                        continue;
                gnttab_set_unmap_op(&unmap[invcount], vaddr(req, i),
                                    GNTMAP_host_map, handle);
                pending_handle(req, i) = BLKBACK_INVALID_HANDLE;
                invcount++;
        }

        ret = HYPERVISOR_grant_table_op(
                GNTTABOP_unmap_grant_ref, unmap, invcount);
        BUG_ON(ret);
}

/******************************************************************
 * SCHEDULER FUNCTIONS
 */

static void print_stats(blkif_t *blkif)
{
        printk(KERN_DEBUG "%s: oo %3d  |  rd %4d  |  wr %4d  |  br %4d\n",
               current->comm, blkif->st_oo_req,
               blkif->st_rd_req, blkif->st_wr_req, blkif->st_br_req);
        blkif->st_print = jiffies + msecs_to_jiffies(10 * 1000);
        blkif->st_rd_req = 0;
        blkif->st_wr_req = 0;
        blkif->st_oo_req = 0;
}

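/*
 * Main loop of the per-interface kernel thread: sleep until the frontend
 * signals work (blkif->wq) and a pending_req is available, then drain the
 * ring via do_block_io_op(); waiting_reqs is re-armed if requests remain.
 */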
int blkif_schedule(void *arg)
{
        blkif_t *blkif = arg;

        blkif_get(blkif);

        if (debug_lvl)
                printk(KERN_DEBUG "%s: started\n", current->comm);

        while (!kthread_should_stop()) {
                if (try_to_freeze())
                        continue;

                wait_event_interruptible(
                        blkif->wq,
                        blkif->waiting_reqs || kthread_should_stop());
                wait_event_interruptible(
                        pending_free_wq,
                        !list_empty(&pending_free) || kthread_should_stop());

                blkif->waiting_reqs = 0;
                smp_mb(); /* clear flag *before* checking for work */

                if (do_block_io_op(blkif))
                        blkif->waiting_reqs = 1;
                unplug_queue(blkif);

                if (log_stats && time_after(jiffies, blkif->st_print))
                        print_stats(blkif);
        }

        if (log_stats)
                print_stats(blkif);
        if (debug_lvl)
                printk(KERN_DEBUG "%s: exiting\n", current->comm);

        blkif->xenblkd = NULL;
        blkif_put(blkif);

        return 0;
}

/******************************************************************
 * COMPLETION CALLBACK -- Called as bio->bi_end_io()
 */

static void __end_block_io_op(pending_req_t *pending_req, int error)
{
        /* An error fails the entire request. */
        if ((pending_req->operation == BLKIF_OP_WRITE_BARRIER) &&
            (error == -EOPNOTSUPP)) {
                DPRINTK("blkback: write barrier op failed, not supported\n");
                blkback_barrier(XBT_NIL, pending_req->blkif->be, 0);
                pending_req->status = BLKIF_RSP_EOPNOTSUPP;
        } else if (error) {
                DPRINTK("Buffer not up-to-date at end of operation, "
                        "error=%d\n", error);
                pending_req->status = BLKIF_RSP_ERROR;
        }

        if (atomic_dec_and_test(&pending_req->pendcnt)) {
                fast_flush_area(pending_req);
                make_response(pending_req->blkif, pending_req->id,
                              pending_req->operation, pending_req->status);
                blkif_put(pending_req->blkif);
                free_req(pending_req);
        }
}

static void end_block_io_op(struct bio *bio, int error)
{
        __end_block_io_op(bio->bi_private, error);
        bio_put(bio);
}


/******************************************************************************
 * NOTIFICATION FROM GUEST OS.
 */

static void blkif_notify_work(blkif_t *blkif)
{
        blkif->waiting_reqs = 1;
        wake_up(&blkif->wq);
}

irqreturn_t blkif_be_int(int irq, void *dev_id)
{
        blkif_notify_work(dev_id);
        return IRQ_HANDLED;
}



/******************************************************************
 * DOWNWARD CALLS -- These interface with the block-device layer proper.
 */

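/*
 * Pull requests off the shared ring until it is empty, we run out of
 * pending_reqs, or the thread is asked to stop.  Returns nonzero if more
 * work remains so that blkif_schedule() will come back around.
 */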
static int do_block_io_op(blkif_t *blkif)
{
        union blkif_back_rings *blk_rings = &blkif->blk_rings;
        struct blkif_request req;
        pending_req_t *pending_req;
        RING_IDX rc, rp;
        int more_to_do = 0;

        rc = blk_rings->common.req_cons;
        rp = blk_rings->common.sring->req_prod;
        rmb(); /* Ensure we see queued requests up to 'rp'. */

        while (rc != rp) {

                if (RING_REQUEST_CONS_OVERFLOW(&blk_rings->common, rc))
                        break;

                pending_req = alloc_req();
                if (NULL == pending_req) {
                        blkif->st_oo_req++;
                        more_to_do = 1;
                        break;
                }

                if (kthread_should_stop()) {
                        more_to_do = 1;
                        break;
                }

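                /*
                 * Copy the request into a private, protocol-independent
                 * blkif_request: 32-bit and 64-bit frontends lay out the
                 * ring entries differently, hence the per-protocol helpers.
                 */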
                switch (blkif->blk_protocol) {
                case BLKIF_PROTOCOL_NATIVE:
                        memcpy(&req, RING_GET_REQUEST(&blk_rings->native, rc), sizeof(req));
                        break;
                case BLKIF_PROTOCOL_X86_32:
                        blkif_get_x86_32_req(&req, RING_GET_REQUEST(&blk_rings->x86_32, rc));
                        break;
                case BLKIF_PROTOCOL_X86_64:
                        blkif_get_x86_64_req(&req, RING_GET_REQUEST(&blk_rings->x86_64, rc));
                        break;
                default:
                        BUG();
                }
                blk_rings->common.req_cons = ++rc; /* before make_response() */

                /* Apply all sanity checks to /private copy/ of request. */
                barrier();

                switch (req.operation) {
                case BLKIF_OP_READ:
                        blkif->st_rd_req++;
                        dispatch_rw_block_io(blkif, &req, pending_req);
                        break;
                case BLKIF_OP_WRITE_BARRIER:
                        blkif->st_br_req++;
                        /* fall through */
                case BLKIF_OP_WRITE:
                        blkif->st_wr_req++;
                        dispatch_rw_block_io(blkif, &req, pending_req);
                        break;
                default:
                        /* A good sign something is wrong: sleep for a while to
                         * avoid excessive CPU consumption by a bad guest. */
                        msleep(1);
                        DPRINTK("error: unknown block io operation [%d]\n",
                                req.operation);
                        make_response(blkif, req.id, req.operation,
                                      BLKIF_RSP_ERROR);
                        free_req(pending_req);
                        break;
                }

                /* Yield point for this unbounded loop. */
                cond_resched();
        }

        return more_to_do;
}

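/*
 * Turn a single read/write/barrier request into bios and hand them to the
 * block layer: validate the segment list, grant-map the frontend's frames
 * into our pending_pages slots, translate the virtual device offset with
 * vbd_translate(), then build and submit as few bios as possible.
 * pendcnt tracks the in-flight bios; the response is sent from
 * __end_block_io_op() when the last one completes.
 */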
static void dispatch_rw_block_io(blkif_t *blkif,
                                 struct blkif_request *req,
                                 pending_req_t *pending_req)
{
        struct gnttab_map_grant_ref map[BLKIF_MAX_SEGMENTS_PER_REQUEST];
        struct phys_req preq;
        struct {
                unsigned long buf; unsigned int nsec;
        } seg[BLKIF_MAX_SEGMENTS_PER_REQUEST];
        unsigned int nseg;
        struct bio *bio = NULL;
        int ret, i;
        int operation;

        switch (req->operation) {
        case BLKIF_OP_READ:
                operation = READ;
                break;
        case BLKIF_OP_WRITE:
                operation = WRITE;
                break;
        case BLKIF_OP_WRITE_BARRIER:
                operation = WRITE_BARRIER;
                break;
        default:
                operation = 0; /* make gcc happy */
                BUG();
        }

        /* Check that number of segments is sane. */
        nseg = req->nr_segments;
        if (unlikely(nseg == 0 && operation != WRITE_BARRIER) ||
            unlikely(nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST)) {
                DPRINTK("Bad number of segments in request (%d)\n", nseg);
                goto fail_response;
        }

        preq.dev           = req->handle;
        preq.sector_number = req->sector_number;
        preq.nr_sects      = 0;

        pending_req->blkif     = blkif;
        pending_req->id        = req->id;
        pending_req->operation = req->operation;
        pending_req->status    = BLKIF_RSP_OKAY;
        pending_req->nr_pages  = nseg;

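        /*
         * Build one grant-map operation per segment.  Grants for disk
         * writes are mapped read-only (we only read the guest's data);
         * grants for disk reads must be writable so we can fill them in.
         */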
        for (i = 0; i < nseg; i++) {
                uint32_t flags;

                seg[i].nsec = req->seg[i].last_sect -
                        req->seg[i].first_sect + 1;

                if ((req->seg[i].last_sect >= (PAGE_SIZE >> 9)) ||
                    (req->seg[i].last_sect < req->seg[i].first_sect))
                        goto fail_response;
                preq.nr_sects += seg[i].nsec;

                flags = GNTMAP_host_map;
                if (operation != READ)
                        flags |= GNTMAP_readonly;
                gnttab_set_map_op(&map[i], vaddr(pending_req, i), flags,
                                  req->seg[i].gref, blkif->domid);
        }

        ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, map, nseg);
        BUG_ON(ret);

        for (i = 0; i < nseg; i++) {
                if (unlikely(map[i].status != 0)) {
                        DPRINTK("invalid buffer -- could not remap it\n");
                        map[i].handle = BLKBACK_INVALID_HANDLE;
                        ret |= 1;
                }

                pending_handle(pending_req, i) = map[i].handle;

                if (ret)
                        continue;

                set_phys_to_machine(__pa(vaddr(
                        pending_req, i)) >> PAGE_SHIFT,
                        FOREIGN_FRAME(map[i].dev_bus_addr >> PAGE_SHIFT));
                seg[i].buf  = map[i].dev_bus_addr |
                        (req->seg[i].first_sect << 9);
        }

        if (ret)
                goto fail_flush;

        if (vbd_translate(&preq, blkif, operation) != 0) {
                DPRINTK("access denied: %s of [%llu,%llu] on dev=%04x\n",
                        operation == READ ? "read" : "write",
                        preq.sector_number,
                        preq.sector_number + preq.nr_sects, preq.dev);
                goto fail_flush;
        }

        plug_queue(blkif, preq.bdev);
        atomic_set(&pending_req->pendcnt, 1);
        blkif_get(blkif);

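        /*
         * Pack the mapped segments into as few bios as possible: keep
         * adding pages to the current bio until bio_add_page() refuses,
         * then submit it (bumping pendcnt) and start a new one.
         */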
        for (i = 0; i < nseg; i++) {
                if (((int)preq.sector_number|(int)seg[i].nsec) &
                    ((bdev_hardsect_size(preq.bdev) >> 9) - 1)) {
                        DPRINTK("Misaligned I/O request from domain %d\n",
                                blkif->domid);
                        goto fail_put_bio;
                }
                while ((bio == NULL) ||
                       (bio_add_page(bio,
                                     virt_to_page(vaddr(pending_req, i)),
                                     seg[i].nsec << 9,
                                     seg[i].buf & ~PAGE_MASK) == 0)) {
                        if (bio) {
                                atomic_inc(&pending_req->pendcnt);
                                submit_bio(operation, bio);
                        }

                        bio = bio_alloc(GFP_KERNEL, nseg-i);
                        if (unlikely(bio == NULL))
                                goto fail_put_bio;

                        bio->bi_bdev    = preq.bdev;
                        bio->bi_private = pending_req;
                        bio->bi_end_io  = end_block_io_op;
                        bio->bi_sector  = preq.sector_number;
                }

                preq.sector_number += seg[i].nsec;
        }

        if (!bio) {
                BUG_ON(operation != WRITE_BARRIER);
                bio = bio_alloc(GFP_KERNEL, 0);
                if (unlikely(bio == NULL))
                        goto fail_put_bio;

                bio->bi_bdev    = preq.bdev;
                bio->bi_private = pending_req;
                bio->bi_end_io  = end_block_io_op;
                bio->bi_sector  = -1;
        }

        submit_bio(operation, bio);

        if (operation == READ)
                blkif->st_rd_sect += preq.nr_sects;
        else if (operation == WRITE || operation == WRITE_BARRIER)
                blkif->st_wr_sect += preq.nr_sects;

        return;

 fail_flush:
        fast_flush_area(pending_req);
 fail_response:
        make_response(blkif, req->id, req->operation, BLKIF_RSP_ERROR);
        free_req(pending_req);
        msleep(1); /* back off a bit */
        return;

 fail_put_bio:
        __end_block_io_op(pending_req, -EINVAL);
        if (bio)
                bio_put(bio);
        unplug_queue(blkif);
        msleep(1); /* back off a bit */
        return;
}



/******************************************************************
 * MISCELLANEOUS SETUP / TEARDOWN / DEBUGGING
 */


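/*
 * Queue a response on the ring for the given request id.  If the frontend
 * asked to be notified, signal it via the event channel; if requests are
 * still outstanding (or arrived while we were responding), kick our own
 * thread again rather than waiting for another interrupt.
 */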
static void make_response(blkif_t *blkif, u64 id,
                          unsigned short op, int st)
{
        struct blkif_response  resp;
        unsigned long     flags;
        union blkif_back_rings *blk_rings = &blkif->blk_rings;
        int more_to_do = 0;
        int notify;

        resp.id        = id;
        resp.operation = op;
        resp.status    = st;

        spin_lock_irqsave(&blkif->blk_ring_lock, flags);
        /* Place on the response ring for the relevant domain. */
        switch (blkif->blk_protocol) {
        case BLKIF_PROTOCOL_NATIVE:
                memcpy(RING_GET_RESPONSE(&blk_rings->native, blk_rings->native.rsp_prod_pvt),
                       &resp, sizeof(resp));
                break;
        case BLKIF_PROTOCOL_X86_32:
                memcpy(RING_GET_RESPONSE(&blk_rings->x86_32, blk_rings->x86_32.rsp_prod_pvt),
                       &resp, sizeof(resp));
                break;
        case BLKIF_PROTOCOL_X86_64:
                memcpy(RING_GET_RESPONSE(&blk_rings->x86_64, blk_rings->x86_64.rsp_prod_pvt),
                       &resp, sizeof(resp));
                break;
        default:
                BUG();
        }
        blk_rings->common.rsp_prod_pvt++;
        RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&blk_rings->common, notify);
        if (blk_rings->common.rsp_prod_pvt == blk_rings->common.req_cons) {
                /*
                 * Tail check for pending requests. Allows frontend to avoid
                 * notifications if requests are already in flight (lower
                 * overheads and promotes batching).
                 */
                RING_FINAL_CHECK_FOR_REQUESTS(&blk_rings->common, more_to_do);

        } else if (RING_HAS_UNCONSUMED_REQUESTS(&blk_rings->common)) {
                more_to_do = 1;
        }

        spin_unlock_irqrestore(&blkif->blk_ring_lock, flags);

        if (more_to_do)
                blkif_notify_work(blkif);
        if (notify)
                notify_remote_via_irq(blkif->irq);
}

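/*
 * Module initialisation: size the pending_req pool from the 'reqs'
 * parameter, allocate the flat grant-handle and page arrays to match, and
 * register the interface and xenbus glue.  Bails out with -ENODEV when not
 * running in a Xen PV domain.
 */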
static int __init blkif_init(void)
{
        int i, mmap_pages;

        printk(KERN_DEBUG "blkif_init\n");
        if (!xen_pv_domain())
                return -ENODEV;

        mmap_pages = blkif_reqs * BLKIF_MAX_SEGMENTS_PER_REQUEST;

        pending_reqs          = kmalloc(sizeof(pending_reqs[0]) *
                                        blkif_reqs, GFP_KERNEL);
        pending_grant_handles = kmalloc(sizeof(pending_grant_handles[0]) *
                                        mmap_pages, GFP_KERNEL);
        pending_pages         = alloc_empty_pages_and_pagevec(mmap_pages);

        if (!pending_reqs || !pending_grant_handles || !pending_pages)
                goto out_of_memory;

        for (i = 0; i < mmap_pages; i++)
                pending_grant_handles[i] = BLKBACK_INVALID_HANDLE;

        blkif_interface_init();

        memset(pending_reqs, 0, sizeof(pending_reqs[0]) * blkif_reqs);
        INIT_LIST_HEAD(&pending_free);

        for (i = 0; i < blkif_reqs; i++)
                list_add_tail(&pending_reqs[i].free_list, &pending_free);

        blkif_xenbus_init();

        return 0;

 out_of_memory:
        kfree(pending_reqs);
        kfree(pending_grant_handles);
        free_empty_pages_and_pagevec(pending_pages, mmap_pages);
        printk(KERN_ERR "%s: out of memory\n", __func__);
        return -ENOMEM;
}

module_init(blkif_init);

MODULE_LICENSE("Dual BSD/GPL");