xen: add blkback support
[firefly-linux-kernel-4.4.55.git] drivers/xen/blkback/blkback.c
/******************************************************************************
 * arch/xen/drivers/blkif/backend/main.c
 *
 * Back-end of the driver for virtual block devices. This portion of the
 * driver exports a 'unified' block-device interface that can be accessed
 * by any operating system that implements a compatible front end. A
 * reference front-end implementation can be found in:
 *  arch/xen/drivers/blkif/frontend
 *
 * Copyright (c) 2003-2004, Keir Fraser & Steve Hand
 * Copyright (c) 2005, Christopher Clark
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License version 2
 * as published by the Free Software Foundation; or, when distributed
 * separately from the Linux kernel or incorporated into other
 * software packages, subject to the following license:
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this source file (the "Software"), to deal in the Software without
 * restriction, including without limitation the rights to use, copy, modify,
 * merge, publish, distribute, sublicense, and/or sell copies of the Software,
 * and to permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include <linux/spinlock.h>
#include <linux/kthread.h>
#include <linux/list.h>
#include <linux/delay.h>
#include <xen/balloon.h>
#include <asm/hypervisor.h>
#include "common.h"

/*
 * These are rather arbitrary. They are fairly large because adjacent requests
 * pulled from a communication ring are quite likely to end up being part of
 * the same scatter/gather request at the disc.
 *
 * ** TRY INCREASING 'blkif_reqs' IF WRITE SPEEDS SEEM TOO LOW **
 *
 * This will increase the chances of being able to write whole tracks.
 * 64 should be enough to keep us competitive with Linux.
 */
static int blkif_reqs = 64;
module_param_named(reqs, blkif_reqs, int, 0);
MODULE_PARM_DESC(reqs, "Number of blkback requests to allocate");

/* Run-time switchable: /sys/module/blkback/parameters/ */
static unsigned int log_stats = 0;
static unsigned int debug_lvl = 0;
module_param(log_stats, int, 0644);
module_param(debug_lvl, int, 0644);

/*
 * Each outstanding request that we've passed to the lower device layers has a
 * 'pending_req' allocated to it. Each buffer_head that completes decrements
 * the pendcnt towards zero. When it hits zero, the specified domain has a
 * response queued for it, with the saved 'id' passed back.
 */
typedef struct {
        blkif_t       *blkif;
        u64            id;
        int            nr_pages;
        atomic_t       pendcnt;
        unsigned short operation;
        int            status;
        struct list_head free_list;
} pending_req_t;

static pending_req_t *pending_reqs;
static struct list_head pending_free;
static DEFINE_SPINLOCK(pending_free_lock);
static DECLARE_WAIT_QUEUE_HEAD(pending_free_wq);

#define BLKBACK_INVALID_HANDLE (~0)

static struct page **pending_pages;
static grant_handle_t *pending_grant_handles;

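/*
 * Each pending_req_t owns a fixed window of BLKIF_MAX_SEGMENTS_PER_REQUEST
 * slots in pending_pages[] and pending_grant_handles[].  vaddr_pagenr()
 * computes the slot for a given (request, segment) pair, and vaddr() returns
 * the kernel virtual address of the pre-allocated page backing that slot.
 */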
static inline int vaddr_pagenr(pending_req_t *req, int seg)
{
        return (req - pending_reqs) * BLKIF_MAX_SEGMENTS_PER_REQUEST + seg;
}

static inline unsigned long vaddr(pending_req_t *req, int seg)
{
        unsigned long pfn = page_to_pfn(pending_pages[vaddr_pagenr(req, seg)]);
        return (unsigned long)pfn_to_kaddr(pfn);
}

#define pending_handle(_req, _seg) \
        (pending_grant_handles[vaddr_pagenr(_req, _seg)])


static int do_block_io_op(blkif_t *blkif);
static void dispatch_rw_block_io(blkif_t *blkif,
                                 blkif_request_t *req,
                                 pending_req_t *pending_req);
static void make_response(blkif_t *blkif, u64 id,
                          unsigned short op, int st);

/******************************************************************
 * misc small helpers
 */
static pending_req_t* alloc_req(void)
{
        pending_req_t *req = NULL;
        unsigned long flags;

        spin_lock_irqsave(&pending_free_lock, flags);
        if (!list_empty(&pending_free)) {
                req = list_entry(pending_free.next, pending_req_t, free_list);
                list_del(&req->free_list);
        }
        spin_unlock_irqrestore(&pending_free_lock, flags);
        return req;
}

static void free_req(pending_req_t *req)
{
        unsigned long flags;
        int was_empty;

        spin_lock_irqsave(&pending_free_lock, flags);
        was_empty = list_empty(&pending_free);
        list_add(&req->free_list, &pending_free);
        spin_unlock_irqrestore(&pending_free_lock, flags);
        if (was_empty)
                wake_up(&pending_free_wq);
}

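/*
 * Queue plugging: blkback keeps a reference to the request queue of the
 * backing device it last submitted to (blkif->plug).  plug_queue() switches
 * that reference when a request targets a different device, and
 * unplug_queue() kicks the queue's unplug function and drops the reference
 * once the dispatch loop has no more work for it.
 */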
static void unplug_queue(blkif_t *blkif)
{
        if (blkif->plug == NULL)
                return;
        if (blkif->plug->unplug_fn)
                blkif->plug->unplug_fn(blkif->plug);
        blk_put_queue(blkif->plug);
        blkif->plug = NULL;
}

static void plug_queue(blkif_t *blkif, struct block_device *bdev)
{
        request_queue_t *q = bdev_get_queue(bdev);

        if (q == blkif->plug)
                return;
        unplug_queue(blkif);
        blk_get_queue(q);
        blkif->plug = q;
}

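/*
 * Tear down the grant mappings of a completed request: every valid handle in
 * the request's slot window is turned into a GNTTABOP_unmap_grant_ref op and
 * the whole batch is unmapped with a single hypercall.
 */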
static void fast_flush_area(pending_req_t *req)
{
        struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST];
        unsigned int i, invcount = 0;
        grant_handle_t handle;
        int ret;

        for (i = 0; i < req->nr_pages; i++) {
                handle = pending_handle(req, i);
                if (handle == BLKBACK_INVALID_HANDLE)
                        continue;
                gnttab_set_unmap_op(&unmap[invcount], vaddr(req, i),
                                    GNTMAP_host_map, handle);
                pending_handle(req, i) = BLKBACK_INVALID_HANDLE;
                invcount++;
        }

        ret = HYPERVISOR_grant_table_op(
                GNTTABOP_unmap_grant_ref, unmap, invcount);
        BUG_ON(ret);
}

/******************************************************************
 * SCHEDULER FUNCTIONS
 */

static void print_stats(blkif_t *blkif)
{
        printk(KERN_DEBUG "%s: oo %3d  |  rd %4d  |  wr %4d  |  br %4d\n",
               current->comm, blkif->st_oo_req,
               blkif->st_rd_req, blkif->st_wr_req, blkif->st_br_req);
        blkif->st_print = jiffies + msecs_to_jiffies(10 * 1000);
        blkif->st_rd_req = 0;
        blkif->st_wr_req = 0;
        blkif->st_oo_req = 0;
}

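/*
 * Per-interface dispatch thread.  It sleeps until the frontend has signalled
 * new requests (blkif->wq) and at least one pending_req_t is free
 * (pending_free_wq), then drains the ring via do_block_io_op(); if that
 * returns with work still outstanding, waiting_reqs is set again so the loop
 * runs once more without needing another event from the frontend.
 */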
int blkif_schedule(void *arg)
{
        blkif_t *blkif = arg;

        blkif_get(blkif);

        if (debug_lvl)
                printk(KERN_DEBUG "%s: started\n", current->comm);

        while (!kthread_should_stop()) {
                if (try_to_freeze())
                        continue;

                wait_event_interruptible(
                        blkif->wq,
                        blkif->waiting_reqs || kthread_should_stop());
                wait_event_interruptible(
                        pending_free_wq,
                        !list_empty(&pending_free) || kthread_should_stop());

                blkif->waiting_reqs = 0;
                smp_mb(); /* clear flag *before* checking for work */

                if (do_block_io_op(blkif))
                        blkif->waiting_reqs = 1;
                unplug_queue(blkif);

                if (log_stats && time_after(jiffies, blkif->st_print))
                        print_stats(blkif);
        }

        if (log_stats)
                print_stats(blkif);
        if (debug_lvl)
                printk(KERN_DEBUG "%s: exiting\n", current->comm);

        blkif->xenblkd = NULL;
        blkif_put(blkif);

        return 0;
}

/******************************************************************
 * COMPLETION CALLBACK -- Called as bio->bi_end_io()
 */

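/*
 * Common completion path.  A WRITE_BARRIER that fails with -EOPNOTSUPP is
 * reported to the frontend as BLKIF_RSP_EOPNOTSUPP (after calling
 * blkback_barrier() so the xenbus layer stops advertising barrier support);
 * any other error fails the whole request.  Once the last outstanding bio
 * completes, the grant mappings are torn down and the response is queued.
 */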
static void __end_block_io_op(pending_req_t *pending_req, int error)
{
        /* An error fails the entire request. */
        if ((pending_req->operation == BLKIF_OP_WRITE_BARRIER) &&
            (error == -EOPNOTSUPP)) {
                DPRINTK("blkback: write barrier op failed, not supported\n");
                blkback_barrier(XBT_NIL, pending_req->blkif->be, 0);
                pending_req->status = BLKIF_RSP_EOPNOTSUPP;
        } else if (error) {
                DPRINTK("Buffer not up-to-date at end of operation, "
                        "error=%d\n", error);
                pending_req->status = BLKIF_RSP_ERROR;
        }

        if (atomic_dec_and_test(&pending_req->pendcnt)) {
                fast_flush_area(pending_req);
                make_response(pending_req->blkif, pending_req->id,
                              pending_req->operation, pending_req->status);
                blkif_put(pending_req->blkif);
                free_req(pending_req);
        }
}

static int end_block_io_op(struct bio *bio, unsigned int done, int error)
{
        if (bio->bi_size != 0)
                return 1;
        __end_block_io_op(bio->bi_private, error);
        bio_put(bio);
        return error;
}


/******************************************************************************
 * NOTIFICATION FROM GUEST OS.
 */

static void blkif_notify_work(blkif_t *blkif)
{
        blkif->waiting_reqs = 1;
        wake_up(&blkif->wq);
}

irqreturn_t blkif_be_int(int irq, void *dev_id, struct pt_regs *regs)
{
        blkif_notify_work(dev_id);
        return IRQ_HANDLED;
}



/******************************************************************
 * DOWNWARD CALLS -- These interface with the block-device layer proper.
 */

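/*
 * Pull requests off the shared ring until it is empty, the free pending_req_t
 * pool runs out, or the thread is asked to stop.  Each request is copied into
 * a private structure (translating from the 32- or 64-bit frontend layout if
 * necessary) before any of its fields are trusted, and req_cons is advanced
 * before dispatch so that make_response() sees a consistent ring.  Returns
 * non-zero if work is still outstanding and the caller should reschedule.
 */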
static int do_block_io_op(blkif_t *blkif)
{
        blkif_back_rings_t *blk_rings = &blkif->blk_rings;
        blkif_request_t req;
        pending_req_t *pending_req;
        RING_IDX rc, rp;
        int more_to_do = 0;

        rc = blk_rings->common.req_cons;
        rp = blk_rings->common.sring->req_prod;
        rmb(); /* Ensure we see queued requests up to 'rp'. */

        while (rc != rp) {

                if (RING_REQUEST_CONS_OVERFLOW(&blk_rings->common, rc))
                        break;

                pending_req = alloc_req();
                if (NULL == pending_req) {
                        blkif->st_oo_req++;
                        more_to_do = 1;
                        break;
                }

                if (kthread_should_stop()) {
                        more_to_do = 1;
                        break;
                }

                switch (blkif->blk_protocol) {
                case BLKIF_PROTOCOL_NATIVE:
                        memcpy(&req, RING_GET_REQUEST(&blk_rings->native, rc), sizeof(req));
                        break;
                case BLKIF_PROTOCOL_X86_32:
                        blkif_get_x86_32_req(&req, RING_GET_REQUEST(&blk_rings->x86_32, rc));
                        break;
                case BLKIF_PROTOCOL_X86_64:
                        blkif_get_x86_64_req(&req, RING_GET_REQUEST(&blk_rings->x86_64, rc));
                        break;
                default:
                        BUG();
                }
                blk_rings->common.req_cons = ++rc; /* before make_response() */

                /* Apply all sanity checks to /private copy/ of request. */
                barrier();

                switch (req.operation) {
                case BLKIF_OP_READ:
                        blkif->st_rd_req++;
                        dispatch_rw_block_io(blkif, &req, pending_req);
                        break;
                case BLKIF_OP_WRITE_BARRIER:
                        blkif->st_br_req++;
                        /* fall through */
                case BLKIF_OP_WRITE:
                        blkif->st_wr_req++;
                        dispatch_rw_block_io(blkif, &req, pending_req);
                        break;
                default:
                        /* A good sign something is wrong: sleep for a while to
                         * avoid excessive CPU consumption by a bad guest. */
                        msleep(1);
                        DPRINTK("error: unknown block io operation [%d]\n",
                                req.operation);
                        make_response(blkif, req.id, req.operation,
                                      BLKIF_RSP_ERROR);
                        free_req(pending_req);
                        break;
                }

                /* Yield point for this unbounded loop. */
                cond_resched();
        }

        return more_to_do;
}

static void dispatch_rw_block_io(blkif_t *blkif,
                                 blkif_request_t *req,
                                 pending_req_t *pending_req)
{
        extern void ll_rw_block(int rw, int nr, struct buffer_head * bhs[]);
        struct gnttab_map_grant_ref map[BLKIF_MAX_SEGMENTS_PER_REQUEST];
        struct phys_req preq;
        struct {
                unsigned long buf; unsigned int nsec;
        } seg[BLKIF_MAX_SEGMENTS_PER_REQUEST];
        unsigned int nseg;
        struct bio *bio = NULL;
        int ret, i;
        int operation;

        switch (req->operation) {
        case BLKIF_OP_READ:
                operation = READ;
                break;
        case BLKIF_OP_WRITE:
                operation = WRITE;
                break;
        case BLKIF_OP_WRITE_BARRIER:
                operation = WRITE_BARRIER;
                break;
        default:
                operation = 0; /* make gcc happy */
                BUG();
        }

        /* Check that number of segments is sane. */
        nseg = req->nr_segments;
        if (unlikely(nseg == 0 && operation != WRITE_BARRIER) ||
            unlikely(nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST)) {
                DPRINTK("Bad number of segments in request (%d)\n", nseg);
                goto fail_response;
        }

        preq.dev           = req->handle;
        preq.sector_number = req->sector_number;
        preq.nr_sects      = 0;

        pending_req->blkif     = blkif;
        pending_req->id        = req->id;
        pending_req->operation = req->operation;
        pending_req->status    = BLKIF_RSP_OKAY;
        pending_req->nr_pages  = nseg;

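        /*
         * Build one grant-map operation per segment: the frontend's granted
         * page is mapped at this request's pre-reserved virtual address for
         * that segment.  For writes and barriers the mapping is read-only
         * (data only flows out of the guest page); for reads it must stay
         * writable so the disk data can be copied in.
         */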
        for (i = 0; i < nseg; i++) {
                uint32_t flags;

                seg[i].nsec = req->seg[i].last_sect -
                        req->seg[i].first_sect + 1;

                if ((req->seg[i].last_sect >= (PAGE_SIZE >> 9)) ||
                    (req->seg[i].last_sect < req->seg[i].first_sect))
                        goto fail_response;
                preq.nr_sects += seg[i].nsec;

                flags = GNTMAP_host_map;
                if (operation != READ)
                        flags |= GNTMAP_readonly;
                gnttab_set_map_op(&map[i], vaddr(pending_req, i), flags,
                                  req->seg[i].gref, blkif->domid);
        }

        ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, map, nseg);
        BUG_ON(ret);

        for (i = 0; i < nseg; i++) {
                if (unlikely(map[i].status != 0)) {
                        DPRINTK("invalid buffer -- could not remap it\n");
                        map[i].handle = BLKBACK_INVALID_HANDLE;
                        ret |= 1;
                }

                pending_handle(pending_req, i) = map[i].handle;

                if (ret)
                        continue;

                set_phys_to_machine(__pa(vaddr(
                        pending_req, i)) >> PAGE_SHIFT,
                        FOREIGN_FRAME(map[i].dev_bus_addr >> PAGE_SHIFT));
                seg[i].buf  = map[i].dev_bus_addr |
                        (req->seg[i].first_sect << 9);
        }

        if (ret)
                goto fail_flush;

        if (vbd_translate(&preq, blkif, operation) != 0) {
                DPRINTK("access denied: %s of [%llu,%llu] on dev=%04x\n",
                        operation == READ ? "read" : "write",
                        preq.sector_number,
                        preq.sector_number + preq.nr_sects, preq.dev);
                goto fail_flush;
        }

        plug_queue(blkif, preq.bdev);
        atomic_set(&pending_req->pendcnt, 1);
        blkif_get(blkif);

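        /*
         * Pack the mapped segments into as few bios as possible.  A new bio
         * is started whenever there is none yet or bio_add_page() refuses to
         * take the next segment; the previous bio is submitted first, with
         * pendcnt raised so the completion callback only finishes the request
         * after the last bio.  pendcnt starts at 1 to cover the final
         * submit_bio() below.
         */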
        for (i = 0; i < nseg; i++) {
                if (((int)preq.sector_number|(int)seg[i].nsec) &
                    ((bdev_hardsect_size(preq.bdev) >> 9) - 1)) {
                        DPRINTK("Misaligned I/O request from domain %d",
                                blkif->domid);
                        goto fail_put_bio;
                }

                while ((bio == NULL) ||
                       (bio_add_page(bio,
                                     virt_to_page(vaddr(pending_req, i)),
                                     seg[i].nsec << 9,
                                     seg[i].buf & ~PAGE_MASK) == 0)) {
                        if (bio) {
                                atomic_inc(&pending_req->pendcnt);
                                submit_bio(operation, bio);
                        }

                        bio = bio_alloc(GFP_KERNEL, nseg-i);
                        if (unlikely(bio == NULL))
                                goto fail_put_bio;

                        bio->bi_bdev    = preq.bdev;
                        bio->bi_private = pending_req;
                        bio->bi_end_io  = end_block_io_op;
                        bio->bi_sector  = preq.sector_number;
                }

                preq.sector_number += seg[i].nsec;
        }

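        /*
         * A WRITE_BARRIER may carry no data segments at all; it still needs
         * an (empty) bio so the barrier reaches the backing device.
         */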
        if (!bio) {
                BUG_ON(operation != WRITE_BARRIER);
                bio = bio_alloc(GFP_KERNEL, 0);
                if (unlikely(bio == NULL))
                        goto fail_put_bio;

                bio->bi_bdev    = preq.bdev;
                bio->bi_private = pending_req;
                bio->bi_end_io  = end_block_io_op;
                bio->bi_sector  = -1;
        }

        submit_bio(operation, bio);

        if (operation == READ)
                blkif->st_rd_sect += preq.nr_sects;
        else if (operation == WRITE || operation == WRITE_BARRIER)
                blkif->st_wr_sect += preq.nr_sects;

        return;

 fail_flush:
        fast_flush_area(pending_req);
 fail_response:
        make_response(blkif, req->id, req->operation, BLKIF_RSP_ERROR);
        free_req(pending_req);
        msleep(1); /* back off a bit */
        return;

 fail_put_bio:
        __end_block_io_op(pending_req, -EINVAL);
        if (bio)
                bio_put(bio);
        unplug_queue(blkif);
        msleep(1); /* back off a bit */
        return;
}



/******************************************************************
 * MISCELLANEOUS SETUP / TEARDOWN / DEBUGGING
 */


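/*
 * Queue a response on the appropriate shared ring (under blk_ring_lock) and
 * notify the frontend over the event channel if it asked to be notified.
 * If further requests have already been posted, the dispatch thread is
 * kicked again instead of waiting for another interrupt.
 */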
static void make_response(blkif_t *blkif, u64 id,
                          unsigned short op, int st)
{
        blkif_response_t  resp;
        unsigned long     flags;
        blkif_back_rings_t *blk_rings = &blkif->blk_rings;
        int more_to_do = 0;
        int notify;

        resp.id        = id;
        resp.operation = op;
        resp.status    = st;

        spin_lock_irqsave(&blkif->blk_ring_lock, flags);
        /* Place on the response ring for the relevant domain. */
        switch (blkif->blk_protocol) {
        case BLKIF_PROTOCOL_NATIVE:
                memcpy(RING_GET_RESPONSE(&blk_rings->native, blk_rings->native.rsp_prod_pvt),
                       &resp, sizeof(resp));
                break;
        case BLKIF_PROTOCOL_X86_32:
                memcpy(RING_GET_RESPONSE(&blk_rings->x86_32, blk_rings->x86_32.rsp_prod_pvt),
                       &resp, sizeof(resp));
                break;
        case BLKIF_PROTOCOL_X86_64:
                memcpy(RING_GET_RESPONSE(&blk_rings->x86_64, blk_rings->x86_64.rsp_prod_pvt),
                       &resp, sizeof(resp));
                break;
        default:
                BUG();
        }
        blk_rings->common.rsp_prod_pvt++;
        RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&blk_rings->common, notify);
        if (blk_rings->common.rsp_prod_pvt == blk_rings->common.req_cons) {
                /*
                 * Tail check for pending requests. Allows frontend to avoid
                 * notifications if requests are already in flight (lower
                 * overheads and promotes batching).
                 */
                RING_FINAL_CHECK_FOR_REQUESTS(&blk_rings->common, more_to_do);

        } else if (RING_HAS_UNCONSUMED_REQUESTS(&blk_rings->common)) {
                more_to_do = 1;
        }

        spin_unlock_irqrestore(&blkif->blk_ring_lock, flags);

        if (more_to_do)
                blkif_notify_work(blkif);
        if (notify)
                notify_remote_via_irq(blkif->irq);
}

static int __init blkif_init(void)
{
        int i, mmap_pages;

        if (!is_running_on_xen())
                return -ENODEV;

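        /*
         * Reserve one page slot and one grant handle per possible segment of
         * every in-flight request, so the I/O path never has to allocate
         * pages of its own.
         */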
        mmap_pages = blkif_reqs * BLKIF_MAX_SEGMENTS_PER_REQUEST;

        pending_reqs          = kmalloc(sizeof(pending_reqs[0]) *
                                        blkif_reqs, GFP_KERNEL);
        pending_grant_handles = kmalloc(sizeof(pending_grant_handles[0]) *
                                        mmap_pages, GFP_KERNEL);
        pending_pages         = alloc_empty_pages_and_pagevec(mmap_pages);

        if (!pending_reqs || !pending_grant_handles || !pending_pages)
                goto out_of_memory;

        for (i = 0; i < mmap_pages; i++)
                pending_grant_handles[i] = BLKBACK_INVALID_HANDLE;

        blkif_interface_init();

        memset(pending_reqs, 0, sizeof(pending_reqs[0]) * blkif_reqs);
        INIT_LIST_HEAD(&pending_free);

        for (i = 0; i < blkif_reqs; i++)
                list_add_tail(&pending_reqs[i].free_list, &pending_free);

        blkif_xenbus_init();

        return 0;

 out_of_memory:
        kfree(pending_reqs);
        kfree(pending_grant_handles);
        free_empty_pages_and_pagevec(pending_pages, mmap_pages);
        printk(KERN_ERR "%s: out of memory\n", __FUNCTION__);
        return -ENOMEM;
}

module_init(blkif_init);

MODULE_LICENSE("Dual BSD/GPL");