rbd: separate initialization of osd data
[firefly-linux-kernel-4.4.55.git] / drivers / block / rbd.c
1 /*
2    rbd.c -- Export ceph rados objects as a Linux block device
3
4
5    based on drivers/block/osdblk.c:
6
7    Copyright 2009 Red Hat, Inc.
8
9    This program is free software; you can redistribute it and/or modify
10    it under the terms of the GNU General Public License as published by
11    the Free Software Foundation.
12
13    This program is distributed in the hope that it will be useful,
14    but WITHOUT ANY WARRANTY; without even the implied warranty of
15    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16    GNU General Public License for more details.
17
18    You should have received a copy of the GNU General Public License
19    along with this program; see the file COPYING.  If not, write to
20    the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
21
22
23
24    For usage instructions, please refer to:
25
26                  Documentation/ABI/testing/sysfs-bus-rbd
27
28  */
29
30 #include <linux/ceph/libceph.h>
31 #include <linux/ceph/osd_client.h>
32 #include <linux/ceph/mon_client.h>
33 #include <linux/ceph/decode.h>
34 #include <linux/parser.h>
35
36 #include <linux/kernel.h>
37 #include <linux/device.h>
38 #include <linux/module.h>
39 #include <linux/fs.h>
40 #include <linux/blkdev.h>
41
42 #include "rbd_types.h"
43
#define RBD_DEBUG       /* Activate rbd_assert() calls */

/*
 * The basic unit of block I/O is a sector.  It is interpreted in a
 * number of contexts in Linux (blk, bio, genhd), but the default is
 * universally 512 bytes.  These symbols are just slightly more
 * meaningful than the bare numbers they represent.
 */
#define SECTOR_SHIFT    9
#define SECTOR_SIZE     (1ULL << SECTOR_SHIFT)

#define RBD_DRV_NAME "rbd"
#define RBD_DRV_NAME_LONG "rbd (rados block device)"

#define RBD_MINORS_PER_MAJOR    256             /* max minors per blkdev */

/*
 * The longest snapshot name is whatever is left of NAME_MAX once the
 * prefix has been accounted for; the "- 1" removes the prefix's
 * terminating NUL from the sizeof.
 */
#define RBD_SNAP_DEV_NAME_PREFIX        "snap_"
#define RBD_MAX_SNAP_NAME_LEN   \
                        (NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1))

#define RBD_MAX_SNAP_COUNT      510     /* allows max snapc to fit in 4KB */

/* Snapshot name that denotes the image head rather than a real snapshot */
#define RBD_SNAP_HEAD_NAME      "-"

/* This allows a single page to hold an image name sent by OSD */
#define RBD_IMAGE_NAME_LEN_MAX  (PAGE_SIZE - sizeof (__le32) - 1)
#define RBD_IMAGE_ID_LEN_MAX    64

#define RBD_OBJ_PREFIX_LEN_MAX  64

/* Feature bits */

#define RBD_FEATURE_LAYERING      1

/*
 * Features supported by this (client software) implementation.
 * The mask is currently empty, so RBD_FEATURE_LAYERING is not
 * included in it.
 */

#define RBD_FEATURES_ALL          (0)

/*
 * An RBD device name will be "rbd#", where the "rbd" comes from
 * RBD_DRV_NAME above, and # is a unique integer identifier.
 * MAX_INT_FORMAT_WIDTH is used in ensuring DEV_NAME_LEN is big
 * enough to hold all possible device names.
 *
 * MAX_INT_FORMAT_WIDTH allows 2.5 characters per byte of an int,
 * plus one more (presumably for a sign).
 */
#define DEV_NAME_LEN            32
#define MAX_INT_FORMAT_WIDTH    ((5 * sizeof (int)) / 2 + 1)
90
/*
 * block device image metadata (in-memory version)
 *
 * rbd_header_from_disk() fills one of these in from an on-disk
 * header, and rbd_header_free() releases what it points to.
 */
struct rbd_image_header {
        /* These fields never change for a given rbd image */
        char *object_prefix;    /* leading part of each segment object name */
        u64 features;
        __u8 obj_order;         /* log2 of the segment (object) size in bytes */
        __u8 crypt_type;
        __u8 comp_type;

        /* The remaining fields need to be updated occasionally */
        u64 image_size;
        struct ceph_snap_context *snapc;
        char *snap_names;       /* copy of the snapshot name data */
        u64 *snap_sizes;        /* image size recorded for each snapshot */

        u64 obj_version;
};
110
111 /*
112  * An rbd image specification.
113  *
114  * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely
115  * identify an image.  Each rbd_dev structure includes a pointer to
116  * an rbd_spec structure that encapsulates this identity.
117  *
118  * Each of the id's in an rbd_spec has an associated name.  For a
119  * user-mapped image, the names are supplied and the id's associated
120  * with them are looked up.  For a layered image, a parent image is
121  * defined by the tuple, and the names are looked up.
122  *
123  * An rbd_dev structure contains a parent_spec pointer which is
124  * non-null if the image it represents is a child in a layered
125  * image.  This pointer will refer to the rbd_spec structure used
126  * by the parent rbd_dev for its own identity (i.e., the structure
127  * is shared between the parent and child).
128  *
129  * Since these structures are populated once, during the discovery
130  * phase of image construction, they are effectively immutable so
131  * we make no effort to synchronize access to them.
132  *
133  * Note that code herein does not assume the image name is known (it
134  * could be a null pointer).
135  */
struct rbd_spec {
        u64             pool_id;
        char            *pool_name;

        char            *image_id;
        char            *image_name;    /* may be a null pointer */

        u64             snap_id;        /* CEPH_NOSNAP when the head is mapped */
        char            *snap_name;     /* RBD_SNAP_HEAD_NAME for the head */

        struct kref     kref;           /* reference count */
};
148
/*
 * an instance of the client.  multiple devices may share an rbd client.
 */
struct rbd_client {
        struct ceph_client      *client;        /* underlying ceph client */
        struct kref             kref;           /* dropped via rbd_put_client() */
        struct list_head        node;           /* entry in rbd_client_list */
};
157
struct rbd_img_request;
/* Type of the callback stored in rbd_img_request.callback */
typedef void (*rbd_img_callback_t)(struct rbd_img_request *);

/* NOTE(review): presumably an invalid value for rbd_obj_request.which */
#define BAD_WHICH       U32_MAX         /* Good which or bad which, which? */

struct rbd_obj_request;
/* Type of the callback stored in rbd_obj_request.callback */
typedef void (*rbd_obj_callback_t)(struct rbd_obj_request *);

/* Value of rbd_obj_request.type; names the kind of data a request carries */
enum obj_request_type {
        OBJ_REQUEST_NODATA, OBJ_REQUEST_BIO, OBJ_REQUEST_PAGES
};
169
/*
 * A request against a single named object.  It may be linked into an
 * image request through img_request/links.
 */
struct rbd_obj_request {
        const char              *object_name;
        u64                     offset;         /* object start byte */
        u64                     length;         /* bytes from offset */

        struct rbd_img_request  *img_request;
        struct list_head        links;          /* img_request->obj_requests */
        u32                     which;          /* posn image request list */

        /*
         * NOTE(review): type presumably selects which member of the
         * anonymous union below is meaningful - confirm against users.
         */
        enum obj_request_type   type;
        union {
                struct bio      *bio_list;
                struct {
                        struct page     **pages;
                        u32             page_count;
                };
        };

        struct ceph_osd_request *osd_req;

        u64                     xferred;        /* bytes transferred */
        u64                     version;
        int                     result;
        atomic_t                done;

        rbd_obj_callback_t      callback;
        struct completion       completion;

        struct kref             kref;           /* reference count */
};
200
/*
 * A request against a byte range of an image.  It owns a list of
 * rbd_obj_request structures (obj_requests), linked through their
 * "links" member.
 */
struct rbd_img_request {
        struct request          *rq;
        struct rbd_device       *rbd_dev;
        u64                     offset; /* starting image byte offset */
        u64                     length; /* byte count from offset */
        bool                    write_request;  /* false for read */
        /* Which member is used depends on write_request */
        union {
                struct ceph_snap_context *snapc;        /* for writes */
                u64             snap_id;                /* for reads */
        };
        spinlock_t              completion_lock;/* protects next_completion */
        u32                     next_completion;
        rbd_img_callback_t      callback;

        u32                     obj_request_count;
        struct list_head        obj_requests;   /* rbd_obj_request structs */

        struct kref             kref;           /* reference count */
};
220
/*
 * Iterators over the object requests of an image request.
 *
 * for_each_obj_request() walks the whole list, front to back.
 * for_each_obj_request_from() continues forward from the current oreq.
 * for_each_obj_request_safe() tolerates removal of the current entry;
 * note that it walks the list in REVERSE order (back to front).
 */
#define for_each_obj_request(ireq, oreq) \
        list_for_each_entry(oreq, &(ireq)->obj_requests, links)
#define for_each_obj_request_from(ireq, oreq) \
        list_for_each_entry_from(oreq, &(ireq)->obj_requests, links)
#define for_each_obj_request_safe(ireq, oreq, n) \
        list_for_each_entry_safe_reverse(oreq, n, &(ireq)->obj_requests, links)
227
/* One snapshot of an image, kept on its rbd_device's "snaps" list */
struct rbd_snap {
        struct  device          dev;
        const char              *name;
        u64                     size;           /* copied to mapping.size when mapped */
        struct list_head        node;           /* entry in rbd_dev->snaps */
        u64                     id;
        u64                     features;       /* copied to mapping.features when mapped */
};
236
/*
 * Properties of what a device currently maps: either the image head
 * or one of its snapshots (see rbd_dev_set_mapping()).
 */
struct rbd_mapping {
        u64                     size;
        u64                     features;
        bool                    read_only;      /* writable opens fail with -EROFS */
};
242
/*
 * a single device
 */
struct rbd_device {
        int                     dev_id;         /* blkdev unique id */

        int                     major;          /* blkdev assigned major */
        struct gendisk          *disk;          /* blkdev's gendisk and rq */

        u32                     image_format;   /* Either 1 or 2 */
        struct rbd_client       *rbd_client;

        char                    name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */

        spinlock_t              lock;           /* queue, flags, open_count */

        struct rbd_image_header header;
        unsigned long           flags;          /* possibly lock protected */
        struct rbd_spec         *spec;          /* identity of the mapped image */

        char                    *header_name;

        struct ceph_file_layout layout;

        struct ceph_osd_event   *watch_event;
        struct rbd_obj_request  *watch_request;

        struct rbd_spec         *parent_spec;   /* non-null for a layered child */
        u64                     parent_overlap;

        /* protects updating the header */
        struct rw_semaphore     header_rwsem;

        struct rbd_mapping      mapping;

        struct list_head        node;

        /* list of snapshots */
        struct list_head        snaps;

        /* sysfs related */
        struct device           dev;
        unsigned long           open_count;     /* protected by lock */
};
287
/*
 * Flag bits for rbd_dev->flags.  If atomicity is required,
 * rbd_dev->lock is used to protect access.
 *
 * Currently, only the "removing" flag (which is coupled with the
 * "open_count" field) requires atomic access.
 *
 * The enumerators are bit numbers, used with set_bit() and
 * test_bit() on rbd_dev->flags; they are not masks.
 */
enum rbd_dev_flags {
        RBD_DEV_FLAG_EXISTS,    /* mapped snapshot has not been deleted */
        RBD_DEV_FLAG_REMOVING,  /* this mapping is being removed */
};
299
static DEFINE_MUTEX(ctl_mutex);   /* Serialize open/close/setup/teardown */

static LIST_HEAD(rbd_dev_list);    /* devices */
static DEFINE_SPINLOCK(rbd_dev_list_lock);

/* rbd_client_list is only walked or modified under rbd_client_list_lock */
static LIST_HEAD(rbd_client_list);              /* clients */
static DEFINE_SPINLOCK(rbd_client_list_lock);

static int rbd_dev_snaps_update(struct rbd_device *rbd_dev);
static int rbd_dev_snaps_register(struct rbd_device *rbd_dev);

static void rbd_dev_release(struct device *dev);
static void rbd_remove_snap_dev(struct rbd_snap *snap);

static ssize_t rbd_add(struct bus_type *bus, const char *buf,
                       size_t count);
static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
                          size_t count);

/*
 * Bus attributes "add" and "remove".  Both are owner-writable only
 * (S_IWUSR) and have no show method; writes are handled by rbd_add()
 * and rbd_remove() respectively.
 */
static struct bus_attribute rbd_bus_attrs[] = {
        __ATTR(add, S_IWUSR, NULL, rbd_add),
        __ATTR(remove, S_IWUSR, NULL, rbd_remove),
        __ATTR_NULL
};

static struct bus_type rbd_bus_type = {
        .name           = "rbd",
        .bus_attrs      = rbd_bus_attrs,
};
329
/*
 * Release method for rbd_root_dev.  It is intentionally empty:
 * rbd_root_dev is a statically allocated object, so there is
 * nothing to free here.
 */
static void rbd_root_dev_release(struct device *dev)
{
}

/* The statically allocated root device, named "rbd" */
static struct device rbd_root_dev = {
        .init_name =    "rbd",
        .release =      rbd_root_dev_release,
};
338
339 static __printf(2, 3)
340 void rbd_warn(struct rbd_device *rbd_dev, const char *fmt, ...)
341 {
342         struct va_format vaf;
343         va_list args;
344
345         va_start(args, fmt);
346         vaf.fmt = fmt;
347         vaf.va = &args;
348
349         if (!rbd_dev)
350                 printk(KERN_WARNING "%s: %pV\n", RBD_DRV_NAME, &vaf);
351         else if (rbd_dev->disk)
352                 printk(KERN_WARNING "%s: %s: %pV\n",
353                         RBD_DRV_NAME, rbd_dev->disk->disk_name, &vaf);
354         else if (rbd_dev->spec && rbd_dev->spec->image_name)
355                 printk(KERN_WARNING "%s: image %s: %pV\n",
356                         RBD_DRV_NAME, rbd_dev->spec->image_name, &vaf);
357         else if (rbd_dev->spec && rbd_dev->spec->image_id)
358                 printk(KERN_WARNING "%s: id %s: %pV\n",
359                         RBD_DRV_NAME, rbd_dev->spec->image_id, &vaf);
360         else    /* punt */
361                 printk(KERN_WARNING "%s: rbd_dev %p: %pV\n",
362                         RBD_DRV_NAME, rbd_dev, &vaf);
363         va_end(args);
364 }
365
#ifdef RBD_DEBUG
/*
 * rbd_assert(expr): if expr is false, log the function name, line
 * number and the text of the failed expression, then BUG().
 *
 * The body is wrapped in do { } while (0) so the macro behaves as a
 * single statement.  A bare "if (...) { }" expansion would capture
 * a following "else" when used as the unbraced body of an if
 * statement, and would leave a stray empty statement behind the
 * trailing semicolon.  Every use must be followed by a semicolon.
 */
#define rbd_assert(expr)                                                \
        do {                                                            \
                if (unlikely(!(expr))) {                                \
                        printk(KERN_ERR "\nAssertion failure in %s() "  \
                                                "at line %d:\n\n"       \
                                        "\trbd_assert(%s);\n\n",        \
                                        __func__, __LINE__, #expr);     \
                        BUG();                                          \
                }                                                       \
        } while (0)
#else /* !RBD_DEBUG */
/* Assertions compiled out; expr is not evaluated */
#  define rbd_assert(expr)      ((void) 0)
#endif /* !RBD_DEBUG */
378
/* Forward declarations; the definitions are not in this part of the file */
static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver);
static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver);
381
382 static int rbd_open(struct block_device *bdev, fmode_t mode)
383 {
384         struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
385         bool removing = false;
386
387         if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only)
388                 return -EROFS;
389
390         spin_lock_irq(&rbd_dev->lock);
391         if (test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags))
392                 removing = true;
393         else
394                 rbd_dev->open_count++;
395         spin_unlock_irq(&rbd_dev->lock);
396         if (removing)
397                 return -ENOENT;
398
399         mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
400         (void) get_device(&rbd_dev->dev);
401         set_device_ro(bdev, rbd_dev->mapping.read_only);
402         mutex_unlock(&ctl_mutex);
403
404         return 0;
405 }
406
407 static int rbd_release(struct gendisk *disk, fmode_t mode)
408 {
409         struct rbd_device *rbd_dev = disk->private_data;
410         unsigned long open_count_before;
411
412         spin_lock_irq(&rbd_dev->lock);
413         open_count_before = rbd_dev->open_count--;
414         spin_unlock_irq(&rbd_dev->lock);
415         rbd_assert(open_count_before > 0);
416
417         mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
418         put_device(&rbd_dev->dev);
419         mutex_unlock(&ctl_mutex);
420
421         return 0;
422 }
423
/* Block device operations: only open and release are provided */
static const struct block_device_operations rbd_bd_ops = {
        .owner                  = THIS_MODULE,
        .open                   = rbd_open,
        .release                = rbd_release,
};
429
430 /*
431  * Initialize an rbd client instance.
432  * We own *ceph_opts.
433  */
434 static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
435 {
436         struct rbd_client *rbdc;
437         int ret = -ENOMEM;
438
439         dout("%s:\n", __func__);
440         rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
441         if (!rbdc)
442                 goto out_opt;
443
444         kref_init(&rbdc->kref);
445         INIT_LIST_HEAD(&rbdc->node);
446
447         mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
448
449         rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0);
450         if (IS_ERR(rbdc->client))
451                 goto out_mutex;
452         ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */
453
454         ret = ceph_open_session(rbdc->client);
455         if (ret < 0)
456                 goto out_err;
457
458         spin_lock(&rbd_client_list_lock);
459         list_add_tail(&rbdc->node, &rbd_client_list);
460         spin_unlock(&rbd_client_list_lock);
461
462         mutex_unlock(&ctl_mutex);
463         dout("%s: rbdc %p\n", __func__, rbdc);
464
465         return rbdc;
466
467 out_err:
468         ceph_destroy_client(rbdc->client);
469 out_mutex:
470         mutex_unlock(&ctl_mutex);
471         kfree(rbdc);
472 out_opt:
473         if (ceph_opts)
474                 ceph_destroy_options(ceph_opts);
475         dout("%s: error %d\n", __func__, ret);
476
477         return ERR_PTR(ret);
478 }
479
480 /*
481  * Find a ceph client with specific addr and configuration.  If
482  * found, bump its reference count.
483  */
484 static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
485 {
486         struct rbd_client *client_node;
487         bool found = false;
488
489         if (ceph_opts->flags & CEPH_OPT_NOSHARE)
490                 return NULL;
491
492         spin_lock(&rbd_client_list_lock);
493         list_for_each_entry(client_node, &rbd_client_list, node) {
494                 if (!ceph_compare_options(ceph_opts, client_node->client)) {
495                         kref_get(&client_node->kref);
496                         found = true;
497                         break;
498                 }
499         }
500         spin_unlock(&rbd_client_list_lock);
501
502         return found ? client_node : NULL;
503 }
504
/*
 * mount options
 *
 * The Opt_last_* enumerators are range markers: tokens are
 * classified as int, string or Boolean by comparing them against
 * these markers in parse_rbd_opts_token().  There are currently no
 * int or string options.
 */
enum {
        Opt_last_int,
        /* int args above */
        Opt_last_string,
        /* string args above */
        Opt_read_only,
        Opt_read_write,
        /* Boolean args above */
        Opt_last_bool,
};

/* Option spellings accepted by match_token(), terminated by {-1, NULL} */
static match_table_t rbd_opts_tokens = {
        /* int args above */
        /* string args above */
        {Opt_read_only, "read_only"},
        {Opt_read_only, "ro"},          /* Alternate spelling */
        {Opt_read_write, "read_write"},
        {Opt_read_write, "rw"},         /* Alternate spelling */
        /* Boolean args above */
        {-1, NULL}
};

/* Parsed rbd-specific options; filled in by parse_rbd_opts_token() */
struct rbd_options {
        bool    read_only;
};

#define RBD_READ_ONLY_DEFAULT   false
535
536 static int parse_rbd_opts_token(char *c, void *private)
537 {
538         struct rbd_options *rbd_opts = private;
539         substring_t argstr[MAX_OPT_ARGS];
540         int token, intval, ret;
541
542         token = match_token(c, rbd_opts_tokens, argstr);
543         if (token < 0)
544                 return -EINVAL;
545
546         if (token < Opt_last_int) {
547                 ret = match_int(&argstr[0], &intval);
548                 if (ret < 0) {
549                         pr_err("bad mount option arg (not int) "
550                                "at '%s'\n", c);
551                         return ret;
552                 }
553                 dout("got int token %d val %d\n", token, intval);
554         } else if (token > Opt_last_int && token < Opt_last_string) {
555                 dout("got string token %d val %s\n", token,
556                      argstr[0].from);
557         } else if (token > Opt_last_string && token < Opt_last_bool) {
558                 dout("got Boolean token %d\n", token);
559         } else {
560                 dout("got token %d\n", token);
561         }
562
563         switch (token) {
564         case Opt_read_only:
565                 rbd_opts->read_only = true;
566                 break;
567         case Opt_read_write:
568                 rbd_opts->read_only = false;
569                 break;
570         default:
571                 rbd_assert(false);
572                 break;
573         }
574         return 0;
575 }
576
/*
 * Get a ceph client matching the given options, creating one if no
 * existing client can be shared.
 *
 * ceph_opts is consumed either way: it is destroyed here when an
 * existing client is reused, and handed to rbd_client_create()
 * otherwise.  Returns whatever rbd_client_find() found or
 * rbd_client_create() returned.
 */
static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts)
{
        struct rbd_client *rbdc = rbd_client_find(ceph_opts);

        if (!rbdc)
                return rbd_client_create(ceph_opts);

        /* using an existing client */
        ceph_destroy_options(ceph_opts);

        return rbdc;
}
593
/*
 * Destroy ceph client
 *
 * This is the kref release function handed to kref_put() by
 * rbd_put_client().  It acquires rbd_client_list_lock itself in
 * order to unlink the client, so the caller must NOT hold that
 * lock.
 */
static void rbd_client_release(struct kref *kref)
{
        struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);

        dout("%s: rbdc %p\n", __func__, rbdc);
        /* Unlink first so rbd_client_find() can no longer see it */
        spin_lock(&rbd_client_list_lock);
        list_del(&rbdc->node);
        spin_unlock(&rbd_client_list_lock);

        ceph_destroy_client(rbdc->client);
        kfree(rbdc);
}
611
612 /*
613  * Drop reference to ceph client node. If it's not referenced anymore, release
614  * it.
615  */
616 static void rbd_put_client(struct rbd_client *rbdc)
617 {
618         if (rbdc)
619                 kref_put(&rbdc->kref, rbd_client_release);
620 }
621
622 static bool rbd_image_format_valid(u32 image_format)
623 {
624         return image_format == 1 || image_format == 2;
625 }
626
627 static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
628 {
629         size_t size;
630         u32 snap_count;
631
632         /* The header has to start with the magic rbd header text */
633         if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)))
634                 return false;
635
636         /* The bio layer requires at least sector-sized I/O */
637
638         if (ondisk->options.order < SECTOR_SHIFT)
639                 return false;
640
641         /* If we use u64 in a few spots we may be able to loosen this */
642
643         if (ondisk->options.order > 8 * sizeof (int) - 1)
644                 return false;
645
646         /*
647          * The size of a snapshot header has to fit in a size_t, and
648          * that limits the number of snapshots.
649          */
650         snap_count = le32_to_cpu(ondisk->snap_count);
651         size = SIZE_MAX - sizeof (struct ceph_snap_context);
652         if (snap_count > size / sizeof (__le64))
653                 return false;
654
655         /*
656          * Not only that, but the size of the entire the snapshot
657          * header must also be representable in a size_t.
658          */
659         size -= snap_count * sizeof (__le64);
660         if ((u64) size < le64_to_cpu(ondisk->snap_names_len))
661                 return false;
662
663         return true;
664 }
665
666 /*
667  * Create a new header structure, translate header format from the on-disk
668  * header.
669  */
670 static int rbd_header_from_disk(struct rbd_image_header *header,
671                                  struct rbd_image_header_ondisk *ondisk)
672 {
673         u32 snap_count;
674         size_t len;
675         size_t size;
676         u32 i;
677
678         memset(header, 0, sizeof (*header));
679
680         snap_count = le32_to_cpu(ondisk->snap_count);
681
682         len = strnlen(ondisk->object_prefix, sizeof (ondisk->object_prefix));
683         header->object_prefix = kmalloc(len + 1, GFP_KERNEL);
684         if (!header->object_prefix)
685                 return -ENOMEM;
686         memcpy(header->object_prefix, ondisk->object_prefix, len);
687         header->object_prefix[len] = '\0';
688
689         if (snap_count) {
690                 u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len);
691
692                 /* Save a copy of the snapshot names */
693
694                 if (snap_names_len > (u64) SIZE_MAX)
695                         return -EIO;
696                 header->snap_names = kmalloc(snap_names_len, GFP_KERNEL);
697                 if (!header->snap_names)
698                         goto out_err;
699                 /*
700                  * Note that rbd_dev_v1_header_read() guarantees
701                  * the ondisk buffer we're working with has
702                  * snap_names_len bytes beyond the end of the
703                  * snapshot id array, this memcpy() is safe.
704                  */
705                 memcpy(header->snap_names, &ondisk->snaps[snap_count],
706                         snap_names_len);
707
708                 /* Record each snapshot's size */
709
710                 size = snap_count * sizeof (*header->snap_sizes);
711                 header->snap_sizes = kmalloc(size, GFP_KERNEL);
712                 if (!header->snap_sizes)
713                         goto out_err;
714                 for (i = 0; i < snap_count; i++)
715                         header->snap_sizes[i] =
716                                 le64_to_cpu(ondisk->snaps[i].image_size);
717         } else {
718                 WARN_ON(ondisk->snap_names_len);
719                 header->snap_names = NULL;
720                 header->snap_sizes = NULL;
721         }
722
723         header->features = 0;   /* No features support in v1 images */
724         header->obj_order = ondisk->options.order;
725         header->crypt_type = ondisk->options.crypt_type;
726         header->comp_type = ondisk->options.comp_type;
727
728         /* Allocate and fill in the snapshot context */
729
730         header->image_size = le64_to_cpu(ondisk->image_size);
731         size = sizeof (struct ceph_snap_context);
732         size += snap_count * sizeof (header->snapc->snaps[0]);
733         header->snapc = kzalloc(size, GFP_KERNEL);
734         if (!header->snapc)
735                 goto out_err;
736
737         atomic_set(&header->snapc->nref, 1);
738         header->snapc->seq = le64_to_cpu(ondisk->snap_seq);
739         header->snapc->num_snaps = snap_count;
740         for (i = 0; i < snap_count; i++)
741                 header->snapc->snaps[i] =
742                         le64_to_cpu(ondisk->snaps[i].id);
743
744         return 0;
745
746 out_err:
747         kfree(header->snap_sizes);
748         header->snap_sizes = NULL;
749         kfree(header->snap_names);
750         header->snap_names = NULL;
751         kfree(header->object_prefix);
752         header->object_prefix = NULL;
753
754         return -ENOMEM;
755 }
756
757 static const char *rbd_snap_name(struct rbd_device *rbd_dev, u64 snap_id)
758 {
759         struct rbd_snap *snap;
760
761         if (snap_id == CEPH_NOSNAP)
762                 return RBD_SNAP_HEAD_NAME;
763
764         list_for_each_entry(snap, &rbd_dev->snaps, node)
765                 if (snap_id == snap->id)
766                         return snap->name;
767
768         return NULL;
769 }
770
771 static int snap_by_name(struct rbd_device *rbd_dev, const char *snap_name)
772 {
773
774         struct rbd_snap *snap;
775
776         list_for_each_entry(snap, &rbd_dev->snaps, node) {
777                 if (!strcmp(snap_name, snap->name)) {
778                         rbd_dev->spec->snap_id = snap->id;
779                         rbd_dev->mapping.size = snap->size;
780                         rbd_dev->mapping.features = snap->features;
781
782                         return 0;
783                 }
784         }
785
786         return -ENOENT;
787 }
788
789 static int rbd_dev_set_mapping(struct rbd_device *rbd_dev)
790 {
791         int ret;
792
793         if (!memcmp(rbd_dev->spec->snap_name, RBD_SNAP_HEAD_NAME,
794                     sizeof (RBD_SNAP_HEAD_NAME))) {
795                 rbd_dev->spec->snap_id = CEPH_NOSNAP;
796                 rbd_dev->mapping.size = rbd_dev->header.image_size;
797                 rbd_dev->mapping.features = rbd_dev->header.features;
798                 ret = 0;
799         } else {
800                 ret = snap_by_name(rbd_dev, rbd_dev->spec->snap_name);
801                 if (ret < 0)
802                         goto done;
803                 rbd_dev->mapping.read_only = true;
804         }
805         set_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
806
807 done:
808         return ret;
809 }
810
811 static void rbd_header_free(struct rbd_image_header *header)
812 {
813         kfree(header->object_prefix);
814         header->object_prefix = NULL;
815         kfree(header->snap_sizes);
816         header->snap_sizes = NULL;
817         kfree(header->snap_names);
818         header->snap_names = NULL;
819         ceph_put_snap_context(header->snapc);
820         header->snapc = NULL;
821 }
822
823 static const char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset)
824 {
825         char *name;
826         u64 segment;
827         int ret;
828
829         name = kmalloc(MAX_OBJ_NAME_SIZE + 1, GFP_NOIO);
830         if (!name)
831                 return NULL;
832         segment = offset >> rbd_dev->header.obj_order;
833         ret = snprintf(name, MAX_OBJ_NAME_SIZE + 1, "%s.%012llx",
834                         rbd_dev->header.object_prefix, segment);
835         if (ret < 0 || ret > MAX_OBJ_NAME_SIZE) {
836                 pr_err("error formatting segment name for #%llu (%d)\n",
837                         segment, ret);
838                 kfree(name);
839                 name = NULL;
840         }
841
842         return name;
843 }
844
845 static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset)
846 {
847         u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
848
849         return offset & (segment_size - 1);
850 }
851
852 static u64 rbd_segment_length(struct rbd_device *rbd_dev,
853                                 u64 offset, u64 length)
854 {
855         u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
856
857         offset &= segment_size - 1;
858
859         rbd_assert(length <= U64_MAX - offset);
860         if (offset + length > segment_size)
861                 length = segment_size - offset;
862
863         return length;
864 }
865
866 /*
867  * returns the size of an object in the image
868  */
869 static u64 rbd_obj_bytes(struct rbd_image_header *header)
870 {
871         return 1 << header->obj_order;
872 }
873
874 /*
875  * bio helpers
876  */
877
878 static void bio_chain_put(struct bio *chain)
879 {
880         struct bio *tmp;
881
882         while (chain) {
883                 tmp = chain;
884                 chain = chain->bi_next;
885                 bio_put(tmp);
886         }
887 }
888
889 /*
890  * zeros a bio chain, starting at specific offset
891  */
892 static void zero_bio_chain(struct bio *chain, int start_ofs)
893 {
894         struct bio_vec *bv;
895         unsigned long flags;
896         void *buf;
897         int i;
898         int pos = 0;
899
900         while (chain) {
901                 bio_for_each_segment(bv, chain, i) {
902                         if (pos + bv->bv_len > start_ofs) {
903                                 int remainder = max(start_ofs - pos, 0);
904                                 buf = bvec_kmap_irq(bv, &flags);
905                                 memset(buf + remainder, 0,
906                                        bv->bv_len - remainder);
907                                 bvec_kunmap_irq(buf, &flags);
908                         }
909                         pos += bv->bv_len;
910                 }
911
912                 chain = chain->bi_next;
913         }
914 }
915
916 /*
917  * Clone a portion of a bio, starting at the given byte offset
918  * and continuing for the number of bytes indicated.
919  */
920 static struct bio *bio_clone_range(struct bio *bio_src,
921                                         unsigned int offset,
922                                         unsigned int len,
923                                         gfp_t gfpmask)
924 {
925         struct bio_vec *bv;
926         unsigned int resid;
927         unsigned short idx;
928         unsigned int voff;
929         unsigned short end_idx;
930         unsigned short vcnt;
931         struct bio *bio;
932
933         /* Handle the easy case for the caller */
934
935         if (!offset && len == bio_src->bi_size)
936                 return bio_clone(bio_src, gfpmask);
937
938         if (WARN_ON_ONCE(!len))
939                 return NULL;
940         if (WARN_ON_ONCE(len > bio_src->bi_size))
941                 return NULL;
942         if (WARN_ON_ONCE(offset > bio_src->bi_size - len))
943                 return NULL;
944
945         /* Find first affected segment... */
946
947         resid = offset;
948         __bio_for_each_segment(bv, bio_src, idx, 0) {
949                 if (resid < bv->bv_len)
950                         break;
951                 resid -= bv->bv_len;
952         }
953         voff = resid;
954
955         /* ...and the last affected segment */
956
957         resid += len;
958         __bio_for_each_segment(bv, bio_src, end_idx, idx) {
959                 if (resid <= bv->bv_len)
960                         break;
961                 resid -= bv->bv_len;
962         }
963         vcnt = end_idx - idx + 1;
964
965         /* Build the clone */
966
967         bio = bio_alloc(gfpmask, (unsigned int) vcnt);
968         if (!bio)
969                 return NULL;    /* ENOMEM */
970
971         bio->bi_bdev = bio_src->bi_bdev;
972         bio->bi_sector = bio_src->bi_sector + (offset >> SECTOR_SHIFT);
973         bio->bi_rw = bio_src->bi_rw;
974         bio->bi_flags |= 1 << BIO_CLONED;
975
976         /*
977          * Copy over our part of the bio_vec, then update the first
978          * and last (or only) entries.
979          */
980         memcpy(&bio->bi_io_vec[0], &bio_src->bi_io_vec[idx],
981                         vcnt * sizeof (struct bio_vec));
982         bio->bi_io_vec[0].bv_offset += voff;
983         if (vcnt > 1) {
984                 bio->bi_io_vec[0].bv_len -= voff;
985                 bio->bi_io_vec[vcnt - 1].bv_len = resid;
986         } else {
987                 bio->bi_io_vec[0].bv_len = len;
988         }
989
990         bio->bi_vcnt = vcnt;
991         bio->bi_size = len;
992         bio->bi_idx = 0;
993
994         return bio;
995 }
996
997 /*
998  * Clone a portion of a bio chain, starting at the given byte offset
999  * into the first bio in the source chain and continuing for the
1000  * number of bytes indicated.  The result is another bio chain of
1001  * exactly the given length, or a null pointer on error.
1002  *
1003  * The bio_src and offset parameters are both in-out.  On entry they
1004  * refer to the first source bio and the offset into that bio where
1005  * the start of data to be cloned is located.
1006  *
1007  * On return, bio_src is updated to refer to the bio in the source
1008  * chain that contains first un-cloned byte, and *offset will
1009  * contain the offset of that byte within that bio.
1010  */
1011 static struct bio *bio_chain_clone_range(struct bio **bio_src,
1012                                         unsigned int *offset,
1013                                         unsigned int len,
1014                                         gfp_t gfpmask)
1015 {
1016         struct bio *bi = *bio_src;
1017         unsigned int off = *offset;
1018         struct bio *chain = NULL;
1019         struct bio **end;
1020
1021         /* Build up a chain of clone bios up to the limit */
1022
1023         if (!bi || off >= bi->bi_size || !len)
1024                 return NULL;            /* Nothing to clone */
1025
1026         end = &chain;
1027         while (len) {
1028                 unsigned int bi_size;
1029                 struct bio *bio;
1030
1031                 if (!bi) {
1032                         rbd_warn(NULL, "bio_chain exhausted with %u left", len);
1033                         goto out_err;   /* EINVAL; ran out of bio's */
1034                 }
1035                 bi_size = min_t(unsigned int, bi->bi_size - off, len);
1036                 bio = bio_clone_range(bi, off, bi_size, gfpmask);
1037                 if (!bio)
1038                         goto out_err;   /* ENOMEM */
1039
1040                 *end = bio;
1041                 end = &bio->bi_next;
1042
1043                 off += bi_size;
1044                 if (off == bi->bi_size) {
1045                         bi = bi->bi_next;
1046                         off = 0;
1047                 }
1048                 len -= bi_size;
1049         }
1050         *bio_src = bi;
1051         *offset = off;
1052
1053         return chain;
1054 out_err:
1055         bio_chain_put(chain);
1056
1057         return NULL;
1058 }
1059
1060 static void rbd_obj_request_get(struct rbd_obj_request *obj_request)
1061 {
1062         dout("%s: obj %p (was %d)\n", __func__, obj_request,
1063                 atomic_read(&obj_request->kref.refcount));
1064         kref_get(&obj_request->kref);
1065 }
1066
1067 static void rbd_obj_request_destroy(struct kref *kref);
1068 static void rbd_obj_request_put(struct rbd_obj_request *obj_request)
1069 {
1070         rbd_assert(obj_request != NULL);
1071         dout("%s: obj %p (was %d)\n", __func__, obj_request,
1072                 atomic_read(&obj_request->kref.refcount));
1073         kref_put(&obj_request->kref, rbd_obj_request_destroy);
1074 }
1075
1076 static void rbd_img_request_get(struct rbd_img_request *img_request)
1077 {
1078         dout("%s: img %p (was %d)\n", __func__, img_request,
1079                 atomic_read(&img_request->kref.refcount));
1080         kref_get(&img_request->kref);
1081 }
1082
1083 static void rbd_img_request_destroy(struct kref *kref);
1084 static void rbd_img_request_put(struct rbd_img_request *img_request)
1085 {
1086         rbd_assert(img_request != NULL);
1087         dout("%s: img %p (was %d)\n", __func__, img_request,
1088                 atomic_read(&img_request->kref.refcount));
1089         kref_put(&img_request->kref, rbd_img_request_destroy);
1090 }
1091
1092 static inline void rbd_img_obj_request_add(struct rbd_img_request *img_request,
1093                                         struct rbd_obj_request *obj_request)
1094 {
1095         rbd_assert(obj_request->img_request == NULL);
1096
1097         rbd_obj_request_get(obj_request);
1098         obj_request->img_request = img_request;
1099         obj_request->which = img_request->obj_request_count;
1100         rbd_assert(obj_request->which != BAD_WHICH);
1101         img_request->obj_request_count++;
1102         list_add_tail(&obj_request->links, &img_request->obj_requests);
1103         dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request,
1104                 obj_request->which);
1105 }
1106
1107 static inline void rbd_img_obj_request_del(struct rbd_img_request *img_request,
1108                                         struct rbd_obj_request *obj_request)
1109 {
1110         rbd_assert(obj_request->which != BAD_WHICH);
1111
1112         dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request,
1113                 obj_request->which);
1114         list_del(&obj_request->links);
1115         rbd_assert(img_request->obj_request_count > 0);
1116         img_request->obj_request_count--;
1117         rbd_assert(obj_request->which == img_request->obj_request_count);
1118         obj_request->which = BAD_WHICH;
1119         rbd_assert(obj_request->img_request == img_request);
1120         obj_request->img_request = NULL;
1121         obj_request->callback = NULL;
1122         rbd_obj_request_put(obj_request);
1123 }
1124
1125 static bool obj_request_type_valid(enum obj_request_type type)
1126 {
1127         switch (type) {
1128         case OBJ_REQUEST_NODATA:
1129         case OBJ_REQUEST_BIO:
1130         case OBJ_REQUEST_PAGES:
1131                 return true;
1132         default:
1133                 return false;
1134         }
1135 }
1136
1137 static int rbd_obj_request_submit(struct ceph_osd_client *osdc,
1138                                 struct rbd_obj_request *obj_request)
1139 {
1140         dout("%s: osdc %p obj %p\n", __func__, osdc, obj_request);
1141
1142         return ceph_osdc_start_request(osdc, obj_request->osd_req, false);
1143 }
1144
1145 static void rbd_img_request_complete(struct rbd_img_request *img_request)
1146 {
1147         dout("%s: img %p\n", __func__, img_request);
1148         if (img_request->callback)
1149                 img_request->callback(img_request);
1150         else
1151                 rbd_img_request_put(img_request);
1152 }
1153
1154 /* Caller is responsible for rbd_obj_request_destroy(obj_request) */
1155
1156 static int rbd_obj_request_wait(struct rbd_obj_request *obj_request)
1157 {
1158         dout("%s: obj %p\n", __func__, obj_request);
1159
1160         return wait_for_completion_interruptible(&obj_request->completion);
1161 }
1162
1163 static void obj_request_done_init(struct rbd_obj_request *obj_request)
1164 {
1165         atomic_set(&obj_request->done, 0);
1166         smp_wmb();
1167 }
1168
1169 static void obj_request_done_set(struct rbd_obj_request *obj_request)
1170 {
1171         int done;
1172
1173         done = atomic_inc_return(&obj_request->done);
1174         if (done > 1) {
1175                 struct rbd_img_request *img_request = obj_request->img_request;
1176                 struct rbd_device *rbd_dev;
1177
1178                 rbd_dev = img_request ? img_request->rbd_dev : NULL;
1179                 rbd_warn(rbd_dev, "obj_request %p was already done\n",
1180                         obj_request);
1181         }
1182 }
1183
1184 static bool obj_request_done_test(struct rbd_obj_request *obj_request)
1185 {
1186         smp_mb();
1187         return atomic_read(&obj_request->done) != 0;
1188 }
1189
1190 static void
1191 rbd_img_obj_request_read_callback(struct rbd_obj_request *obj_request)
1192 {
1193         dout("%s: obj %p img %p result %d %llu/%llu\n", __func__,
1194                 obj_request, obj_request->img_request, obj_request->result,
1195                 obj_request->xferred, obj_request->length);
1196         /*
1197          * ENOENT means a hole in the image.  We zero-fill the
1198          * entire length of the request.  A short read also implies
1199          * zero-fill to the end of the request.  Either way we
1200          * update the xferred count to indicate the whole request
1201          * was satisfied.
1202          */
1203         BUG_ON(obj_request->type != OBJ_REQUEST_BIO);
1204         if (obj_request->result == -ENOENT) {
1205                 zero_bio_chain(obj_request->bio_list, 0);
1206                 obj_request->result = 0;
1207                 obj_request->xferred = obj_request->length;
1208         } else if (obj_request->xferred < obj_request->length &&
1209                         !obj_request->result) {
1210                 zero_bio_chain(obj_request->bio_list, obj_request->xferred);
1211                 obj_request->xferred = obj_request->length;
1212         }
1213         obj_request_done_set(obj_request);
1214 }
1215
1216 static void rbd_obj_request_complete(struct rbd_obj_request *obj_request)
1217 {
1218         dout("%s: obj %p cb %p\n", __func__, obj_request,
1219                 obj_request->callback);
1220         if (obj_request->callback)
1221                 obj_request->callback(obj_request);
1222         else
1223                 complete_all(&obj_request->completion);
1224 }
1225
/*
 * Completion handling for ops that need no post-processing:
 * the object request is simply marked done.
 */
static void rbd_osd_trivial_callback(struct rbd_obj_request *obj_request)
{
	dout("%s: obj %p\n", __func__, obj_request);

	obj_request_done_set(obj_request);
}
1231
1232 static void rbd_osd_read_callback(struct rbd_obj_request *obj_request)
1233 {
1234         dout("%s: obj %p result %d %llu/%llu\n", __func__, obj_request,
1235                 obj_request->result, obj_request->xferred, obj_request->length);
1236         if (obj_request->img_request)
1237                 rbd_img_obj_request_read_callback(obj_request);
1238         else
1239                 obj_request_done_set(obj_request);
1240 }
1241
1242 static void rbd_osd_write_callback(struct rbd_obj_request *obj_request)
1243 {
1244         dout("%s: obj %p result %d %llu\n", __func__, obj_request,
1245                 obj_request->result, obj_request->length);
1246         /*
1247          * There is no such thing as a successful short write.
1248          * Our xferred value is the number of bytes transferred
1249          * back.  Set it to our originally-requested length.
1250          */
1251         obj_request->xferred = obj_request->length;
1252         obj_request_done_set(obj_request);
1253 }
1254
/*
 * Completion handling for a stat op.  A simple stat needs nothing
 * beyond being marked done.  More will be done here when a stat is
 * part of a write sequence for a layered image.
 */
static void rbd_osd_stat_callback(struct rbd_obj_request *obj_request)
{
	dout("%s: obj %p\n", __func__, obj_request);

	obj_request_done_set(obj_request);
}
1264
1265 static void rbd_osd_req_callback(struct ceph_osd_request *osd_req,
1266                                 struct ceph_msg *msg)
1267 {
1268         struct rbd_obj_request *obj_request = osd_req->r_priv;
1269         u16 opcode;
1270
1271         dout("%s: osd_req %p msg %p\n", __func__, osd_req, msg);
1272         rbd_assert(osd_req == obj_request->osd_req);
1273         rbd_assert(!!obj_request->img_request ^
1274                                 (obj_request->which == BAD_WHICH));
1275
1276         if (osd_req->r_result < 0)
1277                 obj_request->result = osd_req->r_result;
1278         obj_request->version = le64_to_cpu(osd_req->r_reassert_version.version);
1279
1280         WARN_ON(osd_req->r_num_ops != 1);       /* For now */
1281
1282         /*
1283          * We support a 64-bit length, but ultimately it has to be
1284          * passed to blk_end_request(), which takes an unsigned int.
1285          */
1286         obj_request->xferred = osd_req->r_reply_op_len[0];
1287         rbd_assert(obj_request->xferred < (u64) UINT_MAX);
1288         opcode = osd_req->r_ops[0].op;
1289         switch (opcode) {
1290         case CEPH_OSD_OP_READ:
1291                 rbd_osd_read_callback(obj_request);
1292                 break;
1293         case CEPH_OSD_OP_WRITE:
1294                 rbd_osd_write_callback(obj_request);
1295                 break;
1296         case CEPH_OSD_OP_STAT:
1297                 rbd_osd_stat_callback(obj_request);
1298                 break;
1299         case CEPH_OSD_OP_CALL:
1300         case CEPH_OSD_OP_NOTIFY_ACK:
1301         case CEPH_OSD_OP_WATCH:
1302                 rbd_osd_trivial_callback(obj_request);
1303                 break;
1304         default:
1305                 rbd_warn(NULL, "%s: unsupported op %hu\n",
1306                         obj_request->object_name, (unsigned short) opcode);
1307                 break;
1308         }
1309
1310         if (obj_request_done_test(obj_request))
1311                 rbd_obj_request_complete(obj_request);
1312 }
1313
1314 static void rbd_osd_req_format(struct rbd_obj_request *obj_request,
1315                                         bool write_request)
1316 {
1317         struct rbd_img_request *img_request = obj_request->img_request;
1318         struct ceph_osd_request *osd_req = obj_request->osd_req;
1319         struct ceph_snap_context *snapc = NULL;
1320         u64 snap_id = CEPH_NOSNAP;
1321         struct timespec *mtime = NULL;
1322         struct timespec now;
1323
1324         rbd_assert(osd_req != NULL);
1325
1326         if (write_request) {
1327                 now = CURRENT_TIME;
1328                 mtime = &now;
1329                 if (img_request)
1330                         snapc = img_request->snapc;
1331         } else if (img_request) {
1332                 snap_id = img_request->snap_id;
1333         }
1334         ceph_osdc_build_request(osd_req, obj_request->offset,
1335                         snapc, snap_id, mtime);
1336 }
1337
1338 static struct ceph_osd_request *rbd_osd_req_create(
1339                                         struct rbd_device *rbd_dev,
1340                                         bool write_request,
1341                                         struct rbd_obj_request *obj_request)
1342 {
1343         struct rbd_img_request *img_request = obj_request->img_request;
1344         struct ceph_snap_context *snapc = NULL;
1345         struct ceph_osd_client *osdc;
1346         struct ceph_osd_request *osd_req;
1347
1348         if (img_request) {
1349                 rbd_assert(img_request->write_request == write_request);
1350                 if (img_request->write_request)
1351                         snapc = img_request->snapc;
1352         }
1353
1354         /* Allocate and initialize the request, for the single op */
1355
1356         osdc = &rbd_dev->rbd_client->client->osdc;
1357         osd_req = ceph_osdc_alloc_request(osdc, snapc, 1, false, GFP_ATOMIC);
1358         if (!osd_req)
1359                 return NULL;    /* ENOMEM */
1360
1361         if (write_request)
1362                 osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
1363         else
1364                 osd_req->r_flags = CEPH_OSD_FLAG_READ;
1365
1366         osd_req->r_callback = rbd_osd_req_callback;
1367         osd_req->r_priv = obj_request;
1368
1369         osd_req->r_oid_len = strlen(obj_request->object_name);
1370         rbd_assert(osd_req->r_oid_len < sizeof (osd_req->r_oid));
1371         memcpy(osd_req->r_oid, obj_request->object_name, osd_req->r_oid_len);
1372
1373         osd_req->r_file_layout = rbd_dev->layout;       /* struct */
1374
1375         return osd_req;
1376 }
1377
/* Release an osd request by dropping a reference on it. */
static void rbd_osd_req_destroy(struct ceph_osd_request *osd_req)
{
	ceph_osdc_put_request(osd_req);
}
1382
1383 /* object_name is assumed to be a non-null pointer and NUL-terminated */
1384
1385 static struct rbd_obj_request *rbd_obj_request_create(const char *object_name,
1386                                                 u64 offset, u64 length,
1387                                                 enum obj_request_type type)
1388 {
1389         struct rbd_obj_request *obj_request;
1390         size_t size;
1391         char *name;
1392
1393         rbd_assert(obj_request_type_valid(type));
1394
1395         size = strlen(object_name) + 1;
1396         obj_request = kzalloc(sizeof (*obj_request) + size, GFP_KERNEL);
1397         if (!obj_request)
1398                 return NULL;
1399
1400         name = (char *)(obj_request + 1);
1401         obj_request->object_name = memcpy(name, object_name, size);
1402         obj_request->offset = offset;
1403         obj_request->length = length;
1404         obj_request->which = BAD_WHICH;
1405         obj_request->type = type;
1406         INIT_LIST_HEAD(&obj_request->links);
1407         obj_request_done_init(obj_request);
1408         init_completion(&obj_request->completion);
1409         kref_init(&obj_request->kref);
1410
1411         dout("%s: \"%s\" %llu/%llu %d -> obj %p\n", __func__, object_name,
1412                 offset, length, (int)type, obj_request);
1413
1414         return obj_request;
1415 }
1416
1417 static void rbd_obj_request_destroy(struct kref *kref)
1418 {
1419         struct rbd_obj_request *obj_request;
1420
1421         obj_request = container_of(kref, struct rbd_obj_request, kref);
1422
1423         dout("%s: obj %p\n", __func__, obj_request);
1424
1425         rbd_assert(obj_request->img_request == NULL);
1426         rbd_assert(obj_request->which == BAD_WHICH);
1427
1428         if (obj_request->osd_req)
1429                 rbd_osd_req_destroy(obj_request->osd_req);
1430
1431         rbd_assert(obj_request_type_valid(obj_request->type));
1432         switch (obj_request->type) {
1433         case OBJ_REQUEST_NODATA:
1434                 break;          /* Nothing to do */
1435         case OBJ_REQUEST_BIO:
1436                 if (obj_request->bio_list)
1437                         bio_chain_put(obj_request->bio_list);
1438                 break;
1439         case OBJ_REQUEST_PAGES:
1440                 if (obj_request->pages)
1441                         ceph_release_page_vector(obj_request->pages,
1442                                                 obj_request->page_count);
1443                 break;
1444         }
1445
1446         kfree(obj_request);
1447 }
1448
1449 /*
1450  * Caller is responsible for filling in the list of object requests
1451  * that comprises the image request, and the Linux request pointer
1452  * (if there is one).
1453  */
1454 static struct rbd_img_request *rbd_img_request_create(
1455                                         struct rbd_device *rbd_dev,
1456                                         u64 offset, u64 length,
1457                                         bool write_request)
1458 {
1459         struct rbd_img_request *img_request;
1460         struct ceph_snap_context *snapc = NULL;
1461
1462         img_request = kmalloc(sizeof (*img_request), GFP_ATOMIC);
1463         if (!img_request)
1464                 return NULL;
1465
1466         if (write_request) {
1467                 down_read(&rbd_dev->header_rwsem);
1468                 snapc = ceph_get_snap_context(rbd_dev->header.snapc);
1469                 up_read(&rbd_dev->header_rwsem);
1470                 if (WARN_ON(!snapc)) {
1471                         kfree(img_request);
1472                         return NULL;    /* Shouldn't happen */
1473                 }
1474         }
1475
1476         img_request->rq = NULL;
1477         img_request->rbd_dev = rbd_dev;
1478         img_request->offset = offset;
1479         img_request->length = length;
1480         img_request->write_request = write_request;
1481         if (write_request)
1482                 img_request->snapc = snapc;
1483         else
1484                 img_request->snap_id = rbd_dev->spec->snap_id;
1485         spin_lock_init(&img_request->completion_lock);
1486         img_request->next_completion = 0;
1487         img_request->callback = NULL;
1488         img_request->obj_request_count = 0;
1489         INIT_LIST_HEAD(&img_request->obj_requests);
1490         kref_init(&img_request->kref);
1491
1492         rbd_img_request_get(img_request);       /* Avoid a warning */
1493         rbd_img_request_put(img_request);       /* TEMPORARY */
1494
1495         dout("%s: rbd_dev %p %s %llu/%llu -> img %p\n", __func__, rbd_dev,
1496                 write_request ? "write" : "read", offset, length,
1497                 img_request);
1498
1499         return img_request;
1500 }
1501
1502 static void rbd_img_request_destroy(struct kref *kref)
1503 {
1504         struct rbd_img_request *img_request;
1505         struct rbd_obj_request *obj_request;
1506         struct rbd_obj_request *next_obj_request;
1507
1508         img_request = container_of(kref, struct rbd_img_request, kref);
1509
1510         dout("%s: img %p\n", __func__, img_request);
1511
1512         for_each_obj_request_safe(img_request, obj_request, next_obj_request)
1513                 rbd_img_obj_request_del(img_request, obj_request);
1514         rbd_assert(img_request->obj_request_count == 0);
1515
1516         if (img_request->write_request)
1517                 ceph_put_snap_context(img_request->snapc);
1518
1519         kfree(img_request);
1520 }
1521
1522 static int rbd_img_request_fill_bio(struct rbd_img_request *img_request,
1523                                         struct bio *bio_list)
1524 {
1525         struct rbd_device *rbd_dev = img_request->rbd_dev;
1526         struct rbd_obj_request *obj_request = NULL;
1527         struct rbd_obj_request *next_obj_request;
1528         bool write_request = img_request->write_request;
1529         unsigned int bio_offset;
1530         u64 image_offset;
1531         u64 resid;
1532         u16 opcode;
1533
1534         dout("%s: img %p bio %p\n", __func__, img_request, bio_list);
1535
1536         opcode = write_request ? CEPH_OSD_OP_WRITE : CEPH_OSD_OP_READ;
1537         bio_offset = 0;
1538         image_offset = img_request->offset;
1539         rbd_assert(image_offset == bio_list->bi_sector << SECTOR_SHIFT);
1540         resid = img_request->length;
1541         rbd_assert(resid > 0);
1542         while (resid) {
1543                 struct ceph_osd_request *osd_req;
1544                 struct ceph_osd_data *osd_data;
1545                 const char *object_name;
1546                 unsigned int clone_size;
1547                 u64 offset;
1548                 u64 length;
1549
1550                 object_name = rbd_segment_name(rbd_dev, image_offset);
1551                 if (!object_name)
1552                         goto out_unwind;
1553                 offset = rbd_segment_offset(rbd_dev, image_offset);
1554                 length = rbd_segment_length(rbd_dev, image_offset, resid);
1555                 obj_request = rbd_obj_request_create(object_name,
1556                                                 offset, length,
1557                                                 OBJ_REQUEST_BIO);
1558                 kfree(object_name);     /* object request has its own copy */
1559                 if (!obj_request)
1560                         goto out_unwind;
1561
1562                 rbd_assert(length <= (u64) UINT_MAX);
1563                 clone_size = (unsigned int) length;
1564                 obj_request->bio_list = bio_chain_clone_range(&bio_list,
1565                                                 &bio_offset, clone_size,
1566                                                 GFP_ATOMIC);
1567                 if (!obj_request->bio_list)
1568                         goto out_partial;
1569
1570                 osd_req = rbd_osd_req_create(rbd_dev, write_request,
1571                                                 obj_request);
1572                 if (!osd_req)
1573                         goto out_partial;
1574                 obj_request->osd_req = osd_req;
1575
1576                 osd_data = write_request ? &osd_req->r_data_out
1577                                          : &osd_req->r_data_in;
1578                 osd_req_op_extent_init(osd_req, 0, opcode, offset, length,
1579                                                 0, 0);
1580                 ceph_osd_data_bio_init(osd_data, obj_request->bio_list,
1581                                         obj_request->length);
1582                 osd_req_op_extent_osd_data(osd_req, 0, osd_data);
1583                 rbd_osd_req_format(obj_request, write_request);
1584
1585                 /* status and version are initially zero-filled */
1586
1587                 rbd_img_obj_request_add(img_request, obj_request);
1588
1589                 image_offset += length;
1590                 resid -= length;
1591         }
1592
1593         return 0;
1594
1595 out_partial:
1596         rbd_obj_request_put(obj_request);
1597 out_unwind:
1598         for_each_obj_request_safe(img_request, obj_request, next_obj_request)
1599                 rbd_obj_request_put(obj_request);
1600
1601         return -ENOMEM;
1602 }
1603
1604 static void rbd_img_obj_callback(struct rbd_obj_request *obj_request)
1605 {
1606         struct rbd_img_request *img_request;
1607         u32 which = obj_request->which;
1608         bool more = true;
1609
1610         img_request = obj_request->img_request;
1611
1612         dout("%s: img %p obj %p\n", __func__, img_request, obj_request);
1613         rbd_assert(img_request != NULL);
1614         rbd_assert(img_request->rq != NULL);
1615         rbd_assert(img_request->obj_request_count > 0);
1616         rbd_assert(which != BAD_WHICH);
1617         rbd_assert(which < img_request->obj_request_count);
1618         rbd_assert(which >= img_request->next_completion);
1619
1620         spin_lock_irq(&img_request->completion_lock);
1621         if (which != img_request->next_completion)
1622                 goto out;
1623
1624         for_each_obj_request_from(img_request, obj_request) {
1625                 unsigned int xferred;
1626                 int result;
1627
1628                 rbd_assert(more);
1629                 rbd_assert(which < img_request->obj_request_count);
1630
1631                 if (!obj_request_done_test(obj_request))
1632                         break;
1633
1634                 rbd_assert(obj_request->xferred <= (u64) UINT_MAX);
1635                 xferred = (unsigned int) obj_request->xferred;
1636                 result = (int) obj_request->result;
1637                 if (result)
1638                         rbd_warn(NULL, "obj_request %s result %d xferred %u\n",
1639                                 img_request->write_request ? "write" : "read",
1640                                 result, xferred);
1641
1642                 more = blk_end_request(img_request->rq, result, xferred);
1643                 which++;
1644         }
1645
1646         rbd_assert(more ^ (which == img_request->obj_request_count));
1647         img_request->next_completion = which;
1648 out:
1649         spin_unlock_irq(&img_request->completion_lock);
1650
1651         if (!more)
1652                 rbd_img_request_complete(img_request);
1653 }
1654
1655 static int rbd_img_request_submit(struct rbd_img_request *img_request)
1656 {
1657         struct rbd_device *rbd_dev = img_request->rbd_dev;
1658         struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
1659         struct rbd_obj_request *obj_request;
1660         struct rbd_obj_request *next_obj_request;
1661
1662         dout("%s: img %p\n", __func__, img_request);
1663         for_each_obj_request_safe(img_request, obj_request, next_obj_request) {
1664                 int ret;
1665
1666                 obj_request->callback = rbd_img_obj_callback;
1667                 ret = rbd_obj_request_submit(osdc, obj_request);
1668                 if (ret)
1669                         return ret;
1670                 /*
1671                  * The image request has its own reference to each
1672                  * of its object requests, so we can safely drop the
1673                  * initial one here.
1674                  */
1675                 rbd_obj_request_put(obj_request);
1676         }
1677
1678         return 0;
1679 }
1680
1681 static int rbd_obj_notify_ack(struct rbd_device *rbd_dev,
1682                                    u64 ver, u64 notify_id)
1683 {
1684         struct rbd_obj_request *obj_request;
1685         struct ceph_osd_client *osdc;
1686         int ret;
1687
1688         obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0,
1689                                                         OBJ_REQUEST_NODATA);
1690         if (!obj_request)
1691                 return -ENOMEM;
1692
1693         ret = -ENOMEM;
1694         obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, obj_request);
1695         if (!obj_request->osd_req)
1696                 goto out;
1697
1698         osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_NOTIFY_ACK,
1699                                         notify_id, ver, 0);
1700         rbd_osd_req_format(obj_request, false);
1701
1702         osdc = &rbd_dev->rbd_client->client->osdc;
1703         obj_request->callback = rbd_obj_request_put;
1704         ret = rbd_obj_request_submit(osdc, obj_request);
1705 out:
1706         if (ret)
1707                 rbd_obj_request_put(obj_request);
1708
1709         return ret;
1710 }
1711
1712 static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
1713 {
1714         struct rbd_device *rbd_dev = (struct rbd_device *)data;
1715         u64 hver;
1716         int rc;
1717
1718         if (!rbd_dev)
1719                 return;
1720
1721         dout("%s: \"%s\" notify_id %llu opcode %u\n", __func__,
1722                 rbd_dev->header_name, (unsigned long long) notify_id,
1723                 (unsigned int) opcode);
1724         rc = rbd_dev_refresh(rbd_dev, &hver);
1725         if (rc)
1726                 rbd_warn(rbd_dev, "got notification but failed to "
1727                            " update snaps: %d\n", rc);
1728
1729         rbd_obj_notify_ack(rbd_dev, hver, notify_id);
1730 }
1731
/*
 * Request sync osd watch/unwatch.  The value of "start" determines
 * whether a watch request is being initiated or torn down.
 *
 * When starting, an osd event is created with rbd_watch_cb() as its
 * callback and a lingering CEPH_OSD_OP_WATCH request is submitted
 * for the header object.  On success that request is remembered in
 * rbd_dev->watch_request and 0 is returned.
 *
 * When tearing down, the previously registered lingering request is
 * unregistered, a CEPH_OSD_OP_WATCH request with a zero "start" flag
 * is submitted, and on success the saved watch request and the
 * event are both released.
 *
 * Returns 0 on success, -ENOMEM if a request cannot be created, or
 * the error from event creation, submission, waiting, or the osd
 * result.
 *
 * NOTE(review): every error path goes through out_cancel, which
 * cancels and clears watch_event.  For a failed teardown this leaves
 * rbd_dev->watch_request set while watch_event is NULL -- confirm
 * callers never retry in that state, since the assertions below
 * would then disagree.
 */
static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev, int start)
{
        struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
        struct rbd_obj_request *obj_request;
        int ret;

        /*
         * Starting requires that no event/request exists yet; tearing
         * down requires that both exist.
         */
        rbd_assert(start ^ !!rbd_dev->watch_event);
        rbd_assert(start ^ !!rbd_dev->watch_request);

        if (start) {
                ret = ceph_osdc_create_event(osdc, rbd_watch_cb, rbd_dev,
                                                &rbd_dev->watch_event);
                if (ret < 0)
                        return ret;
                rbd_assert(rbd_dev->watch_event != NULL);
        }

        ret = -ENOMEM;
        obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0,
                                                        OBJ_REQUEST_NODATA);
        if (!obj_request)
                goto out_cancel;

        obj_request->osd_req = rbd_osd_req_create(rbd_dev, true, obj_request);
        if (!obj_request->osd_req)
                goto out_cancel;

        /* The event's cookie ties the osd watch to our event */
        osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_WATCH,
                                rbd_dev->watch_event->cookie,
                                rbd_dev->header.obj_version, start);
        rbd_osd_req_format(obj_request, true);

        /*
         * A new watch lingers; for a teardown the linger request
         * registered when the watch was started is unregistered
         * before the unwatch is sent.
         */
        if (start)
                ceph_osdc_set_request_linger(osdc, obj_request->osd_req);
        else
                ceph_osdc_unregister_linger_request(osdc,
                                        rbd_dev->watch_request->osd_req);
        ret = rbd_obj_request_submit(osdc, obj_request);
        if (ret)
                goto out_cancel;
        ret = rbd_obj_request_wait(obj_request);
        if (ret)
                goto out_cancel;
        ret = obj_request->result;
        if (ret)
                goto out_cancel;

        /*
         * A watch request is set to linger, so the underlying osd
         * request won't go away until we unregister it.  We retain
         * a pointer to the object request during that time (in
         * rbd_dev->watch_request), so we'll keep a reference to
         * it.  We'll drop that reference (below) after we've
         * unregistered it.
         */
        if (start) {
                rbd_dev->watch_request = obj_request;

                return 0;
        }

        /* We have successfully torn down the watch request */

        rbd_obj_request_put(rbd_dev->watch_request);
        rbd_dev->watch_request = NULL;
        /* Successful teardown deliberately falls through to the cleanup */
out_cancel:
        /* Cancel the event if we're tearing down, or on error */
        ceph_osdc_cancel_event(rbd_dev->watch_event);
        rbd_dev->watch_event = NULL;
        if (obj_request)
                rbd_obj_request_put(obj_request);

        return ret;
}
1810
1811 /*
1812  * Synchronous osd object method call
1813  */
1814 static int rbd_obj_method_sync(struct rbd_device *rbd_dev,
1815                              const char *object_name,
1816                              const char *class_name,
1817                              const char *method_name,
1818                              const char *outbound,
1819                              size_t outbound_size,
1820                              char *inbound,
1821                              size_t inbound_size,
1822                              u64 *version)
1823 {
1824         struct rbd_obj_request *obj_request;
1825         struct ceph_osd_data *osd_data;
1826         struct ceph_osd_client *osdc;
1827         struct page **pages;
1828         u32 page_count;
1829         int ret;
1830
1831         /*
1832          * Method calls are ultimately read operations.  The result
1833          * should placed into the inbound buffer provided.  They
1834          * also supply outbound data--parameters for the object
1835          * method.  Currently if this is present it will be a
1836          * snapshot id.
1837          */
1838         page_count = (u32) calc_pages_for(0, inbound_size);
1839         pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
1840         if (IS_ERR(pages))
1841                 return PTR_ERR(pages);
1842
1843         ret = -ENOMEM;
1844         obj_request = rbd_obj_request_create(object_name, 0, inbound_size,
1845                                                         OBJ_REQUEST_PAGES);
1846         if (!obj_request)
1847                 goto out;
1848
1849         obj_request->pages = pages;
1850         obj_request->page_count = page_count;
1851
1852         obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, obj_request);
1853         if (!obj_request->osd_req)
1854                 goto out;
1855
1856         osd_data = &obj_request->osd_req->r_data_in;
1857         osd_req_op_cls_init(obj_request->osd_req, 0, CEPH_OSD_OP_CALL,
1858                                         class_name, method_name,
1859                                         outbound, outbound_size);
1860         ceph_osd_data_pages_init(osd_data, obj_request->pages, inbound_size,
1861                                         0, false, false);
1862         osd_req_op_cls_response_data(obj_request->osd_req, 0, osd_data);
1863         rbd_osd_req_format(obj_request, false);
1864
1865         osdc = &rbd_dev->rbd_client->client->osdc;
1866         ret = rbd_obj_request_submit(osdc, obj_request);
1867         if (ret)
1868                 goto out;
1869         ret = rbd_obj_request_wait(obj_request);
1870         if (ret)
1871                 goto out;
1872
1873         ret = obj_request->result;
1874         if (ret < 0)
1875                 goto out;
1876         ret = 0;
1877         ceph_copy_from_page_vector(pages, inbound, 0, obj_request->xferred);
1878         if (version)
1879                 *version = obj_request->version;
1880 out:
1881         if (obj_request)
1882                 rbd_obj_request_put(obj_request);
1883         else
1884                 ceph_release_page_vector(pages, page_count);
1885
1886         return ret;
1887 }
1888
/*
 * Block layer request function for an rbd device.
 *
 * Drains the request queue.  For each request an image request
 * covering the same byte range is created, filled from the
 * request's bio chain and submitted.  The queue lock is dropped
 * while that work is done and re-taken before the next request is
 * fetched (see the __releases/__acquires annotations).
 *
 * Requests that are not REQ_TYPE_FS, and zero-length requests, are
 * completed immediately with status 0.  A request that fails any
 * of the checks below, or fails to be set up or submitted, is
 * completed here with the negative error.  A successfully submitted
 * request is not completed in this function.
 */
static void rbd_request_fn(struct request_queue *q)
                __releases(q->queue_lock) __acquires(q->queue_lock)
{
        struct rbd_device *rbd_dev = q->queuedata;
        /* Sampled once; applies to every request handled by this call */
        bool read_only = rbd_dev->mapping.read_only;
        struct request *rq;
        int result;

        while ((rq = blk_fetch_request(q))) {
                bool write_request = rq_data_dir(rq) == WRITE;
                struct rbd_img_request *img_request;
                u64 offset;
                u64 length;

                /* Ignore any non-FS requests that filter through. */

                if (rq->cmd_type != REQ_TYPE_FS) {
                        dout("%s: non-fs request type %d\n", __func__,
                                (int) rq->cmd_type);
                        __blk_end_request_all(rq, 0);
                        continue;
                }

                /* Ignore/skip any zero-length requests */

                offset = (u64) blk_rq_pos(rq) << SECTOR_SHIFT;
                length = (u64) blk_rq_bytes(rq);

                if (!length) {
                        dout("%s: zero-length request\n", __func__);
                        __blk_end_request_all(rq, 0);
                        continue;
                }

                /* The rest runs unlocked; end_request re-takes the lock */
                spin_unlock_irq(q->queue_lock);

                /* Disallow writes to a read-only device */

                if (write_request) {
                        result = -EROFS;
                        if (read_only)
                                goto end_request;
                        rbd_assert(rbd_dev->spec->snap_id == CEPH_NOSNAP);
                }

                /*
                 * Quit early if the mapped snapshot no longer
                 * exists.  It's still possible the snapshot will
                 * have disappeared by the time our request arrives
                 * at the osd, but there's no sense in sending it if
                 * we already know.
                 */
                if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags)) {
                        dout("request for non-existent snapshot");
                        rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP);
                        result = -ENXIO;
                        goto end_request;
                }

                /* Reject a range whose end would wrap past U64_MAX */
                result = -EINVAL;
                if (WARN_ON(offset && length > U64_MAX - offset + 1))
                        goto end_request;       /* Shouldn't happen */

                result = -ENOMEM;
                img_request = rbd_img_request_create(rbd_dev, offset, length,
                                                        write_request);
                if (!img_request)
                        goto end_request;

                img_request->rq = rq;

                result = rbd_img_request_fill_bio(img_request, rq->bio);
                if (!result)
                        result = rbd_img_request_submit(img_request);
                /* Release the image request if it never got submitted */
                if (result)
                        rbd_img_request_put(img_request);
end_request:
                spin_lock_irq(q->queue_lock);
                /* Only failures are completed here */
                if (result < 0) {
                        rbd_warn(rbd_dev, "obj_request %s result %d\n",
                                write_request ? "write" : "read", result);
                        __blk_end_request_all(rq, result);
                }
        }
}
1974
1975 /*
1976  * a queue callback. Makes sure that we don't create a bio that spans across
1977  * multiple osd objects. One exception would be with a single page bios,
1978  * which we handle later at bio_chain_clone_range()
1979  */
1980 static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd,
1981                           struct bio_vec *bvec)
1982 {
1983         struct rbd_device *rbd_dev = q->queuedata;
1984         sector_t sector_offset;
1985         sector_t sectors_per_obj;
1986         sector_t obj_sector_offset;
1987         int ret;
1988
1989         /*
1990          * Find how far into its rbd object the partition-relative
1991          * bio start sector is to offset relative to the enclosing
1992          * device.
1993          */
1994         sector_offset = get_start_sect(bmd->bi_bdev) + bmd->bi_sector;
1995         sectors_per_obj = 1 << (rbd_dev->header.obj_order - SECTOR_SHIFT);
1996         obj_sector_offset = sector_offset & (sectors_per_obj - 1);
1997
1998         /*
1999          * Compute the number of bytes from that offset to the end
2000          * of the object.  Account for what's already used by the bio.
2001          */
2002         ret = (int) (sectors_per_obj - obj_sector_offset) << SECTOR_SHIFT;
2003         if (ret > bmd->bi_size)
2004                 ret -= bmd->bi_size;
2005         else
2006                 ret = 0;
2007
2008         /*
2009          * Don't send back more than was asked for.  And if the bio
2010          * was empty, let the whole thing through because:  "Note
2011          * that a block device *must* allow a single page to be
2012          * added to an empty bio."
2013          */
2014         rbd_assert(bvec->bv_len <= PAGE_SIZE);
2015         if (ret > (int) bvec->bv_len || !bmd->bi_size)
2016                 ret = (int) bvec->bv_len;
2017
2018         return ret;
2019 }
2020
2021 static void rbd_free_disk(struct rbd_device *rbd_dev)
2022 {
2023         struct gendisk *disk = rbd_dev->disk;
2024
2025         if (!disk)
2026                 return;
2027
2028         if (disk->flags & GENHD_FL_UP)
2029                 del_gendisk(disk);
2030         if (disk->queue)
2031                 blk_cleanup_queue(disk->queue);
2032         put_disk(disk);
2033 }
2034
2035 static int rbd_obj_read_sync(struct rbd_device *rbd_dev,
2036                                 const char *object_name,
2037                                 u64 offset, u64 length,
2038                                 char *buf, u64 *version)
2039
2040 {
2041         struct rbd_obj_request *obj_request;
2042         struct ceph_osd_data *osd_data;
2043         struct ceph_osd_client *osdc;
2044         struct page **pages = NULL;
2045         u32 page_count;
2046         size_t size;
2047         int ret;
2048
2049         page_count = (u32) calc_pages_for(offset, length);
2050         pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
2051         if (IS_ERR(pages))
2052                 ret = PTR_ERR(pages);
2053
2054         ret = -ENOMEM;
2055         obj_request = rbd_obj_request_create(object_name, offset, length,
2056                                                         OBJ_REQUEST_PAGES);
2057         if (!obj_request)
2058                 goto out;
2059
2060         obj_request->pages = pages;
2061         obj_request->page_count = page_count;
2062
2063         obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, obj_request);
2064         if (!obj_request->osd_req)
2065                 goto out;
2066
2067         osd_data = &obj_request->osd_req->r_data_in;
2068         osd_req_op_extent_init(obj_request->osd_req, 0, CEPH_OSD_OP_READ,
2069                                         offset, length, 0, 0);
2070         ceph_osd_data_pages_init(osd_data, obj_request->pages,
2071                                         obj_request->length,
2072                                         obj_request->offset & ~PAGE_MASK,
2073                                         false, false);
2074         osd_req_op_extent_osd_data(obj_request->osd_req, 0, osd_data);
2075         rbd_osd_req_format(obj_request, false);
2076
2077         osdc = &rbd_dev->rbd_client->client->osdc;
2078         ret = rbd_obj_request_submit(osdc, obj_request);
2079         if (ret)
2080                 goto out;
2081         ret = rbd_obj_request_wait(obj_request);
2082         if (ret)
2083                 goto out;
2084
2085         ret = obj_request->result;
2086         if (ret < 0)
2087                 goto out;
2088
2089         rbd_assert(obj_request->xferred <= (u64) SIZE_MAX);
2090         size = (size_t) obj_request->xferred;
2091         ceph_copy_from_page_vector(pages, buf, 0, size);
2092         rbd_assert(size <= (size_t) INT_MAX);
2093         ret = (int) size;
2094         if (version)
2095                 *version = obj_request->version;
2096 out:
2097         if (obj_request)
2098                 rbd_obj_request_put(obj_request);
2099         else
2100                 ceph_release_page_vector(pages, page_count);
2101
2102         return ret;
2103 }
2104
2105 /*
2106  * Read the complete header for the given rbd device.
2107  *
2108  * Returns a pointer to a dynamically-allocated buffer containing
2109  * the complete and validated header.  Caller can pass the address
2110  * of a variable that will be filled in with the version of the
2111  * header object at the time it was read.
2112  *
2113  * Returns a pointer-coded errno if a failure occurs.
2114  */
2115 static struct rbd_image_header_ondisk *
2116 rbd_dev_v1_header_read(struct rbd_device *rbd_dev, u64 *version)
2117 {
2118         struct rbd_image_header_ondisk *ondisk = NULL;
2119         u32 snap_count = 0;
2120         u64 names_size = 0;
2121         u32 want_count;
2122         int ret;
2123
2124         /*
2125          * The complete header will include an array of its 64-bit
2126          * snapshot ids, followed by the names of those snapshots as
2127          * a contiguous block of NUL-terminated strings.  Note that
2128          * the number of snapshots could change by the time we read
2129          * it in, in which case we re-read it.
2130          */
2131         do {
2132                 size_t size;
2133
2134                 kfree(ondisk);
2135
2136                 size = sizeof (*ondisk);
2137                 size += snap_count * sizeof (struct rbd_image_snap_ondisk);
2138                 size += names_size;
2139                 ondisk = kmalloc(size, GFP_KERNEL);
2140                 if (!ondisk)
2141                         return ERR_PTR(-ENOMEM);
2142
2143                 ret = rbd_obj_read_sync(rbd_dev, rbd_dev->header_name,
2144                                        0, size,
2145                                        (char *) ondisk, version);
2146                 if (ret < 0)
2147                         goto out_err;
2148                 if (WARN_ON((size_t) ret < size)) {
2149                         ret = -ENXIO;
2150                         rbd_warn(rbd_dev, "short header read (want %zd got %d)",
2151                                 size, ret);
2152                         goto out_err;
2153                 }
2154                 if (!rbd_dev_ondisk_valid(ondisk)) {
2155                         ret = -ENXIO;
2156                         rbd_warn(rbd_dev, "invalid header");
2157                         goto out_err;
2158                 }
2159
2160                 names_size = le64_to_cpu(ondisk->snap_names_len);
2161                 want_count = snap_count;
2162                 snap_count = le32_to_cpu(ondisk->snap_count);
2163         } while (snap_count != want_count);
2164
2165         return ondisk;
2166
2167 out_err:
2168         kfree(ondisk);
2169
2170         return ERR_PTR(ret);
2171 }
2172
2173 /*
2174  * reload the ondisk the header
2175  */
2176 static int rbd_read_header(struct rbd_device *rbd_dev,
2177                            struct rbd_image_header *header)
2178 {
2179         struct rbd_image_header_ondisk *ondisk;
2180         u64 ver = 0;
2181         int ret;
2182
2183         ondisk = rbd_dev_v1_header_read(rbd_dev, &ver);
2184         if (IS_ERR(ondisk))
2185                 return PTR_ERR(ondisk);
2186         ret = rbd_header_from_disk(header, ondisk);
2187         if (ret >= 0)
2188                 header->obj_version = ver;
2189         kfree(ondisk);
2190
2191         return ret;
2192 }
2193
2194 static void rbd_remove_all_snaps(struct rbd_device *rbd_dev)
2195 {
2196         struct rbd_snap *snap;
2197         struct rbd_snap *next;
2198
2199         list_for_each_entry_safe(snap, next, &rbd_dev->snaps, node)
2200                 rbd_remove_snap_dev(snap);
2201 }
2202
2203 static void rbd_update_mapping_size(struct rbd_device *rbd_dev)
2204 {
2205         sector_t size;
2206
2207         if (rbd_dev->spec->snap_id != CEPH_NOSNAP)
2208                 return;
2209
2210         size = (sector_t) rbd_dev->header.image_size / SECTOR_SIZE;
2211         dout("setting size to %llu sectors", (unsigned long long) size);
2212         rbd_dev->mapping.size = (u64) size;
2213         set_capacity(rbd_dev->disk, size);
2214 }
2215
/*
 * Re-read the format 1 header and install the result in the device.
 *
 * The header is read (outside the header semaphore) into a local
 * copy.  Then, with header_rwsem held for writing, the image size,
 * object version, snapshot context, snapshot names and snapshot
 * sizes in rbd_dev->header are replaced, and the device's snapshot
 * list is updated and registered.  If "hver" is non-null it
 * receives the version of the header that was read.
 *
 * Returns 0 on success or a negative errno.  If reading the header
 * fails, the function returns before *hver is written.
 */
static int rbd_dev_v1_refresh(struct rbd_device *rbd_dev, u64 *hver)
{
        int ret;
        struct rbd_image_header h;

        ret = rbd_read_header(rbd_dev, &h);
        if (ret < 0)
                return ret;

        down_write(&rbd_dev->header_rwsem);

        /* Update image size, and check for resize of mapped image */
        rbd_dev->header.image_size = h.image_size;
        rbd_update_mapping_size(rbd_dev);

        /* rbd_dev->header.object_prefix shouldn't change */
        kfree(rbd_dev->header.snap_sizes);
        kfree(rbd_dev->header.snap_names);
        /* osd requests may still refer to snapc */
        ceph_put_snap_context(rbd_dev->header.snapc);

        if (hver)
                *hver = h.obj_version;
        rbd_dev->header.obj_version = h.obj_version;
        /* NOTE(review): image_size was already assigned above */
        rbd_dev->header.image_size = h.image_size;
        /* The fields of the local copy now belong to the device header */
        rbd_dev->header.snapc = h.snapc;
        rbd_dev->header.snap_names = h.snap_names;
        rbd_dev->header.snap_sizes = h.snap_sizes;
        /* Free the extra copy of the object prefix */
        WARN_ON(strcmp(rbd_dev->header.object_prefix, h.object_prefix));
        kfree(h.object_prefix);

        ret = rbd_dev_snaps_update(rbd_dev);
        if (!ret)
                ret = rbd_dev_snaps_register(rbd_dev);

        up_write(&rbd_dev->header_rwsem);

        return ret;
}
2259
2260 static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver)
2261 {
2262         int ret;
2263
2264         rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
2265         mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
2266         if (rbd_dev->image_format == 1)
2267                 ret = rbd_dev_v1_refresh(rbd_dev, hver);
2268         else
2269                 ret = rbd_dev_v2_refresh(rbd_dev, hver);
2270         mutex_unlock(&ctl_mutex);
2271
2272         return ret;
2273 }
2274
2275 static int rbd_init_disk(struct rbd_device *rbd_dev)
2276 {
2277         struct gendisk *disk;
2278         struct request_queue *q;
2279         u64 segment_size;
2280
2281         /* create gendisk info */
2282         disk = alloc_disk(RBD_MINORS_PER_MAJOR);
2283         if (!disk)
2284                 return -ENOMEM;
2285
2286         snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
2287                  rbd_dev->dev_id);
2288         disk->major = rbd_dev->major;
2289         disk->first_minor = 0;
2290         disk->fops = &rbd_bd_ops;
2291         disk->private_data = rbd_dev;
2292
2293         q = blk_init_queue(rbd_request_fn, &rbd_dev->lock);
2294         if (!q)
2295                 goto out_disk;
2296
2297         /* We use the default size, but let's be explicit about it. */
2298         blk_queue_physical_block_size(q, SECTOR_SIZE);
2299
2300         /* set io sizes to object size */
2301         segment_size = rbd_obj_bytes(&rbd_dev->header);
2302         blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE);
2303         blk_queue_max_segment_size(q, segment_size);
2304         blk_queue_io_min(q, segment_size);
2305         blk_queue_io_opt(q, segment_size);
2306
2307         blk_queue_merge_bvec(q, rbd_merge_bvec);
2308         disk->queue = q;
2309
2310         q->queuedata = rbd_dev;
2311
2312         rbd_dev->disk = disk;
2313
2314         set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);
2315
2316         return 0;
2317 out_disk:
2318         put_disk(disk);
2319
2320         return -ENOMEM;
2321 }
2322
2323 /*
2324   sysfs
2325 */
2326
2327 static struct rbd_device *dev_to_rbd_dev(struct device *dev)
2328 {
2329         return container_of(dev, struct rbd_device, dev);
2330 }
2331
2332 static ssize_t rbd_size_show(struct device *dev,
2333                              struct device_attribute *attr, char *buf)
2334 {
2335         struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2336         sector_t size;
2337
2338         down_read(&rbd_dev->header_rwsem);
2339         size = get_capacity(rbd_dev->disk);
2340         up_read(&rbd_dev->header_rwsem);
2341
2342         return sprintf(buf, "%llu\n", (unsigned long long) size * SECTOR_SIZE);
2343 }
2344
2345 /*
2346  * Note this shows the features for whatever's mapped, which is not
2347  * necessarily the base image.
2348  */
2349 static ssize_t rbd_features_show(struct device *dev,
2350                              struct device_attribute *attr, char *buf)
2351 {
2352         struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2353
2354         return sprintf(buf, "0x%016llx\n",
2355                         (unsigned long long) rbd_dev->mapping.features);
2356 }
2357
2358 static ssize_t rbd_major_show(struct device *dev,
2359                               struct device_attribute *attr, char *buf)
2360 {
2361         struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2362
2363         return sprintf(buf, "%d\n", rbd_dev->major);
2364 }
2365
2366 static ssize_t rbd_client_id_show(struct device *dev,
2367                                   struct device_attribute *attr, char *buf)
2368 {
2369         struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2370
2371         return sprintf(buf, "client%lld\n",
2372                         ceph_client_id(rbd_dev->rbd_client->client));
2373 }
2374
2375 static ssize_t rbd_pool_show(struct device *dev,
2376                              struct device_attribute *attr, char *buf)
2377 {
2378         struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2379
2380         return sprintf(buf, "%s\n", rbd_dev->spec->pool_name);
2381 }
2382
2383 static ssize_t rbd_pool_id_show(struct device *dev,
2384                              struct device_attribute *attr, char *buf)
2385 {
2386         struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2387
2388         return sprintf(buf, "%llu\n",
2389                 (unsigned long long) rbd_dev->spec->pool_id);
2390 }
2391
2392 static ssize_t rbd_name_show(struct device *dev,
2393                              struct device_attribute *attr, char *buf)
2394 {
2395         struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2396
2397         if (rbd_dev->spec->image_name)
2398                 return sprintf(buf, "%s\n", rbd_dev->spec->image_name);
2399
2400         return sprintf(buf, "(unknown)\n");
2401 }
2402
2403 static ssize_t rbd_image_id_show(struct device *dev,
2404                              struct device_attribute *attr, char *buf)
2405 {
2406         struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2407
2408         return sprintf(buf, "%s\n", rbd_dev->spec->image_id);
2409 }
2410
2411 /*
2412  * Shows the name of the currently-mapped snapshot (or
2413  * RBD_SNAP_HEAD_NAME for the base image).
2414  */
2415 static ssize_t rbd_snap_show(struct device *dev,
2416                              struct device_attribute *attr,
2417                              char *buf)
2418 {
2419         struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2420
2421         return sprintf(buf, "%s\n", rbd_dev->spec->snap_name);
2422 }
2423
2424 /*
2425  * For an rbd v2 image, shows the pool id, image id, and snapshot id
2426  * for the parent image.  If there is no parent, simply shows
2427  * "(no parent image)".
2428  */
2429 static ssize_t rbd_parent_show(struct device *dev,
2430                              struct device_attribute *attr,
2431                              char *buf)
2432 {
2433         struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2434         struct rbd_spec *spec = rbd_dev->parent_spec;
2435         int count;
2436         char *bufp = buf;
2437
2438         if (!spec)
2439                 return sprintf(buf, "(no parent image)\n");
2440
2441         count = sprintf(bufp, "pool_id %llu\npool_name %s\n",
2442                         (unsigned long long) spec->pool_id, spec->pool_name);
2443         if (count < 0)
2444                 return count;
2445         bufp += count;
2446
2447         count = sprintf(bufp, "image_id %s\nimage_name %s\n", spec->image_id,
2448                         spec->image_name ? spec->image_name : "(unknown)");
2449         if (count < 0)
2450                 return count;
2451         bufp += count;
2452
2453         count = sprintf(bufp, "snap_id %llu\nsnap_name %s\n",
2454                         (unsigned long long) spec->snap_id, spec->snap_name);
2455         if (count < 0)
2456                 return count;
2457         bufp += count;
2458
2459         count = sprintf(bufp, "overlap %llu\n", rbd_dev->parent_overlap);
2460         if (count < 0)
2461                 return count;
2462         bufp += count;
2463
2464         return (ssize_t) (bufp - buf);
2465 }
2466
2467 static ssize_t rbd_image_refresh(struct device *dev,
2468                                  struct device_attribute *attr,
2469                                  const char *buf,
2470                                  size_t size)
2471 {
2472         struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2473         int ret;
2474
2475         ret = rbd_dev_refresh(rbd_dev, NULL);
2476
2477         return ret < 0 ? ret : size;
2478 }
2479
/*
 * Per-device sysfs attributes.  All are read-only except "refresh",
 * which is write-only (owner) and has no show method.
 */
static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
static DEVICE_ATTR(features, S_IRUGO, rbd_features_show, NULL);
static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL);
static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
static DEVICE_ATTR(image_id, S_IRUGO, rbd_image_id_show, NULL);
static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);
static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);
static DEVICE_ATTR(parent, S_IRUGO, rbd_parent_show, NULL);

/* NULL-terminated list of the attributes declared above */
static struct attribute *rbd_attrs[] = {
        &dev_attr_size.attr,
        &dev_attr_features.attr,
        &dev_attr_major.attr,
        &dev_attr_client_id.attr,
        &dev_attr_pool.attr,
        &dev_attr_pool_id.attr,
        &dev_attr_name.attr,
        &dev_attr_image_id.attr,
        &dev_attr_current_snap.attr,
        &dev_attr_parent.attr,
        &dev_attr_refresh.attr,
        NULL
};

/* Unnamed group holding all of the per-device attributes */
static struct attribute_group rbd_attr_group = {
        .attrs = rbd_attrs,
};

/* NULL-terminated group list referenced by rbd_device_type */
static const struct attribute_group *rbd_attr_groups[] = {
        &rbd_attr_group,
        NULL
};
2515
/*
 * Release method for the rbd device type.  Intentionally empty:
 * nothing is freed here.
 *
 * NOTE(review): presumably the rbd_device embedding this struct
 * device is freed elsewhere -- confirm its lifetime cannot end
 * before the last reference to the device is dropped.
 */
static void rbd_sysfs_dev_release(struct device *dev)
{
}
2519
/*
 * Device type for rbd devices: supplies the type name, the sysfs
 * attribute groups and the (empty) release method.
 */
static struct device_type rbd_device_type = {
        .name           = "rbd",
        .groups         = rbd_attr_groups,
        .release        = rbd_sysfs_dev_release,
};
2525
2526
2527 /*
2528   sysfs - snapshots
2529 */
2530
2531 static ssize_t rbd_snap_size_show(struct device *dev,
2532                                   struct device_attribute *attr,
2533                                   char *buf)
2534 {
2535         struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2536
2537         return sprintf(buf, "%llu\n", (unsigned long long)snap->size);
2538 }
2539
2540 static ssize_t rbd_snap_id_show(struct device *dev,
2541                                 struct device_attribute *attr,
2542                                 char *buf)
2543 {
2544         struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2545
2546         return sprintf(buf, "%llu\n", (unsigned long long)snap->id);
2547 }
2548
2549 static ssize_t rbd_snap_features_show(struct device *dev,
2550                                 struct device_attribute *attr,
2551                                 char *buf)
2552 {
2553         struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2554
2555         return sprintf(buf, "0x%016llx\n",
2556                         (unsigned long long) snap->features);
2557 }
2558
2559 static DEVICE_ATTR(snap_size, S_IRUGO, rbd_snap_size_show, NULL);
2560 static DEVICE_ATTR(snap_id, S_IRUGO, rbd_snap_id_show, NULL);
2561 static DEVICE_ATTR(snap_features, S_IRUGO, rbd_snap_features_show, NULL);
2562
/* NULL-terminated table of the snapshot attributes declared above. */
static struct attribute *rbd_snap_attrs[] = {
	&dev_attr_snap_size.attr,
	&dev_attr_snap_id.attr,
	&dev_attr_snap_features.attr,
	NULL,
};
2569
/* Attribute group wrapping the rbd_snap_attrs table. */
static struct attribute_group rbd_snap_attr_group = {
	.attrs = rbd_snap_attrs,
};
2573
2574 static void rbd_snap_dev_release(struct device *dev)
2575 {
2576         struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2577         kfree(snap->name);
2578         kfree(snap);
2579 }
2580
/* NULL-terminated list of groups referenced by rbd_snap_device_type. */
static const struct attribute_group *rbd_snap_attr_groups[] = {
	&rbd_snap_attr_group,
	NULL
};
2585
/*
 * Device type assigned to snapshot devices in rbd_register_snap_dev().
 * rbd_snap_registered() compares a device's type against this object.
 */
static struct device_type rbd_snap_device_type = {
	.groups		= rbd_snap_attr_groups,
	.release	= rbd_snap_dev_release,
};
2590
2591 static struct rbd_spec *rbd_spec_get(struct rbd_spec *spec)
2592 {
2593         kref_get(&spec->kref);
2594
2595         return spec;
2596 }
2597
2598 static void rbd_spec_free(struct kref *kref);
2599 static void rbd_spec_put(struct rbd_spec *spec)
2600 {
2601         if (spec)
2602                 kref_put(&spec->kref, rbd_spec_free);
2603 }
2604
2605 static struct rbd_spec *rbd_spec_alloc(void)
2606 {
2607         struct rbd_spec *spec;
2608
2609         spec = kzalloc(sizeof (*spec), GFP_KERNEL);
2610         if (!spec)
2611                 return NULL;
2612         kref_init(&spec->kref);
2613
2614         rbd_spec_put(rbd_spec_get(spec));       /* TEMPORARY */
2615
2616         return spec;
2617 }
2618
2619 static void rbd_spec_free(struct kref *kref)
2620 {
2621         struct rbd_spec *spec = container_of(kref, struct rbd_spec, kref);
2622
2623         kfree(spec->pool_name);
2624         kfree(spec->image_id);
2625         kfree(spec->image_name);
2626         kfree(spec->snap_name);
2627         kfree(spec);
2628 }
2629
2630 static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
2631                                 struct rbd_spec *spec)
2632 {
2633         struct rbd_device *rbd_dev;
2634
2635         rbd_dev = kzalloc(sizeof (*rbd_dev), GFP_KERNEL);
2636         if (!rbd_dev)
2637                 return NULL;
2638
2639         spin_lock_init(&rbd_dev->lock);
2640         rbd_dev->flags = 0;
2641         INIT_LIST_HEAD(&rbd_dev->node);
2642         INIT_LIST_HEAD(&rbd_dev->snaps);
2643         init_rwsem(&rbd_dev->header_rwsem);
2644
2645         rbd_dev->spec = spec;
2646         rbd_dev->rbd_client = rbdc;
2647
2648         /* Initialize the layout used for all rbd requests */
2649
2650         rbd_dev->layout.fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
2651         rbd_dev->layout.fl_stripe_count = cpu_to_le32(1);
2652         rbd_dev->layout.fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
2653         rbd_dev->layout.fl_pg_pool = cpu_to_le32((u32) spec->pool_id);
2654
2655         return rbd_dev;
2656 }
2657
/*
 * Tear down an rbd_device: drop the parent spec reference (may be
 * NULL, rbd_spec_put() tolerates that), free the header object name,
 * release the client, drop the device's own spec reference and finally
 * free the structure.
 */
static void rbd_dev_destroy(struct rbd_device *rbd_dev)
{
	rbd_spec_put(rbd_dev->parent_spec);
	kfree(rbd_dev->header_name);
	rbd_put_client(rbd_dev->rbd_client);
	rbd_spec_put(rbd_dev->spec);
	kfree(rbd_dev);
}
2666
/*
 * Report whether a snapshot's device has been set up for registration,
 * judged by its device type having been set to rbd_snap_device_type
 * (which rbd_register_snap_dev() does).
 */
static bool rbd_snap_registered(struct rbd_snap *snap)
{
	bool ret = snap->dev.type == &rbd_snap_device_type;
	bool reg = device_is_registered(&snap->dev);

	/* "!ret ^ reg" is true exactly when ret and reg agree */
	rbd_assert(!ret ^ reg);

	return ret;
}
2676
/*
 * Unlink a snapshot from the list it is on and, if its device is
 * registered, unregister that device.
 *
 * NOTE(review): when the device is not registered nothing in this
 * function frees @snap or its name -- confirm whether another path
 * releases it or whether this leaks.
 */
static void rbd_remove_snap_dev(struct rbd_snap *snap)
{
	list_del(&snap->node);
	if (device_is_registered(&snap->dev))
		device_unregister(&snap->dev);
}
2683
2684 static int rbd_register_snap_dev(struct rbd_snap *snap,
2685                                   struct device *parent)
2686 {
2687         struct device *dev = &snap->dev;
2688         int ret;
2689
2690         dev->type = &rbd_snap_device_type;
2691         dev->parent = parent;
2692         dev->release = rbd_snap_dev_release;
2693         dev_set_name(dev, "%s%s", RBD_SNAP_DEV_NAME_PREFIX, snap->name);
2694         dout("%s: registering device for snapshot %s\n", __func__, snap->name);
2695
2696         ret = device_register(dev);
2697
2698         return ret;
2699 }
2700
2701 static struct rbd_snap *__rbd_add_snap_dev(struct rbd_device *rbd_dev,
2702                                                 const char *snap_name,
2703                                                 u64 snap_id, u64 snap_size,
2704                                                 u64 snap_features)
2705 {
2706         struct rbd_snap *snap;
2707         int ret;
2708
2709         snap = kzalloc(sizeof (*snap), GFP_KERNEL);
2710         if (!snap)
2711                 return ERR_PTR(-ENOMEM);
2712
2713         ret = -ENOMEM;
2714         snap->name = kstrdup(snap_name, GFP_KERNEL);
2715         if (!snap->name)
2716                 goto err;
2717
2718         snap->id = snap_id;
2719         snap->size = snap_size;
2720         snap->features = snap_features;
2721
2722         return snap;
2723
2724 err:
2725         kfree(snap->name);
2726         kfree(snap);
2727
2728         return ERR_PTR(ret);
2729 }
2730
2731 static char *rbd_dev_v1_snap_info(struct rbd_device *rbd_dev, u32 which,
2732                 u64 *snap_size, u64 *snap_features)
2733 {
2734         char *snap_name;
2735
2736         rbd_assert(which < rbd_dev->header.snapc->num_snaps);
2737
2738         *snap_size = rbd_dev->header.snap_sizes[which];
2739         *snap_features = 0;     /* No features for v1 */
2740
2741         /* Skip over names until we find the one we are looking for */
2742
2743         snap_name = rbd_dev->header.snap_names;
2744         while (which--)
2745                 snap_name += strlen(snap_name) + 1;
2746
2747         return snap_name;
2748 }
2749
2750 /*
2751  * Get the size and object order for an image snapshot, or if
2752  * snap_id is CEPH_NOSNAP, gets this information for the base
2753  * image.
2754  */
2755 static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
2756                                 u8 *order, u64 *snap_size)
2757 {
2758         __le64 snapid = cpu_to_le64(snap_id);
2759         int ret;
2760         struct {
2761                 u8 order;
2762                 __le64 size;
2763         } __attribute__ ((packed)) size_buf = { 0 };
2764
2765         ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
2766                                 "rbd", "get_size",
2767                                 (char *) &snapid, sizeof (snapid),
2768                                 (char *) &size_buf, sizeof (size_buf), NULL);
2769         dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
2770         if (ret < 0)
2771                 return ret;
2772
2773         *order = size_buf.order;
2774         *snap_size = le64_to_cpu(size_buf.size);
2775
2776         dout("  snap_id 0x%016llx order = %u, snap_size = %llu\n",
2777                 (unsigned long long) snap_id, (unsigned int) *order,
2778                 (unsigned long long) *snap_size);
2779
2780         return 0;
2781 }
2782
2783 static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev)
2784 {
2785         return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP,
2786                                         &rbd_dev->header.obj_order,
2787                                         &rbd_dev->header.image_size);
2788 }
2789
2790 static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
2791 {
2792         void *reply_buf;
2793         int ret;
2794         void *p;
2795
2796         reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL);
2797         if (!reply_buf)
2798                 return -ENOMEM;
2799
2800         ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
2801                                 "rbd", "get_object_prefix",
2802                                 NULL, 0,
2803                                 reply_buf, RBD_OBJ_PREFIX_LEN_MAX, NULL);
2804         dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
2805         if (ret < 0)
2806                 goto out;
2807
2808         p = reply_buf;
2809         rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p,
2810                                                 p + RBD_OBJ_PREFIX_LEN_MAX,
2811                                                 NULL, GFP_NOIO);
2812
2813         if (IS_ERR(rbd_dev->header.object_prefix)) {
2814                 ret = PTR_ERR(rbd_dev->header.object_prefix);
2815                 rbd_dev->header.object_prefix = NULL;
2816         } else {
2817                 dout("  object_prefix = %s\n", rbd_dev->header.object_prefix);
2818         }
2819
2820 out:
2821         kfree(reply_buf);
2822
2823         return ret;
2824 }
2825
2826 static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
2827                 u64 *snap_features)
2828 {
2829         __le64 snapid = cpu_to_le64(snap_id);
2830         struct {
2831                 __le64 features;
2832                 __le64 incompat;
2833         } features_buf = { 0 };
2834         u64 incompat;
2835         int ret;
2836
2837         ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
2838                                 "rbd", "get_features",
2839                                 (char *) &snapid, sizeof (snapid),
2840                                 (char *) &features_buf, sizeof (features_buf),
2841                                 NULL);
2842         dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
2843         if (ret < 0)
2844                 return ret;
2845
2846         incompat = le64_to_cpu(features_buf.incompat);
2847         if (incompat & ~RBD_FEATURES_ALL)
2848                 return -ENXIO;
2849
2850         *snap_features = le64_to_cpu(features_buf.features);
2851
2852         dout("  snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n",
2853                 (unsigned long long) snap_id,
2854                 (unsigned long long) *snap_features,
2855                 (unsigned long long) le64_to_cpu(features_buf.incompat));
2856
2857         return 0;
2858 }
2859
2860 static int rbd_dev_v2_features(struct rbd_device *rbd_dev)
2861 {
2862         return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP,
2863                                                 &rbd_dev->header.features);
2864 }
2865
2866 static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
2867 {
2868         struct rbd_spec *parent_spec;
2869         size_t size;
2870         void *reply_buf = NULL;
2871         __le64 snapid;
2872         void *p;
2873         void *end;
2874         char *image_id;
2875         u64 overlap;
2876         int ret;
2877
2878         parent_spec = rbd_spec_alloc();
2879         if (!parent_spec)
2880                 return -ENOMEM;
2881
2882         size = sizeof (__le64) +                                /* pool_id */
2883                 sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX +        /* image_id */
2884                 sizeof (__le64) +                               /* snap_id */
2885                 sizeof (__le64);                                /* overlap */
2886         reply_buf = kmalloc(size, GFP_KERNEL);
2887         if (!reply_buf) {
2888                 ret = -ENOMEM;
2889                 goto out_err;
2890         }
2891
2892         snapid = cpu_to_le64(CEPH_NOSNAP);
2893         ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
2894                                 "rbd", "get_parent",
2895                                 (char *) &snapid, sizeof (snapid),
2896                                 (char *) reply_buf, size, NULL);
2897         dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
2898         if (ret < 0)
2899                 goto out_err;
2900
2901         ret = -ERANGE;
2902         p = reply_buf;
2903         end = (char *) reply_buf + size;
2904         ceph_decode_64_safe(&p, end, parent_spec->pool_id, out_err);
2905         if (parent_spec->pool_id == CEPH_NOPOOL)
2906                 goto out;       /* No parent?  No problem. */
2907
2908         /* The ceph file layout needs to fit pool id in 32 bits */
2909
2910         ret = -EIO;
2911         if (WARN_ON(parent_spec->pool_id > (u64) U32_MAX))
2912                 goto out;
2913
2914         image_id = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
2915         if (IS_ERR(image_id)) {
2916                 ret = PTR_ERR(image_id);
2917                 goto out_err;
2918         }
2919         parent_spec->image_id = image_id;
2920         ceph_decode_64_safe(&p, end, parent_spec->snap_id, out_err);
2921         ceph_decode_64_safe(&p, end, overlap, out_err);
2922
2923         rbd_dev->parent_overlap = overlap;
2924         rbd_dev->parent_spec = parent_spec;
2925         parent_spec = NULL;     /* rbd_dev now owns this */
2926 out:
2927         ret = 0;
2928 out_err:
2929         kfree(reply_buf);
2930         rbd_spec_put(parent_spec);
2931
2932         return ret;
2933 }
2934
2935 static char *rbd_dev_image_name(struct rbd_device *rbd_dev)
2936 {
2937         size_t image_id_size;
2938         char *image_id;
2939         void *p;
2940         void *end;
2941         size_t size;
2942         void *reply_buf = NULL;
2943         size_t len = 0;
2944         char *image_name = NULL;
2945         int ret;
2946
2947         rbd_assert(!rbd_dev->spec->image_name);
2948
2949         len = strlen(rbd_dev->spec->image_id);
2950         image_id_size = sizeof (__le32) + len;
2951         image_id = kmalloc(image_id_size, GFP_KERNEL);
2952         if (!image_id)
2953                 return NULL;
2954
2955         p = image_id;
2956         end = (char *) image_id + image_id_size;
2957         ceph_encode_string(&p, end, rbd_dev->spec->image_id, (u32) len);
2958
2959         size = sizeof (__le32) + RBD_IMAGE_NAME_LEN_MAX;
2960         reply_buf = kmalloc(size, GFP_KERNEL);
2961         if (!reply_buf)
2962                 goto out;
2963
2964         ret = rbd_obj_method_sync(rbd_dev, RBD_DIRECTORY,
2965                                 "rbd", "dir_get_name",
2966                                 image_id, image_id_size,
2967                                 (char *) reply_buf, size, NULL);
2968         if (ret < 0)
2969                 goto out;
2970         p = reply_buf;
2971         end = (char *) reply_buf + size;
2972         image_name = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL);
2973         if (IS_ERR(image_name))
2974                 image_name = NULL;
2975         else
2976                 dout("%s: name is %s len is %zd\n", __func__, image_name, len);
2977 out:
2978         kfree(reply_buf);
2979         kfree(image_id);
2980
2981         return image_name;
2982 }
2983
2984 /*
2985  * When a parent image gets probed, we only have the pool, image,
2986  * and snapshot ids but not the names of any of them.  This call
2987  * is made later to fill in those names.  It has to be done after
2988  * rbd_dev_snaps_update() has completed because some of the
2989  * information (in particular, snapshot name) is not available
2990  * until then.
2991  */
2992 static int rbd_dev_probe_update_spec(struct rbd_device *rbd_dev)
2993 {
2994         struct ceph_osd_client *osdc;
2995         const char *name;
2996         void *reply_buf = NULL;
2997         int ret;
2998
2999         if (rbd_dev->spec->pool_name)
3000                 return 0;       /* Already have the names */
3001
3002         /* Look up the pool name */
3003
3004         osdc = &rbd_dev->rbd_client->client->osdc;
3005         name = ceph_pg_pool_name_by_id(osdc->osdmap, rbd_dev->spec->pool_id);
3006         if (!name) {
3007                 rbd_warn(rbd_dev, "there is no pool with id %llu",
3008                         rbd_dev->spec->pool_id);        /* Really a BUG() */
3009                 return -EIO;
3010         }
3011
3012         rbd_dev->spec->pool_name = kstrdup(name, GFP_KERNEL);
3013         if (!rbd_dev->spec->pool_name)
3014                 return -ENOMEM;
3015
3016         /* Fetch the image name; tolerate failure here */
3017
3018         name = rbd_dev_image_name(rbd_dev);
3019         if (name)
3020                 rbd_dev->spec->image_name = (char *) name;
3021         else
3022                 rbd_warn(rbd_dev, "unable to get image name");
3023
3024         /* Look up the snapshot name. */
3025
3026         name = rbd_snap_name(rbd_dev, rbd_dev->spec->snap_id);
3027         if (!name) {
3028                 rbd_warn(rbd_dev, "no snapshot with id %llu",
3029                         rbd_dev->spec->snap_id);        /* Really a BUG() */
3030                 ret = -EIO;
3031                 goto out_err;
3032         }
3033         rbd_dev->spec->snap_name = kstrdup(name, GFP_KERNEL);
3034         if(!rbd_dev->spec->snap_name)
3035                 goto out_err;
3036
3037         return 0;
3038 out_err:
3039         kfree(reply_buf);
3040         kfree(rbd_dev->spec->pool_name);
3041         rbd_dev->spec->pool_name = NULL;
3042
3043         return ret;
3044 }
3045
3046 static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev, u64 *ver)
3047 {
3048         size_t size;
3049         int ret;
3050         void *reply_buf;
3051         void *p;
3052         void *end;
3053         u64 seq;
3054         u32 snap_count;
3055         struct ceph_snap_context *snapc;
3056         u32 i;
3057
3058         /*
3059          * We'll need room for the seq value (maximum snapshot id),
3060          * snapshot count, and array of that many snapshot ids.
3061          * For now we have a fixed upper limit on the number we're
3062          * prepared to receive.
3063          */
3064         size = sizeof (__le64) + sizeof (__le32) +
3065                         RBD_MAX_SNAP_COUNT * sizeof (__le64);
3066         reply_buf = kzalloc(size, GFP_KERNEL);
3067         if (!reply_buf)
3068                 return -ENOMEM;
3069
3070         ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
3071                                 "rbd", "get_snapcontext",
3072                                 NULL, 0,
3073                                 reply_buf, size, ver);
3074         dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
3075         if (ret < 0)
3076                 goto out;
3077
3078         ret = -ERANGE;
3079         p = reply_buf;
3080         end = (char *) reply_buf + size;
3081         ceph_decode_64_safe(&p, end, seq, out);
3082         ceph_decode_32_safe(&p, end, snap_count, out);
3083
3084         /*
3085          * Make sure the reported number of snapshot ids wouldn't go
3086          * beyond the end of our buffer.  But before checking that,
3087          * make sure the computed size of the snapshot context we
3088          * allocate is representable in a size_t.
3089          */
3090         if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context))
3091                                  / sizeof (u64)) {
3092                 ret = -EINVAL;
3093                 goto out;
3094         }
3095         if (!ceph_has_room(&p, end, snap_count * sizeof (__le64)))
3096                 goto out;
3097
3098         size = sizeof (struct ceph_snap_context) +
3099                                 snap_count * sizeof (snapc->snaps[0]);
3100         snapc = kmalloc(size, GFP_KERNEL);
3101         if (!snapc) {
3102                 ret = -ENOMEM;
3103                 goto out;
3104         }
3105
3106         atomic_set(&snapc->nref, 1);
3107         snapc->seq = seq;
3108         snapc->num_snaps = snap_count;
3109         for (i = 0; i < snap_count; i++)
3110                 snapc->snaps[i] = ceph_decode_64(&p);
3111
3112         rbd_dev->header.snapc = snapc;
3113
3114         dout("  snap context seq = %llu, snap_count = %u\n",
3115                 (unsigned long long) seq, (unsigned int) snap_count);
3116
3117 out:
3118         kfree(reply_buf);
3119
3120         return 0;
3121 }
3122
3123 static char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, u32 which)
3124 {
3125         size_t size;
3126         void *reply_buf;
3127         __le64 snap_id;
3128         int ret;
3129         void *p;
3130         void *end;
3131         char *snap_name;
3132
3133         size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN;
3134         reply_buf = kmalloc(size, GFP_KERNEL);
3135         if (!reply_buf)
3136                 return ERR_PTR(-ENOMEM);
3137
3138         snap_id = cpu_to_le64(rbd_dev->header.snapc->snaps[which]);
3139         ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
3140                                 "rbd", "get_snapshot_name",
3141                                 (char *) &snap_id, sizeof (snap_id),
3142                                 reply_buf, size, NULL);
3143         dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
3144         if (ret < 0)
3145                 goto out;
3146
3147         p = reply_buf;
3148         end = (char *) reply_buf + size;
3149         snap_name = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
3150         if (IS_ERR(snap_name)) {
3151                 ret = PTR_ERR(snap_name);
3152                 goto out;
3153         } else {
3154                 dout("  snap_id 0x%016llx snap_name = %s\n",
3155                         (unsigned long long) le64_to_cpu(snap_id), snap_name);
3156         }
3157         kfree(reply_buf);
3158
3159         return snap_name;
3160 out:
3161         kfree(reply_buf);
3162
3163         return ERR_PTR(ret);
3164 }
3165
3166 static char *rbd_dev_v2_snap_info(struct rbd_device *rbd_dev, u32 which,
3167                 u64 *snap_size, u64 *snap_features)
3168 {
3169         u64 snap_id;
3170         u8 order;
3171         int ret;
3172
3173         snap_id = rbd_dev->header.snapc->snaps[which];
3174         ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, &order, snap_size);
3175         if (ret)
3176                 return ERR_PTR(ret);
3177         ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, snap_features);
3178         if (ret)
3179                 return ERR_PTR(ret);
3180
3181         return rbd_dev_v2_snap_name(rbd_dev, which);
3182 }
3183
3184 static char *rbd_dev_snap_info(struct rbd_device *rbd_dev, u32 which,
3185                 u64 *snap_size, u64 *snap_features)
3186 {
3187         if (rbd_dev->image_format == 1)
3188                 return rbd_dev_v1_snap_info(rbd_dev, which,
3189                                         snap_size, snap_features);
3190         if (rbd_dev->image_format == 2)
3191                 return rbd_dev_v2_snap_info(rbd_dev, which,
3192                                         snap_size, snap_features);
3193         return ERR_PTR(-EINVAL);
3194 }
3195
3196 static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver)
3197 {
3198         int ret;
3199         __u8 obj_order;
3200
3201         down_write(&rbd_dev->header_rwsem);
3202
3203         /* Grab old order first, to see if it changes */
3204
3205         obj_order = rbd_dev->header.obj_order,
3206         ret = rbd_dev_v2_image_size(rbd_dev);
3207         if (ret)
3208                 goto out;
3209         if (rbd_dev->header.obj_order != obj_order) {
3210                 ret = -EIO;
3211                 goto out;
3212         }
3213         rbd_update_mapping_size(rbd_dev);
3214
3215         ret = rbd_dev_v2_snap_context(rbd_dev, hver);
3216         dout("rbd_dev_v2_snap_context returned %d\n", ret);
3217         if (ret)
3218                 goto out;
3219         ret = rbd_dev_snaps_update(rbd_dev);
3220         dout("rbd_dev_snaps_update returned %d\n", ret);
3221         if (ret)
3222                 goto out;
3223         ret = rbd_dev_snaps_register(rbd_dev);
3224         dout("rbd_dev_snaps_register returned %d\n", ret);
3225 out:
3226         up_write(&rbd_dev->header_rwsem);
3227
3228         return ret;
3229 }
3230
3231 /*
3232  * Scan the rbd device's current snapshot list and compare it to the
3233  * newly-received snapshot context.  Remove any existing snapshots
3234  * not present in the new snapshot context.  Add a new snapshot for
3235  * any snaphots in the snapshot context not in the current list.
3236  * And verify there are no changes to snapshots we already know
3237  * about.
3238  *
3239  * Assumes the snapshots in the snapshot context are sorted by
3240  * snapshot id, highest id first.  (Snapshots in the rbd_dev's list
3241  * are also maintained in that order.)
3242  */
3243 static int rbd_dev_snaps_update(struct rbd_device *rbd_dev)
3244 {
3245         struct ceph_snap_context *snapc = rbd_dev->header.snapc;
3246         const u32 snap_count = snapc->num_snaps;
3247         struct list_head *head = &rbd_dev->snaps;
3248         struct list_head *links = head->next;
3249         u32 index = 0;
3250
3251         dout("%s: snap count is %u\n", __func__, (unsigned int) snap_count);
3252         while (index < snap_count || links != head) {
3253                 u64 snap_id;
3254                 struct rbd_snap *snap;
3255                 char *snap_name;
3256                 u64 snap_size = 0;
3257                 u64 snap_features = 0;
3258
3259                 snap_id = index < snap_count ? snapc->snaps[index]
3260                                              : CEPH_NOSNAP;
3261                 snap = links != head ? list_entry(links, struct rbd_snap, node)
3262                                      : NULL;
3263                 rbd_assert(!snap || snap->id != CEPH_NOSNAP);
3264
3265                 if (snap_id == CEPH_NOSNAP || (snap && snap->id > snap_id)) {
3266                         struct list_head *next = links->next;
3267
3268                         /*
3269                          * A previously-existing snapshot is not in
3270                          * the new snap context.
3271                          *
3272                          * If the now missing snapshot is the one the
3273                          * image is mapped to, clear its exists flag
3274                          * so we can avoid sending any more requests
3275                          * to it.
3276                          */
3277                         if (rbd_dev->spec->snap_id == snap->id)
3278                                 clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
3279                         rbd_remove_snap_dev(snap);
3280                         dout("%ssnap id %llu has been removed\n",
3281                                 rbd_dev->spec->snap_id == snap->id ?
3282                                                         "mapped " : "",
3283                                 (unsigned long long) snap->id);
3284
3285                         /* Done with this list entry; advance */
3286
3287                         links = next;
3288                         continue;
3289                 }
3290
3291                 snap_name = rbd_dev_snap_info(rbd_dev, index,
3292                                         &snap_size, &snap_features);
3293                 if (IS_ERR(snap_name))
3294                         return PTR_ERR(snap_name);
3295
3296                 dout("entry %u: snap_id = %llu\n", (unsigned int) snap_count,
3297                         (unsigned long long) snap_id);
3298                 if (!snap || (snap_id != CEPH_NOSNAP && snap->id < snap_id)) {
3299                         struct rbd_snap *new_snap;
3300
3301                         /* We haven't seen this snapshot before */
3302
3303                         new_snap = __rbd_add_snap_dev(rbd_dev, snap_name,
3304                                         snap_id, snap_size, snap_features);
3305                         if (IS_ERR(new_snap)) {
3306                                 int err = PTR_ERR(new_snap);
3307
3308                                 dout("  failed to add dev, error %d\n", err);
3309
3310                                 return err;
3311                         }
3312
3313                         /* New goes before existing, or at end of list */
3314
3315                         dout("  added dev%s\n", snap ? "" : " at end\n");
3316                         if (snap)
3317                                 list_add_tail(&new_snap->node, &snap->node);
3318                         else
3319                                 list_add_tail(&new_snap->node, head);
3320                 } else {
3321                         /* Already have this one */
3322
3323                         dout("  already present\n");
3324
3325                         rbd_assert(snap->size == snap_size);
3326                         rbd_assert(!strcmp(snap->name, snap_name));
3327                         rbd_assert(snap->features == snap_features);
3328
3329                         /* Done with this list entry; advance */
3330
3331                         links = links->next;
3332                 }
3333
3334                 /* Advance to the next entry in the snapshot context */
3335
3336                 index++;
3337         }
3338         dout("%s: done\n", __func__);
3339
3340         return 0;
3341 }
3342
3343 /*
3344  * Scan the list of snapshots and register the devices for any that
3345  * have not already been registered.
3346  */
3347 static int rbd_dev_snaps_register(struct rbd_device *rbd_dev)
3348 {
3349         struct rbd_snap *snap;
3350         int ret = 0;
3351
3352         dout("%s:\n", __func__);
3353         if (WARN_ON(!device_is_registered(&rbd_dev->dev)))
3354                 return -EIO;
3355
3356         list_for_each_entry(snap, &rbd_dev->snaps, node) {
3357                 if (!rbd_snap_registered(snap)) {
3358                         ret = rbd_register_snap_dev(snap, &rbd_dev->dev);
3359                         if (ret < 0)
3360                                 break;
3361                 }
3362         }
3363         dout("%s: returning %d\n", __func__, ret);
3364
3365         return ret;
3366 }
3367
3368 static int rbd_bus_add_dev(struct rbd_device *rbd_dev)
3369 {
3370         struct device *dev;
3371         int ret;
3372
3373         mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
3374
3375         dev = &rbd_dev->dev;
3376         dev->bus = &rbd_bus_type;
3377         dev->type = &rbd_device_type;
3378         dev->parent = &rbd_root_dev;
3379         dev->release = rbd_dev_release;
3380         dev_set_name(dev, "%d", rbd_dev->dev_id);
3381         ret = device_register(dev);
3382
3383         mutex_unlock(&ctl_mutex);
3384
3385         return ret;
3386 }
3387
3388 static void rbd_bus_del_dev(struct rbd_device *rbd_dev)
3389 {
3390         device_unregister(&rbd_dev->dev);
3391 }
3392
/* Highest device id handed out so far; 0 means none has been assigned. */
static atomic64_t rbd_dev_id_max = ATOMIC64_INIT(0);
3394
3395 /*
3396  * Get a unique rbd identifier for the given new rbd_dev, and add
3397  * the rbd_dev to the global list.  The minimum rbd id is 1.
3398  */
3399 static void rbd_dev_id_get(struct rbd_device *rbd_dev)
3400 {
3401         rbd_dev->dev_id = atomic64_inc_return(&rbd_dev_id_max);
3402
3403         spin_lock(&rbd_dev_list_lock);
3404         list_add_tail(&rbd_dev->node, &rbd_dev_list);
3405         spin_unlock(&rbd_dev_list_lock);
3406         dout("rbd_dev %p given dev id %llu\n", rbd_dev,
3407                 (unsigned long long) rbd_dev->dev_id);
3408 }
3409
3410 /*
3411  * Remove an rbd_dev from the global list, and record that its
3412  * identifier is no longer in use.
3413  */
3414 static void rbd_dev_id_put(struct rbd_device *rbd_dev)
3415 {
3416         struct list_head *tmp;
3417         int rbd_id = rbd_dev->dev_id;
3418         int max_id;
3419
3420         rbd_assert(rbd_id > 0);
3421
3422         dout("rbd_dev %p released dev id %llu\n", rbd_dev,
3423                 (unsigned long long) rbd_dev->dev_id);
3424         spin_lock(&rbd_dev_list_lock);
3425         list_del_init(&rbd_dev->node);
3426
3427         /*
3428          * If the id being "put" is not the current maximum, there
3429          * is nothing special we need to do.
3430          */
3431         if (rbd_id != atomic64_read(&rbd_dev_id_max)) {
3432                 spin_unlock(&rbd_dev_list_lock);
3433                 return;
3434         }
3435
3436         /*
3437          * We need to update the current maximum id.  Search the
3438          * list to find out what it is.  We're more likely to find
3439          * the maximum at the end, so search the list backward.
3440          */
3441         max_id = 0;
3442         list_for_each_prev(tmp, &rbd_dev_list) {
3443                 struct rbd_device *rbd_dev;
3444
3445                 rbd_dev = list_entry(tmp, struct rbd_device, node);
3446                 if (rbd_dev->dev_id > max_id)
3447                         max_id = rbd_dev->dev_id;
3448         }
3449         spin_unlock(&rbd_dev_list_lock);
3450
3451         /*
3452          * The max id could have been updated by rbd_dev_id_get(), in
3453          * which case it now accurately reflects the new maximum.
3454          * Be careful not to overwrite the maximum value in that
3455          * case.
3456          */
3457         atomic64_cmpxchg(&rbd_dev_id_max, rbd_id, max_id);
3458         dout("  max dev id has been reset\n");
3459 }
3460
/*
 * Advances *buf past any leading white space so that it points at
 * the first non-space character (or at the terminating '\0' if there
 * is none).  Returns the length of the token found there, i.e. the
 * number of consecutive non-white-space characters.  The string at
 * *buf must be '\0'-terminated.
 */
static inline size_t next_token(const char **buf)
{
	/*
	 * Exactly the characters for which isspace() is nonzero in the
	 * "C" and "POSIX" locales.
	 */
	static const char whitespace[] = " \f\n\r\t\v";
	const char *start = *buf + strspn(*buf, whitespace);

	*buf = start;			/* Start of the token */

	return strcspn(start, whitespace);	/* Its length */
}
3479
/*
 * Locates the next token in *buf and, provided it fits, copies it
 * into the caller's token buffer with a terminating '\0'.  The
 * string at *buf must be '\0'-terminated on entry.
 *
 * Returns the length of the token (excluding the '\0').  A return
 * of 0 means no token was found; a return >= token_size means the
 * token did not fit and nothing was copied.
 *
 * *buf is always advanced past the token, whether or not it was
 * copied.
 */
static inline size_t copy_token(const char **buf,
				char *token,
				size_t token_size)
{
	size_t token_len = next_token(buf);
	const char *start = *buf;

	*buf = start + token_len;

	/* Token plus terminator must fit in the destination */
	if (token_len >= token_size)
		return token_len;

	memcpy(token, start, token_len);
	token[token_len] = '\0';

	return token_len;
}
3509
3510 /*
3511  * Finds the next token in *buf, dynamically allocates a buffer big
3512  * enough to hold a copy of it, and copies the token into the new
3513  * buffer.  The copy is guaranteed to be terminated with '\0'.  Note
3514  * that a duplicate buffer is created even for a zero-length token.
3515  *
3516  * Returns a pointer to the newly-allocated duplicate, or a null
3517  * pointer if memory for the duplicate was not available.  If
3518  * the lenp argument is a non-null pointer, the length of the token
3519  * (not including the '\0') is returned in *lenp.
3520  *
3521  * If successful, the *buf pointer will be updated to point beyond
3522  * the end of the found token.
3523  *
3524  * Note: uses GFP_KERNEL for allocation.
3525  */
3526 static inline char *dup_token(const char **buf, size_t *lenp)
3527 {
3528         char *dup;
3529         size_t len;
3530
3531         len = next_token(buf);
3532         dup = kmemdup(*buf, len + 1, GFP_KERNEL);
3533         if (!dup)
3534                 return NULL;
3535         *(dup + len) = '\0';
3536         *buf += len;
3537
3538         if (lenp)
3539                 *lenp = len;
3540
3541         return dup;
3542 }
3543
3544 /*
3545  * Parse the options provided for an "rbd add" (i.e., rbd image
3546  * mapping) request.  These arrive via a write to /sys/bus/rbd/add,
3547  * and the data written is passed here via a NUL-terminated buffer.
3548  * Returns 0 if successful or an error code otherwise.
3549  *
3550  * The information extracted from these options is recorded in
3551  * the other parameters which return dynamically-allocated
3552  * structures:
3553  *  ceph_opts
3554  *      The address of a pointer that will refer to a ceph options
3555  *      structure.  Caller must release the returned pointer using
3556  *      ceph_destroy_options() when it is no longer needed.
3557  *  rbd_opts
3558  *      Address of an rbd options pointer.  Fully initialized by
3559  *      this function; caller must release with kfree().
3560  *  spec
3561  *      Address of an rbd image specification pointer.  Fully
3562  *      initialized by this function based on parsed options.
3563  *      Caller must release with rbd_spec_put().
3564  *
3565  * The options passed take this form:
3566  *  <mon_addrs> <options> <pool_name> <image_name> [<snap_id>]
3567  * where:
3568  *  <mon_addrs>
3569  *      A comma-separated list of one or more monitor addresses.
3570  *      A monitor address is an ip address, optionally followed
3571  *      by a port number (separated by a colon).
3572  *        I.e.:  ip1[:port1][,ip2[:port2]...]
3573  *  <options>
3574  *      A comma-separated list of ceph and/or rbd options.
3575  *  <pool_name>
3576  *      The name of the rados pool containing the rbd image.
3577  *  <image_name>
3578  *      The name of the image in that pool to map.
3579  *  <snap_id>
3580  *      An optional snapshot id.  If provided, the mapping will
3581  *      present data from the image at the time that snapshot was
3582  *      created.  The image head is used if no snapshot id is
3583  *      provided.  Snapshot mappings are always read-only.
3584  */
3585 static int rbd_add_parse_args(const char *buf,
3586                                 struct ceph_options **ceph_opts,
3587                                 struct rbd_options **opts,
3588                                 struct rbd_spec **rbd_spec)
3589 {
3590         size_t len;
3591         char *options;
3592         const char *mon_addrs;
3593         size_t mon_addrs_size;
3594         struct rbd_spec *spec = NULL;
3595         struct rbd_options *rbd_opts = NULL;
3596         struct ceph_options *copts;
3597         int ret;
3598
3599         /* The first four tokens are required */
3600
3601         len = next_token(&buf);
3602         if (!len) {
3603                 rbd_warn(NULL, "no monitor address(es) provided");
3604                 return -EINVAL;
3605         }
3606         mon_addrs = buf;
3607         mon_addrs_size = len + 1;
3608         buf += len;
3609
3610         ret = -EINVAL;
3611         options = dup_token(&buf, NULL);
3612         if (!options)
3613                 return -ENOMEM;
3614         if (!*options) {
3615                 rbd_warn(NULL, "no options provided");
3616                 goto out_err;
3617         }
3618
3619         spec = rbd_spec_alloc();
3620         if (!spec)
3621                 goto out_mem;
3622
3623         spec->pool_name = dup_token(&buf, NULL);
3624         if (!spec->pool_name)
3625                 goto out_mem;
3626         if (!*spec->pool_name) {
3627                 rbd_warn(NULL, "no pool name provided");
3628                 goto out_err;
3629         }
3630
3631         spec->image_name = dup_token(&buf, NULL);
3632         if (!spec->image_name)
3633                 goto out_mem;
3634         if (!*spec->image_name) {
3635                 rbd_warn(NULL, "no image name provided");
3636                 goto out_err;
3637         }
3638
3639         /*
3640          * Snapshot name is optional; default is to use "-"
3641          * (indicating the head/no snapshot).
3642          */
3643         len = next_token(&buf);
3644         if (!len) {
3645                 buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */
3646                 len = sizeof (RBD_SNAP_HEAD_NAME) - 1;
3647         } else if (len > RBD_MAX_SNAP_NAME_LEN) {
3648                 ret = -ENAMETOOLONG;
3649                 goto out_err;
3650         }
3651         spec->snap_name = kmemdup(buf, len + 1, GFP_KERNEL);
3652         if (!spec->snap_name)
3653                 goto out_mem;
3654         *(spec->snap_name + len) = '\0';
3655
3656         /* Initialize all rbd options to the defaults */
3657
3658         rbd_opts = kzalloc(sizeof (*rbd_opts), GFP_KERNEL);
3659         if (!rbd_opts)
3660                 goto out_mem;
3661
3662         rbd_opts->read_only = RBD_READ_ONLY_DEFAULT;
3663
3664         copts = ceph_parse_options(options, mon_addrs,
3665                                         mon_addrs + mon_addrs_size - 1,
3666                                         parse_rbd_opts_token, rbd_opts);
3667         if (IS_ERR(copts)) {
3668                 ret = PTR_ERR(copts);
3669                 goto out_err;
3670         }
3671         kfree(options);
3672
3673         *ceph_opts = copts;
3674         *opts = rbd_opts;
3675         *rbd_spec = spec;
3676
3677         return 0;
3678 out_mem:
3679         ret = -ENOMEM;
3680 out_err:
3681         kfree(rbd_opts);
3682         rbd_spec_put(spec);
3683         kfree(options);
3684
3685         return ret;
3686 }
3687
3688 /*
3689  * An rbd format 2 image has a unique identifier, distinct from the
3690  * name given to it by the user.  Internally, that identifier is
3691  * what's used to specify the names of objects related to the image.
3692  *
3693  * A special "rbd id" object is used to map an rbd image name to its
3694  * id.  If that object doesn't exist, then there is no v2 rbd image
3695  * with the supplied name.
3696  *
3697  * This function will record the given rbd_dev's image_id field if
3698  * it can be determined, and in that case will return 0.  If any
3699  * errors occur a negative errno will be returned and the rbd_dev's
3700  * image_id field will be unchanged (and should be NULL).
3701  */
3702 static int rbd_dev_image_id(struct rbd_device *rbd_dev)
3703 {
3704         int ret;
3705         size_t size;
3706         char *object_name;
3707         void *response;
3708         void *p;
3709
3710         /*
3711          * When probing a parent image, the image id is already
3712          * known (and the image name likely is not).  There's no
3713          * need to fetch the image id again in this case.
3714          */
3715         if (rbd_dev->spec->image_id)
3716                 return 0;
3717
3718         /*
3719          * First, see if the format 2 image id file exists, and if
3720          * so, get the image's persistent id from it.
3721          */
3722         size = sizeof (RBD_ID_PREFIX) + strlen(rbd_dev->spec->image_name);
3723         object_name = kmalloc(size, GFP_NOIO);
3724         if (!object_name)
3725                 return -ENOMEM;
3726         sprintf(object_name, "%s%s", RBD_ID_PREFIX, rbd_dev->spec->image_name);
3727         dout("rbd id object name is %s\n", object_name);
3728
3729         /* Response will be an encoded string, which includes a length */
3730
3731         size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX;
3732         response = kzalloc(size, GFP_NOIO);
3733         if (!response) {
3734                 ret = -ENOMEM;
3735                 goto out;
3736         }
3737
3738         ret = rbd_obj_method_sync(rbd_dev, object_name,
3739                                 "rbd", "get_id",
3740                                 NULL, 0,
3741                                 response, RBD_IMAGE_ID_LEN_MAX, NULL);
3742         dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
3743         if (ret < 0)
3744                 goto out;
3745
3746         p = response;
3747         rbd_dev->spec->image_id = ceph_extract_encoded_string(&p,
3748                                                 p + RBD_IMAGE_ID_LEN_MAX,
3749                                                 NULL, GFP_NOIO);
3750         if (IS_ERR(rbd_dev->spec->image_id)) {
3751                 ret = PTR_ERR(rbd_dev->spec->image_id);
3752                 rbd_dev->spec->image_id = NULL;
3753         } else {
3754                 dout("image_id is %s\n", rbd_dev->spec->image_id);
3755         }
3756 out:
3757         kfree(response);
3758         kfree(object_name);
3759
3760         return ret;
3761 }
3762
3763 static int rbd_dev_v1_probe(struct rbd_device *rbd_dev)
3764 {
3765         int ret;
3766         size_t size;
3767
3768         /* Version 1 images have no id; empty string is used */
3769
3770         rbd_dev->spec->image_id = kstrdup("", GFP_KERNEL);
3771         if (!rbd_dev->spec->image_id)
3772                 return -ENOMEM;
3773
3774         /* Record the header object name for this rbd image. */
3775
3776         size = strlen(rbd_dev->spec->image_name) + sizeof (RBD_SUFFIX);
3777         rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
3778         if (!rbd_dev->header_name) {
3779                 ret = -ENOMEM;
3780                 goto out_err;
3781         }
3782         sprintf(rbd_dev->header_name, "%s%s",
3783                 rbd_dev->spec->image_name, RBD_SUFFIX);
3784
3785         /* Populate rbd image metadata */
3786
3787         ret = rbd_read_header(rbd_dev, &rbd_dev->header);
3788         if (ret < 0)
3789                 goto out_err;
3790
3791         /* Version 1 images have no parent (no layering) */
3792
3793         rbd_dev->parent_spec = NULL;
3794         rbd_dev->parent_overlap = 0;
3795
3796         rbd_dev->image_format = 1;
3797
3798         dout("discovered version 1 image, header name is %s\n",
3799                 rbd_dev->header_name);
3800
3801         return 0;
3802
3803 out_err:
3804         kfree(rbd_dev->header_name);
3805         rbd_dev->header_name = NULL;
3806         kfree(rbd_dev->spec->image_id);
3807         rbd_dev->spec->image_id = NULL;
3808
3809         return ret;
3810 }
3811
3812 static int rbd_dev_v2_probe(struct rbd_device *rbd_dev)
3813 {
3814         size_t size;
3815         int ret;
3816         u64 ver = 0;
3817
3818         /*
3819          * Image id was filled in by the caller.  Record the header
3820          * object name for this rbd image.
3821          */
3822         size = sizeof (RBD_HEADER_PREFIX) + strlen(rbd_dev->spec->image_id);
3823         rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
3824         if (!rbd_dev->header_name)
3825                 return -ENOMEM;
3826         sprintf(rbd_dev->header_name, "%s%s",
3827                         RBD_HEADER_PREFIX, rbd_dev->spec->image_id);
3828
3829         /* Get the size and object order for the image */
3830
3831         ret = rbd_dev_v2_image_size(rbd_dev);
3832         if (ret < 0)
3833                 goto out_err;
3834
3835         /* Get the object prefix (a.k.a. block_name) for the image */
3836
3837         ret = rbd_dev_v2_object_prefix(rbd_dev);
3838         if (ret < 0)
3839                 goto out_err;
3840
3841         /* Get the and check features for the image */
3842
3843         ret = rbd_dev_v2_features(rbd_dev);
3844         if (ret < 0)
3845                 goto out_err;
3846
3847         /* If the image supports layering, get the parent info */
3848
3849         if (rbd_dev->header.features & RBD_FEATURE_LAYERING) {
3850                 ret = rbd_dev_v2_parent_info(rbd_dev);
3851                 if (ret < 0)
3852                         goto out_err;
3853         }
3854
3855         /* crypto and compression type aren't (yet) supported for v2 images */
3856
3857         rbd_dev->header.crypt_type = 0;
3858         rbd_dev->header.comp_type = 0;
3859
3860         /* Get the snapshot context, plus the header version */
3861
3862         ret = rbd_dev_v2_snap_context(rbd_dev, &ver);
3863         if (ret)
3864                 goto out_err;
3865         rbd_dev->header.obj_version = ver;
3866
3867         rbd_dev->image_format = 2;
3868
3869         dout("discovered version 2 image, header name is %s\n",
3870                 rbd_dev->header_name);
3871
3872         return 0;
3873 out_err:
3874         rbd_dev->parent_overlap = 0;
3875         rbd_spec_put(rbd_dev->parent_spec);
3876         rbd_dev->parent_spec = NULL;
3877         kfree(rbd_dev->header_name);
3878         rbd_dev->header_name = NULL;
3879         kfree(rbd_dev->header.object_prefix);
3880         rbd_dev->header.object_prefix = NULL;
3881
3882         return ret;
3883 }
3884
3885 static int rbd_dev_probe_finish(struct rbd_device *rbd_dev)
3886 {
3887         int ret;
3888
3889         /* no need to lock here, as rbd_dev is not registered yet */
3890         ret = rbd_dev_snaps_update(rbd_dev);
3891         if (ret)
3892                 return ret;
3893
3894         ret = rbd_dev_probe_update_spec(rbd_dev);
3895         if (ret)
3896                 goto err_out_snaps;
3897
3898         ret = rbd_dev_set_mapping(rbd_dev);
3899         if (ret)
3900                 goto err_out_snaps;
3901
3902         /* generate unique id: find highest unique id, add one */
3903         rbd_dev_id_get(rbd_dev);
3904
3905         /* Fill in the device name, now that we have its id. */
3906         BUILD_BUG_ON(DEV_NAME_LEN
3907                         < sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH);
3908         sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id);
3909
3910         /* Get our block major device number. */
3911
3912         ret = register_blkdev(0, rbd_dev->name);
3913         if (ret < 0)
3914                 goto err_out_id;
3915         rbd_dev->major = ret;
3916
3917         /* Set up the blkdev mapping. */
3918
3919         ret = rbd_init_disk(rbd_dev);
3920         if (ret)
3921                 goto err_out_blkdev;
3922
3923         ret = rbd_bus_add_dev(rbd_dev);
3924         if (ret)
3925                 goto err_out_disk;
3926
3927         /*
3928          * At this point cleanup in the event of an error is the job
3929          * of the sysfs code (initiated by rbd_bus_del_dev()).
3930          */
3931         down_write(&rbd_dev->header_rwsem);
3932         ret = rbd_dev_snaps_register(rbd_dev);
3933         up_write(&rbd_dev->header_rwsem);
3934         if (ret)
3935                 goto err_out_bus;
3936
3937         ret = rbd_dev_header_watch_sync(rbd_dev, 1);
3938         if (ret)
3939                 goto err_out_bus;
3940
3941         /* Everything's ready.  Announce the disk to the world. */
3942
3943         add_disk(rbd_dev->disk);
3944
3945         pr_info("%s: added with size 0x%llx\n", rbd_dev->disk->disk_name,
3946                 (unsigned long long) rbd_dev->mapping.size);
3947
3948         return ret;
3949 err_out_bus:
3950         /* this will also clean up rest of rbd_dev stuff */
3951
3952         rbd_bus_del_dev(rbd_dev);
3953
3954         return ret;
3955 err_out_disk:
3956         rbd_free_disk(rbd_dev);
3957 err_out_blkdev:
3958         unregister_blkdev(rbd_dev->major, rbd_dev->name);
3959 err_out_id:
3960         rbd_dev_id_put(rbd_dev);
3961 err_out_snaps:
3962         rbd_remove_all_snaps(rbd_dev);
3963
3964         return ret;
3965 }
3966
3967 /*
3968  * Probe for the existence of the header object for the given rbd
3969  * device.  For format 2 images this includes determining the image
3970  * id.
3971  */
3972 static int rbd_dev_probe(struct rbd_device *rbd_dev)
3973 {
3974         int ret;
3975
3976         /*
3977          * Get the id from the image id object.  If it's not a
3978          * format 2 image, we'll get ENOENT back, and we'll assume
3979          * it's a format 1 image.
3980          */
3981         ret = rbd_dev_image_id(rbd_dev);
3982         if (ret)
3983                 ret = rbd_dev_v1_probe(rbd_dev);
3984         else
3985                 ret = rbd_dev_v2_probe(rbd_dev);
3986         if (ret) {
3987                 dout("probe failed, returning %d\n", ret);
3988
3989                 return ret;
3990         }
3991
3992         ret = rbd_dev_probe_finish(rbd_dev);
3993         if (ret)
3994                 rbd_header_free(&rbd_dev->header);
3995
3996         return ret;
3997 }
3998
3999 static ssize_t rbd_add(struct bus_type *bus,
4000                        const char *buf,
4001                        size_t count)
4002 {
4003         struct rbd_device *rbd_dev = NULL;
4004         struct ceph_options *ceph_opts = NULL;
4005         struct rbd_options *rbd_opts = NULL;
4006         struct rbd_spec *spec = NULL;
4007         struct rbd_client *rbdc;
4008         struct ceph_osd_client *osdc;
4009         int rc = -ENOMEM;
4010
4011         if (!try_module_get(THIS_MODULE))
4012                 return -ENODEV;
4013
4014         /* parse add command */
4015         rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec);
4016         if (rc < 0)
4017                 goto err_out_module;
4018
4019         rbdc = rbd_get_client(ceph_opts);
4020         if (IS_ERR(rbdc)) {
4021                 rc = PTR_ERR(rbdc);
4022                 goto err_out_args;
4023         }
4024         ceph_opts = NULL;       /* rbd_dev client now owns this */
4025
4026         /* pick the pool */
4027         osdc = &rbdc->client->osdc;
4028         rc = ceph_pg_poolid_by_name(osdc->osdmap, spec->pool_name);
4029         if (rc < 0)
4030                 goto err_out_client;
4031         spec->pool_id = (u64) rc;
4032
4033         /* The ceph file layout needs to fit pool id in 32 bits */
4034
4035         if (WARN_ON(spec->pool_id > (u64) U32_MAX)) {
4036                 rc = -EIO;
4037                 goto err_out_client;
4038         }
4039
4040         rbd_dev = rbd_dev_create(rbdc, spec);
4041         if (!rbd_dev)
4042                 goto err_out_client;
4043         rbdc = NULL;            /* rbd_dev now owns this */
4044         spec = NULL;            /* rbd_dev now owns this */
4045
4046         rbd_dev->mapping.read_only = rbd_opts->read_only;
4047         kfree(rbd_opts);
4048         rbd_opts = NULL;        /* done with this */
4049
4050         rc = rbd_dev_probe(rbd_dev);
4051         if (rc < 0)
4052                 goto err_out_rbd_dev;
4053
4054         return count;
4055 err_out_rbd_dev:
4056         rbd_dev_destroy(rbd_dev);
4057 err_out_client:
4058         rbd_put_client(rbdc);
4059 err_out_args:
4060         if (ceph_opts)
4061                 ceph_destroy_options(ceph_opts);
4062         kfree(rbd_opts);
4063         rbd_spec_put(spec);
4064 err_out_module:
4065         module_put(THIS_MODULE);
4066
4067         dout("Error adding device %s\n", buf);
4068
4069         return (ssize_t) rc;
4070 }
4071
4072 static struct rbd_device *__rbd_get_dev(unsigned long dev_id)
4073 {
4074         struct list_head *tmp;
4075         struct rbd_device *rbd_dev;
4076
4077         spin_lock(&rbd_dev_list_lock);
4078         list_for_each(tmp, &rbd_dev_list) {
4079                 rbd_dev = list_entry(tmp, struct rbd_device, node);
4080                 if (rbd_dev->dev_id == dev_id) {
4081                         spin_unlock(&rbd_dev_list_lock);
4082                         return rbd_dev;
4083                 }
4084         }
4085         spin_unlock(&rbd_dev_list_lock);
4086         return NULL;
4087 }
4088
4089 static void rbd_dev_release(struct device *dev)
4090 {
4091         struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4092
4093         if (rbd_dev->watch_event)
4094                 rbd_dev_header_watch_sync(rbd_dev, 0);
4095
4096         /* clean up and free blkdev */
4097         rbd_free_disk(rbd_dev);
4098         unregister_blkdev(rbd_dev->major, rbd_dev->name);
4099
4100         /* release allocated disk header fields */
4101         rbd_header_free(&rbd_dev->header);
4102
4103         /* done with the id, and with the rbd_dev */
4104         rbd_dev_id_put(rbd_dev);
4105         rbd_assert(rbd_dev->rbd_client != NULL);
4106         rbd_dev_destroy(rbd_dev);
4107
4108         /* release module ref */
4109         module_put(THIS_MODULE);
4110 }
4111
4112 static ssize_t rbd_remove(struct bus_type *bus,
4113                           const char *buf,
4114                           size_t count)
4115 {
4116         struct rbd_device *rbd_dev = NULL;
4117         int target_id, rc;
4118         unsigned long ul;
4119         int ret = count;
4120
4121         rc = strict_strtoul(buf, 10, &ul);
4122         if (rc)
4123                 return rc;
4124
4125         /* convert to int; abort if we lost anything in the conversion */
4126         target_id = (int) ul;
4127         if (target_id != ul)
4128                 return -EINVAL;
4129
4130         mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
4131
4132         rbd_dev = __rbd_get_dev(target_id);
4133         if (!rbd_dev) {
4134                 ret = -ENOENT;
4135                 goto done;
4136         }
4137
4138         spin_lock_irq(&rbd_dev->lock);
4139         if (rbd_dev->open_count)
4140                 ret = -EBUSY;
4141         else
4142                 set_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags);
4143         spin_unlock_irq(&rbd_dev->lock);
4144         if (ret < 0)
4145                 goto done;
4146
4147         rbd_remove_all_snaps(rbd_dev);
4148         rbd_bus_del_dev(rbd_dev);
4149
4150 done:
4151         mutex_unlock(&ctl_mutex);
4152
4153         return ret;
4154 }
4155
4156 /*
4157  * create control files in sysfs
4158  * /sys/bus/rbd/...
4159  */
4160 static int rbd_sysfs_init(void)
4161 {
4162         int ret;
4163
4164         ret = device_register(&rbd_root_dev);
4165         if (ret < 0)
4166                 return ret;
4167
4168         ret = bus_register(&rbd_bus_type);
4169         if (ret < 0)
4170                 device_unregister(&rbd_root_dev);
4171
4172         return ret;
4173 }
4174
4175 static void rbd_sysfs_cleanup(void)
4176 {
4177         bus_unregister(&rbd_bus_type);
4178         device_unregister(&rbd_root_dev);
4179 }
4180
4181 static int __init rbd_init(void)
4182 {
4183         int rc;
4184
4185         if (!libceph_compatible(NULL)) {
4186                 rbd_warn(NULL, "libceph incompatibility (quitting)");
4187
4188                 return -EINVAL;
4189         }
4190         rc = rbd_sysfs_init();
4191         if (rc)
4192                 return rc;
4193         pr_info("loaded " RBD_DRV_NAME_LONG "\n");
4194         return 0;
4195 }
4196
4197 static void __exit rbd_exit(void)
4198 {
4199         rbd_sysfs_cleanup();
4200 }
4201
4202 module_init(rbd_init);
4203 module_exit(rbd_exit);
4204
4205 MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
4206 MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
4207 MODULE_DESCRIPTION("rados block device");
4208
4209 /* following authorship retained from original osdblk.c */
4210 MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");
4211
4212 MODULE_LICENSE("GPL");