drivers/block/rbd.c

   1 /*
   2    rbd.c -- Export ceph rados objects as a Linux block device
   3
   4
   5    based on drivers/block/osdblk.c:
   6
   7    Copyright 2009 Red Hat, Inc.
   8
   9    This program is free software; you can redistribute it and/or modify
  10    it under the terms of the GNU General Public License as published by
  11    the Free Software Foundation.
  12
  13    This program is distributed in the hope that it will be useful,
  14    but WITHOUT ANY WARRANTY; without even the implied warranty of
  15    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  16    GNU General Public License for more details.
  17
  18    You should have received a copy of the GNU General Public License
  19    along with this program; see the file COPYING.  If not, write to
  20    the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
  21
  22
  23
  24    For usage instructions, please refer to:
  25
  26                  Documentation/ABI/testing/sysfs-bus-rbd
  27
  28  */
  29
  30 #include <linux/ceph/libceph.h>
  31 #include <linux/ceph/osd_client.h>
  32 #include <linux/ceph/mon_client.h>
  33 #include <linux/ceph/decode.h>
  34 #include <linux/parser.h>
  35
  36 #include <linux/kernel.h>
  37 #include <linux/device.h>
  38 #include <linux/module.h>
  39 #include <linux/fs.h>
  40 #include <linux/blkdev.h>
  41
  42 #include "rbd_types.h"
  43
  44 #define RBD_DEBUG       /* Activate rbd_assert() calls */
  45
  46 /*
  47  * The basic unit of block I/O is a sector.  It is interpreted in a
  48  * number of contexts in Linux (blk, bio, genhd), but the default is
  49  * universally 512 bytes.  These symbols are just slightly more
  50  * meaningful than the bare numbers they represent.
  51  */
  52 #define SECTOR_SHIFT    9
  53 #define SECTOR_SIZE     (1ULL << SECTOR_SHIFT)
  54
  55 #define RBD_DRV_NAME "rbd"
  56 #define RBD_DRV_NAME_LONG "rbd (rados block device)"
  57
  58 #define RBD_MINORS_PER_MAJOR    256             /* max minors per blkdev */
  59
  60 #define RBD_SNAP_DEV_NAME_PREFIX        "snap_"
  61 #define RBD_MAX_SNAP_NAME_LEN   \
  62                         (NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1))
  63
  64 #define RBD_MAX_SNAP_COUNT      510     /* allows max snapc to fit in 4KB */
  65
  66 #define RBD_SNAP_HEAD_NAME      "-"
  67
  68 /* This allows a single page to hold an image name sent by OSD */
  69 #define RBD_IMAGE_NAME_LEN_MAX  (PAGE_SIZE - sizeof (__le32) - 1)
  70 #define RBD_IMAGE_ID_LEN_MAX    64
  71
  72 #define RBD_OBJ_PREFIX_LEN_MAX  64
  73
  74 /* Feature bits */
  75
  76 #define RBD_FEATURE_LAYERING    (1<<0)
  77 #define RBD_FEATURE_STRIPINGV2  (1<<1)
  78 #define RBD_FEATURES_ALL \
  79             (RBD_FEATURE_LAYERING | RBD_FEATURE_STRIPINGV2)
  80
  81 /* Features supported by this (client software) implementation. */
  82
  83 #define RBD_FEATURES_SUPPORTED  (0)
  84
  85 /*
  86  * An RBD device name will be "rbd#", where the "rbd" comes from
  87  * RBD_DRV_NAME above, and # is a unique integer identifier.
  88  * MAX_INT_FORMAT_WIDTH is used in ensuring DEV_NAME_LEN is big
  89  * enough to hold all possible device names.
  90  */
  91 #define DEV_NAME_LEN            32
  92 #define MAX_INT_FORMAT_WIDTH    ((5 * sizeof (int)) / 2 + 1)
  93
  94 /*
  95  * block device image metadata (in-memory version)
  96  */
  97 struct rbd_image_header {
  98         /* These four fields never change for a given rbd image */
  99         char *object_prefix;
 100         u64 features;
 101         __u8 obj_order;
 102         __u8 crypt_type;
 103         __u8 comp_type;
 104
 105         /* The remaining fields need to be updated occasionally */
 106         u64 image_size;
 107         struct ceph_snap_context *snapc;
 108         char *snap_names;
 109         u64 *snap_sizes;
 110
 111         u64 obj_version;
 112 };
 113
 114 /*
 115  * An rbd image specification.
 116  *
 117  * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely
 118  * identify an image.  Each rbd_dev structure includes a pointer to
 119  * an rbd_spec structure that encapsulates this identity.
 120  *
 121  * Each of the id's in an rbd_spec has an associated name.  For a
 122  * user-mapped image, the names are supplied and the id's associated
 123  * with them are looked up.  For a layered image, a parent image is
 124  * defined by the tuple, and the names are looked up.
 125  *
 126  * An rbd_dev structure contains a parent_spec pointer which is
 127  * non-null if the image it represents is a child in a layered
 128  * image.  This pointer will refer to the rbd_spec structure used
 129  * by the parent rbd_dev for its own identity (i.e., the structure
 130  * is shared between the parent and child).
 131  *
 132  * Since these structures are populated once, during the discovery
 133  * phase of image construction, they are effectively immutable so
 134  * we make no effort to synchronize access to them.
 135  *
 136  * Note that code herein does not assume the image name is known (it
 137  * could be a null pointer).
 138  */
 139 struct rbd_spec {
 140         u64             pool_id;
 141         char            *pool_name;
 142
 143         char            *image_id;
 144         char            *image_name;
 145
 146         u64             snap_id;
 147         char            *snap_name;
 148
 149         struct kref     kref;
 150 };
 151
 152 /*
 153  * an instance of the client.  multiple devices may share an rbd client.
 154  */
 155 struct rbd_client {
 156         struct ceph_client      *client;
 157         struct kref             kref;
 158         struct list_head        node;
 159 };
 160
 161 struct rbd_img_request;
 162 typedef void (*rbd_img_callback_t)(struct rbd_img_request *);
 163
 164 #define BAD_WHICH       U32_MAX         /* Good which or bad which, which? */
 165
 166 struct rbd_obj_request;
 167 typedef void (*rbd_obj_callback_t)(struct rbd_obj_request *);
 168
 169 enum obj_request_type {
 170         OBJ_REQUEST_NODATA, OBJ_REQUEST_BIO, OBJ_REQUEST_PAGES
 171 };
 172
 173 enum obj_req_flags {
 174         OBJ_REQ_DONE,           /* completion flag: not done = 0, done = 1 */
 175         OBJ_REQ_IMG_DATA,       /* object usage: standalone = 0, image = 1 */
 176         OBJ_REQ_KNOWN,          /* EXISTS flag valid: no = 0, yes = 1 */
 177         OBJ_REQ_EXISTS,         /* target exists: no = 0, yes = 1 */
 178 };
 179
 180 struct rbd_obj_request {
 181         const char              *object_name;
 182         u64                     offset;         /* object start byte */
 183         u64                     length;         /* bytes from offset */
 184         unsigned long           flags;
 185
 186         /*
 187          * An object request associated with an image will have its
 188          * img_data flag set; a standalone object request will not.
 189          *
 190          * A standalone object request will have which == BAD_WHICH
 191          * and a null obj_request pointer.
 192          *
 193          * An object request initiated in support of a layered image
 194          * object (to check for its existence before a write) will
 195          * have which == BAD_WHICH and a non-null obj_request pointer.
 196          *
 197          * Finally, an object request for rbd image data will have
 198          * which != BAD_WHICH, and will have a non-null img_request
 199          * pointer.  The value of which will be in the range
 200          * 0..(img_request->obj_request_count-1).
 201          */
 202         union {
 203                 struct rbd_obj_request  *obj_request;   /* STAT op */
 204                 struct {
 205                         struct rbd_img_request  *img_request;
 206                         u64                     img_offset;
 207                         /* links for img_request->obj_requests list */
 208                         struct list_head        links;
 209                 };
 210         };
 211         u32                     which;          /* posn image request list */
 212
 213         enum obj_request_type   type;
 214         union {
 215                 struct bio      *bio_list;
 216                 struct {
 217                         struct page     **pages;
 218                         u32             page_count;
 219                 };
 220         };
 221         struct page             **copyup_pages;
 222
 223         struct ceph_osd_request *osd_req;
 224
 225         u64                     xferred;        /* bytes transferred */
 226         u64                     version;
 227         int                     result;
 228
 229         rbd_obj_callback_t      callback;
 230         struct completion       completion;
 231
 232         struct kref             kref;
 233 };
 234
 235 enum img_req_flags {
 236         IMG_REQ_WRITE,          /* I/O direction: read = 0, write = 1 */
 237         IMG_REQ_CHILD,          /* initiator: block = 0, child image = 1 */
 238         IMG_REQ_LAYERED,        /* ENOENT handling: normal = 0, layered = 1 */
 239 };
 240
 241 struct rbd_img_request {
 242         struct rbd_device       *rbd_dev;
 243         u64                     offset; /* starting image byte offset */
 244         u64                     length; /* byte count from offset */
 245         unsigned long           flags;
 246         union {
 247                 u64                     snap_id;        /* for reads */
 248                 struct ceph_snap_context *snapc;        /* for writes */
 249         };
 250         union {
 251                 struct request          *rq;            /* block request */
 252                 struct rbd_obj_request  *obj_request;   /* obj req initiator */
 253         };
 254         struct page             **copyup_pages;
 255         spinlock_t              completion_lock;/* protects next_completion */
 256         u32                     next_completion;
 257         rbd_img_callback_t      callback;
 258         u64                     xferred;/* aggregate bytes transferred */
 259         int                     result; /* first nonzero obj_request result */
 260
 261         u32                     obj_request_count;
 262         struct list_head        obj_requests;   /* rbd_obj_request structs */
 263
 264         struct kref             kref;
 265 };
 266
 267 #define for_each_obj_request(ireq, oreq) \
 268         list_for_each_entry(oreq, &(ireq)->obj_requests, links)
 269 #define for_each_obj_request_from(ireq, oreq) \
 270         list_for_each_entry_from(oreq, &(ireq)->obj_requests, links)
 271 #define for_each_obj_request_safe(ireq, oreq, n) \
 272         list_for_each_entry_safe_reverse(oreq, n, &(ireq)->obj_requests, links)
 273
 274 struct rbd_snap {
 275         struct  device          dev;
 276         const char              *name;
 277         u64                     size;
 278         struct list_head        node;
 279         u64                     id;
 280         u64                     features;
 281 };
 282
 283 struct rbd_mapping {
 284         u64                     size;
 285         u64                     features;
 286         bool                    read_only;
 287 };
 288
 289 /*
 290  * a single device
 291  */
 292 struct rbd_device {
 293         int                     dev_id;         /* blkdev unique id */
 294
 295         int                     major;          /* blkdev assigned major */
 296         struct gendisk          *disk;          /* blkdev's gendisk and rq */
 297
 298         u32                     image_format;   /* Either 1 or 2 */
 299         struct rbd_client       *rbd_client;
 300
 301         char                    name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */
 302
 303         spinlock_t              lock;           /* queue, flags, open_count */
 304
 305         struct rbd_image_header header;
 306         unsigned long           flags;          /* possibly lock protected */
 307         struct rbd_spec         *spec;
 308
 309         char                    *header_name;
 310
 311         struct ceph_file_layout layout;
 312
 313         struct ceph_osd_event   *watch_event;
 314         struct rbd_obj_request  *watch_request;
 315
 316         struct rbd_spec         *parent_spec;
 317         u64                     parent_overlap;
 318         struct rbd_device       *parent;
 319
 320         /* protects updating the header */
 321         struct rw_semaphore     header_rwsem;
 322
 323         struct rbd_mapping      mapping;
 324
 325         struct list_head        node;
 326
 327         /* list of snapshots */
 328         struct list_head        snaps;
 329
 330         /* sysfs related */
 331         struct device           dev;
 332         unsigned long           open_count;     /* protected by lock */
 333 };
 334
 335 /*
 336  * Flag bits for rbd_dev->flags.  If atomicity is required,
 337  * rbd_dev->lock is used to protect access.
 338  *
 339  * Currently, only the "removing" flag (which is coupled with the
 340  * "open_count" field) requires atomic access.
 341  */
 342 enum rbd_dev_flags {
 343         RBD_DEV_FLAG_EXISTS,    /* mapped snapshot has not been deleted */
 344         RBD_DEV_FLAG_REMOVING,  /* this mapping is being removed */
 345 };
 346
 347 static DEFINE_MUTEX(ctl_mutex);   /* Serialize open/close/setup/teardown */
 348
 349 static LIST_HEAD(rbd_dev_list);    /* devices */
 350 static DEFINE_SPINLOCK(rbd_dev_list_lock);
 351
 352 static LIST_HEAD(rbd_client_list);              /* clients */
 353 static DEFINE_SPINLOCK(rbd_client_list_lock);
 354
 355 static int rbd_img_request_submit(struct rbd_img_request *img_request);
 356
 357 static int rbd_dev_snaps_update(struct rbd_device *rbd_dev);
 358 static int rbd_dev_snaps_register(struct rbd_device *rbd_dev);
 359
 360 static void rbd_dev_release(struct device *dev);
 361 static void rbd_remove_snap_dev(struct rbd_snap *snap);
 362
 363 static ssize_t rbd_add(struct bus_type *bus, const char *buf,
 364                        size_t count);
 365 static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
 366                           size_t count);
 367 static int rbd_dev_probe(struct rbd_device *rbd_dev);
 368
 369 static struct bus_attribute rbd_bus_attrs[] = {
 370         __ATTR(add, S_IWUSR, NULL, rbd_add),
 371         __ATTR(remove, S_IWUSR, NULL, rbd_remove),
 372         __ATTR_NULL
 373 };
 374
 375 static struct bus_type rbd_bus_type = {
 376         .name           = "rbd",
 377         .bus_attrs      = rbd_bus_attrs,
 378 };
 379
 380 static void rbd_root_dev_release(struct device *dev)
 381 {
 382 }
 383
 384 static struct device rbd_root_dev = {
 385         .init_name =    "rbd",
 386         .release =      rbd_root_dev_release,
 387 };
 388
 389 static __printf(2, 3)
 390 void rbd_warn(struct rbd_device *rbd_dev, const char *fmt, ...)
 391 {
 392         struct va_format vaf;
 393         va_list args;
 394
 395         va_start(args, fmt);
 396         vaf.fmt = fmt;
 397         vaf.va = &args;
 398
 399         if (!rbd_dev)
 400                 printk(KERN_WARNING "%s: %pV\n", RBD_DRV_NAME, &vaf);
 401         else if (rbd_dev->disk)
 402                 printk(KERN_WARNING "%s: %s: %pV\n",
 403                         RBD_DRV_NAME, rbd_dev->disk->disk_name, &vaf);
 404         else if (rbd_dev->spec && rbd_dev->spec->image_name)
 405                 printk(KERN_WARNING "%s: image %s: %pV\n",
 406                         RBD_DRV_NAME, rbd_dev->spec->image_name, &vaf);
 407         else if (rbd_dev->spec && rbd_dev->spec->image_id)
 408                 printk(KERN_WARNING "%s: id %s: %pV\n",
 409                         RBD_DRV_NAME, rbd_dev->spec->image_id, &vaf);
 410         else    /* punt */
 411                 printk(KERN_WARNING "%s: rbd_dev %p: %pV\n",
 412                         RBD_DRV_NAME, rbd_dev, &vaf);
 413         va_end(args);
 414 }
 415
 416 #ifdef RBD_DEBUG
 417 #define rbd_assert(expr)                                                \
 418                 if (unlikely(!(expr))) {                                \
 419                         printk(KERN_ERR "\nAssertion failure in %s() "  \
 420                                                 "at line %d:\n\n"       \
 421                                         "\trbd_assert(%s);\n\n",        \
 422                                         __func__, __LINE__, #expr);     \
 423                         BUG();                                          \
 424                 }
 425 #else /* !RBD_DEBUG */
 426 #  define rbd_assert(expr)      ((void) 0)
 427 #endif /* !RBD_DEBUG */
 428
 429 static void rbd_img_parent_read(struct rbd_obj_request *obj_request);
 430 static int rbd_img_obj_request_submit(struct rbd_obj_request *obj_request);
 431
 432 static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver);
 433 static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver);
 434
 435 static int rbd_open(struct block_device *bdev, fmode_t mode)
 436 {
 437         struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
 438         bool removing = false;
 439
 440         if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only)
 441                 return -EROFS;
 442
 443         spin_lock_irq(&rbd_dev->lock);
 444         if (test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags))
 445                 removing = true;
 446         else
 447                 rbd_dev->open_count++;
 448         spin_unlock_irq(&rbd_dev->lock);
 449         if (removing)
 450                 return -ENOENT;
 451
 452         mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
 453         (void) get_device(&rbd_dev->dev);
 454         set_device_ro(bdev, rbd_dev->mapping.read_only);
 455         mutex_unlock(&ctl_mutex);
 456
 457         return 0;
 458 }
 459
 460 static int rbd_release(struct gendisk *disk, fmode_t mode)
 461 {
 462         struct rbd_device *rbd_dev = disk->private_data;
 463         unsigned long open_count_before;
 464
 465         spin_lock_irq(&rbd_dev->lock);
 466         open_count_before = rbd_dev->open_count--;
 467         spin_unlock_irq(&rbd_dev->lock);
 468         rbd_assert(open_count_before > 0);
 469
 470         mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
 471         put_device(&rbd_dev->dev);
 472         mutex_unlock(&ctl_mutex);
 473
 474         return 0;
 475 }
 476
 477 static const struct block_device_operations rbd_bd_ops = {
 478         .owner                  = THIS_MODULE,
 479         .open                   = rbd_open,
 480         .release                = rbd_release,
 481 };
 482
 483 /*
 484  * Initialize an rbd client instance.
 485  * We own *ceph_opts.
 486  */
 487 static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
 488 {
 489         struct rbd_client *rbdc;
 490         int ret = -ENOMEM;
 491
 492         dout("%s:\n", __func__);
 493         rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
 494         if (!rbdc)
 495                 goto out_opt;
 496
 497         kref_init(&rbdc->kref);
 498         INIT_LIST_HEAD(&rbdc->node);
 499
 500         mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
 501
 502         rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0);
 503         if (IS_ERR(rbdc->client))
 504                 goto out_mutex;
 505         ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */
 506
 507         ret = ceph_open_session(rbdc->client);
 508         if (ret < 0)
 509                 goto out_err;
 510
 511         spin_lock(&rbd_client_list_lock);
 512         list_add_tail(&rbdc->node, &rbd_client_list);
 513         spin_unlock(&rbd_client_list_lock);
 514
 515         mutex_unlock(&ctl_mutex);
 516         dout("%s: rbdc %p\n", __func__, rbdc);
 517
 518         return rbdc;
 519
 520 out_err:
 521         ceph_destroy_client(rbdc->client);
 522 out_mutex:
 523         mutex_unlock(&ctl_mutex);
 524         kfree(rbdc);
 525 out_opt:
 526         if (ceph_opts)
 527                 ceph_destroy_options(ceph_opts);
 528         dout("%s: error %d\n", __func__, ret);
 529
 530         return ERR_PTR(ret);
 531 }
 532
 533 static struct rbd_client *__rbd_get_client(struct rbd_client *rbdc)
 534 {
 535         kref_get(&rbdc->kref);
 536
 537         return rbdc;
 538 }
 539
 540 /*
 541  * Find a ceph client with specific addr and configuration.  If
 542  * found, bump its reference count.
 543  */
 544 static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
 545 {
 546         struct rbd_client *client_node;
 547         bool found = false;
 548
 549         if (ceph_opts->flags & CEPH_OPT_NOSHARE)
 550                 return NULL;
 551
 552         spin_lock(&rbd_client_list_lock);
 553         list_for_each_entry(client_node, &rbd_client_list, node) {
 554                 if (!ceph_compare_options(ceph_opts, client_node->client)) {
 555                         __rbd_get_client(client_node);
 556
 557                         found = true;
 558                         break;
 559                 }
 560         }
 561         spin_unlock(&rbd_client_list_lock);
 562
 563         return found ? client_node : NULL;
 564 }
 565
 566 /*
 567  * mount options
 568  */
 569 enum {
 570         Opt_last_int,
 571         /* int args above */
 572         Opt_last_string,
 573         /* string args above */
 574         Opt_read_only,
 575         Opt_read_write,
 576         /* Boolean args above */
 577         Opt_last_bool,
 578 };
 579
 580 static match_table_t rbd_opts_tokens = {
 581         /* int args above */
 582         /* string args above */
 583         {Opt_read_only, "read_only"},
 584         {Opt_read_only, "ro"},          /* Alternate spelling */
 585         {Opt_read_write, "read_write"},
 586         {Opt_read_write, "rw"},         /* Alternate spelling */
 587         /* Boolean args above */
 588         {-1, NULL}
 589 };
 590
 591 struct rbd_options {
 592         bool    read_only;
 593 };
 594
 595 #define RBD_READ_ONLY_DEFAULT   false
 596
 597 static int parse_rbd_opts_token(char *c, void *private)
 598 {
 599         struct rbd_options *rbd_opts = private;
 600         substring_t argstr[MAX_OPT_ARGS];
 601         int token, intval, ret;
 602
 603         token = match_token(c, rbd_opts_tokens, argstr);
 604         if (token < 0)
 605                 return -EINVAL;
 606
 607         if (token < Opt_last_int) {
 608                 ret = match_int(&argstr[0], &intval);
 609                 if (ret < 0) {
 610                         pr_err("bad mount option arg (not int) "
 611                                "at '%s'\n", c);
 612                         return ret;
 613                 }
 614                 dout("got int token %d val %d\n", token, intval);
 615         } else if (token > Opt_last_int && token < Opt_last_string) {
 616                 dout("got string token %d val %s\n", token,
 617                      argstr[0].from);
 618         } else if (token > Opt_last_string && token < Opt_last_bool) {
 619                 dout("got Boolean token %d\n", token);
 620         } else {
 621                 dout("got token %d\n", token);
 622         }
 623
 624         switch (token) {
 625         case Opt_read_only:
 626                 rbd_opts->read_only = true;
 627                 break;
 628         case Opt_read_write:
 629                 rbd_opts->read_only = false;
 630                 break;
 631         default:
 632                 rbd_assert(false);
 633                 break;
 634         }
 635         return 0;
 636 }
 637
 638 /*
 639  * Get a ceph client with specific addr and configuration, if one does
 640  * not exist create it.
 641  */
 642 static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts)
 643 {
 644         struct rbd_client *rbdc;
 645
 646         rbdc = rbd_client_find(ceph_opts);
 647         if (rbdc)       /* using an existing client */
 648                 ceph_destroy_options(ceph_opts);
 649         else
 650                 rbdc = rbd_client_create(ceph_opts);
 651
 652         return rbdc;
 653 }
 654
 655 /*
 656  * Destroy ceph client
 657  *
 658  * Caller must hold rbd_client_list_lock.
 659  */
 660 static void rbd_client_release(struct kref *kref)
 661 {
 662         struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);
 663
 664         dout("%s: rbdc %p\n", __func__, rbdc);
 665         spin_lock(&rbd_client_list_lock);
 666         list_del(&rbdc->node);
 667         spin_unlock(&rbd_client_list_lock);
 668
 669         ceph_destroy_client(rbdc->client);
 670         kfree(rbdc);
 671 }
 672
 673 /*
 674  * Drop reference to ceph client node. If it's not referenced anymore, release
 675  * it.
 676  */
 677 static void rbd_put_client(struct rbd_client *rbdc)
 678 {
 679         if (rbdc)
 680                 kref_put(&rbdc->kref, rbd_client_release);
 681 }
 682
 683 static bool rbd_image_format_valid(u32 image_format)
 684 {
 685         return image_format == 1 || image_format == 2;
 686 }
 687
 688 static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
 689 {
 690         size_t size;
 691         u32 snap_count;
 692
 693         /* The header has to start with the magic rbd header text */
 694         if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)))
 695                 return false;
 696
 697         /* The bio layer requires at least sector-sized I/O */
 698
 699         if (ondisk->options.order < SECTOR_SHIFT)
 700                 return false;
 701
 702         /* If we use u64 in a few spots we may be able to loosen this */
 703
 704         if (ondisk->options.order > 8 * sizeof (int) - 1)
 705                 return false;
 706
 707         /*
 708          * The size of a snapshot header has to fit in a size_t, and
 709          * that limits the number of snapshots.
 710          */
 711         snap_count = le32_to_cpu(ondisk->snap_count);
 712         size = SIZE_MAX - sizeof (struct ceph_snap_context);
 713         if (snap_count > size / sizeof (__le64))
 714                 return false;
 715
 716         /*
 717          * Not only that, but the size of the entire the snapshot
 718          * header must also be representable in a size_t.
 719          */
 720         size -= snap_count * sizeof (__le64);
 721         if ((u64) size < le64_to_cpu(ondisk->snap_names_len))
 722                 return false;
 723
 724         return true;
 725 }
 726
 727 /*
 728  * Create a new header structure, translate header format from the on-disk
 729  * header.
 730  */
 731 static int rbd_header_from_disk(struct rbd_image_header *header,
 732                                  struct rbd_image_header_ondisk *ondisk)
 733 {
 734         u32 snap_count;
 735         size_t len;
 736         size_t size;
 737         u32 i;
 738
 739         memset(header, 0, sizeof (*header));
 740
 741         snap_count = le32_to_cpu(ondisk->snap_count);
 742
 743         len = strnlen(ondisk->object_prefix, sizeof (ondisk->object_prefix));
 744         header->object_prefix = kmalloc(len + 1, GFP_KERNEL);
 745         if (!header->object_prefix)
 746                 return -ENOMEM;
 747         memcpy(header->object_prefix, ondisk->object_prefix, len);
 748         header->object_prefix[len] = '\0';
 749
 750         if (snap_count) {
 751                 u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len);
 752
 753                 /* Save a copy of the snapshot names */
 754
 755                 if (snap_names_len > (u64) SIZE_MAX)
 756                         return -EIO;
 757                 header->snap_names = kmalloc(snap_names_len, GFP_KERNEL);
 758                 if (!header->snap_names)
 759                         goto out_err;
 760                 /*
 761                  * Note that rbd_dev_v1_header_read() guarantees
 762                  * the ondisk buffer we're working with has
 763                  * snap_names_len bytes beyond the end of the
 764                  * snapshot id array, this memcpy() is safe.
 765                  */
 766                 memcpy(header->snap_names, &ondisk->snaps[snap_count],
 767                         snap_names_len);
 768
 769                 /* Record each snapshot's size */
 770
 771                 size = snap_count * sizeof (*header->snap_sizes);
 772                 header->snap_sizes = kmalloc(size, GFP_KERNEL);
 773                 if (!header->snap_sizes)
 774                         goto out_err;
 775                 for (i = 0; i < snap_count; i++)
 776                         header->snap_sizes[i] =
 777                                 le64_to_cpu(ondisk->snaps[i].image_size);
 778         } else {
 779                 WARN_ON(ondisk->snap_names_len);
 780                 header->snap_names = NULL;
 781                 header->snap_sizes = NULL;
 782         }
 783
 784         header->features = 0;   /* No features support in v1 images */
 785         header->obj_order = ondisk->options.order;
 786         header->crypt_type = ondisk->options.crypt_type;
 787         header->comp_type = ondisk->options.comp_type;
 788
 789         /* Allocate and fill in the snapshot context */
 790
 791         header->image_size = le64_to_cpu(ondisk->image_size);
 792         size = sizeof (struct ceph_snap_context);
 793         size += snap_count * sizeof (header->snapc->snaps[0]);
 794         header->snapc = kzalloc(size, GFP_KERNEL);
 795         if (!header->snapc)
 796                 goto out_err;
 797
 798         atomic_set(&header->snapc->nref, 1);
 799         header->snapc->seq = le64_to_cpu(ondisk->snap_seq);
 800         header->snapc->num_snaps = snap_count;
 801         for (i = 0; i < snap_count; i++)
 802                 header->snapc->snaps[i] =
 803                         le64_to_cpu(ondisk->snaps[i].id);
 804
 805         return 0;
 806
 807 out_err:
 808         kfree(header->snap_sizes);
 809         header->snap_sizes = NULL;
 810         kfree(header->snap_names);
 811         header->snap_names = NULL;
 812         kfree(header->object_prefix);
 813         header->object_prefix = NULL;
 814
 815         return -ENOMEM;
 816 }
 817
 818 static const char *rbd_snap_name(struct rbd_device *rbd_dev, u64 snap_id)
 819 {
 820         struct rbd_snap *snap;
 821
 822         if (snap_id == CEPH_NOSNAP)
 823                 return RBD_SNAP_HEAD_NAME;
 824
 825         list_for_each_entry(snap, &rbd_dev->snaps, node)
 826                 if (snap_id == snap->id)
 827                         return snap->name;
 828
 829         return NULL;
 830 }
 831
 832 static int snap_by_name(struct rbd_device *rbd_dev, const char *snap_name)
 833 {
 834
 835         struct rbd_snap *snap;
 836
 837         list_for_each_entry(snap, &rbd_dev->snaps, node) {
 838                 if (!strcmp(snap_name, snap->name)) {
 839                         rbd_dev->spec->snap_id = snap->id;
 840                         rbd_dev->mapping.size = snap->size;
 841                         rbd_dev->mapping.features = snap->features;
 842
 843                         return 0;
 844                 }
 845         }
 846
 847         return -ENOENT;
 848 }
 849
 850 static int rbd_dev_set_mapping(struct rbd_device *rbd_dev)
 851 {
 852         int ret;
 853
 854         if (!memcmp(rbd_dev->spec->snap_name, RBD_SNAP_HEAD_NAME,
 855                     sizeof (RBD_SNAP_HEAD_NAME))) {
 856                 rbd_dev->spec->snap_id = CEPH_NOSNAP;
 857                 rbd_dev->mapping.size = rbd_dev->header.image_size;
 858                 rbd_dev->mapping.features = rbd_dev->header.features;
 859                 ret = 0;
 860         } else {
 861                 ret = snap_by_name(rbd_dev, rbd_dev->spec->snap_name);
 862                 if (ret < 0)
 863                         goto done;
 864                 rbd_dev->mapping.read_only = true;
 865         }
 866         set_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
 867
 868 done:
 869         return ret;
 870 }
 871
 872 static void rbd_header_free(struct rbd_image_header *header)
 873 {
 874         kfree(header->object_prefix);
 875         header->object_prefix = NULL;
 876         kfree(header->snap_sizes);
 877         header->snap_sizes = NULL;
 878         kfree(header->snap_names);
 879         header->snap_names = NULL;
 880         ceph_put_snap_context(header->snapc);
 881         header->snapc = NULL;
 882 }
 883
 884 static const char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset)
 885 {
 886         char *name;
 887         u64 segment;
 888         int ret;
 889
 890         name = kmalloc(MAX_OBJ_NAME_SIZE + 1, GFP_NOIO);
 891         if (!name)
 892                 return NULL;
 893         segment = offset >> rbd_dev->header.obj_order;
 894         ret = snprintf(name, MAX_OBJ_NAME_SIZE + 1, "%s.%012llx",
 895                         rbd_dev->header.object_prefix, segment);
 896         if (ret < 0 || ret > MAX_OBJ_NAME_SIZE) {
 897                 pr_err("error formatting segment name for #%llu (%d)\n",
 898                         segment, ret);
 899                 kfree(name);
 900                 name = NULL;
 901         }
 902
 903         return name;
 904 }
 905
 906 static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset)
 907 {
 908         u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
 909
 910         return offset & (segment_size - 1);
 911 }
 912
 913 static u64 rbd_segment_length(struct rbd_device *rbd_dev,
 914                                 u64 offset, u64 length)
 915 {
 916         u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
 917
 918         offset &= segment_size - 1;
 919
 920         rbd_assert(length <= U64_MAX - offset);
 921         if (offset + length > segment_size)
 922                 length = segment_size - offset;
 923
 924         return length;
 925 }
 926
 927 /*
 928  * returns the size of an object in the image
 929  */
 930 static u64 rbd_obj_bytes(struct rbd_image_header *header)
 931 {
 932         return 1 << header->obj_order;
 933 }
 934
 935 /*
 936  * bio helpers
 937  */
 938
 939 static void bio_chain_put(struct bio *chain)
 940 {
 941         struct bio *tmp;
 942
 943         while (chain) {
 944                 tmp = chain;
 945                 chain = chain->bi_next;
 946                 bio_put(tmp);
 947         }
 948 }
 949
 950 /*
 951  * zeros a bio chain, starting at specific offset
 952  */
 953 static void zero_bio_chain(struct bio *chain, int start_ofs)
 954 {
 955         struct bio_vec *bv;
 956         unsigned long flags;
 957         void *buf;
 958         int i;
 959         int pos = 0;
 960
 961         while (chain) {
 962                 bio_for_each_segment(bv, chain, i) {
 963                         if (pos + bv->bv_len > start_ofs) {
 964                                 int remainder = max(start_ofs - pos, 0);
 965                                 buf = bvec_kmap_irq(bv, &flags);
 966                                 memset(buf + remainder, 0,
 967                                        bv->bv_len - remainder);
 968                                 bvec_kunmap_irq(buf, &flags);
 969                         }
 970                         pos += bv->bv_len;
 971                 }
 972
 973                 chain = chain->bi_next;
 974         }
 975 }
 976
 977 /*
 978  * similar to zero_bio_chain(), zeros data defined by a page array,
 979  * starting at the given byte offset from the start of the array and
 980  * continuing up to the given end offset.  The pages array is
 981  * assumed to be big enough to hold all bytes up to the end.
 982  */
 983 static void zero_pages(struct page **pages, u64 offset, u64 end)
 984 {
 985         struct page **page = &pages[offset >> PAGE_SHIFT];
 986
 987         rbd_assert(end > offset);
 988         rbd_assert(end - offset <= (u64)SIZE_MAX);
 989         while (offset < end) {
 990                 size_t page_offset;
 991                 size_t length;
 992                 unsigned long flags;
 993                 void *kaddr;
 994
 995                 page_offset = (size_t)(offset & ~PAGE_MASK);
 996                 length = min(PAGE_SIZE - page_offset, (size_t)(end - offset));
 997                 local_irq_save(flags);
 998                 kaddr = kmap_atomic(*page);
 999                 memset(kaddr + page_offset, 0, length);
1000                 kunmap_atomic(kaddr);
1001                 local_irq_restore(flags);
1002
1003                 offset += length;
1004                 page++;
1005         }
1006 }
1007
1008 /*
1009  * Clone a portion of a bio, starting at the given byte offset
1010  * and continuing for the number of bytes indicated.
1011  */
1012 static struct bio *bio_clone_range(struct bio *bio_src,
1013                                         unsigned int offset,
1014                                         unsigned int len,
1015                                         gfp_t gfpmask)
1016 {
1017         struct bio_vec *bv;
1018         unsigned int resid;
1019         unsigned short idx;
1020         unsigned int voff;
1021         unsigned short end_idx;
1022         unsigned short vcnt;
1023         struct bio *bio;
1024
1025         /* Handle the easy case for the caller */
1026
1027         if (!offset && len == bio_src->bi_size)
1028                 return bio_clone(bio_src, gfpmask);
1029
1030         if (WARN_ON_ONCE(!len))
1031                 return NULL;
1032         if (WARN_ON_ONCE(len > bio_src->bi_size))
1033                 return NULL;
1034         if (WARN_ON_ONCE(offset > bio_src->bi_size - len))
1035                 return NULL;
1036
1037         /* Find first affected segment... */
1038
1039         resid = offset;
1040         __bio_for_each_segment(bv, bio_src, idx, 0) {
1041                 if (resid < bv->bv_len)
1042                         break;
1043                 resid -= bv->bv_len;
1044         }
1045         voff = resid;
1046
1047         /* ...and the last affected segment */
1048
1049         resid += len;
1050         __bio_for_each_segment(bv, bio_src, end_idx, idx) {
1051                 if (resid <= bv->bv_len)
1052                         break;
1053                 resid -= bv->bv_len;
1054         }
1055         vcnt = end_idx - idx + 1;
1056
1057         /* Build the clone */
1058
1059         bio = bio_alloc(gfpmask, (unsigned int) vcnt);
1060         if (!bio)
1061                 return NULL;    /* ENOMEM */
1062
1063         bio->bi_bdev = bio_src->bi_bdev;
1064         bio->bi_sector = bio_src->bi_sector + (offset >> SECTOR_SHIFT);
1065         bio->bi_rw = bio_src->bi_rw;
1066         bio->bi_flags |= 1 << BIO_CLONED;
1067
1068         /*
1069          * Copy over our part of the bio_vec, then update the first
1070          * and last (or only) entries.
1071          */
1072         memcpy(&bio->bi_io_vec[0], &bio_src->bi_io_vec[idx],
1073                         vcnt * sizeof (struct bio_vec));
1074         bio->bi_io_vec[0].bv_offset += voff;
1075         if (vcnt > 1) {
1076                 bio->bi_io_vec[0].bv_len -= voff;
1077                 bio->bi_io_vec[vcnt - 1].bv_len = resid;
1078         } else {
1079                 bio->bi_io_vec[0].bv_len = len;
1080         }
1081
1082         bio->bi_vcnt = vcnt;
1083         bio->bi_size = len;
1084         bio->bi_idx = 0;
1085
1086         return bio;
1087 }
1088
1089 /*
1090  * Clone a portion of a bio chain, starting at the given byte offset
1091  * into the first bio in the source chain and continuing for the
1092  * number of bytes indicated.  The result is another bio chain of
1093  * exactly the given length, or a null pointer on error.
1094  *
1095  * The bio_src and offset parameters are both in-out.  On entry they
1096  * refer to the first source bio and the offset into that bio where
1097  * the start of data to be cloned is located.
1098  *
1099  * On return, bio_src is updated to refer to the bio in the source
1100  * chain that contains first un-cloned byte, and *offset will
1101  * contain the offset of that byte within that bio.
1102  */
1103 static struct bio *bio_chain_clone_range(struct bio **bio_src,
1104                                         unsigned int *offset,
1105                                         unsigned int len,
1106                                         gfp_t gfpmask)
1107 {
1108         struct bio *bi = *bio_src;
1109         unsigned int off = *offset;
1110         struct bio *chain = NULL;
1111         struct bio **end;
1112
1113         /* Build up a chain of clone bios up to the limit */
1114
1115         if (!bi || off >= bi->bi_size || !len)
1116                 return NULL;            /* Nothing to clone */
1117
1118         end = &chain;
1119         while (len) {
1120                 unsigned int bi_size;
1121                 struct bio *bio;
1122
1123                 if (!bi) {
1124                         rbd_warn(NULL, "bio_chain exhausted with %u left", len);
1125                         goto out_err;   /* EINVAL; ran out of bio's */
1126                 }
1127                 bi_size = min_t(unsigned int, bi->bi_size - off, len);
1128                 bio = bio_clone_range(bi, off, bi_size, gfpmask);
1129                 if (!bio)
1130                         goto out_err;   /* ENOMEM */
1131
1132                 *end = bio;
1133                 end = &bio->bi_next;
1134
1135                 off += bi_size;
1136                 if (off == bi->bi_size) {
1137                         bi = bi->bi_next;
1138                         off = 0;
1139                 }
1140                 len -= bi_size;
1141         }
1142         *bio_src = bi;
1143         *offset = off;
1144
1145         return chain;
1146 out_err:
1147         bio_chain_put(chain);
1148
1149         return NULL;
1150 }
1151
1152 /*
1153  * The default/initial value for all object request flags is 0.  For
1154  * each flag, once its value is set to 1 it is never reset to 0
1155  * again.
1156  */
1157 static void obj_request_img_data_set(struct rbd_obj_request *obj_request)
1158 {
1159         if (test_and_set_bit(OBJ_REQ_IMG_DATA, &obj_request->flags)) {
1160                 struct rbd_device *rbd_dev;
1161
1162                 rbd_dev = obj_request->img_request->rbd_dev;
1163                 rbd_warn(rbd_dev, "obj_request %p already marked img_data\n",
1164                         obj_request);
1165         }
1166 }
1167
1168 static bool obj_request_img_data_test(struct rbd_obj_request *obj_request)
1169 {
1170         smp_mb();
1171         return test_bit(OBJ_REQ_IMG_DATA, &obj_request->flags) != 0;
1172 }
1173
1174 static void obj_request_done_set(struct rbd_obj_request *obj_request)
1175 {
1176         if (test_and_set_bit(OBJ_REQ_DONE, &obj_request->flags)) {
1177                 struct rbd_device *rbd_dev = NULL;
1178
1179                 if (obj_request_img_data_test(obj_request))
1180                         rbd_dev = obj_request->img_request->rbd_dev;
1181                 rbd_warn(rbd_dev, "obj_request %p already marked done\n",
1182                         obj_request);
1183         }
1184 }
1185
1186 static bool obj_request_done_test(struct rbd_obj_request *obj_request)
1187 {
1188         smp_mb();
1189         return test_bit(OBJ_REQ_DONE, &obj_request->flags) != 0;
1190 }
1191
1192 /*
1193  * This sets the KNOWN flag after (possibly) setting the EXISTS
1194  * flag.  The latter is set based on the "exists" value provided.
1195  *
1196  * Note that for our purposes once an object exists it never goes
1197  * away again.  It's possible that the response from two existence
1198  * checks are separated by the creation of the target object, and
1199  * the first ("doesn't exist") response arrives *after* the second
1200  * ("does exist").  In that case we ignore the second one.
1201  */
1202 static void obj_request_existence_set(struct rbd_obj_request *obj_request,
1203                                 bool exists)
1204 {
1205         if (exists)
1206                 set_bit(OBJ_REQ_EXISTS, &obj_request->flags);
1207         set_bit(OBJ_REQ_KNOWN, &obj_request->flags);
1208         smp_mb();
1209 }
1210
1211 static bool obj_request_known_test(struct rbd_obj_request *obj_request)
1212 {
1213         smp_mb();
1214         return test_bit(OBJ_REQ_KNOWN, &obj_request->flags) != 0;
1215 }
1216
1217 static bool obj_request_exists_test(struct rbd_obj_request *obj_request)
1218 {
1219         smp_mb();
1220         return test_bit(OBJ_REQ_EXISTS, &obj_request->flags) != 0;
1221 }
1222
1223 static void rbd_obj_request_get(struct rbd_obj_request *obj_request)
1224 {
1225         dout("%s: obj %p (was %d)\n", __func__, obj_request,
1226                 atomic_read(&obj_request->kref.refcount));
1227         kref_get(&obj_request->kref);
1228 }
1229
1230 static void rbd_obj_request_destroy(struct kref *kref);
1231 static void rbd_obj_request_put(struct rbd_obj_request *obj_request)
1232 {
1233         rbd_assert(obj_request != NULL);
1234         dout("%s: obj %p (was %d)\n", __func__, obj_request,
1235                 atomic_read(&obj_request->kref.refcount));
1236         kref_put(&obj_request->kref, rbd_obj_request_destroy);
1237 }
1238
1239 static void rbd_img_request_get(struct rbd_img_request *img_request)
1240 {
1241         dout("%s: img %p (was %d)\n", __func__, img_request,
1242                 atomic_read(&img_request->kref.refcount));
1243         kref_get(&img_request->kref);
1244 }
1245
1246 static void rbd_img_request_destroy(struct kref *kref);
1247 static void rbd_img_request_put(struct rbd_img_request *img_request)
1248 {
1249         rbd_assert(img_request != NULL);
1250         dout("%s: img %p (was %d)\n", __func__, img_request,
1251                 atomic_read(&img_request->kref.refcount));
1252         kref_put(&img_request->kref, rbd_img_request_destroy);
1253 }
1254
1255 static inline void rbd_img_obj_request_add(struct rbd_img_request *img_request,
1256                                         struct rbd_obj_request *obj_request)
1257 {
1258         rbd_assert(obj_request->img_request == NULL);
1259
1260         /* Image request now owns object's original reference */
1261         obj_request->img_request = img_request;
1262         obj_request->which = img_request->obj_request_count;
1263         rbd_assert(!obj_request_img_data_test(obj_request));
1264         obj_request_img_data_set(obj_request);
1265         rbd_assert(obj_request->which != BAD_WHICH);
1266         img_request->obj_request_count++;
1267         list_add_tail(&obj_request->links, &img_request->obj_requests);
1268         dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request,
1269                 obj_request->which);
1270 }
1271
1272 static inline void rbd_img_obj_request_del(struct rbd_img_request *img_request,
1273                                         struct rbd_obj_request *obj_request)
1274 {
1275         rbd_assert(obj_request->which != BAD_WHICH);
1276
1277         dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request,
1278                 obj_request->which);
1279         list_del(&obj_request->links);
1280         rbd_assert(img_request->obj_request_count > 0);
1281         img_request->obj_request_count--;
1282         rbd_assert(obj_request->which == img_request->obj_request_count);
1283         obj_request->which = BAD_WHICH;
1284         rbd_assert(obj_request_img_data_test(obj_request));
1285         rbd_assert(obj_request->img_request == img_request);
1286         obj_request->img_request = NULL;
1287         obj_request->callback = NULL;
1288         rbd_obj_request_put(obj_request);
1289 }
1290
1291 static bool obj_request_type_valid(enum obj_request_type type)
1292 {
1293         switch (type) {
1294         case OBJ_REQUEST_NODATA:
1295         case OBJ_REQUEST_BIO:
1296         case OBJ_REQUEST_PAGES:
1297                 return true;
1298         default:
1299                 return false;
1300         }
1301 }
1302
1303 static int rbd_obj_request_submit(struct ceph_osd_client *osdc,
1304                                 struct rbd_obj_request *obj_request)
1305 {
1306         dout("%s: osdc %p obj %p\n", __func__, osdc, obj_request);
1307
1308         return ceph_osdc_start_request(osdc, obj_request->osd_req, false);
1309 }
1310
1311 static void rbd_img_request_complete(struct rbd_img_request *img_request)
1312 {
1313
1314         dout("%s: img %p\n", __func__, img_request);
1315
1316         /*
1317          * If no error occurred, compute the aggregate transfer
1318          * count for the image request.  We could instead use
1319          * atomic64_cmpxchg() to update it as each object request
1320          * completes; not clear which way is better off hand.
1321          */
1322         if (!img_request->result) {
1323                 struct rbd_obj_request *obj_request;
1324                 u64 xferred = 0;
1325
1326                 for_each_obj_request(img_request, obj_request)
1327                         xferred += obj_request->xferred;
1328                 img_request->xferred = xferred;
1329         }
1330
1331         if (img_request->callback)
1332                 img_request->callback(img_request);
1333         else
1334                 rbd_img_request_put(img_request);
1335 }
1336
1337 /* Caller is responsible for rbd_obj_request_destroy(obj_request) */
1338
1339 static int rbd_obj_request_wait(struct rbd_obj_request *obj_request)
1340 {
1341         dout("%s: obj %p\n", __func__, obj_request);
1342
1343         return wait_for_completion_interruptible(&obj_request->completion);
1344 }
1345
1346 /*
1347  * The default/initial value for all image request flags is 0.  Each
1348  * is conditionally set to 1 at image request initialization time
1349  * and currently never change thereafter.
1350  */
1351 static void img_request_write_set(struct rbd_img_request *img_request)
1352 {
1353         set_bit(IMG_REQ_WRITE, &img_request->flags);
1354         smp_mb();
1355 }
1356
1357 static bool img_request_write_test(struct rbd_img_request *img_request)
1358 {
1359         smp_mb();
1360         return test_bit(IMG_REQ_WRITE, &img_request->flags) != 0;
1361 }
1362
1363 static void img_request_child_set(struct rbd_img_request *img_request)
1364 {
1365         set_bit(IMG_REQ_CHILD, &img_request->flags);
1366         smp_mb();
1367 }
1368
1369 static bool img_request_child_test(struct rbd_img_request *img_request)
1370 {
1371         smp_mb();
1372         return test_bit(IMG_REQ_CHILD, &img_request->flags) != 0;
1373 }
1374
1375 static void img_request_layered_set(struct rbd_img_request *img_request)
1376 {
1377         set_bit(IMG_REQ_LAYERED, &img_request->flags);
1378         smp_mb();
1379 }
1380
1381 static bool img_request_layered_test(struct rbd_img_request *img_request)
1382 {
1383         smp_mb();
1384         return test_bit(IMG_REQ_LAYERED, &img_request->flags) != 0;
1385 }
1386
1387 static void
1388 rbd_img_obj_request_read_callback(struct rbd_obj_request *obj_request)
1389 {
1390         u64 xferred = obj_request->xferred;
1391         u64 length = obj_request->length;
1392
1393         dout("%s: obj %p img %p result %d %llu/%llu\n", __func__,
1394                 obj_request, obj_request->img_request, obj_request->result,
1395                 xferred, length);
1396         /*
1397          * ENOENT means a hole in the image.  We zero-fill the
1398          * entire length of the request.  A short read also implies
1399          * zero-fill to the end of the request.  Either way we
1400          * update the xferred count to indicate the whole request
1401          * was satisfied.
1402          */
1403         rbd_assert(obj_request->type != OBJ_REQUEST_NODATA);
1404         if (obj_request->result == -ENOENT) {
1405                 if (obj_request->type == OBJ_REQUEST_BIO)
1406                         zero_bio_chain(obj_request->bio_list, 0);
1407                 else
1408                         zero_pages(obj_request->pages, 0, length);
1409                 obj_request->result = 0;
1410                 obj_request->xferred = length;
1411         } else if (xferred < length && !obj_request->result) {
1412                 if (obj_request->type == OBJ_REQUEST_BIO)
1413                         zero_bio_chain(obj_request->bio_list, xferred);
1414                 else
1415                         zero_pages(obj_request->pages, xferred, length);
1416                 obj_request->xferred = length;
1417         }
1418         obj_request_done_set(obj_request);
1419 }
1420
1421 static void rbd_obj_request_complete(struct rbd_obj_request *obj_request)
1422 {
1423         dout("%s: obj %p cb %p\n", __func__, obj_request,
1424                 obj_request->callback);
1425         if (obj_request->callback)
1426                 obj_request->callback(obj_request);
1427         else
1428                 complete_all(&obj_request->completion);
1429 }
1430
1431 static void rbd_osd_trivial_callback(struct rbd_obj_request *obj_request)
1432 {
1433         dout("%s: obj %p\n", __func__, obj_request);
1434         obj_request_done_set(obj_request);
1435 }
1436
1437 static void rbd_osd_read_callback(struct rbd_obj_request *obj_request)
1438 {
1439         struct rbd_img_request *img_request = NULL;
1440         struct rbd_device *rbd_dev = NULL;
1441         bool layered = false;
1442
1443         if (obj_request_img_data_test(obj_request)) {
1444                 img_request = obj_request->img_request;
1445                 layered = img_request && img_request_layered_test(img_request);
1446                 rbd_dev = img_request->rbd_dev;
1447         }
1448
1449         dout("%s: obj %p img %p result %d %llu/%llu\n", __func__,
1450                 obj_request, img_request, obj_request->result,
1451                 obj_request->xferred, obj_request->length);
1452         if (layered && obj_request->result == -ENOENT &&
1453                         obj_request->img_offset < rbd_dev->parent_overlap)
1454                 rbd_img_parent_read(obj_request);
1455         else if (img_request)
1456                 rbd_img_obj_request_read_callback(obj_request);
1457         else
1458                 obj_request_done_set(obj_request);
1459 }
1460
1461 static void rbd_osd_write_callback(struct rbd_obj_request *obj_request)
1462 {
1463         dout("%s: obj %p result %d %llu\n", __func__, obj_request,
1464                 obj_request->result, obj_request->length);
1465         /*
1466          * There is no such thing as a successful short write.  Set
1467          * it to our originally-requested length.
1468          */
1469         obj_request->xferred = obj_request->length;
1470         obj_request_done_set(obj_request);
1471 }
1472
1473 /*
1474  * For a simple stat call there's nothing to do.  We'll do more if
1475  * this is part of a write sequence for a layered image.
1476  */
1477 static void rbd_osd_stat_callback(struct rbd_obj_request *obj_request)
1478 {
1479         dout("%s: obj %p\n", __func__, obj_request);
1480         obj_request_done_set(obj_request);
1481 }
1482
1483 static void rbd_osd_req_callback(struct ceph_osd_request *osd_req,
1484                                 struct ceph_msg *msg)
1485 {
1486         struct rbd_obj_request *obj_request = osd_req->r_priv;
1487         u16 opcode;
1488
1489         dout("%s: osd_req %p msg %p\n", __func__, osd_req, msg);
1490         rbd_assert(osd_req == obj_request->osd_req);
1491         if (obj_request_img_data_test(obj_request)) {
1492                 rbd_assert(obj_request->img_request);
1493                 rbd_assert(obj_request->which != BAD_WHICH);
1494         } else {
1495                 rbd_assert(obj_request->which == BAD_WHICH);
1496         }
1497
1498         if (osd_req->r_result < 0)
1499                 obj_request->result = osd_req->r_result;
1500         obj_request->version = le64_to_cpu(osd_req->r_reassert_version.version);
1501
1502         BUG_ON(osd_req->r_num_ops > 2);
1503
1504         /*
1505          * We support a 64-bit length, but ultimately it has to be
1506          * passed to blk_end_request(), which takes an unsigned int.
1507          */
1508         obj_request->xferred = osd_req->r_reply_op_len[0];
1509         rbd_assert(obj_request->xferred < (u64)UINT_MAX);
1510         opcode = osd_req->r_ops[0].op;
1511         switch (opcode) {
1512         case CEPH_OSD_OP_READ:
1513                 rbd_osd_read_callback(obj_request);
1514                 break;
1515         case CEPH_OSD_OP_WRITE:
1516                 rbd_osd_write_callback(obj_request);
1517                 break;
1518         case CEPH_OSD_OP_STAT:
1519                 rbd_osd_stat_callback(obj_request);
1520                 break;
1521         case CEPH_OSD_OP_CALL:
1522         case CEPH_OSD_OP_NOTIFY_ACK:
1523         case CEPH_OSD_OP_WATCH:
1524                 rbd_osd_trivial_callback(obj_request);
1525                 break;
1526         default:
1527                 rbd_warn(NULL, "%s: unsupported op %hu\n",
1528                         obj_request->object_name, (unsigned short) opcode);
1529                 break;
1530         }
1531
1532         if (obj_request_done_test(obj_request))
1533                 rbd_obj_request_complete(obj_request);
1534 }
1535
1536 static void rbd_osd_req_format_read(struct rbd_obj_request *obj_request)
1537 {
1538         struct rbd_img_request *img_request = obj_request->img_request;
1539         struct ceph_osd_request *osd_req = obj_request->osd_req;
1540         u64 snap_id;
1541
1542         rbd_assert(osd_req != NULL);
1543
1544         snap_id = img_request ? img_request->snap_id : CEPH_NOSNAP;
1545         ceph_osdc_build_request(osd_req, obj_request->offset,
1546                         NULL, snap_id, NULL);
1547 }
1548
1549 static void rbd_osd_req_format_write(struct rbd_obj_request *obj_request)
1550 {
1551         struct rbd_img_request *img_request = obj_request->img_request;
1552         struct ceph_osd_request *osd_req = obj_request->osd_req;
1553         struct ceph_snap_context *snapc;
1554         struct timespec mtime = CURRENT_TIME;
1555
1556         rbd_assert(osd_req != NULL);
1557
1558         snapc = img_request ? img_request->snapc : NULL;
1559         ceph_osdc_build_request(osd_req, obj_request->offset,
1560                         snapc, CEPH_NOSNAP, &mtime);
1561 }
1562
1563 static struct ceph_osd_request *rbd_osd_req_create(
1564                                         struct rbd_device *rbd_dev,
1565                                         bool write_request,
1566                                         struct rbd_obj_request *obj_request)
1567 {
1568         struct ceph_snap_context *snapc = NULL;
1569         struct ceph_osd_client *osdc;
1570         struct ceph_osd_request *osd_req;
1571
1572         if (obj_request_img_data_test(obj_request)) {
1573                 struct rbd_img_request *img_request = obj_request->img_request;
1574
1575                 rbd_assert(write_request ==
1576                                 img_request_write_test(img_request));
1577                 if (write_request)
1578                         snapc = img_request->snapc;
1579         }
1580
1581         /* Allocate and initialize the request, for the single op */
1582
1583         osdc = &rbd_dev->rbd_client->client->osdc;
1584         osd_req = ceph_osdc_alloc_request(osdc, snapc, 1, false, GFP_ATOMIC);
1585         if (!osd_req)
1586                 return NULL;    /* ENOMEM */
1587
1588         if (write_request)
1589                 osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
1590         else
1591                 osd_req->r_flags = CEPH_OSD_FLAG_READ;
1592
1593         osd_req->r_callback = rbd_osd_req_callback;
1594         osd_req->r_priv = obj_request;
1595
1596         osd_req->r_oid_len = strlen(obj_request->object_name);
1597         rbd_assert(osd_req->r_oid_len < sizeof (osd_req->r_oid));
1598         memcpy(osd_req->r_oid, obj_request->object_name, osd_req->r_oid_len);
1599
1600         osd_req->r_file_layout = rbd_dev->layout;       /* struct */
1601
1602         return osd_req;
1603 }
1604
1605 /*
1606  * Create a copyup osd request based on the information in the
1607  * object request supplied.  A copyup request has two osd ops,
1608  * a copyup method call, and a "normal" write request.
1609  */
1610 static struct ceph_osd_request *
1611 rbd_osd_req_create_copyup(struct rbd_obj_request *obj_request)
1612 {
1613         struct rbd_img_request *img_request;
1614         struct ceph_snap_context *snapc;
1615         struct rbd_device *rbd_dev;
1616         struct ceph_osd_client *osdc;
1617         struct ceph_osd_request *osd_req;
1618
1619         rbd_assert(obj_request_img_data_test(obj_request));
1620         img_request = obj_request->img_request;
1621         rbd_assert(img_request);
1622         rbd_assert(img_request_write_test(img_request));
1623
1624         /* Allocate and initialize the request, for the two ops */
1625
1626         snapc = img_request->snapc;
1627         rbd_dev = img_request->rbd_dev;
1628         osdc = &rbd_dev->rbd_client->client->osdc;
1629         osd_req = ceph_osdc_alloc_request(osdc, snapc, 2, false, GFP_ATOMIC);
1630         if (!osd_req)
1631                 return NULL;    /* ENOMEM */
1632
1633         osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
1634         osd_req->r_callback = rbd_osd_req_callback;
1635         osd_req->r_priv = obj_request;
1636
1637         osd_req->r_oid_len = strlen(obj_request->object_name);
1638         rbd_assert(osd_req->r_oid_len < sizeof (osd_req->r_oid));
1639         memcpy(osd_req->r_oid, obj_request->object_name, osd_req->r_oid_len);
1640
1641         osd_req->r_file_layout = rbd_dev->layout;       /* struct */
1642
1643         return osd_req;
1644 }
1645
1646
1647 static void rbd_osd_req_destroy(struct ceph_osd_request *osd_req)
1648 {
1649         ceph_osdc_put_request(osd_req);
1650 }
1651
1652 /* object_name is assumed to be a non-null pointer and NUL-terminated */
1653
1654 static struct rbd_obj_request *rbd_obj_request_create(const char *object_name,
1655                                                 u64 offset, u64 length,
1656                                                 enum obj_request_type type)
1657 {
1658         struct rbd_obj_request *obj_request;
1659         size_t size;
1660         char *name;
1661
1662         rbd_assert(obj_request_type_valid(type));
1663
1664         size = strlen(object_name) + 1;
1665         obj_request = kzalloc(sizeof (*obj_request) + size, GFP_KERNEL);
1666         if (!obj_request)
1667                 return NULL;
1668
1669         name = (char *)(obj_request + 1);
1670         obj_request->object_name = memcpy(name, object_name, size);
1671         obj_request->offset = offset;
1672         obj_request->length = length;
1673         obj_request->flags = 0;
1674         obj_request->which = BAD_WHICH;
1675         obj_request->type = type;
1676         INIT_LIST_HEAD(&obj_request->links);
1677         init_completion(&obj_request->completion);
1678         kref_init(&obj_request->kref);
1679
1680         dout("%s: \"%s\" %llu/%llu %d -> obj %p\n", __func__, object_name,
1681                 offset, length, (int)type, obj_request);
1682
1683         return obj_request;
1684 }
1685
1686 static void rbd_obj_request_destroy(struct kref *kref)
1687 {
1688         struct rbd_obj_request *obj_request;
1689
1690         obj_request = container_of(kref, struct rbd_obj_request, kref);
1691
1692         dout("%s: obj %p\n", __func__, obj_request);
1693
1694         rbd_assert(obj_request->img_request == NULL);
1695         rbd_assert(obj_request->which == BAD_WHICH);
1696
1697         if (obj_request->osd_req)
1698                 rbd_osd_req_destroy(obj_request->osd_req);
1699
1700         rbd_assert(obj_request_type_valid(obj_request->type));
1701         switch (obj_request->type) {
1702         case OBJ_REQUEST_NODATA:
1703                 break;          /* Nothing to do */
1704         case OBJ_REQUEST_BIO:
1705                 if (obj_request->bio_list)
1706                         bio_chain_put(obj_request->bio_list);
1707                 break;
1708         case OBJ_REQUEST_PAGES:
1709                 if (obj_request->pages)
1710                         ceph_release_page_vector(obj_request->pages,
1711                                                 obj_request->page_count);
1712                 break;
1713         }
1714
1715         kfree(obj_request);
1716 }
1717
1718 /*
1719  * Caller is responsible for filling in the list of object requests
1720  * that comprises the image request, and the Linux request pointer
1721  * (if there is one).
1722  */
1723 static struct rbd_img_request *rbd_img_request_create(
1724                                         struct rbd_device *rbd_dev,
1725                                         u64 offset, u64 length,
1726                                         bool write_request,
1727                                         bool child_request)
1728 {
1729         struct rbd_img_request *img_request;
1730         struct ceph_snap_context *snapc = NULL;
1731
1732         img_request = kmalloc(sizeof (*img_request), GFP_ATOMIC);
1733         if (!img_request)
1734                 return NULL;
1735
1736         if (write_request) {
1737                 down_read(&rbd_dev->header_rwsem);
1738                 snapc = ceph_get_snap_context(rbd_dev->header.snapc);
1739                 up_read(&rbd_dev->header_rwsem);
1740                 if (WARN_ON(!snapc)) {
1741                         kfree(img_request);
1742                         return NULL;    /* Shouldn't happen */
1743                 }
1744
1745         }
1746
1747         img_request->rq = NULL;
1748         img_request->rbd_dev = rbd_dev;
1749         img_request->offset = offset;
1750         img_request->length = length;
1751         img_request->flags = 0;
1752         if (write_request) {
1753                 img_request_write_set(img_request);
1754                 img_request->snapc = snapc;
1755         } else {
1756                 img_request->snap_id = rbd_dev->spec->snap_id;
1757         }
1758         if (child_request)
1759                 img_request_child_set(img_request);
1760         if (rbd_dev->parent_spec)
1761                 img_request_layered_set(img_request);
1762         spin_lock_init(&img_request->completion_lock);
1763         img_request->next_completion = 0;
1764         img_request->callback = NULL;
1765         img_request->result = 0;
1766         img_request->obj_request_count = 0;
1767         INIT_LIST_HEAD(&img_request->obj_requests);
1768         kref_init(&img_request->kref);
1769
1770         rbd_img_request_get(img_request);       /* Avoid a warning */
1771         rbd_img_request_put(img_request);       /* TEMPORARY */
1772
1773         dout("%s: rbd_dev %p %s %llu/%llu -> img %p\n", __func__, rbd_dev,
1774                 write_request ? "write" : "read", offset, length,
1775                 img_request);
1776
1777         return img_request;
1778 }
1779
1780 static void rbd_img_request_destroy(struct kref *kref)
1781 {
1782         struct rbd_img_request *img_request;
1783         struct rbd_obj_request *obj_request;
1784         struct rbd_obj_request *next_obj_request;
1785
1786         img_request = container_of(kref, struct rbd_img_request, kref);
1787
1788         dout("%s: img %p\n", __func__, img_request);
1789
1790         for_each_obj_request_safe(img_request, obj_request, next_obj_request)
1791                 rbd_img_obj_request_del(img_request, obj_request);
1792         rbd_assert(img_request->obj_request_count == 0);
1793
1794         if (img_request_write_test(img_request))
1795                 ceph_put_snap_context(img_request->snapc);
1796
1797         if (img_request_child_test(img_request))
1798                 rbd_obj_request_put(img_request->obj_request);
1799
1800         kfree(img_request);
1801 }
1802
1803 static bool rbd_img_obj_end_request(struct rbd_obj_request *obj_request)
1804 {
1805         struct rbd_img_request *img_request;
1806         unsigned int xferred;
1807         int result;
1808         bool more;
1809
1810         rbd_assert(obj_request_img_data_test(obj_request));
1811         img_request = obj_request->img_request;
1812
1813         rbd_assert(obj_request->xferred <= (u64)UINT_MAX);
1814         xferred = (unsigned int)obj_request->xferred;
1815         result = obj_request->result;
1816         if (result) {
1817                 struct rbd_device *rbd_dev = img_request->rbd_dev;
1818
1819                 rbd_warn(rbd_dev, "%s %llx at %llx (%llx)\n",
1820                         img_request_write_test(img_request) ? "write" : "read",
1821                         obj_request->length, obj_request->img_offset,
1822                         obj_request->offset);
1823                 rbd_warn(rbd_dev, "  result %d xferred %x\n",
1824                         result, xferred);
1825                 if (!img_request->result)
1826                         img_request->result = result;
1827         }
1828
1829         /* Image object requests don't own their page array */
1830
1831         if (obj_request->type == OBJ_REQUEST_PAGES) {
1832                 obj_request->pages = NULL;
1833                 obj_request->page_count = 0;
1834         }
1835
1836         if (img_request_child_test(img_request)) {
1837                 rbd_assert(img_request->obj_request != NULL);
1838                 more = obj_request->which < img_request->obj_request_count - 1;
1839         } else {
1840                 rbd_assert(img_request->rq != NULL);
1841                 more = blk_end_request(img_request->rq, result, xferred);
1842         }
1843
1844         return more;
1845 }
1846
1847 static void rbd_img_obj_callback(struct rbd_obj_request *obj_request)
1848 {
1849         struct rbd_img_request *img_request;
1850         u32 which = obj_request->which;
1851         bool more = true;
1852
1853         rbd_assert(obj_request_img_data_test(obj_request));
1854         img_request = obj_request->img_request;
1855
1856         dout("%s: img %p obj %p\n", __func__, img_request, obj_request);
1857         rbd_assert(img_request != NULL);
1858         rbd_assert(img_request->obj_request_count > 0);
1859         rbd_assert(which != BAD_WHICH);
1860         rbd_assert(which < img_request->obj_request_count);
1861         rbd_assert(which >= img_request->next_completion);
1862
1863         spin_lock_irq(&img_request->completion_lock);
1864         if (which != img_request->next_completion)
1865                 goto out;
1866
1867         for_each_obj_request_from(img_request, obj_request) {
1868                 rbd_assert(more);
1869                 rbd_assert(which < img_request->obj_request_count);
1870
1871                 if (!obj_request_done_test(obj_request))
1872                         break;
1873                 more = rbd_img_obj_end_request(obj_request);
1874                 which++;
1875         }
1876
1877         rbd_assert(more ^ (which == img_request->obj_request_count));
1878         img_request->next_completion = which;
1879 out:
1880         spin_unlock_irq(&img_request->completion_lock);
1881
1882         if (!more)
1883                 rbd_img_request_complete(img_request);
1884 }
1885
1886 /*
1887  * Split up an image request into one or more object requests, each
1888  * to a different object.  The "type" parameter indicates whether
1889  * "data_desc" is the pointer to the head of a list of bio
1890  * structures, or the base of a page array.  In either case this
1891  * function assumes data_desc describes memory sufficient to hold
1892  * all data described by the image request.
1893  */
1894 static int rbd_img_request_fill(struct rbd_img_request *img_request,
1895                                         enum obj_request_type type,
1896                                         void *data_desc)
1897 {
1898         struct rbd_device *rbd_dev = img_request->rbd_dev;
1899         struct rbd_obj_request *obj_request = NULL;
1900         struct rbd_obj_request *next_obj_request;
1901         bool write_request = img_request_write_test(img_request);
1902         struct bio *bio_list;
1903         unsigned int bio_offset = 0;
1904         struct page **pages;
1905         u64 img_offset;
1906         u64 resid;
1907         u16 opcode;
1908
1909         dout("%s: img %p type %d data_desc %p\n", __func__, img_request,
1910                 (int)type, data_desc);
1911
1912         opcode = write_request ? CEPH_OSD_OP_WRITE : CEPH_OSD_OP_READ;
1913         img_offset = img_request->offset;
1914         resid = img_request->length;
1915         rbd_assert(resid > 0);
1916
1917         if (type == OBJ_REQUEST_BIO) {
1918                 bio_list = data_desc;
1919                 rbd_assert(img_offset == bio_list->bi_sector << SECTOR_SHIFT);
1920         } else {
1921                 rbd_assert(type == OBJ_REQUEST_PAGES);
1922                 pages = data_desc;
1923         }
1924
1925         while (resid) {
1926                 struct ceph_osd_request *osd_req;
1927                 const char *object_name;
1928                 u64 offset;
1929                 u64 length;
1930
1931                 object_name = rbd_segment_name(rbd_dev, img_offset);
1932                 if (!object_name)
1933                         goto out_unwind;
1934                 offset = rbd_segment_offset(rbd_dev, img_offset);
1935                 length = rbd_segment_length(rbd_dev, img_offset, resid);
1936                 obj_request = rbd_obj_request_create(object_name,
1937                                                 offset, length, type);
1938                 kfree(object_name);     /* object request has its own copy */
1939                 if (!obj_request)
1940                         goto out_unwind;
1941
1942                 if (type == OBJ_REQUEST_BIO) {
1943                         unsigned int clone_size;
1944
1945                         rbd_assert(length <= (u64)UINT_MAX);
1946                         clone_size = (unsigned int)length;
1947                         obj_request->bio_list =
1948                                         bio_chain_clone_range(&bio_list,
1949                                                                 &bio_offset,
1950                                                                 clone_size,
1951                                                                 GFP_ATOMIC);
1952                         if (!obj_request->bio_list)
1953                                 goto out_partial;
1954                 } else {
1955                         unsigned int page_count;
1956
1957                         obj_request->pages = pages;
1958                         page_count = (u32)calc_pages_for(offset, length);
1959                         obj_request->page_count = page_count;
1960                         if ((offset + length) & ~PAGE_MASK)
1961                                 page_count--;   /* more on last page */
1962                         pages += page_count;
1963                 }
1964
1965                 osd_req = rbd_osd_req_create(rbd_dev, write_request,
1966                                                 obj_request);
1967                 if (!osd_req)
1968                         goto out_partial;
1969                 obj_request->osd_req = osd_req;
1970                 obj_request->callback = rbd_img_obj_callback;
1971
1972                 osd_req_op_extent_init(osd_req, 0, opcode, offset, length,
1973                                                 0, 0);
1974                 if (type == OBJ_REQUEST_BIO)
1975                         osd_req_op_extent_osd_data_bio(osd_req, 0,
1976                                         obj_request->bio_list, length);
1977                 else
1978                         osd_req_op_extent_osd_data_pages(osd_req, 0,
1979                                         obj_request->pages, length,
1980                                         offset & ~PAGE_MASK, false, false);
1981
1982                 if (write_request)
1983                         rbd_osd_req_format_write(obj_request);
1984                 else
1985                         rbd_osd_req_format_read(obj_request);
1986
1987                 obj_request->img_offset = img_offset;
1988                 rbd_img_obj_request_add(img_request, obj_request);
1989
1990                 img_offset += length;
1991                 resid -= length;
1992         }
1993
1994         return 0;
1995
1996 out_partial:
1997         rbd_obj_request_put(obj_request);
1998 out_unwind:
1999         for_each_obj_request_safe(img_request, obj_request, next_obj_request)
2000                 rbd_obj_request_put(obj_request);
2001
2002         return -ENOMEM;
2003 }
2004
2005 static void
2006 rbd_img_obj_copyup_callback(struct rbd_obj_request *obj_request)
2007 {
2008         struct rbd_img_request *img_request;
2009         struct rbd_device *rbd_dev;
2010         u64 length;
2011         u32 page_count;
2012
2013         rbd_assert(obj_request->type == OBJ_REQUEST_BIO);
2014         rbd_assert(obj_request_img_data_test(obj_request));
2015         img_request = obj_request->img_request;
2016         rbd_assert(img_request);
2017
2018         rbd_dev = img_request->rbd_dev;
2019         rbd_assert(rbd_dev);
2020         length = (u64)1 << rbd_dev->header.obj_order;
2021         page_count = (u32)calc_pages_for(0, length);
2022
2023         rbd_assert(obj_request->copyup_pages);
2024         ceph_release_page_vector(obj_request->copyup_pages, page_count);
2025         obj_request->copyup_pages = NULL;
2026
2027         /*
2028          * We want the transfer count to reflect the size of the
2029          * original write request.  There is no such thing as a
2030          * successful short write, so if the request was successful
2031          * we can just set it to the originally-requested length.
2032          */
2033         if (!obj_request->result)
2034                 obj_request->xferred = obj_request->length;
2035
2036         /* Finish up with the normal image object callback */
2037
2038         rbd_img_obj_callback(obj_request);
2039 }
2040
2041 static void
2042 rbd_img_obj_parent_read_full_callback(struct rbd_img_request *img_request)
2043 {
2044         struct rbd_obj_request *orig_request;
2045         struct ceph_osd_request *osd_req;
2046         struct ceph_osd_client *osdc;
2047         struct rbd_device *rbd_dev;
2048         struct page **pages;
2049         int result;
2050         u64 obj_size;
2051         u64 xferred;
2052
2053         rbd_assert(img_request_child_test(img_request));
2054
2055         /* First get what we need from the image request */
2056
2057         pages = img_request->copyup_pages;
2058         rbd_assert(pages != NULL);
2059         img_request->copyup_pages = NULL;
2060
2061         orig_request = img_request->obj_request;
2062         rbd_assert(orig_request != NULL);
2063         rbd_assert(orig_request->type == OBJ_REQUEST_BIO);
2064         result = img_request->result;
2065         obj_size = img_request->length;
2066         xferred = img_request->xferred;
2067
2068         rbd_dev = img_request->rbd_dev;
2069         rbd_assert(rbd_dev);
2070         rbd_assert(obj_size == (u64)1 << rbd_dev->header.obj_order);
2071
2072         rbd_img_request_put(img_request);
2073
2074         if (result)
2075                 goto out_err;
2076
2077         /* Allocate the new copyup osd request for the original request */
2078
2079         result = -ENOMEM;
2080         rbd_assert(!orig_request->osd_req);
2081         osd_req = rbd_osd_req_create_copyup(orig_request);
2082         if (!osd_req)
2083                 goto out_err;
2084         orig_request->osd_req = osd_req;
2085         orig_request->copyup_pages = pages;
2086
2087         /* Initialize the copyup op */
2088
2089         osd_req_op_cls_init(osd_req, 0, CEPH_OSD_OP_CALL, "rbd", "copyup");
2090         osd_req_op_cls_request_data_pages(osd_req, 0, pages, obj_size, 0,
2091                                                 false, false);
2092
2093         /* Then the original write request op */
2094
2095         osd_req_op_extent_init(osd_req, 1, CEPH_OSD_OP_WRITE,
2096                                         orig_request->offset,
2097                                         orig_request->length, 0, 0);
2098         osd_req_op_extent_osd_data_bio(osd_req, 1, orig_request->bio_list,
2099                                         orig_request->length);
2100
2101         rbd_osd_req_format_write(orig_request);
2102
2103         /* All set, send it off. */
2104
2105         orig_request->callback = rbd_img_obj_copyup_callback;
2106         osdc = &rbd_dev->rbd_client->client->osdc;
2107         result = rbd_obj_request_submit(osdc, orig_request);
2108         if (!result)
2109                 return;
2110 out_err:
2111         /* Record the error code and complete the request */
2112
2113         orig_request->result = result;
2114         orig_request->xferred = 0;
2115         obj_request_done_set(orig_request);
2116         rbd_obj_request_complete(orig_request);
2117 }
2118
2119 /*
2120  * Read from the parent image the range of data that covers the
2121  * entire target of the given object request.  This is used for
2122  * satisfying a layered image write request when the target of an
2123  * object request from the image request does not exist.
2124  *
2125  * A page array big enough to hold the returned data is allocated
2126  * and supplied to rbd_img_request_fill() as the "data descriptor."
2127  * When the read completes, this page array will be transferred to
2128  * the original object request for the copyup operation.
2129  *
2130  * If an error occurs, record it as the result of the original
2131  * object request and mark it done so it gets completed.
2132  */
2133 static int rbd_img_obj_parent_read_full(struct rbd_obj_request *obj_request)
2134 {
2135         struct rbd_img_request *img_request = NULL;
2136         struct rbd_img_request *parent_request = NULL;
2137         struct rbd_device *rbd_dev;
2138         u64 img_offset;
2139         u64 length;
2140         struct page **pages = NULL;
2141         u32 page_count;
2142         int result;
2143
2144         rbd_assert(obj_request_img_data_test(obj_request));
2145         rbd_assert(obj_request->type == OBJ_REQUEST_BIO);
2146
2147         img_request = obj_request->img_request;
2148         rbd_assert(img_request != NULL);
2149         rbd_dev = img_request->rbd_dev;
2150         rbd_assert(rbd_dev->parent != NULL);
2151
2152         /*
2153          * First things first.  The original osd request is of no
2154          * use to use any more, we'll need a new one that can hold
2155          * the two ops in a copyup request.  We'll get that later,
2156          * but for now we can release the old one.
2157          */
2158         rbd_osd_req_destroy(obj_request->osd_req);
2159         obj_request->osd_req = NULL;
2160
2161         /*
2162          * Determine the byte range covered by the object in the
2163          * child image to which the original request was to be sent.
2164          */
2165         img_offset = obj_request->img_offset - obj_request->offset;
2166         length = (u64)1 << rbd_dev->header.obj_order;
2167
2168         /*
2169          * There is no defined parent data beyond the parent
2170          * overlap, so limit what we read at that boundary if
2171          * necessary.
2172          */
2173         if (img_offset + length > rbd_dev->parent_overlap) {
2174                 rbd_assert(img_offset < rbd_dev->parent_overlap);
2175                 length = rbd_dev->parent_overlap - img_offset;
2176         }
2177
2178         /*
2179          * Allocate a page array big enough to receive the data read
2180          * from the parent.
2181          */
2182         page_count = (u32)calc_pages_for(0, length);
2183         pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
2184         if (IS_ERR(pages)) {
2185                 result = PTR_ERR(pages);
2186                 pages = NULL;
2187                 goto out_err;
2188         }
2189
2190         result = -ENOMEM;
2191         parent_request = rbd_img_request_create(rbd_dev->parent,
2192                                                 img_offset, length,
2193                                                 false, true);
2194         if (!parent_request)
2195                 goto out_err;
2196         rbd_obj_request_get(obj_request);
2197         parent_request->obj_request = obj_request;
2198
2199         result = rbd_img_request_fill(parent_request, OBJ_REQUEST_PAGES, pages);
2200         if (result)
2201                 goto out_err;
2202         parent_request->copyup_pages = pages;
2203
2204         parent_request->callback = rbd_img_obj_parent_read_full_callback;
2205         result = rbd_img_request_submit(parent_request);
2206         if (!result)
2207                 return 0;
2208
2209         parent_request->copyup_pages = NULL;
2210         parent_request->obj_request = NULL;
2211         rbd_obj_request_put(obj_request);
2212 out_err:
2213         if (pages)
2214                 ceph_release_page_vector(pages, page_count);
2215         if (parent_request)
2216                 rbd_img_request_put(parent_request);
2217         obj_request->result = result;
2218         obj_request->xferred = 0;
2219         obj_request_done_set(obj_request);
2220
2221         return result;
2222 }
2223
2224 static void rbd_img_obj_exists_callback(struct rbd_obj_request *obj_request)
2225 {
2226         struct rbd_obj_request *orig_request;
2227         int result;
2228
2229         rbd_assert(!obj_request_img_data_test(obj_request));
2230
2231         /*
2232          * All we need from the object request is the original
2233          * request and the result of the STAT op.  Grab those, then
2234          * we're done with the request.
2235          */
2236         orig_request = obj_request->obj_request;
2237         obj_request->obj_request = NULL;
2238         rbd_assert(orig_request);
2239         rbd_assert(orig_request->img_request);
2240
2241         result = obj_request->result;
2242         obj_request->result = 0;
2243
2244         dout("%s: obj %p for obj %p result %d %llu/%llu\n", __func__,
2245                 obj_request, orig_request, result,
2246                 obj_request->xferred, obj_request->length);
2247         rbd_obj_request_put(obj_request);
2248
2249         rbd_assert(orig_request);
2250         rbd_assert(orig_request->img_request);
2251
2252         /*
2253          * Our only purpose here is to determine whether the object
2254          * exists, and we don't want to treat the non-existence as
2255          * an error.  If something else comes back, transfer the
2256          * error to the original request and complete it now.
2257          */
2258         if (!result) {
2259                 obj_request_existence_set(orig_request, true);
2260         } else if (result == -ENOENT) {
2261                 obj_request_existence_set(orig_request, false);
2262         } else if (result) {
2263                 orig_request->result = result;
2264                 goto out;
2265         }
2266
2267         /*
2268          * Resubmit the original request now that we have recorded
2269          * whether the target object exists.
2270          */
2271         orig_request->result = rbd_img_obj_request_submit(orig_request);
2272 out:
2273         if (orig_request->result)
2274                 rbd_obj_request_complete(orig_request);
2275         rbd_obj_request_put(orig_request);
2276 }
2277
2278 static int rbd_img_obj_exists_submit(struct rbd_obj_request *obj_request)
2279 {
2280         struct rbd_obj_request *stat_request;
2281         struct rbd_device *rbd_dev;
2282         struct ceph_osd_client *osdc;
2283         struct page **pages = NULL;
2284         u32 page_count;
2285         size_t size;
2286         int ret;
2287
2288         /*
2289          * The response data for a STAT call consists of:
2290          *     le64 length;
2291          *     struct {
2292          *         le32 tv_sec;
2293          *         le32 tv_nsec;
2294          *     } mtime;
2295          */
2296         size = sizeof (__le64) + sizeof (__le32) + sizeof (__le32);
2297         page_count = (u32)calc_pages_for(0, size);
2298         pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
2299         if (IS_ERR(pages))
2300                 return PTR_ERR(pages);
2301
2302         ret = -ENOMEM;
2303         stat_request = rbd_obj_request_create(obj_request->object_name, 0, 0,
2304                                                         OBJ_REQUEST_PAGES);
2305         if (!stat_request)
2306                 goto out;
2307
2308         rbd_obj_request_get(obj_request);
2309         stat_request->obj_request = obj_request;
2310         stat_request->pages = pages;
2311         stat_request->page_count = page_count;
2312
2313         rbd_assert(obj_request->img_request);
2314         rbd_dev = obj_request->img_request->rbd_dev;
2315         stat_request->osd_req = rbd_osd_req_create(rbd_dev, false,
2316                                                 stat_request);
2317         if (!stat_request->osd_req)
2318                 goto out;
2319         stat_request->callback = rbd_img_obj_exists_callback;
2320
2321         osd_req_op_init(stat_request->osd_req, 0, CEPH_OSD_OP_STAT);
2322         osd_req_op_raw_data_in_pages(stat_request->osd_req, 0, pages, size, 0,
2323                                         false, false);
2324         rbd_osd_req_format_read(stat_request);
2325
2326         osdc = &rbd_dev->rbd_client->client->osdc;
2327         ret = rbd_obj_request_submit(osdc, stat_request);
2328 out:
2329         if (ret)
2330                 rbd_obj_request_put(obj_request);
2331
2332         return ret;
2333 }
2334
2335 static int rbd_img_obj_request_submit(struct rbd_obj_request *obj_request)
2336 {
2337         struct rbd_img_request *img_request;
2338         struct rbd_device *rbd_dev;
2339         bool known;
2340
2341         rbd_assert(obj_request_img_data_test(obj_request));
2342
2343         img_request = obj_request->img_request;
2344         rbd_assert(img_request);
2345         rbd_dev = img_request->rbd_dev;
2346
2347         /*
2348          * Only writes to layered images need special handling.
2349          * Reads and non-layered writes are simple object requests.
2350          * Layered writes that start beyond the end of the overlap
2351          * with the parent have no parent data, so they too are
2352          * simple object requests.  Finally, if the target object is
2353          * known to already exist, its parent data has already been
2354          * copied, so a write to the object can also be handled as a
2355          * simple object request.
2356          */
2357         if (!img_request_write_test(img_request) ||
2358                 !img_request_layered_test(img_request) ||
2359                 rbd_dev->parent_overlap <= obj_request->img_offset ||
2360                 ((known = obj_request_known_test(obj_request)) &&
2361                         obj_request_exists_test(obj_request))) {
2362
2363                 struct rbd_device *rbd_dev;
2364                 struct ceph_osd_client *osdc;
2365
2366                 rbd_dev = obj_request->img_request->rbd_dev;
2367                 osdc = &rbd_dev->rbd_client->client->osdc;
2368
2369                 return rbd_obj_request_submit(osdc, obj_request);
2370         }
2371
2372         /*
2373          * It's a layered write.  The target object might exist but
2374          * we may not know that yet.  If we know it doesn't exist,
2375          * start by reading the data for the full target object from
2376          * the parent so we can use it for a copyup to the target.
2377          */
2378         if (known)
2379                 return rbd_img_obj_parent_read_full(obj_request);
2380
2381         /* We don't know whether the target exists.  Go find out. */
2382
2383         return rbd_img_obj_exists_submit(obj_request);
2384 }
2385
2386 static int rbd_img_request_submit(struct rbd_img_request *img_request)
2387 {
2388         struct rbd_obj_request *obj_request;
2389         struct rbd_obj_request *next_obj_request;
2390
2391         dout("%s: img %p\n", __func__, img_request);
2392         for_each_obj_request_safe(img_request, obj_request, next_obj_request) {
2393                 int ret;
2394
2395                 ret = rbd_img_obj_request_submit(obj_request);
2396                 if (ret)
2397                         return ret;
2398         }
2399
2400         return 0;
2401 }
2402
2403 static void rbd_img_parent_read_callback(struct rbd_img_request *img_request)
2404 {
2405         struct rbd_obj_request *obj_request;
2406         struct rbd_device *rbd_dev;
2407         u64 obj_end;
2408
2409         rbd_assert(img_request_child_test(img_request));
2410
2411         obj_request = img_request->obj_request;
2412         rbd_assert(obj_request);
2413         rbd_assert(obj_request->img_request);
2414
2415         obj_request->result = img_request->result;
2416         if (obj_request->result)
2417                 goto out;
2418
2419         /*
2420          * We need to zero anything beyond the parent overlap
2421          * boundary.  Since rbd_img_obj_request_read_callback()
2422          * will zero anything beyond the end of a short read, an
2423          * easy way to do this is to pretend the data from the
2424          * parent came up short--ending at the overlap boundary.
2425          */
2426         rbd_assert(obj_request->img_offset < U64_MAX - obj_request->length);
2427         obj_end = obj_request->img_offset + obj_request->length;
2428         rbd_dev = obj_request->img_request->rbd_dev;
2429         if (obj_end > rbd_dev->parent_overlap) {
2430                 u64 xferred = 0;
2431
2432                 if (obj_request->img_offset < rbd_dev->parent_overlap)
2433                         xferred = rbd_dev->parent_overlap -
2434                                         obj_request->img_offset;
2435
2436                 obj_request->xferred = min(img_request->xferred, xferred);
2437         } else {
2438                 obj_request->xferred = img_request->xferred;
2439         }
2440 out:
2441         rbd_img_obj_request_read_callback(obj_request);
2442         rbd_obj_request_complete(obj_request);
2443 }
2444
2445 static void rbd_img_parent_read(struct rbd_obj_request *obj_request)
2446 {
2447         struct rbd_device *rbd_dev;
2448         struct rbd_img_request *img_request;
2449         int result;
2450
2451         rbd_assert(obj_request_img_data_test(obj_request));
2452         rbd_assert(obj_request->img_request != NULL);
2453         rbd_assert(obj_request->result == (s32) -ENOENT);
2454         rbd_assert(obj_request->type == OBJ_REQUEST_BIO);
2455
2456         rbd_dev = obj_request->img_request->rbd_dev;
2457         rbd_assert(rbd_dev->parent != NULL);
2458         /* rbd_read_finish(obj_request, obj_request->length); */
2459         img_request = rbd_img_request_create(rbd_dev->parent,
2460                                                 obj_request->img_offset,
2461                                                 obj_request->length,
2462                                                 false, true);
2463         result = -ENOMEM;
2464         if (!img_request)
2465                 goto out_err;
2466
2467         rbd_obj_request_get(obj_request);
2468         img_request->obj_request = obj_request;
2469
2470         result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO,
2471                                         obj_request->bio_list);
2472         if (result)
2473                 goto out_err;
2474
2475         img_request->callback = rbd_img_parent_read_callback;
2476         result = rbd_img_request_submit(img_request);
2477         if (result)
2478                 goto out_err;
2479
2480         return;
2481 out_err:
2482         if (img_request)
2483                 rbd_img_request_put(img_request);
2484         obj_request->result = result;
2485         obj_request->xferred = 0;
2486         obj_request_done_set(obj_request);
2487 }
2488
2489 static int rbd_obj_notify_ack(struct rbd_device *rbd_dev,
2490                                    u64 ver, u64 notify_id)
2491 {
2492         struct rbd_obj_request *obj_request;
2493         struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
2494         int ret;
2495
2496         obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0,
2497                                                         OBJ_REQUEST_NODATA);
2498         if (!obj_request)
2499                 return -ENOMEM;
2500
2501         ret = -ENOMEM;
2502         obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, obj_request);
2503         if (!obj_request->osd_req)
2504                 goto out;
2505         obj_request->callback = rbd_obj_request_put;
2506
2507         osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_NOTIFY_ACK,
2508                                         notify_id, ver, 0);
2509         rbd_osd_req_format_read(obj_request);
2510
2511         ret = rbd_obj_request_submit(osdc, obj_request);
2512 out:
2513         if (ret)
2514                 rbd_obj_request_put(obj_request);
2515
2516         return ret;
2517 }
2518
2519 static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
2520 {
2521         struct rbd_device *rbd_dev = (struct rbd_device *)data;
2522         u64 hver;
2523         int rc;
2524
2525         if (!rbd_dev)
2526                 return;
2527
2528         dout("%s: \"%s\" notify_id %llu opcode %u\n", __func__,
2529                 rbd_dev->header_name, (unsigned long long) notify_id,
2530                 (unsigned int) opcode);
2531         rc = rbd_dev_refresh(rbd_dev, &hver);
2532         if (rc)
2533                 rbd_warn(rbd_dev, "got notification but failed to "
2534                            " update snaps: %d\n", rc);
2535
2536         rbd_obj_notify_ack(rbd_dev, hver, notify_id);
2537 }
2538
2539 /*
2540  * Request sync osd watch/unwatch.  The value of "start" determines
2541  * whether a watch request is being initiated or torn down.
2542  */
2543 static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev, int start)
2544 {
2545         struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
2546         struct rbd_obj_request *obj_request;
2547         int ret;
2548
2549         rbd_assert(start ^ !!rbd_dev->watch_event);
2550         rbd_assert(start ^ !!rbd_dev->watch_request);
2551
2552         if (start) {
2553                 ret = ceph_osdc_create_event(osdc, rbd_watch_cb, rbd_dev,
2554                                                 &rbd_dev->watch_event);
2555                 if (ret < 0)
2556                         return ret;
2557                 rbd_assert(rbd_dev->watch_event != NULL);
2558         }
2559
2560         ret = -ENOMEM;
2561         obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0,
2562                                                         OBJ_REQUEST_NODATA);
2563         if (!obj_request)
2564                 goto out_cancel;
2565
2566         obj_request->osd_req = rbd_osd_req_create(rbd_dev, true, obj_request);
2567         if (!obj_request->osd_req)
2568                 goto out_cancel;
2569
2570         if (start)
2571                 ceph_osdc_set_request_linger(osdc, obj_request->osd_req);
2572         else
2573                 ceph_osdc_unregister_linger_request(osdc,
2574                                         rbd_dev->watch_request->osd_req);
2575
2576         osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_WATCH,
2577                                 rbd_dev->watch_event->cookie,
2578                                 rbd_dev->header.obj_version, start);
2579         rbd_osd_req_format_write(obj_request);
2580
2581         ret = rbd_obj_request_submit(osdc, obj_request);
2582         if (ret)
2583                 goto out_cancel;
2584         ret = rbd_obj_request_wait(obj_request);
2585         if (ret)
2586                 goto out_cancel;
2587         ret = obj_request->result;
2588         if (ret)
2589                 goto out_cancel;
2590
2591         /*
2592          * A watch request is set to linger, so the underlying osd
2593          * request won't go away until we unregister it.  We retain
2594          * a pointer to the object request during that time (in
2595          * rbd_dev->watch_request), so we'll keep a reference to
2596          * it.  We'll drop that reference (below) after we've
2597          * unregistered it.
2598          */
2599         if (start) {
2600                 rbd_dev->watch_request = obj_request;
2601
2602                 return 0;
2603         }
2604
2605         /* We have successfully torn down the watch request */
2606
2607         rbd_obj_request_put(rbd_dev->watch_request);
2608         rbd_dev->watch_request = NULL;
2609 out_cancel:
2610         /* Cancel the event if we're tearing down, or on error */
2611         ceph_osdc_cancel_event(rbd_dev->watch_event);
2612         rbd_dev->watch_event = NULL;
2613         if (obj_request)
2614                 rbd_obj_request_put(obj_request);
2615
2616         return ret;
2617 }
2618
2619 /*
2620  * Synchronous osd object method call
2621  */
2622 static int rbd_obj_method_sync(struct rbd_device *rbd_dev,
2623                              const char *object_name,
2624                              const char *class_name,
2625                              const char *method_name,
2626                              const void *outbound,
2627                              size_t outbound_size,
2628                              void *inbound,
2629                              size_t inbound_size,
2630                              u64 *version)
2631 {
2632         struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
2633         struct rbd_obj_request *obj_request;
2634         struct page **pages;
2635         u32 page_count;
2636         int ret;
2637
2638         /*
2639          * Method calls are ultimately read operations.  The result
2640          * should placed into the inbound buffer provided.  They
2641          * also supply outbound data--parameters for the object
2642          * method.  Currently if this is present it will be a
2643          * snapshot id.
2644          */
2645         page_count = (u32) calc_pages_for(0, inbound_size);
2646         pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
2647         if (IS_ERR(pages))
2648                 return PTR_ERR(pages);
2649
2650         ret = -ENOMEM;
2651         obj_request = rbd_obj_request_create(object_name, 0, inbound_size,
2652                                                         OBJ_REQUEST_PAGES);
2653         if (!obj_request)
2654                 goto out;
2655
2656         obj_request->pages = pages;
2657         obj_request->page_count = page_count;
2658
2659         obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, obj_request);
2660         if (!obj_request->osd_req)
2661                 goto out;
2662
2663         osd_req_op_cls_init(obj_request->osd_req, 0, CEPH_OSD_OP_CALL,
2664                                         class_name, method_name);
2665         if (outbound_size) {
2666                 struct ceph_pagelist *pagelist;
2667
2668                 pagelist = kmalloc(sizeof (*pagelist), GFP_NOFS);
2669                 if (!pagelist)
2670                         goto out;
2671
2672                 ceph_pagelist_init(pagelist);
2673                 ceph_pagelist_append(pagelist, outbound, outbound_size);
2674                 osd_req_op_cls_request_data_pagelist(obj_request->osd_req, 0,
2675                                                 pagelist);
2676         }
2677         osd_req_op_cls_response_data_pages(obj_request->osd_req, 0,
2678                                         obj_request->pages, inbound_size,
2679                                         0, false, false);
2680         rbd_osd_req_format_read(obj_request);
2681
2682         ret = rbd_obj_request_submit(osdc, obj_request);
2683         if (ret)
2684                 goto out;
2685         ret = rbd_obj_request_wait(obj_request);
2686         if (ret)
2687                 goto out;
2688
2689         ret = obj_request->result;
2690         if (ret < 0)
2691                 goto out;
2692         ret = 0;
2693         ceph_copy_from_page_vector(pages, inbound, 0, obj_request->xferred);
2694         if (version)
2695                 *version = obj_request->version;
2696 out:
2697         if (obj_request)
2698                 rbd_obj_request_put(obj_request);
2699         else
2700                 ceph_release_page_vector(pages, page_count);
2701
2702         return ret;
2703 }
2704
2705 static void rbd_request_fn(struct request_queue *q)
2706                 __releases(q->queue_lock) __acquires(q->queue_lock)
2707 {
2708         struct rbd_device *rbd_dev = q->queuedata;
2709         bool read_only = rbd_dev->mapping.read_only;
2710         struct request *rq;
2711         int result;
2712
2713         while ((rq = blk_fetch_request(q))) {
2714                 bool write_request = rq_data_dir(rq) == WRITE;
2715                 struct rbd_img_request *img_request;
2716                 u64 offset;
2717                 u64 length;
2718
2719                 /* Ignore any non-FS requests that filter through. */
2720
2721                 if (rq->cmd_type != REQ_TYPE_FS) {
2722                         dout("%s: non-fs request type %d\n", __func__,
2723                                 (int) rq->cmd_type);
2724                         __blk_end_request_all(rq, 0);
2725                         continue;
2726                 }
2727
2728                 /* Ignore/skip any zero-length requests */
2729
2730                 offset = (u64) blk_rq_pos(rq) << SECTOR_SHIFT;
2731                 length = (u64) blk_rq_bytes(rq);
2732
2733                 if (!length) {
2734                         dout("%s: zero-length request\n", __func__);
2735                         __blk_end_request_all(rq, 0);
2736                         continue;
2737                 }
2738
2739                 spin_unlock_irq(q->queue_lock);
2740
2741                 /* Disallow writes to a read-only device */
2742
2743                 if (write_request) {
2744                         result = -EROFS;
2745                         if (read_only)
2746                                 goto end_request;
2747                         rbd_assert(rbd_dev->spec->snap_id == CEPH_NOSNAP);
2748                 }
2749
2750                 /*
2751                  * Quit early if the mapped snapshot no longer
2752                  * exists.  It's still possible the snapshot will
2753                  * have disappeared by the time our request arrives
2754                  * at the osd, but there's no sense in sending it if
2755                  * we already know.
2756                  */
2757                 if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags)) {
2758                         dout("request for non-existent snapshot");
2759                         rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP);
2760                         result = -ENXIO;
2761                         goto end_request;
2762                 }
2763
2764                 result = -EINVAL;
2765                 if (WARN_ON(offset && length > U64_MAX - offset + 1))
2766                         goto end_request;       /* Shouldn't happen */
2767
2768                 result = -ENOMEM;
2769                 img_request = rbd_img_request_create(rbd_dev, offset, length,
2770                                                         write_request, false);
2771                 if (!img_request)
2772                         goto end_request;
2773
2774                 img_request->rq = rq;
2775
2776                 result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO,
2777                                                 rq->bio);
2778                 if (!result)
2779                         result = rbd_img_request_submit(img_request);
2780                 if (result)
2781                         rbd_img_request_put(img_request);
2782 end_request:
2783                 spin_lock_irq(q->queue_lock);
2784                 if (result < 0) {
2785                         rbd_warn(rbd_dev, "%s %llx at %llx result %d\n",
2786                                 write_request ? "write" : "read",
2787                                 length, offset, result);
2788
2789                         __blk_end_request_all(rq, result);
2790                 }
2791         }
2792 }
2793
2794 /*
2795  * a queue callback. Makes sure that we don't create a bio that spans across
2796  * multiple osd objects. One exception would be with a single page bios,
2797  * which we handle later at bio_chain_clone_range()
2798  */
2799 static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd,
2800                           struct bio_vec *bvec)
2801 {
2802         struct rbd_device *rbd_dev = q->queuedata;
2803         sector_t sector_offset;
2804         sector_t sectors_per_obj;
2805         sector_t obj_sector_offset;
2806         int ret;
2807
2808         /*
2809          * Find how far into its rbd object the partition-relative
2810          * bio start sector is to offset relative to the enclosing
2811          * device.
2812          */
2813         sector_offset = get_start_sect(bmd->bi_bdev) + bmd->bi_sector;
2814         sectors_per_obj = 1 << (rbd_dev->header.obj_order - SECTOR_SHIFT);
2815         obj_sector_offset = sector_offset & (sectors_per_obj - 1);
2816
2817         /*
2818          * Compute the number of bytes from that offset to the end
2819          * of the object.  Account for what's already used by the bio.
2820          */
2821         ret = (int) (sectors_per_obj - obj_sector_offset) << SECTOR_SHIFT;
2822         if (ret > bmd->bi_size)
2823                 ret -= bmd->bi_size;
2824         else
2825                 ret = 0;
2826
2827         /*
2828          * Don't send back more than was asked for.  And if the bio
2829          * was empty, let the whole thing through because:  "Note
2830          * that a block device *must* allow a single page to be
2831          * added to an empty bio."
2832          */
2833         rbd_assert(bvec->bv_len <= PAGE_SIZE);
2834         if (ret > (int) bvec->bv_len || !bmd->bi_size)
2835                 ret = (int) bvec->bv_len;
2836
2837         return ret;
2838 }
2839
2840 static void rbd_free_disk(struct rbd_device *rbd_dev)
2841 {
2842         struct gendisk *disk = rbd_dev->disk;
2843
2844         if (!disk)
2845                 return;
2846
2847         if (disk->flags & GENHD_FL_UP)
2848                 del_gendisk(disk);
2849         if (disk->queue)
2850                 blk_cleanup_queue(disk->queue);
2851         put_disk(disk);
2852 }
2853
2854 static int rbd_obj_read_sync(struct rbd_device *rbd_dev,
2855                                 const char *object_name,
2856                                 u64 offset, u64 length,
2857                                 void *buf, u64 *version)
2858
2859 {
2860         struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
2861         struct rbd_obj_request *obj_request;
2862         struct page **pages = NULL;
2863         u32 page_count;
2864         size_t size;
2865         int ret;
2866
2867         page_count = (u32) calc_pages_for(offset, length);
2868         pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
2869         if (IS_ERR(pages))
2870                 ret = PTR_ERR(pages);
2871
2872         ret = -ENOMEM;
2873         obj_request = rbd_obj_request_create(object_name, offset, length,
2874                                                         OBJ_REQUEST_PAGES);
2875         if (!obj_request)
2876                 goto out;
2877
2878         obj_request->pages = pages;
2879         obj_request->page_count = page_count;
2880
2881         obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, obj_request);
2882         if (!obj_request->osd_req)
2883                 goto out;
2884
2885         osd_req_op_extent_init(obj_request->osd_req, 0, CEPH_OSD_OP_READ,
2886                                         offset, length, 0, 0);
2887         osd_req_op_extent_osd_data_pages(obj_request->osd_req, 0,
2888                                         obj_request->pages,
2889                                         obj_request->length,
2890                                         obj_request->offset & ~PAGE_MASK,
2891                                         false, false);
2892         rbd_osd_req_format_read(obj_request);
2893
2894         ret = rbd_obj_request_submit(osdc, obj_request);
2895         if (ret)
2896                 goto out;
2897         ret = rbd_obj_request_wait(obj_request);
2898         if (ret)
2899                 goto out;
2900
2901         ret = obj_request->result;
2902         if (ret < 0)
2903                 goto out;
2904
2905         rbd_assert(obj_request->xferred <= (u64) SIZE_MAX);
2906         size = (size_t) obj_request->xferred;
2907         ceph_copy_from_page_vector(pages, buf, 0, size);
2908         rbd_assert(size <= (size_t) INT_MAX);
2909         ret = (int) size;
2910         if (version)
2911                 *version = obj_request->version;
2912 out:
2913         if (obj_request)
2914                 rbd_obj_request_put(obj_request);
2915         else
2916                 ceph_release_page_vector(pages, page_count);
2917
2918         return ret;
2919 }
2920
2921 /*
2922  * Read the complete header for the given rbd device.
2923  *
2924  * Returns a pointer to a dynamically-allocated buffer containing
2925  * the complete and validated header.  Caller can pass the address
2926  * of a variable that will be filled in with the version of the
2927  * header object at the time it was read.
2928  *
2929  * Returns a pointer-coded errno if a failure occurs.
2930  */
2931 static struct rbd_image_header_ondisk *
2932 rbd_dev_v1_header_read(struct rbd_device *rbd_dev, u64 *version)
2933 {
2934         struct rbd_image_header_ondisk *ondisk = NULL;
2935         u32 snap_count = 0;
2936         u64 names_size = 0;
2937         u32 want_count;
2938         int ret;
2939
2940         /*
2941          * The complete header will include an array of its 64-bit
2942          * snapshot ids, followed by the names of those snapshots as
2943          * a contiguous block of NUL-terminated strings.  Note that
2944          * the number of snapshots could change by the time we read
2945          * it in, in which case we re-read it.
2946          */
2947         do {
2948                 size_t size;
2949
2950                 kfree(ondisk);
2951
2952                 size = sizeof (*ondisk);
2953                 size += snap_count * sizeof (struct rbd_image_snap_ondisk);
2954                 size += names_size;
2955                 ondisk = kmalloc(size, GFP_KERNEL);
2956                 if (!ondisk)
2957                         return ERR_PTR(-ENOMEM);
2958
2959                 ret = rbd_obj_read_sync(rbd_dev, rbd_dev->header_name,
2960                                        0, size, ondisk, version);
2961                 if (ret < 0)
2962                         goto out_err;
2963                 if (WARN_ON((size_t) ret < size)) {
2964                         ret = -ENXIO;
2965                         rbd_warn(rbd_dev, "short header read (want %zd got %d)",
2966                                 size, ret);
2967                         goto out_err;
2968                 }
2969                 if (!rbd_dev_ondisk_valid(ondisk)) {
2970                         ret = -ENXIO;
2971                         rbd_warn(rbd_dev, "invalid header");
2972                         goto out_err;
2973                 }
2974
2975                 names_size = le64_to_cpu(ondisk->snap_names_len);
2976                 want_count = snap_count;
2977                 snap_count = le32_to_cpu(ondisk->snap_count);
2978         } while (snap_count != want_count);
2979
2980         return ondisk;
2981
2982 out_err:
2983         kfree(ondisk);
2984
2985         return ERR_PTR(ret);
2986 }
2987
2988 /*
2989  * reload the ondisk the header
2990  */
2991 static int rbd_read_header(struct rbd_device *rbd_dev,
2992                            struct rbd_image_header *header)
2993 {
2994         struct rbd_image_header_ondisk *ondisk;
2995         u64 ver = 0;
2996         int ret;
2997
2998         ondisk = rbd_dev_v1_header_read(rbd_dev, &ver);
2999         if (IS_ERR(ondisk))
3000                 return PTR_ERR(ondisk);
3001         ret = rbd_header_from_disk(header, ondisk);
3002         if (ret >= 0)
3003                 header->obj_version = ver;
3004         kfree(ondisk);
3005
3006         return ret;
3007 }
3008
3009 static void rbd_remove_all_snaps(struct rbd_device *rbd_dev)
3010 {
3011         struct rbd_snap *snap;
3012         struct rbd_snap *next;
3013
3014         list_for_each_entry_safe(snap, next, &rbd_dev->snaps, node)
3015                 rbd_remove_snap_dev(snap);
3016 }
3017
3018 static void rbd_update_mapping_size(struct rbd_device *rbd_dev)
3019 {
3020         sector_t size;
3021
3022         if (rbd_dev->spec->snap_id != CEPH_NOSNAP)
3023                 return;
3024
3025         size = (sector_t) rbd_dev->header.image_size / SECTOR_SIZE;
3026         dout("setting size to %llu sectors", (unsigned long long) size);
3027         rbd_dev->mapping.size = (u64) size;
3028         set_capacity(rbd_dev->disk, size);
3029 }
3030
3031 /*
3032  * only read the first part of the ondisk header, without the snaps info
3033  */
3034 static int rbd_dev_v1_refresh(struct rbd_device *rbd_dev, u64 *hver)
3035 {
3036         int ret;
3037         struct rbd_image_header h;
3038
3039         ret = rbd_read_header(rbd_dev, &h);
3040         if (ret < 0)
3041                 return ret;
3042
3043         down_write(&rbd_dev->header_rwsem);
3044
3045         /* Update image size, and check for resize of mapped image */
3046         rbd_dev->header.image_size = h.image_size;
3047         rbd_update_mapping_size(rbd_dev);
3048
3049         /* rbd_dev->header.object_prefix shouldn't change */
3050         kfree(rbd_dev->header.snap_sizes);
3051         kfree(rbd_dev->header.snap_names);
3052         /* osd requests may still refer to snapc */
3053         ceph_put_snap_context(rbd_dev->header.snapc);
3054
3055         if (hver)
3056                 *hver = h.obj_version;
3057         rbd_dev->header.obj_version = h.obj_version;
3058         rbd_dev->header.image_size = h.image_size;
3059         rbd_dev->header.snapc = h.snapc;
3060         rbd_dev->header.snap_names = h.snap_names;
3061         rbd_dev->header.snap_sizes = h.snap_sizes;
3062         /* Free the extra copy of the object prefix */
3063         WARN_ON(strcmp(rbd_dev->header.object_prefix, h.object_prefix));
3064         kfree(h.object_prefix);
3065
3066         ret = rbd_dev_snaps_update(rbd_dev);
3067         if (!ret)
3068                 ret = rbd_dev_snaps_register(rbd_dev);
3069
3070         up_write(&rbd_dev->header_rwsem);
3071
3072         return ret;
3073 }
3074
3075 static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver)
3076 {
3077         int ret;
3078
3079         rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
3080         mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
3081         if (rbd_dev->image_format == 1)
3082                 ret = rbd_dev_v1_refresh(rbd_dev, hver);
3083         else
3084                 ret = rbd_dev_v2_refresh(rbd_dev, hver);
3085         mutex_unlock(&ctl_mutex);
3086         revalidate_disk(rbd_dev->disk);
3087
3088         return ret;
3089 }
3090
3091 static int rbd_init_disk(struct rbd_device *rbd_dev)
3092 {
3093         struct gendisk *disk;
3094         struct request_queue *q;
3095         u64 segment_size;
3096
3097         /* create gendisk info */
3098         disk = alloc_disk(RBD_MINORS_PER_MAJOR);
3099         if (!disk)
3100                 return -ENOMEM;
3101
3102         snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
3103                  rbd_dev->dev_id);
3104         disk->major = rbd_dev->major;
3105         disk->first_minor = 0;
3106         disk->fops = &rbd_bd_ops;
3107         disk->private_data = rbd_dev;
3108
3109         q = blk_init_queue(rbd_request_fn, &rbd_dev->lock);
3110         if (!q)
3111                 goto out_disk;
3112
3113         /* We use the default size, but let's be explicit about it. */
3114         blk_queue_physical_block_size(q, SECTOR_SIZE);
3115
3116         /* set io sizes to object size */
3117         segment_size = rbd_obj_bytes(&rbd_dev->header);
3118         blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE);
3119         blk_queue_max_segment_size(q, segment_size);
3120         blk_queue_io_min(q, segment_size);
3121         blk_queue_io_opt(q, segment_size);
3122
3123         blk_queue_merge_bvec(q, rbd_merge_bvec);
3124         disk->queue = q;
3125
3126         q->queuedata = rbd_dev;
3127
3128         rbd_dev->disk = disk;
3129
3130         set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);
3131
3132         return 0;
3133 out_disk:
3134         put_disk(disk);
3135
3136         return -ENOMEM;
3137 }
3138
3139 /*
3140   sysfs
3141 */
3142
3143 static struct rbd_device *dev_to_rbd_dev(struct device *dev)
3144 {
3145         return container_of(dev, struct rbd_device, dev);
3146 }
3147
3148 static ssize_t rbd_size_show(struct device *dev,
3149                              struct device_attribute *attr, char *buf)
3150 {
3151         struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3152         sector_t size;
3153
3154         down_read(&rbd_dev->header_rwsem);
3155         size = get_capacity(rbd_dev->disk);
3156         up_read(&rbd_dev->header_rwsem);
3157
3158         return sprintf(buf, "%llu\n", (unsigned long long) size * SECTOR_SIZE);
3159 }
3160
3161 /*
3162  * Note this shows the features for whatever's mapped, which is not
3163  * necessarily the base image.
3164  */
3165 static ssize_t rbd_features_show(struct device *dev,
3166                              struct device_attribute *attr, char *buf)
3167 {
3168         struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3169
3170         return sprintf(buf, "0x%016llx\n",
3171                         (unsigned long long) rbd_dev->mapping.features);
3172 }
3173
3174 static ssize_t rbd_major_show(struct device *dev,
3175                               struct device_attribute *attr, char *buf)
3176 {
3177         struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3178
3179         return sprintf(buf, "%d\n", rbd_dev->major);
3180 }
3181
3182 static ssize_t rbd_client_id_show(struct device *dev,
3183                                   struct device_attribute *attr, char *buf)
3184 {
3185         struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3186
3187         return sprintf(buf, "client%lld\n",
3188                         ceph_client_id(rbd_dev->rbd_client->client));
3189 }
3190
3191 static ssize_t rbd_pool_show(struct device *dev,
3192                              struct device_attribute *attr, char *buf)
3193 {
3194         struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3195
3196         return sprintf(buf, "%s\n", rbd_dev->spec->pool_name);
3197 }
3198
3199 static ssize_t rbd_pool_id_show(struct device *dev,
3200                              struct device_attribute *attr, char *buf)
3201 {
3202         struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3203
3204         return sprintf(buf, "%llu\n",
3205                 (unsigned long long) rbd_dev->spec->pool_id);
3206 }
3207
3208 static ssize_t rbd_name_show(struct device *dev,
3209                              struct device_attribute *attr, char *buf)
3210 {
3211         struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3212
3213         if (rbd_dev->spec->image_name)
3214                 return sprintf(buf, "%s\n", rbd_dev->spec->image_name);
3215
3216         return sprintf(buf, "(unknown)\n");
3217 }
3218
3219 static ssize_t rbd_image_id_show(struct device *dev,
3220                              struct device_attribute *attr, char *buf)
3221 {
3222         struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3223
3224         return sprintf(buf, "%s\n", rbd_dev->spec->image_id);
3225 }
3226
3227 /*
3228  * Shows the name of the currently-mapped snapshot (or
3229  * RBD_SNAP_HEAD_NAME for the base image).
3230  */
3231 static ssize_t rbd_snap_show(struct device *dev,
3232                              struct device_attribute *attr,
3233                              char *buf)
3234 {
3235         struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3236
3237         return sprintf(buf, "%s\n", rbd_dev->spec->snap_name);
3238 }
3239
3240 /*
3241  * For an rbd v2 image, shows the pool id, image id, and snapshot id
3242  * for the parent image.  If there is no parent, simply shows
3243  * "(no parent image)".
3244  */
3245 static ssize_t rbd_parent_show(struct device *dev,
3246                              struct device_attribute *attr,
3247                              char *buf)
3248 {
3249         struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3250         struct rbd_spec *spec = rbd_dev->parent_spec;
3251         int count;
3252         char *bufp = buf;
3253
3254         if (!spec)
3255                 return sprintf(buf, "(no parent image)\n");
3256
3257         count = sprintf(bufp, "pool_id %llu\npool_name %s\n",
3258                         (unsigned long long) spec->pool_id, spec->pool_name);
3259         if (count < 0)
3260                 return count;
3261         bufp += count;
3262
3263         count = sprintf(bufp, "image_id %s\nimage_name %s\n", spec->image_id,
3264                         spec->image_name ? spec->image_name : "(unknown)");
3265         if (count < 0)
3266                 return count;
3267         bufp += count;
3268
3269         count = sprintf(bufp, "snap_id %llu\nsnap_name %s\n",
3270                         (unsigned long long) spec->snap_id, spec->snap_name);
3271         if (count < 0)
3272                 return count;
3273         bufp += count;
3274
3275         count = sprintf(bufp, "overlap %llu\n", rbd_dev->parent_overlap);
3276         if (count < 0)
3277                 return count;
3278         bufp += count;
3279
3280         return (ssize_t) (bufp - buf);
3281 }
3282
3283 static ssize_t rbd_image_refresh(struct device *dev,
3284                                  struct device_attribute *attr,
3285                                  const char *buf,
3286                                  size_t size)
3287 {
3288         struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3289         int ret;
3290
3291         ret = rbd_dev_refresh(rbd_dev, NULL);
3292
3293         return ret < 0 ? ret : size;
3294 }
3295
3296 static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
3297 static DEVICE_ATTR(features, S_IRUGO, rbd_features_show, NULL);
3298 static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
3299 static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
3300 static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
3301 static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL);
3302 static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
3303 static DEVICE_ATTR(image_id, S_IRUGO, rbd_image_id_show, NULL);
3304 static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);
3305 static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);
3306 static DEVICE_ATTR(parent, S_IRUGO, rbd_parent_show, NULL);
3307
3308 static struct attribute *rbd_attrs[] = {
3309         &dev_attr_size.attr,
3310         &dev_attr_features.attr,
3311         &dev_attr_major.attr,
3312         &dev_attr_client_id.attr,
3313         &dev_attr_pool.attr,
3314         &dev_attr_pool_id.attr,
3315         &dev_attr_name.attr,
3316         &dev_attr_image_id.attr,
3317         &dev_attr_current_snap.attr,
3318         &dev_attr_parent.attr,
3319         &dev_attr_refresh.attr,
3320         NULL
3321 };
3322
3323 static struct attribute_group rbd_attr_group = {
3324         .attrs = rbd_attrs,
3325 };
3326
3327 static const struct attribute_group *rbd_attr_groups[] = {
3328         &rbd_attr_group,
3329         NULL
3330 };
3331
3332 static void rbd_sysfs_dev_release(struct device *dev)
3333 {
3334 }
3335
3336 static struct device_type rbd_device_type = {
3337         .name           = "rbd",
3338         .groups         = rbd_attr_groups,
3339         .release        = rbd_sysfs_dev_release,
3340 };
3341
3342
3343 /*
3344   sysfs - snapshots
3345 */
3346
3347 static ssize_t rbd_snap_size_show(struct device *dev,
3348                                   struct device_attribute *attr,
3349                                   char *buf)
3350 {
3351         struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
3352
3353         return sprintf(buf, "%llu\n", (unsigned long long)snap->size);
3354 }
3355
3356 static ssize_t rbd_snap_id_show(struct device *dev,
3357                                 struct device_attribute *attr,
3358                                 char *buf)
3359 {
3360         struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
3361
3362         return sprintf(buf, "%llu\n", (unsigned long long)snap->id);
3363 }
3364
3365 static ssize_t rbd_snap_features_show(struct device *dev,
3366                                 struct device_attribute *attr,
3367                                 char *buf)
3368 {
3369         struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
3370
3371         return sprintf(buf, "0x%016llx\n",
3372                         (unsigned long long) snap->features);
3373 }
3374
3375 static DEVICE_ATTR(snap_size, S_IRUGO, rbd_snap_size_show, NULL);
3376 static DEVICE_ATTR(snap_id, S_IRUGO, rbd_snap_id_show, NULL);
3377 static DEVICE_ATTR(snap_features, S_IRUGO, rbd_snap_features_show, NULL);
3378
3379 static struct attribute *rbd_snap_attrs[] = {
3380         &dev_attr_snap_size.attr,
3381         &dev_attr_snap_id.attr,
3382         &dev_attr_snap_features.attr,
3383         NULL,
3384 };
3385
3386 static struct attribute_group rbd_snap_attr_group = {
3387         .attrs = rbd_snap_attrs,
3388 };
3389
3390 static void rbd_snap_dev_release(struct device *dev)
3391 {
3392         struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
3393         kfree(snap->name);
3394         kfree(snap);
3395 }
3396
3397 static const struct attribute_group *rbd_snap_attr_groups[] = {
3398         &rbd_snap_attr_group,
3399         NULL
3400 };
3401
3402 static struct device_type rbd_snap_device_type = {
3403         .groups         = rbd_snap_attr_groups,
3404         .release        = rbd_snap_dev_release,
3405 };
3406
3407 static struct rbd_spec *rbd_spec_get(struct rbd_spec *spec)
3408 {
3409         kref_get(&spec->kref);
3410
3411         return spec;
3412 }
3413
3414 static void rbd_spec_free(struct kref *kref);
3415 static void rbd_spec_put(struct rbd_spec *spec)
3416 {
3417         if (spec)
3418                 kref_put(&spec->kref, rbd_spec_free);
3419 }
3420
3421 static struct rbd_spec *rbd_spec_alloc(void)
3422 {
3423         struct rbd_spec *spec;
3424
3425         spec = kzalloc(sizeof (*spec), GFP_KERNEL);
3426         if (!spec)
3427                 return NULL;
3428         kref_init(&spec->kref);
3429
3430         return spec;
3431 }
3432
3433 static void rbd_spec_free(struct kref *kref)
3434 {
3435         struct rbd_spec *spec = container_of(kref, struct rbd_spec, kref);
3436
3437         kfree(spec->pool_name);
3438         kfree(spec->image_id);
3439         kfree(spec->image_name);
3440         kfree(spec->snap_name);
3441         kfree(spec);
3442 }
3443
3444 static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
3445                                 struct rbd_spec *spec)
3446 {
3447         struct rbd_device *rbd_dev;
3448
3449         rbd_dev = kzalloc(sizeof (*rbd_dev), GFP_KERNEL);
3450         if (!rbd_dev)
3451                 return NULL;
3452
3453         spin_lock_init(&rbd_dev->lock);
3454         rbd_dev->flags = 0;
3455         INIT_LIST_HEAD(&rbd_dev->node);
3456         INIT_LIST_HEAD(&rbd_dev->snaps);
3457         init_rwsem(&rbd_dev->header_rwsem);
3458
3459         rbd_dev->spec = spec;
3460         rbd_dev->rbd_client = rbdc;
3461
3462         /* Initialize the layout used for all rbd requests */
3463
3464         rbd_dev->layout.fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
3465         rbd_dev->layout.fl_stripe_count = cpu_to_le32(1);
3466         rbd_dev->layout.fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
3467         rbd_dev->layout.fl_pg_pool = cpu_to_le32((u32) spec->pool_id);
3468
3469         return rbd_dev;
3470 }
3471
3472 static void rbd_dev_destroy(struct rbd_device *rbd_dev)
3473 {
3474         rbd_spec_put(rbd_dev->parent_spec);
3475         kfree(rbd_dev->header_name);
3476         rbd_put_client(rbd_dev->rbd_client);
3477         rbd_spec_put(rbd_dev->spec);
3478         kfree(rbd_dev);
3479 }
3480
3481 static bool rbd_snap_registered(struct rbd_snap *snap)
3482 {
3483         bool ret = snap->dev.type == &rbd_snap_device_type;
3484         bool reg = device_is_registered(&snap->dev);
3485
3486         rbd_assert(!ret ^ reg);
3487
3488         return ret;
3489 }
3490
3491 static void rbd_remove_snap_dev(struct rbd_snap *snap)
3492 {
3493         list_del(&snap->node);
3494         if (device_is_registered(&snap->dev))
3495                 device_unregister(&snap->dev);
3496 }
3497
3498 static int rbd_register_snap_dev(struct rbd_snap *snap,
3499                                   struct device *parent)
3500 {
3501         struct device *dev = &snap->dev;
3502         int ret;
3503
3504         dev->type = &rbd_snap_device_type;
3505         dev->parent = parent;
3506         dev->release = rbd_snap_dev_release;
3507         dev_set_name(dev, "%s%s", RBD_SNAP_DEV_NAME_PREFIX, snap->name);
3508         dout("%s: registering device for snapshot %s\n", __func__, snap->name);
3509
3510         ret = device_register(dev);
3511
3512         return ret;
3513 }
3514
3515 static struct rbd_snap *__rbd_add_snap_dev(struct rbd_device *rbd_dev,
3516                                                 const char *snap_name,
3517                                                 u64 snap_id, u64 snap_size,
3518                                                 u64 snap_features)
3519 {
3520         struct rbd_snap *snap;
3521         int ret;
3522
3523         snap = kzalloc(sizeof (*snap), GFP_KERNEL);
3524         if (!snap)
3525                 return ERR_PTR(-ENOMEM);
3526
3527         ret = -ENOMEM;
3528         snap->name = kstrdup(snap_name, GFP_KERNEL);
3529         if (!snap->name)
3530                 goto err;
3531
3532         snap->id = snap_id;
3533         snap->size = snap_size;
3534         snap->features = snap_features;
3535
3536         return snap;
3537
3538 err:
3539         kfree(snap->name);
3540         kfree(snap);
3541
3542         return ERR_PTR(ret);
3543 }
3544
3545 static char *rbd_dev_v1_snap_info(struct rbd_device *rbd_dev, u32 which,
3546                 u64 *snap_size, u64 *snap_features)
3547 {
3548         char *snap_name;
3549
3550         rbd_assert(which < rbd_dev->header.snapc->num_snaps);
3551
3552         *snap_size = rbd_dev->header.snap_sizes[which];
3553         *snap_features = 0;     /* No features for v1 */
3554
3555         /* Skip over names until we find the one we are looking for */
3556
3557         snap_name = rbd_dev->header.snap_names;
3558         while (which--)
3559                 snap_name += strlen(snap_name) + 1;
3560
3561         return snap_name;
3562 }
3563
3564 /*
3565  * Get the size and object order for an image snapshot, or if
3566  * snap_id is CEPH_NOSNAP, gets this information for the base
3567  * image.
3568  */
3569 static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
3570                                 u8 *order, u64 *snap_size)
3571 {
3572         __le64 snapid = cpu_to_le64(snap_id);
3573         int ret;
3574         struct {
3575                 u8 order;
3576                 __le64 size;
3577         } __attribute__ ((packed)) size_buf = { 0 };
3578
3579         ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
3580                                 "rbd", "get_size",
3581                                 &snapid, sizeof (snapid),
3582                                 &size_buf, sizeof (size_buf), NULL);
3583         dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
3584         if (ret < 0)
3585                 return ret;
3586
3587         *order = size_buf.order;
3588         *snap_size = le64_to_cpu(size_buf.size);
3589
3590         dout("  snap_id 0x%016llx order = %u, snap_size = %llu\n",
3591                 (unsigned long long) snap_id, (unsigned int) *order,
3592                 (unsigned long long) *snap_size);
3593
3594         return 0;
3595 }
3596
3597 static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev)
3598 {
3599         return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP,
3600                                         &rbd_dev->header.obj_order,
3601                                         &rbd_dev->header.image_size);
3602 }
3603
3604 static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
3605 {
3606         void *reply_buf;
3607         int ret;
3608         void *p;
3609
3610         reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL);
3611         if (!reply_buf)
3612                 return -ENOMEM;
3613
3614         ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
3615                                 "rbd", "get_object_prefix", NULL, 0,
3616                                 reply_buf, RBD_OBJ_PREFIX_LEN_MAX, NULL);
3617         dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
3618         if (ret < 0)
3619                 goto out;
3620
3621         p = reply_buf;
3622         rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p,
3623                                                 p + RBD_OBJ_PREFIX_LEN_MAX,
3624                                                 NULL, GFP_NOIO);
3625
3626         if (IS_ERR(rbd_dev->header.object_prefix)) {
3627                 ret = PTR_ERR(rbd_dev->header.object_prefix);
3628                 rbd_dev->header.object_prefix = NULL;
3629         } else {
3630                 dout("  object_prefix = %s\n", rbd_dev->header.object_prefix);
3631         }
3632
3633 out:
3634         kfree(reply_buf);
3635
3636         return ret;
3637 }
3638
3639 static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
3640                 u64 *snap_features)
3641 {
3642         __le64 snapid = cpu_to_le64(snap_id);
3643         struct {
3644                 __le64 features;
3645                 __le64 incompat;
3646         } __attribute__ ((packed)) features_buf = { 0 };
3647         u64 incompat;
3648         int ret;
3649
3650         ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
3651                                 "rbd", "get_features",
3652                                 &snapid, sizeof (snapid),
3653                                 &features_buf, sizeof (features_buf), NULL);
3654         dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
3655         if (ret < 0)
3656                 return ret;
3657
3658         incompat = le64_to_cpu(features_buf.incompat);
3659         if (incompat & ~RBD_FEATURES_SUPPORTED)
3660                 return -ENXIO;
3661
3662         *snap_features = le64_to_cpu(features_buf.features);
3663
3664         dout("  snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n",
3665                 (unsigned long long) snap_id,
3666                 (unsigned long long) *snap_features,
3667                 (unsigned long long) le64_to_cpu(features_buf.incompat));
3668
3669         return 0;
3670 }
3671
3672 static int rbd_dev_v2_features(struct rbd_device *rbd_dev)
3673 {
3674         return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP,
3675                                                 &rbd_dev->header.features);
3676 }
3677
3678 static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
3679 {
3680         struct rbd_spec *parent_spec;
3681         size_t size;
3682         void *reply_buf = NULL;
3683         __le64 snapid;
3684         void *p;
3685         void *end;
3686         char *image_id;
3687         u64 overlap;
3688         int ret;
3689
3690         parent_spec = rbd_spec_alloc();
3691         if (!parent_spec)
3692                 return -ENOMEM;
3693
3694         size = sizeof (__le64) +                                /* pool_id */
3695                 sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX +        /* image_id */
3696                 sizeof (__le64) +                               /* snap_id */
3697                 sizeof (__le64);                                /* overlap */
3698         reply_buf = kmalloc(size, GFP_KERNEL);
3699         if (!reply_buf) {
3700                 ret = -ENOMEM;
3701                 goto out_err;
3702         }
3703
3704         snapid = cpu_to_le64(CEPH_NOSNAP);
3705         ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
3706                                 "rbd", "get_parent",
3707                                 &snapid, sizeof (snapid),
3708                                 reply_buf, size, NULL);
3709         dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
3710         if (ret < 0)
3711                 goto out_err;
3712
3713         ret = -ERANGE;
3714         p = reply_buf;
3715         end = reply_buf + size;
3716         ceph_decode_64_safe(&p, end, parent_spec->pool_id, out_err);
3717         if (parent_spec->pool_id == CEPH_NOPOOL)
3718                 goto out;       /* No parent?  No problem. */
3719
3720         /* The ceph file layout needs to fit pool id in 32 bits */
3721
3722         ret = -EIO;
3723         if (WARN_ON(parent_spec->pool_id > (u64) U32_MAX))
3724                 goto out;
3725
3726         image_id = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
3727         if (IS_ERR(image_id)) {
3728                 ret = PTR_ERR(image_id);
3729                 goto out_err;
3730         }
3731         parent_spec->image_id = image_id;
3732         ceph_decode_64_safe(&p, end, parent_spec->snap_id, out_err);
3733         ceph_decode_64_safe(&p, end, overlap, out_err);
3734
3735         rbd_dev->parent_overlap = overlap;
3736         rbd_dev->parent_spec = parent_spec;
3737         parent_spec = NULL;     /* rbd_dev now owns this */
3738 out:
3739         ret = 0;
3740 out_err:
3741         kfree(reply_buf);
3742         rbd_spec_put(parent_spec);
3743
3744         return ret;
3745 }
3746
3747 static char *rbd_dev_image_name(struct rbd_device *rbd_dev)
3748 {
3749         size_t image_id_size;
3750         char *image_id;
3751         void *p;
3752         void *end;
3753         size_t size;
3754         void *reply_buf = NULL;
3755         size_t len = 0;
3756         char *image_name = NULL;
3757         int ret;
3758
3759         rbd_assert(!rbd_dev->spec->image_name);
3760
3761         len = strlen(rbd_dev->spec->image_id);
3762         image_id_size = sizeof (__le32) + len;
3763         image_id = kmalloc(image_id_size, GFP_KERNEL);
3764         if (!image_id)
3765                 return NULL;
3766
3767         p = image_id;
3768         end = image_id + image_id_size;
3769         ceph_encode_string(&p, end, rbd_dev->spec->image_id, (u32) len);
3770
3771         size = sizeof (__le32) + RBD_IMAGE_NAME_LEN_MAX;
3772         reply_buf = kmalloc(size, GFP_KERNEL);
3773         if (!reply_buf)
3774                 goto out;
3775
3776         ret = rbd_obj_method_sync(rbd_dev, RBD_DIRECTORY,
3777                                 "rbd", "dir_get_name",
3778                                 image_id, image_id_size,
3779                                 reply_buf, size, NULL);
3780         if (ret < 0)
3781                 goto out;
3782         p = reply_buf;
3783         end = reply_buf + size;
3784         image_name = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL);
3785         if (IS_ERR(image_name))
3786                 image_name = NULL;
3787         else
3788                 dout("%s: name is %s len is %zd\n", __func__, image_name, len);
3789 out:
3790         kfree(reply_buf);
3791         kfree(image_id);
3792
3793         return image_name;
3794 }
3795
3796 /*
3797  * When a parent image gets probed, we only have the pool, image,
3798  * and snapshot ids but not the names of any of them.  This call
3799  * is made later to fill in those names.  It has to be done after
3800  * rbd_dev_snaps_update() has completed because some of the
3801  * information (in particular, snapshot name) is not available
3802  * until then.
3803  */
3804 static int rbd_dev_probe_update_spec(struct rbd_device *rbd_dev)
3805 {
3806         struct ceph_osd_client *osdc;
3807         const char *name;
3808         void *reply_buf = NULL;
3809         int ret;
3810
3811         if (rbd_dev->spec->pool_name)
3812                 return 0;       /* Already have the names */
3813
3814         /* Look up the pool name */
3815
3816         osdc = &rbd_dev->rbd_client->client->osdc;
3817         name = ceph_pg_pool_name_by_id(osdc->osdmap, rbd_dev->spec->pool_id);
3818         if (!name) {
3819                 rbd_warn(rbd_dev, "there is no pool with id %llu",
3820                         rbd_dev->spec->pool_id);        /* Really a BUG() */
3821                 return -EIO;
3822         }
3823
3824         rbd_dev->spec->pool_name = kstrdup(name, GFP_KERNEL);
3825         if (!rbd_dev->spec->pool_name)
3826                 return -ENOMEM;
3827
3828         /* Fetch the image name; tolerate failure here */
3829
3830         name = rbd_dev_image_name(rbd_dev);
3831         if (name)
3832                 rbd_dev->spec->image_name = (char *)name;
3833         else
3834                 rbd_warn(rbd_dev, "unable to get image name");
3835
3836         /* Look up the snapshot name. */
3837
3838         name = rbd_snap_name(rbd_dev, rbd_dev->spec->snap_id);
3839         if (!name) {
3840                 rbd_warn(rbd_dev, "no snapshot with id %llu",
3841                         rbd_dev->spec->snap_id);        /* Really a BUG() */
3842                 ret = -EIO;
3843                 goto out_err;
3844         }
3845         rbd_dev->spec->snap_name = kstrdup(name, GFP_KERNEL);
3846         if(!rbd_dev->spec->snap_name)
3847                 goto out_err;
3848
3849         return 0;
3850 out_err:
3851         kfree(reply_buf);
3852         kfree(rbd_dev->spec->pool_name);
3853         rbd_dev->spec->pool_name = NULL;
3854
3855         return ret;
3856 }
3857
3858 static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev, u64 *ver)
3859 {
3860         size_t size;
3861         int ret;
3862         void *reply_buf;
3863         void *p;
3864         void *end;
3865         u64 seq;
3866         u32 snap_count;
3867         struct ceph_snap_context *snapc;
3868         u32 i;
3869
3870         /*
3871          * We'll need room for the seq value (maximum snapshot id),
3872          * snapshot count, and array of that many snapshot ids.
3873          * For now we have a fixed upper limit on the number we're
3874          * prepared to receive.
3875          */
3876         size = sizeof (__le64) + sizeof (__le32) +
3877                         RBD_MAX_SNAP_COUNT * sizeof (__le64);
3878         reply_buf = kzalloc(size, GFP_KERNEL);
3879         if (!reply_buf)
3880                 return -ENOMEM;
3881
3882         ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
3883                                 "rbd", "get_snapcontext", NULL, 0,
3884                                 reply_buf, size, ver);
3885         dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
3886         if (ret < 0)
3887                 goto out;
3888
3889         ret = -ERANGE;
3890         p = reply_buf;
3891         end = reply_buf + size;
3892         ceph_decode_64_safe(&p, end, seq, out);
3893         ceph_decode_32_safe(&p, end, snap_count, out);
3894
3895         /*
3896          * Make sure the reported number of snapshot ids wouldn't go
3897          * beyond the end of our buffer.  But before checking that,
3898          * make sure the computed size of the snapshot context we
3899          * allocate is representable in a size_t.
3900          */
3901         if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context))
3902                                  / sizeof (u64)) {
3903                 ret = -EINVAL;
3904                 goto out;
3905         }
3906         if (!ceph_has_room(&p, end, snap_count * sizeof (__le64)))
3907                 goto out;
3908
3909         size = sizeof (struct ceph_snap_context) +
3910                                 snap_count * sizeof (snapc->snaps[0]);
3911         snapc = kmalloc(size, GFP_KERNEL);
3912         if (!snapc) {
3913                 ret = -ENOMEM;
3914                 goto out;
3915         }
3916
3917         atomic_set(&snapc->nref, 1);
3918         snapc->seq = seq;
3919         snapc->num_snaps = snap_count;
3920         for (i = 0; i < snap_count; i++)
3921                 snapc->snaps[i] = ceph_decode_64(&p);
3922
3923         rbd_dev->header.snapc = snapc;
3924
3925         dout("  snap context seq = %llu, snap_count = %u\n",
3926                 (unsigned long long) seq, (unsigned int) snap_count);
3927
3928 out:
3929         kfree(reply_buf);
3930
3931         return 0;
3932 }
3933
3934 static char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, u32 which)
3935 {
3936         size_t size;
3937         void *reply_buf;
3938         __le64 snap_id;
3939         int ret;
3940         void *p;
3941         void *end;
3942         char *snap_name;
3943
3944         size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN;
3945         reply_buf = kmalloc(size, GFP_KERNEL);
3946         if (!reply_buf)
3947                 return ERR_PTR(-ENOMEM);
3948
3949         snap_id = cpu_to_le64(rbd_dev->header.snapc->snaps[which]);
3950         ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
3951                                 "rbd", "get_snapshot_name",
3952                                 &snap_id, sizeof (snap_id),
3953                                 reply_buf, size, NULL);
3954         dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
3955         if (ret < 0)
3956                 goto out;
3957
3958         p = reply_buf;
3959         end = reply_buf + size;
3960         snap_name = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
3961         if (IS_ERR(snap_name)) {
3962                 ret = PTR_ERR(snap_name);
3963                 goto out;
3964         } else {
3965                 dout("  snap_id 0x%016llx snap_name = %s\n",
3966                         (unsigned long long) le64_to_cpu(snap_id), snap_name);
3967         }
3968         kfree(reply_buf);
3969
3970         return snap_name;
3971 out:
3972         kfree(reply_buf);
3973
3974         return ERR_PTR(ret);
3975 }
3976
3977 static char *rbd_dev_v2_snap_info(struct rbd_device *rbd_dev, u32 which,
3978                 u64 *snap_size, u64 *snap_features)
3979 {
3980         u64 snap_id;
3981         u8 order;
3982         int ret;
3983
3984         snap_id = rbd_dev->header.snapc->snaps[which];
3985         ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, &order, snap_size);
3986         if (ret)
3987                 return ERR_PTR(ret);
3988         ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, snap_features);
3989         if (ret)
3990                 return ERR_PTR(ret);
3991
3992         return rbd_dev_v2_snap_name(rbd_dev, which);
3993 }
3994
3995 static char *rbd_dev_snap_info(struct rbd_device *rbd_dev, u32 which,
3996                 u64 *snap_size, u64 *snap_features)
3997 {
3998         if (rbd_dev->image_format == 1)
3999                 return rbd_dev_v1_snap_info(rbd_dev, which,
4000                                         snap_size, snap_features);
4001         if (rbd_dev->image_format == 2)
4002                 return rbd_dev_v2_snap_info(rbd_dev, which,
4003                                         snap_size, snap_features);
4004         return ERR_PTR(-EINVAL);
4005 }
4006
4007 static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver)
4008 {
4009         int ret;
4010         __u8 obj_order;
4011
4012         down_write(&rbd_dev->header_rwsem);
4013
4014         /* Grab old order first, to see if it changes */
4015
4016         obj_order = rbd_dev->header.obj_order,
4017         ret = rbd_dev_v2_image_size(rbd_dev);
4018         if (ret)
4019                 goto out;
4020         if (rbd_dev->header.obj_order != obj_order) {
4021                 ret = -EIO;
4022                 goto out;
4023         }
4024         rbd_update_mapping_size(rbd_dev);
4025
4026         ret = rbd_dev_v2_snap_context(rbd_dev, hver);
4027         dout("rbd_dev_v2_snap_context returned %d\n", ret);
4028         if (ret)
4029                 goto out;
4030         ret = rbd_dev_snaps_update(rbd_dev);
4031         dout("rbd_dev_snaps_update returned %d\n", ret);
4032         if (ret)
4033                 goto out;
4034         ret = rbd_dev_snaps_register(rbd_dev);
4035         dout("rbd_dev_snaps_register returned %d\n", ret);
4036 out:
4037         up_write(&rbd_dev->header_rwsem);
4038
4039         return ret;
4040 }
4041
4042 /*
4043  * Scan the rbd device's current snapshot list and compare it to the
4044  * newly-received snapshot context.  Remove any existing snapshots
4045  * not present in the new snapshot context.  Add a new snapshot for
4046  * any snaphots in the snapshot context not in the current list.
4047  * And verify there are no changes to snapshots we already know
4048  * about.
4049  *
4050  * Assumes the snapshots in the snapshot context are sorted by
4051  * snapshot id, highest id first.  (Snapshots in the rbd_dev's list
4052  * are also maintained in that order.)
4053  */
4054 static int rbd_dev_snaps_update(struct rbd_device *rbd_dev)
4055 {
4056         struct ceph_snap_context *snapc = rbd_dev->header.snapc;
4057         const u32 snap_count = snapc->num_snaps;
4058         struct list_head *head = &rbd_dev->snaps;
4059         struct list_head *links = head->next;
4060         u32 index = 0;
4061
4062         dout("%s: snap count is %u\n", __func__, (unsigned int) snap_count);
4063         while (index < snap_count || links != head) {
4064                 u64 snap_id;
4065                 struct rbd_snap *snap;
4066                 char *snap_name;
4067                 u64 snap_size = 0;
4068                 u64 snap_features = 0;
4069
4070                 snap_id = index < snap_count ? snapc->snaps[index]
4071                                              : CEPH_NOSNAP;
4072                 snap = links != head ? list_entry(links, struct rbd_snap, node)
4073                                      : NULL;
4074                 rbd_assert(!snap || snap->id != CEPH_NOSNAP);
4075
4076                 if (snap_id == CEPH_NOSNAP || (snap && snap->id > snap_id)) {
4077                         struct list_head *next = links->next;
4078
4079                         /*
4080                          * A previously-existing snapshot is not in
4081                          * the new snap context.
4082                          *
4083                          * If the now missing snapshot is the one the
4084                          * image is mapped to, clear its exists flag
4085                          * so we can avoid sending any more requests
4086                          * to it.
4087                          */
4088                         if (rbd_dev->spec->snap_id == snap->id)
4089                                 clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
4090                         rbd_remove_snap_dev(snap);
4091                         dout("%ssnap id %llu has been removed\n",
4092                                 rbd_dev->spec->snap_id == snap->id ?
4093                                                         "mapped " : "",
4094                                 (unsigned long long) snap->id);
4095
4096                         /* Done with this list entry; advance */
4097
4098                         links = next;
4099                         continue;
4100                 }
4101
4102                 snap_name = rbd_dev_snap_info(rbd_dev, index,
4103                                         &snap_size, &snap_features);
4104                 if (IS_ERR(snap_name))
4105                         return PTR_ERR(snap_name);
4106
4107                 dout("entry %u: snap_id = %llu\n", (unsigned int) snap_count,
4108                         (unsigned long long) snap_id);
4109                 if (!snap || (snap_id != CEPH_NOSNAP && snap->id < snap_id)) {
4110                         struct rbd_snap *new_snap;
4111
4112                         /* We haven't seen this snapshot before */
4113
4114                         new_snap = __rbd_add_snap_dev(rbd_dev, snap_name,
4115                                         snap_id, snap_size, snap_features);
4116                         if (IS_ERR(new_snap)) {
4117                                 int err = PTR_ERR(new_snap);
4118
4119                                 dout("  failed to add dev, error %d\n", err);
4120
4121                                 return err;
4122                         }
4123
4124                         /* New goes before existing, or at end of list */
4125
4126                         dout("  added dev%s\n", snap ? "" : " at end\n");
4127                         if (snap)
4128                                 list_add_tail(&new_snap->node, &snap->node);
4129                         else
4130                                 list_add_tail(&new_snap->node, head);
4131                 } else {
4132                         /* Already have this one */
4133
4134                         dout("  already present\n");
4135
4136                         rbd_assert(snap->size == snap_size);
4137                         rbd_assert(!strcmp(snap->name, snap_name));
4138                         rbd_assert(snap->features == snap_features);
4139
4140                         /* Done with this list entry; advance */
4141
4142                         links = links->next;
4143                 }
4144
4145                 /* Advance to the next entry in the snapshot context */
4146
4147                 index++;
4148         }
4149         dout("%s: done\n", __func__);
4150
4151         return 0;
4152 }
4153
4154 /*
4155  * Scan the list of snapshots and register the devices for any that
4156  * have not already been registered.
4157  */
4158 static int rbd_dev_snaps_register(struct rbd_device *rbd_dev)
4159 {
4160         struct rbd_snap *snap;
4161         int ret = 0;
4162
4163         dout("%s:\n", __func__);
4164         if (WARN_ON(!device_is_registered(&rbd_dev->dev)))
4165                 return -EIO;
4166
4167         list_for_each_entry(snap, &rbd_dev->snaps, node) {
4168                 if (!rbd_snap_registered(snap)) {
4169                         ret = rbd_register_snap_dev(snap, &rbd_dev->dev);
4170                         if (ret < 0)
4171                                 break;
4172                 }
4173         }
4174         dout("%s: returning %d\n", __func__, ret);
4175
4176         return ret;
4177 }
4178
4179 static int rbd_bus_add_dev(struct rbd_device *rbd_dev)
4180 {
4181         struct device *dev;
4182         int ret;
4183
4184         mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
4185
4186         dev = &rbd_dev->dev;
4187         dev->bus = &rbd_bus_type;
4188         dev->type = &rbd_device_type;
4189         dev->parent = &rbd_root_dev;
4190         dev->release = rbd_dev_release;
4191         dev_set_name(dev, "%d", rbd_dev->dev_id);
4192         ret = device_register(dev);
4193
4194         mutex_unlock(&ctl_mutex);
4195
4196         return ret;
4197 }
4198
4199 static void rbd_bus_del_dev(struct rbd_device *rbd_dev)
4200 {
4201         device_unregister(&rbd_dev->dev);
4202 }
4203
4204 static atomic64_t rbd_dev_id_max = ATOMIC64_INIT(0);
4205
4206 /*
4207  * Get a unique rbd identifier for the given new rbd_dev, and add
4208  * the rbd_dev to the global list.  The minimum rbd id is 1.
4209  */
4210 static void rbd_dev_id_get(struct rbd_device *rbd_dev)
4211 {
4212         rbd_dev->dev_id = atomic64_inc_return(&rbd_dev_id_max);
4213
4214         spin_lock(&rbd_dev_list_lock);
4215         list_add_tail(&rbd_dev->node, &rbd_dev_list);
4216         spin_unlock(&rbd_dev_list_lock);
4217         dout("rbd_dev %p given dev id %llu\n", rbd_dev,
4218                 (unsigned long long) rbd_dev->dev_id);
4219 }
4220
4221 /*
4222  * Remove an rbd_dev from the global list, and record that its
4223  * identifier is no longer in use.
4224  */
4225 static void rbd_dev_id_put(struct rbd_device *rbd_dev)
4226 {
4227         struct list_head *tmp;
4228         int rbd_id = rbd_dev->dev_id;
4229         int max_id;
4230
4231         rbd_assert(rbd_id > 0);
4232
4233         dout("rbd_dev %p released dev id %llu\n", rbd_dev,
4234                 (unsigned long long) rbd_dev->dev_id);
4235         spin_lock(&rbd_dev_list_lock);
4236         list_del_init(&rbd_dev->node);
4237
4238         /*
4239          * If the id being "put" is not the current maximum, there
4240          * is nothing special we need to do.
4241          */
4242         if (rbd_id != atomic64_read(&rbd_dev_id_max)) {
4243                 spin_unlock(&rbd_dev_list_lock);
4244                 return;
4245         }
4246
4247         /*
4248          * We need to update the current maximum id.  Search the
4249          * list to find out what it is.  We're more likely to find
4250          * the maximum at the end, so search the list backward.
4251          */
4252         max_id = 0;
4253         list_for_each_prev(tmp, &rbd_dev_list) {
4254                 struct rbd_device *rbd_dev;
4255
4256                 rbd_dev = list_entry(tmp, struct rbd_device, node);
4257                 if (rbd_dev->dev_id > max_id)
4258                         max_id = rbd_dev->dev_id;
4259         }
4260         spin_unlock(&rbd_dev_list_lock);
4261
4262         /*
4263          * The max id could have been updated by rbd_dev_id_get(), in
4264          * which case it now accurately reflects the new maximum.
4265          * Be careful not to overwrite the maximum value in that
4266          * case.
4267          */
4268         atomic64_cmpxchg(&rbd_dev_id_max, rbd_id, max_id);
4269         dout("  max dev id has been reset\n");
4270 }
4271
4272 /*
4273  * Skips over white space at *buf, and updates *buf to point to the
4274  * first found non-space character (if any). Returns the length of
4275  * the token (string of non-white space characters) found.  Note
4276  * that *buf must be terminated with '\0'.
4277  */
4278 static inline size_t next_token(const char **buf)
4279 {
4280         /*
4281         * These are the characters that produce nonzero for
4282         * isspace() in the "C" and "POSIX" locales.
4283         */
4284         const char *spaces = " \f\n\r\t\v";
4285
4286         *buf += strspn(*buf, spaces);   /* Find start of token */
4287
4288         return strcspn(*buf, spaces);   /* Return token length */
4289 }
4290
4291 /*
4292  * Finds the next token in *buf, and if the provided token buffer is
4293  * big enough, copies the found token into it.  The result, if
4294  * copied, is guaranteed to be terminated with '\0'.  Note that *buf
4295  * must be terminated with '\0' on entry.
4296  *
4297  * Returns the length of the token found (not including the '\0').
4298  * Return value will be 0 if no token is found, and it will be >=
4299  * token_size if the token would not fit.
4300  *
4301  * The *buf pointer will be updated to point beyond the end of the
4302  * found token.  Note that this occurs even if the token buffer is
4303  * too small to hold it.
4304  */
4305 static inline size_t copy_token(const char **buf,
4306                                 char *token,
4307                                 size_t token_size)
4308 {
4309         size_t len;
4310
4311         len = next_token(buf);
4312         if (len < token_size) {
4313                 memcpy(token, *buf, len);
4314                 *(token + len) = '\0';
4315         }
4316         *buf += len;
4317
4318         return len;
4319 }
4320
4321 /*
4322  * Finds the next token in *buf, dynamically allocates a buffer big
4323  * enough to hold a copy of it, and copies the token into the new
4324  * buffer.  The copy is guaranteed to be terminated with '\0'.  Note
4325  * that a duplicate buffer is created even for a zero-length token.
4326  *
4327  * Returns a pointer to the newly-allocated duplicate, or a null
4328  * pointer if memory for the duplicate was not available.  If
4329  * the lenp argument is a non-null pointer, the length of the token
4330  * (not including the '\0') is returned in *lenp.
4331  *
4332  * If successful, the *buf pointer will be updated to point beyond
4333  * the end of the found token.
4334  *
4335  * Note: uses GFP_KERNEL for allocation.
4336  */
4337 static inline char *dup_token(const char **buf, size_t *lenp)
4338 {
4339         char *dup;
4340         size_t len;
4341
4342         len = next_token(buf);
4343         dup = kmemdup(*buf, len + 1, GFP_KERNEL);
4344         if (!dup)
4345                 return NULL;
4346         *(dup + len) = '\0';
4347         *buf += len;
4348
4349         if (lenp)
4350                 *lenp = len;
4351
4352         return dup;
4353 }
4354
4355 /*
4356  * Parse the options provided for an "rbd add" (i.e., rbd image
4357  * mapping) request.  These arrive via a write to /sys/bus/rbd/add,
4358  * and the data written is passed here via a NUL-terminated buffer.
4359  * Returns 0 if successful or an error code otherwise.
4360  *
4361  * The information extracted from these options is recorded in
4362  * the other parameters which return dynamically-allocated
4363  * structures:
4364  *  ceph_opts
4365  *      The address of a pointer that will refer to a ceph options
4366  *      structure.  Caller must release the returned pointer using
4367  *      ceph_destroy_options() when it is no longer needed.
4368  *  rbd_opts
4369  *      Address of an rbd options pointer.  Fully initialized by
4370  *      this function; caller must release with kfree().
4371  *  spec
4372  *      Address of an rbd image specification pointer.  Fully
4373  *      initialized by this function based on parsed options.
4374  *      Caller must release with rbd_spec_put().
4375  *
4376  * The options passed take this form:
4377  *  <mon_addrs> <options> <pool_name> <image_name> [<snap_id>]
4378  * where:
4379  *  <mon_addrs>
4380  *      A comma-separated list of one or more monitor addresses.
4381  *      A monitor address is an ip address, optionally followed
4382  *      by a port number (separated by a colon).
4383  *        I.e.:  ip1[:port1][,ip2[:port2]...]
4384  *  <options>
4385  *      A comma-separated list of ceph and/or rbd options.
4386  *  <pool_name>
4387  *      The name of the rados pool containing the rbd image.
4388  *  <image_name>
4389  *      The name of the image in that pool to map.
4390  *  <snap_id>
4391  *      An optional snapshot id.  If provided, the mapping will
4392  *      present data from the image at the time that snapshot was
4393  *      created.  The image head is used if no snapshot id is
4394  *      provided.  Snapshot mappings are always read-only.
4395  */
4396 static int rbd_add_parse_args(const char *buf,
4397                                 struct ceph_options **ceph_opts,
4398                                 struct rbd_options **opts,
4399                                 struct rbd_spec **rbd_spec)
4400 {
4401         size_t len;
4402         char *options;
4403         const char *mon_addrs;
4404         size_t mon_addrs_size;
4405         struct rbd_spec *spec = NULL;
4406         struct rbd_options *rbd_opts = NULL;
4407         struct ceph_options *copts;
4408         int ret;
4409
4410         /* The first four tokens are required */
4411
4412         len = next_token(&buf);
4413         if (!len) {
4414                 rbd_warn(NULL, "no monitor address(es) provided");
4415                 return -EINVAL;
4416         }
4417         mon_addrs = buf;
4418         mon_addrs_size = len + 1;
4419         buf += len;
4420
4421         ret = -EINVAL;
4422         options = dup_token(&buf, NULL);
4423         if (!options)
4424                 return -ENOMEM;
4425         if (!*options) {
4426                 rbd_warn(NULL, "no options provided");
4427                 goto out_err;
4428         }
4429
4430         spec = rbd_spec_alloc();
4431         if (!spec)
4432                 goto out_mem;
4433
4434         spec->pool_name = dup_token(&buf, NULL);
4435         if (!spec->pool_name)
4436                 goto out_mem;
4437         if (!*spec->pool_name) {
4438                 rbd_warn(NULL, "no pool name provided");
4439                 goto out_err;
4440         }
4441
4442         spec->image_name = dup_token(&buf, NULL);
4443         if (!spec->image_name)
4444                 goto out_mem;
4445         if (!*spec->image_name) {
4446                 rbd_warn(NULL, "no image name provided");
4447                 goto out_err;
4448         }
4449
4450         /*
4451          * Snapshot name is optional; default is to use "-"
4452          * (indicating the head/no snapshot).
4453          */
4454         len = next_token(&buf);
4455         if (!len) {
4456                 buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */
4457                 len = sizeof (RBD_SNAP_HEAD_NAME) - 1;
4458         } else if (len > RBD_MAX_SNAP_NAME_LEN) {
4459                 ret = -ENAMETOOLONG;
4460                 goto out_err;
4461         }
4462         spec->snap_name = kmemdup(buf, len + 1, GFP_KERNEL);
4463         if (!spec->snap_name)
4464                 goto out_mem;
4465         *(spec->snap_name + len) = '\0';
4466
4467         /* Initialize all rbd options to the defaults */
4468
4469         rbd_opts = kzalloc(sizeof (*rbd_opts), GFP_KERNEL);
4470         if (!rbd_opts)
4471                 goto out_mem;
4472
4473         rbd_opts->read_only = RBD_READ_ONLY_DEFAULT;
4474
4475         copts = ceph_parse_options(options, mon_addrs,
4476                                         mon_addrs + mon_addrs_size - 1,
4477                                         parse_rbd_opts_token, rbd_opts);
4478         if (IS_ERR(copts)) {
4479                 ret = PTR_ERR(copts);
4480                 goto out_err;
4481         }
4482         kfree(options);
4483
4484         *ceph_opts = copts;
4485         *opts = rbd_opts;
4486         *rbd_spec = spec;
4487
4488         return 0;
4489 out_mem:
4490         ret = -ENOMEM;
4491 out_err:
4492         kfree(rbd_opts);
4493         rbd_spec_put(spec);
4494         kfree(options);
4495
4496         return ret;
4497 }
4498
4499 /*
4500  * An rbd format 2 image has a unique identifier, distinct from the
4501  * name given to it by the user.  Internally, that identifier is
4502  * what's used to specify the names of objects related to the image.
4503  *
4504  * A special "rbd id" object is used to map an rbd image name to its
4505  * id.  If that object doesn't exist, then there is no v2 rbd image
4506  * with the supplied name.
4507  *
4508  * This function will record the given rbd_dev's image_id field if
4509  * it can be determined, and in that case will return 0.  If any
4510  * errors occur a negative errno will be returned and the rbd_dev's
4511  * image_id field will be unchanged (and should be NULL).
4512  */
4513 static int rbd_dev_image_id(struct rbd_device *rbd_dev)
4514 {
4515         int ret;
4516         size_t size;
4517         char *object_name;
4518         void *response;
4519         void *p;
4520
4521         /* If we already have it we don't need to look it up */
4522
4523         if (rbd_dev->spec->image_id)
4524                 return 0;
4525
4526         /*
4527          * When probing a parent image, the image id is already
4528          * known (and the image name likely is not).  There's no
4529          * need to fetch the image id again in this case.
4530          */
4531         if (rbd_dev->spec->image_id)
4532                 return 0;
4533
4534         /*
4535          * First, see if the format 2 image id file exists, and if
4536          * so, get the image's persistent id from it.
4537          */
4538         size = sizeof (RBD_ID_PREFIX) + strlen(rbd_dev->spec->image_name);
4539         object_name = kmalloc(size, GFP_NOIO);
4540         if (!object_name)
4541                 return -ENOMEM;
4542         sprintf(object_name, "%s%s", RBD_ID_PREFIX, rbd_dev->spec->image_name);
4543         dout("rbd id object name is %s\n", object_name);
4544
4545         /* Response will be an encoded string, which includes a length */
4546
4547         size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX;
4548         response = kzalloc(size, GFP_NOIO);
4549         if (!response) {
4550                 ret = -ENOMEM;
4551                 goto out;
4552         }
4553
4554         ret = rbd_obj_method_sync(rbd_dev, object_name,
4555                                 "rbd", "get_id", NULL, 0,
4556                                 response, RBD_IMAGE_ID_LEN_MAX, NULL);
4557         dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
4558         if (ret < 0)
4559                 goto out;
4560
4561         p = response;
4562         rbd_dev->spec->image_id = ceph_extract_encoded_string(&p,
4563                                                 p + RBD_IMAGE_ID_LEN_MAX,
4564                                                 NULL, GFP_NOIO);
4565         if (IS_ERR(rbd_dev->spec->image_id)) {
4566                 ret = PTR_ERR(rbd_dev->spec->image_id);
4567                 rbd_dev->spec->image_id = NULL;
4568         } else {
4569                 dout("image_id is %s\n", rbd_dev->spec->image_id);
4570         }
4571 out:
4572         kfree(response);
4573         kfree(object_name);
4574
4575         return ret;
4576 }
4577
4578 static int rbd_dev_v1_probe(struct rbd_device *rbd_dev)
4579 {
4580         int ret;
4581         size_t size;
4582
4583         /* Version 1 images have no id; empty string is used */
4584
4585         rbd_dev->spec->image_id = kstrdup("", GFP_KERNEL);
4586         if (!rbd_dev->spec->image_id)
4587                 return -ENOMEM;
4588
4589         /* Record the header object name for this rbd image. */
4590
4591         size = strlen(rbd_dev->spec->image_name) + sizeof (RBD_SUFFIX);
4592         rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
4593         if (!rbd_dev->header_name) {
4594                 ret = -ENOMEM;
4595                 goto out_err;
4596         }
4597         sprintf(rbd_dev->header_name, "%s%s",
4598                 rbd_dev->spec->image_name, RBD_SUFFIX);
4599
4600         /* Populate rbd image metadata */
4601
4602         ret = rbd_read_header(rbd_dev, &rbd_dev->header);
4603         if (ret < 0)
4604                 goto out_err;
4605
4606         /* Version 1 images have no parent (no layering) */
4607
4608         rbd_dev->parent_spec = NULL;
4609         rbd_dev->parent_overlap = 0;
4610
4611         rbd_dev->image_format = 1;
4612
4613         dout("discovered version 1 image, header name is %s\n",
4614                 rbd_dev->header_name);
4615
4616         return 0;
4617
4618 out_err:
4619         kfree(rbd_dev->header_name);
4620         rbd_dev->header_name = NULL;
4621         kfree(rbd_dev->spec->image_id);
4622         rbd_dev->spec->image_id = NULL;
4623
4624         return ret;
4625 }
4626
4627 static int rbd_dev_v2_probe(struct rbd_device *rbd_dev)
4628 {
4629         size_t size;
4630         int ret;
4631         u64 ver = 0;
4632
4633         /*
4634          * Image id was filled in by the caller.  Record the header
4635          * object name for this rbd image.
4636          */
4637         size = sizeof (RBD_HEADER_PREFIX) + strlen(rbd_dev->spec->image_id);
4638         rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
4639         if (!rbd_dev->header_name)
4640                 return -ENOMEM;
4641         sprintf(rbd_dev->header_name, "%s%s",
4642                         RBD_HEADER_PREFIX, rbd_dev->spec->image_id);
4643
4644         /* Get the size and object order for the image */
4645
4646         ret = rbd_dev_v2_image_size(rbd_dev);
4647         if (ret < 0)
4648                 goto out_err;
4649
4650         /* Get the object prefix (a.k.a. block_name) for the image */
4651
4652         ret = rbd_dev_v2_object_prefix(rbd_dev);
4653         if (ret < 0)
4654                 goto out_err;
4655
4656         /* Get the and check features for the image */
4657
4658         ret = rbd_dev_v2_features(rbd_dev);
4659         if (ret < 0)
4660                 goto out_err;
4661
4662         /* If the image supports layering, get the parent info */
4663
4664         if (rbd_dev->header.features & RBD_FEATURE_LAYERING) {
4665                 ret = rbd_dev_v2_parent_info(rbd_dev);
4666                 if (ret < 0)
4667                         goto out_err;
4668         }
4669
4670         /* crypto and compression type aren't (yet) supported for v2 images */
4671
4672         rbd_dev->header.crypt_type = 0;
4673         rbd_dev->header.comp_type = 0;
4674
4675         /* Get the snapshot context, plus the header version */
4676
4677         ret = rbd_dev_v2_snap_context(rbd_dev, &ver);
4678         if (ret)
4679                 goto out_err;
4680         rbd_dev->header.obj_version = ver;
4681
4682         rbd_dev->image_format = 2;
4683
4684         dout("discovered version 2 image, header name is %s\n",
4685                 rbd_dev->header_name);
4686
4687         return 0;
4688 out_err:
4689         rbd_dev->parent_overlap = 0;
4690         rbd_spec_put(rbd_dev->parent_spec);
4691         rbd_dev->parent_spec = NULL;
4692         kfree(rbd_dev->header_name);
4693         rbd_dev->header_name = NULL;
4694         kfree(rbd_dev->header.object_prefix);
4695         rbd_dev->header.object_prefix = NULL;
4696
4697         return ret;
4698 }
4699
4700 static int rbd_dev_probe_finish(struct rbd_device *rbd_dev)
4701 {
4702         struct rbd_device *parent = NULL;
4703         struct rbd_spec *parent_spec = NULL;
4704         struct rbd_client *rbdc = NULL;
4705         int ret;
4706
4707         /* no need to lock here, as rbd_dev is not registered yet */
4708         ret = rbd_dev_snaps_update(rbd_dev);
4709         if (ret)
4710                 return ret;
4711
4712         ret = rbd_dev_probe_update_spec(rbd_dev);
4713         if (ret)
4714                 goto err_out_snaps;
4715
4716         ret = rbd_dev_set_mapping(rbd_dev);
4717         if (ret)
4718                 goto err_out_snaps;
4719
4720         /* generate unique id: find highest unique id, add one */
4721         rbd_dev_id_get(rbd_dev);
4722
4723         /* Fill in the device name, now that we have its id. */
4724         BUILD_BUG_ON(DEV_NAME_LEN
4725                         < sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH);
4726         sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id);
4727
4728         /* Get our block major device number. */
4729
4730         ret = register_blkdev(0, rbd_dev->name);
4731         if (ret < 0)
4732                 goto err_out_id;
4733         rbd_dev->major = ret;
4734
4735         /* Set up the blkdev mapping. */
4736
4737         ret = rbd_init_disk(rbd_dev);
4738         if (ret)
4739                 goto err_out_blkdev;
4740
4741         ret = rbd_bus_add_dev(rbd_dev);
4742         if (ret)
4743                 goto err_out_disk;
4744
4745         /*
4746          * At this point cleanup in the event of an error is the job
4747          * of the sysfs code (initiated by rbd_bus_del_dev()).
4748          */
4749         /* Probe the parent if there is one */
4750
4751         if (rbd_dev->parent_spec) {
4752                 /*
4753                  * We need to pass a reference to the client and the
4754                  * parent spec when creating the parent rbd_dev.
4755                  * Images related by parent/child relationships
4756                  * always share both.
4757                  */
4758                 parent_spec = rbd_spec_get(rbd_dev->parent_spec);
4759                 rbdc = __rbd_get_client(rbd_dev->rbd_client);
4760
4761                 parent = rbd_dev_create(rbdc, parent_spec);
4762                 if (!parent) {
4763                         ret = -ENOMEM;
4764                         goto err_out_spec;
4765                 }
4766                 rbdc = NULL;            /* parent now owns reference */
4767                 parent_spec = NULL;     /* parent now owns reference */
4768                 ret = rbd_dev_probe(parent);
4769                 if (ret < 0)
4770                         goto err_out_parent;
4771                 rbd_dev->parent = parent;
4772         }
4773
4774         down_write(&rbd_dev->header_rwsem);
4775         ret = rbd_dev_snaps_register(rbd_dev);
4776         up_write(&rbd_dev->header_rwsem);
4777         if (ret)
4778                 goto err_out_bus;
4779
4780         ret = rbd_dev_header_watch_sync(rbd_dev, 1);
4781         if (ret)
4782                 goto err_out_bus;
4783
4784         /* Everything's ready.  Announce the disk to the world. */
4785
4786         add_disk(rbd_dev->disk);
4787
4788         pr_info("%s: added with size 0x%llx\n", rbd_dev->disk->disk_name,
4789                 (unsigned long long) rbd_dev->mapping.size);
4790
4791         return ret;
4792
4793 err_out_parent:
4794         rbd_dev_destroy(parent);
4795 err_out_spec:
4796         rbd_spec_put(parent_spec);
4797         rbd_put_client(rbdc);
4798 err_out_bus:
4799         /* this will also clean up rest of rbd_dev stuff */
4800
4801         rbd_bus_del_dev(rbd_dev);
4802
4803         return ret;
4804 err_out_disk:
4805         rbd_free_disk(rbd_dev);
4806 err_out_blkdev:
4807         unregister_blkdev(rbd_dev->major, rbd_dev->name);
4808 err_out_id:
4809         rbd_dev_id_put(rbd_dev);
4810 err_out_snaps:
4811         rbd_remove_all_snaps(rbd_dev);
4812
4813         return ret;
4814 }
4815
4816 /*
4817  * Probe for the existence of the header object for the given rbd
4818  * device.  For format 2 images this includes determining the image
4819  * id.
4820  */
4821 static int rbd_dev_probe(struct rbd_device *rbd_dev)
4822 {
4823         int ret;
4824
4825         /*
4826          * Get the id from the image id object.  If it's not a
4827          * format 2 image, we'll get ENOENT back, and we'll assume
4828          * it's a format 1 image.
4829          */
4830         ret = rbd_dev_image_id(rbd_dev);
4831         if (ret)
4832                 ret = rbd_dev_v1_probe(rbd_dev);
4833         else
4834                 ret = rbd_dev_v2_probe(rbd_dev);
4835         if (ret) {
4836                 dout("probe failed, returning %d\n", ret);
4837
4838                 return ret;
4839         }
4840
4841         ret = rbd_dev_probe_finish(rbd_dev);
4842         if (ret)
4843                 rbd_header_free(&rbd_dev->header);
4844
4845         return ret;
4846 }
4847
4848 static ssize_t rbd_add(struct bus_type *bus,
4849                        const char *buf,
4850                        size_t count)
4851 {
4852         struct rbd_device *rbd_dev = NULL;
4853         struct ceph_options *ceph_opts = NULL;
4854         struct rbd_options *rbd_opts = NULL;
4855         struct rbd_spec *spec = NULL;
4856         struct rbd_client *rbdc;
4857         struct ceph_osd_client *osdc;
4858         int rc = -ENOMEM;
4859
4860         if (!try_module_get(THIS_MODULE))
4861                 return -ENODEV;
4862
4863         /* parse add command */
4864         rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec);
4865         if (rc < 0)
4866                 goto err_out_module;
4867
4868         rbdc = rbd_get_client(ceph_opts);
4869         if (IS_ERR(rbdc)) {
4870                 rc = PTR_ERR(rbdc);
4871                 goto err_out_args;
4872         }
4873         ceph_opts = NULL;       /* rbd_dev client now owns this */
4874
4875         /* pick the pool */
4876         osdc = &rbdc->client->osdc;
4877         rc = ceph_pg_poolid_by_name(osdc->osdmap, spec->pool_name);
4878         if (rc < 0)
4879                 goto err_out_client;
4880         spec->pool_id = (u64) rc;
4881
4882         /* The ceph file layout needs to fit pool id in 32 bits */
4883
4884         if (WARN_ON(spec->pool_id > (u64) U32_MAX)) {
4885                 rc = -EIO;
4886                 goto err_out_client;
4887         }
4888
4889         rbd_dev = rbd_dev_create(rbdc, spec);
4890         if (!rbd_dev)
4891                 goto err_out_client;
4892         rbdc = NULL;            /* rbd_dev now owns this */
4893         spec = NULL;            /* rbd_dev now owns this */
4894
4895         rbd_dev->mapping.read_only = rbd_opts->read_only;
4896         kfree(rbd_opts);
4897         rbd_opts = NULL;        /* done with this */
4898
4899         rc = rbd_dev_probe(rbd_dev);
4900         if (rc < 0)
4901                 goto err_out_rbd_dev;
4902
4903         return count;
4904 err_out_rbd_dev:
4905         rbd_dev_destroy(rbd_dev);
4906 err_out_client:
4907         rbd_put_client(rbdc);
4908 err_out_args:
4909         if (ceph_opts)
4910                 ceph_destroy_options(ceph_opts);
4911         kfree(rbd_opts);
4912         rbd_spec_put(spec);
4913 err_out_module:
4914         module_put(THIS_MODULE);
4915
4916         dout("Error adding device %s\n", buf);
4917
4918         return (ssize_t) rc;
4919 }
4920
4921 static struct rbd_device *__rbd_get_dev(unsigned long dev_id)
4922 {
4923         struct list_head *tmp;
4924         struct rbd_device *rbd_dev;
4925
4926         spin_lock(&rbd_dev_list_lock);
4927         list_for_each(tmp, &rbd_dev_list) {
4928                 rbd_dev = list_entry(tmp, struct rbd_device, node);
4929                 if (rbd_dev->dev_id == dev_id) {
4930                         spin_unlock(&rbd_dev_list_lock);
4931                         return rbd_dev;
4932                 }
4933         }
4934         spin_unlock(&rbd_dev_list_lock);
4935         return NULL;
4936 }
4937
4938 static void rbd_dev_release(struct device *dev)
4939 {
4940         struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4941
4942         if (rbd_dev->watch_event)
4943                 rbd_dev_header_watch_sync(rbd_dev, 0);
4944
4945         /* clean up and free blkdev */
4946         rbd_free_disk(rbd_dev);
4947         unregister_blkdev(rbd_dev->major, rbd_dev->name);
4948
4949         /* release allocated disk header fields */
4950         rbd_header_free(&rbd_dev->header);
4951
4952         /* done with the id, and with the rbd_dev */
4953         rbd_dev_id_put(rbd_dev);
4954         rbd_assert(rbd_dev->rbd_client != NULL);
4955         rbd_dev_destroy(rbd_dev);
4956
4957         /* release module ref */
4958         module_put(THIS_MODULE);
4959 }
4960
4961 static void __rbd_remove(struct rbd_device *rbd_dev)
4962 {
4963         rbd_remove_all_snaps(rbd_dev);
4964         rbd_bus_del_dev(rbd_dev);
4965 }
4966
4967 static ssize_t rbd_remove(struct bus_type *bus,
4968                           const char *buf,
4969                           size_t count)
4970 {
4971         struct rbd_device *rbd_dev = NULL;
4972         int target_id, rc;
4973         unsigned long ul;
4974         int ret = count;
4975
4976         rc = strict_strtoul(buf, 10, &ul);
4977         if (rc)
4978                 return rc;
4979
4980         /* convert to int; abort if we lost anything in the conversion */
4981         target_id = (int) ul;
4982         if (target_id != ul)
4983                 return -EINVAL;
4984
4985         mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
4986
4987         rbd_dev = __rbd_get_dev(target_id);
4988         if (!rbd_dev) {
4989                 ret = -ENOENT;
4990                 goto done;
4991         }
4992
4993         spin_lock_irq(&rbd_dev->lock);
4994         if (rbd_dev->open_count)
4995                 ret = -EBUSY;
4996         else
4997                 set_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags);
4998         spin_unlock_irq(&rbd_dev->lock);
4999         if (ret < 0)
5000                 goto done;
5001
5002         while (rbd_dev->parent_spec) {
5003                 struct rbd_device *first = rbd_dev;
5004                 struct rbd_device *second = first->parent;
5005                 struct rbd_device *third;
5006
5007                 /*
5008                  * Follow to the parent with no grandparent and
5009                  * remove it.
5010                  */
5011                 while (second && (third = second->parent)) {
5012                         first = second;
5013                         second = third;
5014                 }
5015                 __rbd_remove(second);
5016                 rbd_spec_put(first->parent_spec);
5017                 first->parent_spec = NULL;
5018                 first->parent_overlap = 0;
5019                 first->parent = NULL;
5020         }
5021         __rbd_remove(rbd_dev);
5022
5023 done:
5024         mutex_unlock(&ctl_mutex);
5025
5026         return ret;
5027 }
5028
5029 /*
5030  * create control files in sysfs
5031  * /sys/bus/rbd/...
5032  */
5033 static int rbd_sysfs_init(void)
5034 {
5035         int ret;
5036
5037         ret = device_register(&rbd_root_dev);
5038         if (ret < 0)
5039                 return ret;
5040
5041         ret = bus_register(&rbd_bus_type);
5042         if (ret < 0)
5043                 device_unregister(&rbd_root_dev);
5044
5045         return ret;
5046 }
5047
5048 static void rbd_sysfs_cleanup(void)
5049 {
5050         bus_unregister(&rbd_bus_type);
5051         device_unregister(&rbd_root_dev);
5052 }
5053
5054 static int __init rbd_init(void)
5055 {
5056         int rc;
5057
5058         if (!libceph_compatible(NULL)) {
5059                 rbd_warn(NULL, "libceph incompatibility (quitting)");
5060
5061                 return -EINVAL;
5062         }
5063         rc = rbd_sysfs_init();
5064         if (rc)
5065                 return rc;
5066         pr_info("loaded " RBD_DRV_NAME_LONG "\n");
5067         return 0;
5068 }
5069
5070 static void __exit rbd_exit(void)
5071 {
5072         rbd_sysfs_cleanup();
5073 }
5074
5075 module_init(rbd_init);
5076 module_exit(rbd_exit);
5077
5078 MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
5079 MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
5080 MODULE_DESCRIPTION("rados block device");
5081
5082 /* following authorship retained from original osdblk.c */
5083 MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");
5084
5085 MODULE_LICENSE("GPL");