drivers/block/rbd.c

   1 /*
   2    rbd.c -- Export ceph rados objects as a Linux block device
   3
   4
   5    based on drivers/block/osdblk.c:
   6
   7    Copyright 2009 Red Hat, Inc.
   8
   9    This program is free software; you can redistribute it and/or modify
  10    it under the terms of the GNU General Public License as published by
  11    the Free Software Foundation.
  12
  13    This program is distributed in the hope that it will be useful,
  14    but WITHOUT ANY WARRANTY; without even the implied warranty of
  15    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  16    GNU General Public License for more details.
  17
  18    You should have received a copy of the GNU General Public License
  19    along with this program; see the file COPYING.  If not, write to
  20    the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
  21
  22
  23
  24    For usage instructions, please refer to:
  25
  26                  Documentation/ABI/testing/sysfs-bus-rbd
  27
  28  */
  29
  30 #include <linux/ceph/libceph.h>
  31 #include <linux/ceph/osd_client.h>
  32 #include <linux/ceph/mon_client.h>
  33 #include <linux/ceph/decode.h>
  34 #include <linux/parser.h>
  35
  36 #include <linux/kernel.h>
  37 #include <linux/device.h>
  38 #include <linux/module.h>
  39 #include <linux/fs.h>
  40 #include <linux/blkdev.h>
  41
  42 #include "rbd_types.h"
  43
  44 #define RBD_DEBUG       /* Activate rbd_assert() calls */
  45
  46 /*
  47  * The basic unit of block I/O is a sector.  It is interpreted in a
  48  * number of contexts in Linux (blk, bio, genhd), but the default is
  49  * universally 512 bytes.  These symbols are just slightly more
  50  * meaningful than the bare numbers they represent.
  51  */
  52 #define SECTOR_SHIFT    9
  53 #define SECTOR_SIZE     (1ULL << SECTOR_SHIFT)
  54
  55 /* It might be useful to have these defined elsewhere */
  56
  57 #define U8_MAX  ((u8)   (~0U))
  58 #define U16_MAX ((u16)  (~0U))
  59 #define U32_MAX ((u32)  (~0U))
  60 #define U64_MAX ((u64)  (~0ULL))
  61
  62 #define RBD_DRV_NAME "rbd"
  63 #define RBD_DRV_NAME_LONG "rbd (rados block device)"
  64
  65 #define RBD_MINORS_PER_MAJOR    256             /* max minors per blkdev */
  66
  67 #define RBD_SNAP_DEV_NAME_PREFIX        "snap_"
  68 #define RBD_MAX_SNAP_NAME_LEN   \
  69                         (NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1))
  70
  71 #define RBD_MAX_SNAP_COUNT      510     /* allows max snapc to fit in 4KB */
  72
  73 #define RBD_SNAP_HEAD_NAME      "-"
  74
  75 /* This allows a single page to hold an image name sent by OSD */
  76 #define RBD_IMAGE_NAME_LEN_MAX  (PAGE_SIZE - sizeof (__le32) - 1)
  77 #define RBD_IMAGE_ID_LEN_MAX    64
  78
  79 #define RBD_OBJ_PREFIX_LEN_MAX  64
  80
  81 /* Feature bits */
  82
  83 #define RBD_FEATURE_LAYERING      1
  84
  85 /* Features supported by this (client software) implementation. */
  86
  87 #define RBD_FEATURES_ALL          (0)
  88
  89 /*
  90  * An RBD device name will be "rbd#", where the "rbd" comes from
  91  * RBD_DRV_NAME above, and # is a unique integer identifier.
  92  * MAX_INT_FORMAT_WIDTH is used in ensuring DEV_NAME_LEN is big
  93  * enough to hold all possible device names.
  94  */
  95 #define DEV_NAME_LEN            32
  96 #define MAX_INT_FORMAT_WIDTH    ((5 * sizeof (int)) / 2 + 1)
  97
  98 /*
  99  * block device image metadata (in-memory version)
 100  */
 101 struct rbd_image_header {
 102         /* These four fields never change for a given rbd image */
 103         char *object_prefix;
 104         u64 features;
 105         __u8 obj_order;
 106         __u8 crypt_type;
 107         __u8 comp_type;
 108
 109         /* The remaining fields need to be updated occasionally */
 110         u64 image_size;
 111         struct ceph_snap_context *snapc;
 112         char *snap_names;
 113         u64 *snap_sizes;
 114
 115         u64 obj_version;
 116 };
 117
 118 /*
 119  * An rbd image specification.
 120  *
 121  * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely
 122  * identify an image.  Each rbd_dev structure includes a pointer to
 123  * an rbd_spec structure that encapsulates this identity.
 124  *
 125  * Each of the id's in an rbd_spec has an associated name.  For a
 126  * user-mapped image, the names are supplied and the id's associated
 127  * with them are looked up.  For a layered image, a parent image is
 128  * defined by the tuple, and the names are looked up.
 129  *
 130  * An rbd_dev structure contains a parent_spec pointer which is
 131  * non-null if the image it represents is a child in a layered
 132  * image.  This pointer will refer to the rbd_spec structure used
 133  * by the parent rbd_dev for its own identity (i.e., the structure
 134  * is shared between the parent and child).
 135  *
 136  * Since these structures are populated once, during the discovery
 137  * phase of image construction, they are effectively immutable so
 138  * we make no effort to synchronize access to them.
 139  *
 140  * Note that code herein does not assume the image name is known (it
 141  * could be a null pointer).
 142  */
 143 struct rbd_spec {
 144         u64             pool_id;
 145         char            *pool_name;
 146
 147         char            *image_id;
 148         char            *image_name;
 149
 150         u64             snap_id;
 151         char            *snap_name;
 152
 153         struct kref     kref;
 154 };
 155
 156 /*
 157  * an instance of the client.  multiple devices may share an rbd client.
 158  */
 159 struct rbd_client {
 160         struct ceph_client      *client;
 161         struct kref             kref;
 162         struct list_head        node;
 163 };
 164
 165 struct rbd_img_request;
 166 typedef void (*rbd_img_callback_t)(struct rbd_img_request *);
 167
 168 #define BAD_WHICH       U32_MAX         /* Good which or bad which, which? */
 169
 170 struct rbd_obj_request;
 171 typedef void (*rbd_obj_callback_t)(struct rbd_obj_request *);
 172
 173 enum obj_request_type {
 174         OBJ_REQUEST_NODATA, OBJ_REQUEST_BIO, OBJ_REQUEST_PAGES
 175 };
 176
 177 struct rbd_obj_request {
 178         const char              *object_name;
 179         u64                     offset;         /* object start byte */
 180         u64                     length;         /* bytes from offset */
 181
 182         struct rbd_img_request  *img_request;
 183         struct list_head        links;          /* img_request->obj_requests */
 184         u32                     which;          /* posn image request list */
 185
 186         enum obj_request_type   type;
 187         union {
 188                 struct bio      *bio_list;
 189                 struct {
 190                         struct page     **pages;
 191                         u32             page_count;
 192                 };
 193         };
 194
 195         struct ceph_osd_request *osd_req;
 196
 197         u64                     xferred;        /* bytes transferred */
 198         u64                     version;
 199         int                     result;
 200         atomic_t                done;
 201
 202         rbd_obj_callback_t      callback;
 203         struct completion       completion;
 204
 205         struct kref             kref;
 206 };
 207
 208 struct rbd_img_request {
 209         struct request          *rq;
 210         struct rbd_device       *rbd_dev;
 211         u64                     offset; /* starting image byte offset */
 212         u64                     length; /* byte count from offset */
 213         bool                    write_request;  /* false for read */
 214         union {
 215                 struct ceph_snap_context *snapc;        /* for writes */
 216                 u64             snap_id;                /* for reads */
 217         };
 218         spinlock_t              completion_lock;/* protects next_completion */
 219         u32                     next_completion;
 220         rbd_img_callback_t      callback;
 221
 222         u32                     obj_request_count;
 223         struct list_head        obj_requests;   /* rbd_obj_request structs */
 224
 225         struct kref             kref;
 226 };
 227
 228 #define for_each_obj_request(ireq, oreq) \
 229         list_for_each_entry(oreq, &(ireq)->obj_requests, links)
 230 #define for_each_obj_request_from(ireq, oreq) \
 231         list_for_each_entry_from(oreq, &(ireq)->obj_requests, links)
 232 #define for_each_obj_request_safe(ireq, oreq, n) \
 233         list_for_each_entry_safe_reverse(oreq, n, &(ireq)->obj_requests, links)
 234
 235 struct rbd_snap {
 236         struct  device          dev;
 237         const char              *name;
 238         u64                     size;
 239         struct list_head        node;
 240         u64                     id;
 241         u64                     features;
 242 };
 243
 244 struct rbd_mapping {
 245         u64                     size;
 246         u64                     features;
 247         bool                    read_only;
 248 };
 249
 250 /*
 251  * a single device
 252  */
 253 struct rbd_device {
 254         int                     dev_id;         /* blkdev unique id */
 255
 256         int                     major;          /* blkdev assigned major */
 257         struct gendisk          *disk;          /* blkdev's gendisk and rq */
 258
 259         u32                     image_format;   /* Either 1 or 2 */
 260         struct rbd_client       *rbd_client;
 261
 262         char                    name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */
 263
 264         spinlock_t              lock;           /* queue, flags, open_count */
 265
 266         struct rbd_image_header header;
 267         unsigned long           flags;          /* possibly lock protected */
 268         struct rbd_spec         *spec;
 269
 270         char                    *header_name;
 271
 272         struct ceph_file_layout layout;
 273
 274         struct ceph_osd_event   *watch_event;
 275         struct rbd_obj_request  *watch_request;
 276
 277         struct rbd_spec         *parent_spec;
 278         u64                     parent_overlap;
 279
 280         /* protects updating the header */
 281         struct rw_semaphore     header_rwsem;
 282
 283         struct rbd_mapping      mapping;
 284
 285         struct list_head        node;
 286
 287         /* list of snapshots */
 288         struct list_head        snaps;
 289
 290         /* sysfs related */
 291         struct device           dev;
 292         unsigned long           open_count;     /* protected by lock */
 293 };
 294
 295 /*
 296  * Flag bits for rbd_dev->flags.  If atomicity is required,
 297  * rbd_dev->lock is used to protect access.
 298  *
 299  * Currently, only the "removing" flag (which is coupled with the
 300  * "open_count" field) requires atomic access.
 301  */
 302 enum rbd_dev_flags {
 303         RBD_DEV_FLAG_EXISTS,    /* mapped snapshot has not been deleted */
 304         RBD_DEV_FLAG_REMOVING,  /* this mapping is being removed */
 305 };
 306
 307 static DEFINE_MUTEX(ctl_mutex);   /* Serialize open/close/setup/teardown */
 308
 309 static LIST_HEAD(rbd_dev_list);    /* devices */
 310 static DEFINE_SPINLOCK(rbd_dev_list_lock);
 311
 312 static LIST_HEAD(rbd_client_list);              /* clients */
 313 static DEFINE_SPINLOCK(rbd_client_list_lock);
 314
 315 static int rbd_dev_snaps_update(struct rbd_device *rbd_dev);
 316 static int rbd_dev_snaps_register(struct rbd_device *rbd_dev);
 317
 318 static void rbd_dev_release(struct device *dev);
 319 static void rbd_remove_snap_dev(struct rbd_snap *snap);
 320
 321 static ssize_t rbd_add(struct bus_type *bus, const char *buf,
 322                        size_t count);
 323 static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
 324                           size_t count);
 325
 326 static struct bus_attribute rbd_bus_attrs[] = {
 327         __ATTR(add, S_IWUSR, NULL, rbd_add),
 328         __ATTR(remove, S_IWUSR, NULL, rbd_remove),
 329         __ATTR_NULL
 330 };
 331
 332 static struct bus_type rbd_bus_type = {
 333         .name           = "rbd",
 334         .bus_attrs      = rbd_bus_attrs,
 335 };
 336
 337 static void rbd_root_dev_release(struct device *dev)
 338 {
 339 }
 340
 341 static struct device rbd_root_dev = {
 342         .init_name =    "rbd",
 343         .release =      rbd_root_dev_release,
 344 };
 345
 346 static __printf(2, 3)
 347 void rbd_warn(struct rbd_device *rbd_dev, const char *fmt, ...)
 348 {
 349         struct va_format vaf;
 350         va_list args;
 351
 352         va_start(args, fmt);
 353         vaf.fmt = fmt;
 354         vaf.va = &args;
 355
 356         if (!rbd_dev)
 357                 printk(KERN_WARNING "%s: %pV\n", RBD_DRV_NAME, &vaf);
 358         else if (rbd_dev->disk)
 359                 printk(KERN_WARNING "%s: %s: %pV\n",
 360                         RBD_DRV_NAME, rbd_dev->disk->disk_name, &vaf);
 361         else if (rbd_dev->spec && rbd_dev->spec->image_name)
 362                 printk(KERN_WARNING "%s: image %s: %pV\n",
 363                         RBD_DRV_NAME, rbd_dev->spec->image_name, &vaf);
 364         else if (rbd_dev->spec && rbd_dev->spec->image_id)
 365                 printk(KERN_WARNING "%s: id %s: %pV\n",
 366                         RBD_DRV_NAME, rbd_dev->spec->image_id, &vaf);
 367         else    /* punt */
 368                 printk(KERN_WARNING "%s: rbd_dev %p: %pV\n",
 369                         RBD_DRV_NAME, rbd_dev, &vaf);
 370         va_end(args);
 371 }
 372
 373 #ifdef RBD_DEBUG
 374 #define rbd_assert(expr)                                                \
 375                 if (unlikely(!(expr))) {                                \
 376                         printk(KERN_ERR "\nAssertion failure in %s() "  \
 377                                                 "at line %d:\n\n"       \
 378                                         "\trbd_assert(%s);\n\n",        \
 379                                         __func__, __LINE__, #expr);     \
 380                         BUG();                                          \
 381                 }
 382 #else /* !RBD_DEBUG */
 383 #  define rbd_assert(expr)      ((void) 0)
 384 #endif /* !RBD_DEBUG */
 385
 386 static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver);
 387 static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver);
 388
 389 static int rbd_open(struct block_device *bdev, fmode_t mode)
 390 {
 391         struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
 392         bool removing = false;
 393
 394         if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only)
 395                 return -EROFS;
 396
 397         spin_lock_irq(&rbd_dev->lock);
 398         if (test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags))
 399                 removing = true;
 400         else
 401                 rbd_dev->open_count++;
 402         spin_unlock_irq(&rbd_dev->lock);
 403         if (removing)
 404                 return -ENOENT;
 405
 406         mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
 407         (void) get_device(&rbd_dev->dev);
 408         set_device_ro(bdev, rbd_dev->mapping.read_only);
 409         mutex_unlock(&ctl_mutex);
 410
 411         return 0;
 412 }
 413
 414 static int rbd_release(struct gendisk *disk, fmode_t mode)
 415 {
 416         struct rbd_device *rbd_dev = disk->private_data;
 417         unsigned long open_count_before;
 418
 419         spin_lock_irq(&rbd_dev->lock);
 420         open_count_before = rbd_dev->open_count--;
 421         spin_unlock_irq(&rbd_dev->lock);
 422         rbd_assert(open_count_before > 0);
 423
 424         mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
 425         put_device(&rbd_dev->dev);
 426         mutex_unlock(&ctl_mutex);
 427
 428         return 0;
 429 }
 430
 431 static const struct block_device_operations rbd_bd_ops = {
 432         .owner                  = THIS_MODULE,
 433         .open                   = rbd_open,
 434         .release                = rbd_release,
 435 };
 436
 437 /*
 438  * Initialize an rbd client instance.
 439  * We own *ceph_opts.
 440  */
 441 static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
 442 {
 443         struct rbd_client *rbdc;
 444         int ret = -ENOMEM;
 445
 446         dout("%s:\n", __func__);
 447         rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
 448         if (!rbdc)
 449                 goto out_opt;
 450
 451         kref_init(&rbdc->kref);
 452         INIT_LIST_HEAD(&rbdc->node);
 453
 454         mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
 455
 456         rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0);
 457         if (IS_ERR(rbdc->client))
 458                 goto out_mutex;
 459         ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */
 460
 461         ret = ceph_open_session(rbdc->client);
 462         if (ret < 0)
 463                 goto out_err;
 464
 465         spin_lock(&rbd_client_list_lock);
 466         list_add_tail(&rbdc->node, &rbd_client_list);
 467         spin_unlock(&rbd_client_list_lock);
 468
 469         mutex_unlock(&ctl_mutex);
 470         dout("%s: rbdc %p\n", __func__, rbdc);
 471
 472         return rbdc;
 473
 474 out_err:
 475         ceph_destroy_client(rbdc->client);
 476 out_mutex:
 477         mutex_unlock(&ctl_mutex);
 478         kfree(rbdc);
 479 out_opt:
 480         if (ceph_opts)
 481                 ceph_destroy_options(ceph_opts);
 482         dout("%s: error %d\n", __func__, ret);
 483
 484         return ERR_PTR(ret);
 485 }
 486
 487 /*
 488  * Find a ceph client with specific addr and configuration.  If
 489  * found, bump its reference count.
 490  */
 491 static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
 492 {
 493         struct rbd_client *client_node;
 494         bool found = false;
 495
 496         if (ceph_opts->flags & CEPH_OPT_NOSHARE)
 497                 return NULL;
 498
 499         spin_lock(&rbd_client_list_lock);
 500         list_for_each_entry(client_node, &rbd_client_list, node) {
 501                 if (!ceph_compare_options(ceph_opts, client_node->client)) {
 502                         kref_get(&client_node->kref);
 503                         found = true;
 504                         break;
 505                 }
 506         }
 507         spin_unlock(&rbd_client_list_lock);
 508
 509         return found ? client_node : NULL;
 510 }
 511
 512 /*
 513  * mount options
 514  */
 515 enum {
 516         Opt_last_int,
 517         /* int args above */
 518         Opt_last_string,
 519         /* string args above */
 520         Opt_read_only,
 521         Opt_read_write,
 522         /* Boolean args above */
 523         Opt_last_bool,
 524 };
 525
 526 static match_table_t rbd_opts_tokens = {
 527         /* int args above */
 528         /* string args above */
 529         {Opt_read_only, "read_only"},
 530         {Opt_read_only, "ro"},          /* Alternate spelling */
 531         {Opt_read_write, "read_write"},
 532         {Opt_read_write, "rw"},         /* Alternate spelling */
 533         /* Boolean args above */
 534         {-1, NULL}
 535 };
 536
 537 struct rbd_options {
 538         bool    read_only;
 539 };
 540
 541 #define RBD_READ_ONLY_DEFAULT   false
 542
 543 static int parse_rbd_opts_token(char *c, void *private)
 544 {
 545         struct rbd_options *rbd_opts = private;
 546         substring_t argstr[MAX_OPT_ARGS];
 547         int token, intval, ret;
 548
 549         token = match_token(c, rbd_opts_tokens, argstr);
 550         if (token < 0)
 551                 return -EINVAL;
 552
 553         if (token < Opt_last_int) {
 554                 ret = match_int(&argstr[0], &intval);
 555                 if (ret < 0) {
 556                         pr_err("bad mount option arg (not int) "
 557                                "at '%s'\n", c);
 558                         return ret;
 559                 }
 560                 dout("got int token %d val %d\n", token, intval);
 561         } else if (token > Opt_last_int && token < Opt_last_string) {
 562                 dout("got string token %d val %s\n", token,
 563                      argstr[0].from);
 564         } else if (token > Opt_last_string && token < Opt_last_bool) {
 565                 dout("got Boolean token %d\n", token);
 566         } else {
 567                 dout("got token %d\n", token);
 568         }
 569
 570         switch (token) {
 571         case Opt_read_only:
 572                 rbd_opts->read_only = true;
 573                 break;
 574         case Opt_read_write:
 575                 rbd_opts->read_only = false;
 576                 break;
 577         default:
 578                 rbd_assert(false);
 579                 break;
 580         }
 581         return 0;
 582 }
 583
 584 /*
 585  * Get a ceph client with specific addr and configuration, if one does
 586  * not exist create it.
 587  */
 588 static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts)
 589 {
 590         struct rbd_client *rbdc;
 591
 592         rbdc = rbd_client_find(ceph_opts);
 593         if (rbdc)       /* using an existing client */
 594                 ceph_destroy_options(ceph_opts);
 595         else
 596                 rbdc = rbd_client_create(ceph_opts);
 597
 598         return rbdc;
 599 }
 600
 601 /*
 602  * Destroy ceph client
 603  *
 604  * Caller must hold rbd_client_list_lock.
 605  */
 606 static void rbd_client_release(struct kref *kref)
 607 {
 608         struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);
 609
 610         dout("%s: rbdc %p\n", __func__, rbdc);
 611         spin_lock(&rbd_client_list_lock);
 612         list_del(&rbdc->node);
 613         spin_unlock(&rbd_client_list_lock);
 614
 615         ceph_destroy_client(rbdc->client);
 616         kfree(rbdc);
 617 }
 618
 619 /*
 620  * Drop reference to ceph client node. If it's not referenced anymore, release
 621  * it.
 622  */
 623 static void rbd_put_client(struct rbd_client *rbdc)
 624 {
 625         if (rbdc)
 626                 kref_put(&rbdc->kref, rbd_client_release);
 627 }
 628
 629 static bool rbd_image_format_valid(u32 image_format)
 630 {
 631         return image_format == 1 || image_format == 2;
 632 }
 633
 634 static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
 635 {
 636         size_t size;
 637         u32 snap_count;
 638
 639         /* The header has to start with the magic rbd header text */
 640         if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)))
 641                 return false;
 642
 643         /* The bio layer requires at least sector-sized I/O */
 644
 645         if (ondisk->options.order < SECTOR_SHIFT)
 646                 return false;
 647
 648         /* If we use u64 in a few spots we may be able to loosen this */
 649
 650         if (ondisk->options.order > 8 * sizeof (int) - 1)
 651                 return false;
 652
 653         /*
 654          * The size of a snapshot header has to fit in a size_t, and
 655          * that limits the number of snapshots.
 656          */
 657         snap_count = le32_to_cpu(ondisk->snap_count);
 658         size = SIZE_MAX - sizeof (struct ceph_snap_context);
 659         if (snap_count > size / sizeof (__le64))
 660                 return false;
 661
 662         /*
 663          * Not only that, but the size of the entire the snapshot
 664          * header must also be representable in a size_t.
 665          */
 666         size -= snap_count * sizeof (__le64);
 667         if ((u64) size < le64_to_cpu(ondisk->snap_names_len))
 668                 return false;
 669
 670         return true;
 671 }
 672
 673 /*
 674  * Create a new header structure, translate header format from the on-disk
 675  * header.
 676  */
 677 static int rbd_header_from_disk(struct rbd_image_header *header,
 678                                  struct rbd_image_header_ondisk *ondisk)
 679 {
 680         u32 snap_count;
 681         size_t len;
 682         size_t size;
 683         u32 i;
 684
 685         memset(header, 0, sizeof (*header));
 686
 687         snap_count = le32_to_cpu(ondisk->snap_count);
 688
 689         len = strnlen(ondisk->object_prefix, sizeof (ondisk->object_prefix));
 690         header->object_prefix = kmalloc(len + 1, GFP_KERNEL);
 691         if (!header->object_prefix)
 692                 return -ENOMEM;
 693         memcpy(header->object_prefix, ondisk->object_prefix, len);
 694         header->object_prefix[len] = '\0';
 695
 696         if (snap_count) {
 697                 u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len);
 698
 699                 /* Save a copy of the snapshot names */
 700
 701                 if (snap_names_len > (u64) SIZE_MAX)
 702                         return -EIO;
 703                 header->snap_names = kmalloc(snap_names_len, GFP_KERNEL);
 704                 if (!header->snap_names)
 705                         goto out_err;
 706                 /*
 707                  * Note that rbd_dev_v1_header_read() guarantees
 708                  * the ondisk buffer we're working with has
 709                  * snap_names_len bytes beyond the end of the
 710                  * snapshot id array, this memcpy() is safe.
 711                  */
 712                 memcpy(header->snap_names, &ondisk->snaps[snap_count],
 713                         snap_names_len);
 714
 715                 /* Record each snapshot's size */
 716
 717                 size = snap_count * sizeof (*header->snap_sizes);
 718                 header->snap_sizes = kmalloc(size, GFP_KERNEL);
 719                 if (!header->snap_sizes)
 720                         goto out_err;
 721                 for (i = 0; i < snap_count; i++)
 722                         header->snap_sizes[i] =
 723                                 le64_to_cpu(ondisk->snaps[i].image_size);
 724         } else {
 725                 WARN_ON(ondisk->snap_names_len);
 726                 header->snap_names = NULL;
 727                 header->snap_sizes = NULL;
 728         }
 729
 730         header->features = 0;   /* No features support in v1 images */
 731         header->obj_order = ondisk->options.order;
 732         header->crypt_type = ondisk->options.crypt_type;
 733         header->comp_type = ondisk->options.comp_type;
 734
 735         /* Allocate and fill in the snapshot context */
 736
 737         header->image_size = le64_to_cpu(ondisk->image_size);
 738         size = sizeof (struct ceph_snap_context);
 739         size += snap_count * sizeof (header->snapc->snaps[0]);
 740         header->snapc = kzalloc(size, GFP_KERNEL);
 741         if (!header->snapc)
 742                 goto out_err;
 743
 744         atomic_set(&header->snapc->nref, 1);
 745         header->snapc->seq = le64_to_cpu(ondisk->snap_seq);
 746         header->snapc->num_snaps = snap_count;
 747         for (i = 0; i < snap_count; i++)
 748                 header->snapc->snaps[i] =
 749                         le64_to_cpu(ondisk->snaps[i].id);
 750
 751         return 0;
 752
 753 out_err:
 754         kfree(header->snap_sizes);
 755         header->snap_sizes = NULL;
 756         kfree(header->snap_names);
 757         header->snap_names = NULL;
 758         kfree(header->object_prefix);
 759         header->object_prefix = NULL;
 760
 761         return -ENOMEM;
 762 }
 763
 764 static const char *rbd_snap_name(struct rbd_device *rbd_dev, u64 snap_id)
 765 {
 766         struct rbd_snap *snap;
 767
 768         if (snap_id == CEPH_NOSNAP)
 769                 return RBD_SNAP_HEAD_NAME;
 770
 771         list_for_each_entry(snap, &rbd_dev->snaps, node)
 772                 if (snap_id == snap->id)
 773                         return snap->name;
 774
 775         return NULL;
 776 }
 777
 778 static int snap_by_name(struct rbd_device *rbd_dev, const char *snap_name)
 779 {
 780
 781         struct rbd_snap *snap;
 782
 783         list_for_each_entry(snap, &rbd_dev->snaps, node) {
 784                 if (!strcmp(snap_name, snap->name)) {
 785                         rbd_dev->spec->snap_id = snap->id;
 786                         rbd_dev->mapping.size = snap->size;
 787                         rbd_dev->mapping.features = snap->features;
 788
 789                         return 0;
 790                 }
 791         }
 792
 793         return -ENOENT;
 794 }
 795
 796 static int rbd_dev_set_mapping(struct rbd_device *rbd_dev)
 797 {
 798         int ret;
 799
 800         if (!memcmp(rbd_dev->spec->snap_name, RBD_SNAP_HEAD_NAME,
 801                     sizeof (RBD_SNAP_HEAD_NAME))) {
 802                 rbd_dev->spec->snap_id = CEPH_NOSNAP;
 803                 rbd_dev->mapping.size = rbd_dev->header.image_size;
 804                 rbd_dev->mapping.features = rbd_dev->header.features;
 805                 ret = 0;
 806         } else {
 807                 ret = snap_by_name(rbd_dev, rbd_dev->spec->snap_name);
 808                 if (ret < 0)
 809                         goto done;
 810                 rbd_dev->mapping.read_only = true;
 811         }
 812         set_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
 813
 814 done:
 815         return ret;
 816 }
 817
 818 static void rbd_header_free(struct rbd_image_header *header)
 819 {
 820         kfree(header->object_prefix);
 821         header->object_prefix = NULL;
 822         kfree(header->snap_sizes);
 823         header->snap_sizes = NULL;
 824         kfree(header->snap_names);
 825         header->snap_names = NULL;
 826         ceph_put_snap_context(header->snapc);
 827         header->snapc = NULL;
 828 }
 829
 830 static const char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset)
 831 {
 832         char *name;
 833         u64 segment;
 834         int ret;
 835
 836         name = kmalloc(MAX_OBJ_NAME_SIZE + 1, GFP_NOIO);
 837         if (!name)
 838                 return NULL;
 839         segment = offset >> rbd_dev->header.obj_order;
 840         ret = snprintf(name, MAX_OBJ_NAME_SIZE + 1, "%s.%012llx",
 841                         rbd_dev->header.object_prefix, segment);
 842         if (ret < 0 || ret > MAX_OBJ_NAME_SIZE) {
 843                 pr_err("error formatting segment name for #%llu (%d)\n",
 844                         segment, ret);
 845                 kfree(name);
 846                 name = NULL;
 847         }
 848
 849         return name;
 850 }
 851
 852 static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset)
 853 {
 854         u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
 855
 856         return offset & (segment_size - 1);
 857 }
 858
 859 static u64 rbd_segment_length(struct rbd_device *rbd_dev,
 860                                 u64 offset, u64 length)
 861 {
 862         u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
 863
 864         offset &= segment_size - 1;
 865
 866         rbd_assert(length <= U64_MAX - offset);
 867         if (offset + length > segment_size)
 868                 length = segment_size - offset;
 869
 870         return length;
 871 }
 872
 873 /*
 874  * returns the size of an object in the image
 875  */
 876 static u64 rbd_obj_bytes(struct rbd_image_header *header)
 877 {
 878         return 1 << header->obj_order;
 879 }
 880
 881 /*
 882  * bio helpers
 883  */
 884
 885 static void bio_chain_put(struct bio *chain)
 886 {
 887         struct bio *tmp;
 888
 889         while (chain) {
 890                 tmp = chain;
 891                 chain = chain->bi_next;
 892                 bio_put(tmp);
 893         }
 894 }
 895
 896 /*
 897  * zeros a bio chain, starting at specific offset
 898  */
 899 static void zero_bio_chain(struct bio *chain, int start_ofs)
 900 {
 901         struct bio_vec *bv;
 902         unsigned long flags;
 903         void *buf;
 904         int i;
 905         int pos = 0;
 906
 907         while (chain) {
 908                 bio_for_each_segment(bv, chain, i) {
 909                         if (pos + bv->bv_len > start_ofs) {
 910                                 int remainder = max(start_ofs - pos, 0);
 911                                 buf = bvec_kmap_irq(bv, &flags);
 912                                 memset(buf + remainder, 0,
 913                                        bv->bv_len - remainder);
 914                                 bvec_kunmap_irq(buf, &flags);
 915                         }
 916                         pos += bv->bv_len;
 917                 }
 918
 919                 chain = chain->bi_next;
 920         }
 921 }
 922
 923 /*
 924  * Clone a portion of a bio, starting at the given byte offset
 925  * and continuing for the number of bytes indicated.
 926  */
 927 static struct bio *bio_clone_range(struct bio *bio_src,
 928                                         unsigned int offset,
 929                                         unsigned int len,
 930                                         gfp_t gfpmask)
 931 {
 932         struct bio_vec *bv;
 933         unsigned int resid;
 934         unsigned short idx;
 935         unsigned int voff;
 936         unsigned short end_idx;
 937         unsigned short vcnt;
 938         struct bio *bio;
 939
 940         /* Handle the easy case for the caller */
 941
 942         if (!offset && len == bio_src->bi_size)
 943                 return bio_clone(bio_src, gfpmask);
 944
 945         if (WARN_ON_ONCE(!len))
 946                 return NULL;
 947         if (WARN_ON_ONCE(len > bio_src->bi_size))
 948                 return NULL;
 949         if (WARN_ON_ONCE(offset > bio_src->bi_size - len))
 950                 return NULL;
 951
 952         /* Find first affected segment... */
 953
 954         resid = offset;
 955         __bio_for_each_segment(bv, bio_src, idx, 0) {
 956                 if (resid < bv->bv_len)
 957                         break;
 958                 resid -= bv->bv_len;
 959         }
 960         voff = resid;
 961
 962         /* ...and the last affected segment */
 963
 964         resid += len;
 965         __bio_for_each_segment(bv, bio_src, end_idx, idx) {
 966                 if (resid <= bv->bv_len)
 967                         break;
 968                 resid -= bv->bv_len;
 969         }
 970         vcnt = end_idx - idx + 1;
 971
 972         /* Build the clone */
 973
 974         bio = bio_alloc(gfpmask, (unsigned int) vcnt);
 975         if (!bio)
 976                 return NULL;    /* ENOMEM */
 977
 978         bio->bi_bdev = bio_src->bi_bdev;
 979         bio->bi_sector = bio_src->bi_sector + (offset >> SECTOR_SHIFT);
 980         bio->bi_rw = bio_src->bi_rw;
 981         bio->bi_flags |= 1 << BIO_CLONED;
 982
 983         /*
 984          * Copy over our part of the bio_vec, then update the first
 985          * and last (or only) entries.
 986          */
 987         memcpy(&bio->bi_io_vec[0], &bio_src->bi_io_vec[idx],
 988                         vcnt * sizeof (struct bio_vec));
 989         bio->bi_io_vec[0].bv_offset += voff;
 990         if (vcnt > 1) {
 991                 bio->bi_io_vec[0].bv_len -= voff;
 992                 bio->bi_io_vec[vcnt - 1].bv_len = resid;
 993         } else {
 994                 bio->bi_io_vec[0].bv_len = len;
 995         }
 996
 997         bio->bi_vcnt = vcnt;
 998         bio->bi_size = len;
 999         bio->bi_idx = 0;
1000
1001         return bio;
1002 }
1003
1004 /*
1005  * Clone a portion of a bio chain, starting at the given byte offset
1006  * into the first bio in the source chain and continuing for the
1007  * number of bytes indicated.  The result is another bio chain of
1008  * exactly the given length, or a null pointer on error.
1009  *
1010  * The bio_src and offset parameters are both in-out.  On entry they
1011  * refer to the first source bio and the offset into that bio where
1012  * the start of data to be cloned is located.
1013  *
1014  * On return, bio_src is updated to refer to the bio in the source
1015  * chain that contains first un-cloned byte, and *offset will
1016  * contain the offset of that byte within that bio.
1017  */
1018 static struct bio *bio_chain_clone_range(struct bio **bio_src,
1019                                         unsigned int *offset,
1020                                         unsigned int len,
1021                                         gfp_t gfpmask)
1022 {
1023         struct bio *bi = *bio_src;
1024         unsigned int off = *offset;
1025         struct bio *chain = NULL;
1026         struct bio **end;
1027
1028         /* Build up a chain of clone bios up to the limit */
1029
1030         if (!bi || off >= bi->bi_size || !len)
1031                 return NULL;            /* Nothing to clone */
1032
1033         end = &chain;
1034         while (len) {
1035                 unsigned int bi_size;
1036                 struct bio *bio;
1037
1038                 if (!bi) {
1039                         rbd_warn(NULL, "bio_chain exhausted with %u left", len);
1040                         goto out_err;   /* EINVAL; ran out of bio's */
1041                 }
1042                 bi_size = min_t(unsigned int, bi->bi_size - off, len);
1043                 bio = bio_clone_range(bi, off, bi_size, gfpmask);
1044                 if (!bio)
1045                         goto out_err;   /* ENOMEM */
1046
1047                 *end = bio;
1048                 end = &bio->bi_next;
1049
1050                 off += bi_size;
1051                 if (off == bi->bi_size) {
1052                         bi = bi->bi_next;
1053                         off = 0;
1054                 }
1055                 len -= bi_size;
1056         }
1057         *bio_src = bi;
1058         *offset = off;
1059
1060         return chain;
1061 out_err:
1062         bio_chain_put(chain);
1063
1064         return NULL;
1065 }
1066
1067 static void rbd_obj_request_get(struct rbd_obj_request *obj_request)
1068 {
1069         dout("%s: obj %p (was %d)\n", __func__, obj_request,
1070                 atomic_read(&obj_request->kref.refcount));
1071         kref_get(&obj_request->kref);
1072 }
1073
1074 static void rbd_obj_request_destroy(struct kref *kref);
1075 static void rbd_obj_request_put(struct rbd_obj_request *obj_request)
1076 {
1077         rbd_assert(obj_request != NULL);
1078         dout("%s: obj %p (was %d)\n", __func__, obj_request,
1079                 atomic_read(&obj_request->kref.refcount));
1080         kref_put(&obj_request->kref, rbd_obj_request_destroy);
1081 }
1082
1083 static void rbd_img_request_get(struct rbd_img_request *img_request)
1084 {
1085         dout("%s: img %p (was %d)\n", __func__, img_request,
1086                 atomic_read(&img_request->kref.refcount));
1087         kref_get(&img_request->kref);
1088 }
1089
1090 static void rbd_img_request_destroy(struct kref *kref);
1091 static void rbd_img_request_put(struct rbd_img_request *img_request)
1092 {
1093         rbd_assert(img_request != NULL);
1094         dout("%s: img %p (was %d)\n", __func__, img_request,
1095                 atomic_read(&img_request->kref.refcount));
1096         kref_put(&img_request->kref, rbd_img_request_destroy);
1097 }
1098
1099 static inline void rbd_img_obj_request_add(struct rbd_img_request *img_request,
1100                                         struct rbd_obj_request *obj_request)
1101 {
1102         rbd_assert(obj_request->img_request == NULL);
1103
1104         rbd_obj_request_get(obj_request);
1105         obj_request->img_request = img_request;
1106         obj_request->which = img_request->obj_request_count;
1107         rbd_assert(obj_request->which != BAD_WHICH);
1108         img_request->obj_request_count++;
1109         list_add_tail(&obj_request->links, &img_request->obj_requests);
1110         dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request,
1111                 obj_request->which);
1112 }
1113
1114 static inline void rbd_img_obj_request_del(struct rbd_img_request *img_request,
1115                                         struct rbd_obj_request *obj_request)
1116 {
1117         rbd_assert(obj_request->which != BAD_WHICH);
1118
1119         dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request,
1120                 obj_request->which);
1121         list_del(&obj_request->links);
1122         rbd_assert(img_request->obj_request_count > 0);
1123         img_request->obj_request_count--;
1124         rbd_assert(obj_request->which == img_request->obj_request_count);
1125         obj_request->which = BAD_WHICH;
1126         rbd_assert(obj_request->img_request == img_request);
1127         obj_request->img_request = NULL;
1128         obj_request->callback = NULL;
1129         rbd_obj_request_put(obj_request);
1130 }
1131
1132 static bool obj_request_type_valid(enum obj_request_type type)
1133 {
1134         switch (type) {
1135         case OBJ_REQUEST_NODATA:
1136         case OBJ_REQUEST_BIO:
1137         case OBJ_REQUEST_PAGES:
1138                 return true;
1139         default:
1140                 return false;
1141         }
1142 }
1143
1144 static struct ceph_osd_req_op *rbd_osd_req_op_create(u16 opcode, ...)
1145 {
1146         struct ceph_osd_req_op *op;
1147         va_list args;
1148         size_t size;
1149
1150         op = kzalloc(sizeof (*op), GFP_NOIO);
1151         if (!op)
1152                 return NULL;
1153         op->op = opcode;
1154         va_start(args, opcode);
1155         switch (opcode) {
1156         case CEPH_OSD_OP_READ:
1157         case CEPH_OSD_OP_WRITE:
1158                 /* rbd_osd_req_op_create(READ, offset, length) */
1159                 /* rbd_osd_req_op_create(WRITE, offset, length) */
1160                 op->extent.offset = va_arg(args, u64);
1161                 op->extent.length = va_arg(args, u64);
1162                 if (opcode == CEPH_OSD_OP_WRITE)
1163                         op->payload_len = op->extent.length;
1164                 break;
1165         case CEPH_OSD_OP_STAT:
1166                 break;
1167         case CEPH_OSD_OP_CALL:
1168                 /* rbd_osd_req_op_create(CALL, class, method, data, datalen) */
1169                 op->cls.class_name = va_arg(args, char *);
1170                 size = strlen(op->cls.class_name);
1171                 rbd_assert(size <= (size_t) U8_MAX);
1172                 op->cls.class_len = size;
1173                 op->payload_len = size;
1174
1175                 op->cls.method_name = va_arg(args, char *);
1176                 size = strlen(op->cls.method_name);
1177                 rbd_assert(size <= (size_t) U8_MAX);
1178                 op->cls.method_len = size;
1179                 op->payload_len += size;
1180
1181                 op->cls.argc = 0;
1182                 op->cls.indata = va_arg(args, void *);
1183                 size = va_arg(args, size_t);
1184                 rbd_assert(size <= (size_t) U32_MAX);
1185                 op->cls.indata_len = (u32) size;
1186                 op->payload_len += size;
1187                 break;
1188         case CEPH_OSD_OP_NOTIFY_ACK:
1189         case CEPH_OSD_OP_WATCH:
1190                 /* rbd_osd_req_op_create(NOTIFY_ACK, cookie, version) */
1191                 /* rbd_osd_req_op_create(WATCH, cookie, version, flag) */
1192                 op->watch.cookie = va_arg(args, u64);
1193                 op->watch.ver = va_arg(args, u64);
1194                 op->watch.ver = cpu_to_le64(op->watch.ver);
1195                 if (opcode == CEPH_OSD_OP_WATCH && va_arg(args, int))
1196                         op->watch.flag = (u8) 1;
1197                 break;
1198         default:
1199                 rbd_warn(NULL, "unsupported opcode %hu\n", opcode);
1200                 kfree(op);
1201                 op = NULL;
1202                 break;
1203         }
1204         va_end(args);
1205
1206         return op;
1207 }
1208
1209 static void rbd_osd_req_op_destroy(struct ceph_osd_req_op *op)
1210 {
1211         kfree(op);
1212 }
1213
1214 static int rbd_obj_request_submit(struct ceph_osd_client *osdc,
1215                                 struct rbd_obj_request *obj_request)
1216 {
1217         dout("%s: osdc %p obj %p\n", __func__, osdc, obj_request);
1218
1219         return ceph_osdc_start_request(osdc, obj_request->osd_req, false);
1220 }
1221
1222 static void rbd_img_request_complete(struct rbd_img_request *img_request)
1223 {
1224         dout("%s: img %p\n", __func__, img_request);
1225         if (img_request->callback)
1226                 img_request->callback(img_request);
1227         else
1228                 rbd_img_request_put(img_request);
1229 }
1230
1231 /* Caller is responsible for rbd_obj_request_destroy(obj_request) */
1232
1233 static int rbd_obj_request_wait(struct rbd_obj_request *obj_request)
1234 {
1235         dout("%s: obj %p\n", __func__, obj_request);
1236
1237         return wait_for_completion_interruptible(&obj_request->completion);
1238 }
1239
1240 static void obj_request_done_init(struct rbd_obj_request *obj_request)
1241 {
1242         atomic_set(&obj_request->done, 0);
1243         smp_wmb();
1244 }
1245
1246 static void obj_request_done_set(struct rbd_obj_request *obj_request)
1247 {
1248         int done;
1249
1250         done = atomic_inc_return(&obj_request->done);
1251         if (done > 1) {
1252                 struct rbd_img_request *img_request = obj_request->img_request;
1253                 struct rbd_device *rbd_dev;
1254
1255                 rbd_dev = img_request ? img_request->rbd_dev : NULL;
1256                 rbd_warn(rbd_dev, "obj_request %p was already done\n",
1257                         obj_request);
1258         }
1259 }
1260
1261 static bool obj_request_done_test(struct rbd_obj_request *obj_request)
1262 {
1263         smp_mb();
1264         return atomic_read(&obj_request->done) != 0;
1265 }
1266
1267 static void
1268 rbd_img_obj_request_read_callback(struct rbd_obj_request *obj_request)
1269 {
1270         dout("%s: obj %p img %p result %d %llu/%llu\n", __func__,
1271                 obj_request, obj_request->img_request, obj_request->result,
1272                 obj_request->xferred, obj_request->length);
1273         /*
1274          * ENOENT means a hole in the image.  We zero-fill the
1275          * entire length of the request.  A short read also implies
1276          * zero-fill to the end of the request.  Either way we
1277          * update the xferred count to indicate the whole request
1278          * was satisfied.
1279          */
1280         BUG_ON(obj_request->type != OBJ_REQUEST_BIO);
1281         if (obj_request->result == -ENOENT) {
1282                 zero_bio_chain(obj_request->bio_list, 0);
1283                 obj_request->result = 0;
1284                 obj_request->xferred = obj_request->length;
1285         } else if (obj_request->xferred < obj_request->length &&
1286                         !obj_request->result) {
1287                 zero_bio_chain(obj_request->bio_list, obj_request->xferred);
1288                 obj_request->xferred = obj_request->length;
1289         }
1290         obj_request_done_set(obj_request);
1291 }
1292
1293 static void rbd_obj_request_complete(struct rbd_obj_request *obj_request)
1294 {
1295         dout("%s: obj %p cb %p\n", __func__, obj_request,
1296                 obj_request->callback);
1297         if (obj_request->callback)
1298                 obj_request->callback(obj_request);
1299         else
1300                 complete_all(&obj_request->completion);
1301 }
1302
1303 static void rbd_osd_trivial_callback(struct rbd_obj_request *obj_request)
1304 {
1305         dout("%s: obj %p\n", __func__, obj_request);
1306         obj_request_done_set(obj_request);
1307 }
1308
1309 static void rbd_osd_read_callback(struct rbd_obj_request *obj_request)
1310 {
1311         dout("%s: obj %p result %d %llu/%llu\n", __func__, obj_request,
1312                 obj_request->result, obj_request->xferred, obj_request->length);
1313         if (obj_request->img_request)
1314                 rbd_img_obj_request_read_callback(obj_request);
1315         else
1316                 obj_request_done_set(obj_request);
1317 }
1318
1319 static void rbd_osd_write_callback(struct rbd_obj_request *obj_request)
1320 {
1321         dout("%s: obj %p result %d %llu\n", __func__, obj_request,
1322                 obj_request->result, obj_request->length);
1323         /*
1324          * There is no such thing as a successful short write.
1325          * Our xferred value is the number of bytes transferred
1326          * back.  Set it to our originally-requested length.
1327          */
1328         obj_request->xferred = obj_request->length;
1329         obj_request_done_set(obj_request);
1330 }
1331
1332 /*
1333  * For a simple stat call there's nothing to do.  We'll do more if
1334  * this is part of a write sequence for a layered image.
1335  */
1336 static void rbd_osd_stat_callback(struct rbd_obj_request *obj_request)
1337 {
1338         dout("%s: obj %p\n", __func__, obj_request);
1339         obj_request_done_set(obj_request);
1340 }
1341
1342 static void rbd_osd_req_callback(struct ceph_osd_request *osd_req,
1343                                 struct ceph_msg *msg)
1344 {
1345         struct rbd_obj_request *obj_request = osd_req->r_priv;
1346         u16 opcode;
1347
1348         dout("%s: osd_req %p msg %p\n", __func__, osd_req, msg);
1349         rbd_assert(osd_req == obj_request->osd_req);
1350         rbd_assert(!!obj_request->img_request ^
1351                                 (obj_request->which == BAD_WHICH));
1352
1353         if (osd_req->r_result < 0)
1354                 obj_request->result = osd_req->r_result;
1355         obj_request->version = le64_to_cpu(osd_req->r_reassert_version.version);
1356
1357         WARN_ON(osd_req->r_num_ops != 1);       /* For now */
1358
1359         /*
1360          * We support a 64-bit length, but ultimately it has to be
1361          * passed to blk_end_request(), which takes an unsigned int.
1362          */
1363         obj_request->xferred = osd_req->r_reply_op_len[0];
1364         rbd_assert(obj_request->xferred < (u64) UINT_MAX);
1365         opcode = osd_req->r_request_ops[0].op;
1366         switch (opcode) {
1367         case CEPH_OSD_OP_READ:
1368                 rbd_osd_read_callback(obj_request);
1369                 break;
1370         case CEPH_OSD_OP_WRITE:
1371                 rbd_osd_write_callback(obj_request);
1372                 break;
1373         case CEPH_OSD_OP_STAT:
1374                 rbd_osd_stat_callback(obj_request);
1375                 break;
1376         case CEPH_OSD_OP_CALL:
1377         case CEPH_OSD_OP_NOTIFY_ACK:
1378         case CEPH_OSD_OP_WATCH:
1379                 rbd_osd_trivial_callback(obj_request);
1380                 break;
1381         default:
1382                 rbd_warn(NULL, "%s: unsupported op %hu\n",
1383                         obj_request->object_name, (unsigned short) opcode);
1384                 break;
1385         }
1386
1387         if (obj_request_done_test(obj_request))
1388                 rbd_obj_request_complete(obj_request);
1389 }
1390
1391 static struct ceph_osd_request *rbd_osd_req_create(
1392                                         struct rbd_device *rbd_dev,
1393                                         bool write_request,
1394                                         struct rbd_obj_request *obj_request,
1395                                         struct ceph_osd_req_op *op)
1396 {
1397         struct rbd_img_request *img_request = obj_request->img_request;
1398         struct ceph_snap_context *snapc = NULL;
1399         struct ceph_osd_client *osdc;
1400         struct ceph_osd_request *osd_req;
1401         struct timespec now;
1402         struct timespec *mtime;
1403         u64 snap_id = CEPH_NOSNAP;
1404         u64 offset = obj_request->offset;
1405         u64 length = obj_request->length;
1406
1407         if (img_request) {
1408                 rbd_assert(img_request->write_request == write_request);
1409                 if (img_request->write_request)
1410                         snapc = img_request->snapc;
1411                 else
1412                         snap_id = img_request->snap_id;
1413         }
1414
1415         /* Allocate and initialize the request, for the single op */
1416
1417         osdc = &rbd_dev->rbd_client->client->osdc;
1418         osd_req = ceph_osdc_alloc_request(osdc, snapc, 1, false, GFP_ATOMIC);
1419         if (!osd_req)
1420                 return NULL;    /* ENOMEM */
1421
1422         rbd_assert(obj_request_type_valid(obj_request->type));
1423         switch (obj_request->type) {
1424         case OBJ_REQUEST_NODATA:
1425                 break;          /* Nothing to do */
1426         case OBJ_REQUEST_BIO:
1427                 rbd_assert(obj_request->bio_list != NULL);
1428                 osd_req->r_data.type = CEPH_OSD_DATA_TYPE_BIO;
1429                 osd_req->r_data.bio = obj_request->bio_list;
1430                 break;
1431         case OBJ_REQUEST_PAGES:
1432                 osd_req->r_data.type = CEPH_OSD_DATA_TYPE_PAGES;
1433                 osd_req->r_data.pages = obj_request->pages;
1434                 osd_req->r_data.num_pages = obj_request->page_count;
1435                 osd_req->r_data.alignment = offset & ~PAGE_MASK;
1436                 osd_req->r_data.pages_from_pool = false;
1437                 osd_req->r_data.own_pages = false;
1438                 break;
1439         }
1440
1441         if (write_request) {
1442                 osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
1443                 now = CURRENT_TIME;
1444                 mtime = &now;
1445         } else {
1446                 osd_req->r_flags = CEPH_OSD_FLAG_READ;
1447                 mtime = NULL;   /* not needed for reads */
1448                 offset = 0;     /* These are not used... */
1449                 length = 0;     /* ...for osd read requests */
1450         }
1451
1452         osd_req->r_callback = rbd_osd_req_callback;
1453         osd_req->r_priv = obj_request;
1454
1455         osd_req->r_oid_len = strlen(obj_request->object_name);
1456         rbd_assert(osd_req->r_oid_len < sizeof (osd_req->r_oid));
1457         memcpy(osd_req->r_oid, obj_request->object_name, osd_req->r_oid_len);
1458
1459         osd_req->r_file_layout = rbd_dev->layout;       /* struct */
1460
1461         /* osd_req will get its own reference to snapc (if non-null) */
1462
1463         ceph_osdc_build_request(osd_req, offset, length, 1, op,
1464                                 snapc, snap_id, mtime);
1465
1466         return osd_req;
1467 }
1468
1469 static void rbd_osd_req_destroy(struct ceph_osd_request *osd_req)
1470 {
1471         ceph_osdc_put_request(osd_req);
1472 }
1473
1474 /* object_name is assumed to be a non-null pointer and NUL-terminated */
1475
1476 static struct rbd_obj_request *rbd_obj_request_create(const char *object_name,
1477                                                 u64 offset, u64 length,
1478                                                 enum obj_request_type type)
1479 {
1480         struct rbd_obj_request *obj_request;
1481         size_t size;
1482         char *name;
1483
1484         rbd_assert(obj_request_type_valid(type));
1485
1486         size = strlen(object_name) + 1;
1487         obj_request = kzalloc(sizeof (*obj_request) + size, GFP_KERNEL);
1488         if (!obj_request)
1489                 return NULL;
1490
1491         name = (char *)(obj_request + 1);
1492         obj_request->object_name = memcpy(name, object_name, size);
1493         obj_request->offset = offset;
1494         obj_request->length = length;
1495         obj_request->which = BAD_WHICH;
1496         obj_request->type = type;
1497         INIT_LIST_HEAD(&obj_request->links);
1498         obj_request_done_init(obj_request);
1499         init_completion(&obj_request->completion);
1500         kref_init(&obj_request->kref);
1501
1502         dout("%s: \"%s\" %llu/%llu %d -> obj %p\n", __func__, object_name,
1503                 offset, length, (int)type, obj_request);
1504
1505         return obj_request;
1506 }
1507
1508 static void rbd_obj_request_destroy(struct kref *kref)
1509 {
1510         struct rbd_obj_request *obj_request;
1511
1512         obj_request = container_of(kref, struct rbd_obj_request, kref);
1513
1514         dout("%s: obj %p\n", __func__, obj_request);
1515
1516         rbd_assert(obj_request->img_request == NULL);
1517         rbd_assert(obj_request->which == BAD_WHICH);
1518
1519         if (obj_request->osd_req)
1520                 rbd_osd_req_destroy(obj_request->osd_req);
1521
1522         rbd_assert(obj_request_type_valid(obj_request->type));
1523         switch (obj_request->type) {
1524         case OBJ_REQUEST_NODATA:
1525                 break;          /* Nothing to do */
1526         case OBJ_REQUEST_BIO:
1527                 if (obj_request->bio_list)
1528                         bio_chain_put(obj_request->bio_list);
1529                 break;
1530         case OBJ_REQUEST_PAGES:
1531                 if (obj_request->pages)
1532                         ceph_release_page_vector(obj_request->pages,
1533                                                 obj_request->page_count);
1534                 break;
1535         }
1536
1537         kfree(obj_request);
1538 }
1539
1540 /*
1541  * Caller is responsible for filling in the list of object requests
1542  * that comprises the image request, and the Linux request pointer
1543  * (if there is one).
1544  */
1545 static struct rbd_img_request *rbd_img_request_create(
1546                                         struct rbd_device *rbd_dev,
1547                                         u64 offset, u64 length,
1548                                         bool write_request)
1549 {
1550         struct rbd_img_request *img_request;
1551         struct ceph_snap_context *snapc = NULL;
1552
1553         img_request = kmalloc(sizeof (*img_request), GFP_ATOMIC);
1554         if (!img_request)
1555                 return NULL;
1556
1557         if (write_request) {
1558                 down_read(&rbd_dev->header_rwsem);
1559                 snapc = ceph_get_snap_context(rbd_dev->header.snapc);
1560                 up_read(&rbd_dev->header_rwsem);
1561                 if (WARN_ON(!snapc)) {
1562                         kfree(img_request);
1563                         return NULL;    /* Shouldn't happen */
1564                 }
1565         }
1566
1567         img_request->rq = NULL;
1568         img_request->rbd_dev = rbd_dev;
1569         img_request->offset = offset;
1570         img_request->length = length;
1571         img_request->write_request = write_request;
1572         if (write_request)
1573                 img_request->snapc = snapc;
1574         else
1575                 img_request->snap_id = rbd_dev->spec->snap_id;
1576         spin_lock_init(&img_request->completion_lock);
1577         img_request->next_completion = 0;
1578         img_request->callback = NULL;
1579         img_request->obj_request_count = 0;
1580         INIT_LIST_HEAD(&img_request->obj_requests);
1581         kref_init(&img_request->kref);
1582
1583         rbd_img_request_get(img_request);       /* Avoid a warning */
1584         rbd_img_request_put(img_request);       /* TEMPORARY */
1585
1586         dout("%s: rbd_dev %p %s %llu/%llu -> img %p\n", __func__, rbd_dev,
1587                 write_request ? "write" : "read", offset, length,
1588                 img_request);
1589
1590         return img_request;
1591 }
1592
1593 static void rbd_img_request_destroy(struct kref *kref)
1594 {
1595         struct rbd_img_request *img_request;
1596         struct rbd_obj_request *obj_request;
1597         struct rbd_obj_request *next_obj_request;
1598
1599         img_request = container_of(kref, struct rbd_img_request, kref);
1600
1601         dout("%s: img %p\n", __func__, img_request);
1602
1603         for_each_obj_request_safe(img_request, obj_request, next_obj_request)
1604                 rbd_img_obj_request_del(img_request, obj_request);
1605         rbd_assert(img_request->obj_request_count == 0);
1606
1607         if (img_request->write_request)
1608                 ceph_put_snap_context(img_request->snapc);
1609
1610         kfree(img_request);
1611 }
1612
1613 static int rbd_img_request_fill_bio(struct rbd_img_request *img_request,
1614                                         struct bio *bio_list)
1615 {
1616         struct rbd_device *rbd_dev = img_request->rbd_dev;
1617         struct rbd_obj_request *obj_request = NULL;
1618         struct rbd_obj_request *next_obj_request;
1619         unsigned int bio_offset;
1620         u64 image_offset;
1621         u64 resid;
1622         u16 opcode;
1623
1624         dout("%s: img %p bio %p\n", __func__, img_request, bio_list);
1625
1626         opcode = img_request->write_request ? CEPH_OSD_OP_WRITE
1627                                               : CEPH_OSD_OP_READ;
1628         bio_offset = 0;
1629         image_offset = img_request->offset;
1630         rbd_assert(image_offset == bio_list->bi_sector << SECTOR_SHIFT);
1631         resid = img_request->length;
1632         rbd_assert(resid > 0);
1633         while (resid) {
1634                 const char *object_name;
1635                 unsigned int clone_size;
1636                 struct ceph_osd_req_op *op;
1637                 u64 offset;
1638                 u64 length;
1639
1640                 object_name = rbd_segment_name(rbd_dev, image_offset);
1641                 if (!object_name)
1642                         goto out_unwind;
1643                 offset = rbd_segment_offset(rbd_dev, image_offset);
1644                 length = rbd_segment_length(rbd_dev, image_offset, resid);
1645                 obj_request = rbd_obj_request_create(object_name,
1646                                                 offset, length,
1647                                                 OBJ_REQUEST_BIO);
1648                 kfree(object_name);     /* object request has its own copy */
1649                 if (!obj_request)
1650                         goto out_unwind;
1651
1652                 rbd_assert(length <= (u64) UINT_MAX);
1653                 clone_size = (unsigned int) length;
1654                 obj_request->bio_list = bio_chain_clone_range(&bio_list,
1655                                                 &bio_offset, clone_size,
1656                                                 GFP_ATOMIC);
1657                 if (!obj_request->bio_list)
1658                         goto out_partial;
1659
1660                 /*
1661                  * Build up the op to use in building the osd
1662                  * request.  Note that the contents of the op are
1663                  * copied by rbd_osd_req_create().
1664                  */
1665                 op = rbd_osd_req_op_create(opcode, offset, length);
1666                 if (!op)
1667                         goto out_partial;
1668                 obj_request->osd_req = rbd_osd_req_create(rbd_dev,
1669                                                 img_request->write_request,
1670                                                 obj_request, op);
1671                 rbd_osd_req_op_destroy(op);
1672                 if (!obj_request->osd_req)
1673                         goto out_partial;
1674                 /* status and version are initially zero-filled */
1675
1676                 rbd_img_obj_request_add(img_request, obj_request);
1677
1678                 image_offset += length;
1679                 resid -= length;
1680         }
1681
1682         return 0;
1683
1684 out_partial:
1685         rbd_obj_request_put(obj_request);
1686 out_unwind:
1687         for_each_obj_request_safe(img_request, obj_request, next_obj_request)
1688                 rbd_obj_request_put(obj_request);
1689
1690         return -ENOMEM;
1691 }
1692
1693 static void rbd_img_obj_callback(struct rbd_obj_request *obj_request)
1694 {
1695         struct rbd_img_request *img_request;
1696         u32 which = obj_request->which;
1697         bool more = true;
1698
1699         img_request = obj_request->img_request;
1700
1701         dout("%s: img %p obj %p\n", __func__, img_request, obj_request);
1702         rbd_assert(img_request != NULL);
1703         rbd_assert(img_request->rq != NULL);
1704         rbd_assert(img_request->obj_request_count > 0);
1705         rbd_assert(which != BAD_WHICH);
1706         rbd_assert(which < img_request->obj_request_count);
1707         rbd_assert(which >= img_request->next_completion);
1708
1709         spin_lock_irq(&img_request->completion_lock);
1710         if (which != img_request->next_completion)
1711                 goto out;
1712
1713         for_each_obj_request_from(img_request, obj_request) {
1714                 unsigned int xferred;
1715                 int result;
1716
1717                 rbd_assert(more);
1718                 rbd_assert(which < img_request->obj_request_count);
1719
1720                 if (!obj_request_done_test(obj_request))
1721                         break;
1722
1723                 rbd_assert(obj_request->xferred <= (u64) UINT_MAX);
1724                 xferred = (unsigned int) obj_request->xferred;
1725                 result = (int) obj_request->result;
1726                 if (result)
1727                         rbd_warn(NULL, "obj_request %s result %d xferred %u\n",
1728                                 img_request->write_request ? "write" : "read",
1729                                 result, xferred);
1730
1731                 more = blk_end_request(img_request->rq, result, xferred);
1732                 which++;
1733         }
1734
1735         rbd_assert(more ^ (which == img_request->obj_request_count));
1736         img_request->next_completion = which;
1737 out:
1738         spin_unlock_irq(&img_request->completion_lock);
1739
1740         if (!more)
1741                 rbd_img_request_complete(img_request);
1742 }
1743
1744 static int rbd_img_request_submit(struct rbd_img_request *img_request)
1745 {
1746         struct rbd_device *rbd_dev = img_request->rbd_dev;
1747         struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
1748         struct rbd_obj_request *obj_request;
1749         struct rbd_obj_request *next_obj_request;
1750
1751         dout("%s: img %p\n", __func__, img_request);
1752         for_each_obj_request_safe(img_request, obj_request, next_obj_request) {
1753                 int ret;
1754
1755                 obj_request->callback = rbd_img_obj_callback;
1756                 ret = rbd_obj_request_submit(osdc, obj_request);
1757                 if (ret)
1758                         return ret;
1759                 /*
1760                  * The image request has its own reference to each
1761                  * of its object requests, so we can safely drop the
1762                  * initial one here.
1763                  */
1764                 rbd_obj_request_put(obj_request);
1765         }
1766
1767         return 0;
1768 }
1769
1770 static int rbd_obj_notify_ack(struct rbd_device *rbd_dev,
1771                                    u64 ver, u64 notify_id)
1772 {
1773         struct rbd_obj_request *obj_request;
1774         struct ceph_osd_req_op *op;
1775         struct ceph_osd_client *osdc;
1776         int ret;
1777
1778         obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0,
1779                                                         OBJ_REQUEST_NODATA);
1780         if (!obj_request)
1781                 return -ENOMEM;
1782
1783         ret = -ENOMEM;
1784         op = rbd_osd_req_op_create(CEPH_OSD_OP_NOTIFY_ACK, notify_id, ver);
1785         if (!op)
1786                 goto out;
1787         obj_request->osd_req = rbd_osd_req_create(rbd_dev, false,
1788                                                 obj_request, op);
1789         rbd_osd_req_op_destroy(op);
1790         if (!obj_request->osd_req)
1791                 goto out;
1792
1793         osdc = &rbd_dev->rbd_client->client->osdc;
1794         obj_request->callback = rbd_obj_request_put;
1795         ret = rbd_obj_request_submit(osdc, obj_request);
1796 out:
1797         if (ret)
1798                 rbd_obj_request_put(obj_request);
1799
1800         return ret;
1801 }
1802
1803 static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
1804 {
1805         struct rbd_device *rbd_dev = (struct rbd_device *)data;
1806         u64 hver;
1807         int rc;
1808
1809         if (!rbd_dev)
1810                 return;
1811
1812         dout("%s: \"%s\" notify_id %llu opcode %u\n", __func__,
1813                 rbd_dev->header_name, (unsigned long long) notify_id,
1814                 (unsigned int) opcode);
1815         rc = rbd_dev_refresh(rbd_dev, &hver);
1816         if (rc)
1817                 rbd_warn(rbd_dev, "got notification but failed to "
1818                            " update snaps: %d\n", rc);
1819
1820         rbd_obj_notify_ack(rbd_dev, hver, notify_id);
1821 }
1822
1823 /*
1824  * Request sync osd watch/unwatch.  The value of "start" determines
1825  * whether a watch request is being initiated or torn down.
1826  */
1827 static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev, int start)
1828 {
1829         struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
1830         struct rbd_obj_request *obj_request;
1831         struct ceph_osd_req_op *op;
1832         int ret;
1833
1834         rbd_assert(start ^ !!rbd_dev->watch_event);
1835         rbd_assert(start ^ !!rbd_dev->watch_request);
1836
1837         if (start) {
1838                 ret = ceph_osdc_create_event(osdc, rbd_watch_cb, rbd_dev,
1839                                                 &rbd_dev->watch_event);
1840                 if (ret < 0)
1841                         return ret;
1842                 rbd_assert(rbd_dev->watch_event != NULL);
1843         }
1844
1845         ret = -ENOMEM;
1846         obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0,
1847                                                         OBJ_REQUEST_NODATA);
1848         if (!obj_request)
1849                 goto out_cancel;
1850
1851         op = rbd_osd_req_op_create(CEPH_OSD_OP_WATCH,
1852                                 rbd_dev->watch_event->cookie,
1853                                 rbd_dev->header.obj_version, start);
1854         if (!op)
1855                 goto out_cancel;
1856         obj_request->osd_req = rbd_osd_req_create(rbd_dev, true,
1857                                                         obj_request, op);
1858         rbd_osd_req_op_destroy(op);
1859         if (!obj_request->osd_req)
1860                 goto out_cancel;
1861
1862         if (start)
1863                 ceph_osdc_set_request_linger(osdc, obj_request->osd_req);
1864         else
1865                 ceph_osdc_unregister_linger_request(osdc,
1866                                         rbd_dev->watch_request->osd_req);
1867         ret = rbd_obj_request_submit(osdc, obj_request);
1868         if (ret)
1869                 goto out_cancel;
1870         ret = rbd_obj_request_wait(obj_request);
1871         if (ret)
1872                 goto out_cancel;
1873         ret = obj_request->result;
1874         if (ret)
1875                 goto out_cancel;
1876
1877         /*
1878          * A watch request is set to linger, so the underlying osd
1879          * request won't go away until we unregister it.  We retain
1880          * a pointer to the object request during that time (in
1881          * rbd_dev->watch_request), so we'll keep a reference to
1882          * it.  We'll drop that reference (below) after we've
1883          * unregistered it.
1884          */
1885         if (start) {
1886                 rbd_dev->watch_request = obj_request;
1887
1888                 return 0;
1889         }
1890
1891         /* We have successfully torn down the watch request */
1892
1893         rbd_obj_request_put(rbd_dev->watch_request);
1894         rbd_dev->watch_request = NULL;
1895 out_cancel:
1896         /* Cancel the event if we're tearing down, or on error */
1897         ceph_osdc_cancel_event(rbd_dev->watch_event);
1898         rbd_dev->watch_event = NULL;
1899         if (obj_request)
1900                 rbd_obj_request_put(obj_request);
1901
1902         return ret;
1903 }
1904
1905 /*
1906  * Synchronous osd object method call
1907  */
1908 static int rbd_obj_method_sync(struct rbd_device *rbd_dev,
1909                              const char *object_name,
1910                              const char *class_name,
1911                              const char *method_name,
1912                              const char *outbound,
1913                              size_t outbound_size,
1914                              char *inbound,
1915                              size_t inbound_size,
1916                              u64 *version)
1917 {
1918         struct rbd_obj_request *obj_request;
1919         struct ceph_osd_client *osdc;
1920         struct ceph_osd_req_op *op;
1921         struct page **pages;
1922         u32 page_count;
1923         int ret;
1924
1925         /*
1926          * Method calls are ultimately read operations but they
1927          * don't involve object data (so no offset or length).
1928          * The result should placed into the inbound buffer
1929          * provided.  They also supply outbound data--parameters for
1930          * the object method.  Currently if this is present it will
1931          * be a snapshot id.
1932          */
1933         page_count = (u32) calc_pages_for(0, inbound_size);
1934         pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
1935         if (IS_ERR(pages))
1936                 return PTR_ERR(pages);
1937
1938         ret = -ENOMEM;
1939         obj_request = rbd_obj_request_create(object_name, 0, 0,
1940                                                         OBJ_REQUEST_PAGES);
1941         if (!obj_request)
1942                 goto out;
1943
1944         obj_request->pages = pages;
1945         obj_request->page_count = page_count;
1946
1947         op = rbd_osd_req_op_create(CEPH_OSD_OP_CALL, class_name,
1948                                         method_name, outbound, outbound_size);
1949         if (!op)
1950                 goto out;
1951         obj_request->osd_req = rbd_osd_req_create(rbd_dev, false,
1952                                                 obj_request, op);
1953         rbd_osd_req_op_destroy(op);
1954         if (!obj_request->osd_req)
1955                 goto out;
1956
1957         osdc = &rbd_dev->rbd_client->client->osdc;
1958         ret = rbd_obj_request_submit(osdc, obj_request);
1959         if (ret)
1960                 goto out;
1961         ret = rbd_obj_request_wait(obj_request);
1962         if (ret)
1963                 goto out;
1964
1965         ret = obj_request->result;
1966         if (ret < 0)
1967                 goto out;
1968         ret = 0;
1969         ceph_copy_from_page_vector(pages, inbound, 0, obj_request->xferred);
1970         if (version)
1971                 *version = obj_request->version;
1972 out:
1973         if (obj_request)
1974                 rbd_obj_request_put(obj_request);
1975         else
1976                 ceph_release_page_vector(pages, page_count);
1977
1978         return ret;
1979 }
1980
1981 static void rbd_request_fn(struct request_queue *q)
1982                 __releases(q->queue_lock) __acquires(q->queue_lock)
1983 {
1984         struct rbd_device *rbd_dev = q->queuedata;
1985         bool read_only = rbd_dev->mapping.read_only;
1986         struct request *rq;
1987         int result;
1988
1989         while ((rq = blk_fetch_request(q))) {
1990                 bool write_request = rq_data_dir(rq) == WRITE;
1991                 struct rbd_img_request *img_request;
1992                 u64 offset;
1993                 u64 length;
1994
1995                 /* Ignore any non-FS requests that filter through. */
1996
1997                 if (rq->cmd_type != REQ_TYPE_FS) {
1998                         dout("%s: non-fs request type %d\n", __func__,
1999                                 (int) rq->cmd_type);
2000                         __blk_end_request_all(rq, 0);
2001                         continue;
2002                 }
2003
2004                 /* Ignore/skip any zero-length requests */
2005
2006                 offset = (u64) blk_rq_pos(rq) << SECTOR_SHIFT;
2007                 length = (u64) blk_rq_bytes(rq);
2008
2009                 if (!length) {
2010                         dout("%s: zero-length request\n", __func__);
2011                         __blk_end_request_all(rq, 0);
2012                         continue;
2013                 }
2014
2015                 spin_unlock_irq(q->queue_lock);
2016
2017                 /* Disallow writes to a read-only device */
2018
2019                 if (write_request) {
2020                         result = -EROFS;
2021                         if (read_only)
2022                                 goto end_request;
2023                         rbd_assert(rbd_dev->spec->snap_id == CEPH_NOSNAP);
2024                 }
2025
2026                 /*
2027                  * Quit early if the mapped snapshot no longer
2028                  * exists.  It's still possible the snapshot will
2029                  * have disappeared by the time our request arrives
2030                  * at the osd, but there's no sense in sending it if
2031                  * we already know.
2032                  */
2033                 if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags)) {
2034                         dout("request for non-existent snapshot");
2035                         rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP);
2036                         result = -ENXIO;
2037                         goto end_request;
2038                 }
2039
2040                 result = -EINVAL;
2041                 if (WARN_ON(offset && length > U64_MAX - offset + 1))
2042                         goto end_request;       /* Shouldn't happen */
2043
2044                 result = -ENOMEM;
2045                 img_request = rbd_img_request_create(rbd_dev, offset, length,
2046                                                         write_request);
2047                 if (!img_request)
2048                         goto end_request;
2049
2050                 img_request->rq = rq;
2051
2052                 result = rbd_img_request_fill_bio(img_request, rq->bio);
2053                 if (!result)
2054                         result = rbd_img_request_submit(img_request);
2055                 if (result)
2056                         rbd_img_request_put(img_request);
2057 end_request:
2058                 spin_lock_irq(q->queue_lock);
2059                 if (result < 0) {
2060                         rbd_warn(rbd_dev, "obj_request %s result %d\n",
2061                                 write_request ? "write" : "read", result);
2062                         __blk_end_request_all(rq, result);
2063                 }
2064         }
2065 }
2066
2067 /*
2068  * a queue callback. Makes sure that we don't create a bio that spans across
2069  * multiple osd objects. One exception would be with a single page bios,
2070  * which we handle later at bio_chain_clone_range()
2071  */
2072 static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd,
2073                           struct bio_vec *bvec)
2074 {
2075         struct rbd_device *rbd_dev = q->queuedata;
2076         sector_t sector_offset;
2077         sector_t sectors_per_obj;
2078         sector_t obj_sector_offset;
2079         int ret;
2080
2081         /*
2082          * Find how far into its rbd object the partition-relative
2083          * bio start sector is to offset relative to the enclosing
2084          * device.
2085          */
2086         sector_offset = get_start_sect(bmd->bi_bdev) + bmd->bi_sector;
2087         sectors_per_obj = 1 << (rbd_dev->header.obj_order - SECTOR_SHIFT);
2088         obj_sector_offset = sector_offset & (sectors_per_obj - 1);
2089
2090         /*
2091          * Compute the number of bytes from that offset to the end
2092          * of the object.  Account for what's already used by the bio.
2093          */
2094         ret = (int) (sectors_per_obj - obj_sector_offset) << SECTOR_SHIFT;
2095         if (ret > bmd->bi_size)
2096                 ret -= bmd->bi_size;
2097         else
2098                 ret = 0;
2099
2100         /*
2101          * Don't send back more than was asked for.  And if the bio
2102          * was empty, let the whole thing through because:  "Note
2103          * that a block device *must* allow a single page to be
2104          * added to an empty bio."
2105          */
2106         rbd_assert(bvec->bv_len <= PAGE_SIZE);
2107         if (ret > (int) bvec->bv_len || !bmd->bi_size)
2108                 ret = (int) bvec->bv_len;
2109
2110         return ret;
2111 }
2112
2113 static void rbd_free_disk(struct rbd_device *rbd_dev)
2114 {
2115         struct gendisk *disk = rbd_dev->disk;
2116
2117         if (!disk)
2118                 return;
2119
2120         if (disk->flags & GENHD_FL_UP)
2121                 del_gendisk(disk);
2122         if (disk->queue)
2123                 blk_cleanup_queue(disk->queue);
2124         put_disk(disk);
2125 }
2126
2127 static int rbd_obj_read_sync(struct rbd_device *rbd_dev,
2128                                 const char *object_name,
2129                                 u64 offset, u64 length,
2130                                 char *buf, u64 *version)
2131
2132 {
2133         struct ceph_osd_req_op *op;
2134         struct rbd_obj_request *obj_request;
2135         struct ceph_osd_client *osdc;
2136         struct page **pages = NULL;
2137         u32 page_count;
2138         size_t size;
2139         int ret;
2140
2141         page_count = (u32) calc_pages_for(offset, length);
2142         pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
2143         if (IS_ERR(pages))
2144                 ret = PTR_ERR(pages);
2145
2146         ret = -ENOMEM;
2147         obj_request = rbd_obj_request_create(object_name, offset, length,
2148                                                         OBJ_REQUEST_PAGES);
2149         if (!obj_request)
2150                 goto out;
2151
2152         obj_request->pages = pages;
2153         obj_request->page_count = page_count;
2154
2155         op = rbd_osd_req_op_create(CEPH_OSD_OP_READ, offset, length);
2156         if (!op)
2157                 goto out;
2158         obj_request->osd_req = rbd_osd_req_create(rbd_dev, false,
2159                                                 obj_request, op);
2160         rbd_osd_req_op_destroy(op);
2161         if (!obj_request->osd_req)
2162                 goto out;
2163
2164         osdc = &rbd_dev->rbd_client->client->osdc;
2165         ret = rbd_obj_request_submit(osdc, obj_request);
2166         if (ret)
2167                 goto out;
2168         ret = rbd_obj_request_wait(obj_request);
2169         if (ret)
2170                 goto out;
2171
2172         ret = obj_request->result;
2173         if (ret < 0)
2174                 goto out;
2175
2176         rbd_assert(obj_request->xferred <= (u64) SIZE_MAX);
2177         size = (size_t) obj_request->xferred;
2178         ceph_copy_from_page_vector(pages, buf, 0, size);
2179         rbd_assert(size <= (size_t) INT_MAX);
2180         ret = (int) size;
2181         if (version)
2182                 *version = obj_request->version;
2183 out:
2184         if (obj_request)
2185                 rbd_obj_request_put(obj_request);
2186         else
2187                 ceph_release_page_vector(pages, page_count);
2188
2189         return ret;
2190 }
2191
2192 /*
2193  * Read the complete header for the given rbd device.
2194  *
2195  * Returns a pointer to a dynamically-allocated buffer containing
2196  * the complete and validated header.  Caller can pass the address
2197  * of a variable that will be filled in with the version of the
2198  * header object at the time it was read.
2199  *
2200  * Returns a pointer-coded errno if a failure occurs.
2201  */
2202 static struct rbd_image_header_ondisk *
2203 rbd_dev_v1_header_read(struct rbd_device *rbd_dev, u64 *version)
2204 {
2205         struct rbd_image_header_ondisk *ondisk = NULL;
2206         u32 snap_count = 0;
2207         u64 names_size = 0;
2208         u32 want_count;
2209         int ret;
2210
2211         /*
2212          * The complete header will include an array of its 64-bit
2213          * snapshot ids, followed by the names of those snapshots as
2214          * a contiguous block of NUL-terminated strings.  Note that
2215          * the number of snapshots could change by the time we read
2216          * it in, in which case we re-read it.
2217          */
2218         do {
2219                 size_t size;
2220
2221                 kfree(ondisk);
2222
2223                 size = sizeof (*ondisk);
2224                 size += snap_count * sizeof (struct rbd_image_snap_ondisk);
2225                 size += names_size;
2226                 ondisk = kmalloc(size, GFP_KERNEL);
2227                 if (!ondisk)
2228                         return ERR_PTR(-ENOMEM);
2229
2230                 ret = rbd_obj_read_sync(rbd_dev, rbd_dev->header_name,
2231                                        0, size,
2232                                        (char *) ondisk, version);
2233                 if (ret < 0)
2234                         goto out_err;
2235                 if (WARN_ON((size_t) ret < size)) {
2236                         ret = -ENXIO;
2237                         rbd_warn(rbd_dev, "short header read (want %zd got %d)",
2238                                 size, ret);
2239                         goto out_err;
2240                 }
2241                 if (!rbd_dev_ondisk_valid(ondisk)) {
2242                         ret = -ENXIO;
2243                         rbd_warn(rbd_dev, "invalid header");
2244                         goto out_err;
2245                 }
2246
2247                 names_size = le64_to_cpu(ondisk->snap_names_len);
2248                 want_count = snap_count;
2249                 snap_count = le32_to_cpu(ondisk->snap_count);
2250         } while (snap_count != want_count);
2251
2252         return ondisk;
2253
2254 out_err:
2255         kfree(ondisk);
2256
2257         return ERR_PTR(ret);
2258 }
2259
2260 /*
2261  * reload the ondisk the header
2262  */
2263 static int rbd_read_header(struct rbd_device *rbd_dev,
2264                            struct rbd_image_header *header)
2265 {
2266         struct rbd_image_header_ondisk *ondisk;
2267         u64 ver = 0;
2268         int ret;
2269
2270         ondisk = rbd_dev_v1_header_read(rbd_dev, &ver);
2271         if (IS_ERR(ondisk))
2272                 return PTR_ERR(ondisk);
2273         ret = rbd_header_from_disk(header, ondisk);
2274         if (ret >= 0)
2275                 header->obj_version = ver;
2276         kfree(ondisk);
2277
2278         return ret;
2279 }
2280
2281 static void rbd_remove_all_snaps(struct rbd_device *rbd_dev)
2282 {
2283         struct rbd_snap *snap;
2284         struct rbd_snap *next;
2285
2286         list_for_each_entry_safe(snap, next, &rbd_dev->snaps, node)
2287                 rbd_remove_snap_dev(snap);
2288 }
2289
2290 static void rbd_update_mapping_size(struct rbd_device *rbd_dev)
2291 {
2292         sector_t size;
2293
2294         if (rbd_dev->spec->snap_id != CEPH_NOSNAP)
2295                 return;
2296
2297         size = (sector_t) rbd_dev->header.image_size / SECTOR_SIZE;
2298         dout("setting size to %llu sectors", (unsigned long long) size);
2299         rbd_dev->mapping.size = (u64) size;
2300         set_capacity(rbd_dev->disk, size);
2301 }
2302
2303 /*
2304  * only read the first part of the ondisk header, without the snaps info
2305  */
2306 static int rbd_dev_v1_refresh(struct rbd_device *rbd_dev, u64 *hver)
2307 {
2308         int ret;
2309         struct rbd_image_header h;
2310
2311         ret = rbd_read_header(rbd_dev, &h);
2312         if (ret < 0)
2313                 return ret;
2314
2315         down_write(&rbd_dev->header_rwsem);
2316
2317         /* Update image size, and check for resize of mapped image */
2318         rbd_dev->header.image_size = h.image_size;
2319         rbd_update_mapping_size(rbd_dev);
2320
2321         /* rbd_dev->header.object_prefix shouldn't change */
2322         kfree(rbd_dev->header.snap_sizes);
2323         kfree(rbd_dev->header.snap_names);
2324         /* osd requests may still refer to snapc */
2325         ceph_put_snap_context(rbd_dev->header.snapc);
2326
2327         if (hver)
2328                 *hver = h.obj_version;
2329         rbd_dev->header.obj_version = h.obj_version;
2330         rbd_dev->header.image_size = h.image_size;
2331         rbd_dev->header.snapc = h.snapc;
2332         rbd_dev->header.snap_names = h.snap_names;
2333         rbd_dev->header.snap_sizes = h.snap_sizes;
2334         /* Free the extra copy of the object prefix */
2335         WARN_ON(strcmp(rbd_dev->header.object_prefix, h.object_prefix));
2336         kfree(h.object_prefix);
2337
2338         ret = rbd_dev_snaps_update(rbd_dev);
2339         if (!ret)
2340                 ret = rbd_dev_snaps_register(rbd_dev);
2341
2342         up_write(&rbd_dev->header_rwsem);
2343
2344         return ret;
2345 }
2346
2347 static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver)
2348 {
2349         int ret;
2350
2351         rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
2352         mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
2353         if (rbd_dev->image_format == 1)
2354                 ret = rbd_dev_v1_refresh(rbd_dev, hver);
2355         else
2356                 ret = rbd_dev_v2_refresh(rbd_dev, hver);
2357         mutex_unlock(&ctl_mutex);
2358
2359         return ret;
2360 }
2361
2362 static int rbd_init_disk(struct rbd_device *rbd_dev)
2363 {
2364         struct gendisk *disk;
2365         struct request_queue *q;
2366         u64 segment_size;
2367
2368         /* create gendisk info */
2369         disk = alloc_disk(RBD_MINORS_PER_MAJOR);
2370         if (!disk)
2371                 return -ENOMEM;
2372
2373         snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
2374                  rbd_dev->dev_id);
2375         disk->major = rbd_dev->major;
2376         disk->first_minor = 0;
2377         disk->fops = &rbd_bd_ops;
2378         disk->private_data = rbd_dev;
2379
2380         q = blk_init_queue(rbd_request_fn, &rbd_dev->lock);
2381         if (!q)
2382                 goto out_disk;
2383
2384         /* We use the default size, but let's be explicit about it. */
2385         blk_queue_physical_block_size(q, SECTOR_SIZE);
2386
2387         /* set io sizes to object size */
2388         segment_size = rbd_obj_bytes(&rbd_dev->header);
2389         blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE);
2390         blk_queue_max_segment_size(q, segment_size);
2391         blk_queue_io_min(q, segment_size);
2392         blk_queue_io_opt(q, segment_size);
2393
2394         blk_queue_merge_bvec(q, rbd_merge_bvec);
2395         disk->queue = q;
2396
2397         q->queuedata = rbd_dev;
2398
2399         rbd_dev->disk = disk;
2400
2401         set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);
2402
2403         return 0;
2404 out_disk:
2405         put_disk(disk);
2406
2407         return -ENOMEM;
2408 }
2409
2410 /*
2411   sysfs
2412 */
2413
2414 static struct rbd_device *dev_to_rbd_dev(struct device *dev)
2415 {
2416         return container_of(dev, struct rbd_device, dev);
2417 }
2418
2419 static ssize_t rbd_size_show(struct device *dev,
2420                              struct device_attribute *attr, char *buf)
2421 {
2422         struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2423         sector_t size;
2424
2425         down_read(&rbd_dev->header_rwsem);
2426         size = get_capacity(rbd_dev->disk);
2427         up_read(&rbd_dev->header_rwsem);
2428
2429         return sprintf(buf, "%llu\n", (unsigned long long) size * SECTOR_SIZE);
2430 }
2431
2432 /*
2433  * Note this shows the features for whatever's mapped, which is not
2434  * necessarily the base image.
2435  */
2436 static ssize_t rbd_features_show(struct device *dev,
2437                              struct device_attribute *attr, char *buf)
2438 {
2439         struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2440
2441         return sprintf(buf, "0x%016llx\n",
2442                         (unsigned long long) rbd_dev->mapping.features);
2443 }
2444
2445 static ssize_t rbd_major_show(struct device *dev,
2446                               struct device_attribute *attr, char *buf)
2447 {
2448         struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2449
2450         return sprintf(buf, "%d\n", rbd_dev->major);
2451 }
2452
2453 static ssize_t rbd_client_id_show(struct device *dev,
2454                                   struct device_attribute *attr, char *buf)
2455 {
2456         struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2457
2458         return sprintf(buf, "client%lld\n",
2459                         ceph_client_id(rbd_dev->rbd_client->client));
2460 }
2461
2462 static ssize_t rbd_pool_show(struct device *dev,
2463                              struct device_attribute *attr, char *buf)
2464 {
2465         struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2466
2467         return sprintf(buf, "%s\n", rbd_dev->spec->pool_name);
2468 }
2469
2470 static ssize_t rbd_pool_id_show(struct device *dev,
2471                              struct device_attribute *attr, char *buf)
2472 {
2473         struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2474
2475         return sprintf(buf, "%llu\n",
2476                 (unsigned long long) rbd_dev->spec->pool_id);
2477 }
2478
2479 static ssize_t rbd_name_show(struct device *dev,
2480                              struct device_attribute *attr, char *buf)
2481 {
2482         struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2483
2484         if (rbd_dev->spec->image_name)
2485                 return sprintf(buf, "%s\n", rbd_dev->spec->image_name);
2486
2487         return sprintf(buf, "(unknown)\n");
2488 }
2489
2490 static ssize_t rbd_image_id_show(struct device *dev,
2491                              struct device_attribute *attr, char *buf)
2492 {
2493         struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2494
2495         return sprintf(buf, "%s\n", rbd_dev->spec->image_id);
2496 }
2497
2498 /*
2499  * Shows the name of the currently-mapped snapshot (or
2500  * RBD_SNAP_HEAD_NAME for the base image).
2501  */
2502 static ssize_t rbd_snap_show(struct device *dev,
2503                              struct device_attribute *attr,
2504                              char *buf)
2505 {
2506         struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2507
2508         return sprintf(buf, "%s\n", rbd_dev->spec->snap_name);
2509 }
2510
2511 /*
2512  * For an rbd v2 image, shows the pool id, image id, and snapshot id
2513  * for the parent image.  If there is no parent, simply shows
2514  * "(no parent image)".
2515  */
2516 static ssize_t rbd_parent_show(struct device *dev,
2517                              struct device_attribute *attr,
2518                              char *buf)
2519 {
2520         struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2521         struct rbd_spec *spec = rbd_dev->parent_spec;
2522         int count;
2523         char *bufp = buf;
2524
2525         if (!spec)
2526                 return sprintf(buf, "(no parent image)\n");
2527
2528         count = sprintf(bufp, "pool_id %llu\npool_name %s\n",
2529                         (unsigned long long) spec->pool_id, spec->pool_name);
2530         if (count < 0)
2531                 return count;
2532         bufp += count;
2533
2534         count = sprintf(bufp, "image_id %s\nimage_name %s\n", spec->image_id,
2535                         spec->image_name ? spec->image_name : "(unknown)");
2536         if (count < 0)
2537                 return count;
2538         bufp += count;
2539
2540         count = sprintf(bufp, "snap_id %llu\nsnap_name %s\n",
2541                         (unsigned long long) spec->snap_id, spec->snap_name);
2542         if (count < 0)
2543                 return count;
2544         bufp += count;
2545
2546         count = sprintf(bufp, "overlap %llu\n", rbd_dev->parent_overlap);
2547         if (count < 0)
2548                 return count;
2549         bufp += count;
2550
2551         return (ssize_t) (bufp - buf);
2552 }
2553
2554 static ssize_t rbd_image_refresh(struct device *dev,
2555                                  struct device_attribute *attr,
2556                                  const char *buf,
2557                                  size_t size)
2558 {
2559         struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2560         int ret;
2561
2562         ret = rbd_dev_refresh(rbd_dev, NULL);
2563
2564         return ret < 0 ? ret : size;
2565 }
2566
2567 static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
2568 static DEVICE_ATTR(features, S_IRUGO, rbd_features_show, NULL);
2569 static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
2570 static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
2571 static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
2572 static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL);
2573 static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
2574 static DEVICE_ATTR(image_id, S_IRUGO, rbd_image_id_show, NULL);
2575 static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);
2576 static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);
2577 static DEVICE_ATTR(parent, S_IRUGO, rbd_parent_show, NULL);
2578
2579 static struct attribute *rbd_attrs[] = {
2580         &dev_attr_size.attr,
2581         &dev_attr_features.attr,
2582         &dev_attr_major.attr,
2583         &dev_attr_client_id.attr,
2584         &dev_attr_pool.attr,
2585         &dev_attr_pool_id.attr,
2586         &dev_attr_name.attr,
2587         &dev_attr_image_id.attr,
2588         &dev_attr_current_snap.attr,
2589         &dev_attr_parent.attr,
2590         &dev_attr_refresh.attr,
2591         NULL
2592 };
2593
2594 static struct attribute_group rbd_attr_group = {
2595         .attrs = rbd_attrs,
2596 };
2597
2598 static const struct attribute_group *rbd_attr_groups[] = {
2599         &rbd_attr_group,
2600         NULL
2601 };
2602
2603 static void rbd_sysfs_dev_release(struct device *dev)
2604 {
2605 }
2606
2607 static struct device_type rbd_device_type = {
2608         .name           = "rbd",
2609         .groups         = rbd_attr_groups,
2610         .release        = rbd_sysfs_dev_release,
2611 };
2612
2613
2614 /*
2615   sysfs - snapshots
2616 */
2617
2618 static ssize_t rbd_snap_size_show(struct device *dev,
2619                                   struct device_attribute *attr,
2620                                   char *buf)
2621 {
2622         struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2623
2624         return sprintf(buf, "%llu\n", (unsigned long long)snap->size);
2625 }
2626
2627 static ssize_t rbd_snap_id_show(struct device *dev,
2628                                 struct device_attribute *attr,
2629                                 char *buf)
2630 {
2631         struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2632
2633         return sprintf(buf, "%llu\n", (unsigned long long)snap->id);
2634 }
2635
2636 static ssize_t rbd_snap_features_show(struct device *dev,
2637                                 struct device_attribute *attr,
2638                                 char *buf)
2639 {
2640         struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2641
2642         return sprintf(buf, "0x%016llx\n",
2643                         (unsigned long long) snap->features);
2644 }
2645
2646 static DEVICE_ATTR(snap_size, S_IRUGO, rbd_snap_size_show, NULL);
2647 static DEVICE_ATTR(snap_id, S_IRUGO, rbd_snap_id_show, NULL);
2648 static DEVICE_ATTR(snap_features, S_IRUGO, rbd_snap_features_show, NULL);
2649
2650 static struct attribute *rbd_snap_attrs[] = {
2651         &dev_attr_snap_size.attr,
2652         &dev_attr_snap_id.attr,
2653         &dev_attr_snap_features.attr,
2654         NULL,
2655 };
2656
2657 static struct attribute_group rbd_snap_attr_group = {
2658         .attrs = rbd_snap_attrs,
2659 };
2660
2661 static void rbd_snap_dev_release(struct device *dev)
2662 {
2663         struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2664         kfree(snap->name);
2665         kfree(snap);
2666 }
2667
2668 static const struct attribute_group *rbd_snap_attr_groups[] = {
2669         &rbd_snap_attr_group,
2670         NULL
2671 };
2672
2673 static struct device_type rbd_snap_device_type = {
2674         .groups         = rbd_snap_attr_groups,
2675         .release        = rbd_snap_dev_release,
2676 };
2677
2678 static struct rbd_spec *rbd_spec_get(struct rbd_spec *spec)
2679 {
2680         kref_get(&spec->kref);
2681
2682         return spec;
2683 }
2684
2685 static void rbd_spec_free(struct kref *kref);
2686 static void rbd_spec_put(struct rbd_spec *spec)
2687 {
2688         if (spec)
2689                 kref_put(&spec->kref, rbd_spec_free);
2690 }
2691
2692 static struct rbd_spec *rbd_spec_alloc(void)
2693 {
2694         struct rbd_spec *spec;
2695
2696         spec = kzalloc(sizeof (*spec), GFP_KERNEL);
2697         if (!spec)
2698                 return NULL;
2699         kref_init(&spec->kref);
2700
2701         rbd_spec_put(rbd_spec_get(spec));       /* TEMPORARY */
2702
2703         return spec;
2704 }
2705
2706 static void rbd_spec_free(struct kref *kref)
2707 {
2708         struct rbd_spec *spec = container_of(kref, struct rbd_spec, kref);
2709
2710         kfree(spec->pool_name);
2711         kfree(spec->image_id);
2712         kfree(spec->image_name);
2713         kfree(spec->snap_name);
2714         kfree(spec);
2715 }
2716
2717 static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
2718                                 struct rbd_spec *spec)
2719 {
2720         struct rbd_device *rbd_dev;
2721
2722         rbd_dev = kzalloc(sizeof (*rbd_dev), GFP_KERNEL);
2723         if (!rbd_dev)
2724                 return NULL;
2725
2726         spin_lock_init(&rbd_dev->lock);
2727         rbd_dev->flags = 0;
2728         INIT_LIST_HEAD(&rbd_dev->node);
2729         INIT_LIST_HEAD(&rbd_dev->snaps);
2730         init_rwsem(&rbd_dev->header_rwsem);
2731
2732         rbd_dev->spec = spec;
2733         rbd_dev->rbd_client = rbdc;
2734
2735         /* Initialize the layout used for all rbd requests */
2736
2737         rbd_dev->layout.fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
2738         rbd_dev->layout.fl_stripe_count = cpu_to_le32(1);
2739         rbd_dev->layout.fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
2740         rbd_dev->layout.fl_pg_pool = cpu_to_le32((u32) spec->pool_id);
2741
2742         return rbd_dev;
2743 }
2744
2745 static void rbd_dev_destroy(struct rbd_device *rbd_dev)
2746 {
2747         rbd_spec_put(rbd_dev->parent_spec);
2748         kfree(rbd_dev->header_name);
2749         rbd_put_client(rbd_dev->rbd_client);
2750         rbd_spec_put(rbd_dev->spec);
2751         kfree(rbd_dev);
2752 }
2753
2754 static bool rbd_snap_registered(struct rbd_snap *snap)
2755 {
2756         bool ret = snap->dev.type == &rbd_snap_device_type;
2757         bool reg = device_is_registered(&snap->dev);
2758
2759         rbd_assert(!ret ^ reg);
2760
2761         return ret;
2762 }
2763
2764 static void rbd_remove_snap_dev(struct rbd_snap *snap)
2765 {
2766         list_del(&snap->node);
2767         if (device_is_registered(&snap->dev))
2768                 device_unregister(&snap->dev);
2769 }
2770
2771 static int rbd_register_snap_dev(struct rbd_snap *snap,
2772                                   struct device *parent)
2773 {
2774         struct device *dev = &snap->dev;
2775         int ret;
2776
2777         dev->type = &rbd_snap_device_type;
2778         dev->parent = parent;
2779         dev->release = rbd_snap_dev_release;
2780         dev_set_name(dev, "%s%s", RBD_SNAP_DEV_NAME_PREFIX, snap->name);
2781         dout("%s: registering device for snapshot %s\n", __func__, snap->name);
2782
2783         ret = device_register(dev);
2784
2785         return ret;
2786 }
2787
2788 static struct rbd_snap *__rbd_add_snap_dev(struct rbd_device *rbd_dev,
2789                                                 const char *snap_name,
2790                                                 u64 snap_id, u64 snap_size,
2791                                                 u64 snap_features)
2792 {
2793         struct rbd_snap *snap;
2794         int ret;
2795
2796         snap = kzalloc(sizeof (*snap), GFP_KERNEL);
2797         if (!snap)
2798                 return ERR_PTR(-ENOMEM);
2799
2800         ret = -ENOMEM;
2801         snap->name = kstrdup(snap_name, GFP_KERNEL);
2802         if (!snap->name)
2803                 goto err;
2804
2805         snap->id = snap_id;
2806         snap->size = snap_size;
2807         snap->features = snap_features;
2808
2809         return snap;
2810
2811 err:
2812         kfree(snap->name);
2813         kfree(snap);
2814
2815         return ERR_PTR(ret);
2816 }
2817
2818 static char *rbd_dev_v1_snap_info(struct rbd_device *rbd_dev, u32 which,
2819                 u64 *snap_size, u64 *snap_features)
2820 {
2821         char *snap_name;
2822
2823         rbd_assert(which < rbd_dev->header.snapc->num_snaps);
2824
2825         *snap_size = rbd_dev->header.snap_sizes[which];
2826         *snap_features = 0;     /* No features for v1 */
2827
2828         /* Skip over names until we find the one we are looking for */
2829
2830         snap_name = rbd_dev->header.snap_names;
2831         while (which--)
2832                 snap_name += strlen(snap_name) + 1;
2833
2834         return snap_name;
2835 }
2836
2837 /*
2838  * Get the size and object order for an image snapshot, or if
2839  * snap_id is CEPH_NOSNAP, gets this information for the base
2840  * image.
2841  */
2842 static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
2843                                 u8 *order, u64 *snap_size)
2844 {
2845         __le64 snapid = cpu_to_le64(snap_id);
2846         int ret;
2847         struct {
2848                 u8 order;
2849                 __le64 size;
2850         } __attribute__ ((packed)) size_buf = { 0 };
2851
2852         ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
2853                                 "rbd", "get_size",
2854                                 (char *) &snapid, sizeof (snapid),
2855                                 (char *) &size_buf, sizeof (size_buf), NULL);
2856         dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
2857         if (ret < 0)
2858                 return ret;
2859
2860         *order = size_buf.order;
2861         *snap_size = le64_to_cpu(size_buf.size);
2862
2863         dout("  snap_id 0x%016llx order = %u, snap_size = %llu\n",
2864                 (unsigned long long) snap_id, (unsigned int) *order,
2865                 (unsigned long long) *snap_size);
2866
2867         return 0;
2868 }
2869
2870 static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev)
2871 {
2872         return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP,
2873                                         &rbd_dev->header.obj_order,
2874                                         &rbd_dev->header.image_size);
2875 }
2876
2877 static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
2878 {
2879         void *reply_buf;
2880         int ret;
2881         void *p;
2882
2883         reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL);
2884         if (!reply_buf)
2885                 return -ENOMEM;
2886
2887         ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
2888                                 "rbd", "get_object_prefix",
2889                                 NULL, 0,
2890                                 reply_buf, RBD_OBJ_PREFIX_LEN_MAX, NULL);
2891         dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
2892         if (ret < 0)
2893                 goto out;
2894
2895         p = reply_buf;
2896         rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p,
2897                                                 p + RBD_OBJ_PREFIX_LEN_MAX,
2898                                                 NULL, GFP_NOIO);
2899
2900         if (IS_ERR(rbd_dev->header.object_prefix)) {
2901                 ret = PTR_ERR(rbd_dev->header.object_prefix);
2902                 rbd_dev->header.object_prefix = NULL;
2903         } else {
2904                 dout("  object_prefix = %s\n", rbd_dev->header.object_prefix);
2905         }
2906
2907 out:
2908         kfree(reply_buf);
2909
2910         return ret;
2911 }
2912
2913 static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
2914                 u64 *snap_features)
2915 {
2916         __le64 snapid = cpu_to_le64(snap_id);
2917         struct {
2918                 __le64 features;
2919                 __le64 incompat;
2920         } features_buf = { 0 };
2921         u64 incompat;
2922         int ret;
2923
2924         ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
2925                                 "rbd", "get_features",
2926                                 (char *) &snapid, sizeof (snapid),
2927                                 (char *) &features_buf, sizeof (features_buf),
2928                                 NULL);
2929         dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
2930         if (ret < 0)
2931                 return ret;
2932
2933         incompat = le64_to_cpu(features_buf.incompat);
2934         if (incompat & ~RBD_FEATURES_ALL)
2935                 return -ENXIO;
2936
2937         *snap_features = le64_to_cpu(features_buf.features);
2938
2939         dout("  snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n",
2940                 (unsigned long long) snap_id,
2941                 (unsigned long long) *snap_features,
2942                 (unsigned long long) le64_to_cpu(features_buf.incompat));
2943
2944         return 0;
2945 }
2946
2947 static int rbd_dev_v2_features(struct rbd_device *rbd_dev)
2948 {
2949         return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP,
2950                                                 &rbd_dev->header.features);
2951 }
2952
2953 static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
2954 {
2955         struct rbd_spec *parent_spec;
2956         size_t size;
2957         void *reply_buf = NULL;
2958         __le64 snapid;
2959         void *p;
2960         void *end;
2961         char *image_id;
2962         u64 overlap;
2963         int ret;
2964
2965         parent_spec = rbd_spec_alloc();
2966         if (!parent_spec)
2967                 return -ENOMEM;
2968
2969         size = sizeof (__le64) +                                /* pool_id */
2970                 sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX +        /* image_id */
2971                 sizeof (__le64) +                               /* snap_id */
2972                 sizeof (__le64);                                /* overlap */
2973         reply_buf = kmalloc(size, GFP_KERNEL);
2974         if (!reply_buf) {
2975                 ret = -ENOMEM;
2976                 goto out_err;
2977         }
2978
2979         snapid = cpu_to_le64(CEPH_NOSNAP);
2980         ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
2981                                 "rbd", "get_parent",
2982                                 (char *) &snapid, sizeof (snapid),
2983                                 (char *) reply_buf, size, NULL);
2984         dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
2985         if (ret < 0)
2986                 goto out_err;
2987
2988         ret = -ERANGE;
2989         p = reply_buf;
2990         end = (char *) reply_buf + size;
2991         ceph_decode_64_safe(&p, end, parent_spec->pool_id, out_err);
2992         if (parent_spec->pool_id == CEPH_NOPOOL)
2993                 goto out;       /* No parent?  No problem. */
2994
2995         /* The ceph file layout needs to fit pool id in 32 bits */
2996
2997         ret = -EIO;
2998         if (WARN_ON(parent_spec->pool_id > (u64) U32_MAX))
2999                 goto out;
3000
3001         image_id = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
3002         if (IS_ERR(image_id)) {
3003                 ret = PTR_ERR(image_id);
3004                 goto out_err;
3005         }
3006         parent_spec->image_id = image_id;
3007         ceph_decode_64_safe(&p, end, parent_spec->snap_id, out_err);
3008         ceph_decode_64_safe(&p, end, overlap, out_err);
3009
3010         rbd_dev->parent_overlap = overlap;
3011         rbd_dev->parent_spec = parent_spec;
3012         parent_spec = NULL;     /* rbd_dev now owns this */
3013 out:
3014         ret = 0;
3015 out_err:
3016         kfree(reply_buf);
3017         rbd_spec_put(parent_spec);
3018
3019         return ret;
3020 }
3021
3022 static char *rbd_dev_image_name(struct rbd_device *rbd_dev)
3023 {
3024         size_t image_id_size;
3025         char *image_id;
3026         void *p;
3027         void *end;
3028         size_t size;
3029         void *reply_buf = NULL;
3030         size_t len = 0;
3031         char *image_name = NULL;
3032         int ret;
3033
3034         rbd_assert(!rbd_dev->spec->image_name);
3035
3036         len = strlen(rbd_dev->spec->image_id);
3037         image_id_size = sizeof (__le32) + len;
3038         image_id = kmalloc(image_id_size, GFP_KERNEL);
3039         if (!image_id)
3040                 return NULL;
3041
3042         p = image_id;
3043         end = (char *) image_id + image_id_size;
3044         ceph_encode_string(&p, end, rbd_dev->spec->image_id, (u32) len);
3045
3046         size = sizeof (__le32) + RBD_IMAGE_NAME_LEN_MAX;
3047         reply_buf = kmalloc(size, GFP_KERNEL);
3048         if (!reply_buf)
3049                 goto out;
3050
3051         ret = rbd_obj_method_sync(rbd_dev, RBD_DIRECTORY,
3052                                 "rbd", "dir_get_name",
3053                                 image_id, image_id_size,
3054                                 (char *) reply_buf, size, NULL);
3055         if (ret < 0)
3056                 goto out;
3057         p = reply_buf;
3058         end = (char *) reply_buf + size;
3059         image_name = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL);
3060         if (IS_ERR(image_name))
3061                 image_name = NULL;
3062         else
3063                 dout("%s: name is %s len is %zd\n", __func__, image_name, len);
3064 out:
3065         kfree(reply_buf);
3066         kfree(image_id);
3067
3068         return image_name;
3069 }
3070
3071 /*
3072  * When a parent image gets probed, we only have the pool, image,
3073  * and snapshot ids but not the names of any of them.  This call
3074  * is made later to fill in those names.  It has to be done after
3075  * rbd_dev_snaps_update() has completed because some of the
3076  * information (in particular, snapshot name) is not available
3077  * until then.
3078  */
3079 static int rbd_dev_probe_update_spec(struct rbd_device *rbd_dev)
3080 {
3081         struct ceph_osd_client *osdc;
3082         const char *name;
3083         void *reply_buf = NULL;
3084         int ret;
3085
3086         if (rbd_dev->spec->pool_name)
3087                 return 0;       /* Already have the names */
3088
3089         /* Look up the pool name */
3090
3091         osdc = &rbd_dev->rbd_client->client->osdc;
3092         name = ceph_pg_pool_name_by_id(osdc->osdmap, rbd_dev->spec->pool_id);
3093         if (!name) {
3094                 rbd_warn(rbd_dev, "there is no pool with id %llu",
3095                         rbd_dev->spec->pool_id);        /* Really a BUG() */
3096                 return -EIO;
3097         }
3098
3099         rbd_dev->spec->pool_name = kstrdup(name, GFP_KERNEL);
3100         if (!rbd_dev->spec->pool_name)
3101                 return -ENOMEM;
3102
3103         /* Fetch the image name; tolerate failure here */
3104
3105         name = rbd_dev_image_name(rbd_dev);
3106         if (name)
3107                 rbd_dev->spec->image_name = (char *) name;
3108         else
3109                 rbd_warn(rbd_dev, "unable to get image name");
3110
3111         /* Look up the snapshot name. */
3112
3113         name = rbd_snap_name(rbd_dev, rbd_dev->spec->snap_id);
3114         if (!name) {
3115                 rbd_warn(rbd_dev, "no snapshot with id %llu",
3116                         rbd_dev->spec->snap_id);        /* Really a BUG() */
3117                 ret = -EIO;
3118                 goto out_err;
3119         }
3120         rbd_dev->spec->snap_name = kstrdup(name, GFP_KERNEL);
3121         if(!rbd_dev->spec->snap_name)
3122                 goto out_err;
3123
3124         return 0;
3125 out_err:
3126         kfree(reply_buf);
3127         kfree(rbd_dev->spec->pool_name);
3128         rbd_dev->spec->pool_name = NULL;
3129
3130         return ret;
3131 }
3132
3133 static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev, u64 *ver)
3134 {
3135         size_t size;
3136         int ret;
3137         void *reply_buf;
3138         void *p;
3139         void *end;
3140         u64 seq;
3141         u32 snap_count;
3142         struct ceph_snap_context *snapc;
3143         u32 i;
3144
3145         /*
3146          * We'll need room for the seq value (maximum snapshot id),
3147          * snapshot count, and array of that many snapshot ids.
3148          * For now we have a fixed upper limit on the number we're
3149          * prepared to receive.
3150          */
3151         size = sizeof (__le64) + sizeof (__le32) +
3152                         RBD_MAX_SNAP_COUNT * sizeof (__le64);
3153         reply_buf = kzalloc(size, GFP_KERNEL);
3154         if (!reply_buf)
3155                 return -ENOMEM;
3156
3157         ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
3158                                 "rbd", "get_snapcontext",
3159                                 NULL, 0,
3160                                 reply_buf, size, ver);
3161         dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
3162         if (ret < 0)
3163                 goto out;
3164
3165         ret = -ERANGE;
3166         p = reply_buf;
3167         end = (char *) reply_buf + size;
3168         ceph_decode_64_safe(&p, end, seq, out);
3169         ceph_decode_32_safe(&p, end, snap_count, out);
3170
3171         /*
3172          * Make sure the reported number of snapshot ids wouldn't go
3173          * beyond the end of our buffer.  But before checking that,
3174          * make sure the computed size of the snapshot context we
3175          * allocate is representable in a size_t.
3176          */
3177         if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context))
3178                                  / sizeof (u64)) {
3179                 ret = -EINVAL;
3180                 goto out;
3181         }
3182         if (!ceph_has_room(&p, end, snap_count * sizeof (__le64)))
3183                 goto out;
3184
3185         size = sizeof (struct ceph_snap_context) +
3186                                 snap_count * sizeof (snapc->snaps[0]);
3187         snapc = kmalloc(size, GFP_KERNEL);
3188         if (!snapc) {
3189                 ret = -ENOMEM;
3190                 goto out;
3191         }
3192
3193         atomic_set(&snapc->nref, 1);
3194         snapc->seq = seq;
3195         snapc->num_snaps = snap_count;
3196         for (i = 0; i < snap_count; i++)
3197                 snapc->snaps[i] = ceph_decode_64(&p);
3198
3199         rbd_dev->header.snapc = snapc;
3200
3201         dout("  snap context seq = %llu, snap_count = %u\n",
3202                 (unsigned long long) seq, (unsigned int) snap_count);
3203
3204 out:
3205         kfree(reply_buf);
3206
3207         return 0;
3208 }
3209
3210 static char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, u32 which)
3211 {
3212         size_t size;
3213         void *reply_buf;
3214         __le64 snap_id;
3215         int ret;
3216         void *p;
3217         void *end;
3218         char *snap_name;
3219
3220         size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN;
3221         reply_buf = kmalloc(size, GFP_KERNEL);
3222         if (!reply_buf)
3223                 return ERR_PTR(-ENOMEM);
3224
3225         snap_id = cpu_to_le64(rbd_dev->header.snapc->snaps[which]);
3226         ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
3227                                 "rbd", "get_snapshot_name",
3228                                 (char *) &snap_id, sizeof (snap_id),
3229                                 reply_buf, size, NULL);
3230         dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
3231         if (ret < 0)
3232                 goto out;
3233
3234         p = reply_buf;
3235         end = (char *) reply_buf + size;
3236         snap_name = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
3237         if (IS_ERR(snap_name)) {
3238                 ret = PTR_ERR(snap_name);
3239                 goto out;
3240         } else {
3241                 dout("  snap_id 0x%016llx snap_name = %s\n",
3242                         (unsigned long long) le64_to_cpu(snap_id), snap_name);
3243         }
3244         kfree(reply_buf);
3245
3246         return snap_name;
3247 out:
3248         kfree(reply_buf);
3249
3250         return ERR_PTR(ret);
3251 }
3252
3253 static char *rbd_dev_v2_snap_info(struct rbd_device *rbd_dev, u32 which,
3254                 u64 *snap_size, u64 *snap_features)
3255 {
3256         u64 snap_id;
3257         u8 order;
3258         int ret;
3259
3260         snap_id = rbd_dev->header.snapc->snaps[which];
3261         ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, &order, snap_size);
3262         if (ret)
3263                 return ERR_PTR(ret);
3264         ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, snap_features);
3265         if (ret)
3266                 return ERR_PTR(ret);
3267
3268         return rbd_dev_v2_snap_name(rbd_dev, which);
3269 }
3270
3271 static char *rbd_dev_snap_info(struct rbd_device *rbd_dev, u32 which,
3272                 u64 *snap_size, u64 *snap_features)
3273 {
3274         if (rbd_dev->image_format == 1)
3275                 return rbd_dev_v1_snap_info(rbd_dev, which,
3276                                         snap_size, snap_features);
3277         if (rbd_dev->image_format == 2)
3278                 return rbd_dev_v2_snap_info(rbd_dev, which,
3279                                         snap_size, snap_features);
3280         return ERR_PTR(-EINVAL);
3281 }
3282
3283 static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver)
3284 {
3285         int ret;
3286         __u8 obj_order;
3287
3288         down_write(&rbd_dev->header_rwsem);
3289
3290         /* Grab old order first, to see if it changes */
3291
3292         obj_order = rbd_dev->header.obj_order,
3293         ret = rbd_dev_v2_image_size(rbd_dev);
3294         if (ret)
3295                 goto out;
3296         if (rbd_dev->header.obj_order != obj_order) {
3297                 ret = -EIO;
3298                 goto out;
3299         }
3300         rbd_update_mapping_size(rbd_dev);
3301
3302         ret = rbd_dev_v2_snap_context(rbd_dev, hver);
3303         dout("rbd_dev_v2_snap_context returned %d\n", ret);
3304         if (ret)
3305                 goto out;
3306         ret = rbd_dev_snaps_update(rbd_dev);
3307         dout("rbd_dev_snaps_update returned %d\n", ret);
3308         if (ret)
3309                 goto out;
3310         ret = rbd_dev_snaps_register(rbd_dev);
3311         dout("rbd_dev_snaps_register returned %d\n", ret);
3312 out:
3313         up_write(&rbd_dev->header_rwsem);
3314
3315         return ret;
3316 }
3317
3318 /*
3319  * Scan the rbd device's current snapshot list and compare it to the
3320  * newly-received snapshot context.  Remove any existing snapshots
3321  * not present in the new snapshot context.  Add a new snapshot for
3322  * any snaphots in the snapshot context not in the current list.
3323  * And verify there are no changes to snapshots we already know
3324  * about.
3325  *
3326  * Assumes the snapshots in the snapshot context are sorted by
3327  * snapshot id, highest id first.  (Snapshots in the rbd_dev's list
3328  * are also maintained in that order.)
3329  */
3330 static int rbd_dev_snaps_update(struct rbd_device *rbd_dev)
3331 {
3332         struct ceph_snap_context *snapc = rbd_dev->header.snapc;
3333         const u32 snap_count = snapc->num_snaps;
3334         struct list_head *head = &rbd_dev->snaps;
3335         struct list_head *links = head->next;
3336         u32 index = 0;
3337
3338         dout("%s: snap count is %u\n", __func__, (unsigned int) snap_count);
3339         while (index < snap_count || links != head) {
3340                 u64 snap_id;
3341                 struct rbd_snap *snap;
3342                 char *snap_name;
3343                 u64 snap_size = 0;
3344                 u64 snap_features = 0;
3345
3346                 snap_id = index < snap_count ? snapc->snaps[index]
3347                                              : CEPH_NOSNAP;
3348                 snap = links != head ? list_entry(links, struct rbd_snap, node)
3349                                      : NULL;
3350                 rbd_assert(!snap || snap->id != CEPH_NOSNAP);
3351
3352                 if (snap_id == CEPH_NOSNAP || (snap && snap->id > snap_id)) {
3353                         struct list_head *next = links->next;
3354
3355                         /*
3356                          * A previously-existing snapshot is not in
3357                          * the new snap context.
3358                          *
3359                          * If the now missing snapshot is the one the
3360                          * image is mapped to, clear its exists flag
3361                          * so we can avoid sending any more requests
3362                          * to it.
3363                          */
3364                         if (rbd_dev->spec->snap_id == snap->id)
3365                                 clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
3366                         rbd_remove_snap_dev(snap);
3367                         dout("%ssnap id %llu has been removed\n",
3368                                 rbd_dev->spec->snap_id == snap->id ?
3369                                                         "mapped " : "",
3370                                 (unsigned long long) snap->id);
3371
3372                         /* Done with this list entry; advance */
3373
3374                         links = next;
3375                         continue;
3376                 }
3377
3378                 snap_name = rbd_dev_snap_info(rbd_dev, index,
3379                                         &snap_size, &snap_features);
3380                 if (IS_ERR(snap_name))
3381                         return PTR_ERR(snap_name);
3382
3383                 dout("entry %u: snap_id = %llu\n", (unsigned int) snap_count,
3384                         (unsigned long long) snap_id);
3385                 if (!snap || (snap_id != CEPH_NOSNAP && snap->id < snap_id)) {
3386                         struct rbd_snap *new_snap;
3387
3388                         /* We haven't seen this snapshot before */
3389
3390                         new_snap = __rbd_add_snap_dev(rbd_dev, snap_name,
3391                                         snap_id, snap_size, snap_features);
3392                         if (IS_ERR(new_snap)) {
3393                                 int err = PTR_ERR(new_snap);
3394
3395                                 dout("  failed to add dev, error %d\n", err);
3396
3397                                 return err;
3398                         }
3399
3400                         /* New goes before existing, or at end of list */
3401
3402                         dout("  added dev%s\n", snap ? "" : " at end\n");
3403                         if (snap)
3404                                 list_add_tail(&new_snap->node, &snap->node);
3405                         else
3406                                 list_add_tail(&new_snap->node, head);
3407                 } else {
3408                         /* Already have this one */
3409
3410                         dout("  already present\n");
3411
3412                         rbd_assert(snap->size == snap_size);
3413                         rbd_assert(!strcmp(snap->name, snap_name));
3414                         rbd_assert(snap->features == snap_features);
3415
3416                         /* Done with this list entry; advance */
3417
3418                         links = links->next;
3419                 }
3420
3421                 /* Advance to the next entry in the snapshot context */
3422
3423                 index++;
3424         }
3425         dout("%s: done\n", __func__);
3426
3427         return 0;
3428 }
3429
3430 /*
3431  * Scan the list of snapshots and register the devices for any that
3432  * have not already been registered.
3433  */
3434 static int rbd_dev_snaps_register(struct rbd_device *rbd_dev)
3435 {
3436         struct rbd_snap *snap;
3437         int ret = 0;
3438
3439         dout("%s:\n", __func__);
3440         if (WARN_ON(!device_is_registered(&rbd_dev->dev)))
3441                 return -EIO;
3442
3443         list_for_each_entry(snap, &rbd_dev->snaps, node) {
3444                 if (!rbd_snap_registered(snap)) {
3445                         ret = rbd_register_snap_dev(snap, &rbd_dev->dev);
3446                         if (ret < 0)
3447                                 break;
3448                 }
3449         }
3450         dout("%s: returning %d\n", __func__, ret);
3451
3452         return ret;
3453 }
3454
3455 static int rbd_bus_add_dev(struct rbd_device *rbd_dev)
3456 {
3457         struct device *dev;
3458         int ret;
3459
3460         mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
3461
3462         dev = &rbd_dev->dev;
3463         dev->bus = &rbd_bus_type;
3464         dev->type = &rbd_device_type;
3465         dev->parent = &rbd_root_dev;
3466         dev->release = rbd_dev_release;
3467         dev_set_name(dev, "%d", rbd_dev->dev_id);
3468         ret = device_register(dev);
3469
3470         mutex_unlock(&ctl_mutex);
3471
3472         return ret;
3473 }
3474
3475 static void rbd_bus_del_dev(struct rbd_device *rbd_dev)
3476 {
3477         device_unregister(&rbd_dev->dev);
3478 }
3479
3480 static atomic64_t rbd_dev_id_max = ATOMIC64_INIT(0);
3481
3482 /*
3483  * Get a unique rbd identifier for the given new rbd_dev, and add
3484  * the rbd_dev to the global list.  The minimum rbd id is 1.
3485  */
3486 static void rbd_dev_id_get(struct rbd_device *rbd_dev)
3487 {
3488         rbd_dev->dev_id = atomic64_inc_return(&rbd_dev_id_max);
3489
3490         spin_lock(&rbd_dev_list_lock);
3491         list_add_tail(&rbd_dev->node, &rbd_dev_list);
3492         spin_unlock(&rbd_dev_list_lock);
3493         dout("rbd_dev %p given dev id %llu\n", rbd_dev,
3494                 (unsigned long long) rbd_dev->dev_id);
3495 }
3496
3497 /*
3498  * Remove an rbd_dev from the global list, and record that its
3499  * identifier is no longer in use.
3500  */
3501 static void rbd_dev_id_put(struct rbd_device *rbd_dev)
3502 {
3503         struct list_head *tmp;
3504         int rbd_id = rbd_dev->dev_id;
3505         int max_id;
3506
3507         rbd_assert(rbd_id > 0);
3508
3509         dout("rbd_dev %p released dev id %llu\n", rbd_dev,
3510                 (unsigned long long) rbd_dev->dev_id);
3511         spin_lock(&rbd_dev_list_lock);
3512         list_del_init(&rbd_dev->node);
3513
3514         /*
3515          * If the id being "put" is not the current maximum, there
3516          * is nothing special we need to do.
3517          */
3518         if (rbd_id != atomic64_read(&rbd_dev_id_max)) {
3519                 spin_unlock(&rbd_dev_list_lock);
3520                 return;
3521         }
3522
3523         /*
3524          * We need to update the current maximum id.  Search the
3525          * list to find out what it is.  We're more likely to find
3526          * the maximum at the end, so search the list backward.
3527          */
3528         max_id = 0;
3529         list_for_each_prev(tmp, &rbd_dev_list) {
3530                 struct rbd_device *rbd_dev;
3531
3532                 rbd_dev = list_entry(tmp, struct rbd_device, node);
3533                 if (rbd_dev->dev_id > max_id)
3534                         max_id = rbd_dev->dev_id;
3535         }
3536         spin_unlock(&rbd_dev_list_lock);
3537
3538         /*
3539          * The max id could have been updated by rbd_dev_id_get(), in
3540          * which case it now accurately reflects the new maximum.
3541          * Be careful not to overwrite the maximum value in that
3542          * case.
3543          */
3544         atomic64_cmpxchg(&rbd_dev_id_max, rbd_id, max_id);
3545         dout("  max dev id has been reset\n");
3546 }
3547
3548 /*
3549  * Skips over white space at *buf, and updates *buf to point to the
3550  * first found non-space character (if any). Returns the length of
3551  * the token (string of non-white space characters) found.  Note
3552  * that *buf must be terminated with '\0'.
3553  */
3554 static inline size_t next_token(const char **buf)
3555 {
3556         /*
3557         * These are the characters that produce nonzero for
3558         * isspace() in the "C" and "POSIX" locales.
3559         */
3560         const char *spaces = " \f\n\r\t\v";
3561
3562         *buf += strspn(*buf, spaces);   /* Find start of token */
3563
3564         return strcspn(*buf, spaces);   /* Return token length */
3565 }
3566
3567 /*
3568  * Finds the next token in *buf, and if the provided token buffer is
3569  * big enough, copies the found token into it.  The result, if
3570  * copied, is guaranteed to be terminated with '\0'.  Note that *buf
3571  * must be terminated with '\0' on entry.
3572  *
3573  * Returns the length of the token found (not including the '\0').
3574  * Return value will be 0 if no token is found, and it will be >=
3575  * token_size if the token would not fit.
3576  *
3577  * The *buf pointer will be updated to point beyond the end of the
3578  * found token.  Note that this occurs even if the token buffer is
3579  * too small to hold it.
3580  */
3581 static inline size_t copy_token(const char **buf,
3582                                 char *token,
3583                                 size_t token_size)
3584 {
3585         size_t len;
3586
3587         len = next_token(buf);
3588         if (len < token_size) {
3589                 memcpy(token, *buf, len);
3590                 *(token + len) = '\0';
3591         }
3592         *buf += len;
3593
3594         return len;
3595 }
3596
3597 /*
3598  * Finds the next token in *buf, dynamically allocates a buffer big
3599  * enough to hold a copy of it, and copies the token into the new
3600  * buffer.  The copy is guaranteed to be terminated with '\0'.  Note
3601  * that a duplicate buffer is created even for a zero-length token.
3602  *
3603  * Returns a pointer to the newly-allocated duplicate, or a null
3604  * pointer if memory for the duplicate was not available.  If
3605  * the lenp argument is a non-null pointer, the length of the token
3606  * (not including the '\0') is returned in *lenp.
3607  *
3608  * If successful, the *buf pointer will be updated to point beyond
3609  * the end of the found token.
3610  *
3611  * Note: uses GFP_KERNEL for allocation.
3612  */
3613 static inline char *dup_token(const char **buf, size_t *lenp)
3614 {
3615         char *dup;
3616         size_t len;
3617
3618         len = next_token(buf);
3619         dup = kmemdup(*buf, len + 1, GFP_KERNEL);
3620         if (!dup)
3621                 return NULL;
3622         *(dup + len) = '\0';
3623         *buf += len;
3624
3625         if (lenp)
3626                 *lenp = len;
3627
3628         return dup;
3629 }
3630
3631 /*
3632  * Parse the options provided for an "rbd add" (i.e., rbd image
3633  * mapping) request.  These arrive via a write to /sys/bus/rbd/add,
3634  * and the data written is passed here via a NUL-terminated buffer.
3635  * Returns 0 if successful or an error code otherwise.
3636  *
3637  * The information extracted from these options is recorded in
3638  * the other parameters which return dynamically-allocated
3639  * structures:
3640  *  ceph_opts
3641  *      The address of a pointer that will refer to a ceph options
3642  *      structure.  Caller must release the returned pointer using
3643  *      ceph_destroy_options() when it is no longer needed.
3644  *  rbd_opts
3645  *      Address of an rbd options pointer.  Fully initialized by
3646  *      this function; caller must release with kfree().
3647  *  spec
3648  *      Address of an rbd image specification pointer.  Fully
3649  *      initialized by this function based on parsed options.
3650  *      Caller must release with rbd_spec_put().
3651  *
3652  * The options passed take this form:
3653  *  <mon_addrs> <options> <pool_name> <image_name> [<snap_id>]
3654  * where:
3655  *  <mon_addrs>
3656  *      A comma-separated list of one or more monitor addresses.
3657  *      A monitor address is an ip address, optionally followed
3658  *      by a port number (separated by a colon).
3659  *        I.e.:  ip1[:port1][,ip2[:port2]...]
3660  *  <options>
3661  *      A comma-separated list of ceph and/or rbd options.
3662  *  <pool_name>
3663  *      The name of the rados pool containing the rbd image.
3664  *  <image_name>
3665  *      The name of the image in that pool to map.
3666  *  <snap_id>
3667  *      An optional snapshot id.  If provided, the mapping will
3668  *      present data from the image at the time that snapshot was
3669  *      created.  The image head is used if no snapshot id is
3670  *      provided.  Snapshot mappings are always read-only.
3671  */
3672 static int rbd_add_parse_args(const char *buf,
3673                                 struct ceph_options **ceph_opts,
3674                                 struct rbd_options **opts,
3675                                 struct rbd_spec **rbd_spec)
3676 {
3677         size_t len;
3678         char *options;
3679         const char *mon_addrs;
3680         size_t mon_addrs_size;
3681         struct rbd_spec *spec = NULL;
3682         struct rbd_options *rbd_opts = NULL;
3683         struct ceph_options *copts;
3684         int ret;
3685
3686         /* The first four tokens are required */
3687
3688         len = next_token(&buf);
3689         if (!len) {
3690                 rbd_warn(NULL, "no monitor address(es) provided");
3691                 return -EINVAL;
3692         }
3693         mon_addrs = buf;
3694         mon_addrs_size = len + 1;
3695         buf += len;
3696
3697         ret = -EINVAL;
3698         options = dup_token(&buf, NULL);
3699         if (!options)
3700                 return -ENOMEM;
3701         if (!*options) {
3702                 rbd_warn(NULL, "no options provided");
3703                 goto out_err;
3704         }
3705
3706         spec = rbd_spec_alloc();
3707         if (!spec)
3708                 goto out_mem;
3709
3710         spec->pool_name = dup_token(&buf, NULL);
3711         if (!spec->pool_name)
3712                 goto out_mem;
3713         if (!*spec->pool_name) {
3714                 rbd_warn(NULL, "no pool name provided");
3715                 goto out_err;
3716         }
3717
3718         spec->image_name = dup_token(&buf, NULL);
3719         if (!spec->image_name)
3720                 goto out_mem;
3721         if (!*spec->image_name) {
3722                 rbd_warn(NULL, "no image name provided");
3723                 goto out_err;
3724         }
3725
3726         /*
3727          * Snapshot name is optional; default is to use "-"
3728          * (indicating the head/no snapshot).
3729          */
3730         len = next_token(&buf);
3731         if (!len) {
3732                 buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */
3733                 len = sizeof (RBD_SNAP_HEAD_NAME) - 1;
3734         } else if (len > RBD_MAX_SNAP_NAME_LEN) {
3735                 ret = -ENAMETOOLONG;
3736                 goto out_err;
3737         }
3738         spec->snap_name = kmemdup(buf, len + 1, GFP_KERNEL);
3739         if (!spec->snap_name)
3740                 goto out_mem;
3741         *(spec->snap_name + len) = '\0';
3742
3743         /* Initialize all rbd options to the defaults */
3744
3745         rbd_opts = kzalloc(sizeof (*rbd_opts), GFP_KERNEL);
3746         if (!rbd_opts)
3747                 goto out_mem;
3748
3749         rbd_opts->read_only = RBD_READ_ONLY_DEFAULT;
3750
3751         copts = ceph_parse_options(options, mon_addrs,
3752                                         mon_addrs + mon_addrs_size - 1,
3753                                         parse_rbd_opts_token, rbd_opts);
3754         if (IS_ERR(copts)) {
3755                 ret = PTR_ERR(copts);
3756                 goto out_err;
3757         }
3758         kfree(options);
3759
3760         *ceph_opts = copts;
3761         *opts = rbd_opts;
3762         *rbd_spec = spec;
3763
3764         return 0;
3765 out_mem:
3766         ret = -ENOMEM;
3767 out_err:
3768         kfree(rbd_opts);
3769         rbd_spec_put(spec);
3770         kfree(options);
3771
3772         return ret;
3773 }
3774
3775 /*
3776  * An rbd format 2 image has a unique identifier, distinct from the
3777  * name given to it by the user.  Internally, that identifier is
3778  * what's used to specify the names of objects related to the image.
3779  *
3780  * A special "rbd id" object is used to map an rbd image name to its
3781  * id.  If that object doesn't exist, then there is no v2 rbd image
3782  * with the supplied name.
3783  *
3784  * This function will record the given rbd_dev's image_id field if
3785  * it can be determined, and in that case will return 0.  If any
3786  * errors occur a negative errno will be returned and the rbd_dev's
3787  * image_id field will be unchanged (and should be NULL).
3788  */
3789 static int rbd_dev_image_id(struct rbd_device *rbd_dev)
3790 {
3791         int ret;
3792         size_t size;
3793         char *object_name;
3794         void *response;
3795         void *p;
3796
3797         /*
3798          * When probing a parent image, the image id is already
3799          * known (and the image name likely is not).  There's no
3800          * need to fetch the image id again in this case.
3801          */
3802         if (rbd_dev->spec->image_id)
3803                 return 0;
3804
3805         /*
3806          * First, see if the format 2 image id file exists, and if
3807          * so, get the image's persistent id from it.
3808          */
3809         size = sizeof (RBD_ID_PREFIX) + strlen(rbd_dev->spec->image_name);
3810         object_name = kmalloc(size, GFP_NOIO);
3811         if (!object_name)
3812                 return -ENOMEM;
3813         sprintf(object_name, "%s%s", RBD_ID_PREFIX, rbd_dev->spec->image_name);
3814         dout("rbd id object name is %s\n", object_name);
3815
3816         /* Response will be an encoded string, which includes a length */
3817
3818         size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX;
3819         response = kzalloc(size, GFP_NOIO);
3820         if (!response) {
3821                 ret = -ENOMEM;
3822                 goto out;
3823         }
3824
3825         ret = rbd_obj_method_sync(rbd_dev, object_name,
3826                                 "rbd", "get_id",
3827                                 NULL, 0,
3828                                 response, RBD_IMAGE_ID_LEN_MAX, NULL);
3829         dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
3830         if (ret < 0)
3831                 goto out;
3832
3833         p = response;
3834         rbd_dev->spec->image_id = ceph_extract_encoded_string(&p,
3835                                                 p + RBD_IMAGE_ID_LEN_MAX,
3836                                                 NULL, GFP_NOIO);
3837         if (IS_ERR(rbd_dev->spec->image_id)) {
3838                 ret = PTR_ERR(rbd_dev->spec->image_id);
3839                 rbd_dev->spec->image_id = NULL;
3840         } else {
3841                 dout("image_id is %s\n", rbd_dev->spec->image_id);
3842         }
3843 out:
3844         kfree(response);
3845         kfree(object_name);
3846
3847         return ret;
3848 }
3849
3850 static int rbd_dev_v1_probe(struct rbd_device *rbd_dev)
3851 {
3852         int ret;
3853         size_t size;
3854
3855         /* Version 1 images have no id; empty string is used */
3856
3857         rbd_dev->spec->image_id = kstrdup("", GFP_KERNEL);
3858         if (!rbd_dev->spec->image_id)
3859                 return -ENOMEM;
3860
3861         /* Record the header object name for this rbd image. */
3862
3863         size = strlen(rbd_dev->spec->image_name) + sizeof (RBD_SUFFIX);
3864         rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
3865         if (!rbd_dev->header_name) {
3866                 ret = -ENOMEM;
3867                 goto out_err;
3868         }
3869         sprintf(rbd_dev->header_name, "%s%s",
3870                 rbd_dev->spec->image_name, RBD_SUFFIX);
3871
3872         /* Populate rbd image metadata */
3873
3874         ret = rbd_read_header(rbd_dev, &rbd_dev->header);
3875         if (ret < 0)
3876                 goto out_err;
3877
3878         /* Version 1 images have no parent (no layering) */
3879
3880         rbd_dev->parent_spec = NULL;
3881         rbd_dev->parent_overlap = 0;
3882
3883         rbd_dev->image_format = 1;
3884
3885         dout("discovered version 1 image, header name is %s\n",
3886                 rbd_dev->header_name);
3887
3888         return 0;
3889
3890 out_err:
3891         kfree(rbd_dev->header_name);
3892         rbd_dev->header_name = NULL;
3893         kfree(rbd_dev->spec->image_id);
3894         rbd_dev->spec->image_id = NULL;
3895
3896         return ret;
3897 }
3898
3899 static int rbd_dev_v2_probe(struct rbd_device *rbd_dev)
3900 {
3901         size_t size;
3902         int ret;
3903         u64 ver = 0;
3904
3905         /*
3906          * Image id was filled in by the caller.  Record the header
3907          * object name for this rbd image.
3908          */
3909         size = sizeof (RBD_HEADER_PREFIX) + strlen(rbd_dev->spec->image_id);
3910         rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
3911         if (!rbd_dev->header_name)
3912                 return -ENOMEM;
3913         sprintf(rbd_dev->header_name, "%s%s",
3914                         RBD_HEADER_PREFIX, rbd_dev->spec->image_id);
3915
3916         /* Get the size and object order for the image */
3917
3918         ret = rbd_dev_v2_image_size(rbd_dev);
3919         if (ret < 0)
3920                 goto out_err;
3921
3922         /* Get the object prefix (a.k.a. block_name) for the image */
3923
3924         ret = rbd_dev_v2_object_prefix(rbd_dev);
3925         if (ret < 0)
3926                 goto out_err;
3927
3928         /* Get the and check features for the image */
3929
3930         ret = rbd_dev_v2_features(rbd_dev);
3931         if (ret < 0)
3932                 goto out_err;
3933
3934         /* If the image supports layering, get the parent info */
3935
3936         if (rbd_dev->header.features & RBD_FEATURE_LAYERING) {
3937                 ret = rbd_dev_v2_parent_info(rbd_dev);
3938                 if (ret < 0)
3939                         goto out_err;
3940         }
3941
3942         /* crypto and compression type aren't (yet) supported for v2 images */
3943
3944         rbd_dev->header.crypt_type = 0;
3945         rbd_dev->header.comp_type = 0;
3946
3947         /* Get the snapshot context, plus the header version */
3948
3949         ret = rbd_dev_v2_snap_context(rbd_dev, &ver);
3950         if (ret)
3951                 goto out_err;
3952         rbd_dev->header.obj_version = ver;
3953
3954         rbd_dev->image_format = 2;
3955
3956         dout("discovered version 2 image, header name is %s\n",
3957                 rbd_dev->header_name);
3958
3959         return 0;
3960 out_err:
3961         rbd_dev->parent_overlap = 0;
3962         rbd_spec_put(rbd_dev->parent_spec);
3963         rbd_dev->parent_spec = NULL;
3964         kfree(rbd_dev->header_name);
3965         rbd_dev->header_name = NULL;
3966         kfree(rbd_dev->header.object_prefix);
3967         rbd_dev->header.object_prefix = NULL;
3968
3969         return ret;
3970 }
3971
3972 static int rbd_dev_probe_finish(struct rbd_device *rbd_dev)
3973 {
3974         int ret;
3975
3976         /* no need to lock here, as rbd_dev is not registered yet */
3977         ret = rbd_dev_snaps_update(rbd_dev);
3978         if (ret)
3979                 return ret;
3980
3981         ret = rbd_dev_probe_update_spec(rbd_dev);
3982         if (ret)
3983                 goto err_out_snaps;
3984
3985         ret = rbd_dev_set_mapping(rbd_dev);
3986         if (ret)
3987                 goto err_out_snaps;
3988
3989         /* generate unique id: find highest unique id, add one */
3990         rbd_dev_id_get(rbd_dev);
3991
3992         /* Fill in the device name, now that we have its id. */
3993         BUILD_BUG_ON(DEV_NAME_LEN
3994                         < sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH);
3995         sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id);
3996
3997         /* Get our block major device number. */
3998
3999         ret = register_blkdev(0, rbd_dev->name);
4000         if (ret < 0)
4001                 goto err_out_id;
4002         rbd_dev->major = ret;
4003
4004         /* Set up the blkdev mapping. */
4005
4006         ret = rbd_init_disk(rbd_dev);
4007         if (ret)
4008                 goto err_out_blkdev;
4009
4010         ret = rbd_bus_add_dev(rbd_dev);
4011         if (ret)
4012                 goto err_out_disk;
4013
4014         /*
4015          * At this point cleanup in the event of an error is the job
4016          * of the sysfs code (initiated by rbd_bus_del_dev()).
4017          */
4018         down_write(&rbd_dev->header_rwsem);
4019         ret = rbd_dev_snaps_register(rbd_dev);
4020         up_write(&rbd_dev->header_rwsem);
4021         if (ret)
4022                 goto err_out_bus;
4023
4024         ret = rbd_dev_header_watch_sync(rbd_dev, 1);
4025         if (ret)
4026                 goto err_out_bus;
4027
4028         /* Everything's ready.  Announce the disk to the world. */
4029
4030         add_disk(rbd_dev->disk);
4031
4032         pr_info("%s: added with size 0x%llx\n", rbd_dev->disk->disk_name,
4033                 (unsigned long long) rbd_dev->mapping.size);
4034
4035         return ret;
4036 err_out_bus:
4037         /* this will also clean up rest of rbd_dev stuff */
4038
4039         rbd_bus_del_dev(rbd_dev);
4040
4041         return ret;
4042 err_out_disk:
4043         rbd_free_disk(rbd_dev);
4044 err_out_blkdev:
4045         unregister_blkdev(rbd_dev->major, rbd_dev->name);
4046 err_out_id:
4047         rbd_dev_id_put(rbd_dev);
4048 err_out_snaps:
4049         rbd_remove_all_snaps(rbd_dev);
4050
4051         return ret;
4052 }
4053
4054 /*
4055  * Probe for the existence of the header object for the given rbd
4056  * device.  For format 2 images this includes determining the image
4057  * id.
4058  */
4059 static int rbd_dev_probe(struct rbd_device *rbd_dev)
4060 {
4061         int ret;
4062
4063         /*
4064          * Get the id from the image id object.  If it's not a
4065          * format 2 image, we'll get ENOENT back, and we'll assume
4066          * it's a format 1 image.
4067          */
4068         ret = rbd_dev_image_id(rbd_dev);
4069         if (ret)
4070                 ret = rbd_dev_v1_probe(rbd_dev);
4071         else
4072                 ret = rbd_dev_v2_probe(rbd_dev);
4073         if (ret) {
4074                 dout("probe failed, returning %d\n", ret);
4075
4076                 return ret;
4077         }
4078
4079         ret = rbd_dev_probe_finish(rbd_dev);
4080         if (ret)
4081                 rbd_header_free(&rbd_dev->header);
4082
4083         return ret;
4084 }
4085
4086 static ssize_t rbd_add(struct bus_type *bus,
4087                        const char *buf,
4088                        size_t count)
4089 {
4090         struct rbd_device *rbd_dev = NULL;
4091         struct ceph_options *ceph_opts = NULL;
4092         struct rbd_options *rbd_opts = NULL;
4093         struct rbd_spec *spec = NULL;
4094         struct rbd_client *rbdc;
4095         struct ceph_osd_client *osdc;
4096         int rc = -ENOMEM;
4097
4098         if (!try_module_get(THIS_MODULE))
4099                 return -ENODEV;
4100
4101         /* parse add command */
4102         rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec);
4103         if (rc < 0)
4104                 goto err_out_module;
4105
4106         rbdc = rbd_get_client(ceph_opts);
4107         if (IS_ERR(rbdc)) {
4108                 rc = PTR_ERR(rbdc);
4109                 goto err_out_args;
4110         }
4111         ceph_opts = NULL;       /* rbd_dev client now owns this */
4112
4113         /* pick the pool */
4114         osdc = &rbdc->client->osdc;
4115         rc = ceph_pg_poolid_by_name(osdc->osdmap, spec->pool_name);
4116         if (rc < 0)
4117                 goto err_out_client;
4118         spec->pool_id = (u64) rc;
4119
4120         /* The ceph file layout needs to fit pool id in 32 bits */
4121
4122         if (WARN_ON(spec->pool_id > (u64) U32_MAX)) {
4123                 rc = -EIO;
4124                 goto err_out_client;
4125         }
4126
4127         rbd_dev = rbd_dev_create(rbdc, spec);
4128         if (!rbd_dev)
4129                 goto err_out_client;
4130         rbdc = NULL;            /* rbd_dev now owns this */
4131         spec = NULL;            /* rbd_dev now owns this */
4132
4133         rbd_dev->mapping.read_only = rbd_opts->read_only;
4134         kfree(rbd_opts);
4135         rbd_opts = NULL;        /* done with this */
4136
4137         rc = rbd_dev_probe(rbd_dev);
4138         if (rc < 0)
4139                 goto err_out_rbd_dev;
4140
4141         return count;
4142 err_out_rbd_dev:
4143         rbd_dev_destroy(rbd_dev);
4144 err_out_client:
4145         rbd_put_client(rbdc);
4146 err_out_args:
4147         if (ceph_opts)
4148                 ceph_destroy_options(ceph_opts);
4149         kfree(rbd_opts);
4150         rbd_spec_put(spec);
4151 err_out_module:
4152         module_put(THIS_MODULE);
4153
4154         dout("Error adding device %s\n", buf);
4155
4156         return (ssize_t) rc;
4157 }
4158
4159 static struct rbd_device *__rbd_get_dev(unsigned long dev_id)
4160 {
4161         struct list_head *tmp;
4162         struct rbd_device *rbd_dev;
4163
4164         spin_lock(&rbd_dev_list_lock);
4165         list_for_each(tmp, &rbd_dev_list) {
4166                 rbd_dev = list_entry(tmp, struct rbd_device, node);
4167                 if (rbd_dev->dev_id == dev_id) {
4168                         spin_unlock(&rbd_dev_list_lock);
4169                         return rbd_dev;
4170                 }
4171         }
4172         spin_unlock(&rbd_dev_list_lock);
4173         return NULL;
4174 }
4175
4176 static void rbd_dev_release(struct device *dev)
4177 {
4178         struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4179
4180         if (rbd_dev->watch_event)
4181                 rbd_dev_header_watch_sync(rbd_dev, 0);
4182
4183         /* clean up and free blkdev */
4184         rbd_free_disk(rbd_dev);
4185         unregister_blkdev(rbd_dev->major, rbd_dev->name);
4186
4187         /* release allocated disk header fields */
4188         rbd_header_free(&rbd_dev->header);
4189
4190         /* done with the id, and with the rbd_dev */
4191         rbd_dev_id_put(rbd_dev);
4192         rbd_assert(rbd_dev->rbd_client != NULL);
4193         rbd_dev_destroy(rbd_dev);
4194
4195         /* release module ref */
4196         module_put(THIS_MODULE);
4197 }
4198
4199 static ssize_t rbd_remove(struct bus_type *bus,
4200                           const char *buf,
4201                           size_t count)
4202 {
4203         struct rbd_device *rbd_dev = NULL;
4204         int target_id, rc;
4205         unsigned long ul;
4206         int ret = count;
4207
4208         rc = strict_strtoul(buf, 10, &ul);
4209         if (rc)
4210                 return rc;
4211
4212         /* convert to int; abort if we lost anything in the conversion */
4213         target_id = (int) ul;
4214         if (target_id != ul)
4215                 return -EINVAL;
4216
4217         mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
4218
4219         rbd_dev = __rbd_get_dev(target_id);
4220         if (!rbd_dev) {
4221                 ret = -ENOENT;
4222                 goto done;
4223         }
4224
4225         spin_lock_irq(&rbd_dev->lock);
4226         if (rbd_dev->open_count)
4227                 ret = -EBUSY;
4228         else
4229                 set_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags);
4230         spin_unlock_irq(&rbd_dev->lock);
4231         if (ret < 0)
4232                 goto done;
4233
4234         rbd_remove_all_snaps(rbd_dev);
4235         rbd_bus_del_dev(rbd_dev);
4236
4237 done:
4238         mutex_unlock(&ctl_mutex);
4239
4240         return ret;
4241 }
4242
4243 /*
4244  * create control files in sysfs
4245  * /sys/bus/rbd/...
4246  */
4247 static int rbd_sysfs_init(void)
4248 {
4249         int ret;
4250
4251         ret = device_register(&rbd_root_dev);
4252         if (ret < 0)
4253                 return ret;
4254
4255         ret = bus_register(&rbd_bus_type);
4256         if (ret < 0)
4257                 device_unregister(&rbd_root_dev);
4258
4259         return ret;
4260 }
4261
4262 static void rbd_sysfs_cleanup(void)
4263 {
4264         bus_unregister(&rbd_bus_type);
4265         device_unregister(&rbd_root_dev);
4266 }
4267
4268 static int __init rbd_init(void)
4269 {
4270         int rc;
4271
4272         if (!libceph_compatible(NULL)) {
4273                 rbd_warn(NULL, "libceph incompatibility (quitting)");
4274
4275                 return -EINVAL;
4276         }
4277         rc = rbd_sysfs_init();
4278         if (rc)
4279                 return rc;
4280         pr_info("loaded " RBD_DRV_NAME_LONG "\n");
4281         return 0;
4282 }
4283
4284 static void __exit rbd_exit(void)
4285 {
4286         rbd_sysfs_cleanup();
4287 }
4288
4289 module_init(rbd_init);
4290 module_exit(rbd_exit);
4291
4292 MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
4293 MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
4294 MODULE_DESCRIPTION("rados block device");
4295
4296 /* following authorship retained from original osdblk.c */
4297 MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");
4298
4299 MODULE_LICENSE("GPL");