drivers/md/dm-snap-persistent.c

   1 /*
   2  * Copyright (C) 2001-2002 Sistina Software (UK) Limited.
   3  * Copyright (C) 2006-2008 Red Hat GmbH
   4  *
   5  * This file is released under the GPL.
   6  */
   7
   8 #include "dm-exception-store.h"
   9
  10 #include <linux/ctype.h>
  11 #include <linux/mm.h>
  12 #include <linux/pagemap.h>
  13 #include <linux/vmalloc.h>
  14 #include <linux/export.h>
  15 #include <linux/slab.h>
  16 #include <linux/dm-io.h>
  17 #include "dm-bufio.h"
  18
  19 #define DM_MSG_PREFIX "persistent snapshot"
  20 #define DM_CHUNK_SIZE_DEFAULT_SECTORS 32        /* 16KB */
  21
  22 #define DM_PREFETCH_CHUNKS              12
  23
  24 /*-----------------------------------------------------------------
  25  * Persistent snapshots, by persistent we mean that the snapshot
  26  * will survive a reboot.
  27  *---------------------------------------------------------------*/
  28
  29 /*
  30  * We need to store a record of which parts of the origin have
  31  * been copied to the snapshot device.  The snapshot code
  32  * requires that we copy exception chunks to chunk aligned areas
  33  * of the COW store.  It makes sense therefore, to store the
  34  * metadata in chunk size blocks.
  35  *
  36  * There is no backward or forward compatibility implemented,
  37  * snapshots with different disk versions than the kernel will
  38  * not be usable.  It is expected that "lvcreate" will blank out
  39  * the start of a fresh COW device before calling the snapshot
  40  * constructor.
  41  *
  42  * The first chunk of the COW device just contains the header.
  43  * After this there is a chunk filled with exception metadata,
  44  * followed by as many exception chunks as can fit in the
  45  * metadata areas.
  46  *
  47  * All on disk structures are in little-endian format.  The end
  48  * of the exceptions info is indicated by an exception with a
  49  * new_chunk of 0, which is invalid since it would point to the
  50  * header chunk.
  51  */
  52
  53 /*
  54  * Magic for persistent snapshots: "SnAp" - Feeble isn't it.
  55  */
  56 #define SNAP_MAGIC 0x70416e53
  57
  58 /*
  59  * The on-disk version of the metadata.
  60  */
  61 #define SNAPSHOT_DISK_VERSION 1
  62
  63 #define NUM_SNAPSHOT_HDR_CHUNKS 1
  64
  65 struct disk_header {
  66         __le32 magic;
  67
  68         /*
  69          * Is this snapshot valid.  There is no way of recovering
  70          * an invalid snapshot.
  71          */
  72         __le32 valid;
  73
  74         /*
  75          * Simple, incrementing version. no backward
  76          * compatibility.
  77          */
  78         __le32 version;
  79
  80         /* In sectors */
  81         __le32 chunk_size;
  82 } __packed;
  83
  84 struct disk_exception {
  85         __le64 old_chunk;
  86         __le64 new_chunk;
  87 } __packed;
  88
  89 struct core_exception {
  90         uint64_t old_chunk;
  91         uint64_t new_chunk;
  92 };
  93
  94 struct commit_callback {
  95         void (*callback)(void *, int success);
  96         void *context;
  97 };
  98
  99 /*
 100  * The top level structure for a persistent exception store.
 101  */
 102 struct pstore {
 103         struct dm_exception_store *store;
 104         int version;
 105         int valid;
 106         uint32_t exceptions_per_area;
 107
 108         /*
 109          * Now that we have an asynchronous kcopyd there is no
 110          * need for large chunk sizes, so it wont hurt to have a
 111          * whole chunks worth of metadata in memory at once.
 112          */
 113         void *area;
 114
 115         /*
 116          * An area of zeros used to clear the next area.
 117          */
 118         void *zero_area;
 119
 120         /*
 121          * An area used for header. The header can be written
 122          * concurrently with metadata (when invalidating the snapshot),
 123          * so it needs a separate buffer.
 124          */
 125         void *header_area;
 126
 127         /*
 128          * Used to keep track of which metadata area the data in
 129          * 'chunk' refers to.
 130          */
 131         chunk_t current_area;
 132
 133         /*
 134          * The next free chunk for an exception.
 135          *
 136          * When creating exceptions, all the chunks here and above are
 137          * free.  It holds the next chunk to be allocated.  On rare
 138          * occasions (e.g. after a system crash) holes can be left in
 139          * the exception store because chunks can be committed out of
 140          * order.
 141          *
 142          * When merging exceptions, it does not necessarily mean all the
 143          * chunks here and above are free.  It holds the value it would
 144          * have held if all chunks had been committed in order of
 145          * allocation.  Consequently the value may occasionally be
 146          * slightly too low, but since it's only used for 'status' and
 147          * it can never reach its minimum value too early this doesn't
 148          * matter.
 149          */
 150
 151         chunk_t next_free;
 152
 153         /*
 154          * The index of next free exception in the current
 155          * metadata area.
 156          */
 157         uint32_t current_committed;
 158
 159         atomic_t pending_count;
 160         uint32_t callback_count;
 161         struct commit_callback *callbacks;
 162         struct dm_io_client *io_client;
 163
 164         struct workqueue_struct *metadata_wq;
 165 };
 166
 167 static int alloc_area(struct pstore *ps)
 168 {
 169         int r = -ENOMEM;
 170         size_t len;
 171
 172         len = ps->store->chunk_size << SECTOR_SHIFT;
 173
 174         /*
 175          * Allocate the chunk_size block of memory that will hold
 176          * a single metadata area.
 177          */
 178         ps->area = vmalloc(len);
 179         if (!ps->area)
 180                 goto err_area;
 181
 182         ps->zero_area = vzalloc(len);
 183         if (!ps->zero_area)
 184                 goto err_zero_area;
 185
 186         ps->header_area = vmalloc(len);
 187         if (!ps->header_area)
 188                 goto err_header_area;
 189
 190         return 0;
 191
 192 err_header_area:
 193         vfree(ps->zero_area);
 194
 195 err_zero_area:
 196         vfree(ps->area);
 197
 198 err_area:
 199         return r;
 200 }
 201
 202 static void free_area(struct pstore *ps)
 203 {
 204         vfree(ps->area);
 205         ps->area = NULL;
 206         vfree(ps->zero_area);
 207         ps->zero_area = NULL;
 208         vfree(ps->header_area);
 209         ps->header_area = NULL;
 210 }
 211
 212 struct mdata_req {
 213         struct dm_io_region *where;
 214         struct dm_io_request *io_req;
 215         struct work_struct work;
 216         int result;
 217 };
 218
 219 static void do_metadata(struct work_struct *work)
 220 {
 221         struct mdata_req *req = container_of(work, struct mdata_req, work);
 222
 223         req->result = dm_io(req->io_req, 1, req->where, NULL);
 224 }
 225
 226 /*
 227  * Read or write a chunk aligned and sized block of data from a device.
 228  */
 229 static int chunk_io(struct pstore *ps, void *area, chunk_t chunk, int rw,
 230                     int metadata)
 231 {
 232         struct dm_io_region where = {
 233                 .bdev = dm_snap_cow(ps->store->snap)->bdev,
 234                 .sector = ps->store->chunk_size * chunk,
 235                 .count = ps->store->chunk_size,
 236         };
 237         struct dm_io_request io_req = {
 238                 .bi_rw = rw,
 239                 .mem.type = DM_IO_VMA,
 240                 .mem.ptr.vma = area,
 241                 .client = ps->io_client,
 242                 .notify.fn = NULL,
 243         };
 244         struct mdata_req req;
 245
 246         if (!metadata)
 247                 return dm_io(&io_req, 1, &where, NULL);
 248
 249         req.where = &where;
 250         req.io_req = &io_req;
 251
 252         /*
 253          * Issue the synchronous I/O from a different thread
 254          * to avoid generic_make_request recursion.
 255          */
 256         INIT_WORK_ONSTACK(&req.work, do_metadata);
 257         queue_work(ps->metadata_wq, &req.work);
 258         flush_workqueue(ps->metadata_wq);
 259         destroy_work_on_stack(&req.work);
 260
 261         return req.result;
 262 }
 263
 264 /*
 265  * Convert a metadata area index to a chunk index.
 266  */
 267 static chunk_t area_location(struct pstore *ps, chunk_t area)
 268 {
 269         return NUM_SNAPSHOT_HDR_CHUNKS + ((ps->exceptions_per_area + 1) * area);
 270 }
 271
 272 static void skip_metadata(struct pstore *ps)
 273 {
 274         uint32_t stride = ps->exceptions_per_area + 1;
 275         chunk_t next_free = ps->next_free;
 276         if (sector_div(next_free, stride) == NUM_SNAPSHOT_HDR_CHUNKS)
 277                 ps->next_free++;
 278 }
 279
 280 /*
 281  * Read or write a metadata area.  Remembering to skip the first
 282  * chunk which holds the header.
 283  */
 284 static int area_io(struct pstore *ps, int rw)
 285 {
 286         int r;
 287         chunk_t chunk;
 288
 289         chunk = area_location(ps, ps->current_area);
 290
 291         r = chunk_io(ps, ps->area, chunk, rw, 0);
 292         if (r)
 293                 return r;
 294
 295         return 0;
 296 }
 297
 298 static void zero_memory_area(struct pstore *ps)
 299 {
 300         memset(ps->area, 0, ps->store->chunk_size << SECTOR_SHIFT);
 301 }
 302
 303 static int zero_disk_area(struct pstore *ps, chunk_t area)
 304 {
 305         return chunk_io(ps, ps->zero_area, area_location(ps, area), WRITE, 0);
 306 }
 307
 308 static int read_header(struct pstore *ps, int *new_snapshot)
 309 {
 310         int r;
 311         struct disk_header *dh;
 312         unsigned chunk_size;
 313         int chunk_size_supplied = 1;
 314         char *chunk_err;
 315
 316         /*
 317          * Use default chunk size (or logical_block_size, if larger)
 318          * if none supplied
 319          */
 320         if (!ps->store->chunk_size) {
 321                 ps->store->chunk_size = max(DM_CHUNK_SIZE_DEFAULT_SECTORS,
 322                     bdev_logical_block_size(dm_snap_cow(ps->store->snap)->
 323                                             bdev) >> 9);
 324                 ps->store->chunk_mask = ps->store->chunk_size - 1;
 325                 ps->store->chunk_shift = __ffs(ps->store->chunk_size);
 326                 chunk_size_supplied = 0;
 327         }
 328
 329         ps->io_client = dm_io_client_create();
 330         if (IS_ERR(ps->io_client))
 331                 return PTR_ERR(ps->io_client);
 332
 333         r = alloc_area(ps);
 334         if (r)
 335                 return r;
 336
 337         r = chunk_io(ps, ps->header_area, 0, READ, 1);
 338         if (r)
 339                 goto bad;
 340
 341         dh = ps->header_area;
 342
 343         if (le32_to_cpu(dh->magic) == 0) {
 344                 *new_snapshot = 1;
 345                 return 0;
 346         }
 347
 348         if (le32_to_cpu(dh->magic) != SNAP_MAGIC) {
 349                 DMWARN("Invalid or corrupt snapshot");
 350                 r = -ENXIO;
 351                 goto bad;
 352         }
 353
 354         *new_snapshot = 0;
 355         ps->valid = le32_to_cpu(dh->valid);
 356         ps->version = le32_to_cpu(dh->version);
 357         chunk_size = le32_to_cpu(dh->chunk_size);
 358
 359         if (ps->store->chunk_size == chunk_size)
 360                 return 0;
 361
 362         if (chunk_size_supplied)
 363                 DMWARN("chunk size %u in device metadata overrides "
 364                        "table chunk size of %u.",
 365                        chunk_size, ps->store->chunk_size);
 366
 367         /* We had a bogus chunk_size. Fix stuff up. */
 368         free_area(ps);
 369
 370         r = dm_exception_store_set_chunk_size(ps->store, chunk_size,
 371                                               &chunk_err);
 372         if (r) {
 373                 DMERR("invalid on-disk chunk size %u: %s.",
 374                       chunk_size, chunk_err);
 375                 return r;
 376         }
 377
 378         r = alloc_area(ps);
 379         return r;
 380
 381 bad:
 382         free_area(ps);
 383         return r;
 384 }
 385
 386 static int write_header(struct pstore *ps)
 387 {
 388         struct disk_header *dh;
 389
 390         memset(ps->header_area, 0, ps->store->chunk_size << SECTOR_SHIFT);
 391
 392         dh = ps->header_area;
 393         dh->magic = cpu_to_le32(SNAP_MAGIC);
 394         dh->valid = cpu_to_le32(ps->valid);
 395         dh->version = cpu_to_le32(ps->version);
 396         dh->chunk_size = cpu_to_le32(ps->store->chunk_size);
 397
 398         return chunk_io(ps, ps->header_area, 0, WRITE, 1);
 399 }
 400
 401 /*
 402  * Access functions for the disk exceptions, these do the endian conversions.
 403  */
 404 static struct disk_exception *get_exception(struct pstore *ps, void *ps_area,
 405                                             uint32_t index)
 406 {
 407         BUG_ON(index >= ps->exceptions_per_area);
 408
 409         return ((struct disk_exception *) ps_area) + index;
 410 }
 411
 412 static void read_exception(struct pstore *ps, void *ps_area,
 413                            uint32_t index, struct core_exception *result)
 414 {
 415         struct disk_exception *de = get_exception(ps, ps_area, index);
 416
 417         /* copy it */
 418         result->old_chunk = le64_to_cpu(de->old_chunk);
 419         result->new_chunk = le64_to_cpu(de->new_chunk);
 420 }
 421
 422 static void write_exception(struct pstore *ps,
 423                             uint32_t index, struct core_exception *e)
 424 {
 425         struct disk_exception *de = get_exception(ps, ps->area, index);
 426
 427         /* copy it */
 428         de->old_chunk = cpu_to_le64(e->old_chunk);
 429         de->new_chunk = cpu_to_le64(e->new_chunk);
 430 }
 431
 432 static void clear_exception(struct pstore *ps, uint32_t index)
 433 {
 434         struct disk_exception *de = get_exception(ps, ps->area, index);
 435
 436         /* clear it */
 437         de->old_chunk = 0;
 438         de->new_chunk = 0;
 439 }
 440
 441 /*
 442  * Registers the exceptions that are present in the current area.
 443  * 'full' is filled in to indicate if the area has been
 444  * filled.
 445  */
 446 static int insert_exceptions(struct pstore *ps, void *ps_area,
 447                              int (*callback)(void *callback_context,
 448                                              chunk_t old, chunk_t new),
 449                              void *callback_context,
 450                              int *full)
 451 {
 452         int r;
 453         unsigned int i;
 454         struct core_exception e;
 455
 456         /* presume the area is full */
 457         *full = 1;
 458
 459         for (i = 0; i < ps->exceptions_per_area; i++) {
 460                 read_exception(ps, ps_area, i, &e);
 461
 462                 /*
 463                  * If the new_chunk is pointing at the start of
 464                  * the COW device, where the first metadata area
 465                  * is we know that we've hit the end of the
 466                  * exceptions.  Therefore the area is not full.
 467                  */
 468                 if (e.new_chunk == 0LL) {
 469                         ps->current_committed = i;
 470                         *full = 0;
 471                         break;
 472                 }
 473
 474                 /*
 475                  * Keep track of the start of the free chunks.
 476                  */
 477                 if (ps->next_free <= e.new_chunk)
 478                         ps->next_free = e.new_chunk + 1;
 479
 480                 /*
 481                  * Otherwise we add the exception to the snapshot.
 482                  */
 483                 r = callback(callback_context, e.old_chunk, e.new_chunk);
 484                 if (r)
 485                         return r;
 486         }
 487
 488         return 0;
 489 }
 490
 491 static int read_exceptions(struct pstore *ps,
 492                            int (*callback)(void *callback_context, chunk_t old,
 493                                            chunk_t new),
 494                            void *callback_context)
 495 {
 496         int r, full = 1;
 497         struct dm_bufio_client *client;
 498         chunk_t prefetch_area = 0;
 499
 500         client = dm_bufio_client_create(dm_snap_cow(ps->store->snap)->bdev,
 501                                         ps->store->chunk_size << SECTOR_SHIFT,
 502                                         1, 0, NULL, NULL);
 503
 504         if (IS_ERR(client))
 505                 return PTR_ERR(client);
 506
 507         /*
 508          * Setup for one current buffer + desired readahead buffers.
 509          */
 510         dm_bufio_set_minimum_buffers(client, 1 + DM_PREFETCH_CHUNKS);
 511
 512         /*
 513          * Keeping reading chunks and inserting exceptions until
 514          * we find a partially full area.
 515          */
 516         for (ps->current_area = 0; full; ps->current_area++) {
 517                 struct dm_buffer *bp;
 518                 void *area;
 519                 chunk_t chunk;
 520
 521                 if (unlikely(prefetch_area < ps->current_area))
 522                         prefetch_area = ps->current_area;
 523
 524                 if (DM_PREFETCH_CHUNKS) do {
 525                         chunk_t pf_chunk = area_location(ps, prefetch_area);
 526                         if (unlikely(pf_chunk >= dm_bufio_get_device_size(client)))
 527                                 break;
 528                         dm_bufio_prefetch(client, pf_chunk, 1);
 529                         prefetch_area++;
 530                         if (unlikely(!prefetch_area))
 531                                 break;
 532                 } while (prefetch_area <= ps->current_area + DM_PREFETCH_CHUNKS);
 533
 534                 chunk = area_location(ps, ps->current_area);
 535
 536                 area = dm_bufio_read(client, chunk, &bp);
 537                 if (IS_ERR(area)) {
 538                         r = PTR_ERR(area);
 539                         goto ret_destroy_bufio;
 540                 }
 541
 542                 r = insert_exceptions(ps, area, callback, callback_context,
 543                                       &full);
 544
 545                 if (!full)
 546                         memcpy(ps->area, area, ps->store->chunk_size << SECTOR_SHIFT);
 547
 548                 dm_bufio_release(bp);
 549
 550                 dm_bufio_forget(client, chunk);
 551
 552                 if (unlikely(r))
 553                         goto ret_destroy_bufio;
 554         }
 555
 556         ps->current_area--;
 557
 558         skip_metadata(ps);
 559
 560         r = 0;
 561
 562 ret_destroy_bufio:
 563         dm_bufio_client_destroy(client);
 564
 565         return r;
 566 }
 567
 568 static struct pstore *get_info(struct dm_exception_store *store)
 569 {
 570         return (struct pstore *) store->context;
 571 }
 572
 573 static void persistent_usage(struct dm_exception_store *store,
 574                              sector_t *total_sectors,
 575                              sector_t *sectors_allocated,
 576                              sector_t *metadata_sectors)
 577 {
 578         struct pstore *ps = get_info(store);
 579
 580         *sectors_allocated = ps->next_free * store->chunk_size;
 581         *total_sectors = get_dev_size(dm_snap_cow(store->snap)->bdev);
 582
 583         /*
 584          * First chunk is the fixed header.
 585          * Then there are (ps->current_area + 1) metadata chunks, each one
 586          * separated from the next by ps->exceptions_per_area data chunks.
 587          */
 588         *metadata_sectors = (ps->current_area + 1 + NUM_SNAPSHOT_HDR_CHUNKS) *
 589                             store->chunk_size;
 590 }
 591
 592 static void persistent_dtr(struct dm_exception_store *store)
 593 {
 594         struct pstore *ps = get_info(store);
 595
 596         destroy_workqueue(ps->metadata_wq);
 597
 598         /* Created in read_header */
 599         if (ps->io_client)
 600                 dm_io_client_destroy(ps->io_client);
 601         free_area(ps);
 602
 603         /* Allocated in persistent_read_metadata */
 604         vfree(ps->callbacks);
 605
 606         kfree(ps);
 607 }
 608
 609 static int persistent_read_metadata(struct dm_exception_store *store,
 610                                     int (*callback)(void *callback_context,
 611                                                     chunk_t old, chunk_t new),
 612                                     void *callback_context)
 613 {
 614         int r, uninitialized_var(new_snapshot);
 615         struct pstore *ps = get_info(store);
 616
 617         /*
 618          * Read the snapshot header.
 619          */
 620         r = read_header(ps, &new_snapshot);
 621         if (r)
 622                 return r;
 623
 624         /*
 625          * Now we know correct chunk_size, complete the initialisation.
 626          */
 627         ps->exceptions_per_area = (ps->store->chunk_size << SECTOR_SHIFT) /
 628                                   sizeof(struct disk_exception);
 629         ps->callbacks = dm_vcalloc(ps->exceptions_per_area,
 630                                    sizeof(*ps->callbacks));
 631         if (!ps->callbacks)
 632                 return -ENOMEM;
 633
 634         /*
 635          * Do we need to setup a new snapshot ?
 636          */
 637         if (new_snapshot) {
 638                 r = write_header(ps);
 639                 if (r) {
 640                         DMWARN("write_header failed");
 641                         return r;
 642                 }
 643
 644                 ps->current_area = 0;
 645                 zero_memory_area(ps);
 646                 r = zero_disk_area(ps, 0);
 647                 if (r)
 648                         DMWARN("zero_disk_area(0) failed");
 649                 return r;
 650         }
 651         /*
 652          * Sanity checks.
 653          */
 654         if (ps->version != SNAPSHOT_DISK_VERSION) {
 655                 DMWARN("unable to handle snapshot disk version %d",
 656                        ps->version);
 657                 return -EINVAL;
 658         }
 659
 660         /*
 661          * Metadata are valid, but snapshot is invalidated
 662          */
 663         if (!ps->valid)
 664                 return 1;
 665
 666         /*
 667          * Read the metadata.
 668          */
 669         r = read_exceptions(ps, callback, callback_context);
 670
 671         return r;
 672 }
 673
 674 static int persistent_prepare_exception(struct dm_exception_store *store,
 675                                         struct dm_exception *e)
 676 {
 677         struct pstore *ps = get_info(store);
 678         sector_t size = get_dev_size(dm_snap_cow(store->snap)->bdev);
 679
 680         /* Is there enough room ? */
 681         if (size < ((ps->next_free + 1) * store->chunk_size))
 682                 return -ENOSPC;
 683
 684         e->new_chunk = ps->next_free;
 685
 686         /*
 687          * Move onto the next free pending, making sure to take
 688          * into account the location of the metadata chunks.
 689          */
 690         ps->next_free++;
 691         skip_metadata(ps);
 692
 693         atomic_inc(&ps->pending_count);
 694         return 0;
 695 }
 696
 697 static void persistent_commit_exception(struct dm_exception_store *store,
 698                                         struct dm_exception *e,
 699                                         void (*callback) (void *, int success),
 700                                         void *callback_context)
 701 {
 702         unsigned int i;
 703         struct pstore *ps = get_info(store);
 704         struct core_exception ce;
 705         struct commit_callback *cb;
 706
 707         ce.old_chunk = e->old_chunk;
 708         ce.new_chunk = e->new_chunk;
 709         write_exception(ps, ps->current_committed++, &ce);
 710
 711         /*
 712          * Add the callback to the back of the array.  This code
 713          * is the only place where the callback array is
 714          * manipulated, and we know that it will never be called
 715          * multiple times concurrently.
 716          */
 717         cb = ps->callbacks + ps->callback_count++;
 718         cb->callback = callback;
 719         cb->context = callback_context;
 720
 721         /*
 722          * If there are exceptions in flight and we have not yet
 723          * filled this metadata area there's nothing more to do.
 724          */
 725         if (!atomic_dec_and_test(&ps->pending_count) &&
 726             (ps->current_committed != ps->exceptions_per_area))
 727                 return;
 728
 729         /*
 730          * If we completely filled the current area, then wipe the next one.
 731          */
 732         if ((ps->current_committed == ps->exceptions_per_area) &&
 733             zero_disk_area(ps, ps->current_area + 1))
 734                 ps->valid = 0;
 735
 736         /*
 737          * Commit exceptions to disk.
 738          */
 739         if (ps->valid && area_io(ps, WRITE_FLUSH_FUA))
 740                 ps->valid = 0;
 741
 742         /*
 743          * Advance to the next area if this one is full.
 744          */
 745         if (ps->current_committed == ps->exceptions_per_area) {
 746                 ps->current_committed = 0;
 747                 ps->current_area++;
 748                 zero_memory_area(ps);
 749         }
 750
 751         for (i = 0; i < ps->callback_count; i++) {
 752                 cb = ps->callbacks + i;
 753                 cb->callback(cb->context, ps->valid);
 754         }
 755
 756         ps->callback_count = 0;
 757 }
 758
 759 static int persistent_prepare_merge(struct dm_exception_store *store,
 760                                     chunk_t *last_old_chunk,
 761                                     chunk_t *last_new_chunk)
 762 {
 763         struct pstore *ps = get_info(store);
 764         struct core_exception ce;
 765         int nr_consecutive;
 766         int r;
 767
 768         /*
 769          * When current area is empty, move back to preceding area.
 770          */
 771         if (!ps->current_committed) {
 772                 /*
 773                  * Have we finished?
 774                  */
 775                 if (!ps->current_area)
 776                         return 0;
 777
 778                 ps->current_area--;
 779                 r = area_io(ps, READ);
 780                 if (r < 0)
 781                         return r;
 782                 ps->current_committed = ps->exceptions_per_area;
 783         }
 784
 785         read_exception(ps, ps->area, ps->current_committed - 1, &ce);
 786         *last_old_chunk = ce.old_chunk;
 787         *last_new_chunk = ce.new_chunk;
 788
 789         /*
 790          * Find number of consecutive chunks within the current area,
 791          * working backwards.
 792          */
 793         for (nr_consecutive = 1; nr_consecutive < ps->current_committed;
 794              nr_consecutive++) {
 795                 read_exception(ps, ps->area,
 796                                ps->current_committed - 1 - nr_consecutive, &ce);
 797                 if (ce.old_chunk != *last_old_chunk - nr_consecutive ||
 798                     ce.new_chunk != *last_new_chunk - nr_consecutive)
 799                         break;
 800         }
 801
 802         return nr_consecutive;
 803 }
 804
 805 static int persistent_commit_merge(struct dm_exception_store *store,
 806                                    int nr_merged)
 807 {
 808         int r, i;
 809         struct pstore *ps = get_info(store);
 810
 811         BUG_ON(nr_merged > ps->current_committed);
 812
 813         for (i = 0; i < nr_merged; i++)
 814                 clear_exception(ps, ps->current_committed - 1 - i);
 815
 816         r = area_io(ps, WRITE_FLUSH_FUA);
 817         if (r < 0)
 818                 return r;
 819
 820         ps->current_committed -= nr_merged;
 821
 822         /*
 823          * At this stage, only persistent_usage() uses ps->next_free, so
 824          * we make no attempt to keep ps->next_free strictly accurate
 825          * as exceptions may have been committed out-of-order originally.
 826          * Once a snapshot has become merging, we set it to the value it
 827          * would have held had all the exceptions been committed in order.
 828          *
 829          * ps->current_area does not get reduced by prepare_merge() until
 830          * after commit_merge() has removed the nr_merged previous exceptions.
 831          */
 832         ps->next_free = area_location(ps, ps->current_area) +
 833                         ps->current_committed + 1;
 834
 835         return 0;
 836 }
 837
 838 static void persistent_drop_snapshot(struct dm_exception_store *store)
 839 {
 840         struct pstore *ps = get_info(store);
 841
 842         ps->valid = 0;
 843         if (write_header(ps))
 844                 DMWARN("write header failed");
 845 }
 846
 847 static int persistent_ctr(struct dm_exception_store *store, char *options)
 848 {
 849         struct pstore *ps;
 850         int r;
 851
 852         /* allocate the pstore */
 853         ps = kzalloc(sizeof(*ps), GFP_KERNEL);
 854         if (!ps)
 855                 return -ENOMEM;
 856
 857         ps->store = store;
 858         ps->valid = 1;
 859         ps->version = SNAPSHOT_DISK_VERSION;
 860         ps->area = NULL;
 861         ps->zero_area = NULL;
 862         ps->header_area = NULL;
 863         ps->next_free = NUM_SNAPSHOT_HDR_CHUNKS + 1; /* header and 1st area */
 864         ps->current_committed = 0;
 865
 866         ps->callback_count = 0;
 867         atomic_set(&ps->pending_count, 0);
 868         ps->callbacks = NULL;
 869
 870         ps->metadata_wq = alloc_workqueue("ksnaphd", WQ_MEM_RECLAIM, 0);
 871         if (!ps->metadata_wq) {
 872                 DMERR("couldn't start header metadata update thread");
 873                 r = -ENOMEM;
 874                 goto err_workqueue;
 875         }
 876
 877         if (options) {
 878                 char overflow = toupper(options[0]);
 879                 if (overflow == 'O')
 880                         store->userspace_supports_overflow = true;
 881                 else {
 882                         DMERR("Unsupported persistent store option: %s", options);
 883                         r = -EINVAL;
 884                         goto err_options;
 885                 }
 886         }
 887
 888         store->context = ps;
 889
 890         return 0;
 891
 892 err_options:
 893         destroy_workqueue(ps->metadata_wq);
 894 err_workqueue:
 895         kfree(ps);
 896
 897         return r;
 898 }
 899
 900 static unsigned persistent_status(struct dm_exception_store *store,
 901                                   status_type_t status, char *result,
 902                                   unsigned maxlen)
 903 {
 904         unsigned sz = 0;
 905
 906         switch (status) {
 907         case STATUSTYPE_INFO:
 908                 break;
 909         case STATUSTYPE_TABLE:
 910                 DMEMIT(" %s %llu", store->userspace_supports_overflow ? "PO" : "P",
 911                        (unsigned long long)store->chunk_size);
 912         }
 913
 914         return sz;
 915 }
 916
 917 static struct dm_exception_store_type _persistent_type = {
 918         .name = "persistent",
 919         .module = THIS_MODULE,
 920         .ctr = persistent_ctr,
 921         .dtr = persistent_dtr,
 922         .read_metadata = persistent_read_metadata,
 923         .prepare_exception = persistent_prepare_exception,
 924         .commit_exception = persistent_commit_exception,
 925         .prepare_merge = persistent_prepare_merge,
 926         .commit_merge = persistent_commit_merge,
 927         .drop_snapshot = persistent_drop_snapshot,
 928         .usage = persistent_usage,
 929         .status = persistent_status,
 930 };
 931
 932 static struct dm_exception_store_type _persistent_compat_type = {
 933         .name = "P",
 934         .module = THIS_MODULE,
 935         .ctr = persistent_ctr,
 936         .dtr = persistent_dtr,
 937         .read_metadata = persistent_read_metadata,
 938         .prepare_exception = persistent_prepare_exception,
 939         .commit_exception = persistent_commit_exception,
 940         .prepare_merge = persistent_prepare_merge,
 941         .commit_merge = persistent_commit_merge,
 942         .drop_snapshot = persistent_drop_snapshot,
 943         .usage = persistent_usage,
 944         .status = persistent_status,
 945 };
 946
 947 int dm_persistent_snapshot_init(void)
 948 {
 949         int r;
 950
 951         r = dm_exception_store_type_register(&_persistent_type);
 952         if (r) {
 953                 DMERR("Unable to register persistent exception store type");
 954                 return r;
 955         }
 956
 957         r = dm_exception_store_type_register(&_persistent_compat_type);
 958         if (r) {
 959                 DMERR("Unable to register old-style persistent exception "
 960                       "store type");
 961                 dm_exception_store_type_unregister(&_persistent_type);
 962                 return r;
 963         }
 964
 965         return r;
 966 }
 967
 968 void dm_persistent_snapshot_exit(void)
 969 {
 970         dm_exception_store_type_unregister(&_persistent_type);
 971         dm_exception_store_type_unregister(&_persistent_compat_type);
 972 }