fs/btrfs/scrub.c

   1 /*
   2  * Copyright (C) 2011, 2012 STRATO.  All rights reserved.
   3  *
   4  * This program is free software; you can redistribute it and/or
   5  * modify it under the terms of the GNU General Public
   6  * License v2 as published by the Free Software Foundation.
   7  *
   8  * This program is distributed in the hope that it will be useful,
   9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  11  * General Public License for more details.
  12  *
  13  * You should have received a copy of the GNU General Public
  14  * License along with this program; if not, write to the
  15  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  16  * Boston, MA 021110-1307, USA.
  17  */
  18
  19 #include <linux/blkdev.h>
  20 #include <linux/ratelimit.h>
  21 #include "ctree.h"
  22 #include "volumes.h"
  23 #include "disk-io.h"
  24 #include "ordered-data.h"
  25 #include "transaction.h"
  26 #include "backref.h"
  27 #include "extent_io.h"
  28 #include "dev-replace.h"
  29 #include "check-integrity.h"
  30 #include "rcu-string.h"
  31
  32 /*
  33  * This is only the first step towards a full-features scrub. It reads all
  34  * extent and super block and verifies the checksums. In case a bad checksum
  35  * is found or the extent cannot be read, good data will be written back if
  36  * any can be found.
  37  *
  38  * Future enhancements:
  39  *  - In case an unrepairable extent is encountered, track which files are
  40  *    affected and report them
  41  *  - track and record media errors, throw out bad devices
  42  *  - add a mode to also read unallocated space
  43  */
  44
  45 struct scrub_block;
  46 struct scrub_ctx;
  47
  48 /*
  49  * the following three values only influence the performance.
  50  * The last one configures the number of parallel and outstanding I/O
  51  * operations. The first two values configure an upper limit for the number
  52  * of (dynamically allocated) pages that are added to a bio.
  53  */
  54 #define SCRUB_PAGES_PER_RD_BIO  32      /* 128k per bio */
  55 #define SCRUB_PAGES_PER_WR_BIO  32      /* 128k per bio */
  56 #define SCRUB_BIOS_PER_SCTX     64      /* 8MB per device in flight */
  57
  58 /*
  59  * the following value times PAGE_SIZE needs to be large enough to match the
  60  * largest node/leaf/sector size that shall be supported.
  61  * Values larger than BTRFS_STRIPE_LEN are not supported.
  62  */
  63 #define SCRUB_MAX_PAGES_PER_BLOCK       16      /* 64k per node/leaf/sector */
  64
  65 struct scrub_page {
  66         struct scrub_block      *sblock;
  67         struct page             *page;
  68         struct btrfs_device     *dev;
  69         u64                     flags;  /* extent flags */
  70         u64                     generation;
  71         u64                     logical;
  72         u64                     physical;
  73         u64                     physical_for_dev_replace;
  74         atomic_t                ref_count;
  75         struct {
  76                 unsigned int    mirror_num:8;
  77                 unsigned int    have_csum:1;
  78                 unsigned int    io_error:1;
  79         };
  80         u8                      csum[BTRFS_CSUM_SIZE];
  81 };
  82
  83 struct scrub_bio {
  84         int                     index;
  85         struct scrub_ctx        *sctx;
  86         struct btrfs_device     *dev;
  87         struct bio              *bio;
  88         int                     err;
  89         u64                     logical;
  90         u64                     physical;
  91 #if SCRUB_PAGES_PER_WR_BIO >= SCRUB_PAGES_PER_RD_BIO
  92         struct scrub_page       *pagev[SCRUB_PAGES_PER_WR_BIO];
  93 #else
  94         struct scrub_page       *pagev[SCRUB_PAGES_PER_RD_BIO];
  95 #endif
  96         int                     page_count;
  97         int                     next_free;
  98         struct btrfs_work       work;
  99 };
 100
 101 struct scrub_block {
 102         struct scrub_page       *pagev[SCRUB_MAX_PAGES_PER_BLOCK];
 103         int                     page_count;
 104         atomic_t                outstanding_pages;
 105         atomic_t                ref_count; /* free mem on transition to zero */
 106         struct scrub_ctx        *sctx;
 107         struct {
 108                 unsigned int    header_error:1;
 109                 unsigned int    checksum_error:1;
 110                 unsigned int    no_io_error_seen:1;
 111                 unsigned int    generation_error:1; /* also sets header_error */
 112         };
 113 };
 114
 115 struct scrub_wr_ctx {
 116         struct scrub_bio *wr_curr_bio;
 117         struct btrfs_device *tgtdev;
 118         int pages_per_wr_bio; /* <= SCRUB_PAGES_PER_WR_BIO */
 119         atomic_t flush_all_writes;
 120         struct mutex wr_lock;
 121 };
 122
 123 struct scrub_ctx {
 124         struct scrub_bio        *bios[SCRUB_BIOS_PER_SCTX];
 125         struct btrfs_root       *dev_root;
 126         int                     first_free;
 127         int                     curr;
 128         atomic_t                bios_in_flight;
 129         atomic_t                workers_pending;
 130         spinlock_t              list_lock;
 131         wait_queue_head_t       list_wait;
 132         u16                     csum_size;
 133         struct list_head        csum_list;
 134         atomic_t                cancel_req;
 135         int                     readonly;
 136         int                     pages_per_rd_bio;
 137         u32                     sectorsize;
 138         u32                     nodesize;
 139         u32                     leafsize;
 140
 141         int                     is_dev_replace;
 142         struct scrub_wr_ctx     wr_ctx;
 143
 144         /*
 145          * statistics
 146          */
 147         struct btrfs_scrub_progress stat;
 148         spinlock_t              stat_lock;
 149 };
 150
 151 struct scrub_fixup_nodatasum {
 152         struct scrub_ctx        *sctx;
 153         struct btrfs_device     *dev;
 154         u64                     logical;
 155         struct btrfs_root       *root;
 156         struct btrfs_work       work;
 157         int                     mirror_num;
 158 };
 159
 160 struct scrub_copy_nocow_ctx {
 161         struct scrub_ctx        *sctx;
 162         u64                     logical;
 163         u64                     len;
 164         int                     mirror_num;
 165         u64                     physical_for_dev_replace;
 166         struct btrfs_work       work;
 167 };
 168
 169 struct scrub_warning {
 170         struct btrfs_path       *path;
 171         u64                     extent_item_size;
 172         char                    *scratch_buf;
 173         char                    *msg_buf;
 174         const char              *errstr;
 175         sector_t                sector;
 176         u64                     logical;
 177         struct btrfs_device     *dev;
 178         int                     msg_bufsize;
 179         int                     scratch_bufsize;
 180 };
 181
 182
 183 static void scrub_pending_bio_inc(struct scrub_ctx *sctx);
 184 static void scrub_pending_bio_dec(struct scrub_ctx *sctx);
 185 static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx);
 186 static void scrub_pending_trans_workers_dec(struct scrub_ctx *sctx);
 187 static int scrub_handle_errored_block(struct scrub_block *sblock_to_check);
 188 static int scrub_setup_recheck_block(struct scrub_ctx *sctx,
 189                                      struct btrfs_fs_info *fs_info,
 190                                      struct scrub_block *original_sblock,
 191                                      u64 length, u64 logical,
 192                                      struct scrub_block *sblocks_for_recheck);
 193 static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
 194                                 struct scrub_block *sblock, int is_metadata,
 195                                 int have_csum, u8 *csum, u64 generation,
 196                                 u16 csum_size);
 197 static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
 198                                          struct scrub_block *sblock,
 199                                          int is_metadata, int have_csum,
 200                                          const u8 *csum, u64 generation,
 201                                          u16 csum_size);
 202 static void scrub_complete_bio_end_io(struct bio *bio, int err);
 203 static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
 204                                              struct scrub_block *sblock_good,
 205                                              int force_write);
 206 static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
 207                                             struct scrub_block *sblock_good,
 208                                             int page_num, int force_write);
 209 static void scrub_write_block_to_dev_replace(struct scrub_block *sblock);
 210 static int scrub_write_page_to_dev_replace(struct scrub_block *sblock,
 211                                            int page_num);
 212 static int scrub_checksum_data(struct scrub_block *sblock);
 213 static int scrub_checksum_tree_block(struct scrub_block *sblock);
 214 static int scrub_checksum_super(struct scrub_block *sblock);
 215 static void scrub_block_get(struct scrub_block *sblock);
 216 static void scrub_block_put(struct scrub_block *sblock);
 217 static void scrub_page_get(struct scrub_page *spage);
 218 static void scrub_page_put(struct scrub_page *spage);
 219 static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx,
 220                                     struct scrub_page *spage);
 221 static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
 222                        u64 physical, struct btrfs_device *dev, u64 flags,
 223                        u64 gen, int mirror_num, u8 *csum, int force,
 224                        u64 physical_for_dev_replace);
 225 static void scrub_bio_end_io(struct bio *bio, int err);
 226 static void scrub_bio_end_io_worker(struct btrfs_work *work);
 227 static void scrub_block_complete(struct scrub_block *sblock);
 228 static void scrub_remap_extent(struct btrfs_fs_info *fs_info,
 229                                u64 extent_logical, u64 extent_len,
 230                                u64 *extent_physical,
 231                                struct btrfs_device **extent_dev,
 232                                int *extent_mirror_num);
 233 static int scrub_setup_wr_ctx(struct scrub_ctx *sctx,
 234                               struct scrub_wr_ctx *wr_ctx,
 235                               struct btrfs_fs_info *fs_info,
 236                               struct btrfs_device *dev,
 237                               int is_dev_replace);
 238 static void scrub_free_wr_ctx(struct scrub_wr_ctx *wr_ctx);
 239 static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
 240                                     struct scrub_page *spage);
 241 static void scrub_wr_submit(struct scrub_ctx *sctx);
 242 static void scrub_wr_bio_end_io(struct bio *bio, int err);
 243 static void scrub_wr_bio_end_io_worker(struct btrfs_work *work);
 244 static int write_page_nocow(struct scrub_ctx *sctx,
 245                             u64 physical_for_dev_replace, struct page *page);
 246 static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root,
 247                                       void *ctx);
 248 static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
 249                             int mirror_num, u64 physical_for_dev_replace);
 250 static void copy_nocow_pages_worker(struct btrfs_work *work);
 251
 252
 253 static void scrub_pending_bio_inc(struct scrub_ctx *sctx)
 254 {
 255         atomic_inc(&sctx->bios_in_flight);
 256 }
 257
 258 static void scrub_pending_bio_dec(struct scrub_ctx *sctx)
 259 {
 260         atomic_dec(&sctx->bios_in_flight);
 261         wake_up(&sctx->list_wait);
 262 }
 263
 264 /*
 265  * used for workers that require transaction commits (i.e., for the
 266  * NOCOW case)
 267  */
 268 static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx)
 269 {
 270         struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
 271
 272         /*
 273          * increment scrubs_running to prevent cancel requests from
 274          * completing as long as a worker is running. we must also
 275          * increment scrubs_paused to prevent deadlocking on pause
 276          * requests used for transactions commits (as the worker uses a
 277          * transaction context). it is safe to regard the worker
 278          * as paused for all matters practical. effectively, we only
 279          * avoid cancellation requests from completing.
 280          */
 281         mutex_lock(&fs_info->scrub_lock);
 282         atomic_inc(&fs_info->scrubs_running);
 283         atomic_inc(&fs_info->scrubs_paused);
 284         mutex_unlock(&fs_info->scrub_lock);
 285         atomic_inc(&sctx->workers_pending);
 286 }
 287
 288 /* used for workers that require transaction commits */
 289 static void scrub_pending_trans_workers_dec(struct scrub_ctx *sctx)
 290 {
 291         struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
 292
 293         /*
 294          * see scrub_pending_trans_workers_inc() why we're pretending
 295          * to be paused in the scrub counters
 296          */
 297         mutex_lock(&fs_info->scrub_lock);
 298         atomic_dec(&fs_info->scrubs_running);
 299         atomic_dec(&fs_info->scrubs_paused);
 300         mutex_unlock(&fs_info->scrub_lock);
 301         atomic_dec(&sctx->workers_pending);
 302         wake_up(&fs_info->scrub_pause_wait);
 303         wake_up(&sctx->list_wait);
 304 }
 305
 306 static void scrub_free_csums(struct scrub_ctx *sctx)
 307 {
 308         while (!list_empty(&sctx->csum_list)) {
 309                 struct btrfs_ordered_sum *sum;
 310                 sum = list_first_entry(&sctx->csum_list,
 311                                        struct btrfs_ordered_sum, list);
 312                 list_del(&sum->list);
 313                 kfree(sum);
 314         }
 315 }
 316
 317 static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx)
 318 {
 319         int i;
 320
 321         if (!sctx)
 322                 return;
 323
 324         scrub_free_wr_ctx(&sctx->wr_ctx);
 325
 326         /* this can happen when scrub is cancelled */
 327         if (sctx->curr != -1) {
 328                 struct scrub_bio *sbio = sctx->bios[sctx->curr];
 329
 330                 for (i = 0; i < sbio->page_count; i++) {
 331                         WARN_ON(!sbio->pagev[i]->page);
 332                         scrub_block_put(sbio->pagev[i]->sblock);
 333                 }
 334                 bio_put(sbio->bio);
 335         }
 336
 337         for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
 338                 struct scrub_bio *sbio = sctx->bios[i];
 339
 340                 if (!sbio)
 341                         break;
 342                 kfree(sbio);
 343         }
 344
 345         scrub_free_csums(sctx);
 346         kfree(sctx);
 347 }
 348
 349 static noinline_for_stack
 350 struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace)
 351 {
 352         struct scrub_ctx *sctx;
 353         int             i;
 354         struct btrfs_fs_info *fs_info = dev->dev_root->fs_info;
 355         int pages_per_rd_bio;
 356         int ret;
 357
 358         /*
 359          * the setting of pages_per_rd_bio is correct for scrub but might
 360          * be wrong for the dev_replace code where we might read from
 361          * different devices in the initial huge bios. However, that
 362          * code is able to correctly handle the case when adding a page
 363          * to a bio fails.
 364          */
 365         if (dev->bdev)
 366                 pages_per_rd_bio = min_t(int, SCRUB_PAGES_PER_RD_BIO,
 367                                          bio_get_nr_vecs(dev->bdev));
 368         else
 369                 pages_per_rd_bio = SCRUB_PAGES_PER_RD_BIO;
 370         sctx = kzalloc(sizeof(*sctx), GFP_NOFS);
 371         if (!sctx)
 372                 goto nomem;
 373         sctx->is_dev_replace = is_dev_replace;
 374         sctx->pages_per_rd_bio = pages_per_rd_bio;
 375         sctx->curr = -1;
 376         sctx->dev_root = dev->dev_root;
 377         for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
 378                 struct scrub_bio *sbio;
 379
 380                 sbio = kzalloc(sizeof(*sbio), GFP_NOFS);
 381                 if (!sbio)
 382                         goto nomem;
 383                 sctx->bios[i] = sbio;
 384
 385                 sbio->index = i;
 386                 sbio->sctx = sctx;
 387                 sbio->page_count = 0;
 388                 sbio->work.func = scrub_bio_end_io_worker;
 389
 390                 if (i != SCRUB_BIOS_PER_SCTX - 1)
 391                         sctx->bios[i]->next_free = i + 1;
 392                 else
 393                         sctx->bios[i]->next_free = -1;
 394         }
 395         sctx->first_free = 0;
 396         sctx->nodesize = dev->dev_root->nodesize;
 397         sctx->leafsize = dev->dev_root->leafsize;
 398         sctx->sectorsize = dev->dev_root->sectorsize;
 399         atomic_set(&sctx->bios_in_flight, 0);
 400         atomic_set(&sctx->workers_pending, 0);
 401         atomic_set(&sctx->cancel_req, 0);
 402         sctx->csum_size = btrfs_super_csum_size(fs_info->super_copy);
 403         INIT_LIST_HEAD(&sctx->csum_list);
 404
 405         spin_lock_init(&sctx->list_lock);
 406         spin_lock_init(&sctx->stat_lock);
 407         init_waitqueue_head(&sctx->list_wait);
 408
 409         ret = scrub_setup_wr_ctx(sctx, &sctx->wr_ctx, fs_info,
 410                                  fs_info->dev_replace.tgtdev, is_dev_replace);
 411         if (ret) {
 412                 scrub_free_ctx(sctx);
 413                 return ERR_PTR(ret);
 414         }
 415         return sctx;
 416
 417 nomem:
 418         scrub_free_ctx(sctx);
 419         return ERR_PTR(-ENOMEM);
 420 }
 421
 422 static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root,
 423                                      void *warn_ctx)
 424 {
 425         u64 isize;
 426         u32 nlink;
 427         int ret;
 428         int i;
 429         struct extent_buffer *eb;
 430         struct btrfs_inode_item *inode_item;
 431         struct scrub_warning *swarn = warn_ctx;
 432         struct btrfs_fs_info *fs_info = swarn->dev->dev_root->fs_info;
 433         struct inode_fs_paths *ipath = NULL;
 434         struct btrfs_root *local_root;
 435         struct btrfs_key root_key;
 436
 437         root_key.objectid = root;
 438         root_key.type = BTRFS_ROOT_ITEM_KEY;
 439         root_key.offset = (u64)-1;
 440         local_root = btrfs_read_fs_root_no_name(fs_info, &root_key);
 441         if (IS_ERR(local_root)) {
 442                 ret = PTR_ERR(local_root);
 443                 goto err;
 444         }
 445
 446         ret = inode_item_info(inum, 0, local_root, swarn->path);
 447         if (ret) {
 448                 btrfs_release_path(swarn->path);
 449                 goto err;
 450         }
 451
 452         eb = swarn->path->nodes[0];
 453         inode_item = btrfs_item_ptr(eb, swarn->path->slots[0],
 454                                         struct btrfs_inode_item);
 455         isize = btrfs_inode_size(eb, inode_item);
 456         nlink = btrfs_inode_nlink(eb, inode_item);
 457         btrfs_release_path(swarn->path);
 458
 459         ipath = init_ipath(4096, local_root, swarn->path);
 460         if (IS_ERR(ipath)) {
 461                 ret = PTR_ERR(ipath);
 462                 ipath = NULL;
 463                 goto err;
 464         }
 465         ret = paths_from_inode(inum, ipath);
 466
 467         if (ret < 0)
 468                 goto err;
 469
 470         /*
 471          * we deliberately ignore the bit ipath might have been too small to
 472          * hold all of the paths here
 473          */
 474         for (i = 0; i < ipath->fspath->elem_cnt; ++i)
 475                 printk_in_rcu(KERN_WARNING "btrfs: %s at logical %llu on dev "
 476                         "%s, sector %llu, root %llu, inode %llu, offset %llu, "
 477                         "length %llu, links %u (path: %s)\n", swarn->errstr,
 478                         swarn->logical, rcu_str_deref(swarn->dev->name),
 479                         (unsigned long long)swarn->sector, root, inum, offset,
 480                         min(isize - offset, (u64)PAGE_SIZE), nlink,
 481                         (char *)(unsigned long)ipath->fspath->val[i]);
 482
 483         free_ipath(ipath);
 484         return 0;
 485
 486 err:
 487         printk_in_rcu(KERN_WARNING "btrfs: %s at logical %llu on dev "
 488                 "%s, sector %llu, root %llu, inode %llu, offset %llu: path "
 489                 "resolving failed with ret=%d\n", swarn->errstr,
 490                 swarn->logical, rcu_str_deref(swarn->dev->name),
 491                 (unsigned long long)swarn->sector, root, inum, offset, ret);
 492
 493         free_ipath(ipath);
 494         return 0;
 495 }
 496
 497 static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
 498 {
 499         struct btrfs_device *dev;
 500         struct btrfs_fs_info *fs_info;
 501         struct btrfs_path *path;
 502         struct btrfs_key found_key;
 503         struct extent_buffer *eb;
 504         struct btrfs_extent_item *ei;
 505         struct scrub_warning swarn;
 506         unsigned long ptr = 0;
 507         u64 extent_item_pos;
 508         u64 flags = 0;
 509         u64 ref_root;
 510         u32 item_size;
 511         u8 ref_level;
 512         const int bufsize = 4096;
 513         int ret;
 514
 515         WARN_ON(sblock->page_count < 1);
 516         dev = sblock->pagev[0]->dev;
 517         fs_info = sblock->sctx->dev_root->fs_info;
 518
 519         path = btrfs_alloc_path();
 520
 521         swarn.scratch_buf = kmalloc(bufsize, GFP_NOFS);
 522         swarn.msg_buf = kmalloc(bufsize, GFP_NOFS);
 523         swarn.sector = (sblock->pagev[0]->physical) >> 9;
 524         swarn.logical = sblock->pagev[0]->logical;
 525         swarn.errstr = errstr;
 526         swarn.dev = NULL;
 527         swarn.msg_bufsize = bufsize;
 528         swarn.scratch_bufsize = bufsize;
 529
 530         if (!path || !swarn.scratch_buf || !swarn.msg_buf)
 531                 goto out;
 532
 533         ret = extent_from_logical(fs_info, swarn.logical, path, &found_key,
 534                                   &flags);
 535         if (ret < 0)
 536                 goto out;
 537
 538         extent_item_pos = swarn.logical - found_key.objectid;
 539         swarn.extent_item_size = found_key.offset;
 540
 541         eb = path->nodes[0];
 542         ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
 543         item_size = btrfs_item_size_nr(eb, path->slots[0]);
 544         btrfs_release_path(path);
 545
 546         if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
 547                 do {
 548                         ret = tree_backref_for_extent(&ptr, eb, ei, item_size,
 549                                                         &ref_root, &ref_level);
 550                         printk_in_rcu(KERN_WARNING
 551                                 "btrfs: %s at logical %llu on dev %s, "
 552                                 "sector %llu: metadata %s (level %d) in tree "
 553                                 "%llu\n", errstr, swarn.logical,
 554                                 rcu_str_deref(dev->name),
 555                                 (unsigned long long)swarn.sector,
 556                                 ref_level ? "node" : "leaf",
 557                                 ret < 0 ? -1 : ref_level,
 558                                 ret < 0 ? -1 : ref_root);
 559                 } while (ret != 1);
 560         } else {
 561                 swarn.path = path;
 562                 swarn.dev = dev;
 563                 iterate_extent_inodes(fs_info, found_key.objectid,
 564                                         extent_item_pos, 1,
 565                                         scrub_print_warning_inode, &swarn);
 566         }
 567
 568 out:
 569         btrfs_free_path(path);
 570         kfree(swarn.scratch_buf);
 571         kfree(swarn.msg_buf);
 572 }
 573
 574 static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *fixup_ctx)
 575 {
 576         struct page *page = NULL;
 577         unsigned long index;
 578         struct scrub_fixup_nodatasum *fixup = fixup_ctx;
 579         int ret;
 580         int corrected = 0;
 581         struct btrfs_key key;
 582         struct inode *inode = NULL;
 583         u64 end = offset + PAGE_SIZE - 1;
 584         struct btrfs_root *local_root;
 585
 586         key.objectid = root;
 587         key.type = BTRFS_ROOT_ITEM_KEY;
 588         key.offset = (u64)-1;
 589         local_root = btrfs_read_fs_root_no_name(fixup->root->fs_info, &key);
 590         if (IS_ERR(local_root))
 591                 return PTR_ERR(local_root);
 592
 593         key.type = BTRFS_INODE_ITEM_KEY;
 594         key.objectid = inum;
 595         key.offset = 0;
 596         inode = btrfs_iget(fixup->root->fs_info->sb, &key, local_root, NULL);
 597         if (IS_ERR(inode))
 598                 return PTR_ERR(inode);
 599
 600         index = offset >> PAGE_CACHE_SHIFT;
 601
 602         page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
 603         if (!page) {
 604                 ret = -ENOMEM;
 605                 goto out;
 606         }
 607
 608         if (PageUptodate(page)) {
 609                 struct btrfs_fs_info *fs_info;
 610                 if (PageDirty(page)) {
 611                         /*
 612                          * we need to write the data to the defect sector. the
 613                          * data that was in that sector is not in memory,
 614                          * because the page was modified. we must not write the
 615                          * modified page to that sector.
 616                          *
 617                          * TODO: what could be done here: wait for the delalloc
 618                          *       runner to write out that page (might involve
 619                          *       COW) and see whether the sector is still
 620                          *       referenced afterwards.
 621                          *
 622                          * For the meantime, we'll treat this error
 623                          * incorrectable, although there is a chance that a
 624                          * later scrub will find the bad sector again and that
 625                          * there's no dirty page in memory, then.
 626                          */
 627                         ret = -EIO;
 628                         goto out;
 629                 }
 630                 fs_info = BTRFS_I(inode)->root->fs_info;
 631                 ret = repair_io_failure(fs_info, offset, PAGE_SIZE,
 632                                         fixup->logical, page,
 633                                         fixup->mirror_num);
 634                 unlock_page(page);
 635                 corrected = !ret;
 636         } else {
 637                 /*
 638                  * we need to get good data first. the general readpage path
 639                  * will call repair_io_failure for us, we just have to make
 640                  * sure we read the bad mirror.
 641                  */
 642                 ret = set_extent_bits(&BTRFS_I(inode)->io_tree, offset, end,
 643                                         EXTENT_DAMAGED, GFP_NOFS);
 644                 if (ret) {
 645                         /* set_extent_bits should give proper error */
 646                         WARN_ON(ret > 0);
 647                         if (ret > 0)
 648                                 ret = -EFAULT;
 649                         goto out;
 650                 }
 651
 652                 ret = extent_read_full_page(&BTRFS_I(inode)->io_tree, page,
 653                                                 btrfs_get_extent,
 654                                                 fixup->mirror_num);
 655                 wait_on_page_locked(page);
 656
 657                 corrected = !test_range_bit(&BTRFS_I(inode)->io_tree, offset,
 658                                                 end, EXTENT_DAMAGED, 0, NULL);
 659                 if (!corrected)
 660                         clear_extent_bits(&BTRFS_I(inode)->io_tree, offset, end,
 661                                                 EXTENT_DAMAGED, GFP_NOFS);
 662         }
 663
 664 out:
 665         if (page)
 666                 put_page(page);
 667         if (inode)
 668                 iput(inode);
 669
 670         if (ret < 0)
 671                 return ret;
 672
 673         if (ret == 0 && corrected) {
 674                 /*
 675                  * we only need to call readpage for one of the inodes belonging
 676                  * to this extent. so make iterate_extent_inodes stop
 677                  */
 678                 return 1;
 679         }
 680
 681         return -EIO;
 682 }
 683
 684 static void scrub_fixup_nodatasum(struct btrfs_work *work)
 685 {
 686         int ret;
 687         struct scrub_fixup_nodatasum *fixup;
 688         struct scrub_ctx *sctx;
 689         struct btrfs_trans_handle *trans = NULL;
 690         struct btrfs_fs_info *fs_info;
 691         struct btrfs_path *path;
 692         int uncorrectable = 0;
 693
 694         fixup = container_of(work, struct scrub_fixup_nodatasum, work);
 695         sctx = fixup->sctx;
 696         fs_info = fixup->root->fs_info;
 697
 698         path = btrfs_alloc_path();
 699         if (!path) {
 700                 spin_lock(&sctx->stat_lock);
 701                 ++sctx->stat.malloc_errors;
 702                 spin_unlock(&sctx->stat_lock);
 703                 uncorrectable = 1;
 704                 goto out;
 705         }
 706
 707         trans = btrfs_join_transaction(fixup->root);
 708         if (IS_ERR(trans)) {
 709                 uncorrectable = 1;
 710                 goto out;
 711         }
 712
 713         /*
 714          * the idea is to trigger a regular read through the standard path. we
 715          * read a page from the (failed) logical address by specifying the
 716          * corresponding copynum of the failed sector. thus, that readpage is
 717          * expected to fail.
 718          * that is the point where on-the-fly error correction will kick in
 719          * (once it's finished) and rewrite the failed sector if a good copy
 720          * can be found.
 721          */
 722         ret = iterate_inodes_from_logical(fixup->logical, fixup->root->fs_info,
 723                                                 path, scrub_fixup_readpage,
 724                                                 fixup);
 725         if (ret < 0) {
 726                 uncorrectable = 1;
 727                 goto out;
 728         }
 729         WARN_ON(ret != 1);
 730
 731         spin_lock(&sctx->stat_lock);
 732         ++sctx->stat.corrected_errors;
 733         spin_unlock(&sctx->stat_lock);
 734
 735 out:
 736         if (trans && !IS_ERR(trans))
 737                 btrfs_end_transaction(trans, fixup->root);
 738         if (uncorrectable) {
 739                 spin_lock(&sctx->stat_lock);
 740                 ++sctx->stat.uncorrectable_errors;
 741                 spin_unlock(&sctx->stat_lock);
 742                 btrfs_dev_replace_stats_inc(
 743                         &sctx->dev_root->fs_info->dev_replace.
 744                         num_uncorrectable_read_errors);
 745                 printk_ratelimited_in_rcu(KERN_ERR
 746                         "btrfs: unable to fixup (nodatasum) error at logical %llu on dev %s\n",
 747                         (unsigned long long)fixup->logical,
 748                         rcu_str_deref(fixup->dev->name));
 749         }
 750
 751         btrfs_free_path(path);
 752         kfree(fixup);
 753
 754         scrub_pending_trans_workers_dec(sctx);
 755 }
 756
 757 /*
 758  * scrub_handle_errored_block gets called when either verification of the
 759  * pages failed or the bio failed to read, e.g. with EIO. In the latter
 760  * case, this function handles all pages in the bio, even though only one
 761  * may be bad.
 762  * The goal of this function is to repair the errored block by using the
 763  * contents of one of the mirrors.
 764  */
 765 static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
 766 {
 767         struct scrub_ctx *sctx = sblock_to_check->sctx;
 768         struct btrfs_device *dev;
 769         struct btrfs_fs_info *fs_info;
 770         u64 length;
 771         u64 logical;
 772         u64 generation;
 773         unsigned int failed_mirror_index;
 774         unsigned int is_metadata;
 775         unsigned int have_csum;
 776         u8 *csum;
 777         struct scrub_block *sblocks_for_recheck; /* holds one for each mirror */
 778         struct scrub_block *sblock_bad;
 779         int ret;
 780         int mirror_index;
 781         int page_num;
 782         int success;
 783         static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
 784                                       DEFAULT_RATELIMIT_BURST);
 785
 786         BUG_ON(sblock_to_check->page_count < 1);
 787         fs_info = sctx->dev_root->fs_info;
 788         length = sblock_to_check->page_count * PAGE_SIZE;
 789         logical = sblock_to_check->pagev[0]->logical;
 790         generation = sblock_to_check->pagev[0]->generation;
 791         BUG_ON(sblock_to_check->pagev[0]->mirror_num < 1);
 792         failed_mirror_index = sblock_to_check->pagev[0]->mirror_num - 1;
 793         is_metadata = !(sblock_to_check->pagev[0]->flags &
 794                         BTRFS_EXTENT_FLAG_DATA);
 795         have_csum = sblock_to_check->pagev[0]->have_csum;
 796         csum = sblock_to_check->pagev[0]->csum;
 797         dev = sblock_to_check->pagev[0]->dev;
 798
 799         if (sctx->is_dev_replace && !is_metadata && !have_csum) {
 800                 sblocks_for_recheck = NULL;
 801                 goto nodatasum_case;
 802         }
 803
 804         /*
 805          * read all mirrors one after the other. This includes to
 806          * re-read the extent or metadata block that failed (that was
 807          * the cause that this fixup code is called) another time,
 808          * page by page this time in order to know which pages
 809          * caused I/O errors and which ones are good (for all mirrors).
 810          * It is the goal to handle the situation when more than one
 811          * mirror contains I/O errors, but the errors do not
 812          * overlap, i.e. the data can be repaired by selecting the
 813          * pages from those mirrors without I/O error on the
 814          * particular pages. One example (with blocks >= 2 * PAGE_SIZE)
 815          * would be that mirror #1 has an I/O error on the first page,
 816          * the second page is good, and mirror #2 has an I/O error on
 817          * the second page, but the first page is good.
 818          * Then the first page of the first mirror can be repaired by
 819          * taking the first page of the second mirror, and the
 820          * second page of the second mirror can be repaired by
 821          * copying the contents of the 2nd page of the 1st mirror.
 822          * One more note: if the pages of one mirror contain I/O
 823          * errors, the checksum cannot be verified. In order to get
 824          * the best data for repairing, the first attempt is to find
 825          * a mirror without I/O errors and with a validated checksum.
 826          * Only if this is not possible, the pages are picked from
 827          * mirrors with I/O errors without considering the checksum.
 828          * If the latter is the case, at the end, the checksum of the
 829          * repaired area is verified in order to correctly maintain
 830          * the statistics.
 831          */
 832
 833         sblocks_for_recheck = kzalloc(BTRFS_MAX_MIRRORS *
 834                                      sizeof(*sblocks_for_recheck),
 835                                      GFP_NOFS);
 836         if (!sblocks_for_recheck) {
 837                 spin_lock(&sctx->stat_lock);
 838                 sctx->stat.malloc_errors++;
 839                 sctx->stat.read_errors++;
 840                 sctx->stat.uncorrectable_errors++;
 841                 spin_unlock(&sctx->stat_lock);
 842                 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
 843                 goto out;
 844         }
 845
 846         /* setup the context, map the logical blocks and alloc the pages */
 847         ret = scrub_setup_recheck_block(sctx, fs_info, sblock_to_check, length,
 848                                         logical, sblocks_for_recheck);
 849         if (ret) {
 850                 spin_lock(&sctx->stat_lock);
 851                 sctx->stat.read_errors++;
 852                 sctx->stat.uncorrectable_errors++;
 853                 spin_unlock(&sctx->stat_lock);
 854                 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
 855                 goto out;
 856         }
 857         BUG_ON(failed_mirror_index >= BTRFS_MAX_MIRRORS);
 858         sblock_bad = sblocks_for_recheck + failed_mirror_index;
 859
 860         /* build and submit the bios for the failed mirror, check checksums */
 861         scrub_recheck_block(fs_info, sblock_bad, is_metadata, have_csum,
 862                             csum, generation, sctx->csum_size);
 863
 864         if (!sblock_bad->header_error && !sblock_bad->checksum_error &&
 865             sblock_bad->no_io_error_seen) {
 866                 /*
 867                  * the error disappeared after reading page by page, or
 868                  * the area was part of a huge bio and other parts of the
 869                  * bio caused I/O errors, or the block layer merged several
 870                  * read requests into one and the error is caused by a
 871                  * different bio (usually one of the two latter cases is
 872                  * the cause)
 873                  */
 874                 spin_lock(&sctx->stat_lock);
 875                 sctx->stat.unverified_errors++;
 876                 spin_unlock(&sctx->stat_lock);
 877
 878                 if (sctx->is_dev_replace)
 879                         scrub_write_block_to_dev_replace(sblock_bad);
 880                 goto out;
 881         }
 882
 883         if (!sblock_bad->no_io_error_seen) {
 884                 spin_lock(&sctx->stat_lock);
 885                 sctx->stat.read_errors++;
 886                 spin_unlock(&sctx->stat_lock);
 887                 if (__ratelimit(&_rs))
 888                         scrub_print_warning("i/o error", sblock_to_check);
 889                 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
 890         } else if (sblock_bad->checksum_error) {
 891                 spin_lock(&sctx->stat_lock);
 892                 sctx->stat.csum_errors++;
 893                 spin_unlock(&sctx->stat_lock);
 894                 if (__ratelimit(&_rs))
 895                         scrub_print_warning("checksum error", sblock_to_check);
 896                 btrfs_dev_stat_inc_and_print(dev,
 897                                              BTRFS_DEV_STAT_CORRUPTION_ERRS);
 898         } else if (sblock_bad->header_error) {
 899                 spin_lock(&sctx->stat_lock);
 900                 sctx->stat.verify_errors++;
 901                 spin_unlock(&sctx->stat_lock);
 902                 if (__ratelimit(&_rs))
 903                         scrub_print_warning("checksum/header error",
 904                                             sblock_to_check);
 905                 if (sblock_bad->generation_error)
 906                         btrfs_dev_stat_inc_and_print(dev,
 907                                 BTRFS_DEV_STAT_GENERATION_ERRS);
 908                 else
 909                         btrfs_dev_stat_inc_and_print(dev,
 910                                 BTRFS_DEV_STAT_CORRUPTION_ERRS);
 911         }
 912
 913         if (sctx->readonly && !sctx->is_dev_replace)
 914                 goto did_not_correct_error;
 915
 916         if (!is_metadata && !have_csum) {
 917                 struct scrub_fixup_nodatasum *fixup_nodatasum;
 918
 919 nodatasum_case:
 920                 WARN_ON(sctx->is_dev_replace);
 921
 922                 /*
 923                  * !is_metadata and !have_csum, this means that the data
 924                  * might not be COW'ed, that it might be modified
 925                  * concurrently. The general strategy to work on the
 926                  * commit root does not help in the case when COW is not
 927                  * used.
 928                  */
 929                 fixup_nodatasum = kzalloc(sizeof(*fixup_nodatasum), GFP_NOFS);
 930                 if (!fixup_nodatasum)
 931                         goto did_not_correct_error;
 932                 fixup_nodatasum->sctx = sctx;
 933                 fixup_nodatasum->dev = dev;
 934                 fixup_nodatasum->logical = logical;
 935                 fixup_nodatasum->root = fs_info->extent_root;
 936                 fixup_nodatasum->mirror_num = failed_mirror_index + 1;
 937                 scrub_pending_trans_workers_inc(sctx);
 938                 fixup_nodatasum->work.func = scrub_fixup_nodatasum;
 939                 btrfs_queue_worker(&fs_info->scrub_workers,
 940                                    &fixup_nodatasum->work);
 941                 goto out;
 942         }
 943
 944         /*
 945          * now build and submit the bios for the other mirrors, check
 946          * checksums.
 947          * First try to pick the mirror which is completely without I/O
 948          * errors and also does not have a checksum error.
 949          * If one is found, and if a checksum is present, the full block
 950          * that is known to contain an error is rewritten. Afterwards
 951          * the block is known to be corrected.
 952          * If a mirror is found which is completely correct, and no
 953          * checksum is present, only those pages are rewritten that had
 954          * an I/O error in the block to be repaired, since it cannot be
 955          * determined, which copy of the other pages is better (and it
 956          * could happen otherwise that a correct page would be
 957          * overwritten by a bad one).
 958          */
 959         for (mirror_index = 0;
 960              mirror_index < BTRFS_MAX_MIRRORS &&
 961              sblocks_for_recheck[mirror_index].page_count > 0;
 962              mirror_index++) {
 963                 struct scrub_block *sblock_other;
 964
 965                 if (mirror_index == failed_mirror_index)
 966                         continue;
 967                 sblock_other = sblocks_for_recheck + mirror_index;
 968
 969                 /* build and submit the bios, check checksums */
 970                 scrub_recheck_block(fs_info, sblock_other, is_metadata,
 971                                     have_csum, csum, generation,
 972                                     sctx->csum_size);
 973
 974                 if (!sblock_other->header_error &&
 975                     !sblock_other->checksum_error &&
 976                     sblock_other->no_io_error_seen) {
 977                         if (sctx->is_dev_replace) {
 978                                 scrub_write_block_to_dev_replace(sblock_other);
 979                         } else {
 980                                 int force_write = is_metadata || have_csum;
 981
 982                                 ret = scrub_repair_block_from_good_copy(
 983                                                 sblock_bad, sblock_other,
 984                                                 force_write);
 985                         }
 986                         if (0 == ret)
 987                                 goto corrected_error;
 988                 }
 989         }
 990
 991         /*
 992          * for dev_replace, pick good pages and write to the target device.
 993          */
 994         if (sctx->is_dev_replace) {
 995                 success = 1;
 996                 for (page_num = 0; page_num < sblock_bad->page_count;
 997                      page_num++) {
 998                         int sub_success;
 999
1000                         sub_success = 0;
1001                         for (mirror_index = 0;
1002                              mirror_index < BTRFS_MAX_MIRRORS &&
1003                              sblocks_for_recheck[mirror_index].page_count > 0;
1004                              mirror_index++) {
1005                                 struct scrub_block *sblock_other =
1006                                         sblocks_for_recheck + mirror_index;
1007                                 struct scrub_page *page_other =
1008                                         sblock_other->pagev[page_num];
1009
1010                                 if (!page_other->io_error) {
1011                                         ret = scrub_write_page_to_dev_replace(
1012                                                         sblock_other, page_num);
1013                                         if (ret == 0) {
1014                                                 /* succeeded for this page */
1015                                                 sub_success = 1;
1016                                                 break;
1017                                         } else {
1018                                                 btrfs_dev_replace_stats_inc(
1019                                                         &sctx->dev_root->
1020                                                         fs_info->dev_replace.
1021                                                         num_write_errors);
1022                                         }
1023                                 }
1024                         }
1025
1026                         if (!sub_success) {
1027                                 /*
1028                                  * did not find a mirror to fetch the page
1029                                  * from. scrub_write_page_to_dev_replace()
1030                                  * handles this case (page->io_error), by
1031                                  * filling the block with zeros before
1032                                  * submitting the write request
1033                                  */
1034                                 success = 0;
1035                                 ret = scrub_write_page_to_dev_replace(
1036                                                 sblock_bad, page_num);
1037                                 if (ret)
1038                                         btrfs_dev_replace_stats_inc(
1039                                                 &sctx->dev_root->fs_info->
1040                                                 dev_replace.num_write_errors);
1041                         }
1042                 }
1043
1044                 goto out;
1045         }
1046
1047         /*
1048          * for regular scrub, repair those pages that are errored.
1049          * In case of I/O errors in the area that is supposed to be
1050          * repaired, continue by picking good copies of those pages.
1051          * Select the good pages from mirrors to rewrite bad pages from
1052          * the area to fix. Afterwards verify the checksum of the block
1053          * that is supposed to be repaired. This verification step is
1054          * only done for the purpose of statistic counting and for the
1055          * final scrub report, whether errors remain.
1056          * A perfect algorithm could make use of the checksum and try
1057          * all possible combinations of pages from the different mirrors
1058          * until the checksum verification succeeds. For example, when
1059          * the 2nd page of mirror #1 faces I/O errors, and the 2nd page
1060          * of mirror #2 is readable but the final checksum test fails,
1061          * then the 2nd page of mirror #3 could be tried, whether now
1062          * the final checksum succeedes. But this would be a rare
1063          * exception and is therefore not implemented. At least it is
1064          * avoided that the good copy is overwritten.
1065          * A more useful improvement would be to pick the sectors
1066          * without I/O error based on sector sizes (512 bytes on legacy
1067          * disks) instead of on PAGE_SIZE. Then maybe 512 byte of one
1068          * mirror could be repaired by taking 512 byte of a different
1069          * mirror, even if other 512 byte sectors in the same PAGE_SIZE
1070          * area are unreadable.
1071          */
1072
1073         /* can only fix I/O errors from here on */
1074         if (sblock_bad->no_io_error_seen)
1075                 goto did_not_correct_error;
1076
1077         success = 1;
1078         for (page_num = 0; page_num < sblock_bad->page_count; page_num++) {
1079                 struct scrub_page *page_bad = sblock_bad->pagev[page_num];
1080
1081                 if (!page_bad->io_error)
1082                         continue;
1083
1084                 for (mirror_index = 0;
1085                      mirror_index < BTRFS_MAX_MIRRORS &&
1086                      sblocks_for_recheck[mirror_index].page_count > 0;
1087                      mirror_index++) {
1088                         struct scrub_block *sblock_other = sblocks_for_recheck +
1089                                                            mirror_index;
1090                         struct scrub_page *page_other = sblock_other->pagev[
1091                                                         page_num];
1092
1093                         if (!page_other->io_error) {
1094                                 ret = scrub_repair_page_from_good_copy(
1095                                         sblock_bad, sblock_other, page_num, 0);
1096                                 if (0 == ret) {
1097                                         page_bad->io_error = 0;
1098                                         break; /* succeeded for this page */
1099                                 }
1100                         }
1101                 }
1102
1103                 if (page_bad->io_error) {
1104                         /* did not find a mirror to copy the page from */
1105                         success = 0;
1106                 }
1107         }
1108
1109         if (success) {
1110                 if (is_metadata || have_csum) {
1111                         /*
1112                          * need to verify the checksum now that all
1113                          * sectors on disk are repaired (the write
1114                          * request for data to be repaired is on its way).
1115                          * Just be lazy and use scrub_recheck_block()
1116                          * which re-reads the data before the checksum
1117                          * is verified, but most likely the data comes out
1118                          * of the page cache.
1119                          */
1120                         scrub_recheck_block(fs_info, sblock_bad,
1121                                             is_metadata, have_csum, csum,
1122                                             generation, sctx->csum_size);
1123                         if (!sblock_bad->header_error &&
1124                             !sblock_bad->checksum_error &&
1125                             sblock_bad->no_io_error_seen)
1126                                 goto corrected_error;
1127                         else
1128                                 goto did_not_correct_error;
1129                 } else {
1130 corrected_error:
1131                         spin_lock(&sctx->stat_lock);
1132                         sctx->stat.corrected_errors++;
1133                         spin_unlock(&sctx->stat_lock);
1134                         printk_ratelimited_in_rcu(KERN_ERR
1135                                 "btrfs: fixed up error at logical %llu on dev %s\n",
1136                                 (unsigned long long)logical,
1137                                 rcu_str_deref(dev->name));
1138                 }
1139         } else {
1140 did_not_correct_error:
1141                 spin_lock(&sctx->stat_lock);
1142                 sctx->stat.uncorrectable_errors++;
1143                 spin_unlock(&sctx->stat_lock);
1144                 printk_ratelimited_in_rcu(KERN_ERR
1145                         "btrfs: unable to fixup (regular) error at logical %llu on dev %s\n",
1146                         (unsigned long long)logical,
1147                         rcu_str_deref(dev->name));
1148         }
1149
1150 out:
1151         if (sblocks_for_recheck) {
1152                 for (mirror_index = 0; mirror_index < BTRFS_MAX_MIRRORS;
1153                      mirror_index++) {
1154                         struct scrub_block *sblock = sblocks_for_recheck +
1155                                                      mirror_index;
1156                         int page_index;
1157
1158                         for (page_index = 0; page_index < sblock->page_count;
1159                              page_index++) {
1160                                 sblock->pagev[page_index]->sblock = NULL;
1161                                 scrub_page_put(sblock->pagev[page_index]);
1162                         }
1163                 }
1164                 kfree(sblocks_for_recheck);
1165         }
1166
1167         return 0;
1168 }
1169
1170 static int scrub_setup_recheck_block(struct scrub_ctx *sctx,
1171                                      struct btrfs_fs_info *fs_info,
1172                                      struct scrub_block *original_sblock,
1173                                      u64 length, u64 logical,
1174                                      struct scrub_block *sblocks_for_recheck)
1175 {
1176         int page_index;
1177         int mirror_index;
1178         int ret;
1179
1180         /*
1181          * note: the two members ref_count and outstanding_pages
1182          * are not used (and not set) in the blocks that are used for
1183          * the recheck procedure
1184          */
1185
1186         page_index = 0;
1187         while (length > 0) {
1188                 u64 sublen = min_t(u64, length, PAGE_SIZE);
1189                 u64 mapped_length = sublen;
1190                 struct btrfs_bio *bbio = NULL;
1191
1192                 /*
1193                  * with a length of PAGE_SIZE, each returned stripe
1194                  * represents one mirror
1195                  */
1196                 ret = btrfs_map_block(fs_info, REQ_GET_READ_MIRRORS, logical,
1197                                       &mapped_length, &bbio, 0);
1198                 if (ret || !bbio || mapped_length < sublen) {
1199                         kfree(bbio);
1200                         return -EIO;
1201                 }
1202
1203                 BUG_ON(page_index >= SCRUB_PAGES_PER_RD_BIO);
1204                 for (mirror_index = 0; mirror_index < (int)bbio->num_stripes;
1205                      mirror_index++) {
1206                         struct scrub_block *sblock;
1207                         struct scrub_page *page;
1208
1209                         if (mirror_index >= BTRFS_MAX_MIRRORS)
1210                                 continue;
1211
1212                         sblock = sblocks_for_recheck + mirror_index;
1213                         sblock->sctx = sctx;
1214                         page = kzalloc(sizeof(*page), GFP_NOFS);
1215                         if (!page) {
1216 leave_nomem:
1217                                 spin_lock(&sctx->stat_lock);
1218                                 sctx->stat.malloc_errors++;
1219                                 spin_unlock(&sctx->stat_lock);
1220                                 kfree(bbio);
1221                                 return -ENOMEM;
1222                         }
1223                         scrub_page_get(page);
1224                         sblock->pagev[page_index] = page;
1225                         page->logical = logical;
1226                         page->physical = bbio->stripes[mirror_index].physical;
1227                         BUG_ON(page_index >= original_sblock->page_count);
1228                         page->physical_for_dev_replace =
1229                                 original_sblock->pagev[page_index]->
1230                                 physical_for_dev_replace;
1231                         /* for missing devices, dev->bdev is NULL */
1232                         page->dev = bbio->stripes[mirror_index].dev;
1233                         page->mirror_num = mirror_index + 1;
1234                         sblock->page_count++;
1235                         page->page = alloc_page(GFP_NOFS);
1236                         if (!page->page)
1237                                 goto leave_nomem;
1238                 }
1239                 kfree(bbio);
1240                 length -= sublen;
1241                 logical += sublen;
1242                 page_index++;
1243         }
1244
1245         return 0;
1246 }
1247
1248 /*
1249  * this function will check the on disk data for checksum errors, header
1250  * errors and read I/O errors. If any I/O errors happen, the exact pages
1251  * which are errored are marked as being bad. The goal is to enable scrub
1252  * to take those pages that are not errored from all the mirrors so that
1253  * the pages that are errored in the just handled mirror can be repaired.
1254  */
1255 static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
1256                                 struct scrub_block *sblock, int is_metadata,
1257                                 int have_csum, u8 *csum, u64 generation,
1258                                 u16 csum_size)
1259 {
1260         int page_num;
1261
1262         sblock->no_io_error_seen = 1;
1263         sblock->header_error = 0;
1264         sblock->checksum_error = 0;
1265
1266         for (page_num = 0; page_num < sblock->page_count; page_num++) {
1267                 struct bio *bio;
1268                 struct scrub_page *page = sblock->pagev[page_num];
1269                 DECLARE_COMPLETION_ONSTACK(complete);
1270
1271                 if (page->dev->bdev == NULL) {
1272                         page->io_error = 1;
1273                         sblock->no_io_error_seen = 0;
1274                         continue;
1275                 }
1276
1277                 WARN_ON(!page->page);
1278                 bio = bio_alloc(GFP_NOFS, 1);
1279                 if (!bio) {
1280                         page->io_error = 1;
1281                         sblock->no_io_error_seen = 0;
1282                         continue;
1283                 }
1284                 bio->bi_bdev = page->dev->bdev;
1285                 bio->bi_sector = page->physical >> 9;
1286                 bio->bi_end_io = scrub_complete_bio_end_io;
1287                 bio->bi_private = &complete;
1288
1289                 bio_add_page(bio, page->page, PAGE_SIZE, 0);
1290                 btrfsic_submit_bio(READ, bio);
1291
1292                 /* this will also unplug the queue */
1293                 wait_for_completion(&complete);
1294
1295                 page->io_error = !test_bit(BIO_UPTODATE, &bio->bi_flags);
1296                 if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
1297                         sblock->no_io_error_seen = 0;
1298                 bio_put(bio);
1299         }
1300
1301         if (sblock->no_io_error_seen)
1302                 scrub_recheck_block_checksum(fs_info, sblock, is_metadata,
1303                                              have_csum, csum, generation,
1304                                              csum_size);
1305
1306         return;
1307 }
1308
1309 static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
1310                                          struct scrub_block *sblock,
1311                                          int is_metadata, int have_csum,
1312                                          const u8 *csum, u64 generation,
1313                                          u16 csum_size)
1314 {
1315         int page_num;
1316         u8 calculated_csum[BTRFS_CSUM_SIZE];
1317         u32 crc = ~(u32)0;
1318         struct btrfs_root *root = fs_info->extent_root;
1319         void *mapped_buffer;
1320
1321         WARN_ON(!sblock->pagev[0]->page);
1322         if (is_metadata) {
1323                 struct btrfs_header *h;
1324
1325                 mapped_buffer = kmap_atomic(sblock->pagev[0]->page);
1326                 h = (struct btrfs_header *)mapped_buffer;
1327
1328                 if (sblock->pagev[0]->logical != le64_to_cpu(h->bytenr) ||
1329                     memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE) ||
1330                     memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
1331                            BTRFS_UUID_SIZE)) {
1332                         sblock->header_error = 1;
1333                 } else if (generation != le64_to_cpu(h->generation)) {
1334                         sblock->header_error = 1;
1335                         sblock->generation_error = 1;
1336                 }
1337                 csum = h->csum;
1338         } else {
1339                 if (!have_csum)
1340                         return;
1341
1342                 mapped_buffer = kmap_atomic(sblock->pagev[0]->page);
1343         }
1344
1345         for (page_num = 0;;) {
1346                 if (page_num == 0 && is_metadata)
1347                         crc = btrfs_csum_data(root,
1348                                 ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE,
1349                                 crc, PAGE_SIZE - BTRFS_CSUM_SIZE);
1350                 else
1351                         crc = btrfs_csum_data(root, mapped_buffer, crc,
1352                                               PAGE_SIZE);
1353
1354                 kunmap_atomic(mapped_buffer);
1355                 page_num++;
1356                 if (page_num >= sblock->page_count)
1357                         break;
1358                 WARN_ON(!sblock->pagev[page_num]->page);
1359
1360                 mapped_buffer = kmap_atomic(sblock->pagev[page_num]->page);
1361         }
1362
1363         btrfs_csum_final(crc, calculated_csum);
1364         if (memcmp(calculated_csum, csum, csum_size))
1365                 sblock->checksum_error = 1;
1366 }
1367
1368 static void scrub_complete_bio_end_io(struct bio *bio, int err)
1369 {
1370         complete((struct completion *)bio->bi_private);
1371 }
1372
1373 static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
1374                                              struct scrub_block *sblock_good,
1375                                              int force_write)
1376 {
1377         int page_num;
1378         int ret = 0;
1379
1380         for (page_num = 0; page_num < sblock_bad->page_count; page_num++) {
1381                 int ret_sub;
1382
1383                 ret_sub = scrub_repair_page_from_good_copy(sblock_bad,
1384                                                            sblock_good,
1385                                                            page_num,
1386                                                            force_write);
1387                 if (ret_sub)
1388                         ret = ret_sub;
1389         }
1390
1391         return ret;
1392 }
1393
1394 static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
1395                                             struct scrub_block *sblock_good,
1396                                             int page_num, int force_write)
1397 {
1398         struct scrub_page *page_bad = sblock_bad->pagev[page_num];
1399         struct scrub_page *page_good = sblock_good->pagev[page_num];
1400
1401         BUG_ON(page_bad->page == NULL);
1402         BUG_ON(page_good->page == NULL);
1403         if (force_write || sblock_bad->header_error ||
1404             sblock_bad->checksum_error || page_bad->io_error) {
1405                 struct bio *bio;
1406                 int ret;
1407                 DECLARE_COMPLETION_ONSTACK(complete);
1408
1409                 if (!page_bad->dev->bdev) {
1410                         printk_ratelimited(KERN_WARNING
1411                                 "btrfs: scrub_repair_page_from_good_copy(bdev == NULL) is unexpected!\n");
1412                         return -EIO;
1413                 }
1414
1415                 bio = bio_alloc(GFP_NOFS, 1);
1416                 if (!bio)
1417                         return -EIO;
1418                 bio->bi_bdev = page_bad->dev->bdev;
1419                 bio->bi_sector = page_bad->physical >> 9;
1420                 bio->bi_end_io = scrub_complete_bio_end_io;
1421                 bio->bi_private = &complete;
1422
1423                 ret = bio_add_page(bio, page_good->page, PAGE_SIZE, 0);
1424                 if (PAGE_SIZE != ret) {
1425                         bio_put(bio);
1426                         return -EIO;
1427                 }
1428                 btrfsic_submit_bio(WRITE, bio);
1429
1430                 /* this will also unplug the queue */
1431                 wait_for_completion(&complete);
1432                 if (!bio_flagged(bio, BIO_UPTODATE)) {
1433                         btrfs_dev_stat_inc_and_print(page_bad->dev,
1434                                 BTRFS_DEV_STAT_WRITE_ERRS);
1435                         btrfs_dev_replace_stats_inc(
1436                                 &sblock_bad->sctx->dev_root->fs_info->
1437                                 dev_replace.num_write_errors);
1438                         bio_put(bio);
1439                         return -EIO;
1440                 }
1441                 bio_put(bio);
1442         }
1443
1444         return 0;
1445 }
1446
1447 static void scrub_write_block_to_dev_replace(struct scrub_block *sblock)
1448 {
1449         int page_num;
1450
1451         for (page_num = 0; page_num < sblock->page_count; page_num++) {
1452                 int ret;
1453
1454                 ret = scrub_write_page_to_dev_replace(sblock, page_num);
1455                 if (ret)
1456                         btrfs_dev_replace_stats_inc(
1457                                 &sblock->sctx->dev_root->fs_info->dev_replace.
1458                                 num_write_errors);
1459         }
1460 }
1461
1462 static int scrub_write_page_to_dev_replace(struct scrub_block *sblock,
1463                                            int page_num)
1464 {
1465         struct scrub_page *spage = sblock->pagev[page_num];
1466
1467         BUG_ON(spage->page == NULL);
1468         if (spage->io_error) {
1469                 void *mapped_buffer = kmap_atomic(spage->page);
1470
1471                 memset(mapped_buffer, 0, PAGE_CACHE_SIZE);
1472                 flush_dcache_page(spage->page);
1473                 kunmap_atomic(mapped_buffer);
1474         }
1475         return scrub_add_page_to_wr_bio(sblock->sctx, spage);
1476 }
1477
1478 static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
1479                                     struct scrub_page *spage)
1480 {
1481         struct scrub_wr_ctx *wr_ctx = &sctx->wr_ctx;
1482         struct scrub_bio *sbio;
1483         int ret;
1484
1485         mutex_lock(&wr_ctx->wr_lock);
1486 again:
1487         if (!wr_ctx->wr_curr_bio) {
1488                 wr_ctx->wr_curr_bio = kzalloc(sizeof(*wr_ctx->wr_curr_bio),
1489                                               GFP_NOFS);
1490                 if (!wr_ctx->wr_curr_bio) {
1491                         mutex_unlock(&wr_ctx->wr_lock);
1492                         return -ENOMEM;
1493                 }
1494                 wr_ctx->wr_curr_bio->sctx = sctx;
1495                 wr_ctx->wr_curr_bio->page_count = 0;
1496         }
1497         sbio = wr_ctx->wr_curr_bio;
1498         if (sbio->page_count == 0) {
1499                 struct bio *bio;
1500
1501                 sbio->physical = spage->physical_for_dev_replace;
1502                 sbio->logical = spage->logical;
1503                 sbio->dev = wr_ctx->tgtdev;
1504                 bio = sbio->bio;
1505                 if (!bio) {
1506                         bio = bio_alloc(GFP_NOFS, wr_ctx->pages_per_wr_bio);
1507                         if (!bio) {
1508                                 mutex_unlock(&wr_ctx->wr_lock);
1509                                 return -ENOMEM;
1510                         }
1511                         sbio->bio = bio;
1512                 }
1513
1514                 bio->bi_private = sbio;
1515                 bio->bi_end_io = scrub_wr_bio_end_io;
1516                 bio->bi_bdev = sbio->dev->bdev;
1517                 bio->bi_sector = sbio->physical >> 9;
1518                 sbio->err = 0;
1519         } else if (sbio->physical + sbio->page_count * PAGE_SIZE !=
1520                    spage->physical_for_dev_replace ||
1521                    sbio->logical + sbio->page_count * PAGE_SIZE !=
1522                    spage->logical) {
1523                 scrub_wr_submit(sctx);
1524                 goto again;
1525         }
1526
1527         ret = bio_add_page(sbio->bio, spage->page, PAGE_SIZE, 0);
1528         if (ret != PAGE_SIZE) {
1529                 if (sbio->page_count < 1) {
1530                         bio_put(sbio->bio);
1531                         sbio->bio = NULL;
1532                         mutex_unlock(&wr_ctx->wr_lock);
1533                         return -EIO;
1534                 }
1535                 scrub_wr_submit(sctx);
1536                 goto again;
1537         }
1538
1539         sbio->pagev[sbio->page_count] = spage;
1540         scrub_page_get(spage);
1541         sbio->page_count++;
1542         if (sbio->page_count == wr_ctx->pages_per_wr_bio)
1543                 scrub_wr_submit(sctx);
1544         mutex_unlock(&wr_ctx->wr_lock);
1545
1546         return 0;
1547 }
1548
1549 static void scrub_wr_submit(struct scrub_ctx *sctx)
1550 {
1551         struct scrub_wr_ctx *wr_ctx = &sctx->wr_ctx;
1552         struct scrub_bio *sbio;
1553
1554         if (!wr_ctx->wr_curr_bio)
1555                 return;
1556
1557         sbio = wr_ctx->wr_curr_bio;
1558         wr_ctx->wr_curr_bio = NULL;
1559         WARN_ON(!sbio->bio->bi_bdev);
1560         scrub_pending_bio_inc(sctx);
1561         /* process all writes in a single worker thread. Then the block layer
1562          * orders the requests before sending them to the driver which
1563          * doubled the write performance on spinning disks when measured
1564          * with Linux 3.5 */
1565         btrfsic_submit_bio(WRITE, sbio->bio);
1566 }
1567
1568 static void scrub_wr_bio_end_io(struct bio *bio, int err)
1569 {
1570         struct scrub_bio *sbio = bio->bi_private;
1571         struct btrfs_fs_info *fs_info = sbio->dev->dev_root->fs_info;
1572
1573         sbio->err = err;
1574         sbio->bio = bio;
1575
1576         sbio->work.func = scrub_wr_bio_end_io_worker;
1577         btrfs_queue_worker(&fs_info->scrub_wr_completion_workers, &sbio->work);
1578 }
1579
1580 static void scrub_wr_bio_end_io_worker(struct btrfs_work *work)
1581 {
1582         struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
1583         struct scrub_ctx *sctx = sbio->sctx;
1584         int i;
1585
1586         WARN_ON(sbio->page_count > SCRUB_PAGES_PER_WR_BIO);
1587         if (sbio->err) {
1588                 struct btrfs_dev_replace *dev_replace =
1589                         &sbio->sctx->dev_root->fs_info->dev_replace;
1590
1591                 for (i = 0; i < sbio->page_count; i++) {
1592                         struct scrub_page *spage = sbio->pagev[i];
1593
1594                         spage->io_error = 1;
1595                         btrfs_dev_replace_stats_inc(&dev_replace->
1596                                                     num_write_errors);
1597                 }
1598         }
1599
1600         for (i = 0; i < sbio->page_count; i++)
1601                 scrub_page_put(sbio->pagev[i]);
1602
1603         bio_put(sbio->bio);
1604         kfree(sbio);
1605         scrub_pending_bio_dec(sctx);
1606 }
1607
1608 static int scrub_checksum(struct scrub_block *sblock)
1609 {
1610         u64 flags;
1611         int ret;
1612
1613         WARN_ON(sblock->page_count < 1);
1614         flags = sblock->pagev[0]->flags;
1615         ret = 0;
1616         if (flags & BTRFS_EXTENT_FLAG_DATA)
1617                 ret = scrub_checksum_data(sblock);
1618         else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
1619                 ret = scrub_checksum_tree_block(sblock);
1620         else if (flags & BTRFS_EXTENT_FLAG_SUPER)
1621                 (void)scrub_checksum_super(sblock);
1622         else
1623                 WARN_ON(1);
1624         if (ret)
1625                 scrub_handle_errored_block(sblock);
1626
1627         return ret;
1628 }
1629
1630 static int scrub_checksum_data(struct scrub_block *sblock)
1631 {
1632         struct scrub_ctx *sctx = sblock->sctx;
1633         u8 csum[BTRFS_CSUM_SIZE];
1634         u8 *on_disk_csum;
1635         struct page *page;
1636         void *buffer;
1637         u32 crc = ~(u32)0;
1638         int fail = 0;
1639         struct btrfs_root *root = sctx->dev_root;
1640         u64 len;
1641         int index;
1642
1643         BUG_ON(sblock->page_count < 1);
1644         if (!sblock->pagev[0]->have_csum)
1645                 return 0;
1646
1647         on_disk_csum = sblock->pagev[0]->csum;
1648         page = sblock->pagev[0]->page;
1649         buffer = kmap_atomic(page);
1650
1651         len = sctx->sectorsize;
1652         index = 0;
1653         for (;;) {
1654                 u64 l = min_t(u64, len, PAGE_SIZE);
1655
1656                 crc = btrfs_csum_data(root, buffer, crc, l);
1657                 kunmap_atomic(buffer);
1658                 len -= l;
1659                 if (len == 0)
1660                         break;
1661                 index++;
1662                 BUG_ON(index >= sblock->page_count);
1663                 BUG_ON(!sblock->pagev[index]->page);
1664                 page = sblock->pagev[index]->page;
1665                 buffer = kmap_atomic(page);
1666         }
1667
1668         btrfs_csum_final(crc, csum);
1669         if (memcmp(csum, on_disk_csum, sctx->csum_size))
1670                 fail = 1;
1671
1672         return fail;
1673 }
1674
1675 static int scrub_checksum_tree_block(struct scrub_block *sblock)
1676 {
1677         struct scrub_ctx *sctx = sblock->sctx;
1678         struct btrfs_header *h;
1679         struct btrfs_root *root = sctx->dev_root;
1680         struct btrfs_fs_info *fs_info = root->fs_info;
1681         u8 calculated_csum[BTRFS_CSUM_SIZE];
1682         u8 on_disk_csum[BTRFS_CSUM_SIZE];
1683         struct page *page;
1684         void *mapped_buffer;
1685         u64 mapped_size;
1686         void *p;
1687         u32 crc = ~(u32)0;
1688         int fail = 0;
1689         int crc_fail = 0;
1690         u64 len;
1691         int index;
1692
1693         BUG_ON(sblock->page_count < 1);
1694         page = sblock->pagev[0]->page;
1695         mapped_buffer = kmap_atomic(page);
1696         h = (struct btrfs_header *)mapped_buffer;
1697         memcpy(on_disk_csum, h->csum, sctx->csum_size);
1698
1699         /*
1700          * we don't use the getter functions here, as we
1701          * a) don't have an extent buffer and
1702          * b) the page is already kmapped
1703          */
1704
1705         if (sblock->pagev[0]->logical != le64_to_cpu(h->bytenr))
1706                 ++fail;
1707
1708         if (sblock->pagev[0]->generation != le64_to_cpu(h->generation))
1709                 ++fail;
1710
1711         if (memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE))
1712                 ++fail;
1713
1714         if (memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
1715                    BTRFS_UUID_SIZE))
1716                 ++fail;
1717
1718         WARN_ON(sctx->nodesize != sctx->leafsize);
1719         len = sctx->nodesize - BTRFS_CSUM_SIZE;
1720         mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE;
1721         p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE;
1722         index = 0;
1723         for (;;) {
1724                 u64 l = min_t(u64, len, mapped_size);
1725
1726                 crc = btrfs_csum_data(root, p, crc, l);
1727                 kunmap_atomic(mapped_buffer);
1728                 len -= l;
1729                 if (len == 0)
1730                         break;
1731                 index++;
1732                 BUG_ON(index >= sblock->page_count);
1733                 BUG_ON(!sblock->pagev[index]->page);
1734                 page = sblock->pagev[index]->page;
1735                 mapped_buffer = kmap_atomic(page);
1736                 mapped_size = PAGE_SIZE;
1737                 p = mapped_buffer;
1738         }
1739
1740         btrfs_csum_final(crc, calculated_csum);
1741         if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size))
1742                 ++crc_fail;
1743
1744         return fail || crc_fail;
1745 }
1746
1747 static int scrub_checksum_super(struct scrub_block *sblock)
1748 {
1749         struct btrfs_super_block *s;
1750         struct scrub_ctx *sctx = sblock->sctx;
1751         struct btrfs_root *root = sctx->dev_root;
1752         struct btrfs_fs_info *fs_info = root->fs_info;
1753         u8 calculated_csum[BTRFS_CSUM_SIZE];
1754         u8 on_disk_csum[BTRFS_CSUM_SIZE];
1755         struct page *page;
1756         void *mapped_buffer;
1757         u64 mapped_size;
1758         void *p;
1759         u32 crc = ~(u32)0;
1760         int fail_gen = 0;
1761         int fail_cor = 0;
1762         u64 len;
1763         int index;
1764
1765         BUG_ON(sblock->page_count < 1);
1766         page = sblock->pagev[0]->page;
1767         mapped_buffer = kmap_atomic(page);
1768         s = (struct btrfs_super_block *)mapped_buffer;
1769         memcpy(on_disk_csum, s->csum, sctx->csum_size);
1770
1771         if (sblock->pagev[0]->logical != le64_to_cpu(s->bytenr))
1772                 ++fail_cor;
1773
1774         if (sblock->pagev[0]->generation != le64_to_cpu(s->generation))
1775                 ++fail_gen;
1776
1777         if (memcmp(s->fsid, fs_info->fsid, BTRFS_UUID_SIZE))
1778                 ++fail_cor;
1779
1780         len = BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE;
1781         mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE;
1782         p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE;
1783         index = 0;
1784         for (;;) {
1785                 u64 l = min_t(u64, len, mapped_size);
1786
1787                 crc = btrfs_csum_data(root, p, crc, l);
1788                 kunmap_atomic(mapped_buffer);
1789                 len -= l;
1790                 if (len == 0)
1791                         break;
1792                 index++;
1793                 BUG_ON(index >= sblock->page_count);
1794                 BUG_ON(!sblock->pagev[index]->page);
1795                 page = sblock->pagev[index]->page;
1796                 mapped_buffer = kmap_atomic(page);
1797                 mapped_size = PAGE_SIZE;
1798                 p = mapped_buffer;
1799         }
1800
1801         btrfs_csum_final(crc, calculated_csum);
1802         if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size))
1803                 ++fail_cor;
1804
1805         if (fail_cor + fail_gen) {
1806                 /*
1807                  * if we find an error in a super block, we just report it.
1808                  * They will get written with the next transaction commit
1809                  * anyway
1810                  */
1811                 spin_lock(&sctx->stat_lock);
1812                 ++sctx->stat.super_errors;
1813                 spin_unlock(&sctx->stat_lock);
1814                 if (fail_cor)
1815                         btrfs_dev_stat_inc_and_print(sblock->pagev[0]->dev,
1816                                 BTRFS_DEV_STAT_CORRUPTION_ERRS);
1817                 else
1818                         btrfs_dev_stat_inc_and_print(sblock->pagev[0]->dev,
1819                                 BTRFS_DEV_STAT_GENERATION_ERRS);
1820         }
1821
1822         return fail_cor + fail_gen;
1823 }
1824
1825 static void scrub_block_get(struct scrub_block *sblock)
1826 {
1827         atomic_inc(&sblock->ref_count);
1828 }
1829
1830 static void scrub_block_put(struct scrub_block *sblock)
1831 {
1832         if (atomic_dec_and_test(&sblock->ref_count)) {
1833                 int i;
1834
1835                 for (i = 0; i < sblock->page_count; i++)
1836                         scrub_page_put(sblock->pagev[i]);
1837                 kfree(sblock);
1838         }
1839 }
1840
1841 static void scrub_page_get(struct scrub_page *spage)
1842 {
1843         atomic_inc(&spage->ref_count);
1844 }
1845
1846 static void scrub_page_put(struct scrub_page *spage)
1847 {
1848         if (atomic_dec_and_test(&spage->ref_count)) {
1849                 if (spage->page)
1850                         __free_page(spage->page);
1851                 kfree(spage);
1852         }
1853 }
1854
1855 static void scrub_submit(struct scrub_ctx *sctx)
1856 {
1857         struct scrub_bio *sbio;
1858
1859         if (sctx->curr == -1)
1860                 return;
1861
1862         sbio = sctx->bios[sctx->curr];
1863         sctx->curr = -1;
1864         scrub_pending_bio_inc(sctx);
1865
1866         if (!sbio->bio->bi_bdev) {
1867                 /*
1868                  * this case should not happen. If btrfs_map_block() is
1869                  * wrong, it could happen for dev-replace operations on
1870                  * missing devices when no mirrors are available, but in
1871                  * this case it should already fail the mount.
1872                  * This case is handled correctly (but _very_ slowly).
1873                  */
1874                 printk_ratelimited(KERN_WARNING
1875                         "btrfs: scrub_submit(bio bdev == NULL) is unexpected!\n");
1876                 bio_endio(sbio->bio, -EIO);
1877         } else {
1878                 btrfsic_submit_bio(READ, sbio->bio);
1879         }
1880 }
1881
1882 static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx,
1883                                     struct scrub_page *spage)
1884 {
1885         struct scrub_block *sblock = spage->sblock;
1886         struct scrub_bio *sbio;
1887         int ret;
1888
1889 again:
1890         /*
1891          * grab a fresh bio or wait for one to become available
1892          */
1893         while (sctx->curr == -1) {
1894                 spin_lock(&sctx->list_lock);
1895                 sctx->curr = sctx->first_free;
1896                 if (sctx->curr != -1) {
1897                         sctx->first_free = sctx->bios[sctx->curr]->next_free;
1898                         sctx->bios[sctx->curr]->next_free = -1;
1899                         sctx->bios[sctx->curr]->page_count = 0;
1900                         spin_unlock(&sctx->list_lock);
1901                 } else {
1902                         spin_unlock(&sctx->list_lock);
1903                         wait_event(sctx->list_wait, sctx->first_free != -1);
1904                 }
1905         }
1906         sbio = sctx->bios[sctx->curr];
1907         if (sbio->page_count == 0) {
1908                 struct bio *bio;
1909
1910                 sbio->physical = spage->physical;
1911                 sbio->logical = spage->logical;
1912                 sbio->dev = spage->dev;
1913                 bio = sbio->bio;
1914                 if (!bio) {
1915                         bio = bio_alloc(GFP_NOFS, sctx->pages_per_rd_bio);
1916                         if (!bio)
1917                                 return -ENOMEM;
1918                         sbio->bio = bio;
1919                 }
1920
1921                 bio->bi_private = sbio;
1922                 bio->bi_end_io = scrub_bio_end_io;
1923                 bio->bi_bdev = sbio->dev->bdev;
1924                 bio->bi_sector = sbio->physical >> 9;
1925                 sbio->err = 0;
1926         } else if (sbio->physical + sbio->page_count * PAGE_SIZE !=
1927                    spage->physical ||
1928                    sbio->logical + sbio->page_count * PAGE_SIZE !=
1929                    spage->logical ||
1930                    sbio->dev != spage->dev) {
1931                 scrub_submit(sctx);
1932                 goto again;
1933         }
1934
1935         sbio->pagev[sbio->page_count] = spage;
1936         ret = bio_add_page(sbio->bio, spage->page, PAGE_SIZE, 0);
1937         if (ret != PAGE_SIZE) {
1938                 if (sbio->page_count < 1) {
1939                         bio_put(sbio->bio);
1940                         sbio->bio = NULL;
1941                         return -EIO;
1942                 }
1943                 scrub_submit(sctx);
1944                 goto again;
1945         }
1946
1947         scrub_block_get(sblock); /* one for the page added to the bio */
1948         atomic_inc(&sblock->outstanding_pages);
1949         sbio->page_count++;
1950         if (sbio->page_count == sctx->pages_per_rd_bio)
1951                 scrub_submit(sctx);
1952
1953         return 0;
1954 }
1955
1956 static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
1957                        u64 physical, struct btrfs_device *dev, u64 flags,
1958                        u64 gen, int mirror_num, u8 *csum, int force,
1959                        u64 physical_for_dev_replace)
1960 {
1961         struct scrub_block *sblock;
1962         int index;
1963
1964         sblock = kzalloc(sizeof(*sblock), GFP_NOFS);
1965         if (!sblock) {
1966                 spin_lock(&sctx->stat_lock);
1967                 sctx->stat.malloc_errors++;
1968                 spin_unlock(&sctx->stat_lock);
1969                 return -ENOMEM;
1970         }
1971
1972         /* one ref inside this function, plus one for each page added to
1973          * a bio later on */
1974         atomic_set(&sblock->ref_count, 1);
1975         sblock->sctx = sctx;
1976         sblock->no_io_error_seen = 1;
1977
1978         for (index = 0; len > 0; index++) {
1979                 struct scrub_page *spage;
1980                 u64 l = min_t(u64, len, PAGE_SIZE);
1981
1982                 spage = kzalloc(sizeof(*spage), GFP_NOFS);
1983                 if (!spage) {
1984 leave_nomem:
1985                         spin_lock(&sctx->stat_lock);
1986                         sctx->stat.malloc_errors++;
1987                         spin_unlock(&sctx->stat_lock);
1988                         scrub_block_put(sblock);
1989                         return -ENOMEM;
1990                 }
1991                 BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK);
1992                 scrub_page_get(spage);
1993                 sblock->pagev[index] = spage;
1994                 spage->sblock = sblock;
1995                 spage->dev = dev;
1996                 spage->flags = flags;
1997                 spage->generation = gen;
1998                 spage->logical = logical;
1999                 spage->physical = physical;
2000                 spage->physical_for_dev_replace = physical_for_dev_replace;
2001                 spage->mirror_num = mirror_num;
2002                 if (csum) {
2003                         spage->have_csum = 1;
2004                         memcpy(spage->csum, csum, sctx->csum_size);
2005                 } else {
2006                         spage->have_csum = 0;
2007                 }
2008                 sblock->page_count++;
2009                 spage->page = alloc_page(GFP_NOFS);
2010                 if (!spage->page)
2011                         goto leave_nomem;
2012                 len -= l;
2013                 logical += l;
2014                 physical += l;
2015                 physical_for_dev_replace += l;
2016         }
2017
2018         WARN_ON(sblock->page_count == 0);
2019         for (index = 0; index < sblock->page_count; index++) {
2020                 struct scrub_page *spage = sblock->pagev[index];
2021                 int ret;
2022
2023                 ret = scrub_add_page_to_rd_bio(sctx, spage);
2024                 if (ret) {
2025                         scrub_block_put(sblock);
2026                         return ret;
2027                 }
2028         }
2029
2030         if (force)
2031                 scrub_submit(sctx);
2032
2033         /* last one frees, either here or in bio completion for last page */
2034         scrub_block_put(sblock);
2035         return 0;
2036 }
2037
2038 static void scrub_bio_end_io(struct bio *bio, int err)
2039 {
2040         struct scrub_bio *sbio = bio->bi_private;
2041         struct btrfs_fs_info *fs_info = sbio->dev->dev_root->fs_info;
2042
2043         sbio->err = err;
2044         sbio->bio = bio;
2045
2046         btrfs_queue_worker(&fs_info->scrub_workers, &sbio->work);
2047 }
2048
2049 static void scrub_bio_end_io_worker(struct btrfs_work *work)
2050 {
2051         struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
2052         struct scrub_ctx *sctx = sbio->sctx;
2053         int i;
2054
2055         BUG_ON(sbio->page_count > SCRUB_PAGES_PER_RD_BIO);
2056         if (sbio->err) {
2057                 for (i = 0; i < sbio->page_count; i++) {
2058                         struct scrub_page *spage = sbio->pagev[i];
2059
2060                         spage->io_error = 1;
2061                         spage->sblock->no_io_error_seen = 0;
2062                 }
2063         }
2064
2065         /* now complete the scrub_block items that have all pages completed */
2066         for (i = 0; i < sbio->page_count; i++) {
2067                 struct scrub_page *spage = sbio->pagev[i];
2068                 struct scrub_block *sblock = spage->sblock;
2069
2070                 if (atomic_dec_and_test(&sblock->outstanding_pages))
2071                         scrub_block_complete(sblock);
2072                 scrub_block_put(sblock);
2073         }
2074
2075         bio_put(sbio->bio);
2076         sbio->bio = NULL;
2077         spin_lock(&sctx->list_lock);
2078         sbio->next_free = sctx->first_free;
2079         sctx->first_free = sbio->index;
2080         spin_unlock(&sctx->list_lock);
2081
2082         if (sctx->is_dev_replace &&
2083             atomic_read(&sctx->wr_ctx.flush_all_writes)) {
2084                 mutex_lock(&sctx->wr_ctx.wr_lock);
2085                 scrub_wr_submit(sctx);
2086                 mutex_unlock(&sctx->wr_ctx.wr_lock);
2087         }
2088
2089         scrub_pending_bio_dec(sctx);
2090 }
2091
2092 static void scrub_block_complete(struct scrub_block *sblock)
2093 {
2094         if (!sblock->no_io_error_seen) {
2095                 scrub_handle_errored_block(sblock);
2096         } else {
2097                 /*
2098                  * if has checksum error, write via repair mechanism in
2099                  * dev replace case, otherwise write here in dev replace
2100                  * case.
2101                  */
2102                 if (!scrub_checksum(sblock) && sblock->sctx->is_dev_replace)
2103                         scrub_write_block_to_dev_replace(sblock);
2104         }
2105 }
2106
2107 static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u64 len,
2108                            u8 *csum)
2109 {
2110         struct btrfs_ordered_sum *sum = NULL;
2111         int ret = 0;
2112         unsigned long i;
2113         unsigned long num_sectors;
2114
2115         while (!list_empty(&sctx->csum_list)) {
2116                 sum = list_first_entry(&sctx->csum_list,
2117                                        struct btrfs_ordered_sum, list);
2118                 if (sum->bytenr > logical)
2119                         return 0;
2120                 if (sum->bytenr + sum->len > logical)
2121                         break;
2122
2123                 ++sctx->stat.csum_discards;
2124                 list_del(&sum->list);
2125                 kfree(sum);
2126                 sum = NULL;
2127         }
2128         if (!sum)
2129                 return 0;
2130
2131         num_sectors = sum->len / sctx->sectorsize;
2132         for (i = 0; i < num_sectors; ++i) {
2133                 if (sum->sums[i].bytenr == logical) {
2134                         memcpy(csum, &sum->sums[i].sum, sctx->csum_size);
2135                         ret = 1;
2136                         break;
2137                 }
2138         }
2139         if (ret && i == num_sectors - 1) {
2140                 list_del(&sum->list);
2141                 kfree(sum);
2142         }
2143         return ret;
2144 }
2145
2146 /* scrub extent tries to collect up to 64 kB for each bio */
2147 static int scrub_extent(struct scrub_ctx *sctx, u64 logical, u64 len,
2148                         u64 physical, struct btrfs_device *dev, u64 flags,
2149                         u64 gen, int mirror_num, u64 physical_for_dev_replace)
2150 {
2151         int ret;
2152         u8 csum[BTRFS_CSUM_SIZE];
2153         u32 blocksize;
2154
2155         if (flags & BTRFS_EXTENT_FLAG_DATA) {
2156                 blocksize = sctx->sectorsize;
2157                 spin_lock(&sctx->stat_lock);
2158                 sctx->stat.data_extents_scrubbed++;
2159                 sctx->stat.data_bytes_scrubbed += len;
2160                 spin_unlock(&sctx->stat_lock);
2161         } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
2162                 WARN_ON(sctx->nodesize != sctx->leafsize);
2163                 blocksize = sctx->nodesize;
2164                 spin_lock(&sctx->stat_lock);
2165                 sctx->stat.tree_extents_scrubbed++;
2166                 sctx->stat.tree_bytes_scrubbed += len;
2167                 spin_unlock(&sctx->stat_lock);
2168         } else {
2169                 blocksize = sctx->sectorsize;
2170                 WARN_ON(1);
2171         }
2172
2173         while (len) {
2174                 u64 l = min_t(u64, len, blocksize);
2175                 int have_csum = 0;
2176
2177                 if (flags & BTRFS_EXTENT_FLAG_DATA) {
2178                         /* push csums to sbio */
2179                         have_csum = scrub_find_csum(sctx, logical, l, csum);
2180                         if (have_csum == 0)
2181                                 ++sctx->stat.no_csum;
2182                         if (sctx->is_dev_replace && !have_csum) {
2183                                 ret = copy_nocow_pages(sctx, logical, l,
2184                                                        mirror_num,
2185                                                       physical_for_dev_replace);
2186                                 goto behind_scrub_pages;
2187                         }
2188                 }
2189                 ret = scrub_pages(sctx, logical, l, physical, dev, flags, gen,
2190                                   mirror_num, have_csum ? csum : NULL, 0,
2191                                   physical_for_dev_replace);
2192 behind_scrub_pages:
2193                 if (ret)
2194                         return ret;
2195                 len -= l;
2196                 logical += l;
2197                 physical += l;
2198                 physical_for_dev_replace += l;
2199         }
2200         return 0;
2201 }
2202
2203 static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
2204                                            struct map_lookup *map,
2205                                            struct btrfs_device *scrub_dev,
2206                                            int num, u64 base, u64 length,
2207                                            int is_dev_replace)
2208 {
2209         struct btrfs_path *path;
2210         struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
2211         struct btrfs_root *root = fs_info->extent_root;
2212         struct btrfs_root *csum_root = fs_info->csum_root;
2213         struct btrfs_extent_item *extent;
2214         struct blk_plug plug;
2215         u64 flags;
2216         int ret;
2217         int slot;
2218         int i;
2219         u64 nstripes;
2220         struct extent_buffer *l;
2221         struct btrfs_key key;
2222         u64 physical;
2223         u64 logical;
2224         u64 generation;
2225         int mirror_num;
2226         struct reada_control *reada1;
2227         struct reada_control *reada2;
2228         struct btrfs_key key_start;
2229         struct btrfs_key key_end;
2230         u64 increment = map->stripe_len;
2231         u64 offset;
2232         u64 extent_logical;
2233         u64 extent_physical;
2234         u64 extent_len;
2235         struct btrfs_device *extent_dev;
2236         int extent_mirror_num;
2237
2238         nstripes = length;
2239         offset = 0;
2240         do_div(nstripes, map->stripe_len);
2241         if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
2242                 offset = map->stripe_len * num;
2243                 increment = map->stripe_len * map->num_stripes;
2244                 mirror_num = 1;
2245         } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
2246                 int factor = map->num_stripes / map->sub_stripes;
2247                 offset = map->stripe_len * (num / map->sub_stripes);
2248                 increment = map->stripe_len * factor;
2249                 mirror_num = num % map->sub_stripes + 1;
2250         } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
2251                 increment = map->stripe_len;
2252                 mirror_num = num % map->num_stripes + 1;
2253         } else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
2254                 increment = map->stripe_len;
2255                 mirror_num = num % map->num_stripes + 1;
2256         } else {
2257                 increment = map->stripe_len;
2258                 mirror_num = 1;
2259         }
2260
2261         path = btrfs_alloc_path();
2262         if (!path)
2263                 return -ENOMEM;
2264
2265         /*
2266          * work on commit root. The related disk blocks are static as
2267          * long as COW is applied. This means, it is save to rewrite
2268          * them to repair disk errors without any race conditions
2269          */
2270         path->search_commit_root = 1;
2271         path->skip_locking = 1;
2272
2273         /*
2274          * trigger the readahead for extent tree csum tree and wait for
2275          * completion. During readahead, the scrub is officially paused
2276          * to not hold off transaction commits
2277          */
2278         logical = base + offset;
2279
2280         wait_event(sctx->list_wait,
2281                    atomic_read(&sctx->bios_in_flight) == 0);
2282         atomic_inc(&fs_info->scrubs_paused);
2283         wake_up(&fs_info->scrub_pause_wait);
2284
2285         /* FIXME it might be better to start readahead at commit root */
2286         key_start.objectid = logical;
2287         key_start.type = BTRFS_EXTENT_ITEM_KEY;
2288         key_start.offset = (u64)0;
2289         key_end.objectid = base + offset + nstripes * increment;
2290         key_end.type = BTRFS_EXTENT_ITEM_KEY;
2291         key_end.offset = (u64)0;
2292         reada1 = btrfs_reada_add(root, &key_start, &key_end);
2293
2294         key_start.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
2295         key_start.type = BTRFS_EXTENT_CSUM_KEY;
2296         key_start.offset = logical;
2297         key_end.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
2298         key_end.type = BTRFS_EXTENT_CSUM_KEY;
2299         key_end.offset = base + offset + nstripes * increment;
2300         reada2 = btrfs_reada_add(csum_root, &key_start, &key_end);
2301
2302         if (!IS_ERR(reada1))
2303                 btrfs_reada_wait(reada1);
2304         if (!IS_ERR(reada2))
2305                 btrfs_reada_wait(reada2);
2306
2307         mutex_lock(&fs_info->scrub_lock);
2308         while (atomic_read(&fs_info->scrub_pause_req)) {
2309                 mutex_unlock(&fs_info->scrub_lock);
2310                 wait_event(fs_info->scrub_pause_wait,
2311                    atomic_read(&fs_info->scrub_pause_req) == 0);
2312                 mutex_lock(&fs_info->scrub_lock);
2313         }
2314         atomic_dec(&fs_info->scrubs_paused);
2315         mutex_unlock(&fs_info->scrub_lock);
2316         wake_up(&fs_info->scrub_pause_wait);
2317
2318         /*
2319          * collect all data csums for the stripe to avoid seeking during
2320          * the scrub. This might currently (crc32) end up to be about 1MB
2321          */
2322         blk_start_plug(&plug);
2323
2324         /*
2325          * now find all extents for each stripe and scrub them
2326          */
2327         logical = base + offset;
2328         physical = map->stripes[num].physical;
2329         ret = 0;
2330         for (i = 0; i < nstripes; ++i) {
2331                 /*
2332                  * canceled?
2333                  */
2334                 if (atomic_read(&fs_info->scrub_cancel_req) ||
2335                     atomic_read(&sctx->cancel_req)) {
2336                         ret = -ECANCELED;
2337                         goto out;
2338                 }
2339                 /*
2340                  * check to see if we have to pause
2341                  */
2342                 if (atomic_read(&fs_info->scrub_pause_req)) {
2343                         /* push queued extents */
2344                         atomic_set(&sctx->wr_ctx.flush_all_writes, 1);
2345                         scrub_submit(sctx);
2346                         mutex_lock(&sctx->wr_ctx.wr_lock);
2347                         scrub_wr_submit(sctx);
2348                         mutex_unlock(&sctx->wr_ctx.wr_lock);
2349                         wait_event(sctx->list_wait,
2350                                    atomic_read(&sctx->bios_in_flight) == 0);
2351                         atomic_set(&sctx->wr_ctx.flush_all_writes, 0);
2352                         atomic_inc(&fs_info->scrubs_paused);
2353                         wake_up(&fs_info->scrub_pause_wait);
2354                         mutex_lock(&fs_info->scrub_lock);
2355                         while (atomic_read(&fs_info->scrub_pause_req)) {
2356                                 mutex_unlock(&fs_info->scrub_lock);
2357                                 wait_event(fs_info->scrub_pause_wait,
2358                                    atomic_read(&fs_info->scrub_pause_req) == 0);
2359                                 mutex_lock(&fs_info->scrub_lock);
2360                         }
2361                         atomic_dec(&fs_info->scrubs_paused);
2362                         mutex_unlock(&fs_info->scrub_lock);
2363                         wake_up(&fs_info->scrub_pause_wait);
2364                 }
2365
2366                 ret = btrfs_lookup_csums_range(csum_root, logical,
2367                                                logical + map->stripe_len - 1,
2368                                                &sctx->csum_list, 1);
2369                 if (ret)
2370                         goto out;
2371
2372                 key.objectid = logical;
2373                 key.type = BTRFS_EXTENT_ITEM_KEY;
2374                 key.offset = (u64)0;
2375
2376                 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2377                 if (ret < 0)
2378                         goto out;
2379                 if (ret > 0) {
2380                         ret = btrfs_previous_item(root, path, 0,
2381                                                   BTRFS_EXTENT_ITEM_KEY);
2382                         if (ret < 0)
2383                                 goto out;
2384                         if (ret > 0) {
2385                                 /* there's no smaller item, so stick with the
2386                                  * larger one */
2387                                 btrfs_release_path(path);
2388                                 ret = btrfs_search_slot(NULL, root, &key,
2389                                                         path, 0, 0);
2390                                 if (ret < 0)
2391                                         goto out;
2392                         }
2393                 }
2394
2395                 while (1) {
2396                         l = path->nodes[0];
2397                         slot = path->slots[0];
2398                         if (slot >= btrfs_header_nritems(l)) {
2399                                 ret = btrfs_next_leaf(root, path);
2400                                 if (ret == 0)
2401                                         continue;
2402                                 if (ret < 0)
2403                                         goto out;
2404
2405                                 break;
2406                         }
2407                         btrfs_item_key_to_cpu(l, &key, slot);
2408
2409                         if (key.objectid + key.offset <= logical)
2410                                 goto next;
2411
2412                         if (key.objectid >= logical + map->stripe_len)
2413                                 break;
2414
2415                         if (btrfs_key_type(&key) != BTRFS_EXTENT_ITEM_KEY)
2416                                 goto next;
2417
2418                         extent = btrfs_item_ptr(l, slot,
2419                                                 struct btrfs_extent_item);
2420                         flags = btrfs_extent_flags(l, extent);
2421                         generation = btrfs_extent_generation(l, extent);
2422
2423                         if (key.objectid < logical &&
2424                             (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)) {
2425                                 printk(KERN_ERR
2426                                        "btrfs scrub: tree block %llu spanning "
2427                                        "stripes, ignored. logical=%llu\n",
2428                                        (unsigned long long)key.objectid,
2429                                        (unsigned long long)logical);
2430                                 goto next;
2431                         }
2432
2433                         /*
2434                          * trim extent to this stripe
2435                          */
2436                         if (key.objectid < logical) {
2437                                 key.offset -= logical - key.objectid;
2438                                 key.objectid = logical;
2439                         }
2440                         if (key.objectid + key.offset >
2441                             logical + map->stripe_len) {
2442                                 key.offset = logical + map->stripe_len -
2443                                              key.objectid;
2444                         }
2445
2446                         extent_logical = key.objectid;
2447                         extent_physical = key.objectid - logical + physical;
2448                         extent_len = key.offset;
2449                         extent_dev = scrub_dev;
2450                         extent_mirror_num = mirror_num;
2451                         if (is_dev_replace)
2452                                 scrub_remap_extent(fs_info, extent_logical,
2453                                                    extent_len, &extent_physical,
2454                                                    &extent_dev,
2455                                                    &extent_mirror_num);
2456                         ret = scrub_extent(sctx, extent_logical, extent_len,
2457                                            extent_physical, extent_dev, flags,
2458                                            generation, extent_mirror_num,
2459                                            key.objectid - logical + physical);
2460                         if (ret)
2461                                 goto out;
2462
2463 next:
2464                         path->slots[0]++;
2465                 }
2466                 btrfs_release_path(path);
2467                 logical += increment;
2468                 physical += map->stripe_len;
2469                 spin_lock(&sctx->stat_lock);
2470                 sctx->stat.last_physical = physical;
2471                 spin_unlock(&sctx->stat_lock);
2472         }
2473 out:
2474         /* push queued extents */
2475         scrub_submit(sctx);
2476         mutex_lock(&sctx->wr_ctx.wr_lock);
2477         scrub_wr_submit(sctx);
2478         mutex_unlock(&sctx->wr_ctx.wr_lock);
2479
2480         blk_finish_plug(&plug);
2481         btrfs_free_path(path);
2482         return ret < 0 ? ret : 0;
2483 }
2484
2485 static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx,
2486                                           struct btrfs_device *scrub_dev,
2487                                           u64 chunk_tree, u64 chunk_objectid,
2488                                           u64 chunk_offset, u64 length,
2489                                           u64 dev_offset, int is_dev_replace)
2490 {
2491         struct btrfs_mapping_tree *map_tree =
2492                 &sctx->dev_root->fs_info->mapping_tree;
2493         struct map_lookup *map;
2494         struct extent_map *em;
2495         int i;
2496         int ret = 0;
2497
2498         read_lock(&map_tree->map_tree.lock);
2499         em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1);
2500         read_unlock(&map_tree->map_tree.lock);
2501
2502         if (!em)
2503                 return -EINVAL;
2504
2505         map = (struct map_lookup *)em->bdev;
2506         if (em->start != chunk_offset)
2507                 goto out;
2508
2509         if (em->len < length)
2510                 goto out;
2511
2512         for (i = 0; i < map->num_stripes; ++i) {
2513                 if (map->stripes[i].dev->bdev == scrub_dev->bdev &&
2514                     map->stripes[i].physical == dev_offset) {
2515                         ret = scrub_stripe(sctx, map, scrub_dev, i,
2516                                            chunk_offset, length,
2517                                            is_dev_replace);
2518                         if (ret)
2519                                 goto out;
2520                 }
2521         }
2522 out:
2523         free_extent_map(em);
2524
2525         return ret;
2526 }
2527
2528 static noinline_for_stack
2529 int scrub_enumerate_chunks(struct scrub_ctx *sctx,
2530                            struct btrfs_device *scrub_dev, u64 start, u64 end,
2531                            int is_dev_replace)
2532 {
2533         struct btrfs_dev_extent *dev_extent = NULL;
2534         struct btrfs_path *path;
2535         struct btrfs_root *root = sctx->dev_root;
2536         struct btrfs_fs_info *fs_info = root->fs_info;
2537         u64 length;
2538         u64 chunk_tree;
2539         u64 chunk_objectid;
2540         u64 chunk_offset;
2541         int ret;
2542         int slot;
2543         struct extent_buffer *l;
2544         struct btrfs_key key;
2545         struct btrfs_key found_key;
2546         struct btrfs_block_group_cache *cache;
2547         struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
2548
2549         path = btrfs_alloc_path();
2550         if (!path)
2551                 return -ENOMEM;
2552
2553         path->reada = 2;
2554         path->search_commit_root = 1;
2555         path->skip_locking = 1;
2556
2557         key.objectid = scrub_dev->devid;
2558         key.offset = 0ull;
2559         key.type = BTRFS_DEV_EXTENT_KEY;
2560
2561         while (1) {
2562                 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2563                 if (ret < 0)
2564                         break;
2565                 if (ret > 0) {
2566                         if (path->slots[0] >=
2567                             btrfs_header_nritems(path->nodes[0])) {
2568                                 ret = btrfs_next_leaf(root, path);
2569                                 if (ret)
2570                                         break;
2571                         }
2572                 }
2573
2574                 l = path->nodes[0];
2575                 slot = path->slots[0];
2576
2577                 btrfs_item_key_to_cpu(l, &found_key, slot);
2578
2579                 if (found_key.objectid != scrub_dev->devid)
2580                         break;
2581
2582                 if (btrfs_key_type(&found_key) != BTRFS_DEV_EXTENT_KEY)
2583                         break;
2584
2585                 if (found_key.offset >= end)
2586                         break;
2587
2588                 if (found_key.offset < key.offset)
2589                         break;
2590
2591                 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
2592                 length = btrfs_dev_extent_length(l, dev_extent);
2593
2594                 if (found_key.offset + length <= start) {
2595                         key.offset = found_key.offset + length;
2596                         btrfs_release_path(path);
2597                         continue;
2598                 }
2599
2600                 chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent);
2601                 chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent);
2602                 chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
2603
2604                 /*
2605                  * get a reference on the corresponding block group to prevent
2606                  * the chunk from going away while we scrub it
2607                  */
2608                 cache = btrfs_lookup_block_group(fs_info, chunk_offset);
2609                 if (!cache) {
2610                         ret = -ENOENT;
2611                         break;
2612                 }
2613                 dev_replace->cursor_right = found_key.offset + length;
2614                 dev_replace->cursor_left = found_key.offset;
2615                 dev_replace->item_needs_writeback = 1;
2616                 ret = scrub_chunk(sctx, scrub_dev, chunk_tree, chunk_objectid,
2617                                   chunk_offset, length, found_key.offset,
2618                                   is_dev_replace);
2619
2620                 /*
2621                  * flush, submit all pending read and write bios, afterwards
2622                  * wait for them.
2623                  * Note that in the dev replace case, a read request causes
2624                  * write requests that are submitted in the read completion
2625                  * worker. Therefore in the current situation, it is required
2626                  * that all write requests are flushed, so that all read and
2627                  * write requests are really completed when bios_in_flight
2628                  * changes to 0.
2629                  */
2630                 atomic_set(&sctx->wr_ctx.flush_all_writes, 1);
2631                 scrub_submit(sctx);
2632                 mutex_lock(&sctx->wr_ctx.wr_lock);
2633                 scrub_wr_submit(sctx);
2634                 mutex_unlock(&sctx->wr_ctx.wr_lock);
2635
2636                 wait_event(sctx->list_wait,
2637                            atomic_read(&sctx->bios_in_flight) == 0);
2638                 atomic_set(&sctx->wr_ctx.flush_all_writes, 0);
2639                 atomic_inc(&fs_info->scrubs_paused);
2640                 wake_up(&fs_info->scrub_pause_wait);
2641                 wait_event(sctx->list_wait,
2642                            atomic_read(&sctx->workers_pending) == 0);
2643
2644                 mutex_lock(&fs_info->scrub_lock);
2645                 while (atomic_read(&fs_info->scrub_pause_req)) {
2646                         mutex_unlock(&fs_info->scrub_lock);
2647                         wait_event(fs_info->scrub_pause_wait,
2648                            atomic_read(&fs_info->scrub_pause_req) == 0);
2649                         mutex_lock(&fs_info->scrub_lock);
2650                 }
2651                 atomic_dec(&fs_info->scrubs_paused);
2652                 mutex_unlock(&fs_info->scrub_lock);
2653                 wake_up(&fs_info->scrub_pause_wait);
2654
2655                 dev_replace->cursor_left = dev_replace->cursor_right;
2656                 dev_replace->item_needs_writeback = 1;
2657                 btrfs_put_block_group(cache);
2658                 if (ret)
2659                         break;
2660                 if (atomic64_read(&dev_replace->num_write_errors) > 0) {
2661                         ret = -EIO;
2662                         break;
2663                 }
2664                 if (sctx->stat.malloc_errors > 0) {
2665                         ret = -ENOMEM;
2666                         break;
2667                 }
2668
2669                 key.offset = found_key.offset + length;
2670                 btrfs_release_path(path);
2671         }
2672
2673         btrfs_free_path(path);
2674
2675         /*
2676          * ret can still be 1 from search_slot or next_leaf,
2677          * that's not an error
2678          */
2679         return ret < 0 ? ret : 0;
2680 }
2681
2682 static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx,
2683                                            struct btrfs_device *scrub_dev)
2684 {
2685         int     i;
2686         u64     bytenr;
2687         u64     gen;
2688         int     ret;
2689         struct btrfs_root *root = sctx->dev_root;
2690
2691         if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)
2692                 return -EIO;
2693
2694         gen = root->fs_info->last_trans_committed;
2695
2696         for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
2697                 bytenr = btrfs_sb_offset(i);
2698                 if (bytenr + BTRFS_SUPER_INFO_SIZE > scrub_dev->total_bytes)
2699                         break;
2700
2701                 ret = scrub_pages(sctx, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr,
2702                                   scrub_dev, BTRFS_EXTENT_FLAG_SUPER, gen, i,
2703                                   NULL, 1, bytenr);
2704                 if (ret)
2705                         return ret;
2706         }
2707         wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
2708
2709         return 0;
2710 }
2711
2712 /*
2713  * get a reference count on fs_info->scrub_workers. start worker if necessary
2714  */
2715 static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info,
2716                                                 int is_dev_replace)
2717 {
2718         int ret = 0;
2719
2720         mutex_lock(&fs_info->scrub_lock);
2721         if (fs_info->scrub_workers_refcnt == 0) {
2722                 if (is_dev_replace)
2723                         btrfs_init_workers(&fs_info->scrub_workers, "scrub", 1,
2724                                         &fs_info->generic_worker);
2725                 else
2726                         btrfs_init_workers(&fs_info->scrub_workers, "scrub",
2727                                         fs_info->thread_pool_size,
2728                                         &fs_info->generic_worker);
2729                 fs_info->scrub_workers.idle_thresh = 4;
2730                 ret = btrfs_start_workers(&fs_info->scrub_workers);
2731                 if (ret)
2732                         goto out;
2733                 btrfs_init_workers(&fs_info->scrub_wr_completion_workers,
2734                                    "scrubwrc",
2735                                    fs_info->thread_pool_size,
2736                                    &fs_info->generic_worker);
2737                 fs_info->scrub_wr_completion_workers.idle_thresh = 2;
2738                 ret = btrfs_start_workers(
2739                                 &fs_info->scrub_wr_completion_workers);
2740                 if (ret)
2741                         goto out;
2742                 btrfs_init_workers(&fs_info->scrub_nocow_workers, "scrubnc", 1,
2743                                    &fs_info->generic_worker);
2744                 ret = btrfs_start_workers(&fs_info->scrub_nocow_workers);
2745                 if (ret)
2746                         goto out;
2747         }
2748         ++fs_info->scrub_workers_refcnt;
2749 out:
2750         mutex_unlock(&fs_info->scrub_lock);
2751
2752         return ret;
2753 }
2754
2755 static noinline_for_stack void scrub_workers_put(struct btrfs_fs_info *fs_info)
2756 {
2757         mutex_lock(&fs_info->scrub_lock);
2758         if (--fs_info->scrub_workers_refcnt == 0) {
2759                 btrfs_stop_workers(&fs_info->scrub_workers);
2760                 btrfs_stop_workers(&fs_info->scrub_wr_completion_workers);
2761                 btrfs_stop_workers(&fs_info->scrub_nocow_workers);
2762         }
2763         WARN_ON(fs_info->scrub_workers_refcnt < 0);
2764         mutex_unlock(&fs_info->scrub_lock);
2765 }
2766
2767 int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
2768                     u64 end, struct btrfs_scrub_progress *progress,
2769                     int readonly, int is_dev_replace)
2770 {
2771         struct scrub_ctx *sctx;
2772         int ret;
2773         struct btrfs_device *dev;
2774
2775         if (btrfs_fs_closing(fs_info))
2776                 return -EINVAL;
2777
2778         /*
2779          * check some assumptions
2780          */
2781         if (fs_info->chunk_root->nodesize != fs_info->chunk_root->leafsize) {
2782                 printk(KERN_ERR
2783                        "btrfs_scrub: size assumption nodesize == leafsize (%d == %d) fails\n",
2784                        fs_info->chunk_root->nodesize,
2785                        fs_info->chunk_root->leafsize);
2786                 return -EINVAL;
2787         }
2788
2789         if (fs_info->chunk_root->nodesize > BTRFS_STRIPE_LEN) {
2790                 /*
2791                  * in this case scrub is unable to calculate the checksum
2792                  * the way scrub is implemented. Do not handle this
2793                  * situation at all because it won't ever happen.
2794                  */
2795                 printk(KERN_ERR
2796                        "btrfs_scrub: size assumption nodesize <= BTRFS_STRIPE_LEN (%d <= %d) fails\n",
2797                        fs_info->chunk_root->nodesize, BTRFS_STRIPE_LEN);
2798                 return -EINVAL;
2799         }
2800
2801         if (fs_info->chunk_root->sectorsize != PAGE_SIZE) {
2802                 /* not supported for data w/o checksums */
2803                 printk(KERN_ERR
2804                        "btrfs_scrub: size assumption sectorsize != PAGE_SIZE (%d != %lld) fails\n",
2805                        fs_info->chunk_root->sectorsize,
2806                        (unsigned long long)PAGE_SIZE);
2807                 return -EINVAL;
2808         }
2809
2810         if (fs_info->chunk_root->nodesize >
2811             PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK ||
2812             fs_info->chunk_root->sectorsize >
2813             PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK) {
2814                 /*
2815                  * would exhaust the array bounds of pagev member in
2816                  * struct scrub_block
2817                  */
2818                 pr_err("btrfs_scrub: size assumption nodesize and sectorsize <= SCRUB_MAX_PAGES_PER_BLOCK (%d <= %d && %d <= %d) fails\n",
2819                        fs_info->chunk_root->nodesize,
2820                        SCRUB_MAX_PAGES_PER_BLOCK,
2821                        fs_info->chunk_root->sectorsize,
2822                        SCRUB_MAX_PAGES_PER_BLOCK);
2823                 return -EINVAL;
2824         }
2825
2826         ret = scrub_workers_get(fs_info, is_dev_replace);
2827         if (ret)
2828                 return ret;
2829
2830         mutex_lock(&fs_info->fs_devices->device_list_mutex);
2831         dev = btrfs_find_device(fs_info, devid, NULL, NULL);
2832         if (!dev || (dev->missing && !is_dev_replace)) {
2833                 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2834                 scrub_workers_put(fs_info);
2835                 return -ENODEV;
2836         }
2837         mutex_lock(&fs_info->scrub_lock);
2838
2839         if (!dev->in_fs_metadata || dev->is_tgtdev_for_dev_replace) {
2840                 mutex_unlock(&fs_info->scrub_lock);
2841                 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2842                 scrub_workers_put(fs_info);
2843                 return -EIO;
2844         }
2845
2846         btrfs_dev_replace_lock(&fs_info->dev_replace);
2847         if (dev->scrub_device ||
2848             (!is_dev_replace &&
2849              btrfs_dev_replace_is_ongoing(&fs_info->dev_replace))) {
2850                 btrfs_dev_replace_unlock(&fs_info->dev_replace);
2851                 mutex_unlock(&fs_info->scrub_lock);
2852                 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2853                 scrub_workers_put(fs_info);
2854                 return -EINPROGRESS;
2855         }
2856         btrfs_dev_replace_unlock(&fs_info->dev_replace);
2857         sctx = scrub_setup_ctx(dev, is_dev_replace);
2858         if (IS_ERR(sctx)) {
2859                 mutex_unlock(&fs_info->scrub_lock);
2860                 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2861                 scrub_workers_put(fs_info);
2862                 return PTR_ERR(sctx);
2863         }
2864         sctx->readonly = readonly;
2865         dev->scrub_device = sctx;
2866
2867         atomic_inc(&fs_info->scrubs_running);
2868         mutex_unlock(&fs_info->scrub_lock);
2869         mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2870
2871         if (!is_dev_replace) {
2872                 down_read(&fs_info->scrub_super_lock);
2873                 ret = scrub_supers(sctx, dev);
2874                 up_read(&fs_info->scrub_super_lock);
2875         }
2876
2877         if (!ret)
2878                 ret = scrub_enumerate_chunks(sctx, dev, start, end,
2879                                              is_dev_replace);
2880
2881         wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
2882         atomic_dec(&fs_info->scrubs_running);
2883         wake_up(&fs_info->scrub_pause_wait);
2884
2885         wait_event(sctx->list_wait, atomic_read(&sctx->workers_pending) == 0);
2886
2887         if (progress)
2888                 memcpy(progress, &sctx->stat, sizeof(*progress));
2889
2890         mutex_lock(&fs_info->scrub_lock);
2891         dev->scrub_device = NULL;
2892         mutex_unlock(&fs_info->scrub_lock);
2893
2894         scrub_free_ctx(sctx);
2895         scrub_workers_put(fs_info);
2896
2897         return ret;
2898 }
2899
2900 void btrfs_scrub_pause(struct btrfs_root *root)
2901 {
2902         struct btrfs_fs_info *fs_info = root->fs_info;
2903
2904         mutex_lock(&fs_info->scrub_lock);
2905         atomic_inc(&fs_info->scrub_pause_req);
2906         while (atomic_read(&fs_info->scrubs_paused) !=
2907                atomic_read(&fs_info->scrubs_running)) {
2908                 mutex_unlock(&fs_info->scrub_lock);
2909                 wait_event(fs_info->scrub_pause_wait,
2910                            atomic_read(&fs_info->scrubs_paused) ==
2911                            atomic_read(&fs_info->scrubs_running));
2912                 mutex_lock(&fs_info->scrub_lock);
2913         }
2914         mutex_unlock(&fs_info->scrub_lock);
2915 }
2916
2917 void btrfs_scrub_continue(struct btrfs_root *root)
2918 {
2919         struct btrfs_fs_info *fs_info = root->fs_info;
2920
2921         atomic_dec(&fs_info->scrub_pause_req);
2922         wake_up(&fs_info->scrub_pause_wait);
2923 }
2924
2925 void btrfs_scrub_pause_super(struct btrfs_root *root)
2926 {
2927         down_write(&root->fs_info->scrub_super_lock);
2928 }
2929
2930 void btrfs_scrub_continue_super(struct btrfs_root *root)
2931 {
2932         up_write(&root->fs_info->scrub_super_lock);
2933 }
2934
2935 int btrfs_scrub_cancel(struct btrfs_fs_info *fs_info)
2936 {
2937         mutex_lock(&fs_info->scrub_lock);
2938         if (!atomic_read(&fs_info->scrubs_running)) {
2939                 mutex_unlock(&fs_info->scrub_lock);
2940                 return -ENOTCONN;
2941         }
2942
2943         atomic_inc(&fs_info->scrub_cancel_req);
2944         while (atomic_read(&fs_info->scrubs_running)) {
2945                 mutex_unlock(&fs_info->scrub_lock);
2946                 wait_event(fs_info->scrub_pause_wait,
2947                            atomic_read(&fs_info->scrubs_running) == 0);
2948                 mutex_lock(&fs_info->scrub_lock);
2949         }
2950         atomic_dec(&fs_info->scrub_cancel_req);
2951         mutex_unlock(&fs_info->scrub_lock);
2952
2953         return 0;
2954 }
2955
2956 int btrfs_scrub_cancel_dev(struct btrfs_fs_info *fs_info,
2957                            struct btrfs_device *dev)
2958 {
2959         struct scrub_ctx *sctx;
2960
2961         mutex_lock(&fs_info->scrub_lock);
2962         sctx = dev->scrub_device;
2963         if (!sctx) {
2964                 mutex_unlock(&fs_info->scrub_lock);
2965                 return -ENOTCONN;
2966         }
2967         atomic_inc(&sctx->cancel_req);
2968         while (dev->scrub_device) {
2969                 mutex_unlock(&fs_info->scrub_lock);
2970                 wait_event(fs_info->scrub_pause_wait,
2971                            dev->scrub_device == NULL);
2972                 mutex_lock(&fs_info->scrub_lock);
2973         }
2974         mutex_unlock(&fs_info->scrub_lock);
2975
2976         return 0;
2977 }
2978
2979 int btrfs_scrub_cancel_devid(struct btrfs_root *root, u64 devid)
2980 {
2981         struct btrfs_fs_info *fs_info = root->fs_info;
2982         struct btrfs_device *dev;
2983         int ret;
2984
2985         /*
2986          * we have to hold the device_list_mutex here so the device
2987          * does not go away in cancel_dev. FIXME: find a better solution
2988          */
2989         mutex_lock(&fs_info->fs_devices->device_list_mutex);
2990         dev = btrfs_find_device(fs_info, devid, NULL, NULL);
2991         if (!dev) {
2992                 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2993                 return -ENODEV;
2994         }
2995         ret = btrfs_scrub_cancel_dev(fs_info, dev);
2996         mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2997
2998         return ret;
2999 }
3000
3001 int btrfs_scrub_progress(struct btrfs_root *root, u64 devid,
3002                          struct btrfs_scrub_progress *progress)
3003 {
3004         struct btrfs_device *dev;
3005         struct scrub_ctx *sctx = NULL;
3006
3007         mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
3008         dev = btrfs_find_device(root->fs_info, devid, NULL, NULL);
3009         if (dev)
3010                 sctx = dev->scrub_device;
3011         if (sctx)
3012                 memcpy(progress, &sctx->stat, sizeof(*progress));
3013         mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
3014
3015         return dev ? (sctx ? 0 : -ENOTCONN) : -ENODEV;
3016 }
3017
3018 static void scrub_remap_extent(struct btrfs_fs_info *fs_info,
3019                                u64 extent_logical, u64 extent_len,
3020                                u64 *extent_physical,
3021                                struct btrfs_device **extent_dev,
3022                                int *extent_mirror_num)
3023 {
3024         u64 mapped_length;
3025         struct btrfs_bio *bbio = NULL;
3026         int ret;
3027
3028         mapped_length = extent_len;
3029         ret = btrfs_map_block(fs_info, READ, extent_logical,
3030                               &mapped_length, &bbio, 0);
3031         if (ret || !bbio || mapped_length < extent_len ||
3032             !bbio->stripes[0].dev->bdev) {
3033                 kfree(bbio);
3034                 return;
3035         }
3036
3037         *extent_physical = bbio->stripes[0].physical;
3038         *extent_mirror_num = bbio->mirror_num;
3039         *extent_dev = bbio->stripes[0].dev;
3040         kfree(bbio);
3041 }
3042
3043 static int scrub_setup_wr_ctx(struct scrub_ctx *sctx,
3044                               struct scrub_wr_ctx *wr_ctx,
3045                               struct btrfs_fs_info *fs_info,
3046                               struct btrfs_device *dev,
3047                               int is_dev_replace)
3048 {
3049         WARN_ON(wr_ctx->wr_curr_bio != NULL);
3050
3051         mutex_init(&wr_ctx->wr_lock);
3052         wr_ctx->wr_curr_bio = NULL;
3053         if (!is_dev_replace)
3054                 return 0;
3055
3056         WARN_ON(!dev->bdev);
3057         wr_ctx->pages_per_wr_bio = min_t(int, SCRUB_PAGES_PER_WR_BIO,
3058                                          bio_get_nr_vecs(dev->bdev));
3059         wr_ctx->tgtdev = dev;
3060         atomic_set(&wr_ctx->flush_all_writes, 0);
3061         return 0;
3062 }
3063
3064 static void scrub_free_wr_ctx(struct scrub_wr_ctx *wr_ctx)
3065 {
3066         mutex_lock(&wr_ctx->wr_lock);
3067         kfree(wr_ctx->wr_curr_bio);
3068         wr_ctx->wr_curr_bio = NULL;
3069         mutex_unlock(&wr_ctx->wr_lock);
3070 }
3071
3072 static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
3073                             int mirror_num, u64 physical_for_dev_replace)
3074 {
3075         struct scrub_copy_nocow_ctx *nocow_ctx;
3076         struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
3077
3078         nocow_ctx = kzalloc(sizeof(*nocow_ctx), GFP_NOFS);
3079         if (!nocow_ctx) {
3080                 spin_lock(&sctx->stat_lock);
3081                 sctx->stat.malloc_errors++;
3082                 spin_unlock(&sctx->stat_lock);
3083                 return -ENOMEM;
3084         }
3085
3086         scrub_pending_trans_workers_inc(sctx);
3087
3088         nocow_ctx->sctx = sctx;
3089         nocow_ctx->logical = logical;
3090         nocow_ctx->len = len;
3091         nocow_ctx->mirror_num = mirror_num;
3092         nocow_ctx->physical_for_dev_replace = physical_for_dev_replace;
3093         nocow_ctx->work.func = copy_nocow_pages_worker;
3094         btrfs_queue_worker(&fs_info->scrub_nocow_workers,
3095                            &nocow_ctx->work);
3096
3097         return 0;
3098 }
3099
3100 static void copy_nocow_pages_worker(struct btrfs_work *work)
3101 {
3102         struct scrub_copy_nocow_ctx *nocow_ctx =
3103                 container_of(work, struct scrub_copy_nocow_ctx, work);
3104         struct scrub_ctx *sctx = nocow_ctx->sctx;
3105         u64 logical = nocow_ctx->logical;
3106         u64 len = nocow_ctx->len;
3107         int mirror_num = nocow_ctx->mirror_num;
3108         u64 physical_for_dev_replace = nocow_ctx->physical_for_dev_replace;
3109         int ret;
3110         struct btrfs_trans_handle *trans = NULL;
3111         struct btrfs_fs_info *fs_info;
3112         struct btrfs_path *path;
3113         struct btrfs_root *root;
3114         int not_written = 0;
3115
3116         fs_info = sctx->dev_root->fs_info;
3117         root = fs_info->extent_root;
3118
3119         path = btrfs_alloc_path();
3120         if (!path) {
3121                 spin_lock(&sctx->stat_lock);
3122                 sctx->stat.malloc_errors++;
3123                 spin_unlock(&sctx->stat_lock);
3124                 not_written = 1;
3125                 goto out;
3126         }
3127
3128         trans = btrfs_join_transaction(root);
3129         if (IS_ERR(trans)) {
3130                 not_written = 1;
3131                 goto out;
3132         }
3133
3134         ret = iterate_inodes_from_logical(logical, fs_info, path,
3135                                           copy_nocow_pages_for_inode,
3136                                           nocow_ctx);
3137         if (ret != 0 && ret != -ENOENT) {
3138                 pr_warn("iterate_inodes_from_logical() failed: log %llu, phys %llu, len %llu, mir %llu, ret %d\n",
3139                         (unsigned long long)logical,
3140                         (unsigned long long)physical_for_dev_replace,
3141                         (unsigned long long)len,
3142                         (unsigned long long)mirror_num, ret);
3143                 not_written = 1;
3144                 goto out;
3145         }
3146
3147 out:
3148         if (trans && !IS_ERR(trans))
3149                 btrfs_end_transaction(trans, root);
3150         if (not_written)
3151                 btrfs_dev_replace_stats_inc(&fs_info->dev_replace.
3152                                             num_uncorrectable_read_errors);
3153
3154         btrfs_free_path(path);
3155         kfree(nocow_ctx);
3156
3157         scrub_pending_trans_workers_dec(sctx);
3158 }
3159
3160 static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, void *ctx)
3161 {
3162         unsigned long index;
3163         struct scrub_copy_nocow_ctx *nocow_ctx = ctx;
3164         int ret = 0;
3165         struct btrfs_key key;
3166         struct inode *inode = NULL;
3167         struct btrfs_root *local_root;
3168         u64 physical_for_dev_replace;
3169         u64 len;
3170         struct btrfs_fs_info *fs_info = nocow_ctx->sctx->dev_root->fs_info;
3171
3172         key.objectid = root;
3173         key.type = BTRFS_ROOT_ITEM_KEY;
3174         key.offset = (u64)-1;
3175         local_root = btrfs_read_fs_root_no_name(fs_info, &key);
3176         if (IS_ERR(local_root))
3177                 return PTR_ERR(local_root);
3178
3179         key.type = BTRFS_INODE_ITEM_KEY;
3180         key.objectid = inum;
3181         key.offset = 0;
3182         inode = btrfs_iget(fs_info->sb, &key, local_root, NULL);
3183         if (IS_ERR(inode))
3184                 return PTR_ERR(inode);
3185
3186         physical_for_dev_replace = nocow_ctx->physical_for_dev_replace;
3187         len = nocow_ctx->len;
3188         while (len >= PAGE_CACHE_SIZE) {
3189                 struct page *page = NULL;
3190                 int ret_sub;
3191
3192                 index = offset >> PAGE_CACHE_SHIFT;
3193
3194                 page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
3195                 if (!page) {
3196                         pr_err("find_or_create_page() failed\n");
3197                         ret = -ENOMEM;
3198                         goto next_page;
3199                 }
3200
3201                 if (PageUptodate(page)) {
3202                         if (PageDirty(page))
3203                                 goto next_page;
3204                 } else {
3205                         ClearPageError(page);
3206                         ret_sub = extent_read_full_page(&BTRFS_I(inode)->
3207                                                          io_tree,
3208                                                         page, btrfs_get_extent,
3209                                                         nocow_ctx->mirror_num);
3210                         if (ret_sub) {
3211                                 ret = ret_sub;
3212                                 goto next_page;
3213                         }
3214                         wait_on_page_locked(page);
3215                         if (!PageUptodate(page)) {
3216                                 ret = -EIO;
3217                                 goto next_page;
3218                         }
3219                 }
3220                 ret_sub = write_page_nocow(nocow_ctx->sctx,
3221                                            physical_for_dev_replace, page);
3222                 if (ret_sub) {
3223                         ret = ret_sub;
3224                         goto next_page;
3225                 }
3226
3227 next_page:
3228                 if (page) {
3229                         unlock_page(page);
3230                         put_page(page);
3231                 }
3232                 offset += PAGE_CACHE_SIZE;
3233                 physical_for_dev_replace += PAGE_CACHE_SIZE;
3234                 len -= PAGE_CACHE_SIZE;
3235         }
3236
3237         if (inode)
3238                 iput(inode);
3239         return ret;
3240 }
3241
3242 static int write_page_nocow(struct scrub_ctx *sctx,
3243                             u64 physical_for_dev_replace, struct page *page)
3244 {
3245         struct bio *bio;
3246         struct btrfs_device *dev;
3247         int ret;
3248         DECLARE_COMPLETION_ONSTACK(compl);
3249
3250         dev = sctx->wr_ctx.tgtdev;
3251         if (!dev)
3252                 return -EIO;
3253         if (!dev->bdev) {
3254                 printk_ratelimited(KERN_WARNING
3255                         "btrfs: scrub write_page_nocow(bdev == NULL) is unexpected!\n");
3256                 return -EIO;
3257         }
3258         bio = bio_alloc(GFP_NOFS, 1);
3259         if (!bio) {
3260                 spin_lock(&sctx->stat_lock);
3261                 sctx->stat.malloc_errors++;
3262                 spin_unlock(&sctx->stat_lock);
3263                 return -ENOMEM;
3264         }
3265         bio->bi_private = &compl;
3266         bio->bi_end_io = scrub_complete_bio_end_io;
3267         bio->bi_size = 0;
3268         bio->bi_sector = physical_for_dev_replace >> 9;
3269         bio->bi_bdev = dev->bdev;
3270         ret = bio_add_page(bio, page, PAGE_CACHE_SIZE, 0);
3271         if (ret != PAGE_CACHE_SIZE) {
3272 leave_with_eio:
3273                 bio_put(bio);
3274                 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS);
3275                 return -EIO;
3276         }
3277         btrfsic_submit_bio(WRITE_SYNC, bio);
3278         wait_for_completion(&compl);
3279
3280         if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
3281                 goto leave_with_eio;
3282
3283         bio_put(bio);
3284         return 0;
3285 }