Btrfs: fix race between scrub and block group deletion
authorFilipe Manana <fdmanana@suse.com>
Thu, 19 Nov 2015 10:57:20 +0000 (10:57 +0000)
committerChris Mason <clm@fb.com>
Wed, 25 Nov 2015 13:19:51 +0000 (05:19 -0800)
Scrub can race with the cleaner kthread deleting block groups that are
unused (and with relocation too) leading to a failure with error -EINVAL
that gets returned to user space.

The following diagram illustrates how it happens:

              CPU 1                                 CPU 2

 cleaner kthread
   btrfs_delete_unused_bgs()

     gets block group X from
     fs_info->unused_bgs

     sets block group to RO

       btrfs_remove_chunk(bg X)

         deletes device extents

                                         scrub_enumerate_chunks()

                                           searches device tree using
                                           its commit root

                                           finds device extent for
                                           block group X

                                           gets block group X from the tree
                                           fs_info->block_group_cache_tree
                                           (via btrfs_lookup_block_group())

                                           sets bg X to RO (again)

          btrfs_remove_block_group(bg X)

            deletes block group from
            fs_info->block_group_cache_tree

            removes extent map from
            fs_info->mapping_tree

                                               scrub_chunk(offset X)

                                                 searches fs_info->mapping_tree
                                                 for extent map starting at
                                                 offset X

                                                    --> doesn't find any such
                                                        extent map
                                                    --> returns -EINVAL and scrub
                                                        errors out to userspace
                                                        with -EINVAL

Fix this by dealing with an extent map lookup failure as an indicator of
block group deletion.
Issue reproduced with fstest btrfs/071.

Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
fs/btrfs/scrub.c

index 6b3fd51d9a99ce95167fee40e79d577b992d5682..68af3169d527c1f8cc5d1df49ff7c2f5598161a4 100644 (file)
@@ -3432,7 +3432,9 @@ out:
 static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx,
                                          struct btrfs_device *scrub_dev,
                                          u64 chunk_offset, u64 length,
-                                         u64 dev_offset, int is_dev_replace)
+                                         u64 dev_offset,
+                                         struct btrfs_block_group_cache *cache,
+                                         int is_dev_replace)
 {
        struct btrfs_mapping_tree *map_tree =
                &sctx->dev_root->fs_info->mapping_tree;
@@ -3445,8 +3447,18 @@ static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx,
        em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1);
        read_unlock(&map_tree->map_tree.lock);
 
-       if (!em)
-               return -EINVAL;
+       if (!em) {
+               /*
+                * Might have been an unused block group deleted by the cleaner
+                * kthread or relocation.
+                */
+               spin_lock(&cache->lock);
+               if (!cache->removed)
+                       ret = -EINVAL;
+               spin_unlock(&cache->lock);
+
+               return ret;
+       }
 
        map = (struct map_lookup *)em->bdev;
        if (em->start != chunk_offset)
@@ -3592,7 +3604,7 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
                dev_replace->cursor_left = found_key.offset;
                dev_replace->item_needs_writeback = 1;
                ret = scrub_chunk(sctx, scrub_dev, chunk_offset, length,
-                                 found_key.offset, is_dev_replace);
+                                 found_key.offset, cache, is_dev_replace);
 
                /*
                 * flush, submit all pending read and write bios, afterwards