Merge branch 'restriper' of git://github.com/idryomov/btrfs-unstable into integration
authorChris Mason <chris.mason@oracle.com>
Mon, 16 Jan 2012 20:26:02 +0000 (15:26 -0500)
committerChris Mason <chris.mason@oracle.com>
Mon, 16 Jan 2012 20:26:02 +0000 (15:26 -0500)
fs/btrfs/ctree.h
fs/btrfs/disk-io.c
fs/btrfs/extent-tree.c
fs/btrfs/ioctl.c
fs/btrfs/ioctl.h
fs/btrfs/super.c
fs/btrfs/volumes.c
fs/btrfs/volumes.h

index 67385033323d6e49817398a1df9b1596df07e839..dfc136cc07d7e9c4193b80188b5f452b1183fd9d 100644 (file)
@@ -86,6 +86,9 @@ struct btrfs_ordered_sum;
 /* holds checksums of all the data extents */
 #define BTRFS_CSUM_TREE_OBJECTID 7ULL
 
+/* for storing balance parameters in the root tree */
+#define BTRFS_BALANCE_OBJECTID -4ULL
+
 /* orhpan objectid for tracking unlinked/truncated files */
 #define BTRFS_ORPHAN_OBJECTID -5ULL
 
@@ -692,6 +695,54 @@ struct btrfs_root_ref {
        __le16 name_len;
 } __attribute__ ((__packed__));
 
+struct btrfs_disk_balance_args {
+       /*
+        * profiles to operate on, single is denoted by
+        * BTRFS_AVAIL_ALLOC_BIT_SINGLE
+        */
+       __le64 profiles;
+
+       /* usage filter */
+       __le64 usage;
+
+       /* devid filter */
+       __le64 devid;
+
+       /* devid subset filter [pstart..pend) */
+       __le64 pstart;
+       __le64 pend;
+
+       /* btrfs virtual address space subset filter [vstart..vend) */
+       __le64 vstart;
+       __le64 vend;
+
+       /*
+        * profile to convert to, single is denoted by
+        * BTRFS_AVAIL_ALLOC_BIT_SINGLE
+        */
+       __le64 target;
+
+       /* BTRFS_BALANCE_ARGS_* */
+       __le64 flags;
+
+       __le64 unused[8];
+} __attribute__ ((__packed__));
+
+/*
+ * store balance parameters to disk so that balance can be properly
+ * resumed after crash or unmount
+ */
+struct btrfs_balance_item {
+       /* BTRFS_BALANCE_* */
+       __le64 flags;
+
+       struct btrfs_disk_balance_args data;
+       struct btrfs_disk_balance_args meta;
+       struct btrfs_disk_balance_args sys;
+
+       __le64 unused[4];
+} __attribute__ ((__packed__));
+
 #define BTRFS_FILE_EXTENT_INLINE 0
 #define BTRFS_FILE_EXTENT_REG 1
 #define BTRFS_FILE_EXTENT_PREALLOC 2
@@ -751,14 +802,32 @@ struct btrfs_csum_item {
 } __attribute__ ((__packed__));
 
 /* different types of block groups (and chunks) */
-#define BTRFS_BLOCK_GROUP_DATA     (1 << 0)
-#define BTRFS_BLOCK_GROUP_SYSTEM   (1 << 1)
-#define BTRFS_BLOCK_GROUP_METADATA (1 << 2)
-#define BTRFS_BLOCK_GROUP_RAID0    (1 << 3)
-#define BTRFS_BLOCK_GROUP_RAID1    (1 << 4)
-#define BTRFS_BLOCK_GROUP_DUP     (1 << 5)
-#define BTRFS_BLOCK_GROUP_RAID10   (1 << 6)
-#define BTRFS_NR_RAID_TYPES       5
+#define BTRFS_BLOCK_GROUP_DATA         (1ULL << 0)
+#define BTRFS_BLOCK_GROUP_SYSTEM       (1ULL << 1)
+#define BTRFS_BLOCK_GROUP_METADATA     (1ULL << 2)
+#define BTRFS_BLOCK_GROUP_RAID0                (1ULL << 3)
+#define BTRFS_BLOCK_GROUP_RAID1                (1ULL << 4)
+#define BTRFS_BLOCK_GROUP_DUP          (1ULL << 5)
+#define BTRFS_BLOCK_GROUP_RAID10       (1ULL << 6)
+#define BTRFS_BLOCK_GROUP_RESERVED     BTRFS_AVAIL_ALLOC_BIT_SINGLE
+#define BTRFS_NR_RAID_TYPES            5
+
+#define BTRFS_BLOCK_GROUP_TYPE_MASK    (BTRFS_BLOCK_GROUP_DATA |    \
+                                        BTRFS_BLOCK_GROUP_SYSTEM |  \
+                                        BTRFS_BLOCK_GROUP_METADATA)
+
+#define BTRFS_BLOCK_GROUP_PROFILE_MASK (BTRFS_BLOCK_GROUP_RAID0 |   \
+                                        BTRFS_BLOCK_GROUP_RAID1 |   \
+                                        BTRFS_BLOCK_GROUP_DUP |     \
+                                        BTRFS_BLOCK_GROUP_RAID10)
+/*
+ * We need a bit for restriper to be able to tell when chunks of type
+ * SINGLE are available.  This "extended" profile format is used in
+ * fs_info->avail_*_alloc_bits (in-memory) and balance item fields
+ * (on-disk).  The corresponding on-disk bit in chunk.type is reserved
+ * to avoid remappings between two formats in future.
+ */
+#define BTRFS_AVAIL_ALLOC_BIT_SINGLE   (1ULL << 48)
 
 struct btrfs_block_group_item {
        __le64 used;
@@ -916,6 +985,7 @@ struct btrfs_block_group_cache {
 struct reloc_control;
 struct btrfs_device;
 struct btrfs_fs_devices;
+struct btrfs_balance_control;
 struct btrfs_delayed_root;
 struct btrfs_fs_info {
        u8 fsid[BTRFS_FSID_SIZE];
@@ -1132,12 +1202,23 @@ struct btrfs_fs_info {
        spinlock_t ref_cache_lock;
        u64 total_ref_cache_size;
 
+       /*
+        * these three are in extended format (availability of single
+        * chunks is denoted by BTRFS_AVAIL_ALLOC_BIT_SINGLE bit, other
+        * types are denoted by corresponding BTRFS_BLOCK_GROUP_* bits)
+        */
        u64 avail_data_alloc_bits;
        u64 avail_metadata_alloc_bits;
        u64 avail_system_alloc_bits;
-       u64 data_alloc_profile;
-       u64 metadata_alloc_profile;
-       u64 system_alloc_profile;
+
+       /* restriper state */
+       spinlock_t balance_lock;
+       struct mutex balance_mutex;
+       atomic_t balance_running;
+       atomic_t balance_pause_req;
+       atomic_t balance_cancel_req;
+       struct btrfs_balance_control *balance_ctl;
+       wait_queue_head_t balance_wait_q;
 
        unsigned data_chunk_allocations;
        unsigned metadata_ratio;
@@ -1383,6 +1464,8 @@ struct btrfs_ioctl_defrag_range_args {
 #define BTRFS_DEV_ITEM_KEY     216
 #define BTRFS_CHUNK_ITEM_KEY   228
 
+#define BTRFS_BALANCE_ITEM_KEY 248
+
 /*
  * string items are for debugging.  They just store a short string of
  * data in the FS
@@ -1413,6 +1496,7 @@ struct btrfs_ioctl_defrag_range_args {
 #define BTRFS_MOUNT_AUTO_DEFRAG                (1 << 16)
 #define BTRFS_MOUNT_INODE_MAP_CACHE    (1 << 17)
 #define BTRFS_MOUNT_RECOVERY           (1 << 18)
+#define BTRFS_MOUNT_SKIP_BALANCE       (1 << 19)
 
 #define btrfs_clear_opt(o, opt)                ((o) &= ~BTRFS_MOUNT_##opt)
 #define btrfs_set_opt(o, opt)          ((o) |= BTRFS_MOUNT_##opt)
@@ -2077,8 +2161,86 @@ BTRFS_SETGET_STACK_FUNCS(backup_bytes_used, struct btrfs_root_backup,
 BTRFS_SETGET_STACK_FUNCS(backup_num_devices, struct btrfs_root_backup,
                   num_devices, 64);
 
-/* struct btrfs_super_block */
+/* struct btrfs_balance_item */
+BTRFS_SETGET_FUNCS(balance_flags, struct btrfs_balance_item, flags, 64);
+
+static inline void btrfs_balance_data(struct extent_buffer *eb,
+                                     struct btrfs_balance_item *bi,
+                                     struct btrfs_disk_balance_args *ba)
+{
+       read_eb_member(eb, bi, struct btrfs_balance_item, data, ba);
+}
+
+static inline void btrfs_set_balance_data(struct extent_buffer *eb,
+                                         struct btrfs_balance_item *bi,
+                                         struct btrfs_disk_balance_args *ba)
+{
+       write_eb_member(eb, bi, struct btrfs_balance_item, data, ba);
+}
+
+static inline void btrfs_balance_meta(struct extent_buffer *eb,
+                                     struct btrfs_balance_item *bi,
+                                     struct btrfs_disk_balance_args *ba)
+{
+       read_eb_member(eb, bi, struct btrfs_balance_item, meta, ba);
+}
 
+static inline void btrfs_set_balance_meta(struct extent_buffer *eb,
+                                         struct btrfs_balance_item *bi,
+                                         struct btrfs_disk_balance_args *ba)
+{
+       write_eb_member(eb, bi, struct btrfs_balance_item, meta, ba);
+}
+
+static inline void btrfs_balance_sys(struct extent_buffer *eb,
+                                    struct btrfs_balance_item *bi,
+                                    struct btrfs_disk_balance_args *ba)
+{
+       read_eb_member(eb, bi, struct btrfs_balance_item, sys, ba);
+}
+
+static inline void btrfs_set_balance_sys(struct extent_buffer *eb,
+                                        struct btrfs_balance_item *bi,
+                                        struct btrfs_disk_balance_args *ba)
+{
+       write_eb_member(eb, bi, struct btrfs_balance_item, sys, ba);
+}
+
+static inline void
+btrfs_disk_balance_args_to_cpu(struct btrfs_balance_args *cpu,
+                              struct btrfs_disk_balance_args *disk)
+{
+       memset(cpu, 0, sizeof(*cpu));
+
+       cpu->profiles = le64_to_cpu(disk->profiles);
+       cpu->usage = le64_to_cpu(disk->usage);
+       cpu->devid = le64_to_cpu(disk->devid);
+       cpu->pstart = le64_to_cpu(disk->pstart);
+       cpu->pend = le64_to_cpu(disk->pend);
+       cpu->vstart = le64_to_cpu(disk->vstart);
+       cpu->vend = le64_to_cpu(disk->vend);
+       cpu->target = le64_to_cpu(disk->target);
+       cpu->flags = le64_to_cpu(disk->flags);
+}
+
+static inline void
+btrfs_cpu_balance_args_to_disk(struct btrfs_disk_balance_args *disk,
+                              struct btrfs_balance_args *cpu)
+{
+       memset(disk, 0, sizeof(*disk));
+
+       disk->profiles = cpu_to_le64(cpu->profiles);
+       disk->usage = cpu_to_le64(cpu->usage);
+       disk->devid = cpu_to_le64(cpu->devid);
+       disk->pstart = cpu_to_le64(cpu->pstart);
+       disk->pend = cpu_to_le64(cpu->pend);
+       disk->vstart = cpu_to_le64(cpu->vstart);
+       disk->vend = cpu_to_le64(cpu->vend);
+       disk->target = cpu_to_le64(cpu->target);
+       disk->flags = cpu_to_le64(cpu->flags);
+}
+
+/* struct btrfs_super_block */
 BTRFS_SETGET_STACK_FUNCS(super_bytenr, struct btrfs_super_block, bytenr, 64);
 BTRFS_SETGET_STACK_FUNCS(super_flags, struct btrfs_super_block, flags, 64);
 BTRFS_SETGET_STACK_FUNCS(super_generation, struct btrfs_super_block,
@@ -2500,6 +2662,7 @@ static inline int btrfs_fs_closing(struct btrfs_fs_info *fs_info)
 }
 static inline void free_fs_info(struct btrfs_fs_info *fs_info)
 {
+       kfree(fs_info->balance_ctl);
        kfree(fs_info->delayed_root);
        kfree(fs_info->extent_root);
        kfree(fs_info->tree_root);
@@ -2510,6 +2673,24 @@ static inline void free_fs_info(struct btrfs_fs_info *fs_info)
        kfree(fs_info->super_for_commit);
        kfree(fs_info);
 }
+/**
+ * profile_is_valid - tests whether a given profile is valid and reduced
+ * @flags: profile to validate
+ * @extended: if true @flags is treated as an extended profile
+ */
+static inline int profile_is_valid(u64 flags, int extended)
+{
+       u64 mask = ~BTRFS_BLOCK_GROUP_PROFILE_MASK;
+
+       flags &= ~BTRFS_BLOCK_GROUP_TYPE_MASK;
+       if (extended)
+               mask &= ~BTRFS_AVAIL_ALLOC_BIT_SINGLE;
+
+       if (flags & mask)
+               return 0;
+       /* true if zero or exactly one bit set */
+       return (flags & (~flags + 1)) == flags;
+}
 
 /* root-item.c */
 int btrfs_find_root_ref(struct btrfs_root *tree_root,
index f44b3928dc2dc94cb62cefe72f1063f282dc09c2..9c1a744e595b076868e966867a6bd55e5f6376cd 100644 (file)
@@ -2002,6 +2002,14 @@ struct btrfs_root *open_ctree(struct super_block *sb,
        init_rwsem(&fs_info->scrub_super_lock);
        fs_info->scrub_workers_refcnt = 0;
 
+       spin_lock_init(&fs_info->balance_lock);
+       mutex_init(&fs_info->balance_mutex);
+       atomic_set(&fs_info->balance_running, 0);
+       atomic_set(&fs_info->balance_pause_req, 0);
+       atomic_set(&fs_info->balance_cancel_req, 0);
+       fs_info->balance_ctl = NULL;
+       init_waitqueue_head(&fs_info->balance_wait_q);
+
        sb->s_blocksize = 4096;
        sb->s_blocksize_bits = blksize_bits(4096);
        sb->s_bdi = &fs_info->bdi;
@@ -2321,9 +2329,6 @@ retry_root_backup:
 
        fs_info->generation = generation;
        fs_info->last_trans_committed = generation;
-       fs_info->data_alloc_profile = (u64)-1;
-       fs_info->metadata_alloc_profile = (u64)-1;
-       fs_info->system_alloc_profile = fs_info->metadata_alloc_profile;
 
        ret = btrfs_init_space_info(fs_info);
        if (ret) {
@@ -2426,6 +2431,10 @@ retry_root_backup:
                if (!err)
                        err = btrfs_orphan_cleanup(fs_info->tree_root);
                up_read(&fs_info->cleanup_work_sem);
+
+               if (!err)
+                       err = btrfs_recover_balance(fs_info->tree_root);
+
                if (err) {
                        close_ctree(tree_root);
                        return ERR_PTR(err);
@@ -2975,6 +2984,9 @@ int close_ctree(struct btrfs_root *root)
        fs_info->closing = 1;
        smp_mb();
 
+       /* pause restriper - we want to resume on mount */
+       btrfs_pause_balance(root->fs_info);
+
        btrfs_scrub_cancel(root);
 
        /* wait for any defraggers to finish */
index 37594e4bf660e166b5c3d2d37ff78a459076120d..352083ad233ca2a18b8c3817d3f3a5644000bf76 100644 (file)
@@ -618,8 +618,7 @@ static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
        struct list_head *head = &info->space_info;
        struct btrfs_space_info *found;
 
-       flags &= BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_SYSTEM |
-                BTRFS_BLOCK_GROUP_METADATA;
+       flags &= BTRFS_BLOCK_GROUP_TYPE_MASK;
 
        rcu_read_lock();
        list_for_each_entry_rcu(found, head, list) {
@@ -2999,9 +2998,7 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
                INIT_LIST_HEAD(&found->block_groups[i]);
        init_rwsem(&found->groups_sem);
        spin_lock_init(&found->lock);
-       found->flags = flags & (BTRFS_BLOCK_GROUP_DATA |
-                               BTRFS_BLOCK_GROUP_SYSTEM |
-                               BTRFS_BLOCK_GROUP_METADATA);
+       found->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK;
        found->total_bytes = total_bytes;
        found->disk_total = total_bytes * factor;
        found->bytes_used = bytes_used;
@@ -3022,20 +3019,27 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
 
 static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
 {
-       u64 extra_flags = flags & (BTRFS_BLOCK_GROUP_RAID0 |
-                                  BTRFS_BLOCK_GROUP_RAID1 |
-                                  BTRFS_BLOCK_GROUP_RAID10 |
-                                  BTRFS_BLOCK_GROUP_DUP);
-       if (extra_flags) {
-               if (flags & BTRFS_BLOCK_GROUP_DATA)
-                       fs_info->avail_data_alloc_bits |= extra_flags;
-               if (flags & BTRFS_BLOCK_GROUP_METADATA)
-                       fs_info->avail_metadata_alloc_bits |= extra_flags;
-               if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
-                       fs_info->avail_system_alloc_bits |= extra_flags;
-       }
+       u64 extra_flags = flags & BTRFS_BLOCK_GROUP_PROFILE_MASK;
+
+       /* chunk -> extended profile */
+       if (extra_flags == 0)
+               extra_flags = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
+
+       if (flags & BTRFS_BLOCK_GROUP_DATA)
+               fs_info->avail_data_alloc_bits |= extra_flags;
+       if (flags & BTRFS_BLOCK_GROUP_METADATA)
+               fs_info->avail_metadata_alloc_bits |= extra_flags;
+       if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
+               fs_info->avail_system_alloc_bits |= extra_flags;
 }
 
+/*
+ * @flags: available profiles in extended format (see ctree.h)
+ *
+ * Returns reduced profile in chunk format.  If profile changing is in
+ * progress (either running or paused) picks the target profile (if it's
+ * already available), otherwise falls back to plain reducing.
+ */
 u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
 {
        /*
@@ -3046,6 +3050,34 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
        u64 num_devices = root->fs_info->fs_devices->rw_devices +
                root->fs_info->fs_devices->missing_devices;
 
+       /* pick restriper's target profile if it's available */
+       spin_lock(&root->fs_info->balance_lock);
+       if (root->fs_info->balance_ctl) {
+               struct btrfs_balance_control *bctl = root->fs_info->balance_ctl;
+               u64 tgt = 0;
+
+               if ((flags & BTRFS_BLOCK_GROUP_DATA) &&
+                   (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
+                   (flags & bctl->data.target)) {
+                       tgt = BTRFS_BLOCK_GROUP_DATA | bctl->data.target;
+               } else if ((flags & BTRFS_BLOCK_GROUP_SYSTEM) &&
+                          (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
+                          (flags & bctl->sys.target)) {
+                       tgt = BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target;
+               } else if ((flags & BTRFS_BLOCK_GROUP_METADATA) &&
+                          (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
+                          (flags & bctl->meta.target)) {
+                       tgt = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target;
+               }
+
+               if (tgt) {
+                       spin_unlock(&root->fs_info->balance_lock);
+                       flags = tgt;
+                       goto out;
+               }
+       }
+       spin_unlock(&root->fs_info->balance_lock);
+
        if (num_devices == 1)
                flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0);
        if (num_devices < 4)
@@ -3065,22 +3097,25 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
        if ((flags & BTRFS_BLOCK_GROUP_RAID0) &&
            ((flags & BTRFS_BLOCK_GROUP_RAID1) |
             (flags & BTRFS_BLOCK_GROUP_RAID10) |
-            (flags & BTRFS_BLOCK_GROUP_DUP)))
+            (flags & BTRFS_BLOCK_GROUP_DUP))) {
                flags &= ~BTRFS_BLOCK_GROUP_RAID0;
+       }
+
+out:
+       /* extended -> chunk profile */
+       flags &= ~BTRFS_AVAIL_ALLOC_BIT_SINGLE;
        return flags;
 }
 
 static u64 get_alloc_profile(struct btrfs_root *root, u64 flags)
 {
        if (flags & BTRFS_BLOCK_GROUP_DATA)
-               flags |= root->fs_info->avail_data_alloc_bits &
-                        root->fs_info->data_alloc_profile;
+               flags |= root->fs_info->avail_data_alloc_bits;
        else if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
-               flags |= root->fs_info->avail_system_alloc_bits &
-                        root->fs_info->system_alloc_profile;
+               flags |= root->fs_info->avail_system_alloc_bits;
        else if (flags & BTRFS_BLOCK_GROUP_METADATA)
-               flags |= root->fs_info->avail_metadata_alloc_bits &
-                        root->fs_info->metadata_alloc_profile;
+               flags |= root->fs_info->avail_metadata_alloc_bits;
+
        return btrfs_reduce_alloc_profile(root, flags);
 }
 
@@ -3282,7 +3317,7 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
        int wait_for_alloc = 0;
        int ret = 0;
 
-       flags = btrfs_reduce_alloc_profile(extent_root, flags);
+       BUG_ON(!profile_is_valid(flags, 0));
 
        space_info = __find_space_info(extent_root->fs_info, flags);
        if (!space_info) {
@@ -6792,6 +6827,29 @@ static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
        u64 stripped = BTRFS_BLOCK_GROUP_RAID0 |
                BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10;
 
+       if (root->fs_info->balance_ctl) {
+               struct btrfs_balance_control *bctl = root->fs_info->balance_ctl;
+               u64 tgt = 0;
+
+               /* pick restriper's target profile and return */
+               if (flags & BTRFS_BLOCK_GROUP_DATA &&
+                   bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) {
+                       tgt = BTRFS_BLOCK_GROUP_DATA | bctl->data.target;
+               } else if (flags & BTRFS_BLOCK_GROUP_SYSTEM &&
+                          bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
+                       tgt = BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target;
+               } else if (flags & BTRFS_BLOCK_GROUP_METADATA &&
+                          bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) {
+                       tgt = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target;
+               }
+
+               if (tgt) {
+                       /* extended -> chunk profile */
+                       tgt &= ~BTRFS_AVAIL_ALLOC_BIT_SINGLE;
+                       return tgt;
+               }
+       }
+
        /*
         * we add in the count of missing devices because we want
         * to make sure that any RAID levels on a degraded FS
@@ -7466,6 +7524,22 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
        return 0;
 }
 
+static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
+{
+       u64 extra_flags = flags & BTRFS_BLOCK_GROUP_PROFILE_MASK;
+
+       /* chunk -> extended profile */
+       if (extra_flags == 0)
+               extra_flags = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
+
+       if (flags & BTRFS_BLOCK_GROUP_DATA)
+               fs_info->avail_data_alloc_bits &= ~extra_flags;
+       if (flags & BTRFS_BLOCK_GROUP_METADATA)
+               fs_info->avail_metadata_alloc_bits &= ~extra_flags;
+       if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
+               fs_info->avail_system_alloc_bits &= ~extra_flags;
+}
+
 int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
                             struct btrfs_root *root, u64 group_start)
 {
@@ -7476,6 +7550,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
        struct btrfs_key key;
        struct inode *inode;
        int ret;
+       int index;
        int factor;
 
        root = root->fs_info->extent_root;
@@ -7491,6 +7566,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
        free_excluded_extents(root, block_group);
 
        memcpy(&key, &block_group->key, sizeof(key));
+       index = get_block_group_index(block_group);
        if (block_group->flags & (BTRFS_BLOCK_GROUP_DUP |
                                  BTRFS_BLOCK_GROUP_RAID1 |
                                  BTRFS_BLOCK_GROUP_RAID10))
@@ -7565,6 +7641,8 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
         * are still on the list after taking the semaphore
         */
        list_del_init(&block_group->list);
+       if (list_empty(&block_group->space_info->block_groups[index]))
+               clear_avail_alloc_bits(root->fs_info, block_group->flags);
        up_write(&block_group->space_info->groups_sem);
 
        if (block_group->cached == BTRFS_CACHE_STARTED)
index c04f02c7d5bbea215557a1d8ae2fc8ccce6e5862..1e7a9bac31ab9f6774e98e76264893752921ff36 100644 (file)
@@ -1203,13 +1203,21 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;
 
+       mutex_lock(&root->fs_info->volume_mutex);
+       if (root->fs_info->balance_ctl) {
+               printk(KERN_INFO "btrfs: balance in progress\n");
+               ret = -EINVAL;
+               goto out;
+       }
+
        vol_args = memdup_user(arg, sizeof(*vol_args));
-       if (IS_ERR(vol_args))
-               return PTR_ERR(vol_args);
+       if (IS_ERR(vol_args)) {
+               ret = PTR_ERR(vol_args);
+               goto out;
+       }
 
        vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
 
-       mutex_lock(&root->fs_info->volume_mutex);
        sizestr = vol_args->name;
        devstr = strchr(sizestr, ':');
        if (devstr) {
@@ -1226,7 +1234,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
                printk(KERN_INFO "btrfs: resizer unable to find device %llu\n",
                       (unsigned long long)devid);
                ret = -EINVAL;
-               goto out_unlock;
+               goto out_free;
        }
        if (!strcmp(sizestr, "max"))
                new_size = device->bdev->bd_inode->i_size;
@@ -1241,7 +1249,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
                new_size = memparse(sizestr, NULL);
                if (new_size == 0) {
                        ret = -EINVAL;
-                       goto out_unlock;
+                       goto out_free;
                }
        }
 
@@ -1250,7 +1258,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
        if (mod < 0) {
                if (new_size > old_size) {
                        ret = -EINVAL;
-                       goto out_unlock;
+                       goto out_free;
                }
                new_size = old_size - new_size;
        } else if (mod > 0) {
@@ -1259,11 +1267,11 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
 
        if (new_size < 256 * 1024 * 1024) {
                ret = -EINVAL;
-               goto out_unlock;
+               goto out_free;
        }
        if (new_size > device->bdev->bd_inode->i_size) {
                ret = -EFBIG;
-               goto out_unlock;
+               goto out_free;
        }
 
        do_div(new_size, root->sectorsize);
@@ -1276,7 +1284,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
                trans = btrfs_start_transaction(root, 0);
                if (IS_ERR(trans)) {
                        ret = PTR_ERR(trans);
-                       goto out_unlock;
+                       goto out_free;
                }
                ret = btrfs_grow_device(trans, device, new_size);
                btrfs_commit_transaction(trans, root);
@@ -1284,9 +1292,10 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
                ret = btrfs_shrink_device(device, new_size);
        }
 
-out_unlock:
-       mutex_unlock(&root->fs_info->volume_mutex);
+out_free:
        kfree(vol_args);
+out:
+       mutex_unlock(&root->fs_info->volume_mutex);
        return ret;
 }
 
@@ -2052,14 +2061,25 @@ static long btrfs_ioctl_add_dev(struct btrfs_root *root, void __user *arg)
        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;
 
+       mutex_lock(&root->fs_info->volume_mutex);
+       if (root->fs_info->balance_ctl) {
+               printk(KERN_INFO "btrfs: balance in progress\n");
+               ret = -EINVAL;
+               goto out;
+       }
+
        vol_args = memdup_user(arg, sizeof(*vol_args));
-       if (IS_ERR(vol_args))
-               return PTR_ERR(vol_args);
+       if (IS_ERR(vol_args)) {
+               ret = PTR_ERR(vol_args);
+               goto out;
+       }
 
        vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
        ret = btrfs_init_new_device(root, vol_args->name);
 
        kfree(vol_args);
+out:
+       mutex_unlock(&root->fs_info->volume_mutex);
        return ret;
 }
 
@@ -2074,14 +2094,25 @@ static long btrfs_ioctl_rm_dev(struct btrfs_root *root, void __user *arg)
        if (root->fs_info->sb->s_flags & MS_RDONLY)
                return -EROFS;
 
+       mutex_lock(&root->fs_info->volume_mutex);
+       if (root->fs_info->balance_ctl) {
+               printk(KERN_INFO "btrfs: balance in progress\n");
+               ret = -EINVAL;
+               goto out;
+       }
+
        vol_args = memdup_user(arg, sizeof(*vol_args));
-       if (IS_ERR(vol_args))
-               return PTR_ERR(vol_args);
+       if (IS_ERR(vol_args)) {
+               ret = PTR_ERR(vol_args);
+               goto out;
+       }
 
        vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
        ret = btrfs_rm_device(root, vol_args->name);
 
        kfree(vol_args);
+out:
+       mutex_unlock(&root->fs_info->volume_mutex);
        return ret;
 }
 
@@ -3034,6 +3065,163 @@ out:
        return ret;
 }
 
+void update_ioctl_balance_args(struct btrfs_fs_info *fs_info, int lock,
+                              struct btrfs_ioctl_balance_args *bargs)
+{
+       struct btrfs_balance_control *bctl = fs_info->balance_ctl;
+
+       bargs->flags = bctl->flags;
+
+       if (atomic_read(&fs_info->balance_running))
+               bargs->state |= BTRFS_BALANCE_STATE_RUNNING;
+       if (atomic_read(&fs_info->balance_pause_req))
+               bargs->state |= BTRFS_BALANCE_STATE_PAUSE_REQ;
+       if (atomic_read(&fs_info->balance_cancel_req))
+               bargs->state |= BTRFS_BALANCE_STATE_CANCEL_REQ;
+
+       memcpy(&bargs->data, &bctl->data, sizeof(bargs->data));
+       memcpy(&bargs->meta, &bctl->meta, sizeof(bargs->meta));
+       memcpy(&bargs->sys, &bctl->sys, sizeof(bargs->sys));
+
+       if (lock) {
+               spin_lock(&fs_info->balance_lock);
+               memcpy(&bargs->stat, &bctl->stat, sizeof(bargs->stat));
+               spin_unlock(&fs_info->balance_lock);
+       } else {
+               memcpy(&bargs->stat, &bctl->stat, sizeof(bargs->stat));
+       }
+}
+
+static long btrfs_ioctl_balance(struct btrfs_root *root, void __user *arg)
+{
+       struct btrfs_fs_info *fs_info = root->fs_info;
+       struct btrfs_ioctl_balance_args *bargs;
+       struct btrfs_balance_control *bctl;
+       int ret;
+
+       if (!capable(CAP_SYS_ADMIN))
+               return -EPERM;
+
+       if (fs_info->sb->s_flags & MS_RDONLY)
+               return -EROFS;
+
+       mutex_lock(&fs_info->volume_mutex);
+       mutex_lock(&fs_info->balance_mutex);
+
+       if (arg) {
+               bargs = memdup_user(arg, sizeof(*bargs));
+               if (IS_ERR(bargs)) {
+                       ret = PTR_ERR(bargs);
+                       goto out;
+               }
+
+               if (bargs->flags & BTRFS_BALANCE_RESUME) {
+                       if (!fs_info->balance_ctl) {
+                               ret = -ENOTCONN;
+                               goto out_bargs;
+                       }
+
+                       bctl = fs_info->balance_ctl;
+                       spin_lock(&fs_info->balance_lock);
+                       bctl->flags |= BTRFS_BALANCE_RESUME;
+                       spin_unlock(&fs_info->balance_lock);
+
+                       goto do_balance;
+               }
+       } else {
+               bargs = NULL;
+       }
+
+       if (fs_info->balance_ctl) {
+               ret = -EINPROGRESS;
+               goto out_bargs;
+       }
+
+       bctl = kzalloc(sizeof(*bctl), GFP_NOFS);
+       if (!bctl) {
+               ret = -ENOMEM;
+               goto out_bargs;
+       }
+
+       bctl->fs_info = fs_info;
+       if (arg) {
+               memcpy(&bctl->data, &bargs->data, sizeof(bctl->data));
+               memcpy(&bctl->meta, &bargs->meta, sizeof(bctl->meta));
+               memcpy(&bctl->sys, &bargs->sys, sizeof(bctl->sys));
+
+               bctl->flags = bargs->flags;
+       } else {
+               /* balance everything - no filters */
+               bctl->flags |= BTRFS_BALANCE_TYPE_MASK;
+       }
+
+do_balance:
+       ret = btrfs_balance(bctl, bargs);
+       /*
+        * bctl is freed in __cancel_balance or in free_fs_info if
+        * restriper was paused all the way until unmount
+        */
+       if (arg) {
+               if (copy_to_user(arg, bargs, sizeof(*bargs)))
+                       ret = -EFAULT;
+       }
+
+out_bargs:
+       kfree(bargs);
+out:
+       mutex_unlock(&fs_info->balance_mutex);
+       mutex_unlock(&fs_info->volume_mutex);
+       return ret;
+}
+
+static long btrfs_ioctl_balance_ctl(struct btrfs_root *root, int cmd)
+{
+       if (!capable(CAP_SYS_ADMIN))
+               return -EPERM;
+
+       switch (cmd) {
+       case BTRFS_BALANCE_CTL_PAUSE:
+               return btrfs_pause_balance(root->fs_info);
+       case BTRFS_BALANCE_CTL_CANCEL:
+               return btrfs_cancel_balance(root->fs_info);
+       }
+
+       return -EINVAL;
+}
+
+static long btrfs_ioctl_balance_progress(struct btrfs_root *root,
+                                        void __user *arg)
+{
+       struct btrfs_fs_info *fs_info = root->fs_info;
+       struct btrfs_ioctl_balance_args *bargs;
+       int ret = 0;
+
+       if (!capable(CAP_SYS_ADMIN))
+               return -EPERM;
+
+       mutex_lock(&fs_info->balance_mutex);
+       if (!fs_info->balance_ctl) {
+               ret = -ENOTCONN;
+               goto out;
+       }
+
+       bargs = kzalloc(sizeof(*bargs), GFP_NOFS);
+       if (!bargs) {
+               ret = -ENOMEM;
+               goto out;
+       }
+
+       update_ioctl_balance_args(fs_info, 1, bargs);
+
+       if (copy_to_user(arg, bargs, sizeof(*bargs)))
+               ret = -EFAULT;
+
+       kfree(bargs);
+out:
+       mutex_unlock(&fs_info->balance_mutex);
+       return ret;
+}
+
 long btrfs_ioctl(struct file *file, unsigned int
                cmd, unsigned long arg)
 {
@@ -3078,7 +3266,7 @@ long btrfs_ioctl(struct file *file, unsigned int
        case BTRFS_IOC_DEV_INFO:
                return btrfs_ioctl_dev_info(root, argp);
        case BTRFS_IOC_BALANCE:
-               return btrfs_balance(root->fs_info->dev_root);
+               return btrfs_ioctl_balance(root, NULL);
        case BTRFS_IOC_CLONE:
                return btrfs_ioctl_clone(file, arg, 0, 0, 0);
        case BTRFS_IOC_CLONE_RANGE:
@@ -3110,6 +3298,12 @@ long btrfs_ioctl(struct file *file, unsigned int
                return btrfs_ioctl_scrub_cancel(root, argp);
        case BTRFS_IOC_SCRUB_PROGRESS:
                return btrfs_ioctl_scrub_progress(root, argp);
+       case BTRFS_IOC_BALANCE_V2:
+               return btrfs_ioctl_balance(root, argp);
+       case BTRFS_IOC_BALANCE_CTL:
+               return btrfs_ioctl_balance_ctl(root, arg);
+       case BTRFS_IOC_BALANCE_PROGRESS:
+               return btrfs_ioctl_balance_progress(root, argp);
        }
 
        return -ENOTTY;
index 252ae9915de8fcfa4b6b7a3a502735c28d1819f6..4f69028a68c486268bf5bcfc097a5412dbdd2ad0 100644 (file)
@@ -109,6 +109,55 @@ struct btrfs_ioctl_fs_info_args {
        __u64 reserved[124];                    /* pad to 1k */
 };
 
+/* balance control ioctl modes */
+#define BTRFS_BALANCE_CTL_PAUSE                1
+#define BTRFS_BALANCE_CTL_CANCEL       2
+
+/*
+ * this is packed, because it should be exactly the same as its disk
+ * byte order counterpart (struct btrfs_disk_balance_args)
+ */
+struct btrfs_balance_args {
+       __u64 profiles;
+       __u64 usage;
+       __u64 devid;
+       __u64 pstart;
+       __u64 pend;
+       __u64 vstart;
+       __u64 vend;
+
+       __u64 target;
+
+       __u64 flags;
+
+       __u64 unused[8];
+} __attribute__ ((__packed__));
+
+/* report balance progress to userspace */
+struct btrfs_balance_progress {
+       __u64 expected;         /* estimated # of chunks that will be
+                                * relocated to fulfill the request */
+       __u64 considered;       /* # of chunks we have considered so far */
+       __u64 completed;        /* # of chunks relocated so far */
+};
+
+#define BTRFS_BALANCE_STATE_RUNNING    (1ULL << 0)
+#define BTRFS_BALANCE_STATE_PAUSE_REQ  (1ULL << 1)
+#define BTRFS_BALANCE_STATE_CANCEL_REQ (1ULL << 2)
+
+struct btrfs_ioctl_balance_args {
+       __u64 flags;                            /* in/out */
+       __u64 state;                            /* out */
+
+       struct btrfs_balance_args data;         /* in/out */
+       struct btrfs_balance_args meta;         /* in/out */
+       struct btrfs_balance_args sys;          /* in/out */
+
+       struct btrfs_balance_progress stat;     /* out */
+
+       __u64 unused[72];                       /* pad to 1k */
+};
+
 #define BTRFS_INO_LOOKUP_PATH_MAX 4080
 struct btrfs_ioctl_ino_lookup_args {
        __u64 treeid;
@@ -272,6 +321,11 @@ struct btrfs_ioctl_logical_ino_args {
                                 struct btrfs_ioctl_dev_info_args)
 #define BTRFS_IOC_FS_INFO _IOR(BTRFS_IOCTL_MAGIC, 31, \
                               struct btrfs_ioctl_fs_info_args)
+#define BTRFS_IOC_BALANCE_V2 _IOWR(BTRFS_IOCTL_MAGIC, 32, \
+                                  struct btrfs_ioctl_balance_args)
+#define BTRFS_IOC_BALANCE_CTL _IOW(BTRFS_IOCTL_MAGIC, 33, int)
+#define BTRFS_IOC_BALANCE_PROGRESS _IOR(BTRFS_IOCTL_MAGIC, 34, \
+                                       struct btrfs_ioctl_balance_args)
 #define BTRFS_IOC_INO_PATHS _IOWR(BTRFS_IOCTL_MAGIC, 35, \
                                        struct btrfs_ioctl_ino_path_args)
 #define BTRFS_IOC_LOGICAL_INO _IOWR(BTRFS_IOCTL_MAGIC, 36, \
index 200f63bc6675eca20cf1b55c9ced534efccaf63b..5a7227fa93804c7b78bb205d6af94b52c55e1b32 100644 (file)
@@ -164,8 +164,9 @@ enum {
        Opt_compress_type, Opt_compress_force, Opt_compress_force_type,
        Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_discard,
        Opt_space_cache, Opt_clear_cache, Opt_user_subvol_rm_allowed,
-       Opt_enospc_debug, Opt_subvolrootid, Opt_defrag,
-       Opt_inode_cache, Opt_no_space_cache, Opt_recovery, Opt_err,
+       Opt_enospc_debug, Opt_subvolrootid, Opt_defrag, Opt_inode_cache,
+       Opt_no_space_cache, Opt_recovery, Opt_skip_balance,
+       Opt_err,
 };
 
 static match_table_t tokens = {
@@ -200,6 +201,7 @@ static match_table_t tokens = {
        {Opt_inode_cache, "inode_cache"},
        {Opt_no_space_cache, "nospace_cache"},
        {Opt_recovery, "recovery"},
+       {Opt_skip_balance, "skip_balance"},
        {Opt_err, NULL},
 };
 
@@ -398,6 +400,9 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
                        printk(KERN_INFO "btrfs: enabling auto recovery");
                        btrfs_set_opt(info->mount_opt, RECOVERY);
                        break;
+               case Opt_skip_balance:
+                       btrfs_set_opt(info->mount_opt, SKIP_BALANCE);
+                       break;
                case Opt_err:
                        printk(KERN_INFO "btrfs: unrecognized mount option "
                               "'%s'\n", p);
@@ -723,6 +728,8 @@ static int btrfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
                seq_puts(seq, ",autodefrag");
        if (btrfs_test_opt(root, INODE_MAP_CACHE))
                seq_puts(seq, ",inode_cache");
+       if (btrfs_test_opt(root, SKIP_BALANCE))
+               seq_puts(seq, ",skip_balance");
        return 0;
 }
 
index ac00e3aa80a11c4752e15576c52510befb2f2dcc..9489a2aca47b04bef55c159dcd5e56615ba3ee86 100644 (file)
@@ -23,6 +23,7 @@
 #include <linux/random.h>
 #include <linux/iocontext.h>
 #include <linux/capability.h>
+#include <linux/kthread.h>
 #include <asm/div64.h>
 #include "compat.h"
 #include "ctree.h"
@@ -1282,7 +1283,6 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
        bool clear_super = false;
 
        mutex_lock(&uuid_mutex);
-       mutex_lock(&root->fs_info->volume_mutex);
 
        all_avail = root->fs_info->avail_data_alloc_bits |
                root->fs_info->avail_system_alloc_bits |
@@ -1452,7 +1452,6 @@ error_close:
        if (bdev)
                blkdev_put(bdev, FMODE_READ | FMODE_EXCL);
 out:
-       mutex_unlock(&root->fs_info->volume_mutex);
        mutex_unlock(&uuid_mutex);
        return ret;
 error_undo:
@@ -1629,7 +1628,6 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
        }
 
        filemap_write_and_wait(bdev->bd_inode->i_mapping);
-       mutex_lock(&root->fs_info->volume_mutex);
 
        devices = &root->fs_info->fs_devices->devices;
        /*
@@ -1757,8 +1755,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
                ret = btrfs_relocate_sys_chunks(root);
                BUG_ON(ret);
        }
-out:
-       mutex_unlock(&root->fs_info->volume_mutex);
+
        return ret;
 error:
        blkdev_put(bdev, FMODE_EXCL);
@@ -1766,7 +1763,7 @@ error:
                mutex_unlock(&uuid_mutex);
                up_write(&sb->s_umount);
        }
-       goto out;
+       return ret;
 }
 
 static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
@@ -2077,6 +2074,362 @@ error:
        return ret;
 }
 
+static int insert_balance_item(struct btrfs_root *root,
+                              struct btrfs_balance_control *bctl)
+{
+       struct btrfs_trans_handle *trans;
+       struct btrfs_balance_item *item;
+       struct btrfs_disk_balance_args disk_bargs;
+       struct btrfs_path *path;
+       struct extent_buffer *leaf;
+       struct btrfs_key key;
+       int ret, err;
+
+       path = btrfs_alloc_path();
+       if (!path)
+               return -ENOMEM;
+
+       trans = btrfs_start_transaction(root, 0);
+       if (IS_ERR(trans)) {
+               btrfs_free_path(path);
+               return PTR_ERR(trans);
+       }
+
+       key.objectid = BTRFS_BALANCE_OBJECTID;
+       key.type = BTRFS_BALANCE_ITEM_KEY;
+       key.offset = 0;
+
+       ret = btrfs_insert_empty_item(trans, root, path, &key,
+                                     sizeof(*item));
+       if (ret)
+               goto out;
+
+       leaf = path->nodes[0];
+       item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);
+
+       memset_extent_buffer(leaf, 0, (unsigned long)item, sizeof(*item));
+
+       btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->data);
+       btrfs_set_balance_data(leaf, item, &disk_bargs);
+       btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->meta);
+       btrfs_set_balance_meta(leaf, item, &disk_bargs);
+       btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->sys);
+       btrfs_set_balance_sys(leaf, item, &disk_bargs);
+
+       btrfs_set_balance_flags(leaf, item, bctl->flags);
+
+       btrfs_mark_buffer_dirty(leaf);
+out:
+       btrfs_free_path(path);
+       err = btrfs_commit_transaction(trans, root);
+       if (err && !ret)
+               ret = err;
+       return ret;
+}
+
+static int del_balance_item(struct btrfs_root *root)
+{
+       struct btrfs_trans_handle *trans;
+       struct btrfs_path *path;
+       struct btrfs_key key;
+       int ret, err;
+
+       path = btrfs_alloc_path();
+       if (!path)
+               return -ENOMEM;
+
+       trans = btrfs_start_transaction(root, 0);
+       if (IS_ERR(trans)) {
+               btrfs_free_path(path);
+               return PTR_ERR(trans);
+       }
+
+       key.objectid = BTRFS_BALANCE_OBJECTID;
+       key.type = BTRFS_BALANCE_ITEM_KEY;
+       key.offset = 0;
+
+       ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+       if (ret < 0)
+               goto out;
+       if (ret > 0) {
+               ret = -ENOENT;
+               goto out;
+       }
+
+       ret = btrfs_del_item(trans, root, path);
+out:
+       btrfs_free_path(path);
+       err = btrfs_commit_transaction(trans, root);
+       if (err && !ret)
+               ret = err;
+       return ret;
+}
+
+/*
+ * This is a heuristic used to reduce the number of chunks balanced on
+ * resume after balance was interrupted.
+ */
+static void update_balance_args(struct btrfs_balance_control *bctl)
+{
+       /*
+        * Turn on soft mode for chunk types that were being converted.
+        */
+       if (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)
+               bctl->data.flags |= BTRFS_BALANCE_ARGS_SOFT;
+       if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)
+               bctl->sys.flags |= BTRFS_BALANCE_ARGS_SOFT;
+       if (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)
+               bctl->meta.flags |= BTRFS_BALANCE_ARGS_SOFT;
+
+       /*
+        * Turn on usage filter if is not already used.  The idea is
+        * that chunks that we have already balanced should be
+        * reasonably full.  Don't do it for chunks that are being
+        * converted - that will keep us from relocating unconverted
+        * (albeit full) chunks.
+        */
+       if (!(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE) &&
+           !(bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
+               bctl->data.flags |= BTRFS_BALANCE_ARGS_USAGE;
+               bctl->data.usage = 90;
+       }
+       if (!(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE) &&
+           !(bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
+               bctl->sys.flags |= BTRFS_BALANCE_ARGS_USAGE;
+               bctl->sys.usage = 90;
+       }
+       if (!(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE) &&
+           !(bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
+               bctl->meta.flags |= BTRFS_BALANCE_ARGS_USAGE;
+               bctl->meta.usage = 90;
+       }
+}
+
+/*
+ * Should be called with both balance and volume mutexes held to
+ * serialize other volume operations (add_dev/rm_dev/resize) with
+ * restriper.  Same goes for unset_balance_control.
+ */
+static void set_balance_control(struct btrfs_balance_control *bctl)
+{
+       struct btrfs_fs_info *fs_info = bctl->fs_info;
+
+       BUG_ON(fs_info->balance_ctl);
+
+       spin_lock(&fs_info->balance_lock);
+       fs_info->balance_ctl = bctl;
+       spin_unlock(&fs_info->balance_lock);
+}
+
+static void unset_balance_control(struct btrfs_fs_info *fs_info)
+{
+       struct btrfs_balance_control *bctl = fs_info->balance_ctl;
+
+       BUG_ON(!fs_info->balance_ctl);
+
+       spin_lock(&fs_info->balance_lock);
+       fs_info->balance_ctl = NULL;
+       spin_unlock(&fs_info->balance_lock);
+
+       kfree(bctl);
+}
+
+/*
+ * Balance filters.  Return 1 if chunk should be filtered out
+ * (should not be balanced).
+ */
+static int chunk_profiles_filter(u64 chunk_profile,
+                                struct btrfs_balance_args *bargs)
+{
+       chunk_profile &= BTRFS_BLOCK_GROUP_PROFILE_MASK;
+
+       if (chunk_profile == 0)
+               chunk_profile = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
+
+       if (bargs->profiles & chunk_profile)
+               return 0;
+
+       return 1;
+}
+
+static u64 div_factor_fine(u64 num, int factor)
+{
+       if (factor <= 0)
+               return 0;
+       if (factor >= 100)
+               return num;
+
+       num *= factor;
+       do_div(num, 100);
+       return num;
+}
+
+static int chunk_usage_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset,
+                             struct btrfs_balance_args *bargs)
+{
+       struct btrfs_block_group_cache *cache;
+       u64 chunk_used, user_thresh;
+       int ret = 1;
+
+       cache = btrfs_lookup_block_group(fs_info, chunk_offset);
+       chunk_used = btrfs_block_group_used(&cache->item);
+
+       user_thresh = div_factor_fine(cache->key.offset, bargs->usage);
+       if (chunk_used < user_thresh)
+               ret = 0;
+
+       btrfs_put_block_group(cache);
+       return ret;
+}
+
+static int chunk_devid_filter(struct extent_buffer *leaf,
+                             struct btrfs_chunk *chunk,
+                             struct btrfs_balance_args *bargs)
+{
+       struct btrfs_stripe *stripe;
+       int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
+       int i;
+
+       for (i = 0; i < num_stripes; i++) {
+               stripe = btrfs_stripe_nr(chunk, i);
+               if (btrfs_stripe_devid(leaf, stripe) == bargs->devid)
+                       return 0;
+       }
+
+       return 1;
+}
+
+/* [pstart, pend) */
+static int chunk_drange_filter(struct extent_buffer *leaf,
+                              struct btrfs_chunk *chunk,
+                              u64 chunk_offset,
+                              struct btrfs_balance_args *bargs)
+{
+       struct btrfs_stripe *stripe;
+       int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
+       u64 stripe_offset;
+       u64 stripe_length;
+       int factor;
+       int i;
+
+       if (!(bargs->flags & BTRFS_BALANCE_ARGS_DEVID))
+               return 0;
+
+       if (btrfs_chunk_type(leaf, chunk) & (BTRFS_BLOCK_GROUP_DUP |
+            BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10))
+               factor = 2;
+       else
+               factor = 1;
+       factor = num_stripes / factor;
+
+       for (i = 0; i < num_stripes; i++) {
+               stripe = btrfs_stripe_nr(chunk, i);
+               if (btrfs_stripe_devid(leaf, stripe) != bargs->devid)
+                       continue;
+
+               stripe_offset = btrfs_stripe_offset(leaf, stripe);
+               stripe_length = btrfs_chunk_length(leaf, chunk);
+               do_div(stripe_length, factor);
+
+               if (stripe_offset < bargs->pend &&
+                   stripe_offset + stripe_length > bargs->pstart)
+                       return 0;
+       }
+
+       return 1;
+}
+
+/* [vstart, vend) */
+static int chunk_vrange_filter(struct extent_buffer *leaf,
+                              struct btrfs_chunk *chunk,
+                              u64 chunk_offset,
+                              struct btrfs_balance_args *bargs)
+{
+       if (chunk_offset < bargs->vend &&
+           chunk_offset + btrfs_chunk_length(leaf, chunk) > bargs->vstart)
+               /* at least part of the chunk is inside this vrange */
+               return 0;
+
+       return 1;
+}
+
+static int chunk_soft_convert_filter(u64 chunk_profile,
+                                    struct btrfs_balance_args *bargs)
+{
+       if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT))
+               return 0;
+
+       chunk_profile &= BTRFS_BLOCK_GROUP_PROFILE_MASK;
+
+       if (chunk_profile == 0)
+               chunk_profile = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
+
+       if (bargs->target & chunk_profile)
+               return 1;
+
+       return 0;
+}
+
+static int should_balance_chunk(struct btrfs_root *root,
+                               struct extent_buffer *leaf,
+                               struct btrfs_chunk *chunk, u64 chunk_offset)
+{
+       struct btrfs_balance_control *bctl = root->fs_info->balance_ctl;
+       struct btrfs_balance_args *bargs = NULL;
+       u64 chunk_type = btrfs_chunk_type(leaf, chunk);
+
+       /* type filter */
+       if (!((chunk_type & BTRFS_BLOCK_GROUP_TYPE_MASK) &
+             (bctl->flags & BTRFS_BALANCE_TYPE_MASK))) {
+               return 0;
+       }
+
+       if (chunk_type & BTRFS_BLOCK_GROUP_DATA)
+               bargs = &bctl->data;
+       else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM)
+               bargs = &bctl->sys;
+       else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA)
+               bargs = &bctl->meta;
+
+       /* profiles filter */
+       if ((bargs->flags & BTRFS_BALANCE_ARGS_PROFILES) &&
+           chunk_profiles_filter(chunk_type, bargs)) {
+               return 0;
+       }
+
+       /* usage filter */
+       if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE) &&
+           chunk_usage_filter(bctl->fs_info, chunk_offset, bargs)) {
+               return 0;
+       }
+
+       /* devid filter */
+       if ((bargs->flags & BTRFS_BALANCE_ARGS_DEVID) &&
+           chunk_devid_filter(leaf, chunk, bargs)) {
+               return 0;
+       }
+
+       /* drange filter, makes sense only with devid filter */
+       if ((bargs->flags & BTRFS_BALANCE_ARGS_DRANGE) &&
+           chunk_drange_filter(leaf, chunk, chunk_offset, bargs)) {
+               return 0;
+       }
+
+       /* vrange filter */
+       if ((bargs->flags & BTRFS_BALANCE_ARGS_VRANGE) &&
+           chunk_vrange_filter(leaf, chunk, chunk_offset, bargs)) {
+               return 0;
+       }
+
+       /* soft profile changing mode */
+       if ((bargs->flags & BTRFS_BALANCE_ARGS_SOFT) &&
+           chunk_soft_convert_filter(chunk_type, bargs)) {
+               return 0;
+       }
+
+       return 1;
+}
+
 static u64 div_factor(u64 num, int factor)
 {
        if (factor == 10)
@@ -2086,29 +2439,28 @@ static u64 div_factor(u64 num, int factor)
        return num;
 }
 
-int btrfs_balance(struct btrfs_root *dev_root)
+static int __btrfs_balance(struct btrfs_fs_info *fs_info)
 {
-       int ret;
-       struct list_head *devices = &dev_root->fs_info->fs_devices->devices;
+       struct btrfs_balance_control *bctl = fs_info->balance_ctl;
+       struct btrfs_root *chunk_root = fs_info->chunk_root;
+       struct btrfs_root *dev_root = fs_info->dev_root;
+       struct list_head *devices;
        struct btrfs_device *device;
        u64 old_size;
        u64 size_to_free;
+       struct btrfs_chunk *chunk;
        struct btrfs_path *path;
        struct btrfs_key key;
-       struct btrfs_root *chunk_root = dev_root->fs_info->chunk_root;
-       struct btrfs_trans_handle *trans;
        struct btrfs_key found_key;
-
-       if (dev_root->fs_info->sb->s_flags & MS_RDONLY)
-               return -EROFS;
-
-       if (!capable(CAP_SYS_ADMIN))
-               return -EPERM;
-
-       mutex_lock(&dev_root->fs_info->volume_mutex);
-       dev_root = dev_root->fs_info->dev_root;
+       struct btrfs_trans_handle *trans;
+       struct extent_buffer *leaf;
+       int slot;
+       int ret;
+       int enospc_errors = 0;
+       bool counting = true;
 
        /* step one make some room on all the devices */
+       devices = &fs_info->fs_devices->devices;
        list_for_each_entry(device, devices, dev_list) {
                old_size = device->total_bytes;
                size_to_free = div_factor(old_size, 1);
@@ -2137,11 +2489,23 @@ int btrfs_balance(struct btrfs_root *dev_root)
                ret = -ENOMEM;
                goto error;
        }
+
+       /* zero out stat counters */
+       spin_lock(&fs_info->balance_lock);
+       memset(&bctl->stat, 0, sizeof(bctl->stat));
+       spin_unlock(&fs_info->balance_lock);
+again:
        key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
        key.offset = (u64)-1;
        key.type = BTRFS_CHUNK_ITEM_KEY;
 
        while (1) {
+               if ((!counting && atomic_read(&fs_info->balance_pause_req)) ||
+                   atomic_read(&fs_info->balance_cancel_req)) {
+                       ret = -ECANCELED;
+                       goto error;
+               }
+
                ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
                if (ret < 0)
                        goto error;
@@ -2151,15 +2515,19 @@ int btrfs_balance(struct btrfs_root *dev_root)
                 * failed
                 */
                if (ret == 0)
-                       break;
+                       BUG(); /* FIXME break ? */
 
                ret = btrfs_previous_item(chunk_root, path, 0,
                                          BTRFS_CHUNK_ITEM_KEY);
-               if (ret)
+               if (ret) {
+                       ret = 0;
                        break;
+               }
+
+               leaf = path->nodes[0];
+               slot = path->slots[0];
+               btrfs_item_key_to_cpu(leaf, &found_key, slot);
 
-               btrfs_item_key_to_cpu(path->nodes[0], &found_key,
-                                     path->slots[0]);
                if (found_key.objectid != key.objectid)
                        break;
 
@@ -2167,22 +2535,375 @@ int btrfs_balance(struct btrfs_root *dev_root)
                if (found_key.offset == 0)
                        break;
 
+               chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
+
+               if (!counting) {
+                       spin_lock(&fs_info->balance_lock);
+                       bctl->stat.considered++;
+                       spin_unlock(&fs_info->balance_lock);
+               }
+
+               ret = should_balance_chunk(chunk_root, leaf, chunk,
+                                          found_key.offset);
                btrfs_release_path(path);
+               if (!ret)
+                       goto loop;
+
+               if (counting) {
+                       spin_lock(&fs_info->balance_lock);
+                       bctl->stat.expected++;
+                       spin_unlock(&fs_info->balance_lock);
+                       goto loop;
+               }
+
                ret = btrfs_relocate_chunk(chunk_root,
                                           chunk_root->root_key.objectid,
                                           found_key.objectid,
                                           found_key.offset);
                if (ret && ret != -ENOSPC)
                        goto error;
+               if (ret == -ENOSPC) {
+                       enospc_errors++;
+               } else {
+                       spin_lock(&fs_info->balance_lock);
+                       bctl->stat.completed++;
+                       spin_unlock(&fs_info->balance_lock);
+               }
+loop:
                key.offset = found_key.offset - 1;
        }
-       ret = 0;
+
+       if (counting) {
+               btrfs_release_path(path);
+               counting = false;
+               goto again;
+       }
 error:
        btrfs_free_path(path);
-       mutex_unlock(&dev_root->fs_info->volume_mutex);
+       if (enospc_errors) {
+               printk(KERN_INFO "btrfs: %d enospc errors during balance\n",
+                      enospc_errors);
+               if (!ret)
+                       ret = -ENOSPC;
+       }
+
        return ret;
 }
 
+static inline int balance_need_close(struct btrfs_fs_info *fs_info)
+{
+       /* cancel requested || normal exit path */
+       return atomic_read(&fs_info->balance_cancel_req) ||
+               (atomic_read(&fs_info->balance_pause_req) == 0 &&
+                atomic_read(&fs_info->balance_cancel_req) == 0);
+}
+
+static void __cancel_balance(struct btrfs_fs_info *fs_info)
+{
+       int ret;
+
+       unset_balance_control(fs_info);
+       ret = del_balance_item(fs_info->tree_root);
+       BUG_ON(ret);
+}
+
+void update_ioctl_balance_args(struct btrfs_fs_info *fs_info, int lock,
+                              struct btrfs_ioctl_balance_args *bargs);
+
+/*
+ * Should be called with both balance and volume mutexes held
+ */
+int btrfs_balance(struct btrfs_balance_control *bctl,
+                 struct btrfs_ioctl_balance_args *bargs)
+{
+       struct btrfs_fs_info *fs_info = bctl->fs_info;
+       u64 allowed;
+       int ret;
+
+       if (btrfs_fs_closing(fs_info) ||
+           atomic_read(&fs_info->balance_pause_req) ||
+           atomic_read(&fs_info->balance_cancel_req)) {
+               ret = -EINVAL;
+               goto out;
+       }
+
+       /*
+        * In case of mixed groups both data and meta should be picked,
+        * and identical options should be given for both of them.
+        */
+       allowed = btrfs_super_incompat_flags(fs_info->super_copy);
+       if ((allowed & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) &&
+           (bctl->flags & (BTRFS_BALANCE_DATA | BTRFS_BALANCE_METADATA))) {
+               if (!(bctl->flags & BTRFS_BALANCE_DATA) ||
+                   !(bctl->flags & BTRFS_BALANCE_METADATA) ||
+                   memcmp(&bctl->data, &bctl->meta, sizeof(bctl->data))) {
+                       printk(KERN_ERR "btrfs: with mixed groups data and "
+                              "metadata balance options must be the same\n");
+                       ret = -EINVAL;
+                       goto out;
+               }
+       }
+
+       /*
+        * Profile changing sanity checks.  Skip them if a simple
+        * balance is requested.
+        */
+       if (!((bctl->data.flags | bctl->sys.flags | bctl->meta.flags) &
+             BTRFS_BALANCE_ARGS_CONVERT))
+               goto do_balance;
+
+       allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
+       if (fs_info->fs_devices->num_devices == 1)
+               allowed |= BTRFS_BLOCK_GROUP_DUP;
+       else if (fs_info->fs_devices->num_devices < 4)
+               allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1);
+       else
+               allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 |
+                               BTRFS_BLOCK_GROUP_RAID10);
+
+       if (!profile_is_valid(bctl->data.target, 1) ||
+           bctl->data.target & ~allowed) {
+               printk(KERN_ERR "btrfs: unable to start balance with target "
+                      "data profile %llu\n",
+                      (unsigned long long)bctl->data.target);
+               ret = -EINVAL;
+               goto out;
+       }
+       if (!profile_is_valid(bctl->meta.target, 1) ||
+           bctl->meta.target & ~allowed) {
+               printk(KERN_ERR "btrfs: unable to start balance with target "
+                      "metadata profile %llu\n",
+                      (unsigned long long)bctl->meta.target);
+               ret = -EINVAL;
+               goto out;
+       }
+       if (!profile_is_valid(bctl->sys.target, 1) ||
+           bctl->sys.target & ~allowed) {
+               printk(KERN_ERR "btrfs: unable to start balance with target "
+                      "system profile %llu\n",
+                      (unsigned long long)bctl->sys.target);
+               ret = -EINVAL;
+               goto out;
+       }
+
+       if (bctl->data.target & BTRFS_BLOCK_GROUP_DUP) {
+               printk(KERN_ERR "btrfs: dup for data is not allowed\n");
+               ret = -EINVAL;
+               goto out;
+       }
+
+       /* allow to reduce meta or sys integrity only if force set */
+       allowed = BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 |
+                       BTRFS_BLOCK_GROUP_RAID10;
+       if (((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
+            (fs_info->avail_system_alloc_bits & allowed) &&
+            !(bctl->sys.target & allowed)) ||
+           ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
+            (fs_info->avail_metadata_alloc_bits & allowed) &&
+            !(bctl->meta.target & allowed))) {
+               if (bctl->flags & BTRFS_BALANCE_FORCE) {
+                       printk(KERN_INFO "btrfs: force reducing metadata "
+                              "integrity\n");
+               } else {
+                       printk(KERN_ERR "btrfs: balance will reduce metadata "
+                              "integrity, use force if you want this\n");
+                       ret = -EINVAL;
+                       goto out;
+               }
+       }
+
+do_balance:
+       ret = insert_balance_item(fs_info->tree_root, bctl);
+       if (ret && ret != -EEXIST)
+               goto out;
+
+       if (!(bctl->flags & BTRFS_BALANCE_RESUME)) {
+               BUG_ON(ret == -EEXIST);
+               set_balance_control(bctl);
+       } else {
+               BUG_ON(ret != -EEXIST);
+               spin_lock(&fs_info->balance_lock);
+               update_balance_args(bctl);
+               spin_unlock(&fs_info->balance_lock);
+       }
+
+       atomic_inc(&fs_info->balance_running);
+       mutex_unlock(&fs_info->balance_mutex);
+
+       ret = __btrfs_balance(fs_info);
+
+       mutex_lock(&fs_info->balance_mutex);
+       atomic_dec(&fs_info->balance_running);
+
+       if (bargs) {
+               memset(bargs, 0, sizeof(*bargs));
+               update_ioctl_balance_args(fs_info, 0, bargs);
+       }
+
+       if ((ret && ret != -ECANCELED && ret != -ENOSPC) ||
+           balance_need_close(fs_info)) {
+               __cancel_balance(fs_info);
+       }
+
+       wake_up(&fs_info->balance_wait_q);
+
+       return ret;
+out:
+       if (bctl->flags & BTRFS_BALANCE_RESUME)
+               __cancel_balance(fs_info);
+       else
+               kfree(bctl);
+       return ret;
+}
+
+static int balance_kthread(void *data)
+{
+       struct btrfs_balance_control *bctl =
+                       (struct btrfs_balance_control *)data;
+       struct btrfs_fs_info *fs_info = bctl->fs_info;
+       int ret = 0;
+
+       mutex_lock(&fs_info->volume_mutex);
+       mutex_lock(&fs_info->balance_mutex);
+
+       set_balance_control(bctl);
+
+       if (btrfs_test_opt(fs_info->tree_root, SKIP_BALANCE)) {
+               printk(KERN_INFO "btrfs: force skipping balance\n");
+       } else {
+               printk(KERN_INFO "btrfs: continuing balance\n");
+               ret = btrfs_balance(bctl, NULL);
+       }
+
+       mutex_unlock(&fs_info->balance_mutex);
+       mutex_unlock(&fs_info->volume_mutex);
+       return ret;
+}
+
+int btrfs_recover_balance(struct btrfs_root *tree_root)
+{
+       struct task_struct *tsk;
+       struct btrfs_balance_control *bctl;
+       struct btrfs_balance_item *item;
+       struct btrfs_disk_balance_args disk_bargs;
+       struct btrfs_path *path;
+       struct extent_buffer *leaf;
+       struct btrfs_key key;
+       int ret;
+
+       path = btrfs_alloc_path();
+       if (!path)
+               return -ENOMEM;
+
+       bctl = kzalloc(sizeof(*bctl), GFP_NOFS);
+       if (!bctl) {
+               ret = -ENOMEM;
+               goto out;
+       }
+
+       key.objectid = BTRFS_BALANCE_OBJECTID;
+       key.type = BTRFS_BALANCE_ITEM_KEY;
+       key.offset = 0;
+
+       ret = btrfs_search_slot(NULL, tree_root, &key, path, 0, 0);
+       if (ret < 0)
+               goto out_bctl;
+       if (ret > 0) { /* ret = -ENOENT; */
+               ret = 0;
+               goto out_bctl;
+       }
+
+       leaf = path->nodes[0];
+       item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);
+
+       bctl->fs_info = tree_root->fs_info;
+       bctl->flags = btrfs_balance_flags(leaf, item) | BTRFS_BALANCE_RESUME;
+
+       btrfs_balance_data(leaf, item, &disk_bargs);
+       btrfs_disk_balance_args_to_cpu(&bctl->data, &disk_bargs);
+       btrfs_balance_meta(leaf, item, &disk_bargs);
+       btrfs_disk_balance_args_to_cpu(&bctl->meta, &disk_bargs);
+       btrfs_balance_sys(leaf, item, &disk_bargs);
+       btrfs_disk_balance_args_to_cpu(&bctl->sys, &disk_bargs);
+
+       tsk = kthread_run(balance_kthread, bctl, "btrfs-balance");
+       if (IS_ERR(tsk))
+               ret = PTR_ERR(tsk);
+       else
+               goto out;
+
+out_bctl:
+       kfree(bctl);
+out:
+       btrfs_free_path(path);
+       return ret;
+}
+
+int btrfs_pause_balance(struct btrfs_fs_info *fs_info)
+{
+       int ret = 0;
+
+       mutex_lock(&fs_info->balance_mutex);
+       if (!fs_info->balance_ctl) {
+               mutex_unlock(&fs_info->balance_mutex);
+               return -ENOTCONN;
+       }
+
+       if (atomic_read(&fs_info->balance_running)) {
+               atomic_inc(&fs_info->balance_pause_req);
+               mutex_unlock(&fs_info->balance_mutex);
+
+               wait_event(fs_info->balance_wait_q,
+                          atomic_read(&fs_info->balance_running) == 0);
+
+               mutex_lock(&fs_info->balance_mutex);
+               /* we are good with balance_ctl ripped off from under us */
+               BUG_ON(atomic_read(&fs_info->balance_running));
+               atomic_dec(&fs_info->balance_pause_req);
+       } else {
+               ret = -ENOTCONN;
+       }
+
+       mutex_unlock(&fs_info->balance_mutex);
+       return ret;
+}
+
+int btrfs_cancel_balance(struct btrfs_fs_info *fs_info)
+{
+       mutex_lock(&fs_info->balance_mutex);
+       if (!fs_info->balance_ctl) {
+               mutex_unlock(&fs_info->balance_mutex);
+               return -ENOTCONN;
+       }
+
+       atomic_inc(&fs_info->balance_cancel_req);
+       /*
+        * if we are running just wait and return, balance item is
+        * deleted in btrfs_balance in this case
+        */
+       if (atomic_read(&fs_info->balance_running)) {
+               mutex_unlock(&fs_info->balance_mutex);
+               wait_event(fs_info->balance_wait_q,
+                          atomic_read(&fs_info->balance_running) == 0);
+               mutex_lock(&fs_info->balance_mutex);
+       } else {
+               /* __cancel_balance needs volume_mutex */
+               mutex_unlock(&fs_info->balance_mutex);
+               mutex_lock(&fs_info->volume_mutex);
+               mutex_lock(&fs_info->balance_mutex);
+
+               if (fs_info->balance_ctl)
+                       __cancel_balance(fs_info);
+
+               mutex_unlock(&fs_info->volume_mutex);
+       }
+
+       BUG_ON(fs_info->balance_ctl || atomic_read(&fs_info->balance_running));
+       atomic_dec(&fs_info->balance_cancel_req);
+       mutex_unlock(&fs_info->balance_mutex);
+       return 0;
+}
+
 /*
  * shrinking a device means finding all of the device extents past
  * the new size, and then following the back refs to the chunks.
@@ -2756,8 +3477,7 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,
                return ret;
 
        alloc_profile = BTRFS_BLOCK_GROUP_METADATA |
-                       (fs_info->metadata_alloc_profile &
-                        fs_info->avail_metadata_alloc_bits);
+                               fs_info->avail_metadata_alloc_bits;
        alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile);
 
        ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size,
@@ -2767,8 +3487,7 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,
        sys_chunk_offset = chunk_offset + chunk_size;
 
        alloc_profile = BTRFS_BLOCK_GROUP_SYSTEM |
-                       (fs_info->system_alloc_profile &
-                        fs_info->avail_system_alloc_bits);
+                               fs_info->avail_system_alloc_bits;
        alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile);
 
        ret = __btrfs_alloc_chunk(trans, extent_root, &sys_map,
@@ -2955,12 +3674,8 @@ again:
                }
        }
        if (rw & REQ_DISCARD) {
-               if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
-                                BTRFS_BLOCK_GROUP_RAID1 |
-                                BTRFS_BLOCK_GROUP_DUP |
-                                BTRFS_BLOCK_GROUP_RAID10)) {
+               if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK)
                        stripes_required = map->num_stripes;
-               }
        }
        if (bbio_ret && (rw & (REQ_WRITE | REQ_DISCARD)) &&
            stripes_allocated < stripes_required) {
@@ -2984,10 +3699,7 @@ again:
 
        if (rw & REQ_DISCARD)
                *length = min_t(u64, em->len - offset, *length);
-       else if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
-                             BTRFS_BLOCK_GROUP_RAID1 |
-                             BTRFS_BLOCK_GROUP_RAID10 |
-                             BTRFS_BLOCK_GROUP_DUP)) {
+       else if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
                /* we limit the length of each bio to what fits in a stripe */
                *length = min_t(u64, em->len - offset,
                                map->stripe_len - stripe_offset);
index 78f2d4d4f37fe81317395688a8b090b71e53a612..6faec9dd1f93b95e5c497763a3c49b04f4c422cb 100644 (file)
@@ -186,6 +186,51 @@ struct map_lookup {
 #define map_lookup_size(n) (sizeof(struct map_lookup) + \
                            (sizeof(struct btrfs_bio_stripe) * (n)))
 
+/*
+ * Restriper's general type filter
+ */
+#define BTRFS_BALANCE_DATA             (1ULL << 0)
+#define BTRFS_BALANCE_SYSTEM           (1ULL << 1)
+#define BTRFS_BALANCE_METADATA         (1ULL << 2)
+
+#define BTRFS_BALANCE_TYPE_MASK                (BTRFS_BALANCE_DATA |       \
+                                        BTRFS_BALANCE_SYSTEM |     \
+                                        BTRFS_BALANCE_METADATA)
+
+#define BTRFS_BALANCE_FORCE            (1ULL << 3)
+#define BTRFS_BALANCE_RESUME           (1ULL << 4)
+
+/*
+ * Balance filters
+ */
+#define BTRFS_BALANCE_ARGS_PROFILES    (1ULL << 0)
+#define BTRFS_BALANCE_ARGS_USAGE       (1ULL << 1)
+#define BTRFS_BALANCE_ARGS_DEVID       (1ULL << 2)
+#define BTRFS_BALANCE_ARGS_DRANGE      (1ULL << 3)
+#define BTRFS_BALANCE_ARGS_VRANGE      (1ULL << 4)
+
+/*
+ * Profile changing flags.  When SOFT is set we won't relocate chunk if
+ * it already has the target profile (even though it may be
+ * half-filled).
+ */
+#define BTRFS_BALANCE_ARGS_CONVERT     (1ULL << 8)
+#define BTRFS_BALANCE_ARGS_SOFT                (1ULL << 9)
+
+struct btrfs_balance_args;
+struct btrfs_balance_progress;
+struct btrfs_balance_control {
+       struct btrfs_fs_info *fs_info;
+
+       struct btrfs_balance_args data;
+       struct btrfs_balance_args meta;
+       struct btrfs_balance_args sys;
+
+       u64 flags;
+
+       struct btrfs_balance_progress stat;
+};
+
 int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
                                   u64 end, u64 *length);
 
@@ -228,7 +273,11 @@ struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid,
                                       u8 *uuid, u8 *fsid);
 int btrfs_shrink_device(struct btrfs_device *device, u64 new_size);
 int btrfs_init_new_device(struct btrfs_root *root, char *path);
-int btrfs_balance(struct btrfs_root *dev_root);
+int btrfs_balance(struct btrfs_balance_control *bctl,
+                 struct btrfs_ioctl_balance_args *bargs);
+int btrfs_recover_balance(struct btrfs_root *tree_root);
+int btrfs_pause_balance(struct btrfs_fs_info *fs_info);
+int btrfs_cancel_balance(struct btrfs_fs_info *fs_info);
 int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset);
 int find_free_dev_extent(struct btrfs_trans_handle *trans,
                         struct btrfs_device *device, u64 num_bytes,