Merge branch 'jeffm-discard-4.3' into for-linus-4.3
authorChris Mason <clm@fb.com>
Sun, 9 Aug 2015 14:35:33 +0000 (07:35 -0700)
committerChris Mason <clm@fb.com>
Sun, 9 Aug 2015 14:35:33 +0000 (07:35 -0700)
1  2 
fs/btrfs/ctree.h
fs/btrfs/disk-io.c
fs/btrfs/extent-tree.c
fs/btrfs/super.c
fs/btrfs/transaction.c
fs/btrfs/volumes.c
fs/btrfs/volumes.h

diff --combined fs/btrfs/ctree.h
index d4042c89d29bb74a5fc93a68ed2ad08cb4bf997a,19ef3f30655978dc8b92b99431ccb8c7dce723b7..938efe33be809240bc19bdf81d854c623191a3d8
@@@ -1300,7 -1300,7 +1300,7 @@@ struct btrfs_block_group_cache 
        /* for raid56, this is a full stripe, without parity */
        unsigned long full_stripe_len;
  
 -      unsigned int ro:1;
 +      unsigned int ro;
        unsigned int iref:1;
        unsigned int has_caching_ctl:1;
        unsigned int removed:1;
@@@ -1518,6 -1518,12 +1518,6 @@@ struct btrfs_fs_info 
         */
        struct mutex ordered_operations_mutex;
  
 -      /*
 -       * Same as ordered_operations_mutex except this is for ordered extents
 -       * and not the operations.
 -       */
 -      struct mutex ordered_extent_flush_mutex;
 -
        struct rw_semaphore commit_root_sem;
  
        struct rw_semaphore cleanup_work_sem;
@@@ -3431,6 -3437,8 +3431,8 @@@ int btrfs_remove_block_group(struct btr
                             struct btrfs_root *root, u64 group_start,
                             struct extent_map *em);
  void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info);
+ void btrfs_get_block_group_trimming(struct btrfs_block_group_cache *cache);
+ void btrfs_put_block_group_trimming(struct btrfs_block_group_cache *cache);
  void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
                                       struct btrfs_root *root);
  u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data);
@@@ -3489,9 -3497,9 +3491,9 @@@ int btrfs_cond_migrate_bytes(struct btr
  void btrfs_block_rsv_release(struct btrfs_root *root,
                             struct btrfs_block_rsv *block_rsv,
                             u64 num_bytes);
 -int btrfs_set_block_group_ro(struct btrfs_root *root,
 +int btrfs_inc_block_group_ro(struct btrfs_root *root,
                             struct btrfs_block_group_cache *cache);
 -void btrfs_set_block_group_rw(struct btrfs_root *root,
 +void btrfs_dec_block_group_ro(struct btrfs_root *root,
                              struct btrfs_block_group_cache *cache);
  void btrfs_put_block_group_cache(struct btrfs_fs_info *info);
  u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo);
@@@ -4067,6 -4075,7 +4069,7 @@@ __col
  void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function,
                     unsigned int line, int errno, const char *fmt, ...);
  
+ const char *btrfs_decode_error(int errno);
  
  __cold
  void __btrfs_abort_transaction(struct btrfs_trans_handle *trans,
@@@ -4179,7 -4188,8 +4182,7 @@@ int btrfs_reloc_clone_csums(struct inod
  int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
                          struct btrfs_root *root, struct extent_buffer *buf,
                          struct extent_buffer *cow);
 -void btrfs_reloc_pre_snapshot(struct btrfs_trans_handle *trans,
 -                            struct btrfs_pending_snapshot *pending,
 +void btrfs_reloc_pre_snapshot(struct btrfs_pending_snapshot *pending,
                              u64 *bytes_to_reserve);
  int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
                              struct btrfs_pending_snapshot *pending);
diff --combined fs/btrfs/disk-io.c
index 230546b45474253a16e8db8f8f7b24c1659ad726,053109ba26b7e22a8ce995d2485977dc58dbd9ba..cc15514b4a7675a9a2493f1d8198dd0c9e236850
@@@ -1724,7 -1724,6 +1724,7 @@@ static int setup_bdi(struct btrfs_fs_in
        bdi->ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_CACHE_SIZE;
        bdi->congested_fn       = btrfs_congested_fn;
        bdi->congested_data     = info;
 +      bdi->capabilities |= BDI_CAP_CGROUP_WRITEBACK;
        return 0;
  }
  
@@@ -1745,7 -1744,7 +1745,7 @@@ static void end_workqueue_fn(struct btr
        bio->bi_private = end_io_wq->private;
        bio->bi_end_io = end_io_wq->end_io;
        kmem_cache_free(btrfs_end_io_wq_cache, end_io_wq);
 -      bio_endio_nodec(bio, error);
 +      bio_endio(bio, error);
  }
  
  static int cleaner_kthread(void *arg)
@@@ -2609,6 -2608,7 +2609,6 @@@ int open_ctree(struct super_block *sb
  
  
        mutex_init(&fs_info->ordered_operations_mutex);
 -      mutex_init(&fs_info->ordered_extent_flush_mutex);
        mutex_init(&fs_info->tree_log_mutex);
        mutex_init(&fs_info->chunk_mutex);
        mutex_init(&fs_info->transaction_kthread_mutex);
@@@ -2950,9 -2950,8 +2950,9 @@@ retry_root_backup
        if (fs_info->fs_devices->missing_devices >
             fs_info->num_tolerated_disk_barrier_failures &&
            !(sb->s_flags & MS_RDONLY)) {
 -              printk(KERN_WARNING "BTRFS: "
 -                      "too many missing devices, writeable mount is not allowed\n");
 +              pr_warn("BTRFS: missing devices(%llu) exceeds the limit(%d), writeable mount is not allowed\n",
 +                      fs_info->fs_devices->missing_devices,
 +                      fs_info->num_tolerated_disk_barrier_failures);
                goto fail_sysfs;
        }
  
@@@ -3327,8 -3326,11 +3327,8 @@@ static int write_dev_supers(struct btrf
   */
  static void btrfs_end_empty_barrier(struct bio *bio, int err)
  {
 -      if (err) {
 -              if (err == -EOPNOTSUPP)
 -                      set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
 +      if (err)
                clear_bit(BIO_UPTODATE, &bio->bi_flags);
 -      }
        if (bio->bi_private)
                complete(bio->bi_private);
        bio_put(bio);
@@@ -3356,7 -3358,11 +3356,7 @@@ static int write_dev_flush(struct btrfs
  
                wait_for_completion(&device->flush_wait);
  
 -              if (bio_flagged(bio, BIO_EOPNOTSUPP)) {
 -                      printk_in_rcu("BTRFS: disabling barriers on dev %s\n",
 -                                    rcu_str_deref(device->name));
 -                      device->nobarriers = 1;
 -              } else if (!bio_flagged(bio, BIO_UPTODATE)) {
 +              if (!bio_flagged(bio, BIO_UPTODATE)) {
                        ret = -EIO;
                        btrfs_dev_stat_inc_and_print(device,
                                BTRFS_DEV_STAT_FLUSH_ERRS);
@@@ -3761,6 -3767,15 +3761,15 @@@ void close_ctree(struct btrfs_root *roo
        cancel_work_sync(&fs_info->async_reclaim_work);
  
        if (!(fs_info->sb->s_flags & MS_RDONLY)) {
+               /*
+                * If the cleaner thread is stopped and there are
+                * block groups queued for removal, the deletion will be
+                * skipped when we quit the cleaner thread.
+                */
+               mutex_lock(&root->fs_info->cleaner_mutex);
+               btrfs_delete_unused_bgs(root->fs_info);
+               mutex_unlock(&root->fs_info->cleaner_mutex);
                ret = btrfs_commit_super(root);
                if (ret)
                        btrfs_err(fs_info, "commit super ret %d", ret);
diff --combined fs/btrfs/extent-tree.c
index 59d59d98bca141bd4665730911cd5ec95ed25863,6b791f39469827721e0b465a8c9105ad3949ac98..5411f0ab56831aa3923f5d4b2d1d53adf9a88991
@@@ -1316,7 -1316,8 +1316,7 @@@ static noinline int remove_extent_data_
        return ret;
  }
  
 -static noinline u32 extent_data_ref_count(struct btrfs_root *root,
 -                                        struct btrfs_path *path,
 +static noinline u32 extent_data_ref_count(struct btrfs_path *path,
                                          struct btrfs_extent_inline_ref *iref)
  {
        struct btrfs_key key;
@@@ -1882,10 -1883,77 +1882,77 @@@ static int remove_extent_backref(struc
        return ret;
  }
  
- static int btrfs_issue_discard(struct block_device *bdev,
-                               u64 start, u64 len)
+ #define in_range(b, first, len)        ((b) >= (first) && (b) < (first) + (len))
+ static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len,
+                              u64 *discarded_bytes)
  {
-       return blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_NOFS, 0);
+       int j, ret = 0;
+       u64 bytes_left, end;
+       u64 aligned_start = ALIGN(start, 1 << 9);
+       if (WARN_ON(start != aligned_start)) {
+               len -= aligned_start - start;
+               len = round_down(len, 1 << 9);
+               start = aligned_start;
+       }
+       *discarded_bytes = 0;
+       if (!len)
+               return 0;
+       end = start + len;
+       bytes_left = len;
+       /* Skip any superblocks on this device. */
+       for (j = 0; j < BTRFS_SUPER_MIRROR_MAX; j++) {
+               u64 sb_start = btrfs_sb_offset(j);
+               u64 sb_end = sb_start + BTRFS_SUPER_INFO_SIZE;
+               u64 size = sb_start - start;
+               if (!in_range(sb_start, start, bytes_left) &&
+                   !in_range(sb_end, start, bytes_left) &&
+                   !in_range(start, sb_start, BTRFS_SUPER_INFO_SIZE))
+                       continue;
+               /*
+                * Superblock spans beginning of range.  Adjust start and
+                * try again.
+                */
+               if (sb_start <= start) {
+                       start += sb_end - start;
+                       if (start > end) {
+                               bytes_left = 0;
+                               break;
+                       }
+                       bytes_left = end - start;
+                       continue;
+               }
+               if (size) {
+                       ret = blkdev_issue_discard(bdev, start >> 9, size >> 9,
+                                                  GFP_NOFS, 0);
+                       if (!ret)
+                               *discarded_bytes += size;
+                       else if (ret != -EOPNOTSUPP)
+                               return ret;
+               }
+               start = sb_end;
+               if (start > end) {
+                       bytes_left = 0;
+                       break;
+               }
+               bytes_left = end - start;
+       }
+       if (bytes_left) {
+               ret = blkdev_issue_discard(bdev, start >> 9, bytes_left >> 9,
+                                          GFP_NOFS, 0);
+               if (!ret)
+                       *discarded_bytes += bytes_left;
+       }
+       return ret;
  }
  
  int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
  
  
                for (i = 0; i < bbio->num_stripes; i++, stripe++) {
+                       u64 bytes;
                        if (!stripe->dev->can_discard)
                                continue;
  
                        ret = btrfs_issue_discard(stripe->dev->bdev,
                                                  stripe->physical,
-                                                 stripe->length);
+                                                 stripe->length,
+                                                 &bytes);
                        if (!ret)
-                               discarded_bytes += stripe->length;
+                               discarded_bytes += bytes;
                        else if (ret != -EOPNOTSUPP)
                                break; /* Logic errors or -ENOMEM, or -EIO but I don't know how that could happen JDM */
  
@@@ -6061,20 -6131,19 +6130,19 @@@ int btrfs_finish_extent_commit(struct b
                               struct btrfs_root *root)
  {
        struct btrfs_fs_info *fs_info = root->fs_info;
+       struct btrfs_block_group_cache *block_group, *tmp;
+       struct list_head *deleted_bgs;
        struct extent_io_tree *unpin;
        u64 start;
        u64 end;
        int ret;
  
-       if (trans->aborted)
-               return 0;
        if (fs_info->pinned_extents == &fs_info->freed_extents[0])
                unpin = &fs_info->freed_extents[1];
        else
                unpin = &fs_info->freed_extents[0];
  
-       while (1) {
+       while (!trans->aborted) {
                mutex_lock(&fs_info->unused_bg_unpin_mutex);
                ret = find_first_extent_bit(unpin, 0, &start, &end,
                                            EXTENT_DIRTY, NULL);
                cond_resched();
        }
  
+       /*
+        * Transaction is finished.  We don't need the lock anymore.  We
+        * do need to clean up the block groups in case of a transaction
+        * abort.
+        */
+       deleted_bgs = &trans->transaction->deleted_bgs;
+       list_for_each_entry_safe(block_group, tmp, deleted_bgs, bg_list) {
+               u64 trimmed = 0;
+               ret = -EROFS;
+               if (!trans->aborted)
+                       ret = btrfs_discard_extent(root,
+                                                  block_group->key.objectid,
+                                                  block_group->key.offset,
+                                                  &trimmed);
+               list_del_init(&block_group->bg_list);
+               btrfs_put_block_group_trimming(block_group);
+               btrfs_put_block_group(block_group);
+               if (ret) {
+                       const char *errstr = btrfs_decode_error(ret);
+                       btrfs_warn(fs_info,
+                                  "Discard failed while removing blockgroup: errno=%d %s\n",
+                                  ret, errstr);
+               }
+       }
        return 0;
  }
  
@@@ -6348,7 -6445,7 +6444,7 @@@ static int __btrfs_free_extent(struct b
        } else {
                if (found_extent) {
                        BUG_ON(is_data && refs_to_drop !=
 -                             extent_data_ref_count(root, path, iref));
 +                             extent_data_ref_count(path, iref));
                        if (iref) {
                                BUG_ON(path->slots[0] != extent_slot);
                        } else {
@@@ -7566,6 -7663,9 +7662,6 @@@ static void unuse_block_rsv(struct btrf
  
  /*
   * finds a free extent and does all the dirty work required for allocation
 - * returns the key for the extent through ins, and a tree buffer for
 - * the first block of the extent through buf.
 - *
   * returns the tree buffer or an ERR_PTR on error.
   */
  struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
@@@ -8719,13 -8819,14 +8815,13 @@@ static u64 update_block_group_flags(str
        return flags;
  }
  
 -static int set_block_group_ro(struct btrfs_block_group_cache *cache, int force)
 +static int inc_block_group_ro(struct btrfs_block_group_cache *cache, int force)
  {
        struct btrfs_space_info *sinfo = cache->space_info;
        u64 num_bytes;
        u64 min_allocable_bytes;
        int ret = -ENOSPC;
  
 -
        /*
         * We need some metadata space and system metadata space for
         * allocating chunks in some corner cases until we force to set
        spin_lock(&cache->lock);
  
        if (cache->ro) {
 +              cache->ro++;
                ret = 0;
                goto out;
        }
            sinfo->bytes_may_use + sinfo->bytes_readonly + num_bytes +
            min_allocable_bytes <= sinfo->total_bytes) {
                sinfo->bytes_readonly += num_bytes;
 -              cache->ro = 1;
 +              cache->ro++;
                list_add_tail(&cache->ro_list, &sinfo->ro_bgs);
                ret = 0;
        }
@@@ -8764,7 -8864,7 +8860,7 @@@ out
        return ret;
  }
  
 -int btrfs_set_block_group_ro(struct btrfs_root *root,
 +int btrfs_inc_block_group_ro(struct btrfs_root *root,
                             struct btrfs_block_group_cache *cache)
  
  {
        u64 alloc_flags;
        int ret;
  
 -      BUG_ON(cache->ro);
 -
  again:
        trans = btrfs_join_transaction(root);
        if (IS_ERR(trans))
                        goto out;
        }
  
 -      ret = set_block_group_ro(cache, 0);
 +      ret = inc_block_group_ro(cache, 0);
        if (!ret)
                goto out;
        alloc_flags = get_alloc_profile(root, cache->space_info->flags);
                             CHUNK_ALLOC_FORCE);
        if (ret < 0)
                goto out;
 -      ret = set_block_group_ro(cache, 0);
 +      ret = inc_block_group_ro(cache, 0);
  out:
        if (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM) {
                alloc_flags = update_block_group_flags(root, cache->flags);
@@@ -8885,7 -8987,7 +8981,7 @@@ u64 btrfs_account_ro_block_groups_free_
        return free_bytes;
  }
  
 -void btrfs_set_block_group_rw(struct btrfs_root *root,
 +void btrfs_dec_block_group_ro(struct btrfs_root *root,
                              struct btrfs_block_group_cache *cache)
  {
        struct btrfs_space_info *sinfo = cache->space_info;
  
        spin_lock(&sinfo->lock);
        spin_lock(&cache->lock);
 -      num_bytes = cache->key.offset - cache->reserved - cache->pinned -
 -                  cache->bytes_super - btrfs_block_group_used(&cache->item);
 -      sinfo->bytes_readonly -= num_bytes;
 -      cache->ro = 0;
 -      list_del_init(&cache->ro_list);
 +      if (!--cache->ro) {
 +              num_bytes = cache->key.offset - cache->reserved -
 +                          cache->pinned - cache->bytes_super -
 +                          btrfs_block_group_used(&cache->item);
 +              sinfo->bytes_readonly -= num_bytes;
 +              list_del_init(&cache->ro_list);
 +      }
        spin_unlock(&cache->lock);
        spin_unlock(&sinfo->lock);
  }
@@@ -9417,7 -9517,7 +9513,7 @@@ int btrfs_read_block_groups(struct btrf
  
                set_avail_alloc_bits(root->fs_info, cache->flags);
                if (btrfs_chunk_readonly(root, cache->key.objectid)) {
 -                      set_block_group_ro(cache, 1);
 +                      inc_block_group_ro(cache, 1);
                } else if (btrfs_block_group_used(&cache->item) == 0) {
                        spin_lock(&info->unused_bgs_lock);
                        /* Should always be true but just in case. */
                list_for_each_entry(cache,
                                &space_info->block_groups[BTRFS_RAID_RAID0],
                                list)
 -                      set_block_group_ro(cache, 1);
 +                      inc_block_group_ro(cache, 1);
                list_for_each_entry(cache,
                                &space_info->block_groups[BTRFS_RAID_SINGLE],
                                list)
 -                      set_block_group_ro(cache, 1);
 +                      inc_block_group_ro(cache, 1);
        }
  
        init_global_block_rsv(info);
@@@ -9830,6 -9930,11 +9926,11 @@@ int btrfs_remove_block_group(struct btr
         * currently running transaction might finish and a new one start,
         * allowing for new block groups to be created that can reuse the same
         * physical device locations unless we take this special care.
+        *
+        * There may also be an implicit trim operation if the file system
+        * is mounted with -odiscard. The same protections must remain
+        * in place until the extents have been discarded completely when
+        * the transaction commit has completed.
         */
        remove_em = (atomic_read(&block_group->trimming) == 0);
        /*
@@@ -9904,6 -10009,7 +10005,7 @@@ void btrfs_delete_unused_bgs(struct btr
        spin_lock(&fs_info->unused_bgs_lock);
        while (!list_empty(&fs_info->unused_bgs)) {
                u64 start, end;
+               int trimming;
  
                block_group = list_first_entry(&fs_info->unused_bgs,
                                               struct btrfs_block_group_cache,
                spin_unlock(&block_group->lock);
  
                /* We don't want to force the issue, only flip if it's ok. */
 -              ret = set_block_group_ro(block_group, 0);
 +              ret = inc_block_group_ro(block_group, 0);
                up_write(&space_info->groups_sem);
                if (ret < 0) {
                        ret = 0;
                /* 1 for btrfs_orphan_reserve_metadata() */
                trans = btrfs_start_transaction(root, 1);
                if (IS_ERR(trans)) {
 -                      btrfs_set_block_group_rw(root, block_group);
 +                      btrfs_dec_block_group_ro(root, block_group);
                        ret = PTR_ERR(trans);
                        goto next;
                }
                                  EXTENT_DIRTY, GFP_NOFS);
                if (ret) {
                        mutex_unlock(&fs_info->unused_bg_unpin_mutex);
 -                      btrfs_set_block_group_rw(root, block_group);
 +                      btrfs_dec_block_group_ro(root, block_group);
                        goto end_trans;
                }
                ret = clear_extent_bits(&fs_info->freed_extents[1], start, end,
                                  EXTENT_DIRTY, GFP_NOFS);
                if (ret) {
                        mutex_unlock(&fs_info->unused_bg_unpin_mutex);
 -                      btrfs_set_block_group_rw(root, block_group);
 +                      btrfs_dec_block_group_ro(root, block_group);
                        goto end_trans;
                }
                mutex_unlock(&fs_info->unused_bg_unpin_mutex);
                spin_unlock(&block_group->lock);
                spin_unlock(&space_info->lock);
  
+               /* DISCARD can flip during remount */
+               trimming = btrfs_test_opt(root, DISCARD);
+               /* Implicit trim during transaction commit. */
+               if (trimming)
+                       btrfs_get_block_group_trimming(block_group);
                /*
                 * Btrfs_remove_chunk will abort the transaction if things go
                 * horribly wrong.
                 */
                ret = btrfs_remove_chunk(trans, root,
                                         block_group->key.objectid);
+               if (ret) {
+                       if (trimming)
+                               btrfs_put_block_group_trimming(block_group);
+                       goto end_trans;
+               }
+               /*
+                * If we're not mounted with -odiscard, we can just forget
+                * about this block group. Otherwise we'll need to wait
+                * until transaction commit to do the actual discard.
+                */
+               if (trimming) {
+                       WARN_ON(!list_empty(&block_group->bg_list));
+                       spin_lock(&trans->transaction->deleted_bgs_lock);
+                       list_move(&block_group->bg_list,
+                                 &trans->transaction->deleted_bgs);
+                       spin_unlock(&trans->transaction->deleted_bgs_lock);
+                       btrfs_get_block_group(block_group);
+               }
  end_trans:
                btrfs_end_transaction(trans, root);
  next:
@@@ -10062,10 -10195,99 +10191,99 @@@ int btrfs_error_unpin_extent_range(stru
        return unpin_extent_range(root, start, end, false);
  }
  
+ /*
+  * It used to be that old block groups would be left around forever.
+  * Iterating over them would be enough to trim unused space.  Since we
+  * now automatically remove them, we also need to iterate over unallocated
+  * space.
+  *
+  * We don't want a transaction for this since the discard may take a
+  * substantial amount of time.  We don't require that a transaction be
+  * running, but we do need to take a running transaction into account
+  * to ensure that we're not discarding chunks that were released in
+  * the current transaction.
+  *
+  * Holding the chunks lock will prevent other threads from allocating
+  * or releasing chunks, but it won't prevent a running transaction
+  * from committing and releasing the memory that the pending chunks
+  * list head uses.  For that, we need to take a reference to the
+  * transaction.
+  */
+ static int btrfs_trim_free_extents(struct btrfs_device *device,
+                                  u64 minlen, u64 *trimmed)
+ {
+       u64 start = 0, len = 0;
+       int ret;
+       *trimmed = 0;
+       /* Not writeable = nothing to do. */
+       if (!device->writeable)
+               return 0;
+       /* No free space = nothing to do. */
+       if (device->total_bytes <= device->bytes_used)
+               return 0;
+       ret = 0;
+       while (1) {
+               struct btrfs_fs_info *fs_info = device->dev_root->fs_info;
+               struct btrfs_transaction *trans;
+               u64 bytes;
+               ret = mutex_lock_interruptible(&fs_info->chunk_mutex);
+               if (ret)
+                       return ret;
+               down_read(&fs_info->commit_root_sem);
+               spin_lock(&fs_info->trans_lock);
+               trans = fs_info->running_transaction;
+               if (trans)
+                       atomic_inc(&trans->use_count);
+               spin_unlock(&fs_info->trans_lock);
+               ret = find_free_dev_extent_start(trans, device, minlen, start,
+                                                &start, &len);
+               if (trans)
+                       btrfs_put_transaction(trans);
+               if (ret) {
+                       up_read(&fs_info->commit_root_sem);
+                       mutex_unlock(&fs_info->chunk_mutex);
+                       if (ret == -ENOSPC)
+                               ret = 0;
+                       break;
+               }
+               ret = btrfs_issue_discard(device->bdev, start, len, &bytes);
+               up_read(&fs_info->commit_root_sem);
+               mutex_unlock(&fs_info->chunk_mutex);
+               if (ret)
+                       break;
+               start += len;
+               *trimmed += bytes;
+               if (fatal_signal_pending(current)) {
+                       ret = -ERESTARTSYS;
+                       break;
+               }
+               cond_resched();
+       }
+       return ret;
+ }
  int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range)
  {
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct btrfs_block_group_cache *cache = NULL;
+       struct btrfs_device *device;
+       struct list_head *devices;
        u64 group_trimmed;
        u64 start;
        u64 end;
                cache = next_block_group(fs_info->tree_root, cache);
        }
  
+       mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
+       devices = &root->fs_info->fs_devices->alloc_list;
+       list_for_each_entry(device, devices, dev_alloc_list) {
+               ret = btrfs_trim_free_extents(device, range->minlen,
+                                             &group_trimmed);
+               if (ret)
+                       break;
+               trimmed += group_trimmed;
+       }
+       mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
        range->len = trimmed;
        return ret;
  }
diff --combined fs/btrfs/super.c
index d366dd4664d066269e9394403c060fece1a9100d,8da24e24289630631d0e4930380de96e996c5e33..c389c13f0f383c4e086bb12045a47c4cfb343ef8
@@@ -69,7 -69,7 +69,7 @@@ static struct file_system_type btrfs_fs
  
  static int btrfs_remount(struct super_block *sb, int *flags, char *data);
  
- static const char *btrfs_decode_error(int errno)
+ const char *btrfs_decode_error(int errno)
  {
        char *errstr = "unknown";
  
@@@ -1033,7 -1033,6 +1033,7 @@@ static int btrfs_fill_super(struct supe
        sb->s_flags |= MS_POSIXACL;
  #endif
        sb->s_flags |= MS_I_VERSION;
 +      sb->s_iflags |= SB_I_CGROUPWB;
        err = open_ctree(sb, fs_devices, (char *)data);
        if (err) {
                printk(KERN_ERR "BTRFS: open_ctree failed\n");
@@@ -1651,6 -1650,17 +1651,17 @@@ static int btrfs_remount(struct super_b
  
                sb->s_flags |= MS_RDONLY;
  
+               /*
+                * Setting MS_RDONLY will put the cleaner thread to
+                * sleep at the next loop if it's already active.
+                * If it's already asleep, we'll leave unused block
+                * groups on disk until we're mounted read-write again
+                * unless we clean them up here.
+                */
+               mutex_lock(&root->fs_info->cleaner_mutex);
+               btrfs_delete_unused_bgs(fs_info);
+               mutex_unlock(&root->fs_info->cleaner_mutex);
                btrfs_dev_replace_suspend_for_unmount(fs_info);
                btrfs_scrub_cancel(fs_info);
                btrfs_pause_balance(fs_info);
diff --combined fs/btrfs/transaction.c
index 91f44c9f7ebcd5ce2ead5ca05f7fb677179be920,44da9299a25bf0e4d35942428a1be933880ba576..20267d47dbcd501dda238e7ff1ce0381137473a7
@@@ -258,6 -258,8 +258,8 @@@ loop
        mutex_init(&cur_trans->cache_write_mutex);
        cur_trans->num_dirty_bgs = 0;
        spin_lock_init(&cur_trans->dirty_bgs_lock);
+       INIT_LIST_HEAD(&cur_trans->deleted_bgs);
+       spin_lock_init(&cur_trans->deleted_bgs_lock);
        list_add_tail(&cur_trans->list, &fs_info->trans_list);
        extent_io_tree_init(&cur_trans->dirty_pages,
                             fs_info->btree_inode->i_mapping);
@@@ -1301,7 -1303,7 +1303,7 @@@ static noinline int create_pending_snap
         */
        btrfs_set_skip_qgroup(trans, objectid);
  
 -      btrfs_reloc_pre_snapshot(trans, pending, &to_reserve);
 +      btrfs_reloc_pre_snapshot(pending, &to_reserve);
  
        if (to_reserve > 0) {
                pending->error = btrfs_block_rsv_add(root,
diff --combined fs/btrfs/volumes.c
index 88e2fe931bde2320b4df5a7e9310dff76e2a3bca,141c6051cf58ac40b43466d82419035b51542b88..7c84a8122c37847c0c0ccb6cfb9818954db1e1df
@@@ -349,7 -349,7 +349,7 @@@ loop_lock
                    waitqueue_active(&fs_info->async_submit_wait))
                        wake_up(&fs_info->async_submit_wait);
  
 -              BUG_ON(atomic_read(&cur->bi_cnt) == 0);
 +              BUG_ON(atomic_read(&cur->__bi_cnt) == 0);
  
                /*
                 * if we're doing the sync list, record that our
        return ret;
  }
  
- static int contains_pending_extent(struct btrfs_trans_handle *trans,
+ static int contains_pending_extent(struct btrfs_transaction *transaction,
                                   struct btrfs_device *device,
                                   u64 *start, u64 len)
  {
+       struct btrfs_fs_info *fs_info = device->dev_root->fs_info;
        struct extent_map *em;
-       struct list_head *search_list = &trans->transaction->pending_chunks;
+       struct list_head *search_list = &fs_info->pinned_chunks;
        int ret = 0;
        u64 physical_start = *start;
  
+       if (transaction)
+               search_list = &transaction->pending_chunks;
  again:
        list_for_each_entry(em, search_list, list) {
                struct map_lookup *map;
                        }
                }
        }
-       if (search_list == &trans->transaction->pending_chunks) {
-               search_list = &trans->root->fs_info->pinned_chunks;
+       if (search_list != &fs_info->pinned_chunks) {
+               search_list = &fs_info->pinned_chunks;
                goto again;
        }
  
  
  
  /*
-  * find_free_dev_extent - find free space in the specified device
-  * @device:   the device which we search the free space in
-  * @num_bytes:        the size of the free space that we need
-  * @start:    store the start of the free space.
-  * @len:      the size of the free space. that we find, or the size of the max
-  *            free space if we don't find suitable free space
+  * find_free_dev_extent_start - find free space in the specified device
+  * @device:     the device which we search the free space in
+  * @num_bytes:          the size of the free space that we need
+  * @search_start: the position from which to begin the search
+  * @start:      store the start of the free space.
+  * @len:        the size of the free space. that we find, or the size
+  *              of the max free space if we don't find suitable free space
   *
   * this uses a pretty simple search, the expectation is that it is
   * called very infrequently and that a given device has a small number
   * But if we don't find suitable free space, it is used to store the size of
   * the max free space.
   */
- int find_free_dev_extent(struct btrfs_trans_handle *trans,
-                        struct btrfs_device *device, u64 num_bytes,
-                        u64 *start, u64 *len)
+ int find_free_dev_extent_start(struct btrfs_transaction *transaction,
+                              struct btrfs_device *device, u64 num_bytes,
+                              u64 search_start, u64 *start, u64 *len)
  {
        struct btrfs_key key;
        struct btrfs_root *root = device->dev_root;
        u64 max_hole_start;
        u64 max_hole_size;
        u64 extent_end;
-       u64 search_start;
        u64 search_end = device->total_bytes;
        int ret;
        int slot;
        struct extent_buffer *l;
  
-       /* FIXME use last free of some kind */
-       /* we don't want to overwrite the superblock on the drive,
-        * so we make sure to start at an offset of at least 1MB
-        */
-       search_start = max(root->fs_info->alloc_start, 1024ull * 1024);
        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;
@@@ -1273,7 -1269,7 +1269,7 @@@ again
                         * Have to check before we set max_hole_start, otherwise
                         * we could end up sending back this offset anyway.
                         */
-                       if (contains_pending_extent(trans, device,
+                       if (contains_pending_extent(transaction, device,
                                                    &search_start,
                                                    hole_size)) {
                                if (key.offset >= search_start) {
@@@ -1322,7 -1318,7 +1318,7 @@@ next
        if (search_end > search_start) {
                hole_size = search_end - search_start;
  
-               if (contains_pending_extent(trans, device, &search_start,
+               if (contains_pending_extent(transaction, device, &search_start,
                                            hole_size)) {
                        btrfs_release_path(path);
                        goto again;
        return ret;
  }
  
+ int find_free_dev_extent(struct btrfs_trans_handle *trans,
+                        struct btrfs_device *device, u64 num_bytes,
+                        u64 *start, u64 *len)
+ {
+       struct btrfs_root *root = device->dev_root;
+       u64 search_start;
+       /* FIXME use last free of some kind */
+       /*
+        * we don't want to overwrite the superblock on the drive,
+        * so we make sure to start at an offset of at least 1MB
+        */
+       search_start = max(root->fs_info->alloc_start, 1024ull * 1024);
+       return find_free_dev_extent_start(trans->transaction, device,
+                                         num_bytes, search_start, start, len);
+ }
  static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
                          struct btrfs_device *device,
                          u64 start, u64 *dev_extent_len)
@@@ -2755,7 -2769,9 +2769,7 @@@ out
        return ret;
  }
  
 -static int btrfs_relocate_chunk(struct btrfs_root *root,
 -                              u64 chunk_objectid,
 -                              u64 chunk_offset)
 +static int btrfs_relocate_chunk(struct btrfs_root *root, u64 chunk_offset)
  {
        struct btrfs_root *extent_root;
        struct btrfs_trans_handle *trans;
                return -ENOSPC;
  
        /* step one, relocate all the extents inside this chunk */
 +      btrfs_scrub_pause(root);
        ret = btrfs_relocate_block_group(extent_root, chunk_offset);
 +      btrfs_scrub_continue(root);
        if (ret)
                return ret;
  
@@@ -2855,6 -2869,7 +2869,6 @@@ again
  
                if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) {
                        ret = btrfs_relocate_chunk(chunk_root,
 -                                                 found_key.objectid,
                                                   found_key.offset);
                        if (ret == -ENOSPC)
                                failed++;
@@@ -3374,6 -3389,7 +3388,6 @@@ again
                }
  
                ret = btrfs_relocate_chunk(chunk_root,
 -                                         found_key.objectid,
                                           found_key.offset);
                mutex_unlock(&fs_info->delete_unused_bgs_mutex);
                if (ret && ret != -ENOSPC)
@@@ -4075,6 -4091,7 +4089,6 @@@ int btrfs_shrink_device(struct btrfs_de
        struct btrfs_dev_extent *dev_extent = NULL;
        struct btrfs_path *path;
        u64 length;
 -      u64 chunk_objectid;
        u64 chunk_offset;
        int ret;
        int slot;
@@@ -4151,10 -4168,11 +4165,10 @@@ again
                        break;
                }
  
 -              chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent);
                chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
                btrfs_release_path(path);
  
 -              ret = btrfs_relocate_chunk(root, chunk_objectid, chunk_offset);
 +              ret = btrfs_relocate_chunk(root, chunk_offset);
                mutex_unlock(&root->fs_info->delete_unused_bgs_mutex);
                if (ret && ret != -ENOSPC)
                        goto done;
                u64 start = new_size;
                u64 len = old_size - new_size;
  
-               if (contains_pending_extent(trans, device, &start, len)) {
+               if (contains_pending_extent(trans->transaction, device,
+                                           &start, len)) {
                        unlock_chunks(root);
                        checked_pending_chunks = true;
                        failed = 0;
@@@ -5739,10 -5758,10 +5754,10 @@@ int btrfs_rmap_block(struct btrfs_mappi
  
  static inline void btrfs_end_bbio(struct btrfs_bio *bbio, struct bio *bio, int err)
  {
 -      if (likely(bbio->flags & BTRFS_BIO_ORIG_BIO_SUBMITTED))
 -              bio_endio_nodec(bio, err);
 -      else
 -              bio_endio(bio, err);
 +      bio->bi_private = bbio->private;
 +      bio->bi_end_io = bbio->end_io;
 +      bio_endio(bio, err);
 +
        btrfs_put_bbio(bbio);
  }
  
@@@ -5786,6 -5805,8 +5801,6 @@@ static void btrfs_end_bio(struct bio *b
                        bio = bbio->orig_bio;
                }
  
 -              bio->bi_private = bbio->private;
 -              bio->bi_end_io = bbio->end_io;
                btrfs_io_bio(bio)->mirror_num = bbio->mirror_num;
                /* only send an error to the higher layers if it is
                 * beyond the tolerance of the btrfs bio
@@@ -5942,14 -5963,6 +5957,14 @@@ again
        if (!bio)
                return -ENOMEM;
  
 +      if (first_bio->bi_ioc) {
 +              get_io_context_active(first_bio->bi_ioc);
 +              bio->bi_ioc = first_bio->bi_ioc;
 +      }
 +      if (first_bio->bi_css) {
 +              css_get(first_bio->bi_css);
 +              bio->bi_css = first_bio->bi_css;
 +      }
        while (bvec <= (first_bio->bi_io_vec + first_bio->bi_vcnt - 1)) {
                if (bio_add_page(bio, bvec->bv_page, bvec->bv_len,
                                 bvec->bv_offset) < bvec->bv_len) {
@@@ -5975,6 -5988,8 +5990,6 @@@ static void bbio_error(struct btrfs_bi
                /* Shoud be the original bio. */
                WARN_ON(bio != bbio->orig_bio);
  
 -              bio->bi_private = bbio->private;
 -              bio->bi_end_io = bbio->end_io;
                btrfs_io_bio(bio)->mirror_num = bbio->mirror_num;
                bio->bi_iter.bi_sector = logical >> 9;
  
@@@ -6055,8 -6070,10 +6070,8 @@@ int btrfs_map_bio(struct btrfs_root *ro
                if (dev_nr < total_devs - 1) {
                        bio = btrfs_bio_clone(first_bio, GFP_NOFS);
                        BUG_ON(!bio); /* -ENOMEM */
 -              } else {
 +              } else
                        bio = first_bio;
 -                      bbio->flags |= BTRFS_BIO_ORIG_BIO_SUBMITTED;
 -              }
  
                submit_stripe_bio(root, bbio, bio,
                                  bbio->stripes[dev_nr].physical, dev_nr, rw,
diff --combined fs/btrfs/volumes.h
index 95842a909e7f7cbbb9be2a70e24376db7f08baf7,57b0217b5300748e3b181e0cc2987ff8e3cfece4..2ca784a14e84bc2a00d0c3d1ec1a15290128edfc
@@@ -298,6 -298,8 +298,6 @@@ struct btrfs_bio_stripe 
  struct btrfs_bio;
  typedef void (btrfs_bio_end_io_t) (struct btrfs_bio *bio, int err);
  
 -#define BTRFS_BIO_ORIG_BIO_SUBMITTED  (1 << 0)
 -
  struct btrfs_bio {
        atomic_t refs;
        atomic_t stripes_pending;
@@@ -453,6 -455,9 +453,9 @@@ int btrfs_cancel_balance(struct btrfs_f
  int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info);
  int btrfs_check_uuid_tree(struct btrfs_fs_info *fs_info);
  int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset);
+ int find_free_dev_extent_start(struct btrfs_transaction *transaction,
+                        struct btrfs_device *device, u64 num_bytes,
+                        u64 search_start, u64 *start, u64 *max_avail);
  int find_free_dev_extent(struct btrfs_trans_handle *trans,
                         struct btrfs_device *device, u64 num_bytes,
                         u64 *start, u64 *max_avail);