Btrfs: Fix streaming read performance with checksumming on
author: Chris Mason <chris.mason@oracle.com>
Thu, 31 Jul 2008 19:42:53 +0000 (15:42 -0400)
committer: Chris Mason <chris.mason@oracle.com>
Thu, 25 Sep 2008 15:04:05 +0000 (11:04 -0400)
Large streaming reads make for large bios, which means each entry on the
async work queue lists represents a large amount of data.  IO
congestion throttling on the device was kicking in before the async
worker threads decided a single thread was busy and needed some help.

The end result was that a streaming read would result in a single CPU
running at 100% instead of balancing the work off to other CPUs.

This patch also changes the pre-IO checksum lookup done by reads to
work on a per-bio basis instead of a per-page basis.  The per-page
lookups resulted in many extra btree lookups on large streaming reads.
Doing the checksum lookup right before bio submit allows us to reuse
searches while processing adjacent offsets.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
fs/btrfs/async-thread.c
fs/btrfs/ctree.h
fs/btrfs/disk-io.c
fs/btrfs/file-item.c
fs/btrfs/inode.c

index 5fe6a0d532ed46a353b663ce4a13692313522399..bc2980c433ef7a8b980972c87d75e0578f3ad33d 100644 (file)
@@ -160,7 +160,7 @@ void btrfs_init_workers(struct btrfs_workers *workers, int max)
        INIT_LIST_HEAD(&workers->idle_list);
        spin_lock_init(&workers->lock);
        workers->max_workers = max;
-       workers->idle_thresh = 64;
+       workers->idle_thresh = 32;
 }
 
 /*
index be16cd49ef69df1c797ee88b0465d04144e78be2..d788ab0dcd96b72eae4f0c399dca03ef79385992 100644 (file)
@@ -1613,6 +1613,8 @@ int btrfs_lookup_inode(struct btrfs_trans_handle *trans, struct btrfs_root
                       struct btrfs_key *location, int mod);
 
 /* file-item.c */
+int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
+                         struct bio *bio);
 int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
                               struct btrfs_root *root,
                               u64 objectid, u64 pos, u64 disk_offset,
index e826730d750fe156d64c0ff4ca2efe44e1118e5d..d2d1cc87e8ad65da8c159946d6241117e0548f5e 100644 (file)
@@ -1357,10 +1357,25 @@ struct btrfs_root *open_ctree(struct super_block *sb,
         */
        btrfs_init_workers(&fs_info->workers, fs_info->thread_pool_size);
        btrfs_init_workers(&fs_info->submit_workers, fs_info->thread_pool_size);
+
+       /* a higher idle thresh on the submit workers makes it much more
+        * likely that bios will be sent down in a sane order to the
+        * devices
+        */
+       fs_info->submit_workers.idle_thresh = 64;
+
        btrfs_init_workers(&fs_info->fixup_workers, 1);
        btrfs_init_workers(&fs_info->endio_workers, fs_info->thread_pool_size);
        btrfs_init_workers(&fs_info->endio_write_workers,
                           fs_info->thread_pool_size);
+
+       /*
+        * endios are largely parallel and should have a very
+        * low idle thresh
+        */
+       fs_info->endio_workers.idle_thresh = 4;
+       fs_info->endio_write_workers.idle_thresh = 4;
+
        btrfs_start_workers(&fs_info->workers, 1);
        btrfs_start_workers(&fs_info->submit_workers, 1);
        btrfs_start_workers(&fs_info->fixup_workers, 1);
index 2311061f070e06f87f09b66b3369b0ac0fa0a8d8..a5ff19b34b219fc3a00c37e336849433693df34c 100644 (file)
@@ -134,6 +134,104 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
        return ret;
 }
 
+/*
+ * Look up the data checksum for every bio_vec in a read bio and store it
+ * in the inode's io_tree via set_state_private(), keyed by file offset.
+ *
+ * A checksum cached on a pending ordered extent is tried first
+ * (btrfs_find_ordered_sum()); otherwise the csum item is looked up in the
+ * btree.  The last csum item found is reused while later offsets fall
+ * inside it, so adjacent pages do not each pay for a btree search.
+ *
+ * A failed csum lookup is not fatal: a checksum of 0 is recorded for that
+ * offset and the remaining bio_vecs are still processed.
+ *
+ * Returns 0 on success, or -ENOMEM if the path cannot be allocated.
+ */
+int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
+                         struct bio *bio)
+{
+       u32 sum;
+       struct bio_vec *bvec = bio->bi_io_vec;
+       int bio_index = 0;
+       u64 offset;
+       u64 item_start_offset = 0;
+       u64 item_last_offset = 0;
+       u32 diff;
+       int ret;
+       struct btrfs_path *path;
+       struct btrfs_csum_item *item = NULL;
+       struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+
+       path = btrfs_alloc_path();
+       if (!path)
+               return -ENOMEM;
+       path->reada = 2;
+
+       WARN_ON(bio->bi_vcnt <= 0);
+
+       while (bio_index < bio->bi_vcnt) {
+               offset = page_offset(bvec->bv_page) + bvec->bv_offset;
+               ret = btrfs_find_ordered_sum(inode, offset, &sum);
+               if (ret == 0)
+                       goto found;
+
+               if (!item || offset < item_start_offset ||
+                   offset >= item_last_offset) {
+                       struct btrfs_key found_key;
+                       u32 item_size;
+
+                       if (item)
+                               btrfs_release_path(root, path);
+                       item = btrfs_lookup_csum(NULL, root, path,
+                                                inode->i_ino, offset, 0);
+                       if (IS_ERR(item)) {
+                               sum = 0;
+                               printk("no csum found for inode %lu start "
+                                      "%llu\n", inode->i_ino,
+                                      (unsigned long long)offset);
+                               /*
+                                * Drop the error pointer so that a later
+                                * offset inside the stale cached range
+                                * forces a fresh lookup instead of reading
+                                * through it.
+                                */
+                               item = NULL;
+                               btrfs_release_path(root, path);
+                               goto found;
+                       }
+                       btrfs_item_key_to_cpu(path->nodes[0], &found_key,
+                                             path->slots[0]);
+
+                       item_start_offset = found_key.offset;
+                       item_size = btrfs_item_size_nr(path->nodes[0],
+                                                      path->slots[0]);
+                       item_last_offset = item_start_offset +
+                               (item_size / BTRFS_CRC32_SIZE) *
+                               root->sectorsize;
+                       item = btrfs_item_ptr(path->nodes[0], path->slots[0],
+                                             struct btrfs_csum_item);
+               }
+               /*
+                * this byte range must be able to fit inside
+                * a single leaf so it will also fit inside a u32
+                */
+               diff = offset - item_start_offset;
+               diff = diff / root->sectorsize;
+               diff = diff * BTRFS_CRC32_SIZE;
+
+               read_extent_buffer(path->nodes[0], &sum,
+                                  (unsigned long)item + diff,
+                                  BTRFS_CRC32_SIZE);
+found:
+               set_state_private(io_tree, offset, sum);
+               bio_index++;
+               bvec++;
+       }
+       btrfs_free_path(path);
+       return 0;
+}
+
 int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
                       struct bio *bio)
 {
index c4afa9d78da98300a81196dbc63f691b7570f8e3..31d52c51acc374c471c6c50d12c91e1bdbfc27c6 100644 (file)
@@ -374,6 +374,10 @@ int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
        BUG_ON(ret);
 
        if (!(rw & (1 << BIO_RW))) {
+               if (!btrfs_test_opt(root, NODATASUM) &&
+                   !btrfs_test_flag(inode, NODATASUM)) {
+                       btrfs_lookup_bio_sums(root, inode, bio);
+               }
                goto mapit;
        }
 
@@ -598,58 +602,6 @@ int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
        return btrfs_finish_ordered_io(page->mapping->host, start, end);
 }
 
-int btrfs_readpage_io_hook(struct page *page, u64 start, u64 end)
-{
-       int ret = 0;
-       struct inode *inode = page->mapping->host;
-       struct btrfs_root *root = BTRFS_I(inode)->root;
-       struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
-       struct btrfs_csum_item *item;
-       struct btrfs_path *path = NULL;
-       u32 csum;
-
-       if (btrfs_test_opt(root, NODATASUM) ||
-           btrfs_test_flag(inode, NODATASUM))
-               return 0;
-
-       /*
-        * It is possible there is an ordered extent that has
-        * not yet finished for this range in the file.  If so,
-        * that extent will have a csum cached, and it will insert
-        * the sum after all the blocks in the extent are fully
-        * on disk.  So, look for an ordered extent and use the
-        * sum if found.  We have to do this before looking in the
-        * btree because csum items are pre-inserted based on
-        * the file size.  btrfs_lookup_csum might find an item
-        * that still hasn't been fully filled.
-        */
-       ret = btrfs_find_ordered_sum(inode, start, &csum);
-       if (ret == 0)
-               goto found;
-
-       ret = 0;
-       path = btrfs_alloc_path();
-       item = btrfs_lookup_csum(NULL, root, path, inode->i_ino, start, 0);
-       if (IS_ERR(item)) {
-               ret = PTR_ERR(item);
-               /* a csum that isn't present is a preallocated region. */
-               if (ret == -ENOENT || ret == -EFBIG)
-                       ret = 0;
-               csum = 0;
-               printk("no csum found for inode %lu start %Lu\n", inode->i_ino,
-                      start);
-               goto out;
-       }
-       read_extent_buffer(path->nodes[0], &csum, (unsigned long)item,
-                          BTRFS_CRC32_SIZE);
-found:
-       set_state_private(io_tree, start, csum);
-out:
-       if (path)
-               btrfs_free_path(path);
-       return ret;
-}
-
 struct io_failure_record {
        struct page *page;
        u64 start;
@@ -3613,7 +3565,6 @@ static struct extent_io_ops btrfs_extent_io_ops = {
        .fill_delalloc = run_delalloc_range,
        .submit_bio_hook = btrfs_submit_bio_hook,
        .merge_bio_hook = btrfs_merge_bio_hook,
-       .readpage_io_hook = btrfs_readpage_io_hook,
        .readpage_end_io_hook = btrfs_readpage_end_io_hook,
        .writepage_end_io_hook = btrfs_writepage_end_io_hook,
        .writepage_start_hook = btrfs_writepage_start_hook,