Btrfs: Add zlib compression support

author Chris Mason <chris.mason@oracle.com>

Wed, 29 Oct 2008 18:49:59 +0000 (14:49 -0400)

committer Chris Mason <chris.mason@oracle.com>

Wed, 29 Oct 2008 18:49:59 +0000 (14:49 -0400)
author Chris Mason <chris.mason@oracle.com>
Wed, 29 Oct 2008 18:49:59 +0000 (14:49 -0400)
committer Chris Mason <chris.mason@oracle.com>
Wed, 29 Oct 2008 18:49:59 +0000 (14:49 -0400)
diff --git a/fs/Kconfig b/fs/Kconfig

index 18f5a85b47c667a90e3cc8048a76c41fa85a7a8d..31cce5d88b1a001c69a600caf72cb99deb82ea3a 100644 (file)
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -501,6 +501,8 @@ config BTRFS_FS
         tristate "Btrfs filesystem (EXPERIMENTAL) Unstable disk format"
         depends on EXPERIMENTAL
         select LIBCRC32C
+       select ZLIB_INFLATE
+       select ZLIB_DEFLATE
         help
           Btrfs is a new filesystem with extents, writable snapshotting,
           support for multiple devices and many more features.
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile

index 7125716e142b4a75aa411b8b758c632877065cec..d2cf5a54a4b816d308ed7c452892fae27e009db1 100644 (file)
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -7,7 +7,8 @@ btrfs-y := super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
            transaction.o inode.o file.o tree-defrag.o \
            extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \
            extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
-          ref-cache.o export.o tree-log.o acl.o free-space-cache.o
+          ref-cache.o export.o tree-log.o acl.o free-space-cache.o zlib.o \
+          compression.o
  else
  
  # Normal Makefile
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c

new file mode 100644 (file)

index 0000000..c547036
--- /dev/null
+++ b/fs/btrfs/compression.c
@@ -0,0 +1,454 @@
+/*
+ * Copyright (C) 2008 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+#include <linux/kernel.h>
+#include <linux/bio.h>
+#include <linux/buffer_head.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/pagemap.h>
+#include <linux/highmem.h>
+#include <linux/time.h>
+#include <linux/init.h>
+#include <linux/string.h>
+#include <linux/smp_lock.h>
+#include <linux/backing-dev.h>
+#include <linux/mpage.h>
+#include <linux/swap.h>
+#include <linux/writeback.h>
+#include <linux/bit_spinlock.h>
+#include <linux/version.h>
+#include "ctree.h"
+#include "disk-io.h"
+#include "transaction.h"
+#include "btrfs_inode.h"
+#include "volumes.h"
+#include "ordered-data.h"
+#include "compat.h"
+#include "compression.h"
+#include "extent_io.h"
+#include "extent_map.h"
+
+struct compressed_bio {
+       /* number of bios pending for this compressed extent.  Every bio
+        * submitted for the extent points at this struct through
+        * bi_private, and the end_io call that drops the count to zero
+        * does the final processing and frees the struct.
+        */
+       atomic_t pending_bios;
+
+       /* the pages with the compressed data on them; the array itself
+        * is kfree()d by the end_io handlers
+        */
+       struct page **compressed_pages;
+
+       /* inode that owns this data */
+       struct inode *inode;
+
+       /* starting offset in the inode for our pages */
+       u64 start;
+
+       /* number of bytes in the inode we're working on */
+       unsigned long len;
+
+       /* number of bytes on disk */
+       unsigned long compressed_len;
+
+       /* number of compressed pages in the array */
+       unsigned long nr_pages;
+
+       /* IO errors: set to 1 when any bio completes with an error or
+        * when decompression fails, otherwise left at 0
+        */
+       int errors;
+
+       /* for reads, this is the bio we are copying the data into;
+        * the write path sets it to NULL
+        */
+       struct bio *orig_bio;
+};
+
+/*
+ * Allocate a bio aimed at @bdev whose first sector corresponds to the
+ * byte offset @first_byte.  The bio is sized for as many vecs as the
+ * device reports; if that allocation fails while the task is running
+ * with PF_MEMALLOC, the vec count is halved repeatedly until an
+ * allocation succeeds or the count reaches zero.
+ *
+ * Returns the new bio, or NULL when no allocation succeeded.
+ */
+static struct bio *compressed_bio_alloc(struct block_device *bdev,
+                                       u64 first_byte, gfp_t gfp_flags)
+{
+       int vecs = bio_get_nr_vecs(bdev);
+       struct bio *new_bio = bio_alloc(gfp_flags, vecs);
+
+       if (!new_bio && (current->flags & PF_MEMALLOC)) {
+               /* retry with progressively smaller bios */
+               for (vecs /= 2; vecs; vecs /= 2) {
+                       new_bio = bio_alloc(gfp_flags, vecs);
+                       if (new_bio)
+                               break;
+               }
+       }
+
+       if (!new_bio)
+               return NULL;
+
+       new_bio->bi_size = 0;
+       new_bio->bi_bdev = bdev;
+       /* bi_sector is counted in 512 byte units */
+       new_bio->bi_sector = first_byte >> 9;
+       return new_bio;
+}
+
+/*
+ * Completion handler for the bios that read the compressed pages of one
+ * extent from disk.  Once the last of them has finished, the data is
+ * decompressed into the pages of the original bio and the end_io
+ * routines are run on that bio (the pages in the inode address space).
+ *
+ * This allows the checksumming and other IO error handling routines
+ * to work normally.
+ *
+ * The compressed pages are freed here, and this must be run
+ * in process context.
+ */
+static void end_compressed_bio_read(struct bio *bio, int err)
+{
+       struct compressed_bio *cb = bio->bi_private;
+       unsigned long i;
+
+       if (err)
+               cb->errors = 1;
+
+       /* only the last bio to finish for this extent does the work */
+       if (atomic_dec_and_test(&cb->pending_bios)) {
+               /* decompress into the pages of the bio we were handed */
+               if (btrfs_zlib_decompress_biovec(cb->compressed_pages,
+                                                cb->start,
+                                                cb->orig_bio->bi_io_vec,
+                                                cb->orig_bio->bi_vcnt,
+                                                cb->compressed_len))
+                       cb->errors = 1;
+
+               /* drop our references on the compressed pages */
+               for (i = 0; i < cb->nr_pages; i++) {
+                       struct page *cpage = cb->compressed_pages[i];
+
+                       cpage->mapping = NULL;
+                       page_cache_release(cpage);
+               }
+
+               /* complete the original bio, passing on any failure */
+               if (!cb->errors)
+                       bio_endio(cb->orig_bio, 0);
+               else
+                       bio_io_error(cb->orig_bio);
+
+               /* the tracking struct is no longer needed */
+               kfree(cb->compressed_pages);
+               kfree(cb);
+       }
+       bio_put(bio);
+}
+
+/*
+ * Clear the writeback bits on all of the file
+ * pages for a compressed write.
+ *
+ * inode:    inode whose page cache pages are under writeback
+ * start:    file offset of the first byte of the range
+ * ram_size: length of the range in bytes
+ *
+ * The pages are looked up in batches of up to ARRAY_SIZE(pages).
+ * Always returns 0.
+ */
+static noinline int end_compressed_writeback(struct inode *inode, u64 start,
+                                            unsigned long ram_size)
+{
+       unsigned long index = start >> PAGE_CACHE_SHIFT;
+       unsigned long end_index = (start + ram_size - 1) >> PAGE_CACHE_SHIFT;
+       struct page *pages[16];
+       unsigned long nr_pages = end_index - index + 1;
+       int i;
+       int ret;
+
+       while (nr_pages > 0) {
+               /*
+                * min_t() rather than min(): ARRAY_SIZE() yields a size_t,
+                * which is not the same type as unsigned long on every
+                * architecture, and min() complains about mixed types.
+                */
+               ret = find_get_pages_contig(inode->i_mapping, index,
+                                    min_t(unsigned long, nr_pages,
+                                          ARRAY_SIZE(pages)), pages);
+               if (ret == 0) {
+                       /* no page found at this index, step past it */
+                       nr_pages -= 1;
+                       index += 1;
+                       continue;
+               }
+               for (i = 0; i < ret; i++) {
+                       end_page_writeback(pages[i]);
+                       /* drop the reference the lookup took */
+                       page_cache_release(pages[i]);
+               }
+               nr_pages -= ret;
+               index += ret;
+       }
+       /* the inode may be gone now */
+       return 0;
+}
+
+/*
+ * Completion handler for the bios that write the compressed pages of
+ * one extent.  When the last of them has finished this clears
+ * writeback on the file pages and frees the compressed pages.
+ *
+ * It also calls the writeback end hook for the file range so that
+ * metadata and checksums can be updated in the file.
+ */
+static void end_compressed_bio_write(struct bio *bio, int err)
+{
+       struct compressed_bio *cb = bio->bi_private;
+       struct extent_io_tree *io_tree;
+       struct inode *owner;
+       unsigned long i;
+
+       if (err)
+               cb->errors = 1;
+
+       /* bios are still outstanding for this extent: nothing more to do */
+       if (atomic_dec_and_test(&cb->pending_bios)) {
+               /*
+                * we're the last bio for this extent, step one is to
+                * call back into the FS and do all the end_io operations
+                */
+               owner = cb->inode;
+               io_tree = &BTRFS_I(owner)->io_tree;
+               io_tree->ops->writepage_end_io_hook(cb->compressed_pages[0],
+                                                   cb->start,
+                                                   cb->start + cb->len - 1,
+                                                   NULL, 1);
+
+               end_compressed_writeback(owner, cb->start, cb->len);
+               /* note, our inode could be gone now */
+
+               /*
+                * the compressed pages came from alloc_page and are not
+                * attached to the inode at all, so just drop them
+                */
+               for (i = 0; i < cb->nr_pages; i++) {
+                       struct page *cpage = cb->compressed_pages[i];
+
+                       cpage->mapping = NULL;
+                       page_cache_release(cpage);
+               }
+
+               /* finally free the cb struct */
+               kfree(cb->compressed_pages);
+               kfree(cb);
+       }
+       bio_put(bio);
+}
+
+/*
+ * worker function to build and submit bios for previously compressed pages.
+ * The corresponding pages in the inode should be marked for writeback
+ * and the compressed pages should have a reference on them for dropping
+ * when the IO is complete.
+ *
+ * This also checksums the file bytes and gets things ready for
+ * the end io hooks.
+ *
+ * inode:            inode that owns the data
+ * start:            file offset of the range (expected to be page aligned)
+ * len:              number of file bytes covered
+ * disk_start:       byte offset on disk where the compressed data goes
+ * compressed_len:   number of bytes to write to disk
+ * compressed_pages: array of pages holding the compressed data; the
+ *                   write end_io handler releases the pages and
+ *                   kfree()s the array
+ * nr_pages:         number of entries in compressed_pages
+ *
+ * Returns 0 once all bios are submitted, or -ENOMEM if the tracking
+ * struct or the first bio cannot be allocated.  On -ENOMEM nothing has
+ * been submitted or checksummed and compressed_pages is untouched.
+ * NOTE(review): confirm that callers check the return value and clean
+ * up the writeback state themselves in that case.
+ */
+int btrfs_submit_compressed_write(struct inode *inode, u64 start,
+                                unsigned long len, u64 disk_start,
+                                unsigned long compressed_len,
+                                struct page **compressed_pages,
+                                unsigned long nr_pages)
+{
+       struct bio *bio = NULL;
+       struct btrfs_root *root = BTRFS_I(inode)->root;
+       struct compressed_bio *cb;
+       unsigned long bytes_left;
+       struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+       int page_index = 0;
+       struct page *page;
+       u64 first_byte = disk_start;
+       struct block_device *bdev;
+       int ret;
+
+       WARN_ON(start & ((u64)PAGE_CACHE_SIZE - 1));
+       cb = kmalloc(sizeof(*cb), GFP_NOFS);
+       if (!cb)
+               return -ENOMEM;
+
+       atomic_set(&cb->pending_bios, 0);
+       cb->errors = 0;
+       cb->inode = inode;
+       cb->start = start;
+       cb->len = len;
+       cb->compressed_pages = compressed_pages;
+       cb->compressed_len = compressed_len;
+       cb->orig_bio = NULL;
+       cb->nr_pages = nr_pages;
+
+       bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
+
+       /*
+        * get the first bio before doing anything with side effects so
+        * that an allocation failure can simply be reported
+        */
+       bio = compressed_bio_alloc(bdev, first_byte, GFP_NOFS);
+       if (!bio) {
+               kfree(cb);
+               return -ENOMEM;
+       }
+
+       ret = btrfs_csum_file_bytes(root, inode, start, len);
+       BUG_ON(ret);
+
+       bio->bi_private = cb;
+       bio->bi_end_io = end_compressed_bio_write;
+       atomic_inc(&cb->pending_bios);
+
+       /* create and submit bios for the compressed pages */
+       bytes_left = compressed_len;
+       while (bytes_left > 0) {
+               page = compressed_pages[page_index];
+               page->mapping = inode->i_mapping;
+               if (bio->bi_size)
+                       ret = io_tree->ops->merge_bio_hook(page, 0,
+                                                          PAGE_CACHE_SIZE,
+                                                          bio, 0);
+               else
+                       ret = 0;
+
+               /*
+                * the page can't go into the current bio (the merge hook
+                * refused it or the bio is full): send the current bio
+                * down and start a new one with this page
+                */
+               if (ret || bio_add_page(bio, page, PAGE_CACHE_SIZE, 0) <
+                   PAGE_CACHE_SIZE) {
+                       /* hold a ref across the submission calls */
+                       bio_get(bio);
+
+                       ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
+                       BUG_ON(ret);
+
+                       ret = btrfs_map_bio(root, WRITE, bio, 0, 1);
+                       BUG_ON(ret);
+
+                       bio_put(bio);
+
+                       bio = compressed_bio_alloc(bdev, first_byte, GFP_NOFS);
+                       /* part of the extent is already in flight */
+                       BUG_ON(!bio);
+                       atomic_inc(&cb->pending_bios);
+                       bio->bi_private = cb;
+                       bio->bi_end_io = end_compressed_bio_write;
+                       bio_add_page(bio, page, PAGE_CACHE_SIZE, 0);
+               }
+               page_index++;
+               /*
+                * bytes_left is unsigned: clamp the step so a
+                * compressed_len that is not a multiple of the page size
+                * ends the loop instead of wrapping around and walking
+                * off the end of compressed_pages
+                */
+               bytes_left -= min_t(unsigned long, bytes_left,
+                                   PAGE_CACHE_SIZE);
+               first_byte += PAGE_CACHE_SIZE;
+       }
+
+       /* submit whatever is left in the last bio */
+       bio_get(bio);
+
+       ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
+       BUG_ON(ret);
+
+       ret = btrfs_map_bio(root, WRITE, bio, 0, 1);
+       BUG_ON(ret);
+
+       bio_put(bio);
+       return 0;
+}
+
+/*
+ * for a compressed read, the bio we get passed has all the inode pages
+ * in it.  We don't actually do IO on those pages but allocate new ones
+ * to hold the compressed pages on disk.
+ *
+ * bio->bi_sector points to the compressed extent on disk
+ * bio->bi_io_vec points to all of the inode pages
+ * bio->bi_vcnt is a count of pages
+ *
+ * After the compressed pages are read, we copy the bytes into the
+ * bio we were passed and then call the bio end_io calls
+ *
+ * mirror_num and bio_flags are accepted but not used here.
+ *
+ * Returns 0 once every read bio has been submitted, -EIO if no extent
+ * mapping covers the first page of the bio, or -ENOMEM if one of the
+ * allocations made before any IO is submitted fails.  On error nothing
+ * has been submitted and everything allocated here has been freed.
+ * NOTE(review): the bio that was passed in is not completed on the
+ * error paths; confirm that callers finish it when they see an error.
+ */
+int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
+                                int mirror_num, unsigned long bio_flags)
+{
+       struct extent_io_tree *tree;
+       struct extent_map_tree *em_tree;
+       struct compressed_bio *cb;
+       struct btrfs_root *root = BTRFS_I(inode)->root;
+       unsigned long uncompressed_len = bio->bi_vcnt * PAGE_CACHE_SIZE;
+       unsigned long compressed_len;
+       unsigned long nr_pages;
+       unsigned long page_index;
+       struct page *page;
+       struct block_device *bdev;
+       struct bio *comp_bio;
+       u64 cur_disk_byte = (u64)bio->bi_sector << 9;
+       struct extent_map *em;
+       int ret;
+
+       tree = &BTRFS_I(inode)->io_tree;
+       em_tree = &BTRFS_I(inode)->extent_tree;
+
+       /* we need the actual starting offset of this extent in the file */
+       spin_lock(&em_tree->lock);
+       em = lookup_extent_mapping(em_tree,
+                                  page_offset(bio->bi_io_vec->bv_page),
+                                  PAGE_CACHE_SIZE);
+       spin_unlock(&em_tree->lock);
+       if (!em)
+               return -EIO;
+
+       cb = kmalloc(sizeof(*cb), GFP_NOFS);
+       if (!cb) {
+               free_extent_map(em);
+               return -ENOMEM;
+       }
+       atomic_set(&cb->pending_bios, 0);
+       cb->errors = 0;
+       cb->inode = inode;
+
+       cb->start = em->start;
+       compressed_len = em->block_len;
+       free_extent_map(em);
+
+       cb->len = uncompressed_len;
+       cb->compressed_len = compressed_len;
+       cb->orig_bio = bio;
+
+       nr_pages = (compressed_len + PAGE_CACHE_SIZE - 1) /
+                                PAGE_CACHE_SIZE;
+       cb->compressed_pages = kmalloc(sizeof(struct page *) * nr_pages,
+                                      GFP_NOFS);
+       if (!cb->compressed_pages)
+               goto fail_cb;
+
+       bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
+
+       for (page_index = 0; page_index < nr_pages; page_index++) {
+               cb->compressed_pages[page_index] = alloc_page(GFP_NOFS |
+                                                             __GFP_HIGHMEM);
+               if (!cb->compressed_pages[page_index])
+                       goto fail_pages;
+       }
+       cb->nr_pages = nr_pages;
+
+       /* page_index == nr_pages here, which fail_pages relies on */
+       comp_bio = compressed_bio_alloc(bdev, cur_disk_byte, GFP_NOFS);
+       if (!comp_bio)
+               goto fail_pages;
+       comp_bio->bi_private = cb;
+       comp_bio->bi_end_io = end_compressed_bio_read;
+       atomic_inc(&cb->pending_bios);
+
+       for (page_index = 0; page_index < nr_pages; page_index++) {
+               page = cb->compressed_pages[page_index];
+               page->mapping = inode->i_mapping;
+               if (comp_bio->bi_size)
+                       ret = tree->ops->merge_bio_hook(page, 0,
+                                                       PAGE_CACHE_SIZE,
+                                                       comp_bio, 0);
+               else
+                       ret = 0;
+
+               /*
+                * the page can't go into the current bio (the merge hook
+                * refused it or the bio is full): send the current bio
+                * down and start a new one with this page
+                */
+               if (ret || bio_add_page(comp_bio, page, PAGE_CACHE_SIZE, 0) <
+                   PAGE_CACHE_SIZE) {
+                       /* hold a ref across the submission calls */
+                       bio_get(comp_bio);
+
+                       ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio, 0);
+                       BUG_ON(ret);
+
+                       ret = btrfs_map_bio(root, READ, comp_bio, 0, 0);
+                       BUG_ON(ret);
+
+                       bio_put(comp_bio);
+
+                       comp_bio = compressed_bio_alloc(bdev, cur_disk_byte,
+                                                       GFP_NOFS);
+                       /* part of the extent is already in flight */
+                       BUG_ON(!comp_bio);
+                       atomic_inc(&cb->pending_bios);
+                       /*
+                        * set up the NEW compressed bio, not the caller's
+                        * bio, and use the read completion handler
+                        */
+                       comp_bio->bi_private = cb;
+                       comp_bio->bi_end_io = end_compressed_bio_read;
+                       bio_add_page(comp_bio, page, PAGE_CACHE_SIZE, 0);
+               }
+               cur_disk_byte += PAGE_CACHE_SIZE;
+       }
+
+       /* submit whatever is left in the last bio */
+       bio_get(comp_bio);
+
+       ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio, 0);
+       BUG_ON(ret);
+
+       ret = btrfs_map_bio(root, READ, comp_bio, 0, 0);
+       BUG_ON(ret);
+
+       bio_put(comp_bio);
+       return 0;
+
+fail_pages:
+       /* page_index is the number of pages allocated so far */
+       while (page_index > 0)
+               page_cache_release(cb->compressed_pages[--page_index]);
+       kfree(cb->compressed_pages);
+fail_cb:
+       kfree(cb);
+       return -ENOMEM;
+}
diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h

new file mode 100644 (file)

index 0000000..421f5b4
--- /dev/null
+++ b/fs/btrfs/compression.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright (C) 2008 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef __BTRFS_COMPRESSION_
+#define __BTRFS_COMPRESSION_
+
+int btrfs_zlib_decompress(unsigned char *data_in,
+                         struct page *dest_page,
+                         unsigned long start_byte,
+                         size_t srclen, size_t destlen);
+int btrfs_zlib_compress_pages(struct address_space *mapping,
+                             u64 start, unsigned long len,
+                             struct page **pages,
+                             unsigned long nr_dest_pages,
+                             unsigned long *out_pages,
+                             unsigned long *total_in,
+                             unsigned long *total_out,
+                             unsigned long max_out);
+int btrfs_zlib_decompress_biovec(struct page **pages_in,
+                             u64 disk_start,
+                             struct bio_vec *bvec,
+                             int vcnt,
+                             size_t srclen);
+void btrfs_zlib_exit(void);
+int btrfs_submit_compressed_write(struct inode *inode, u64 start,
+                                 unsigned long len, u64 disk_start,
+                                 unsigned long compressed_len,
+                                 struct page **compressed_pages,
+                                 unsigned long nr_pages);
+int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
+                                int mirror_num, unsigned long bio_flags);
+#endif
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h

index 8559f39fd47fb8ed0165ffd6c686cfa146f618f4..793d8fdda24474a147edd4b520ba677a752b8404 100644 (file)
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -400,10 +400,18 @@ struct btrfs_timespec {
         __le32 nsec;
  } __attribute__ ((__packed__));
  
-/*
- * there is no padding here on purpose.  If you want to extent the inode,
- * make a new item type
- */
+typedef enum {
+       BTRFS_COMPRESS_NONE = 0,        /* no compression */
+       BTRFS_COMPRESS_ZLIB = 1,        /* zlib */
+       BTRFS_COMPRESS_LAST = 2,        /* presumably an end marker, not a method -- TODO confirm */
+} btrfs_compression_type;
+
+/* we don't understand any encryption methods right now, so NONE is
+ * the only method listed
+ */
+typedef enum {
+       BTRFS_ENCRYPTION_NONE = 0,      /* no encryption */
+       BTRFS_ENCRYPTION_LAST = 1,      /* presumably an end marker, not a method -- TODO confirm */
+} btrfs_encryption_type;
+
  struct btrfs_inode_item {
         /* nfs style generation number */
         __le64 generation;
@@ -419,6 +427,7 @@ struct btrfs_inode_item {
         __le64 rdev;
         __le16 flags;
         __le16 compat_flags;
+
         struct btrfs_timespec atime;
         struct btrfs_timespec ctime;
         struct btrfs_timespec mtime;
@@ -454,8 +463,33 @@ struct btrfs_root_item {
  #define BTRFS_FILE_EXTENT_INLINE 1
  
  struct btrfs_file_extent_item {
+       /*
+        * transaction id that created this extent
+        */
         __le64 generation;
+       /*
+        * max number of bytes to hold this extent in ram
+        * when we split a compressed extent we can't know how big
+        * each of the resulting pieces will be.  So, this is
+        * an upper limit on the size of the extent in ram instead of
+        * an exact limit.
+        */
+       __le64 ram_bytes;
+
+       /*
+        * 32 bits for the various ways we might encode the data,
+        * including compression and encryption.  If any of these
+        * are set to something a given disk format doesn't understand
+        * it is treated like an incompat flag for reading and writing,
+        * but not for stat.
+        */
+       u8 compression;
+       u8 encryption;
+       __le16 other_encoding; /* spare for later use */
+
+       /* are we inline data or a real extent? */
         u8 type;
+
         /*
          * disk space consumed by the extent, checksum blocks are included
          * in these numbers
@@ -471,9 +505,11 @@ struct btrfs_file_extent_item {
          */
         __le64 offset;
         /*
-        * the logical number of file blocks (no csums included)
+        * the logical number of file blocks (no csums included).  This
+        * always reflects the size uncompressed and without encoding.
          */
         __le64 num_bytes;
+
  } __attribute__ ((__packed__));
  
  struct btrfs_csum_item {
@@ -814,6 +850,7 @@ struct btrfs_root {
  #define BTRFS_MOUNT_NOBARRIER          (1 << 2)
  #define BTRFS_MOUNT_SSD                        (1 << 3)
  #define BTRFS_MOUNT_DEGRADED           (1 << 4)
+#define BTRFS_MOUNT_COMPRESS           (1 << 5)
  
  #define btrfs_clear_opt(o, opt)                ((o) &= ~BTRFS_MOUNT_##opt)
  #define btrfs_set_opt(o, opt)          ((o) |= BTRFS_MOUNT_##opt)
@@ -825,6 +862,7 @@ struct btrfs_root {
  #define BTRFS_INODE_NODATASUM          (1 << 0)
  #define BTRFS_INODE_NODATACOW          (1 << 1)
  #define BTRFS_INODE_READONLY           (1 << 2)
+#define BTRFS_INODE_NOCOMPRESS         (1 << 3)
  #define btrfs_clear_flag(inode, flag)  (BTRFS_I(inode)->flags &= \
                                          ~BTRFS_INODE_##flag)
  #define btrfs_set_flag(inode, flag)    (BTRFS_I(inode)->flags |= \
@@ -1424,14 +1462,6 @@ static inline u32 btrfs_file_extent_calc_inline_size(u32 datasize)
         return offsetof(struct btrfs_file_extent_item, disk_bytenr) + datasize;
  }
  
-static inline u32 btrfs_file_extent_inline_len(struct extent_buffer *eb,
-                                              struct btrfs_item *e)
-{
-       unsigned long offset;
-       offset = offsetof(struct btrfs_file_extent_item, disk_bytenr);
-       return btrfs_item_size(eb, e) - offset;
-}
-
  BTRFS_SETGET_FUNCS(file_extent_disk_bytenr, struct btrfs_file_extent_item,
                    disk_bytenr, 64);
  BTRFS_SETGET_FUNCS(file_extent_generation, struct btrfs_file_extent_item,
@@ -1442,6 +1472,36 @@ BTRFS_SETGET_FUNCS(file_extent_offset, struct btrfs_file_extent_item,
                   offset, 64);
  BTRFS_SETGET_FUNCS(file_extent_num_bytes, struct btrfs_file_extent_item,
                    num_bytes, 64);
+BTRFS_SETGET_FUNCS(file_extent_ram_bytes, struct btrfs_file_extent_item,
+                  ram_bytes, 64);
+BTRFS_SETGET_FUNCS(file_extent_compression, struct btrfs_file_extent_item,
+                  compression, 8);
+BTRFS_SETGET_FUNCS(file_extent_encryption, struct btrfs_file_extent_item,
+                  encryption, 8);
+BTRFS_SETGET_FUNCS(file_extent_other_encoding, struct btrfs_file_extent_item,
+                  other_encoding, 16);
+
+/*
+ * Number of file bytes represented by an inline extent item, taken
+ * from the item's ram_bytes field.  For a compressed item this is the
+ * uncompressed size.
+ */
+static inline u32 btrfs_file_extent_inline_len(struct extent_buffer *eb,
+                                              struct btrfs_file_extent_item *e)
+{
+       /* the result is narrowed to the u32 return type */
+       return (u32)btrfs_file_extent_ram_bytes(eb, e);
+}
+
+/*
+ * Number of bytes the inline data occupies in the item on disk: the
+ * item size minus the part of struct btrfs_file_extent_item that comes
+ * before disk_bytenr.  If the file is compressed on disk, this is the
+ * compressed size.
+ */
+static inline u32 btrfs_file_extent_inline_item_len(struct extent_buffer *eb,
+                                                   struct btrfs_item *e)
+{
+       unsigned long header_len = offsetof(struct btrfs_file_extent_item,
+                                           disk_bytenr);
+
+       return btrfs_item_size(eb, e) - header_len;
+}
  
  static inline struct btrfs_root *btrfs_sb(struct super_block *sb)
  {
@@ -1745,10 +1805,11 @@ int btrfs_lookup_inode(struct btrfs_trans_handle *trans, struct btrfs_root
  int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
                           struct bio *bio);
  int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
-                              struct btrfs_root *root,
-                              u64 objectid, u64 pos, u64 disk_offset,
-                              u64 disk_num_bytes,
-                            u64 num_bytes, u64 offset);
+                            struct btrfs_root *root,
+                            u64 objectid, u64 pos,
+                            u64 disk_offset, u64 disk_num_bytes,
+                            u64 num_bytes, u64 offset, u64 ram_bytes,
+                            u8 compression, u8 encryption, u16 other_encoding);
  int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
                              struct btrfs_root *root,
                              struct btrfs_path *path, u64 objectid,
@@ -1758,6 +1819,8 @@ int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
                            struct btrfs_ordered_sum *sums);
  int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
                        struct bio *bio);
+int btrfs_csum_file_bytes(struct btrfs_root *root, struct inode *inode,
+                         u64 start, unsigned long len);
  struct btrfs_csum_item *btrfs_lookup_csum(struct btrfs_trans_handle *trans,
                                           struct btrfs_root *root,
                                           struct btrfs_path *path,
@@ -1799,7 +1862,7 @@ void btrfs_invalidate_dcache_root(struct btrfs_root *root, char *name,
                                   int namelen);
  
  int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
-                        size_t size, struct bio *bio);
+                        size_t size, struct bio *bio, unsigned long bio_flags);
  
  unsigned long btrfs_force_ra(struct address_space *mapping,
                               struct file_ra_state *ra, struct file *file,
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c

index 0be044bb619499a8d588b4c2735bd1c402e9d79f..dc95f636a11ba0a58eb45624e565a8f8a82813e1 100644 (file)
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -83,6 +83,7 @@ struct async_submit_bio {
         extent_submit_bio_hook_t *submit_bio_hook;
         int rw;
         int mirror_num;
+       unsigned long bio_flags;
         struct btrfs_work work;
  };
  
@@ -115,6 +116,7 @@ struct extent_map *btree_get_extent(struct inode *inode, struct page *page,
         }
         em->start = 0;
         em->len = (u64)-1;
+       em->block_len = (u64)-1;
         em->block_start = 0;
         em->bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
  
@@ -469,12 +471,13 @@ static void run_one_async_submit(struct btrfs_work *work)
                 wake_up(&fs_info->async_submit_wait);
  
         async->submit_bio_hook(async->inode, async->rw, async->bio,
-                              async->mirror_num);
+                              async->mirror_num, async->bio_flags);
         kfree(async);
  }
  
  int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
                         int rw, struct bio *bio, int mirror_num,
+                       unsigned long bio_flags,
                         extent_submit_bio_hook_t *submit_bio_hook)
  {
         struct async_submit_bio *async;
@@ -491,6 +494,7 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
         async->submit_bio_hook = submit_bio_hook;
         async->work.func = run_one_async_submit;
         async->work.flags = 0;
+       async->bio_flags = bio_flags;
  
         while(atomic_read(&fs_info->async_submit_draining) &&
               atomic_read(&fs_info->nr_async_submits)) {
@@ -530,7 +534,7 @@ static int btree_csum_one_bio(struct bio *bio)
  }
  
  static int __btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
-                                int mirror_num)
+                                int mirror_num, unsigned long bio_flags)
  {
         struct btrfs_root *root = BTRFS_I(inode)->root;
         int ret;
@@ -556,17 +560,17 @@ static int __btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
  }
  
  static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
-                                int mirror_num)
+                                int mirror_num, unsigned long bio_flags)
  {
         /*
          * kthread helpers are used to submit writes so that checksumming
          * can happen in parallel across all CPUs
          */
         if (!(rw & (1 << BIO_RW))) {
-               return __btree_submit_bio_hook(inode, rw, bio, mirror_num);
+               return __btree_submit_bio_hook(inode, rw, bio, mirror_num, 0);
         }
         return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
-                                  inode, rw, bio, mirror_num,
+                                  inode, rw, bio, mirror_num, 0,
                                    __btree_submit_bio_hook);
  }
  
@@ -1407,6 +1411,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
         fs_info->btree_inode = new_inode(sb);
         fs_info->btree_inode->i_ino = 1;
         fs_info->btree_inode->i_nlink = 1;
+
         fs_info->thread_pool_size = min(num_online_cpus() + 2, 8);
  
         INIT_LIST_HEAD(&fs_info->ordered_extents);
@@ -1508,6 +1513,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
          */
         btrfs_init_workers(&fs_info->workers, "worker",
                            fs_info->thread_pool_size);
+
         btrfs_init_workers(&fs_info->submit_workers, "submit",
                            min_t(u64, fs_devices->num_devices,
                            fs_info->thread_pool_size));
@@ -1559,6 +1565,8 @@ struct btrfs_root *open_ctree(struct super_block *sb,
         }
  
         fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super);
+       fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages,
+                                   4 * 1024 * 1024 / PAGE_CACHE_SIZE);
  
         nodesize = btrfs_super_nodesize(disk_super);
         leafsize = btrfs_super_leafsize(disk_super);
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h

index f84f5058dbbb892435802c52ad9efd430d7f7900..4eb1f1408d21285bd75a7abe58dfbe571aeb94e6 100644 (file)
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -71,6 +71,7 @@ int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
                         int metadata);
  int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
                         int rw, struct bio *bio, int mirror_num,
+                       unsigned long bio_flags,
                         extent_submit_bio_hook_t *submit_bio_hook);
  int btrfs_congested_async(struct btrfs_fs_info *info, int iodone);
  unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c

index 280ac1aa9b6d7d39816a24283836cdaa6de60cf4..bbf04e80a1a3c2b130b223b44c7858f52441b39e 100644 (file)
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3278,6 +3278,7 @@ static int noinline relocate_data_extent(struct inode *reloc_inode,
  
         em->start = extent_key->objectid - offset;
         em->len = extent_key->offset;
+       em->block_len = extent_key->offset;
         em->block_start = extent_key->objectid;
         em->bdev = root->fs_info->fs_devices->latest_bdev;
         set_bit(EXTENT_FLAG_PINNED, &em->flags);
@@ -3314,10 +3315,14 @@ struct btrfs_ref_path {
  };
  
  struct disk_extent {
+       u64 ram_bytes;          /* ram_bytes field of the file extent item */
         u64 disk_bytenr;
         u64 disk_num_bytes;
         u64 offset;
         u64 num_bytes;
+       u8 compression;         /* compression field of the file extent item */
+       u8 encryption;          /* encryption field of the file extent item */
+       u16 other_encoding;     /* other_encoding field of the file extent item */
  };
  
  static int is_cowonly_root(u64 root_objectid)
@@ -3631,6 +3636,11 @@ static int noinline get_new_locations(struct inode *reloc_inode,
                         btrfs_file_extent_disk_num_bytes(leaf, fi);
                 exts[nr].offset = btrfs_file_extent_offset(leaf, fi);
                 exts[nr].num_bytes = btrfs_file_extent_num_bytes(leaf, fi);
+               exts[nr].ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
+               exts[nr].compression = btrfs_file_extent_compression(leaf, fi);
+               exts[nr].encryption = btrfs_file_extent_encryption(leaf, fi);
+               exts[nr].other_encoding = btrfs_file_extent_other_encoding(leaf,
+                                                                          fi);
                 WARN_ON(exts[nr].offset > 0);
                 WARN_ON(exts[nr].num_bytes != exts[nr].disk_num_bytes);
  
@@ -3846,6 +3856,8 @@ next:
                                                 new_extents[0].disk_bytenr);
                         btrfs_set_file_extent_disk_num_bytes(leaf, fi,
                                                 new_extents[0].disk_num_bytes);
+                       btrfs_set_file_extent_ram_bytes(leaf, fi,
+                                               new_extents[0].ram_bytes);
                         ext_offset += new_extents[0].offset;
                         btrfs_set_file_extent_offset(leaf, fi, ext_offset);
                         btrfs_mark_buffer_dirty(leaf);
@@ -3911,6 +3923,16 @@ next:
                                                 new_extents[i].disk_bytenr);
                                 btrfs_set_file_extent_disk_num_bytes(leaf, fi,
                                                 new_extents[i].disk_num_bytes);
+                               btrfs_set_file_extent_ram_bytes(leaf, fi,
+                                               new_extents[i].ram_bytes);
+
+                               btrfs_set_file_extent_compression(leaf, fi,
+                                               new_extents[i].compression);
+                               btrfs_set_file_extent_encryption(leaf, fi,
+                                               new_extents[i].encryption);
+                               btrfs_set_file_extent_other_encoding(leaf, fi,
+                                               new_extents[i].other_encoding);
+
                                 btrfs_set_file_extent_num_bytes(leaf, fi,
                                                         extent_len);
                                 ext_offset += new_extents[i].offset;
@@ -4169,6 +4191,8 @@ static int noinline replace_extents_in_leaf(struct btrfs_trans_handle *trans,
                 ref->extents[ext_index].num_bytes = new_extent->disk_num_bytes;
  
                 btrfs_set_file_extent_generation(leaf, fi, trans->transid);
+               btrfs_set_file_extent_ram_bytes(leaf, fi,
+                                               new_extent->ram_bytes);
                 btrfs_set_file_extent_disk_bytenr(leaf, fi,
                                                 new_extent->disk_bytenr);
                 btrfs_set_file_extent_disk_num_bytes(leaf, fi,
@@ -4847,7 +4871,8 @@ static struct inode noinline *create_reloc_inode(struct btrfs_fs_info *fs_info,
         BUG_ON(err);
  
         err = btrfs_insert_file_extent(trans, root, objectid, 0, 0, 0,
-                                      group->key.offset, 0);
+                                      group->key.offset, 0, group->key.offset,
+                                      0, 0, 0);
         BUG_ON(err);
  
         inode = btrfs_iget_locked(root->fs_info->sb, objectid, root);
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c

index 563b2d12f4f29cc517bbe6a79c8120516991994b..314041fdfa43a25d5df9cc865b6831563a2c4b64 100644 (file)
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -30,6 +30,7 @@ static struct kmem_cache *extent_buffer_cache;
  static LIST_HEAD(buffers);
  static LIST_HEAD(states);
  
+/*
+ * NOTE(review): defining LEAK_DEBUG unconditionally means the
+ * #ifdef LEAK_DEBUG code in this file is always compiled in.  This looks
+ * like it may be a debugging leftover -- confirm it is intended.
+ */
+#define LEAK_DEBUG 1
  #ifdef LEAK_DEBUG
  static spinlock_t leak_lock = SPIN_LOCK_UNLOCKED;
  #endif
@@ -1067,8 +1068,8 @@ EXPORT_SYMBOL(find_first_extent_bit_state);
   *
   * 1 is returned if we find something, 0 if nothing was in the tree
   */
-static noinline u64 find_lock_delalloc_range(struct extent_io_tree *tree,
-                                            u64 *start, u64 *end, u64 max_bytes)
+static noinline u64 find_delalloc_range(struct extent_io_tree *tree,
+                                       u64 *start, u64 *end, u64 max_bytes)
  {
         struct rb_node *node;
         struct extent_state *state;
@@ -1077,11 +1078,11 @@ static noinline u64 find_lock_delalloc_range(struct extent_io_tree *tree,
         u64 total_bytes = 0;
  
         spin_lock_irq(&tree->lock);
+
         /*
          * this search will find all the extents that end after
          * our range starts.
          */
-search_again:
         node = tree_search(tree, cur_start);
         if (!node) {
                 if (!found)
@@ -1100,40 +1101,6 @@ search_again:
                                 *end = state->end;
                         goto out;
                 }
-               if (!found && !(state->state & EXTENT_BOUNDARY)) {
-                       struct extent_state *prev_state;
-                       struct rb_node *prev_node = node;
-                       while(1) {
-                               prev_node = rb_prev(prev_node);
-                               if (!prev_node)
-                                       break;
-                               prev_state = rb_entry(prev_node,
-                                                     struct extent_state,
-                                                     rb_node);
-                               if ((prev_state->end + 1 != state->start) ||
-                                   !(prev_state->state & EXTENT_DELALLOC))
-                                       break;
-                               if ((cur_start - prev_state->start) * 2 >
-                                    max_bytes)
-                                       break;
-                               state = prev_state;
-                               node = prev_node;
-                       }
-               }
-               if (state->state & EXTENT_LOCKED) {
-                       DEFINE_WAIT(wait);
-                       atomic_inc(&state->refs);
-                       prepare_to_wait(&state->wq, &wait,
-                                       TASK_UNINTERRUPTIBLE);
-                       spin_unlock_irq(&tree->lock);
-                       schedule();
-                       spin_lock_irq(&tree->lock);
-                       finish_wait(&state->wq, &wait);
-                       free_extent_state(state);
-                       goto search_again;
-               }
-               set_state_cb(tree, state, EXTENT_LOCKED);
-               state->state |= EXTENT_LOCKED;
                 if (!found)
                         *start = state->start;
                 found++;
@@ -1151,6 +1118,208 @@ out:
         return found;
  }
  
+/*
+ * Unlock the page cache pages backing the byte range [start, end] of
+ * 'inode', leaving 'locked_page' locked for the caller.  The page
+ * references picked up by each lookup are dropped again.  Always
+ * returns 0.
+ *
+ * NOTE(review): if find_get_pages_contig() ever returns 0 here, neither
+ * nr_pages nor index advances and the loop does not terminate.  This
+ * presumably relies on every page in the range still being present in
+ * the page cache -- confirm against the callers.
+ */
+static noinline int __unlock_for_delalloc(struct inode *inode,
+                                         struct page *locked_page,
+                                         u64 start, u64 end)
+{
+       int ret;
+       struct page *pages[16];
+       unsigned long index = start >> PAGE_CACHE_SHIFT;
+       unsigned long end_index = end >> PAGE_CACHE_SHIFT;
+       unsigned long nr_pages = end_index - index + 1;
+       int i;
+
+       /* the range is only the page the caller already holds */
+       if (index == locked_page->index && end_index == index)
+               return 0;
+
+       while(nr_pages > 0) {
+               /* look the pages up in batches of at most 16 */
+               ret = find_get_pages_contig(inode->i_mapping, index,
+                                    min(nr_pages, ARRAY_SIZE(pages)), pages);
+               for (i = 0; i < ret; i++) {
+                       /* locked_page stays locked, but its ref is dropped */
+                       if (pages[i] != locked_page)
+                               unlock_page(pages[i]);
+                       page_cache_release(pages[i]);
+               }
+               nr_pages -= ret;
+               index += ret;
+               cond_resched();
+       }
+       return 0;
+}
+
+/*
+ * Lock every page cache page backing [delalloc_start, delalloc_end]
+ * except 'locked_page', which the caller is expected to hold already.
+ *
+ * Returns 0 once the whole range has been walked.  Returns -EAGAIN if a
+ * lookup comes back empty before the end of the range; in that case any
+ * pages locked so far are unlocked again via __unlock_for_delalloc().
+ *
+ * NOTE(review): pages are not re-checked after lock_page() (e.g. that
+ * they still belong to this mapping) -- confirm whether that matters
+ * for the callers.
+ */
+static noinline int lock_delalloc_pages(struct inode *inode,
+                                       struct page *locked_page,
+                                       u64 delalloc_start,
+                                       u64 delalloc_end)
+{
+       unsigned long index = delalloc_start >> PAGE_CACHE_SHIFT;
+       unsigned long start_index = index;
+       unsigned long end_index = delalloc_end >> PAGE_CACHE_SHIFT;
+       unsigned long pages_locked = 0;
+       struct page *pages[16];
+       unsigned long nrpages;
+       int ret;
+       int i;
+
+       /* the caller is responsible for locking the start index */
+       if (index == locked_page->index && index == end_index)
+               return 0;
+
+       /*
+        * count every page in the range; locked_page itself is skipped
+        * inside the loop below
+        */
+       nrpages = end_index - index + 1;
+       while(nrpages > 0) {
+               ret = find_get_pages_contig(inode->i_mapping, index,
+                                    min(nrpages, ARRAY_SIZE(pages)), pages);
+               /* nothing found at 'index': report it and undo our locks */
+               if (ret == 0) {
+                       ret = -EAGAIN;
+                       goto done;
+               }
+               /* now we have an array of pages, lock them all */
+               for (i = 0; i < ret; i++) {
+                       /*
+                        * the caller is taking responsibility for
+                        * locked_page
+                        */
+                       if (pages[i] != locked_page)
+                               lock_page(pages[i]);
+                       page_cache_release(pages[i]);
+               }
+               /* pages_locked also counts locked_page when it is in range */
+               pages_locked += ret;
+               nrpages -= ret;
+               index += ret;
+               cond_resched();
+       }
+       ret = 0;
+done:
+       /*
+        * on failure, unlock from the first page through the last page
+        * that was processed; the end offset passed is the first byte of
+        * that last page
+        */
+       if (ret && pages_locked) {
+               __unlock_for_delalloc(inode, locked_page,
+                             delalloc_start,
+                             ((u64)(start_index + pages_locked - 1)) <<
+                             PAGE_CACHE_SHIFT);
+       }
+       return ret;
+}
+
+/*
+ * find a contiguous range of bytes in the file marked as delalloc, not
+ * more than 'max_bytes', then lock the pages covering it (other than
+ * 'locked_page') and lock the range in the extent io tree.
+ * start and end are used to return the range.
+ *
+ * The value reported by find_delalloc_range() is returned: non-zero if
+ * a range was found and locked, 0 if nothing was found.  0 is also
+ * returned, without touching start/end, when the pages could not be
+ * locked even after retrying with a shortened range.
+ *
+ * NOTE(review): the retry after a failed test_range_bit() check has no
+ * iteration bound -- confirm the range cannot keep changing under us.
+ */
+static noinline u64 find_lock_delalloc_range(struct inode *inode,
+                                            struct extent_io_tree *tree,
+                                            struct page *locked_page,
+                                            u64 *start, u64 *end,
+                                            u64 max_bytes)
+{
+       u64 delalloc_start;
+       u64 delalloc_end;
+       u64 found;
+       int ret;
+       int loops = 0;
+
+again:
+       /* step one, find a bunch of delalloc bytes starting at start */
+       delalloc_start = *start;
+       delalloc_end = 0;
+       found = find_delalloc_range(tree, &delalloc_start, &delalloc_end,
+                                   max_bytes);
+       if (!found) {
+               /* hand back whatever the search left in the range */
+               *start = delalloc_start;
+               *end = delalloc_end;
+               return found;
+       }
+
+       /*
+        * make sure to limit the number of pages we try to lock down
+        * if we're looping.
+        *
+        * NOTE(review): the expression below rounds delalloc_start up to
+        * a page boundary.  That yields the first byte of the next page
+        * (or delalloc_start itself when it is already page aligned)
+        * rather than the last byte of the current page, even though
+        * delalloc_end is used as an inclusive end everywhere else in
+        * this function -- confirm this is intended.
+        */
+       if (delalloc_end + 1 - delalloc_start > max_bytes && loops) {
+               delalloc_end = (delalloc_start + PAGE_CACHE_SIZE - 1) &
+                       ~((u64)PAGE_CACHE_SIZE - 1);
+       }
+       /* step two, lock all the pages after the page that has start */
+       ret = lock_delalloc_pages(inode, locked_page,
+                                 delalloc_start, delalloc_end);
+       if (ret == -EAGAIN) {
+               /* some of the pages are gone, lets avoid looping by
+                * shortening the size of the delalloc range we're searching
+                */
+               if (!loops) {
+                       /* retry once, limited to the rest of start's page */
+                       unsigned long offset = (*start) & (PAGE_CACHE_SIZE - 1);
+                       max_bytes = PAGE_CACHE_SIZE - offset;
+                       loops = 1;
+                       goto again;
+               } else {
+                       /* second failure: give up and report nothing found */
+                       found = 0;
+                       goto out_failed;
+               }
+       }
+       /* -EAGAIN is the only failure lock_delalloc_pages() returns */
+       BUG_ON(ret);
+
+       /* step three, lock the state bits for the whole range */
+       lock_extent(tree, delalloc_start, delalloc_end, GFP_NOFS);
+
+       /* then test to make sure it is all still delalloc */
+       ret = test_range_bit(tree, delalloc_start, delalloc_end,
+                            EXTENT_DELALLOC, 1);
+       if (!ret) {
+               /* the range changed: drop both sets of locks and search again */
+               unlock_extent(tree, delalloc_start, delalloc_end, GFP_NOFS);
+               __unlock_for_delalloc(inode, locked_page,
+                             delalloc_start, delalloc_end);
+               cond_resched();
+               goto again;
+       }
+       *start = delalloc_start;
+       *end = delalloc_end;
+out_failed:
+       return found;
+}
+
+/*
+ * Clear EXTENT_LOCKED and EXTENT_DELALLOC (plus EXTENT_DIRTY when
+ * 'clear_dirty' is set) on [start, end] in 'tree', then walk the page
+ * cache pages covering that range.
+ *
+ * For every page other than 'locked_page', in this order:
+ *   - clear_dirty:   clear_page_dirty_for_io()
+ *   - set_writeback: set_page_writeback()
+ *   - end_writeback: end_page_writeback()
+ * and finally the page is unlocked.  'locked_page' only has the lookup
+ * reference dropped; its lock and page flags are left to the caller.
+ *
+ * Always returns 0.
+ *
+ * NOTE(review): as in __unlock_for_delalloc(), a lookup that returns 0
+ * pages makes no progress in the loop -- confirm the pages are
+ * guaranteed to be present.
+ */
+int extent_clear_unlock_delalloc(struct inode *inode,
+                               struct extent_io_tree *tree,
+                               u64 start, u64 end, struct page *locked_page,
+                               int clear_dirty, int set_writeback,
+                               int end_writeback)
+{
+       int ret;
+       struct page *pages[16];
+       unsigned long index = start >> PAGE_CACHE_SHIFT;
+       unsigned long end_index = end >> PAGE_CACHE_SHIFT;
+       unsigned long nr_pages = end_index - index + 1;
+       int i;
+       int clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC;
+
+       if (clear_dirty)
+               clear_bits |= EXTENT_DIRTY;
+
+       /* drop the extent state bits before touching the pages */
+       clear_extent_bit(tree, start, end, clear_bits, 1, 0, GFP_NOFS);
+
+       while(nr_pages > 0) {
+               /* look the pages up in batches of at most 16 */
+               ret = find_get_pages_contig(inode->i_mapping, index,
+                                    min(nr_pages, ARRAY_SIZE(pages)), pages);
+               for (i = 0; i < ret; i++) {
+                       /* the caller's page: only drop the lookup reference */
+                       if (pages[i] == locked_page) {
+                               page_cache_release(pages[i]);
+                               continue;
+                       }
+                       if (clear_dirty)
+                               clear_page_dirty_for_io(pages[i]);
+                       if (set_writeback)
+                               set_page_writeback(pages[i]);
+                       if (end_writeback)
+                               end_page_writeback(pages[i]);
+                       unlock_page(pages[i]);
+                       page_cache_release(pages[i]);
+               }
+               nr_pages -= ret;
+               index += ret;
+               cond_resched();
+       }
+       return 0;
+}
+EXPORT_SYMBOL(extent_clear_unlock_delalloc);
+
  /*
   * count the number of bytes in the tree that have a given bit(s)
   * set.  This can be fairly slow, except for EXTENT_DIRTY which is
@@ -1631,38 +1800,26 @@ extent_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
         return bio;
  }
  
-static int submit_one_bio(int rw, struct bio *bio, int mirror_num)
+static int submit_one_bio(int rw, struct bio *bio, int mirror_num,
+                         unsigned long bio_flags)
  {
         int ret = 0;
         struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
         struct page *page = bvec->bv_page;
         struct extent_io_tree *tree = bio->bi_private;
-       struct rb_node *node;
-       struct extent_state *state;
         u64 start;
         u64 end;
  
         start = ((u64)page->index << PAGE_CACHE_SHIFT) + bvec->bv_offset;
         end = start + bvec->bv_len - 1;
  
-       spin_lock_irq(&tree->lock);
-       node = __etree_search(tree, start, NULL, NULL);
-       BUG_ON(!node);
-       state = rb_entry(node, struct extent_state, rb_node);
-       while(state->end < end) {
-               node = rb_next(node);
-               state = rb_entry(node, struct extent_state, rb_node);
-       }
-       BUG_ON(state->end != end);
-       spin_unlock_irq(&tree->lock);
-
         bio->bi_private = NULL;
  
         bio_get(bio);
  
         if (tree->ops && tree->ops->submit_bio_hook)
                 tree->ops->submit_bio_hook(page->mapping->host, rw, bio,
-                                          mirror_num);
+                                          mirror_num, bio_flags);
         else
                 submit_bio(rw, bio);
         if (bio_flagged(bio, BIO_EOPNOTSUPP))
@@ -1678,39 +1835,56 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree,
                               struct bio **bio_ret,
                               unsigned long max_pages,
                               bio_end_io_t end_io_func,
-                             int mirror_num)
+                             int mirror_num,
+                             unsigned long prev_bio_flags,
+                             unsigned long bio_flags)
  {
         int ret = 0;
         struct bio *bio;
         int nr;
+       int contig = 0;
+       int this_compressed = bio_flags & EXTENT_BIO_COMPRESSED;
+       int old_compressed = prev_bio_flags & EXTENT_BIO_COMPRESSED;
+       size_t page_size = min(size, PAGE_CACHE_SIZE);
  
         if (bio_ret && *bio_ret) {
                 bio = *bio_ret;
-               if (bio->bi_sector + (bio->bi_size >> 9) != sector ||
+               if (old_compressed)
+                       contig = bio->bi_sector == sector;
+               else
+                       contig = bio->bi_sector + (bio->bi_size >> 9) ==
+                               sector;
+
+               if (prev_bio_flags != bio_flags || !contig ||
                     (tree->ops && tree->ops->merge_bio_hook &&
-                    tree->ops->merge_bio_hook(page, offset, size, bio)) ||
-                   bio_add_page(bio, page, size, offset) < size) {
-                       ret = submit_one_bio(rw, bio, mirror_num);
+                    tree->ops->merge_bio_hook(page, offset, page_size, bio,
+                                              bio_flags)) ||
+                   bio_add_page(bio, page, page_size, offset) < page_size) {
+                       ret = submit_one_bio(rw, bio, mirror_num,
+                                            prev_bio_flags);
                         bio = NULL;
                 } else {
                         return 0;
                 }
         }
-       nr = bio_get_nr_vecs(bdev);
+       if (this_compressed)
+               nr = BIO_MAX_PAGES;
+       else
+               nr = bio_get_nr_vecs(bdev);
+
         bio = extent_bio_alloc(bdev, sector, nr, GFP_NOFS | __GFP_HIGH);
         if (!bio) {
                 printk("failed to allocate bio nr %d\n", nr);
         }
  
-
-       bio_add_page(bio, page, size, offset);
+       bio_add_page(bio, page, page_size, offset);
         bio->bi_end_io = end_io_func;
         bio->bi_private = tree;
  
         if (bio_ret) {
                 *bio_ret = bio;
         } else {
-               ret = submit_one_bio(rw, bio, mirror_num);
+               ret = submit_one_bio(rw, bio, mirror_num, bio_flags);
         }
  
         return ret;
@@ -1738,7 +1912,8 @@ void set_page_extent_head(struct page *page, unsigned long len)
  static int __extent_read_full_page(struct extent_io_tree *tree,
                                    struct page *page,
                                    get_extent_t *get_extent,
-                                  struct bio **bio, int mirror_num)
+                                  struct bio **bio, int mirror_num,
+                                  unsigned long *bio_flags)
  {
         struct inode *inode = page->mapping->host;
         u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
@@ -1756,13 +1931,27 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
         int nr = 0;
         size_t page_offset = 0;
         size_t iosize;
+       size_t disk_io_size;
         size_t blocksize = inode->i_sb->s_blocksize;
+       unsigned long this_bio_flag = 0;
  
         set_page_extent_mapped(page);
  
         end = page_end;
         lock_extent(tree, start, end, GFP_NOFS);
  
+       if (page->index == last_byte >> PAGE_CACHE_SHIFT) {
+               char *userpage;
+               size_t zero_offset = last_byte & (PAGE_CACHE_SIZE - 1);
+
+               if (zero_offset) {
+                       iosize = PAGE_CACHE_SIZE - zero_offset;
+                       userpage = kmap_atomic(page, KM_USER0);
+                       memset(userpage + zero_offset, 0, iosize);
+                       flush_dcache_page(page);
+                       kunmap_atomic(userpage, KM_USER0);
+               }
+       }
         while (cur <= end) {
                 if (cur >= last_byte) {
                         char *userpage;
@@ -1793,10 +1982,19 @@ printk("2bad mapping end %Lu cur %Lu\n", end, cur);
                 }
                 BUG_ON(end < cur);
  
+               if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
+                       this_bio_flag = EXTENT_BIO_COMPRESSED;
+
                 iosize = min(extent_map_end(em) - cur, end - cur + 1);
                 cur_end = min(extent_map_end(em) - 1, end);
                 iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1);
-               sector = (em->block_start + extent_offset) >> 9;
+               if (this_bio_flag & EXTENT_BIO_COMPRESSED) {
+                       disk_io_size = em->block_len;
+                       sector = em->block_start >> 9;
+               } else {
+                       sector = (em->block_start + extent_offset) >> 9;
+                       disk_io_size = iosize;
+               }
                 bdev = em->bdev;
                 block_start = em->block_start;
                 free_extent_map(em);
@@ -1845,10 +2043,13 @@ printk("2bad mapping end %Lu cur %Lu\n", end, cur);
                         unsigned long pnr = (last_byte >> PAGE_CACHE_SHIFT) + 1;
                         pnr -= page->index;
                         ret = submit_extent_page(READ, tree, page,
-                                        sector, iosize, page_offset,
+                                        sector, disk_io_size, page_offset,
                                          bdev, bio, pnr,
-                                        end_bio_extent_readpage, mirror_num);
+                                        end_bio_extent_readpage, mirror_num,
+                                        *bio_flags,
+                                        this_bio_flag);
                         nr++;
+                       *bio_flags = this_bio_flag;
                 }
                 if (ret)
                         SetPageError(page);
@@ -1867,11 +2068,13 @@ int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
                             get_extent_t *get_extent)
  {
         struct bio *bio = NULL;
+       unsigned long bio_flags = 0;
         int ret;
  
-       ret = __extent_read_full_page(tree, page, get_extent, &bio, 0);
+       ret = __extent_read_full_page(tree, page, get_extent, &bio, 0,
+                                     &bio_flags);
         if (bio)
-               submit_one_bio(READ, bio, 0);
+               submit_one_bio(READ, bio, 0, bio_flags);
         return ret;
  }
  EXPORT_SYMBOL(extent_read_full_page);
@@ -1909,6 +2112,8 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
         unsigned long end_index = i_size >> PAGE_CACHE_SHIFT;
         u64 nr_delalloc;
         u64 delalloc_end;
+       int page_started;
+       int compressed;
  
         WARN_ON(!PageLocked(page));
         pg_offset = i_size & (PAGE_CACHE_SIZE - 1);
@@ -1934,27 +2139,33 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
  
         delalloc_start = start;
         delalloc_end = 0;
+       page_started = 0;
         while(delalloc_end < page_end) {
-               nr_delalloc = find_lock_delalloc_range(tree, &delalloc_start,
+               nr_delalloc = find_lock_delalloc_range(inode, tree,
+                                                      page,
+                                                      &delalloc_start,
                                                        &delalloc_end,
                                                        128 * 1024 * 1024);
                 if (nr_delalloc == 0) {
                         delalloc_start = delalloc_end + 1;
                         continue;
                 }
-               tree->ops->fill_delalloc(inode, delalloc_start,
-                                        delalloc_end);
-               clear_extent_bit(tree, delalloc_start,
-                                delalloc_end,
-                                EXTENT_LOCKED | EXTENT_DELALLOC,
-                                1, 0, GFP_NOFS);
+               tree->ops->fill_delalloc(inode, page, delalloc_start,
+                                        delalloc_end, &page_started);
                 delalloc_start = delalloc_end + 1;
         }
+
+       /* did the fill delalloc function already unlock and start the IO? */
+       if (page_started) {
+               return 0;
+       }
+
         lock_extent(tree, start, page_end, GFP_NOFS);
         unlock_start = start;
  
         if (tree->ops && tree->ops->writepage_start_hook) {
-               ret = tree->ops->writepage_start_hook(page, start, page_end);
+               ret = tree->ops->writepage_start_hook(page, start,
+                                                     page_end);
                 if (ret == -EAGAIN) {
                         unlock_extent(tree, start, page_end, GFP_NOFS);
                         redirty_page_for_writepage(wbc, page);
@@ -2006,10 +2217,15 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
                 sector = (em->block_start + extent_offset) >> 9;
                 bdev = em->bdev;
                 block_start = em->block_start;
+               compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
                 free_extent_map(em);
                 em = NULL;
  
-               if (block_start == EXTENT_MAP_HOLE ||
+               /*
+                * compressed and inline extents are written through other
+                * paths in the FS
+                */
+               if (compressed || block_start == EXTENT_MAP_HOLE ||
                     block_start == EXTENT_MAP_INLINE) {
                         clear_extent_dirty(tree, cur,
                                            cur + iosize - 1, GFP_NOFS);
@@ -2017,16 +2233,28 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
                         unlock_extent(tree, unlock_start, cur + iosize -1,
                                       GFP_NOFS);
  
-                       if (tree->ops && tree->ops->writepage_end_io_hook)
+                       /*
+                        * end_io notification does not happen here for
+                        * compressed extents
+                        */
+                       if (!compressed && tree->ops &&
+                           tree->ops->writepage_end_io_hook)
                                 tree->ops->writepage_end_io_hook(page, cur,
                                                          cur + iosize - 1,
                                                          NULL, 1);
-                       cur = cur + iosize;
+                       else if (compressed) {
+                               /* we don't want to end_page_writeback on
+                                * a compressed extent.  this happens
+                                * elsewhere
+                                */
+                               nr++;
+                       }
+
+                       cur += iosize;
                         pg_offset += iosize;
                         unlock_start = cur;
                         continue;
                 }
-
                 /* leave this out until we have a page_mkwrite call */
                 if (0 && !test_range_bit(tree, cur, cur + iosize - 1,
                                    EXTENT_DIRTY, 0)) {
@@ -2034,6 +2262,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
                         pg_offset += iosize;
                         continue;
                 }
+
                 clear_extent_dirty(tree, cur, cur + iosize - 1, GFP_NOFS);
                 if (tree->ops && tree->ops->writepage_io_hook) {
                         ret = tree->ops->writepage_io_hook(page, cur,
@@ -2057,7 +2286,8 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
                         ret = submit_extent_page(WRITE, tree, page, sector,
                                                  iosize, pg_offset, bdev,
                                                  &epd->bio, max_nr,
-                                                end_bio_extent_writepage, 0);
+                                                end_bio_extent_writepage,
+                                                0, 0, 0);
                         if (ret)
                                 SetPageError(page);
                 }
@@ -2226,7 +2456,7 @@ int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
         extent_write_cache_pages(tree, mapping, &wbc_writepages,
                                  __extent_writepage, &epd);
         if (epd.bio) {
-               submit_one_bio(WRITE, epd.bio, 0);
+               submit_one_bio(WRITE, epd.bio, 0, 0);
         }
         return ret;
  }
@@ -2248,7 +2478,7 @@ int extent_writepages(struct extent_io_tree *tree,
         ret = extent_write_cache_pages(tree, mapping, wbc,
                                        __extent_writepage, &epd);
         if (epd.bio) {
-               submit_one_bio(WRITE, epd.bio, 0);
+               submit_one_bio(WRITE, epd.bio, 0, 0);
         }
         return ret;
  }
@@ -2262,6 +2492,7 @@ int extent_readpages(struct extent_io_tree *tree,
         struct bio *bio = NULL;
         unsigned page_idx;
         struct pagevec pvec;
+       unsigned long bio_flags = 0;
  
         pagevec_init(&pvec, 0);
         for (page_idx = 0; page_idx < nr_pages; page_idx++) {
@@ -2281,7 +2512,7 @@ int extent_readpages(struct extent_io_tree *tree,
                         if (!pagevec_add(&pvec, page))
                                 __pagevec_lru_add(&pvec);
                         __extent_read_full_page(tree, page, get_extent,
-                                               &bio, 0);
+                                               &bio, 0, &bio_flags);
                 }
                 page_cache_release(page);
         }
@@ -2289,7 +2520,7 @@ int extent_readpages(struct extent_io_tree *tree,
                 __pagevec_lru_add(&pvec);
         BUG_ON(!list_empty(pages));
         if (bio)
-               submit_one_bio(READ, bio, 0);
+               submit_one_bio(READ, bio, 0, bio_flags);
         return 0;
  }
  EXPORT_SYMBOL(extent_readpages);
@@ -2414,7 +2645,8 @@ int extent_prepare_write(struct extent_io_tree *tree,
                         ret = submit_extent_page(READ, tree, page,
                                          sector, iosize, page_offset, em->bdev,
                                          NULL, 1,
-                                        end_bio_extent_preparewrite, 0);
+                                        end_bio_extent_preparewrite, 0,
+                                        0, 0);
                         iocount++;
                         block_start = block_start + iosize;
                 } else {
@@ -2495,7 +2727,9 @@ int try_release_extent_mapping(struct extent_map_tree *map,
                         }
                         if (!test_range_bit(tree, em->start,
                                             extent_map_end(em) - 1,
-                                           EXTENT_LOCKED, 0)) {
+                                           EXTENT_LOCKED | EXTENT_WRITEBACK |
+                                           EXTENT_ORDERED,
+                                           0)) {
                                 remove_extent_mapping(map, em);
                                 /* once for the rb tree */
                                 free_extent_map(em);
@@ -2923,6 +3157,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
         int inc_all_pages = 0;
         unsigned long num_pages;
         struct bio *bio = NULL;
+       unsigned long bio_flags = 0;
  
         if (eb->flags & EXTENT_UPTODATE)
                 return 0;
@@ -2973,7 +3208,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
                         ClearPageError(page);
                         err = __extent_read_full_page(tree, page,
                                                       get_extent, &bio,
-                                                     mirror_num);
+                                                     mirror_num, &bio_flags);
                         if (err) {
                                 ret = err;
                                 printk("err %d from __extent_read_full_page\n", ret);
@@ -2984,7 +3219,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
         }
  
         if (bio)
-               submit_one_bio(READ, bio, mirror_num);
+               submit_one_bio(READ, bio, mirror_num, bio_flags);
  
         if (ret || !wait) {
                 if (ret)
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h

index c9d1908a1ae3881d68deafb48b3f8c32b668a707..86f859b87a6ec51bffe6975f8a95704f65bc5c53 100644 (file)
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -18,6 +18,9 @@
  #define EXTENT_BOUNDARY (1 << 11)
  #define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK)
  
+/* flags for bio submission */
+#define EXTENT_BIO_COMPRESSED 1
+
  /*
   * page->private values.  Every page that is controlled by the extent
   * map has page->private set to one.
@@ -28,14 +31,17 @@
  struct extent_state;
  
  typedef        int (extent_submit_bio_hook_t)(struct inode *inode, int rw,
-                                      struct bio *bio, int mirror_num);
+                                      struct bio *bio, int mirror_num,
+                                      unsigned long bio_flags);
  struct extent_io_ops {
-       int (*fill_delalloc)(struct inode *inode, u64 start, u64 end);
+       int (*fill_delalloc)(struct inode *inode, struct page *locked_page,
+                            u64 start, u64 end, int *page_started);
         int (*writepage_start_hook)(struct page *page, u64 start, u64 end);
         int (*writepage_io_hook)(struct page *page, u64 start, u64 end);
         extent_submit_bio_hook_t *submit_bio_hook;
         int (*merge_bio_hook)(struct page *page, unsigned long offset,
-                             size_t size, struct bio *bio);
+                             size_t size, struct bio *bio,
+                             unsigned long bio_flags);
         int (*readpage_io_hook)(struct page *page, u64 start, u64 end);
         int (*readpage_io_failed_hook)(struct bio *bio, struct page *page,
                                        u64 start, u64 end,
@@ -245,4 +251,9 @@ void unmap_extent_buffer(struct extent_buffer *eb, char *token, int km);
  int release_extent_buffer_tail_pages(struct extent_buffer *eb);
  int extent_range_uptodate(struct extent_io_tree *tree,
                           u64 start, u64 end);
+int extent_clear_unlock_delalloc(struct inode *inode,
+                               struct extent_io_tree *tree,
+                               u64 start, u64 end, struct page *locked_page,
+                               int clear_dirty, int set_writeback,
+                               int clear_writeback);
  #endif
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c

index 74b2a29880d36a3db2e40efcf45b8784319998ee..fd3ebfb8c3c5e80bf440c21e6541d976325bcef2 100644 (file)
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -184,6 +184,13 @@ static int mergable_maps(struct extent_map *prev, struct extent_map *next)
         if (test_bit(EXTENT_FLAG_PINNED, &prev->flags))
                 return 0;
  
+       /*
+        * don't merge compressed extents, we need to know their
+        * actual size
+        */
+       if (test_bit(EXTENT_FLAG_COMPRESSED, &prev->flags))
+               return 0;
+
         if (extent_map_end(prev) == next->start &&
             prev->flags == next->flags &&
             prev->bdev == next->bdev &&
@@ -239,6 +246,7 @@ int add_extent_mapping(struct extent_map_tree *tree,
                 if (rb && mergable_maps(merge, em)) {
                         em->start = merge->start;
                         em->len += merge->len;
+                       em->block_len += merge->block_len;
                         em->block_start = merge->block_start;
                         merge->in_tree = 0;
                         rb_erase(&merge->rb_node, &tree->map);
@@ -250,6 +258,7 @@ int add_extent_mapping(struct extent_map_tree *tree,
                 merge = rb_entry(rb, struct extent_map, rb_node);
         if (rb && mergable_maps(em, merge)) {
                 em->len += merge->len;
+               em->block_len += merge->len;
                 rb_erase(&merge->rb_node, &tree->map);
                 merge->in_tree = 0;
                 free_extent_map(merge);
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h

index 26ac6fe0b2682ed15df0467931d0913c9af6fa4f..abbcbeb28c79d0d910ac922164b8def8a353a8ea 100644 (file)
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -10,6 +10,7 @@
  
  /* bits for the flags field */
  #define EXTENT_FLAG_PINNED 0 /* this entry not yet on disk, don't free it */
+#define EXTENT_FLAG_COMPRESSED 1
  
  struct extent_map {
         struct rb_node rb_node;
@@ -18,6 +19,7 @@ struct extent_map {
         u64 start;
         u64 len;
         u64 block_start;
+       u64 block_len;
         unsigned long flags;
         struct block_device *bdev;
         atomic_t refs;
@@ -38,9 +40,9 @@ static inline u64 extent_map_end(struct extent_map *em)
  
  static inline u64 extent_map_block_end(struct extent_map *em)
  {
-       if (em->block_start + em->len < em->block_start)
+       if (em->block_start + em->block_len < em->block_start)
                 return (u64)-1;
-       return em->block_start + em->len;
+       return em->block_start + em->block_len;
  }
  
  void extent_map_tree_init(struct extent_map_tree *tree, gfp_t mask);
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c

index 6dbe88b9d7d43c2707bb6be2d270bb9eb3b4d50f..f4d3fa71bc419c3c5ef569eda6ded776ba3e605e 100644 (file)
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -31,7 +31,8 @@ int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
                              struct btrfs_root *root,
                              u64 objectid, u64 pos,
                              u64 disk_offset, u64 disk_num_bytes,
-                            u64 num_bytes, u64 offset)
+                            u64 num_bytes, u64 offset, u64 ram_bytes,
+                            u8 compression, u8 encryption, u16 other_encoding)
  {
         int ret = 0;
         struct btrfs_file_extent_item *item;
@@ -57,8 +58,13 @@ int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
         btrfs_set_file_extent_disk_num_bytes(leaf, item, disk_num_bytes);
         btrfs_set_file_extent_offset(leaf, item, offset);
         btrfs_set_file_extent_num_bytes(leaf, item, num_bytes);
+       btrfs_set_file_extent_ram_bytes(leaf, item, ram_bytes);
         btrfs_set_file_extent_generation(leaf, item, trans->transid);
         btrfs_set_file_extent_type(leaf, item, BTRFS_FILE_EXTENT_REG);
+       btrfs_set_file_extent_compression(leaf, item, compression);
+       btrfs_set_file_extent_encryption(leaf, item, encryption);
+       btrfs_set_file_extent_other_encoding(leaf, item, other_encoding);
+
         btrfs_mark_buffer_dirty(leaf);
  out:
         btrfs_free_path(path);
@@ -213,6 +219,73 @@ found:
         return 0;
  }
  
+/*
+ * Checksum the page cache pages backing the file range that starts at
+ * 'start' and spans 'len' bytes, and attach the results to the ordered
+ * extent(s) covering that range.
+ *
+ * One btrfs_sector_sum is produced per PAGE_CACHE_SIZE step; each sum is
+ * computed over the whole page returned by find_get_page() and records
+ * page_offset(page) as its offset.  Whenever 'start' moves outside the
+ * current ordered extent, the sums gathered so far are handed over with
+ * btrfs_add_ordered_sum() and a fresh btrfs_ordered_sum is started for the
+ * ordered extent found at the new offset.
+ *
+ * Every page in the range must be present in the page cache and an ordered
+ * extent must exist for every offset visited; both are enforced with BUG_ON.
+ *
+ * Returns 0 on success, or -ENOMEM if the first sums allocation fails.
+ */
+int btrfs_csum_file_bytes(struct btrfs_root *root, struct inode *inode,
+                         u64 start, unsigned long len)
+{
+       struct btrfs_ordered_sum *sums;
+       struct btrfs_sector_sum *sector_sum;
+       struct btrfs_ordered_extent *ordered;
+       char *data;
+       struct page *page;
+       unsigned long this_sum_bytes = 0;
+
+       sums = kzalloc(btrfs_ordered_sum_size(root, len), GFP_NOFS);
+       if (!sums)
+               return -ENOMEM;
+
+       sector_sum = sums->sums;
+       sums->file_offset = start;
+       sums->len = len;
+       INIT_LIST_HEAD(&sums->list);
+       ordered = btrfs_lookup_ordered_extent(inode, sums->file_offset);
+       BUG_ON(!ordered);
+
+       while (len > 0) {
+               if (start >= ordered->file_offset + ordered->len ||
+                   start < ordered->file_offset) {
+                       /*
+                        * we walked off the current ordered extent.  Trim
+                        * the sums to what was actually filled in, hand
+                        * them over and start again at the new offset
+                        */
+                       sums->len = this_sum_bytes;
+                       this_sum_bytes = 0;
+                       btrfs_add_ordered_sum(inode, ordered, sums);
+                       btrfs_put_ordered_extent(ordered);
+
+                       sums = kzalloc(btrfs_ordered_sum_size(root, len),
+                                      GFP_NOFS);
+                       BUG_ON(!sums);
+                       sector_sum = sums->sums;
+                       sums->len = len;
+                       sums->file_offset = start;
+                       ordered = btrfs_lookup_ordered_extent(inode,
+                                                     sums->file_offset);
+                       BUG_ON(!ordered);
+               }
+
+               page = find_get_page(inode->i_mapping,
+                                    start >> PAGE_CACHE_SHIFT);
+               /*
+                * find_get_page returns NULL when the page is not in the
+                * page cache.  Catch that here instead of oopsing inside
+                * kmap_atomic
+                */
+               BUG_ON(!page);
+
+               data = kmap_atomic(page, KM_USER0);
+               sector_sum->sum = ~(u32)0;
+               sector_sum->sum = btrfs_csum_data(root, data, sector_sum->sum,
+                                                 PAGE_CACHE_SIZE);
+               kunmap_atomic(data, KM_USER0);
+               btrfs_csum_final(sector_sum->sum,
+                                (char *)&sector_sum->sum);
+               sector_sum->offset = page_offset(page);
+               page_cache_release(page);
+
+               sector_sum++;
+               this_sum_bytes += PAGE_CACHE_SIZE;
+               start += PAGE_CACHE_SIZE;
+
+               /*
+                * a length that is not a multiple of the page size would
+                * wrap the unsigned counter below and keep the loop
+                * running far past the range.  Warn and stop instead
+                */
+               if (WARN_ON(len < PAGE_CACHE_SIZE))
+                       break;
+               len -= PAGE_CACHE_SIZE;
+       }
+       btrfs_add_ordered_sum(inode, ordered, sums);
+       btrfs_put_ordered_extent(ordered);
+       return 0;
+}
+
  int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
                        struct bio *bio)
  {
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c

index 69abbe19add25aa3f8485e3dcae0f2b393d86f29..0aa15436590e46bdc05396e28871d4aa3e12c072 100644 (file)
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -95,153 +95,6 @@ static void noinline btrfs_drop_pages(struct page **pages, size_t num_pages)
         }
  }
  
-/* this does all the hard work for inserting an inline extent into
- * the btree.  Any existing inline extent is extended as required to make room,
- * otherwise things are inserted as required into the btree
- */
-static int noinline insert_inline_extent(struct btrfs_trans_handle *trans,
-                               struct btrfs_root *root, struct inode *inode,
-                               u64 offset, size_t size,
-                               struct page **pages, size_t page_offset,
-                               int num_pages)
-{
-       struct btrfs_key key;
-       struct btrfs_path *path;
-       struct extent_buffer *leaf;
-       char *kaddr;
-       unsigned long ptr;
-       struct btrfs_file_extent_item *ei;
-       struct page *page;
-       u32 datasize;
-       int err = 0;
-       int ret;
-       int i;
-       ssize_t cur_size;
-
-       path = btrfs_alloc_path();
-       if (!path)
-               return -ENOMEM;
-
-       btrfs_set_trans_block_group(trans, inode);
-
-       key.objectid = inode->i_ino;
-       key.offset = offset;
-       btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY);
-
-       ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
-       if (ret < 0) {
-               err = ret;
-               goto fail;
-       }
-       if (ret == 1) {
-               struct btrfs_key found_key;
-
-               if (path->slots[0] == 0)
-                       goto insert;
-
-               path->slots[0]--;
-               leaf = path->nodes[0];
-               btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
-
-               if (found_key.objectid != inode->i_ino)
-                       goto insert;
-
-               if (found_key.type != BTRFS_EXTENT_DATA_KEY)
-                       goto insert;
-               ei = btrfs_item_ptr(leaf, path->slots[0],
-                                   struct btrfs_file_extent_item);
-
-               if (btrfs_file_extent_type(leaf, ei) !=
-                   BTRFS_FILE_EXTENT_INLINE) {
-                       goto insert;
-               }
-               btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
-               ret = 0;
-       }
-       if (ret == 0) {
-               u32 found_size;
-               u64 found_end;
-
-               leaf = path->nodes[0];
-               ei = btrfs_item_ptr(leaf, path->slots[0],
-                                   struct btrfs_file_extent_item);
-
-               if (btrfs_file_extent_type(leaf, ei) !=
-                   BTRFS_FILE_EXTENT_INLINE) {
-                       err = ret;
-                       btrfs_print_leaf(root, leaf);
-                       printk("found wasn't inline offset %Lu inode %lu\n",
-                              offset, inode->i_ino);
-                       goto fail;
-               }
-               found_size = btrfs_file_extent_inline_len(leaf,
-                                         btrfs_item_nr(leaf, path->slots[0]));
-               found_end = key.offset + found_size;
-
-               if (found_end < offset + size) {
-                       btrfs_release_path(root, path);
-                       ret = btrfs_search_slot(trans, root, &key, path,
-                                               offset + size - found_end, 1);
-                       BUG_ON(ret != 0);
-
-                       ret = btrfs_extend_item(trans, root, path,
-                                               offset + size - found_end);
-                       if (ret) {
-                               err = ret;
-                               goto fail;
-                       }
-                       leaf = path->nodes[0];
-                       ei = btrfs_item_ptr(leaf, path->slots[0],
-                                           struct btrfs_file_extent_item);
-                       inode_add_bytes(inode, offset + size - found_end);
-               }
-               if (found_end < offset) {
-                       ptr = btrfs_file_extent_inline_start(ei) + found_size;
-                       memset_extent_buffer(leaf, 0, ptr, offset - found_end);
-               }
-       } else {
-insert:
-               btrfs_release_path(root, path);
-               datasize = offset + size - key.offset;
-               inode_add_bytes(inode, datasize);
-               datasize = btrfs_file_extent_calc_inline_size(datasize);
-               ret = btrfs_insert_empty_item(trans, root, path, &key,
-                                             datasize);
-               if (ret) {
-                       err = ret;
-                       printk("got bad ret %d\n", ret);
-                       goto fail;
-               }
-               leaf = path->nodes[0];
-               ei = btrfs_item_ptr(leaf, path->slots[0],
-                                   struct btrfs_file_extent_item);
-               btrfs_set_file_extent_generation(leaf, ei, trans->transid);
-               btrfs_set_file_extent_type(leaf, ei, BTRFS_FILE_EXTENT_INLINE);
-       }
-       ptr = btrfs_file_extent_inline_start(ei) + offset - key.offset;
-
-       cur_size = size;
-       i = 0;
-       while (size > 0) {
-               page = pages[i];
-               kaddr = kmap_atomic(page, KM_USER0);
-               cur_size = min_t(size_t, PAGE_CACHE_SIZE - page_offset, size);
-               write_extent_buffer(leaf, kaddr + page_offset, ptr, cur_size);
-               kunmap_atomic(kaddr, KM_USER0);
-               page_offset = 0;
-               ptr += cur_size;
-               size -= cur_size;
-               if (i >= num_pages) {
-                       printk("i %d num_pages %d\n", i, num_pages);
-               }
-               i++;
-       }
-       btrfs_mark_buffer_dirty(leaf);
-fail:
-       btrfs_free_path(path);
-       return err;
-}
-
  /*
   * after copy_from_user, pages need to be dirtied and we need to make
   * sure holes are created between the current EOF and the start of
@@ -267,8 +120,6 @@ static int noinline dirty_and_release_pages(struct btrfs_trans_handle *trans,
         u64 start_pos;
         u64 end_of_last_block;
         u64 end_pos = pos + write_bytes;
-       u64 inline_size;
-       int did_inline = 0;
         loff_t isize = i_size_read(inode);
  
         start_pos = pos & ~((u64)root->sectorsize - 1);
@@ -314,7 +165,8 @@ static int noinline dirty_and_release_pages(struct btrfs_trans_handle *trans,
                         err = btrfs_insert_file_extent(trans, root,
                                                        inode->i_ino,
                                                        last_pos_in_file,
-                                                      0, 0, hole_size, 0);
+                                                      0, 0, hole_size, 0,
+                                                      hole_size, 0, 0, 0);
                         btrfs_drop_extent_cache(inode, last_pos_in_file,
                                         last_pos_in_file + hole_size - 1, 0);
                         mutex_unlock(&BTRFS_I(inode)->extent_mutex);
@@ -324,57 +176,19 @@ static int noinline dirty_and_release_pages(struct btrfs_trans_handle *trans,
                         goto failed;
         }
  
-       /*
-        * either allocate an extent for the new bytes or setup the key
-        * to show we are doing inline data in the extent
+       /* check for reserved extents on each page, we don't want
+        * to reset the delalloc bit on things that already have
+        * extents reserved.
          */
-       inline_size = end_pos;
-       if (isize >= BTRFS_MAX_INLINE_DATA_SIZE(root) ||
-           inline_size > root->fs_info->max_inline ||
-           (inline_size & (root->sectorsize -1)) == 0 ||
-           inline_size >= BTRFS_MAX_INLINE_DATA_SIZE(root)) {
-               /* check for reserved extents on each page, we don't want
-                * to reset the delalloc bit on things that already have
-                * extents reserved.
-                */
-               btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block);
-               for (i = 0; i < num_pages; i++) {
-                       struct page *p = pages[i];
-                       SetPageUptodate(p);
-                       ClearPageChecked(p);
-                       set_page_dirty(p);
-               }
-       } else {
-               u64 aligned_end;
-               /* step one, delete the existing extents in this range */
-               aligned_end = (pos + write_bytes + root->sectorsize - 1) &
-                       ~((u64)root->sectorsize - 1);
-               mutex_lock(&BTRFS_I(inode)->extent_mutex);
-               err = btrfs_drop_extents(trans, root, inode, start_pos,
-                                        aligned_end, aligned_end, &hint_byte);
-               if (err)
-                       goto failed;
-               if (isize > inline_size)
-                       inline_size = min_t(u64, isize, aligned_end);
-               inline_size -= start_pos;
-               err = insert_inline_extent(trans, root, inode, start_pos,
-                                          inline_size, pages, 0, num_pages);
-               btrfs_drop_extent_cache(inode, start_pos, aligned_end - 1, 0);
-               BUG_ON(err);
-               mutex_unlock(&BTRFS_I(inode)->extent_mutex);
-
-               /*
-                * an ugly way to do all the prop accounting around
-                * the page bits and mapping tags
-                */
-               set_page_writeback(pages[0]);
-               end_page_writeback(pages[0]);
-               did_inline = 1;
+       btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block);
+       for (i = 0; i < num_pages; i++) {
+               struct page *p = pages[i];
+               SetPageUptodate(p);
+               ClearPageChecked(p);
+               set_page_dirty(p);
         }
         if (end_pos > isize) {
                 i_size_write(inode, end_pos);
-               if (did_inline)
-                       BTRFS_I(inode)->disk_i_size = end_pos;
                 btrfs_update_inode(trans, root, inode);
         }
  failed:
@@ -399,6 +213,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
         int ret;
         int testend = 1;
         unsigned long flags;
+       int compressed = 0;
  
         WARN_ON(end < start);
         if (end == (u64)-1) {
@@ -434,6 +249,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
                         free_extent_map(em);
                         continue;
                 }
+               compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
                 clear_bit(EXTENT_FLAG_PINNED, &em->flags);
                 remove_extent_mapping(em_tree, em);
  
@@ -442,6 +258,12 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
                         split->start = em->start;
                         split->len = start - em->start;
                         split->block_start = em->block_start;
+
+                       if (compressed)
+                               split->block_len = em->block_len;
+                       else
+                               split->block_len = split->len;
+
                         split->bdev = em->bdev;
                         split->flags = flags;
                         ret = add_extent_mapping(em_tree, split);
@@ -459,7 +281,13 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
                         split->bdev = em->bdev;
                         split->flags = flags;
  
-                       split->block_start = em->block_start + diff;
+                       if (compressed) {
+                               split->block_len = em->block_len;
+                               split->block_start = em->block_start;
+                       } else {
+                               split->block_len = split->len;
+                               split->block_start = em->block_start + diff;
+                       }
  
                         ret = add_extent_mapping(em_tree, split);
                         BUG_ON(ret);
@@ -533,7 +361,7 @@ int btrfs_check_file(struct btrfs_root *root, struct inode *inode)
                         struct btrfs_item *item;
                         item = btrfs_item_nr(leaf, slot);
                         extent_end = found_key.offset +
-                            btrfs_file_extent_inline_len(leaf, item);
+                            btrfs_file_extent_inline_len(leaf, extent);
                         extent_end = (extent_end + root->sectorsize - 1) &
                                 ~((u64)root->sectorsize -1 );
                 }
@@ -573,6 +401,10 @@ int noinline btrfs_drop_extents(struct btrfs_trans_handle *trans,
         u64 extent_end = 0;
         u64 search_start = start;
         u64 leaf_start;
+       u64 ram_bytes = 0;
+       u8 compression = 0;
+       u8 encryption = 0;
+       u16 other_encoding = 0;
         u64 root_gen;
         u64 root_owner;
         struct extent_buffer *leaf;
@@ -589,6 +421,7 @@ int noinline btrfs_drop_extents(struct btrfs_trans_handle *trans,
         int recow;
         int ret;
  
+       inline_limit = 0;
         btrfs_drop_extent_cache(inode, start, end - 1, 0);
  
         path = btrfs_alloc_path();
@@ -637,6 +470,12 @@ next_slot:
                         extent = btrfs_item_ptr(leaf, slot,
                                                 struct btrfs_file_extent_item);
                         found_type = btrfs_file_extent_type(leaf, extent);
+                       compression = btrfs_file_extent_compression(leaf,
+                                                                   extent);
+                       encryption = btrfs_file_extent_encryption(leaf,
+                                                                 extent);
+                       other_encoding = btrfs_file_extent_other_encoding(leaf,
+                                                                 extent);
                         if (found_type == BTRFS_FILE_EXTENT_REG) {
                                 extent_end =
                                      btrfs_file_extent_disk_bytenr(leaf,
@@ -646,13 +485,13 @@ next_slot:
  
                                 extent_end = key.offset +
                                      btrfs_file_extent_num_bytes(leaf, extent);
+                               ram_bytes = btrfs_file_extent_ram_bytes(leaf,
+                                                               extent);
                                 found_extent = 1;
                         } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
-                               struct btrfs_item *item;
-                               item = btrfs_item_nr(leaf, slot);
                                 found_inline = 1;
                                 extent_end = key.offset +
-                                    btrfs_file_extent_inline_len(leaf, item);
+                                    btrfs_file_extent_inline_len(leaf, extent);
                         }
                 } else {
                         extent_end = search_start;
@@ -680,10 +519,9 @@ next_slot:
                         search_start = (extent_end + mask) & ~mask;
                 } else
                         search_start = extent_end;
-               if (end <= extent_end && start >= key.offset && found_inline) {
+
+               if (end <= extent_end && start >= key.offset && found_inline)
                         *hint_byte = EXTENT_MAP_INLINE;
-                       goto out;
-               }
  
                 if (found_extent) {
                         read_extent_buffer(leaf, &old, (unsigned long)extent,
@@ -770,12 +608,27 @@ next_slot:
                         write_extent_buffer(leaf, &old,
                                             (unsigned long)extent, sizeof(old));
  
+                       btrfs_set_file_extent_compression(leaf, extent,
+                                                         compression);
+                       btrfs_set_file_extent_encryption(leaf, extent,
+                                                        encryption);
+                       btrfs_set_file_extent_other_encoding(leaf, extent,
+                                                            other_encoding);
                         btrfs_set_file_extent_offset(leaf, extent,
                                     le64_to_cpu(old.offset) + end - key.offset);
                         WARN_ON(le64_to_cpu(old.num_bytes) <
                                 (extent_end - end));
                         btrfs_set_file_extent_num_bytes(leaf, extent,
                                                         extent_end - end);
+
+                       /*
+                        * set the ram bytes to the size of the full extent
+                        * before splitting.  This is a worst case flag,
+                        * but it's the best we can do because we don't know
+                        * how splitting affects compression
+                        */
+                       btrfs_set_file_extent_ram_bytes(leaf, extent,
+                                                       ram_bytes);
                         btrfs_set_file_extent_type(leaf, extent,
                                                    BTRFS_FILE_EXTENT_REG);
  
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c

index bf4bed6ca4d601b2773fe9c9bb17890637a5aa99..9797592dc86b7c7b8398a18f61a892d70a86c24d 100644 (file)
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -49,6 +49,7 @@
  #include "compat.h"
  #include "tree-log.h"
  #include "ref-cache.h"
+#include "compression.h"
  
  struct btrfs_iget_args {
         u64 ino;
@@ -83,6 +84,7 @@ static unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = {
  };
  
  static void btrfs_truncate(struct inode *inode);
+static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end);
  
  /*
   * a very lame attempt at stopping writes when the FS is 85% full.  There
@@ -113,58 +115,375 @@ int btrfs_check_free_space(struct btrfs_root *root, u64 num_required,
         return ret;
  }
  
+/*
+ * this does all the hard work for inserting an inline extent into
+ * the btree.  The caller should have done a btrfs_drop_extents so that
+ * no overlapping inline items exist in the btree
+ *
+ * start is used as the key offset of the new item and size is the
+ * number of uncompressed bytes, recorded as ram_bytes and added to the
+ * inode's byte count.  When both compressed_size and compressed_pages
+ * are set, compressed_size bytes are copied out of compressed_pages and
+ * the item is flagged BTRFS_COMPRESS_ZLIB; otherwise size bytes are
+ * copied from the page cache page that holds start.
+ *
+ * Returns 0 on success and -ENOMEM if no path could be allocated.  A
+ * btrfs_insert_empty_item failure trips BUG_ON; if that does not halt,
+ * its error code is returned.
+ */
+static int noinline insert_inline_extent(struct btrfs_trans_handle *trans,
+                               struct btrfs_root *root, struct inode *inode,
+                               u64 start, size_t size, size_t compressed_size,
+                               struct page **compressed_pages)
+{
+       struct btrfs_key key;
+       struct btrfs_path *path;
+       struct extent_buffer *leaf;
+       struct page *page = NULL;
+       char *kaddr;
+       unsigned long ptr;
+       struct btrfs_file_extent_item *ei;
+       int err = 0;
+       int ret;
+       size_t cur_size = size;
+       size_t datasize;
+       unsigned long offset;
+       int use_compress = 0;
+
+       /* compressed data is only used when we got both a size and pages */
+       if (compressed_size && compressed_pages) {
+               use_compress = 1;
+               cur_size = compressed_size;
+       }
+
+       path = btrfs_alloc_path();
+       if (!path)
+               return -ENOMEM;
+
+       btrfs_set_trans_block_group(trans, inode);
+
+       key.objectid = inode->i_ino;
+       key.offset = start;
+       btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY);
+       /* the item holds cur_size bytes: compressed size if compressing */
+       datasize = btrfs_file_extent_calc_inline_size(cur_size);
+
+       /* account the uncompressed bytes against the inode exactly once */
+       inode_add_bytes(inode, size);
+       ret = btrfs_insert_empty_item(trans, root, path, &key,
+                                     datasize);
+       BUG_ON(ret);
+       if (ret) {
+               err = ret;
+               printk("got bad ret %d\n", ret);
+               goto fail;
+       }
+       leaf = path->nodes[0];
+       ei = btrfs_item_ptr(leaf, path->slots[0],
+                           struct btrfs_file_extent_item);
+       btrfs_set_file_extent_generation(leaf, ei, trans->transid);
+       btrfs_set_file_extent_type(leaf, ei, BTRFS_FILE_EXTENT_INLINE);
+       btrfs_set_file_extent_encryption(leaf, ei, 0);
+       btrfs_set_file_extent_other_encoding(leaf, ei, 0);
+       btrfs_set_file_extent_ram_bytes(leaf, ei, size);
+       ptr = btrfs_file_extent_inline_start(ei);
+
+       if (use_compress) {
+               struct page *cpage;
+               int i = 0;
+
+               /* copy the compressed bytes into the item a page at a time */
+               while(compressed_size > 0) {
+                       cpage = compressed_pages[i];
+                       cur_size = min(compressed_size,
+                                      PAGE_CACHE_SIZE);
+
+                       kaddr = kmap(cpage);
+                       write_extent_buffer(leaf, kaddr, ptr, cur_size);
+                       kunmap(cpage);
+
+                       i++;
+                       ptr += cur_size;
+                       compressed_size -= cur_size;
+               }
+               btrfs_set_file_extent_compression(leaf, ei,
+                                                 BTRFS_COMPRESS_ZLIB);
+       } else {
+               /*
+                * uncompressed: copy size bytes straight out of the page
+                * cache page that holds start.
+                *
+                * NOTE(review): the find_get_page result is used without
+                * a NULL check; presumably the caller guarantees the page
+                * is present in the page cache -- confirm.
+                */
+               page = find_get_page(inode->i_mapping,
+                                    start >> PAGE_CACHE_SHIFT);
+               btrfs_set_file_extent_compression(leaf, ei, 0);
+               kaddr = kmap_atomic(page, KM_USER0);
+               offset = start & (PAGE_CACHE_SIZE - 1);
+               write_extent_buffer(leaf, kaddr + offset, ptr, size);
+               kunmap_atomic(kaddr, KM_USER0);
+               page_cache_release(page);
+       }
+       btrfs_mark_buffer_dirty(leaf);
+       btrfs_free_path(path);
+
+       BTRFS_I(inode)->disk_i_size = inode->i_size;
+       btrfs_update_inode(trans, root, inode);
+       return 0;
+fail:
+       btrfs_free_path(path);
+       return err;
+}
+
+
+/*
+ * conditionally insert an inline extent into the file.  This
+ * does the checks required to make sure the data is small enough
+ * to fit as an inline extent.
+ *
+ * compressed_size and compressed_pages describe already compressed
+ * data for the range; pass 0 and NULL to inline the uncompressed data.
+ *
+ * Returns 1 without modifying anything when the range does not qualify,
+ * and 0 once the inline extent has been inserted.  Failures from
+ * btrfs_drop_extents or insert_inline_extent trip BUG_ON.
+ */
+static int cow_file_range_inline(struct btrfs_trans_handle *trans,
+                                struct btrfs_root *root,
+                                struct inode *inode, u64 start, u64 end,
+                                size_t compressed_size,
+                                struct page **compressed_pages)
+{
+       u64 isize = i_size_read(inode);
+       u64 actual_end = min(end + 1, isize);
+       u64 inline_len = actual_end - start;
+       u64 aligned_end = (end + root->sectorsize - 1) &
+                       ~((u64)root->sectorsize - 1);
+       u64 hint_byte;
+       u64 data_len = inline_len;
+       int ret;
+
+       /* the size limits below apply to the compressed size, if any */
+       if (compressed_size)
+               data_len = compressed_size;
+
+       /*
+        * give up on inlining unless all of these hold: the range starts
+        * at file offset 0, the data is smaller than
+        * BTRFS_MAX_INLINE_DATA_SIZE and no larger than fs_info->max_inline,
+        * the range reaches the end of the file, and uncompressed data
+        * does not end on a sectorsize boundary
+        */
+       if (start > 0 ||
+           data_len >= BTRFS_MAX_INLINE_DATA_SIZE(root) ||
+           (!compressed_size &&
+           (actual_end & (root->sectorsize - 1)) == 0) ||
+           end + 1 < isize ||
+           data_len > root->fs_info->max_inline) {
+               return 1;
+       }
+
+       /* extent_mutex is held across the drop, the insert and the cache drop */
+       mutex_lock(&BTRFS_I(inode)->extent_mutex);
+       ret = btrfs_drop_extents(trans, root, inode, start,
+                                aligned_end, aligned_end, &hint_byte);
+       BUG_ON(ret);
+
+       /*
+        * NOTE(review): the end + 1 < isize check above already returned,
+        * so actual_end == isize here and this branch looks unreachable
+        * -- confirm before relying on it.
+        */
+       if (isize > actual_end)
+               inline_len = min_t(u64, isize, actual_end);
+       ret = insert_inline_extent(trans, root, inode, start,
+                                  inline_len, compressed_size,
+                                  compressed_pages);
+       BUG_ON(ret);
+       btrfs_drop_extent_cache(inode, start, aligned_end, 0);
+       mutex_unlock(&BTRFS_I(inode)->extent_mutex);
+       return 0;
+}
+
  /*
   * when extent_io.c finds a delayed allocation range in the file,
   * the call backs end up in this code.  The basic idea is to
   * allocate extents on disk for the range, and create ordered data structs
   * in ram to track those extents.
+ *
+ * locked_page is the page that writepage had locked already.  We use
+ * it to make sure we don't do extra locks or unlocks.
+ *
+ * *page_started is set to one if we unlock locked_page and do everything
+ * required to start IO on it.  It may be clean and already done with
+ * IO when we return.
   */
-static int cow_file_range(struct inode *inode, u64 start, u64 end)
+static int cow_file_range(struct inode *inode, struct page *locked_page,
+                         u64 start, u64 end, int *page_started)
  {
         struct btrfs_root *root = BTRFS_I(inode)->root;
         struct btrfs_trans_handle *trans;
         u64 alloc_hint = 0;
         u64 num_bytes;
+       unsigned long ram_size;
+       u64 orig_start;
+       u64 disk_num_bytes;
         u64 cur_alloc_size;
         u64 blocksize = root->sectorsize;
-       u64 orig_num_bytes;
+       u64 actual_end;
         struct btrfs_key ins;
         struct extent_map *em;
         struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
         int ret = 0;
+       struct page **pages = NULL;
+       unsigned long nr_pages;
+       unsigned long nr_pages_ret = 0;
+       unsigned long total_compressed = 0;
+       unsigned long total_in = 0;
+       unsigned long max_compressed = 128 * 1024;
+       unsigned long max_uncompressed = 256 * 1024;
+       int i;
+       int will_compress;
  
         trans = btrfs_join_transaction(root, 1);
         BUG_ON(!trans);
         btrfs_set_trans_block_group(trans, inode);
+       orig_start = start;
+
+       /*
+        * compression made this loop a bit ugly, but the basic idea is to
+        * compress some pages but keep the total size of the compressed
+        * extent relatively small.  If compression is off, this goto target
+        * is never used.
+        */
+again:
+       will_compress = 0;
+       nr_pages = (end >> PAGE_CACHE_SHIFT) - (start >> PAGE_CACHE_SHIFT) + 1;
+       nr_pages = min(nr_pages, (128 * 1024UL) / PAGE_CACHE_SIZE);
  
+       actual_end = min_t(u64, i_size_read(inode), end + 1);
+       total_compressed = actual_end - start;
+
+       /* we want to make sure that amount of ram required to uncompress
+        * an extent is reasonable, so we limit the total size in ram
+        * of a compressed extent to 256k
+        */
+       total_compressed = min(total_compressed, max_uncompressed);
         num_bytes = (end - start + blocksize) & ~(blocksize - 1);
         num_bytes = max(blocksize,  num_bytes);
-       orig_num_bytes = num_bytes;
+       disk_num_bytes = num_bytes;
+       total_in = 0;
+       ret = 0;
  
-       if (alloc_hint == EXTENT_MAP_INLINE)
-               goto out;
+       /* we do compression for mount -o compress and when the
+        * inode has not been flagged as nocompress
+        */
+       if (!btrfs_test_flag(inode, NOCOMPRESS) &&
+           btrfs_test_opt(root, COMPRESS)) {
+               WARN_ON(pages);
+               pages = kmalloc(sizeof(struct page *) * nr_pages, GFP_NOFS);
+
+               /* we want to make sure the amount of IO required to satisfy
+                * a random read is reasonably small, so we limit the size
+                * of a compressed extent to 128k
+                */
+               ret = btrfs_zlib_compress_pages(inode->i_mapping, start,
+                                               total_compressed, pages,
+                                               nr_pages, &nr_pages_ret,
+                                               &total_in,
+                                               &total_compressed,
+                                               max_compressed);
+
+               if (!ret) {
+                       unsigned long offset = total_compressed &
+                               (PAGE_CACHE_SIZE - 1);
+                       struct page *page = pages[nr_pages_ret - 1];
+                       char *kaddr;
+
+                       /* zero the tail end of the last page, we might be
+                        * sending it down to disk
+                        */
+                       if (offset) {
+                               kaddr = kmap_atomic(page, KM_USER0);
+                               memset(kaddr + offset, 0,
+                                      PAGE_CACHE_SIZE - offset);
+                               kunmap_atomic(kaddr, KM_USER0);
+                       }
+                       will_compress = 1;
+               }
+       }
+       if (start == 0) {
+               /* let's try to make an inline extent */
+               if (ret || total_in < (end - start + 1)) {
+                       /* we didn't compress the entire range, try
+                        * to make an uncompressed inline extent.  This
+                        * is almost sure to fail, but maybe inline sizes
+                        * will get bigger later
+                        */
+                       ret = cow_file_range_inline(trans, root, inode,
+                                                   start, end, 0, NULL);
+               } else {
+                       ret = cow_file_range_inline(trans, root, inode,
+                                                   start, end,
+                                                   total_compressed, pages);
+               }
+               if (ret == 0) {
+                       extent_clear_unlock_delalloc(inode,
+                                                    &BTRFS_I(inode)->io_tree,
+                                                    start, end, NULL,
+                                                    1, 1, 1);
+                       *page_started = 1;
+                       ret = 0;
+                       goto free_pages_out;
+               }
+       }
+
+       if (will_compress) {
+               /*
+                * we aren't doing an inline extent, so round the compressed
+                * size up to a block size boundary so the allocator does
+                * sane things
+                */
+               total_compressed = (total_compressed + blocksize - 1) &
+                       ~(blocksize - 1);
+
+               /*
+                * one last check to make sure the compression is really a
+                * win, compare the page count read with the blocks on disk
+                */
+               total_in = (total_in + PAGE_CACHE_SIZE - 1) &
+                       ~(PAGE_CACHE_SIZE - 1);
+               if (total_compressed >= total_in) {
+                       will_compress = 0;
+               } else {
+                       disk_num_bytes = total_compressed;
+                       num_bytes = total_in;
+               }
+       }
+       if (!will_compress && pages) {
+               /*
+                * the compression code ran but failed to make things smaller,
+                * free any pages it allocated and our page pointer array
+                */
+               for (i = 0; i < nr_pages_ret; i++) {
+                       page_cache_release(pages[i]);
+               }
+               kfree(pages);
+               pages = NULL;
+               total_compressed = 0;
+               nr_pages_ret = 0;
+
+               /* flag the file so we don't compress in the future */
+               btrfs_set_flag(inode, NOCOMPRESS);
+       }
+
+       BUG_ON(disk_num_bytes >
+              btrfs_super_total_bytes(&root->fs_info->super_copy));
  
-       BUG_ON(num_bytes > btrfs_super_total_bytes(&root->fs_info->super_copy));
         mutex_lock(&BTRFS_I(inode)->extent_mutex);
         btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0);
         mutex_unlock(&BTRFS_I(inode)->extent_mutex);
  
-       while(num_bytes > 0) {
-               cur_alloc_size = min(num_bytes, root->fs_info->max_extent);
+       while(disk_num_bytes > 0) {
+               unsigned long min_bytes;
+
+               /*
+                * the max size of a compressed extent is pretty small,
+                * make the code a little less complex by forcing
+                * the allocator to find a whole compressed extent at once
+                */
+               if (will_compress)
+                       min_bytes = disk_num_bytes;
+               else
+                       min_bytes = root->sectorsize;
+
+               cur_alloc_size = min(disk_num_bytes, root->fs_info->max_extent);
                 ret = btrfs_reserve_extent(trans, root, cur_alloc_size,
-                                          root->sectorsize, 0, alloc_hint,
+                                          min_bytes, 0, alloc_hint,
                                            (u64)-1, &ins, 1);
                 if (ret) {
                         WARN_ON(1);
-                       goto out;
+                       goto free_pages_out_fail;
                 }
                 em = alloc_extent_map(GFP_NOFS);
                 em->start = start;
-               em->len = ins.offset;
+
+               if (will_compress) {
+                       ram_size = num_bytes;
+                       em->len = num_bytes;
+               } else {
+                       /* ramsize == disk size */
+                       ram_size = ins.offset;
+                       em->len = ins.offset;
+               }
+
                 em->block_start = ins.objectid;
+               em->block_len = ins.offset;
                 em->bdev = root->fs_info->fs_devices->latest_bdev;
+
                 mutex_lock(&BTRFS_I(inode)->extent_mutex);
                 set_bit(EXTENT_FLAG_PINNED, &em->flags);
+
+               if (will_compress)
+                       set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
+
                 while(1) {
                         spin_lock(&em_tree->lock);
                         ret = add_extent_mapping(em_tree, em);
@@ -174,26 +493,95 @@ static int cow_file_range(struct inode *inode, u64 start, u64 end)
                                 break;
                         }
                         btrfs_drop_extent_cache(inode, start,
-                                               start + ins.offset - 1, 0);
+                                               start + ram_size - 1, 0);
                 }
                 mutex_unlock(&BTRFS_I(inode)->extent_mutex);
  
                 cur_alloc_size = ins.offset;
                 ret = btrfs_add_ordered_extent(inode, start, ins.objectid,
-                                              ins.offset, 0);
+                                              ram_size, cur_alloc_size, 0,
+                                              will_compress);
                 BUG_ON(ret);
-               if (num_bytes < cur_alloc_size) {
-                       printk("num_bytes %Lu cur_alloc %Lu\n", num_bytes,
+
+               if (disk_num_bytes < cur_alloc_size) {
+                       printk("num_bytes %Lu cur_alloc %Lu\n", disk_num_bytes,
                                cur_alloc_size);
                         break;
                 }
+
+               if (will_compress) {
+                       /*
+                        * we're doing compression, and we need to
+                        * submit the compressed extents down to the device.
+                        *
+                        * We lock down all the file pages, clearing their
+                        * dirty bits and setting them writeback.  Everyone
+                        * that wants to modify the page will wait on the
+                        * ordered extent above.
+                        *
+                        * The writeback bits on the file pages are
+                        * cleared when the compressed pages are on disk
+                        */
+                       btrfs_end_transaction(trans, root);
+
+                       if (start <= page_offset(locked_page) &&
+                           page_offset(locked_page) < start + ram_size) {
+                               *page_started = 1;
+                       }
+
+                       extent_clear_unlock_delalloc(inode,
+                                                    &BTRFS_I(inode)->io_tree,
+                                                    start,
+                                                    start + ram_size - 1,
+                                                    NULL, 1, 1, 0);
+
+                       ret = btrfs_submit_compressed_write(inode, start,
+                                                ram_size, ins.objectid,
+                                                cur_alloc_size, pages,
+                                                nr_pages_ret);
+
+                       BUG_ON(ret);
+                       trans = btrfs_join_transaction(root, 1);
+                       if (start + ram_size < end) {
+                               start += ram_size;
+                               alloc_hint = ins.objectid + ins.offset;
+                               /* pages will be freed at end_bio time */
+                               pages = NULL;
+                               goto again;
+                       } else {
+                               /* we've written everything, time to go */
+                               break;
+                       }
+               }
+               /* we're not doing compressed IO, don't unlock the first
+                * page (which the caller expects to stay locked), don't
+                * clear any dirty bits and don't set any writeback bits
+                */
+               extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
+                                            start, start + ram_size - 1,
+                                            locked_page, 0, 0, 0);
+               disk_num_bytes -= cur_alloc_size;
                 num_bytes -= cur_alloc_size;
                 alloc_hint = ins.objectid + ins.offset;
                 start += cur_alloc_size;
         }
+
+       ret = 0;
  out:
         btrfs_end_transaction(trans, root);
+
         return ret;
+
+free_pages_out_fail:
+       extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
+                                    start, end, locked_page, 0, 0, 0);
+free_pages_out:
+       for (i = 0; i < nr_pages_ret; i++)
+               page_cache_release(pages[i]);
+       if (pages)
+               kfree(pages);
+
+       goto out;
  }
  
  /*
@@ -203,7 +591,8 @@ out:
   * If no cow copies or snapshots exist, we write directly to the existing
   * blocks on disk
   */
-static int run_delalloc_nocow(struct inode *inode, u64 start, u64 end)
+static int run_delalloc_nocow(struct inode *inode, struct page *locked_page,
+                             u64 start, u64 end, int *page_started)
  {
         u64 extent_start;
         u64 extent_end;
@@ -260,6 +649,11 @@ again:
                 extent_end = extent_start + extent_num_bytes;
                 err = 0;
  
+               if (btrfs_file_extent_compression(leaf, item) ||
+                   btrfs_file_extent_encryption(leaf,item) ||
+                   btrfs_file_extent_other_encoding(leaf, item))
+                       goto not_found;
+
                 if (loops && start != extent_start)
                         goto not_found;
  
@@ -284,7 +678,8 @@ again:
                 bytenr += btrfs_file_extent_offset(leaf, item);
                 extent_num_bytes = min(end + 1, extent_end) - start;
                 ret = btrfs_add_ordered_extent(inode, start, bytenr,
-                                               extent_num_bytes, 1);
+                                               extent_num_bytes,
+                                               extent_num_bytes, 1, 0);
                 if (ret) {
                         err = ret;
                         goto out;
@@ -300,7 +695,8 @@ again:
  not_found:
                 btrfs_end_transaction(trans, root);
                 btrfs_free_path(path);
-               return cow_file_range(inode, start, end);
+               return cow_file_range(inode, locked_page, start, end,
+                                     page_started);
         }
  out:
         WARN_ON(err);
@@ -312,16 +708,19 @@ out:
  /*
   * extent_io.c call back to do delayed allocation processing
   */
-static int run_delalloc_range(struct inode *inode, u64 start, u64 end)
+static int run_delalloc_range(struct inode *inode, struct page *locked_page,
+                             u64 start, u64 end, int *page_started)
  {
         struct btrfs_root *root = BTRFS_I(inode)->root;
         int ret;
  
         if (btrfs_test_opt(root, NODATACOW) ||
             btrfs_test_flag(inode, NODATACOW))
-               ret = run_delalloc_nocow(inode, start, end);
+               ret = run_delalloc_nocow(inode, locked_page, start, end,
+                                        page_started);
         else
-               ret = cow_file_range(inode, start, end);
+               ret = cow_file_range(inode, locked_page, start, end,
+                                    page_started);
  
         return ret;
  }
@@ -383,7 +782,8 @@ int btrfs_clear_bit_hook(struct inode *inode, u64 start, u64 end,
   * we don't create bios that span stripes or chunks
   */
  int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
-                        size_t size, struct bio *bio)
+                        size_t size, struct bio *bio,
+                        unsigned long bio_flags)
  {
         struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
         struct btrfs_mapping_tree *map_tree;
@@ -413,7 +813,7 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
   * are inserted into the btree
   */
  int __btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
-                         int mirror_num)
+                         int mirror_num, unsigned long bio_flags)
  {
         struct btrfs_root *root = BTRFS_I(inode)->root;
         int ret = 0;
@@ -429,7 +829,7 @@ int __btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
   * or reading the csums from the tree before a read
   */
  int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
-                         int mirror_num)
+                         int mirror_num, unsigned long bio_flags)
  {
         struct btrfs_root *root = BTRFS_I(inode)->root;
         int ret = 0;
@@ -444,11 +844,17 @@ int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
  
         if (!(rw & (1 << BIO_RW))) {
                 btrfs_lookup_bio_sums(root, inode, bio);
+
+               if (bio_flags & EXTENT_BIO_COMPRESSED) {
+                       return btrfs_submit_compressed_read(inode, bio,
+                                                   mirror_num, bio_flags);
+               }
+
                 goto mapit;
         }
         return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
                                    inode, rw, bio, mirror_num,
-                                  __btrfs_submit_bio_hook);
+                                  bio_flags, __btrfs_submit_bio_hook);
  mapit:
         return btrfs_map_bio(root, rw, bio, mirror_num, 0);
  }
@@ -539,7 +945,7 @@ out_page:
   * good idea.  This causes problems because we want to make sure COW
   * properly happens and the data=ordered rules are followed.
   *
- * In our case any range that doesn't have the EXTENT_ORDERED bit set
+ * In our case any range that doesn't have the ORDERED bit set
   * hasn't been properly setup for IO.  We kick off an async process
   * to fix it up.  The async helper will wait for ordered extents, set
   * the delalloc bit and make it safe to write the page.
@@ -632,10 +1038,21 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
         btrfs_set_file_extent_disk_bytenr(leaf, extent_item,
                                           ordered_extent->start);
         btrfs_set_file_extent_disk_num_bytes(leaf, extent_item,
-                                            ordered_extent->len);
+                                            ordered_extent->disk_len);
         btrfs_set_file_extent_offset(leaf, extent_item, 0);
+
+       if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
+               btrfs_set_file_extent_compression(leaf, extent_item, 1);
+       else
+               btrfs_set_file_extent_compression(leaf, extent_item, 0);
+       btrfs_set_file_extent_encryption(leaf, extent_item, 0);
+       btrfs_set_file_extent_other_encoding(leaf, extent_item, 0);
+
+       /* ram bytes = extent_num_bytes for now */
         btrfs_set_file_extent_num_bytes(leaf, extent_item,
                                         ordered_extent->len);
+       btrfs_set_file_extent_ram_bytes(leaf, extent_item,
+                                       ordered_extent->len);
         btrfs_mark_buffer_dirty(leaf);
  
         btrfs_drop_extent_cache(inode, ordered_extent->file_offset,
@@ -644,7 +1061,7 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
         mutex_unlock(&BTRFS_I(inode)->extent_mutex);
  
         ins.objectid = ordered_extent->start;
-       ins.offset = ordered_extent->len;
+       ins.offset = ordered_extent->disk_len;
         ins.type = BTRFS_EXTENT_ITEM_KEY;
         ret = btrfs_alloc_reserved_extent(trans, root, leaf->start,
                                           root->root_key.objectid,
@@ -714,6 +1131,7 @@ int btrfs_io_failed_hook(struct bio *failed_bio,
         int ret;
         int rw;
         u64 logical;
+       unsigned long bio_flags = 0;
  
         ret = get_state_private(failure_tree, start, &private);
         if (ret) {
@@ -738,6 +1156,8 @@ int btrfs_io_failed_hook(struct bio *failed_bio,
                 }
                 logical = start - em->start;
                 logical = em->block_start + logical;
+               if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
+                       bio_flags = EXTENT_BIO_COMPRESSED;
                 failrec->logical = logical;
                 free_extent_map(em);
                 set_extent_bits(failure_tree, start, end, EXTENT_LOCKED |
@@ -781,7 +1201,8 @@ int btrfs_io_failed_hook(struct bio *failed_bio,
                 rw = READ;
  
         BTRFS_I(inode)->io_tree.ops->submit_bio_hook(inode, rw, bio,
-                                                     failrec->last_mirror);
+                                                     failrec->last_mirror,
+                                                     bio_flags);
         return 0;
  }
  
@@ -1644,10 +2065,8 @@ search_again:
                                 item_end +=
                                     btrfs_file_extent_num_bytes(leaf, fi);
                         } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
-                               struct btrfs_item *item = btrfs_item_nr(leaf,
-                                                               path->slots[0]);
                                 item_end += btrfs_file_extent_inline_len(leaf,
-                                                                        item);
+                                                                        fi);
                         }
                         item_end--;
                 }
@@ -1715,7 +2134,14 @@ search_again:
                                 root_owner = btrfs_header_owner(leaf);
                         }
                 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
-                       if (!del_item) {
+                       /*
+                        * we can't truncate inline items that have had
+                        * special encodings
+                        */
+                       if (!del_item &&
+                           btrfs_file_extent_compression(leaf, fi) == 0 &&
+                           btrfs_file_extent_encryption(leaf, fi) == 0 &&
+                           btrfs_file_extent_other_encoding(leaf, fi) == 0) {
                                 u32 size = new_size - found_key.offset;
  
                                 if (root->ref_cows) {
@@ -1926,7 +2352,8 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
                         err = btrfs_insert_file_extent(trans, root,
                                                        inode->i_ino,
                                                        hole_start, 0, 0,
-                                                      hole_size, 0);
+                                                      hole_size, 0, hole_size,
+                                                      0, 0, 0);
                         btrfs_drop_extent_cache(inode, hole_start,
                                                 (u64)-1, 0);
                         btrfs_check_file(root, inode);
@@ -2894,11 +3321,50 @@ static int merge_extent_mapping(struct extent_map_tree *em_tree,
         start_diff = map_start - em->start;
         em->start = map_start;
         em->len = map_len;
-       if (em->block_start < EXTENT_MAP_LAST_BYTE)
+       if (em->block_start < EXTENT_MAP_LAST_BYTE &&
+           !test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
                 em->block_start += start_diff;
+               em->block_len -= start_diff;
+       }
         return add_extent_mapping(em_tree, em);
  }
  
+/*
+ * decompress a zlib compressed inline extent into page.
+ *
+ * The raw inline data is copied out of the leaf at path->nodes[0] into
+ * a temporary buffer and handed to btrfs_zlib_decompress together with
+ * extent_offset and a size that is the smaller of PAGE_CACHE_SIZE and
+ * the extent's ram_bytes.  If decompression fails, the page is zeroed
+ * from pg_offset on and the failure is not reported to the caller.
+ *
+ * pg_offset is expected to be 0 (anything else triggers a WARN_ON).
+ *
+ * Returns 0, or -ENOMEM if the temporary buffer cannot be allocated.
+ */
+static noinline int uncompress_inline(struct btrfs_path *path,
+                                     struct inode *inode, struct page *page,
+                                     size_t pg_offset, u64 extent_offset,
+                                     struct btrfs_file_extent_item *item)
+{
+       int ret;
+       struct extent_buffer *leaf = path->nodes[0];
+       char *tmp;
+       size_t max_size;
+       unsigned long inline_size;
+       unsigned long ptr;
+
+       WARN_ON(pg_offset != 0);
+       max_size = btrfs_file_extent_ram_bytes(leaf, item);
+       inline_size = btrfs_file_extent_inline_item_len(leaf,
+                                       btrfs_item_nr(leaf, path->slots[0]));
+       tmp = kmalloc(inline_size, GFP_NOFS);
+       /* don't hand a NULL buffer to read_extent_buffer below */
+       if (!tmp)
+               return -ENOMEM;
+       ptr = btrfs_file_extent_inline_start(item);
+
+       /* pull the compressed bytes out of the leaf so zlib can read them */
+       read_extent_buffer(leaf, tmp, ptr, inline_size);
+
+       max_size = min(PAGE_CACHE_SIZE, max_size);
+       ret = btrfs_zlib_decompress(tmp, page, extent_offset,
+                                   inline_size, max_size);
+       if (ret) {
+               /* decompression failed: zero the page range instead */
+               char *kaddr = kmap_atomic(page, KM_USER0);
+               unsigned long copy_size = min_t(u64,
+                                 PAGE_CACHE_SIZE - pg_offset,
+                                 max_size - extent_offset);
+               memset(kaddr + pg_offset, 0, copy_size);
+               kunmap_atomic(kaddr, KM_USER0);
+       }
+       kfree(tmp);
+       return 0;
+}
+
  /*
   * a bit scary, this does extent mapping from logical file offset to the disk.
   * the ugly parts come from merging extents from the disk with the
@@ -2927,6 +3393,7 @@ struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
         struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
         struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
         struct btrfs_trans_handle *trans = NULL;
+       int compressed;
  
  again:
         spin_lock(&em_tree->lock);
@@ -2951,6 +3418,7 @@ again:
         em->bdev = root->fs_info->fs_devices->latest_bdev;
         em->start = EXTENT_MAP_HOLE;
         em->len = (u64)-1;
+       em->block_len = (u64)-1;
  
         if (!path) {
                 path = btrfs_alloc_path();
@@ -2983,6 +3451,7 @@ again:
  
         found_type = btrfs_file_extent_type(leaf, item);
         extent_start = found_key.offset;
+       compressed = btrfs_file_extent_compression(leaf, item);
         if (found_type == BTRFS_FILE_EXTENT_REG) {
                 extent_end = extent_start +
                        btrfs_file_extent_num_bytes(leaf, item);
@@ -3005,10 +3474,18 @@ again:
                         em->block_start = EXTENT_MAP_HOLE;
                         goto insert;
                 }
-               bytenr += btrfs_file_extent_offset(leaf, item);
-               em->block_start = bytenr;
                 em->start = extent_start;
                 em->len = extent_end - extent_start;
+               if (compressed) {
+                       set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
+                       em->block_start = bytenr;
+                       em->block_len = btrfs_file_extent_disk_num_bytes(leaf,
+                                                                        item);
+               } else {
+                       bytenr += btrfs_file_extent_offset(leaf, item);
+                       em->block_start = bytenr;
+                       em->block_len = em->len;
+               }
                 goto insert;
         } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
                 u64 page_start;
@@ -3018,8 +3495,7 @@ again:
                 size_t extent_offset;
                 size_t copy_size;
  
-               size = btrfs_file_extent_inline_len(leaf, btrfs_item_nr(leaf,
-                                                   path->slots[0]));
+               size = btrfs_file_extent_inline_len(leaf, item);
                 extent_end = (extent_start + size + root->sectorsize - 1) &
                         ~((u64)root->sectorsize - 1);
                 if (start < extent_start || start >= extent_end) {
@@ -3035,9 +3511,10 @@ again:
                 }
                 em->block_start = EXTENT_MAP_INLINE;
  
-               if (!page) {
+               if (!page || create) {
                         em->start = extent_start;
-                       em->len = size;
+                       em->len = (size + root->sectorsize - 1) &
+                       ~((u64)root->sectorsize - 1);
                         goto out;
                 }
  
@@ -3048,11 +3525,22 @@ again:
                 em->start = extent_start + extent_offset;
                 em->len = (copy_size + root->sectorsize - 1) &
                         ~((u64)root->sectorsize - 1);
-               map = kmap(page);
+               if (compressed)
+                       set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
                 ptr = btrfs_file_extent_inline_start(item) + extent_offset;
                 if (create == 0 && !PageUptodate(page)) {
-                       read_extent_buffer(leaf, map + pg_offset, ptr,
-                                          copy_size);
+                       if (btrfs_file_extent_compression(leaf, item) ==
+                           BTRFS_COMPRESS_ZLIB) {
+                               ret = uncompress_inline(path, inode, page,
+                                                       pg_offset,
+                                                       extent_offset, item);
+                               BUG_ON(ret);
+                       } else {
+                               map = kmap(page);
+                               read_extent_buffer(leaf, map + pg_offset, ptr,
+                                                  copy_size);
+                               kunmap(page);
+                       }
                         flush_dcache_page(page);
                 } else if (create && PageUptodate(page)) {
                         if (!trans) {
@@ -3063,11 +3551,12 @@ again:
                                 trans = btrfs_join_transaction(root, 1);
                                 goto again;
                         }
+                       map = kmap(page);
                         write_extent_buffer(leaf, map + pg_offset, ptr,
                                             copy_size);
+                       kunmap(page);
                         btrfs_mark_buffer_dirty(leaf);
                 }
-               kunmap(page);
                 set_extent_uptodate(io_tree, em->start,
                                     extent_map_end(em) - 1, GFP_NOFS);
                 goto insert;
@@ -3779,6 +4268,11 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
         btrfs_set_file_extent_generation(leaf, ei, trans->transid);
         btrfs_set_file_extent_type(leaf, ei,
                                    BTRFS_FILE_EXTENT_INLINE);
+       btrfs_set_file_extent_encryption(leaf, ei, 0);
+       btrfs_set_file_extent_compression(leaf, ei, 0);
+       btrfs_set_file_extent_other_encoding(leaf, ei, 0);
+       btrfs_set_file_extent_ram_bytes(leaf, ei, name_len);
+
         ptr = btrfs_file_extent_inline_start(ei);
         write_extent_buffer(leaf, symname, ptr, name_len);
         btrfs_mark_buffer_dirty(leaf);
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c

index 2eb6caba57c278ad92a044da28b7782d009890fb..b5745bb96d40ddfc34d7d3719c5c4ab5b04df790 100644 (file)
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -165,7 +165,8 @@ static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree,
   * inserted.
   */
  int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
-                            u64 start, u64 len, int nocow)
+                            u64 start, u64 len, u64 disk_len, int nocow,
+                            int compressed)
  {
         struct btrfs_ordered_inode_tree *tree;
         struct rb_node *node;
@@ -180,9 +181,12 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
         entry->file_offset = file_offset;
         entry->start = start;
         entry->len = len;
+       entry->disk_len = disk_len;
         entry->inode = inode;
         if (nocow)
                 set_bit(BTRFS_ORDERED_NOCOW, &entry->flags);
+       if (compressed)
+               set_bit(BTRFS_ORDERED_COMPRESSED, &entry->flags);
  
         /* one ref for the tree */
         atomic_set(&entry->refs, 1);
@@ -389,9 +393,10 @@ void btrfs_start_ordered_extent(struct inode *inode,
          * for pdflush to find them
          */
         btrfs_fdatawrite_range(inode->i_mapping, start, end, WB_SYNC_NONE);
-       if (wait)
+       if (wait) {
                 wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE,
                                                  &entry->flags));
+       }
  }
  
  /*
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h

index f50f8870a1449801386846b1dd66d6dd4eb068fd..1ef464145d226a509a8f24646e0758e52e18273b 100644 (file)
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -66,6 +66,8 @@ struct btrfs_ordered_sum {
  
  #define BTRFS_ORDERED_NOCOW 2 /* set when we want to write in place */
  
+#define BTRFS_ORDERED_COMPRESSED 3 /* writing a compressed extent */
+
  struct btrfs_ordered_extent {
         /* logical offset in the file */
         u64 file_offset;
@@ -73,9 +75,12 @@ struct btrfs_ordered_extent {
         /* disk byte number */
         u64 start;
  
-       /* length of the extent in bytes */
+       /* ram length of the extent in bytes */
         u64 len;
  
+       /* extent length on disk */
+       u64 disk_len;
+
         /* flags (described above) */
         unsigned long flags;
  
@@ -127,7 +132,8 @@ int btrfs_remove_ordered_extent(struct inode *inode,
  int btrfs_dec_test_ordered_pending(struct inode *inode,
                                        u64 file_offset, u64 io_size);
  int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
-                            u64 start, u64 len, int nocow);
+                            u64 start, u64 len, u64 disk_len, int nocow,
+                            int compressed);
  int btrfs_add_ordered_sum(struct inode *inode,
                           struct btrfs_ordered_extent *entry,
                           struct btrfs_ordered_sum *sum);
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c

index bd9ab3e9a7f2fa44fa1795a06e6c6e59800c8ffb..64725c13aa1160de7c44e09b88995f7d090cbe09 100644 (file)
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -115,15 +115,16 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
                         if (btrfs_file_extent_type(l, fi) ==
                             BTRFS_FILE_EXTENT_INLINE) {
                                 printk("\t\tinline extent data size %u\n",
-                                  btrfs_file_extent_inline_len(l, item));
+                                  btrfs_file_extent_inline_len(l, fi));
                                 break;
                         }
                         printk("\t\textent data disk bytenr %llu nr %llu\n",
                                (unsigned long long)btrfs_file_extent_disk_bytenr(l, fi),
                                (unsigned long long)btrfs_file_extent_disk_num_bytes(l, fi));
-                       printk("\t\textent data offset %llu nr %llu\n",
+                       printk("\t\textent data offset %llu nr %llu ram %llu\n",
                           (unsigned long long)btrfs_file_extent_offset(l, fi),
-                         (unsigned long long)btrfs_file_extent_num_bytes(l, fi));
+                         (unsigned long long)btrfs_file_extent_num_bytes(l, fi),
+                         (unsigned long long)btrfs_file_extent_ram_bytes(l, fi));
                         break;
                 case BTRFS_BLOCK_GROUP_ITEM_KEY:
                         bi = btrfs_item_ptr(l, i,
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c

index 2e6039825b7bc19d97f79276b494a4f3fd177d92..431fdf144b585709afe6f6f50f81df2f8b0021e6 100644 (file)
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -47,6 +47,7 @@
  #include "volumes.h"
  #include "version.h"
  #include "export.h"
+#include "compression.h"
  
  #define BTRFS_SUPER_MAGIC 0x9123683E
  
@@ -69,7 +70,7 @@ static void btrfs_put_super (struct super_block * sb)
  enum {
         Opt_degraded, Opt_subvol, Opt_device, Opt_nodatasum, Opt_nodatacow,
         Opt_max_extent, Opt_max_inline, Opt_alloc_start, Opt_nobarrier,
-       Opt_ssd, Opt_thread_pool, Opt_noacl,  Opt_err,
+       Opt_ssd, Opt_thread_pool, Opt_noacl,  Opt_compress, Opt_err,
  };
  
  static match_table_t tokens = {
@@ -83,6 +84,7 @@ static match_table_t tokens = {
         {Opt_max_inline, "max_inline=%s"},
         {Opt_alloc_start, "alloc_start=%s"},
         {Opt_thread_pool, "thread_pool=%d"},
+       {Opt_compress, "compress"},
         {Opt_ssd, "ssd"},
         {Opt_noacl, "noacl"},
         {Opt_err, NULL},
@@ -163,6 +165,10 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
                         btrfs_set_opt(info->mount_opt, NODATACOW);
                         btrfs_set_opt(info->mount_opt, NODATASUM);
                         break;
+               case Opt_compress:
+                       printk(KERN_INFO "btrfs: use compression\n");
+                       btrfs_set_opt(info->mount_opt, COMPRESS);
+                       break;
                 case Opt_ssd:
                         printk(KERN_INFO "btrfs: use ssd allocation scheme\n");
                         btrfs_set_opt(info->mount_opt, SSD);
@@ -622,6 +628,7 @@ static int __init init_btrfs_fs(void)
         err = btrfs_interface_init();
         if (err)
                 goto free_extent_map;
+
         err = register_filesystem(&btrfs_fs_type);
         if (err)
                 goto unregister_ioctl;
@@ -651,6 +658,7 @@ static void __exit exit_btrfs_fs(void)
         unregister_filesystem(&btrfs_fs_type);
         btrfs_exit_sysfs();
         btrfs_cleanup_fs_uuids();
+       btrfs_zlib_exit();
  }
  
  module_init(init_btrfs_fs)
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c

index cf618cc8b34ad3fd59d065adb01342ba027c2558..e6d579053a475b6191e31c6a8396140ab8293e98 100644 (file)
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -540,8 +540,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
         if (found_type == BTRFS_FILE_EXTENT_REG)
                 extent_end = start + btrfs_file_extent_num_bytes(eb, item);
         else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
-               size = btrfs_file_extent_inline_len(eb,
-                                                   btrfs_item_nr(eb, slot));
+               size = btrfs_file_extent_inline_len(eb, item);
                 extent_end = (start + size + mask) & ~mask;
         } else {
                 ret = 0;
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c

index 2eed7f91f51a3dca150ba8d2f61d746991167ef1..7db4cfd03a98bd5697e851c4113ee9e74c43af4c 100644 (file)
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1816,6 +1816,7 @@ again:
         em->start = key.offset;
         em->len = *num_bytes;
         em->block_start = 0;
+       em->block_len = em->len;
  
         if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
                 ret = btrfs_add_system_chunk(trans, chunk_root, &key,
@@ -2323,6 +2324,7 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
         em->start = logical;
         em->len = length;
         em->block_start = 0;
+       em->block_len = em->len;
  
         map->num_stripes = num_stripes;
         map->io_width = btrfs_chunk_io_width(leaf, chunk);
diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c

new file mode 100644 (file)

index 0000000..e993091
--- /dev/null
+++ b/fs/btrfs/zlib.c
@@ -0,0 +1,637 @@
+/*
+ * Copyright (C) 2008 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ *
+ * Based on jffs2 zlib code:
+ * Copyright © 2001-2007 Red Hat, Inc.
+ * Created by David Woodhouse <dwmw2@infradead.org>
+ */
+
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/zlib.h>
+#include <linux/zutil.h>
+#include <linux/vmalloc.h>
+#include <linux/init.h>
+#include <linux/err.h>
+#include <linux/sched.h>
+#include <linux/pagemap.h>
+#include <linux/bio.h>
+
+/* Plan: call deflate() with avail_in == *sourcelen,
+       avail_out = *dstlen - 12 and flush == Z_FINISH.
+       If it doesn't manage to finish, call it again with
+       avail_in == 0 and avail_out set to the remaining 12
+       bytes for it to clean up.
+   Q: Is 12 bytes sufficient?
+*/
+#define STREAM_END_SPACE 12
+
+/*
+ * scratch state needed for one zlib operation.  Workspaces are handed out
+ * by find_zlib_workspace() and given back with free_workspace().
+ */
+struct workspace {
+       /* zlib stream used when inflating (decompressing) */
+       z_stream inf_strm;
+       /* zlib stream used when deflating (compressing) */
+       z_stream def_strm;
+       /* PAGE_CACHE_SIZE buffer that inflate output is staged in */
+       char *buf;
+       /* links the workspace into idle_workspace while it is unused */
+       struct list_head list;
+};
+
+/* workspaces that have been allocated but are not in use right now */
+static LIST_HEAD(idle_workspace);
+/*
+ * held around idle_workspace and num_workspace updates in
+ * find_zlib_workspace() and free_workspace()
+ */
+static DEFINE_SPINLOCK(workspace_lock);
+/* number of workspaces sitting on idle_workspace */
+static unsigned long num_workspace;
+/* number of workspaces currently allocated, idle or in use */
+static atomic_t alloc_workspace = ATOMIC_INIT(0);
+/* find_zlib_workspace() sleeps here when too many workspaces exist */
+static DECLARE_WAIT_QUEUE_HEAD(workspace_wait);
+
+/*
+ * this finds an available zlib workspace or allocates a new one.
+ *
+ * An idle workspace is reused when there is one.  Otherwise, if more than
+ * num_online_cpus() workspaces are already allocated, we sleep until
+ * someone gives one back and try again.  If we're under that limit a new
+ * workspace is allocated.
+ *
+ * An ERR_PTR(-ENOMEM) is returned if an allocation fails; this function
+ * never returns NULL, so callers need to test the result with IS_ERR().
+ */
+static struct workspace *find_zlib_workspace(void)
+{
+       struct workspace *workspace;
+       int ret;
+       int cpus = num_online_cpus();
+
+again:
+       /* fast path, grab a workspace off the idle list */
+       spin_lock(&workspace_lock);
+       if (!list_empty(&idle_workspace)) {
+               workspace = list_entry(idle_workspace.next, struct workspace,
+                                      list);
+               list_del(&workspace->list);
+               num_workspace--;
+               spin_unlock(&workspace_lock);
+               return workspace;
+
+       }
+       spin_unlock(&workspace_lock);
+       /*
+        * too many workspaces are allocated already.  Wait for one to be
+        * released; the count is checked again after prepare_to_wait so a
+        * wakeup between the first check and the sleep isn't lost.
+        */
+       if (atomic_read(&alloc_workspace) > cpus) {
+               DEFINE_WAIT(wait);
+               prepare_to_wait(&workspace_wait, &wait, TASK_UNINTERRUPTIBLE);
+               if (atomic_read(&alloc_workspace) > cpus)
+                       schedule();
+               finish_wait(&workspace_wait, &wait);
+               goto again;
+       }
+       /* account for the new workspace before allocating its pieces */
+       atomic_inc(&alloc_workspace);
+       workspace = kzalloc(sizeof(*workspace), GFP_NOFS);
+       if (!workspace) {
+               ret = -ENOMEM;
+               goto fail;
+       }
+
+       workspace->def_strm.workspace = vmalloc(zlib_deflate_workspacesize());
+       if (!workspace->def_strm.workspace) {
+               ret = -ENOMEM;
+               goto fail;
+       }
+       workspace->inf_strm.workspace = vmalloc(zlib_inflate_workspacesize());
+       if (!workspace->inf_strm.workspace) {
+               ret = -ENOMEM;
+               goto fail_inflate;
+       }
+       workspace->buf = kmalloc(PAGE_CACHE_SIZE, GFP_NOFS);
+       if (!workspace->buf) {
+               ret = -ENOMEM;
+               goto fail_kmalloc;
+       }
+       return workspace;
+
+       /* unwind in the reverse order of allocation */
+fail_kmalloc:
+       vfree(workspace->inf_strm.workspace);
+fail_inflate:
+       vfree(workspace->def_strm.workspace);
+fail:
+       kfree(workspace);
+       /* give the slot back and let any waiters retry */
+       atomic_dec(&alloc_workspace);
+       wake_up(&workspace_wait);
+       return ERR_PTR(ret);
+}
+
+/*
+ * give a workspace back.  It is parked on the idle list for reuse unless
+ * there are already num_online_cpus() idle ones, in which case it is
+ * freed.  Either way anyone waiting for a workspace is woken up.
+ *
+ * always returns 0
+ */
+static int free_workspace(struct workspace *workspace)
+{
+       int keep;
+
+       spin_lock(&workspace_lock);
+       keep = num_workspace < num_online_cpus();
+       if (keep) {
+               list_add_tail(&workspace->list, &idle_workspace);
+               num_workspace++;
+       }
+       spin_unlock(&workspace_lock);
+
+       if (!keep) {
+               /* plenty of idle workspaces already, tear this one down */
+               vfree(workspace->def_strm.workspace);
+               vfree(workspace->inf_strm.workspace);
+               kfree(workspace->buf);
+               kfree(workspace);
+
+               atomic_dec(&alloc_workspace);
+       }
+
+       if (waitqueue_active(&workspace_wait))
+               wake_up(&workspace_wait);
+       return 0;
+}
+
+/*
+ * cleanup function for module exit: frees every workspace that is
+ * sitting on the idle list
+ */
+static void free_workspaces(void)
+{
+       struct list_head *cur;
+       struct workspace *ws;
+
+       for (;;) {
+               if (list_empty(&idle_workspace))
+                       break;
+
+               cur = idle_workspace.next;
+               ws = list_entry(cur, struct workspace, list);
+               list_del(cur);
+
+               vfree(ws->def_strm.workspace);
+               vfree(ws->inf_strm.workspace);
+               kfree(ws->buf);
+               kfree(ws);
+               atomic_dec(&alloc_workspace);
+       }
+}
+
+/*
+ * given an address space and start/len, compress the bytes.
+ *
+ * pages are allocated to hold the compressed result and stored
+ * in 'pages'
+ *
+ * nr_dest_pages is the number of slots available in 'pages'
+ *
+ * out_pages is used to return the number of pages allocated.  There
+ * may be pages allocated even if we return an error
+ *
+ * total_in is used to return the number of bytes actually read.  It
+ * may be smaller than len if we had to exit early because we
+ * ran out of room in the pages array or because we cross the
+ * max_out threshold.
+ *
+ * total_out is used to return the total number of compressed bytes
+ *
+ * max_out tells us the max number of bytes that we're allowed to
+ * stuff into pages
+ *
+ * returns 0 on success and -1 if anything went wrong or the data
+ * didn't get smaller.  total_in and total_out are only filled in
+ * on success.
+ */
+int btrfs_zlib_compress_pages(struct address_space *mapping,
+                             u64 start, unsigned long len,
+                             struct page **pages,
+                             unsigned long nr_dest_pages,
+                             unsigned long *out_pages,
+                             unsigned long *total_in,
+                             unsigned long *total_out,
+                             unsigned long max_out)
+{
+       int ret;
+       struct workspace *workspace;
+       char *data_in;
+       char *cpage_out;
+       int nr_pages = 0;
+       struct page *in_page = NULL;
+       struct page *out_page = NULL;
+       int out_written = 0;
+       int in_read = 0;
+       unsigned long bytes_left;
+
+       *out_pages = 0;
+       *total_out = 0;
+       *total_in = 0;
+
+       /* find_zlib_workspace hands back an ERR_PTR on failure, not NULL */
+       workspace = find_zlib_workspace();
+       if (IS_ERR(workspace))
+               return -1;
+
+       if (Z_OK != zlib_deflateInit(&workspace->def_strm, 3)) {
+               printk(KERN_WARNING "deflateInit failed\n");
+               ret = -1;
+               goto out;
+       }
+
+       workspace->def_strm.total_in = 0;
+       workspace->def_strm.total_out = 0;
+
+       in_page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT);
+       data_in = kmap(in_page);
+
+       out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+       if (!out_page) {
+               ret = -1;
+               goto out;
+       }
+       cpage_out = kmap(out_page);
+       pages[0] = out_page;
+       nr_pages = 1;
+
+       workspace->def_strm.next_in = data_in;
+       workspace->def_strm.next_out = cpage_out;
+       workspace->def_strm.avail_out = PAGE_CACHE_SIZE;
+       workspace->def_strm.avail_in = min(len, PAGE_CACHE_SIZE);
+
+       out_written = 0;
+       in_read = 0;
+
+       while (workspace->def_strm.total_in < len) {
+               ret = zlib_deflate(&workspace->def_strm, Z_SYNC_FLUSH);
+               if (ret != Z_OK) {
+                       printk(KERN_DEBUG "btrfs deflate in loop returned %d\n",
+                              ret);
+                       zlib_deflateEnd(&workspace->def_strm);
+                       ret = -1;
+                       goto out;
+               }
+
+               /* we're making it bigger, give up */
+               if (workspace->def_strm.total_in > 8192 &&
+                   workspace->def_strm.total_in <
+                   workspace->def_strm.total_out) {
+                       ret = -1;
+                       goto out;
+               }
+               /* we need another page for writing out.  Test this
+                * before the total_in so we will pull in a new page for
+                * the stream end if required
+                */
+               if (workspace->def_strm.avail_out == 0) {
+                       kunmap(out_page);
+                       if (nr_pages == nr_dest_pages) {
+                               out_page = NULL;
+                               ret = -1;
+                               goto out;
+                       }
+                       out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+                       if (!out_page) {
+                               /*
+                                * the old page is already unmapped and
+                                * out_page is NULL, so 'out' won't
+                                * kunmap anything twice
+                                */
+                               ret = -1;
+                               goto out;
+                       }
+                       cpage_out = kmap(out_page);
+                       pages[nr_pages] = out_page;
+                       nr_pages++;
+                       workspace->def_strm.avail_out = PAGE_CACHE_SIZE;
+                       workspace->def_strm.next_out = cpage_out;
+               }
+               /* we're all done */
+               if (workspace->def_strm.total_in >= len)
+                       break;
+
+               /* we've read in a full page, get a new one */
+               if (workspace->def_strm.avail_in == 0) {
+                       if (workspace->def_strm.total_out > max_out)
+                               break;
+
+                       bytes_left = len - workspace->def_strm.total_in;
+                       kunmap(in_page);
+                       page_cache_release(in_page);
+
+                       start += PAGE_CACHE_SIZE;
+                       in_page = find_get_page(mapping,
+                                               start >> PAGE_CACHE_SHIFT);
+                       data_in = kmap(in_page);
+                       workspace->def_strm.avail_in = min(bytes_left,
+                                                          PAGE_CACHE_SIZE);
+                       workspace->def_strm.next_in = data_in;
+               }
+       }
+       /* no more input, let zlib write out the end of the stream */
+       workspace->def_strm.avail_in = 0;
+       ret = zlib_deflate(&workspace->def_strm, Z_FINISH);
+       zlib_deflateEnd(&workspace->def_strm);
+
+       if (ret != Z_STREAM_END) {
+               ret = -1;
+               goto out;
+       }
+
+       /* compression didn't save anything, report failure */
+       if (workspace->def_strm.total_out >= workspace->def_strm.total_in) {
+               ret = -1;
+               goto out;
+       }
+
+       ret = 0;
+       *total_out = workspace->def_strm.total_out;
+       *total_in = workspace->def_strm.total_in;
+out:
+       *out_pages = nr_pages;
+       if (out_page)
+               kunmap(out_page);
+
+       if (in_page) {
+               kunmap(in_page);
+               page_cache_release(in_page);
+       }
+       free_workspace(workspace);
+       return ret;
+}
+
+/*
+ * pages_in is an array of pages with compressed data.
+ *
+ * disk_start is the starting logical offset of this array in the file
+ *
+ * bvec is a bio_vec of pages from the file that we want to decompress into
+ *
+ * vcnt is the count of pages in the biovec
+ *
+ * srclen is the number of bytes in pages_in
+ *
+ * The basic idea is that we have a bio that was created by readpages.
+ * The pages in the bio are for the uncompressed data, and they may not
+ * be contiguous.  They all correspond to the range of bytes covered by
+ * the compressed extent.
+ *
+ * returns 0 once every page in the biovec is filled or the end of the
+ * zlib stream is reached, -ENOMEM if no workspace could be allocated
+ * and -1 for any other failure.
+ */
+int btrfs_zlib_decompress_biovec(struct page **pages_in,
+                             u64 disk_start,
+                             struct bio_vec *bvec,
+                             int vcnt,
+                             size_t srclen)
+{
+       int ret = 0;
+       int wbits = MAX_WBITS;
+       struct workspace *workspace;
+       char *data_in;
+       size_t total_out = 0;
+       unsigned long page_bytes_left;
+       unsigned long page_in_index = 0;
+       unsigned long page_out_index = 0;
+       struct page *page_out;
+       unsigned long total_pages_in = (srclen + PAGE_CACHE_SIZE - 1) /
+                                       PAGE_CACHE_SIZE;
+       unsigned long buf_start;
+       unsigned long buf_offset;
+       unsigned long bytes;
+       unsigned long working_bytes;
+       unsigned long pg_offset;
+       unsigned long start_byte;
+       unsigned long current_buf_start;
+       char *kaddr;
+
+       /* find_zlib_workspace hands back an ERR_PTR on failure, not NULL */
+       workspace = find_zlib_workspace();
+       if (IS_ERR(workspace))
+               return PTR_ERR(workspace);
+
+       data_in = kmap(pages_in[page_in_index]);
+       workspace->inf_strm.next_in = data_in;
+       workspace->inf_strm.avail_in = min(srclen, PAGE_CACHE_SIZE);
+       workspace->inf_strm.total_in = 0;
+
+       workspace->inf_strm.total_out = 0;
+       workspace->inf_strm.next_out = workspace->buf;
+       workspace->inf_strm.avail_out = PAGE_CACHE_SIZE;
+       page_out = bvec[page_out_index].bv_page;
+       page_bytes_left = PAGE_CACHE_SIZE;
+       pg_offset = 0;
+
+       /* If it's deflate, and it's got no preset dictionary, then
+          we can tell zlib to skip the adler32 check. */
+       if (srclen > 2 && !(data_in[1] & PRESET_DICT) &&
+           ((data_in[0] & 0x0f) == Z_DEFLATED) &&
+           !(((data_in[0]<<8) + data_in[1]) % 31)) {
+
+               wbits = -((data_in[0] >> 4) + 8);
+               workspace->inf_strm.next_in += 2;
+               workspace->inf_strm.avail_in -= 2;
+       }
+
+       if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) {
+               printk(KERN_WARNING "inflateInit failed\n");
+               /* the 'out' label skips the kunmap, drop the mapping here */
+               kunmap(pages_in[page_in_index]);
+               ret = -1;
+               goto out;
+       }
+       while(workspace->inf_strm.total_in < srclen) {
+               ret = zlib_inflate(&workspace->inf_strm, Z_NO_FLUSH);
+               if (ret != Z_OK && ret != Z_STREAM_END) {
+                       break;
+               }
+
+               /*
+                * buf_start is the offset into the uncompressed data of
+                * the first byte in our workspace buffer
+                */
+               buf_start = total_out;
+
+               /*
+                * total_out is the offset into the uncompressed data just
+                * past the last byte in the workspace buffer
+                */
+               total_out = workspace->inf_strm.total_out;
+
+               working_bytes = total_out - buf_start;
+
+               /*
+                * start byte is the first byte of the page we're currently
+                * copying into relative to the start of the compressed data.
+                */
+               start_byte = page_offset(page_out) - disk_start;
+
+               if (working_bytes == 0) {
+                       /* we didn't make progress in this inflate
+                        * call, we're done
+                        */
+                       if (ret != Z_STREAM_END)
+                               ret = -1;
+                       break;
+               }
+
+               /* we haven't yet hit data corresponding to this page */
+               if (total_out <= start_byte) {
+                       goto next;
+               }
+
+               /*
+                * the start of the data we care about is offset into
+                * the middle of our working buffer
+                */
+               if (total_out > start_byte && buf_start < start_byte) {
+                       buf_offset = start_byte - buf_start;
+                       working_bytes -= buf_offset;
+               } else {
+                       buf_offset = 0;
+               }
+               current_buf_start = buf_start;
+
+               /* copy bytes from the working buffer into the pages */
+               while(working_bytes > 0) {
+                       bytes = min(PAGE_CACHE_SIZE - pg_offset,
+                                   PAGE_CACHE_SIZE - buf_offset);
+                       bytes = min(bytes, working_bytes);
+                       kaddr = kmap_atomic(page_out, KM_USER0);
+                       memcpy(kaddr + pg_offset, workspace->buf + buf_offset,
+                              bytes);
+                       kunmap_atomic(kaddr, KM_USER0);
+                       flush_dcache_page(page_out);
+
+                       pg_offset += bytes;
+                       page_bytes_left -= bytes;
+                       buf_offset += bytes;
+                       working_bytes -= bytes;
+                       current_buf_start += bytes;
+
+                       /* check if we need to pick another page */
+                       if (page_bytes_left == 0) {
+                               page_out_index++;
+                               if (page_out_index >= vcnt) {
+                                       /* every page in the bio is filled */
+                                       ret = 0;
+                                       goto done;
+                               }
+                               page_out = bvec[page_out_index].bv_page;
+                               pg_offset = 0;
+                               page_bytes_left = PAGE_CACHE_SIZE;
+                               start_byte = page_offset(page_out) - disk_start;
+
+                               /*
+                                * make sure our new page is covered by this
+                                * working buffer
+                                */
+                               if (total_out <= start_byte) {
+                                       goto next;
+                               }
+
+                               /* the next page in the biovec might not
+                                * be adjacent to the last page, but it
+                                * might still be found inside this working
+                                * buffer.  bump our offset pointer
+                                */
+                               if (total_out > start_byte &&
+                                   current_buf_start < start_byte) {
+                                       buf_offset = start_byte - buf_start;
+                                       working_bytes = total_out - start_byte;
+                                       current_buf_start = buf_start +
+                                               buf_offset;
+                               }
+                       }
+               }
+next:
+               /* the workspace buffer is consumed, reuse it from the top */
+               workspace->inf_strm.next_out = workspace->buf;
+               workspace->inf_strm.avail_out = PAGE_CACHE_SIZE;
+
+               /* this input page is used up, move on to the next one */
+               if (workspace->inf_strm.avail_in == 0) {
+                       unsigned long tmp;
+                       kunmap(pages_in[page_in_index]);
+                       page_in_index++;
+                       if (page_in_index >= total_pages_in) {
+                               /* nothing is mapped now, 'done' must skip kunmap */
+                               data_in = NULL;
+                               break;
+                       }
+                       data_in = kmap(pages_in[page_in_index]);
+                       workspace->inf_strm.next_in = data_in;
+                       tmp = srclen - workspace->inf_strm.total_in;
+                       workspace->inf_strm.avail_in = min(tmp,
+                                                          PAGE_CACHE_SIZE);
+               }
+       }
+       if (ret != Z_STREAM_END) {
+               ret = -1;
+       } else {
+               ret = 0;
+       }
+done:
+       zlib_inflateEnd(&workspace->inf_strm);
+       if (data_in)
+               kunmap(pages_in[page_in_index]);
+out:
+       free_workspace(workspace);
+       return ret;
+}
+
+/*
+ * a less complex decompression routine.  Our compressed data fits in a
+ * single page, and we want to read a single page out of it.
+ *
+ * data_in:    start of the compressed stream (srclen bytes)
+ * dest_page:  page that receives the decompressed bytes, filled starting
+ *             at offset 0
+ * start_byte: offset into the decompressed data we're interested in;
+ *             everything inflated before it is thrown away
+ * srclen:     number of compressed bytes available at data_in
+ * destlen:    number of decompressed bytes to copy into dest_page, at
+ *             most PAGE_CACHE_SIZE
+ *
+ * Returns 0 once destlen bytes have been copied, -ENOMEM if destlen is
+ * larger than a page or no workspace could be found, and -1 if inflate
+ * could not be set up or stopped making progress before destlen bytes
+ * were copied.
+ */
+int btrfs_zlib_decompress(unsigned char *data_in,
+                         struct page *dest_page,
+                         unsigned long start_byte,
+                         size_t srclen, size_t destlen)
+{
+       int ret = 0;
+       int wbits = MAX_WBITS;
+       struct workspace *workspace;
+       unsigned long bytes_left = destlen;
+       unsigned long total_out = 0;
+       /*
+        * write position inside dest_page.  This must survive from one
+        * loop iteration to the next: when the range we want straddles
+        * two fills of the working buffer, the second copy has to land
+        * right after the first one instead of overwriting it at
+        * offset 0.
+        */
+       unsigned long pg_offset = 0;
+       char *kaddr;
+
+       if (destlen > PAGE_CACHE_SIZE)
+               return -ENOMEM;
+
+       workspace = find_zlib_workspace();
+       if (!workspace)
+               return -ENOMEM;
+
+       workspace->inf_strm.next_in = data_in;
+       workspace->inf_strm.avail_in = srclen;
+       workspace->inf_strm.total_in = 0;
+
+       workspace->inf_strm.next_out = workspace->buf;
+       workspace->inf_strm.avail_out = PAGE_CACHE_SIZE;
+       workspace->inf_strm.total_out = 0;
+       /* If it's deflate, and it's got no preset dictionary, then
+          we can tell zlib to skip the adler32 check. */
+       if (srclen > 2 && !(data_in[1] & PRESET_DICT) &&
+           ((data_in[0] & 0x0f) == Z_DEFLATED) &&
+           !(((data_in[0]<<8) + data_in[1]) % 31)) {
+
+               /* step over the two header bytes we just validated */
+               wbits = -((data_in[0] >> 4) + 8);
+               workspace->inf_strm.next_in += 2;
+               workspace->inf_strm.avail_in -= 2;
+       }
+
+       if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) {
+               printk(KERN_WARNING "inflateInit failed\n");
+               ret = -1;
+               goto out;
+       }
+
+       while (bytes_left > 0) {
+               unsigned long buf_start;
+               unsigned long buf_offset;
+               unsigned long bytes;
+
+               ret = zlib_inflate(&workspace->inf_strm, Z_NO_FLUSH);
+               if (ret != Z_OK && ret != Z_STREAM_END) {
+                       break;
+               }
+
+               /*
+                * the working buffer now holds the decompressed bytes
+                * [buf_start, total_out)
+                */
+               buf_start = total_out;
+               total_out = workspace->inf_strm.total_out;
+
+               /* no forward progress, bail out instead of spinning */
+               if (total_out == buf_start) {
+                       ret = -1;
+                       break;
+               }
+
+               /* this buffer ends before the range we care about */
+               if (total_out <= start_byte) {
+                       goto next;
+               }
+
+               /*
+                * start_byte falls inside this buffer, skip the bytes
+                * in front of it.  Otherwise we are already past
+                * start_byte and the whole buffer is wanted.
+                */
+               if (total_out > start_byte && buf_start < start_byte) {
+                       buf_offset = start_byte - buf_start;
+               } else {
+                       buf_offset = 0;
+               }
+
+               /*
+                * NOTE(review): bytes is limited by the room left in the
+                * page, the room left in the working buffer and
+                * bytes_left, but not by how much inflate actually
+                * produced (total_out - buf_start - buf_offset).  A short
+                * stream can therefore copy stale buffer contents --
+                * confirm callers never ask for more than the stream
+                * holds.
+                */
+               bytes = min(PAGE_CACHE_SIZE - pg_offset,
+                           PAGE_CACHE_SIZE - buf_offset);
+               bytes = min(bytes, bytes_left);
+
+               kaddr = kmap_atomic(dest_page, KM_USER0);
+               memcpy(kaddr + pg_offset, workspace->buf + buf_offset, bytes);
+               kunmap_atomic(kaddr, KM_USER0);
+
+               pg_offset += bytes;
+               bytes_left -= bytes;
+next:
+               /* hand the whole working buffer back to zlib */
+               workspace->inf_strm.next_out = workspace->buf;
+               workspace->inf_strm.avail_out = PAGE_CACHE_SIZE;
+       }
+       if (ret != Z_STREAM_END && bytes_left != 0) {
+               ret = -1;
+       } else {
+               ret = 0;
+       }
+       zlib_inflateEnd(&workspace->inf_strm);
+out:
+       free_workspace(workspace);
+       return ret;
+}
+
+/*
+ * zlib exit hook: delegates to free_workspaces() and does nothing else.
+ */
+void btrfs_zlib_exit(void)
+{
+       free_workspaces();
+}
author	Chris Mason <chris.mason@oracle.com>
	Wed, 29 Oct 2008 18:49:59 +0000 (14:49 -0400)
committer	Chris Mason <chris.mason@oracle.com>
	Wed, 29 Oct 2008 18:49:59 +0000 (14:49 -0400)
fs/Kconfig		patch \| blob \| history
fs/btrfs/Makefile		patch \| blob \| history
fs/btrfs/compression.c	[new file with mode: 0644]	patch \| blob
fs/btrfs/compression.h	[new file with mode: 0644]	patch \| blob
fs/btrfs/ctree.h		patch \| blob \| history
fs/btrfs/disk-io.c		patch \| blob \| history
fs/btrfs/disk-io.h		patch \| blob \| history
fs/btrfs/extent-tree.c		patch \| blob \| history
fs/btrfs/extent_io.c		patch \| blob \| history
fs/btrfs/extent_io.h		patch \| blob \| history
fs/btrfs/extent_map.c		patch \| blob \| history
fs/btrfs/extent_map.h		patch \| blob \| history
fs/btrfs/file-item.c		patch \| blob \| history
fs/btrfs/file.c		patch \| blob \| history
fs/btrfs/inode.c		patch \| blob \| history
fs/btrfs/ordered-data.c		patch \| blob \| history
fs/btrfs/ordered-data.h		patch \| blob \| history
fs/btrfs/print-tree.c		patch \| blob \| history
fs/btrfs/super.c		patch \| blob \| history
fs/btrfs/tree-log.c		patch \| blob \| history
fs/btrfs/volumes.c		patch \| blob \| history
fs/btrfs/zlib.c	[new file with mode: 0644]	patch \| blob