ext4: add self-testing infrastructure to do a sanity check
[firefly-linux-kernel-4.4.55.git] / fs/ext4/inode.c
index 5042c8773ad75c9bf9deaf899aa227da4b2098e4..3186a43fa4b0334ada4df3436b1464348ab2ef47 100644
@@ -343,7 +343,7 @@ void ext4_da_update_reserve_space(struct inode *inode,
        spin_lock(&ei->i_block_reservation_lock);
        trace_ext4_da_update_reserve_space(inode, used, quota_claim);
        if (unlikely(used > ei->i_reserved_data_blocks)) {
-               ext4_msg(inode->i_sb, KERN_NOTICE, "%s: ino %lu, used %d "
+               ext4_warning(inode->i_sb, "%s: ino %lu, used %d "
                         "with only %d reserved data blocks",
                         __func__, inode->i_ino, used,
                         ei->i_reserved_data_blocks);
@@ -352,10 +352,12 @@ void ext4_da_update_reserve_space(struct inode *inode,
        }
 
        if (unlikely(ei->i_allocated_meta_blocks > ei->i_reserved_meta_blocks)) {
-               ext4_msg(inode->i_sb, KERN_NOTICE, "%s: ino %lu, allocated %d "
-                        "with only %d reserved metadata blocks\n", __func__,
-                        inode->i_ino, ei->i_allocated_meta_blocks,
-                        ei->i_reserved_meta_blocks);
+               ext4_warning(inode->i_sb, "ino %lu, allocated %d "
+                       "with only %d reserved metadata blocks "
+                       "(releasing %d blocks with reserved %d data blocks)",
+                       inode->i_ino, ei->i_allocated_meta_blocks,
+                       ei->i_reserved_meta_blocks, used,
+                       ei->i_reserved_data_blocks);
                WARN_ON(1);
                ei->i_allocated_meta_blocks = ei->i_reserved_meta_blocks;
        }
@@ -480,6 +482,58 @@ static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx,
        return num;
 }
 
+#ifdef ES_AGGRESSIVE_TEST
+static void ext4_map_blocks_es_recheck(handle_t *handle,
+                                      struct inode *inode,
+                                      struct ext4_map_blocks *es_map,
+                                      struct ext4_map_blocks *map,
+                                      int flags)
+{
+       int retval;
+
+       map->m_flags = 0;
+       /*
+        * There is a race window in which the two results may differ,
+        * e.g. xfstests #223 when dioread_nolock is enabled.  The reason
+        * is that we look up the block mapping in the extent status tree
+        * without taking i_data_sem, so the unwritten extent could have
+        * been converted by the time we recheck it here.
+        */
+       if (!(flags & EXT4_GET_BLOCKS_NO_LOCK))
+               down_read((&EXT4_I(inode)->i_data_sem));
+       if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
+               retval = ext4_ext_map_blocks(handle, inode, map, flags &
+                                            EXT4_GET_BLOCKS_KEEP_SIZE);
+       } else {
+               retval = ext4_ind_map_blocks(handle, inode, map, flags &
+                                            EXT4_GET_BLOCKS_KEEP_SIZE);
+       }
+       if (!(flags & EXT4_GET_BLOCKS_NO_LOCK))
+               up_read((&EXT4_I(inode)->i_data_sem));
+       /*
+        * Clear the EXT4_MAP_FROM_CLUSTER and EXT4_MAP_BOUNDARY flags,
+        * since they should never be recorded in es_map->m_flags.
+        */
+       map->m_flags &= ~(EXT4_MAP_FROM_CLUSTER | EXT4_MAP_BOUNDARY);
+
+       /*
+        * We don't check m_len because extents may get collapsed (merged)
+        * in the status tree, so the two lengths can legitimately differ.
+        */
+       if (es_map->m_lblk != map->m_lblk ||
+           es_map->m_flags != map->m_flags ||
+           es_map->m_pblk != map->m_pblk) {
+               printk("ES cache assertion failed for inode: %lu "
+                      "es_cached ex [%d/%d/%llu/%x] != "
+                      "found ex [%d/%d/%llu/%x] retval %d flags %x\n",
+                      inode->i_ino, es_map->m_lblk, es_map->m_len,
+                      es_map->m_pblk, es_map->m_flags, map->m_lblk,
+                      map->m_len, map->m_pblk, map->m_flags,
+                      retval, flags);
+       }
+}
+#endif /* ES_AGGRESSIVE_TEST */
+
 /*
  * The ext4_map_blocks() function tries to look up the requested blocks,
  * and returns if the blocks are already mapped.
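
The ES_AGGRESSIVE_TEST helper added above is an instance of a generic verify-the-cache pattern: answer the query from the cache, recompute the answer from the authoritative source under the same locking rules, and warn loudly on any mismatch.  A minimal sketch of just the comparison step, using illustrative types rather than the ext4 ones (lengths are deliberately not compared, since cached extents may have been merged):

        #include <stdio.h>

        struct mapping {
                unsigned int lblk, len, flags;
                unsigned long long pblk;
        };

        /* Compare what the cache answered against a freshly computed mapping. */
        static void recheck_cache(const struct mapping *cached,
                                  const struct mapping *real)
        {
                if (cached->lblk != real->lblk ||
                    cached->pblk != real->pblk ||
                    cached->flags != real->flags)
                        fprintf(stderr, "cache mismatch at lblk %u\n",
                                cached->lblk);
        }
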
@@ -505,12 +559,42 @@ static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx,
 int ext4_map_blocks(handle_t *handle, struct inode *inode,
                    struct ext4_map_blocks *map, int flags)
 {
+       struct extent_status es;
        int retval;
+#ifdef ES_AGGRESSIVE_TEST
+       struct ext4_map_blocks orig_map;
+
+       memcpy(&orig_map, map, sizeof(*map));
+#endif
 
        map->m_flags = 0;
        ext_debug("ext4_map_blocks(): inode %lu, flag %d, max_blocks %u,"
                  "logical block %lu\n", inode->i_ino, flags, map->m_len,
                  (unsigned long) map->m_lblk);
+
+       /* Look up the extent status tree first */
+       if (ext4_es_lookup_extent(inode, map->m_lblk, &es)) {
+               if (ext4_es_is_written(&es) || ext4_es_is_unwritten(&es)) {
+                       map->m_pblk = ext4_es_pblock(&es) +
+                                       map->m_lblk - es.es_lblk;
+                       map->m_flags |= ext4_es_is_written(&es) ?
+                                       EXT4_MAP_MAPPED : EXT4_MAP_UNWRITTEN;
+                       retval = es.es_len - (map->m_lblk - es.es_lblk);
+                       if (retval > map->m_len)
+                               retval = map->m_len;
+                       map->m_len = retval;
+               } else if (ext4_es_is_delayed(&es) || ext4_es_is_hole(&es)) {
+                       retval = 0;
+               } else {
+                       BUG_ON(1);
+               }
+#ifdef ES_AGGRESSIVE_TEST
+               ext4_map_blocks_es_recheck(handle, inode, map,
+                                          &orig_map, flags);
+#endif
+               goto found;
+       }
+
        /*
         * Try to see if we can get the block without requesting a new
         * file system block.
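
The fast path above serves the request from a cached extent [es_lblk, es_lblk + es_len) that maps to es_pblk.  A standalone sketch of the offset arithmetic only, with illustrative types in place of the kernel structures:

        struct cached_extent {
                unsigned int lblk;              /* first logical block */
                unsigned int len;               /* number of blocks */
                unsigned long long pblk;        /* first physical block */
        };

        /*
         * Resolve a lookup for "want" blocks starting at "lblk", which is
         * known to fall inside *es.  Returns the number of blocks actually
         * covered, never more than requested.
         */
        static unsigned int map_from_cached_extent(const struct cached_extent *es,
                                                   unsigned int lblk,
                                                   unsigned int want,
                                                   unsigned long long *pblk)
        {
                unsigned int off = lblk - es->lblk;     /* offset into the extent */
                unsigned int left = es->len - off;      /* blocks left after lblk */

                *pblk = es->pblk + off;
                return left < want ? left : want;
        }
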
@@ -524,20 +608,36 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
                retval = ext4_ind_map_blocks(handle, inode, map, flags &
                                             EXT4_GET_BLOCKS_KEEP_SIZE);
        }
+       if (retval > 0) {
+               int ret;
+               unsigned long long status;
+
+#ifdef ES_AGGRESSIVE_TEST
+               if (retval != map->m_len) {
+                       printk("ES len assertion failed for inode: %lu "
+                              "retval %d != map->m_len %d "
+                              "in %s (lookup)\n", inode->i_ino, retval,
+                              map->m_len, __func__);
+               }
+#endif
+
+               status = map->m_flags & EXT4_MAP_UNWRITTEN ?
+                               EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
+               if (!(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) &&
+                   ext4_find_delalloc_range(inode, map->m_lblk,
+                                            map->m_lblk + map->m_len - 1))
+                       status |= EXTENT_STATUS_DELAYED;
+               ret = ext4_es_insert_extent(inode, map->m_lblk,
+                                           map->m_len, map->m_pblk, status);
+               if (ret < 0)
+                       retval = ret;
+       }
        if (!(flags & EXT4_GET_BLOCKS_NO_LOCK))
                up_read((&EXT4_I(inode)->i_data_sem));
 
+found:
        if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
-               int ret;
-               if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) {
-                       /* delayed alloc may be allocated by fallocate and
-                        * coverted to initialized by directIO.
-                        * we need to handle delayed extent here.
-                        */
-                       down_write((&EXT4_I(inode)->i_data_sem));
-                       goto delayed_mapped;
-               }
-               ret = check_block_validity(inode, map);
+               int ret = check_block_validity(inode, map);
                if (ret != 0)
                        return ret;
        }
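
Both this lookup path and the allocation path further down record their result back into the extent status tree, and both choose the status bits the same way.  A compact sketch of that selection, where is_unwritten, reserved and has_delalloc are illustrative stand-ins for the EXT4_MAP_UNWRITTEN test, the EXT4_GET_BLOCKS_DELALLOC_RESERVE flag and ext4_find_delalloc_range():

        #define STATUS_WRITTEN   (1ULL << 0)
        #define STATUS_UNWRITTEN (1ULL << 1)
        #define STATUS_DELAYED   (1ULL << 2)

        /* Pick the status to cache for a freshly looked-up or allocated mapping. */
        static unsigned long long es_status(int is_unwritten, int reserved,
                                            int has_delalloc)
        {
                unsigned long long status;

                status = is_unwritten ? STATUS_UNWRITTEN : STATUS_WRITTEN;
                if (!reserved && has_delalloc)
                        status |= STATUS_DELAYED;   /* delalloc blocks overlap the range */
                return status;
        }
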
@@ -557,16 +657,10 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
                return retval;
 
        /*
-        * When we call get_blocks without the create flag, the
-        * BH_Unwritten flag could have gotten set if the blocks
-        * requested were part of a uninitialized extent.  We need to
-        * clear this flag now that we are committed to convert all or
-        * part of the uninitialized extent to be an initialized
-        * extent.  This is because we need to avoid the combination
-        * of BH_Unwritten and BH_Mapped flags being simultaneously
-        * set on the buffer_head.
+        * Here we clear m_flags because after allocating a new extent,
+        * the flags will be set again.
         */
-       map->m_flags &= ~EXT4_MAP_UNWRITTEN;
+       map->m_flags &= ~EXT4_MAP_FLAGS;
 
        /*
         * New blocks allocate and/or writing to uninitialized extent
@@ -612,18 +706,32 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
                        (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE))
                        ext4_da_update_reserve_space(inode, retval, 1);
        }
-       if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) {
+       if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
                ext4_clear_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED);
 
-               if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
-                       int ret;
-delayed_mapped:
-                       /* delayed allocation blocks has been allocated */
-                       ret = ext4_es_remove_extent(inode, map->m_lblk,
-                                                   map->m_len);
-                       if (ret < 0)
-                               retval = ret;
+       if (retval > 0) {
+               int ret;
+               unsigned long long status;
+
+#ifdef ES_AGGRESSIVE_TEST
+               if (retval != map->m_len) {
+                       printk("ES len assertion failed for inode: %lu "
+                              "retval %d != map->m_len %d "
+                              "in %s (allocation)\n", inode->i_ino, retval,
+                              map->m_len, __func__);
                }
+#endif
+
+               status = map->m_flags & EXT4_MAP_UNWRITTEN ?
+                               EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
+               if (!(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) &&
+                   ext4_find_delalloc_range(inode, map->m_lblk,
+                                            map->m_lblk + map->m_len - 1))
+                       status |= EXTENT_STATUS_DELAYED;
+               ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
+                                           map->m_pblk, status);
+               if (ret < 0)
+                       retval = ret;
        }
 
        up_write((&EXT4_I(inode)->i_data_sem));
@@ -705,6 +813,8 @@ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
        /* ensure we send some value back into *errp */
        *errp = 0;
 
+       if (create && err == 0)
+               err = -ENOSPC;  /* should never happen */
        if (err < 0)
                *errp = err;
        if (err <= 0)
@@ -875,32 +985,40 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping,
                ret = ext4_try_to_write_inline_data(mapping, inode, pos, len,
                                                    flags, pagep);
                if (ret < 0)
-                       goto out;
-               if (ret == 1) {
-                       ret = 0;
-                       goto out;
-               }
+                       return ret;
+               if (ret == 1)
+                       return 0;
        }
 
-retry:
+       /*
+        * grab_cache_page_write_begin() can take a long time if the
+        * system is thrashing due to memory pressure, or if the page
+        * is being written back.  So grab it first before we start
+        * the transaction handle.  This also allows us to allocate
+        * the page (if needed) without using GFP_NOFS.
+        */
+retry_grab:
+       page = grab_cache_page_write_begin(mapping, index, flags);
+       if (!page)
+               return -ENOMEM;
+       unlock_page(page);
+
+retry_journal:
        handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, needed_blocks);
        if (IS_ERR(handle)) {
-               ret = PTR_ERR(handle);
-               goto out;
+               page_cache_release(page);
+               return PTR_ERR(handle);
        }
 
-       /* We cannot recurse into the filesystem as the transaction is already
-        * started */
-       flags |= AOP_FLAG_NOFS;
-
-       page = grab_cache_page_write_begin(mapping, index, flags);
-       if (!page) {
+       lock_page(page);
+       if (page->mapping != mapping) {
+               /* The page got truncated from under us */
+               unlock_page(page);
+               page_cache_release(page);
                ext4_journal_stop(handle);
-               ret = -ENOMEM;
-               goto out;
+               goto retry_grab;
        }
-
-       *pagep = page;
+       wait_on_page_writeback(page);
 
        if (ext4_should_dioread_nolock(inode))
                ret = __block_write_begin(page, pos, len, ext4_get_block_write);
@@ -915,7 +1033,6 @@ retry:
 
        if (ret) {
                unlock_page(page);
-               page_cache_release(page);
                /*
                 * __block_write_begin may have instantiated a few blocks
                 * outside i_size.  Trim these off again. Don't need
@@ -939,11 +1056,14 @@ retry:
                        if (inode->i_nlink)
                                ext4_orphan_del(NULL, inode);
                }
-       }
 
-       if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
-               goto retry;
-out:
+               if (ret == -ENOSPC &&
+                   ext4_should_retry_alloc(inode->i_sb, &retries))
+                       goto retry_journal;
+               page_cache_release(page);
+               return ret;
+       }
+       *pagep = page;
        return ret;
 }
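
The restructured ext4_write_begin() above (and ext4_da_write_begin() below) now pins the page before starting the journal handle, so grab_cache_page_write_begin() can block or allocate without GFP_NOFS while no transaction is open, at the cost of revalidating the page afterwards.  A condensed sketch of that ordering as one helper; fill_page() is a hypothetical stand-in for the __block_write_begin() call, the -ENOSPC retry is omitted, and everything else uses the same calls as the patch:

        static int grab_page_then_journal(struct inode *inode,
                                          struct address_space *mapping,
                                          pgoff_t index, unsigned flags,
                                          int needed_blocks,
                                          int (*fill_page)(struct page *),
                                          struct page **pagep,
                                          handle_t **handlep)
        {
                struct page *page;
                handle_t *handle;
                int ret;

        retry_grab:
                /*
                 * May block on memory pressure or on writeback.  No handle is
                 * held yet, so the allocation does not need GFP_NOFS.
                 */
                page = grab_cache_page_write_begin(mapping, index, flags);
                if (!page)
                        return -ENOMEM;
                unlock_page(page);

                handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, needed_blocks);
                if (IS_ERR(handle)) {
                        page_cache_release(page);
                        return PTR_ERR(handle);
                }

                lock_page(page);
                if (page->mapping != mapping) {
                        /* The page was truncated while it was unlocked. */
                        unlock_page(page);
                        page_cache_release(page);
                        ext4_journal_stop(handle);
                        goto retry_grab;
                }
                wait_on_page_writeback(page);   /* writeback may have begun meanwhile */

                ret = fill_page(page);
                if (ret) {
                        unlock_page(page);
                        ext4_journal_stop(handle);
                        page_cache_release(page);
                        return ret;
                }
                *pagep = page;
                *handlep = handle;
                return 0;
        }
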
 
@@ -1253,7 +1373,7 @@ static void ext4_da_release_space(struct inode *inode, int to_free)
                 * function is called from invalidate page, it's
                 * harmless to return without any action.
                 */
-               ext4_msg(inode->i_sb, KERN_NOTICE, "ext4_da_release_space: "
+               ext4_warning(inode->i_sb, "ext4_da_release_space: "
                         "ino %lu, to_free %d with only %d reserved "
                         "data blocks", inode->i_ino, to_free,
                         ei->i_reserved_data_blocks);
@@ -1599,7 +1719,7 @@ static void mpage_da_map_and_submit(struct mpage_da_data *mpd)
                                 (unsigned long long) next,
                                 mpd->b_size >> mpd->inode->i_blkbits, err);
                        ext4_msg(sb, KERN_CRIT,
-                               "This should not happen!! Data will be lost\n");
+                               "This should not happen!! Data will be lost");
                        if (err == -ENOSPC)
                                ext4_print_free_blocks(mpd->inode);
                }
@@ -1724,8 +1844,14 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
                              struct ext4_map_blocks *map,
                              struct buffer_head *bh)
 {
+       struct extent_status es;
        int retval;
        sector_t invalid_block = ~((sector_t) 0xffff);
+#ifdef ES_AGGRESSIVE_TEST
+       struct ext4_map_blocks orig_map;
+
+       memcpy(&orig_map, map, sizeof(*map));
+#endif
 
        if (invalid_block < ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es))
                invalid_block = ~0;
@@ -1734,6 +1860,45 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
        ext_debug("ext4_da_map_blocks(): inode %lu, max_blocks %u,"
                  "logical block %lu\n", inode->i_ino, map->m_len,
                  (unsigned long) map->m_lblk);
+
+       /* Look up the extent status tree first */
+       if (ext4_es_lookup_extent(inode, iblock, &es)) {
+
+               if (ext4_es_is_hole(&es)) {
+                       retval = 0;
+                       down_read((&EXT4_I(inode)->i_data_sem));
+                       goto add_delayed;
+               }
+
+               /*
+                * A delayed extent may also come from fallocate (it would
+                * then be unwritten too), so only purely delayed ones count.
+                */
+               if (ext4_es_is_delayed(&es) && !ext4_es_is_unwritten(&es)) {
+                       map_bh(bh, inode->i_sb, invalid_block);
+                       set_buffer_new(bh);
+                       set_buffer_delay(bh);
+                       return 0;
+               }
+
+               map->m_pblk = ext4_es_pblock(&es) + iblock - es.es_lblk;
+               retval = es.es_len - (iblock - es.es_lblk);
+               if (retval > map->m_len)
+                       retval = map->m_len;
+               map->m_len = retval;
+               if (ext4_es_is_written(&es))
+                       map->m_flags |= EXT4_MAP_MAPPED;
+               else if (ext4_es_is_unwritten(&es))
+                       map->m_flags |= EXT4_MAP_UNWRITTEN;
+               else
+                       BUG_ON(1);
+
+#ifdef ES_AGGRESSIVE_TEST
+               ext4_map_blocks_es_recheck(NULL, inode, map, &orig_map, 0);
+#endif
+               return retval;
+       }
+
        /*
         * Try to see if we can get the block without requesting a new
         * file system block.
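
When the delalloc write path hits the extent status tree here, three outcomes are possible: a cached hole falls through to the reservation code, a purely delayed extent only needs the buffer marked, and a written or unwritten extent returns its real mapping.  A small dispatch sketch with illustrative names:

        enum es_kind { ES_HOLE, ES_DELAYED_ONLY, ES_UNWRITTEN, ES_WRITTEN };
        enum da_action { DA_RESERVE, DA_MARK_DELAYED, DA_RETURN_MAPPING };

        /* How a cache hit is dispatched in the delalloc write path above. */
        static enum da_action da_dispatch(enum es_kind kind)
        {
                switch (kind) {
                case ES_HOLE:
                        return DA_RESERVE;        /* no block yet: reserve one */
                case ES_DELAYED_ONLY:
                        return DA_MARK_DELAYED;   /* already reserved earlier */
                case ES_UNWRITTEN:                /* e.g. preallocated by fallocate */
                case ES_WRITTEN:
                        return DA_RETURN_MAPPING; /* blocks already exist on disk */
                }
                return DA_RESERVE;                /* not reached */
        }
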
@@ -1752,11 +1917,15 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
                        map->m_flags |= EXT4_MAP_FROM_CLUSTER;
                retval = 0;
        } else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
-               retval = ext4_ext_map_blocks(NULL, inode, map, 0);
+               retval = ext4_ext_map_blocks(NULL, inode, map,
+                                            EXT4_GET_BLOCKS_NO_PUT_HOLE);
        else
-               retval = ext4_ind_map_blocks(NULL, inode, map, 0);
+               retval = ext4_ind_map_blocks(NULL, inode, map,
+                                            EXT4_GET_BLOCKS_NO_PUT_HOLE);
 
+add_delayed:
        if (retval == 0) {
+               int ret;
                /*
                 * XXX: __block_prepare_write() unmaps passed block,
                 * is it OK?
@@ -1764,15 +1933,20 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
                /* If the block was allocated from previously allocated cluster,
                 * then we dont need to reserve it again. */
                if (!(map->m_flags & EXT4_MAP_FROM_CLUSTER)) {
-                       retval = ext4_da_reserve_space(inode, iblock);
-                       if (retval)
+                       ret = ext4_da_reserve_space(inode, iblock);
+                       if (ret) {
                                /* not enough space to reserve */
+                               retval = ret;
                                goto out_unlock;
+                       }
                }
 
-               retval = ext4_es_insert_extent(inode, map->m_lblk, map->m_len);
-               if (retval)
+               ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
+                                           ~0, EXTENT_STATUS_DELAYED);
+               if (ret) {
+                       retval = ret;
                        goto out_unlock;
+               }
 
                /* Clear EXT4_MAP_FROM_CLUSTER flag since its purpose is served
                 * and it should not appear on the bh->b_state.
@@ -1782,6 +1956,25 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
                map_bh(bh, inode->i_sb, invalid_block);
                set_buffer_new(bh);
                set_buffer_delay(bh);
+       } else if (retval > 0) {
+               int ret;
+               unsigned long long status;
+
+#ifdef ES_AGGRESSIVE_TEST
+               if (retval != map->m_len) {
+                       printk("ES len assertion failed for inode: %lu "
+                              "retval %d != map->m_len %d "
+                              "in %s (lookup)\n", inode->i_ino, retval,
+                              map->m_len, __func__);
+               }
+#endif
+
+               status = map->m_flags & EXT4_MAP_UNWRITTEN ?
+                               EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
+               ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
+                                           map->m_pblk, status);
+               if (ret != 0)
+                       retval = ret;
        }
 
 out_unlock:
@@ -2458,42 +2651,52 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
                                                      pos, len, flags,
                                                      pagep, fsdata);
                if (ret < 0)
-                       goto out;
-               if (ret == 1) {
-                       ret = 0;
-                       goto out;
-               }
+                       return ret;
+               if (ret == 1)
+                       return 0;
        }
 
-retry:
+       /*
+        * grab_cache_page_write_begin() can take a long time if the
+        * system is thrashing due to memory pressure, or if the page
+        * is being written back.  So grab it first before we start
+        * the transaction handle.  This also allows us to allocate
+        * the page (if needed) without using GFP_NOFS.
+        */
+retry_grab:
+       page = grab_cache_page_write_begin(mapping, index, flags);
+       if (!page)
+               return -ENOMEM;
+       unlock_page(page);
+
        /*
         * With delayed allocation, we don't log the i_disksize update
         * if there is delayed block allocation. But we still need
         * to journalling the i_disksize update if writes to the end
         * of file which has an already mapped buffer.
         */
+retry_journal:
        handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, 1);
        if (IS_ERR(handle)) {
-               ret = PTR_ERR(handle);
-               goto out;
+               page_cache_release(page);
+               return PTR_ERR(handle);
        }
-       /* We cannot recurse into the filesystem as the transaction is already
-        * started */
-       flags |= AOP_FLAG_NOFS;
 
-       page = grab_cache_page_write_begin(mapping, index, flags);
-       if (!page) {
+       lock_page(page);
+       if (page->mapping != mapping) {
+               /* The page got truncated from under us */
+               unlock_page(page);
+               page_cache_release(page);
                ext4_journal_stop(handle);
-               ret = -ENOMEM;
-               goto out;
+               goto retry_grab;
        }
-       *pagep = page;
+       /* In case writeback began while the page was unlocked */
+       wait_on_page_writeback(page);
 
        ret = __block_write_begin(page, pos, len, ext4_da_get_block_prep);
        if (ret < 0) {
                unlock_page(page);
                ext4_journal_stop(handle);
-               page_cache_release(page);
                /*
                 * block_write_begin may have instantiated a few blocks
                 * outside i_size.  Trim these off again. Don't need
@@ -2501,11 +2704,16 @@ retry:
                 */
                if (pos + len > inode->i_size)
                        ext4_truncate_failed_write(inode);
+
+               if (ret == -ENOSPC &&
+                   ext4_should_retry_alloc(inode->i_sb, &retries))
+                       goto retry_journal;
+
+               page_cache_release(page);
+               return ret;
        }
 
-       if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
-               goto retry;
-out:
+       *pagep = page;
        return ret;
 }