ext4: teach ext4_ext_find_extent() to realloc path if necessary
[firefly-linux-kernel-4.4.55.git] fs/ext4/extents.c
1 /*
2  * Copyright (c) 2003-2006, Cluster File Systems, Inc, info@clusterfs.com
3  * Written by Alex Tomas <alex@clusterfs.com>
4  *
5  * Architecture independence:
6  *   Copyright (c) 2005, Bull S.A.
7  *   Written by Pierre Peiffer <pierre.peiffer@bull.net>
8  *
9  * This program is free software; you can redistribute it and/or modify
10  * it under the terms of the GNU General Public License version 2 as
11  * published by the Free Software Foundation.
12  *
13  * This program is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16  * GNU General Public License for more details.
17  *
18  * You should have received a copy of the GNU General Public License
19  * along with this program; if not, write to the Free Software
20  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307, USA
21  */
22
23 /*
24  * Extents support for EXT4
25  *
26  * TODO:
27  *   - ext4*_error() should be used in some situations
28  *   - analyze all BUG()/BUG_ON(), use -EIO where appropriate
29  *   - smart tree reduction
30  */
31
32 #include <linux/fs.h>
33 #include <linux/time.h>
34 #include <linux/jbd2.h>
35 #include <linux/highuid.h>
36 #include <linux/pagemap.h>
37 #include <linux/quotaops.h>
38 #include <linux/string.h>
39 #include <linux/slab.h>
40 #include <asm/uaccess.h>
41 #include <linux/fiemap.h>
42 #include "ext4_jbd2.h"
43 #include "ext4_extents.h"
44 #include "xattr.h"
45
46 #include <trace/events/ext4.h>
47
48 /*
49  * used by extent splitting.
50  */
51 #define EXT4_EXT_MAY_ZEROOUT    0x1  /* safe to zeroout if split fails \
52                                         due to ENOSPC */
53 #define EXT4_EXT_MARK_UNWRIT1   0x2  /* mark first half unwritten */
54 #define EXT4_EXT_MARK_UNWRIT2   0x4  /* mark second half unwritten */
55
56 #define EXT4_EXT_DATA_VALID1    0x8  /* first half contains valid data */
57 #define EXT4_EXT_DATA_VALID2    0x10 /* second half contains valid data */
58
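/*
 * A usage note (drawn from ext4_force_split_extent_at() below): when the
 * extent being split is already unwritten, both EXT4_EXT_MARK_UNWRIT1 and
 * EXT4_EXT_MARK_UNWRIT2 are passed so that both halves keep their
 * unwritten state across the split.
 */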
59 static __le32 ext4_extent_block_csum(struct inode *inode,
60                                      struct ext4_extent_header *eh)
61 {
62         struct ext4_inode_info *ei = EXT4_I(inode);
63         struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
64         __u32 csum;
65
66         csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)eh,
67                            EXT4_EXTENT_TAIL_OFFSET(eh));
68         return cpu_to_le32(csum);
69 }
70
71 static int ext4_extent_block_csum_verify(struct inode *inode,
72                                          struct ext4_extent_header *eh)
73 {
74         struct ext4_extent_tail *et;
75
76         if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
77                 EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
78                 return 1;
79
80         et = find_ext4_extent_tail(eh);
81         if (et->et_checksum != ext4_extent_block_csum(inode, eh))
82                 return 0;
83         return 1;
84 }
85
86 static void ext4_extent_block_csum_set(struct inode *inode,
87                                        struct ext4_extent_header *eh)
88 {
89         struct ext4_extent_tail *et;
90
91         if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
92                 EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
93                 return;
94
95         et = find_ext4_extent_tail(eh);
96         et->et_checksum = ext4_extent_block_csum(inode, eh);
97 }
98
99 static int ext4_split_extent(handle_t *handle,
100                                 struct inode *inode,
101                                 struct ext4_ext_path **ppath,
102                                 struct ext4_map_blocks *map,
103                                 int split_flag,
104                                 int flags);
105
106 static int ext4_split_extent_at(handle_t *handle,
107                              struct inode *inode,
108                              struct ext4_ext_path **ppath,
109                              ext4_lblk_t split,
110                              int split_flag,
111                              int flags);
112
113 static int ext4_find_delayed_extent(struct inode *inode,
114                                     struct extent_status *newes);
115
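/*
 * Make sure 'handle' has at least 'needed' journal credits.  Returns 0
 * when the current credits suffice (or the handle could be extended in
 * place), -EAGAIN when the transaction had to be restarted so the caller
 * must redo its work, or a negative error code.
 */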
116 static int ext4_ext_truncate_extend_restart(handle_t *handle,
117                                             struct inode *inode,
118                                             int needed)
119 {
120         int err;
121
122         if (!ext4_handle_valid(handle))
123                 return 0;
124         if (handle->h_buffer_credits > needed)
125                 return 0;
126         err = ext4_journal_extend(handle, needed);
127         if (err <= 0)
128                 return err;
129         err = ext4_truncate_restart_trans(handle, inode, needed);
130         if (err == 0)
131                 err = -EAGAIN;
132
133         return err;
134 }
135
136 /*
137  * could return:
138  *  - EROFS
139  *  - ENOMEM
140  */
141 static int ext4_ext_get_access(handle_t *handle, struct inode *inode,
142                                 struct ext4_ext_path *path)
143 {
144         if (path->p_bh) {
145                 /* path points to block */
146                 BUFFER_TRACE(path->p_bh, "get_write_access");
147                 return ext4_journal_get_write_access(handle, path->p_bh);
148         }
149         /* path points to leaf/index in inode body */
150         /* we use in-core data, no need to protect them */
151         return 0;
152 }
153
154 /*
155  * could return:
156  *  - EROFS
157  *  - ENOMEM
158  *  - EIO
159  */
160 int __ext4_ext_dirty(const char *where, unsigned int line, handle_t *handle,
161                      struct inode *inode, struct ext4_ext_path *path)
162 {
163         int err;
164
165         WARN_ON(!rwsem_is_locked(&EXT4_I(inode)->i_data_sem));
166         if (path->p_bh) {
167                 ext4_extent_block_csum_set(inode, ext_block_hdr(path->p_bh));
168                 /* path points to block */
169                 err = __ext4_handle_dirty_metadata(where, line, handle,
170                                                    inode, path->p_bh);
171         } else {
172                 /* path points to leaf/index in inode body */
173                 err = ext4_mark_inode_dirty(handle, inode);
174         }
175         return err;
176 }
177
178 static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
179                               struct ext4_ext_path *path,
180                               ext4_lblk_t block)
181 {
182         if (path) {
183                 int depth = path->p_depth;
184                 struct ext4_extent *ex;
185
186                 /*
187                  * Try to predict block placement assuming that we are
188                  * filling in a file which will eventually be
189  * non-sparse --- i.e., in the case of libbfd writing
190  * an ELF object's sections out-of-order but in a way
191  * that eventually results in a contiguous object or
192                  * executable file, or some database extending a table
193                  * space file.  However, this is actually somewhat
194                  * non-ideal if we are writing a sparse file such as
195                  * qemu or KVM writing a raw image file that is going
196                  * to stay fairly sparse, since it will end up
197                  * fragmenting the file system's free space.  Maybe we
198  * should have some heuristics or some way to allow
199  * userspace to pass a hint to the file system,
200                  * especially if the latter case turns out to be
201                  * common.
202                  */
203                 ex = path[depth].p_ext;
204                 if (ex) {
205                         ext4_fsblk_t ext_pblk = ext4_ext_pblock(ex);
206                         ext4_lblk_t ext_block = le32_to_cpu(ex->ee_block);
207
208                         if (block > ext_block)
209                                 return ext_pblk + (block - ext_block);
210                         else
211                                 return ext_pblk - (ext_block - block);
212                 }
213
214                 /* it looks like the index is empty;
215                  * try to find the starting block from the index itself */
216                 if (path[depth].p_bh)
217                         return path[depth].p_bh->b_blocknr;
218         }
219
220         /* OK. use inode's group */
221         return ext4_inode_to_goal_block(inode);
222 }
223
224 /*
225  * Allocation for a metadata block
226  */
227 static ext4_fsblk_t
228 ext4_ext_new_meta_block(handle_t *handle, struct inode *inode,
229                         struct ext4_ext_path *path,
230                         struct ext4_extent *ex, int *err, unsigned int flags)
231 {
232         ext4_fsblk_t goal, newblock;
233
234         goal = ext4_ext_find_goal(inode, path, le32_to_cpu(ex->ee_block));
235         newblock = ext4_new_meta_blocks(handle, inode, goal, flags,
236                                         NULL, err);
237         return newblock;
238 }
239
240 static inline int ext4_ext_space_block(struct inode *inode, int check)
241 {
242         int size;
243
244         size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
245                         / sizeof(struct ext4_extent);
246 #ifdef AGGRESSIVE_TEST
247         if (!check && size > 6)
248                 size = 6;
249 #endif
250         return size;
251 }
252
253 static inline int ext4_ext_space_block_idx(struct inode *inode, int check)
254 {
255         int size;
256
257         size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
258                         / sizeof(struct ext4_extent_idx);
259 #ifdef AGGRESSIVE_TEST
260         if (!check && size > 5)
261                 size = 5;
262 #endif
263         return size;
264 }
265
266 static inline int ext4_ext_space_root(struct inode *inode, int check)
267 {
268         int size;
269
270         size = sizeof(EXT4_I(inode)->i_data);
271         size -= sizeof(struct ext4_extent_header);
272         size /= sizeof(struct ext4_extent);
273 #ifdef AGGRESSIVE_TEST
274         if (!check && size > 3)
275                 size = 3;
276 #endif
277         return size;
278 }
279
280 static inline int ext4_ext_space_root_idx(struct inode *inode, int check)
281 {
282         int size;
283
284         size = sizeof(EXT4_I(inode)->i_data);
285         size -= sizeof(struct ext4_extent_header);
286         size /= sizeof(struct ext4_extent_idx);
287 #ifdef AGGRESSIVE_TEST
288         if (!check && size > 4)
289                 size = 4;
290 #endif
291         return size;
292 }
293
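/*
 * A worked example of the capacities above, assuming a 4k block size:
 * the header, an extent and an index entry are 12 bytes each, and the
 * in-inode i_data array is 60 bytes, so the root holds
 * (60 - 12) / 12 = 4 entries while an external block holds
 * (4096 - 12) / 12 = 340 entries (the 4 leftover bytes are exactly
 * enough for the metadata_csum extent tail).
 */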
294 static inline int
295 ext4_force_split_extent_at(handle_t *handle, struct inode *inode,
296                            struct ext4_ext_path **ppath, ext4_lblk_t lblk,
297                            int nofail)
298 {
299         struct ext4_ext_path *path = *ppath;
300         int unwritten = ext4_ext_is_unwritten(path[path->p_depth].p_ext);
301
302         return ext4_split_extent_at(handle, inode, ppath, lblk, unwritten ?
303                         EXT4_EXT_MARK_UNWRIT1|EXT4_EXT_MARK_UNWRIT2 : 0,
304                         EXT4_EX_NOCACHE | EXT4_GET_BLOCKS_PRE_IO |
305                         (nofail ? EXT4_GET_BLOCKS_METADATA_NOFAIL:0));
306 }
307
308 /*
309  * Calculate the number of metadata blocks needed
310  * to allocate the block at @lblock
311  * Worst case is one block per extent
312  */
313 int ext4_ext_calc_metadata_amount(struct inode *inode, ext4_lblk_t lblock)
314 {
315         struct ext4_inode_info *ei = EXT4_I(inode);
316         int idxs;
317
318         idxs = ((inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
319                 / sizeof(struct ext4_extent_idx));
320
321         /*
322          * If the new delayed allocation block is contiguous with the
323          * previous da block, it can share index blocks with the
324          * previous block, so we only need to allocate a new index
325  * block every idxs leaf blocks.  At idxs**2 blocks, we need
326  * an additional index block, and at idxs**3 blocks, yet
327  * another index block.
328          */
329         if (ei->i_da_metadata_calc_len &&
330             ei->i_da_metadata_calc_last_lblock+1 == lblock) {
331                 int num = 0;
332
333                 if ((ei->i_da_metadata_calc_len % idxs) == 0)
334                         num++;
335                 if ((ei->i_da_metadata_calc_len % (idxs*idxs)) == 0)
336                         num++;
337                 if ((ei->i_da_metadata_calc_len % (idxs*idxs*idxs)) == 0) {
338                         num++;
339                         ei->i_da_metadata_calc_len = 0;
340                 } else
341                         ei->i_da_metadata_calc_len++;
342                 ei->i_da_metadata_calc_last_lblock++;
343                 return num;
344         }
345
346         /*
347          * In the worst case we need a new set of index blocks at
348          * every level of the inode's extent tree.
349          */
350         ei->i_da_metadata_calc_len = 1;
351         ei->i_da_metadata_calc_last_lblock = lblock;
352         return ext_depth(inode) + 1;
353 }
354
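/*
 * Worked example, with 4k blocks (idxs = (4096 - 12) / 12 = 340): for an
 * append-only delalloc writer, one extra metadata block is reserved
 * every 340 contiguous blocks, another every 340^2 blocks, and yet
 * another every 340^3 blocks, mirroring how often each level of the
 * tree gains a block in the worst case.  The first block of a
 * non-contiguous run is instead charged ext_depth(inode) + 1 blocks,
 * a full new path from the root.
 */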
355 static int
356 ext4_ext_max_entries(struct inode *inode, int depth)
357 {
358         int max;
359
360         if (depth == ext_depth(inode)) {
361                 if (depth == 0)
362                         max = ext4_ext_space_root(inode, 1);
363                 else
364                         max = ext4_ext_space_root_idx(inode, 1);
365         } else {
366                 if (depth == 0)
367                         max = ext4_ext_space_block(inode, 1);
368                 else
369                         max = ext4_ext_space_block_idx(inode, 1);
370         }
371
372         return max;
373 }
374
375 static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext)
376 {
377         ext4_fsblk_t block = ext4_ext_pblock(ext);
378         int len = ext4_ext_get_actual_len(ext);
379         ext4_lblk_t lblock = le32_to_cpu(ext->ee_block);
380         ext4_lblk_t last = lblock + len - 1;
381
382         if (lblock > last)
383                 return 0;
384         return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, len);
385 }
386
387 static int ext4_valid_extent_idx(struct inode *inode,
388                                 struct ext4_extent_idx *ext_idx)
389 {
390         ext4_fsblk_t block = ext4_idx_pblock(ext_idx);
391
392         return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, 1);
393 }
394
395 static int ext4_valid_extent_entries(struct inode *inode,
396                                 struct ext4_extent_header *eh,
397                                 int depth)
398 {
399         unsigned short entries;
400         if (eh->eh_entries == 0)
401                 return 1;
402
403         entries = le16_to_cpu(eh->eh_entries);
404
405         if (depth == 0) {
406                 /* leaf entries */
407                 struct ext4_extent *ext = EXT_FIRST_EXTENT(eh);
408                 struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;
409                 ext4_fsblk_t pblock = 0;
410                 ext4_lblk_t lblock = 0;
411                 ext4_lblk_t prev = 0;
412                 int len = 0;
413                 while (entries) {
414                         if (!ext4_valid_extent(inode, ext))
415                                 return 0;
416
417                         /* Check for overlapping extents */
418                         lblock = le32_to_cpu(ext->ee_block);
419                         len = ext4_ext_get_actual_len(ext);
420                         if ((lblock <= prev) && prev) {
421                                 pblock = ext4_ext_pblock(ext);
422                                 es->s_last_error_block = cpu_to_le64(pblock);
423                                 return 0;
424                         }
425                         ext++;
426                         entries--;
427                         prev = lblock + len - 1;
428                 }
429         } else {
430                 struct ext4_extent_idx *ext_idx = EXT_FIRST_INDEX(eh);
431                 while (entries) {
432                         if (!ext4_valid_extent_idx(inode, ext_idx))
433                                 return 0;
434                         ext_idx++;
435                         entries--;
436                 }
437         }
438         return 1;
439 }
440
441 static int __ext4_ext_check(const char *function, unsigned int line,
442                             struct inode *inode, struct ext4_extent_header *eh,
443                             int depth, ext4_fsblk_t pblk)
444 {
445         const char *error_msg;
446         int max = 0;
447
448         if (unlikely(eh->eh_magic != EXT4_EXT_MAGIC)) {
449                 error_msg = "invalid magic";
450                 goto corrupted;
451         }
452         if (unlikely(le16_to_cpu(eh->eh_depth) != depth)) {
453                 error_msg = "unexpected eh_depth";
454                 goto corrupted;
455         }
456         if (unlikely(eh->eh_max == 0)) {
457                 error_msg = "invalid eh_max";
458                 goto corrupted;
459         }
460         max = ext4_ext_max_entries(inode, depth);
461         if (unlikely(le16_to_cpu(eh->eh_max) > max)) {
462                 error_msg = "too large eh_max";
463                 goto corrupted;
464         }
465         if (unlikely(le16_to_cpu(eh->eh_entries) > le16_to_cpu(eh->eh_max))) {
466                 error_msg = "invalid eh_entries";
467                 goto corrupted;
468         }
469         if (!ext4_valid_extent_entries(inode, eh, depth)) {
470                 error_msg = "invalid extent entries";
471                 goto corrupted;
472         }
473         /* Verify checksum on non-root extent tree nodes */
474         if (ext_depth(inode) != depth &&
475             !ext4_extent_block_csum_verify(inode, eh)) {
476                 error_msg = "extent tree corrupted";
477                 goto corrupted;
478         }
479         return 0;
480
481 corrupted:
482         ext4_error_inode(inode, function, line, 0,
483                          "pblk %llu bad header/extent: %s - magic %x, "
484                          "entries %u, max %u(%u), depth %u(%u)",
485                          (unsigned long long) pblk, error_msg,
486                          le16_to_cpu(eh->eh_magic),
487                          le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max),
488                          max, le16_to_cpu(eh->eh_depth), depth);
489         return -EIO;
490 }
491
492 #define ext4_ext_check(inode, eh, depth, pblk)                  \
493         __ext4_ext_check(__func__, __LINE__, (inode), (eh), (depth), (pblk))
494
495 int ext4_ext_check_inode(struct inode *inode)
496 {
497         return ext4_ext_check(inode, ext_inode_hdr(inode), ext_depth(inode), 0);
498 }
499
500 static struct buffer_head *
501 __read_extent_tree_block(const char *function, unsigned int line,
502                          struct inode *inode, ext4_fsblk_t pblk, int depth,
503                          int flags)
504 {
505         struct buffer_head              *bh;
506         int                             err;
507
508         bh = sb_getblk(inode->i_sb, pblk);
509         if (unlikely(!bh))
510                 return ERR_PTR(-ENOMEM);
511
512         if (!bh_uptodate_or_lock(bh)) {
513                 trace_ext4_ext_load_extent(inode, pblk, _RET_IP_);
514                 err = bh_submit_read(bh);
515                 if (err < 0)
516                         goto errout;
517         }
518         if (buffer_verified(bh) && !(flags & EXT4_EX_FORCE_CACHE))
519                 return bh;
520         err = __ext4_ext_check(function, line, inode,
521                                ext_block_hdr(bh), depth, pblk);
522         if (err)
523                 goto errout;
524         set_buffer_verified(bh);
525         /*
526          * If this is a leaf block, cache all of its entries
527          */
528         if (!(flags & EXT4_EX_NOCACHE) && depth == 0) {
529                 struct ext4_extent_header *eh = ext_block_hdr(bh);
530                 struct ext4_extent *ex = EXT_FIRST_EXTENT(eh);
531                 ext4_lblk_t prev = 0;
532                 int i;
533
534                 for (i = le16_to_cpu(eh->eh_entries); i > 0; i--, ex++) {
535                         unsigned int status = EXTENT_STATUS_WRITTEN;
536                         ext4_lblk_t lblk = le32_to_cpu(ex->ee_block);
537                         int len = ext4_ext_get_actual_len(ex);
538
539                         if (prev && (prev != lblk))
540                                 ext4_es_cache_extent(inode, prev,
541                                                      lblk - prev, ~0,
542                                                      EXTENT_STATUS_HOLE);
543
544                         if (ext4_ext_is_unwritten(ex))
545                                 status = EXTENT_STATUS_UNWRITTEN;
546                         ext4_es_cache_extent(inode, lblk, len,
547                                              ext4_ext_pblock(ex), status);
548                         prev = lblk + len;
549                 }
550         }
551         return bh;
552 errout:
553         put_bh(bh);
554         return ERR_PTR(err);
555
556 }
557
558 #define read_extent_tree_block(inode, pblk, depth, flags)               \
559         __read_extent_tree_block(__func__, __LINE__, (inode), (pblk),   \
560                                  (depth), (flags))
561
562 /*
563  * This function is called to cache a file's extent information in the
564  * extent status tree
565  */
566 int ext4_ext_precache(struct inode *inode)
567 {
568         struct ext4_inode_info *ei = EXT4_I(inode);
569         struct ext4_ext_path *path = NULL;
570         struct buffer_head *bh;
571         int i = 0, depth, ret = 0;
572
573         if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
574                 return 0;       /* not an extent-mapped inode */
575
576         down_read(&ei->i_data_sem);
577         depth = ext_depth(inode);
578
579         path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 1),
580                        GFP_NOFS);
581         if (path == NULL) {
582                 up_read(&ei->i_data_sem);
583                 return -ENOMEM;
584         }
585
586         /* Don't cache anything if there are no external extent blocks */
587         if (depth == 0)
588                 goto out;
589         path[0].p_hdr = ext_inode_hdr(inode);
590         ret = ext4_ext_check(inode, path[0].p_hdr, depth, 0);
591         if (ret)
592                 goto out;
593         path[0].p_idx = EXT_FIRST_INDEX(path[0].p_hdr);
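        /*
         * Iterative depth-first walk: path[] doubles as an explicit
         * stack and i is the current level (0 == root).  Each block is
         * read with EXT4_EX_FORCE_CACHE so that leaf entries end up in
         * the extent status tree even if the buffer was already
         * verified.
         */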
594         while (i >= 0) {
595                 /*
596                  * If this is a leaf block or we've reached the end of
597                  * the index block, go up
598                  */
599                 if ((i == depth) ||
600                     path[i].p_idx > EXT_LAST_INDEX(path[i].p_hdr)) {
601                         brelse(path[i].p_bh);
602                         path[i].p_bh = NULL;
603                         i--;
604                         continue;
605                 }
606                 bh = read_extent_tree_block(inode,
607                                             ext4_idx_pblock(path[i].p_idx++),
608                                             depth - i - 1,
609                                             EXT4_EX_FORCE_CACHE);
610                 if (IS_ERR(bh)) {
611                         ret = PTR_ERR(bh);
612                         break;
613                 }
614                 i++;
615                 path[i].p_bh = bh;
616                 path[i].p_hdr = ext_block_hdr(bh);
617                 path[i].p_idx = EXT_FIRST_INDEX(path[i].p_hdr);
618         }
619         ext4_set_inode_state(inode, EXT4_STATE_EXT_PRECACHED);
620 out:
621         up_read(&ei->i_data_sem);
622         ext4_ext_drop_refs(path);
623         kfree(path);
624         return ret;
625 }
626
627 #ifdef EXT_DEBUG
628 static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path)
629 {
630         int k, l = path->p_depth;
631
632         ext_debug("path:");
633         for (k = 0; k <= l; k++, path++) {
634                 if (path->p_idx) {
635                   ext_debug("  %d->%llu", le32_to_cpu(path->p_idx->ei_block),
636                             ext4_idx_pblock(path->p_idx));
637                 } else if (path->p_ext) {
638                         ext_debug("  %d:[%d]%d:%llu ",
639                                   le32_to_cpu(path->p_ext->ee_block),
640                                   ext4_ext_is_unwritten(path->p_ext),
641                                   ext4_ext_get_actual_len(path->p_ext),
642                                   ext4_ext_pblock(path->p_ext));
643                 } else
644                         ext_debug("  []");
645         }
646         ext_debug("\n");
647 }
648
649 static void ext4_ext_show_leaf(struct inode *inode, struct ext4_ext_path *path)
650 {
651         int depth = ext_depth(inode);
652         struct ext4_extent_header *eh;
653         struct ext4_extent *ex;
654         int i;
655
656         if (!path)
657                 return;
658
659         eh = path[depth].p_hdr;
660         ex = EXT_FIRST_EXTENT(eh);
661
662         ext_debug("Displaying leaf extents for inode %lu\n", inode->i_ino);
663
664         for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ex++) {
665                 ext_debug("%d:[%d]%d:%llu ", le32_to_cpu(ex->ee_block),
666                           ext4_ext_is_unwritten(ex),
667                           ext4_ext_get_actual_len(ex), ext4_ext_pblock(ex));
668         }
669         ext_debug("\n");
670 }
671
672 static void ext4_ext_show_move(struct inode *inode, struct ext4_ext_path *path,
673                         ext4_fsblk_t newblock, int level)
674 {
675         int depth = ext_depth(inode);
676         struct ext4_extent *ex;
677
678         if (depth != level) {
679                 struct ext4_extent_idx *idx;
680                 idx = path[level].p_idx;
681                 while (idx <= EXT_MAX_INDEX(path[level].p_hdr)) {
682                         ext_debug("%d: move %d:%llu in new index %llu\n", level,
683                                         le32_to_cpu(idx->ei_block),
684                                         ext4_idx_pblock(idx),
685                                         newblock);
686                         idx++;
687                 }
688
689                 return;
690         }
691
692         ex = path[depth].p_ext;
693         while (ex <= EXT_MAX_EXTENT(path[depth].p_hdr)) {
694                 ext_debug("move %d:%llu:[%d]%d in new leaf %llu\n",
695                                 le32_to_cpu(ex->ee_block),
696                                 ext4_ext_pblock(ex),
697                                 ext4_ext_is_unwritten(ex),
698                                 ext4_ext_get_actual_len(ex),
699                                 newblock);
700                 ex++;
701         }
702 }
703
704 #else
705 #define ext4_ext_show_path(inode, path)
706 #define ext4_ext_show_leaf(inode, path)
707 #define ext4_ext_show_move(inode, path, newblock, level)
708 #endif
709
710 void ext4_ext_drop_refs(struct ext4_ext_path *path)
711 {
712         int depth, i;
713
714         if (!path)
715                 return;
716         depth = path->p_depth;
717         for (i = 0; i <= depth; i++, path++)
718                 if (path->p_bh) {
719                         brelse(path->p_bh);
720                         path->p_bh = NULL;
721                 }
722 }
723
724 /*
725  * ext4_ext_binsearch_idx:
726  * binary search for the closest index of the given block
727  * the header must be checked before calling this
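 *
 * e.g., with index entries at logical blocks {0, 100, 200} and a
 * lookup for block 150, p_idx ends up at the entry for block 100:
 * the last index whose ei_block is <= the target, i.e. the subtree
 * that can contain it.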
728  */
729 static void
730 ext4_ext_binsearch_idx(struct inode *inode,
731                         struct ext4_ext_path *path, ext4_lblk_t block)
732 {
733         struct ext4_extent_header *eh = path->p_hdr;
734         struct ext4_extent_idx *r, *l, *m;
735
736
737         ext_debug("binsearch for %u(idx):  ", block);
738
739         l = EXT_FIRST_INDEX(eh) + 1;
740         r = EXT_LAST_INDEX(eh);
741         while (l <= r) {
742                 m = l + (r - l) / 2;
743                 if (block < le32_to_cpu(m->ei_block))
744                         r = m - 1;
745                 else
746                         l = m + 1;
747                 ext_debug("%p(%u):%p(%u):%p(%u) ", l, le32_to_cpu(l->ei_block),
748                                 m, le32_to_cpu(m->ei_block),
749                                 r, le32_to_cpu(r->ei_block));
750         }
751
752         path->p_idx = l - 1;
753         ext_debug("  -> %u->%lld ", le32_to_cpu(path->p_idx->ei_block),
754                   ext4_idx_pblock(path->p_idx));
755
756 #ifdef CHECK_BINSEARCH
757         {
758                 struct ext4_extent_idx *chix, *ix;
759                 int k;
760
761                 chix = ix = EXT_FIRST_INDEX(eh);
762                 for (k = 0; k < le16_to_cpu(eh->eh_entries); k++, ix++) {
763                   if (k != 0 &&
764                       le32_to_cpu(ix->ei_block) <= le32_to_cpu(ix[-1].ei_block)) {
765                                 printk(KERN_DEBUG "k=%d, ix=0x%p, "
766                                        "first=0x%p\n", k,
767                                        ix, EXT_FIRST_INDEX(eh));
768                                 printk(KERN_DEBUG "%u <= %u\n",
769                                        le32_to_cpu(ix->ei_block),
770                                        le32_to_cpu(ix[-1].ei_block));
771                         }
772                         BUG_ON(k && le32_to_cpu(ix->ei_block)
773                                            <= le32_to_cpu(ix[-1].ei_block));
774                         if (block < le32_to_cpu(ix->ei_block))
775                                 break;
776                         chix = ix;
777                 }
778                 BUG_ON(chix != path->p_idx);
779         }
780 #endif
781
782 }
783
784 /*
785  * ext4_ext_binsearch:
786  * binary search for closest extent of the given block
787  * the header must be checked before calling this
788  */
789 static void
790 ext4_ext_binsearch(struct inode *inode,
791                 struct ext4_ext_path *path, ext4_lblk_t block)
792 {
793         struct ext4_extent_header *eh = path->p_hdr;
794         struct ext4_extent *r, *l, *m;
795
796         if (eh->eh_entries == 0) {
797                 /*
798                  * this leaf is empty:
799                  * we get such a leaf in split/add case
800                  */
801                 return;
802         }
803
804         ext_debug("binsearch for %u:  ", block);
805
806         l = EXT_FIRST_EXTENT(eh) + 1;
807         r = EXT_LAST_EXTENT(eh);
808
809         while (l <= r) {
810                 m = l + (r - l) / 2;
811                 if (block < le32_to_cpu(m->ee_block))
812                         r = m - 1;
813                 else
814                         l = m + 1;
815                 ext_debug("%p(%u):%p(%u):%p(%u) ", l, le32_to_cpu(l->ee_block),
816                                 m, le32_to_cpu(m->ee_block),
817                                 r, le32_to_cpu(r->ee_block));
818         }
819
820         path->p_ext = l - 1;
821         ext_debug("  -> %d:%llu:[%d]%d ",
822                         le32_to_cpu(path->p_ext->ee_block),
823                         ext4_ext_pblock(path->p_ext),
824                         ext4_ext_is_unwritten(path->p_ext),
825                         ext4_ext_get_actual_len(path->p_ext));
826
827 #ifdef CHECK_BINSEARCH
828         {
829                 struct ext4_extent *chex, *ex;
830                 int k;
831
832                 chex = ex = EXT_FIRST_EXTENT(eh);
833                 for (k = 0; k < le16_to_cpu(eh->eh_entries); k++, ex++) {
834                         BUG_ON(k && le32_to_cpu(ex->ee_block)
835                                           <= le32_to_cpu(ex[-1].ee_block));
836                         if (block < le32_to_cpu(ex->ee_block))
837                                 break;
838                         chex = ex;
839                 }
840                 BUG_ON(chex != path->p_ext);
841         }
842 #endif
843
844 }
845
846 int ext4_ext_tree_init(handle_t *handle, struct inode *inode)
847 {
848         struct ext4_extent_header *eh;
849
850         eh = ext_inode_hdr(inode);
851         eh->eh_depth = 0;
852         eh->eh_entries = 0;
853         eh->eh_magic = EXT4_EXT_MAGIC;
854         eh->eh_max = cpu_to_le16(ext4_ext_space_root(inode, 0));
855         ext4_mark_inode_dirty(handle, inode);
856         return 0;
857 }
858
859 struct ext4_ext_path *
860 ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
861                      struct ext4_ext_path **orig_path, int flags)
862 {
863         struct ext4_extent_header *eh;
864         struct buffer_head *bh;
865         struct ext4_ext_path *path = orig_path ? *orig_path : NULL;
866         short int depth, i, ppos = 0;
867         int ret;
868
869         eh = ext_inode_hdr(inode);
870         depth = ext_depth(inode);
871
872         if (path) {
873                 ext4_ext_drop_refs(path);
874                 if (depth > path[0].p_maxdepth) {
875                         kfree(path);
876                         *orig_path = path = NULL;
877                 }
878         }
879         if (!path) {
880                 /* account possible depth increase */
881                 path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 2),
882                                 GFP_NOFS);
883                 if (unlikely(!path))
884                         return ERR_PTR(-ENOMEM);
885                 path[0].p_maxdepth = depth + 1;
886         }
887         path[0].p_hdr = eh;
888         path[0].p_bh = NULL;
889
890         i = depth;
891         /* walk through the tree */
892         while (i) {
893                 ext_debug("depth %d: num %d, max %d\n",
894                           ppos, le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max));
895
896                 ext4_ext_binsearch_idx(inode, path + ppos, block);
897                 path[ppos].p_block = ext4_idx_pblock(path[ppos].p_idx);
898                 path[ppos].p_depth = i;
899                 path[ppos].p_ext = NULL;
900
901                 bh = read_extent_tree_block(inode, path[ppos].p_block, --i,
902                                             flags);
903                 if (unlikely(IS_ERR(bh))) {
904                         ret = PTR_ERR(bh);
905                         goto err;
906                 }
907
908                 eh = ext_block_hdr(bh);
909                 ppos++;
910                 if (unlikely(ppos > depth)) {
911                         put_bh(bh);
912                         EXT4_ERROR_INODE(inode,
913                                          "ppos %d > depth %d", ppos, depth);
914                         ret = -EIO;
915                         goto err;
916                 }
917                 path[ppos].p_bh = bh;
918                 path[ppos].p_hdr = eh;
919         }
920
921         path[ppos].p_depth = i;
922         path[ppos].p_ext = NULL;
923         path[ppos].p_idx = NULL;
924
925         /* find extent */
926         ext4_ext_binsearch(inode, path + ppos, block);
927         /* if not an empty leaf */
928         if (path[ppos].p_ext)
929                 path[ppos].p_block = ext4_ext_pblock(path[ppos].p_ext);
930
931         ext4_ext_show_path(inode, path);
932
933         return path;
934
935 err:
936         ext4_ext_drop_refs(path);
937         kfree(path);
938         if (orig_path)
939                 *orig_path = NULL;
940         return ERR_PTR(ret);
941 }
942
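/*
 * Usage sketch (hypothetical caller, modelled on ext4_ext_create_new_leaf
 * below): a path array can be reused across calls via the double pointer,
 * and if the tree has grown so that depth > path[0].p_maxdepth, the old
 * array is freed and a larger one allocated transparently:
 *
 *	struct ext4_ext_path *path = NULL;
 *
 *	path = ext4_ext_find_extent(inode, lblk, &path, 0);
 *	if (IS_ERR(path))
 *		return PTR_ERR(path);
 *	...				// the tree may grow in depth here
 *	path = ext4_ext_find_extent(inode, lblk, &path, 0);
 *	if (IS_ERR(path))
 *		return PTR_ERR(path);	// *orig_path was set to NULL
 *	...
 *	ext4_ext_drop_refs(path);
 *	kfree(path);
 */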
943 /*
944  * ext4_ext_insert_index:
945  * insert new index [@logical;@ptr] into the block at @curp;
946  * check where to insert: before @curp or after @curp
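 *
 * e.g. inserting logical block 300 into a node whose entries start at
 * {0, 100, 200} appends after the 200 entry and moves nothing, while
 * inserting 150 first memmove()s the 200 entry one slot to the right.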
947  */
948 static int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
949                                  struct ext4_ext_path *curp,
950                                  int logical, ext4_fsblk_t ptr)
951 {
952         struct ext4_extent_idx *ix;
953         int len, err;
954
955         err = ext4_ext_get_access(handle, inode, curp);
956         if (err)
957                 return err;
958
959         if (unlikely(logical == le32_to_cpu(curp->p_idx->ei_block))) {
960                 EXT4_ERROR_INODE(inode,
961                                  "logical %d == ei_block %d!",
962                                  logical, le32_to_cpu(curp->p_idx->ei_block));
963                 return -EIO;
964         }
965
966         if (unlikely(le16_to_cpu(curp->p_hdr->eh_entries)
967                              >= le16_to_cpu(curp->p_hdr->eh_max))) {
968                 EXT4_ERROR_INODE(inode,
969                                  "eh_entries %d >= eh_max %d!",
970                                  le16_to_cpu(curp->p_hdr->eh_entries),
971                                  le16_to_cpu(curp->p_hdr->eh_max));
972                 return -EIO;
973         }
974
975         if (logical > le32_to_cpu(curp->p_idx->ei_block)) {
976                 /* insert after */
977                 ext_debug("insert new index %d after: %llu\n", logical, ptr);
978                 ix = curp->p_idx + 1;
979         } else {
980                 /* insert before */
981                 ext_debug("insert new index %d before: %llu\n", logical, ptr);
982                 ix = curp->p_idx;
983         }
984
985         len = EXT_LAST_INDEX(curp->p_hdr) - ix + 1;
986         BUG_ON(len < 0);
987         if (len > 0) {
988                 ext_debug("insert new index %d: "
989                                 "move %d indices from 0x%p to 0x%p\n",
990                                 logical, len, ix, ix + 1);
991                 memmove(ix + 1, ix, len * sizeof(struct ext4_extent_idx));
992         }
993
994         if (unlikely(ix > EXT_MAX_INDEX(curp->p_hdr))) {
995                 EXT4_ERROR_INODE(inode, "ix > EXT_MAX_INDEX!");
996                 return -EIO;
997         }
998
999         ix->ei_block = cpu_to_le32(logical);
1000         ext4_idx_store_pblock(ix, ptr);
1001         le16_add_cpu(&curp->p_hdr->eh_entries, 1);
1002
1003         if (unlikely(ix > EXT_LAST_INDEX(curp->p_hdr))) {
1004                 EXT4_ERROR_INODE(inode, "ix > EXT_LAST_INDEX!");
1005                 return -EIO;
1006         }
1007
1008         err = ext4_ext_dirty(handle, inode, curp);
1009         ext4_std_error(inode->i_sb, err);
1010
1011         return err;
1012 }
1013
1014 /*
1015  * ext4_ext_split:
1016  * inserts new subtree into the path, using free index entry
1017  * at depth @at:
1018  * - allocates all needed blocks (new leaf and all intermediate index blocks)
1019  * - makes decision where to split
1020  * - moves remaining extents and index entries (right to the split point)
1021  *   into the newly allocated blocks
1022  * - initializes subtree
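 *
 * e.g. splitting a depth-2 tree at @at == 1 allocates depth - at == 1
 * new block (the new leaf) and records it with one new entry in the
 * level-1 index block; @at == 0 would also allocate a fresh
 * intermediate index block between the root and the new leaf.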
1023  */
1024 static int ext4_ext_split(handle_t *handle, struct inode *inode,
1025                           unsigned int flags,
1026                           struct ext4_ext_path *path,
1027                           struct ext4_extent *newext, int at)
1028 {
1029         struct buffer_head *bh = NULL;
1030         int depth = ext_depth(inode);
1031         struct ext4_extent_header *neh;
1032         struct ext4_extent_idx *fidx;
1033         int i = at, k, m, a;
1034         ext4_fsblk_t newblock, oldblock;
1035         __le32 border;
1036         ext4_fsblk_t *ablocks = NULL; /* array of allocated blocks */
1037         int err = 0;
1038
1039         /* make decision: where to split? */
1040         /* FIXME: now decision is simplest: at current extent */
1041
1042         /* if current leaf will be split, then we should use
1043          * border from split point */
1044         if (unlikely(path[depth].p_ext > EXT_MAX_EXTENT(path[depth].p_hdr))) {
1045                 EXT4_ERROR_INODE(inode, "p_ext > EXT_MAX_EXTENT!");
1046                 return -EIO;
1047         }
1048         if (path[depth].p_ext != EXT_MAX_EXTENT(path[depth].p_hdr)) {
1049                 border = path[depth].p_ext[1].ee_block;
1050                 ext_debug("leaf will be split."
1051                                 " next leaf starts at %d\n",
1052                                   le32_to_cpu(border));
1053         } else {
1054                 border = newext->ee_block;
1055                 ext_debug("leaf will be added."
1056                                 " next leaf starts at %d\n",
1057                                 le32_to_cpu(border));
1058         }
1059
1060         /*
1061          * If an error occurs, we break off processing
1062          * and mark the filesystem read-only.  The index won't
1063          * be inserted and the tree will remain in a consistent
1064          * state.  The next mount will repair the buffers too.
1065          */
1066
1067         /*
1068          * Get array to track all allocated blocks.
1069          * We need this to handle errors and free the blocks
1070          * on failure.
1071          */
1072         ablocks = kzalloc(sizeof(ext4_fsblk_t) * depth, GFP_NOFS);
1073         if (!ablocks)
1074                 return -ENOMEM;
1075
1076         /* allocate all needed blocks */
1077         ext_debug("allocate %d blocks for indexes/leaf\n", depth - at);
1078         for (a = 0; a < depth - at; a++) {
1079                 newblock = ext4_ext_new_meta_block(handle, inode, path,
1080                                                    newext, &err, flags);
1081                 if (newblock == 0)
1082                         goto cleanup;
1083                 ablocks[a] = newblock;
1084         }
1085
1086         /* initialize new leaf */
1087         newblock = ablocks[--a];
1088         if (unlikely(newblock == 0)) {
1089                 EXT4_ERROR_INODE(inode, "newblock == 0!");
1090                 err = -EIO;
1091                 goto cleanup;
1092         }
1093         bh = sb_getblk(inode->i_sb, newblock);
1094         if (unlikely(!bh)) {
1095                 err = -ENOMEM;
1096                 goto cleanup;
1097         }
1098         lock_buffer(bh);
1099
1100         err = ext4_journal_get_create_access(handle, bh);
1101         if (err)
1102                 goto cleanup;
1103
1104         neh = ext_block_hdr(bh);
1105         neh->eh_entries = 0;
1106         neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0));
1107         neh->eh_magic = EXT4_EXT_MAGIC;
1108         neh->eh_depth = 0;
1109
1110         /* move remainder of path[depth] to the new leaf */
1111         if (unlikely(path[depth].p_hdr->eh_entries !=
1112                      path[depth].p_hdr->eh_max)) {
1113                 EXT4_ERROR_INODE(inode, "eh_entries %d != eh_max %d!",
1114                                  path[depth].p_hdr->eh_entries,
1115                                  path[depth].p_hdr->eh_max);
1116                 err = -EIO;
1117                 goto cleanup;
1118         }
1119         /* start copy from next extent */
1120         m = EXT_MAX_EXTENT(path[depth].p_hdr) - path[depth].p_ext++;
1121         ext4_ext_show_move(inode, path, newblock, depth);
1122         if (m) {
1123                 struct ext4_extent *ex;
1124                 ex = EXT_FIRST_EXTENT(neh);
1125                 memmove(ex, path[depth].p_ext, sizeof(struct ext4_extent) * m);
1126                 le16_add_cpu(&neh->eh_entries, m);
1127         }
1128
1129         ext4_extent_block_csum_set(inode, neh);
1130         set_buffer_uptodate(bh);
1131         unlock_buffer(bh);
1132
1133         err = ext4_handle_dirty_metadata(handle, inode, bh);
1134         if (err)
1135                 goto cleanup;
1136         brelse(bh);
1137         bh = NULL;
1138
1139         /* correct old leaf */
1140         if (m) {
1141                 err = ext4_ext_get_access(handle, inode, path + depth);
1142                 if (err)
1143                         goto cleanup;
1144                 le16_add_cpu(&path[depth].p_hdr->eh_entries, -m);
1145                 err = ext4_ext_dirty(handle, inode, path + depth);
1146                 if (err)
1147                         goto cleanup;
1148
1149         }
1150
1151         /* create intermediate indexes */
1152         k = depth - at - 1;
1153         if (unlikely(k < 0)) {
1154                 EXT4_ERROR_INODE(inode, "k %d < 0!", k);
1155                 err = -EIO;
1156                 goto cleanup;
1157         }
1158         if (k)
1159                 ext_debug("create %d intermediate indices\n", k);
1160         /* insert new index into current index block */
1161         /* current depth stored in i var */
1162         i = depth - 1;
1163         while (k--) {
1164                 oldblock = newblock;
1165                 newblock = ablocks[--a];
1166                 bh = sb_getblk(inode->i_sb, newblock);
1167                 if (unlikely(!bh)) {
1168                         err = -ENOMEM;
1169                         goto cleanup;
1170                 }
1171                 lock_buffer(bh);
1172
1173                 err = ext4_journal_get_create_access(handle, bh);
1174                 if (err)
1175                         goto cleanup;
1176
1177                 neh = ext_block_hdr(bh);
1178                 neh->eh_entries = cpu_to_le16(1);
1179                 neh->eh_magic = EXT4_EXT_MAGIC;
1180                 neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode, 0));
1181                 neh->eh_depth = cpu_to_le16(depth - i);
1182                 fidx = EXT_FIRST_INDEX(neh);
1183                 fidx->ei_block = border;
1184                 ext4_idx_store_pblock(fidx, oldblock);
1185
1186                 ext_debug("int.index at %d (block %llu): %u -> %llu\n",
1187                                 i, newblock, le32_to_cpu(border), oldblock);
1188
1189                 /* move remainder of path[i] to the new index block */
1190                 if (unlikely(EXT_MAX_INDEX(path[i].p_hdr) !=
1191                                         EXT_LAST_INDEX(path[i].p_hdr))) {
1192                         EXT4_ERROR_INODE(inode,
1193                                          "EXT_MAX_INDEX != EXT_LAST_INDEX ee_block %d!",
1194                                          le32_to_cpu(path[i].p_ext->ee_block));
1195                         err = -EIO;
1196                         goto cleanup;
1197                 }
1198                 /* start copy indexes */
1199                 m = EXT_MAX_INDEX(path[i].p_hdr) - path[i].p_idx++;
1200                 ext_debug("cur 0x%p, last 0x%p\n", path[i].p_idx,
1201                                 EXT_MAX_INDEX(path[i].p_hdr));
1202                 ext4_ext_show_move(inode, path, newblock, i);
1203                 if (m) {
1204                         memmove(++fidx, path[i].p_idx,
1205                                 sizeof(struct ext4_extent_idx) * m);
1206                         le16_add_cpu(&neh->eh_entries, m);
1207                 }
1208                 ext4_extent_block_csum_set(inode, neh);
1209                 set_buffer_uptodate(bh);
1210                 unlock_buffer(bh);
1211
1212                 err = ext4_handle_dirty_metadata(handle, inode, bh);
1213                 if (err)
1214                         goto cleanup;
1215                 brelse(bh);
1216                 bh = NULL;
1217
1218                 /* correct old index */
1219                 if (m) {
1220                         err = ext4_ext_get_access(handle, inode, path + i);
1221                         if (err)
1222                                 goto cleanup;
1223                         le16_add_cpu(&path[i].p_hdr->eh_entries, -m);
1224                         err = ext4_ext_dirty(handle, inode, path + i);
1225                         if (err)
1226                                 goto cleanup;
1227                 }
1228
1229                 i--;
1230         }
1231
1232         /* insert new index */
1233         err = ext4_ext_insert_index(handle, inode, path + at,
1234                                     le32_to_cpu(border), newblock);
1235
1236 cleanup:
1237         if (bh) {
1238                 if (buffer_locked(bh))
1239                         unlock_buffer(bh);
1240                 brelse(bh);
1241         }
1242
1243         if (err) {
1244                 /* free all allocated blocks in error case */
1245                 for (i = 0; i < depth; i++) {
1246                         if (!ablocks[i])
1247                                 continue;
1248                         ext4_free_blocks(handle, inode, NULL, ablocks[i], 1,
1249                                          EXT4_FREE_BLOCKS_METADATA);
1250                 }
1251         }
1252         kfree(ablocks);
1253
1254         return err;
1255 }
1256
1257 /*
1258  * ext4_ext_grow_indepth:
1259  * implements tree growing procedure:
1260  * - allocates new block
1261  * - moves top-level data (index block or leaf) into the new block
1262  * - initializes new top-level, creating index that points to the
1263  *   just created block
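 *
 * schematically, for a depth-0 root holding extents E1..E4 in the
 * inode body:
 *
 *   before:  inode.i_data = [hdr | E1 E2 E3 E4]      (depth 0)
 *   after:   inode.i_data = [hdr | idx -> new block] (depth 1)
 *            new block    = [hdr | E1 E2 E3 E4]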
1264  */
1265 static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
1266                                  unsigned int flags,
1267                                  struct ext4_extent *newext)
1268 {
1269         struct ext4_extent_header *neh;
1270         struct buffer_head *bh;
1271         ext4_fsblk_t newblock;
1272         int err = 0;
1273
1274         newblock = ext4_ext_new_meta_block(handle, inode, NULL,
1275                 newext, &err, flags);
1276         if (newblock == 0)
1277                 return err;
1278
1279         bh = sb_getblk(inode->i_sb, newblock);
1280         if (unlikely(!bh))
1281                 return -ENOMEM;
1282         lock_buffer(bh);
1283
1284         err = ext4_journal_get_create_access(handle, bh);
1285         if (err) {
1286                 unlock_buffer(bh);
1287                 goto out;
1288         }
1289
1290         /* move top-level index/leaf into new block */
1291         memmove(bh->b_data, EXT4_I(inode)->i_data,
1292                 sizeof(EXT4_I(inode)->i_data));
1293
1294         /* set size of new block */
1295         neh = ext_block_hdr(bh);
1296         /* old root could have indexes or leaves
1297          * so calculate eh_max the right way */
1298         if (ext_depth(inode))
1299                 neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode, 0));
1300         else
1301                 neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0));
1302         neh->eh_magic = EXT4_EXT_MAGIC;
1303         ext4_extent_block_csum_set(inode, neh);
1304         set_buffer_uptodate(bh);
1305         unlock_buffer(bh);
1306
1307         err = ext4_handle_dirty_metadata(handle, inode, bh);
1308         if (err)
1309                 goto out;
1310
1311         /* Update top-level index: num,max,pointer */
1312         neh = ext_inode_hdr(inode);
1313         neh->eh_entries = cpu_to_le16(1);
1314         ext4_idx_store_pblock(EXT_FIRST_INDEX(neh), newblock);
1315         if (neh->eh_depth == 0) {
1316                 /* Root extent block becomes index block */
1317                 neh->eh_max = cpu_to_le16(ext4_ext_space_root_idx(inode, 0));
1318                 EXT_FIRST_INDEX(neh)->ei_block =
1319                         EXT_FIRST_EXTENT(neh)->ee_block;
1320         }
1321         ext_debug("new root: num %d(%d), lblock %d, ptr %llu\n",
1322                   le16_to_cpu(neh->eh_entries), le16_to_cpu(neh->eh_max),
1323                   le32_to_cpu(EXT_FIRST_INDEX(neh)->ei_block),
1324                   ext4_idx_pblock(EXT_FIRST_INDEX(neh)));
1325
1326         le16_add_cpu(&neh->eh_depth, 1);
1327         ext4_mark_inode_dirty(handle, inode);
1328 out:
1329         brelse(bh);
1330
1331         return err;
1332 }
1333
1334 /*
1335  * ext4_ext_create_new_leaf:
1336  * finds an empty index slot and adds a new leaf.
1337  * if no free index is found, then it requests growing the tree in depth.
1338  */
1339 static int ext4_ext_create_new_leaf(handle_t *handle, struct inode *inode,
1340                                     unsigned int mb_flags,
1341                                     unsigned int gb_flags,
1342                                     struct ext4_ext_path **ppath,
1343                                     struct ext4_extent *newext)
1344 {
1345         struct ext4_ext_path *path = *ppath;
1346         struct ext4_ext_path *curp;
1347         int depth, i, err = 0;
1348
1349 repeat:
1350         i = depth = ext_depth(inode);
1351
1352         /* walk up the tree and look for a free index entry */
1353         curp = path + depth;
1354         while (i > 0 && !EXT_HAS_FREE_INDEX(curp)) {
1355                 i--;
1356                 curp--;
1357         }
1358
1359         /* we use an already allocated block for the index block,
1360          * so subsequent data blocks should be contiguous */
1361         if (EXT_HAS_FREE_INDEX(curp)) {
1362                 /* if we found index with free entry, then use that
1363                  * entry: create all needed subtree and add new leaf */
1364                 err = ext4_ext_split(handle, inode, mb_flags, path, newext, i);
1365                 if (err)
1366                         goto out;
1367
1368                 /* refill path */
1369                 path = ext4_ext_find_extent(inode,
1370                                     (ext4_lblk_t)le32_to_cpu(newext->ee_block),
1371                                     ppath, gb_flags);
1372                 if (IS_ERR(path))
1373                         err = PTR_ERR(path);
1374         } else {
1375                 /* tree is full, time to grow in depth */
1376                 err = ext4_ext_grow_indepth(handle, inode, mb_flags, newext);
1377                 if (err)
1378                         goto out;
1379
1380                 /* refill path */
1381                 path = ext4_ext_find_extent(inode,
1382                                    (ext4_lblk_t)le32_to_cpu(newext->ee_block),
1383                                     ppath, gb_flags);
1384                 if (IS_ERR(path)) {
1385                         err = PTR_ERR(path);
1386                         goto out;
1387                 }
1388
1389                 /*
1390                  * only first (depth 0 -> 1) produces free space;
1391                  * in all other cases we have to split the grown tree
1392                  */
1393                 depth = ext_depth(inode);
1394                 if (path[depth].p_hdr->eh_entries == path[depth].p_hdr->eh_max) {
1395                         /* now we need to split */
1396                         goto repeat;
1397                 }
1398         }
1399
1400 out:
1401         return err;
1402 }
1403
1404 /*
1405  * search the closest allocated block to the left of *logical
1406  * and return it at @logical + its physical address at @phys
1407  * if *logical is the smallest allocated block, the function
1408  * returns 0 at @phys
1409  * return value contains 0 (success) or an error code
1410  */
1411 static int ext4_ext_search_left(struct inode *inode,
1412                                 struct ext4_ext_path *path,
1413                                 ext4_lblk_t *logical, ext4_fsblk_t *phys)
1414 {
1415         struct ext4_extent_idx *ix;
1416         struct ext4_extent *ex;
1417         int depth, ee_len;
1418
1419         if (unlikely(path == NULL)) {
1420                 EXT4_ERROR_INODE(inode, "path == NULL *logical %d!", *logical);
1421                 return -EIO;
1422         }
1423         depth = path->p_depth;
1424         *phys = 0;
1425
1426         if (depth == 0 && path->p_ext == NULL)
1427                 return 0;
1428
1429         /* usually the extent in the path covers blocks smaller
1430          * than *logical, but it can be that the extent is the
1431          * first one in the file */
1432
1433         ex = path[depth].p_ext;
1434         ee_len = ext4_ext_get_actual_len(ex);
1435         if (*logical < le32_to_cpu(ex->ee_block)) {
1436                 if (unlikely(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex)) {
1437                         EXT4_ERROR_INODE(inode,
1438                                          "EXT_FIRST_EXTENT != ex *logical %d ee_block %d!",
1439                                          *logical, le32_to_cpu(ex->ee_block));
1440                         return -EIO;
1441                 }
1442                 while (--depth >= 0) {
1443                         ix = path[depth].p_idx;
1444                         if (unlikely(ix != EXT_FIRST_INDEX(path[depth].p_hdr))) {
1445                                 EXT4_ERROR_INODE(inode,
1446                                   "ix (%d) != EXT_FIRST_INDEX (%d) (depth %d)!",
1447                                   ix != NULL ? le32_to_cpu(ix->ei_block) : 0,
1448                                   EXT_FIRST_INDEX(path[depth].p_hdr) != NULL ?
1449                 le32_to_cpu(EXT_FIRST_INDEX(path[depth].p_hdr)->ei_block) : 0,
1450                                   depth);
1451                                 return -EIO;
1452                         }
1453                 }
1454                 return 0;
1455         }
1456
1457         if (unlikely(*logical < (le32_to_cpu(ex->ee_block) + ee_len))) {
1458                 EXT4_ERROR_INODE(inode,
1459                                  "logical %d < ee_block %d + ee_len %d!",
1460                                  *logical, le32_to_cpu(ex->ee_block), ee_len);
1461                 return -EIO;
1462         }
1463
1464         *logical = le32_to_cpu(ex->ee_block) + ee_len - 1;
1465         *phys = ext4_ext_pblock(ex) + ee_len - 1;
1466         return 0;
1467 }
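
/*
 * Usage sketch (editorial illustration, not part of the original source):
 * for a leaf holding an extent that maps logical blocks 100-149, calling
 * ext4_ext_search_left() with *logical == 200 returns with *logical == 149
 * and *phys set to the physical block backing logical block 149.  Callers
 * typically feed this into the allocator as a goal hint so that new blocks
 * land near existing ones.
 */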
1468
1469 /*
1470  * search for the closest allocated block to the right of *logical
1471  * and return it at @logical, with its physical address at @phys;
1472  * if *logical is the largest allocated block, the function
1473  * returns 0 at @phys.
1474  * The return value is 0 on success or a negative error code.
1475  */
1476 static int ext4_ext_search_right(struct inode *inode,
1477                                  struct ext4_ext_path *path,
1478                                  ext4_lblk_t *logical, ext4_fsblk_t *phys,
1479                                  struct ext4_extent **ret_ex)
1480 {
1481         struct buffer_head *bh = NULL;
1482         struct ext4_extent_header *eh;
1483         struct ext4_extent_idx *ix;
1484         struct ext4_extent *ex;
1485         ext4_fsblk_t block;
1486         int depth;      /* Note, NOT eh_depth; depth from top of tree */
1487         int ee_len;
1488
1489         if (unlikely(path == NULL)) {
1490                 EXT4_ERROR_INODE(inode, "path == NULL *logical %d!", *logical);
1491                 return -EIO;
1492         }
1493         depth = path->p_depth;
1494         *phys = 0;
1495
1496         if (depth == 0 && path->p_ext == NULL)
1497                 return 0;
1498
1499         /* usually the extent in the path covers blocks smaller
1500          * than *logical, but it can be that the extent is the
1501          * first one in the file */
1502
1503         ex = path[depth].p_ext;
1504         ee_len = ext4_ext_get_actual_len(ex);
1505         if (*logical < le32_to_cpu(ex->ee_block)) {
1506                 if (unlikely(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex)) {
1507                         EXT4_ERROR_INODE(inode,
1508                                          "first_extent(path[%d].p_hdr) != ex",
1509                                          depth);
1510                         return -EIO;
1511                 }
1512                 while (--depth >= 0) {
1513                         ix = path[depth].p_idx;
1514                         if (unlikely(ix != EXT_FIRST_INDEX(path[depth].p_hdr))) {
1515                                 EXT4_ERROR_INODE(inode,
1516                                                  "ix != EXT_FIRST_INDEX *logical %d!",
1517                                                  *logical);
1518                                 return -EIO;
1519                         }
1520                 }
1521                 goto found_extent;
1522         }
1523
1524         if (unlikely(*logical < (le32_to_cpu(ex->ee_block) + ee_len))) {
1525                 EXT4_ERROR_INODE(inode,
1526                                  "logical %d < ee_block %d + ee_len %d!",
1527                                  *logical, le32_to_cpu(ex->ee_block), ee_len);
1528                 return -EIO;
1529         }
1530
1531         if (ex != EXT_LAST_EXTENT(path[depth].p_hdr)) {
1532                 /* next allocated block in this leaf */
1533                 ex++;
1534                 goto found_extent;
1535         }
1536
1537         /* go up and search for index to the right */
1538         while (--depth >= 0) {
1539                 ix = path[depth].p_idx;
1540                 if (ix != EXT_LAST_INDEX(path[depth].p_hdr))
1541                         goto got_index;
1542         }
1543
1544         /* we've gone up to the root and found no index to the right */
1545         return 0;
1546
1547 got_index:
1548         /* we've found index to the right, let's
1549          * follow it and find the closest allocated
1550          * block to the right */
1551         ix++;
1552         block = ext4_idx_pblock(ix);
1553         while (++depth < path->p_depth) {
1554                 /* subtract from p_depth to get proper eh_depth */
1555                 bh = read_extent_tree_block(inode, block,
1556                                             path->p_depth - depth, 0);
1557                 if (IS_ERR(bh))
1558                         return PTR_ERR(bh);
1559                 eh = ext_block_hdr(bh);
1560                 ix = EXT_FIRST_INDEX(eh);
1561                 block = ext4_idx_pblock(ix);
1562                 put_bh(bh);
1563         }
1564
1565         bh = read_extent_tree_block(inode, block, path->p_depth - depth, 0);
1566         if (IS_ERR(bh))
1567                 return PTR_ERR(bh);
1568         eh = ext_block_hdr(bh);
1569         ex = EXT_FIRST_EXTENT(eh);
1570 found_extent:
1571         *logical = le32_to_cpu(ex->ee_block);
1572         *phys = ext4_ext_pblock(ex);
1573         *ret_ex = ex;
1574         if (bh)
1575                 put_bh(bh);
1576         return 0;
1577 }
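
/*
 * Traversal sketch (editorial illustration): in a depth-2 tree, got_index
 * advances to the next index entry at the shallowest level that has one,
 * then descends through the first index entry of each lower block until it
 * reaches the leftmost leaf of that subtree; the first extent of that leaf
 * is the right neighbour returned in *ret_ex.
 */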
1578
1579 /*
1580  * ext4_ext_next_allocated_block:
1581  * returns the allocated block in the subsequent extent or EXT_MAX_BLOCKS.
1582  * NOTE: it treats the block number from an index entry as an
1583  * allocated block. Thus, index entries have to be consistent
1584  * with leaves.
1585  */
1586 ext4_lblk_t
1587 ext4_ext_next_allocated_block(struct ext4_ext_path *path)
1588 {
1589         int depth;
1590
1591         BUG_ON(path == NULL);
1592         depth = path->p_depth;
1593
1594         if (depth == 0 && path->p_ext == NULL)
1595                 return EXT_MAX_BLOCKS;
1596
1597         while (depth >= 0) {
1598                 if (depth == path->p_depth) {
1599                         /* leaf */
1600                         if (path[depth].p_ext &&
1601                                 path[depth].p_ext !=
1602                                         EXT_LAST_EXTENT(path[depth].p_hdr))
1603                           return le32_to_cpu(path[depth].p_ext[1].ee_block);
1604                 } else {
1605                         /* index */
1606                         if (path[depth].p_idx !=
1607                                         EXT_LAST_INDEX(path[depth].p_hdr))
1608                           return le32_to_cpu(path[depth].p_idx[1].ei_block);
1609                 }
1610                 depth--;
1611         }
1612
1613         return EXT_MAX_BLOCKS;
1614 }
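
/*
 * Worked example (editorial illustration): with a leaf holding extents at
 * logical blocks 0-9 and 50-59 and the path positioned on the first extent,
 * this returns 50; positioned on the last extent of the last leaf, it
 * returns EXT_MAX_BLOCKS.
 */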
1615
1616 /*
1617  * ext4_ext_next_leaf_block:
1618  * returns the first allocated block of the next leaf, or EXT_MAX_BLOCKS
1619  */
1620 static ext4_lblk_t ext4_ext_next_leaf_block(struct ext4_ext_path *path)
1621 {
1622         int depth;
1623
1624         BUG_ON(path == NULL);
1625         depth = path->p_depth;
1626
1627         /* a zero-depth tree has no leaf blocks at all, hence no next leaf */
1628         if (depth == 0)
1629                 return EXT_MAX_BLOCKS;
1630
1631         /* go to index block */
1632         depth--;
1633
1634         while (depth >= 0) {
1635                 if (path[depth].p_idx !=
1636                                 EXT_LAST_INDEX(path[depth].p_hdr))
1637                         return (ext4_lblk_t)
1638                                 le32_to_cpu(path[depth].p_idx[1].ei_block);
1639                 depth--;
1640         }
1641
1642         return EXT_MAX_BLOCKS;
1643 }
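
/*
 * Editorial note: unlike ext4_ext_next_allocated_block(), this skips the
 * remainder of the current leaf entirely and reports the first block
 * covered by the next leaf, taken from the index level above it.
 */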
1644
1645 /*
1646  * ext4_ext_correct_indexes:
1647  * if the leaf gets modified and the modified extent is first in the leaf,
1648  * then we have to correct all indexes above it.
1649  * TODO: do we need to correct the tree in all cases?
1650  */
1651 static int ext4_ext_correct_indexes(handle_t *handle, struct inode *inode,
1652                                 struct ext4_ext_path *path)
1653 {
1654         struct ext4_extent_header *eh;
1655         int depth = ext_depth(inode);
1656         struct ext4_extent *ex;
1657         __le32 border;
1658         int k, err = 0;
1659
1660         eh = path[depth].p_hdr;
1661         ex = path[depth].p_ext;
1662
1663         if (unlikely(ex == NULL || eh == NULL)) {
1664                 EXT4_ERROR_INODE(inode,
1665                                  "ex %p == NULL or eh %p == NULL", ex, eh);
1666                 return -EIO;
1667         }
1668
1669         if (depth == 0) {
1670                 /* there is no tree at all */
1671                 return 0;
1672         }
1673
1674         if (ex != EXT_FIRST_EXTENT(eh)) {
1675                 /* we correct tree if first leaf got modified only */
1676                 return 0;
1677         }
1678
1679         /*
1680          * TODO: we need correction if border is smaller than current one
1681          */
1682         k = depth - 1;
1683         border = path[depth].p_ext->ee_block;
1684         err = ext4_ext_get_access(handle, inode, path + k);
1685         if (err)
1686                 return err;
1687         path[k].p_idx->ei_block = border;
1688         err = ext4_ext_dirty(handle, inode, path + k);
1689         if (err)
1690                 return err;
1691
1692         while (k--) {
1693                 /* change all left-side indexes */
1694                 if (path[k+1].p_idx != EXT_FIRST_INDEX(path[k+1].p_hdr))
1695                         break;
1696                 err = ext4_ext_get_access(handle, inode, path + k);
1697                 if (err)
1698                         break;
1699                 path[k].p_idx->ei_block = border;
1700                 err = ext4_ext_dirty(handle, inode, path + k);
1701                 if (err)
1702                         break;
1703         }
1704
1705         return err;
1706 }
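
/*
 * Propagation sketch (editorial illustration): if the first extent of a
 * depth-2 leaf now starts at logical block 42, the parent index entry is
 * set to 42; if that entry is itself the first in its block, the
 * grandparent entry is updated as well, and so on toward the root.
 */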
1707
1708 int
1709 ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,
1710                                 struct ext4_extent *ex2)
1711 {
1712         unsigned short ext1_ee_len, ext2_ee_len;
1713
1714         /*
1715          * Make sure that both extents are initialized. We don't merge
1716          * unwritten extents so that we can be sure that end_io code has
1717          * the extent that was written properly split out and conversion to
1718          * initialized is trivial.
1719          */
1720         if (ext4_ext_is_unwritten(ex1) != ext4_ext_is_unwritten(ex2))
1721                 return 0;
1722
1723         ext1_ee_len = ext4_ext_get_actual_len(ex1);
1724         ext2_ee_len = ext4_ext_get_actual_len(ex2);
1725
1726         if (le32_to_cpu(ex1->ee_block) + ext1_ee_len !=
1727                         le32_to_cpu(ex2->ee_block))
1728                 return 0;
1729
1730         /*
1731          * To allow future support for preallocated extents to be added
1732          * as an RO_COMPAT feature, refuse to merge two extents if
1733          * this can result in the top bit of ee_len being set.
1734          */
1735         if (ext1_ee_len + ext2_ee_len > EXT_INIT_MAX_LEN)
1736                 return 0;
1737         if (ext4_ext_is_unwritten(ex1) &&
1738             (ext4_test_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN) ||
1739              atomic_read(&EXT4_I(inode)->i_unwritten) ||
1740              (ext1_ee_len + ext2_ee_len > EXT_UNWRITTEN_MAX_LEN)))
1741                 return 0;
1742 #ifdef AGGRESSIVE_TEST
1743         if (ext1_ee_len >= 4)
1744                 return 0;
1745 #endif
1746
1747         if (ext4_ext_pblock(ex1) + ext1_ee_len == ext4_ext_pblock(ex2))
1748                 return 1;
1749         return 0;
1750 }
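
/*
 * Merge example (editorial illustration): an extent mapping logical blocks
 * 100-109 to physical blocks 5000-5009 and one mapping 110-119 to
 * 5010-5019 are contiguous in both spaces, share the same
 * written/unwritten state, and their combined length fits in ee_len, so
 * ext4_can_extents_be_merged() returns 1 for them.
 */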
1751
1752 /*
1753  * This function tries to merge the "ex" extent to the next extent in the tree.
1754  * It always tries to merge towards the right. If you want to merge
1755  * towards the left, pass "ex - 1" as the argument instead of "ex".
1756  * Returns 0 if the extents (ex and ex+1) were _not_ merged and returns
1757  * 1 if they got merged.
1758  */
1759 static int ext4_ext_try_to_merge_right(struct inode *inode,
1760                                  struct ext4_ext_path *path,
1761                                  struct ext4_extent *ex)
1762 {
1763         struct ext4_extent_header *eh;
1764         unsigned int depth, len;
1765         int merge_done = 0, unwritten;
1766
1767         depth = ext_depth(inode);
1768         BUG_ON(path[depth].p_hdr == NULL);
1769         eh = path[depth].p_hdr;
1770
1771         while (ex < EXT_LAST_EXTENT(eh)) {
1772                 if (!ext4_can_extents_be_merged(inode, ex, ex + 1))
1773                         break;
1774                 /* merge with next extent! */
1775                 unwritten = ext4_ext_is_unwritten(ex);
1776                 ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
1777                                 + ext4_ext_get_actual_len(ex + 1));
1778                 if (unwritten)
1779                         ext4_ext_mark_unwritten(ex);
1780
1781                 if (ex + 1 < EXT_LAST_EXTENT(eh)) {
1782                         len = (EXT_LAST_EXTENT(eh) - ex - 1)
1783                                 * sizeof(struct ext4_extent);
1784                         memmove(ex + 1, ex + 2, len);
1785                 }
1786                 le16_add_cpu(&eh->eh_entries, -1);
1787                 merge_done = 1;
1788                 WARN_ON(eh->eh_entries == 0);
1789                 if (!eh->eh_entries)
1790                         EXT4_ERROR_INODE(inode, "eh->eh_entries = 0!");
1791         }
1792
1793         return merge_done;
1794 }
1795
1796 /*
1797  * This function does a very simple check to see if we can collapse
1798  * an extent tree with a single extent tree leaf block into the inode.
1799  */
1800 static void ext4_ext_try_to_merge_up(handle_t *handle,
1801                                      struct inode *inode,
1802                                      struct ext4_ext_path *path)
1803 {
1804         size_t s;
1805         unsigned max_root = ext4_ext_space_root(inode, 0);
1806         ext4_fsblk_t blk;
1807
1808         if ((path[0].p_depth != 1) ||
1809             (le16_to_cpu(path[0].p_hdr->eh_entries) != 1) ||
1810             (le16_to_cpu(path[1].p_hdr->eh_entries) > max_root))
1811                 return;
1812
1813         /*
1814          * We need to modify the block allocation bitmap and the block
1815          * group descriptor to release the extent tree block.  If we
1816          * can't get the journal credits, give up.
1817          */
1818         if (ext4_journal_extend(handle, 2))
1819                 return;
1820
1821         /*
1822          * Copy the extent data up to the inode
1823          */
1824         blk = ext4_idx_pblock(path[0].p_idx);
1825         s = le16_to_cpu(path[1].p_hdr->eh_entries) *
1826                 sizeof(struct ext4_extent_idx);
1827         s += sizeof(struct ext4_extent_header);
1828
1829         path[1].p_maxdepth = path[0].p_maxdepth;
1830         memcpy(path[0].p_hdr, path[1].p_hdr, s);
1831         path[0].p_depth = 0;
1832         path[0].p_ext = EXT_FIRST_EXTENT(path[0].p_hdr) +
1833                 (path[1].p_ext - EXT_FIRST_EXTENT(path[1].p_hdr));
1834         path[0].p_hdr->eh_max = cpu_to_le16(max_root);
1835
1836         brelse(path[1].p_bh);
1837         ext4_free_blocks(handle, inode, NULL, blk, 1,
1838                          EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET);
1839 }
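
/*
 * Collapse sketch (editorial illustration): with a depth-1 tree whose root
 * holds a single index and whose only leaf fits within the inode's root
 * capacity (max_root), the leaf's header and entries are copied over the
 * root, the depth drops to 0, and the leaf block is freed back to the
 * filesystem.
 */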
1840
1841 /*
1842  * This function tries to merge the @ex extent with its neighbours in the
1843  * tree: to the left first, then to the right, then upward into the inode.
1844  */
1845 static void ext4_ext_try_to_merge(handle_t *handle,
1846                                   struct inode *inode,
1847                                   struct ext4_ext_path *path,
1848                                   struct ext4_extent *ex) {
1849         struct ext4_extent_header *eh;
1850         unsigned int depth;
1851         int merge_done = 0;
1852
1853         depth = ext_depth(inode);
1854         BUG_ON(path[depth].p_hdr == NULL);
1855         eh = path[depth].p_hdr;
1856
1857         if (ex > EXT_FIRST_EXTENT(eh))
1858                 merge_done = ext4_ext_try_to_merge_right(inode, path, ex - 1);
1859
1860         if (!merge_done)
1861                 (void) ext4_ext_try_to_merge_right(inode, path, ex);
1862
1863         ext4_ext_try_to_merge_up(handle, inode, path);
1864 }
1865
1866 /*
1867  * check if a portion of the "newext" extent overlaps with an
1868  * existing extent.
1869  *
1870  * If an overlap is discovered, it updates the length of newext
1871  * such that there will be no overlap, and then returns 1.
1872  * If no overlap is found, it returns 0.
1873  */
1874 static unsigned int ext4_ext_check_overlap(struct ext4_sb_info *sbi,
1875                                            struct inode *inode,
1876                                            struct ext4_extent *newext,
1877                                            struct ext4_ext_path *path)
1878 {
1879         ext4_lblk_t b1, b2;
1880         unsigned int depth, len1;
1881         unsigned int ret = 0;
1882
1883         b1 = le32_to_cpu(newext->ee_block);
1884         len1 = ext4_ext_get_actual_len(newext);
1885         depth = ext_depth(inode);
1886         if (!path[depth].p_ext)
1887                 goto out;
1888         b2 = EXT4_LBLK_CMASK(sbi, le32_to_cpu(path[depth].p_ext->ee_block));
1889
1890         /*
1891          * get the next allocated block if the extent in the path
1892          * is before the requested block(s)
1893          */
1894         if (b2 < b1) {
1895                 b2 = ext4_ext_next_allocated_block(path);
1896                 if (b2 == EXT_MAX_BLOCKS)
1897                         goto out;
1898                 b2 = EXT4_LBLK_CMASK(sbi, b2);
1899         }
1900
1901         /* check for wrap through zero on extent logical start block */
1902         if (b1 + len1 < b1) {
1903                 len1 = EXT_MAX_BLOCKS - b1;
1904                 newext->ee_len = cpu_to_le16(len1);
1905                 ret = 1;
1906         }
1907
1908         /* check for overlap */
1909         if (b1 + len1 > b2) {
1910                 newext->ee_len = cpu_to_le16(b2 - b1);
1911                 ret = 1;
1912         }
1913 out:
1914         return ret;
1915 }
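
/*
 * Overlap example (editorial illustration): if newext starts at logical
 * block 10 with length 20 while the next allocated block is 25, the tail
 * would overlap, so ee_len is trimmed to 15 (covering blocks 10-24) and
 * the function returns 1.
 */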
1916
1917 /*
1918  * ext4_ext_insert_extent:
1919  * tries to merge the requested extent into an existing extent or
1920  * inserts the requested extent as a new one into the tree,
1921  * creating a new leaf in the no-space case.
1922  */
1923 int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
1924                                 struct ext4_ext_path **ppath,
1925                                 struct ext4_extent *newext, int gb_flags)
1926 {
1927         struct ext4_ext_path *path = *ppath;
1928         struct ext4_extent_header *eh;
1929         struct ext4_extent *ex, *fex;
1930         struct ext4_extent *nearex; /* nearest extent */
1931         struct ext4_ext_path *npath = NULL;
1932         int depth, len, err;
1933         ext4_lblk_t next;
1934         int mb_flags = 0, unwritten;
1935
1936         if (unlikely(ext4_ext_get_actual_len(newext) == 0)) {
1937                 EXT4_ERROR_INODE(inode, "ext4_ext_get_actual_len(newext) == 0");
1938                 return -EIO;
1939         }
1940         depth = ext_depth(inode);
1941         ex = path[depth].p_ext;
1942         eh = path[depth].p_hdr;
1943         if (unlikely(path[depth].p_hdr == NULL)) {
1944                 EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth);
1945                 return -EIO;
1946         }
1947
1948         /* try to insert block into found extent and return */
1949         if (ex && !(gb_flags & EXT4_GET_BLOCKS_PRE_IO)) {
1950
1951                 /*
1952                  * Try to see whether we should rather test the extent to
1953                  * the right of ex, or the one to its left. We do this
1954                  * because ext4_ext_find_extent() can return either the
1955                  * extent on the left or on the right of the searched
1956                  * position; checking both makes merging more effective.
1957                  */
1958                 if (ex < EXT_LAST_EXTENT(eh) &&
1959                     (le32_to_cpu(ex->ee_block) +
1960                     ext4_ext_get_actual_len(ex) <
1961                     le32_to_cpu(newext->ee_block))) {
1962                         ex += 1;
1963                         goto prepend;
1964                 } else if ((ex > EXT_FIRST_EXTENT(eh)) &&
1965                            (le32_to_cpu(newext->ee_block) +
1966                            ext4_ext_get_actual_len(newext) <
1967                            le32_to_cpu(ex->ee_block)))
1968                         ex -= 1;
1969
1970                 /* Try to append newex to the ex */
1971                 if (ext4_can_extents_be_merged(inode, ex, newext)) {
1972                         ext_debug("append [%d]%d block to %u:[%d]%d"
1973                                   "(from %llu)\n",
1974                                   ext4_ext_is_unwritten(newext),
1975                                   ext4_ext_get_actual_len(newext),
1976                                   le32_to_cpu(ex->ee_block),
1977                                   ext4_ext_is_unwritten(ex),
1978                                   ext4_ext_get_actual_len(ex),
1979                                   ext4_ext_pblock(ex));
1980                         err = ext4_ext_get_access(handle, inode,
1981                                                   path + depth);
1982                         if (err)
1983                                 return err;
1984                         unwritten = ext4_ext_is_unwritten(ex);
1985                         ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
1986                                         + ext4_ext_get_actual_len(newext));
1987                         if (unwritten)
1988                                 ext4_ext_mark_unwritten(ex);
1989                         eh = path[depth].p_hdr;
1990                         nearex = ex;
1991                         goto merge;
1992                 }
1993
1994 prepend:
1995                 /* Try to prepend newex to the ex */
1996                 if (ext4_can_extents_be_merged(inode, newext, ex)) {
1997                         ext_debug("prepend %u[%d]%d block to %u:[%d]%d"
1998                                   "(from %llu)\n",
1999                                   le32_to_cpu(newext->ee_block),
2000                                   ext4_ext_is_unwritten(newext),
2001                                   ext4_ext_get_actual_len(newext),
2002                                   le32_to_cpu(ex->ee_block),
2003                                   ext4_ext_is_unwritten(ex),
2004                                   ext4_ext_get_actual_len(ex),
2005                                   ext4_ext_pblock(ex));
2006                         err = ext4_ext_get_access(handle, inode,
2007                                                   path + depth);
2008                         if (err)
2009                                 return err;
2010
2011                         unwritten = ext4_ext_is_unwritten(ex);
2012                         ex->ee_block = newext->ee_block;
2013                         ext4_ext_store_pblock(ex, ext4_ext_pblock(newext));
2014                         ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
2015                                         + ext4_ext_get_actual_len(newext));
2016                         if (unwritten)
2017                                 ext4_ext_mark_unwritten(ex);
2018                         eh = path[depth].p_hdr;
2019                         nearex = ex;
2020                         goto merge;
2021                 }
2022         }
2023
2024         depth = ext_depth(inode);
2025         eh = path[depth].p_hdr;
2026         if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max))
2027                 goto has_space;
2028
2029         /* perhaps the next leaf has space for us? */
2030         fex = EXT_LAST_EXTENT(eh);
2031         next = EXT_MAX_BLOCKS;
2032         if (le32_to_cpu(newext->ee_block) > le32_to_cpu(fex->ee_block))
2033                 next = ext4_ext_next_leaf_block(path);
2034         if (next != EXT_MAX_BLOCKS) {
2035                 ext_debug("next leaf block - %u\n", next);
2036                 BUG_ON(npath != NULL);
2037                 npath = ext4_ext_find_extent(inode, next, NULL, 0);
2038                 if (IS_ERR(npath))
2039                         return PTR_ERR(npath);
2040                 BUG_ON(npath->p_depth != path->p_depth);
2041                 eh = npath[depth].p_hdr;
2042                 if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max)) {
2043                         ext_debug("next leaf isn't full(%d)\n",
2044                                   le16_to_cpu(eh->eh_entries));
2045                         path = npath;
2046                         goto has_space;
2047                 }
2048                 ext_debug("next leaf has no free space(%d,%d)\n",
2049                           le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max));
2050         }
2051
2052         /*
2053          * There is no free space in the found leaf.
2054          * We're going to add a new leaf to the tree.
2055          */
2056         if (gb_flags & EXT4_GET_BLOCKS_METADATA_NOFAIL)
2057                 mb_flags = EXT4_MB_USE_RESERVED;
2058         err = ext4_ext_create_new_leaf(handle, inode, mb_flags, gb_flags,
2059                                        ppath, newext);
2060         if (err)
2061                 goto cleanup;
2062         depth = ext_depth(inode);
2063         eh = path[depth].p_hdr;
2064
2065 has_space:
2066         nearex = path[depth].p_ext;
2067
2068         err = ext4_ext_get_access(handle, inode, path + depth);
2069         if (err)
2070                 goto cleanup;
2071
2072         if (!nearex) {
2073                 /* there is no extent in this leaf, create first one */
2074                 ext_debug("first extent in the leaf: %u:%llu:[%d]%d\n",
2075                                 le32_to_cpu(newext->ee_block),
2076                                 ext4_ext_pblock(newext),
2077                                 ext4_ext_is_unwritten(newext),
2078                                 ext4_ext_get_actual_len(newext));
2079                 nearex = EXT_FIRST_EXTENT(eh);
2080         } else {
2081                 if (le32_to_cpu(newext->ee_block)
2082                            > le32_to_cpu(nearex->ee_block)) {
2083                         /* Insert after */
2084                         ext_debug("insert %u:%llu:[%d]%d after: "
2085                                         "nearest %p\n",
2086                                         le32_to_cpu(newext->ee_block),
2087                                         ext4_ext_pblock(newext),
2088                                         ext4_ext_is_unwritten(newext),
2089                                         ext4_ext_get_actual_len(newext),
2090                                         nearex);
2091                         nearex++;
2092                 } else {
2093                         /* Insert before */
2094                         BUG_ON(newext->ee_block == nearex->ee_block);
2095                         ext_debug("insert %u:%llu:[%d]%d before: "
2096                                         "nearest %p\n",
2097                                         le32_to_cpu(newext->ee_block),
2098                                         ext4_ext_pblock(newext),
2099                                         ext4_ext_is_unwritten(newext),
2100                                         ext4_ext_get_actual_len(newext),
2101                                         nearex);
2102                 }
2103                 len = EXT_LAST_EXTENT(eh) - nearex + 1;
2104                 if (len > 0) {
2105                         ext_debug("insert %u:%llu:[%d]%d: "
2106                                         "move %d extents from 0x%p to 0x%p\n",
2107                                         le32_to_cpu(newext->ee_block),
2108                                         ext4_ext_pblock(newext),
2109                                         ext4_ext_is_unwritten(newext),
2110                                         ext4_ext_get_actual_len(newext),
2111                                         len, nearex, nearex + 1);
2112                         memmove(nearex + 1, nearex,
2113                                 len * sizeof(struct ext4_extent));
2114                 }
2115         }
2116
2117         le16_add_cpu(&eh->eh_entries, 1);
2118         path[depth].p_ext = nearex;
2119         nearex->ee_block = newext->ee_block;
2120         ext4_ext_store_pblock(nearex, ext4_ext_pblock(newext));
2121         nearex->ee_len = newext->ee_len;
2122
2123 merge:
2124         /* try to merge extents */
2125         if (!(gb_flags & EXT4_GET_BLOCKS_PRE_IO))
2126                 ext4_ext_try_to_merge(handle, inode, path, nearex);
2127
2128
2129         /* time to correct all indexes above */
2130         err = ext4_ext_correct_indexes(handle, inode, path);
2131         if (err)
2132                 goto cleanup;
2133
2134         err = ext4_ext_dirty(handle, inode, path + path->p_depth);
2135
2136 cleanup:
2137         ext4_ext_drop_refs(npath);
2138         kfree(npath);
2139         return err;
2140 }
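
/*
 * Flow summary (editorial illustration): ext4_ext_insert_extent() first
 * tries to extend a neighbouring extent in place, then inserts into the
 * current leaf if it has room, then into the next leaf, and only as a
 * last resort grows or splits the tree via ext4_ext_create_new_leaf()
 * before inserting and re-merging.
 */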
2141
2142 static int ext4_fill_fiemap_extents(struct inode *inode,
2143                                     ext4_lblk_t block, ext4_lblk_t num,
2144                                     struct fiemap_extent_info *fieinfo)
2145 {
2146         struct ext4_ext_path *path = NULL;
2147         struct ext4_extent *ex;
2148         struct extent_status es;
2149         ext4_lblk_t next, next_del, start = 0, end = 0;
2150         ext4_lblk_t last = block + num;
2151         int exists, depth = 0, err = 0;
2152         unsigned int flags = 0;
2153         unsigned char blksize_bits = inode->i_sb->s_blocksize_bits;
2154
2155         while (block < last && block != EXT_MAX_BLOCKS) {
2156                 num = last - block;
2157                 /* find extent for this block */
2158                 down_read(&EXT4_I(inode)->i_data_sem);
2159
2160                 path = ext4_ext_find_extent(inode, block, &path, 0);
2161                 if (IS_ERR(path)) {
2162                         up_read(&EXT4_I(inode)->i_data_sem);
2163                         err = PTR_ERR(path);
2164                         path = NULL;
2165                         break;
2166                 }
2167
2168                 depth = ext_depth(inode);
2169                 if (unlikely(path[depth].p_hdr == NULL)) {
2170                         up_read(&EXT4_I(inode)->i_data_sem);
2171                         EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth);
2172                         err = -EIO;
2173                         break;
2174                 }
2175                 ex = path[depth].p_ext;
2176                 next = ext4_ext_next_allocated_block(path);
2177
2178                 flags = 0;
2179                 exists = 0;
2180                 if (!ex) {
2181                         /* there is no extent yet, so try to allocate
2182                          * all requested space */
2183                         start = block;
2184                         end = block + num;
2185                 } else if (le32_to_cpu(ex->ee_block) > block) {
2186                         /* need to allocate space before found extent */
2187                         start = block;
2188                         end = le32_to_cpu(ex->ee_block);
2189                         if (block + num < end)
2190                                 end = block + num;
2191                 } else if (block >= le32_to_cpu(ex->ee_block)
2192                                         + ext4_ext_get_actual_len(ex)) {
2193                         /* need to allocate space after found extent */
2194                         start = block;
2195                         end = block + num;
2196                         if (end >= next)
2197                                 end = next;
2198                 } else if (block >= le32_to_cpu(ex->ee_block)) {
2199                         /*
2200                          * some part of requested space is covered
2201                          * by found extent
2202                          */
2203                         start = block;
2204                         end = le32_to_cpu(ex->ee_block)
2205                                 + ext4_ext_get_actual_len(ex);
2206                         if (block + num < end)
2207                                 end = block + num;
2208                         exists = 1;
2209                 } else {
2210                         BUG();
2211                 }
2212                 BUG_ON(end <= start);
2213
2214                 if (!exists) {
2215                         es.es_lblk = start;
2216                         es.es_len = end - start;
2217                         es.es_pblk = 0;
2218                 } else {
2219                         es.es_lblk = le32_to_cpu(ex->ee_block);
2220                         es.es_len = ext4_ext_get_actual_len(ex);
2221                         es.es_pblk = ext4_ext_pblock(ex);
2222                         if (ext4_ext_is_unwritten(ex))
2223                                 flags |= FIEMAP_EXTENT_UNWRITTEN;
2224                 }
2225
2226                 /*
2227                  * Find delayed extent and update es accordingly. We call
2228                  * Find the delayed extent and update es accordingly. We call
2229                  * it even in the !exists case to find out whether es is the
2230                  */
2231                 next_del = ext4_find_delayed_extent(inode, &es);
2232                 if (!exists && next_del) {
2233                         exists = 1;
2234                         flags |= (FIEMAP_EXTENT_DELALLOC |
2235                                   FIEMAP_EXTENT_UNKNOWN);
2236                 }
2237                 up_read(&EXT4_I(inode)->i_data_sem);
2238
2239                 if (unlikely(es.es_len == 0)) {
2240                         EXT4_ERROR_INODE(inode, "es.es_len == 0");
2241                         err = -EIO;
2242                         break;
2243                 }
2244
2245                 /*
2246                  * This is possible iff next == next_del == EXT_MAX_BLOCKS.
2247                  * We need to check next == EXT_MAX_BLOCKS because an
2248                  * extent can carry both unwritten and delayed status:
2249                  * when a delayed-allocated extent is also allocated by
2250                  * fallocate, the status tree tracks both states in a
2251                  * single extent.
2252                  *
2253                  * So we could return an unwritten and delayed extent
2254                  * whose block is equal to 'next'.
2255                  */
2256                 if (next == next_del && next == EXT_MAX_BLOCKS) {
2257                         flags |= FIEMAP_EXTENT_LAST;
2258                         if (unlikely(next_del != EXT_MAX_BLOCKS ||
2259                                      next != EXT_MAX_BLOCKS)) {
2260                                 EXT4_ERROR_INODE(inode,
2261                                                  "next extent == %u, next "
2262                                                  "delalloc extent = %u",
2263                                                  next, next_del);
2264                                 err = -EIO;
2265                                 break;
2266                         }
2267                 }
2268
2269                 if (exists) {
2270                         err = fiemap_fill_next_extent(fieinfo,
2271                                 (__u64)es.es_lblk << blksize_bits,
2272                                 (__u64)es.es_pblk << blksize_bits,
2273                                 (__u64)es.es_len << blksize_bits,
2274                                 flags);
2275                         if (err < 0)
2276                                 break;
2277                         if (err == 1) {
2278                                 err = 0;
2279                                 break;
2280                         }
2281                 }
2282
2283                 block = es.es_lblk + es.es_len;
2284         }
2285
2286         ext4_ext_drop_refs(path);
2287         kfree(path);
2288         return err;
2289 }
2290
2291 /*
2292  * ext4_ext_put_gap_in_cache:
2293  * calculate boundaries of the gap that the requested block fits into
2294  * and cache this gap
2295  */
2296 static void
2297 ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path,
2298                                 ext4_lblk_t block)
2299 {
2300         int depth = ext_depth(inode);
2301         unsigned long len = 0;
2302         ext4_lblk_t lblock = 0;
2303         struct ext4_extent *ex;
2304
2305         ex = path[depth].p_ext;
2306         if (ex == NULL) {
2307                 /*
2308                  * there is no extent yet, so gap is [0;-] and we
2309                  * don't cache it
2310                  */
2311                 ext_debug("cache gap(whole file):");
2312         } else if (block < le32_to_cpu(ex->ee_block)) {
2313                 lblock = block;
2314                 len = le32_to_cpu(ex->ee_block) - block;
2315                 ext_debug("cache gap(before): %u [%u:%u]",
2316                                 block,
2317                                 le32_to_cpu(ex->ee_block),
2318                                  ext4_ext_get_actual_len(ex));
2319                 if (!ext4_find_delalloc_range(inode, lblock, lblock + len - 1))
2320                         ext4_es_insert_extent(inode, lblock, len, ~0,
2321                                               EXTENT_STATUS_HOLE);
2322         } else if (block >= le32_to_cpu(ex->ee_block)
2323                         + ext4_ext_get_actual_len(ex)) {
2324                 ext4_lblk_t next;
2325                 lblock = le32_to_cpu(ex->ee_block)
2326                         + ext4_ext_get_actual_len(ex);
2327
2328                 next = ext4_ext_next_allocated_block(path);
2329                 ext_debug("cache gap(after): [%u:%u] %u",
2330                                 le32_to_cpu(ex->ee_block),
2331                                 ext4_ext_get_actual_len(ex),
2332                                 block);
2333                 BUG_ON(next == lblock);
2334                 len = next - lblock;
2335                 if (!ext4_find_delalloc_range(inode, lblock, lblock + len - 1))
2336                         ext4_es_insert_extent(inode, lblock, len, ~0,
2337                                               EXTENT_STATUS_HOLE);
2338         } else {
2339                 BUG();
2340         }
2341
2342         ext_debug(" -> %u:%lu\n", lblock, len);
2343 }
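
/*
 * Gap example (editorial illustration): if block 70 falls between an
 * extent ending at logical block 49 and one starting at 100, the hole
 * [70, 99] is inserted into the extent status tree as EXTENT_STATUS_HOLE
 * (assuming no delayed-allocation blocks lie in that range), so later
 * lookups there can be answered without walking the extent tree again.
 */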
2344
2345 /*
2346  * ext4_ext_rm_idx:
2347  * removes index from the index block.
2348  * removes the index from the index block.
2349 static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode,
2350                         struct ext4_ext_path *path, int depth)
2351 {
2352         int err;
2353         ext4_fsblk_t leaf;
2354
2355         /* free index block */
2356         depth--;
2357         path = path + depth;
2358         leaf = ext4_idx_pblock(path->p_idx);
2359         if (unlikely(path->p_hdr->eh_entries == 0)) {
2360                 EXT4_ERROR_INODE(inode, "path->p_hdr->eh_entries == 0");
2361                 return -EIO;
2362         }
2363         err = ext4_ext_get_access(handle, inode, path);
2364         if (err)
2365                 return err;
2366
2367         if (path->p_idx != EXT_LAST_INDEX(path->p_hdr)) {
2368                 int len = EXT_LAST_INDEX(path->p_hdr) - path->p_idx;
2369                 len *= sizeof(struct ext4_extent_idx);
2370                 memmove(path->p_idx, path->p_idx + 1, len);
2371         }
2372
2373         le16_add_cpu(&path->p_hdr->eh_entries, -1);
2374         err = ext4_ext_dirty(handle, inode, path);
2375         if (err)
2376                 return err;
2377         ext_debug("index is empty, remove it, free block %llu\n", leaf);
2378         trace_ext4_ext_rm_idx(inode, leaf);
2379
2380         ext4_free_blocks(handle, inode, NULL, leaf, 1,
2381                          EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET);
2382
2383         while (--depth >= 0) {
2384                 if (path->p_idx != EXT_FIRST_INDEX(path->p_hdr))
2385                         break;
2386                 path--;
2387                 err = ext4_ext_get_access(handle, inode, path);
2388                 if (err)
2389                         break;
2390                 path->p_idx->ei_block = (path+1)->p_idx->ei_block;
2391                 err = ext4_ext_dirty(handle, inode, path);
2392                 if (err)
2393                         break;
2394         }
2395         return err;
2396 }
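
/*
 * Removal sketch (editorial illustration): once the index entry pointing
 * at the freed leaf is deleted, the trailing loop walks back up the path;
 * wherever the remaining entry is the first in its block, the parent's
 * key is refreshed from it so the index chain stays consistent.
 */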
2397
2398 /*
2399  * ext4_ext_calc_credits_for_single_extent:
2400  * This routine returns the maximum credits needed to insert an extent
2401  * into the extent tree.
2402  * When passing the actual path, the caller should calculate credits
2403  * under i_data_sem.
2404  */
2405 int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int nrblocks,
2406                                                 struct ext4_ext_path *path)
2407 {
2408         if (path) {
2409                 int depth = ext_depth(inode);
2410                 int ret = 0;
2411
2412                 /* perhaps there is space in the leaf? */
2413                 if (le16_to_cpu(path[depth].p_hdr->eh_entries)
2414                                 < le16_to_cpu(path[depth].p_hdr->eh_max)) {
2415
2416                         /*
2417                          *  There is some space in the leaf, so there is no
2418                          *  need to account for a leaf block credit.
2419                          *
2420                          *  Bitmaps, block group descriptor blocks
2421                          *  and other metadata blocks still need to be
2422                          *  accounted for.
2423                          */
2424                         /* 1 bitmap, 1 block group descriptor */
2425                         ret = 2 + EXT4_META_TRANS_BLOCKS(inode->i_sb);
2426                         return ret;
2427                 }
2428         }
2429
2430         return ext4_chunk_trans_blocks(inode, nrblocks);
2431 }
2432
2433 /*
2434  * How many index/leaf blocks need to change/allocate to add @extents extents?
2435  *
2436  * If we add a single extent, then in the worst case each tree level's
2437  * index/leaf needs to be changed in case the tree splits.
2438  *
2439  * If more extents are inserted, they could cause the whole tree to split
2440  * more than once, but this is really rare.
2441  */
2442 int ext4_ext_index_trans_blocks(struct inode *inode, int extents)
2443 {
2444         int index;
2445         int depth;
2446
2447         /* If we are converting the inline data, only one is needed here. */
2448         if (ext4_has_inline_data(inode))
2449                 return 1;
2450
2451         depth = ext_depth(inode);
2452
2453         if (extents <= 1)
2454                 index = depth * 2;
2455         else
2456                 index = depth * 3;
2457
2458         return index;
2459 }
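
/*
 * Credit example (editorial illustration): for a depth-2 tree, inserting a
 * single extent is budgeted at 2 * 2 = 4 modified index/leaf blocks (one
 * possible split per level); inserting several extents is budgeted at
 * 3 * 2 = 6 to cover the rare case of the tree splitting more than once.
 */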
2460
2461 static inline int get_default_free_blocks_flags(struct inode *inode)
2462 {
2463         if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
2464                 return EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET;
2465         else if (ext4_should_journal_data(inode))
2466                 return EXT4_FREE_BLOCKS_FORGET;
2467         return 0;
2468 }
2469
2470 static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
2471                               struct ext4_extent *ex,
2472                               long long *partial_cluster,
2473                               ext4_lblk_t from, ext4_lblk_t to)
2474 {
2475         struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
2476         unsigned short ee_len =  ext4_ext_get_actual_len(ex);
2477         ext4_fsblk_t pblk;
2478         int flags = get_default_free_blocks_flags(inode);
2479
2480         /*
2481          * For bigalloc file systems, we never free a partial cluster
2482          * at the beginning of the extent.  Instead, we make a note
2483          * that we tried freeing the cluster, and check to see if we
2484          * need to free it on a subsequent call to ext4_remove_blocks,
2485          * or at the end of the ext4_truncate() operation.
2486          */
2487         flags |= EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER;
2488
2489         trace_ext4_remove_blocks(inode, ex, from, to, *partial_cluster);
2490         /*
2491          * If we have a partial cluster, and it's different from the
2492          * cluster of the last block, we need to explicitly free the
2493          * partial cluster here.
2494          */
2495         pblk = ext4_ext_pblock(ex) + ee_len - 1;
2496         if ((*partial_cluster > 0) &&
2497             (EXT4_B2C(sbi, pblk) != *partial_cluster)) {
2498                 ext4_free_blocks(handle, inode, NULL,
2499                                  EXT4_C2B(sbi, *partial_cluster),
2500                                  sbi->s_cluster_ratio, flags);
2501                 *partial_cluster = 0;
2502         }
2503
2504 #ifdef EXTENTS_STATS
2505         {
2506                 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
2507                 spin_lock(&sbi->s_ext_stats_lock);
2508                 sbi->s_ext_blocks += ee_len;
2509                 sbi->s_ext_extents++;
2510                 if (ee_len < sbi->s_ext_min)
2511                         sbi->s_ext_min = ee_len;
2512                 if (ee_len > sbi->s_ext_max)
2513                         sbi->s_ext_max = ee_len;
2514                 if (ext_depth(inode) > sbi->s_depth_max)
2515                         sbi->s_depth_max = ext_depth(inode);
2516                 spin_unlock(&sbi->s_ext_stats_lock);
2517         }
2518 #endif
2519         if (from >= le32_to_cpu(ex->ee_block)
2520             && to == le32_to_cpu(ex->ee_block) + ee_len - 1) {
2521                 /* tail removal */
2522                 ext4_lblk_t num;
2523                 unsigned int unaligned;
2524
2525                 num = le32_to_cpu(ex->ee_block) + ee_len - from;
2526                 pblk = ext4_ext_pblock(ex) + ee_len - num;
2527                 /*
2528                  * Usually we want to free the partial cluster at the end of
2529                  * the extent, except for the situation when the cluster is
2530                  * still used by another extent (partial_cluster is negative).
2531                  */
2532                 if (*partial_cluster < 0 &&
2533                     -(*partial_cluster) == EXT4_B2C(sbi, pblk + num - 1))
2534                         flags |= EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER;
2535
2536                 ext_debug("free last %u blocks starting %llu partial %lld\n",
2537                           num, pblk, *partial_cluster);
2538                 ext4_free_blocks(handle, inode, NULL, pblk, num, flags);
2539                 /*
2540                  * If the block range to be freed didn't start at the
2541                  * beginning of a cluster, and we removed the entire
2542                  * extent and the cluster is not used by any other extent,
2543                  * save the partial cluster here, since we might need to
2544                  * delete it if we determine that the truncate operation has
2545                  * removed all of the blocks in the cluster.
2546                  *
2547                  * On the other hand, if we did not manage to free the whole
2548                  * extent, we have to mark the cluster as used (store negative
2549                  * cluster number in partial_cluster).
2550                  */
2551                 unaligned = EXT4_PBLK_COFF(sbi, pblk);
2552                 if (unaligned && (ee_len == num) &&
2553                     (*partial_cluster != -((long long)EXT4_B2C(sbi, pblk))))
2554                         *partial_cluster = EXT4_B2C(sbi, pblk);
2555                 else if (unaligned)
2556                         *partial_cluster = -((long long)EXT4_B2C(sbi, pblk));
2557                 else if (*partial_cluster > 0)
2558                         *partial_cluster = 0;
2559         } else
2560                 ext4_error(sbi->s_sb, "strange request: removal(2) "
2561                            "%u-%u from %u:%u\n",
2562                            from, to, le32_to_cpu(ex->ee_block), ee_len);
2563         return 0;
2564 }
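
/*
 * Sign convention (editorial illustration): *partial_cluster > 0 names a
 * cluster whose blocks may still need to be freed later; a negative value
 * marks that cluster as still in use by another extent, so it must not be
 * freed; 0 means no partial cluster is pending.
 */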
2565
2566
2567 /*
2568  * ext4_ext_rm_leaf() removes the extents associated with the
2569  * blocks appearing between "start" and "end", and splits the extents
2570  * if "start" and "end" appear in the same extent.
2571  *
2572  * @handle: The journal handle
2573  * @inode:  The file's inode
2574  * @path:   The path to the leaf
2575  * @partial_cluster: The cluster which we'll have to free if all extents
2576  *                   have been released from it. It goes negative in case
2577  *                   the cluster is still used.
2578  * @start:  The first block to remove
2579  * @end:    The last block to remove
2580  */
2581 static int
2582 ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
2583                  struct ext4_ext_path *path,
2584                  long long *partial_cluster,
2585                  ext4_lblk_t start, ext4_lblk_t end)
2586 {
2587         struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
2588         int err = 0, correct_index = 0;
2589         int depth = ext_depth(inode), credits;
2590         struct ext4_extent_header *eh;
2591         ext4_lblk_t a, b;
2592         unsigned num;
2593         ext4_lblk_t ex_ee_block;
2594         unsigned short ex_ee_len;
2595         unsigned unwritten = 0;
2596         struct ext4_extent *ex;
2597         ext4_fsblk_t pblk;
2598
2599         /* the header must have been checked already in ext4_ext_remove_space() */
2600         ext_debug("truncate since %u in leaf to %u\n", start, end);
2601         if (!path[depth].p_hdr)
2602                 path[depth].p_hdr = ext_block_hdr(path[depth].p_bh);
2603         eh = path[depth].p_hdr;
2604         if (unlikely(path[depth].p_hdr == NULL)) {
2605                 EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth);
2606                 return -EIO;
2607         }
2608         /* find where to start removing */
2609         ex = path[depth].p_ext;
2610         if (!ex)
2611                 ex = EXT_LAST_EXTENT(eh);
2612
2613         ex_ee_block = le32_to_cpu(ex->ee_block);
2614         ex_ee_len = ext4_ext_get_actual_len(ex);
2615
2616         /*
2617          * If we're starting with an extent other than the last one in the
2618          * node, we need to see if it shares a cluster with the extent to
2619          * the right (towards the end of the file). If its leftmost cluster
2620          * is this extent's rightmost cluster and it is not cluster aligned,
2621          * we'll mark it as a partial that is not to be deallocated.
2622          */
2623
2624         if (ex != EXT_LAST_EXTENT(eh)) {
2625                 ext4_fsblk_t current_pblk, right_pblk;
2626                 long long current_cluster, right_cluster;
2627
2628                 current_pblk = ext4_ext_pblock(ex) + ex_ee_len - 1;
2629                 current_cluster = (long long)EXT4_B2C(sbi, current_pblk);
2630                 right_pblk = ext4_ext_pblock(ex + 1);
2631                 right_cluster = (long long)EXT4_B2C(sbi, right_pblk);
2632                 if (current_cluster == right_cluster &&
2633                         EXT4_PBLK_COFF(sbi, right_pblk))
2634                         *partial_cluster = -right_cluster;
2635         }
2636
2637         trace_ext4_ext_rm_leaf(inode, start, ex, *partial_cluster);
2638
2639         while (ex >= EXT_FIRST_EXTENT(eh) &&
2640                         ex_ee_block + ex_ee_len > start) {
2641
2642                 if (ext4_ext_is_unwritten(ex))
2643                         unwritten = 1;
2644                 else
2645                         unwritten = 0;
2646
2647                 ext_debug("remove ext %u:[%d]%d\n", ex_ee_block,
2648                           unwritten, ex_ee_len);
2649                 path[depth].p_ext = ex;
2650
2651                 a = ex_ee_block > start ? ex_ee_block : start;
2652                 b = ex_ee_block+ex_ee_len - 1 < end ?
2653                         ex_ee_block+ex_ee_len - 1 : end;
2654
2655                 ext_debug("  border %u:%u\n", a, b);
2656
2657                 /* If this extent is beyond the end of the hole, skip it */
2658                 if (end < ex_ee_block) {
2659                         /*
2660                          * We're going to skip this extent and move to another,
2661                          * so if this extent is not cluster aligned we have
2662                          * to mark the current cluster as used to avoid
2663                          * accidentally freeing it later on
2664                          */
2665                         pblk = ext4_ext_pblock(ex);
2666                         if (EXT4_PBLK_COFF(sbi, pblk))
2667                                 *partial_cluster =
2668                                         -((long long)EXT4_B2C(sbi, pblk));
2669                         ex--;
2670                         ex_ee_block = le32_to_cpu(ex->ee_block);
2671                         ex_ee_len = ext4_ext_get_actual_len(ex);
2672                         continue;
2673                 } else if (b != ex_ee_block + ex_ee_len - 1) {
2674                         EXT4_ERROR_INODE(inode,
2675                                          "can not handle truncate %u:%u "
2676                                          "on extent %u:%u",
2677                                          start, end, ex_ee_block,
2678                                          ex_ee_block + ex_ee_len - 1);
2679                         err = -EIO;
2680                         goto out;
2681                 } else if (a != ex_ee_block) {
2682                         /* remove tail of the extent */
2683                         num = a - ex_ee_block;
2684                 } else {
2685                         /* remove whole extent: excellent! */
2686                         num = 0;
2687                 }
2688                 /*
2689                  * 3 for leaf, sb, and inode plus 2 (bmap and group
2690                  * descriptor) for each block group; assume two block
2691                  * groups plus ex_ee_len/blocks_per_block_group for
2692                  * the worst case
2693                  */
2694                 credits = 7 + 2*(ex_ee_len/EXT4_BLOCKS_PER_GROUP(inode->i_sb));
2695                 if (ex == EXT_FIRST_EXTENT(eh)) {
2696                         correct_index = 1;
2697                         credits += (ext_depth(inode)) + 1;
2698                 }
2699                 credits += EXT4_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb);
2700
2701                 err = ext4_ext_truncate_extend_restart(handle, inode, credits);
2702                 if (err)
2703                         goto out;
2704
2705                 err = ext4_ext_get_access(handle, inode, path + depth);
2706                 if (err)
2707                         goto out;
2708
2709                 err = ext4_remove_blocks(handle, inode, ex, partial_cluster,
2710                                          a, b);
2711                 if (err)
2712                         goto out;
2713
2714                 if (num == 0)
2715                         /* this extent is removed; mark slot entirely unused */
2716                         ext4_ext_store_pblock(ex, 0);
2717
2718                 ex->ee_len = cpu_to_le16(num);
2719                 /*
2720                  * Do not mark unwritten if all the blocks in the
2721                  * extent have been removed.
2722                  */
2723                 if (unwritten && num)
2724                         ext4_ext_mark_unwritten(ex);
2725                 /*
2726                  * If the extent was completely released,
2727                  * we need to remove it from the leaf
2728                  */
2729                 if (num == 0) {
2730                         if (end != EXT_MAX_BLOCKS - 1) {
2731                                 /*
2732                                  * For hole punching, we need to scoot all the
2733                                  * extents up when an extent is removed so that
2734                          * we don't have blank extents in the middle
2735                                  */
2736                                 memmove(ex, ex+1, (EXT_LAST_EXTENT(eh) - ex) *
2737                                         sizeof(struct ext4_extent));
2738
2739                                 /* Now get rid of the one at the end */
2740                                 memset(EXT_LAST_EXTENT(eh), 0,
2741                                         sizeof(struct ext4_extent));
2742                         }
2743                         le16_add_cpu(&eh->eh_entries, -1);
2744                 } else if (*partial_cluster > 0)
2745                         *partial_cluster = 0;
2746
2747                 err = ext4_ext_dirty(handle, inode, path + depth);
2748                 if (err)
2749                         goto out;
2750
2751                 ext_debug("new extent: %u:%u:%llu\n", ex_ee_block, num,
2752                                 ext4_ext_pblock(ex));
2753                 ex--;
2754                 ex_ee_block = le32_to_cpu(ex->ee_block);
2755                 ex_ee_len = ext4_ext_get_actual_len(ex);
2756         }
2757
2758         if (correct_index && eh->eh_entries)
2759                 err = ext4_ext_correct_indexes(handle, inode, path);
2760
2761         /*
2762          * If there's a partial cluster and at least one extent remains in
2763          * the leaf, free the partial cluster if it isn't shared with the
2764          * current extent.  If there's a partial cluster and no extents
2765          * remain in the leaf, it can't be freed here.  It can only be
2766          * freed when it's possible to determine if it's not shared with
2767          * any other extent - when the next leaf is processed or when space
2768          * removal is complete.
2769          */
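         /*
          * Example (editor's sketch, illustrative, s_cluster_ratio = 4):
          * if *partial_cluster == 7 and the last extent examined ends in
          * cluster 9, cluster 7 cannot be shared with it, so its four
          * blocks (28..31) are freed here; had the extent ended in
          * cluster 7 instead, the decision would be deferred to a later
          * pass, as described above.
          */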
2770         if (*partial_cluster > 0 && eh->eh_entries &&
2771             (EXT4_B2C(sbi, ext4_ext_pblock(ex) + ex_ee_len - 1) !=
2772              *partial_cluster)) {
2773                 int flags = get_default_free_blocks_flags(inode);
2774
2775                 ext4_free_blocks(handle, inode, NULL,
2776                                  EXT4_C2B(sbi, *partial_cluster),
2777                                  sbi->s_cluster_ratio, flags);
2778                 *partial_cluster = 0;
2779         }
2780
2781         /* if this leaf is free, then we should
2782          * remove it from index block above */
2783         if (err == 0 && eh->eh_entries == 0 && path[depth].p_bh != NULL)
2784                 err = ext4_ext_rm_idx(handle, inode, path, depth);
2785
2786 out:
2787         return err;
2788 }
2789
2790 /*
2791  * ext4_ext_more_to_rm:
2792  * returns 1 if current index has to be freed (even partial)
2793  */
2794 static int
2795 ext4_ext_more_to_rm(struct ext4_ext_path *path)
2796 {
2797         BUG_ON(path->p_idx == NULL);
2798
2799         if (path->p_idx < EXT_FIRST_INDEX(path->p_hdr))
2800                 return 0;
2801
2802         /*
2803          * if truncate on deeper level happened, it wasn't partial,
2804          * so we have to consider current index for truncation
2805          */
2806         if (le16_to_cpu(path->p_hdr->eh_entries) == path->p_block)
2807                 return 0;
2808         return 1;
2809 }
2810
2811 int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start,
2812                           ext4_lblk_t end)
2813 {
2814         struct super_block *sb = inode->i_sb;
2815         int depth = ext_depth(inode);
2816         struct ext4_ext_path *path = NULL;
2817         long long partial_cluster = 0;
2818         handle_t *handle;
2819         int i = 0, err = 0;
2820
2821         ext_debug("truncate since %u to %u\n", start, end);
2822
2823         /* probably the first extent we free will be the last one in the block */
2824         handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, depth + 1);
2825         if (IS_ERR(handle))
2826                 return PTR_ERR(handle);
2827
2828 again:
2829         trace_ext4_ext_remove_space(inode, start, end, depth);
2830
2831         /*
2832          * Check if we are removing extents inside the extent tree. If that
2833          * is the case, we are going to punch a hole inside the extent tree
2834          * so we have to check whether we need to split the extent covering
2835          * the last block to remove so we can easily remove the part of it
2836          * in ext4_ext_rm_leaf().
2837          */
2838         if (end < EXT_MAX_BLOCKS - 1) {
2839                 struct ext4_extent *ex;
2840                 ext4_lblk_t ee_block;
2841
2842                 /* find extent for this block */
2843                 path = ext4_ext_find_extent(inode, end, NULL, EXT4_EX_NOCACHE);
2844                 if (IS_ERR(path)) {
2845                         ext4_journal_stop(handle);
2846                         return PTR_ERR(path);
2847                 }
2848                 depth = ext_depth(inode);
2849                 /* A leaf may be missing only if the inode has no blocks at all */
2850                 ex = path[depth].p_ext;
2851                 if (!ex) {
2852                         if (depth) {
2853                                 EXT4_ERROR_INODE(inode,
2854                                                  "path[%d].p_hdr == NULL",
2855                                                  depth);
2856                                 err = -EIO;
2857                         }
2858                         goto out;
2859                 }
2860
2861                 ee_block = le32_to_cpu(ex->ee_block);
2862
2863                 /*
2864                  * See if the last block is inside the extent, if so split
2865                  * the extent at 'end' block so we can easily remove the
2866                  * tail of the first part of the split extent in
2867                  * ext4_ext_rm_leaf().
2868                  */
2869                 if (end >= ee_block &&
2870                     end < ee_block + ext4_ext_get_actual_len(ex) - 1) {
2871                         /*
2872                          * Split the extent in two so that 'end' is the last
2873                          * block in the first new extent. Also we should not
2874                          * fail removing space due to ENOSPC so try to use
2875                          * reserved block if that happens.
2876                          */
2877                         err = ext4_force_split_extent_at(handle, inode, &path,
2878                                                          end + 1, 1);
2879                         if (err < 0)
2880                                 goto out;
2881                 }
2882         }
2883         /*
2884          * We start scanning from right side, freeing all the blocks
2885          * after i_size and walking into the tree depth-wise.
2886          */
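         /*
          * Illustration (editor's sketch, hypothetical two-level tree):
          * with a root index pointing at leaves {L1, L2, L3}, the loop
          * below descends to L3 first, frees its extents from right to
          * left, then steps back up and left to L2, and so on, removing
          * index entries whose subtrees become empty on the way back up.
          */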
2887         depth = ext_depth(inode);
2888         if (path) {
2889                 int k = i = depth;
2890                 while (--k > 0)
2891                         path[k].p_block =
2892                                 le16_to_cpu(path[k].p_hdr->eh_entries)+1;
2893         } else {
2894                 path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 1),
2895                                GFP_NOFS);
2896                 if (path == NULL) {
2897                         ext4_journal_stop(handle);
2898                         return -ENOMEM;
2899                 }
2900                 path[0].p_maxdepth = path[0].p_depth = depth;
2901                 path[0].p_hdr = ext_inode_hdr(inode);
2902                 i = 0;
2903
2904                 if (ext4_ext_check(inode, path[0].p_hdr, depth, 0)) {
2905                         err = -EIO;
2906                         goto out;
2907                 }
2908         }
2909         err = 0;
2910
2911         while (i >= 0 && err == 0) {
2912                 if (i == depth) {
2913                         /* this is leaf block */
2914                         err = ext4_ext_rm_leaf(handle, inode, path,
2915                                                &partial_cluster, start,
2916                                                end);
2917                         /* root level has p_bh == NULL, brelse() eats this */
2918                         brelse(path[i].p_bh);
2919                         path[i].p_bh = NULL;
2920                         i--;
2921                         continue;
2922                 }
2923
2924                 /* this is index block */
2925                 if (!path[i].p_hdr) {
2926                         ext_debug("initialize header\n");
2927                         path[i].p_hdr = ext_block_hdr(path[i].p_bh);
2928                 }
2929
2930                 if (!path[i].p_idx) {
2931                         /* this level hasn't been touched yet */
2932                         path[i].p_idx = EXT_LAST_INDEX(path[i].p_hdr);
2933                         path[i].p_block = le16_to_cpu(path[i].p_hdr->eh_entries)+1;
2934                         ext_debug("init index ptr: hdr 0x%p, num %d\n",
2935                                   path[i].p_hdr,
2936                                   le16_to_cpu(path[i].p_hdr->eh_entries));
2937                 } else {
2938                         /* we were already here, move to the next index */
2939                         path[i].p_idx--;
2940                 }
2941
2942                 ext_debug("level %d - index, first 0x%p, cur 0x%p\n",
2943                                 i, EXT_FIRST_INDEX(path[i].p_hdr),
2944                                 path[i].p_idx);
2945                 if (ext4_ext_more_to_rm(path + i)) {
2946                         struct buffer_head *bh;
2947                         /* go to the next level */
2948                         ext_debug("move to level %d (block %llu)\n",
2949                                   i + 1, ext4_idx_pblock(path[i].p_idx));
2950                         memset(path + i + 1, 0, sizeof(*path));
2951                         bh = read_extent_tree_block(inode,
2952                                 ext4_idx_pblock(path[i].p_idx), depth - i - 1,
2953                                 EXT4_EX_NOCACHE);
2954                         if (IS_ERR(bh)) {
2955                                 /* should we reset i_size? */
2956                                 err = PTR_ERR(bh);
2957                                 break;
2958                         }
2959                         /* Yield here to deal with large extent trees.
2960                          * Should be a no-op if we did IO above. */
2961                         cond_resched();
2962                         if (WARN_ON(i + 1 > depth)) {
2963                                 err = -EIO;
2964                                 break;
2965                         }
2966                         path[i + 1].p_bh = bh;
2967
2968                         /* save actual number of indexes since this
2969                          * number is changed at the next iteration */
2970                         path[i].p_block = le16_to_cpu(path[i].p_hdr->eh_entries);
2971                         i++;
2972                 } else {
2973                         /* we finished processing this index, go up */
2974                         if (path[i].p_hdr->eh_entries == 0 && i > 0) {
2975                                 /* index is empty, remove it;
2976                                  * the handle must already have been prepared
2977                                  * by the earlier leaf truncation */
2978                                 err = ext4_ext_rm_idx(handle, inode, path, i);
2979                         }
2980                         /* root level has p_bh == NULL, brelse() eats this */
2981                         brelse(path[i].p_bh);
2982                         path[i].p_bh = NULL;
2983                         i--;
2984                         ext_debug("return to level %d\n", i);
2985                 }
2986         }
2987
2988         trace_ext4_ext_remove_space_done(inode, start, end, depth,
2989                         partial_cluster, path->p_hdr->eh_entries);
2990
2991         /* If we still have something in the partial cluster and we have removed
2992          * even the first extent, then we should free the blocks in the partial
2993          * cluster as well. */
2994         if (partial_cluster > 0 && path->p_hdr->eh_entries == 0) {
2995                 int flags = get_default_free_blocks_flags(inode);
2996
2997                 ext4_free_blocks(handle, inode, NULL,
2998                                  EXT4_C2B(EXT4_SB(sb), partial_cluster),
2999                                  EXT4_SB(sb)->s_cluster_ratio, flags);
3000                 partial_cluster = 0;
3001         }
3002
3003         /* TODO: flexible tree reduction should be here */
3004         if (path->p_hdr->eh_entries == 0) {
3005                 /*
3006                  * truncate to zero freed all the tree,
3007                  * so we need to correct eh_depth
3008                  */
3009                 err = ext4_ext_get_access(handle, inode, path);
3010                 if (err == 0) {
3011                         ext_inode_hdr(inode)->eh_depth = 0;
3012                         ext_inode_hdr(inode)->eh_max =
3013                                 cpu_to_le16(ext4_ext_space_root(inode, 0));
3014                         err = ext4_ext_dirty(handle, inode, path);
3015                 }
3016         }
3017 out:
3018         ext4_ext_drop_refs(path);
3019         kfree(path);
3020         path = NULL;
3021         if (err == -EAGAIN)
3022                 goto again;
3023         ext4_journal_stop(handle);
3024
3025         return err;
3026 }
3027
3028 /*
3029  * called at mount time
3030  */
3031 void ext4_ext_init(struct super_block *sb)
3032 {
3033         /*
3034          * possible initialization would be here
3035          */
3036
3037         if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) {
3038 #if defined(AGGRESSIVE_TEST) || defined(CHECK_BINSEARCH) || defined(EXTENTS_STATS)
3039                 printk(KERN_INFO "EXT4-fs: file extents enabled"
3040 #ifdef AGGRESSIVE_TEST
3041                        ", aggressive tests"
3042 #endif
3043 #ifdef CHECK_BINSEARCH
3044                        ", check binsearch"
3045 #endif
3046 #ifdef EXTENTS_STATS
3047                        ", stats"
3048 #endif
3049                        "\n");
3050 #endif
3051 #ifdef EXTENTS_STATS
3052                 spin_lock_init(&EXT4_SB(sb)->s_ext_stats_lock);
3053                 EXT4_SB(sb)->s_ext_min = 1 << 30;
3054                 EXT4_SB(sb)->s_ext_max = 0;
3055 #endif
3056         }
3057 }
3058
3059 /*
3060  * called at umount time
3061  */
3062 void ext4_ext_release(struct super_block *sb)
3063 {
3064         if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS))
3065                 return;
3066
3067 #ifdef EXTENTS_STATS
3068         if (EXT4_SB(sb)->s_ext_blocks && EXT4_SB(sb)->s_ext_extents) {
3069                 struct ext4_sb_info *sbi = EXT4_SB(sb);
3070                 printk(KERN_ERR "EXT4-fs: %lu blocks in %lu extents (%lu ave)\n",
3071                         sbi->s_ext_blocks, sbi->s_ext_extents,
3072                         sbi->s_ext_blocks / sbi->s_ext_extents);
3073                 printk(KERN_ERR "EXT4-fs: extents: %lu min, %lu max, max depth %lu\n",
3074                         sbi->s_ext_min, sbi->s_ext_max, sbi->s_depth_max);
3075         }
3076 #endif
3077 }
3078
3079 static int ext4_zeroout_es(struct inode *inode, struct ext4_extent *ex)
3080 {
3081         ext4_lblk_t  ee_block;
3082         ext4_fsblk_t ee_pblock;
3083         unsigned int ee_len;
3084
3085         ee_block  = le32_to_cpu(ex->ee_block);
3086         ee_len    = ext4_ext_get_actual_len(ex);
3087         ee_pblock = ext4_ext_pblock(ex);
3088
3089         if (ee_len == 0)
3090                 return 0;
3091
3092         return ext4_es_insert_extent(inode, ee_block, ee_len, ee_pblock,
3093                                      EXTENT_STATUS_WRITTEN);
3094 }
3095
3096 /* FIXME!! we need to try to merge to left or right after zero-out  */
3097 static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
3098 {
3099         ext4_fsblk_t ee_pblock;
3100         unsigned int ee_len;
3101         int ret;
3102
3103         ee_len    = ext4_ext_get_actual_len(ex);
3104         ee_pblock = ext4_ext_pblock(ex);
3105
3106         ret = sb_issue_zeroout(inode->i_sb, ee_pblock, ee_len, GFP_NOFS);
3107         if (ret > 0)
3108                 ret = 0;
3109
3110         return ret;
3111 }
3112
3113 /*
3114  * ext4_split_extent_at() splits an extent at given block.
3115  *
3116  * @handle: the journal handle
3117  * @inode: the file inode
3118  * @path: the path to the extent
3119  * @split: the logical block where the extent is split.
3120  * @split_flag: indicates whether the extent can be zeroed out if the split
3121  *              fails, and the states (initialized or unwritten) of the new extents.
3122  * @flags: flags used to insert new extent to extent tree.
3123  *
3124  *
3125  * Splits extent [a, b] into two extents [a, @split) and [@split, b], whose
3126  * states are determined by split_flag.
3127  *
3128  * There are two cases:
3129  *  a> the extent is split into two extents.
3130  *  b> no split is needed, and the extent is just marked.
3131  *
3132  * return 0 on success.
3133  */
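/*
 * Concrete sketch of case a> (editor's example, illustrative numbers):
 * splitting the unwritten extent [100, 120) at block 108 trims the
 * in-place extent to [100, 108) (length 8) and inserts a new extent
 * [108, 120) whose physical start is shifted by the same 8 blocks.
 */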
3134 static int ext4_split_extent_at(handle_t *handle,
3135                              struct inode *inode,
3136                              struct ext4_ext_path **ppath,
3137                              ext4_lblk_t split,
3138                              int split_flag,
3139                              int flags)
3140 {
3141         struct ext4_ext_path *path = *ppath;
3142         ext4_fsblk_t newblock;
3143         ext4_lblk_t ee_block;
3144         struct ext4_extent *ex, newex, orig_ex, zero_ex;
3145         struct ext4_extent *ex2 = NULL;
3146         unsigned int ee_len, depth;
3147         int err = 0;
3148
3149         BUG_ON((split_flag & (EXT4_EXT_DATA_VALID1 | EXT4_EXT_DATA_VALID2)) ==
3150                (EXT4_EXT_DATA_VALID1 | EXT4_EXT_DATA_VALID2));
3151
3152         ext_debug("ext4_split_extent_at: inode %lu, logical "
3153                 "block %llu\n", inode->i_ino, (unsigned long long)split);
3154
3155         ext4_ext_show_leaf(inode, path);
3156
3157         depth = ext_depth(inode);
3158         ex = path[depth].p_ext;
3159         ee_block = le32_to_cpu(ex->ee_block);
3160         ee_len = ext4_ext_get_actual_len(ex);
3161         newblock = split - ee_block + ext4_ext_pblock(ex);
3162
3163         BUG_ON(split < ee_block || split >= (ee_block + ee_len));
3164         BUG_ON(!ext4_ext_is_unwritten(ex) &&
3165                split_flag & (EXT4_EXT_MAY_ZEROOUT |
3166                              EXT4_EXT_MARK_UNWRIT1 |
3167                              EXT4_EXT_MARK_UNWRIT2));
3168
3169         err = ext4_ext_get_access(handle, inode, path + depth);
3170         if (err)
3171                 goto out;
3172
3173         if (split == ee_block) {
3174                 /*
3175                  * case b: block @split is the first block of the extent,
3176                  * so we just change the state of the extent and no split
3177                  * is needed.
3178                  */
3179                 if (split_flag & EXT4_EXT_MARK_UNWRIT2)
3180                         ext4_ext_mark_unwritten(ex);
3181                 else
3182                         ext4_ext_mark_initialized(ex);
3183
3184                 if (!(flags & EXT4_GET_BLOCKS_PRE_IO))
3185                         ext4_ext_try_to_merge(handle, inode, path, ex);
3186
3187                 err = ext4_ext_dirty(handle, inode, path + path->p_depth);
3188                 goto out;
3189         }
3190
3191         /* case a */
3192         memcpy(&orig_ex, ex, sizeof(orig_ex));
3193         ex->ee_len = cpu_to_le16(split - ee_block);
3194         if (split_flag & EXT4_EXT_MARK_UNWRIT1)
3195                 ext4_ext_mark_unwritten(ex);
3196
3197         /*
3198          * the path may lead to a new leaf, not to the original leaf
3199          * any more, after ext4_ext_insert_extent() returns.
3200          */
3201         err = ext4_ext_dirty(handle, inode, path + depth);
3202         if (err)
3203                 goto fix_extent_len;
3204
3205         ex2 = &newex;
3206         ex2->ee_block = cpu_to_le32(split);
3207         ex2->ee_len   = cpu_to_le16(ee_len - (split - ee_block));
3208         ext4_ext_store_pblock(ex2, newblock);
3209         if (split_flag & EXT4_EXT_MARK_UNWRIT2)
3210                 ext4_ext_mark_unwritten(ex2);
3211
3212         err = ext4_ext_insert_extent(handle, inode, ppath, &newex, flags);
3213         if (err == -ENOSPC && (EXT4_EXT_MAY_ZEROOUT & split_flag)) {
3214                 if (split_flag & (EXT4_EXT_DATA_VALID1|EXT4_EXT_DATA_VALID2)) {
3215                         if (split_flag & EXT4_EXT_DATA_VALID1) {
3216                                 err = ext4_ext_zeroout(inode, ex2);
3217                                 zero_ex.ee_block = ex2->ee_block;
3218                                 zero_ex.ee_len = cpu_to_le16(
3219                                                 ext4_ext_get_actual_len(ex2));
3220                                 ext4_ext_store_pblock(&zero_ex,
3221                                                       ext4_ext_pblock(ex2));
3222                         } else {
3223                                 err = ext4_ext_zeroout(inode, ex);
3224                                 zero_ex.ee_block = ex->ee_block;
3225                                 zero_ex.ee_len = cpu_to_le16(
3226                                                 ext4_ext_get_actual_len(ex));
3227                                 ext4_ext_store_pblock(&zero_ex,
3228                                                       ext4_ext_pblock(ex));
3229                         }
3230                 } else {
3231                         err = ext4_ext_zeroout(inode, &orig_ex);
3232                         zero_ex.ee_block = orig_ex.ee_block;
3233                         zero_ex.ee_len = cpu_to_le16(
3234                                                 ext4_ext_get_actual_len(&orig_ex));
3235                         ext4_ext_store_pblock(&zero_ex,
3236                                               ext4_ext_pblock(&orig_ex));
3237                 }
3238
3239                 if (err)
3240                         goto fix_extent_len;
3241                 /* update the extent length and mark as initialized */
3242                 ex->ee_len = cpu_to_le16(ee_len);
3243                 ext4_ext_try_to_merge(handle, inode, path, ex);
3244                 err = ext4_ext_dirty(handle, inode, path + path->p_depth);
3245                 if (err)
3246                         goto fix_extent_len;
3247
3248                 /* update extent status tree */
3249                 err = ext4_zeroout_es(inode, &zero_ex);
3250
3251                 goto out;
3252         } else if (err)
3253                 goto fix_extent_len;
3254
3255 out:
3256         ext4_ext_show_leaf(inode, path);
3257         return err;
3258
3259 fix_extent_len:
3260         ex->ee_len = orig_ex.ee_len;
3261         ext4_ext_dirty(handle, inode, path + path->p_depth);
3262         return err;
3263 }
3264
3265 /*
3266  * ext4_split_extent() splits an extent and marks the extent covered
3267  * by @map as split_flag indicates.
3268  *
3269  * It may result in splitting the extent into multiple extents (up to three)
3270  * There are three possibilities:
3271  *   a> There is no split required
3272  *   b> Splits in two extents: Split is happening at either end of the extent
3273  *   c> Splits in three extents: Someone is splitting in the middle of the extent
3274  *
3275  */
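/*
 * Example of case c> (editor's sketch, illustrative numbers): with an
 * extent covering blocks [0, 100) and @map spanning [40, 60), the first
 * ext4_split_extent_at() call below splits at block 60, giving [0, 60)
 * and [60, 100); the second call then splits the left piece at block 40,
 * leaving [0, 40), [40, 60) and [60, 100).
 */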
3276 static int ext4_split_extent(handle_t *handle,
3277                               struct inode *inode,
3278                               struct ext4_ext_path **ppath,
3279                               struct ext4_map_blocks *map,
3280                               int split_flag,
3281                               int flags)
3282 {
3283         struct ext4_ext_path *path = *ppath;
3284         ext4_lblk_t ee_block;
3285         struct ext4_extent *ex;
3286         unsigned int ee_len, depth;
3287         int err = 0;
3288         int unwritten;
3289         int split_flag1, flags1;
3290         int allocated = map->m_len;
3291
3292         depth = ext_depth(inode);
3293         ex = path[depth].p_ext;
3294         ee_block = le32_to_cpu(ex->ee_block);
3295         ee_len = ext4_ext_get_actual_len(ex);
3296         unwritten = ext4_ext_is_unwritten(ex);
3297
3298         if (map->m_lblk + map->m_len < ee_block + ee_len) {
3299                 split_flag1 = split_flag & EXT4_EXT_MAY_ZEROOUT;
3300                 flags1 = flags | EXT4_GET_BLOCKS_PRE_IO;
3301                 if (unwritten)
3302                         split_flag1 |= EXT4_EXT_MARK_UNWRIT1 |
3303                                        EXT4_EXT_MARK_UNWRIT2;
3304                 if (split_flag & EXT4_EXT_DATA_VALID2)
3305                         split_flag1 |= EXT4_EXT_DATA_VALID1;
3306                 err = ext4_split_extent_at(handle, inode, ppath,
3307                                 map->m_lblk + map->m_len, split_flag1, flags1);
3308                 if (err)
3309                         goto out;
3310         } else {
3311                 allocated = ee_len - (map->m_lblk - ee_block);
3312         }
3313         /*
3314          * An updated path is required: the previous ext4_split_extent_at()
3315          * may have split the original leaf or zeroed out the extent.
3316          */
3317         path = ext4_ext_find_extent(inode, map->m_lblk, ppath, 0);
3318         if (IS_ERR(path))
3319                 return PTR_ERR(path);
3320         depth = ext_depth(inode);
3321         ex = path[depth].p_ext;
3322         if (!ex) {
3323                 EXT4_ERROR_INODE(inode, "unexpected hole at %lu",
3324                                  (unsigned long) map->m_lblk);
3325                 return -EIO;
3326         }
3327         unwritten = ext4_ext_is_unwritten(ex);
3328         split_flag1 = 0;
3329
3330         if (map->m_lblk >= ee_block) {
3331                 split_flag1 = split_flag & EXT4_EXT_DATA_VALID2;
3332                 if (unwritten) {
3333                         split_flag1 |= EXT4_EXT_MARK_UNWRIT1;
3334                         split_flag1 |= split_flag & (EXT4_EXT_MAY_ZEROOUT |
3335                                                      EXT4_EXT_MARK_UNWRIT2);
3336                 }
3337                 err = ext4_split_extent_at(handle, inode, ppath,
3338                                 map->m_lblk, split_flag1, flags);
3339                 if (err)
3340                         goto out;
3341         }
3342
3343         ext4_ext_show_leaf(inode, path);
3344 out:
3345         return err ? err : allocated;
3346 }
3347
3348 /*
3349  * This function is called by ext4_ext_map_blocks() if someone tries to write
3350  * to an unwritten extent. It may result in splitting the unwritten
3351  * extent into multiple extents (up to three - one initialized and two
3352  * unwritten).
3353  * There are three possibilities:
3354  *   a> There is no split required: Entire extent should be initialized
3355  *   b> Splits in two extents: Write is happening at either end of the extent
3356  *   c> Splits in three extents: Someone is writing in the middle of the extent
3357  *
3358  * Pre-conditions:
3359  *  - The extent pointed to by 'path' is unwritten.
3360  *  - The extent pointed to by 'path' contains a superset
3361  *    of the logical span [map->m_lblk, map->m_lblk + map->m_len).
3362  *
3363  * Post-conditions on success:
3364  *  - the returned value is the number of blocks beyond map->m_lblk
3365  *    that are allocated and initialized.
3366  *    It is guaranteed to be >= map->m_len.
3367  */
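/*
 * For instance (editor's sketch, hypothetical numbers): if the unwritten
 * extent covers [100, 200) and the caller maps [120, 130), a successful
 * conversion leaves [120, 130) initialized and returns a value >= 10,
 * i.e. at least map->m_len initialized blocks starting at map->m_lblk.
 */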
3368 static int ext4_ext_convert_to_initialized(handle_t *handle,
3369                                            struct inode *inode,
3370                                            struct ext4_map_blocks *map,
3371                                            struct ext4_ext_path **ppath,
3372                                            int flags)
3373 {
3374         struct ext4_ext_path *path = *ppath;
3375         struct ext4_sb_info *sbi;
3376         struct ext4_extent_header *eh;
3377         struct ext4_map_blocks split_map;
3378         struct ext4_extent zero_ex;
3379         struct ext4_extent *ex, *abut_ex;
3380         ext4_lblk_t ee_block, eof_block;
3381         unsigned int ee_len, depth, map_len = map->m_len;
3382         int allocated = 0, max_zeroout = 0;
3383         int err = 0;
3384         int split_flag = 0;
3385
3386         ext_debug("ext4_ext_convert_to_initialized: inode %lu, logical "
3387                 "block %llu, max_blocks %u\n", inode->i_ino,
3388                 (unsigned long long)map->m_lblk, map_len);
3389
3390         sbi = EXT4_SB(inode->i_sb);
3391         eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >>
3392                 inode->i_sb->s_blocksize_bits;
3393         if (eof_block < map->m_lblk + map_len)
3394                 eof_block = map->m_lblk + map_len;
3395
3396         depth = ext_depth(inode);
3397         eh = path[depth].p_hdr;
3398         ex = path[depth].p_ext;
3399         ee_block = le32_to_cpu(ex->ee_block);
3400         ee_len = ext4_ext_get_actual_len(ex);
3401         zero_ex.ee_len = 0;
3402
3403         trace_ext4_ext_convert_to_initialized_enter(inode, map, ex);
3404
3405         /* Pre-conditions */
3406         BUG_ON(!ext4_ext_is_unwritten(ex));
3407         BUG_ON(!in_range(map->m_lblk, ee_block, ee_len));
3408
3409         /*
3410          * Attempt to transfer newly initialized blocks from the currently
3411          * unwritten extent to its neighbor. This is much cheaper
3412          * than an insertion followed by a merge as those involve costly
3413          * memmove() calls. Transferring to the left is the common case in
3414          * steady state for workloads doing fallocate(FALLOC_FL_KEEP_SIZE)
3415          * followed by append writes.
3416          *
3417          * Limitations of the current logic:
3418          *  - L1: we do not deal with writes covering the whole extent.
3419          *    This would require removing the extent if the transfer
3420          *    is possible.
3421          *  - L2: we only attempt to merge with an extent stored in the
3422          *    same extent tree node.
3423          */
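         /*
          * Sketch of the left-merge fast path (editor's example,
          * illustrative numbers): with abut_ex = [100, 110) initialized
          * and physically contiguous with ex, a write of 5 blocks at the
          * start of the unwritten ex = [110, 130) grows abut_ex to
          * [100, 115) and shrinks ex to [115, 130), with no extent
          * inserted and no memmove().
          */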
3424         if ((map->m_lblk == ee_block) &&
3425                 /* See if we can merge left */
3426                 (map_len < ee_len) &&           /*L1*/
3427                 (ex > EXT_FIRST_EXTENT(eh))) {  /*L2*/
3428                 ext4_lblk_t prev_lblk;
3429                 ext4_fsblk_t prev_pblk, ee_pblk;
3430                 unsigned int prev_len;
3431
3432                 abut_ex = ex - 1;
3433                 prev_lblk = le32_to_cpu(abut_ex->ee_block);
3434                 prev_len = ext4_ext_get_actual_len(abut_ex);
3435                 prev_pblk = ext4_ext_pblock(abut_ex);
3436                 ee_pblk = ext4_ext_pblock(ex);
3437
3438                 /*
3439                  * A transfer of blocks from 'ex' to 'abut_ex' is allowed
3440                  * upon those conditions:
3441                  * - C1: abut_ex is initialized,
3442                  * - C2: abut_ex is logically abutting ex,
3443                  * - C3: abut_ex is physically abutting ex,
3444                  * - C4: abut_ex can receive the additional blocks without
3445                  *   overflowing the (initialized) length limit.
3446                  */
3447                 if ((!ext4_ext_is_unwritten(abut_ex)) &&                /*C1*/
3448                         ((prev_lblk + prev_len) == ee_block) &&         /*C2*/
3449                         ((prev_pblk + prev_len) == ee_pblk) &&          /*C3*/
3450                         (prev_len < (EXT_INIT_MAX_LEN - map_len))) {    /*C4*/
3451                         err = ext4_ext_get_access(handle, inode, path + depth);
3452                         if (err)
3453                                 goto out;
3454
3455                         trace_ext4_ext_convert_to_initialized_fastpath(inode,
3456                                 map, ex, abut_ex);
3457
3458                         /* Shift the start of ex by 'map_len' blocks */
3459                         ex->ee_block = cpu_to_le32(ee_block + map_len);
3460                         ext4_ext_store_pblock(ex, ee_pblk + map_len);
3461                         ex->ee_len = cpu_to_le16(ee_len - map_len);
3462                         ext4_ext_mark_unwritten(ex); /* Restore the flag */
3463
3464                         /* Extend abut_ex by 'map_len' blocks */
3465                         abut_ex->ee_len = cpu_to_le16(prev_len + map_len);
3466
3467                         /* Result: number of initialized blocks past m_lblk */
3468                         allocated = map_len;
3469                 }
3470         } else if (((map->m_lblk + map_len) == (ee_block + ee_len)) &&
3471                    (map_len < ee_len) &&        /*L1*/
3472                    ex < EXT_LAST_EXTENT(eh)) {  /*L2*/
3473                 /* See if we can merge right */
3474                 ext4_lblk_t next_lblk;
3475                 ext4_fsblk_t next_pblk, ee_pblk;
3476                 unsigned int next_len;
3477
3478                 abut_ex = ex + 1;
3479                 next_lblk = le32_to_cpu(abut_ex->ee_block);
3480                 next_len = ext4_ext_get_actual_len(abut_ex);
3481                 next_pblk = ext4_ext_pblock(abut_ex);
3482                 ee_pblk = ext4_ext_pblock(ex);
3483
3484                 /*
3485                  * A transfer of blocks from 'ex' to 'abut_ex' is allowed
3486                  * upon those conditions:
3487                  * - C1: abut_ex is initialized,
3488                  * - C2: abut_ex is logically abutting ex,
3489                  * - C3: abut_ex is physically abutting ex,
3490                  * - C4: abut_ex can receive the additional blocks without
3491                  *   overflowing the (initialized) length limit.
3492                  */
3493                 if ((!ext4_ext_is_unwritten(abut_ex)) &&                /*C1*/
3494                     ((map->m_lblk + map_len) == next_lblk) &&           /*C2*/
3495                     ((ee_pblk + ee_len) == next_pblk) &&                /*C3*/
3496                     (next_len < (EXT_INIT_MAX_LEN - map_len))) {        /*C4*/
3497                         err = ext4_ext_get_access(handle, inode, path + depth);
3498                         if (err)
3499                                 goto out;
3500
3501                         trace_ext4_ext_convert_to_initialized_fastpath(inode,
3502                                 map, ex, abut_ex);
3503
3504                         /* Shift the start of abut_ex by 'map_len' blocks */
3505                         abut_ex->ee_block = cpu_to_le32(next_lblk - map_len);
3506                         ext4_ext_store_pblock(abut_ex, next_pblk - map_len);
3507                         ex->ee_len = cpu_to_le16(ee_len - map_len);
3508                         ext4_ext_mark_unwritten(ex); /* Restore the flag */
3509
3510                         /* Extend abut_ex by 'map_len' blocks */
3511                         abut_ex->ee_len = cpu_to_le16(next_len + map_len);
3512
3513                         /* Result: number of initialized blocks past m_lblk */
3514                         allocated = map_len;
3515                 }
3516         }
3517         if (allocated) {
3518                 /* Mark the block containing both extents as dirty */
3519                 ext4_ext_dirty(handle, inode, path + depth);
3520
3521                 /* Update path to point to the right extent */
3522                 path[depth].p_ext = abut_ex;
3523                 goto out;
3524         } else
3525                 allocated = ee_len - (map->m_lblk - ee_block);
3526
3527         WARN_ON(map->m_lblk < ee_block);
3528         /*
3529          * It is safe to convert extent to initialized via explicit
3530          * zeroout only if extent is fully inside i_size or new_size.
3531          */
3532         split_flag |= ee_block + ee_len <= eof_block ? EXT4_EXT_MAY_ZEROOUT : 0;
3533
3534         if (EXT4_EXT_MAY_ZEROOUT & split_flag)
3535                 max_zeroout = sbi->s_extent_max_zeroout_kb >>
3536                         (inode->i_sb->s_blocksize_bits - 10);
3537
3538                 /* If the extent is smaller than s_extent_max_zeroout_kb, zero it out directly */
3539         if (max_zeroout && (ee_len <= max_zeroout)) {
3540                 err = ext4_ext_zeroout(inode, ex);
3541                 if (err)
3542                         goto out;
3543                 zero_ex.ee_block = ex->ee_block;
3544                 zero_ex.ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex));
3545                 ext4_ext_store_pblock(&zero_ex, ext4_ext_pblock(ex));
3546
3547                 err = ext4_ext_get_access(handle, inode, path + depth);
3548                 if (err)
3549                         goto out;
3550                 ext4_ext_mark_initialized(ex);
3551                 ext4_ext_try_to_merge(handle, inode, path, ex);
3552                 err = ext4_ext_dirty(handle, inode, path + path->p_depth);
3553                 goto out;
3554         }
3555
3556         /*
3557          * four cases:
3558          * 1. split the extent into three extents.
3559          * 2. split the extent into two extents, zeroout the first half.
3560          * 3. split the extent into two extents, zeroout the second half.
3561          * 4. split the extent into two extents without zeroout.
3562          */
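         /*
          * E.g. for case 2 (editor's sketch, illustrative numbers): a
          * write at [120, 130) inside the unwritten extent [100, 200),
          * with the head small enough to zero out, zeroes [100, 120) and
          * widens the split request to [100, 130), so only one split (at
          * block 130) is needed and all of [100, 130) ends up initialized.
          */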
3563         split_map.m_lblk = map->m_lblk;
3564         split_map.m_len = map->m_len;
3565
3566         if (max_zeroout && (allocated > map->m_len)) {
3567                 if (allocated <= max_zeroout) {
3568                         /* case 3 */
3569                         zero_ex.ee_block =
3570                                          cpu_to_le32(map->m_lblk);
3571                         zero_ex.ee_len = cpu_to_le16(allocated);
3572                         ext4_ext_store_pblock(&zero_ex,
3573                                 ext4_ext_pblock(ex) + map->m_lblk - ee_block);
3574                         err = ext4_ext_zeroout(inode, &zero_ex);
3575                         if (err)
3576                                 goto out;
3577                         split_map.m_lblk = map->m_lblk;
3578                         split_map.m_len = allocated;
3579                 } else if (map->m_lblk - ee_block + map->m_len < max_zeroout) {
3580                         /* case 2 */
3581                         if (map->m_lblk != ee_block) {
3582                                 zero_ex.ee_block = ex->ee_block;
3583                                 zero_ex.ee_len = cpu_to_le16(map->m_lblk -
3584                                                         ee_block);
3585                                 ext4_ext_store_pblock(&zero_ex,
3586                                                       ext4_ext_pblock(ex));
3587                                 err = ext4_ext_zeroout(inode, &zero_ex);
3588                                 if (err)
3589                                         goto out;
3590                         }
3591
3592                         split_map.m_lblk = ee_block;
3593                         split_map.m_len = map->m_lblk - ee_block + map->m_len;
3594                         allocated = map->m_len;
3595                 }
3596         }
3597
3598         allocated = ext4_split_extent(handle, inode, ppath,
3599                                       &split_map, split_flag, flags);
3600         if (allocated < 0)
3601                 err = allocated;
3602
3603 out:
3604         /* If we have gotten a failure, don't zero out status tree */
3605         if (!err)
3606                 err = ext4_zeroout_es(inode, &zero_ex);
3607         return err ? err : allocated;
3608 }
3609
3610 /*
3611  * This function is called by ext4_ext_map_blocks() from
3612  * ext4_get_blocks_dio_write() when DIO is used to write
3613  * to an unwritten extent.
3614  *
3615  * Writing to an unwritten extent may result in splitting the unwritten
3616  * extent into multiple initialized/unwritten extents (up to three)
3617  * There are three possibilities:
3618  *   a> There is no split required: Entire extent should be unwritten
3619  *   b> Splits in two extents: Write is happening at either end of the extent
3620  *   c> Splits in three extents: Someone is writing in the middle of the extent
3621  *
3622  * This works the same way in the case of initialized -> unwritten conversion.
3623  *
3624  * One or more index blocks may be needed if the extent tree grows after
3625  * the unwritten extent is split. To prevent ENOSPC from occurring at IO
3626  * completion, we need to split the unwritten extent before the DIO
3627  * submits the IO. The unwritten extent handled at this time will be split
3628  * into at most three unwritten extents. After the IO completes, the part
3629  * being filled will be converted to initialized by the end_io callback
3630  * via ext4_convert_unwritten_extents().
3631  *
3632  * Returns the size of unwritten extent to be written on success.
3633  */
3634 static int ext4_split_convert_extents(handle_t *handle,
3635                                         struct inode *inode,
3636                                         struct ext4_map_blocks *map,
3637                                         struct ext4_ext_path **ppath,
3638                                         int flags)
3639 {
3640         struct ext4_ext_path *path = *ppath;
3641         ext4_lblk_t eof_block;
3642         ext4_lblk_t ee_block;
3643         struct ext4_extent *ex;
3644         unsigned int ee_len;
3645         int split_flag = 0, depth;
3646
3647         ext_debug("%s: inode %lu, logical block %llu, max_blocks %u\n",
3648                   __func__, inode->i_ino,
3649                   (unsigned long long)map->m_lblk, map->m_len);
3650
3651         eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >>
3652                 inode->i_sb->s_blocksize_bits;
3653         if (eof_block < map->m_lblk + map->m_len)
3654                 eof_block = map->m_lblk + map->m_len;
3655         /*
3656          * It is safe to convert extent to initialized via explicit
3657          * zeroout only if the extent is fully inside i_size or new_size.
3658          */
3659         depth = ext_depth(inode);
3660         ex = path[depth].p_ext;
3661         ee_block = le32_to_cpu(ex->ee_block);
3662         ee_len = ext4_ext_get_actual_len(ex);
3663
3664         /* Convert to unwritten */
3665         if (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN) {
3666                 split_flag |= EXT4_EXT_DATA_VALID1;
3667         /* Convert to initialized */
3668         } else if (flags & EXT4_GET_BLOCKS_CONVERT) {
3669                 split_flag |= ee_block + ee_len <= eof_block ?
3670                               EXT4_EXT_MAY_ZEROOUT : 0;
3671                 split_flag |= (EXT4_EXT_MARK_UNWRIT2 | EXT4_EXT_DATA_VALID2);
3672         }
3673         flags |= EXT4_GET_BLOCKS_PRE_IO;
3674         return ext4_split_extent(handle, inode, ppath, map, split_flag, flags);
3675 }
3676
3677 static int ext4_convert_unwritten_extents_endio(handle_t *handle,
3678                                                 struct inode *inode,
3679                                                 struct ext4_map_blocks *map,
3680                                                 struct ext4_ext_path **ppath)
3681 {
3682         struct ext4_ext_path *path = *ppath;
3683         struct ext4_extent *ex;
3684         ext4_lblk_t ee_block;
3685         unsigned int ee_len;
3686         int depth;
3687         int err = 0;
3688
3689         depth = ext_depth(inode);
3690         ex = path[depth].p_ext;
3691         ee_block = le32_to_cpu(ex->ee_block);
3692         ee_len = ext4_ext_get_actual_len(ex);
3693
3694         ext_debug("ext4_convert_unwritten_extents_endio: inode %lu, logical "
3695                 "block %llu, max_blocks %u\n", inode->i_ino,
3696                   (unsigned long long)ee_block, ee_len);
3697
3698         /* If the extent is larger than requested, it is a clear sign that we
3699          * still have some extent state machine issues left, so an extent split
3700          * is still required.
3701          * TODO: once all related issues are fixed, this situation should be
3702          * illegal.
3703          */
3704         if (ee_block != map->m_lblk || ee_len > map->m_len) {
3705 #ifdef EXT4_DEBUG
3706                 ext4_warning("Inode (%ld) finished: extent logical block %llu,"
3707                              " len %u; IO logical block %llu, len %u\n",
3708                              inode->i_ino, (unsigned long long)ee_block, ee_len,
3709                              (unsigned long long)map->m_lblk, map->m_len);
3710 #endif
3711                 err = ext4_split_convert_extents(handle, inode, map, ppath,
3712                                                  EXT4_GET_BLOCKS_CONVERT);
3713                 if (err < 0)
3714                         return err;
3715                 path = ext4_ext_find_extent(inode, map->m_lblk, ppath, 0);
3716                 if (IS_ERR(path))
3717                         return PTR_ERR(path);
3718                 depth = ext_depth(inode);
3719                 ex = path[depth].p_ext;
3720         }
3721
3722         err = ext4_ext_get_access(handle, inode, path + depth);
3723         if (err)
3724                 goto out;
3725         /* first mark the extent as initialized */
3726         ext4_ext_mark_initialized(ex);
3727
3728         /* note: ext4_ext_correct_indexes() isn't needed here because
3729          * borders are not changed
3730          */
3731         ext4_ext_try_to_merge(handle, inode, path, ex);
3732
3733         /* Mark modified extent as dirty */
3734         err = ext4_ext_dirty(handle, inode, path + path->p_depth);
3735 out:
3736         ext4_ext_show_leaf(inode, path);
3737         return err;
3738 }
3739
3740 static void unmap_underlying_metadata_blocks(struct block_device *bdev,
3741                         sector_t block, int count)
3742 {
3743         int i;
3744         for (i = 0; i < count; i++)
3745                 unmap_underlying_metadata(bdev, block + i);
3746 }
3747
3748 /*
3749  * Handle EOFBLOCKS_FL flag, clearing it if necessary
3750  */
3751 static int check_eofblocks_fl(handle_t *handle, struct inode *inode,
3752                               ext4_lblk_t lblk,
3753                               struct ext4_ext_path *path,
3754                               unsigned int len)
3755 {
3756         int i, depth;
3757         struct ext4_extent_header *eh;
3758         struct ext4_extent *last_ex;
3759
3760         if (!ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS))
3761                 return 0;
3762
3763         depth = ext_depth(inode);
3764         eh = path[depth].p_hdr;
3765
3766         /*
3767          * We're going to remove EOFBLOCKS_FL entirely in the future so we
3768          * do not care about this case anymore. Simply remove the flag
3769          * if there are no extents.
3770          */
3771         if (unlikely(!eh->eh_entries))
3772                 goto out;
3773         last_ex = EXT_LAST_EXTENT(eh);
3774         /*
3775          * We should clear the EOFBLOCKS_FL flag if we are writing the
3776          * last block in the last extent in the file.  We test this by
3777          * first checking to see if the caller to
3778          * ext4_ext_get_blocks() was interested in the last block (or
3779          * a block beyond the last block) in the current extent.  If
3780          * this turns out to be false, we can bail out from this
3781          * function immediately.
3782          */
3783         if (lblk + len < le32_to_cpu(last_ex->ee_block) +
3784             ext4_ext_get_actual_len(last_ex))
3785                 return 0;
3786         /*
3787          * If the caller does appear to be planning to write at or
3788          * beyond the end of the current extent, we then test to see
3789          * if the current extent is the last extent in the file, by
3790          * checking to make sure it was reached via the rightmost node
3791          * at each level of the tree.
3792          */
3793         for (i = depth-1; i >= 0; i--)
3794                 if (path[i].p_idx != EXT_LAST_INDEX(path[i].p_hdr))
3795                         return 0;
3796 out:
3797         ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
3798         return ext4_mark_inode_dirty(handle, inode);
3799 }
3800
3801 /**
3802  * ext4_find_delalloc_range: find delayed allocated block in the given range.
3803  *
3804  * Return 1 if there is a delalloc block in the range, otherwise 0.
3805  */
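/*
 * E.g. (editor's sketch, hypothetical extent status): with a delayed
 * extent cached at [50, 60), a query for [55, 70) or [40, 52) returns 1,
 * while a query for [60, 70) returns 0.
 */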
3806 int ext4_find_delalloc_range(struct inode *inode,
3807                              ext4_lblk_t lblk_start,
3808                              ext4_lblk_t lblk_end)
3809 {
3810         struct extent_status es;
3811
3812         ext4_es_find_delayed_extent_range(inode, lblk_start, lblk_end, &es);
3813         if (es.es_len == 0)
3814                 return 0; /* there is no delayed extent in this tree */
3815         else if (es.es_lblk <= lblk_start &&
3816                  lblk_start < es.es_lblk + es.es_len)
3817                 return 1;
3818         else if (lblk_start <= es.es_lblk && es.es_lblk <= lblk_end)
3819                 return 1;
3820         else
3821                 return 0;
3822 }
3823
3824 int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk)
3825 {
3826         struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
3827         ext4_lblk_t lblk_start, lblk_end;
3828         lblk_start = EXT4_LBLK_CMASK(sbi, lblk);
3829         lblk_end = lblk_start + sbi->s_cluster_ratio - 1;
3830
3831         return ext4_find_delalloc_range(inode, lblk_start, lblk_end);
3832 }
3833
3834 /**
3835  * Determines how many complete clusters (out of those specified by the 'map')
3836  * are under delalloc and had quota reserved for them.
3837  * This function is called when we are writing out the blocks that were
3838  * originally written with their allocation delayed, but then the space was
3839  * allocated using fallocate() before the delayed allocation could be resolved.
3840  * The cases to look for are:
3841  * ('=' indicates delayed allocated blocks
3842  *  '-' indicates non-delayed allocated blocks)
3843  * (a) partial clusters towards beginning and/or end outside of allocated range
3844  *     are not delalloc'ed.
3845  *      Ex:
3846  *      |----c---=|====c====|====c====|===-c----|
3847  *               |++++++ allocated ++++++|
3848  *      ==> 4 complete clusters in above example
3849  *
3850  * (b) partial cluster (outside of allocated range) towards either end is
3851  *     marked for delayed allocation. In this case, we will exclude that
3852  *     cluster.
3853  *      Ex:
3854  *      |----====c========|========c========|
3855  *           |++++++ allocated ++++++|
3856  *      ==> 1 complete cluster in above example
3857  *
3858  *      Ex:
3859  *      |================c================|
3860  *            |++++++ allocated ++++++|
3861  *      ==> 0 complete clusters in above example
3862  *
3863  * ext4_da_update_reserve_space() will be called only if we
3864  * determine here that there were some "entire" clusters that span
3865  * this 'allocated' range.
3866  * In the non-bigalloc case, this function will just end up returning num_blks
3867  * without ever calling ext4_find_delalloc_range.
3868  */
3869 static unsigned int
3870 get_reserved_cluster_alloc(struct inode *inode, ext4_lblk_t lblk_start,
3871                            unsigned int num_blks)
3872 {
3873         struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
3874         ext4_lblk_t alloc_cluster_start, alloc_cluster_end;
3875         ext4_lblk_t lblk_from, lblk_to, c_offset;
3876         unsigned int allocated_clusters = 0;
3877
3878         alloc_cluster_start = EXT4_B2C(sbi, lblk_start);
3879         alloc_cluster_end = EXT4_B2C(sbi, lblk_start + num_blks - 1);
3880
3881         /* max possible clusters for this allocation */
3882         allocated_clusters = alloc_cluster_end - alloc_cluster_start + 1;
3883
3884         trace_ext4_get_reserved_cluster_alloc(inode, lblk_start, num_blks);
3885
3886         /* Check towards left side */
3887         c_offset = EXT4_LBLK_COFF(sbi, lblk_start);
3888         if (c_offset) {
3889                 lblk_from = EXT4_LBLK_CMASK(sbi, lblk_start);
3890                 lblk_to = lblk_from + c_offset - 1;
3891
3892                 if (ext4_find_delalloc_range(inode, lblk_from, lblk_to))
3893                         allocated_clusters--;
3894         }
3895
3896         /* Now check towards right. */
3897         c_offset = EXT4_LBLK_COFF(sbi, lblk_start + num_blks);
3898         if (allocated_clusters && c_offset) {
3899                 lblk_from = lblk_start + num_blks;
3900                 lblk_to = lblk_from + (sbi->s_cluster_ratio - c_offset) - 1;
3901
3902                 if (ext4_find_delalloc_range(inode, lblk_from, lblk_to))
3903                         allocated_clusters--;
3904         }
3905
3906         return allocated_clusters;
3907 }
3908
3909 static int
3910 convert_initialized_extent(handle_t *handle, struct inode *inode,
3911                            struct ext4_map_blocks *map,
3912                            struct ext4_ext_path **ppath, int flags,
3913                            unsigned int allocated, ext4_fsblk_t newblock)
3914 {
3915         struct ext4_ext_path *path = *ppath;
3916         struct ext4_extent *ex;
3917         ext4_lblk_t ee_block;
3918         unsigned int ee_len;
3919         int depth;
3920         int err = 0;
3921
3922         /*
3923          * Make sure that the extent is no bigger than we support with
3924          * an unwritten extent
3925          */
3926         if (map->m_len > EXT_UNWRITTEN_MAX_LEN)
3927                 map->m_len = EXT_UNWRITTEN_MAX_LEN / 2;
3928
3929         depth = ext_depth(inode);
3930         ex = path[depth].p_ext;
3931         ee_block = le32_to_cpu(ex->ee_block);
3932         ee_len = ext4_ext_get_actual_len(ex);
3933
3934         ext_debug("%s: inode %lu, logical "
3935                 "block %llu, max_blocks %u\n", __func__, inode->i_ino,
3936                   (unsigned long long)ee_block, ee_len);
3937
3938         if (ee_block != map->m_lblk || ee_len > map->m_len) {
3939                 err = ext4_split_convert_extents(handle, inode, map, ppath,
3940                                 EXT4_GET_BLOCKS_CONVERT_UNWRITTEN);
3941                 if (err < 0)
3942                         return err;
3943                 path = ext4_ext_find_extent(inode, map->m_lblk, ppath, 0);
3944                 if (IS_ERR(path))
3945                         return PTR_ERR(path);
3946                 depth = ext_depth(inode);
3947                 ex = path[depth].p_ext;
3948                 if (!ex) {
3949                         EXT4_ERROR_INODE(inode, "unexpected hole at %lu",
3950                                          (unsigned long) map->m_lblk);
3951                         return -EIO;
3952                 }
3953         }
3954
3955         err = ext4_ext_get_access(handle, inode, path + depth);
3956         if (err)
3957                 return err;
3958         /* first mark the extent as unwritten */
3959         ext4_ext_mark_unwritten(ex);
3960
3961         /* note: ext4_ext_correct_indexes() isn't needed here because
3962          * borders are not changed
3963          */
3964         ext4_ext_try_to_merge(handle, inode, path, ex);
3965
3966         /* Mark modified extent as dirty */
3967         err = ext4_ext_dirty(handle, inode, path + path->p_depth);
3968         if (err)
3969                 return err;
3970         ext4_ext_show_leaf(inode, path);
3971
3972         ext4_update_inode_fsync_trans(handle, inode, 1);
3973         err = check_eofblocks_fl(handle, inode, map->m_lblk, path, map->m_len);
3974         if (err)
3975                 return err;
3976         map->m_flags |= EXT4_MAP_UNWRITTEN;
3977         if (allocated > map->m_len)
3978                 allocated = map->m_len;
3979         map->m_len = allocated;
3980         return allocated;
3981 }
3982
3983 static int
3984 ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode,
3985                         struct ext4_map_blocks *map,
3986                         struct ext4_ext_path **ppath, int flags,
3987                         unsigned int allocated, ext4_fsblk_t newblock)
3988 {
3989         struct ext4_ext_path *path = *ppath;
3990         int ret = 0;
3991         int err = 0;
3992         ext4_io_end_t *io = ext4_inode_aio(inode);
3993
3994         ext_debug("ext4_ext_handle_unwritten_extents: inode %lu, logical "
3995                   "block %llu, max_blocks %u, flags %x, allocated %u\n",
3996                   inode->i_ino, (unsigned long long)map->m_lblk, map->m_len,
3997                   flags, allocated);
3998         ext4_ext_show_leaf(inode, path);
3999
4000         /*
4001          * When writing into unwritten space, we should not fail to
4002          * allocate metadata blocks for the new extent block if needed.
4003          */
4004         flags |= EXT4_GET_BLOCKS_METADATA_NOFAIL;
4005
4006         trace_ext4_ext_handle_unwritten_extents(inode, map, flags,
4007                                                     allocated, newblock);
4008
4009         /* get_block() before submitting the IO: split the extent */
4010         if (flags & EXT4_GET_BLOCKS_PRE_IO) {
4011                 ret = ext4_split_convert_extents(handle, inode, map, ppath,
4012                                          flags | EXT4_GET_BLOCKS_CONVERT);
4013                 if (ret <= 0)
4014                         goto out;
4015                 /*
4016                  * Flag the inode (non-AIO case) or the end_io struct
4017                  * (AIO case) so that this IO is converted to written
4018                  * when it completes.
4019                  */
4020                 if (io)
4021                         ext4_set_io_unwritten_flag(inode, io);
4022                 else
4023                         ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
4024                 map->m_flags |= EXT4_MAP_UNWRITTEN;
4025                 goto out;
4026         }
4027         /* at IO end_io completion, convert the filled extent to written */
4028         if (flags & EXT4_GET_BLOCKS_CONVERT) {
4029                 ret = ext4_convert_unwritten_extents_endio(handle, inode, map,
4030                                                            ppath);
4031                 if (ret >= 0) {
4032                         ext4_update_inode_fsync_trans(handle, inode, 1);
4033                         err = check_eofblocks_fl(handle, inode, map->m_lblk,
4034                                                  path, map->m_len);
4035                 } else
4036                         err = ret;
4037                 map->m_flags |= EXT4_MAP_MAPPED;
4038                 map->m_pblk = newblock;
4039                 if (allocated > map->m_len)
4040                         allocated = map->m_len;
4041                 map->m_len = allocated;
4042                 goto out2;
4043         }
4044         /* buffered IO case */
4045         /*
4046          * A repeated fallocate creation request:
4047          * we already have an unwritten extent.
4048          */
4049         if (flags & EXT4_GET_BLOCKS_UNWRIT_EXT) {
4050                 map->m_flags |= EXT4_MAP_UNWRITTEN;
4051                 goto map_out;
4052         }
4053
4054         /* buffered READ or buffered write_begin() lookup */
4055         if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
4056                 /*
4057                  * We have blocks reserved already.  We
4058                  * return allocated blocks so that delalloc
4059                  * won't do block reservation for us.  But
4060                  * the buffer head will be unmapped so that
4061                  * a read from the block returns 0s.
4062                  */
4063                 map->m_flags |= EXT4_MAP_UNWRITTEN;
4064                 goto out1;
4065         }
4066
4067         /* buffered write, writepage time, convert */
4068         ret = ext4_ext_convert_to_initialized(handle, inode, map, ppath, flags);
4069         if (ret >= 0)
4070                 ext4_update_inode_fsync_trans(handle, inode, 1);
4071 out:
4072         if (ret <= 0) {
4073                 err = ret;
4074                 goto out2;
4075         } else
4076                 allocated = ret;
4077         map->m_flags |= EXT4_MAP_NEW;
4078         /*
4079          * if we allocated more blocks than requested,
4080          * we need to make sure we unmap the extra blocks
4081          * allocated. The block actually needed will get
4082          * unmapped later, when we find the buffer_head
4083          * marked new.
4084          */
4085         if (allocated > map->m_len) {
4086                 unmap_underlying_metadata_blocks(inode->i_sb->s_bdev,
4087                                         newblock + map->m_len,
4088                                         allocated - map->m_len);
4089                 allocated = map->m_len;
4090         }
4091         map->m_len = allocated;
4092
4093         /*
4094          * If we have done fallocate at an offset that is already
4095          * delayed allocated, we would have block and quota
4096          * reservations done in the delayed write path. But
4097          * fallocate would have already updated the quota and block
4098          * count for this offset, so cancel these reservations.
4099          */
4100         if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) {
4101                 unsigned int reserved_clusters;
4102                 reserved_clusters = get_reserved_cluster_alloc(inode,
4103                                 map->m_lblk, map->m_len);
4104                 if (reserved_clusters)
4105                         ext4_da_update_reserve_space(inode,
4106                                                      reserved_clusters,
4107                                                      0);
4108         }
4109
4110 map_out:
4111         map->m_flags |= EXT4_MAP_MAPPED;
4112         if ((flags & EXT4_GET_BLOCKS_KEEP_SIZE) == 0) {
4113                 err = check_eofblocks_fl(handle, inode, map->m_lblk, path,
4114                                          map->m_len);
4115                 if (err < 0)
4116                         goto out2;
4117         }
4118 out1:
4119         if (allocated > map->m_len)
4120                 allocated = map->m_len;
4121         ext4_ext_show_leaf(inode, path);
4122         map->m_pblk = newblock;
4123         map->m_len = allocated;
4124 out2:
4125         return err ? err : allocated;
4126 }
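
/*
 * A rough summary of how the EXT4_GET_BLOCKS_* flags drive the branches
 * above (an illustrative sketch only; see the code above for the exact
 * ordering and side effects):
 *
 *      if (flags & EXT4_GET_BLOCKS_PRE_IO)
 *              split the extent before the IO is submitted;
 *      else if (flags & EXT4_GET_BLOCKS_CONVERT)
 *              the IO has completed: convert the filled extent to written;
 *      else if (flags & EXT4_GET_BLOCKS_UNWRIT_EXT)
 *              repeated fallocate: the unwritten extent already exists;
 *      else if (!(flags & EXT4_GET_BLOCKS_CREATE))
 *              plain lookup: report the range as unwritten;
 *      else
 *              buffered writeback: convert the extent to initialized;
 */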
4127
4128 /*
4129  * get_implied_cluster_alloc - check to see if the requested
4130  * allocation (in the map structure) overlaps with a cluster already
4131  * allocated in an extent.
4132  *      @sb     The filesystem superblock structure
4133  *      @map    The requested lblk->pblk mapping
4134  *      @ex     The extent structure which might contain an implied
4135  *                      cluster allocation
4136  *
4137  * This function is called by ext4_ext_map_blocks() after we failed to
4138  * find blocks that were already in the inode's extent tree.  Hence,
4139  * we know that the beginning of the requested region cannot overlap
4140  * the extent from the inode's extent tree.  There are three cases we
4141  * want to catch.  The first is this case:
4142  *
4143  *               |--- cluster # N--|
4144  *    |--- extent ---|  |---- requested region ---|
4145  *                      |==========|
4146  *
4147  * The second case that we need to test for is this one:
4148  *
4149  *   |--------- cluster # N ----------------|
4150  *         |--- requested region --|   |------- extent ----|
4151  *         |=======================|
4152  *
4153  * The third case is when the requested region lies between two extents
4154  * within the same cluster:
4155  *          |------------- cluster # N-------------|
4156  * |----- ex -----|                  |---- ex_right ----|
4157  *                  |------ requested region ------|
4158  *                  |================|
4159  *
4160  * In each of the above cases, we need to set map->m_pblk and
4161  * map->m_len so that they correspond to the extent labelled as
4162  * "|====|" from cluster #N, since it is already in use for data in
4163  * cluster EXT4_B2C(sbi, map->m_lblk).  We will then return 1 to
4164  * signal to ext4_ext_map_blocks() that map->m_pblk should be treated
4165  * as a new "allocated" block region.  Otherwise, we will return 0 and
4166  * ext4_ext_map_blocks() will then allocate one or more new clusters
4167  * by calling ext4_mb_new_blocks().
4168  */
4169 static int get_implied_cluster_alloc(struct super_block *sb,
4170                                      struct ext4_map_blocks *map,
4171                                      struct ext4_extent *ex,
4172                                      struct ext4_ext_path *path)
4173 {
4174         struct ext4_sb_info *sbi = EXT4_SB(sb);
4175         ext4_lblk_t c_offset = EXT4_LBLK_COFF(sbi, map->m_lblk);
4176         ext4_lblk_t ex_cluster_start, ex_cluster_end;
4177         ext4_lblk_t rr_cluster_start;
4178         ext4_lblk_t ee_block = le32_to_cpu(ex->ee_block);
4179         ext4_fsblk_t ee_start = ext4_ext_pblock(ex);
4180         unsigned short ee_len = ext4_ext_get_actual_len(ex);
4181
4182         /* The extent passed in that we are trying to match */
4183         ex_cluster_start = EXT4_B2C(sbi, ee_block);
4184         ex_cluster_end = EXT4_B2C(sbi, ee_block + ee_len - 1);
4185
4186         /* The requested region passed into ext4_map_blocks() */
4187         rr_cluster_start = EXT4_B2C(sbi, map->m_lblk);
4188
4189         if ((rr_cluster_start == ex_cluster_end) ||
4190             (rr_cluster_start == ex_cluster_start)) {
4191                 if (rr_cluster_start == ex_cluster_end)
4192                         ee_start += ee_len - 1;
4193                 map->m_pblk = EXT4_PBLK_CMASK(sbi, ee_start) + c_offset;
4194                 map->m_len = min(map->m_len,
4195                                  (unsigned) sbi->s_cluster_ratio - c_offset);
4196                 /*
4197                  * Check for and handle this case:
4198                  *
4199                  *   |--------- cluster # N-------------|
4200                  *                     |------- extent ----|
4201                  *         |--- requested region ---|
4202                  *         |===========|
4203                  */
4204
4205                 if (map->m_lblk < ee_block)
4206                         map->m_len = min(map->m_len, ee_block - map->m_lblk);
4207
4208                 /*
4209                  * Check for the case where there is already another allocated
4210                  * block to the right of 'ex' but before the end of the cluster.
4211                  *
4212                  *          |------------- cluster # N-------------|
4213                  * |----- ex -----|                  |---- ex_right ----|
4214                  *                  |------ requested region ------|
4215                  *                  |================|
4216                  */
4217                 if (map->m_lblk > ee_block) {
4218                         ext4_lblk_t next = ext4_ext_next_allocated_block(path);
4219                         map->m_len = min(map->m_len, next - map->m_lblk);
4220                 }
4221
4222                 trace_ext4_get_implied_cluster_alloc_exit(sb, map, 1);
4223                 return 1;
4224         }
4225
4226         trace_ext4_get_implied_cluster_alloc_exit(sb, map, 0);
4227         return 0;
4228 }
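
/*
 * Worked example of the bigalloc arithmetic used above (an illustrative
 * sketch; assumes s_cluster_ratio == 4, i.e. s_cluster_bits == 2, in
 * which case the macros reduce to the shift and mask shown):
 *
 *      ext4_lblk_t lblk = 10;
 *      ext4_lblk_t cluster = lblk >> 2;    // EXT4_B2C(): cluster 2 [8-11]
 *      ext4_lblk_t c_offset = lblk & 3;    // EXT4_LBLK_COFF(): offset 2
 *
 * So if an already-mapped extent ends in cluster 2, map->m_pblk becomes
 * the first physical block of that cluster plus c_offset, and map->m_len
 * is clipped to the 4 - c_offset = 2 blocks left in the cluster.
 */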
4229
4230
4231 /*
4232  * Block allocation/map/preallocation routine for extents based files
4233  *
4234  *
4235  * Needs to be called with
4236  * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system blocks
4237  * (i.e., create is zero); otherwise with down_write(&EXT4_I(inode)->i_data_sem)
4238  *
4239  * return > 0, number of blocks already mapped/allocated
4240  *          if create == 0 and these are pre-allocated blocks
4241  *              buffer head is unmapped
4242  *          otherwise blocks are mapped
4243  *
4244  * return = 0, if plain lookup failed (blocks have not been allocated)
4245  *          buffer head is unmapped
4246  *
4247  * return < 0, error case.
4248  */
4249 int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
4250                         struct ext4_map_blocks *map, int flags)
4251 {
4252         struct ext4_ext_path *path = NULL;
4253         struct ext4_extent newex, *ex, *ex2;
4254         struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
4255         ext4_fsblk_t newblock = 0;
4256         int free_on_err = 0, err = 0, depth, ret;
4257         unsigned int allocated = 0, offset = 0;
4258         unsigned int allocated_clusters = 0;
4259         struct ext4_allocation_request ar;
4260         ext4_io_end_t *io = ext4_inode_aio(inode);
4261         ext4_lblk_t cluster_offset;
4262         int set_unwritten = 0;
4263
4264         ext_debug("blocks %u/%u requested for inode %lu\n",
4265                   map->m_lblk, map->m_len, inode->i_ino);
4266         trace_ext4_ext_map_blocks_enter(inode, map->m_lblk, map->m_len, flags);
4267
4268         /* find extent for this block */
4269         path = ext4_ext_find_extent(inode, map->m_lblk, NULL, 0);
4270         if (IS_ERR(path)) {
4271                 err = PTR_ERR(path);
4272                 path = NULL;
4273                 goto out2;
4274         }
4275
4276         depth = ext_depth(inode);
4277
4278         /*
4279          * A consistent leaf must not be empty;
4280          * this situation is possible, though, _during_ tree modification;
4281          * this is why the assert can't be put in ext4_ext_find_extent()
4282          */
4283         if (unlikely(path[depth].p_ext == NULL && depth != 0)) {
4284                 EXT4_ERROR_INODE(inode, "bad extent address "
4285                                  "lblock: %lu, depth: %d pblock %lld",
4286                                  (unsigned long) map->m_lblk, depth,
4287                                  path[depth].p_block);
4288                 err = -EIO;
4289                 goto out2;
4290         }
4291
4292         ex = path[depth].p_ext;
4293         if (ex) {
4294                 ext4_lblk_t ee_block = le32_to_cpu(ex->ee_block);
4295                 ext4_fsblk_t ee_start = ext4_ext_pblock(ex);
4296                 unsigned short ee_len;
4297
4299                 /*
4300                  * unwritten extents are treated as holes, except that
4301                  * we split out initialized portions during a write.
4302                  */
4303                 ee_len = ext4_ext_get_actual_len(ex);
4304
4305                 trace_ext4_ext_show_extent(inode, ee_block, ee_start, ee_len);
4306
4307                 /* if found extent covers block, simply return it */
4308                 if (in_range(map->m_lblk, ee_block, ee_len)) {
4309                         newblock = map->m_lblk - ee_block + ee_start;
4310                         /* number of remaining blocks in the extent */
4311                         allocated = ee_len - (map->m_lblk - ee_block);
4312                         ext_debug("%u fit into %u:%d -> %llu\n", map->m_lblk,
4313                                   ee_block, ee_len, newblock);
4314
4315                         /*
4316                          * If the extent is initialized check whether the
4317                          * caller wants to convert it to unwritten.
4318                          */
4319                         if ((!ext4_ext_is_unwritten(ex)) &&
4320                             (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN)) {
4321                                 allocated = convert_initialized_extent(
4322                                                 handle, inode, map, &path,
4323                                                 flags, allocated, newblock);
4324                                 goto out2;
4325                         } else if (!ext4_ext_is_unwritten(ex))
4326                                 goto out;
4327
4328                         ret = ext4_ext_handle_unwritten_extents(
4329                                 handle, inode, map, &path, flags,
4330                                 allocated, newblock);
4331                         if (ret < 0)
4332                                 err = ret;
4333                         else
4334                                 allocated = ret;
4335                         goto out2;
4336                 }
4337         }
4338
4339         if ((sbi->s_cluster_ratio > 1) &&
4340             ext4_find_delalloc_cluster(inode, map->m_lblk))
4341                 map->m_flags |= EXT4_MAP_FROM_CLUSTER;
4342
4343         /*
4344          * the requested block isn't allocated yet;
4345          * we can't try to create blocks if the create flag is zero
4346          */
4347         if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
4348                 /*
4349                  * put the just-found gap into the cache to speed up
4350                  * subsequent requests
4351                  */
4352                 if ((flags & EXT4_GET_BLOCKS_NO_PUT_HOLE) == 0)
4353                         ext4_ext_put_gap_in_cache(inode, path, map->m_lblk);
4354                 goto out2;
4355         }
4356
4357         /*
4358          * Okay, we need to do block allocation.
4359          */
4360         map->m_flags &= ~EXT4_MAP_FROM_CLUSTER;
4361         newex.ee_block = cpu_to_le32(map->m_lblk);
4362         cluster_offset = EXT4_LBLK_COFF(sbi, map->m_lblk);
4363
4364         /*
4365          * If we are doing bigalloc, check to see if the extent returned
4366          * by ext4_ext_find_extent() implies a cluster we can use.
4367          */
4368         if (cluster_offset && ex &&
4369             get_implied_cluster_alloc(inode->i_sb, map, ex, path)) {
4370                 ar.len = allocated = map->m_len;
4371                 newblock = map->m_pblk;
4372                 map->m_flags |= EXT4_MAP_FROM_CLUSTER;
4373                 goto got_allocated_blocks;
4374         }
4375
4376         /* find neighbour allocated blocks */
4377         ar.lleft = map->m_lblk;
4378         err = ext4_ext_search_left(inode, path, &ar.lleft, &ar.pleft);
4379         if (err)
4380                 goto out2;
4381         ar.lright = map->m_lblk;
4382         ex2 = NULL;
4383         err = ext4_ext_search_right(inode, path, &ar.lright, &ar.pright, &ex2);
4384         if (err)
4385                 goto out2;
4386
4387         /* Check if the extent after searching to the right implies a
4388          * cluster we can use. */
4389         if ((sbi->s_cluster_ratio > 1) && ex2 &&
4390             get_implied_cluster_alloc(inode->i_sb, map, ex2, path)) {
4391                 ar.len = allocated = map->m_len;
4392                 newblock = map->m_pblk;
4393                 map->m_flags |= EXT4_MAP_FROM_CLUSTER;
4394                 goto got_allocated_blocks;
4395         }
4396
4397         /*
4398          * See if request is beyond maximum number of blocks we can have in
4399          * a single extent. For an initialized extent this limit is
4400          * EXT_INIT_MAX_LEN and for an unwritten extent this limit is
4401          * EXT_UNWRITTEN_MAX_LEN.
4402          */
4403         if (map->m_len > EXT_INIT_MAX_LEN &&
4404             !(flags & EXT4_GET_BLOCKS_UNWRIT_EXT))
4405                 map->m_len = EXT_INIT_MAX_LEN;
4406         else if (map->m_len > EXT_UNWRITTEN_MAX_LEN &&
4407                  (flags & EXT4_GET_BLOCKS_UNWRIT_EXT))
4408                 map->m_len = EXT_UNWRITTEN_MAX_LEN;
4409
4410         /* Check if we can really insert (m_lblk)::(m_lblk + m_len) extent */
4411         newex.ee_len = cpu_to_le16(map->m_len);
4412         err = ext4_ext_check_overlap(sbi, inode, &newex, path);
4413         if (err)
4414                 allocated = ext4_ext_get_actual_len(&newex);
4415         else
4416                 allocated = map->m_len;
4417
4418         /* allocate new block */
4419         ar.inode = inode;
4420         ar.goal = ext4_ext_find_goal(inode, path, map->m_lblk);
4421         ar.logical = map->m_lblk;
4422         /*
4423          * We calculate the offset from the beginning of the cluster
4424          * for the logical block number, since when we allocate a
4425          * physical cluster, the physical block should start at the
4426          * same offset from the beginning of the cluster.  This is
4427          * needed so that future calls to get_implied_cluster_alloc()
4428          * work correctly.
4429          */
4430         offset = EXT4_LBLK_COFF(sbi, map->m_lblk);
4431         ar.len = EXT4_NUM_B2C(sbi, offset+allocated);
4432         ar.goal -= offset;
4433         ar.logical -= offset;
4434         if (S_ISREG(inode->i_mode))
4435                 ar.flags = EXT4_MB_HINT_DATA;
4436         else
4437                 /* disable in-core preallocation for non-regular files */
4438                 ar.flags = 0;
4439         if (flags & EXT4_GET_BLOCKS_NO_NORMALIZE)
4440                 ar.flags |= EXT4_MB_HINT_NOPREALLOC;
4441         newblock = ext4_mb_new_blocks(handle, &ar, &err);
4442         if (!newblock)
4443                 goto out2;
4444         ext_debug("allocate new block: goal %llu, found %llu/%u\n",
4445                   ar.goal, newblock, allocated);
4446         free_on_err = 1;
4447         allocated_clusters = ar.len;
4448         ar.len = EXT4_C2B(sbi, ar.len) - offset;
4449         if (ar.len > allocated)
4450                 ar.len = allocated;
4451
4452 got_allocated_blocks:
4453         /* try to insert new extent into found leaf and return */
4454         ext4_ext_store_pblock(&newex, newblock + offset);
4455         newex.ee_len = cpu_to_le16(ar.len);
4456         /* Mark unwritten */
4457         if (flags & EXT4_GET_BLOCKS_UNWRIT_EXT) {
4458                 ext4_ext_mark_unwritten(&newex);
4459                 map->m_flags |= EXT4_MAP_UNWRITTEN;
4460                 /*
4461                  * An io_end structure is created for every IO write to
4462                  * an unwritten extent. To avoid unnecessary conversion,
4463                  * here we flag the IO that really needs the conversion.
4464                  * For the non-async direct IO case, set the inode state
4465                  * flag so that we perform the conversion when IO is done.
4466                  */
4467                 if (flags & EXT4_GET_BLOCKS_PRE_IO)
4468                         set_unwritten = 1;
4469         }
4470
4471         err = 0;
4472         if ((flags & EXT4_GET_BLOCKS_KEEP_SIZE) == 0)
4473                 err = check_eofblocks_fl(handle, inode, map->m_lblk,
4474                                          path, ar.len);
4475         if (!err)
4476                 err = ext4_ext_insert_extent(handle, inode, &path,
4477                                              &newex, flags);
4478
4479         if (!err && set_unwritten) {
4480                 if (io)
4481                         ext4_set_io_unwritten_flag(inode, io);
4482                 else
4483                         ext4_set_inode_state(inode,
4484                                              EXT4_STATE_DIO_UNWRITTEN);
4485         }
4486
4487         if (err && free_on_err) {
4488                 int fb_flags = flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE ?
4489                         EXT4_FREE_BLOCKS_NO_QUOT_UPDATE : 0;
4490                 /* free data blocks we just allocated */
4491                 /* not a good idea to call discard here directly,
4492                  * but otherwise we'd need to call it every free() */
4493                 ext4_discard_preallocations(inode);
4494                 ext4_free_blocks(handle, inode, NULL, newblock,
4495                                  EXT4_C2B(sbi, allocated_clusters), fb_flags);
4496                 goto out2;
4497         }
4498
4499         /* previous routine could use block we allocated */
4500         newblock = ext4_ext_pblock(&newex);
4501         allocated = ext4_ext_get_actual_len(&newex);
4502         if (allocated > map->m_len)
4503                 allocated = map->m_len;
4504         map->m_flags |= EXT4_MAP_NEW;
4505
4506         /*
4507          * Update reserved blocks/metadata blocks after successful
4508          * block allocation which had been deferred till now.
4509          */
4510         if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) {
4511                 unsigned int reserved_clusters;
4512                 /*
4513                  * Check how many clusters we had reserved for this allocated range
4514                  */
4515                 reserved_clusters = get_reserved_cluster_alloc(inode,
4516                                                 map->m_lblk, allocated);
4517                 if (map->m_flags & EXT4_MAP_FROM_CLUSTER) {
4518                         if (reserved_clusters) {
4519                                 /*
4520                                  * We have clusters reserved for this range.
4521                                  * But since we are not doing actual allocation
4522                                  * and are simply using blocks from previously
4523                                  * allocated cluster, we should release the
4524                                  * reservation and not claim quota.
4525                                  */
4526                                 ext4_da_update_reserve_space(inode,
4527                                                 reserved_clusters, 0);
4528                         }
4529                 } else {
4530                         BUG_ON(allocated_clusters < reserved_clusters);
4531                         if (reserved_clusters < allocated_clusters) {
4532                                 struct ext4_inode_info *ei = EXT4_I(inode);
4533                                 int reservation = allocated_clusters -
4534                                                   reserved_clusters;
4535                                 /*
4536                                  * It seems we claimed a few clusters outside
4537                                  * the range of this allocation. We should give
4538                                  * them back to the reservation pool. This can
4539                                  * happen in the following case:
4540                                  *
4541                                  * * Suppose s_cluster_ratio is 4 (i.e., each
4542                                  *   cluster has 4 blocks). Thus, the clusters
4543                                  *   are [0-3],[4-7],[8-11]...
4544                                  * * First comes a delayed allocation write for
4545                                  *   logical blocks 10 & 11. Since there were no
4546                                  *   previous delayed allocated blocks in the
4547                                  *   range [8-11], we would reserve 1 cluster
4548                                  *   for this write.
4549                                  * * Next comes a write for logical blocks 3 to 8.
4550                                  *   In this case, we will reserve 2 clusters
4551                                  *   (for [0-3] and [4-7], and not for [8-11] as
4552                                  *   that range already has delayed allocated
4553                                  *   blocks). Total reserved clusters is now 3.
4554                                  * * Now, during the delayed allocation writeout
4555                                  *   time, we will first write blocks [3-8] and
4556                                  *   allocate 3 clusters for writing these
4557                                  *   blocks. Also, we would claim all these
4558                                  *   three clusters above.
4559                                  * * Now when we come here to write out the
4560                                  *   blocks [10-11], we would expect to claim
4561                                  *   the reservation of 1 cluster we had made
4562                                  *   (and we would claim it since there are no
4563                                  *   more delayed allocated blocks in the range
4564                                  *   [8-11]). But our reserved cluster count had
4565                                  *   already gone to 0.
4566                                  *
4567                                  *   Thus, at step 4 above, when we determine
4568                                  *   that there are still some unwritten delayed
4569                                  *   allocated blocks outside of our current
4570                                  *   block range, we should increment the
4571                                  *   reserved clusters count so that when the
4572                                  *   remaining blocks finally get written, we
4573                                  *   can claim them.
4574                                  */
4575                                 dquot_reserve_block(inode,
4576                                                 EXT4_C2B(sbi, reservation));
4577                                 spin_lock(&ei->i_block_reservation_lock);
4578                                 ei->i_reserved_data_blocks += reservation;
4579                                 spin_unlock(&ei->i_block_reservation_lock);
4580                         }
4581                         /*
4582                          * We will claim quota for all newly allocated blocks.
4583                          * We're updating the reserved space *after* the
4584                          * correction above so we do not accidentally free
4585                          * all the metadata reservation because we might
4586                          * actually need it later on.
4587                          */
4588                         ext4_da_update_reserve_space(inode, allocated_clusters,
4589                                                         1);
4590                 }
4591         }
4592
4593         /*
4594          * Cache the extent and update transaction to commit on fdatasync only
4595          * when it is _not_ an unwritten extent.
4596          */
4597         if ((flags & EXT4_GET_BLOCKS_UNWRIT_EXT) == 0)
4598                 ext4_update_inode_fsync_trans(handle, inode, 1);
4599         else
4600                 ext4_update_inode_fsync_trans(handle, inode, 0);
4601 out:
4602         if (allocated > map->m_len)
4603                 allocated = map->m_len;
4604         ext4_ext_show_leaf(inode, path);
4605         map->m_flags |= EXT4_MAP_MAPPED;
4606         map->m_pblk = newblock;
4607         map->m_len = allocated;
4608 out2:
4609         ext4_ext_drop_refs(path);
4610         kfree(path);
4611
4612         trace_ext4_ext_map_blocks_exit(inode, flags, map,
4613                                        err ? err : allocated);
4614         ext4_es_lru_add(inode);
4615         return err ? err : allocated;
4616 }
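
/*
 * A minimal caller-side sketch of the return convention documented above
 * (illustrative only, not a real in-tree caller; most callers go through
 * ext4_map_blocks(), which wraps this function):
 *
 *      struct ext4_map_blocks map = { .m_lblk = first, .m_len = count };
 *      int ret = ext4_map_blocks(handle, inode, &map, flags);
 *
 *      if (ret < 0)
 *              return ret;     // error
 *      else if (ret == 0)
 *              ...             // hole: nothing mapped
 *      else
 *              ...             // map.m_pblk now maps ret blocks
 */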
4617
4618 void ext4_ext_truncate(handle_t *handle, struct inode *inode)
4619 {
4620         struct super_block *sb = inode->i_sb;
4621         ext4_lblk_t last_block;
4622         int err = 0;
4623
4624         /*
4625          * TODO: optimization is possible here.
4626          * Probably we need not scan at all,
4627          * because page truncation is enough.
4628          */
4629
4630         /* we have to know where to truncate from in crash case */
4631         EXT4_I(inode)->i_disksize = inode->i_size;
4632         ext4_mark_inode_dirty(handle, inode);
4633
4634         last_block = (inode->i_size + sb->s_blocksize - 1)
4635                         >> EXT4_BLOCK_SIZE_BITS(sb);
4636 retry:
4637         err = ext4_es_remove_extent(inode, last_block,
4638                                     EXT_MAX_BLOCKS - last_block);
4639         if (err == -ENOMEM) {
4640                 cond_resched();
4641                 congestion_wait(BLK_RW_ASYNC, HZ/50);
4642                 goto retry;
4643         }
4644         if (err) {
4645                 ext4_std_error(inode->i_sb, err);
4646                 return;
4647         }
4648         err = ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCKS - 1);
4649         ext4_std_error(inode->i_sb, err);
4650 }
4651
4652 static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset,
4653                                   ext4_lblk_t len, loff_t new_size,
4654                                   int flags, int mode)
4655 {
4656         struct inode *inode = file_inode(file);
4657         handle_t *handle;
4658         int ret = 0;
4659         int ret2 = 0;
4660         int retries = 0;
4661         struct ext4_map_blocks map;
4662         unsigned int credits;
4663         loff_t epos;
4664
4665         map.m_lblk = offset;
4666         map.m_len = len;
4667         /*
4668          * Don't normalize the request if it can fit in one extent so
4669          * that it doesn't get unnecessarily split into multiple
4670          * extents.
4671          */
4672         if (len <= EXT_UNWRITTEN_MAX_LEN)
4673                 flags |= EXT4_GET_BLOCKS_NO_NORMALIZE;
4674
4675         /*
4676          * credits to insert 1 extent into extent tree
4677          */
4678         credits = ext4_chunk_trans_blocks(inode, len);
4679
4680 retry:
4681         while (ret >= 0 && len) {
4682                 handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS,
4683                                             credits);
4684                 if (IS_ERR(handle)) {
4685                         ret = PTR_ERR(handle);
4686                         break;
4687                 }
4688                 ret = ext4_map_blocks(handle, inode, &map, flags);
4689                 if (ret <= 0) {
4690                         ext4_debug("inode #%lu: block %u: len %u: "
4691                                    "ext4_ext_map_blocks returned %d",
4692                                    inode->i_ino, map.m_lblk,
4693                                    map.m_len, ret);
4694                         ext4_mark_inode_dirty(handle, inode);
4695                         ret2 = ext4_journal_stop(handle);
4696                         break;
4697                 }
4698                 map.m_lblk += ret;
4699                 map.m_len = len = len - ret;
4700                 epos = (loff_t)map.m_lblk << inode->i_blkbits;
4701                 inode->i_ctime = ext4_current_time(inode);
4702                 if (new_size) {
4703                         if (epos > new_size)
4704                                 epos = new_size;
4705                         if (ext4_update_inode_size(inode, epos) & 0x1)
4706                                 inode->i_mtime = inode->i_ctime;
4707                 } else {
4708                         if (epos > inode->i_size)
4709                                 ext4_set_inode_flag(inode,
4710                                                     EXT4_INODE_EOFBLOCKS);
4711                 }
4712                 ext4_mark_inode_dirty(handle, inode);
4713                 ret2 = ext4_journal_stop(handle);
4714                 if (ret2)
4715                         break;
4716         }
4717         if (ret == -ENOSPC &&
4718                         ext4_should_retry_alloc(inode->i_sb, &retries)) {
4719                 ret = 0;
4720                 goto retry;
4721         }
4722
4723         return ret > 0 ? ret2 : ret;
4724 }
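
/*
 * The -ENOSPC handling above follows the usual ext4 retry idiom, shown
 * here in isolation (an illustrative sketch; <allocate> stands for any
 * block-consuming operation):
 *
 *      int retries = 0;
 * retry:
 *      ret = <allocate>;
 *      if (ret == -ENOSPC &&
 *          ext4_should_retry_alloc(inode->i_sb, &retries)) {
 *              ret = 0;        // a journal commit may have freed space
 *              goto retry;
 *      }
 */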
4725
4726 static long ext4_zero_range(struct file *file, loff_t offset,
4727                             loff_t len, int mode)
4728 {
4729         struct inode *inode = file_inode(file);
4730         handle_t *handle = NULL;
4731         unsigned int max_blocks;
4732         loff_t new_size = 0;
4733         int ret = 0;
4734         int flags;
4735         int credits;
4736         int partial_begin, partial_end;
4737         loff_t start, end;
4738         ext4_lblk_t lblk;
4739         struct address_space *mapping = inode->i_mapping;
4740         unsigned int blkbits = inode->i_blkbits;
4741
4742         trace_ext4_zero_range(inode, offset, len, mode);
4743
4744         if (!S_ISREG(inode->i_mode))
4745                 return -EINVAL;
4746
4747         /* Call ext4_force_commit to flush all data in case of data=journal. */
4748         if (ext4_should_journal_data(inode)) {
4749                 ret = ext4_force_commit(inode->i_sb);
4750                 if (ret)
4751                         return ret;
4752         }
4753
4754         /*
4755          * Write out all dirty pages to avoid race conditions
4756          * Then release them.
4757          */
4758         if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
4759                 ret = filemap_write_and_wait_range(mapping, offset,
4760                                                    offset + len - 1);
4761                 if (ret)
4762                         return ret;
4763         }
4764
4765         /*
4766          * Round up offset. This is not fallocate, we need to zero out
4767          * blocks, so convert the interior block-aligned part of the range
4768          * to unwritten and possibly manually zero out the unaligned parts
4769          * of the range.
4770          */
4771         start = round_up(offset, 1 << blkbits);
4772         end = round_down((offset + len), 1 << blkbits);
4773
4774         if (start < offset || end > offset + len)
4775                 return -EINVAL;
4776         partial_begin = offset & ((1 << blkbits) - 1);
4777         partial_end = (offset + len) & ((1 << blkbits) - 1);
4778
4779         lblk = start >> blkbits;
4780         max_blocks = (end >> blkbits);
4781         if (max_blocks < lblk)
4782                 max_blocks = 0;
4783         else
4784                 max_blocks -= lblk;
4785
4786         flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT |
4787                 EXT4_GET_BLOCKS_CONVERT_UNWRITTEN |
4788                 EXT4_EX_NOCACHE;
4789         if (mode & FALLOC_FL_KEEP_SIZE)
4790                 flags |= EXT4_GET_BLOCKS_KEEP_SIZE;
4791
4792         mutex_lock(&inode->i_mutex);
4793
4794         /*
4795          * Indirect files do not support unwritten extents
4796          */
4797         if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
4798                 ret = -EOPNOTSUPP;
4799                 goto out_mutex;
4800         }
4801
4802         if (!(mode & FALLOC_FL_KEEP_SIZE) &&
4803              offset + len > i_size_read(inode)) {
4804                 new_size = offset + len;
4805                 ret = inode_newsize_ok(inode, new_size);
4806                 if (ret)
4807                         goto out_mutex;
4808                 /*
4809                  * If we have a partial block after EOF we have to allocate
4810                  * the entire block.
4811                  */
4812                 if (partial_end)
4813                         max_blocks += 1;
4814         }
4815
4816         if (max_blocks > 0) {
4817
4818                 /* Now release the pages and zero the block-aligned part of them */
4819                 truncate_pagecache_range(inode, start, end - 1);
4820                 inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
4821
4822                 /* Wait for all existing DIO workers; newcomers will block on i_mutex */
4823                 ext4_inode_block_unlocked_dio(inode);
4824                 inode_dio_wait(inode);
4825
4826                 ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size,
4827                                              flags, mode);
4828                 if (ret)
4829                         goto out_dio;
4830                 /*
4831                  * Remove entire range from the extent status tree.
4832                  *
4833                  * ext4_es_remove_extent(inode, lblk, max_blocks) is
4834                  * NOT sufficient.  I'm not sure why this is the case,
4835                  * but let's be conservative and remove the extent
4836                  * status tree for the entire inode.  There should be
4837                  * no outstanding delalloc extents thanks to the
4838                  * filemap_write_and_wait_range() call above.
4839                  */
4840                 ret = ext4_es_remove_extent(inode, 0, EXT_MAX_BLOCKS);
4841                 if (ret)
4842                         goto out_dio;
4843         }
4844         if (!partial_begin && !partial_end)
4845                 goto out_dio;
4846
4847         /*
4848          * In the worst case we have to write out two non-adjacent unwritten
4849          * blocks and update the inode.
4850          */
4851         credits = (2 * ext4_ext_index_trans_blocks(inode, 2)) + 1;
4852         if (ext4_should_journal_data(inode))
4853                 credits += 2;
4854         handle = ext4_journal_start(inode, EXT4_HT_MISC, credits);
4855         if (IS_ERR(handle)) {
4856                 ret = PTR_ERR(handle);
4857                 ext4_std_error(inode->i_sb, ret);
4858                 goto out_dio;
4859         }
4860
4861         inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
4862         if (new_size) {
4863                 ext4_update_inode_size(inode, new_size);
4864         } else {
4865                 /*
4866                  * Mark that we allocate beyond EOF so the subsequent truncate
4867                  * can proceed even if the new size is the same as i_size.
4868                  */
4869                 if ((offset + len) > i_size_read(inode))
4870                         ext4_set_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
4871         }
4872         ext4_mark_inode_dirty(handle, inode);
4873
4874         /* Zero out partial block at the edges of the range */
4875         ret = ext4_zero_partial_blocks(handle, inode, offset, len);
4876
4877         if (file->f_flags & O_SYNC)
4878                 ext4_handle_sync(handle);
4879
4880         ext4_journal_stop(handle);
4881 out_dio:
4882         ext4_inode_resume_unlocked_dio(inode);
4883 out_mutex:
4884         mutex_unlock(&inode->i_mutex);
4885         return ret;
4886 }
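
/*
 * Userspace view of the above (an illustrative sketch compiled outside
 * the kernel; fd is an assumed open, writable file descriptor):
 *
 *      #define _GNU_SOURCE
 *      #include <fcntl.h>
 *      #include <linux/falloc.h>
 *
 *      // zero bytes [offset, offset + len) without changing i_size
 *      int err = fallocate(fd, FALLOC_FL_ZERO_RANGE | FALLOC_FL_KEEP_SIZE,
 *                          offset, len);
 *
 * The interior block-aligned part of the range becomes unwritten extents;
 * the partial first/last blocks are zeroed in place by
 * ext4_zero_partial_blocks().
 */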
4887
4888 /*
4889  * Preallocate space for a file. This implements ext4's fallocate file
4890  * operation, which gets called from the sys_fallocate system call.
4891  * For block-mapped files, posix_fallocate should fall back to the method
4892  * of writing zeroes to the required new blocks (the same behavior that is
4893  * expected for file systems which do not support the fallocate() system call).
4894  */
4895 long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
4896 {
4897         struct inode *inode = file_inode(file);
4898         loff_t new_size = 0;
4899         unsigned int max_blocks;
4900         int ret = 0;
4901         int flags;
4902         ext4_lblk_t lblk;
4903         unsigned int blkbits = inode->i_blkbits;
4904
4905         /* Return error if mode is not supported */
4906         if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
4907                      FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE))
4908                 return -EOPNOTSUPP;
4909
4910         if (mode & FALLOC_FL_PUNCH_HOLE)
4911                 return ext4_punch_hole(inode, offset, len);
4912
4913         ret = ext4_convert_inline_data(inode);
4914         if (ret)
4915                 return ret;
4916
4917         /*
4918          * currently supporting (pre)allocate mode for extent-based
4919          * files _only_
4920          */
4921         if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
4922                 return -EOPNOTSUPP;
4923
4924         if (mode & FALLOC_FL_COLLAPSE_RANGE)
4925                 return ext4_collapse_range(inode, offset, len);
4926
4927         if (mode & FALLOC_FL_ZERO_RANGE)
4928                 return ext4_zero_range(file, offset, len, mode);
4929
4930         trace_ext4_fallocate_enter(inode, offset, len, mode);
4931         lblk = offset >> blkbits;
4932         /*
4933          * We can't just convert len to max_blocks: with blocksize = 4096,
4934          * offset = 3072 and len = 2048 the byte range still covers two blocks.
4935          */
4936         max_blocks = (EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits)
4937                 - lblk;
4938
4939         flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT;
4940         if (mode & FALLOC_FL_KEEP_SIZE)
4941                 flags |= EXT4_GET_BLOCKS_KEEP_SIZE;
4942
4943         mutex_lock(&inode->i_mutex);
4944
4945         if (!(mode & FALLOC_FL_KEEP_SIZE) &&
4946              offset + len > i_size_read(inode)) {
4947                 new_size = offset + len;
4948                 ret = inode_newsize_ok(inode, new_size);
4949                 if (ret)
4950                         goto out;
4951         }
4952
4953         ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size,
4954                                      flags, mode);
4955         if (ret)
4956                 goto out;
4957
4958         if (file->f_flags & O_SYNC && EXT4_SB(inode->i_sb)->s_journal) {
4959                 ret = jbd2_complete_transaction(EXT4_SB(inode->i_sb)->s_journal,
4960                                                 EXT4_I(inode)->i_sync_tid);
4961         }
4962 out:
4963         mutex_unlock(&inode->i_mutex);
4964         trace_ext4_fallocate_exit(inode, offset, max_blocks, ret);
4965         return ret;
4966 }
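
/*
 * Userspace preallocation example for the default path above (an
 * illustrative sketch; fd is an assumed open, writable descriptor):
 *
 *      #define _GNU_SOURCE
 *      #include <fcntl.h>
 *      #include <linux/falloc.h>
 *
 *      // preallocate 1 MiB at offset 0; mode 0 also grows i_size
 *      int err = fallocate(fd, 0, 0, 1 << 20);
 *
 *      // preallocate another 1 MiB past EOF, keeping i_size unchanged
 *      err = fallocate(fd, FALLOC_FL_KEEP_SIZE, 1 << 20, 1 << 20);
 *
 * Either way the range is allocated as unwritten extents, so reads
 * return zeroes until the blocks are actually written.
 */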
4967
4968 /*
4969  * This function converts a range of blocks to written extents.
4970  * The caller of this function will pass the start offset and the size.
4971  * All unwritten extents within this range will be converted to
4972  * written extents.
4973  *
4974  * This function is called from the direct IO end_io callback
4975  * function, to convert the fallocated extents after IO is completed.
4976  * Returns 0 on success.
4977  */
4978 int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode,
4979                                    loff_t offset, ssize_t len)
4980 {
4981         unsigned int max_blocks;
4982         int ret = 0;
4983         int ret2 = 0;
4984         struct ext4_map_blocks map;
4985         unsigned int credits, blkbits = inode->i_blkbits;
4986
4987         map.m_lblk = offset >> blkbits;
4988         /*
4989          * We can't just convert len to max_blocks: with blocksize = 4096,
4990          * offset = 3072 and len = 2048 the byte range still covers two blocks.
4991          */
4992         max_blocks = ((EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits) -
4993                       map.m_lblk);
4994         /*
4995          * This is somewhat ugly but the idea is clear: when a transaction is
4996          * reserved, everything goes into it. Otherwise we'd rather start several
4997          * smaller transactions for converting each extent separately.
4998          */
4999         if (handle) {
5000                 handle = ext4_journal_start_reserved(handle,
5001                                                      EXT4_HT_EXT_CONVERT);
5002                 if (IS_ERR(handle))
5003                         return PTR_ERR(handle);
5004                 credits = 0;
5005         } else {
5006                 /*
5007                  * credits to insert 1 extent into extent tree
5008                  */
5009                 credits = ext4_chunk_trans_blocks(inode, max_blocks);
5010         }
5011         while (ret >= 0 && ret < max_blocks) {
5012                 map.m_lblk += ret;
5013                 map.m_len = (max_blocks -= ret);
5014                 if (credits) {
5015                         handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS,
5016                                                     credits);
5017                         if (IS_ERR(handle)) {
5018                                 ret = PTR_ERR(handle);
5019                                 break;
5020                         }
5021                 }
5022                 ret = ext4_map_blocks(handle, inode, &map,
5023                                       EXT4_GET_BLOCKS_IO_CONVERT_EXT);
5024                 if (ret <= 0)
5025                         ext4_warning(inode->i_sb,
5026                                      "inode #%lu: block %u: len %u: "
5027                                      "ext4_ext_map_blocks returned %d",
5028                                      inode->i_ino, map.m_lblk,
5029                                      map.m_len, ret);
5030                 ext4_mark_inode_dirty(handle, inode);
5031                 if (credits)
5032                         ret2 = ext4_journal_stop(handle);
5033                 if (ret <= 0 || ret2)
5034                         break;
5035         }
5036         if (!credits)
5037                 ret2 = ext4_journal_stop(handle);
5038         return ret > 0 ? ret2 : ret;
5039 }
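
/*
 * Worked example of the max_blocks computation above (illustrative
 * numbers; assumes blocksize = 4096, i.e. blkbits = 12):
 *
 *      offset = 3072, len = 2048
 *      map.m_lblk = 3072 >> 12                    = 0
 *      EXT4_BLOCK_ALIGN(3072 + 2048, 12) >> 12    = 2
 *      max_blocks = 2 - 0                         = 2
 *
 * whereas the naive len >> blkbits would give 0 even though the byte
 * range touches two blocks.
 */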
5040
5041 /*
5042  * If newes is not an existing extent (newes->es_pblk equals zero), find
5043  * the delayed extent at the start of newes, update newes accordingly and
5044  * return the start of the next delayed extent.
5045  *
5046  * If newes is an existing extent (newes->es_pblk is not equal to zero),
5047  * return the start of the next delayed extent, or EXT_MAX_BLOCKS if no
5048  * delayed extent is found. Leave newes unmodified.
5049  */
5050 static int ext4_find_delayed_extent(struct inode *inode,
5051                                     struct extent_status *newes)
5052 {
5053         struct extent_status es;
5054         ext4_lblk_t block, next_del;
5055
5056         if (newes->es_pblk == 0) {
5057                 ext4_es_find_delayed_extent_range(inode, newes->es_lblk,
5058                                 newes->es_lblk + newes->es_len - 1, &es);
5059
5060                 /*
5061                  * No extent in the extent tree contains block @newes->es_pblk,
5062                  * so the block may lie in 1) a hole or 2) a delayed extent.
5063                  */
5064                 if (es.es_len == 0)
5065                         /* A hole found. */
5066                         return 0;
5067
5068                 if (es.es_lblk > newes->es_lblk) {
5069                         /* A hole found. */
5070                         newes->es_len = min(es.es_lblk - newes->es_lblk,
5071                                             newes->es_len);
5072                         return 0;
5073                 }
5074
5075                 newes->es_len = es.es_lblk + es.es_len - newes->es_lblk;
5076         }
5077
5078         block = newes->es_lblk + newes->es_len;
5079         ext4_es_find_delayed_extent_range(inode, block, EXT_MAX_BLOCKS, &es);
5080         if (es.es_len == 0)
5081                 next_del = EXT_MAX_BLOCKS;
5082         else
5083                 next_del = es.es_lblk;
5084
5085         return next_del;
5086 }
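
/*
 * Illustrative walk-through of ext4_find_delayed_extent() for a
 * not-yet-existing extent (assumed numbers only):
 *
 *      newes = { .es_lblk = 10, .es_len = 20, .es_pblk = 0 }
 *
 *      - nothing delayed in [10, 29]: a hole, 0 is returned;
 *      - a delayed extent starts at 15: newes->es_len is clipped to the
 *        15 - 10 = 5 blocks of hole and 0 is returned;
 *      - a delayed extent covers block 10: newes grows to its end, and
 *        the start of the next delayed extent (or EXT_MAX_BLOCKS) is
 *        returned.
 */
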
5087 /* the fiemap flags we can handle are specified here */
5088 #define EXT4_FIEMAP_FLAGS       (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR)
5089
5090 static int ext4_xattr_fiemap(struct inode *inode,
5091                                 struct fiemap_extent_info *fieinfo)
5092 {
5093         __u64 physical = 0;
5094         __u64 length;
5095         __u32 flags = FIEMAP_EXTENT_LAST;
5096         int blockbits = inode->i_sb->s_blocksize_bits;
5097         int error = 0;
5098
5099         /* in-inode? */
5100         if (ext4_test_inode_state(inode, EXT4_STATE_XATTR)) {
5101                 struct ext4_iloc iloc;
5102                 int offset;     /* offset of xattr in inode */
5103
5104                 error = ext4_get_inode_loc(inode, &iloc);
5105                 if (error)
5106                         return error;
5107                 physical = (__u64)iloc.bh->b_blocknr << blockbits;
5108                 offset = EXT4_GOOD_OLD_INODE_SIZE +
5109                                 EXT4_I(inode)->i_extra_isize;
5110                 physical += offset;
5111                 length = EXT4_SB(inode->i_sb)->s_inode_size - offset;
5112                 flags |= FIEMAP_EXTENT_DATA_INLINE;
5113                 brelse(iloc.bh);
5114         } else { /* external block */
5115                 physical = (__u64)EXT4_I(inode)->i_file_acl << blockbits;
5116                 length = inode->i_sb->s_blocksize;
5117         }
5118
5119         if (physical)
5120                 error = fiemap_fill_next_extent(fieinfo, 0, physical,
5121                                                 length, flags);
5122         return (error < 0 ? error : 0);
5123 }
5124
5125 int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
5126                 __u64 start, __u64 len)
5127 {
5128         ext4_lblk_t start_blk;
5129         int error = 0;
5130
5131         if (ext4_has_inline_data(inode)) {
5132                 int has_inline = 1;
5133
5134                 error = ext4_inline_data_fiemap(inode, fieinfo, &has_inline);
5135
5136                 if (has_inline)
5137                         return error;
5138         }
5139
5140         if (fieinfo->fi_flags & FIEMAP_FLAG_CACHE) {
5141                 error = ext4_ext_precache(inode);
5142                 if (error)
5143                         return error;
5144         }
5145
5146         /* fall back to the generic implementation if not in extents format */
5147         if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
5148                 return generic_block_fiemap(inode, fieinfo, start, len,
5149                         ext4_get_block);
5150
5151         if (fiemap_check_flags(fieinfo, EXT4_FIEMAP_FLAGS))
5152                 return -EBADR;
5153
5154         if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) {
5155                 error = ext4_xattr_fiemap(inode, fieinfo);
5156         } else {
5157                 ext4_lblk_t len_blks;
5158                 __u64 last_blk;
5159
5160                 start_blk = start >> inode->i_sb->s_blocksize_bits;
5161                 last_blk = (start + len - 1) >> inode->i_sb->s_blocksize_bits;
5162                 if (last_blk >= EXT_MAX_BLOCKS)
5163                         last_blk = EXT_MAX_BLOCKS-1;
5164                 len_blks = ((ext4_lblk_t) last_blk) - start_blk + 1;
5165
5166                 /*
5167                  * Walk the extent tree gathering extent information
5168                  * and pushing extents back to the user.
5169                  */
5170                 error = ext4_fill_fiemap_extents(inode, start_blk,
5171                                                  len_blks, fieinfo);
5172         }
5173         ext4_es_lru_add(inode);
5174         return error;
5175 }
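
/*
 * Userspace side of the FIEMAP interface served above (an illustrative
 * sketch; error handling omitted, fd is an assumed open descriptor):
 *
 *      #include <sys/ioctl.h>
 *      #include <linux/fs.h>
 *      #include <linux/fiemap.h>
 *
 *      char buf[sizeof(struct fiemap) +
 *               16 * sizeof(struct fiemap_extent)] = { 0 };
 *      struct fiemap *fm = (struct fiemap *)buf;
 *
 *      fm->fm_start = 0;
 *      fm->fm_length = FIEMAP_MAX_OFFSET;      // map the whole file
 *      fm->fm_flags = FIEMAP_FLAG_SYNC;        // flush data out first
 *      fm->fm_extent_count = 16;
 *      ioctl(fd, FS_IOC_FIEMAP, fm);
 *      // fm->fm_mapped_extents entries are now in fm->fm_extents[]
 */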
5176
5177 /*
5178  * ext4_access_path:
5179  * Function to access the path buffer for marking it dirty.
5180  * It also checks if there are sufficient credits left in the journal handle
5181  * to update path.
5182  * to update the path.
5183 static int
5184 ext4_access_path(handle_t *handle, struct inode *inode,
5185                 struct ext4_ext_path *path)
5186 {
5187         int credits, err;
5188
5189         if (!ext4_handle_valid(handle))
5190                 return 0;
5191
5192         /*
5193          * Check if we need to extend the journal credits:
5194          * 3 for the leaf, sb, and inode plus 2 (bitmap and group
5195          * descriptor) for each block group; assume two block
5196          * groups.
5197          */
5198         if (handle->h_buffer_credits < 7) {
5199                 credits = ext4_writepage_trans_blocks(inode);
5200                 err = ext4_ext_truncate_extend_restart(handle, inode, credits);
5201                 /* EAGAIN is success */
5202                 if (err && err != -EAGAIN)
5203                         return err;
5204         }
5205
5206         err = ext4_ext_get_access(handle, inode, path);
5207         return err;
5208 }
5209
5210 /*
5211  * ext4_ext_shift_path_extents:
5212  * Shift the extents of a path structure lying between path[depth].p_ext
5213  * and EXT_LAST_EXTENT(path[depth].p_hdr) downwards, by subtracting shift
5214  * from the starting block of each extent.
5215  */
5216 static int
5217 ext4_ext_shift_path_extents(struct ext4_ext_path *path, ext4_lblk_t shift,
5218                             struct inode *inode, handle_t *handle,
5219                             ext4_lblk_t *start)
5220 {
5221         int depth, err = 0;
5222         struct ext4_extent *ex_start, *ex_last;
5223         bool update = false;
5224         depth = path->p_depth;
5225
5226         while (depth >= 0) {
5227                 if (depth == path->p_depth) {
5228                         ex_start = path[depth].p_ext;
5229                         if (!ex_start)
5230                                 return -EIO;
5231
5232                         ex_last = EXT_LAST_EXTENT(path[depth].p_hdr);
5233                         if (!ex_last)
5234                                 return -EIO;
5235
5236                         err = ext4_access_path(handle, inode, path + depth);
5237                         if (err)
5238                                 goto out;
5239
5240                         if (ex_start == EXT_FIRST_EXTENT(path[depth].p_hdr))
5241                                 update = 1;
5242
5243                         *start = le32_to_cpu(ex_last->ee_block) +
5244                                 ext4_ext_get_actual_len(ex_last);
5245
5246                         while (ex_start <= ex_last) {
5247                                 le32_add_cpu(&ex_start->ee_block, -shift);
5248                                 /* Try to merge to the left. */
5249                                 if ((ex_start >
5250                                      EXT_FIRST_EXTENT(path[depth].p_hdr)) &&
5251                                     ext4_ext_try_to_merge_right(inode,
5252                                                         path, ex_start - 1))
5253                                         ex_last--;
5254                                 else
5255                                         ex_start++;
5256                         }
5257                         err = ext4_ext_dirty(handle, inode, path + depth);
5258                         if (err)
5259                                 goto out;
5260
5261                         if (--depth < 0 || !update)
5262                                 break;
5263                 }
5264
5265                 /* Update index too */
5266                 err = ext4_access_path(handle, inode, path + depth);
5267                 if (err)
5268                         goto out;
5269
5270                 le32_add_cpu(&path[depth].p_idx->ei_block, -shift);
5271                 err = ext4_ext_dirty(handle, inode, path + depth);
5272                 if (err)
5273                         goto out;
5274
                /* we are done if the current index is not the first one in its block */
5276                 if (path[depth].p_idx != EXT_FIRST_INDEX(path[depth].p_hdr))
5277                         break;
5278
5279                 depth--;
5280         }
5281
5282 out:
5283         return err;
5284 }
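
/*
 * Worked example (illustrative, made-up block numbers): for a leaf
 * holding extents [100..109] and [200..219] and shift == 50, the loop
 * above rewrites them to [50..59] and [150..169]. Since the first
 * extent of the leaf moved, "update" is set and the parent index
 * entries are decremented by the same shift on the way up the tree;
 * extents that become contiguous after the shift are merged via
 * ext4_ext_try_to_merge_right().
 */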
5285
5286 /*
5287  * ext4_ext_shift_extents:
 * All extents lying in the range from @start to the last allocated block
 * of the file are shifted downwards by @shift blocks.
 * Returns 0 on success, an error code otherwise.
5291  */
5292 static int
5293 ext4_ext_shift_extents(struct inode *inode, handle_t *handle,
5294                        ext4_lblk_t start, ext4_lblk_t shift)
5295 {
5296         struct ext4_ext_path *path;
5297         int ret = 0, depth;
5298         struct ext4_extent *extent;
5299         ext4_lblk_t stop_block;
5300         ext4_lblk_t ex_start, ex_end;
5301
5302         /* Let path point to the last extent */
5303         path = ext4_ext_find_extent(inode, EXT_MAX_BLOCKS - 1, NULL, 0);
5304         if (IS_ERR(path))
5305                 return PTR_ERR(path);
5306
5307         depth = path->p_depth;
5308         extent = path[depth].p_ext;
5309         if (!extent) {
5310                 ext4_ext_drop_refs(path);
5311                 kfree(path);
5312                 return ret;
5313         }
5314
5315         stop_block = le32_to_cpu(extent->ee_block) +
5316                         ext4_ext_get_actual_len(extent);
5317         ext4_ext_drop_refs(path);
5318         kfree(path);
5319
        /* Nothing to shift if the hole is at the end of the file */
5321         if (start >= stop_block)
5322                 return ret;
5323
5324         /*
5325          * Don't start shifting extents until we make sure the hole is big
         * enough to accommodate the shift.
5327          */
5328         path = ext4_ext_find_extent(inode, start - 1, NULL, 0);
5329         if (IS_ERR(path))
5330                 return PTR_ERR(path);
5331         depth = path->p_depth;
        extent = path[depth].p_ext;
5333         if (extent) {
5334                 ex_start = le32_to_cpu(extent->ee_block);
5335                 ex_end = le32_to_cpu(extent->ee_block) +
5336                         ext4_ext_get_actual_len(extent);
5337         } else {
5338                 ex_start = 0;
5339                 ex_end = 0;
5340         }
5341         ext4_ext_drop_refs(path);
5342         kfree(path);
5343
5344         if ((start == ex_start && shift > ex_start) ||
5345             (shift > start - ex_end))
5346                 return -EINVAL;
5347
        /* It's safe to start updating extents */
5349         while (start < stop_block) {
5350                 path = ext4_ext_find_extent(inode, start, NULL, 0);
5351                 if (IS_ERR(path))
5352                         return PTR_ERR(path);
5353                 depth = path->p_depth;
5354                 extent = path[depth].p_ext;
                if (!extent) {
                        EXT4_ERROR_INODE(inode, "unexpected hole at %lu",
                                         (unsigned long) start);
                        /* don't leak the path on this error exit */
                        ext4_ext_drop_refs(path);
                        kfree(path);
                        return -EIO;
                }
5360                 if (start > le32_to_cpu(extent->ee_block)) {
5361                         /* Hole, move to the next extent */
5362                         if (extent < EXT_LAST_EXTENT(path[depth].p_hdr)) {
5363                                 path[depth].p_ext++;
5364                         } else {
5365                                 start = ext4_ext_next_allocated_block(path);
5366                                 ext4_ext_drop_refs(path);
5367                                 kfree(path);
5368                                 continue;
5369                         }
5370                 }
5371                 ret = ext4_ext_shift_path_extents(path, shift, inode,
5372                                 handle, &start);
5373                 ext4_ext_drop_refs(path);
5374                 kfree(path);
5375                 if (ret)
5376                         break;
5377         }
5378
5379         return ret;
5380 }
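
/*
 * Worked example of the hole-size check above (illustrative numbers):
 * if the extent to the left of the hole ends at block ex_end == 100
 * and the region to be shifted begins at start == 120, then any
 * shift > 20 would make the shifted extents collide with that extent,
 * so -EINVAL is returned before the tree is touched.
 */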
5381
5382 /*
5383  * ext4_collapse_range:
 * This implements the collapse-range functionality of fallocate for ext4.
 * Returns 0 on success, non-zero on error.
5386  */
5387 int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
5388 {
5389         struct super_block *sb = inode->i_sb;
5390         ext4_lblk_t punch_start, punch_stop;
5391         handle_t *handle;
5392         unsigned int credits;
5393         loff_t new_size, ioffset;
5394         int ret;
5395
        /* Collapse range works only on fs cluster-size aligned offsets. */
5397         if (offset & (EXT4_CLUSTER_SIZE(sb) - 1) ||
5398             len & (EXT4_CLUSTER_SIZE(sb) - 1))
5399                 return -EINVAL;
5400
5401         if (!S_ISREG(inode->i_mode))
5402                 return -EINVAL;
5403
5404         trace_ext4_collapse_range(inode, offset, len);
5405
5406         punch_start = offset >> EXT4_BLOCK_SIZE_BITS(sb);
5407         punch_stop = (offset + len) >> EXT4_BLOCK_SIZE_BITS(sb);
5408
5409         /* Call ext4_force_commit to flush all data in case of data=journal. */
5410         if (ext4_should_journal_data(inode)) {
5411                 ret = ext4_force_commit(inode->i_sb);
5412                 if (ret)
5413                         return ret;
5414         }
5415
5416         /*
         * Round the offset down to a page-size boundary; this is needed
         * when the page size is larger than the block size.
5419          */
5420         ioffset = round_down(offset, PAGE_SIZE);
5421
5422         /* Write out all dirty pages */
5423         ret = filemap_write_and_wait_range(inode->i_mapping, ioffset,
5424                                            LLONG_MAX);
5425         if (ret)
5426                 return ret;
5427
5428         /* Take mutex lock */
5429         mutex_lock(&inode->i_mutex);
5430
5431         /*
         * The collapse range must not reach or overlap EOF; such a request
         * would effectively be a truncate operation.
5434          */
5435         if (offset + len >= i_size_read(inode)) {
5436                 ret = -EINVAL;
5437                 goto out_mutex;
5438         }
5439
        /* Currently supported only for extent-based files */
5441         if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
5442                 ret = -EOPNOTSUPP;
5443                 goto out_mutex;
5444         }
5445
5446         truncate_pagecache(inode, ioffset);
5447
5448         /* Wait for existing dio to complete */
5449         ext4_inode_block_unlocked_dio(inode);
5450         inode_dio_wait(inode);
5451
5452         credits = ext4_writepage_trans_blocks(inode);
5453         handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
5454         if (IS_ERR(handle)) {
5455                 ret = PTR_ERR(handle);
5456                 goto out_dio;
5457         }
5458
5459         down_write(&EXT4_I(inode)->i_data_sem);
5460         ext4_discard_preallocations(inode);
5461
5462         ret = ext4_es_remove_extent(inode, punch_start,
5463                                     EXT_MAX_BLOCKS - punch_start);
5464         if (ret) {
5465                 up_write(&EXT4_I(inode)->i_data_sem);
5466                 goto out_stop;
5467         }
5468
5469         ret = ext4_ext_remove_space(inode, punch_start, punch_stop - 1);
5470         if (ret) {
5471                 up_write(&EXT4_I(inode)->i_data_sem);
5472                 goto out_stop;
5473         }
5474         ext4_discard_preallocations(inode);
5475
5476         ret = ext4_ext_shift_extents(inode, handle, punch_stop,
5477                                      punch_stop - punch_start);
5478         if (ret) {
5479                 up_write(&EXT4_I(inode)->i_data_sem);
5480                 goto out_stop;
5481         }
5482
5483         new_size = i_size_read(inode) - len;
5484         i_size_write(inode, new_size);
5485         EXT4_I(inode)->i_disksize = new_size;
5486
5487         up_write(&EXT4_I(inode)->i_data_sem);
5488         if (IS_SYNC(inode))
5489                 ext4_handle_sync(handle);
5490         inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
5491         ext4_mark_inode_dirty(handle, inode);
5492
5493 out_stop:
5494         ext4_journal_stop(handle);
5495 out_dio:
5496         ext4_inode_resume_unlocked_dio(inode);
5497 out_mutex:
5498         mutex_unlock(&inode->i_mutex);
5499         return ret;
5500 }
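
/*
 * User-space view (illustrative sketch, assuming a 4 KiB block size):
 * collapse ranges are issued through fallocate(2) and must be block
 * aligned, e.g. removing 8 KiB starting at offset 4 KiB:
 *
 *	#define _GNU_SOURCE
 *	#include <fcntl.h>
 *	#include <linux/falloc.h>
 *
 *	if (fallocate(fd, FALLOC_FL_COLLAPSE_RANGE, 4096, 8192) < 0)
 *		perror("fallocate");
 *
 * An unaligned offset or length fails with EINVAL, as does a range
 * that reaches or crosses EOF, as enforced above.
 */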
5501
5502 /**
5503  * ext4_swap_extents - Swap extents between two inodes
5504  *
5505  * @inode1:     First inode
5506  * @inode2:     Second inode
5507  * @lblk1:      Start block for first inode
5508  * @lblk2:      Start block for second inode
5509  * @count:      Number of blocks to swap
 * @unwritten:  Mark second inode's extents as unwritten after swap
5511  * @erp:        Pointer to save error value
5512  *
 * This helper routine does exactly what it promises: swap extents. All other
 * concerns, such as page-cache locking consistency, bh mapping consistency,
 * and copying of the extents' data, must be handled by the caller.
5516  * Locking:
5517  *              i_mutex is held for both inodes
5518  *              i_data_sem is locked for write for both inodes
5519  * Assumptions:
5520  *              All pages from requested range are locked for both inodes
5521  */
5522 int
5523 ext4_swap_extents(handle_t *handle, struct inode *inode1,
5524                      struct inode *inode2, ext4_lblk_t lblk1, ext4_lblk_t lblk2,
5525                   ext4_lblk_t count, int unwritten, int *erp)
5526 {
5527         struct ext4_ext_path *path1 = NULL;
5528         struct ext4_ext_path *path2 = NULL;
5529         int replaced_count = 0;
5530
5531         BUG_ON(!rwsem_is_locked(&EXT4_I(inode1)->i_data_sem));
5532         BUG_ON(!rwsem_is_locked(&EXT4_I(inode2)->i_data_sem));
5533         BUG_ON(!mutex_is_locked(&inode1->i_mutex));
        BUG_ON(!mutex_is_locked(&inode2->i_mutex));
5535
5536         *erp = ext4_es_remove_extent(inode1, lblk1, count);
5537         if (unlikely(*erp))
5538                 return 0;
5539         *erp = ext4_es_remove_extent(inode2, lblk2, count);
5540         if (unlikely(*erp))
5541                 return 0;
5542
5543         while (count) {
5544                 struct ext4_extent *ex1, *ex2, tmp_ex;
5545                 ext4_lblk_t e1_blk, e2_blk;
5546                 int e1_len, e2_len, len;
5547                 int split = 0;
5548
5549                 path1 = ext4_ext_find_extent(inode1, lblk1, NULL, EXT4_EX_NOCACHE);
5550                 if (unlikely(IS_ERR(path1))) {
5551                         *erp = PTR_ERR(path1);
5552                         path1 = NULL;
5553                 finish:
5554                         count = 0;
5555                         goto repeat;
5556                 }
5557                 path2 = ext4_ext_find_extent(inode2, lblk2, NULL, EXT4_EX_NOCACHE);
5558                 if (unlikely(IS_ERR(path2))) {
5559                         *erp = PTR_ERR(path2);
5560                         path2 = NULL;
5561                         goto finish;
5562                 }
5563                 ex1 = path1[path1->p_depth].p_ext;
5564                 ex2 = path2[path2->p_depth].p_ext;
                /* Do we have something to swap? */
5566                 if (unlikely(!ex2 || !ex1))
5567                         goto finish;
5568
5569                 e1_blk = le32_to_cpu(ex1->ee_block);
5570                 e2_blk = le32_to_cpu(ex2->ee_block);
5571                 e1_len = ext4_ext_get_actual_len(ex1);
5572                 e2_len = ext4_ext_get_actual_len(ex2);
5573
5574                 /* Hole handling */
5575                 if (!in_range(lblk1, e1_blk, e1_len) ||
5576                     !in_range(lblk2, e2_blk, e2_len)) {
5577                         ext4_lblk_t next1, next2;
5578
5579                         /* if hole after extent, then go to next extent */
5580                         next1 = ext4_ext_next_allocated_block(path1);
5581                         next2 = ext4_ext_next_allocated_block(path2);
                        /* If hole before extent, then shift to that extent */
                        if (e1_blk > lblk1)
                                next1 = e1_blk;
                        if (e2_blk > lblk2)
                                next2 = e2_blk;
                        /* Do we have something to swap? */
                        if (next1 == EXT_MAX_BLOCKS || next2 == EXT_MAX_BLOCKS)
                                goto finish;
                        /* Move to the rightmost boundary */
5591                         len = next1 - lblk1;
5592                         if (len < next2 - lblk2)
5593                                 len = next2 - lblk2;
5594                         if (len > count)
5595                                 len = count;
5596                         lblk1 += len;
5597                         lblk2 += len;
5598                         count -= len;
5599                         goto repeat;
5600                 }
5601
5602                 /* Prepare left boundary */
5603                 if (e1_blk < lblk1) {
5604                         split = 1;
5605                         *erp = ext4_force_split_extent_at(handle, inode1,
5606                                                 &path1, lblk1, 0);
5607                         if (unlikely(*erp))
5608                                 goto finish;
5609                 }
5610                 if (e2_blk < lblk2) {
5611                         split = 1;
5612                         *erp = ext4_force_split_extent_at(handle, inode2,
5613                                                 &path2,  lblk2, 0);
5614                         if (unlikely(*erp))
5615                                 goto finish;
5616                 }
                /* ext4_split_extent_at() may result in a leaf extent split,
                 * so the paths must be revalidated. */
5619                 if (split)
5620                         goto repeat;
5621
5622                 /* Prepare right boundary */
5623                 len = count;
5624                 if (len > e1_blk + e1_len - lblk1)
5625                         len = e1_blk + e1_len - lblk1;
5626                 if (len > e2_blk + e2_len - lblk2)
5627                         len = e2_blk + e2_len - lblk2;
5628
5629                 if (len != e1_len) {
5630                         split = 1;
5631                         *erp = ext4_force_split_extent_at(handle, inode1,
5632                                                 &path1, lblk1 + len, 0);
5633                         if (unlikely(*erp))
5634                                 goto finish;
5635                 }
5636                 if (len != e2_len) {
5637                         split = 1;
5638                         *erp = ext4_force_split_extent_at(handle, inode2,
5639                                                 &path2, lblk2 + len, 0);
5640                         if (*erp)
5641                                 goto finish;
5642                 }
                /* ext4_split_extent_at() may result in a leaf extent split,
                 * so the paths must be revalidated. */
5645                 if (split)
5646                         goto repeat;
5647
5648                 BUG_ON(e2_len != e1_len);
5649                 *erp = ext4_ext_get_access(handle, inode1, path1 + path1->p_depth);
5650                 if (unlikely(*erp))
5651                         goto finish;
5652                 *erp = ext4_ext_get_access(handle, inode2, path2 + path2->p_depth);
5653                 if (unlikely(*erp))
5654                         goto finish;
5655
                /* Both extents are fully inside boundaries. Swap them now */
5657                 tmp_ex = *ex1;
5658                 ext4_ext_store_pblock(ex1, ext4_ext_pblock(ex2));
5659                 ext4_ext_store_pblock(ex2, ext4_ext_pblock(&tmp_ex));
5660                 ex1->ee_len = cpu_to_le16(e2_len);
5661                 ex2->ee_len = cpu_to_le16(e1_len);
5662                 if (unwritten)
5663                         ext4_ext_mark_unwritten(ex2);
5664                 if (ext4_ext_is_unwritten(&tmp_ex))
5665                         ext4_ext_mark_unwritten(ex1);
5666
5667                 ext4_ext_try_to_merge(handle, inode2, path2, ex2);
5668                 ext4_ext_try_to_merge(handle, inode1, path1, ex1);
5669                 *erp = ext4_ext_dirty(handle, inode2, path2 +
5670                                       path2->p_depth);
5671                 if (unlikely(*erp))
5672                         goto finish;
5673                 *erp = ext4_ext_dirty(handle, inode1, path1 +
5674                                       path1->p_depth);
                /*
                 * Looks scary, doesn't it? The second inode already points to
                 * the new blocks and has been successfully dirtied. Luckily,
                 * an error here can only be caused by a journal error, in
                 * which case the whole transaction will be aborted anyway.
                 */
5681                 if (unlikely(*erp))
5682                         goto finish;
5683                 lblk1 += len;
5684                 lblk2 += len;
5685                 replaced_count += len;
5686                 count -= len;
5687
5688         repeat:
5689                 ext4_ext_drop_refs(path1);
5690                 kfree(path1);
5691                 ext4_ext_drop_refs(path2);
5692                 kfree(path2);
5693                 path1 = path2 = NULL;
5694         }
5695         return replaced_count;
5696 }
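
/*
 * Worked example of the boundary handling above (illustrative
 * numbers): to swap count == 10 blocks at lblk1 == 15 when inode1
 * holds a single extent covering blocks [10..29], e1_blk == 10 lies
 * before lblk1, so the extent is first split at block 15; len is then
 * capped so the swap never runs past the shorter of the two extents,
 * splitting again at lblk1 + len if needed. Only once both paths have
 * been revalidated are the physical block and length fields of the
 * two extents exchanged.
 */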