ext4: collapse ext4_convert_initialized_extents()
fs/ext4/extents.c
/*
 * Copyright (c) 2003-2006, Cluster File Systems, Inc, info@clusterfs.com
 * Written by Alex Tomas <alex@clusterfs.com>
 *
 * Architecture independence:
 *   Copyright (c) 2005, Bull S.A.
 *   Written by Pierre Peiffer <pierre.peiffer@bull.net>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307, USA
 */

/*
 * Extents support for EXT4
 *
 * TODO:
 *   - ext4*_error() should be used in some situations
 *   - analyze all BUG()/BUG_ON(), use -EIO where appropriate
 *   - smart tree reduction
 */

#include <linux/fs.h>
#include <linux/time.h>
#include <linux/jbd2.h>
#include <linux/highuid.h>
#include <linux/pagemap.h>
#include <linux/quotaops.h>
#include <linux/string.h>
#include <linux/slab.h>
#include <asm/uaccess.h>
#include <linux/fiemap.h>
#include "ext4_jbd2.h"
#include "ext4_extents.h"
#include "xattr.h"

#include <trace/events/ext4.h>

/*
 * used by extent splitting.
 */
#define EXT4_EXT_MAY_ZEROOUT    0x1  /* safe to zeroout if split fails \
                                        due to ENOSPC */
#define EXT4_EXT_MARK_UNWRIT1   0x2  /* mark first half unwritten */
#define EXT4_EXT_MARK_UNWRIT2   0x4  /* mark second half unwritten */

#define EXT4_EXT_DATA_VALID1    0x8  /* first half contains valid data */
#define EXT4_EXT_DATA_VALID2    0x10 /* second half contains valid data */

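/*
 * Illustrative only (this mirrors ext4_force_split_extent_at() below,
 * it is not a new API): a caller splitting an unwritten extent that
 * must keep both halves unwritten passes
 *
 *      split_flag = EXT4_EXT_MARK_UNWRIT1 | EXT4_EXT_MARK_UNWRIT2;
 *
 * and may also OR in EXT4_EXT_MAY_ZEROOUT when zeroing out the extent
 * is an acceptable fallback should the split fail with ENOSPC.
 */
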
static __le32 ext4_extent_block_csum(struct inode *inode,
                                     struct ext4_extent_header *eh)
{
        struct ext4_inode_info *ei = EXT4_I(inode);
        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
        __u32 csum;

        csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)eh,
                           EXT4_EXTENT_TAIL_OFFSET(eh));
        return cpu_to_le32(csum);
}

static int ext4_extent_block_csum_verify(struct inode *inode,
                                         struct ext4_extent_header *eh)
{
        struct ext4_extent_tail *et;

        if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
                EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
                return 1;

        et = find_ext4_extent_tail(eh);
        if (et->et_checksum != ext4_extent_block_csum(inode, eh))
                return 0;
        return 1;
}

static void ext4_extent_block_csum_set(struct inode *inode,
                                       struct ext4_extent_header *eh)
{
        struct ext4_extent_tail *et;

        if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
                EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
                return;

        et = find_ext4_extent_tail(eh);
        et->et_checksum = ext4_extent_block_csum(inode, eh);
}

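/*
 * Illustrative pairing (not a new API): readers verify a tree block
 * with ext4_extent_block_csum_verify() before trusting it (see
 * __ext4_ext_check() below), and writers refresh the tail checksum
 * with ext4_extent_block_csum_set() just before the block is marked
 * dirty, as __ext4_ext_dirty() does.
 */
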
static int ext4_split_extent(handle_t *handle,
                                struct inode *inode,
                                struct ext4_ext_path *path,
                                struct ext4_map_blocks *map,
                                int split_flag,
                                int flags);

static int ext4_split_extent_at(handle_t *handle,
                             struct inode *inode,
                             struct ext4_ext_path *path,
                             ext4_lblk_t split,
                             int split_flag,
                             int flags);

static int ext4_find_delayed_extent(struct inode *inode,
                                    struct extent_status *newes);

static int ext4_ext_truncate_extend_restart(handle_t *handle,
                                            struct inode *inode,
                                            int needed)
{
        int err;

        if (!ext4_handle_valid(handle))
                return 0;
        if (handle->h_buffer_credits > needed)
                return 0;
        err = ext4_journal_extend(handle, needed);
        if (err <= 0)
                return err;
        err = ext4_truncate_restart_trans(handle, inode, needed);
        if (err == 0)
                err = -EAGAIN;

        return err;
}

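/*
 * Illustrative note (a sketch of the calling convention, as we read the
 * code above): a caller that gets -EAGAIN back must treat the
 * transaction as restarted and revalidate any extent path it derived
 * from the old handle before continuing.
 */
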
/*
 * could return:
 *  - EROFS
 *  - ENOMEM
 */
static int ext4_ext_get_access(handle_t *handle, struct inode *inode,
                                struct ext4_ext_path *path)
{
        if (path->p_bh) {
                /* path points to block */
                BUFFER_TRACE(path->p_bh, "get_write_access");
                return ext4_journal_get_write_access(handle, path->p_bh);
        }
        /* path points to leaf/index in inode body */
        /* we use in-core data, no need to protect them */
        return 0;
}

/*
 * could return:
 *  - EROFS
 *  - ENOMEM
 *  - EIO
 */
int __ext4_ext_dirty(const char *where, unsigned int line, handle_t *handle,
                     struct inode *inode, struct ext4_ext_path *path)
{
        int err;

        WARN_ON(!rwsem_is_locked(&EXT4_I(inode)->i_data_sem));
        if (path->p_bh) {
                ext4_extent_block_csum_set(inode, ext_block_hdr(path->p_bh));
                /* path points to block */
                err = __ext4_handle_dirty_metadata(where, line, handle,
                                                   inode, path->p_bh);
        } else {
                /* path points to leaf/index in inode body */
                err = ext4_mark_inode_dirty(handle, inode);
        }
        return err;
}

static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
                              struct ext4_ext_path *path,
                              ext4_lblk_t block)
{
        if (path) {
                int depth = path->p_depth;
                struct ext4_extent *ex;

                /*
                 * Try to predict block placement assuming that we are
                 * filling in a file which will eventually be
                 * non-sparse --- i.e., in the case of libbfd writing
                 * an ELF object sections out-of-order but in a way
                 * that eventually results in a contiguous object or
                 * executable file, or some database extending a table
                 * space file.  However, this is actually somewhat
                 * non-ideal if we are writing a sparse file such as
                 * qemu or KVM writing a raw image file that is going
                 * to stay fairly sparse, since it will end up
                 * fragmenting the file system's free space.  Maybe we
                 * should have some heuristics or some way to allow
                 * userspace to pass a hint to the file system,
                 * especially if the latter case turns out to be
                 * common.
                 */
                ex = path[depth].p_ext;
                if (ex) {
                        ext4_fsblk_t ext_pblk = ext4_ext_pblock(ex);
                        ext4_lblk_t ext_block = le32_to_cpu(ex->ee_block);

                        if (block > ext_block)
                                return ext_pblk + (block - ext_block);
                        else
                                return ext_pblk - (ext_block - block);
                }

                /* it looks like index is empty;
                 * try to find starting block from index itself */
                if (path[depth].p_bh)
                        return path[depth].p_bh->b_blocknr;
        }

        /* OK. use inode's group */
        return ext4_inode_to_goal_block(inode);
}

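/*
 * Worked example (illustrative): if the extent in the path maps
 * logical block 100 to physical block 5000 and a goal is wanted for
 * logical block 108, the code above returns 5000 + (108 - 100) = 5008,
 * i.e. it aims to keep the file physically contiguous.
 */
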
/*
 * Allocation for a metadata block.
 */
static ext4_fsblk_t
ext4_ext_new_meta_block(handle_t *handle, struct inode *inode,
                        struct ext4_ext_path *path,
                        struct ext4_extent *ex, int *err, unsigned int flags)
{
        ext4_fsblk_t goal, newblock;

        goal = ext4_ext_find_goal(inode, path, le32_to_cpu(ex->ee_block));
        newblock = ext4_new_meta_blocks(handle, inode, goal, flags,
                                        NULL, err);
        return newblock;
}

static inline int ext4_ext_space_block(struct inode *inode, int check)
{
        int size;

        size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
                        / sizeof(struct ext4_extent);
#ifdef AGGRESSIVE_TEST
        if (!check && size > 6)
                size = 6;
#endif
        return size;
}

static inline int ext4_ext_space_block_idx(struct inode *inode, int check)
{
        int size;

        size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
                        / sizeof(struct ext4_extent_idx);
#ifdef AGGRESSIVE_TEST
        if (!check && size > 5)
                size = 5;
#endif
        return size;
}

static inline int ext4_ext_space_root(struct inode *inode, int check)
{
        int size;

        size = sizeof(EXT4_I(inode)->i_data);
        size -= sizeof(struct ext4_extent_header);
        size /= sizeof(struct ext4_extent);
#ifdef AGGRESSIVE_TEST
        if (!check && size > 3)
                size = 3;
#endif
        return size;
}

static inline int ext4_ext_space_root_idx(struct inode *inode, int check)
{
        int size;

        size = sizeof(EXT4_I(inode)->i_data);
        size -= sizeof(struct ext4_extent_header);
        size /= sizeof(struct ext4_extent_idx);
#ifdef AGGRESSIVE_TEST
        if (!check && size > 4)
                size = 4;
#endif
        return size;
}

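/*
 * Worked numbers (illustrative, assuming 4KiB blocks): with a 12-byte
 * header and 12-byte entries, each tree block holds
 * (4096 - 12) / 12 = 340 extents or indexes, while the 60-byte i_data
 * root holds (60 - 12) / 12 = 4 entries.
 */
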
static inline int
ext4_force_split_extent_at(handle_t *handle, struct inode *inode,
                           struct ext4_ext_path *path, ext4_lblk_t lblk,
                           int nofail)
{
        int unwritten = ext4_ext_is_unwritten(path[path->p_depth].p_ext);

        return ext4_split_extent_at(handle, inode, path, lblk, unwritten ?
                        EXT4_EXT_MARK_UNWRIT1|EXT4_EXT_MARK_UNWRIT2 : 0,
                        EXT4_EX_NOCACHE | EXT4_GET_BLOCKS_PRE_IO |
                        (nofail ? EXT4_GET_BLOCKS_METADATA_NOFAIL : 0));
}

/*
 * Calculate the number of metadata blocks needed
 * to allocate @blocks
 * Worst case is one block per extent
 */
int ext4_ext_calc_metadata_amount(struct inode *inode, ext4_lblk_t lblock)
{
        struct ext4_inode_info *ei = EXT4_I(inode);
        int idxs;

        idxs = ((inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
                / sizeof(struct ext4_extent_idx));

        /*
         * If the new delayed allocation block is contiguous with the
         * previous da block, it can share index blocks with the
         * previous block, so we only need to allocate a new index
         * block every idxs leaf blocks.  At idxs**2 blocks, we need
         * an additional index block, and at idxs**3 blocks, yet
         * another index block.
         */
        if (ei->i_da_metadata_calc_len &&
            ei->i_da_metadata_calc_last_lblock+1 == lblock) {
                int num = 0;

                if ((ei->i_da_metadata_calc_len % idxs) == 0)
                        num++;
                if ((ei->i_da_metadata_calc_len % (idxs*idxs)) == 0)
                        num++;
                if ((ei->i_da_metadata_calc_len % (idxs*idxs*idxs)) == 0) {
                        num++;
                        ei->i_da_metadata_calc_len = 0;
                } else
                        ei->i_da_metadata_calc_len++;
                ei->i_da_metadata_calc_last_lblock++;
                return num;
        }

        /*
         * In the worst case we need a new set of index blocks at
         * every level of the inode's extent tree.
         */
        ei->i_da_metadata_calc_len = 1;
        ei->i_da_metadata_calc_last_lblock = lblock;
        return ext_depth(inode) + 1;
}

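/*
 * Worked example (illustrative, 4KiB blocks, so idxs = 340): a run of
 * contiguous delayed-allocation blocks is charged one new leaf block
 * every 340 blocks, an extra level-1 index block every 340**2 blocks,
 * and so on; a block that is not contiguous with the previous one pays
 * the full worst case of ext_depth(inode) + 1 metadata blocks.
 */
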
static int
ext4_ext_max_entries(struct inode *inode, int depth)
{
        int max;

        if (depth == ext_depth(inode)) {
                if (depth == 0)
                        max = ext4_ext_space_root(inode, 1);
                else
                        max = ext4_ext_space_root_idx(inode, 1);
        } else {
                if (depth == 0)
                        max = ext4_ext_space_block(inode, 1);
                else
                        max = ext4_ext_space_block_idx(inode, 1);
        }

        return max;
}

static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext)
{
        ext4_fsblk_t block = ext4_ext_pblock(ext);
        int len = ext4_ext_get_actual_len(ext);
        ext4_lblk_t lblock = le32_to_cpu(ext->ee_block);
        ext4_lblk_t last = lblock + len - 1;

        if (lblock > last)
                return 0;
        return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, len);
}

static int ext4_valid_extent_idx(struct inode *inode,
                                struct ext4_extent_idx *ext_idx)
{
        ext4_fsblk_t block = ext4_idx_pblock(ext_idx);

        return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, 1);
}

static int ext4_valid_extent_entries(struct inode *inode,
                                struct ext4_extent_header *eh,
                                int depth)
{
        unsigned short entries;
        if (eh->eh_entries == 0)
                return 1;

        entries = le16_to_cpu(eh->eh_entries);

        if (depth == 0) {
                /* leaf entries */
                struct ext4_extent *ext = EXT_FIRST_EXTENT(eh);
                struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;
                ext4_fsblk_t pblock = 0;
                ext4_lblk_t lblock = 0;
                ext4_lblk_t prev = 0;
                int len = 0;
                while (entries) {
                        if (!ext4_valid_extent(inode, ext))
                                return 0;

                        /* Check for overlapping extents */
                        lblock = le32_to_cpu(ext->ee_block);
                        len = ext4_ext_get_actual_len(ext);
                        if ((lblock <= prev) && prev) {
                                pblock = ext4_ext_pblock(ext);
                                es->s_last_error_block = cpu_to_le64(pblock);
                                return 0;
                        }
                        ext++;
                        entries--;
                        prev = lblock + len - 1;
                }
        } else {
                struct ext4_extent_idx *ext_idx = EXT_FIRST_INDEX(eh);
                while (entries) {
                        if (!ext4_valid_extent_idx(inode, ext_idx))
                                return 0;
                        ext_idx++;
                        entries--;
                }
        }
        return 1;
}

static int __ext4_ext_check(const char *function, unsigned int line,
                            struct inode *inode, struct ext4_extent_header *eh,
                            int depth, ext4_fsblk_t pblk)
{
        const char *error_msg;
        int max = 0;

        if (unlikely(eh->eh_magic != EXT4_EXT_MAGIC)) {
                error_msg = "invalid magic";
                goto corrupted;
        }
        if (unlikely(le16_to_cpu(eh->eh_depth) != depth)) {
                error_msg = "unexpected eh_depth";
                goto corrupted;
        }
        if (unlikely(eh->eh_max == 0)) {
                error_msg = "invalid eh_max";
                goto corrupted;
        }
        max = ext4_ext_max_entries(inode, depth);
        if (unlikely(le16_to_cpu(eh->eh_max) > max)) {
                error_msg = "too large eh_max";
                goto corrupted;
        }
        if (unlikely(le16_to_cpu(eh->eh_entries) > le16_to_cpu(eh->eh_max))) {
                error_msg = "invalid eh_entries";
                goto corrupted;
        }
        if (!ext4_valid_extent_entries(inode, eh, depth)) {
                error_msg = "invalid extent entries";
                goto corrupted;
        }
        /* Verify checksum on non-root extent tree nodes */
        if (ext_depth(inode) != depth &&
            !ext4_extent_block_csum_verify(inode, eh)) {
                error_msg = "extent tree corrupted";
                goto corrupted;
        }
        return 0;

corrupted:
        ext4_error_inode(inode, function, line, 0,
                         "pblk %llu bad header/extent: %s - magic %x, "
                         "entries %u, max %u(%u), depth %u(%u)",
                         (unsigned long long) pblk, error_msg,
                         le16_to_cpu(eh->eh_magic),
                         le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max),
                         max, le16_to_cpu(eh->eh_depth), depth);
        return -EIO;
}

#define ext4_ext_check(inode, eh, depth, pblk)                  \
        __ext4_ext_check(__func__, __LINE__, (inode), (eh), (depth), (pblk))

int ext4_ext_check_inode(struct inode *inode)
{
        return ext4_ext_check(inode, ext_inode_hdr(inode), ext_depth(inode), 0);
}

static struct buffer_head *
__read_extent_tree_block(const char *function, unsigned int line,
                         struct inode *inode, ext4_fsblk_t pblk, int depth,
                         int flags)
{
        struct buffer_head              *bh;
        int                             err;

        bh = sb_getblk(inode->i_sb, pblk);
        if (unlikely(!bh))
                return ERR_PTR(-ENOMEM);

        if (!bh_uptodate_or_lock(bh)) {
                trace_ext4_ext_load_extent(inode, pblk, _RET_IP_);
                err = bh_submit_read(bh);
                if (err < 0)
                        goto errout;
        }
        if (buffer_verified(bh) && !(flags & EXT4_EX_FORCE_CACHE))
                return bh;
        err = __ext4_ext_check(function, line, inode,
                               ext_block_hdr(bh), depth, pblk);
        if (err)
                goto errout;
        set_buffer_verified(bh);
        /*
         * If this is a leaf block, cache all of its entries
         */
        if (!(flags & EXT4_EX_NOCACHE) && depth == 0) {
                struct ext4_extent_header *eh = ext_block_hdr(bh);
                struct ext4_extent *ex = EXT_FIRST_EXTENT(eh);
                ext4_lblk_t prev = 0;
                int i;

                for (i = le16_to_cpu(eh->eh_entries); i > 0; i--, ex++) {
                        unsigned int status = EXTENT_STATUS_WRITTEN;
                        ext4_lblk_t lblk = le32_to_cpu(ex->ee_block);
                        int len = ext4_ext_get_actual_len(ex);

                        if (prev && (prev != lblk))
                                ext4_es_cache_extent(inode, prev,
                                                     lblk - prev, ~0,
                                                     EXTENT_STATUS_HOLE);

                        if (ext4_ext_is_unwritten(ex))
                                status = EXTENT_STATUS_UNWRITTEN;
                        ext4_es_cache_extent(inode, lblk, len,
                                             ext4_ext_pblock(ex), status);
                        prev = lblk + len;
                }
        }
        return bh;
errout:
        put_bh(bh);
        return ERR_PTR(err);
}

#define read_extent_tree_block(inode, pblk, depth, flags)               \
        __read_extent_tree_block(__func__, __LINE__, (inode), (pblk),   \
                                 (depth), (flags))

/*
 * This function is called to cache a file's extent information in the
 * extent status tree
 */
int ext4_ext_precache(struct inode *inode)
{
        struct ext4_inode_info *ei = EXT4_I(inode);
        struct ext4_ext_path *path = NULL;
        struct buffer_head *bh;
        int i = 0, depth, ret = 0;

        if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
                return 0;       /* not an extent-mapped inode */

        down_read(&ei->i_data_sem);
        depth = ext_depth(inode);

        path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 1),
                       GFP_NOFS);
        if (path == NULL) {
                up_read(&ei->i_data_sem);
                return -ENOMEM;
        }

        /* Don't cache anything if there are no external extent blocks */
        if (depth == 0)
                goto out;
        path[0].p_hdr = ext_inode_hdr(inode);
        ret = ext4_ext_check(inode, path[0].p_hdr, depth, 0);
        if (ret)
                goto out;
        path[0].p_idx = EXT_FIRST_INDEX(path[0].p_hdr);
        while (i >= 0) {
                /*
                 * If this is a leaf block or we've reached the end of
                 * the index block, go up
                 */
                if ((i == depth) ||
                    path[i].p_idx > EXT_LAST_INDEX(path[i].p_hdr)) {
                        brelse(path[i].p_bh);
                        path[i].p_bh = NULL;
                        i--;
                        continue;
                }
                bh = read_extent_tree_block(inode,
                                            ext4_idx_pblock(path[i].p_idx++),
                                            depth - i - 1,
                                            EXT4_EX_FORCE_CACHE);
                if (IS_ERR(bh)) {
                        ret = PTR_ERR(bh);
                        break;
                }
                i++;
                path[i].p_bh = bh;
                path[i].p_hdr = ext_block_hdr(bh);
                path[i].p_idx = EXT_FIRST_INDEX(path[i].p_hdr);
        }
        ext4_set_inode_state(inode, EXT4_STATE_EXT_PRECACHED);
out:
        up_read(&ei->i_data_sem);
        ext4_ext_drop_refs(path);
        kfree(path);
        return ret;
}

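/*
 * Usage note (illustrative; as we read it, this is reached via the
 * EXT4_IOC_PRECACHE_EXTENTS ioctl): once the whole tree has been
 * walked with EXT4_EX_FORCE_CACHE, later mapping and fiemap queries
 * can be answered from the extent status tree without reading the
 * on-disk tree again.
 */
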
#ifdef EXT_DEBUG
static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path)
{
        int k, l = path->p_depth;

        ext_debug("path:");
        for (k = 0; k <= l; k++, path++) {
                if (path->p_idx) {
                        ext_debug("  %d->%llu", le32_to_cpu(path->p_idx->ei_block),
                                  ext4_idx_pblock(path->p_idx));
                } else if (path->p_ext) {
                        ext_debug("  %d:[%d]%d:%llu ",
                                  le32_to_cpu(path->p_ext->ee_block),
                                  ext4_ext_is_unwritten(path->p_ext),
                                  ext4_ext_get_actual_len(path->p_ext),
                                  ext4_ext_pblock(path->p_ext));
                } else
                        ext_debug("  []");
        }
        ext_debug("\n");
}

static void ext4_ext_show_leaf(struct inode *inode, struct ext4_ext_path *path)
{
        int depth = ext_depth(inode);
        struct ext4_extent_header *eh;
        struct ext4_extent *ex;
        int i;

        if (!path)
                return;

        eh = path[depth].p_hdr;
        ex = EXT_FIRST_EXTENT(eh);

        ext_debug("Displaying leaf extents for inode %lu\n", inode->i_ino);

        for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ex++) {
                ext_debug("%d:[%d]%d:%llu ", le32_to_cpu(ex->ee_block),
                          ext4_ext_is_unwritten(ex),
                          ext4_ext_get_actual_len(ex), ext4_ext_pblock(ex));
        }
        ext_debug("\n");
}

static void ext4_ext_show_move(struct inode *inode, struct ext4_ext_path *path,
                        ext4_fsblk_t newblock, int level)
{
        int depth = ext_depth(inode);
        struct ext4_extent *ex;

        if (depth != level) {
                struct ext4_extent_idx *idx;
                idx = path[level].p_idx;
                while (idx <= EXT_MAX_INDEX(path[level].p_hdr)) {
                        ext_debug("%d: move %d:%llu in new index %llu\n", level,
                                        le32_to_cpu(idx->ei_block),
                                        ext4_idx_pblock(idx),
                                        newblock);
                        idx++;
                }

                return;
        }

        ex = path[depth].p_ext;
        while (ex <= EXT_MAX_EXTENT(path[depth].p_hdr)) {
                ext_debug("move %d:%llu:[%d]%d in new leaf %llu\n",
                                le32_to_cpu(ex->ee_block),
                                ext4_ext_pblock(ex),
                                ext4_ext_is_unwritten(ex),
                                ext4_ext_get_actual_len(ex),
                                newblock);
                ex++;
        }
}

#else
#define ext4_ext_show_path(inode, path)
#define ext4_ext_show_leaf(inode, path)
#define ext4_ext_show_move(inode, path, newblock, level)
#endif

void ext4_ext_drop_refs(struct ext4_ext_path *path)
{
        int depth = path->p_depth;
        int i;

        for (i = 0; i <= depth; i++, path++)
                if (path->p_bh) {
                        brelse(path->p_bh);
                        path->p_bh = NULL;
                }
}

/*
 * ext4_ext_binsearch_idx:
 * binary search for the closest index of the given block
 * the header must be checked before calling this
 */
static void
ext4_ext_binsearch_idx(struct inode *inode,
                        struct ext4_ext_path *path, ext4_lblk_t block)
{
        struct ext4_extent_header *eh = path->p_hdr;
        struct ext4_extent_idx *r, *l, *m;

        ext_debug("binsearch for %u(idx):  ", block);

        l = EXT_FIRST_INDEX(eh) + 1;
        r = EXT_LAST_INDEX(eh);
        while (l <= r) {
                m = l + (r - l) / 2;
                if (block < le32_to_cpu(m->ei_block))
                        r = m - 1;
                else
                        l = m + 1;
                ext_debug("%p(%u):%p(%u):%p(%u) ", l, le32_to_cpu(l->ei_block),
                                m, le32_to_cpu(m->ei_block),
                                r, le32_to_cpu(r->ei_block));
        }

        path->p_idx = l - 1;
        ext_debug("  -> %u->%lld ", le32_to_cpu(path->p_idx->ei_block),
                  ext4_idx_pblock(path->p_idx));

#ifdef CHECK_BINSEARCH
        {
                struct ext4_extent_idx *chix, *ix;
                int k;

                chix = ix = EXT_FIRST_INDEX(eh);
                for (k = 0; k < le16_to_cpu(eh->eh_entries); k++, ix++) {
                        if (k != 0 &&
                            le32_to_cpu(ix->ei_block) <= le32_to_cpu(ix[-1].ei_block)) {
                                printk(KERN_DEBUG "k=%d, ix=0x%p, "
                                       "first=0x%p\n", k,
                                       ix, EXT_FIRST_INDEX(eh));
                                printk(KERN_DEBUG "%u <= %u\n",
                                       le32_to_cpu(ix->ei_block),
                                       le32_to_cpu(ix[-1].ei_block));
                        }
                        BUG_ON(k && le32_to_cpu(ix->ei_block)
                                           <= le32_to_cpu(ix[-1].ei_block));
                        if (block < le32_to_cpu(ix->ei_block))
                                break;
                        chix = ix;
                }
                BUG_ON(chix != path->p_idx);
        }
#endif

}

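/*
 * Illustrative: with index entries starting at logical blocks
 * {0, 100, 200}, a search for block 150 ends with l pointing past the
 * entry for 100, so path->p_idx = l - 1 selects the subtree covering
 * blocks 100..199.
 */
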
/*
 * ext4_ext_binsearch:
 * binary search for closest extent of the given block
 * the header must be checked before calling this
 */
static void
ext4_ext_binsearch(struct inode *inode,
                struct ext4_ext_path *path, ext4_lblk_t block)
{
        struct ext4_extent_header *eh = path->p_hdr;
        struct ext4_extent *r, *l, *m;

        if (eh->eh_entries == 0) {
                /*
                 * this leaf is empty:
                 * we get such a leaf in split/add case
                 */
                return;
        }

        ext_debug("binsearch for %u:  ", block);

        l = EXT_FIRST_EXTENT(eh) + 1;
        r = EXT_LAST_EXTENT(eh);

        while (l <= r) {
                m = l + (r - l) / 2;
                if (block < le32_to_cpu(m->ee_block))
                        r = m - 1;
                else
                        l = m + 1;
                ext_debug("%p(%u):%p(%u):%p(%u) ", l, le32_to_cpu(l->ee_block),
                                m, le32_to_cpu(m->ee_block),
                                r, le32_to_cpu(r->ee_block));
        }

        path->p_ext = l - 1;
        ext_debug("  -> %d:%llu:[%d]%d ",
                        le32_to_cpu(path->p_ext->ee_block),
                        ext4_ext_pblock(path->p_ext),
                        ext4_ext_is_unwritten(path->p_ext),
                        ext4_ext_get_actual_len(path->p_ext));

#ifdef CHECK_BINSEARCH
        {
                struct ext4_extent *chex, *ex;
                int k;

                chex = ex = EXT_FIRST_EXTENT(eh);
                for (k = 0; k < le16_to_cpu(eh->eh_entries); k++, ex++) {
                        BUG_ON(k && le32_to_cpu(ex->ee_block)
                                          <= le32_to_cpu(ex[-1].ee_block));
                        if (block < le32_to_cpu(ex->ee_block))
                                break;
                        chex = ex;
                }
                BUG_ON(chex != path->p_ext);
        }
#endif

}

int ext4_ext_tree_init(handle_t *handle, struct inode *inode)
{
        struct ext4_extent_header *eh;

        eh = ext_inode_hdr(inode);
        eh->eh_depth = 0;
        eh->eh_entries = 0;
        eh->eh_magic = EXT4_EXT_MAGIC;
        eh->eh_max = cpu_to_le16(ext4_ext_space_root(inode, 0));
        ext4_mark_inode_dirty(handle, inode);
        return 0;
}

struct ext4_ext_path *
ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
                     struct ext4_ext_path **orig_path, int flags)
{
        struct ext4_extent_header *eh;
        struct buffer_head *bh;
        struct ext4_ext_path *path = orig_path ? *orig_path : NULL;
        short int depth, i, ppos = 0;
        short free_on_err = (flags & EXT4_EX_NOFREE_ON_ERR) == 0;
        int ret;

        eh = ext_inode_hdr(inode);
        depth = ext_depth(inode);

        /* account possible depth increase */
        if (!path) {
                path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 2),
                                GFP_NOFS);
                if (unlikely(!path))
                        return ERR_PTR(-ENOMEM);
                free_on_err = 1;
        }
        path[0].p_hdr = eh;
        path[0].p_bh = NULL;

        i = depth;
        /* walk through the tree */
        while (i) {
                ext_debug("depth %d: num %d, max %d\n",
                          ppos, le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max));

                ext4_ext_binsearch_idx(inode, path + ppos, block);
                path[ppos].p_block = ext4_idx_pblock(path[ppos].p_idx);
                path[ppos].p_depth = i;
                path[ppos].p_ext = NULL;

                bh = read_extent_tree_block(inode, path[ppos].p_block, --i,
                                            flags);
                if (unlikely(IS_ERR(bh))) {
                        ret = PTR_ERR(bh);
                        goto err;
                }

                eh = ext_block_hdr(bh);
                ppos++;
                if (unlikely(ppos > depth)) {
                        put_bh(bh);
                        EXT4_ERROR_INODE(inode,
                                         "ppos %d > depth %d", ppos, depth);
                        ret = -EIO;
                        goto err;
                }
                path[ppos].p_bh = bh;
                path[ppos].p_hdr = eh;
        }

        path[ppos].p_depth = i;
        path[ppos].p_ext = NULL;
        path[ppos].p_idx = NULL;

        /* find extent */
        ext4_ext_binsearch(inode, path + ppos, block);
        /* if not an empty leaf */
        if (path[ppos].p_ext)
                path[ppos].p_block = ext4_ext_pblock(path[ppos].p_ext);

        ext4_ext_show_path(inode, path);

        return path;

err:
        ext4_ext_drop_refs(path);
        if (free_on_err) {
                kfree(path);
                if (orig_path)
                        *orig_path = NULL;
        }
        return ERR_PTR(ret);
}

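/*
 * Usage sketch (illustrative of the calling convention used throughout
 * this file, not new code):
 *
 *      path = ext4_ext_find_extent(inode, lblk, NULL, 0);
 *      if (IS_ERR(path))
 *              return PTR_ERR(path);
 *      ...
 *      ext4_ext_drop_refs(path);
 *      kfree(path);
 *
 * i.e. on success the caller owns the path array and the buffer-head
 * references it holds, and must release both.
 */
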
/*
 * ext4_ext_insert_index:
 * insert new index [@logical;@ptr] into the block at @curp;
 * check where to insert: before @curp or after @curp
 */
static int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
                                 struct ext4_ext_path *curp,
                                 int logical, ext4_fsblk_t ptr)
{
        struct ext4_extent_idx *ix;
        int len, err;

        err = ext4_ext_get_access(handle, inode, curp);
        if (err)
                return err;

        if (unlikely(logical == le32_to_cpu(curp->p_idx->ei_block))) {
                EXT4_ERROR_INODE(inode,
                                 "logical %d == ei_block %d!",
                                 logical, le32_to_cpu(curp->p_idx->ei_block));
                return -EIO;
        }

        if (unlikely(le16_to_cpu(curp->p_hdr->eh_entries)
                             >= le16_to_cpu(curp->p_hdr->eh_max))) {
                EXT4_ERROR_INODE(inode,
                                 "eh_entries %d >= eh_max %d!",
                                 le16_to_cpu(curp->p_hdr->eh_entries),
                                 le16_to_cpu(curp->p_hdr->eh_max));
                return -EIO;
        }

        if (logical > le32_to_cpu(curp->p_idx->ei_block)) {
                /* insert after */
                ext_debug("insert new index %d after: %llu\n", logical, ptr);
                ix = curp->p_idx + 1;
        } else {
                /* insert before */
                ext_debug("insert new index %d before: %llu\n", logical, ptr);
                ix = curp->p_idx;
        }

        len = EXT_LAST_INDEX(curp->p_hdr) - ix + 1;
        BUG_ON(len < 0);
        if (len > 0) {
                ext_debug("insert new index %d: "
                                "move %d indices from 0x%p to 0x%p\n",
                                logical, len, ix, ix + 1);
                memmove(ix + 1, ix, len * sizeof(struct ext4_extent_idx));
        }

        if (unlikely(ix > EXT_MAX_INDEX(curp->p_hdr))) {
                EXT4_ERROR_INODE(inode, "ix > EXT_MAX_INDEX!");
                return -EIO;
        }

        ix->ei_block = cpu_to_le32(logical);
        ext4_idx_store_pblock(ix, ptr);
        le16_add_cpu(&curp->p_hdr->eh_entries, 1);

        if (unlikely(ix > EXT_LAST_INDEX(curp->p_hdr))) {
                EXT4_ERROR_INODE(inode, "ix > EXT_LAST_INDEX!");
                return -EIO;
        }

        err = ext4_ext_dirty(handle, inode, curp);
        ext4_std_error(inode->i_sb, err);

        return err;
}

/*
 * ext4_ext_split:
 * inserts new subtree into the path, using free index entry
 * at depth @at:
 * - allocates all needed blocks (new leaf and all intermediate index blocks)
 * - makes decision where to split
 * - moves remaining extents and index entries (right to the split point)
 *   into the newly allocated blocks
 * - initializes subtree
 */
static int ext4_ext_split(handle_t *handle, struct inode *inode,
                          unsigned int flags,
                          struct ext4_ext_path *path,
                          struct ext4_extent *newext, int at)
{
        struct buffer_head *bh = NULL;
        int depth = ext_depth(inode);
        struct ext4_extent_header *neh;
        struct ext4_extent_idx *fidx;
        int i = at, k, m, a;
        ext4_fsblk_t newblock, oldblock;
        __le32 border;
        ext4_fsblk_t *ablocks = NULL; /* array of allocated blocks */
        int err = 0;

        /* make decision: where to split? */
        /* FIXME: now decision is simplest: at current extent */

        /* if current leaf will be split, then we should use
         * border from split point */
        if (unlikely(path[depth].p_ext > EXT_MAX_EXTENT(path[depth].p_hdr))) {
                EXT4_ERROR_INODE(inode, "p_ext > EXT_MAX_EXTENT!");
                return -EIO;
        }
        if (path[depth].p_ext != EXT_MAX_EXTENT(path[depth].p_hdr)) {
                border = path[depth].p_ext[1].ee_block;
                ext_debug("leaf will be split."
                                " next leaf starts at %d\n",
                                  le32_to_cpu(border));
        } else {
                border = newext->ee_block;
                ext_debug("leaf will be added."
                                " next leaf starts at %d\n",
                                le32_to_cpu(border));
        }

        /*
         * If an error occurs, we break processing and mark the
         * filesystem read-only. The index won't be inserted and the
         * tree will remain in a consistent state. The next mount will
         * repair the buffers too.
         */

        /*
         * Get an array to track all the allocated blocks.
         * We need this to handle errors and free the blocks
         * on failure.
         */
        ablocks = kzalloc(sizeof(ext4_fsblk_t) * depth, GFP_NOFS);
        if (!ablocks)
                return -ENOMEM;

        /* allocate all needed blocks */
        ext_debug("allocate %d blocks for indexes/leaf\n", depth - at);
        for (a = 0; a < depth - at; a++) {
                newblock = ext4_ext_new_meta_block(handle, inode, path,
                                                   newext, &err, flags);
                if (newblock == 0)
                        goto cleanup;
                ablocks[a] = newblock;
        }

        /* initialize new leaf */
        newblock = ablocks[--a];
        if (unlikely(newblock == 0)) {
                EXT4_ERROR_INODE(inode, "newblock == 0!");
                err = -EIO;
                goto cleanup;
        }
        bh = sb_getblk(inode->i_sb, newblock);
        if (unlikely(!bh)) {
                err = -ENOMEM;
                goto cleanup;
        }
        lock_buffer(bh);

        err = ext4_journal_get_create_access(handle, bh);
        if (err)
                goto cleanup;

        neh = ext_block_hdr(bh);
        neh->eh_entries = 0;
        neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0));
        neh->eh_magic = EXT4_EXT_MAGIC;
        neh->eh_depth = 0;

        /* move remainder of path[depth] to the new leaf */
        if (unlikely(path[depth].p_hdr->eh_entries !=
                     path[depth].p_hdr->eh_max)) {
                EXT4_ERROR_INODE(inode, "eh_entries %d != eh_max %d!",
                                 path[depth].p_hdr->eh_entries,
                                 path[depth].p_hdr->eh_max);
                err = -EIO;
                goto cleanup;
        }
        /* start copy from next extent */
        m = EXT_MAX_EXTENT(path[depth].p_hdr) - path[depth].p_ext++;
        ext4_ext_show_move(inode, path, newblock, depth);
        if (m) {
                struct ext4_extent *ex;
                ex = EXT_FIRST_EXTENT(neh);
                memmove(ex, path[depth].p_ext, sizeof(struct ext4_extent) * m);
                le16_add_cpu(&neh->eh_entries, m);
        }

        ext4_extent_block_csum_set(inode, neh);
        set_buffer_uptodate(bh);
        unlock_buffer(bh);

        err = ext4_handle_dirty_metadata(handle, inode, bh);
        if (err)
                goto cleanup;
        brelse(bh);
        bh = NULL;

        /* correct old leaf */
        if (m) {
                err = ext4_ext_get_access(handle, inode, path + depth);
                if (err)
                        goto cleanup;
                le16_add_cpu(&path[depth].p_hdr->eh_entries, -m);
                err = ext4_ext_dirty(handle, inode, path + depth);
                if (err)
                        goto cleanup;
        }

        /* create intermediate indexes */
        k = depth - at - 1;
        if (unlikely(k < 0)) {
                EXT4_ERROR_INODE(inode, "k %d < 0!", k);
                err = -EIO;
                goto cleanup;
        }
        if (k)
                ext_debug("create %d intermediate indices\n", k);
        /* insert new index into current index block */
        /* current depth stored in i var */
        i = depth - 1;
        while (k--) {
                oldblock = newblock;
                newblock = ablocks[--a];
                bh = sb_getblk(inode->i_sb, newblock);
                if (unlikely(!bh)) {
                        err = -ENOMEM;
                        goto cleanup;
                }
                lock_buffer(bh);

                err = ext4_journal_get_create_access(handle, bh);
                if (err)
                        goto cleanup;

                neh = ext_block_hdr(bh);
                neh->eh_entries = cpu_to_le16(1);
                neh->eh_magic = EXT4_EXT_MAGIC;
                neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode, 0));
                neh->eh_depth = cpu_to_le16(depth - i);
                fidx = EXT_FIRST_INDEX(neh);
                fidx->ei_block = border;
                ext4_idx_store_pblock(fidx, oldblock);

                ext_debug("int.index at %d (block %llu): %u -> %llu\n",
                                i, newblock, le32_to_cpu(border), oldblock);

                /* move remainder of path[i] to the new index block */
                if (unlikely(EXT_MAX_INDEX(path[i].p_hdr) !=
                                        EXT_LAST_INDEX(path[i].p_hdr))) {
                        EXT4_ERROR_INODE(inode,
                                         "EXT_MAX_INDEX != EXT_LAST_INDEX ee_block %d!",
                                         le32_to_cpu(path[i].p_ext->ee_block));
                        err = -EIO;
                        goto cleanup;
                }
                /* start copy indexes */
                m = EXT_MAX_INDEX(path[i].p_hdr) - path[i].p_idx++;
                ext_debug("cur 0x%p, last 0x%p\n", path[i].p_idx,
                                EXT_MAX_INDEX(path[i].p_hdr));
                ext4_ext_show_move(inode, path, newblock, i);
                if (m) {
                        memmove(++fidx, path[i].p_idx,
                                sizeof(struct ext4_extent_idx) * m);
                        le16_add_cpu(&neh->eh_entries, m);
                }
                ext4_extent_block_csum_set(inode, neh);
                set_buffer_uptodate(bh);
                unlock_buffer(bh);

                err = ext4_handle_dirty_metadata(handle, inode, bh);
                if (err)
                        goto cleanup;
                brelse(bh);
                bh = NULL;

                /* correct old index */
                if (m) {
                        err = ext4_ext_get_access(handle, inode, path + i);
                        if (err)
                                goto cleanup;
                        le16_add_cpu(&path[i].p_hdr->eh_entries, -m);
                        err = ext4_ext_dirty(handle, inode, path + i);
                        if (err)
                                goto cleanup;
                }

                i--;
        }

        /* insert new index */
        err = ext4_ext_insert_index(handle, inode, path + at,
                                    le32_to_cpu(border), newblock);

cleanup:
        if (bh) {
                if (buffer_locked(bh))
                        unlock_buffer(bh);
                brelse(bh);
        }

        if (err) {
                /* free all allocated blocks in error case */
                for (i = 0; i < depth; i++) {
                        if (!ablocks[i])
                                continue;
                        ext4_free_blocks(handle, inode, NULL, ablocks[i], 1,
                                         EXT4_FREE_BLOCKS_METADATA);
                }
        }
        kfree(ablocks);

        return err;
}

/*
 * ext4_ext_grow_indepth:
 * implements tree growing procedure:
 * - allocates new block
 * - moves top-level data (index block or leaf) into the new block
 * - initializes new top-level, creating index that points to the
 *   just created block
 */
static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
                                 unsigned int flags,
                                 struct ext4_extent *newext)
{
        struct ext4_extent_header *neh;
        struct buffer_head *bh;
        ext4_fsblk_t newblock;
        int err = 0;

        newblock = ext4_ext_new_meta_block(handle, inode, NULL,
                newext, &err, flags);
        if (newblock == 0)
                return err;

        bh = sb_getblk(inode->i_sb, newblock);
        if (unlikely(!bh))
                return -ENOMEM;
        lock_buffer(bh);

        err = ext4_journal_get_create_access(handle, bh);
        if (err) {
                unlock_buffer(bh);
                goto out;
        }

        /* move top-level index/leaf into new block */
        memmove(bh->b_data, EXT4_I(inode)->i_data,
                sizeof(EXT4_I(inode)->i_data));

        /* set size of new block */
        neh = ext_block_hdr(bh);
        /* old root could have indexes or leaves
         * so calculate eh_max the right way */
        if (ext_depth(inode))
                neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode, 0));
        else
                neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0));
        neh->eh_magic = EXT4_EXT_MAGIC;
        ext4_extent_block_csum_set(inode, neh);
        set_buffer_uptodate(bh);
        unlock_buffer(bh);

        err = ext4_handle_dirty_metadata(handle, inode, bh);
        if (err)
                goto out;

        /* Update top-level index: num,max,pointer */
        neh = ext_inode_hdr(inode);
        neh->eh_entries = cpu_to_le16(1);
        ext4_idx_store_pblock(EXT_FIRST_INDEX(neh), newblock);
        if (neh->eh_depth == 0) {
                /* Root extent block becomes index block */
                neh->eh_max = cpu_to_le16(ext4_ext_space_root_idx(inode, 0));
                EXT_FIRST_INDEX(neh)->ei_block =
                        EXT_FIRST_EXTENT(neh)->ee_block;
        }
        ext_debug("new root: num %d(%d), lblock %d, ptr %llu\n",
                  le16_to_cpu(neh->eh_entries), le16_to_cpu(neh->eh_max),
                  le32_to_cpu(EXT_FIRST_INDEX(neh)->ei_block),
                  ext4_idx_pblock(EXT_FIRST_INDEX(neh)));

        le16_add_cpu(&neh->eh_depth, 1);
        ext4_mark_inode_dirty(handle, inode);
out:
        brelse(bh);

        return err;
}

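/*
 * Illustrative: after growing, the inode's 60-byte root holds a single
 * index entry pointing at the block that now contains the old root's
 * entries, and eh_depth is one larger; no leaf data moves on disk.
 */
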
/*
 * ext4_ext_create_new_leaf:
 * finds empty index and adds new leaf.
 * if no free index is found, then it requests growing the tree in depth.
 */
static int ext4_ext_create_new_leaf(handle_t *handle, struct inode *inode,
                                    unsigned int mb_flags,
                                    unsigned int gb_flags,
                                    struct ext4_ext_path *path,
                                    struct ext4_extent *newext)
{
        struct ext4_ext_path *curp;
        int depth, i, err = 0;

repeat:
        i = depth = ext_depth(inode);

        /* walk up the tree and look for a free index entry */
        curp = path + depth;
        while (i > 0 && !EXT_HAS_FREE_INDEX(curp)) {
                i--;
                curp--;
        }

        /* we use already allocated block for index block,
         * so subsequent data blocks should be contiguous */
        if (EXT_HAS_FREE_INDEX(curp)) {
                /* if we found index with free entry, then use that
                 * entry: create all needed subtree and add new leaf */
                err = ext4_ext_split(handle, inode, mb_flags, path, newext, i);
                if (err)
                        goto out;

                /* refill path */
                ext4_ext_drop_refs(path);
                path = ext4_ext_find_extent(inode,
                                    (ext4_lblk_t)le32_to_cpu(newext->ee_block),
                                    &path, gb_flags | EXT4_EX_NOFREE_ON_ERR);
                if (IS_ERR(path))
                        err = PTR_ERR(path);
        } else {
                /* tree is full, time to grow in depth */
                err = ext4_ext_grow_indepth(handle, inode, mb_flags, newext);
                if (err)
                        goto out;

                /* refill path */
                ext4_ext_drop_refs(path);
                path = ext4_ext_find_extent(inode,
                                   (ext4_lblk_t)le32_to_cpu(newext->ee_block),
                                    &path, gb_flags | EXT4_EX_NOFREE_ON_ERR);
                if (IS_ERR(path)) {
                        err = PTR_ERR(path);
                        goto out;
                }

                /*
                 * only first (depth 0 -> 1) produces free space;
                 * in all other cases we have to split the grown tree
                 */
                depth = ext_depth(inode);
                if (path[depth].p_hdr->eh_entries == path[depth].p_hdr->eh_max) {
                        /* now we need to split */
                        goto repeat;
                }
        }

out:
        return err;
}

1398 /*
1399  * search the closest allocated block to the left for *logical
1400  * and returns it at @logical + it's physical address at @phys
1401  * if *logical is the smallest allocated block, the function
1402  * returns 0 at @phys
1403  * return value contains 0 (success) or error code
1404  */
1405 static int ext4_ext_search_left(struct inode *inode,
1406                                 struct ext4_ext_path *path,
1407                                 ext4_lblk_t *logical, ext4_fsblk_t *phys)
1408 {
1409         struct ext4_extent_idx *ix;
1410         struct ext4_extent *ex;
1411         int depth, ee_len;
1412
1413         if (unlikely(path == NULL)) {
1414                 EXT4_ERROR_INODE(inode, "path == NULL *logical %d!", *logical);
1415                 return -EIO;
1416         }
1417         depth = path->p_depth;
1418         *phys = 0;
1419
1420         if (depth == 0 && path->p_ext == NULL)
1421                 return 0;
1422
1423         /* usually extent in the path covers blocks smaller
1424          * then *logical, but it can be that extent is the
1425          * first one in the file */
1426
1427         ex = path[depth].p_ext;
1428         ee_len = ext4_ext_get_actual_len(ex);
1429         if (*logical < le32_to_cpu(ex->ee_block)) {
1430                 if (unlikely(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex)) {
1431                         EXT4_ERROR_INODE(inode,
1432                                          "EXT_FIRST_EXTENT != ex *logical %d ee_block %d!",
1433                                          *logical, le32_to_cpu(ex->ee_block));
1434                         return -EIO;
1435                 }
1436                 while (--depth >= 0) {
1437                         ix = path[depth].p_idx;
1438                         if (unlikely(ix != EXT_FIRST_INDEX(path[depth].p_hdr))) {
1439                                 EXT4_ERROR_INODE(inode,
1440                                   "ix (%d) != EXT_FIRST_INDEX (%d) (depth %d)!",
1441                                   ix != NULL ? le32_to_cpu(ix->ei_block) : 0,
1442                                   EXT_FIRST_INDEX(path[depth].p_hdr) != NULL ?
1443                 le32_to_cpu(EXT_FIRST_INDEX(path[depth].p_hdr)->ei_block) : 0,
1444                                   depth);
1445                                 return -EIO;
1446                         }
1447                 }
1448                 return 0;
1449         }
1450
1451         if (unlikely(*logical < (le32_to_cpu(ex->ee_block) + ee_len))) {
1452                 EXT4_ERROR_INODE(inode,
1453                                  "logical %d < ee_block %d + ee_len %d!",
1454                                  *logical, le32_to_cpu(ex->ee_block), ee_len);
1455                 return -EIO;
1456         }
1457
1458         *logical = le32_to_cpu(ex->ee_block) + ee_len - 1;
1459         *phys = ext4_ext_pblock(ex) + ee_len - 1;
1460         return 0;
1461 }
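
/*
 * A minimal usage sketch (hypothetical helper; nothing in this file
 * calls it): assuming the caller holds i_data_sem and owns a path
 * obtained from ext4_ext_find_extent(), ext4_ext_search_left() can
 * supply a physical allocation goal near @lblk.
 */
#if 0
static ext4_fsblk_t example_left_alloc_hint(struct inode *inode,
					    struct ext4_ext_path *path,
					    ext4_lblk_t lblk)
{
	ext4_lblk_t logical = lblk;
	ext4_fsblk_t phys = 0;

	if (ext4_ext_search_left(inode, path, &logical, &phys))
		return 0;	/* corrupted tree: no usable hint */
	/* phys == 0 means no allocated block exists to the left */
	return phys ? phys + 1 : 0;
}
#endif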
1462
1463 /*
1464  * search the closest allocated block to the right of *logical
1465  * and return it at @logical, plus its physical address at @phys;
1466  * if *logical is the largest allocated block, the function
1467  * returns 0 at @phys.
1468  * The return value is 0 (success) or an error code.
1469  */
1470 static int ext4_ext_search_right(struct inode *inode,
1471                                  struct ext4_ext_path *path,
1472                                  ext4_lblk_t *logical, ext4_fsblk_t *phys,
1473                                  struct ext4_extent **ret_ex)
1474 {
1475         struct buffer_head *bh = NULL;
1476         struct ext4_extent_header *eh;
1477         struct ext4_extent_idx *ix;
1478         struct ext4_extent *ex;
1479         ext4_fsblk_t block;
1480         int depth;      /* Note, NOT eh_depth; depth from top of tree */
1481         int ee_len;
1482
1483         if (unlikely(path == NULL)) {
1484                 EXT4_ERROR_INODE(inode, "path == NULL *logical %d!", *logical);
1485                 return -EIO;
1486         }
1487         depth = path->p_depth;
1488         *phys = 0;
1489
1490         if (depth == 0 && path->p_ext == NULL)
1491                 return 0;
1492
1493         /* usually the extent in the path covers blocks smaller
1494          * than *logical, but it can be that the extent is the
1495          * first one in the file */
1496
1497         ex = path[depth].p_ext;
1498         ee_len = ext4_ext_get_actual_len(ex);
1499         if (*logical < le32_to_cpu(ex->ee_block)) {
1500                 if (unlikely(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex)) {
1501                         EXT4_ERROR_INODE(inode,
1502                                          "first_extent(path[%d].p_hdr) != ex",
1503                                          depth);
1504                         return -EIO;
1505                 }
1506                 while (--depth >= 0) {
1507                         ix = path[depth].p_idx;
1508                         if (unlikely(ix != EXT_FIRST_INDEX(path[depth].p_hdr))) {
1509                                 EXT4_ERROR_INODE(inode,
1510                                                  "ix != EXT_FIRST_INDEX *logical %d!",
1511                                                  *logical);
1512                                 return -EIO;
1513                         }
1514                 }
1515                 goto found_extent;
1516         }
1517
1518         if (unlikely(*logical < (le32_to_cpu(ex->ee_block) + ee_len))) {
1519                 EXT4_ERROR_INODE(inode,
1520                                  "logical %d < ee_block %d + ee_len %d!",
1521                                  *logical, le32_to_cpu(ex->ee_block), ee_len);
1522                 return -EIO;
1523         }
1524
1525         if (ex != EXT_LAST_EXTENT(path[depth].p_hdr)) {
1526                 /* next allocated block in this leaf */
1527                 ex++;
1528                 goto found_extent;
1529         }
1530
1531         /* go up and search for index to the right */
1532         while (--depth >= 0) {
1533                 ix = path[depth].p_idx;
1534                 if (ix != EXT_LAST_INDEX(path[depth].p_hdr))
1535                         goto got_index;
1536         }
1537
1538         /* we've gone up to the root and found no index to the right */
1539         return 0;
1540
1541 got_index:
1542         /* we've found index to the right, let's
1543          * follow it and find the closest allocated
1544          * block to the right */
1545         ix++;
1546         block = ext4_idx_pblock(ix);
1547         while (++depth < path->p_depth) {
1548                 /* subtract from p_depth to get proper eh_depth */
1549                 bh = read_extent_tree_block(inode, block,
1550                                             path->p_depth - depth, 0);
1551                 if (IS_ERR(bh))
1552                         return PTR_ERR(bh);
1553                 eh = ext_block_hdr(bh);
1554                 ix = EXT_FIRST_INDEX(eh);
1555                 block = ext4_idx_pblock(ix);
1556                 put_bh(bh);
1557         }
1558
1559         bh = read_extent_tree_block(inode, block, path->p_depth - depth, 0);
1560         if (IS_ERR(bh))
1561                 return PTR_ERR(bh);
1562         eh = ext_block_hdr(bh);
1563         ex = EXT_FIRST_EXTENT(eh);
1564 found_extent:
1565         *logical = le32_to_cpu(ex->ee_block);
1566         *phys = ext4_ext_pblock(ex);
1567         *ret_ex = ex;
1568         if (bh)
1569                 put_bh(bh);
1570         return 0;
1571 }
1572
1573 /*
1574  * ext4_ext_next_allocated_block:
1575  * returns the allocated block in the subsequent extent or EXT_MAX_BLOCKS.
1576  * NOTE: it considers the block number from an index entry as an
1577  * allocated block. Thus, index entries have to be consistent
1578  * with leaves.
1579  */
1580 ext4_lblk_t
1581 ext4_ext_next_allocated_block(struct ext4_ext_path *path)
1582 {
1583         int depth;
1584
1585         BUG_ON(path == NULL);
1586         depth = path->p_depth;
1587
1588         if (depth == 0 && path->p_ext == NULL)
1589                 return EXT_MAX_BLOCKS;
1590
1591         while (depth >= 0) {
1592                 if (depth == path->p_depth) {
1593                         /* leaf */
1594                         if (path[depth].p_ext &&
1595                                 path[depth].p_ext !=
1596                                         EXT_LAST_EXTENT(path[depth].p_hdr))
1597                           return le32_to_cpu(path[depth].p_ext[1].ee_block);
1598                 } else {
1599                         /* index */
1600                         if (path[depth].p_idx !=
1601                                         EXT_LAST_INDEX(path[depth].p_hdr))
1602                           return le32_to_cpu(path[depth].p_idx[1].ei_block);
1603                 }
1604                 depth--;
1605         }
1606
1607         return EXT_MAX_BLOCKS;
1608 }
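
/*
 * A minimal sketch (hypothetical helper) of the common pattern built on
 * ext4_ext_next_allocated_block(): sizing the hole that follows @lblk
 * when the path's extent does not cover it. Assumes the caller holds
 * i_data_sem and that @lblk lies before the next allocated block.
 */
#if 0
static ext4_lblk_t example_hole_len(struct ext4_ext_path *path,
				    ext4_lblk_t lblk)
{
	ext4_lblk_t next = ext4_ext_next_allocated_block(path);

	/* EXT_MAX_BLOCKS means nothing is allocated beyond this point */
	return next - lblk;
}
#endif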
1609
1610 /*
1611  * ext4_ext_next_leaf_block:
1612  * returns the first allocated block from the next leaf or EXT_MAX_BLOCKS
1613  */
1614 static ext4_lblk_t ext4_ext_next_leaf_block(struct ext4_ext_path *path)
1615 {
1616         int depth;
1617
1618         BUG_ON(path == NULL);
1619         depth = path->p_depth;
1620
1621         /* a depth-0 tree has no leaf blocks at all */
1622         if (depth == 0)
1623                 return EXT_MAX_BLOCKS;
1624
1625         /* go to index block */
1626         depth--;
1627
1628         while (depth >= 0) {
1629                 if (path[depth].p_idx !=
1630                                 EXT_LAST_INDEX(path[depth].p_hdr))
1631                         return (ext4_lblk_t)
1632                                 le32_to_cpu(path[depth].p_idx[1].ei_block);
1633                 depth--;
1634         }
1635
1636         return EXT_MAX_BLOCKS;
1637 }
1638
1639 /*
1640  * ext4_ext_correct_indexes:
1641  * if a leaf gets modified and the modified extent is the first in the leaf,
1642  * then we have to correct all indexes above.
1643  * TODO: do we need to correct the tree in all cases?
1644  */
1645 static int ext4_ext_correct_indexes(handle_t *handle, struct inode *inode,
1646                                 struct ext4_ext_path *path)
1647 {
1648         struct ext4_extent_header *eh;
1649         int depth = ext_depth(inode);
1650         struct ext4_extent *ex;
1651         __le32 border;
1652         int k, err = 0;
1653
1654         eh = path[depth].p_hdr;
1655         ex = path[depth].p_ext;
1656
1657         if (unlikely(ex == NULL || eh == NULL)) {
1658                 EXT4_ERROR_INODE(inode,
1659                                  "ex %p == NULL or eh %p == NULL", ex, eh);
1660                 return -EIO;
1661         }
1662
1663         if (depth == 0) {
1664                 /* there is no tree at all */
1665                 return 0;
1666         }
1667
1668         if (ex != EXT_FIRST_EXTENT(eh)) {
1669                 /* we correct tree if first leaf got modified only */
1670                 return 0;
1671         }
1672
1673         /*
1674          * TODO: we need correction if the border is smaller than the current one
1675          */
1676         k = depth - 1;
1677         border = path[depth].p_ext->ee_block;
1678         err = ext4_ext_get_access(handle, inode, path + k);
1679         if (err)
1680                 return err;
1681         path[k].p_idx->ei_block = border;
1682         err = ext4_ext_dirty(handle, inode, path + k);
1683         if (err)
1684                 return err;
1685
1686         while (k--) {
1687                 /* change all left-side indexes */
1688                 if (path[k+1].p_idx != EXT_FIRST_INDEX(path[k+1].p_hdr))
1689                         break;
1690                 err = ext4_ext_get_access(handle, inode, path + k);
1691                 if (err)
1692                         break;
1693                 path[k].p_idx->ei_block = border;
1694                 err = ext4_ext_dirty(handle, inode, path + k);
1695                 if (err)
1696                         break;
1697         }
1698
1699         return err;
1700 }
1701
1702 int
1703 ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,
1704                                 struct ext4_extent *ex2)
1705 {
1706         unsigned short ext1_ee_len, ext2_ee_len;
1707
1708         /*
1709          * Make sure both extents are in the same state (both initialized
1710          * or both unwritten); we don't merge an unwritten extent with an
1711          * initialized one, so the end_io code can be sure the written
1712          * extent is properly split out and its conversion is trivial.
1713          */
1714         if (ext4_ext_is_unwritten(ex1) != ext4_ext_is_unwritten(ex2))
1715                 return 0;
1716
1717         ext1_ee_len = ext4_ext_get_actual_len(ex1);
1718         ext2_ee_len = ext4_ext_get_actual_len(ex2);
1719
1720         if (le32_to_cpu(ex1->ee_block) + ext1_ee_len !=
1721                         le32_to_cpu(ex2->ee_block))
1722                 return 0;
1723
1724         /*
1725          * To allow future support for preallocated extents to be added
1726          * as an RO_COMPAT feature, refuse to merge two extents if
1727          * this can result in the top bit of ee_len being set.
1728          */
1729         if (ext1_ee_len + ext2_ee_len > EXT_INIT_MAX_LEN)
1730                 return 0;
1731         if (ext4_ext_is_unwritten(ex1) &&
1732             (ext4_test_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN) ||
1733              atomic_read(&EXT4_I(inode)->i_unwritten) ||
1734              (ext1_ee_len + ext2_ee_len > EXT_UNWRITTEN_MAX_LEN)))
1735                 return 0;
1736 #ifdef AGGRESSIVE_TEST
1737         if (ext1_ee_len >= 4)
1738                 return 0;
1739 #endif
1740
1741         if (ext4_ext_pblock(ex1) + ext1_ee_len == ext4_ext_pblock(ex2))
1742                 return 1;
1743         return 0;
1744 }
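
/*
 * Worked example of the checks above (illustrative values, with
 * AGGRESSIVE_TEST disabled): ex1 = {ee_block 100, len 8, pblk 5000}
 * and ex2 = {ee_block 108, len 4, pblk 5008} are mergeable, since both
 * are written, 100 + 8 == 108 keeps them logically contiguous,
 * 8 + 4 <= EXT_INIT_MAX_LEN, and 5000 + 8 == 5008 keeps them
 * physically contiguous. Moving ex2 to pblk 5009 would fail only the
 * final physical-contiguity test, and the extents would stay separate.
 */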
1745
1746 /*
1747  * This function tries to merge the "ex" extent with the next extent in the tree.
1748  * It always tries to merge towards the right. If you want to merge towards
1749  * the left, pass "ex - 1" as the argument instead of "ex".
1750  * Returns 0 if the extents (ex and ex+1) were _not_ merged and returns
1751  * 1 if they got merged.
1752  */
1753 static int ext4_ext_try_to_merge_right(struct inode *inode,
1754                                  struct ext4_ext_path *path,
1755                                  struct ext4_extent *ex)
1756 {
1757         struct ext4_extent_header *eh;
1758         unsigned int depth, len;
1759         int merge_done = 0, unwritten;
1760
1761         depth = ext_depth(inode);
1762         BUG_ON(path[depth].p_hdr == NULL);
1763         eh = path[depth].p_hdr;
1764
1765         while (ex < EXT_LAST_EXTENT(eh)) {
1766                 if (!ext4_can_extents_be_merged(inode, ex, ex + 1))
1767                         break;
1768                 /* merge with next extent! */
1769                 unwritten = ext4_ext_is_unwritten(ex);
1770                 ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
1771                                 + ext4_ext_get_actual_len(ex + 1));
1772                 if (unwritten)
1773                         ext4_ext_mark_unwritten(ex);
1774
1775                 if (ex + 1 < EXT_LAST_EXTENT(eh)) {
1776                         len = (EXT_LAST_EXTENT(eh) - ex - 1)
1777                                 * sizeof(struct ext4_extent);
1778                         memmove(ex + 1, ex + 2, len);
1779                 }
1780                 le16_add_cpu(&eh->eh_entries, -1);
1781                 merge_done = 1;
1782                 WARN_ON(eh->eh_entries == 0);
1783                 if (!eh->eh_entries)
1784                         EXT4_ERROR_INODE(inode, "eh->eh_entries = 0!");
1785         }
1786
1787         return merge_done;
1788 }
1789
1790 /*
1791  * This function does a very simple check to see if we can collapse
1792  * an extent tree with a single extent tree leaf block into the inode.
1793  */
1794 static void ext4_ext_try_to_merge_up(handle_t *handle,
1795                                      struct inode *inode,
1796                                      struct ext4_ext_path *path)
1797 {
1798         size_t s;
1799         unsigned max_root = ext4_ext_space_root(inode, 0);
1800         ext4_fsblk_t blk;
1801
1802         if ((path[0].p_depth != 1) ||
1803             (le16_to_cpu(path[0].p_hdr->eh_entries) != 1) ||
1804             (le16_to_cpu(path[1].p_hdr->eh_entries) > max_root))
1805                 return;
1806
1807         /*
1808          * We need to modify the block allocation bitmap and the block
1809          * group descriptor to release the extent tree block.  If we
1810          * can't get the journal credits, give up.
1811          */
1812         if (ext4_journal_extend(handle, 2))
1813                 return;
1814
1815         /*
1816          * Copy the extent data up to the inode
1817          */
1818         blk = ext4_idx_pblock(path[0].p_idx);
1819         s = le16_to_cpu(path[1].p_hdr->eh_entries) *
1820                 sizeof(struct ext4_extent_idx);
1821         s += sizeof(struct ext4_extent_header);
1822
1823         memcpy(path[0].p_hdr, path[1].p_hdr, s);
1824         path[0].p_depth = 0;
1825         path[0].p_ext = EXT_FIRST_EXTENT(path[0].p_hdr) +
1826                 (path[1].p_ext - EXT_FIRST_EXTENT(path[1].p_hdr));
1827         path[0].p_hdr->eh_max = cpu_to_le16(max_root);
1828
1829         brelse(path[1].p_bh);
1830         ext4_free_blocks(handle, inode, NULL, blk, 1,
1831                          EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET);
1832 }
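
/*
 * Worked example (assuming the standard 60-byte i_data area): the root
 * can hold (60 - 12) / 12 = 4 extents (max_root), so a depth-1 tree
 * whose single leaf has shrunk to 4 or fewer entries is copied back
 * into the inode, the depth drops to 0, and the leaf block is freed.
 */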
1833
1834 /*
1835  * This function tries to merge the @ex extent with its neighbours in the
1836  * tree, and then tries to collapse a single-leaf tree into the inode.
1837  */
1838 static void ext4_ext_try_to_merge(handle_t *handle,
1839                                   struct inode *inode,
1840                                   struct ext4_ext_path *path,
1841                                   struct ext4_extent *ex) {
1842         struct ext4_extent_header *eh;
1843         unsigned int depth;
1844         int merge_done = 0;
1845
1846         depth = ext_depth(inode);
1847         BUG_ON(path[depth].p_hdr == NULL);
1848         eh = path[depth].p_hdr;
1849
1850         if (ex > EXT_FIRST_EXTENT(eh))
1851                 merge_done = ext4_ext_try_to_merge_right(inode, path, ex - 1);
1852
1853         if (!merge_done)
1854                 (void) ext4_ext_try_to_merge_right(inode, path, ex);
1855
1856         ext4_ext_try_to_merge_up(handle, inode, path);
1857 }
1858
1859 /*
1860  * check if a portion of the "newext" extent overlaps with an
1861  * existing extent.
1862  *
1863  * If an overlap is discovered, it updates the length of newext
1864  * such that there will be no overlap, and then returns 1.
1865  * If there is no overlap found, it returns 0.
1866  */
1867 static unsigned int ext4_ext_check_overlap(struct ext4_sb_info *sbi,
1868                                            struct inode *inode,
1869                                            struct ext4_extent *newext,
1870                                            struct ext4_ext_path *path)
1871 {
1872         ext4_lblk_t b1, b2;
1873         unsigned int depth, len1;
1874         unsigned int ret = 0;
1875
1876         b1 = le32_to_cpu(newext->ee_block);
1877         len1 = ext4_ext_get_actual_len(newext);
1878         depth = ext_depth(inode);
1879         if (!path[depth].p_ext)
1880                 goto out;
1881         b2 = EXT4_LBLK_CMASK(sbi, le32_to_cpu(path[depth].p_ext->ee_block));
1882
1883         /*
1884          * get the next allocated block if the extent in the path
1885          * is before the requested block(s)
1886          */
1887         if (b2 < b1) {
1888                 b2 = ext4_ext_next_allocated_block(path);
1889                 if (b2 == EXT_MAX_BLOCKS)
1890                         goto out;
1891                 b2 = EXT4_LBLK_CMASK(sbi, b2);
1892         }
1893
1894         /* check for wrap through zero on extent logical start block */
1895         if (b1 + len1 < b1) {
1896                 len1 = EXT_MAX_BLOCKS - b1;
1897                 newext->ee_len = cpu_to_le16(len1);
1898                 ret = 1;
1899         }
1900
1901         /* check for overlap */
1902         if (b1 + len1 > b2) {
1903                 newext->ee_len = cpu_to_le16(b2 - b1);
1904                 ret = 1;
1905         }
1906 out:
1907         return ret;
1908 }
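
/*
 * Worked example (illustrative, non-bigalloc, so EXT4_LBLK_CMASK is the
 * identity): newext covers blocks 50..69 (b1 = 50, len1 = 20) while the
 * extent in the path starts at b2 = 60. Since b1 + len1 = 70 > b2, the
 * tail overlaps; newext->ee_len is trimmed to b2 - b1 = 10 (blocks
 * 50..59) and the function returns 1.
 */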
1909
1910 /*
1911  * ext4_ext_insert_extent:
1912  * tries to merge the requested extent into an existing extent or
1913  * inserts the requested extent as a new one into the tree,
1914  * creating a new leaf in the no-space case.
1915  */
1916 int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
1917                                 struct ext4_ext_path *path,
1918                                 struct ext4_extent *newext, int gb_flags)
1919 {
1920         struct ext4_extent_header *eh;
1921         struct ext4_extent *ex, *fex;
1922         struct ext4_extent *nearex; /* nearest extent */
1923         struct ext4_ext_path *npath = NULL;
1924         int depth, len, err;
1925         ext4_lblk_t next;
1926         int mb_flags = 0, unwritten;
1927
1928         if (unlikely(ext4_ext_get_actual_len(newext) == 0)) {
1929                 EXT4_ERROR_INODE(inode, "ext4_ext_get_actual_len(newext) == 0");
1930                 return -EIO;
1931         }
1932         depth = ext_depth(inode);
1933         ex = path[depth].p_ext;
1934         eh = path[depth].p_hdr;
1935         if (unlikely(path[depth].p_hdr == NULL)) {
1936                 EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth);
1937                 return -EIO;
1938         }
1939
1940         /* try to insert block into found extent and return */
1941         if (ex && !(gb_flags & EXT4_GET_BLOCKS_PRE_IO)) {
1942
1943                 /*
1944                  * Try to see whether we should rather test the extent on
1945                  * right from ex, or from the left of ex. This is because
1946                  * ext4_ext_find_extent() can return either extent on the
1947                  * left, or on the right from the searched position. This
1948                  * will make merging more effective.
1949                  */
1950                 if (ex < EXT_LAST_EXTENT(eh) &&
1951                     (le32_to_cpu(ex->ee_block) +
1952                     ext4_ext_get_actual_len(ex) <
1953                     le32_to_cpu(newext->ee_block))) {
1954                         ex += 1;
1955                         goto prepend;
1956                 } else if ((ex > EXT_FIRST_EXTENT(eh)) &&
1957                            (le32_to_cpu(newext->ee_block) +
1958                            ext4_ext_get_actual_len(newext) <
1959                            le32_to_cpu(ex->ee_block)))
1960                         ex -= 1;
1961
1962                 /* Try to append newex to the ex */
1963                 if (ext4_can_extents_be_merged(inode, ex, newext)) {
1964                         ext_debug("append [%d]%d block to %u:[%d]%d"
1965                                   "(from %llu)\n",
1966                                   ext4_ext_is_unwritten(newext),
1967                                   ext4_ext_get_actual_len(newext),
1968                                   le32_to_cpu(ex->ee_block),
1969                                   ext4_ext_is_unwritten(ex),
1970                                   ext4_ext_get_actual_len(ex),
1971                                   ext4_ext_pblock(ex));
1972                         err = ext4_ext_get_access(handle, inode,
1973                                                   path + depth);
1974                         if (err)
1975                                 return err;
1976                         unwritten = ext4_ext_is_unwritten(ex);
1977                         ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
1978                                         + ext4_ext_get_actual_len(newext));
1979                         if (unwritten)
1980                                 ext4_ext_mark_unwritten(ex);
1981                         eh = path[depth].p_hdr;
1982                         nearex = ex;
1983                         goto merge;
1984                 }
1985
1986 prepend:
1987                 /* Try to prepend newex to the ex */
1988                 if (ext4_can_extents_be_merged(inode, newext, ex)) {
1989                         ext_debug("prepend %u[%d]%d block to %u:[%d]%d"
1990                                   "(from %llu)\n",
1991                                   le32_to_cpu(newext->ee_block),
1992                                   ext4_ext_is_unwritten(newext),
1993                                   ext4_ext_get_actual_len(newext),
1994                                   le32_to_cpu(ex->ee_block),
1995                                   ext4_ext_is_unwritten(ex),
1996                                   ext4_ext_get_actual_len(ex),
1997                                   ext4_ext_pblock(ex));
1998                         err = ext4_ext_get_access(handle, inode,
1999                                                   path + depth);
2000                         if (err)
2001                                 return err;
2002
2003                         unwritten = ext4_ext_is_unwritten(ex);
2004                         ex->ee_block = newext->ee_block;
2005                         ext4_ext_store_pblock(ex, ext4_ext_pblock(newext));
2006                         ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
2007                                         + ext4_ext_get_actual_len(newext));
2008                         if (unwritten)
2009                                 ext4_ext_mark_unwritten(ex);
2010                         eh = path[depth].p_hdr;
2011                         nearex = ex;
2012                         goto merge;
2013                 }
2014         }
2015
2016         depth = ext_depth(inode);
2017         eh = path[depth].p_hdr;
2018         if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max))
2019                 goto has_space;
2020
2021         /* perhaps the next leaf has space for us? */
2022         fex = EXT_LAST_EXTENT(eh);
2023         next = EXT_MAX_BLOCKS;
2024         if (le32_to_cpu(newext->ee_block) > le32_to_cpu(fex->ee_block))
2025                 next = ext4_ext_next_leaf_block(path);
2026         if (next != EXT_MAX_BLOCKS) {
2027                 ext_debug("next leaf block - %u\n", next);
2028                 BUG_ON(npath != NULL);
2029                 npath = ext4_ext_find_extent(inode, next, NULL, 0);
2030                 if (IS_ERR(npath))
2031                         return PTR_ERR(npath);
2032                 BUG_ON(npath->p_depth != path->p_depth);
2033                 eh = npath[depth].p_hdr;
2034                 if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max)) {
2035                         ext_debug("next leaf isn't full(%d)\n",
2036                                   le16_to_cpu(eh->eh_entries));
2037                         path = npath;
2038                         goto has_space;
2039                 }
2040                 ext_debug("next leaf has no free space(%d,%d)\n",
2041                           le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max));
2042         }
2043
2044         /*
2045          * There is no free space in the found leaf.
2046          * We're going to add a new leaf to the tree.
2047          */
2048         if (gb_flags & EXT4_GET_BLOCKS_METADATA_NOFAIL)
2049                 mb_flags = EXT4_MB_USE_RESERVED;
2050         err = ext4_ext_create_new_leaf(handle, inode, mb_flags, gb_flags,
2051                                        path, newext);
2052         if (err)
2053                 goto cleanup;
2054         depth = ext_depth(inode);
2055         eh = path[depth].p_hdr;
2056
2057 has_space:
2058         nearex = path[depth].p_ext;
2059
2060         err = ext4_ext_get_access(handle, inode, path + depth);
2061         if (err)
2062                 goto cleanup;
2063
2064         if (!nearex) {
2065                 /* there is no extent in this leaf, create first one */
2066                 ext_debug("first extent in the leaf: %u:%llu:[%d]%d\n",
2067                                 le32_to_cpu(newext->ee_block),
2068                                 ext4_ext_pblock(newext),
2069                                 ext4_ext_is_unwritten(newext),
2070                                 ext4_ext_get_actual_len(newext));
2071                 nearex = EXT_FIRST_EXTENT(eh);
2072         } else {
2073                 if (le32_to_cpu(newext->ee_block)
2074                            > le32_to_cpu(nearex->ee_block)) {
2075                         /* Insert after */
2076                         ext_debug("insert %u:%llu:[%d]%d after: "
2077                                         "nearest %p\n",
2078                                         le32_to_cpu(newext->ee_block),
2079                                         ext4_ext_pblock(newext),
2080                                         ext4_ext_is_unwritten(newext),
2081                                         ext4_ext_get_actual_len(newext),
2082                                         nearex);
2083                         nearex++;
2084                 } else {
2085                         /* Insert before */
2086                         BUG_ON(newext->ee_block == nearex->ee_block);
2087                         ext_debug("insert %u:%llu:[%d]%d before: "
2088                                         "nearest %p\n",
2089                                         le32_to_cpu(newext->ee_block),
2090                                         ext4_ext_pblock(newext),
2091                                         ext4_ext_is_unwritten(newext),
2092                                         ext4_ext_get_actual_len(newext),
2093                                         nearex);
2094                 }
2095                 len = EXT_LAST_EXTENT(eh) - nearex + 1;
2096                 if (len > 0) {
2097                         ext_debug("insert %u:%llu:[%d]%d: "
2098                                         "move %d extents from 0x%p to 0x%p\n",
2099                                         le32_to_cpu(newext->ee_block),
2100                                         ext4_ext_pblock(newext),
2101                                         ext4_ext_is_unwritten(newext),
2102                                         ext4_ext_get_actual_len(newext),
2103                                         len, nearex, nearex + 1);
2104                         memmove(nearex + 1, nearex,
2105                                 len * sizeof(struct ext4_extent));
2106                 }
2107         }
2108
2109         le16_add_cpu(&eh->eh_entries, 1);
2110         path[depth].p_ext = nearex;
2111         nearex->ee_block = newext->ee_block;
2112         ext4_ext_store_pblock(nearex, ext4_ext_pblock(newext));
2113         nearex->ee_len = newext->ee_len;
2114
2115 merge:
2116         /* try to merge extents */
2117         if (!(gb_flags & EXT4_GET_BLOCKS_PRE_IO))
2118                 ext4_ext_try_to_merge(handle, inode, path, nearex);
2119
2120
2121         /* time to correct all indexes above */
2122         err = ext4_ext_correct_indexes(handle, inode, path);
2123         if (err)
2124                 goto cleanup;
2125
2126         err = ext4_ext_dirty(handle, inode, path + path->p_depth);
2127
2128 cleanup:
2129         if (npath) {
2130                 ext4_ext_drop_refs(npath);
2131                 kfree(npath);
2132         }
2133         return err;
2134 }
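
/*
 * A minimal call-pattern sketch (hypothetical helper; flags, locking
 * and error handling are simplified): the typical sequence is to look
 * up a path, fill in the new extent, and let ext4_ext_insert_extent()
 * merge or insert it. Assumes the caller holds i_data_sem for writing
 * and a journal handle with enough credits.
 */
#if 0
static int example_insert_written_extent(handle_t *handle,
					 struct inode *inode,
					 ext4_lblk_t lblk,
					 ext4_fsblk_t pblk,
					 unsigned int len)
{
	struct ext4_ext_path *path;
	struct ext4_extent newext;
	int err;

	path = ext4_ext_find_extent(inode, lblk, NULL, 0);
	if (IS_ERR(path))
		return PTR_ERR(path);

	newext.ee_block = cpu_to_le32(lblk);
	ext4_ext_store_pblock(&newext, pblk);
	newext.ee_len = cpu_to_le16(len);

	err = ext4_ext_insert_extent(handle, inode, path, &newext, 0);
	ext4_ext_drop_refs(path);
	kfree(path);
	return err;
}
#endif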
2135
2136 static int ext4_fill_fiemap_extents(struct inode *inode,
2137                                     ext4_lblk_t block, ext4_lblk_t num,
2138                                     struct fiemap_extent_info *fieinfo)
2139 {
2140         struct ext4_ext_path *path = NULL;
2141         struct ext4_extent *ex;
2142         struct extent_status es;
2143         ext4_lblk_t next, next_del, start = 0, end = 0;
2144         ext4_lblk_t last = block + num;
2145         int exists, depth = 0, err = 0;
2146         unsigned int flags = 0;
2147         unsigned char blksize_bits = inode->i_sb->s_blocksize_bits;
2148
2149         while (block < last && block != EXT_MAX_BLOCKS) {
2150                 num = last - block;
2151                 /* find extent for this block */
2152                 down_read(&EXT4_I(inode)->i_data_sem);
2153
2154                 if (path && ext_depth(inode) != depth) {
2155                         /* depth has changed; we have to realloc the path */
2156                         kfree(path);
2157                         path = NULL;
2158                 }
2159
2160                 path = ext4_ext_find_extent(inode, block, &path, 0);
2161                 if (IS_ERR(path)) {
2162                         up_read(&EXT4_I(inode)->i_data_sem);
2163                         err = PTR_ERR(path);
2164                         path = NULL;
2165                         break;
2166                 }
2167
2168                 depth = ext_depth(inode);
2169                 if (unlikely(path[depth].p_hdr == NULL)) {
2170                         up_read(&EXT4_I(inode)->i_data_sem);
2171                         EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth);
2172                         err = -EIO;
2173                         break;
2174                 }
2175                 ex = path[depth].p_ext;
2176                 next = ext4_ext_next_allocated_block(path);
2177                 ext4_ext_drop_refs(path);
2178
2179                 flags = 0;
2180                 exists = 0;
2181                 if (!ex) {
2182                         /* there is no extent yet, so try to allocate
2183                          * all requested space */
2184                         start = block;
2185                         end = block + num;
2186                 } else if (le32_to_cpu(ex->ee_block) > block) {
2187                         /* need to allocate space before found extent */
2188                         start = block;
2189                         end = le32_to_cpu(ex->ee_block);
2190                         if (block + num < end)
2191                                 end = block + num;
2192                 } else if (block >= le32_to_cpu(ex->ee_block)
2193                                         + ext4_ext_get_actual_len(ex)) {
2194                         /* need to allocate space after found extent */
2195                         start = block;
2196                         end = block + num;
2197                         if (end >= next)
2198                                 end = next;
2199                 } else if (block >= le32_to_cpu(ex->ee_block)) {
2200                         /*
2201                          * some part of requested space is covered
2202                          * by found extent
2203                          */
2204                         start = block;
2205                         end = le32_to_cpu(ex->ee_block)
2206                                 + ext4_ext_get_actual_len(ex);
2207                         if (block + num < end)
2208                                 end = block + num;
2209                         exists = 1;
2210                 } else {
2211                         BUG();
2212                 }
2213                 BUG_ON(end <= start);
2214
2215                 if (!exists) {
2216                         es.es_lblk = start;
2217                         es.es_len = end - start;
2218                         es.es_pblk = 0;
2219                 } else {
2220                         es.es_lblk = le32_to_cpu(ex->ee_block);
2221                         es.es_len = ext4_ext_get_actual_len(ex);
2222                         es.es_pblk = ext4_ext_pblock(ex);
2223                         if (ext4_ext_is_unwritten(ex))
2224                                 flags |= FIEMAP_EXTENT_UNWRITTEN;
2225                 }
2226
2227                 /*
2228                  * Find delayed extent and update es accordingly. We call
2229                  * it even in !exists case to find out whether es is the
2230                  * last existing extent or not.
2231                  */
2232                 next_del = ext4_find_delayed_extent(inode, &es);
2233                 if (!exists && next_del) {
2234                         exists = 1;
2235                         flags |= (FIEMAP_EXTENT_DELALLOC |
2236                                   FIEMAP_EXTENT_UNKNOWN);
2237                 }
2238                 up_read(&EXT4_I(inode)->i_data_sem);
2239
2240                 if (unlikely(es.es_len == 0)) {
2241                         EXT4_ERROR_INODE(inode, "es.es_len == 0");
2242                         err = -EIO;
2243                         break;
2244                 }
2245
2246                 /*
2247                  * This is possible iff next == next_del == EXT_MAX_BLOCKS.
2248                  * We need to check next == EXT_MAX_BLOCKS because it is
2249                  * possible that an extent has both unwritten and delayed
2250                  * status: when a delayed-allocated extent is also
2251                  * allocated by fallocate, the status tree will track
2252                  * both states in a single extent.
2253                  *
2254                  * So we could return an unwritten and delayed extent
2255                  * whose start block is equal to 'next'.
2256                  */
2257                 if (next == next_del && next == EXT_MAX_BLOCKS) {
2258                         flags |= FIEMAP_EXTENT_LAST;
2259                         if (unlikely(next_del != EXT_MAX_BLOCKS ||
2260                                      next != EXT_MAX_BLOCKS)) {
2261                                 EXT4_ERROR_INODE(inode,
2262                                                  "next extent == %u, next "
2263                                                  "delalloc extent = %u",
2264                                                  next, next_del);
2265                                 err = -EIO;
2266                                 break;
2267                         }
2268                 }
2269
2270                 if (exists) {
2271                         err = fiemap_fill_next_extent(fieinfo,
2272                                 (__u64)es.es_lblk << blksize_bits,
2273                                 (__u64)es.es_pblk << blksize_bits,
2274                                 (__u64)es.es_len << blksize_bits,
2275                                 flags);
2276                         if (err < 0)
2277                                 break;
2278                         if (err == 1) {
2279                                 err = 0;
2280                                 break;
2281                         }
2282                 }
2283
2284                 block = es.es_lblk + es.es_len;
2285         }
2286
2287         if (path) {
2288                 ext4_ext_drop_refs(path);
2289                 kfree(path);
2290         }
2291
2292         return err;
2293 }
2294
2295 /*
2296  * ext4_ext_put_gap_in_cache:
2297  * calculates the boundaries of the gap that the requested block fits into
2298  * and caches this gap
2299  */
2300 static void
2301 ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path,
2302                                 ext4_lblk_t block)
2303 {
2304         int depth = ext_depth(inode);
2305         unsigned long len = 0;
2306         ext4_lblk_t lblock = 0;
2307         struct ext4_extent *ex;
2308
2309         ex = path[depth].p_ext;
2310         if (ex == NULL) {
2311                 /*
2312                  * there is no extent yet, so gap is [0;-] and we
2313                  * don't cache it
2314                  */
2315                 ext_debug("cache gap(whole file):");
2316         } else if (block < le32_to_cpu(ex->ee_block)) {
2317                 lblock = block;
2318                 len = le32_to_cpu(ex->ee_block) - block;
2319                 ext_debug("cache gap(before): %u [%u:%u]",
2320                                 block,
2321                                 le32_to_cpu(ex->ee_block),
2322                                  ext4_ext_get_actual_len(ex));
2323                 if (!ext4_find_delalloc_range(inode, lblock, lblock + len - 1))
2324                         ext4_es_insert_extent(inode, lblock, len, ~0,
2325                                               EXTENT_STATUS_HOLE);
2326         } else if (block >= le32_to_cpu(ex->ee_block)
2327                         + ext4_ext_get_actual_len(ex)) {
2328                 ext4_lblk_t next;
2329                 lblock = le32_to_cpu(ex->ee_block)
2330                         + ext4_ext_get_actual_len(ex);
2331
2332                 next = ext4_ext_next_allocated_block(path);
2333                 ext_debug("cache gap(after): [%u:%u] %u",
2334                                 le32_to_cpu(ex->ee_block),
2335                                 ext4_ext_get_actual_len(ex),
2336                                 block);
2337                 BUG_ON(next == lblock);
2338                 len = next - lblock;
2339                 if (!ext4_find_delalloc_range(inode, lblock, lblock + len - 1))
2340                         ext4_es_insert_extent(inode, lblock, len, ~0,
2341                                               EXTENT_STATUS_HOLE);
2342         } else {
2343                 BUG();
2344         }
2345
2346         ext_debug(" -> %u:%lu\n", lblock, len);
2347 }
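
/*
 * Example (illustrative values): if the path's extent covers blocks
 * 200..209 and the lookup was for block 150, the gap cached as a hole
 * is [150, 200), i.e. lblock = 150, len = 50. For a lookup at block
 * 250 the gap instead runs from 210 to the next allocated block; in
 * both cases the hole is only cached if no delalloc blocks fall
 * within the range.
 */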
2348
2349 /*
2350  * ext4_ext_rm_idx:
2351  * removes index from the index block.
2352  */
2353 static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode,
2354                         struct ext4_ext_path *path, int depth)
2355 {
2356         int err;
2357         ext4_fsblk_t leaf;
2358
2359         /* free index block */
2360         depth--;
2361         path = path + depth;
2362         leaf = ext4_idx_pblock(path->p_idx);
2363         if (unlikely(path->p_hdr->eh_entries == 0)) {
2364                 EXT4_ERROR_INODE(inode, "path->p_hdr->eh_entries == 0");
2365                 return -EIO;
2366         }
2367         err = ext4_ext_get_access(handle, inode, path);
2368         if (err)
2369                 return err;
2370
2371         if (path->p_idx != EXT_LAST_INDEX(path->p_hdr)) {
2372                 int len = EXT_LAST_INDEX(path->p_hdr) - path->p_idx;
2373                 len *= sizeof(struct ext4_extent_idx);
2374                 memmove(path->p_idx, path->p_idx + 1, len);
2375         }
2376
2377         le16_add_cpu(&path->p_hdr->eh_entries, -1);
2378         err = ext4_ext_dirty(handle, inode, path);
2379         if (err)
2380                 return err;
2381         ext_debug("index is empty, remove it, free block %llu\n", leaf);
2382         trace_ext4_ext_rm_idx(inode, leaf);
2383
2384         ext4_free_blocks(handle, inode, NULL, leaf, 1,
2385                          EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET);
2386
2387         while (--depth >= 0) {
2388                 if (path->p_idx != EXT_FIRST_INDEX(path->p_hdr))
2389                         break;
2390                 path--;
2391                 err = ext4_ext_get_access(handle, inode, path);
2392                 if (err)
2393                         break;
2394                 path->p_idx->ei_block = (path+1)->p_idx->ei_block;
2395                 err = ext4_ext_dirty(handle, inode, path);
2396                 if (err)
2397                         break;
2398         }
2399         return err;
2400 }
2401
2402 /*
2403  * ext4_ext_calc_credits_for_single_extent:
2404  * This routine returns the maximum number of credits needed to insert
2405  * an extent into the extent tree.
2406  * When passing an actual path, the caller should calculate the credits
2407  * under i_data_sem.
2408  */
2409 int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int nrblocks,
2410                                                 struct ext4_ext_path *path)
2411 {
2412         if (path) {
2413                 int depth = ext_depth(inode);
2414                 int ret = 0;
2415
2416                 /* perhaps there is space in the leaf? */
2417                 if (le16_to_cpu(path[depth].p_hdr->eh_entries)
2418                                 < le16_to_cpu(path[depth].p_hdr->eh_max)) {
2419
2420                         /*
2421                          *  There is some space in the leaf, so there is
2422                          *  no need to account for the leaf block credit.
2423                          *
2424                          *  Bitmaps, block group descriptor blocks,
2425                          *  and other metadata blocks still need to be
2426                          *  accounted for.
2427                          */
2428                         /* 1 bitmap, 1 block group descriptor */
2429                         ret = 2 + EXT4_META_TRANS_BLOCKS(inode->i_sb);
2430                         return ret;
2431                 }
2432         }
2433
2434         return ext4_chunk_trans_blocks(inode, nrblocks);
2435 }
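
/*
 * Example (illustrative; the exact value of EXT4_META_TRANS_BLOCKS()
 * depends on the filesystem configuration): if it evaluates to, say,
 * 24, a caller holding a path with a free slot in the leaf is charged
 * 2 + 24 = 26 credits, independent of nrblocks. With no path, or with
 * a full leaf, the estimate falls back to ext4_chunk_trans_blocks(),
 * which also allows for a tree split.
 */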
2436
2437 /*
2438  * How many index/leaf blocks need to change/allocate to add @extents extents?
2439  *
2440  * If we add a single extent, then in the worst case, each tree level
2441  * index/leaf needs to be changed in case the tree splits.
2442  *
2443  * If more extents are inserted, they could cause the whole tree to split
2444  * more than once, but this is really rare.
2445  */
2446 int ext4_ext_index_trans_blocks(struct inode *inode, int extents)
2447 {
2448         int index;
2449         int depth;
2450
2451         /* If we are converting inline data, only one block is needed here. */
2452         if (ext4_has_inline_data(inode))
2453                 return 1;
2454
2455         depth = ext_depth(inode);
2456
2457         if (extents <= 1)
2458                 index = depth * 2;
2459         else
2460                 index = depth * 3;
2461
2462         return index;
2463 }
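
/*
 * Example: for a tree of depth 2, adding a single extent is charged
 * index = 2 * 2 = 4 blocks (each level may be split once on the way
 * down), while adding several extents is charged 2 * 3 = 6 to allow
 * for the rare case of a level splitting more than once.
 */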
2464
2465 static inline int get_default_free_blocks_flags(struct inode *inode)
2466 {
2467         if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
2468                 return EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET;
2469         else if (ext4_should_journal_data(inode))
2470                 return EXT4_FREE_BLOCKS_FORGET;
2471         return 0;
2472 }
2473
2474 static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
2475                               struct ext4_extent *ex,
2476                               long long *partial_cluster,
2477                               ext4_lblk_t from, ext4_lblk_t to)
2478 {
2479         struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
2480         unsigned short ee_len =  ext4_ext_get_actual_len(ex);
2481         ext4_fsblk_t pblk;
2482         int flags = get_default_free_blocks_flags(inode);
2483
2484         /*
2485          * For bigalloc file systems, we never free a partial cluster
2486          * at the beginning of the extent.  Instead, we make a note
2487          * that we tried freeing the cluster, and check to see if we
2488          * need to free it on a subsequent call to ext4_remove_blocks,
2489          * or at the end of the ext4_truncate() operation.
2490          */
2491         flags |= EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER;
2492
2493         trace_ext4_remove_blocks(inode, ex, from, to, *partial_cluster);
2494         /*
2495          * If we have a partial cluster, and it's different from the
2496          * cluster of the last block, we need to explicitly free the
2497          * partial cluster here.
2498          */
2499         pblk = ext4_ext_pblock(ex) + ee_len - 1;
2500         if ((*partial_cluster > 0) &&
2501             (EXT4_B2C(sbi, pblk) != *partial_cluster)) {
2502                 ext4_free_blocks(handle, inode, NULL,
2503                                  EXT4_C2B(sbi, *partial_cluster),
2504                                  sbi->s_cluster_ratio, flags);
2505                 *partial_cluster = 0;
2506         }
2507
2508 #ifdef EXTENTS_STATS
2509         {
2510                 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
2511                 spin_lock(&sbi->s_ext_stats_lock);
2512                 sbi->s_ext_blocks += ee_len;
2513                 sbi->s_ext_extents++;
2514                 if (ee_len < sbi->s_ext_min)
2515                         sbi->s_ext_min = ee_len;
2516                 if (ee_len > sbi->s_ext_max)
2517                         sbi->s_ext_max = ee_len;
2518                 if (ext_depth(inode) > sbi->s_depth_max)
2519                         sbi->s_depth_max = ext_depth(inode);
2520                 spin_unlock(&sbi->s_ext_stats_lock);
2521         }
2522 #endif
2523         if (from >= le32_to_cpu(ex->ee_block)
2524             && to == le32_to_cpu(ex->ee_block) + ee_len - 1) {
2525                 /* tail removal */
2526                 ext4_lblk_t num;
2527                 unsigned int unaligned;
2528
2529                 num = le32_to_cpu(ex->ee_block) + ee_len - from;
2530                 pblk = ext4_ext_pblock(ex) + ee_len - num;
2531                 /*
2532                  * Usually we want to free the partial cluster at the end of the
2533                  * extent, except for the situation when the cluster is still
2534                  * used by any other extent (partial_cluster is negative).
2535                  */
2536                 if (*partial_cluster < 0 &&
2537                     -(*partial_cluster) == EXT4_B2C(sbi, pblk + num - 1))
2538                         flags |= EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER;
2539
2540                 ext_debug("free last %u blocks starting %llu partial %lld\n",
2541                           num, pblk, *partial_cluster);
2542                 ext4_free_blocks(handle, inode, NULL, pblk, num, flags);
2543                 /*
2544                  * If the block range to be freed didn't start at the
2545                  * beginning of a cluster, and we removed the entire
2546                  * extent and the cluster is not used by any other extent,
2547                  * save the partial cluster here, since we might need to
2548                  * delete if we determine that the truncate operation has
2549                  * removed all of the blocks in the cluster.
2550                  *
2551                  * On the other hand, if we did not manage to free the whole
2552                  * extent, we have to mark the cluster as used (store negative
2553                  * cluster number in partial_cluster).
2554                  */
2555                 unaligned = EXT4_PBLK_COFF(sbi, pblk);
2556                 if (unaligned && (ee_len == num) &&
2557                     (*partial_cluster != -((long long)EXT4_B2C(sbi, pblk))))
2558                         *partial_cluster = EXT4_B2C(sbi, pblk);
2559                 else if (unaligned)
2560                         *partial_cluster = -((long long)EXT4_B2C(sbi, pblk));
2561                 else if (*partial_cluster > 0)
2562                         *partial_cluster = 0;
2563         } else
2564                 ext4_error(sbi->s_sb, "strange request: removal(2) "
2565                            "%u-%u from %u:%u\n",
2566                            from, to, le32_to_cpu(ex->ee_block), ee_len);
2567         return 0;
2568 }
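
/*
 * Worked example (bigalloc with a cluster ratio of 16; values are
 * illustrative): a tail removal that frees pblk 4996..5003 does not
 * start on a cluster boundary, since EXT4_PBLK_COFF(sbi, 4996) = 4.
 * If the whole extent was removed (ee_len == num), cluster
 * EXT4_B2C(sbi, 4996) = 312 is saved in *partial_cluster so the
 * caller can free it later; if only part of the extent was removed,
 * -312 is stored instead to mark the cluster as still in use.
 */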
2569
2570
2571 /*
2572  * ext4_ext_rm_leaf() removes the extents associated with the
2573  * blocks appearing between "start" and "end", and splits the extents
2574  * if "start" and "end" appear in the same extent.
2575  *
2576  * @handle: The journal handle
2577  * @inode:  The file's inode
2578  * @path:   The path to the leaf
2579  * @partial_cluster: The cluster which we'll have to free if all extents
2580  *                   have been released from it. It gets negative in case
2581  *                   the cluster is still used.
2582  * @start:  The first block to remove
2583  * @end:    The last block to remove
2584  */
2585 static int
2586 ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
2587                  struct ext4_ext_path *path,
2588                  long long *partial_cluster,
2589                  ext4_lblk_t start, ext4_lblk_t end)
2590 {
2591         struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
2592         int err = 0, correct_index = 0;
2593         int depth = ext_depth(inode), credits;
2594         struct ext4_extent_header *eh;
2595         ext4_lblk_t a, b;
2596         unsigned num;
2597         ext4_lblk_t ex_ee_block;
2598         unsigned short ex_ee_len;
2599         unsigned unwritten = 0;
2600         struct ext4_extent *ex;
2601         ext4_fsblk_t pblk;
2602
2603         /* the header must have been checked already in ext4_ext_remove_space() */
2604         ext_debug("truncate since %u in leaf to %u\n", start, end);
2605         if (!path[depth].p_hdr)
2606                 path[depth].p_hdr = ext_block_hdr(path[depth].p_bh);
2607         eh = path[depth].p_hdr;
2608         if (unlikely(path[depth].p_hdr == NULL)) {
2609                 EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth);
2610                 return -EIO;
2611         }
2612         /* find where to start removing */
2613         ex = path[depth].p_ext;
2614         if (!ex)
2615                 ex = EXT_LAST_EXTENT(eh);
2616
2617         ex_ee_block = le32_to_cpu(ex->ee_block);
2618         ex_ee_len = ext4_ext_get_actual_len(ex);
2619
2620         /*
2621          * If we're starting with an extent other than the last one in the
2622          * node, we need to see if it shares a cluster with the extent to
2623          * the right (towards the end of the file). If its leftmost cluster
2624          * is this extent's rightmost cluster and it is not cluster aligned,
2625          * we'll mark it as a partial that is not to be deallocated.
2626          */
2627
2628         if (ex != EXT_LAST_EXTENT(eh)) {
2629                 ext4_fsblk_t current_pblk, right_pblk;
2630                 long long current_cluster, right_cluster;
2631
2632                 current_pblk = ext4_ext_pblock(ex) + ex_ee_len - 1;
2633                 current_cluster = (long long)EXT4_B2C(sbi, current_pblk);
2634                 right_pblk = ext4_ext_pblock(ex + 1);
2635                 right_cluster = (long long)EXT4_B2C(sbi, right_pblk);
2636                 if (current_cluster == right_cluster &&
2637                         EXT4_PBLK_COFF(sbi, right_pblk))
2638                         *partial_cluster = -right_cluster;
2639         }
2640
2641         trace_ext4_ext_rm_leaf(inode, start, ex, *partial_cluster);
2642
2643         while (ex >= EXT_FIRST_EXTENT(eh) &&
2644                         ex_ee_block + ex_ee_len > start) {
2645
2646                 if (ext4_ext_is_unwritten(ex))
2647                         unwritten = 1;
2648                 else
2649                         unwritten = 0;
2650
2651                 ext_debug("remove ext %u:[%d]%d\n", ex_ee_block,
2652                           unwritten, ex_ee_len);
2653                 path[depth].p_ext = ex;
2654
2655                 a = ex_ee_block > start ? ex_ee_block : start;
2656                 b = ex_ee_block+ex_ee_len - 1 < end ?
2657                         ex_ee_block+ex_ee_len - 1 : end;
2658
2659                 ext_debug("  border %u:%u\n", a, b);
2660
2661                 /* If this extent is beyond the end of the hole, skip it */
2662                 if (end < ex_ee_block) {
2663                         /*
2664                          * We're going to skip this extent and move to another,
2665                          * so if this extent is not cluster aligned we have
2666                          * to mark the current cluster as used to avoid
2667                          * accidentally freeing it later on
2668                          */
2669                         pblk = ext4_ext_pblock(ex);
2670                         if (EXT4_PBLK_COFF(sbi, pblk))
2671                                 *partial_cluster =
2672                                         -((long long)EXT4_B2C(sbi, pblk));
2673                         ex--;
2674                         ex_ee_block = le32_to_cpu(ex->ee_block);
2675                         ex_ee_len = ext4_ext_get_actual_len(ex);
2676                         continue;
2677                 } else if (b != ex_ee_block + ex_ee_len - 1) {
2678                         EXT4_ERROR_INODE(inode,
2679                                          "can not handle truncate %u:%u "
2680                                          "on extent %u:%u",
2681                                          start, end, ex_ee_block,
2682                                          ex_ee_block + ex_ee_len - 1);
2683                         err = -EIO;
2684                         goto out;
2685                 } else if (a != ex_ee_block) {
2686                         /* remove tail of the extent */
2687                         num = a - ex_ee_block;
2688                 } else {
2689                         /* remove whole extent: excellent! */
2690                         num = 0;
2691                 }
2692                 /*
2693                  * 3 for leaf, sb, and inode plus 2 (bmap and group
2694                  * descriptor) for each block group; assume two block
2695                  * groups plus ex_ee_len/blocks_per_block_group for
2696                  * the worst case
2697                  */
2698                 credits = 7 + 2*(ex_ee_len/EXT4_BLOCKS_PER_GROUP(inode->i_sb));
2699                 if (ex == EXT_FIRST_EXTENT(eh)) {
2700                         correct_index = 1;
2701                         credits += (ext_depth(inode)) + 1;
2702                 }
2703                 credits += EXT4_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb);
2704
2705                 err = ext4_ext_truncate_extend_restart(handle, inode, credits);
2706                 if (err)
2707                         goto out;
2708
2709                 err = ext4_ext_get_access(handle, inode, path + depth);
2710                 if (err)
2711                         goto out;
2712
2713                 err = ext4_remove_blocks(handle, inode, ex, partial_cluster,
2714                                          a, b);
2715                 if (err)
2716                         goto out;
2717
2718                 if (num == 0)
2719                         /* this extent is removed; mark slot entirely unused */
2720                         ext4_ext_store_pblock(ex, 0);
2721
2722                 ex->ee_len = cpu_to_le16(num);
2723                 /*
2724                  * Do not mark unwritten if all the blocks in the
2725                  * extent have been removed.
2726                  */
2727                 if (unwritten && num)
2728                         ext4_ext_mark_unwritten(ex);
2729                 /*
2730                  * If the extent was completely released,
2731                  * we need to remove it from the leaf
2732                  */
2733                 if (num == 0) {
2734                         if (end != EXT_MAX_BLOCKS - 1) {
2735                                 /*
2736                                  * For hole punching, we need to scoot all the
2737                                  * extents up when an extent is removed so that
2738                                  * we don't have blank extents in the middle
2739                                  */
2740                                 memmove(ex, ex+1, (EXT_LAST_EXTENT(eh) - ex) *
2741                                         sizeof(struct ext4_extent));
2742
2743                                 /* Now get rid of the one at the end */
2744                                 memset(EXT_LAST_EXTENT(eh), 0,
2745                                         sizeof(struct ext4_extent));
2746                         }
2747                         le16_add_cpu(&eh->eh_entries, -1);
2748                 } else if (*partial_cluster > 0)
2749                         *partial_cluster = 0;
2750
2751                 err = ext4_ext_dirty(handle, inode, path + depth);
2752                 if (err)
2753                         goto out;
2754
2755                 ext_debug("new extent: %u:%u:%llu\n", ex_ee_block, num,
2756                                 ext4_ext_pblock(ex));
2757                 ex--;
2758                 ex_ee_block = le32_to_cpu(ex->ee_block);
2759                 ex_ee_len = ext4_ext_get_actual_len(ex);
2760         }
2761
2762         if (correct_index && eh->eh_entries)
2763                 err = ext4_ext_correct_indexes(handle, inode, path);
2764
2765         /*
2766          * If there's a partial cluster and at least one extent remains in
2767          * the leaf, free the partial cluster if it isn't shared with the
2768          * current extent.  If there's a partial cluster and no extents
2769          * remain in the leaf, it can't be freed here.  It can only be
2770          * freed when it's possible to determine if it's not shared with
2771          * any other extent - when the next leaf is processed or when space
2772          * removal is complete.
2773          */
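        /*
         * For example (illustrative, bigalloc with an 8-block cluster):
         * if ex ends at pblk 103, EXT4_B2C(sbi, 103) == 12, so a
         * *partial_cluster of 12 is shared with the remaining extent
         * and must not be freed here.
         */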
2774         if (*partial_cluster > 0 && eh->eh_entries &&
2775             (EXT4_B2C(sbi, ext4_ext_pblock(ex) + ex_ee_len - 1) !=
2776              *partial_cluster)) {
2777                 int flags = get_default_free_blocks_flags(inode);
2778
2779                 ext4_free_blocks(handle, inode, NULL,
2780                                  EXT4_C2B(sbi, *partial_cluster),
2781                                  sbi->s_cluster_ratio, flags);
2782                 *partial_cluster = 0;
2783         }
2784
2785         /* if this leaf is now empty, we should
2786          * remove it from the index block above */
2787         if (err == 0 && eh->eh_entries == 0 && path[depth].p_bh != NULL)
2788                 err = ext4_ext_rm_idx(handle, inode, path, depth);
2789
2790 out:
2791         return err;
2792 }
2793
2794 /*
2795  * ext4_ext_more_to_rm:
2796  * returns 1 if current index has to be freed (even partial)
2797  */
2798 static int
2799 ext4_ext_more_to_rm(struct ext4_ext_path *path)
2800 {
2801         BUG_ON(path->p_idx == NULL);
2802
2803         if (path->p_idx < EXT_FIRST_INDEX(path->p_hdr))
2804                 return 0;
2805
2806         /*
2807          * if truncate on deeper level happened, it wasn't partial,
2808          * so we have to consider current index for truncation
2809          */
2810         if (le16_to_cpu(path->p_hdr->eh_entries) == path->p_block)
2811                 return 0;
2812         return 1;
2813 }
2814
2815 int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start,
2816                           ext4_lblk_t end)
2817 {
2818         struct super_block *sb = inode->i_sb;
2819         int depth = ext_depth(inode);
2820         struct ext4_ext_path *path = NULL;
2821         long long partial_cluster = 0;
2822         handle_t *handle;
2823         int i = 0, err = 0;
2824
2825         ext_debug("truncate since %u to %u\n", start, end);
2826
2827         /* probably the first extent we're going to free will be the last in the block */
2828         handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, depth + 1);
2829         if (IS_ERR(handle))
2830                 return PTR_ERR(handle);
2831
2832 again:
2833         trace_ext4_ext_remove_space(inode, start, end, depth);
2834
2835         /*
2836          * Check if we are removing extents inside the extent tree. If that
2837          * is the case, we are going to punch a hole inside the extent tree
2838          * so we have to check whether we need to split the extent covering
2839          * the last block to remove so we can easily remove the part of it
2840          * in ext4_ext_rm_leaf().
2841          */
2842         if (end < EXT_MAX_BLOCKS - 1) {
2843                 struct ext4_extent *ex;
2844                 ext4_lblk_t ee_block;
2845
2846                 /* find extent for this block */
2847                 path = ext4_ext_find_extent(inode, end, NULL, EXT4_EX_NOCACHE);
2848                 if (IS_ERR(path)) {
2849                         ext4_journal_stop(handle);
2850                         return PTR_ERR(path);
2851                 }
2852                 depth = ext_depth(inode);
2853                 /* The leaf can lack extents only if the inode has no blocks at all */
2854                 ex = path[depth].p_ext;
2855                 if (!ex) {
2856                         if (depth) {
2857                                 EXT4_ERROR_INODE(inode,
2858                                                  "path[%d].p_hdr == NULL",
2859                                                  depth);
2860                                 err = -EIO;
2861                         }
2862                         goto out;
2863                 }
2864
2865                 ee_block = le32_to_cpu(ex->ee_block);
2866
2867                 /*
2868                  * See if the last block is inside the extent, if so split
2869                  * the extent at 'end' block so we can easily remove the
2870                  * tail of the first part of the split extent in
2871                  * ext4_ext_rm_leaf().
2872                  */
2873                 if (end >= ee_block &&
2874                     end < ee_block + ext4_ext_get_actual_len(ex) - 1) {
2875                         /*
2876                          * Split the extent in two so that 'end' is the last
2877                          * block in the first new extent. Also we should not
2878                          * fail removing space due to ENOSPC so try to use
2879                          * reserved block if that happens.
2880                          */
2881                         err = ext4_force_split_extent_at(handle, inode, path,
2882                                                          end + 1, 1);
2883                         if (err < 0)
2884                                 goto out;
2885                 }
2886         }
2887         /*
2888          * We start scanning from the right side, freeing all the blocks
2889          * after i_size and walking into the tree depth-wise.
2890          */
2891         depth = ext_depth(inode);
2892         if (path) {
2893                 int k = i = depth;
2894                 while (--k > 0)
2895                         path[k].p_block =
2896                                 le16_to_cpu(path[k].p_hdr->eh_entries)+1;
2897         } else {
2898                 path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 1),
2899                                GFP_NOFS);
2900                 if (path == NULL) {
2901                         ext4_journal_stop(handle);
2902                         return -ENOMEM;
2903                 }
2904                 path[0].p_depth = depth;
2905                 path[0].p_hdr = ext_inode_hdr(inode);
2906                 i = 0;
2907
2908                 if (ext4_ext_check(inode, path[0].p_hdr, depth, 0)) {
2909                         err = -EIO;
2910                         goto out;
2911                 }
2912         }
2913         err = 0;
2914
2915         while (i >= 0 && err == 0) {
2916                 if (i == depth) {
2917                         /* this is leaf block */
2918                         err = ext4_ext_rm_leaf(handle, inode, path,
2919                                                &partial_cluster, start,
2920                                                end);
2921                         /* root level has p_bh == NULL, brelse() eats this */
2922                         brelse(path[i].p_bh);
2923                         path[i].p_bh = NULL;
2924                         i--;
2925                         continue;
2926                 }
2927
2928                 /* this is index block */
2929                 if (!path[i].p_hdr) {
2930                         ext_debug("initialize header\n");
2931                         path[i].p_hdr = ext_block_hdr(path[i].p_bh);
2932                 }
2933
2934                 if (!path[i].p_idx) {
2935                         /* this level hasn't been touched yet */
2936                         path[i].p_idx = EXT_LAST_INDEX(path[i].p_hdr);
2937                         path[i].p_block = le16_to_cpu(path[i].p_hdr->eh_entries)+1;
2938                         ext_debug("init index ptr: hdr 0x%p, num %d\n",
2939                                   path[i].p_hdr,
2940                                   le16_to_cpu(path[i].p_hdr->eh_entries));
2941                 } else {
2942                         /* we were already here, look at the next index */
2943                         path[i].p_idx--;
2944                 }
2945
2946                 ext_debug("level %d - index, first 0x%p, cur 0x%p\n",
2947                                 i, EXT_FIRST_INDEX(path[i].p_hdr),
2948                                 path[i].p_idx);
2949                 if (ext4_ext_more_to_rm(path + i)) {
2950                         struct buffer_head *bh;
2951                         /* go to the next level */
2952                         ext_debug("move to level %d (block %llu)\n",
2953                                   i + 1, ext4_idx_pblock(path[i].p_idx));
2954                         memset(path + i + 1, 0, sizeof(*path));
2955                         bh = read_extent_tree_block(inode,
2956                                 ext4_idx_pblock(path[i].p_idx), depth - i - 1,
2957                                 EXT4_EX_NOCACHE);
2958                         if (IS_ERR(bh)) {
2959                                 /* should we reset i_size? */
2960                                 err = PTR_ERR(bh);
2961                                 break;
2962                         }
2963                         /* Yield here to deal with large extent trees.
2964                          * Should be a no-op if we did IO above. */
2965                         cond_resched();
2966                         if (WARN_ON(i + 1 > depth)) {
2967                                 err = -EIO;
2968                                 break;
2969                         }
2970                         path[i + 1].p_bh = bh;
2971
2972                         /* save actual number of indexes since this
2973                          * number is changed at the next iteration */
2974                         path[i].p_block = le16_to_cpu(path[i].p_hdr->eh_entries);
2975                         i++;
2976                 } else {
2977                         /* we finished processing this index, go up */
2978                         if (path[i].p_hdr->eh_entries == 0 && i > 0) {
2979                                 /* index is empty, remove it;
2980                                  * handle must already be prepared by the
2981                                  * leaf removal above */
2982                                 err = ext4_ext_rm_idx(handle, inode, path, i);
2983                         }
2984                         /* root level has p_bh == NULL, brelse() eats this */
2985                         brelse(path[i].p_bh);
2986                         path[i].p_bh = NULL;
2987                         i--;
2988                         ext_debug("return to level %d\n", i);
2989                 }
2990         }
2991
2992         trace_ext4_ext_remove_space_done(inode, start, end, depth,
2993                         partial_cluster, path->p_hdr->eh_entries);
2994
2995         /* If we still have something in the partial cluster and we have removed
2996          * even the first extent, then we should free the blocks in the partial
2997          * cluster as well. */
2998         if (partial_cluster > 0 && path->p_hdr->eh_entries == 0) {
2999                 int flags = get_default_free_blocks_flags(inode);
3000
3001                 ext4_free_blocks(handle, inode, NULL,
3002                                  EXT4_C2B(EXT4_SB(sb), partial_cluster),
3003                                  EXT4_SB(sb)->s_cluster_ratio, flags);
3004                 partial_cluster = 0;
3005         }
3006
3007         /* TODO: flexible tree reduction should be here */
3008         if (path->p_hdr->eh_entries == 0) {
3009                 /*
3010                  * truncate to zero freed all the tree,
3011                  * so we need to correct eh_depth
3012                  */
3013                 err = ext4_ext_get_access(handle, inode, path);
3014                 if (err == 0) {
3015                         ext_inode_hdr(inode)->eh_depth = 0;
3016                         ext_inode_hdr(inode)->eh_max =
3017                                 cpu_to_le16(ext4_ext_space_root(inode, 0));
3018                         err = ext4_ext_dirty(handle, inode, path);
3019                 }
3020         }
3021 out:
3022         ext4_ext_drop_refs(path);
3023         kfree(path);
3024         if (err == -EAGAIN) {
3025                 path = NULL;
3026                 goto again;
3027         }
3028         ext4_journal_stop(handle);
3029
3030         return err;
3031 }
3032
3033 /*
3034  * called at mount time
3035  */
3036 void ext4_ext_init(struct super_block *sb)
3037 {
3038         /*
3039          * possible initialization would be here
3040          */
3041
3042         if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) {
3043 #if defined(AGGRESSIVE_TEST) || defined(CHECK_BINSEARCH) || defined(EXTENTS_STATS)
3044                 printk(KERN_INFO "EXT4-fs: file extents enabled"
3045 #ifdef AGGRESSIVE_TEST
3046                        ", aggressive tests"
3047 #endif
3048 #ifdef CHECK_BINSEARCH
3049                        ", check binsearch"
3050 #endif
3051 #ifdef EXTENTS_STATS
3052                        ", stats"
3053 #endif
3054                        "\n");
3055 #endif
3056 #ifdef EXTENTS_STATS
3057                 spin_lock_init(&EXT4_SB(sb)->s_ext_stats_lock);
3058                 EXT4_SB(sb)->s_ext_min = 1 << 30;
3059                 EXT4_SB(sb)->s_ext_max = 0;
3060 #endif
3061         }
3062 }
3063
3064 /*
3065  * called at umount time
3066  */
3067 void ext4_ext_release(struct super_block *sb)
3068 {
3069         if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS))
3070                 return;
3071
3072 #ifdef EXTENTS_STATS
3073         if (EXT4_SB(sb)->s_ext_blocks && EXT4_SB(sb)->s_ext_extents) {
3074                 struct ext4_sb_info *sbi = EXT4_SB(sb);
3075                 printk(KERN_ERR "EXT4-fs: %lu blocks in %lu extents (%lu ave)\n",
3076                         sbi->s_ext_blocks, sbi->s_ext_extents,
3077                         sbi->s_ext_blocks / sbi->s_ext_extents);
3078                 printk(KERN_ERR "EXT4-fs: extents: %lu min, %lu max, max depth %lu\n",
3079                         sbi->s_ext_min, sbi->s_ext_max, sbi->s_depth_max);
3080         }
3081 #endif
3082 }
3083
3084 static int ext4_zeroout_es(struct inode *inode, struct ext4_extent *ex)
3085 {
3086         ext4_lblk_t  ee_block;
3087         ext4_fsblk_t ee_pblock;
3088         unsigned int ee_len;
3089
3090         ee_block  = le32_to_cpu(ex->ee_block);
3091         ee_len    = ext4_ext_get_actual_len(ex);
3092         ee_pblock = ext4_ext_pblock(ex);
3093
3094         if (ee_len == 0)
3095                 return 0;
3096
3097         return ext4_es_insert_extent(inode, ee_block, ee_len, ee_pblock,
3098                                      EXTENT_STATUS_WRITTEN);
3099 }
3100
3101 /* FIXME!! we need to try to merge to left or right after zero-out  */
3102 static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
3103 {
3104         ext4_fsblk_t ee_pblock;
3105         unsigned int ee_len;
3106         int ret;
3107
3108         ee_len    = ext4_ext_get_actual_len(ex);
3109         ee_pblock = ext4_ext_pblock(ex);
3110
3111         ret = sb_issue_zeroout(inode->i_sb, ee_pblock, ee_len, GFP_NOFS);
3112         if (ret > 0)
3113                 ret = 0;
3114
3115         return ret;
3116 }
3117
3118 /*
3119  * ext4_split_extent_at() splits an extent at given block.
3120  *
3121  * @handle: the journal handle
3122  * @inode: the file inode
3123  * @path: the path to the extent
3124  * @split: the logical block where the extent is split.
3125  * @split_flag: indicates whether the extent can be zeroed out if the split
3126  *              fails, and the states (initialized or unwritten) of the new
3127  *              extents.
3128  * @flags: flags used when inserting the new extent into the extent tree.
3129  *
3130  * Splits extent [a, b] into two extents [a, @split) and [@split, b], states
3131  * of which are determined by split_flag.
3132  *
3133  * There are two cases:
3134  *  a> the extent is split into two extents.
3135  *  b> no split is needed, and we just mark the extent.
3136  *
3137  * return 0 on success.
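 *
 * For example (illustrative): splitting the unwritten extent covering
 * blocks 100..199 at block 150 with EXT4_EXT_MARK_UNWRIT1 and
 * EXT4_EXT_MARK_UNWRIT2 set yields two unwritten extents covering
 * 100..149 and 150..199; with @split == 100 no new extent is inserted
 * and only the state of the existing extent changes (case b).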
3138  */
3139 static int ext4_split_extent_at(handle_t *handle,
3140                              struct inode *inode,
3141                              struct ext4_ext_path *path,
3142                              ext4_lblk_t split,
3143                              int split_flag,
3144                              int flags)
3145 {
3146         ext4_fsblk_t newblock;
3147         ext4_lblk_t ee_block;
3148         struct ext4_extent *ex, newex, orig_ex, zero_ex;
3149         struct ext4_extent *ex2 = NULL;
3150         unsigned int ee_len, depth;
3151         int err = 0;
3152
3153         BUG_ON((split_flag & (EXT4_EXT_DATA_VALID1 | EXT4_EXT_DATA_VALID2)) ==
3154                (EXT4_EXT_DATA_VALID1 | EXT4_EXT_DATA_VALID2));
3155
3156         ext_debug("ext4_split_extent_at: inode %lu, logical "
3157                 "block %llu\n", inode->i_ino, (unsigned long long)split);
3158
3159         ext4_ext_show_leaf(inode, path);
3160
3161         depth = ext_depth(inode);
3162         ex = path[depth].p_ext;
3163         ee_block = le32_to_cpu(ex->ee_block);
3164         ee_len = ext4_ext_get_actual_len(ex);
3165         newblock = split - ee_block + ext4_ext_pblock(ex);
3166
3167         BUG_ON(split < ee_block || split >= (ee_block + ee_len));
3168         BUG_ON(!ext4_ext_is_unwritten(ex) &&
3169                split_flag & (EXT4_EXT_MAY_ZEROOUT |
3170                              EXT4_EXT_MARK_UNWRIT1 |
3171                              EXT4_EXT_MARK_UNWRIT2));
3172
3173         err = ext4_ext_get_access(handle, inode, path + depth);
3174         if (err)
3175                 goto out;
3176
3177         if (split == ee_block) {
3178                 /*
3179                  * case b: block @split is the block that the extent begins with
3180                  * then we just change the state of the extent, and splitting
3181                  * is not needed.
3182                  */
3183                 if (split_flag & EXT4_EXT_MARK_UNWRIT2)
3184                         ext4_ext_mark_unwritten(ex);
3185                 else
3186                         ext4_ext_mark_initialized(ex);
3187
3188                 if (!(flags & EXT4_GET_BLOCKS_PRE_IO))
3189                         ext4_ext_try_to_merge(handle, inode, path, ex);
3190
3191                 err = ext4_ext_dirty(handle, inode, path + path->p_depth);
3192                 goto out;
3193         }
3194
3195         /* case a */
3196         memcpy(&orig_ex, ex, sizeof(orig_ex));
3197         ex->ee_len = cpu_to_le16(split - ee_block);
3198         if (split_flag & EXT4_EXT_MARK_UNWRIT1)
3199                 ext4_ext_mark_unwritten(ex);
3200
3201         /*
3202          * path may lead to a new leaf, not to the original leaf any more
3203          * after ext4_ext_insert_extent() returns.
3204          */
3205         err = ext4_ext_dirty(handle, inode, path + depth);
3206         if (err)
3207                 goto fix_extent_len;
3208
3209         ex2 = &newex;
3210         ex2->ee_block = cpu_to_le32(split);
3211         ex2->ee_len   = cpu_to_le16(ee_len - (split - ee_block));
3212         ext4_ext_store_pblock(ex2, newblock);
3213         if (split_flag & EXT4_EXT_MARK_UNWRIT2)
3214                 ext4_ext_mark_unwritten(ex2);
3215
3216         err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
3217         if (err == -ENOSPC && (EXT4_EXT_MAY_ZEROOUT & split_flag)) {
3218                 if (split_flag & (EXT4_EXT_DATA_VALID1|EXT4_EXT_DATA_VALID2)) {
3219                         if (split_flag & EXT4_EXT_DATA_VALID1) {
3220                                 err = ext4_ext_zeroout(inode, ex2);
3221                                 zero_ex.ee_block = ex2->ee_block;
3222                                 zero_ex.ee_len = cpu_to_le16(
3223                                                 ext4_ext_get_actual_len(ex2));
3224                                 ext4_ext_store_pblock(&zero_ex,
3225                                                       ext4_ext_pblock(ex2));
3226                         } else {
3227                                 err = ext4_ext_zeroout(inode, ex);
3228                                 zero_ex.ee_block = ex->ee_block;
3229                                 zero_ex.ee_len = cpu_to_le16(
3230                                                 ext4_ext_get_actual_len(ex));
3231                                 ext4_ext_store_pblock(&zero_ex,
3232                                                       ext4_ext_pblock(ex));
3233                         }
3234                 } else {
3235                         err = ext4_ext_zeroout(inode, &orig_ex);
3236                         zero_ex.ee_block = orig_ex.ee_block;
3237                         zero_ex.ee_len = cpu_to_le16(
3238                                                 ext4_ext_get_actual_len(&orig_ex));
3239                         ext4_ext_store_pblock(&zero_ex,
3240                                               ext4_ext_pblock(&orig_ex));
3241                 }
3242
3243                 if (err)
3244                         goto fix_extent_len;
3245                 /* update the extent length and mark as initialized */
3246                 ex->ee_len = cpu_to_le16(ee_len);
3247                 ext4_ext_try_to_merge(handle, inode, path, ex);
3248                 err = ext4_ext_dirty(handle, inode, path + path->p_depth);
3249                 if (err)
3250                         goto fix_extent_len;
3251
3252                 /* update extent status tree */
3253                 err = ext4_zeroout_es(inode, &zero_ex);
3254
3255                 goto out;
3256         } else if (err)
3257                 goto fix_extent_len;
3258
3259 out:
3260         ext4_ext_show_leaf(inode, path);
3261         return err;
3262
3263 fix_extent_len:
3264         ex->ee_len = orig_ex.ee_len;
3265         ext4_ext_dirty(handle, inode, path + path->p_depth);
3266         return err;
3267 }
3268
3269 /*
3270  * ext4_split_extent() splits an extent and marks the extent covered
3271  * by @map as split_flag indicates.
3272  *
3273  * It may result in splitting the extent into multiple extents (up to three)
3274  * There are three possibilities:
3275  *   a> There is no split required
3276  *   b> Splits in two extents: Split is happening at either end of the extent
3277  *   c> Splits in three extents: Someone is splitting in the middle of the extent
3278  *
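 * For example (illustrative): for a map covering blocks 120..139 of the
 * extent 100..199, the extent is first split at block 140 and then at
 * block 120, leaving 100..119, 120..139 and 140..199 (case c).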
3279  */
3280 static int ext4_split_extent(handle_t *handle,
3281                               struct inode *inode,
3282                               struct ext4_ext_path *path,
3283                               struct ext4_map_blocks *map,
3284                               int split_flag,
3285                               int flags)
3286 {
3287         ext4_lblk_t ee_block;
3288         struct ext4_extent *ex;
3289         unsigned int ee_len, depth;
3290         int err = 0;
3291         int unwritten;
3292         int split_flag1, flags1;
3293         int allocated = map->m_len;
3294
3295         depth = ext_depth(inode);
3296         ex = path[depth].p_ext;
3297         ee_block = le32_to_cpu(ex->ee_block);
3298         ee_len = ext4_ext_get_actual_len(ex);
3299         unwritten = ext4_ext_is_unwritten(ex);
3300
3301         if (map->m_lblk + map->m_len < ee_block + ee_len) {
3302                 split_flag1 = split_flag & EXT4_EXT_MAY_ZEROOUT;
3303                 flags1 = flags | EXT4_GET_BLOCKS_PRE_IO;
3304                 if (unwritten)
3305                         split_flag1 |= EXT4_EXT_MARK_UNWRIT1 |
3306                                        EXT4_EXT_MARK_UNWRIT2;
3307                 if (split_flag & EXT4_EXT_DATA_VALID2)
3308                         split_flag1 |= EXT4_EXT_DATA_VALID1;
3309                 err = ext4_split_extent_at(handle, inode, path,
3310                                 map->m_lblk + map->m_len, split_flag1, flags1);
3311                 if (err)
3312                         goto out;
3313         } else {
3314                 allocated = ee_len - (map->m_lblk - ee_block);
3315         }
3316         /*
3317          * Updating the path is required because the previous call to
3318          * ext4_split_extent_at() may have split the leaf or zeroed out the extent.
3319          */
3320         ext4_ext_drop_refs(path);
3321         path = ext4_ext_find_extent(inode, map->m_lblk, &path,
3322                                     EXT4_EX_NOFREE_ON_ERR);
3323         if (IS_ERR(path))
3324                 return PTR_ERR(path);
3325         depth = ext_depth(inode);
3326         ex = path[depth].p_ext;
3327         if (!ex) {
3328                 EXT4_ERROR_INODE(inode, "unexpected hole at %lu",
3329                                  (unsigned long) map->m_lblk);
3330                 return -EIO;
3331         }
3332         unwritten = ext4_ext_is_unwritten(ex);
3333         split_flag1 = 0;
3334
3335         if (map->m_lblk >= ee_block) {
3336                 split_flag1 = split_flag & EXT4_EXT_DATA_VALID2;
3337                 if (unwritten) {
3338                         split_flag1 |= EXT4_EXT_MARK_UNWRIT1;
3339                         split_flag1 |= split_flag & (EXT4_EXT_MAY_ZEROOUT |
3340                                                      EXT4_EXT_MARK_UNWRIT2);
3341                 }
3342                 err = ext4_split_extent_at(handle, inode, path,
3343                                 map->m_lblk, split_flag1, flags);
3344                 if (err)
3345                         goto out;
3346         }
3347
3348         ext4_ext_show_leaf(inode, path);
3349 out:
3350         return err ? err : allocated;
3351 }
3352
3353 /*
3354  * This function is called by ext4_ext_map_blocks() if someone tries to write
3355  * to an unwritten extent. It may result in splitting the unwritten
3356  * extent into multiple extents (up to three - one initialized and two
3357  * unwritten).
3358  * There are three possibilities:
3359  *   a> There is no split required: Entire extent should be initialized
3360  *   b> Splits in two extents: Write is happening at either end of the extent
3361  *   c> Splits in three extents: Someone is writing in the middle of the extent
3362  *
3363  * Pre-conditions:
3364  *  - The extent pointed to by 'path' is unwritten.
3365  *  - The extent pointed to by 'path' contains a superset
3366  *    of the logical span [map->m_lblk, map->m_lblk + map->m_len).
3367  *
3368  * Post-conditions on success:
3369  *  - the returned value is the number of blocks beyond map->m_lblk
3370  *    that are allocated and initialized.
3371  *    It is guaranteed to be >= map->m_len.
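 *
 * For example (illustrative): for a write covering blocks 120..139 of
 * an unwritten extent spanning 100..199, the returned count is at
 * least 20, and can be larger if trailing blocks are zeroed out too.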
3372  */
3373 static int ext4_ext_convert_to_initialized(handle_t *handle,
3374                                            struct inode *inode,
3375                                            struct ext4_map_blocks *map,
3376                                            struct ext4_ext_path *path,
3377                                            int flags)
3378 {
3379         struct ext4_sb_info *sbi;
3380         struct ext4_extent_header *eh;
3381         struct ext4_map_blocks split_map;
3382         struct ext4_extent zero_ex;
3383         struct ext4_extent *ex, *abut_ex;
3384         ext4_lblk_t ee_block, eof_block;
3385         unsigned int ee_len, depth, map_len = map->m_len;
3386         int allocated = 0, max_zeroout = 0;
3387         int err = 0;
3388         int split_flag = 0;
3389
3390         ext_debug("ext4_ext_convert_to_initialized: inode %lu, logical "
3391                 "block %llu, max_blocks %u\n", inode->i_ino,
3392                 (unsigned long long)map->m_lblk, map_len);
3393
3394         sbi = EXT4_SB(inode->i_sb);
3395         eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >>
3396                 inode->i_sb->s_blocksize_bits;
3397         if (eof_block < map->m_lblk + map_len)
3398                 eof_block = map->m_lblk + map_len;
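        /*
         * For example (illustrative): with 4k blocks (s_blocksize_bits
         * == 12) and i_size == 10000, eof_block == 14095 >> 12 == 3.
         */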
3399
3400         depth = ext_depth(inode);
3401         eh = path[depth].p_hdr;
3402         ex = path[depth].p_ext;
3403         ee_block = le32_to_cpu(ex->ee_block);
3404         ee_len = ext4_ext_get_actual_len(ex);
3405         zero_ex.ee_len = 0;
3406
3407         trace_ext4_ext_convert_to_initialized_enter(inode, map, ex);
3408
3409         /* Pre-conditions */
3410         BUG_ON(!ext4_ext_is_unwritten(ex));
3411         BUG_ON(!in_range(map->m_lblk, ee_block, ee_len));
3412
3413         /*
3414          * Attempt to transfer newly initialized blocks from the currently
3415          * unwritten extent to its neighbor. This is much cheaper
3416          * than an insertion followed by a merge as those involve costly
3417          * memmove() calls. Transferring to the left is the common case in
3418          * steady state for workloads doing fallocate(FALLOC_FL_KEEP_SIZE)
3419          * followed by append writes.
3420          *
3421          * Limitations of the current logic:
3422          *  - L1: we do not deal with writes covering the whole extent.
3423          *    This would require removing the extent if the transfer
3424          *    is possible.
3425          *  - L2: we only attempt to merge with an extent stored in the
3426          *    same extent tree node.
3427          */
3428         if ((map->m_lblk == ee_block) &&
3429                 /* See if we can merge left */
3430                 (map_len < ee_len) &&           /*L1*/
3431                 (ex > EXT_FIRST_EXTENT(eh))) {  /*L2*/
3432                 ext4_lblk_t prev_lblk;
3433                 ext4_fsblk_t prev_pblk, ee_pblk;
3434                 unsigned int prev_len;
3435
3436                 abut_ex = ex - 1;
3437                 prev_lblk = le32_to_cpu(abut_ex->ee_block);
3438                 prev_len = ext4_ext_get_actual_len(abut_ex);
3439                 prev_pblk = ext4_ext_pblock(abut_ex);
3440                 ee_pblk = ext4_ext_pblock(ex);
3441
3442                 /*
3443                  * A transfer of blocks from 'ex' to 'abut_ex' is allowed
3444                  * upon those conditions:
3445                  * - C1: abut_ex is initialized,
3446                  * - C2: abut_ex is logically abutting ex,
3447                  * - C3: abut_ex is physically abutting ex,
3448                  * - C4: abut_ex can receive the additional blocks without
3449                  *   overflowing the (initialized) length limit.
3450                  */
3451                 if ((!ext4_ext_is_unwritten(abut_ex)) &&                /*C1*/
3452                         ((prev_lblk + prev_len) == ee_block) &&         /*C2*/
3453                         ((prev_pblk + prev_len) == ee_pblk) &&          /*C3*/
3454                         (prev_len < (EXT_INIT_MAX_LEN - map_len))) {    /*C4*/
3455                         err = ext4_ext_get_access(handle, inode, path + depth);
3456                         if (err)
3457                                 goto out;
3458
3459                         trace_ext4_ext_convert_to_initialized_fastpath(inode,
3460                                 map, ex, abut_ex);
3461
3462                         /* Shift the start of ex by 'map_len' blocks */
3463                         ex->ee_block = cpu_to_le32(ee_block + map_len);
3464                         ext4_ext_store_pblock(ex, ee_pblk + map_len);
3465                         ex->ee_len = cpu_to_le16(ee_len - map_len);
3466                         ext4_ext_mark_unwritten(ex); /* Restore the flag */
3467
3468                         /* Extend abut_ex by 'map_len' blocks */
3469                         abut_ex->ee_len = cpu_to_le16(prev_len + map_len);
3470
3471                         /* Result: number of initialized blocks past m_lblk */
3472                         allocated = map_len;
3473                 }
3474         } else if (((map->m_lblk + map_len) == (ee_block + ee_len)) &&
3475                    (map_len < ee_len) &&        /*L1*/
3476                    ex < EXT_LAST_EXTENT(eh)) {  /*L2*/
3477                 /* See if we can merge right */
3478                 ext4_lblk_t next_lblk;
3479                 ext4_fsblk_t next_pblk, ee_pblk;
3480                 unsigned int next_len;
3481
3482                 abut_ex = ex + 1;
3483                 next_lblk = le32_to_cpu(abut_ex->ee_block);
3484                 next_len = ext4_ext_get_actual_len(abut_ex);
3485                 next_pblk = ext4_ext_pblock(abut_ex);
3486                 ee_pblk = ext4_ext_pblock(ex);
3487
3488                 /*
3489                  * A transfer of blocks from 'ex' to 'abut_ex' is allowed
3490                  * upon those conditions:
3491                  * - C1: abut_ex is initialized,
3492                  * - C2: abut_ex is logically abutting ex,
3493                  * - C3: abut_ex is physically abutting ex,
3494                  * - C4: abut_ex can receive the additional blocks without
3495                  *   overflowing the (initialized) length limit.
3496                  */
3497                 if ((!ext4_ext_is_unwritten(abut_ex)) &&                /*C1*/
3498                     ((map->m_lblk + map_len) == next_lblk) &&           /*C2*/
3499                     ((ee_pblk + ee_len) == next_pblk) &&                /*C3*/
3500                     (next_len < (EXT_INIT_MAX_LEN - map_len))) {        /*C4*/
3501                         err = ext4_ext_get_access(handle, inode, path + depth);
3502                         if (err)
3503                                 goto out;
3504
3505                         trace_ext4_ext_convert_to_initialized_fastpath(inode,
3506                                 map, ex, abut_ex);
3507
3508                         /* Shift the start of abut_ex by 'map_len' blocks */
3509                         abut_ex->ee_block = cpu_to_le32(next_lblk - map_len);
3510                         ext4_ext_store_pblock(abut_ex, next_pblk - map_len);
3511                         ex->ee_len = cpu_to_le16(ee_len - map_len);
3512                         ext4_ext_mark_unwritten(ex); /* Restore the flag */
3513
3514                         /* Extend abut_ex by 'map_len' blocks */
3515                         abut_ex->ee_len = cpu_to_le16(next_len + map_len);
3516
3517                         /* Result: number of initialized blocks past m_lblk */
3518                         allocated = map_len;
3519                 }
3520         }
3521         if (allocated) {
3522                 /* Mark the block containing both extents as dirty */
3523                 ext4_ext_dirty(handle, inode, path + depth);
3524
3525                 /* Update path to point to the right extent */
3526                 path[depth].p_ext = abut_ex;
3527                 goto out;
3528         } else
3529                 allocated = ee_len - (map->m_lblk - ee_block);
3530
3531         WARN_ON(map->m_lblk < ee_block);
3532         /*
3533          * It is safe to convert extent to initialized via explicit
3534          * zeroout only if extent is fully inside i_size or new_size.
3535          */
3536         split_flag |= ee_block + ee_len <= eof_block ? EXT4_EXT_MAY_ZEROOUT : 0;
3537
3538         if (EXT4_EXT_MAY_ZEROOUT & split_flag)
3539                 max_zeroout = sbi->s_extent_max_zeroout_kb >>
3540                         (inode->i_sb->s_blocksize_bits - 10);
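        /*
         * For example (illustrative): with the default
         * s_extent_max_zeroout_kb of 32 and 4k blocks, max_zeroout ==
         * 32 >> (12 - 10) == 8 blocks.
         */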
3541
3542         /* If extent is less than s_extent_max_zeroout_kb, zeroout directly */
3543         if (max_zeroout && (ee_len <= max_zeroout)) {
3544                 err = ext4_ext_zeroout(inode, ex);
3545                 if (err)
3546                         goto out;
3547                 zero_ex.ee_block = ex->ee_block;
3548                 zero_ex.ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex));
3549                 ext4_ext_store_pblock(&zero_ex, ext4_ext_pblock(ex));
3550
3551                 err = ext4_ext_get_access(handle, inode, path + depth);
3552                 if (err)
3553                         goto out;
3554                 ext4_ext_mark_initialized(ex);
3555                 ext4_ext_try_to_merge(handle, inode, path, ex);
3556                 err = ext4_ext_dirty(handle, inode, path + path->p_depth);
3557                 goto out;
3558         }
3559
3560         /*
3561          * four cases:
3562          * 1. split the extent into three extents.
3563          * 2. split the extent into two extents, zeroout the first half.
3564          * 3. split the extent into two extents, zeroout the second half.
3565          * 4. split the extent into two extents without zeroout.
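         *
         * For example (illustrative): with max_zeroout == 8, a 4-block
         * write starting 10 blocks into a 32-block unwritten extent has
         * allocated == 22 > max_zeroout and 10 + 4 >= max_zeroout, so
         * neither zeroout case applies and the extent is split into
         * three extents (case 1).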
3566          */
3567         split_map.m_lblk = map->m_lblk;
3568         split_map.m_len = map->m_len;
3569
3570         if (max_zeroout && (allocated > map->m_len)) {
3571                 if (allocated <= max_zeroout) {
3572                         /* case 3 */
3573                         zero_ex.ee_block =
3574                                          cpu_to_le32(map->m_lblk);
3575                         zero_ex.ee_len = cpu_to_le16(allocated);
3576                         ext4_ext_store_pblock(&zero_ex,
3577                                 ext4_ext_pblock(ex) + map->m_lblk - ee_block);
3578                         err = ext4_ext_zeroout(inode, &zero_ex);
3579                         if (err)
3580                                 goto out;
3581                         split_map.m_lblk = map->m_lblk;
3582                         split_map.m_len = allocated;
3583                 } else if (map->m_lblk - ee_block + map->m_len < max_zeroout) {
3584                         /* case 2 */
3585                         if (map->m_lblk != ee_block) {
3586                                 zero_ex.ee_block = ex->ee_block;
3587                                 zero_ex.ee_len = cpu_to_le16(map->m_lblk -
3588                                                         ee_block);
3589                                 ext4_ext_store_pblock(&zero_ex,
3590                                                       ext4_ext_pblock(ex));
3591                                 err = ext4_ext_zeroout(inode, &zero_ex);
3592                                 if (err)
3593                                         goto out;
3594                         }
3595
3596                         split_map.m_lblk = ee_block;
3597                         split_map.m_len = map->m_lblk - ee_block + map->m_len;
3598                         allocated = map->m_len;
3599                 }
3600         }
3601
3602         allocated = ext4_split_extent(handle, inode, path,
3603                                       &split_map, split_flag, flags);
3604         if (allocated < 0)
3605                 err = allocated;
3606
3607 out:
3608         /* If we have gotten a failure, don't zero out status tree */
3609         if (!err)
3610                 err = ext4_zeroout_es(inode, &zero_ex);
3611         return err ? err : allocated;
3612 }
3613
3614 /*
3615  * This function is called by ext4_ext_map_blocks() from
3616  * ext4_get_blocks_dio_write() when DIO is used to write
3617  * to an unwritten extent.
3618  *
3619  * Writing to an unwritten extent may result in splitting the unwritten
3620  * extent into multiple initialized/unwritten extents (up to three)
3621  * There are three possibilities:
3622  *   a> There is no split required: Entire extent should be unwritten
3623  *   b> Splits in two extents: Write is happening at either end of the extent
3624  *   c> Splits in three extents: Someone is writing in the middle of the extent
3625  *
3626  * This works the same way in the case of initialized -> unwritten conversion.
3627  *
3628  * One or more index blocks may be needed if the extent tree grows after
3629  * the unwritten extent is split. To prevent ENOSPC from occurring when
3630  * the IO completes, we need to split the unwritten extent before the
3631  * DIO is submitted. The unwritten extent will be split into at most
3632  * three unwritten extents. After the IO completes, the part being
3633  * filled will be converted to initialized by the end_io callback
3634  * via ext4_convert_unwritten_extents().
3635  *
3636  * Returns the size of unwritten extent to be written on success.
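 *
 * For example (illustrative): a DIO write covering blocks 130..169 of
 * an unwritten extent spanning 100..199 is preceded by splits at 170
 * and 130, giving three unwritten extents; only 130..169 is converted
 * to written by the end_io callback.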
3637  */
3638 static int ext4_split_convert_extents(handle_t *handle,
3639                                         struct inode *inode,
3640                                         struct ext4_map_blocks *map,
3641                                         struct ext4_ext_path *path,
3642                                         int flags)
3643 {
3644         ext4_lblk_t eof_block;
3645         ext4_lblk_t ee_block;
3646         struct ext4_extent *ex;
3647         unsigned int ee_len;
3648         int split_flag = 0, depth;
3649
3650         ext_debug("%s: inode %lu, logical block %llu, max_blocks %u\n",
3651                   __func__, inode->i_ino,
3652                   (unsigned long long)map->m_lblk, map->m_len);
3653
3654         eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >>
3655                 inode->i_sb->s_blocksize_bits;
3656         if (eof_block < map->m_lblk + map->m_len)
3657                 eof_block = map->m_lblk + map->m_len;
3658         /*
3659          * It is safe to convert extent to initialized via explicit
3660          * zeroout only if extent is fully inside i_size or new_size.
3661          */
3662         depth = ext_depth(inode);
3663         ex = path[depth].p_ext;
3664         ee_block = le32_to_cpu(ex->ee_block);
3665         ee_len = ext4_ext_get_actual_len(ex);
3666
3667         /* Convert to unwritten */
3668         if (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN) {
3669                 split_flag |= EXT4_EXT_DATA_VALID1;
3670         /* Convert to initialized */
3671         } else if (flags & EXT4_GET_BLOCKS_CONVERT) {
3672                 split_flag |= ee_block + ee_len <= eof_block ?
3673                               EXT4_EXT_MAY_ZEROOUT : 0;
3674                 split_flag |= (EXT4_EXT_MARK_UNWRIT2 | EXT4_EXT_DATA_VALID2);
3675         }
3676         flags |= EXT4_GET_BLOCKS_PRE_IO;
3677         return ext4_split_extent(handle, inode, path, map, split_flag, flags);
3678 }
3679
3680 static int ext4_convert_unwritten_extents_endio(handle_t *handle,
3681                                                 struct inode *inode,
3682                                                 struct ext4_map_blocks *map,
3683                                                 struct ext4_ext_path *path)
3684 {
3685         struct ext4_extent *ex;
3686         ext4_lblk_t ee_block;
3687         unsigned int ee_len;
3688         int depth;
3689         int err = 0;
3690
3691         depth = ext_depth(inode);
3692         ex = path[depth].p_ext;
3693         ee_block = le32_to_cpu(ex->ee_block);
3694         ee_len = ext4_ext_get_actual_len(ex);
3695
3696         ext_debug("ext4_convert_unwritten_extents_endio: inode %lu, logical "
3697                 "block %llu, max_blocks %u\n", inode->i_ino,
3698                   (unsigned long long)ee_block, ee_len);
3699
3700         /* If the extent is larger than requested, it is a clear sign that we
3701          * still have some extent state machine issues left, so an extent
3702          * split is still required.
3703          * TODO: once all related issues are fixed, this situation should be
3704          * illegal.
3705          */
3706         if (ee_block != map->m_lblk || ee_len > map->m_len) {
3707 #ifdef EXT4_DEBUG
3708                 ext4_warning("Inode (%ld) finished: extent logical block %llu,"
3709                              " len %u; IO logical block %llu, len %u\n",
3710                              inode->i_ino, (unsigned long long)ee_block, ee_len,
3711                              (unsigned long long)map->m_lblk, map->m_len);
3712 #endif
3713                 err = ext4_split_convert_extents(handle, inode, map, path,
3714                                                  EXT4_GET_BLOCKS_CONVERT);
3715                 if (err < 0)
3716                         goto out;
3717                 ext4_ext_drop_refs(path);
3718                 path = ext4_ext_find_extent(inode, map->m_lblk, &path,
3719                                             EXT4_EX_NOFREE_ON_ERR);
3720                 if (IS_ERR(path)) {
3721                         err = PTR_ERR(path);
3722                         goto out;
3723                 }
3724                 depth = ext_depth(inode);
3725                 ex = path[depth].p_ext;
3726         }
3727
3728         err = ext4_ext_get_access(handle, inode, path + depth);
3729         if (err)
3730                 goto out;
3731         /* first mark the extent as initialized */
3732         ext4_ext_mark_initialized(ex);
3733
3734         /* note: ext4_ext_correct_indexes() isn't needed here because
3735          * borders are not changed
3736          */
3737         ext4_ext_try_to_merge(handle, inode, path, ex);
3738
3739         /* Mark modified extent as dirty */
3740         err = ext4_ext_dirty(handle, inode, path + path->p_depth);
3741 out:
3742         ext4_ext_show_leaf(inode, path);
3743         return err;
3744 }
3745
3746 static void unmap_underlying_metadata_blocks(struct block_device *bdev,
3747                         sector_t block, int count)
3748 {
3749         int i;
3750         for (i = 0; i < count; i++)
3751                 unmap_underlying_metadata(bdev, block + i);
3752 }
3753
3754 /*
3755  * Handle EOFBLOCKS_FL flag, clearing it if necessary
3756  */
3757 static int check_eofblocks_fl(handle_t *handle, struct inode *inode,
3758                               ext4_lblk_t lblk,
3759                               struct ext4_ext_path *path,
3760                               unsigned int len)
3761 {
3762         int i, depth;
3763         struct ext4_extent_header *eh;
3764         struct ext4_extent *last_ex;
3765
3766         if (!ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS))
3767                 return 0;
3768
3769         depth = ext_depth(inode);
3770         eh = path[depth].p_hdr;
3771
3772         /*
3773          * We're going to remove EOFBLOCKS_FL entirely in future so we
3774          * do not care for this case anymore. Simply remove the flag
3775          * if there are no extents.
3776          */
3777         if (unlikely(!eh->eh_entries))
3778                 goto out;
3779         last_ex = EXT_LAST_EXTENT(eh);
3780         /*
3781          * We should clear the EOFBLOCKS_FL flag if we are writing the
3782          * last block in the last extent in the file.  We test this by
3783          * first checking to see if the caller to
3784          * ext4_ext_get_blocks() was interested in the last block (or
3785          * a block beyond the last block) in the current extent.  If
3786          * this turns out to be false, we can bail out from this
3787          * function immediately.
3788          */
3789         if (lblk + len < le32_to_cpu(last_ex->ee_block) +
3790             ext4_ext_get_actual_len(last_ex))
3791                 return 0;
3792         /*
3793          * If the caller does appear to be planning to write at or
3794          * beyond the end of the current extent, we then test to see
3795          * if the current extent is the last extent in the file, by
3796          * checking to make sure it was reached via the rightmost node
3797          * at each level of the tree.
3798          */
3799         for (i = depth-1; i >= 0; i--)
3800                 if (path[i].p_idx != EXT_LAST_INDEX(path[i].p_hdr))
3801                         return 0;
3802 out:
3803         ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
3804         return ext4_mark_inode_dirty(handle, inode);
3805 }
3806
3807 /**
3808  * ext4_find_delalloc_range: find a delayed allocated block in the given range.
3809  *
3810  * Return 1 if there is a delalloc block in the range, otherwise 0.
3811  */
3812 int ext4_find_delalloc_range(struct inode *inode,
3813                              ext4_lblk_t lblk_start,
3814                              ext4_lblk_t lblk_end)
3815 {
3816         struct extent_status es;
3817
3818         ext4_es_find_delayed_extent_range(inode, lblk_start, lblk_end, &es);
3819         if (es.es_len == 0)
3820                 return 0; /* there is no delay extent in this tree */
3821         else if (es.es_lblk <= lblk_start &&
3822                  lblk_start < es.es_lblk + es.es_len)
3823                 return 1;
3824         else if (lblk_start <= es.es_lblk && es.es_lblk <= lblk_end)
3825                 return 1;
3826         else
3827                 return 0;
3828 }
3829
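/*
 * For example (illustrative): with bigalloc and s_cluster_ratio == 16,
 * lblk == 35 probes the whole cluster covering blocks 32..47, since
 * EXT4_LBLK_CMASK(sbi, 35) == 32.
 */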
3830 int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk)
3831 {
3832         struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
3833         ext4_lblk_t lblk_start, lblk_end;
3834         lblk_start = EXT4_LBLK_CMASK(sbi, lblk);
3835         lblk_end = lblk_start + sbi->s_cluster_ratio - 1;
3836
3837         return ext4_find_delalloc_range(inode, lblk_start, lblk_end);
3838 }
3839
3840 /**
3841  * Determines how many complete clusters (out of those specified by the 'map')
3842  * are under delalloc and had quota reserved for them.
3843  * This function is called when we are writing out the blocks that were
3844  * originally written with their allocation delayed, but then the space was
3845  * allocated using fallocate() before the delayed allocation could be resolved.
3846  * The cases to look for are:
3847  * ('=' indicates delayed allocated blocks
3848  *  '-' indicates non-delayed allocated blocks)
3849  * (a) partial clusters towards beginning and/or end outside of allocated range
3850  *     are not delalloc'ed.
3851  *      Ex:
3852  *      |----c---=|====c====|====c====|===-c----|
3853  *               |++++++ allocated ++++++|
3854  *      ==> 4 complete clusters in the above example
3855  *
3856  * (b) partial cluster (outside of allocated range) towards either end is
3857  *     marked for delayed allocation. In this case, we will exclude that
3858  *     cluster.
3859  *      Ex:
3860  *      |----====c========|========c========|
3861  *           |++++++ allocated ++++++|
3862  *      ==> 1 complete cluster in the above example
3863  *
3864  *      Ex:
3865  *      |================c================|
3866  *            |++++++ allocated ++++++|
3867  *      ==> 0 complete clusters in the above example
3868  *
3869  * ext4_da_update_reserve_space() will be called only if we
3870  * determine here that there were some "entire" clusters that span
3871  * this 'allocated' range.
3872  * In the non-bigalloc case, this function will just end up returning num_blks
3873  * without ever calling ext4_find_delalloc_range.
3874  */
3875 static unsigned int
3876 get_reserved_cluster_alloc(struct inode *inode, ext4_lblk_t lblk_start,
3877                            unsigned int num_blks)
3878 {
3879         struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
3880         ext4_lblk_t alloc_cluster_start, alloc_cluster_end;
3881         ext4_lblk_t lblk_from, lblk_to, c_offset;
3882         unsigned int allocated_clusters = 0;
3883
3884         alloc_cluster_start = EXT4_B2C(sbi, lblk_start);
3885         alloc_cluster_end = EXT4_B2C(sbi, lblk_start + num_blks - 1);
3886
3887         /* max possible clusters for this allocation */
3888         allocated_clusters = alloc_cluster_end - alloc_cluster_start + 1;
3889
3890         trace_ext4_get_reserved_cluster_alloc(inode, lblk_start, num_blks);
3891
3892         /* Check towards left side */
3893         c_offset = EXT4_LBLK_COFF(sbi, lblk_start);
3894         if (c_offset) {
3895                 lblk_from = EXT4_LBLK_CMASK(sbi, lblk_start);
3896                 lblk_to = lblk_from + c_offset - 1;
3897
3898                 if (ext4_find_delalloc_range(inode, lblk_from, lblk_to))
3899                         allocated_clusters--;
3900         }
3901
3902         /* Now check towards right. */
3903         c_offset = EXT4_LBLK_COFF(sbi, lblk_start + num_blks);
3904         if (allocated_clusters && c_offset) {
3905                 lblk_from = lblk_start + num_blks;
3906                 lblk_to = lblk_from + (sbi->s_cluster_ratio - c_offset) - 1;
3907
3908                 if (ext4_find_delalloc_range(inode, lblk_from, lblk_to))
3909                         allocated_clusters--;
3910         }
3911
3912         return allocated_clusters;
3913 }
3914
3915 static int
3916 convert_initialized_extent(handle_t *handle, struct inode *inode,
3917                            struct ext4_map_blocks *map,
3918                            struct ext4_ext_path *path, int flags,
3919                            unsigned int allocated, ext4_fsblk_t newblock)
3920 {
3921         struct ext4_extent *ex;
3922         ext4_lblk_t ee_block;
3923         unsigned int ee_len;
3924         int depth;
3925         int err = 0;
3926
3927         /*
3928          * Make sure that the extent is no bigger than we support with
3929          * an unwritten extent
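         * (assumed rationale: EXT_UNWRITTEN_MAX_LEN is one block short
         * of EXT_INIT_MAX_LEN, so halving the request rather than
         * clamping to the exact maximum leaves headroom for the split
         * below)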
3930          */
3931         if (map->m_len > EXT_UNWRITTEN_MAX_LEN)
3932                 map->m_len = EXT_UNWRITTEN_MAX_LEN / 2;
3933
3934         depth = ext_depth(inode);
3935         ex = path[depth].p_ext;
3936         ee_block = le32_to_cpu(ex->ee_block);
3937         ee_len = ext4_ext_get_actual_len(ex);
3938
3939         ext_debug("%s: inode %lu, logical "
3940                 "block %llu, max_blocks %u\n", __func__, inode->i_ino,
3941                   (unsigned long long)ee_block, ee_len);
3942
3943         if (ee_block != map->m_lblk || ee_len > map->m_len) {
3944                 err = ext4_split_convert_extents(handle, inode, map, path,
3945                                 EXT4_GET_BLOCKS_CONVERT_UNWRITTEN);
3946                 if (err < 0)
3947                         return err;
3948                 ext4_ext_drop_refs(path);
3949                 path = ext4_ext_find_extent(inode, map->m_lblk, &path,
3950                                             EXT4_EX_NOFREE_ON_ERR);
3951                 if (IS_ERR(path))
3952                         return PTR_ERR(path);
3953                 depth = ext_depth(inode);
3954                 ex = path[depth].p_ext;
3955                 if (!ex) {
3956                         EXT4_ERROR_INODE(inode, "unexpected hole at %lu",
3957                                          (unsigned long) map->m_lblk);
3958                         return -EIO;
3959                 }
3960         }
3961
3962         err = ext4_ext_get_access(handle, inode, path + depth);
3963         if (err)
3964                 return err;
3965         /* first mark the extent as unwritten */
3966         ext4_ext_mark_unwritten(ex);
3967
3968         /* note: ext4_ext_correct_indexes() isn't needed here because
3969          * borders are not changed
3970          */
3971         ext4_ext_try_to_merge(handle, inode, path, ex);
3972
3973         /* Mark modified extent as dirty */
3974         err = ext4_ext_dirty(handle, inode, path + path->p_depth);
3975         if (err)
3976                 return err;
3977         ext4_ext_show_leaf(inode, path);
3978
3979         ext4_update_inode_fsync_trans(handle, inode, 1);
3980         err = check_eofblocks_fl(handle, inode, map->m_lblk, path, map->m_len);
3981         if (err)
3982                 return err;
3983         map->m_flags |= EXT4_MAP_UNWRITTEN;
3984         if (allocated > map->m_len)
3985                 allocated = map->m_len;
3986         map->m_len = allocated;
3987         return allocated;
3988 }
3989
3990 static int
3991 ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode,
3992                         struct ext4_map_blocks *map,
3993                         struct ext4_ext_path *path, int flags,
3994                         unsigned int allocated, ext4_fsblk_t newblock)
3995 {
3996         int ret = 0;
3997         int err = 0;
3998         ext4_io_end_t *io = ext4_inode_aio(inode);
3999
4000         ext_debug("ext4_ext_handle_unwritten_extents: inode %lu, logical "
4001                   "block %llu, max_blocks %u, flags %x, allocated %u\n",
4002                   inode->i_ino, (unsigned long long)map->m_lblk, map->m_len,
4003                   flags, allocated);
4004         ext4_ext_show_leaf(inode, path);
4005
4006         /*
4007          * When writing into unwritten space, we should not fail to
4008          * allocate metadata blocks for the new extent block if needed.
4009          */
4010         flags |= EXT4_GET_BLOCKS_METADATA_NOFAIL;
4011
4012         trace_ext4_ext_handle_unwritten_extents(inode, map, flags,
4013                                                     allocated, newblock);
4014
4015         /* get_block() called before submitting the IO: split the extent */
4016         if (flags & EXT4_GET_BLOCKS_PRE_IO) {
4017                 ret = ext4_split_convert_extents(handle, inode, map,
4018                                          path, flags | EXT4_GET_BLOCKS_CONVERT);
4019                 if (ret <= 0)
4020                         goto out;
4021                 /*
4022                  * Flag the inode (non-aio case) or end_io struct (aio case)
4023                  * that this IO needs to be converted to written when the
4024                  * IO is completed
4025                  */
4026                 if (io)
4027                         ext4_set_io_unwritten_flag(inode, io);
4028                 else
4029                         ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
4030                 map->m_flags |= EXT4_MAP_UNWRITTEN;
4031                 goto out;
4032         }
4033         /* IO end_io completed: convert the filled extent to written */
4034         if (flags & EXT4_GET_BLOCKS_CONVERT) {
4035                 ret = ext4_convert_unwritten_extents_endio(handle, inode, map,
4036                                                         path);
4037                 if (ret >= 0) {
4038                         ext4_update_inode_fsync_trans(handle, inode, 1);
4039                         err = check_eofblocks_fl(handle, inode, map->m_lblk,
4040                                                  path, map->m_len);
4041                 } else
4042                         err = ret;
4043                 map->m_flags |= EXT4_MAP_MAPPED;
4044                 map->m_pblk = newblock;
4045                 if (allocated > map->m_len)
4046                         allocated = map->m_len;
4047                 map->m_len = allocated;
4048                 goto out2;
4049         }
4050         /* buffered IO case */
4051         /*
4052          * repeated fallocate creation request:
4053          * we already have an unwritten extent, so nothing more to do
4054          */
4055         if (flags & EXT4_GET_BLOCKS_UNWRIT_EXT) {
4056                 map->m_flags |= EXT4_MAP_UNWRITTEN;
4057                 goto map_out;
4058         }
4059
4060         /* buffered READ or buffered write_begin() lookup */
4061         if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
4062                 /*
4063                  * We have blocks reserved already.  We
4064                  * return allocated blocks so that delalloc
4065                  * won't do block reservation for us.  But
4066                  * the buffer head will be unmapped so that
4067                  * a read from the block returns 0s.
4068                  */
4069                 map->m_flags |= EXT4_MAP_UNWRITTEN;
4070                 goto out1;
4071         }
4072
4073         /* buffered write, writepage time: convert */
4074         ret = ext4_ext_convert_to_initialized(handle, inode, map, path, flags);
4075         if (ret >= 0)
4076                 ext4_update_inode_fsync_trans(handle, inode, 1);
4077 out:
4078         if (ret <= 0) {
4079                 err = ret;
4080                 goto out2;
4081         } else
4082                 allocated = ret;
4083         map->m_flags |= EXT4_MAP_NEW;
4084         /*
4085          * if we allocated more blocks than requested
4086          * we need to make sure we unmap the extra blocks
4087          * allocated. The actually needed blocks will get
4088          * unmapped later when we find the buffer_head marked
4089          * new.
4090          */
4091         if (allocated > map->m_len) {
4092                 unmap_underlying_metadata_blocks(inode->i_sb->s_bdev,
4093                                         newblock + map->m_len,
4094                                         allocated - map->m_len);
4095                 allocated = map->m_len;
4096         }
4097         map->m_len = allocated;
4098
4099         /*
4100          * If we have done fallocate with an offset that is already
4101          * delayed allocated, we would have block and quota
4102          * reservations done in the delayed write path.
4103          * But fallocate would have already updated the quota and block
4104          * counts for this offset. So cancel these reservations.
4105          */
4106         if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) {
4107                 unsigned int reserved_clusters;
4108                 reserved_clusters = get_reserved_cluster_alloc(inode,
4109                                 map->m_lblk, map->m_len);
4110                 if (reserved_clusters)
4111                         ext4_da_update_reserve_space(inode,
4112                                                      reserved_clusters,
4113                                                      0);
4114         }
4115
4116 map_out:
4117         map->m_flags |= EXT4_MAP_MAPPED;
4118         if ((flags & EXT4_GET_BLOCKS_KEEP_SIZE) == 0) {
4119                 err = check_eofblocks_fl(handle, inode, map->m_lblk, path,
4120                                          map->m_len);
4121                 if (err < 0)
4122                         goto out2;
4123         }
4124 out1:
4125         if (allocated > map->m_len)
4126                 allocated = map->m_len;
4127         ext4_ext_show_leaf(inode, path);
4128         map->m_pblk = newblock;
4129         map->m_len = allocated;
4130 out2:
4131         return err ? err : allocated;
4132 }
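/*
 * Summary of the dispatch above (descriptive note): PRE_IO splits the
 * extent before the IO is submitted and defers conversion to end_io;
 * CONVERT performs that deferred unwritten->written conversion once the
 * IO has completed; a repeated fallocate (UNWRIT_EXT) and a plain lookup
 * (!CREATE) leave the extent unwritten; anything else is a buffered
 * write at writepage time and is converted to initialized immediately.
 */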
4133
4134 /*
4135  * get_implied_cluster_alloc - check to see if the requested
4136  * allocation (in the map structure) overlaps with a cluster already
4137  * allocated in an extent.
4138  *      @sb     The filesystem superblock structure
4139  *      @map    The requested lblk->pblk mapping
4140  *      @ex     The extent structure which might contain an implied
4141  *                      cluster allocation
4142  *
4143  * This function is called by ext4_ext_map_blocks() after we failed to
4144  * find blocks that were already in the inode's extent tree.  Hence,
4145  * we know that the beginning of the requested region cannot overlap
4146  * the extent from the inode's extent tree.  There are three cases we
4147  * want to catch.  The first is this case:
4148  *
4149  *               |--- cluster # N--|
4150  *    |--- extent ---|  |---- requested region ---|
4151  *                      |==========|
4152  *
4153  * The second case that we need to test for is this one:
4154  *
4155  *   |--------- cluster # N ----------------|
4156  *         |--- requested region --|   |------- extent ----|
4157  *         |=======================|
4158  *
4159  * The third case is when the requested region lies between two extents
4160  * within the same cluster:
4161  *          |------------- cluster # N-------------|
4162  * |----- ex -----|                  |---- ex_right ----|
4163  *                  |------ requested region ------|
4164  *                  |================|
4165  *
4166  * In each of the above cases, we need to set the map->m_pblk and
4167  * map->m_len so they correspond to the extent labelled as
4168  * "|====|" from cluster #N, since it is already in use for data in
4169  * cluster EXT4_B2C(sbi, map->m_lblk).  We will then return 1 to
4170  * signal to ext4_ext_map_blocks() that map->m_pblk should be treated
4171  * as a new "allocated" block region.  Otherwise, we will return 0 and
4172  * ext4_ext_map_blocks() will then allocate one or more new clusters
4173  * by calling ext4_mb_new_blocks().
4174  */
4175 static int get_implied_cluster_alloc(struct super_block *sb,
4176                                      struct ext4_map_blocks *map,
4177                                      struct ext4_extent *ex,
4178                                      struct ext4_ext_path *path)
4179 {
4180         struct ext4_sb_info *sbi = EXT4_SB(sb);
4181         ext4_lblk_t c_offset = EXT4_LBLK_COFF(sbi, map->m_lblk);
4182         ext4_lblk_t ex_cluster_start, ex_cluster_end;
4183         ext4_lblk_t rr_cluster_start;
4184         ext4_lblk_t ee_block = le32_to_cpu(ex->ee_block);
4185         ext4_fsblk_t ee_start = ext4_ext_pblock(ex);
4186         unsigned short ee_len = ext4_ext_get_actual_len(ex);
4187
4188         /* The extent passed in that we are trying to match */
4189         ex_cluster_start = EXT4_B2C(sbi, ee_block);
4190         ex_cluster_end = EXT4_B2C(sbi, ee_block + ee_len - 1);
4191
4192         /* The requested region passed into ext4_map_blocks() */
4193         rr_cluster_start = EXT4_B2C(sbi, map->m_lblk);
4194
4195         if ((rr_cluster_start == ex_cluster_end) ||
4196             (rr_cluster_start == ex_cluster_start)) {
4197                 if (rr_cluster_start == ex_cluster_end)
4198                         ee_start += ee_len - 1;
4199                 map->m_pblk = EXT4_PBLK_CMASK(sbi, ee_start) + c_offset;
4200                 map->m_len = min(map->m_len,
4201                                  (unsigned) sbi->s_cluster_ratio - c_offset);
4202                 /*
4203                  * Check for and handle this case:
4204                  *
4205                  *   |--------- cluster # N-------------|
4206                  *                     |------- extent ----|
4207                  *         |--- requested region ---|
4208                  *         |===========|
4209                  */
4210
4211                 if (map->m_lblk < ee_block)
4212                         map->m_len = min(map->m_len, ee_block - map->m_lblk);
4213
4214                 /*
4215                  * Check for the case where there is already another allocated
4216                  * block to the right of 'ex' but before the end of the cluster.
4217                  *
4218                  *          |------------- cluster # N-------------|
4219                  * |----- ex -----|                  |---- ex_right ----|
4220                  *                  |------ requested region ------|
4221                  *                  |================|
4222                  */
4223                 if (map->m_lblk > ee_block) {
4224                         ext4_lblk_t next = ext4_ext_next_allocated_block(path);
4225                         map->m_len = min(map->m_len, next - map->m_lblk);
4226                 }
4227
4228                 trace_ext4_get_implied_cluster_alloc_exit(sb, map, 1);
4229                 return 1;
4230         }
4231
4232         trace_ext4_get_implied_cluster_alloc_exit(sb, map, 0);
4233         return 0;
4234 }
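/*
 * Worked example (a sketch, assuming s_cluster_ratio = 4, i.e. clusters
 * [0-3],[4-7],[8-11]...): suppose ex maps logical blocks 8-9 and the
 * request starts at logical block 10. Then c_offset = 2, and the request
 * falls in the same cluster #2 as the end of ex, so we report an implied
 * allocation: m_pblk is set to offset 2 within ex's physical cluster and
 * m_len is clamped to the 2 blocks remaining in the cluster (and further
 * clamped below the next allocated extent, if any).
 */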
4235
4236
4237 /*
4238  * Block allocation/map/preallocation routine for extent-based files
4239  *
4240  *
4241  * Needs to be called with
4242  * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system blocks
4243  * (i.e., create is zero). Otherwise down_write(&EXT4_I(inode)->i_data_sem)
4244  *
4245  * return > 0, number of blocks already mapped/allocated
4246  *          if create == 0 and these are pre-allocated blocks
4247  *              buffer head is unmapped
4248  *          otherwise blocks are mapped
4249  *
4250  * return = 0, if plain look up failed (blocks have not been allocated)
4251  *          buffer head is unmapped
4252  *
4253  * return < 0, error case.
4254  */
4255 int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
4256                         struct ext4_map_blocks *map, int flags)
4257 {
4258         struct ext4_ext_path *path = NULL;
4259         struct ext4_extent newex, *ex, *ex2;
4260         struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
4261         ext4_fsblk_t newblock = 0;
4262         int free_on_err = 0, err = 0, depth, ret;
4263         unsigned int allocated = 0, offset = 0;
4264         unsigned int allocated_clusters = 0;
4265         struct ext4_allocation_request ar;
4266         ext4_io_end_t *io = ext4_inode_aio(inode);
4267         ext4_lblk_t cluster_offset;
4268         int set_unwritten = 0;
4269
4270         ext_debug("blocks %u/%u requested for inode %lu\n",
4271                   map->m_lblk, map->m_len, inode->i_ino);
4272         trace_ext4_ext_map_blocks_enter(inode, map->m_lblk, map->m_len, flags);
4273
4274         /* find extent for this block */
4275         path = ext4_ext_find_extent(inode, map->m_lblk, NULL, 0);
4276         if (IS_ERR(path)) {
4277                 err = PTR_ERR(path);
4278                 path = NULL;
4279                 goto out2;
4280         }
4281
4282         depth = ext_depth(inode);
4283
4284         /*
4285          * a consistent leaf must not be empty;
4286          * this situation is possible, though, _during_ tree modification;
4287          * this is why the assert can't be put in ext4_ext_find_extent()
4288          */
4289         if (unlikely(path[depth].p_ext == NULL && depth != 0)) {
4290                 EXT4_ERROR_INODE(inode, "bad extent address "
4291                                  "lblock: %lu, depth: %d pblock %lld",
4292                                  (unsigned long) map->m_lblk, depth,
4293                                  path[depth].p_block);
4294                 err = -EIO;
4295                 goto out2;
4296         }
4297
4298         ex = path[depth].p_ext;
4299         if (ex) {
4300                 ext4_lblk_t ee_block = le32_to_cpu(ex->ee_block);
4301                 ext4_fsblk_t ee_start = ext4_ext_pblock(ex);
4302                 unsigned short ee_len;
4303
4304
4305                 /*
4306                  * unwritten extents are treated as holes, except that
4307                  * we split out initialized portions during a write.
4308                  */
4309                 ee_len = ext4_ext_get_actual_len(ex);
4310
4311                 trace_ext4_ext_show_extent(inode, ee_block, ee_start, ee_len);
4312
4313                 /* if found extent covers block, simply return it */
4314                 if (in_range(map->m_lblk, ee_block, ee_len)) {
4315                         newblock = map->m_lblk - ee_block + ee_start;
4316                         /* number of remaining blocks in the extent */
4317                         allocated = ee_len - (map->m_lblk - ee_block);
4318                         ext_debug("%u fit into %u:%d -> %llu\n", map->m_lblk,
4319                                   ee_block, ee_len, newblock);
4320
4321                         /*
4322                          * If the extent is initialized check whether the
4323                          * caller wants to convert it to unwritten.
4324                          */
4325                         if ((!ext4_ext_is_unwritten(ex)) &&
4326                             (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN)) {
4327                                 allocated = convert_initialized_extent(
4328                                                 handle, inode, map, path, flags,
4329                                                 allocated, newblock);
4330                                 goto out2;
4331                         } else if (!ext4_ext_is_unwritten(ex))
4332                                 goto out;
4333
4334                         ret = ext4_ext_handle_unwritten_extents(
4335                                 handle, inode, map, path, flags,
4336                                 allocated, newblock);
4337                         if (ret < 0)
4338                                 err = ret;
4339                         else
4340                                 allocated = ret;
4341                         goto out2;
4342                 }
4343         }
4344
4345         if ((sbi->s_cluster_ratio > 1) &&
4346             ext4_find_delalloc_cluster(inode, map->m_lblk))
4347                 map->m_flags |= EXT4_MAP_FROM_CLUSTER;
4348
4349         /*
4350          * the requested block isn't allocated yet;
4351          * we can't try to create blocks if the create flag is zero
4352          */
4353         if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
4354                 /*
4355                  * put the just-found gap into the cache to speed up
4356                  * subsequent requests
4357                  */
4358                 if ((flags & EXT4_GET_BLOCKS_NO_PUT_HOLE) == 0)
4359                         ext4_ext_put_gap_in_cache(inode, path, map->m_lblk);
4360                 goto out2;
4361         }
4362
4363         /*
4364          * Okay, we need to do block allocation.
4365          */
4366         map->m_flags &= ~EXT4_MAP_FROM_CLUSTER;
4367         newex.ee_block = cpu_to_le32(map->m_lblk);
4368         cluster_offset = EXT4_LBLK_COFF(sbi, map->m_lblk);
4369
4370         /*
4371          * If we are doing bigalloc, check to see if the extent returned
4372          * by ext4_ext_find_extent() implies a cluster we can use.
4373          */
4374         if (cluster_offset && ex &&
4375             get_implied_cluster_alloc(inode->i_sb, map, ex, path)) {
4376                 ar.len = allocated = map->m_len;
4377                 newblock = map->m_pblk;
4378                 map->m_flags |= EXT4_MAP_FROM_CLUSTER;
4379                 goto got_allocated_blocks;
4380         }
4381
4382         /* find neighbour allocated blocks */
4383         ar.lleft = map->m_lblk;
4384         err = ext4_ext_search_left(inode, path, &ar.lleft, &ar.pleft);
4385         if (err)
4386                 goto out2;
4387         ar.lright = map->m_lblk;
4388         ex2 = NULL;
4389         err = ext4_ext_search_right(inode, path, &ar.lright, &ar.pright, &ex2);
4390         if (err)
4391                 goto out2;
4392
4393         /* Check if the extent after searching to the right implies a
4394          * cluster we can use. */
4395         if ((sbi->s_cluster_ratio > 1) && ex2 &&
4396             get_implied_cluster_alloc(inode->i_sb, map, ex2, path)) {
4397                 ar.len = allocated = map->m_len;
4398                 newblock = map->m_pblk;
4399                 map->m_flags |= EXT4_MAP_FROM_CLUSTER;
4400                 goto got_allocated_blocks;
4401         }
4402
4403         /*
4404          * See if request is beyond maximum number of blocks we can have in
4405          * a single extent. For an initialized extent this limit is
4406          * EXT_INIT_MAX_LEN and for an unwritten extent this limit is
4407          * EXT_UNWRITTEN_MAX_LEN.
4408          */
4409         if (map->m_len > EXT_INIT_MAX_LEN &&
4410             !(flags & EXT4_GET_BLOCKS_UNWRIT_EXT))
4411                 map->m_len = EXT_INIT_MAX_LEN;
4412         else if (map->m_len > EXT_UNWRITTEN_MAX_LEN &&
4413                  (flags & EXT4_GET_BLOCKS_UNWRIT_EXT))
4414                 map->m_len = EXT_UNWRITTEN_MAX_LEN;
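        /*
         * Note (descriptive, from the on-disk extent format): ee_len is a
         * 16-bit field whose high bit marks the extent unwritten, so an
         * initialized extent can map up to 32768 blocks while an unwritten
         * one is limited to 32767.
         */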
4415
4416         /* Check if we can really insert (m_lblk)::(m_lblk + m_len) extent */
4417         newex.ee_len = cpu_to_le16(map->m_len);
4418         err = ext4_ext_check_overlap(sbi, inode, &newex, path);
4419         if (err)
4420                 allocated = ext4_ext_get_actual_len(&newex);
4421         else
4422                 allocated = map->m_len;
4423
4424         /* allocate new block */
4425         ar.inode = inode;
4426         ar.goal = ext4_ext_find_goal(inode, path, map->m_lblk);
4427         ar.logical = map->m_lblk;
4428         /*
4429          * We calculate the offset from the beginning of the cluster
4430          * for the logical block number, since when we allocate a
4431          * physical cluster, the physical block should start at the
4432          * same offset from the beginning of the cluster.  This is
4433          * needed so that future calls to get_implied_cluster_alloc()
4434          * work correctly.
4435          */
4436         offset = EXT4_LBLK_COFF(sbi, map->m_lblk);
4437         ar.len = EXT4_NUM_B2C(sbi, offset+allocated);
4438         ar.goal -= offset;
4439         ar.logical -= offset;
4440         if (S_ISREG(inode->i_mode))
4441                 ar.flags = EXT4_MB_HINT_DATA;
4442         else
4443                 /* disable in-core preallocation for non-regular files */
4444                 ar.flags = 0;
4445         if (flags & EXT4_GET_BLOCKS_NO_NORMALIZE)
4446                 ar.flags |= EXT4_MB_HINT_NOPREALLOC;
4447         newblock = ext4_mb_new_blocks(handle, &ar, &err);
4448         if (!newblock)
4449                 goto out2;
4450         ext_debug("allocate new block: goal %llu, found %llu/%u\n",
4451                   ar.goal, newblock, allocated);
4452         free_on_err = 1;
4453         allocated_clusters = ar.len;
4454         ar.len = EXT4_C2B(sbi, ar.len) - offset;
4455         if (ar.len > allocated)
4456                 ar.len = allocated;
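        /*
         * Worked example of the cluster-alignment math above (a sketch,
         * assuming s_cluster_ratio = 4): for m_lblk = 10 and allocated = 2,
         * offset = 2, so we ask mballoc for EXT4_NUM_B2C(2 + 2) = 1 cluster
         * with the goal and logical block pulled back by 2. After the
         * allocation, ar.len = EXT4_C2B(1) - 2 = 2 blocks again, trimmed
         * to 'allocated' if mballoc returned more than we asked for.
         */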
4457
4458 got_allocated_blocks:
4459         /* try to insert new extent into found leaf and return */
4460         ext4_ext_store_pblock(&newex, newblock + offset);
4461         newex.ee_len = cpu_to_le16(ar.len);
4462         /* Mark unwritten */
4463         if (flags & EXT4_GET_BLOCKS_UNWRIT_EXT){
4464                 ext4_ext_mark_unwritten(&newex);
4465                 map->m_flags |= EXT4_MAP_UNWRITTEN;
4466                 /*
4467                  * An io_end structure is created for every IO write to an
4468                  * unwritten extent. To avoid unnecessary conversion,
4469                  * here we flag the IO that really needs the conversion.
4470                  * For the non-async direct IO case, flag the inode state
4471                  * that we need to perform conversion when the IO is done.
4472                  */
4473                 if (flags & EXT4_GET_BLOCKS_PRE_IO)
4474                         set_unwritten = 1;
4475         }
4476
4477         err = 0;
4478         if ((flags & EXT4_GET_BLOCKS_KEEP_SIZE) == 0)
4479                 err = check_eofblocks_fl(handle, inode, map->m_lblk,
4480                                          path, ar.len);
4481         if (!err)
4482                 err = ext4_ext_insert_extent(handle, inode, path,
4483                                              &newex, flags);
4484
4485         if (!err && set_unwritten) {
4486                 if (io)
4487                         ext4_set_io_unwritten_flag(inode, io);
4488                 else
4489                         ext4_set_inode_state(inode,
4490                                              EXT4_STATE_DIO_UNWRITTEN);
4491         }
4492
4493         if (err && free_on_err) {
4494                 int fb_flags = flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE ?
4495                         EXT4_FREE_BLOCKS_NO_QUOT_UPDATE : 0;
4496                 /* free data blocks we just allocated */
4497                 /* not a good idea to call discard here directly,
4498                  * but otherwise we'd need to call it on every free() */
4499                 ext4_discard_preallocations(inode);
4500                 ext4_free_blocks(handle, inode, NULL, newblock,
4501                                  EXT4_C2B(sbi, allocated_clusters), fb_flags);
4502                 goto out2;
4503         }
4504
4505         /* the previous routine could have used the block we allocated */
4506         newblock = ext4_ext_pblock(&newex);
4507         allocated = ext4_ext_get_actual_len(&newex);
4508         if (allocated > map->m_len)
4509                 allocated = map->m_len;
4510         map->m_flags |= EXT4_MAP_NEW;
4511
4512         /*
4513          * Update reserved blocks/metadata blocks after successful
4514          * block allocation which had been deferred till now.
4515          */
4516         if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) {
4517                 unsigned int reserved_clusters;
4518                 /*
4519                  * Check how many clusters we had reserved for this allocated range
4520                  */
4521                 reserved_clusters = get_reserved_cluster_alloc(inode,
4522                                                 map->m_lblk, allocated);
4523                 if (map->m_flags & EXT4_MAP_FROM_CLUSTER) {
4524                         if (reserved_clusters) {
4525                                 /*
4526                                  * We have clusters reserved for this range.
4527                                  * But since we are not doing actual allocation
4528                                  * and are simply using blocks from previously
4529                                  * allocated cluster, we should release the
4530                                  * reservation and not claim quota.
4531                                  */
4532                                 ext4_da_update_reserve_space(inode,
4533                                                 reserved_clusters, 0);
4534                         }
4535                 } else {
4536                         BUG_ON(allocated_clusters < reserved_clusters);
4537                         if (reserved_clusters < allocated_clusters) {
4538                                 struct ext4_inode_info *ei = EXT4_I(inode);
4539                                 int reservation = allocated_clusters -
4540                                                   reserved_clusters;
4541                                 /*
4542                                  * It seems we claimed a few clusters outside of
4543                                  * the range of this allocation. We should give
4544                                  * them back to the reservation pool. This can
4545                                  * happen in the following case:
4546                                  *
4547                                  * * Suppose s_cluster_ratio is 4 (i.e., each
4548                                  *   cluster has 4 blocks). Thus, the clusters
4549                                  *   are [0-3],[4-7],[8-11]...
4550                                  * * First comes a delayed allocation write for
4551                                  *   logical blocks 10 & 11. Since there were no
4552                                  *   previous delayed allocated blocks in the
4553                                  *   range [8-11], we would reserve 1 cluster
4554                                  *   for this write.
4555                                  * * Next comes a write for logical blocks 3 to 8.
4556                                  *   In this case, we will reserve 2 clusters
4557                                  *   (for [0-3] and [4-7]; and not for [8-11], as
4558                                  *   that range already has delayed allocated blocks).
4559                                  *   Thus total reserved clusters now becomes 3.
4560                                  * * Now, during the delayed allocation writeout
4561                                  *   time, we will first write blocks [3-8] and
4562                                  *   allocate 3 clusters for writing these
4563                                  *   blocks. Also, we would claim all these
4564                                  *   three clusters above.
4565                                  * * Now when we come here to writeout the
4566                                  *   blocks [10-11], we would expect to claim
4567                                  *   the reservation of 1 cluster we had made
4568                                  *   (and we would claim it since there are no
4569                                  *   more delayed allocated blocks in the range
4570                                  *   [8-11]). But our reserved cluster count had
4571                                  *   already gone to 0.
4572                                  *
4573                                  *   Thus, at step 4 above, when we determine
4574                                  *   that there are still some unwritten delayed
4575                                  *   allocated blocks outside of our current
4576                                  *   block range, we should increment the
4577                                  *   reserved clusters count so that when the
4578                                  *   remaining blocks finally get written, we
4579                                  *   could claim them.
4580                                  */
4581                                 dquot_reserve_block(inode,
4582                                                 EXT4_C2B(sbi, reservation));
4583                                 spin_lock(&ei->i_block_reservation_lock);
4584                                 ei->i_reserved_data_blocks += reservation;
4585                                 spin_unlock(&ei->i_block_reservation_lock);
4586                         }
4587                         /*
4588                          * We will claim quota for all newly allocated blocks.
4589                          * We're updating the reserved space *after* the
4590                          * correction above so we do not accidentally free
4591                          * all the metadata reservation because we might
4592                          * actually need it later on.
4593                          */
4594                         ext4_da_update_reserve_space(inode, allocated_clusters,
4595                                                         1);
4596                 }
4597         }
4598
4599         /*
4600          * Cache the extent and update transaction to commit on fdatasync only
4601          * when it is _not_ an unwritten extent.
4602          */
4603         if ((flags & EXT4_GET_BLOCKS_UNWRIT_EXT) == 0)
4604                 ext4_update_inode_fsync_trans(handle, inode, 1);
4605         else
4606                 ext4_update_inode_fsync_trans(handle, inode, 0);
4607 out:
4608         if (allocated > map->m_len)
4609                 allocated = map->m_len;
4610         ext4_ext_show_leaf(inode, path);
4611         map->m_flags |= EXT4_MAP_MAPPED;
4612         map->m_pblk = newblock;
4613         map->m_len = allocated;
4614 out2:
4615         if (path) {
4616                 ext4_ext_drop_refs(path);
4617                 kfree(path);
4618         }
4619
4620         trace_ext4_ext_map_blocks_exit(inode, flags, map,
4621                                        err ? err : allocated);
4622         ext4_es_lru_add(inode);
4623         return err ? err : allocated;
4624 }
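/*
 * Illustrative usage (hypothetical caller, not part of the original
 * file), matching the return convention documented above:
 *
 *	struct ext4_map_blocks map = { .m_lblk = lblk, .m_len = len };
 *	int ret = ext4_ext_map_blocks(handle, inode, &map,
 *				      EXT4_GET_BLOCKS_CREATE);
 *
 * ret > 0: blocks lblk .. lblk + ret - 1 are mapped starting at map.m_pblk;
 * ret == 0: plain lookup found a hole and create was not requested;
 * ret < 0: error.
 */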
4625
4626 void ext4_ext_truncate(handle_t *handle, struct inode *inode)
4627 {
4628         struct super_block *sb = inode->i_sb;
4629         ext4_lblk_t last_block;
4630         int err = 0;
4631
4632         /*
4633          * TODO: optimization is possible here.
4634          * Probably we need not scan at all,
4635          * because page truncation is enough.
4636          */
4637
4638         /* we have to know where to truncate from in the crash case */
4639         EXT4_I(inode)->i_disksize = inode->i_size;
4640         ext4_mark_inode_dirty(handle, inode);
4641
4642         last_block = (inode->i_size + sb->s_blocksize - 1)
4643                         >> EXT4_BLOCK_SIZE_BITS(sb);
4644 retry:
4645         err = ext4_es_remove_extent(inode, last_block,
4646                                     EXT_MAX_BLOCKS - last_block);
4647         if (err == -ENOMEM) {
4648                 cond_resched();
4649                 congestion_wait(BLK_RW_ASYNC, HZ/50);
4650                 goto retry;
4651         }
4652         if (err) {
4653                 ext4_std_error(inode->i_sb, err);
4654                 return;
4655         }
4656         err = ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCKS - 1);
4657         ext4_std_error(inode->i_sb, err);
4658 }
4659
4660 static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset,
4661                                   ext4_lblk_t len, loff_t new_size,
4662                                   int flags, int mode)
4663 {
4664         struct inode *inode = file_inode(file);
4665         handle_t *handle;
4666         int ret = 0;
4667         int ret2 = 0;
4668         int retries = 0;
4669         struct ext4_map_blocks map;
4670         unsigned int credits;
4671         loff_t epos;
4672
4673         map.m_lblk = offset;
4674         map.m_len = len;
4675         /*
4676          * Don't normalize the request if it can fit in one extent so
4677          * that it doesn't get unnecessarily split into multiple
4678          * extents.
4679          */
4680         if (len <= EXT_UNWRITTEN_MAX_LEN)
4681                 flags |= EXT4_GET_BLOCKS_NO_NORMALIZE;
4682
4683         /*
4684          * credits to insert 1 extent into extent tree
4685          */
4686         credits = ext4_chunk_trans_blocks(inode, len);
4687
4688 retry:
4689         while (ret >= 0 && len) {
4690                 handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS,
4691                                             credits);
4692                 if (IS_ERR(handle)) {
4693                         ret = PTR_ERR(handle);
4694                         break;
4695                 }
4696                 ret = ext4_map_blocks(handle, inode, &map, flags);
4697                 if (ret <= 0) {
4698                         ext4_debug("inode #%lu: block %u: len %u: "
4699                                    "ext4_ext_map_blocks returned %d",
4700                                    inode->i_ino, map.m_lblk,
4701                                    map.m_len, ret);
4702                         ext4_mark_inode_dirty(handle, inode);
4703                         ret2 = ext4_journal_stop(handle);
4704                         break;
4705                 }
4706                 map.m_lblk += ret;
4707                 map.m_len = len = len - ret;
4708                 epos = (loff_t)map.m_lblk << inode->i_blkbits;
4709                 inode->i_ctime = ext4_current_time(inode);
4710                 if (new_size) {
4711                         if (epos > new_size)
4712                                 epos = new_size;
4713                         if (ext4_update_inode_size(inode, epos) & 0x1)
4714                                 inode->i_mtime = inode->i_ctime;
4715                 } else {
4716                         if (epos > inode->i_size)
4717                                 ext4_set_inode_flag(inode,
4718                                                     EXT4_INODE_EOFBLOCKS);
4719                 }
4720                 ext4_mark_inode_dirty(handle, inode);
4721                 ret2 = ext4_journal_stop(handle);
4722                 if (ret2)
4723                         break;
4724         }
4725         if (ret == -ENOSPC &&
4726                         ext4_should_retry_alloc(inode->i_sb, &retries)) {
4727                 ret = 0;
4728                 goto retry;
4729         }
4730
4731         return ret > 0 ? ret2 : ret;
4732 }
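/*
 * Descriptive note: both ext4_zero_range() and ext4_fallocate() below
 * drive their allocations through this helper, differing only in the
 * EXT4_GET_BLOCKS_* flags they pass (zero-range additionally requests
 * CONVERT_UNWRITTEN so that already-written blocks in the range are
 * converted back to unwritten).
 */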
4733
4734 static long ext4_zero_range(struct file *file, loff_t offset,
4735                             loff_t len, int mode)
4736 {
4737         struct inode *inode = file_inode(file);
4738         handle_t *handle = NULL;
4739         unsigned int max_blocks;
4740         loff_t new_size = 0;
4741         int ret = 0;
4742         int flags;
4743         int credits;
4744         int partial_begin, partial_end;
4745         loff_t start, end;
4746         ext4_lblk_t lblk;
4747         struct address_space *mapping = inode->i_mapping;
4748         unsigned int blkbits = inode->i_blkbits;
4749
4750         trace_ext4_zero_range(inode, offset, len, mode);
4751
4752         if (!S_ISREG(inode->i_mode))
4753                 return -EINVAL;
4754
4755         /* Call ext4_force_commit to flush all data in case of data=journal. */
4756         if (ext4_should_journal_data(inode)) {
4757                 ret = ext4_force_commit(inode->i_sb);
4758                 if (ret)
4759                         return ret;
4760         }
4761
4762         /*
4763          * Write out all dirty pages to avoid race conditions,
4764          * then release them.
4765          */
4766         if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
4767                 ret = filemap_write_and_wait_range(mapping, offset,
4768                                                    offset + len - 1);
4769                 if (ret)
4770                         return ret;
4771         }
4772
4773         /*
4774          * Round up offset. This is not fallocate, we need to zero out
4775          * blocks, so convert the interior block-aligned part of the range to
4776          * unwritten and possibly manually zero out unaligned parts of the
4777          * range.
4778          */
4779         start = round_up(offset, 1 << blkbits);
4780         end = round_down((offset + len), 1 << blkbits);
4781
4782         if (start < offset || end > offset + len)
4783                 return -EINVAL;
4784         partial_begin = offset & ((1 << blkbits) - 1);
4785         partial_end = (offset + len) & ((1 << blkbits) - 1);
4786
4787         lblk = start >> blkbits;
4788         max_blocks = (end >> blkbits);
4789         if (max_blocks < lblk)
4790                 max_blocks = 0;
4791         else
4792                 max_blocks -= lblk;
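        /*
         * Worked example (a sketch): with blocksize = 4096, offset = 3072
         * and len = 2048, start rounds up to 4096 and end rounds down to
         * 4096, so max_blocks = 0 while partial_begin = 3072 and
         * partial_end = 1024; unless the range extends past EOF (handled
         * just below), only ext4_zero_partial_blocks() then has work to do.
         */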
4793
4794         flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT |
4795                 EXT4_GET_BLOCKS_CONVERT_UNWRITTEN |
4796                 EXT4_EX_NOCACHE;
4797         if (mode & FALLOC_FL_KEEP_SIZE)
4798                 flags |= EXT4_GET_BLOCKS_KEEP_SIZE;
4799
4800         mutex_lock(&inode->i_mutex);
4801
4802         /*
4803          * Indirect files do not support unwritten extents
4804          */
4805         if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
4806                 ret = -EOPNOTSUPP;
4807                 goto out_mutex;
4808         }
4809
4810         if (!(mode & FALLOC_FL_KEEP_SIZE) &&
4811              offset + len > i_size_read(inode)) {
4812                 new_size = offset + len;
4813                 ret = inode_newsize_ok(inode, new_size);
4814                 if (ret)
4815                         goto out_mutex;
4816                 /*
4817                  * If we have a partial block after EOF we have to allocate
4818                  * the entire block.
4819                  */
4820                 if (partial_end)
4821                         max_blocks += 1;
4822         }
4823
4824         if (max_blocks > 0) {
4825
4826                 /* Now release the pages and zero the block-aligned part of pages */
4827                 truncate_pagecache_range(inode, start, end - 1);
4828                 inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
4829
4830                 /* Wait for all existing dio workers; newcomers will block on i_mutex */
4831                 ext4_inode_block_unlocked_dio(inode);
4832                 inode_dio_wait(inode);
4833
4834                 ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size,
4835                                              flags, mode);
4836                 if (ret)
4837                         goto out_dio;
4838                 /*
4839                  * Remove entire range from the extent status tree.
4840                  *
4841                  * ext4_es_remove_extent(inode, lblk, max_blocks) is
4842                  * NOT sufficient.  I'm not sure why this is the case,
4843                  * but let's be conservative and remove the extent
4844                  * status tree for the entire inode.  There should be
4845                  * no outstanding delalloc extents thanks to the
4846                  * filemap_write_and_wait_range() call above.
4847                  */
4848                 ret = ext4_es_remove_extent(inode, 0, EXT_MAX_BLOCKS);
4849                 if (ret)
4850                         goto out_dio;
4851         }
4852         if (!partial_begin && !partial_end)
4853                 goto out_dio;
4854
4855         /*
4856          * In the worst case we have to write out two nonadjacent unwritten
4857          * blocks and update the inode
4858          */
4859         credits = (2 * ext4_ext_index_trans_blocks(inode, 2)) + 1;
4860         if (ext4_should_journal_data(inode))
4861                 credits += 2;
4862         handle = ext4_journal_start(inode, EXT4_HT_MISC, credits);
4863         if (IS_ERR(handle)) {
4864                 ret = PTR_ERR(handle);
4865                 ext4_std_error(inode->i_sb, ret);
4866                 goto out_dio;
4867         }
4868
4869         inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
4870         if (new_size) {
4871                 ext4_update_inode_size(inode, new_size);
4872         } else {
4873                 /*
4874                  * Mark that we allocate beyond EOF so the subsequent truncate
4875                  * can proceed even if the new size is the same as i_size.
4876                  */
4877                 if ((offset + len) > i_size_read(inode))
4878                         ext4_set_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
4879         }
4880         ext4_mark_inode_dirty(handle, inode);
4881
4882         /* Zero out partial block at the edges of the range */
4883         ret = ext4_zero_partial_blocks(handle, inode, offset, len);
4884
4885         if (file->f_flags & O_SYNC)
4886                 ext4_handle_sync(handle);
4887
4888         ext4_journal_stop(handle);
4889 out_dio:
4890         ext4_inode_resume_unlocked_dio(inode);
4891 out_mutex:
4892         mutex_unlock(&inode->i_mutex);
4893         return ret;
4894 }
4895
4896 /*
4897  * Preallocate space for a file. This implements ext4's fallocate file
4898  * operation, which gets called from the sys_fallocate system call.
4899  * For block-mapped files, posix_fallocate should fall back to the method
4900  * of writing zeroes to the required new blocks (the same behavior that is
4901  * expected of file systems which do not support the fallocate() system call).
4902  */
4903 long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
4904 {
4905         struct inode *inode = file_inode(file);
4906         loff_t new_size = 0;
4907         unsigned int max_blocks;
4908         int ret = 0;
4909         int flags;
4910         ext4_lblk_t lblk;
4911         unsigned int blkbits = inode->i_blkbits;
4912
4913         /* Return error if mode is not supported */
4914         if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
4915                      FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE))
4916                 return -EOPNOTSUPP;
4917
4918         if (mode & FALLOC_FL_PUNCH_HOLE)
4919                 return ext4_punch_hole(inode, offset, len);
4920
4921         ret = ext4_convert_inline_data(inode);
4922         if (ret)
4923                 return ret;
4924
4925         /*
4926          * currently supporting (pre)allocate mode for extent-based
4927          * files _only_
4928          */
4929         if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
4930                 return -EOPNOTSUPP;
4931
4932         if (mode & FALLOC_FL_COLLAPSE_RANGE)
4933                 return ext4_collapse_range(inode, offset, len);
4934
4935         if (mode & FALLOC_FL_ZERO_RANGE)
4936                 return ext4_zero_range(file, offset, len, mode);
4937
4938         trace_ext4_fallocate_enter(inode, offset, len, mode);
4939         lblk = offset >> blkbits;
4940         /*
4941          * We can't just convert len to max_blocks: with blocksize = 4096,
4942          * offset = 3072 and len = 2048, the request spans two blocks, not 0
4943          */
4944         max_blocks = (EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits)
4945                 - lblk;
4946
4947         flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT;
4948         if (mode & FALLOC_FL_KEEP_SIZE)
4949                 flags |= EXT4_GET_BLOCKS_KEEP_SIZE;
4950
4951         mutex_lock(&inode->i_mutex);
4952
4953         if (!(mode & FALLOC_FL_KEEP_SIZE) &&
4954              offset + len > i_size_read(inode)) {
4955                 new_size = offset + len;
4956                 ret = inode_newsize_ok(inode, new_size);
4957                 if (ret)
4958                         goto out;
4959         }
4960
4961         ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size,
4962                                      flags, mode);
4963         if (ret)
4964                 goto out;
4965
4966         if (file->f_flags & O_SYNC && EXT4_SB(inode->i_sb)->s_journal) {
4967                 ret = jbd2_complete_transaction(EXT4_SB(inode->i_sb)->s_journal,
4968                                                 EXT4_I(inode)->i_sync_tid);
4969         }
4970 out:
4971         mutex_unlock(&inode->i_mutex);
4972         trace_ext4_fallocate_exit(inode, offset, max_blocks, ret);
4973         return ret;
4974 }
4975
4976 /*
4977  * This function converts a range of blocks to written extents.
4978  * The caller of this function will pass the start offset and the size;
4979  * all unwritten extents within this range will be converted to
4980  * written extents.
4981  *
4982  * This function is called from the direct IO end_io callback
4983  * function, to convert the fallocated extents after IO is completed.
4984  * Returns 0 on success.
4985  */
4986 int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode,
4987                                    loff_t offset, ssize_t len)
4988 {
4989         unsigned int max_blocks;
4990         int ret = 0;
4991         int ret2 = 0;
4992         struct ext4_map_blocks map;
4993         unsigned int credits, blkbits = inode->i_blkbits;
4994
4995         map.m_lblk = offset >> blkbits;
4996         /*
4997          * We can't just convert len to max_blocks: with blocksize = 4096,
4998          * offset = 3072 and len = 2048, the request spans two blocks, not 0
4999          */
5000         max_blocks = ((EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits) -
5001                       map.m_lblk);
5002         /*
5003          * This is somewhat ugly but the idea is clear: when a transaction is
5004          * reserved, everything goes into it. Otherwise we prefer to start
5005          * several smaller transactions, converting each extent separately.
5006          */
5007         if (handle) {
5008                 handle = ext4_journal_start_reserved(handle,
5009                                                      EXT4_HT_EXT_CONVERT);
5010                 if (IS_ERR(handle))
5011                         return PTR_ERR(handle);
5012                 credits = 0;
5013         } else {
5014                 /*
5015                  * credits to insert 1 extent into extent tree
5016                  */
5017                 credits = ext4_chunk_trans_blocks(inode, max_blocks);
5018         }
5019         while (ret >= 0 && ret < max_blocks) {
5020                 map.m_lblk += ret;
5021                 map.m_len = (max_blocks -= ret);
5022                 if (credits) {
5023                         handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS,
5024                                                     credits);
5025                         if (IS_ERR(handle)) {
5026                                 ret = PTR_ERR(handle);
5027                                 break;
5028                         }
5029                 }
5030                 ret = ext4_map_blocks(handle, inode, &map,
5031                                       EXT4_GET_BLOCKS_IO_CONVERT_EXT);
5032                 if (ret <= 0)
5033                         ext4_warning(inode->i_sb,
5034                                      "inode #%lu: block %u: len %u: "
5035                                      "ext4_ext_map_blocks returned %d",
5036                                      inode->i_ino, map.m_lblk,
5037                                      map.m_len, ret);
5038                 ext4_mark_inode_dirty(handle, inode);
5039                 if (credits)
5040                         ret2 = ext4_journal_stop(handle);
5041                 if (ret <= 0 || ret2)
5042                         break;
5043         }
5044         if (!credits)
5045                 ret2 = ext4_journal_stop(handle);
5046         return ret > 0 ? ret2 : ret;
5047 }
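/*
 * Descriptive note on the loop above: with a reserved handle, credits
 * stays 0 and every extent conversion shares that single transaction;
 * otherwise each pass starts and stops its own small transaction, sized
 * by ext4_chunk_trans_blocks() for a single extent insertion.
 */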
5048
5049 /*
5050  * If newes is not an existing extent (newes->es_pblk equals zero), find
5051  * the delayed extent at the start of newes, update newes accordingly, and
5052  * return the start of the next delayed extent.
5053  *
5054  * If newes is an existing extent (newes->es_pblk is not equal to zero),
5055  * return the start of the next delayed extent, or EXT_MAX_BLOCKS if no
5056  * delayed extent is found. Leave newes unmodified.
5057  */
5058 static int ext4_find_delayed_extent(struct inode *inode,
5059                                     struct extent_status *newes)
5060 {
5061         struct extent_status es;
5062         ext4_lblk_t block, next_del;
5063
5064         if (newes->es_pblk == 0) {
5065                 ext4_es_find_delayed_extent_range(inode, newes->es_lblk,
5066                                 newes->es_lblk + newes->es_len - 1, &es);
5067
5068                 /*
5069                  * No extent in the extent tree contains the queried block,
5070                  * so it must lie in either 1) a hole or 2) a delayed extent.
5071                  */
5072                 if (es.es_len == 0)
5073                         /* A hole found. */
5074                         return 0;
5075
5076                 if (es.es_lblk > newes->es_lblk) {
5077                         /* A hole found. */
5078                         newes->es_len = min(es.es_lblk - newes->es_lblk,
5079                                             newes->es_len);
5080                         return 0;
5081                 }
5082
5083                 newes->es_len = es.es_lblk + es.es_len - newes->es_lblk;
5084         }
5085
5086         block = newes->es_lblk + newes->es_len;
5087         ext4_es_find_delayed_extent_range(inode, block, EXT_MAX_BLOCKS, &es);
5088         if (es.es_len == 0)
5089                 next_del = EXT_MAX_BLOCKS;
5090         else
5091                 next_del = es.es_lblk;
5092
5093         return next_del;
5094 }
5095 /* fiemap flags we can handle are specified here */
5096 #define EXT4_FIEMAP_FLAGS       (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR)
5097
5098 static int ext4_xattr_fiemap(struct inode *inode,
5099                                 struct fiemap_extent_info *fieinfo)
5100 {
5101         __u64 physical = 0;
5102         __u64 length;
5103         __u32 flags = FIEMAP_EXTENT_LAST;
5104         int blockbits = inode->i_sb->s_blocksize_bits;
5105         int error = 0;
5106
5107         /* in-inode? */
5108         if (ext4_test_inode_state(inode, EXT4_STATE_XATTR)) {
5109                 struct ext4_iloc iloc;
5110                 int offset;     /* offset of xattr in inode */
5111
5112                 error = ext4_get_inode_loc(inode, &iloc);
5113                 if (error)
5114                         return error;
5115                 physical = (__u64)iloc.bh->b_blocknr << blockbits;
5116                 offset = EXT4_GOOD_OLD_INODE_SIZE +
5117                                 EXT4_I(inode)->i_extra_isize;
5118                 physical += offset;
5119                 length = EXT4_SB(inode->i_sb)->s_inode_size - offset;
5120                 flags |= FIEMAP_EXTENT_DATA_INLINE;
5121                 brelse(iloc.bh);
5122         } else { /* external block */
5123                 physical = (__u64)EXT4_I(inode)->i_file_acl << blockbits;
5124                 length = inode->i_sb->s_blocksize;
5125         }
5126
5127         if (physical)
5128                 error = fiemap_fill_next_extent(fieinfo, 0, physical,
5129                                                 length, flags);
5130         return (error < 0 ? error : 0);
5131 }
5132
5133 int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
5134                 __u64 start, __u64 len)
5135 {
5136         ext4_lblk_t start_blk;
5137         int error = 0;
5138
5139         if (ext4_has_inline_data(inode)) {
5140                 int has_inline = 1;
5141
5142                 error = ext4_inline_data_fiemap(inode, fieinfo, &has_inline);
5143
5144                 if (has_inline)
5145                         return error;
5146         }
5147
5148         if (fieinfo->fi_flags & FIEMAP_FLAG_CACHE) {
5149                 error = ext4_ext_precache(inode);
5150                 if (error)
5151                         return error;
5152         }
5153
5154         /* fallback to generic here if not in extents fmt */
5155         if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
5156                 return generic_block_fiemap(inode, fieinfo, start, len,
5157                         ext4_get_block);
5158
5159         if (fiemap_check_flags(fieinfo, EXT4_FIEMAP_FLAGS))
5160                 return -EBADR;
5161
5162         if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) {
5163                 error = ext4_xattr_fiemap(inode, fieinfo);
5164         } else {
5165                 ext4_lblk_t len_blks;
5166                 __u64 last_blk;
5167
5168                 start_blk = start >> inode->i_sb->s_blocksize_bits;
5169                 last_blk = (start + len - 1) >> inode->i_sb->s_blocksize_bits;
5170                 if (last_blk >= EXT_MAX_BLOCKS)
5171                         last_blk = EXT_MAX_BLOCKS-1;
5172                 len_blks = ((ext4_lblk_t) last_blk) - start_blk + 1;
5173
5174                 /*
5175                  * Walk the extent tree gathering extent information
5176                  * and pushing extents back to the user.
5177                  */
5178                 error = ext4_fill_fiemap_extents(inode, start_blk,
5179                                                  len_blks, fieinfo);
5180         }
5181         ext4_es_lru_add(inode);
5182         return error;
5183 }
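/*
 * Illustrative userspace view (a sketch, not part of the original file):
 * this handler ultimately services ioctl(fd, FS_IOC_FIEMAP, &fm), where
 * struct fiemap's fm_start/fm_length become the start/len arguments and
 * each mapping is reported back through fiemap_fill_next_extent().
 */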
5184
5185 /*
5186  * ext4_access_path:
5187  * Function to access the path buffer for marking it dirty.
5188  * It also checks if there are sufficient credits left in the journal handle
5189  * to update the path.
5190  */
5191 static int
5192 ext4_access_path(handle_t *handle, struct inode *inode,
5193                 struct ext4_ext_path *path)
5194 {
5195         int credits, err;
5196
5197         if (!ext4_handle_valid(handle))
5198                 return 0;
5199
5200         /*
5201          * Check if we need to extend the journal credits:
5202          * 3 for the leaf, sb, and inode, plus 2 (bitmap and group
5203          * descriptor) for each block group; assume two block
5204          * groups
5205          */
5206         if (handle->h_buffer_credits < 7) {
5207                 credits = ext4_writepage_trans_blocks(inode);
5208                 err = ext4_ext_truncate_extend_restart(handle, inode, credits);
5209                 /* EAGAIN is success */
5210                 if (err && err != -EAGAIN)
5211                         return err;
5212         }
5213
5214         err = ext4_ext_get_access(handle, inode, path);
5215         return err;
5216 }
5217
5218 /*
5219  * ext4_ext_shift_path_extents:
5220  * Shift the extents of a path structure lying between path[depth].p_ext
5221  * and EXT_LAST_EXTENT(path[depth].p_hdr) downwards, by subtracting shift
5222  * from starting block for each extent.
5223  */
5224 static int
5225 ext4_ext_shift_path_extents(struct ext4_ext_path *path, ext4_lblk_t shift,
5226                             struct inode *inode, handle_t *handle,
5227                             ext4_lblk_t *start)
5228 {
5229         int depth, err = 0;
5230         struct ext4_extent *ex_start, *ex_last;
5231         bool update = false;
5232         depth = path->p_depth;
5233
5234         while (depth >= 0) {
5235                 if (depth == path->p_depth) {
5236                         ex_start = path[depth].p_ext;
5237                         if (!ex_start)
5238                                 return -EIO;
5239
5240                         ex_last = EXT_LAST_EXTENT(path[depth].p_hdr);
5241                         if (!ex_last)
5242                                 return -EIO;
5243
5244                         err = ext4_access_path(handle, inode, path + depth);
5245                         if (err)
5246                                 goto out;
5247
5248                         if (ex_start == EXT_FIRST_EXTENT(path[depth].p_hdr))
5249                                 update = 1;
5250
5251                         *start = le32_to_cpu(ex_last->ee_block) +
5252                                 ext4_ext_get_actual_len(ex_last);
5253
5254                         while (ex_start <= ex_last) {
5255                                 le32_add_cpu(&ex_start->ee_block, -shift);
5256                                 /* Try to merge to the left. */
5257                                 if ((ex_start >
5258                                      EXT_FIRST_EXTENT(path[depth].p_hdr)) &&
5259                                     ext4_ext_try_to_merge_right(inode,
5260                                                         path, ex_start - 1))
5261                                         ex_last--;
5262                                 else
5263                                         ex_start++;
5264                         }
5265                         err = ext4_ext_dirty(handle, inode, path + depth);
5266                         if (err)
5267                                 goto out;
5268
5269                         if (--depth < 0 || !update)
5270                                 break;
5271                 }
5272
5273                 /* Update index too */
5274                 err = ext4_access_path(handle, inode, path + depth);
5275                 if (err)
5276                         goto out;
5277
5278                 le32_add_cpu(&path[depth].p_idx->ei_block, -shift);
5279                 err = ext4_ext_dirty(handle, inode, path + depth);
5280                 if (err)
5281                         goto out;
5282
5283                 /* we are done if current index is not a starting index */
5284                 if (path[depth].p_idx != EXT_FIRST_INDEX(path[depth].p_hdr))
5285                         break;
5286
5287                 depth--;
5288         }
5289
5290 out:
5291         return err;
5292 }
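/*
 * Editor's note: an illustrative, self-contained model (hypothetical
 * helper, not kernel code) of the shift-and-merge loop above. The real
 * ext4_ext_try_to_merge_right() also requires physical contiguity and a
 * matching unwritten flag; this model only checks logical adjacency.
 * Example: extents {10,5} and {15,5} shifted by 10 become {0,5} and
 * {5,5}, which then merge into {0,10}.
 */
#if 0
struct toy_extent {
        unsigned int block;     /* logical start block */
        unsigned int len;       /* length in blocks */
};

static int toy_shift_and_merge(struct toy_extent *ex, int n,
                               unsigned int shift)
{
        int i, out = 0;

        for (i = 0; i < n; i++) {
                ex[i].block -= shift;   /* le32_add_cpu(&ee_block, -shift) */
                if (out && ex[out - 1].block + ex[out - 1].len == ex[i].block)
                        ex[out - 1].len += ex[i].len;   /* merge to the left */
                else
                        ex[out++] = ex[i];
        }
        return out;     /* number of extents left after merging */
}
#endif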
5293
5294 /*
5295  * ext4_ext_shift_extents:
5296  * All extents lying in the range from @start to the last allocated block
5297  * of the file are shifted downwards by @shift blocks.
5298  * Returns 0 on success, an error code otherwise.
5299  */
5300 static int
5301 ext4_ext_shift_extents(struct inode *inode, handle_t *handle,
5302                        ext4_lblk_t start, ext4_lblk_t shift)
5303 {
5304         struct ext4_ext_path *path;
5305         int ret = 0, depth;
5306         struct ext4_extent *extent;
5307         ext4_lblk_t stop_block;
5308         ext4_lblk_t ex_start, ex_end;
5309
5310         /* Let path point to the last extent */
5311         path = ext4_ext_find_extent(inode, EXT_MAX_BLOCKS - 1, NULL, 0);
5312         if (IS_ERR(path))
5313                 return PTR_ERR(path);
5314
5315         depth = path->p_depth;
5316         extent = path[depth].p_ext;
5317         if (!extent) {
5318                 ext4_ext_drop_refs(path);
5319                 kfree(path);
5320                 return ret;
5321         }
5322
5323         stop_block = le32_to_cpu(extent->ee_block) +
5324                         ext4_ext_get_actual_len(extent);
5325         ext4_ext_drop_refs(path);
5326         kfree(path);
5327
5328         /* Nothing to shift if the hole is at the end of the file */
5329         if (start >= stop_block)
5330                 return ret;
5331
5332         /*
5333          * Don't start shifting extents until we make sure the hole is big
5334          * enough to accommodate the shift.
5335          */
5336         path = ext4_ext_find_extent(inode, start - 1, NULL, 0);
5337         if (IS_ERR(path))
5338                 return PTR_ERR(path);
5339         depth = path->p_depth;
5340         extent =  path[depth].p_ext;
5341         if (extent) {
5342                 ex_start = le32_to_cpu(extent->ee_block);
5343                 ex_end = le32_to_cpu(extent->ee_block) +
5344                         ext4_ext_get_actual_len(extent);
5345         } else {
5346                 ex_start = 0;
5347                 ex_end = 0;
5348         }
5349         ext4_ext_drop_refs(path);
5350         kfree(path);
5351
5352         if ((start == ex_start && shift > ex_start) ||
5353             (shift > start - ex_end))
5354                 return -EINVAL;
5355
5356         /* It's safe to start updating extents */
5357         while (start < stop_block) {
5358                 path = ext4_ext_find_extent(inode, start, NULL, 0);
5359                 if (IS_ERR(path))
5360                         return PTR_ERR(path);
5361                 depth = path->p_depth;
5362                 extent = path[depth].p_ext;
5363                 if (!extent) {
5364                         EXT4_ERROR_INODE(inode, "unexpected hole at %lu",
5365                                          (unsigned long) start);
                             /* free the path before bailing out */
                             ext4_ext_drop_refs(path);
                             kfree(path);
5366                         return -EIO;
5367                 }
5368                 if (start > le32_to_cpu(extent->ee_block)) {
5369                         /* Hole, move to the next extent */
5370                         if (extent < EXT_LAST_EXTENT(path[depth].p_hdr)) {
5371                                 path[depth].p_ext++;
5372                         } else {
5373                                 start = ext4_ext_next_allocated_block(path);
5374                                 ext4_ext_drop_refs(path);
5375                                 kfree(path);
5376                                 continue;
5377                         }
5378                 }
5379                 ret = ext4_ext_shift_path_extents(path, shift, inode,
5380                                 handle, &start);
5381                 ext4_ext_drop_refs(path);
5382                 kfree(path);
5383                 if (ret)
5384                         break;
5385         }
5386
5387         return ret;
5388 }
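/*
 * Editor's note: a worked restatement (hypothetical helper) of the
 * -EINVAL guard above. The hole below @start must be at least @shift
 * blocks wide, or the shifted extents would collide with the extent
 * preceding the hole. For example, with the preceding extent covering
 * blocks 100..109 (so ex_end == 110) and start == 120, the largest
 * legal shift is 120 - 110 == 10.
 */
#if 0
static int toy_shift_fits(unsigned int start, unsigned int shift,
                          unsigned int ex_start, unsigned int ex_end)
{
        if ((start == ex_start && shift > ex_start) ||
            (shift > start - ex_end))
                return 0;       /* the kernel returns -EINVAL here */
        return 1;
}
#endif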
5389
5390 /*
5391  * ext4_collapse_range:
5392  * This implements fallocate's FALLOC_FL_COLLAPSE_RANGE functionality
5393  * for ext4. Returns 0 on success, non-zero on error.
5394  */
5395 int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
5396 {
5397         struct super_block *sb = inode->i_sb;
5398         ext4_lblk_t punch_start, punch_stop;
5399         handle_t *handle;
5400         unsigned int credits;
5401         loff_t new_size, ioffset;
5402         int ret;
5403
5404         /* Collapse range works only on fs cluster (block) size aligned offsets. */
5405         if (offset & (EXT4_CLUSTER_SIZE(sb) - 1) ||
5406             len & (EXT4_CLUSTER_SIZE(sb) - 1))
5407                 return -EINVAL;
5408
5409         if (!S_ISREG(inode->i_mode))
5410                 return -EINVAL;
5411
5412         trace_ext4_collapse_range(inode, offset, len);
5413
5414         punch_start = offset >> EXT4_BLOCK_SIZE_BITS(sb);
5415         punch_stop = (offset + len) >> EXT4_BLOCK_SIZE_BITS(sb);
5416
5417         /* Call ext4_force_commit to flush all data in case of data=journal. */
5418         if (ext4_should_journal_data(inode)) {
5419                 ret = ext4_force_commit(inode->i_sb);
5420                 if (ret)
5421                         return ret;
5422         }
5423
5424         /*
5425          * Need to round down offset to be aligned with page size boundary
5426          * for page size > block size.
5427          */
5428         ioffset = round_down(offset, PAGE_SIZE);
5429
5430         /* Write out all dirty pages */
5431         ret = filemap_write_and_wait_range(inode->i_mapping, ioffset,
5432                                            LLONG_MAX);
5433         if (ret)
5434                 return ret;
5435
5436         /* Take mutex lock */
5437         mutex_lock(&inode->i_mutex);
5438
5439         /*
5440          * The collapse range must not overlap or extend past EOF; that
5441          * would effectively be a truncate operation, so reject it.
5442          */
5443         if (offset + len >= i_size_read(inode)) {
5444                 ret = -EINVAL;
5445                 goto out_mutex;
5446         }
5447
5448         /* Currently just for extent based files */
5449         if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
5450                 ret = -EOPNOTSUPP;
5451                 goto out_mutex;
5452         }
5453
5454         truncate_pagecache(inode, ioffset);
5455
5456         /* Wait for existing dio to complete */
5457         ext4_inode_block_unlocked_dio(inode);
5458         inode_dio_wait(inode);
5459
5460         credits = ext4_writepage_trans_blocks(inode);
5461         handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
5462         if (IS_ERR(handle)) {
5463                 ret = PTR_ERR(handle);
5464                 goto out_dio;
5465         }
5466
5467         down_write(&EXT4_I(inode)->i_data_sem);
5468         ext4_discard_preallocations(inode);
5469
5470         ret = ext4_es_remove_extent(inode, punch_start,
5471                                     EXT_MAX_BLOCKS - punch_start);
5472         if (ret) {
5473                 up_write(&EXT4_I(inode)->i_data_sem);
5474                 goto out_stop;
5475         }
5476
5477         ret = ext4_ext_remove_space(inode, punch_start, punch_stop - 1);
5478         if (ret) {
5479                 up_write(&EXT4_I(inode)->i_data_sem);
5480                 goto out_stop;
5481         }
5482         ext4_discard_preallocations(inode);
5483
5484         ret = ext4_ext_shift_extents(inode, handle, punch_stop,
5485                                      punch_stop - punch_start);
5486         if (ret) {
5487                 up_write(&EXT4_I(inode)->i_data_sem);
5488                 goto out_stop;
5489         }
5490
5491         new_size = i_size_read(inode) - len;
5492         i_size_write(inode, new_size);
5493         EXT4_I(inode)->i_disksize = new_size;
5494
5495         up_write(&EXT4_I(inode)->i_data_sem);
5496         if (IS_SYNC(inode))
5497                 ext4_handle_sync(handle);
5498         inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
5499         ext4_mark_inode_dirty(handle, inode);
5500
5501 out_stop:
5502         ext4_journal_stop(handle);
5503 out_dio:
5504         ext4_inode_resume_unlocked_dio(inode);
5505 out_mutex:
5506         mutex_unlock(&inode->i_mutex);
5507         return ret;
5508 }
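/*
 * Editor's note: a minimal, hypothetical userspace sketch (not part of
 * extents.c) of reaching ext4_collapse_range() through fallocate(2).
 * Offset and length must be block (cluster) size aligned and the range
 * must end before EOF; the numbers below assume a 4096-byte block size.
 */
#if 0
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <linux/falloc.h>       /* FALLOC_FL_COLLAPSE_RANGE */

int collapse_one_block(const char *name)
{
        int ret = -1;
        int fd = open(name, O_RDWR);

        if (fd < 0)
                return -1;
        /* Remove bytes [4096, 8192) and shift the rest of the file down. */
        if (fallocate(fd, FALLOC_FL_COLLAPSE_RANGE, 4096, 4096) == 0)
                ret = 0;
        else
                perror("fallocate");
        close(fd);
        return ret;
}
#endif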
5509
5510 /**
5511  * ext4_swap_extents - Swap extents between two inodes
5512  *
5513  * @inode1:     First inode
5514  * @inode2:     Second inode
5515  * @lblk1:      Start block for first inode
5516  * @lblk2:      Start block for second inode
5517  * @count:      Number of blocks to swap
5518  * @unwritten:  Mark second inode's extents as unwritten after swap
5519  * @erp:        Pointer to save error value
5520  *
5521  * This helper routine does exactly what its name promises: swap extents.
5522  * Everything else, such as page-cache locking consistency, bh mapping
5523  * consistency, or copying the extents' data, must be performed by the caller.
5524  * Locking:
5525  *              i_mutex is held for both inodes
5526  *              i_data_sem is locked for write for both inodes
5527  * Assumptions:
5528  *              All pages from requested range are locked for both inodes
5529  */
5530 int
5531 ext4_swap_extents(handle_t *handle, struct inode *inode1,
5532                      struct inode *inode2, ext4_lblk_t lblk1, ext4_lblk_t lblk2,
5533                   ext4_lblk_t count, int unwritten, int *erp)
5534 {
5535         struct ext4_ext_path *path1 = NULL;
5536         struct ext4_ext_path *path2 = NULL;
5537         int replaced_count = 0;
5538
5539         BUG_ON(!rwsem_is_locked(&EXT4_I(inode1)->i_data_sem));
5540         BUG_ON(!rwsem_is_locked(&EXT4_I(inode2)->i_data_sem));
5541         BUG_ON(!mutex_is_locked(&inode1->i_mutex));
5542         BUG_ON(!mutex_is_locked(&inode2->i_mutex));
5543
5544         *erp = ext4_es_remove_extent(inode1, lblk1, count);
5545         if (unlikely(*erp))
5546                 return 0;
5547         *erp = ext4_es_remove_extent(inode2, lblk2, count);
5548         if (unlikely(*erp))
5549                 return 0;
5550
5551         while (count) {
5552                 struct ext4_extent *ex1, *ex2, tmp_ex;
5553                 ext4_lblk_t e1_blk, e2_blk;
5554                 int e1_len, e2_len, len;
5555                 int split = 0;
5556
5557                 path1 = ext4_ext_find_extent(inode1, lblk1, NULL, EXT4_EX_NOCACHE);
5558                 if (unlikely(IS_ERR(path1))) {
5559                         *erp = PTR_ERR(path1);
5560                         path1 = NULL;
5561                 finish:
5562                         count = 0;
5563                         goto repeat;
5564                 }
5565                 path2 = ext4_ext_find_extent(inode2, lblk2, NULL, EXT4_EX_NOCACHE);
5566                 if (unlikely(IS_ERR(path2))) {
5567                         *erp = PTR_ERR(path2);
5568                         path2 = NULL;
5569                         goto finish;
5570                 }
5571                 ex1 = path1[path1->p_depth].p_ext;
5572                 ex2 = path2[path2->p_depth].p_ext;
5573                 /* Do we have something to swap? */
5574                 if (unlikely(!ex2 || !ex1))
5575                         goto finish;
5576
5577                 e1_blk = le32_to_cpu(ex1->ee_block);
5578                 e2_blk = le32_to_cpu(ex2->ee_block);
5579                 e1_len = ext4_ext_get_actual_len(ex1);
5580                 e2_len = ext4_ext_get_actual_len(ex2);
5581
5582                 /* Hole handling */
5583                 if (!in_range(lblk1, e1_blk, e1_len) ||
5584                     !in_range(lblk2, e2_blk, e2_len)) {
5585                         ext4_lblk_t next1, next2;
5586
5587                         /* if hole after extent, then go to next extent */
5588                         next1 = ext4_ext_next_allocated_block(path1);
5589                         next2 = ext4_ext_next_allocated_block(path2);
5590                         /* If hole before extent, then shift to that extent */
5591                         if (e1_blk > lblk1)
5592                                 next1 = e1_blk;
5593                         if (e2_blk > lblk2)
5594                                 next2 = e2_blk;
5595                         /* Do we have something to swap */
5596                         if (next1 == EXT_MAX_BLOCKS || next2 == EXT_MAX_BLOCKS)
5597                                 goto finish;
5598                         /* Advance to the farther of the two next boundaries */
5599                         len = next1 - lblk1;
5600                         if (len < next2 - lblk2)
5601                                 len = next2 - lblk2;
5602                         if (len > count)
5603                                 len = count;
5604                         lblk1 += len;
5605                         lblk2 += len;
5606                         count -= len;
5607                         goto repeat;
5608                 }
5609
5610                 /* Prepare left boundary */
5611                 if (e1_blk < lblk1) {
5612                         split = 1;
5613                         *erp = ext4_force_split_extent_at(handle, inode1,
5614                                                 path1, lblk1, 0);
5615                         if (unlikely(*erp))
5616                                 goto finish;
5617                 }
5618                 if (e2_blk < lblk2) {
5619                         split = 1;
5620                         *erp = ext4_force_split_extent_at(handle, inode2,
5621                                                 path2,  lblk2, 0);
5622                         if (unlikely(*erp))
5623                                 goto finish;
5624                 }
5625                 /* ext4_force_split_extent_at() may result in a leaf extent
5626                  * split; the path must be revalidated. */
5627                 if (split)
5628                         goto repeat;
5629
5630                 /* Prepare right boundary */
5631                 len = count;
5632                 if (len > e1_blk + e1_len - lblk1)
5633                         len = e1_blk + e1_len - lblk1;
5634                 if (len > e2_blk + e2_len - lblk2)
5635                         len = e2_blk + e2_len - lblk2;
5636
5637                 if (len != e1_len) {
5638                         split = 1;
5639                         *erp = ext4_force_split_extent_at(handle, inode1,
5640                                                 path1, lblk1 + len, 0);
5641                         if (unlikely(*erp))
5642                                 goto finish;
5643                 }
5644                 if (len != e2_len) {
5645                         split = 1;
5646                         *erp = ext4_force_split_extent_at(handle, inode2,
5647                                                 path2, lblk2 + len, 0);
5648                         if (*erp)
5649                                 goto finish;
5650                 }
5651                 /* ext4_force_split_extent_at() may result in a leaf extent
5652                  * split; the path must be revalidated. */
5653                 if (split)
5654                         goto repeat;
5655
5656                 BUG_ON(e2_len != e1_len);
5657                 *erp = ext4_ext_get_access(handle, inode1, path1 + path1->p_depth);
5658                 if (unlikely(*erp))
5659                         goto finish;
5660                 *erp = ext4_ext_get_access(handle, inode2, path2 + path2->p_depth);
5661                 if (unlikely(*erp))
5662                         goto finish;
5663
5664                 /* Both extents are fully inside the boundaries. Swap them now. */
5665                 tmp_ex = *ex1;
5666                 ext4_ext_store_pblock(ex1, ext4_ext_pblock(ex2));
5667                 ext4_ext_store_pblock(ex2, ext4_ext_pblock(&tmp_ex));
5668                 ex1->ee_len = cpu_to_le16(e2_len);
5669                 ex2->ee_len = cpu_to_le16(e1_len);
5670                 if (unwritten)
5671                         ext4_ext_mark_unwritten(ex2);
5672                 if (ext4_ext_is_unwritten(&tmp_ex))
5673                         ext4_ext_mark_unwritten(ex1);
5674
5675                 ext4_ext_try_to_merge(handle, inode2, path2, ex2);
5676                 ext4_ext_try_to_merge(handle, inode1, path1, ex1);
5677                 *erp = ext4_ext_dirty(handle, inode2, path2 +
5678                                       path2->p_depth);
5679                 if (unlikely(*erp))
5680                         goto finish;
5681                 *erp = ext4_ext_dirty(handle, inode1, path1 +
5682                                       path1->p_depth);
5683                 /*
5684                  * Looks scary, no? The second inode already points to the new
5685                  * blocks and was successfully dirtied, but luckily an error here
5686                  * can only be caused by a journal error, in which case the whole
5687                  * transaction will be aborted anyway.
5688                  */
5689                 if (unlikely(*erp))
5690                         goto finish;
5691                 lblk1 += len;
5692                 lblk2 += len;
5693                 replaced_count += len;
5694                 count -= len;
5695
5696         repeat:
5697                 if (path1) {
5698                         ext4_ext_drop_refs(path1);
5699                         kfree(path1);
5700                         path1 = NULL;
5701                 }
5702                 if (path2) {
5703                         ext4_ext_drop_refs(path2);
5704                         kfree(path2);
5705                         path2 = NULL;
5706                 }
5707         }
5708         return replaced_count;
5709 }
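/*
 * Editor's note: in this tree ext4_swap_extents() is driven by
 * ext4_move_extents(), i.e. the EXT4_IOC_MOVE_EXT ioctl that e4defrag
 * uses. Below is a hypothetical userspace sketch; the structure layout
 * and ioctl number are copied from the ext4 headers of this era
 * (userspace normally carries its own copy), and all offsets are in
 * file logical blocks.
 */
#if 0
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/types.h>

struct move_extent {
        __u32 reserved;         /* must be zero */
        __u32 donor_fd;         /* donor file descriptor */
        __u64 orig_start;       /* logical start of the original file */
        __u64 donor_start;      /* logical start of the donor file */
        __u64 len;              /* length to move, in blocks */
        __u64 moved_len;        /* out: blocks actually moved */
};
#define EXT4_IOC_MOVE_EXT       _IOWR('f', 15, struct move_extent)

int move_blocks(int orig_fd, int donor_fd, __u64 start, __u64 len)
{
        struct move_extent me = {
                .donor_fd    = (__u32)donor_fd,
                .orig_start  = start,
                .donor_start = start,
                .len         = len,
        };

        if (ioctl(orig_fd, EXT4_IOC_MOVE_EXT, &me) < 0)
                return -1;
        printf("moved %llu blocks\n", (unsigned long long)me.moved_len);
        return 0;
}
#endif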