Btrfs: Fix uninitialized root flags for subvolumes
[firefly-linux-kernel-4.4.55.git] / fs / ocfs2 / suballoc.c
1 /* -*- mode: c; c-basic-offset: 8; -*-
2  * vim: noexpandtab sw=8 ts=8 sts=0:
3  *
4  * suballoc.c
5  *
6  * metadata alloc and free
7  * Inspired by ext3 block groups.
8  *
9  * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
10  *
11  * This program is free software; you can redistribute it and/or
12  * modify it under the terms of the GNU General Public
13  * License as published by the Free Software Foundation; either
14  * version 2 of the License, or (at your option) any later version.
15  *
16  * This program is distributed in the hope that it will be useful,
17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
19  * General Public License for more details.
20  *
21  * You should have received a copy of the GNU General Public
22  * License along with this program; if not, write to the
23  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
24  * Boston, MA 02111-1307, USA.
25  */
26
27 #include <linux/fs.h>
28 #include <linux/types.h>
29 #include <linux/slab.h>
30 #include <linux/highmem.h>
31
32 #define MLOG_MASK_PREFIX ML_DISK_ALLOC
33 #include <cluster/masklog.h>
34
35 #include "ocfs2.h"
36
37 #include "alloc.h"
38 #include "blockcheck.h"
39 #include "dlmglue.h"
40 #include "inode.h"
41 #include "journal.h"
42 #include "localalloc.h"
43 #include "suballoc.h"
44 #include "super.h"
45 #include "sysfile.h"
46 #include "uptodate.h"
47
48 #include "buffer_head_io.h"
49
/*
 * Values for the 'flags' argument of ocfs2_reserve_suballoc_bits(), which
 * forwards them to ocfs2_block_group_alloc().
 *
 * ALLOC_NEW_GROUP: when the allocator does not have enough free bits,
 * ocfs2_reserve_suballoc_bits() may add a new block group; without it the
 * reservation fails with -ENOSPC.  NOT_ALLOC_NEW_GROUP is the explicit
 * "no flags" value used by the inode stealing path.
 *
 * ALLOC_GROUPS_FROM_GLOBAL is handed through to
 * ocfs2_reserve_clusters_with_limit(); its exact effect is decided there.
 */
#define NOT_ALLOC_NEW_GROUP             0
#define ALLOC_NEW_GROUP                 0x1
#define ALLOC_GROUPS_FROM_GLOBAL        0x2

/*
 * ocfs2_reserve_new_inode() keeps stealing from other slots only while
 * osb->s_num_inodes_stolen is below this value.
 */
#define OCFS2_MAX_INODES_TO_STEAL       1024
55
56 static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg);
57 static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe);
58 static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl);
59 static int ocfs2_block_group_fill(handle_t *handle,
60                                   struct inode *alloc_inode,
61                                   struct buffer_head *bg_bh,
62                                   u64 group_blkno,
63                                   u16 my_chain,
64                                   struct ocfs2_chain_list *cl);
65 static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
66                                    struct inode *alloc_inode,
67                                    struct buffer_head *bh,
68                                    u64 max_block,
69                                    u64 *last_alloc_group,
70                                    int flags);
71
72 static int ocfs2_cluster_group_search(struct inode *inode,
73                                       struct buffer_head *group_bh,
74                                       u32 bits_wanted, u32 min_bits,
75                                       u64 max_block,
76                                       u16 *bit_off, u16 *bits_found);
77 static int ocfs2_block_group_search(struct inode *inode,
78                                     struct buffer_head *group_bh,
79                                     u32 bits_wanted, u32 min_bits,
80                                     u64 max_block,
81                                     u16 *bit_off, u16 *bits_found);
82 static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb,
83                                      struct ocfs2_alloc_context *ac,
84                                      handle_t *handle,
85                                      u32 bits_wanted,
86                                      u32 min_bits,
87                                      u16 *bit_off,
88                                      unsigned int *num_bits,
89                                      u64 *bg_blkno);
90 static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh,
91                                          int nr);
92 static inline int ocfs2_block_group_set_bits(handle_t *handle,
93                                              struct inode *alloc_inode,
94                                              struct ocfs2_group_desc *bg,
95                                              struct buffer_head *group_bh,
96                                              unsigned int bit_off,
97                                              unsigned int num_bits);
98 static inline int ocfs2_block_group_clear_bits(handle_t *handle,
99                                                struct inode *alloc_inode,
100                                                struct ocfs2_group_desc *bg,
101                                                struct buffer_head *group_bh,
102                                                unsigned int bit_off,
103                                                unsigned int num_bits);
104
105 static int ocfs2_relink_block_group(handle_t *handle,
106                                     struct inode *alloc_inode,
107                                     struct buffer_head *fe_bh,
108                                     struct buffer_head *bg_bh,
109                                     struct buffer_head *prev_bg_bh,
110                                     u16 chain);
111 static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc *bg,
112                                                      u32 wanted);
113 static inline u32 ocfs2_desc_bitmap_to_cluster_off(struct inode *inode,
114                                                    u64 bg_blkno,
115                                                    u16 bg_bit_off);
116 static inline void ocfs2_block_to_cluster_group(struct inode *inode,
117                                                 u64 data_blkno,
118                                                 u64 *bg_blkno,
119                                                 u16 *bg_bit_off);
120 static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb,
121                                              u32 bits_wanted, u64 max_block,
122                                              int flags,
123                                              struct ocfs2_alloc_context **ac);
124
125 void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac)
126 {
127         struct inode *inode = ac->ac_inode;
128
129         if (inode) {
130                 if (ac->ac_which != OCFS2_AC_USE_LOCAL)
131                         ocfs2_inode_unlock(inode, 1);
132
133                 mutex_unlock(&inode->i_mutex);
134
135                 iput(inode);
136                 ac->ac_inode = NULL;
137         }
138         brelse(ac->ac_bh);
139         ac->ac_bh = NULL;
140 }
141
/*
 * Tear down an allocation context completely: release the resources it
 * holds via ocfs2_free_ac_resource(), then free the context memory.
 * 'ac' must not be used after this returns.
 */
void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac)
{
	/* Resources first; they are reached through the context. */
	ocfs2_free_ac_resource(ac);

	kfree(ac);
}
147
148 static u32 ocfs2_bits_per_group(struct ocfs2_chain_list *cl)
149 {
150         return (u32)le16_to_cpu(cl->cl_cpg) * (u32)le16_to_cpu(cl->cl_bpc);
151 }
152
/*
 * Report a group descriptor validation failure.
 *
 * Expands in functions that have 'sb' and 'resize' in scope.  With a
 * non-zero 'resize' the problem is only logged through mlog(ML_ERROR)
 * (a newline is appended to the format); otherwise it is handed to
 * ocfs2_error().
 */
#define do_error(fmt, ...)						\
	do {								\
		if (!resize)						\
			ocfs2_error(sb, fmt, ##__VA_ARGS__);		\
		else							\
			mlog(ML_ERROR, fmt "\n", ##__VA_ARGS__);	\
	} while (0)
160
161 static int ocfs2_validate_gd_self(struct super_block *sb,
162                                   struct buffer_head *bh,
163                                   int resize)
164 {
165         struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
166
167         if (!OCFS2_IS_VALID_GROUP_DESC(gd)) {
168                 do_error("Group descriptor #%llu has bad signature %.*s",
169                          (unsigned long long)bh->b_blocknr, 7,
170                          gd->bg_signature);
171                 return -EINVAL;
172         }
173
174         if (le64_to_cpu(gd->bg_blkno) != bh->b_blocknr) {
175                 do_error("Group descriptor #%llu has an invalid bg_blkno "
176                          "of %llu",
177                          (unsigned long long)bh->b_blocknr,
178                          (unsigned long long)le64_to_cpu(gd->bg_blkno));
179                 return -EINVAL;
180         }
181
182         if (le32_to_cpu(gd->bg_generation) != OCFS2_SB(sb)->fs_generation) {
183                 do_error("Group descriptor #%llu has an invalid "
184                          "fs_generation of #%u",
185                          (unsigned long long)bh->b_blocknr,
186                          le32_to_cpu(gd->bg_generation));
187                 return -EINVAL;
188         }
189
190         if (le16_to_cpu(gd->bg_free_bits_count) > le16_to_cpu(gd->bg_bits)) {
191                 do_error("Group descriptor #%llu has bit count %u but "
192                          "claims that %u are free",
193                          (unsigned long long)bh->b_blocknr,
194                          le16_to_cpu(gd->bg_bits),
195                          le16_to_cpu(gd->bg_free_bits_count));
196                 return -EINVAL;
197         }
198
199         if (le16_to_cpu(gd->bg_bits) > (8 * le16_to_cpu(gd->bg_size))) {
200                 do_error("Group descriptor #%llu has bit count %u but "
201                          "max bitmap bits of %u",
202                          (unsigned long long)bh->b_blocknr,
203                          le16_to_cpu(gd->bg_bits),
204                          8 * le16_to_cpu(gd->bg_size));
205                 return -EINVAL;
206         }
207
208         return 0;
209 }
210
211 static int ocfs2_validate_gd_parent(struct super_block *sb,
212                                     struct ocfs2_dinode *di,
213                                     struct buffer_head *bh,
214                                     int resize)
215 {
216         unsigned int max_bits;
217         struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
218
219         if (di->i_blkno != gd->bg_parent_dinode) {
220                 do_error("Group descriptor #%llu has bad parent "
221                          "pointer (%llu, expected %llu)",
222                          (unsigned long long)bh->b_blocknr,
223                          (unsigned long long)le64_to_cpu(gd->bg_parent_dinode),
224                          (unsigned long long)le64_to_cpu(di->i_blkno));
225                 return -EINVAL;
226         }
227
228         max_bits = le16_to_cpu(di->id2.i_chain.cl_cpg) * le16_to_cpu(di->id2.i_chain.cl_bpc);
229         if (le16_to_cpu(gd->bg_bits) > max_bits) {
230                 do_error("Group descriptor #%llu has bit count of %u",
231                          (unsigned long long)bh->b_blocknr,
232                          le16_to_cpu(gd->bg_bits));
233                 return -EINVAL;
234         }
235
236         /* In resize, we may meet the case bg_chain == cl_next_free_rec. */
237         if ((le16_to_cpu(gd->bg_chain) >
238              le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) ||
239             ((le16_to_cpu(gd->bg_chain) ==
240              le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) && !resize)) {
241                 do_error("Group descriptor #%llu has bad chain %u",
242                          (unsigned long long)bh->b_blocknr,
243                          le16_to_cpu(gd->bg_chain));
244                 return -EINVAL;
245         }
246
247         return 0;
248 }
249
250 #undef do_error
251
252 /*
253  * This version only prints errors.  It does not fail the filesystem, and
254  * exists only for resize.
255  */
256 int ocfs2_check_group_descriptor(struct super_block *sb,
257                                  struct ocfs2_dinode *di,
258                                  struct buffer_head *bh)
259 {
260         int rc;
261         struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
262
263         BUG_ON(!buffer_uptodate(bh));
264
265         /*
266          * If the ecc fails, we return the error but otherwise
267          * leave the filesystem running.  We know any error is
268          * local to this block.
269          */
270         rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &gd->bg_check);
271         if (rc) {
272                 mlog(ML_ERROR,
273                      "Checksum failed for group descriptor %llu\n",
274                      (unsigned long long)bh->b_blocknr);
275         } else
276                 rc = ocfs2_validate_gd_self(sb, bh, 1);
277         if (!rc)
278                 rc = ocfs2_validate_gd_parent(sb, di, bh, 1);
279
280         return rc;
281 }
282
283 static int ocfs2_validate_group_descriptor(struct super_block *sb,
284                                            struct buffer_head *bh)
285 {
286         int rc;
287         struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
288
289         mlog(0, "Validating group descriptor %llu\n",
290              (unsigned long long)bh->b_blocknr);
291
292         BUG_ON(!buffer_uptodate(bh));
293
294         /*
295          * If the ecc fails, we return the error but otherwise
296          * leave the filesystem running.  We know any error is
297          * local to this block.
298          */
299         rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &gd->bg_check);
300         if (rc)
301                 return rc;
302
303         /*
304          * Errors after here are fatal.
305          */
306
307         return ocfs2_validate_gd_self(sb, bh, 0);
308 }
309
310 int ocfs2_read_group_descriptor(struct inode *inode, struct ocfs2_dinode *di,
311                                 u64 gd_blkno, struct buffer_head **bh)
312 {
313         int rc;
314         struct buffer_head *tmp = *bh;
315
316         rc = ocfs2_read_block(INODE_CACHE(inode), gd_blkno, &tmp,
317                               ocfs2_validate_group_descriptor);
318         if (rc)
319                 goto out;
320
321         rc = ocfs2_validate_gd_parent(inode->i_sb, di, tmp, 0);
322         if (rc) {
323                 brelse(tmp);
324                 goto out;
325         }
326
327         /* If ocfs2_read_block() got us a new bh, pass it up. */
328         if (!*bh)
329                 *bh = tmp;
330
331 out:
332         return rc;
333 }
334
335 static int ocfs2_block_group_fill(handle_t *handle,
336                                   struct inode *alloc_inode,
337                                   struct buffer_head *bg_bh,
338                                   u64 group_blkno,
339                                   u16 my_chain,
340                                   struct ocfs2_chain_list *cl)
341 {
342         int status = 0;
343         struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
344         struct super_block * sb = alloc_inode->i_sb;
345
346         mlog_entry_void();
347
348         if (((unsigned long long) bg_bh->b_blocknr) != group_blkno) {
349                 ocfs2_error(alloc_inode->i_sb, "group block (%llu) != "
350                             "b_blocknr (%llu)",
351                             (unsigned long long)group_blkno,
352                             (unsigned long long) bg_bh->b_blocknr);
353                 status = -EIO;
354                 goto bail;
355         }
356
357         status = ocfs2_journal_access_gd(handle,
358                                          INODE_CACHE(alloc_inode),
359                                          bg_bh,
360                                          OCFS2_JOURNAL_ACCESS_CREATE);
361         if (status < 0) {
362                 mlog_errno(status);
363                 goto bail;
364         }
365
366         memset(bg, 0, sb->s_blocksize);
367         strcpy(bg->bg_signature, OCFS2_GROUP_DESC_SIGNATURE);
368         bg->bg_generation = cpu_to_le32(OCFS2_SB(sb)->fs_generation);
369         bg->bg_size = cpu_to_le16(ocfs2_group_bitmap_size(sb));
370         bg->bg_bits = cpu_to_le16(ocfs2_bits_per_group(cl));
371         bg->bg_chain = cpu_to_le16(my_chain);
372         bg->bg_next_group = cl->cl_recs[my_chain].c_blkno;
373         bg->bg_parent_dinode = cpu_to_le64(OCFS2_I(alloc_inode)->ip_blkno);
374         bg->bg_blkno = cpu_to_le64(group_blkno);
375         /* set the 1st bit in the bitmap to account for the descriptor block */
376         ocfs2_set_bit(0, (unsigned long *)bg->bg_bitmap);
377         bg->bg_free_bits_count = cpu_to_le16(le16_to_cpu(bg->bg_bits) - 1);
378
379         status = ocfs2_journal_dirty(handle, bg_bh);
380         if (status < 0)
381                 mlog_errno(status);
382
383         /* There is no need to zero out or otherwise initialize the
384          * other blocks in a group - All valid FS metadata in a block
385          * group stores the superblock fs_generation value at
386          * allocation time. */
387
388 bail:
389         mlog_exit(status);
390         return status;
391 }
392
393 static inline u16 ocfs2_find_smallest_chain(struct ocfs2_chain_list *cl)
394 {
395         u16 curr, best;
396
397         best = curr = 0;
398         while (curr < le16_to_cpu(cl->cl_count)) {
399                 if (le32_to_cpu(cl->cl_recs[best].c_total) >
400                     le32_to_cpu(cl->cl_recs[curr].c_total))
401                         best = curr;
402                 curr++;
403         }
404         return best;
405 }
406
407 /*
408  * We expect the block group allocator to already be locked.
409  */
410 static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
411                                    struct inode *alloc_inode,
412                                    struct buffer_head *bh,
413                                    u64 max_block,
414                                    u64 *last_alloc_group,
415                                    int flags)
416 {
417         int status, credits;
418         struct ocfs2_dinode *fe = (struct ocfs2_dinode *) bh->b_data;
419         struct ocfs2_chain_list *cl;
420         struct ocfs2_alloc_context *ac = NULL;
421         handle_t *handle = NULL;
422         u32 bit_off, num_bits;
423         u16 alloc_rec;
424         u64 bg_blkno;
425         struct buffer_head *bg_bh = NULL;
426         struct ocfs2_group_desc *bg;
427
428         BUG_ON(ocfs2_is_cluster_bitmap(alloc_inode));
429
430         mlog_entry_void();
431
432         cl = &fe->id2.i_chain;
433         status = ocfs2_reserve_clusters_with_limit(osb,
434                                                    le16_to_cpu(cl->cl_cpg),
435                                                    max_block, flags, &ac);
436         if (status < 0) {
437                 if (status != -ENOSPC)
438                         mlog_errno(status);
439                 goto bail;
440         }
441
442         credits = ocfs2_calc_group_alloc_credits(osb->sb,
443                                                  le16_to_cpu(cl->cl_cpg));
444         handle = ocfs2_start_trans(osb, credits);
445         if (IS_ERR(handle)) {
446                 status = PTR_ERR(handle);
447                 handle = NULL;
448                 mlog_errno(status);
449                 goto bail;
450         }
451
452         if (last_alloc_group && *last_alloc_group != 0) {
453                 mlog(0, "use old allocation group %llu for block group alloc\n",
454                      (unsigned long long)*last_alloc_group);
455                 ac->ac_last_group = *last_alloc_group;
456         }
457         status = ocfs2_claim_clusters(osb,
458                                       handle,
459                                       ac,
460                                       le16_to_cpu(cl->cl_cpg),
461                                       &bit_off,
462                                       &num_bits);
463         if (status < 0) {
464                 if (status != -ENOSPC)
465                         mlog_errno(status);
466                 goto bail;
467         }
468
469         alloc_rec = ocfs2_find_smallest_chain(cl);
470
471         /* setup the group */
472         bg_blkno = ocfs2_clusters_to_blocks(osb->sb, bit_off);
473         mlog(0, "new descriptor, record %u, at block %llu\n",
474              alloc_rec, (unsigned long long)bg_blkno);
475
476         bg_bh = sb_getblk(osb->sb, bg_blkno);
477         if (!bg_bh) {
478                 status = -EIO;
479                 mlog_errno(status);
480                 goto bail;
481         }
482         ocfs2_set_new_buffer_uptodate(INODE_CACHE(alloc_inode), bg_bh);
483
484         status = ocfs2_block_group_fill(handle,
485                                         alloc_inode,
486                                         bg_bh,
487                                         bg_blkno,
488                                         alloc_rec,
489                                         cl);
490         if (status < 0) {
491                 mlog_errno(status);
492                 goto bail;
493         }
494
495         bg = (struct ocfs2_group_desc *) bg_bh->b_data;
496
497         status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode),
498                                          bh, OCFS2_JOURNAL_ACCESS_WRITE);
499         if (status < 0) {
500                 mlog_errno(status);
501                 goto bail;
502         }
503
504         le32_add_cpu(&cl->cl_recs[alloc_rec].c_free,
505                      le16_to_cpu(bg->bg_free_bits_count));
506         le32_add_cpu(&cl->cl_recs[alloc_rec].c_total, le16_to_cpu(bg->bg_bits));
507         cl->cl_recs[alloc_rec].c_blkno  = cpu_to_le64(bg_blkno);
508         if (le16_to_cpu(cl->cl_next_free_rec) < le16_to_cpu(cl->cl_count))
509                 le16_add_cpu(&cl->cl_next_free_rec, 1);
510
511         le32_add_cpu(&fe->id1.bitmap1.i_used, le16_to_cpu(bg->bg_bits) -
512                                         le16_to_cpu(bg->bg_free_bits_count));
513         le32_add_cpu(&fe->id1.bitmap1.i_total, le16_to_cpu(bg->bg_bits));
514         le32_add_cpu(&fe->i_clusters, le16_to_cpu(cl->cl_cpg));
515
516         status = ocfs2_journal_dirty(handle, bh);
517         if (status < 0) {
518                 mlog_errno(status);
519                 goto bail;
520         }
521
522         spin_lock(&OCFS2_I(alloc_inode)->ip_lock);
523         OCFS2_I(alloc_inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
524         fe->i_size = cpu_to_le64(ocfs2_clusters_to_bytes(alloc_inode->i_sb,
525                                              le32_to_cpu(fe->i_clusters)));
526         spin_unlock(&OCFS2_I(alloc_inode)->ip_lock);
527         i_size_write(alloc_inode, le64_to_cpu(fe->i_size));
528         alloc_inode->i_blocks = ocfs2_inode_sector_count(alloc_inode);
529
530         status = 0;
531
532         /* save the new last alloc group so that the caller can cache it. */
533         if (last_alloc_group)
534                 *last_alloc_group = ac->ac_last_group;
535
536 bail:
537         if (handle)
538                 ocfs2_commit_trans(osb, handle);
539
540         if (ac)
541                 ocfs2_free_alloc_context(ac);
542
543         brelse(bg_bh);
544
545         mlog_exit(status);
546         return status;
547 }
548
549 static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
550                                        struct ocfs2_alloc_context *ac,
551                                        int type,
552                                        u32 slot,
553                                        u64 *last_alloc_group,
554                                        int flags)
555 {
556         int status;
557         u32 bits_wanted = ac->ac_bits_wanted;
558         struct inode *alloc_inode;
559         struct buffer_head *bh = NULL;
560         struct ocfs2_dinode *fe;
561         u32 free_bits;
562
563         mlog_entry_void();
564
565         alloc_inode = ocfs2_get_system_file_inode(osb, type, slot);
566         if (!alloc_inode) {
567                 mlog_errno(-EINVAL);
568                 return -EINVAL;
569         }
570
571         mutex_lock(&alloc_inode->i_mutex);
572
573         status = ocfs2_inode_lock(alloc_inode, &bh, 1);
574         if (status < 0) {
575                 mutex_unlock(&alloc_inode->i_mutex);
576                 iput(alloc_inode);
577
578                 mlog_errno(status);
579                 return status;
580         }
581
582         ac->ac_inode = alloc_inode;
583         ac->ac_alloc_slot = slot;
584
585         fe = (struct ocfs2_dinode *) bh->b_data;
586
587         /* The bh was validated by the inode read inside
588          * ocfs2_inode_lock().  Any corruption is a code bug. */
589         BUG_ON(!OCFS2_IS_VALID_DINODE(fe));
590
591         if (!(fe->i_flags & cpu_to_le32(OCFS2_CHAIN_FL))) {
592                 ocfs2_error(alloc_inode->i_sb, "Invalid chain allocator %llu",
593                             (unsigned long long)le64_to_cpu(fe->i_blkno));
594                 status = -EIO;
595                 goto bail;
596         }
597
598         free_bits = le32_to_cpu(fe->id1.bitmap1.i_total) -
599                 le32_to_cpu(fe->id1.bitmap1.i_used);
600
601         if (bits_wanted > free_bits) {
602                 /* cluster bitmap never grows */
603                 if (ocfs2_is_cluster_bitmap(alloc_inode)) {
604                         mlog(0, "Disk Full: wanted=%u, free_bits=%u\n",
605                              bits_wanted, free_bits);
606                         status = -ENOSPC;
607                         goto bail;
608                 }
609
610                 if (!(flags & ALLOC_NEW_GROUP)) {
611                         mlog(0, "Alloc File %u Full: wanted=%u, free_bits=%u, "
612                              "and we don't alloc a new group for it.\n",
613                              slot, bits_wanted, free_bits);
614                         status = -ENOSPC;
615                         goto bail;
616                 }
617
618                 status = ocfs2_block_group_alloc(osb, alloc_inode, bh,
619                                                  ac->ac_max_block,
620                                                  last_alloc_group, flags);
621                 if (status < 0) {
622                         if (status != -ENOSPC)
623                                 mlog_errno(status);
624                         goto bail;
625                 }
626                 atomic_inc(&osb->alloc_stats.bg_extends);
627
628                 /* You should never ask for this much metadata */
629                 BUG_ON(bits_wanted >
630                        (le32_to_cpu(fe->id1.bitmap1.i_total)
631                         - le32_to_cpu(fe->id1.bitmap1.i_used)));
632         }
633
634         get_bh(bh);
635         ac->ac_bh = bh;
636 bail:
637         brelse(bh);
638
639         mlog_exit(status);
640         return status;
641 }
642
643 int ocfs2_reserve_new_metadata_blocks(struct ocfs2_super *osb,
644                                       int blocks,
645                                       struct ocfs2_alloc_context **ac)
646 {
647         int status;
648         u32 slot;
649
650         *ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
651         if (!(*ac)) {
652                 status = -ENOMEM;
653                 mlog_errno(status);
654                 goto bail;
655         }
656
657         (*ac)->ac_bits_wanted = blocks;
658         (*ac)->ac_which = OCFS2_AC_USE_META;
659         slot = osb->slot_num;
660         (*ac)->ac_group_search = ocfs2_block_group_search;
661
662         status = ocfs2_reserve_suballoc_bits(osb, (*ac),
663                                              EXTENT_ALLOC_SYSTEM_INODE,
664                                              slot, NULL, ALLOC_NEW_GROUP);
665         if (status < 0) {
666                 if (status != -ENOSPC)
667                         mlog_errno(status);
668                 goto bail;
669         }
670
671         status = 0;
672 bail:
673         if ((status < 0) && *ac) {
674                 ocfs2_free_alloc_context(*ac);
675                 *ac = NULL;
676         }
677
678         mlog_exit(status);
679         return status;
680 }
681
/*
 * Reserve the number of metadata blocks that ocfs2_extend_meta_needed()
 * reports for @root_el.  See ocfs2_reserve_new_metadata_blocks() for the
 * meaning of @ac and the return value.
 */
int ocfs2_reserve_new_metadata(struct ocfs2_super *osb,
			       struct ocfs2_extent_list *root_el,
			       struct ocfs2_alloc_context **ac)
{
	int blocks_needed = ocfs2_extend_meta_needed(root_el);

	return ocfs2_reserve_new_metadata_blocks(osb, blocks_needed, ac);
}
690
691 static int ocfs2_steal_inode_from_other_nodes(struct ocfs2_super *osb,
692                                               struct ocfs2_alloc_context *ac)
693 {
694         int i, status = -ENOSPC;
695         s16 slot = ocfs2_get_inode_steal_slot(osb);
696
697         /* Start to steal inodes from the first slot after ours. */
698         if (slot == OCFS2_INVALID_SLOT)
699                 slot = osb->slot_num + 1;
700
701         for (i = 0; i < osb->max_slots; i++, slot++) {
702                 if (slot == osb->max_slots)
703                         slot = 0;
704
705                 if (slot == osb->slot_num)
706                         continue;
707
708                 status = ocfs2_reserve_suballoc_bits(osb, ac,
709                                                      INODE_ALLOC_SYSTEM_INODE,
710                                                      slot, NULL,
711                                                      NOT_ALLOC_NEW_GROUP);
712                 if (status >= 0) {
713                         ocfs2_set_inode_steal_slot(osb, slot);
714                         break;
715                 }
716
717                 ocfs2_free_ac_resource(ac);
718         }
719
720         return status;
721 }
722
723 int ocfs2_reserve_new_inode(struct ocfs2_super *osb,
724                             struct ocfs2_alloc_context **ac)
725 {
726         int status;
727         s16 slot = ocfs2_get_inode_steal_slot(osb);
728         u64 alloc_group;
729
730         *ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
731         if (!(*ac)) {
732                 status = -ENOMEM;
733                 mlog_errno(status);
734                 goto bail;
735         }
736
737         (*ac)->ac_bits_wanted = 1;
738         (*ac)->ac_which = OCFS2_AC_USE_INODE;
739
740         (*ac)->ac_group_search = ocfs2_block_group_search;
741
742         /*
743          * stat(2) can't handle i_ino > 32bits, so we tell the
744          * lower levels not to allocate us a block group past that
745          * limit.  The 'inode64' mount option avoids this behavior.
746          */
747         if (!(osb->s_mount_opt & OCFS2_MOUNT_INODE64))
748                 (*ac)->ac_max_block = (u32)~0U;
749
750         /*
751          * slot is set when we successfully steal inode from other nodes.
752          * It is reset in 3 places:
753          * 1. when we flush the truncate log
754          * 2. when we complete local alloc recovery.
755          * 3. when we successfully allocate from our own slot.
756          * After it is set, we will go on stealing inodes until we find the
757          * need to check our slots to see whether there is some space for us.
758          */
759         if (slot != OCFS2_INVALID_SLOT &&
760             atomic_read(&osb->s_num_inodes_stolen) < OCFS2_MAX_INODES_TO_STEAL)
761                 goto inode_steal;
762
763         atomic_set(&osb->s_num_inodes_stolen, 0);
764         alloc_group = osb->osb_inode_alloc_group;
765         status = ocfs2_reserve_suballoc_bits(osb, *ac,
766                                              INODE_ALLOC_SYSTEM_INODE,
767                                              osb->slot_num,
768                                              &alloc_group,
769                                              ALLOC_NEW_GROUP |
770                                              ALLOC_GROUPS_FROM_GLOBAL);
771         if (status >= 0) {
772                 status = 0;
773
774                 spin_lock(&osb->osb_lock);
775                 osb->osb_inode_alloc_group = alloc_group;
776                 spin_unlock(&osb->osb_lock);
777                 mlog(0, "after reservation, new allocation group is "
778                      "%llu\n", (unsigned long long)alloc_group);
779
780                 /*
781                  * Some inodes must be freed by us, so try to allocate
782                  * from our own next time.
783                  */
784                 if (slot != OCFS2_INVALID_SLOT)
785                         ocfs2_init_inode_steal_slot(osb);
786                 goto bail;
787         } else if (status < 0 && status != -ENOSPC) {
788                 mlog_errno(status);
789                 goto bail;
790         }
791
792         ocfs2_free_ac_resource(*ac);
793
794 inode_steal:
795         status = ocfs2_steal_inode_from_other_nodes(osb, *ac);
796         atomic_inc(&osb->s_num_inodes_stolen);
797         if (status < 0) {
798                 if (status != -ENOSPC)
799                         mlog_errno(status);
800                 goto bail;
801         }
802
803         status = 0;
804 bail:
805         if ((status < 0) && *ac) {
806                 ocfs2_free_alloc_context(*ac);
807                 *ac = NULL;
808         }
809
810         mlog_exit(status);
811         return status;
812 }
813
814 /* local alloc code has to do the same thing, so rather than do this
815  * twice.. */
816 int ocfs2_reserve_cluster_bitmap_bits(struct ocfs2_super *osb,
817                                       struct ocfs2_alloc_context *ac)
818 {
819         int status;
820
821         ac->ac_which = OCFS2_AC_USE_MAIN;
822         ac->ac_group_search = ocfs2_cluster_group_search;
823
824         status = ocfs2_reserve_suballoc_bits(osb, ac,
825                                              GLOBAL_BITMAP_SYSTEM_INODE,
826                                              OCFS2_INVALID_SLOT, NULL,
827                                              ALLOC_NEW_GROUP);
828         if (status < 0 && status != -ENOSPC) {
829                 mlog_errno(status);
830                 goto bail;
831         }
832
833 bail:
834         return status;
835 }
836
837 /* Callers don't need to care which bitmap (local alloc or main) to
838  * use so we figure it out for them, but unfortunately this clutters
839  * things a bit. */
840 static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb,
841                                              u32 bits_wanted, u64 max_block,
842                                              int flags,
843                                              struct ocfs2_alloc_context **ac)
844 {
845         int status;
846
847         mlog_entry_void();
848
849         *ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
850         if (!(*ac)) {
851                 status = -ENOMEM;
852                 mlog_errno(status);
853                 goto bail;
854         }
855
856         (*ac)->ac_bits_wanted = bits_wanted;
857         (*ac)->ac_max_block = max_block;
858
859         status = -ENOSPC;
860         if (!(flags & ALLOC_GROUPS_FROM_GLOBAL) &&
861             ocfs2_alloc_should_use_local(osb, bits_wanted)) {
862                 status = ocfs2_reserve_local_alloc_bits(osb,
863                                                         bits_wanted,
864                                                         *ac);
865                 if (status == -EFBIG) {
866                         /* The local alloc window is outside ac_max_block.
867                          * use the main bitmap. */
868                         status = -ENOSPC;
869                 } else if ((status < 0) && (status != -ENOSPC)) {
870                         mlog_errno(status);
871                         goto bail;
872                 }
873         }
874
875         if (status == -ENOSPC) {
876                 status = ocfs2_reserve_cluster_bitmap_bits(osb, *ac);
877                 if (status < 0) {
878                         if (status != -ENOSPC)
879                                 mlog_errno(status);
880                         goto bail;
881                 }
882         }
883
884         status = 0;
885 bail:
886         if ((status < 0) && *ac) {
887                 ocfs2_free_alloc_context(*ac);
888                 *ac = NULL;
889         }
890
891         mlog_exit(status);
892         return status;
893 }
894
895 int ocfs2_reserve_clusters(struct ocfs2_super *osb,
896                            u32 bits_wanted,
897                            struct ocfs2_alloc_context **ac)
898 {
899         return ocfs2_reserve_clusters_with_limit(osb, bits_wanted, 0,
900                                                  ALLOC_NEW_GROUP, ac);
901 }
902
903 /*
904  * More or less lifted from ext3. I'll leave their description below:
905  *
906  * "For ext3 allocations, we must not reuse any blocks which are
907  * allocated in the bitmap buffer's "last committed data" copy.  This
908  * prevents deletes from freeing up the page for reuse until we have
909  * committed the delete transaction.
910  *
911  * If we didn't do this, then deleting something and reallocating it as
912  * data would allow the old block to be overwritten before the
913  * transaction committed (because we force data to disk before commit).
914  * This would lead to corruption if we crashed between overwriting the
915  * data and committing the delete.
916  *
917  * @@@ We may want to make this allocation behaviour conditional on
918  * data-writes at some point, and disable it for metadata allocations or
919  * sync-data inodes."
920  *
921  * Note: OCFS2 already does this differently for metadata vs data
922  * allocations, as those bitmaps are separate and undo access is never
923  * called on a metadata group descriptor.
924  */
925 static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh,
926                                          int nr)
927 {
928         struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
929         int ret;
930
931         if (ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap))
932                 return 0;
933
934         if (!buffer_jbd(bg_bh))
935                 return 1;
936
937         jbd_lock_bh_state(bg_bh);
938         bg = (struct ocfs2_group_desc *) bh2jh(bg_bh)->b_committed_data;
939         if (bg)
940                 ret = !ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap);
941         else
942                 ret = 1;
943         jbd_unlock_bh_state(bg_bh);
944
945         return ret;
946 }
947
948 static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb,
949                                              struct buffer_head *bg_bh,
950                                              unsigned int bits_wanted,
951                                              unsigned int total_bits,
952                                              u16 *bit_off,
953                                              u16 *bits_found)
954 {
955         void *bitmap;
956         u16 best_offset, best_size;
957         int offset, start, found, status = 0;
958         struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
959
960         /* Callers got this descriptor from
961          * ocfs2_read_group_descriptor().  Any corruption is a code bug. */
962         BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
963
964         found = start = best_offset = best_size = 0;
965         bitmap = bg->bg_bitmap;
966
967         while((offset = ocfs2_find_next_zero_bit(bitmap, total_bits, start)) != -1) {
968                 if (offset == total_bits)
969                         break;
970
971                 if (!ocfs2_test_bg_bit_allocatable(bg_bh, offset)) {
972                         /* We found a zero, but we can't use it as it
973                          * hasn't been put to disk yet! */
974                         found = 0;
975                         start = offset + 1;
976                 } else if (offset == start) {
977                         /* we found a zero */
978                         found++;
979                         /* move start to the next bit to test */
980                         start++;
981                 } else {
982                         /* got a zero after some ones */
983                         found = 1;
984                         start = offset + 1;
985                 }
986                 if (found > best_size) {
987                         best_size = found;
988                         best_offset = start - found;
989                 }
990                 /* we got everything we needed */
991                 if (found == bits_wanted) {
992                         /* mlog(0, "Found it all!\n"); */
993                         break;
994                 }
995         }
996
997         /* XXX: I think the first clause is equivalent to the second
998          *      - jlbec */
999         if (found == bits_wanted) {
1000                 *bit_off = start - found;
1001                 *bits_found = found;
1002         } else if (best_size) {
1003                 *bit_off = best_offset;
1004                 *bits_found = best_size;
1005         } else {
1006                 status = -ENOSPC;
1007                 /* No error log here -- see the comment above
1008                  * ocfs2_test_bg_bit_allocatable */
1009         }
1010
1011         return status;
1012 }
1013
1014 static inline int ocfs2_block_group_set_bits(handle_t *handle,
1015                                              struct inode *alloc_inode,
1016                                              struct ocfs2_group_desc *bg,
1017                                              struct buffer_head *group_bh,
1018                                              unsigned int bit_off,
1019                                              unsigned int num_bits)
1020 {
1021         int status;
1022         void *bitmap = bg->bg_bitmap;
1023         int journal_type = OCFS2_JOURNAL_ACCESS_WRITE;
1024
1025         mlog_entry_void();
1026
1027         /* All callers get the descriptor via
1028          * ocfs2_read_group_descriptor().  Any corruption is a code bug. */
1029         BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
1030         BUG_ON(le16_to_cpu(bg->bg_free_bits_count) < num_bits);
1031
1032         mlog(0, "block_group_set_bits: off = %u, num = %u\n", bit_off,
1033              num_bits);
1034
1035         if (ocfs2_is_cluster_bitmap(alloc_inode))
1036                 journal_type = OCFS2_JOURNAL_ACCESS_UNDO;
1037
1038         status = ocfs2_journal_access_gd(handle,
1039                                          INODE_CACHE(alloc_inode),
1040                                          group_bh,
1041                                          journal_type);
1042         if (status < 0) {
1043                 mlog_errno(status);
1044                 goto bail;
1045         }
1046
1047         le16_add_cpu(&bg->bg_free_bits_count, -num_bits);
1048
1049         while(num_bits--)
1050                 ocfs2_set_bit(bit_off++, bitmap);
1051
1052         status = ocfs2_journal_dirty(handle,
1053                                      group_bh);
1054         if (status < 0) {
1055                 mlog_errno(status);
1056                 goto bail;
1057         }
1058
1059 bail:
1060         mlog_exit(status);
1061         return status;
1062 }
1063
1064 /* find the one with the most empty bits */
1065 static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl)
1066 {
1067         u16 curr, best;
1068
1069         BUG_ON(!cl->cl_next_free_rec);
1070
1071         best = curr = 0;
1072         while (curr < le16_to_cpu(cl->cl_next_free_rec)) {
1073                 if (le32_to_cpu(cl->cl_recs[curr].c_free) >
1074                     le32_to_cpu(cl->cl_recs[best].c_free))
1075                         best = curr;
1076                 curr++;
1077         }
1078
1079         BUG_ON(best >= le16_to_cpu(cl->cl_next_free_rec));
1080         return best;
1081 }
1082
1083 static int ocfs2_relink_block_group(handle_t *handle,
1084                                     struct inode *alloc_inode,
1085                                     struct buffer_head *fe_bh,
1086                                     struct buffer_head *bg_bh,
1087                                     struct buffer_head *prev_bg_bh,
1088                                     u16 chain)
1089 {
1090         int status;
1091         /* there is a really tiny chance the journal calls could fail,
1092          * but we wouldn't want inconsistent blocks in *any* case. */
1093         u64 fe_ptr, bg_ptr, prev_bg_ptr;
1094         struct ocfs2_dinode *fe = (struct ocfs2_dinode *) fe_bh->b_data;
1095         struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
1096         struct ocfs2_group_desc *prev_bg = (struct ocfs2_group_desc *) prev_bg_bh->b_data;
1097
1098         /* The caller got these descriptors from
1099          * ocfs2_read_group_descriptor().  Any corruption is a code bug. */
1100         BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
1101         BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(prev_bg));
1102
1103         mlog(0, "Suballoc %llu, chain %u, move group %llu to top, prev = %llu\n",
1104              (unsigned long long)le64_to_cpu(fe->i_blkno), chain,
1105              (unsigned long long)le64_to_cpu(bg->bg_blkno),
1106              (unsigned long long)le64_to_cpu(prev_bg->bg_blkno));
1107
1108         fe_ptr = le64_to_cpu(fe->id2.i_chain.cl_recs[chain].c_blkno);
1109         bg_ptr = le64_to_cpu(bg->bg_next_group);
1110         prev_bg_ptr = le64_to_cpu(prev_bg->bg_next_group);
1111
1112         status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode),
1113                                          prev_bg_bh,
1114                                          OCFS2_JOURNAL_ACCESS_WRITE);
1115         if (status < 0) {
1116                 mlog_errno(status);
1117                 goto out_rollback;
1118         }
1119
1120         prev_bg->bg_next_group = bg->bg_next_group;
1121
1122         status = ocfs2_journal_dirty(handle, prev_bg_bh);
1123         if (status < 0) {
1124                 mlog_errno(status);
1125                 goto out_rollback;
1126         }
1127
1128         status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode),
1129                                          bg_bh, OCFS2_JOURNAL_ACCESS_WRITE);
1130         if (status < 0) {
1131                 mlog_errno(status);
1132                 goto out_rollback;
1133         }
1134
1135         bg->bg_next_group = fe->id2.i_chain.cl_recs[chain].c_blkno;
1136
1137         status = ocfs2_journal_dirty(handle, bg_bh);
1138         if (status < 0) {
1139                 mlog_errno(status);
1140                 goto out_rollback;
1141         }
1142
1143         status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode),
1144                                          fe_bh, OCFS2_JOURNAL_ACCESS_WRITE);
1145         if (status < 0) {
1146                 mlog_errno(status);
1147                 goto out_rollback;
1148         }
1149
1150         fe->id2.i_chain.cl_recs[chain].c_blkno = bg->bg_blkno;
1151
1152         status = ocfs2_journal_dirty(handle, fe_bh);
1153         if (status < 0) {
1154                 mlog_errno(status);
1155                 goto out_rollback;
1156         }
1157
1158         status = 0;
1159 out_rollback:
1160         if (status < 0) {
1161                 fe->id2.i_chain.cl_recs[chain].c_blkno = cpu_to_le64(fe_ptr);
1162                 bg->bg_next_group = cpu_to_le64(bg_ptr);
1163                 prev_bg->bg_next_group = cpu_to_le64(prev_bg_ptr);
1164         }
1165
1166         mlog_exit(status);
1167         return status;
1168 }
1169
1170 static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc *bg,
1171                                                      u32 wanted)
1172 {
1173         return le16_to_cpu(bg->bg_free_bits_count) > wanted;
1174 }
1175
1176 /* return 0 on success, -ENOSPC to keep searching and any other < 0
1177  * value on error. */
1178 static int ocfs2_cluster_group_search(struct inode *inode,
1179                                       struct buffer_head *group_bh,
1180                                       u32 bits_wanted, u32 min_bits,
1181                                       u64 max_block,
1182                                       u16 *bit_off, u16 *bits_found)
1183 {
1184         int search = -ENOSPC;
1185         int ret;
1186         u64 blkoff;
1187         struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *) group_bh->b_data;
1188         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1189         u16 tmp_off, tmp_found;
1190         unsigned int max_bits, gd_cluster_off;
1191
1192         BUG_ON(!ocfs2_is_cluster_bitmap(inode));
1193
1194         if (gd->bg_free_bits_count) {
1195                 max_bits = le16_to_cpu(gd->bg_bits);
1196
1197                 /* Tail groups in cluster bitmaps which aren't cpg
1198                  * aligned are prone to partial extention by a failed
1199                  * fs resize. If the file system resize never got to
1200                  * update the dinode cluster count, then we don't want
1201                  * to trust any clusters past it, regardless of what
1202                  * the group descriptor says. */
1203                 gd_cluster_off = ocfs2_blocks_to_clusters(inode->i_sb,
1204                                                           le64_to_cpu(gd->bg_blkno));
1205                 if ((gd_cluster_off + max_bits) >
1206                     OCFS2_I(inode)->ip_clusters) {
1207                         max_bits = OCFS2_I(inode)->ip_clusters - gd_cluster_off;
1208                         mlog(0, "Desc %llu, bg_bits %u, clusters %u, use %u\n",
1209                              (unsigned long long)le64_to_cpu(gd->bg_blkno),
1210                              le16_to_cpu(gd->bg_bits),
1211                              OCFS2_I(inode)->ip_clusters, max_bits);
1212                 }
1213
1214                 ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb),
1215                                                         group_bh, bits_wanted,
1216                                                         max_bits,
1217                                                         &tmp_off, &tmp_found);
1218                 if (ret)
1219                         return ret;
1220
1221                 if (max_block) {
1222                         blkoff = ocfs2_clusters_to_blocks(inode->i_sb,
1223                                                           gd_cluster_off +
1224                                                           tmp_off + tmp_found);
1225                         mlog(0, "Checking %llu against %llu\n",
1226                              (unsigned long long)blkoff,
1227                              (unsigned long long)max_block);
1228                         if (blkoff > max_block)
1229                                 return -ENOSPC;
1230                 }
1231
1232                 /* ocfs2_block_group_find_clear_bits() might
1233                  * return success, but we still want to return
1234                  * -ENOSPC unless it found the minimum number
1235                  * of bits. */
1236                 if (min_bits <= tmp_found) {
1237                         *bit_off = tmp_off;
1238                         *bits_found = tmp_found;
1239                         search = 0; /* success */
1240                 } else if (tmp_found) {
1241                         /*
1242                          * Don't show bits which we'll be returning
1243                          * for allocation to the local alloc bitmap.
1244                          */
1245                         ocfs2_local_alloc_seen_free_bits(osb, tmp_found);
1246                 }
1247         }
1248
1249         return search;
1250 }
1251
1252 static int ocfs2_block_group_search(struct inode *inode,
1253                                     struct buffer_head *group_bh,
1254                                     u32 bits_wanted, u32 min_bits,
1255                                     u64 max_block,
1256                                     u16 *bit_off, u16 *bits_found)
1257 {
1258         int ret = -ENOSPC;
1259         u64 blkoff;
1260         struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) group_bh->b_data;
1261
1262         BUG_ON(min_bits != 1);
1263         BUG_ON(ocfs2_is_cluster_bitmap(inode));
1264
1265         if (bg->bg_free_bits_count) {
1266                 ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb),
1267                                                         group_bh, bits_wanted,
1268                                                         le16_to_cpu(bg->bg_bits),
1269                                                         bit_off, bits_found);
1270                 if (!ret && max_block) {
1271                         blkoff = le64_to_cpu(bg->bg_blkno) + *bit_off +
1272                                 *bits_found;
1273                         mlog(0, "Checking %llu against %llu\n",
1274                              (unsigned long long)blkoff,
1275                              (unsigned long long)max_block);
1276                         if (blkoff > max_block)
1277                                 ret = -ENOSPC;
1278                 }
1279         }
1280
1281         return ret;
1282 }
1283
1284 static int ocfs2_alloc_dinode_update_counts(struct inode *inode,
1285                                        handle_t *handle,
1286                                        struct buffer_head *di_bh,
1287                                        u32 num_bits,
1288                                        u16 chain)
1289 {
1290         int ret;
1291         u32 tmp_used;
1292         struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data;
1293         struct ocfs2_chain_list *cl = (struct ocfs2_chain_list *) &di->id2.i_chain;
1294
1295         ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
1296                                       OCFS2_JOURNAL_ACCESS_WRITE);
1297         if (ret < 0) {
1298                 mlog_errno(ret);
1299                 goto out;
1300         }
1301
1302         tmp_used = le32_to_cpu(di->id1.bitmap1.i_used);
1303         di->id1.bitmap1.i_used = cpu_to_le32(num_bits + tmp_used);
1304         le32_add_cpu(&cl->cl_recs[chain].c_free, -num_bits);
1305
1306         ret = ocfs2_journal_dirty(handle, di_bh);
1307         if (ret < 0)
1308                 mlog_errno(ret);
1309
1310 out:
1311         return ret;
1312 }
1313
1314 static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac,
1315                                   handle_t *handle,
1316                                   u32 bits_wanted,
1317                                   u32 min_bits,
1318                                   u16 *bit_off,
1319                                   unsigned int *num_bits,
1320                                   u64 gd_blkno,
1321                                   u16 *bits_left)
1322 {
1323         int ret;
1324         u16 found;
1325         struct buffer_head *group_bh = NULL;
1326         struct ocfs2_group_desc *gd;
1327         struct ocfs2_dinode *di = (struct ocfs2_dinode *)ac->ac_bh->b_data;
1328         struct inode *alloc_inode = ac->ac_inode;
1329
1330         ret = ocfs2_read_group_descriptor(alloc_inode, di, gd_blkno,
1331                                           &group_bh);
1332         if (ret < 0) {
1333                 mlog_errno(ret);
1334                 return ret;
1335         }
1336
1337         gd = (struct ocfs2_group_desc *) group_bh->b_data;
1338         ret = ac->ac_group_search(alloc_inode, group_bh, bits_wanted, min_bits,
1339                                   ac->ac_max_block, bit_off, &found);
1340         if (ret < 0) {
1341                 if (ret != -ENOSPC)
1342                         mlog_errno(ret);
1343                 goto out;
1344         }
1345
1346         *num_bits = found;
1347
1348         ret = ocfs2_alloc_dinode_update_counts(alloc_inode, handle, ac->ac_bh,
1349                                                *num_bits,
1350                                                le16_to_cpu(gd->bg_chain));
1351         if (ret < 0) {
1352                 mlog_errno(ret);
1353                 goto out;
1354         }
1355
1356         ret = ocfs2_block_group_set_bits(handle, alloc_inode, gd, group_bh,
1357                                          *bit_off, *num_bits);
1358         if (ret < 0)
1359                 mlog_errno(ret);
1360
1361         *bits_left = le16_to_cpu(gd->bg_free_bits_count);
1362
1363 out:
1364         brelse(group_bh);
1365
1366         return ret;
1367 }
1368
1369 static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
1370                               handle_t *handle,
1371                               u32 bits_wanted,
1372                               u32 min_bits,
1373                               u16 *bit_off,
1374                               unsigned int *num_bits,
1375                               u64 *bg_blkno,
1376                               u16 *bits_left)
1377 {
1378         int status;
1379         u16 chain, tmp_bits;
1380         u32 tmp_used;
1381         u64 next_group;
1382         struct inode *alloc_inode = ac->ac_inode;
1383         struct buffer_head *group_bh = NULL;
1384         struct buffer_head *prev_group_bh = NULL;
1385         struct ocfs2_dinode *fe = (struct ocfs2_dinode *) ac->ac_bh->b_data;
1386         struct ocfs2_chain_list *cl = (struct ocfs2_chain_list *) &fe->id2.i_chain;
1387         struct ocfs2_group_desc *bg;
1388
1389         chain = ac->ac_chain;
1390         mlog(0, "trying to alloc %u bits from chain %u, inode %llu\n",
1391              bits_wanted, chain,
1392              (unsigned long long)OCFS2_I(alloc_inode)->ip_blkno);
1393
1394         status = ocfs2_read_group_descriptor(alloc_inode, fe,
1395                                              le64_to_cpu(cl->cl_recs[chain].c_blkno),
1396                                              &group_bh);
1397         if (status < 0) {
1398                 mlog_errno(status);
1399                 goto bail;
1400         }
1401         bg = (struct ocfs2_group_desc *) group_bh->b_data;
1402
1403         status = -ENOSPC;
1404         /* for now, the chain search is a bit simplistic. We just use
1405          * the 1st group with any empty bits. */
1406         while ((status = ac->ac_group_search(alloc_inode, group_bh,
1407                                              bits_wanted, min_bits,
1408                                              ac->ac_max_block, bit_off,
1409                                              &tmp_bits)) == -ENOSPC) {
1410                 if (!bg->bg_next_group)
1411                         break;
1412
1413                 brelse(prev_group_bh);
1414                 prev_group_bh = NULL;
1415
1416                 next_group = le64_to_cpu(bg->bg_next_group);
1417                 prev_group_bh = group_bh;
1418                 group_bh = NULL;
1419                 status = ocfs2_read_group_descriptor(alloc_inode, fe,
1420                                                      next_group, &group_bh);
1421                 if (status < 0) {
1422                         mlog_errno(status);
1423                         goto bail;
1424                 }
1425                 bg = (struct ocfs2_group_desc *) group_bh->b_data;
1426         }
1427         if (status < 0) {
1428                 if (status != -ENOSPC)
1429                         mlog_errno(status);
1430                 goto bail;
1431         }
1432
1433         mlog(0, "alloc succeeds: we give %u bits from block group %llu\n",
1434              tmp_bits, (unsigned long long)le64_to_cpu(bg->bg_blkno));
1435
1436         *num_bits = tmp_bits;
1437
1438         BUG_ON(*num_bits == 0);
1439
1440         /*
1441          * Keep track of previous block descriptor read. When
1442          * we find a target, if we have read more than X
1443          * number of descriptors, and the target is reasonably
1444          * empty, relink him to top of his chain.
1445          *
1446          * We've read 0 extra blocks and only send one more to
1447          * the transaction, yet the next guy to search has a
1448          * much easier time.
1449          *
1450          * Do this *after* figuring out how many bits we're taking out
1451          * of our target group.
1452          */
1453         if (ac->ac_allow_chain_relink &&
1454             (prev_group_bh) &&
1455             (ocfs2_block_group_reasonably_empty(bg, *num_bits))) {
1456                 status = ocfs2_relink_block_group(handle, alloc_inode,
1457                                                   ac->ac_bh, group_bh,
1458                                                   prev_group_bh, chain);
1459                 if (status < 0) {
1460                         mlog_errno(status);
1461                         goto bail;
1462                 }
1463         }
1464
1465         /* Ok, claim our bits now: set the info on dinode, chainlist
1466          * and then the group */
1467         status = ocfs2_journal_access_di(handle,
1468                                          INODE_CACHE(alloc_inode),
1469                                          ac->ac_bh,
1470                                          OCFS2_JOURNAL_ACCESS_WRITE);
1471         if (status < 0) {
1472                 mlog_errno(status);
1473                 goto bail;
1474         }
1475
1476         tmp_used = le32_to_cpu(fe->id1.bitmap1.i_used);
1477         fe->id1.bitmap1.i_used = cpu_to_le32(*num_bits + tmp_used);
1478         le32_add_cpu(&cl->cl_recs[chain].c_free, -(*num_bits));
1479
1480         status = ocfs2_journal_dirty(handle,
1481                                      ac->ac_bh);
1482         if (status < 0) {
1483                 mlog_errno(status);
1484                 goto bail;
1485         }
1486
1487         status = ocfs2_block_group_set_bits(handle,
1488                                             alloc_inode,
1489                                             bg,
1490                                             group_bh,
1491                                             *bit_off,
1492                                             *num_bits);
1493         if (status < 0) {
1494                 mlog_errno(status);
1495                 goto bail;
1496         }
1497
1498         mlog(0, "Allocated %u bits from suballocator %llu\n", *num_bits,
1499              (unsigned long long)le64_to_cpu(fe->i_blkno));
1500
1501         *bg_blkno = le64_to_cpu(bg->bg_blkno);
1502         *bits_left = le16_to_cpu(bg->bg_free_bits_count);
1503 bail:
1504         brelse(group_bh);
1505         brelse(prev_group_bh);
1506
1507         mlog_exit(status);
1508         return status;
1509 }
1510
1511 /* will give out up to bits_wanted contiguous bits. */
1512 static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb,
1513                                      struct ocfs2_alloc_context *ac,
1514                                      handle_t *handle,
1515                                      u32 bits_wanted,
1516                                      u32 min_bits,
1517                                      u16 *bit_off,
1518                                      unsigned int *num_bits,
1519                                      u64 *bg_blkno)
1520 {
1521         int status;
1522         u16 victim, i;
1523         u16 bits_left = 0;
1524         u64 hint_blkno = ac->ac_last_group;
1525         struct ocfs2_chain_list *cl;
1526         struct ocfs2_dinode *fe;
1527
1528         mlog_entry_void();
1529
1530         BUG_ON(ac->ac_bits_given >= ac->ac_bits_wanted);
1531         BUG_ON(bits_wanted > (ac->ac_bits_wanted - ac->ac_bits_given));
1532         BUG_ON(!ac->ac_bh);
1533
1534         fe = (struct ocfs2_dinode *) ac->ac_bh->b_data;
1535
1536         /* The bh was validated by the inode read during
1537          * ocfs2_reserve_suballoc_bits().  Any corruption is a code bug. */
1538         BUG_ON(!OCFS2_IS_VALID_DINODE(fe));
1539
1540         if (le32_to_cpu(fe->id1.bitmap1.i_used) >=
1541             le32_to_cpu(fe->id1.bitmap1.i_total)) {
1542                 ocfs2_error(osb->sb, "Chain allocator dinode %llu has %u used "
1543                             "bits but only %u total.",
1544                             (unsigned long long)le64_to_cpu(fe->i_blkno),
1545                             le32_to_cpu(fe->id1.bitmap1.i_used),
1546                             le32_to_cpu(fe->id1.bitmap1.i_total));
1547                 status = -EIO;
1548                 goto bail;
1549         }
1550
1551         if (hint_blkno) {
1552                 /* Attempt to short-circuit the usual search mechanism
1553                  * by jumping straight to the most recently used
1554                  * allocation group. This helps us mantain some
1555                  * contiguousness across allocations. */
1556                 status = ocfs2_search_one_group(ac, handle, bits_wanted,
1557                                                 min_bits, bit_off, num_bits,
1558                                                 hint_blkno, &bits_left);
1559                 if (!status) {
1560                         /* Be careful to update *bg_blkno here as the
1561                          * caller is expecting it to be filled in, and
1562                          * ocfs2_search_one_group() won't do that for
1563                          * us. */
1564                         *bg_blkno = hint_blkno;
1565                         goto set_hint;
1566                 }
1567                 if (status < 0 && status != -ENOSPC) {
1568                         mlog_errno(status);
1569                         goto bail;
1570                 }
1571         }
1572
1573         cl = (struct ocfs2_chain_list *) &fe->id2.i_chain;
1574
1575         victim = ocfs2_find_victim_chain(cl);
1576         ac->ac_chain = victim;
1577         ac->ac_allow_chain_relink = 1;
1578
1579         status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits, bit_off,
1580                                     num_bits, bg_blkno, &bits_left);
1581         if (!status)
1582                 goto set_hint;
1583         if (status < 0 && status != -ENOSPC) {
1584                 mlog_errno(status);
1585                 goto bail;
1586         }
1587
1588         mlog(0, "Search of victim chain %u came up with nothing, "
1589              "trying all chains now.\n", victim);
1590
1591         /* If we didn't pick a good victim, then just default to
1592          * searching each chain in order. Don't allow chain relinking
1593          * because we only calculate enough journal credits for one
1594          * relink per alloc. */
1595         ac->ac_allow_chain_relink = 0;
1596         for (i = 0; i < le16_to_cpu(cl->cl_next_free_rec); i ++) {
1597                 if (i == victim)
1598                         continue;
1599                 if (!cl->cl_recs[i].c_free)
1600                         continue;
1601
1602                 ac->ac_chain = i;
1603                 status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits,
1604                                             bit_off, num_bits, bg_blkno,
1605                                             &bits_left);
1606                 if (!status)
1607                         break;
1608                 if (status < 0 && status != -ENOSPC) {
1609                         mlog_errno(status);
1610                         goto bail;
1611                 }
1612         }
1613
1614 set_hint:
1615         if (status != -ENOSPC) {
1616                 /* If the next search of this group is not likely to
1617                  * yield a suitable extent, then we reset the last
1618                  * group hint so as to not waste a disk read */
1619                 if (bits_left < min_bits)
1620                         ac->ac_last_group = 0;
1621                 else
1622                         ac->ac_last_group = *bg_blkno;
1623         }
1624
1625 bail:
1626         mlog_exit(status);
1627         return status;
1628 }
1629
1630 int ocfs2_claim_metadata(struct ocfs2_super *osb,
1631                          handle_t *handle,
1632                          struct ocfs2_alloc_context *ac,
1633                          u32 bits_wanted,
1634                          u16 *suballoc_bit_start,
1635                          unsigned int *num_bits,
1636                          u64 *blkno_start)
1637 {
1638         int status;
1639         u64 bg_blkno;
1640
1641         BUG_ON(!ac);
1642         BUG_ON(ac->ac_bits_wanted < (ac->ac_bits_given + bits_wanted));
1643         BUG_ON(ac->ac_which != OCFS2_AC_USE_META);
1644
1645         status = ocfs2_claim_suballoc_bits(osb,
1646                                            ac,
1647                                            handle,
1648                                            bits_wanted,
1649                                            1,
1650                                            suballoc_bit_start,
1651                                            num_bits,
1652                                            &bg_blkno);
1653         if (status < 0) {
1654                 mlog_errno(status);
1655                 goto bail;
1656         }
1657         atomic_inc(&osb->alloc_stats.bg_allocs);
1658
1659         *blkno_start = bg_blkno + (u64) *suballoc_bit_start;
1660         ac->ac_bits_given += (*num_bits);
1661         status = 0;
1662 bail:
1663         mlog_exit(status);
1664         return status;
1665 }
1666
1667 static void ocfs2_init_inode_ac_group(struct inode *dir,
1668                                       struct buffer_head *parent_fe_bh,
1669                                       struct ocfs2_alloc_context *ac)
1670 {
1671         struct ocfs2_dinode *fe = (struct ocfs2_dinode *)parent_fe_bh->b_data;
1672         /*
1673          * Try to allocate inodes from some specific group.
1674          *
1675          * If the parent dir has recorded the last group used in allocation,
1676          * cool, use it. Otherwise if we try to allocate new inode from the
1677          * same slot the parent dir belongs to, use the same chunk.
1678          *
1679          * We are very careful here to avoid the mistake of setting
1680          * ac_last_group to a group descriptor from a different (unlocked) slot.
1681          */
1682         if (OCFS2_I(dir)->ip_last_used_group &&
1683             OCFS2_I(dir)->ip_last_used_slot == ac->ac_alloc_slot)
1684                 ac->ac_last_group = OCFS2_I(dir)->ip_last_used_group;
1685         else if (le16_to_cpu(fe->i_suballoc_slot) == ac->ac_alloc_slot)
1686                 ac->ac_last_group = ocfs2_which_suballoc_group(
1687                                         le64_to_cpu(fe->i_blkno),
1688                                         le16_to_cpu(fe->i_suballoc_bit));
1689 }
1690
1691 static inline void ocfs2_save_inode_ac_group(struct inode *dir,
1692                                              struct ocfs2_alloc_context *ac)
1693 {
1694         OCFS2_I(dir)->ip_last_used_group = ac->ac_last_group;
1695         OCFS2_I(dir)->ip_last_used_slot = ac->ac_alloc_slot;
1696 }
1697
1698 int ocfs2_claim_new_inode(struct ocfs2_super *osb,
1699                           handle_t *handle,
1700                           struct inode *dir,
1701                           struct buffer_head *parent_fe_bh,
1702                           struct ocfs2_alloc_context *ac,
1703                           u16 *suballoc_bit,
1704                           u64 *fe_blkno)
1705 {
1706         int status;
1707         unsigned int num_bits;
1708         u64 bg_blkno;
1709
1710         mlog_entry_void();
1711
1712         BUG_ON(!ac);
1713         BUG_ON(ac->ac_bits_given != 0);
1714         BUG_ON(ac->ac_bits_wanted != 1);
1715         BUG_ON(ac->ac_which != OCFS2_AC_USE_INODE);
1716
1717         ocfs2_init_inode_ac_group(dir, parent_fe_bh, ac);
1718
1719         status = ocfs2_claim_suballoc_bits(osb,
1720                                            ac,
1721                                            handle,
1722                                            1,
1723                                            1,
1724                                            suballoc_bit,
1725                                            &num_bits,
1726                                            &bg_blkno);
1727         if (status < 0) {
1728                 mlog_errno(status);
1729                 goto bail;
1730         }
1731         atomic_inc(&osb->alloc_stats.bg_allocs);
1732
1733         BUG_ON(num_bits != 1);
1734
1735         *fe_blkno = bg_blkno + (u64) (*suballoc_bit);
1736         ac->ac_bits_given++;
1737         ocfs2_save_inode_ac_group(dir, ac);
1738         status = 0;
1739 bail:
1740         mlog_exit(status);
1741         return status;
1742 }
1743
1744 /* translate a group desc. blkno and it's bitmap offset into
1745  * disk cluster offset. */
1746 static inline u32 ocfs2_desc_bitmap_to_cluster_off(struct inode *inode,
1747                                                    u64 bg_blkno,
1748                                                    u16 bg_bit_off)
1749 {
1750         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1751         u32 cluster = 0;
1752
1753         BUG_ON(!ocfs2_is_cluster_bitmap(inode));
1754
1755         if (bg_blkno != osb->first_cluster_group_blkno)
1756                 cluster = ocfs2_blocks_to_clusters(inode->i_sb, bg_blkno);
1757         cluster += (u32) bg_bit_off;
1758         return cluster;
1759 }
1760
1761 /* given a cluster offset, calculate which block group it belongs to
1762  * and return that block offset. */
1763 u64 ocfs2_which_cluster_group(struct inode *inode, u32 cluster)
1764 {
1765         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1766         u32 group_no;
1767
1768         BUG_ON(!ocfs2_is_cluster_bitmap(inode));
1769
1770         group_no = cluster / osb->bitmap_cpg;
1771         if (!group_no)
1772                 return osb->first_cluster_group_blkno;
1773         return ocfs2_clusters_to_blocks(inode->i_sb,
1774                                         group_no * osb->bitmap_cpg);
1775 }
1776
1777 /* given the block number of a cluster start, calculate which cluster
1778  * group and descriptor bitmap offset that corresponds to. */
1779 static inline void ocfs2_block_to_cluster_group(struct inode *inode,
1780                                                 u64 data_blkno,
1781                                                 u64 *bg_blkno,
1782                                                 u16 *bg_bit_off)
1783 {
1784         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1785         u32 data_cluster = ocfs2_blocks_to_clusters(osb->sb, data_blkno);
1786
1787         BUG_ON(!ocfs2_is_cluster_bitmap(inode));
1788
1789         *bg_blkno = ocfs2_which_cluster_group(inode,
1790                                               data_cluster);
1791
1792         if (*bg_blkno == osb->first_cluster_group_blkno)
1793                 *bg_bit_off = (u16) data_cluster;
1794         else
1795                 *bg_bit_off = (u16) ocfs2_blocks_to_clusters(osb->sb,
1796                                                              data_blkno - *bg_blkno);
1797 }
1798
1799 /*
1800  * min_bits - minimum contiguous chunk from this total allocation we
1801  * can handle. set to what we asked for originally for a full
1802  * contig. allocation, set to '1' to indicate we can deal with extents
1803  * of any size.
1804  */
1805 int __ocfs2_claim_clusters(struct ocfs2_super *osb,
1806                            handle_t *handle,
1807                            struct ocfs2_alloc_context *ac,
1808                            u32 min_clusters,
1809                            u32 max_clusters,
1810                            u32 *cluster_start,
1811                            u32 *num_clusters)
1812 {
1813         int status;
1814         unsigned int bits_wanted = max_clusters;
1815         u64 bg_blkno = 0;
1816         u16 bg_bit_off;
1817
1818         mlog_entry_void();
1819
1820         BUG_ON(ac->ac_bits_given >= ac->ac_bits_wanted);
1821
1822         BUG_ON(ac->ac_which != OCFS2_AC_USE_LOCAL
1823                && ac->ac_which != OCFS2_AC_USE_MAIN);
1824
1825         if (ac->ac_which == OCFS2_AC_USE_LOCAL) {
1826                 status = ocfs2_claim_local_alloc_bits(osb,
1827                                                       handle,
1828                                                       ac,
1829                                                       bits_wanted,
1830                                                       cluster_start,
1831                                                       num_clusters);
1832                 if (!status)
1833                         atomic_inc(&osb->alloc_stats.local_data);
1834         } else {
1835                 if (min_clusters > (osb->bitmap_cpg - 1)) {
1836                         /* The only paths asking for contiguousness
1837                          * should know about this already. */
1838                         mlog(ML_ERROR, "minimum allocation requested %u exceeds "
1839                              "group bitmap size %u!\n", min_clusters,
1840                              osb->bitmap_cpg);
1841                         status = -ENOSPC;
1842                         goto bail;
1843                 }
1844                 /* clamp the current request down to a realistic size. */
1845                 if (bits_wanted > (osb->bitmap_cpg - 1))
1846                         bits_wanted = osb->bitmap_cpg - 1;
1847
1848                 status = ocfs2_claim_suballoc_bits(osb,
1849                                                    ac,
1850                                                    handle,
1851                                                    bits_wanted,
1852                                                    min_clusters,
1853                                                    &bg_bit_off,
1854                                                    num_clusters,
1855                                                    &bg_blkno);
1856                 if (!status) {
1857                         *cluster_start =
1858                                 ocfs2_desc_bitmap_to_cluster_off(ac->ac_inode,
1859                                                                  bg_blkno,
1860                                                                  bg_bit_off);
1861                         atomic_inc(&osb->alloc_stats.bitmap_data);
1862                 }
1863         }
1864         if (status < 0) {
1865                 if (status != -ENOSPC)
1866                         mlog_errno(status);
1867                 goto bail;
1868         }
1869
1870         ac->ac_bits_given += *num_clusters;
1871
1872 bail:
1873         mlog_exit(status);
1874         return status;
1875 }
1876
1877 int ocfs2_claim_clusters(struct ocfs2_super *osb,
1878                          handle_t *handle,
1879                          struct ocfs2_alloc_context *ac,
1880                          u32 min_clusters,
1881                          u32 *cluster_start,
1882                          u32 *num_clusters)
1883 {
1884         unsigned int bits_wanted = ac->ac_bits_wanted - ac->ac_bits_given;
1885
1886         return __ocfs2_claim_clusters(osb, handle, ac, min_clusters,
1887                                       bits_wanted, cluster_start, num_clusters);
1888 }
1889
1890 static inline int ocfs2_block_group_clear_bits(handle_t *handle,
1891                                                struct inode *alloc_inode,
1892                                                struct ocfs2_group_desc *bg,
1893                                                struct buffer_head *group_bh,
1894                                                unsigned int bit_off,
1895                                                unsigned int num_bits)
1896 {
1897         int status;
1898         unsigned int tmp;
1899         int journal_type = OCFS2_JOURNAL_ACCESS_WRITE;
1900         struct ocfs2_group_desc *undo_bg = NULL;
1901         int cluster_bitmap = 0;
1902
1903         mlog_entry_void();
1904
1905         /* The caller got this descriptor from
1906          * ocfs2_read_group_descriptor().  Any corruption is a code bug. */
1907         BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
1908
1909         mlog(0, "off = %u, num = %u\n", bit_off, num_bits);
1910
1911         if (ocfs2_is_cluster_bitmap(alloc_inode))
1912                 journal_type = OCFS2_JOURNAL_ACCESS_UNDO;
1913
1914         status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode),
1915                                          group_bh, journal_type);
1916         if (status < 0) {
1917                 mlog_errno(status);
1918                 goto bail;
1919         }
1920
1921         if (ocfs2_is_cluster_bitmap(alloc_inode))
1922                 cluster_bitmap = 1;
1923
1924         if (cluster_bitmap) {
1925                 jbd_lock_bh_state(group_bh);
1926                 undo_bg = (struct ocfs2_group_desc *)
1927                                         bh2jh(group_bh)->b_committed_data;
1928                 BUG_ON(!undo_bg);
1929         }
1930
1931         tmp = num_bits;
1932         while(tmp--) {
1933                 ocfs2_clear_bit((bit_off + tmp),
1934                                 (unsigned long *) bg->bg_bitmap);
1935                 if (cluster_bitmap)
1936                         ocfs2_set_bit(bit_off + tmp,
1937                                       (unsigned long *) undo_bg->bg_bitmap);
1938         }
1939         le16_add_cpu(&bg->bg_free_bits_count, num_bits);
1940
1941         if (cluster_bitmap)
1942                 jbd_unlock_bh_state(group_bh);
1943
1944         status = ocfs2_journal_dirty(handle, group_bh);
1945         if (status < 0)
1946                 mlog_errno(status);
1947 bail:
1948         return status;
1949 }
1950
1951 /*
1952  * expects the suballoc inode to already be locked.
1953  */
1954 int ocfs2_free_suballoc_bits(handle_t *handle,
1955                              struct inode *alloc_inode,
1956                              struct buffer_head *alloc_bh,
1957                              unsigned int start_bit,
1958                              u64 bg_blkno,
1959                              unsigned int count)
1960 {
1961         int status = 0;
1962         u32 tmp_used;
1963         struct ocfs2_dinode *fe = (struct ocfs2_dinode *) alloc_bh->b_data;
1964         struct ocfs2_chain_list *cl = &fe->id2.i_chain;
1965         struct buffer_head *group_bh = NULL;
1966         struct ocfs2_group_desc *group;
1967
1968         mlog_entry_void();
1969
1970         /* The alloc_bh comes from ocfs2_free_dinode() or
1971          * ocfs2_free_clusters().  The callers have all locked the
1972          * allocator and gotten alloc_bh from the lock call.  This
1973          * validates the dinode buffer.  Any corruption that has happended
1974          * is a code bug. */
1975         BUG_ON(!OCFS2_IS_VALID_DINODE(fe));
1976         BUG_ON((count + start_bit) > ocfs2_bits_per_group(cl));
1977
1978         mlog(0, "%llu: freeing %u bits from group %llu, starting at %u\n",
1979              (unsigned long long)OCFS2_I(alloc_inode)->ip_blkno, count,
1980              (unsigned long long)bg_blkno, start_bit);
1981
1982         status = ocfs2_read_group_descriptor(alloc_inode, fe, bg_blkno,
1983                                              &group_bh);
1984         if (status < 0) {
1985                 mlog_errno(status);
1986                 goto bail;
1987         }
1988         group = (struct ocfs2_group_desc *) group_bh->b_data;
1989
1990         BUG_ON((count + start_bit) > le16_to_cpu(group->bg_bits));
1991
1992         status = ocfs2_block_group_clear_bits(handle, alloc_inode,
1993                                               group, group_bh,
1994                                               start_bit, count);
1995         if (status < 0) {
1996                 mlog_errno(status);
1997                 goto bail;
1998         }
1999
2000         status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode),
2001                                          alloc_bh, OCFS2_JOURNAL_ACCESS_WRITE);
2002         if (status < 0) {
2003                 mlog_errno(status);
2004                 goto bail;
2005         }
2006
2007         le32_add_cpu(&cl->cl_recs[le16_to_cpu(group->bg_chain)].c_free,
2008                      count);
2009         tmp_used = le32_to_cpu(fe->id1.bitmap1.i_used);
2010         fe->id1.bitmap1.i_used = cpu_to_le32(tmp_used - count);
2011
2012         status = ocfs2_journal_dirty(handle, alloc_bh);
2013         if (status < 0) {
2014                 mlog_errno(status);
2015                 goto bail;
2016         }
2017
2018 bail:
2019         brelse(group_bh);
2020
2021         mlog_exit(status);
2022         return status;
2023 }
2024
2025 int ocfs2_free_dinode(handle_t *handle,
2026                       struct inode *inode_alloc_inode,
2027                       struct buffer_head *inode_alloc_bh,
2028                       struct ocfs2_dinode *di)
2029 {
2030         u64 blk = le64_to_cpu(di->i_blkno);
2031         u16 bit = le16_to_cpu(di->i_suballoc_bit);
2032         u64 bg_blkno = ocfs2_which_suballoc_group(blk, bit);
2033
2034         return ocfs2_free_suballoc_bits(handle, inode_alloc_inode,
2035                                         inode_alloc_bh, bit, bg_blkno, 1);
2036 }
2037
2038 int ocfs2_free_clusters(handle_t *handle,
2039                        struct inode *bitmap_inode,
2040                        struct buffer_head *bitmap_bh,
2041                        u64 start_blk,
2042                        unsigned int num_clusters)
2043 {
2044         int status;
2045         u16 bg_start_bit;
2046         u64 bg_blkno;
2047         struct ocfs2_dinode *fe;
2048
2049         /* You can't ever have a contiguous set of clusters
2050          * bigger than a block group bitmap so we never have to worry
2051          * about looping on them. */
2052
2053         mlog_entry_void();
2054
2055         /* This is expensive. We can safely remove once this stuff has
2056          * gotten tested really well. */
2057         BUG_ON(start_blk != ocfs2_clusters_to_blocks(bitmap_inode->i_sb, ocfs2_blocks_to_clusters(bitmap_inode->i_sb, start_blk)));
2058
2059         fe = (struct ocfs2_dinode *) bitmap_bh->b_data;
2060
2061         ocfs2_block_to_cluster_group(bitmap_inode, start_blk, &bg_blkno,
2062                                      &bg_start_bit);
2063
2064         mlog(0, "want to free %u clusters starting at block %llu\n",
2065              num_clusters, (unsigned long long)start_blk);
2066         mlog(0, "bg_blkno = %llu, bg_start_bit = %u\n",
2067              (unsigned long long)bg_blkno, bg_start_bit);
2068
2069         status = ocfs2_free_suballoc_bits(handle, bitmap_inode, bitmap_bh,
2070                                           bg_start_bit, bg_blkno,
2071                                           num_clusters);
2072         if (status < 0) {
2073                 mlog_errno(status);
2074                 goto out;
2075         }
2076
2077         ocfs2_local_alloc_seen_free_bits(OCFS2_SB(bitmap_inode->i_sb),
2078                                          num_clusters);
2079
2080 out:
2081         mlog_exit(status);
2082         return status;
2083 }
2084
2085 static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg)
2086 {
2087         printk("Block Group:\n");
2088         printk("bg_signature:       %s\n", bg->bg_signature);
2089         printk("bg_size:            %u\n", bg->bg_size);
2090         printk("bg_bits:            %u\n", bg->bg_bits);
2091         printk("bg_free_bits_count: %u\n", bg->bg_free_bits_count);
2092         printk("bg_chain:           %u\n", bg->bg_chain);
2093         printk("bg_generation:      %u\n", le32_to_cpu(bg->bg_generation));
2094         printk("bg_next_group:      %llu\n",
2095                (unsigned long long)bg->bg_next_group);
2096         printk("bg_parent_dinode:   %llu\n",
2097                (unsigned long long)bg->bg_parent_dinode);
2098         printk("bg_blkno:           %llu\n",
2099                (unsigned long long)bg->bg_blkno);
2100 }
2101
2102 static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe)
2103 {
2104         int i;
2105
2106         printk("Suballoc Inode %llu:\n", (unsigned long long)fe->i_blkno);
2107         printk("i_signature:                  %s\n", fe->i_signature);
2108         printk("i_size:                       %llu\n",
2109                (unsigned long long)fe->i_size);
2110         printk("i_clusters:                   %u\n", fe->i_clusters);
2111         printk("i_generation:                 %u\n",
2112                le32_to_cpu(fe->i_generation));
2113         printk("id1.bitmap1.i_used:           %u\n",
2114                le32_to_cpu(fe->id1.bitmap1.i_used));
2115         printk("id1.bitmap1.i_total:          %u\n",
2116                le32_to_cpu(fe->id1.bitmap1.i_total));
2117         printk("id2.i_chain.cl_cpg:           %u\n", fe->id2.i_chain.cl_cpg);
2118         printk("id2.i_chain.cl_bpc:           %u\n", fe->id2.i_chain.cl_bpc);
2119         printk("id2.i_chain.cl_count:         %u\n", fe->id2.i_chain.cl_count);
2120         printk("id2.i_chain.cl_next_free_rec: %u\n",
2121                fe->id2.i_chain.cl_next_free_rec);
2122         for(i = 0; i < fe->id2.i_chain.cl_next_free_rec; i++) {
2123                 printk("fe->id2.i_chain.cl_recs[%d].c_free:  %u\n", i,
2124                        fe->id2.i_chain.cl_recs[i].c_free);
2125                 printk("fe->id2.i_chain.cl_recs[%d].c_total: %u\n", i,
2126                        fe->id2.i_chain.cl_recs[i].c_total);
2127                 printk("fe->id2.i_chain.cl_recs[%d].c_blkno: %llu\n", i,
2128                        (unsigned long long)fe->id2.i_chain.cl_recs[i].c_blkno);
2129         }
2130 }
2131
2132 /*
2133  * For a given allocation, determine which allocators will need to be
2134  * accessed, and lock them, reserving the appropriate number of bits.
2135  *
2136  * Sparse file systems call this from ocfs2_write_begin_nolock()
2137  * and ocfs2_allocate_unwritten_extents().
2138  *
2139  * File systems which don't support holes call this from
2140  * ocfs2_extend_allocation().
2141  */
2142 int ocfs2_lock_allocators(struct inode *inode,
2143                           struct ocfs2_extent_tree *et,
2144                           u32 clusters_to_add, u32 extents_to_split,
2145                           struct ocfs2_alloc_context **data_ac,
2146                           struct ocfs2_alloc_context **meta_ac)
2147 {
2148         int ret = 0, num_free_extents;
2149         unsigned int max_recs_needed = clusters_to_add + 2 * extents_to_split;
2150         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2151
2152         *meta_ac = NULL;
2153         if (data_ac)
2154                 *data_ac = NULL;
2155
2156         BUG_ON(clusters_to_add != 0 && data_ac == NULL);
2157
2158         num_free_extents = ocfs2_num_free_extents(osb, et);
2159         if (num_free_extents < 0) {
2160                 ret = num_free_extents;
2161                 mlog_errno(ret);
2162                 goto out;
2163         }
2164
2165         /*
2166          * Sparse allocation file systems need to be more conservative
2167          * with reserving room for expansion - the actual allocation
2168          * happens while we've got a journal handle open so re-taking
2169          * a cluster lock (because we ran out of room for another
2170          * extent) will violate ordering rules.
2171          *
2172          * Most of the time we'll only be seeing this 1 cluster at a time
2173          * anyway.
2174          *
2175          * Always lock for any unwritten extents - we might want to
2176          * add blocks during a split.
2177          */
2178         if (!num_free_extents ||
2179             (ocfs2_sparse_alloc(osb) && num_free_extents < max_recs_needed)) {
2180                 ret = ocfs2_reserve_new_metadata(osb, et->et_root_el, meta_ac);
2181                 if (ret < 0) {
2182                         if (ret != -ENOSPC)
2183                                 mlog_errno(ret);
2184                         goto out;
2185                 }
2186         }
2187
2188         if (clusters_to_add == 0)
2189                 goto out;
2190
2191         ret = ocfs2_reserve_clusters(osb, clusters_to_add, data_ac);
2192         if (ret < 0) {
2193                 if (ret != -ENOSPC)
2194                         mlog_errno(ret);
2195                 goto out;
2196         }
2197
2198 out:
2199         if (ret) {
2200                 if (*meta_ac) {
2201                         ocfs2_free_alloc_context(*meta_ac);
2202                         *meta_ac = NULL;
2203                 }
2204
2205                 /*
2206                  * We cannot have an error and a non null *data_ac.
2207                  */
2208         }
2209
2210         return ret;
2211 }
2212
2213 /*
2214  * Read the inode specified by blkno to get suballoc_slot and
2215  * suballoc_bit.
2216  */
2217 static int ocfs2_get_suballoc_slot_bit(struct ocfs2_super *osb, u64 blkno,
2218                                        u16 *suballoc_slot, u16 *suballoc_bit)
2219 {
2220         int status;
2221         struct buffer_head *inode_bh = NULL;
2222         struct ocfs2_dinode *inode_fe;
2223
2224         mlog_entry("blkno: %llu\n", (unsigned long long)blkno);
2225
2226         /* dirty read disk */
2227         status = ocfs2_read_blocks_sync(osb, blkno, 1, &inode_bh);
2228         if (status < 0) {
2229                 mlog(ML_ERROR, "read block %llu failed %d\n",
2230                      (unsigned long long)blkno, status);
2231                 goto bail;
2232         }
2233
2234         inode_fe = (struct ocfs2_dinode *) inode_bh->b_data;
2235         if (!OCFS2_IS_VALID_DINODE(inode_fe)) {
2236                 mlog(ML_ERROR, "invalid inode %llu requested\n",
2237                      (unsigned long long)blkno);
2238                 status = -EINVAL;
2239                 goto bail;
2240         }
2241
2242         if (le16_to_cpu(inode_fe->i_suballoc_slot) != (u16)OCFS2_INVALID_SLOT &&
2243             (u32)le16_to_cpu(inode_fe->i_suballoc_slot) > osb->max_slots - 1) {
2244                 mlog(ML_ERROR, "inode %llu has invalid suballoc slot %u\n",
2245                      (unsigned long long)blkno,
2246                      (u32)le16_to_cpu(inode_fe->i_suballoc_slot));
2247                 status = -EINVAL;
2248                 goto bail;
2249         }
2250
2251         if (suballoc_slot)
2252                 *suballoc_slot = le16_to_cpu(inode_fe->i_suballoc_slot);
2253         if (suballoc_bit)
2254                 *suballoc_bit = le16_to_cpu(inode_fe->i_suballoc_bit);
2255
2256 bail:
2257         brelse(inode_bh);
2258
2259         mlog_exit(status);
2260         return status;
2261 }
2262
2263 /*
2264  * test whether bit is SET in allocator bitmap or not.  on success, 0
2265  * is returned and *res is 1 for SET; 0 otherwise.  when fails, errno
2266  * is returned and *res is meaningless.  Call this after you have
2267  * cluster locked against suballoc, or you may get a result based on
2268  * non-up2date contents
2269  */
2270 static int ocfs2_test_suballoc_bit(struct ocfs2_super *osb,
2271                                    struct inode *suballoc,
2272                                    struct buffer_head *alloc_bh, u64 blkno,
2273                                    u16 bit, int *res)
2274 {
2275         struct ocfs2_dinode *alloc_fe;
2276         struct ocfs2_group_desc *group;
2277         struct buffer_head *group_bh = NULL;
2278         u64 bg_blkno;
2279         int status;
2280
2281         mlog_entry("blkno: %llu bit: %u\n", (unsigned long long)blkno,
2282                    (unsigned int)bit);
2283
2284         alloc_fe = (struct ocfs2_dinode *)alloc_bh->b_data;
2285         if ((bit + 1) > ocfs2_bits_per_group(&alloc_fe->id2.i_chain)) {
2286                 mlog(ML_ERROR, "suballoc bit %u out of range of %u\n",
2287                      (unsigned int)bit,
2288                      ocfs2_bits_per_group(&alloc_fe->id2.i_chain));
2289                 status = -EINVAL;
2290                 goto bail;
2291         }
2292
2293         bg_blkno = ocfs2_which_suballoc_group(blkno, bit);
2294         status = ocfs2_read_group_descriptor(suballoc, alloc_fe, bg_blkno,
2295                                              &group_bh);
2296         if (status < 0) {
2297                 mlog(ML_ERROR, "read group %llu failed %d\n",
2298                      (unsigned long long)bg_blkno, status);
2299                 goto bail;
2300         }
2301
2302         group = (struct ocfs2_group_desc *) group_bh->b_data;
2303         *res = ocfs2_test_bit(bit, (unsigned long *)group->bg_bitmap);
2304
2305 bail:
2306         brelse(group_bh);
2307
2308         mlog_exit(status);
2309         return status;
2310 }
2311
2312 /*
2313  * Test if the bit representing this inode (blkno) is set in the
2314  * suballocator.
2315  *
2316  * On success, 0 is returned and *res is 1 for SET; 0 otherwise.
2317  *
2318  * In the event of failure, a negative value is returned and *res is
2319  * meaningless.
2320  *
2321  * Callers must make sure to hold nfs_sync_lock to prevent
2322  * ocfs2_delete_inode() on another node from accessing the same
2323  * suballocator concurrently.
2324  */
2325 int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res)
2326 {
2327         int status;
2328         u16 suballoc_bit = 0, suballoc_slot = 0;
2329         struct inode *inode_alloc_inode;
2330         struct buffer_head *alloc_bh = NULL;
2331
2332         mlog_entry("blkno: %llu", (unsigned long long)blkno);
2333
2334         status = ocfs2_get_suballoc_slot_bit(osb, blkno, &suballoc_slot,
2335                                              &suballoc_bit);
2336         if (status < 0) {
2337                 mlog(ML_ERROR, "get alloc slot and bit failed %d\n", status);
2338                 goto bail;
2339         }
2340
2341         inode_alloc_inode =
2342                 ocfs2_get_system_file_inode(osb, INODE_ALLOC_SYSTEM_INODE,
2343                                             suballoc_slot);
2344         if (!inode_alloc_inode) {
2345                 /* the error code could be inaccurate, but we are not able to
2346                  * get the correct one. */
2347                 status = -EINVAL;
2348                 mlog(ML_ERROR, "unable to get alloc inode in slot %u\n",
2349                      (u32)suballoc_slot);
2350                 goto bail;
2351         }
2352
2353         mutex_lock(&inode_alloc_inode->i_mutex);
2354         status = ocfs2_inode_lock(inode_alloc_inode, &alloc_bh, 0);
2355         if (status < 0) {
2356                 mutex_unlock(&inode_alloc_inode->i_mutex);
2357                 mlog(ML_ERROR, "lock on alloc inode on slot %u failed %d\n",
2358                      (u32)suballoc_slot, status);
2359                 goto bail;
2360         }
2361
2362         status = ocfs2_test_suballoc_bit(osb, inode_alloc_inode, alloc_bh,
2363                                          blkno, suballoc_bit, res);
2364         if (status < 0)
2365                 mlog(ML_ERROR, "test suballoc bit failed %d\n", status);
2366
2367         ocfs2_inode_unlock(inode_alloc_inode, 0);
2368         mutex_unlock(&inode_alloc_inode->i_mutex);
2369
2370         iput(inode_alloc_inode);
2371         brelse(alloc_bh);
2372 bail:
2373         mlog_exit(status);
2374         return status;
2375 }