Btrfs: Add run time btree defrag, and an ioctl to force btree defrag
[firefly-linux-kernel-4.4.55.git] / fs / btrfs / transaction.c
1 /*
2  * Copyright (C) 2007 Oracle.  All rights reserved.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU General Public
6  * License v2 as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it will be useful,
9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11  * General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public
14  * License along with this program; if not, write to the
15  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16  * Boston, MA 02111-1307, USA.
17  */
18
19 #include <linux/fs.h>
20 #include <linux/sched.h>
21 #include "ctree.h"
22 #include "disk-io.h"
23 #include "transaction.h"
24
/*
 * Number of btrfs_transaction objects currently allocated by this file:
 * bumped in join_transaction(), dropped in put_transaction().
 */
static int total_trans;

/* slab caches defined elsewhere; handles and transactions come from these */
extern struct kmem_cache *btrfs_trans_handle_cachep;
extern struct kmem_cache *btrfs_transaction_cachep;

/* workqueue that runs the delayed fs_info->trans_work item */
static struct workqueue_struct *trans_wq;

/*
 * Tags set on fs_info->fs_roots_radix by record_root_in_trans():
 * TRANS_TAG is consumed by add_dirty_roots() at commit time,
 * DEFRAG_TAG is consumed by btrfs_defrag_dirty_roots().
 */
#define BTRFS_ROOT_TRANS_TAG 0
#define BTRFS_ROOT_DEFRAG_TAG 1
33
34 static void put_transaction(struct btrfs_transaction *transaction)
35 {
36         WARN_ON(transaction->use_count == 0);
37         transaction->use_count--;
38         if (transaction->use_count == 0) {
39                 WARN_ON(total_trans == 0);
40                 total_trans--;
41                 list_del_init(&transaction->list);
42                 memset(transaction, 0, sizeof(*transaction));
43                 kmem_cache_free(btrfs_transaction_cachep, transaction);
44         }
45 }
46
47 static int join_transaction(struct btrfs_root *root)
48 {
49         struct btrfs_transaction *cur_trans;
50         cur_trans = root->fs_info->running_transaction;
51         if (!cur_trans) {
52                 cur_trans = kmem_cache_alloc(btrfs_transaction_cachep,
53                                              GFP_NOFS);
54                 total_trans++;
55                 BUG_ON(!cur_trans);
56                 root->fs_info->generation++;
57                 root->fs_info->running_transaction = cur_trans;
58                 cur_trans->num_writers = 0;
59                 cur_trans->transid = root->fs_info->generation;
60                 init_waitqueue_head(&cur_trans->writer_wait);
61                 init_waitqueue_head(&cur_trans->commit_wait);
62                 cur_trans->in_commit = 0;
63                 cur_trans->use_count = 1;
64                 cur_trans->commit_done = 0;
65                 cur_trans->start_time = get_seconds();
66                 list_add_tail(&cur_trans->list, &root->fs_info->trans_list);
67                 init_bit_radix(&cur_trans->dirty_pages);
68         }
69         cur_trans->num_writers++;
70         return 0;
71 }
72
73 static int record_root_in_trans(struct btrfs_root *root)
74 {
75         u64 running_trans_id = root->fs_info->running_transaction->transid;
76         if (root->ref_cows && root->last_trans < running_trans_id) {
77                 WARN_ON(root == root->fs_info->extent_root);
78                 if (root->root_item.refs != 0) {
79                         radix_tree_tag_set(&root->fs_info->fs_roots_radix,
80                                    (unsigned long)root->root_key.objectid,
81                                    BTRFS_ROOT_TRANS_TAG);
82                         radix_tree_tag_set(&root->fs_info->fs_roots_radix,
83                                    (unsigned long)root->root_key.objectid,
84                                    BTRFS_ROOT_DEFRAG_TAG);
85                         root->commit_root = root->node;
86                         get_bh(root->node);
87                 } else {
88                         WARN_ON(1);
89                 }
90                 root->last_trans = running_trans_id;
91         }
92         return 0;
93 }
94
95 struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
96                                                    int num_blocks)
97 {
98         struct btrfs_trans_handle *h =
99                 kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS);
100         int ret;
101
102         mutex_lock(&root->fs_info->trans_mutex);
103         ret = join_transaction(root);
104         BUG_ON(ret);
105
106         record_root_in_trans(root);
107         h->transid = root->fs_info->running_transaction->transid;
108         h->transaction = root->fs_info->running_transaction;
109         h->blocks_reserved = num_blocks;
110         h->blocks_used = 0;
111         h->block_group = NULL;
112         root->fs_info->running_transaction->use_count++;
113         mutex_unlock(&root->fs_info->trans_mutex);
114         return h;
115 }
116
117 int btrfs_end_transaction(struct btrfs_trans_handle *trans,
118                           struct btrfs_root *root)
119 {
120         struct btrfs_transaction *cur_trans;
121
122         mutex_lock(&root->fs_info->trans_mutex);
123         cur_trans = root->fs_info->running_transaction;
124         WARN_ON(cur_trans != trans->transaction);
125         WARN_ON(cur_trans->num_writers < 1);
126         cur_trans->num_writers--;
127         if (waitqueue_active(&cur_trans->writer_wait))
128                 wake_up(&cur_trans->writer_wait);
129         put_transaction(cur_trans);
130         mutex_unlock(&root->fs_info->trans_mutex);
131         memset(trans, 0, sizeof(*trans));
132         kmem_cache_free(btrfs_trans_handle_cachep, trans);
133         return 0;
134 }
135
136
137 int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
138                                      struct btrfs_root *root)
139 {
140         unsigned long gang[16];
141         int ret;
142         int i;
143         int err;
144         int werr = 0;
145         struct page *page;
146         struct radix_tree_root *dirty_pages;
147         struct inode *btree_inode = root->fs_info->btree_inode;
148
149         if (!trans || !trans->transaction) {
150                 return filemap_write_and_wait(btree_inode->i_mapping);
151         }
152         dirty_pages = &trans->transaction->dirty_pages;
153         while(1) {
154                 ret = find_first_radix_bit(dirty_pages, gang,
155                                            0, ARRAY_SIZE(gang));
156                 if (!ret)
157                         break;
158                 for (i = 0; i < ret; i++) {
159                         /* FIXME EIO */
160                         clear_radix_bit(dirty_pages, gang[i]);
161                         page = find_lock_page(btree_inode->i_mapping,
162                                               gang[i]);
163                         if (!page)
164                                 continue;
165                         if (PageWriteback(page)) {
166                                 if (PageDirty(page))
167                                         wait_on_page_writeback(page);
168                                 else {
169                                         unlock_page(page);
170                                         page_cache_release(page);
171                                         continue;
172                                 }
173                         }
174                         err = write_one_page(page, 0);
175                         if (err)
176                                 werr = err;
177                         page_cache_release(page);
178                 }
179         }
180         err = filemap_fdatawait(btree_inode->i_mapping);
181         if (err)
182                 werr = err;
183         return werr;
184 }
185
186 int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans,
187                             struct btrfs_root *root)
188 {
189         int ret;
190         u64 old_extent_block;
191         struct btrfs_fs_info *fs_info = root->fs_info;
192         struct btrfs_root *tree_root = fs_info->tree_root;
193         struct btrfs_root *extent_root = fs_info->extent_root;
194
195         btrfs_write_dirty_block_groups(trans, extent_root);
196         while(1) {
197                 old_extent_block = btrfs_root_blocknr(&extent_root->root_item);
198                 if (old_extent_block == bh_blocknr(extent_root->node))
199                         break;
200                 btrfs_set_root_blocknr(&extent_root->root_item,
201                                        bh_blocknr(extent_root->node));
202                 ret = btrfs_update_root(trans, tree_root,
203                                         &extent_root->root_key,
204                                         &extent_root->root_item);
205                 BUG_ON(ret);
206                 btrfs_write_dirty_block_groups(trans, extent_root);
207         }
208         return 0;
209 }
210
211 static int wait_for_commit(struct btrfs_root *root,
212                            struct btrfs_transaction *commit)
213 {
214         DEFINE_WAIT(wait);
215         mutex_lock(&root->fs_info->trans_mutex);
216         while(!commit->commit_done) {
217                 prepare_to_wait(&commit->commit_wait, &wait,
218                                 TASK_UNINTERRUPTIBLE);
219                 if (commit->commit_done)
220                         break;
221                 mutex_unlock(&root->fs_info->trans_mutex);
222                 schedule();
223                 mutex_lock(&root->fs_info->trans_mutex);
224         }
225         mutex_unlock(&root->fs_info->trans_mutex);
226         finish_wait(&commit->commit_wait, &wait);
227         return 0;
228 }
229
/*
 * List node that pairs a btrfs_root with a list_head, so roots can be
 * queued on a list (see btrfs_add_dead_root() and add_dirty_roots()) and
 * later consumed by drop_dirty_roots().
 */
struct dirty_root {
        /* linkage on the list the entry was queued on */
        struct list_head list;
        /* root handed to btrfs_drop_snapshot() by drop_dirty_roots() */
        struct btrfs_root *root;
};
234
235 int btrfs_add_dead_root(struct btrfs_root *root, struct list_head *dead_list)
236 {
237         struct dirty_root *dirty;
238
239         dirty = kmalloc(sizeof(*dirty), GFP_NOFS);
240         if (!dirty)
241                 return -ENOMEM;
242         dirty->root = root;
243         list_add(&dirty->list, dead_list);
244         return 0;
245 }
246
247 static int add_dirty_roots(struct btrfs_trans_handle *trans,
248                            struct radix_tree_root *radix,
249                            struct list_head *list)
250 {
251         struct dirty_root *dirty;
252         struct btrfs_root *gang[8];
253         struct btrfs_root *root;
254         int i;
255         int ret;
256         int err = 0;
257         u32 refs;
258
259         while(1) {
260                 ret = radix_tree_gang_lookup_tag(radix, (void **)gang, 0,
261                                                  ARRAY_SIZE(gang),
262                                                  BTRFS_ROOT_TRANS_TAG);
263                 if (ret == 0)
264                         break;
265                 for (i = 0; i < ret; i++) {
266                         root = gang[i];
267                         radix_tree_tag_clear(radix,
268                                      (unsigned long)root->root_key.objectid,
269                                      BTRFS_ROOT_TRANS_TAG);
270                         if (root->commit_root == root->node) {
271                                 WARN_ON(bh_blocknr(root->node) !=
272                                         btrfs_root_blocknr(&root->root_item));
273                                 brelse(root->commit_root);
274                                 root->commit_root = NULL;
275                                 continue;
276                         }
277                         dirty = kmalloc(sizeof(*dirty), GFP_NOFS);
278                         BUG_ON(!dirty);
279                         dirty->root = kmalloc(sizeof(*dirty->root), GFP_NOFS);
280                         BUG_ON(!dirty->root);
281
282                         memset(&root->root_item.drop_progress, 0,
283                                sizeof(struct btrfs_disk_key));
284                         root->root_item.drop_level = 0;
285
286                         memcpy(dirty->root, root, sizeof(*root));
287                         dirty->root->node = root->commit_root;
288                         root->commit_root = NULL;
289
290                         root->root_key.offset = root->fs_info->generation;
291                         btrfs_set_root_blocknr(&root->root_item,
292                                                bh_blocknr(root->node));
293                         err = btrfs_insert_root(trans, root->fs_info->tree_root,
294                                                 &root->root_key,
295                                                 &root->root_item);
296                         if (err)
297                                 break;
298
299                         refs = btrfs_root_refs(&dirty->root->root_item);
300                         btrfs_set_root_refs(&dirty->root->root_item, refs - 1);
301                         err = btrfs_update_root(trans, root->fs_info->tree_root,
302                                                 &dirty->root->root_key,
303                                                 &dirty->root->root_item);
304
305                         BUG_ON(err);
306                         if (refs == 1) {
307                                 list_add(&dirty->list, list);
308                         } else {
309                                 WARN_ON(1);
310                                 kfree(dirty->root);
311                                 kfree(dirty);
312                         }
313                 }
314         }
315         return err;
316 }
317
318 int btrfs_defrag_dirty_roots(struct btrfs_fs_info *info)
319 {
320         struct btrfs_root *gang[1];
321         struct btrfs_root *root;
322         struct btrfs_root *tree_root = info->tree_root;
323         struct btrfs_trans_handle *trans;
324         int i;
325         int ret;
326         int err = 0;
327         u64 last = 0;
328
329         trans = btrfs_start_transaction(tree_root, 1);
330         while(1) {
331                 ret = radix_tree_gang_lookup_tag(&info->fs_roots_radix,
332                                                  (void **)gang, last,
333                                                  ARRAY_SIZE(gang),
334                                                  BTRFS_ROOT_DEFRAG_TAG);
335                 if (ret == 0)
336                         break;
337                 for (i = 0; i < ret; i++) {
338                         root = gang[i];
339                         last = root->root_key.objectid + 1;
340                         radix_tree_tag_clear(&info->fs_roots_radix,
341                                      (unsigned long)root->root_key.objectid,
342                                      BTRFS_ROOT_DEFRAG_TAG);
343                         if (root->defrag_running)
344                                 continue;
345
346                         while (1) {
347                                 mutex_lock(&root->fs_info->trans_mutex);
348                                 record_root_in_trans(root);
349                                 mutex_unlock(&root->fs_info->trans_mutex);
350
351                                 root->defrag_running = 1;
352                                 err = btrfs_defrag_leaves(trans, root, 1);
353                                 btrfs_end_transaction(trans, tree_root);
354                                 mutex_unlock(&info->fs_mutex);
355
356                                 btrfs_btree_balance_dirty(root);
357
358                                 mutex_lock(&info->fs_mutex);
359                                 trans = btrfs_start_transaction(tree_root, 1);
360                                 if (err != -EAGAIN)
361                                         break;
362                         }
363                         root->defrag_running = 0;
364                 }
365         }
366         btrfs_end_transaction(trans, tree_root);
367         return err;
368 }
369
370 static int drop_dirty_roots(struct btrfs_root *tree_root,
371                             struct list_head *list)
372 {
373         struct dirty_root *dirty;
374         struct btrfs_trans_handle *trans;
375         int ret = 0;
376         int err;
377
378         while(!list_empty(list)) {
379                 mutex_lock(&tree_root->fs_info->fs_mutex);
380                 dirty = list_entry(list->next, struct dirty_root, list);
381                 list_del_init(&dirty->list);
382
383                 while(1) {
384                         trans = btrfs_start_transaction(tree_root, 1);
385                         ret = btrfs_drop_snapshot(trans, dirty->root);
386                         if (ret != -EAGAIN) {
387                                 break;
388                         }
389                         err = btrfs_update_root(trans,
390                                         tree_root,
391                                         &dirty->root->root_key,
392                                         &dirty->root->root_item);
393                         if (err)
394                                 ret = err;
395                         ret = btrfs_end_transaction(trans, tree_root);
396                         BUG_ON(ret);
397                 }
398                 BUG_ON(ret);
399                 ret = btrfs_del_root(trans, tree_root, &dirty->root->root_key);
400                 if (ret)
401                         break;
402                 ret = btrfs_end_transaction(trans, tree_root);
403                 BUG_ON(ret);
404
405                 kfree(dirty->root);
406                 kfree(dirty);
407                 mutex_unlock(&tree_root->fs_info->fs_mutex);
408                 btrfs_btree_balance_dirty(tree_root);
409         }
410         return ret;
411 }
412
413 int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
414                              struct btrfs_root *root)
415 {
416         int ret = 0;
417         struct btrfs_transaction *cur_trans;
418         struct btrfs_transaction *prev_trans = NULL;
419         struct list_head dirty_fs_roots;
420         struct radix_tree_root pinned_copy;
421         DEFINE_WAIT(wait);
422
423         init_bit_radix(&pinned_copy);
424         INIT_LIST_HEAD(&dirty_fs_roots);
425
426         mutex_lock(&root->fs_info->trans_mutex);
427         if (trans->transaction->in_commit) {
428                 cur_trans = trans->transaction;
429                 trans->transaction->use_count++;
430                 mutex_unlock(&root->fs_info->trans_mutex);
431                 btrfs_end_transaction(trans, root);
432
433                 mutex_unlock(&root->fs_info->fs_mutex);
434                 ret = wait_for_commit(root, cur_trans);
435                 BUG_ON(ret);
436                 put_transaction(cur_trans);
437                 mutex_lock(&root->fs_info->fs_mutex);
438                 return 0;
439         }
440         trans->transaction->in_commit = 1;
441         cur_trans = trans->transaction;
442         if (cur_trans->list.prev != &root->fs_info->trans_list) {
443                 prev_trans = list_entry(cur_trans->list.prev,
444                                         struct btrfs_transaction, list);
445                 if (!prev_trans->commit_done) {
446                         prev_trans->use_count++;
447                         mutex_unlock(&root->fs_info->fs_mutex);
448                         mutex_unlock(&root->fs_info->trans_mutex);
449
450                         wait_for_commit(root, prev_trans);
451                         put_transaction(prev_trans);
452
453                         mutex_lock(&root->fs_info->fs_mutex);
454                         mutex_lock(&root->fs_info->trans_mutex);
455                 }
456         }
457         while (trans->transaction->num_writers > 1) {
458                 WARN_ON(cur_trans != trans->transaction);
459                 prepare_to_wait(&trans->transaction->writer_wait, &wait,
460                                 TASK_UNINTERRUPTIBLE);
461                 if (trans->transaction->num_writers <= 1)
462                         break;
463                 mutex_unlock(&root->fs_info->fs_mutex);
464                 mutex_unlock(&root->fs_info->trans_mutex);
465                 schedule();
466                 mutex_lock(&root->fs_info->fs_mutex);
467                 mutex_lock(&root->fs_info->trans_mutex);
468                 finish_wait(&trans->transaction->writer_wait, &wait);
469         }
470         finish_wait(&trans->transaction->writer_wait, &wait);
471         WARN_ON(cur_trans != trans->transaction);
472         ret = add_dirty_roots(trans, &root->fs_info->fs_roots_radix,
473                               &dirty_fs_roots);
474         BUG_ON(ret);
475
476         ret = btrfs_commit_tree_roots(trans, root);
477         BUG_ON(ret);
478
479         cur_trans = root->fs_info->running_transaction;
480         root->fs_info->running_transaction = NULL;
481         btrfs_set_super_generation(&root->fs_info->super_copy,
482                                    cur_trans->transid);
483         btrfs_set_super_root(&root->fs_info->super_copy,
484                              bh_blocknr(root->fs_info->tree_root->node));
485         memcpy(root->fs_info->disk_super, &root->fs_info->super_copy,
486                sizeof(root->fs_info->super_copy));
487
488         btrfs_copy_pinned(root, &pinned_copy);
489
490         mutex_unlock(&root->fs_info->trans_mutex);
491         mutex_unlock(&root->fs_info->fs_mutex);
492         ret = btrfs_write_and_wait_transaction(trans, root);
493         BUG_ON(ret);
494         write_ctree_super(trans, root);
495         mutex_lock(&root->fs_info->fs_mutex);
496         btrfs_finish_extent_commit(trans, root, &pinned_copy);
497         mutex_lock(&root->fs_info->trans_mutex);
498         cur_trans->commit_done = 1;
499         wake_up(&cur_trans->commit_wait);
500         put_transaction(cur_trans);
501         put_transaction(cur_trans);
502         if (root->fs_info->closing)
503                 list_splice_init(&root->fs_info->dead_roots, &dirty_fs_roots);
504         else
505                 list_splice_init(&dirty_fs_roots, &root->fs_info->dead_roots);
506         mutex_unlock(&root->fs_info->trans_mutex);
507         kmem_cache_free(btrfs_trans_handle_cachep, trans);
508
509         if (root->fs_info->closing) {
510                 mutex_unlock(&root->fs_info->fs_mutex);
511                 drop_dirty_roots(root->fs_info->tree_root, &dirty_fs_roots);
512                 mutex_lock(&root->fs_info->fs_mutex);
513         }
514         return ret;
515 }
516
517 void btrfs_transaction_cleaner(struct work_struct *work)
518 {
519         struct btrfs_fs_info *fs_info = container_of(work,
520                                                      struct btrfs_fs_info,
521                                                      trans_work.work);
522
523         struct btrfs_root *root = fs_info->tree_root;
524         struct btrfs_transaction *cur;
525         struct btrfs_trans_handle *trans;
526         struct list_head dirty_roots;
527         unsigned long now;
528         unsigned long delay = HZ * 30;
529         int ret;
530
531         INIT_LIST_HEAD(&dirty_roots);
532         mutex_lock(&root->fs_info->fs_mutex);
533         mutex_lock(&root->fs_info->trans_mutex);
534         cur = root->fs_info->running_transaction;
535         if (!cur) {
536                 mutex_unlock(&root->fs_info->trans_mutex);
537                 goto out;
538         }
539         now = get_seconds();
540         if (now < cur->start_time || now - cur->start_time < 30) {
541                 mutex_unlock(&root->fs_info->trans_mutex);
542                 delay = HZ * 5;
543                 goto out;
544         }
545         mutex_unlock(&root->fs_info->trans_mutex);
546         btrfs_defrag_dirty_roots(root->fs_info);
547         trans = btrfs_start_transaction(root, 1);
548         ret = btrfs_commit_transaction(trans, root);
549 out:
550         mutex_unlock(&root->fs_info->fs_mutex);
551
552         mutex_lock(&root->fs_info->trans_mutex);
553         list_splice_init(&root->fs_info->dead_roots, &dirty_roots);
554         mutex_unlock(&root->fs_info->trans_mutex);
555
556         if (!list_empty(&dirty_roots)) {
557                 drop_dirty_roots(root, &dirty_roots);
558         }
559         btrfs_transaction_queue_work(root, delay);
560 }
561
/*
 * Queue the filesystem's trans_work item on trans_wq so that it runs after
 * @delay (passed straight to queue_delayed_work(); the caller in this file
 * passes multiples of HZ).
 */
void btrfs_transaction_queue_work(struct btrfs_root *root, int delay)
{
        queue_delayed_work(trans_wq, &root->fs_info->trans_work, delay);
}
566
/*
 * Cancel the filesystem's trans_work item and then flush trans_wq.  The
 * "rearming" cancel variant is used; btrfs_transaction_cleaner() queues the
 * work again at the end of each run.
 */
void btrfs_transaction_flush_work(struct btrfs_root *root)
{
        cancel_rearming_delayed_workqueue(trans_wq, &root->fs_info->trans_work);
        flush_workqueue(trans_wq);
}
572
/*
 * Module-init helper: create the "btrfs" workqueue used for trans_work.
 * NOTE(review): the result of create_workqueue() is not checked here, and
 * the void return type leaves no way to report a failure -- confirm callers
 * cope with trans_wq being NULL.
 */
void __init btrfs_init_transaction_sys(void)
{
        trans_wq = create_workqueue("btrfs");
}
577
/* Module-exit helper: destroy the workqueue created at init time. */
void __exit btrfs_exit_transaction_sys(void)
{
        destroy_workqueue(trans_wq);
}
582