Btrfs: allow for pausing restriper
[firefly-linux-kernel-4.4.55.git] / fs/btrfs/volumes.c
/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */
#include <linux/sched.h>
#include <linux/bio.h>
#include <linux/slab.h>
#include <linux/buffer_head.h>
#include <linux/blkdev.h>
#include <linux/random.h>
#include <linux/iocontext.h>
#include <linux/capability.h>
#include <linux/kthread.h>
#include <asm/div64.h>
#include "compat.h"
#include "ctree.h"
#include "extent_map.h"
#include "disk-io.h"
#include "transaction.h"
#include "print-tree.h"
#include "volumes.h"
#include "async-thread.h"

static int init_first_rw_device(struct btrfs_trans_handle *trans,
                                struct btrfs_root *root,
                                struct btrfs_device *device);
static int btrfs_relocate_sys_chunks(struct btrfs_root *root);

static DEFINE_MUTEX(uuid_mutex);
static LIST_HEAD(fs_uuids);

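/* convenience wrappers around the per-fs chunk_mutex */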
static void lock_chunks(struct btrfs_root *root)
{
        mutex_lock(&root->fs_info->chunk_mutex);
}

static void unlock_chunks(struct btrfs_root *root)
{
        mutex_unlock(&root->fs_info->chunk_mutex);
}

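/*
 * free a btrfs_fs_devices structure and every btrfs_device on its list.
 * The caller must already have closed the devices (fs_devices->opened == 0).
 */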
static void free_fs_devices(struct btrfs_fs_devices *fs_devices)
{
        struct btrfs_device *device;
        WARN_ON(fs_devices->opened);
        while (!list_empty(&fs_devices->devices)) {
                device = list_entry(fs_devices->devices.next,
                                    struct btrfs_device, dev_list);
                list_del(&device->dev_list);
                kfree(device->name);
                kfree(device);
        }
        kfree(fs_devices);
}

int btrfs_cleanup_fs_uuids(void)
{
        struct btrfs_fs_devices *fs_devices;

        while (!list_empty(&fs_uuids)) {
                fs_devices = list_entry(fs_uuids.next,
                                        struct btrfs_fs_devices, list);
                list_del(&fs_devices->list);
                free_fs_devices(fs_devices);
        }
        return 0;
}

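/* find a device on the given list by devid and (optionally) uuid */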
static noinline struct btrfs_device *__find_device(struct list_head *head,
                                                   u64 devid, u8 *uuid)
{
        struct btrfs_device *dev;

        list_for_each_entry(dev, head, dev_list) {
                if (dev->devid == devid &&
                    (!uuid || !memcmp(dev->uuid, uuid, BTRFS_UUID_SIZE))) {
                        return dev;
                }
        }
        return NULL;
}

static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid)
{
        struct btrfs_fs_devices *fs_devices;

        list_for_each_entry(fs_devices, &fs_uuids, list) {
                if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0)
                        return fs_devices;
        }
        return NULL;
}

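/*
 * put the chain of bios from head to tail back at the front of the
 * pending list so they are submitted before anything queued later
 */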
static void requeue_list(struct btrfs_pending_bios *pending_bios,
                        struct bio *head, struct bio *tail)
{
        struct bio *old_head;

        old_head = pending_bios->head;
        pending_bios->head = head;
        if (pending_bios->tail)
                tail->bi_next = old_head;
        else
                pending_bios->tail = tail;
}

/*
 * we try to collect pending bios for a device so we don't get a large
 * number of procs sending bios down to the same device.  This greatly
 * improves the scheduler's ability to collect and merge the bios.
 *
 * But, it also turns into a long list of bios to process and that is sure
 * to eventually make the worker thread block.  The solution here is to
 * make some progress and then put this work struct back at the end of
 * the list if the block device is congested.  This way, multiple devices
 * can make progress from a single worker thread.
 */
static noinline int run_scheduled_bios(struct btrfs_device *device)
{
        struct bio *pending;
        struct backing_dev_info *bdi;
        struct btrfs_fs_info *fs_info;
        struct btrfs_pending_bios *pending_bios;
        struct bio *tail;
        struct bio *cur;
        int again = 0;
        unsigned long num_run;
        unsigned long batch_run = 0;
        unsigned long limit;
        unsigned long last_waited = 0;
        int force_reg = 0;
        int sync_pending = 0;
        struct blk_plug plug;

        /*
         * this function runs all the bios we've collected for
         * a particular device.  We don't want to wander off to
         * another device without first sending all of these down.
         * So, set up a plug here and finish it off before we return
         */
        blk_start_plug(&plug);

        bdi = blk_get_backing_dev_info(device->bdev);
        fs_info = device->dev_root->fs_info;
        limit = btrfs_async_submit_limit(fs_info);
        limit = limit * 2 / 3;

loop:
        spin_lock(&device->io_lock);

loop_lock:
        num_run = 0;

        /* take all the bios off the list at once and process them
         * later on (without the lock held).  But, remember the
         * tail and other pointers so the bios can be properly reinserted
         * into the list if we hit congestion
         */
        if (!force_reg && device->pending_sync_bios.head) {
                pending_bios = &device->pending_sync_bios;
                force_reg = 1;
        } else {
                pending_bios = &device->pending_bios;
                force_reg = 0;
        }

        pending = pending_bios->head;
        tail = pending_bios->tail;
        WARN_ON(pending && !tail);

        /*
         * if pending was null this time around, no bios need processing
         * at all and we can stop.  Otherwise it'll loop back up again
         * and do an additional check so no bios are missed.
         *
         * device->running_pending is used to synchronize with the
         * schedule_bio code.
         */
        if (device->pending_sync_bios.head == NULL &&
            device->pending_bios.head == NULL) {
                again = 0;
                device->running_pending = 0;
        } else {
                again = 1;
                device->running_pending = 1;
        }

        pending_bios->head = NULL;
        pending_bios->tail = NULL;

        spin_unlock(&device->io_lock);

        while (pending) {

                rmb();
                /* we want to work on both lists, but do more bios on the
                 * sync list than the regular list
                 */
                if ((num_run > 32 &&
                    pending_bios != &device->pending_sync_bios &&
                    device->pending_sync_bios.head) ||
                   (num_run > 64 && pending_bios == &device->pending_sync_bios &&
                    device->pending_bios.head)) {
                        spin_lock(&device->io_lock);
                        requeue_list(pending_bios, pending, tail);
                        goto loop_lock;
                }

                cur = pending;
                pending = pending->bi_next;
                cur->bi_next = NULL;
                atomic_dec(&fs_info->nr_async_bios);

                if (atomic_read(&fs_info->nr_async_bios) < limit &&
                    waitqueue_active(&fs_info->async_submit_wait))
                        wake_up(&fs_info->async_submit_wait);

                BUG_ON(atomic_read(&cur->bi_cnt) == 0);

                /*
                 * if we're doing the sync list, record that our
                 * plug has some sync requests on it
                 *
                 * If we're doing the regular list and there are
                 * sync requests sitting around, unplug before
                 * we add more
                 */
                if (pending_bios == &device->pending_sync_bios) {
                        sync_pending = 1;
                } else if (sync_pending) {
                        blk_finish_plug(&plug);
                        blk_start_plug(&plug);
                        sync_pending = 0;
                }

                submit_bio(cur->bi_rw, cur);
                num_run++;
                batch_run++;
                if (need_resched())
                        cond_resched();

                /*
                 * we made progress, there is more work to do and the bdi
                 * is now congested.  Back off and let other work structs
                 * run instead
                 */
                if (pending && bdi_write_congested(bdi) && batch_run > 8 &&
                    fs_info->fs_devices->open_devices > 1) {
                        struct io_context *ioc;

                        ioc = current->io_context;

                        /*
                         * the main goal here is that we don't want to
                         * block if we're going to be able to submit
                         * more requests without blocking.
                         *
                         * This code does two great things, it pokes into
                         * the elevator code from a filesystem _and_
                         * it makes assumptions about how batching works.
                         */
                        if (ioc && ioc->nr_batch_requests > 0 &&
                            time_before(jiffies, ioc->last_waited + HZ/50UL) &&
                            (last_waited == 0 ||
                             ioc->last_waited == last_waited)) {
                                /*
                                 * we want to go through our batch of
                                 * requests and stop.  So, we copy out
                                 * the ioc->last_waited time and test
                                 * against it before looping
                                 */
                                last_waited = ioc->last_waited;
                                if (need_resched())
                                        cond_resched();
                                continue;
                        }
                        spin_lock(&device->io_lock);
                        requeue_list(pending_bios, pending, tail);
                        device->running_pending = 1;

                        spin_unlock(&device->io_lock);
                        btrfs_requeue_work(&device->work);
                        goto done;
                }
                /* unplug every 64 requests just for good measure */
                if (batch_run % 64 == 0) {
                        blk_finish_plug(&plug);
                        blk_start_plug(&plug);
                        sync_pending = 0;
                }
        }

        cond_resched();
        if (again)
                goto loop;

        spin_lock(&device->io_lock);
        if (device->pending_bios.head || device->pending_sync_bios.head)
                goto loop_lock;
        spin_unlock(&device->io_lock);

done:
        blk_finish_plug(&plug);
        return 0;
}

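/* worker thread callback that submits all scheduled bios for one device */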
static void pending_bios_fn(struct btrfs_work *work)
{
        struct btrfs_device *device;

        device = container_of(work, struct btrfs_device, work);
        run_scheduled_bios(device);
}

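/*
 * add a newly scanned device to the in-memory list for its fsid, creating
 * the btrfs_fs_devices entry if this is the first device seen for that
 * filesystem.  Also refreshes the stored path and the latest_devid /
 * latest_trans hints when a newer super block generation is found.
 */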
static noinline int device_list_add(const char *path,
                           struct btrfs_super_block *disk_super,
                           u64 devid, struct btrfs_fs_devices **fs_devices_ret)
{
        struct btrfs_device *device;
        struct btrfs_fs_devices *fs_devices;
        u64 found_transid = btrfs_super_generation(disk_super);
        char *name;

        fs_devices = find_fsid(disk_super->fsid);
        if (!fs_devices) {
                fs_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS);
                if (!fs_devices)
                        return -ENOMEM;
                INIT_LIST_HEAD(&fs_devices->devices);
                INIT_LIST_HEAD(&fs_devices->alloc_list);
                list_add(&fs_devices->list, &fs_uuids);
                memcpy(fs_devices->fsid, disk_super->fsid, BTRFS_FSID_SIZE);
                fs_devices->latest_devid = devid;
                fs_devices->latest_trans = found_transid;
                mutex_init(&fs_devices->device_list_mutex);
                device = NULL;
        } else {
                device = __find_device(&fs_devices->devices, devid,
                                       disk_super->dev_item.uuid);
        }
        if (!device) {
                if (fs_devices->opened)
                        return -EBUSY;

                device = kzalloc(sizeof(*device), GFP_NOFS);
                if (!device) {
                        /* we can safely leave the fs_devices entry around */
                        return -ENOMEM;
                }
                device->devid = devid;
                device->work.func = pending_bios_fn;
                memcpy(device->uuid, disk_super->dev_item.uuid,
                       BTRFS_UUID_SIZE);
                spin_lock_init(&device->io_lock);
                device->name = kstrdup(path, GFP_NOFS);
                if (!device->name) {
                        kfree(device);
                        return -ENOMEM;
                }
                INIT_LIST_HEAD(&device->dev_alloc_list);

                /* init readahead state */
                spin_lock_init(&device->reada_lock);
                device->reada_curr_zone = NULL;
                atomic_set(&device->reada_in_flight, 0);
                device->reada_next = 0;
                INIT_RADIX_TREE(&device->reada_zones, GFP_NOFS & ~__GFP_WAIT);
                INIT_RADIX_TREE(&device->reada_extents, GFP_NOFS & ~__GFP_WAIT);

                mutex_lock(&fs_devices->device_list_mutex);
                list_add_rcu(&device->dev_list, &fs_devices->devices);
                mutex_unlock(&fs_devices->device_list_mutex);

                device->fs_devices = fs_devices;
                fs_devices->num_devices++;
        } else if (!device->name || strcmp(device->name, path)) {
                name = kstrdup(path, GFP_NOFS);
                if (!name)
                        return -ENOMEM;
                kfree(device->name);
                device->name = name;
                if (device->missing) {
                        fs_devices->missing_devices--;
                        device->missing = 0;
                }
        }

        if (found_transid > fs_devices->latest_trans) {
                fs_devices->latest_devid = devid;
                fs_devices->latest_trans = found_transid;
        }
        *fs_devices_ret = fs_devices;
        return 0;
}

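/* make an in-memory copy of a btrfs_fs_devices and every device on it */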
static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
{
        struct btrfs_fs_devices *fs_devices;
        struct btrfs_device *device;
        struct btrfs_device *orig_dev;

        fs_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS);
        if (!fs_devices)
                return ERR_PTR(-ENOMEM);

        INIT_LIST_HEAD(&fs_devices->devices);
        INIT_LIST_HEAD(&fs_devices->alloc_list);
        INIT_LIST_HEAD(&fs_devices->list);
        mutex_init(&fs_devices->device_list_mutex);
        fs_devices->latest_devid = orig->latest_devid;
        fs_devices->latest_trans = orig->latest_trans;
        memcpy(fs_devices->fsid, orig->fsid, sizeof(fs_devices->fsid));

        /* We hold the volume lock, so it is safe to get the devices. */
        list_for_each_entry(orig_dev, &orig->devices, dev_list) {
                device = kzalloc(sizeof(*device), GFP_NOFS);
                if (!device)
                        goto error;

                device->name = kstrdup(orig_dev->name, GFP_NOFS);
                if (!device->name) {
                        kfree(device);
                        goto error;
                }

                device->devid = orig_dev->devid;
                device->work.func = pending_bios_fn;
                memcpy(device->uuid, orig_dev->uuid, sizeof(device->uuid));
                spin_lock_init(&device->io_lock);
                INIT_LIST_HEAD(&device->dev_list);
                INIT_LIST_HEAD(&device->dev_alloc_list);

                list_add(&device->dev_list, &fs_devices->devices);
                device->fs_devices = fs_devices;
                fs_devices->num_devices++;
        }
        return fs_devices;
error:
        free_fs_devices(fs_devices);
        return ERR_PTR(-ENOMEM);
}

int btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices)
{
        struct btrfs_device *device, *next;

        mutex_lock(&uuid_mutex);
again:
        /* This is the initialized path, it is safe to release the devices. */
        list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
                if (device->in_fs_metadata)
                        continue;

                if (device->bdev) {
                        blkdev_put(device->bdev, device->mode);
                        device->bdev = NULL;
                        fs_devices->open_devices--;
                }
                if (device->writeable) {
                        list_del_init(&device->dev_alloc_list);
                        device->writeable = 0;
                        fs_devices->rw_devices--;
                }
                list_del_init(&device->dev_list);
                fs_devices->num_devices--;
                kfree(device->name);
                kfree(device);
        }

        if (fs_devices->seed) {
                fs_devices = fs_devices->seed;
                goto again;
        }

        mutex_unlock(&uuid_mutex);
        return 0;
}

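/*
 * devices are freed in two stages: free_device() is the RCU callback,
 * which hands the real teardown off to a workqueue via __free_device()
 * because blkdev_put() may sleep and cannot run in RCU callback context
 */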
static void __free_device(struct work_struct *work)
{
        struct btrfs_device *device;

        device = container_of(work, struct btrfs_device, rcu_work);

        if (device->bdev)
                blkdev_put(device->bdev, device->mode);

        kfree(device->name);
        kfree(device);
}

static void free_device(struct rcu_head *head)
{
        struct btrfs_device *device;

        device = container_of(head, struct btrfs_device, rcu);

        INIT_WORK(&device->rcu_work, __free_device);
        schedule_work(&device->rcu_work);
}

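/* drop one open reference; when the last one goes away, close every device */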
static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
{
        struct btrfs_device *device;

        if (--fs_devices->opened > 0)
                return 0;

        mutex_lock(&fs_devices->device_list_mutex);
        list_for_each_entry(device, &fs_devices->devices, dev_list) {
                struct btrfs_device *new_device;

                if (device->bdev)
                        fs_devices->open_devices--;

                if (device->writeable) {
                        list_del_init(&device->dev_alloc_list);
                        fs_devices->rw_devices--;
                }

                if (device->can_discard)
                        fs_devices->num_can_discard--;

                new_device = kmalloc(sizeof(*new_device), GFP_NOFS);
                BUG_ON(!new_device);
                memcpy(new_device, device, sizeof(*new_device));
                new_device->name = kstrdup(device->name, GFP_NOFS);
                BUG_ON(device->name && !new_device->name);
                new_device->bdev = NULL;
                new_device->writeable = 0;
                new_device->in_fs_metadata = 0;
                new_device->can_discard = 0;
                list_replace_rcu(&device->dev_list, &new_device->dev_list);

                call_rcu(&device->rcu, free_device);
        }
        mutex_unlock(&fs_devices->device_list_mutex);

        WARN_ON(fs_devices->open_devices);
        WARN_ON(fs_devices->rw_devices);
        fs_devices->opened = 0;
        fs_devices->seeding = 0;

        return 0;
}

int btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
{
        struct btrfs_fs_devices *seed_devices = NULL;
        int ret;

        mutex_lock(&uuid_mutex);
        ret = __btrfs_close_devices(fs_devices);
        if (!fs_devices->opened) {
                seed_devices = fs_devices->seed;
                fs_devices->seed = NULL;
        }
        mutex_unlock(&uuid_mutex);

        while (seed_devices) {
                fs_devices = seed_devices;
                seed_devices = fs_devices->seed;
                __btrfs_close_devices(fs_devices);
                free_fs_devices(fs_devices);
        }
        return ret;
}

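/*
 * open every device on the list, read and verify its super block, and
 * remember the device with the highest generation as latest_bdev
 */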
static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
                                fmode_t flags, void *holder)
{
        struct request_queue *q;
        struct block_device *bdev;
        struct list_head *head = &fs_devices->devices;
        struct btrfs_device *device;
        struct block_device *latest_bdev = NULL;
        struct buffer_head *bh;
        struct btrfs_super_block *disk_super;
        u64 latest_devid = 0;
        u64 latest_transid = 0;
        u64 devid;
        int seeding = 1;
        int ret = 0;

        flags |= FMODE_EXCL;

        list_for_each_entry(device, head, dev_list) {
                if (device->bdev)
                        continue;
                if (!device->name)
                        continue;

                bdev = blkdev_get_by_path(device->name, flags, holder);
                if (IS_ERR(bdev)) {
                        printk(KERN_INFO "open %s failed\n", device->name);
                        goto error;
                }
                set_blocksize(bdev, 4096);

                bh = btrfs_read_dev_super(bdev);
                if (!bh)
                        goto error_close;

                disk_super = (struct btrfs_super_block *)bh->b_data;
                devid = btrfs_stack_device_id(&disk_super->dev_item);
                if (devid != device->devid)
                        goto error_brelse;

                if (memcmp(device->uuid, disk_super->dev_item.uuid,
                           BTRFS_UUID_SIZE))
                        goto error_brelse;

                device->generation = btrfs_super_generation(disk_super);
                if (!latest_transid || device->generation > latest_transid) {
                        latest_devid = devid;
                        latest_transid = device->generation;
                        latest_bdev = bdev;
                }

                if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) {
                        device->writeable = 0;
                } else {
                        device->writeable = !bdev_read_only(bdev);
                        seeding = 0;
                }

                q = bdev_get_queue(bdev);
                if (blk_queue_discard(q)) {
                        device->can_discard = 1;
                        fs_devices->num_can_discard++;
                }

                device->bdev = bdev;
                device->in_fs_metadata = 0;
                device->mode = flags;

                if (!blk_queue_nonrot(bdev_get_queue(bdev)))
                        fs_devices->rotating = 1;

                fs_devices->open_devices++;
                if (device->writeable) {
                        fs_devices->rw_devices++;
                        list_add(&device->dev_alloc_list,
                                 &fs_devices->alloc_list);
                }
                brelse(bh);
                continue;

error_brelse:
                brelse(bh);
error_close:
                blkdev_put(bdev, flags);
error:
                continue;
        }
        if (fs_devices->open_devices == 0) {
                ret = -EINVAL;
                goto out;
        }
        fs_devices->seeding = seeding;
        fs_devices->opened = 1;
        fs_devices->latest_bdev = latest_bdev;
        fs_devices->latest_devid = latest_devid;
        fs_devices->latest_trans = latest_transid;
        fs_devices->total_rw_bytes = 0;
out:
        return ret;
}

int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
                       fmode_t flags, void *holder)
{
        int ret;

        mutex_lock(&uuid_mutex);
        if (fs_devices->opened) {
                fs_devices->opened++;
                ret = 0;
        } else {
                ret = __btrfs_open_devices(fs_devices, flags, holder);
        }
        mutex_unlock(&uuid_mutex);
        return ret;
}

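/*
 * read the super block off a single device and register it with
 * device_list_add() so the device is known to its fsid before mount
 */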
int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
                          struct btrfs_fs_devices **fs_devices_ret)
{
        struct btrfs_super_block *disk_super;
        struct block_device *bdev;
        struct buffer_head *bh;
        int ret;
        u64 devid;
        u64 transid;

        mutex_lock(&uuid_mutex);

        flags |= FMODE_EXCL;
        bdev = blkdev_get_by_path(path, flags, holder);

        if (IS_ERR(bdev)) {
                ret = PTR_ERR(bdev);
                goto error;
        }

        ret = set_blocksize(bdev, 4096);
        if (ret)
                goto error_close;
        bh = btrfs_read_dev_super(bdev);
        if (!bh) {
                ret = -EINVAL;
                goto error_close;
        }
        disk_super = (struct btrfs_super_block *)bh->b_data;
        devid = btrfs_stack_device_id(&disk_super->dev_item);
        transid = btrfs_super_generation(disk_super);
        if (disk_super->label[0])
                printk(KERN_INFO "device label %s ", disk_super->label);
        else
                printk(KERN_INFO "device fsid %pU ", disk_super->fsid);
        printk(KERN_CONT "devid %llu transid %llu %s\n",
               (unsigned long long)devid, (unsigned long long)transid, path);
        ret = device_list_add(path, disk_super, devid, fs_devices_ret);

        brelse(bh);
error_close:
        blkdev_put(bdev, flags);
error:
        mutex_unlock(&uuid_mutex);
        return ret;
}

/* helper to account the used device space in the range */
int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
                                   u64 end, u64 *length)
{
        struct btrfs_key key;
        struct btrfs_root *root = device->dev_root;
        struct btrfs_dev_extent *dev_extent;
        struct btrfs_path *path;
        u64 extent_end;
        int ret;
        int slot;
        struct extent_buffer *l;

        *length = 0;

        if (start >= device->total_bytes)
                return 0;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;
        path->reada = 2;

        key.objectid = device->devid;
        key.offset = start;
        key.type = BTRFS_DEV_EXTENT_KEY;

        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
        if (ret < 0)
                goto out;
        if (ret > 0) {
                ret = btrfs_previous_item(root, path, key.objectid, key.type);
                if (ret < 0)
                        goto out;
        }

        while (1) {
                l = path->nodes[0];
                slot = path->slots[0];
                if (slot >= btrfs_header_nritems(l)) {
                        ret = btrfs_next_leaf(root, path);
                        if (ret == 0)
                                continue;
                        if (ret < 0)
                                goto out;

                        break;
                }
                btrfs_item_key_to_cpu(l, &key, slot);

                if (key.objectid < device->devid)
                        goto next;

                if (key.objectid > device->devid)
                        break;

                if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY)
                        goto next;

                dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
                extent_end = key.offset + btrfs_dev_extent_length(l,
                                                                  dev_extent);
                if (key.offset <= start && extent_end > end) {
                        *length = end - start + 1;
                        break;
                } else if (key.offset <= start && extent_end > start)
                        *length += extent_end - start;
                else if (key.offset > start && extent_end <= end)
                        *length += extent_end - key.offset;
                else if (key.offset > start && key.offset <= end) {
                        *length += end - key.offset + 1;
                        break;
                } else if (key.offset > end)
                        break;

next:
                path->slots[0]++;
        }
        ret = 0;
out:
        btrfs_free_path(path);
        return ret;
}

/*
 * find_free_dev_extent - find free space in the specified device
 * @trans:      transaction handle
 * @device:     the device which we search the free space in
 * @num_bytes:  the size of the free space that we need
 * @start:      store the start of the free space
 * @len:        the size of the free space that we find, or the size of the
 *              max free space if we don't find suitable free space
 *
 * this uses a pretty simple search, the expectation is that it is
 * called very infrequently and that a given device has a small number
 * of extents
 *
 * @start is used to store the start of the free space if we find it.  But
 * if we don't find suitable free space, it will be used to store the start
 * position of the max free space.
 *
 * @len is used to store the size of the free space that we find.
 * But if we don't find suitable free space, it is used to store the size of
 * the max free space.
 */
int find_free_dev_extent(struct btrfs_trans_handle *trans,
                         struct btrfs_device *device, u64 num_bytes,
                         u64 *start, u64 *len)
{
        struct btrfs_key key;
        struct btrfs_root *root = device->dev_root;
        struct btrfs_dev_extent *dev_extent;
        struct btrfs_path *path;
        u64 hole_size;
        u64 max_hole_start;
        u64 max_hole_size;
        u64 extent_end;
        u64 search_start;
        u64 search_end = device->total_bytes;
        int ret;
        int slot;
        struct extent_buffer *l;

        /* FIXME use last free of some kind */

        /* we don't want to overwrite the superblock on the drive,
         * so we make sure to start at an offset of at least 1MB
         */
        search_start = max(root->fs_info->alloc_start, 1024ull * 1024);

        max_hole_start = search_start;
        max_hole_size = 0;
        hole_size = 0;

        if (search_start >= search_end) {
                ret = -ENOSPC;
                goto error;
        }

        path = btrfs_alloc_path();
        if (!path) {
                ret = -ENOMEM;
                goto error;
        }
        path->reada = 2;

        key.objectid = device->devid;
        key.offset = search_start;
        key.type = BTRFS_DEV_EXTENT_KEY;

        ret = btrfs_search_slot(trans, root, &key, path, 0, 0);
        if (ret < 0)
                goto out;
        if (ret > 0) {
                ret = btrfs_previous_item(root, path, key.objectid, key.type);
                if (ret < 0)
                        goto out;
        }

        while (1) {
                l = path->nodes[0];
                slot = path->slots[0];
                if (slot >= btrfs_header_nritems(l)) {
                        ret = btrfs_next_leaf(root, path);
                        if (ret == 0)
                                continue;
                        if (ret < 0)
                                goto out;

                        break;
                }
                btrfs_item_key_to_cpu(l, &key, slot);

                if (key.objectid < device->devid)
                        goto next;

                if (key.objectid > device->devid)
                        break;

                if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY)
                        goto next;

                if (key.offset > search_start) {
                        hole_size = key.offset - search_start;

                        if (hole_size > max_hole_size) {
                                max_hole_start = search_start;
                                max_hole_size = hole_size;
                        }

                        /*
                         * If this free space is greater than what we need,
                         * it must be the max free space that we have found
                         * until now, so max_hole_start must point to the start
                         * of this free space and the length of this free space
                         * is stored in max_hole_size. Thus, we return
                         * max_hole_start and max_hole_size and go back to the
                         * caller.
                         */
                        if (hole_size >= num_bytes) {
                                ret = 0;
                                goto out;
                        }
                }

                dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
                extent_end = key.offset + btrfs_dev_extent_length(l,
                                                                  dev_extent);
                if (extent_end > search_start)
                        search_start = extent_end;
next:
                path->slots[0]++;
                cond_resched();
        }

        /*
         * At this point, search_start should be the end of
         * allocated dev extents, and when shrinking the device,
         * search_end may be smaller than search_start.
         */
        if (search_end > search_start)
                hole_size = search_end - search_start;

        if (hole_size > max_hole_size) {
                max_hole_start = search_start;
                max_hole_size = hole_size;
        }

        /* See above. */
        if (hole_size < num_bytes)
                ret = -ENOSPC;
        else
                ret = 0;

out:
        btrfs_free_path(path);
error:
        *start = max_hole_start;
        if (len)
                *len = max_hole_size;
        return ret;
}

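/*
 * remove the dev extent item that covers @start on @device and return the
 * space it occupied to the free_chunk_space counter
 */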
static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
                          struct btrfs_device *device,
                          u64 start)
{
        int ret;
        struct btrfs_path *path;
        struct btrfs_root *root = device->dev_root;
        struct btrfs_key key;
        struct btrfs_key found_key;
        struct extent_buffer *leaf = NULL;
        struct btrfs_dev_extent *extent = NULL;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        key.objectid = device->devid;
        key.offset = start;
        key.type = BTRFS_DEV_EXTENT_KEY;
again:
        ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
        if (ret > 0) {
                ret = btrfs_previous_item(root, path, key.objectid,
                                          BTRFS_DEV_EXTENT_KEY);
                if (ret)
                        goto out;
                leaf = path->nodes[0];
                btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
                extent = btrfs_item_ptr(leaf, path->slots[0],
                                        struct btrfs_dev_extent);
                BUG_ON(found_key.offset > start || found_key.offset +
                       btrfs_dev_extent_length(leaf, extent) < start);
                key = found_key;
                btrfs_release_path(path);
                goto again;
        } else if (ret == 0) {
                leaf = path->nodes[0];
                extent = btrfs_item_ptr(leaf, path->slots[0],
                                        struct btrfs_dev_extent);
        }
        BUG_ON(ret);

        if (device->bytes_used > 0) {
                u64 len = btrfs_dev_extent_length(leaf, extent);
                device->bytes_used -= len;
                spin_lock(&root->fs_info->free_chunk_lock);
                root->fs_info->free_chunk_space += len;
                spin_unlock(&root->fs_info->free_chunk_lock);
        }
        ret = btrfs_del_item(trans, root, path);

out:
        btrfs_free_path(path);
        return ret;
}

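/* record a new dev extent (device space backing part of a chunk) */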
int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
                           struct btrfs_device *device,
                           u64 chunk_tree, u64 chunk_objectid,
                           u64 chunk_offset, u64 start, u64 num_bytes)
{
        int ret;
        struct btrfs_path *path;
        struct btrfs_root *root = device->dev_root;
        struct btrfs_dev_extent *extent;
        struct extent_buffer *leaf;
        struct btrfs_key key;

        WARN_ON(!device->in_fs_metadata);
        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        key.objectid = device->devid;
        key.offset = start;
        key.type = BTRFS_DEV_EXTENT_KEY;
        ret = btrfs_insert_empty_item(trans, root, path, &key,
                                      sizeof(*extent));
        BUG_ON(ret);

        leaf = path->nodes[0];
        extent = btrfs_item_ptr(leaf, path->slots[0],
                                struct btrfs_dev_extent);
        btrfs_set_dev_extent_chunk_tree(leaf, extent, chunk_tree);
        btrfs_set_dev_extent_chunk_objectid(leaf, extent, chunk_objectid);
        btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset);

        write_extent_buffer(leaf, root->fs_info->chunk_tree_uuid,
                    (unsigned long)btrfs_dev_extent_chunk_tree_uuid(extent),
                    BTRFS_UUID_SIZE);

        btrfs_set_dev_extent_length(leaf, extent, num_bytes);
        btrfs_mark_buffer_dirty(leaf);
        btrfs_free_path(path);
        return ret;
}

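/* find the logical offset just past the last chunk with the given objectid */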
static noinline int find_next_chunk(struct btrfs_root *root,
                                    u64 objectid, u64 *offset)
{
        struct btrfs_path *path;
        int ret;
        struct btrfs_key key;
        struct btrfs_chunk *chunk;
        struct btrfs_key found_key;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        key.objectid = objectid;
        key.offset = (u64)-1;
        key.type = BTRFS_CHUNK_ITEM_KEY;

        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
        if (ret < 0)
                goto error;

        BUG_ON(ret == 0);

        ret = btrfs_previous_item(root, path, 0, BTRFS_CHUNK_ITEM_KEY);
        if (ret) {
                *offset = 0;
        } else {
                btrfs_item_key_to_cpu(path->nodes[0], &found_key,
                                      path->slots[0]);
                if (found_key.objectid != objectid)
                        *offset = 0;
                else {
                        chunk = btrfs_item_ptr(path->nodes[0], path->slots[0],
                                               struct btrfs_chunk);
                        *offset = found_key.offset +
                                btrfs_chunk_length(path->nodes[0], chunk);
                }
        }
        ret = 0;
error:
        btrfs_free_path(path);
        return ret;
}

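/* pick the next unused devid by looking at the last dev item in the tree */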
static noinline int find_next_devid(struct btrfs_root *root, u64 *objectid)
{
        int ret;
        struct btrfs_key key;
        struct btrfs_key found_key;
        struct btrfs_path *path;

        root = root->fs_info->chunk_root;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
        key.type = BTRFS_DEV_ITEM_KEY;
        key.offset = (u64)-1;

        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
        if (ret < 0)
                goto error;

        BUG_ON(ret == 0);

        ret = btrfs_previous_item(root, path, BTRFS_DEV_ITEMS_OBJECTID,
                                  BTRFS_DEV_ITEM_KEY);
        if (ret) {
                *objectid = 1;
        } else {
                btrfs_item_key_to_cpu(path->nodes[0], &found_key,
                                      path->slots[0]);
                *objectid = found_key.offset + 1;
        }
        ret = 0;
error:
        btrfs_free_path(path);
        return ret;
}

/*
 * the device information is stored in the chunk root
 * the btrfs_device struct should be fully filled in
 */
int btrfs_add_device(struct btrfs_trans_handle *trans,
                     struct btrfs_root *root,
                     struct btrfs_device *device)
{
        int ret;
        struct btrfs_path *path;
        struct btrfs_dev_item *dev_item;
        struct extent_buffer *leaf;
        struct btrfs_key key;
        unsigned long ptr;

        root = root->fs_info->chunk_root;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
        key.type = BTRFS_DEV_ITEM_KEY;
        key.offset = device->devid;

        ret = btrfs_insert_empty_item(trans, root, path, &key,
                                      sizeof(*dev_item));
        if (ret)
                goto out;

        leaf = path->nodes[0];
        dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);

        btrfs_set_device_id(leaf, dev_item, device->devid);
        btrfs_set_device_generation(leaf, dev_item, 0);
        btrfs_set_device_type(leaf, dev_item, device->type);
        btrfs_set_device_io_align(leaf, dev_item, device->io_align);
        btrfs_set_device_io_width(leaf, dev_item, device->io_width);
        btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
        btrfs_set_device_total_bytes(leaf, dev_item, device->total_bytes);
        btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used);
        btrfs_set_device_group(leaf, dev_item, 0);
        btrfs_set_device_seek_speed(leaf, dev_item, 0);
        btrfs_set_device_bandwidth(leaf, dev_item, 0);
        btrfs_set_device_start_offset(leaf, dev_item, 0);

        ptr = (unsigned long)btrfs_device_uuid(dev_item);
        write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
        ptr = (unsigned long)btrfs_device_fsid(dev_item);
        write_extent_buffer(leaf, root->fs_info->fsid, ptr, BTRFS_UUID_SIZE);
        btrfs_mark_buffer_dirty(leaf);

        ret = 0;
out:
        btrfs_free_path(path);
        return ret;
}

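/* delete the dev item for @device from the chunk tree in its own transaction */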
static int btrfs_rm_dev_item(struct btrfs_root *root,
                             struct btrfs_device *device)
{
        int ret;
        struct btrfs_path *path;
        struct btrfs_key key;
        struct btrfs_trans_handle *trans;

        root = root->fs_info->chunk_root;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        trans = btrfs_start_transaction(root, 0);
        if (IS_ERR(trans)) {
                btrfs_free_path(path);
                return PTR_ERR(trans);
        }
        key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
        key.type = BTRFS_DEV_ITEM_KEY;
        key.offset = device->devid;
        lock_chunks(root);

        ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
        if (ret < 0)
                goto out;

        if (ret > 0) {
                ret = -ENOENT;
                goto out;
        }

        ret = btrfs_del_item(trans, root, path);
        if (ret)
                goto out;
out:
        btrfs_free_path(path);
        unlock_chunks(root);
        btrfs_commit_transaction(trans, root);
        return ret;
}

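/*
 * remove a device from the filesystem: shrink it to zero, delete its dev
 * item, drop it from the in-memory lists and wipe the super block magic
 * on disk so the device is no longer recognized as part of the fs
 */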
int btrfs_rm_device(struct btrfs_root *root, char *device_path)
{
        struct btrfs_device *device;
        struct btrfs_device *next_device;
        struct block_device *bdev;
        struct buffer_head *bh = NULL;
        struct btrfs_super_block *disk_super;
        struct btrfs_fs_devices *cur_devices;
        u64 all_avail;
        u64 devid;
        u64 num_devices;
        u8 *dev_uuid;
        int ret = 0;
        bool clear_super = false;

        mutex_lock(&uuid_mutex);

        all_avail = root->fs_info->avail_data_alloc_bits |
                root->fs_info->avail_system_alloc_bits |
                root->fs_info->avail_metadata_alloc_bits;

        if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) &&
            root->fs_info->fs_devices->num_devices <= 4) {
                printk(KERN_ERR "btrfs: unable to go below four devices "
                       "on raid10\n");
                ret = -EINVAL;
                goto out;
        }

        if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) &&
            root->fs_info->fs_devices->num_devices <= 2) {
                printk(KERN_ERR "btrfs: unable to go below two "
                       "devices on raid1\n");
                ret = -EINVAL;
                goto out;
        }

        if (strcmp(device_path, "missing") == 0) {
                struct list_head *devices;
                struct btrfs_device *tmp;

                device = NULL;
                devices = &root->fs_info->fs_devices->devices;
                /*
                 * It is safe to read the devices since the volume_mutex
                 * is held.
                 */
                list_for_each_entry(tmp, devices, dev_list) {
                        if (tmp->in_fs_metadata && !tmp->bdev) {
                                device = tmp;
                                break;
                        }
                }
                bdev = NULL;
                bh = NULL;
                disk_super = NULL;
                if (!device) {
                        printk(KERN_ERR "btrfs: no missing devices found to "
                               "remove\n");
                        goto out;
                }
        } else {
                bdev = blkdev_get_by_path(device_path, FMODE_READ | FMODE_EXCL,
                                          root->fs_info->bdev_holder);
                if (IS_ERR(bdev)) {
                        ret = PTR_ERR(bdev);
                        goto out;
                }

                set_blocksize(bdev, 4096);
                bh = btrfs_read_dev_super(bdev);
                if (!bh) {
                        ret = -EINVAL;
                        goto error_close;
                }
                disk_super = (struct btrfs_super_block *)bh->b_data;
                devid = btrfs_stack_device_id(&disk_super->dev_item);
                dev_uuid = disk_super->dev_item.uuid;
                device = btrfs_find_device(root, devid, dev_uuid,
                                           disk_super->fsid);
                if (!device) {
                        ret = -ENOENT;
                        goto error_brelse;
                }
        }

        if (device->writeable && root->fs_info->fs_devices->rw_devices == 1) {
                printk(KERN_ERR "btrfs: unable to remove the only writeable "
                       "device\n");
                ret = -EINVAL;
                goto error_brelse;
        }

        if (device->writeable) {
                lock_chunks(root);
                list_del_init(&device->dev_alloc_list);
                unlock_chunks(root);
                root->fs_info->fs_devices->rw_devices--;
                clear_super = true;
        }

        ret = btrfs_shrink_device(device, 0);
        if (ret)
                goto error_undo;

        ret = btrfs_rm_dev_item(root->fs_info->chunk_root, device);
        if (ret)
                goto error_undo;

        spin_lock(&root->fs_info->free_chunk_lock);
        root->fs_info->free_chunk_space = device->total_bytes -
                device->bytes_used;
        spin_unlock(&root->fs_info->free_chunk_lock);

        device->in_fs_metadata = 0;
        btrfs_scrub_cancel_dev(root, device);

        /*
         * the device list mutex makes sure that we don't change
         * the device list while someone else is writing out all
         * the device supers.
         */

        cur_devices = device->fs_devices;
        mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
        list_del_rcu(&device->dev_list);

        device->fs_devices->num_devices--;

        if (device->missing)
                root->fs_info->fs_devices->missing_devices--;

        next_device = list_entry(root->fs_info->fs_devices->devices.next,
                                 struct btrfs_device, dev_list);
        if (device->bdev == root->fs_info->sb->s_bdev)
                root->fs_info->sb->s_bdev = next_device->bdev;
        if (device->bdev == root->fs_info->fs_devices->latest_bdev)
                root->fs_info->fs_devices->latest_bdev = next_device->bdev;

        if (device->bdev)
                device->fs_devices->open_devices--;

        call_rcu(&device->rcu, free_device);
        mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);

        num_devices = btrfs_super_num_devices(root->fs_info->super_copy) - 1;
        btrfs_set_super_num_devices(root->fs_info->super_copy, num_devices);

        if (cur_devices->open_devices == 0) {
                struct btrfs_fs_devices *fs_devices;
                fs_devices = root->fs_info->fs_devices;
                while (fs_devices) {
                        if (fs_devices->seed == cur_devices)
                                break;
                        fs_devices = fs_devices->seed;
                }
                fs_devices->seed = cur_devices->seed;
                cur_devices->seed = NULL;
                lock_chunks(root);
                __btrfs_close_devices(cur_devices);
                unlock_chunks(root);
                free_fs_devices(cur_devices);
        }

        /*
         * at this point, the device is zero sized.  We want to
         * remove it from the devices list and zero out the old super
         */
        if (clear_super) {
                /* make sure this device isn't detected as part of
                 * the FS anymore
                 */
                memset(&disk_super->magic, 0, sizeof(disk_super->magic));
                set_buffer_dirty(bh);
                sync_dirty_buffer(bh);
        }

        ret = 0;

error_brelse:
        brelse(bh);
error_close:
        if (bdev)
                blkdev_put(bdev, FMODE_READ | FMODE_EXCL);
out:
        mutex_unlock(&uuid_mutex);
        return ret;
error_undo:
        if (device->writeable) {
                lock_chunks(root);
                list_add(&device->dev_alloc_list,
                         &root->fs_info->fs_devices->alloc_list);
                unlock_chunks(root);
                root->fs_info->fs_devices->rw_devices++;
        }
        goto error_brelse;
}

/*
 * does all the dirty work required for changing the file system's UUID.
 */
static int btrfs_prepare_sprout(struct btrfs_trans_handle *trans,
                                struct btrfs_root *root)
{
        struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
        struct btrfs_fs_devices *old_devices;
        struct btrfs_fs_devices *seed_devices;
        struct btrfs_super_block *disk_super = root->fs_info->super_copy;
        struct btrfs_device *device;
        u64 super_flags;

        BUG_ON(!mutex_is_locked(&uuid_mutex));
        if (!fs_devices->seeding)
                return -EINVAL;

        seed_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS);
        if (!seed_devices)
                return -ENOMEM;

        old_devices = clone_fs_devices(fs_devices);
        if (IS_ERR(old_devices)) {
                kfree(seed_devices);
                return PTR_ERR(old_devices);
        }

        list_add(&old_devices->list, &fs_uuids);

        memcpy(seed_devices, fs_devices, sizeof(*seed_devices));
        seed_devices->opened = 1;
        INIT_LIST_HEAD(&seed_devices->devices);
        INIT_LIST_HEAD(&seed_devices->alloc_list);
        mutex_init(&seed_devices->device_list_mutex);

        mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
        list_splice_init_rcu(&fs_devices->devices, &seed_devices->devices,
                              synchronize_rcu);
        mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);

        list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list);
        list_for_each_entry(device, &seed_devices->devices, dev_list) {
                device->fs_devices = seed_devices;
        }

        fs_devices->seeding = 0;
        fs_devices->num_devices = 0;
        fs_devices->open_devices = 0;
        fs_devices->seed = seed_devices;

        generate_random_uuid(fs_devices->fsid);
        memcpy(root->fs_info->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
        memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
        super_flags = btrfs_super_flags(disk_super) &
                      ~BTRFS_SUPER_FLAG_SEEDING;
        btrfs_set_super_flags(disk_super, super_flags);

        return 0;
}

1528 /*
1529  * store the expected generation for seed devices in device items.
1530  */
1531 static int btrfs_finish_sprout(struct btrfs_trans_handle *trans,
1532                                struct btrfs_root *root)
1533 {
1534         struct btrfs_path *path;
1535         struct extent_buffer *leaf;
1536         struct btrfs_dev_item *dev_item;
1537         struct btrfs_device *device;
1538         struct btrfs_key key;
1539         u8 fs_uuid[BTRFS_UUID_SIZE];
1540         u8 dev_uuid[BTRFS_UUID_SIZE];
1541         u64 devid;
1542         int ret;
1543
1544         path = btrfs_alloc_path();
1545         if (!path)
1546                 return -ENOMEM;
1547
1548         root = root->fs_info->chunk_root;
1549         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1550         key.offset = 0;
1551         key.type = BTRFS_DEV_ITEM_KEY;
1552
1553         while (1) {
1554                 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
1555                 if (ret < 0)
1556                         goto error;
1557
1558                 leaf = path->nodes[0];
1559 next_slot:
1560                 if (path->slots[0] >= btrfs_header_nritems(leaf)) {
1561                         ret = btrfs_next_leaf(root, path);
1562                         if (ret > 0)
1563                                 break;
1564                         if (ret < 0)
1565                                 goto error;
1566                         leaf = path->nodes[0];
1567                         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1568                         btrfs_release_path(path);
1569                         continue;
1570                 }
1571
1572                 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1573                 if (key.objectid != BTRFS_DEV_ITEMS_OBJECTID ||
1574                     key.type != BTRFS_DEV_ITEM_KEY)
1575                         break;
1576
1577                 dev_item = btrfs_item_ptr(leaf, path->slots[0],
1578                                           struct btrfs_dev_item);
1579                 devid = btrfs_device_id(leaf, dev_item);
1580                 read_extent_buffer(leaf, dev_uuid,
1581                                    (unsigned long)btrfs_device_uuid(dev_item),
1582                                    BTRFS_UUID_SIZE);
1583                 read_extent_buffer(leaf, fs_uuid,
1584                                    (unsigned long)btrfs_device_fsid(dev_item),
1585                                    BTRFS_UUID_SIZE);
1586                 device = btrfs_find_device(root, devid, dev_uuid, fs_uuid);
1587                 BUG_ON(!device);
1588
1589                 if (device->fs_devices->seeding) {
1590                         btrfs_set_device_generation(leaf, dev_item,
1591                                                     device->generation);
1592                         btrfs_mark_buffer_dirty(leaf);
1593                 }
1594
1595                 path->slots[0]++;
1596                 goto next_slot;
1597         }
1598         ret = 0;
1599 error:
1600         btrfs_free_path(path);
1601         return ret;
1602 }
1603
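/*
 * Add the device at @device_path to a mounted filesystem: set up a new
 * btrfs_device, link it into the in-memory device lists, update the super
 * block totals and persist it in the device tree.  If the filesystem is a
 * seed, a new writable filesystem is sprouted on top of it first.
 */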
1604 int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1605 {
1606         struct request_queue *q;
1607         struct btrfs_trans_handle *trans;
1608         struct btrfs_device *device;
1609         struct block_device *bdev;
1610         struct list_head *devices;
1611         struct super_block *sb = root->fs_info->sb;
1612         u64 total_bytes;
1613         int seeding_dev = 0;
1614         int ret = 0;
1615
1616         if ((sb->s_flags & MS_RDONLY) && !root->fs_info->fs_devices->seeding)
1617                 return -EINVAL;
1618
1619         bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL,
1620                                   root->fs_info->bdev_holder);
1621         if (IS_ERR(bdev))
1622                 return PTR_ERR(bdev);
1623
1624         if (root->fs_info->fs_devices->seeding) {
1625                 seeding_dev = 1;
1626                 down_write(&sb->s_umount);
1627                 mutex_lock(&uuid_mutex);
1628         }
1629
1630         filemap_write_and_wait(bdev->bd_inode->i_mapping);
1631
1632         devices = &root->fs_info->fs_devices->devices;
1633         /*
1634          * we have the volume lock, so we don't need the extra
1635          * device list mutex while reading the list here.
1636          */
1637         list_for_each_entry(device, devices, dev_list) {
1638                 if (device->bdev == bdev) {
1639                         ret = -EEXIST;
1640                         goto error;
1641                 }
1642         }
1643
1644         device = kzalloc(sizeof(*device), GFP_NOFS);
1645         if (!device) {
1646                 /* we can safely leave the fs_devices entry around */
1647                 ret = -ENOMEM;
1648                 goto error;
1649         }
1650
1651         device->name = kstrdup(device_path, GFP_NOFS);
1652         if (!device->name) {
1653                 kfree(device);
1654                 ret = -ENOMEM;
1655                 goto error;
1656         }
1657
1658         ret = find_next_devid(root, &device->devid);
1659         if (ret) {
1660                 kfree(device->name);
1661                 kfree(device);
1662                 goto error;
1663         }
1664
1665         trans = btrfs_start_transaction(root, 0);
1666         if (IS_ERR(trans)) {
1667                 kfree(device->name);
1668                 kfree(device);
1669                 ret = PTR_ERR(trans);
1670                 goto error;
1671         }
1672
1673         lock_chunks(root);
1674
1675         q = bdev_get_queue(bdev);
1676         if (blk_queue_discard(q))
1677                 device->can_discard = 1;
1678         device->writeable = 1;
1679         device->work.func = pending_bios_fn;
1680         generate_random_uuid(device->uuid);
1681         spin_lock_init(&device->io_lock);
1682         device->generation = trans->transid;
1683         device->io_width = root->sectorsize;
1684         device->io_align = root->sectorsize;
1685         device->sector_size = root->sectorsize;
1686         device->total_bytes = i_size_read(bdev->bd_inode);
1687         device->disk_total_bytes = device->total_bytes;
1688         device->dev_root = root->fs_info->dev_root;
1689         device->bdev = bdev;
1690         device->in_fs_metadata = 1;
1691         device->mode = FMODE_EXCL;
1692         set_blocksize(device->bdev, 4096);
1693
1694         if (seeding_dev) {
1695                 sb->s_flags &= ~MS_RDONLY;
1696                 ret = btrfs_prepare_sprout(trans, root);
1697                 BUG_ON(ret);
1698         }
1699
1700         device->fs_devices = root->fs_info->fs_devices;
1701
1702         /*
1703          * we don't want write_supers to jump in here with our device
1704          * half setup
1705          */
1706         mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
1707         list_add_rcu(&device->dev_list, &root->fs_info->fs_devices->devices);
1708         list_add(&device->dev_alloc_list,
1709                  &root->fs_info->fs_devices->alloc_list);
1710         root->fs_info->fs_devices->num_devices++;
1711         root->fs_info->fs_devices->open_devices++;
1712         root->fs_info->fs_devices->rw_devices++;
1713         if (device->can_discard)
1714                 root->fs_info->fs_devices->num_can_discard++;
1715         root->fs_info->fs_devices->total_rw_bytes += device->total_bytes;
1716
1717         spin_lock(&root->fs_info->free_chunk_lock);
1718         root->fs_info->free_chunk_space += device->total_bytes;
1719         spin_unlock(&root->fs_info->free_chunk_lock);
1720
1721         if (!blk_queue_nonrot(bdev_get_queue(bdev)))
1722                 root->fs_info->fs_devices->rotating = 1;
1723
1724         total_bytes = btrfs_super_total_bytes(root->fs_info->super_copy);
1725         btrfs_set_super_total_bytes(root->fs_info->super_copy,
1726                                     total_bytes + device->total_bytes);
1727
1728         total_bytes = btrfs_super_num_devices(root->fs_info->super_copy);
1729         btrfs_set_super_num_devices(root->fs_info->super_copy,
1730                                     total_bytes + 1);
1731         mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
1732
1733         if (seeding_dev) {
1734                 ret = init_first_rw_device(trans, root, device);
1735                 BUG_ON(ret);
1736                 ret = btrfs_finish_sprout(trans, root);
1737                 BUG_ON(ret);
1738         } else {
1739                 ret = btrfs_add_device(trans, root, device);
1740         }
1741
1742         /*
1743          * we've got more storage, clear any full flags on the space
1744          * infos
1745          */
1746         btrfs_clear_space_info_full(root->fs_info);
1747
1748         unlock_chunks(root);
1749         btrfs_commit_transaction(trans, root);
1750
1751         if (seeding_dev) {
1752                 mutex_unlock(&uuid_mutex);
1753                 up_write(&sb->s_umount);
1754
1755                 ret = btrfs_relocate_sys_chunks(root);
1756                 BUG_ON(ret);
1757         }
1758
1759         return ret;
1760 error:
1761         blkdev_put(bdev, FMODE_EXCL);
1762         if (seeding_dev) {
1763                 mutex_unlock(&uuid_mutex);
1764                 up_write(&sb->s_umount);
1765         }
1766         return ret;
1767 }
1768
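/*
 * Write the in-memory state of @device back to its dev item in the
 * chunk tree.
 */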
1769 static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
1770                                         struct btrfs_device *device)
1771 {
1772         int ret;
1773         struct btrfs_path *path;
1774         struct btrfs_root *root;
1775         struct btrfs_dev_item *dev_item;
1776         struct extent_buffer *leaf;
1777         struct btrfs_key key;
1778
1779         root = device->dev_root->fs_info->chunk_root;
1780
1781         path = btrfs_alloc_path();
1782         if (!path)
1783                 return -ENOMEM;
1784
1785         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1786         key.type = BTRFS_DEV_ITEM_KEY;
1787         key.offset = device->devid;
1788
1789         ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
1790         if (ret < 0)
1791                 goto out;
1792
1793         if (ret > 0) {
1794                 ret = -ENOENT;
1795                 goto out;
1796         }
1797
1798         leaf = path->nodes[0];
1799         dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);
1800
1801         btrfs_set_device_id(leaf, dev_item, device->devid);
1802         btrfs_set_device_type(leaf, dev_item, device->type);
1803         btrfs_set_device_io_align(leaf, dev_item, device->io_align);
1804         btrfs_set_device_io_width(leaf, dev_item, device->io_width);
1805         btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
1806         btrfs_set_device_total_bytes(leaf, dev_item, device->disk_total_bytes);
1807         btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used);
1808         btrfs_mark_buffer_dirty(leaf);
1809
1810 out:
1811         btrfs_free_path(path);
1812         return ret;
1813 }
1814
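/*
 * Grow @device to @new_size.  Adjusts the in-memory counters and the
 * total_bytes of the super copy, then persists the new size via
 * btrfs_update_device().  Callers must hold the chunk mutex; the
 * btrfs_grow_device() wrapper below takes it.
 */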
1815 static int __btrfs_grow_device(struct btrfs_trans_handle *trans,
1816                       struct btrfs_device *device, u64 new_size)
1817 {
1818         struct btrfs_super_block *super_copy =
1819                 device->dev_root->fs_info->super_copy;
1820         u64 old_total = btrfs_super_total_bytes(super_copy);
1821         u64 diff = new_size - device->total_bytes;
1822
1823         if (!device->writeable)
1824                 return -EACCES;
1825         if (new_size <= device->total_bytes)
1826                 return -EINVAL;
1827
1828         btrfs_set_super_total_bytes(super_copy, old_total + diff);
1829         device->fs_devices->total_rw_bytes += diff;
1830
1831         device->total_bytes = new_size;
1832         device->disk_total_bytes = new_size;
1833         btrfs_clear_space_info_full(device->dev_root->fs_info);
1834
1835         return btrfs_update_device(trans, device);
1836 }
1837
1838 int btrfs_grow_device(struct btrfs_trans_handle *trans,
1839                       struct btrfs_device *device, u64 new_size)
1840 {
1841         int ret;
1842         lock_chunks(device->dev_root);
1843         ret = __btrfs_grow_device(trans, device, new_size);
1844         unlock_chunks(device->dev_root);
1845         return ret;
1846 }
1847
1848 static int btrfs_free_chunk(struct btrfs_trans_handle *trans,
1849                             struct btrfs_root *root,
1850                             u64 chunk_tree, u64 chunk_objectid,
1851                             u64 chunk_offset)
1852 {
1853         int ret;
1854         struct btrfs_path *path;
1855         struct btrfs_key key;
1856
1857         root = root->fs_info->chunk_root;
1858         path = btrfs_alloc_path();
1859         if (!path)
1860                 return -ENOMEM;
1861
1862         key.objectid = chunk_objectid;
1863         key.offset = chunk_offset;
1864         key.type = BTRFS_CHUNK_ITEM_KEY;
1865
1866         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1867         BUG_ON(ret);
1868
1869         ret = btrfs_del_item(trans, root, path);
1870
1871         btrfs_free_path(path);
1872         return ret;
1873 }
1874
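/*
 * Remove the matching chunk entry from the super block's sys_chunk_array
 * by shifting the tail of the array down over it.
 */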
1875 static int btrfs_del_sys_chunk(struct btrfs_root *root, u64 chunk_objectid,
1876                                u64 chunk_offset)
1877 {
1878         struct btrfs_super_block *super_copy = root->fs_info->super_copy;
1879         struct btrfs_disk_key *disk_key;
1880         struct btrfs_chunk *chunk;
1881         u8 *ptr;
1882         int ret = 0;
1883         u32 num_stripes;
1884         u32 array_size;
1885         u32 len = 0;
1886         u32 cur;
1887         struct btrfs_key key;
1888
1889         array_size = btrfs_super_sys_array_size(super_copy);
1890
1891         ptr = super_copy->sys_chunk_array;
1892         cur = 0;
1893
1894         while (cur < array_size) {
1895                 disk_key = (struct btrfs_disk_key *)ptr;
1896                 btrfs_disk_key_to_cpu(&key, disk_key);
1897
1898                 len = sizeof(*disk_key);
1899
1900                 if (key.type == BTRFS_CHUNK_ITEM_KEY) {
1901                         chunk = (struct btrfs_chunk *)(ptr + len);
1902                         num_stripes = btrfs_stack_chunk_num_stripes(chunk);
1903                         len += btrfs_chunk_item_size(num_stripes);
1904                 } else {
1905                         ret = -EIO;
1906                         break;
1907                 }
1908                 if (key.objectid == chunk_objectid &&
1909                     key.offset == chunk_offset) {
1910                         memmove(ptr, ptr + len, array_size - (cur + len));
1911                         array_size -= len;
1912                         btrfs_set_super_sys_array_size(super_copy, array_size);
1913                 } else {
1914                         ptr += len;
1915                         cur += len;
1916                 }
1917         }
1918         return ret;
1919 }
1920
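/*
 * Relocate everything out of a chunk and delete it: first move all the
 * extents it holds to other chunks, then drop the device extents, the
 * chunk tree item, the sys_chunk_array entry (for SYSTEM chunks) and
 * finally the block group itself.
 */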
1921 static int btrfs_relocate_chunk(struct btrfs_root *root,
1922                          u64 chunk_tree, u64 chunk_objectid,
1923                          u64 chunk_offset)
1924 {
1925         struct extent_map_tree *em_tree;
1926         struct btrfs_root *extent_root;
1927         struct btrfs_trans_handle *trans;
1928         struct extent_map *em;
1929         struct map_lookup *map;
1930         int ret;
1931         int i;
1932
1933         root = root->fs_info->chunk_root;
1934         extent_root = root->fs_info->extent_root;
1935         em_tree = &root->fs_info->mapping_tree.map_tree;
1936
1937         ret = btrfs_can_relocate(extent_root, chunk_offset);
1938         if (ret)
1939                 return -ENOSPC;
1940
1941         /* step one, relocate all the extents inside this chunk */
1942         ret = btrfs_relocate_block_group(extent_root, chunk_offset);
1943         if (ret)
1944                 return ret;
1945
1946         trans = btrfs_start_transaction(root, 0);
1947         BUG_ON(IS_ERR(trans));
1948
1949         lock_chunks(root);
1950
1951         /*
1952          * step two, delete the device extents and the
1953          * chunk tree entries
1954          */
1955         read_lock(&em_tree->lock);
1956         em = lookup_extent_mapping(em_tree, chunk_offset, 1);
1957         read_unlock(&em_tree->lock);
1958
1959         BUG_ON(em->start > chunk_offset ||
1960                em->start + em->len < chunk_offset);
1961         map = (struct map_lookup *)em->bdev;
1962
1963         for (i = 0; i < map->num_stripes; i++) {
1964                 ret = btrfs_free_dev_extent(trans, map->stripes[i].dev,
1965                                             map->stripes[i].physical);
1966                 BUG_ON(ret);
1967
1968                 if (map->stripes[i].dev) {
1969                         ret = btrfs_update_device(trans, map->stripes[i].dev);
1970                         BUG_ON(ret);
1971                 }
1972         }
1973         ret = btrfs_free_chunk(trans, root, chunk_tree, chunk_objectid,
1974                                chunk_offset);
1975
1976         BUG_ON(ret);
1977
1978         trace_btrfs_chunk_free(root, map, chunk_offset, em->len);
1979
1980         if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
1981                 ret = btrfs_del_sys_chunk(root, chunk_objectid, chunk_offset);
1982                 BUG_ON(ret);
1983         }
1984
1985         ret = btrfs_remove_block_group(trans, extent_root, chunk_offset);
1986         BUG_ON(ret);
1987
1988         write_lock(&em_tree->lock);
1989         remove_extent_mapping(em_tree, em);
1990         write_unlock(&em_tree->lock);
1991
1992         kfree(map);
1993         em->bdev = NULL;
1994
1995         /* once for the tree */
1996         free_extent_map(em);
1997         /* once for us */
1998         free_extent_map(em);
1999
2000         unlock_chunks(root);
2001         btrfs_end_transaction(trans, root);
2002         return 0;
2003 }
2004
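/*
 * Walk the chunk tree backwards and relocate every SYSTEM chunk.  ENOSPC
 * failures are retried once, since relocating other chunks may have freed
 * up room in the meantime.
 */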
2005 static int btrfs_relocate_sys_chunks(struct btrfs_root *root)
2006 {
2007         struct btrfs_root *chunk_root = root->fs_info->chunk_root;
2008         struct btrfs_path *path;
2009         struct extent_buffer *leaf;
2010         struct btrfs_chunk *chunk;
2011         struct btrfs_key key;
2012         struct btrfs_key found_key;
2013         u64 chunk_tree = chunk_root->root_key.objectid;
2014         u64 chunk_type;
2015         bool retried = false;
2016         int failed = 0;
2017         int ret;
2018
2019         path = btrfs_alloc_path();
2020         if (!path)
2021                 return -ENOMEM;
2022
2023 again:
2024         key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
2025         key.offset = (u64)-1;
2026         key.type = BTRFS_CHUNK_ITEM_KEY;
2027
2028         while (1) {
2029                 ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
2030                 if (ret < 0)
2031                         goto error;
2032                 BUG_ON(ret == 0);
2033
2034                 ret = btrfs_previous_item(chunk_root, path, key.objectid,
2035                                           key.type);
2036                 if (ret < 0)
2037                         goto error;
2038                 if (ret > 0)
2039                         break;
2040
2041                 leaf = path->nodes[0];
2042                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
2043
2044                 chunk = btrfs_item_ptr(leaf, path->slots[0],
2045                                        struct btrfs_chunk);
2046                 chunk_type = btrfs_chunk_type(leaf, chunk);
2047                 btrfs_release_path(path);
2048
2049                 if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) {
2050                         ret = btrfs_relocate_chunk(chunk_root, chunk_tree,
2051                                                    found_key.objectid,
2052                                                    found_key.offset);
2053                         if (ret == -ENOSPC)
2054                                 failed++;
2055                         else if (ret)
2056                                 BUG();
2057                 }
2058
2059                 if (found_key.offset == 0)
2060                         break;
2061                 key.offset = found_key.offset - 1;
2062         }
2063         ret = 0;
2064         if (failed && !retried) {
2065                 failed = 0;
2066                 retried = true;
2067                 goto again;
2068         } else if (failed && retried) {
2069                 WARN_ON(1);
2070                 ret = -ENOSPC;
2071         }
2072 error:
2073         btrfs_free_path(path);
2074         return ret;
2075 }
2076
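/*
 * Persist the balance control item in the tree root so that an
 * interrupted balance can be picked up again on the next mount
 * (see btrfs_recover_balance() below).
 */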
2077 static int insert_balance_item(struct btrfs_root *root,
2078                                struct btrfs_balance_control *bctl)
2079 {
2080         struct btrfs_trans_handle *trans;
2081         struct btrfs_balance_item *item;
2082         struct btrfs_disk_balance_args disk_bargs;
2083         struct btrfs_path *path;
2084         struct extent_buffer *leaf;
2085         struct btrfs_key key;
2086         int ret, err;
2087
2088         path = btrfs_alloc_path();
2089         if (!path)
2090                 return -ENOMEM;
2091
2092         trans = btrfs_start_transaction(root, 0);
2093         if (IS_ERR(trans)) {
2094                 btrfs_free_path(path);
2095                 return PTR_ERR(trans);
2096         }
2097
2098         key.objectid = BTRFS_BALANCE_OBJECTID;
2099         key.type = BTRFS_BALANCE_ITEM_KEY;
2100         key.offset = 0;
2101
2102         ret = btrfs_insert_empty_item(trans, root, path, &key,
2103                                       sizeof(*item));
2104         if (ret)
2105                 goto out;
2106
2107         leaf = path->nodes[0];
2108         item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);
2109
2110         memset_extent_buffer(leaf, 0, (unsigned long)item, sizeof(*item));
2111
2112         btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->data);
2113         btrfs_set_balance_data(leaf, item, &disk_bargs);
2114         btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->meta);
2115         btrfs_set_balance_meta(leaf, item, &disk_bargs);
2116         btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->sys);
2117         btrfs_set_balance_sys(leaf, item, &disk_bargs);
2118
2119         btrfs_set_balance_flags(leaf, item, bctl->flags);
2120
2121         btrfs_mark_buffer_dirty(leaf);
2122 out:
2123         btrfs_free_path(path);
2124         err = btrfs_commit_transaction(trans, root);
2125         if (err && !ret)
2126                 ret = err;
2127         return ret;
2128 }
2129
2130 static int del_balance_item(struct btrfs_root *root)
2131 {
2132         struct btrfs_trans_handle *trans;
2133         struct btrfs_path *path;
2134         struct btrfs_key key;
2135         int ret, err;
2136
2137         path = btrfs_alloc_path();
2138         if (!path)
2139                 return -ENOMEM;
2140
2141         trans = btrfs_start_transaction(root, 0);
2142         if (IS_ERR(trans)) {
2143                 btrfs_free_path(path);
2144                 return PTR_ERR(trans);
2145         }
2146
2147         key.objectid = BTRFS_BALANCE_OBJECTID;
2148         key.type = BTRFS_BALANCE_ITEM_KEY;
2149         key.offset = 0;
2150
2151         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
2152         if (ret < 0)
2153                 goto out;
2154         if (ret > 0) {
2155                 ret = -ENOENT;
2156                 goto out;
2157         }
2158
2159         ret = btrfs_del_item(trans, root, path);
2160 out:
2161         btrfs_free_path(path);
2162         err = btrfs_commit_transaction(trans, root);
2163         if (err && !ret)
2164                 ret = err;
2165         return ret;
2166 }
2167
2168 /*
2169  * This is a heuristic used to reduce the number of chunks balanced on
2170  * resume after balance was interrupted.
2171  */
2172 static void update_balance_args(struct btrfs_balance_control *bctl)
2173 {
2174         /*
2175          * Turn on soft mode for chunk types that were being converted.
2176          */
2177         if (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)
2178                 bctl->data.flags |= BTRFS_BALANCE_ARGS_SOFT;
2179         if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)
2180                 bctl->sys.flags |= BTRFS_BALANCE_ARGS_SOFT;
2181         if (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)
2182                 bctl->meta.flags |= BTRFS_BALANCE_ARGS_SOFT;
2183
2184         /*
2185          * Turn on the usage filter if it is not already used.  The idea is
2186          * that chunks that we have already balanced should be
2187          * reasonably full.  Don't do it for chunks that are being
2188          * converted - that will keep us from relocating unconverted
2189          * (albeit full) chunks.
2190          */
2191         if (!(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE) &&
2192             !(bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
2193                 bctl->data.flags |= BTRFS_BALANCE_ARGS_USAGE;
2194                 bctl->data.usage = 90;
2195         }
2196         if (!(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE) &&
2197             !(bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
2198                 bctl->sys.flags |= BTRFS_BALANCE_ARGS_USAGE;
2199                 bctl->sys.usage = 90;
2200         }
2201         if (!(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE) &&
2202             !(bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
2203                 bctl->meta.flags |= BTRFS_BALANCE_ARGS_USAGE;
2204                 bctl->meta.usage = 90;
2205         }
2206 }
2207
2208 /*
2209  * Should be called with both balance and volume mutexes held to
2210  * serialize other volume operations (add_dev/rm_dev/resize) with
2211  * restriper.  Same goes for unset_balance_control.
2212  */
2213 static void set_balance_control(struct btrfs_balance_control *bctl)
2214 {
2215         struct btrfs_fs_info *fs_info = bctl->fs_info;
2216
2217         BUG_ON(fs_info->balance_ctl);
2218
2219         spin_lock(&fs_info->balance_lock);
2220         fs_info->balance_ctl = bctl;
2221         spin_unlock(&fs_info->balance_lock);
2222 }
2223
2224 static void unset_balance_control(struct btrfs_fs_info *fs_info)
2225 {
2226         struct btrfs_balance_control *bctl = fs_info->balance_ctl;
2227
2228         BUG_ON(!fs_info->balance_ctl);
2229
2230         spin_lock(&fs_info->balance_lock);
2231         fs_info->balance_ctl = NULL;
2232         spin_unlock(&fs_info->balance_lock);
2233
2234         kfree(bctl);
2235 }
2236
2237 /*
2238  * Balance filters.  Return 1 if chunk should be filtered out
2239  * (should not be balanced).
2240  */
2241 static int chunk_profiles_filter(u64 chunk_profile,
2242                                  struct btrfs_balance_args *bargs)
2243 {
2244         chunk_profile &= BTRFS_BLOCK_GROUP_PROFILE_MASK;
2245
2246         if (chunk_profile == 0)
2247                 chunk_profile = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
2248
2249         if (bargs->profiles & chunk_profile)
2250                 return 0;
2251
2252         return 1;
2253 }
2254
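/* num * factor / 100, e.g. div_factor_fine(x, 90) is 90% of x */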
2255 static u64 div_factor_fine(u64 num, int factor)
2256 {
2257         if (factor <= 0)
2258                 return 0;
2259         if (factor >= 100)
2260                 return num;
2261
2262         num *= factor;
2263         do_div(num, 100);
2264         return num;
2265 }
2266
2267 static int chunk_usage_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset,
2268                               struct btrfs_balance_args *bargs)
2269 {
2270         struct btrfs_block_group_cache *cache;
2271         u64 chunk_used, user_thresh;
2272         int ret = 1;
2273
2274         cache = btrfs_lookup_block_group(fs_info, chunk_offset);
2275         chunk_used = btrfs_block_group_used(&cache->item);
2276
2277         user_thresh = div_factor_fine(cache->key.offset, bargs->usage);
2278         if (chunk_used < user_thresh)
2279                 ret = 0;
2280
2281         btrfs_put_block_group(cache);
2282         return ret;
2283 }
2284
2285 static int chunk_devid_filter(struct extent_buffer *leaf,
2286                               struct btrfs_chunk *chunk,
2287                               struct btrfs_balance_args *bargs)
2288 {
2289         struct btrfs_stripe *stripe;
2290         int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
2291         int i;
2292
2293         for (i = 0; i < num_stripes; i++) {
2294                 stripe = btrfs_stripe_nr(chunk, i);
2295                 if (btrfs_stripe_devid(leaf, stripe) == bargs->devid)
2296                         return 0;
2297         }
2298
2299         return 1;
2300 }
2301
2302 /* [pstart, pend) */
2303 static int chunk_drange_filter(struct extent_buffer *leaf,
2304                                struct btrfs_chunk *chunk,
2305                                u64 chunk_offset,
2306                                struct btrfs_balance_args *bargs)
2307 {
2308         struct btrfs_stripe *stripe;
2309         int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
2310         u64 stripe_offset;
2311         u64 stripe_length;
2312         int factor;
2313         int i;
2314
2315         if (!(bargs->flags & BTRFS_BALANCE_ARGS_DEVID))
2316                 return 0;
2317
2318         if (btrfs_chunk_type(leaf, chunk) & (BTRFS_BLOCK_GROUP_DUP |
2319              BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10))
2320                 factor = 2;
2321         else
2322                 factor = 1;
2323         factor = num_stripes / factor;
2324
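        /*
         * num_stripes / factor is the number of data stripes, so dividing
         * the chunk length by it yields the length of each device extent.
         */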
2325         for (i = 0; i < num_stripes; i++) {
2326                 stripe = btrfs_stripe_nr(chunk, i);
2327                 if (btrfs_stripe_devid(leaf, stripe) != bargs->devid)
2328                         continue;
2329
2330                 stripe_offset = btrfs_stripe_offset(leaf, stripe);
2331                 stripe_length = btrfs_chunk_length(leaf, chunk);
2332                 do_div(stripe_length, factor);
2333
2334                 if (stripe_offset < bargs->pend &&
2335                     stripe_offset + stripe_length > bargs->pstart)
2336                         return 0;
2337         }
2338
2339         return 1;
2340 }
2341
2342 /* [vstart, vend) */
2343 static int chunk_vrange_filter(struct extent_buffer *leaf,
2344                                struct btrfs_chunk *chunk,
2345                                u64 chunk_offset,
2346                                struct btrfs_balance_args *bargs)
2347 {
2348         if (chunk_offset < bargs->vend &&
2349             chunk_offset + btrfs_chunk_length(leaf, chunk) > bargs->vstart)
2350                 /* at least part of the chunk is inside this vrange */
2351                 return 0;
2352
2353         return 1;
2354 }
2355
2356 static int chunk_soft_convert_filter(u64 chunk_profile,
2357                                      struct btrfs_balance_args *bargs)
2358 {
2359         if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT))
2360                 return 0;
2361
2362         chunk_profile &= BTRFS_BLOCK_GROUP_PROFILE_MASK;
2363
2364         if (chunk_profile == 0)
2365                 chunk_profile = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
2366
2367         if (bargs->target & chunk_profile)
2368                 return 1;
2369
2370         return 0;
2371 }
2372
2373 static int should_balance_chunk(struct btrfs_root *root,
2374                                 struct extent_buffer *leaf,
2375                                 struct btrfs_chunk *chunk, u64 chunk_offset)
2376 {
2377         struct btrfs_balance_control *bctl = root->fs_info->balance_ctl;
2378         struct btrfs_balance_args *bargs = NULL;
2379         u64 chunk_type = btrfs_chunk_type(leaf, chunk);
2380
2381         /* type filter */
2382         if (!((chunk_type & BTRFS_BLOCK_GROUP_TYPE_MASK) &
2383               (bctl->flags & BTRFS_BALANCE_TYPE_MASK))) {
2384                 return 0;
2385         }
2386
2387         if (chunk_type & BTRFS_BLOCK_GROUP_DATA)
2388                 bargs = &bctl->data;
2389         else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM)
2390                 bargs = &bctl->sys;
2391         else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA)
2392                 bargs = &bctl->meta;
2393
2394         /* profiles filter */
2395         if ((bargs->flags & BTRFS_BALANCE_ARGS_PROFILES) &&
2396             chunk_profiles_filter(chunk_type, bargs)) {
2397                 return 0;
2398         }
2399
2400         /* usage filter */
2401         if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE) &&
2402             chunk_usage_filter(bctl->fs_info, chunk_offset, bargs)) {
2403                 return 0;
2404         }
2405
2406         /* devid filter */
2407         if ((bargs->flags & BTRFS_BALANCE_ARGS_DEVID) &&
2408             chunk_devid_filter(leaf, chunk, bargs)) {
2409                 return 0;
2410         }
2411
2412         /* drange filter, makes sense only with devid filter */
2413         if ((bargs->flags & BTRFS_BALANCE_ARGS_DRANGE) &&
2414             chunk_drange_filter(leaf, chunk, chunk_offset, bargs)) {
2415                 return 0;
2416         }
2417
2418         /* vrange filter */
2419         if ((bargs->flags & BTRFS_BALANCE_ARGS_VRANGE) &&
2420             chunk_vrange_filter(leaf, chunk, chunk_offset, bargs)) {
2421                 return 0;
2422         }
2423
2424         /* soft profile changing mode */
2425         if ((bargs->flags & BTRFS_BALANCE_ARGS_SOFT) &&
2426             chunk_soft_convert_filter(chunk_type, bargs)) {
2427                 return 0;
2428         }
2429
2430         return 1;
2431 }
2432
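/* num * factor / 10, i.e. factor is in tenths */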
2433 static u64 div_factor(u64 num, int factor)
2434 {
2435         if (factor == 10)
2436                 return num;
2437         num *= factor;
2438         do_div(num, 10);
2439         return num;
2440 }
2441
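/*
 * The main balance loop.  Step one nudges each writable device: shrink it
 * by up to 1MB and grow it back, relocating extents off the device tail to
 * make some room.  Step two walks the chunk tree backwards and relocates
 * every chunk that passes the configured filters, bailing out with
 * -ECANCELED when a pause is requested.
 */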
2442 static int __btrfs_balance(struct btrfs_fs_info *fs_info)
2443 {
2444         struct btrfs_root *chunk_root = fs_info->chunk_root;
2445         struct btrfs_root *dev_root = fs_info->dev_root;
2446         struct list_head *devices;
2447         struct btrfs_device *device;
2448         u64 old_size;
2449         u64 size_to_free;
2450         struct btrfs_chunk *chunk;
2451         struct btrfs_path *path;
2452         struct btrfs_key key;
2453         struct btrfs_key found_key;
2454         struct btrfs_trans_handle *trans;
2455         struct extent_buffer *leaf;
2456         int slot;
2457         int ret;
2458         int enospc_errors = 0;
2459
2460         /* step one, make some room on all the devices */
2461         devices = &fs_info->fs_devices->devices;
2462         list_for_each_entry(device, devices, dev_list) {
2463                 old_size = device->total_bytes;
2464                 size_to_free = div_factor(old_size, 1);
2465                 size_to_free = min(size_to_free, (u64)1 * 1024 * 1024);
2466                 if (!device->writeable ||
2467                     device->total_bytes - device->bytes_used > size_to_free)
2468                         continue;
2469
2470                 ret = btrfs_shrink_device(device, old_size - size_to_free);
2471                 if (ret == -ENOSPC)
2472                         break;
2473                 BUG_ON(ret);
2474
2475                 trans = btrfs_start_transaction(dev_root, 0);
2476                 BUG_ON(IS_ERR(trans));
2477
2478                 ret = btrfs_grow_device(trans, device, old_size);
2479                 BUG_ON(ret);
2480
2481                 btrfs_end_transaction(trans, dev_root);
2482         }
2483
2484         /* step two, relocate all the chunks */
2485         path = btrfs_alloc_path();
2486         if (!path) {
2487                 ret = -ENOMEM;
2488                 goto error;
2489         }
2490         key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
2491         key.offset = (u64)-1;
2492         key.type = BTRFS_CHUNK_ITEM_KEY;
2493
2494         while (1) {
2495                 if (atomic_read(&fs_info->balance_pause_req)) {
2496                         ret = -ECANCELED;
2497                         goto error;
2498                 }
2499
2500                 ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
2501                 if (ret < 0)
2502                         goto error;
2503
2504                 /*
2505                  * this shouldn't happen, it means the last relocate
2506                  * failed
2507                  */
2508                 if (ret == 0)
2509                         BUG(); /* FIXME break ? */
2510
2511                 ret = btrfs_previous_item(chunk_root, path, 0,
2512                                           BTRFS_CHUNK_ITEM_KEY);
2513                 if (ret) {
2514                         ret = 0;
2515                         break;
2516                 }
2517
2518                 leaf = path->nodes[0];
2519                 slot = path->slots[0];
2520                 btrfs_item_key_to_cpu(leaf, &found_key, slot);
2521
2522                 if (found_key.objectid != key.objectid)
2523                         break;
2524
2525                 /* chunk zero is special */
2526                 if (found_key.offset == 0)
2527                         break;
2528
2529                 chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
2530
2531                 ret = should_balance_chunk(chunk_root, leaf, chunk,
2532                                            found_key.offset);
2533                 btrfs_release_path(path);
2534                 if (!ret)
2535                         goto loop;
2536
2537                 ret = btrfs_relocate_chunk(chunk_root,
2538                                            chunk_root->root_key.objectid,
2539                                            found_key.objectid,
2540                                            found_key.offset);
2541                 if (ret && ret != -ENOSPC)
2542                         goto error;
2543                 if (ret == -ENOSPC)
2544                         enospc_errors++;
2545 loop:
2546                 key.offset = found_key.offset - 1;
2547         }
2548
2549 error:
2550         btrfs_free_path(path);
2551         if (enospc_errors) {
2552                 printk(KERN_INFO "btrfs: %d enospc errors during balance\n",
2553                        enospc_errors);
2554                 if (!ret)
2555                         ret = -ENOSPC;
2556         }
2557
2558         return ret;
2559 }
2560
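/*
 * Balance state should stay in place (paused) when a pause was requested;
 * it is torn down completely only on a normal exit.
 */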
2561 static inline int balance_need_close(struct btrfs_fs_info *fs_info)
2562 {
2563         return atomic_read(&fs_info->balance_pause_req) == 0;
2564 }
2565
2566 static void __cancel_balance(struct btrfs_fs_info *fs_info)
2567 {
2568         int ret;
2569
2570         unset_balance_control(fs_info);
2571         ret = del_balance_item(fs_info->tree_root);
2572         BUG_ON(ret);
2573 }
2574
2575 void update_ioctl_balance_args(struct btrfs_fs_info *fs_info,
2576                                struct btrfs_ioctl_balance_args *bargs);
2577
2578 /*
2579  * Should be called with both balance and volume mutexes held
2580  */
2581 int btrfs_balance(struct btrfs_balance_control *bctl,
2582                   struct btrfs_ioctl_balance_args *bargs)
2583 {
2584         struct btrfs_fs_info *fs_info = bctl->fs_info;
2585         u64 allowed;
2586         int ret;
2587
2588         if (btrfs_fs_closing(fs_info) ||
2589             atomic_read(&fs_info->balance_pause_req)) {
2590                 ret = -EINVAL;
2591                 goto out;
2592         }
2593
2594         /*
2595          * In case of mixed groups both data and meta should be picked,
2596          * and identical options should be given for both of them.
2597          */
2598         allowed = btrfs_super_incompat_flags(fs_info->super_copy);
2599         if ((allowed & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) &&
2600             (bctl->flags & (BTRFS_BALANCE_DATA | BTRFS_BALANCE_METADATA))) {
2601                 if (!(bctl->flags & BTRFS_BALANCE_DATA) ||
2602                     !(bctl->flags & BTRFS_BALANCE_METADATA) ||
2603                     memcmp(&bctl->data, &bctl->meta, sizeof(bctl->data))) {
2604                         printk(KERN_ERR "btrfs: with mixed groups data and "
2605                                "metadata balance options must be the same\n");
2606                         ret = -EINVAL;
2607                         goto out;
2608                 }
2609         }
2610
2611         /*
2612          * Profile changing sanity checks.  Skip them if a simple
2613          * balance is requested.
2614          */
2615         if (!((bctl->data.flags | bctl->sys.flags | bctl->meta.flags) &
2616               BTRFS_BALANCE_ARGS_CONVERT))
2617                 goto do_balance;
2618
2619         allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
2620         if (fs_info->fs_devices->num_devices == 1)
2621                 allowed |= BTRFS_BLOCK_GROUP_DUP;
2622         else if (fs_info->fs_devices->num_devices < 4)
2623                 allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1);
2624         else
2625                 allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 |
2626                                 BTRFS_BLOCK_GROUP_RAID10);
2627
2628         if (!profile_is_valid(bctl->data.target, 1) ||
2629             bctl->data.target & ~allowed) {
2630                 printk(KERN_ERR "btrfs: unable to start balance with target "
2631                        "data profile %llu\n",
2632                        (unsigned long long)bctl->data.target);
2633                 ret = -EINVAL;
2634                 goto out;
2635         }
2636         if (!profile_is_valid(bctl->meta.target, 1) ||
2637             bctl->meta.target & ~allowed) {
2638                 printk(KERN_ERR "btrfs: unable to start balance with target "
2639                        "metadata profile %llu\n",
2640                        (unsigned long long)bctl->meta.target);
2641                 ret = -EINVAL;
2642                 goto out;
2643         }
2644         if (!profile_is_valid(bctl->sys.target, 1) ||
2645             bctl->sys.target & ~allowed) {
2646                 printk(KERN_ERR "btrfs: unable to start balance with target "
2647                        "system profile %llu\n",
2648                        (unsigned long long)bctl->sys.target);
2649                 ret = -EINVAL;
2650                 goto out;
2651         }
2652
2653         if (bctl->data.target & BTRFS_BLOCK_GROUP_DUP) {
2654                 printk(KERN_ERR "btrfs: dup for data is not allowed\n");
2655                 ret = -EINVAL;
2656                 goto out;
2657         }
2658
2659         /* allow reducing meta or sys integrity only if force is set */
2660         allowed = BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 |
2661                         BTRFS_BLOCK_GROUP_RAID10;
2662         if (((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
2663              (fs_info->avail_system_alloc_bits & allowed) &&
2664              !(bctl->sys.target & allowed)) ||
2665             ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
2666              (fs_info->avail_metadata_alloc_bits & allowed) &&
2667              !(bctl->meta.target & allowed))) {
2668                 if (bctl->flags & BTRFS_BALANCE_FORCE) {
2669                         printk(KERN_INFO "btrfs: force reducing metadata "
2670                                "integrity\n");
2671                 } else {
2672                         printk(KERN_ERR "btrfs: balance will reduce metadata "
2673                                "integrity, use force if you want this\n");
2674                         ret = -EINVAL;
2675                         goto out;
2676                 }
2677         }
2678
2679 do_balance:
2680         ret = insert_balance_item(fs_info->tree_root, bctl);
2681         if (ret && ret != -EEXIST)
2682                 goto out;
2683
2684         if (!(bctl->flags & BTRFS_BALANCE_RESUME)) {
2685                 BUG_ON(ret == -EEXIST);
2686                 set_balance_control(bctl);
2687         } else {
2688                 BUG_ON(ret != -EEXIST);
2689                 spin_lock(&fs_info->balance_lock);
2690                 update_balance_args(bctl);
2691                 spin_unlock(&fs_info->balance_lock);
2692         }
2693
2694         atomic_inc(&fs_info->balance_running);
2695         mutex_unlock(&fs_info->balance_mutex);
2696
2697         ret = __btrfs_balance(fs_info);
2698
2699         mutex_lock(&fs_info->balance_mutex);
2700         atomic_dec(&fs_info->balance_running);
2701
2702         if (bargs) {
2703                 memset(bargs, 0, sizeof(*bargs));
2704                 update_ioctl_balance_args(fs_info, bargs);
2705         }
2706
2707         if ((ret && ret != -ECANCELED && ret != -ENOSPC) ||
2708             balance_need_close(fs_info)) {
2709                 __cancel_balance(fs_info);
2710         }
2711
2712         wake_up(&fs_info->balance_wait_q);
2713
2714         return ret;
2715 out:
2716         if (bctl->flags & BTRFS_BALANCE_RESUME)
2717                 __cancel_balance(fs_info);
2718         else
2719                 kfree(bctl);
2720         return ret;
2721 }
2722
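/*
 * Kernel thread that resumes an interrupted balance at mount time.  The
 * skip_balance mount option re-installs the balance state without
 * actually running it.
 */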
2723 static int balance_kthread(void *data)
2724 {
2725         struct btrfs_balance_control *bctl =
2726                         (struct btrfs_balance_control *)data;
2727         struct btrfs_fs_info *fs_info = bctl->fs_info;
2728         int ret = 0;
2729
2730         mutex_lock(&fs_info->volume_mutex);
2731         mutex_lock(&fs_info->balance_mutex);
2732
2733         set_balance_control(bctl);
2734
2735         if (btrfs_test_opt(fs_info->tree_root, SKIP_BALANCE)) {
2736                 printk(KERN_INFO "btrfs: force skipping balance\n");
2737         } else {
2738                 printk(KERN_INFO "btrfs: continuing balance\n");
2739                 ret = btrfs_balance(bctl, NULL);
2740         }
2741
2742         mutex_unlock(&fs_info->balance_mutex);
2743         mutex_unlock(&fs_info->volume_mutex);
2744         return ret;
2745 }
2746
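/*
 * Look up the balance item at mount time and, if one exists, kick off a
 * kthread to resume the interrupted balance.
 */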
2747 int btrfs_recover_balance(struct btrfs_root *tree_root)
2748 {
2749         struct task_struct *tsk;
2750         struct btrfs_balance_control *bctl;
2751         struct btrfs_balance_item *item;
2752         struct btrfs_disk_balance_args disk_bargs;
2753         struct btrfs_path *path;
2754         struct extent_buffer *leaf;
2755         struct btrfs_key key;
2756         int ret;
2757
2758         path = btrfs_alloc_path();
2759         if (!path)
2760                 return -ENOMEM;
2761
2762         bctl = kzalloc(sizeof(*bctl), GFP_NOFS);
2763         if (!bctl) {
2764                 ret = -ENOMEM;
2765                 goto out;
2766         }
2767
2768         key.objectid = BTRFS_BALANCE_OBJECTID;
2769         key.type = BTRFS_BALANCE_ITEM_KEY;
2770         key.offset = 0;
2771
2772         ret = btrfs_search_slot(NULL, tree_root, &key, path, 0, 0);
2773         if (ret < 0)
2774                 goto out_bctl;
2775         if (ret > 0) { /* ret = -ENOENT; */
2776                 ret = 0;
2777                 goto out_bctl;
2778         }
2779
2780         leaf = path->nodes[0];
2781         item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);
2782
2783         bctl->fs_info = tree_root->fs_info;
2784         bctl->flags = btrfs_balance_flags(leaf, item) | BTRFS_BALANCE_RESUME;
2785
2786         btrfs_balance_data(leaf, item, &disk_bargs);
2787         btrfs_disk_balance_args_to_cpu(&bctl->data, &disk_bargs);
2788         btrfs_balance_meta(leaf, item, &disk_bargs);
2789         btrfs_disk_balance_args_to_cpu(&bctl->meta, &disk_bargs);
2790         btrfs_balance_sys(leaf, item, &disk_bargs);
2791         btrfs_disk_balance_args_to_cpu(&bctl->sys, &disk_bargs);
2792
2793         tsk = kthread_run(balance_kthread, bctl, "btrfs-balance");
2794         if (IS_ERR(tsk))
2795                 ret = PTR_ERR(tsk);
2796         else
2797                 goto out;
2798
2799 out_bctl:
2800         kfree(bctl);
2801 out:
2802         btrfs_free_path(path);
2803         return ret;
2804 }
2805
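/*
 * Pause a running balance: bump balance_pause_req so that
 * __btrfs_balance() bails out at the next chunk, then wait for
 * balance_running to drop to zero.  The balance item and control
 * structure are kept so the operation can be resumed later.
 */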
2806 int btrfs_pause_balance(struct btrfs_fs_info *fs_info)
2807 {
2808         int ret = 0;
2809
2810         mutex_lock(&fs_info->balance_mutex);
2811         if (!fs_info->balance_ctl) {
2812                 mutex_unlock(&fs_info->balance_mutex);
2813                 return -ENOTCONN;
2814         }
2815
2816         if (atomic_read(&fs_info->balance_running)) {
2817                 atomic_inc(&fs_info->balance_pause_req);
2818                 mutex_unlock(&fs_info->balance_mutex);
2819
2820                 wait_event(fs_info->balance_wait_q,
2821                            atomic_read(&fs_info->balance_running) == 0);
2822
2823                 mutex_lock(&fs_info->balance_mutex);
2824                 /* we are good with balance_ctl ripped off from under us */
2825                 BUG_ON(atomic_read(&fs_info->balance_running));
2826                 atomic_dec(&fs_info->balance_pause_req);
2827         } else {
2828                 ret = -ENOTCONN;
2829         }
2830
2831         mutex_unlock(&fs_info->balance_mutex);
2832         return ret;
2833 }
2834
2835 /*
2836  * shrinking a device means finding all of the device extents past
2837  * the new size, and then following the back refs to the chunks.
2838  * The chunk relocation code actually frees the device extents.
2839  */
2840 int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
2841 {
2842         struct btrfs_trans_handle *trans;
2843         struct btrfs_root *root = device->dev_root;
2844         struct btrfs_dev_extent *dev_extent = NULL;
2845         struct btrfs_path *path;
2846         u64 length;
2847         u64 chunk_tree;
2848         u64 chunk_objectid;
2849         u64 chunk_offset;
2850         int ret;
2851         int slot;
2852         int failed = 0;
2853         bool retried = false;
2854         struct extent_buffer *l;
2855         struct btrfs_key key;
2856         struct btrfs_super_block *super_copy = root->fs_info->super_copy;
2857         u64 old_total = btrfs_super_total_bytes(super_copy);
2858         u64 old_size = device->total_bytes;
2859         u64 diff = device->total_bytes - new_size;
2860
2861         if (new_size >= device->total_bytes)
2862                 return -EINVAL;
2863
2864         path = btrfs_alloc_path();
2865         if (!path)
2866                 return -ENOMEM;
2867
2868         path->reada = 2;
2869
2870         lock_chunks(root);
2871
2872         device->total_bytes = new_size;
2873         if (device->writeable) {
2874                 device->fs_devices->total_rw_bytes -= diff;
2875                 spin_lock(&root->fs_info->free_chunk_lock);
2876                 root->fs_info->free_chunk_space -= diff;
2877                 spin_unlock(&root->fs_info->free_chunk_lock);
2878         }
2879         unlock_chunks(root);
2880
2881 again:
2882         key.objectid = device->devid;
2883         key.offset = (u64)-1;
2884         key.type = BTRFS_DEV_EXTENT_KEY;
2885
2886         while (1) {
2887                 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2888                 if (ret < 0)
2889                         goto done;
2890
2891                 ret = btrfs_previous_item(root, path, 0, key.type);
2892                 if (ret < 0)
2893                         goto done;
2894                 if (ret) {
2895                         ret = 0;
2896                         btrfs_release_path(path);
2897                         break;
2898                 }
2899
2900                 l = path->nodes[0];
2901                 slot = path->slots[0];
2902                 btrfs_item_key_to_cpu(l, &key, path->slots[0]);
2903
2904                 if (key.objectid != device->devid) {
2905                         btrfs_release_path(path);
2906                         break;
2907                 }
2908
2909                 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
2910                 length = btrfs_dev_extent_length(l, dev_extent);
2911
2912                 if (key.offset + length <= new_size) {
2913                         btrfs_release_path(path);
2914                         break;
2915                 }
2916
2917                 chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent);
2918                 chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent);
2919                 chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
2920                 btrfs_release_path(path);
2921
2922                 ret = btrfs_relocate_chunk(root, chunk_tree, chunk_objectid,
2923                                            chunk_offset);
2924                 if (ret && ret != -ENOSPC)
2925                         goto done;
2926                 if (ret == -ENOSPC)
2927                         failed++;
2928                 key.offset -= 1;
2929         }
2930
2931         if (failed && !retried) {
2932                 failed = 0;
2933                 retried = true;
2934                 goto again;
2935         } else if (failed && retried) {
2936                 ret = -ENOSPC;
2937                 lock_chunks(root);
2938
2939                 device->total_bytes = old_size;
2940                 if (device->writeable)
2941                         device->fs_devices->total_rw_bytes += diff;
2942                 spin_lock(&root->fs_info->free_chunk_lock);
2943                 root->fs_info->free_chunk_space += diff;
2944                 spin_unlock(&root->fs_info->free_chunk_lock);
2945                 unlock_chunks(root);
2946                 goto done;
2947         }
2948
2949         /* Shrinking succeeded, else we would be at "done". */
2950         trans = btrfs_start_transaction(root, 0);
2951         if (IS_ERR(trans)) {
2952                 ret = PTR_ERR(trans);
2953                 goto done;
2954         }
2955
2956         lock_chunks(root);
2957
2958         device->disk_total_bytes = new_size;
2959         /* Now btrfs_update_device() will change the on-disk size. */
2960         ret = btrfs_update_device(trans, device);
2961         if (ret) {
2962                 unlock_chunks(root);
2963                 btrfs_end_transaction(trans, root);
2964                 goto done;
2965         }
2966         WARN_ON(diff > old_total);
2967         btrfs_set_super_total_bytes(super_copy, old_total - diff);
2968         unlock_chunks(root);
2969         btrfs_end_transaction(trans, root);
2970 done:
2971         btrfs_free_path(path);
2972         return ret;
2973 }
2974
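/*
 * Append a (key, chunk) pair to the sys_chunk_array in the super block,
 * failing with -EFBIG if it no longer fits.
 */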
2975 static int btrfs_add_system_chunk(struct btrfs_trans_handle *trans,
2976                            struct btrfs_root *root,
2977                            struct btrfs_key *key,
2978                            struct btrfs_chunk *chunk, int item_size)
2979 {
2980         struct btrfs_super_block *super_copy = root->fs_info->super_copy;
2981         struct btrfs_disk_key disk_key;
2982         u32 array_size;
2983         u8 *ptr;
2984
2985         array_size = btrfs_super_sys_array_size(super_copy);
2986         if (array_size + item_size + sizeof(disk_key) >
                 BTRFS_SYSTEM_CHUNK_ARRAY_SIZE)
2987                 return -EFBIG;
2988
2989         ptr = super_copy->sys_chunk_array + array_size;
2990         btrfs_cpu_key_to_disk(&disk_key, key);
2991         memcpy(ptr, &disk_key, sizeof(disk_key));
2992         ptr += sizeof(disk_key);
2993         memcpy(ptr, chunk, item_size);
2994         item_size += sizeof(disk_key);
2995         btrfs_set_super_sys_array_size(super_copy, array_size + item_size);
2996         return 0;
2997 }
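
/*
 * Layout note (editorial sketch): sys_chunk_array is a packed run of
 * (disk key, chunk item) pairs inside the superblock, capped at
 * BTRFS_SYSTEM_CHUNK_ARRAY_SIZE; hence the -EFBIG above.  The written
 * size is item_size plus sizeof(disk_key), which is why the size check
 * must account for the key as well.  btrfs_read_sys_array() walks the
 * pairs back with the same stride.
 */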
2998
2999 /*
3000  * sort the devices in descending order by max_avail, total_avail
3001  */
3002 static int btrfs_cmp_device_info(const void *a, const void *b)
3003 {
3004         const struct btrfs_device_info *di_a = a;
3005         const struct btrfs_device_info *di_b = b;
3006
3007         if (di_a->max_avail > di_b->max_avail)
3008                 return -1;
3009         if (di_a->max_avail < di_b->max_avail)
3010                 return 1;
3011         if (di_a->total_avail > di_b->total_avail)
3012                 return -1;
3013         if (di_a->total_avail < di_b->total_avail)
3014                 return 1;
3015         return 0;
3016 }
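
/*
 * Design note (editorial): sorting in descending order means the
 * smallest usable hole among the devices that get picked ends up at
 * devices_info[ndevs - 1], which is what bounds stripe_size in
 * __btrfs_alloc_chunk() below.
 */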
3017
3018 static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
3019                                struct btrfs_root *extent_root,
3020                                struct map_lookup **map_ret,
3021                                u64 *num_bytes_out, u64 *stripe_size_out,
3022                                u64 start, u64 type)
3023 {
3024         struct btrfs_fs_info *info = extent_root->fs_info;
3025         struct btrfs_fs_devices *fs_devices = info->fs_devices;
3026         struct list_head *cur;
3027         struct map_lookup *map = NULL;
3028         struct extent_map_tree *em_tree;
3029         struct extent_map *em;
3030         struct btrfs_device_info *devices_info = NULL;
3031         u64 total_avail;
3032         int num_stripes;        /* total number of stripes to allocate */
3033         int sub_stripes;        /* sub_stripes info for map */
3034         int dev_stripes;        /* stripes per dev */
3035         int devs_max;           /* max devs to use */
3036         int devs_min;           /* min devs needed */
3037         int devs_increment;     /* ndevs has to be a multiple of this */
3038         int ncopies;            /* how many copies of the data */
3039         int ret;
3040         u64 max_stripe_size;
3041         u64 max_chunk_size;
3042         u64 stripe_size;
3043         u64 num_bytes;
3044         int ndevs;
3045         int i;
3046         int j;
3047
3048         if ((type & BTRFS_BLOCK_GROUP_RAID1) &&
3049             (type & BTRFS_BLOCK_GROUP_DUP)) {
3050                 WARN_ON(1);
3051                 type &= ~BTRFS_BLOCK_GROUP_DUP;
3052         }
3053
3054         if (list_empty(&fs_devices->alloc_list))
3055                 return -ENOSPC;
3056
3057         sub_stripes = 1;
3058         dev_stripes = 1;
3059         devs_increment = 1;
3060         ncopies = 1;
3061         devs_max = 0;   /* 0 == as many as possible */
3062         devs_min = 1;
3063
3064         /*
3065          * define the properties of each RAID type.
3066          * FIXME: move this to a global table and use it in all RAID
3067          * calculation code
3068          */
3069         if (type & (BTRFS_BLOCK_GROUP_DUP)) {
3070                 dev_stripes = 2;
3071                 ncopies = 2;
3072                 devs_max = 1;
3073         } else if (type & (BTRFS_BLOCK_GROUP_RAID0)) {
3074                 devs_min = 2;
3075         } else if (type & (BTRFS_BLOCK_GROUP_RAID1)) {
3076                 devs_increment = 2;
3077                 ncopies = 2;
3078                 devs_max = 2;
3079                 devs_min = 2;
3080         } else if (type & (BTRFS_BLOCK_GROUP_RAID10)) {
3081                 sub_stripes = 2;
3082                 devs_increment = 2;
3083                 ncopies = 2;
3084                 devs_min = 4;
3085         } else {
3086                 devs_max = 1;
3087         }
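
        /*
         * A sketch of the global table the FIXME above asks for
         * (hypothetical, not part of this tree), one row per profile,
         * mirroring the assignments above:
         *
         *      { sub_stripes, dev_stripes, devs_max, devs_min,
         *        devs_increment, ncopies }
         *      DUP:    { 1, 2, 1, 1, 1, 2 }
         *      RAID0:  { 1, 1, 0, 2, 1, 1 }
         *      RAID1:  { 1, 1, 2, 2, 2, 2 }
         *      RAID10: { 2, 1, 0, 4, 2, 2 }
         *      SINGLE: { 1, 1, 1, 1, 1, 1 }
         */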
3088
3089         if (type & BTRFS_BLOCK_GROUP_DATA) {
3090                 max_stripe_size = 1024 * 1024 * 1024;
3091                 max_chunk_size = 10 * max_stripe_size;
3092         } else if (type & BTRFS_BLOCK_GROUP_METADATA) {
3093                 max_stripe_size = 256 * 1024 * 1024;
3094                 max_chunk_size = max_stripe_size;
3095         } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
3096                 max_stripe_size = 8 * 1024 * 1024;
3097                 max_chunk_size = 2 * max_stripe_size;
3098         } else {
3099                 printk(KERN_ERR "btrfs: invalid chunk type 0x%llx requested\n",
3100                        (unsigned long long)type);
3101                 BUG_ON(1);
3102         }
3103
3104         /* we don't want a chunk larger than 10% of writeable space */
3105         max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1),
3106                              max_chunk_size);
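        /*
         * div_factor(n, f) returns n * f / 10 (see ctree.h), so a
         * factor of 1 implements the 10% cap described above.
         */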
3107
3108         devices_info = kzalloc(sizeof(*devices_info) * fs_devices->rw_devices,
3109                                GFP_NOFS);
3110         if (!devices_info)
3111                 return -ENOMEM;
3112
3113         cur = fs_devices->alloc_list.next;
3114
3115         /*
3116          * in the first pass through the devices list, we gather information
3117          * about the available holes on each device.
3118          */
3119         ndevs = 0;
3120         while (cur != &fs_devices->alloc_list) {
3121                 struct btrfs_device *device;
3122                 u64 max_avail;
3123                 u64 dev_offset;
3124
3125                 device = list_entry(cur, struct btrfs_device, dev_alloc_list);
3126
3127                 cur = cur->next;
3128
3129                 if (!device->writeable) {
3130                         printk(KERN_ERR
3131                                "btrfs: read-only device in alloc_list\n");
3132                         WARN_ON(1);
3133                         continue;
3134                 }
3135
3136                 if (!device->in_fs_metadata)
3137                         continue;
3138
3139                 if (device->total_bytes > device->bytes_used)
3140                         total_avail = device->total_bytes - device->bytes_used;
3141                 else
3142                         total_avail = 0;
3143
3144                 /* If there is no space on this device, skip it. */
3145                 if (total_avail == 0)
3146                         continue;
3147
3148                 ret = find_free_dev_extent(trans, device,
3149                                            max_stripe_size * dev_stripes,
3150                                            &dev_offset, &max_avail);
3151                 if (ret && ret != -ENOSPC)
3152                         goto error;
3153
3154                 if (ret == 0)
3155                         max_avail = max_stripe_size * dev_stripes;
3156
3157                 if (max_avail < BTRFS_STRIPE_LEN * dev_stripes)
3158                         continue;
3159
3160                 devices_info[ndevs].dev_offset = dev_offset;
3161                 devices_info[ndevs].max_avail = max_avail;
3162                 devices_info[ndevs].total_avail = total_avail;
3163                 devices_info[ndevs].dev = device;
3164                 ++ndevs;
3165         }
3166
3167         /*
3168          * now sort the devices by hole size / available space
3169          */
3170         sort(devices_info, ndevs, sizeof(struct btrfs_device_info),
3171              btrfs_cmp_device_info, NULL);
3172
3173         /* round down to number of usable stripes */
3174         ndevs -= ndevs % devs_increment;
3175
3176         if (ndevs < devs_increment * sub_stripes || ndevs < devs_min) {
3177                 ret = -ENOSPC;
3178                 goto error;
3179         }
3180
3181         if (devs_max && ndevs > devs_max)
3182                 ndevs = devs_max;
3183         /*
3184          * the primary goal is to maximize the number of stripes, so use as many
3185          * devices as possible, even if the stripes are not maximum sized.
3186          */
3187         stripe_size = devices_info[ndevs-1].max_avail;
3188         num_stripes = ndevs * dev_stripes;
3189
3190         if (stripe_size * num_stripes > max_chunk_size * ncopies) {
3191                 stripe_size = max_chunk_size * ncopies;
3192                 do_div(stripe_size, num_stripes);
3193         }
3194
3195         do_div(stripe_size, dev_stripes);
3196         do_div(stripe_size, BTRFS_STRIPE_LEN);
3197         stripe_size *= BTRFS_STRIPE_LEN;
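
        /*
         * Worked example (illustrative): with BTRFS_STRIPE_LEN = 64K,
         * a stripe_size of 1000000 bytes rounds down to 15 * 64K =
         * 983040, so each dev extent holds a whole number of stripes.
         */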
3198
3199         map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
3200         if (!map) {
3201                 ret = -ENOMEM;
3202                 goto error;
3203         }
3204         map->num_stripes = num_stripes;
3205
3206         for (i = 0; i < ndevs; ++i) {
3207                 for (j = 0; j < dev_stripes; ++j) {
3208                         int s = i * dev_stripes + j;
3209                         map->stripes[s].dev = devices_info[i].dev;
3210                         map->stripes[s].physical = devices_info[i].dev_offset +
3211                                                    j * stripe_size;
3212                 }
3213         }
3214         map->sector_size = extent_root->sectorsize;
3215         map->stripe_len = BTRFS_STRIPE_LEN;
3216         map->io_align = BTRFS_STRIPE_LEN;
3217         map->io_width = BTRFS_STRIPE_LEN;
3218         map->type = type;
3219         map->sub_stripes = sub_stripes;
3220
3221         *map_ret = map;
3222         num_bytes = stripe_size * (num_stripes / ncopies);
3223
3224         *stripe_size_out = stripe_size;
3225         *num_bytes_out = num_bytes;
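
        /*
         * Example (illustrative): RAID1 on two devices has
         * num_stripes = 2 and ncopies = 2, so the chunk's logical size
         * equals stripe_size; RAID0 on the same two devices has
         * ncopies = 1 and exposes 2 * stripe_size.
         */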
3226
3227         trace_btrfs_chunk_alloc(info->chunk_root, map, start, num_bytes);
3228
3229         em = alloc_extent_map();
3230         if (!em) {
3231                 ret = -ENOMEM;
3232                 goto error;
3233         }
3234         em->bdev = (struct block_device *)map;
3235         em->start = start;
3236         em->len = num_bytes;
3237         em->block_start = 0;
3238         em->block_len = em->len;
3239
3240         em_tree = &extent_root->fs_info->mapping_tree.map_tree;
3241         write_lock(&em_tree->lock);
3242         ret = add_extent_mapping(em_tree, em);
3243         write_unlock(&em_tree->lock);
3244         BUG_ON(ret);
3245         free_extent_map(em);
3246
3247         ret = btrfs_make_block_group(trans, extent_root, 0, type,
3248                                      BTRFS_FIRST_CHUNK_TREE_OBJECTID,
3249                                      start, num_bytes);
3250         BUG_ON(ret);
3251
3252         for (i = 0; i < map->num_stripes; ++i) {
3253                 struct btrfs_device *device;
3254                 u64 dev_offset;
3255
3256                 device = map->stripes[i].dev;
3257                 dev_offset = map->stripes[i].physical;
3258
3259                 ret = btrfs_alloc_dev_extent(trans, device,
3260                                 info->chunk_root->root_key.objectid,
3261                                 BTRFS_FIRST_CHUNK_TREE_OBJECTID,
3262                                 start, dev_offset, stripe_size);
3263                 BUG_ON(ret);
3264         }
3265
3266         kfree(devices_info);
3267         return 0;
3268
3269 error:
3270         kfree(map);
3271         kfree(devices_info);
3272         return ret;
3273 }
3274
3275 static int __finish_chunk_alloc(struct btrfs_trans_handle *trans,
3276                                 struct btrfs_root *extent_root,
3277                                 struct map_lookup *map, u64 chunk_offset,
3278                                 u64 chunk_size, u64 stripe_size)
3279 {
3280         u64 dev_offset;
3281         struct btrfs_key key;
3282         struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root;
3283         struct btrfs_device *device;
3284         struct btrfs_chunk *chunk;
3285         struct btrfs_stripe *stripe;
3286         size_t item_size = btrfs_chunk_item_size(map->num_stripes);
3287         int index = 0;
3288         int ret;
3289
3290         chunk = kzalloc(item_size, GFP_NOFS);
3291         if (!chunk)
3292                 return -ENOMEM;
3293
3294         index = 0;
3295         while (index < map->num_stripes) {
3296                 device = map->stripes[index].dev;
3297                 device->bytes_used += stripe_size;
3298                 ret = btrfs_update_device(trans, device);
3299                 BUG_ON(ret);
3300                 index++;
3301         }
3302
3303         spin_lock(&extent_root->fs_info->free_chunk_lock);
3304         extent_root->fs_info->free_chunk_space -= (stripe_size *
3305                                                    map->num_stripes);
3306         spin_unlock(&extent_root->fs_info->free_chunk_lock);
3307
3308         index = 0;
3309         stripe = &chunk->stripe;
3310         while (index < map->num_stripes) {
3311                 device = map->stripes[index].dev;
3312                 dev_offset = map->stripes[index].physical;
3313
3314                 btrfs_set_stack_stripe_devid(stripe, device->devid);
3315                 btrfs_set_stack_stripe_offset(stripe, dev_offset);
3316                 memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE);
3317                 stripe++;
3318                 index++;
3319         }
3320
3321         btrfs_set_stack_chunk_length(chunk, chunk_size);
3322         btrfs_set_stack_chunk_owner(chunk, extent_root->root_key.objectid);
3323         btrfs_set_stack_chunk_stripe_len(chunk, map->stripe_len);
3324         btrfs_set_stack_chunk_type(chunk, map->type);
3325         btrfs_set_stack_chunk_num_stripes(chunk, map->num_stripes);
3326         btrfs_set_stack_chunk_io_align(chunk, map->stripe_len);
3327         btrfs_set_stack_chunk_io_width(chunk, map->stripe_len);
3328         btrfs_set_stack_chunk_sector_size(chunk, extent_root->sectorsize);
3329         btrfs_set_stack_chunk_sub_stripes(chunk, map->sub_stripes);
3330
3331         key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
3332         key.type = BTRFS_CHUNK_ITEM_KEY;
3333         key.offset = chunk_offset;
3334
3335         ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size);
3336         BUG_ON(ret);
3337
3338         if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
3339                 ret = btrfs_add_system_chunk(trans, chunk_root, &key, chunk,
3340                                              item_size);
3341                 BUG_ON(ret);
3342         }
3343
3344         kfree(chunk);
3345         return 0;
3346 }
3347
3348 /*
3349  * Chunk allocation falls into two parts. The first part does the work
3350  * that makes the newly allocated chunk usable but does not modify the
3351  * chunk tree. The second part does the work that requires modifying
3352  * the chunk tree. This division is important for the bootstrap process
3353  * of adding storage to a seed btrfs.
3354  */
3355 int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
3356                       struct btrfs_root *extent_root, u64 type)
3357 {
3358         u64 chunk_offset;
3359         u64 chunk_size;
3360         u64 stripe_size;
3361         struct map_lookup *map;
3362         struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root;
3363         int ret;
3364
3365         ret = find_next_chunk(chunk_root, BTRFS_FIRST_CHUNK_TREE_OBJECTID,
3366                               &chunk_offset);
3367         if (ret)
3368                 return ret;
3369
3370         ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size,
3371                                   &stripe_size, chunk_offset, type);
3372         if (ret)
3373                 return ret;
3374
3375         ret = __finish_chunk_alloc(trans, extent_root, map, chunk_offset,
3376                                    chunk_size, stripe_size);
3377         BUG_ON(ret);
3378         return 0;
3379 }
3380
3381 static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,
3382                                          struct btrfs_root *root,
3383                                          struct btrfs_device *device)
3384 {
3385         u64 chunk_offset;
3386         u64 sys_chunk_offset;
3387         u64 chunk_size;
3388         u64 sys_chunk_size;
3389         u64 stripe_size;
3390         u64 sys_stripe_size;
3391         u64 alloc_profile;
3392         struct map_lookup *map;
3393         struct map_lookup *sys_map;
3394         struct btrfs_fs_info *fs_info = root->fs_info;
3395         struct btrfs_root *extent_root = fs_info->extent_root;
3396         int ret;
3397
3398         ret = find_next_chunk(fs_info->chunk_root,
3399                               BTRFS_FIRST_CHUNK_TREE_OBJECTID, &chunk_offset);
3400         if (ret)
3401                 return ret;
3402
3403         alloc_profile = BTRFS_BLOCK_GROUP_METADATA |
3404                                 fs_info->avail_metadata_alloc_bits;
3405         alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile);
3406
3407         ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size,
3408                                   &stripe_size, chunk_offset, alloc_profile);
3409         BUG_ON(ret);
3410
3411         sys_chunk_offset = chunk_offset + chunk_size;
3412
3413         alloc_profile = BTRFS_BLOCK_GROUP_SYSTEM |
3414                                 fs_info->avail_system_alloc_bits;
3415         alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile);
3416
3417         ret = __btrfs_alloc_chunk(trans, extent_root, &sys_map,
3418                                   &sys_chunk_size, &sys_stripe_size,
3419                                   sys_chunk_offset, alloc_profile);
3420         BUG_ON(ret);
3421
3422         ret = btrfs_add_device(trans, fs_info->chunk_root, device);
3423         BUG_ON(ret);
3424
3425         /*
3426          * Modifying the chunk tree requires allocating new blocks from
3427          * both the system and the metadata block groups, so operations
3428          * that modify the chunk tree can only be done after both block
3429          * groups have been created.
3430          */
3431         ret = __finish_chunk_alloc(trans, extent_root, map, chunk_offset,
3432                                    chunk_size, stripe_size);
3433         BUG_ON(ret);
3434
3435         ret = __finish_chunk_alloc(trans, extent_root, sys_map,
3436                                    sys_chunk_offset, sys_chunk_size,
3437                                    sys_stripe_size);
3438         BUG_ON(ret);
3439         return 0;
3440 }
3441
3442 int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset)
3443 {
3444         struct extent_map *em;
3445         struct map_lookup *map;
3446         struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree;
3447         int readonly = 0;
3448         int i;
3449
3450         read_lock(&map_tree->map_tree.lock);
3451         em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1);
3452         read_unlock(&map_tree->map_tree.lock);
3453         if (!em)
3454                 return 1;
3455
3456         if (btrfs_test_opt(root, DEGRADED)) {
3457                 free_extent_map(em);
3458                 return 0;
3459         }
3460
3461         map = (struct map_lookup *)em->bdev;
3462         for (i = 0; i < map->num_stripes; i++) {
3463                 if (!map->stripes[i].dev->writeable) {
3464                         readonly = 1;
3465                         break;
3466                 }
3467         }
3468         free_extent_map(em);
3469         return readonly;
3470 }
3471
3472 void btrfs_mapping_init(struct btrfs_mapping_tree *tree)
3473 {
3474         extent_map_tree_init(&tree->map_tree);
3475 }
3476
3477 void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree)
3478 {
3479         struct extent_map *em;
3480
3481         while (1) {
3482                 write_lock(&tree->map_tree.lock);
3483                 em = lookup_extent_mapping(&tree->map_tree, 0, (u64)-1);
3484                 if (em)
3485                         remove_extent_mapping(&tree->map_tree, em);
3486                 write_unlock(&tree->map_tree.lock);
3487                 if (!em)
3488                         break;
3489                 kfree(em->bdev);
3490                 /* once for us */
3491                 free_extent_map(em);
3492                 /* once for the tree */
3493                 free_extent_map(em);
3494         }
3495 }
3496
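/*
 * Return how many copies exist of the data at @logical: num_stripes
 * for RAID1/DUP, sub_stripes for RAID10, and 1 for RAID0 and single.
 */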
3497 int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len)
3498 {
3499         struct extent_map *em;
3500         struct map_lookup *map;
3501         struct extent_map_tree *em_tree = &map_tree->map_tree;
3502         int ret;
3503
3504         read_lock(&em_tree->lock);
3505         em = lookup_extent_mapping(em_tree, logical, len);
3506         read_unlock(&em_tree->lock);
3507         BUG_ON(!em);
3508
3509         BUG_ON(em->start > logical || em->start + em->len < logical);
3510         map = (struct map_lookup *)em->bdev;
3511         if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1))
3512                 ret = map->num_stripes;
3513         else if (map->type & BTRFS_BLOCK_GROUP_RAID10)
3514                 ret = map->sub_stripes;
3515         else
3516                 ret = 1;
3517         free_extent_map(em);
3518         return ret;
3519 }
3520
3521 static int find_live_mirror(struct map_lookup *map, int first, int num,
3522                             int optimal)
3523 {
3524         int i;
3525         if (map->stripes[optimal].dev->bdev)
3526                 return optimal;
3527         for (i = first; i < first + num; i++) {
3528                 if (map->stripes[i].dev->bdev)
3529                         return i;
3530         }
3531         /* we couldn't find one that hasn't failed.  Just return something
3532          * and the io error handling code will clean up eventually
3533          */
3534         return optimal;
3535 }
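
/*
 * Note (editorial): callers pass current->pid % num_stripes as
 * @optimal, a cheap way of spreading read load across mirrors without
 * keeping any per-device state.
 */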
3536
3537 static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
3538                              u64 logical, u64 *length,
3539                              struct btrfs_bio **bbio_ret,
3540                              int mirror_num)
3541 {
3542         struct extent_map *em;
3543         struct map_lookup *map;
3544         struct extent_map_tree *em_tree = &map_tree->map_tree;
3545         u64 offset;
3546         u64 stripe_offset;
3547         u64 stripe_end_offset;
3548         u64 stripe_nr;
3549         u64 stripe_nr_orig;
3550         u64 stripe_nr_end;
3551         int stripes_allocated = 8;
3552         int stripes_required = 1;
3553         int stripe_index;
3554         int i;
3555         int num_stripes;
3556         int max_errors = 0;
3557         struct btrfs_bio *bbio = NULL;
3558
3559         if (bbio_ret && !(rw & (REQ_WRITE | REQ_DISCARD)))
3560                 stripes_allocated = 1;
3561 again:
3562         if (bbio_ret) {
3563                 bbio = kzalloc(btrfs_bio_size(stripes_allocated),
3564                                 GFP_NOFS);
3565                 if (!bbio)
3566                         return -ENOMEM;
3567
3568                 atomic_set(&bbio->error, 0);
3569         }
3570
3571         read_lock(&em_tree->lock);
3572         em = lookup_extent_mapping(em_tree, logical, *length);
3573         read_unlock(&em_tree->lock);
3574
3575         if (!em) {
3576                 printk(KERN_CRIT "unable to find logical %llu len %llu\n",
3577                        (unsigned long long)logical,
3578                        (unsigned long long)*length);
3579                 BUG();
3580         }
3581
3582         BUG_ON(em->start > logical || em->start + em->len < logical);
3583         map = (struct map_lookup *)em->bdev;
3584         offset = logical - em->start;
3585
3586         if (mirror_num > map->num_stripes)
3587                 mirror_num = 0;
3588
3589         /* if our btrfs_bio struct is too small, back off and try again */
3590         if (rw & REQ_WRITE) {
3591                 if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
3592                                  BTRFS_BLOCK_GROUP_DUP)) {
3593                         stripes_required = map->num_stripes;
3594                         max_errors = 1;
3595                 } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
3596                         stripes_required = map->sub_stripes;
3597                         max_errors = 1;
3598                 }
3599         }
3600         if (rw & REQ_DISCARD) {
3601                 if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK)
3602                         stripes_required = map->num_stripes;
3603         }
3604         if (bbio_ret && (rw & (REQ_WRITE | REQ_DISCARD)) &&
3605             stripes_allocated < stripes_required) {
3606                 stripes_allocated = map->num_stripes;
3607                 free_extent_map(em);
3608                 kfree(bbio);
3609                 goto again;
3610         }
3611         stripe_nr = offset;
3612         /*
3613          * stripe_nr counts the total number of stripes we have to stride
3614          * to get to this block
3615          */
3616         do_div(stripe_nr, map->stripe_len);
3617
3618         stripe_offset = stripe_nr * map->stripe_len;
3619         BUG_ON(offset < stripe_offset);
3620
3621         /* stripe_offset is the offset of this block in its stripe */
3622         stripe_offset = offset - stripe_offset;
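
        /*
         * Worked example (illustrative): with stripe_len = 64K and
         * offset = 200K, stripe_nr ends up 3 and stripe_offset 8K.
         */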
3623
3624         if (rw & REQ_DISCARD)
3625                 *length = min_t(u64, em->len - offset, *length);
3626         else if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
3627                 /* we limit the length of each bio to what fits in a stripe */
3628                 *length = min_t(u64, em->len - offset,
3629                                 map->stripe_len - stripe_offset);
3630         } else {
3631                 *length = em->len - offset;
3632         }
3633
3634         if (!bbio_ret)
3635                 goto out;
3636
3637         num_stripes = 1;
3638         stripe_index = 0;
3639         stripe_nr_orig = stripe_nr;
3640         stripe_nr_end = (offset + *length + map->stripe_len - 1) &
3641                         (~(map->stripe_len - 1));
3642         do_div(stripe_nr_end, map->stripe_len);
3643         stripe_end_offset = stripe_nr_end * map->stripe_len -
3644                             (offset + *length);
3645         if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
3646                 if (rw & REQ_DISCARD)
3647                         num_stripes = min_t(u64, map->num_stripes,
3648                                             stripe_nr_end - stripe_nr_orig);
3649                 stripe_index = do_div(stripe_nr, map->num_stripes);
3650         } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
3651                 if (rw & (REQ_WRITE | REQ_DISCARD))
3652                         num_stripes = map->num_stripes;
3653                 else if (mirror_num)
3654                         stripe_index = mirror_num - 1;
3655                 else {
3656                         stripe_index = find_live_mirror(map, 0,
3657                                             map->num_stripes,
3658                                             current->pid % map->num_stripes);
3659                         mirror_num = stripe_index + 1;
3660                 }
3661
3662         } else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
3663                 if (rw & (REQ_WRITE | REQ_DISCARD)) {
3664                         num_stripes = map->num_stripes;
3665                 } else if (mirror_num) {
3666                         stripe_index = mirror_num - 1;
3667                 } else {
3668                         mirror_num = 1;
3669                 }
3670
3671         } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
3672                 int factor = map->num_stripes / map->sub_stripes;
3673
3674                 stripe_index = do_div(stripe_nr, factor);
3675                 stripe_index *= map->sub_stripes;
3676
3677                 if (rw & REQ_WRITE)
3678                         num_stripes = map->sub_stripes;
3679                 else if (rw & REQ_DISCARD)
3680                         num_stripes = min_t(u64, map->sub_stripes *
3681                                             (stripe_nr_end - stripe_nr_orig),
3682                                             map->num_stripes);
3683                 else if (mirror_num)
3684                         stripe_index += mirror_num - 1;
3685                 else {
3686                         stripe_index = find_live_mirror(map, stripe_index,
3687                                               map->sub_stripes, stripe_index +
3688                                               current->pid % map->sub_stripes);
3689                         mirror_num = stripe_index + 1;
3690                 }
3691         } else {
3692                 /*
3693                  * after this do_div call, stripe_nr is the number of stripes
3694                  * on this device we have to walk to find the data, and
3695                  * stripe_index is the number of our device in the stripe array
3696                  */
3697                 stripe_index = do_div(stripe_nr, map->num_stripes);
3698                 mirror_num = stripe_index + 1;
3699         }
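
        /*
         * Worked RAID10 example (illustrative): num_stripes = 4 and
         * sub_stripes = 2 give factor = 2.  For stripe_nr = 5,
         * do_div() leaves stripe_nr = 2 with remainder 1, so
         * stripe_index = 1 * sub_stripes = 2, i.e. the second mirror
         * pair (stripes 2 and 3).
         */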
3700         BUG_ON(stripe_index >= map->num_stripes);
3701
3702         if (rw & REQ_DISCARD) {
3703                 for (i = 0; i < num_stripes; i++) {
3704                         bbio->stripes[i].physical =
3705                                 map->stripes[stripe_index].physical +
3706                                 stripe_offset + stripe_nr * map->stripe_len;
3707                         bbio->stripes[i].dev = map->stripes[stripe_index].dev;
3708
3709                         if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
3710                                 u64 stripes;
3711                                 u32 last_stripe = 0;
3712                                 int j;
3713
3714                                 div_u64_rem(stripe_nr_end - 1,
3715                                             map->num_stripes,
3716                                             &last_stripe);
3717
3718                                 for (j = 0; j < map->num_stripes; j++) {
3719                                         u32 test;
3720
3721                                         div_u64_rem(stripe_nr_end - 1 - j,
3722                                                     map->num_stripes, &test);
3723                                         if (test == stripe_index)
3724                                                 break;
3725                                 }
3726                                 stripes = stripe_nr_end - 1 - j;
3727                                 do_div(stripes, map->num_stripes);
3728                                 bbio->stripes[i].length = map->stripe_len *
3729                                         (stripes - stripe_nr + 1);
3730
3731                                 if (i == 0) {
3732                                         bbio->stripes[i].length -=
3733                                                 stripe_offset;
3734                                         stripe_offset = 0;
3735                                 }
3736                                 if (stripe_index == last_stripe)
3737                                         bbio->stripes[i].length -=
3738                                                 stripe_end_offset;
3739                         } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
3740                                 u64 stripes;
3741                                 int j;
3742                                 int factor = map->num_stripes /
3743                                              map->sub_stripes;
3744                                 u32 last_stripe = 0;
3745
3746                                 div_u64_rem(stripe_nr_end - 1,
3747                                             factor, &last_stripe);
3748                                 last_stripe *= map->sub_stripes;
3749
3750                                 for (j = 0; j < factor; j++) {
3751                                         u32 test;
3752
3753                                         div_u64_rem(stripe_nr_end - 1 - j,
3754                                                     factor, &test);
3755
3756                                         if (test ==
3757                                             stripe_index / map->sub_stripes)
3758                                                 break;
3759                                 }
3760                                 stripes = stripe_nr_end - 1 - j;
3761                                 do_div(stripes, factor);
3762                                 bbio->stripes[i].length = map->stripe_len *
3763                                         (stripes - stripe_nr + 1);
3764
3765                                 if (i < map->sub_stripes) {
3766                                         bbio->stripes[i].length -=
3767                                                 stripe_offset;
3768                                         if (i == map->sub_stripes - 1)
3769                                                 stripe_offset = 0;
3770                                 }
3771                                 if (stripe_index >= last_stripe &&
3772                                     stripe_index <= (last_stripe +
3773                                                      map->sub_stripes - 1)) {
3774                                         bbio->stripes[i].length -=
3775                                                 stripe_end_offset;
3776                                 }
3777                         } else
3778                                 bbio->stripes[i].length = *length;
3779
3780                         stripe_index++;
3781                         if (stripe_index == map->num_stripes) {
3782                                 /* This could only happen for RAID0/10 */
3783                                 stripe_index = 0;
3784                                 stripe_nr++;
3785                         }
3786                 }
3787         } else {
3788                 for (i = 0; i < num_stripes; i++) {
3789                         bbio->stripes[i].physical =
3790                                 map->stripes[stripe_index].physical +
3791                                 stripe_offset +
3792                                 stripe_nr * map->stripe_len;
3793                         bbio->stripes[i].dev =
3794                                 map->stripes[stripe_index].dev;
3795                         stripe_index++;
3796                 }
3797         }
3798         if (bbio_ret) {
3799                 *bbio_ret = bbio;
3800                 bbio->num_stripes = num_stripes;
3801                 bbio->max_errors = max_errors;
3802                 bbio->mirror_num = mirror_num;
3803         }
3804 out:
3805         free_extent_map(em);
3806         return 0;
3807 }
3808
3809 int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
3810                       u64 logical, u64 *length,
3811                       struct btrfs_bio **bbio_ret, int mirror_num)
3812 {
3813         return __btrfs_map_block(map_tree, rw, logical, length, bbio_ret,
3814                                  mirror_num);
3815 }
3816
3817 int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
3818                      u64 chunk_start, u64 physical, u64 devid,
3819                      u64 **logical, int *naddrs, int *stripe_len)
3820 {
3821         struct extent_map_tree *em_tree = &map_tree->map_tree;
3822         struct extent_map *em;
3823         struct map_lookup *map;
3824         u64 *buf;
3825         u64 bytenr;
3826         u64 length;
3827         u64 stripe_nr;
3828         int i, j, nr = 0;
3829
3830         read_lock(&em_tree->lock);
3831         em = lookup_extent_mapping(em_tree, chunk_start, 1);
3832         read_unlock(&em_tree->lock);
3833
3834         BUG_ON(!em || em->start != chunk_start);
3835         map = (struct map_lookup *)em->bdev;
3836
3837         length = em->len;
3838         if (map->type & BTRFS_BLOCK_GROUP_RAID10)
3839                 do_div(length, map->num_stripes / map->sub_stripes);
3840         else if (map->type & BTRFS_BLOCK_GROUP_RAID0)
3841                 do_div(length, map->num_stripes);
3842
3843         buf = kzalloc(sizeof(u64) * map->num_stripes, GFP_NOFS);
3844         BUG_ON(!buf);
3845
3846         for (i = 0; i < map->num_stripes; i++) {
3847                 if (devid && map->stripes[i].dev->devid != devid)
3848                         continue;
3849                 if (map->stripes[i].physical > physical ||
3850                     map->stripes[i].physical + length <= physical)
3851                         continue;
3852
3853                 stripe_nr = physical - map->stripes[i].physical;
3854                 do_div(stripe_nr, map->stripe_len);
3855
3856                 if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
3857                         stripe_nr = stripe_nr * map->num_stripes + i;
3858                         do_div(stripe_nr, map->sub_stripes);
3859                 } else if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
3860                         stripe_nr = stripe_nr * map->num_stripes + i;
3861                 }
3862                 bytenr = chunk_start + stripe_nr * map->stripe_len;
3863                 WARN_ON(nr >= map->num_stripes);
3864                 for (j = 0; j < nr; j++) {
3865                         if (buf[j] == bytenr)
3866                                 break;
3867                 }
3868                 if (j == nr) {
3869                         WARN_ON(nr >= map->num_stripes);
3870                         buf[nr++] = bytenr;
3871                 }
3872         }
3873
3874         *logical = buf;
3875         *naddrs = nr;
3876         *stripe_len = map->stripe_len;
3877
3878         free_extent_map(em);
3879         return 0;
3880 }
3881
3882 static void btrfs_end_bio(struct bio *bio, int err)
3883 {
3884         struct btrfs_bio *bbio = bio->bi_private;
3885         int is_orig_bio = 0;
3886
3887         if (err)
3888                 atomic_inc(&bbio->error);
3889
3890         if (bio == bbio->orig_bio)
3891                 is_orig_bio = 1;
3892
3893         if (atomic_dec_and_test(&bbio->stripes_pending)) {
3894                 if (!is_orig_bio) {
3895                         bio_put(bio);
3896                         bio = bbio->orig_bio;
3897                 }
3898                 bio->bi_private = bbio->private;
3899                 bio->bi_end_io = bbio->end_io;
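                /*
                 * Stash the mirror number in bi_bdev so the original
                 * end_io handler can recover which copy serviced the
                 * bio (the read-side end_io casts it back to an int).
                 */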
3900                 bio->bi_bdev = (struct block_device *)
3901                                         (unsigned long)bbio->mirror_num;
3902                 /* only send an error to the higher layers if it is
3903                  * beyond the tolerance of the multi-bio
3904                  */
3905                 if (atomic_read(&bbio->error) > bbio->max_errors) {
3906                         err = -EIO;
3907                 } else {
3908                         /*
3909                          * this bio is actually up to date, we didn't
3910                          * go over the max number of errors
3911                          */
3912                         set_bit(BIO_UPTODATE, &bio->bi_flags);
3913                         err = 0;
3914                 }
3915                 kfree(bbio);
3916
3917                 bio_endio(bio, err);
3918         } else if (!is_orig_bio) {
3919                 bio_put(bio);
3920         }
3921 }
3922
3923 struct async_sched {
3924         struct bio *bio;
3925         int rw;
3926         struct btrfs_fs_info *info;
3927         struct btrfs_work work;
3928 };
3929
3930 /*
3931  * see run_scheduled_bios for a description of why bios are collected for
3932  * async submit.
3933  *
3934  * This will add one bio to the pending list for a device and make sure
3935  * the work struct is scheduled.
3936  */
3937 static noinline int schedule_bio(struct btrfs_root *root,
3938                                  struct btrfs_device *device,
3939                                  int rw, struct bio *bio)
3940 {
3941         int should_queue = 1;
3942         struct btrfs_pending_bios *pending_bios;
3943
3944         /* don't bother with additional async steps for reads, right now */
3945         if (!(rw & REQ_WRITE)) {
3946                 bio_get(bio);
3947                 submit_bio(rw, bio);
3948                 bio_put(bio);
3949                 return 0;
3950         }
3951
3952         /*
3953          * nr_async_bios allows us to reliably return congestion to the
3954          * higher layers.  Otherwise, the async bio makes it appear we have
3955          * made progress against dirty pages when we've really just put it
3956          * on a queue for later
3957          */
3958         atomic_inc(&root->fs_info->nr_async_bios);
3959         WARN_ON(bio->bi_next);
3960         bio->bi_next = NULL;
3961         bio->bi_rw |= rw;
3962
3963         spin_lock(&device->io_lock);
3964         if (bio->bi_rw & REQ_SYNC)
3965                 pending_bios = &device->pending_sync_bios;
3966         else
3967                 pending_bios = &device->pending_bios;
3968
3969         if (pending_bios->tail)
3970                 pending_bios->tail->bi_next = bio;
3971
3972         pending_bios->tail = bio;
3973         if (!pending_bios->head)
3974                 pending_bios->head = bio;
3975         if (device->running_pending)
3976                 should_queue = 0;
3977
3978         spin_unlock(&device->io_lock);
3979
3980         if (should_queue)
3981                 btrfs_queue_worker(&root->fs_info->submit_workers,
3982                                    &device->work);
3983         return 0;
3984 }
3985
3986 int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
3987                   int mirror_num, int async_submit)
3988 {
3989         struct btrfs_mapping_tree *map_tree;
3990         struct btrfs_device *dev;
3991         struct bio *first_bio = bio;
3992         u64 logical = (u64)bio->bi_sector << 9;
3993         u64 length = 0;
3994         u64 map_length;
3995         int ret;
3996         int dev_nr = 0;
3997         int total_devs = 1;
3998         struct btrfs_bio *bbio = NULL;
3999
4000         length = bio->bi_size;
4001         map_tree = &root->fs_info->mapping_tree;
4002         map_length = length;
4003
4004         ret = btrfs_map_block(map_tree, rw, logical, &map_length, &bbio,
4005                               mirror_num);
4006         BUG_ON(ret);
4007
4008         total_devs = bbio->num_stripes;
4009         if (map_length < length) {
4010                 printk(KERN_CRIT "mapping failed logical %llu bio len %llu "
4011                        "len %llu\n", (unsigned long long)logical,
4012                        (unsigned long long)length,
4013                        (unsigned long long)map_length);
4014                 BUG();
4015         }
4016
4017         bbio->orig_bio = first_bio;
4018         bbio->private = first_bio->bi_private;
4019         bbio->end_io = first_bio->bi_end_io;
4020         atomic_set(&bbio->stripes_pending, bbio->num_stripes);
4021
4022         while (dev_nr < total_devs) {
4023                 if (dev_nr < total_devs - 1) {
4024                         bio = bio_clone(first_bio, GFP_NOFS);
4025                         BUG_ON(!bio);
4026                 } else {
4027                         bio = first_bio;
4028                 }
4029                 bio->bi_private = bbio;
4030                 bio->bi_end_io = btrfs_end_bio;
4031                 bio->bi_sector = bbio->stripes[dev_nr].physical >> 9;
4032                 dev = bbio->stripes[dev_nr].dev;
4033                 if (dev && dev->bdev && (rw != WRITE || dev->writeable)) {
4034                         pr_debug("btrfs_map_bio: rw %d, sector=%llu, dev=%lu "
4035                                  "(%s id %llu), size=%u\n", rw,
4036                                  (u64)bio->bi_sector, (u_long)dev->bdev->bd_dev,
4037                                  dev->name, dev->devid, bio->bi_size);
4038                         bio->bi_bdev = dev->bdev;
4039                         if (async_submit)
4040                                 schedule_bio(root, dev, rw, bio);
4041                         else
4042                                 submit_bio(rw, bio);
4043                 } else {
4044                         bio->bi_bdev = root->fs_info->fs_devices->latest_bdev;
4045                         bio->bi_sector = logical >> 9;
4046                         bio_endio(bio, -EIO);
4047                 }
4048                 dev_nr++;
4049         }
4050         return 0;
4051 }
4052
4053 struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid,
4054                                        u8 *uuid, u8 *fsid)
4055 {
4056         struct btrfs_device *device;
4057         struct btrfs_fs_devices *cur_devices;
4058
4059         cur_devices = root->fs_info->fs_devices;
4060         while (cur_devices) {
4061                 if (!fsid ||
4062                     !memcmp(cur_devices->fsid, fsid, BTRFS_UUID_SIZE)) {
4063                         device = __find_device(&cur_devices->devices,
4064                                                devid, uuid);
4065                         if (device)
4066                                 return device;
4067                 }
4068                 cur_devices = cur_devices->seed;
4069         }
4070         return NULL;
4071 }
4072
4073 static struct btrfs_device *add_missing_dev(struct btrfs_root *root,
4074                                             u64 devid, u8 *dev_uuid)
4075 {
4076         struct btrfs_device *device;
4077         struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
4078
4079         device = kzalloc(sizeof(*device), GFP_NOFS);
4080         if (!device)
4081                 return NULL;
4082         list_add(&device->dev_list,
4083                  &fs_devices->devices);
4084         device->dev_root = root->fs_info->dev_root;
4085         device->devid = devid;
4086         device->work.func = pending_bios_fn;
4087         device->fs_devices = fs_devices;
4088         device->missing = 1;
4089         fs_devices->num_devices++;
4090         fs_devices->missing_devices++;
4091         spin_lock_init(&device->io_lock);
4092         INIT_LIST_HEAD(&device->dev_alloc_list);
4093         memcpy(device->uuid, dev_uuid, BTRFS_UUID_SIZE);
4094         return device;
4095 }
4096
4097 static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
4098                           struct extent_buffer *leaf,
4099                           struct btrfs_chunk *chunk)
4100 {
4101         struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree;
4102         struct map_lookup *map;
4103         struct extent_map *em;
4104         u64 logical;
4105         u64 length;
4106         u64 devid;
4107         u8 uuid[BTRFS_UUID_SIZE];
4108         int num_stripes;
4109         int ret;
4110         int i;
4111
4112         logical = key->offset;
4113         length = btrfs_chunk_length(leaf, chunk);
4114
4115         read_lock(&map_tree->map_tree.lock);
4116         em = lookup_extent_mapping(&map_tree->map_tree, logical, 1);
4117         read_unlock(&map_tree->map_tree.lock);
4118
4119         /* already mapped? */
4120         if (em && em->start <= logical && em->start + em->len > logical) {
4121                 free_extent_map(em);
4122                 return 0;
4123         } else if (em) {
4124                 free_extent_map(em);
4125         }
4126
4127         em = alloc_extent_map();
4128         if (!em)
4129                 return -ENOMEM;
4130         num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
4131         map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
4132         if (!map) {
4133                 free_extent_map(em);
4134                 return -ENOMEM;
4135         }
4136
4137         em->bdev = (struct block_device *)map;
4138         em->start = logical;
4139         em->len = length;
4140         em->block_start = 0;
4141         em->block_len = em->len;
4142
4143         map->num_stripes = num_stripes;
4144         map->io_width = btrfs_chunk_io_width(leaf, chunk);
4145         map->io_align = btrfs_chunk_io_align(leaf, chunk);
4146         map->sector_size = btrfs_chunk_sector_size(leaf, chunk);
4147         map->stripe_len = btrfs_chunk_stripe_len(leaf, chunk);
4148         map->type = btrfs_chunk_type(leaf, chunk);
4149         map->sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk);
4150         for (i = 0; i < num_stripes; i++) {
4151                 map->stripes[i].physical =
4152                         btrfs_stripe_offset_nr(leaf, chunk, i);
4153                 devid = btrfs_stripe_devid_nr(leaf, chunk, i);
4154                 read_extent_buffer(leaf, uuid, (unsigned long)
4155                                    btrfs_stripe_dev_uuid_nr(chunk, i),
4156                                    BTRFS_UUID_SIZE);
4157                 map->stripes[i].dev = btrfs_find_device(root, devid, uuid,
4158                                                         NULL);
4159                 if (!map->stripes[i].dev && !btrfs_test_opt(root, DEGRADED)) {
4160                         kfree(map);
4161                         free_extent_map(em);
4162                         return -EIO;
4163                 }
4164                 if (!map->stripes[i].dev) {
4165                         map->stripes[i].dev =
4166                                 add_missing_dev(root, devid, uuid);
4167                         if (!map->stripes[i].dev) {
4168                                 kfree(map);
4169                                 free_extent_map(em);
4170                                 return -EIO;
4171                         }
4172                 }
4173                 map->stripes[i].dev->in_fs_metadata = 1;
4174         }
4175
4176         write_lock(&map_tree->map_tree.lock);
4177         ret = add_extent_mapping(&map_tree->map_tree, em);
4178         write_unlock(&map_tree->map_tree.lock);
4179         BUG_ON(ret);
4180         free_extent_map(em);
4181
4182         return 0;
4183 }
4184
4185 static int fill_device_from_item(struct extent_buffer *leaf,
4186                                  struct btrfs_dev_item *dev_item,
4187                                  struct btrfs_device *device)
4188 {
4189         unsigned long ptr;
4190
4191         device->devid = btrfs_device_id(leaf, dev_item);
4192         device->disk_total_bytes = btrfs_device_total_bytes(leaf, dev_item);
4193         device->total_bytes = device->disk_total_bytes;
4194         device->bytes_used = btrfs_device_bytes_used(leaf, dev_item);
4195         device->type = btrfs_device_type(leaf, dev_item);
4196         device->io_align = btrfs_device_io_align(leaf, dev_item);
4197         device->io_width = btrfs_device_io_width(leaf, dev_item);
4198         device->sector_size = btrfs_device_sector_size(leaf, dev_item);
4199
4200         ptr = (unsigned long)btrfs_device_uuid(dev_item);
4201         read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
4202
4203         return 0;
4204 }
4205
4206 static int open_seed_devices(struct btrfs_root *root, u8 *fsid)
4207 {
4208         struct btrfs_fs_devices *fs_devices;
4209         int ret;
4210
4211         mutex_lock(&uuid_mutex);
4212
4213         fs_devices = root->fs_info->fs_devices->seed;
4214         while (fs_devices) {
4215                 if (!memcmp(fs_devices->fsid, fsid, BTRFS_UUID_SIZE)) {
4216                         ret = 0;
4217                         goto out;
4218                 }
4219                 fs_devices = fs_devices->seed;
4220         }
4221
4222         fs_devices = find_fsid(fsid);
4223         if (!fs_devices) {
4224                 ret = -ENOENT;
4225                 goto out;
4226         }
4227
4228         fs_devices = clone_fs_devices(fs_devices);
4229         if (IS_ERR(fs_devices)) {
4230                 ret = PTR_ERR(fs_devices);
4231                 goto out;
4232         }
4233
4234         ret = __btrfs_open_devices(fs_devices, FMODE_READ,
4235                                    root->fs_info->bdev_holder);
4236         if (ret)
4237                 goto out;
4238
4239         if (!fs_devices->seeding) {
4240                 __btrfs_close_devices(fs_devices);
4241                 free_fs_devices(fs_devices);
4242                 ret = -EINVAL;
4243                 goto out;
4244         }
4245
4246         fs_devices->seed = root->fs_info->fs_devices->seed;
4247         root->fs_info->fs_devices->seed = fs_devices;
4248 out:
4249         mutex_unlock(&uuid_mutex);
4250         return ret;
4251 }
4252
4253 static int read_one_dev(struct btrfs_root *root,
4254                         struct extent_buffer *leaf,
4255                         struct btrfs_dev_item *dev_item)
4256 {
4257         struct btrfs_device *device;
4258         u64 devid;
4259         int ret;
4260         u8 fs_uuid[BTRFS_UUID_SIZE];
4261         u8 dev_uuid[BTRFS_UUID_SIZE];
4262
4263         devid = btrfs_device_id(leaf, dev_item);
4264         read_extent_buffer(leaf, dev_uuid,
4265                            (unsigned long)btrfs_device_uuid(dev_item),
4266                            BTRFS_UUID_SIZE);
4267         read_extent_buffer(leaf, fs_uuid,
4268                            (unsigned long)btrfs_device_fsid(dev_item),
4269                            BTRFS_UUID_SIZE);
4270
4271         if (memcmp(fs_uuid, root->fs_info->fsid, BTRFS_UUID_SIZE)) {
4272                 ret = open_seed_devices(root, fs_uuid);
4273                 if (ret && !btrfs_test_opt(root, DEGRADED))
4274                         return ret;
4275         }
4276
4277         device = btrfs_find_device(root, devid, dev_uuid, fs_uuid);
4278         if (!device || !device->bdev) {
4279                 if (!btrfs_test_opt(root, DEGRADED))
4280                         return -EIO;
4281
4282                 if (!device) {
4283                         printk(KERN_WARNING "devid %llu missing\n",
4284                                (unsigned long long)devid);
4285                         device = add_missing_dev(root, devid, dev_uuid);
4286                         if (!device)
4287                                 return -ENOMEM;
4288                 } else if (!device->missing) {
4289                         /*
4290                          * this happens when a device that was properly setup
4291                          * in the device info lists suddenly goes bad.
4292                          * device->bdev is NULL, and so we have to set
4293                          * device->missing to one here
4294                          */
4295                         root->fs_info->fs_devices->missing_devices++;
4296                         device->missing = 1;
4297                 }
4298         }
4299
4300         if (device->fs_devices != root->fs_info->fs_devices) {
4301                 BUG_ON(device->writeable);
4302                 if (device->generation !=
4303                     btrfs_device_generation(leaf, dev_item))
4304                         return -EINVAL;
4305         }
4306
4307         fill_device_from_item(leaf, dev_item, device);
4308         device->dev_root = root->fs_info->dev_root;
4309         device->in_fs_metadata = 1;
4310         if (device->writeable) {
4311                 device->fs_devices->total_rw_bytes += device->total_bytes;
4312                 spin_lock(&root->fs_info->free_chunk_lock);
4313                 root->fs_info->free_chunk_space += device->total_bytes -
4314                         device->bytes_used;
4315                 spin_unlock(&root->fs_info->free_chunk_lock);
4316         }
4317         ret = 0;
4318         return ret;
4319 }
4320
4321 int btrfs_read_sys_array(struct btrfs_root *root)
4322 {
4323         struct btrfs_super_block *super_copy = root->fs_info->super_copy;
4324         struct extent_buffer *sb;
4325         struct btrfs_disk_key *disk_key;
4326         struct btrfs_chunk *chunk;
4327         u8 *ptr;
4328         unsigned long sb_ptr;
4329         int ret = 0;
4330         u32 num_stripes;
4331         u32 array_size;
4332         u32 len = 0;
4333         u32 cur;
4334         struct btrfs_key key;
4335
4336         sb = btrfs_find_create_tree_block(root, BTRFS_SUPER_INFO_OFFSET,
4337                                           BTRFS_SUPER_INFO_SIZE);
4338         if (!sb)
4339                 return -ENOMEM;
4340         btrfs_set_buffer_uptodate(sb);
4341         btrfs_set_buffer_lockdep_class(root->root_key.objectid, sb, 0);
4342
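             /*
              * Copy the whole superblock into the buffer; the accessors
              * used below, such as btrfs_chunk_num_stripes(), expect to
              * read from an extent buffer.
              */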
4343         write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE);
4344         array_size = btrfs_super_sys_array_size(super_copy);
4345
4346         ptr = super_copy->sys_chunk_array;
4347         sb_ptr = offsetof(struct btrfs_super_block, sys_chunk_array);
4348         cur = 0;
4349
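             /*
              * Each array entry is a disk key followed by a chunk item
              * whose size depends on its stripe count, so the cursor
              * advances by a variable length on each iteration.
              */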
4350         while (cur < array_size) {
4351                 disk_key = (struct btrfs_disk_key *)ptr;
4352                 btrfs_disk_key_to_cpu(&key, disk_key);
4353
4354                 len = sizeof(*disk_key);
                     ptr += len;
4355                 sb_ptr += len;
4356                 cur += len;
4357
4358                 if (key.type == BTRFS_CHUNK_ITEM_KEY) {
4359                         chunk = (struct btrfs_chunk *)sb_ptr;
4360                         ret = read_one_chunk(root, &key, sb, chunk);
4361                         if (ret)
4362                                 break;
4363                         num_stripes = btrfs_chunk_num_stripes(sb, chunk);
4364                         len = btrfs_chunk_item_size(num_stripes);
4365                 } else {
4366                         ret = -EIO;
4367                         break;
4368                 }
4369                 ptr += len;
4370                 sb_ptr += len;
4371                 cur += len;
4372         }
4373         free_extent_buffer(sb);
4374         return ret;
4375 }
4376
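     /*
      * Read all device items and then all chunk items from the chunk tree
      * and build the in-memory device list and chunk mappings from them.
      */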
4377 int btrfs_read_chunk_tree(struct btrfs_root *root)
4378 {
4379         struct btrfs_path *path;
4380         struct extent_buffer *leaf;
4381         struct btrfs_key key;
4382         struct btrfs_key found_key;
4383         int ret;
4384         int slot;
4385
4386         root = root->fs_info->chunk_root;
4387
4388         path = btrfs_alloc_path();
4389         if (!path)
4390                 return -ENOMEM;
4391
4392         /* First we search for all of the device items, and then we
4393          * read in all of the chunk items.  This way we can create chunk
4394          * mappings that reference all of the devices that are found.
4395          */
4396         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
4397         key.offset = 0;
4398         key.type = 0;
4399 again:
4400         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
4401         if (ret < 0)
4402                 goto error;
4403         while (1) {
4404                 leaf = path->nodes[0];
4405                 slot = path->slots[0];
4406                 if (slot >= btrfs_header_nritems(leaf)) {
4407                         ret = btrfs_next_leaf(root, path);
4408                         if (ret == 0)
4409                                 continue;
4410                         if (ret < 0)
4411                                 goto error;
4412                         break;
4413                 }
4414                 btrfs_item_key_to_cpu(leaf, &found_key, slot);
4415                 if (key.objectid == BTRFS_DEV_ITEMS_OBJECTID) {
4416                         if (found_key.objectid != BTRFS_DEV_ITEMS_OBJECTID)
4417                                 break;
4418                         if (found_key.type == BTRFS_DEV_ITEM_KEY) {
4419                                 struct btrfs_dev_item *dev_item;
4420                                 dev_item = btrfs_item_ptr(leaf, slot,
4421                                                   struct btrfs_dev_item);
4422                                 ret = read_one_dev(root, leaf, dev_item);
4423                                 if (ret)
4424                                         goto error;
4425                         }
4426                 } else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) {
4427                         struct btrfs_chunk *chunk;
4428                         chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
4429                         ret = read_one_chunk(root, &found_key, leaf, chunk);
4430                         if (ret)
4431                                 goto error;
4432                 }
4433                 path->slots[0]++;
4434         }
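             /*
              * Device items sort before chunk items in the chunk tree, so
              * after the device pass reset the key and search again from
              * the start of the tree to read the chunk items.
              */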
4435         if (key.objectid == BTRFS_DEV_ITEMS_OBJECTID) {
4436                 key.objectid = 0;
4437                 btrfs_release_path(path);
4438                 goto again;
4439         }
4440         ret = 0;
4441 error:
4442         btrfs_free_path(path);
4443         return ret;
4444 }