Merge branch 'v3.1-rc10' into for-3.2/core

author Jens Axboe <axboe@kernel.dk>

Wed, 19 Oct 2011 12:30:42 +0000 (14:30 +0200)

committer Jens Axboe <axboe@kernel.dk>

Wed, 19 Oct 2011 12:30:42 +0000 (14:30 +0200)
author Jens Axboe <axboe@kernel.dk>
Wed, 19 Oct 2011 12:30:42 +0000 (14:30 +0200)
committer Jens Axboe <axboe@kernel.dk>
Wed, 19 Oct 2011 12:30:42 +0000 (14:30 +0200)
diff --combined block/blk-core.c

index 97e9e5405b837d911cbe3c6d916c968587a652ea,d34433ae791781b5799edfa838f058922443e78d..79e41a76d96a7a9f2b475ba80de85628f6bd5f13
--- 1/block/blk-core.c
--- 2/block/blk-core.c
+++ b/block/blk-core.c
@@@ -38,6 -38,8 +38,6 @@@ EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_
   EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_remap);
   EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_complete);
   
- -static int __make_request(struct request_queue *q, struct bio *bio);
- -
   /*
    * For the allocated request tables
    */
@@@ -346,9 -348,10 +346,10 @@@ void blk_put_queue(struct request_queu
   EXPORT_SYMBOL(blk_put_queue);
   
   /*
-  * Note: If a driver supplied the queue lock, it should not zap that lock
-  * unexpectedly as some queue cleanup components like elevator_exit() and
-  * blk_throtl_exit() need queue lock.
+  * Note: If a driver supplied the queue lock, it is disconnected
+  * by this function. The actual state of the lock doesn't matter
+  * here as the request_queue isn't accessible after this point
+  * (QUEUE_FLAG_DEAD is set) and no other requests will be queued.
    */
   void blk_cleanup_queue(struct request_queue *q)
   {
@@@ -365,10 -368,8 +366,8 @@@
         queue_flag_set_unlocked(QUEUE_FLAG_DEAD, q);
         mutex_unlock(&q->sysfs_lock);
   
-       if (q->elevator)
-               elevator_exit(q->elevator);
- 
-       blk_throtl_exit(q);
+       if (q->queue_lock != &q->__queue_lock)
+               q->queue_lock = &q->__queue_lock;
   
         blk_put_queue(q);
   }
@@@ -540,7 -541,7 +539,7 @@@ blk_init_allocated_queue_node(struct re
         /*
          * This also sets hw/phys segments, boundary and size
          */
- -      blk_queue_make_request(q, __make_request);
+ +      blk_queue_make_request(q, blk_queue_bio);
   
         q->sg_reserved_size = INT_MAX;
   
@@@ -1165,7 -1166,7 +1164,7 @@@ static bool bio_attempt_front_merge(str
    * true if merge was successful, otherwise false.
    */
   static bool attempt_plug_merge(struct task_struct *tsk, struct request_queue *q,
-                              struct bio *bio)
+                              struct bio *bio, unsigned int *request_count)
   {
         struct blk_plug *plug;
         struct request *rq;
@@@ -1174,10 -1175,13 +1173,13 @@@
         plug = tsk->plug;
         if (!plug)
                 goto out;
+       *request_count = 0;
   
         list_for_each_entry_reverse(rq, &plug->list, queuelist) {
                 int el_ret;
   
+               (*request_count)++;
+ 
                 if (rq->q != q)
                         continue;
   
@@@ -1211,12 -1215,13 +1213,13 @@@ void init_request_from_bio(struct reque
         blk_rq_bio_prep(req->q, req, bio);
   }
   
- -static int __make_request(struct request_queue *q, struct bio *bio)
+ +void blk_queue_bio(struct request_queue *q, struct bio *bio)
   {
         const bool sync = !!(bio->bi_rw & REQ_SYNC);
         struct blk_plug *plug;
         int el_ret, rw_flags, where = ELEVATOR_INSERT_SORT;
         struct request *req;
+       unsigned int request_count = 0;
   
         /*
          * low level driver can indicate that it wants pages above a
@@@ -1235,8 -1240,8 +1238,8 @@@
          * Check if we can merge with the plugged list before grabbing
          * any locks.
          */
-       if (attempt_plug_merge(current, q, bio))
+       if (attempt_plug_merge(current, q, bio, &request_count))
- -              goto out;
+ +              return;
   
         spin_lock_irq(q->queue_lock);
   
@@@ -1300,11 -1305,10 +1303,10 @@@ get_rq
                         if (__rq->q != q)
                                 plug->should_sort = 1;
                 }
+               if (request_count >= BLK_MAX_REQUEST_COUNT)
+                       blk_flush_plug_list(plug, false);
                 list_add_tail(&req->queuelist, &plug->list);
-               plug->count++;
                 drive_stat_acct(req, 1);
-               if (plug->count >= BLK_MAX_REQUEST_COUNT)
-                       blk_flush_plug_list(plug, false);
         } else {
                 spin_lock_irq(q->queue_lock);
                 add_acct_request(q, req, where);
@@@ -1312,8 -1316,9 +1314,8 @@@
   out_unlock:
                 spin_unlock_irq(q->queue_lock);
         }
- -out:
- -      return 0;
   }
+ +EXPORT_SYMBOL_GPL(blk_queue_bio);     /* for device mapper only */
   
   /*
    * If bio->bi_dev is a partition, remap the location
@@@ -1412,142 -1417,165 +1414,142 @@@ static inline int bio_check_eod(struct 
         return 0;
   }
   
- -/**
- - * generic_make_request - hand a buffer to its device driver for I/O
- - * @bio:  The bio describing the location in memory and on the device.
- - *
- - * generic_make_request() is used to make I/O requests of block
- - * devices. It is passed a &struct bio, which describes the I/O that needs
- - * to be done.
- - *
- - * generic_make_request() does not return any status.  The
- - * success/failure status of the request, along with notification of
- - * completion, is delivered asynchronously through the bio->bi_end_io
- - * function described (one day) else where.
- - *
- - * The caller of generic_make_request must make sure that bi_io_vec
- - * are set to describe the memory buffer, and that bi_dev and bi_sector are
- - * set to describe the device address, and the
- - * bi_end_io and optionally bi_private are set to describe how
- - * completion notification should be signaled.
- - *
- - * generic_make_request and the drivers it calls may use bi_next if this
- - * bio happens to be merged with someone else, and may change bi_dev and
- - * bi_sector for remaps as it sees fit.  So the values of these fields
- - * should NOT be depended on after the call to generic_make_request.
- - */
- -static inline void __generic_make_request(struct bio *bio)
+ +static noinline_for_stack bool
+ +generic_make_request_checks(struct bio *bio)
   {
         struct request_queue *q;
- -      sector_t old_sector;
- -      int ret, nr_sectors = bio_sectors(bio);
- -      dev_t old_dev;
+ +      int nr_sectors = bio_sectors(bio);
         int err = -EIO;
+ +      char b[BDEVNAME_SIZE];
+ +      struct hd_struct *part;
   
         might_sleep();
   
         if (bio_check_eod(bio, nr_sectors))
                 goto end_io;
   
- -      /*
- -       * Resolve the mapping until finished. (drivers are
- -       * still free to implement/resolve their own stacking
- -       * by explicitly returning 0)
- -       *
- -       * NOTE: we don't repeat the blk_size check for each new device.
- -       * Stacking drivers are expected to know what they are doing.
- -       */
- -      old_sector = -1;
- -      old_dev = 0;
- -      do {
- -              char b[BDEVNAME_SIZE];
- -              struct hd_struct *part;
- -
- -              q = bdev_get_queue(bio->bi_bdev);
- -              if (unlikely(!q)) {
- -                      printk(KERN_ERR
- -                             "generic_make_request: Trying to access "
- -                              "nonexistent block-device %s (%Lu)\n",
- -                              bdevname(bio->bi_bdev, b),
- -                              (long long) bio->bi_sector);
- -                      goto end_io;
- -              }
- -
- -              if (unlikely(!(bio->bi_rw & REQ_DISCARD) &&
- -                           nr_sectors > queue_max_hw_sectors(q))) {
- -                      printk(KERN_ERR "bio too big device %s (%u > %u)\n",
- -                             bdevname(bio->bi_bdev, b),
- -                             bio_sectors(bio),
- -                             queue_max_hw_sectors(q));
- -                      goto end_io;
- -              }
- -
- -              if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags)))
- -                      goto end_io;
- -
- -              part = bio->bi_bdev->bd_part;
- -              if (should_fail_request(part, bio->bi_size) ||
- -                  should_fail_request(&part_to_disk(part)->part0,
- -                                      bio->bi_size))
- -                      goto end_io;
+ +      q = bdev_get_queue(bio->bi_bdev);
+ +      if (unlikely(!q)) {
+ +              printk(KERN_ERR
+ +                     "generic_make_request: Trying to access "
+ +                      "nonexistent block-device %s (%Lu)\n",
+ +                      bdevname(bio->bi_bdev, b),
+ +                      (long long) bio->bi_sector);
+ +              goto end_io;
+ +      }
   
- -              /*
- -               * If this device has partitions, remap block n
- -               * of partition p to block n+start(p) of the disk.
- -               */
- -              blk_partition_remap(bio);
+ +      if (unlikely(!(bio->bi_rw & REQ_DISCARD) &&
+ +                   nr_sectors > queue_max_hw_sectors(q))) {
+ +              printk(KERN_ERR "bio too big device %s (%u > %u)\n",
+ +                     bdevname(bio->bi_bdev, b),
+ +                     bio_sectors(bio),
+ +                     queue_max_hw_sectors(q));
+ +              goto end_io;
+ +      }
   
- -              if (bio_integrity_enabled(bio) && bio_integrity_prep(bio))
- -                      goto end_io;
+ +      if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags)))
+ +              goto end_io;
   
- -              if (old_sector != -1)
- -                      trace_block_bio_remap(q, bio, old_dev, old_sector);
+ +      part = bio->bi_bdev->bd_part;
+ +      if (should_fail_request(part, bio->bi_size) ||
+ +          should_fail_request(&part_to_disk(part)->part0,
+ +                              bio->bi_size))
+ +              goto end_io;
   
- -              old_sector = bio->bi_sector;
- -              old_dev = bio->bi_bdev->bd_dev;
+ +      /*
+ +       * If this device has partitions, remap block n
+ +       * of partition p to block n+start(p) of the disk.
+ +       */
+ +      blk_partition_remap(bio);
   
- -              if (bio_check_eod(bio, nr_sectors))
- -                      goto end_io;
+ +      if (bio_integrity_enabled(bio) && bio_integrity_prep(bio))
+ +              goto end_io;
   
- -              /*
- -               * Filter flush bio's early so that make_request based
- -               * drivers without flush support don't have to worry
- -               * about them.
- -               */
- -              if ((bio->bi_rw & (REQ_FLUSH | REQ_FUA)) && !q->flush_flags) {
- -                      bio->bi_rw &= ~(REQ_FLUSH | REQ_FUA);
- -                      if (!nr_sectors) {
- -                              err = 0;
- -                              goto end_io;
- -                      }
- -              }
+ +      if (bio_check_eod(bio, nr_sectors))
+ +              goto end_io;
   
- -              if ((bio->bi_rw & REQ_DISCARD) &&
- -                  (!blk_queue_discard(q) ||
- -                   ((bio->bi_rw & REQ_SECURE) &&
- -                    !blk_queue_secdiscard(q)))) {
- -                      err = -EOPNOTSUPP;
+ +      /*
+ +       * Filter flush bio's early so that make_request based
+ +       * drivers without flush support don't have to worry
+ +       * about them.
+ +       */
+ +      if ((bio->bi_rw & (REQ_FLUSH | REQ_FUA)) && !q->flush_flags) {
+ +              bio->bi_rw &= ~(REQ_FLUSH | REQ_FUA);
+ +              if (!nr_sectors) {
+ +                      err = 0;
                         goto end_io;
                 }
+ +      }
   
- -              if (blk_throtl_bio(q, &bio))
- -                      goto end_io;
- -
- -              /*
- -               * If bio = NULL, bio has been throttled and will be submitted
- -               * later.
- -               */
- -              if (!bio)
- -                      break;
+ +      if ((bio->bi_rw & REQ_DISCARD) &&
+ +          (!blk_queue_discard(q) ||
+ +           ((bio->bi_rw & REQ_SECURE) &&
+ +            !blk_queue_secdiscard(q)))) {
+ +              err = -EOPNOTSUPP;
+ +              goto end_io;
+ +      }
   
- -              trace_block_bio_queue(q, bio);
+ +      if (blk_throtl_bio(q, &bio))
+ +              goto end_io;
   
- -              ret = q->make_request_fn(q, bio);
- -      } while (ret);
+ +      /* if bio = NULL, bio has been throttled and will be submitted later. */
+ +      if (!bio)
+ +              return false;
   
- -      return;
+ +      trace_block_bio_queue(q, bio);
+ +      return true;
   
   end_io:
         bio_endio(bio, err);
+ +      return false;
   }
   
- -/*
- - * We only want one ->make_request_fn to be active at a time,
- - * else stack usage with stacked devices could be a problem.
- - * So use current->bio_list to keep a list of requests
- - * submited by a make_request_fn function.
- - * current->bio_list is also used as a flag to say if
- - * generic_make_request is currently active in this task or not.
- - * If it is NULL, then no make_request is active.  If it is non-NULL,
- - * then a make_request is active, and new requests should be added
- - * at the tail
+ +/**
+ + * generic_make_request - hand a buffer to its device driver for I/O
+ + * @bio:  The bio describing the location in memory and on the device.
+ + *
+ + * generic_make_request() is used to make I/O requests of block
+ + * devices. It is passed a &struct bio, which describes the I/O that needs
+ + * to be done.
+ + *
+ + * generic_make_request() does not return any status.  The
+ + * success/failure status of the request, along with notification of
+ + * completion, is delivered asynchronously through the bio->bi_end_io
+ + * function described (one day) else where.
+ + *
+ + * The caller of generic_make_request must make sure that bi_io_vec
+ + * are set to describe the memory buffer, and that bi_dev and bi_sector are
+ + * set to describe the device address, and the
+ + * bi_end_io and optionally bi_private are set to describe how
+ + * completion notification should be signaled.
+ + *
+ + * generic_make_request and the drivers it calls may use bi_next if this
+ + * bio happens to be merged with someone else, and may resubmit the bio to
+ + * a lower device by calling into generic_make_request recursively, which
+ + * means the bio should NOT be touched after the call to ->make_request_fn.
    */
   void generic_make_request(struct bio *bio)
   {
         struct bio_list bio_list_on_stack;
   
+ +      if (!generic_make_request_checks(bio))
+ +              return;
+ +
+ +      /*
+ +       * We only want one ->make_request_fn to be active at a time, else
+ +       * stack usage with stacked devices could be a problem.  So use
+ +       * current->bio_list to keep a list of requests submited by a
+ +       * make_request_fn function.  current->bio_list is also used as a
+ +       * flag to say if generic_make_request is currently active in this
+ +       * task or not.  If it is NULL, then no make_request is active.  If
+ +       * it is non-NULL, then a make_request is active, and new requests
+ +       * should be added at the tail
+ +       */
         if (current->bio_list) {
- -              /* make_request is active */
                 bio_list_add(current->bio_list, bio);
                 return;
         }
+ +
         /* following loop may be a bit non-obvious, and so deserves some
          * explanation.
          * Before entering the loop, bio->bi_next is NULL (as all callers
@@@ -1555,21 -1583,22 +1557,21 @@@
          * We pretend that we have just taken it off a longer list, so
          * we assign bio_list to a pointer to the bio_list_on_stack,
          * thus initialising the bio_list of new bios to be
- -       * added.  __generic_make_request may indeed add some more bios
+ +       * added.  ->make_request() may indeed add some more bios
          * through a recursive call to generic_make_request.  If it
          * did, we find a non-NULL value in bio_list and re-enter the loop
          * from the top.  In this case we really did just take the bio
          * of the top of the list (no pretending) and so remove it from
- -       * bio_list, and call into __generic_make_request again.
- -       *
- -       * The loop was structured like this to make only one call to
- -       * __generic_make_request (which is important as it is large and
- -       * inlined) and to keep the structure simple.
+ +       * bio_list, and call into ->make_request() again.
          */
         BUG_ON(bio->bi_next);
         bio_list_init(&bio_list_on_stack);
         current->bio_list = &bio_list_on_stack;
         do {
- -              __generic_make_request(bio);
+ +              struct request_queue *q = bdev_get_queue(bio->bi_bdev);
+ +
+ +              q->make_request_fn(q, bio);
+ +
                 bio = bio_list_pop(current->bio_list);
         } while (bio);
         current->bio_list = NULL; /* deactivate */
@@@ -1675,6 -1704,7 +1677,7 @@@ EXPORT_SYMBOL_GPL(blk_rq_check_limits)
   int blk_insert_cloned_request(struct request_queue *q, struct request *rq)
   {
         unsigned long flags;
+       int where = ELEVATOR_INSERT_BACK;
   
         if (blk_rq_check_limits(q, rq))
                 return -EIO;
@@@ -1691,7 -1721,10 +1694,10 @@@
          */
         BUG_ON(blk_queued_rq(rq));
   
-       add_acct_request(q, rq, ELEVATOR_INSERT_BACK);
+       if (rq->cmd_flags & (REQ_FLUSH|REQ_FUA))
+               where = ELEVATOR_INSERT_FLUSH;
+ 
+       add_acct_request(q, rq, where);
         spin_unlock_irqrestore(q->queue_lock, flags);
   
         return 0;
@@@ -2248,7 -2281,7 +2254,7 @@@ static bool blk_end_bidi_request(struc
    *     %false - we are done with this request
    *     %true  - still buffers pending for this request
    **/
- static bool __blk_end_bidi_request(struct request *rq, int error,
+ bool __blk_end_bidi_request(struct request *rq, int error,
                                    unsigned int nr_bytes, unsigned int bidi_bytes)
   {
         if (blk_update_bidi_request(rq, error, nr_bytes, bidi_bytes))
@@@ -2595,20 -2628,6 +2601,20 @@@ EXPORT_SYMBOL(kblockd_schedule_delayed_
   
   #define PLUG_MAGIC    0x91827364
   
+ +/**
+ + * blk_start_plug - initialize blk_plug and track it inside the task_struct
+ + * @plug:     The &struct blk_plug that needs to be initialized
+ + *
+ + * Description:
+ + *   Tracking blk_plug inside the task_struct will help with auto-flushing the
+ + *   pending I/O should the task end up blocking between blk_start_plug() and
+ + *   blk_finish_plug(). This is important from a performance perspective, but
+ + *   also ensures that we don't deadlock. For instance, if the task is blocking
+ + *   for a memory allocation, memory reclaim could end up wanting to free a
+ + *   page belonging to that request that is currently residing in our private
+ + *   plug. By flushing the pending I/O when the process goes to sleep, we avoid
+ + *   this kind of deadlock.
+ + */
   void blk_start_plug(struct blk_plug *plug)
   {
         struct task_struct *tsk = current;
@@@ -2617,7 -2636,6 +2623,6 @@@
         INIT_LIST_HEAD(&plug->list);
         INIT_LIST_HEAD(&plug->cb_list);
         plug->should_sort = 0;
-       plug->count = 0;
   
         /*
          * If this is a nested plug, don't actually assign it. It will be
@@@ -2701,7 -2719,6 +2706,6 @@@ void blk_flush_plug_list(struct blk_plu
                 return;
   
         list_splice_init(&plug->list, &list);
-       plug->count = 0;
   
         if (plug->should_sort) {
                 list_sort(NULL, &list, plug_rq_cmp);
diff --combined block/blk-sysfs.c

index adc923e9d1f81ca7ce38a75aa480cd50ff3989ce,60fda88c57f0dd83e930d3859ab0626407a01fd2..a8eff5f8b9c58d5e9dec7ea50b30e55d0f6448f8
--- 1/block/blk-sysfs.c
--- 2/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@@ -258,11 -258,13 +258,13 @@@ queue_rq_affinity_store(struct request_
   
         ret = queue_var_store(&val, page, count);
         spin_lock_irq(q->queue_lock);
-       if (val) {
+       if (val == 2) {
                 queue_flag_set(QUEUE_FLAG_SAME_COMP, q);
-               if (val == 2)
-                       queue_flag_set(QUEUE_FLAG_SAME_FORCE, q);
-       } else {
+               queue_flag_set(QUEUE_FLAG_SAME_FORCE, q);
+       } else if (val == 1) {
+               queue_flag_set(QUEUE_FLAG_SAME_COMP, q);
+               queue_flag_clear(QUEUE_FLAG_SAME_FORCE, q);
+       } else if (val == 0) {
                 queue_flag_clear(QUEUE_FLAG_SAME_COMP, q);
                 queue_flag_clear(QUEUE_FLAG_SAME_FORCE, q);
         }
@@@ -455,11 -457,11 +457,11 @@@ queue_attr_store(struct kobject *kobj, 
   }
   
   /**
- - * blk_cleanup_queue: - release a &struct request_queue when it is no longer needed
- - * @kobj:    the kobj belonging of the request queue to be released
+ + * blk_release_queue: - release a &struct request_queue when it is no longer needed
+ + * @kobj:    the kobj belonging to the request queue to be released
    *
    * Description:
- - *     blk_cleanup_queue is the pair to blk_init_queue() or
+ + *     blk_release_queue is the pair to blk_init_queue() or
    *     blk_queue_make_request().  It should be called when a request queue is
    *     being released; typically when a block device is being de-registered.
    *     Currently, its primary task it to free all the &struct request
@@@ -477,6 -479,11 +479,11 @@@ static void blk_release_queue(struct ko
   
         blk_sync_queue(q);
   
+       if (q->elevator)
+               elevator_exit(q->elevator);
+ 
+       blk_throtl_exit(q);
+ 
         if (rl->rq_pool)
                 mempool_destroy(rl->rq_pool);
   
diff --combined drivers/block/loop.c

index 8360239d553c7ed69a6df49a0c420b32d45f119b,4720c7ade0aed0dfd4fccce47ec7e1b39402587a..157ddcb9d0a54e3c8b7e70e65a501590d91c5b76
--- 1/drivers/block/loop.c
--- 2/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@@ -75,11 -75,11 +75,11 @@@
   #include <linux/kthread.h>
   #include <linux/splice.h>
   #include <linux/sysfs.h>
- 
+ #include <linux/miscdevice.h>
   #include <asm/uaccess.h>
   
- static LIST_HEAD(loop_devices);
- static DEFINE_MUTEX(loop_devices_mutex);
+ static DEFINE_IDR(loop_index_idr);
+ static DEFINE_MUTEX(loop_index_mutex);
   
   static int max_part;
   static int part_shift;
@@@ -514,7 -514,7 +514,7 @@@ static struct bio *loop_get_bio(struct 
         return bio_list_pop(&lo->lo_bio_list);
   }
   
- -static int loop_make_request(struct request_queue *q, struct bio *old_bio)
+ +static void loop_make_request(struct request_queue *q, struct bio *old_bio)
   {
         struct loop_device *lo = q->queuedata;
         int rw = bio_rw(old_bio);
@@@ -532,11 -532,12 +532,11 @@@
         loop_add_bio(lo, old_bio);
         wake_up(&lo->lo_event);
         spin_unlock_irq(&lo->lo_lock);
- -      return 0;
+ +      return;
   
   out:
         spin_unlock_irq(&lo->lo_lock);
         bio_io_error(old_bio);
- -      return 0;
   }
   
   struct switch_request {
@@@ -721,17 -722,10 +721,10 @@@ static inline int is_loop_device(struc
   static ssize_t loop_attr_show(struct device *dev, char *page,
                               ssize_t (*callback)(struct loop_device *, char *))
   {
-       struct loop_device *l, *lo = NULL;
- 
-       mutex_lock(&loop_devices_mutex);
-       list_for_each_entry(l, &loop_devices, lo_list)
-               if (disk_to_dev(l->lo_disk) == dev) {
-                       lo = l;
-                       break;
-               }
-       mutex_unlock(&loop_devices_mutex);
+       struct gendisk *disk = dev_to_disk(dev);
+       struct loop_device *lo = disk->private_data;
   
-       return lo ? callback(lo, page) : -EIO;
+       return callback(lo, page);
   }
   
   #define LOOP_ATTR_RO(_name)                                           \
@@@ -749,10 -743,10 +742,10 @@@ static ssize_t loop_attr_backing_file_s
         ssize_t ret;
         char *p = NULL;
   
-       mutex_lock(&lo->lo_ctl_mutex);
+       spin_lock_irq(&lo->lo_lock);
         if (lo->lo_backing_file)
                 p = d_path(&lo->lo_backing_file->f_path, buf, PAGE_SIZE - 1);
-       mutex_unlock(&lo->lo_ctl_mutex);
+       spin_unlock_irq(&lo->lo_lock);
   
         if (IS_ERR_OR_NULL(p))
                 ret = PTR_ERR(p);
@@@ -1006,7 -1000,9 +999,9 @@@ static int loop_clr_fd(struct loop_devi
   
         kthread_stop(lo->lo_thread);
   
+       spin_lock_irq(&lo->lo_lock);
         lo->lo_backing_file = NULL;
+       spin_unlock_irq(&lo->lo_lock);
   
         loop_release_xfer(lo);
         lo->transfer = NULL;
@@@ -1484,13 -1480,22 +1479,22 @@@ static int lo_compat_ioctl(struct block
   
   static int lo_open(struct block_device *bdev, fmode_t mode)
   {
-       struct loop_device *lo = bdev->bd_disk->private_data;
+       struct loop_device *lo;
+       int err = 0;
+ 
+       mutex_lock(&loop_index_mutex);
+       lo = bdev->bd_disk->private_data;
+       if (!lo) {
+               err = -ENXIO;
+               goto out;
+       }
   
         mutex_lock(&lo->lo_ctl_mutex);
         lo->lo_refcnt++;
         mutex_unlock(&lo->lo_ctl_mutex);
- 
-       return 0;
+ out:
+       mutex_unlock(&loop_index_mutex);
+       return err;
   }
   
   static int lo_release(struct gendisk *disk, fmode_t mode)
@@@ -1556,40 -1561,71 +1560,71 @@@ int loop_register_transfer(struct loop_
         return 0;
   }
   
+ static int unregister_transfer_cb(int id, void *ptr, void *data)
+ {
+       struct loop_device *lo = ptr;
+       struct loop_func_table *xfer = data;
+ 
+       mutex_lock(&lo->lo_ctl_mutex);
+       if (lo->lo_encryption == xfer)
+               loop_release_xfer(lo);
+       mutex_unlock(&lo->lo_ctl_mutex);
+       return 0;
+ }
+ 
   int loop_unregister_transfer(int number)
   {
         unsigned int n = number;
-       struct loop_device *lo;
         struct loop_func_table *xfer;
   
         if (n == 0 || n >= MAX_LO_CRYPT || (xfer = xfer_funcs[n]) == NULL)
                 return -EINVAL;
   
         xfer_funcs[n] = NULL;
- 
-       list_for_each_entry(lo, &loop_devices, lo_list) {
-               mutex_lock(&lo->lo_ctl_mutex);
- 
-               if (lo->lo_encryption == xfer)
-                       loop_release_xfer(lo);
- 
-               mutex_unlock(&lo->lo_ctl_mutex);
-       }
- 
+       idr_for_each(&loop_index_idr, &unregister_transfer_cb, xfer);
         return 0;
   }
   
   EXPORT_SYMBOL(loop_register_transfer);
   EXPORT_SYMBOL(loop_unregister_transfer);
   
- static struct loop_device *loop_alloc(int i)
+ static int loop_add(struct loop_device **l, int i)
   {
         struct loop_device *lo;
         struct gendisk *disk;
+       int err;
   
         lo = kzalloc(sizeof(*lo), GFP_KERNEL);
-       if (!lo)
+       if (!lo) {
+               err = -ENOMEM;
                 goto out;
+       }
+ 
+       err = idr_pre_get(&loop_index_idr, GFP_KERNEL);
+       if (err < 0)
+               goto out_free_dev;
+ 
+       if (i >= 0) {
+               int m;
+ 
+               /* create specific i in the index */
+               err = idr_get_new_above(&loop_index_idr, lo, i, &m);
+               if (err >= 0 && i != m) {
+                       idr_remove(&loop_index_idr, m);
+                       err = -EEXIST;
+               }
+       } else if (i == -1) {
+               int m;
+ 
+               /* get next free nr */
+               err = idr_get_new(&loop_index_idr, lo, &m);
+               if (err >= 0)
+                       i = m;
+       } else {
+               err = -EINVAL;
+       }
+       if (err < 0)
+               goto out_free_dev;
   
         lo->lo_queue = blk_alloc_queue(GFP_KERNEL);
         if (!lo->lo_queue)
@@@ -1610,81 -1646,158 +1645,158 @@@
         disk->private_data      = lo;
         disk->queue             = lo->lo_queue;
         sprintf(disk->disk_name, "loop%d", i);
-       return lo;
+       add_disk(disk);
+       *l = lo;
+       return lo->lo_number;
   
   out_free_queue:
         blk_cleanup_queue(lo->lo_queue);
   out_free_dev:
         kfree(lo);
   out:
-       return NULL;
+       return err;
   }
   
- static void loop_free(struct loop_device *lo)
+ static void loop_remove(struct loop_device *lo)
   {
+       del_gendisk(lo->lo_disk);
         blk_cleanup_queue(lo->lo_queue);
         put_disk(lo->lo_disk);
-       list_del(&lo->lo_list);
         kfree(lo);
   }
   
- static struct loop_device *loop_init_one(int i)
+ static int find_free_cb(int id, void *ptr, void *data)
+ {
+       struct loop_device *lo = ptr;
+       struct loop_device **l = data;
+ 
+       if (lo->lo_state == Lo_unbound) {
+               *l = lo;
+               return 1;
+       }
+       return 0;
+ }
+ 
+ static int loop_lookup(struct loop_device **l, int i)
   {
         struct loop_device *lo;
+       int ret = -ENODEV;
   
-       list_for_each_entry(lo, &loop_devices, lo_list) {
-               if (lo->lo_number == i)
-                       return lo;
+       if (i < 0) {
+               int err;
+ 
+               err = idr_for_each(&loop_index_idr, &find_free_cb, &lo);
+               if (err == 1) {
+                       *l = lo;
+                       ret = lo->lo_number;
+               }
+               goto out;
         }
   
-       lo = loop_alloc(i);
+       /* lookup and return a specific i */
+       lo = idr_find(&loop_index_idr, i);
         if (lo) {
-               add_disk(lo->lo_disk);
-               list_add_tail(&lo->lo_list, &loop_devices);
+               *l = lo;
+               ret = lo->lo_number;
         }
-       return lo;
- }
- 
- static void loop_del_one(struct loop_device *lo)
- {
-       del_gendisk(lo->lo_disk);
-       loop_free(lo);
+ out:
+       return ret;
   }
   
   static struct kobject *loop_probe(dev_t dev, int *part, void *data)
   {
         struct loop_device *lo;
         struct kobject *kobj;
+       int err;
   
-       mutex_lock(&loop_devices_mutex);
-       lo = loop_init_one(MINOR(dev) >> part_shift);
-       kobj = lo ? get_disk(lo->lo_disk) : ERR_PTR(-ENOMEM);
-       mutex_unlock(&loop_devices_mutex);
+       mutex_lock(&loop_index_mutex);
+       err = loop_lookup(&lo, MINOR(dev) >> part_shift);
+       if (err < 0)
+               err = loop_add(&lo, MINOR(dev) >> part_shift);
+       if (err < 0)
+               kobj = ERR_PTR(err);
+       else
+               kobj = get_disk(lo->lo_disk);
+       mutex_unlock(&loop_index_mutex);
   
         *part = 0;
         return kobj;
   }
   
+ static long loop_control_ioctl(struct file *file, unsigned int cmd,
+                              unsigned long parm)
+ {
+       struct loop_device *lo;
+       int ret = -ENOSYS;
+ 
+       mutex_lock(&loop_index_mutex);
+       switch (cmd) {
+       case LOOP_CTL_ADD:
+               ret = loop_lookup(&lo, parm);
+               if (ret >= 0) {
+                       ret = -EEXIST;
+                       break;
+               }
+               ret = loop_add(&lo, parm);
+               break;
+       case LOOP_CTL_REMOVE:
+               ret = loop_lookup(&lo, parm);
+               if (ret < 0)
+                       break;
+               mutex_lock(&lo->lo_ctl_mutex);
+               if (lo->lo_state != Lo_unbound) {
+                       ret = -EBUSY;
+                       mutex_unlock(&lo->lo_ctl_mutex);
+                       break;
+               }
+               if (lo->lo_refcnt > 0) {
+                       ret = -EBUSY;
+                       mutex_unlock(&lo->lo_ctl_mutex);
+                       break;
+               }
+               lo->lo_disk->private_data = NULL;
+               mutex_unlock(&lo->lo_ctl_mutex);
+               idr_remove(&loop_index_idr, lo->lo_number);
+               loop_remove(lo);
+               break;
+       case LOOP_CTL_GET_FREE:
+               ret = loop_lookup(&lo, -1);
+               if (ret >= 0)
+                       break;
+               ret = loop_add(&lo, -1);
+       }
+       mutex_unlock(&loop_index_mutex);
+ 
+       return ret;
+ }
+ 
+ static const struct file_operations loop_ctl_fops = {
+       .open           = nonseekable_open,
+       .unlocked_ioctl = loop_control_ioctl,
+       .compat_ioctl   = loop_control_ioctl,
+       .owner          = THIS_MODULE,
+       .llseek         = noop_llseek,
+ };
+ 
+ static struct miscdevice loop_misc = {
+       .minor          = LOOP_CTRL_MINOR,
+       .name           = "loop-control",
+       .fops           = &loop_ctl_fops,
+ };
+ 
+ MODULE_ALIAS_MISCDEV(LOOP_CTRL_MINOR);
+ MODULE_ALIAS("devname:loop-control");
+ 
   static int __init loop_init(void)
   {
         int i, nr;
         unsigned long range;
-       struct loop_device *lo, *next;
+       struct loop_device *lo;
+       int err;
   
-       /*
-        * loop module now has a feature to instantiate underlying device
-        * structure on-demand, provided that there is an access dev node.
-        * However, this will not work well with user space tool that doesn't
-        * know about such "feature".  In order to not break any existing
-        * tool, we do the following:
-        *
-        * (1) if max_loop is specified, create that many upfront, and this
-        *     also becomes a hard limit.
-        * (2) if max_loop is not specified, create 8 loop device on module
-        *     load, user can further extend loop device by create dev node
-        *     themselves and have kernel automatically instantiate actual
-        *     device on-demand.
-        */
+       err = misc_register(&loop_misc);
+       if (err < 0)
+               return err;
   
         part_shift = 0;
         if (max_part > 0) {
@@@ -1707,57 -1820,60 +1819,60 @@@
         if (max_loop > 1UL << (MINORBITS - part_shift))
                 return -EINVAL;
   
+       /*
+        * If max_loop is specified, create that many devices upfront.
+        * This also becomes a hard limit. If max_loop is not specified,
+        * create CONFIG_BLK_DEV_LOOP_MIN_COUNT loop devices at module
+        * init time. Loop devices can be requested on-demand with the
+        * /dev/loop-control interface, or be instantiated by accessing
+        * a 'dead' device node.
+        */
         if (max_loop) {
                 nr = max_loop;
                 range = max_loop << part_shift;
         } else {
-               nr = 8;
+               nr = CONFIG_BLK_DEV_LOOP_MIN_COUNT;
                 range = 1UL << MINORBITS;
         }
   
         if (register_blkdev(LOOP_MAJOR, "loop"))
                 return -EIO;
   
-       for (i = 0; i < nr; i++) {
-               lo = loop_alloc(i);
-               if (!lo)
-                       goto Enomem;
-               list_add_tail(&lo->lo_list, &loop_devices);
-       }
- 
-       /* point of no return */
- 
-       list_for_each_entry(lo, &loop_devices, lo_list)
-               add_disk(lo->lo_disk);
- 
         blk_register_region(MKDEV(LOOP_MAJOR, 0), range,
                                   THIS_MODULE, loop_probe, NULL, NULL);
   
+       /* pre-create number of devices given by config or max_loop */
+       mutex_lock(&loop_index_mutex);
+       for (i = 0; i < nr; i++)
+               loop_add(&lo, i);
+       mutex_unlock(&loop_index_mutex);
+ 
         printk(KERN_INFO "loop: module loaded\n");
         return 0;
+ }
   
- Enomem:
-       printk(KERN_INFO "loop: out of memory\n");
- 
-       list_for_each_entry_safe(lo, next, &loop_devices, lo_list)
-               loop_free(lo);
+ static int loop_exit_cb(int id, void *ptr, void *data)
+ {
+       struct loop_device *lo = ptr;
   
-       unregister_blkdev(LOOP_MAJOR, "loop");
-       return -ENOMEM;
+       loop_remove(lo);
+       return 0;
   }
   
   static void __exit loop_exit(void)
   {
         unsigned long range;
-       struct loop_device *lo, *next;
   
         range = max_loop ? max_loop << part_shift : 1UL << MINORBITS;
   
-       list_for_each_entry_safe(lo, next, &loop_devices, lo_list)
-               loop_del_one(lo);
+       idr_for_each(&loop_index_idr, &loop_exit_cb, NULL);
+       idr_remove_all(&loop_index_idr);
+       idr_destroy(&loop_index_idr);
   
         blk_unregister_region(MKDEV(LOOP_MAJOR, 0), range);
         unregister_blkdev(LOOP_MAJOR, "loop");
+ 
+       misc_deregister(&loop_misc);
   }
   
   module_init(loop_init);
diff --combined drivers/md/md.c

index 5c2178562c964c17d8fa941f81cde5dd21ec9a00,5c95ccb595007cab1aba9859cc4da62cad80536f..8f52d4eb78a0ad26c7af97e39809b76ac15ce580
--- 1/drivers/md/md.c
--- 2/drivers/md/md.c
+++ b/drivers/md/md.c
@@@ -61,6 -61,11 +61,11 @@@
   static void autostart_arrays(int part);
   #endif
   
+ /* pers_list is a list of registered personalities protected
+  * by pers_lock.
+  * pers_lock does extra service to protect accesses to
+  * mddev->thread when the mutex cannot be held.
+  */
   static LIST_HEAD(pers_list);
   static DEFINE_SPINLOCK(pers_lock);
   
@@@ -330,17 -335,18 +335,17 @@@ static DEFINE_SPINLOCK(all_mddevs_lock)
    * call has finished, the bio has been linked into some internal structure
    * and so is visible to ->quiesce(), so we don't need the refcount any more.
    */
- -static int md_make_request(struct request_queue *q, struct bio *bio)
+ +static void md_make_request(struct request_queue *q, struct bio *bio)
   {
         const int rw = bio_data_dir(bio);
         mddev_t *mddev = q->queuedata;
- -      int rv;
         int cpu;
         unsigned int sectors;
   
         if (mddev == NULL || mddev->pers == NULL
             || !mddev->ready) {
                 bio_io_error(bio);
- -              return 0;
+ +              return;
         }
         smp_rmb(); /* Ensure implications of  'active' are visible */
         rcu_read_lock();
@@@ -365,7 -371,7 +370,7 @@@
          * go away inside make_request
          */
         sectors = bio_sectors(bio);
- -      rv = mddev->pers->make_request(mddev, bio);
+ +      mddev->pers->make_request(mddev, bio);
   
         cpu = part_stat_lock();
         part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]);
@@@ -374,6 -380,8 +379,6 @@@
   
         if (atomic_dec_and_test(&mddev->active_io) && mddev->suspended)
                 wake_up(&mddev->sb_wait);
- -
- -      return rv;
   }
   
   /* mddev_suspend makes sure no new requests are submitted
@@@ -472,7 -480,8 +477,7 @@@ static void md_submit_flush_data(struc
                 bio_endio(bio, 0);
         else {
                 bio->bi_rw &= ~REQ_FLUSH;
- -              if (mddev->pers->make_request(mddev, bio))
- -                      generic_make_request(bio);
+ +              mddev->pers->make_request(mddev, bio);
         }
   
         mddev->flush_bio = NULL;
@@@ -735,7 -744,12 +740,12 @@@ static void mddev_unlock(mddev_t * mdde
         } else
                 mutex_unlock(&mddev->reconfig_mutex);
   
+       /* As we've dropped the mutex we need a spinlock to
+        * make sure the thread doesn't disappear
+        */
+       spin_lock(&pers_lock);
         md_wakeup_thread(mddev->thread);
+       spin_unlock(&pers_lock);
   }
   
   static mdk_rdev_t * find_rdev_nr(mddev_t *mddev, int nr)
@@@ -844,7 -858,7 +854,7 @@@ void md_super_write(mddev_t *mddev, mdk
         bio->bi_end_io = super_written;
   
         atomic_inc(&mddev->pending_writes);
-       submit_bio(REQ_WRITE | REQ_SYNC | REQ_FLUSH | REQ_FUA, bio);
+       submit_bio(WRITE_FLUSH_FUA, bio);
   }
   
   void md_super_wait(mddev_t *mddev)
@@@ -1134,8 -1148,11 +1144,11 @@@ static int super_90_load(mdk_rdev_t *rd
                         ret = 0;
         }
         rdev->sectors = rdev->sb_start;
+       /* Limit to 4TB as metadata cannot record more than that */
+       if (rdev->sectors >= (2ULL << 32))
+               rdev->sectors = (2ULL << 32) - 2;
   
-       if (rdev->sectors < sb->size * 2 && sb->level > 1)
+       if (rdev->sectors < ((sector_t)sb->size) * 2 && sb->level >= 1)
                 /* "this cannot possibly happen" ... */
                 ret = -EINVAL;
   
@@@ -1169,7 -1186,7 +1182,7 @@@ static int super_90_validate(mddev_t *m
                 mddev->clevel[0] = 0;
                 mddev->layout = sb->layout;
                 mddev->raid_disks = sb->raid_disks;
-               mddev->dev_sectors = sb->size * 2;
+               mddev->dev_sectors = ((sector_t)sb->size) * 2;
                 mddev->events = ev1;
                 mddev->bitmap_info.offset = 0;
                 mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9;
@@@ -1411,6 -1428,11 +1424,11 @@@ super_90_rdev_size_change(mdk_rdev_t *r
         rdev->sb_start = calc_dev_sboffset(rdev);
         if (!num_sectors || num_sectors > rdev->sb_start)
                 num_sectors = rdev->sb_start;
+       /* Limit to 4TB as metadata cannot record more than that.
+        * 4TB == 2^32 KB, or 2*2^32 sectors.
+        */
+       if (num_sectors >= (2ULL << 32))
+               num_sectors = (2ULL << 32) - 2;
         md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
                        rdev->sb_page);
         md_super_wait(rdev->mddev);
@@@ -1734,6 -1756,11 +1752,11 @@@ static void super_1_sync(mddev_t *mddev
         sb->level = cpu_to_le32(mddev->level);
         sb->layout = cpu_to_le32(mddev->layout);
   
+       if (test_bit(WriteMostly, &rdev->flags))
+               sb->devflags |= WriteMostly1;
+       else
+               sb->devflags &= ~WriteMostly1;
+ 
         if (mddev->bitmap && mddev->bitmap_info.file == NULL) {
                 sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_info.offset);
                 sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET);
@@@ -2557,7 -2584,10 +2580,10 @@@ state_store(mdk_rdev_t *rdev, const cha
         int err = -EINVAL;
         if (cmd_match(buf, "faulty") && rdev->mddev->pers) {
                 md_error(rdev->mddev, rdev);
-               err = 0;
+               if (test_bit(Faulty, &rdev->flags))
+                       err = 0;
+               else
+                       err = -EBUSY;
         } else if (cmd_match(buf, "remove")) {
                 if (rdev->raid_disk >= 0)
                         err = -EBUSY;
@@@ -2580,7 -2610,7 +2606,7 @@@
                 err = 0;
         } else if (cmd_match(buf, "-blocked")) {
                 if (!test_bit(Faulty, &rdev->flags) &&
-                   test_bit(BlockedBadBlocks, &rdev->flags)) {
+                   rdev->badblocks.unacked_exist) {
                         /* metadata handler doesn't understand badblocks,
                          * so we need to fail the device
                          */
@@@ -5979,6 -6009,8 +6005,8 @@@ static int set_disk_faulty(mddev_t *mdd
                 return -ENODEV;
   
         md_error(mddev, rdev);
+       if (!test_bit(Faulty, &rdev->flags))
+               return -EBUSY;
         return 0;
   }
   
@@@ -6407,11 -6439,18 +6435,18 @@@ mdk_thread_t *md_register_thread(void (
         return thread;
   }
   
- void md_unregister_thread(mdk_thread_t *thread)
+ void md_unregister_thread(mdk_thread_t **threadp)
   {
+       mdk_thread_t *thread = *threadp;
         if (!thread)
                 return;
         dprintk("interrupting MD-thread pid %d\n", task_pid_nr(thread->tsk));
+       /* Locking ensures that mddev_unlock does not wake_up a
+        * non-existent thread
+        */
+       spin_lock(&pers_lock);
+       *threadp = NULL;
+       spin_unlock(&pers_lock);
   
         kthread_stop(thread->tsk);
         kfree(thread);
@@@ -7318,8 -7357,7 +7353,7 @@@ static void reap_sync_thread(mddev_t *m
         mdk_rdev_t *rdev;
   
         /* resync has finished, collect result */
-       md_unregister_thread(mddev->sync_thread);
-       mddev->sync_thread = NULL;
+       md_unregister_thread(&mddev->sync_thread);
         if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
             !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
                 /* success...*/
diff --combined drivers/md/md.h

index bd47847cf7caab6307a3c574040ce95f3fb5fc18,0a309dc29b45a4eed774dd16132aeeef60ac52cd..1509a3eb9ae15dcfbcf33e1e130df96e961fc8f9
--- 1/drivers/md/md.h
--- 2/drivers/md/md.h
+++ b/drivers/md/md.h
@@@ -424,7 -424,7 +424,7 @@@ struct mdk_personalit
         int level;
         struct list_head list;
         struct module *owner;
- -      int (*make_request)(mddev_t *mddev, struct bio *bio);
+ +      void (*make_request)(mddev_t *mddev, struct bio *bio);
         int (*run)(mddev_t *mddev);
         int (*stop)(mddev_t *mddev);
         void (*status)(struct seq_file *seq, mddev_t *mddev);
@@@ -560,7 -560,7 +560,7 @@@ extern int register_md_personality(stru
   extern int unregister_md_personality(struct mdk_personality *p);
   extern mdk_thread_t * md_register_thread(void (*run) (mddev_t *mddev),
                                 mddev_t *mddev, const char *name);
- extern void md_unregister_thread(mdk_thread_t *thread);
+ extern void md_unregister_thread(mdk_thread_t **threadp);
   extern void md_wakeup_thread(mdk_thread_t *thread);
   extern void md_check_recovery(mddev_t *mddev);
   extern void md_write_start(mddev_t *mddev, struct bio *bi);
diff --combined drivers/md/multipath.c

index 407cb56914254cf55b067f54ce06f33bfddd8c26,d5b5fb3001717d6c5436389615c38044b9bb3b9a..618dd9e225132bf279aec8954e71cf9651dc5bae
--- 1/drivers/md/multipath.c
--- 2/drivers/md/multipath.c
+++ b/drivers/md/multipath.c
@@@ -106,7 -106,7 +106,7 @@@ static void multipath_end_request(struc
         rdev_dec_pending(rdev, conf->mddev);
   }
   
- -static int multipath_make_request(mddev_t *mddev, struct bio * bio)
+ +static void multipath_make_request(mddev_t *mddev, struct bio * bio)
   {
         multipath_conf_t *conf = mddev->private;
         struct multipath_bh * mp_bh;
@@@ -114,7 -114,7 +114,7 @@@
   
         if (unlikely(bio->bi_rw & REQ_FLUSH)) {
                 md_flush_request(mddev, bio);
- -              return 0;
+ +              return;
         }
   
         mp_bh = mempool_alloc(conf->pool, GFP_NOIO);
@@@ -126,7 -126,7 +126,7 @@@
         if (mp_bh->path < 0) {
                 bio_endio(bio, -EIO);
                 mempool_free(mp_bh, conf->pool);
- -              return 0;
+ +              return;
         }
         multipath = conf->multipaths + mp_bh->path;
   
@@@ -137,7 -137,7 +137,7 @@@
         mp_bh->bio.bi_end_io = multipath_end_request;
         mp_bh->bio.bi_private = mp_bh;
         generic_make_request(&mp_bh->bio);
- -      return 0;
+ +      return;
   }
   
   static void multipath_status (struct seq_file *seq, mddev_t *mddev)
@@@ -514,8 -514,7 +514,7 @@@ static int multipath_stop (mddev_t *mdd
   {
         multipath_conf_t *conf = mddev->private;
   
-       md_unregister_thread(mddev->thread);
-       mddev->thread = NULL;
+       md_unregister_thread(&mddev->thread);
         blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
         mempool_destroy(conf->pool);
         kfree(conf->multipaths);
diff --combined drivers/md/raid1.c

index 97f2a5f977b16c241b6cd5c1f6d7874c8bef5b42,d9587dffe533e69c81b6adf5221ee1da745cfa4b..d4ddfa62730147d7276d85fdf1e5e61441cdf1b9
--- 1/drivers/md/raid1.c
--- 2/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@@ -785,7 -785,7 +785,7 @@@ do_sync_io
         PRINTK("%dB behind alloc failed, doing sync I/O\n", bio->bi_size);
   }
   
- -static int make_request(mddev_t *mddev, struct bio * bio)
+ +static void make_request(mddev_t *mddev, struct bio * bio)
   {
         conf_t *conf = mddev->private;
         mirror_info_t *mirror;
@@@ -870,7 -870,7 +870,7 @@@ read_again
                 if (rdisk < 0) {
                         /* couldn't find anywhere to read from */
                         raid_end_bio_io(r1_bio);
- -                      return 0;
+ +                      return;
                 }
                 mirror = conf->mirrors + rdisk;
   
@@@ -928,7 -928,7 +928,7 @@@
                         goto read_again;
                 } else
                         generic_make_request(read_bio);
- -              return 0;
+ +              return;
         }
   
         /*
@@@ -1099,12 -1099,11 +1099,11 @@@
                 bio_list_add(&conf->pending_bio_list, mbio);
                 spin_unlock_irqrestore(&conf->device_lock, flags);
         }
-       r1_bio_write_done(r1_bio);
- 
-       /* In case raid1d snuck in to freeze_array */
-       wake_up(&conf->wait_barrier);
- 
+       /* Mustn't call r1_bio_write_done before this next test,
+        * as it could result in the bio being freed.
+        */
         if (sectors_handled < (bio->bi_size >> 9)) {
+               r1_bio_write_done(r1_bio);
                 /* We need another r1_bio.  It has already been counted
                  * in bio->bi_phys_segments
                  */
@@@ -1117,8 -1116,15 +1116,13 @@@
                 goto retry_write;
         }
   
+       r1_bio_write_done(r1_bio);
+ 
+       /* In case raid1d snuck in to freeze_array */
+       wake_up(&conf->wait_barrier);
+ 
         if (do_sync || !bitmap || !plugged)
                 md_wakeup_thread(mddev->thread);
- -
- -      return 0;
   }
   
   static void status(struct seq_file *seq, mddev_t *mddev)
@@@ -2556,8 -2562,7 +2560,7 @@@ static int stop(mddev_t *mddev
         raise_barrier(conf);
         lower_barrier(conf);
   
-       md_unregister_thread(mddev->thread);
-       mddev->thread = NULL;
+       md_unregister_thread(&mddev->thread);
         if (conf->r1bio_pool)
                 mempool_destroy(conf->r1bio_pool);
         kfree(conf->mirrors);
diff --combined drivers/md/raid10.c

index 04b625e1cb602965e64a3498b282273b034b2aea,0cd9672cf9cbd6032438d4bcb3503bf4794357fc..ea5fc0b6a84c34af405654ce70209c0e976cccc7
--- 1/drivers/md/raid10.c
--- 2/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@@ -337,6 -337,21 +337,21 @@@ static void close_write(r10bio_t *r10_b
         md_write_end(r10_bio->mddev);
   }
   
+ static void one_write_done(r10bio_t *r10_bio)
+ {
+       if (atomic_dec_and_test(&r10_bio->remaining)) {
+               if (test_bit(R10BIO_WriteError, &r10_bio->state))
+                       reschedule_retry(r10_bio);
+               else {
+                       close_write(r10_bio);
+                       if (test_bit(R10BIO_MadeGood, &r10_bio->state))
+                               reschedule_retry(r10_bio);
+                       else
+                               raid_end_bio_io(r10_bio);
+               }
+       }
+ }
+ 
   static void raid10_end_write_request(struct bio *bio, int error)
   {
         int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
@@@ -387,17 -402,7 +402,7 @@@
          * Let's see if all mirrored write operations have finished
          * already.
          */
-       if (atomic_dec_and_test(&r10_bio->remaining)) {
-               if (test_bit(R10BIO_WriteError, &r10_bio->state))
-                       reschedule_retry(r10_bio);
-               else {
-                       close_write(r10_bio);
-                       if (test_bit(R10BIO_MadeGood, &r10_bio->state))
-                               reschedule_retry(r10_bio);
-                       else
-                               raid_end_bio_io(r10_bio);
-               }
-       }
+       one_write_done(r10_bio);
         if (dec_rdev)
                 rdev_dec_pending(conf->mirrors[dev].rdev, conf->mddev);
   }
@@@ -825,7 -830,7 +830,7 @@@ static void unfreeze_array(conf_t *conf
         spin_unlock_irq(&conf->resync_lock);
   }
   
- -static int make_request(mddev_t *mddev, struct bio * bio)
+ +static void make_request(mddev_t *mddev, struct bio * bio)
   {
         conf_t *conf = mddev->private;
         mirror_info_t *mirror;
@@@ -844,7 -849,7 +849,7 @@@
   
         if (unlikely(bio->bi_rw & REQ_FLUSH)) {
                 md_flush_request(mddev, bio);
- -              return 0;
+ +              return;
         }
   
         /* If this request crosses a chunk boundary, we need to
@@@ -876,8 -881,10 +881,8 @@@
                 conf->nr_waiting++;
                 spin_unlock_irq(&conf->resync_lock);
   
- -              if (make_request(mddev, &bp->bio1))
- -                      generic_make_request(&bp->bio1);
- -              if (make_request(mddev, &bp->bio2))
- -                      generic_make_request(&bp->bio2);
+ +              make_request(mddev, &bp->bio1);
+ +              make_request(mddev, &bp->bio2);
   
                 spin_lock_irq(&conf->resync_lock);
                 conf->nr_waiting--;
@@@ -885,14 -892,14 +890,14 @@@
                 spin_unlock_irq(&conf->resync_lock);
   
                 bio_pair_release(bp);
- -              return 0;
+ +              return;
         bad_map:
                 printk("md/raid10:%s: make_request bug: can't convert block across chunks"
                        " or bigger than %dk %llu %d\n", mdname(mddev), chunk_sects/2,
                        (unsigned long long)bio->bi_sector, bio->bi_size >> 10);
   
                 bio_io_error(bio);
- -              return 0;
+ +              return;
         }
   
         md_write_start(mddev, bio);
@@@ -935,7 -942,7 +940,7 @@@ read_again
                 slot = r10_bio->read_slot;
                 if (disk < 0) {
                         raid_end_bio_io(r10_bio);
- -                      return 0;
+ +                      return;
                 }
                 mirror = conf->mirrors + disk;
   
@@@ -983,7 -990,7 +988,7 @@@
                         goto read_again;
                 } else
                         generic_make_request(read_bio);
- -              return 0;
+ +              return;
         }
   
         /*
@@@ -1125,20 -1132,12 +1130,12 @@@ retry_write
                 spin_unlock_irqrestore(&conf->device_lock, flags);
         }
   
-       if (atomic_dec_and_test(&r10_bio->remaining)) {
-               /* This matches the end of raid10_end_write_request() */
-               bitmap_endwrite(r10_bio->mddev->bitmap, r10_bio->sector,
-                               r10_bio->sectors,
-                               !test_bit(R10BIO_Degraded, &r10_bio->state),
-                               0);
-               md_write_end(mddev);
-               raid_end_bio_io(r10_bio);
-       }
- 
-       /* In case raid10d snuck in to freeze_array */
-       wake_up(&conf->wait_barrier);
+       /* Don't remove the bias on 'remaining' (one_write_done) until
+        * after checking if we need to go around again.
+        */
   
         if (sectors_handled < (bio->bi_size >> 9)) {
+               one_write_done(r10_bio);
                 /* We need another r10_bio.  It has already been counted
                  * in bio->bi_phys_segments.
                  */
@@@ -1152,9 -1151,14 +1149,13 @@@
                 r10_bio->state = 0;
                 goto retry_write;
         }
+       one_write_done(r10_bio);
+ 
+       /* In case raid10d snuck in to freeze_array */
+       wake_up(&conf->wait_barrier);
   
         if (do_sync || !mddev->bitmap || !plugged)
                 md_wakeup_thread(mddev->thread);
- -      return 0;
   }
   
   static void status(struct seq_file *seq, mddev_t *mddev)
@@@ -2951,7 -2955,7 +2952,7 @@@ static int run(mddev_t *mddev
         return 0;
   
   out_free_conf:
-       md_unregister_thread(mddev->thread);
+       md_unregister_thread(&mddev->thread);
         if (conf->r10bio_pool)
                 mempool_destroy(conf->r10bio_pool);
         safe_put_page(conf->tmppage);
@@@ -2969,8 -2973,7 +2970,7 @@@ static int stop(mddev_t *mddev
         raise_barrier(conf, 0);
         lower_barrier(conf);
   
-       md_unregister_thread(mddev->thread);
-       mddev->thread = NULL;
+       md_unregister_thread(&mddev->thread);
         blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
         if (conf->r10bio_pool)
                 mempool_destroy(conf->r10bio_pool);
diff --combined drivers/md/raid5.c

index 96b7f6a1b6f2718a4de6dabaaba539246c555e14,ac5e8b57e50fbc2d788c0906263ce30ac56eeb60..83f2c44e170fa31cb6aa882c67c7b3c8b922df99
--- 1/drivers/md/raid5.c
--- 2/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@@ -3336,7 -3336,7 +3336,7 @@@ static void handle_stripe(struct stripe
   
   finish:
         /* wait for this device to become unblocked */
-       if (unlikely(s.blocked_rdev))
+       if (conf->mddev->external && unlikely(s.blocked_rdev))
                 md_wait_for_blocked_rdev(s.blocked_rdev, conf->mddev);
   
         if (s.handle_bad_blocks)
@@@ -3695,7 -3695,7 +3695,7 @@@ static struct stripe_head *__get_priori
         return sh;
   }
   
- -static int make_request(mddev_t *mddev, struct bio * bi)
+ +static void make_request(mddev_t *mddev, struct bio * bi)
   {
         raid5_conf_t *conf = mddev->private;
         int dd_idx;
@@@ -3708,7 -3708,7 +3708,7 @@@
   
         if (unlikely(bi->bi_rw & REQ_FLUSH)) {
                 md_flush_request(mddev, bi);
- -              return 0;
+ +              return;
         }
   
         md_write_start(mddev, bi);
@@@ -3716,7 -3716,7 +3716,7 @@@
         if (rw == READ &&
              mddev->reshape_position == MaxSector &&
              chunk_aligned_read(mddev,bi))
- -              return 0;
+ +              return;
   
         logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1);
         last_sector = bi->bi_sector + (bi->bi_size>>9);
@@@ -3851,6 -3851,8 +3851,6 @@@
   
                 bio_endio(bi, 0);
         }
- -
- -      return 0;
   }
   
   static sector_t raid5_size(mddev_t *mddev, sector_t sectors, int raid_disks);
@@@ -4939,8 -4941,7 +4939,7 @@@ static int run(mddev_t *mddev
   
         return 0;
   abort:
-       md_unregister_thread(mddev->thread);
-       mddev->thread = NULL;
+       md_unregister_thread(&mddev->thread);
         if (conf) {
                 print_raid5_conf(conf);
                 free_conf(conf);
@@@ -4954,8 -4955,7 +4953,7 @@@ static int stop(mddev_t *mddev
   {
         raid5_conf_t *conf = mddev->private;
   
-       md_unregister_thread(mddev->thread);
-       mddev->thread = NULL;
+       md_unregister_thread(&mddev->thread);
         if (mddev->queue)
                 mddev->queue->backing_dev_info.congested_fn = NULL;
         free_conf(conf);
diff --combined include/linux/blkdev.h

index 1978655faa3b7c2961a9001bc030d3b294fda1b8,7fbaa9103344a261082afd872127f736abdcd56f..0b68044e7abbb2e0a4f7c963cf76020a9968eb19
--- 1/include/linux/blkdev.h
--- 2/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@@ -30,6 -30,7 +30,7 @@@ struct request_pm_state
   struct blk_trace;
   struct request;
   struct sg_io_hdr;
+ struct bsg_job;
   
   #define BLKDEV_MIN_RQ 4
   #define BLKDEV_MAX_RQ 128     /* Default maximum */
@@@ -117,6 -118,7 +118,7 @@@ struct request 
                 struct {
                         unsigned int            seq;
                         struct list_head        list;
+                       rq_end_io_fn            *saved_end_io;
                 } flush;
         };
   
@@@ -193,7 -195,7 +195,7 @@@ struct request_pm_stat
   #include <linux/elevator.h>
   
   typedef void (request_fn_proc) (struct request_queue *q);
- -typedef int (make_request_fn) (struct request_queue *q, struct bio *bio);
+ +typedef void (make_request_fn) (struct request_queue *q, struct bio *bio);
   typedef int (prep_rq_fn) (struct request_queue *, struct request *);
   typedef void (unprep_rq_fn) (struct request_queue *, struct request *);
   
@@@ -209,6 -211,7 +211,7 @@@ typedef int (merge_bvec_fn) (struct req
   typedef void (softirq_done_fn)(struct request *);
   typedef int (dma_drain_needed_fn)(struct request *);
   typedef int (lld_busy_fn) (struct request_queue *q);
+ typedef int (bsg_job_fn) (struct bsg_job *);
   
   enum blk_eh_timer_return {
         BLK_EH_NOT_HANDLED,
@@@ -375,6 -378,8 +378,8 @@@ struct request_queue 
         struct mutex            sysfs_lock;
   
   #if defined(CONFIG_BLK_DEV_BSG)
+       bsg_job_fn              *bsg_job_fn;
+       int                     bsg_job_size;
         struct bsg_class_device bsg_dev;
   #endif
   
@@@ -675,8 -680,6 +680,8 @@@ extern int scsi_cmd_ioctl(struct reques
   extern int sg_scsi_ioctl(struct request_queue *, struct gendisk *, fmode_t,
                          struct scsi_ioctl_command __user *);
   
+ +extern void blk_queue_bio(struct request_queue *q, struct bio *bio);
+ +
   /*
    * A queue has just exitted congestion.  Note this in the global counter of
    * congested queues, and wake up anyone who was waiting for requests to be
@@@ -860,23 -863,16 +865,22 @@@ struct request_queue *blk_alloc_queue_n
   extern void blk_put_queue(struct request_queue *);
   
   /*
- - * Note: Code in between changing the blk_plug list/cb_list or element of such
- - * lists is preemptable, but such code can't do sleep (or be very careful),
- - * otherwise data is corrupted. For details, please check schedule() where
- - * blk_schedule_flush_plug() is called.
+ + * blk_plug permits building a queue of related requests by holding the I/O
+ + * fragments for a short period. This allows merging of sequential requests
+ + * into a single larger request. As the requests are moved from a per-task list to
+ + * the device's request_queue in a batch, this results in improved scalability
+ + * as the lock contention for request_queue lock is reduced.
+ + *
+ + * It is ok not to disable preemption when adding the request to the plug list
+ + * or when attempting a merge, because blk_schedule_flush_plug() will only flush
+ + * the plug list when the task sleeps by itself. For details, please see
+ + * schedule() where blk_schedule_flush_plug() is called.
    */
   struct blk_plug {
- -      unsigned long magic;
- -      struct list_head list;
- -      struct list_head cb_list;
- -      unsigned int should_sort;
+ +      unsigned long magic; /* detect uninitialized use-cases */
+ +      struct list_head list; /* requests */
+ +      struct list_head cb_list; /* md requires an unplug callback */
+ +      unsigned int should_sort; /* list to be sorted before flushing? */
-       unsigned int count; /* number of queued requests */
   };
   #define BLK_MAX_REQUEST_COUNT 16
author	Jens Axboe <axboe@kernel.dk>
	Wed, 19 Oct 2011 12:30:42 +0000 (14:30 +0200)
committer	Jens Axboe <axboe@kernel.dk>
	Wed, 19 Oct 2011 12:30:42 +0000 (14:30 +0200)
		1	2
block/blk-core.c	patch \|	diff1 \|	diff2 \|	blob \| history
block/blk-sysfs.c	patch \|	diff1 \|	diff2 \|	blob \| history
drivers/block/loop.c	patch \|	diff1 \|	diff2 \|	blob \| history
drivers/md/md.c	patch \|	diff1 \|	diff2 \|	blob \| history
drivers/md/md.h	patch \|	diff1 \|	diff2 \|	blob \| history
drivers/md/multipath.c	patch \|	diff1 \|	diff2 \|	blob \| history
drivers/md/raid1.c	patch \|	diff1 \|	diff2 \|	blob \| history
drivers/md/raid10.c	patch \|	diff1 \|	diff2 \|	blob \| history
drivers/md/raid5.c	patch \|	diff1 \|	diff2 \|	blob \| history
include/linux/blkdev.h	patch \|	diff1 \|	diff2 \|	blob \| history