From: Jens Axboe
Date: Wed, 19 Oct 2011 12:30:42 +0000 (+0200)
Subject: Merge branch 'v3.1-rc10' into for-3.2/core
X-Git-Tag: firefly_0821_release~3680^2~4193^2~19
X-Git-Url: http://demsky.eecs.uci.edu/git/?a=commitdiff_plain;h=5c04b426f2e8b46cfc7969a35b2631063a3c646c;hp=-c;p=firefly-linux-kernel-4.4.55.git

Merge branch 'v3.1-rc10' into for-3.2/core

Conflicts:
	block/blk-core.c
	include/linux/blkdev.h

Signed-off-by: Jens Axboe
---

5c04b426f2e8b46cfc7969a35b2631063a3c646c
diff --combined block/blk-core.c
index 97e9e5405b83,d34433ae7917..79e41a76d96a
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@@ -38,6 -38,8 +38,6 @@@ EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_
 EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_remap);
 EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_complete);

-static int __make_request(struct request_queue *q, struct bio *bio);
-
 /*
  * For the allocated request tables
  */
@@@ -346,9 -348,10 +346,10 @@@ void blk_put_queue(struct request_queu
 EXPORT_SYMBOL(blk_put_queue);

 /*
- * Note: If a driver supplied the queue lock, it should not zap that lock
- * unexpectedly as some queue cleanup components like elevator_exit() and
- * blk_throtl_exit() need queue lock.
+ * Note: If a driver supplied the queue lock, it is disconnected
+ * by this function. The actual state of the lock doesn't matter
+ * here as the request_queue isn't accessible after this point
+ * (QUEUE_FLAG_DEAD is set) and no other requests will be queued.
  */
 void blk_cleanup_queue(struct request_queue *q)
 {
@@@ -365,10 -368,8 +366,8 @@@
 	queue_flag_set_unlocked(QUEUE_FLAG_DEAD, q);
 	mutex_unlock(&q->sysfs_lock);

-	if (q->elevator)
-		elevator_exit(q->elevator);
-
-	blk_throtl_exit(q);
+	if (q->queue_lock != &q->__queue_lock)
+		q->queue_lock = &q->__queue_lock;

 	blk_put_queue(q);
 }
@@@ -540,7 -541,7 +539,7 @@@ blk_init_allocated_queue_node(struct re
 	/*
 	 * This also sets hw/phys segments, boundary and size
 	 */
-	blk_queue_make_request(q, __make_request);
+	blk_queue_make_request(q, blk_queue_bio);

 	q->sg_reserved_size = INT_MAX;

@@@ -1165,7 -1166,7 +1164,7 @@@ static bool bio_attempt_front_merge(str
  * true if merge was successful, otherwise false.
  */
 static bool attempt_plug_merge(struct task_struct *tsk, struct request_queue *q,
-			       struct bio *bio)
+			       struct bio *bio, unsigned int *request_count)
 {
 	struct blk_plug *plug;
 	struct request *rq;
@@@ -1174,10 -1175,13 +1173,13 @@@
 	plug = tsk->plug;
 	if (!plug)
 		goto out;
+	*request_count = 0;

 	list_for_each_entry_reverse(rq, &plug->list, queuelist) {
 		int el_ret;

+		(*request_count)++;
+
 		if (rq->q != q)
 			continue;

@@@ -1211,12 -1215,13 +1213,13 @@@ void init_request_from_bio(struct reque
 	blk_rq_bio_prep(req->q, req, bio);
 }

-static int __make_request(struct request_queue *q, struct bio *bio)
+void blk_queue_bio(struct request_queue *q, struct bio *bio)
 {
 	const bool sync = !!(bio->bi_rw & REQ_SYNC);
 	struct blk_plug *plug;
 	int el_ret, rw_flags, where = ELEVATOR_INSERT_SORT;
 	struct request *req;
+	unsigned int request_count = 0;

 	/*
 	 * low level driver can indicate that it wants pages above a
@@@ -1235,8 -1240,8 +1238,8 @@@
 	 * Check if we can merge with the plugged list before grabbing
 	 * any locks.
 	 */
-	if (attempt_plug_merge(current, q, bio))
+	if (attempt_plug_merge(current, q, bio, &request_count))
-		goto out;
+		return;

 	spin_lock_irq(q->queue_lock);

@@@ -1300,11 -1305,10 +1303,10 @@@ get_rq
 			if (__rq->q != q)
 				plug->should_sort = 1;
 		}
+		if (request_count >= BLK_MAX_REQUEST_COUNT)
+			blk_flush_plug_list(plug, false);
 		list_add_tail(&req->queuelist, &plug->list);
-		plug->count++;
 		drive_stat_acct(req, 1);
-		if (plug->count >= BLK_MAX_REQUEST_COUNT)
-			blk_flush_plug_list(plug, false);
 	} else {
 		spin_lock_irq(q->queue_lock);
 		add_acct_request(q, req, where);
@@@ -1312,8 -1316,9 +1314,8 @@@
 out_unlock:
 		spin_unlock_irq(q->queue_lock);
 	}
-out:
-	return 0;
 }
+EXPORT_SYMBOL_GPL(blk_queue_bio);	/* for device mapper only */

 /*
  * If bio->bi_dev is a partition, remap the location
@@@ -1412,142 -1417,165 +1414,142 @@@ static inline int bio_check_eod(struct
 	return 0;
 }

-/**
- * generic_make_request - hand a buffer to its device driver for I/O
- * @bio: The bio describing the location in memory and on the device.
- *
- * generic_make_request() is used to make I/O requests of block
- * devices. It is passed a &struct bio, which describes the I/O that needs
- * to be done.
- *
- * generic_make_request() does not return any status. The
- * success/failure status of the request, along with notification of
- * completion, is delivered asynchronously through the bio->bi_end_io
- * function described (one day) else where.
- *
- * The caller of generic_make_request must make sure that bi_io_vec
- * are set to describe the memory buffer, and that bi_dev and bi_sector are
- * set to describe the device address, and the
- * bi_end_io and optionally bi_private are set to describe how
- * completion notification should be signaled.
- *
- * generic_make_request and the drivers it calls may use bi_next if this
- * bio happens to be merged with someone else, and may change bi_dev and
- * bi_sector for remaps as it sees fit. So the values of these fields
- * should NOT be depended on after the call to generic_make_request.
- */
-static inline void __generic_make_request(struct bio *bio)
+static noinline_for_stack bool
+generic_make_request_checks(struct bio *bio)
 {
 	struct request_queue *q;
-	sector_t old_sector;
-	int ret, nr_sectors = bio_sectors(bio);
-	dev_t old_dev;
+	int nr_sectors = bio_sectors(bio);
 	int err = -EIO;
+	char b[BDEVNAME_SIZE];
+	struct hd_struct *part;

 	might_sleep();

 	if (bio_check_eod(bio, nr_sectors))
 		goto end_io;

-	/*
-	 * Resolve the mapping until finished. (drivers are
-	 * still free to implement/resolve their own stacking
-	 * by explicitly returning 0)
-	 *
-	 * NOTE: we don't repeat the blk_size check for each new device.
-	 * Stacking drivers are expected to know what they are doing.
-	 */
-	old_sector = -1;
-	old_dev = 0;
-	do {
-		char b[BDEVNAME_SIZE];
-		struct hd_struct *part;
-
-		q = bdev_get_queue(bio->bi_bdev);
-		if (unlikely(!q)) {
-			printk(KERN_ERR
-			       "generic_make_request: Trying to access "
-			       "nonexistent block-device %s (%Lu)\n",
-			       bdevname(bio->bi_bdev, b),
-			       (long long) bio->bi_sector);
-			goto end_io;
-		}
-
-		if (unlikely(!(bio->bi_rw & REQ_DISCARD) &&
-			     nr_sectors > queue_max_hw_sectors(q))) {
-			printk(KERN_ERR "bio too big device %s (%u > %u)\n",
-			       bdevname(bio->bi_bdev, b),
-			       bio_sectors(bio),
-			       queue_max_hw_sectors(q));
-			goto end_io;
-		}
-
-		if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags)))
-			goto end_io;
-
-		part = bio->bi_bdev->bd_part;
-		if (should_fail_request(part, bio->bi_size) ||
-		    should_fail_request(&part_to_disk(part)->part0,
-					bio->bi_size))
-			goto end_io;
+	q = bdev_get_queue(bio->bi_bdev);
+	if (unlikely(!q)) {
+		printk(KERN_ERR
+		       "generic_make_request: Trying to access "
+		       "nonexistent block-device %s (%Lu)\n",
+		       bdevname(bio->bi_bdev, b),
+		       (long long) bio->bi_sector);
+		goto end_io;
+	}

-		/*
-		 * If this device has partitions, remap block n
-		 * of partition p to block n+start(p) of the disk.
-		 */
-		blk_partition_remap(bio);
+	if (unlikely(!(bio->bi_rw & REQ_DISCARD) &&
+		     nr_sectors > queue_max_hw_sectors(q))) {
+		printk(KERN_ERR "bio too big device %s (%u > %u)\n",
+		       bdevname(bio->bi_bdev, b),
+		       bio_sectors(bio),
+		       queue_max_hw_sectors(q));
+		goto end_io;
+	}

-		if (bio_integrity_enabled(bio) && bio_integrity_prep(bio))
-			goto end_io;
+	if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags)))
+		goto end_io;

-		if (old_sector != -1)
-			trace_block_bio_remap(q, bio, old_dev, old_sector);
+	part = bio->bi_bdev->bd_part;
+	if (should_fail_request(part, bio->bi_size) ||
+	    should_fail_request(&part_to_disk(part)->part0,
+				bio->bi_size))
+		goto end_io;

-		old_sector = bio->bi_sector;
-		old_dev = bio->bi_bdev->bd_dev;
+	/*
+	 * If this device has partitions, remap block n
+	 * of partition p to block n+start(p) of the disk.
+	 */
+	blk_partition_remap(bio);

-		if (bio_check_eod(bio, nr_sectors))
-			goto end_io;
+	if (bio_integrity_enabled(bio) && bio_integrity_prep(bio))
+		goto end_io;

-		/*
-		 * Filter flush bio's early so that make_request based
-		 * drivers without flush support don't have to worry
-		 * about them.
-		 */
-		if ((bio->bi_rw & (REQ_FLUSH | REQ_FUA)) && !q->flush_flags) {
-			bio->bi_rw &= ~(REQ_FLUSH | REQ_FUA);
-			if (!nr_sectors) {
-				err = 0;
-				goto end_io;
-			}
-		}
+	if (bio_check_eod(bio, nr_sectors))
+		goto end_io;

-		if ((bio->bi_rw & REQ_DISCARD) &&
-		    (!blk_queue_discard(q) ||
-		     ((bio->bi_rw & REQ_SECURE) &&
-		      !blk_queue_secdiscard(q)))) {
-			err = -EOPNOTSUPP;
+	/*
+	 * Filter flush bio's early so that make_request based
+	 * drivers without flush support don't have to worry
+	 * about them.
+	 */
+	if ((bio->bi_rw & (REQ_FLUSH | REQ_FUA)) && !q->flush_flags) {
+		bio->bi_rw &= ~(REQ_FLUSH | REQ_FUA);
+		if (!nr_sectors) {
+			err = 0;
 			goto end_io;
 		}
+	}

-		if (blk_throtl_bio(q, &bio))
-			goto end_io;
-
-		/*
-		 * If bio = NULL, bio has been throttled and will be submitted
-		 * later.
-		 */
-		if (!bio)
-			break;
+	if ((bio->bi_rw & REQ_DISCARD) &&
+	    (!blk_queue_discard(q) ||
+	     ((bio->bi_rw & REQ_SECURE) &&
+	      !blk_queue_secdiscard(q)))) {
+		err = -EOPNOTSUPP;
+		goto end_io;
+	}

-		trace_block_bio_queue(q, bio);
+	if (blk_throtl_bio(q, &bio))
+		goto end_io;

-		ret = q->make_request_fn(q, bio);
-	} while (ret);
+	/* if bio = NULL, bio has been throttled and will be submitted later.
+	 */
+	if (!bio)
+		return false;

-	return;
+	trace_block_bio_queue(q, bio);
+	return true;

end_io:
 	bio_endio(bio, err);
+	return false;
 }

-/*
- * We only want one ->make_request_fn to be active at a time,
- * else stack usage with stacked devices could be a problem.
- * So use current->bio_list to keep a list of requests
- * submited by a make_request_fn function.
- * current->bio_list is also used as a flag to say if
- * generic_make_request is currently active in this task or not.
- * If it is NULL, then no make_request is active. If it is non-NULL,
- * then a make_request is active, and new requests should be added
- * at the tail
+/**
+ * generic_make_request - hand a buffer to its device driver for I/O
+ * @bio: The bio describing the location in memory and on the device.
+ *
+ * generic_make_request() is used to make I/O requests of block
+ * devices. It is passed a &struct bio, which describes the I/O that needs
+ * to be done.
+ *
+ * generic_make_request() does not return any status. The
+ * success/failure status of the request, along with notification of
+ * completion, is delivered asynchronously through the bio->bi_end_io
+ * function described (one day) elsewhere.
+ *
+ * The caller of generic_make_request must make sure that bi_io_vec
+ * are set to describe the memory buffer, and that bi_dev and bi_sector are
+ * set to describe the device address, and the
+ * bi_end_io and optionally bi_private are set to describe how
+ * completion notification should be signaled.
+ *
+ * generic_make_request and the drivers it calls may use bi_next if this
+ * bio happens to be merged with someone else, and may resubmit the bio to
+ * a lower device by calling into generic_make_request recursively, which
+ * means the bio should NOT be touched after the call to ->make_request_fn.
  */
 void generic_make_request(struct bio *bio)
 {
 	struct bio_list bio_list_on_stack;

+	if (!generic_make_request_checks(bio))
+		return;
+
+	/*
+	 * We only want one ->make_request_fn to be active at a time, else
+	 * stack usage with stacked devices could be a problem. So use
+	 * current->bio_list to keep a list of requests submitted by a
+	 * make_request_fn function. current->bio_list is also used as a
+	 * flag to say if generic_make_request is currently active in this
+	 * task or not. If it is NULL, then no make_request is active. If
+	 * it is non-NULL, then a make_request is active, and new requests
+	 * should be added at the tail
+	 */
 	if (current->bio_list) {
-		/* make_request is active */
 		bio_list_add(current->bio_list, bio);
 		return;
 	}
+
 	/* following loop may be a bit non-obvious, and so deserves some
 	 * explanation.
 	 * Before entering the loop, bio->bi_next is NULL (as all callers
@@@ -1555,21 -1583,22 +1557,21 @@@
 	 * We pretend that we have just taken it off a longer list, so
 	 * we assign bio_list to a pointer to the bio_list_on_stack,
 	 * thus initialising the bio_list of new bios to be
-	 * added. __generic_make_request may indeed add some more bios
+	 * added. ->make_request() may indeed add some more bios
 	 * through a recursive call to generic_make_request. If it
 	 * did, we find a non-NULL value in bio_list and re-enter the loop
 	 * from the top. In this case we really did just take the bio
 	 * of the top of the list (no pretending) and so remove it from
-	 * bio_list, and call into __generic_make_request again.
-	 *
-	 * The loop was structured like this to make only one call to
-	 * __generic_make_request (which is important as it is large and
-	 * inlined) and to keep the structure simple.
+	 * bio_list, and call into ->make_request() again.
 	 */
 	BUG_ON(bio->bi_next);
 	bio_list_init(&bio_list_on_stack);
 	current->bio_list = &bio_list_on_stack;
 	do {
-		__generic_make_request(bio);
+		struct request_queue *q = bdev_get_queue(bio->bi_bdev);
+
+		q->make_request_fn(q, bio);
+
 		bio = bio_list_pop(current->bio_list);
 	} while (bio);
 	current->bio_list = NULL; /* deactivate */
@@@ -1675,6 -1704,7 +1677,7 @@@ EXPORT_SYMBOL_GPL(blk_rq_check_limits)
 int blk_insert_cloned_request(struct request_queue *q, struct request *rq)
 {
 	unsigned long flags;
+	int where = ELEVATOR_INSERT_BACK;

 	if (blk_rq_check_limits(q, rq))
 		return -EIO;
@@@ -1691,7 -1721,10 +1694,10 @@@
 	 */
 	BUG_ON(blk_queued_rq(rq));

-	add_acct_request(q, rq, ELEVATOR_INSERT_BACK);
+	if (rq->cmd_flags & (REQ_FLUSH|REQ_FUA))
+		where = ELEVATOR_INSERT_FLUSH;
+
+	add_acct_request(q, rq, where);
 	spin_unlock_irqrestore(q->queue_lock, flags);

 	return 0;
@@@ -2248,7 -2281,7 +2254,7 @@@ static bool blk_end_bidi_request(struc
  * %false - we are done with this request
  * %true - still buffers pending for this request
  **/
- static bool __blk_end_bidi_request(struct request *rq, int error,
+ bool __blk_end_bidi_request(struct request *rq, int error,
 				   unsigned int nr_bytes, unsigned int bidi_bytes)
 {
 	if (blk_update_bidi_request(rq, error, nr_bytes, bidi_bytes))
@@@ -2595,20 -2628,6 +2601,20 @@@ EXPORT_SYMBOL(kblockd_schedule_delayed_

 #define PLUG_MAGIC	0x91827364

+/**
+ * blk_start_plug - initialize blk_plug and track it inside the task_struct
+ * @plug:	The &struct blk_plug that needs to be initialized
+ *
+ * Description:
+ *   Tracking blk_plug inside the task_struct will help with auto-flushing the
+ *   pending I/O should the task end up blocking between blk_start_plug() and
+ *   blk_finish_plug(). This is important from a performance perspective, but
+ *   also ensures that we don't deadlock. For instance, if the task is blocking
+ *   for a memory allocation, memory reclaim could end up wanting to free a
+ *   page belonging to that request that is currently residing in our private
+ *   plug. By flushing the pending I/O when the process goes to sleep, we avoid
+ *   this kind of deadlock.
+ */
 void blk_start_plug(struct blk_plug *plug)
 {
 	struct task_struct *tsk = current;
@@@ -2617,7 -2636,6 +2623,6 @@@
 	INIT_LIST_HEAD(&plug->list);
 	INIT_LIST_HEAD(&plug->cb_list);
 	plug->should_sort = 0;
-	plug->count = 0;

 	/*
 	 * If this is a nested plug, don't actually assign it. It will be
@@@ -2701,7 -2719,6 +2706,6 @@@ void blk_flush_plug_list(struct blk_plu
 		return;

 	list_splice_init(&plug->list, &list);
-	plug->count = 0;

 	if (plug->should_sort) {
 		list_sort(NULL, &list, plug_rq_cmp);
diff --combined block/blk-sysfs.c
index adc923e9d1f8,60fda88c57f0..a8eff5f8b9c5
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@@ -258,11 -258,13 +258,13 @@@ queue_rq_affinity_store(struct request_
 	ret = queue_var_store(&val, page, count);
 	spin_lock_irq(q->queue_lock);
-	if (val) {
+	if (val == 2) {
 		queue_flag_set(QUEUE_FLAG_SAME_COMP, q);
-		if (val == 2)
-			queue_flag_set(QUEUE_FLAG_SAME_FORCE, q);
-	} else {
+		queue_flag_set(QUEUE_FLAG_SAME_FORCE, q);
+	} else if (val == 1) {
+		queue_flag_set(QUEUE_FLAG_SAME_COMP, q);
+		queue_flag_clear(QUEUE_FLAG_SAME_FORCE, q);
+	} else if (val == 0) {
 		queue_flag_clear(QUEUE_FLAG_SAME_COMP, q);
 		queue_flag_clear(QUEUE_FLAG_SAME_FORCE, q);
 	}
@@@ -455,11 -457,11 +457,11 @@@ queue_attr_store(struct kobject *kobj
 }

 /**
- * blk_cleanup_queue: - release a &struct request_queue when it is no longer needed
- * @kobj: the kobj belonging of the request queue to be released
+ * blk_release_queue: - release a &struct request_queue when it is no longer needed
+ * @kobj: the kobj belonging to the request queue to be released
  *
  * Description:
- *     blk_cleanup_queue is the pair to blk_init_queue() or
+ *     blk_release_queue is the pair to blk_init_queue() or
  *     blk_queue_make_request(). It should be called when a request queue is
  *     being released; typically when a block device is being de-registered.
  *     Currently, its primary task is to free all the &struct request
@@@ -477,6 -479,11 +479,11 @@@ static void blk_release_queue(struct ko

 	blk_sync_queue(q);

+	if (q->elevator)
+		elevator_exit(q->elevator);
+
+	blk_throtl_exit(q);
+
 	if (rl->rq_pool)
 		mempool_destroy(rl->rq_pool);
diff --combined drivers/block/loop.c
index 8360239d553c,4720c7ade0ae..157ddcb9d0a5
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@@ -75,11 -75,11 +75,11 @@@
 #include
 #include
 #include
-
+ #include
 #include

- static LIST_HEAD(loop_devices);
- static DEFINE_MUTEX(loop_devices_mutex);
+ static DEFINE_IDR(loop_index_idr);
+ static DEFINE_MUTEX(loop_index_mutex);

 static int max_part;
 static int part_shift;
@@@ -514,7 -514,7 +514,7 @@@ static struct bio *loop_get_bio(struct
 	return bio_list_pop(&lo->lo_bio_list);
 }

-static int loop_make_request(struct request_queue *q, struct bio *old_bio)
+static void loop_make_request(struct request_queue *q, struct bio *old_bio)
 {
 	struct loop_device *lo = q->queuedata;
 	int rw = bio_rw(old_bio);
@@@ -532,11 -532,12 +532,11 @@@
 	loop_add_bio(lo, old_bio);
 	wake_up(&lo->lo_event);
 	spin_unlock_irq(&lo->lo_lock);
-	return 0;
+	return;

out:
 	spin_unlock_irq(&lo->lo_lock);
 	bio_io_error(old_bio);
-	return 0;
 }

 struct switch_request {
@@@ -721,17 -722,10 +721,10 @@@ static inline int is_loop_device(struc
 static ssize_t loop_attr_show(struct device *dev, char *page,
 			      ssize_t (*callback)(struct loop_device *, char *))
 {
-	struct loop_device *l, *lo = NULL;
-
-	mutex_lock(&loop_devices_mutex);
-	list_for_each_entry(l, &loop_devices, lo_list)
-		if (disk_to_dev(l->lo_disk) == dev) {
-			lo = l;
-			break;
-		}
-	mutex_unlock(&loop_devices_mutex);
+	struct gendisk *disk = dev_to_disk(dev);
+	struct loop_device *lo = disk->private_data;

-	return lo ? callback(lo, page) : -EIO;
+	return callback(lo, page);
 }

 #define LOOP_ATTR_RO(_name)						\
@@@ -749,10 -743,10 +742,10 @@@ static ssize_t loop_attr_backing_file_s
 	ssize_t ret;
 	char *p = NULL;

-	mutex_lock(&lo->lo_ctl_mutex);
+	spin_lock_irq(&lo->lo_lock);
 	if (lo->lo_backing_file)
 		p = d_path(&lo->lo_backing_file->f_path, buf, PAGE_SIZE - 1);
-	mutex_unlock(&lo->lo_ctl_mutex);
+	spin_unlock_irq(&lo->lo_lock);

 	if (IS_ERR_OR_NULL(p))
 		ret = PTR_ERR(p);
@@@ -1006,7 -1000,9 +999,9 @@@ static int loop_clr_fd(struct loop_devi

 	kthread_stop(lo->lo_thread);

+	spin_lock_irq(&lo->lo_lock);
 	lo->lo_backing_file = NULL;
+	spin_unlock_irq(&lo->lo_lock);

 	loop_release_xfer(lo);
 	lo->transfer = NULL;
@@@ -1484,13 -1480,22 +1479,22 @@@ static int lo_compat_ioctl(struct block
 static int lo_open(struct block_device *bdev, fmode_t mode)
 {
-	struct loop_device *lo = bdev->bd_disk->private_data;
+	struct loop_device *lo;
+	int err = 0;
+
+	mutex_lock(&loop_index_mutex);
+	lo = bdev->bd_disk->private_data;
+	if (!lo) {
+		err = -ENXIO;
+		goto out;
+	}

 	mutex_lock(&lo->lo_ctl_mutex);
 	lo->lo_refcnt++;
 	mutex_unlock(&lo->lo_ctl_mutex);
-
-	return 0;
+ out:
+	mutex_unlock(&loop_index_mutex);
+	return err;
 }

 static int lo_release(struct gendisk *disk, fmode_t mode)
@@@ -1556,40 -1561,71 +1560,71 @@@ int loop_register_transfer(struct loop_
 	return 0;
 }

+ static int unregister_transfer_cb(int id, void *ptr, void *data)
+ {
+	struct loop_device *lo = ptr;
+	struct loop_func_table *xfer = data;
+
+	mutex_lock(&lo->lo_ctl_mutex);
+	if (lo->lo_encryption == xfer)
+		loop_release_xfer(lo);
+	mutex_unlock(&lo->lo_ctl_mutex);
+	return 0;
+ }
+
 int loop_unregister_transfer(int number)
 {
 	unsigned int n = number;
-	struct loop_device *lo;
 	struct loop_func_table *xfer;

 	if (n == 0 || n >= MAX_LO_CRYPT || (xfer = xfer_funcs[n]) == NULL)
 		return -EINVAL;

 	xfer_funcs[n] = NULL;
-
-	list_for_each_entry(lo, &loop_devices, lo_list) {
-		mutex_lock(&lo->lo_ctl_mutex);
-
-		if (lo->lo_encryption == xfer)
-			loop_release_xfer(lo);
-
-		mutex_unlock(&lo->lo_ctl_mutex);
-	}
-
+	idr_for_each(&loop_index_idr, &unregister_transfer_cb, xfer);
 	return 0;
 }

 EXPORT_SYMBOL(loop_register_transfer);
 EXPORT_SYMBOL(loop_unregister_transfer);

- static struct loop_device *loop_alloc(int i)
+ static int loop_add(struct loop_device **l, int i)
 {
 	struct loop_device *lo;
 	struct gendisk *disk;
+	int err;

 	lo = kzalloc(sizeof(*lo), GFP_KERNEL);
-	if (!lo)
+	if (!lo) {
+		err = -ENOMEM;
 		goto out;
+	}
+
+	err = idr_pre_get(&loop_index_idr, GFP_KERNEL);
+	if (err < 0)
+		goto out_free_dev;
+
+	if (i >= 0) {
+		int m;
+
+		/* create specific i in the index */
+		err = idr_get_new_above(&loop_index_idr, lo, i, &m);
+		if (err >= 0 && i != m) {
+			idr_remove(&loop_index_idr, m);
+			err = -EEXIST;
+		}
+	} else if (i == -1) {
+		int m;
+
+		/* get next free nr */
+		err = idr_get_new(&loop_index_idr, lo, &m);
+		if (err >= 0)
+			i = m;
+	} else {
+		err = -EINVAL;
+	}
+	if (err < 0)
+		goto out_free_dev;

 	lo->lo_queue = blk_alloc_queue(GFP_KERNEL);
 	if (!lo->lo_queue)
@@@ -1610,81 -1646,158 +1645,158 @@@
 	disk->private_data	= lo;
 	disk->queue		= lo->lo_queue;
 	sprintf(disk->disk_name, "loop%d", i);
-	return lo;
+	add_disk(disk);
+	*l = lo;
+	return lo->lo_number;

out_free_queue:
 	blk_cleanup_queue(lo->lo_queue);
out_free_dev:
 	kfree(lo);
out:
-	return NULL;
+	return err;
 }

- static void loop_free(struct loop_device *lo)
+ static void loop_remove(struct loop_device *lo)
 {
+	del_gendisk(lo->lo_disk);
 	blk_cleanup_queue(lo->lo_queue);
 	put_disk(lo->lo_disk);
-	list_del(&lo->lo_list);
 	kfree(lo);
 }

- static struct loop_device *loop_init_one(int i)
+ static int find_free_cb(int id, void *ptr, void *data)
+ {
+	struct loop_device *lo = ptr;
+	struct loop_device **l = data;
+
+	if (lo->lo_state == Lo_unbound) {
+		*l = lo;
+		return 1;
+	}
+	return 0;
+ }
+
+ static int loop_lookup(struct loop_device **l, int i)
 {
 	struct loop_device *lo;
+	int ret = -ENODEV;

-	list_for_each_entry(lo, &loop_devices, lo_list) {
-		if (lo->lo_number == i)
-			return lo;
+	if (i < 0) {
+		int err;
+
+		err = idr_for_each(&loop_index_idr, &find_free_cb, &lo);
+		if (err == 1) {
+			*l = lo;
+			ret = lo->lo_number;
+		}
+		goto out;
 	}

-	lo = loop_alloc(i);
+	/* lookup and return a specific i */
+	lo = idr_find(&loop_index_idr, i);
 	if (lo) {
-		add_disk(lo->lo_disk);
-		list_add_tail(&lo->lo_list, &loop_devices);
+		*l = lo;
+		ret = lo->lo_number;
 	}
-	return lo;
- }
-
- static void loop_del_one(struct loop_device *lo)
- {
-	del_gendisk(lo->lo_disk);
-	loop_free(lo);
+ out:
+	return ret;
 }

 static struct kobject *loop_probe(dev_t dev, int *part, void *data)
 {
 	struct loop_device *lo;
 	struct kobject *kobj;
+	int err;

-	mutex_lock(&loop_devices_mutex);
-	lo = loop_init_one(MINOR(dev) >> part_shift);
-	kobj = lo ? get_disk(lo->lo_disk) : ERR_PTR(-ENOMEM);
-	mutex_unlock(&loop_devices_mutex);
+	mutex_lock(&loop_index_mutex);
+	err = loop_lookup(&lo, MINOR(dev) >> part_shift);
+	if (err < 0)
+		err = loop_add(&lo, MINOR(dev) >> part_shift);
+	if (err < 0)
+		kobj = ERR_PTR(err);
+	else
+		kobj = get_disk(lo->lo_disk);
+	mutex_unlock(&loop_index_mutex);

 	*part = 0;
 	return kobj;
 }

+ static long loop_control_ioctl(struct file *file, unsigned int cmd,
+			       unsigned long parm)
+ {
+	struct loop_device *lo;
+	int ret = -ENOSYS;
+
+	mutex_lock(&loop_index_mutex);
+	switch (cmd) {
+	case LOOP_CTL_ADD:
+		ret = loop_lookup(&lo, parm);
+		if (ret >= 0) {
+			ret = -EEXIST;
+			break;
+		}
+		ret = loop_add(&lo, parm);
+		break;
+	case LOOP_CTL_REMOVE:
+		ret = loop_lookup(&lo, parm);
+		if (ret < 0)
+			break;
+		mutex_lock(&lo->lo_ctl_mutex);
+		if (lo->lo_state != Lo_unbound) {
+			ret = -EBUSY;
+			mutex_unlock(&lo->lo_ctl_mutex);
+			break;
+		}
+		if (lo->lo_refcnt > 0) {
+			ret = -EBUSY;
+			mutex_unlock(&lo->lo_ctl_mutex);
+			break;
+		}
+		lo->lo_disk->private_data = NULL;
+		mutex_unlock(&lo->lo_ctl_mutex);
+		idr_remove(&loop_index_idr, lo->lo_number);
+		loop_remove(lo);
+		break;
+	case LOOP_CTL_GET_FREE:
+		ret = loop_lookup(&lo, -1);
+		if (ret >= 0)
+			break;
+		ret = loop_add(&lo, -1);
+	}
+	mutex_unlock(&loop_index_mutex);
+
+	return ret;
+ }
+
+ static const struct file_operations loop_ctl_fops = {
+	.open		= nonseekable_open,
+	.unlocked_ioctl	= loop_control_ioctl,
+	.compat_ioctl	= loop_control_ioctl,
+	.owner		= THIS_MODULE,
+	.llseek		= noop_llseek,
+ };
+
+ static struct miscdevice loop_misc = {
+	.minor		= LOOP_CTRL_MINOR,
+	.name		= "loop-control",
+	.fops		= &loop_ctl_fops,
+ };
+
+ MODULE_ALIAS_MISCDEV(LOOP_CTRL_MINOR);
+ MODULE_ALIAS("devname:loop-control");
+
 static int __init loop_init(void)
 {
 	int i, nr;
 	unsigned long range;
-	struct loop_device *lo, *next;
+	struct loop_device *lo;
+	int err;

-	/*
-	 * loop module now has a feature to instantiate underlying device
-	 * structure on-demand, provided that there is an access dev node.
-	 * However, this will not work well with user space tool that doesn't
-	 * know about such "feature". In order to not break any existing
-	 * tool, we do the following:
-	 *
-	 * (1) if max_loop is specified, create that many upfront, and this
-	 *     also becomes a hard limit.
-	 * (2) if max_loop is not specified, create 8 loop device on module
-	 *     load, user can further extend loop device by create dev node
-	 *     themselves and have kernel automatically instantiate actual
-	 *     device on-demand.
-	 */
+	err = misc_register(&loop_misc);
+	if (err < 0)
+		return err;

 	part_shift = 0;
 	if (max_part > 0) {
@@@ -1707,57 -1820,60 +1819,60 @@@
 	if (max_loop > 1UL << (MINORBITS - part_shift))
 		return -EINVAL;

+	/*
+	 * If max_loop is specified, create that many devices upfront.
+	 * This also becomes a hard limit. If max_loop is not specified,
+	 * create CONFIG_BLK_DEV_LOOP_MIN_COUNT loop devices at module
+	 * init time. Loop devices can be requested on-demand with the
+	 * /dev/loop-control interface, or be instantiated by accessing
+	 * a 'dead' device node.
+	 */
 	if (max_loop) {
 		nr = max_loop;
 		range = max_loop << part_shift;
 	} else {
-		nr = 8;
+		nr = CONFIG_BLK_DEV_LOOP_MIN_COUNT;
 		range = 1UL << MINORBITS;
 	}

 	if (register_blkdev(LOOP_MAJOR, "loop"))
 		return -EIO;

-	for (i = 0; i < nr; i++) {
-		lo = loop_alloc(i);
-		if (!lo)
-			goto Enomem;
-		list_add_tail(&lo->lo_list, &loop_devices);
-	}
-
-	/* point of no return */
-
-	list_for_each_entry(lo, &loop_devices, lo_list)
-		add_disk(lo->lo_disk);
-
 	blk_register_region(MKDEV(LOOP_MAJOR, 0), range,
 				  THIS_MODULE, loop_probe, NULL, NULL);

+	/* pre-create number of devices given by config or max_loop */
+	mutex_lock(&loop_index_mutex);
+	for (i = 0; i < nr; i++)
+		loop_add(&lo, i);
+	mutex_unlock(&loop_index_mutex);
+
 	printk(KERN_INFO "loop: module loaded\n");
 	return 0;
+ }

- Enomem:
-	printk(KERN_INFO "loop: out of memory\n");
-
-	list_for_each_entry_safe(lo, next, &loop_devices, lo_list)
-		loop_free(lo);
+ static int loop_exit_cb(int id, void *ptr, void *data)
+ {
+	struct loop_device *lo = ptr;

-	unregister_blkdev(LOOP_MAJOR, "loop");
-	return -ENOMEM;
+	loop_remove(lo);
+	return 0;
 }

 static void __exit loop_exit(void)
 {
 	unsigned long range;
-	struct loop_device *lo, *next;

 	range = max_loop ? max_loop << part_shift : 1UL << MINORBITS;

-	list_for_each_entry_safe(lo, next, &loop_devices, lo_list)
-		loop_del_one(lo);
+	idr_for_each(&loop_index_idr, &loop_exit_cb, NULL);
+	idr_remove_all(&loop_index_idr);
+	idr_destroy(&loop_index_idr);

 	blk_unregister_region(MKDEV(LOOP_MAJOR, 0), range);
 	unregister_blkdev(LOOP_MAJOR, "loop");
+
+	misc_deregister(&loop_misc);
 }

 module_init(loop_init);
diff --combined drivers/md/md.c
index 5c2178562c96,5c95ccb59500..8f52d4eb78a0
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@@ -61,6 -61,11 +61,11 @@@
 static void autostart_arrays(int part);
 #endif

+ /* pers_list is a list of registered personalities protected
+  * by pers_lock.
+  * pers_lock does extra service to protect accesses to
+  * mddev->thread when the mutex cannot be held.
+  */
 static LIST_HEAD(pers_list);
 static DEFINE_SPINLOCK(pers_lock);

@@@ -330,17 -335,18 +335,17 @@@ static DEFINE_SPINLOCK(all_mddevs_lock
  * call has finished, the bio has been linked into some internal structure
  * and so is visible to ->quiesce(), so we don't need the refcount any more.
  */
-static int md_make_request(struct request_queue *q, struct bio *bio)
+static void md_make_request(struct request_queue *q, struct bio *bio)
 {
 	const int rw = bio_data_dir(bio);
 	mddev_t *mddev = q->queuedata;
-	int rv;
 	int cpu;
 	unsigned int sectors;

 	if (mddev == NULL || mddev->pers == NULL
 	    || !mddev->ready) {
 		bio_io_error(bio);
-		return 0;
+		return;
 	}
 	smp_rmb(); /* Ensure implications of 'active' are visible */
 	rcu_read_lock();
@@@ -365,7 -371,7 +370,7 @@@
 	 * go away inside make_request
 	 */
 	sectors = bio_sectors(bio);
-	rv = mddev->pers->make_request(mddev, bio);
+	mddev->pers->make_request(mddev, bio);

 	cpu = part_stat_lock();
 	part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]);
@@@ -374,6 -380,8 +379,6 @@@
 	if (atomic_dec_and_test(&mddev->active_io) && mddev->suspended)
 		wake_up(&mddev->sb_wait);
-
-	return rv;
 }

 /* mddev_suspend makes sure no new requests are submitted
@@@ -472,7 -480,8 +477,7 @@@ static void md_submit_flush_data(struc
 		bio_endio(bio, 0);
 	else {
 		bio->bi_rw &= ~REQ_FLUSH;
-		if (mddev->pers->make_request(mddev, bio))
-			generic_make_request(bio);
+		mddev->pers->make_request(mddev, bio);
 	}

 	mddev->flush_bio = NULL;
@@@ -735,7 -744,12 +740,12 @@@ static void mddev_unlock(mddev_t * mdde
 	} else
 		mutex_unlock(&mddev->reconfig_mutex);

+	/* As we've dropped the mutex we need a spinlock to
+	 * make sure the thread doesn't disappear
+	 */
+	spin_lock(&pers_lock);
 	md_wakeup_thread(mddev->thread);
+	spin_unlock(&pers_lock);
 }

 static mdk_rdev_t * find_rdev_nr(mddev_t *mddev, int nr)
@@@ -844,7 -858,7 +854,7 @@@ void md_super_write(mddev_t *mddev, mdk
 	bio->bi_end_io = super_written;

 	atomic_inc(&mddev->pending_writes);
-	submit_bio(REQ_WRITE | REQ_SYNC | REQ_FLUSH | REQ_FUA, bio);
+	submit_bio(WRITE_FLUSH_FUA, bio);
 }

 void md_super_wait(mddev_t *mddev)
@@@ -1134,8 -1148,11 +1144,11 @@@ static int super_90_load(mdk_rdev_t *rd
 		ret = 0;
 	}
 	rdev->sectors = rdev->sb_start;
+	/* Limit to 4TB as metadata cannot record more than that */
+	if (rdev->sectors >= (2ULL << 32))
+		rdev->sectors = (2ULL << 32) - 2;

-	if (rdev->sectors < sb->size * 2 && sb->level > 1)
+	if (rdev->sectors < ((sector_t)sb->size) * 2 && sb->level >= 1)
 		/* "this cannot possibly happen" ... */
 		ret = -EINVAL;

@@@ -1169,7 -1186,7 +1182,7 @@@ static int super_90_validate(mddev_t *m
 		mddev->clevel[0] = 0;
 		mddev->layout = sb->layout;
 		mddev->raid_disks = sb->raid_disks;
-		mddev->dev_sectors = sb->size * 2;
+		mddev->dev_sectors = ((sector_t)sb->size) * 2;
 		mddev->events = ev1;
 		mddev->bitmap_info.offset = 0;
 		mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9;
@@@ -1411,6 -1428,11 +1424,11 @@@ super_90_rdev_size_change(mdk_rdev_t *r
 	rdev->sb_start = calc_dev_sboffset(rdev);
 	if (!num_sectors || num_sectors > rdev->sb_start)
 		num_sectors = rdev->sb_start;
+	/* Limit to 4TB as metadata cannot record more than that.
+	 * 4TB == 2^32 KB, or 2*2^32 sectors.
+	 */
+	if (num_sectors >= (2ULL << 32))
+		num_sectors = (2ULL << 32) - 2;
 	md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
 		       rdev->sb_page);
 	md_super_wait(rdev->mddev);
@@@ -1734,6 -1756,11 +1752,11 @@@ static void super_1_sync(mddev_t *mddev
 	sb->level = cpu_to_le32(mddev->level);
 	sb->layout = cpu_to_le32(mddev->layout);

+	if (test_bit(WriteMostly, &rdev->flags))
+		sb->devflags |= WriteMostly1;
+	else
+		sb->devflags &= ~WriteMostly1;
+
 	if (mddev->bitmap && mddev->bitmap_info.file == NULL) {
 		sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_info.offset);
 		sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET);
@@@ -2557,7 -2584,10 +2580,10 @@@ state_store(mdk_rdev_t *rdev, const cha
 	int err = -EINVAL;
 	if (cmd_match(buf, "faulty") && rdev->mddev->pers) {
 		md_error(rdev->mddev, rdev);
-		err = 0;
+		if (test_bit(Faulty, &rdev->flags))
+			err = 0;
+		else
+			err = -EBUSY;
 	} else if (cmd_match(buf, "remove")) {
 		if (rdev->raid_disk >= 0)
 			err = -EBUSY;
@@@ -2580,7 -2610,7 +2606,7 @@@
 		err = 0;
 	} else if (cmd_match(buf, "-blocked")) {
 		if (!test_bit(Faulty, &rdev->flags) &&
-		    test_bit(BlockedBadBlocks, &rdev->flags)) {
+		    rdev->badblocks.unacked_exist) {
 			/* metadata handler doesn't understand badblocks,
 			 * so we need to fail the device
 			 */
@@@ -5979,6 -6009,8 +6005,8 @@@ static int set_disk_faulty(mddev_t *mdd
 		return -ENODEV;

 	md_error(mddev, rdev);
+	if (!test_bit(Faulty, &rdev->flags))
+		return -EBUSY;
 	return 0;
 }

@@@ -6407,11 -6439,18 +6435,18 @@@ mdk_thread_t *md_register_thread(void (
 	return thread;
 }

- void md_unregister_thread(mdk_thread_t *thread)
+ void md_unregister_thread(mdk_thread_t **threadp)
 {
+	mdk_thread_t *thread = *threadp;
 	if (!thread)
 		return;
 	dprintk("interrupting MD-thread pid %d\n", task_pid_nr(thread->tsk));
+	/* Locking ensures that mddev_unlock does not wake_up a
+	 * non-existent thread
+	 */
+	spin_lock(&pers_lock);
+	*threadp = NULL;
+	spin_unlock(&pers_lock);

 	kthread_stop(thread->tsk);
 	kfree(thread);
@@@ -7318,8 -7357,7 +7353,7 @@@ static void reap_sync_thread(mddev_t *m
 	mdk_rdev_t *rdev;

 	/* resync has finished, collect result */
-	md_unregister_thread(mddev->sync_thread);
-	mddev->sync_thread = NULL;
+	md_unregister_thread(&mddev->sync_thread);
 	if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
 	    !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
 		/* success...*/
diff --combined drivers/md/md.h
index bd47847cf7ca,0a309dc29b45..1509a3eb9ae1
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@@ -424,7 -424,7 +424,7 @@@ struct mdk_personalit
 	int level;
 	struct list_head list;
 	struct module *owner;
-	int (*make_request)(mddev_t *mddev, struct bio *bio);
+	void (*make_request)(mddev_t *mddev, struct bio *bio);
 	int (*run)(mddev_t *mddev);
 	int (*stop)(mddev_t *mddev);
 	void (*status)(struct seq_file *seq, mddev_t *mddev);
@@@ -560,7 -560,7 +560,7 @@@ extern int register_md_personality(stru
 extern int unregister_md_personality(struct mdk_personality *p);
 extern mdk_thread_t * md_register_thread(void (*run) (mddev_t *mddev),
 				mddev_t *mddev, const char *name);
- extern void md_unregister_thread(mdk_thread_t *thread);
+ extern void md_unregister_thread(mdk_thread_t **threadp);
 extern void md_wakeup_thread(mdk_thread_t *thread);
 extern void md_check_recovery(mddev_t *mddev);
 extern void md_write_start(mddev_t *mddev, struct bio *bi);
diff --combined drivers/md/multipath.c
index 407cb5691425,d5b5fb300171..618dd9e22513
--- a/drivers/md/multipath.c
+++ b/drivers/md/multipath.c
@@@ -106,7 -106,7 +106,7 @@@ static void multipath_end_request(struc
 	rdev_dec_pending(rdev, conf->mddev);
 }

-static int multipath_make_request(mddev_t *mddev, struct bio * bio)
+static void multipath_make_request(mddev_t *mddev, struct bio * bio)
 {
 	multipath_conf_t *conf = mddev->private;
 	struct multipath_bh * mp_bh;
@@@ -114,7 -114,7 +114,7 @@@
 	if (unlikely(bio->bi_rw & REQ_FLUSH)) {
 		md_flush_request(mddev, bio);
-		return 0;
+		return;
 	}

 	mp_bh = mempool_alloc(conf->pool, GFP_NOIO);
@@@ -126,7 -126,7 +126,7 @@@
 	if (mp_bh->path < 0) {
 		bio_endio(bio, -EIO);
 		mempool_free(mp_bh, conf->pool);
-		return 0;
+		return;
 	}
 	multipath = conf->multipaths + mp_bh->path;
@@@ -137,7 -137,7 +137,7 @@@
 	mp_bh->bio.bi_end_io = multipath_end_request;
 	mp_bh->bio.bi_private = mp_bh;
 	generic_make_request(&mp_bh->bio);
-	return 0;
+	return;
 }

 static void multipath_status (struct seq_file *seq, mddev_t *mddev)
@@@ -514,8 -514,7 +514,7 @@@ static int multipath_stop (mddev_t *mdd
 {
 	multipath_conf_t *conf = mddev->private;

-	md_unregister_thread(mddev->thread);
-	mddev->thread = NULL;
+	md_unregister_thread(&mddev->thread);
 	blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
 	mempool_destroy(conf->pool);
 	kfree(conf->multipaths);
diff --combined drivers/md/raid1.c
index 97f2a5f977b1,d9587dffe533..d4ddfa627301
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@@ -785,7 -785,7 +785,7 @@@ do_sync_io
 	PRINTK("%dB behind alloc failed, doing sync I/O\n", bio->bi_size);
 }

-static int make_request(mddev_t *mddev, struct bio * bio)
+static void make_request(mddev_t *mddev, struct bio * bio)
 {
 	conf_t *conf = mddev->private;
 	mirror_info_t *mirror;
@@@ -870,7 -870,7 +870,7 @@@ read_again
 		if (rdisk < 0) {
 			/* couldn't find anywhere to read from */
 			raid_end_bio_io(r1_bio);
-			return 0;
+			return;
 		}
 		mirror = conf->mirrors + rdisk;
@@@ -928,7 -928,7 +928,7 @@@
 			goto read_again;
 		} else
 			generic_make_request(read_bio);
-		return 0;
+		return;
 	}

 	/*
@@@ -1099,12 -1099,11 +1099,11 @@@
 			bio_list_add(&conf->pending_bio_list, mbio);
 			spin_unlock_irqrestore(&conf->device_lock, flags);
 		}
-		r1_bio_write_done(r1_bio);
-
-		/* In case raid1d snuck in to freeze_array */
-		wake_up(&conf->wait_barrier);
-
+		/* Mustn't call r1_bio_write_done before this next test,
+		 * as it could result in the bio being freed.
+		 */
 		if (sectors_handled < (bio->bi_size >> 9)) {
+			r1_bio_write_done(r1_bio);
 			/* We need another r1_bio. It has already been counted
 			 * in bio->bi_phys_segments
 			 */
@@@ -1117,8 -1116,15 +1116,13 @@@
 		goto retry_write;
 	}

+	r1_bio_write_done(r1_bio);
+
+	/* In case raid1d snuck in to freeze_array */
+	wake_up(&conf->wait_barrier);
+
 	if (do_sync || !bitmap || !plugged)
 		md_wakeup_thread(mddev->thread);
-
-	return 0;
 }

 static void status(struct seq_file *seq, mddev_t *mddev)
@@@ -2556,8 -2562,7 +2560,7 @@@ static int stop(mddev_t *mddev
 	raise_barrier(conf);
 	lower_barrier(conf);

-	md_unregister_thread(mddev->thread);
-	mddev->thread = NULL;
+	md_unregister_thread(&mddev->thread);
 	if (conf->r1bio_pool)
 		mempool_destroy(conf->r1bio_pool);
 	kfree(conf->mirrors);
diff --combined drivers/md/raid10.c
index 04b625e1cb60,0cd9672cf9cb..ea5fc0b6a84c
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@@ -337,6 -337,21 +337,21 @@@ static void close_write(r10bio_t *r10_b
 	md_write_end(r10_bio->mddev);
 }

+ static void one_write_done(r10bio_t *r10_bio)
+ {
+	if (atomic_dec_and_test(&r10_bio->remaining)) {
+		if (test_bit(R10BIO_WriteError, &r10_bio->state))
+			reschedule_retry(r10_bio);
+		else {
+			close_write(r10_bio);
+			if (test_bit(R10BIO_MadeGood, &r10_bio->state))
+				reschedule_retry(r10_bio);
+			else
+				raid_end_bio_io(r10_bio);
+		}
+	}
+ }
+
 static void raid10_end_write_request(struct bio *bio, int error)
 {
 	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
@@@ -387,17 -402,7 +402,7 @@@
 	 * Let's see if all mirrored write operations have finished
 	 * already.
 	 */
-	if (atomic_dec_and_test(&r10_bio->remaining)) {
-		if (test_bit(R10BIO_WriteError, &r10_bio->state))
-			reschedule_retry(r10_bio);
-		else {
-			close_write(r10_bio);
-			if (test_bit(R10BIO_MadeGood, &r10_bio->state))
-				reschedule_retry(r10_bio);
-			else
-				raid_end_bio_io(r10_bio);
-		}
-	}
+	one_write_done(r10_bio);
 	if (dec_rdev)
 		rdev_dec_pending(conf->mirrors[dev].rdev, conf->mddev);
 }
@@@ -825,7 -830,7 +830,7 @@@ static void unfreeze_array(conf_t *conf
 	spin_unlock_irq(&conf->resync_lock);
 }

-static int make_request(mddev_t *mddev, struct bio * bio)
+static void make_request(mddev_t *mddev, struct bio * bio)
 {
 	conf_t *conf = mddev->private;
 	mirror_info_t *mirror;
@@@ -844,7 -849,7 +849,7 @@@
 	if (unlikely(bio->bi_rw & REQ_FLUSH)) {
 		md_flush_request(mddev, bio);
-		return 0;
+		return;
 	}

 	/* If this request crosses a chunk boundary, we need to
@@@ -876,8 -881,10 +881,8 @@@
 		conf->nr_waiting++;
 		spin_unlock_irq(&conf->resync_lock);

-		if (make_request(mddev, &bp->bio1))
-			generic_make_request(&bp->bio1);
-		if (make_request(mddev, &bp->bio2))
-			generic_make_request(&bp->bio2);
+		make_request(mddev, &bp->bio1);
+		make_request(mddev, &bp->bio2);

 		spin_lock_irq(&conf->resync_lock);
 		conf->nr_waiting--;
@@@ -885,14 -892,14 +890,14 @@@
 		spin_unlock_irq(&conf->resync_lock);

 		bio_pair_release(bp);
-		return 0;
+		return;
bad_map:
 		printk("md/raid10:%s: make_request bug: can't convert block across chunks"
 		       " or bigger than %dk %llu %d\n", mdname(mddev), chunk_sects/2,
 		       (unsigned long long)bio->bi_sector, bio->bi_size >> 10);

 		bio_io_error(bio);
-		return 0;
+		return;
 	}

 	md_write_start(mddev, bio);
@@@ -935,7 -942,7 +940,7 @@@ read_again
 		slot = r10_bio->read_slot;
 		if (disk < 0) {
 			raid_end_bio_io(r10_bio);
-			return 0;
+			return;
 		}
 		mirror = conf->mirrors + disk;
@@@ -983,7 -990,7 +988,7 @@@
 			goto read_again;
 		} else
 			generic_make_request(read_bio);
-		return 0;
+		return;
 	}

 	/*
@@@ -1125,20 -1132,12 +1130,12 @@@ retry_write
 		spin_unlock_irqrestore(&conf->device_lock, flags);
 	}

-	if (atomic_dec_and_test(&r10_bio->remaining)) {
-		/* This matches the end of raid10_end_write_request() */
-		bitmap_endwrite(r10_bio->mddev->bitmap, r10_bio->sector,
-				r10_bio->sectors,
-				!test_bit(R10BIO_Degraded, &r10_bio->state),
-				0);
-		md_write_end(mddev);
-		raid_end_bio_io(r10_bio);
-	}
-
-	/* In case raid10d snuck in to freeze_array */
-	wake_up(&conf->wait_barrier);
+	/* Don't remove the bias on 'remaining' (one_write_done) until
+	 * after checking if we need to go around again.
+	 */

 	if (sectors_handled < (bio->bi_size >> 9)) {
+		one_write_done(r10_bio);
 		/* We need another r10_bio. It has already been counted
 		 * in bio->bi_phys_segments.
 		 */
@@@ -1152,9 -1151,14 +1149,13 @@@
 		r10_bio->state = 0;
 		goto retry_write;
 	}
+	one_write_done(r10_bio);
+
+	/* In case raid10d snuck in to freeze_array */
+	wake_up(&conf->wait_barrier);

 	if (do_sync || !mddev->bitmap || !plugged)
 		md_wakeup_thread(mddev->thread);
-	return 0;
 }

 static void status(struct seq_file *seq, mddev_t *mddev)
@@@ -2951,7 -2955,7 +2952,7 @@@ static int run(mddev_t *mddev
 	return 0;

out_free_conf:
-	md_unregister_thread(mddev->thread);
+	md_unregister_thread(&mddev->thread);
 	if (conf->r10bio_pool)
 		mempool_destroy(conf->r10bio_pool);
 	safe_put_page(conf->tmppage);
@@@ -2969,8 -2973,7 +2970,7 @@@ static int stop(mddev_t *mddev
 	raise_barrier(conf, 0);
 	lower_barrier(conf);

-	md_unregister_thread(mddev->thread);
-	mddev->thread = NULL;
+	md_unregister_thread(&mddev->thread);
 	blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
 	if (conf->r10bio_pool)
 		mempool_destroy(conf->r10bio_pool);
diff --combined drivers/md/raid5.c
index 96b7f6a1b6f2,ac5e8b57e50f..83f2c44e170f
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@@ -3336,7 -3336,7 +3336,7 @@@ static void handle_stripe(struct stripe

finish:
 	/* wait for this device to become unblocked */
-	if (unlikely(s.blocked_rdev))
+	if (conf->mddev->external && unlikely(s.blocked_rdev))
 		md_wait_for_blocked_rdev(s.blocked_rdev, conf->mddev);

 	if (s.handle_bad_blocks)
@@@ -3695,7 -3695,7 +3695,7 @@@ static struct stripe_head *__get_priori
 	return sh;
 }

-static int make_request(mddev_t *mddev, struct bio * bi)
+static void make_request(mddev_t *mddev, struct bio * bi)
 {
 	raid5_conf_t *conf = mddev->private;
 	int dd_idx;
@@@ -3708,7 -3708,7 +3708,7 @@@
 	if (unlikely(bi->bi_rw & REQ_FLUSH)) {
 		md_flush_request(mddev, bi);
-		return 0;
+		return;
 	}

 	md_write_start(mddev, bi);
@@@ -3716,7 -3716,7 +3716,7 @@@
 	if (rw == READ &&
 	     mddev->reshape_position == MaxSector &&
 	     chunk_aligned_read(mddev,bi))
-		return 0;
+		return;

 	logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1);
 	last_sector = bi->bi_sector + (bi->bi_size>>9);
@@@ -3851,6 -3851,8 +3851,6 @@@
 		bio_endio(bi, 0);
 	}
-
-	return 0;
 }

 static sector_t raid5_size(mddev_t *mddev, sector_t sectors, int raid_disks);
@@@ -4939,8 -4941,7 +4939,7 @@@ static int run(mddev_t *mddev
 	return 0;
abort:
-	md_unregister_thread(mddev->thread);
-	mddev->thread = NULL;
+	md_unregister_thread(&mddev->thread);
 	if (conf) {
 		print_raid5_conf(conf);
 		free_conf(conf);
@@@ -4954,8 -4955,7 +4953,7 @@@ static int stop(mddev_t *mddev
 {
 	raid5_conf_t *conf = mddev->private;

-	md_unregister_thread(mddev->thread);
-	mddev->thread = NULL;
+	md_unregister_thread(&mddev->thread);
 	if (mddev->queue)
 		mddev->queue->backing_dev_info.congested_fn = NULL;
 	free_conf(conf);
diff --combined include/linux/blkdev.h
index 1978655faa3b,7fbaa9103344..0b68044e7abb
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@@ -30,6 -30,7 +30,7 @@@ struct request_pm_state
 struct blk_trace;
 struct request;
 struct sg_io_hdr;
+ struct bsg_job;

 #define BLKDEV_MIN_RQ	4
 #define BLKDEV_MAX_RQ	128	/* Default maximum */
@@@ -117,6 -118,7 +118,7 @@@ struct reques
 		struct {
 			unsigned int		seq;
 			struct list_head	list;
+			rq_end_io_fn		*saved_end_io;
 		} flush;
 	};
@@@ -193,7 -195,7 +195,7 @@@ struct request_pm_stat
 #include

 typedef void (request_fn_proc) (struct request_queue *q);
-typedef int (make_request_fn) (struct request_queue *q, struct bio *bio);
+typedef void (make_request_fn) (struct request_queue *q, struct bio *bio);
 typedef int (prep_rq_fn) (struct request_queue *, struct request *);
 typedef void (unprep_rq_fn) (struct request_queue *, struct request *);
@@@ -209,6 -211,7 +211,7 @@@ typedef int (merge_bvec_fn) (struct req
 typedef void (softirq_done_fn)(struct request *);
 typedef int (dma_drain_needed_fn)(struct request *);
 typedef int (lld_busy_fn) (struct request_queue *q);
+ typedef int (bsg_job_fn) (struct bsg_job *);

 enum blk_eh_timer_return {
 	BLK_EH_NOT_HANDLED,
@@@ -375,6 -378,8 +378,8 @@@ struct request_queue
 	struct mutex		sysfs_lock;

 #if defined(CONFIG_BLK_DEV_BSG)
+	bsg_job_fn		*bsg_job_fn;
+	int			bsg_job_size;
 	struct bsg_class_device bsg_dev;
 #endif
@@@ -675,8 -680,6 +680,8 @@@ extern int scsi_cmd_ioctl(struct reques
 extern int sg_scsi_ioctl(struct request_queue *, struct gendisk *, fmode_t,
 			 struct scsi_ioctl_command __user *);

+extern void blk_queue_bio(struct request_queue *q, struct bio *bio);
+
 /*
  * A queue has just exited congestion. Note this in the global counter of
  * congested queues, and wake up anyone who was waiting for requests to be
  * put back.
  */
@@@ -860,23 -863,16 +865,22 @@@ struct request_queue *blk_alloc_queue_n
 extern void blk_put_queue(struct request_queue *);

 /*
- * Note: Code in between changing the blk_plug list/cb_list or element of such
- * lists is preemptable, but such code can't do sleep (or be very careful),
- * otherwise data is corrupted. For details, please check schedule() where
- * blk_schedule_flush_plug() is called.
+ * blk_plug permits building a queue of related requests by holding the I/O
+ * fragments for a short period. This allows merging of sequential requests
+ * into a single larger request. As the requests are moved from a per-task
+ * list to the device's request_queue in a batch, this results in improved
+ * scalability as the lock contention for the request_queue lock is reduced.
+ *
+ * It is ok not to disable preemption when adding the request to the plug list
+ * or when attempting a merge, because blk_schedule_flush_plug() will only
+ * flush the plug list when the task sleeps by itself. For details, please see
+ * schedule() where blk_schedule_flush_plug() is called.
  */
 struct blk_plug {
-	unsigned long magic;
-	struct list_head list;
-	struct list_head cb_list;
-	unsigned int should_sort;
+	unsigned long magic;		/* detect uninitialized use-cases */
+	struct list_head list;		/* requests */
+	struct list_head cb_list;	/* md requires an unplug callback */
+	unsigned int should_sort;	/* list to be sorted before flushing? */
-	unsigned int count;		/* number of queued requests */
 };

 #define BLK_MAX_REQUEST_COUNT 16
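
The thread running through every hunk above is the make_request_fn prototype change from int to void: status no longer travels through a return value. A minimal sketch of a driver-side make_request_fn under the new signature (the example_dev type and its fields are hypothetical; only the signature, the bio_io_error() failure path, and resubmission via generic_make_request() follow the patch):

static void example_make_request(struct request_queue *q, struct bio *bio)
{
	struct example_dev *dev = q->queuedata;	/* hypothetical per-device state */

	if (!dev || !dev->ready) {
		/* errors are reported through the bio; there is no return value */
		bio_io_error(bio);
		return;
	}

	/* a stacking driver remaps and resubmits instead of returning non-zero */
	bio->bi_bdev = dev->lower_bdev;
	generic_make_request(bio);
}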
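The plug accounting changes (request_count in attempt_plug_merge(), the BLK_MAX_REQUEST_COUNT flush in blk_queue_bio(), and the removal of plug->count) do not change how submitters plug. A typical caller-side pattern, sketched under the assumption that bios[] and nr are prepared elsewhere:

	struct blk_plug plug;
	int i;

	blk_start_plug(&plug);
	for (i = 0; i < nr; i++)
		submit_bio(READ, bios[i]);	/* held on the per-task plug list */
	blk_finish_plug(&plug);			/* hands the whole batch to the driver */

blk_queue_bio() now flushes automatically once BLK_MAX_REQUEST_COUNT (16) requests accumulate, and schedule() flushes via blk_schedule_flush_plug() if the task blocks, which is the deadlock-avoidance property the blk_start_plug() kernel-doc above describes.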
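The loop_control_ioctl() added above is reachable from user space through the new /dev/loop-control node. A user-space sketch of the LOOP_CTL_GET_FREE flow (error handling trimmed; the LOOP_CTL_* constants come from <linux/loop.h> as introduced by this series):

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/loop.h>

int main(void)
{
	int fd = open("/dev/loop-control", O_RDWR);
	if (fd < 0)
		return 1;

	/* returns the index of the first unbound device, allocating a new
	 * one via loop_add() when every existing device is in use */
	int nr = ioctl(fd, LOOP_CTL_GET_FREE);
	if (nr >= 0)
		printf("free device: /dev/loop%d\n", nr);

	close(fd);
	return nr < 0;
}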
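The (2ULL << 32) - 2 clamp in super_90_load() and super_90_rdev_size_change() follows from the 0.90 superblock storing the device size as a 32-bit count of kilobytes, as the patch comment says: 4TB == 2^32 KB, or 2*2^32 sectors. A standalone check of the arithmetic:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	/* 0.90 metadata: size is a u32 in KB, so at most 2^32 KB = 4TB,
	 * i.e. 2 * 2^32 sectors of 512 bytes */
	uint64_t max_sectors = 2ULL << 32;
	uint64_t clamp = max_sectors - 2;	/* the value used in the patch */

	printf("4TB = %llu sectors = %llu bytes\n",
	       (unsigned long long)max_sectors,
	       (unsigned long long)(max_sectors * 512));
	printf("clamped: %llu sectors -> sb->size = %llu KB (u32 max is 4294967295)\n",
	       (unsigned long long)clamp,
	       (unsigned long long)(clamp / 2));
	return 0;
}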
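Finally, why md_unregister_thread() now takes mdk_thread_t **: the waker in mddev_unlock() and the teardown path must agree, under pers_lock, on whether the thread still exists. Condensed from the two md.c hunks above (an illustrative excerpt of the ordering, not a standalone program):

	/* teardown side (md_unregister_thread) */
	spin_lock(&pers_lock);
	*threadp = NULL;		/* e.g. &mddev->sync_thread; wakers now see NULL */
	spin_unlock(&pers_lock);
	kthread_stop(thread->tsk);	/* safe: no waker can reach the stale pointer */
	kfree(thread);

	/* waker side (mddev_unlock), bracketed by the same lock */
	spin_lock(&pers_lock);
	md_wakeup_thread(mddev->thread);	/* md_wakeup_thread(NULL) is a no-op */
	spin_unlock(&pers_lock);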