diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 431b9b26ca5deda17ed117f634e1975556e009e0..cff955a040855cacb026be11f4e42b76c605d2de 100644
  *
  * We group bitmap updates into batches.  Each batch has a number.
  * We may write out several batches at once, but that isn't very important.
- * conf->bm_write is the number of the last batch successfully written.
- * conf->bm_flush is the number of the last batch that was closed to
+ * conf->seq_write is the number of the last batch successfully written.
+ * conf->seq_flush is the number of the last batch that was closed to
  *    new additions.
  * When we discover that we will need to write to any block in a stripe
  * (in add_stripe_bio) we update the in-memory bitmap and record in sh->bm_seq
- * the number of the batch it will be in. This is bm_flush+1.
+ * the number of the batch it will be in. This is seq_flush+1.
  * When we are ready to do a write, if that batch hasn't been written yet,
  *   we plug the array and queue the stripe for later.
  * When an unplug happens, we increment bm_flush, thus closing the current
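The batch bookkeeping above is compact enough to model in isolation. Below is a standalone sketch (not kernel code; only the field names conf->seq_write, conf->seq_flush and sh->bm_seq come from this file) showing why a stripe whose batch has not reached the disk yet must be parked:

/* Standalone model of the batch bookkeeping described above; the field
 * names mirror raid5_conf_t and stripe_head, but the rest is a sketch,
 * not the kernel code.
 */
#include <stdio.h>

struct conf { int seq_write, seq_flush; };
struct stripe { int bm_seq; };

/* add_stripe_bio(): the stripe joins the currently open batch */
static void record_write(struct conf *c, struct stripe *sh)
{
	sh->bm_seq = c->seq_flush + 1;
}

/* a stripe may be written only once its batch has hit the disk */
static int must_wait(struct conf *c, struct stripe *sh)
{
	return sh->bm_seq - c->seq_write > 0;
}

int main(void)
{
	struct conf c = { .seq_write = 0, .seq_flush = 0 };
	struct stripe sh;

	record_write(&c, &sh);				/* sh->bm_seq = 1 */
	printf("wait? %d\n", must_wait(&c, &sh));	/* 1: batch 1 unwritten */

	c.seq_flush++;			/* unplug: close batch 1 */
	c.seq_write = c.seq_flush;	/* bitmap_unplug: batch 1 on disk */
	printf("wait? %d\n", must_wait(&c, &sh));	/* 0: safe to write */
	return 0;
}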
 #include <linux/async.h>
 #include <linux/seq_file.h>
 #include <linux/cpu.h>
+#include <linux/slab.h>
 #include "md.h"
 #include "raid5.h"
+#include "raid0.h"
 #include "bitmap.h"
 
 /*
@@ -127,7 +129,7 @@ static inline int raid5_dec_bi_hw_segments(struct bio *bio)
 
 static inline void raid5_set_bi_hw_segments(struct bio *bio, unsigned int cnt)
 {
-       bio->bi_phys_segments = raid5_bi_phys_segments(bio) || (cnt << 16);
+       bio->bi_phys_segments = raid5_bi_phys_segments(bio) | (cnt << 16);
 }
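The one-character fix above is easy to misread. bi_phys_segments is overloaded to hold two 16-bit counters, and the boolean `||` collapsed the whole expression to 0 or 1, destroying both counters. A standalone demonstration (invented helper name, not kernel code):

#include <stdio.h>
#include <stdint.h>

static uint32_t set_hw(uint32_t seg, unsigned int cnt, int buggy)
{
	uint32_t low = seg & 0xffff;	/* what raid5_bi_phys_segments() keeps */

	return buggy ? (low || (cnt << 16))	/* 0 or 1: counters lost */
		     : (low |  (cnt << 16));	/* both counters preserved */
}

int main(void)
{
	printf("buggy: 0x%08x\n", set_hw(3, 5, 1));	/* 0x00000001 */
	printf("fixed: 0x%08x\n", set_hw(3, 5, 0));	/* 0x00050003 */
	return 0;
}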
 
 /* Find first data disk in a raid6 stripe */
@@ -197,14 +199,14 @@ static void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh)
                BUG_ON(!list_empty(&sh->lru));
                BUG_ON(atomic_read(&conf->active_stripes)==0);
                if (test_bit(STRIPE_HANDLE, &sh->state)) {
-                       if (test_bit(STRIPE_DELAYED, &sh->state)) {
+                       if (test_bit(STRIPE_DELAYED, &sh->state) &&
+                           !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
                                list_add_tail(&sh->lru, &conf->delayed_list);
-                               blk_plug_device(conf->mddev->queue);
-                       } else if (test_bit(STRIPE_BIT_DELAY, &sh->state) &&
-                                  sh->bm_seq - conf->seq_write > 0) {
+                       else if (test_bit(STRIPE_BIT_DELAY, &sh->state) &&
+                                  sh->bm_seq - conf->seq_write > 0)
                                list_add_tail(&sh->lru, &conf->bitmap_list);
-                               blk_plug_device(conf->mddev->queue);
-                       } else {
+                       else {
+                               clear_bit(STRIPE_DELAYED, &sh->state);
                                clear_bit(STRIPE_BIT_DELAY, &sh->state);
                                list_add_tail(&sh->lru, &conf->handle_list);
                        }
@@ -275,12 +277,13 @@ out:
        return sh;
 }
 
-static void shrink_buffers(struct stripe_head *sh, int num)
+static void shrink_buffers(struct stripe_head *sh)
 {
        struct page *p;
        int i;
+       int num = sh->raid_conf->pool_size;
 
-       for (i=0; i<num ; i++) {
+       for (i = 0; i < num ; i++) {
                p = sh->dev[i].page;
                if (!p)
                        continue;
@@ -289,11 +292,12 @@ static void shrink_buffers(struct stripe_head *sh, int num)
        }
 }
 
-static int grow_buffers(struct stripe_head *sh, int num)
+static int grow_buffers(struct stripe_head *sh)
 {
        int i;
+       int num = sh->raid_conf->pool_size;
 
-       for (i=0; i<num; i++) {
+       for (i = 0; i < num; i++) {
                struct page *page;
 
                if (!(page = alloc_page(GFP_KERNEL))) {
@@ -362,8 +366,72 @@ static struct stripe_head *__find_stripe(raid5_conf_t *conf, sector_t sector,
        return NULL;
 }
 
-static void unplug_slaves(mddev_t *mddev);
-static void raid5_unplug_device(struct request_queue *q);
+/*
+ * Need to check if array has failed when deciding whether to:
+ *  - start an array
+ *  - remove non-faulty devices
+ *  - add a spare
+ *  - allow a reshape
+ * This determination is simple when no reshape is happening.
+ * However if there is a reshape, we need to carefully check
+ * both the before and after sections.
+ * This is because some failed devices may only affect one
+ * of the two sections, and some non-in_sync devices may
+ * be insync in the section most affected by failed devices.
+ */
+static int has_failed(raid5_conf_t *conf)
+{
+       int degraded;
+       int i;
+       if (conf->mddev->reshape_position == MaxSector)
+               return conf->mddev->degraded > conf->max_degraded;
+
+       rcu_read_lock();
+       degraded = 0;
+       for (i = 0; i < conf->previous_raid_disks; i++) {
+               mdk_rdev_t *rdev = rcu_dereference(conf->disks[i].rdev);
+               if (!rdev || test_bit(Faulty, &rdev->flags))
+                       degraded++;
+               else if (test_bit(In_sync, &rdev->flags))
+                       ;
+               else
+                       /* not in-sync or faulty.
+                        * If the reshape increases the number of devices,
+                        * this is being recovered by the reshape, so
+                        * this 'previous' section is not in_sync.
+                        * If the number of devices is being reduced however,
+                        * the device can only be part of the array if
+                        * we are reverting a reshape, so this section will
+                        * be in-sync.
+                        */
+                       if (conf->raid_disks >= conf->previous_raid_disks)
+                               degraded++;
+       }
+       rcu_read_unlock();
+       if (degraded > conf->max_degraded)
+               return 1;
+       rcu_read_lock();
+       degraded = 0;
+       for (i = 0; i < conf->raid_disks; i++) {
+               mdk_rdev_t *rdev = rcu_dereference(conf->disks[i].rdev);
+               if (!rdev || test_bit(Faulty, &rdev->flags))
+                       degraded++;
+               else if (test_bit(In_sync, &rdev->flags))
+                       ;
+               else
+                       /* not in-sync or faulty.
+                        * If reshape increases the number of devices, this
+                        * section has already been recovered, else it
+                        * almost certainly hasn't.
+                        */
+                       if (conf->raid_disks <= conf->previous_raid_disks)
+                               degraded++;
+       }
+       rcu_read_unlock();
+       if (degraded > conf->max_degraded)
+               return 1;
+       return 0;
+}
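For example, while growing a 4-disk RAID5 to 5 disks, the freshly added device is not in_sync, yet it counts as a failure in neither pass: it sits outside the old 4-disk section entirely, and in the new section every stripe behind reshape_position was written with its participation. Conversely, when shrinking, a not-in_sync device in the 'previous' section can only be present because a reshape is being reverted, so that section treats it as good, exactly as the comments in the loops above say.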
 
 static struct stripe_head *
 get_active_stripe(raid5_conf_t *conf, sector_t sector,
@@ -393,8 +461,7 @@ get_active_stripe(raid5_conf_t *conf, sector_t sector,
                                                     < (conf->max_nr_stripes *3/4)
                                                     || !conf->inactive_blocked),
                                                    conf->device_lock,
-                                                   raid5_unplug_device(conf->mddev->queue)
-                                       );
+                                                   );
                                conf->inactive_blocked = 0;
                        } else
                                init_stripe(sh, sector, previous);
@@ -436,9 +503,12 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
                int rw;
                struct bio *bi;
                mdk_rdev_t *rdev;
-               if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags))
-                       rw = WRITE;
-               else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags))
+               if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) {
+                       if (test_and_clear_bit(R5_WantFUA, &sh->dev[i].flags))
+                               rw = WRITE_FUA;
+                       else
+                               rw = WRITE;
+               } else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags))
                        rw = READ;
                else
                        continue;
@@ -446,7 +516,7 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
                bi = &sh->dev[i].req;
 
                bi->bi_rw = rw;
-               if (rw == WRITE)
+               if (rw & WRITE)
                        bi->bi_end_io = raid5_end_write_request;
                else
                        bi->bi_end_io = raid5_end_read_request;
@@ -480,13 +550,13 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
                        bi->bi_io_vec[0].bv_offset = 0;
                        bi->bi_size = STRIPE_SIZE;
                        bi->bi_next = NULL;
-                       if (rw == WRITE &&
+                       if ((rw & WRITE) &&
                            test_bit(R5_ReWrite, &sh->dev[i].flags))
                                atomic_add(STRIPE_SECTORS,
                                        &rdev->corrected_errors);
                        generic_make_request(bi);
                } else {
-                       if (rw == WRITE)
+                       if (rw & WRITE)
                                set_bit(STRIPE_DEGRADED, &sh->state);
                        pr_debug("skip op %ld on disc %d for sector %llu\n",
                                bi->bi_rw, i, (unsigned long long)sh->sector);
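The `rw == WRITE` to `rw & WRITE` conversions in this and the previous hunk are forced by the WRITE_FUA case above: rw is now a bit mask, and an equality test would silently misclassify FUA writes as reads. A standalone illustration with made-up flag values (the real REQ_* constants live in the block layer):

#include <stdio.h>

#define DEMO_WRITE	(1u << 0)
#define DEMO_FUA	(1u << 1)
#define DEMO_WRITE_FUA	(DEMO_WRITE | DEMO_FUA)

int main(void)
{
	unsigned long rw = DEMO_WRITE_FUA;

	printf("rw == WRITE: %d\n", rw == DEMO_WRITE);	  /* 0: misses FUA writes */
	printf("rw & WRITE:  %d\n", !!(rw & DEMO_WRITE)); /* 1: catches all writes */
	return 0;
}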
@@ -517,7 +587,7 @@ async_copy_data(int frombio, struct bio *bio, struct page *page,
        init_async_submit(&submit, flags, tx, NULL, NULL, NULL);
 
        bio_for_each_segment(bvl, bio, i) {
-               int len = bio_iovec_idx(bio, i)->bv_len;
+               int len = bvl->bv_len;
                int clen;
                int b_offset = 0;
 
@@ -533,8 +603,8 @@ async_copy_data(int frombio, struct bio *bio, struct page *page,
                        clen = len;
 
                if (clen > 0) {
-                       b_offset += bio_iovec_idx(bio, i)->bv_offset;
-                       bio_page = bio_iovec_idx(bio, i)->bv_page;
+                       b_offset += bvl->bv_offset;
+                       bio_page = bvl->bv_page;
                        if (frombio)
                                tx = async_memcpy(page, bio_page, page_offset,
                                                  b_offset, clen, &submit);
@@ -961,6 +1031,8 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
 
                        while (wbi && wbi->bi_sector <
                                dev->sector + STRIPE_SECTORS) {
+                               if (wbi->bi_rw & REQ_FUA)
+                                       set_bit(R5_WantFUA, &dev->flags);
                                tx = async_copy_data(1, wbi, dev->page,
                                        dev->sector, tx);
                                wbi = r5_next_bio(wbi, dev->sector);
@@ -978,15 +1050,22 @@ static void ops_complete_reconstruct(void *stripe_head_ref)
        int pd_idx = sh->pd_idx;
        int qd_idx = sh->qd_idx;
        int i;
+       bool fua = false;
 
        pr_debug("%s: stripe %llu\n", __func__,
                (unsigned long long)sh->sector);
 
+       for (i = disks; i--; )
+               fua |= test_bit(R5_WantFUA, &sh->dev[i].flags);
+
        for (i = disks; i--; ) {
                struct r5dev *dev = &sh->dev[i];
 
-               if (dev->written || i == pd_idx || i == qd_idx)
+               if (dev->written || i == pd_idx || i == qd_idx) {
                        set_bit(R5_UPTODATE, &dev->flags);
+                       if (fua)
+                               set_bit(R5_WantFUA, &dev->flags);
+               }
        }
 
        if (sh->reconstruct_state == reconstruct_state_drain_run)
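The scan added above exists because FUA durability must cover the redundancy, not just the data: if any drained block was written with FUA, the parity block(s) computed from it have to carry the flag too. A toy model of that propagation (array size and flags invented):

#include <stdio.h>
#include <stdbool.h>

#define NDISKS 5

int main(void)
{
	bool want_fua[NDISKS] = { false, true, false, false, false };
	int pd_idx = 4, i;
	bool fua = false;

	for (i = NDISKS; i--; )		/* did any data block want FUA? */
		fua |= want_fua[i];

	if (fua)			/* then parity inherits it */
		want_fua[pd_idx] = true;

	printf("parity FUA: %d\n", want_fua[pd_idx]);	/* 1 */
	return 0;
}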
@@ -1238,19 +1317,18 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
 static int grow_one_stripe(raid5_conf_t *conf)
 {
        struct stripe_head *sh;
-       int disks = max(conf->raid_disks, conf->previous_raid_disks);
        sh = kmem_cache_alloc(conf->slab_cache, GFP_KERNEL);
        if (!sh)
                return 0;
-       memset(sh, 0, sizeof(*sh) + (disks-1)*sizeof(struct r5dev));
+       memset(sh, 0, sizeof(*sh) + (conf->pool_size-1)*sizeof(struct r5dev));
        sh->raid_conf = conf;
        spin_lock_init(&sh->lock);
        #ifdef CONFIG_MULTICORE_RAID456
        init_waitqueue_head(&sh->ops.wait_for_ops);
        #endif
 
-       if (grow_buffers(sh, disks)) {
-               shrink_buffers(sh, disks);
+       if (grow_buffers(sh)) {
+               shrink_buffers(sh);
                kmem_cache_free(conf->slab_cache, sh);
                return 0;
        }
@@ -1267,10 +1345,14 @@ static int grow_stripes(raid5_conf_t *conf, int num)
        struct kmem_cache *sc;
        int devs = max(conf->raid_disks, conf->previous_raid_disks);
 
-       sprintf(conf->cache_name[0],
-               "raid%d-%s", conf->level, mdname(conf->mddev));
-       sprintf(conf->cache_name[1],
-               "raid%d-%s-alt", conf->level, mdname(conf->mddev));
+       if (conf->mddev->gendisk)
+               sprintf(conf->cache_name[0],
+                       "raid%d-%s", conf->level, mdname(conf->mddev));
+       else
+               sprintf(conf->cache_name[0],
+                       "raid%d-%p", conf->level, conf->mddev);
+       sprintf(conf->cache_name[1], "%s-alt", conf->cache_name[0]);
+
        conf->active_name = 0;
        sc = kmem_cache_create(conf->cache_name[conf->active_name],
                               sizeof(struct stripe_head)+(devs-1)*sizeof(struct r5dev),
@@ -1388,8 +1470,7 @@ static int resize_stripes(raid5_conf_t *conf, int newsize)
                wait_event_lock_irq(conf->wait_for_stripe,
                                    !list_empty(&conf->inactive_list),
                                    conf->device_lock,
-                                   unplug_slaves(conf->mddev)
-                       );
+                                   );
                osh = get_free_stripe(conf);
                spin_unlock_irq(&conf->device_lock);
                atomic_set(&nsh->count, 1);
@@ -1466,7 +1547,7 @@ static int drop_one_stripe(raid5_conf_t *conf)
        if (!sh)
                return 0;
        BUG_ON(atomic_read(&sh->count));
-       shrink_buffers(sh, conf->pool_size);
+       shrink_buffers(sh);
        kmem_cache_free(conf->slab_cache, sh);
        atomic_dec(&conf->active_stripes);
        return 1;
@@ -1508,7 +1589,7 @@ static void raid5_end_read_request(struct bio * bi, int error)
                set_bit(R5_UPTODATE, &sh->dev[i].flags);
                if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
                        rdev = conf->disks[i].rdev;
-                       printk_rl(KERN_INFO "raid5:%s: read error corrected"
+                       printk_rl(KERN_INFO "md/raid:%s: read error corrected"
                                  " (%lu sectors at %llu on %s)\n",
                                  mdname(conf->mddev), STRIPE_SECTORS,
                                  (unsigned long long)(sh->sector
@@ -1526,9 +1607,9 @@ static void raid5_end_read_request(struct bio * bi, int error)
 
                clear_bit(R5_UPTODATE, &sh->dev[i].flags);
                atomic_inc(&rdev->read_errors);
-               if (conf->mddev->degraded)
+               if (conf->mddev->degraded >= conf->max_degraded)
                        printk_rl(KERN_WARNING
-                                 "raid5:%s: read error not correctable "
+                                 "md/raid:%s: read error not correctable "
                                  "(sector %llu on %s).\n",
                                  mdname(conf->mddev),
                                  (unsigned long long)(sh->sector
@@ -1537,7 +1618,7 @@ static void raid5_end_read_request(struct bio * bi, int error)
                else if (test_bit(R5_ReWrite, &sh->dev[i].flags))
                        /* Oh, no!!! */
                        printk_rl(KERN_WARNING
-                                 "raid5:%s: read error NOT corrected!! "
+                                 "md/raid:%s: read error NOT corrected!! "
                                  "(sector %llu on %s).\n",
                                  mdname(conf->mddev),
                                  (unsigned long long)(sh->sector
@@ -1546,7 +1627,7 @@ static void raid5_end_read_request(struct bio * bi, int error)
                else if (atomic_read(&rdev->read_errors)
                         > conf->max_nr_stripes)
                        printk(KERN_WARNING
-                              "raid5:%s: Too many read errors, failing device %s.\n",
+                              "md/raid:%s: Too many read errors, failing device %s.\n",
                               mdname(conf->mddev), bdn);
                else
                        retry = 1;
@@ -1618,27 +1699,28 @@ static void raid5_build_block(struct stripe_head *sh, int i, int previous)
 static void error(mddev_t *mddev, mdk_rdev_t *rdev)
 {
        char b[BDEVNAME_SIZE];
-       raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
-       pr_debug("raid5: error called\n");
+       raid5_conf_t *conf = mddev->private;
+       pr_debug("raid456: error called\n");
 
-       if (!test_bit(Faulty, &rdev->flags)) {
-               set_bit(MD_CHANGE_DEVS, &mddev->flags);
-               if (test_and_clear_bit(In_sync, &rdev->flags)) {
-                       unsigned long flags;
-                       spin_lock_irqsave(&conf->device_lock, flags);
-                       mddev->degraded++;
-                       spin_unlock_irqrestore(&conf->device_lock, flags);
-                       /*
-                        * if recovery was running, make sure it aborts.
-                        */
-                       set_bit(MD_RECOVERY_INTR, &mddev->recovery);
-               }
-               set_bit(Faulty, &rdev->flags);
-               printk(KERN_ALERT
-                      "raid5: Disk failure on %s, disabling device.\n"
-                      "raid5: Operation continuing on %d devices.\n",
-                      bdevname(rdev->bdev,b), conf->raid_disks - mddev->degraded);
+       if (test_and_clear_bit(In_sync, &rdev->flags)) {
+               unsigned long flags;
+               spin_lock_irqsave(&conf->device_lock, flags);
+               mddev->degraded++;
+               spin_unlock_irqrestore(&conf->device_lock, flags);
+               /*
+                * if recovery was running, make sure it aborts.
+                */
+               set_bit(MD_RECOVERY_INTR, &mddev->recovery);
        }
+       set_bit(Faulty, &rdev->flags);
+       set_bit(MD_CHANGE_DEVS, &mddev->flags);
+       printk(KERN_ALERT
+              "md/raid:%s: Disk failure on %s, disabling device.\n"
+              "md/raid:%s: Operation continuing on %d devices.\n",
+              mdname(mddev),
+              bdevname(rdev->bdev, b),
+              mdname(mddev),
+              conf->raid_disks - mddev->degraded);
 }
 
 /*
@@ -1649,8 +1731,8 @@ static sector_t raid5_compute_sector(raid5_conf_t *conf, sector_t r_sector,
                                     int previous, int *dd_idx,
                                     struct stripe_head *sh)
 {
-       long stripe;
-       unsigned long chunk_number;
+       sector_t stripe, stripe2;
+       sector_t chunk_number;
        unsigned int chunk_offset;
        int pd_idx, qd_idx;
        int ddf_layout = 0;
@@ -1670,18 +1752,13 @@ static sector_t raid5_compute_sector(raid5_conf_t *conf, sector_t r_sector,
         */
        chunk_offset = sector_div(r_sector, sectors_per_chunk);
        chunk_number = r_sector;
-       BUG_ON(r_sector != chunk_number);
 
        /*
         * Compute the stripe number
         */
-       stripe = chunk_number / data_disks;
-
-       /*
-        * Compute the data disk and parity disk indexes inside the stripe
-        */
-       *dd_idx = chunk_number % data_disks;
-
+       stripe = chunk_number;
+       *dd_idx = sector_div(stripe, data_disks);
+       stripe2 = stripe;
        /*
         * Select the parity disk based on the user selected algorithm.
         */
@@ -1693,21 +1770,21 @@ static sector_t raid5_compute_sector(raid5_conf_t *conf, sector_t r_sector,
        case 5:
                switch (algorithm) {
                case ALGORITHM_LEFT_ASYMMETRIC:
-                       pd_idx = data_disks - stripe % raid_disks;
+                       pd_idx = data_disks - sector_div(stripe2, raid_disks);
                        if (*dd_idx >= pd_idx)
                                (*dd_idx)++;
                        break;
                case ALGORITHM_RIGHT_ASYMMETRIC:
-                       pd_idx = stripe % raid_disks;
+                       pd_idx = sector_div(stripe2, raid_disks);
                        if (*dd_idx >= pd_idx)
                                (*dd_idx)++;
                        break;
                case ALGORITHM_LEFT_SYMMETRIC:
-                       pd_idx = data_disks - stripe % raid_disks;
+                       pd_idx = data_disks - sector_div(stripe2, raid_disks);
                        *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks;
                        break;
                case ALGORITHM_RIGHT_SYMMETRIC:
-                       pd_idx = stripe % raid_disks;
+                       pd_idx = sector_div(stripe2, raid_disks);
                        *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks;
                        break;
                case ALGORITHM_PARITY_0:
@@ -1718,8 +1795,6 @@ static sector_t raid5_compute_sector(raid5_conf_t *conf, sector_t r_sector,
                        pd_idx = data_disks;
                        break;
                default:
-                       printk(KERN_ERR "raid5: unsupported algorithm %d\n",
-                               algorithm);
                        BUG();
                }
                break;
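Every `stripe % raid_disks` in this function becomes `sector_div(stripe2, raid_disks)` because stripe is now a sector_t: a plain 64-bit `%` would drag libgcc division helpers into 32-bit builds. sector_div() divides in place and returns the remainder, which is also why the code works on the copy stripe2 rather than stripe itself. A userspace stand-in for the macro, to show the calling convention:

#include <stdio.h>
#include <stdint.h>

typedef uint64_t sector_t;

/* userspace stand-in for the kernel's sector_div() */
#define sector_div(n, base) ({			\
	uint32_t _rem = (n) % (base);		\
	(n) /= (base);				\
	_rem;					\
})

int main(void)
{
	sector_t stripe = 1000003, stripe2 = stripe;
	int raid_disks = 7;
	int rem = sector_div(stripe2, raid_disks);	/* remainder */

	/* stripe2 now holds the quotient; 'stripe' is untouched */
	printf("rem=%d quotient=%llu stripe=%llu\n", rem,
	       (unsigned long long)stripe2,
	       (unsigned long long)stripe);
	return 0;
}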
@@ -1727,7 +1802,7 @@ static sector_t raid5_compute_sector(raid5_conf_t *conf, sector_t r_sector,
 
                switch (algorithm) {
                case ALGORITHM_LEFT_ASYMMETRIC:
-                       pd_idx = raid_disks - 1 - (stripe % raid_disks);
+                       pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks);
                        qd_idx = pd_idx + 1;
                        if (pd_idx == raid_disks-1) {
                                (*dd_idx)++;    /* Q D D D P */
@@ -1736,7 +1811,7 @@ static sector_t raid5_compute_sector(raid5_conf_t *conf, sector_t r_sector,
                                (*dd_idx) += 2; /* D D P Q D */
                        break;
                case ALGORITHM_RIGHT_ASYMMETRIC:
-                       pd_idx = stripe % raid_disks;
+                       pd_idx = sector_div(stripe2, raid_disks);
                        qd_idx = pd_idx + 1;
                        if (pd_idx == raid_disks-1) {
                                (*dd_idx)++;    /* Q D D D P */
@@ -1745,12 +1820,12 @@ static sector_t raid5_compute_sector(raid5_conf_t *conf, sector_t r_sector,
                                (*dd_idx) += 2; /* D D P Q D */
                        break;
                case ALGORITHM_LEFT_SYMMETRIC:
-                       pd_idx = raid_disks - 1 - (stripe % raid_disks);
+                       pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks);
                        qd_idx = (pd_idx + 1) % raid_disks;
                        *dd_idx = (pd_idx + 2 + *dd_idx) % raid_disks;
                        break;
                case ALGORITHM_RIGHT_SYMMETRIC:
-                       pd_idx = stripe % raid_disks;
+                       pd_idx = sector_div(stripe2, raid_disks);
                        qd_idx = (pd_idx + 1) % raid_disks;
                        *dd_idx = (pd_idx + 2 + *dd_idx) % raid_disks;
                        break;
@@ -1769,7 +1844,7 @@ static sector_t raid5_compute_sector(raid5_conf_t *conf, sector_t r_sector,
                        /* Exactly the same as RIGHT_ASYMMETRIC, but order
                         * of blocks for computing Q is different.
                         */
-                       pd_idx = stripe % raid_disks;
+                       pd_idx = sector_div(stripe2, raid_disks);
                        qd_idx = pd_idx + 1;
                        if (pd_idx == raid_disks-1) {
                                (*dd_idx)++;    /* Q D D D P */
@@ -1784,7 +1859,8 @@ static sector_t raid5_compute_sector(raid5_conf_t *conf, sector_t r_sector,
                         * D D D P Q  rather than
                         * Q D D D P
                         */
-                       pd_idx = raid_disks - 1 - ((stripe + 1) % raid_disks);
+                       stripe2 += 1;
+                       pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks);
                        qd_idx = pd_idx + 1;
                        if (pd_idx == raid_disks-1) {
                                (*dd_idx)++;    /* Q D D D P */
@@ -1796,7 +1872,7 @@ static sector_t raid5_compute_sector(raid5_conf_t *conf, sector_t r_sector,
 
                case ALGORITHM_ROTATING_N_CONTINUE:
                        /* Same as left_symmetric but Q is before P */
-                       pd_idx = raid_disks - 1 - (stripe % raid_disks);
+                       pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks);
                        qd_idx = (pd_idx + raid_disks - 1) % raid_disks;
                        *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks;
                        ddf_layout = 1;
@@ -1804,27 +1880,27 @@ static sector_t raid5_compute_sector(raid5_conf_t *conf, sector_t r_sector,
 
                case ALGORITHM_LEFT_ASYMMETRIC_6:
                        /* RAID5 left_asymmetric, with Q on last device */
-                       pd_idx = data_disks - stripe % (raid_disks-1);
+                       pd_idx = data_disks - sector_div(stripe2, raid_disks-1);
                        if (*dd_idx >= pd_idx)
                                (*dd_idx)++;
                        qd_idx = raid_disks - 1;
                        break;
 
                case ALGORITHM_RIGHT_ASYMMETRIC_6:
-                       pd_idx = stripe % (raid_disks-1);
+                       pd_idx = sector_div(stripe2, raid_disks-1);
                        if (*dd_idx >= pd_idx)
                                (*dd_idx)++;
                        qd_idx = raid_disks - 1;
                        break;
 
                case ALGORITHM_LEFT_SYMMETRIC_6:
-                       pd_idx = data_disks - stripe % (raid_disks-1);
+                       pd_idx = data_disks - sector_div(stripe2, raid_disks-1);
                        *dd_idx = (pd_idx + 1 + *dd_idx) % (raid_disks-1);
                        qd_idx = raid_disks - 1;
                        break;
 
                case ALGORITHM_RIGHT_SYMMETRIC_6:
-                       pd_idx = stripe % (raid_disks-1);
+                       pd_idx = sector_div(stripe2, raid_disks-1);
                        *dd_idx = (pd_idx + 1 + *dd_idx) % (raid_disks-1);
                        qd_idx = raid_disks - 1;
                        break;
@@ -1835,10 +1911,7 @@ static sector_t raid5_compute_sector(raid5_conf_t *conf, sector_t r_sector,
                        qd_idx = raid_disks - 1;
                        break;
 
-
                default:
-                       printk(KERN_CRIT "raid6: unsupported algorithm %d\n",
-                              algorithm);
                        BUG();
                }
                break;
@@ -1869,14 +1942,14 @@ static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous)
                                 : conf->algorithm;
        sector_t stripe;
        int chunk_offset;
-       int chunk_number, dummy1, dd_idx = i;
+       sector_t chunk_number;
+       int dummy1, dd_idx = i;
        sector_t r_sector;
        struct stripe_head sh2;
 
 
        chunk_offset = sector_div(new_sector, sectors_per_chunk);
        stripe = new_sector;
-       BUG_ON(new_sector != stripe);
 
        if (i == sh->pd_idx)
                return 0;
@@ -1901,8 +1974,6 @@ static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous)
                case ALGORITHM_PARITY_N:
                        break;
                default:
-                       printk(KERN_ERR "raid5: unsupported algorithm %d\n",
-                              algorithm);
                        BUG();
                }
                break;
@@ -1961,21 +2032,20 @@ static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous)
                        i -= 1;
                        break;
                default:
-                       printk(KERN_CRIT "raid6: unsupported algorithm %d\n",
-                              algorithm);
                        BUG();
                }
                break;
        }
 
        chunk_number = stripe * data_disks + i;
-       r_sector = (sector_t)chunk_number * sectors_per_chunk + chunk_offset;
+       r_sector = chunk_number * sectors_per_chunk + chunk_offset;
 
        check = raid5_compute_sector(conf, r_sector,
                                     previous, &dummy1, &sh2);
        if (check != sh->sector || dummy1 != dd_idx || sh2.pd_idx != sh->pd_idx
                || sh2.qd_idx != sh->qd_idx) {
-               printk(KERN_ERR "compute_blocknr: map not correct\n");
+               printk(KERN_ERR "md/raid:%s: compute_blocknr: map not correct\n",
+                      mdname(conf->mddev));
                return 0;
        }
        return r_sector;
@@ -2947,6 +3017,7 @@ static void handle_stripe5(struct stripe_head *sh)
        struct r5dev *dev;
        mdk_rdev_t *blocked_rdev = NULL;
        int prexor;
+       int dec_preread_active = 0;
 
        memset(&s, 0, sizeof(s));
        pr_debug("handling stripe %llu, state=%#lx cnt=%d, pd_idx=%d check:%d "
@@ -2968,7 +3039,6 @@ static void handle_stripe5(struct stripe_head *sh)
                mdk_rdev_t *rdev;
 
                dev = &sh->dev[i];
-               clear_bit(R5_Insync, &dev->flags);
 
                pr_debug("check %d: state 0x%lx toread %p read %p write %p "
                        "written %p\n", i, dev->flags, dev->toread, dev->read,
@@ -3005,17 +3075,27 @@ static void handle_stripe5(struct stripe_head *sh)
                        blocked_rdev = rdev;
                        atomic_inc(&rdev->nr_pending);
                }
-               if (!rdev || !test_bit(In_sync, &rdev->flags)) {
+               clear_bit(R5_Insync, &dev->flags);
+               if (!rdev)
+                       /* Not in-sync */;
+               else if (test_bit(In_sync, &rdev->flags))
+                       set_bit(R5_Insync, &dev->flags);
+               else if (!test_bit(Faulty, &rdev->flags)) {
+                       /* could be in-sync depending on recovery/reshape status */
+                       if (sh->sector + STRIPE_SECTORS <= rdev->recovery_offset)
+                               set_bit(R5_Insync, &dev->flags);
+               }
+               if (!test_bit(R5_Insync, &dev->flags)) {
                        /* The ReadError flag will just be confusing now */
                        clear_bit(R5_ReadError, &dev->flags);
                        clear_bit(R5_ReWrite, &dev->flags);
                }
-               if (!rdev || !test_bit(In_sync, &rdev->flags)
-                   || test_bit(R5_ReadError, &dev->flags)) {
+               if (test_bit(R5_ReadError, &dev->flags))
+                       clear_bit(R5_Insync, &dev->flags);
+               if (!test_bit(R5_Insync, &dev->flags)) {
                        s.failed++;
                        s.failed_num = i;
-               } else
-                       set_bit(R5_Insync, &dev->flags);
+               }
        }
        rcu_read_unlock();
 
@@ -3042,12 +3122,16 @@ static void handle_stripe5(struct stripe_head *sh)
        /* check if the array has lost two devices and, if so, some requests might
         * need to be failed
         */
-       if (s.failed > 1 && s.to_read+s.to_write+s.written)
-               handle_failed_stripe(conf, sh, &s, disks, &return_bi);
-       if (s.failed > 1 && s.syncing) {
-               md_done_sync(conf->mddev, STRIPE_SECTORS,0);
-               clear_bit(STRIPE_SYNCING, &sh->state);
-               s.syncing = 0;
+       if (s.failed > 1) {
+               sh->check_state = 0;
+               sh->reconstruct_state = 0;
+               if (s.to_read+s.to_write+s.written)
+                       handle_failed_stripe(conf, sh, &s, disks, &return_bi);
+               if (s.syncing) {
+                       md_done_sync(conf->mddev, STRIPE_SECTORS,0);
+                       clear_bit(STRIPE_SYNCING, &sh->state);
+                       s.syncing = 0;
+               }
        }
 
        /* might be able to return some write requests if the parity block
@@ -3096,12 +3180,8 @@ static void handle_stripe5(struct stripe_head *sh)
                                        set_bit(STRIPE_INSYNC, &sh->state);
                        }
                }
-               if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
-                       atomic_dec(&conf->preread_active_stripes);
-                       if (atomic_read(&conf->preread_active_stripes) <
-                               IO_THRESHOLD)
-                               md_wakeup_thread(conf->mddev->thread);
-               }
+               if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
+                       dec_preread_active = 1;
        }
 
        /* Now to consider new write requests and what else, if anything
@@ -3208,6 +3288,16 @@ static void handle_stripe5(struct stripe_head *sh)
 
        ops_run_io(sh, &s);
 
+       if (dec_preread_active) {
+               /* We delay this until after ops_run_io so that if make_request
+                * is waiting on a flush, it won't continue until the writes
+                * have actually been submitted.
+                */
+               atomic_dec(&conf->preread_active_stripes);
+               if (atomic_read(&conf->preread_active_stripes) <
+                   IO_THRESHOLD)
+                       md_wakeup_thread(conf->mddev->thread);
+       }
        return_io(return_bi);
 }
 
@@ -3221,6 +3311,7 @@ static void handle_stripe6(struct stripe_head *sh)
        struct r6_state r6s;
        struct r5dev *dev, *pdev, *qdev;
        mdk_rdev_t *blocked_rdev = NULL;
+       int dec_preread_active = 0;
 
        pr_debug("handling stripe %llu, state=%#lx cnt=%d, "
                "pd_idx=%d, qd_idx=%d\n, check:%d, reconstruct:%d\n",
@@ -3242,7 +3333,6 @@ static void handle_stripe6(struct stripe_head *sh)
        for (i=disks; i--; ) {
                mdk_rdev_t *rdev;
                dev = &sh->dev[i];
-               clear_bit(R5_Insync, &dev->flags);
 
                pr_debug("check %d: state 0x%lx read %p write %p written %p\n",
                        i, dev->flags, dev->toread, dev->towrite, dev->written);
@@ -3280,18 +3370,28 @@ static void handle_stripe6(struct stripe_head *sh)
                        blocked_rdev = rdev;
                        atomic_inc(&rdev->nr_pending);
                }
-               if (!rdev || !test_bit(In_sync, &rdev->flags)) {
+               clear_bit(R5_Insync, &dev->flags);
+               if (!rdev)
+                       /* Not in-sync */;
+               else if (test_bit(In_sync, &rdev->flags))
+                       set_bit(R5_Insync, &dev->flags);
+               else if (!test_bit(Faulty, &rdev->flags)) {
+                       /* in sync if before recovery_offset */
+                       if (sh->sector + STRIPE_SECTORS <= rdev->recovery_offset)
+                               set_bit(R5_Insync, &dev->flags);
+               }
+               if (!test_bit(R5_Insync, &dev->flags)) {
                        /* The ReadError flag will just be confusing now */
                        clear_bit(R5_ReadError, &dev->flags);
                        clear_bit(R5_ReWrite, &dev->flags);
                }
-               if (!rdev || !test_bit(In_sync, &rdev->flags)
-                   || test_bit(R5_ReadError, &dev->flags)) {
+               if (test_bit(R5_ReadError, &dev->flags))
+                       clear_bit(R5_Insync, &dev->flags);
+               if (!test_bit(R5_Insync, &dev->flags)) {
                        if (s.failed < 2)
                                r6s.failed_num[s.failed] = i;
                        s.failed++;
-               } else
-                       set_bit(R5_Insync, &dev->flags);
+               }
        }
        rcu_read_unlock();
 
@@ -3318,12 +3418,16 @@ static void handle_stripe6(struct stripe_head *sh)
        /* check if the array has lost >2 devices and, if so, some requests
         * might need to be failed
         */
-       if (s.failed > 2 && s.to_read+s.to_write+s.written)
-               handle_failed_stripe(conf, sh, &s, disks, &return_bi);
-       if (s.failed > 2 && s.syncing) {
-               md_done_sync(conf->mddev, STRIPE_SECTORS,0);
-               clear_bit(STRIPE_SYNCING, &sh->state);
-               s.syncing = 0;
+       if (s.failed > 2) {
+               sh->check_state = 0;
+               sh->reconstruct_state = 0;
+               if (s.to_read+s.to_write+s.written)
+                       handle_failed_stripe(conf, sh, &s, disks, &return_bi);
+               if (s.syncing) {
+                       md_done_sync(conf->mddev, STRIPE_SECTORS,0);
+                       clear_bit(STRIPE_SYNCING, &sh->state);
+                       s.syncing = 0;
+               }
        }
 
        /*
@@ -3358,7 +3462,6 @@ static void handle_stripe6(struct stripe_head *sh)
         * completed
         */
        if (sh->reconstruct_state == reconstruct_state_drain_result) {
-               int qd_idx = sh->qd_idx;
 
                sh->reconstruct_state = reconstruct_state_idle;
                /* All the 'written' buffers and the parity blocks are ready to
@@ -3380,12 +3483,8 @@ static void handle_stripe6(struct stripe_head *sh)
                                        set_bit(STRIPE_INSYNC, &sh->state);
                        }
                }
-               if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
-                       atomic_dec(&conf->preread_active_stripes);
-                       if (atomic_read(&conf->preread_active_stripes) <
-                               IO_THRESHOLD)
-                               md_wakeup_thread(conf->mddev->thread);
-               }
+               if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
+                       dec_preread_active = 1;
        }
 
        /* Now to consider new write requests and what else, if anything
@@ -3494,6 +3593,18 @@ static void handle_stripe6(struct stripe_head *sh)
 
        ops_run_io(sh, &s);
 
+
+       if (dec_preread_active) {
+               /* We delay this until after ops_run_io so that if make_request
+                * is waiting on a flush, it won't continue until the writes
+                * have actually been submitted.
+                */
+               atomic_dec(&conf->preread_active_stripes);
+               if (atomic_read(&conf->preread_active_stripes) <
+                   IO_THRESHOLD)
+                       md_wakeup_thread(conf->mddev->thread);
+       }
+
        return_io(return_bi);
 }
 
@@ -3518,8 +3629,7 @@ static void raid5_activate_delayed(raid5_conf_t *conf)
                                atomic_inc(&conf->preread_active_stripes);
                        list_add_tail(&sh->lru, &conf->hold_list);
                }
-       } else
-               blk_plug_device(conf->mddev->queue);
+       }
 }
 
 static void activate_bit_delay(raid5_conf_t *conf)
@@ -3536,60 +3646,14 @@ static void activate_bit_delay(raid5_conf_t *conf)
        }
 }
 
-static void unplug_slaves(mddev_t *mddev)
-{
-       raid5_conf_t *conf = mddev->private;
-       int i;
-       int devs = max(conf->raid_disks, conf->previous_raid_disks);
-
-       rcu_read_lock();
-       for (i = 0; i < devs; i++) {
-               mdk_rdev_t *rdev = rcu_dereference(conf->disks[i].rdev);
-               if (rdev && !test_bit(Faulty, &rdev->flags) && atomic_read(&rdev->nr_pending)) {
-                       struct request_queue *r_queue = bdev_get_queue(rdev->bdev);
-
-                       atomic_inc(&rdev->nr_pending);
-                       rcu_read_unlock();
-
-                       blk_unplug(r_queue);
-
-                       rdev_dec_pending(rdev, mddev);
-                       rcu_read_lock();
-               }
-       }
-       rcu_read_unlock();
-}
-
-static void raid5_unplug_device(struct request_queue *q)
-{
-       mddev_t *mddev = q->queuedata;
-       raid5_conf_t *conf = mddev->private;
-       unsigned long flags;
-
-       spin_lock_irqsave(&conf->device_lock, flags);
-
-       if (blk_remove_plug(q)) {
-               conf->seq_flush++;
-               raid5_activate_delayed(conf);
-       }
-       md_wakeup_thread(mddev->thread);
-
-       spin_unlock_irqrestore(&conf->device_lock, flags);
-
-       unplug_slaves(mddev);
-}
-
-static int raid5_congested(void *data, int bits)
+int md_raid5_congested(mddev_t *mddev, int bits)
 {
-       mddev_t *mddev = data;
        raid5_conf_t *conf = mddev->private;
 
        /* No difference between reads and writes.  Just check
         * how busy the stripe_cache is
         */
 
-       if (mddev_congested(mddev, bits))
-               return 1;
        if (conf->inactive_blocked)
                return 1;
        if (conf->quiesce)
@@ -3599,6 +3663,15 @@ static int raid5_congested(void *data, int bits)
 
        return 0;
 }
+EXPORT_SYMBOL_GPL(md_raid5_congested);
+
+static int raid5_congested(void *data, int bits)
+{
+       mddev_t *mddev = data;
+
+       return mddev_congested(mddev, bits) ||
+               md_raid5_congested(mddev, bits);
+}
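Splitting a typed, exported md_raid5_congested() from the void*-shaped raid5_congested() adapter keeps the block layer's callback signature while letting other code (the export suggests stacked users that manage the mddev themselves) query congestion directly. The same adapter pattern in miniature, with invented types:

#include <stdio.h>

struct mddev_demo { int busy; };

/* exported, typed entry point */
static int md_congested_demo(struct mddev_demo *mddev, int bits)
{
	return mddev->busy;
}

/* callback-shaped adapter: recovers the type from 'data' */
static int congested_demo(void *data, int bits)
{
	return md_congested_demo((struct mddev_demo *)data, bits);
}

int main(void)
{
	struct mddev_demo m = { .busy = 1 };

	printf("%d\n", congested_demo(&m, 0));	/* 1 */
	return 0;
}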
 
 /* We want read requests to align with chunks where possible,
  * but write requests don't need to.
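The read fast path below only fires when the whole bio sits inside one chunk, i.e. maps to exactly one member disk. A standalone version of that boundary test, modeled on the driver's helper and assuming (as the setup code enforces) a power-of-two chunk size:

#include <stdio.h>
#include <stdint.h>

static int in_chunk_boundary(uint64_t sector, unsigned int bio_sectors,
			     unsigned int chunk_sectors)
{
	return chunk_sectors >=
	       ((sector & (chunk_sectors - 1)) + bio_sectors);
}

int main(void)
{
	/* 128-sector (64KiB) chunks */
	printf("%d\n", in_chunk_boundary(0, 8, 128));	 /* 1: fits */
	printf("%d\n", in_chunk_boundary(120, 16, 128)); /* 0: straddles */
	return 0;
}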
@@ -3697,10 +3770,10 @@ static void raid5_align_endio(struct bio *bi, int error)
 
        bio_put(bi);
 
-       mddev = raid_bi->bi_bdev->bd_disk->queue->queuedata;
-       conf = mddev->private;
        rdev = (void*)raid_bi->bi_next;
        raid_bi->bi_next = NULL;
+       mddev = rdev->mddev;
+       conf = mddev->private;
 
        rdev_dec_pending(rdev, conf->mddev);
 
@@ -3724,7 +3797,7 @@ static int bio_fits_rdev(struct bio *bi)
        if ((bi->bi_size>>9) > queue_max_sectors(q))
                return 0;
        blk_recount_segments(q, bi);
-       if (bi->bi_phys_segments > queue_max_phys_segments(q))
+       if (bi->bi_phys_segments > queue_max_segments(q))
                return 0;
 
        if (q->merge_bvec_fn)
@@ -3737,11 +3810,10 @@ static int bio_fits_rdev(struct bio *bi)
 }
 
 
-static int chunk_aligned_read(struct request_queue *q, struct bio * raid_bio)
+static int chunk_aligned_read(mddev_t *mddev, struct bio * raid_bio)
 {
-       mddev_t *mddev = q->queuedata;
        raid5_conf_t *conf = mddev->private;
-       unsigned int dd_idx;
+       int dd_idx;
        struct bio* align_bi;
        mdk_rdev_t *rdev;
 
@@ -3750,9 +3822,9 @@ static int chunk_aligned_read(struct request_queue *q, struct bio * raid_bio)
                return 0;
        }
        /*
-        * use bio_clone to make a copy of the bio
+        * use bio_clone_mddev to make a copy of the bio
         */
-       align_bi = bio_clone(raid_bio, GFP_NOIO);
+       align_bi = bio_clone_mddev(raid_bio, GFP_NOIO, mddev);
        if (!align_bi)
                return 0;
        /*
@@ -3776,7 +3848,6 @@ static int chunk_aligned_read(struct request_queue *q, struct bio * raid_bio)
                raid_bio->bi_next = (void*)rdev;
                align_bi->bi_bdev =  rdev->bdev;
                align_bi->bi_flags &= ~(1 << BIO_SEG_VALID);
-               align_bi->bi_sector += rdev->data_offset;
 
                if (!bio_fits_rdev(align_bi)) {
                        /* too big in some way */
@@ -3785,6 +3856,9 @@ static int chunk_aligned_read(struct request_queue *q, struct bio * raid_bio)
                        return 0;
                }
 
+               /* No reshape active, so we can trust rdev->data_offset */
+               align_bi->bi_sector += rdev->data_offset;
+
                spin_lock_irq(&conf->device_lock);
                wait_event_lock_irq(conf->wait_for_stripe,
                                    conf->quiesce == 0,
@@ -3854,33 +3928,27 @@ static struct stripe_head *__get_priority_stripe(raid5_conf_t *conf)
        return sh;
 }
 
-static int make_request(struct request_queue *q, struct bio * bi)
+static int make_request(mddev_t *mddev, struct bio * bi)
 {
-       mddev_t *mddev = q->queuedata;
        raid5_conf_t *conf = mddev->private;
        int dd_idx;
        sector_t new_sector;
        sector_t logical_sector, last_sector;
        struct stripe_head *sh;
        const int rw = bio_data_dir(bi);
-       int cpu, remaining;
+       int remaining;
+       int plugged;
 
-       if (unlikely(bio_rw_flagged(bi, BIO_RW_BARRIER))) {
-               bio_endio(bi, -EOPNOTSUPP);
+       if (unlikely(bi->bi_rw & REQ_FLUSH)) {
+               md_flush_request(mddev, bi);
                return 0;
        }
 
        md_write_start(mddev, bi);
 
-       cpu = part_stat_lock();
-       part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]);
-       part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw],
-                     bio_sectors(bi));
-       part_stat_unlock();
-
        if (rw == READ &&
             mddev->reshape_position == MaxSector &&
-            chunk_aligned_read(q,bi))
+            chunk_aligned_read(mddev,bi))
                return 0;
 
        logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1);
@@ -3888,6 +3956,7 @@ static int make_request(struct request_queue *q, struct bio * bi)
        bi->bi_next = NULL;
        bi->bi_phys_segments = 1;       /* over-loaded to count active stripes */
 
+       plugged = mddev_check_plugged(mddev);
        for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) {
                DEFINE_WAIT(w);
                int disks, data_disks;
@@ -3901,7 +3970,7 @@ static int make_request(struct request_queue *q, struct bio * bi)
                        /* spinlock is needed as reshape_progress may be
                         * 64bit on a 32bit platform, and so it might be
                         * possible to see a half-updated value
-                        * Ofcourse reshape_progress could change after
+                        * Of course reshape_progress could change after
                         * the lock is dropped, so once we get a reference
                         * to the stripe that we think it is, we will have
                         * to check again.
@@ -3928,7 +3997,7 @@ static int make_request(struct request_queue *q, struct bio * bi)
                new_sector = raid5_compute_sector(conf, logical_sector,
                                                  previous,
                                                  &dd_idx, NULL);
-               pr_debug("raid5: make_request, sector %llu logical %llu\n",
+               pr_debug("raid456: make_request, sector %llu logical %llu\n",
                        (unsigned long long)new_sector, 
                        (unsigned long long)logical_sector);
 
@@ -3982,7 +4051,7 @@ static int make_request(struct request_queue *q, struct bio * bi)
                                 * add failed due to overlap.  Flush everything
                                 * and wait a while
                                 */
-                               raid5_unplug_device(mddev->queue);
+                               md_wakeup_thread(mddev->thread);
                                release_stripe(sh);
                                schedule();
                                goto retry;
@@ -3990,6 +4059,9 @@ static int make_request(struct request_queue *q, struct bio * bi)
                        finish_wait(&conf->wait_for_overlap, &w);
                        set_bit(STRIPE_HANDLE, &sh->state);
                        clear_bit(STRIPE_DELAYED, &sh->state);
+                       if ((bi->bi_rw & REQ_SYNC) &&
+                           !test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
+                               atomic_inc(&conf->preread_active_stripes);
                        release_stripe(sh);
                } else {
                        /* cannot get stripe for read-ahead, just give-up */
@@ -3999,6 +4071,9 @@ static int make_request(struct request_queue *q, struct bio * bi)
                }
                        
        }
+       if (!plugged)
+               md_wakeup_thread(mddev->thread);
+
        spin_lock_irq(&conf->device_lock);
        remaining = raid5_dec_bi_phys_segments(bi);
        spin_unlock_irq(&conf->device_lock);
@@ -4009,6 +4084,7 @@ static int make_request(struct request_queue *q, struct bio * bi)
 
                bio_endio(bi, 0);
        }
+
        return 0;
 }
 
@@ -4025,7 +4101,7 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped
         * As the reads complete, handle_stripe will copy the data
         * into the destination stripe and release that stripe.
         */
-       raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
+       raid5_conf_t *conf = mddev->private;
        struct stripe_head *sh;
        sector_t first_sector, last_sector;
        int raid_disks = conf->previous_raid_disks;
@@ -4114,7 +4190,7 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped
                wait_event(conf->wait_for_overlap,
                           atomic_read(&conf->reshape_stripes)==0);
                mddev->reshape_position = conf->reshape_progress;
-               mddev->curr_resync_completed = mddev->curr_resync;
+               mddev->curr_resync_completed = sector_nr;
                conf->reshape_checkpoint = jiffies;
                set_bit(MD_CHANGE_DEVS, &mddev->flags);
                md_wakeup_thread(mddev->thread);
@@ -4215,7 +4291,7 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped
                wait_event(conf->wait_for_overlap,
                           atomic_read(&conf->reshape_stripes) == 0);
                mddev->reshape_position = conf->reshape_progress;
-               mddev->curr_resync_completed = mddev->curr_resync + reshape_sectors;
+               mddev->curr_resync_completed = sector_nr;
                conf->reshape_checkpoint = jiffies;
                set_bit(MD_CHANGE_DEVS, &mddev->flags);
                md_wakeup_thread(mddev->thread);
@@ -4234,16 +4310,15 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped
 /* FIXME go_faster isn't used */
 static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster)
 {
-       raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
+       raid5_conf_t *conf = mddev->private;
        struct stripe_head *sh;
        sector_t max_sector = mddev->dev_sectors;
-       int sync_blocks;
+       sector_t sync_blocks;
        int still_degraded = 0;
        int i;
 
        if (sector_nr >= max_sector) {
                /* just being told to finish up .. nothing much to do */
-               unplug_slaves(mddev);
 
                if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) {
                        end_reshape(conf);
@@ -4400,24 +4475,30 @@ static void raid5d(mddev_t *mddev)
        struct stripe_head *sh;
        raid5_conf_t *conf = mddev->private;
        int handled;
+       struct blk_plug plug;
 
        pr_debug("+++ raid5d active\n");
 
        md_check_recovery(mddev);
 
+       blk_start_plug(&plug);
        handled = 0;
        spin_lock_irq(&conf->device_lock);
        while (1) {
                struct bio *bio;
 
-               if (conf->seq_flush != conf->seq_write) {
-                       int seq = conf->seq_flush;
+               if (atomic_read(&mddev->plug_cnt) == 0 &&
+                   !list_empty(&conf->bitmap_list)) {
+                       /* Now is a good time to flush some bitmap updates */
+                       conf->seq_flush++;
                        spin_unlock_irq(&conf->device_lock);
                        bitmap_unplug(mddev->bitmap);
                        spin_lock_irq(&conf->device_lock);
-                       conf->seq_write = seq;
+                       conf->seq_write = conf->seq_flush;
                        activate_bit_delay(conf);
                }
+               if (atomic_read(&mddev->plug_cnt) == 0)
+                       raid5_activate_delayed(conf);
 
                while ((bio = remove_bio_from_retry(conf))) {
                        int ok;
@@ -4447,7 +4528,7 @@ static void raid5d(mddev_t *mddev)
        spin_unlock_irq(&conf->device_lock);
 
        async_tx_issue_pending_all();
-       unplug_slaves(mddev);
+       blk_finish_plug(&plug);
 
        pr_debug("--- raid5d inactive\n");
 }
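The blk_start_plug()/blk_finish_plug() pair above replaces the deleted unplug_slaves()/raid5_unplug_device() machinery: batching moves from a per-queue plug with explicit unplug callbacks to an on-stack, per-task plug. blk_start_plug() and blk_finish_plug() are the real block-layer calls, but the fragment below is only a schematic of the pattern raid5d now follows, not a buildable module:

#include <linux/blkdev.h>

static void service_loop(void)
{
	struct blk_plug plug;

	blk_start_plug(&plug);	/* I/O issued from here on is batched
				 * on the current task */
	/* ... handle stripes, submit bios via generic_make_request() ... */
	blk_finish_plug(&plug);	/* push the whole batch to member disks */
}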
@@ -4462,23 +4543,15 @@ raid5_show_stripe_cache_size(mddev_t *mddev, char *page)
                return 0;
 }
 
-static ssize_t
-raid5_store_stripe_cache_size(mddev_t *mddev, const char *page, size_t len)
+int
+raid5_set_cache_size(mddev_t *mddev, int size)
 {
        raid5_conf_t *conf = mddev->private;
-       unsigned long new;
        int err;
 
-       if (len >= PAGE_SIZE)
-               return -EINVAL;
-       if (!conf)
-               return -ENODEV;
-
-       if (strict_strtoul(page, 10, &new))
-               return -EINVAL;
-       if (new <= 16 || new > 32768)
+       if (size <= 16 || size > 32768)
                return -EINVAL;
-       while (new < conf->max_nr_stripes) {
+       while (size < conf->max_nr_stripes) {
                if (drop_one_stripe(conf))
                        conf->max_nr_stripes--;
                else
@@ -4487,11 +4560,32 @@ raid5_store_stripe_cache_size(mddev_t *mddev, const char *page, size_t len)
        err = md_allow_write(mddev);
        if (err)
                return err;
-       while (new > conf->max_nr_stripes) {
+       while (size > conf->max_nr_stripes) {
                if (grow_one_stripe(conf))
                        conf->max_nr_stripes++;
                else break;
        }
+       return 0;
+}
+EXPORT_SYMBOL(raid5_set_cache_size);
+
+static ssize_t
+raid5_store_stripe_cache_size(mddev_t *mddev, const char *page, size_t len)
+{
+       raid5_conf_t *conf = mddev->private;
+       unsigned long new;
+       int err;
+
+       if (len >= PAGE_SIZE)
+               return -EINVAL;
+       if (!conf)
+               return -ENODEV;
+
+       if (strict_strtoul(page, 10, &new))
+               return -EINVAL;
+       err = raid5_set_cache_size(mddev, new);
+       if (err)
+               return err;
        return len;
 }
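With the resize logic factored into raid5_set_cache_size(), the sysfs store above reduces to parsing plus delegation, and the userspace knob is unchanged: for example (path assuming an array named md0) `echo 4096 > /sys/block/md0/md/stripe_cache_size` grows or shrinks the cache one stripe_head at a time, within the 17..32768 range checked above.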
 
@@ -4627,7 +4721,7 @@ static int raid456_cpu_notify(struct notifier_block *nfb, unsigned long action,
                        kfree(percpu->scribble);
                        pr_err("%s: failed memory allocation for cpu%ld\n",
                               __func__, cpu);
-                       return NOTIFY_BAD;
+                       return notifier_from_errno(-ENOMEM);
                }
                break;
        case CPU_DEAD:
@@ -4648,7 +4742,7 @@ static int raid5_alloc_percpu(raid5_conf_t *conf)
 {
        unsigned long cpu;
        struct page *spare_page;
-       struct raid5_percpu *allcpus;
+       struct raid5_percpu __percpu *allcpus;
        void *scribble;
        int err;
 
@@ -4696,7 +4790,7 @@ static raid5_conf_t *setup_conf(mddev_t *mddev)
        if (mddev->new_level != 5
            && mddev->new_level != 4
            && mddev->new_level != 6) {
-               printk(KERN_ERR "raid5: %s: raid level not set to 4/5/6 (%d)\n",
+               printk(KERN_ERR "md/raid:%s: raid level not set to 4/5/6 (%d)\n",
                       mdname(mddev), mddev->new_level);
                return ERR_PTR(-EIO);
        }
@@ -4704,12 +4798,12 @@ static raid5_conf_t *setup_conf(mddev_t *mddev)
             && !algorithm_valid_raid5(mddev->new_layout)) ||
            (mddev->new_level == 6
             && !algorithm_valid_raid6(mddev->new_layout))) {
-               printk(KERN_ERR "raid5: %s: layout %d not supported\n",
+               printk(KERN_ERR "md/raid:%s: layout %d not supported\n",
                       mdname(mddev), mddev->new_layout);
                return ERR_PTR(-EIO);
        }
        if (mddev->new_level == 6 && mddev->raid_disks < 4) {
-               printk(KERN_ERR "raid6: not enough configured devices for %s (%d, minimum 4)\n",
+               printk(KERN_ERR "md/raid:%s: not enough configured devices (%d, minimum 4)\n",
                       mdname(mddev), mddev->raid_disks);
                return ERR_PTR(-EINVAL);
        }
@@ -4717,8 +4811,8 @@ static raid5_conf_t *setup_conf(mddev_t *mddev)
        if (!mddev->new_chunk_sectors ||
            (mddev->new_chunk_sectors << 9) % PAGE_SIZE ||
            !is_power_of_2(mddev->new_chunk_sectors)) {
-               printk(KERN_ERR "raid5: invalid chunk size %d for %s\n",
-                      mddev->new_chunk_sectors << 9, mdname(mddev));
+               printk(KERN_ERR "md/raid:%s: invalid chunk size %d\n",
+                      mdname(mddev), mddev->new_chunk_sectors << 9);
                return ERR_PTR(-EINVAL);
        }
 
@@ -4760,7 +4854,7 @@ static raid5_conf_t *setup_conf(mddev_t *mddev)
        if (raid5_alloc_percpu(conf) != 0)
                goto abort;
 
-       pr_debug("raid5: run(%s) called.\n", mdname(mddev));
+       pr_debug("raid456: run(%s) called.\n", mdname(mddev));
 
        list_for_each_entry(rdev, &mddev->disks, same_set) {
                raid_disk = rdev->raid_disk;
@@ -4773,10 +4867,10 @@ static raid5_conf_t *setup_conf(mddev_t *mddev)
 
                if (test_bit(In_sync, &rdev->flags)) {
                        char b[BDEVNAME_SIZE];
-                       printk(KERN_INFO "raid5: device %s operational as raid"
-                               " disk %d\n", bdevname(rdev->bdev,b),
-                               raid_disk);
-               } else
+                       printk(KERN_INFO "md/raid:%s: device %s operational as raid"
+                              " disk %d\n",
+                              mdname(mddev), bdevname(rdev->bdev, b), raid_disk);
+               } else if (rdev->saved_raid_disk != raid_disk)
                        /* Cannot rely on bitmap to complete recovery */
                        conf->fullsync = 1;
        }
@@ -4799,16 +4893,17 @@ static raid5_conf_t *setup_conf(mddev_t *mddev)
                 max_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024;
        if (grow_stripes(conf, conf->max_nr_stripes)) {
                printk(KERN_ERR
-                       "raid5: couldn't allocate %dkB for buffers\n", memory);
+                      "md/raid:%s: couldn't allocate %dkB for buffers\n",
+                      mdname(mddev), memory);
                goto abort;
        } else
-               printk(KERN_INFO "raid5: allocated %dkB for %s\n",
-                       memory, mdname(mddev));
+               printk(KERN_INFO "md/raid:%s: allocated %dkB\n",
+                      mdname(mddev), memory);
 
        conf->thread = md_register_thread(raid5d, mddev, NULL);
        if (!conf->thread) {
                printk(KERN_ERR
-                      "raid5: couldn't allocate thread for %s\n",
+                      "md/raid:%s: couldn't allocate thread.\n",
                       mdname(mddev));
                goto abort;
        }
@@ -4853,13 +4948,13 @@ static int only_parity(int raid_disk, int algo, int raid_disks, int max_degraded
 static int run(mddev_t *mddev)
 {
        raid5_conf_t *conf;
-       int working_disks = 0, chunk_size;
+       int working_disks = 0;
        int dirty_parity_disks = 0;
        mdk_rdev_t *rdev;
        sector_t reshape_offset = 0;
 
        if (mddev->recovery_cp != MaxSector)
-               printk(KERN_NOTICE "raid5: %s is not clean"
+               printk(KERN_NOTICE "md/raid:%s: not clean"
                       " -- starting background reconstruction\n",
                       mdname(mddev));
        if (mddev->reshape_position != MaxSector) {
@@ -4873,7 +4968,7 @@ static int run(mddev_t *mddev)
                int max_degraded = (mddev->level == 6 ? 2 : 1);
 
                if (mddev->new_level != mddev->level) {
-                       printk(KERN_ERR "raid5: %s: unsupported reshape "
+                       printk(KERN_ERR "md/raid:%s: unsupported reshape "
                               "required - aborting.\n",
                               mdname(mddev));
                        return -EINVAL;
@@ -4886,8 +4981,8 @@ static int run(mddev_t *mddev)
                here_new = mddev->reshape_position;
                if (sector_div(here_new, mddev->new_chunk_sectors *
                               (mddev->raid_disks - max_degraded))) {
-                       printk(KERN_ERR "raid5: reshape_position not "
-                              "on a stripe boundary\n");
+                       printk(KERN_ERR "md/raid:%s: reshape_position not "
+                              "on a stripe boundary\n", mdname(mddev));
                        return -EINVAL;
                }
                reshape_offset = here_new * mddev->new_chunk_sectors;
@@ -4908,8 +5003,9 @@ static int run(mddev_t *mddev)
                        if ((here_new * mddev->new_chunk_sectors != 
                             here_old * mddev->chunk_sectors) ||
                            mddev->ro == 0) {
-                               printk(KERN_ERR "raid5: in-place reshape must be started"
-                                      " in read-only mode - aborting\n");
+                               printk(KERN_ERR "md/raid:%s: in-place reshape must be started"
+                                      " in read-only mode - aborting\n",
+                                      mdname(mddev));
                                return -EINVAL;
                        }
                } else if (mddev->delta_disks < 0
@@ -4918,11 +5014,13 @@ static int run(mddev_t *mddev)
                    : (here_new * mddev->new_chunk_sectors >=
                       here_old * mddev->chunk_sectors)) {
                        /* Reading from the same stripe as writing to - bad */
-                       printk(KERN_ERR "raid5: reshape_position too early for "
-                              "auto-recovery - aborting.\n");
+                       printk(KERN_ERR "md/raid:%s: reshape_position too early for "
+                              "auto-recovery - aborting.\n",
+                              mdname(mddev));
                        return -EINVAL;
                }
-               printk(KERN_INFO "raid5: reshape will continue\n");
+               printk(KERN_INFO "md/raid:%s: reshape will continue\n",
+                      mdname(mddev));
                /* OK, we should be able to continue; */
        } else {
                BUG_ON(mddev->level != mddev->new_level);
@@ -4949,8 +5047,10 @@ static int run(mddev_t *mddev)
        list_for_each_entry(rdev, &mddev->disks, same_set) {
                if (rdev->raid_disk < 0)
                        continue;
-               if (test_bit(In_sync, &rdev->flags))
+               if (test_bit(In_sync, &rdev->flags)) {
                        working_disks++;
+                       continue;
+               }
                /* This disc is not fully in-sync.  However if it
                 * just stored parity (beyond the recovery_offset),
                 * then we don't need to be concerned about the
@@ -4964,18 +5064,6 @@ static int run(mddev_t *mddev)
                    mddev->minor_version > 90)
                        rdev->recovery_offset = reshape_offset;
                        
-               printk("%d: w=%d pa=%d pr=%d m=%d a=%d r=%d op1=%d op2=%d\n",
-                      rdev->raid_disk, working_disks, conf->prev_algo,
-                      conf->previous_raid_disks, conf->max_degraded,
-                      conf->algorithm, conf->raid_disks, 
-                      only_parity(rdev->raid_disk,
-                                  conf->prev_algo,
-                                  conf->previous_raid_disks,
-                                  conf->max_degraded),
-                      only_parity(rdev->raid_disk,
-                                  conf->algorithm,
-                                  conf->raid_disks,
-                                  conf->max_degraded));
                if (rdev->recovery_offset < reshape_offset) {
                        /* We need to check old and new layout */
                        if (!only_parity(rdev->raid_disk,
@@ -4995,8 +5083,8 @@ static int run(mddev_t *mddev)
        mddev->degraded = (max(conf->raid_disks, conf->previous_raid_disks)
                           - working_disks);
 
-       if (mddev->degraded > conf->max_degraded) {
-               printk(KERN_ERR "raid5: not enough operational devices for %s"
+       if (has_failed(conf)) {
+               printk(KERN_ERR "md/raid:%s: not enough operational devices"
                        " (%d/%d failed)\n",
                        mdname(mddev), mddev->degraded, conf->raid_disks);
                goto abort;
@@ -5010,32 +5098,32 @@ static int run(mddev_t *mddev)
            mddev->recovery_cp != MaxSector) {
                if (mddev->ok_start_degraded)
                        printk(KERN_WARNING
-                              "raid5: starting dirty degraded array: %s"
-                              "- data corruption possible.\n",
+                              "md/raid:%s: starting dirty degraded array"
+                              " - data corruption possible.\n",
                               mdname(mddev));
                else {
                        printk(KERN_ERR
-                              "raid5: cannot start dirty degraded array for %s\n",
+                              "md/raid:%s: cannot start dirty degraded array.\n",
                               mdname(mddev));
                        goto abort;
                }
        }
 
        if (mddev->degraded == 0)
-               printk("raid5: raid level %d set %s active with %d out of %d"
-                      " devices, algorithm %d\n", conf->level, mdname(mddev),
+               printk(KERN_INFO "md/raid:%s: raid level %d active with %d out of %d"
+                      " devices, algorithm %d\n", mdname(mddev), conf->level,
                       mddev->raid_disks-mddev->degraded, mddev->raid_disks,
                       mddev->new_layout);
        else
-               printk(KERN_ALERT "raid5: raid level %d set %s active with %d"
-                       " out of %d devices, algorithm %d\n", conf->level,
-                       mdname(mddev), mddev->raid_disks - mddev->degraded,
-                       mddev->raid_disks, mddev->new_layout);
+               printk(KERN_ALERT "md/raid:%s: raid level %d active with %d"
+                      " out of %d devices, algorithm %d\n",
+                      mdname(mddev), conf->level,
+                      mddev->raid_disks - mddev->degraded,
+                      mddev->raid_disks, mddev->new_layout);
 
        print_raid5_conf(conf);
 
        if (conf->reshape_progress != MaxSector) {
-               printk("...ok start reshape thread\n");
                conf->reshape_safe = conf->reshape_progress;
                atomic_set(&conf->reshape_stripes, 0);
                clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
@@ -5046,67 +5134,66 @@ static int run(mddev_t *mddev)
                                                        "reshape");
        }
 
-       /* read-ahead size must cover two whole stripes, which is
-        * 2 * (datadisks) * chunksize where 'n' is the number of raid devices
-        */
-       {
-               int data_disks = conf->previous_raid_disks - conf->max_degraded;
-               int stripe = data_disks *
-                       ((mddev->chunk_sectors << 9) / PAGE_SIZE);
-               if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe)
-                       mddev->queue->backing_dev_info.ra_pages = 2 * stripe;
-       }
 
        /* Ok, everything is just fine now */
-       if (sysfs_create_group(&mddev->kobj, &raid5_attrs_group))
+       if (mddev->to_remove == &raid5_attrs_group)
+               mddev->to_remove = NULL;
+       else if (mddev->kobj.sd &&
+           sysfs_create_group(&mddev->kobj, &raid5_attrs_group))
                printk(KERN_WARNING
                       "raid5: failed to create sysfs attributes for %s\n",
                       mdname(mddev));
+       md_set_array_sectors(mddev, raid5_size(mddev, 0, 0));
 
-       mddev->queue->queue_lock = &conf->device_lock;
+       if (mddev->queue) {
+               int chunk_size;
+               /* read-ahead size must cover two whole stripes, which
+                * is 2 * (datadisks) * chunksize, where 'datadisks' is
+                * the number of raid devices minus the parity devices
+                */
+               int data_disks = conf->previous_raid_disks - conf->max_degraded;
+               int stripe = data_disks *
+                       ((mddev->chunk_sectors << 9) / PAGE_SIZE);
+               if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe)
+                       mddev->queue->backing_dev_info.ra_pages = 2 * stripe;
 
-       mddev->queue->unplug_fn = raid5_unplug_device;
-       mddev->queue->backing_dev_info.congested_data = mddev;
-       mddev->queue->backing_dev_info.congested_fn = raid5_congested;
+               blk_queue_merge_bvec(mddev->queue, raid5_mergeable_bvec);
 
-       md_set_array_sectors(mddev, raid5_size(mddev, 0, 0));
+               mddev->queue->backing_dev_info.congested_data = mddev;
+               mddev->queue->backing_dev_info.congested_fn = raid5_congested;
 
-       blk_queue_merge_bvec(mddev->queue, raid5_mergeable_bvec);
-       chunk_size = mddev->chunk_sectors << 9;
-       blk_queue_io_min(mddev->queue, chunk_size);
-       blk_queue_io_opt(mddev->queue, chunk_size *
-                        (conf->raid_disks - conf->max_degraded));
+               chunk_size = mddev->chunk_sectors << 9;
+               blk_queue_io_min(mddev->queue, chunk_size);
+               blk_queue_io_opt(mddev->queue, chunk_size *
+                                (conf->raid_disks - conf->max_degraded));
 
-       list_for_each_entry(rdev, &mddev->disks, same_set)
-               disk_stack_limits(mddev->gendisk, rdev->bdev,
-                                 rdev->data_offset << 9);
+               list_for_each_entry(rdev, &mddev->disks, same_set)
+                       disk_stack_limits(mddev->gendisk, rdev->bdev,
+                                         rdev->data_offset << 9);
+       }
 
        return 0;
 abort:
-       md_unregister_thread(mddev->thread);
-       mddev->thread = NULL;
+       md_unregister_thread(&mddev->thread);
        if (conf) {
                print_raid5_conf(conf);
                free_conf(conf);
        }
        mddev->private = NULL;
-       printk(KERN_ALERT "raid5: failed to run raid set %s\n", mdname(mddev));
+       printk(KERN_ALERT "md/raid:%s: failed to run raid set.\n", mdname(mddev));
        return -EIO;
 }
 
-
-
 static int stop(mddev_t *mddev)
 {
-       raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
+       raid5_conf_t *conf = mddev->private;
 
-       md_unregister_thread(mddev->thread);
-       mddev->thread = NULL;
-       mddev->queue->backing_dev_info.congested_fn = NULL;
-       blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
-       sysfs_remove_group(&mddev->kobj, &raid5_attrs_group);
+       md_unregister_thread(&mddev->thread);
+       if (mddev->queue)
+               mddev->queue->backing_dev_info.congested_fn = NULL;
        free_conf(conf);
        mddev->private = NULL;
+       mddev->to_remove = &raid5_attrs_group;
        return 0;
 }
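
To make the read-ahead rule in the new mddev->queue block concrete, a worked example with illustrative numbers: a 4-device RAID5 has 3 data disks, so with a 512 KiB chunk (chunk_sectors = 1024) and 4 KiB pages

	stripe   = 3 * ((1024 << 9) / 4096) = 3 * 128 = 384 pages
	ra_pages = 2 * stripe               = 768 pages = 3 MiB

i.e. read-ahead is raised to cover two full data stripes.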
 
@@ -5147,7 +5234,7 @@ static void printall(struct seq_file *seq, raid5_conf_t *conf)
 
 static void status(struct seq_file *seq, mddev_t *mddev)
 {
-       raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
+       raid5_conf_t *conf = mddev->private;
        int i;
 
        seq_printf(seq, " level %d, %dk chunk, algorithm %d", mddev->level,
@@ -5169,21 +5256,22 @@ static void print_raid5_conf (raid5_conf_t *conf)
        int i;
        struct disk_info *tmp;
 
-       printk("RAID5 conf printout:\n");
+       printk(KERN_DEBUG "RAID conf printout:\n");
        if (!conf) {
                printk("(conf==NULL)\n");
                return;
        }
-       printk(" --- rd:%d wd:%d\n", conf->raid_disks,
-                conf->raid_disks - conf->mddev->degraded);
+       printk(KERN_DEBUG " --- level:%d rd:%d wd:%d\n", conf->level,
+              conf->raid_disks,
+              conf->raid_disks - conf->mddev->degraded);
 
        for (i = 0; i < conf->raid_disks; i++) {
                char b[BDEVNAME_SIZE];
                tmp = conf->disks + i;
                if (tmp->rdev)
-               printk(" disk %d, o:%d, dev:%s\n",
-                       i, !test_bit(Faulty, &tmp->rdev->flags),
-                       bdevname(tmp->rdev->bdev,b));
+                       printk(KERN_DEBUG " disk %d, o:%d, dev:%s\n",
+                              i, !test_bit(Faulty, &tmp->rdev->flags),
+                              bdevname(tmp->rdev->bdev, b));
        }
 }
 
@@ -5192,20 +5280,24 @@ static int raid5_spare_active(mddev_t *mddev)
        int i;
        raid5_conf_t *conf = mddev->private;
        struct disk_info *tmp;
+       int count = 0;
+       unsigned long flags;
 
        for (i = 0; i < conf->raid_disks; i++) {
                tmp = conf->disks + i;
                if (tmp->rdev
+                   && tmp->rdev->recovery_offset == MaxSector
                    && !test_bit(Faulty, &tmp->rdev->flags)
                    && !test_and_set_bit(In_sync, &tmp->rdev->flags)) {
-                       unsigned long flags;
-                       spin_lock_irqsave(&conf->device_lock, flags);
-                       mddev->degraded--;
-                       spin_unlock_irqrestore(&conf->device_lock, flags);
+                       count++;
+                       sysfs_notify_dirent_safe(tmp->rdev->sysfs_state);
                }
        }
+       spin_lock_irqsave(&conf->device_lock, flags);
+       mddev->degraded -= count;
+       spin_unlock_irqrestore(&conf->device_lock, flags);
        print_raid5_conf(conf);
-       return 0;
+       return count;
 }
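
The reworked raid5_spare_active() takes device_lock once for the whole scan instead of once per activated device, and reports how many spares became active rather than always returning 0. A simplified sketch of how the md core consumes that count (illustrative, based on md_check_recovery()):

	/* a nonzero count means ->degraded changed, so poke anyone
	 * watching the sysfs attribute
	 */
	if (mddev->pers->spare_active(mddev))
		sysfs_notify(&mddev->kobj, NULL, "degraded");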
 
 static int raid5_remove_disk(mddev_t *mddev, int number)
@@ -5231,7 +5323,7 @@ static int raid5_remove_disk(mddev_t *mddev, int number)
                 * isn't possible.
                 */
                if (!test_bit(Faulty, &rdev->flags) &&
-                   mddev->degraded <= conf->max_degraded &&
+                   !has_failed(conf) &&
                    number < conf->raid_disks) {
                        err = -EBUSY;
                        goto abort;
@@ -5259,7 +5351,7 @@ static int raid5_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
        int first = 0;
        int last = conf->raid_disks - 1;
 
-       if (mddev->degraded > conf->max_degraded)
+       if (has_failed(conf))
                /* no point adding a device */
                return -EINVAL;
 
@@ -5306,9 +5398,9 @@ static int raid5_resize(mddev_t *mddev, sector_t sectors)
            raid5_size(mddev, sectors, mddev->raid_disks))
                return -EINVAL;
        set_capacity(mddev->gendisk, mddev->array_sectors);
-       mddev->changed = 1;
        revalidate_disk(mddev->gendisk);
-       if (sectors > mddev->dev_sectors && mddev->recovery_cp == MaxSector) {
+       if (sectors > mddev->dev_sectors &&
+           mddev->recovery_cp > mddev->dev_sectors) {
                mddev->recovery_cp = mddev->dev_sectors;
                set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
        }
@@ -5332,7 +5424,8 @@ static int check_stripe_cache(mddev_t *mddev)
            > conf->max_nr_stripes ||
            ((mddev->new_chunk_sectors << 9) / STRIPE_SIZE) * 4
            > conf->max_nr_stripes) {
-               printk(KERN_WARNING "raid5: reshape: not enough stripes.  Needed %lu\n",
+               printk(KERN_WARNING "md/raid:%s: reshape: not enough stripes.  Needed %lu\n",
+                      mdname(mddev),
                       ((max(mddev->chunk_sectors, mddev->new_chunk_sectors) << 9)
                        / STRIPE_SIZE)*4);
                return 0;
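
A worked example for the stripe-count requirement above, with illustrative numbers: reshaping to or from a 512 KiB chunk (chunk_sectors = 1024) with the usual one-page STRIPE_SIZE needs

	((1024 << 9) / 4096) * 4 = 128 * 4 = 512 stripes,

which exceeds the default cache of 256 (NR_STRIPES), so stripe_cache_size must be raised first.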
@@ -5351,7 +5444,7 @@ static int check_reshape(mddev_t *mddev)
        if (mddev->bitmap)
                /* Cannot grow a bitmap yet */
                return -EBUSY;
-       if (mddev->degraded > conf->max_degraded)
+       if (has_failed(conf))
                return -EINVAL;
        if (mddev->delta_disks < 0) {
                /* We might be able to shrink, but the devices must
@@ -5377,7 +5470,6 @@ static int raid5_start_reshape(mddev_t *mddev)
        raid5_conf_t *conf = mddev->private;
        mdk_rdev_t *rdev;
        int spares = 0;
-       int added_devices = 0;
        unsigned long flags;
 
        if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
@@ -5387,8 +5479,8 @@ static int raid5_start_reshape(mddev_t *mddev)
                return -ENOSPC;
 
        list_for_each_entry(rdev, &mddev->disks, same_set)
-               if (rdev->raid_disk < 0 &&
-                   !test_bit(Faulty, &rdev->flags))
+               if (!test_bit(In_sync, &rdev->flags)
+                   && !test_bit(Faulty, &rdev->flags))
                        spares++;
 
        if (spares - mddev->degraded < mddev->delta_disks - conf->max_degraded)
@@ -5403,7 +5495,7 @@ static int raid5_start_reshape(mddev_t *mddev)
         */
        if (raid5_size(mddev, 0, conf->raid_disks + mddev->delta_disks)
            < mddev->array_sectors) {
-               printk(KERN_ERR "md%s: array size must be reduced "
+               printk(KERN_ERR "md/raid:%s: array size must be reduced "
                       "before number of disks\n", mdname(mddev));
                return -EINVAL;
        }
@@ -5426,32 +5518,40 @@ static int raid5_start_reshape(mddev_t *mddev)
 
        /* Add some new drives, as many as will fit.
         * We know there are enough to make the newly sized array work.
+        * Don't add devices if we are reducing the number of
+        * devices in the array.  This is because it is not possible
+        * to correctly record the "partially reconstructed" state of
+        * such devices during the reshape and confusion could result.
         */
-       list_for_each_entry(rdev, &mddev->disks, same_set)
-               if (rdev->raid_disk < 0 &&
-                   !test_bit(Faulty, &rdev->flags)) {
-                       if (raid5_add_disk(mddev, rdev) == 0) {
-                               char nm[20];
-                               if (rdev->raid_disk >= conf->previous_raid_disks) {
-                                       set_bit(In_sync, &rdev->flags);
-                                       added_devices++;
-                               } else
-                                       rdev->recovery_offset = 0;
-                               sprintf(nm, "rd%d", rdev->raid_disk);
-                               if (sysfs_create_link(&mddev->kobj,
-                                                     &rdev->kobj, nm))
-                                       printk(KERN_WARNING
-                                              "raid5: failed to create "
-                                              " link %s for %s\n",
-                                              nm, mdname(mddev));
-                       } else
-                               break;
-               }
+       if (mddev->delta_disks >= 0) {
+               int added_devices = 0;
+               list_for_each_entry(rdev, &mddev->disks, same_set)
+                       if (rdev->raid_disk < 0 &&
+                           !test_bit(Faulty, &rdev->flags)) {
+                               if (raid5_add_disk(mddev, rdev) == 0) {
+                                       char nm[20];
+                                       if (rdev->raid_disk
+                                           >= conf->previous_raid_disks) {
+                                               set_bit(In_sync, &rdev->flags);
+                                               added_devices++;
+                                       } else
+                                               rdev->recovery_offset = 0;
+                                       sprintf(nm, "rd%d", rdev->raid_disk);
+                                       if (sysfs_create_link(&mddev->kobj,
+                                                             &rdev->kobj, nm))
+                                               /* Failure here is OK */;
+                               }
+                       } else if (rdev->raid_disk >= conf->previous_raid_disks
+                                  && !test_bit(Faulty, &rdev->flags)) {
+                               /* This is a spare that was manually added */
+                               set_bit(In_sync, &rdev->flags);
+                               added_devices++;
+                       }
 
-       /* When a reshape changes the number of devices, ->degraded
-        * is measured against the large of the pre and post number of
-        * devices.*/
-       if (mddev->delta_disks > 0) {
+               /* When a reshape changes the number of devices,
+                * ->degraded is measured against the larger of the
+                * pre and post number of devices.
+                */
                spin_lock_irqsave(&conf->device_lock, flags);
                mddev->degraded += (conf->raid_disks - conf->previous_raid_disks)
                        - added_devices;
@@ -5498,7 +5598,7 @@ static void end_reshape(raid5_conf_t *conf)
                /* read-ahead size must cover two whole stripes, which is
                 * 2 * (datadisks) * chunksize, where 'datadisks' is the number
                 * of raid devices minus the parity devices
                 */
-               {
+               if (conf->mddev->queue) {
                        int data_disks = conf->raid_disks - conf->max_degraded;
                        int stripe = data_disks * ((conf->chunk_sectors << 9)
                                                   / PAGE_SIZE);
@@ -5520,7 +5620,6 @@ static void raid5_finish_reshape(mddev_t *mddev)
                if (mddev->delta_disks > 0) {
                        md_set_array_sectors(mddev, raid5_size(mddev, 0, 0));
                        set_capacity(mddev->gendisk, mddev->array_sectors);
-                       mddev->changed = 1;
                        revalidate_disk(mddev->gendisk);
                } else {
                        int d;
@@ -5585,6 +5684,33 @@ static void raid5_quiesce(mddev_t *mddev, int state)
 }
 
 
+static void *raid45_takeover_raid0(mddev_t *mddev, int level)
+{
+       struct raid0_private_data *raid0_priv = mddev->private;
+       sector_t sectors;
+
+       /* for raid0 takeover only one zone is supported */
+       if (raid0_priv->nr_strip_zones > 1) {
+               printk(KERN_ERR "md/raid:%s: cannot takeover raid0 with more than one zone.\n",
+                      mdname(mddev));
+               return ERR_PTR(-EINVAL);
+       }
+
+       sectors = raid0_priv->strip_zone[0].zone_end;
+       sector_div(sectors, raid0_priv->strip_zone[0].nb_dev);
+       mddev->dev_sectors = sectors;
+       mddev->new_level = level;
+       mddev->new_layout = ALGORITHM_PARITY_N;
+       mddev->new_chunk_sectors = mddev->chunk_sectors;
+       mddev->raid_disks += 1;
+       mddev->delta_disks = 1;
+       /* make sure it will not be marked as dirty */
+       mddev->recovery_cp = MaxSector;
+
+       return setup_conf(mddev);
+}
+
+
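
The geometry math in raid45_takeover_raid0(), with illustrative numbers: a single-zone raid0 of two disks whose zone ends at 2000000 sectors yields 1000000 sectors per device; those devices stay as data disks, and delta_disks = 1 schedules one new disk to carry the parity of the ALGORITHM_PARITY_N layout:

	sectors = 2000000;		/* strip_zone[0].zone_end       */
	sector_div(sectors, 2);		/* nb_dev: 1000000 per device   */
	/* raid_disks: 2 -> 3, the extra disk holds only parity */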
 static void *raid5_takeover_raid1(mddev_t *mddev)
 {
        int chunksect;
@@ -5709,12 +5835,13 @@ static int raid6_check_reshape(mddev_t *mddev)
 static void *raid5_takeover(mddev_t *mddev)
 {
        /* raid5 can take over:
-        *  raid0 - if all devices are the same - make it a raid4 layout
+        *  raid0 - if there is only one strip zone - make it a raid4 layout
         *  raid1 - if there are two drives.  We need to know the chunk size
         *  raid4 - trivial - just use a raid4 layout.
         *  raid6 - Providing it is a *_6 layout
         */
-
+       if (mddev->level == 0)
+               return raid45_takeover_raid0(mddev, 5);
        if (mddev->level == 1)
                return raid5_takeover_raid1(mddev);
        if (mddev->level == 4) {
@@ -5728,6 +5855,22 @@ static void *raid5_takeover(mddev_t *mddev)
        return ERR_PTR(-EINVAL);
 }
 
+static void *raid4_takeover(mddev_t *mddev)
+{
+       /* raid4 can take over:
+        *  raid0 - if there is only one strip zone
+        *  raid5 - if layout is right
+        */
+       if (mddev->level == 0)
+               return raid45_takeover_raid0(mddev, 4);
+       if (mddev->level == 5 &&
+           mddev->layout == ALGORITHM_PARITY_N) {
+               mddev->new_layout = 0;
+               mddev->new_level = 4;
+               return setup_conf(mddev);
+       }
+       return ERR_PTR(-EINVAL);
+}
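
Both ->takeover() entry points are reached from the md core's level_store() when the administrator rewrites the array level; an illustrative invocation via sysfs (device name is an example):

	/* echo raid4 > /sys/block/md0/md/level
	 *
	 * level_store() calls the target personality's ->takeover(),
	 * and on success installs the returned conf and switches
	 * mddev->pers to the new personality.
	 */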
 
 static struct mdk_personality raid5_personality;
 
@@ -5843,6 +5986,7 @@ static struct mdk_personality raid4_personality =
        .start_reshape  = raid5_start_reshape,
        .finish_reshape = raid5_finish_reshape,
        .quiesce        = raid5_quiesce,
+       .takeover       = raid4_takeover,
 };
 
 static int __init raid5_init(void)
@@ -5863,6 +6007,7 @@ static void raid5_exit(void)
 module_init(raid5_init);
 module_exit(raid5_exit);
 MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("RAID4/5/6 (striping with parity) personality for MD");
 MODULE_ALIAS("md-personality-4"); /* RAID5 */
 MODULE_ALIAS("md-raid5");
 MODULE_ALIAS("md-raid4");