md/raid5: fix a possible invalid memory access on unload
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index b98765f6f77fd9f1e11ecdcd8809928e7b821716..aa76865b804b41c2b2af3b78f5b16c83eb034feb 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -296,12 +296,9 @@ static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh,
        BUG_ON(atomic_read(&conf->active_stripes)==0);
        if (test_bit(STRIPE_HANDLE, &sh->state)) {
                if (test_bit(STRIPE_DELAYED, &sh->state) &&
-                   !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
+                   !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
                        list_add_tail(&sh->lru, &conf->delayed_list);
-                       if (atomic_read(&conf->preread_active_stripes)
-                           < IO_THRESHOLD)
-                               md_wakeup_thread(conf->mddev->thread);
-               } else if (test_bit(STRIPE_BIT_DELAY, &sh->state) &&
+               else if (test_bit(STRIPE_BIT_DELAY, &sh->state) &&
                           sh->bm_seq - conf->seq_write > 0)
                        list_add_tail(&sh->lru, &conf->bitmap_list);
                else {
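
(For context: stripes parked on conf->delayed_list above are drained by raid5_activate_delayed(), which already gates on the preread threshold before activating them. Roughly, a sketch of that existing helper, not part of this diff:)

static void raid5_activate_delayed(struct r5conf *conf)
{
        /* only start pre-reads once the in-flight count is below threshold */
        if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) {
                while (!list_empty(&conf->delayed_list)) {
                        struct stripe_head *sh;

                        sh = list_entry(conf->delayed_list.next,
                                        struct stripe_head, lru);
                        list_del_init(&sh->lru);
                        clear_bit(STRIPE_DELAYED, &sh->state);
                        if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
                                atomic_inc(&conf->preread_active_stripes);
                        list_add_tail(&sh->lru, &conf->hold_list);
                }
        }
}
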
@@ -2898,31 +2895,102 @@ static int want_replace(struct stripe_head *sh, int disk_idx)
  * Returns 1 when no more member devices need to be checked, otherwise returns
  * 0 to tell the loop in handle_stripe_fill to continue
  */
-static int fetch_block(struct stripe_head *sh, struct stripe_head_state *s,
-                      int disk_idx, int disks)
+
+static int need_this_block(struct stripe_head *sh, struct stripe_head_state *s,
+                          int disk_idx, int disks)
 {
        struct r5dev *dev = &sh->dev[disk_idx];
        struct r5dev *fdev[2] = { &sh->dev[s->failed_num[0]],
                                  &sh->dev[s->failed_num[1]] };
+       int i;
+
+       if (test_bit(R5_LOCKED, &dev->flags) ||
+           test_bit(R5_UPTODATE, &dev->flags))
+               /* No point reading this as we already have it or have
+                * decided to get it.
+                */
+               return 0;
+
+       if (dev->toread ||
+           (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)))
+               /* We need this block to directly satisfy a request */
+               return 1;
+
+       if (s->syncing || s->expanding ||
+           (s->replacing && want_replace(sh, disk_idx)))
+               /* When syncing or expanding, we read everything.
+                * When replacing, we need the replaced block.
+                */
+               return 1;
+
+       if ((s->failed >= 1 && fdev[0]->toread) ||
+           (s->failed >= 2 && fdev[1]->toread))
+               /* If we want to read from a failed device, then
+                * we need to actually read every other device.
+                */
+               return 1;
+
+       /* Sometimes neither read-modify-write nor reconstruct-write
+        * cycles can work.  In those cases we read every block we
+        * can.  Then the parity-update is certain to have enough to
+        * work with.
+        * This can only be a problem when we need to write something,
+        * and some device has failed.  If either of those tests
+        * fails, we need look no further.
+        */
+       if (!s->failed || !s->to_write)
+               return 0;
+
+       if (test_bit(R5_Insync, &dev->flags) &&
+           !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
+               /* Pre-reads are not permitted until after a short delay
+                * to gather multiple requests.  However, if this
+                * device is not Insync, the block could only be computed
+                * and there is no need to delay that.
+                */
+               return 0;
+
+       for (i = 0; i < s->failed; i++) {
+               if (fdev[i]->towrite &&
+                   !test_bit(R5_UPTODATE, &fdev[i]->flags) &&
+                   !test_bit(R5_OVERWRITE, &fdev[i]->flags))
+                       /* If we have a partial write to a failed
+                        * device, then we will need to reconstruct
+                        * the content of that device, so all other
+                        * devices must be read.
+                        */
+                       return 1;
+       }
+
+       /* If we are forced to do a reconstruct-write, either because
+        * the current RAID6 implementation only supports that, or
+        * because parity cannot be trusted and we are currently
+        * recovering it, we need to be extra careful.
+        * If one of the devices that we would need to read, because
+        * it is not being overwritten (and maybe not written at all),
+        * is missing/faulty, then we need to read everything we can.
+        */
+       if (sh->raid_conf->level != 6 &&
+           sh->sector < sh->raid_conf->mddev->recovery_cp)
+               /* reconstruct-write isn't being forced */
+               return 0;
+       for (i = 0; i < s->failed; i++) {
+               if (!test_bit(R5_UPTODATE, &fdev[i]->flags) &&
+                   !test_bit(R5_OVERWRITE, &fdev[i]->flags))
+                       return 1;
+       }
+
+       return 0;
+}
+
+static int fetch_block(struct stripe_head *sh, struct stripe_head_state *s,
+                      int disk_idx, int disks)
+{
+       struct r5dev *dev = &sh->dev[disk_idx];
 
        /* is the data in this block needed, and can we get it? */
-       if (!test_bit(R5_LOCKED, &dev->flags) &&
-           !test_bit(R5_UPTODATE, &dev->flags) &&
-           (dev->toread ||
-            (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) ||
-            s->syncing || s->expanding ||
-            (s->replacing && want_replace(sh, disk_idx)) ||
-            (s->failed >= 1 && fdev[0]->toread) ||
-            (s->failed >= 2 && fdev[1]->toread) ||
-            (sh->raid_conf->level <= 5 && s->failed && fdev[0]->towrite &&
-             (!test_bit(R5_Insync, &dev->flags) || test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) &&
-             !test_bit(R5_OVERWRITE, &fdev[0]->flags)) ||
-            ((sh->raid_conf->level == 6 ||
-              sh->sector >= sh->raid_conf->mddev->recovery_cp)
-             && s->failed && s->to_write &&
-             (s->to_write - s->non_overwrite <
-              sh->raid_conf->raid_disks - sh->raid_conf->max_degraded) &&
-             (!test_bit(R5_Insync, &dev->flags) || test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))))) {
+       if (need_this_block(sh, s, disk_idx, disks)) {
                /* we would like to get this block, possibly by computing it,
                 * otherwise read it if the backing disk is insync
                 */
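
The doc comment above refers to the loop in handle_stripe_fill(); for reference, that caller looks roughly like this (a sketch of the existing code, not part of this diff):

static void handle_stripe_fill(struct stripe_head *sh,
                               struct stripe_head_state *s,
                               int disks)
{
        int i;

        /* look for blocks to read or compute, unless a check/compute/
         * reconstruct operation is already in flight for this stripe
         */
        if (!test_bit(STRIPE_COMPUTE_RUN, &sh->state) && !sh->check_state &&
            !sh->reconstruct_state)
                for (i = disks; i--; )
                        if (fetch_block(sh, s, i, disks))
                                break;
        set_bit(STRIPE_HANDLE, &sh->state);
}
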
@@ -4081,7 +4149,7 @@ static void activate_bit_delay(struct r5conf *conf,
        }
 }
 
-int md_raid5_congested(struct mddev *mddev, int bits)
+static int raid5_congested(struct mddev *mddev, int bits)
 {
        struct r5conf *conf = mddev->private;
 
@@ -4098,24 +4166,14 @@ int md_raid5_congested(struct mddev *mddev, int bits)
 
        return 0;
 }
-EXPORT_SYMBOL_GPL(md_raid5_congested);
-
-static int raid5_congested(void *data, int bits)
-{
-       struct mddev *mddev = data;
-
-       return mddev_congested(mddev, bits) ||
-               md_raid5_congested(mddev, bits);
-}
 
 /* We want read requests to align with chunks where possible,
  * but write requests don't need to.
  */
-static int raid5_mergeable_bvec(struct request_queue *q,
+static int raid5_mergeable_bvec(struct mddev *mddev,
                                struct bvec_merge_data *bvm,
                                struct bio_vec *biovec)
 {
-       struct mddev *mddev = q->queuedata;
        sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev);
        int max;
        unsigned int chunk_sectors = mddev->chunk_sectors;
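
With raid5_congested() now static and raid5_mergeable_bvec() taking an mddev rather than a request_queue, the md core is expected to own the queue callbacks and dispatch to the personality through the new .congested/.mergeable_bvec methods (added to the personality structs at the end of this patch). A minimal sketch of that core-side dispatch, with names and details assumed rather than taken from this diff:

static int md_congested(void *data, int bits)
{
        struct mddev *mddev = data;

        /* a suspended array is always congested; otherwise ask the pers */
        if (mddev->suspended)
                return 1;
        return mddev->pers->congested(mddev, bits);
}

static int md_mergeable_bvec(struct request_queue *q,
                             struct bvec_merge_data *bvm,
                             struct bio_vec *biovec)
{
        struct mddev *mddev = q->queuedata;

        if (mddev->pers && mddev->pers->mergeable_bvec)
                return mddev->pers->mergeable_bvec(mddev, bvm, biovec);
        return biovec->bv_len;  /* no personality-specific limit */
}
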
@@ -5296,11 +5354,14 @@ static void raid5d(struct md_thread *thread)
 static ssize_t
 raid5_show_stripe_cache_size(struct mddev *mddev, char *page)
 {
-       struct r5conf *conf = mddev->private;
+       struct r5conf *conf;
+       int ret = 0;
+       spin_lock(&mddev->lock);
+       conf = mddev->private;
        if (conf)
-               return sprintf(page, "%d\n", conf->max_nr_stripes);
-       else
-               return 0;
+               ret = sprintf(page, "%d\n", conf->max_nr_stripes);
+       spin_unlock(&mddev->lock);
+       return ret;
 }
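
This show method (and the others below) now dereferences mddev->private only under mddev->lock, so a racing unload cannot free conf underneath it. The other half of that contract is on the md core side; roughly, the core must clear ->private under the same lock before freeing (a sketch with assumed names, not shown in this diff):

static void md_free_private(struct mddev *mddev)
{
        void *conf = mddev->private;

        spin_lock(&mddev->lock);
        mddev->private = NULL;          /* guarded readers now see NULL */
        spin_unlock(&mddev->lock);
        mddev->pers->free(mddev, conf); /* safe: no show method holds conf */
}
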
 
 int
@@ -5339,21 +5400,25 @@ EXPORT_SYMBOL(raid5_set_cache_size);
 static ssize_t
 raid5_store_stripe_cache_size(struct mddev *mddev, const char *page, size_t len)
 {
-       struct r5conf *conf = mddev->private;
+       struct r5conf *conf;
        unsigned long new;
        int err;
 
        if (len >= PAGE_SIZE)
                return -EINVAL;
-       if (!conf)
-               return -ENODEV;
-
        if (kstrtoul(page, 10, &new))
                return -EINVAL;
-       err = raid5_set_cache_size(mddev, new);
+       err = mddev_lock(mddev);
        if (err)
                return err;
-       return len;
+       conf = mddev->private;
+       if (!conf)
+               err = -ENODEV;
+       else
+               err = raid5_set_cache_size(mddev, new);
+       mddev_unlock(mddev);
+
+       return err ?: len;
 }
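
The "err ?: len" return is the GCC conditional-with-omitted-middle-operand extension, common in kernel code; it evaluates err once and falls back to len when err is zero:

        return err ? err : len;        /* equivalent, but evaluates err twice */
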
 
 static struct md_sysfs_entry
@@ -5364,29 +5429,40 @@ raid5_stripecache_size = __ATTR(stripe_cache_size, S_IRUGO | S_IWUSR,
 static ssize_t
 raid5_show_preread_threshold(struct mddev *mddev, char *page)
 {
-       struct r5conf *conf = mddev->private;
+       struct r5conf *conf;
+       int ret = 0;
+       spin_lock(&mddev->lock);
+       conf = mddev->private;
        if (conf)
-               return sprintf(page, "%d\n", conf->bypass_threshold);
-       else
-               return 0;
+               ret = sprintf(page, "%d\n", conf->bypass_threshold);
+       spin_unlock(&mddev->lock);
+       return ret;
 }
 
 static ssize_t
 raid5_store_preread_threshold(struct mddev *mddev, const char *page, size_t len)
 {
-       struct r5conf *conf = mddev->private;
+       struct r5conf *conf;
        unsigned long new;
+       int err;
+
        if (len >= PAGE_SIZE)
                return -EINVAL;
-       if (!conf)
-               return -ENODEV;
-
        if (kstrtoul(page, 10, &new))
                return -EINVAL;
-       if (new > conf->max_nr_stripes)
-               return -EINVAL;
-       conf->bypass_threshold = new;
-       return len;
+
+       err = mddev_lock(mddev);
+       if (err)
+               return err;
+       conf = mddev->private;
+       if (!conf)
+               err = -ENODEV;
+       else if (new > conf->max_nr_stripes)
+               err = -EINVAL;
+       else
+               conf->bypass_threshold = new;
+       mddev_unlock(mddev);
+       return err ?: len;
 }
 
 static struct md_sysfs_entry
@@ -5398,39 +5474,48 @@ raid5_preread_bypass_threshold = __ATTR(preread_bypass_threshold,
 static ssize_t
 raid5_show_skip_copy(struct mddev *mddev, char *page)
 {
-       struct r5conf *conf = mddev->private;
+       struct r5conf *conf;
+       int ret = 0;
+       spin_lock(&mddev->lock);
+       conf = mddev->private;
        if (conf)
-               return sprintf(page, "%d\n", conf->skip_copy);
-       else
-               return 0;
+               ret = sprintf(page, "%d\n", conf->skip_copy);
+       spin_unlock(&mddev->lock);
+       return ret;
 }
 
 static ssize_t
 raid5_store_skip_copy(struct mddev *mddev, const char *page, size_t len)
 {
-       struct r5conf *conf = mddev->private;
+       struct r5conf *conf;
        unsigned long new;
+       int err;
+
        if (len >= PAGE_SIZE)
                return -EINVAL;
-       if (!conf)
-               return -ENODEV;
-
        if (kstrtoul(page, 10, &new))
                return -EINVAL;
        new = !!new;
-       if (new == conf->skip_copy)
-               return len;
 
-       mddev_suspend(mddev);
-       conf->skip_copy = new;
-       if (new)
-               mddev->queue->backing_dev_info.capabilities |=
-                                               BDI_CAP_STABLE_WRITES;
-       else
-               mddev->queue->backing_dev_info.capabilities &=
-                                               ~BDI_CAP_STABLE_WRITES;
-       mddev_resume(mddev);
-       return len;
+       err = mddev_lock(mddev);
+       if (err)
+               return err;
+       conf = mddev->private;
+       if (!conf)
+               err = -ENODEV;
+       else if (new != conf->skip_copy) {
+               mddev_suspend(mddev);
+               conf->skip_copy = new;
+               if (new)
+                       mddev->queue->backing_dev_info.capabilities |=
+                               BDI_CAP_STABLE_WRITES;
+               else
+                       mddev->queue->backing_dev_info.capabilities &=
+                               ~BDI_CAP_STABLE_WRITES;
+               mddev_resume(mddev);
+       }
+       mddev_unlock(mddev);
+       return err ?: len;
 }
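
skip_copy lets raid5 build write-out directly from the caller's pages instead of copying them into the stripe cache first, so those pages must not change while under writeback; BDI_CAP_STABLE_WRITES is how that requirement is signalled to the VM. The flag flip done inline above could equally live in a small helper (hypothetical, for illustration):

static void raid5_set_stable_writes(struct mddev *mddev, bool on)
{
        struct backing_dev_info *bdi = &mddev->queue->backing_dev_info;

        if (on)
                bdi->capabilities |= BDI_CAP_STABLE_WRITES;
        else
                bdi->capabilities &= ~BDI_CAP_STABLE_WRITES;
}
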
 
 static struct md_sysfs_entry
@@ -5454,11 +5539,14 @@ raid5_stripecache_active = __ATTR_RO(stripe_cache_active);
 static ssize_t
 raid5_show_group_thread_cnt(struct mddev *mddev, char *page)
 {
-       struct r5conf *conf = mddev->private;
+       struct r5conf *conf;
+       int ret = 0;
+       spin_lock(&mddev->lock);
+       conf = mddev->private;
        if (conf)
-               return sprintf(page, "%d\n", conf->worker_cnt_per_group);
-       else
-               return 0;
+               ret = sprintf(page, "%d\n", conf->worker_cnt_per_group);
+       spin_unlock(&mddev->lock);
+       return ret;
 }
 
 static int alloc_thread_groups(struct r5conf *conf, int cnt,
@@ -5468,7 +5556,7 @@ static int alloc_thread_groups(struct r5conf *conf, int cnt,
 static ssize_t
 raid5_store_group_thread_cnt(struct mddev *mddev, const char *page, size_t len)
 {
-       struct r5conf *conf = mddev->private;
+       struct r5conf *conf;
        unsigned long new;
        int err;
        struct r5worker_group *new_groups, *old_groups;
@@ -5476,41 +5564,41 @@ raid5_store_group_thread_cnt(struct mddev *mddev, const char *page, size_t len)
 
        if (len >= PAGE_SIZE)
                return -EINVAL;
-       if (!conf)
-               return -ENODEV;
-
        if (kstrtoul(page, 10, &new))
                return -EINVAL;
 
-       if (new == conf->worker_cnt_per_group)
-               return len;
-
-       mddev_suspend(mddev);
+       err = mddev_lock(mddev);
+       if (err)
+               return err;
+       conf = mddev->private;
+       if (!conf)
+               err = -ENODEV;
+       else if (new != conf->worker_cnt_per_group) {
+               mddev_suspend(mddev);
 
-       old_groups = conf->worker_groups;
-       if (old_groups)
-               flush_workqueue(raid5_wq);
+               old_groups = conf->worker_groups;
+               if (old_groups)
+                       flush_workqueue(raid5_wq);
 
-       err = alloc_thread_groups(conf, new,
-                                 &group_cnt, &worker_cnt_per_group,
-                                 &new_groups);
-       if (!err) {
-               spin_lock_irq(&conf->device_lock);
-               conf->group_cnt = group_cnt;
-               conf->worker_cnt_per_group = worker_cnt_per_group;
-               conf->worker_groups = new_groups;
-               spin_unlock_irq(&conf->device_lock);
+               err = alloc_thread_groups(conf, new,
+                                         &group_cnt, &worker_cnt_per_group,
+                                         &new_groups);
+               if (!err) {
+                       spin_lock_irq(&conf->device_lock);
+                       conf->group_cnt = group_cnt;
+                       conf->worker_cnt_per_group = worker_cnt_per_group;
+                       conf->worker_groups = new_groups;
+                       spin_unlock_irq(&conf->device_lock);
 
-               if (old_groups)
-                       kfree(old_groups[0].workers);
-               kfree(old_groups);
+                       if (old_groups)
+                               kfree(old_groups[0].workers);
+                       kfree(old_groups);
+               }
+               mddev_resume(mddev);
        }
+       mddev_unlock(mddev);
 
-       mddev_resume(mddev);
-
-       if (err)
-               return err;
-       return len;
+       return err ?: len;
 }
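
Note the restructured store keeps the usual replace-and-publish order for the worker groups: allocate the replacement while unlocked (it may sleep), publish it under conf->device_lock, and free the old array only once nothing can reach it. The same shape in miniature, with hypothetical alloc_groups()/free_groups() helpers:

        new_groups = alloc_groups(cnt);          /* may sleep: do unlocked */
        spin_lock_irq(&conf->device_lock);
        old_groups = conf->worker_groups;
        conf->worker_groups = new_groups;        /* publish atomically */
        spin_unlock_irq(&conf->device_lock);
        free_groups(old_groups);                 /* unreachable now */
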
 
 static struct md_sysfs_entry
@@ -6178,11 +6266,6 @@ static int run(struct mddev *mddev)
                if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe)
                        mddev->queue->backing_dev_info.ra_pages = 2 * stripe;
 
-               blk_queue_merge_bvec(mddev->queue, raid5_mergeable_bvec);
-
-               mddev->queue->backing_dev_info.congested_data = mddev;
-               mddev->queue->backing_dev_info.congested_fn = raid5_congested;
-
                chunk_size = mddev->chunk_sectors << 9;
                blk_queue_io_min(mddev->queue, chunk_size);
                blk_queue_io_opt(mddev->queue, chunk_size *
@@ -6260,17 +6343,12 @@ abort:
        return -EIO;
 }
 
-static int stop(struct mddev *mddev)
+static void raid5_free(struct mddev *mddev, void *priv)
 {
-       struct r5conf *conf = mddev->private;
+       struct r5conf *conf = priv;
 
-       md_unregister_thread(&mddev->thread);
-       if (mddev->queue)
-               mddev->queue->backing_dev_info.congested_fn = NULL;
        free_conf(conf);
-       mddev->private = NULL;
        mddev->to_remove = &raid5_attrs_group;
-       return 0;
 }
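
Converting .stop to .free moves the generic teardown (stopping the per-array thread, detaching queue callbacks, clearing mddev->private) into the md core, leaving the personality to release only its own state. The implied core-side ordering, sketched under the assumption that detach happens before the personality callback:

        /* core-side unload, roughly: */
        mddev_detach(mddev);                    /* quiesce queue callbacks */
        md_unregister_thread(&mddev->thread);   /* raid5's stop() used to do this */
        pers->free(mddev, mddev->private);      /* personality frees its conf */
        mddev->private = NULL;
        module_put(pers->owner);
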
 
 static void status(struct seq_file *seq, struct mddev *mddev)
@@ -7044,7 +7122,7 @@ static struct md_personality raid6_personality =
        .owner          = THIS_MODULE,
        .make_request   = make_request,
        .run            = run,
-       .stop           = stop,
+       .free           = raid5_free,
        .status         = status,
        .error_handler  = error,
        .hot_add_disk   = raid5_add_disk,
@@ -7058,6 +7136,8 @@ static struct md_personality raid6_personality =
        .finish_reshape = raid5_finish_reshape,
        .quiesce        = raid5_quiesce,
        .takeover       = raid6_takeover,
+       .congested      = raid5_congested,
+       .mergeable_bvec = raid5_mergeable_bvec,
 };
 static struct md_personality raid5_personality =
 {
@@ -7066,7 +7146,7 @@ static struct md_personality raid5_personality =
        .owner          = THIS_MODULE,
        .make_request   = make_request,
        .run            = run,
-       .stop           = stop,
+       .free           = raid5_free,
        .status         = status,
        .error_handler  = error,
        .hot_add_disk   = raid5_add_disk,
@@ -7080,6 +7160,8 @@ static struct md_personality raid5_personality =
        .finish_reshape = raid5_finish_reshape,
        .quiesce        = raid5_quiesce,
        .takeover       = raid5_takeover,
+       .congested      = raid5_congested,
+       .mergeable_bvec = raid5_mergeable_bvec,
 };
 
 static struct md_personality raid4_personality =
@@ -7089,7 +7171,7 @@ static struct md_personality raid4_personality =
        .owner          = THIS_MODULE,
        .make_request   = make_request,
        .run            = run,
-       .stop           = stop,
+       .free           = raid5_free,
        .status         = status,
        .error_handler  = error,
        .hot_add_disk   = raid5_add_disk,
@@ -7103,6 +7185,8 @@ static struct md_personality raid4_personality =
        .finish_reshape = raid5_finish_reshape,
        .quiesce        = raid5_quiesce,
        .takeover       = raid4_takeover,
+       .congested      = raid5_congested,
+       .mergeable_bvec = raid5_mergeable_bvec,
 };
 
 static int __init raid5_init(void)