drivers/md/md-cluster.c

   1 /*
   2  * Copyright (C) 2015, SUSE
   3  *
   4  * This program is free software; you can redistribute it and/or modify
   5  * it under the terms of the GNU General Public License as published by
   6  * the Free Software Foundation; either version 2, or (at your option)
   7  * any later version.
   8  *
   9  */
  10
  11
  12 #include <linux/module.h>
  13 #include <linux/dlm.h>
  14 #include <linux/sched.h>
  15 #include <linux/raid/md_p.h>
  16 #include "md.h"
  17 #include "bitmap.h"
  18 #include "md-cluster.h"
  19
/* Size in bytes of the lock value block (LVB) buffer allocated for each lock
 * resource created with an LVB; also handed to dlm_new_lockspace() as the
 * lockspace LVB length. */
#define LVB_SIZE        64
/* Timeout passed unchanged to wait_for_completion_timeout() while waiting for
 * a newly announced disk to be acknowledged.
 * NOTE(review): that API takes its timeout in jiffies, so this is 5000
 * jiffies rather than a fixed wall-clock time - confirm the intended unit. */
#define NEW_DEV_TIMEOUT 5000
  22
/*
 * One DLM lock resource together with the state needed to lock it
 * synchronously. Instances are created by lockres_init() and destroyed by
 * lockres_free().
 */
struct dlm_lock_resource {
        dlm_lockspace_t *ls; /* lockspace the resource lives in */
        struct dlm_lksb lksb; /* status block; sb_lvbptr holds the LVB if any */
        char *name; /* lock name. */
        uint32_t flags; /* flags to pass to dlm_lock() */
        struct completion completion; /* completion for synchronized locking */
        void (*bast)(void *arg, int mode); /* blocking AST function pointer*/
        struct mddev *mddev; /* pointing back to mddev. */
};
  32
/*
 * A sector range [lo, hi] reported by another node as being resynced,
 * kept on md_cluster_info.suspend_list (protected by suspend_lock).
 */
struct suspend_info {
        int slot; /* slot of the node that reported the range */
        sector_t lo; /* start of the range, CPU byte order */
        sector_t hi; /* end of the range, CPU byte order */
        struct list_head list; /* link in md_cluster_info.suspend_list */
};
  39
/*
 * Resync range as stored in a bitmap lock resource's LVB. Both fields are
 * little-endian: written by add_resync_info(), decoded by read_resync_info().
 */
struct resync_info {
        __le64 lo;
        __le64 hi;
};
  44
/* md_cluster_info flags: bit numbers used with set_bit()/test_bit()/clear_bit()
 * on md_cluster_info.state */
/* set while process_add_new_disk() waits for newdisk_completion */
#define         MD_CLUSTER_WAITING_FOR_NEWDISK          1
/* set in recover_prep(), cleared in recover_done() */
#define         MD_CLUSTER_SUSPEND_READ_BALANCING       2
/* set in join(); recover_done() completes the join and clears it */
#define         MD_CLUSTER_BEGIN_JOIN_CLUSTER           3
  49
  50
/*
 * Per-array cluster state. Allocated in join() and reachable through
 * mddev->cluster_info.
 */
struct md_cluster_info {
        /* dlm lock space and resources for clustered raid. */
        dlm_lockspace_t *lockspace;
        int slot_number; /* DLM slot of this node (1-based), set in recover_done() */
        struct completion completion; /* completed by recover_done() during join() */
        struct mutex sb_mutex;
        struct dlm_lock_resource *bitmap_lockres; /* "bitmapNNNN" resource of this node */
        struct list_head suspend_list; /* list of struct suspend_info */
        spinlock_t suspend_lock; /* protects suspend_list */
        struct md_thread *recovery_thread; /* runs recover_bitmaps(), created on demand */
        unsigned long recovery_map; /* bitmap of slots waiting for recover_bitmaps() */
        /* communication lock resources */
        struct dlm_lock_resource *ack_lockres;
        struct dlm_lock_resource *message_lockres;
        struct dlm_lock_resource *token_lockres;
        struct dlm_lock_resource *no_new_dev_lockres;
        struct md_thread *recv_thread; /* runs recv_daemon() */
        struct completion newdisk_completion; /* waited on in process_add_new_disk() */
        unsigned long state; /* MD_CLUSTER_* flag bits */
};
  71
/* Message kinds carried in cluster_msg.type; dispatched by process_recvd_msg(). */
enum msg_type {
        METADATA_UPDATED = 0, /* handled by process_metadata_update() */
        RESYNCING, /* handled by process_suspend_info() */
        NEWDISK, /* handled by process_add_new_disk() */
        REMOVE, /* handled by process_remove_disk() */
        RE_ADD, /* handled by process_readd_disk() */
        BITMAP_NEEDS_SYNC, /* handled by __recover_slot() */
};
  80
/*
 * Message exchanged between nodes. The whole structure is copied with
 * memcpy() into and out of the LVB of the "message" lock resource.
 * NOTE(review): senders in this file store type and slot via cpu_to_le32()
 * while the fields are declared as plain int and read back without
 * conversion - confirm behaviour on big-endian hosts.
 */
struct cluster_msg {
        int type; /* enum msg_type */
        int slot; /* sender's slot, zero-based (slot_number - 1) */
        /* TODO: Unionize this for smaller footprint */
        sector_t low; /* used by RESYNCING */
        sector_t high; /* used by RESYNCING */
        char uuid[16]; /* used by NEWDISK */
        int raid_slot; /* used by NEWDISK, REMOVE and RE_ADD */
};
  90
  91 static void sync_ast(void *arg)
  92 {
  93         struct dlm_lock_resource *res;
  94
  95         res = (struct dlm_lock_resource *) arg;
  96         complete(&res->completion);
  97 }
  98
  99 static int dlm_lock_sync(struct dlm_lock_resource *res, int mode)
 100 {
 101         int ret = 0;
 102
 103         ret = dlm_lock(res->ls, mode, &res->lksb,
 104                         res->flags, res->name, strlen(res->name),
 105                         0, sync_ast, res, res->bast);
 106         if (ret)
 107                 return ret;
 108         wait_for_completion(&res->completion);
 109         return res->lksb.sb_status;
 110 }
 111
/*
 * "Unlock" @res by synchronously requesting the NL (null) mode on it; the
 * lock itself is kept. Returns whatever dlm_lock_sync() returns.
 */
static int dlm_unlock_sync(struct dlm_lock_resource *res)
{
        return dlm_lock_sync(res, DLM_LOCK_NL);
}
 116
 117 static struct dlm_lock_resource *lockres_init(struct mddev *mddev,
 118                 char *name, void (*bastfn)(void *arg, int mode), int with_lvb)
 119 {
 120         struct dlm_lock_resource *res = NULL;
 121         int ret, namelen;
 122         struct md_cluster_info *cinfo = mddev->cluster_info;
 123
 124         res = kzalloc(sizeof(struct dlm_lock_resource), GFP_KERNEL);
 125         if (!res)
 126                 return NULL;
 127         init_completion(&res->completion);
 128         res->ls = cinfo->lockspace;
 129         res->mddev = mddev;
 130         namelen = strlen(name);
 131         res->name = kzalloc(namelen + 1, GFP_KERNEL);
 132         if (!res->name) {
 133                 pr_err("md-cluster: Unable to allocate resource name for resource %s\n", name);
 134                 goto out_err;
 135         }
 136         strlcpy(res->name, name, namelen + 1);
 137         if (with_lvb) {
 138                 res->lksb.sb_lvbptr = kzalloc(LVB_SIZE, GFP_KERNEL);
 139                 if (!res->lksb.sb_lvbptr) {
 140                         pr_err("md-cluster: Unable to allocate LVB for resource %s\n", name);
 141                         goto out_err;
 142                 }
 143                 res->flags = DLM_LKF_VALBLK;
 144         }
 145
 146         if (bastfn)
 147                 res->bast = bastfn;
 148
 149         res->flags |= DLM_LKF_EXPEDITE;
 150
 151         ret = dlm_lock_sync(res, DLM_LOCK_NL);
 152         if (ret) {
 153                 pr_err("md-cluster: Unable to lock NL on new lock resource %s\n", name);
 154                 goto out_err;
 155         }
 156         res->flags &= ~DLM_LKF_EXPEDITE;
 157         res->flags |= DLM_LKF_CONVERT;
 158
 159         return res;
 160 out_err:
 161         kfree(res->lksb.sb_lvbptr);
 162         kfree(res->name);
 163         kfree(res);
 164         return NULL;
 165 }
 166
/*
 * Drop the DLM lock held on @res and free the resource, including its name
 * and LVB. A NULL @res is accepted and ignored.
 *
 * NOTE(review): DLM_LKF_CANCEL is set in res->flags, but dlm_unlock() below
 * is passed 0 as its flags argument and res->flags is not otherwise read
 * here - confirm whether the cancel flag was meant to reach dlm_unlock().
 * NOTE(review): wait_for_completion() runs even when dlm_unlock() reported
 * an error - confirm an AST is still delivered in that case, otherwise this
 * would block.
 */
static void lockres_free(struct dlm_lock_resource *res)
{
        int ret;

        if (!res)
                return;

        /* cancel a lock request or a conversion request that is blocked */
        res->flags |= DLM_LKF_CANCEL;
retry:
        ret = dlm_unlock(res->ls, res->lksb.sb_lkid, 0, &res->lksb, res);
        if (unlikely(ret != 0)) {
                pr_info("%s: failed to unlock %s return %d\n", __func__, res->name, ret);

                /* if a lock conversion is cancelled, then the lock is put
                 * back to grant queue, need to ensure it is unlocked */
                if (ret == -DLM_ECANCEL)
                        goto retry;
        }
        res->flags &= ~DLM_LKF_CANCEL;
        /* wait for the AST (sync_ast) before the memory goes away */
        wait_for_completion(&res->completion);

        kfree(res->name);
        kfree(res->lksb.sb_lvbptr);
        kfree(res);
}
 193
 194 static void add_resync_info(struct mddev *mddev, struct dlm_lock_resource *lockres,
 195                 sector_t lo, sector_t hi)
 196 {
 197         struct resync_info *ri;
 198
 199         ri = (struct resync_info *)lockres->lksb.sb_lvbptr;
 200         ri->lo = cpu_to_le64(lo);
 201         ri->hi = cpu_to_le64(hi);
 202 }
 203
 204 static struct suspend_info *read_resync_info(struct mddev *mddev, struct dlm_lock_resource *lockres)
 205 {
 206         struct resync_info ri;
 207         struct suspend_info *s = NULL;
 208         sector_t hi = 0;
 209
 210         dlm_lock_sync(lockres, DLM_LOCK_CR);
 211         memcpy(&ri, lockres->lksb.sb_lvbptr, sizeof(struct resync_info));
 212         hi = le64_to_cpu(ri.hi);
 213         if (ri.hi > 0) {
 214                 s = kzalloc(sizeof(struct suspend_info), GFP_KERNEL);
 215                 if (!s)
 216                         goto out;
 217                 s->hi = hi;
 218                 s->lo = le64_to_cpu(ri.lo);
 219         }
 220         dlm_unlock_sync(lockres);
 221 out:
 222         return s;
 223 }
 224
 225 static void recover_bitmaps(struct md_thread *thread)
 226 {
 227         struct mddev *mddev = thread->mddev;
 228         struct md_cluster_info *cinfo = mddev->cluster_info;
 229         struct dlm_lock_resource *bm_lockres;
 230         char str[64];
 231         int slot, ret;
 232         struct suspend_info *s, *tmp;
 233         sector_t lo, hi;
 234
 235         while (cinfo->recovery_map) {
 236                 slot = fls64((u64)cinfo->recovery_map) - 1;
 237
 238                 /* Clear suspend_area associated with the bitmap */
 239                 spin_lock_irq(&cinfo->suspend_lock);
 240                 list_for_each_entry_safe(s, tmp, &cinfo->suspend_list, list)
 241                         if (slot == s->slot) {
 242                                 list_del(&s->list);
 243                                 kfree(s);
 244                         }
 245                 spin_unlock_irq(&cinfo->suspend_lock);
 246
 247                 snprintf(str, 64, "bitmap%04d", slot);
 248                 bm_lockres = lockres_init(mddev, str, NULL, 1);
 249                 if (!bm_lockres) {
 250                         pr_err("md-cluster: Cannot initialize bitmaps\n");
 251                         goto clear_bit;
 252                 }
 253
 254                 ret = dlm_lock_sync(bm_lockres, DLM_LOCK_PW);
 255                 if (ret) {
 256                         pr_err("md-cluster: Could not DLM lock %s: %d\n",
 257                                         str, ret);
 258                         goto clear_bit;
 259                 }
 260                 ret = bitmap_copy_from_slot(mddev, slot, &lo, &hi, true);
 261                 if (ret) {
 262                         pr_err("md-cluster: Could not copy data from bitmap %d\n", slot);
 263                         goto dlm_unlock;
 264                 }
 265                 if (hi > 0) {
 266                         /* TODO:Wait for current resync to get over */
 267                         set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
 268                         if (lo < mddev->recovery_cp)
 269                                 mddev->recovery_cp = lo;
 270                         md_check_recovery(mddev);
 271                 }
 272 dlm_unlock:
 273                 dlm_unlock_sync(bm_lockres);
 274 clear_bit:
 275                 clear_bit(slot, &cinfo->recovery_map);
 276         }
 277 }
 278
/*
 * dlm_lockspace_ops.recover_prep callback: sets
 * MD_CLUSTER_SUSPEND_READ_BALANCING in the cluster state; recover_done()
 * clears it again.
 * @arg: the struct mddev registered with dlm_new_lockspace().
 */
static void recover_prep(void *arg)
{
        struct mddev *mddev = arg;
        struct md_cluster_info *cinfo = mddev->cluster_info;
        set_bit(MD_CLUSTER_SUSPEND_READ_BALANCING, &cinfo->state);
}
 285
 286 static void __recover_slot(struct mddev *mddev, int slot)
 287 {
 288         struct md_cluster_info *cinfo = mddev->cluster_info;
 289
 290         set_bit(slot, &cinfo->recovery_map);
 291         if (!cinfo->recovery_thread) {
 292                 cinfo->recovery_thread = md_register_thread(recover_bitmaps,
 293                                 mddev, "recover");
 294                 if (!cinfo->recovery_thread) {
 295                         pr_warn("md-cluster: Could not create recovery thread\n");
 296                         return;
 297                 }
 298         }
 299         md_wakeup_thread(cinfo->recovery_thread);
 300 }
 301
 302 static void recover_slot(void *arg, struct dlm_slot *slot)
 303 {
 304         struct mddev *mddev = arg;
 305         struct md_cluster_info *cinfo = mddev->cluster_info;
 306
 307         pr_info("md-cluster: %s Node %d/%d down. My slot: %d. Initiating recovery.\n",
 308                         mddev->bitmap_info.cluster_name,
 309                         slot->nodeid, slot->slot,
 310                         cinfo->slot_number);
 311         /* deduct one since dlm slot starts from one while the num of
 312          * cluster-md begins with 0 */
 313         __recover_slot(mddev, slot->slot - 1);
 314 }
 315
 316 static void recover_done(void *arg, struct dlm_slot *slots,
 317                 int num_slots, int our_slot,
 318                 uint32_t generation)
 319 {
 320         struct mddev *mddev = arg;
 321         struct md_cluster_info *cinfo = mddev->cluster_info;
 322
 323         cinfo->slot_number = our_slot;
 324         /* completion is only need to be complete when node join cluster,
 325          * it doesn't need to run during another node's failure */
 326         if (test_bit(MD_CLUSTER_BEGIN_JOIN_CLUSTER, &cinfo->state)) {
 327                 complete(&cinfo->completion);
 328                 clear_bit(MD_CLUSTER_BEGIN_JOIN_CLUSTER, &cinfo->state);
 329         }
 330         clear_bit(MD_CLUSTER_SUSPEND_READ_BALANCING, &cinfo->state);
 331 }
 332
/* Lockspace callbacks passed to dlm_new_lockspace() in join(). They are
 * called when a node joins the cluster and perform lock recovery if a node
 * failure occurs. */
static const struct dlm_lockspace_ops md_ls_ops = {
        .recover_prep = recover_prep,
        .recover_slot = recover_slot,
        .recover_done = recover_done,
};
 340
 341 /*
 342  * The BAST function for the ack lock resource
 343  * This function wakes up the receive thread in
 344  * order to receive and process the message.
 345  */
 346 static void ack_bast(void *arg, int mode)
 347 {
 348         struct dlm_lock_resource *res = (struct dlm_lock_resource *)arg;
 349         struct md_cluster_info *cinfo = res->mddev->cluster_info;
 350
 351         if (mode == DLM_LOCK_EX)
 352                 md_wakeup_thread(cinfo->recv_thread);
 353 }
 354
 355 static void __remove_suspend_info(struct md_cluster_info *cinfo, int slot)
 356 {
 357         struct suspend_info *s, *tmp;
 358
 359         list_for_each_entry_safe(s, tmp, &cinfo->suspend_list, list)
 360                 if (slot == s->slot) {
 361                         pr_info("%s:%d Deleting suspend_info: %d\n",
 362                                         __func__, __LINE__, slot);
 363                         list_del(&s->list);
 364                         kfree(s);
 365                         break;
 366                 }
 367 }
 368
/* Locked wrapper: runs __remove_suspend_info() for @slot with
 * cinfo->suspend_lock held and interrupts disabled. */
static void remove_suspend_info(struct md_cluster_info *cinfo, int slot)
{
        spin_lock_irq(&cinfo->suspend_lock);
        __remove_suspend_info(cinfo, slot);
        spin_unlock_irq(&cinfo->suspend_lock);
}
 375
 376
 377 static void process_suspend_info(struct mddev *mddev,
 378                 int slot, sector_t lo, sector_t hi)
 379 {
 380         struct md_cluster_info *cinfo = mddev->cluster_info;
 381         struct suspend_info *s;
 382
 383         if (!hi) {
 384                 remove_suspend_info(cinfo, slot);
 385                 return;
 386         }
 387         s = kzalloc(sizeof(struct suspend_info), GFP_KERNEL);
 388         if (!s)
 389                 return;
 390         s->slot = slot;
 391         s->lo = lo;
 392         s->hi = hi;
 393         mddev->pers->quiesce(mddev, 1);
 394         mddev->pers->quiesce(mddev, 0);
 395         spin_lock_irq(&cinfo->suspend_lock);
 396         /* Remove existing entry (if exists) before adding */
 397         __remove_suspend_info(cinfo, slot);
 398         list_add(&s->list, &cinfo->suspend_list);
 399         spin_unlock_irq(&cinfo->suspend_lock);
 400 }
 401
 402 static void process_add_new_disk(struct mddev *mddev, struct cluster_msg *cmsg)
 403 {
 404         char disk_uuid[64];
 405         struct md_cluster_info *cinfo = mddev->cluster_info;
 406         char event_name[] = "EVENT=ADD_DEVICE";
 407         char raid_slot[16];
 408         char *envp[] = {event_name, disk_uuid, raid_slot, NULL};
 409         int len;
 410
 411         len = snprintf(disk_uuid, 64, "DEVICE_UUID=");
 412         sprintf(disk_uuid + len, "%pU", cmsg->uuid);
 413         snprintf(raid_slot, 16, "RAID_DISK=%d", cmsg->raid_slot);
 414         pr_info("%s:%d Sending kobject change with %s and %s\n", __func__, __LINE__, disk_uuid, raid_slot);
 415         init_completion(&cinfo->newdisk_completion);
 416         set_bit(MD_CLUSTER_WAITING_FOR_NEWDISK, &cinfo->state);
 417         kobject_uevent_env(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE, envp);
 418         wait_for_completion_timeout(&cinfo->newdisk_completion,
 419                         NEW_DEV_TIMEOUT);
 420         clear_bit(MD_CLUSTER_WAITING_FOR_NEWDISK, &cinfo->state);
 421 }
 422
 423
/*
 * Handle a METADATA_UPDATED message: reload the superblock via
 * md_reload_sb() and then request CR on the "no-new-dev" lock resource.
 * @msg is unused.
 * NOTE(review): the result of dlm_lock_sync() is ignored here - confirm a
 * failure to get CR can safely go unnoticed.
 */
static void process_metadata_update(struct mddev *mddev, struct cluster_msg *msg)
{
        struct md_cluster_info *cinfo = mddev->cluster_info;

        md_reload_sb(mddev);
        dlm_lock_sync(cinfo->no_new_dev_lockres, DLM_LOCK_CR);
}
 431
 432 static void process_remove_disk(struct mddev *mddev, struct cluster_msg *msg)
 433 {
 434         struct md_rdev *rdev = md_find_rdev_nr_rcu(mddev, msg->raid_slot);
 435
 436         if (rdev)
 437                 md_kick_rdev_from_array(rdev);
 438         else
 439                 pr_warn("%s: %d Could not find disk(%d) to REMOVE\n", __func__, __LINE__, msg->raid_slot);
 440 }
 441
 442 static void process_readd_disk(struct mddev *mddev, struct cluster_msg *msg)
 443 {
 444         struct md_rdev *rdev = md_find_rdev_nr_rcu(mddev, msg->raid_slot);
 445
 446         if (rdev && test_bit(Faulty, &rdev->flags))
 447                 clear_bit(Faulty, &rdev->flags);
 448         else
 449                 pr_warn("%s: %d Could not find disk(%d) which is faulty", __func__, __LINE__, msg->raid_slot);
 450 }
 451
 452 static void process_recvd_msg(struct mddev *mddev, struct cluster_msg *msg)
 453 {
 454         switch (msg->type) {
 455         case METADATA_UPDATED:
 456                 pr_info("%s: %d Received message: METADATA_UPDATE from %d\n",
 457                         __func__, __LINE__, msg->slot);
 458                 process_metadata_update(mddev, msg);
 459                 break;
 460         case RESYNCING:
 461                 pr_info("%s: %d Received message: RESYNCING from %d\n",
 462                         __func__, __LINE__, msg->slot);
 463                 process_suspend_info(mddev, msg->slot,
 464                                 msg->low, msg->high);
 465                 break;
 466         case NEWDISK:
 467                 pr_info("%s: %d Received message: NEWDISK from %d\n",
 468                         __func__, __LINE__, msg->slot);
 469                 process_add_new_disk(mddev, msg);
 470                 break;
 471         case REMOVE:
 472                 pr_info("%s: %d Received REMOVE from %d\n",
 473                         __func__, __LINE__, msg->slot);
 474                 process_remove_disk(mddev, msg);
 475                 break;
 476         case RE_ADD:
 477                 pr_info("%s: %d Received RE_ADD from %d\n",
 478                         __func__, __LINE__, msg->slot);
 479                 process_readd_disk(mddev, msg);
 480                 break;
 481         case BITMAP_NEEDS_SYNC:
 482                 pr_info("%s: %d Received BITMAP_NEEDS_SYNC from %d\n",
 483                         __func__, __LINE__, msg->slot);
 484                 __recover_slot(mddev, msg->slot);
 485                 break;
 486         default:
 487                 pr_warn("%s:%d Received unknown message from %d\n",
 488                         __func__, __LINE__, msg->slot);
 489         }
 490 }
 491
/*
 * Body of the "cluster_recv" md thread, woken by ack_bast() when another
 * node requests EX on the ack resource.
 *
 * Reads one message from the LVB of the message resource, processes it,
 * and then walks the ack/message locks through the hand-off sequence
 * below. The order of the lock operations is significant; failures after
 * the first lock are only logged.
 */
static void recv_daemon(struct md_thread *thread)
{
        struct md_cluster_info *cinfo = thread->mddev->cluster_info;
        struct dlm_lock_resource *ack_lockres = cinfo->ack_lockres;
        struct dlm_lock_resource *message_lockres = cinfo->message_lockres;
        struct cluster_msg msg;
        int ret;

        /* get CR on the message resource so its LVB can be read */
        if (dlm_lock_sync(message_lockres, DLM_LOCK_CR)) {
                pr_err("md/raid1:failed to get CR on MESSAGE\n");
                return;
        }

        /* copy the message out of the LVB and process it */
        memcpy(&msg, message_lockres->lksb.sb_lvbptr, sizeof(struct cluster_msg));
        process_recvd_msg(thread->mddev, &msg);

        /* release CR on ack_lockres (drop to NL) */
        ret = dlm_unlock_sync(ack_lockres);
        if (unlikely(ret != 0))
                pr_info("unlock ack failed return %d\n", ret);
        /* up-convert to PR on message_lockres */
        ret = dlm_lock_sync(message_lockres, DLM_LOCK_PR);
        if (unlikely(ret != 0))
                pr_info("lock PR on msg failed return %d\n", ret);
        /* get CR on ack_lockres again */
        ret = dlm_lock_sync(ack_lockres, DLM_LOCK_CR);
        if (unlikely(ret != 0))
                pr_info("lock CR on ack failed return %d\n", ret);
        /* release the lock on message_lockres (drop to NL) */
        ret = dlm_unlock_sync(message_lockres);
        if (unlikely(ret != 0))
                pr_info("unlock msg failed return %d\n", ret);
}
 530
 531 /* lock_comm()
 532  * Takes the lock on the TOKEN lock resource so no other
 533  * node can communicate while the operation is underway.
 534  */
 535 static int lock_comm(struct md_cluster_info *cinfo)
 536 {
 537         int error;
 538
 539         error = dlm_lock_sync(cinfo->token_lockres, DLM_LOCK_EX);
 540         if (error)
 541                 pr_err("md-cluster(%s:%d): failed to get EX on TOKEN (%d)\n",
 542                                 __func__, __LINE__, error);
 543         return error;
 544 }
 545
/* Counterpart of lock_comm(): drops the TOKEN lock resource back to NL.
 * The result of the unlock is not reported. */
static void unlock_comm(struct md_cluster_info *cinfo)
{
        dlm_unlock_sync(cinfo->token_lockres);
}
 550
 551 /* __sendmsg()
 552  * This function performs the actual sending of the message. This function is
 553  * usually called after performing the encompassing operation
 554  * The function:
 555  * 1. Grabs the message lockresource in EX mode
 556  * 2. Copies the message to the message LVB
 557  * 3. Downconverts message lockresource to CW
 558  * 4. Upconverts ack lock resource from CR to EX. This forces the BAST on other nodes
 559  *    and the other nodes read the message. The thread will wait here until all other
 560  *    nodes have released ack lock resource.
 561  * 5. Downconvert ack lockresource to CR
 562  */
 563 static int __sendmsg(struct md_cluster_info *cinfo, struct cluster_msg *cmsg)
 564 {
 565         int error;
 566         int slot = cinfo->slot_number - 1;
 567
 568         cmsg->slot = cpu_to_le32(slot);
 569         /*get EX on Message*/
 570         error = dlm_lock_sync(cinfo->message_lockres, DLM_LOCK_EX);
 571         if (error) {
 572                 pr_err("md-cluster: failed to get EX on MESSAGE (%d)\n", error);
 573                 goto failed_message;
 574         }
 575
 576         memcpy(cinfo->message_lockres->lksb.sb_lvbptr, (void *)cmsg,
 577                         sizeof(struct cluster_msg));
 578         /*down-convert EX to CW on Message*/
 579         error = dlm_lock_sync(cinfo->message_lockres, DLM_LOCK_CW);
 580         if (error) {
 581                 pr_err("md-cluster: failed to convert EX to CW on MESSAGE(%d)\n",
 582                                 error);
 583                 goto failed_ack;
 584         }
 585
 586         /*up-convert CR to EX on Ack*/
 587         error = dlm_lock_sync(cinfo->ack_lockres, DLM_LOCK_EX);
 588         if (error) {
 589                 pr_err("md-cluster: failed to convert CR to EX on ACK(%d)\n",
 590                                 error);
 591                 goto failed_ack;
 592         }
 593
 594         /*down-convert EX to CR on Ack*/
 595         error = dlm_lock_sync(cinfo->ack_lockres, DLM_LOCK_CR);
 596         if (error) {
 597                 pr_err("md-cluster: failed to convert EX to CR on ACK(%d)\n",
 598                                 error);
 599                 goto failed_ack;
 600         }
 601
 602 failed_ack:
 603         error = dlm_unlock_sync(cinfo->message_lockres);
 604         if (unlikely(error != 0)) {
 605                 pr_err("md-cluster: failed convert to NL on MESSAGE(%d)\n",
 606                         error);
 607                 /* in case the message can't be released due to some reason */
 608                 goto failed_ack;
 609         }
 610 failed_message:
 611         return error;
 612 }
 613
 614 static int sendmsg(struct md_cluster_info *cinfo, struct cluster_msg *cmsg)
 615 {
 616         int ret;
 617
 618         lock_comm(cinfo);
 619         ret = __sendmsg(cinfo, cmsg);
 620         unlock_comm(cinfo);
 621         return ret;
 622 }
 623
 624 static int gather_all_resync_info(struct mddev *mddev, int total_slots)
 625 {
 626         struct md_cluster_info *cinfo = mddev->cluster_info;
 627         int i, ret = 0;
 628         struct dlm_lock_resource *bm_lockres;
 629         struct suspend_info *s;
 630         char str[64];
 631         sector_t lo, hi;
 632
 633
 634         for (i = 0; i < total_slots; i++) {
 635                 memset(str, '\0', 64);
 636                 snprintf(str, 64, "bitmap%04d", i);
 637                 bm_lockres = lockres_init(mddev, str, NULL, 1);
 638                 if (!bm_lockres)
 639                         return -ENOMEM;
 640                 if (i == (cinfo->slot_number - 1))
 641                         continue;
 642
 643                 bm_lockres->flags |= DLM_LKF_NOQUEUE;
 644                 ret = dlm_lock_sync(bm_lockres, DLM_LOCK_PW);
 645                 if (ret == -EAGAIN) {
 646                         memset(bm_lockres->lksb.sb_lvbptr, '\0', LVB_SIZE);
 647                         s = read_resync_info(mddev, bm_lockres);
 648                         if (s) {
 649                                 pr_info("%s:%d Resync[%llu..%llu] in progress on %d\n",
 650                                                 __func__, __LINE__,
 651                                                 (unsigned long long) s->lo,
 652                                                 (unsigned long long) s->hi, i);
 653                                 spin_lock_irq(&cinfo->suspend_lock);
 654                                 s->slot = i;
 655                                 list_add(&s->list, &cinfo->suspend_list);
 656                                 spin_unlock_irq(&cinfo->suspend_lock);
 657                         }
 658                         ret = 0;
 659                         lockres_free(bm_lockres);
 660                         continue;
 661                 }
 662                 if (ret) {
 663                         lockres_free(bm_lockres);
 664                         goto out;
 665                 }
 666
 667                 /* Read the disk bitmap sb and check if it needs recovery */
 668                 ret = bitmap_copy_from_slot(mddev, i, &lo, &hi, false);
 669                 if (ret) {
 670                         pr_warn("md-cluster: Could not gather bitmaps from slot %d", i);
 671                         lockres_free(bm_lockres);
 672                         continue;
 673                 }
 674                 if ((hi > 0) && (lo < mddev->recovery_cp)) {
 675                         set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
 676                         mddev->recovery_cp = lo;
 677                         md_check_recovery(mddev);
 678                 }
 679
 680                 dlm_unlock_sync(bm_lockres);
 681                 lockres_free(bm_lockres);
 682         }
 683 out:
 684         return ret;
 685 }
 686
 687 static int join(struct mddev *mddev, int nodes)
 688 {
 689         struct md_cluster_info *cinfo;
 690         int ret, ops_rv;
 691         char str[64];
 692
 693         cinfo = kzalloc(sizeof(struct md_cluster_info), GFP_KERNEL);
 694         if (!cinfo)
 695                 return -ENOMEM;
 696
 697         INIT_LIST_HEAD(&cinfo->suspend_list);
 698         spin_lock_init(&cinfo->suspend_lock);
 699         init_completion(&cinfo->completion);
 700         set_bit(MD_CLUSTER_BEGIN_JOIN_CLUSTER, &cinfo->state);
 701
 702         mutex_init(&cinfo->sb_mutex);
 703         mddev->cluster_info = cinfo;
 704
 705         memset(str, 0, 64);
 706         sprintf(str, "%pU", mddev->uuid);
 707         ret = dlm_new_lockspace(str, mddev->bitmap_info.cluster_name,
 708                                 DLM_LSFL_FS, LVB_SIZE,
 709                                 &md_ls_ops, mddev, &ops_rv, &cinfo->lockspace);
 710         if (ret)
 711                 goto err;
 712         wait_for_completion(&cinfo->completion);
 713         if (nodes < cinfo->slot_number) {
 714                 pr_err("md-cluster: Slot allotted(%d) is greater than available slots(%d).",
 715                         cinfo->slot_number, nodes);
 716                 ret = -ERANGE;
 717                 goto err;
 718         }
 719         /* Initiate the communication resources */
 720         ret = -ENOMEM;
 721         cinfo->recv_thread = md_register_thread(recv_daemon, mddev, "cluster_recv");
 722         if (!cinfo->recv_thread) {
 723                 pr_err("md-cluster: cannot allocate memory for recv_thread!\n");
 724                 goto err;
 725         }
 726         cinfo->message_lockres = lockres_init(mddev, "message", NULL, 1);
 727         if (!cinfo->message_lockres)
 728                 goto err;
 729         cinfo->token_lockres = lockres_init(mddev, "token", NULL, 0);
 730         if (!cinfo->token_lockres)
 731                 goto err;
 732         cinfo->ack_lockres = lockres_init(mddev, "ack", ack_bast, 0);
 733         if (!cinfo->ack_lockres)
 734                 goto err;
 735         cinfo->no_new_dev_lockres = lockres_init(mddev, "no-new-dev", NULL, 0);
 736         if (!cinfo->no_new_dev_lockres)
 737                 goto err;
 738
 739         /* get sync CR lock on ACK. */
 740         if (dlm_lock_sync(cinfo->ack_lockres, DLM_LOCK_CR))
 741                 pr_err("md-cluster: failed to get a sync CR lock on ACK!(%d)\n",
 742                                 ret);
 743         /* get sync CR lock on no-new-dev. */
 744         if (dlm_lock_sync(cinfo->no_new_dev_lockres, DLM_LOCK_CR))
 745                 pr_err("md-cluster: failed to get a sync CR lock on no-new-dev!(%d)\n", ret);
 746
 747
 748         pr_info("md-cluster: Joined cluster %s slot %d\n", str, cinfo->slot_number);
 749         snprintf(str, 64, "bitmap%04d", cinfo->slot_number - 1);
 750         cinfo->bitmap_lockres = lockres_init(mddev, str, NULL, 1);
 751         if (!cinfo->bitmap_lockres)
 752                 goto err;
 753         if (dlm_lock_sync(cinfo->bitmap_lockres, DLM_LOCK_PW)) {
 754                 pr_err("Failed to get bitmap lock\n");
 755                 ret = -EINVAL;
 756                 goto err;
 757         }
 758
 759         ret = gather_all_resync_info(mddev, nodes);
 760         if (ret)
 761                 goto err;
 762
 763         return 0;
 764 err:
 765         lockres_free(cinfo->message_lockres);
 766         lockres_free(cinfo->token_lockres);
 767         lockres_free(cinfo->ack_lockres);
 768         lockres_free(cinfo->no_new_dev_lockres);
 769         lockres_free(cinfo->bitmap_lockres);
 770         if (cinfo->lockspace)
 771                 dlm_release_lockspace(cinfo->lockspace, 2);
 772         mddev->cluster_info = NULL;
 773         kfree(cinfo);
 774         return ret;
 775 }
 776
 777 static void resync_bitmap(struct mddev *mddev)
 778 {
 779         struct md_cluster_info *cinfo = mddev->cluster_info;
 780         struct cluster_msg cmsg = {0};
 781         int err;
 782
 783         cmsg.type = cpu_to_le32(BITMAP_NEEDS_SYNC);
 784         err = sendmsg(cinfo, &cmsg);
 785         if (err)
 786                 pr_err("%s:%d: failed to send BITMAP_NEEDS_SYNC message (%d)\n",
 787                         __func__, __LINE__, err);
 788 }
 789
 790 static int leave(struct mddev *mddev)
 791 {
 792         struct md_cluster_info *cinfo = mddev->cluster_info;
 793
 794         if (!cinfo)
 795                 return 0;
 796
 797         /* BITMAP_NEEDS_SYNC message should be sent when node
 798          * is leaving the cluster with dirty bitmap, also we
 799          * can only deliver it when dlm connection is available */
 800         if (cinfo->slot_number > 0 && mddev->recovery_cp != MaxSector)
 801                 resync_bitmap(mddev);
 802
 803         md_unregister_thread(&cinfo->recovery_thread);
 804         md_unregister_thread(&cinfo->recv_thread);
 805         lockres_free(cinfo->message_lockres);
 806         lockres_free(cinfo->token_lockres);
 807         lockres_free(cinfo->ack_lockres);
 808         lockres_free(cinfo->no_new_dev_lockres);
 809         lockres_free(cinfo->bitmap_lockres);
 810         dlm_release_lockspace(cinfo->lockspace, 2);
 811         return 0;
 812 }
 813
 814 /* slot_number(): Returns the MD slot number to use
 815  * DLM starts the slot numbers from 1, wheras cluster-md
 816  * wants the number to be from zero, so we deduct one
 817  */
 818 static int slot_number(struct mddev *mddev)
 819 {
 820         struct md_cluster_info *cinfo = mddev->cluster_info;
 821
 822         return cinfo->slot_number - 1;
 823 }
 824
 825 static int metadata_update_start(struct mddev *mddev)
 826 {
 827         return lock_comm(mddev->cluster_info);
 828 }
 829
 830 static int metadata_update_finish(struct mddev *mddev)
 831 {
 832         struct md_cluster_info *cinfo = mddev->cluster_info;
 833         struct cluster_msg cmsg;
 834         int ret;
 835
 836         memset(&cmsg, 0, sizeof(cmsg));
 837         cmsg.type = cpu_to_le32(METADATA_UPDATED);
 838         ret = __sendmsg(cinfo, &cmsg);
 839         unlock_comm(cinfo);
 840         return ret;
 841 }
 842
 843 static int metadata_update_cancel(struct mddev *mddev)
 844 {
 845         struct md_cluster_info *cinfo = mddev->cluster_info;
 846
 847         return dlm_unlock_sync(cinfo->token_lockres);
 848 }
 849
 850 static int resync_info_update(struct mddev *mddev, sector_t lo, sector_t hi)
 851 {
 852         struct md_cluster_info *cinfo = mddev->cluster_info;
 853         struct cluster_msg cmsg;
 854         int slot = cinfo->slot_number - 1;
 855
 856         add_resync_info(mddev, cinfo->bitmap_lockres, lo, hi);
 857         /* Re-acquire the lock to refresh LVB */
 858         dlm_lock_sync(cinfo->bitmap_lockres, DLM_LOCK_PW);
 859         pr_info("%s:%d lo: %llu hi: %llu\n", __func__, __LINE__,
 860                         (unsigned long long)lo,
 861                         (unsigned long long)hi);
 862         cmsg.type = cpu_to_le32(RESYNCING);
 863         cmsg.slot = cpu_to_le32(slot);
 864         cmsg.low = cpu_to_le64(lo);
 865         cmsg.high = cpu_to_le64(hi);
 866         return sendmsg(cinfo, &cmsg);
 867 }
 868
 869 static int area_resyncing(struct mddev *mddev, int direction,
 870                 sector_t lo, sector_t hi)
 871 {
 872         struct md_cluster_info *cinfo = mddev->cluster_info;
 873         int ret = 0;
 874         struct suspend_info *s;
 875
 876         if ((direction == READ) &&
 877                 test_bit(MD_CLUSTER_SUSPEND_READ_BALANCING, &cinfo->state))
 878                 return 1;
 879
 880         spin_lock_irq(&cinfo->suspend_lock);
 881         if (list_empty(&cinfo->suspend_list))
 882                 goto out;
 883         list_for_each_entry(s, &cinfo->suspend_list, list)
 884                 if (hi > s->lo && lo < s->hi) {
 885                         ret = 1;
 886                         break;
 887                 }
 888 out:
 889         spin_unlock_irq(&cinfo->suspend_lock);
 890         return ret;
 891 }
 892
 893 static int add_new_disk_start(struct mddev *mddev, struct md_rdev *rdev)
 894 {
 895         struct md_cluster_info *cinfo = mddev->cluster_info;
 896         struct cluster_msg cmsg;
 897         int ret = 0;
 898         struct mdp_superblock_1 *sb = page_address(rdev->sb_page);
 899         char *uuid = sb->device_uuid;
 900
 901         memset(&cmsg, 0, sizeof(cmsg));
 902         cmsg.type = cpu_to_le32(NEWDISK);
 903         memcpy(cmsg.uuid, uuid, 16);
 904         cmsg.raid_slot = rdev->desc_nr;
 905         lock_comm(cinfo);
 906         ret = __sendmsg(cinfo, &cmsg);
 907         if (ret)
 908                 return ret;
 909         cinfo->no_new_dev_lockres->flags |= DLM_LKF_NOQUEUE;
 910         ret = dlm_lock_sync(cinfo->no_new_dev_lockres, DLM_LOCK_EX);
 911         cinfo->no_new_dev_lockres->flags &= ~DLM_LKF_NOQUEUE;
 912         /* Some node does not "see" the device */
 913         if (ret == -EAGAIN)
 914                 ret = -ENOENT;
 915         else
 916                 dlm_lock_sync(cinfo->no_new_dev_lockres, DLM_LOCK_CR);
 917         return ret;
 918 }
 919
 920 static int add_new_disk_finish(struct mddev *mddev)
 921 {
 922         struct cluster_msg cmsg;
 923         struct md_cluster_info *cinfo = mddev->cluster_info;
 924         int ret;
 925         /* Write sb and inform others */
 926         md_update_sb(mddev, 1);
 927         cmsg.type = METADATA_UPDATED;
 928         ret = __sendmsg(cinfo, &cmsg);
 929         unlock_comm(cinfo);
 930         return ret;
 931 }
 932
 933 static int new_disk_ack(struct mddev *mddev, bool ack)
 934 {
 935         struct md_cluster_info *cinfo = mddev->cluster_info;
 936
 937         if (!test_bit(MD_CLUSTER_WAITING_FOR_NEWDISK, &cinfo->state)) {
 938                 pr_warn("md-cluster(%s): Spurious cluster confirmation\n", mdname(mddev));
 939                 return -EINVAL;
 940         }
 941
 942         if (ack)
 943                 dlm_unlock_sync(cinfo->no_new_dev_lockres);
 944         complete(&cinfo->newdisk_completion);
 945         return 0;
 946 }
 947
 948 static int remove_disk(struct mddev *mddev, struct md_rdev *rdev)
 949 {
 950         struct cluster_msg cmsg;
 951         struct md_cluster_info *cinfo = mddev->cluster_info;
 952         cmsg.type = REMOVE;
 953         cmsg.raid_slot = rdev->desc_nr;
 954         return __sendmsg(cinfo, &cmsg);
 955 }
 956
 957 static int gather_bitmaps(struct md_rdev *rdev)
 958 {
 959         int sn, err;
 960         sector_t lo, hi;
 961         struct cluster_msg cmsg;
 962         struct mddev *mddev = rdev->mddev;
 963         struct md_cluster_info *cinfo = mddev->cluster_info;
 964
 965         cmsg.type = RE_ADD;
 966         cmsg.raid_slot = rdev->desc_nr;
 967         err = sendmsg(cinfo, &cmsg);
 968         if (err)
 969                 goto out;
 970
 971         for (sn = 0; sn < mddev->bitmap_info.nodes; sn++) {
 972                 if (sn == (cinfo->slot_number - 1))
 973                         continue;
 974                 err = bitmap_copy_from_slot(mddev, sn, &lo, &hi, false);
 975                 if (err) {
 976                         pr_warn("md-cluster: Could not gather bitmaps from slot %d", sn);
 977                         goto out;
 978                 }
 979                 if ((hi > 0) && (lo < mddev->recovery_cp))
 980                         mddev->recovery_cp = lo;
 981         }
 982 out:
 983         return err;
 984 }
 985
 986 static struct md_cluster_operations cluster_ops = {
 987         .join   = join,
 988         .leave  = leave,
 989         .slot_number = slot_number,
 990         .resync_info_update = resync_info_update,
 991         .metadata_update_start = metadata_update_start,
 992         .metadata_update_finish = metadata_update_finish,
 993         .metadata_update_cancel = metadata_update_cancel,
 994         .area_resyncing = area_resyncing,
 995         .add_new_disk_start = add_new_disk_start,
 996         .add_new_disk_finish = add_new_disk_finish,
 997         .new_disk_ack = new_disk_ack,
 998         .remove_disk = remove_disk,
 999         .gather_bitmaps = gather_bitmaps,
1000 };
1001
1002 static int __init cluster_init(void)
1003 {
1004         pr_warn("md-cluster: EXPERIMENTAL. Use with caution\n");
1005         pr_info("Registering Cluster MD functions\n");
1006         register_md_cluster_operations(&cluster_ops, THIS_MODULE);
1007         return 0;
1008 }
1009
/*
 * cluster_exit() - module exit point; unregisters the cluster operations.
 */
static void cluster_exit(void)
{
	unregister_md_cluster_operations();
}
1014
1015 module_init(cluster_init);
1016 module_exit(cluster_exit);
1017 MODULE_LICENSE("GPL");
1018 MODULE_DESCRIPTION("Clustering support for MD");