fs/ocfs2/dlm/dlmmaster.c

   1 /* -*- mode: c; c-basic-offset: 8; -*-
   2  * vim: noexpandtab sw=8 ts=8 sts=0:
   3  *
   4  * dlmmod.c
   5  *
   6  * standalone DLM module
   7  *
   8  * Copyright (C) 2004 Oracle.  All rights reserved.
   9  *
  10  * This program is free software; you can redistribute it and/or
  11  * modify it under the terms of the GNU General Public
  12  * License as published by the Free Software Foundation; either
  13  * version 2 of the License, or (at your option) any later version.
  14  *
  15  * This program is distributed in the hope that it will be useful,
  16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  18  * General Public License for more details.
  19  *
  20  * You should have received a copy of the GNU General Public
  21  * License along with this program; if not, write to the
  22  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  23  * Boston, MA 021110-1307, USA.
  24  *
  25  */
  26
  27
  28 #include <linux/module.h>
  29 #include <linux/fs.h>
  30 #include <linux/types.h>
  31 #include <linux/slab.h>
  32 #include <linux/highmem.h>
  33 #include <linux/utsname.h>
  34 #include <linux/init.h>
  35 #include <linux/sysctl.h>
  36 #include <linux/random.h>
  37 #include <linux/blkdev.h>
  38 #include <linux/socket.h>
  39 #include <linux/inet.h>
  40 #include <linux/spinlock.h>
  41 #include <linux/delay.h>
  42
  43
  44 #include "cluster/heartbeat.h"
  45 #include "cluster/nodemanager.h"
  46 #include "cluster/tcp.h"
  47
  48 #include "dlmapi.h"
  49 #include "dlmcommon.h"
  50 #include "dlmdomain.h"
  51 #include "dlmdebug.h"
  52
  53 #define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_MASTER)
  54 #include "cluster/masklog.h"
  55
  56 static void dlm_mle_node_down(struct dlm_ctxt *dlm,
  57                               struct dlm_master_list_entry *mle,
  58                               struct o2nm_node *node,
  59                               int idx);
  60 static void dlm_mle_node_up(struct dlm_ctxt *dlm,
  61                             struct dlm_master_list_entry *mle,
  62                             struct o2nm_node *node,
  63                             int idx);
  64
  65 static void dlm_assert_master_worker(struct dlm_work_item *item, void *data);
  66 static int dlm_do_assert_master(struct dlm_ctxt *dlm,
  67                                 struct dlm_lock_resource *res,
  68                                 void *nodemap, u32 flags);
  69 static void dlm_deref_lockres_worker(struct dlm_work_item *item, void *data);
  70
  71 static inline void __dlm_mle_name(struct dlm_master_list_entry *mle,
  72                                   unsigned char **name, unsigned int *namelen)
  73 {
  74         BUG_ON(mle->type != DLM_MLE_BLOCK &&
  75                mle->type != DLM_MLE_MASTER &&
  76                mle->type != DLM_MLE_MIGRATION);
  77
  78         if (mle->type != DLM_MLE_MASTER) {
  79                 *name = mle->u.mlename.name;
  80                 *namelen = mle->u.mlename.len;
  81         } else {
  82                 *name  = (unsigned char *)mle->u.mleres->lockname.name;
  83                 *namelen = mle->u.mleres->lockname.len;
  84         }
  85 }
  86
  87 static inline int dlm_mle_equal(struct dlm_ctxt *dlm,
  88                                 struct dlm_master_list_entry *mle,
  89                                 const char *name,
  90                                 unsigned int namelen)
  91 {
  92         unsigned char *mlename;
  93         unsigned int mlelen;
  94
  95         if (dlm != mle->dlm)
  96                 return 0;
  97
  98         __dlm_mle_name(mle, &mlename, &mlelen);
  99
 100         if (namelen != mlelen || memcmp(name, mlename, namelen) != 0)
 101                 return 0;
 102
 103         return 1;
 104 }
 105
 106 static struct kmem_cache *dlm_lockres_cache = NULL;
 107 static struct kmem_cache *dlm_lockname_cache = NULL;
 108 static struct kmem_cache *dlm_mle_cache = NULL;
 109
 110 static void dlm_mle_release(struct kref *kref);
 111 static void dlm_init_mle(struct dlm_master_list_entry *mle,
 112                         enum dlm_mle_type type,
 113                         struct dlm_ctxt *dlm,
 114                         struct dlm_lock_resource *res,
 115                         const char *name,
 116                         unsigned int namelen);
 117 static void dlm_put_mle(struct dlm_master_list_entry *mle);
 118 static void __dlm_put_mle(struct dlm_master_list_entry *mle);
 119 static int dlm_find_mle(struct dlm_ctxt *dlm,
 120                         struct dlm_master_list_entry **mle,
 121                         char *name, unsigned int namelen);
 122
 123 static int dlm_do_master_request(struct dlm_lock_resource *res,
 124                                  struct dlm_master_list_entry *mle, int to);
 125
 126
 127 static int dlm_wait_for_lock_mastery(struct dlm_ctxt *dlm,
 128                                      struct dlm_lock_resource *res,
 129                                      struct dlm_master_list_entry *mle,
 130                                      int *blocked);
 131 static int dlm_restart_lock_mastery(struct dlm_ctxt *dlm,
 132                                     struct dlm_lock_resource *res,
 133                                     struct dlm_master_list_entry *mle,
 134                                     int blocked);
 135 static int dlm_add_migration_mle(struct dlm_ctxt *dlm,
 136                                  struct dlm_lock_resource *res,
 137                                  struct dlm_master_list_entry *mle,
 138                                  struct dlm_master_list_entry **oldmle,
 139                                  const char *name, unsigned int namelen,
 140                                  u8 new_master, u8 master);
 141
 142 static u8 dlm_pick_migration_target(struct dlm_ctxt *dlm,
 143                                     struct dlm_lock_resource *res);
 144 static void dlm_remove_nonlocal_locks(struct dlm_ctxt *dlm,
 145                                       struct dlm_lock_resource *res);
 146 static int dlm_mark_lockres_migrating(struct dlm_ctxt *dlm,
 147                                        struct dlm_lock_resource *res,
 148                                        u8 target);
 149 static int dlm_pre_master_reco_lockres(struct dlm_ctxt *dlm,
 150                                        struct dlm_lock_resource *res);
 151
 152
 153 int dlm_is_host_down(int errno)
 154 {
 155         switch (errno) {
 156                 case -EBADF:
 157                 case -ECONNREFUSED:
 158                 case -ENOTCONN:
 159                 case -ECONNRESET:
 160                 case -EPIPE:
 161                 case -EHOSTDOWN:
 162                 case -EHOSTUNREACH:
 163                 case -ETIMEDOUT:
 164                 case -ECONNABORTED:
 165                 case -ENETDOWN:
 166                 case -ENETUNREACH:
 167                 case -ENETRESET:
 168                 case -ESHUTDOWN:
 169                 case -ENOPROTOOPT:
 170                 case -EINVAL:   /* if returned from our tcp code,
 171                                    this means there is no socket */
 172                         return 1;
 173         }
 174         return 0;
 175 }
 176
 177
 178 /*
 179  * MASTER LIST FUNCTIONS
 180  */
 181
 182
 183 /*
 184  * regarding master list entries and heartbeat callbacks:
 185  *
 186  * in order to avoid sleeping and allocation that occurs in
 187  * heartbeat, master list entries are simply attached to the
 188  * dlm's established heartbeat callbacks.  the mle is attached
 189  * when it is created, and since the dlm->spinlock is held at
 190  * that time, any heartbeat event will be properly discovered
 191  * by the mle.  the mle needs to be detached from the
 192  * dlm->mle_hb_events list as soon as heartbeat events are no
 193  * longer useful to the mle, and before the mle is freed.
 194  *
 195  * as a general rule, heartbeat events are no longer needed by
 196  * the mle once an "answer" regarding the lock master has been
 197  * received.
 198  */
 199 static inline void __dlm_mle_attach_hb_events(struct dlm_ctxt *dlm,
 200                                               struct dlm_master_list_entry *mle)
 201 {
 202         assert_spin_locked(&dlm->spinlock);
 203
 204         list_add_tail(&mle->hb_events, &dlm->mle_hb_events);
 205 }
 206
 207
 208 static inline void __dlm_mle_detach_hb_events(struct dlm_ctxt *dlm,
 209                                               struct dlm_master_list_entry *mle)
 210 {
 211         if (!list_empty(&mle->hb_events))
 212                 list_del_init(&mle->hb_events);
 213 }
 214
 215
 216 static inline void dlm_mle_detach_hb_events(struct dlm_ctxt *dlm,
 217                                             struct dlm_master_list_entry *mle)
 218 {
 219         spin_lock(&dlm->spinlock);
 220         __dlm_mle_detach_hb_events(dlm, mle);
 221         spin_unlock(&dlm->spinlock);
 222 }
 223
 224 static void dlm_get_mle_inuse(struct dlm_master_list_entry *mle)
 225 {
 226         struct dlm_ctxt *dlm;
 227         dlm = mle->dlm;
 228
 229         assert_spin_locked(&dlm->spinlock);
 230         assert_spin_locked(&dlm->master_lock);
 231         mle->inuse++;
 232         kref_get(&mle->mle_refs);
 233 }
 234
 235 static void dlm_put_mle_inuse(struct dlm_master_list_entry *mle)
 236 {
 237         struct dlm_ctxt *dlm;
 238         dlm = mle->dlm;
 239
 240         spin_lock(&dlm->spinlock);
 241         spin_lock(&dlm->master_lock);
 242         mle->inuse--;
 243         __dlm_put_mle(mle);
 244         spin_unlock(&dlm->master_lock);
 245         spin_unlock(&dlm->spinlock);
 246
 247 }
 248
 249 /* remove from list and free */
 250 static void __dlm_put_mle(struct dlm_master_list_entry *mle)
 251 {
 252         struct dlm_ctxt *dlm;
 253         dlm = mle->dlm;
 254
 255         assert_spin_locked(&dlm->spinlock);
 256         assert_spin_locked(&dlm->master_lock);
 257         if (!atomic_read(&mle->mle_refs.refcount)) {
 258                 /* this may or may not crash, but who cares.
 259                  * it's a BUG. */
 260                 mlog(ML_ERROR, "bad mle: %p\n", mle);
 261                 dlm_print_one_mle(mle);
 262                 BUG();
 263         } else
 264                 kref_put(&mle->mle_refs, dlm_mle_release);
 265 }
 266
 267
 268 /* must not have any spinlocks coming in */
 269 static void dlm_put_mle(struct dlm_master_list_entry *mle)
 270 {
 271         struct dlm_ctxt *dlm;
 272         dlm = mle->dlm;
 273
 274         spin_lock(&dlm->spinlock);
 275         spin_lock(&dlm->master_lock);
 276         __dlm_put_mle(mle);
 277         spin_unlock(&dlm->master_lock);
 278         spin_unlock(&dlm->spinlock);
 279 }
 280
 281 static inline void dlm_get_mle(struct dlm_master_list_entry *mle)
 282 {
 283         kref_get(&mle->mle_refs);
 284 }
 285
 286 static void dlm_init_mle(struct dlm_master_list_entry *mle,
 287                         enum dlm_mle_type type,
 288                         struct dlm_ctxt *dlm,
 289                         struct dlm_lock_resource *res,
 290                         const char *name,
 291                         unsigned int namelen)
 292 {
 293         assert_spin_locked(&dlm->spinlock);
 294
 295         mle->dlm = dlm;
 296         mle->type = type;
 297         INIT_LIST_HEAD(&mle->list);
 298         INIT_LIST_HEAD(&mle->hb_events);
 299         memset(mle->maybe_map, 0, sizeof(mle->maybe_map));
 300         spin_lock_init(&mle->spinlock);
 301         init_waitqueue_head(&mle->wq);
 302         atomic_set(&mle->woken, 0);
 303         kref_init(&mle->mle_refs);
 304         memset(mle->response_map, 0, sizeof(mle->response_map));
 305         mle->master = O2NM_MAX_NODES;
 306         mle->new_master = O2NM_MAX_NODES;
 307         mle->inuse = 0;
 308
 309         BUG_ON(mle->type != DLM_MLE_BLOCK &&
 310                mle->type != DLM_MLE_MASTER &&
 311                mle->type != DLM_MLE_MIGRATION);
 312
 313         if (mle->type == DLM_MLE_MASTER) {
 314                 BUG_ON(!res);
 315                 mle->u.mleres = res;
 316         } else {
 317                 BUG_ON(!name);
 318                 memcpy(mle->u.mlename.name, name, namelen);
 319                 mle->u.mlename.len = namelen;
 320         }
 321
 322         /* copy off the node_map and register hb callbacks on our copy */
 323         memcpy(mle->node_map, dlm->domain_map, sizeof(mle->node_map));
 324         memcpy(mle->vote_map, dlm->domain_map, sizeof(mle->vote_map));
 325         clear_bit(dlm->node_num, mle->vote_map);
 326         clear_bit(dlm->node_num, mle->node_map);
 327
 328         /* attach the mle to the domain node up/down events */
 329         __dlm_mle_attach_hb_events(dlm, mle);
 330 }
 331
 332 void __dlm_unlink_mle(struct dlm_ctxt *dlm, struct dlm_master_list_entry *mle)
 333 {
 334         assert_spin_locked(&dlm->spinlock);
 335         assert_spin_locked(&dlm->master_lock);
 336
 337         if (!list_empty(&mle->list))
 338                 list_del_init(&mle->list);
 339 }
 340
 341 void __dlm_insert_mle(struct dlm_ctxt *dlm, struct dlm_master_list_entry *mle)
 342 {
 343         assert_spin_locked(&dlm->master_lock);
 344
 345         list_add(&mle->list, &dlm->master_list);
 346 }
 347
 348 /* returns 1 if found, 0 if not */
 349 static int dlm_find_mle(struct dlm_ctxt *dlm,
 350                         struct dlm_master_list_entry **mle,
 351                         char *name, unsigned int namelen)
 352 {
 353         struct dlm_master_list_entry *tmpmle;
 354
 355         assert_spin_locked(&dlm->master_lock);
 356
 357         list_for_each_entry(tmpmle, &dlm->master_list, list) {
 358                 if (!dlm_mle_equal(dlm, tmpmle, name, namelen))
 359                         continue;
 360                 dlm_get_mle(tmpmle);
 361                 *mle = tmpmle;
 362                 return 1;
 363         }
 364         return 0;
 365 }
 366
 367 void dlm_hb_event_notify_attached(struct dlm_ctxt *dlm, int idx, int node_up)
 368 {
 369         struct dlm_master_list_entry *mle;
 370
 371         assert_spin_locked(&dlm->spinlock);
 372
 373         list_for_each_entry(mle, &dlm->mle_hb_events, hb_events) {
 374                 if (node_up)
 375                         dlm_mle_node_up(dlm, mle, NULL, idx);
 376                 else
 377                         dlm_mle_node_down(dlm, mle, NULL, idx);
 378         }
 379 }
 380
 381 static void dlm_mle_node_down(struct dlm_ctxt *dlm,
 382                               struct dlm_master_list_entry *mle,
 383                               struct o2nm_node *node, int idx)
 384 {
 385         spin_lock(&mle->spinlock);
 386
 387         if (!test_bit(idx, mle->node_map))
 388                 mlog(0, "node %u already removed from nodemap!\n", idx);
 389         else
 390                 clear_bit(idx, mle->node_map);
 391
 392         spin_unlock(&mle->spinlock);
 393 }
 394
 395 static void dlm_mle_node_up(struct dlm_ctxt *dlm,
 396                             struct dlm_master_list_entry *mle,
 397                             struct o2nm_node *node, int idx)
 398 {
 399         spin_lock(&mle->spinlock);
 400
 401         if (test_bit(idx, mle->node_map))
 402                 mlog(0, "node %u already in node map!\n", idx);
 403         else
 404                 set_bit(idx, mle->node_map);
 405
 406         spin_unlock(&mle->spinlock);
 407 }
 408
 409
 410 int dlm_init_mle_cache(void)
 411 {
 412         dlm_mle_cache = kmem_cache_create("o2dlm_mle",
 413                                           sizeof(struct dlm_master_list_entry),
 414                                           0, SLAB_HWCACHE_ALIGN,
 415                                           NULL);
 416         if (dlm_mle_cache == NULL)
 417                 return -ENOMEM;
 418         return 0;
 419 }
 420
 421 void dlm_destroy_mle_cache(void)
 422 {
 423         if (dlm_mle_cache)
 424                 kmem_cache_destroy(dlm_mle_cache);
 425 }
 426
 427 static void dlm_mle_release(struct kref *kref)
 428 {
 429         struct dlm_master_list_entry *mle;
 430         struct dlm_ctxt *dlm;
 431
 432         mlog_entry_void();
 433
 434         mle = container_of(kref, struct dlm_master_list_entry, mle_refs);
 435         dlm = mle->dlm;
 436
 437         if (mle->type != DLM_MLE_MASTER) {
 438                 mlog(0, "calling mle_release for %.*s, type %d\n",
 439                      mle->u.mlename.len, mle->u.mlename.name, mle->type);
 440         } else {
 441                 mlog(0, "calling mle_release for %.*s, type %d\n",
 442                      mle->u.mleres->lockname.len,
 443                      mle->u.mleres->lockname.name, mle->type);
 444         }
 445         assert_spin_locked(&dlm->spinlock);
 446         assert_spin_locked(&dlm->master_lock);
 447
 448         /* remove from list if not already */
 449         __dlm_unlink_mle(dlm, mle);
 450
 451         /* detach the mle from the domain node up/down events */
 452         __dlm_mle_detach_hb_events(dlm, mle);
 453
 454         /* NOTE: kfree under spinlock here.
 455          * if this is bad, we can move this to a freelist. */
 456         kmem_cache_free(dlm_mle_cache, mle);
 457 }
 458
 459
 460 /*
 461  * LOCK RESOURCE FUNCTIONS
 462  */
 463
 464 int dlm_init_master_caches(void)
 465 {
 466         dlm_lockres_cache = kmem_cache_create("o2dlm_lockres",
 467                                               sizeof(struct dlm_lock_resource),
 468                                               0, SLAB_HWCACHE_ALIGN, NULL);
 469         if (!dlm_lockres_cache)
 470                 goto bail;
 471
 472         dlm_lockname_cache = kmem_cache_create("o2dlm_lockname",
 473                                                DLM_LOCKID_NAME_MAX, 0,
 474                                                SLAB_HWCACHE_ALIGN, NULL);
 475         if (!dlm_lockname_cache)
 476                 goto bail;
 477
 478         return 0;
 479 bail:
 480         dlm_destroy_master_caches();
 481         return -ENOMEM;
 482 }
 483
 484 void dlm_destroy_master_caches(void)
 485 {
 486         if (dlm_lockname_cache)
 487                 kmem_cache_destroy(dlm_lockname_cache);
 488
 489         if (dlm_lockres_cache)
 490                 kmem_cache_destroy(dlm_lockres_cache);
 491 }
 492
 493 static void dlm_set_lockres_owner(struct dlm_ctxt *dlm,
 494                                   struct dlm_lock_resource *res,
 495                                   u8 owner)
 496 {
 497         assert_spin_locked(&res->spinlock);
 498
 499         mlog_entry("%.*s, %u\n", res->lockname.len, res->lockname.name, owner);
 500
 501         if (owner == dlm->node_num)
 502                 atomic_inc(&dlm->local_resources);
 503         else if (owner == DLM_LOCK_RES_OWNER_UNKNOWN)
 504                 atomic_inc(&dlm->unknown_resources);
 505         else
 506                 atomic_inc(&dlm->remote_resources);
 507
 508         res->owner = owner;
 509 }
 510
 511 void dlm_change_lockres_owner(struct dlm_ctxt *dlm,
 512                               struct dlm_lock_resource *res, u8 owner)
 513 {
 514         assert_spin_locked(&res->spinlock);
 515
 516         if (owner == res->owner)
 517                 return;
 518
 519         if (res->owner == dlm->node_num)
 520                 atomic_dec(&dlm->local_resources);
 521         else if (res->owner == DLM_LOCK_RES_OWNER_UNKNOWN)
 522                 atomic_dec(&dlm->unknown_resources);
 523         else
 524                 atomic_dec(&dlm->remote_resources);
 525
 526         dlm_set_lockres_owner(dlm, res, owner);
 527 }
 528
 529
 530 static void dlm_lockres_release(struct kref *kref)
 531 {
 532         struct dlm_lock_resource *res;
 533         struct dlm_ctxt *dlm;
 534
 535         res = container_of(kref, struct dlm_lock_resource, refs);
 536         dlm = res->dlm;
 537
 538         /* This should not happen -- all lockres' have a name
 539          * associated with them at init time. */
 540         BUG_ON(!res->lockname.name);
 541
 542         mlog(0, "destroying lockres %.*s\n", res->lockname.len,
 543              res->lockname.name);
 544
 545         spin_lock(&dlm->track_lock);
 546         if (!list_empty(&res->tracking))
 547                 list_del_init(&res->tracking);
 548         else {
 549                 mlog(ML_ERROR, "Resource %.*s not on the Tracking list\n",
 550                      res->lockname.len, res->lockname.name);
 551                 dlm_print_one_lock_resource(res);
 552         }
 553         spin_unlock(&dlm->track_lock);
 554
 555         dlm_put(dlm);
 556
 557         if (!hlist_unhashed(&res->hash_node) ||
 558             !list_empty(&res->granted) ||
 559             !list_empty(&res->converting) ||
 560             !list_empty(&res->blocked) ||
 561             !list_empty(&res->dirty) ||
 562             !list_empty(&res->recovering) ||
 563             !list_empty(&res->purge)) {
 564                 mlog(ML_ERROR,
 565                      "Going to BUG for resource %.*s."
 566                      "  We're on a list! [%c%c%c%c%c%c%c]\n",
 567                      res->lockname.len, res->lockname.name,
 568                      !hlist_unhashed(&res->hash_node) ? 'H' : ' ',
 569                      !list_empty(&res->granted) ? 'G' : ' ',
 570                      !list_empty(&res->converting) ? 'C' : ' ',
 571                      !list_empty(&res->blocked) ? 'B' : ' ',
 572                      !list_empty(&res->dirty) ? 'D' : ' ',
 573                      !list_empty(&res->recovering) ? 'R' : ' ',
 574                      !list_empty(&res->purge) ? 'P' : ' ');
 575
 576                 dlm_print_one_lock_resource(res);
 577         }
 578
 579         /* By the time we're ready to blow this guy away, we shouldn't
 580          * be on any lists. */
 581         BUG_ON(!hlist_unhashed(&res->hash_node));
 582         BUG_ON(!list_empty(&res->granted));
 583         BUG_ON(!list_empty(&res->converting));
 584         BUG_ON(!list_empty(&res->blocked));
 585         BUG_ON(!list_empty(&res->dirty));
 586         BUG_ON(!list_empty(&res->recovering));
 587         BUG_ON(!list_empty(&res->purge));
 588
 589         kmem_cache_free(dlm_lockname_cache, (void *)res->lockname.name);
 590
 591         kmem_cache_free(dlm_lockres_cache, res);
 592 }
 593
 594 void dlm_lockres_put(struct dlm_lock_resource *res)
 595 {
 596         kref_put(&res->refs, dlm_lockres_release);
 597 }
 598
 599 static void dlm_init_lockres(struct dlm_ctxt *dlm,
 600                              struct dlm_lock_resource *res,
 601                              const char *name, unsigned int namelen)
 602 {
 603         char *qname;
 604
 605         /* If we memset here, we lose our reference to the kmalloc'd
 606          * res->lockname.name, so be sure to init every field
 607          * correctly! */
 608
 609         qname = (char *) res->lockname.name;
 610         memcpy(qname, name, namelen);
 611
 612         res->lockname.len = namelen;
 613         res->lockname.hash = dlm_lockid_hash(name, namelen);
 614
 615         init_waitqueue_head(&res->wq);
 616         spin_lock_init(&res->spinlock);
 617         INIT_HLIST_NODE(&res->hash_node);
 618         INIT_LIST_HEAD(&res->granted);
 619         INIT_LIST_HEAD(&res->converting);
 620         INIT_LIST_HEAD(&res->blocked);
 621         INIT_LIST_HEAD(&res->dirty);
 622         INIT_LIST_HEAD(&res->recovering);
 623         INIT_LIST_HEAD(&res->purge);
 624         INIT_LIST_HEAD(&res->tracking);
 625         atomic_set(&res->asts_reserved, 0);
 626         res->migration_pending = 0;
 627         res->inflight_locks = 0;
 628
 629         /* put in dlm_lockres_release */
 630         dlm_grab(dlm);
 631         res->dlm = dlm;
 632
 633         kref_init(&res->refs);
 634
 635         /* just for consistency */
 636         spin_lock(&res->spinlock);
 637         dlm_set_lockres_owner(dlm, res, DLM_LOCK_RES_OWNER_UNKNOWN);
 638         spin_unlock(&res->spinlock);
 639
 640         res->state = DLM_LOCK_RES_IN_PROGRESS;
 641
 642         res->last_used = 0;
 643
 644         spin_lock(&dlm->spinlock);
 645         list_add_tail(&res->tracking, &dlm->tracking_list);
 646         spin_unlock(&dlm->spinlock);
 647
 648         memset(res->lvb, 0, DLM_LVB_LEN);
 649         memset(res->refmap, 0, sizeof(res->refmap));
 650 }
 651
 652 struct dlm_lock_resource *dlm_new_lockres(struct dlm_ctxt *dlm,
 653                                    const char *name,
 654                                    unsigned int namelen)
 655 {
 656         struct dlm_lock_resource *res = NULL;
 657
 658         res = (struct dlm_lock_resource *)
 659                                 kmem_cache_zalloc(dlm_lockres_cache, GFP_NOFS);
 660         if (!res)
 661                 goto error;
 662
 663         res->lockname.name = (char *)
 664                                 kmem_cache_zalloc(dlm_lockname_cache, GFP_NOFS);
 665         if (!res->lockname.name)
 666                 goto error;
 667
 668         dlm_init_lockres(dlm, res, name, namelen);
 669         return res;
 670
 671 error:
 672         if (res && res->lockname.name)
 673                 kmem_cache_free(dlm_lockname_cache, (void *)res->lockname.name);
 674
 675         if (res)
 676                 kmem_cache_free(dlm_lockres_cache, res);
 677         return NULL;
 678 }
 679
 680 void __dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm,
 681                                    struct dlm_lock_resource *res,
 682                                    int new_lockres,
 683                                    const char *file,
 684                                    int line)
 685 {
 686         if (!new_lockres)
 687                 assert_spin_locked(&res->spinlock);
 688
 689         if (!test_bit(dlm->node_num, res->refmap)) {
 690                 BUG_ON(res->inflight_locks != 0);
 691                 dlm_lockres_set_refmap_bit(dlm->node_num, res);
 692         }
 693         res->inflight_locks++;
 694         mlog(0, "%s:%.*s: inflight++: now %u\n",
 695              dlm->name, res->lockname.len, res->lockname.name,
 696              res->inflight_locks);
 697 }
 698
 699 void __dlm_lockres_drop_inflight_ref(struct dlm_ctxt *dlm,
 700                                    struct dlm_lock_resource *res,
 701                                    const char *file,
 702                                    int line)
 703 {
 704         assert_spin_locked(&res->spinlock);
 705
 706         BUG_ON(res->inflight_locks == 0);
 707         res->inflight_locks--;
 708         mlog(0, "%s:%.*s: inflight--: now %u\n",
 709              dlm->name, res->lockname.len, res->lockname.name,
 710              res->inflight_locks);
 711         if (res->inflight_locks == 0)
 712                 dlm_lockres_clear_refmap_bit(dlm->node_num, res);
 713         wake_up(&res->wq);
 714 }
 715
 716 /*
 717  * lookup a lock resource by name.
 718  * may already exist in the hashtable.
 719  * lockid is null terminated
 720  *
 721  * if not, allocate enough for the lockres and for
 722  * the temporary structure used in doing the mastering.
 723  *
 724  * also, do a lookup in the dlm->master_list to see
 725  * if another node has begun mastering the same lock.
 726  * if so, there should be a block entry in there
 727  * for this name, and we should *not* attempt to master
 728  * the lock here.   need to wait around for that node
 729  * to assert_master (or die).
 730  *
 731  */
 732 struct dlm_lock_resource * dlm_get_lock_resource(struct dlm_ctxt *dlm,
 733                                           const char *lockid,
 734                                           int namelen,
 735                                           int flags)
 736 {
 737         struct dlm_lock_resource *tmpres=NULL, *res=NULL;
 738         struct dlm_master_list_entry *mle = NULL;
 739         struct dlm_master_list_entry *alloc_mle = NULL;
 740         int blocked = 0;
 741         int ret, nodenum;
 742         struct dlm_node_iter iter;
 743         unsigned int hash;
 744         int tries = 0;
 745         int bit, wait_on_recovery = 0;
 746         int drop_inflight_if_nonlocal = 0;
 747
 748         BUG_ON(!lockid);
 749
 750         hash = dlm_lockid_hash(lockid, namelen);
 751
 752         mlog(0, "get lockres %s (len %d)\n", lockid, namelen);
 753
 754 lookup:
 755         spin_lock(&dlm->spinlock);
 756         tmpres = __dlm_lookup_lockres_full(dlm, lockid, namelen, hash);
 757         if (tmpres) {
 758                 int dropping_ref = 0;
 759
 760                 spin_unlock(&dlm->spinlock);
 761
 762                 spin_lock(&tmpres->spinlock);
 763                 /* We wait for the other thread that is mastering the resource */
 764                 if (tmpres->owner == DLM_LOCK_RES_OWNER_UNKNOWN) {
 765                         __dlm_wait_on_lockres(tmpres);
 766                         BUG_ON(tmpres->owner == DLM_LOCK_RES_OWNER_UNKNOWN);
 767                 }
 768
 769                 if (tmpres->owner == dlm->node_num) {
 770                         BUG_ON(tmpres->state & DLM_LOCK_RES_DROPPING_REF);
 771                         dlm_lockres_grab_inflight_ref(dlm, tmpres);
 772                 } else if (tmpres->state & DLM_LOCK_RES_DROPPING_REF)
 773                         dropping_ref = 1;
 774                 spin_unlock(&tmpres->spinlock);
 775
 776                 /* wait until done messaging the master, drop our ref to allow
 777                  * the lockres to be purged, start over. */
 778                 if (dropping_ref) {
 779                         spin_lock(&tmpres->spinlock);
 780                         __dlm_wait_on_lockres_flags(tmpres, DLM_LOCK_RES_DROPPING_REF);
 781                         spin_unlock(&tmpres->spinlock);
 782                         dlm_lockres_put(tmpres);
 783                         tmpres = NULL;
 784                         goto lookup;
 785                 }
 786
 787                 mlog(0, "found in hash!\n");
 788                 if (res)
 789                         dlm_lockres_put(res);
 790                 res = tmpres;
 791                 goto leave;
 792         }
 793
 794         if (!res) {
 795                 spin_unlock(&dlm->spinlock);
 796                 mlog(0, "allocating a new resource\n");
 797                 /* nothing found and we need to allocate one. */
 798                 alloc_mle = (struct dlm_master_list_entry *)
 799                         kmem_cache_alloc(dlm_mle_cache, GFP_NOFS);
 800                 if (!alloc_mle)
 801                         goto leave;
 802                 res = dlm_new_lockres(dlm, lockid, namelen);
 803                 if (!res)
 804                         goto leave;
 805                 goto lookup;
 806         }
 807
 808         mlog(0, "no lockres found, allocated our own: %p\n", res);
 809
 810         if (flags & LKM_LOCAL) {
 811                 /* caller knows it's safe to assume it's not mastered elsewhere
 812                  * DONE!  return right away */
 813                 spin_lock(&res->spinlock);
 814                 dlm_change_lockres_owner(dlm, res, dlm->node_num);
 815                 __dlm_insert_lockres(dlm, res);
 816                 dlm_lockres_grab_inflight_ref(dlm, res);
 817                 spin_unlock(&res->spinlock);
 818                 spin_unlock(&dlm->spinlock);
 819                 /* lockres still marked IN_PROGRESS */
 820                 goto wake_waiters;
 821         }
 822
 823         /* check master list to see if another node has started mastering it */
 824         spin_lock(&dlm->master_lock);
 825
 826         /* if we found a block, wait for lock to be mastered by another node */
 827         blocked = dlm_find_mle(dlm, &mle, (char *)lockid, namelen);
 828         if (blocked) {
 829                 int mig;
 830                 if (mle->type == DLM_MLE_MASTER) {
 831                         mlog(ML_ERROR, "master entry for nonexistent lock!\n");
 832                         BUG();
 833                 }
 834                 mig = (mle->type == DLM_MLE_MIGRATION);
 835                 /* if there is a migration in progress, let the migration
 836                  * finish before continuing.  we can wait for the absence
 837                  * of the MIGRATION mle: either the migrate finished or
 838                  * one of the nodes died and the mle was cleaned up.
 839                  * if there is a BLOCK here, but it already has a master
 840                  * set, we are too late.  the master does not have a ref
 841                  * for us in the refmap.  detach the mle and drop it.
 842                  * either way, go back to the top and start over. */
 843                 if (mig || mle->master != O2NM_MAX_NODES) {
 844                         BUG_ON(mig && mle->master == dlm->node_num);
 845                         /* we arrived too late.  the master does not
 846                          * have a ref for us. retry. */
 847                         mlog(0, "%s:%.*s: late on %s\n",
 848                              dlm->name, namelen, lockid,
 849                              mig ?  "MIGRATION" : "BLOCK");
 850                         spin_unlock(&dlm->master_lock);
 851                         spin_unlock(&dlm->spinlock);
 852
 853                         /* master is known, detach */
 854                         if (!mig)
 855                                 dlm_mle_detach_hb_events(dlm, mle);
 856                         dlm_put_mle(mle);
 857                         mle = NULL;
 858                         /* this is lame, but we cant wait on either
 859                          * the mle or lockres waitqueue here */
 860                         if (mig)
 861                                 msleep(100);
 862                         goto lookup;
 863                 }
 864         } else {
 865                 /* go ahead and try to master lock on this node */
 866                 mle = alloc_mle;
 867                 /* make sure this does not get freed below */
 868                 alloc_mle = NULL;
 869                 dlm_init_mle(mle, DLM_MLE_MASTER, dlm, res, NULL, 0);
 870                 set_bit(dlm->node_num, mle->maybe_map);
 871                 __dlm_insert_mle(dlm, mle);
 872
 873                 /* still holding the dlm spinlock, check the recovery map
 874                  * to see if there are any nodes that still need to be
 875                  * considered.  these will not appear in the mle nodemap
 876                  * but they might own this lockres.  wait on them. */
 877                 bit = find_next_bit(dlm->recovery_map, O2NM_MAX_NODES, 0);
 878                 if (bit < O2NM_MAX_NODES) {
 879                         mlog(ML_NOTICE, "%s:%.*s: at least one node (%d) to "
 880                              "recover before lock mastery can begin\n",
 881                              dlm->name, namelen, (char *)lockid, bit);
 882                         wait_on_recovery = 1;
 883                 }
 884         }
 885
 886         /* at this point there is either a DLM_MLE_BLOCK or a
 887          * DLM_MLE_MASTER on the master list, so it's safe to add the
 888          * lockres to the hashtable.  anyone who finds the lock will
 889          * still have to wait on the IN_PROGRESS. */
 890
 891         /* finally add the lockres to its hash bucket */
 892         __dlm_insert_lockres(dlm, res);
 893         /* since this lockres is new it doesnt not require the spinlock */
 894         dlm_lockres_grab_inflight_ref_new(dlm, res);
 895
 896         /* if this node does not become the master make sure to drop
 897          * this inflight reference below */
 898         drop_inflight_if_nonlocal = 1;
 899
 900         /* get an extra ref on the mle in case this is a BLOCK
 901          * if so, the creator of the BLOCK may try to put the last
 902          * ref at this time in the assert master handler, so we
 903          * need an extra one to keep from a bad ptr deref. */
 904         dlm_get_mle_inuse(mle);
 905         spin_unlock(&dlm->master_lock);
 906         spin_unlock(&dlm->spinlock);
 907
 908 redo_request:
 909         while (wait_on_recovery) {
 910                 /* any cluster changes that occurred after dropping the
 911                  * dlm spinlock would be detectable be a change on the mle,
 912                  * so we only need to clear out the recovery map once. */
 913                 if (dlm_is_recovery_lock(lockid, namelen)) {
 914                         mlog(ML_NOTICE, "%s: recovery map is not empty, but "
 915                              "must master $RECOVERY lock now\n", dlm->name);
 916                         if (!dlm_pre_master_reco_lockres(dlm, res))
 917                                 wait_on_recovery = 0;
 918                         else {
 919                                 mlog(0, "%s: waiting 500ms for heartbeat state "
 920                                     "change\n", dlm->name);
 921                                 msleep(500);
 922                         }
 923                         continue;
 924                 }
 925
 926                 dlm_kick_recovery_thread(dlm);
 927                 msleep(1000);
 928                 dlm_wait_for_recovery(dlm);
 929
 930                 spin_lock(&dlm->spinlock);
 931                 bit = find_next_bit(dlm->recovery_map, O2NM_MAX_NODES, 0);
 932                 if (bit < O2NM_MAX_NODES) {
 933                         mlog(ML_NOTICE, "%s:%.*s: at least one node (%d) to "
 934                              "recover before lock mastery can begin\n",
 935                              dlm->name, namelen, (char *)lockid, bit);
 936                         wait_on_recovery = 1;
 937                 } else
 938                         wait_on_recovery = 0;
 939                 spin_unlock(&dlm->spinlock);
 940
 941                 if (wait_on_recovery)
 942                         dlm_wait_for_node_recovery(dlm, bit, 10000);
 943         }
 944
 945         /* must wait for lock to be mastered elsewhere */
 946         if (blocked)
 947                 goto wait;
 948
 949         ret = -EINVAL;
 950         dlm_node_iter_init(mle->vote_map, &iter);
 951         while ((nodenum = dlm_node_iter_next(&iter)) >= 0) {
 952                 ret = dlm_do_master_request(res, mle, nodenum);
 953                 if (ret < 0)
 954                         mlog_errno(ret);
 955                 if (mle->master != O2NM_MAX_NODES) {
 956                         /* found a master ! */
 957                         if (mle->master <= nodenum)
 958                                 break;
 959                         /* if our master request has not reached the master
 960                          * yet, keep going until it does.  this is how the
 961                          * master will know that asserts are needed back to
 962                          * the lower nodes. */
 963                         mlog(0, "%s:%.*s: requests only up to %u but master "
 964                              "is %u, keep going\n", dlm->name, namelen,
 965                              lockid, nodenum, mle->master);
 966                 }
 967         }
 968
 969 wait:
 970         /* keep going until the response map includes all nodes */
 971         ret = dlm_wait_for_lock_mastery(dlm, res, mle, &blocked);
 972         if (ret < 0) {
 973                 wait_on_recovery = 1;
 974                 mlog(0, "%s:%.*s: node map changed, redo the "
 975                      "master request now, blocked=%d\n",
 976                      dlm->name, res->lockname.len,
 977                      res->lockname.name, blocked);
 978                 if (++tries > 20) {
 979                         mlog(ML_ERROR, "%s:%.*s: spinning on "
 980                              "dlm_wait_for_lock_mastery, blocked=%d\n",
 981                              dlm->name, res->lockname.len,
 982                              res->lockname.name, blocked);
 983                         dlm_print_one_lock_resource(res);
 984                         dlm_print_one_mle(mle);
 985                         tries = 0;
 986                 }
 987                 goto redo_request;
 988         }
 989
 990         mlog(0, "lockres mastered by %u\n", res->owner);
 991         /* make sure we never continue without this */
 992         BUG_ON(res->owner == O2NM_MAX_NODES);
 993
 994         /* master is known, detach if not already detached */
 995         dlm_mle_detach_hb_events(dlm, mle);
 996         dlm_put_mle(mle);
 997         /* put the extra ref */
 998         dlm_put_mle_inuse(mle);
 999
1000 wake_waiters:
1001         spin_lock(&res->spinlock);
1002         if (res->owner != dlm->node_num && drop_inflight_if_nonlocal)
1003                 dlm_lockres_drop_inflight_ref(dlm, res);
1004         res->state &= ~DLM_LOCK_RES_IN_PROGRESS;
1005         spin_unlock(&res->spinlock);
1006         wake_up(&res->wq);
1007
1008 leave:
1009         /* need to free the unused mle */
1010         if (alloc_mle)
1011                 kmem_cache_free(dlm_mle_cache, alloc_mle);
1012
1013         return res;
1014 }
1015
1016
1017 #define DLM_MASTERY_TIMEOUT_MS   5000
1018
1019 static int dlm_wait_for_lock_mastery(struct dlm_ctxt *dlm,
1020                                      struct dlm_lock_resource *res,
1021                                      struct dlm_master_list_entry *mle,
1022                                      int *blocked)
1023 {
1024         u8 m;
1025         int ret, bit;
1026         int map_changed, voting_done;
1027         int assert, sleep;
1028
1029 recheck:
1030         ret = 0;
1031         assert = 0;
1032
1033         /* check if another node has already become the owner */
1034         spin_lock(&res->spinlock);
1035         if (res->owner != DLM_LOCK_RES_OWNER_UNKNOWN) {
1036                 mlog(0, "%s:%.*s: owner is suddenly %u\n", dlm->name,
1037                      res->lockname.len, res->lockname.name, res->owner);
1038                 spin_unlock(&res->spinlock);
1039                 /* this will cause the master to re-assert across
1040                  * the whole cluster, freeing up mles */
1041                 if (res->owner != dlm->node_num) {
1042                         ret = dlm_do_master_request(res, mle, res->owner);
1043                         if (ret < 0) {
1044                                 /* give recovery a chance to run */
1045                                 mlog(ML_ERROR, "link to %u went down?: %d\n", res->owner, ret);
1046                                 msleep(500);
1047                                 goto recheck;
1048                         }
1049                 }
1050                 ret = 0;
1051                 goto leave;
1052         }
1053         spin_unlock(&res->spinlock);
1054
1055         spin_lock(&mle->spinlock);
1056         m = mle->master;
1057         map_changed = (memcmp(mle->vote_map, mle->node_map,
1058                               sizeof(mle->vote_map)) != 0);
1059         voting_done = (memcmp(mle->vote_map, mle->response_map,
1060                              sizeof(mle->vote_map)) == 0);
1061
1062         /* restart if we hit any errors */
1063         if (map_changed) {
1064                 int b;
1065                 mlog(0, "%s: %.*s: node map changed, restarting\n",
1066                      dlm->name, res->lockname.len, res->lockname.name);
1067                 ret = dlm_restart_lock_mastery(dlm, res, mle, *blocked);
1068                 b = (mle->type == DLM_MLE_BLOCK);
1069                 if ((*blocked && !b) || (!*blocked && b)) {
1070                         mlog(0, "%s:%.*s: status change: old=%d new=%d\n",
1071                              dlm->name, res->lockname.len, res->lockname.name,
1072                              *blocked, b);
1073                         *blocked = b;
1074                 }
1075                 spin_unlock(&mle->spinlock);
1076                 if (ret < 0) {
1077                         mlog_errno(ret);
1078                         goto leave;
1079                 }
1080                 mlog(0, "%s:%.*s: restart lock mastery succeeded, "
1081                      "rechecking now\n", dlm->name, res->lockname.len,
1082                      res->lockname.name);
1083                 goto recheck;
1084         } else {
1085                 if (!voting_done) {
1086                         mlog(0, "map not changed and voting not done "
1087                              "for %s:%.*s\n", dlm->name, res->lockname.len,
1088                              res->lockname.name);
1089                 }
1090         }
1091
1092         if (m != O2NM_MAX_NODES) {
1093                 /* another node has done an assert!
1094                  * all done! */
1095                 sleep = 0;
1096         } else {
1097                 sleep = 1;
1098                 /* have all nodes responded? */
1099                 if (voting_done && !*blocked) {
1100                         bit = find_next_bit(mle->maybe_map, O2NM_MAX_NODES, 0);
1101                         if (dlm->node_num <= bit) {
1102                                 /* my node number is lowest.
1103                                  * now tell other nodes that I am
1104                                  * mastering this. */
1105                                 mle->master = dlm->node_num;
1106                                 /* ref was grabbed in get_lock_resource
1107                                  * will be dropped in dlmlock_master */
1108                                 assert = 1;
1109                                 sleep = 0;
1110                         }
1111                         /* if voting is done, but we have not received
1112                          * an assert master yet, we must sleep */
1113                 }
1114         }
1115
1116         spin_unlock(&mle->spinlock);
1117
1118         /* sleep if we haven't finished voting yet */
1119         if (sleep) {
1120                 unsigned long timeo = msecs_to_jiffies(DLM_MASTERY_TIMEOUT_MS);
1121
1122                 /*
1123                 if (atomic_read(&mle->mle_refs.refcount) < 2)
1124                         mlog(ML_ERROR, "mle (%p) refs=%d, name=%.*s\n", mle,
1125                         atomic_read(&mle->mle_refs.refcount),
1126                         res->lockname.len, res->lockname.name);
1127                 */
1128                 atomic_set(&mle->woken, 0);
1129                 (void)wait_event_timeout(mle->wq,
1130                                          (atomic_read(&mle->woken) == 1),
1131                                          timeo);
1132                 if (res->owner == O2NM_MAX_NODES) {
1133                         mlog(0, "%s:%.*s: waiting again\n", dlm->name,
1134                              res->lockname.len, res->lockname.name);
1135                         goto recheck;
1136                 }
1137                 mlog(0, "done waiting, master is %u\n", res->owner);
1138                 ret = 0;
1139                 goto leave;
1140         }
1141
1142         ret = 0;   /* done */
1143         if (assert) {
1144                 m = dlm->node_num;
1145                 mlog(0, "about to master %.*s here, this=%u\n",
1146                      res->lockname.len, res->lockname.name, m);
1147                 ret = dlm_do_assert_master(dlm, res, mle->vote_map, 0);
1148                 if (ret) {
1149                         /* This is a failure in the network path,
1150                          * not in the response to the assert_master
1151                          * (any nonzero response is a BUG on this node).
1152                          * Most likely a socket just got disconnected
1153                          * due to node death. */
1154                         mlog_errno(ret);
1155                 }
1156                 /* no longer need to restart lock mastery.
1157                  * all living nodes have been contacted. */
1158                 ret = 0;
1159         }
1160
1161         /* set the lockres owner */
1162         spin_lock(&res->spinlock);
1163         /* mastery reference obtained either during
1164          * assert_master_handler or in get_lock_resource */
1165         dlm_change_lockres_owner(dlm, res, m);
1166         spin_unlock(&res->spinlock);
1167
1168 leave:
1169         return ret;
1170 }
1171
1172 struct dlm_bitmap_diff_iter
1173 {
1174         int curnode;
1175         unsigned long *orig_bm;
1176         unsigned long *cur_bm;
1177         unsigned long diff_bm[BITS_TO_LONGS(O2NM_MAX_NODES)];
1178 };
1179
1180 enum dlm_node_state_change
1181 {
1182         NODE_DOWN = -1,
1183         NODE_NO_CHANGE = 0,
1184         NODE_UP
1185 };
1186
1187 static void dlm_bitmap_diff_iter_init(struct dlm_bitmap_diff_iter *iter,
1188                                       unsigned long *orig_bm,
1189                                       unsigned long *cur_bm)
1190 {
1191         unsigned long p1, p2;
1192         int i;
1193
1194         iter->curnode = -1;
1195         iter->orig_bm = orig_bm;
1196         iter->cur_bm = cur_bm;
1197
1198         for (i = 0; i < BITS_TO_LONGS(O2NM_MAX_NODES); i++) {
1199                 p1 = *(iter->orig_bm + i);
1200                 p2 = *(iter->cur_bm + i);
1201                 iter->diff_bm[i] = (p1 & ~p2) | (p2 & ~p1);
1202         }
1203 }
1204
1205 static int dlm_bitmap_diff_iter_next(struct dlm_bitmap_diff_iter *iter,
1206                                      enum dlm_node_state_change *state)
1207 {
1208         int bit;
1209
1210         if (iter->curnode >= O2NM_MAX_NODES)
1211                 return -ENOENT;
1212
1213         bit = find_next_bit(iter->diff_bm, O2NM_MAX_NODES,
1214                             iter->curnode+1);
1215         if (bit >= O2NM_MAX_NODES) {
1216                 iter->curnode = O2NM_MAX_NODES;
1217                 return -ENOENT;
1218         }
1219
1220         /* if it was there in the original then this node died */
1221         if (test_bit(bit, iter->orig_bm))
1222                 *state = NODE_DOWN;
1223         else
1224                 *state = NODE_UP;
1225
1226         iter->curnode = bit;
1227         return bit;
1228 }
1229
1230
1231 static int dlm_restart_lock_mastery(struct dlm_ctxt *dlm,
1232                                     struct dlm_lock_resource *res,
1233                                     struct dlm_master_list_entry *mle,
1234                                     int blocked)
1235 {
1236         struct dlm_bitmap_diff_iter bdi;
1237         enum dlm_node_state_change sc;
1238         int node;
1239         int ret = 0;
1240
1241         mlog(0, "something happened such that the "
1242              "master process may need to be restarted!\n");
1243
1244         assert_spin_locked(&mle->spinlock);
1245
1246         dlm_bitmap_diff_iter_init(&bdi, mle->vote_map, mle->node_map);
1247         node = dlm_bitmap_diff_iter_next(&bdi, &sc);
1248         while (node >= 0) {
1249                 if (sc == NODE_UP) {
1250                         /* a node came up.  clear any old vote from
1251                          * the response map and set it in the vote map
1252                          * then restart the mastery. */
1253                         mlog(ML_NOTICE, "node %d up while restarting\n", node);
1254
1255                         /* redo the master request, but only for the new node */
1256                         mlog(0, "sending request to new node\n");
1257                         clear_bit(node, mle->response_map);
1258                         set_bit(node, mle->vote_map);
1259                 } else {
1260                         mlog(ML_ERROR, "node down! %d\n", node);
1261                         if (blocked) {
1262                                 int lowest = find_next_bit(mle->maybe_map,
1263                                                        O2NM_MAX_NODES, 0);
1264
1265                                 /* act like it was never there */
1266                                 clear_bit(node, mle->maybe_map);
1267
1268                                 if (node == lowest) {
1269                                         mlog(0, "expected master %u died"
1270                                             " while this node was blocked "
1271                                             "waiting on it!\n", node);
1272                                         lowest = find_next_bit(mle->maybe_map,
1273                                                         O2NM_MAX_NODES,
1274                                                         lowest+1);
1275                                         if (lowest < O2NM_MAX_NODES) {
1276                                                 mlog(0, "%s:%.*s:still "
1277                                                      "blocked. waiting on %u "
1278                                                      "now\n", dlm->name,
1279                                                      res->lockname.len,
1280                                                      res->lockname.name,
1281                                                      lowest);
1282                                         } else {
1283                                                 /* mle is an MLE_BLOCK, but
1284                                                  * there is now nothing left to
1285                                                  * block on.  we need to return
1286                                                  * all the way back out and try
1287                                                  * again with an MLE_MASTER.
1288                                                  * dlm_do_local_recovery_cleanup
1289                                                  * has already run, so the mle
1290                                                  * refcount is ok */
1291                                                 mlog(0, "%s:%.*s: no "
1292                                                      "longer blocking. try to "
1293                                                      "master this here\n",
1294                                                      dlm->name,
1295                                                      res->lockname.len,
1296                                                      res->lockname.name);
1297                                                 mle->type = DLM_MLE_MASTER;
1298                                                 mle->u.mleres = res;
1299                                         }
1300                                 }
1301                         }
1302
1303                         /* now blank out everything, as if we had never
1304                          * contacted anyone */
1305                         memset(mle->maybe_map, 0, sizeof(mle->maybe_map));
1306                         memset(mle->response_map, 0, sizeof(mle->response_map));
1307                         /* reset the vote_map to the current node_map */
1308                         memcpy(mle->vote_map, mle->node_map,
1309                                sizeof(mle->node_map));
1310                         /* put myself into the maybe map */
1311                         if (mle->type != DLM_MLE_BLOCK)
1312                                 set_bit(dlm->node_num, mle->maybe_map);
1313                 }
1314                 ret = -EAGAIN;
1315                 node = dlm_bitmap_diff_iter_next(&bdi, &sc);
1316         }
1317         return ret;
1318 }
1319
1320
1321 /*
1322  * DLM_MASTER_REQUEST_MSG
1323  *
1324  * returns: 0 on success,
1325  *          -errno on a network error
1326  *
1327  * on error, the caller should assume the target node is "dead"
1328  *
1329  */
1330
1331 static int dlm_do_master_request(struct dlm_lock_resource *res,
1332                                  struct dlm_master_list_entry *mle, int to)
1333 {
1334         struct dlm_ctxt *dlm = mle->dlm;
1335         struct dlm_master_request request;
1336         int ret, response=0, resend;
1337         unsigned char *mlename;
1338         unsigned int mlenamelen;
1339
1340         memset(&request, 0, sizeof(request));
1341         request.node_idx = dlm->node_num;
1342
1343         BUG_ON(mle->type == DLM_MLE_MIGRATION);
1344
1345         __dlm_mle_name(mle, &mlename, &mlenamelen);
1346
1347         request.namelen = (u8)mlenamelen;
1348         memcpy(request.name, mlename, request.namelen);
1349
1350 again:
1351         ret = o2net_send_message(DLM_MASTER_REQUEST_MSG, dlm->key, &request,
1352                                  sizeof(request), to, &response);
1353         if (ret < 0)  {
1354                 if (ret == -ESRCH) {
1355                         /* should never happen */
1356                         mlog(ML_ERROR, "TCP stack not ready!\n");
1357                         BUG();
1358                 } else if (ret == -EINVAL) {
1359                         mlog(ML_ERROR, "bad args passed to o2net!\n");
1360                         BUG();
1361                 } else if (ret == -ENOMEM) {
1362                         mlog(ML_ERROR, "out of memory while trying to send "
1363                              "network message!  retrying\n");
1364                         /* this is totally crude */
1365                         msleep(50);
1366                         goto again;
1367                 } else if (!dlm_is_host_down(ret)) {
1368                         /* not a network error. bad. */
1369                         mlog_errno(ret);
1370                         mlog(ML_ERROR, "unhandled error!");
1371                         BUG();
1372                 }
1373                 /* all other errors should be network errors,
1374                  * and likely indicate node death */
1375                 mlog(ML_ERROR, "link to %d went down!\n", to);
1376                 goto out;
1377         }
1378
1379         ret = 0;
1380         resend = 0;
1381         spin_lock(&mle->spinlock);
1382         switch (response) {
1383                 case DLM_MASTER_RESP_YES:
1384                         set_bit(to, mle->response_map);
1385                         mlog(0, "node %u is the master, response=YES\n", to);
1386                         mlog(0, "%s:%.*s: master node %u now knows I have a "
1387                              "reference\n", dlm->name, res->lockname.len,
1388                              res->lockname.name, to);
1389                         mle->master = to;
1390                         break;
1391                 case DLM_MASTER_RESP_NO:
1392                         mlog(0, "node %u not master, response=NO\n", to);
1393                         set_bit(to, mle->response_map);
1394                         break;
1395                 case DLM_MASTER_RESP_MAYBE:
1396                         mlog(0, "node %u not master, response=MAYBE\n", to);
1397                         set_bit(to, mle->response_map);
1398                         set_bit(to, mle->maybe_map);
1399                         break;
1400                 case DLM_MASTER_RESP_ERROR:
1401                         mlog(0, "node %u hit an error, resending\n", to);
1402                         resend = 1;
1403                         response = 0;
1404                         break;
1405                 default:
1406                         mlog(ML_ERROR, "bad response! %u\n", response);
1407                         BUG();
1408         }
1409         spin_unlock(&mle->spinlock);
1410         if (resend) {
1411                 /* this is also totally crude */
1412                 msleep(50);
1413                 goto again;
1414         }
1415
1416 out:
1417         return ret;
1418 }
1419
1420 /*
1421  * locks that can be taken here:
1422  * dlm->spinlock
1423  * res->spinlock
1424  * mle->spinlock
1425  * dlm->master_list
1426  *
1427  * if possible, TRIM THIS DOWN!!!
1428  */
1429 int dlm_master_request_handler(struct o2net_msg *msg, u32 len, void *data,
1430                                void **ret_data)
1431 {
1432         u8 response = DLM_MASTER_RESP_MAYBE;
1433         struct dlm_ctxt *dlm = data;
1434         struct dlm_lock_resource *res = NULL;
1435         struct dlm_master_request *request = (struct dlm_master_request *) msg->buf;
1436         struct dlm_master_list_entry *mle = NULL, *tmpmle = NULL;
1437         char *name;
1438         unsigned int namelen, hash;
1439         int found, ret;
1440         int set_maybe;
1441         int dispatch_assert = 0;
1442
1443         if (!dlm_grab(dlm))
1444                 return DLM_MASTER_RESP_NO;
1445
1446         if (!dlm_domain_fully_joined(dlm)) {
1447                 response = DLM_MASTER_RESP_NO;
1448                 goto send_response;
1449         }
1450
1451         name = request->name;
1452         namelen = request->namelen;
1453         hash = dlm_lockid_hash(name, namelen);
1454
1455         if (namelen > DLM_LOCKID_NAME_MAX) {
1456                 response = DLM_IVBUFLEN;
1457                 goto send_response;
1458         }
1459
1460 way_up_top:
1461         spin_lock(&dlm->spinlock);
1462         res = __dlm_lookup_lockres(dlm, name, namelen, hash);
1463         if (res) {
1464                 spin_unlock(&dlm->spinlock);
1465
1466                 /* take care of the easy cases up front */
1467                 spin_lock(&res->spinlock);
1468                 if (res->state & (DLM_LOCK_RES_RECOVERING|
1469                                   DLM_LOCK_RES_MIGRATING)) {
1470                         spin_unlock(&res->spinlock);
1471                         mlog(0, "returning DLM_MASTER_RESP_ERROR since res is "
1472                              "being recovered/migrated\n");
1473                         response = DLM_MASTER_RESP_ERROR;
1474                         if (mle)
1475                                 kmem_cache_free(dlm_mle_cache, mle);
1476                         goto send_response;
1477                 }
1478
1479                 if (res->owner == dlm->node_num) {
1480                         mlog(0, "%s:%.*s: setting bit %u in refmap\n",
1481                              dlm->name, namelen, name, request->node_idx);
1482                         dlm_lockres_set_refmap_bit(request->node_idx, res);
1483                         spin_unlock(&res->spinlock);
1484                         response = DLM_MASTER_RESP_YES;
1485                         if (mle)
1486                                 kmem_cache_free(dlm_mle_cache, mle);
1487
1488                         /* this node is the owner.
1489                          * there is some extra work that needs to
1490                          * happen now.  the requesting node has
1491                          * caused all nodes up to this one to
1492                          * create mles.  this node now needs to
1493                          * go back and clean those up. */
1494                         dispatch_assert = 1;
1495                         goto send_response;
1496                 } else if (res->owner != DLM_LOCK_RES_OWNER_UNKNOWN) {
1497                         spin_unlock(&res->spinlock);
1498                         // mlog(0, "node %u is the master\n", res->owner);
1499                         response = DLM_MASTER_RESP_NO;
1500                         if (mle)
1501                                 kmem_cache_free(dlm_mle_cache, mle);
1502                         goto send_response;
1503                 }
1504
1505                 /* ok, there is no owner.  either this node is
1506                  * being blocked, or it is actively trying to
1507                  * master this lock. */
1508                 if (!(res->state & DLM_LOCK_RES_IN_PROGRESS)) {
1509                         mlog(ML_ERROR, "lock with no owner should be "
1510                              "in-progress!\n");
1511                         BUG();
1512                 }
1513
1514                 // mlog(0, "lockres is in progress...\n");
1515                 spin_lock(&dlm->master_lock);
1516                 found = dlm_find_mle(dlm, &tmpmle, name, namelen);
1517                 if (!found) {
1518                         mlog(ML_ERROR, "no mle found for this lock!\n");
1519                         BUG();
1520                 }
1521                 set_maybe = 1;
1522                 spin_lock(&tmpmle->spinlock);
1523                 if (tmpmle->type == DLM_MLE_BLOCK) {
1524                         // mlog(0, "this node is waiting for "
1525                         // "lockres to be mastered\n");
1526                         response = DLM_MASTER_RESP_NO;
1527                 } else if (tmpmle->type == DLM_MLE_MIGRATION) {
1528                         mlog(0, "node %u is master, but trying to migrate to "
1529                              "node %u.\n", tmpmle->master, tmpmle->new_master);
1530                         if (tmpmle->master == dlm->node_num) {
1531                                 mlog(ML_ERROR, "no owner on lockres, but this "
1532                                      "node is trying to migrate it to %u?!\n",
1533                                      tmpmle->new_master);
1534                                 BUG();
1535                         } else {
1536                                 /* the real master can respond on its own */
1537                                 response = DLM_MASTER_RESP_NO;
1538                         }
1539                 } else if (tmpmle->master != DLM_LOCK_RES_OWNER_UNKNOWN) {
1540                         set_maybe = 0;
1541                         if (tmpmle->master == dlm->node_num) {
1542                                 response = DLM_MASTER_RESP_YES;
1543                                 /* this node will be the owner.
1544                                  * go back and clean the mles on any
1545                                  * other nodes */
1546                                 dispatch_assert = 1;
1547                                 dlm_lockres_set_refmap_bit(request->node_idx, res);
1548                                 mlog(0, "%s:%.*s: setting bit %u in refmap\n",
1549                                      dlm->name, namelen, name,
1550                                      request->node_idx);
1551                         } else
1552                                 response = DLM_MASTER_RESP_NO;
1553                 } else {
1554                         // mlog(0, "this node is attempting to "
1555                         // "master lockres\n");
1556                         response = DLM_MASTER_RESP_MAYBE;
1557                 }
1558                 if (set_maybe)
1559                         set_bit(request->node_idx, tmpmle->maybe_map);
1560                 spin_unlock(&tmpmle->spinlock);
1561
1562                 spin_unlock(&dlm->master_lock);
1563                 spin_unlock(&res->spinlock);
1564
1565                 /* keep the mle attached to heartbeat events */
1566                 dlm_put_mle(tmpmle);
1567                 if (mle)
1568                         kmem_cache_free(dlm_mle_cache, mle);
1569                 goto send_response;
1570         }
1571
1572         /*
1573          * lockres doesn't exist on this node
1574          * if there is an MLE_BLOCK, return NO
1575          * if there is an MLE_MASTER, return MAYBE
1576          * otherwise, add an MLE_BLOCK, return NO
1577          */
1578         spin_lock(&dlm->master_lock);
1579         found = dlm_find_mle(dlm, &tmpmle, name, namelen);
1580         if (!found) {
1581                 /* this lockid has never been seen on this node yet */
1582                 // mlog(0, "no mle found\n");
1583                 if (!mle) {
1584                         spin_unlock(&dlm->master_lock);
1585                         spin_unlock(&dlm->spinlock);
1586
1587                         mle = (struct dlm_master_list_entry *)
1588                                 kmem_cache_alloc(dlm_mle_cache, GFP_NOFS);
1589                         if (!mle) {
1590                                 response = DLM_MASTER_RESP_ERROR;
1591                                 mlog_errno(-ENOMEM);
1592                                 goto send_response;
1593                         }
1594                         goto way_up_top;
1595                 }
1596
1597                 // mlog(0, "this is second time thru, already allocated, "
1598                 // "add the block.\n");
1599                 dlm_init_mle(mle, DLM_MLE_BLOCK, dlm, NULL, name, namelen);
1600                 set_bit(request->node_idx, mle->maybe_map);
1601                 __dlm_insert_mle(dlm, mle);
1602                 response = DLM_MASTER_RESP_NO;
1603         } else {
1604                 // mlog(0, "mle was found\n");
1605                 set_maybe = 1;
1606                 spin_lock(&tmpmle->spinlock);
1607                 if (tmpmle->master == dlm->node_num) {
1608                         mlog(ML_ERROR, "no lockres, but an mle with this node as master!\n");
1609                         BUG();
1610                 }
1611                 if (tmpmle->type == DLM_MLE_BLOCK)
1612                         response = DLM_MASTER_RESP_NO;
1613                 else if (tmpmle->type == DLM_MLE_MIGRATION) {
1614                         mlog(0, "migration mle was found (%u->%u)\n",
1615                              tmpmle->master, tmpmle->new_master);
1616                         /* real master can respond on its own */
1617                         response = DLM_MASTER_RESP_NO;
1618                 } else
1619                         response = DLM_MASTER_RESP_MAYBE;
1620                 if (set_maybe)
1621                         set_bit(request->node_idx, tmpmle->maybe_map);
1622                 spin_unlock(&tmpmle->spinlock);
1623         }
1624         spin_unlock(&dlm->master_lock);
1625         spin_unlock(&dlm->spinlock);
1626
1627         if (found) {
1628                 /* keep the mle attached to heartbeat events */
1629                 dlm_put_mle(tmpmle);
1630         }
1631 send_response:
1632         /*
1633          * __dlm_lookup_lockres() grabbed a reference to this lockres.
1634          * The reference is released by dlm_assert_master_worker() under
1635          * the call to dlm_dispatch_assert_master().  If
1636          * dlm_assert_master_worker() isn't called, we drop it here.
1637          */
1638         if (dispatch_assert) {
1639                 if (response != DLM_MASTER_RESP_YES)
1640                         mlog(ML_ERROR, "invalid response %d\n", response);
1641                 if (!res) {
1642                         mlog(ML_ERROR, "bad lockres while trying to assert!\n");
1643                         BUG();
1644                 }
1645                 mlog(0, "%u is the owner of %.*s, cleaning everyone else\n",
1646                              dlm->node_num, res->lockname.len, res->lockname.name);
1647                 ret = dlm_dispatch_assert_master(dlm, res, 0, request->node_idx,
1648                                                  DLM_ASSERT_MASTER_MLE_CLEANUP);
1649                 if (ret < 0) {
1650                         mlog(ML_ERROR, "failed to dispatch assert master work\n");
1651                         response = DLM_MASTER_RESP_ERROR;
1652                         dlm_lockres_put(res);
1653                 }
1654         } else {
1655                 if (res)
1656                         dlm_lockres_put(res);
1657         }
1658
1659         dlm_put(dlm);
1660         return response;
1661 }
1662
1663 /*
1664  * DLM_ASSERT_MASTER_MSG
1665  */
1666
1667
1668 /*
1669  * NOTE: this can be used for debugging
1670  * can periodically run all locks owned by this node
1671  * and re-assert across the cluster...
1672  */
1673 static int dlm_do_assert_master(struct dlm_ctxt *dlm,
1674                                 struct dlm_lock_resource *res,
1675                                 void *nodemap, u32 flags)
1676 {
1677         struct dlm_assert_master assert;
1678         int to, tmpret;
1679         struct dlm_node_iter iter;
1680         int ret = 0;
1681         int reassert;
1682         const char *lockname = res->lockname.name;
1683         unsigned int namelen = res->lockname.len;
1684
1685         BUG_ON(namelen > O2NM_MAX_NAME_LEN);
1686
1687         spin_lock(&res->spinlock);
1688         res->state |= DLM_LOCK_RES_SETREF_INPROG;
1689         spin_unlock(&res->spinlock);
1690
1691 again:
1692         reassert = 0;
1693
1694         /* note that if this nodemap is empty, it returns 0 */
1695         dlm_node_iter_init(nodemap, &iter);
1696         while ((to = dlm_node_iter_next(&iter)) >= 0) {
1697                 int r = 0;
1698                 struct dlm_master_list_entry *mle = NULL;
1699
1700                 mlog(0, "sending assert master to %d (%.*s)\n", to,
1701                      namelen, lockname);
1702                 memset(&assert, 0, sizeof(assert));
1703                 assert.node_idx = dlm->node_num;
1704                 assert.namelen = namelen;
1705                 memcpy(assert.name, lockname, namelen);
1706                 assert.flags = cpu_to_be32(flags);
1707
1708                 tmpret = o2net_send_message(DLM_ASSERT_MASTER_MSG, dlm->key,
1709                                             &assert, sizeof(assert), to, &r);
1710                 if (tmpret < 0) {
1711                         mlog(0, "assert_master returned %d!\n", tmpret);
1712                         if (!dlm_is_host_down(tmpret)) {
1713                                 mlog(ML_ERROR, "unhandled error=%d!\n", tmpret);
1714                                 BUG();
1715                         }
1716                         /* a node died.  finish out the rest of the nodes. */
1717                         mlog(0, "link to %d went down!\n", to);
1718                         /* any nonzero status return will do */
1719                         ret = tmpret;
1720                         r = 0;
1721                 } else if (r < 0) {
1722                         /* ok, something horribly messed.  kill thyself. */
1723                         mlog(ML_ERROR,"during assert master of %.*s to %u, "
1724                              "got %d.\n", namelen, lockname, to, r);
1725                         spin_lock(&dlm->spinlock);
1726                         spin_lock(&dlm->master_lock);
1727                         if (dlm_find_mle(dlm, &mle, (char *)lockname,
1728                                          namelen)) {
1729                                 dlm_print_one_mle(mle);
1730                                 __dlm_put_mle(mle);
1731                         }
1732                         spin_unlock(&dlm->master_lock);
1733                         spin_unlock(&dlm->spinlock);
1734                         BUG();
1735                 }
1736
1737                 if (r & DLM_ASSERT_RESPONSE_REASSERT &&
1738                     !(r & DLM_ASSERT_RESPONSE_MASTERY_REF)) {
1739                                 mlog(ML_ERROR, "%.*s: very strange, "
1740                                      "master MLE but no lockres on %u\n",
1741                                      namelen, lockname, to);
1742                 }
1743
1744                 if (r & DLM_ASSERT_RESPONSE_REASSERT) {
1745                         mlog(0, "%.*s: node %u create mles on other "
1746                              "nodes and requests a re-assert\n",
1747                              namelen, lockname, to);
1748                         reassert = 1;
1749                 }
1750                 if (r & DLM_ASSERT_RESPONSE_MASTERY_REF) {
1751                         mlog(0, "%.*s: node %u has a reference to this "
1752                              "lockres, set the bit in the refmap\n",
1753                              namelen, lockname, to);
1754                         spin_lock(&res->spinlock);
1755                         dlm_lockres_set_refmap_bit(to, res);
1756                         spin_unlock(&res->spinlock);
1757                 }
1758         }
1759
1760         if (reassert)
1761                 goto again;
1762
1763         spin_lock(&res->spinlock);
1764         res->state &= ~DLM_LOCK_RES_SETREF_INPROG;
1765         spin_unlock(&res->spinlock);
1766         wake_up(&res->wq);
1767
1768         return ret;
1769 }
1770
1771 /*
1772  * locks that can be taken here:
1773  * dlm->spinlock
1774  * res->spinlock
1775  * mle->spinlock
1776  * dlm->master_list
1777  *
1778  * if possible, TRIM THIS DOWN!!!
1779  */
1780 int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data,
1781                               void **ret_data)
1782 {
1783         struct dlm_ctxt *dlm = data;
1784         struct dlm_master_list_entry *mle = NULL;
1785         struct dlm_assert_master *assert = (struct dlm_assert_master *)msg->buf;
1786         struct dlm_lock_resource *res = NULL;
1787         char *name;
1788         unsigned int namelen, hash;
1789         u32 flags;
1790         int master_request = 0, have_lockres_ref = 0;
1791         int ret = 0;
1792
1793         if (!dlm_grab(dlm))
1794                 return 0;
1795
1796         name = assert->name;
1797         namelen = assert->namelen;
1798         hash = dlm_lockid_hash(name, namelen);
1799         flags = be32_to_cpu(assert->flags);
1800
1801         if (namelen > DLM_LOCKID_NAME_MAX) {
1802                 mlog(ML_ERROR, "Invalid name length!");
1803                 goto done;
1804         }
1805
1806         spin_lock(&dlm->spinlock);
1807
1808         if (flags)
1809                 mlog(0, "assert_master with flags: %u\n", flags);
1810
1811         /* find the MLE */
1812         spin_lock(&dlm->master_lock);
1813         if (!dlm_find_mle(dlm, &mle, name, namelen)) {
1814                 /* not an error, could be master just re-asserting */
1815                 mlog(0, "just got an assert_master from %u, but no "
1816                      "MLE for it! (%.*s)\n", assert->node_idx,
1817                      namelen, name);
1818         } else {
1819                 int bit = find_next_bit (mle->maybe_map, O2NM_MAX_NODES, 0);
1820                 if (bit >= O2NM_MAX_NODES) {
1821                         /* not necessarily an error, though less likely.
1822                          * could be master just re-asserting. */
1823                         mlog(0, "no bits set in the maybe_map, but %u "
1824                              "is asserting! (%.*s)\n", assert->node_idx,
1825                              namelen, name);
1826                 } else if (bit != assert->node_idx) {
1827                         if (flags & DLM_ASSERT_MASTER_MLE_CLEANUP) {
1828                                 mlog(0, "master %u was found, %u should "
1829                                      "back off\n", assert->node_idx, bit);
1830                         } else {
1831                                 /* with the fix for bug 569, a higher node
1832                                  * number winning the mastery will respond
1833                                  * YES to mastery requests, but this node
1834                                  * had no way of knowing.  let it pass. */
1835                                 mlog(0, "%u is the lowest node, "
1836                                      "%u is asserting. (%.*s)  %u must "
1837                                      "have begun after %u won.\n", bit,
1838                                      assert->node_idx, namelen, name, bit,
1839                                      assert->node_idx);
1840                         }
1841                 }
1842                 if (mle->type == DLM_MLE_MIGRATION) {
1843                         if (flags & DLM_ASSERT_MASTER_MLE_CLEANUP) {
1844                                 mlog(0, "%s:%.*s: got cleanup assert"
1845                                      " from %u for migration\n",
1846                                      dlm->name, namelen, name,
1847                                      assert->node_idx);
1848                         } else if (!(flags & DLM_ASSERT_MASTER_FINISH_MIGRATION)) {
1849                                 mlog(0, "%s:%.*s: got unrelated assert"
1850                                      " from %u for migration, ignoring\n",
1851                                      dlm->name, namelen, name,
1852                                      assert->node_idx);
1853                                 __dlm_put_mle(mle);
1854                                 spin_unlock(&dlm->master_lock);
1855                                 spin_unlock(&dlm->spinlock);
1856                                 goto done;
1857                         }
1858                 }
1859         }
1860         spin_unlock(&dlm->master_lock);
1861
1862         /* ok everything checks out with the MLE
1863          * now check to see if there is a lockres */
1864         res = __dlm_lookup_lockres(dlm, name, namelen, hash);
1865         if (res) {
1866                 spin_lock(&res->spinlock);
1867                 if (res->state & DLM_LOCK_RES_RECOVERING)  {
1868                         mlog(ML_ERROR, "%u asserting but %.*s is "
1869                              "RECOVERING!\n", assert->node_idx, namelen, name);
1870                         goto kill;
1871                 }
1872                 if (!mle) {
1873                         if (res->owner != DLM_LOCK_RES_OWNER_UNKNOWN &&
1874                             res->owner != assert->node_idx) {
1875                                 mlog(ML_ERROR, "DIE! Mastery assert from %u, "
1876                                      "but current owner is %u! (%.*s)\n",
1877                                      assert->node_idx, res->owner, namelen,
1878                                      name);
1879                                 __dlm_print_one_lock_resource(res);
1880                                 BUG();
1881                         }
1882                 } else if (mle->type != DLM_MLE_MIGRATION) {
1883                         if (res->owner != DLM_LOCK_RES_OWNER_UNKNOWN) {
1884                                 /* owner is just re-asserting */
1885                                 if (res->owner == assert->node_idx) {
1886                                         mlog(0, "owner %u re-asserting on "
1887                                              "lock %.*s\n", assert->node_idx,
1888                                              namelen, name);
1889                                         goto ok;
1890                                 }
1891                                 mlog(ML_ERROR, "got assert_master from "
1892                                      "node %u, but %u is the owner! "
1893                                      "(%.*s)\n", assert->node_idx,
1894                                      res->owner, namelen, name);
1895                                 goto kill;
1896                         }
1897                         if (!(res->state & DLM_LOCK_RES_IN_PROGRESS)) {
1898                                 mlog(ML_ERROR, "got assert from %u, but lock "
1899                                      "with no owner should be "
1900                                      "in-progress! (%.*s)\n",
1901                                      assert->node_idx,
1902                                      namelen, name);
1903                                 goto kill;
1904                         }
1905                 } else /* mle->type == DLM_MLE_MIGRATION */ {
1906                         /* should only be getting an assert from new master */
1907                         if (assert->node_idx != mle->new_master) {
1908                                 mlog(ML_ERROR, "got assert from %u, but "
1909                                      "new master is %u, and old master "
1910                                      "was %u (%.*s)\n",
1911                                      assert->node_idx, mle->new_master,
1912                                      mle->master, namelen, name);
1913                                 goto kill;
1914                         }
1915
1916                 }
1917 ok:
1918                 spin_unlock(&res->spinlock);
1919         }
1920         spin_unlock(&dlm->spinlock);
1921
1922         // mlog(0, "woo!  got an assert_master from node %u!\n",
1923         //           assert->node_idx);
1924         if (mle) {
1925                 int extra_ref = 0;
1926                 int nn = -1;
1927                 int rr, err = 0;
1928
1929                 spin_lock(&mle->spinlock);
1930                 if (mle->type == DLM_MLE_BLOCK || mle->type == DLM_MLE_MIGRATION)
1931                         extra_ref = 1;
1932                 else {
1933                         /* MASTER mle: if any bits set in the response map
1934                          * then the calling node needs to re-assert to clear
1935                          * up nodes that this node contacted */
1936                         while ((nn = find_next_bit (mle->response_map, O2NM_MAX_NODES,
1937                                                     nn+1)) < O2NM_MAX_NODES) {
1938                                 if (nn != dlm->node_num && nn != assert->node_idx)
1939                                         master_request = 1;
1940                         }
1941                 }
1942                 mle->master = assert->node_idx;
1943                 atomic_set(&mle->woken, 1);
1944                 wake_up(&mle->wq);
1945                 spin_unlock(&mle->spinlock);
1946
1947                 if (res) {
1948                         int wake = 0;
1949                         spin_lock(&res->spinlock);
1950                         if (mle->type == DLM_MLE_MIGRATION) {
1951                                 mlog(0, "finishing off migration of lockres %.*s, "
1952                                         "from %u to %u\n",
1953                                         res->lockname.len, res->lockname.name,
1954                                         dlm->node_num, mle->new_master);
1955                                 res->state &= ~DLM_LOCK_RES_MIGRATING;
1956                                 wake = 1;
1957                                 dlm_change_lockres_owner(dlm, res, mle->new_master);
1958                                 BUG_ON(res->state & DLM_LOCK_RES_DIRTY);
1959                         } else {
1960                                 dlm_change_lockres_owner(dlm, res, mle->master);
1961                         }
1962                         spin_unlock(&res->spinlock);
1963                         have_lockres_ref = 1;
1964                         if (wake)
1965                                 wake_up(&res->wq);
1966                 }
1967
1968                 /* master is known, detach if not already detached.
1969                  * ensures that only one assert_master call will happen
1970                  * on this mle. */
1971                 spin_lock(&dlm->spinlock);
1972                 spin_lock(&dlm->master_lock);
1973
1974                 rr = atomic_read(&mle->mle_refs.refcount);
1975                 if (mle->inuse > 0) {
1976                         if (extra_ref && rr < 3)
1977                                 err = 1;
1978                         else if (!extra_ref && rr < 2)
1979                                 err = 1;
1980                 } else {
1981                         if (extra_ref && rr < 2)
1982                                 err = 1;
1983                         else if (!extra_ref && rr < 1)
1984                                 err = 1;
1985                 }
1986                 if (err) {
1987                         mlog(ML_ERROR, "%s:%.*s: got assert master from %u "
1988                              "that will mess up this node, refs=%d, extra=%d, "
1989                              "inuse=%d\n", dlm->name, namelen, name,
1990                              assert->node_idx, rr, extra_ref, mle->inuse);
1991                         dlm_print_one_mle(mle);
1992                 }
1993                 __dlm_unlink_mle(dlm, mle);
1994                 __dlm_mle_detach_hb_events(dlm, mle);
1995                 __dlm_put_mle(mle);
1996                 if (extra_ref) {
1997                         /* the assert master message now balances the extra
1998                          * ref given by the master / migration request message.
1999                          * if this is the last put, it will be removed
2000                          * from the list. */
2001                         __dlm_put_mle(mle);
2002                 }
2003                 spin_unlock(&dlm->master_lock);
2004                 spin_unlock(&dlm->spinlock);
2005         } else if (res) {
2006                 if (res->owner != assert->node_idx) {
2007                         mlog(0, "assert_master from %u, but current "
2008                              "owner is %u (%.*s), no mle\n", assert->node_idx,
2009                              res->owner, namelen, name);
2010                 }
2011         }
2012
2013 done:
2014         ret = 0;
2015         if (res) {
2016                 spin_lock(&res->spinlock);
2017                 res->state |= DLM_LOCK_RES_SETREF_INPROG;
2018                 spin_unlock(&res->spinlock);
2019                 *ret_data = (void *)res;
2020         }
2021         dlm_put(dlm);
2022         if (master_request) {
2023                 mlog(0, "need to tell master to reassert\n");
2024                 /* positive. negative would shoot down the node. */
2025                 ret |= DLM_ASSERT_RESPONSE_REASSERT;
2026                 if (!have_lockres_ref) {
2027                         mlog(ML_ERROR, "strange, got assert from %u, MASTER "
2028                              "mle present here for %s:%.*s, but no lockres!\n",
2029                              assert->node_idx, dlm->name, namelen, name);
2030                 }
2031         }
2032         if (have_lockres_ref) {
2033                 /* let the master know we have a reference to the lockres */
2034                 ret |= DLM_ASSERT_RESPONSE_MASTERY_REF;
2035                 mlog(0, "%s:%.*s: got assert from %u, need a ref\n",
2036                      dlm->name, namelen, name, assert->node_idx);
2037         }
2038         return ret;
2039
2040 kill:
2041         /* kill the caller! */
2042         mlog(ML_ERROR, "Bad message received from another node.  Dumping state "
2043              "and killing the other node now!  This node is OK and can continue.\n");
2044         __dlm_print_one_lock_resource(res);
2045         spin_unlock(&res->spinlock);
2046         spin_unlock(&dlm->spinlock);
2047         *ret_data = (void *)res;
2048         dlm_put(dlm);
2049         return -EINVAL;
2050 }
2051
2052 void dlm_assert_master_post_handler(int status, void *data, void *ret_data)
2053 {
2054         struct dlm_lock_resource *res = (struct dlm_lock_resource *)ret_data;
2055
2056         if (ret_data) {
2057                 spin_lock(&res->spinlock);
2058                 res->state &= ~DLM_LOCK_RES_SETREF_INPROG;
2059                 spin_unlock(&res->spinlock);
2060                 wake_up(&res->wq);
2061                 dlm_lockres_put(res);
2062         }
2063         return;
2064 }
2065
2066 int dlm_dispatch_assert_master(struct dlm_ctxt *dlm,
2067                                struct dlm_lock_resource *res,
2068                                int ignore_higher, u8 request_from, u32 flags)
2069 {
2070         struct dlm_work_item *item;
2071         item = kzalloc(sizeof(*item), GFP_NOFS);
2072         if (!item)
2073                 return -ENOMEM;
2074
2075
2076         /* queue up work for dlm_assert_master_worker */
2077         dlm_grab(dlm);  /* get an extra ref for the work item */
2078         dlm_init_work_item(dlm, item, dlm_assert_master_worker, NULL);
2079         item->u.am.lockres = res; /* already have a ref */
2080         /* can optionally ignore node numbers higher than this node */
2081         item->u.am.ignore_higher = ignore_higher;
2082         item->u.am.request_from = request_from;
2083         item->u.am.flags = flags;
2084
2085         if (ignore_higher)
2086                 mlog(0, "IGNORE HIGHER: %.*s\n", res->lockname.len,
2087                      res->lockname.name);
2088
2089         spin_lock(&dlm->work_lock);
2090         list_add_tail(&item->list, &dlm->work_list);
2091         spin_unlock(&dlm->work_lock);
2092
2093         queue_work(dlm->dlm_worker, &dlm->dispatched_work);
2094         return 0;
2095 }
2096
2097 static void dlm_assert_master_worker(struct dlm_work_item *item, void *data)
2098 {
2099         struct dlm_ctxt *dlm = data;
2100         int ret = 0;
2101         struct dlm_lock_resource *res;
2102         unsigned long nodemap[BITS_TO_LONGS(O2NM_MAX_NODES)];
2103         int ignore_higher;
2104         int bit;
2105         u8 request_from;
2106         u32 flags;
2107
2108         dlm = item->dlm;
2109         res = item->u.am.lockres;
2110         ignore_higher = item->u.am.ignore_higher;
2111         request_from = item->u.am.request_from;
2112         flags = item->u.am.flags;
2113
2114         spin_lock(&dlm->spinlock);
2115         memcpy(nodemap, dlm->domain_map, sizeof(nodemap));
2116         spin_unlock(&dlm->spinlock);
2117
2118         clear_bit(dlm->node_num, nodemap);
2119         if (ignore_higher) {
2120                 /* if is this just to clear up mles for nodes below
2121                  * this node, do not send the message to the original
2122                  * caller or any node number higher than this */
2123                 clear_bit(request_from, nodemap);
2124                 bit = dlm->node_num;
2125                 while (1) {
2126                         bit = find_next_bit(nodemap, O2NM_MAX_NODES,
2127                                             bit+1);
2128                         if (bit >= O2NM_MAX_NODES)
2129                                 break;
2130                         clear_bit(bit, nodemap);
2131                 }
2132         }
2133
2134         /*
2135          * If we're migrating this lock to someone else, we are no
2136          * longer allowed to assert out own mastery.  OTOH, we need to
2137          * prevent migration from starting while we're still asserting
2138          * our dominance.  The reserved ast delays migration.
2139          */
2140         spin_lock(&res->spinlock);
2141         if (res->state & DLM_LOCK_RES_MIGRATING) {
2142                 mlog(0, "Someone asked us to assert mastery, but we're "
2143                      "in the middle of migration.  Skipping assert, "
2144                      "the new master will handle that.\n");
2145                 spin_unlock(&res->spinlock);
2146                 goto put;
2147         } else
2148                 __dlm_lockres_reserve_ast(res);
2149         spin_unlock(&res->spinlock);
2150
2151         /* this call now finishes out the nodemap
2152          * even if one or more nodes die */
2153         mlog(0, "worker about to master %.*s here, this=%u\n",
2154                      res->lockname.len, res->lockname.name, dlm->node_num);
2155         ret = dlm_do_assert_master(dlm, res, nodemap, flags);
2156         if (ret < 0) {
2157                 /* no need to restart, we are done */
2158                 if (!dlm_is_host_down(ret))
2159                         mlog_errno(ret);
2160         }
2161
2162         /* Ok, we've asserted ourselves.  Let's let migration start. */
2163         dlm_lockres_release_ast(dlm, res);
2164
2165 put:
2166         dlm_lockres_put(res);
2167
2168         mlog(0, "finished with dlm_assert_master_worker\n");
2169 }
2170
2171 /* SPECIAL CASE for the $RECOVERY lock used by the recovery thread.
2172  * We cannot wait for node recovery to complete to begin mastering this
2173  * lockres because this lockres is used to kick off recovery! ;-)
2174  * So, do a pre-check on all living nodes to see if any of those nodes
2175  * think that $RECOVERY is currently mastered by a dead node.  If so,
2176  * we wait a short time to allow that node to get notified by its own
2177  * heartbeat stack, then check again.  All $RECOVERY lock resources
2178  * mastered by dead nodes are purged when the hearbeat callback is
2179  * fired, so we can know for sure that it is safe to continue once
2180  * the node returns a live node or no node.  */
2181 static int dlm_pre_master_reco_lockres(struct dlm_ctxt *dlm,
2182                                        struct dlm_lock_resource *res)
2183 {
2184         struct dlm_node_iter iter;
2185         int nodenum;
2186         int ret = 0;
2187         u8 master = DLM_LOCK_RES_OWNER_UNKNOWN;
2188
2189         spin_lock(&dlm->spinlock);
2190         dlm_node_iter_init(dlm->domain_map, &iter);
2191         spin_unlock(&dlm->spinlock);
2192
2193         while ((nodenum = dlm_node_iter_next(&iter)) >= 0) {
2194                 /* do not send to self */
2195                 if (nodenum == dlm->node_num)
2196                         continue;
2197                 ret = dlm_do_master_requery(dlm, res, nodenum, &master);
2198                 if (ret < 0) {
2199                         mlog_errno(ret);
2200                         if (!dlm_is_host_down(ret))
2201                                 BUG();
2202                         /* host is down, so answer for that node would be
2203                          * DLM_LOCK_RES_OWNER_UNKNOWN.  continue. */
2204                         ret = 0;
2205                 }
2206
2207                 if (master != DLM_LOCK_RES_OWNER_UNKNOWN) {
2208                         /* check to see if this master is in the recovery map */
2209                         spin_lock(&dlm->spinlock);
2210                         if (test_bit(master, dlm->recovery_map)) {
2211                                 mlog(ML_NOTICE, "%s: node %u has not seen "
2212                                      "node %u go down yet, and thinks the "
2213                                      "dead node is mastering the recovery "
2214                                      "lock.  must wait.\n", dlm->name,
2215                                      nodenum, master);
2216                                 ret = -EAGAIN;
2217                         }
2218                         spin_unlock(&dlm->spinlock);
2219                         mlog(0, "%s: reco lock master is %u\n", dlm->name,
2220                              master);
2221                         break;
2222                 }
2223         }
2224         return ret;
2225 }
2226
2227 /*
2228  * DLM_DEREF_LOCKRES_MSG
2229  */
2230
2231 int dlm_drop_lockres_ref(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
2232 {
2233         struct dlm_deref_lockres deref;
2234         int ret = 0, r;
2235         const char *lockname;
2236         unsigned int namelen;
2237
2238         lockname = res->lockname.name;
2239         namelen = res->lockname.len;
2240         BUG_ON(namelen > O2NM_MAX_NAME_LEN);
2241
2242         mlog(0, "%s:%.*s: sending deref to %d\n",
2243              dlm->name, namelen, lockname, res->owner);
2244         memset(&deref, 0, sizeof(deref));
2245         deref.node_idx = dlm->node_num;
2246         deref.namelen = namelen;
2247         memcpy(deref.name, lockname, namelen);
2248
2249         ret = o2net_send_message(DLM_DEREF_LOCKRES_MSG, dlm->key,
2250                                  &deref, sizeof(deref), res->owner, &r);
2251         if (ret < 0)
2252                 mlog_errno(ret);
2253         else if (r < 0) {
2254                 /* BAD.  other node says I did not have a ref. */
2255                 mlog(ML_ERROR,"while dropping ref on %s:%.*s "
2256                     "(master=%u) got %d.\n", dlm->name, namelen,
2257                     lockname, res->owner, r);
2258                 dlm_print_one_lock_resource(res);
2259                 BUG();
2260         }
2261         return ret;
2262 }
2263
2264 int dlm_deref_lockres_handler(struct o2net_msg *msg, u32 len, void *data,
2265                               void **ret_data)
2266 {
2267         struct dlm_ctxt *dlm = data;
2268         struct dlm_deref_lockres *deref = (struct dlm_deref_lockres *)msg->buf;
2269         struct dlm_lock_resource *res = NULL;
2270         char *name;
2271         unsigned int namelen;
2272         int ret = -EINVAL;
2273         u8 node;
2274         unsigned int hash;
2275         struct dlm_work_item *item;
2276         int cleared = 0;
2277         int dispatch = 0;
2278
2279         if (!dlm_grab(dlm))
2280                 return 0;
2281
2282         name = deref->name;
2283         namelen = deref->namelen;
2284         node = deref->node_idx;
2285
2286         if (namelen > DLM_LOCKID_NAME_MAX) {
2287                 mlog(ML_ERROR, "Invalid name length!");
2288                 goto done;
2289         }
2290         if (deref->node_idx >= O2NM_MAX_NODES) {
2291                 mlog(ML_ERROR, "Invalid node number: %u\n", node);
2292                 goto done;
2293         }
2294
2295         hash = dlm_lockid_hash(name, namelen);
2296
2297         spin_lock(&dlm->spinlock);
2298         res = __dlm_lookup_lockres_full(dlm, name, namelen, hash);
2299         if (!res) {
2300                 spin_unlock(&dlm->spinlock);
2301                 mlog(ML_ERROR, "%s:%.*s: bad lockres name\n",
2302                      dlm->name, namelen, name);
2303                 goto done;
2304         }
2305         spin_unlock(&dlm->spinlock);
2306
2307         spin_lock(&res->spinlock);
2308         if (res->state & DLM_LOCK_RES_SETREF_INPROG)
2309                 dispatch = 1;
2310         else {
2311                 BUG_ON(res->state & DLM_LOCK_RES_DROPPING_REF);
2312                 if (test_bit(node, res->refmap)) {
2313                         dlm_lockres_clear_refmap_bit(node, res);
2314                         cleared = 1;
2315                 }
2316         }
2317         spin_unlock(&res->spinlock);
2318
2319         if (!dispatch) {
2320                 if (cleared)
2321                         dlm_lockres_calc_usage(dlm, res);
2322                 else {
2323                         mlog(ML_ERROR, "%s:%.*s: node %u trying to drop ref "
2324                         "but it is already dropped!\n", dlm->name,
2325                         res->lockname.len, res->lockname.name, node);
2326                         dlm_print_one_lock_resource(res);
2327                 }
2328                 ret = 0;
2329                 goto done;
2330         }
2331
2332         item = kzalloc(sizeof(*item), GFP_NOFS);
2333         if (!item) {
2334                 ret = -ENOMEM;
2335                 mlog_errno(ret);
2336                 goto done;
2337         }
2338
2339         dlm_init_work_item(dlm, item, dlm_deref_lockres_worker, NULL);
2340         item->u.dl.deref_res = res;
2341         item->u.dl.deref_node = node;
2342
2343         spin_lock(&dlm->work_lock);
2344         list_add_tail(&item->list, &dlm->work_list);
2345         spin_unlock(&dlm->work_lock);
2346
2347         queue_work(dlm->dlm_worker, &dlm->dispatched_work);
2348         return 0;
2349
2350 done:
2351         if (res)
2352                 dlm_lockres_put(res);
2353         dlm_put(dlm);
2354
2355         return ret;
2356 }
2357
2358 static void dlm_deref_lockres_worker(struct dlm_work_item *item, void *data)
2359 {
2360         struct dlm_ctxt *dlm;
2361         struct dlm_lock_resource *res;
2362         u8 node;
2363         u8 cleared = 0;
2364
2365         dlm = item->dlm;
2366         res = item->u.dl.deref_res;
2367         node = item->u.dl.deref_node;
2368
2369         spin_lock(&res->spinlock);
2370         BUG_ON(res->state & DLM_LOCK_RES_DROPPING_REF);
2371         if (test_bit(node, res->refmap)) {
2372                 __dlm_wait_on_lockres_flags(res, DLM_LOCK_RES_SETREF_INPROG);
2373                 dlm_lockres_clear_refmap_bit(node, res);
2374                 cleared = 1;
2375         }
2376         spin_unlock(&res->spinlock);
2377
2378         if (cleared) {
2379                 mlog(0, "%s:%.*s node %u ref dropped in dispatch\n",
2380                      dlm->name, res->lockname.len, res->lockname.name, node);
2381                 dlm_lockres_calc_usage(dlm, res);
2382         } else {
2383                 mlog(ML_ERROR, "%s:%.*s: node %u trying to drop ref "
2384                      "but it is already dropped!\n", dlm->name,
2385                      res->lockname.len, res->lockname.name, node);
2386                 dlm_print_one_lock_resource(res);
2387         }
2388
2389         dlm_lockres_put(res);
2390 }
2391
2392 /* Checks whether the lockres can be migrated. Returns 0 if yes, < 0
2393  * if not. If 0, numlocks is set to the number of locks in the lockres.
2394  */
2395 static int dlm_is_lockres_migrateable(struct dlm_ctxt *dlm,
2396                                       struct dlm_lock_resource *res,
2397                                       int *numlocks)
2398 {
2399         int ret;
2400         int i;
2401         int count = 0;
2402         struct list_head *queue;
2403         struct dlm_lock *lock;
2404
2405         assert_spin_locked(&res->spinlock);
2406
2407         ret = -EINVAL;
2408         if (res->owner == DLM_LOCK_RES_OWNER_UNKNOWN) {
2409                 mlog(0, "cannot migrate lockres with unknown owner!\n");
2410                 goto leave;
2411         }
2412
2413         if (res->owner != dlm->node_num) {
2414                 mlog(0, "cannot migrate lockres this node doesn't own!\n");
2415                 goto leave;
2416         }
2417
2418         ret = 0;
2419         queue = &res->granted;
2420         for (i = 0; i < 3; i++) {
2421                 list_for_each_entry(lock, queue, list) {
2422                         ++count;
2423                         if (lock->ml.node == dlm->node_num) {
2424                                 mlog(0, "found a lock owned by this node still "
2425                                      "on the %s queue!  will not migrate this "
2426                                      "lockres\n", (i == 0 ? "granted" :
2427                                                    (i == 1 ? "converting" :
2428                                                     "blocked")));
2429                                 ret = -ENOTEMPTY;
2430                                 goto leave;
2431                         }
2432                 }
2433                 queue++;
2434         }
2435
2436         *numlocks = count;
2437         mlog(0, "migrateable lockres having %d locks\n", *numlocks);
2438
2439 leave:
2440         return ret;
2441 }
2442
2443 /*
2444  * DLM_MIGRATE_LOCKRES
2445  */
2446
2447
2448 static int dlm_migrate_lockres(struct dlm_ctxt *dlm,
2449                                struct dlm_lock_resource *res,
2450                                u8 target)
2451 {
2452         struct dlm_master_list_entry *mle = NULL;
2453         struct dlm_master_list_entry *oldmle = NULL;
2454         struct dlm_migratable_lockres *mres = NULL;
2455         int ret = 0;
2456         const char *name;
2457         unsigned int namelen;
2458         int mle_added = 0;
2459         int numlocks;
2460         int wake = 0;
2461
2462         if (!dlm_grab(dlm))
2463                 return -EINVAL;
2464
2465         name = res->lockname.name;
2466         namelen = res->lockname.len;
2467
2468         mlog(0, "migrating %.*s to %u\n", namelen, name, target);
2469
2470         /*
2471          * ensure this lockres is a proper candidate for migration
2472          */
2473         spin_lock(&res->spinlock);
2474         ret = dlm_is_lockres_migrateable(dlm, res, &numlocks);
2475         if (ret < 0) {
2476                 spin_unlock(&res->spinlock);
2477                 goto leave;
2478         }
2479         spin_unlock(&res->spinlock);
2480
2481         /* no work to do */
2482         if (numlocks == 0) {
2483                 mlog(0, "no locks were found on this lockres! done!\n");
2484                 goto leave;
2485         }
2486
2487         /*
2488          * preallocate up front
2489          * if this fails, abort
2490          */
2491
2492         ret = -ENOMEM;
2493         mres = (struct dlm_migratable_lockres *) __get_free_page(GFP_NOFS);
2494         if (!mres) {
2495                 mlog_errno(ret);
2496                 goto leave;
2497         }
2498
2499         mle = (struct dlm_master_list_entry *) kmem_cache_alloc(dlm_mle_cache,
2500                                                                 GFP_NOFS);
2501         if (!mle) {
2502                 mlog_errno(ret);
2503                 goto leave;
2504         }
2505         ret = 0;
2506
2507         /*
2508          * find a node to migrate the lockres to
2509          */
2510
2511         mlog(0, "picking a migration node\n");
2512         spin_lock(&dlm->spinlock);
2513         /* pick a new node */
2514         if (!test_bit(target, dlm->domain_map) ||
2515             target >= O2NM_MAX_NODES) {
2516                 target = dlm_pick_migration_target(dlm, res);
2517         }
2518         mlog(0, "node %u chosen for migration\n", target);
2519
2520         if (target >= O2NM_MAX_NODES ||
2521             !test_bit(target, dlm->domain_map)) {
2522                 /* target chosen is not alive */
2523                 ret = -EINVAL;
2524         }
2525
2526         if (ret) {
2527                 spin_unlock(&dlm->spinlock);
2528                 goto fail;
2529         }
2530
2531         mlog(0, "continuing with target = %u\n", target);
2532
2533         /*
2534          * clear any existing master requests and
2535          * add the migration mle to the list
2536          */
2537         spin_lock(&dlm->master_lock);
2538         ret = dlm_add_migration_mle(dlm, res, mle, &oldmle, name,
2539                                     namelen, target, dlm->node_num);
2540         spin_unlock(&dlm->master_lock);
2541         spin_unlock(&dlm->spinlock);
2542
2543         if (ret == -EEXIST) {
2544                 mlog(0, "another process is already migrating it\n");
2545                 goto fail;
2546         }
2547         mle_added = 1;
2548
2549         /*
2550          * set the MIGRATING flag and flush asts
2551          * if we fail after this we need to re-dirty the lockres
2552          */
2553         if (dlm_mark_lockres_migrating(dlm, res, target) < 0) {
2554                 mlog(ML_ERROR, "tried to migrate %.*s to %u, but "
2555                      "the target went down.\n", res->lockname.len,
2556                      res->lockname.name, target);
2557                 spin_lock(&res->spinlock);
2558                 res->state &= ~DLM_LOCK_RES_MIGRATING;
2559                 wake = 1;
2560                 spin_unlock(&res->spinlock);
2561                 ret = -EINVAL;
2562         }
2563
2564 fail:
2565         if (oldmle) {
2566                 /* master is known, detach if not already detached */
2567                 dlm_mle_detach_hb_events(dlm, oldmle);
2568                 dlm_put_mle(oldmle);
2569         }
2570
2571         if (ret < 0) {
2572                 if (mle_added) {
2573                         dlm_mle_detach_hb_events(dlm, mle);
2574                         dlm_put_mle(mle);
2575                 } else if (mle) {
2576                         kmem_cache_free(dlm_mle_cache, mle);
2577                 }
2578                 goto leave;
2579         }
2580
2581         /*
2582          * at this point, we have a migration target, an mle
2583          * in the master list, and the MIGRATING flag set on
2584          * the lockres
2585          */
2586
2587         /* now that remote nodes are spinning on the MIGRATING flag,
2588          * ensure that all assert_master work is flushed. */
2589         flush_workqueue(dlm->dlm_worker);
2590
2591         /* get an extra reference on the mle.
2592          * otherwise the assert_master from the new
2593          * master will destroy this.
2594          * also, make sure that all callers of dlm_get_mle
2595          * take both dlm->spinlock and dlm->master_lock */
2596         spin_lock(&dlm->spinlock);
2597         spin_lock(&dlm->master_lock);
2598         dlm_get_mle_inuse(mle);
2599         spin_unlock(&dlm->master_lock);
2600         spin_unlock(&dlm->spinlock);
2601
2602         /* notify new node and send all lock state */
2603         /* call send_one_lockres with migration flag.
2604          * this serves as notice to the target node that a
2605          * migration is starting. */
2606         ret = dlm_send_one_lockres(dlm, res, mres, target,
2607                                    DLM_MRES_MIGRATION);
2608
2609         if (ret < 0) {
2610                 mlog(0, "migration to node %u failed with %d\n",
2611                      target, ret);
2612                 /* migration failed, detach and clean up mle */
2613                 dlm_mle_detach_hb_events(dlm, mle);
2614                 dlm_put_mle(mle);
2615                 dlm_put_mle_inuse(mle);
2616                 spin_lock(&res->spinlock);
2617                 res->state &= ~DLM_LOCK_RES_MIGRATING;
2618                 wake = 1;
2619                 spin_unlock(&res->spinlock);
2620                 goto leave;
2621         }
2622
2623         /* at this point, the target sends a message to all nodes,
2624          * (using dlm_do_migrate_request).  this node is skipped since
2625          * we had to put an mle in the list to begin the process.  this
2626          * node now waits for target to do an assert master.  this node
2627          * will be the last one notified, ensuring that the migration
2628          * is complete everywhere.  if the target dies while this is
2629          * going on, some nodes could potentially see the target as the
2630          * master, so it is important that my recovery finds the migration
2631          * mle and sets the master to UNKNONWN. */
2632
2633
2634         /* wait for new node to assert master */
2635         while (1) {
2636                 ret = wait_event_interruptible_timeout(mle->wq,
2637                                         (atomic_read(&mle->woken) == 1),
2638                                         msecs_to_jiffies(5000));
2639
2640                 if (ret >= 0) {
2641                         if (atomic_read(&mle->woken) == 1 ||
2642                             res->owner == target)
2643                                 break;
2644
2645                         mlog(0, "%s:%.*s: timed out during migration\n",
2646                              dlm->name, res->lockname.len, res->lockname.name);
2647                         /* avoid hang during shutdown when migrating lockres
2648                          * to a node which also goes down */
2649                         if (dlm_is_node_dead(dlm, target)) {
2650                                 mlog(0, "%s:%.*s: expected migration "
2651                                      "target %u is no longer up, restarting\n",
2652                                      dlm->name, res->lockname.len,
2653                                      res->lockname.name, target);
2654                                 ret = -EINVAL;
2655                                 /* migration failed, detach and clean up mle */
2656                                 dlm_mle_detach_hb_events(dlm, mle);
2657                                 dlm_put_mle(mle);
2658                                 dlm_put_mle_inuse(mle);
2659                                 spin_lock(&res->spinlock);
2660                                 res->state &= ~DLM_LOCK_RES_MIGRATING;
2661                                 wake = 1;
2662                                 spin_unlock(&res->spinlock);
2663                                 goto leave;
2664                         }
2665                 } else
2666                         mlog(0, "%s:%.*s: caught signal during migration\n",
2667                              dlm->name, res->lockname.len, res->lockname.name);
2668         }
2669
2670         /* all done, set the owner, clear the flag */
2671         spin_lock(&res->spinlock);
2672         dlm_set_lockres_owner(dlm, res, target);
2673         res->state &= ~DLM_LOCK_RES_MIGRATING;
2674         dlm_remove_nonlocal_locks(dlm, res);
2675         spin_unlock(&res->spinlock);
2676         wake_up(&res->wq);
2677
2678         /* master is known, detach if not already detached */
2679         dlm_mle_detach_hb_events(dlm, mle);
2680         dlm_put_mle_inuse(mle);
2681         ret = 0;
2682
2683         dlm_lockres_calc_usage(dlm, res);
2684
2685 leave:
2686         /* re-dirty the lockres if we failed */
2687         if (ret < 0)
2688                 dlm_kick_thread(dlm, res);
2689
2690         /* wake up waiters if the MIGRATING flag got set
2691          * but migration failed */
2692         if (wake)
2693                 wake_up(&res->wq);
2694
2695         /* TODO: cleanup */
2696         if (mres)
2697                 free_page((unsigned long)mres);
2698
2699         dlm_put(dlm);
2700
2701         mlog(0, "returning %d\n", ret);
2702         return ret;
2703 }
2704
2705 #define DLM_MIGRATION_RETRY_MS  100
2706
2707 /* Should be called only after beginning the domain leave process.
2708  * There should not be any remaining locks on nonlocal lock resources,
2709  * and there should be no local locks left on locally mastered resources.
2710  *
2711  * Called with the dlm spinlock held, may drop it to do migration, but
2712  * will re-acquire before exit.
2713  *
2714  * Returns: 1 if dlm->spinlock was dropped/retaken, 0 if never dropped */
2715 int dlm_empty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
2716 {
2717         int ret;
2718         int lock_dropped = 0;
2719         int numlocks;
2720
2721         spin_lock(&res->spinlock);
2722         if (res->owner != dlm->node_num) {
2723                 if (!__dlm_lockres_unused(res)) {
2724                         mlog(ML_ERROR, "%s:%.*s: this node is not master, "
2725                              "trying to free this but locks remain\n",
2726                              dlm->name, res->lockname.len, res->lockname.name);
2727                 }
2728                 spin_unlock(&res->spinlock);
2729                 goto leave;
2730         }
2731
2732         /* No need to migrate a lockres having no locks */
2733         ret = dlm_is_lockres_migrateable(dlm, res, &numlocks);
2734         if (ret >= 0 && numlocks == 0) {
2735                 spin_unlock(&res->spinlock);
2736                 goto leave;
2737         }
2738         spin_unlock(&res->spinlock);
2739
2740         /* Wheee! Migrate lockres here! Will sleep so drop spinlock. */
2741         spin_unlock(&dlm->spinlock);
2742         lock_dropped = 1;
2743         while (1) {
2744                 ret = dlm_migrate_lockres(dlm, res, O2NM_MAX_NODES);
2745                 if (ret >= 0)
2746                         break;
2747                 if (ret == -ENOTEMPTY) {
2748                         mlog(ML_ERROR, "lockres %.*s still has local locks!\n",
2749                                 res->lockname.len, res->lockname.name);
2750                         BUG();
2751                 }
2752
2753                 mlog(0, "lockres %.*s: migrate failed, "
2754                      "retrying\n", res->lockname.len,
2755                      res->lockname.name);
2756                 msleep(DLM_MIGRATION_RETRY_MS);
2757         }
2758         spin_lock(&dlm->spinlock);
2759 leave:
2760         return lock_dropped;
2761 }
2762
2763 int dlm_lock_basts_flushed(struct dlm_ctxt *dlm, struct dlm_lock *lock)
2764 {
2765         int ret;
2766         spin_lock(&dlm->ast_lock);
2767         spin_lock(&lock->spinlock);
2768         ret = (list_empty(&lock->bast_list) && !lock->bast_pending);
2769         spin_unlock(&lock->spinlock);
2770         spin_unlock(&dlm->ast_lock);
2771         return ret;
2772 }
2773
2774 static int dlm_migration_can_proceed(struct dlm_ctxt *dlm,
2775                                      struct dlm_lock_resource *res,
2776                                      u8 mig_target)
2777 {
2778         int can_proceed;
2779         spin_lock(&res->spinlock);
2780         can_proceed = !!(res->state & DLM_LOCK_RES_MIGRATING);
2781         spin_unlock(&res->spinlock);
2782
2783         /* target has died, so make the caller break out of the
2784          * wait_event, but caller must recheck the domain_map */
2785         spin_lock(&dlm->spinlock);
2786         if (!test_bit(mig_target, dlm->domain_map))
2787                 can_proceed = 1;
2788         spin_unlock(&dlm->spinlock);
2789         return can_proceed;
2790 }
2791
2792 static int dlm_lockres_is_dirty(struct dlm_ctxt *dlm,
2793                                 struct dlm_lock_resource *res)
2794 {
2795         int ret;
2796         spin_lock(&res->spinlock);
2797         ret = !!(res->state & DLM_LOCK_RES_DIRTY);
2798         spin_unlock(&res->spinlock);
2799         return ret;
2800 }
2801
2802
2803 static int dlm_mark_lockres_migrating(struct dlm_ctxt *dlm,
2804                                        struct dlm_lock_resource *res,
2805                                        u8 target)
2806 {
2807         int ret = 0;
2808
2809         mlog(0, "dlm_mark_lockres_migrating: %.*s, from %u to %u\n",
2810                res->lockname.len, res->lockname.name, dlm->node_num,
2811                target);
2812         /* need to set MIGRATING flag on lockres.  this is done by
2813          * ensuring that all asts have been flushed for this lockres. */
2814         spin_lock(&res->spinlock);
2815         BUG_ON(res->migration_pending);
2816         res->migration_pending = 1;
2817         /* strategy is to reserve an extra ast then release
2818          * it below, letting the release do all of the work */
2819         __dlm_lockres_reserve_ast(res);
2820         spin_unlock(&res->spinlock);
2821
2822         /* now flush all the pending asts */
2823         dlm_kick_thread(dlm, res);
2824         /* before waiting on DIRTY, block processes which may
2825          * try to dirty the lockres before MIGRATING is set */
2826         spin_lock(&res->spinlock);
2827         BUG_ON(res->state & DLM_LOCK_RES_BLOCK_DIRTY);
2828         res->state |= DLM_LOCK_RES_BLOCK_DIRTY;
2829         spin_unlock(&res->spinlock);
2830         /* now wait on any pending asts and the DIRTY state */
2831         wait_event(dlm->ast_wq, !dlm_lockres_is_dirty(dlm, res));
2832         dlm_lockres_release_ast(dlm, res);
2833
2834         mlog(0, "about to wait on migration_wq, dirty=%s\n",
2835                res->state & DLM_LOCK_RES_DIRTY ? "yes" : "no");
2836         /* if the extra ref we just put was the final one, this
2837          * will pass thru immediately.  otherwise, we need to wait
2838          * for the last ast to finish. */
2839 again:
2840         ret = wait_event_interruptible_timeout(dlm->migration_wq,
2841                    dlm_migration_can_proceed(dlm, res, target),
2842                    msecs_to_jiffies(1000));
2843         if (ret < 0) {
2844                 mlog(0, "woken again: migrating? %s, dead? %s\n",
2845                        res->state & DLM_LOCK_RES_MIGRATING ? "yes":"no",
2846                        test_bit(target, dlm->domain_map) ? "no":"yes");
2847         } else {
2848                 mlog(0, "all is well: migrating? %s, dead? %s\n",
2849                        res->state & DLM_LOCK_RES_MIGRATING ? "yes":"no",
2850                        test_bit(target, dlm->domain_map) ? "no":"yes");
2851         }
2852         if (!dlm_migration_can_proceed(dlm, res, target)) {
2853                 mlog(0, "trying again...\n");
2854                 goto again;
2855         }
2856         /* now that we are sure the MIGRATING state is there, drop
2857          * the unneded state which blocked threads trying to DIRTY */
2858         spin_lock(&res->spinlock);
2859         BUG_ON(!(res->state & DLM_LOCK_RES_BLOCK_DIRTY));
2860         BUG_ON(!(res->state & DLM_LOCK_RES_MIGRATING));
2861         res->state &= ~DLM_LOCK_RES_BLOCK_DIRTY;
2862         spin_unlock(&res->spinlock);
2863
2864         /* did the target go down or die? */
2865         spin_lock(&dlm->spinlock);
2866         if (!test_bit(target, dlm->domain_map)) {
2867                 mlog(ML_ERROR, "aha. migration target %u just went down\n",
2868                      target);
2869                 ret = -EHOSTDOWN;
2870         }
2871         spin_unlock(&dlm->spinlock);
2872
2873         /*
2874          * at this point:
2875          *
2876          *   o the DLM_LOCK_RES_MIGRATING flag is set
2877          *   o there are no pending asts on this lockres
2878          *   o all processes trying to reserve an ast on this
2879          *     lockres must wait for the MIGRATING flag to clear
2880          */
2881         return ret;
2882 }
2883
2884 /* last step in the migration process.
2885  * original master calls this to free all of the dlm_lock
2886  * structures that used to be for other nodes. */
2887 static void dlm_remove_nonlocal_locks(struct dlm_ctxt *dlm,
2888                                       struct dlm_lock_resource *res)
2889 {
2890         struct list_head *queue = &res->granted;
2891         int i, bit;
2892         struct dlm_lock *lock, *next;
2893
2894         assert_spin_locked(&res->spinlock);
2895
2896         BUG_ON(res->owner == dlm->node_num);
2897
2898         for (i=0; i<3; i++) {
2899                 list_for_each_entry_safe(lock, next, queue, list) {
2900                         if (lock->ml.node != dlm->node_num) {
2901                                 mlog(0, "putting lock for node %u\n",
2902                                      lock->ml.node);
2903                                 /* be extra careful */
2904                                 BUG_ON(!list_empty(&lock->ast_list));
2905                                 BUG_ON(!list_empty(&lock->bast_list));
2906                                 BUG_ON(lock->ast_pending);
2907                                 BUG_ON(lock->bast_pending);
2908                                 dlm_lockres_clear_refmap_bit(lock->ml.node, res);
2909                                 list_del_init(&lock->list);
2910                                 dlm_lock_put(lock);
2911                                 /* In a normal unlock, we would have added a
2912                                  * DLM_UNLOCK_FREE_LOCK action. Force it. */
2913                                 dlm_lock_put(lock);
2914                         }
2915                 }
2916                 queue++;
2917         }
2918         bit = 0;
2919         while (1) {
2920                 bit = find_next_bit(res->refmap, O2NM_MAX_NODES, bit);
2921                 if (bit >= O2NM_MAX_NODES)
2922                         break;
2923                 /* do not clear the local node reference, if there is a
2924                  * process holding this, let it drop the ref itself */
2925                 if (bit != dlm->node_num) {
2926                         mlog(0, "%s:%.*s: node %u had a ref to this "
2927                              "migrating lockres, clearing\n", dlm->name,
2928                              res->lockname.len, res->lockname.name, bit);
2929                         dlm_lockres_clear_refmap_bit(bit, res);
2930                 }
2931                 bit++;
2932         }
2933 }
2934
2935 /* for now this is not too intelligent.  we will
2936  * need stats to make this do the right thing.
2937  * this just finds the first lock on one of the
2938  * queues and uses that node as the target. */
2939 static u8 dlm_pick_migration_target(struct dlm_ctxt *dlm,
2940                                     struct dlm_lock_resource *res)
2941 {
2942         int i;
2943         struct list_head *queue = &res->granted;
2944         struct dlm_lock *lock;
2945         int nodenum;
2946
2947         assert_spin_locked(&dlm->spinlock);
2948
2949         spin_lock(&res->spinlock);
2950         for (i=0; i<3; i++) {
2951                 list_for_each_entry(lock, queue, list) {
2952                         /* up to the caller to make sure this node
2953                          * is alive */
2954                         if (lock->ml.node != dlm->node_num) {
2955                                 spin_unlock(&res->spinlock);
2956                                 return lock->ml.node;
2957                         }
2958                 }
2959                 queue++;
2960         }
2961         spin_unlock(&res->spinlock);
2962         mlog(0, "have not found a suitable target yet! checking domain map\n");
2963
2964         /* ok now we're getting desperate.  pick anyone alive. */
2965         nodenum = -1;
2966         while (1) {
2967                 nodenum = find_next_bit(dlm->domain_map,
2968                                         O2NM_MAX_NODES, nodenum+1);
2969                 mlog(0, "found %d in domain map\n", nodenum);
2970                 if (nodenum >= O2NM_MAX_NODES)
2971                         break;
2972                 if (nodenum != dlm->node_num) {
2973                         mlog(0, "picking %d\n", nodenum);
2974                         return nodenum;
2975                 }
2976         }
2977
2978         mlog(0, "giving up.  no master to migrate to\n");
2979         return DLM_LOCK_RES_OWNER_UNKNOWN;
2980 }
2981
2982
2983
2984 /* this is called by the new master once all lockres
2985  * data has been received */
2986 static int dlm_do_migrate_request(struct dlm_ctxt *dlm,
2987                                   struct dlm_lock_resource *res,
2988                                   u8 master, u8 new_master,
2989                                   struct dlm_node_iter *iter)
2990 {
2991         struct dlm_migrate_request migrate;
2992         int ret, skip, status = 0;
2993         int nodenum;
2994
2995         memset(&migrate, 0, sizeof(migrate));
2996         migrate.namelen = res->lockname.len;
2997         memcpy(migrate.name, res->lockname.name, migrate.namelen);
2998         migrate.new_master = new_master;
2999         migrate.master = master;
3000
3001         ret = 0;
3002
3003         /* send message to all nodes, except the master and myself */
3004         while ((nodenum = dlm_node_iter_next(iter)) >= 0) {
3005                 if (nodenum == master ||
3006                     nodenum == new_master)
3007                         continue;
3008
3009                 /* We could race exit domain. If exited, skip. */
3010                 spin_lock(&dlm->spinlock);
3011                 skip = (!test_bit(nodenum, dlm->domain_map));
3012                 spin_unlock(&dlm->spinlock);
3013                 if (skip) {
3014                         clear_bit(nodenum, iter->node_map);
3015                         continue;
3016                 }
3017
3018                 ret = o2net_send_message(DLM_MIGRATE_REQUEST_MSG, dlm->key,
3019                                          &migrate, sizeof(migrate), nodenum,
3020                                          &status);
3021                 if (ret < 0) {
3022                         mlog(0, "migrate_request returned %d!\n", ret);
3023                         if (!dlm_is_host_down(ret)) {
3024                                 mlog(ML_ERROR, "unhandled error=%d!\n", ret);
3025                                 BUG();
3026                         }
3027                         clear_bit(nodenum, iter->node_map);
3028                         ret = 0;
3029                 } else if (status < 0) {
3030                         mlog(0, "migrate request (node %u) returned %d!\n",
3031                              nodenum, status);
3032                         ret = status;
3033                 } else if (status == DLM_MIGRATE_RESPONSE_MASTERY_REF) {
3034                         /* during the migration request we short-circuited
3035                          * the mastery of the lockres.  make sure we have
3036                          * a mastery ref for nodenum */
3037                         mlog(0, "%s:%.*s: need ref for node %u\n",
3038                              dlm->name, res->lockname.len, res->lockname.name,
3039                              nodenum);
3040                         spin_lock(&res->spinlock);
3041                         dlm_lockres_set_refmap_bit(nodenum, res);
3042                         spin_unlock(&res->spinlock);
3043                 }
3044         }
3045
3046         if (ret < 0)
3047                 mlog_errno(ret);
3048
3049         mlog(0, "returning ret=%d\n", ret);
3050         return ret;
3051 }
3052
3053
3054 /* if there is an existing mle for this lockres, we now know who the master is.
3055  * (the one who sent us *this* message) we can clear it up right away.
3056  * since the process that put the mle on the list still has a reference to it,
3057  * we can unhash it now, set the master and wake the process.  as a result,
3058  * we will have no mle in the list to start with.  now we can add an mle for
3059  * the migration and this should be the only one found for those scanning the
3060  * list.  */
3061 int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data,
3062                                 void **ret_data)
3063 {
3064         struct dlm_ctxt *dlm = data;
3065         struct dlm_lock_resource *res = NULL;
3066         struct dlm_migrate_request *migrate = (struct dlm_migrate_request *) msg->buf;
3067         struct dlm_master_list_entry *mle = NULL, *oldmle = NULL;
3068         const char *name;
3069         unsigned int namelen, hash;
3070         int ret = 0;
3071
3072         if (!dlm_grab(dlm))
3073                 return -EINVAL;
3074
3075         name = migrate->name;
3076         namelen = migrate->namelen;
3077         hash = dlm_lockid_hash(name, namelen);
3078
3079         /* preallocate.. if this fails, abort */
3080         mle = (struct dlm_master_list_entry *) kmem_cache_alloc(dlm_mle_cache,
3081                                                          GFP_NOFS);
3082
3083         if (!mle) {
3084                 ret = -ENOMEM;
3085                 goto leave;
3086         }
3087
3088         /* check for pre-existing lock */
3089         spin_lock(&dlm->spinlock);
3090         res = __dlm_lookup_lockres(dlm, name, namelen, hash);
3091         spin_lock(&dlm->master_lock);
3092
3093         if (res) {
3094                 spin_lock(&res->spinlock);
3095                 if (res->state & DLM_LOCK_RES_RECOVERING) {
3096                         /* if all is working ok, this can only mean that we got
3097                         * a migrate request from a node that we now see as
3098                         * dead.  what can we do here?  drop it to the floor? */
3099                         spin_unlock(&res->spinlock);
3100                         mlog(ML_ERROR, "Got a migrate request, but the "
3101                              "lockres is marked as recovering!");
3102                         kmem_cache_free(dlm_mle_cache, mle);
3103                         ret = -EINVAL; /* need a better solution */
3104                         goto unlock;
3105                 }
3106                 res->state |= DLM_LOCK_RES_MIGRATING;
3107                 spin_unlock(&res->spinlock);
3108         }
3109
3110         /* ignore status.  only nonzero status would BUG. */
3111         ret = dlm_add_migration_mle(dlm, res, mle, &oldmle,
3112                                     name, namelen,
3113                                     migrate->new_master,
3114                                     migrate->master);
3115
3116 unlock:
3117         spin_unlock(&dlm->master_lock);
3118         spin_unlock(&dlm->spinlock);
3119
3120         if (oldmle) {
3121                 /* master is known, detach if not already detached */
3122                 dlm_mle_detach_hb_events(dlm, oldmle);
3123                 dlm_put_mle(oldmle);
3124         }
3125
3126         if (res)
3127                 dlm_lockres_put(res);
3128 leave:
3129         dlm_put(dlm);
3130         return ret;
3131 }
3132
3133 /* must be holding dlm->spinlock and dlm->master_lock
3134  * when adding a migration mle, we can clear any other mles
3135  * in the master list because we know with certainty that
3136  * the master is "master".  so we remove any old mle from
3137  * the list after setting it's master field, and then add
3138  * the new migration mle.  this way we can hold with the rule
3139  * of having only one mle for a given lock name at all times. */
3140 static int dlm_add_migration_mle(struct dlm_ctxt *dlm,
3141                                  struct dlm_lock_resource *res,
3142                                  struct dlm_master_list_entry *mle,
3143                                  struct dlm_master_list_entry **oldmle,
3144                                  const char *name, unsigned int namelen,
3145                                  u8 new_master, u8 master)
3146 {
3147         int found;
3148         int ret = 0;
3149
3150         *oldmle = NULL;
3151
3152         mlog_entry_void();
3153
3154         assert_spin_locked(&dlm->spinlock);
3155         assert_spin_locked(&dlm->master_lock);
3156
3157         /* caller is responsible for any ref taken here on oldmle */
3158         found = dlm_find_mle(dlm, oldmle, (char *)name, namelen);
3159         if (found) {
3160                 struct dlm_master_list_entry *tmp = *oldmle;
3161                 spin_lock(&tmp->spinlock);
3162                 if (tmp->type == DLM_MLE_MIGRATION) {
3163                         if (master == dlm->node_num) {
3164                                 /* ah another process raced me to it */
3165                                 mlog(0, "tried to migrate %.*s, but some "
3166                                      "process beat me to it\n",
3167                                      namelen, name);
3168                                 ret = -EEXIST;
3169                         } else {
3170                                 /* bad.  2 NODES are trying to migrate! */
3171                                 mlog(ML_ERROR, "migration error  mle: "
3172                                      "master=%u new_master=%u // request: "
3173                                      "master=%u new_master=%u // "
3174                                      "lockres=%.*s\n",
3175                                      tmp->master, tmp->new_master,
3176                                      master, new_master,
3177                                      namelen, name);
3178                                 BUG();
3179                         }
3180                 } else {
3181                         /* this is essentially what assert_master does */
3182                         tmp->master = master;
3183                         atomic_set(&tmp->woken, 1);
3184                         wake_up(&tmp->wq);
3185                         /* remove it so that only one mle will be found */
3186                         __dlm_unlink_mle(dlm, tmp);
3187                         __dlm_mle_detach_hb_events(dlm, tmp);
3188                         ret = DLM_MIGRATE_RESPONSE_MASTERY_REF;
3189                         mlog(0, "%s:%.*s: master=%u, newmaster=%u, "
3190                             "telling master to get ref for cleared out mle "
3191                             "during migration\n", dlm->name, namelen, name,
3192                             master, new_master);
3193                 }
3194                 spin_unlock(&tmp->spinlock);
3195         }
3196
3197         /* now add a migration mle to the tail of the list */
3198         dlm_init_mle(mle, DLM_MLE_MIGRATION, dlm, res, name, namelen);
3199         mle->new_master = new_master;
3200         /* the new master will be sending an assert master for this.
3201          * at that point we will get the refmap reference */
3202         mle->master = master;
3203         /* do this for consistency with other mle types */
3204         set_bit(new_master, mle->maybe_map);
3205         __dlm_insert_mle(dlm, mle);
3206
3207         return ret;
3208 }
3209
3210
3211 void dlm_clean_master_list(struct dlm_ctxt *dlm, u8 dead_node)
3212 {
3213         struct dlm_master_list_entry *mle, *next;
3214         struct dlm_lock_resource *res;
3215         unsigned int hash;
3216
3217         mlog_entry("dlm=%s, dead node=%u\n", dlm->name, dead_node);
3218 top:
3219         assert_spin_locked(&dlm->spinlock);
3220
3221         /* clean the master list */
3222         spin_lock(&dlm->master_lock);
3223         list_for_each_entry_safe(mle, next, &dlm->master_list, list) {
3224                 BUG_ON(mle->type != DLM_MLE_BLOCK &&
3225                        mle->type != DLM_MLE_MASTER &&
3226                        mle->type != DLM_MLE_MIGRATION);
3227
3228                 /* MASTER mles are initiated locally.  the waiting
3229                  * process will notice the node map change
3230                  * shortly.  let that happen as normal. */
3231                 if (mle->type == DLM_MLE_MASTER)
3232                         continue;
3233
3234
3235                 /* BLOCK mles are initiated by other nodes.
3236                  * need to clean up if the dead node would have
3237                  * been the master. */
3238                 if (mle->type == DLM_MLE_BLOCK) {
3239                         int bit;
3240
3241                         spin_lock(&mle->spinlock);
3242                         bit = find_next_bit(mle->maybe_map, O2NM_MAX_NODES, 0);
3243                         if (bit != dead_node) {
3244                                 mlog(0, "mle found, but dead node %u would "
3245                                      "not have been master\n", dead_node);
3246                                 spin_unlock(&mle->spinlock);
3247                         } else {
3248                                 /* must drop the refcount by one since the
3249                                  * assert_master will never arrive.  this
3250                                  * may result in the mle being unlinked and
3251                                  * freed, but there may still be a process
3252                                  * waiting in the dlmlock path which is fine. */
3253                                 mlog(0, "node %u was expected master\n",
3254                                      dead_node);
3255                                 atomic_set(&mle->woken, 1);
3256                                 spin_unlock(&mle->spinlock);
3257                                 wake_up(&mle->wq);
3258                                 /* do not need events any longer, so detach
3259                                  * from heartbeat */
3260                                 __dlm_mle_detach_hb_events(dlm, mle);
3261                                 __dlm_put_mle(mle);
3262                         }
3263                         continue;
3264                 }
3265
3266                 /* everything else is a MIGRATION mle */
3267
3268                 /* the rule for MIGRATION mles is that the master
3269                  * becomes UNKNOWN if *either* the original or
3270                  * the new master dies.  all UNKNOWN lockreses
3271                  * are sent to whichever node becomes the recovery
3272                  * master.  the new master is responsible for
3273                  * determining if there is still a master for
3274                  * this lockres, or if he needs to take over
3275                  * mastery.  either way, this node should expect
3276                  * another message to resolve this. */
3277                 if (mle->master != dead_node &&
3278                     mle->new_master != dead_node)
3279                         continue;
3280
3281                 /* if we have reached this point, this mle needs to
3282                  * be removed from the list and freed. */
3283
3284                 /* remove from the list early.  NOTE: unlinking
3285                  * list_head while in list_for_each_safe */
3286                 __dlm_mle_detach_hb_events(dlm, mle);
3287                 spin_lock(&mle->spinlock);
3288                 __dlm_unlink_mle(dlm, mle);
3289                 atomic_set(&mle->woken, 1);
3290                 spin_unlock(&mle->spinlock);
3291                 wake_up(&mle->wq);
3292
3293                 mlog(0, "%s: node %u died during migration from "
3294                      "%u to %u!\n", dlm->name, dead_node,
3295                      mle->master, mle->new_master);
3296                 /* if there is a lockres associated with this
3297                  * mle, find it and set its owner to UNKNOWN */
3298                 hash = dlm_lockid_hash(mle->u.mlename.name, mle->u.mlename.len);
3299                 res = __dlm_lookup_lockres(dlm, mle->u.mlename.name,
3300                                            mle->u.mlename.len, hash);
3301                 if (res) {
3302                         /* unfortunately if we hit this rare case, our
3303                          * lock ordering is messed.  we need to drop
3304                          * the master lock so that we can take the
3305                          * lockres lock, meaning that we will have to
3306                          * restart from the head of list. */
3307                         spin_unlock(&dlm->master_lock);
3308
3309                         /* move lockres onto recovery list */
3310                         spin_lock(&res->spinlock);
3311                         dlm_set_lockres_owner(dlm, res,
3312                                         DLM_LOCK_RES_OWNER_UNKNOWN);
3313                         dlm_move_lockres_to_recovery_list(dlm, res);
3314                         spin_unlock(&res->spinlock);
3315                         dlm_lockres_put(res);
3316
3317                         /* about to get rid of mle, detach from heartbeat */
3318                         __dlm_mle_detach_hb_events(dlm, mle);
3319
3320                         /* dump the mle */
3321                         spin_lock(&dlm->master_lock);
3322                         __dlm_put_mle(mle);
3323                         spin_unlock(&dlm->master_lock);
3324
3325                         /* restart */
3326                         goto top;
3327                 }
3328
3329                 /* this may be the last reference */
3330                 __dlm_put_mle(mle);
3331         }
3332         spin_unlock(&dlm->master_lock);
3333 }
3334
3335
3336 int dlm_finish_migration(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
3337                          u8 old_master)
3338 {
3339         struct dlm_node_iter iter;
3340         int ret = 0;
3341
3342         spin_lock(&dlm->spinlock);
3343         dlm_node_iter_init(dlm->domain_map, &iter);
3344         clear_bit(old_master, iter.node_map);
3345         clear_bit(dlm->node_num, iter.node_map);
3346         spin_unlock(&dlm->spinlock);
3347
3348         /* ownership of the lockres is changing.  account for the
3349          * mastery reference here since old_master will briefly have
3350          * a reference after the migration completes */
3351         spin_lock(&res->spinlock);
3352         dlm_lockres_set_refmap_bit(old_master, res);
3353         spin_unlock(&res->spinlock);
3354
3355         mlog(0, "now time to do a migrate request to other nodes\n");
3356         ret = dlm_do_migrate_request(dlm, res, old_master,
3357                                      dlm->node_num, &iter);
3358         if (ret < 0) {
3359                 mlog_errno(ret);
3360                 goto leave;
3361         }
3362
3363         mlog(0, "doing assert master of %.*s to all except the original node\n",
3364              res->lockname.len, res->lockname.name);
3365         /* this call now finishes out the nodemap
3366          * even if one or more nodes die */
3367         ret = dlm_do_assert_master(dlm, res, iter.node_map,
3368                                    DLM_ASSERT_MASTER_FINISH_MIGRATION);
3369         if (ret < 0) {
3370                 /* no longer need to retry.  all living nodes contacted. */
3371                 mlog_errno(ret);
3372                 ret = 0;
3373         }
3374
3375         memset(iter.node_map, 0, sizeof(iter.node_map));
3376         set_bit(old_master, iter.node_map);
3377         mlog(0, "doing assert master of %.*s back to %u\n",
3378              res->lockname.len, res->lockname.name, old_master);
3379         ret = dlm_do_assert_master(dlm, res, iter.node_map,
3380                                    DLM_ASSERT_MASTER_FINISH_MIGRATION);
3381         if (ret < 0) {
3382                 mlog(0, "assert master to original master failed "
3383                      "with %d.\n", ret);
3384                 /* the only nonzero status here would be because of
3385                  * a dead original node.  we're done. */
3386                 ret = 0;
3387         }
3388
3389         /* all done, set the owner, clear the flag */
3390         spin_lock(&res->spinlock);
3391         dlm_set_lockres_owner(dlm, res, dlm->node_num);
3392         res->state &= ~DLM_LOCK_RES_MIGRATING;
3393         spin_unlock(&res->spinlock);
3394         /* re-dirty it on the new master */
3395         dlm_kick_thread(dlm, res);
3396         wake_up(&res->wq);
3397 leave:
3398         return ret;
3399 }
3400
3401 /*
3402  * LOCKRES AST REFCOUNT
3403  * this is integral to migration
3404  */
3405
3406 /* for future intent to call an ast, reserve one ahead of time.
3407  * this should be called only after waiting on the lockres
3408  * with dlm_wait_on_lockres, and while still holding the
3409  * spinlock after the call. */
3410 void __dlm_lockres_reserve_ast(struct dlm_lock_resource *res)
3411 {
3412         assert_spin_locked(&res->spinlock);
3413         if (res->state & DLM_LOCK_RES_MIGRATING) {
3414                 __dlm_print_one_lock_resource(res);
3415         }
3416         BUG_ON(res->state & DLM_LOCK_RES_MIGRATING);
3417
3418         atomic_inc(&res->asts_reserved);
3419 }
3420
3421 /*
3422  * used to drop the reserved ast, either because it went unused,
3423  * or because the ast/bast was actually called.
3424  *
3425  * also, if there is a pending migration on this lockres,
3426  * and this was the last pending ast on the lockres,
3427  * atomically set the MIGRATING flag before we drop the lock.
3428  * this is how we ensure that migration can proceed with no
3429  * asts in progress.  note that it is ok if the state of the
3430  * queues is such that a lock should be granted in the future
3431  * or that a bast should be fired, because the new master will
3432  * shuffle the lists on this lockres as soon as it is migrated.
3433  */
3434 void dlm_lockres_release_ast(struct dlm_ctxt *dlm,
3435                              struct dlm_lock_resource *res)
3436 {
3437         if (!atomic_dec_and_lock(&res->asts_reserved, &res->spinlock))
3438                 return;
3439
3440         if (!res->migration_pending) {
3441                 spin_unlock(&res->spinlock);
3442                 return;
3443         }
3444
3445         BUG_ON(res->state & DLM_LOCK_RES_MIGRATING);
3446         res->migration_pending = 0;
3447         res->state |= DLM_LOCK_RES_MIGRATING;
3448         spin_unlock(&res->spinlock);
3449         wake_up(&res->wq);
3450         wake_up(&dlm->migration_wq);
3451 }