dlm: fix unlock balance warnings
authorDavid Teigland <teigland@redhat.com>
Thu, 2 Aug 2012 16:08:21 +0000 (11:08 -0500)
committerDavid Teigland <teigland@redhat.com>
Wed, 8 Aug 2012 16:33:49 +0000 (11:33 -0500)
The in_recovery rw_semaphore has always been acquired and
released by different threads by design.  To work around
the "BUG: bad unlock balance detected!" messages, adjust
things so the dlm_recoverd thread always does both down_write
and up_write.

Signed-off-by: David Teigland <teigland@redhat.com>
fs/dlm/dlm_internal.h
fs/dlm/lockspace.c
fs/dlm/member.c
fs/dlm/rcom.c
fs/dlm/recoverd.c
fs/dlm/recoverd.h

index 9d3e485f88c8b3b2230a9ba49ad315113b077bfd..871c1abf6029869ba7157c2f0db032eaabe951f3 100644 (file)
@@ -604,6 +604,7 @@ struct dlm_ls {
        struct idr              ls_recover_idr;
        spinlock_t              ls_recover_idr_lock;
        wait_queue_head_t       ls_wait_general;
+       wait_queue_head_t       ls_recover_lock_wait;
        struct mutex            ls_clear_proc_locks;
 
        struct list_head        ls_root_list;   /* root resources */
@@ -616,15 +617,40 @@ struct dlm_ls {
        char                    ls_name[1];
 };
 
-#define LSFL_WORK              0
-#define LSFL_RUNNING           1
-#define LSFL_RECOVERY_STOP     2
-#define LSFL_RCOM_READY                3
-#define LSFL_RCOM_WAIT         4
-#define LSFL_UEVENT_WAIT       5
-#define LSFL_TIMEWARN          6
-#define LSFL_CB_DELAY          7
-#define LSFL_NODIR             8
+/*
+ * LSFL_RECOVER_STOP - dlm_ls_stop() sets this to tell dlm recovery routines
+ * that they should abort what they're doing so new recovery can be started.
+ *
+ * LSFL_RECOVER_DOWN - dlm_ls_stop() sets this to tell dlm_recoverd that it
+ * should do down_write() on the in_recovery rw_semaphore. (doing down_write
+ * within dlm_ls_stop causes complaints about the lock acquired/released
+ * in different contexts.)
+ *
+ * LSFL_RECOVER_LOCK - dlm_recoverd holds the in_recovery rw_semaphore.
+ * It sets this after it is done with down_write() on the in_recovery
+ * rw_semaphore and clears it after it has released the rw_semaphore.
+ *
+ * LSFL_RECOVER_WORK - dlm_ls_start() sets this to tell dlm_recoverd that it
+ * should begin recovery of the lockspace.
+ *
+ * LSFL_RUNNING - set when normal locking activity is enabled.
+ * dlm_ls_stop() clears this to tell dlm locking routines that they should
+ * quit what they are doing so recovery can run.  dlm_recoverd sets
+ * this after recovery is finished.
+ */
+
+#define LSFL_RECOVER_STOP      0
+#define LSFL_RECOVER_DOWN      1
+#define LSFL_RECOVER_LOCK      2
+#define LSFL_RECOVER_WORK      3
+#define LSFL_RUNNING           4
+
+#define LSFL_RCOM_READY                5
+#define LSFL_RCOM_WAIT         6
+#define LSFL_UEVENT_WAIT       7
+#define LSFL_TIMEWARN          8
+#define LSFL_CB_DELAY          9
+#define LSFL_NODIR             10
 
 /* much of this is just saving user space pointers associated with the
    lock that we pass back to the user lib with an ast */
@@ -667,7 +693,7 @@ static inline int dlm_locking_stopped(struct dlm_ls *ls)
 
 static inline int dlm_recovery_stopped(struct dlm_ls *ls)
 {
-       return test_bit(LSFL_RECOVERY_STOP, &ls->ls_flags);
+       return test_bit(LSFL_RECOVER_STOP, &ls->ls_flags);
 }
 
 static inline int dlm_no_directory(struct dlm_ls *ls)
index 952557d00ccdf0744ca2527300ef1d2798e14cdb..2e99fb0c973776fe9f103f5f0d220d2e7aceb256 100644 (file)
@@ -582,8 +582,6 @@ static int new_lockspace(const char *name, const char *cluster,
        INIT_LIST_HEAD(&ls->ls_root_list);
        init_rwsem(&ls->ls_root_sem);
 
-       down_write(&ls->ls_in_recovery);
-
        spin_lock(&lslist_lock);
        ls->ls_create_count = 1;
        list_add(&ls->ls_list, &lslist);
@@ -597,13 +595,24 @@ static int new_lockspace(const char *name, const char *cluster,
                }
        }
 
-       /* needs to find ls in lslist */
+       init_waitqueue_head(&ls->ls_recover_lock_wait);
+
+       /*
+        * Once started, dlm_recoverd first looks for ls in lslist, then
+        * initializes ls_in_recovery as locked in "down" mode.  We need
+        * to wait for the wakeup from dlm_recoverd because in_recovery
+        * has to start out in down mode.
+        */
+
        error = dlm_recoverd_start(ls);
        if (error) {
                log_error(ls, "can't start dlm_recoverd %d", error);
                goto out_callback;
        }
 
+       wait_event(ls->ls_recover_lock_wait,
+                  test_bit(LSFL_RECOVER_LOCK, &ls->ls_flags));
+
        ls->ls_kobj.kset = dlm_kset;
        error = kobject_init_and_add(&ls->ls_kobj, &dlm_ktype, NULL,
                                     "%s", ls->ls_name);
index 862640a36d5cbba1762ad47317dffbd72eab0531..476557b54921793a246cb11b1fc8f19c348aa17d 100644 (file)
@@ -616,13 +616,13 @@ int dlm_ls_stop(struct dlm_ls *ls)
        down_write(&ls->ls_recv_active);
 
        /*
-        * Abort any recovery that's in progress (see RECOVERY_STOP,
+        * Abort any recovery that's in progress (see RECOVER_STOP,
         * dlm_recovery_stopped()) and tell any other threads running in the
         * dlm to quit any processing (see RUNNING, dlm_locking_stopped()).
         */
 
        spin_lock(&ls->ls_recover_lock);
-       set_bit(LSFL_RECOVERY_STOP, &ls->ls_flags);
+       set_bit(LSFL_RECOVER_STOP, &ls->ls_flags);
        new = test_and_clear_bit(LSFL_RUNNING, &ls->ls_flags);
        ls->ls_recover_seq++;
        spin_unlock(&ls->ls_recover_lock);
@@ -642,12 +642,16 @@ int dlm_ls_stop(struct dlm_ls *ls)
         *    when recovery is complete.
         */
 
-       if (new)
-               down_write(&ls->ls_in_recovery);
+       if (new) {
+               set_bit(LSFL_RECOVER_DOWN, &ls->ls_flags);
+               wake_up_process(ls->ls_recoverd_task);
+               wait_event(ls->ls_recover_lock_wait,
+                          test_bit(LSFL_RECOVER_LOCK, &ls->ls_flags));
+       }
 
        /*
         * The recoverd suspend/resume makes sure that dlm_recoverd (if
-        * running) has noticed RECOVERY_STOP above and quit processing the
+        * running) has noticed RECOVER_STOP above and quit processing the
         * previous recovery.
         */
 
@@ -709,7 +713,8 @@ int dlm_ls_start(struct dlm_ls *ls)
                kfree(rv_old);
        }
 
-       dlm_recoverd_kick(ls);
+       set_bit(LSFL_RECOVER_WORK, &ls->ls_flags);
+       wake_up_process(ls->ls_recoverd_task);
        return 0;
 
  fail:
index 87f1a56eab32c8850ce1eec9eefa404c15919f49..9d61947d473a1191f4a0391bb837746dfb8206c3 100644 (file)
@@ -581,7 +581,7 @@ void dlm_receive_rcom(struct dlm_ls *ls, struct dlm_rcom *rc, int nodeid)
 
        spin_lock(&ls->ls_recover_lock);
        status = ls->ls_recover_status;
-       stop = test_bit(LSFL_RECOVERY_STOP, &ls->ls_flags);
+       stop = test_bit(LSFL_RECOVER_STOP, &ls->ls_flags);
        seq = ls->ls_recover_seq;
        spin_unlock(&ls->ls_recover_lock);
 
index 88ce65ff021ed35848b30ae129e9334930413cc3..32f9f8926ec3f17a3a69433402e8a0fecb07a1f6 100644 (file)
@@ -41,6 +41,7 @@ static int enable_locking(struct dlm_ls *ls, uint64_t seq)
                set_bit(LSFL_RUNNING, &ls->ls_flags);
                /* unblocks processes waiting to enter the dlm */
                up_write(&ls->ls_in_recovery);
+               clear_bit(LSFL_RECOVER_LOCK, &ls->ls_flags);
                error = 0;
        }
        spin_unlock(&ls->ls_recover_lock);
@@ -262,7 +263,7 @@ static void do_ls_recovery(struct dlm_ls *ls)
        rv = ls->ls_recover_args;
        ls->ls_recover_args = NULL;
        if (rv && ls->ls_recover_seq == rv->seq)
-               clear_bit(LSFL_RECOVERY_STOP, &ls->ls_flags);
+               clear_bit(LSFL_RECOVER_STOP, &ls->ls_flags);
        spin_unlock(&ls->ls_recover_lock);
 
        if (rv) {
@@ -282,26 +283,34 @@ static int dlm_recoverd(void *arg)
                return -1;
        }
 
+       down_write(&ls->ls_in_recovery);
+       set_bit(LSFL_RECOVER_LOCK, &ls->ls_flags);
+       wake_up(&ls->ls_recover_lock_wait);
+
        while (!kthread_should_stop()) {
                set_current_state(TASK_INTERRUPTIBLE);
-               if (!test_bit(LSFL_WORK, &ls->ls_flags))
+               if (!test_bit(LSFL_RECOVER_WORK, &ls->ls_flags) &&
+                   !test_bit(LSFL_RECOVER_DOWN, &ls->ls_flags))
                        schedule();
                set_current_state(TASK_RUNNING);
 
-               if (test_and_clear_bit(LSFL_WORK, &ls->ls_flags))
+               if (test_and_clear_bit(LSFL_RECOVER_DOWN, &ls->ls_flags)) {
+                       down_write(&ls->ls_in_recovery);
+                       set_bit(LSFL_RECOVER_LOCK, &ls->ls_flags);
+                       wake_up(&ls->ls_recover_lock_wait);
+               }
+
+               if (test_and_clear_bit(LSFL_RECOVER_WORK, &ls->ls_flags))
                        do_ls_recovery(ls);
        }
 
+       if (test_bit(LSFL_RECOVER_LOCK, &ls->ls_flags))
+               up_write(&ls->ls_in_recovery);
+
        dlm_put_lockspace(ls);
        return 0;
 }
 
-void dlm_recoverd_kick(struct dlm_ls *ls)
-{
-       set_bit(LSFL_WORK, &ls->ls_flags);
-       wake_up_process(ls->ls_recoverd_task);
-}
-
 int dlm_recoverd_start(struct dlm_ls *ls)
 {
        struct task_struct *p;
index 866657c5d69deb0eaf415f95f6f2690ab70564c8..8856079733fa303d4b411938b13a5f1caac55598 100644 (file)
@@ -14,7 +14,6 @@
 #ifndef __RECOVERD_DOT_H__
 #define __RECOVERD_DOT_H__
 
-void dlm_recoverd_kick(struct dlm_ls *ls);
 void dlm_recoverd_stop(struct dlm_ls *ls);
 int dlm_recoverd_start(struct dlm_ls *ls);
 void dlm_recoverd_suspend(struct dlm_ls *ls);