drbd: close race when detaching from disk
authorLars Ellenberg <lars.ellenberg@linbit.com>
Tue, 11 Feb 2014 07:57:18 +0000 (08:57 +0100)
committerPhilipp Reisner <philipp.reisner@linbit.com>
Thu, 10 Jul 2014 16:34:54 +0000 (18:34 +0200)
BUG: unable to handle kernel NULL pointer dereference at 0000000000000058
IP: bd_release+0x21/0x70
Process drbd_w_t7146
Call Trace:
 close_bdev_exclusive
 drbd_free_ldev [drbd]
 drbd_ldev_destroy [drbd]
 w_after_state_ch [drbd]

Race probably went like this:
  state.disk = D_FAILED

... first one to hit zero during D_FAILED:
   put_ldev() /* ----------------> 0 */
     i = atomic_dec_return()
     if (i == 0)
       if (state.disk == D_FAILED)
         schedule_work(go_diskless)
                                /* 1 <------ */ get_ldev_if_state()
   go_diskless()
      do_some_pre_cleanup()                     corresponding put_ldev():
      force_state(D_DISKLESS)   /* 0 <------ */ i = atomic_dec_return()
                                                if (i == 0)
        atomic_inc() /* ---------> 1 */
        state.disk = D_DISKLESS
        schedule_work(after_state_ch)           /* execution pre-empted by IRQ ? */

   after_state_ch()
     put_ldev()
       i = atomic_dec_return()  /* 0 */
       if (i == 0)
         if (state.disk == D_DISKLESS)            if (state.disk == D_DISKLESS)
           drbd_ldev_destroy()                      drbd_ldev_destroy();

Trying to fix this by checking the disk state *before* the
atomic_dec_return(), which implies memory barriers, and by inserting
extra memory barriers around the state assignment in __drbd_set_state().

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
drivers/block/drbd/drbd_int.h
drivers/block/drbd/drbd_state.c

index a16f9ae3c98a4ed81c71e45a10a747058e9a1612..a0ffc19ccf0e78324a5551ea7692ac4f1f928436 100644 (file)
@@ -1946,6 +1946,11 @@ static inline bool is_sync_state(enum drbd_conns connection_state)
 
 static inline void put_ldev(struct drbd_device *device)
 {
+       enum drbd_disk_state ds = device->state.disk;
+       /* We must check the state *before* the atomic_dec becomes visible,
+        * or we have a theoretical race where someone hitting zero,
+        * while state still D_FAILED, will then see D_DISKLESS in the
+        * condition below and calling into destroy, where he must not, yet. */
        int i = atomic_dec_return(&device->local_cnt);
 
        /* This may be called from some endio handler,
@@ -1954,10 +1959,10 @@ static inline void put_ldev(struct drbd_device *device)
        __release(local);
        D_ASSERT(device, i >= 0);
        if (i == 0) {
-               if (device->state.disk == D_DISKLESS)
+               if (ds == D_DISKLESS)
                        /* even internal references gone, safe to destroy */
                        drbd_ldev_destroy(device);
-               if (device->state.disk == D_FAILED) {
+               if (ds == D_FAILED) {
                        /* all application IO references gone. */
                        if (!test_and_set_bit(GO_DISKLESS, &device->flags))
                                drbd_queue_work(&first_peer_device(device)->connection->sender_work,
index 1bddd6cf8ac7c3dec991a53586e9eb597d8674e2..6629f46681025f6ad4021e8f90e0fa02e219fa02 100644 (file)
@@ -958,7 +958,6 @@ __drbd_set_state(struct drbd_device *device, union drbd_state ns,
        enum drbd_state_rv rv = SS_SUCCESS;
        enum sanitize_state_warnings ssw;
        struct after_state_chg_work *ascw;
-       bool did_remote, should_do_remote;
 
        os = drbd_read_state(device);
 
@@ -1010,18 +1009,22 @@ __drbd_set_state(struct drbd_device *device, union drbd_state ns,
            (os.disk != D_DISKLESS && ns.disk == D_DISKLESS))
                atomic_inc(&device->local_cnt);
 
-       did_remote = drbd_should_do_remote(device->state);
        if (!is_sync_state(os.conn) && is_sync_state(ns.conn))
                clear_bit(RS_DONE, &device->flags);
 
+       /* changes to local_cnt and device flags should be visible before
+        * changes to state, which again should be visible before anything else
+        * depending on that change happens. */
+       smp_wmb();
        device->state.i = ns.i;
-       should_do_remote = drbd_should_do_remote(device->state);
        device->resource->susp = ns.susp;
        device->resource->susp_nod = ns.susp_nod;
        device->resource->susp_fen = ns.susp_fen;
+       smp_wmb();
 
        /* put replicated vs not-replicated requests in seperate epochs */
-       if (did_remote != should_do_remote)
+       if (drbd_should_do_remote((union drbd_dev_state)os.i) !=
+           drbd_should_do_remote((union drbd_dev_state)ns.i))
                start_new_tl_epoch(connection);
 
        if (os.disk == D_ATTACHING && ns.disk >= D_NEGOTIATING)