4 This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
6 Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
7 Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
8 Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
10 Thanks to Carter Burden, Bart Grantham and Gennadiy Nerubayev
11 from Logicworks, Inc. for making SDP replication support possible.
13 drbd is free software; you can redistribute it and/or modify
14 it under the terms of the GNU General Public License as published by
15 the Free Software Foundation; either version 2, or (at your option)
18 drbd is distributed in the hope that it will be useful,
19 but WITHOUT ANY WARRANTY; without even the implied warranty of
20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 GNU General Public License for more details.
23 You should have received a copy of the GNU General Public License
24 along with drbd; see the file COPYING. If not, write to
25 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
29 #include <linux/module.h>
30 #include <linux/drbd.h>
31 #include <asm/uaccess.h>
32 #include <asm/types.h>
34 #include <linux/ctype.h>
35 #include <linux/mutex.h>
37 #include <linux/file.h>
38 #include <linux/proc_fs.h>
39 #include <linux/init.h>
41 #include <linux/memcontrol.h>
42 #include <linux/mm_inline.h>
43 #include <linux/slab.h>
44 #include <linux/random.h>
45 #include <linux/reboot.h>
46 #include <linux/notifier.h>
47 #include <linux/kthread.h>
49 #define __KERNEL_SYSCALLS__
50 #include <linux/unistd.h>
51 #include <linux/vmalloc.h>
53 #include <linux/drbd_limits.h>
55 #include "drbd_req.h" /* only for _req_mod in tl_release and tl_clear */
59 struct after_state_chg_work {
63 enum chg_state_flags flags;
64 struct completion *done;
67 static DEFINE_MUTEX(drbd_main_mutex);
68 int drbdd_init(struct drbd_thread *);
69 int drbd_worker(struct drbd_thread *);
70 int drbd_asender(struct drbd_thread *);
73 static int drbd_open(struct block_device *bdev, fmode_t mode);
74 static int drbd_release(struct gendisk *gd, fmode_t mode);
75 static int w_after_state_ch(struct drbd_conf *mdev, struct drbd_work *w, int unused);
76 static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
77 union drbd_state ns, enum chg_state_flags flags);
78 static int w_md_sync(struct drbd_conf *mdev, struct drbd_work *w, int unused);
79 static void md_sync_timer_fn(unsigned long data);
80 static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused);
81 static int w_go_diskless(struct drbd_conf *mdev, struct drbd_work *w, int unused);
82 static void _tl_clear(struct drbd_conf *mdev);
84 MODULE_AUTHOR("Philipp Reisner <phil@linbit.com>, "
85 "Lars Ellenberg <lars@linbit.com>");
86 MODULE_DESCRIPTION("drbd - Distributed Replicated Block Device v" REL_VERSION);
87 MODULE_VERSION(REL_VERSION);
88 MODULE_LICENSE("GPL");
89 MODULE_PARM_DESC(minor_count, "Maximum number of drbd devices ("
90 __stringify(DRBD_MINOR_COUNT_MIN) "-" __stringify(DRBD_MINOR_COUNT_MAX) ")");
91 MODULE_ALIAS_BLOCKDEV_MAJOR(DRBD_MAJOR);
93 #include <linux/moduleparam.h>
94 /* allow_open_on_secondary */
95 MODULE_PARM_DESC(allow_oos, "DONT USE!");
96 /* thanks to these macros, if compiled into the kernel (not as a module),
97 * this becomes the boot parameter drbd.minor_count */
98 module_param(minor_count, uint, 0444);
99 module_param(disable_sendpage, bool, 0644);
100 module_param(allow_oos, bool, 0);
101 module_param(cn_idx, uint, 0444);
102 module_param(proc_details, int, 0644);
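/* Illustrative usage note (not part of the original source): when DRBD is
 * built as a module these are ordinary module parameters, e.g.
 *	modprobe drbd minor_count=8
 * when built into the kernel, the same knobs become boot parameters with a
 * "drbd." prefix, e.g. drbd.minor_count=8 on the kernel command line. */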
104 #ifdef CONFIG_DRBD_FAULT_INJECTION
107 static int fault_count;
109 /* bitmap of enabled faults */
110 module_param(enable_faults, int, 0664);
111 /* fault rate % value - applies to all enabled faults */
112 module_param(fault_rate, int, 0664);
113 /* count of faults inserted */
114 module_param(fault_count, int, 0664);
115 /* bitmap of devices to insert faults on */
116 module_param(fault_devs, int, 0644);
119 /* module parameters, defined here */
120 unsigned int minor_count = DRBD_MINOR_COUNT_DEF;
121 bool disable_sendpage;
123 unsigned int cn_idx = CN_IDX_DRBD;
124 int proc_details; /* Detail level in /proc/drbd */
126 /* Module parameter for setting the user mode helper program
127 * to run. Default is /sbin/drbdadm */
128 char usermode_helper[80] = "/sbin/drbdadm";
130 module_param_string(usermode_helper, usermode_helper, sizeof(usermode_helper), 0644);
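/* Illustrative note (assumption, not in the original file): since the
 * parameter is registered with mode 0644, it can also be changed at runtime
 * through sysfs, e.g.
 *	echo /usr/local/sbin/drbdadm > /sys/module/drbd/parameters/usermode_helper
 * the path /usr/local/sbin/drbdadm is only an example value. */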
132 /* in 2.6.x, our device mapping and config info contains our virtual gendisks
133 * as member "struct gendisk *vdisk;"
135 struct drbd_conf **minor_table;
137 struct kmem_cache *drbd_request_cache;
138 struct kmem_cache *drbd_ee_cache; /* epoch entries */
139 struct kmem_cache *drbd_bm_ext_cache; /* bitmap extents */
140 struct kmem_cache *drbd_al_ext_cache; /* activity log extents */
141 mempool_t *drbd_request_mempool;
142 mempool_t *drbd_ee_mempool;
143 mempool_t *drbd_md_io_page_pool;
144 struct bio_set *drbd_md_io_bio_set;
146 /* I do not use a standard mempool, because:
147 1) I want to hand out the pre-allocated objects first.
148 2) I want to be able to interrupt sleeping allocation with a signal.
149 Note: This is a singly linked list; the next pointer is stored in the private
150 member of struct page.
152 struct page *drbd_pp_pool;
153 spinlock_t drbd_pp_lock;
155 wait_queue_head_t drbd_pp_wait;
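/* Minimal sketch (illustrative only, not part of the original file) of how
 * pages are chained through the private member, as described above; the
 * helper names drbd_pp_push/drbd_pp_pop are hypothetical, and callers would
 * hold drbd_pp_lock:
 *
 *	static void drbd_pp_push(struct page *page)
 *	{
 *		set_page_private(page, (unsigned long)drbd_pp_pool);
 *		drbd_pp_pool = page;
 *	}
 *
 *	static struct page *drbd_pp_pop(void)
 *	{
 *		struct page *page = drbd_pp_pool;
 *		if (page)
 *			drbd_pp_pool = (struct page *)page_private(page);
 *		return page;
 *	}
 */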
157 DEFINE_RATELIMIT_STATE(drbd_ratelimit_state, 5 * HZ, 5);
159 static const struct block_device_operations drbd_ops = {
160 .owner = THIS_MODULE,
162 .release = drbd_release,
165 struct bio *bio_alloc_drbd(gfp_t gfp_mask)
167 if (!drbd_md_io_bio_set)
168 return bio_alloc(gfp_mask, 1);
170 return bio_alloc_bioset(gfp_mask, 1, drbd_md_io_bio_set);
174 /* When checking with sparse, and this is an inline function, sparse will
175 give tons of false positives. When this is a real function, sparse works.
177 int _get_ldev_if_state(struct drbd_conf *mdev, enum drbd_disk_state mins)
181 atomic_inc(&mdev->local_cnt);
182 io_allowed = (mdev->state.disk >= mins);
184 if (atomic_dec_and_test(&mdev->local_cnt))
185 wake_up(&mdev->misc_wait);
193 * DOC: The transfer log
195 * The transfer log is a singly linked list of &struct drbd_tl_epoch objects.
196 * mdev->newest_tle points to the head, mdev->oldest_tle points to the tail
197 * of the list. There is always at least one &struct drbd_tl_epoch object.
199 * Each &struct drbd_tl_epoch has a circular, doubly linked list of requests
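 * Rough picture (illustrative addition, not from the original file):
 *
 *	mdev->oldest_tle -> epoch -> epoch -> ... -> epoch <- mdev->newest_tle
 *
 * and walking the requests of one epoch b (a struct drbd_tl_epoch *) looks
 * roughly like
 *
 *	struct drbd_request *req;
 *	list_for_each_entry(req, &b->requests, tl_requests)
 *		...;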
202 static int tl_init(struct drbd_conf *mdev)
204 struct drbd_tl_epoch *b;
206 /* during device minor initialization, we may well use GFP_KERNEL */
207 b = kmalloc(sizeof(struct drbd_tl_epoch), GFP_KERNEL);
210 INIT_LIST_HEAD(&b->requests);
211 INIT_LIST_HEAD(&b->w.list);
215 b->w.cb = NULL; /* if this is != NULL, we need to dec_ap_pending in tl_clear */
217 mdev->oldest_tle = b;
218 mdev->newest_tle = b;
219 INIT_LIST_HEAD(&mdev->out_of_sequence_requests);
220 INIT_LIST_HEAD(&mdev->barrier_acked_requests);
222 mdev->tl_hash = NULL;
228 static void tl_cleanup(struct drbd_conf *mdev)
230 D_ASSERT(mdev->oldest_tle == mdev->newest_tle);
231 D_ASSERT(list_empty(&mdev->out_of_sequence_requests));
232 kfree(mdev->oldest_tle);
233 mdev->oldest_tle = NULL;
234 kfree(mdev->unused_spare_tle);
235 mdev->unused_spare_tle = NULL;
236 kfree(mdev->tl_hash);
237 mdev->tl_hash = NULL;
242 * _tl_add_barrier() - Adds a barrier to the transfer log
243 * @mdev: DRBD device.
244 * @new: Barrier to be added before the current head of the TL.
246 * The caller must hold the req_lock.
248 void _tl_add_barrier(struct drbd_conf *mdev, struct drbd_tl_epoch *new)
250 struct drbd_tl_epoch *newest_before;
252 INIT_LIST_HEAD(&new->requests);
253 INIT_LIST_HEAD(&new->w.list);
254 new->w.cb = NULL; /* if this is != NULL, we need to dec_ap_pending in tl_clear */
258 newest_before = mdev->newest_tle;
259 new->br_number = newest_before->br_number+1;
260 if (mdev->newest_tle != new) {
261 mdev->newest_tle->next = new;
262 mdev->newest_tle = new;
267 * tl_release() - Free or recycle the oldest &struct drbd_tl_epoch object of the TL
268 * @mdev: DRBD device.
269 * @barrier_nr: Expected identifier of the DRBD write barrier packet.
270 * @set_size: Expected number of requests before that barrier.
272 * In case the passed barrier_nr or set_size does not match the oldest
273 * &struct drbd_tl_epoch object, this function will cause a termination
276 void tl_release(struct drbd_conf *mdev, unsigned int barrier_nr,
277 unsigned int set_size)
279 struct drbd_tl_epoch *b, *nob; /* next old barrier */
280 struct list_head *le, *tle;
281 struct drbd_request *r;
283 spin_lock_irq(&mdev->req_lock);
285 b = mdev->oldest_tle;
287 /* first some paranoia code */
289 dev_err(DEV, "BAD! BarrierAck #%u received, but no epoch in tl!?\n",
293 if (b->br_number != barrier_nr) {
294 dev_err(DEV, "BAD! BarrierAck #%u received, expected #%u!\n",
295 barrier_nr, b->br_number);
298 if (b->n_writes != set_size) {
299 dev_err(DEV, "BAD! BarrierAck #%u received with n_writes=%u, expected n_writes=%u!\n",
300 barrier_nr, set_size, b->n_writes);
304 /* Clean up list of requests processed during current epoch */
305 list_for_each_safe(le, tle, &b->requests) {
306 r = list_entry(le, struct drbd_request, tl_requests);
307 _req_mod(r, barrier_acked);
309 /* There could be requests on the list waiting for completion
310 of the write to the local disk. To avoid corruptions of
311 slab's data structures we have to remove the list's head.
313 Also there could have been a barrier ack out of sequence, overtaking
314 the write acks - which would be a bug and violating write ordering.
315 To not deadlock in case we lose connection while such requests are
316 still pending, we need some way to find them for the
317 _req_mod(connection_lost_while_pending).
319 These have been list_move'd to the out_of_sequence_requests list in
320 _req_mod(, barrier_acked) above.
322 list_splice_init(&b->requests, &mdev->barrier_acked_requests);
325 if (drbd_test_and_clear_flag(mdev, CREATE_BARRIER)) {
326 _tl_add_barrier(mdev, b);
328 mdev->oldest_tle = nob;
329 /* if nob == NULL, b was the only barrier and becomes the new
330 barrier. Therefore mdev->oldest_tle already points to b */
332 D_ASSERT(nob != NULL);
333 mdev->oldest_tle = nob;
337 spin_unlock_irq(&mdev->req_lock);
338 dec_ap_pending(mdev);
343 spin_unlock_irq(&mdev->req_lock);
344 drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR));
349 * _tl_restart() - Walks the transfer log, and applies an action to all requests
350 * @mdev: DRBD device.
351 * @what: The action/event to perform with all request objects
353 * @what might be one of connection_lost_while_pending, resend, fail_frozen_disk_io,
354 * restart_frozen_disk_io.
356 static void _tl_restart(struct drbd_conf *mdev, enum drbd_req_event what)
358 struct drbd_tl_epoch *b, *tmp, **pn;
359 struct list_head *le, *tle, carry_reads;
360 struct drbd_request *req;
361 int rv, n_writes, n_reads;
363 b = mdev->oldest_tle;
364 pn = &mdev->oldest_tle;
368 INIT_LIST_HEAD(&carry_reads);
369 list_for_each_safe(le, tle, &b->requests) {
370 req = list_entry(le, struct drbd_request, tl_requests);
371 rv = _req_mod(req, what);
373 n_writes += (rv & MR_WRITE) >> MR_WRITE_SHIFT;
374 n_reads += (rv & MR_READ) >> MR_READ_SHIFT;
379 if (what == resend) {
380 b->n_writes = n_writes;
381 if (b->w.cb == NULL) {
382 b->w.cb = w_send_barrier;
383 inc_ap_pending(mdev);
384 drbd_set_flag(mdev, CREATE_BARRIER);
387 drbd_queue_work(&mdev->data.work, &b->w);
392 list_add(&carry_reads, &b->requests);
393 /* there could still be requests on that ring list,
394 * in case local io is still pending */
395 list_del(&b->requests);
397 /* dec_ap_pending corresponding to queue_barrier.
398 * the newest barrier may not have been queued yet,
399 * in which case w.cb is still NULL. */
401 dec_ap_pending(mdev);
403 if (b == mdev->newest_tle) {
404 /* recycle, but reinit! */
405 D_ASSERT(tmp == NULL);
406 INIT_LIST_HEAD(&b->requests);
407 list_splice(&carry_reads, &b->requests);
408 INIT_LIST_HEAD(&b->w.list);
410 b->br_number = net_random();
420 list_splice(&carry_reads, &b->requests);
423 /* Actions operating on the disk state also want to work on
424 requests that got barrier-acked. */
426 list_for_each_safe(le, tle, &mdev->barrier_acked_requests) {
427 req = list_entry(le, struct drbd_request, tl_requests);
434 * tl_clear() - Clears all requests and &struct drbd_tl_epoch objects out of the TL
435 * @mdev: DRBD device.
437 * This is called after the connection to the peer was lost. The storage covered
438 by the requests on the transfer log gets marked as out of sync. Called from the
439 * receiver thread and the worker thread.
441 void tl_clear(struct drbd_conf *mdev)
443 spin_lock_irq(&mdev->req_lock);
445 spin_unlock_irq(&mdev->req_lock);
448 static void _tl_clear(struct drbd_conf *mdev)
450 struct list_head *le, *tle;
451 struct drbd_request *r;
453 _tl_restart(mdev, connection_lost_while_pending);
455 /* we expect this list to be empty. */
456 D_ASSERT(list_empty(&mdev->out_of_sequence_requests));
458 /* but just in case, clean it up anyways! */
459 list_for_each_safe(le, tle, &mdev->out_of_sequence_requests) {
460 r = list_entry(le, struct drbd_request, tl_requests);
461 /* It would be nice to complete outside of spinlock.
462 * But this is easier for now. */
463 _req_mod(r, connection_lost_while_pending);
466 /* ensure bit indicating barrier is required is clear */
467 drbd_clear_flag(mdev, CREATE_BARRIER);
469 memset(mdev->app_reads_hash, 0, APP_R_HSIZE*sizeof(void *));
473 void tl_restart(struct drbd_conf *mdev, enum drbd_req_event what)
475 spin_lock_irq(&mdev->req_lock);
476 _tl_restart(mdev, what);
477 spin_unlock_irq(&mdev->req_lock);
481 * tl_abort_disk_io() - Abort disk I/O for all requests for a certain mdev in the TL
482 * @mdev: DRBD device.
484 void tl_abort_disk_io(struct drbd_conf *mdev)
486 struct drbd_tl_epoch *b;
487 struct list_head *le, *tle;
488 struct drbd_request *req;
490 spin_lock_irq(&mdev->req_lock);
491 b = mdev->oldest_tle;
493 list_for_each_safe(le, tle, &b->requests) {
494 req = list_entry(le, struct drbd_request, tl_requests);
495 if (!(req->rq_state & RQ_LOCAL_PENDING))
497 _req_mod(req, abort_disk_io);
502 list_for_each_safe(le, tle, &mdev->barrier_acked_requests) {
503 req = list_entry(le, struct drbd_request, tl_requests);
504 if (!(req->rq_state & RQ_LOCAL_PENDING))
506 _req_mod(req, abort_disk_io);
509 spin_unlock_irq(&mdev->req_lock);
513 * cl_wide_st_chg() - true if the state change is a cluster wide one
514 * @mdev: DRBD device.
515 * @os: old (current) state.
516 * @ns: new (wanted) state.
518 static int cl_wide_st_chg(struct drbd_conf *mdev,
519 union drbd_state os, union drbd_state ns)
521 return (os.conn >= C_CONNECTED && ns.conn >= C_CONNECTED &&
522 ((os.role != R_PRIMARY && ns.role == R_PRIMARY) ||
523 (os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) ||
524 (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S) ||
525 (os.disk != D_FAILED && ns.disk == D_FAILED))) ||
526 (os.conn >= C_CONNECTED && ns.conn == C_DISCONNECTING) ||
527 (os.conn == C_CONNECTED && ns.conn == C_VERIFY_S);
531 drbd_change_state(struct drbd_conf *mdev, enum chg_state_flags f,
532 union drbd_state mask, union drbd_state val)
535 union drbd_state os, ns;
536 enum drbd_state_rv rv;
538 spin_lock_irqsave(&mdev->req_lock, flags);
540 ns.i = (os.i & ~mask.i) | val.i;
541 rv = _drbd_set_state(mdev, ns, f, NULL);
543 spin_unlock_irqrestore(&mdev->req_lock, flags);
549 * drbd_force_state() - Impose a change which happens outside our control on our state
550 * @mdev: DRBD device.
551 * @mask: mask of state bits to change.
552 * @val: value of new state bits.
554 void drbd_force_state(struct drbd_conf *mdev,
555 union drbd_state mask, union drbd_state val)
557 drbd_change_state(mdev, CS_HARD, mask, val);
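/* Illustrative note (not in the original file): mask/val select which fields
 * of union drbd_state change.  The NS(field, value) macro used by callers
 * builds a mask covering only that field plus the matching val, so e.g.
 *	drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR));
 * changes only state.conn and leaves every other field alone, via
 * ns.i = (os.i & ~mask.i) | val.i in drbd_change_state() above. */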
560 static enum drbd_state_rv is_valid_state(struct drbd_conf *, union drbd_state);
561 static enum drbd_state_rv is_valid_state_transition(struct drbd_conf *,
564 enum sanitize_state_warnings {
566 ABORTED_ONLINE_VERIFY,
568 CONNECTION_LOST_NEGOTIATING,
569 IMPLICITLY_UPGRADED_DISK,
570 IMPLICITLY_UPGRADED_PDSK,
572 static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os,
573 union drbd_state ns, enum sanitize_state_warnings *warn);
574 int drbd_send_state_req(struct drbd_conf *,
575 union drbd_state, union drbd_state);
577 static enum drbd_state_rv
578 _req_st_cond(struct drbd_conf *mdev, union drbd_state mask,
579 union drbd_state val)
581 union drbd_state os, ns;
583 enum drbd_state_rv rv;
585 if (drbd_test_and_clear_flag(mdev, CL_ST_CHG_SUCCESS))
586 return SS_CW_SUCCESS;
588 if (drbd_test_and_clear_flag(mdev, CL_ST_CHG_FAIL))
589 return SS_CW_FAILED_BY_PEER;
592 spin_lock_irqsave(&mdev->req_lock, flags);
594 ns.i = (os.i & ~mask.i) | val.i;
595 ns = sanitize_state(mdev, os, ns, NULL);
597 if (!cl_wide_st_chg(mdev, os, ns))
600 rv = is_valid_state(mdev, ns);
601 if (rv == SS_SUCCESS) {
602 rv = is_valid_state_transition(mdev, ns, os);
603 if (rv == SS_SUCCESS)
604 rv = SS_UNKNOWN_ERROR; /* cont waiting, otherwise fail. */
607 spin_unlock_irqrestore(&mdev->req_lock, flags);
613 * drbd_req_state() - Perform a possibly cluster-wide state change
614 * @mdev: DRBD device.
615 * @mask: mask of state bits to change.
616 * @val: value of new state bits.
619 * Should not be called directly, use drbd_request_state() or
620 * _drbd_request_state().
622 static enum drbd_state_rv
623 drbd_req_state(struct drbd_conf *mdev, union drbd_state mask,
624 union drbd_state val, enum chg_state_flags f)
626 struct completion done;
628 union drbd_state os, ns;
629 enum drbd_state_rv rv;
631 init_completion(&done);
633 if (f & CS_SERIALIZE)
634 mutex_lock(&mdev->state_mutex);
636 spin_lock_irqsave(&mdev->req_lock, flags);
638 ns.i = (os.i & ~mask.i) | val.i;
639 ns = sanitize_state(mdev, os, ns, NULL);
641 if (cl_wide_st_chg(mdev, os, ns)) {
642 rv = is_valid_state(mdev, ns);
643 if (rv == SS_SUCCESS)
644 rv = is_valid_state_transition(mdev, ns, os);
645 spin_unlock_irqrestore(&mdev->req_lock, flags);
647 if (rv < SS_SUCCESS) {
649 print_st_err(mdev, os, ns, rv);
653 drbd_state_lock(mdev);
654 if (!drbd_send_state_req(mdev, mask, val)) {
655 drbd_state_unlock(mdev);
656 rv = SS_CW_FAILED_BY_PEER;
658 print_st_err(mdev, os, ns, rv);
662 if (mask.conn == C_MASK && val.conn == C_DISCONNECTING)
663 drbd_set_flag(mdev, DISCONNECT_SENT);
665 wait_event(mdev->state_wait,
666 (rv = _req_st_cond(mdev, mask, val)));
668 if (rv < SS_SUCCESS) {
669 drbd_state_unlock(mdev);
671 print_st_err(mdev, os, ns, rv);
674 spin_lock_irqsave(&mdev->req_lock, flags);
676 ns.i = (os.i & ~mask.i) | val.i;
677 rv = _drbd_set_state(mdev, ns, f, &done);
678 drbd_state_unlock(mdev);
680 rv = _drbd_set_state(mdev, ns, f, &done);
683 spin_unlock_irqrestore(&mdev->req_lock, flags);
685 if (f & CS_WAIT_COMPLETE && rv == SS_SUCCESS) {
686 D_ASSERT(current != mdev->worker.task);
687 wait_for_completion(&done);
691 if (f & CS_SERIALIZE)
692 mutex_unlock(&mdev->state_mutex);
698 * _drbd_request_state() - Request a state change (with flags)
699 * @mdev: DRBD device.
700 * @mask: mask of state bits to change.
701 * @val: value of new state bits.
704 * Cousin of drbd_request_state(), useful with the CS_WAIT_COMPLETE
705 * flag, or when logging of failed state change requests is not desired.
708 _drbd_request_state(struct drbd_conf *mdev, union drbd_state mask,
709 union drbd_state val, enum chg_state_flags f)
711 enum drbd_state_rv rv;
713 wait_event(mdev->state_wait,
714 (rv = drbd_req_state(mdev, mask, val, f)) != SS_IN_TRANSIENT_STATE);
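/* Typical call site (illustrative, not from the original file): the public
 * wrappers end up here, e.g.
 *	rv = drbd_request_state(mdev, NS(conn, C_DISCONNECTING));
 * which retries while the state engine answers SS_IN_TRANSIENT_STATE and
 * returns the final enum drbd_state_rv. */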
719 static void print_st(struct drbd_conf *mdev, char *name, union drbd_state ns)
721 dev_err(DEV, " %s = { cs:%s ro:%s/%s ds:%s/%s %c%c%c%c }\n",
723 drbd_conn_str(ns.conn),
724 drbd_role_str(ns.role),
725 drbd_role_str(ns.peer),
726 drbd_disk_str(ns.disk),
727 drbd_disk_str(ns.pdsk),
728 is_susp(ns) ? 's' : 'r',
729 ns.aftr_isp ? 'a' : '-',
730 ns.peer_isp ? 'p' : '-',
731 ns.user_isp ? 'u' : '-'
735 void print_st_err(struct drbd_conf *mdev, union drbd_state os,
736 union drbd_state ns, enum drbd_state_rv err)
738 if (err == SS_IN_TRANSIENT_STATE)
740 dev_err(DEV, "State change failed: %s\n", drbd_set_st_err_str(err));
741 print_st(mdev, " state", os);
742 print_st(mdev, "wanted", ns);
747 * is_valid_state() - Returns an SS_ error code if ns is not valid
748 * @mdev: DRBD device.
749 * @ns: State to consider.
751 static enum drbd_state_rv
752 is_valid_state(struct drbd_conf *mdev, union drbd_state ns)
754 /* See drbd_state_sw_errors in drbd_strings.c */
756 enum drbd_fencing_p fp;
757 enum drbd_state_rv rv = SS_SUCCESS;
760 if (get_ldev(mdev)) {
761 fp = mdev->ldev->dc.fencing;
765 if (get_net_conf(mdev)) {
766 if (!mdev->net_conf->two_primaries &&
767 ns.role == R_PRIMARY && ns.peer == R_PRIMARY)
768 rv = SS_TWO_PRIMARIES;
773 /* already found a reason to abort */;
774 else if (ns.role == R_SECONDARY && mdev->open_cnt)
775 rv = SS_DEVICE_IN_USE;
777 else if (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.disk < D_UP_TO_DATE)
778 rv = SS_NO_UP_TO_DATE_DISK;
780 else if (fp >= FP_RESOURCE &&
781 ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk >= D_UNKNOWN)
784 else if (ns.role == R_PRIMARY && ns.disk <= D_INCONSISTENT && ns.pdsk <= D_INCONSISTENT)
785 rv = SS_NO_UP_TO_DATE_DISK;
787 else if (ns.conn > C_CONNECTED && ns.disk < D_INCONSISTENT)
788 rv = SS_NO_LOCAL_DISK;
790 else if (ns.conn > C_CONNECTED && ns.pdsk < D_INCONSISTENT)
791 rv = SS_NO_REMOTE_DISK;
793 else if (ns.conn > C_CONNECTED && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE)
794 rv = SS_NO_UP_TO_DATE_DISK;
796 else if ((ns.conn == C_CONNECTED ||
797 ns.conn == C_WF_BITMAP_S ||
798 ns.conn == C_SYNC_SOURCE ||
799 ns.conn == C_PAUSED_SYNC_S) &&
800 ns.disk == D_OUTDATED)
801 rv = SS_CONNECTED_OUTDATES;
803 else if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
804 (mdev->sync_conf.verify_alg[0] == 0))
805 rv = SS_NO_VERIFY_ALG;
807 else if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
808 mdev->agreed_pro_version < 88)
809 rv = SS_NOT_SUPPORTED;
811 else if (ns.conn >= C_CONNECTED && ns.pdsk == D_UNKNOWN)
812 rv = SS_CONNECTED_OUTDATES;
818 * is_valid_state_transition() - Returns an SS_ error code if the state transition is not possible
819 * @mdev: DRBD device.
823 static enum drbd_state_rv
824 is_valid_state_transition(struct drbd_conf *mdev, union drbd_state ns,
827 enum drbd_state_rv rv = SS_SUCCESS;
829 if ((ns.conn == C_STARTING_SYNC_T || ns.conn == C_STARTING_SYNC_S) &&
830 os.conn > C_CONNECTED)
831 rv = SS_RESYNC_RUNNING;
833 if (ns.conn == C_DISCONNECTING && os.conn == C_STANDALONE)
834 rv = SS_ALREADY_STANDALONE;
836 if (ns.disk > D_ATTACHING && os.disk == D_DISKLESS)
839 if (ns.conn == C_WF_CONNECTION && os.conn < C_UNCONNECTED)
840 rv = SS_NO_NET_CONFIG;
842 if (ns.disk == D_OUTDATED && os.disk < D_OUTDATED && os.disk != D_ATTACHING)
843 rv = SS_LOWER_THAN_OUTDATED;
845 if (ns.conn == C_DISCONNECTING && os.conn == C_UNCONNECTED)
846 rv = SS_IN_TRANSIENT_STATE;
848 if (ns.conn == os.conn && ns.conn == C_WF_REPORT_PARAMS)
849 rv = SS_IN_TRANSIENT_STATE;
851 /* While establishing a connection, only allow cstate to change.
852 Delay/refuse role changes, detach/attach, etc... */
853 if (drbd_test_flag(mdev, STATE_SENT) &&
854 !(os.conn == C_WF_REPORT_PARAMS ||
855 (ns.conn == C_WF_REPORT_PARAMS && os.conn == C_WF_CONNECTION)))
856 rv = SS_IN_TRANSIENT_STATE;
858 if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && os.conn < C_CONNECTED)
859 rv = SS_NEED_CONNECTION;
861 if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
862 ns.conn != os.conn && os.conn > C_CONNECTED)
863 rv = SS_RESYNC_RUNNING;
865 if ((ns.conn == C_STARTING_SYNC_S || ns.conn == C_STARTING_SYNC_T) &&
866 os.conn < C_CONNECTED)
867 rv = SS_NEED_CONNECTION;
869 if ((ns.conn == C_SYNC_TARGET || ns.conn == C_SYNC_SOURCE)
870 && os.conn < C_WF_REPORT_PARAMS)
871 rv = SS_NEED_CONNECTION; /* No NetworkFailure -> SyncTarget etc... */
876 static void print_sanitize_warnings(struct drbd_conf *mdev, enum sanitize_state_warnings warn)
878 static const char *msg_table[] = {
880 [ABORTED_ONLINE_VERIFY] = "Online-verify aborted.",
881 [ABORTED_RESYNC] = "Resync aborted.",
882 [CONNECTION_LOST_NEGOTIATING] = "Connection lost while negotiating, no data!",
883 [IMPLICITLY_UPGRADED_DISK] = "Implicitly upgraded disk",
884 [IMPLICITLY_UPGRADED_PDSK] = "Implicitly upgraded pdsk",
887 if (warn != NO_WARNING)
888 dev_warn(DEV, "%s\n", msg_table[warn]);
892 * sanitize_state() - Resolves implicitly necessary additional changes to a state transition
893 * @mdev: DRBD device.
898 * When we lose the connection, we have to set the state of the peer's disk (pdsk)
899 * to D_UNKNOWN. This rule and many more along those lines are in this function.
901 static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os,
902 union drbd_state ns, enum sanitize_state_warnings *warn)
904 enum drbd_fencing_p fp;
905 enum drbd_disk_state disk_min, disk_max, pdsk_min, pdsk_max;
911 if (get_ldev(mdev)) {
912 fp = mdev->ldev->dc.fencing;
916 /* Do not let network error states override StandAlone/Disconnecting; the device's network part is not configured then */
917 if ((ns.conn >= C_TIMEOUT && ns.conn <= C_TEAR_DOWN) &&
918 os.conn <= C_DISCONNECTING)
921 /* After a network error (+C_TEAR_DOWN) only C_UNCONNECTED or C_DISCONNECTING can follow.
922 * If you try to go into some Sync* state, that shall fail (elsewhere). */
923 if (os.conn >= C_TIMEOUT && os.conn <= C_TEAR_DOWN &&
924 ns.conn != C_UNCONNECTED && ns.conn != C_DISCONNECTING && ns.conn <= C_CONNECTED)
927 /* we cannot fail (again) if we already detached */
928 if (ns.disk == D_FAILED && os.disk == D_DISKLESS)
929 ns.disk = D_DISKLESS;
931 /* After C_DISCONNECTING only C_STANDALONE may follow */
932 if (os.conn == C_DISCONNECTING && ns.conn != C_STANDALONE)
935 if (ns.conn < C_CONNECTED) {
938 if (ns.pdsk > D_UNKNOWN || ns.pdsk < D_INCONSISTENT)
942 /* Clear the aftr_isp when becoming unconfigured */
943 if (ns.conn == C_STANDALONE && ns.disk == D_DISKLESS && ns.role == R_SECONDARY)
946 /* Abort resync if a disk fails/detaches */
947 if (os.conn > C_CONNECTED && ns.conn > C_CONNECTED &&
948 (ns.disk <= D_FAILED || ns.pdsk <= D_FAILED)) {
950 *warn = os.conn == C_VERIFY_S || os.conn == C_VERIFY_T ?
951 ABORTED_ONLINE_VERIFY : ABORTED_RESYNC;
952 ns.conn = C_CONNECTED;
955 /* Connection breaks down before we finished "Negotiating" */
956 if (ns.conn < C_CONNECTED && ns.disk == D_NEGOTIATING &&
957 get_ldev_if_state(mdev, D_NEGOTIATING)) {
958 if (mdev->ed_uuid == mdev->ldev->md.uuid[UI_CURRENT]) {
959 ns.disk = mdev->new_state_tmp.disk;
960 ns.pdsk = mdev->new_state_tmp.pdsk;
963 *warn = CONNECTION_LOST_NEGOTIATING;
964 ns.disk = D_DISKLESS;
970 /* D_CONSISTENT and D_OUTDATED vanish when we get connected */
971 if (ns.conn >= C_CONNECTED && ns.conn < C_AHEAD) {
972 if (ns.disk == D_CONSISTENT || ns.disk == D_OUTDATED)
973 ns.disk = D_UP_TO_DATE;
974 if (ns.pdsk == D_CONSISTENT || ns.pdsk == D_OUTDATED)
975 ns.pdsk = D_UP_TO_DATE;
978 /* Implications of the connection state on the disk states */
979 disk_min = D_DISKLESS;
980 disk_max = D_UP_TO_DATE;
981 pdsk_min = D_INCONSISTENT;
982 pdsk_max = D_UNKNOWN;
983 switch ((enum drbd_conns)ns.conn) {
985 case C_PAUSED_SYNC_T:
986 case C_STARTING_SYNC_T:
989 disk_min = D_INCONSISTENT;
990 disk_max = D_OUTDATED;
991 pdsk_min = D_UP_TO_DATE;
992 pdsk_max = D_UP_TO_DATE;
996 disk_min = D_UP_TO_DATE;
997 disk_max = D_UP_TO_DATE;
998 pdsk_min = D_UP_TO_DATE;
999 pdsk_max = D_UP_TO_DATE;
1002 disk_min = D_DISKLESS;
1003 disk_max = D_UP_TO_DATE;
1004 pdsk_min = D_DISKLESS;
1005 pdsk_max = D_UP_TO_DATE;
1008 case C_PAUSED_SYNC_S:
1009 case C_STARTING_SYNC_S:
1011 disk_min = D_UP_TO_DATE;
1012 disk_max = D_UP_TO_DATE;
1013 pdsk_min = D_INCONSISTENT;
1014 pdsk_max = D_CONSISTENT; /* D_OUTDATED would be nice, but an explicit outdate is necessary */
1017 disk_min = D_INCONSISTENT;
1018 disk_max = D_INCONSISTENT;
1019 pdsk_min = D_UP_TO_DATE;
1020 pdsk_max = D_UP_TO_DATE;
1023 disk_min = D_UP_TO_DATE;
1024 disk_max = D_UP_TO_DATE;
1025 pdsk_min = D_INCONSISTENT;
1026 pdsk_max = D_INCONSISTENT;
1029 case C_DISCONNECTING:
1033 case C_NETWORK_FAILURE:
1034 case C_PROTOCOL_ERROR:
1036 case C_WF_CONNECTION:
1037 case C_WF_REPORT_PARAMS:
1041 if (ns.disk > disk_max)
1044 if (ns.disk < disk_min) {
1046 *warn = IMPLICITLY_UPGRADED_DISK;
1049 if (ns.pdsk > pdsk_max)
1052 if (ns.pdsk < pdsk_min) {
1054 *warn = IMPLICITLY_UPGRADED_PDSK;
1058 if (fp == FP_STONITH &&
1059 (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk > D_OUTDATED) &&
1060 !(os.role == R_PRIMARY && os.conn < C_CONNECTED && os.pdsk > D_OUTDATED))
1061 ns.susp_fen = 1; /* Suspend IO while fence-peer handler runs (peer lost) */
1063 if (mdev->sync_conf.on_no_data == OND_SUSPEND_IO &&
1064 (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE) &&
1065 !(os.role == R_PRIMARY && os.disk < D_UP_TO_DATE && os.pdsk < D_UP_TO_DATE))
1066 ns.susp_nod = 1; /* Suspend IO while no up-to-date data is accessible */
1068 if (ns.aftr_isp || ns.peer_isp || ns.user_isp) {
1069 if (ns.conn == C_SYNC_SOURCE)
1070 ns.conn = C_PAUSED_SYNC_S;
1071 if (ns.conn == C_SYNC_TARGET)
1072 ns.conn = C_PAUSED_SYNC_T;
1074 if (ns.conn == C_PAUSED_SYNC_S)
1075 ns.conn = C_SYNC_SOURCE;
1076 if (ns.conn == C_PAUSED_SYNC_T)
1077 ns.conn = C_SYNC_TARGET;
1083 /* helper for __drbd_set_state */
1084 static void set_ov_position(struct drbd_conf *mdev, enum drbd_conns cs)
1086 if (mdev->agreed_pro_version < 90)
1087 mdev->ov_start_sector = 0;
1088 mdev->rs_total = drbd_bm_bits(mdev);
1089 mdev->ov_position = 0;
1090 if (cs == C_VERIFY_T) {
1091 /* starting online verify from an arbitrary position
1092 * does not fit well into the existing protocol.
1093 * on C_VERIFY_T, we initialize ov_left and friends
1094 * implicitly in receive_DataRequest once the
1095 * first P_OV_REQUEST is received */
1096 mdev->ov_start_sector = ~(sector_t)0;
1098 unsigned long bit = BM_SECT_TO_BIT(mdev->ov_start_sector);
1099 if (bit >= mdev->rs_total) {
1100 mdev->ov_start_sector =
1101 BM_BIT_TO_SECT(mdev->rs_total - 1);
1104 mdev->rs_total -= bit;
1105 mdev->ov_position = mdev->ov_start_sector;
1107 mdev->ov_left = mdev->rs_total;
1110 static void drbd_resume_al(struct drbd_conf *mdev)
1112 if (drbd_test_and_clear_flag(mdev, AL_SUSPENDED))
1113 dev_info(DEV, "Resumed AL updates\n");
1117 * __drbd_set_state() - Set a new DRBD state
1118 * @mdev: DRBD device.
1121 * @done: Optional completion that will be completed after after_state_ch() has finished
1123 * Caller needs to hold req_lock, and global_state_lock. Do not call directly.
1126 __drbd_set_state(struct drbd_conf *mdev, union drbd_state ns,
1127 enum chg_state_flags flags, struct completion *done)
1129 union drbd_state os;
1130 enum drbd_state_rv rv = SS_SUCCESS;
1131 enum sanitize_state_warnings ssw;
1132 struct after_state_chg_work *ascw;
1136 ns = sanitize_state(mdev, os, ns, &ssw);
1139 return SS_NOTHING_TO_DO;
1141 if (!(flags & CS_HARD)) {
1142 /* pre-state-change checks ; only look at ns */
1143 /* See drbd_state_sw_errors in drbd_strings.c */
1145 rv = is_valid_state(mdev, ns);
1146 if (rv < SS_SUCCESS) {
1147 /* If the old state was illegal as well, then let
1150 if (is_valid_state(mdev, os) == rv)
1151 rv = is_valid_state_transition(mdev, ns, os);
1153 rv = is_valid_state_transition(mdev, ns, os);
1156 if (rv < SS_SUCCESS) {
1157 if (flags & CS_VERBOSE)
1158 print_st_err(mdev, os, ns, rv);
1162 print_sanitize_warnings(mdev, ssw);
1168 if (ns.role != os.role)
1169 pbp += sprintf(pbp, "role( %s -> %s ) ",
1170 drbd_role_str(os.role),
1171 drbd_role_str(ns.role));
1172 if (ns.peer != os.peer)
1173 pbp += sprintf(pbp, "peer( %s -> %s ) ",
1174 drbd_role_str(os.peer),
1175 drbd_role_str(ns.peer));
1176 if (ns.conn != os.conn)
1177 pbp += sprintf(pbp, "conn( %s -> %s ) ",
1178 drbd_conn_str(os.conn),
1179 drbd_conn_str(ns.conn));
1180 if (ns.disk != os.disk)
1181 pbp += sprintf(pbp, "disk( %s -> %s ) ",
1182 drbd_disk_str(os.disk),
1183 drbd_disk_str(ns.disk));
1184 if (ns.pdsk != os.pdsk)
1185 pbp += sprintf(pbp, "pdsk( %s -> %s ) ",
1186 drbd_disk_str(os.pdsk),
1187 drbd_disk_str(ns.pdsk));
1188 if (is_susp(ns) != is_susp(os))
1189 pbp += sprintf(pbp, "susp( %d -> %d ) ",
1192 if (ns.aftr_isp != os.aftr_isp)
1193 pbp += sprintf(pbp, "aftr_isp( %d -> %d ) ",
1196 if (ns.peer_isp != os.peer_isp)
1197 pbp += sprintf(pbp, "peer_isp( %d -> %d ) ",
1200 if (ns.user_isp != os.user_isp)
1201 pbp += sprintf(pbp, "user_isp( %d -> %d ) ",
1204 dev_info(DEV, "%s\n", pb);
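	/* Illustrative example of a resulting log line (assumption, not taken
	 * from the source):
	 *	block drbd0: role( Secondary -> Primary ) disk( Outdated -> UpToDate )
	 */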
1207 /* solve the race between becoming unconfigured,
1208 * worker doing the cleanup, and
1209 * admin reconfiguring us:
1210 * on (re)configure, first set CONFIG_PENDING,
1211 * then wait for a potentially exiting worker,
1212 * start the worker, and schedule one no_op.
1213 * then proceed with configuration.
1215 if (ns.disk == D_DISKLESS &&
1216 ns.conn == C_STANDALONE &&
1217 ns.role == R_SECONDARY &&
1218 !drbd_test_and_set_flag(mdev, CONFIG_PENDING))
1219 drbd_set_flag(mdev, DEVICE_DYING);
1221 /* if we are going -> D_FAILED or D_DISKLESS, grab one extra reference
1222 * on the ldev here, to be sure the transition -> D_DISKLESS resp.
1223 * drbd_ldev_destroy() won't happen before our corresponding
1224 * after_state_ch works run, where we put_ldev again. */
1225 if ((os.disk != D_FAILED && ns.disk == D_FAILED) ||
1226 (os.disk != D_DISKLESS && ns.disk == D_DISKLESS))
1227 atomic_inc(&mdev->local_cnt);
1231 if (os.disk == D_ATTACHING && ns.disk >= D_NEGOTIATING)
1232 drbd_print_uuids(mdev, "attached to UUIDs");
1234 wake_up(&mdev->misc_wait);
1235 wake_up(&mdev->state_wait);
1237 /* Aborted verify run, or we reached the stop sector.
1238 * Log the last position, unless end-of-device. */
1239 if ((os.conn == C_VERIFY_S || os.conn == C_VERIFY_T) &&
1240 ns.conn <= C_CONNECTED) {
1241 mdev->ov_start_sector =
1242 BM_BIT_TO_SECT(drbd_bm_bits(mdev) - mdev->ov_left);
1244 dev_info(DEV, "Online Verify reached sector %llu\n",
1245 (unsigned long long)mdev->ov_start_sector);
1248 if ((os.conn == C_PAUSED_SYNC_T || os.conn == C_PAUSED_SYNC_S) &&
1249 (ns.conn == C_SYNC_TARGET || ns.conn == C_SYNC_SOURCE)) {
1250 dev_info(DEV, "Syncer continues.\n");
1251 mdev->rs_paused += (long)jiffies
1252 -(long)mdev->rs_mark_time[mdev->rs_last_mark];
1253 if (ns.conn == C_SYNC_TARGET)
1254 mod_timer(&mdev->resync_timer, jiffies);
1257 if ((os.conn == C_SYNC_TARGET || os.conn == C_SYNC_SOURCE) &&
1258 (ns.conn == C_PAUSED_SYNC_T || ns.conn == C_PAUSED_SYNC_S)) {
1259 dev_info(DEV, "Resync suspended\n");
1260 mdev->rs_mark_time[mdev->rs_last_mark] = jiffies;
1263 if (os.conn == C_CONNECTED &&
1264 (ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T)) {
1265 unsigned long now = jiffies;
1268 set_ov_position(mdev, ns.conn);
1269 mdev->rs_start = now;
1270 mdev->rs_last_events = 0;
1271 mdev->rs_last_sect_ev = 0;
1272 mdev->ov_last_oos_size = 0;
1273 mdev->ov_last_oos_start = 0;
1275 for (i = 0; i < DRBD_SYNC_MARKS; i++) {
1276 mdev->rs_mark_left[i] = mdev->ov_left;
1277 mdev->rs_mark_time[i] = now;
1280 drbd_rs_controller_reset(mdev);
1282 if (ns.conn == C_VERIFY_S) {
1283 dev_info(DEV, "Starting Online Verify from sector %llu\n",
1284 (unsigned long long)mdev->ov_position);
1285 mod_timer(&mdev->resync_timer, jiffies);
1289 if (get_ldev(mdev)) {
1290 u32 mdf = mdev->ldev->md.flags & ~(MDF_CONSISTENT|MDF_PRIMARY_IND|
1291 MDF_CONNECTED_IND|MDF_WAS_UP_TO_DATE|
1292 MDF_PEER_OUT_DATED|MDF_CRASHED_PRIMARY);
1294 if (drbd_test_flag(mdev, CRASHED_PRIMARY))
1295 mdf |= MDF_CRASHED_PRIMARY;
1296 if (mdev->state.role == R_PRIMARY ||
1297 (mdev->state.pdsk < D_INCONSISTENT && mdev->state.peer == R_PRIMARY))
1298 mdf |= MDF_PRIMARY_IND;
1299 if (mdev->state.conn > C_WF_REPORT_PARAMS)
1300 mdf |= MDF_CONNECTED_IND;
1301 if (mdev->state.disk > D_INCONSISTENT)
1302 mdf |= MDF_CONSISTENT;
1303 if (mdev->state.disk > D_OUTDATED)
1304 mdf |= MDF_WAS_UP_TO_DATE;
1305 if (mdev->state.pdsk <= D_OUTDATED && mdev->state.pdsk >= D_INCONSISTENT)
1306 mdf |= MDF_PEER_OUT_DATED;
1307 if (mdf != mdev->ldev->md.flags) {
1308 mdev->ldev->md.flags = mdf;
1309 drbd_md_mark_dirty(mdev);
1311 if (os.disk < D_CONSISTENT && ns.disk >= D_CONSISTENT)
1312 drbd_set_ed_uuid(mdev, mdev->ldev->md.uuid[UI_CURRENT]);
1316 /* Peer was forced D_UP_TO_DATE & R_PRIMARY, consider resyncing */
1317 if (os.disk == D_INCONSISTENT && os.pdsk == D_INCONSISTENT &&
1318 os.peer == R_SECONDARY && ns.peer == R_PRIMARY)
1319 drbd_set_flag(mdev, CONSIDER_RESYNC);
1321 /* Receiver should clean up itself */
1322 if (os.conn != C_DISCONNECTING && ns.conn == C_DISCONNECTING)
1323 drbd_thread_stop_nowait(&mdev->receiver);
1325 /* Now the receiver finished cleaning up itself, it should die */
1326 if (os.conn != C_STANDALONE && ns.conn == C_STANDALONE)
1327 drbd_thread_stop_nowait(&mdev->receiver);
1329 /* Upon network failure, we need to restart the receiver. */
1330 if (os.conn > C_WF_CONNECTION &&
1331 ns.conn <= C_TEAR_DOWN && ns.conn >= C_TIMEOUT)
1332 drbd_thread_restart_nowait(&mdev->receiver);
1334 /* Resume AL writing if we get a connection */
1335 if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED)
1336 drbd_resume_al(mdev);
1338 /* remember last connect and attach times so request_timer_fn() won't
1339 * kill newly established sessions while we are still trying to thaw
1340 * previously frozen IO */
1341 if (os.conn != C_WF_REPORT_PARAMS && ns.conn == C_WF_REPORT_PARAMS)
1342 mdev->last_reconnect_jif = jiffies;
1343 if ((os.disk == D_ATTACHING || os.disk == D_NEGOTIATING) &&
1344 ns.disk > D_NEGOTIATING)
1345 mdev->last_reattach_jif = jiffies;
1347 ascw = kmalloc(sizeof(*ascw), GFP_ATOMIC);
1351 ascw->flags = flags;
1352 ascw->w.cb = w_after_state_ch;
1354 drbd_queue_work(&mdev->data.work, &ascw->w);
1356 dev_warn(DEV, "Could not kmalloc an ascw\n");
1362 static int w_after_state_ch(struct drbd_conf *mdev, struct drbd_work *w, int unused)
1364 struct after_state_chg_work *ascw =
1365 container_of(w, struct after_state_chg_work, w);
1366 after_state_ch(mdev, ascw->os, ascw->ns, ascw->flags);
1367 if (ascw->flags & CS_WAIT_COMPLETE) {
1368 D_ASSERT(ascw->done != NULL);
1369 complete(ascw->done);
1376 static void abw_start_sync(struct drbd_conf *mdev, int rv)
1379 dev_err(DEV, "Writing the bitmap failed not starting resync.\n");
1380 _drbd_request_state(mdev, NS(conn, C_CONNECTED), CS_VERBOSE);
1384 switch (mdev->state.conn) {
1385 case C_STARTING_SYNC_T:
1386 _drbd_request_state(mdev, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE);
1388 case C_STARTING_SYNC_S:
1389 drbd_start_resync(mdev, C_SYNC_SOURCE);
1394 int drbd_bitmap_io_from_worker(struct drbd_conf *mdev,
1395 int (*io_fn)(struct drbd_conf *),
1396 char *why, enum bm_flag flags)
1400 D_ASSERT(current == mdev->worker.task);
1402 /* open coded non-blocking drbd_suspend_io(mdev); */
1403 drbd_set_flag(mdev, SUSPEND_IO);
1405 drbd_bm_lock(mdev, why, flags);
1407 drbd_bm_unlock(mdev);
1409 drbd_resume_io(mdev);
1415 * after_state_ch() - Perform after state change actions that may sleep
1416 * @mdev: DRBD device.
1421 static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
1422 union drbd_state ns, enum chg_state_flags flags)
1424 enum drbd_fencing_p fp;
1425 enum drbd_req_event what = nothing;
1426 union drbd_state nsm = (union drbd_state){ .i = -1 };
1428 if (os.conn != C_CONNECTED && ns.conn == C_CONNECTED) {
1429 drbd_clear_flag(mdev, CRASHED_PRIMARY);
1431 mdev->p_uuid[UI_FLAGS] &= ~((u64)2);
1435 if (get_ldev(mdev)) {
1436 fp = mdev->ldev->dc.fencing;
1440 /* Inform userspace about the change... */
1441 drbd_bcast_state(mdev, ns);
1443 if (!(os.role == R_PRIMARY && os.disk < D_UP_TO_DATE && os.pdsk < D_UP_TO_DATE) &&
1444 (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE))
1445 drbd_khelper(mdev, "pri-on-incon-degr");
1447 /* Here we have the actions that are performed after a
1448 state change. This function might sleep */
1450 if (os.disk <= D_NEGOTIATING && ns.disk > D_NEGOTIATING)
1451 mod_timer(&mdev->request_timer, jiffies + HZ);
1455 if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED)
1458 if ((os.disk == D_ATTACHING || os.disk == D_NEGOTIATING) &&
1459 ns.disk > D_NEGOTIATING)
1460 what = restart_frozen_disk_io;
1462 if (what != nothing)
1467 /* case1: The outdate peer handler is successful: */
1468 if (os.pdsk > D_OUTDATED && ns.pdsk <= D_OUTDATED) {
1469 if (drbd_test_flag(mdev, NEW_CUR_UUID)) {
1470 drbd_uuid_new_current(mdev);
1471 drbd_clear_flag(mdev, NEW_CUR_UUID);
1473 spin_lock_irq(&mdev->req_lock);
1475 _drbd_set_state(_NS(mdev, susp_fen, 0), CS_VERBOSE, NULL);
1476 spin_unlock_irq(&mdev->req_lock);
1478 /* case2: The connection was established again: */
1479 if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED) {
1480 drbd_clear_flag(mdev, NEW_CUR_UUID);
1486 if (what != nothing) {
1487 spin_lock_irq(&mdev->req_lock);
1488 _tl_restart(mdev, what);
1489 nsm.i &= mdev->state.i;
1490 _drbd_set_state(mdev, nsm, CS_VERBOSE, NULL);
1491 spin_unlock_irq(&mdev->req_lock);
1494 /* Became sync source. With protocol >= 96, we still need to send out
1495 * the sync uuid now. Need to do that before any drbd_send_state, or
1496 * the other side may go "paused sync" before receiving the sync uuids,
1497 * which is unexpected. */
1498 if ((os.conn != C_SYNC_SOURCE && os.conn != C_PAUSED_SYNC_S) &&
1499 (ns.conn == C_SYNC_SOURCE || ns.conn == C_PAUSED_SYNC_S) &&
1500 mdev->agreed_pro_version >= 96 && get_ldev(mdev)) {
1501 drbd_gen_and_send_sync_uuid(mdev);
1505 /* Do not change the order of the if above and the two below... */
1506 if (os.pdsk == D_DISKLESS && ns.pdsk > D_DISKLESS) { /* attach on the peer */
1507 /* we probably will start a resync soon.
1508 * make sure those things are properly reset. */
1510 mdev->rs_failed = 0;
1511 atomic_set(&mdev->rs_pending_cnt, 0);
1512 drbd_rs_cancel_all(mdev);
1514 drbd_send_uuids(mdev);
1515 drbd_send_state(mdev, ns);
1517 /* No point in queuing send_bitmap if we don't have a connection
1518 * anymore, so check also the _current_ state, not only the new state
1519 * at the time this work was queued. */
1520 if (os.conn != C_WF_BITMAP_S && ns.conn == C_WF_BITMAP_S &&
1521 mdev->state.conn == C_WF_BITMAP_S)
1522 drbd_queue_bitmap_io(mdev, &drbd_send_bitmap, NULL,
1523 "send_bitmap (WFBitMapS)",
1524 BM_LOCKED_TEST_ALLOWED);
1526 /* Lost contact to peer's copy of the data */
1527 if ((os.pdsk >= D_INCONSISTENT &&
1528 os.pdsk != D_UNKNOWN &&
1529 os.pdsk != D_OUTDATED)
1530 && (ns.pdsk < D_INCONSISTENT ||
1531 ns.pdsk == D_UNKNOWN ||
1532 ns.pdsk == D_OUTDATED)) {
1533 if (get_ldev(mdev)) {
1534 if ((ns.role == R_PRIMARY || ns.peer == R_PRIMARY) &&
1535 mdev->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) {
1536 if (is_susp(mdev->state)) {
1537 drbd_set_flag(mdev, NEW_CUR_UUID);
1539 drbd_uuid_new_current(mdev);
1540 drbd_send_uuids(mdev);
1547 if (ns.pdsk < D_INCONSISTENT && get_ldev(mdev)) {
1548 if (os.peer == R_SECONDARY && ns.peer == R_PRIMARY &&
1549 mdev->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) {
1550 drbd_uuid_new_current(mdev);
1551 drbd_send_uuids(mdev);
1553 /* D_DISKLESS Peer becomes secondary */
1554 if (os.peer == R_PRIMARY && ns.peer == R_SECONDARY)
1555 /* We may still be Primary ourselves.
1556 * No harm done if the bitmap still changes,
1557 * redirtied pages will follow later. */
1558 drbd_bitmap_io_from_worker(mdev, &drbd_bm_write,
1559 "demote diskless peer", BM_LOCKED_SET_ALLOWED);
1563 /* Write out all changed bits on demote.
1564 * Though, no need to do that just yet
1565 * if there is a resync going on still */
1566 if (os.role == R_PRIMARY && ns.role == R_SECONDARY &&
1567 mdev->state.conn <= C_CONNECTED && get_ldev(mdev)) {
1568 /* No changes to the bitmap expected this time, so assert that,
1569 * even though no harm was done if it did change. */
1570 drbd_bitmap_io_from_worker(mdev, &drbd_bm_write,
1571 "demote", BM_LOCKED_TEST_ALLOWED);
1575 /* Last part of the attaching process ... */
1576 if (ns.conn >= C_CONNECTED &&
1577 os.disk == D_ATTACHING && ns.disk == D_NEGOTIATING) {
1578 drbd_send_sizes(mdev, 0, 0); /* to start sync... */
1579 drbd_send_uuids(mdev);
1580 drbd_send_state(mdev, ns);
1583 /* We want to pause/continue resync, tell peer. */
1584 if (ns.conn >= C_CONNECTED &&
1585 ((os.aftr_isp != ns.aftr_isp) ||
1586 (os.user_isp != ns.user_isp)))
1587 drbd_send_state(mdev, ns);
1589 /* In case one of the isp bits got set, suspend other devices. */
1590 if ((!os.aftr_isp && !os.peer_isp && !os.user_isp) &&
1591 (ns.aftr_isp || ns.peer_isp || ns.user_isp))
1592 suspend_other_sg(mdev);
1594 /* Make sure the peer gets informed about possible state
1595 changes (ISP bits) while we were in WFReportParams. */
1596 if (os.conn == C_WF_REPORT_PARAMS && ns.conn >= C_CONNECTED)
1597 drbd_send_state(mdev, ns);
1599 if (os.conn != C_AHEAD && ns.conn == C_AHEAD)
1600 drbd_send_state(mdev, ns);
1602 /* We are in the process of starting a full sync... */
1603 if ((os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) ||
1604 (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S))
1605 /* no other bitmap changes expected during this phase */
1606 drbd_queue_bitmap_io(mdev,
1607 &drbd_bmio_set_n_write, &abw_start_sync,
1608 "set_n_write from StartingSync", BM_LOCKED_TEST_ALLOWED);
1610 /* We are invalidating ourselves... */
1611 if (os.conn < C_CONNECTED && ns.conn < C_CONNECTED &&
1612 os.disk > D_INCONSISTENT && ns.disk == D_INCONSISTENT)
1613 /* other bitmap operation expected during this phase */
1614 drbd_queue_bitmap_io(mdev, &drbd_bmio_set_n_write, NULL,
1615 "set_n_write from invalidate", BM_LOCKED_MASK);
1617 /* first half of local IO error, failure to attach,
1618 * or administrative detach */
1619 if (os.disk != D_FAILED && ns.disk == D_FAILED) {
1620 /* corresponding get_ldev was in __drbd_set_state, to serialize
1621 * our cleanup here with the transition to D_DISKLESS.
1622 * But it is still not safe to dereference ldev here; we may end
1623 * up here from a failed attach, before ldev was even set. */
1625 enum drbd_io_error_p eh = mdev->ldev->dc.on_io_error;
1627 /* In some setups, this handler triggers a suicide,
1628 * basically mapping IO error to node failure, to
1629 * reduce the number of different failure scenarios.
1631 * This handler intentionally runs before we abort IO,
1632 * notify the peer, or try to update our meta data. */
1633 if (eh == EP_CALL_HELPER && drbd_test_flag(mdev, WAS_IO_ERROR))
1634 drbd_khelper(mdev, "local-io-error");
1636 /* Immediately allow completion of all application IO,
1637 * that waits for completion from the local disk,
1638 * if this was a force-detach due to disk_timeout
1639 * or administrator request (drbdsetup detach --force).
1640 * Do NOT abort otherwise.
1641 * Aborting local requests may cause serious problems,
1642 * if requests are completed to upper layers already,
1643 * and then later the already submitted local bio completes.
1644 * This can cause DMA into former bio pages that meanwhile
1645 * have been re-used for other things.
1646 * So aborting local requests may cause crashes,
1647 * or even worse, silent data corruption.
1649 if (drbd_test_flag(mdev, FORCE_DETACH))
1650 tl_abort_disk_io(mdev);
1652 /* current state still has to be D_FAILED,
1653 * there is only one way out: to D_DISKLESS,
1654 * and that may only happen after our put_ldev below. */
1655 if (mdev->state.disk != D_FAILED)
1657 "ASSERT FAILED: disk is %s during detach\n",
1658 drbd_disk_str(mdev->state.disk));
1660 if (ns.conn >= C_CONNECTED)
1661 drbd_send_state(mdev, ns);
1663 drbd_rs_cancel_all(mdev);
1665 /* In case we want to get something to stable storage still,
1666 * this may be the last chance.
1667 * Following put_ldev may transition to D_DISKLESS. */
1673 /* second half of local IO error, failure to attach,
1674 * or administrative detach,
1675 * after local_cnt references have reached zero again */
1676 if (os.disk != D_DISKLESS && ns.disk == D_DISKLESS) {
1677 /* We must still be diskless,
1678 * re-attach has to be serialized with this! */
1679 if (mdev->state.disk != D_DISKLESS)
1681 "ASSERT FAILED: disk is %s while going diskless\n",
1682 drbd_disk_str(mdev->state.disk));
1684 if (ns.conn >= C_CONNECTED)
1685 drbd_send_state(mdev, ns);
1687 /* corresponding get_ldev in __drbd_set_state
1688 * this may finally trigger drbd_ldev_destroy. */
1692 /* Notify peer that I had a local IO error and did not detach. */
1693 if (os.disk == D_UP_TO_DATE && ns.disk == D_INCONSISTENT && ns.conn >= C_CONNECTED)
1694 drbd_send_state(mdev, ns);
1696 /* Disks got bigger while they were detached */
1697 if (ns.disk > D_NEGOTIATING && ns.pdsk > D_NEGOTIATING &&
1698 drbd_test_and_clear_flag(mdev, RESYNC_AFTER_NEG)) {
1699 if (ns.conn == C_CONNECTED)
1700 resync_after_online_grow(mdev);
1703 /* A resync finished or aborted, wake paused devices... */
1704 if ((os.conn > C_CONNECTED && ns.conn <= C_CONNECTED) ||
1705 (os.peer_isp && !ns.peer_isp) ||
1706 (os.user_isp && !ns.user_isp))
1707 resume_next_sg(mdev);
1709 /* sync target done with resync. Explicitly notify peer, even though
1710 * it should (at least for non-empty resyncs) already know itself. */
1711 if (os.disk < D_UP_TO_DATE && os.conn >= C_SYNC_SOURCE && ns.conn == C_CONNECTED)
1712 drbd_send_state(mdev, ns);
1714 /* Verify finished, or reached stop sector. Peer did not know about
1715 * the stop sector, and we may even have changed the stop sector during
1716 * verify to interrupt/stop early. Send the new state. */
1717 if (os.conn == C_VERIFY_S && ns.conn == C_CONNECTED
1718 && mdev->agreed_pro_version >= 97)
1719 drbd_send_state(mdev, ns);
1721 /* Wake up role changes, that were delayed because of connection establishing */
1722 if (os.conn == C_WF_REPORT_PARAMS && ns.conn != C_WF_REPORT_PARAMS) {
1723 drbd_clear_flag(mdev, STATE_SENT);
1724 wake_up(&mdev->state_wait);
1727 /* This triggers bitmap writeout of potentially still unwritten pages
1728 * if the resync finished cleanly, or aborted because of peer disk
1729 * failure, or because of connection loss.
1730 * For resync aborted because of local disk failure, we cannot do
1731 * any bitmap writeout anymore.
1732 * No harm done if some bits change during this phase.
1734 if (os.conn > C_CONNECTED && ns.conn <= C_CONNECTED && get_ldev(mdev)) {
1735 drbd_queue_bitmap_io(mdev, &drbd_bm_write_copy_pages, NULL,
1736 "write from resync_finished", BM_LOCKED_CHANGE_ALLOWED);
1740 /* free tl_hash if we got thawed and are C_STANDALONE */
1741 if (ns.conn == C_STANDALONE && !is_susp(ns) && mdev->tl_hash)
1742 drbd_free_tl_hash(mdev);
1744 /* Upon network connection, we need to start the receiver */
1745 if (os.conn == C_STANDALONE && ns.conn == C_UNCONNECTED)
1746 drbd_thread_start(&mdev->receiver);
1748 /* Terminate worker thread if we are unconfigured - it will be
1749 restarted as needed... */
1750 if (ns.disk == D_DISKLESS &&
1751 ns.conn == C_STANDALONE &&
1752 ns.role == R_SECONDARY) {
1753 if (os.aftr_isp != ns.aftr_isp)
1754 resume_next_sg(mdev);
1755 /* set in __drbd_set_state, unless CONFIG_PENDING was set */
1756 if (drbd_test_flag(mdev, DEVICE_DYING))
1757 drbd_thread_stop_nowait(&mdev->worker);
1764 static int drbd_thread_setup(void *arg)
1766 struct drbd_thread *thi = (struct drbd_thread *) arg;
1767 struct drbd_conf *mdev = thi->mdev;
1768 unsigned long flags;
1772 retval = thi->function(thi);
1774 spin_lock_irqsave(&thi->t_lock, flags);
1776 /* if the receiver has been "Exiting", the last thing it did
1777 * was set the conn state to "StandAlone",
1778 * if now a re-connect request comes in, conn state goes C_UNCONNECTED,
1779 * and receiver thread will be "started".
1780 * drbd_thread_start needs to set "Restarting" in that case.
1781 * t_state check and assignment needs to be within the same spinlock,
1782 * so either thread_start sees Exiting, and can remap to Restarting,
1783 * or thread_start sees None, and can proceed as normal.
1786 if (thi->t_state == Restarting) {
1787 dev_info(DEV, "Restarting %s\n", current->comm);
1788 thi->t_state = Running;
1789 spin_unlock_irqrestore(&thi->t_lock, flags);
1794 thi->t_state = None;
1796 complete(&thi->stop);
1797 spin_unlock_irqrestore(&thi->t_lock, flags);
1799 dev_info(DEV, "Terminating %s\n", current->comm);
1801 /* Release mod reference taken when thread was started */
1802 module_put(THIS_MODULE);
1806 static void drbd_thread_init(struct drbd_conf *mdev, struct drbd_thread *thi,
1807 int (*func) (struct drbd_thread *))
1809 spin_lock_init(&thi->t_lock);
1811 thi->t_state = None;
1812 thi->function = func;
1816 int drbd_thread_start(struct drbd_thread *thi)
1818 struct drbd_conf *mdev = thi->mdev;
1819 struct task_struct *nt;
1820 unsigned long flags;
1823 thi == &mdev->receiver ? "receiver" :
1824 thi == &mdev->asender ? "asender" :
1825 thi == &mdev->worker ? "worker" : "NONSENSE";
1827 /* is used from state engine doing drbd_thread_stop_nowait,
1828 * while holding the req lock irqsave */
1829 spin_lock_irqsave(&thi->t_lock, flags);
1831 switch (thi->t_state) {
1833 dev_info(DEV, "Starting %s thread (from %s [%d])\n",
1834 me, current->comm, current->pid);
1836 /* Get ref on module for thread - this is released when thread exits */
1837 if (!try_module_get(THIS_MODULE)) {
1838 dev_err(DEV, "Failed to get module reference in drbd_thread_start\n");
1839 spin_unlock_irqrestore(&thi->t_lock, flags);
1843 init_completion(&thi->stop);
1844 D_ASSERT(thi->task == NULL);
1845 thi->reset_cpu_mask = 1;
1846 thi->t_state = Running;
1847 spin_unlock_irqrestore(&thi->t_lock, flags);
1848 flush_signals(current); /* otherw. may get -ERESTARTNOINTR */
1850 nt = kthread_create(drbd_thread_setup, (void *) thi,
1851 "drbd%d_%s", mdev_to_minor(mdev), me);
1854 dev_err(DEV, "Couldn't start thread\n");
1856 module_put(THIS_MODULE);
1859 spin_lock_irqsave(&thi->t_lock, flags);
1861 thi->t_state = Running;
1862 spin_unlock_irqrestore(&thi->t_lock, flags);
1863 wake_up_process(nt);
1866 thi->t_state = Restarting;
1867 dev_info(DEV, "Restarting %s thread (from %s [%d])\n",
1868 me, current->comm, current->pid);
1873 spin_unlock_irqrestore(&thi->t_lock, flags);
1881 void _drbd_thread_stop(struct drbd_thread *thi, int restart, int wait)
1883 unsigned long flags;
1885 enum drbd_thread_state ns = restart ? Restarting : Exiting;
1887 /* may be called from state engine, holding the req lock irqsave */
1888 spin_lock_irqsave(&thi->t_lock, flags);
1890 if (thi->t_state == None) {
1891 spin_unlock_irqrestore(&thi->t_lock, flags);
1893 drbd_thread_start(thi);
1897 if (thi->t_state != ns) {
1898 if (thi->task == NULL) {
1899 spin_unlock_irqrestore(&thi->t_lock, flags);
1905 init_completion(&thi->stop);
1906 if (thi->task != current)
1907 force_sig(DRBD_SIGKILL, thi->task);
1911 spin_unlock_irqrestore(&thi->t_lock, flags);
1914 wait_for_completion(&thi->stop);
1919 * drbd_calc_cpu_mask() - Generate CPU masks, spread over all CPUs
1920 * @mdev: DRBD device.
1922 * Forces all threads of a device onto the same CPU. This is beneficial for
1923 * DRBD's performance. May be overridden by the user's configuration.
1925 void drbd_calc_cpu_mask(struct drbd_conf *mdev)
1929 /* user override. */
1930 if (cpumask_weight(mdev->cpu_mask))
1933 ord = mdev_to_minor(mdev) % cpumask_weight(cpu_online_mask);
1934 for_each_online_cpu(cpu) {
1936 cpumask_set_cpu(cpu, mdev->cpu_mask);
1940 /* should not be reached */
1941 cpumask_setall(mdev->cpu_mask);
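/* Worked example (illustrative, not in the original file): with 4 online CPUs,
 * ord = minor % 4, so minors 0, 4, 8, ... pin their threads to the first
 * online CPU, minors 1, 5, 9, ... to the second, and so on - spreading
 * devices across CPUs while keeping all three threads of one device together. */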
1945 * drbd_thread_current_set_cpu() - modifies the cpu mask of the _current_ thread
1946 * @mdev: DRBD device.
1948 * call in the "main loop" of _all_ threads, no need for any mutex, current won't die
1951 void drbd_thread_current_set_cpu(struct drbd_conf *mdev)
1953 struct task_struct *p = current;
1954 struct drbd_thread *thi =
1955 p == mdev->asender.task ? &mdev->asender :
1956 p == mdev->receiver.task ? &mdev->receiver :
1957 p == mdev->worker.task ? &mdev->worker :
1961 if (!thi->reset_cpu_mask)
1963 thi->reset_cpu_mask = 0;
1964 set_cpus_allowed_ptr(p, mdev->cpu_mask);
1968 /* the appropriate socket mutex must be held already */
1969 int _drbd_send_cmd(struct drbd_conf *mdev, struct socket *sock,
1970 enum drbd_packets cmd, struct p_header80 *h,
1971 size_t size, unsigned msg_flags)
1975 ERR_IF(!h) return false;
1976 ERR_IF(!size) return false;
1978 h->magic = BE_DRBD_MAGIC;
1979 h->command = cpu_to_be16(cmd);
1980 h->length = cpu_to_be16(size-sizeof(struct p_header80));
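	/* the on-the-wire length field counts only the payload that follows
	 * the header, hence the sizeof(struct p_header80) subtraction */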
1982 sent = drbd_send(mdev, sock, h, size, msg_flags);
1984 ok = (sent == size);
1985 if (!ok && !signal_pending(current))
1986 dev_warn(DEV, "short sent %s size=%d sent=%d\n",
1987 cmdname(cmd), (int)size, sent);
1991 /* don't pass the socket. we may only look at it
1992 * when we hold the appropriate socket mutex.
1994 int drbd_send_cmd(struct drbd_conf *mdev, int use_data_socket,
1995 enum drbd_packets cmd, struct p_header80 *h, size_t size)
1998 struct socket *sock;
2000 if (use_data_socket) {
2001 mutex_lock(&mdev->data.mutex);
2002 sock = mdev->data.socket;
2004 mutex_lock(&mdev->meta.mutex);
2005 sock = mdev->meta.socket;
2008 /* drbd_disconnect() could have called drbd_free_sock()
2009 * while we were waiting in down()... */
2010 if (likely(sock != NULL))
2011 ok = _drbd_send_cmd(mdev, sock, cmd, h, size, 0);
2013 if (use_data_socket)
2014 mutex_unlock(&mdev->data.mutex);
2016 mutex_unlock(&mdev->meta.mutex);
2020 int drbd_send_cmd2(struct drbd_conf *mdev, enum drbd_packets cmd, char *data,
2023 struct p_header80 h;
2026 h.magic = BE_DRBD_MAGIC;
2027 h.command = cpu_to_be16(cmd);
2028 h.length = cpu_to_be16(size);
2030 if (!drbd_get_data_sock(mdev))
2034 drbd_send(mdev, mdev->data.socket, &h, sizeof(h), 0));
2036 drbd_send(mdev, mdev->data.socket, data, size, 0));
2038 drbd_put_data_sock(mdev);
2043 int drbd_send_sync_param(struct drbd_conf *mdev, struct syncer_conf *sc)
2045 struct p_rs_param_95 *p;
2046 struct socket *sock;
2048 const int apv = mdev->agreed_pro_version;
2050 size = apv <= 87 ? sizeof(struct p_rs_param)
2051 : apv == 88 ? sizeof(struct p_rs_param)
2052 + strlen(mdev->sync_conf.verify_alg) + 1
2053 : apv <= 94 ? sizeof(struct p_rs_param_89)
2054 : /* apv >= 95 */ sizeof(struct p_rs_param_95);
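/* The sync parameter packet grew with the protocol version: up to apv 87 only
 * the plain p_rs_param fits, apv 88 additionally carries the verify_alg name,
 * apv 89..94 use p_rs_param_89, and apv >= 95 use p_rs_param_95, which also
 * holds the dynamic resync controller knobs (c_plan_ahead, c_delay_target,
 * c_fill_target, c_max_rate) filled in below. */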
2056 /* used from admin command context and receiver/worker context.
2057 * to avoid kmalloc, grab the socket right here,
2058 * then use the pre-allocated sbuf there */
2059 mutex_lock(&mdev->data.mutex);
2060 sock = mdev->data.socket;
2062 if (likely(sock != NULL)) {
2063 enum drbd_packets cmd = apv >= 89 ? P_SYNC_PARAM89 : P_SYNC_PARAM;
2065 p = &mdev->data.sbuf.rs_param_95;
2067 /* initialize verify_alg and csums_alg */
2068 memset(p->verify_alg, 0, 2 * SHARED_SECRET_MAX);
2070 p->rate = cpu_to_be32(sc->rate);
2071 p->c_plan_ahead = cpu_to_be32(sc->c_plan_ahead);
2072 p->c_delay_target = cpu_to_be32(sc->c_delay_target);
2073 p->c_fill_target = cpu_to_be32(sc->c_fill_target);
2074 p->c_max_rate = cpu_to_be32(sc->c_max_rate);
2077 strcpy(p->verify_alg, mdev->sync_conf.verify_alg);
2079 strcpy(p->csums_alg, mdev->sync_conf.csums_alg);
2081 rv = _drbd_send_cmd(mdev, sock, cmd, &p->head, size, 0);
2083 rv = 0; /* not ok */
2085 mutex_unlock(&mdev->data.mutex);
2090 int drbd_send_protocol(struct drbd_conf *mdev)
2092 struct p_protocol *p;
2095 size = sizeof(struct p_protocol);
2097 if (mdev->agreed_pro_version >= 87)
2098 size += strlen(mdev->net_conf->integrity_alg) + 1;
2100 /* we must not recurse into our own queue,
2101 * as that is blocked during handshake */
2102 p = kmalloc(size, GFP_NOIO);
2106 p->protocol = cpu_to_be32(mdev->net_conf->wire_protocol);
2107 p->after_sb_0p = cpu_to_be32(mdev->net_conf->after_sb_0p);
2108 p->after_sb_1p = cpu_to_be32(mdev->net_conf->after_sb_1p);
2109 p->after_sb_2p = cpu_to_be32(mdev->net_conf->after_sb_2p);
2110 p->two_primaries = cpu_to_be32(mdev->net_conf->two_primaries);
2113 if (mdev->net_conf->want_lose)
2115 if (mdev->net_conf->dry_run) {
2116 if (mdev->agreed_pro_version >= 92)
2119 dev_err(DEV, "--dry-run is not supported by peer");
2124 p->conn_flags = cpu_to_be32(cf);
2126 if (mdev->agreed_pro_version >= 87)
2127 strcpy(p->integrity_alg, mdev->net_conf->integrity_alg);
2129 rv = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_PROTOCOL,
2130 (struct p_header80 *)p, size);
2135 int _drbd_send_uuids(struct drbd_conf *mdev, u64 uuid_flags)
2140 if (!get_ldev_if_state(mdev, D_NEGOTIATING))
2143 spin_lock_irq(&mdev->ldev->md.uuid_lock);
2144 for (i = UI_CURRENT; i < UI_SIZE; i++)
2145 p.uuid[i] = mdev->ldev ? cpu_to_be64(mdev->ldev->md.uuid[i]) : 0;
2146 spin_unlock_irq(&mdev->ldev->md.uuid_lock);
2148 mdev->comm_bm_set = drbd_bm_total_weight(mdev);
2149 p.uuid[UI_SIZE] = cpu_to_be64(mdev->comm_bm_set);
2150 uuid_flags |= mdev->net_conf->want_lose ? 1 : 0;
2151 uuid_flags |= drbd_test_flag(mdev, CRASHED_PRIMARY) ? 2 : 0;
2152 uuid_flags |= mdev->new_state_tmp.disk == D_INCONSISTENT ? 4 : 0;
2153 p.uuid[UI_FLAGS] = cpu_to_be64(uuid_flags);
2157 return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_UUIDS,
2158 (struct p_header80 *)&p, sizeof(p));
2161 int drbd_send_uuids(struct drbd_conf *mdev)
2163 return _drbd_send_uuids(mdev, 0);
2166 int drbd_send_uuids_skip_initial_sync(struct drbd_conf *mdev)
2168 return _drbd_send_uuids(mdev, 8);
2171 void drbd_print_uuids(struct drbd_conf *mdev, const char *text)
2173 if (get_ldev_if_state(mdev, D_NEGOTIATING)) {
2174 u64 *uuid = mdev->ldev->md.uuid;
2175 dev_info(DEV, "%s %016llX:%016llX:%016llX:%016llX\n",
2177 (unsigned long long)uuid[UI_CURRENT],
2178 (unsigned long long)uuid[UI_BITMAP],
2179 (unsigned long long)uuid[UI_HISTORY_START],
2180 (unsigned long long)uuid[UI_HISTORY_END]);
2183 dev_info(DEV, "%s effective data uuid: %016llX\n",
2185 (unsigned long long)mdev->ed_uuid);
2189 int drbd_gen_and_send_sync_uuid(struct drbd_conf *mdev)
2194 D_ASSERT(mdev->state.disk == D_UP_TO_DATE);
2196 uuid = mdev->ldev->md.uuid[UI_BITMAP];
2197 if (uuid && uuid != UUID_JUST_CREATED)
2198 uuid = uuid + UUID_NEW_BM_OFFSET;
2200 get_random_bytes(&uuid, sizeof(u64));
2201 drbd_uuid_set(mdev, UI_BITMAP, uuid);
2202 drbd_print_uuids(mdev, "updated sync UUID");
2204 p.uuid = cpu_to_be64(uuid);
2206 return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_SYNC_UUID,
2207 (struct p_header80 *)&p, sizeof(p));
2210 int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags flags)
2213 sector_t d_size, u_size;
2215 unsigned int max_bio_size;
2218 if (get_ldev_if_state(mdev, D_NEGOTIATING)) {
2219 D_ASSERT(mdev->ldev->backing_bdev);
2220 d_size = drbd_get_max_capacity(mdev->ldev);
2221 u_size = mdev->ldev->dc.disk_size;
2222 q_order_type = drbd_queue_order_type(mdev);
2223 max_bio_size = queue_max_hw_sectors(mdev->ldev->backing_bdev->bd_disk->queue) << 9;
2224 max_bio_size = min(max_bio_size, DRBD_MAX_BIO_SIZE);
2229 q_order_type = QUEUE_ORDERED_NONE;
2230 max_bio_size = DRBD_MAX_BIO_SIZE; /* ... multiple BIOs per peer_request */
2233 /* Never allow old drbd (up to 8.3.7) to see more than 32KiB */
2234 if (mdev->agreed_pro_version <= 94)
2235 max_bio_size = min(max_bio_size, DRBD_MAX_SIZE_H80_PACKET);
2237 p.d_size = cpu_to_be64(d_size);
2238 p.u_size = cpu_to_be64(u_size);
2239 p.c_size = cpu_to_be64(trigger_reply ? 0 : drbd_get_capacity(mdev->this_bdev));
2240 p.max_bio_size = cpu_to_be32(max_bio_size);
2241 p.queue_order_type = cpu_to_be16(q_order_type);
2242 p.dds_flags = cpu_to_be16(flags);
2244 ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_SIZES,
2245 (struct p_header80 *)&p, sizeof(p));
2250 * drbd_send_current_state() - Sends the drbd state to the peer
2251 * @mdev: DRBD device.
2253 int drbd_send_current_state(struct drbd_conf *mdev)
2255 struct socket *sock;
2259 /* Grab state lock so we won't send state if we're in the middle
2260 * of a cluster wide state change on another thread */
2261 drbd_state_lock(mdev);
2263 mutex_lock(&mdev->data.mutex);
2265 p.state = cpu_to_be32(mdev->state.i); /* Within the send mutex */
2266 sock = mdev->data.socket;
2268 if (likely(sock != NULL)) {
2269 ok = _drbd_send_cmd(mdev, sock, P_STATE,
2270 (struct p_header80 *)&p, sizeof(p), 0);
2273 mutex_unlock(&mdev->data.mutex);
2275 drbd_state_unlock(mdev);
2280 * drbd_send_state() - After a state change, sends the new state to the peer
2281 * @mdev: DRBD device.
2282 * @state: the state to send, not necessarily the current state.
2284 * Each state change queues an "after_state_ch" work, which will eventually
2285 * send the resulting new state to the peer. If more state changes happen
2286 * between queuing and processing of the after_state_ch work, we still
2287 * want to send each intermediary state in the order it occurred.
2289 int drbd_send_state(struct drbd_conf *mdev, union drbd_state state)
2291 struct socket *sock;
2295 mutex_lock(&mdev->data.mutex);
2297 p.state = cpu_to_be32(state.i);
2298 sock = mdev->data.socket;
2300 if (likely(sock != NULL)) {
2301 ok = _drbd_send_cmd(mdev, sock, P_STATE,
2302 (struct p_header80 *)&p, sizeof(p), 0);
2305 mutex_unlock(&mdev->data.mutex);
2310 int drbd_send_state_req(struct drbd_conf *mdev,
2311 union drbd_state mask, union drbd_state val)
2313 struct p_req_state p;
2315 p.mask = cpu_to_be32(mask.i);
2316 p.val = cpu_to_be32(val.i);
2318 return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_STATE_CHG_REQ,
2319 (struct p_header80 *)&p, sizeof(p));
2322 int drbd_send_sr_reply(struct drbd_conf *mdev, enum drbd_state_rv retcode)
2324 struct p_req_state_reply p;
2326 p.retcode = cpu_to_be32(retcode);
2328 return drbd_send_cmd(mdev, USE_META_SOCKET, P_STATE_CHG_REPLY,
2329 (struct p_header80 *)&p, sizeof(p));
2332 int fill_bitmap_rle_bits(struct drbd_conf *mdev,
2333 struct p_compressed_bm *p,
2334 struct bm_xfer_ctx *c)
2336 struct bitstream bs;
2337 unsigned long plain_bits;
2344 /* may we use this feature? */
2345 if ((mdev->sync_conf.use_rle == 0) ||
2346 (mdev->agreed_pro_version < 90))
2349 if (c->bit_offset >= c->bm_bits)
2350 return 0; /* nothing to do. */
2352 /* use at most this many bytes */
2353 bitstream_init(&bs, p->code, BM_PACKET_VLI_BYTES_MAX, 0);
2354 memset(p->code, 0, BM_PACKET_VLI_BYTES_MAX);
2355 /* plain bits covered in this code string */
2358 /* p->encoding & 0x80 stores whether the first run length is set.
2359 * bit offset is implicit.
2360 * start with toggle == 2 to be able to tell the first iteration */
2363 /* see how many plain bits we can stuff into one packet
2364 * using RLE and VLI. */
2366 tmp = (toggle == 0) ? _drbd_bm_find_next_zero(mdev, c->bit_offset)
2367 : _drbd_bm_find_next(mdev, c->bit_offset);
2370 rl = tmp - c->bit_offset;
2372 if (toggle == 2) { /* first iteration */
2374 /* the first checked bit was set,
2375 * store start value, */
2376 DCBP_set_start(p, 1);
2377 /* but skip encoding of zero run length */
2381 DCBP_set_start(p, 0);
2384 /* paranoia: catch zero runlength.
2385 * can only happen if bitmap is modified while we scan it. */
2387 dev_err(DEV, "unexpected zero runlength while encoding bitmap "
2388 "t:%u bo:%lu\n", toggle, c->bit_offset);
2392 bits = vli_encode_bits(&bs, rl);
2393 if (bits == -ENOBUFS) /* buffer full */
2396 dev_err(DEV, "error while encoding bitmap: %d\n", bits);
2402 c->bit_offset = tmp;
2403 } while (c->bit_offset < c->bm_bits);
2405 len = bs.cur.b - p->code + !!bs.cur.bit;
2407 if (plain_bits < (len << 3)) {
2408 /* incompressible with this method.
2409 * we need to rewind both word and bit position. */
2410 c->bit_offset -= plain_bits;
2411 bm_xfer_ctx_bit_to_word_offset(c);
2412 c->bit_offset = c->word_offset * BITS_PER_LONG;
2416 /* RLE + VLI was able to compress it just fine.
2417 * update c->word_offset. */
2418 bm_xfer_ctx_bit_to_word_offset(c);
2420 /* store pad_bits */
2421 DCBP_set_pad_bits(p, (8 - bs.cur.bit) & 0x7);
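/* At this point the compressed packet carries, in p->encoding, whether the
 * first run describes set bits plus the number of trailing pad bits, followed
 * by the VLI-encoded run lengths, which alternate between runs of set and
 * cleared bitmap bits. */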
2427 * send_bitmap_rle_or_plain
2429 * Return 0 when done, 1 when another iteration is needed, and a negative error
2430 * code upon failure.
2433 send_bitmap_rle_or_plain(struct drbd_conf *mdev,
2434 struct p_header80 *h, struct bm_xfer_ctx *c)
2436 struct p_compressed_bm *p = (void*)h;
2437 unsigned long num_words;
2441 len = fill_bitmap_rle_bits(mdev, p, c);
2447 DCBP_set_code(p, RLE_VLI_Bits);
2448 ok = _drbd_send_cmd(mdev, mdev->data.socket, P_COMPRESSED_BITMAP, h,
2449 sizeof(*p) + len, 0);
2452 c->bytes[0] += sizeof(*p) + len;
2454 if (c->bit_offset >= c->bm_bits)
2457 /* was not compressible.
2458 * send a buffer full of plain text bits instead. */
2459 num_words = min_t(size_t, BM_PACKET_WORDS, c->bm_words - c->word_offset);
2460 len = num_words * sizeof(long);
2462 drbd_bm_get_lel(mdev, c->word_offset, num_words, (unsigned long*)h->payload);
2463 ok = _drbd_send_cmd(mdev, mdev->data.socket, P_BITMAP,
2464 h, sizeof(struct p_header80) + len, 0);
2465 c->word_offset += num_words;
2466 c->bit_offset = c->word_offset * BITS_PER_LONG;
2469 c->bytes[1] += sizeof(struct p_header80) + len;
2471 if (c->bit_offset > c->bm_bits)
2472 c->bit_offset = c->bm_bits;
2476 INFO_bm_xfer_stats(mdev, "send", c);
2484 /* See the comment at receive_bitmap() */
2485 int _drbd_send_bitmap(struct drbd_conf *mdev)
2487 struct bm_xfer_ctx c;
2488 struct p_header80 *p;
2491 ERR_IF(!mdev->bitmap) return false;
2493 /* maybe we should use some per thread scratch page,
2494 * and allocate that during initial device creation? */
2495 p = (struct p_header80 *) __get_free_page(GFP_NOIO);
2497 dev_err(DEV, "failed to allocate one page buffer in %s\n", __func__);
2501 if (get_ldev(mdev)) {
2502 if (drbd_md_test_flag(mdev->ldev, MDF_FULL_SYNC)) {
2503 dev_info(DEV, "Writing the whole bitmap, MDF_FullSync was set.\n");
2504 drbd_bm_set_all(mdev);
2505 if (drbd_bm_write(mdev)) {
2506 /* write_bm did fail! Leave full sync flag set in Meta P_DATA
2507 * but otherwise process as per normal - need to tell other
2508 * side that a full resync is required! */
2509 dev_err(DEV, "Failed to write bitmap to disk!\n");
2511 drbd_md_clear_flag(mdev, MDF_FULL_SYNC);
2518 c = (struct bm_xfer_ctx) {
2519 .bm_bits = drbd_bm_bits(mdev),
2520 .bm_words = drbd_bm_words(mdev),
2524 err = send_bitmap_rle_or_plain(mdev, p, &c);
2527 free_page((unsigned long) p);
2531 int drbd_send_bitmap(struct drbd_conf *mdev)
2535 if (!drbd_get_data_sock(mdev))
2537 err = !_drbd_send_bitmap(mdev);
2538 drbd_put_data_sock(mdev);
2542 int drbd_send_b_ack(struct drbd_conf *mdev, u32 barrier_nr, u32 set_size)
2545 struct p_barrier_ack p;
2547 p.barrier = barrier_nr;
2548 p.set_size = cpu_to_be32(set_size);
2550 if (mdev->state.conn < C_CONNECTED)
2552 ok = drbd_send_cmd(mdev, USE_META_SOCKET, P_BARRIER_ACK,
2553 (struct p_header80 *)&p, sizeof(p));
2558 * _drbd_send_ack() - Sends an ack packet
2559 * @mdev: DRBD device.
2560 * @cmd: Packet command code.
2561 * @sector: sector, needs to be in big endian byte order
2562 * @blksize: size in bytes, needs to be in big endian byte order
2563 * @block_id: Id, big endian byte order
2565 static int _drbd_send_ack(struct drbd_conf *mdev, enum drbd_packets cmd,
2571 struct p_block_ack p;
2574 p.block_id = block_id;
2575 p.blksize = blksize;
2576 p.seq_num = cpu_to_be32(atomic_add_return(1, &mdev->packet_seq));
2578 if (!mdev->meta.socket || mdev->state.conn < C_CONNECTED)
2580 ok = drbd_send_cmd(mdev, USE_META_SOCKET, cmd,
2581 (struct p_header80 *)&p, sizeof(p));
2585 /* dp->sector and dp->block_id already/still in network byte order,
2586 * data_size is payload size according to dp->head,
2587 * and may need to be corrected for digest size. */
2588 int drbd_send_ack_dp(struct drbd_conf *mdev, enum drbd_packets cmd,
2589 struct p_data *dp, int data_size)
2591 data_size -= (mdev->agreed_pro_version >= 87 && mdev->integrity_r_tfm) ?
2592 crypto_hash_digestsize(mdev->integrity_r_tfm) : 0;
2593 return _drbd_send_ack(mdev, cmd, dp->sector, cpu_to_be32(data_size),
2597 int drbd_send_ack_rp(struct drbd_conf *mdev, enum drbd_packets cmd,
2598 struct p_block_req *rp)
2600 return _drbd_send_ack(mdev, cmd, rp->sector, rp->blksize, rp->block_id);
2604 * drbd_send_ack() - Sends an ack packet
2605 * @mdev: DRBD device.
2606 * @cmd: Packet command code.
2609 int drbd_send_ack(struct drbd_conf *mdev,
2610 enum drbd_packets cmd, struct drbd_epoch_entry *e)
2612 return _drbd_send_ack(mdev, cmd,
2613 cpu_to_be64(e->sector),
2614 cpu_to_be32(e->size),
2618 /* This function misuses the block_id field to signal if the blocks
2619 * are in sync or not. */
2620 int drbd_send_ack_ex(struct drbd_conf *mdev, enum drbd_packets cmd,
2621 sector_t sector, int blksize, u64 block_id)
2623 return _drbd_send_ack(mdev, cmd,
2624 cpu_to_be64(sector),
2625 cpu_to_be32(blksize),
2626 cpu_to_be64(block_id));
2629 int drbd_send_drequest(struct drbd_conf *mdev, int cmd,
2630 sector_t sector, int size, u64 block_id)
2633 struct p_block_req p;
2635 p.sector = cpu_to_be64(sector);
2636 p.block_id = block_id;
2637 p.blksize = cpu_to_be32(size);
2639 ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, cmd,
2640 (struct p_header80 *)&p, sizeof(p));
2644 int drbd_send_drequest_csum(struct drbd_conf *mdev,
2645 sector_t sector, int size,
2646 void *digest, int digest_size,
2647 enum drbd_packets cmd)
2650 struct p_block_req p;
2652 p.sector = cpu_to_be64(sector);
2653 p.block_id = BE_DRBD_MAGIC + 0xbeef;
2654 p.blksize = cpu_to_be32(size);
2656 p.head.magic = BE_DRBD_MAGIC;
2657 p.head.command = cpu_to_be16(cmd);
2658 p.head.length = cpu_to_be16(sizeof(p) - sizeof(struct p_header80) + digest_size);
2660 mutex_lock(&mdev->data.mutex);
2662 ok = (sizeof(p) == drbd_send(mdev, mdev->data.socket, &p, sizeof(p), 0));
2663 ok = ok && (digest_size == drbd_send(mdev, mdev->data.socket, digest, digest_size, 0));
2665 mutex_unlock(&mdev->data.mutex);
2670 int drbd_send_ov_request(struct drbd_conf *mdev, sector_t sector, int size)
2673 struct p_block_req p;
2675 p.sector = cpu_to_be64(sector);
2676 p.block_id = BE_DRBD_MAGIC + 0xbabe;
2677 p.blksize = cpu_to_be32(size);
2679 ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_OV_REQUEST,
2680 (struct p_header80 *)&p, sizeof(p));
2684 /* called on sndtimeo
2685 * returns false if we should retry,
2686 * true if we think connection is dead
2688 static int we_should_drop_the_connection(struct drbd_conf *mdev, struct socket *sock)
2691 /* long elapsed = (long)(jiffies - mdev->last_received); */
2693 drop_it = mdev->meta.socket == sock
2694 || !mdev->asender.task
2695 || get_t_state(&mdev->asender) != Running
2696 || mdev->state.conn < C_CONNECTED;
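/* ko_count is a countdown of tolerated send timeouts on the data socket:
 * drbd_send() below reloads it from net_conf->ko_count before each send, and
 * every timeout that ends up here decrements it; once it reaches zero we
 * consider the peer dead and drop the connection. */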
2701 drop_it = !--mdev->ko_count;
2703 dev_err(DEV, "[%s/%d] sock_sendmsg time expired, ko = %u\n",
2704 current->comm, current->pid, mdev->ko_count);
2708 return drop_it; /* && (mdev->state == R_PRIMARY) */;
2711 /* The idea of sendpage seems to be to put some kind of reference
2712 * to the page into the skb, and to hand it over to the NIC. In
2713 * this process get_page() gets called.
2715 * As soon as the page was really sent over the network put_page()
2716 * gets called by some part of the network layer. [ NIC driver? ]
2718 * [ get_page() / put_page() increment/decrement the count. If count
2719 * reaches 0 the page will be freed. ]
2721 * This works nicely with pages from FSs.
2722 * But this means that in protocol A we might signal IO completion too early!
2724 * In order not to corrupt data during a resync we must make sure
2725 * that we do not reuse our own buffer pages (EEs) too early, therefore
2726 * we have the net_ee list.
2728 * XFS seems to have problems, still, it submits pages with page_count == 0!
2729 * As a workaround, we disable sendpage on pages
2730 * with page_count == 0 or PageSlab.
2732 static int _drbd_no_send_page(struct drbd_conf *mdev, struct page *page,
2733 int offset, size_t size, unsigned msg_flags)
2735 int sent = drbd_send(mdev, mdev->data.socket, kmap(page) + offset, size, msg_flags);
2738 mdev->send_cnt += size>>9;
2739 return sent == size;
2742 static int _drbd_send_page(struct drbd_conf *mdev, struct page *page,
2743 int offset, size_t size, unsigned msg_flags)
2745 mm_segment_t oldfs = get_fs();
2749 /* e.g. XFS meta- & log-data is in slab pages, which have a
2750 * page_count of 0 and/or have PageSlab() set.
2751 * we cannot use send_page for those, as that does get_page();
2752 * put_page(); and would cause either a VM_BUG directly, or
2753 * __page_cache_release a page that would actually still be referenced
2754 * by someone, leading to some obscure delayed Oops somewhere else. */
2755 if (disable_sendpage || (page_count(page) < 1) || PageSlab(page))
2756 return _drbd_no_send_page(mdev, page, offset, size, msg_flags);
2758 msg_flags |= MSG_NOSIGNAL;
2759 drbd_update_congested(mdev);
2762 sent = mdev->data.socket->ops->sendpage(mdev->data.socket, page,
2765 if (sent == -EAGAIN) {
2766 if (we_should_drop_the_connection(mdev,
2773 dev_warn(DEV, "%s: size=%d len=%d sent=%d\n",
2774 __func__, (int)size, len, sent);
2779 } while (len > 0 /* THINK && mdev->cstate >= C_CONNECTED*/);
2781 drbd_clear_flag(mdev, NET_CONGESTED);
2785 mdev->send_cnt += size>>9;
2789 static int _drbd_send_bio(struct drbd_conf *mdev, struct bio *bio)
2791 struct bio_vec *bvec;
2793 /* hint all but last page with MSG_MORE */
2794 bio_for_each_segment(bvec, bio, i) {
2795 if (!_drbd_no_send_page(mdev, bvec->bv_page,
2796 bvec->bv_offset, bvec->bv_len,
2797 i == bio->bi_vcnt -1 ? 0 : MSG_MORE))
2803 static int _drbd_send_zc_bio(struct drbd_conf *mdev, struct bio *bio)
2805 struct bio_vec *bvec;
2807 /* hint all but last page with MSG_MORE */
2808 bio_for_each_segment(bvec, bio, i) {
2809 if (!_drbd_send_page(mdev, bvec->bv_page,
2810 bvec->bv_offset, bvec->bv_len,
2811 i == bio->bi_vcnt -1 ? 0 : MSG_MORE))
2817 static int _drbd_send_zc_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e)
2819 struct page *page = e->pages;
2820 unsigned len = e->size;
2821 /* hint all but last page with MSG_MORE */
2822 page_chain_for_each(page) {
2823 unsigned l = min_t(unsigned, len, PAGE_SIZE);
2824 if (!_drbd_send_page(mdev, page, 0, l,
2825 page_chain_next(page) ? MSG_MORE : 0))
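/* Translate a bio's rw flags into the DP_* flags sent on the wire.
 * Peers speaking protocol 95 or newer understand DP_FUA, DP_FLUSH and
 * DP_DISCARD in addition to DP_RW_SYNC; older peers only ever see
 * DP_RW_SYNC. */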
2832 static u32 bio_flags_to_wire(struct drbd_conf *mdev, unsigned long bi_rw)
2834 if (mdev->agreed_pro_version >= 95)
2835 return (bi_rw & REQ_SYNC ? DP_RW_SYNC : 0) |
2836 (bi_rw & REQ_FUA ? DP_FUA : 0) |
2837 (bi_rw & REQ_FLUSH ? DP_FLUSH : 0) |
2838 (bi_rw & REQ_DISCARD ? DP_DISCARD : 0);
2840 return bi_rw & REQ_SYNC ? DP_RW_SYNC : 0;
2843 /* Used to send write requests
2844 * R_PRIMARY -> Peer (P_DATA)
2846 int drbd_send_dblock(struct drbd_conf *mdev, struct drbd_request *req)
2850 unsigned int dp_flags = 0;
2854 if (!drbd_get_data_sock(mdev))
2857 dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_w_tfm) ?
2858 crypto_hash_digestsize(mdev->integrity_w_tfm) : 0;
2860 if (req->size <= DRBD_MAX_SIZE_H80_PACKET) {
2861 p.head.h80.magic = BE_DRBD_MAGIC;
2862 p.head.h80.command = cpu_to_be16(P_DATA);
2864 cpu_to_be16(sizeof(p) - sizeof(union p_header) + dgs + req->size);
2866 p.head.h95.magic = BE_DRBD_MAGIC_BIG;
2867 p.head.h95.command = cpu_to_be16(P_DATA);
2869 cpu_to_be32(sizeof(p) - sizeof(union p_header) + dgs + req->size);
2872 p.sector = cpu_to_be64(req->sector);
2873 p.block_id = (unsigned long)req;
2874 p.seq_num = cpu_to_be32(atomic_add_return(1, &mdev->packet_seq));
2876 dp_flags = bio_flags_to_wire(mdev, req->master_bio->bi_rw);
2878 if (mdev->state.conn >= C_SYNC_SOURCE &&
2879 mdev->state.conn <= C_PAUSED_SYNC_T)
2880 dp_flags |= DP_MAY_SET_IN_SYNC;
2882 p.dp_flags = cpu_to_be32(dp_flags);
2883 drbd_set_flag(mdev, UNPLUG_REMOTE);
2885 drbd_send(mdev, mdev->data.socket, &p, sizeof(p), dgs ? MSG_MORE : 0));
2887 dgb = mdev->int_dig_out;
2888 drbd_csum_bio(mdev, mdev->integrity_w_tfm, req->master_bio, dgb);
2889 ok = dgs == drbd_send(mdev, mdev->data.socket, dgb, dgs, 0);
2892 /* For protocol A, we have to memcpy the payload into
2893 * socket buffers, as we may complete right away
2894 * as soon as we handed it over to tcp, at which point the data
2895 * pages may become invalid.
2897 * For data-integrity enabled, we copy it as well, so we can be
2898 * sure that even if the bio pages may still be modified, it
2899 * won't change the data on the wire, thus if the digest checks
2900 * out ok after sending on this side, but does not fit on the
2901 * receiving side, we sure have detected corruption elsewhere.
2903 if (mdev->net_conf->wire_protocol == DRBD_PROT_A || dgs)
2904 ok = _drbd_send_bio(mdev, req->master_bio);
2906 ok = _drbd_send_zc_bio(mdev, req->master_bio);
2908 /* double check digest, sometimes buffers have been modified in flight. */
2909 if (dgs > 0 && dgs <= 64) {
2910 /* 64 byte, 512 bit, is the largest digest size
2911 * currently supported in kernel crypto. */
2912 unsigned char digest[64];
2913 drbd_csum_bio(mdev, mdev->integrity_w_tfm, req->master_bio, digest);
2914 if (memcmp(mdev->int_dig_out, digest, dgs)) {
2916 "Digest mismatch, buffer modified by upper layers during write: %llus +%u\n",
2917 (unsigned long long)req->sector, req->size);
2919 } /* else if (dgs > 64) {
2920 ... Be noisy about digest too large ...
2924 drbd_put_data_sock(mdev);
2929 /* answer packet, used to send data back for read requests:
2930 * Peer -> (diskless) R_PRIMARY (P_DATA_REPLY)
2931 * C_SYNC_SOURCE -> C_SYNC_TARGET (P_RS_DATA_REPLY)
2933 int drbd_send_block(struct drbd_conf *mdev, enum drbd_packets cmd,
2934 struct drbd_epoch_entry *e)
2941 dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_w_tfm) ?
2942 crypto_hash_digestsize(mdev->integrity_w_tfm) : 0;
2944 if (e->size <= DRBD_MAX_SIZE_H80_PACKET) {
2945 p.head.h80.magic = BE_DRBD_MAGIC;
2946 p.head.h80.command = cpu_to_be16(cmd);
2948 cpu_to_be16(sizeof(p) - sizeof(struct p_header80) + dgs + e->size);
2950 p.head.h95.magic = BE_DRBD_MAGIC_BIG;
2951 p.head.h95.command = cpu_to_be16(cmd);
2953 cpu_to_be32(sizeof(p) - sizeof(struct p_header80) + dgs + e->size);
2956 p.sector = cpu_to_be64(e->sector);
2957 p.block_id = e->block_id;
2958 /* p.seq_num = 0; No sequence numbers here.. */
2960 /* Only called by our kernel thread.
2961 * This one may be interrupted by DRBD_SIG and/or DRBD_SIGKILL
2962 * in response to admin command or module unload.
2964 if (!drbd_get_data_sock(mdev))
2967 ok = sizeof(p) == drbd_send(mdev, mdev->data.socket, &p, sizeof(p), dgs ? MSG_MORE : 0);
2969 dgb = mdev->int_dig_out;
2970 drbd_csum_ee(mdev, mdev->integrity_w_tfm, e, dgb);
2971 ok = dgs == drbd_send(mdev, mdev->data.socket, dgb, dgs, 0);
2974 ok = _drbd_send_zc_ee(mdev, e);
2976 drbd_put_data_sock(mdev);
2981 int drbd_send_oos(struct drbd_conf *mdev, struct drbd_request *req)
2983 struct p_block_desc p;
2985 p.sector = cpu_to_be64(req->sector);
2986 p.blksize = cpu_to_be32(req->size);
2988 return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_OUT_OF_SYNC, &p.head, sizeof(p));
2992 drbd_send distinguishes two cases:
2994 Packets sent via the data socket "sock"
2995 and packets sent via the meta data socket "msock"
                       sock                      msock
2998 -----------------+-------------------------+------------------------------
2999 timeout           conf.timeout / 2          conf.timeout / 2
3000 timeout action    send a ping via msock     Abort communication
3001                                             and close all sockets
3005 * you must have down()ed the appropriate [m]sock_mutex elsewhere!
3007 int drbd_send(struct drbd_conf *mdev, struct socket *sock,
3008 void *buf, size_t size, unsigned msg_flags)
3017 /* THINK if (signal_pending) return ... ? */
3022 msg.msg_name = NULL;
3023 msg.msg_namelen = 0;
3024 msg.msg_control = NULL;
3025 msg.msg_controllen = 0;
3026 msg.msg_flags = msg_flags | MSG_NOSIGNAL;
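/* Send loop below: keep calling kernel_sendmsg() until the whole buffer went
 * out. -EAGAIN means the send timed out; we_should_drop_the_connection()
 * decides whether to retry or give up. On a signal, flush_signals() clears it
 * so the send can be retried. On the data socket we also reload ko_count and
 * flag the connection as congested while sending. */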
3028 if (sock == mdev->data.socket) {
3029 mdev->ko_count = mdev->net_conf->ko_count;
3030 drbd_update_congested(mdev);
3034 * tcp_sendmsg does _not_ use its size parameter at all ?
3036 * -EAGAIN on timeout, -EINTR on signal.
3039 * do we need to block DRBD_SIG if sock == &meta.socket ??
3040 * otherwise wake_asender() might interrupt some send_*Ack !
3042 rv = kernel_sendmsg(sock, &msg, &iov, 1, size);
3043 if (rv == -EAGAIN) {
3044 if (we_should_drop_the_connection(mdev, sock))
3051 flush_signals(current);
3059 } while (sent < size);
3061 if (sock == mdev->data.socket)
3062 drbd_clear_flag(mdev, NET_CONGESTED);
3065 if (rv != -EAGAIN) {
3066 dev_err(DEV, "%s_sendmsg returned %d\n",
3067 sock == mdev->meta.socket ? "msock" : "sock",
3069 drbd_force_state(mdev, NS(conn, C_BROKEN_PIPE));
3071 drbd_force_state(mdev, NS(conn, C_TIMEOUT));
3077 static int drbd_open(struct block_device *bdev, fmode_t mode)
3079 struct drbd_conf *mdev = bdev->bd_disk->private_data;
3080 unsigned long flags;
3083 mutex_lock(&drbd_main_mutex);
3084 spin_lock_irqsave(&mdev->req_lock, flags);
3085 /* to have a stable mdev->state.role
3086 * and no race with updating open_cnt */
3088 if (mdev->state.role != R_PRIMARY) {
3089 if (mode & FMODE_WRITE)
3091 else if (!allow_oos)
3097 spin_unlock_irqrestore(&mdev->req_lock, flags);
3098 mutex_unlock(&drbd_main_mutex);
3103 static int drbd_release(struct gendisk *gd, fmode_t mode)
3105 struct drbd_conf *mdev = gd->private_data;
3106 mutex_lock(&drbd_main_mutex);
3108 mutex_unlock(&drbd_main_mutex);
3112 static void drbd_set_defaults(struct drbd_conf *mdev)
3114 /* This way we get a compile error when sync_conf grows,
3115 and we forget to initialize it here */
3116 mdev->sync_conf = (struct syncer_conf) {
3117 /* .rate = */ DRBD_RATE_DEF,
3118 /* .after = */ DRBD_AFTER_DEF,
3119 /* .al_extents = */ DRBD_AL_EXTENTS_DEF,
3120 /* .verify_alg = */ {}, 0,
3121 /* .cpu_mask = */ {}, 0,
3122 /* .csums_alg = */ {}, 0,
3124 /* .on_no_data = */ DRBD_ON_NO_DATA_DEF,
3125 /* .c_plan_ahead = */ DRBD_C_PLAN_AHEAD_DEF,
3126 /* .c_delay_target = */ DRBD_C_DELAY_TARGET_DEF,
3127 /* .c_fill_target = */ DRBD_C_FILL_TARGET_DEF,
3128 /* .c_max_rate = */ DRBD_C_MAX_RATE_DEF,
3129 /* .c_min_rate = */ DRBD_C_MIN_RATE_DEF
3132 /* Have to do it this way, because the layout differs between
3133 big-endian and little-endian machines */
3134 mdev->state = (union drbd_state) {
3135 { .role = R_SECONDARY,
3137 .conn = C_STANDALONE,
3146 void drbd_init_set_defaults(struct drbd_conf *mdev)
3148 /* the memset(,0,) did most of this.
3149 * note: only assignments, no allocation in here */
3151 drbd_set_defaults(mdev);
3153 atomic_set(&mdev->ap_bio_cnt, 0);
3154 atomic_set(&mdev->ap_pending_cnt, 0);
3155 atomic_set(&mdev->rs_pending_cnt, 0);
3156 atomic_set(&mdev->unacked_cnt, 0);
3157 atomic_set(&mdev->local_cnt, 0);
3158 atomic_set(&mdev->net_cnt, 0);
3159 atomic_set(&mdev->packet_seq, 0);
3160 atomic_set(&mdev->pp_in_use, 0);
3161 atomic_set(&mdev->pp_in_use_by_net, 0);
3162 atomic_set(&mdev->rs_sect_in, 0);
3163 atomic_set(&mdev->rs_sect_ev, 0);
3164 atomic_set(&mdev->ap_in_flight, 0);
3165 atomic_set(&mdev->md_io_in_use, 0);
3167 mutex_init(&mdev->data.mutex);
3168 mutex_init(&mdev->meta.mutex);
3169 sema_init(&mdev->data.work.s, 0);
3170 sema_init(&mdev->meta.work.s, 0);
3171 mutex_init(&mdev->state_mutex);
3173 spin_lock_init(&mdev->data.work.q_lock);
3174 spin_lock_init(&mdev->meta.work.q_lock);
3176 spin_lock_init(&mdev->al_lock);
3177 spin_lock_init(&mdev->req_lock);
3178 spin_lock_init(&mdev->peer_seq_lock);
3179 spin_lock_init(&mdev->epoch_lock);
3181 INIT_LIST_HEAD(&mdev->active_ee);
3182 INIT_LIST_HEAD(&mdev->sync_ee);
3183 INIT_LIST_HEAD(&mdev->done_ee);
3184 INIT_LIST_HEAD(&mdev->read_ee);
3185 INIT_LIST_HEAD(&mdev->net_ee);
3186 INIT_LIST_HEAD(&mdev->resync_reads);
3187 INIT_LIST_HEAD(&mdev->data.work.q);
3188 INIT_LIST_HEAD(&mdev->meta.work.q);
3189 INIT_LIST_HEAD(&mdev->resync_work.list);
3190 INIT_LIST_HEAD(&mdev->unplug_work.list);
3191 INIT_LIST_HEAD(&mdev->go_diskless.list);
3192 INIT_LIST_HEAD(&mdev->md_sync_work.list);
3193 INIT_LIST_HEAD(&mdev->start_resync_work.list);
3194 INIT_LIST_HEAD(&mdev->bm_io_work.w.list);
3196 mdev->resync_work.cb = w_resync_timer;
3197 mdev->unplug_work.cb = w_send_write_hint;
3198 mdev->go_diskless.cb = w_go_diskless;
3199 mdev->md_sync_work.cb = w_md_sync;
3200 mdev->bm_io_work.w.cb = w_bitmap_io;
3201 mdev->start_resync_work.cb = w_start_resync;
3202 init_timer(&mdev->resync_timer);
3203 init_timer(&mdev->md_sync_timer);
3204 init_timer(&mdev->start_resync_timer);
3205 init_timer(&mdev->request_timer);
3206 mdev->resync_timer.function = resync_timer_fn;
3207 mdev->resync_timer.data = (unsigned long) mdev;
3208 mdev->md_sync_timer.function = md_sync_timer_fn;
3209 mdev->md_sync_timer.data = (unsigned long) mdev;
3210 mdev->start_resync_timer.function = start_resync_timer_fn;
3211 mdev->start_resync_timer.data = (unsigned long) mdev;
3212 mdev->request_timer.function = request_timer_fn;
3213 mdev->request_timer.data = (unsigned long) mdev;
3215 init_waitqueue_head(&mdev->misc_wait);
3216 init_waitqueue_head(&mdev->state_wait);
3217 init_waitqueue_head(&mdev->net_cnt_wait);
3218 init_waitqueue_head(&mdev->ee_wait);
3219 init_waitqueue_head(&mdev->al_wait);
3220 init_waitqueue_head(&mdev->seq_wait);
3222 drbd_thread_init(mdev, &mdev->receiver, drbdd_init);
3223 drbd_thread_init(mdev, &mdev->worker, drbd_worker);
3224 drbd_thread_init(mdev, &mdev->asender, drbd_asender);
3226 mdev->agreed_pro_version = PRO_VERSION_MAX;
3227 mdev->write_ordering = WO_bdev_flush;
3228 mdev->resync_wenr = LC_FREE;
3229 mdev->peer_max_bio_size = DRBD_MAX_BIO_SIZE_SAFE;
3230 mdev->local_max_bio_size = DRBD_MAX_BIO_SIZE_SAFE;
3233 void drbd_mdev_cleanup(struct drbd_conf *mdev)
3236 if (mdev->receiver.t_state != None)
3237 dev_err(DEV, "ASSERT FAILED: receiver t_state == %d expected 0.\n",
3238 mdev->receiver.t_state);
3240 /* no need to lock it, I'm the only thread alive */
3241 if (atomic_read(&mdev->current_epoch->epoch_size) != 0)
3242 dev_err(DEV, "epoch_size:%d\n", atomic_read(&mdev->current_epoch->epoch_size));
3252 mdev->rs_failed = 0;
3253 mdev->rs_last_events = 0;
3254 mdev->rs_last_sect_ev = 0;
3255 for (i = 0; i < DRBD_SYNC_MARKS; i++) {
3256 mdev->rs_mark_left[i] = 0;
3257 mdev->rs_mark_time[i] = 0;
3259 D_ASSERT(mdev->net_conf == NULL);
3261 drbd_set_my_capacity(mdev, 0);
3263 /* maybe never allocated. */
3264 drbd_bm_resize(mdev, 0, 1);
3265 drbd_bm_cleanup(mdev);
3268 drbd_free_resources(mdev);
3269 drbd_clear_flag(mdev, AL_SUSPENDED);
3272 * currently we drbd_init_ee only on module load, so
3273 * we may do drbd_release_ee only on module unload!
3275 D_ASSERT(list_empty(&mdev->active_ee));
3276 D_ASSERT(list_empty(&mdev->sync_ee));
3277 D_ASSERT(list_empty(&mdev->done_ee));
3278 D_ASSERT(list_empty(&mdev->read_ee));
3279 D_ASSERT(list_empty(&mdev->net_ee));
3280 D_ASSERT(list_empty(&mdev->resync_reads));
3281 D_ASSERT(list_empty(&mdev->data.work.q));
3282 D_ASSERT(list_empty(&mdev->meta.work.q));
3283 D_ASSERT(list_empty(&mdev->resync_work.list));
3284 D_ASSERT(list_empty(&mdev->unplug_work.list));
3285 D_ASSERT(list_empty(&mdev->go_diskless.list));
3287 drbd_set_defaults(mdev);
3291 static void drbd_destroy_mempools(void)
3295 while (drbd_pp_pool) {
3296 page = drbd_pp_pool;
3297 drbd_pp_pool = (struct page *)page_private(page);
3302 /* D_ASSERT(atomic_read(&drbd_pp_vacant)==0); */
3304 if (drbd_md_io_bio_set)
3305 bioset_free(drbd_md_io_bio_set);
3306 if (drbd_md_io_page_pool)
3307 mempool_destroy(drbd_md_io_page_pool);
3308 if (drbd_ee_mempool)
3309 mempool_destroy(drbd_ee_mempool);
3310 if (drbd_request_mempool)
3311 mempool_destroy(drbd_request_mempool);
3313 kmem_cache_destroy(drbd_ee_cache);
3314 if (drbd_request_cache)
3315 kmem_cache_destroy(drbd_request_cache);
3316 if (drbd_bm_ext_cache)
3317 kmem_cache_destroy(drbd_bm_ext_cache);
3318 if (drbd_al_ext_cache)
3319 kmem_cache_destroy(drbd_al_ext_cache);
3321 drbd_md_io_bio_set = NULL;
3322 drbd_md_io_page_pool = NULL;
3323 drbd_ee_mempool = NULL;
3324 drbd_request_mempool = NULL;
3325 drbd_ee_cache = NULL;
3326 drbd_request_cache = NULL;
3327 drbd_bm_ext_cache = NULL;
3328 drbd_al_ext_cache = NULL;
3333 static int drbd_create_mempools(void)
3336 const int number = (DRBD_MAX_BIO_SIZE/PAGE_SIZE) * minor_count;
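/* i.e. enough pages to hold one maximum-sized BIO per configured minor;
 * the same count sizes the request and EE mempools and the drbd page pool
 * allocated below. */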
3339 /* prepare our caches and mempools */
3340 drbd_request_mempool = NULL;
3341 drbd_ee_cache = NULL;
3342 drbd_request_cache = NULL;
3343 drbd_bm_ext_cache = NULL;
3344 drbd_al_ext_cache = NULL;
3345 drbd_pp_pool = NULL;
3346 drbd_md_io_page_pool = NULL;
3347 drbd_md_io_bio_set = NULL;
3350 drbd_request_cache = kmem_cache_create(
3351 "drbd_req", sizeof(struct drbd_request), 0, 0, NULL);
3352 if (drbd_request_cache == NULL)
3355 drbd_ee_cache = kmem_cache_create(
3356 "drbd_ee", sizeof(struct drbd_epoch_entry), 0, 0, NULL);
3357 if (drbd_ee_cache == NULL)
3360 drbd_bm_ext_cache = kmem_cache_create(
3361 "drbd_bm", sizeof(struct bm_extent), 0, 0, NULL);
3362 if (drbd_bm_ext_cache == NULL)
3365 drbd_al_ext_cache = kmem_cache_create(
3366 "drbd_al", sizeof(struct lc_element), 0, 0, NULL);
3367 if (drbd_al_ext_cache == NULL)
3371 #ifdef COMPAT_HAVE_BIOSET_CREATE
3372 drbd_md_io_bio_set = bioset_create(DRBD_MIN_POOL_PAGES, 0);
3373 if (drbd_md_io_bio_set == NULL)
3377 drbd_md_io_page_pool = mempool_create_page_pool(DRBD_MIN_POOL_PAGES, 0);
3378 if (drbd_md_io_page_pool == NULL)
3381 drbd_request_mempool = mempool_create(number,
3382 mempool_alloc_slab, mempool_free_slab, drbd_request_cache);
3383 if (drbd_request_mempool == NULL)
3386 drbd_ee_mempool = mempool_create(number,
3387 mempool_alloc_slab, mempool_free_slab, drbd_ee_cache);
3388 if (drbd_ee_mempool == NULL)
3391 /* drbd's page pool */
3392 spin_lock_init(&drbd_pp_lock);
3394 for (i = 0; i < number; i++) {
3395 page = alloc_page(GFP_HIGHUSER);
3398 set_page_private(page, (unsigned long)drbd_pp_pool);
3399 drbd_pp_pool = page;
3401 drbd_pp_vacant = number;
3406 drbd_destroy_mempools(); /* in case we allocated some */
3410 static int drbd_notify_sys(struct notifier_block *this, unsigned long code,
3413 /* just so we have it. you never know what interesting things we
3414 * might want to do here some day...
3420 static struct notifier_block drbd_notifier = {
3421 .notifier_call = drbd_notify_sys,
3424 static void drbd_release_ee_lists(struct drbd_conf *mdev)
3428 rr = drbd_release_ee(mdev, &mdev->active_ee);
3430 dev_err(DEV, "%d EEs in active list found!\n", rr);
3432 rr = drbd_release_ee(mdev, &mdev->sync_ee);
3434 dev_err(DEV, "%d EEs in sync list found!\n", rr);
3436 rr = drbd_release_ee(mdev, &mdev->read_ee);
3438 dev_err(DEV, "%d EEs in read list found!\n", rr);
3440 rr = drbd_release_ee(mdev, &mdev->done_ee);
3442 dev_err(DEV, "%d EEs in done list found!\n", rr);
3444 rr = drbd_release_ee(mdev, &mdev->net_ee);
3446 dev_err(DEV, "%d EEs in net list found!\n", rr);
3449 /* caution. no locking.
3450 * currently only used from module cleanup code. */
3451 static void drbd_delete_device(unsigned int minor)
3453 struct drbd_conf *mdev = minor_to_mdev(minor);
3458 del_timer_sync(&mdev->request_timer);
3460 /* paranoia asserts */
3461 if (mdev->open_cnt != 0)
3462 dev_err(DEV, "open_cnt = %d in %s:%u", mdev->open_cnt,
3463 __FILE__ , __LINE__);
3465 ERR_IF (!list_empty(&mdev->data.work.q)) {
3466 struct list_head *lp;
3467 list_for_each(lp, &mdev->data.work.q) {
3468 dev_err(DEV, "lp = %p\n", lp);
3471 /* end paranoia asserts */
3473 del_gendisk(mdev->vdisk);
3475 /* cleanup stuff that may have been allocated during
3476 * device (re-)configuration or state changes */
3478 if (mdev->this_bdev)
3479 bdput(mdev->this_bdev);
3481 drbd_free_resources(mdev);
3483 drbd_release_ee_lists(mdev);
3485 /* should be freed on disconnect? */
3486 kfree(mdev->ee_hash);
3488 mdev->ee_hash_s = 0;
3489 mdev->ee_hash = NULL;
3492 lc_destroy(mdev->act_log);
3493 lc_destroy(mdev->resync);
3495 kfree(mdev->p_uuid);
3496 /* mdev->p_uuid = NULL; */
3498 kfree(mdev->int_dig_out);
3499 kfree(mdev->int_dig_in);
3500 kfree(mdev->int_dig_vv);
3502 /* cleanup the rest that has been
3503 * allocated from drbd_new_device
3504 * and actually free the mdev itself */
3505 drbd_free_mdev(mdev);
3508 static void drbd_cleanup(void)
3512 unregister_reboot_notifier(&drbd_notifier);
3514 /* first remove proc,
3515 * drbdsetup uses its presence to detect
3516 * whether DRBD is loaded.
3517 * If we got stuck in proc removal,
3518 * but had netlink already deregistered,
3519 * some drbdsetup commands might wait forever
3523 remove_proc_entry("drbd", NULL);
3530 drbd_delete_device(i);
3531 drbd_destroy_mempools();
3536 unregister_blkdev(DRBD_MAJOR, "drbd");
3538 printk(KERN_INFO "drbd: module cleanup done.\n");
3542 * drbd_congested() - Callback for the flusher thread
3543 * @congested_data: User data
3544 * @bdi_bits: Bits the BDI flusher thread is currently interested in
3546 * Returns 1<<BDI_async_congested and/or 1<<BDI_sync_congested if we are congested.
3548 static int drbd_congested(void *congested_data, int bdi_bits)
3550 struct drbd_conf *mdev = congested_data;
3551 struct request_queue *q;
3555 if (!may_inc_ap_bio(mdev)) {
3556 /* DRBD has frozen IO */
3562 if (drbd_test_flag(mdev, CALLBACK_PENDING)) {
3563 r |= (1 << BDI_async_congested);
3564 /* Without good local data, we would need to read from remote,
3565 * and that would need the worker thread as well, which is
3566 * currently blocked waiting for that usermode helper to
3569 if (!get_ldev_if_state(mdev, D_UP_TO_DATE))
3570 r |= (1 << BDI_sync_congested);
3578 if (get_ldev(mdev)) {
3579 q = bdev_get_queue(mdev->ldev->backing_bdev);
3580 r = bdi_congested(&q->backing_dev_info, bdi_bits);
3586 if (bdi_bits & (1 << BDI_async_congested) && drbd_test_flag(mdev, NET_CONGESTED)) {
3587 r |= (1 << BDI_async_congested);
3588 reason = reason == 'b' ? 'a' : 'n';
3592 mdev->congestion_reason = reason;
3596 struct drbd_conf *drbd_new_device(unsigned int minor)
3598 struct drbd_conf *mdev;
3599 struct gendisk *disk;
3600 struct request_queue *q;
3602 /* GFP_KERNEL, we are outside of all write-out paths */
3603 mdev = kzalloc(sizeof(struct drbd_conf), GFP_KERNEL);
3606 if (!zalloc_cpumask_var(&mdev->cpu_mask, GFP_KERNEL))
3607 goto out_no_cpumask;
3609 mdev->minor = minor;
3611 drbd_init_set_defaults(mdev);
3613 q = blk_alloc_queue(GFP_KERNEL);
3617 q->queuedata = mdev;
3619 disk = alloc_disk(1);
3624 set_disk_ro(disk, true);
3627 disk->major = DRBD_MAJOR;
3628 disk->first_minor = minor;
3629 disk->fops = &drbd_ops;
3630 sprintf(disk->disk_name, "drbd%d", minor);
3631 disk->private_data = mdev;
3633 mdev->this_bdev = bdget(MKDEV(DRBD_MAJOR, minor));
3634 /* we have no partitions. we contain only ourselves. */
3635 mdev->this_bdev->bd_contains = mdev->this_bdev;
3637 q->backing_dev_info.congested_fn = drbd_congested;
3638 q->backing_dev_info.congested_data = mdev;
3640 blk_queue_make_request(q, drbd_make_request);
3641 blk_queue_flush(q, REQ_FLUSH | REQ_FUA);
3642 /* Setting max_hw_sectors to an odd value of 8 KiB here;
3643 this triggers a max_bio_size message upon first attach or connect */
3644 blk_queue_max_hw_sectors(q, DRBD_MAX_BIO_SIZE_SAFE >> 8);
3645 blk_queue_bounce_limit(q, BLK_BOUNCE_ANY);
3646 blk_queue_merge_bvec(q, drbd_merge_bvec);
3647 q->queue_lock = &mdev->req_lock;
3649 mdev->md_io_page = alloc_page(GFP_KERNEL);
3650 if (!mdev->md_io_page)
3651 goto out_no_io_page;
3653 if (drbd_bm_init(mdev))
3655 /* no need to lock access, we are still initializing this minor device. */
3659 mdev->app_reads_hash = kzalloc(APP_R_HSIZE*sizeof(void *), GFP_KERNEL);
3660 if (!mdev->app_reads_hash)
3661 goto out_no_app_reads;
3663 mdev->current_epoch = kzalloc(sizeof(struct drbd_epoch), GFP_KERNEL);
3664 if (!mdev->current_epoch)
3667 INIT_LIST_HEAD(&mdev->current_epoch->list);
3672 /* out_whatever_else:
3673 kfree(mdev->current_epoch); */
3675 kfree(mdev->app_reads_hash);
3679 drbd_bm_cleanup(mdev);
3681 __free_page(mdev->md_io_page);
3685 blk_cleanup_queue(q);
3687 free_cpumask_var(mdev->cpu_mask);
3693 /* counterpart of drbd_new_device.
3694 * last part of drbd_delete_device. */
3695 void drbd_free_mdev(struct drbd_conf *mdev)
3697 kfree(mdev->current_epoch);
3698 kfree(mdev->app_reads_hash);
3700 if (mdev->bitmap) /* should no longer be there. */
3701 drbd_bm_cleanup(mdev);
3702 __free_page(mdev->md_io_page);
3703 put_disk(mdev->vdisk);
3704 blk_cleanup_queue(mdev->rq_queue);
3705 free_cpumask_var(mdev->cpu_mask);
3706 drbd_free_tl_hash(mdev);
3711 int __init drbd_init(void)
3715 if (sizeof(struct p_handshake) != 80) {
3717 "drbd: never change the size or layout "
3718 "of the HandShake packet.\n");
3722 if (minor_count < DRBD_MINOR_COUNT_MIN || minor_count > DRBD_MINOR_COUNT_MAX) {
3724 "drbd: invalid minor_count (%d)\n", minor_count);
3732 err = drbd_nl_init();
3736 err = register_blkdev(DRBD_MAJOR, "drbd");
3739 "drbd: unable to register block device major %d\n",
3744 register_reboot_notifier(&drbd_notifier);
3747 * allocate all necessary structs
3751 init_waitqueue_head(&drbd_pp_wait);
3753 drbd_proc = NULL; /* play safe for drbd_cleanup */
3754 minor_table = kzalloc(sizeof(struct drbd_conf *)*minor_count,
3759 err = drbd_create_mempools();
3763 drbd_proc = proc_create_data("drbd", S_IFREG | S_IRUGO , NULL, &drbd_proc_fops, NULL);
3765 printk(KERN_ERR "drbd: unable to register proc file\n");
3769 rwlock_init(&global_state_lock);
3771 printk(KERN_INFO "drbd: initialized. "
3772 "Version: " REL_VERSION " (api:%d/proto:%d-%d)\n",
3773 API_VERSION, PRO_VERSION_MIN, PRO_VERSION_MAX);
3774 printk(KERN_INFO "drbd: %s\n", drbd_buildtag());
3775 printk(KERN_INFO "drbd: registered as block device major %d\n",
3777 printk(KERN_INFO "drbd: minor_table @ 0x%p\n", minor_table);
3779 return 0; /* Success! */
3784 /* currently always the case */
3785 printk(KERN_ERR "drbd: ran out of memory\n");
3787 printk(KERN_ERR "drbd: initialization failure\n");
3791 void drbd_free_bc(struct drbd_backing_dev *ldev)
3796 blkdev_put(ldev->backing_bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
3797 blkdev_put(ldev->md_bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
3802 void drbd_free_sock(struct drbd_conf *mdev)
3804 if (mdev->data.socket) {
3805 mutex_lock(&mdev->data.mutex);
3806 kernel_sock_shutdown(mdev->data.socket, SHUT_RDWR);
3807 sock_release(mdev->data.socket);
3808 mdev->data.socket = NULL;
3809 mutex_unlock(&mdev->data.mutex);
3811 if (mdev->meta.socket) {
3812 mutex_lock(&mdev->meta.mutex);
3813 kernel_sock_shutdown(mdev->meta.socket, SHUT_RDWR);
3814 sock_release(mdev->meta.socket);
3815 mdev->meta.socket = NULL;
3816 mutex_unlock(&mdev->meta.mutex);
3821 void drbd_free_resources(struct drbd_conf *mdev)
3823 crypto_free_hash(mdev->csums_tfm);
3824 mdev->csums_tfm = NULL;
3825 crypto_free_hash(mdev->verify_tfm);
3826 mdev->verify_tfm = NULL;
3827 crypto_free_hash(mdev->cram_hmac_tfm);
3828 mdev->cram_hmac_tfm = NULL;
3829 crypto_free_hash(mdev->integrity_w_tfm);
3830 mdev->integrity_w_tfm = NULL;
3831 crypto_free_hash(mdev->integrity_r_tfm);
3832 mdev->integrity_r_tfm = NULL;
3834 drbd_free_sock(mdev);
3837 drbd_free_bc(mdev->ldev);
3838 mdev->ldev = NULL;
3841 /* meta data management */
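/* On-disk layout of the meta data super block. All multi-byte fields are
 * stored big-endian; drbd_md_sync() below converts with cpu_to_be*() on
 * write, and drbd_md_read() converts back and rejects the block if magic,
 * offsets, size or bm_bytes_per_bit do not match what the attaching device
 * expects. */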
3843 struct meta_data_on_disk {
3844 u64 la_size; /* last agreed size. */
3845 u64 uuid[UI_SIZE]; /* UUIDs. */
3848 u32 flags; /* MDF */
3851 u32 al_offset; /* offset to this block */
3852 u32 al_nr_extents; /* important for restoring the AL */
3853 /* `-- act_log->nr_elements <-- sync_conf.al_extents */
3854 u32 bm_offset; /* offset to the bitmap, from here */
3855 u32 bm_bytes_per_bit; /* BM_BLOCK_SIZE */
3856 u32 la_peer_max_bio_size; /* last peer max_bio_size */
3857 u32 reserved_u32[3];
3862 * drbd_md_sync() - Writes the meta data super block if the MD_DIRTY flag bit is set
3863 * @mdev: DRBD device.
3865 void drbd_md_sync(struct drbd_conf *mdev)
3867 struct meta_data_on_disk *buffer;
3871 del_timer(&mdev->md_sync_timer);
3872 /* timer may be rearmed by drbd_md_mark_dirty() now. */
3873 if (!drbd_test_and_clear_flag(mdev, MD_DIRTY))
3876 /* We use here D_FAILED and not D_ATTACHING because we try to write
3877 * metadata even if we detach due to a disk failure! */
3878 if (!get_ldev_if_state(mdev, D_FAILED))
3881 buffer = drbd_md_get_buffer(mdev);
3885 memset(buffer, 0, 512);
3887 buffer->la_size = cpu_to_be64(drbd_get_capacity(mdev->this_bdev));
3888 for (i = UI_CURRENT; i < UI_SIZE; i++)
3889 buffer->uuid[i] = cpu_to_be64(mdev->ldev->md.uuid[i]);
3890 buffer->flags = cpu_to_be32(mdev->ldev->md.flags);
3891 buffer->magic = cpu_to_be32(DRBD_MD_MAGIC);
3893 buffer->md_size_sect = cpu_to_be32(mdev->ldev->md.md_size_sect);
3894 buffer->al_offset = cpu_to_be32(mdev->ldev->md.al_offset);
3895 buffer->al_nr_extents = cpu_to_be32(mdev->act_log->nr_elements);
3896 buffer->bm_bytes_per_bit = cpu_to_be32(BM_BLOCK_SIZE);
3897 buffer->device_uuid = cpu_to_be64(mdev->ldev->md.device_uuid);
3899 buffer->bm_offset = cpu_to_be32(mdev->ldev->md.bm_offset);
3900 buffer->la_peer_max_bio_size = cpu_to_be32(mdev->peer_max_bio_size);
3902 D_ASSERT(drbd_md_ss__(mdev, mdev->ldev) == mdev->ldev->md.md_offset);
3903 sector = mdev->ldev->md.md_offset;
3905 if (!drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE)) {
3906 /* this was a try anyways ... */
3907 dev_err(DEV, "meta data update failed!\n");
3908 drbd_chk_io_error(mdev, 1, DRBD_META_IO_ERROR);
3911 /* Update mdev->ldev->md.la_size_sect,
3912 * since we updated it on metadata. */
3913 mdev->ldev->md.la_size_sect = drbd_get_capacity(mdev->this_bdev);
3915 drbd_md_put_buffer(mdev);
3921 * drbd_md_read() - Reads in the meta data super block
3922 * @mdev: DRBD device.
3923 * @bdev: Device from which the meta data should be read in.
3925 * Return 0 (NO_ERROR) on success, and an enum drbd_ret_code in case
3926 * something goes wrong. Currently only: ERR_IO_MD_DISK, ERR_MD_INVALID.
3928 int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)
3930 struct meta_data_on_disk *buffer;
3931 int i, rv = NO_ERROR;
3933 if (!get_ldev_if_state(mdev, D_ATTACHING))
3934 return ERR_IO_MD_DISK;
3936 buffer = drbd_md_get_buffer(mdev);
3940 if (!drbd_md_sync_page_io(mdev, bdev, bdev->md.md_offset, READ)) {
3941 /* NOTE: can't do normal error processing here as this is
3942 called BEFORE disk is attached */
3943 dev_err(DEV, "Error while reading metadata.\n");
3944 rv = ERR_IO_MD_DISK;
3948 if (be32_to_cpu(buffer->magic) != DRBD_MD_MAGIC) {
3949 dev_err(DEV, "Error while reading metadata, magic not found.\n");
3950 rv = ERR_MD_INVALID;
3953 if (be32_to_cpu(buffer->al_offset) != bdev->md.al_offset) {
3954 dev_err(DEV, "unexpected al_offset: %d (expected %d)\n",
3955 be32_to_cpu(buffer->al_offset), bdev->md.al_offset);
3956 rv = ERR_MD_INVALID;
3959 if (be32_to_cpu(buffer->bm_offset) != bdev->md.bm_offset) {
3960 dev_err(DEV, "unexpected bm_offset: %d (expected %d)\n",
3961 be32_to_cpu(buffer->bm_offset), bdev->md.bm_offset);
3962 rv = ERR_MD_INVALID;
3965 if (be32_to_cpu(buffer->md_size_sect) != bdev->md.md_size_sect) {
3966 dev_err(DEV, "unexpected md_size: %u (expected %u)\n",
3967 be32_to_cpu(buffer->md_size_sect), bdev->md.md_size_sect);
3968 rv = ERR_MD_INVALID;
3972 if (be32_to_cpu(buffer->bm_bytes_per_bit) != BM_BLOCK_SIZE) {
3973 dev_err(DEV, "unexpected bm_bytes_per_bit: %u (expected %u)\n",
3974 be32_to_cpu(buffer->bm_bytes_per_bit), BM_BLOCK_SIZE);
3975 rv = ERR_MD_INVALID;
3979 bdev->md.la_size_sect = be64_to_cpu(buffer->la_size);
3980 for (i = UI_CURRENT; i < UI_SIZE; i++)
3981 bdev->md.uuid[i] = be64_to_cpu(buffer->uuid[i]);
3982 bdev->md.flags = be32_to_cpu(buffer->flags);
3983 mdev->sync_conf.al_extents = be32_to_cpu(buffer->al_nr_extents);
3984 bdev->md.device_uuid = be64_to_cpu(buffer->device_uuid);
3986 spin_lock_irq(&mdev->req_lock);
3987 if (mdev->state.conn < C_CONNECTED) {
3989 peer = be32_to_cpu(buffer->la_peer_max_bio_size);
3990 peer = max(peer, DRBD_MAX_BIO_SIZE_SAFE);
3991 mdev->peer_max_bio_size = peer;
3993 spin_unlock_irq(&mdev->req_lock);
3995 if (mdev->sync_conf.al_extents < 7)
3996 mdev->sync_conf.al_extents = 127;
3999 drbd_md_put_buffer(mdev);
4007 * drbd_md_mark_dirty() - Mark meta data super block as dirty
4008 * @mdev: DRBD device.
4010 * Call this function if you change anything that should be written to
4011 * the meta-data super block. This function sets MD_DIRTY, and starts a
4012 * timer that ensures drbd_md_sync() gets called within five seconds.
4015 void drbd_md_mark_dirty_(struct drbd_conf *mdev, unsigned int line, const char *func)
4017 if (!drbd_test_and_set_flag(mdev, MD_DIRTY)) {
4018 mod_timer(&mdev->md_sync_timer, jiffies + HZ);
4019 mdev->last_md_mark_dirty.line = line;
4020 mdev->last_md_mark_dirty.func = func;
4024 void drbd_md_mark_dirty(struct drbd_conf *mdev)
4026 if (!drbd_test_and_set_flag(mdev, MD_DIRTY))
4027 mod_timer(&mdev->md_sync_timer, jiffies + 5*HZ);
4031 void drbd_uuid_move_history(struct drbd_conf *mdev) __must_hold(local)
4035 for (i = UI_HISTORY_START; i < UI_HISTORY_END; i++)
4036 mdev->ldev->md.uuid[i+1] = mdev->ldev->md.uuid[i];
4039 void __drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local)
4041 if (idx == UI_CURRENT) {
4042 if (mdev->state.role == R_PRIMARY)
4047 drbd_set_ed_uuid(mdev, val);
4050 mdev->ldev->md.uuid[idx] = val;
4051 drbd_md_mark_dirty(mdev);
4054 void _drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local)
4056 unsigned long flags;
4057 spin_lock_irqsave(&mdev->ldev->md.uuid_lock, flags);
4058 __drbd_uuid_set(mdev, idx, val);
4059 spin_unlock_irqrestore(&mdev->ldev->md.uuid_lock, flags);
4062 void drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local)
4064 unsigned long flags;
4065 spin_lock_irqsave(&mdev->ldev->md.uuid_lock, flags);
4066 if (mdev->ldev->md.uuid[idx]) {
4067 drbd_uuid_move_history(mdev);
4068 mdev->ldev->md.uuid[UI_HISTORY_START] = mdev->ldev->md.uuid[idx];
4070 __drbd_uuid_set(mdev, idx, val);
4071 spin_unlock_irqrestore(&mdev->ldev->md.uuid_lock, flags);
4075 * drbd_uuid_new_current() - Creates a new current UUID
4076 * @mdev: DRBD device.
4078 * Creates a new current UUID, and rotates the old current UUID into
4079 * the bitmap slot. Causes an incremental resync upon next connect.
4081 void drbd_uuid_new_current(struct drbd_conf *mdev) __must_hold(local)
4084 unsigned long long bm_uuid;
4086 get_random_bytes(&val, sizeof(u64));
4088 spin_lock_irq(&mdev->ldev->md.uuid_lock);
4089 bm_uuid = mdev->ldev->md.uuid[UI_BITMAP];
4092 dev_warn(DEV, "bm UUID was already set: %llX\n", bm_uuid);
4094 mdev->ldev->md.uuid[UI_BITMAP] = mdev->ldev->md.uuid[UI_CURRENT];
4095 __drbd_uuid_set(mdev, UI_CURRENT, val);
4096 spin_unlock_irq(&mdev->ldev->md.uuid_lock);
4098 drbd_print_uuids(mdev, "new current UUID");
4099 /* get it to stable storage _now_ */
4103 void drbd_uuid_set_bm(struct drbd_conf *mdev, u64 val) __must_hold(local)
4105 unsigned long flags;
4106 if (mdev->ldev->md.uuid[UI_BITMAP] == 0 && val == 0)
4109 spin_lock_irqsave(&mdev->ldev->md.uuid_lock, flags);
4111 drbd_uuid_move_history(mdev);
4112 mdev->ldev->md.uuid[UI_HISTORY_START] = mdev->ldev->md.uuid[UI_BITMAP];
4113 mdev->ldev->md.uuid[UI_BITMAP] = 0;
4115 unsigned long long bm_uuid = mdev->ldev->md.uuid[UI_BITMAP];
4117 dev_warn(DEV, "bm UUID was already set: %llX\n", bm_uuid);
4119 mdev->ldev->md.uuid[UI_BITMAP] = val & ~((u64)1);
4121 spin_unlock_irqrestore(&mdev->ldev->md.uuid_lock, flags);
4123 drbd_md_mark_dirty(mdev);
4127 * drbd_bmio_set_n_write() - io_fn for drbd_queue_bitmap_io() or drbd_bitmap_io()
4128 * @mdev: DRBD device.
4130 * Sets all bits in the bitmap and writes the whole bitmap to stable storage.
4132 int drbd_bmio_set_n_write(struct drbd_conf *mdev)
4136 if (get_ldev_if_state(mdev, D_ATTACHING)) {
4137 drbd_md_set_flag(mdev, MDF_FULL_SYNC);
4139 drbd_bm_set_all(mdev);
4141 rv = drbd_bm_write(mdev);
4144 drbd_md_clear_flag(mdev, MDF_FULL_SYNC);
4155 * drbd_bmio_clear_n_write() - io_fn for drbd_queue_bitmap_io() or drbd_bitmap_io()
4156 * @mdev: DRBD device.
4158 * Clears all bits in the bitmap and writes the whole bitmap to stable storage.
4160 int drbd_bmio_clear_n_write(struct drbd_conf *mdev)
4164 drbd_resume_al(mdev);
4165 if (get_ldev_if_state(mdev, D_ATTACHING)) {
4166 drbd_bm_clear_all(mdev);
4167 rv = drbd_bm_write(mdev);
4174 static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused)
4176 struct bm_io_work *work = container_of(w, struct bm_io_work, w);
4179 D_ASSERT(atomic_read(&mdev->ap_bio_cnt) == 0);
4181 if (get_ldev(mdev)) {
4182 drbd_bm_lock(mdev, work->why, work->flags);
4183 rv = work->io_fn(mdev);
4184 drbd_bm_unlock(mdev);
4188 drbd_clear_flag(mdev, BITMAP_IO);
4189 smp_mb__after_clear_bit();
4190 wake_up(&mdev->misc_wait);
4193 work->done(mdev, rv);
4195 drbd_clear_flag(mdev, BITMAP_IO_QUEUED);
4202 void drbd_ldev_destroy(struct drbd_conf *mdev)
4204 lc_destroy(mdev->resync);
4205 mdev->resync = NULL;
4206 lc_destroy(mdev->act_log);
4207 mdev->act_log = NULL;
4209 drbd_free_bc(mdev->ldev);
4210 mdev->ldev = NULL;
4212 if (mdev->md_io_tmpp) {
4213 __free_page(mdev->md_io_tmpp);
4214 mdev->md_io_tmpp = NULL;
4216 drbd_clear_flag(mdev, GO_DISKLESS);
4219 static int w_go_diskless(struct drbd_conf *mdev, struct drbd_work *w, int unused)
4221 D_ASSERT(mdev->state.disk == D_FAILED);
4222 /* we cannot assert local_cnt == 0 here, as get_ldev_if_state will
4223 * inc/dec it frequently. Once we are D_DISKLESS, no one will touch
4224 * the protected members anymore, though, so once put_ldev reaches zero
4225 * again, it will be safe to free them. */
4227 /* Try to write changed bitmap pages, read errors may have just
4228 * set some bits outside the area covered by the activity log.
4230 * If we have an IO error during the bitmap writeout,
4231 * we will want a full sync next time, just in case.
4232 * (Do we want a specific meta data flag for this?)
4234 * If that does not make it to stable storage either,
4235 * we cannot do anything about that anymore. */
4237 if (drbd_bitmap_io_from_worker(mdev, drbd_bm_write,
4238 "detach", BM_LOCKED_MASK)) {
4239 if (drbd_test_flag(mdev, WAS_READ_ERROR)) {
4240 drbd_md_set_flag(mdev, MDF_FULL_SYNC);
4246 drbd_force_state(mdev, NS(disk, D_DISKLESS));
4250 void drbd_go_diskless(struct drbd_conf *mdev)
4252 D_ASSERT(mdev->state.disk == D_FAILED);
4253 if (!drbd_test_and_set_flag(mdev, GO_DISKLESS))
4254 drbd_queue_work(&mdev->data.work, &mdev->go_diskless);
/**
 * drbd_queue_bitmap_io() - Queues an IO operation on the whole bitmap
 * @mdev:	DRBD device.
 * @io_fn:	IO callback to be called when bitmap IO is possible
 * @done:	callback to be called after the bitmap IO was performed
 * @why:	Descriptive text of the reason for doing the IO
 *
 * While IO on the bitmap happens we freeze application IO, thus we ensure
 * that drbd_set_out_of_sync() can not be called. This function MAY ONLY be
 * called from worker context. It MUST NOT be used while a previous such
 * work is still pending!
 */
void drbd_queue_bitmap_io(struct drbd_conf *mdev,
			  int (*io_fn)(struct drbd_conf *),
			  void (*done)(struct drbd_conf *, int),
			  char *why, enum bm_flag flags)
{
	D_ASSERT(current == mdev->worker.task);

	D_ASSERT(!drbd_test_flag(mdev, BITMAP_IO_QUEUED));
	D_ASSERT(!drbd_test_flag(mdev, BITMAP_IO));
	D_ASSERT(list_empty(&mdev->bm_io_work.w.list));
	if (mdev->bm_io_work.why)
		dev_err(DEV, "FIXME going to queue '%s' but '%s' still pending?\n",
			why, mdev->bm_io_work.why);

	mdev->bm_io_work.io_fn = io_fn;
	mdev->bm_io_work.done = done;
	mdev->bm_io_work.why = why;
	mdev->bm_io_work.flags = flags;

	spin_lock_irq(&mdev->req_lock);
	drbd_set_flag(mdev, BITMAP_IO);
	if (atomic_read(&mdev->ap_bio_cnt) == 0) {
		if (!drbd_test_and_set_flag(mdev, BITMAP_IO_QUEUED))
			drbd_queue_work(&mdev->data.work, &mdev->bm_io_work.w);
	}
	spin_unlock_irq(&mdev->req_lock);
}
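/*
 * Illustrative sketch, not part of the original source: a worker-context
 * caller that wants the whole bitmap set and written out, with a
 * completion callback invoked afterwards, would use this function roughly
 * as follows (the callback name is hypothetical):
 *
 *	static void example_bm_io_done(struct drbd_conf *mdev, int rv)
 *	{
 *		if (rv)
 *			dev_err(DEV, "bitmap IO failed: %d\n", rv);
 *	}
 *
 *	drbd_queue_bitmap_io(mdev, &drbd_bmio_set_n_write, example_bm_io_done,
 *			     "example full sync", BM_LOCKED_MASK);
 *
 * The actual IO is then performed by w_bitmap_io() once ap_bio_cnt has
 * drained to zero.
 */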
/**
 * drbd_bitmap_io() - Does an IO operation on the whole bitmap
 * @mdev:	DRBD device.
 * @io_fn:	IO callback to be called when bitmap IO is possible
 * @why:	Descriptive text of the reason for doing the IO
 *
 * Freezes application IO while the actual IO operation runs. This
 * function MAY NOT be called from worker context.
 */
int drbd_bitmap_io(struct drbd_conf *mdev, int (*io_fn)(struct drbd_conf *),
		   char *why, enum bm_flag flags)
{
	int rv;

	D_ASSERT(current != mdev->worker.task);

	if ((flags & BM_LOCKED_SET_ALLOWED) == 0)
		drbd_suspend_io(mdev);

	drbd_bm_lock(mdev, why, flags);
	rv = io_fn(mdev);
	drbd_bm_unlock(mdev);

	if ((flags & BM_LOCKED_SET_ALLOWED) == 0)
		drbd_resume_io(mdev);

	return rv;
}
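/*
 * Illustrative sketch, not part of the original source: unlike
 * drbd_queue_bitmap_io(), this variant runs io_fn synchronously in the
 * caller's context and returns its result, so a (hypothetical) caller
 * outside the worker could do:
 *
 *	if (drbd_bitmap_io(mdev, &drbd_bmio_set_n_write,
 *			   "example set_n_write", BM_LOCKED_MASK))
 *		dev_err(DEV, "writing the whole bitmap failed\n");
 */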
void drbd_md_set_flag(struct drbd_conf *mdev, int flag) __must_hold(local)
{
	if ((mdev->ldev->md.flags & flag) != flag) {
		drbd_md_mark_dirty(mdev);
		mdev->ldev->md.flags |= flag;
	}
}

void drbd_md_clear_flag(struct drbd_conf *mdev, int flag) __must_hold(local)
{
	if ((mdev->ldev->md.flags & flag) != 0) {
		drbd_md_mark_dirty(mdev);
		mdev->ldev->md.flags &= ~flag;
	}
}

int drbd_md_test_flag(struct drbd_backing_dev *bdev, int flag)
{
	return (bdev->md.flags & flag) != 0;
}
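/*
 * Illustrative note, not from the original source: these helpers only
 * toggle bits in the in-core copy of the on-disk meta data and mark it
 * dirty; the flags reach stable storage when drbd_md_sync() writes the
 * meta data block.  A typical read-side check looks like:
 *
 *	if (drbd_md_test_flag(mdev->ldev, MDF_FULL_SYNC))
 *		... a full sync was requested before the last restart ...
 */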
static void md_sync_timer_fn(unsigned long data)
{
	struct drbd_conf *mdev = (struct drbd_conf *) data;

	drbd_queue_work_front(&mdev->data.work, &mdev->md_sync_work);
}

static int w_md_sync(struct drbd_conf *mdev, struct drbd_work *w, int unused)
{
	dev_warn(DEV, "md_sync_timer expired! Worker calls drbd_md_sync().\n");
#ifdef DEBUG
	dev_warn(DEV, "last md_mark_dirty: %s:%u\n",
		mdev->last_md_mark_dirty.func, mdev->last_md_mark_dirty.line);
#endif
	drbd_md_sync(mdev);
	return 1;
}
#ifdef CONFIG_DRBD_FAULT_INJECTION
/* Fault insertion support including random number generator shamelessly
 * stolen from kernel/rcutorture.c */
struct fault_random_state {
	unsigned long state;
	unsigned long count;
};

#define FAULT_RANDOM_MULT 39916801  /* prime */
#define FAULT_RANDOM_ADD 479001701 /* prime */
#define FAULT_RANDOM_REFRESH 10000

/*
 * Crude but fast random-number generator.  Uses a linear congruential
 * generator, with occasional help from get_random_bytes().
 */
static unsigned long
_drbd_fault_random(struct fault_random_state *rsp)
{
	unsigned long refresh;

	if (!rsp->count--) {
		get_random_bytes(&refresh, sizeof(refresh));
		rsp->state += refresh;
		rsp->count = FAULT_RANDOM_REFRESH;
	}
	rsp->state = rsp->state * FAULT_RANDOM_MULT + FAULT_RANDOM_ADD;
	return swahw32(rsp->state);
}
static char *
_drbd_fault_str(unsigned int type) {
	static char *_faults[] = {
		[DRBD_FAULT_MD_WR] = "Meta-data write",
		[DRBD_FAULT_MD_RD] = "Meta-data read",
		[DRBD_FAULT_RS_WR] = "Resync write",
		[DRBD_FAULT_RS_RD] = "Resync read",
		[DRBD_FAULT_DT_WR] = "Data write",
		[DRBD_FAULT_DT_RD] = "Data read",
		[DRBD_FAULT_DT_RA] = "Data read ahead",
		[DRBD_FAULT_BM_ALLOC] = "BM allocation",
		[DRBD_FAULT_AL_EE] = "EE allocation",
		[DRBD_FAULT_RECEIVE] = "receive data corruption",
	};

	return (type < DRBD_FAULT_MAX) ? _faults[type] : "**Unknown**";
}

unsigned int
_drbd_insert_fault(struct drbd_conf *mdev, unsigned int type)
{
	static struct fault_random_state rrs = {0, 0};

	unsigned int ret = (
		(fault_devs == 0 ||
			((1 << mdev_to_minor(mdev)) & fault_devs) != 0) &&
		(((_drbd_fault_random(&rrs) % 100) + 1) <= fault_rate));

	if (ret) {
		fault_count++;

		if (__ratelimit(&drbd_ratelimit_state))
			dev_warn(DEV, "***Simulating %s failure\n",
				_drbd_fault_str(type));
	}

	return ret;
}
#endif
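/*
 * Illustrative note, not from the original source: fault injection is
 * driven by the fault_rate, fault_devs, enable_faults and fault_count
 * module parameters declared near the top of this file.  Call sites are
 * expected to use the drbd_insert_fault() wrapper rather than
 * _drbd_insert_fault() directly, roughly:
 *
 *	if (drbd_insert_fault(mdev, DRBD_FAULT_MD_WR))
 *		bio_endio(bio, -EIO);
 *	else
 *		generic_make_request(bio);
 *
 * With fault_rate=5 a checked IO has roughly a 5% chance of being failed,
 * since (_drbd_fault_random() % 100) + 1 is approximately uniform over
 * 1..100; fault_devs restricts injection to the selected minors and
 * enable_faults selects which DRBD_FAULT_* types are armed.
 */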
const char *drbd_buildtag(void)
{
	/* DRBD built from external sources has here a reference to the
	   git hash of the source code. */

	static char buildtag[38] = "\0uilt-in";

	if (buildtag[0] == 0) {
#ifdef CONFIG_MODULES
		if (THIS_MODULE != NULL)
			sprintf(buildtag, "srcversion: %-24s", THIS_MODULE->srcversion);
		else
#endif
			buildtag[0] = 'b';
	}

	return buildtag;
}
module_init(drbd_init)
module_exit(drbd_cleanup)

EXPORT_SYMBOL(drbd_conn_str);
EXPORT_SYMBOL(drbd_role_str);
EXPORT_SYMBOL(drbd_disk_str);
EXPORT_SYMBOL(drbd_set_st_err_str);