#include <linux/compiler.h>
#include <linux/types.h>
-#include <linux/version.h>
#include <linux/list.h>
#include <linux/sched.h>
#include <linux/bitops.h>
/* module parameter, defined in drbd_main.c */
extern unsigned int minor_count;
-extern int disable_sendpage;
-extern int allow_oos;
+extern bool disable_sendpage;
+extern bool allow_oos;
#ifdef CONFIG_DRBD_FAULT_INJECTION
extern int enable_faults;
P_RECV_ACK = 0x15, /* Used in protocol B */
P_WRITE_ACK = 0x16, /* Used in protocol C */
P_RS_WRITE_ACK = 0x17, /* Is a P_WRITE_ACK, additionally call set_in_sync(). */
- P_DISCARD_WRITE = 0x18, /* Used in proto C, two-primaries conflict detection */
+ P_SUPERSEDED = 0x18, /* Used in proto C, two-primaries conflict detection */
P_NEG_ACK = 0x19, /* Sent if local disk is unusable */
P_NEG_DREPLY = 0x1a, /* Local disk is broken... */
P_NEG_RS_DREPLY = 0x1b, /* Local disk is broken... */
* commands which share a struct:
* p_block_ack:
* P_RECV_ACK (proto B), P_WRITE_ACK (proto C),
- * P_DISCARD_WRITE (proto C, two-primaries conflict detection)
+ * P_SUPERSEDED (proto C, two-primaries conflict detection)
* p_block_req:
* P_DATA_REQUEST, P_RS_DATA_REQUEST
*/
u32 bit_map_gen[5];
} __packed;
-struct p_discard {
- u64 block_id;
- u32 seq_num;
- u32 pad;
-} __packed;
-
struct p_block_desc {
u64 sector;
u32 blksize;
struct bio *private_bio;
struct drbd_interval i;
- unsigned int epoch; /* barrier_nr */
- /* barrier_nr: used to check on "completion" whether this req was in
+ /* epoch: used to check on "completion" whether this req was in
* the current epoch, and we therefore have to close it,
- * starting a new epoch...
+	 * causing a p_barrier packet to be sent, starting a new epoch.
+ *
+ * This corresponds to "barrier" in struct p_barrier[_ack],
+ * and to "barrier_nr" in struct drbd_epoch (and various
+ * comments/function parameters/local variable names).
*/
+ unsigned int epoch;
struct list_head tl_requests; /* ring list in the transfer log */
struct bio *master_bio; /* master bio pointer */
- unsigned long rq_state; /* see comments above _req_mod() */
unsigned long start_time;
-};
-struct drbd_tl_epoch {
- struct drbd_work w;
- struct list_head requests; /* requests before */
- struct drbd_tl_epoch *next; /* pointer to the next barrier */
- unsigned int br_number; /* the barriers identifier. */
- int n_writes; /* number of requests attached before this barrier */
+ /* once it hits 0, we may complete the master_bio */
+ atomic_t completion_ref;
+ /* once it hits 0, we may destroy this drbd_request object */
+ struct kref kref;
+
+ unsigned rq_state; /* see comments above _req_mod() */
};
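
The split into completion_ref and kref decouples completing the master_bio from destroying the request object. A minimal sketch (not part of this patch) of the intended interaction; the helper names req_destroy() and drbd_req_put_completion_ref() are assumptions for illustration only:

static void req_destroy(struct kref *kref)
{
	struct drbd_request *req = container_of(kref, struct drbd_request, kref);

	list_del(&req->tl_requests);	/* drop from the transfer log */
	mempool_free(req, drbd_request_mempool);
}

static void drbd_req_put_completion_ref(struct drbd_request *req, int error)
{
	/* completion_ref gates completion of the master_bio ... */
	if (atomic_dec_and_test(&req->completion_ref)) {
		bio_endio(req->master_bio, error);
		req->master_bio = NULL;
	}
	/* ... while kref gates destruction of the request object itself */
	kref_put(&req->kref, req_destroy);
}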
struct drbd_epoch {
/* flag bits per mdev */
enum {
- UNPLUG_QUEUED, /* only relevant with kernel 2.4 */
UNPLUG_REMOTE, /* sending a "UnplugRemote" could help */
MD_DIRTY, /* current uuids and flags not yet on disk */
USE_DEGR_WFC_T, /* degr-wfc-timeout instead of wfc-timeout. */
CRASHED_PRIMARY, /* This node was a crashed primary.
* Gets cleared when the state.conn
* goes into C_CONNECTED state. */
- NO_BARRIER_SUPP, /* underlying block device doesn't implement barriers */
CONSIDER_RESYNC,
	MD_NO_FUA,		/* User wants us not to use FUA/FLUSH on meta data dev */
once no more io in flight, start bitmap io */
BITMAP_IO_QUEUED, /* Started bitmap IO */
GO_DISKLESS, /* Disk is being detached, on io-error or admin request. */
- WAS_IO_ERROR, /* Local disk failed returned IO error */
+ WAS_IO_ERROR, /* Local disk failed, returned IO error */
+ WAS_READ_ERROR, /* Local disk READ failed (set additionally to the above) */
+ FORCE_DETACH, /* Force-detach from local disk, aborting any pending local IO */
RESYNC_AFTER_NEG, /* Resync after online grow after the attach&negotiate finished. */
RESIZE_PENDING, /* Size change detected locally, waiting for the response from
* the peer, if it changed there as well. */
struct drbd_work_queue {
struct list_head q;
- struct semaphore s; /* producers up it, worker down()s it */
spinlock_t q_lock; /* to protect the list. */
+ wait_queue_head_t q_wait;
};
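
With the semaphore gone, a consumer blocks on q_wait and re-checks the list under q_lock. A minimal sketch (not part of this patch; dequeue_work() is a hypothetical helper):

static struct drbd_work *dequeue_work(struct drbd_work_queue *q)
{
	struct drbd_work *w = NULL;

	wait_event(q->q_wait, !list_empty(&q->q));
	spin_lock_irq(&q->q_lock);
	if (!list_empty(&q->q)) {
		w = list_first_entry(&q->q, struct drbd_work, list);
		list_del_init(&w->list);
	}
	spin_unlock_irq(&q->q_lock);
	return w;	/* may be NULL if another consumer raced us to it */
}

The producers further down (drbd_queue_work() and friends) correspondingly replace up(&q->s) with wake_up(&q->q_wait) after dropping q_lock; the re-check under the lock makes that safe.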
struct drbd_socket {
- struct drbd_work_queue work;
struct mutex mutex;
struct socket *socket;
/* this way we get our
u64 md_offset; /* sector offset to 'super' block */
u64 la_size_sect; /* last agreed size, unit sectors */
+ spinlock_t uuid_lock;
u64 uuid[UI_SIZE];
u64 device_uuid;
u32 flags;
/* flag bits per tconn */
enum {
NET_CONGESTED, /* The data socket is congested */
- DISCARD_CONCURRENT, /* Set on one node, cleared on the peer! */
+ RESOLVE_CONFLICTS, /* Set on one node, cleared on the peer! */
SEND_PING, /* whether asender should send a ping asap */
SIGNAL_ASENDER, /* whether asender wants to be interrupted */
GOT_PING_ACK, /* set when we receive a ping_ack packet, ping_wait gets woken */
CONN_DRY_RUN, /* Expect disconnect after resync handshake. */
CREATE_BARRIER, /* next P_DATA is preceded by a P_BARRIER */
STATE_SENT, /* Do not change state/UUIDs while this is set */
+ CALLBACK_PENDING, /* Whether we have a call_usermodehelper(, UMH_WAIT_PROC)
+ * pending, from drbd worker context.
+ * If set, bdi_write_congested() returns true,
+ * so shrink_page_list() would not recurse into,
+ * and potentially deadlock on, this drbd worker.
+ */
+ DISCONNECT_SENT,
};
struct drbd_tconn { /* is a resource from the config file */
unsigned int ko_count;
spinlock_t req_lock;
- struct drbd_tl_epoch *unused_spare_tle; /* for pre-allocation */
- struct drbd_tl_epoch *newest_tle;
- struct drbd_tl_epoch *oldest_tle;
- struct list_head out_of_sequence_requests;
- struct list_head barrier_acked_requests;
+
+ struct list_head transfer_log; /* all requests not yet fully processed */
struct crypto_hash *cram_hmac_tfm;
struct crypto_hash *integrity_tfm; /* checksums we compute, updates protected by tconn->data->mutex */
void *int_dig_in;
void *int_dig_vv;
+ /* receiver side */
struct drbd_epoch *current_epoch;
spinlock_t epoch_lock;
unsigned int epochs;
enum write_ordering_e write_ordering;
+ atomic_t current_tle_nr; /* transfer log epoch number */
+ unsigned current_tle_writes; /* writes seen within this tl epoch */
unsigned long last_reconnect_jif;
struct drbd_thread receiver;
struct drbd_thread worker;
struct drbd_thread asender;
cpumask_var_t cpu_mask;
+
+ /* sender side */
+ struct drbd_work_queue sender_work;
+
+ struct {
+ /* whether this sender thread
+ * has processed a single write yet. */
+ bool seen_any_write_yet;
+
+ /* Which barrier number to send with the next P_BARRIER */
+ int current_epoch_nr;
+
+ /* how many write requests have been sent
+ * with req->epoch == current_epoch_nr.
+ * If none, no P_BARRIER will be sent. */
+ unsigned current_epoch_writes;
+ } send;
};
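
A minimal sketch (not part of this patch) of how the sender-side send.* state above drives barrier insertion. maybe_send_barrier() mirrors the intended logic; drbd_send_barrier() is assumed to emit a P_BARRIER carrying send.current_epoch_nr:

static void maybe_send_barrier(struct drbd_tconn *tconn, unsigned int epoch)
{
	/* the very first write initializes the epoch tracking */
	if (!tconn->send.seen_any_write_yet) {
		tconn->send.seen_any_write_yet = true;
		tconn->send.current_epoch_nr = epoch;
		tconn->send.current_epoch_writes = 0;
	}
	if (tconn->send.current_epoch_nr != epoch) {
		/* close the previous epoch, but only if it saw writes */
		if (tconn->send.current_epoch_writes)
			drbd_send_barrier(tconn);
		tconn->send.current_epoch_nr = epoch;
		tconn->send.current_epoch_writes = 0;
	}
}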
struct drbd_conf {
unsigned long rs_mark_time[DRBD_SYNC_MARKS];
/* current index into rs_mark_{left,time} */
int rs_last_mark;
+ unsigned long rs_last_bcast; /* [unit jiffies] */
/* where does the admin want us to start? (sector) */
sector_t ov_start_sector;
+ sector_t ov_stop_sector;
/* where are we now? (sector) */
sector_t ov_position;
/* Start sector of out of sync range (to merge printk reporting). */
	struct fifo_buffer *rs_plan_s; /* correction values of resync planner (RCU, tconn->conn_update) */
int rs_in_flight; /* resync sectors in flight (to proxy, in proxy and from proxy) */
atomic_t ap_in_flight; /* App sectors in flight (waiting for ack) */
- int peer_max_bio_size;
- int local_max_bio_size;
+ unsigned int peer_max_bio_size;
+ unsigned int local_max_bio_size;
};
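
Making both limits unsigned keeps min()-style comparisons free of sign-compare surprises, together with the (1U << 20) change to DRBD_MAX_BIO_SIZE further down. A minimal sketch (not part of this patch; the helper name is hypothetical):

static unsigned int drbd_effective_max_bio_size(struct drbd_conf *mdev)
{
	unsigned int max_size = min(mdev->local_max_bio_size,
				    mdev->peer_max_bio_size);

	return min(max_size, DRBD_MAX_BIO_SIZE);
}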
static inline struct drbd_conf *minor_to_mdev(unsigned int minor)
extern void tl_release(struct drbd_tconn *, unsigned int barrier_nr,
unsigned int set_size);
extern void tl_clear(struct drbd_tconn *);
-extern void _tl_add_barrier(struct drbd_tconn *, struct drbd_tl_epoch *);
extern void drbd_free_sock(struct drbd_tconn *tconn);
extern int drbd_send(struct drbd_tconn *tconn, struct socket *sock,
void *buf, size_t size, unsigned msg_flags);
extern void drbd_mdev_cleanup(struct drbd_conf *mdev);
void drbd_print_uuids(struct drbd_conf *mdev, const char *text);
+extern void conn_md_sync(struct drbd_tconn *tconn);
extern void drbd_md_sync(struct drbd_conf *mdev);
extern int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev);
extern void drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local);
extern void _drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local);
extern void drbd_uuid_new_current(struct drbd_conf *mdev) __must_hold(local);
-extern void _drbd_uuid_new_current(struct drbd_conf *mdev) __must_hold(local);
extern void drbd_uuid_set_bm(struct drbd_conf *mdev, u64 val) __must_hold(local);
+extern void drbd_uuid_move_history(struct drbd_conf *mdev) __must_hold(local);
+extern void __drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local);
extern void drbd_md_set_flag(struct drbd_conf *mdev, int flags) __must_hold(local);
extern void drbd_md_clear_flag(struct drbd_conf *mdev, int flags)__must_hold(local);
extern int drbd_md_test_flag(struct drbd_backing_dev *, int);
extern int drbd_bitmap_io(struct drbd_conf *mdev,
int (*io_fn)(struct drbd_conf *),
char *why, enum bm_flag flags);
+extern int drbd_bitmap_io_from_worker(struct drbd_conf *mdev,
+ int (*io_fn)(struct drbd_conf *),
+ char *why, enum bm_flag flags);
extern int drbd_bmio_set_n_write(struct drbd_conf *mdev);
extern int drbd_bmio_clear_n_write(struct drbd_conf *mdev);
extern void drbd_go_diskless(struct drbd_conf *mdev);
 * we limit ourselves to a platform-agnostic constant here for now.
* A followup commit may allow even bigger BIO sizes,
* once we thought that through. */
-#define DRBD_MAX_BIO_SIZE (1 << 20)
+#define DRBD_MAX_BIO_SIZE (1U << 20)
#if DRBD_MAX_BIO_SIZE > BIO_MAX_SIZE
#error Architecture not supported: DRBD_MAX_BIO_SIZE > BIO_MAX_SIZE
#endif
-#define DRBD_MAX_BIO_SIZE_SAFE (1 << 12) /* Works always = 4k */
+#define DRBD_MAX_BIO_SIZE_SAFE (1U << 12) /* Works always = 4k */
-#define DRBD_MAX_SIZE_H80_PACKET (1 << 15) /* Header 80 only allows packets up to 32KiB data */
-#define DRBD_MAX_BIO_SIZE_P95 (1 << 17) /* Protocol 95 to 99 allows bios up to 128KiB */
+#define DRBD_MAX_SIZE_H80_PACKET (1U << 15) /* Header 80 only allows packets up to 32KiB data */
+#define DRBD_MAX_BIO_SIZE_P95 (1U << 17) /* Protocol 95 to 99 allows bios up to 128KiB */
extern int drbd_bm_init(struct drbd_conf *mdev);
extern int drbd_bm_resize(struct drbd_conf *mdev, sector_t sectors, int set_new_bits);
extern void drbd_bm_mark_for_writeout(struct drbd_conf *mdev, int page_nr);
extern int drbd_bm_write(struct drbd_conf *mdev) __must_hold(local);
extern int drbd_bm_write_hinted(struct drbd_conf *mdev) __must_hold(local);
+extern int drbd_bm_write_all(struct drbd_conf *mdev) __must_hold(local);
extern int drbd_bm_write_copy_pages(struct drbd_conf *mdev) __must_hold(local);
extern size_t drbd_bm_words(struct drbd_conf *mdev);
extern unsigned long drbd_bm_bits(struct drbd_conf *mdev);
extern int proc_details;
/* drbd_req */
-extern int __drbd_make_request(struct drbd_conf *, struct bio *, unsigned long);
-extern int drbd_make_request(struct request_queue *q, struct bio *bio);
+extern void __drbd_make_request(struct drbd_conf *, struct bio *, unsigned long);
+extern void drbd_make_request(struct request_queue *q, struct bio *bio);
extern int drbd_read_remote(struct drbd_conf *mdev, struct drbd_request *req);
extern int drbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bvm, struct bio_vec *bvec);
extern int is_valid_ar_handle(struct drbd_request *, sector_t);
extern int drbd_md_sync_page_io(struct drbd_conf *mdev,
struct drbd_backing_dev *bdev, sector_t sector, int rw);
extern void drbd_ov_out_of_sync_found(struct drbd_conf *, sector_t, int);
-extern void wait_until_done_or_disk_failure(struct drbd_conf *mdev, struct drbd_backing_dev *bdev,
- unsigned int *done);
+extern void wait_until_done_or_force_detached(struct drbd_conf *mdev,
+ struct drbd_backing_dev *bdev, unsigned int *done);
extern void drbd_rs_controller_reset(struct drbd_conf *mdev);
static inline void ov_out_of_sync_print(struct drbd_conf *mdev)
extern int w_send_write_hint(struct drbd_work *, int);
extern int w_make_resync_request(struct drbd_work *, int);
extern int w_send_dblock(struct drbd_work *, int);
-extern int w_send_barrier(struct drbd_work *, int);
extern int w_send_read_req(struct drbd_work *, int);
extern int w_prev_work_done(struct drbd_work *, int);
extern int w_e_reissue(struct drbd_work *, int);
return rv;
}
+enum drbd_force_detach_flags {
+ DRBD_READ_ERROR,
+ DRBD_WRITE_ERROR,
+ DRBD_META_IO_ERROR,
+ DRBD_FORCE_DETACH,
+};
+
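A caller classifies the failure when invoking the drbd_chk_io_error() wrapper defined just below. A minimal sketch (not part of this patch; example_endio() is purely hypothetical):

static void example_endio(struct drbd_conf *mdev, struct bio *bio, int error)
{
	if (error)
		drbd_chk_io_error(mdev, error,
				  bio_data_dir(bio) == WRITE ?
					DRBD_WRITE_ERROR : DRBD_READ_ERROR);
}
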
#define __drbd_chk_io_error(m,f) __drbd_chk_io_error_(m,f, __func__)
-static inline void __drbd_chk_io_error_(struct drbd_conf *mdev, int forcedetach, const char *where)
+static inline void __drbd_chk_io_error_(struct drbd_conf *mdev,
+ enum drbd_force_detach_flags df,
+ const char *where)
{
enum drbd_io_error_p ep;
rcu_read_unlock();
switch (ep) {
case EP_PASS_ON: /* FIXME would this be better named "Ignore"? */
- if (!forcedetach) {
+ if (df == DRBD_READ_ERROR || df == DRBD_WRITE_ERROR) {
if (__ratelimit(&drbd_ratelimit_state))
dev_err(DEV, "Local IO failed in %s.\n", where);
if (mdev->state.disk > D_INCONSISTENT)
_drbd_set_state(_NS(mdev, disk, D_INCONSISTENT), CS_HARD, NULL);
break;
}
- /* NOTE fall through to detach case if forcedetach set */
+ /* NOTE fall through for DRBD_META_IO_ERROR or DRBD_FORCE_DETACH */
case EP_DETACH:
case EP_CALL_HELPER:
+ /* Remember whether we saw a READ or WRITE error.
+ *
+ * Recovery of the affected area for WRITE failure is covered
+ * by the activity log.
+ * READ errors may fall outside that area though. Certain READ
+ * errors can be "healed" by writing good data to the affected
+ * blocks, which triggers block re-allocation in lower layers.
+ *
+	 * If we cannot write the bitmap after a READ error,
+	 * we may need to trigger a full sync (see w_go_diskless()).
+	 *
+	 * Force-detach is not really an IO error, but rather a
+	 * desperate measure to try to deal with a completely
+	 * unresponsive lower level IO stack.
+	 * Still, it should be treated as a WRITE error.
+	 *
+	 * A meta IO error is always a WRITE error:
+	 * we read meta data only once during attach,
+	 * which will fail in case of errors.
+ */
set_bit(WAS_IO_ERROR, &mdev->flags);
+ if (df == DRBD_READ_ERROR)
+ set_bit(WAS_READ_ERROR, &mdev->flags);
+ if (df == DRBD_FORCE_DETACH)
+ set_bit(FORCE_DETACH, &mdev->flags);
if (mdev->state.disk > D_FAILED) {
_drbd_set_state(_NS(mdev, disk, D_FAILED), CS_HARD, NULL);
dev_err(DEV,
*/
#define drbd_chk_io_error(m,e,f) drbd_chk_io_error_(m,e,f, __func__)
static inline void drbd_chk_io_error_(struct drbd_conf *mdev,
- int error, int forcedetach, const char *where)
+ int error, enum drbd_force_detach_flags forcedetach, const char *where)
{
if (error) {
unsigned long flags;
unsigned long flags;
spin_lock_irqsave(&q->q_lock, flags);
list_add(&w->list, &q->q);
- up(&q->s); /* within the spinlock,
- see comment near end of drbd_worker() */
spin_unlock_irqrestore(&q->q_lock, flags);
+ wake_up(&q->q_wait);
}
static inline void
unsigned long flags;
spin_lock_irqsave(&q->q_lock, flags);
list_add_tail(&w->list, &q->q);
- up(&q->s); /* within the spinlock,
- see comment near end of drbd_worker() */
spin_unlock_irqrestore(&q->q_lock, flags);
+ wake_up(&q->q_wait);
}
static inline void wake_asender(struct drbd_tconn *tconn)
int ap_bio = atomic_dec_return(&mdev->ap_bio_cnt);
D_ASSERT(ap_bio >= 0);
+
+ if (ap_bio == 0 && test_bit(BITMAP_IO, &mdev->flags)) {
+ if (!test_and_set_bit(BITMAP_IO_QUEUED, &mdev->flags))
+ drbd_queue_work(&mdev->tconn->sender_work, &mdev->bm_io_work.w);
+ }
+
/* this currently does wake_up for every dec_ap_bio!
* maybe rather introduce some type of hysteresis?
* e.g. (ap_bio == mxb/2 || ap_bio == 0) ? */
if (ap_bio < mxb)
wake_up(&mdev->misc_wait);
- if (ap_bio == 0 && test_bit(BITMAP_IO, &mdev->flags)) {
- if (!test_and_set_bit(BITMAP_IO_QUEUED, &mdev->flags))
- drbd_queue_work(&mdev->tconn->data.work, &mdev->bm_io_work.w);
- }
+}
+
+static inline bool verify_can_do_stop_sector(struct drbd_conf *mdev)
+{
+ return mdev->tconn->agreed_pro_version >= 97 &&
+ mdev->tconn->agreed_pro_version != 100;
}
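
A minimal sketch (not part of this patch) of how this gate pairs with the new ov_stop_sector field above; the encoding of "no stop sector requested" as ~(sector_t)0 is an assumption here, which makes the comparison a no-op unless a stop sector was actually given:

static inline bool ov_should_stop(struct drbd_conf *mdev, sector_t sector)
{
	return verify_can_do_stop_sector(mdev) &&
	       sector >= mdev->ov_stop_sector;
}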
static inline int drbd_set_ed_uuid(struct drbd_conf *mdev, u64 val)
{
int r;
+ if (mdev->ldev == NULL) {
+ dev_warn(DEV, "mdev->ldev == NULL in drbd_md_flush\n");
+ return;
+ }
+
if (test_bit(MD_NO_FUA, &mdev->flags))
return;
- r = blkdev_issue_flush(mdev->ldev->md_bdev, GFP_KERNEL, NULL);
+ r = blkdev_issue_flush(mdev->ldev->md_bdev, GFP_NOIO, NULL);
if (r) {
set_bit(MD_NO_FUA, &mdev->flags);
dev_err(DEV, "meta data flush failed with status %d, disabling md-flushes\n", r);
}
#endif
+
+/* This is defined in drivers/md/md.h as well. Should go into wait.h */
+#define __wait_event_lock_irq(wq, condition, lock, cmd) \
+do { \
+ wait_queue_t __wait; \
+ init_waitqueue_entry(&__wait, current); \
+ \
+ add_wait_queue(&wq, &__wait); \
+ for (;;) { \
+ set_current_state(TASK_UNINTERRUPTIBLE); \
+ if (condition) \
+ break; \
+ spin_unlock_irq(&lock); \
+ cmd; \
+ schedule(); \
+ spin_lock_irq(&lock); \
+ } \
+	__set_current_state(TASK_RUNNING);				\
+ remove_wait_queue(&wq, &__wait); \
+} while (0)
+
+#define wait_event_lock_irq(wq, condition, lock, cmd) \
+do { \
+ if (condition) \
+ break; \
+ __wait_event_lock_irq(wq, condition, lock, cmd); \
+} while (0)
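
A minimal sketch (not part of this patch) of typical usage: sleep until a condition holds while other contexts take the same irq-safe lock. All names below are hypothetical; the cmd argument runs each iteration after the lock is dropped and before schedule():

static void wait_until_list_empty(wait_queue_head_t *wq,
				  struct list_head *head,
				  spinlock_t *lock)
{
	spin_lock_irq(lock);
	wait_event_lock_irq(*wq, list_empty(head), *lock,
			    cond_resched() /* placeholder cmd, runs unlocked */);
	spin_unlock_irq(lock);
}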