From 584acdd49cd2472ca0f5a06adbe979db82d0b4af Mon Sep 17 00:00:00 2001 From: Markus Stockhausen Date: Mon, 15 Dec 2014 12:57:05 +1100 Subject: [PATCH] md/raid5: activate raid6 rmw feature Glue it altogehter. The raid6 rmw path should work the same as the already existing raid5 logic. So emulate the prexor handling/flags and split functions as needed. 1) Enable xor_syndrome() in the async layer. 2) Split ops_run_prexor() into RAID4/5 and RAID6 logic. Xor the syndrome at the start of a rmw run as we did it before for the single parity. 3) Take care of rmw run in ops_run_reconstruct6(). Again process only the changed pages to get syndrome back into sync. 4) Enhance set_syndrome_sources() to fill NULL pages if we are in a rmw run. The lower layers will calculate start & end pages from that and call the xor_syndrome() correspondingly. 5) Adapt the several places where we ignored Q handling up to now. Performance numbers for a single E5630 system with a mix of 10 7200k desktop/server disks. 300 seconds random write with 8 threads onto a 3,2TB (10*400GB) RAID6 64K chunk without spare (group_thread_cnt=4) bsize rmw_level=1 rmw_level=0 rmw_level=1 rmw_level=0 skip_copy=1 skip_copy=1 skip_copy=0 skip_copy=0 4K 115 KB/s 141 KB/s 165 KB/s 140 KB/s 8K 225 KB/s 275 KB/s 324 KB/s 274 KB/s 16K 434 KB/s 536 KB/s 640 KB/s 534 KB/s 32K 751 KB/s 1,051 KB/s 1,234 KB/s 1,045 KB/s 64K 1,339 KB/s 1,958 KB/s 2,282 KB/s 1,962 KB/s 128K 2,673 KB/s 3,862 KB/s 4,113 KB/s 3,898 KB/s 256K 7,685 KB/s 7,539 KB/s 7,557 KB/s 7,638 KB/s 512K 19,556 KB/s 19,558 KB/s 19,652 KB/s 19,688 Kb/s Signed-off-by: Markus Stockhausen Signed-off-by: NeilBrown --- crypto/async_tx/async_pq.c | 19 +++++-- drivers/md/raid5.c | 104 +++++++++++++++++++++++++++---------- drivers/md/raid5.h | 19 ++++++- include/linux/async_tx.h | 3 ++ 4 files changed, 115 insertions(+), 30 deletions(-) diff --git a/crypto/async_tx/async_pq.c b/crypto/async_tx/async_pq.c index d05327caf69d..5d355e0c2633 100644 --- a/crypto/async_tx/async_pq.c +++ b/crypto/async_tx/async_pq.c @@ -124,6 +124,7 @@ do_sync_gen_syndrome(struct page **blocks, unsigned int offset, int disks, { void **srcs; int i; + int start = -1, stop = disks - 3; if (submit->scribble) srcs = submit->scribble; @@ -134,10 +135,21 @@ do_sync_gen_syndrome(struct page **blocks, unsigned int offset, int disks, if (blocks[i] == NULL) { BUG_ON(i > disks - 3); /* P or Q can't be zero */ srcs[i] = (void*)raid6_empty_zero_page; - } else + } else { srcs[i] = page_address(blocks[i]) + offset; + if (i < disks - 2) { + stop = i; + if (start == -1) + start = i; + } + } } - raid6_call.gen_syndrome(disks, len, srcs); + if (submit->flags & ASYNC_TX_PQ_XOR_DST) { + BUG_ON(!raid6_call.xor_syndrome); + if (start >= 0) + raid6_call.xor_syndrome(disks, start, stop, len, srcs); + } else + raid6_call.gen_syndrome(disks, len, srcs); async_tx_sync_epilog(submit); } @@ -178,7 +190,8 @@ async_gen_syndrome(struct page **blocks, unsigned int offset, int disks, if (device) unmap = dmaengine_get_unmap_data(device->dev, disks, GFP_NOIO); - if (unmap && + /* XORing P/Q is only implemented in software */ + if (unmap && !(submit->flags & ASYNC_TX_PQ_XOR_DST) && (src_cnt <= dma_maxpq(device, 0) || dma_maxpq(device, DMA_PREP_CONTINUE) > 0) && is_dma_pq_aligned(device, offset, 0, len)) { diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 3ae097d50b51..c82ce1fd8723 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -1317,7 +1317,9 @@ ops_run_compute5(struct stripe_head *sh, struct raid5_percpu *percpu) * destination buffer is recorded in srcs[count] and the Q destination * is recorded in srcs[count+1]]. */ -static int set_syndrome_sources(struct page **srcs, struct stripe_head *sh) +static int set_syndrome_sources(struct page **srcs, + struct stripe_head *sh, + int srctype) { int disks = sh->disks; int syndrome_disks = sh->ddf_layout ? disks : (disks - 2); @@ -1332,8 +1334,15 @@ static int set_syndrome_sources(struct page **srcs, struct stripe_head *sh) i = d0_idx; do { int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks); + struct r5dev *dev = &sh->dev[i]; - srcs[slot] = sh->dev[i].page; + if (i == sh->qd_idx || i == sh->pd_idx || + (srctype == SYNDROME_SRC_ALL) || + (srctype == SYNDROME_SRC_WANT_DRAIN && + test_bit(R5_Wantdrain, &dev->flags)) || + (srctype == SYNDROME_SRC_WRITTEN && + dev->written)) + srcs[slot] = sh->dev[i].page; i = raid6_next_disk(i, disks); } while (i != d0_idx); @@ -1373,7 +1382,7 @@ ops_run_compute6_1(struct stripe_head *sh, struct raid5_percpu *percpu) atomic_inc(&sh->count); if (target == qd_idx) { - count = set_syndrome_sources(blocks, sh); + count = set_syndrome_sources(blocks, sh, SYNDROME_SRC_ALL); blocks[count] = NULL; /* regenerating p is not necessary */ BUG_ON(blocks[count+1] != dest); /* q should already be set */ init_async_submit(&submit, ASYNC_TX_FENCE, NULL, @@ -1481,7 +1490,7 @@ ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu) tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE, &submit); - count = set_syndrome_sources(blocks, sh); + count = set_syndrome_sources(blocks, sh, SYNDROME_SRC_ALL); init_async_submit(&submit, ASYNC_TX_FENCE, tx, ops_complete_compute, sh, to_addr_conv(sh, percpu, 0)); @@ -1515,8 +1524,8 @@ static void ops_complete_prexor(void *stripe_head_ref) } static struct dma_async_tx_descriptor * -ops_run_prexor(struct stripe_head *sh, struct raid5_percpu *percpu, - struct dma_async_tx_descriptor *tx) +ops_run_prexor5(struct stripe_head *sh, struct raid5_percpu *percpu, + struct dma_async_tx_descriptor *tx) { int disks = sh->disks; struct page **xor_srcs = to_addr_page(percpu, 0); @@ -1544,6 +1553,26 @@ ops_run_prexor(struct stripe_head *sh, struct raid5_percpu *percpu, return tx; } +static struct dma_async_tx_descriptor * +ops_run_prexor6(struct stripe_head *sh, struct raid5_percpu *percpu, + struct dma_async_tx_descriptor *tx) +{ + struct page **blocks = to_addr_page(percpu, 0); + int count; + struct async_submit_ctl submit; + + pr_debug("%s: stripe %llu\n", __func__, + (unsigned long long)sh->sector); + + count = set_syndrome_sources(blocks, sh, SYNDROME_SRC_WANT_DRAIN); + + init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_PQ_XOR_DST, tx, + ops_complete_prexor, sh, to_addr_conv(sh, percpu, 0)); + tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit); + + return tx; +} + static struct dma_async_tx_descriptor * ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) { @@ -1746,6 +1775,8 @@ ops_run_reconstruct6(struct stripe_head *sh, struct raid5_percpu *percpu, int count, i, j = 0; struct stripe_head *head_sh = sh; int last_stripe; + int synflags; + unsigned long txflags; pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector); @@ -1765,14 +1796,23 @@ ops_run_reconstruct6(struct stripe_head *sh, struct raid5_percpu *percpu, again: blocks = to_addr_page(percpu, j); - count = set_syndrome_sources(blocks, sh); + + if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) { + synflags = SYNDROME_SRC_WRITTEN; + txflags = ASYNC_TX_ACK | ASYNC_TX_PQ_XOR_DST; + } else { + synflags = SYNDROME_SRC_ALL; + txflags = ASYNC_TX_ACK; + } + + count = set_syndrome_sources(blocks, sh, synflags); last_stripe = !head_sh->batch_head || list_first_entry(&sh->batch_list, struct stripe_head, batch_list) == head_sh; if (last_stripe) { atomic_inc(&head_sh->count); - init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_reconstruct, + init_async_submit(&submit, txflags, tx, ops_complete_reconstruct, head_sh, to_addr_conv(sh, percpu, j)); } else init_async_submit(&submit, 0, tx, NULL, NULL, @@ -1843,7 +1883,7 @@ static void ops_run_check_pq(struct stripe_head *sh, struct raid5_percpu *percpu (unsigned long long)sh->sector, checkp); BUG_ON(sh->batch_head); - count = set_syndrome_sources(srcs, sh); + count = set_syndrome_sources(srcs, sh, SYNDROME_SRC_ALL); if (!checkp) srcs[count] = NULL; @@ -1884,8 +1924,12 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request) async_tx_ack(tx); } - if (test_bit(STRIPE_OP_PREXOR, &ops_request)) - tx = ops_run_prexor(sh, percpu, tx); + if (test_bit(STRIPE_OP_PREXOR, &ops_request)) { + if (level < 6) + tx = ops_run_prexor5(sh, percpu, tx); + else + tx = ops_run_prexor6(sh, percpu, tx); + } if (test_bit(STRIPE_OP_BIODRAIN, &ops_request)) { tx = ops_run_biodrain(sh, tx); @@ -2770,7 +2814,7 @@ static void schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s, int rcw, int expand) { - int i, pd_idx = sh->pd_idx, disks = sh->disks; + int i, pd_idx = sh->pd_idx, qd_idx = sh->qd_idx, disks = sh->disks; struct r5conf *conf = sh->raid_conf; int level = conf->level; @@ -2806,13 +2850,15 @@ schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s, if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state)) atomic_inc(&conf->pending_full_writes); } else { - BUG_ON(level == 6); BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) || test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags))); + BUG_ON(level == 6 && + (!(test_bit(R5_UPTODATE, &sh->dev[qd_idx].flags) || + test_bit(R5_Wantcompute, &sh->dev[qd_idx].flags)))); for (i = disks; i--; ) { struct r5dev *dev = &sh->dev[i]; - if (i == pd_idx) + if (i == pd_idx || i == qd_idx) continue; if (dev->towrite && @@ -3476,28 +3522,27 @@ static void handle_stripe_dirtying(struct r5conf *conf, int rmw = 0, rcw = 0, i; sector_t recovery_cp = conf->mddev->recovery_cp; - /* RAID6 requires 'rcw' in current implementation. - * Otherwise, check whether resync is now happening or should start. + /* Check whether resync is now happening or should start. * If yes, then the array is dirty (after unclean shutdown or * initial creation), so parity in some stripes might be inconsistent. * In this case, we need to always do reconstruct-write, to ensure * that in case of drive failure or read-error correction, we * generate correct data from the parity. */ - if (conf->max_degraded == 2 || + if (conf->rmw_level == PARITY_DISABLE_RMW || (recovery_cp < MaxSector && sh->sector >= recovery_cp && s->failed == 0)) { /* Calculate the real rcw later - for now make it * look like rcw is cheaper */ rcw = 1; rmw = 2; - pr_debug("force RCW max_degraded=%u, recovery_cp=%llu sh->sector=%llu\n", - conf->max_degraded, (unsigned long long)recovery_cp, + pr_debug("force RCW rmw_level=%u, recovery_cp=%llu sh->sector=%llu\n", + conf->rmw_level, (unsigned long long)recovery_cp, (unsigned long long)sh->sector); } else for (i = disks; i--; ) { /* would I have to read this buffer for read_modify_write */ struct r5dev *dev = &sh->dev[i]; - if ((dev->towrite || i == sh->pd_idx) && + if ((dev->towrite || i == sh->pd_idx || i == sh->qd_idx) && !test_bit(R5_LOCKED, &dev->flags) && !(test_bit(R5_UPTODATE, &dev->flags) || test_bit(R5_Wantcompute, &dev->flags))) { @@ -3507,7 +3552,8 @@ static void handle_stripe_dirtying(struct r5conf *conf, rmw += 2*disks; /* cannot read it */ } /* Would I have to read this buffer for reconstruct_write */ - if (!test_bit(R5_OVERWRITE, &dev->flags) && i != sh->pd_idx && + if (!test_bit(R5_OVERWRITE, &dev->flags) && + i != sh->pd_idx && i != sh->qd_idx && !test_bit(R5_LOCKED, &dev->flags) && !(test_bit(R5_UPTODATE, &dev->flags) || test_bit(R5_Wantcompute, &dev->flags))) { @@ -3520,7 +3566,7 @@ static void handle_stripe_dirtying(struct r5conf *conf, pr_debug("for sector %llu, rmw=%d rcw=%d\n", (unsigned long long)sh->sector, rmw, rcw); set_bit(STRIPE_HANDLE, &sh->state); - if (rmw < rcw && rmw > 0) { + if ((rmw < rcw || (rmw == rcw && conf->rmw_level == PARITY_ENABLE_RMW)) && rmw > 0) { /* prefer read-modify-write, but need to get some data */ if (conf->mddev->queue) blk_add_trace_msg(conf->mddev->queue, @@ -3528,7 +3574,7 @@ static void handle_stripe_dirtying(struct r5conf *conf, (unsigned long long)sh->sector, rmw); for (i = disks; i--; ) { struct r5dev *dev = &sh->dev[i]; - if ((dev->towrite || i == sh->pd_idx) && + if ((dev->towrite || i == sh->pd_idx || i == sh->qd_idx) && !test_bit(R5_LOCKED, &dev->flags) && !(test_bit(R5_UPTODATE, &dev->flags) || test_bit(R5_Wantcompute, &dev->flags)) && @@ -3547,7 +3593,7 @@ static void handle_stripe_dirtying(struct r5conf *conf, } } } - if (rcw <= rmw && rcw > 0) { + if ((rcw < rmw || (rcw == rmw && conf->rmw_level != PARITY_ENABLE_RMW)) && rcw > 0) { /* want reconstruct write, but need to get some data */ int qread =0; rcw = 0; @@ -6344,10 +6390,16 @@ static struct r5conf *setup_conf(struct mddev *mddev) } conf->level = mddev->new_level; - if (conf->level == 6) + if (conf->level == 6) { conf->max_degraded = 2; - else + if (raid6_call.xor_syndrome) + conf->rmw_level = PARITY_ENABLE_RMW; + else + conf->rmw_level = PARITY_DISABLE_RMW; + } else { conf->max_degraded = 1; + conf->rmw_level = PARITY_ENABLE_RMW; + } conf->algorithm = mddev->new_layout; conf->reshape_progress = mddev->reshape_position; if (conf->reshape_progress != MaxSector) { diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index ee65ed844d3f..57fef9ba36fa 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h @@ -355,6 +355,23 @@ enum { STRIPE_OP_RECONSTRUCT, STRIPE_OP_CHECK, }; + +/* + * RAID parity calculation preferences + */ +enum { + PARITY_DISABLE_RMW = 0, + PARITY_ENABLE_RMW, +}; + +/* + * Pages requested from set_syndrome_sources() + */ +enum { + SYNDROME_SRC_ALL, + SYNDROME_SRC_WANT_DRAIN, + SYNDROME_SRC_WRITTEN, +}; /* * Plugging: * @@ -411,7 +428,7 @@ struct r5conf { spinlock_t hash_locks[NR_STRIPE_HASH_LOCKS]; struct mddev *mddev; int chunk_sectors; - int level, algorithm; + int level, algorithm, rmw_level; int max_degraded; int raid_disks; int max_nr_stripes; diff --git a/include/linux/async_tx.h b/include/linux/async_tx.h index 179b38ffd351..388574ea38ed 100644 --- a/include/linux/async_tx.h +++ b/include/linux/async_tx.h @@ -60,12 +60,15 @@ struct dma_chan_ref { * dependency chain * @ASYNC_TX_FENCE: specify that the next operation in the dependency * chain uses this operation's result as an input + * @ASYNC_TX_PQ_XOR_DST: do not overwrite the syndrome but XOR it with the + * input data. Required for rmw case. */ enum async_tx_flags { ASYNC_TX_XOR_ZERO_DST = (1 << 0), ASYNC_TX_XOR_DROP_DST = (1 << 1), ASYNC_TX_ACK = (1 << 2), ASYNC_TX_FENCE = (1 << 3), + ASYNC_TX_PQ_XOR_DST = (1 << 4), }; /** -- 2.34.1