block: strict rq_affinity
author: Dan Williams <dan.j.williams@intel.com>
Sat, 23 Jul 2011 18:44:25 +0000 (20:44 +0200)
committer: Jens Axboe <jaxboe@fusionio.com>
Sat, 23 Jul 2011 18:44:25 +0000 (20:44 +0200)
Some systems benefit from completions always being steered to the strict
requester cpu rather than the looser "per-socket" steering that
blk_cpu_to_group() attempts by default. This is because the first
CPU in the group mask ends up being completely overloaded with work,
while the others (including the original submitter) have power left
to spare.

Allow the strict mode to be set by writing '2' to the sysfs control
file. This is identical to the scheme used for the nomerges file,
where '2' is a more aggressive setting than just being turned on.

echo 2 > /sys/block/<bdev>/queue/rq_affinity

Cc: Christoph Hellwig <hch@infradead.org>
Cc: Roland Dreier <roland@purestorage.com>
Tested-by: Dave Jiang <dave.jiang@intel.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
Documentation/block/queue-sysfs.txt
block/blk-core.c
block/blk-softirq.c
block/blk-sysfs.c
include/linux/blkdev.h

index f65274081c8d19a1c2f6f8b53712a1cf9cbee54d..d8147b336c354e203addd40bb85bcc0abbeeded6 100644 (file)
@@ -45,9 +45,13 @@ device.
 
 rq_affinity (RW)
 ----------------
-If this option is enabled, the block layer will migrate request completions
-to the CPU that originally submitted the request. For some workloads
-this provides a significant reduction in CPU cycles due to caching effects.
+If this option is '1', the block layer will migrate request completions to the
+cpu "group" that originally submitted the request. For some workloads this
+provides a significant reduction in CPU cycles due to caching effects.
+
+For storage configurations that need to maximize distribution of completion
+processing setting this option to '2' forces the completion to run on the
+requesting cpu (bypassing the "group" aggregation logic).
 
 scheduler (RW)
 --------------
index a564852920626a7984935bbf21aba5c5722e3908..b3228255304d2f25294ad5e8b5ea291874dce9a9 100644 (file)
@@ -1279,10 +1279,8 @@ get_rq:
        init_request_from_bio(req, bio);
 
        if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags) ||
-           bio_flagged(bio, BIO_CPU_AFFINE)) {
-               req->cpu = blk_cpu_to_group(get_cpu());
-               put_cpu();
-       }
+           bio_flagged(bio, BIO_CPU_AFFINE))
+               req->cpu = smp_processor_id();
 
        plug = current->plug;
        if (plug) {
index ee9c21602228e0dcb1b6ab8adf6909dae4adc2fd..475fab809a80cce9eb5573fbb392597d8dd9b838 100644 (file)
@@ -103,22 +103,25 @@ static struct notifier_block __cpuinitdata blk_cpu_notifier = {
 
 void __blk_complete_request(struct request *req)
 {
+       int ccpu, cpu, group_cpu = NR_CPUS;
        struct request_queue *q = req->q;
        unsigned long flags;
-       int ccpu, cpu, group_cpu;
 
        BUG_ON(!q->softirq_done_fn);
 
        local_irq_save(flags);
        cpu = smp_processor_id();
-       group_cpu = blk_cpu_to_group(cpu);
 
        /*
         * Select completion CPU
         */
-       if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags) && req->cpu != -1)
+       if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags) && req->cpu != -1) {
                ccpu = req->cpu;
-       else
+               if (!test_bit(QUEUE_FLAG_SAME_FORCE, &q->queue_flags)) {
+                       ccpu = blk_cpu_to_group(ccpu);
+                       group_cpu = blk_cpu_to_group(cpu);
+               }
+       } else
                ccpu = cpu;
 
        if (ccpu == cpu || ccpu == group_cpu) {
index d935bd859c87bc1c9a0e39eb61438583f91690f9..0ee17b5e7fb656235de604deb5838029456222ea 100644 (file)
@@ -244,8 +244,9 @@ static ssize_t queue_nomerges_store(struct request_queue *q, const char *page,
 static ssize_t queue_rq_affinity_show(struct request_queue *q, char *page)
 {
        bool set = test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags);
+       bool force = test_bit(QUEUE_FLAG_SAME_FORCE, &q->queue_flags);
 
-       return queue_var_show(set, page);
+       return queue_var_show(set << force, page);
 }
 
 static ssize_t
@@ -257,10 +258,14 @@ queue_rq_affinity_store(struct request_queue *q, const char *page, size_t count)
 
        ret = queue_var_store(&val, page, count);
        spin_lock_irq(q->queue_lock);
-       if (val)
+       if (val) {
                queue_flag_set(QUEUE_FLAG_SAME_COMP, q);
-       else
-               queue_flag_clear(QUEUE_FLAG_SAME_COMP,  q);
+               if (val == 2)
+                       queue_flag_set(QUEUE_FLAG_SAME_FORCE, q);
+       } else {
+               queue_flag_clear(QUEUE_FLAG_SAME_COMP, q);
+               queue_flag_clear(QUEUE_FLAG_SAME_FORCE, q);
+       }
        spin_unlock_irq(q->queue_lock);
 #endif
        return ret;
index c0cd9a2f22ef79c8145a3139eee643ebcc06b984..0e67c45b3bc95ec70c38836cd8801bb4e568f804 100644 (file)
@@ -392,7 +392,7 @@ struct request_queue {
 #define QUEUE_FLAG_ELVSWITCH   6       /* don't use elevator, just do FIFO */
 #define QUEUE_FLAG_BIDI                7       /* queue supports bidi requests */
 #define QUEUE_FLAG_NOMERGES     8      /* disable merge attempts */
-#define QUEUE_FLAG_SAME_COMP   9       /* force complete on same CPU */
+#define QUEUE_FLAG_SAME_COMP   9       /* complete on same CPU-group */
 #define QUEUE_FLAG_FAIL_IO     10      /* fake timeout */
 #define QUEUE_FLAG_STACKABLE   11      /* supports request stacking */
 #define QUEUE_FLAG_NONROT      12      /* non-rotational device (SSD) */
@@ -402,6 +402,7 @@ struct request_queue {
 #define QUEUE_FLAG_NOXMERGES   15      /* No extended merges */
 #define QUEUE_FLAG_ADD_RANDOM  16      /* Contributes to random pool */
 #define QUEUE_FLAG_SECDISCARD  17      /* supports SECDISCARD */
+#define QUEUE_FLAG_SAME_FORCE  18      /* force complete on same CPU */
 
 #define QUEUE_FLAG_DEFAULT     ((1 << QUEUE_FLAG_IO_STAT) |            \
                                 (1 << QUEUE_FLAG_STACKABLE)    |       \