From: Jens Axboe Date: Fri, 10 Jul 2015 05:17:35 +0000 (+0800) Subject: direct-io: only inc/dec inode->i_dio_count for file systems X-Git-Tag: firefly_0821_release~3927 X-Git-Url: http://demsky.eecs.uci.edu/git/?a=commitdiff_plain;h=7479f23c2acd51e977622833adbc6c5a84f8ed90;p=firefly-linux-kernel-4.4.55.git direct-io: only inc/dec inode->i_dio_count for file systems do_blockdev_direct_IO() increments and decrements the inode ->i_dio_count for each IO operation. It does this to protect against truncate of a file. Block devices don't need this sort of protection. For a capable multiqueue setup, this atomic int is the only shared state between applications accessing the device for O_DIRECT, and it presents a scaling wall for that. In my testing, as much as 30% of system time is spent incrementing and decrementing this value. A mixed read/write workload improved from ~2.5M IOPS to ~9.6M IOPS, with better latencies too. Before: clat percentiles (usec): | 1.00th=[ 33], 5.00th=[ 34], 10.00th=[ 34], 20.00th=[ 34], | 30.00th=[ 34], 40.00th=[ 34], 50.00th=[ 35], 60.00th=[ 35], | 70.00th=[ 35], 80.00th=[ 35], 90.00th=[ 37], 95.00th=[ 80], | 99.00th=[ 98], 99.50th=[ 151], 99.90th=[ 155], 99.95th=[ 155], | 99.99th=[ 165] After: clat percentiles (usec): | 1.00th=[ 95], 5.00th=[ 108], 10.00th=[ 129], 20.00th=[ 149], | 30.00th=[ 155], 40.00th=[ 161], 50.00th=[ 167], 60.00th=[ 171], | 70.00th=[ 177], 80.00th=[ 185], 90.00th=[ 201], 95.00th=[ 270], | 99.00th=[ 390], 99.50th=[ 398], 99.90th=[ 418], 99.95th=[ 422], | 99.99th=[ 438] In other setups, Robert Elliott reported seeing good performance improvements: https://lkml.org/lkml/2015/4/3/557 The more applications accessing the device, the worse it gets. Add a new direct-io flag, DIO_SKIP_DIO_COUNT, which tells do_blockdev_direct_IO() that it need not worry about incrementing or decrementing the inode i_dio_count for this caller. 
Cc: Andrew Morton Cc: Christoph Hellwig Cc: Theodore Ts'o Cc: Elliott, Robert (Server Storage) Cc: Al Viro Signed-off-by: Jens Axboe Signed-off-by: Al Viro Signed-off-by: Shawn Lin Tested-and-Reviewed-by: Shawn Lin --- diff --git a/fs/block_dev.c b/fs/block_dev.c index 85f5c85ec91c..66738e403f97 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -172,7 +172,7 @@ blkdev_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, struct inode *inode = file->f_mapping->host; return __blockdev_direct_IO(rw, iocb, inode, I_BDEV(inode), iov, offset, - nr_segs, blkdev_get_block, NULL, NULL, 0); + nr_segs, blkdev_get_block, NULL, NULL, DIO_SKIP_DIO_COUNT); } int __sync_blockdev(struct block_device *bdev, int wait) diff --git a/fs/direct-io.c b/fs/direct-io.c index 6e5dd6f5df1a..08a8c0373962 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -262,7 +262,9 @@ static ssize_t dio_complete(struct dio *dio, loff_t offset, ssize_t ret, bool is dio->end_io(dio->iocb, offset, transferred, dio->private, ret, is_async); } else { - inode_dio_done(dio->inode); + if (!(dio->flags & DIO_SKIP_DIO_COUNT)) + inode_dio_end(dio->inode); + if (is_async) aio_complete(dio->iocb, ret, 0); } @@ -1135,7 +1137,8 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, /* * Will be decremented at I/O completion time. */ - atomic_inc(&inode->i_dio_count); + if (!(dio->flags & DIO_SKIP_DIO_COUNT)) + inode_dio_begin(inode); /* * For file extending writes updating i_size before data diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c index 589061469687..167b64641409 100644 --- a/fs/ext4/indirect.c +++ b/fs/ext4/indirect.c @@ -691,18 +691,18 @@ retry: * via ext4_inode_block_unlocked_dio(). Check inode's state * while holding extra i_dio_count ref. 
*/ - atomic_inc(&inode->i_dio_count); + inode_dio_begin(inode); smp_mb(); if (unlikely(ext4_test_inode_state(inode, EXT4_STATE_DIOREAD_LOCK))) { - inode_dio_done(inode); + inode_dio_end(inode); goto locked; } ret = __blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, offset, nr_segs, ext4_get_block, NULL, NULL, 0); - inode_dio_done(inode); + inode_dio_end(inode); } else { locked: ret = blockdev_direct_IO(rw, iocb, inode, iov, diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 8a277505a3be..001f88f35e33 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3092,7 +3092,7 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) { ext4_free_io_end(io_end); out: - inode_dio_done(inode); + inode_dio_end(inode); if (is_async) aio_complete(iocb, ret, 0); return; @@ -3150,7 +3150,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb, overwrite = *((int *)iocb->private); if (overwrite) { - atomic_inc(&inode->i_dio_count); + inode_dio_begin(inode); down_read(&EXT4_I(inode)->i_data_sem); mutex_unlock(&inode->i_mutex); } @@ -3243,7 +3243,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb, retake_lock: /* take i_mutex locking again if we do a ovewrite dio */ if (overwrite) { - inode_dio_done(inode); + inode_dio_end(inode); up_read(&EXT4_I(inode)->i_data_sem); mutex_lock(&inode->i_mutex); } diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index b12a4427aedc..11c236686868 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -97,7 +97,7 @@ static int ext4_end_io(ext4_io_end_t *io) if (atomic_dec_and_test(&EXT4_I(inode)->i_unwritten)) wake_up_all(ext4_ioend_wq(inode)); if (io->flag & EXT4_IO_END_DIRECT) - inode_dio_done(inode); + inode_dio_end(inode); if (io->iocb) aio_complete(io->iocb, io->result, 0); return ret; diff --git a/fs/inode.c b/fs/inode.c index 1b300a06b8be..8b85f856ca4b 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -1887,16 +1887,3 @@ void inode_dio_wait(struct inode 
*inode) } EXPORT_SYMBOL(inode_dio_wait); -/* - * inode_dio_done - signal finish of a direct I/O requests - * @inode: inode the direct I/O happens on - * - * This is called once we've finished processing a direct I/O request, - * and is used to wake up callers waiting for direct I/O to be quiesced. - */ -void inode_dio_done(struct inode *inode) -{ - if (atomic_dec_and_test(&inode->i_dio_count)) - wake_up_bit(&inode->i_state, __I_DIO_WAKEUP); -} -EXPORT_SYMBOL(inode_dio_done); diff --git a/include/linux/fs.h b/include/linux/fs.h index 9a3437377f71..405b18e31621 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2461,6 +2461,12 @@ enum { /* filesystem does not support filling holes */ DIO_SKIP_HOLES = 0x02, + + /* filesystem can handle aio writes beyond i_size */ + DIO_ASYNC_EXTEND = 0x04, + + /* inode/fs/bdev does not need truncate protection */ + DIO_SKIP_DIO_COUNT = 0x08, }; void dio_end_io(struct bio *bio, int error); @@ -2481,8 +2487,32 @@ static inline ssize_t blockdev_direct_IO(int rw, struct kiocb *iocb, #endif void inode_dio_wait(struct inode *inode); -void inode_dio_done(struct inode *inode); + +/* + * inode_dio_begin - signal start of a direct I/O request + * @inode: inode the direct I/O happens on + * + * This is called when a direct I/O request is started; it increments + * i_dio_count, which is decremented again at I/O completion time. + */ +static inline void inode_dio_begin(struct inode *inode) +{ + atomic_inc(&inode->i_dio_count); +} + +/* + * inode_dio_end - signal finish of a direct I/O request + * @inode: inode the direct I/O happens on + * + * This is called once we've finished processing a direct I/O request, + * and is used to wake up callers waiting for direct I/O to be quiesced. 
+ */ +static inline void inode_dio_end(struct inode *inode) +{ + if (atomic_dec_and_test(&inode->i_dio_count)) + wake_up_bit(&inode->i_state, __I_DIO_WAKEUP); +} extern const struct file_operations generic_ro_fops; #define special_file(m) (S_ISCHR(m)||S_ISBLK(m)||S_ISFIFO(m)||S_ISSOCK(m))