#include <linux/init.h>
#include <linux/swap.h>
#include <linux/slab.h>
-#include <linux/loop.h>
#include <linux/compat.h>
#include <linux/suspend.h>
#include <linux/freezer.h>
#include <linux/sysfs.h>
#include <linux/miscdevice.h>
#include <linux/falloc.h>
+#include <linux/uio.h>
+#include "loop.h"
#include <asm/uaccess.h>
static int max_part;
static int part_shift;
-/*
- * Transfer functions
- */
-static int transfer_none(struct loop_device *lo, int cmd,
- struct page *raw_page, unsigned raw_off,
- struct page *loop_page, unsigned loop_off,
- int size, sector_t real_block)
-{
- char *raw_buf = kmap_atomic(raw_page) + raw_off;
- char *loop_buf = kmap_atomic(loop_page) + loop_off;
-
- if (cmd == READ)
- memcpy(loop_buf, raw_buf, size);
- else
- memcpy(raw_buf, loop_buf, size);
-
- kunmap_atomic(loop_buf);
- kunmap_atomic(raw_buf);
- cond_resched();
- return 0;
-}
-
static int transfer_xor(struct loop_device *lo, int cmd,
struct page *raw_page, unsigned raw_off,
struct page *loop_page, unsigned loop_off,
static struct loop_func_table none_funcs = {
.number = LO_CRYPT_NONE,
- .transfer = transfer_none,
-};
+};
static struct loop_func_table xor_funcs = {
.number = LO_CRYPT_XOR,
.transfer = transfer_xor,
.init = xor_init
-};
+};
/* xfer_funcs[0] is special - its release function is never called */
static struct loop_func_table *xfer_funcs[MAX_LO_CRYPT] = {
return get_size(lo->lo_offset, lo->lo_sizelimit, file);
}
+static void __loop_update_dio(struct loop_device *lo, bool dio)
+{
+ struct file *file = lo->lo_backing_file;
+ struct address_space *mapping = file->f_mapping;
+ struct inode *inode = mapping->host;
+ unsigned short sb_bsize = 0;
+ unsigned dio_align = 0;
+ bool use_dio;
+
+ if (inode->i_sb->s_bdev) {
+ sb_bsize = bdev_logical_block_size(inode->i_sb->s_bdev);
+ dio_align = sb_bsize - 1;
+ }
+
+ /*
+ * We support direct I/O only if lo_offset is aligned with the
+ * logical I/O size of backing device, and the logical block
+ * size of loop is bigger than the backing device's and the loop
+ * needn't transform transfer.
+ *
+ * TODO: the above condition may be loosed in the future, and
+ * direct I/O may be switched runtime at that time because most
+ * of requests in sane appplications should be PAGE_SIZE algined
+ */
+ if (dio) {
+ if (queue_logical_block_size(lo->lo_queue) >= sb_bsize &&
+ !(lo->lo_offset & dio_align) &&
+ mapping->a_ops->direct_IO &&
+ !lo->transfer)
+ use_dio = true;
+ else
+ use_dio = false;
+ } else {
+ use_dio = false;
+ }
+
+ if (lo->use_dio == use_dio)
+ return;
+
+ /* flush dirty pages before changing direct IO */
+ vfs_fsync(file, 0);
+
+ /*
+ * The flag of LO_FLAGS_DIRECT_IO is handled similarly with
+ * LO_FLAGS_READ_ONLY, both are set from kernel, and losetup
+ * will get updated by ioctl(LOOP_GET_STATUS)
+ */
+ blk_mq_freeze_queue(lo->lo_queue);
+ lo->use_dio = use_dio;
+ if (use_dio)
+ lo->lo_flags |= LO_FLAGS_DIRECT_IO;
+ else
+ lo->lo_flags &= ~LO_FLAGS_DIRECT_IO;
+ blk_mq_unfreeze_queue(lo->lo_queue);
+}
+
static int
figure_loop_size(struct loop_device *lo, loff_t offset, loff_t sizelimit)
{
struct page *lpage, unsigned loffs,
int size, sector_t rblock)
{
- if (unlikely(!lo->transfer))
+ int ret;
+
+ ret = lo->transfer(lo, cmd, rpage, roffs, lpage, loffs, size, rblock);
+ if (likely(!ret))
return 0;
- return lo->transfer(lo, cmd, rpage, roffs, lpage, loffs, size, rblock);
+ printk_ratelimited(KERN_ERR
+ "loop: Transfer error at byte offset %llu, length %i.\n",
+ (unsigned long long)rblock << 9, size);
+ return ret;
}
-/**
- * __do_lo_send_write - helper for writing data to a loop device
- *
- * This helper just factors out common code between do_lo_send_direct_write()
- * and do_lo_send_write().
- */
-static int __do_lo_send_write(struct file *file,
- u8 *buf, const int len, loff_t pos)
+static int lo_write_bvec(struct file *file, struct bio_vec *bvec, loff_t *ppos)
{
+ struct iov_iter i;
ssize_t bw;
- mm_segment_t old_fs = get_fs();
+
+ iov_iter_bvec(&i, ITER_BVEC, bvec, 1, bvec->bv_len);
file_start_write(file);
- set_fs(get_ds());
- bw = file->f_op->write(file, buf, len, &pos);
- set_fs(old_fs);
+ bw = vfs_iter_write(file, &i, ppos);
file_end_write(file);
- if (likely(bw == len))
+
+ if (likely(bw == bvec->bv_len))
return 0;
- printk(KERN_ERR "loop: Write error at byte offset %llu, length %i.\n",
- (unsigned long long)pos, len);
+
+ printk_ratelimited(KERN_ERR
+ "loop: Write error at byte offset %llu, length %i.\n",
+ (unsigned long long)*ppos, bvec->bv_len);
if (bw >= 0)
bw = -EIO;
return bw;
}
-/**
- * do_lo_send_direct_write - helper for writing data to a loop device
- *
- * This is the fast, non-transforming version that does not need double
- * buffering.
- */
-static int do_lo_send_direct_write(struct loop_device *lo,
- struct bio_vec *bvec, loff_t pos, struct page *page)
+static int lo_write_simple(struct loop_device *lo, struct request *rq,
+ loff_t pos)
{
- ssize_t bw = __do_lo_send_write(lo->lo_backing_file,
- kmap(bvec->bv_page) + bvec->bv_offset,
- bvec->bv_len, pos);
- kunmap(bvec->bv_page);
- cond_resched();
- return bw;
+ struct bio_vec bvec;
+ struct req_iterator iter;
+ int ret = 0;
+
+ rq_for_each_segment(bvec, rq, iter) {
+ ret = lo_write_bvec(lo->lo_backing_file, &bvec, &pos);
+ if (ret < 0)
+ break;
+ cond_resched();
+ }
+
+ return ret;
}
-/**
- * do_lo_send_write - helper for writing data to a loop device
- *
+/*
* This is the slow, transforming version that needs to double buffer the
* data as it cannot do the transformations in place without having direct
* access to the destination pages of the backing file.
*/
-static int do_lo_send_write(struct loop_device *lo, struct bio_vec *bvec,
- loff_t pos, struct page *page)
+static int lo_write_transfer(struct loop_device *lo, struct request *rq,
+ loff_t pos)
{
- int ret = lo_do_transfer(lo, WRITE, page, 0, bvec->bv_page,
- bvec->bv_offset, bvec->bv_len, pos >> 9);
- if (likely(!ret))
- return __do_lo_send_write(lo->lo_backing_file,
- page_address(page), bvec->bv_len,
- pos);
- printk(KERN_ERR "loop: Transfer error at byte offset %llu, "
- "length %i.\n", (unsigned long long)pos, bvec->bv_len);
- if (ret > 0)
- ret = -EIO;
- return ret;
-}
+ struct bio_vec bvec, b;
+ struct req_iterator iter;
+ struct page *page;
+ int ret = 0;
-static int lo_send(struct loop_device *lo, struct bio *bio, loff_t pos)
-{
- int (*do_lo_send)(struct loop_device *, struct bio_vec *, loff_t,
- struct page *page);
- struct bio_vec *bvec;
- struct page *page = NULL;
- int i, ret = 0;
-
- if (lo->transfer != transfer_none) {
- page = alloc_page(GFP_NOIO | __GFP_HIGHMEM);
- if (unlikely(!page))
- goto fail;
- kmap(page);
- do_lo_send = do_lo_send_write;
- } else {
- do_lo_send = do_lo_send_direct_write;
- }
+ page = alloc_page(GFP_NOIO);
+ if (unlikely(!page))
+ return -ENOMEM;
+
+ rq_for_each_segment(bvec, rq, iter) {
+ ret = lo_do_transfer(lo, WRITE, page, 0, bvec.bv_page,
+ bvec.bv_offset, bvec.bv_len, pos >> 9);
+ if (unlikely(ret))
+ break;
- bio_for_each_segment(bvec, bio, i) {
- ret = do_lo_send(lo, bvec, pos, page);
+ b.bv_page = page;
+ b.bv_offset = 0;
+ b.bv_len = bvec.bv_len;
+ ret = lo_write_bvec(lo->lo_backing_file, &b, &pos);
if (ret < 0)
break;
- pos += bvec->bv_len;
- }
- if (page) {
- kunmap(page);
- __free_page(page);
}
-out:
+
+ __free_page(page);
return ret;
-fail:
- printk(KERN_ERR "loop: Failed to allocate temporary page for write.\n");
- ret = -ENOMEM;
- goto out;
}
-struct lo_read_data {
- struct loop_device *lo;
- struct page *page;
- unsigned offset;
- int bsize;
-};
+static int lo_read_simple(struct loop_device *lo, struct request *rq,
+ loff_t pos)
+{
+ struct bio_vec bvec;
+ struct req_iterator iter;
+ struct iov_iter i;
+ ssize_t len;
-static int
-lo_splice_actor(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
- struct splice_desc *sd)
-{
- struct lo_read_data *p = sd->u.data;
- struct loop_device *lo = p->lo;
- struct page *page = buf->page;
- sector_t IV;
- int size;
-
- IV = ((sector_t) page->index << (PAGE_CACHE_SHIFT - 9)) +
- (buf->offset >> 9);
- size = sd->len;
- if (size > p->bsize)
- size = p->bsize;
-
- if (lo_do_transfer(lo, READ, page, buf->offset, p->page, p->offset, size, IV)) {
- printk(KERN_ERR "loop: transfer error block %ld\n",
- page->index);
- size = -EINVAL;
- }
+ rq_for_each_segment(bvec, rq, iter) {
+ iov_iter_bvec(&i, ITER_BVEC, &bvec, 1, bvec.bv_len);
+ len = vfs_iter_read(lo->lo_backing_file, &i, &pos);
+ if (len < 0)
+ return len;
- flush_dcache_page(p->page);
+ flush_dcache_page(bvec.bv_page);
- if (size > 0)
- p->offset += size;
+ if (len != bvec.bv_len) {
+ struct bio *bio;
- return size;
-}
+ __rq_for_each_bio(bio, rq)
+ zero_fill_bio(bio);
+ break;
+ }
+ cond_resched();
+ }
-static int
-lo_direct_splice_actor(struct pipe_inode_info *pipe, struct splice_desc *sd)
-{
- return __splice_from_pipe(pipe, sd, lo_splice_actor);
+ return 0;
}
-static ssize_t
-do_lo_receive(struct loop_device *lo,
- struct bio_vec *bvec, int bsize, loff_t pos)
+static int lo_read_transfer(struct loop_device *lo, struct request *rq,
+ loff_t pos)
{
- struct lo_read_data cookie;
- struct splice_desc sd;
- struct file *file;
- ssize_t retval;
+ struct bio_vec bvec, b;
+ struct req_iterator iter;
+ struct iov_iter i;
+ struct page *page;
+ ssize_t len;
+ int ret = 0;
- cookie.lo = lo;
- cookie.page = bvec->bv_page;
- cookie.offset = bvec->bv_offset;
- cookie.bsize = bsize;
+ page = alloc_page(GFP_NOIO);
+ if (unlikely(!page))
+ return -ENOMEM;
- sd.len = 0;
- sd.total_len = bvec->bv_len;
- sd.flags = 0;
- sd.pos = pos;
- sd.u.data = &cookie;
+ rq_for_each_segment(bvec, rq, iter) {
+ loff_t offset = pos;
- file = lo->lo_backing_file;
- retval = splice_direct_to_actor(file, &sd, lo_direct_splice_actor);
+ b.bv_page = page;
+ b.bv_offset = 0;
+ b.bv_len = bvec.bv_len;
- return retval;
-}
+ iov_iter_bvec(&i, ITER_BVEC, &b, 1, b.bv_len);
+ len = vfs_iter_read(lo->lo_backing_file, &i, &pos);
+ if (len < 0) {
+ ret = len;
+ goto out_free_page;
+ }
-static int
-lo_receive(struct loop_device *lo, struct bio *bio, int bsize, loff_t pos)
-{
- struct bio_vec *bvec;
- ssize_t s;
- int i;
+ ret = lo_do_transfer(lo, READ, page, 0, bvec.bv_page,
+ bvec.bv_offset, len, offset >> 9);
+ if (ret)
+ goto out_free_page;
+
+ flush_dcache_page(bvec.bv_page);
- bio_for_each_segment(bvec, bio, i) {
- s = do_lo_receive(lo, bvec, bsize, pos);
- if (s < 0)
- return s;
+ if (len != bvec.bv_len) {
+ struct bio *bio;
- if (s != bvec->bv_len) {
- zero_fill_bio(bio);
+ __rq_for_each_bio(bio, rq)
+ zero_fill_bio(bio);
break;
}
- pos += bvec->bv_len;
}
- return 0;
+
+ ret = 0;
+out_free_page:
+ __free_page(page);
+ return ret;
}
-static int do_bio_filebacked(struct loop_device *lo, struct bio *bio)
+static int lo_discard(struct loop_device *lo, struct request *rq, loff_t pos)
{
- loff_t pos;
+ /*
+ * We use punch hole to reclaim the free space used by the
+ * image a.k.a. discard. However we do not support discard if
+ * encryption is enabled, because it may give an attacker
+ * useful information.
+ */
+ struct file *file = lo->lo_backing_file;
+ int mode = FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE;
int ret;
- pos = ((loff_t) bio->bi_sector << 9) + lo->lo_offset;
+ if ((!file->f_op->fallocate) || lo->lo_encrypt_key_size) {
+ ret = -EOPNOTSUPP;
+ goto out;
+ }
- if (bio_rw(bio) == WRITE) {
- struct file *file = lo->lo_backing_file;
+ ret = file->f_op->fallocate(file, mode, pos, blk_rq_bytes(rq));
+ if (unlikely(ret && ret != -EINVAL && ret != -EOPNOTSUPP))
+ ret = -EIO;
+ out:
+ return ret;
+}
- if (bio->bi_rw & REQ_FLUSH) {
- ret = vfs_fsync(file, 0);
- if (unlikely(ret && ret != -EINVAL)) {
- ret = -EIO;
- goto out;
- }
- }
+static int lo_req_flush(struct loop_device *lo, struct request *rq)
+{
+ struct file *file = lo->lo_backing_file;
+ int ret = vfs_fsync(file, 0);
+ if (unlikely(ret && ret != -EINVAL))
+ ret = -EIO;
- /*
- * We use punch hole to reclaim the free space used by the
- * image a.k.a. discard. However we do not support discard if
- * encryption is enabled, because it may give an attacker
- * useful information.
- */
- if (bio->bi_rw & REQ_DISCARD) {
- struct file *file = lo->lo_backing_file;
- int mode = FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE;
-
- if ((!file->f_op->fallocate) ||
- lo->lo_encrypt_key_size) {
- ret = -EOPNOTSUPP;
- goto out;
- }
- ret = file->f_op->fallocate(file, mode, pos,
- bio->bi_size);
- if (unlikely(ret && ret != -EINVAL &&
- ret != -EOPNOTSUPP))
- ret = -EIO;
- goto out;
- }
+ return ret;
+}
- ret = lo_send(lo, bio, pos);
+static inline void handle_partial_read(struct loop_cmd *cmd, long bytes)
+{
+ if (bytes < 0 || (cmd->rq->cmd_flags & REQ_WRITE))
+ return;
- if ((bio->bi_rw & REQ_FUA) && !ret) {
- ret = vfs_fsync(file, 0);
- if (unlikely(ret && ret != -EINVAL))
- ret = -EIO;
- }
- } else
- ret = lo_receive(lo, bio, lo->lo_blocksize, pos);
+ if (unlikely(bytes < blk_rq_bytes(cmd->rq))) {
+ struct bio *bio = cmd->rq->bio;
-out:
- return ret;
+ bio_advance(bio, bytes);
+ zero_fill_bio(bio);
+ }
}
-/*
- * Add bio to back of pending list
- */
-static void loop_add_bio(struct loop_device *lo, struct bio *bio)
+static void lo_rw_aio_complete(struct kiocb *iocb, long ret, long ret2)
{
- lo->lo_bio_count++;
- bio_list_add(&lo->lo_bio_list, bio);
+ struct loop_cmd *cmd = container_of(iocb, struct loop_cmd, iocb);
+ struct request *rq = cmd->rq;
+
+ handle_partial_read(cmd, ret);
+
+ if (ret > 0)
+ ret = 0;
+ else if (ret < 0)
+ ret = -EIO;
+
+ blk_mq_complete_request(rq, ret);
}
-/*
- * Grab first pending buffer
- */
-static struct bio *loop_get_bio(struct loop_device *lo)
+static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd,
+ loff_t pos, bool rw)
{
- lo->lo_bio_count--;
- return bio_list_pop(&lo->lo_bio_list);
+ struct iov_iter iter;
+ struct bio_vec *bvec;
+ struct bio *bio = cmd->rq->bio;
+ struct file *file = lo->lo_backing_file;
+ int ret;
+
+ /* nomerge for loop request queue */
+ WARN_ON(cmd->rq->bio != cmd->rq->biotail);
+
+ bvec = __bvec_iter_bvec(bio->bi_io_vec, bio->bi_iter);
+ iov_iter_bvec(&iter, ITER_BVEC | rw, bvec,
+ bio_segments(bio), blk_rq_bytes(cmd->rq));
+ /*
+ * This bio may be started from the middle of the 'bvec'
+ * because of bio splitting, so offset from the bvec must
+ * be passed to iov iterator
+ */
+ iter.iov_offset = bio->bi_iter.bi_bvec_done;
+
+ cmd->iocb.ki_pos = pos;
+ cmd->iocb.ki_filp = file;
+ cmd->iocb.ki_complete = lo_rw_aio_complete;
+ cmd->iocb.ki_flags = IOCB_DIRECT;
+
+ if (rw == WRITE)
+ ret = file->f_op->write_iter(&cmd->iocb, &iter);
+ else
+ ret = file->f_op->read_iter(&cmd->iocb, &iter);
+
+ if (ret != -EIOCBQUEUED)
+ cmd->iocb.ki_complete(&cmd->iocb, ret, 0);
+ return 0;
}
-static void loop_make_request(struct request_queue *q, struct bio *old_bio)
+
+static inline int lo_rw_simple(struct loop_device *lo,
+ struct request *rq, loff_t pos, bool rw)
{
- struct loop_device *lo = q->queuedata;
- int rw = bio_rw(old_bio);
+ struct loop_cmd *cmd = blk_mq_rq_to_pdu(rq);
- if (rw == READA)
- rw = READ;
+ if (cmd->use_aio)
+ return lo_rw_aio(lo, cmd, pos, rw);
- BUG_ON(!lo || (rw != READ && rw != WRITE));
+ /*
+ * lo_write_simple and lo_read_simple should have been covered
+ * by io submit style function like lo_rw_aio(), one blocker
+ * is that lo_read_simple() need to call flush_dcache_page after
+ * the page is written from kernel, and it isn't easy to handle
+ * this in io submit style function which submits all segments
+ * of the req at one time. And direct read IO doesn't need to
+ * run flush_dcache_page().
+ */
+ if (rw == WRITE)
+ return lo_write_simple(lo, rq, pos);
+ else
+ return lo_read_simple(lo, rq, pos);
+}
- spin_lock_irq(&lo->lo_lock);
- if (lo->lo_state != Lo_bound)
- goto out;
- if (unlikely(rw == WRITE && (lo->lo_flags & LO_FLAGS_READ_ONLY)))
- goto out;
- if (lo->lo_bio_count >= q->nr_congestion_on)
- wait_event_lock_irq(lo->lo_req_wait,
- lo->lo_bio_count < q->nr_congestion_off,
- lo->lo_lock);
- loop_add_bio(lo, old_bio);
- wake_up(&lo->lo_event);
- spin_unlock_irq(&lo->lo_lock);
- return;
+static int do_req_filebacked(struct loop_device *lo, struct request *rq)
+{
+ loff_t pos;
+ int ret;
-out:
- spin_unlock_irq(&lo->lo_lock);
- bio_io_error(old_bio);
+ pos = ((loff_t) blk_rq_pos(rq) << 9) + lo->lo_offset;
+
+ if (rq->cmd_flags & REQ_WRITE) {
+ if (rq->cmd_flags & REQ_FLUSH)
+ ret = lo_req_flush(lo, rq);
+ else if (rq->cmd_flags & REQ_DISCARD)
+ ret = lo_discard(lo, rq, pos);
+ else if (lo->transfer)
+ ret = lo_write_transfer(lo, rq, pos);
+ else
+ ret = lo_rw_simple(lo, rq, pos, WRITE);
+
+ } else {
+ if (lo->transfer)
+ ret = lo_read_transfer(lo, rq, pos);
+ else
+ ret = lo_rw_simple(lo, rq, pos, READ);
+ }
+
+ return ret;
}
struct switch_request {
struct completion wait;
};
-static void do_loop_switch(struct loop_device *, struct switch_request *);
-
-static inline void loop_handle_bio(struct loop_device *lo, struct bio *bio)
+static inline void loop_update_dio(struct loop_device *lo)
{
- if (unlikely(!bio->bi_bdev)) {
- do_loop_switch(lo, bio->bi_private);
- bio_put(bio);
- } else {
- int ret = do_bio_filebacked(lo, bio);
- bio_endio(bio, ret);
- }
+ __loop_update_dio(lo, io_is_direct(lo->lo_backing_file) |
+ lo->use_dio);
}
/*
- * worker thread that handles reads/writes to file backed loop devices,
- * to avoid blocking in our make_request_fn. it also does loop decrypting
- * on reads for block backed loop, as that is too heavy to do from
- * b_end_io context where irqs may be disabled.
- *
- * Loop explanation: loop_clr_fd() sets lo_state to Lo_rundown before
- * calling kthread_stop(). Therefore once kthread_should_stop() is
- * true, make_request will not place any more requests. Therefore
- * once kthread_should_stop() is true and lo_bio is NULL, we are
- * done with the loop.
+ * Do the actual switch; called from the BIO completion routine
*/
-static int loop_thread(void *data)
+static void do_loop_switch(struct loop_device *lo, struct switch_request *p)
{
- struct loop_device *lo = data;
- struct bio *bio;
-
- set_user_nice(current, -20);
-
- while (!kthread_should_stop() || !bio_list_empty(&lo->lo_bio_list)) {
-
- wait_event_interruptible(lo->lo_event,
- !bio_list_empty(&lo->lo_bio_list) ||
- kthread_should_stop());
-
- if (bio_list_empty(&lo->lo_bio_list))
- continue;
- spin_lock_irq(&lo->lo_lock);
- bio = loop_get_bio(lo);
- if (lo->lo_bio_count < lo->lo_queue->nr_congestion_off)
- wake_up(&lo->lo_req_wait);
- spin_unlock_irq(&lo->lo_lock);
+ struct file *file = p->file;
+ struct file *old_file = lo->lo_backing_file;
+ struct address_space *mapping;
- BUG_ON(!bio);
- loop_handle_bio(lo, bio);
- }
+ /* if no new file, only flush of queued bios requested */
+ if (!file)
+ return;
- return 0;
+ mapping = file->f_mapping;
+ mapping_set_gfp_mask(old_file->f_mapping, lo->old_gfp_mask);
+ lo->lo_backing_file = file;
+ lo->lo_blocksize = S_ISBLK(mapping->host->i_mode) ?
+ mapping->host->i_bdev->bd_block_size : PAGE_SIZE;
+ lo->old_gfp_mask = mapping_gfp_mask(mapping);
+ mapping_set_gfp_mask(mapping, lo->old_gfp_mask & ~(__GFP_IO|__GFP_FS));
+ loop_update_dio(lo);
}
/*
static int loop_switch(struct loop_device *lo, struct file *file)
{
struct switch_request w;
- struct bio *bio = bio_alloc(GFP_KERNEL, 0);
- if (!bio)
- return -ENOMEM;
- init_completion(&w.wait);
+
w.file = file;
- bio->bi_private = &w;
- bio->bi_bdev = NULL;
- loop_make_request(lo->lo_queue, bio);
- wait_for_completion(&w.wait);
+
+ /* freeze queue and wait for completion of scheduled requests */
+ blk_mq_freeze_queue(lo->lo_queue);
+
+ /* do the switch action */
+ do_loop_switch(lo, &w);
+
+ /* unfreeze */
+ blk_mq_unfreeze_queue(lo->lo_queue);
+
return 0;
}
*/
static int loop_flush(struct loop_device *lo)
{
- /* loop not yet configured, no running thread, nothing to flush */
- if (!lo->lo_thread)
- return 0;
-
return loop_switch(lo, NULL);
}
-/*
- * Do the actual switch; called from the BIO completion routine
- */
-static void do_loop_switch(struct loop_device *lo, struct switch_request *p)
+static void loop_reread_partitions(struct loop_device *lo,
+ struct block_device *bdev)
{
- struct file *file = p->file;
- struct file *old_file = lo->lo_backing_file;
- struct address_space *mapping;
+ int rc;
- /* if no new file, only flush of queued bios requested */
- if (!file)
- goto out;
-
- mapping = file->f_mapping;
- mapping_set_gfp_mask(old_file->f_mapping, lo->old_gfp_mask);
- lo->lo_backing_file = file;
- lo->lo_blocksize = S_ISBLK(mapping->host->i_mode) ?
- mapping->host->i_bdev->bd_block_size : PAGE_SIZE;
- lo->old_gfp_mask = mapping_gfp_mask(mapping);
- mapping_set_gfp_mask(mapping, lo->old_gfp_mask & ~(__GFP_IO|__GFP_FS));
-out:
- complete(&p->wait);
+ /*
+ * bd_mutex has been held already in release path, so don't
+ * acquire it if this function is called in such case.
+ *
+ * If the reread partition isn't from release path, lo_refcnt
+ * must be at least one and it can only become zero when the
+ * current holder is released.
+ */
+ if (!atomic_read(&lo->lo_refcnt))
+ rc = __blkdev_reread_part(bdev);
+ else
+ rc = blkdev_reread_part(bdev);
+ if (rc)
+ pr_warn("%s: partition scan of loop%d (%s) failed (rc=%d)\n",
+ __func__, lo->lo_number, lo->lo_file_name, rc);
}
-
/*
* loop_change_fd switched the backing store of a loopback device to
* a new file. This is useful for operating system installers to free up
fput(old_file);
if (lo->lo_flags & LO_FLAGS_PARTSCAN)
- ioctl_by_bdev(bdev, BLKRRPART, 0);
+ loop_reread_partitions(lo, bdev);
return 0;
out_putf:
spin_lock_irq(&lo->lo_lock);
if (lo->lo_backing_file)
- p = d_path(&lo->lo_backing_file->f_path, buf, PAGE_SIZE - 1);
+ p = file_path(lo->lo_backing_file, buf, PAGE_SIZE - 1);
spin_unlock_irq(&lo->lo_lock);
if (IS_ERR_OR_NULL(p))
return sprintf(buf, "%s\n", partscan ? "1" : "0");
}
+static ssize_t loop_attr_dio_show(struct loop_device *lo, char *buf)
+{
+ int dio = (lo->lo_flags & LO_FLAGS_DIRECT_IO);
+
+ return sprintf(buf, "%s\n", dio ? "1" : "0");
+}
+
LOOP_ATTR_RO(backing_file);
LOOP_ATTR_RO(offset);
LOOP_ATTR_RO(sizelimit);
LOOP_ATTR_RO(autoclear);
LOOP_ATTR_RO(partscan);
+LOOP_ATTR_RO(dio);
static struct attribute *loop_attrs[] = {
&loop_attr_backing_file.attr,
&loop_attr_sizelimit.attr,
&loop_attr_autoclear.attr,
&loop_attr_partscan.attr,
+ &loop_attr_dio.attr,
NULL,
};
/*
* We use punch hole to reclaim the free space used by the
- * image a.k.a. discard. However we do support discard if
+ * image a.k.a. discard. However we do not support discard if
* encryption is enabled, because it may give an attacker
* useful information.
*/
lo->lo_encrypt_key_size) {
q->limits.discard_granularity = 0;
q->limits.discard_alignment = 0;
- q->limits.max_discard_sectors = 0;
+ blk_queue_max_discard_sectors(q, 0);
q->limits.discard_zeroes_data = 0;
queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, q);
return;
q->limits.discard_granularity = inode->i_sb->s_blocksize;
q->limits.discard_alignment = 0;
- q->limits.max_discard_sectors = UINT_MAX >> 9;
+ blk_queue_max_discard_sectors(q, UINT_MAX >> 9);
q->limits.discard_zeroes_data = 1;
queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q);
}
+static void loop_unprepare_queue(struct loop_device *lo)
+{
+ flush_kthread_worker(&lo->worker);
+ kthread_stop(lo->worker_task);
+}
+
+static int loop_prepare_queue(struct loop_device *lo)
+{
+ init_kthread_worker(&lo->worker);
+ lo->worker_task = kthread_run(kthread_worker_fn,
+ &lo->worker, "loop%d", lo->lo_number);
+ if (IS_ERR(lo->worker_task))
+ return -ENOMEM;
+ set_user_nice(lo->worker_task, MIN_NICE);
+ return 0;
+}
+
static int loop_set_fd(struct loop_device *lo, fmode_t mode,
struct block_device *bdev, unsigned int arg)
{
goto out_putf;
if (!(file->f_mode & FMODE_WRITE) || !(mode & FMODE_WRITE) ||
- !file->f_op->write)
+ !file->f_op->write_iter)
lo_flags |= LO_FLAGS_READ_ONLY;
lo_blocksize = S_ISBLK(inode->i_mode) ?
size = get_loop_size(lo, file);
if ((loff_t)(sector_t)size != size)
goto out_putf;
+ error = loop_prepare_queue(lo);
+ if (error)
+ goto out_putf;
error = 0;
set_device_ro(bdev, (lo_flags & LO_FLAGS_READ_ONLY) != 0);
+ lo->use_dio = false;
lo->lo_blocksize = lo_blocksize;
lo->lo_device = bdev;
lo->lo_flags = lo_flags;
lo->lo_backing_file = file;
- lo->transfer = transfer_none;
+ lo->transfer = NULL;
lo->ioctl = NULL;
lo->lo_sizelimit = 0;
- lo->lo_bio_count = 0;
lo->old_gfp_mask = mapping_gfp_mask(mapping);
mapping_set_gfp_mask(mapping, lo->old_gfp_mask & ~(__GFP_IO|__GFP_FS));
- bio_list_init(&lo->lo_bio_list);
-
- /*
- * set queue make_request_fn, and add limits based on lower level
- * device
- */
- blk_queue_make_request(lo->lo_queue, loop_make_request);
- lo->lo_queue->queuedata = lo;
-
if (!(lo_flags & LO_FLAGS_READ_ONLY) && file->f_op->fsync)
blk_queue_flush(lo->lo_queue, REQ_FLUSH);
+ loop_update_dio(lo);
set_capacity(lo->lo_disk, size);
bd_set_size(bdev, size << 9);
loop_sysfs_init(lo);
set_blocksize(bdev, lo_blocksize);
- lo->lo_thread = kthread_create(loop_thread, lo, "loop%d",
- lo->lo_number);
- if (IS_ERR(lo->lo_thread)) {
- error = PTR_ERR(lo->lo_thread);
- goto out_clr;
- }
lo->lo_state = Lo_bound;
- wake_up_process(lo->lo_thread);
if (part_shift)
lo->lo_flags |= LO_FLAGS_PARTSCAN;
if (lo->lo_flags & LO_FLAGS_PARTSCAN)
- ioctl_by_bdev(bdev, BLKRRPART, 0);
+ loop_reread_partitions(lo, bdev);
/* Grab the block_device to prevent its destruction after we
* put /dev/loopXX inode. Later in loop_clr_fd() we bdput(bdev).
bdgrab(bdev);
return 0;
-out_clr:
- loop_sysfs_exit(lo);
- lo->lo_thread = NULL;
- lo->lo_device = NULL;
- lo->lo_backing_file = NULL;
- lo->lo_flags = 0;
- set_capacity(lo->lo_disk, 0);
- invalidate_bdev(bdev);
- bd_set_size(bdev, 0);
- kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, KOBJ_CHANGE);
- mapping_set_gfp_mask(mapping, lo->old_gfp_mask);
- lo->lo_state = Lo_unbound;
out_putf:
fput(file);
out:
* <dev>/do something like mkfs/losetup -d <dev> causing the losetup -d
* command to fail with EBUSY.
*/
- if (lo->lo_refcnt > 1) {
+ if (atomic_read(&lo->lo_refcnt) > 1) {
lo->lo_flags |= LO_FLAGS_AUTOCLEAR;
mutex_unlock(&lo->lo_ctl_mutex);
return 0;
if (filp == NULL)
return -EINVAL;
- spin_lock_irq(&lo->lo_lock);
- lo->lo_state = Lo_rundown;
- spin_unlock_irq(&lo->lo_lock);
-
- kthread_stop(lo->lo_thread);
+ /* freeze request queue during the transition */
+ blk_mq_freeze_queue(lo->lo_queue);
spin_lock_irq(&lo->lo_lock);
+ lo->lo_state = Lo_rundown;
lo->lo_backing_file = NULL;
spin_unlock_irq(&lo->lo_lock);
lo->lo_offset = 0;
lo->lo_sizelimit = 0;
lo->lo_encrypt_key_size = 0;
- lo->lo_thread = NULL;
memset(lo->lo_encrypt_key, 0, LO_KEY_SIZE);
memset(lo->lo_crypt_name, 0, LO_NAME_SIZE);
memset(lo->lo_file_name, 0, LO_NAME_SIZE);
lo->lo_state = Lo_unbound;
/* This is safe: open() is still holding a reference. */
module_put(THIS_MODULE);
+ blk_mq_unfreeze_queue(lo->lo_queue);
+
if (lo->lo_flags & LO_FLAGS_PARTSCAN && bdev)
- ioctl_by_bdev(bdev, BLKRRPART, 0);
+ loop_reread_partitions(lo, bdev);
lo->lo_flags = 0;
if (!part_shift)
lo->lo_disk->flags |= GENHD_FL_NO_PART_SCAN;
+ loop_unprepare_queue(lo);
mutex_unlock(&lo->lo_ctl_mutex);
/*
* Need not hold lo_ctl_mutex to fput backing file.
!(lo->lo_flags & LO_FLAGS_PARTSCAN)) {
lo->lo_flags |= LO_FLAGS_PARTSCAN;
lo->lo_disk->flags &= ~GENHD_FL_NO_PART_SCAN;
- ioctl_by_bdev(lo->lo_device, BLKRRPART, 0);
+ loop_reread_partitions(lo, lo->lo_device);
}
lo->lo_encrypt_key_size = info->lo_encrypt_key_size;
memcpy(lo->lo_encrypt_key, info->lo_encrypt_key,
info->lo_encrypt_key_size);
lo->lo_key_owner = uid;
- }
+ }
+
+ /* update dio if lo_offset or transfer is changed */
+ __loop_update_dio(lo, lo->use_dio);
return 0;
}
return figure_loop_size(lo, lo->lo_offset, lo->lo_sizelimit);
}
+static int loop_set_dio(struct loop_device *lo, unsigned long arg)
+{
+ int error = -ENXIO;
+ if (lo->lo_state != Lo_bound)
+ goto out;
+
+ __loop_update_dio(lo, !!arg);
+ if (lo->use_dio == !!arg)
+ return 0;
+ error = -EINVAL;
+ out:
+ return error;
+}
+
static int lo_ioctl(struct block_device *bdev, fmode_t mode,
unsigned int cmd, unsigned long arg)
{
if ((mode & FMODE_WRITE) || capable(CAP_SYS_ADMIN))
err = loop_set_capacity(lo, bdev);
break;
+ case LOOP_SET_DIRECT_IO:
+ err = -EPERM;
+ if ((mode & FMODE_WRITE) || capable(CAP_SYS_ADMIN))
+ err = loop_set_dio(lo, arg);
+ break;
default:
err = lo->ioctl ? lo->ioctl(lo, cmd, arg) : -EINVAL;
}
goto out;
}
- mutex_lock(&lo->lo_ctl_mutex);
- lo->lo_refcnt++;
- mutex_unlock(&lo->lo_ctl_mutex);
+ atomic_inc(&lo->lo_refcnt);
out:
mutex_unlock(&loop_index_mutex);
return err;
struct loop_device *lo = disk->private_data;
int err;
- mutex_lock(&lo->lo_ctl_mutex);
-
- if (--lo->lo_refcnt)
- goto out;
+ if (atomic_dec_return(&lo->lo_refcnt))
+ return;
+ mutex_lock(&lo->lo_ctl_mutex);
if (lo->lo_flags & LO_FLAGS_AUTOCLEAR) {
/*
* In autoclear mode, stop the loop thread
loop_flush(lo);
}
-out:
mutex_unlock(&lo->lo_ctl_mutex);
}
EXPORT_SYMBOL(loop_register_transfer);
EXPORT_SYMBOL(loop_unregister_transfer);
+static int loop_queue_rq(struct blk_mq_hw_ctx *hctx,
+ const struct blk_mq_queue_data *bd)
+{
+ struct loop_cmd *cmd = blk_mq_rq_to_pdu(bd->rq);
+ struct loop_device *lo = cmd->rq->q->queuedata;
+
+ blk_mq_start_request(bd->rq);
+
+ if (lo->lo_state != Lo_bound)
+ return -EIO;
+
+ if (lo->use_dio && !(cmd->rq->cmd_flags & (REQ_FLUSH |
+ REQ_DISCARD)))
+ cmd->use_aio = true;
+ else
+ cmd->use_aio = false;
+
+ queue_kthread_work(&lo->worker, &cmd->work);
+
+ return BLK_MQ_RQ_QUEUE_OK;
+}
+
+static void loop_handle_cmd(struct loop_cmd *cmd)
+{
+ const bool write = cmd->rq->cmd_flags & REQ_WRITE;
+ struct loop_device *lo = cmd->rq->q->queuedata;
+ int ret = 0;
+
+ if (write && (lo->lo_flags & LO_FLAGS_READ_ONLY)) {
+ ret = -EIO;
+ goto failed;
+ }
+
+ ret = do_req_filebacked(lo, cmd->rq);
+ failed:
+ /* complete non-aio request */
+ if (!cmd->use_aio || ret)
+ blk_mq_complete_request(cmd->rq, ret ? -EIO : 0);
+}
+
+static void loop_queue_work(struct kthread_work *work)
+{
+ struct loop_cmd *cmd =
+ container_of(work, struct loop_cmd, work);
+
+ loop_handle_cmd(cmd);
+}
+
+static int loop_init_request(void *data, struct request *rq,
+ unsigned int hctx_idx, unsigned int request_idx,
+ unsigned int numa_node)
+{
+ struct loop_cmd *cmd = blk_mq_rq_to_pdu(rq);
+
+ cmd->rq = rq;
+ init_kthread_work(&cmd->work, loop_queue_work);
+
+ return 0;
+}
+
+static struct blk_mq_ops loop_mq_ops = {
+ .queue_rq = loop_queue_rq,
+ .map_queue = blk_mq_map_queue,
+ .init_request = loop_init_request,
+};
+
static int loop_add(struct loop_device **l, int i)
{
struct loop_device *lo;
if (!lo)
goto out;
+ lo->lo_state = Lo_unbound;
+
/* allocate id, if @id >= 0, we're requesting that specific id */
if (i >= 0) {
err = idr_alloc(&loop_index_idr, lo, i, i + 1, GFP_KERNEL);
i = err;
err = -ENOMEM;
- lo->lo_queue = blk_alloc_queue(GFP_KERNEL);
- if (!lo->lo_queue)
- goto out_free_dev;
+ lo->tag_set.ops = &loop_mq_ops;
+ lo->tag_set.nr_hw_queues = 1;
+ lo->tag_set.queue_depth = 128;
+ lo->tag_set.numa_node = NUMA_NO_NODE;
+ lo->tag_set.cmd_size = sizeof(struct loop_cmd);
+ lo->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE;
+ lo->tag_set.driver_data = lo;
+
+ err = blk_mq_alloc_tag_set(&lo->tag_set);
+ if (err)
+ goto out_free_idr;
+
+ lo->lo_queue = blk_mq_init_queue(&lo->tag_set);
+ if (IS_ERR_OR_NULL(lo->lo_queue)) {
+ err = PTR_ERR(lo->lo_queue);
+ goto out_cleanup_tags;
+ }
+ lo->lo_queue->queuedata = lo;
+
+ /*
+ * It doesn't make sense to enable merge because the I/O
+ * submitted to backing file is handled page by page.
+ */
+ queue_flag_set_unlocked(QUEUE_FLAG_NOMERGES, lo->lo_queue);
disk = lo->lo_disk = alloc_disk(1 << part_shift);
if (!disk)
disk->flags |= GENHD_FL_NO_PART_SCAN;
disk->flags |= GENHD_FL_EXT_DEVT;
mutex_init(&lo->lo_ctl_mutex);
+ atomic_set(&lo->lo_refcnt, 0);
lo->lo_number = i;
- lo->lo_thread = NULL;
- init_waitqueue_head(&lo->lo_event);
- init_waitqueue_head(&lo->lo_req_wait);
spin_lock_init(&lo->lo_lock);
disk->major = LOOP_MAJOR;
disk->first_minor = i << part_shift;
out_free_queue:
blk_cleanup_queue(lo->lo_queue);
+out_cleanup_tags:
+ blk_mq_free_tag_set(&lo->tag_set);
+out_free_idr:
+ idr_remove(&loop_index_idr, i);
out_free_dev:
kfree(lo);
out:
static void loop_remove(struct loop_device *lo)
{
- del_gendisk(lo->lo_disk);
blk_cleanup_queue(lo->lo_queue);
+ del_gendisk(lo->lo_disk);
+ blk_mq_free_tag_set(&lo->tag_set);
put_disk(lo->lo_disk);
kfree(lo);
}
if (err < 0)
err = loop_add(&lo, MINOR(dev) >> part_shift);
if (err < 0)
- kobj = ERR_PTR(err);
+ kobj = NULL;
else
kobj = get_disk(lo->lo_disk);
mutex_unlock(&loop_index_mutex);
mutex_unlock(&lo->lo_ctl_mutex);
break;
}
- if (lo->lo_refcnt > 0) {
+ if (atomic_read(&lo->lo_refcnt) > 0) {
ret = -EBUSY;
mutex_unlock(&lo->lo_ctl_mutex);
break;