Merge branch 'linux-linaro-lsk-v4.4' into linux-linaro-lsk-v4.4-android
[firefly-linux-kernel-4.4.55.git] / fs / fuse / file.c
1 /*
2   FUSE: Filesystem in Userspace
3   Copyright (C) 2001-2008  Miklos Szeredi <miklos@szeredi.hu>
4
5   This program can be distributed under the terms of the GNU GPL.
6   See the file COPYING.
7 */
8
9 #include "fuse_i.h"
10
11 #include <linux/pagemap.h>
12 #include <linux/slab.h>
13 #include <linux/kernel.h>
14 #include <linux/sched.h>
15 #include <linux/module.h>
16 #include <linux/compat.h>
17 #include <linux/swap.h>
18 #include <linux/falloc.h>
19 #include <linux/uio.h>
20
21 static const struct file_operations fuse_direct_io_file_operations;
22
23 static int fuse_send_open(struct fuse_conn *fc, u64 nodeid, struct file *file,
24                           int opcode, struct fuse_open_out *outargp)
25 {
26         struct fuse_open_in inarg;
27         FUSE_ARGS(args);
28
29         memset(&inarg, 0, sizeof(inarg));
30         inarg.flags = file->f_flags & ~(O_CREAT | O_EXCL | O_NOCTTY);
31         if (!fc->atomic_o_trunc)
32                 inarg.flags &= ~O_TRUNC;
33         args.in.h.opcode = opcode;
34         args.in.h.nodeid = nodeid;
35         args.in.numargs = 1;
36         args.in.args[0].size = sizeof(inarg);
37         args.in.args[0].value = &inarg;
38         args.out.numargs = 1;
39         args.out.args[0].size = sizeof(*outargp);
40         args.out.args[0].value = outargp;
41
42         return fuse_simple_request(fc, &args);
43 }
44
45 struct fuse_file *fuse_file_alloc(struct fuse_conn *fc)
46 {
47         struct fuse_file *ff;
48
49         ff = kmalloc(sizeof(struct fuse_file), GFP_KERNEL);
50         if (unlikely(!ff))
51                 return NULL;
52
53         ff->fc = fc;
54         ff->reserved_req = fuse_request_alloc(0);
55         if (unlikely(!ff->reserved_req)) {
56                 kfree(ff);
57                 return NULL;
58         }
59
60         INIT_LIST_HEAD(&ff->write_entry);
61         atomic_set(&ff->count, 0);
62         RB_CLEAR_NODE(&ff->polled_node);
63         init_waitqueue_head(&ff->poll_wait);
64
65         spin_lock(&fc->lock);
66         ff->kh = ++fc->khctr;
67         spin_unlock(&fc->lock);
68
69         return ff;
70 }
71
72 void fuse_file_free(struct fuse_file *ff)
73 {
74         fuse_request_free(ff->reserved_req);
75         kfree(ff);
76 }
77
78 struct fuse_file *fuse_file_get(struct fuse_file *ff)
79 {
80         atomic_inc(&ff->count);
81         return ff;
82 }
83
84 static void fuse_release_end(struct fuse_conn *fc, struct fuse_req *req)
85 {
86         iput(req->misc.release.inode);
87 }
88
89 static void fuse_file_put(struct fuse_file *ff, bool sync)
90 {
91         if (atomic_dec_and_test(&ff->count)) {
92                 struct fuse_req *req = ff->reserved_req;
93
94                 if (ff->fc->no_open) {
95                         /*
96                          * Drop the release request when client does not
97                          * implement 'open'
98                          */
99                         __clear_bit(FR_BACKGROUND, &req->flags);
100                         iput(req->misc.release.inode);
101                         fuse_put_request(ff->fc, req);
102                 } else if (sync) {
103                         __set_bit(FR_FORCE, &req->flags);
104                         __clear_bit(FR_BACKGROUND, &req->flags);
105                         fuse_request_send(ff->fc, req);
106                         iput(req->misc.release.inode);
107                         fuse_put_request(ff->fc, req);
108                 } else {
109                         req->end = fuse_release_end;
110                         __set_bit(FR_BACKGROUND, &req->flags);
111                         fuse_request_send_background(ff->fc, req);
112                 }
113                 kfree(ff);
114         }
115 }
116
117 int fuse_do_open(struct fuse_conn *fc, u64 nodeid, struct file *file,
118                  bool isdir)
119 {
120         struct fuse_file *ff;
121         int opcode = isdir ? FUSE_OPENDIR : FUSE_OPEN;
122
123         ff = fuse_file_alloc(fc);
124         if (!ff)
125                 return -ENOMEM;
126
127         ff->fh = 0;
128         ff->open_flags = FOPEN_KEEP_CACHE; /* Default for no-open */
129         if (!fc->no_open || isdir) {
130                 struct fuse_open_out outarg;
131                 int err;
132
133                 err = fuse_send_open(fc, nodeid, file, opcode, &outarg);
134                 if (!err) {
135                         ff->fh = outarg.fh;
136                         ff->open_flags = outarg.open_flags;
137
138                 } else if (err != -ENOSYS || isdir) {
139                         fuse_file_free(ff);
140                         return err;
141                 } else {
142                         fc->no_open = 1;
143                 }
144         }
145
146         if (isdir)
147                 ff->open_flags &= ~FOPEN_DIRECT_IO;
148
149         ff->nodeid = nodeid;
150         file->private_data = fuse_file_get(ff);
151
152         return 0;
153 }
154 EXPORT_SYMBOL_GPL(fuse_do_open);
155
156 static void fuse_link_write_file(struct file *file)
157 {
158         struct inode *inode = file_inode(file);
159         struct fuse_conn *fc = get_fuse_conn(inode);
160         struct fuse_inode *fi = get_fuse_inode(inode);
161         struct fuse_file *ff = file->private_data;
162         /*
163          * file may be written through mmap, so chain it onto the
164          * inodes's write_file list
165          */
166         spin_lock(&fc->lock);
167         if (list_empty(&ff->write_entry))
168                 list_add(&ff->write_entry, &fi->write_files);
169         spin_unlock(&fc->lock);
170 }
171
172 void fuse_finish_open(struct inode *inode, struct file *file)
173 {
174         struct fuse_file *ff = file->private_data;
175         struct fuse_conn *fc = get_fuse_conn(inode);
176
177         if (ff->open_flags & FOPEN_DIRECT_IO)
178                 file->f_op = &fuse_direct_io_file_operations;
179         if (!(ff->open_flags & FOPEN_KEEP_CACHE))
180                 invalidate_inode_pages2(inode->i_mapping);
181         if (ff->open_flags & FOPEN_NONSEEKABLE)
182                 nonseekable_open(inode, file);
183         if (fc->atomic_o_trunc && (file->f_flags & O_TRUNC)) {
184                 struct fuse_inode *fi = get_fuse_inode(inode);
185
186                 spin_lock(&fc->lock);
187                 fi->attr_version = ++fc->attr_version;
188                 i_size_write(inode, 0);
189                 spin_unlock(&fc->lock);
190                 fuse_invalidate_attr(inode);
191                 if (fc->writeback_cache)
192                         file_update_time(file);
193         }
194         if ((file->f_mode & FMODE_WRITE) && fc->writeback_cache)
195                 fuse_link_write_file(file);
196 }
197
198 int fuse_open_common(struct inode *inode, struct file *file, bool isdir)
199 {
200         struct fuse_conn *fc = get_fuse_conn(inode);
201         int err;
202         bool lock_inode = (file->f_flags & O_TRUNC) &&
203                           fc->atomic_o_trunc &&
204                           fc->writeback_cache;
205
206         err = generic_file_open(inode, file);
207         if (err)
208                 return err;
209
210         if (lock_inode)
211                 mutex_lock(&inode->i_mutex);
212
213         err = fuse_do_open(fc, get_node_id(inode), file, isdir);
214
215         if (!err)
216                 fuse_finish_open(inode, file);
217
218         if (lock_inode)
219                 mutex_unlock(&inode->i_mutex);
220
221         return err;
222 }
223
224 static void fuse_prepare_release(struct fuse_file *ff, int flags, int opcode)
225 {
226         struct fuse_conn *fc = ff->fc;
227         struct fuse_req *req = ff->reserved_req;
228         struct fuse_release_in *inarg = &req->misc.release.in;
229
230         spin_lock(&fc->lock);
231         list_del(&ff->write_entry);
232         if (!RB_EMPTY_NODE(&ff->polled_node))
233                 rb_erase(&ff->polled_node, &fc->polled_files);
234         spin_unlock(&fc->lock);
235
236         wake_up_interruptible_all(&ff->poll_wait);
237
238         inarg->fh = ff->fh;
239         inarg->flags = flags;
240         req->in.h.opcode = opcode;
241         req->in.h.nodeid = ff->nodeid;
242         req->in.numargs = 1;
243         req->in.args[0].size = sizeof(struct fuse_release_in);
244         req->in.args[0].value = inarg;
245 }
246
247 void fuse_release_common(struct file *file, int opcode)
248 {
249         struct fuse_file *ff;
250         struct fuse_req *req;
251
252         ff = file->private_data;
253         if (unlikely(!ff))
254                 return;
255
256         req = ff->reserved_req;
257         fuse_prepare_release(ff, file->f_flags, opcode);
258
259         if (ff->flock) {
260                 struct fuse_release_in *inarg = &req->misc.release.in;
261                 inarg->release_flags |= FUSE_RELEASE_FLOCK_UNLOCK;
262                 inarg->lock_owner = fuse_lock_owner_id(ff->fc,
263                                                        (fl_owner_t) file);
264         }
265         /* Hold inode until release is finished */
266         req->misc.release.inode = igrab(file_inode(file));
267
268         /*
269          * Normally this will send the RELEASE request, however if
270          * some asynchronous READ or WRITE requests are outstanding,
271          * the sending will be delayed.
272          *
273          * Make the release synchronous if this is a fuseblk mount,
274          * synchronous RELEASE is allowed (and desirable) in this case
275          * because the server can be trusted not to screw up.
276          */
277         fuse_file_put(ff, ff->fc->destroy_req != NULL);
278 }
279
280 static int fuse_open(struct inode *inode, struct file *file)
281 {
282         return fuse_open_common(inode, file, false);
283 }
284
285 static int fuse_release(struct inode *inode, struct file *file)
286 {
287         struct fuse_conn *fc = get_fuse_conn(inode);
288
289         /* see fuse_vma_close() for !writeback_cache case */
290         if (fc->writeback_cache)
291                 write_inode_now(inode, 1);
292
293         fuse_release_common(file, FUSE_RELEASE);
294
295         /* return value is ignored by VFS */
296         return 0;
297 }
298
299 void fuse_sync_release(struct fuse_file *ff, int flags)
300 {
301         WARN_ON(atomic_read(&ff->count) > 1);
302         fuse_prepare_release(ff, flags, FUSE_RELEASE);
303         __set_bit(FR_FORCE, &ff->reserved_req->flags);
304         __clear_bit(FR_BACKGROUND, &ff->reserved_req->flags);
305         fuse_request_send(ff->fc, ff->reserved_req);
306         fuse_put_request(ff->fc, ff->reserved_req);
307         kfree(ff);
308 }
309 EXPORT_SYMBOL_GPL(fuse_sync_release);
310
311 /*
312  * Scramble the ID space with XTEA, so that the value of the files_struct
313  * pointer is not exposed to userspace.
314  */
315 u64 fuse_lock_owner_id(struct fuse_conn *fc, fl_owner_t id)
316 {
317         u32 *k = fc->scramble_key;
318         u64 v = (unsigned long) id;
319         u32 v0 = v;
320         u32 v1 = v >> 32;
321         u32 sum = 0;
322         int i;
323
324         for (i = 0; i < 32; i++) {
325                 v0 += ((v1 << 4 ^ v1 >> 5) + v1) ^ (sum + k[sum & 3]);
326                 sum += 0x9E3779B9;
327                 v1 += ((v0 << 4 ^ v0 >> 5) + v0) ^ (sum + k[sum>>11 & 3]);
328         }
329
330         return (u64) v0 + ((u64) v1 << 32);
331 }
332
333 /*
334  * Check if any page in a range is under writeback
335  *
336  * This is currently done by walking the list of writepage requests
337  * for the inode, which can be pretty inefficient.
338  */
339 static bool fuse_range_is_writeback(struct inode *inode, pgoff_t idx_from,
340                                    pgoff_t idx_to)
341 {
342         struct fuse_conn *fc = get_fuse_conn(inode);
343         struct fuse_inode *fi = get_fuse_inode(inode);
344         struct fuse_req *req;
345         bool found = false;
346
347         spin_lock(&fc->lock);
348         list_for_each_entry(req, &fi->writepages, writepages_entry) {
349                 pgoff_t curr_index;
350
351                 BUG_ON(req->inode != inode);
352                 curr_index = req->misc.write.in.offset >> PAGE_CACHE_SHIFT;
353                 if (idx_from < curr_index + req->num_pages &&
354                     curr_index <= idx_to) {
355                         found = true;
356                         break;
357                 }
358         }
359         spin_unlock(&fc->lock);
360
361         return found;
362 }
363
364 static inline bool fuse_page_is_writeback(struct inode *inode, pgoff_t index)
365 {
366         return fuse_range_is_writeback(inode, index, index);
367 }
368
369 /*
370  * Wait for page writeback to be completed.
371  *
372  * Since fuse doesn't rely on the VM writeback tracking, this has to
373  * use some other means.
374  */
375 static int fuse_wait_on_page_writeback(struct inode *inode, pgoff_t index)
376 {
377         struct fuse_inode *fi = get_fuse_inode(inode);
378
379         wait_event(fi->page_waitq, !fuse_page_is_writeback(inode, index));
380         return 0;
381 }
382
/*
 * Wait until every pending writepage request on @inode has finished.
 *
 * Further writes are blocked with FUSE_NOWRITE, which waits for the writes
 * already sent, and are then allowed again straight away.
 *
 * The caller must hold i_mutex; otherwise the FUSE_NOWRITE usage could
 * conflict with truncation.
 */
static void fuse_sync_writes(struct inode *inode)
{
	/* Block new writes and wait for the ones in flight ... */
	fuse_set_nowrite(inode);
	/* ... then let writes through again */
	fuse_release_nowrite(inode);
}
397
398 static int fuse_flush(struct file *file, fl_owner_t id)
399 {
400         struct inode *inode = file_inode(file);
401         struct fuse_conn *fc = get_fuse_conn(inode);
402         struct fuse_file *ff = file->private_data;
403         struct fuse_req *req;
404         struct fuse_flush_in inarg;
405         int err;
406
407         if (is_bad_inode(inode))
408                 return -EIO;
409
410         if (fc->no_flush)
411                 return 0;
412
413         err = write_inode_now(inode, 1);
414         if (err)
415                 return err;
416
417         mutex_lock(&inode->i_mutex);
418         fuse_sync_writes(inode);
419         mutex_unlock(&inode->i_mutex);
420
421         if (test_bit(AS_ENOSPC, &file->f_mapping->flags) &&
422             test_and_clear_bit(AS_ENOSPC, &file->f_mapping->flags))
423                 err = -ENOSPC;
424         if (test_bit(AS_EIO, &file->f_mapping->flags) &&
425             test_and_clear_bit(AS_EIO, &file->f_mapping->flags))
426                 err = -EIO;
427         if (err)
428                 return err;
429
430         req = fuse_get_req_nofail_nopages(fc, file);
431         memset(&inarg, 0, sizeof(inarg));
432         inarg.fh = ff->fh;
433         inarg.lock_owner = fuse_lock_owner_id(fc, id);
434         req->in.h.opcode = FUSE_FLUSH;
435         req->in.h.nodeid = get_node_id(inode);
436         req->in.numargs = 1;
437         req->in.args[0].size = sizeof(inarg);
438         req->in.args[0].value = &inarg;
439         __set_bit(FR_FORCE, &req->flags);
440         fuse_request_send(fc, req);
441         err = req->out.h.error;
442         fuse_put_request(fc, req);
443         if (err == -ENOSYS) {
444                 fc->no_flush = 1;
445                 err = 0;
446         }
447         return err;
448 }
449
450 int fuse_fsync_common(struct file *file, loff_t start, loff_t end,
451                       int datasync, int isdir)
452 {
453         struct inode *inode = file->f_mapping->host;
454         struct fuse_conn *fc = get_fuse_conn(inode);
455         struct fuse_file *ff = file->private_data;
456         FUSE_ARGS(args);
457         struct fuse_fsync_in inarg;
458         int err;
459
460         if (is_bad_inode(inode))
461                 return -EIO;
462
463         mutex_lock(&inode->i_mutex);
464
465         /*
466          * Start writeback against all dirty pages of the inode, then
467          * wait for all outstanding writes, before sending the FSYNC
468          * request.
469          */
470         err = filemap_write_and_wait_range(inode->i_mapping, start, end);
471         if (err)
472                 goto out;
473
474         fuse_sync_writes(inode);
475
476         /*
477          * Due to implementation of fuse writeback
478          * filemap_write_and_wait_range() does not catch errors.
479          * We have to do this directly after fuse_sync_writes()
480          */
481         if (test_bit(AS_ENOSPC, &file->f_mapping->flags) &&
482             test_and_clear_bit(AS_ENOSPC, &file->f_mapping->flags))
483                 err = -ENOSPC;
484         if (test_bit(AS_EIO, &file->f_mapping->flags) &&
485             test_and_clear_bit(AS_EIO, &file->f_mapping->flags))
486                 err = -EIO;
487         if (err)
488                 goto out;
489
490         err = sync_inode_metadata(inode, 1);
491         if (err)
492                 goto out;
493
494         if ((!isdir && fc->no_fsync) || (isdir && fc->no_fsyncdir))
495                 goto out;
496
497         memset(&inarg, 0, sizeof(inarg));
498         inarg.fh = ff->fh;
499         inarg.fsync_flags = datasync ? 1 : 0;
500         args.in.h.opcode = isdir ? FUSE_FSYNCDIR : FUSE_FSYNC;
501         args.in.h.nodeid = get_node_id(inode);
502         args.in.numargs = 1;
503         args.in.args[0].size = sizeof(inarg);
504         args.in.args[0].value = &inarg;
505         err = fuse_simple_request(fc, &args);
506         if (err == -ENOSYS) {
507                 if (isdir)
508                         fc->no_fsyncdir = 1;
509                 else
510                         fc->no_fsync = 1;
511                 err = 0;
512         }
513 out:
514         mutex_unlock(&inode->i_mutex);
515         return err;
516 }
517
518 static int fuse_fsync(struct file *file, loff_t start, loff_t end,
519                       int datasync)
520 {
521         return fuse_fsync_common(file, start, end, datasync, 0);
522 }
523
524 void fuse_read_fill(struct fuse_req *req, struct file *file, loff_t pos,
525                     size_t count, int opcode)
526 {
527         struct fuse_read_in *inarg = &req->misc.read.in;
528         struct fuse_file *ff = file->private_data;
529
530         inarg->fh = ff->fh;
531         inarg->offset = pos;
532         inarg->size = count;
533         inarg->flags = file->f_flags;
534         req->in.h.opcode = opcode;
535         req->in.h.nodeid = ff->nodeid;
536         req->in.numargs = 1;
537         req->in.args[0].size = sizeof(struct fuse_read_in);
538         req->in.args[0].value = inarg;
539         req->out.argvar = 1;
540         req->out.numargs = 1;
541         req->out.args[0].size = count;
542 }
543
544 static void fuse_release_user_pages(struct fuse_req *req, bool should_dirty)
545 {
546         unsigned i;
547
548         for (i = 0; i < req->num_pages; i++) {
549                 struct page *page = req->pages[i];
550                 if (should_dirty)
551                         set_page_dirty_lock(page);
552                 put_page(page);
553         }
554 }
555
556 static void fuse_io_release(struct kref *kref)
557 {
558         kfree(container_of(kref, struct fuse_io_priv, refcnt));
559 }
560
561 static ssize_t fuse_get_res_by_io(struct fuse_io_priv *io)
562 {
563         if (io->err)
564                 return io->err;
565
566         if (io->bytes >= 0 && io->write)
567                 return -EIO;
568
569         return io->bytes < 0 ? io->size : io->bytes;
570 }
571
572 /**
573  * In case of short read, the caller sets 'pos' to the position of
574  * actual end of fuse request in IO request. Otherwise, if bytes_requested
575  * == bytes_transferred or rw == WRITE, the caller sets 'pos' to -1.
576  *
577  * An example:
578  * User requested DIO read of 64K. It was splitted into two 32K fuse requests,
579  * both submitted asynchronously. The first of them was ACKed by userspace as
580  * fully completed (req->out.args[0].size == 32K) resulting in pos == -1. The
581  * second request was ACKed as short, e.g. only 1K was read, resulting in
582  * pos == 33K.
583  *
584  * Thus, when all fuse requests are completed, the minimal non-negative 'pos'
585  * will be equal to the length of the longest contiguous fragment of
586  * transferred data starting from the beginning of IO request.
587  */
588 static void fuse_aio_complete(struct fuse_io_priv *io, int err, ssize_t pos)
589 {
590         bool is_sync = is_sync_kiocb(io->iocb);
591         int left;
592
593         spin_lock(&io->lock);
594         if (err)
595                 io->err = io->err ? : err;
596         else if (pos >= 0 && (io->bytes < 0 || pos < io->bytes))
597                 io->bytes = pos;
598
599         left = --io->reqs;
600         if (!left && is_sync)
601                 complete(io->done);
602         spin_unlock(&io->lock);
603
604         if (!left && !is_sync) {
605                 ssize_t res = fuse_get_res_by_io(io);
606
607                 if (res >= 0) {
608                         struct inode *inode = file_inode(io->iocb->ki_filp);
609                         struct fuse_conn *fc = get_fuse_conn(inode);
610                         struct fuse_inode *fi = get_fuse_inode(inode);
611
612                         spin_lock(&fc->lock);
613                         fi->attr_version = ++fc->attr_version;
614                         spin_unlock(&fc->lock);
615                 }
616
617                 io->iocb->ki_complete(io->iocb, res, 0);
618         }
619
620         kref_put(&io->refcnt, fuse_io_release);
621 }
622
623 static void fuse_aio_complete_req(struct fuse_conn *fc, struct fuse_req *req)
624 {
625         struct fuse_io_priv *io = req->io;
626         ssize_t pos = -1;
627
628         fuse_release_user_pages(req, !io->write);
629
630         if (io->write) {
631                 if (req->misc.write.in.size != req->misc.write.out.size)
632                         pos = req->misc.write.in.offset - io->offset +
633                                 req->misc.write.out.size;
634         } else {
635                 if (req->misc.read.in.size != req->out.args[0].size)
636                         pos = req->misc.read.in.offset - io->offset +
637                                 req->out.args[0].size;
638         }
639
640         fuse_aio_complete(io, req->out.h.error, pos);
641 }
642
643 static size_t fuse_async_req_send(struct fuse_conn *fc, struct fuse_req *req,
644                 size_t num_bytes, struct fuse_io_priv *io)
645 {
646         spin_lock(&io->lock);
647         kref_get(&io->refcnt);
648         io->size += num_bytes;
649         io->reqs++;
650         spin_unlock(&io->lock);
651
652         req->io = io;
653         req->end = fuse_aio_complete_req;
654
655         __fuse_get_request(req);
656         fuse_request_send_background(fc, req);
657
658         return num_bytes;
659 }
660
661 static size_t fuse_send_read(struct fuse_req *req, struct fuse_io_priv *io,
662                              loff_t pos, size_t count, fl_owner_t owner)
663 {
664         struct file *file = io->file;
665         struct fuse_file *ff = file->private_data;
666         struct fuse_conn *fc = ff->fc;
667
668         fuse_read_fill(req, file, pos, count, FUSE_READ);
669         if (owner != NULL) {
670                 struct fuse_read_in *inarg = &req->misc.read.in;
671
672                 inarg->read_flags |= FUSE_READ_LOCKOWNER;
673                 inarg->lock_owner = fuse_lock_owner_id(fc, owner);
674         }
675
676         if (io->async)
677                 return fuse_async_req_send(fc, req, count, io);
678
679         fuse_request_send(fc, req);
680         return req->out.args[0].size;
681 }
682
683 static void fuse_read_update_size(struct inode *inode, loff_t size,
684                                   u64 attr_ver)
685 {
686         struct fuse_conn *fc = get_fuse_conn(inode);
687         struct fuse_inode *fi = get_fuse_inode(inode);
688
689         spin_lock(&fc->lock);
690         if (attr_ver == fi->attr_version && size < inode->i_size &&
691             !test_bit(FUSE_I_SIZE_UNSTABLE, &fi->state)) {
692                 fi->attr_version = ++fc->attr_version;
693                 i_size_write(inode, size);
694         }
695         spin_unlock(&fc->lock);
696 }
697
698 static void fuse_short_read(struct fuse_req *req, struct inode *inode,
699                             u64 attr_ver)
700 {
701         size_t num_read = req->out.args[0].size;
702         struct fuse_conn *fc = get_fuse_conn(inode);
703
704         if (fc->writeback_cache) {
705                 /*
706                  * A hole in a file. Some data after the hole are in page cache,
707                  * but have not reached the client fs yet. So, the hole is not
708                  * present there.
709                  */
710                 int i;
711                 int start_idx = num_read >> PAGE_CACHE_SHIFT;
712                 size_t off = num_read & (PAGE_CACHE_SIZE - 1);
713
714                 for (i = start_idx; i < req->num_pages; i++) {
715                         zero_user_segment(req->pages[i], off, PAGE_CACHE_SIZE);
716                         off = 0;
717                 }
718         } else {
719                 loff_t pos = page_offset(req->pages[0]) + num_read;
720                 fuse_read_update_size(inode, pos, attr_ver);
721         }
722 }
723
724 static int fuse_do_readpage(struct file *file, struct page *page)
725 {
726         struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(file);
727         struct inode *inode = page->mapping->host;
728         struct fuse_conn *fc = get_fuse_conn(inode);
729         struct fuse_req *req;
730         size_t num_read;
731         loff_t pos = page_offset(page);
732         size_t count = PAGE_CACHE_SIZE;
733         u64 attr_ver;
734         int err;
735
736         /*
737          * Page writeback can extend beyond the lifetime of the
738          * page-cache page, so make sure we read a properly synced
739          * page.
740          */
741         fuse_wait_on_page_writeback(inode, page->index);
742
743         req = fuse_get_req(fc, 1);
744         if (IS_ERR(req))
745                 return PTR_ERR(req);
746
747         attr_ver = fuse_get_attr_version(fc);
748
749         req->out.page_zeroing = 1;
750         req->out.argpages = 1;
751         req->num_pages = 1;
752         req->pages[0] = page;
753         req->page_descs[0].length = count;
754         num_read = fuse_send_read(req, &io, pos, count, NULL);
755         err = req->out.h.error;
756
757         if (!err) {
758                 /*
759                  * Short read means EOF.  If file size is larger, truncate it
760                  */
761                 if (num_read < count)
762                         fuse_short_read(req, inode, attr_ver);
763
764                 SetPageUptodate(page);
765         }
766
767         fuse_put_request(fc, req);
768
769         return err;
770 }
771
772 static int fuse_readpage(struct file *file, struct page *page)
773 {
774         struct inode *inode = page->mapping->host;
775         int err;
776
777         err = -EIO;
778         if (is_bad_inode(inode))
779                 goto out;
780
781         err = fuse_do_readpage(file, page);
782         fuse_invalidate_atime(inode);
783  out:
784         unlock_page(page);
785         return err;
786 }
787
788 static void fuse_readpages_end(struct fuse_conn *fc, struct fuse_req *req)
789 {
790         int i;
791         size_t count = req->misc.read.in.size;
792         size_t num_read = req->out.args[0].size;
793         struct address_space *mapping = NULL;
794
795         for (i = 0; mapping == NULL && i < req->num_pages; i++)
796                 mapping = req->pages[i]->mapping;
797
798         if (mapping) {
799                 struct inode *inode = mapping->host;
800
801                 /*
802                  * Short read means EOF. If file size is larger, truncate it
803                  */
804                 if (!req->out.h.error && num_read < count)
805                         fuse_short_read(req, inode, req->misc.read.attr_ver);
806
807                 fuse_invalidate_atime(inode);
808         }
809
810         for (i = 0; i < req->num_pages; i++) {
811                 struct page *page = req->pages[i];
812                 if (!req->out.h.error)
813                         SetPageUptodate(page);
814                 else
815                         SetPageError(page);
816                 unlock_page(page);
817                 page_cache_release(page);
818         }
819         if (req->ff)
820                 fuse_file_put(req->ff, false);
821 }
822
823 static void fuse_send_readpages(struct fuse_req *req, struct file *file)
824 {
825         struct fuse_file *ff = file->private_data;
826         struct fuse_conn *fc = ff->fc;
827         loff_t pos = page_offset(req->pages[0]);
828         size_t count = req->num_pages << PAGE_CACHE_SHIFT;
829
830         req->out.argpages = 1;
831         req->out.page_zeroing = 1;
832         req->out.page_replace = 1;
833         fuse_read_fill(req, file, pos, count, FUSE_READ);
834         req->misc.read.attr_ver = fuse_get_attr_version(fc);
835         if (fc->async_read) {
836                 req->ff = fuse_file_get(ff);
837                 req->end = fuse_readpages_end;
838                 fuse_request_send_background(fc, req);
839         } else {
840                 fuse_request_send(fc, req);
841                 fuse_readpages_end(fc, req);
842                 fuse_put_request(fc, req);
843         }
844 }
845
/*
 * State handed from fuse_readpages() to fuse_readpages_fill() through
 * read_cache_pages().
 */
struct fuse_fill_data {
	/* Request currently collecting pages; replaced after each send */
	struct fuse_req *req;
	/* File being read; passed on to fuse_send_readpages() */
	struct file *file;
	/* Inode the pages belong to */
	struct inode *inode;
	/* Pages not yet added to a request; sizes the next allocation */
	unsigned int nr_pages;
};
852
853 static int fuse_readpages_fill(void *_data, struct page *page)
854 {
855         struct fuse_fill_data *data = _data;
856         struct fuse_req *req = data->req;
857         struct inode *inode = data->inode;
858         struct fuse_conn *fc = get_fuse_conn(inode);
859
860         fuse_wait_on_page_writeback(inode, page->index);
861
862         if (req->num_pages &&
863             (req->num_pages == FUSE_MAX_PAGES_PER_REQ ||
864              (req->num_pages + 1) * PAGE_CACHE_SIZE > fc->max_read ||
865              req->pages[req->num_pages - 1]->index + 1 != page->index)) {
866                 int nr_alloc = min_t(unsigned, data->nr_pages,
867                                      FUSE_MAX_PAGES_PER_REQ);
868                 fuse_send_readpages(req, data->file);
869                 if (fc->async_read)
870                         req = fuse_get_req_for_background(fc, nr_alloc);
871                 else
872                         req = fuse_get_req(fc, nr_alloc);
873
874                 data->req = req;
875                 if (IS_ERR(req)) {
876                         unlock_page(page);
877                         return PTR_ERR(req);
878                 }
879         }
880
881         if (WARN_ON(req->num_pages >= req->max_pages)) {
882                 fuse_put_request(fc, req);
883                 return -EIO;
884         }
885
886         page_cache_get(page);
887         req->pages[req->num_pages] = page;
888         req->page_descs[req->num_pages].length = PAGE_SIZE;
889         req->num_pages++;
890         data->nr_pages--;
891         return 0;
892 }
893
894 static int fuse_readpages(struct file *file, struct address_space *mapping,
895                           struct list_head *pages, unsigned nr_pages)
896 {
897         struct inode *inode = mapping->host;
898         struct fuse_conn *fc = get_fuse_conn(inode);
899         struct fuse_fill_data data;
900         int err;
901         int nr_alloc = min_t(unsigned, nr_pages, FUSE_MAX_PAGES_PER_REQ);
902
903         err = -EIO;
904         if (is_bad_inode(inode))
905                 goto out;
906
907         data.file = file;
908         data.inode = inode;
909         if (fc->async_read)
910                 data.req = fuse_get_req_for_background(fc, nr_alloc);
911         else
912                 data.req = fuse_get_req(fc, nr_alloc);
913         data.nr_pages = nr_pages;
914         err = PTR_ERR(data.req);
915         if (IS_ERR(data.req))
916                 goto out;
917
918         err = read_cache_pages(mapping, pages, fuse_readpages_fill, &data);
919         if (!err) {
920                 if (data.req->num_pages)
921                         fuse_send_readpages(data.req, file);
922                 else
923                         fuse_put_request(fc, data.req);
924         }
925 out:
926         return err;
927 }
928
929 static ssize_t fuse_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
930 {
931         struct inode *inode = iocb->ki_filp->f_mapping->host;
932         struct fuse_conn *fc = get_fuse_conn(inode);
933
934         /*
935          * In auto invalidate mode, always update attributes on read.
936          * Otherwise, only update if we attempt to read past EOF (to ensure
937          * i_size is up to date).
938          */
939         if (fc->auto_inval_data ||
940             (iocb->ki_pos + iov_iter_count(to) > i_size_read(inode))) {
941                 int err;
942                 err = fuse_update_attributes(inode, NULL, iocb->ki_filp, NULL);
943                 if (err)
944                         return err;
945         }
946
947         return generic_file_read_iter(iocb, to);
948 }
949
950 static void fuse_write_fill(struct fuse_req *req, struct fuse_file *ff,
951                             loff_t pos, size_t count)
952 {
953         struct fuse_write_in *inarg = &req->misc.write.in;
954         struct fuse_write_out *outarg = &req->misc.write.out;
955
956         inarg->fh = ff->fh;
957         inarg->offset = pos;
958         inarg->size = count;
959         req->in.h.opcode = FUSE_WRITE;
960         req->in.h.nodeid = ff->nodeid;
961         req->in.numargs = 2;
962         if (ff->fc->minor < 9)
963                 req->in.args[0].size = FUSE_COMPAT_WRITE_IN_SIZE;
964         else
965                 req->in.args[0].size = sizeof(struct fuse_write_in);
966         req->in.args[0].value = inarg;
967         req->in.args[1].size = count;
968         req->out.numargs = 1;
969         req->out.args[0].size = sizeof(struct fuse_write_out);
970         req->out.args[0].value = outarg;
971 }
972
973 static size_t fuse_send_write(struct fuse_req *req, struct fuse_io_priv *io,
974                               loff_t pos, size_t count, fl_owner_t owner)
975 {
976         struct file *file = io->file;
977         struct fuse_file *ff = file->private_data;
978         struct fuse_conn *fc = ff->fc;
979         struct fuse_write_in *inarg = &req->misc.write.in;
980
981         fuse_write_fill(req, ff, pos, count);
982         inarg->flags = file->f_flags;
983         if (owner != NULL) {
984                 inarg->write_flags |= FUSE_WRITE_LOCKOWNER;
985                 inarg->lock_owner = fuse_lock_owner_id(fc, owner);
986         }
987
988         if (io->async)
989                 return fuse_async_req_send(fc, req, count, io);
990
991         fuse_request_send(fc, req);
992         return req->misc.write.out.size;
993 }
994
995 bool fuse_write_update_size(struct inode *inode, loff_t pos)
996 {
997         struct fuse_conn *fc = get_fuse_conn(inode);
998         struct fuse_inode *fi = get_fuse_inode(inode);
999         bool ret = false;
1000
1001         spin_lock(&fc->lock);
1002         fi->attr_version = ++fc->attr_version;
1003         if (pos > inode->i_size) {
1004                 i_size_write(inode, pos);
1005                 ret = true;
1006         }
1007         spin_unlock(&fc->lock);
1008
1009         return ret;
1010 }
1011
1012 static size_t fuse_send_write_pages(struct fuse_req *req, struct file *file,
1013                                     struct inode *inode, loff_t pos,
1014                                     size_t count)
1015 {
1016         size_t res;
1017         unsigned offset;
1018         unsigned i;
1019         struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(file);
1020
1021         for (i = 0; i < req->num_pages; i++)
1022                 fuse_wait_on_page_writeback(inode, req->pages[i]->index);
1023
1024         res = fuse_send_write(req, &io, pos, count, NULL);
1025
1026         offset = req->page_descs[0].offset;
1027         count = res;
1028         for (i = 0; i < req->num_pages; i++) {
1029                 struct page *page = req->pages[i];
1030
1031                 if (!req->out.h.error && !offset && count >= PAGE_CACHE_SIZE)
1032                         SetPageUptodate(page);
1033
1034                 if (count > PAGE_CACHE_SIZE - offset)
1035                         count -= PAGE_CACHE_SIZE - offset;
1036                 else
1037                         count = 0;
1038                 offset = 0;
1039
1040                 unlock_page(page);
1041                 page_cache_release(page);
1042         }
1043
1044         return res;
1045 }
1046
1047 static ssize_t fuse_fill_write_pages(struct fuse_req *req,
1048                                struct address_space *mapping,
1049                                struct iov_iter *ii, loff_t pos)
1050 {
1051         struct fuse_conn *fc = get_fuse_conn(mapping->host);
1052         unsigned offset = pos & (PAGE_CACHE_SIZE - 1);
1053         size_t count = 0;
1054         int err;
1055
1056         req->in.argpages = 1;
1057         req->page_descs[0].offset = offset;
1058
1059         do {
1060                 size_t tmp;
1061                 struct page *page;
1062                 pgoff_t index = pos >> PAGE_CACHE_SHIFT;
1063                 size_t bytes = min_t(size_t, PAGE_CACHE_SIZE - offset,
1064                                      iov_iter_count(ii));
1065
1066                 bytes = min_t(size_t, bytes, fc->max_write - count);
1067
1068  again:
1069                 err = -EFAULT;
1070                 if (iov_iter_fault_in_readable(ii, bytes))
1071                         break;
1072
1073                 err = -ENOMEM;
1074                 page = grab_cache_page_write_begin(mapping, index, 0);
1075                 if (!page)
1076                         break;
1077
1078                 if (mapping_writably_mapped(mapping))
1079                         flush_dcache_page(page);
1080
1081                 tmp = iov_iter_copy_from_user_atomic(page, ii, offset, bytes);
1082                 flush_dcache_page(page);
1083
1084                 iov_iter_advance(ii, tmp);
1085                 if (!tmp) {
1086                         unlock_page(page);
1087                         page_cache_release(page);
1088                         bytes = min(bytes, iov_iter_single_seg_count(ii));
1089                         goto again;
1090                 }
1091
1092                 err = 0;
1093                 req->pages[req->num_pages] = page;
1094                 req->page_descs[req->num_pages].length = tmp;
1095                 req->num_pages++;
1096
1097                 count += tmp;
1098                 pos += tmp;
1099                 offset += tmp;
1100                 if (offset == PAGE_CACHE_SIZE)
1101                         offset = 0;
1102
1103                 if (!fc->big_writes)
1104                         break;
1105         } while (iov_iter_count(ii) && count < fc->max_write &&
1106                  req->num_pages < req->max_pages && offset == 0);
1107
1108         return count > 0 ? count : err;
1109 }
1110
1111 static inline unsigned fuse_wr_pages(loff_t pos, size_t len)
1112 {
1113         return min_t(unsigned,
1114                      ((pos + len - 1) >> PAGE_CACHE_SHIFT) -
1115                      (pos >> PAGE_CACHE_SHIFT) + 1,
1116                      FUSE_MAX_PAGES_PER_REQ);
1117 }
1118
1119 static ssize_t fuse_perform_write(struct file *file,
1120                                   struct address_space *mapping,
1121                                   struct iov_iter *ii, loff_t pos)
1122 {
1123         struct inode *inode = mapping->host;
1124         struct fuse_conn *fc = get_fuse_conn(inode);
1125         struct fuse_inode *fi = get_fuse_inode(inode);
1126         int err = 0;
1127         ssize_t res = 0;
1128
1129         if (is_bad_inode(inode))
1130                 return -EIO;
1131
1132         if (inode->i_size < pos + iov_iter_count(ii))
1133                 set_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);
1134
1135         do {
1136                 struct fuse_req *req;
1137                 ssize_t count;
1138                 unsigned nr_pages = fuse_wr_pages(pos, iov_iter_count(ii));
1139
1140                 req = fuse_get_req(fc, nr_pages);
1141                 if (IS_ERR(req)) {
1142                         err = PTR_ERR(req);
1143                         break;
1144                 }
1145
1146                 count = fuse_fill_write_pages(req, mapping, ii, pos);
1147                 if (count <= 0) {
1148                         err = count;
1149                 } else {
1150                         size_t num_written;
1151
1152                         num_written = fuse_send_write_pages(req, file, inode,
1153                                                             pos, count);
1154                         err = req->out.h.error;
1155                         if (!err) {
1156                                 res += num_written;
1157                                 pos += num_written;
1158
1159                                 /* break out of the loop on short write */
1160                                 if (num_written != count)
1161                                         err = -EIO;
1162                         }
1163                 }
1164                 fuse_put_request(fc, req);
1165         } while (!err && iov_iter_count(ii));
1166
1167         if (res > 0)
1168                 fuse_write_update_size(inode, pos);
1169
1170         clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);
1171         fuse_invalidate_attr(inode);
1172
1173         return res > 0 ? res : err;
1174 }
1175
1176 static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
1177 {
1178         struct file *file = iocb->ki_filp;
1179         struct address_space *mapping = file->f_mapping;
1180         ssize_t written = 0;
1181         ssize_t written_buffered = 0;
1182         struct inode *inode = mapping->host;
1183         ssize_t err;
1184         loff_t endbyte = 0;
1185
1186         if (get_fuse_conn(inode)->writeback_cache) {
1187                 /* Update size (EOF optimization) and mode (SUID clearing) */
1188                 err = fuse_update_attributes(mapping->host, NULL, file, NULL);
1189                 if (err)
1190                         return err;
1191
1192                 return generic_file_write_iter(iocb, from);
1193         }
1194
1195         mutex_lock(&inode->i_mutex);
1196
1197         /* We can write back this queue in page reclaim */
1198         current->backing_dev_info = inode_to_bdi(inode);
1199
1200         err = generic_write_checks(iocb, from);
1201         if (err <= 0)
1202                 goto out;
1203
1204         err = file_remove_privs(file);
1205         if (err)
1206                 goto out;
1207
1208         err = file_update_time(file);
1209         if (err)
1210                 goto out;
1211
1212         if (iocb->ki_flags & IOCB_DIRECT) {
1213                 loff_t pos = iocb->ki_pos;
1214                 written = generic_file_direct_write(iocb, from, pos);
1215                 if (written < 0 || !iov_iter_count(from))
1216                         goto out;
1217
1218                 pos += written;
1219
1220                 written_buffered = fuse_perform_write(file, mapping, from, pos);
1221                 if (written_buffered < 0) {
1222                         err = written_buffered;
1223                         goto out;
1224                 }
1225                 endbyte = pos + written_buffered - 1;
1226
1227                 err = filemap_write_and_wait_range(file->f_mapping, pos,
1228                                                    endbyte);
1229                 if (err)
1230                         goto out;
1231
1232                 invalidate_mapping_pages(file->f_mapping,
1233                                          pos >> PAGE_CACHE_SHIFT,
1234                                          endbyte >> PAGE_CACHE_SHIFT);
1235
1236                 written += written_buffered;
1237                 iocb->ki_pos = pos + written_buffered;
1238         } else {
1239                 written = fuse_perform_write(file, mapping, from, iocb->ki_pos);
1240                 if (written >= 0)
1241                         iocb->ki_pos += written;
1242         }
1243 out:
1244         current->backing_dev_info = NULL;
1245         mutex_unlock(&inode->i_mutex);
1246
1247         return written ? written : err;
1248 }
1249
1250 static inline void fuse_page_descs_length_init(struct fuse_req *req,
1251                 unsigned index, unsigned nr_pages)
1252 {
1253         int i;
1254
1255         for (i = index; i < index + nr_pages; i++)
1256                 req->page_descs[i].length = PAGE_SIZE -
1257                         req->page_descs[i].offset;
1258 }
1259
1260 static inline unsigned long fuse_get_user_addr(const struct iov_iter *ii)
1261 {
1262         return (unsigned long)ii->iov->iov_base + ii->iov_offset;
1263 }
1264
1265 static inline size_t fuse_get_frag_size(const struct iov_iter *ii,
1266                                         size_t max_size)
1267 {
1268         return min(iov_iter_single_seg_count(ii), max_size);
1269 }
1270
1271 static int fuse_get_user_pages(struct fuse_req *req, struct iov_iter *ii,
1272                                size_t *nbytesp, int write)
1273 {
1274         size_t nbytes = 0;  /* # bytes already packed in req */
1275
1276         /* Special case for kernel I/O: can copy directly into the buffer */
1277         if (ii->type & ITER_KVEC) {
1278                 unsigned long user_addr = fuse_get_user_addr(ii);
1279                 size_t frag_size = fuse_get_frag_size(ii, *nbytesp);
1280
1281                 if (write)
1282                         req->in.args[1].value = (void *) user_addr;
1283                 else
1284                         req->out.args[0].value = (void *) user_addr;
1285
1286                 iov_iter_advance(ii, frag_size);
1287                 *nbytesp = frag_size;
1288                 return 0;
1289         }
1290
1291         while (nbytes < *nbytesp && req->num_pages < req->max_pages) {
1292                 unsigned npages;
1293                 size_t start;
1294                 ssize_t ret = iov_iter_get_pages(ii,
1295                                         &req->pages[req->num_pages],
1296                                         *nbytesp - nbytes,
1297                                         req->max_pages - req->num_pages,
1298                                         &start);
1299                 if (ret < 0)
1300                         return ret;
1301
1302                 iov_iter_advance(ii, ret);
1303                 nbytes += ret;
1304
1305                 ret += start;
1306                 npages = (ret + PAGE_SIZE - 1) / PAGE_SIZE;
1307
1308                 req->page_descs[req->num_pages].offset = start;
1309                 fuse_page_descs_length_init(req, req->num_pages, npages);
1310
1311                 req->num_pages += npages;
1312                 req->page_descs[req->num_pages - 1].length -=
1313                         (PAGE_SIZE - ret) & (PAGE_SIZE - 1);
1314         }
1315
1316         if (write)
1317                 req->in.argpages = 1;
1318         else
1319                 req->out.argpages = 1;
1320
1321         *nbytesp = nbytes;
1322
1323         return 0;
1324 }
1325
1326 static inline int fuse_iter_npages(const struct iov_iter *ii_p)
1327 {
1328         return iov_iter_npages(ii_p, FUSE_MAX_PAGES_PER_REQ);
1329 }
1330
1331 ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter,
1332                        loff_t *ppos, int flags)
1333 {
1334         int write = flags & FUSE_DIO_WRITE;
1335         bool should_dirty = !write && iter_is_iovec(iter);
1336         int cuse = flags & FUSE_DIO_CUSE;
1337         struct file *file = io->file;
1338         struct inode *inode = file->f_mapping->host;
1339         struct fuse_file *ff = file->private_data;
1340         struct fuse_conn *fc = ff->fc;
1341         size_t nmax = write ? fc->max_write : fc->max_read;
1342         loff_t pos = *ppos;
1343         size_t count = iov_iter_count(iter);
1344         pgoff_t idx_from = pos >> PAGE_CACHE_SHIFT;
1345         pgoff_t idx_to = (pos + count - 1) >> PAGE_CACHE_SHIFT;
1346         ssize_t res = 0;
1347         struct fuse_req *req;
1348
1349         if (io->async)
1350                 req = fuse_get_req_for_background(fc, fuse_iter_npages(iter));
1351         else
1352                 req = fuse_get_req(fc, fuse_iter_npages(iter));
1353         if (IS_ERR(req))
1354                 return PTR_ERR(req);
1355
1356         if (!cuse && fuse_range_is_writeback(inode, idx_from, idx_to)) {
1357                 if (!write)
1358                         mutex_lock(&inode->i_mutex);
1359                 fuse_sync_writes(inode);
1360                 if (!write)
1361                         mutex_unlock(&inode->i_mutex);
1362         }
1363
1364         while (count) {
1365                 size_t nres;
1366                 fl_owner_t owner = current->files;
1367                 size_t nbytes = min(count, nmax);
1368                 int err = fuse_get_user_pages(req, iter, &nbytes, write);
1369                 if (err) {
1370                         res = err;
1371                         break;
1372                 }
1373
1374                 if (write)
1375                         nres = fuse_send_write(req, io, pos, nbytes, owner);
1376                 else
1377                         nres = fuse_send_read(req, io, pos, nbytes, owner);
1378
1379                 if (!io->async)
1380                         fuse_release_user_pages(req, should_dirty);
1381                 if (req->out.h.error) {
1382                         if (!res)
1383                                 res = req->out.h.error;
1384                         break;
1385                 } else if (nres > nbytes) {
1386                         res = -EIO;
1387                         break;
1388                 }
1389                 count -= nres;
1390                 res += nres;
1391                 pos += nres;
1392                 if (nres != nbytes)
1393                         break;
1394                 if (count) {
1395                         fuse_put_request(fc, req);
1396                         if (io->async)
1397                                 req = fuse_get_req_for_background(fc,
1398                                         fuse_iter_npages(iter));
1399                         else
1400                                 req = fuse_get_req(fc, fuse_iter_npages(iter));
1401                         if (IS_ERR(req))
1402                                 break;
1403                 }
1404         }
1405         if (!IS_ERR(req))
1406                 fuse_put_request(fc, req);
1407         if (res > 0)
1408                 *ppos = pos;
1409
1410         return res;
1411 }
1412 EXPORT_SYMBOL_GPL(fuse_direct_io);
1413
1414 static ssize_t __fuse_direct_read(struct fuse_io_priv *io,
1415                                   struct iov_iter *iter,
1416                                   loff_t *ppos)
1417 {
1418         ssize_t res;
1419         struct file *file = io->file;
1420         struct inode *inode = file_inode(file);
1421
1422         if (is_bad_inode(inode))
1423                 return -EIO;
1424
1425         res = fuse_direct_io(io, iter, ppos, 0);
1426
1427         fuse_invalidate_attr(inode);
1428
1429         return res;
1430 }
1431
1432 static ssize_t fuse_direct_read_iter(struct kiocb *iocb, struct iov_iter *to)
1433 {
1434         struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(iocb->ki_filp);
1435         return __fuse_direct_read(&io, to, &iocb->ki_pos);
1436 }
1437
1438 static ssize_t fuse_direct_write_iter(struct kiocb *iocb, struct iov_iter *from)
1439 {
1440         struct file *file = iocb->ki_filp;
1441         struct inode *inode = file_inode(file);
1442         struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(file);
1443         ssize_t res;
1444
1445         if (is_bad_inode(inode))
1446                 return -EIO;
1447
1448         /* Don't allow parallel writes to the same file */
1449         mutex_lock(&inode->i_mutex);
1450         res = generic_write_checks(iocb, from);
1451         if (res > 0)
1452                 res = fuse_direct_io(&io, from, &iocb->ki_pos, FUSE_DIO_WRITE);
1453         fuse_invalidate_attr(inode);
1454         if (res > 0)
1455                 fuse_write_update_size(inode, iocb->ki_pos);
1456         mutex_unlock(&inode->i_mutex);
1457
1458         return res;
1459 }
1460
1461 static void fuse_writepage_free(struct fuse_conn *fc, struct fuse_req *req)
1462 {
1463         int i;
1464
1465         for (i = 0; i < req->num_pages; i++)
1466                 __free_page(req->pages[i]);
1467
1468         if (req->ff)
1469                 fuse_file_put(req->ff, false);
1470 }
1471
1472 static void fuse_writepage_finish(struct fuse_conn *fc, struct fuse_req *req)
1473 {
1474         struct inode *inode = req->inode;
1475         struct fuse_inode *fi = get_fuse_inode(inode);
1476         struct backing_dev_info *bdi = inode_to_bdi(inode);
1477         int i;
1478
1479         list_del(&req->writepages_entry);
1480         for (i = 0; i < req->num_pages; i++) {
1481                 dec_wb_stat(&bdi->wb, WB_WRITEBACK);
1482                 dec_zone_page_state(req->pages[i], NR_WRITEBACK_TEMP);
1483                 wb_writeout_inc(&bdi->wb);
1484         }
1485         wake_up(&fi->page_waitq);
1486 }
1487
1488 /* Called under fc->lock, may release and reacquire it */
1489 static void fuse_send_writepage(struct fuse_conn *fc, struct fuse_req *req,
1490                                 loff_t size)
1491 __releases(fc->lock)
1492 __acquires(fc->lock)
1493 {
1494         struct fuse_inode *fi = get_fuse_inode(req->inode);
1495         struct fuse_write_in *inarg = &req->misc.write.in;
1496         __u64 data_size = req->num_pages * PAGE_CACHE_SIZE;
1497
1498         if (!fc->connected)
1499                 goto out_free;
1500
1501         if (inarg->offset + data_size <= size) {
1502                 inarg->size = data_size;
1503         } else if (inarg->offset < size) {
1504                 inarg->size = size - inarg->offset;
1505         } else {
1506                 /* Got truncated off completely */
1507                 goto out_free;
1508         }
1509
1510         req->in.args[1].size = inarg->size;
1511         fi->writectr++;
1512         fuse_request_send_background_locked(fc, req);
1513         return;
1514
1515  out_free:
1516         fuse_writepage_finish(fc, req);
1517         spin_unlock(&fc->lock);
1518         fuse_writepage_free(fc, req);
1519         fuse_put_request(fc, req);
1520         spin_lock(&fc->lock);
1521 }
1522
1523 /*
1524  * If fi->writectr is positive (no truncate or fsync going on) send
1525  * all queued writepage requests.
1526  *
1527  * Called with fc->lock
1528  */
1529 void fuse_flush_writepages(struct inode *inode)
1530 __releases(fc->lock)
1531 __acquires(fc->lock)
1532 {
1533         struct fuse_conn *fc = get_fuse_conn(inode);
1534         struct fuse_inode *fi = get_fuse_inode(inode);
1535         size_t crop = i_size_read(inode);
1536         struct fuse_req *req;
1537
1538         while (fi->writectr >= 0 && !list_empty(&fi->queued_writes)) {
1539                 req = list_entry(fi->queued_writes.next, struct fuse_req, list);
1540                 list_del_init(&req->list);
1541                 fuse_send_writepage(fc, req, crop);
1542         }
1543 }
1544
1545 static void fuse_writepage_end(struct fuse_conn *fc, struct fuse_req *req)
1546 {
1547         struct inode *inode = req->inode;
1548         struct fuse_inode *fi = get_fuse_inode(inode);
1549
1550         mapping_set_error(inode->i_mapping, req->out.h.error);
1551         spin_lock(&fc->lock);
1552         while (req->misc.write.next) {
1553                 struct fuse_conn *fc = get_fuse_conn(inode);
1554                 struct fuse_write_in *inarg = &req->misc.write.in;
1555                 struct fuse_req *next = req->misc.write.next;
1556                 req->misc.write.next = next->misc.write.next;
1557                 next->misc.write.next = NULL;
1558                 next->ff = fuse_file_get(req->ff);
1559                 list_add(&next->writepages_entry, &fi->writepages);
1560
1561                 /*
1562                  * Skip fuse_flush_writepages() to make it easy to crop requests
1563                  * based on primary request size.
1564                  *
1565                  * 1st case (trivial): there are no concurrent activities using
1566                  * fuse_set/release_nowrite.  Then we're on safe side because
1567                  * fuse_flush_writepages() would call fuse_send_writepage()
1568                  * anyway.
1569                  *
1570                  * 2nd case: someone called fuse_set_nowrite and it is waiting
1571                  * now for completion of all in-flight requests.  This happens
1572                  * rarely and no more than once per page, so this should be
1573                  * okay.
1574                  *
1575                  * 3rd case: someone (e.g. fuse_do_setattr()) is in the middle
1576                  * of fuse_set_nowrite..fuse_release_nowrite section.  The fact
1577                  * that fuse_set_nowrite returned implies that all in-flight
1578                  * requests were completed along with all of their secondary
1579                  * requests.  Further primary requests are blocked by negative
1580                  * writectr.  Hence there cannot be any in-flight requests and
1581                  * no invocations of fuse_writepage_end() while we're in
1582                  * fuse_set_nowrite..fuse_release_nowrite section.
1583                  */
1584                 fuse_send_writepage(fc, next, inarg->offset + inarg->size);
1585         }
1586         fi->writectr--;
1587         fuse_writepage_finish(fc, req);
1588         spin_unlock(&fc->lock);
1589         fuse_writepage_free(fc, req);
1590 }
1591
1592 static struct fuse_file *__fuse_write_file_get(struct fuse_conn *fc,
1593                                                struct fuse_inode *fi)
1594 {
1595         struct fuse_file *ff = NULL;
1596
1597         spin_lock(&fc->lock);
1598         if (!list_empty(&fi->write_files)) {
1599                 ff = list_entry(fi->write_files.next, struct fuse_file,
1600                                 write_entry);
1601                 fuse_file_get(ff);
1602         }
1603         spin_unlock(&fc->lock);
1604
1605         return ff;
1606 }
1607
/*
 * Same as __fuse_write_file_get(), but warns when no file is found.
 * The (possibly NULL) result is returned unchanged.
 */
static struct fuse_file *fuse_write_file_get(struct fuse_conn *fc,
					     struct fuse_inode *fi)
{
	struct fuse_file *found = __fuse_write_file_get(fc, fi);

	WARN_ON(!found);

	return found;
}
1615
1616 int fuse_write_inode(struct inode *inode, struct writeback_control *wbc)
1617 {
1618         struct fuse_conn *fc = get_fuse_conn(inode);
1619         struct fuse_inode *fi = get_fuse_inode(inode);
1620         struct fuse_file *ff;
1621         int err;
1622
1623         ff = __fuse_write_file_get(fc, fi);
1624         err = fuse_flush_times(inode, ff);
1625         if (ff)
1626                 fuse_file_put(ff, 0);
1627
1628         return err;
1629 }
1630
1631 static int fuse_writepage_locked(struct page *page)
1632 {
1633         struct address_space *mapping = page->mapping;
1634         struct inode *inode = mapping->host;
1635         struct fuse_conn *fc = get_fuse_conn(inode);
1636         struct fuse_inode *fi = get_fuse_inode(inode);
1637         struct fuse_req *req;
1638         struct page *tmp_page;
1639         int error = -ENOMEM;
1640
1641         set_page_writeback(page);
1642
1643         req = fuse_request_alloc_nofs(1);
1644         if (!req)
1645                 goto err;
1646
1647         /* writeback always goes to bg_queue */
1648         __set_bit(FR_BACKGROUND, &req->flags);
1649         tmp_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
1650         if (!tmp_page)
1651                 goto err_free;
1652
1653         error = -EIO;
1654         req->ff = fuse_write_file_get(fc, fi);
1655         if (!req->ff)
1656                 goto err_nofile;
1657
1658         fuse_write_fill(req, req->ff, page_offset(page), 0);
1659
1660         copy_highpage(tmp_page, page);
1661         req->misc.write.in.write_flags |= FUSE_WRITE_CACHE;
1662         req->misc.write.next = NULL;
1663         req->in.argpages = 1;
1664         req->num_pages = 1;
1665         req->pages[0] = tmp_page;
1666         req->page_descs[0].offset = 0;
1667         req->page_descs[0].length = PAGE_SIZE;
1668         req->end = fuse_writepage_end;
1669         req->inode = inode;
1670
1671         inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK);
1672         inc_zone_page_state(tmp_page, NR_WRITEBACK_TEMP);
1673
1674         spin_lock(&fc->lock);
1675         list_add(&req->writepages_entry, &fi->writepages);
1676         list_add_tail(&req->list, &fi->queued_writes);
1677         fuse_flush_writepages(inode);
1678         spin_unlock(&fc->lock);
1679
1680         end_page_writeback(page);
1681
1682         return 0;
1683
1684 err_nofile:
1685         __free_page(tmp_page);
1686 err_free:
1687         fuse_request_free(req);
1688 err:
1689         end_page_writeback(page);
1690         return error;
1691 }
1692
1693 static int fuse_writepage(struct page *page, struct writeback_control *wbc)
1694 {
1695         int err;
1696
1697         if (fuse_page_is_writeback(page->mapping->host, page->index)) {
1698                 /*
1699                  * ->writepages() should be called for sync() and friends.  We
1700                  * should only get here on direct reclaim and then we are
1701                  * allowed to skip a page which is already in flight
1702                  */
1703                 WARN_ON(wbc->sync_mode == WB_SYNC_ALL);
1704
1705                 redirty_page_for_writepage(wbc, page);
1706                 return 0;
1707         }
1708
1709         err = fuse_writepage_locked(page);
1710         unlock_page(page);
1711
1712         return err;
1713 }
1714
/*
 * State handed to the writepages helpers.  The field notes describe how
 * fuse_writepages_send() uses them.
 */
1715 struct fuse_fill_wb_data {
1716         struct fuse_req *req;          /* request that gets queued and flushed */
1717         struct fuse_file *ff;          /* file handle; a reference is taken for req->ff */
1718         struct inode *inode;           /* inode whose queued_writes list receives req */
1719         struct page **orig_pages;      /* pages whose writeback is ended after queuing, one per req page */
1720 };
1721
1722 static void fuse_writepages_send(struct fuse_fill_wb_data *data)
1723 {
1724         struct fuse_req *req = data->req;
1725         struct inode *inode = data->inode;
1726         struct fuse_conn *fc = get_fuse_conn(inode);
1727         struct fuse_inode *fi = get_fuse_inode(inode);
1728         int num_pages = req->num_pages;
1729         int i;
1730
1731         req->ff = fuse_file_get(data->ff);
1732         spin_lock(&fc->lock);
1733         list_add_tail(&req->list, &fi->queued_writes);
1734         fuse_flush_writepages(inode);
1735         spin_unlock(&fc->lock);
1736
1737         for (i = 0; i < num_pages; i++)
1738                 end_page_writeback(data->orig_pages[i]);
1739 }
1740
1741 static bool fuse_writepage_in_flight(struct fuse_req *new_req,
1742                                      struct page *page)
1743 {
1744         struct fuse_conn *fc = get_fuse_conn(new_req->inode);
1745         struct fuse_inode *fi = get_fuse_inode(new_req->inode);
1746         struct fuse_req *tmp;
1747         struct fuse_req *old_req;
1748         bool found = false;
1749         pgoff_t curr_index;
1750
1751         BUG_ON(new_req->num_pages != 0);
1752
1753         spin_lock(&fc->lock);
1754         list_del(&new_req->writepages_entry);
1755         list_for_each_entry(old_req, &fi->writepages, writepages_entry) {
1756                 BUG_ON(old_req->inode != new_req->inode);
1757                 curr_index = old_req->misc.write.in.offset >> PAGE_CACHE_SHIFT;
1758                 if (curr_index <= page->index &&
1759                     page->index < curr_index + old_req->num_pages) {
1760                         found = true;
1761                         break;
1762                 }
1763         }
1764         if (!found) {
1765                 list_add(&new_req->writepages_entry, &fi->writepages);
1766                 goto out_unlock;
1767         }
1768
1769         new_req->num_pages = 1;
1770         for (tmp = old_req; tmp != NULL; tmp = tmp->misc.write.next) {
1771                 BUG_ON(tmp->inode != new_req->inode);
1772                 curr_index = tmp->misc.write.in.offset >> PAGE_CACHE_SHIFT;
1773                 if (tmp->num_pages == 1 &&
1774                     curr_index == page->index) {
1775                         old_req = tmp;
1776                 }
1777         }
1778
1779         if (old_req->num_pages == 1 && test_bit(FR_PENDING, &old_req->flags)) {
1780                 struct backing_dev_info *bdi = inode_to_bdi(page->mapping->host);
1781
1782                 copy_highpage(old_req->pages[0], page);
1783                 spin_unlock(&fc->lock);
1784
1785                 dec_wb_stat(&bdi->wb, WB_WRITEBACK);
1786                 dec_zone_page_state(page, NR_WRITEBACK_TEMP);
1787                 wb_writeout_inc(&bdi->wb);
1788                 fuse_writepage_free(fc, new_req);
1789                 fuse_request_free(new_req);
1790                 goto out;
1791         } else {
1792                 new_req->misc.write.next = old_req->misc.write.next;
1793                 old_req->misc.write.next = new_req;
1794         }
1795 out_unlock:
1796         spin_unlock(&fc->lock);
1797 out:
1798         return found;
1799 }
1800
1801 static int fuse_writepages_fill(struct page *page,
1802                 struct writeback_control *wbc, void *_data)
1803 {
1804         struct fuse_fill_wb_data *data = _data;
1805         struct fuse_req *req = data->req;
1806         struct inode *inode = data->inode;
1807         struct fuse_conn *fc = get_fuse_conn(inode);
1808         struct page *tmp_page;
1809         bool is_writeback;
1810         int err;
1811
1812         if (!data->ff) {
1813                 err = -EIO;
1814                 data->ff = fuse_write_file_get(fc, get_fuse_inode(inode));
1815                 if (!data->ff)
1816                         goto out_unlock;
1817         }
1818
1819         /*
1820          * Being under writeback is unlikely but possible.  For example direct
1821          * read to an mmaped fuse file will set the page dirty twice; once when
1822          * the pages are faulted with get_user_pages(), and then after the read
1823          * completed.
1824          */
1825         is_writeback = fuse_page_is_writeback(inode, page->index);
1826
1827         if (req && req->num_pages &&
1828             (is_writeback || req->num_pages == FUSE_MAX_PAGES_PER_REQ ||
1829              (req->num_pages + 1) * PAGE_CACHE_SIZE > fc->max_write ||
1830              data->orig_pages[req->num_pages - 1]->index + 1 != page->index)) {
1831                 fuse_writepages_send(data);
1832                 data->req = NULL;
1833         }
1834         err = -ENOMEM;
1835         tmp_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
1836         if (!tmp_page)
1837                 goto out_unlock;
1838
1839         /*
1840          * The page must not be redirtied until the writeout is completed
1841          * (i.e. userspace has sent a reply to the write request).  Otherwise
1842          * there could be more than one temporary page instance for each real
1843          * page.
1844          *
1845          * This is ensured by holding the page lock in page_mkwrite() while
1846          * checking fuse_page_is_writeback().  We already hold the page lock
1847          * since clear_page_dirty_for_io() and keep it held until we add the
1848          * request to the fi->writepages list and increment req->num_pages.
1849          * After this fuse_page_is_writeback() will indicate that the page is
1850          * under writeback, so we can release the page lock.
1851          */
1852         if (data->req == NULL) {
1853                 struct fuse_inode *fi = get_fuse_inode(inode);
1854
1855                 err = -ENOMEM;
1856                 req = fuse_request_alloc_nofs(FUSE_MAX_PAGES_PER_REQ);
1857                 if (!req) {
1858                         __free_page(tmp_page);
1859                         goto out_unlock;
1860                 }
1861
1862                 fuse_write_fill(req, data->ff, page_offset(page), 0);
1863                 req->misc.write.in.write_flags |= FUSE_WRITE_CACHE;
1864                 req->misc.write.next = NULL;
1865                 req->in.argpages = 1;
1866                 __set_bit(FR_BACKGROUND, &req->flags);
1867                 req->num_pages = 0;
1868                 req->end = fuse_writepage_end;
1869                 req->inode = inode;
1870
1871                 spin_lock(&fc->lock);
1872                 list_add(&req->writepages_entry, &fi->writepages);
1873                 spin_unlock(&fc->lock);
1874
1875                 data->req = req;
1876         }
1877         set_page_writeback(page);
1878
1879         copy_highpage(tmp_page, page);
1880         req->pages[req->num_pages] = tmp_page;
1881         req->page_descs[req->num_pages].offset = 0;
1882         req->page_descs[req->num_pages].length = PAGE_SIZE;
1883
1884         inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK);
1885         inc_zone_page_state(tmp_page, NR_WRITEBACK_TEMP);
1886
1887         err = 0;
1888         if (is_writeback && fuse_writepage_in_flight(req, page)) {
1889                 end_page_writeback(page);
1890                 data->req = NULL;
1891                 goto out_unlock;
1892         }
1893         data->orig_pages[req->num_pages] = page;
1894
1895         /*
1896          * Protected by fc->lock against concurrent access by
1897          * fuse_page_is_writeback().
1898          */
1899         spin_lock(&fc->lock);
1900         req->num_pages++;
1901         spin_unlock(&fc->lock);
1902
1903 out_unlock:
1904         unlock_page(page);
1905
1906         return err;
1907 }
1908
1909 static int fuse_writepages(struct address_space *mapping,
1910                            struct writeback_control *wbc)
1911 {
1912         struct inode *inode = mapping->host;
1913         struct fuse_fill_wb_data data;
1914         int err;
1915
1916         err = -EIO;
1917         if (is_bad_inode(inode))
1918                 goto out;
1919
1920         data.inode = inode;
1921         data.req = NULL;
1922         data.ff = NULL;
1923
1924         err = -ENOMEM;
1925         data.orig_pages = kcalloc(FUSE_MAX_PAGES_PER_REQ,
1926                                   sizeof(struct page *),
1927                                   GFP_NOFS);
1928         if (!data.orig_pages)
1929                 goto out;
1930
1931         err = write_cache_pages(mapping, wbc, fuse_writepages_fill, &data);
1932         if (data.req) {
1933                 /* Ignore errors if we can write at least one page */
1934                 BUG_ON(!data.req->num_pages);
1935                 fuse_writepages_send(&data);
1936                 err = 0;
1937         }
1938         if (data.ff)
1939                 fuse_file_put(data.ff, false);
1940
1941         kfree(data.orig_pages);
1942 out:
1943         return err;
1944 }
1945
1946 /*
1947  * It's worthy to make sure that space is reserved on disk for the write,
1948  * but how to implement it without killing performance need more thinking.
1949  */
1950 static int fuse_write_begin(struct file *file, struct address_space *mapping,
1951                 loff_t pos, unsigned len, unsigned flags,
1952                 struct page **pagep, void **fsdata)
1953 {
1954         pgoff_t index = pos >> PAGE_CACHE_SHIFT;
1955         struct fuse_conn *fc = get_fuse_conn(file_inode(file));
1956         struct page *page;
1957         loff_t fsize;
1958         int err = -ENOMEM;
1959
1960         WARN_ON(!fc->writeback_cache);
1961
1962         page = grab_cache_page_write_begin(mapping, index, flags);
1963         if (!page)
1964                 goto error;
1965
1966         fuse_wait_on_page_writeback(mapping->host, page->index);
1967
1968         if (PageUptodate(page) || len == PAGE_CACHE_SIZE)
1969                 goto success;
1970         /*
1971          * Check if the start this page comes after the end of file, in which
1972          * case the readpage can be optimized away.
1973          */
1974         fsize = i_size_read(mapping->host);
1975         if (fsize <= (pos & PAGE_CACHE_MASK)) {
1976                 size_t off = pos & ~PAGE_CACHE_MASK;
1977                 if (off)
1978                         zero_user_segment(page, 0, off);
1979                 goto success;
1980         }
1981         err = fuse_do_readpage(file, page);
1982         if (err)
1983                 goto cleanup;
1984 success:
1985         *pagep = page;
1986         return 0;
1987
1988 cleanup:
1989         unlock_page(page);
1990         page_cache_release(page);
1991 error:
1992         return err;
1993 }
1994
1995 static int fuse_write_end(struct file *file, struct address_space *mapping,
1996                 loff_t pos, unsigned len, unsigned copied,
1997                 struct page *page, void *fsdata)
1998 {
1999         struct inode *inode = page->mapping->host;
2000
2001         /* Haven't copied anything?  Skip zeroing, size extending, dirtying. */
2002         if (!copied)
2003                 goto unlock;
2004
2005         if (!PageUptodate(page)) {
2006                 /* Zero any unwritten bytes at the end of the page */
2007                 size_t endoff = (pos + copied) & ~PAGE_CACHE_MASK;
2008                 if (endoff)
2009                         zero_user_segment(page, endoff, PAGE_CACHE_SIZE);
2010                 SetPageUptodate(page);
2011         }
2012
2013         fuse_write_update_size(inode, pos + copied);
2014         set_page_dirty(page);
2015
2016 unlock:
2017         unlock_page(page);
2018         page_cache_release(page);
2019
2020         return copied;
2021 }
2022
2023 static int fuse_launder_page(struct page *page)
2024 {
2025         int err = 0;
2026         if (clear_page_dirty_for_io(page)) {
2027                 struct inode *inode = page->mapping->host;
2028                 err = fuse_writepage_locked(page);
2029                 if (!err)
2030                         fuse_wait_on_page_writeback(inode, page->index);
2031         }
2032         return err;
2033 }
2034
2035 /*
2036  * Write back dirty pages now, because there may not be any suitable
2037  * open files later
2038  */
2039 static void fuse_vma_close(struct vm_area_struct *vma)
2040 {
2041         filemap_write_and_wait(vma->vm_file->f_mapping);
2042 }
2043
2044 /*
2045  * Wait for writeback against this page to complete before allowing it
2046  * to be marked dirty again, and hence written back again, possibly
2047  * before the previous writepage completed.
2048  *
2049  * Block here, instead of in ->writepage(), so that the userspace fs
2050  * can only block processes actually operating on the filesystem.
2051  *
2052  * Otherwise unprivileged userspace fs would be able to block
2053  * unrelated:
2054  *
2055  * - page migration
2056  * - sync(2)
2057  * - try_to_free_pages() with order > PAGE_ALLOC_COSTLY_ORDER
2058  */
2059 static int fuse_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
2060 {
2061         struct page *page = vmf->page;
2062         struct inode *inode = file_inode(vma->vm_file);
2063
2064         file_update_time(vma->vm_file);
2065         lock_page(page);
2066         if (page->mapping != inode->i_mapping) {
2067                 unlock_page(page);
2068                 return VM_FAULT_NOPAGE;
2069         }
2070
2071         fuse_wait_on_page_writeback(inode, page->index);
2072         return VM_FAULT_LOCKED;
2073 }
2074
2075 static const struct vm_operations_struct fuse_file_vm_ops = {
2076         .close          = fuse_vma_close,
2077         .fault          = filemap_fault,
2078         .map_pages      = filemap_map_pages,
2079         .page_mkwrite   = fuse_page_mkwrite,
2080 };
2081
2082 static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma)
2083 {
2084         if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE))
2085                 fuse_link_write_file(file);
2086
2087         file_accessed(file);
2088         vma->vm_ops = &fuse_file_vm_ops;
2089         return 0;
2090 }
2091
2092 static int fuse_direct_mmap(struct file *file, struct vm_area_struct *vma)
2093 {
2094         /* Can't provide the coherency needed for MAP_SHARED */
2095         if (vma->vm_flags & VM_MAYSHARE)
2096                 return -ENODEV;
2097
2098         invalidate_inode_pages2(file->f_mapping);
2099
2100         return generic_file_mmap(file, vma);
2101 }
2102
2103 static int convert_fuse_file_lock(const struct fuse_file_lock *ffl,
2104                                   struct file_lock *fl)
2105 {
2106         switch (ffl->type) {
2107         case F_UNLCK:
2108                 break;
2109
2110         case F_RDLCK:
2111         case F_WRLCK:
2112                 if (ffl->start > OFFSET_MAX || ffl->end > OFFSET_MAX ||
2113                     ffl->end < ffl->start)
2114                         return -EIO;
2115
2116                 fl->fl_start = ffl->start;
2117                 fl->fl_end = ffl->end;
2118                 fl->fl_pid = ffl->pid;
2119                 break;
2120
2121         default:
2122                 return -EIO;
2123         }
2124         fl->fl_type = ffl->type;
2125         return 0;
2126 }
2127
2128 static void fuse_lk_fill(struct fuse_args *args, struct file *file,
2129                          const struct file_lock *fl, int opcode, pid_t pid,
2130                          int flock, struct fuse_lk_in *inarg)
2131 {
2132         struct inode *inode = file_inode(file);
2133         struct fuse_conn *fc = get_fuse_conn(inode);
2134         struct fuse_file *ff = file->private_data;
2135
2136         memset(inarg, 0, sizeof(*inarg));
2137         inarg->fh = ff->fh;
2138         inarg->owner = fuse_lock_owner_id(fc, fl->fl_owner);
2139         inarg->lk.start = fl->fl_start;
2140         inarg->lk.end = fl->fl_end;
2141         inarg->lk.type = fl->fl_type;
2142         inarg->lk.pid = pid;
2143         if (flock)
2144                 inarg->lk_flags |= FUSE_LK_FLOCK;
2145         args->in.h.opcode = opcode;
2146         args->in.h.nodeid = get_node_id(inode);
2147         args->in.numargs = 1;
2148         args->in.args[0].size = sizeof(*inarg);
2149         args->in.args[0].value = inarg;
2150 }
2151
2152 static int fuse_getlk(struct file *file, struct file_lock *fl)
2153 {
2154         struct inode *inode = file_inode(file);
2155         struct fuse_conn *fc = get_fuse_conn(inode);
2156         FUSE_ARGS(args);
2157         struct fuse_lk_in inarg;
2158         struct fuse_lk_out outarg;
2159         int err;
2160
2161         fuse_lk_fill(&args, file, fl, FUSE_GETLK, 0, 0, &inarg);
2162         args.out.numargs = 1;
2163         args.out.args[0].size = sizeof(outarg);
2164         args.out.args[0].value = &outarg;
2165         err = fuse_simple_request(fc, &args);
2166         if (!err)
2167                 err = convert_fuse_file_lock(&outarg.lk, fl);
2168
2169         return err;
2170 }
2171
2172 static int fuse_setlk(struct file *file, struct file_lock *fl, int flock)
2173 {
2174         struct inode *inode = file_inode(file);
2175         struct fuse_conn *fc = get_fuse_conn(inode);
2176         FUSE_ARGS(args);
2177         struct fuse_lk_in inarg;
2178         int opcode = (fl->fl_flags & FL_SLEEP) ? FUSE_SETLKW : FUSE_SETLK;
2179         pid_t pid = fl->fl_type != F_UNLCK ? current->tgid : 0;
2180         int err;
2181
2182         if (fl->fl_lmops && fl->fl_lmops->lm_grant) {
2183                 /* NLM needs asynchronous locks, which we don't support yet */
2184                 return -ENOLCK;
2185         }
2186
2187         /* Unlock on close is handled by the flush method */
2188         if (fl->fl_flags & FL_CLOSE)
2189                 return 0;
2190
2191         fuse_lk_fill(&args, file, fl, opcode, pid, flock, &inarg);
2192         err = fuse_simple_request(fc, &args);
2193
2194         /* locking is restartable */
2195         if (err == -EINTR)
2196                 err = -ERESTARTSYS;
2197
2198         return err;
2199 }
2200
2201 static int fuse_file_lock(struct file *file, int cmd, struct file_lock *fl)
2202 {
2203         struct inode *inode = file_inode(file);
2204         struct fuse_conn *fc = get_fuse_conn(inode);
2205         int err;
2206
2207         if (cmd == F_CANCELLK) {
2208                 err = 0;
2209         } else if (cmd == F_GETLK) {
2210                 if (fc->no_lock) {
2211                         posix_test_lock(file, fl);
2212                         err = 0;
2213                 } else
2214                         err = fuse_getlk(file, fl);
2215         } else {
2216                 if (fc->no_lock)
2217                         err = posix_lock_file(file, fl, NULL);
2218                 else
2219                         err = fuse_setlk(file, fl, 0);
2220         }
2221         return err;
2222 }
2223
2224 static int fuse_file_flock(struct file *file, int cmd, struct file_lock *fl)
2225 {
2226         struct inode *inode = file_inode(file);
2227         struct fuse_conn *fc = get_fuse_conn(inode);
2228         int err;
2229
2230         if (fc->no_flock) {
2231                 err = locks_lock_file_wait(file, fl);
2232         } else {
2233                 struct fuse_file *ff = file->private_data;
2234
2235                 /* emulate flock with POSIX locks */
2236                 ff->flock = true;
2237                 err = fuse_setlk(file, fl, 1);
2238         }
2239
2240         return err;
2241 }
2242
2243 static sector_t fuse_bmap(struct address_space *mapping, sector_t block)
2244 {
2245         struct inode *inode = mapping->host;
2246         struct fuse_conn *fc = get_fuse_conn(inode);
2247         FUSE_ARGS(args);
2248         struct fuse_bmap_in inarg;
2249         struct fuse_bmap_out outarg;
2250         int err;
2251
2252         if (!inode->i_sb->s_bdev || fc->no_bmap)
2253                 return 0;
2254
2255         memset(&inarg, 0, sizeof(inarg));
2256         inarg.block = block;
2257         inarg.blocksize = inode->i_sb->s_blocksize;
2258         args.in.h.opcode = FUSE_BMAP;
2259         args.in.h.nodeid = get_node_id(inode);
2260         args.in.numargs = 1;
2261         args.in.args[0].size = sizeof(inarg);
2262         args.in.args[0].value = &inarg;
2263         args.out.numargs = 1;
2264         args.out.args[0].size = sizeof(outarg);
2265         args.out.args[0].value = &outarg;
2266         err = fuse_simple_request(fc, &args);
2267         if (err == -ENOSYS)
2268                 fc->no_bmap = 1;
2269
2270         return err ? 0 : outarg.block;
2271 }
2272
2273 static loff_t fuse_file_llseek(struct file *file, loff_t offset, int whence)
2274 {
2275         loff_t retval;
2276         struct inode *inode = file_inode(file);
2277
2278         /* No i_mutex protection necessary for SEEK_CUR and SEEK_SET */
2279         if (whence == SEEK_CUR || whence == SEEK_SET)
2280                 return generic_file_llseek(file, offset, whence);
2281
2282         mutex_lock(&inode->i_mutex);
2283         retval = fuse_update_attributes(inode, NULL, file, NULL);
2284         if (!retval)
2285                 retval = generic_file_llseek(file, offset, whence);
2286         mutex_unlock(&inode->i_mutex);
2287
2288         return retval;
2289 }
2290
2291 static int fuse_ioctl_copy_user(struct page **pages, struct iovec *iov,
2292                         unsigned int nr_segs, size_t bytes, bool to_user)
2293 {
2294         struct iov_iter ii;
2295         int page_idx = 0;
2296
2297         if (!bytes)
2298                 return 0;
2299
2300         iov_iter_init(&ii, to_user ? READ : WRITE, iov, nr_segs, bytes);
2301
2302         while (iov_iter_count(&ii)) {
2303                 struct page *page = pages[page_idx++];
2304                 size_t todo = min_t(size_t, PAGE_SIZE, iov_iter_count(&ii));
2305                 void *kaddr;
2306
2307                 kaddr = kmap(page);
2308
2309                 while (todo) {
2310                         char __user *uaddr = ii.iov->iov_base + ii.iov_offset;
2311                         size_t iov_len = ii.iov->iov_len - ii.iov_offset;
2312                         size_t copy = min(todo, iov_len);
2313                         size_t left;
2314
2315                         if (!to_user)
2316                                 left = copy_from_user(kaddr, uaddr, copy);
2317                         else
2318                                 left = copy_to_user(uaddr, kaddr, copy);
2319
2320                         if (unlikely(left))
2321                                 return -EFAULT;
2322
2323                         iov_iter_advance(&ii, copy);
2324                         todo -= copy;
2325                         kaddr += copy;
2326                 }
2327
2328                 kunmap(page);
2329         }
2330
2331         return 0;
2332 }
2333
2334 /*
2335  * CUSE servers compiled on 32bit broke on 64bit kernels because the
2336  * ABI was defined to be 'struct iovec' which is different on 32bit
2337  * and 64bit.  Fortunately we can determine which structure the server
2338  * used from the size of the reply.
2339  */
2340 static int fuse_copy_ioctl_iovec_old(struct iovec *dst, void *src,
2341                                      size_t transferred, unsigned count,
2342                                      bool is_compat)
2343 {
2344 #ifdef CONFIG_COMPAT
2345         if (count * sizeof(struct compat_iovec) == transferred) {
2346                 struct compat_iovec *ciov = src;
2347                 unsigned i;
2348
2349                 /*
2350                  * With this interface a 32bit server cannot support
2351                  * non-compat (i.e. ones coming from 64bit apps) ioctl
2352                  * requests
2353                  */
2354                 if (!is_compat)
2355                         return -EINVAL;
2356
2357                 for (i = 0; i < count; i++) {
2358                         dst[i].iov_base = compat_ptr(ciov[i].iov_base);
2359                         dst[i].iov_len = ciov[i].iov_len;
2360                 }
2361                 return 0;
2362         }
2363 #endif
2364
2365         if (count * sizeof(struct iovec) != transferred)
2366                 return -EIO;
2367
2368         memcpy(dst, src, transferred);
2369         return 0;
2370 }
2371
2372 /* Make sure iov_length() won't overflow */
2373 static int fuse_verify_ioctl_iov(struct iovec *iov, size_t count)
2374 {
2375         size_t n;
2376         u32 max = FUSE_MAX_PAGES_PER_REQ << PAGE_SHIFT;
2377
2378         for (n = 0; n < count; n++, iov++) {
2379                 if (iov->iov_len > (size_t) max)
2380                         return -ENOMEM;
2381                 max -= iov->iov_len;
2382         }
2383         return 0;
2384 }
2385
2386 static int fuse_copy_ioctl_iovec(struct fuse_conn *fc, struct iovec *dst,
2387                                  void *src, size_t transferred, unsigned count,
2388                                  bool is_compat)
2389 {
2390         unsigned i;
2391         struct fuse_ioctl_iovec *fiov = src;
2392
2393         if (fc->minor < 16) {
2394                 return fuse_copy_ioctl_iovec_old(dst, src, transferred,
2395                                                  count, is_compat);
2396         }
2397
2398         if (count * sizeof(struct fuse_ioctl_iovec) != transferred)
2399                 return -EIO;
2400
2401         for (i = 0; i < count; i++) {
2402                 /* Did the server supply an inappropriate value? */
2403                 if (fiov[i].base != (unsigned long) fiov[i].base ||
2404                     fiov[i].len != (unsigned long) fiov[i].len)
2405                         return -EIO;
2406
2407                 dst[i].iov_base = (void __user *) (unsigned long) fiov[i].base;
2408                 dst[i].iov_len = (size_t) fiov[i].len;
2409
2410 #ifdef CONFIG_COMPAT
2411                 if (is_compat &&
2412                     (ptr_to_compat(dst[i].iov_base) != fiov[i].base ||
2413                      (compat_size_t) dst[i].iov_len != fiov[i].len))
2414                         return -EIO;
2415 #endif
2416         }
2417
2418         return 0;
2419 }
2420
2421
2422 /*
2423  * For ioctls, there is no generic way to determine how much memory
2424  * needs to be read and/or written.  Furthermore, ioctls are allowed
2425  * to dereference the passed pointer, so the parameter requires deep
2426  * copying but FUSE has no idea whatsoever about what to copy in or
2427  * out.
2428  *
2429  * This is solved by allowing FUSE server to retry ioctl with
2430  * necessary in/out iovecs.  Let's assume the ioctl implementation
2431  * needs to read in the following structure.
2432  *
2433  * struct a {
2434  *      char    *buf;
2435  *      size_t  buflen;
2436  * }
2437  *
2438  * On the first callout to FUSE server, inarg->in_size and
2439  * inarg->out_size will be zero; then, the server completes the ioctl
2440  * with FUSE_IOCTL_RETRY set in out->flags, out->in_iovs set to 1 and
2441  * the actual iov array to
2442  *
2443  * { { .iov_base = inarg.arg,   .iov_len = sizeof(struct a) } }
2444  *
2445  * which tells FUSE to copy in the requested area and retry the ioctl.
2446  * On the second round, the server has access to the structure and
2447  * from that it can tell what to look for next, so on the invocation,
2448  * it sets FUSE_IOCTL_RETRY, out->in_iovs to 2 and iov array to
2449  *
2450  * { { .iov_base = inarg.arg,   .iov_len = sizeof(struct a)     },
2451  *   { .iov_base = a.buf,       .iov_len = a.buflen             } }
2452  *
2453  * FUSE will copy both struct a and the pointed buffer from the
2454  * process doing the ioctl and retry ioctl with both struct a and the
2455  * buffer.
2456  *
2457  * This time, FUSE server has everything it needs and completes ioctl
2458  * without FUSE_IOCTL_RETRY which finishes the ioctl call.
2459  *
2460  * Copying data out works the same way.
2461  *
2462  * Note that if FUSE_IOCTL_UNRESTRICTED is clear, the kernel
2463  * automatically initializes in and out iovs by decoding @cmd with
2464  * _IOC_* macros and the server is not allowed to request RETRY.  This
2465  * limits ioctl data transfers to well-formed ioctls and is the forced
2466  * behavior for all FUSE servers.
2467  */
2468 long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
2469                    unsigned int flags)
2470 {
2471         struct fuse_file *ff = file->private_data;
2472         struct fuse_conn *fc = ff->fc;
2473         struct fuse_ioctl_in inarg = {
2474                 .fh = ff->fh,
2475                 .cmd = cmd,
2476                 .arg = arg,
2477                 .flags = flags
2478         };
2479         struct fuse_ioctl_out outarg;
2480         struct fuse_req *req = NULL;
2481         struct page **pages = NULL;
2482         struct iovec *iov_page = NULL;
2483         struct iovec *in_iov = NULL, *out_iov = NULL;
2484         unsigned int in_iovs = 0, out_iovs = 0, num_pages = 0, max_pages;
2485         size_t in_size, out_size, transferred;
2486         int err;
2487
2488 #if BITS_PER_LONG == 32
2489         inarg.flags |= FUSE_IOCTL_32BIT;
2490 #else
2491         if (flags & FUSE_IOCTL_COMPAT)
2492                 inarg.flags |= FUSE_IOCTL_32BIT;
2493 #endif
2494
2495         /* assume all the iovs returned by client always fits in a page */
2496         BUILD_BUG_ON(sizeof(struct fuse_ioctl_iovec) * FUSE_IOCTL_MAX_IOV > PAGE_SIZE);
2497
2498         err = -ENOMEM;
2499         pages = kcalloc(FUSE_MAX_PAGES_PER_REQ, sizeof(pages[0]), GFP_KERNEL);
2500         iov_page = (struct iovec *) __get_free_page(GFP_KERNEL);
2501         if (!pages || !iov_page)
2502                 goto out;
2503
2504         /*
2505          * If restricted, initialize IO parameters as encoded in @cmd.
2506          * RETRY from server is not allowed.
2507          */
2508         if (!(flags & FUSE_IOCTL_UNRESTRICTED)) {
2509                 struct iovec *iov = iov_page;
2510
2511                 iov->iov_base = (void __user *)arg;
2512                 iov->iov_len = _IOC_SIZE(cmd);
2513
2514                 if (_IOC_DIR(cmd) & _IOC_WRITE) {
2515                         in_iov = iov;
2516                         in_iovs = 1;
2517                 }
2518
2519                 if (_IOC_DIR(cmd) & _IOC_READ) {
2520                         out_iov = iov;
2521                         out_iovs = 1;
2522                 }
2523         }
2524
2525  retry:
2526         inarg.in_size = in_size = iov_length(in_iov, in_iovs);
2527         inarg.out_size = out_size = iov_length(out_iov, out_iovs);
2528
2529         /*
2530          * Out data can be used either for actual out data or iovs,
2531          * make sure there always is at least one page.
2532          */
2533         out_size = max_t(size_t, out_size, PAGE_SIZE);
2534         max_pages = DIV_ROUND_UP(max(in_size, out_size), PAGE_SIZE);
2535
2536         /* make sure there are enough buffer pages and init request with them */
2537         err = -ENOMEM;
2538         if (max_pages > FUSE_MAX_PAGES_PER_REQ)
2539                 goto out;
2540         while (num_pages < max_pages) {
2541                 pages[num_pages] = alloc_page(GFP_KERNEL | __GFP_HIGHMEM);
2542                 if (!pages[num_pages])
2543                         goto out;
2544                 num_pages++;
2545         }
2546
2547         req = fuse_get_req(fc, num_pages);
2548         if (IS_ERR(req)) {
2549                 err = PTR_ERR(req);
2550                 req = NULL;
2551                 goto out;
2552         }
2553         memcpy(req->pages, pages, sizeof(req->pages[0]) * num_pages);
2554         req->num_pages = num_pages;
2555         fuse_page_descs_length_init(req, 0, req->num_pages);
2556
2557         /* okay, let's send it to the client */
2558         req->in.h.opcode = FUSE_IOCTL;
2559         req->in.h.nodeid = ff->nodeid;
2560         req->in.numargs = 1;
2561         req->in.args[0].size = sizeof(inarg);
2562         req->in.args[0].value = &inarg;
2563         if (in_size) {
2564                 req->in.numargs++;
2565                 req->in.args[1].size = in_size;
2566                 req->in.argpages = 1;
2567
2568                 err = fuse_ioctl_copy_user(pages, in_iov, in_iovs, in_size,
2569                                            false);
2570                 if (err)
2571                         goto out;
2572         }
2573
2574         req->out.numargs = 2;
2575         req->out.args[0].size = sizeof(outarg);
2576         req->out.args[0].value = &outarg;
2577         req->out.args[1].size = out_size;
2578         req->out.argpages = 1;
2579         req->out.argvar = 1;
2580
2581         fuse_request_send(fc, req);
2582         err = req->out.h.error;
2583         transferred = req->out.args[1].size;
2584         fuse_put_request(fc, req);
2585         req = NULL;
2586         if (err)
2587                 goto out;
2588
2589         /* did it ask for retry? */
2590         if (outarg.flags & FUSE_IOCTL_RETRY) {
2591                 void *vaddr;
2592
2593                 /* no retry if in restricted mode */
2594                 err = -EIO;
2595                 if (!(flags & FUSE_IOCTL_UNRESTRICTED))
2596                         goto out;
2597
2598                 in_iovs = outarg.in_iovs;
2599                 out_iovs = outarg.out_iovs;
2600
2601                 /*
2602                  * Make sure things are in boundary, separate checks
2603                  * are to protect against overflow.
2604                  */
2605                 err = -ENOMEM;
2606                 if (in_iovs > FUSE_IOCTL_MAX_IOV ||
2607                     out_iovs > FUSE_IOCTL_MAX_IOV ||
2608                     in_iovs + out_iovs > FUSE_IOCTL_MAX_IOV)
2609                         goto out;
2610
2611                 vaddr = kmap_atomic(pages[0]);
2612                 err = fuse_copy_ioctl_iovec(fc, iov_page, vaddr,
2613                                             transferred, in_iovs + out_iovs,
2614                                             (flags & FUSE_IOCTL_COMPAT) != 0);
2615                 kunmap_atomic(vaddr);
2616                 if (err)
2617                         goto out;
2618
2619                 in_iov = iov_page;
2620                 out_iov = in_iov + in_iovs;
2621
2622                 err = fuse_verify_ioctl_iov(in_iov, in_iovs);
2623                 if (err)
2624                         goto out;
2625
2626                 err = fuse_verify_ioctl_iov(out_iov, out_iovs);
2627                 if (err)
2628                         goto out;
2629
2630                 goto retry;
2631         }
2632
2633         err = -EIO;
2634         if (transferred > inarg.out_size)
2635                 goto out;
2636
2637         err = fuse_ioctl_copy_user(pages, out_iov, out_iovs, transferred, true);
2638  out:
2639         if (req)
2640                 fuse_put_request(fc, req);
2641         free_page((unsigned long) iov_page);
2642         while (num_pages)
2643                 __free_page(pages[--num_pages]);
2644         kfree(pages);
2645
2646         return err ? err : outarg.result;
2647 }
2648 EXPORT_SYMBOL_GPL(fuse_do_ioctl);
2649
2650 long fuse_ioctl_common(struct file *file, unsigned int cmd,
2651                        unsigned long arg, unsigned int flags)
2652 {
2653         struct inode *inode = file_inode(file);
2654         struct fuse_conn *fc = get_fuse_conn(inode);
2655
2656         if (!fuse_allow_current_process(fc))
2657                 return -EACCES;
2658
2659         if (is_bad_inode(inode))
2660                 return -EIO;
2661
2662         return fuse_do_ioctl(file, cmd, arg, flags);
2663 }
2664
/* .unlocked_ioctl handler: forwards to fuse_ioctl_common() with no flags. */
static long fuse_file_ioctl(struct file *file, unsigned int cmd,
                            unsigned long arg)
{
        const unsigned int ioctl_flags = 0;

        return fuse_ioctl_common(file, cmd, arg, ioctl_flags);
}
2670
2671 static long fuse_file_compat_ioctl(struct file *file, unsigned int cmd,
2672                                    unsigned long arg)
2673 {
2674         return fuse_ioctl_common(file, cmd, arg, FUSE_IOCTL_COMPAT);
2675 }
2676
2677 /*
2678  * All files which have been polled are linked to RB tree
2679  * fuse_conn->polled_files which is indexed by kh.  Walk the tree and
2680  * find the matching one.
2681  */
2682 static struct rb_node **fuse_find_polled_node(struct fuse_conn *fc, u64 kh,
2683                                               struct rb_node **parent_out)
2684 {
2685         struct rb_node **link = &fc->polled_files.rb_node;
2686         struct rb_node *last = NULL;
2687
2688         while (*link) {
2689                 struct fuse_file *ff;
2690
2691                 last = *link;
2692                 ff = rb_entry(last, struct fuse_file, polled_node);
2693
2694                 if (kh < ff->kh)
2695                         link = &last->rb_left;
2696                 else if (kh > ff->kh)
2697                         link = &last->rb_right;
2698                 else
2699                         return link;
2700         }
2701
2702         if (parent_out)
2703                 *parent_out = last;
2704         return link;
2705 }
2706
2707 /*
2708  * The file is about to be polled.  Make sure it's on the polled_files
2709  * RB tree.  Note that files once added to the polled_files tree are
2710  * not removed before the file is released.  This is because a file
2711  * polled once is likely to be polled again.
2712  */
2713 static void fuse_register_polled_file(struct fuse_conn *fc,
2714                                       struct fuse_file *ff)
2715 {
2716         spin_lock(&fc->lock);
2717         if (RB_EMPTY_NODE(&ff->polled_node)) {
2718                 struct rb_node **link, *uninitialized_var(parent);
2719
2720                 link = fuse_find_polled_node(fc, ff->kh, &parent);
2721                 BUG_ON(*link);
2722                 rb_link_node(&ff->polled_node, parent, link);
2723                 rb_insert_color(&ff->polled_node, &fc->polled_files);
2724         }
2725         spin_unlock(&fc->lock);
2726 }
2727
2728 unsigned fuse_file_poll(struct file *file, poll_table *wait)
2729 {
2730         struct fuse_file *ff = file->private_data;
2731         struct fuse_conn *fc = ff->fc;
2732         struct fuse_poll_in inarg = { .fh = ff->fh, .kh = ff->kh };
2733         struct fuse_poll_out outarg;
2734         FUSE_ARGS(args);
2735         int err;
2736
2737         if (fc->no_poll)
2738                 return DEFAULT_POLLMASK;
2739
2740         poll_wait(file, &ff->poll_wait, wait);
2741         inarg.events = (__u32)poll_requested_events(wait);
2742
2743         /*
2744          * Ask for notification iff there's someone waiting for it.
2745          * The client may ignore the flag and always notify.
2746          */
2747         if (waitqueue_active(&ff->poll_wait)) {
2748                 inarg.flags |= FUSE_POLL_SCHEDULE_NOTIFY;
2749                 fuse_register_polled_file(fc, ff);
2750         }
2751
2752         args.in.h.opcode = FUSE_POLL;
2753         args.in.h.nodeid = ff->nodeid;
2754         args.in.numargs = 1;
2755         args.in.args[0].size = sizeof(inarg);
2756         args.in.args[0].value = &inarg;
2757         args.out.numargs = 1;
2758         args.out.args[0].size = sizeof(outarg);
2759         args.out.args[0].value = &outarg;
2760         err = fuse_simple_request(fc, &args);
2761
2762         if (!err)
2763                 return outarg.revents;
2764         if (err == -ENOSYS) {
2765                 fc->no_poll = 1;
2766                 return DEFAULT_POLLMASK;
2767         }
2768         return POLLERR;
2769 }
2770 EXPORT_SYMBOL_GPL(fuse_file_poll);
2771
2772 /*
2773  * This is called from fuse_handle_notify() on FUSE_NOTIFY_POLL and
2774  * wakes up the poll waiters.
2775  */
2776 int fuse_notify_poll_wakeup(struct fuse_conn *fc,
2777                             struct fuse_notify_poll_wakeup_out *outarg)
2778 {
2779         u64 kh = outarg->kh;
2780         struct rb_node **link;
2781
2782         spin_lock(&fc->lock);
2783
2784         link = fuse_find_polled_node(fc, kh, NULL);
2785         if (*link) {
2786                 struct fuse_file *ff;
2787
2788                 ff = rb_entry(*link, struct fuse_file, polled_node);
2789                 wake_up_interruptible_sync(&ff->poll_wait);
2790         }
2791
2792         spin_unlock(&fc->lock);
2793         return 0;
2794 }
2795
2796 static void fuse_do_truncate(struct file *file)
2797 {
2798         struct inode *inode = file->f_mapping->host;
2799         struct iattr attr;
2800
2801         attr.ia_valid = ATTR_SIZE;
2802         attr.ia_size = i_size_read(inode);
2803
2804         attr.ia_file = file;
2805         attr.ia_valid |= ATTR_FILE;
2806
2807         fuse_do_setattr(inode, &attr, file);
2808 }
2809
2810 static inline loff_t fuse_round_up(loff_t off)
2811 {
2812         return round_up(off, FUSE_MAX_PAGES_PER_REQ << PAGE_SHIFT);
2813 }
2814
2815 static ssize_t
2816 fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t offset)
2817 {
2818         DECLARE_COMPLETION_ONSTACK(wait);
2819         ssize_t ret = 0;
2820         struct file *file = iocb->ki_filp;
2821         struct fuse_file *ff = file->private_data;
2822         bool async_dio = ff->fc->async_dio;
2823         loff_t pos = 0;
2824         struct inode *inode;
2825         loff_t i_size;
2826         size_t count = iov_iter_count(iter);
2827         struct fuse_io_priv *io;
2828         bool is_sync = is_sync_kiocb(iocb);
2829
2830         pos = offset;
2831         inode = file->f_mapping->host;
2832         i_size = i_size_read(inode);
2833
2834         if ((iov_iter_rw(iter) == READ) && (offset > i_size))
2835                 return 0;
2836
2837         /* optimization for short read */
2838         if (async_dio && iov_iter_rw(iter) != WRITE && offset + count > i_size) {
2839                 if (offset >= i_size)
2840                         return 0;
2841                 iov_iter_truncate(iter, fuse_round_up(i_size - offset));
2842                 count = iov_iter_count(iter);
2843         }
2844
2845         io = kmalloc(sizeof(struct fuse_io_priv), GFP_KERNEL);
2846         if (!io)
2847                 return -ENOMEM;
2848         spin_lock_init(&io->lock);
2849         kref_init(&io->refcnt);
2850         io->reqs = 1;
2851         io->bytes = -1;
2852         io->size = 0;
2853         io->offset = offset;
2854         io->write = (iov_iter_rw(iter) == WRITE);
2855         io->err = 0;
2856         io->file = file;
2857         /*
2858          * By default, we want to optimize all I/Os with async request
2859          * submission to the client filesystem if supported.
2860          */
2861         io->async = async_dio;
2862         io->iocb = iocb;
2863
2864         /*
2865          * We cannot asynchronously extend the size of a file. We have no method
2866          * to wait on real async I/O requests, so we must submit this request
2867          * synchronously.
2868          */
2869         if (!is_sync && (offset + count > i_size) &&
2870             iov_iter_rw(iter) == WRITE)
2871                 io->async = false;
2872
2873         if (io->async && is_sync) {
2874                 /*
2875                  * Additional reference to keep io around after
2876                  * calling fuse_aio_complete()
2877                  */
2878                 kref_get(&io->refcnt);
2879                 io->done = &wait;
2880         }
2881
2882         if (iov_iter_rw(iter) == WRITE) {
2883                 ret = fuse_direct_io(io, iter, &pos, FUSE_DIO_WRITE);
2884                 fuse_invalidate_attr(inode);
2885         } else {
2886                 ret = __fuse_direct_read(io, iter, &pos);
2887         }
2888
2889         if (io->async) {
2890                 fuse_aio_complete(io, ret < 0 ? ret : 0, -1);
2891
2892                 /* we have a non-extending, async request, so return */
2893                 if (!is_sync)
2894                         return -EIOCBQUEUED;
2895
2896                 wait_for_completion(&wait);
2897                 ret = fuse_get_res_by_io(io);
2898         }
2899
2900         kref_put(&io->refcnt, fuse_io_release);
2901
2902         if (iov_iter_rw(iter) == WRITE) {
2903                 if (ret > 0)
2904                         fuse_write_update_size(inode, pos);
2905                 else if (ret < 0 && offset + count > i_size)
2906                         fuse_do_truncate(file);
2907         }
2908
2909         return ret;
2910 }
2911
2912 static long fuse_file_fallocate(struct file *file, int mode, loff_t offset,
2913                                 loff_t length)
2914 {
2915         struct fuse_file *ff = file->private_data;
2916         struct inode *inode = file_inode(file);
2917         struct fuse_inode *fi = get_fuse_inode(inode);
2918         struct fuse_conn *fc = ff->fc;
2919         FUSE_ARGS(args);
2920         struct fuse_fallocate_in inarg = {
2921                 .fh = ff->fh,
2922                 .offset = offset,
2923                 .length = length,
2924                 .mode = mode
2925         };
2926         int err;
2927         bool lock_inode = !(mode & FALLOC_FL_KEEP_SIZE) ||
2928                            (mode & FALLOC_FL_PUNCH_HOLE);
2929
2930         if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
2931                 return -EOPNOTSUPP;
2932
2933         if (fc->no_fallocate)
2934                 return -EOPNOTSUPP;
2935
2936         if (lock_inode) {
2937                 mutex_lock(&inode->i_mutex);
2938                 if (mode & FALLOC_FL_PUNCH_HOLE) {
2939                         loff_t endbyte = offset + length - 1;
2940                         err = filemap_write_and_wait_range(inode->i_mapping,
2941                                                            offset, endbyte);
2942                         if (err)
2943                                 goto out;
2944
2945                         fuse_sync_writes(inode);
2946                 }
2947         }
2948
2949         if (!(mode & FALLOC_FL_KEEP_SIZE))
2950                 set_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);
2951
2952         args.in.h.opcode = FUSE_FALLOCATE;
2953         args.in.h.nodeid = ff->nodeid;
2954         args.in.numargs = 1;
2955         args.in.args[0].size = sizeof(inarg);
2956         args.in.args[0].value = &inarg;
2957         err = fuse_simple_request(fc, &args);
2958         if (err == -ENOSYS) {
2959                 fc->no_fallocate = 1;
2960                 err = -EOPNOTSUPP;
2961         }
2962         if (err)
2963                 goto out;
2964
2965         /* we could have extended the file */
2966         if (!(mode & FALLOC_FL_KEEP_SIZE)) {
2967                 bool changed = fuse_write_update_size(inode, offset + length);
2968
2969                 if (changed && fc->writeback_cache)
2970                         file_update_time(file);
2971         }
2972
2973         if (mode & FALLOC_FL_PUNCH_HOLE)
2974                 truncate_pagecache_range(inode, offset, offset + length - 1);
2975
2976         fuse_invalidate_attr(inode);
2977
2978 out:
2979         if (!(mode & FALLOC_FL_KEEP_SIZE))
2980                 clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);
2981
2982         if (lock_inode)
2983                 mutex_unlock(&inode->i_mutex);
2984
2985         return err;
2986 }
2987
2988 static const struct file_operations fuse_file_operations = {
2989         .llseek         = fuse_file_llseek,
2990         .read_iter      = fuse_file_read_iter,
2991         .write_iter     = fuse_file_write_iter,
2992         .mmap           = fuse_file_mmap,
2993         .open           = fuse_open,
2994         .flush          = fuse_flush,
2995         .release        = fuse_release,
2996         .fsync          = fuse_fsync,
2997         .lock           = fuse_file_lock,
2998         .flock          = fuse_file_flock,
2999         .splice_read    = generic_file_splice_read,
3000         .unlocked_ioctl = fuse_file_ioctl,
3001         .compat_ioctl   = fuse_file_compat_ioctl,
3002         .poll           = fuse_file_poll,
3003         .fallocate      = fuse_file_fallocate,
3004 };
3005
3006 static const struct file_operations fuse_direct_io_file_operations = {
3007         .llseek         = fuse_file_llseek,
3008         .read_iter      = fuse_direct_read_iter,
3009         .write_iter     = fuse_direct_write_iter,
3010         .mmap           = fuse_direct_mmap,
3011         .open           = fuse_open,
3012         .flush          = fuse_flush,
3013         .release        = fuse_release,
3014         .fsync          = fuse_fsync,
3015         .lock           = fuse_file_lock,
3016         .flock          = fuse_file_flock,
3017         .unlocked_ioctl = fuse_file_ioctl,
3018         .compat_ioctl   = fuse_file_compat_ioctl,
3019         .poll           = fuse_file_poll,
3020         .fallocate      = fuse_file_fallocate,
3021         /* no splice_read */
3022 };
3023
3024 static const struct address_space_operations fuse_file_aops  = {
3025         .readpage       = fuse_readpage,
3026         .writepage      = fuse_writepage,
3027         .writepages     = fuse_writepages,
3028         .launder_page   = fuse_launder_page,
3029         .readpages      = fuse_readpages,
3030         .set_page_dirty = __set_page_dirty_nobuffers,
3031         .bmap           = fuse_bmap,
3032         .direct_IO      = fuse_direct_IO,
3033         .write_begin    = fuse_write_begin,
3034         .write_end      = fuse_write_end,
3035 };
3036
3037 void fuse_init_file_inode(struct inode *inode)
3038 {
3039         inode->i_fop = &fuse_file_operations;
3040         inode->i_data.a_ops = &fuse_file_aops;
3041 }