fuse: misc cleanups
[firefly-linux-kernel-4.4.55.git] fs/fuse/file.c
1 /*
2   FUSE: Filesystem in Userspace
3   Copyright (C) 2001-2008  Miklos Szeredi <miklos@szeredi.hu>
4
5   This program can be distributed under the terms of the GNU GPL.
6   See the file COPYING.
7 */
8
9 #include "fuse_i.h"
10
11 #include <linux/pagemap.h>
12 #include <linux/slab.h>
13 #include <linux/kernel.h>
14 #include <linux/sched.h>
15
16 static const struct file_operations fuse_direct_io_file_operations;
17
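/*
 * Send FUSE_OPEN or FUSE_OPENDIR and collect the server's reply in
 * *outargp.  O_CREAT, O_EXCL and O_NOCTTY are never forwarded to
 * userspace, and O_TRUNC is forwarded only if the server negotiated
 * atomic_o_trunc.
 */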
18 static int fuse_send_open(struct inode *inode, struct file *file, int isdir,
19                           struct fuse_open_out *outargp)
20 {
21         struct fuse_conn *fc = get_fuse_conn(inode);
22         struct fuse_open_in inarg;
23         struct fuse_req *req;
24         int err;
25
26         req = fuse_get_req(fc);
27         if (IS_ERR(req))
28                 return PTR_ERR(req);
29
30         memset(&inarg, 0, sizeof(inarg));
31         inarg.flags = file->f_flags & ~(O_CREAT | O_EXCL | O_NOCTTY);
32         if (!fc->atomic_o_trunc)
33                 inarg.flags &= ~O_TRUNC;
34         req->in.h.opcode = isdir ? FUSE_OPENDIR : FUSE_OPEN;
35         req->in.h.nodeid = get_node_id(inode);
36         req->in.numargs = 1;
37         req->in.args[0].size = sizeof(inarg);
38         req->in.args[0].value = &inarg;
39         req->out.numargs = 1;
40         req->out.args[0].size = sizeof(*outargp);
41         req->out.args[0].value = outargp;
42         fuse_request_send(fc, req);
43         err = req->out.h.error;
44         fuse_put_request(fc, req);
45
46         return err;
47 }
48
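/*
 * Allocate a fuse_file together with the reserved request that will
 * later carry the RELEASE, and assign it a kernel handle (kh) that is
 * unique within the connection.
 */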
49 struct fuse_file *fuse_file_alloc(struct fuse_conn *fc)
50 {
51         struct fuse_file *ff;
52
53         ff = kmalloc(sizeof(struct fuse_file), GFP_KERNEL);
54         if (unlikely(!ff))
55                 return NULL;
56
57         ff->reserved_req = fuse_request_alloc();
58         if (unlikely(!ff->reserved_req)) {
59                 kfree(ff);
60                 return NULL;
61         }
62
63         INIT_LIST_HEAD(&ff->write_entry);
64         atomic_set(&ff->count, 0);
65         RB_CLEAR_NODE(&ff->polled_node);
66         init_waitqueue_head(&ff->poll_wait);
67
68         spin_lock(&fc->lock);
69         ff->kh = ++fc->khctr;
70         spin_unlock(&fc->lock);
71
72         return ff;
73 }
74
75 void fuse_file_free(struct fuse_file *ff)
76 {
77         fuse_request_free(ff->reserved_req);
78         kfree(ff);
79 }
80
81 static struct fuse_file *fuse_file_get(struct fuse_file *ff)
82 {
83         atomic_inc(&ff->count);
84         return ff;
85 }
86
87 static void fuse_release_end(struct fuse_conn *fc, struct fuse_req *req)
88 {
89         dput(req->misc.release.dentry);
90         mntput(req->misc.release.vfsmount);
91 }
92
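/*
 * Drop a reference to the file.  When the last reference goes away,
 * the reserved request (filled in by fuse_release_fill()) is sent in
 * the background; its ->end callback drops the dentry and vfsmount
 * pinned by fuse_release_common().
 */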
93 static void fuse_file_put(struct fuse_file *ff)
94 {
95         if (atomic_dec_and_test(&ff->count)) {
96                 struct fuse_req *req = ff->reserved_req;
97                 struct inode *inode = req->misc.release.dentry->d_inode;
98                 struct fuse_conn *fc = get_fuse_conn(inode);
99                 req->end = fuse_release_end;
100                 fuse_request_send_background(fc, req);
101                 kfree(ff);
102         }
103 }
104
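/*
 * Apply the open flags returned by the server: switch to the direct
 * I/O file operations for FOPEN_DIRECT_IO, drop cached pages unless
 * FOPEN_KEEP_CACHE is set, and mark the file nonseekable for
 * FOPEN_NONSEEKABLE.
 */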
105 void fuse_finish_open(struct inode *inode, struct file *file,
106                       struct fuse_file *ff, struct fuse_open_out *outarg)
107 {
108         if (outarg->open_flags & FOPEN_DIRECT_IO)
109                 file->f_op = &fuse_direct_io_file_operations;
110         if (!(outarg->open_flags & FOPEN_KEEP_CACHE))
111                 invalidate_inode_pages2(inode->i_mapping);
112         if (outarg->open_flags & FOPEN_NONSEEKABLE)
113                 nonseekable_open(inode, file);
114         ff->fh = outarg->fh;
115         file->private_data = fuse_file_get(ff);
116 }
117
118 int fuse_open_common(struct inode *inode, struct file *file, int isdir)
119 {
120         struct fuse_conn *fc = get_fuse_conn(inode);
121         struct fuse_open_out outarg;
122         struct fuse_file *ff;
123         int err;
124
125         /* VFS checks this, but only _after_ ->open() */
126         if (file->f_flags & O_DIRECT)
127                 return -EINVAL;
128
129         err = generic_file_open(inode, file);
130         if (err)
131                 return err;
132
133         ff = fuse_file_alloc(fc);
134         if (!ff)
135                 return -ENOMEM;
136
137         err = fuse_send_open(inode, file, isdir, &outarg);
138         if (err)
139                 fuse_file_free(ff);
140         else {
141                 if (isdir)
142                         outarg.open_flags &= ~FOPEN_DIRECT_IO;
143                 fuse_finish_open(inode, file, ff, &outarg);
144         }
145
146         return err;
147 }
148
149 void fuse_release_fill(struct fuse_file *ff, u64 nodeid, int flags, int opcode)
150 {
151         struct fuse_req *req = ff->reserved_req;
152         struct fuse_release_in *inarg = &req->misc.release.in;
153
154         inarg->fh = ff->fh;
155         inarg->flags = flags;
156         req->in.h.opcode = opcode;
157         req->in.h.nodeid = nodeid;
158         req->in.numargs = 1;
159         req->in.args[0].size = sizeof(struct fuse_release_in);
160         req->in.args[0].value = inarg;
161 }
162
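/*
 * Fill the reserved request with RELEASE or RELEASEDIR, pin the
 * vfsmount and dentry, remove the file from the write list and the
 * polled files tree, wake up poll waiters and drop the file
 * reference.  The request itself is sent from fuse_file_put() once
 * the last reference is gone.
 */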
163 int fuse_release_common(struct inode *inode, struct file *file, int isdir)
164 {
165         struct fuse_conn *fc;
166         struct fuse_file *ff;
167         struct fuse_req *req;
168
169         ff = file->private_data;
170         if (unlikely(!ff))
171                 return 0;       /* return value is ignored by VFS */
172
173         fc = get_fuse_conn(inode);
174         req = ff->reserved_req;
175
176         fuse_release_fill(ff, get_node_id(inode), file->f_flags,
177                           isdir ? FUSE_RELEASEDIR : FUSE_RELEASE);
178
179         /* Hold vfsmount and dentry until release is finished */
180         req->misc.release.vfsmount = mntget(file->f_path.mnt);
181         req->misc.release.dentry = dget(file->f_path.dentry);
182
183         spin_lock(&fc->lock);
184         list_del(&ff->write_entry);
185         if (!RB_EMPTY_NODE(&ff->polled_node))
186                 rb_erase(&ff->polled_node, &fc->polled_files);
187         spin_unlock(&fc->lock);
188
189         wake_up_interruptible_sync(&ff->poll_wait);
190         /*
191          * Normally this will send the RELEASE request, however if
192          * some asynchronous READ or WRITE requests are outstanding,
193          * the sending will be delayed.
194          */
195         fuse_file_put(ff);
196         return 0;
197 }
198
199 static int fuse_open(struct inode *inode, struct file *file)
200 {
201         return fuse_open_common(inode, file, 0);
202 }
203
204 static int fuse_release(struct inode *inode, struct file *file)
205 {
206         return fuse_release_common(inode, file, 0);
207 }
208
209 /*
210  * Scramble the ID space with XTEA, so that the value of the files_struct
211  * pointer is not exposed to userspace.
212  */
213 u64 fuse_lock_owner_id(struct fuse_conn *fc, fl_owner_t id)
214 {
215         u32 *k = fc->scramble_key;
216         u64 v = (unsigned long) id;
217         u32 v0 = v;
218         u32 v1 = v >> 32;
219         u32 sum = 0;
220         int i;
221
222         for (i = 0; i < 32; i++) {
223                 v0 += ((v1 << 4 ^ v1 >> 5) + v1) ^ (sum + k[sum & 3]);
224                 sum += 0x9E3779B9;
225                 v1 += ((v0 << 4 ^ v0 >> 5) + v0) ^ (sum + k[sum>>11 & 3]);
226         }
227
228         return (u64) v0 + ((u64) v1 << 32);
229 }
230
231 /*
232  * Check if page is under writeback
233  *
234  * This is currently done by walking the list of writepage requests
235  * for the inode, which can be pretty inefficient.
236  */
237 static bool fuse_page_is_writeback(struct inode *inode, pgoff_t index)
238 {
239         struct fuse_conn *fc = get_fuse_conn(inode);
240         struct fuse_inode *fi = get_fuse_inode(inode);
241         struct fuse_req *req;
242         bool found = false;
243
244         spin_lock(&fc->lock);
245         list_for_each_entry(req, &fi->writepages, writepages_entry) {
246                 pgoff_t curr_index;
247
248                 BUG_ON(req->inode != inode);
249                 curr_index = req->misc.write.in.offset >> PAGE_CACHE_SHIFT;
250                 if (curr_index == index) {
251                         found = true;
252                         break;
253                 }
254         }
255         spin_unlock(&fc->lock);
256
257         return found;
258 }
259
260 /*
261  * Wait for page writeback to be completed.
262  *
263  * Since fuse doesn't rely on the VM writeback tracking, this has to
264  * use some other means.
265  */
266 static int fuse_wait_on_page_writeback(struct inode *inode, pgoff_t index)
267 {
268         struct fuse_inode *fi = get_fuse_inode(inode);
269
270         wait_event(fi->page_waitq, !fuse_page_is_writeback(inode, index));
271         return 0;
272 }
273
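/*
 * Send FUSE_FLUSH on flush (i.e. close()).  The lock owner is
 * included so the server can implement unlock-on-close (see
 * fuse_setlk()).  An -ENOSYS reply disables FLUSH for the whole
 * connection.
 */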
274 static int fuse_flush(struct file *file, fl_owner_t id)
275 {
276         struct inode *inode = file->f_path.dentry->d_inode;
277         struct fuse_conn *fc = get_fuse_conn(inode);
278         struct fuse_file *ff = file->private_data;
279         struct fuse_req *req;
280         struct fuse_flush_in inarg;
281         int err;
282
283         if (is_bad_inode(inode))
284                 return -EIO;
285
286         if (fc->no_flush)
287                 return 0;
288
289         req = fuse_get_req_nofail(fc, file);
290         memset(&inarg, 0, sizeof(inarg));
291         inarg.fh = ff->fh;
292         inarg.lock_owner = fuse_lock_owner_id(fc, id);
293         req->in.h.opcode = FUSE_FLUSH;
294         req->in.h.nodeid = get_node_id(inode);
295         req->in.numargs = 1;
296         req->in.args[0].size = sizeof(inarg);
297         req->in.args[0].value = &inarg;
298         req->force = 1;
299         fuse_request_send(fc, req);
300         err = req->out.h.error;
301         fuse_put_request(fc, req);
302         if (err == -ENOSYS) {
303                 fc->no_flush = 1;
304                 err = 0;
305         }
306         return err;
307 }
308
309 /*
310  * Wait for all pending writepages on the inode to finish.
311  *
312  * This is currently done by blocking further writes with FUSE_NOWRITE
313  * and waiting for all sent writes to complete.
314  *
315  * This must be called under i_mutex, otherwise the FUSE_NOWRITE usage
316  * could conflict with truncation.
317  */
318 static void fuse_sync_writes(struct inode *inode)
319 {
320         fuse_set_nowrite(inode);
321         fuse_release_nowrite(inode);
322 }
323
324 int fuse_fsync_common(struct file *file, struct dentry *de, int datasync,
325                       int isdir)
326 {
327         struct inode *inode = de->d_inode;
328         struct fuse_conn *fc = get_fuse_conn(inode);
329         struct fuse_file *ff = file->private_data;
330         struct fuse_req *req;
331         struct fuse_fsync_in inarg;
332         int err;
333
334         if (is_bad_inode(inode))
335                 return -EIO;
336
337         if ((!isdir && fc->no_fsync) || (isdir && fc->no_fsyncdir))
338                 return 0;
339
340         /*
341          * Start writeback against all dirty pages of the inode, then
342          * wait for all outstanding writes, before sending the FSYNC
343          * request.
344          */
345         err = write_inode_now(inode, 0);
346         if (err)
347                 return err;
348
349         fuse_sync_writes(inode);
350
351         req = fuse_get_req(fc);
352         if (IS_ERR(req))
353                 return PTR_ERR(req);
354
355         memset(&inarg, 0, sizeof(inarg));
356         inarg.fh = ff->fh;
357         inarg.fsync_flags = datasync ? 1 : 0;
358         req->in.h.opcode = isdir ? FUSE_FSYNCDIR : FUSE_FSYNC;
359         req->in.h.nodeid = get_node_id(inode);
360         req->in.numargs = 1;
361         req->in.args[0].size = sizeof(inarg);
362         req->in.args[0].value = &inarg;
363         fuse_request_send(fc, req);
364         err = req->out.h.error;
365         fuse_put_request(fc, req);
366         if (err == -ENOSYS) {
367                 if (isdir)
368                         fc->no_fsyncdir = 1;
369                 else
370                         fc->no_fsync = 1;
371                 err = 0;
372         }
373         return err;
374 }
375
376 static int fuse_fsync(struct file *file, struct dentry *de, int datasync)
377 {
378         return fuse_fsync_common(file, de, datasync, 0);
379 }
380
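/*
 * Fill in a read request with the given opcode: a single input
 * argument carrying the fuse_read_in header and a variable sized
 * output argument of at most @count bytes.
 */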
381 void fuse_read_fill(struct fuse_req *req, struct file *file,
382                     struct inode *inode, loff_t pos, size_t count, int opcode)
383 {
384         struct fuse_read_in *inarg = &req->misc.read.in;
385         struct fuse_file *ff = file->private_data;
386
387         inarg->fh = ff->fh;
388         inarg->offset = pos;
389         inarg->size = count;
390         inarg->flags = file->f_flags;
391         req->in.h.opcode = opcode;
392         req->in.h.nodeid = get_node_id(inode);
393         req->in.numargs = 1;
394         req->in.args[0].size = sizeof(struct fuse_read_in);
395         req->in.args[0].value = inarg;
396         req->out.argvar = 1;
397         req->out.numargs = 1;
398         req->out.args[0].size = count;
399 }
400
401 static size_t fuse_send_read(struct fuse_req *req, struct file *file,
402                              struct inode *inode, loff_t pos, size_t count,
403                              fl_owner_t owner)
404 {
405         struct fuse_conn *fc = get_fuse_conn(inode);
406
407         fuse_read_fill(req, file, inode, pos, count, FUSE_READ);
408         if (owner != NULL) {
409                 struct fuse_read_in *inarg = &req->misc.read.in;
410
411                 inarg->read_flags |= FUSE_READ_LOCKOWNER;
412                 inarg->lock_owner = fuse_lock_owner_id(fc, owner);
413         }
414         fuse_request_send(fc, req);
415         return req->out.args[0].size;
416 }
417
418 static void fuse_read_update_size(struct inode *inode, loff_t size,
419                                   u64 attr_ver)
420 {
421         struct fuse_conn *fc = get_fuse_conn(inode);
422         struct fuse_inode *fi = get_fuse_inode(inode);
423
424         spin_lock(&fc->lock);
425         if (attr_ver == fi->attr_version && size < inode->i_size) {
426                 fi->attr_version = ++fc->attr_version;
427                 i_size_write(inode, size);
428         }
429         spin_unlock(&fc->lock);
430 }
431
432 static int fuse_readpage(struct file *file, struct page *page)
433 {
434         struct inode *inode = page->mapping->host;
435         struct fuse_conn *fc = get_fuse_conn(inode);
436         struct fuse_req *req;
437         size_t num_read;
438         loff_t pos = page_offset(page);
439         size_t count = PAGE_CACHE_SIZE;
440         u64 attr_ver;
441         int err;
442
443         err = -EIO;
444         if (is_bad_inode(inode))
445                 goto out;
446
447         /*
448          * Page writeback can extend beyond the lifetime of the
449          * page-cache page, so make sure we read a properly synced
450          * page.
451          */
452         fuse_wait_on_page_writeback(inode, page->index);
453
454         req = fuse_get_req(fc);
455         err = PTR_ERR(req);
456         if (IS_ERR(req))
457                 goto out;
458
459         attr_ver = fuse_get_attr_version(fc);
460
461         req->out.page_zeroing = 1;
462         req->out.argpages = 1;
463         req->num_pages = 1;
464         req->pages[0] = page;
465         num_read = fuse_send_read(req, file, inode, pos, count, NULL);
466         err = req->out.h.error;
467         fuse_put_request(fc, req);
468
469         if (!err) {
470                 /*
471                  * Short read means EOF.  If file size is larger, truncate it
472                  */
473                 if (num_read < count)
474                         fuse_read_update_size(inode, pos + num_read, attr_ver);
475
476                 SetPageUptodate(page);
477         }
478
479         fuse_invalidate_attr(inode); /* atime changed */
480  out:
481         unlock_page(page);
482         return err;
483 }
484
485 static void fuse_readpages_end(struct fuse_conn *fc, struct fuse_req *req)
486 {
487         int i;
488         size_t count = req->misc.read.in.size;
489         size_t num_read = req->out.args[0].size;
490         struct inode *inode = req->pages[0]->mapping->host;
491
492         /*
493          * Short read means EOF.  If file size is larger, truncate it
494          */
495         if (!req->out.h.error && num_read < count) {
496                 loff_t pos = page_offset(req->pages[0]) + num_read;
497                 fuse_read_update_size(inode, pos, req->misc.read.attr_ver);
498         }
499
500         fuse_invalidate_attr(inode); /* atime changed */
501
502         for (i = 0; i < req->num_pages; i++) {
503                 struct page *page = req->pages[i];
504                 if (!req->out.h.error)
505                         SetPageUptodate(page);
506                 else
507                         SetPageError(page);
508                 unlock_page(page);
509         }
510         if (req->ff)
511                 fuse_file_put(req->ff);
512 }
513
514 static void fuse_send_readpages(struct fuse_req *req, struct file *file,
515                                 struct inode *inode)
516 {
517         struct fuse_conn *fc = get_fuse_conn(inode);
518         loff_t pos = page_offset(req->pages[0]);
519         size_t count = req->num_pages << PAGE_CACHE_SHIFT;
520
521         req->out.argpages = 1;
522         req->out.page_zeroing = 1;
523         fuse_read_fill(req, file, inode, pos, count, FUSE_READ);
524         req->misc.read.attr_ver = fuse_get_attr_version(fc);
525         if (fc->async_read) {
526                 struct fuse_file *ff = file->private_data;
527                 req->ff = fuse_file_get(ff);
528                 req->end = fuse_readpages_end;
529                 fuse_request_send_background(fc, req);
530         } else {
531                 fuse_request_send(fc, req);
532                 fuse_readpages_end(fc, req);
533                 fuse_put_request(fc, req);
534         }
535 }
536
537 struct fuse_fill_data {
538         struct fuse_req *req;
539         struct file *file;
540         struct inode *inode;
541 };
542
543 static int fuse_readpages_fill(void *_data, struct page *page)
544 {
545         struct fuse_fill_data *data = _data;
546         struct fuse_req *req = data->req;
547         struct inode *inode = data->inode;
548         struct fuse_conn *fc = get_fuse_conn(inode);
549
550         fuse_wait_on_page_writeback(inode, page->index);
551
552         if (req->num_pages &&
553             (req->num_pages == FUSE_MAX_PAGES_PER_REQ ||
554              (req->num_pages + 1) * PAGE_CACHE_SIZE > fc->max_read ||
555              req->pages[req->num_pages - 1]->index + 1 != page->index)) {
556                 fuse_send_readpages(req, data->file, inode);
557                 data->req = req = fuse_get_req(fc);
558                 if (IS_ERR(req)) {
559                         unlock_page(page);
560                         return PTR_ERR(req);
561                 }
562         }
563         req->pages[req->num_pages] = page;
564         req->num_pages++;
565         return 0;
566 }
567
568 static int fuse_readpages(struct file *file, struct address_space *mapping,
569                           struct list_head *pages, unsigned nr_pages)
570 {
571         struct inode *inode = mapping->host;
572         struct fuse_conn *fc = get_fuse_conn(inode);
573         struct fuse_fill_data data;
574         int err;
575
576         err = -EIO;
577         if (is_bad_inode(inode))
578                 goto out;
579
580         data.file = file;
581         data.inode = inode;
582         data.req = fuse_get_req(fc);
583         err = PTR_ERR(data.req);
584         if (IS_ERR(data.req))
585                 goto out;
586
587         err = read_cache_pages(mapping, pages, fuse_readpages_fill, &data);
588         if (!err) {
589                 if (data.req->num_pages)
590                         fuse_send_readpages(data.req, file, inode);
591                 else
592                         fuse_put_request(fc, data.req);
593         }
594 out:
595         return err;
596 }
597
598 static ssize_t fuse_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
599                                   unsigned long nr_segs, loff_t pos)
600 {
601         struct inode *inode = iocb->ki_filp->f_mapping->host;
602
603         if (pos + iov_length(iov, nr_segs) > i_size_read(inode)) {
604                 int err;
605                 /*
606                  * If trying to read past EOF, make sure the i_size
607                  * attribute is up-to-date.
608                  */
609                 err = fuse_update_attributes(inode, NULL, iocb->ki_filp, NULL);
610                 if (err)
611                         return err;
612         }
613
614         return generic_file_aio_read(iocb, iov, nr_segs, pos);
615 }
616
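/*
 * Fill in a WRITE request.  @writepage distinguishes writeback of
 * cached data (FUSE_WRITE_CACHE) from a write on behalf of an open
 * file.  Servers speaking a protocol minor version older than 9 get
 * the compat sized fuse_write_in header.
 */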
617 static void fuse_write_fill(struct fuse_req *req, struct file *file,
618                             struct fuse_file *ff, struct inode *inode,
619                             loff_t pos, size_t count, int writepage)
620 {
621         struct fuse_conn *fc = get_fuse_conn(inode);
622         struct fuse_write_in *inarg = &req->misc.write.in;
623         struct fuse_write_out *outarg = &req->misc.write.out;
624
625         memset(inarg, 0, sizeof(struct fuse_write_in));
626         inarg->fh = ff->fh;
627         inarg->offset = pos;
628         inarg->size = count;
629         inarg->write_flags = writepage ? FUSE_WRITE_CACHE : 0;
630         inarg->flags = file ? file->f_flags : 0;
631         req->in.h.opcode = FUSE_WRITE;
632         req->in.h.nodeid = get_node_id(inode);
633         req->in.numargs = 2;
634         if (fc->minor < 9)
635                 req->in.args[0].size = FUSE_COMPAT_WRITE_IN_SIZE;
636         else
637                 req->in.args[0].size = sizeof(struct fuse_write_in);
638         req->in.args[0].value = inarg;
639         req->in.args[1].size = count;
640         req->out.numargs = 1;
641         req->out.args[0].size = sizeof(struct fuse_write_out);
642         req->out.args[0].value = outarg;
643 }
644
645 static size_t fuse_send_write(struct fuse_req *req, struct file *file,
646                               struct inode *inode, loff_t pos, size_t count,
647                               fl_owner_t owner)
648 {
649         struct fuse_conn *fc = get_fuse_conn(inode);
650         fuse_write_fill(req, file, file->private_data, inode, pos, count, 0);
651         if (owner != NULL) {
652                 struct fuse_write_in *inarg = &req->misc.write.in;
653                 inarg->write_flags |= FUSE_WRITE_LOCKOWNER;
654                 inarg->lock_owner = fuse_lock_owner_id(fc, owner);
655         }
656         fuse_request_send(fc, req);
657         return req->misc.write.out.size;
658 }
659
660 static int fuse_write_begin(struct file *file, struct address_space *mapping,
661                         loff_t pos, unsigned len, unsigned flags,
662                         struct page **pagep, void **fsdata)
663 {
664         pgoff_t index = pos >> PAGE_CACHE_SHIFT;
665
666         *pagep = grab_cache_page_write_begin(mapping, index, flags);
667         if (!*pagep)
668                 return -ENOMEM;
669         return 0;
670 }
671
672 static void fuse_write_update_size(struct inode *inode, loff_t pos)
673 {
674         struct fuse_conn *fc = get_fuse_conn(inode);
675         struct fuse_inode *fi = get_fuse_inode(inode);
676
677         spin_lock(&fc->lock);
678         fi->attr_version = ++fc->attr_version;
679         if (pos > inode->i_size)
680                 i_size_write(inode, pos);
681         spin_unlock(&fc->lock);
682 }
683
684 static int fuse_buffered_write(struct file *file, struct inode *inode,
685                                loff_t pos, unsigned count, struct page *page)
686 {
687         int err;
688         size_t nres;
689         struct fuse_conn *fc = get_fuse_conn(inode);
690         unsigned offset = pos & (PAGE_CACHE_SIZE - 1);
691         struct fuse_req *req;
692
693         if (is_bad_inode(inode))
694                 return -EIO;
695
696         /*
697          * Make sure writepages on the same page are not mixed up with
698          * plain writes.
699          */
700         fuse_wait_on_page_writeback(inode, page->index);
701
702         req = fuse_get_req(fc);
703         if (IS_ERR(req))
704                 return PTR_ERR(req);
705
706         req->in.argpages = 1;
707         req->num_pages = 1;
708         req->pages[0] = page;
709         req->page_offset = offset;
710         nres = fuse_send_write(req, file, inode, pos, count, NULL);
711         err = req->out.h.error;
712         fuse_put_request(fc, req);
713         if (!err && !nres)
714                 err = -EIO;
715         if (!err) {
716                 pos += nres;
717                 fuse_write_update_size(inode, pos);
718                 if (count == PAGE_CACHE_SIZE)
719                         SetPageUptodate(page);
720         }
721         fuse_invalidate_attr(inode);
722         return err ? err : nres;
723 }
724
725 static int fuse_write_end(struct file *file, struct address_space *mapping,
726                         loff_t pos, unsigned len, unsigned copied,
727                         struct page *page, void *fsdata)
728 {
729         struct inode *inode = mapping->host;
730         int res = 0;
731
732         if (copied)
733                 res = fuse_buffered_write(file, inode, pos, copied, page);
734
735         unlock_page(page);
736         page_cache_release(page);
737         return res;
738 }
739
740 static size_t fuse_send_write_pages(struct fuse_req *req, struct file *file,
741                                     struct inode *inode, loff_t pos,
742                                     size_t count)
743 {
744         size_t res;
745         unsigned offset;
746         unsigned i;
747
748         for (i = 0; i < req->num_pages; i++)
749                 fuse_wait_on_page_writeback(inode, req->pages[i]->index);
750
751         res = fuse_send_write(req, file, inode, pos, count, NULL);
752
753         offset = req->page_offset;
754         count = res;
755         for (i = 0; i < req->num_pages; i++) {
756                 struct page *page = req->pages[i];
757
758                 if (!req->out.h.error && !offset && count >= PAGE_CACHE_SIZE)
759                         SetPageUptodate(page);
760
761                 if (count > PAGE_CACHE_SIZE - offset)
762                         count -= PAGE_CACHE_SIZE - offset;
763                 else
764                         count = 0;
765                 offset = 0;
766
767                 unlock_page(page);
768                 page_cache_release(page);
769         }
770
771         return res;
772 }
773
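/*
 * Copy as much data as possible from the iovec into freshly grabbed
 * page cache pages for a single WRITE request.  Stops at the
 * connection's max_write limit, at FUSE_MAX_PAGES_PER_REQ pages, as
 * soon as a copy ends in the middle of a page, or after one page if
 * the server did not enable big_writes.
 */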
774 static ssize_t fuse_fill_write_pages(struct fuse_req *req,
775                                struct address_space *mapping,
776                                struct iov_iter *ii, loff_t pos)
777 {
778         struct fuse_conn *fc = get_fuse_conn(mapping->host);
779         unsigned offset = pos & (PAGE_CACHE_SIZE - 1);
780         size_t count = 0;
781         int err;
782
783         req->in.argpages = 1;
784         req->page_offset = offset;
785
786         do {
787                 size_t tmp;
788                 struct page *page;
789                 pgoff_t index = pos >> PAGE_CACHE_SHIFT;
790                 size_t bytes = min_t(size_t, PAGE_CACHE_SIZE - offset,
791                                      iov_iter_count(ii));
792
793                 bytes = min_t(size_t, bytes, fc->max_write - count);
794
795  again:
796                 err = -EFAULT;
797                 if (iov_iter_fault_in_readable(ii, bytes))
798                         break;
799
800                 err = -ENOMEM;
801                 page = grab_cache_page_write_begin(mapping, index, 0);
802                 if (!page)
803                         break;
804
805                 pagefault_disable();
806                 tmp = iov_iter_copy_from_user_atomic(page, ii, offset, bytes);
807                 pagefault_enable();
808                 flush_dcache_page(page);
809
810                 if (!tmp) {
811                         unlock_page(page);
812                         page_cache_release(page);
813                         bytes = min(bytes, iov_iter_single_seg_count(ii));
814                         goto again;
815                 }
816
817                 err = 0;
818                 req->pages[req->num_pages] = page;
819                 req->num_pages++;
820
821                 iov_iter_advance(ii, tmp);
822                 count += tmp;
823                 pos += tmp;
824                 offset += tmp;
825                 if (offset == PAGE_CACHE_SIZE)
826                         offset = 0;
827
828                 if (!fc->big_writes)
829                         break;
830         } while (iov_iter_count(ii) && count < fc->max_write &&
831                  req->num_pages < FUSE_MAX_PAGES_PER_REQ && offset == 0);
832
833         return count > 0 ? count : err;
834 }
835
836 static ssize_t fuse_perform_write(struct file *file,
837                                   struct address_space *mapping,
838                                   struct iov_iter *ii, loff_t pos)
839 {
840         struct inode *inode = mapping->host;
841         struct fuse_conn *fc = get_fuse_conn(inode);
842         int err = 0;
843         ssize_t res = 0;
844
845         if (is_bad_inode(inode))
846                 return -EIO;
847
848         do {
849                 struct fuse_req *req;
850                 ssize_t count;
851
852                 req = fuse_get_req(fc);
853                 if (IS_ERR(req)) {
854                         err = PTR_ERR(req);
855                         break;
856                 }
857
858                 count = fuse_fill_write_pages(req, mapping, ii, pos);
859                 if (count <= 0) {
860                         err = count;
861                 } else {
862                         size_t num_written;
863
864                         num_written = fuse_send_write_pages(req, file, inode,
865                                                             pos, count);
866                         err = req->out.h.error;
867                         if (!err) {
868                                 res += num_written;
869                                 pos += num_written;
870
871                                 /* break out of the loop on short write */
872                                 if (num_written != count)
873                                         err = -EIO;
874                         }
875                 }
876                 fuse_put_request(fc, req);
877         } while (!err && iov_iter_count(ii));
878
879         if (res > 0)
880                 fuse_write_update_size(inode, pos);
881
882         fuse_invalidate_attr(inode);
883
884         return res > 0 ? res : err;
885 }
886
887 static ssize_t fuse_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
888                                    unsigned long nr_segs, loff_t pos)
889 {
890         struct file *file = iocb->ki_filp;
891         struct address_space *mapping = file->f_mapping;
892         size_t count = 0;
893         ssize_t written = 0;
894         struct inode *inode = mapping->host;
895         ssize_t err;
896         struct iov_iter i;
897
898         WARN_ON(iocb->ki_pos != pos);
899
900         err = generic_segment_checks(iov, &nr_segs, &count, VERIFY_READ);
901         if (err)
902                 return err;
903
904         mutex_lock(&inode->i_mutex);
905         vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
906
907         /* We can write back this queue in page reclaim */
908         current->backing_dev_info = mapping->backing_dev_info;
909
910         err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
911         if (err)
912                 goto out;
913
914         if (count == 0)
915                 goto out;
916
917         err = file_remove_suid(file);
918         if (err)
919                 goto out;
920
921         file_update_time(file);
922
923         iov_iter_init(&i, iov, nr_segs, count, 0);
924         written = fuse_perform_write(file, mapping, &i, pos);
925         if (written >= 0)
926                 iocb->ki_pos = pos + written;
927
928 out:
929         current->backing_dev_info = NULL;
930         mutex_unlock(&inode->i_mutex);
931
932         return written ? written : err;
933 }
934
935 static void fuse_release_user_pages(struct fuse_req *req, int write)
936 {
937         unsigned i;
938
939         for (i = 0; i < req->num_pages; i++) {
940                 struct page *page = req->pages[i];
941                 if (write)
942                         set_page_dirty_lock(page);
943                 put_page(page);
944         }
945 }
946
947 static int fuse_get_user_pages(struct fuse_req *req, const char __user *buf,
948                                size_t *nbytesp, int write)
949 {
950         size_t nbytes = *nbytesp;
951         unsigned long user_addr = (unsigned long) buf;
952         unsigned offset = user_addr & ~PAGE_MASK;
953         int npages;
954
955         /* Special case for kernel I/O: can copy directly into the buffer */
956         if (segment_eq(get_fs(), KERNEL_DS)) {
957                 if (write)
958                         req->in.args[1].value = (void *) user_addr;
959                 else
960                         req->out.args[0].value = (void *) user_addr;
961
962                 return 0;
963         }
964
965         nbytes = min_t(size_t, nbytes, FUSE_MAX_PAGES_PER_REQ << PAGE_SHIFT);
966         npages = (nbytes + offset + PAGE_SIZE - 1) >> PAGE_SHIFT;
967         npages = clamp(npages, 1, FUSE_MAX_PAGES_PER_REQ);
968         down_read(&current->mm->mmap_sem);
969         npages = get_user_pages(current, current->mm, user_addr, npages, !write,
970                                 0, req->pages, NULL);
971         up_read(&current->mm->mmap_sem);
972         if (npages < 0)
973                 return npages;
974
975         req->num_pages = npages;
976         req->page_offset = offset;
977
978         if (write)
979                 req->in.argpages = 1;
980         else
981                 req->out.argpages = 1;
982
983         nbytes = (req->num_pages << PAGE_SHIFT) - req->page_offset;
984         *nbytesp = min(*nbytesp, nbytes);
985
986         return 0;
987 }
988
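/*
 * Synchronous direct I/O: split the user buffer into chunks of at
 * most max_read/max_write bytes, pin the user pages with
 * get_user_pages() and send one request per chunk, advancing as long
 * as full sized replies come back.
 */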
989 static ssize_t fuse_direct_io(struct file *file, const char __user *buf,
990                               size_t count, loff_t *ppos, int write)
991 {
992         struct inode *inode = file->f_path.dentry->d_inode;
993         struct fuse_conn *fc = get_fuse_conn(inode);
994         size_t nmax = write ? fc->max_write : fc->max_read;
995         loff_t pos = *ppos;
996         ssize_t res = 0;
997         struct fuse_req *req;
998
999         if (is_bad_inode(inode))
1000                 return -EIO;
1001
1002         req = fuse_get_req(fc);
1003         if (IS_ERR(req))
1004                 return PTR_ERR(req);
1005
1006         while (count) {
1007                 size_t nres;
1008                 size_t nbytes = min(count, nmax);
1009                 int err = fuse_get_user_pages(req, buf, &nbytes, write);
1010                 if (err) {
1011                         res = err;
1012                         break;
1013                 }
1014
1015                 if (write)
1016                         nres = fuse_send_write(req, file, inode, pos, nbytes,
1017                                                current->files);
1018                 else
1019                         nres = fuse_send_read(req, file, inode, pos, nbytes,
1020                                               current->files);
1021                 fuse_release_user_pages(req, !write);
1022                 if (req->out.h.error) {
1023                         if (!res)
1024                                 res = req->out.h.error;
1025                         break;
1026                 } else if (nres > nbytes) {
1027                         res = -EIO;
1028                         break;
1029                 }
1030                 count -= nres;
1031                 res += nres;
1032                 pos += nres;
1033                 buf += nres;
1034                 if (nres != nbytes)
1035                         break;
1036                 if (count) {
1037                         fuse_put_request(fc, req);
1038                         req = fuse_get_req(fc);
1039                         if (IS_ERR(req))
1040                                 break;
1041                 }
1042         }
1043         fuse_put_request(fc, req);
1044         if (res > 0) {
1045                 if (write)
1046                         fuse_write_update_size(inode, pos);
1047                 *ppos = pos;
1048         }
1049         fuse_invalidate_attr(inode);
1050
1051         return res;
1052 }
1053
1054 static ssize_t fuse_direct_read(struct file *file, char __user *buf,
1055                                      size_t count, loff_t *ppos)
1056 {
1057         return fuse_direct_io(file, buf, count, ppos, 0);
1058 }
1059
1060 static ssize_t fuse_direct_write(struct file *file, const char __user *buf,
1061                                  size_t count, loff_t *ppos)
1062 {
1063         struct inode *inode = file->f_path.dentry->d_inode;
1064         ssize_t res;
1065         /* Don't allow parallel writes to the same file */
1066         mutex_lock(&inode->i_mutex);
1067         res = generic_write_checks(file, ppos, &count, 0);
1068         if (!res)
1069                 res = fuse_direct_io(file, buf, count, ppos, 1);
1070         mutex_unlock(&inode->i_mutex);
1071         return res;
1072 }
1073
1074 static void fuse_writepage_free(struct fuse_conn *fc, struct fuse_req *req)
1075 {
1076         __free_page(req->pages[0]);
1077         fuse_file_put(req->ff);
1078 }
1079
1080 static void fuse_writepage_finish(struct fuse_conn *fc, struct fuse_req *req)
1081 {
1082         struct inode *inode = req->inode;
1083         struct fuse_inode *fi = get_fuse_inode(inode);
1084         struct backing_dev_info *bdi = inode->i_mapping->backing_dev_info;
1085
1086         list_del(&req->writepages_entry);
1087         dec_bdi_stat(bdi, BDI_WRITEBACK);
1088         dec_zone_page_state(req->pages[0], NR_WRITEBACK_TEMP);
1089         bdi_writeout_inc(bdi);
1090         wake_up(&fi->page_waitq);
1091 }
1092
1093 /* Called under fc->lock, may release and reacquire it */
1094 static void fuse_send_writepage(struct fuse_conn *fc, struct fuse_req *req)
1095 __releases(&fc->lock)
1096 __acquires(&fc->lock)
1097 {
1098         struct fuse_inode *fi = get_fuse_inode(req->inode);
1099         loff_t size = i_size_read(req->inode);
1100         struct fuse_write_in *inarg = &req->misc.write.in;
1101
1102         if (!fc->connected)
1103                 goto out_free;
1104
1105         if (inarg->offset + PAGE_CACHE_SIZE <= size) {
1106                 inarg->size = PAGE_CACHE_SIZE;
1107         } else if (inarg->offset < size) {
1108                 inarg->size = size & (PAGE_CACHE_SIZE - 1);
1109         } else {
1110                 /* Got truncated off completely */
1111                 goto out_free;
1112         }
1113
1114         req->in.args[1].size = inarg->size;
1115         fi->writectr++;
1116         fuse_request_send_background_locked(fc, req);
1117         return;
1118
1119  out_free:
1120         fuse_writepage_finish(fc, req);
1121         spin_unlock(&fc->lock);
1122         fuse_writepage_free(fc, req);
1123         fuse_put_request(fc, req);
1124         spin_lock(&fc->lock);
1125 }
1126
1127 /*
1128  * If fi->writectr is positive (no truncate or fsync going on) send
1129  * all queued writepage requests.
1130  *
1131  * Called with fc->lock held
1132  */
1133 void fuse_flush_writepages(struct inode *inode)
1134 __releases(&fc->lock)
1135 __acquires(&fc->lock)
1136 {
1137         struct fuse_conn *fc = get_fuse_conn(inode);
1138         struct fuse_inode *fi = get_fuse_inode(inode);
1139         struct fuse_req *req;
1140
1141         while (fi->writectr >= 0 && !list_empty(&fi->queued_writes)) {
1142                 req = list_entry(fi->queued_writes.next, struct fuse_req, list);
1143                 list_del_init(&req->list);
1144                 fuse_send_writepage(fc, req);
1145         }
1146 }
1147
1148 static void fuse_writepage_end(struct fuse_conn *fc, struct fuse_req *req)
1149 {
1150         struct inode *inode = req->inode;
1151         struct fuse_inode *fi = get_fuse_inode(inode);
1152
1153         mapping_set_error(inode->i_mapping, req->out.h.error);
1154         spin_lock(&fc->lock);
1155         fi->writectr--;
1156         fuse_writepage_finish(fc, req);
1157         spin_unlock(&fc->lock);
1158         fuse_writepage_free(fc, req);
1159 }
1160
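/*
 * Write back a single page.  The data is copied into a temporary
 * page (accounted as NR_WRITEBACK_TEMP) and writeback on the page
 * cache page is ended immediately; the request is queued on
 * fi->queued_writes and sent by fuse_flush_writepages().
 */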
1161 static int fuse_writepage_locked(struct page *page)
1162 {
1163         struct address_space *mapping = page->mapping;
1164         struct inode *inode = mapping->host;
1165         struct fuse_conn *fc = get_fuse_conn(inode);
1166         struct fuse_inode *fi = get_fuse_inode(inode);
1167         struct fuse_req *req;
1168         struct fuse_file *ff;
1169         struct page *tmp_page;
1170
1171         set_page_writeback(page);
1172
1173         req = fuse_request_alloc_nofs();
1174         if (!req)
1175                 goto err;
1176
1177         tmp_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
1178         if (!tmp_page)
1179                 goto err_free;
1180
1181         spin_lock(&fc->lock);
1182         BUG_ON(list_empty(&fi->write_files));
1183         ff = list_entry(fi->write_files.next, struct fuse_file, write_entry);
1184         req->ff = fuse_file_get(ff);
1185         spin_unlock(&fc->lock);
1186
1187         fuse_write_fill(req, NULL, ff, inode, page_offset(page), 0, 1);
1188
1189         copy_highpage(tmp_page, page);
1190         req->in.argpages = 1;
1191         req->num_pages = 1;
1192         req->pages[0] = tmp_page;
1193         req->page_offset = 0;
1194         req->end = fuse_writepage_end;
1195         req->inode = inode;
1196
1197         inc_bdi_stat(mapping->backing_dev_info, BDI_WRITEBACK);
1198         inc_zone_page_state(tmp_page, NR_WRITEBACK_TEMP);
1199         end_page_writeback(page);
1200
1201         spin_lock(&fc->lock);
1202         list_add(&req->writepages_entry, &fi->writepages);
1203         list_add_tail(&req->list, &fi->queued_writes);
1204         fuse_flush_writepages(inode);
1205         spin_unlock(&fc->lock);
1206
1207         return 0;
1208
1209 err_free:
1210         fuse_request_free(req);
1211 err:
1212         end_page_writeback(page);
1213         return -ENOMEM;
1214 }
1215
1216 static int fuse_writepage(struct page *page, struct writeback_control *wbc)
1217 {
1218         int err;
1219
1220         err = fuse_writepage_locked(page);
1221         unlock_page(page);
1222
1223         return err;
1224 }
1225
1226 static int fuse_launder_page(struct page *page)
1227 {
1228         int err = 0;
1229         if (clear_page_dirty_for_io(page)) {
1230                 struct inode *inode = page->mapping->host;
1231                 err = fuse_writepage_locked(page);
1232                 if (!err)
1233                         fuse_wait_on_page_writeback(inode, page->index);
1234         }
1235         return err;
1236 }
1237
1238 /*
1239  * Write back dirty pages now, because there may not be any suitable
1240  * open files later
1241  */
1242 static void fuse_vma_close(struct vm_area_struct *vma)
1243 {
1244         filemap_write_and_wait(vma->vm_file->f_mapping);
1245 }
1246
1247 /*
1248  * Wait for writeback against this page to complete before allowing it
1249  * to be marked dirty again, and hence written back again, possibly
1250  * before the previous writepage completed.
1251  *
1252  * Block here, instead of in ->writepage(), so that the userspace fs
1253  * can only block processes actually operating on the filesystem.
1254  *
1255  * Otherwise unprivileged userspace fs would be able to block
1256  * unrelated:
1257  *
1258  * - page migration
1259  * - sync(2)
1260  * - try_to_free_pages() with order > PAGE_ALLOC_COSTLY_ORDER
1261  */
1262 static int fuse_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
1263 {
1264         struct page *page = vmf->page;
1265         /*
1266          * Don't use page->mapping as it may become NULL from a
1267          * concurrent truncate.
1268          */
1269         struct inode *inode = vma->vm_file->f_mapping->host;
1270
1271         fuse_wait_on_page_writeback(inode, page->index);
1272         return 0;
1273 }
1274
1275 static struct vm_operations_struct fuse_file_vm_ops = {
1276         .close          = fuse_vma_close,
1277         .fault          = filemap_fault,
1278         .page_mkwrite   = fuse_page_mkwrite,
1279 };
1280
1281 static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma)
1282 {
1283         if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) {
1284                 struct inode *inode = file->f_dentry->d_inode;
1285                 struct fuse_conn *fc = get_fuse_conn(inode);
1286                 struct fuse_inode *fi = get_fuse_inode(inode);
1287                 struct fuse_file *ff = file->private_data;
1288                 /*
1289                  * file may be written through mmap, so chain it onto the
1290                  * inodes's write_file list
1291                  * inode's write_files list
1292                 spin_lock(&fc->lock);
1293                 if (list_empty(&ff->write_entry))
1294                         list_add(&ff->write_entry, &fi->write_files);
1295                 spin_unlock(&fc->lock);
1296         }
1297         file_accessed(file);
1298         vma->vm_ops = &fuse_file_vm_ops;
1299         return 0;
1300 }
1301
1302 static int fuse_direct_mmap(struct file *file, struct vm_area_struct *vma)
1303 {
1304         /* Can't provide the coherency needed for MAP_SHARED */
1305         if (vma->vm_flags & VM_MAYSHARE)
1306                 return -ENODEV;
1307
1308         invalidate_inode_pages2(file->f_mapping);
1309
1310         return generic_file_mmap(file, vma);
1311 }
1312
1313 static int convert_fuse_file_lock(const struct fuse_file_lock *ffl,
1314                                   struct file_lock *fl)
1315 {
1316         switch (ffl->type) {
1317         case F_UNLCK:
1318                 break;
1319
1320         case F_RDLCK:
1321         case F_WRLCK:
1322                 if (ffl->start > OFFSET_MAX || ffl->end > OFFSET_MAX ||
1323                     ffl->end < ffl->start)
1324                         return -EIO;
1325
1326                 fl->fl_start = ffl->start;
1327                 fl->fl_end = ffl->end;
1328                 fl->fl_pid = ffl->pid;
1329                 break;
1330
1331         default:
1332                 return -EIO;
1333         }
1334         fl->fl_type = ffl->type;
1335         return 0;
1336 }
1337
1338 static void fuse_lk_fill(struct fuse_req *req, struct file *file,
1339                          const struct file_lock *fl, int opcode, pid_t pid,
1340                          int flock)
1341 {
1342         struct inode *inode = file->f_path.dentry->d_inode;
1343         struct fuse_conn *fc = get_fuse_conn(inode);
1344         struct fuse_file *ff = file->private_data;
1345         struct fuse_lk_in *arg = &req->misc.lk_in;
1346
1347         arg->fh = ff->fh;
1348         arg->owner = fuse_lock_owner_id(fc, fl->fl_owner);
1349         arg->lk.start = fl->fl_start;
1350         arg->lk.end = fl->fl_end;
1351         arg->lk.type = fl->fl_type;
1352         arg->lk.pid = pid;
1353         if (flock)
1354                 arg->lk_flags |= FUSE_LK_FLOCK;
1355         req->in.h.opcode = opcode;
1356         req->in.h.nodeid = get_node_id(inode);
1357         req->in.numargs = 1;
1358         req->in.args[0].size = sizeof(*arg);
1359         req->in.args[0].value = arg;
1360 }
1361
1362 static int fuse_getlk(struct file *file, struct file_lock *fl)
1363 {
1364         struct inode *inode = file->f_path.dentry->d_inode;
1365         struct fuse_conn *fc = get_fuse_conn(inode);
1366         struct fuse_req *req;
1367         struct fuse_lk_out outarg;
1368         int err;
1369
1370         req = fuse_get_req(fc);
1371         if (IS_ERR(req))
1372                 return PTR_ERR(req);
1373
1374         fuse_lk_fill(req, file, fl, FUSE_GETLK, 0, 0);
1375         req->out.numargs = 1;
1376         req->out.args[0].size = sizeof(outarg);
1377         req->out.args[0].value = &outarg;
1378         fuse_request_send(fc, req);
1379         err = req->out.h.error;
1380         fuse_put_request(fc, req);
1381         if (!err)
1382                 err = convert_fuse_file_lock(&outarg.lk, fl);
1383
1384         return err;
1385 }
1386
1387 static int fuse_setlk(struct file *file, struct file_lock *fl, int flock)
1388 {
1389         struct inode *inode = file->f_path.dentry->d_inode;
1390         struct fuse_conn *fc = get_fuse_conn(inode);
1391         struct fuse_req *req;
1392         int opcode = (fl->fl_flags & FL_SLEEP) ? FUSE_SETLKW : FUSE_SETLK;
1393         pid_t pid = fl->fl_type != F_UNLCK ? current->tgid : 0;
1394         int err;
1395
1396         if (fl->fl_lmops && fl->fl_lmops->fl_grant) {
1397                 /* NLM needs asynchronous locks, which we don't support yet */
1398                 return -ENOLCK;
1399         }
1400
1401         /* Unlock on close is handled by the flush method */
1402         if (fl->fl_flags & FL_CLOSE)
1403                 return 0;
1404
1405         req = fuse_get_req(fc);
1406         if (IS_ERR(req))
1407                 return PTR_ERR(req);
1408
1409         fuse_lk_fill(req, file, fl, opcode, pid, flock);
1410         fuse_request_send(fc, req);
1411         err = req->out.h.error;
1412         /* locking is restartable */
1413         if (err == -EINTR)
1414                 err = -ERESTARTSYS;
1415         fuse_put_request(fc, req);
1416         return err;
1417 }
1418
1419 static int fuse_file_lock(struct file *file, int cmd, struct file_lock *fl)
1420 {
1421         struct inode *inode = file->f_path.dentry->d_inode;
1422         struct fuse_conn *fc = get_fuse_conn(inode);
1423         int err;
1424
1425         if (cmd == F_CANCELLK) {
1426                 err = 0;
1427         } else if (cmd == F_GETLK) {
1428                 if (fc->no_lock) {
1429                         posix_test_lock(file, fl);
1430                         err = 0;
1431                 } else
1432                         err = fuse_getlk(file, fl);
1433         } else {
1434                 if (fc->no_lock)
1435                         err = posix_lock_file(file, fl, NULL);
1436                 else
1437                         err = fuse_setlk(file, fl, 0);
1438         }
1439         return err;
1440 }
1441
1442 static int fuse_file_flock(struct file *file, int cmd, struct file_lock *fl)
1443 {
1444         struct inode *inode = file->f_path.dentry->d_inode;
1445         struct fuse_conn *fc = get_fuse_conn(inode);
1446         int err;
1447
1448         if (fc->no_lock) {
1449                 err = flock_lock_file_wait(file, fl);
1450         } else {
1451                 /* emulate flock with POSIX locks */
1452                 fl->fl_owner = (fl_owner_t) file;
1453                 err = fuse_setlk(file, fl, 1);
1454         }
1455
1456         return err;
1457 }
1458
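/*
 * Map a file block to a device block with FUSE_BMAP.  Only attempted
 * for block device backed filesystems; an -ENOSYS reply disables it.
 */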
1459 static sector_t fuse_bmap(struct address_space *mapping, sector_t block)
1460 {
1461         struct inode *inode = mapping->host;
1462         struct fuse_conn *fc = get_fuse_conn(inode);
1463         struct fuse_req *req;
1464         struct fuse_bmap_in inarg;
1465         struct fuse_bmap_out outarg;
1466         int err;
1467
1468         if (!inode->i_sb->s_bdev || fc->no_bmap)
1469                 return 0;
1470
1471         req = fuse_get_req(fc);
1472         if (IS_ERR(req))
1473                 return 0;
1474
1475         memset(&inarg, 0, sizeof(inarg));
1476         inarg.block = block;
1477         inarg.blocksize = inode->i_sb->s_blocksize;
1478         req->in.h.opcode = FUSE_BMAP;
1479         req->in.h.nodeid = get_node_id(inode);
1480         req->in.numargs = 1;
1481         req->in.args[0].size = sizeof(inarg);
1482         req->in.args[0].value = &inarg;
1483         req->out.numargs = 1;
1484         req->out.args[0].size = sizeof(outarg);
1485         req->out.args[0].value = &outarg;
1486         fuse_request_send(fc, req);
1487         err = req->out.h.error;
1488         fuse_put_request(fc, req);
1489         if (err == -ENOSYS)
1490                 fc->no_bmap = 1;
1491
1492         return err ? 0 : outarg.block;
1493 }
1494
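/*
 * llseek: for SEEK_END the attributes are refreshed first, so that
 * i_size_read() returns an up to date file size.
 */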
1495 static loff_t fuse_file_llseek(struct file *file, loff_t offset, int origin)
1496 {
1497         loff_t retval;
1498         struct inode *inode = file->f_path.dentry->d_inode;
1499
1500         mutex_lock(&inode->i_mutex);
1501         switch (origin) {
1502         case SEEK_END:
1503                 retval = fuse_update_attributes(inode, NULL, file, NULL);
1504                 if (retval)
1505                         goto exit;
1506                 offset += i_size_read(inode);
1507                 break;
1508         case SEEK_CUR:
1509                 offset += file->f_pos;
1510         }
1511         retval = -EINVAL;
1512         if (offset >= 0 && offset <= inode->i_sb->s_maxbytes) {
1513                 if (offset != file->f_pos) {
1514                         file->f_pos = offset;
1515                         file->f_version = 0;
1516                 }
1517                 retval = offset;
1518         }
1519 exit:
1520         mutex_unlock(&inode->i_mutex);
1521         return retval;
1522 }
1523
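/*
 * Copy @bytes between the @pages array and the user iovec, in the
 * direction selected by @to_user.
 */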
1524 static int fuse_ioctl_copy_user(struct page **pages, struct iovec *iov,
1525                         unsigned int nr_segs, size_t bytes, bool to_user)
1526 {
1527         struct iov_iter ii;
1528         int page_idx = 0;
1529
1530         if (!bytes)
1531                 return 0;
1532
1533         iov_iter_init(&ii, iov, nr_segs, bytes, 0);
1534
1535         while (iov_iter_count(&ii)) {
1536                 struct page *page = pages[page_idx++];
1537                 size_t todo = min_t(size_t, PAGE_SIZE, iov_iter_count(&ii));
1538                 void *kaddr, *map;
1539
1540                 kaddr = map = kmap(page);
1541
1542                 while (todo) {
1543                         char __user *uaddr = ii.iov->iov_base + ii.iov_offset;
1544                         size_t iov_len = ii.iov->iov_len - ii.iov_offset;
1545                         size_t copy = min(todo, iov_len);
1546                         size_t left;
1547
1548                         if (!to_user)
1549                                 left = copy_from_user(kaddr, uaddr, copy);
1550                         else
1551                                 left = copy_to_user(uaddr, kaddr, copy);
1552
1553                         if (unlikely(left))
1554                                 return -EFAULT;
1555
1556                         iov_iter_advance(&ii, copy);
1557                         todo -= copy;
1558                         kaddr += copy;
1559                 }
1560
1561                 kunmap(map);
1562         }
1563
1564         return 0;
1565 }
1566
1567 /*
1568  * For ioctls, there is no generic way to determine how much memory
1569  * needs to be read and/or written.  Furthermore, ioctls are allowed
1570  * to dereference the passed pointer, so the parameter requires deep
1571  * copying but FUSE has no idea whatsoever about what to copy in or
1572  * out.
1573  *
1574  * This is solved by allowing FUSE server to retry ioctl with
1575  * necessary in/out iovecs.  Let's assume the ioctl implementation
1576  * needs to read in the following structure.
1577  *
1578  * struct a {
1579  *      char    *buf;
1580  *      size_t  buflen;
1581  * }
1582  *
1583  * On the first callout to FUSE server, inarg->in_size and
1584  * inarg->out_size will be zero; then, the server completes the ioctl
1585  * with FUSE_IOCTL_RETRY set in out->flags, out->in_iovs set to 1 and
1586  * the actual iov array to
1587  *
1588  * { { .iov_base = inarg.arg,   .iov_len = sizeof(struct a) } }
1589  *
1590  * which tells FUSE to copy in the requested area and retry the ioctl.
1591  * On the second round, the server has access to the structure and
1592  * from that it can tell what to look for next, so on this invocation,
1593  * it sets FUSE_IOCTL_RETRY, out->in_iovs to 2 and the iov array to
1594  *
1595  * { { .iov_base = inarg.arg,   .iov_len = sizeof(struct a)     },
1596  *   { .iov_base = a.buf,       .iov_len = a.buflen             } }
1597  *
1598  * FUSE will copy both struct a and the pointed-to buffer from the
1599  * process doing the ioctl and retry the ioctl with both struct a and
1600  * the buffer.
1601  *
1602  * This time, the FUSE server has everything it needs and completes the
1603  * ioctl without FUSE_IOCTL_RETRY, which finishes the ioctl call.
1604  *
1605  * Copying data out works the same way.
1606  *
1607  * Note that if FUSE_IOCTL_UNRESTRICTED is clear, the kernel
1608  * automatically initializes in and out iovs by decoding @cmd with
1609  * _IOC_* macros and the server is not allowed to request RETRY.  This
1610  * limits ioctl data transfers to well-formed ioctls and is the forced
1611  * behavior for all FUSE servers.
1612  */
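/*
 * For illustration only (not part of the original source): a minimal sketch
 * of the server side of the retry protocol described above, assuming the
 * libfuse low-level API of the same era (fuse_reply_ioctl_retry(),
 * fuse_reply_ioctl() and fuse_reply_err()).  The command MYIOC_READ_A,
 * struct a (as defined in the comment above) and the handler name are
 * hypothetical.
 *
 *	static void xmp_ioctl(fuse_req_t req, fuse_ino_t ino, int cmd,
 *			      void *arg, struct fuse_file_info *fi,
 *			      unsigned flags, const void *in_buf,
 *			      size_t in_bufsz, size_t out_bufsz)
 *	{
 *		const struct a *a = in_buf;
 *
 *		if (cmd != MYIOC_READ_A) {
 *			fuse_reply_err(req, ENOTTY);
 *		} else if (in_bufsz < sizeof(*a)) {
 *			// round 1: ask the kernel to copy in struct a
 *			struct iovec iov = { arg, sizeof(*a) };
 *
 *			fuse_reply_ioctl_retry(req, &iov, 1, NULL, 0);
 *		} else if (in_bufsz < sizeof(*a) + a->buflen) {
 *			// round 2: struct a is known, also ask for a.buf
 *			struct iovec iov[2] = {
 *				{ arg,    sizeof(*a) },
 *				{ a->buf, a->buflen  },
 *			};
 *
 *			fuse_reply_ioctl_retry(req, iov, 2, NULL, 0);
 *		} else {
 *			// round 3: struct a and the buffer are both in in_buf
 *			fuse_reply_ioctl(req, 0, NULL, 0);
 *		}
 *	}
 */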
1613 static long fuse_file_do_ioctl(struct file *file, unsigned int cmd,
1614                                unsigned long arg, unsigned int flags)
1615 {
1616         struct inode *inode = file->f_dentry->d_inode;
1617         struct fuse_file *ff = file->private_data;
1618         struct fuse_conn *fc = get_fuse_conn(inode);
1619         struct fuse_ioctl_in inarg = {
1620                 .fh = ff->fh,
1621                 .cmd = cmd,
1622                 .arg = arg,
1623                 .flags = flags
1624         };
1625         struct fuse_ioctl_out outarg;
1626         struct fuse_req *req = NULL;
1627         struct page **pages = NULL;
1628         struct page *iov_page = NULL;
1629         struct iovec *in_iov = NULL, *out_iov = NULL;
1630         unsigned int in_iovs = 0, out_iovs = 0, num_pages = 0, max_pages;
1631         size_t in_size, out_size, transferred;
1632         int err;
1633
1634         /* assume all the iovs returned by the server always fit in a page */
1635         BUILD_BUG_ON(sizeof(struct iovec) * FUSE_IOCTL_MAX_IOV > PAGE_SIZE);
1636
1637         if (!fuse_allow_task(fc, current))
1638                 return -EACCES;
1639
1640         err = -EIO;
1641         if (is_bad_inode(inode))
1642                 goto out;
1643
1644         err = -ENOMEM;
1645         pages = kzalloc(sizeof(pages[0]) * FUSE_MAX_PAGES_PER_REQ, GFP_KERNEL);
1646         iov_page = alloc_page(GFP_KERNEL);
1647         if (!pages || !iov_page)
1648                 goto out;
1649
1650         /*
1651          * If restricted, initialize IO parameters as encoded in @cmd.
1652          * RETRY from server is not allowed.
1653          */
1654         if (!(flags & FUSE_IOCTL_UNRESTRICTED)) {
1655                 struct iovec *iov = page_address(iov_page);
1656
1657                 iov->iov_base = (void __user *)arg;
1658                 iov->iov_len = _IOC_SIZE(cmd);
1659
1660                 if (_IOC_DIR(cmd) & _IOC_WRITE) {
1661                         in_iov = iov;
1662                         in_iovs = 1;
1663                 }
1664
1665                 if (_IOC_DIR(cmd) & _IOC_READ) {
1666                         out_iov = iov;
1667                         out_iovs = 1;
1668                 }
1669         }
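        /*
         * Illustrative note (not in the original): for a restricted command
         * defined as, say, _IOR('M', 1, struct my_cfg), _IOC_DIR(cmd) is
         * _IOC_READ and _IOC_SIZE(cmd) is sizeof(struct my_cfg), so the
         * block above builds a single out iov of that many bytes at @arg.
         * 'M', 1 and struct my_cfg are hypothetical.
         */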
1670
1671  retry:
1672         inarg.in_size = in_size = iov_length(in_iov, in_iovs);
1673         inarg.out_size = out_size = iov_length(out_iov, out_iovs);
1674
1675         /*
1676          * The out buffer is used either for actual out data or for retry
1677          * iovs; make sure there is always at least one page.
1678          */
1679         out_size = max_t(size_t, out_size, PAGE_SIZE);
1680         max_pages = DIV_ROUND_UP(max(in_size, out_size), PAGE_SIZE);
1681
1682         /* make sure there are enough buffer pages and init request with them */
1683         err = -ENOMEM;
1684         if (max_pages > FUSE_MAX_PAGES_PER_REQ)
1685                 goto out;
1686         while (num_pages < max_pages) {
1687                 pages[num_pages] = alloc_page(GFP_KERNEL | __GFP_HIGHMEM);
1688                 if (!pages[num_pages])
1689                         goto out;
1690                 num_pages++;
1691         }
1692
1693         req = fuse_get_req(fc);
1694         if (IS_ERR(req)) {
1695                 err = PTR_ERR(req);
1696                 req = NULL;
1697                 goto out;
1698         }
1699         memcpy(req->pages, pages, sizeof(req->pages[0]) * num_pages);
1700         req->num_pages = num_pages;
1701
1702         /* okay, let's send it to the server */
1703         req->in.h.opcode = FUSE_IOCTL;
1704         req->in.h.nodeid = get_node_id(inode);
1705         req->in.numargs = 1;
1706         req->in.args[0].size = sizeof(inarg);
1707         req->in.args[0].value = &inarg;
1708         if (in_size) {
1709                 req->in.numargs++;
1710                 req->in.args[1].size = in_size;
1711                 req->in.argpages = 1;
1712
1713                 err = fuse_ioctl_copy_user(pages, in_iov, in_iovs, in_size,
1714                                            false);
1715                 if (err)
1716                         goto out;
1717         }
1718
1719         req->out.numargs = 2;
1720         req->out.args[0].size = sizeof(outarg);
1721         req->out.args[0].value = &outarg;
1722         req->out.args[1].size = out_size;
1723         req->out.argpages = 1;
1724         req->out.argvar = 1;
1725
1726         fuse_request_send(fc, req);
1727         err = req->out.h.error;
1728         transferred = req->out.args[1].size;
1729         fuse_put_request(fc, req);
1730         req = NULL;
1731         if (err)
1732                 goto out;
1733
1734         /* did it ask for retry? */
1735         if (outarg.flags & FUSE_IOCTL_RETRY) {
1736                 char *vaddr;
1737
1738                 /* no retry if in restricted mode */
1739                 err = -EIO;
1740                 if (!(flags & FUSE_IOCTL_UNRESTRICTED))
1741                         goto out;
1742
1743                 in_iovs = outarg.in_iovs;
1744                 out_iovs = outarg.out_iovs;
1745
1746                 /*
1747                  * Make sure the iov counts are within bounds; the separate
1748                  * checks protect against integer overflow.
1749                  */
1750                 err = -ENOMEM;
1751                 if (in_iovs > FUSE_IOCTL_MAX_IOV ||
1752                     out_iovs > FUSE_IOCTL_MAX_IOV ||
1753                     in_iovs + out_iovs > FUSE_IOCTL_MAX_IOV)
1754                         goto out;
1755
1756                 err = -EIO;
1757                 if ((in_iovs + out_iovs) * sizeof(struct iovec) != transferred)
1758                         goto out;
1759
1760                 /* okay, copy in iovs and retry */
1761                 vaddr = kmap_atomic(pages[0], KM_USER0);
1762                 memcpy(page_address(iov_page), vaddr, transferred);
1763                 kunmap_atomic(vaddr, KM_USER0);
1764
1765                 in_iov = page_address(iov_page);
1766                 out_iov = in_iov + in_iovs;
1767
1768                 goto retry;
1769         }
1770
1771         err = -EIO;
1772         if (transferred > inarg.out_size)
1773                 goto out;
1774
1775         err = fuse_ioctl_copy_user(pages, out_iov, out_iovs, transferred, true);
1776  out:
1777         if (req)
1778                 fuse_put_request(fc, req);
1779         if (iov_page)
1780                 __free_page(iov_page);
1781         while (num_pages)
1782                 __free_page(pages[--num_pages]);
1783         kfree(pages);
1784
1785         return err ? err : outarg.result;
1786 }
1787
1788 static long fuse_file_ioctl(struct file *file, unsigned int cmd,
1789                             unsigned long arg)
1790 {
1791         return fuse_file_do_ioctl(file, cmd, arg, 0);
1792 }
1793
1794 static long fuse_file_compat_ioctl(struct file *file, unsigned int cmd,
1795                                    unsigned long arg)
1796 {
1797         return fuse_file_do_ioctl(file, cmd, arg, FUSE_IOCTL_COMPAT);
1798 }
1799
1800 /*
1801  * All files which have been polled are linked into the RB tree
1802  * fuse_conn->polled_files, which is indexed by kh.  Walk the tree and
1803  * find the matching one.
1804  */
1805 static struct rb_node **fuse_find_polled_node(struct fuse_conn *fc, u64 kh,
1806                                               struct rb_node **parent_out)
1807 {
1808         struct rb_node **link = &fc->polled_files.rb_node;
1809         struct rb_node *last = NULL;
1810
1811         while (*link) {
1812                 struct fuse_file *ff;
1813
1814                 last = *link;
1815                 ff = rb_entry(last, struct fuse_file, polled_node);
1816
1817                 if (kh < ff->kh)
1818                         link = &last->rb_left;
1819                 else if (kh > ff->kh)
1820                         link = &last->rb_right;
1821                 else
1822                         return link;
1823         }
1824
1825         if (parent_out)
1826                 *parent_out = last;
1827         return link;
1828 }
1829
1830 /*
1831  * The file is about to be polled.  Make sure it's on the polled_files
1832  * RB tree.  Note that once a file has been added to the polled_files
1833  * tree, it is not removed until the file is released.  This is because
1834  * a file polled once is likely to be polled again.
1835  */
1836 static void fuse_register_polled_file(struct fuse_conn *fc,
1837                                       struct fuse_file *ff)
1838 {
1839         spin_lock(&fc->lock);
1840         if (RB_EMPTY_NODE(&ff->polled_node)) {
1841                 struct rb_node **link, *parent;
1842
1843                 link = fuse_find_polled_node(fc, ff->kh, &parent);
1844                 BUG_ON(*link);
1845                 rb_link_node(&ff->polled_node, parent, link);
1846                 rb_insert_color(&ff->polled_node, &fc->polled_files);
1847         }
1848         spin_unlock(&fc->lock);
1849 }
1850
1851 static unsigned fuse_file_poll(struct file *file, poll_table *wait)
1852 {
1853         struct inode *inode = file->f_dentry->d_inode;
1854         struct fuse_file *ff = file->private_data;
1855         struct fuse_conn *fc = get_fuse_conn(inode);
1856         struct fuse_poll_in inarg = { .fh = ff->fh, .kh = ff->kh };
1857         struct fuse_poll_out outarg;
1858         struct fuse_req *req;
1859         int err;
1860
1861         if (fc->no_poll)
1862                 return DEFAULT_POLLMASK;
1863
1864         poll_wait(file, &ff->poll_wait, wait);
1865
1866         /*
1867          * Ask for notification iff there's someone waiting for it.
1868          * The server may ignore the flag and always notify.
1869          */
1870         if (waitqueue_active(&ff->poll_wait)) {
1871                 inarg.flags |= FUSE_POLL_SCHEDULE_NOTIFY;
1872                 fuse_register_polled_file(fc, ff);
1873         }
1874
1875         req = fuse_get_req(fc);
1876         if (IS_ERR(req))
1877                 return PTR_ERR(req);
1878
1879         req->in.h.opcode = FUSE_POLL;
1880         req->in.h.nodeid = get_node_id(inode);
1881         req->in.numargs = 1;
1882         req->in.args[0].size = sizeof(inarg);
1883         req->in.args[0].value = &inarg;
1884         req->out.numargs = 1;
1885         req->out.args[0].size = sizeof(outarg);
1886         req->out.args[0].value = &outarg;
1887         fuse_request_send(fc, req);
1888         err = req->out.h.error;
1889         fuse_put_request(fc, req);
1890
1891         if (!err)
1892                 return outarg.revents;
1893         if (err == -ENOSYS) {
1894                 fc->no_poll = 1;
1895                 return DEFAULT_POLLMASK;
1896         }
1897         return POLLERR;
1898 }
1899
1900 /*
1901  * This is called from fuse_handle_notify() on FUSE_NOTIFY_POLL and
1902  * wakes up the poll waiters.
1903  */
1904 int fuse_notify_poll_wakeup(struct fuse_conn *fc,
1905                             struct fuse_notify_poll_wakeup_out *outarg)
1906 {
1907         u64 kh = outarg->kh;
1908         struct rb_node **link;
1909
1910         spin_lock(&fc->lock);
1911
1912         link = fuse_find_polled_node(fc, kh, NULL);
1913         if (*link) {
1914                 struct fuse_file *ff;
1915
1916                 ff = rb_entry(*link, struct fuse_file, polled_node);
1917                 wake_up_interruptible_sync(&ff->poll_wait);
1918         }
1919
1920         spin_unlock(&fc->lock);
1921         return 0;
1922 }
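/*
 * For illustration only (not part of the original source): a minimal sketch
 * of the server side of the poll protocol, assuming the libfuse low-level
 * API of the same era (fuse_reply_poll(), fuse_lowlevel_notify_poll() and
 * fuse_pollhandle_destroy()).  struct my_file, my_file_from_ino(),
 * my_file_readable() and the saved_ph field are hypothetical.
 *
 *	static void xmp_poll(fuse_req_t req, fuse_ino_t ino,
 *			     struct fuse_file_info *fi,
 *			     struct fuse_pollhandle *ph)
 *	{
 *		struct my_file *f = my_file_from_ino(ino);
 *		unsigned revents = my_file_readable(f) ? POLLIN : 0;
 *
 *		// ph is non-NULL only when FUSE_POLL_SCHEDULE_NOTIFY was set
 *		// by fuse_file_poll() above; keep the newest handle around.
 *		if (ph) {
 *			if (f->saved_ph)
 *				fuse_pollhandle_destroy(f->saved_ph);
 *			f->saved_ph = ph;
 *		}
 *		fuse_reply_poll(req, revents);
 *	}
 *
 *	// When the file becomes readable, send FUSE_NOTIFY_POLL; the kernel
 *	// handles it in fuse_notify_poll_wakeup() above and wakes the waiters.
 *	static void my_file_became_readable(struct my_file *f)
 *	{
 *		if (f->saved_ph)
 *			fuse_lowlevel_notify_poll(f->saved_ph);
 *	}
 */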
1923
1924 static const struct file_operations fuse_file_operations = {
1925         .llseek         = fuse_file_llseek,
1926         .read           = do_sync_read,
1927         .aio_read       = fuse_file_aio_read,
1928         .write          = do_sync_write,
1929         .aio_write      = fuse_file_aio_write,
1930         .mmap           = fuse_file_mmap,
1931         .open           = fuse_open,
1932         .flush          = fuse_flush,
1933         .release        = fuse_release,
1934         .fsync          = fuse_fsync,
1935         .lock           = fuse_file_lock,
1936         .flock          = fuse_file_flock,
1937         .splice_read    = generic_file_splice_read,
1938         .unlocked_ioctl = fuse_file_ioctl,
1939         .compat_ioctl   = fuse_file_compat_ioctl,
1940         .poll           = fuse_file_poll,
1941 };
1942
1943 static const struct file_operations fuse_direct_io_file_operations = {
1944         .llseek         = fuse_file_llseek,
1945         .read           = fuse_direct_read,
1946         .write          = fuse_direct_write,
1947         .mmap           = fuse_direct_mmap,
1948         .open           = fuse_open,
1949         .flush          = fuse_flush,
1950         .release        = fuse_release,
1951         .fsync          = fuse_fsync,
1952         .lock           = fuse_file_lock,
1953         .flock          = fuse_file_flock,
1954         .unlocked_ioctl = fuse_file_ioctl,
1955         .compat_ioctl   = fuse_file_compat_ioctl,
1956         .poll           = fuse_file_poll,
1957         /* no splice_read */
1958 };
1959
1960 static const struct address_space_operations fuse_file_aops  = {
1961         .readpage       = fuse_readpage,
1962         .writepage      = fuse_writepage,
1963         .launder_page   = fuse_launder_page,
1964         .write_begin    = fuse_write_begin,
1965         .write_end      = fuse_write_end,
1966         .readpages      = fuse_readpages,
1967         .set_page_dirty = __set_page_dirty_nobuffers,
1968         .bmap           = fuse_bmap,
1969 };
1970
1971 void fuse_init_file_inode(struct inode *inode)
1972 {
1973         inode->i_fop = &fuse_file_operations;
1974         inode->i_data.a_ops = &fuse_file_aops;
1975 }