firefly-linux-kernel-4.4.55.git: fs/fuse/file.c
1 /*
2   FUSE: Filesystem in Userspace
3   Copyright (C) 2001-2008  Miklos Szeredi <miklos@szeredi.hu>
4
5   This program can be distributed under the terms of the GNU GPL.
6   See the file COPYING.
7 */
8
9 #include "fuse_i.h"
10
11 #include <linux/pagemap.h>
12 #include <linux/slab.h>
13 #include <linux/kernel.h>
14 #include <linux/sched.h>
15 #include <linux/module.h>
16 #include <linux/compat.h>
17
18 static const struct file_operations fuse_direct_io_file_operations;
19
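/*
 * Build and synchronously send a FUSE_OPEN or FUSE_OPENDIR request for
 * the given nodeid.  O_CREAT, O_EXCL and O_NOCTTY are stripped from the
 * flags, and O_TRUNC is cleared unless the server supports atomic
 * O_TRUNC.  The server's reply is copied into @outargp.
 */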
20 static int fuse_send_open(struct fuse_conn *fc, u64 nodeid, struct file *file,
21                           int opcode, struct fuse_open_out *outargp)
22 {
23         struct fuse_open_in inarg;
24         struct fuse_req *req;
25         int err;
26
27         req = fuse_get_req(fc);
28         if (IS_ERR(req))
29                 return PTR_ERR(req);
30
31         memset(&inarg, 0, sizeof(inarg));
32         inarg.flags = file->f_flags & ~(O_CREAT | O_EXCL | O_NOCTTY);
33         if (!fc->atomic_o_trunc)
34                 inarg.flags &= ~O_TRUNC;
35         req->in.h.opcode = opcode;
36         req->in.h.nodeid = nodeid;
37         req->in.numargs = 1;
38         req->in.args[0].size = sizeof(inarg);
39         req->in.args[0].value = &inarg;
40         req->out.numargs = 1;
41         req->out.args[0].size = sizeof(*outargp);
42         req->out.args[0].value = outargp;
43         fuse_request_send(fc, req);
44         err = req->out.h.error;
45         fuse_put_request(fc, req);
46
47         return err;
48 }
49
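/*
 * Allocate a fuse_file together with the reserved request that will
 * later carry its RELEASE message, and give it a connection-unique
 * kernel handle (ff->kh).
 */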
50 struct fuse_file *fuse_file_alloc(struct fuse_conn *fc)
51 {
52         struct fuse_file *ff;
53
54         ff = kmalloc(sizeof(struct fuse_file), GFP_KERNEL);
55         if (unlikely(!ff))
56                 return NULL;
57
58         ff->fc = fc;
59         ff->reserved_req = fuse_request_alloc();
60         if (unlikely(!ff->reserved_req)) {
61                 kfree(ff);
62                 return NULL;
63         }
64
65         INIT_LIST_HEAD(&ff->write_entry);
66         atomic_set(&ff->count, 0);
67         RB_CLEAR_NODE(&ff->polled_node);
68         init_waitqueue_head(&ff->poll_wait);
69
70         spin_lock(&fc->lock);
71         ff->kh = ++fc->khctr;
72         spin_unlock(&fc->lock);
73
74         return ff;
75 }
76
77 void fuse_file_free(struct fuse_file *ff)
78 {
79         fuse_request_free(ff->reserved_req);
80         kfree(ff);
81 }
82
83 struct fuse_file *fuse_file_get(struct fuse_file *ff)
84 {
85         atomic_inc(&ff->count);
86         return ff;
87 }
88
89 static void fuse_release_end(struct fuse_conn *fc, struct fuse_req *req)
90 {
91         path_put(&req->misc.release.path);
92 }
93
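/*
 * Drop a reference to the fuse_file.  When the last reference goes away,
 * the reserved request (filled in by fuse_prepare_release()) is sent in
 * the background to deliver the RELEASE, and the fuse_file is freed.
 */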
94 static void fuse_file_put(struct fuse_file *ff)
95 {
96         if (atomic_dec_and_test(&ff->count)) {
97                 struct fuse_req *req = ff->reserved_req;
98
99                 req->end = fuse_release_end;
100                 fuse_request_send_background(ff->fc, req);
101                 kfree(ff);
102         }
103 }
104
105 int fuse_do_open(struct fuse_conn *fc, u64 nodeid, struct file *file,
106                  bool isdir)
107 {
108         struct fuse_open_out outarg;
109         struct fuse_file *ff;
110         int err;
111         int opcode = isdir ? FUSE_OPENDIR : FUSE_OPEN;
112
113         ff = fuse_file_alloc(fc);
114         if (!ff)
115                 return -ENOMEM;
116
117         err = fuse_send_open(fc, nodeid, file, opcode, &outarg);
118         if (err) {
119                 fuse_file_free(ff);
120                 return err;
121         }
122
123         if (isdir)
124                 outarg.open_flags &= ~FOPEN_DIRECT_IO;
125
126         ff->fh = outarg.fh;
127         ff->nodeid = nodeid;
128         ff->open_flags = outarg.open_flags;
129         file->private_data = fuse_file_get(ff);
130
131         return 0;
132 }
133 EXPORT_SYMBOL_GPL(fuse_do_open);
134
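/*
 * Apply the open flags returned by the server: switch to direct-I/O file
 * operations, drop cached pages unless FOPEN_KEEP_CACHE is set, mark the
 * file nonseekable, and handle atomic O_TRUNC by zeroing i_size.
 */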
135 void fuse_finish_open(struct inode *inode, struct file *file)
136 {
137         struct fuse_file *ff = file->private_data;
138         struct fuse_conn *fc = get_fuse_conn(inode);
139
140         if (ff->open_flags & FOPEN_DIRECT_IO)
141                 file->f_op = &fuse_direct_io_file_operations;
142         if (!(ff->open_flags & FOPEN_KEEP_CACHE))
143                 invalidate_inode_pages2(inode->i_mapping);
144         if (ff->open_flags & FOPEN_NONSEEKABLE)
145                 nonseekable_open(inode, file);
146         if (fc->atomic_o_trunc && (file->f_flags & O_TRUNC)) {
147                 struct fuse_inode *fi = get_fuse_inode(inode);
148
149                 spin_lock(&fc->lock);
150                 fi->attr_version = ++fc->attr_version;
151                 i_size_write(inode, 0);
152                 spin_unlock(&fc->lock);
153                 fuse_invalidate_attr(inode);
154         }
155 }
156
157 int fuse_open_common(struct inode *inode, struct file *file, bool isdir)
158 {
159         struct fuse_conn *fc = get_fuse_conn(inode);
160         int err;
161
162         /* VFS checks this, but only _after_ ->open() */
163         if (file->f_flags & O_DIRECT)
164                 return -EINVAL;
165
166         err = generic_file_open(inode, file);
167         if (err)
168                 return err;
169
170         err = fuse_do_open(fc, get_node_id(inode), file, isdir);
171         if (err)
172                 return err;
173
174         fuse_finish_open(inode, file);
175
176         return 0;
177 }
178
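/*
 * Detach the fuse_file from writeback and poll bookkeeping and fill its
 * reserved request with the RELEASE/RELEASEDIR arguments.  The request
 * is not sent here; that happens when the last reference is dropped.
 */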
179 static void fuse_prepare_release(struct fuse_file *ff, int flags, int opcode)
180 {
181         struct fuse_conn *fc = ff->fc;
182         struct fuse_req *req = ff->reserved_req;
183         struct fuse_release_in *inarg = &req->misc.release.in;
184
185         spin_lock(&fc->lock);
186         list_del(&ff->write_entry);
187         if (!RB_EMPTY_NODE(&ff->polled_node))
188                 rb_erase(&ff->polled_node, &fc->polled_files);
189         spin_unlock(&fc->lock);
190
191         wake_up_interruptible_sync(&ff->poll_wait);
192
193         inarg->fh = ff->fh;
194         inarg->flags = flags;
195         req->in.h.opcode = opcode;
196         req->in.h.nodeid = ff->nodeid;
197         req->in.numargs = 1;
198         req->in.args[0].size = sizeof(struct fuse_release_in);
199         req->in.args[0].value = inarg;
200 }
201
202 void fuse_release_common(struct file *file, int opcode)
203 {
204         struct fuse_file *ff;
205         struct fuse_req *req;
206
207         ff = file->private_data;
208         if (unlikely(!ff))
209                 return;
210
211         req = ff->reserved_req;
212         fuse_prepare_release(ff, file->f_flags, opcode);
213
214         /* Hold vfsmount and dentry until release is finished */
215         path_get(&file->f_path);
216         req->misc.release.path = file->f_path;
217
218         /*
219          * Normally this will send the RELEASE request; however, if
220          * some asynchronous READ or WRITE requests are outstanding,
221          * the sending will be delayed.
222          */
223         fuse_file_put(ff);
224 }
225
226 static int fuse_open(struct inode *inode, struct file *file)
227 {
228         return fuse_open_common(inode, file, false);
229 }
230
231 static int fuse_release(struct inode *inode, struct file *file)
232 {
233         fuse_release_common(file, FUSE_RELEASE);
234
235         /* return value is ignored by VFS */
236         return 0;
237 }
238
239 void fuse_sync_release(struct fuse_file *ff, int flags)
240 {
241         WARN_ON(atomic_read(&ff->count) > 1);
242         fuse_prepare_release(ff, flags, FUSE_RELEASE);
243         ff->reserved_req->force = 1;
244         fuse_request_send(ff->fc, ff->reserved_req);
245         fuse_put_request(ff->fc, ff->reserved_req);
246         kfree(ff);
247 }
248 EXPORT_SYMBOL_GPL(fuse_sync_release);
249
250 /*
251  * Scramble the ID space with XTEA, so that the value of the files_struct
252  * pointer is not exposed to userspace.
253  */
254 u64 fuse_lock_owner_id(struct fuse_conn *fc, fl_owner_t id)
255 {
256         u32 *k = fc->scramble_key;
257         u64 v = (unsigned long) id;
258         u32 v0 = v;
259         u32 v1 = v >> 32;
260         u32 sum = 0;
261         int i;
262
263         for (i = 0; i < 32; i++) {
264                 v0 += ((v1 << 4 ^ v1 >> 5) + v1) ^ (sum + k[sum & 3]);
265                 sum += 0x9E3779B9;
266                 v1 += ((v0 << 4 ^ v0 >> 5) + v0) ^ (sum + k[sum>>11 & 3]);
267         }
268
269         return (u64) v0 + ((u64) v1 << 32);
270 }
271
272 /*
273  * Check if page is under writeback
274  *
275  * This is currently done by walking the list of writepage requests
276  * for the inode, which can be pretty inefficient.
277  */
278 static bool fuse_page_is_writeback(struct inode *inode, pgoff_t index)
279 {
280         struct fuse_conn *fc = get_fuse_conn(inode);
281         struct fuse_inode *fi = get_fuse_inode(inode);
282         struct fuse_req *req;
283         bool found = false;
284
285         spin_lock(&fc->lock);
286         list_for_each_entry(req, &fi->writepages, writepages_entry) {
287                 pgoff_t curr_index;
288
289                 BUG_ON(req->inode != inode);
290                 curr_index = req->misc.write.in.offset >> PAGE_CACHE_SHIFT;
291                 if (curr_index == index) {
292                         found = true;
293                         break;
294                 }
295         }
296         spin_unlock(&fc->lock);
297
298         return found;
299 }
300
301 /*
302  * Wait for page writeback to be completed.
303  *
304  * Since fuse doesn't rely on the VM writeback tracking, this has to
305  * use some other means.
306  */
307 static int fuse_wait_on_page_writeback(struct inode *inode, pgoff_t index)
308 {
309         struct fuse_inode *fi = get_fuse_inode(inode);
310
311         wait_event(fi->page_waitq, !fuse_page_is_writeback(inode, index));
312         return 0;
313 }
314
315 static int fuse_flush(struct file *file, fl_owner_t id)
316 {
317         struct inode *inode = file->f_path.dentry->d_inode;
318         struct fuse_conn *fc = get_fuse_conn(inode);
319         struct fuse_file *ff = file->private_data;
320         struct fuse_req *req;
321         struct fuse_flush_in inarg;
322         int err;
323
324         if (is_bad_inode(inode))
325                 return -EIO;
326
327         if (fc->no_flush)
328                 return 0;
329
330         req = fuse_get_req_nofail(fc, file);
331         memset(&inarg, 0, sizeof(inarg));
332         inarg.fh = ff->fh;
333         inarg.lock_owner = fuse_lock_owner_id(fc, id);
334         req->in.h.opcode = FUSE_FLUSH;
335         req->in.h.nodeid = get_node_id(inode);
336         req->in.numargs = 1;
337         req->in.args[0].size = sizeof(inarg);
338         req->in.args[0].value = &inarg;
339         req->force = 1;
340         fuse_request_send(fc, req);
341         err = req->out.h.error;
342         fuse_put_request(fc, req);
343         if (err == -ENOSYS) {
344                 fc->no_flush = 1;
345                 err = 0;
346         }
347         return err;
348 }
349
350 /*
351  * Wait for all pending writepages on the inode to finish.
352  *
353  * This is currently done by blocking further writes with FUSE_NOWRITE
354  * and waiting for all sent writes to complete.
355  *
356  * This must be called under i_mutex, otherwise the FUSE_NOWRITE usage
357  * could conflict with truncation.
358  */
359 static void fuse_sync_writes(struct inode *inode)
360 {
361         fuse_set_nowrite(inode);
362         fuse_release_nowrite(inode);
363 }
364
365 int fuse_fsync_common(struct file *file, struct dentry *de, int datasync,
366                       int isdir)
367 {
368         struct inode *inode = de->d_inode;
369         struct fuse_conn *fc = get_fuse_conn(inode);
370         struct fuse_file *ff = file->private_data;
371         struct fuse_req *req;
372         struct fuse_fsync_in inarg;
373         int err;
374
375         if (is_bad_inode(inode))
376                 return -EIO;
377
378         if ((!isdir && fc->no_fsync) || (isdir && fc->no_fsyncdir))
379                 return 0;
380
381         /*
382          * Start writeback against all dirty pages of the inode, then
383          * wait for all outstanding writes, before sending the FSYNC
384          * request.
385          */
386         err = write_inode_now(inode, 0);
387         if (err)
388                 return err;
389
390         fuse_sync_writes(inode);
391
392         req = fuse_get_req(fc);
393         if (IS_ERR(req))
394                 return PTR_ERR(req);
395
396         memset(&inarg, 0, sizeof(inarg));
397         inarg.fh = ff->fh;
398         inarg.fsync_flags = datasync ? 1 : 0;
399         req->in.h.opcode = isdir ? FUSE_FSYNCDIR : FUSE_FSYNC;
400         req->in.h.nodeid = get_node_id(inode);
401         req->in.numargs = 1;
402         req->in.args[0].size = sizeof(inarg);
403         req->in.args[0].value = &inarg;
404         fuse_request_send(fc, req);
405         err = req->out.h.error;
406         fuse_put_request(fc, req);
407         if (err == -ENOSYS) {
408                 if (isdir)
409                         fc->no_fsyncdir = 1;
410                 else
411                         fc->no_fsync = 1;
412                 err = 0;
413         }
414         return err;
415 }
416
417 static int fuse_fsync(struct file *file, struct dentry *de, int datasync)
418 {
419         return fuse_fsync_common(file, de, datasync, 0);
420 }
421
422 void fuse_read_fill(struct fuse_req *req, struct file *file, loff_t pos,
423                     size_t count, int opcode)
424 {
425         struct fuse_read_in *inarg = &req->misc.read.in;
426         struct fuse_file *ff = file->private_data;
427
428         inarg->fh = ff->fh;
429         inarg->offset = pos;
430         inarg->size = count;
431         inarg->flags = file->f_flags;
432         req->in.h.opcode = opcode;
433         req->in.h.nodeid = ff->nodeid;
434         req->in.numargs = 1;
435         req->in.args[0].size = sizeof(struct fuse_read_in);
436         req->in.args[0].value = inarg;
437         req->out.argvar = 1;
438         req->out.numargs = 1;
439         req->out.args[0].size = count;
440 }
441
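/*
 * Synchronously send a FUSE_READ for @count bytes at @pos, optionally
 * tagged with the scrambled lock owner, and return the number of bytes
 * the server actually returned.
 */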
442 static size_t fuse_send_read(struct fuse_req *req, struct file *file,
443                              loff_t pos, size_t count, fl_owner_t owner)
444 {
445         struct fuse_file *ff = file->private_data;
446         struct fuse_conn *fc = ff->fc;
447
448         fuse_read_fill(req, file, pos, count, FUSE_READ);
449         if (owner != NULL) {
450                 struct fuse_read_in *inarg = &req->misc.read.in;
451
452                 inarg->read_flags |= FUSE_READ_LOCKOWNER;
453                 inarg->lock_owner = fuse_lock_owner_id(fc, owner);
454         }
455         fuse_request_send(fc, req);
456         return req->out.args[0].size;
457 }
458
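/*
 * A short read means the file is smaller than previously thought.
 * Shrink the cached i_size, but only if the attributes haven't changed
 * since the read was issued (attr_ver check).
 */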
459 static void fuse_read_update_size(struct inode *inode, loff_t size,
460                                   u64 attr_ver)
461 {
462         struct fuse_conn *fc = get_fuse_conn(inode);
463         struct fuse_inode *fi = get_fuse_inode(inode);
464
465         spin_lock(&fc->lock);
466         if (attr_ver == fi->attr_version && size < inode->i_size) {
467                 fi->attr_version = ++fc->attr_version;
468                 i_size_write(inode, size);
469         }
470         spin_unlock(&fc->lock);
471 }
472
473 static int fuse_readpage(struct file *file, struct page *page)
474 {
475         struct inode *inode = page->mapping->host;
476         struct fuse_conn *fc = get_fuse_conn(inode);
477         struct fuse_req *req;
478         size_t num_read;
479         loff_t pos = page_offset(page);
480         size_t count = PAGE_CACHE_SIZE;
481         u64 attr_ver;
482         int err;
483
484         err = -EIO;
485         if (is_bad_inode(inode))
486                 goto out;
487
488         /*
489          * Page writeback can extend beyond the lifetime of the
490          * page-cache page, so make sure we read a properly synced
491          * page.
492          */
493         fuse_wait_on_page_writeback(inode, page->index);
494
495         req = fuse_get_req(fc);
496         err = PTR_ERR(req);
497         if (IS_ERR(req))
498                 goto out;
499
500         attr_ver = fuse_get_attr_version(fc);
501
502         req->out.page_zeroing = 1;
503         req->out.argpages = 1;
504         req->num_pages = 1;
505         req->pages[0] = page;
506         num_read = fuse_send_read(req, file, pos, count, NULL);
507         err = req->out.h.error;
508         fuse_put_request(fc, req);
509
510         if (!err) {
511                 /*
512                  * Short read means EOF.  If file size is larger, truncate it
513                  */
514                 if (num_read < count)
515                         fuse_read_update_size(inode, pos + num_read, attr_ver);
516
517                 SetPageUptodate(page);
518         }
519
520         fuse_invalidate_attr(inode); /* atime changed */
521  out:
522         unlock_page(page);
523         return err;
524 }
525
526 static void fuse_readpages_end(struct fuse_conn *fc, struct fuse_req *req)
527 {
528         int i;
529         size_t count = req->misc.read.in.size;
530         size_t num_read = req->out.args[0].size;
531         struct inode *inode = req->pages[0]->mapping->host;
532
533         /*
534          * Short read means EOF.  If file size is larger, truncate it
535          */
536         if (!req->out.h.error && num_read < count) {
537                 loff_t pos = page_offset(req->pages[0]) + num_read;
538                 fuse_read_update_size(inode, pos, req->misc.read.attr_ver);
539         }
540
541         fuse_invalidate_attr(inode); /* atime changed */
542
543         for (i = 0; i < req->num_pages; i++) {
544                 struct page *page = req->pages[i];
545                 if (!req->out.h.error)
546                         SetPageUptodate(page);
547                 else
548                         SetPageError(page);
549                 unlock_page(page);
550         }
551         if (req->ff)
552                 fuse_file_put(req->ff);
553 }
554
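/*
 * Send one readpages request covering req->num_pages contiguous pages.
 * With async_read the request completes in the background through
 * fuse_readpages_end(); otherwise it is sent synchronously and finished
 * inline.
 */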
555 static void fuse_send_readpages(struct fuse_req *req, struct file *file)
556 {
557         struct fuse_file *ff = file->private_data;
558         struct fuse_conn *fc = ff->fc;
559         loff_t pos = page_offset(req->pages[0]);
560         size_t count = req->num_pages << PAGE_CACHE_SHIFT;
561
562         req->out.argpages = 1;
563         req->out.page_zeroing = 1;
564         fuse_read_fill(req, file, pos, count, FUSE_READ);
565         req->misc.read.attr_ver = fuse_get_attr_version(fc);
566         if (fc->async_read) {
567                 req->ff = fuse_file_get(ff);
568                 req->end = fuse_readpages_end;
569                 fuse_request_send_background(fc, req);
570         } else {
571                 fuse_request_send(fc, req);
572                 fuse_readpages_end(fc, req);
573                 fuse_put_request(fc, req);
574         }
575 }
576
577 struct fuse_fill_data {
578         struct fuse_req *req;
579         struct file *file;
580         struct inode *inode;
581 };
582
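/*
 * read_cache_pages() callback: collect contiguous pages into the current
 * request, sending it and starting a new one when it is full, would
 * exceed fc->max_read, or the next page is not contiguous.
 */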
583 static int fuse_readpages_fill(void *_data, struct page *page)
584 {
585         struct fuse_fill_data *data = _data;
586         struct fuse_req *req = data->req;
587         struct inode *inode = data->inode;
588         struct fuse_conn *fc = get_fuse_conn(inode);
589
590         fuse_wait_on_page_writeback(inode, page->index);
591
592         if (req->num_pages &&
593             (req->num_pages == FUSE_MAX_PAGES_PER_REQ ||
594              (req->num_pages + 1) * PAGE_CACHE_SIZE > fc->max_read ||
595              req->pages[req->num_pages - 1]->index + 1 != page->index)) {
596                 fuse_send_readpages(req, data->file);
597                 data->req = req = fuse_get_req(fc);
598                 if (IS_ERR(req)) {
599                         unlock_page(page);
600                         return PTR_ERR(req);
601                 }
602         }
603         req->pages[req->num_pages] = page;
604         req->num_pages++;
605         return 0;
606 }
607
608 static int fuse_readpages(struct file *file, struct address_space *mapping,
609                           struct list_head *pages, unsigned nr_pages)
610 {
611         struct inode *inode = mapping->host;
612         struct fuse_conn *fc = get_fuse_conn(inode);
613         struct fuse_fill_data data;
614         int err;
615
616         err = -EIO;
617         if (is_bad_inode(inode))
618                 goto out;
619
620         data.file = file;
621         data.inode = inode;
622         data.req = fuse_get_req(fc);
623         err = PTR_ERR(data.req);
624         if (IS_ERR(data.req))
625                 goto out;
626
627         err = read_cache_pages(mapping, pages, fuse_readpages_fill, &data);
628         if (!err) {
629                 if (data.req->num_pages)
630                         fuse_send_readpages(data.req, file);
631                 else
632                         fuse_put_request(fc, data.req);
633         }
634 out:
635         return err;
636 }
637
638 static ssize_t fuse_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
639                                   unsigned long nr_segs, loff_t pos)
640 {
641         struct inode *inode = iocb->ki_filp->f_mapping->host;
642
643         if (pos + iov_length(iov, nr_segs) > i_size_read(inode)) {
644                 int err;
645                 /*
646                  * If trying to read past EOF, make sure the i_size
647                  * attribute is up-to-date.
648                  */
649                 err = fuse_update_attributes(inode, NULL, iocb->ki_filp, NULL);
650                 if (err)
651                         return err;
652         }
653
654         return generic_file_aio_read(iocb, iov, nr_segs, pos);
655 }
656
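/*
 * Fill in the header and in/out arguments of a FUSE_WRITE request for
 * @count bytes at @pos.  Servers older than protocol minor 9 get the
 * shorter compat fuse_write_in layout.
 */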
657 static void fuse_write_fill(struct fuse_req *req, struct fuse_file *ff,
658                             loff_t pos, size_t count)
659 {
660         struct fuse_write_in *inarg = &req->misc.write.in;
661         struct fuse_write_out *outarg = &req->misc.write.out;
662
663         inarg->fh = ff->fh;
664         inarg->offset = pos;
665         inarg->size = count;
666         req->in.h.opcode = FUSE_WRITE;
667         req->in.h.nodeid = ff->nodeid;
668         req->in.numargs = 2;
669         if (ff->fc->minor < 9)
670                 req->in.args[0].size = FUSE_COMPAT_WRITE_IN_SIZE;
671         else
672                 req->in.args[0].size = sizeof(struct fuse_write_in);
673         req->in.args[0].value = inarg;
674         req->in.args[1].size = count;
675         req->out.numargs = 1;
676         req->out.args[0].size = sizeof(struct fuse_write_out);
677         req->out.args[0].value = outarg;
678 }
679
680 static size_t fuse_send_write(struct fuse_req *req, struct file *file,
681                               loff_t pos, size_t count, fl_owner_t owner)
682 {
683         struct fuse_file *ff = file->private_data;
684         struct fuse_conn *fc = ff->fc;
685         struct fuse_write_in *inarg = &req->misc.write.in;
686
687         fuse_write_fill(req, ff, pos, count);
688         inarg->flags = file->f_flags;
689         if (owner != NULL) {
690                 inarg->write_flags |= FUSE_WRITE_LOCKOWNER;
691                 inarg->lock_owner = fuse_lock_owner_id(fc, owner);
692         }
693         fuse_request_send(fc, req);
694         return req->misc.write.out.size;
695 }
696
697 static int fuse_write_begin(struct file *file, struct address_space *mapping,
698                         loff_t pos, unsigned len, unsigned flags,
699                         struct page **pagep, void **fsdata)
700 {
701         pgoff_t index = pos >> PAGE_CACHE_SHIFT;
702
703         *pagep = grab_cache_page_write_begin(mapping, index, flags);
704         if (!*pagep)
705                 return -ENOMEM;
706         return 0;
707 }
708
709 static void fuse_write_update_size(struct inode *inode, loff_t pos)
710 {
711         struct fuse_conn *fc = get_fuse_conn(inode);
712         struct fuse_inode *fi = get_fuse_inode(inode);
713
714         spin_lock(&fc->lock);
715         fi->attr_version = ++fc->attr_version;
716         if (pos > inode->i_size)
717                 i_size_write(inode, pos);
718         spin_unlock(&fc->lock);
719 }
720
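/*
 * Write a single page-cache page synchronously (the ->write_end path):
 * wait for any conflicting writepage request, send the data, and grow
 * i_size if the write extended the file.
 */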
721 static int fuse_buffered_write(struct file *file, struct inode *inode,
722                                loff_t pos, unsigned count, struct page *page)
723 {
724         int err;
725         size_t nres;
726         struct fuse_conn *fc = get_fuse_conn(inode);
727         unsigned offset = pos & (PAGE_CACHE_SIZE - 1);
728         struct fuse_req *req;
729
730         if (is_bad_inode(inode))
731                 return -EIO;
732
733         /*
734          * Make sure writepages on the same page are not mixed up with
735          * plain writes.
736          */
737         fuse_wait_on_page_writeback(inode, page->index);
738
739         req = fuse_get_req(fc);
740         if (IS_ERR(req))
741                 return PTR_ERR(req);
742
743         req->in.argpages = 1;
744         req->num_pages = 1;
745         req->pages[0] = page;
746         req->page_offset = offset;
747         nres = fuse_send_write(req, file, pos, count, NULL);
748         err = req->out.h.error;
749         fuse_put_request(fc, req);
750         if (!err && !nres)
751                 err = -EIO;
752         if (!err) {
753                 pos += nres;
754                 fuse_write_update_size(inode, pos);
755                 if (count == PAGE_CACHE_SIZE)
756                         SetPageUptodate(page);
757         }
758         fuse_invalidate_attr(inode);
759         return err ? err : nres;
760 }
761
762 static int fuse_write_end(struct file *file, struct address_space *mapping,
763                         loff_t pos, unsigned len, unsigned copied,
764                         struct page *page, void *fsdata)
765 {
766         struct inode *inode = mapping->host;
767         int res = 0;
768
769         if (copied)
770                 res = fuse_buffered_write(file, inode, pos, copied, page);
771
772         unlock_page(page);
773         page_cache_release(page);
774         return res;
775 }
776
777 static size_t fuse_send_write_pages(struct fuse_req *req, struct file *file,
778                                     struct inode *inode, loff_t pos,
779                                     size_t count)
780 {
781         size_t res;
782         unsigned offset;
783         unsigned i;
784
785         for (i = 0; i < req->num_pages; i++)
786                 fuse_wait_on_page_writeback(inode, req->pages[i]->index);
787
788         res = fuse_send_write(req, file, pos, count, NULL);
789
790         offset = req->page_offset;
791         count = res;
792         for (i = 0; i < req->num_pages; i++) {
793                 struct page *page = req->pages[i];
794
795                 if (!req->out.h.error && !offset && count >= PAGE_CACHE_SIZE)
796                         SetPageUptodate(page);
797
798                 if (count > PAGE_CACHE_SIZE - offset)
799                         count -= PAGE_CACHE_SIZE - offset;
800                 else
801                         count = 0;
802                 offset = 0;
803
804                 unlock_page(page);
805                 page_cache_release(page);
806         }
807
808         return res;
809 }
810
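/*
 * Copy data from the iovec into freshly grabbed page-cache pages and
 * attach them to @req.  Stops at the per-request page limit, at
 * fc->max_write, or after a single page if the server doesn't support
 * big_writes.  Returns the number of bytes copied, or an error if none
 * were.
 */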
811 static ssize_t fuse_fill_write_pages(struct fuse_req *req,
812                                struct address_space *mapping,
813                                struct iov_iter *ii, loff_t pos)
814 {
815         struct fuse_conn *fc = get_fuse_conn(mapping->host);
816         unsigned offset = pos & (PAGE_CACHE_SIZE - 1);
817         size_t count = 0;
818         int err;
819
820         req->in.argpages = 1;
821         req->page_offset = offset;
822
823         do {
824                 size_t tmp;
825                 struct page *page;
826                 pgoff_t index = pos >> PAGE_CACHE_SHIFT;
827                 size_t bytes = min_t(size_t, PAGE_CACHE_SIZE - offset,
828                                      iov_iter_count(ii));
829
830                 bytes = min_t(size_t, bytes, fc->max_write - count);
831
832  again:
833                 err = -EFAULT;
834                 if (iov_iter_fault_in_readable(ii, bytes))
835                         break;
836
837                 err = -ENOMEM;
838                 page = grab_cache_page_write_begin(mapping, index, 0);
839                 if (!page)
840                         break;
841
842                 if (mapping_writably_mapped(mapping))
843                         flush_dcache_page(page);
844
845                 pagefault_disable();
846                 tmp = iov_iter_copy_from_user_atomic(page, ii, offset, bytes);
847                 pagefault_enable();
848                 flush_dcache_page(page);
849
850                 if (!tmp) {
851                         unlock_page(page);
852                         page_cache_release(page);
853                         bytes = min(bytes, iov_iter_single_seg_count(ii));
854                         goto again;
855                 }
856
857                 err = 0;
858                 req->pages[req->num_pages] = page;
859                 req->num_pages++;
860
861                 iov_iter_advance(ii, tmp);
862                 count += tmp;
863                 pos += tmp;
864                 offset += tmp;
865                 if (offset == PAGE_CACHE_SIZE)
866                         offset = 0;
867
868                 if (!fc->big_writes)
869                         break;
870         } while (iov_iter_count(ii) && count < fc->max_write &&
871                  req->num_pages < FUSE_MAX_PAGES_PER_REQ && offset == 0);
872
873         return count > 0 ? count : err;
874 }
875
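/*
 * Main buffered write loop: repeatedly fill a request with as many pages
 * as allowed, send it, and advance until the iovec is drained or a short
 * write or error terminates the loop.
 */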
876 static ssize_t fuse_perform_write(struct file *file,
877                                   struct address_space *mapping,
878                                   struct iov_iter *ii, loff_t pos)
879 {
880         struct inode *inode = mapping->host;
881         struct fuse_conn *fc = get_fuse_conn(inode);
882         int err = 0;
883         ssize_t res = 0;
884
885         if (is_bad_inode(inode))
886                 return -EIO;
887
888         do {
889                 struct fuse_req *req;
890                 ssize_t count;
891
892                 req = fuse_get_req(fc);
893                 if (IS_ERR(req)) {
894                         err = PTR_ERR(req);
895                         break;
896                 }
897
898                 count = fuse_fill_write_pages(req, mapping, ii, pos);
899                 if (count <= 0) {
900                         err = count;
901                 } else {
902                         size_t num_written;
903
904                         num_written = fuse_send_write_pages(req, file, inode,
905                                                             pos, count);
906                         err = req->out.h.error;
907                         if (!err) {
908                                 res += num_written;
909                                 pos += num_written;
910
911                                 /* break out of the loop on short write */
912                                 if (num_written != count)
913                                         err = -EIO;
914                         }
915                 }
916                 fuse_put_request(fc, req);
917         } while (!err && iov_iter_count(ii));
918
919         if (res > 0)
920                 fuse_write_update_size(inode, pos);
921
922         fuse_invalidate_attr(inode);
923
924         return res > 0 ? res : err;
925 }
926
927 static ssize_t fuse_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
928                                    unsigned long nr_segs, loff_t pos)
929 {
930         struct file *file = iocb->ki_filp;
931         struct address_space *mapping = file->f_mapping;
932         size_t count = 0;
933         ssize_t written = 0;
934         struct inode *inode = mapping->host;
935         ssize_t err;
936         struct iov_iter i;
937
938         WARN_ON(iocb->ki_pos != pos);
939
940         err = generic_segment_checks(iov, &nr_segs, &count, VERIFY_READ);
941         if (err)
942                 return err;
943
944         mutex_lock(&inode->i_mutex);
945         vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
946
947         /* We can write back this queue in page reclaim */
948         current->backing_dev_info = mapping->backing_dev_info;
949
950         err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
951         if (err)
952                 goto out;
953
954         if (count == 0)
955                 goto out;
956
957         err = file_remove_suid(file);
958         if (err)
959                 goto out;
960
961         file_update_time(file);
962
963         iov_iter_init(&i, iov, nr_segs, count, 0);
964         written = fuse_perform_write(file, mapping, &i, pos);
965         if (written >= 0)
966                 iocb->ki_pos = pos + written;
967
968 out:
969         current->backing_dev_info = NULL;
970         mutex_unlock(&inode->i_mutex);
971
972         return written ? written : err;
973 }
974
975 static void fuse_release_user_pages(struct fuse_req *req, int write)
976 {
977         unsigned i;
978
979         for (i = 0; i < req->num_pages; i++) {
980                 struct page *page = req->pages[i];
981                 if (write)
982                         set_page_dirty_lock(page);
983                 put_page(page);
984         }
985 }
986
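/*
 * Pin the caller's buffer for direct I/O.  Kernel-space buffers
 * (KERNEL_DS) are passed through directly; user buffers are pinned with
 * get_user_pages() and *nbytesp is trimmed to what was actually pinned.
 */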
987 static int fuse_get_user_pages(struct fuse_req *req, const char __user *buf,
988                                size_t *nbytesp, int write)
989 {
990         size_t nbytes = *nbytesp;
991         unsigned long user_addr = (unsigned long) buf;
992         unsigned offset = user_addr & ~PAGE_MASK;
993         int npages;
994
995         /* Special case for kernel I/O: can copy directly into the buffer */
996         if (segment_eq(get_fs(), KERNEL_DS)) {
997                 if (write)
998                         req->in.args[1].value = (void *) user_addr;
999                 else
1000                         req->out.args[0].value = (void *) user_addr;
1001
1002                 return 0;
1003         }
1004
1005         nbytes = min_t(size_t, nbytes, FUSE_MAX_PAGES_PER_REQ << PAGE_SHIFT);
1006         npages = (nbytes + offset + PAGE_SIZE - 1) >> PAGE_SHIFT;
1007         npages = clamp(npages, 1, FUSE_MAX_PAGES_PER_REQ);
1008         down_read(&current->mm->mmap_sem);
1009         npages = get_user_pages(current, current->mm, user_addr, npages, !write,
1010                                 0, req->pages, NULL);
1011         up_read(&current->mm->mmap_sem);
1012         if (npages < 0)
1013                 return npages;
1014
1015         req->num_pages = npages;
1016         req->page_offset = offset;
1017
1018         if (write)
1019                 req->in.argpages = 1;
1020         else
1021                 req->out.argpages = 1;
1022
1023         nbytes = (req->num_pages << PAGE_SHIFT) - req->page_offset;
1024         *nbytesp = min(*nbytesp, nbytes);
1025
1026         return 0;
1027 }
1028
1029 ssize_t fuse_direct_io(struct file *file, const char __user *buf,
1030                        size_t count, loff_t *ppos, int write)
1031 {
1032         struct fuse_file *ff = file->private_data;
1033         struct fuse_conn *fc = ff->fc;
1034         size_t nmax = write ? fc->max_write : fc->max_read;
1035         loff_t pos = *ppos;
1036         ssize_t res = 0;
1037         struct fuse_req *req;
1038
1039         req = fuse_get_req(fc);
1040         if (IS_ERR(req))
1041                 return PTR_ERR(req);
1042
1043         while (count) {
1044                 size_t nres;
1045                 fl_owner_t owner = current->files;
1046                 size_t nbytes = min(count, nmax);
1047                 int err = fuse_get_user_pages(req, buf, &nbytes, write);
1048                 if (err) {
1049                         res = err;
1050                         break;
1051                 }
1052
1053                 if (write)
1054                         nres = fuse_send_write(req, file, pos, nbytes, owner);
1055                 else
1056                         nres = fuse_send_read(req, file, pos, nbytes, owner);
1057
1058                 fuse_release_user_pages(req, !write);
1059                 if (req->out.h.error) {
1060                         if (!res)
1061                                 res = req->out.h.error;
1062                         break;
1063                 } else if (nres > nbytes) {
1064                         res = -EIO;
1065                         break;
1066                 }
1067                 count -= nres;
1068                 res += nres;
1069                 pos += nres;
1070                 buf += nres;
1071                 if (nres != nbytes)
1072                         break;
1073                 if (count) {
1074                         fuse_put_request(fc, req);
1075                         req = fuse_get_req(fc);
1076                         if (IS_ERR(req))
1077                                 break;
1078                 }
1079         }
1080         if (!IS_ERR(req))
1081                 fuse_put_request(fc, req);
1082         if (res > 0)
1083                 *ppos = pos;
1084
1085         return res;
1086 }
1087 EXPORT_SYMBOL_GPL(fuse_direct_io);
1088
1089 static ssize_t fuse_direct_read(struct file *file, char __user *buf,
1090                                      size_t count, loff_t *ppos)
1091 {
1092         ssize_t res;
1093         struct inode *inode = file->f_path.dentry->d_inode;
1094
1095         if (is_bad_inode(inode))
1096                 return -EIO;
1097
1098         res = fuse_direct_io(file, buf, count, ppos, 0);
1099
1100         fuse_invalidate_attr(inode);
1101
1102         return res;
1103 }
1104
1105 static ssize_t fuse_direct_write(struct file *file, const char __user *buf,
1106                                  size_t count, loff_t *ppos)
1107 {
1108         struct inode *inode = file->f_path.dentry->d_inode;
1109         ssize_t res;
1110
1111         if (is_bad_inode(inode))
1112                 return -EIO;
1113
1114         /* Don't allow parallel writes to the same file */
1115         mutex_lock(&inode->i_mutex);
1116         res = generic_write_checks(file, ppos, &count, 0);
1117         if (!res) {
1118                 res = fuse_direct_io(file, buf, count, ppos, 1);
1119                 if (res > 0)
1120                         fuse_write_update_size(inode, *ppos);
1121         }
1122         mutex_unlock(&inode->i_mutex);
1123
1124         fuse_invalidate_attr(inode);
1125
1126         return res;
1127 }
1128
1129 static void fuse_writepage_free(struct fuse_conn *fc, struct fuse_req *req)
1130 {
1131         __free_page(req->pages[0]);
1132         fuse_file_put(req->ff);
1133 }
1134
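/*
 * Bookkeeping for a completed writepage request: drop the temporary
 * writeback accounting and wake up anyone waiting in
 * fuse_wait_on_page_writeback().
 */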
1135 static void fuse_writepage_finish(struct fuse_conn *fc, struct fuse_req *req)
1136 {
1137         struct inode *inode = req->inode;
1138         struct fuse_inode *fi = get_fuse_inode(inode);
1139         struct backing_dev_info *bdi = inode->i_mapping->backing_dev_info;
1140
1141         list_del(&req->writepages_entry);
1142         dec_bdi_stat(bdi, BDI_WRITEBACK);
1143         dec_zone_page_state(req->pages[0], NR_WRITEBACK_TEMP);
1144         bdi_writeout_inc(bdi);
1145         wake_up(&fi->page_waitq);
1146 }
1147
1148 /* Called under fc->lock, may release and reacquire it */
1149 static void fuse_send_writepage(struct fuse_conn *fc, struct fuse_req *req)
1150 __releases(&fc->lock)
1151 __acquires(&fc->lock)
1152 {
1153         struct fuse_inode *fi = get_fuse_inode(req->inode);
1154         loff_t size = i_size_read(req->inode);
1155         struct fuse_write_in *inarg = &req->misc.write.in;
1156
1157         if (!fc->connected)
1158                 goto out_free;
1159
1160         if (inarg->offset + PAGE_CACHE_SIZE <= size) {
1161                 inarg->size = PAGE_CACHE_SIZE;
1162         } else if (inarg->offset < size) {
1163                 inarg->size = size & (PAGE_CACHE_SIZE - 1);
1164         } else {
1165                 /* Got truncated off completely */
1166                 goto out_free;
1167         }
1168
1169         req->in.args[1].size = inarg->size;
1170         fi->writectr++;
1171         fuse_request_send_background_locked(fc, req);
1172         return;
1173
1174  out_free:
1175         fuse_writepage_finish(fc, req);
1176         spin_unlock(&fc->lock);
1177         fuse_writepage_free(fc, req);
1178         fuse_put_request(fc, req);
1179         spin_lock(&fc->lock);
1180 }
1181
1182 /*
1183  * If fi->writectr is positive (no truncate or fsync going on) send
1184  * all queued writepage requests.
1185  *
1186  * Called with fc->lock held
1187  */
1188 void fuse_flush_writepages(struct inode *inode)
1189 __releases(&fc->lock)
1190 __acquires(&fc->lock)
1191 {
1192         struct fuse_conn *fc = get_fuse_conn(inode);
1193         struct fuse_inode *fi = get_fuse_inode(inode);
1194         struct fuse_req *req;
1195
1196         while (fi->writectr >= 0 && !list_empty(&fi->queued_writes)) {
1197                 req = list_entry(fi->queued_writes.next, struct fuse_req, list);
1198                 list_del_init(&req->list);
1199                 fuse_send_writepage(fc, req);
1200         }
1201 }
1202
1203 static void fuse_writepage_end(struct fuse_conn *fc, struct fuse_req *req)
1204 {
1205         struct inode *inode = req->inode;
1206         struct fuse_inode *fi = get_fuse_inode(inode);
1207
1208         mapping_set_error(inode->i_mapping, req->out.h.error);
1209         spin_lock(&fc->lock);
1210         fi->writectr--;
1211         fuse_writepage_finish(fc, req);
1212         spin_unlock(&fc->lock);
1213         fuse_writepage_free(fc, req);
1214 }
1215
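/*
 * Write out a single dirty page.  The data is copied to a temporary page
 * so writeback on the original page can be ended immediately; the copy
 * is sent as a background FUSE_WRITE (FUSE_WRITE_CACHE) once
 * fi->writectr allows it.
 */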
1216 static int fuse_writepage_locked(struct page *page)
1217 {
1218         struct address_space *mapping = page->mapping;
1219         struct inode *inode = mapping->host;
1220         struct fuse_conn *fc = get_fuse_conn(inode);
1221         struct fuse_inode *fi = get_fuse_inode(inode);
1222         struct fuse_req *req;
1223         struct fuse_file *ff;
1224         struct page *tmp_page;
1225
1226         set_page_writeback(page);
1227
1228         req = fuse_request_alloc_nofs();
1229         if (!req)
1230                 goto err;
1231
1232         tmp_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
1233         if (!tmp_page)
1234                 goto err_free;
1235
1236         spin_lock(&fc->lock);
1237         BUG_ON(list_empty(&fi->write_files));
1238         ff = list_entry(fi->write_files.next, struct fuse_file, write_entry);
1239         req->ff = fuse_file_get(ff);
1240         spin_unlock(&fc->lock);
1241
1242         fuse_write_fill(req, ff, page_offset(page), 0);
1243
1244         copy_highpage(tmp_page, page);
1245         req->misc.write.in.write_flags |= FUSE_WRITE_CACHE;
1246         req->in.argpages = 1;
1247         req->num_pages = 1;
1248         req->pages[0] = tmp_page;
1249         req->page_offset = 0;
1250         req->end = fuse_writepage_end;
1251         req->inode = inode;
1252
1253         inc_bdi_stat(mapping->backing_dev_info, BDI_WRITEBACK);
1254         inc_zone_page_state(tmp_page, NR_WRITEBACK_TEMP);
1255         end_page_writeback(page);
1256
1257         spin_lock(&fc->lock);
1258         list_add(&req->writepages_entry, &fi->writepages);
1259         list_add_tail(&req->list, &fi->queued_writes);
1260         fuse_flush_writepages(inode);
1261         spin_unlock(&fc->lock);
1262
1263         return 0;
1264
1265 err_free:
1266         fuse_request_free(req);
1267 err:
1268         end_page_writeback(page);
1269         return -ENOMEM;
1270 }
1271
1272 static int fuse_writepage(struct page *page, struct writeback_control *wbc)
1273 {
1274         int err;
1275
1276         err = fuse_writepage_locked(page);
1277         unlock_page(page);
1278
1279         return err;
1280 }
1281
1282 static int fuse_launder_page(struct page *page)
1283 {
1284         int err = 0;
1285         if (clear_page_dirty_for_io(page)) {
1286                 struct inode *inode = page->mapping->host;
1287                 err = fuse_writepage_locked(page);
1288                 if (!err)
1289                         fuse_wait_on_page_writeback(inode, page->index);
1290         }
1291         return err;
1292 }
1293
1294 /*
1295  * Write back dirty pages now, because there may not be any suitable
1296  * open files later
1297  */
1298 static void fuse_vma_close(struct vm_area_struct *vma)
1299 {
1300         filemap_write_and_wait(vma->vm_file->f_mapping);
1301 }
1302
1303 /*
1304  * Wait for writeback against this page to complete before allowing it
1305  * to be marked dirty again, and hence written back again, possibly
1306  * before the previous writepage completed.
1307  *
1308  * Block here, instead of in ->writepage(), so that the userspace fs
1309  * can only block processes actually operating on the filesystem.
1310  *
1311  * Otherwise unprivileged userspace fs would be able to block
1312  * unrelated:
1313  *
1314  * - page migration
1315  * - sync(2)
1316  * - try_to_free_pages() with order > PAGE_ALLOC_COSTLY_ORDER
1317  */
1318 static int fuse_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
1319 {
1320         struct page *page = vmf->page;
1321         /*
1322          * Don't use page->mapping as it may become NULL from a
1323          * concurrent truncate.
1324          */
1325         struct inode *inode = vma->vm_file->f_mapping->host;
1326
1327         fuse_wait_on_page_writeback(inode, page->index);
1328         return 0;
1329 }
1330
1331 static const struct vm_operations_struct fuse_file_vm_ops = {
1332         .close          = fuse_vma_close,
1333         .fault          = filemap_fault,
1334         .page_mkwrite   = fuse_page_mkwrite,
1335 };
1336
1337 static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma)
1338 {
1339         if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) {
1340                 struct inode *inode = file->f_dentry->d_inode;
1341                 struct fuse_conn *fc = get_fuse_conn(inode);
1342                 struct fuse_inode *fi = get_fuse_inode(inode);
1343                 struct fuse_file *ff = file->private_data;
1344                 /*
1345                  * file may be written through mmap, so chain it onto the
1346                  * inode's write_files list
1347                  */
1348                 spin_lock(&fc->lock);
1349                 if (list_empty(&ff->write_entry))
1350                         list_add(&ff->write_entry, &fi->write_files);
1351                 spin_unlock(&fc->lock);
1352         }
1353         file_accessed(file);
1354         vma->vm_ops = &fuse_file_vm_ops;
1355         return 0;
1356 }
1357
1358 static int fuse_direct_mmap(struct file *file, struct vm_area_struct *vma)
1359 {
1360         /* Can't provide the coherency needed for MAP_SHARED */
1361         if (vma->vm_flags & VM_MAYSHARE)
1362                 return -ENODEV;
1363
1364         invalidate_inode_pages2(file->f_mapping);
1365
1366         return generic_file_mmap(file, vma);
1367 }
1368
1369 static int convert_fuse_file_lock(const struct fuse_file_lock *ffl,
1370                                   struct file_lock *fl)
1371 {
1372         switch (ffl->type) {
1373         case F_UNLCK:
1374                 break;
1375
1376         case F_RDLCK:
1377         case F_WRLCK:
1378                 if (ffl->start > OFFSET_MAX || ffl->end > OFFSET_MAX ||
1379                     ffl->end < ffl->start)
1380                         return -EIO;
1381
1382                 fl->fl_start = ffl->start;
1383                 fl->fl_end = ffl->end;
1384                 fl->fl_pid = ffl->pid;
1385                 break;
1386
1387         default:
1388                 return -EIO;
1389         }
1390         fl->fl_type = ffl->type;
1391         return 0;
1392 }
1393
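/*
 * Build a FUSE_GETLK/SETLK/SETLKW request from a struct file_lock,
 * translating the lock owner through fuse_lock_owner_id() and marking
 * flock-style locks with FUSE_LK_FLOCK.
 */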
1394 static void fuse_lk_fill(struct fuse_req *req, struct file *file,
1395                          const struct file_lock *fl, int opcode, pid_t pid,
1396                          int flock)
1397 {
1398         struct inode *inode = file->f_path.dentry->d_inode;
1399         struct fuse_conn *fc = get_fuse_conn(inode);
1400         struct fuse_file *ff = file->private_data;
1401         struct fuse_lk_in *arg = &req->misc.lk_in;
1402
1403         arg->fh = ff->fh;
1404         arg->owner = fuse_lock_owner_id(fc, fl->fl_owner);
1405         arg->lk.start = fl->fl_start;
1406         arg->lk.end = fl->fl_end;
1407         arg->lk.type = fl->fl_type;
1408         arg->lk.pid = pid;
1409         if (flock)
1410                 arg->lk_flags |= FUSE_LK_FLOCK;
1411         req->in.h.opcode = opcode;
1412         req->in.h.nodeid = get_node_id(inode);
1413         req->in.numargs = 1;
1414         req->in.args[0].size = sizeof(*arg);
1415         req->in.args[0].value = arg;
1416 }
1417
1418 static int fuse_getlk(struct file *file, struct file_lock *fl)
1419 {
1420         struct inode *inode = file->f_path.dentry->d_inode;
1421         struct fuse_conn *fc = get_fuse_conn(inode);
1422         struct fuse_req *req;
1423         struct fuse_lk_out outarg;
1424         int err;
1425
1426         req = fuse_get_req(fc);
1427         if (IS_ERR(req))
1428                 return PTR_ERR(req);
1429
1430         fuse_lk_fill(req, file, fl, FUSE_GETLK, 0, 0);
1431         req->out.numargs = 1;
1432         req->out.args[0].size = sizeof(outarg);
1433         req->out.args[0].value = &outarg;
1434         fuse_request_send(fc, req);
1435         err = req->out.h.error;
1436         fuse_put_request(fc, req);
1437         if (!err)
1438                 err = convert_fuse_file_lock(&outarg.lk, fl);
1439
1440         return err;
1441 }
1442
1443 static int fuse_setlk(struct file *file, struct file_lock *fl, int flock)
1444 {
1445         struct inode *inode = file->f_path.dentry->d_inode;
1446         struct fuse_conn *fc = get_fuse_conn(inode);
1447         struct fuse_req *req;
1448         int opcode = (fl->fl_flags & FL_SLEEP) ? FUSE_SETLKW : FUSE_SETLK;
1449         pid_t pid = fl->fl_type != F_UNLCK ? current->tgid : 0;
1450         int err;
1451
1452         if (fl->fl_lmops && fl->fl_lmops->fl_grant) {
1453                 /* NLM needs asynchronous locks, which we don't support yet */
1454                 return -ENOLCK;
1455         }
1456
1457         /* Unlock on close is handled by the flush method */
1458         if (fl->fl_flags & FL_CLOSE)
1459                 return 0;
1460
1461         req = fuse_get_req(fc);
1462         if (IS_ERR(req))
1463                 return PTR_ERR(req);
1464
1465         fuse_lk_fill(req, file, fl, opcode, pid, flock);
1466         fuse_request_send(fc, req);
1467         err = req->out.h.error;
1468         /* locking is restartable */
1469         if (err == -EINTR)
1470                 err = -ERESTARTSYS;
1471         fuse_put_request(fc, req);
1472         return err;
1473 }
1474
1475 static int fuse_file_lock(struct file *file, int cmd, struct file_lock *fl)
1476 {
1477         struct inode *inode = file->f_path.dentry->d_inode;
1478         struct fuse_conn *fc = get_fuse_conn(inode);
1479         int err;
1480
1481         if (cmd == F_CANCELLK) {
1482                 err = 0;
1483         } else if (cmd == F_GETLK) {
1484                 if (fc->no_lock) {
1485                         posix_test_lock(file, fl);
1486                         err = 0;
1487                 } else
1488                         err = fuse_getlk(file, fl);
1489         } else {
1490                 if (fc->no_lock)
1491                         err = posix_lock_file(file, fl, NULL);
1492                 else
1493                         err = fuse_setlk(file, fl, 0);
1494         }
1495         return err;
1496 }
1497
1498 static int fuse_file_flock(struct file *file, int cmd, struct file_lock *fl)
1499 {
1500         struct inode *inode = file->f_path.dentry->d_inode;
1501         struct fuse_conn *fc = get_fuse_conn(inode);
1502         int err;
1503
1504         if (fc->no_lock) {
1505                 err = flock_lock_file_wait(file, fl);
1506         } else {
1507                 /* emulate flock with POSIX locks */
1508                 fl->fl_owner = (fl_owner_t) file;
1509                 err = fuse_setlk(file, fl, 1);
1510         }
1511
1512         return err;
1513 }
1514
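/*
 * FIBMAP support for block-device backed filesystems.  Returns 0
 * ("unknown") if there is no backing bdev, if the server doesn't
 * implement FUSE_BMAP, or on any error.
 */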
1515 static sector_t fuse_bmap(struct address_space *mapping, sector_t block)
1516 {
1517         struct inode *inode = mapping->host;
1518         struct fuse_conn *fc = get_fuse_conn(inode);
1519         struct fuse_req *req;
1520         struct fuse_bmap_in inarg;
1521         struct fuse_bmap_out outarg;
1522         int err;
1523
1524         if (!inode->i_sb->s_bdev || fc->no_bmap)
1525                 return 0;
1526
1527         req = fuse_get_req(fc);
1528         if (IS_ERR(req))
1529                 return 0;
1530
1531         memset(&inarg, 0, sizeof(inarg));
1532         inarg.block = block;
1533         inarg.blocksize = inode->i_sb->s_blocksize;
1534         req->in.h.opcode = FUSE_BMAP;
1535         req->in.h.nodeid = get_node_id(inode);
1536         req->in.numargs = 1;
1537         req->in.args[0].size = sizeof(inarg);
1538         req->in.args[0].value = &inarg;
1539         req->out.numargs = 1;
1540         req->out.args[0].size = sizeof(outarg);
1541         req->out.args[0].value = &outarg;
1542         fuse_request_send(fc, req);
1543         err = req->out.h.error;
1544         fuse_put_request(fc, req);
1545         if (err == -ENOSYS)
1546                 fc->no_bmap = 1;
1547
1548         return err ? 0 : outarg.block;
1549 }
1550
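/*
 * llseek: for SEEK_END the attributes are refreshed first so that i_size
 * is current; the resulting offset must lie within [0, s_maxbytes].
 */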
1551 static loff_t fuse_file_llseek(struct file *file, loff_t offset, int origin)
1552 {
1553         loff_t retval;
1554         struct inode *inode = file->f_path.dentry->d_inode;
1555
1556         mutex_lock(&inode->i_mutex);
1557         switch (origin) {
1558         case SEEK_END:
1559                 retval = fuse_update_attributes(inode, NULL, file, NULL);
1560                 if (retval)
1561                         goto exit;
1562                 offset += i_size_read(inode);
1563                 break;
1564         case SEEK_CUR:
1565                 offset += file->f_pos;
1566         }
1567         retval = -EINVAL;
1568         if (offset >= 0 && offset <= inode->i_sb->s_maxbytes) {
1569                 if (offset != file->f_pos) {
1570                         file->f_pos = offset;
1571                         file->f_version = 0;
1572                 }
1573                 retval = offset;
1574         }
1575 exit:
1576         mutex_unlock(&inode->i_mutex);
1577         return retval;
1578 }
1579
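/*
 * Copy ioctl data between the request's pages and the user iovec, in the
 * direction selected by @to_user.
 */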
1580 static int fuse_ioctl_copy_user(struct page **pages, struct iovec *iov,
1581                         unsigned int nr_segs, size_t bytes, bool to_user)
1582 {
1583         struct iov_iter ii;
1584         int page_idx = 0;
1585
1586         if (!bytes)
1587                 return 0;
1588
1589         iov_iter_init(&ii, iov, nr_segs, bytes, 0);
1590
1591         while (iov_iter_count(&ii)) {
1592                 struct page *page = pages[page_idx++];
1593                 size_t todo = min_t(size_t, PAGE_SIZE, iov_iter_count(&ii));
1594                 void *kaddr, *map;
1595
1596                 kaddr = map = kmap(page);
1597
1598                 while (todo) {
1599                         char __user *uaddr = ii.iov->iov_base + ii.iov_offset;
1600                         size_t iov_len = ii.iov->iov_len - ii.iov_offset;
1601                         size_t copy = min(todo, iov_len);
1602                         size_t left;
1603
1604                         if (!to_user)
1605                                 left = copy_from_user(kaddr, uaddr, copy);
1606                         else
1607                                 left = copy_to_user(uaddr, kaddr, copy);
1608
1609                         if (unlikely(left))
1610                                 return -EFAULT;
1611
1612                         iov_iter_advance(&ii, copy);
1613                         todo -= copy;
1614                         kaddr += copy;
1615                 }
1616
1617                 kunmap(page);
1618         }
1619
1620         return 0;
1621 }
1622
1623 /* Make sure iov_length() won't overflow */
1624 static int fuse_verify_ioctl_iov(struct iovec *iov, size_t count)
1625 {
1626         size_t n;
1627         u32 max = FUSE_MAX_PAGES_PER_REQ << PAGE_SHIFT;
1628
1629         for (n = 0; n < count; n++) {
1630                 if (iov->iov_len > (size_t) max)
1631                         return -ENOMEM;
1632                 max -= iov->iov_len;
1633         }
1634         return 0;
1635 }
1636
1637 /*
1638  * CUSE servers compiled on 32bit broke on 64bit kernels because the
1639  * ABI was defined to be 'struct iovec' which is different on 32bit
1640  * and 64bit.  Fortunately we can determine which structure the server
1641  * used from the size of the reply.
1642  */
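     /*
      * Concretely: on a 64-bit kernel sizeof(struct iovec) is 16 bytes
      * (a pointer plus a size_t) while sizeof(struct compat_iovec) is
      * 8 bytes (two 32-bit fields), so a reply describing e.g. two
      * iovecs is 32 bytes from a native 64-bit server but 16 bytes
      * from a compat one, and the two layouts can be told apart from
      * @transferred alone.
      */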
1643 static int fuse_copy_ioctl_iovec(struct iovec *dst, void *src,
1644                                  size_t transferred, unsigned count,
1645                                  bool is_compat)
1646 {
1647 #ifdef CONFIG_COMPAT
1648         if (count * sizeof(struct compat_iovec) == transferred) {
1649                 struct compat_iovec *ciov = src;
1650                 unsigned i;
1651
1652                 /*
1653                  * With this interface a 32bit server cannot support
1654                  * non-compat (i.e. ones coming from 64bit apps) ioctl
1655                  * requests
1656                  */
1657                 if (!is_compat)
1658                         return -EINVAL;
1659
1660                 for (i = 0; i < count; i++) {
1661                         dst[i].iov_base = compat_ptr(ciov[i].iov_base);
1662                         dst[i].iov_len = ciov[i].iov_len;
1663                 }
1664                 return 0;
1665         }
1666 #endif
1667
1668         if (count * sizeof(struct iovec) != transferred)
1669                 return -EIO;
1670
1671         memcpy(dst, src, transferred);
1672         return 0;
1673 }
1674
1675 /*
1676  * For ioctls, there is no generic way to determine how much memory
1677  * needs to be read and/or written.  Furthermore, ioctls are allowed
1678  * to dereference the passed pointer, so the parameter requires deep
1679  * copying but FUSE has no idea whatsoever about what to copy in or
1680  * out.
1681  *
1682  * This is solved by allowing the FUSE server to retry the ioctl with the
1683  * necessary in/out iovecs.  Let's assume the ioctl implementation
1684  * needs to read in the following structure.
1685  *
1686  * struct a {
1687  *      char    *buf;
1688  *      size_t  buflen;
1689  * }
1690  *
1691  * On the first callout to the FUSE server, inarg->in_size and
1692  * inarg->out_size will be zero; the server then completes the ioctl
1693  * with FUSE_IOCTL_RETRY set in out->flags, out->in_iovs set to 1 and
1694  * the actual iov array to
1695  *
1696  * { { .iov_base = inarg.arg,   .iov_len = sizeof(struct a) } }
1697  *
1698  * which tells FUSE to copy in the requested area and retry the ioctl.
1699  * On the second round, the server has access to the structure and
1700  * from that it can tell what to look for next, so on the invocation,
1701  * from that it can tell what to look for next, so in its reply to this
1702  * second invocation it sets FUSE_IOCTL_RETRY, out->in_iovs to 2 and the iov array to
1703  * { { .iov_base = inarg.arg,   .iov_len = sizeof(struct a)     },
1704  *   { .iov_base = a.buf,       .iov_len = a.buflen             } }
1705  *
1706  * FUSE will copy both struct a and the pointed buffer from the
1707  * process doing the ioctl and retry ioctl with both struct a and the
1708  * buffer.
1709  *
1710  * This time, the FUSE server has everything it needs and completes the
1711  * ioctl without FUSE_IOCTL_RETRY, which finishes the ioctl call.
1712  *
1713  * Copying data out works the same way.
1714  *
1715  * Note that if FUSE_IOCTL_UNRESTRICTED is clear, the kernel
1716  * automatically initializes in and out iovs by decoding @cmd with
1717  * _IOC_* macros and the server is not allowed to request RETRY.  This
1718  * limits ioctl data transfers to well-formed ioctls and is the forced
1719  * behavior for all FUSE servers.
1720  */
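     /*
      * Purely illustrative sketch of the server side of the above
      * exchange for the 'struct a' example, written against libfuse's
      * low-level API (fuse_reply_ioctl_retry()/fuse_reply_ioctl()).
      * The handler name and exact prototype are assumptions and are
      * not part of this file or of the kernel ABI:
      *
      *     static void a_ioctl(fuse_req_t req, fuse_ino_t ino, int cmd,
      *                         void *arg, struct fuse_file_info *fi,
      *                         unsigned flags, const void *in_buf,
      *                         size_t in_bufsz, size_t out_bufsz)
      *     {
      *             const struct a *a = in_buf;
      *
      *             if (in_bufsz < sizeof(struct a)) {
      *                     // round 1: have the kernel copy in struct a
      *                     struct iovec iov = { arg, sizeof(struct a) };
      *
      *                     fuse_reply_ioctl_retry(req, &iov, 1, NULL, 0);
      *             } else if (in_bufsz < sizeof(struct a) + a->buflen) {
      *                     // round 2: a.buflen is known, ask for a.buf too
      *                     struct iovec iov[2] = {
      *                             { arg,    sizeof(*a) },
      *                             { a->buf, a->buflen  },
      *                     };
      *
      *                     fuse_reply_ioctl_retry(req, iov, 2, NULL, 0);
      *             } else {
      *                     // round 3: struct a and the buffer are both here
      *                     fuse_reply_ioctl(req, 0, NULL, 0);
      *             }
      *     }
      *
      * Each fuse_reply_ioctl_retry() becomes a reply with
      * FUSE_IOCTL_RETRY set, after which fuse_do_ioctl() below copies
      * the requested areas and resends the request.
      */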
1721 long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
1722                    unsigned int flags)
1723 {
1724         struct fuse_file *ff = file->private_data;
1725         struct fuse_conn *fc = ff->fc;
1726         struct fuse_ioctl_in inarg = {
1727                 .fh = ff->fh,
1728                 .cmd = cmd,
1729                 .arg = arg,
1730                 .flags = flags
1731         };
1732         struct fuse_ioctl_out outarg;
1733         struct fuse_req *req = NULL;
1734         struct page **pages = NULL;
1735         struct page *iov_page = NULL;
1736         struct iovec *in_iov = NULL, *out_iov = NULL;
1737         unsigned int in_iovs = 0, out_iovs = 0, num_pages = 0, max_pages;
1738         size_t in_size, out_size, transferred;
1739         int err;
1740
1741         /* assume all the iovs returned by the client always fit in a page */
1742         BUILD_BUG_ON(sizeof(struct iovec) * FUSE_IOCTL_MAX_IOV > PAGE_SIZE);
1743
1744         err = -ENOMEM;
1745         pages = kzalloc(sizeof(pages[0]) * FUSE_MAX_PAGES_PER_REQ, GFP_KERNEL);
1746         iov_page = alloc_page(GFP_KERNEL);
1747         if (!pages || !iov_page)
1748                 goto out;
1749
1750         /*
1751          * If restricted, initialize IO parameters as encoded in @cmd.
1752          * RETRY from server is not allowed.
1753          */
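             /*
              * For example (illustrative), FS_IOC_GETFLAGS is defined as
              * _IOR('f', 1, long): _IOC_DIR(cmd) is _IOC_READ and
              * _IOC_SIZE(cmd) is sizeof(long), so the block below builds
              * a single out iovec covering sizeof(long) bytes at @arg and
              * copies nothing in.
              */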
1754         if (!(flags & FUSE_IOCTL_UNRESTRICTED)) {
1755                 struct iovec *iov = page_address(iov_page);
1756
1757                 iov->iov_base = (void __user *)arg;
1758                 iov->iov_len = _IOC_SIZE(cmd);
1759
1760                 if (_IOC_DIR(cmd) & _IOC_WRITE) {
1761                         in_iov = iov;
1762                         in_iovs = 1;
1763                 }
1764
1765                 if (_IOC_DIR(cmd) & _IOC_READ) {
1766                         out_iov = iov;
1767                         out_iovs = 1;
1768                 }
1769         }
1770
1771  retry:
1772         inarg.in_size = in_size = iov_length(in_iov, in_iovs);
1773         inarg.out_size = out_size = iov_length(out_iov, out_iovs);
1774
1775         /*
1776          * Out data can be used either for actual out data or for iovs;
1777          * make sure there is always at least one page.
1778          */
1779         out_size = max_t(size_t, out_size, PAGE_SIZE);
1780         max_pages = DIV_ROUND_UP(max(in_size, out_size), PAGE_SIZE);
1781
1782         /* make sure there are enough buffer pages and init request with them */
1783         err = -ENOMEM;
1784         if (max_pages > FUSE_MAX_PAGES_PER_REQ)
1785                 goto out;
1786         while (num_pages < max_pages) {
1787                 pages[num_pages] = alloc_page(GFP_KERNEL | __GFP_HIGHMEM);
1788                 if (!pages[num_pages])
1789                         goto out;
1790                 num_pages++;
1791         }
1792
1793         req = fuse_get_req(fc);
1794         if (IS_ERR(req)) {
1795                 err = PTR_ERR(req);
1796                 req = NULL;
1797                 goto out;
1798         }
1799         memcpy(req->pages, pages, sizeof(req->pages[0]) * num_pages);
1800         req->num_pages = num_pages;
1801
1802         /* okay, let's send it to the client */
1803         req->in.h.opcode = FUSE_IOCTL;
1804         req->in.h.nodeid = ff->nodeid;
1805         req->in.numargs = 1;
1806         req->in.args[0].size = sizeof(inarg);
1807         req->in.args[0].value = &inarg;
1808         if (in_size) {
1809                 req->in.numargs++;
1810                 req->in.args[1].size = in_size;
1811                 req->in.argpages = 1;
1812
1813                 err = fuse_ioctl_copy_user(pages, in_iov, in_iovs, in_size,
1814                                            false);
1815                 if (err)
1816                         goto out;
1817         }
1818
1819         req->out.numargs = 2;
1820         req->out.args[0].size = sizeof(outarg);
1821         req->out.args[0].value = &outarg;
1822         req->out.args[1].size = out_size;
1823         req->out.argpages = 1;
1824         req->out.argvar = 1;
1825
1826         fuse_request_send(fc, req);
1827         err = req->out.h.error;
1828         transferred = req->out.args[1].size;
1829         fuse_put_request(fc, req);
1830         req = NULL;
1831         if (err)
1832                 goto out;
1833
1834         /* did it ask for retry? */
1835         if (outarg.flags & FUSE_IOCTL_RETRY) {
1836                 char *vaddr;
1837
1838                 /* no retry if in restricted mode */
1839                 err = -EIO;
1840                 if (!(flags & FUSE_IOCTL_UNRESTRICTED))
1841                         goto out;
1842
1843                 in_iovs = outarg.in_iovs;
1844                 out_iovs = outarg.out_iovs;
1845
1846                 /*
1847                  * Make sure the counts are within bounds; the separate
1848                  * checks protect against overflow.
1849                  */
1850                 err = -ENOMEM;
1851                 if (in_iovs > FUSE_IOCTL_MAX_IOV ||
1852                     out_iovs > FUSE_IOCTL_MAX_IOV ||
1853                     in_iovs + out_iovs > FUSE_IOCTL_MAX_IOV)
1854                         goto out;
1855
1856                 vaddr = kmap_atomic(pages[0], KM_USER0);
1857                 err = fuse_copy_ioctl_iovec(page_address(iov_page), vaddr,
1858                                             transferred, in_iovs + out_iovs,
1859                                             (flags & FUSE_IOCTL_COMPAT) != 0);
1860                 kunmap_atomic(vaddr, KM_USER0);
1861                 if (err)
1862                         goto out;
1863
1864                 in_iov = page_address(iov_page);
1865                 out_iov = in_iov + in_iovs;
1866
1867                 err = fuse_verify_ioctl_iov(in_iov, in_iovs);
1868                 if (err)
1869                         goto out;
1870
1871                 err = fuse_verify_ioctl_iov(out_iov, out_iovs);
1872                 if (err)
1873                         goto out;
1874
1875                 goto retry;
1876         }
1877
1878         err = -EIO;
1879         if (transferred > inarg.out_size)
1880                 goto out;
1881
1882         err = fuse_ioctl_copy_user(pages, out_iov, out_iovs, transferred, true);
1883  out:
1884         if (req)
1885                 fuse_put_request(fc, req);
1886         if (iov_page)
1887                 __free_page(iov_page);
1888         while (num_pages)
1889                 __free_page(pages[--num_pages]);
1890         kfree(pages);
1891
1892         return err ? err : outarg.result;
1893 }
1894 EXPORT_SYMBOL_GPL(fuse_do_ioctl);
1895
1896 static long fuse_file_ioctl_common(struct file *file, unsigned int cmd,
1897                                    unsigned long arg, unsigned int flags)
1898 {
1899         struct inode *inode = file->f_dentry->d_inode;
1900         struct fuse_conn *fc = get_fuse_conn(inode);
1901
1902         if (!fuse_allow_task(fc, current))
1903                 return -EACCES;
1904
1905         if (is_bad_inode(inode))
1906                 return -EIO;
1907
1908         return fuse_do_ioctl(file, cmd, arg, flags);
1909 }
1910
1911 static long fuse_file_ioctl(struct file *file, unsigned int cmd,
1912                             unsigned long arg)
1913 {
1914         return fuse_file_ioctl_common(file, cmd, arg, 0);
1915 }
1916
1917 static long fuse_file_compat_ioctl(struct file *file, unsigned int cmd,
1918                                    unsigned long arg)
1919 {
1920         return fuse_file_ioctl_common(file, cmd, arg, FUSE_IOCTL_COMPAT);
1921 }
1922
1923 /*
1924  * All files which have been polled are linked into the RB tree
1925  * fuse_conn->polled_files, which is indexed by kh.  Walk the tree and
1926  * find the matching one.
1927  */
1928 static struct rb_node **fuse_find_polled_node(struct fuse_conn *fc, u64 kh,
1929                                               struct rb_node **parent_out)
1930 {
1931         struct rb_node **link = &fc->polled_files.rb_node;
1932         struct rb_node *last = NULL;
1933
1934         while (*link) {
1935                 struct fuse_file *ff;
1936
1937                 last = *link;
1938                 ff = rb_entry(last, struct fuse_file, polled_node);
1939
1940                 if (kh < ff->kh)
1941                         link = &last->rb_left;
1942                 else if (kh > ff->kh)
1943                         link = &last->rb_right;
1944                 else
1945                         return link;
1946         }
1947
1948         if (parent_out)
1949                 *parent_out = last;
1950         return link;
1951 }
1952
1953 /*
1954  * The file is about to be polled.  Make sure it's on the polled_files
1955  * RB tree.  Note that files once added to the polled_files tree are
1956  * not removed before the file is released.  This is because a file
1957  * polled once is likely to be polled again.
1958  */
1959 static void fuse_register_polled_file(struct fuse_conn *fc,
1960                                       struct fuse_file *ff)
1961 {
1962         spin_lock(&fc->lock);
1963         if (RB_EMPTY_NODE(&ff->polled_node)) {
1964                 struct rb_node **link, *parent;
1965
1966                 link = fuse_find_polled_node(fc, ff->kh, &parent);
1967                 BUG_ON(*link);
1968                 rb_link_node(&ff->polled_node, parent, link);
1969                 rb_insert_color(&ff->polled_node, &fc->polled_files);
1970         }
1971         spin_unlock(&fc->lock);
1972 }
1973
1974 unsigned fuse_file_poll(struct file *file, poll_table *wait)
1975 {
1976         struct fuse_file *ff = file->private_data;
1977         struct fuse_conn *fc = ff->fc;
1978         struct fuse_poll_in inarg = { .fh = ff->fh, .kh = ff->kh };
1979         struct fuse_poll_out outarg;
1980         struct fuse_req *req;
1981         int err;
1982
1983         if (fc->no_poll)
1984                 return DEFAULT_POLLMASK;
1985
1986         poll_wait(file, &ff->poll_wait, wait);
1987
1988         /*
1989          * Ask for notification iff there's someone waiting for it.
1990          * The client may ignore the flag and always notify.
1991          */
1992         if (waitqueue_active(&ff->poll_wait)) {
1993                 inarg.flags |= FUSE_POLL_SCHEDULE_NOTIFY;
1994                 fuse_register_polled_file(fc, ff);
1995         }
1996
1997         req = fuse_get_req(fc);
1998         if (IS_ERR(req))
1999                 return POLLERR;
2000
2001         req->in.h.opcode = FUSE_POLL;
2002         req->in.h.nodeid = ff->nodeid;
2003         req->in.numargs = 1;
2004         req->in.args[0].size = sizeof(inarg);
2005         req->in.args[0].value = &inarg;
2006         req->out.numargs = 1;
2007         req->out.args[0].size = sizeof(outarg);
2008         req->out.args[0].value = &outarg;
2009         fuse_request_send(fc, req);
2010         err = req->out.h.error;
2011         fuse_put_request(fc, req);
2012
2013         if (!err)
2014                 return outarg.revents;
2015         if (err == -ENOSYS) {
2016                 fc->no_poll = 1;
2017                 return DEFAULT_POLLMASK;
2018         }
2019         return POLLERR;
2020 }
2021 EXPORT_SYMBOL_GPL(fuse_file_poll);
2022
2023 /*
2024  * This is called from fuse_handle_notify() on FUSE_NOTIFY_POLL and
2025  * wakes up the poll waiters.
2026  */
2027 int fuse_notify_poll_wakeup(struct fuse_conn *fc,
2028                             struct fuse_notify_poll_wakeup_out *outarg)
2029 {
2030         u64 kh = outarg->kh;
2031         struct rb_node **link;
2032
2033         spin_lock(&fc->lock);
2034
2035         link = fuse_find_polled_node(fc, kh, NULL);
2036         if (*link) {
2037                 struct fuse_file *ff;
2038
2039                 ff = rb_entry(*link, struct fuse_file, polled_node);
2040                 wake_up_interruptible_sync(&ff->poll_wait);
2041         }
2042
2043         spin_unlock(&fc->lock);
2044         return 0;
2045 }
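
     /*
      * Purely illustrative sketch of the matching server side, assuming
      * libfuse's low-level API (fuse_reply_poll(), fuse_pollhandle_destroy(),
      * fuse_lowlevel_notify_poll()); my_poll, my_current_revents() and
      * saved_ph are made-up names:
      *
      *     static struct fuse_pollhandle *saved_ph;
      *
      *     static void my_poll(fuse_req_t req, fuse_ino_t ino,
      *                         struct fuse_file_info *fi,
      *                         struct fuse_pollhandle *ph)
      *     {
      *             if (ph) {       // kernel asked to schedule a notification
      *                     if (saved_ph)
      *                             fuse_pollhandle_destroy(saved_ph);
      *                     saved_ph = ph;
      *             }
      *             fuse_reply_poll(req, my_current_revents());
      *     }
      *
      * When the file later becomes ready the server calls
      * fuse_lowlevel_notify_poll(saved_ph); the resulting FUSE_NOTIFY_POLL
      * message carries the kh stored in the poll handle and is handled by
      * fuse_notify_poll_wakeup() above, which looks the file up in
      * polled_files and wakes the waiters on ff->poll_wait.
      */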
2046
2047 static const struct file_operations fuse_file_operations = {
2048         .llseek         = fuse_file_llseek,
2049         .read           = do_sync_read,
2050         .aio_read       = fuse_file_aio_read,
2051         .write          = do_sync_write,
2052         .aio_write      = fuse_file_aio_write,
2053         .mmap           = fuse_file_mmap,
2054         .open           = fuse_open,
2055         .flush          = fuse_flush,
2056         .release        = fuse_release,
2057         .fsync          = fuse_fsync,
2058         .lock           = fuse_file_lock,
2059         .flock          = fuse_file_flock,
2060         .splice_read    = generic_file_splice_read,
2061         .unlocked_ioctl = fuse_file_ioctl,
2062         .compat_ioctl   = fuse_file_compat_ioctl,
2063         .poll           = fuse_file_poll,
2064 };
2065
2066 static const struct file_operations fuse_direct_io_file_operations = {
2067         .llseek         = fuse_file_llseek,
2068         .read           = fuse_direct_read,
2069         .write          = fuse_direct_write,
2070         .mmap           = fuse_direct_mmap,
2071         .open           = fuse_open,
2072         .flush          = fuse_flush,
2073         .release        = fuse_release,
2074         .fsync          = fuse_fsync,
2075         .lock           = fuse_file_lock,
2076         .flock          = fuse_file_flock,
2077         .unlocked_ioctl = fuse_file_ioctl,
2078         .compat_ioctl   = fuse_file_compat_ioctl,
2079         .poll           = fuse_file_poll,
2080         /* no splice_read */
2081 };
2082
2083 static const struct address_space_operations fuse_file_aops  = {
2084         .readpage       = fuse_readpage,
2085         .writepage      = fuse_writepage,
2086         .launder_page   = fuse_launder_page,
2087         .write_begin    = fuse_write_begin,
2088         .write_end      = fuse_write_end,
2089         .readpages      = fuse_readpages,
2090         .set_page_dirty = __set_page_dirty_nobuffers,
2091         .bmap           = fuse_bmap,
2092 };
2093
2094 void fuse_init_file_inode(struct inode *inode)
2095 {
2096         inode->i_fop = &fuse_file_operations;
2097         inode->i_data.a_ops = &fuse_file_aops;
2098 }