hugetlbfs: accept subpool min_size mount option and setup accordingly
fs/hugetlbfs/inode.c (firefly-linux-kernel-4.4.55.git)
/*
 * hugetlbpage-backed filesystem.  Based on ramfs.
 *
 * Nadia Yvette Chambers, 2002
 *
 * Copyright (C) 2002 Linus Torvalds.
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/module.h>
#include <linux/thread_info.h>
#include <asm/current.h>
#include <linux/sched.h>		/* remove ASAP */
#include <linux/fs.h>
#include <linux/mount.h>
#include <linux/file.h>
#include <linux/kernel.h>
#include <linux/writeback.h>
#include <linux/pagemap.h>
#include <linux/highmem.h>
#include <linux/init.h>
#include <linux/string.h>
#include <linux/capability.h>
#include <linux/ctype.h>
#include <linux/backing-dev.h>
#include <linux/hugetlb.h>
#include <linux/pagevec.h>
#include <linux/parser.h>
#include <linux/mman.h>
#include <linux/slab.h>
#include <linux/dnotify.h>
#include <linux/statfs.h>
#include <linux/security.h>
#include <linux/magic.h>
#include <linux/migrate.h>

#include <asm/uaccess.h>

static const struct super_operations hugetlbfs_ops;
static const struct address_space_operations hugetlbfs_aops;
const struct file_operations hugetlbfs_file_operations;
static const struct inode_operations hugetlbfs_dir_inode_operations;
static const struct inode_operations hugetlbfs_inode_operations;

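/*
 * Mount-time configuration.  max_hpages and min_hpages are -1 when the
 * corresponding size option was not specified; see the defaults set up
 * in hugetlbfs_fill_super().
 */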
struct hugetlbfs_config {
        kuid_t   uid;
        kgid_t   gid;
        umode_t mode;
        long    max_hpages;
        long    nr_inodes;
        struct hstate *hstate;
        long    min_hpages;
};

struct hugetlbfs_inode_info {
        struct shared_policy policy;
        struct inode vfs_inode;
};

static inline struct hugetlbfs_inode_info *HUGETLBFS_I(struct inode *inode)
{
        return container_of(inode, struct hugetlbfs_inode_info, vfs_inode);
}

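/*
 * Exposed as the vm.hugetlb_shm_group sysctl; members of that group may
 * use SHM_HUGETLB without CAP_IPC_LOCK (see can_do_hugetlb_shm() below).
 */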
int sysctl_hugetlb_shm_group;

enum {
        Opt_size, Opt_nr_inodes,
        Opt_mode, Opt_uid, Opt_gid,
        Opt_pagesize, Opt_min_size,
        Opt_err,
};

static const match_table_t tokens = {
        {Opt_size,      "size=%s"},
        {Opt_nr_inodes, "nr_inodes=%s"},
        {Opt_mode,      "mode=%o"},
        {Opt_uid,       "uid=%u"},
        {Opt_gid,       "gid=%u"},
        {Opt_pagesize,  "pagesize=%s"},
        {Opt_min_size,  "min_size=%s"},
        {Opt_err,       NULL},
};
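/*
 * An illustrative mount line exercising these options (path and IDs are
 * examples only):
 *
 *   mount -t hugetlbfs -o uid=1000,gid=1000,mode=1770,pagesize=2M,\
 *         size=1G,min_size=512M,nr_inodes=64 none /mnt/huge
 *
 * size= and min_size= also accept a trailing '%' to request a percentage
 * of the huge page pool, as parsed in hugetlbfs_parse_options() below.
 */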

static void huge_pagevec_release(struct pagevec *pvec)
{
        int i;

        for (i = 0; i < pagevec_count(pvec); ++i)
                put_page(pvec->pages[i]);

        pagevec_reinit(pvec);
}

static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
{
        struct inode *inode = file_inode(file);
        loff_t len, vma_len;
        int ret;
        struct hstate *h = hstate_file(file);

        /*
         * vma address alignment (but not the pgoff alignment) has
         * already been checked by prepare_hugepage_range.  If you add
         * any error returns here, do so after setting VM_HUGETLB, so
         * is_vm_hugetlb_page tests below unmap_region go the right
         * way when do_mmap_pgoff unwinds (may be important on powerpc
         * and ia64).
         */
        vma->vm_flags |= VM_HUGETLB | VM_DONTEXPAND;
        vma->vm_ops = &hugetlb_vm_ops;

        if (vma->vm_pgoff & (~huge_page_mask(h) >> PAGE_SHIFT))
                return -EINVAL;

        vma_len = (loff_t)(vma->vm_end - vma->vm_start);

        mutex_lock(&inode->i_mutex);
        file_accessed(file);

        ret = -ENOMEM;
        len = vma_len + ((loff_t)vma->vm_pgoff << PAGE_SHIFT);

        if (hugetlb_reserve_pages(inode,
                                vma->vm_pgoff >> huge_page_order(h),
                                len >> huge_page_shift(h), vma,
                                vma->vm_flags))
                goto out;

        ret = 0;
        hugetlb_prefault_arch_hook(vma->vm_mm);
        if (vma->vm_flags & VM_WRITE && inode->i_size < len)
                inode->i_size = len;
out:
        mutex_unlock(&inode->i_mutex);

        return ret;
}

/*
 * Called under down_write(mmap_sem).
 */

#ifndef HAVE_ARCH_HUGETLB_UNMAPPED_AREA
static unsigned long
hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
                unsigned long len, unsigned long pgoff, unsigned long flags)
{
        struct mm_struct *mm = current->mm;
        struct vm_area_struct *vma;
        struct hstate *h = hstate_file(file);
        struct vm_unmapped_area_info info;

        if (len & ~huge_page_mask(h))
                return -EINVAL;
        if (len > TASK_SIZE)
                return -ENOMEM;

        if (flags & MAP_FIXED) {
                if (prepare_hugepage_range(file, addr, len))
                        return -EINVAL;
                return addr;
        }

        if (addr) {
                addr = ALIGN(addr, huge_page_size(h));
                vma = find_vma(mm, addr);
                if (TASK_SIZE - len >= addr &&
                    (!vma || addr + len <= vma->vm_start))
                        return addr;
        }

        info.flags = 0;
        info.length = len;
        info.low_limit = TASK_UNMAPPED_BASE;
        info.high_limit = TASK_SIZE;
        info.align_mask = PAGE_MASK & ~huge_page_mask(h);
        info.align_offset = 0;
        return vm_unmapped_area(&info);
}
#endif
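/*
 * A quick sanity check of the align_mask computation above (assuming 4K
 * base pages and a 2MB hstate): PAGE_MASK & ~huge_page_mask(h) == 0x1ff000,
 * so vm_unmapped_area() only hands back 2MB-aligned addresses.
 */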

static int
hugetlbfs_read_actor(struct page *page, unsigned long offset,
                        char __user *buf, unsigned long count,
                        unsigned long size)
{
        char *kaddr;
        unsigned long left, copied = 0;
        int i, chunksize;

        if (size > count)
                size = count;

        /* Find which 4k chunk and offset within that chunk */
        i = offset >> PAGE_CACHE_SHIFT;
        offset = offset & ~PAGE_CACHE_MASK;

        while (size) {
                chunksize = PAGE_CACHE_SIZE;
                if (offset)
                        chunksize -= offset;
                if (chunksize > size)
                        chunksize = size;
                kaddr = kmap(&page[i]);
                left = __copy_to_user(buf, kaddr + offset, chunksize);
                kunmap(&page[i]);
                if (left) {
                        copied += (chunksize - left);
                        break;
                }
                offset = 0;
                size -= chunksize;
                buf += chunksize;
                copied += chunksize;
                i++;
        }
        return copied ? copied : -EFAULT;
}
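/*
 * For example, with 4K base pages a 2MB huge page is walked as up to 512
 * PAGE_CACHE_SIZE chunks; kmap() is used per 4K subpage because, on highmem
 * configurations, the compound page may not be addressable through a single
 * kernel mapping.
 */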

/*
 * Support for read() - Find the page attached to f_mapping and copy out the
 * data. It's *very* similar to do_generic_mapping_read(), but we can't use
 * that since it has PAGE_CACHE_SIZE assumptions.
 */
static ssize_t hugetlbfs_read(struct file *filp, char __user *buf,
                              size_t len, loff_t *ppos)
{
        struct hstate *h = hstate_file(filp);
        struct address_space *mapping = filp->f_mapping;
        struct inode *inode = mapping->host;
        unsigned long index = *ppos >> huge_page_shift(h);
        unsigned long offset = *ppos & ~huge_page_mask(h);
        unsigned long end_index;
        loff_t isize;
        ssize_t retval = 0;

        /* validate length */
        if (len == 0)
                goto out;

        for (;;) {
                struct page *page;
                unsigned long nr, ret;
                int ra;

                /* nr is the maximum number of bytes to copy from this page */
                nr = huge_page_size(h);
                isize = i_size_read(inode);
                if (!isize)
                        goto out;
                end_index = (isize - 1) >> huge_page_shift(h);
                if (index >= end_index) {
                        if (index > end_index)
                                goto out;
                        nr = ((isize - 1) & ~huge_page_mask(h)) + 1;
                        if (nr <= offset)
                                goto out;
                }
                nr = nr - offset;

                /* Find the page */
                page = find_lock_page(mapping, index);
                if (unlikely(page == NULL)) {
                        /*
                         * We have a HOLE, zero out the user-buffer for the
                         * length of the hole or request.
                         */
                        ret = len < nr ? len : nr;
                        if (clear_user(buf, ret))
                                ra = -EFAULT;
                        else
                                ra = 0;
                } else {
                        unlock_page(page);

                        /*
                         * We have the page, copy it to user space buffer.
                         */
                        ra = hugetlbfs_read_actor(page, offset, buf, len, nr);
                        ret = ra;
                        page_cache_release(page);
                }
                if (ra < 0) {
                        if (retval == 0)
                                retval = ra;
                        goto out;
                }

                offset += ret;
                retval += ret;
                len -= ret;
                index += offset >> huge_page_shift(h);
                offset &= ~huge_page_mask(h);

                /* short read or no more work */
                if ((ret != nr) || (len == 0))
                        break;
        }
out:
        *ppos = ((loff_t)index << huge_page_shift(h)) + offset;
        return retval;
}

static int hugetlbfs_write_begin(struct file *file,
                        struct address_space *mapping,
                        loff_t pos, unsigned len, unsigned flags,
                        struct page **pagep, void **fsdata)
{
        return -EINVAL;
}

static int hugetlbfs_write_end(struct file *file, struct address_space *mapping,
                        loff_t pos, unsigned len, unsigned copied,
                        struct page *page, void *fsdata)
{
        BUG();
        return -EINVAL;
}

static void truncate_huge_page(struct page *page)
{
        ClearPageDirty(page);
        ClearPageUptodate(page);
        delete_from_page_cache(page);
}

static void truncate_hugepages(struct inode *inode, loff_t lstart)
{
        struct hstate *h = hstate_inode(inode);
        struct address_space *mapping = &inode->i_data;
        const pgoff_t start = lstart >> huge_page_shift(h);
        struct pagevec pvec;
        pgoff_t next;
        int i, freed = 0;

        pagevec_init(&pvec, 0);
        next = start;
        while (1) {
                if (!pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
                        if (next == start)
                                break;
                        next = start;
                        continue;
                }

                for (i = 0; i < pagevec_count(&pvec); ++i) {
                        struct page *page = pvec.pages[i];

                        lock_page(page);
                        if (page->index > next)
                                next = page->index;
                        ++next;
                        truncate_huge_page(page);
                        unlock_page(page);
                        freed++;
                }
                huge_pagevec_release(&pvec);
        }
        BUG_ON(!lstart && mapping->nrpages);
        hugetlb_unreserve_pages(inode, start, freed);
}

static void hugetlbfs_evict_inode(struct inode *inode)
{
        struct resv_map *resv_map;

        truncate_hugepages(inode, 0);
        resv_map = (struct resv_map *)inode->i_mapping->private_data;
        /* root inode doesn't have the resv_map, so we should check it */
        if (resv_map)
                resv_map_release(&resv_map->refs);
        clear_inode(inode);
}

static inline void
hugetlb_vmtruncate_list(struct rb_root *root, pgoff_t pgoff)
{
        struct vm_area_struct *vma;

        vma_interval_tree_foreach(vma, root, pgoff, ULONG_MAX) {
                unsigned long v_offset;

                /*
                 * Can the expression below overflow on 32-bit arches?
                 * No, because the interval tree returns us only those vmas
                 * which overlap the truncated area starting at pgoff,
                 * and no vma on a 32-bit arch can span beyond the 4GB.
                 */
                if (vma->vm_pgoff < pgoff)
                        v_offset = (pgoff - vma->vm_pgoff) << PAGE_SHIFT;
                else
                        v_offset = 0;

                unmap_hugepage_range(vma, vma->vm_start + v_offset,
                                     vma->vm_end, NULL);
        }
}

static int hugetlb_vmtruncate(struct inode *inode, loff_t offset)
{
        pgoff_t pgoff;
        struct address_space *mapping = inode->i_mapping;
        struct hstate *h = hstate_inode(inode);

        BUG_ON(offset & ~huge_page_mask(h));
        pgoff = offset >> PAGE_SHIFT;

        i_size_write(inode, offset);
        i_mmap_lock_write(mapping);
        if (!RB_EMPTY_ROOT(&mapping->i_mmap))
                hugetlb_vmtruncate_list(&mapping->i_mmap, pgoff);
        i_mmap_unlock_write(mapping);
        truncate_hugepages(inode, offset);
        return 0;
}

static int hugetlbfs_setattr(struct dentry *dentry, struct iattr *attr)
{
        struct inode *inode = dentry->d_inode;
        struct hstate *h = hstate_inode(inode);
        int error;
        unsigned int ia_valid = attr->ia_valid;

        BUG_ON(!inode);

        error = inode_change_ok(inode, attr);
        if (error)
                return error;

        if (ia_valid & ATTR_SIZE) {
                error = -EINVAL;
                if (attr->ia_size & ~huge_page_mask(h))
                        return -EINVAL;
                error = hugetlb_vmtruncate(inode, attr->ia_size);
                if (error)
                        return error;
        }

        setattr_copy(inode, attr);
        mark_inode_dirty(inode);
        return 0;
}

static struct inode *hugetlbfs_get_root(struct super_block *sb,
                                        struct hugetlbfs_config *config)
{
        struct inode *inode;

        inode = new_inode(sb);
        if (inode) {
                struct hugetlbfs_inode_info *info;
                inode->i_ino = get_next_ino();
                inode->i_mode = S_IFDIR | config->mode;
                inode->i_uid = config->uid;
                inode->i_gid = config->gid;
                inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
                info = HUGETLBFS_I(inode);
                mpol_shared_policy_init(&info->policy, NULL);
                inode->i_op = &hugetlbfs_dir_inode_operations;
                inode->i_fop = &simple_dir_operations;
                /* directory inodes start off with i_nlink == 2 (for "." entry) */
                inc_nlink(inode);
                lockdep_annotate_inode_mutex_key(inode);
        }
        return inode;
}

/*
 * Hugetlbfs is not reclaimable; therefore its i_mmap_rwsem will never
 * be taken from reclaim -- unlike regular filesystems. This needs an
 * annotation because huge_pmd_share() does an allocation under
 * i_mmap_rwsem.
 */
static struct lock_class_key hugetlbfs_i_mmap_rwsem_key;

static struct inode *hugetlbfs_get_inode(struct super_block *sb,
                                        struct inode *dir,
                                        umode_t mode, dev_t dev)
{
        struct inode *inode;
        struct resv_map *resv_map;

        resv_map = resv_map_alloc();
        if (!resv_map)
                return NULL;

        inode = new_inode(sb);
        if (inode) {
                struct hugetlbfs_inode_info *info;
                inode->i_ino = get_next_ino();
                inode_init_owner(inode, dir, mode);
                lockdep_set_class(&inode->i_mapping->i_mmap_rwsem,
                                &hugetlbfs_i_mmap_rwsem_key);
                inode->i_mapping->a_ops = &hugetlbfs_aops;
                inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
                inode->i_mapping->private_data = resv_map;
                info = HUGETLBFS_I(inode);
                /*
                 * The policy is initialized here even if we are creating a
                 * private inode because initialization simply creates an
                 * empty rb tree and calls spin_lock_init(), later when we
                 * call mpol_free_shared_policy() it will just return because
                 * the rb tree will still be empty.
                 */
                mpol_shared_policy_init(&info->policy, NULL);
                switch (mode & S_IFMT) {
                default:
                        init_special_inode(inode, mode, dev);
                        break;
                case S_IFREG:
                        inode->i_op = &hugetlbfs_inode_operations;
                        inode->i_fop = &hugetlbfs_file_operations;
                        break;
                case S_IFDIR:
                        inode->i_op = &hugetlbfs_dir_inode_operations;
                        inode->i_fop = &simple_dir_operations;

                        /* directory inodes start off with i_nlink == 2 (for "." entry) */
                        inc_nlink(inode);
                        break;
                case S_IFLNK:
                        inode->i_op = &page_symlink_inode_operations;
                        break;
                }
                lockdep_annotate_inode_mutex_key(inode);
        } else
                kref_put(&resv_map->refs, resv_map_release);

        return inode;
}

/*
 * File creation. Allocate an inode, and we're done..
 */
static int hugetlbfs_mknod(struct inode *dir,
                        struct dentry *dentry, umode_t mode, dev_t dev)
{
        struct inode *inode;
        int error = -ENOSPC;

        inode = hugetlbfs_get_inode(dir->i_sb, dir, mode, dev);
        if (inode) {
                dir->i_ctime = dir->i_mtime = CURRENT_TIME;
                d_instantiate(dentry, inode);
                dget(dentry);   /* Extra count - pin the dentry in core */
                error = 0;
        }
        return error;
}

static int hugetlbfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
{
        int retval = hugetlbfs_mknod(dir, dentry, mode | S_IFDIR, 0);
        if (!retval)
                inc_nlink(dir);
        return retval;
}

static int hugetlbfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, bool excl)
{
        return hugetlbfs_mknod(dir, dentry, mode | S_IFREG, 0);
}

static int hugetlbfs_symlink(struct inode *dir,
                        struct dentry *dentry, const char *symname)
{
        struct inode *inode;
        int error = -ENOSPC;

        inode = hugetlbfs_get_inode(dir->i_sb, dir, S_IFLNK|S_IRWXUGO, 0);
        if (inode) {
                int l = strlen(symname)+1;
                error = page_symlink(inode, symname, l);
                if (!error) {
                        d_instantiate(dentry, inode);
                        dget(dentry);
                } else
                        iput(inode);
        }
        dir->i_ctime = dir->i_mtime = CURRENT_TIME;

        return error;
}

/*
 * mark the head page dirty
 */
static int hugetlbfs_set_page_dirty(struct page *page)
{
        struct page *head = compound_head(page);

        SetPageDirty(head);
        return 0;
}

static int hugetlbfs_migrate_page(struct address_space *mapping,
                                struct page *newpage, struct page *page,
                                enum migrate_mode mode)
{
        int rc;

        rc = migrate_huge_page_move_mapping(mapping, newpage, page);
        if (rc != MIGRATEPAGE_SUCCESS)
                return rc;
        migrate_page_copy(newpage, page);

        return MIGRATEPAGE_SUCCESS;
}

static int hugetlbfs_statfs(struct dentry *dentry, struct kstatfs *buf)
{
        struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(dentry->d_sb);
        struct hstate *h = hstate_inode(dentry->d_inode);

        buf->f_type = HUGETLBFS_MAGIC;
        buf->f_bsize = huge_page_size(h);
        if (sbinfo) {
                spin_lock(&sbinfo->stat_lock);
                /* If no limits set, just report 0 for max/free/used
                 * blocks, like simple_statfs() */
                if (sbinfo->spool) {
                        long free_pages;

                        spin_lock(&sbinfo->spool->lock);
                        buf->f_blocks = sbinfo->spool->max_hpages;
                        free_pages = sbinfo->spool->max_hpages
                                - sbinfo->spool->used_hpages;
                        buf->f_bavail = buf->f_bfree = free_pages;
                        spin_unlock(&sbinfo->spool->lock);
                        buf->f_files = sbinfo->max_inodes;
                        buf->f_ffree = sbinfo->free_inodes;
                }
                spin_unlock(&sbinfo->stat_lock);
        }
        buf->f_namelen = NAME_MAX;
        return 0;
}
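/*
 * Example: on a mount with pagesize=2M and size=1G, f_bsize is 0x200000 and
 * f_blocks is 512, so statfs(2) callers see capacity in huge-page-sized
 * blocks rather than base-page blocks.
 */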

static void hugetlbfs_put_super(struct super_block *sb)
{
        struct hugetlbfs_sb_info *sbi = HUGETLBFS_SB(sb);

        if (sbi) {
                sb->s_fs_info = NULL;

                if (sbi->spool)
                        hugepage_put_subpool(sbi->spool);

                kfree(sbi);
        }
}

static inline int hugetlbfs_dec_free_inodes(struct hugetlbfs_sb_info *sbinfo)
{
        if (sbinfo->free_inodes >= 0) {
                spin_lock(&sbinfo->stat_lock);
                if (unlikely(!sbinfo->free_inodes)) {
                        spin_unlock(&sbinfo->stat_lock);
                        return 0;
                }
                sbinfo->free_inodes--;
                spin_unlock(&sbinfo->stat_lock);
        }

        return 1;
}

static void hugetlbfs_inc_free_inodes(struct hugetlbfs_sb_info *sbinfo)
{
        if (sbinfo->free_inodes >= 0) {
                spin_lock(&sbinfo->stat_lock);
                sbinfo->free_inodes++;
                spin_unlock(&sbinfo->stat_lock);
        }
}


static struct kmem_cache *hugetlbfs_inode_cachep;

static struct inode *hugetlbfs_alloc_inode(struct super_block *sb)
{
        struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(sb);
        struct hugetlbfs_inode_info *p;

        if (unlikely(!hugetlbfs_dec_free_inodes(sbinfo)))
                return NULL;
        p = kmem_cache_alloc(hugetlbfs_inode_cachep, GFP_KERNEL);
        if (unlikely(!p)) {
                hugetlbfs_inc_free_inodes(sbinfo);
                return NULL;
        }
        return &p->vfs_inode;
}

static void hugetlbfs_i_callback(struct rcu_head *head)
{
        struct inode *inode = container_of(head, struct inode, i_rcu);
        kmem_cache_free(hugetlbfs_inode_cachep, HUGETLBFS_I(inode));
}

static void hugetlbfs_destroy_inode(struct inode *inode)
{
        hugetlbfs_inc_free_inodes(HUGETLBFS_SB(inode->i_sb));
        mpol_free_shared_policy(&HUGETLBFS_I(inode)->policy);
        call_rcu(&inode->i_rcu, hugetlbfs_i_callback);
}

static const struct address_space_operations hugetlbfs_aops = {
        .write_begin    = hugetlbfs_write_begin,
        .write_end      = hugetlbfs_write_end,
        .set_page_dirty = hugetlbfs_set_page_dirty,
        .migratepage    = hugetlbfs_migrate_page,
};


static void init_once(void *foo)
{
        struct hugetlbfs_inode_info *ei = (struct hugetlbfs_inode_info *)foo;

        inode_init_once(&ei->vfs_inode);
}

const struct file_operations hugetlbfs_file_operations = {
        .read                   = hugetlbfs_read,
        .mmap                   = hugetlbfs_file_mmap,
        .fsync                  = noop_fsync,
        .get_unmapped_area      = hugetlb_get_unmapped_area,
        .llseek                 = default_llseek,
};

static const struct inode_operations hugetlbfs_dir_inode_operations = {
        .create         = hugetlbfs_create,
        .lookup         = simple_lookup,
        .link           = simple_link,
        .unlink         = simple_unlink,
        .symlink        = hugetlbfs_symlink,
        .mkdir          = hugetlbfs_mkdir,
        .rmdir          = simple_rmdir,
        .mknod          = hugetlbfs_mknod,
        .rename         = simple_rename,
        .setattr        = hugetlbfs_setattr,
};

static const struct inode_operations hugetlbfs_inode_operations = {
        .setattr        = hugetlbfs_setattr,
};

static const struct super_operations hugetlbfs_ops = {
        .alloc_inode    = hugetlbfs_alloc_inode,
        .destroy_inode  = hugetlbfs_destroy_inode,
        .evict_inode    = hugetlbfs_evict_inode,
        .statfs         = hugetlbfs_statfs,
        .put_super      = hugetlbfs_put_super,
        .show_options   = generic_show_options,
};

enum { NO_SIZE, SIZE_STD, SIZE_PERCENT };

/*
 * Convert size option passed from command line to number of huge pages
 * in the pool specified by hstate.  Size option could be in bytes
 * (val_type == SIZE_STD) or percentage of the pool (val_type == SIZE_PERCENT).
 */
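/*
 * Worked example (assuming a 2MB hstate, so huge_page_shift(h) == 21):
 *   size=1G  -> 0x40000000 >> 21                      == 512 huge pages
 *   size=50% with max_huge_pages == 1024
 *            -> ((50 << 21) * 1024 / 100) >> 21       == 512 huge pages
 */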
static long long
hugetlbfs_size_to_hpages(struct hstate *h, unsigned long long size_opt,
                                                                int val_type)
{
        if (val_type == NO_SIZE)
                return -1;

        if (val_type == SIZE_PERCENT) {
                size_opt <<= huge_page_shift(h);
                size_opt *= h->max_huge_pages;
                do_div(size_opt, 100);
        }

        size_opt >>= huge_page_shift(h);
        return size_opt;
}

static int
hugetlbfs_parse_options(char *options, struct hugetlbfs_config *pconfig)
{
        char *p, *rest;
        substring_t args[MAX_OPT_ARGS];
        int option;
        unsigned long long max_size_opt = 0, min_size_opt = 0;
        int max_val_type = NO_SIZE, min_val_type = NO_SIZE;

        if (!options)
                return 0;

        while ((p = strsep(&options, ",")) != NULL) {
                int token;
                if (!*p)
                        continue;

                token = match_token(p, tokens, args);
                switch (token) {
                case Opt_uid:
                        if (match_int(&args[0], &option))
                                goto bad_val;
                        pconfig->uid = make_kuid(current_user_ns(), option);
                        if (!uid_valid(pconfig->uid))
                                goto bad_val;
                        break;

                case Opt_gid:
                        if (match_int(&args[0], &option))
                                goto bad_val;
                        pconfig->gid = make_kgid(current_user_ns(), option);
                        if (!gid_valid(pconfig->gid))
                                goto bad_val;
                        break;

                case Opt_mode:
                        if (match_octal(&args[0], &option))
                                goto bad_val;
                        pconfig->mode = option & 01777U;
                        break;

                case Opt_size: {
                        /* memparse() will accept a K/M/G without a digit */
                        if (!isdigit(*args[0].from))
                                goto bad_val;
                        max_size_opt = memparse(args[0].from, &rest);
                        max_val_type = SIZE_STD;
                        if (*rest == '%')
                                max_val_type = SIZE_PERCENT;
                        break;
                }

                case Opt_nr_inodes:
                        /* memparse() will accept a K/M/G without a digit */
                        if (!isdigit(*args[0].from))
                                goto bad_val;
                        pconfig->nr_inodes = memparse(args[0].from, &rest);
                        break;

                case Opt_pagesize: {
                        unsigned long ps;
                        ps = memparse(args[0].from, &rest);
                        pconfig->hstate = size_to_hstate(ps);
                        if (!pconfig->hstate) {
                                pr_err("Unsupported page size %lu MB\n",
                                        ps >> 20);
                                return -EINVAL;
                        }
                        break;
                }

                case Opt_min_size: {
                        /* memparse() will accept a K/M/G without a digit */
                        if (!isdigit(*args[0].from))
                                goto bad_val;
                        min_size_opt = memparse(args[0].from, &rest);
                        min_val_type = SIZE_STD;
                        if (*rest == '%')
                                min_val_type = SIZE_PERCENT;
                        break;
                }

                default:
                        pr_err("Bad mount option: \"%s\"\n", p);
                        return -EINVAL;
                        break;
                }
        }

        /*
         * Use huge page pool size (in hstate) to convert the size
         * options to number of huge pages.  If NO_SIZE, -1 is returned.
         */
        pconfig->max_hpages = hugetlbfs_size_to_hpages(pconfig->hstate,
                                                max_size_opt, max_val_type);
        pconfig->min_hpages = hugetlbfs_size_to_hpages(pconfig->hstate,
                                                min_size_opt, min_val_type);

        /*
         * If max_size was specified, then min_size must be smaller
         */
        if (max_val_type > NO_SIZE &&
            pconfig->min_hpages > pconfig->max_hpages) {
                pr_err("minimum size can not be greater than maximum size\n");
                return -EINVAL;
        }

        return 0;

bad_val:
        pr_err("Bad value '%s' for mount option '%s'\n", args[0].from, p);
        return -EINVAL;
}

static int
hugetlbfs_fill_super(struct super_block *sb, void *data, int silent)
{
        int ret;
        struct hugetlbfs_config config;
        struct hugetlbfs_sb_info *sbinfo;

        save_mount_options(sb, data);

        config.max_hpages = -1; /* No limit on size by default */
        config.nr_inodes = -1; /* No limit on number of inodes by default */
        config.uid = current_fsuid();
        config.gid = current_fsgid();
        config.mode = 0755;
        config.hstate = &default_hstate;
        config.min_hpages = -1; /* No default minimum size */
        ret = hugetlbfs_parse_options(data, &config);
        if (ret)
                return ret;

        sbinfo = kmalloc(sizeof(struct hugetlbfs_sb_info), GFP_KERNEL);
        if (!sbinfo)
                return -ENOMEM;
        sb->s_fs_info = sbinfo;
        sbinfo->hstate = config.hstate;
        spin_lock_init(&sbinfo->stat_lock);
        sbinfo->max_inodes = config.nr_inodes;
        sbinfo->free_inodes = config.nr_inodes;
        sbinfo->spool = NULL;
        /*
         * Allocate and initialize subpool if maximum or minimum size is
         * specified.  Any needed reservations (for minimum size) are taken
         * when the subpool is created.
         */
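        /*
         * For example, min_size=256M with 2MB huge pages gives
         * config.min_hpages == 128; those pages are reserved when the
         * subpool is set up and released again at unmount via
         * hugepage_put_subpool().
         */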
        if (config.max_hpages != -1 || config.min_hpages != -1) {
                sbinfo->spool = hugepage_new_subpool(config.hstate,
                                                        config.max_hpages,
                                                        config.min_hpages);
                if (!sbinfo->spool)
                        goto out_free;
        }
        sb->s_maxbytes = MAX_LFS_FILESIZE;
        sb->s_blocksize = huge_page_size(config.hstate);
        sb->s_blocksize_bits = huge_page_shift(config.hstate);
        sb->s_magic = HUGETLBFS_MAGIC;
        sb->s_op = &hugetlbfs_ops;
        sb->s_time_gran = 1;
        sb->s_root = d_make_root(hugetlbfs_get_root(sb, &config));
        if (!sb->s_root)
                goto out_free;
        return 0;
out_free:
        kfree(sbinfo->spool);
        kfree(sbinfo);
        return -ENOMEM;
}

static struct dentry *hugetlbfs_mount(struct file_system_type *fs_type,
        int flags, const char *dev_name, void *data)
{
        return mount_nodev(fs_type, flags, data, hugetlbfs_fill_super);
}

static struct file_system_type hugetlbfs_fs_type = {
        .name           = "hugetlbfs",
        .mount          = hugetlbfs_mount,
        .kill_sb        = kill_litter_super,
};
MODULE_ALIAS_FS("hugetlbfs");

static struct vfsmount *hugetlbfs_vfsmount[HUGE_MAX_HSTATE];

static int can_do_hugetlb_shm(void)
{
        kgid_t shm_group;
        shm_group = make_kgid(&init_user_ns, sysctl_hugetlb_shm_group);
        return capable(CAP_IPC_LOCK) || in_group_p(shm_group);
}

static int get_hstate_idx(int page_size_log)
{
        struct hstate *h = hstate_sizelog(page_size_log);

        if (!h)
                return -1;
        return h - hstates;
}

static const struct dentry_operations anon_ops = {
        .d_dname = simple_dname
};

/*
 * Note that the caller should align size to the proper hugepage size,
 * otherwise hugetlb_reserve_pages reserves one less hugepage than intended.
 */
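/*
 * A caller could satisfy this with, e.g.:
 *      size = ALIGN(size, huge_page_size(hstate_sizelog(page_size_log)));
 * (illustrative only; ALIGN() and hstate_sizelog() are the existing helpers
 * used elsewhere in this file).
 */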
struct file *hugetlb_file_setup(const char *name, size_t size,
                                vm_flags_t acctflag, struct user_struct **user,
                                int creat_flags, int page_size_log)
{
        struct file *file = ERR_PTR(-ENOMEM);
        struct inode *inode;
        struct path path;
        struct super_block *sb;
        struct qstr quick_string;
        int hstate_idx;

        hstate_idx = get_hstate_idx(page_size_log);
        if (hstate_idx < 0)
                return ERR_PTR(-ENODEV);

        *user = NULL;
        if (!hugetlbfs_vfsmount[hstate_idx])
                return ERR_PTR(-ENOENT);

        if (creat_flags == HUGETLB_SHMFS_INODE && !can_do_hugetlb_shm()) {
                *user = current_user();
                if (user_shm_lock(size, *user)) {
                        task_lock(current);
                        pr_warn_once("%s (%d): Using mlock ulimits for SHM_HUGETLB is deprecated\n",
                                current->comm, current->pid);
                        task_unlock(current);
                } else {
                        *user = NULL;
                        return ERR_PTR(-EPERM);
                }
        }

        sb = hugetlbfs_vfsmount[hstate_idx]->mnt_sb;
        quick_string.name = name;
        quick_string.len = strlen(quick_string.name);
        quick_string.hash = 0;
        path.dentry = d_alloc_pseudo(sb, &quick_string);
        if (!path.dentry)
                goto out_shm_unlock;

        d_set_d_op(path.dentry, &anon_ops);
        path.mnt = mntget(hugetlbfs_vfsmount[hstate_idx]);
        file = ERR_PTR(-ENOSPC);
        inode = hugetlbfs_get_inode(sb, NULL, S_IFREG | S_IRWXUGO, 0);
        if (!inode)
                goto out_dentry;

        file = ERR_PTR(-ENOMEM);
        if (hugetlb_reserve_pages(inode, 0,
                        size >> huge_page_shift(hstate_inode(inode)), NULL,
                        acctflag))
                goto out_inode;

        d_instantiate(path.dentry, inode);
        inode->i_size = size;
        clear_nlink(inode);

        file = alloc_file(&path, FMODE_WRITE | FMODE_READ,
                        &hugetlbfs_file_operations);
        if (IS_ERR(file))
                goto out_dentry; /* inode is already attached */

        return file;

out_inode:
        iput(inode);
out_dentry:
        path_put(&path);
out_shm_unlock:
        if (*user) {
                user_shm_unlock(size, *user);
                *user = NULL;
        }
        return file;
}

static int __init init_hugetlbfs_fs(void)
{
        struct hstate *h;
        int error;
        int i;

        if (!hugepages_supported()) {
                pr_info("disabling because there are no supported hugepage sizes\n");
                return -ENOTSUPP;
        }

        error = -ENOMEM;
        hugetlbfs_inode_cachep = kmem_cache_create("hugetlbfs_inode_cache",
                                        sizeof(struct hugetlbfs_inode_info),
                                        0, 0, init_once);
        if (hugetlbfs_inode_cachep == NULL)
                goto out2;

        error = register_filesystem(&hugetlbfs_fs_type);
        if (error)
                goto out;

        i = 0;
        for_each_hstate(h) {
                char buf[50];
                unsigned ps_kb = 1U << (h->order + PAGE_SHIFT - 10);

                snprintf(buf, sizeof(buf), "pagesize=%uK", ps_kb);
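                /*
                 * e.g. with 4K base pages, a 2MB hstate (order 9) yields
                 * "pagesize=2048K" and a 1GB hstate (order 18)
                 * "pagesize=1048576K".
                 */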
                hugetlbfs_vfsmount[i] = kern_mount_data(&hugetlbfs_fs_type,
                                                        buf);

                if (IS_ERR(hugetlbfs_vfsmount[i])) {
                        pr_err("Cannot mount internal hugetlbfs for "
                                "page size %uK", ps_kb);
                        error = PTR_ERR(hugetlbfs_vfsmount[i]);
                        hugetlbfs_vfsmount[i] = NULL;
                }
                i++;
        }
        /* Non default hstates are optional */
        if (!IS_ERR_OR_NULL(hugetlbfs_vfsmount[default_hstate_idx]))
                return 0;

 out:
        kmem_cache_destroy(hugetlbfs_inode_cachep);
 out2:
        return error;
}

static void __exit exit_hugetlbfs_fs(void)
{
        struct hstate *h;
        int i;

        /*
         * Make sure all delayed rcu free inodes are flushed before we
         * destroy cache.
         */
        rcu_barrier();
        kmem_cache_destroy(hugetlbfs_inode_cachep);
        i = 0;
        for_each_hstate(h)
                kern_unmount(hugetlbfs_vfsmount[i++]);
        unregister_filesystem(&hugetlbfs_fs_type);
}

module_init(init_hugetlbfs_fs)
module_exit(exit_hugetlbfs_fs)

MODULE_LICENSE("GPL");