kernel/bpf/syscall.c

   1 /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
   2  *
   3  * This program is free software; you can redistribute it and/or
   4  * modify it under the terms of version 2 of the GNU General Public
   5  * License as published by the Free Software Foundation.
   6  *
   7  * This program is distributed in the hope that it will be useful, but
   8  * WITHOUT ANY WARRANTY; without even the implied warranty of
   9  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  10  * General Public License for more details.
  11  */
  12 #include <linux/bpf.h>
  13 #include <linux/syscalls.h>
  14 #include <linux/slab.h>
  15 #include <linux/anon_inodes.h>
  16
  17 static LIST_HEAD(bpf_map_types);
  18
  19 static struct bpf_map *find_and_alloc_map(union bpf_attr *attr)
  20 {
  21         struct bpf_map_type_list *tl;
  22         struct bpf_map *map;
  23
  24         list_for_each_entry(tl, &bpf_map_types, list_node) {
  25                 if (tl->type == attr->map_type) {
  26                         map = tl->ops->map_alloc(attr);
  27                         if (IS_ERR(map))
  28                                 return map;
  29                         map->ops = tl->ops;
  30                         map->map_type = attr->map_type;
  31                         return map;
  32                 }
  33         }
  34         return ERR_PTR(-EINVAL);
  35 }
  36
  37 /* boot time registration of different map implementations */
  38 void bpf_register_map_type(struct bpf_map_type_list *tl)
  39 {
  40         list_add(&tl->list_node, &bpf_map_types);
  41 }
  42
  43 /* called from workqueue */
  44 static void bpf_map_free_deferred(struct work_struct *work)
  45 {
  46         struct bpf_map *map = container_of(work, struct bpf_map, work);
  47
  48         /* implementation dependent freeing */
  49         map->ops->map_free(map);
  50 }
  51
  52 /* decrement map refcnt and schedule it for freeing via workqueue
  53  * (unrelying map implementation ops->map_free() might sleep)
  54  */
  55 void bpf_map_put(struct bpf_map *map)
  56 {
  57         if (atomic_dec_and_test(&map->refcnt)) {
  58                 INIT_WORK(&map->work, bpf_map_free_deferred);
  59                 schedule_work(&map->work);
  60         }
  61 }
  62
  63 static int bpf_map_release(struct inode *inode, struct file *filp)
  64 {
  65         struct bpf_map *map = filp->private_data;
  66
  67         bpf_map_put(map);
  68         return 0;
  69 }
  70
   71 static const struct file_operations bpf_map_fops = {
              /* Handed to anon_inode_getfd() in map_create(); ->release is the
               * only operation set here.  bpf_map_release() calls bpf_map_put()
               * on the map kept in filp->private_data.
               */
   72         .release = bpf_map_release,
   73 };
  74
  75 /* helper macro to check that unused fields 'union bpf_attr' are zero */
  76 #define CHECK_ATTR(CMD) \
  77         memchr_inv((void *) &attr->CMD##_LAST_FIELD + \
  78                    sizeof(attr->CMD##_LAST_FIELD), 0, \
  79                    sizeof(*attr) - \
  80                    offsetof(union bpf_attr, CMD##_LAST_FIELD) - \
  81                    sizeof(attr->CMD##_LAST_FIELD)) != NULL
  82
  83 #define BPF_MAP_CREATE_LAST_FIELD max_entries
  84 /* called via syscall */
  85 static int map_create(union bpf_attr *attr)
  86 {
  87         struct bpf_map *map;
  88         int err;
  89
  90         err = CHECK_ATTR(BPF_MAP_CREATE);
  91         if (err)
  92                 return -EINVAL;
  93
  94         /* find map type and init map: hashtable vs rbtree vs bloom vs ... */
  95         map = find_and_alloc_map(attr);
  96         if (IS_ERR(map))
  97                 return PTR_ERR(map);
  98
  99         atomic_set(&map->refcnt, 1);
 100
 101         err = anon_inode_getfd("bpf-map", &bpf_map_fops, map, O_RDWR | O_CLOEXEC);
 102
 103         if (err < 0)
 104                 /* failed to allocate fd */
 105                 goto free_map;
 106
 107         return err;
 108
 109 free_map:
 110         map->ops->map_free(map);
 111         return err;
 112 }
 113
 114 SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
 115 {
 116         union bpf_attr attr = {};
 117         int err;
 118
 119         /* the syscall is limited to root temporarily. This restriction will be
 120          * lifted when security audit is clean. Note that eBPF+tracing must have
 121          * this restriction, since it may pass kernel data to user space
 122          */
 123         if (!capable(CAP_SYS_ADMIN))
 124                 return -EPERM;
 125
 126         if (!access_ok(VERIFY_READ, uattr, 1))
 127                 return -EFAULT;
 128
 129         if (size > PAGE_SIZE)   /* silly large */
 130                 return -E2BIG;
 131
 132         /* If we're handed a bigger struct than we know of,
 133          * ensure all the unknown bits are 0 - i.e. new
 134          * user-space does not rely on any kernel feature
 135          * extensions we dont know about yet.
 136          */
 137         if (size > sizeof(attr)) {
 138                 unsigned char __user *addr;
 139                 unsigned char __user *end;
 140                 unsigned char val;
 141
 142                 addr = (void __user *)uattr + sizeof(attr);
 143                 end  = (void __user *)uattr + size;
 144
 145                 for (; addr < end; addr++) {
 146                         err = get_user(val, addr);
 147                         if (err)
 148                                 return err;
 149                         if (val)
 150                                 return -E2BIG;
 151                 }
 152                 size = sizeof(attr);
 153         }
 154
 155         /* copy attributes from user space, may be less than sizeof(bpf_attr) */
 156         if (copy_from_user(&attr, uattr, size) != 0)
 157                 return -EFAULT;
 158
 159         switch (cmd) {
 160         case BPF_MAP_CREATE:
 161                 err = map_create(&attr);
 162                 break;
 163         default:
 164                 err = -EINVAL;
 165                 break;
 166         }
 167
 168         return err;
 169 }