#define AIO_RING_PAGES 8
+/*
+ * Table of kioctx pointers, published through mm->ioctx_table and read
+ * with rcu_dereference(). A NULL slot is free.
+ */
+struct kioctx_table {
+ struct rcu_head rcu; /* used by kfree_rcu() when a table is replaced */
+ unsigned nr; /* number of slots in table[] */
+ struct kioctx *table[]; /* slot i holds the kioctx whose ->id is i, or NULL */
+};
+
+/* Per-CPU part of a kioctx; struct kioctx refers to it via its __percpu 'cpu' pointer. */
struct kioctx_cpu {
unsigned reqs_available; /* NOTE(review): presumably this CPU's share of free request slots - confirm */
};
struct percpu_ref users;
atomic_t dead;
- /* This needs improving */
unsigned long user_id;
- struct hlist_node list;
struct __percpu kioctx_cpu *cpu;
struct page *internal_pages[AIO_RING_PAGES];
struct file *aio_ring_file;
+
+ unsigned id;
};
/*------ sysctl variables----*/
}
__initcall(aio_setup);
+/* put_aio_ring_file
+ * Detaches and releases ctx->aio_ring_file, if one is set: the inode is
+ * truncated to size 0, the mapping's private_data and ctx->aio_ring_file
+ * are cleared under the mapping's private_lock, and fput() is called on
+ * the file. Does nothing when ctx->aio_ring_file is NULL.
+ */
+static void put_aio_ring_file(struct kioctx *ctx)
+{
+ /* Local copy: ctx->aio_ring_file is cleared below, yet the file is
+ * still needed for the unlock and the final fput().
+ */
+ struct file *aio_ring_file = ctx->aio_ring_file;
+ if (aio_ring_file) {
+ truncate_setsize(aio_ring_file->f_inode, 0);
+
+ /* Prevent further access to the kioctx from migratepages */
+ spin_lock(&aio_ring_file->f_inode->i_mapping->private_lock);
+ aio_ring_file->f_inode->i_mapping->private_data = NULL;
+ ctx->aio_ring_file = NULL;
+ spin_unlock(&aio_ring_file->f_inode->i_mapping->private_lock);
+
+ fput(aio_ring_file);
+ }
+}
+
static void aio_free_ring(struct kioctx *ctx)
{
int i;
- struct file *aio_ring_file = ctx->aio_ring_file;
for (i = 0; i < ctx->nr_pages; i++) {
pr_debug("pid(%d) [%d] page->count=%d\n", current->pid, i,
put_page(ctx->ring_pages[i]);
}
+ put_aio_ring_file(ctx);
+
if (ctx->ring_pages && ctx->ring_pages != ctx->internal_pages)
kfree(ctx->ring_pages);
-
- if (aio_ring_file) {
- truncate_setsize(aio_ring_file->f_inode, 0);
- pr_debug("pid(%d) i_nlink=%u d_count=%d d_unhashed=%d i_count=%d\n",
- current->pid, aio_ring_file->f_inode->i_nlink,
- aio_ring_file->f_path.dentry->d_count,
- d_unhashed(aio_ring_file->f_path.dentry),
- atomic_read(&aio_ring_file->f_inode->i_count));
- fput(aio_ring_file);
- ctx->aio_ring_file = NULL;
- }
}
static int aio_ring_mmap(struct file *file, struct vm_area_struct *vma)
static int aio_migratepage(struct address_space *mapping, struct page *new,
struct page *old, enum migrate_mode mode)
{
- struct kioctx *ctx = mapping->private_data;
+ struct kioctx *ctx;
unsigned long flags;
- unsigned idx = old->index;
int rc;
/* Writeback must be complete */
get_page(new);
- spin_lock_irqsave(&ctx->completion_lock, flags);
- migrate_page_copy(new, old);
- ctx->ring_pages[idx] = new;
- spin_unlock_irqrestore(&ctx->completion_lock, flags);
+ /* We can potentially race against kioctx teardown here. Use the
+ * address_space's private data lock to protect the mapping's
+ * private_data.
+ */
+ spin_lock(&mapping->private_lock);
+ ctx = mapping->private_data;
+ if (ctx) {
+ pgoff_t idx;
+ spin_lock_irqsave(&ctx->completion_lock, flags);
+ migrate_page_copy(new, old);
+ idx = old->index;
+ if (idx < (pgoff_t)ctx->nr_pages)
+ ctx->ring_pages[idx] = new;
+ spin_unlock_irqrestore(&ctx->completion_lock, flags);
+ } else
+ rc = -EBUSY;
+ spin_unlock(&mapping->private_lock);
return rc;
}
aio_free_ring(ctx);
return -EAGAIN;
}
- up_write(&mm->mmap_sem);
-
- mm_populate(ctx->mmap_base, populate);
pr_debug("mmap address: 0x%08lx\n", ctx->mmap_base);
+
+ /* We must do this while still holding mmap_sem for write, as we
+ * need to be protected against userspace attempting to mremap()
+ * or munmap() the ring buffer.
+ */
ctx->nr_pages = get_user_pages(current, mm, ctx->mmap_base, nr_pages,
1, 0, ctx->ring_pages, NULL);
+
+ /* Dropping the reference here is safe as the page cache will hold
+ * onto the pages for us. It is also required so that page migration
+ * can unmap the pages and get the right reference count.
+ */
for (i = 0; i < ctx->nr_pages; i++)
put_page(ctx->ring_pages[i]);
+ up_write(&mm->mmap_sem);
+
if (unlikely(ctx->nr_pages != nr_pages)) {
aio_free_ring(ctx);
return -EAGAIN;
ring = kmap_atomic(ctx->ring_pages[0]);
ring->nr = nr_events; /* user copy */
- ring->id = ctx->user_id;
+ ring->id = ~0U;
ring->head = ring->tail = 0;
ring->magic = AIO_RING_MAGIC;
ring->compat_features = AIO_RING_COMPAT_FEATURES;
schedule_work(&ctx->free_work);
}
+/* ioctx_add_table
+ * Stores ctx in the first free (NULL) slot of mm->ioctx_table, records
+ * the slot index in ctx->id and writes it to the id field of the ring
+ * header in ctx->ring_pages[0]. When there is no table, or no free slot,
+ * a larger table is allocated and published and the search is retried.
+ *
+ * Returns 0 on success, or -ENOMEM if allocating a table fails.
+ */
+static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm)
+{
+ unsigned i, new_nr;
+ struct kioctx_table *table, *old;
+ struct aio_ring *ring;
+
+ spin_lock(&mm->ioctx_lock);
+ rcu_read_lock();
+ table = rcu_dereference(mm->ioctx_table);
+
+ /* Every pass through the loop starts with ioctx_lock and the RCU
+ * read lock held; both exits (return 0 / return -ENOMEM) happen
+ * with them released.
+ */
+ while (1) {
+ if (table)
+ for (i = 0; i < table->nr; i++)
+ if (!table->table[i]) {
+ ctx->id = i;
+ table->table[i] = ctx;
+ rcu_read_unlock();
+ spin_unlock(&mm->ioctx_lock);
+
+ /* Publish the slot index in the ring header (page 0). */
+ ring = kmap_atomic(ctx->ring_pages[0]);
+ ring->id = ctx->id;
+ kunmap_atomic(ring);
+ return 0;
+ }
+
+ /* No table yet: start with 4 slots; otherwise quadruple. */
+ new_nr = (table ? table->nr : 1) * 4;
+
+ /* Both locks are dropped around the GFP_KERNEL allocation. */
+ rcu_read_unlock();
+ spin_unlock(&mm->ioctx_lock);
+
+ table = kzalloc(sizeof(*table) + sizeof(struct kioctx *) *
+ new_nr, GFP_KERNEL);
+ if (!table)
+ return -ENOMEM;
+
+ table->nr = new_nr;
+
+ /* mm->ioctx_table is re-read after retaking the lock, since the
+ * value fetched earlier was obtained before the lock was dropped.
+ */
+ spin_lock(&mm->ioctx_lock);
+ rcu_read_lock();
+ old = rcu_dereference(mm->ioctx_table);
+
+ if (!old) {
+ rcu_assign_pointer(mm->ioctx_table, table);
+ } else if (table->nr > old->nr) {
+ /* Carry the existing entries over, then free the old
+ * table through kfree_rcu().
+ */
+ memcpy(table->table, old->table,
+ old->nr * sizeof(struct kioctx *));
+
+ rcu_assign_pointer(mm->ioctx_table, table);
+ kfree_rcu(old, rcu);
+ } else {
+ /* The installed table is already at least as large as
+ * the new one: discard ours and search the installed one.
+ */
+ kfree(table);
+ table = old;
+ }
+ }
+}
+
/* ioctx_alloc
* Allocates and initializes an ioctx. Returns an ERR_PTR if it failed.
*/
atomic_set(&ctx->reqs_available, ctx->nr_events - 1);
ctx->req_batch = (ctx->nr_events - 1) / (num_possible_cpus() * 4);
- BUG_ON(!ctx->req_batch);
+ if (ctx->req_batch < 1)
+ ctx->req_batch = 1;
/* limit the number of system wide aios */
spin_lock(&aio_nr_lock);
percpu_ref_get(&ctx->users); /* io_setup() will drop this ref */
- /* now link into global list. */
- spin_lock(&mm->ioctx_lock);
- hlist_add_head_rcu(&ctx->list, &mm->ioctx_list);
- spin_unlock(&mm->ioctx_lock);
+ err = ioctx_add_table(ctx, mm);
+ if (err)
+ goto out_cleanup_put;
pr_debug("allocated ioctx %p[%ld]: mm=%p mask=0x%x\n",
ctx, ctx->user_id, mm, ctx->nr_events);
return ctx;
+out_cleanup_put:
+ percpu_ref_put(&ctx->users);
out_cleanup:
err = -EAGAIN;
aio_free_ring(ctx);
out_freeref:
free_percpu(ctx->users.pcpu_count);
out_freectx:
- if (ctx->aio_ring_file)
- fput(ctx->aio_ring_file);
+ put_aio_ring_file(ctx);
kmem_cache_free(kioctx_cachep, ctx);
pr_debug("error allocating ioctx %d\n", err);
return ERR_PTR(err);
* when the processes owning a context have all exited to encourage
* the rapid destruction of the kioctx.
*/
-static void kill_ioctx(struct kioctx *ctx)
+static void kill_ioctx(struct mm_struct *mm, struct kioctx *ctx)
{
if (!atomic_xchg(&ctx->dead, 1)) {
- hlist_del_rcu(&ctx->list);
+ struct kioctx_table *table;
+
+ spin_lock(&mm->ioctx_lock);
+ rcu_read_lock();
+ table = rcu_dereference(mm->ioctx_table);
+
+ WARN_ON(ctx != table->table[ctx->id]);
+ table->table[ctx->id] = NULL;
+ rcu_read_unlock();
+ spin_unlock(&mm->ioctx_lock);
+
/* percpu_ref_kill() will do the necessary call_rcu() */
wake_up_all(&ctx->wait);
*/
void exit_aio(struct mm_struct *mm)
{
+ struct kioctx_table *table;
struct kioctx *ctx;
- struct hlist_node *n;
+ /* Next slot to examine; kept across passes of the outer loop so the
+ * scan resumes after the last kioctx that was handled.
+ */
+ unsigned i = 0;
+
+ while (1) {
+ rcu_read_lock();
+ table = rcu_dereference(mm->ioctx_table);
+
+ /* Skip empty slots until a kioctx is found or the table ends. */
+ do {
+ if (!table || i >= table->nr) {
+ /* Nothing left: clear the pointer, free the table. */
+ rcu_read_unlock();
+ rcu_assign_pointer(mm->ioctx_table, NULL);
+ kfree(table); /* kfree(NULL) is a no-op, no guard needed */
+ return;
+ }
+
+ ctx = table->table[i++];
+ } while (!ctx);
+
+ rcu_read_unlock();
- hlist_for_each_entry_safe(ctx, n, &mm->ioctx_list, list) {
/*
* We don't need to bother with munmap() here -
* exit_mmap(mm) is coming and it'll unmap everything.
*/
ctx->mmap_size = 0;
- kill_ioctx(ctx);
+ kill_ioctx(mm, ctx);
}
}
+/* lookup_ioctx
+ * Finds the current mm's kioctx for ctx_id. ctx_id is treated as the user
+ * address of an aio_ring; the id field read from it selects a slot in
+ * mm->ioctx_table, and the kioctx found there is accepted only if its
+ * user_id equals ctx_id.
+ *
+ * Returns the kioctx after a percpu_ref_get() on ctx->users, or NULL when
+ * the id cannot be read, is out of range, or the slot does not match.
+ */
static struct kioctx *lookup_ioctx(unsigned long ctx_id)
{
+ struct aio_ring __user *ring = (void __user *)ctx_id;
struct mm_struct *mm = current->mm;
struct kioctx *ctx, *ret = NULL;
+ struct kioctx_table *table;
+ unsigned id;
+
+ /* The slot index is read from user memory. */
+ if (get_user(id, &ring->id))
+ return NULL;
rcu_read_lock();
+ table = rcu_dereference(mm->ioctx_table);
- hlist_for_each_entry_rcu(ctx, &mm->ioctx_list, list) {
- if (ctx->user_id == ctx_id) {
- percpu_ref_get(&ctx->users);
- ret = ctx;
- break;
- }
- }
+ if (!table || id >= table->nr)
+ goto out;
+ /* NOTE(review): id is user-controlled and indexes the table right
+ * after the bounds check; consider array_index_nospec() if this tree
+ * provides it - confirm.
+ */
+ ctx = table->table[id];
+ if (ctx && ctx->user_id == ctx_id) {
+ percpu_ref_get(&ctx->users);
+ ret = ctx;
+ }
+out:
rcu_read_unlock();
return ret;
}
if (!IS_ERR(ioctx)) {
ret = put_user(ioctx->user_id, ctxp);
if (ret)
- kill_ioctx(ioctx);
+ kill_ioctx(current->mm, ioctx);
percpu_ref_put(&ioctx->users);
}
{
struct kioctx *ioctx = lookup_ioctx(ctx);
if (likely(NULL != ioctx)) {
- kill_ioctx(ioctx);
+ kill_ioctx(current->mm, ioctx);
percpu_ref_put(&ioctx->users);
return 0;
}