drivers/staging/ramster/tmem.c
1 /*
2  * In-kernel transcendent memory (generic implementation)
3  *
4  * Copyright (c) 2009-2012, Dan Magenheimer, Oracle Corp.
5  *
6  * The primary purpose of Transcendent Memory ("tmem") is to map object-oriented
7  * "handles" (triples containing a pool id, an object id, and an index) to
8  * pages in a page-accessible memory (PAM).  Tmem references the PAM pages via
9  * an abstract "pampd" (PAM page-descriptor), which can be operated on by a
10  * set of functions (pamops).  Each pampd contains some representation of
11  * PAGE_SIZE bytes worth of data. For those familiar with key-value stores,
12  * the tmem handle is a three-level hierarchical key, and the value is always
13  * reconstituted (but not necessarily stored) as PAGE_SIZE bytes and is
14  * referenced in the datastore by the pampd.  The hierarchy is required
15  * to ensure that certain invalidation functions can be performed efficiently
16  * (i.e. flush all indexes associated with this object_id, or
17  * flush all objects associated with this pool).
18  *
19  * Tmem must support potentially millions of pages and must be able to insert,
20  * find, and delete these pages at a potential frequency of thousands per
21  * second concurrently across many CPUs (and, if used with KVM, across many
22  * vcpus across many guests).  Tmem is tracked with a hierarchy of data
23  * structures, organized by the elements in the handle-tuple: pool_id,
24  * object_id, and page index.  One or more "clients" (e.g. guests) each
25  * provide one or more tmem_pools.  Each pool contains a hash table of
26  * rb_trees of tmem_objs.  Each tmem_obj contains a radix-tree-like tree
27  * of pointers, with intermediate nodes called tmem_objnodes.  Each leaf
28  * pointer in this tree points to a pampd, which is accessible only through
29  * a small set of callbacks registered by the PAM implementation (see
30  * tmem_register_pamops). Tmem only needs to do memory allocation for objs
31  * and objnodes, and this is done via a set of callbacks that must be
32  * registered by the tmem host implementation (e.g. see tmem_register_hostops).
33  */
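/*
 * Illustration of the key/value analogy above (comment only, not compiled):
 * the handle is the three-level key and the value is always one PAGE_SIZE
 * chunk of data, reachable only through its pampd, e.g.
 *
 *        (pool_id=2, object_id=0x1234, index=7)  -->  pampd  -->  one page
 *
 * The hierarchy is what makes "invalidate every index of this object_id" and
 * "invalidate every object in this pool" cheap, as implemented later in
 * tmem_flush_object() and tmem_pool_flush().
 */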
34
35 #include <linux/list.h>
36 #include <linux/spinlock.h>
37 #include <linux/atomic.h>
38 #ifdef CONFIG_RAMSTER
39 #include <linux/delay.h>
40 #endif
41
42 #include "tmem.h"
43
44 /* data structure sentinels used for debugging... see tmem.h */
45 #define POOL_SENTINEL 0x87658765
46 #define OBJ_SENTINEL 0x12345678
47 #define OBJNODE_SENTINEL 0xfedcba09
48
49 /*
50  * A tmem host implementation must use this function to register callbacks
51  * for memory allocation.
52  */
53 static struct tmem_hostops tmem_hostops;
54
55 static void tmem_objnode_tree_init(void);
56
57 void tmem_register_hostops(struct tmem_hostops *m)
58 {
59         tmem_objnode_tree_init();
60         tmem_hostops = *m;
61 }
62
63 /*
64  * A tmem host implementation must use this function to register
65  * callbacks for a page-accessible memory (PAM) implementation.
66  */
67 static struct tmem_pamops tmem_pamops;
68
69 void tmem_register_pamops(struct tmem_pamops *m)
70 {
71         tmem_pamops = *m;
72 }
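/*
 * Registration sketch (hypothetical host, e.g. a zcache-like driver): before
 * issuing any other tmem call, the host wires up its allocation and PAM
 * callbacks.  Only the callback names actually invoked from this file are
 * shown; the full structure definitions live in tmem.h.
 *
 *        static struct tmem_hostops my_hostops = {
 *                .obj_alloc     = my_obj_alloc,
 *                .obj_free      = my_obj_free,
 *                .objnode_alloc = my_objnode_alloc,
 *                .objnode_free  = my_objnode_free,
 *        };
 *        static struct tmem_pamops my_pamops = {
 *                .create_finish     = my_pampd_create_finish,
 *                .get_data          = my_pampd_get_data,
 *                .get_data_and_free = my_pampd_get_data_and_free,
 *                .free              = my_pampd_free,
 *        };
 *
 *        tmem_register_hostops(&my_hostops);
 *        tmem_register_pamops(&my_pamops);
 */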
73
74 /*
75  * Oids are potentially very sparse and tmem_objs may have an indeterminately
76  * short life, being added and deleted at a relatively high frequency.
77  * So an rb_tree is an ideal data structure to manage tmem_objs.  But because
78  * of the potentially huge number of tmem_objs, each pool manages a hashtable
79  * of rb_trees to reduce search, insert, delete, and rebalancing time.
80  * Each hashbucket also has a lock to manage concurrent access, and no
81  * searches, inserts, or deletions can be performed unless the lock is held.
82  * As a result, care must be taken to ensure tmem routines are not called
83  * recursively; a recursive call may appear to work most of the time, but
84  * some fraction of the time it will deadlock on a hashbucket lock that is
85  * already held.
86  *
87  * The following routines manage tmem_objs.  In all of these routines,
88  * the hashbucket lock is already held.
89  */
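/*
 * The per-bucket locking discipline used by every top-level entry point in
 * this file follows the same pattern (sketch; compare tmem_put()/tmem_get()
 * below).  Any pamops/hostops callback invoked with hb->lock held must not
 * call back into tmem, for the reason given above.
 *
 *        hb = &pool->hashbucket[tmem_oid_hash(oidp)];
 *        spin_lock(&hb->lock);
 *        obj = tmem_obj_find(hb, oidp);
 *        ... operate on obj and its pampds ...
 *        spin_unlock(&hb->lock);
 */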
90
91 /* Search for object==oid in pool, returns object if found. */
92 static struct tmem_obj *__tmem_obj_find(struct tmem_hashbucket *hb,
93                                         struct tmem_oid *oidp,
94                                         struct rb_node **parent,
95                                         struct rb_node ***link)
96 {
97         struct rb_node *_parent = NULL, **rbnode;
98         struct tmem_obj *obj = NULL;
99
100         rbnode = &hb->obj_rb_root.rb_node;
101         while (*rbnode) {
102                 BUG_ON(RB_EMPTY_NODE(*rbnode));
103                 _parent = *rbnode;
104                 obj = rb_entry(*rbnode, struct tmem_obj,
105                                rb_tree_node);
106                 switch (tmem_oid_compare(oidp, &obj->oid)) {
107                 case 0: /* equal */
108                         goto out;
109                 case -1:
110                         rbnode = &(*rbnode)->rb_left;
111                         break;
112                 case 1:
113                         rbnode = &(*rbnode)->rb_right;
114                         break;
115                 }
116         }
117
118         if (parent)
119                 *parent = _parent;
120         if (link)
121                 *link = rbnode;
122         obj = NULL;
123 out:
124         return obj;
125 }
126
127 static struct tmem_obj *tmem_obj_find(struct tmem_hashbucket *hb,
128                                         struct tmem_oid *oidp)
129 {
130         return __tmem_obj_find(hb, oidp, NULL, NULL);
131 }
132
133 static void tmem_pampd_destroy_all_in_obj(struct tmem_obj *, bool);
134
135 /* Free an object that has no more pampds in it. */
136 static void tmem_obj_free(struct tmem_obj *obj, struct tmem_hashbucket *hb)
137 {
138         struct tmem_pool *pool;
139
140         BUG_ON(obj == NULL);
141         ASSERT_SENTINEL(obj, OBJ);
142         BUG_ON(obj->pampd_count > 0);
143         pool = obj->pool;
144         BUG_ON(pool == NULL);
145         if (obj->objnode_tree_root != NULL) /* may be a "stump" with no leaves */
146                 tmem_pampd_destroy_all_in_obj(obj, false);
147         BUG_ON(obj->objnode_tree_root != NULL);
148         BUG_ON((long)obj->objnode_count != 0);
149         atomic_dec(&pool->obj_count);
150         BUG_ON(atomic_read(&pool->obj_count) < 0);
151         INVERT_SENTINEL(obj, OBJ);
152         obj->pool = NULL;
153         tmem_oid_set_invalid(&obj->oid);
154         rb_erase(&obj->rb_tree_node, &hb->obj_rb_root);
155 }
156
157 /*
158  * Initialize and insert a tmem_obj (called only if find failed).
159  */
160 static void tmem_obj_init(struct tmem_obj *obj, struct tmem_hashbucket *hb,
161                                         struct tmem_pool *pool,
162                                         struct tmem_oid *oidp)
163 {
164         struct rb_root *root = &hb->obj_rb_root;
165         struct rb_node **new = NULL, *parent = NULL;
166
167         BUG_ON(pool == NULL);
168         atomic_inc(&pool->obj_count);
169         obj->objnode_tree_height = 0;
170         obj->objnode_tree_root = NULL;
171         obj->pool = pool;
172         obj->oid = *oidp;
173         obj->objnode_count = 0;
174         obj->pampd_count = 0;
175 #ifdef CONFIG_RAMSTER
176         if (tmem_pamops.new_obj != NULL)
177                 (*tmem_pamops.new_obj)(obj);
178 #endif
179         SET_SENTINEL(obj, OBJ);
180
181         if (__tmem_obj_find(hb, oidp, &parent, &new))
182                 BUG();
183
184         rb_link_node(&obj->rb_tree_node, parent, new);
185         rb_insert_color(&obj->rb_tree_node, root);
186 }
187
188 /*
189  * Tmem is managed as a set of tmem_pools with certain attributes, such as
190  * "ephemeral" vs "persistent".  These attributes apply to all tmem_objs
191  * and all pampds that belong to a tmem_pool.  A tmem_pool is created
192  * or deleted relatively rarely (for example, when a filesystem is
193  * mounted or unmounted).
194  */
195
196 /* flush all data from a pool and, optionally, free it */
197 static void tmem_pool_flush(struct tmem_pool *pool, bool destroy)
198 {
199         struct rb_node *rbnode;
200         struct tmem_obj *obj;
201         struct tmem_hashbucket *hb = &pool->hashbucket[0];
202         int i;
203
204         BUG_ON(pool == NULL);
205         for (i = 0; i < TMEM_HASH_BUCKETS; i++, hb++) {
206                 spin_lock(&hb->lock);
207                 rbnode = rb_first(&hb->obj_rb_root);
208                 while (rbnode != NULL) {
209                         obj = rb_entry(rbnode, struct tmem_obj, rb_tree_node);
210                         rbnode = rb_next(rbnode);
211                         tmem_pampd_destroy_all_in_obj(obj, true);
212                         tmem_obj_free(obj, hb);
213                         (*tmem_hostops.obj_free)(obj, pool);
214                 }
215                 spin_unlock(&hb->lock);
216         }
217         if (destroy)
218                 list_del(&pool->pool_list);
219 }
220
221 /*
222  * A tmem_obj contains a radix-tree-like tree in which the intermediate
223  * nodes are called tmem_objnodes.  (The kernel lib/radix-tree.c implementation
224  * is very specialized and tuned for specific uses and is not particularly
225  * suited for use from this code, though some code from the core algorithms has
226  * been reused, thus the copyright notices below).  Each tmem_objnode contains
227  * a set of pointers which point to either a set of intermediate tmem_objnodes
228  * or a set of pampds.
229  *
230  * Portions Copyright (C) 2001 Momchil Velikov
231  * Portions Copyright (C) 2001 Christoph Hellwig
232  * Portions Copyright (C) 2005 SGI, Christoph Lameter <clameter@sgi.com>
233  */
234
235 struct tmem_objnode_tree_path {
236         struct tmem_objnode *objnode;
237         int offset;
238 };
239
240 /* objnode height_to_maxindex translation */
241 static unsigned long tmem_objnode_tree_h2max[OBJNODE_TREE_MAX_PATH + 1];
242
243 static void tmem_objnode_tree_init(void)
244 {
245         unsigned int ht, tmp;
246
247         for (ht = 0; ht < ARRAY_SIZE(tmem_objnode_tree_h2max); ht++) {
248                 tmp = ht * OBJNODE_TREE_MAP_SHIFT;
249                 if (tmp >= OBJNODE_TREE_INDEX_BITS)
250                         tmem_objnode_tree_h2max[ht] = ~0UL;
251                 else
252                         tmem_objnode_tree_h2max[ht] =
253                             (~0UL >> (OBJNODE_TREE_INDEX_BITS - tmp - 1)) >> 1;
254         }
255 }
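/*
 * Worked example (the actual values depend on constants in tmem.h): if
 * OBJNODE_TREE_MAP_SHIFT were 6 on a 64-bit build, the table above would read
 * h2max[0] = 0, h2max[1] = 63, h2max[2] = 4095, ... , i.e. a height-0 tree
 * can hold only index 0 (the root slot points straight at a pampd) and each
 * additional level of objnodes multiplies the reachable index space by 64.
 */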
256
257 static struct tmem_objnode *tmem_objnode_alloc(struct tmem_obj *obj)
258 {
259         struct tmem_objnode *objnode;
260
261         ASSERT_SENTINEL(obj, OBJ);
262         BUG_ON(obj->pool == NULL);
263         ASSERT_SENTINEL(obj->pool, POOL);
264         objnode = (*tmem_hostops.objnode_alloc)(obj->pool);
265         if (unlikely(objnode == NULL))
266                 goto out;
267         objnode->obj = obj;
268         SET_SENTINEL(objnode, OBJNODE);
269         memset(&objnode->slots, 0, sizeof(objnode->slots));
270         objnode->slots_in_use = 0;
271         obj->objnode_count++;
272 out:
273         return objnode;
274 }
275
276 static void tmem_objnode_free(struct tmem_objnode *objnode)
277 {
278         struct tmem_pool *pool;
279         int i;
280
281         BUG_ON(objnode == NULL);
282         for (i = 0; i < OBJNODE_TREE_MAP_SIZE; i++)
283                 BUG_ON(objnode->slots[i] != NULL);
284         ASSERT_SENTINEL(objnode, OBJNODE);
285         INVERT_SENTINEL(objnode, OBJNODE);
286         BUG_ON(objnode->obj == NULL);
287         ASSERT_SENTINEL(objnode->obj, OBJ);
288         pool = objnode->obj->pool;
289         BUG_ON(pool == NULL);
290         ASSERT_SENTINEL(pool, POOL);
291         objnode->obj->objnode_count--;
292         objnode->obj = NULL;
293         (*tmem_hostops.objnode_free)(objnode, pool);
294 }
295
296 /*
297  * Look up index in object and return the associated pampd (or NULL if not found).
298  */
299 static void **__tmem_pampd_lookup_in_obj(struct tmem_obj *obj, uint32_t index)
300 {
301         unsigned int height, shift;
302         struct tmem_objnode **slot = NULL;
303
304         BUG_ON(obj == NULL);
305         ASSERT_SENTINEL(obj, OBJ);
306         BUG_ON(obj->pool == NULL);
307         ASSERT_SENTINEL(obj->pool, POOL);
308
309         height = obj->objnode_tree_height;
310         if (index > tmem_objnode_tree_h2max[obj->objnode_tree_height])
311                 goto out;
312         if (height == 0 && obj->objnode_tree_root) {
313                 slot = &obj->objnode_tree_root;
314                 goto out;
315         }
316         shift = (height-1) * OBJNODE_TREE_MAP_SHIFT;
317         slot = &obj->objnode_tree_root;
318         while (height > 0) {
319                 if (*slot == NULL)
320                         goto out;
321                 slot = (struct tmem_objnode **)
322                         ((*slot)->slots +
323                          ((index >> shift) & OBJNODE_TREE_MAP_MASK));
324                 shift -= OBJNODE_TREE_MAP_SHIFT;
325                 height--;
326         }
327 out:
328         return slot != NULL ? (void **)slot : NULL;
329 }
330
331 static void *tmem_pampd_lookup_in_obj(struct tmem_obj *obj, uint32_t index)
332 {
333         struct tmem_objnode **slot;
334
335         slot = (struct tmem_objnode **)__tmem_pampd_lookup_in_obj(obj, index);
336         return slot != NULL ? *slot : NULL;
337 }
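/*
 * Lookup walk, continuing the worked example above (OBJNODE_TREE_MAP_SHIFT
 * assumed to be 6): for index 70 in a height-2 tree, the loop starts with
 * shift = 6, descends through root slot (70 >> 6) & 63 = 1, then with
 * shift = 0 lands on slot 70 & 63 = 6 of that child, which is where the
 * pampd pointer lives.
 */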
338
339 #ifdef CONFIG_RAMSTER
340 static void *tmem_pampd_replace_in_obj(struct tmem_obj *obj, uint32_t index,
341                                         void *new_pampd, bool no_free)
342 {
343         struct tmem_objnode **slot;
344         void *ret = NULL;
345
346         slot = (struct tmem_objnode **)__tmem_pampd_lookup_in_obj(obj, index);
347         if ((slot != NULL) && (*slot != NULL)) {
348                 void *old_pampd = *(void **)slot;
349                 *(void **)slot = new_pampd;
350                 if (!no_free)
351                         (*tmem_pamops.free)(old_pampd, obj->pool,
352                                                 NULL, 0, false);
353                 ret = new_pampd;
354         }
355         return ret;
356 }
357 #endif
358
359 static int tmem_pampd_add_to_obj(struct tmem_obj *obj, uint32_t index,
360                                         void *pampd)
361 {
362         int ret = 0;
363         struct tmem_objnode *objnode = NULL, *newnode, *slot;
364         unsigned int height, shift;
365         int offset = 0;
366
367         /* if necessary, extend the tree to be higher  */
368         if (index > tmem_objnode_tree_h2max[obj->objnode_tree_height]) {
369                 height = obj->objnode_tree_height + 1;
370                 if (index > tmem_objnode_tree_h2max[height])
371                         while (index > tmem_objnode_tree_h2max[height])
372                                 height++;
373                 if (obj->objnode_tree_root == NULL) {
374                         obj->objnode_tree_height = height;
375                         goto insert;
376                 }
377                 do {
378                         newnode = tmem_objnode_alloc(obj);
379                         if (!newnode) {
380                                 ret = -ENOMEM;
381                                 goto out;
382                         }
383                         newnode->slots[0] = obj->objnode_tree_root;
384                         newnode->slots_in_use = 1;
385                         obj->objnode_tree_root = newnode;
386                         obj->objnode_tree_height++;
387                 } while (height > obj->objnode_tree_height);
388         }
389 insert:
390         slot = obj->objnode_tree_root;
391         height = obj->objnode_tree_height;
392         shift = (height-1) * OBJNODE_TREE_MAP_SHIFT;
393         while (height > 0) {
394                 if (slot == NULL) {
395                         /* add a child objnode.  */
396                         slot = tmem_objnode_alloc(obj);
397                         if (!slot) {
398                                 ret = -ENOMEM;
399                                 goto out;
400                         }
401                         if (objnode) {
402
403                                 objnode->slots[offset] = slot;
404                                 objnode->slots_in_use++;
405                         } else
406                                 obj->objnode_tree_root = slot;
407                 }
408                 /* go down a level */
409                 offset = (index >> shift) & OBJNODE_TREE_MAP_MASK;
410                 objnode = slot;
411                 slot = objnode->slots[offset];
412                 shift -= OBJNODE_TREE_MAP_SHIFT;
413                 height--;
414         }
415         BUG_ON(slot != NULL);
416         if (objnode) {
417                 objnode->slots_in_use++;
418                 objnode->slots[offset] = pampd;
419         } else
420                 obj->objnode_tree_root = pampd;
421         obj->pampd_count++;
422 out:
423         return ret;
424 }
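/*
 * Growth example (same assumed constants as above): the first put into an
 * empty object at index 0 keeps the height at 0 and stores the pampd directly
 * in objnode_tree_root; a later put at index 100 exceeds h2max[0] and
 * h2max[1], so the existing root is pushed down behind two newly allocated
 * objnodes, the height becomes 2, and only then does the normal descent
 * install the new pampd at slots (100 >> 6) & 63 = 1 and 100 & 63 = 36.
 */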
425
426 static void *tmem_pampd_delete_from_obj(struct tmem_obj *obj, uint32_t index)
427 {
428         struct tmem_objnode_tree_path path[OBJNODE_TREE_MAX_PATH + 1];
429         struct tmem_objnode_tree_path *pathp = path;
430         struct tmem_objnode *slot = NULL;
431         unsigned int height, shift;
432         int offset;
433
434         BUG_ON(obj == NULL);
435         ASSERT_SENTINEL(obj, OBJ);
436         BUG_ON(obj->pool == NULL);
437         ASSERT_SENTINEL(obj->pool, POOL);
438         height = obj->objnode_tree_height;
439         if (index > tmem_objnode_tree_h2max[height])
440                 goto out;
441         slot = obj->objnode_tree_root;
442         if (height == 0 && obj->objnode_tree_root) {
443                 obj->objnode_tree_root = NULL;
444                 goto out;
445         }
446         shift = (height - 1) * OBJNODE_TREE_MAP_SHIFT;
447         pathp->objnode = NULL;
448         do {
449                 if (slot == NULL)
450                         goto out;
451                 pathp++;
452                 offset = (index >> shift) & OBJNODE_TREE_MAP_MASK;
453                 pathp->offset = offset;
454                 pathp->objnode = slot;
455                 slot = slot->slots[offset];
456                 shift -= OBJNODE_TREE_MAP_SHIFT;
457                 height--;
458         } while (height > 0);
459         if (slot == NULL)
460                 goto out;
461         while (pathp->objnode) {
462                 pathp->objnode->slots[pathp->offset] = NULL;
463                 pathp->objnode->slots_in_use--;
464                 if (pathp->objnode->slots_in_use) {
465                         if (pathp->objnode == obj->objnode_tree_root) {
466                                 while (obj->objnode_tree_height > 0 &&
467                                   obj->objnode_tree_root->slots_in_use == 1 &&
468                                   obj->objnode_tree_root->slots[0]) {
469                                         struct tmem_objnode *to_free =
470                                                 obj->objnode_tree_root;
471
472                                         obj->objnode_tree_root =
473                                                         to_free->slots[0];
474                                         obj->objnode_tree_height--;
475                                         to_free->slots[0] = NULL;
476                                         to_free->slots_in_use = 0;
477                                         tmem_objnode_free(to_free);
478                                 }
479                         }
480                         goto out;
481                 }
482                 tmem_objnode_free(pathp->objnode); /* 0 slots used, free it */
483                 pathp--;
484         }
485         obj->objnode_tree_height = 0;
486         obj->objnode_tree_root = NULL;
487
488 out:
489         if (slot != NULL)
490                 obj->pampd_count--;
491         BUG_ON(obj->pampd_count < 0);
492         return slot;
493 }
494
495 /* Recursively walk the objnode_tree destroying pampds and objnodes. */
496 static void tmem_objnode_node_destroy(struct tmem_obj *obj,
497                                         struct tmem_objnode *objnode,
498                                         unsigned int ht)
499 {
500         int i;
501
502         if (ht == 0)
503                 return;
504         for (i = 0; i < OBJNODE_TREE_MAP_SIZE; i++) {
505                 if (objnode->slots[i]) {
506                         if (ht == 1) {
507                                 obj->pampd_count--;
508                                 (*tmem_pamops.free)(objnode->slots[i],
509                                                 obj->pool, NULL, 0, true);
510                                 objnode->slots[i] = NULL;
511                                 continue;
512                         }
513                         tmem_objnode_node_destroy(obj, objnode->slots[i], ht-1);
514                         tmem_objnode_free(objnode->slots[i]);
515                         objnode->slots[i] = NULL;
516                 }
517         }
518 }
519
520 static void tmem_pampd_destroy_all_in_obj(struct tmem_obj *obj,
521                                                 bool pool_destroy)
522 {
523         if (obj->objnode_tree_root == NULL)
524                 return;
525         if (obj->objnode_tree_height == 0) {
526                 obj->pampd_count--;
527                 (*tmem_pamops.free)(obj->objnode_tree_root,
528                                         obj->pool, NULL, 0, true);
529         } else {
530                 tmem_objnode_node_destroy(obj, obj->objnode_tree_root,
531                                         obj->objnode_tree_height);
532                 tmem_objnode_free(obj->objnode_tree_root);
533                 obj->objnode_tree_height = 0;
534         }
535         obj->objnode_tree_root = NULL;
536 #ifdef CONFIG_RAMSTER
537         if (tmem_pamops.free_obj != NULL)
538                 (*tmem_pamops.free_obj)(obj->pool, obj, pool_destroy);
539 #endif
540 }
541
542 /*
543  * Tmem is operated on by a set of well-defined actions:
544  * "put", "get", "flush", "flush_object", "new pool" and "destroy pool".
545  * (The tmem ABI allows for subpages and exchanges but these operations
546  * are not included in this implementation.)
547  *
548  * These "tmem core" operations are implemented in the following functions.
549  */
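/*
 * Caller-side sketch (the frontend hooks are hypothetical and live in the
 * tmem host, not in this file): a host such as zcache or ramster typically
 * maps its page-cache/swap hooks onto these operations roughly as follows.
 *
 *        put a page:           tmem_put(pool, &oid, index, false, pampd);
 *        get a page:           tmem_get(pool, &oid, index, data, &size, false, 0);
 *        invalidate a page:    tmem_flush_page(pool, &oid, index);
 *        invalidate an object: tmem_flush_object(pool, &oid);
 *        invalidate a pool:    tmem_destroy_pool(pool);
 */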
550
551 /*
552  * "Put" a page, e.g. associate the passed pampd with the passed handle.
553  * Tmem_put is complicated by a corner case: What if a page with matching
554  * handle already exists in tmem?  To guarantee coherency, one of two
555  * actions is necessary: Either the data for the page must be overwritten,
556  * or the page must be "flushed" so that the data is not accessible to a
557  * subsequent "get".  Since these "duplicate puts" are relatively rare,
558  * this implementation always flushes for simplicity.
559  */
560 int tmem_put(struct tmem_pool *pool, struct tmem_oid *oidp, uint32_t index,
561                 bool raw, void *pampd_to_use)
562 {
563         struct tmem_obj *obj = NULL, *objfound = NULL, *objnew = NULL;
564         void *pampd = NULL, *pampd_del = NULL;
565         int ret = -ENOMEM;
566         struct tmem_hashbucket *hb;
567
568         hb = &pool->hashbucket[tmem_oid_hash(oidp)];
569         spin_lock(&hb->lock);
570         obj = objfound = tmem_obj_find(hb, oidp);
571         if (obj != NULL) {
572                 pampd = tmem_pampd_lookup_in_obj(objfound, index);
573                 if (pampd != NULL) {
574                         /* if found, is a dup put, flush the old one */
575                         pampd_del = tmem_pampd_delete_from_obj(obj, index);
576                         BUG_ON(pampd_del != pampd);
577                         (*tmem_pamops.free)(pampd, pool, oidp, index, true);
578                         if (obj->pampd_count == 0) {
579                                 objnew = obj;
580                                 objfound = NULL;
581                         }
582                         pampd = NULL;
583                 }
584         } else {
585                 obj = objnew = (*tmem_hostops.obj_alloc)(pool);
586                 if (unlikely(obj == NULL)) {
587                         ret = -ENOMEM;
588                         goto out;
589                 }
590                 tmem_obj_init(obj, hb, pool, oidp);
591         }
592         BUG_ON(obj == NULL);
593         BUG_ON(((objnew != obj) && (objfound != obj)) || (objnew == objfound));
594         pampd = pampd_to_use;
595         BUG_ON(pampd_to_use == NULL);
596         ret = tmem_pampd_add_to_obj(obj, index, pampd);
597         if (unlikely(ret == -ENOMEM))
598                 /* may have partially built objnode tree ("stump") */
599                 goto delete_and_free;
600         (*tmem_pamops.create_finish)(pampd, is_ephemeral(pool));
601         goto out;
602
603 delete_and_free:
604         (void)tmem_pampd_delete_from_obj(obj, index);
605         if (pampd)
606                 (*tmem_pamops.free)(pampd, pool, NULL, 0, true);
607         if (objnew) {
608                 tmem_obj_free(objnew, hb);
609                 (*tmem_hostops.obj_free)(objnew, pool);
610         }
611 out:
612         spin_unlock(&hb->lock);
613         return ret;
614 }
615
616 #ifdef CONFIG_RAMSTER
617 /*
618  * For ramster only:  The following routines provide a two-step sequence
619  * to allow the caller to replace a pampd in the tmem data structures with
620  * another pampd. Here, we lookup the passed handle and, if found, return the
621  * associated pampd and object, leaving the hashbucket locked and returning
622  * a reference to it.  The caller is expected to immediately call the
623  * matching tmem_localify_finish routine which will handles the replacement
624  * and unlocks the hashbucket.
625  */
626 void *tmem_localify_get_pampd(struct tmem_pool *pool, struct tmem_oid *oidp,
627                                 uint32_t index, struct tmem_obj **ret_obj,
628                                 void **saved_hb)
629 {
630         struct tmem_hashbucket *hb;
631         struct tmem_obj *obj = NULL;
632         void *pampd = NULL;
633
634         hb = &pool->hashbucket[tmem_oid_hash(oidp)];
635         spin_lock(&hb->lock);
636         obj = tmem_obj_find(hb, oidp);
637         if (likely(obj != NULL))
638                 pampd = tmem_pampd_lookup_in_obj(obj, index);
639         *ret_obj = obj;
640         *saved_hb = (void *)hb;
641         /* note, hashbucket remains locked */
642         return pampd;
643 }
644
645 void tmem_localify_finish(struct tmem_obj *obj, uint32_t index,
646                           void *pampd, void *saved_hb, bool delete)
647 {
648         struct tmem_hashbucket *hb = (struct tmem_hashbucket *)saved_hb;
649
650         BUG_ON(!spin_is_locked(&hb->lock));
651         if (pampd != NULL) {
652                 BUG_ON(obj == NULL);
653                 (void)tmem_pampd_replace_in_obj(obj, index, pampd, 1);
654                 (*tmem_pamops.create_finish)(pampd, is_ephemeral(obj->pool));
655         } else if (delete) {
656                 BUG_ON(obj == NULL);
657                 (void)tmem_pampd_delete_from_obj(obj, index);
658         }
659         spin_unlock(&hb->lock);
660 }
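/*
 * Pairing sketch (the ramster caller is not shown in this file): the two
 * routines above are meant to be used back to back, with the hashbucket lock
 * held by tmem_localify_get_pampd() until tmem_localify_finish() drops it.
 *
 *        pampd = tmem_localify_get_pampd(pool, &oid, index, &obj, &saved_hb);
 *        ... fill a local pampd from the remote data (pampd may be NULL on miss) ...
 *        tmem_localify_finish(obj, index, local_pampd, saved_hb, false);
 */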
661
662 /*
663  * For ramster only.  Helper function to support asynchronous tmem_get.
664  */
665 static int tmem_repatriate(void **ppampd, struct tmem_hashbucket *hb,
666                                 struct tmem_pool *pool, struct tmem_oid *oidp,
667                                 uint32_t index, bool free, char *data)
668 {
669         void *old_pampd = *ppampd, *new_pampd = NULL;
670         bool intransit = false;
671         int ret = 0;
672
673         if (!is_ephemeral(pool))
674                 new_pampd = (*tmem_pamops.repatriate_preload)(
675                                 old_pampd, pool, oidp, index, &intransit);
676         if (intransit)
677                 ret = -EAGAIN;
678         else if (new_pampd != NULL)
679                 *ppampd = new_pampd;
680         /* must release the hb->lock else repatriate can't sleep */
681         spin_unlock(&hb->lock);
682         if (!intransit)
683                 ret = (*tmem_pamops.repatriate)(old_pampd, new_pampd, pool,
684                                                 oidp, index, free, data);
685         if (ret == -EAGAIN) {
686                 /* rare I think, but should cond_resched()??? */
687                 usleep_range(10, 1000);
688         } else if (ret == -ENOTCONN || ret == -EHOSTDOWN) {
689                 ret = -1;
690         } else if (ret != 0 && ret != -ENOENT) {
691                 ret = -1;
692         }
693         /* note hb->lock has now been unlocked */
694         return ret;
695 }
696
697 /*
698  * For ramster only.  If a page in tmem matches the handle, replace the
699  * page so that any subsequent "get" gets the new page.  Returns 0 if
700  * there was a page to replace, else returns -1.
701  */
702 int tmem_replace(struct tmem_pool *pool, struct tmem_oid *oidp,
703                         uint32_t index, void *new_pampd)
704 {
705         struct tmem_obj *obj;
706         int ret = -1;
707         struct tmem_hashbucket *hb;
708
709         hb = &pool->hashbucket[tmem_oid_hash(oidp)];
710         spin_lock(&hb->lock);
711         obj = tmem_obj_find(hb, oidp);
712         if (obj == NULL)
713                 goto out;
714         new_pampd = tmem_pampd_replace_in_obj(obj, index, new_pampd, 0);
715         /* if we bug here, pamops wasn't properly set up for ramster */
716         BUG_ON(tmem_pamops.replace_in_obj == NULL);
717         ret = (*tmem_pamops.replace_in_obj)(new_pampd, obj);
718 out:
719         spin_unlock(&hb->lock);
720         return ret;
721 }
722 #endif
723
724 /*
725  * "Get" a page, e.g. if a pampd can be found matching the passed handle,
726  * use a pamops callback to recreated the page from the pampd with the
727  * matching handle.  By tmem definition, when a "get" is successful on
728  * an ephemeral page, the page is "flushed", and when a "get" is successful
729  * on a persistent page, the page is retained in tmem.  Note that to preserve
730  * coherency, "get" can never be skipped if tmem contains the data.
731  * That is, if a get is done with a certain handle and fails, any
732  * subsequent "get" must also fail (unless of course there is a
733  * "put" done with the same handle).
734  */
735 int tmem_get(struct tmem_pool *pool, struct tmem_oid *oidp, uint32_t index,
736                 char *data, size_t *sizep, bool raw, int get_and_free)
737 {
738         struct tmem_obj *obj;
739         void *pampd = NULL;
740         bool ephemeral = is_ephemeral(pool);
741         int ret = -1;
742         struct tmem_hashbucket *hb;
743         bool free = (get_and_free == 1) || ((get_and_free == 0) && ephemeral);
744         bool lock_held = false;
745         void **ppampd;
746
747         do {
748                 hb = &pool->hashbucket[tmem_oid_hash(oidp)];
749                 spin_lock(&hb->lock);
750                 lock_held = true;
751                 obj = tmem_obj_find(hb, oidp);
752                 if (obj == NULL)
753                         goto out;
754                 ppampd = __tmem_pampd_lookup_in_obj(obj, index);
755                 if (ppampd == NULL)
756                         goto out;
757 #ifdef CONFIG_RAMSTER
758                 if ((tmem_pamops.is_remote != NULL) &&
759                      tmem_pamops.is_remote(*ppampd)) {
760                         ret = tmem_repatriate(ppampd, hb, pool, oidp,
761                                                 index, free, data);
762                         /* tmem_repatriate releases hb->lock */
763                         lock_held = false;
764                         *sizep = PAGE_SIZE;
765                         if (ret != -EAGAIN)
766                                 goto out;
767                 }
768 #endif
769         } while (ret == -EAGAIN);
770         if (free)
771                 pampd = tmem_pampd_delete_from_obj(obj, index);
772         else
773                 pampd = tmem_pampd_lookup_in_obj(obj, index);
774         if (pampd == NULL)
775                 goto out;
776         if (free) {
777                 if (obj->pampd_count == 0) {
778                         tmem_obj_free(obj, hb);
779                         (*tmem_hostops.obj_free)(obj, pool);
780                         obj = NULL;
781                 }
782         }
783         if (free)
784                 ret = (*tmem_pamops.get_data_and_free)(
785                                 data, sizep, raw, pampd, pool, oidp, index);
786         else
787                 ret = (*tmem_pamops.get_data)(
788                                 data, sizep, raw, pampd, pool, oidp, index);
789         if (ret < 0)
790                 goto out;
791         ret = 0;
792 out:
793         if (lock_held)
794                 spin_unlock(&hb->lock);
795         return ret;
796 }
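/*
 * get_and_free convention, as implemented above: 1 always removes the pampd
 * after the data is copied out, 0 removes it only for ephemeral pools (so a
 * successful ephemeral get behaves like a flush, per the comment above), and
 * any other value leaves the pampd in place.
 */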
797
798 /*
799  * If a page in tmem matches the handle, "flush" this page from tmem such
800  * that any subsequent "get" does not succeed (unless, of course, there
801  * was another "put" with the same handle).
802  */
803 int tmem_flush_page(struct tmem_pool *pool,
804                                 struct tmem_oid *oidp, uint32_t index)
805 {
806         struct tmem_obj *obj;
807         void *pampd;
808         int ret = -1;
809         struct tmem_hashbucket *hb;
810
811         hb = &pool->hashbucket[tmem_oid_hash(oidp)];
812         spin_lock(&hb->lock);
813         obj = tmem_obj_find(hb, oidp);
814         if (obj == NULL)
815                 goto out;
816         pampd = tmem_pampd_delete_from_obj(obj, index);
817         if (pampd == NULL)
818                 goto out;
819         (*tmem_pamops.free)(pampd, pool, oidp, index, true);
820         if (obj->pampd_count == 0) {
821                 tmem_obj_free(obj, hb);
822                 (*tmem_hostops.obj_free)(obj, pool);
823         }
824         ret = 0;
825
826 out:
827         spin_unlock(&hb->lock);
828         return ret;
829 }
830
831 /*
832  * "Flush" all pages in tmem matching this oid.
833  */
834 int tmem_flush_object(struct tmem_pool *pool, struct tmem_oid *oidp)
835 {
836         struct tmem_obj *obj;
837         struct tmem_hashbucket *hb;
838         int ret = -1;
839
840         hb = &pool->hashbucket[tmem_oid_hash(oidp)];
841         spin_lock(&hb->lock);
842         obj = tmem_obj_find(hb, oidp);
843         if (obj == NULL)
844                 goto out;
845         tmem_pampd_destroy_all_in_obj(obj, false);
846         tmem_obj_free(obj, hb);
847         (*tmem_hostops.obj_free)(obj, pool);
848         ret = 0;
849
850 out:
851         spin_unlock(&hb->lock);
852         return ret;
853 }
854
855 /*
856  * "Flush" all pages (and tmem_objs) from this tmem_pool and disable
857  * all subsequent access to this tmem_pool.
858  */
859 int tmem_destroy_pool(struct tmem_pool *pool)
860 {
861         int ret = -1;
862
863         if (pool == NULL)
864                 goto out;
865         tmem_pool_flush(pool, 1);
866         ret = 0;
867 out:
868         return ret;
869 }
870
871 static LIST_HEAD(tmem_global_pool_list);
872
873 /*
874  * Initialize a new tmem_pool with the provided flags; the pool id itself
875  * is assigned and tracked by the tmem host implementation.
876  */
877 void tmem_new_pool(struct tmem_pool *pool, uint32_t flags)
878 {
879         int persistent = flags & TMEM_POOL_PERSIST;
880         int shared = flags & TMEM_POOL_SHARED;
881         struct tmem_hashbucket *hb = &pool->hashbucket[0];
882         int i;
883
884         for (i = 0; i < TMEM_HASH_BUCKETS; i++, hb++) {
885                 hb->obj_rb_root = RB_ROOT;
886                 spin_lock_init(&hb->lock);
887         }
888         INIT_LIST_HEAD(&pool->pool_list);
889         atomic_set(&pool->obj_count, 0);
890         SET_SENTINEL(pool, POOL);
891         list_add_tail(&pool->pool_list, &tmem_global_pool_list);
892         pool->persistent = persistent;
893         pool->shared = shared;
894 }
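/*
 * Pool-creation sketch (my_alloc_pool() is a hypothetical host helper): the
 * host allocates the struct tmem_pool itself, picks the attribute flags, and
 * lets this function initialize the hash buckets; pool ids are assigned and
 * tracked by the host, not here.
 *
 *        struct tmem_pool *pool = my_alloc_pool();
 *        tmem_new_pool(pool, TMEM_POOL_PERSIST);
 */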