staging: zcache: replace xvmalloc with zsmalloc
[firefly-linux-kernel-4.4.55.git] / drivers / staging / ramster / zcache-main.c
1 /*
2  * zcache-main.c
3  *
4  * Copyright (c) 2010-2012, Dan Magenheimer, Oracle Corp.
5  * Copyright (c) 2010,2011, Nitin Gupta
6  *
7  * Zcache provides an in-kernel "host implementation" for transcendent memory
8  * and, thus indirectly, for cleancache and frontswap.  Zcache includes two
9  * page-accessible memory [1] interfaces, both utilizing lzo1x compression:
10  * 1) "compression buddies" ("zbud") is used for ephemeral pages
11  * 2) xvmalloc is used for persistent pages.
12  * Xvmalloc (based on the TLSF allocator) has very low fragmentation
13  * so maximizes space efficiency, while zbud allows pairs (and potentially,
14  * in the future, more than a pair of) compressed pages to be closely linked
15  * so that reclaiming can be done via the kernel's physical-page-oriented
16  * "shrinker" interface.
17  *
18  * [1] For a definition of page-accessible memory (aka PAM), see:
19  *   http://marc.info/?l=linux-mm&m=127811271605009
20  *  RAMSTER TODO:
21  *   - handle remotifying of buddied pages (see zbud_remotify_zbpg)
22  *   - kernel boot params: nocleancache/nofrontswap don't always work?!?
23  */
24
25 #include <linux/module.h>
26 #include <linux/cpu.h>
27 #include <linux/highmem.h>
28 #include <linux/list.h>
29 #include <linux/lzo.h>
30 #include <linux/slab.h>
31 #include <linux/spinlock.h>
32 #include <linux/types.h>
33 #include <linux/atomic.h>
34 #include <linux/math64.h>
35 #include "tmem.h"
36 #include "zcache.h"
37 #include "ramster.h"
38 #include "cluster/tcp.h"
39
40 #include "../zram/xvmalloc.h" /* if built in drivers/staging */
41
42 #define RAMSTER_TESTING
43
44 #if (!defined(CONFIG_CLEANCACHE) && !defined(CONFIG_FRONTSWAP))
45 #error "ramster is useless without CONFIG_CLEANCACHE or CONFIG_FRONTSWAP"
46 #endif
47 #ifdef CONFIG_CLEANCACHE
48 #include <linux/cleancache.h>
49 #endif
50 #ifdef CONFIG_FRONTSWAP
51 #include <linux/frontswap.h>
52 #endif
53
54 enum ramster_remotify_op {
55         RAMSTER_REMOTIFY_EPH_PUT,
56         RAMSTER_REMOTIFY_PERS_PUT,
57         RAMSTER_REMOTIFY_FLUSH_PAGE,
58         RAMSTER_REMOTIFY_FLUSH_OBJ,
59         RAMSTER_INTRANSIT_PERS
60 };
61
62 struct ramster_remotify_hdr {
63         enum ramster_remotify_op op;
64         struct list_head list;
65 };
66
67 #define ZBH_SENTINEL  0x43214321
68 #define ZBPG_SENTINEL  0xdeadbeef
69
70 #define ZBUD_MAX_BUDS 2
71
72 struct zbud_hdr {
73         struct ramster_remotify_hdr rem_op;
74         uint16_t client_id;
75         uint16_t pool_id;
76         struct tmem_oid oid;
77         uint32_t index;
78         uint16_t size; /* compressed size in bytes, zero means unused */
79         DECL_SENTINEL
80 };
81
82 #define ZVH_SENTINEL  0x43214321
83 static const int zv_max_page_size = (PAGE_SIZE / 8) * 7;
84
85 struct zv_hdr {
86         struct ramster_remotify_hdr rem_op;
87         uint16_t client_id;
88         uint16_t pool_id;
89         struct tmem_oid oid;
90         uint32_t index;
91         DECL_SENTINEL
92 };
93
94 struct flushlist_node {
95         struct ramster_remotify_hdr rem_op;
96         struct tmem_xhandle xh;
97 };
98
99 union remotify_list_node {
100         struct ramster_remotify_hdr rem_op;
101         struct zv_hdr zv;
102         struct zbud_hdr zbud;
103         struct flushlist_node flist;
104 };
105
106 static LIST_HEAD(zcache_rem_op_list);
107 static DEFINE_SPINLOCK(zcache_rem_op_list_lock);
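
/*
 * Illustrative sketch (hypothetical helper, not part of this file): every
 * deferred operation embeds a struct ramster_remotify_hdr, so queueing
 * work for the remotify worker amounts to tagging the header with an op
 * and adding it to zcache_rem_op_list under zcache_rem_op_list_lock, as
 * zv_create() and the flush paths later in this file do.
 */
static inline void zcache_example_queue_rem_op(struct ramster_remotify_hdr *rem,
					enum ramster_remotify_op op)
{
	rem->op = op;
	spin_lock(&zcache_rem_op_list_lock);
	list_add_tail(&rem->list, &zcache_rem_op_list);
	spin_unlock(&zcache_rem_op_list_lock);
}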
108
109 #if 0
110 /* this is more aggressive but may cause other problems? */
111 #define ZCACHE_GFP_MASK (GFP_ATOMIC | __GFP_NORETRY | __GFP_NOWARN)
112 #else
113 #define ZCACHE_GFP_MASK \
114         (__GFP_FS | __GFP_NORETRY | __GFP_NOWARN | __GFP_NOMEMALLOC)
115 #endif
116
117 #define MAX_POOLS_PER_CLIENT 16
118
119 #define MAX_CLIENTS 16
120 #define LOCAL_CLIENT ((uint16_t)-1)
121
122 MODULE_LICENSE("GPL");
123
124 struct zcache_client {
125         struct tmem_pool *tmem_pools[MAX_POOLS_PER_CLIENT];
126         struct xv_pool *xvpool;
127         bool allocated;
128         atomic_t refcount;
129 };
130
131 static struct zcache_client zcache_host;
132 static struct zcache_client zcache_clients[MAX_CLIENTS];
133
134 static inline uint16_t get_client_id_from_client(struct zcache_client *cli)
135 {
136         BUG_ON(cli == NULL);
137         if (cli == &zcache_host)
138                 return LOCAL_CLIENT;
139         return cli - &zcache_clients[0];
140 }
141
142 static inline bool is_local_client(struct zcache_client *cli)
143 {
144         return cli == &zcache_host;
145 }
146
147 /**********
148  * Compression buddies ("zbud") provides for packing two (or, possibly
149  * in the future, more) compressed ephemeral pages into a single "raw"
150  * (physical) page and tracking them with data structures so that
151  * the raw pages can be easily reclaimed.
152  *
153  * A zbud page ("zbpg") is an aligned page containing a list_head,
154  * a lock, and two "zbud headers".  The remainder of the physical
155  * page is divided up into aligned 64-byte "chunks" which contain
156  * the compressed data for zero, one, or two zbuds.  Each zbpg
157  * resides on: (1) an "unused list" if it has no zbuds; (2) a
158  * "buddied" list if it is fully populated  with two zbuds; or
159  * (3) one of PAGE_SIZE/64 "unbuddied" lists indexed by how many chunks
160  * the one unbuddied zbud uses.  The data inside a zbpg cannot be
161  * read or written unless the zbpg's lock is held.
162  */
163
164 struct zbud_page {
165         struct list_head bud_list;
166         spinlock_t lock;
167         struct zbud_hdr buddy[ZBUD_MAX_BUDS];
168         DECL_SENTINEL
169         /* followed by NUM_CHUNK aligned CHUNK_SIZE-byte chunks */
170 };
171
172 #define CHUNK_SHIFT     6
173 #define CHUNK_SIZE      (1 << CHUNK_SHIFT)
174 #define CHUNK_MASK      (~(CHUNK_SIZE-1))
175 #define NCHUNKS         (((PAGE_SIZE - sizeof(struct zbud_page)) & \
176                                 CHUNK_MASK) >> CHUNK_SHIFT)
177 #define MAX_CHUNK       (NCHUNKS-1)
178
179 static struct {
180         struct list_head list;
181         unsigned count;
182 } zbud_unbuddied[NCHUNKS];
183 /* list N contains pages with N chunks USED and NCHUNKS-N unused */
184 /* element 0 is never used but optimizing that isn't worth it */
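
/*
 * Illustrative sketch (hypothetical helper): which unbuddied list a zbpg
 * holding a single compressed page of "size" bytes would sit on.  For
 * example, with 64-byte chunks a 1000-byte zbud needs
 * (1000 + 63) >> 6 == 16 chunks, so its zbpg lives on
 * zbud_unbuddied[16].list until a second compressed page small enough to
 * fit in the remaining chunks is packed beside it, at which point the
 * zbpg moves to the buddied list.
 */
static inline struct list_head *zbud_example_unbuddied_list(unsigned size)
{
	unsigned chunks = (size + CHUNK_SIZE - 1) >> CHUNK_SHIFT;

	return &zbud_unbuddied[chunks].list;
}
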
185 static unsigned long zbud_cumul_chunk_counts[NCHUNKS];
186
187 struct list_head zbud_buddied_list;
188 static unsigned long zcache_zbud_buddied_count;
189
190 /* protects the buddied list and all unbuddied lists */
191 static DEFINE_SPINLOCK(zbud_budlists_spinlock);
192
193 static atomic_t zcache_zbud_curr_raw_pages;
194 static atomic_t zcache_zbud_curr_zpages;
195 static unsigned long zcache_zbud_curr_zbytes;
196 static unsigned long zcache_zbud_cumul_zpages;
197 static unsigned long zcache_zbud_cumul_zbytes;
198 static unsigned long zcache_compress_poor;
199 static unsigned long zcache_policy_percent_exceeded;
200 static unsigned long zcache_mean_compress_poor;
201
202 /*
203  * RAMster counters
204  * - Remote pages are pages with a local pampd but the data is remote
205  * - Foreign pages are pages stored locally but belonging to another node
206  */
207 static atomic_t ramster_remote_pers_pages = ATOMIC_INIT(0);
208 static unsigned long ramster_pers_remotify_enable;
209 static unsigned long ramster_eph_remotify_enable;
210 static unsigned long ramster_eph_pages_remoted;
211 static unsigned long ramster_eph_pages_remote_failed;
212 static unsigned long ramster_pers_pages_remoted;
213 static unsigned long ramster_pers_pages_remote_failed;
214 static unsigned long ramster_pers_pages_remote_nomem;
215 static unsigned long ramster_remote_objects_flushed;
216 static unsigned long ramster_remote_object_flushes_failed;
217 static unsigned long ramster_remote_pages_flushed;
218 static unsigned long ramster_remote_page_flushes_failed;
219 static unsigned long ramster_remote_eph_pages_succ_get;
220 static unsigned long ramster_remote_pers_pages_succ_get;
221 static unsigned long ramster_remote_eph_pages_unsucc_get;
222 static unsigned long ramster_remote_pers_pages_unsucc_get;
223 static atomic_t ramster_curr_flnode_count = ATOMIC_INIT(0);
224 static unsigned long ramster_curr_flnode_count_max;
225 static atomic_t ramster_foreign_eph_pampd_count = ATOMIC_INIT(0);
226 static unsigned long ramster_foreign_eph_pampd_count_max;
227 static atomic_t ramster_foreign_pers_pampd_count = ATOMIC_INIT(0);
228 static unsigned long ramster_foreign_pers_pampd_count_max;
229
230 /* forward references */
231 static void *zcache_get_free_page(void);
232 static void zcache_free_page(void *p);
233
234 /*
235  * zbud helper functions
236  */
237
238 static inline unsigned zbud_max_buddy_size(void)
239 {
240         return MAX_CHUNK << CHUNK_SHIFT;
241 }
242
243 static inline unsigned zbud_size_to_chunks(unsigned size)
244 {
245         BUG_ON(size == 0 || size > zbud_max_buddy_size());
246         return (size + CHUNK_SIZE - 1) >> CHUNK_SHIFT;
247 }
248
249 static inline int zbud_budnum(struct zbud_hdr *zh)
250 {
251         unsigned offset = (unsigned long)zh & (PAGE_SIZE - 1);
252         struct zbud_page *zbpg = NULL;
253         unsigned budnum = -1U;
254         int i;
255
256         for (i = 0; i < ZBUD_MAX_BUDS; i++)
257                 if (offset == offsetof(typeof(*zbpg), buddy[i])) {
258                         budnum = i;
259                         break;
260                 }
261         BUG_ON(budnum == -1U);
262         return budnum;
263 }
264
265 static char *zbud_data(struct zbud_hdr *zh, unsigned size)
266 {
267         struct zbud_page *zbpg;
268         char *p;
269         unsigned budnum;
270
271         ASSERT_SENTINEL(zh, ZBH);
272         budnum = zbud_budnum(zh);
273         BUG_ON(size == 0 || size > zbud_max_buddy_size());
274         zbpg = container_of(zh, struct zbud_page, buddy[budnum]);
275         ASSERT_SPINLOCK(&zbpg->lock);
276         p = (char *)zbpg;
277         if (budnum == 0)
278                 p += ((sizeof(struct zbud_page) + CHUNK_SIZE - 1) &
279                                                         CHUNK_MASK);
280         else if (budnum == 1)
281                 p += PAGE_SIZE - ((size + CHUNK_SIZE - 1) & CHUNK_MASK);
282         return p;
283 }
284
285 static void zbud_copy_from_pampd(char *data, size_t *size, struct zbud_hdr *zh)
286 {
287         struct zbud_page *zbpg;
288         char *p;
289         unsigned budnum;
290
291         ASSERT_SENTINEL(zh, ZBH);
292         budnum = zbud_budnum(zh);
293         zbpg = container_of(zh, struct zbud_page, buddy[budnum]);
294         spin_lock(&zbpg->lock);
295         BUG_ON(zh->size > *size);
296         p = (char *)zbpg;
297         if (budnum == 0)
298                 p += ((sizeof(struct zbud_page) + CHUNK_SIZE - 1) &
299                                                         CHUNK_MASK);
300         else if (budnum == 1)
301                 p += PAGE_SIZE - ((zh->size + CHUNK_SIZE - 1) & CHUNK_MASK);
302         /* client should be filled in by caller */
303         memcpy(data, p, zh->size);
304         *size = zh->size;
305         spin_unlock(&zbpg->lock);
306 }
307
308 /*
309  * zbud raw page management
310  */
311
312 static struct zbud_page *zbud_alloc_raw_page(void)
313 {
314         struct zbud_page *zbpg = NULL;
315         struct zbud_hdr *zh0, *zh1;
316         zbpg = zcache_get_free_page();
317         if (likely(zbpg != NULL)) {
318                 INIT_LIST_HEAD(&zbpg->bud_list);
319                 zh0 = &zbpg->buddy[0]; zh1 = &zbpg->buddy[1];
320                 spin_lock_init(&zbpg->lock);
321                 atomic_inc(&zcache_zbud_curr_raw_pages);
322                 INIT_LIST_HEAD(&zbpg->bud_list);
323                 SET_SENTINEL(zbpg, ZBPG);
324                 zh0->size = 0; zh1->size = 0;
325                 tmem_oid_set_invalid(&zh0->oid);
326                 tmem_oid_set_invalid(&zh1->oid);
327         }
328         return zbpg;
329 }
330
331 static void zbud_free_raw_page(struct zbud_page *zbpg)
332 {
333         struct zbud_hdr *zh0 = &zbpg->buddy[0], *zh1 = &zbpg->buddy[1];
334
335         ASSERT_SENTINEL(zbpg, ZBPG);
336         BUG_ON(!list_empty(&zbpg->bud_list));
337         ASSERT_SPINLOCK(&zbpg->lock);
338         BUG_ON(zh0->size != 0 || tmem_oid_valid(&zh0->oid));
339         BUG_ON(zh1->size != 0 || tmem_oid_valid(&zh1->oid));
340         INVERT_SENTINEL(zbpg, ZBPG);
341         spin_unlock(&zbpg->lock);
342         atomic_dec(&zcache_zbud_curr_raw_pages);
343         zcache_free_page(zbpg);
344 }
345
346 /*
347  * core zbud handling routines
348  */
349
350 static unsigned zbud_free(struct zbud_hdr *zh)
351 {
352         unsigned size;
353
354         ASSERT_SENTINEL(zh, ZBH);
355         BUG_ON(!tmem_oid_valid(&zh->oid));
356         size = zh->size;
357         BUG_ON(zh->size == 0 || zh->size > zbud_max_buddy_size());
358         zh->size = 0;
359         tmem_oid_set_invalid(&zh->oid);
360         INVERT_SENTINEL(zh, ZBH);
361         zcache_zbud_curr_zbytes -= size;
362         atomic_dec(&zcache_zbud_curr_zpages);
363         return size;
364 }
365
366 static void zbud_free_and_delist(struct zbud_hdr *zh)
367 {
368         unsigned chunks;
369         struct zbud_hdr *zh_other;
370         unsigned budnum = zbud_budnum(zh), size;
371         struct zbud_page *zbpg =
372                 container_of(zh, struct zbud_page, buddy[budnum]);
373
374         /* FIXME, should be BUG_ON, pool destruction path doesn't disable
375          * interrupts tmem_destroy_pool()->tmem_pampd_destroy_all_in_obj()->
376          * tmem_objnode_node_destroy()-> zcache_pampd_free() */
377         WARN_ON(!irqs_disabled());
378         spin_lock(&zbpg->lock);
379         if (list_empty(&zbpg->bud_list)) {
380                 /* ignore zombie page... see zbud_evict_pages() */
381                 spin_unlock(&zbpg->lock);
382                 return;
383         }
384         size = zbud_free(zh);
385         ASSERT_SPINLOCK(&zbpg->lock);
386         zh_other = &zbpg->buddy[(budnum == 0) ? 1 : 0];
387         if (zh_other->size == 0) { /* was unbuddied: unlist and free */
388                 chunks = zbud_size_to_chunks(size);
389                 spin_lock(&zbud_budlists_spinlock);
390                 BUG_ON(list_empty(&zbud_unbuddied[chunks].list));
391                 list_del_init(&zbpg->bud_list);
392                 zbud_unbuddied[chunks].count--;
393                 spin_unlock(&zbud_budlists_spinlock);
394                 zbud_free_raw_page(zbpg);
395         } else { /* was buddied: move remaining buddy to unbuddied list */
396                 chunks = zbud_size_to_chunks(zh_other->size);
397                 spin_lock(&zbud_budlists_spinlock);
398                 list_del_init(&zbpg->bud_list);
399                 zcache_zbud_buddied_count--;
400                 list_add_tail(&zbpg->bud_list, &zbud_unbuddied[chunks].list);
401                 zbud_unbuddied[chunks].count++;
402                 spin_unlock(&zbud_budlists_spinlock);
403                 spin_unlock(&zbpg->lock);
404         }
405 }
406
407 static struct zbud_hdr *zbud_create(uint16_t client_id, uint16_t pool_id,
408                                         struct tmem_oid *oid,
409                                         uint32_t index, struct page *page,
410                                         void *cdata, unsigned size)
411 {
412         struct zbud_hdr *zh0, *zh1, *zh = NULL;
413         struct zbud_page *zbpg = NULL, *ztmp;
414         unsigned nchunks;
415         char *to;
416         int i, found_good_buddy = 0;
417
418         nchunks = zbud_size_to_chunks(size);
419         for (i = MAX_CHUNK - nchunks + 1; i > 0; i--) {
420                 spin_lock(&zbud_budlists_spinlock);
421                 if (!list_empty(&zbud_unbuddied[i].list)) {
422                         list_for_each_entry_safe(zbpg, ztmp,
423                                     &zbud_unbuddied[i].list, bud_list) {
424                                 if (spin_trylock(&zbpg->lock)) {
425                                         found_good_buddy = i;
426                                         goto found_unbuddied;
427                                 }
428                         }
429                 }
430                 spin_unlock(&zbud_budlists_spinlock);
431         }
432         /* didn't find a good buddy, try allocating a new page */
433         zbpg = zbud_alloc_raw_page();
434         if (unlikely(zbpg == NULL))
435                 goto out;
436         /* ok, have a new page; take the locks and add it to the unbuddied list */
437         spin_lock(&zbud_budlists_spinlock);
438         spin_lock(&zbpg->lock);
439         list_add_tail(&zbpg->bud_list, &zbud_unbuddied[nchunks].list);
440         zbud_unbuddied[nchunks].count++;
441         zh = &zbpg->buddy[0];
442         goto init_zh;
443
444 found_unbuddied:
445         ASSERT_SPINLOCK(&zbpg->lock);
446         zh0 = &zbpg->buddy[0]; zh1 = &zbpg->buddy[1];
447         BUG_ON(!((zh0->size == 0) ^ (zh1->size == 0)));
448         if (zh0->size != 0) { /* buddy0 in use, buddy1 is vacant */
449                 ASSERT_SENTINEL(zh0, ZBH);
450                 zh = zh1;
451         } else if (zh1->size != 0) { /* buddy1 in use, buddy0 is vacant */
452                 ASSERT_SENTINEL(zh1, ZBH);
453                 zh = zh0;
454         } else
455                 BUG();
456         list_del_init(&zbpg->bud_list);
457         zbud_unbuddied[found_good_buddy].count--;
458         list_add_tail(&zbpg->bud_list, &zbud_buddied_list);
459         zcache_zbud_buddied_count++;
460
461 init_zh:
462         SET_SENTINEL(zh, ZBH);
463         zh->size = size;
464         zh->index = index;
465         zh->oid = *oid;
466         zh->pool_id = pool_id;
467         zh->client_id = client_id;
468         to = zbud_data(zh, size);
469         memcpy(to, cdata, size);
470         spin_unlock(&zbpg->lock);
471         spin_unlock(&zbud_budlists_spinlock);
472         zbud_cumul_chunk_counts[nchunks]++;
473         atomic_inc(&zcache_zbud_curr_zpages);
474         zcache_zbud_cumul_zpages++;
475         zcache_zbud_curr_zbytes += size;
476         zcache_zbud_cumul_zbytes += size;
477 out:
478         return zh;
479 }
480
481 static int zbud_decompress(struct page *page, struct zbud_hdr *zh)
482 {
483         struct zbud_page *zbpg;
484         unsigned budnum = zbud_budnum(zh);
485         size_t out_len = PAGE_SIZE;
486         char *to_va, *from_va;
487         unsigned size;
488         int ret = 0;
489
490         zbpg = container_of(zh, struct zbud_page, buddy[budnum]);
491         spin_lock(&zbpg->lock);
492         if (list_empty(&zbpg->bud_list)) {
493                 /* ignore zombie page... see zbud_evict_pages() */
494                 ret = -EINVAL;
495                 goto out;
496         }
497         ASSERT_SENTINEL(zh, ZBH);
498         BUG_ON(zh->size == 0 || zh->size > zbud_max_buddy_size());
499         to_va = kmap_atomic(page, KM_USER0);
500         size = zh->size;
501         from_va = zbud_data(zh, size);
502         ret = lzo1x_decompress_safe(from_va, size, to_va, &out_len);
503         BUG_ON(ret != LZO_E_OK);
504         BUG_ON(out_len != PAGE_SIZE);
505         kunmap_atomic(to_va, KM_USER0);
506 out:
507         spin_unlock(&zbpg->lock);
508         return ret;
509 }
510
511 /*
512  * The following routines handle shrinking of ephemeral pages by evicting
513  * pages "least valuable" first.
514  */
515
516 static unsigned long zcache_evicted_raw_pages;
517 static unsigned long zcache_evicted_buddied_pages;
518 static unsigned long zcache_evicted_unbuddied_pages;
519
520 static struct tmem_pool *zcache_get_pool_by_id(uint16_t cli_id,
521                                                 uint16_t poolid);
522 static void zcache_put_pool(struct tmem_pool *pool);
523
524 /*
525  * Flush and free all zbuds in a zbpg, then free the pageframe
526  */
527 static void zbud_evict_zbpg(struct zbud_page *zbpg)
528 {
529         struct zbud_hdr *zh;
530         int i, j;
531         uint32_t pool_id[ZBUD_MAX_BUDS], client_id[ZBUD_MAX_BUDS];
532         uint32_t index[ZBUD_MAX_BUDS];
533         struct tmem_oid oid[ZBUD_MAX_BUDS];
534         struct tmem_pool *pool;
535         unsigned long flags;
536
537         ASSERT_SPINLOCK(&zbpg->lock);
538         for (i = 0, j = 0; i < ZBUD_MAX_BUDS; i++) {
539                 zh = &zbpg->buddy[i];
540                 if (zh->size) {
541                         client_id[j] = zh->client_id;
542                         pool_id[j] = zh->pool_id;
543                         oid[j] = zh->oid;
544                         index[j] = zh->index;
545                         j++;
546                 }
547         }
548         spin_unlock(&zbpg->lock);
549         for (i = 0; i < j; i++) {
550                 pool = zcache_get_pool_by_id(client_id[i], pool_id[i]);
551                 BUG_ON(pool == NULL);
552                 local_irq_save(flags);
553                 /* these flushes should dispose of any local storage */
554                 tmem_flush_page(pool, &oid[i], index[i]);
555                 local_irq_restore(flags);
556                 zcache_put_pool(pool);
557         }
558 }
559
560 /*
561  * Free nr pages.  This code is funky because we want to hold the locks
562  * protecting various lists for as short a time as possible, and in some
563  * circumstances the list may change asynchronously when the list lock is
564  * not held.  In some cases we also trylock not only to avoid waiting on a
565  * page in use by another cpu, but also to avoid potential deadlock due to
566  * lock inversion.
567  */
568 static void zbud_evict_pages(int nr)
569 {
570         struct zbud_page *zbpg;
571         int i, newly_unused_pages = 0;
572
573
574         /* now try freeing unbuddied pages, starting with least space avail */
575         for (i = 0; i < MAX_CHUNK; i++) {
576 retry_unbud_list_i:
577                 spin_lock_bh(&zbud_budlists_spinlock);
578                 if (list_empty(&zbud_unbuddied[i].list)) {
579                         spin_unlock_bh(&zbud_budlists_spinlock);
580                         continue;
581                 }
582                 list_for_each_entry(zbpg, &zbud_unbuddied[i].list, bud_list) {
583                         if (unlikely(!spin_trylock(&zbpg->lock)))
584                                 continue;
585                         zbud_unbuddied[i].count--;
586                         spin_unlock(&zbud_budlists_spinlock);
587                         zcache_evicted_unbuddied_pages++;
588                         /* want budlists unlocked when doing zbpg eviction */
589                         zbud_evict_zbpg(zbpg);
590                         newly_unused_pages++;
591                         local_bh_enable();
592                         if (--nr <= 0)
593                                 goto evict_unused;
594                         goto retry_unbud_list_i;
595                 }
596                 spin_unlock_bh(&zbud_budlists_spinlock);
597         }
598
599         /* as a last resort, free buddied pages */
600 retry_bud_list:
601         spin_lock_bh(&zbud_budlists_spinlock);
602         if (list_empty(&zbud_buddied_list)) {
603                 spin_unlock_bh(&zbud_budlists_spinlock);
604                 goto evict_unused;
605         }
606         list_for_each_entry(zbpg, &zbud_buddied_list, bud_list) {
607                 if (unlikely(!spin_trylock(&zbpg->lock)))
608                         continue;
609                 zcache_zbud_buddied_count--;
610                 spin_unlock(&zbud_budlists_spinlock);
611                 zcache_evicted_buddied_pages++;
612                 /* want budlists unlocked when doing zbpg eviction */
613                 zbud_evict_zbpg(zbpg);
614                 newly_unused_pages++;
615                 local_bh_enable();
616                 if (--nr <= 0)
617                         goto evict_unused;
618                 goto retry_bud_list;
619         }
620         spin_unlock_bh(&zbud_budlists_spinlock);
621
622 evict_unused:
623         return;
624 }
625
626 static DEFINE_PER_CPU(unsigned char *, zcache_remoteputmem);
627
628 static int zbud_remotify_zbud(struct tmem_xhandle *xh, char *data,
629                                 size_t size)
630 {
631         struct tmem_pool *pool;
632         int i, remotenode, ret = -1;
633         unsigned char cksum, *p;
634         unsigned long flags;
635
636         for (p = data, cksum = 0, i = 0; i < size; i++)
637                 cksum += *p++;
638         ret = ramster_remote_put(xh, data, size, true, &remotenode);
639         if (ret == 0) {
640                 /* data was successfully remoted so change the local version
641                  * to point to the remote node where it landed */
642                 pool = zcache_get_pool_by_id(LOCAL_CLIENT, xh->pool_id);
643                 BUG_ON(pool == NULL);
644                 local_irq_save(flags);
645                 /* tmem_replace will also free up any local space */
646                 (void)tmem_replace(pool, &xh->oid, xh->index,
647                         pampd_make_remote(remotenode, size, cksum));
648                 local_irq_restore(flags);
649                 zcache_put_pool(pool);
650                 ramster_eph_pages_remoted++;
651                 ret = 0;
652         } else
653                 ramster_eph_pages_remote_failed++;
654         return ret;
655 }
656
657 static int zbud_remotify_zbpg(struct zbud_page *zbpg)
658 {
659         struct zbud_hdr *zh1, *zh2 = NULL;
660         struct tmem_xhandle xh1, xh2 = { 0 };
661         char *data1 = NULL, *data2 = NULL;
662         size_t size1 = 0, size2 = 0;
663         int ret = 0;
664         unsigned char *tmpmem = __get_cpu_var(zcache_remoteputmem);
665
666         ASSERT_SPINLOCK(&zbpg->lock);
667         if (zbpg->buddy[0].size == 0)
668                 zh1 = &zbpg->buddy[1];
669         else if (zbpg->buddy[1].size == 0)
670                 zh1 = &zbpg->buddy[0];
671         else {
672                 zh1 = &zbpg->buddy[0];
673                 zh2 = &zbpg->buddy[1];
674         }
675         /* don't remotify pages that are already remotified */
676         if (zh1->client_id != LOCAL_CLIENT)
677                 zh1 = NULL;
678         if ((zh2 != NULL) && (zh2->client_id != LOCAL_CLIENT))
679                 zh2 = NULL;
680
681         /* copy the data and metadata so can release lock */
682         if (zh1 != NULL) {
683                 xh1.client_id = zh1->client_id;
684                 xh1.pool_id = zh1->pool_id;
685                 xh1.oid = zh1->oid;
686                 xh1.index = zh1->index;
687                 size1 = zh1->size;
688                 data1 = zbud_data(zh1, size1);
689                 memcpy(tmpmem, zbud_data(zh1, size1), size1);
690                 data1 = tmpmem;
691                 tmpmem += size1;
692         }
693         if (zh2 != NULL) {
694                 xh2.client_id = zh2->client_id;
695                 xh2.pool_id = zh2->pool_id;
696                 xh2.oid = zh2->oid;
697                 xh2.index = zh2->index;
698                 size2 = zh2->size;
699                 memcpy(tmpmem, zbud_data(zh2, size2), size2);
700                 data2 = tmpmem;
701         }
702         spin_unlock(&zbpg->lock);
703         preempt_enable();
704
705         /* OK, no locks held anymore, remotify one or both zbuds */
706         if (zh1 != NULL)
707                 ret = zbud_remotify_zbud(&xh1, data1, size1);
708         if (zh2 != NULL)
709                 ret |= zbud_remotify_zbud(&xh2, data2, size2);
710         return ret;
711 }
712
713 void zbud_remotify_pages(int nr)
714 {
715         struct zbud_page *zbpg;
716         int i, ret;
717
718         /*
719          * for now just try remotifying unbuddied pages, starting with
720          * least space avail
721          */
722         for (i = 0; i < MAX_CHUNK; i++) {
723 retry_unbud_list_i:
724                 preempt_disable();  /* enable in zbud_remotify_zbpg */
725                 spin_lock_bh(&zbud_budlists_spinlock);
726                 if (list_empty(&zbud_unbuddied[i].list)) {
727                         spin_unlock_bh(&zbud_budlists_spinlock);
728                         preempt_enable();
729                         continue; /* next i in for loop */
730                 }
731                 list_for_each_entry(zbpg, &zbud_unbuddied[i].list, bud_list) {
732                         if (unlikely(!spin_trylock(&zbpg->lock)))
733                                 continue; /* next list_for_each_entry */
734                         zbud_unbuddied[i].count--;
735                         /* want budlists unlocked when doing zbpg remotify */
736                         spin_unlock_bh(&zbud_budlists_spinlock);
737                         ret = zbud_remotify_zbpg(zbpg);
738                         /* preemption is re-enabled in zbud_remotify_zbpg */
739                         if (ret == 0) {
740                                 if (--nr <= 0)
741                                         goto out;
742                                 goto retry_unbud_list_i;
743                         }
744                         /* if fail to remotify any page, quit */
745                         pr_err("TESTING zbud_remotify_pages failed on page,"
746                                 " trying to re-add\n");
747                         spin_lock_bh(&zbud_budlists_spinlock);
748                         spin_lock(&zbpg->lock);
749                         list_add_tail(&zbpg->bud_list, &zbud_unbuddied[i].list);
750                         zbud_unbuddied[i].count++;
751                         spin_unlock(&zbpg->lock);
752                         spin_unlock_bh(&zbud_budlists_spinlock);
753                         pr_err("TESTING zbud_remotify_pages failed on page,"
754                                 " finished re-add\n");
755                         goto out;
756                 }
757                 spin_unlock_bh(&zbud_budlists_spinlock);
758                 preempt_enable();
759         }
760
761 next_buddied_zbpg:
762         preempt_disable();  /* enable in zbud_remotify_zbpg */
763         spin_lock_bh(&zbud_budlists_spinlock);
764         if (list_empty(&zbud_buddied_list))
765                 goto unlock_out;
766         list_for_each_entry(zbpg, &zbud_buddied_list, bud_list) {
767                 if (unlikely(!spin_trylock(&zbpg->lock)))
768                         continue; /* next list_for_each_entry */
769                 zcache_zbud_buddied_count--;
770                 /* want budlists unlocked when doing zbpg remotify */
771                 spin_unlock_bh(&zbud_budlists_spinlock);
772                 ret = zbud_remotify_zbpg(zbpg);
773                 /* preemption is re-enabled in zbud_remotify_zbpg */
774                 if (ret == 0) {
775                         if (--nr <= 0)
776                                 goto out;
777                         goto next_buddied_zbpg;
778                 }
779                 /* if fail to remotify any page, quit */
780                 pr_err("TESTING zbud_remotify_pages failed on BUDDIED page,"
781                         " trying to re-add\n");
782                 spin_lock_bh(&zbud_budlists_spinlock);
783                 spin_lock(&zbpg->lock);
784                 list_add_tail(&zbpg->bud_list, &zbud_buddied_list);
785                 zcache_zbud_buddied_count++;
786                 spin_unlock(&zbpg->lock);
787                 spin_unlock_bh(&zbud_budlists_spinlock);
788                 pr_err("TESTING zbud_remotify_pages failed on BUDDIED page,"
789                         " finished re-add\n");
790                 goto out;
791         }
792 unlock_out:
793         spin_unlock_bh(&zbud_budlists_spinlock);
794         preempt_enable();
795 out:
796         return;
797 }
798
799 /* the "flush list" asynchronously collects pages to remotely flush */
800 #define FLUSH_ENTIRE_OBJECT ((uint32_t)-1)
801 static void ramster_flnode_free(struct flushlist_node *,
802                                 struct tmem_pool *);
803
804 static void zcache_remote_flush_page(struct flushlist_node *flnode)
805 {
806         struct tmem_xhandle *xh;
807         int remotenode, ret;
808
809         preempt_disable();
810         xh = &flnode->xh;
811         remotenode = flnode->xh.client_id;
812         ret = ramster_remote_flush(xh, remotenode);
813         if (ret >= 0)
814                 ramster_remote_pages_flushed++;
815         else
816                 ramster_remote_page_flushes_failed++;
817         preempt_enable_no_resched();
818         ramster_flnode_free(flnode, NULL);
819 }
820
821 static void zcache_remote_flush_object(struct flushlist_node *flnode)
822 {
823         struct tmem_xhandle *xh;
824         int remotenode, ret;
825
826         preempt_disable();
827         xh = &flnode->xh;
828         remotenode = flnode->xh.client_id;
829         ret = ramster_remote_flush_object(xh, remotenode);
830         if (ret >= 0)
831                 ramster_remote_objects_flushed++;
832         else
833                 ramster_remote_object_flushes_failed++;
834         preempt_enable_no_resched();
835         ramster_flnode_free(flnode, NULL);
836 }
837
838 static void zcache_remote_eph_put(struct zbud_hdr *zbud)
839 {
840         /* FIXME */
841 }
842
843 static void zcache_remote_pers_put(struct zv_hdr *zv)
844 {
845         struct tmem_xhandle xh;
846         uint16_t size;
847         bool ephemeral;
848         int remotenode, ret = -1;
849         char *data;
850         struct tmem_pool *pool;
851         unsigned long flags;
852         unsigned char cksum;
853         char *p;
854         int i;
855         unsigned char *tmpmem = __get_cpu_var(zcache_remoteputmem);
856
857         ASSERT_SENTINEL(zv, ZVH);
858         BUG_ON(zv->client_id != LOCAL_CLIENT);
859         local_bh_disable();
860         xh.client_id = zv->client_id;
861         xh.pool_id = zv->pool_id;
862         xh.oid = zv->oid;
863         xh.index = zv->index;
864         size = xv_get_object_size(zv) - sizeof(*zv);
865         BUG_ON(size == 0 || size > zv_max_page_size);
866         data = (char *)zv + sizeof(*zv);
867         for (p = data, cksum = 0, i = 0; i < size; i++)
868                 cksum += *p++;
869         memcpy(tmpmem, data, size);
870         data = tmpmem;
871         pool = zcache_get_pool_by_id(zv->client_id, zv->pool_id);
872         ephemeral = is_ephemeral(pool);
873         zcache_put_pool(pool);
874         /* now OK to release lock set in caller */
875         spin_unlock(&zcache_rem_op_list_lock);
876         local_bh_enable();
877         preempt_disable();
878         ret = ramster_remote_put(&xh, data, size, ephemeral, &remotenode);
879         preempt_enable_no_resched();
880         if (ret != 0) {
881                 /*
882                  * This is some form of a memory leak... if the remote put
883                  * fails, there will never be another attempt to remotify
884                  * this page.  But since we've dropped the zv pointer,
885                  * the page may have been freed or the data replaced
886                  * so we can't just "put it back" in the remote op list.
887                  * Even if we could, not sure where to put it in the list
888                  * because there may be flushes that must be strictly
889                  * ordered vs the put.  So leave this as a FIXME for now.
890                  * But count them so we know if it becomes a problem.
891                  */
892                 ramster_pers_pages_remote_failed++;
893                 goto out;
894         } else
895                 atomic_inc(&ramster_remote_pers_pages);
896         ramster_pers_pages_remoted++;
897         /*
898          * data was successfully remoted so change the local version to
899          * point to the remote node where it landed
900          */
901         local_bh_disable();
902         pool = zcache_get_pool_by_id(LOCAL_CLIENT, xh.pool_id);
903         local_irq_save(flags);
904         (void)tmem_replace(pool, &xh.oid, xh.index,
905                         pampd_make_remote(remotenode, size, cksum));
906         local_irq_restore(flags);
907         zcache_put_pool(pool);
908         local_bh_enable();
909 out:
910         return;
911 }
912
913 static void zcache_do_remotify_ops(int nr)
914 {
915         struct ramster_remotify_hdr *rem_op;
916         union remotify_list_node *u;
917
918         while (1) {
919                 if (!nr)
920                         goto out;
921                 spin_lock(&zcache_rem_op_list_lock);
922                 if (list_empty(&zcache_rem_op_list)) {
923                         spin_unlock(&zcache_rem_op_list_lock);
924                         goto out;
925                 }
926                 rem_op = list_first_entry(&zcache_rem_op_list,
927                                 struct ramster_remotify_hdr, list);
928                 list_del_init(&rem_op->list);
929                 if (rem_op->op != RAMSTER_REMOTIFY_PERS_PUT)
930                         spin_unlock(&zcache_rem_op_list_lock);
931                 u = (union remotify_list_node *)rem_op;
932                 switch (rem_op->op) {
933                 case RAMSTER_REMOTIFY_EPH_PUT:
934                         BUG();
935                         zcache_remote_eph_put((struct zbud_hdr *)rem_op);
936                         break;
937                 case RAMSTER_REMOTIFY_PERS_PUT:
938                         zcache_remote_pers_put((struct zv_hdr *)rem_op);
939                         break;
940                 case RAMSTER_REMOTIFY_FLUSH_PAGE:
941                         zcache_remote_flush_page((struct flushlist_node *)u);
942                         break;
943                 case RAMSTER_REMOTIFY_FLUSH_OBJ:
944                         zcache_remote_flush_object((struct flushlist_node *)u);
945                         break;
946                 default:
947                         BUG();
948                 }
949         }
950 out:
951         return;
952 }
953
954 /*
955  * For now, just push over a few pages every few seconds to
956  * ensure that it basically works
957  */
958 static struct workqueue_struct *ramster_remotify_workqueue;
959 static void ramster_remotify_process(struct work_struct *work);
960 static DECLARE_DELAYED_WORK(ramster_remotify_worker,
961                 ramster_remotify_process);
962
963 static void ramster_remotify_queue_delayed_work(unsigned long delay)
964 {
965         if (!queue_delayed_work(ramster_remotify_workqueue,
966                                 &ramster_remotify_worker, delay))
967                 pr_err("ramster_remotify: bad workqueue\n");
968 }
969
970
971 static int use_frontswap;
972 static int use_cleancache;
973 static void ramster_remotify_process(struct work_struct *work)
974 {
975         static bool remotify_in_progress;
976
977         BUG_ON(irqs_disabled());
978         if (remotify_in_progress)
979                 ramster_remotify_queue_delayed_work(HZ);
980         else {
981                 remotify_in_progress = true;
982 #ifdef CONFIG_CLEANCACHE
983         if (use_cleancache && ramster_eph_remotify_enable)
984                 zbud_remotify_pages(5000); /* FIXME is this a good number? */
985 #endif
986 #ifdef CONFIG_FRONTSWAP
987         if (use_frontswap && ramster_pers_remotify_enable)
988                 zcache_do_remotify_ops(500); /* FIXME is this a good number? */
989 #endif
990                 remotify_in_progress = false;
991                 ramster_remotify_queue_delayed_work(HZ);
992         }
993 }
994
995 static void ramster_remotify_init(void)
996 {
997         unsigned long n = 60UL;
998         ramster_remotify_workqueue =
999                 create_singlethread_workqueue("ramster_remotify");
1000         ramster_remotify_queue_delayed_work(n * HZ);
1001 }
1002
1003
1004 static void zbud_init(void)
1005 {
1006         int i;
1007
1008         INIT_LIST_HEAD(&zbud_buddied_list);
1009         zcache_zbud_buddied_count = 0;
1010         for (i = 0; i < NCHUNKS; i++) {
1011                 INIT_LIST_HEAD(&zbud_unbuddied[i].list);
1012                 zbud_unbuddied[i].count = 0;
1013         }
1014 }
1015
1016 #ifdef CONFIG_SYSFS
1017 /*
1018  * These sysfs routines show a nice distribution of how many zbpg's are
1019  * currently (and have ever been placed) in each unbuddied list.  It's fun
1020  * to watch but can probably go away before final merge.
1021  */
1022 static int zbud_show_unbuddied_list_counts(char *buf)
1023 {
1024         int i;
1025         char *p = buf;
1026
1027         for (i = 0; i < NCHUNKS; i++)
1028                 p += sprintf(p, "%u ", zbud_unbuddied[i].count);
1029         return p - buf;
1030 }
1031
1032 static int zbud_show_cumul_chunk_counts(char *buf)
1033 {
1034         unsigned long i, chunks = 0, total_chunks = 0, sum_total_chunks = 0;
1035         unsigned long total_chunks_lte_21 = 0, total_chunks_lte_32 = 0;
1036         unsigned long total_chunks_lte_42 = 0;
1037         char *p = buf;
1038
1039         for (i = 0; i < NCHUNKS; i++) {
1040                 p += sprintf(p, "%lu ", zbud_cumul_chunk_counts[i]);
1041                 chunks += zbud_cumul_chunk_counts[i];
1042                 total_chunks += zbud_cumul_chunk_counts[i];
1043                 sum_total_chunks += i * zbud_cumul_chunk_counts[i];
1044                 if (i == 21)
1045                         total_chunks_lte_21 = total_chunks;
1046                 if (i == 32)
1047                         total_chunks_lte_32 = total_chunks;
1048                 if (i == 42)
1049                         total_chunks_lte_42 = total_chunks;
1050         }
1051         p += sprintf(p, "<=21:%lu <=32:%lu <=42:%lu, mean:%lu\n",
1052                 total_chunks_lte_21, total_chunks_lte_32, total_chunks_lte_42,
1053                 chunks == 0 ? 0 : sum_total_chunks / chunks);
1054         return p - buf;
1055 }
1056 #endif
1057
1058 /**********
1059  * This "zv" PAM implementation combines the TLSF-based xvMalloc
1060  * with lzo1x compression to maximize the amount of data that can
1061  * be packed into a physical page.
1062  *
1063  * Zv represents a PAM page with the object id and index immediately
1064  * preceding the compressed data; the compressed length is recovered
1065  * via xv_get_object_size() rather than stored in the header.
1065  */
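
/*
 * Illustrative sketch (hypothetical helpers): a zv allocation is just a
 * struct zv_hdr followed by the lzo1x-compressed payload, so the payload
 * pointer and its length fall out of the header address and the xvmalloc
 * object size, exactly as zv_decompress() and zv_copy_from_pampd() below
 * compute them.
 */
static inline char *zv_example_payload(struct zv_hdr *zv)
{
	return (char *)zv + sizeof(*zv);
}

static inline unsigned zv_example_payload_size(struct zv_hdr *zv)
{
	return xv_get_object_size(zv) - sizeof(*zv);
}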
1066
1067 /* rudimentary policy limits */
1068 /* total number of persistent pages may not exceed this percentage of total RAM */
1069 static unsigned int zv_page_count_policy_percent = 75;
1070 /*
1071  * byte count defining poor compression; pages with greater zsize will be
1072  * rejected
1073  */
1074 static unsigned int zv_max_zsize = (PAGE_SIZE / 8) * 7;
1075 /*
1076  * byte count defining poor *mean* compression; pages with greater zsize
1077  * will be rejected until sufficient better-compressed pages are accepted
1078  * driving the mean below this threshold
1079  */
1080 static unsigned int zv_max_mean_zsize = (PAGE_SIZE / 8) * 5;
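
/*
 * Illustrative sketch (assumed shape of the policy; the actual check lives
 * in the persistent-put path, which is not part of this excerpt, and the
 * helper and its parameters are hypothetical): a compressed page of
 * "zsize" bytes is rejected if it exceeds zv_max_zsize, is rejected when
 * it exceeds zv_max_mean_zsize unless the running mean of accepted zsizes
 * is still below that threshold, and is always rejected once the
 * percent-of-RAM cap on persistent pages has been reached.
 */
static bool zv_example_policy_ok(unsigned int zsize,
				unsigned long curr_pers_pages,
				unsigned long curr_pers_zbytes)
{
	unsigned long mean;

	if (curr_pers_pages >=
	    (zv_page_count_policy_percent * totalram_pages) / 100)
		return false;	/* too many persistent pages already */
	if (zsize > zv_max_zsize)
		return false;	/* this page compresses too poorly */
	if (zsize > zv_max_mean_zsize) {
		mean = curr_pers_pages ?
			curr_pers_zbytes / curr_pers_pages : 0;
		if (mean > zv_max_mean_zsize)
			return false;	/* mean compression too poor */
	}
	return true;
}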
1081
1082 static atomic_t zv_curr_dist_counts[NCHUNKS];
1083 static atomic_t zv_cumul_dist_counts[NCHUNKS];
1084
1085
1086 static struct zv_hdr *zv_create(struct zcache_client *cli, uint32_t pool_id,
1087                                 struct tmem_oid *oid, uint32_t index,
1088                                 void *cdata, unsigned clen)
1089 {
1090         struct page *page;
1091         struct zv_hdr *zv = NULL;
1092         uint32_t offset;
1093         int alloc_size = clen + sizeof(struct zv_hdr);
1094         int chunks = (alloc_size + (CHUNK_SIZE - 1)) >> CHUNK_SHIFT;
1095         int ret;
1096
1097         BUG_ON(!irqs_disabled());
1098         BUG_ON(chunks >= NCHUNKS);
1099         ret = xv_malloc(cli->xvpool, clen + sizeof(struct zv_hdr),
1100                         &page, &offset, ZCACHE_GFP_MASK);
1101         if (unlikely(ret))
1102                 goto out;
1103         atomic_inc(&zv_curr_dist_counts[chunks]);
1104         atomic_inc(&zv_cumul_dist_counts[chunks]);
1105         zv = kmap_atomic(page, KM_USER0) + offset;
1106         zv->index = index;
1107         zv->oid = *oid;
1108         zv->pool_id = pool_id;
1109         SET_SENTINEL(zv, ZVH);
1110         INIT_LIST_HEAD(&zv->rem_op.list);
1111         zv->client_id = get_client_id_from_client(cli);
1112         zv->rem_op.op = RAMSTER_REMOTIFY_PERS_PUT;
1113         if (zv->client_id == LOCAL_CLIENT) {
1114                 spin_lock(&zcache_rem_op_list_lock);
1115                 list_add_tail(&zv->rem_op.list, &zcache_rem_op_list);
1116                 spin_unlock(&zcache_rem_op_list_lock);
1117         }
1118         memcpy((char *)zv + sizeof(struct zv_hdr), cdata, clen);
1119         kunmap_atomic(zv, KM_USER0);
1120 out:
1121         return zv;
1122 }
1123
1124 /* similar to zv_create, but just reserve space, no data yet */
1125 static struct zv_hdr *zv_alloc(struct tmem_pool *pool,
1126                                 struct tmem_oid *oid, uint32_t index,
1127                                 unsigned clen)
1128 {
1129         struct zcache_client *cli = pool->client;
1130         struct page *page;
1131         struct zv_hdr *zv = NULL;
1132         uint32_t offset;
1133         int ret;
1134
1135         BUG_ON(!irqs_disabled());
1136         BUG_ON(!is_local_client(pool->client));
1137         ret = xv_malloc(cli->xvpool, clen + sizeof(struct zv_hdr),
1138                         &page, &offset, ZCACHE_GFP_MASK);
1139         if (unlikely(ret))
1140                 goto out;
1141         zv = kmap_atomic(page, KM_USER0) + offset;
1142         SET_SENTINEL(zv, ZVH);
1143         INIT_LIST_HEAD(&zv->rem_op.list);
1144         zv->client_id = LOCAL_CLIENT;
1145         zv->rem_op.op = RAMSTER_INTRANSIT_PERS;
1146         zv->index = index;
1147         zv->oid = *oid;
1148         zv->pool_id = pool->pool_id;
1149         kunmap_atomic(zv, KM_USER0);
1150 out:
1151         return zv;
1152 }
1153
1154 static void zv_free(struct xv_pool *xvpool, struct zv_hdr *zv)
1155 {
1156         unsigned long flags;
1157         struct page *page;
1158         uint32_t offset;
1159         uint16_t size = xv_get_object_size(zv);
1160         int chunks = (size + (CHUNK_SIZE - 1)) >> CHUNK_SHIFT;
1161
1162         ASSERT_SENTINEL(zv, ZVH);
1163         BUG_ON(chunks >= NCHUNKS);
1164         atomic_dec(&zv_curr_dist_counts[chunks]);
1165         size -= sizeof(*zv);
1166         spin_lock(&zcache_rem_op_list_lock);
1167         size = xv_get_object_size(zv) - sizeof(*zv);
1168         BUG_ON(size == 0);
1169         INVERT_SENTINEL(zv, ZVH);
1170         if (!list_empty(&zv->rem_op.list))
1171                 list_del_init(&zv->rem_op.list);
1172         spin_unlock(&zcache_rem_op_list_lock);
1173         page = virt_to_page(zv);
1174         offset = (unsigned long)zv & ~PAGE_MASK;
1175         local_irq_save(flags);
1176         xv_free(xvpool, page, offset);
1177         local_irq_restore(flags);
1178 }
1179
1180 static void zv_decompress(struct page *page, struct zv_hdr *zv)
1181 {
1182         size_t clen = PAGE_SIZE;
1183         char *to_va;
1184         unsigned size;
1185         int ret;
1186
1187         ASSERT_SENTINEL(zv, ZVH);
1188         size = xv_get_object_size(zv) - sizeof(*zv);
1189         BUG_ON(size == 0);
1190         to_va = kmap_atomic(page, KM_USER0);
1191         ret = lzo1x_decompress_safe((char *)zv + sizeof(*zv),
1192                                         size, to_va, &clen);
1193         kunmap_atomic(to_va, KM_USER0);
1194         BUG_ON(ret != LZO_E_OK);
1195         BUG_ON(clen != PAGE_SIZE);
1196 }
1197
1198 static void zv_copy_from_pampd(char *data, size_t *bufsize, struct zv_hdr *zv)
1199 {
1200         unsigned size;
1201
1202         ASSERT_SENTINEL(zv, ZVH);
1203         size = xv_get_object_size(zv) - sizeof(*zv);
1204         BUG_ON(size == 0 || size > zv_max_page_size);
1205         BUG_ON(size > *bufsize);
1206         memcpy(data, (char *)zv + sizeof(*zv), size);
1207         *bufsize = size;
1208 }
1209
1210 static void zv_copy_to_pampd(struct zv_hdr *zv, char *data, size_t size)
1211 {
1212         unsigned zv_size;
1213
1214         ASSERT_SENTINEL(zv, ZVH);
1215         zv_size = xv_get_object_size(zv) - sizeof(*zv);
1216         BUG_ON(zv_size != size);
1217         BUG_ON(zv_size == 0 || zv_size > zv_max_page_size);
1218         memcpy((char *)zv + sizeof(*zv), data, size);
1219 }
1220
1221 #ifdef CONFIG_SYSFS
1222 /*
1223  * show a distribution of compression stats for zv pages.
1224  */
1225
1226 static int zv_curr_dist_counts_show(char *buf)
1227 {
1228         unsigned long i, n, chunks = 0, sum_total_chunks = 0;
1229         char *p = buf;
1230
1231         for (i = 0; i < NCHUNKS; i++) {
1232                 n = atomic_read(&zv_curr_dist_counts[i]);
1233                 p += sprintf(p, "%lu ", n);
1234                 chunks += n;
1235                 sum_total_chunks += i * n;
1236         }
1237         p += sprintf(p, "mean:%lu\n",
1238                 chunks == 0 ? 0 : sum_total_chunks / chunks);
1239         return p - buf;
1240 }
1241
1242 static int zv_cumul_dist_counts_show(char *buf)
1243 {
1244         unsigned long i, n, chunks = 0, sum_total_chunks = 0;
1245         char *p = buf;
1246
1247         for (i = 0; i < NCHUNKS; i++) {
1248                 n = atomic_read(&zv_cumul_dist_counts[i]);
1249                 p += sprintf(p, "%lu ", n);
1250                 chunks += n;
1251                 sum_total_chunks += i * n;
1252         }
1253         p += sprintf(p, "mean:%lu\n",
1254                 chunks == 0 ? 0 : sum_total_chunks / chunks);
1255         return p - buf;
1256 }
1257
1258 /*
1259  * setting zv_max_zsize via sysfs causes all persistent (e.g. swap)
1260  * pages that don't compress to less than this value (including metadata
1261  * overhead) to be rejected.  We don't allow the value to get too close
1262  * to PAGE_SIZE.
1263  */
1264 static ssize_t zv_max_zsize_show(struct kobject *kobj,
1265                                     struct kobj_attribute *attr,
1266                                     char *buf)
1267 {
1268         return sprintf(buf, "%u\n", zv_max_zsize);
1269 }
1270
1271 static ssize_t zv_max_zsize_store(struct kobject *kobj,
1272                                     struct kobj_attribute *attr,
1273                                     const char *buf, size_t count)
1274 {
1275         unsigned long val;
1276         int err;
1277
1278         if (!capable(CAP_SYS_ADMIN))
1279                 return -EPERM;
1280
1281         err = strict_strtoul(buf, 10, &val);
1282         if (err || (val == 0) || (val > (PAGE_SIZE / 8) * 7))
1283                 return -EINVAL;
1284         zv_max_zsize = val;
1285         return count;
1286 }
1287
1288 /*
1289  * setting zv_max_mean_zsize via sysfs causes all persistent (e.g. swap)
1290  * pages that don't compress to less than this value (including metadata
1291  * overhead) to be rejected UNLESS the mean compression is also smaller
1292  * than this value.  In other words, we are load-balancing-by-zsize the
1293  * accepted pages.  Again, we don't allow the value to get too close
1294  * to PAGE_SIZE.
1295  */
1296 static ssize_t zv_max_mean_zsize_show(struct kobject *kobj,
1297                                     struct kobj_attribute *attr,
1298                                     char *buf)
1299 {
1300         return sprintf(buf, "%u\n", zv_max_mean_zsize);
1301 }
1302
1303 static ssize_t zv_max_mean_zsize_store(struct kobject *kobj,
1304                                     struct kobj_attribute *attr,
1305                                     const char *buf, size_t count)
1306 {
1307         unsigned long val;
1308         int err;
1309
1310         if (!capable(CAP_SYS_ADMIN))
1311                 return -EPERM;
1312
1313         err = strict_strtoul(buf, 10, &val);
1314         if (err || (val == 0) || (val > (PAGE_SIZE / 8) * 7))
1315                 return -EINVAL;
1316         zv_max_mean_zsize = val;
1317         return count;
1318 }
1319
1320 /*
1321  * setting zv_page_count_policy_percent via sysfs sets an upper bound of
1322  * persistent (e.g. swap) pages that will be retained according to:
1323  *     (zv_page_count_policy_percent * totalram_pages) / 100)
1324  * when that limit is reached, further puts will be rejected (until
1325  * some pages have been flushed).  Note that, due to compression,
1326  * this number may exceed 100; it defaults to 75 and we set an
1327  * arbitrary limit of 150.  A poor choice will almost certainly result
1328  * in OOM's, so this value should only be changed prudently.
1329  */
1330 static ssize_t zv_page_count_policy_percent_show(struct kobject *kobj,
1331                                                  struct kobj_attribute *attr,
1332                                                  char *buf)
1333 {
1334         return sprintf(buf, "%u\n", zv_page_count_policy_percent);
1335 }
1336
1337 static ssize_t zv_page_count_policy_percent_store(struct kobject *kobj,
1338                                                   struct kobj_attribute *attr,
1339                                                   const char *buf, size_t count)
1340 {
1341         unsigned long val;
1342         int err;
1343
1344         if (!capable(CAP_SYS_ADMIN))
1345                 return -EPERM;
1346
1347         err = strict_strtoul(buf, 10, &val);
1348         if (err || (val == 0) || (val > 150))
1349                 return -EINVAL;
1350         zv_page_count_policy_percent = val;
1351         return count;
1352 }
1353
1354 static struct kobj_attribute zcache_zv_max_zsize_attr = {
1355                 .attr = { .name = "zv_max_zsize", .mode = 0644 },
1356                 .show = zv_max_zsize_show,
1357                 .store = zv_max_zsize_store,
1358 };
1359
1360 static struct kobj_attribute zcache_zv_max_mean_zsize_attr = {
1361                 .attr = { .name = "zv_max_mean_zsize", .mode = 0644 },
1362                 .show = zv_max_mean_zsize_show,
1363                 .store = zv_max_mean_zsize_store,
1364 };
1365
1366 static struct kobj_attribute zcache_zv_page_count_policy_percent_attr = {
1367                 .attr = { .name = "zv_page_count_policy_percent",
1368                           .mode = 0644 },
1369                 .show = zv_page_count_policy_percent_show,
1370                 .store = zv_page_count_policy_percent_store,
1371 };
1372 #endif
1373
1374 /*
1375  * zcache core code starts here
1376  */
1377
1378 /* useful stats not collected by cleancache or frontswap */
1379 static unsigned long zcache_flush_total;
1380 static unsigned long zcache_flush_found;
1381 static unsigned long zcache_flobj_total;
1382 static unsigned long zcache_flobj_found;
1383 static unsigned long zcache_failed_eph_puts;
1384 static unsigned long zcache_nonactive_puts;
1385 static unsigned long zcache_failed_pers_puts;
1386
1387 /*
1388  * Tmem operations assume the poolid implies the invoking client.
1389  * Zcache only has one client (the kernel itself): LOCAL_CLIENT.
1390  * RAMster has each client numbered by cluster node, and a KVM version
1391  * of zcache would have one client per guest and each client might
1392  * have a poolid==N.
1393  */
1394 static struct tmem_pool *zcache_get_pool_by_id(uint16_t cli_id, uint16_t poolid)
1395 {
1396         struct tmem_pool *pool = NULL;
1397         struct zcache_client *cli = NULL;
1398
1399         if (cli_id == LOCAL_CLIENT)
1400                 cli = &zcache_host;
1401         else {
1402                 if (cli_id >= MAX_CLIENTS)
1403                         goto out;
1404                 cli = &zcache_clients[cli_id];
1405                 if (cli == NULL)
1406                         goto out;
1407                 atomic_inc(&cli->refcount);
1408         }
1409         if (poolid < MAX_POOLS_PER_CLIENT) {
1410                 pool = cli->tmem_pools[poolid];
1411                 if (pool != NULL)
1412                         atomic_inc(&pool->refcount);
1413         }
1414 out:
1415         return pool;
1416 }
1417
1418 static void zcache_put_pool(struct tmem_pool *pool)
1419 {
1420         struct zcache_client *cli = NULL;
1421
1422         if (pool == NULL)
1423                 BUG();
1424         cli = pool->client;
1425         atomic_dec(&pool->refcount);
1426         atomic_dec(&cli->refcount);
1427 }
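
/*
 * Illustrative sketch (hypothetical caller): every lookup via
 * zcache_get_pool_by_id() takes a reference on the pool (and, for
 * non-local clients, on the client) that must be dropped with
 * zcache_put_pool() once the tmem operation is done, as the eviction and
 * remotify paths above do.
 */
static void zcache_example_with_pool(uint16_t cli_id, uint16_t poolid,
					struct tmem_oid *oidp, uint32_t index)
{
	struct tmem_pool *pool;
	unsigned long flags;

	pool = zcache_get_pool_by_id(cli_id, poolid);
	if (pool == NULL)
		return;
	local_irq_save(flags);
	tmem_flush_page(pool, oidp, index);	/* any tmem op on the pool */
	local_irq_restore(flags);
	zcache_put_pool(pool);
}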
1428
1429 int zcache_new_client(uint16_t cli_id)
1430 {
1431         struct zcache_client *cli = NULL;
1432         int ret = -1;
1433
1434         if (cli_id == LOCAL_CLIENT)
1435                 cli = &zcache_host;
1436         else if ((unsigned int)cli_id < MAX_CLIENTS)
1437                 cli = &zcache_clients[cli_id];
1438         if (cli == NULL)
1439                 goto out;
1440         if (cli->allocated)
1441                 goto out;
1442         cli->allocated = 1;
1443 #ifdef CONFIG_FRONTSWAP
1444         cli->xvpool = xv_create_pool();
1445         if (cli->xvpool == NULL)
1446                 goto out;
1447 #endif
1448         ret = 0;
1449 out:
1450         return ret;
1451 }
1452
1453 /* counters for debugging */
1454 static unsigned long zcache_failed_get_free_pages;
1455 static unsigned long zcache_failed_alloc;
1456 static unsigned long zcache_put_to_flush;
1457
1458 /*
1459  * for now, use named slabs so we can easily track usage; later we can
1460  * either just use kmalloc, or perhaps add a slab-like allocator
1461  * to more carefully manage total memory utilization
1462  */
1463 static struct kmem_cache *zcache_objnode_cache;
1464 static struct kmem_cache *zcache_obj_cache;
1465 static struct kmem_cache *ramster_flnode_cache;
1466 static atomic_t zcache_curr_obj_count = ATOMIC_INIT(0);
1467 static unsigned long zcache_curr_obj_count_max;
1468 static atomic_t zcache_curr_objnode_count = ATOMIC_INIT(0);
1469 static unsigned long zcache_curr_objnode_count_max;
1470
1471 /*
1472  * to avoid memory allocation recursion (e.g. due to direct reclaim), we
1473  * preload all necessary data structures so the hostops callbacks never
1474  * actually do a malloc
1475  */
1476 struct zcache_preload {
1477         void *page;
1478         struct tmem_obj *obj;
1479         int nr;
1480         struct tmem_objnode *objnodes[OBJNODE_TREE_MAX_PATH];
1481         struct flushlist_node *flnode;
1482 };
1483 static DEFINE_PER_CPU(struct zcache_preload, zcache_preloads) = { 0, };
1484
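/*
 * Fill this cpu's zcache_preload with enough objnodes, an obj, a flnode,
 * and (for ephemeral pools) a raw page so the subsequent tmem operation
 * never has to allocate.  Returns 0 with preemption disabled on success;
 * the caller is responsible for re-enabling preemption.
 */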
1485 static int zcache_do_preload(struct tmem_pool *pool)
1486 {
1487         struct zcache_preload *kp;
1488         struct tmem_objnode *objnode;
1489         struct tmem_obj *obj;
1490         struct flushlist_node *flnode;
1491         void *page;
1492         int ret = -ENOMEM;
1493
1494         if (unlikely(zcache_objnode_cache == NULL))
1495                 goto out;
1496         if (unlikely(zcache_obj_cache == NULL))
1497                 goto out;
1498         preempt_disable();
1499         kp = &__get_cpu_var(zcache_preloads);
1500         while (kp->nr < ARRAY_SIZE(kp->objnodes)) {
1501                 preempt_enable_no_resched();
1502                 objnode = kmem_cache_alloc(zcache_objnode_cache,
1503                                 ZCACHE_GFP_MASK);
1504                 if (unlikely(objnode == NULL)) {
1505                         zcache_failed_alloc++;
1506                         goto out;
1507                 }
1508                 preempt_disable();
1509                 kp = &__get_cpu_var(zcache_preloads);
1510                 if (kp->nr < ARRAY_SIZE(kp->objnodes))
1511                         kp->objnodes[kp->nr++] = objnode;
1512                 else
1513                         kmem_cache_free(zcache_objnode_cache, objnode);
1514         }
1515         preempt_enable_no_resched();
1516         obj = kmem_cache_alloc(zcache_obj_cache, ZCACHE_GFP_MASK);
1517         if (unlikely(obj == NULL)) {
1518                 zcache_failed_alloc++;
1519                 goto out;
1520         }
1521         flnode = kmem_cache_alloc(ramster_flnode_cache, ZCACHE_GFP_MASK);
1522         if (unlikely(flnode == NULL)) {
1523                 zcache_failed_alloc++;
1524                 goto out;
1525         }
1526         if (is_ephemeral(pool)) {
1527                 page = (void *)__get_free_page(ZCACHE_GFP_MASK);
1528                 if (unlikely(page == NULL)) {
1529                         zcache_failed_get_free_pages++;
1530                         kmem_cache_free(zcache_obj_cache, obj);
1531                         kmem_cache_free(ramster_flnode_cache, flnode);
1532                         goto out;
1533                 }
1534         }
1535         preempt_disable();
1536         kp = &__get_cpu_var(zcache_preloads);
1537         if (kp->obj == NULL)
1538                 kp->obj = obj;
1539         else
1540                 kmem_cache_free(zcache_obj_cache, obj);
1541         if (kp->flnode == NULL)
1542                 kp->flnode = flnode;
1543         else
1544                 kmem_cache_free(ramster_flnode_cache, flnode);
1545         if (is_ephemeral(pool)) {
1546                 if (kp->page == NULL)
1547                         kp->page = page;
1548                 else
1549                         free_page((unsigned long)page);
1550         }
1551         ret = 0;
1552 out:
1553         return ret;
1554 }
1555
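/*
 * Ensure this cpu's preload has a flushlist_node available; called with
 * irqs disabled from the flush paths.
 */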
1556 static int ramster_do_preload_flnode_only(struct tmem_pool *pool)
1557 {
1558         struct zcache_preload *kp;
1559         struct flushlist_node *flnode;
1560         int ret = -ENOMEM;
1561
1562         BUG_ON(!irqs_disabled());
1563         if (unlikely(ramster_flnode_cache == NULL))
1564                 BUG();
1565         kp = &__get_cpu_var(zcache_preloads);
1566         flnode = kmem_cache_alloc(ramster_flnode_cache, GFP_ATOMIC);
1567         if (unlikely(flnode == NULL) && kp->flnode == NULL)
1568                 BUG();  /* FIXME handle more gracefully, but how??? */
1569         else if (kp->flnode == NULL)
1570                 kp->flnode = flnode;
1571         else
1572                 kmem_cache_free(ramster_flnode_cache, flnode);
1573         return ret;
1574 }
1575
1576 static void *zcache_get_free_page(void)
1577 {
1578         struct zcache_preload *kp;
1579         void *page;
1580
1581         kp = &__get_cpu_var(zcache_preloads);
1582         page = kp->page;
1583         BUG_ON(page == NULL);
1584         kp->page = NULL;
1585         return page;
1586 }
1587
1588 static void zcache_free_page(void *p)
1589 {
1590         free_page((unsigned long)p);
1591 }
1592
1593 /*
1594  * zcache implementation for tmem host ops
1595  */
1596
1597 static struct tmem_objnode *zcache_objnode_alloc(struct tmem_pool *pool)
1598 {
1599         struct tmem_objnode *objnode = NULL;
1600         unsigned long count;
1601         struct zcache_preload *kp;
1602
1603         kp = &__get_cpu_var(zcache_preloads);
1604         if (kp->nr <= 0)
1605                 goto out;
1606         objnode = kp->objnodes[kp->nr - 1];
1607         BUG_ON(objnode == NULL);
1608         kp->objnodes[kp->nr - 1] = NULL;
1609         kp->nr--;
1610         count = atomic_inc_return(&zcache_curr_objnode_count);
1611         if (count > zcache_curr_objnode_count_max)
1612                 zcache_curr_objnode_count_max = count;
1613 out:
1614         return objnode;
1615 }
1616
1617 static void zcache_objnode_free(struct tmem_objnode *objnode,
1618                                         struct tmem_pool *pool)
1619 {
1620         atomic_dec(&zcache_curr_objnode_count);
1621         BUG_ON(atomic_read(&zcache_curr_objnode_count) < 0);
1622         kmem_cache_free(zcache_objnode_cache, objnode);
1623 }
1624
1625 static struct tmem_obj *zcache_obj_alloc(struct tmem_pool *pool)
1626 {
1627         struct tmem_obj *obj = NULL;
1628         unsigned long count;
1629         struct zcache_preload *kp;
1630
1631         kp = &__get_cpu_var(zcache_preloads);
1632         obj = kp->obj;
1633         BUG_ON(obj == NULL);
1634         kp->obj = NULL;
1635         count = atomic_inc_return(&zcache_curr_obj_count);
1636         if (count > zcache_curr_obj_count_max)
1637                 zcache_curr_obj_count_max = count;
1638         return obj;
1639 }
1640
1641 static void zcache_obj_free(struct tmem_obj *obj, struct tmem_pool *pool)
1642 {
1643         atomic_dec(&zcache_curr_obj_count);
1644         BUG_ON(atomic_read(&zcache_curr_obj_count) < 0);
1645         kmem_cache_free(zcache_obj_cache, obj);
1646 }
1647
1648 static struct flushlist_node *ramster_flnode_alloc(struct tmem_pool *pool)
1649 {
1650         struct flushlist_node *flnode = NULL;
1651         struct zcache_preload *kp;
1652         int count;
1653
1654         kp = &__get_cpu_var(zcache_preloads);
1655         flnode = kp->flnode;
1656         BUG_ON(flnode == NULL);
1657         kp->flnode = NULL;
1658         count = atomic_inc_return(&ramster_curr_flnode_count);
1659         if (count > ramster_curr_flnode_count_max)
1660                 ramster_curr_flnode_count_max = count;
1661         return flnode;
1662 }
1663
1664 static void ramster_flnode_free(struct flushlist_node *flnode,
1665                                 struct tmem_pool *pool)
1666 {
1667         atomic_dec(&ramster_curr_flnode_count);
1668         BUG_ON(atomic_read(&ramster_curr_flnode_count) < 0);
1669         kmem_cache_free(ramster_flnode_cache, flnode);
1670 }
1671
1672 static struct tmem_hostops zcache_hostops = {
1673         .obj_alloc = zcache_obj_alloc,
1674         .obj_free = zcache_obj_free,
1675         .objnode_alloc = zcache_objnode_alloc,
1676         .objnode_free = zcache_objnode_free,
1677 };
1678
1679 /*
1680  * zcache implementations for PAM page descriptor ops
1681  */
1682
1683 static atomic_t zcache_curr_eph_pampd_count = ATOMIC_INIT(0);
1684 static unsigned long zcache_curr_eph_pampd_count_max;
1685 static atomic_t zcache_curr_pers_pampd_count = ATOMIC_INIT(0);
1686 static unsigned long zcache_curr_pers_pampd_count_max;
1687
1688 /* forward reference */
1689 static int zcache_compress(struct page *from, void **out_va, size_t *out_len);
1690
1691 static int zcache_pampd_eph_create(char *data, size_t size, bool raw,
1692                                 struct tmem_pool *pool, struct tmem_oid *oid,
1693                                 uint32_t index, void **pampd)
1694 {
1695         int ret = -1;
1696         void *cdata = data;
1697         size_t clen = size;
1698         struct zcache_client *cli = pool->client;
1699         uint16_t client_id = get_client_id_from_client(cli);
1700         struct page *page = NULL;
1701         unsigned long count;
1702
1703         if (!raw) {
1704                 page = virt_to_page(data);
1705                 ret = zcache_compress(page, &cdata, &clen);
1706                 if (ret == 0)
1707                         goto out;
1708                 if (clen == 0 || clen > zbud_max_buddy_size()) {
1709                         zcache_compress_poor++;
1710                         goto out;
1711                 }
1712         }
1713         *pampd = (void *)zbud_create(client_id, pool->pool_id, oid,
1714                                         index, page, cdata, clen);
1715         if (*pampd == NULL) {
1716                 ret = -ENOMEM;
1717                 goto out;
1718         }
1719         ret = 0;
1720         count = atomic_inc_return(&zcache_curr_eph_pampd_count);
1721         if (count > zcache_curr_eph_pampd_count_max)
1722                 zcache_curr_eph_pampd_count_max = count;
1723         if (client_id != LOCAL_CLIENT) {
1724                 count = atomic_inc_return(&ramster_foreign_eph_pampd_count);
1725                 if (count > ramster_foreign_eph_pampd_count_max)
1726                         ramster_foreign_eph_pampd_count_max = count;
1727         }
1728 out:
1729         return ret;
1730 }
1731
1732 static int zcache_pampd_pers_create(char *data, size_t size, bool raw,
1733                                 struct tmem_pool *pool, struct tmem_oid *oid,
1734                                 uint32_t index, void **pampd)
1735 {
1736         int ret = -1;
1737         void *cdata = data;
1738         size_t clen = size;
1739         struct zcache_client *cli = pool->client;
1740         struct page *page;
1741         unsigned long count;
1742         unsigned long zv_mean_zsize;
1743         struct zv_hdr *zv;
1744         long curr_pers_pampd_count;
1745         u64 total_zsize;
1746 #ifdef RAMSTER_TESTING
1747         static bool pampd_neg_warned;
1748 #endif
1749
1750         curr_pers_pampd_count = atomic_read(&zcache_curr_pers_pampd_count) -
1751                         atomic_read(&ramster_remote_pers_pages);
1752 #ifdef RAMSTER_TESTING
1753         /* should always be positive, but warn if accounting is off */
1754         if (curr_pers_pampd_count < 0 && !pampd_neg_warned) {
1755                 pr_warn("ramster: bad accounting for curr_pers_pampd_count\n");
1756                 pampd_neg_warned = true;
1757         }
1758 #endif
1759         if (curr_pers_pampd_count >
1760                     (zv_page_count_policy_percent * totalram_pages) / 100) {
1761                 zcache_policy_percent_exceeded++;
1762                 goto out;
1763         }
1764         if (raw)
1765                 goto ok_to_create;
1766         page = virt_to_page(data);
1767         if (zcache_compress(page, &cdata, &clen) == 0)
1768                 goto out;
1769         /* reject if compression is too poor */
1770         if (clen > zv_max_zsize) {
1771                 zcache_compress_poor++;
1772                 goto out;
1773         }
1774         /* reject if mean compression is too poor */
1775         if ((clen > zv_max_mean_zsize) && (curr_pers_pampd_count > 0)) {
1776                 total_zsize = xv_get_total_size_bytes(cli->xvpool);
1777                 zv_mean_zsize = div_u64(total_zsize, curr_pers_pampd_count);
1778                 if (zv_mean_zsize > zv_max_mean_zsize) {
1779                         zcache_mean_compress_poor++;
1780                         goto out;
1781                 }
1782         }
1783 ok_to_create:
1784         *pampd = (void *)zv_create(cli, pool->pool_id, oid, index, cdata, clen);
1785         if (*pampd == NULL) {
1786                 ret = -ENOMEM;
1787                 goto out;
1788         }
1789         ret = 0;
1790         count = atomic_inc_return(&zcache_curr_pers_pampd_count);
1791         if (count > zcache_curr_pers_pampd_count_max)
1792                 zcache_curr_pers_pampd_count_max = count;
1793         if (is_local_client(cli))
1794                 goto out;
1795         zv = *(struct zv_hdr **)pampd;
1796         count = atomic_inc_return(&ramster_foreign_pers_pampd_count);
1797         if (count > ramster_foreign_pers_pampd_count_max)
1798                 ramster_foreign_pers_pampd_count_max = count;
1799 out:
1800         return ret;
1801 }
1802
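/*
 * Create a "pampd" (page-accessible memory descriptor) holding a compressed
 * copy of the page at "data": ephemeral pages go to zbud, persistent pages
 * go to zv (backed by the client's xvmalloc pool).
 */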
1803 static void *zcache_pampd_create(char *data, size_t size, bool raw, int eph,
1804                                 struct tmem_pool *pool, struct tmem_oid *oid,
1805                                 uint32_t index)
1806 {
1807         void *pampd = NULL;
1808         int ret;
1809         bool ephemeral;
1810
1811         BUG_ON(preemptible());
1812         ephemeral = (eph == 1) || ((eph == 0) && is_ephemeral(pool));
1813         if (ephemeral)
1814                 ret = zcache_pampd_eph_create(data, size, raw, pool,
1815                                                 oid, index, &pampd);
1816         else
1817                 ret = zcache_pampd_pers_create(data, size, raw, pool,
1818                                                 oid, index, &pampd);
1819         /* FIXME add some counters here for failed creates? */
1820         return pampd;
1821 }
1822
1823 /*
1824  * fill the pageframe corresponding to the struct page with the data
1825  * from the passed pampd
1826  */
1827 static int zcache_pampd_get_data(char *data, size_t *bufsize, bool raw,
1828                                         void *pampd, struct tmem_pool *pool,
1829                                         struct tmem_oid *oid, uint32_t index)
1830 {
1831         int ret = 0;
1832
1833         BUG_ON(preemptible());
1834         BUG_ON(is_ephemeral(pool)); /* Fix later for shared pools? */
1835         BUG_ON(pampd_is_remote(pampd));
1836         if (raw)
1837                 zv_copy_from_pampd(data, bufsize, pampd);
1838         else
1839                 zv_decompress(virt_to_page(data), pampd);
1840         return ret;
1841 }
1842
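/*
 * As above, but also free the pampd: ephemeral pampds are always freed on
 * get here, and persistent pampds only for foreign (non-local) clients.
 */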
1843 static int zcache_pampd_get_data_and_free(char *data, size_t *bufsize, bool raw,
1844                                         void *pampd, struct tmem_pool *pool,
1845                                         struct tmem_oid *oid, uint32_t index)
1846 {
1847         int ret = 0;
1848         unsigned long flags;
1849         struct zcache_client *cli = pool->client;
1850
1851         BUG_ON(preemptible());
1852         BUG_ON(pampd_is_remote(pampd));
1853         if (is_ephemeral(pool)) {
1854                 local_irq_save(flags);
1855                 if (raw)
1856                         zbud_copy_from_pampd(data, bufsize, pampd);
1857                 else
1858                         ret = zbud_decompress(virt_to_page(data), pampd);
1859                 zbud_free_and_delist((struct zbud_hdr *)pampd);
1860                 local_irq_restore(flags);
1861                 if (!is_local_client(cli)) {
1862                         atomic_dec(&ramster_foreign_eph_pampd_count);
1863                         WARN_ON_ONCE(atomic_read(&ramster_foreign_eph_pampd_count) < 0);
1864                 }
1865                 atomic_dec(&zcache_curr_eph_pampd_count);
1866                 WARN_ON_ONCE(atomic_read(&zcache_curr_eph_pampd_count) < 0);
1867         } else {
1868                 if (is_local_client(cli))
1869                         BUG();
1870                 if (raw)
1871                         zv_copy_from_pampd(data, bufsize, pampd);
1872                 else
1873                         zv_decompress(virt_to_page(data), pampd);
1874                 zv_free(cli->xvpool, pampd);
1875                 if (!is_local_client(cli)) {
1876                         atomic_dec(&ramster_foreign_pers_pampd_count);
1877                         WARN_ON_ONCE(atomic_read(&ramster_foreign_pers_pampd_count) < 0);
1878                 }
1879                 atomic_dec(&zcache_curr_pers_pampd_count);
1880                 WARN_ON_ONCE(atomic_read(&zcache_curr_pers_pampd_count) < 0);
1881                 ret = 0;
1882         }
1883         return ret;
1884 }
1885
1886 static bool zcache_pampd_is_remote(void *pampd)
1887 {
1888         return pampd_is_remote(pampd);
1889 }
1890
1891 /*
1892  * free the pampd and remove it from any zcache lists
1893  * pampd must no longer be pointed to from any tmem data structures!
1894  */
1895 static void zcache_pampd_free(void *pampd, struct tmem_pool *pool,
1896                               struct tmem_oid *oid, uint32_t index, bool acct)
1897 {
1898         struct zcache_client *cli = pool->client;
1899         bool eph = is_ephemeral(pool);
1900         struct zv_hdr *zv;
1901
1902         BUG_ON(preemptible());
1903         if (pampd_is_remote(pampd)) {
1904                 WARN_ON(acct == false);
1905                 if (oid == NULL) {
1906                         /*
1907                          * a NULL oid means to ignore this pampd free
1908                          * as the remote freeing will be handled elsewhere
1909                          */
1910                 } else if (eph) {
1911                         /* FIXME remote flush optional but probably good idea */
1912                         /* FIXME get these working properly again */
1913                         atomic_dec(&zcache_curr_eph_pampd_count);
1914                         WARN_ON_ONCE(atomic_read(&zcache_curr_eph_pampd_count) < 0);
1915                 } else if (pampd_is_intransit(pampd)) {
1916                         /* did a pers remote get_and_free, so just free local */
1917                         pampd = pampd_mask_intransit_and_remote(pampd);
1918                         goto local_pers;
1919                 } else {
1920                         struct flushlist_node *flnode =
1921                                 ramster_flnode_alloc(pool);
1922
1923                         flnode->xh.client_id = pampd_remote_node(pampd);
1924                         flnode->xh.pool_id = pool->pool_id;
1925                         flnode->xh.oid = *oid;
1926                         flnode->xh.index = index;
1927                         flnode->rem_op.op = RAMSTER_REMOTIFY_FLUSH_PAGE;
1928                         spin_lock(&zcache_rem_op_list_lock);
1929                         list_add(&flnode->rem_op.list, &zcache_rem_op_list);
1930                         spin_unlock(&zcache_rem_op_list_lock);
1931                         atomic_dec(&zcache_curr_pers_pampd_count);
1932                         WARN_ON_ONCE(atomic_read(&zcache_curr_pers_pampd_count) < 0);
1933                         atomic_dec(&ramster_remote_pers_pages);
1934                         WARN_ON_ONCE(atomic_read(&ramster_remote_pers_pages) < 0);
1935                 }
1936         } else if (eph) {
1937                 zbud_free_and_delist((struct zbud_hdr *)pampd);
1938                 if (!is_local_client(pool->client)) {
1939                         atomic_dec(&ramster_foreign_eph_pampd_count);
1940                         WARN_ON_ONCE(atomic_read(&ramster_foreign_eph_pampd_count) < 0);
1941                 }
1942                 if (acct)
1943                         atomic_dec(&zcache_curr_eph_pampd_count);
1944                 /* FIXME get these working properly again */
1945                 WARN_ON_ONCE(atomic_read(&zcache_curr_eph_pampd_count) < 0);
1946         } else {
1947 local_pers:
1948                 zv = (struct zv_hdr *)pampd;
1949                 if (!is_local_client(pool->client)) {
1950                         atomic_dec(&ramster_foreign_pers_pampd_count);
1951                         WARN_ON_ONCE(atomic_read(&ramster_foreign_pers_pampd_count) < 0);
1952                 }
1953                 zv_free(cli->xvpool, zv);
1954                 if (acct)
1955                         atomic_dec(&zcache_curr_pers_pampd_count);
1956                 /* FIXME get these working properly again */
1957                 WARN_ON_ONCE(atomic_read(&zcache_curr_pers_pampd_count) < 0);
1958         }
1959 }
1960
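/*
 * If this object's pages were remotified (obj->extra records the remote
 * node), queue a flush of the entire remote object.
 */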
1961 static void zcache_pampd_free_obj(struct tmem_pool *pool,
1962                                         struct tmem_obj *obj)
1963 {
1964         struct flushlist_node *flnode;
1965
1966         BUG_ON(preemptible());
1967         if (obj->extra == NULL)
1968                 return;
1969         BUG_ON(!pampd_is_remote(obj->extra));
1970         flnode = ramster_flnode_alloc(pool);
1971         flnode->xh.client_id = pampd_remote_node(obj->extra);
1972         flnode->xh.pool_id = pool->pool_id;
1973         flnode->xh.oid = obj->oid;
1974         flnode->xh.index = FLUSH_ENTIRE_OBJECT;
1975         flnode->rem_op.op = RAMSTER_REMOTIFY_FLUSH_OBJ;
1976         spin_lock(&zcache_rem_op_list_lock);
1977         list_add(&flnode->rem_op.list, &zcache_rem_op_list);
1978         spin_unlock(&zcache_rem_op_list_lock);
1979 }
1980
1981 void zcache_pampd_new_obj(struct tmem_obj *obj)
1982 {
1983         obj->extra = NULL;
1984 }
1985
1986 int zcache_pampd_replace_in_obj(void *new_pampd, struct tmem_obj *obj)
1987 {
1988         int ret = -1;
1989
1990         if (new_pampd != NULL) {
1991                 if (obj->extra == NULL)
1992                         obj->extra = new_pampd;
1993                 /* enforce that all remote pages in an object reside
1994                  * in the same node! */
1995                 else if (pampd_remote_node(new_pampd) !=
1996                                 pampd_remote_node((void *)(obj->extra)))
1997                         BUG();
1998                 ret = 0;
1999         }
2000         return ret;
2001 }
2002
2003 /*
2004  * Called by the message handler after a (still compressed) page has been
2005  * fetched from the remote machine in response to an "is_remote" tmem_get
2006  * or persistent tmem_localify.  For a tmem_get, "extra" is the address of
2007  * the page that is to be filled to successfully resolve the tmem_get; for
2008  * a (persistent) tmem_localify, "extra" is NULL (as the data is placed only
2009  * in the local zcache).  "data" points to "size" bytes of (compressed) data
2010  * passed in the message.  In the case of a persistent remote get, if
2011  * pre-allocation was successful (see zcache_repatriate_preload), the page
2012  * is placed into both local zcache and at "extra".
2013  */
2014 int zcache_localify(int pool_id, struct tmem_oid *oidp,
2015                         uint32_t index, char *data, size_t size,
2016                         void *extra)
2017 {
2018         int ret = -ENOENT;
2019         unsigned long flags;
2020         struct tmem_pool *pool;
2021         bool ephemeral, delete = false;
2022         size_t clen = PAGE_SIZE;
2023         void *pampd, *saved_hb;
2024         struct tmem_obj *obj;
2025
2026         pool = zcache_get_pool_by_id(LOCAL_CLIENT, pool_id);
2027         if (unlikely(pool == NULL))
2028                 /* pool doesn't exist anymore */
2029                 goto out;
2030         ephemeral = is_ephemeral(pool);
2031         local_irq_save(flags);  /* FIXME: maybe only disable softirqs? */
2032         pampd = tmem_localify_get_pampd(pool, oidp, index, &obj, &saved_hb);
2033         if (pampd == NULL) {
2034                 /* hmmm... must have been a flush while waiting */
2035 #ifdef RAMSTER_TESTING
2036                 pr_err("UNTESTED pampd==NULL in zcache_localify\n");
2037 #endif
2038                 if (ephemeral)
2039                         ramster_remote_eph_pages_unsucc_get++;
2040                 else
2041                         ramster_remote_pers_pages_unsucc_get++;
2042                 obj = NULL;
2043                 goto finish;
2044         } else if (unlikely(!pampd_is_remote(pampd))) {
2045                 /* hmmm... must have been a dup put while waiting */
2046 #ifdef RAMSTER_TESTING
2047                 pr_err("UNTESTED dup while waiting in zcache_localify\n");
2048 #endif
2049                 if (ephemeral)
2050                         ramster_remote_eph_pages_unsucc_get++;
2051                 else
2052                         ramster_remote_pers_pages_unsucc_get++;
2053                 obj = NULL;
2054                 pampd = NULL;
2055                 ret = -EEXIST;
2056                 goto finish;
2057         } else if (size == 0) {
2058                 /* no remote data, delete the local is_remote pampd */
2059                 pampd = NULL;
2060                 if (ephemeral)
2061                         ramster_remote_eph_pages_unsucc_get++;
2062                 else
2063                         BUG();
2064                 delete = true;
2065                 goto finish;
2066         }
2067         if (!ephemeral && pampd_is_intransit(pampd)) {
2068                 /* localify to zcache */
2069                 pampd = pampd_mask_intransit_and_remote(pampd);
2070                 zv_copy_to_pampd(pampd, data, size);
2071         } else {
2072                 pampd = NULL;
2073                 obj = NULL;
2074         }
2075         if (extra != NULL) {
2076                 /* decompress direct-to-memory to complete remotify */
2077                 ret = lzo1x_decompress_safe((char *)data, size,
2078                                                 (char *)extra, &clen);
2079                 BUG_ON(ret != LZO_E_OK);
2080                 BUG_ON(clen != PAGE_SIZE);
2081         }
2082         if (ephemeral)
2083                 ramster_remote_eph_pages_succ_get++;
2084         else
2085                 ramster_remote_pers_pages_succ_get++;
2086         ret = 0;
2087 finish:
2088         tmem_localify_finish(obj, index, pampd, saved_hb, delete);
2089         zcache_put_pool(pool);
2090         local_irq_restore(flags);
2091 out:
2092         return ret;
2093 }
2094
2095 /*
2096  * Called on a remote persistent tmem_get to attempt to preallocate
2097  * local storage for the data contained in the remote persistent page.
2098  * If successfully preallocated, returns the pampd, marked as remote and
2099  * in_transit.  Else returns NULL.  Note that the appropriate tmem data
2100  * structure must be locked.
2101  */
2102 static void *zcache_pampd_repatriate_preload(void *pampd,
2103                                                 struct tmem_pool *pool,
2104                                                 struct tmem_oid *oid,
2105                                                 uint32_t index,
2106                                                 bool *intransit)
2107 {
2108         int clen = pampd_remote_size(pampd);
2109         void *ret_pampd = NULL;
2110         unsigned long flags;
2111
2112         if (!pampd_is_remote(pampd))
2113                 BUG();
2114         if (is_ephemeral(pool))
2115                 BUG();
2116         if (pampd_is_intransit(pampd)) {
2117                 /*
2118                  * to avoid multiple allocations (and maybe a memory leak)
2119                  * don't preallocate if already in the process of being
2120                  * repatriated
2121                  */
2122                 *intransit = true;
2123                 goto out;
2124         }
2125         *intransit = false;
2126         local_irq_save(flags);
2127         ret_pampd = (void *)zv_alloc(pool, oid, index, clen);
2128         if (ret_pampd != NULL) {
2129                 /*
2130                  *  a pampd is marked intransit if it is remote and space has
2131                  *  been allocated for it locally (note, only happens for
2132                  *  persistent pages, in which case the remote copy is freed)
2133                  */
2134                 ret_pampd = pampd_mark_intransit(ret_pampd);
2135                 atomic_dec(&ramster_remote_pers_pages);
2136                 WARN_ON_ONCE(atomic_read(&ramster_remote_pers_pages) < 0);
2137         } else
2138                 ramster_pers_pages_remote_nomem++;
2139         local_irq_restore(flags);
2140 out:
2141         return ret_pampd;
2142 }
2143
2144 /*
2145  * Called on a remote tmem_get to invoke a message to fetch the page.
2146  * Might sleep so no tmem locks can be held.  "extra" is passed
2147  * all the way through the round-trip messaging to zcache_localify.
2148  */
2149 static int zcache_pampd_repatriate(void *fake_pampd, void *real_pampd,
2150                                    struct tmem_pool *pool,
2151                                    struct tmem_oid *oid, uint32_t index,
2152                                    bool free, void *extra)
2153 {
2154         struct tmem_xhandle xh;
2155         int ret;
2156
2157         if (pampd_is_intransit(real_pampd))
2158                 /* have local space pre-reserved, so free remote copy */
2159                 free = true;
2160         xh = tmem_xhandle_fill(LOCAL_CLIENT, pool, oid, index);
2161         /* unreliable request/response for now */
2162         ret = ramster_remote_async_get(&xh, free,
2163                                         pampd_remote_node(fake_pampd),
2164                                         pampd_remote_size(fake_pampd),
2165                                         pampd_remote_cksum(fake_pampd),
2166                                         extra);
2167 #ifdef RAMSTER_TESTING
2168         if (ret != 0 && ret != -ENOENT)
2169                 pr_err("TESTING zcache_pampd_repatriate returns, ret=%d\n",
2170                         ret);
2171 #endif
2172         return ret;
2173 }
2174
2175 static struct tmem_pamops zcache_pamops = {
2176         .create = zcache_pampd_create,
2177         .get_data = zcache_pampd_get_data,
2178         .free = zcache_pampd_free,
2179         .get_data_and_free = zcache_pampd_get_data_and_free,
2180         .free_obj = zcache_pampd_free_obj,
2181         .is_remote = zcache_pampd_is_remote,
2182         .repatriate_preload = zcache_pampd_repatriate_preload,
2183         .repatriate = zcache_pampd_repatriate,
2184         .new_obj = zcache_pampd_new_obj,
2185         .replace_in_obj = zcache_pampd_replace_in_obj,
2186 };
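
/*
 * Illustrative sketch (not compiled into the driver): at init time tmem is
 * pointed at the host ops and PAM ops defined above, roughly as follows.
 */
#if 0
        tmem_register_hostops(&zcache_hostops);
        tmem_register_pamops(&zcache_pamops);
#endif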
2187
2188 /*
2189  * zcache compression/decompression and related per-cpu stuff
2190  */
2191
2192 #define LZO_WORKMEM_BYTES LZO1X_1_MEM_COMPRESS
2193 #define LZO_DSTMEM_PAGE_ORDER 1
2194 static DEFINE_PER_CPU(unsigned char *, zcache_workmem);
2195 static DEFINE_PER_CPU(unsigned char *, zcache_dstmem);
2196
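/*
 * Compress the page into this cpu's dstmem buffer.  Returns 1 on success
 * (with *out_va/*out_len set) or 0 if the per-cpu buffers are unavailable.
 */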
2197 static int zcache_compress(struct page *from, void **out_va, size_t *out_len)
2198 {
2199         int ret = 0;
2200         unsigned char *dmem = __get_cpu_var(zcache_dstmem);
2201         unsigned char *wmem = __get_cpu_var(zcache_workmem);
2202         char *from_va;
2203
2204         BUG_ON(!irqs_disabled());
2205         if (unlikely(dmem == NULL || wmem == NULL))
2206                 goto out;  /* no buffer, so can't compress */
2207         from_va = kmap_atomic(from, KM_USER0);
2208         mb();
2209         ret = lzo1x_1_compress(from_va, PAGE_SIZE, dmem, out_len, wmem);
2210         BUG_ON(ret != LZO_E_OK);
2211         *out_va = dmem;
2212         kunmap_atomic(from_va, KM_USER0);
2213         ret = 1;
2214 out:
2215         return ret;
2216 }
2217
2218
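/*
 * Allocate (CPU_UP_PREPARE) or free (CPU_DEAD/CPU_UP_CANCELED) the per-cpu
 * compression buffers and any unused preloaded objects for a cpu.
 */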
2219 static int zcache_cpu_notifier(struct notifier_block *nb,
2220                                 unsigned long action, void *pcpu)
2221 {
2222         int cpu = (long)pcpu;
2223         struct zcache_preload *kp;
2224
2225         switch (action) {
2226         case CPU_UP_PREPARE:
2227                 per_cpu(zcache_dstmem, cpu) = (void *)__get_free_pages(
2228                         GFP_KERNEL | __GFP_REPEAT,
2229                         LZO_DSTMEM_PAGE_ORDER);
2230                 per_cpu(zcache_workmem, cpu) =
2231                         kzalloc(LZO1X_MEM_COMPRESS,
2232                                 GFP_KERNEL | __GFP_REPEAT);
2233                 per_cpu(zcache_remoteputmem, cpu) =
2234                         kzalloc(PAGE_SIZE, GFP_KERNEL | __GFP_REPEAT);
2235                 break;
2236         case CPU_DEAD:
2237         case CPU_UP_CANCELED:
2238                 kfree(per_cpu(zcache_remoteputmem, cpu));
2239                 per_cpu(zcache_remoteputmem, cpu) = NULL;
2240                 free_pages((unsigned long)per_cpu(zcache_dstmem, cpu),
2241                                 LZO_DSTMEM_PAGE_ORDER);
2242                 per_cpu(zcache_dstmem, cpu) = NULL;
2243                 kfree(per_cpu(zcache_workmem, cpu));
2244                 per_cpu(zcache_workmem, cpu) = NULL;
2245                 kp = &per_cpu(zcache_preloads, cpu);
2246                 while (kp->nr) {
2247                         kmem_cache_free(zcache_objnode_cache,
2248                                         kp->objnodes[kp->nr - 1]);
2249                         kp->objnodes[kp->nr - 1] = NULL;
2250                         kp->nr--;
2251                 }
2252                 if (kp->obj) {
2253                         kmem_cache_free(zcache_obj_cache, kp->obj);
2254                         kp->obj = NULL;
2255                 }
2256                 if (kp->flnode) {
2257                         kmem_cache_free(ramster_flnode_cache, kp->flnode);
2258                         kp->flnode = NULL;
2259                 }
2260                 if (kp->page) {
2261                         free_page((unsigned long)kp->page);
2262                         kp->page = NULL;
2263                 }
2264                 break;
2265         default:
2266                 break;
2267         }
2268         return NOTIFY_OK;
2269 }
2270
2271 static struct notifier_block zcache_cpu_notifier_block = {
2272         .notifier_call = zcache_cpu_notifier
2273 };
2274
2275 #ifdef CONFIG_SYSFS
2276 #define ZCACHE_SYSFS_RO(_name) \
2277         static ssize_t zcache_##_name##_show(struct kobject *kobj, \
2278                                 struct kobj_attribute *attr, char *buf) \
2279         { \
2280                 return sprintf(buf, "%lu\n", zcache_##_name); \
2281         } \
2282         static struct kobj_attribute zcache_##_name##_attr = { \
2283                 .attr = { .name = __stringify(_name), .mode = 0444 }, \
2284                 .show = zcache_##_name##_show, \
2285         }
2286
2287 #define ZCACHE_SYSFS_RO_ATOMIC(_name) \
2288         static ssize_t zcache_##_name##_show(struct kobject *kobj, \
2289                                 struct kobj_attribute *attr, char *buf) \
2290         { \
2291             return sprintf(buf, "%d\n", atomic_read(&zcache_##_name)); \
2292         } \
2293         static struct kobj_attribute zcache_##_name##_attr = { \
2294                 .attr = { .name = __stringify(_name), .mode = 0444 }, \
2295                 .show = zcache_##_name##_show, \
2296         }
2297
2298 #define ZCACHE_SYSFS_RO_CUSTOM(_name, _func) \
2299         static ssize_t zcache_##_name##_show(struct kobject *kobj, \
2300                                 struct kobj_attribute *attr, char *buf) \
2301         { \
2302             return _func(buf); \
2303         } \
2304         static struct kobj_attribute zcache_##_name##_attr = { \
2305                 .attr = { .name = __stringify(_name), .mode = 0444 }, \
2306                 .show = zcache_##_name##_show, \
2307         }
2308
2309 ZCACHE_SYSFS_RO(curr_obj_count_max);
2310 ZCACHE_SYSFS_RO(curr_objnode_count_max);
2311 ZCACHE_SYSFS_RO(flush_total);
2312 ZCACHE_SYSFS_RO(flush_found);
2313 ZCACHE_SYSFS_RO(flobj_total);
2314 ZCACHE_SYSFS_RO(flobj_found);
2315 ZCACHE_SYSFS_RO(failed_eph_puts);
2316 ZCACHE_SYSFS_RO(nonactive_puts);
2317 ZCACHE_SYSFS_RO(failed_pers_puts);
2318 ZCACHE_SYSFS_RO(zbud_curr_zbytes);
2319 ZCACHE_SYSFS_RO(zbud_cumul_zpages);
2320 ZCACHE_SYSFS_RO(zbud_cumul_zbytes);
2321 ZCACHE_SYSFS_RO(zbud_buddied_count);
2322 ZCACHE_SYSFS_RO(evicted_raw_pages);
2323 ZCACHE_SYSFS_RO(evicted_unbuddied_pages);
2324 ZCACHE_SYSFS_RO(evicted_buddied_pages);
2325 ZCACHE_SYSFS_RO(failed_get_free_pages);
2326 ZCACHE_SYSFS_RO(failed_alloc);
2327 ZCACHE_SYSFS_RO(put_to_flush);
2328 ZCACHE_SYSFS_RO(compress_poor);
2329 ZCACHE_SYSFS_RO(mean_compress_poor);
2330 ZCACHE_SYSFS_RO(policy_percent_exceeded);
2331 ZCACHE_SYSFS_RO_ATOMIC(zbud_curr_raw_pages);
2332 ZCACHE_SYSFS_RO_ATOMIC(zbud_curr_zpages);
2333 ZCACHE_SYSFS_RO_ATOMIC(curr_obj_count);
2334 ZCACHE_SYSFS_RO_ATOMIC(curr_objnode_count);
2335 ZCACHE_SYSFS_RO_CUSTOM(zbud_unbuddied_list_counts,
2336                         zbud_show_unbuddied_list_counts);
2337 ZCACHE_SYSFS_RO_CUSTOM(zbud_cumul_chunk_counts,
2338                         zbud_show_cumul_chunk_counts);
2339 ZCACHE_SYSFS_RO_CUSTOM(zv_curr_dist_counts,
2340                         zv_curr_dist_counts_show);
2341 ZCACHE_SYSFS_RO_CUSTOM(zv_cumul_dist_counts,
2342                         zv_cumul_dist_counts_show);
2343
2344 static struct attribute *zcache_attrs[] = {
2345         &zcache_curr_obj_count_attr.attr,
2346         &zcache_curr_obj_count_max_attr.attr,
2347         &zcache_curr_objnode_count_attr.attr,
2348         &zcache_curr_objnode_count_max_attr.attr,
2349         &zcache_flush_total_attr.attr,
2350         &zcache_flobj_total_attr.attr,
2351         &zcache_flush_found_attr.attr,
2352         &zcache_flobj_found_attr.attr,
2353         &zcache_failed_eph_puts_attr.attr,
2354         &zcache_nonactive_puts_attr.attr,
2355         &zcache_failed_pers_puts_attr.attr,
2356         &zcache_policy_percent_exceeded_attr.attr,
2357         &zcache_compress_poor_attr.attr,
2358         &zcache_mean_compress_poor_attr.attr,
2359         &zcache_zbud_curr_raw_pages_attr.attr,
2360         &zcache_zbud_curr_zpages_attr.attr,
2361         &zcache_zbud_curr_zbytes_attr.attr,
2362         &zcache_zbud_cumul_zpages_attr.attr,
2363         &zcache_zbud_cumul_zbytes_attr.attr,
2364         &zcache_zbud_buddied_count_attr.attr,
2365         &zcache_evicted_raw_pages_attr.attr,
2366         &zcache_evicted_unbuddied_pages_attr.attr,
2367         &zcache_evicted_buddied_pages_attr.attr,
2368         &zcache_failed_get_free_pages_attr.attr,
2369         &zcache_failed_alloc_attr.attr,
2370         &zcache_put_to_flush_attr.attr,
2371         &zcache_zbud_unbuddied_list_counts_attr.attr,
2372         &zcache_zbud_cumul_chunk_counts_attr.attr,
2373         &zcache_zv_curr_dist_counts_attr.attr,
2374         &zcache_zv_cumul_dist_counts_attr.attr,
2375         &zcache_zv_max_zsize_attr.attr,
2376         &zcache_zv_max_mean_zsize_attr.attr,
2377         &zcache_zv_page_count_policy_percent_attr.attr,
2378         NULL,
2379 };
2380
2381 static struct attribute_group zcache_attr_group = {
2382         .attrs = zcache_attrs,
2383         .name = "zcache",
2384 };
2385
2386 #define RAMSTER_SYSFS_RO(_name) \
2387         static ssize_t ramster_##_name##_show(struct kobject *kobj, \
2388                                 struct kobj_attribute *attr, char *buf) \
2389         { \
2390                 return sprintf(buf, "%lu\n", ramster_##_name); \
2391         } \
2392         static struct kobj_attribute ramster_##_name##_attr = { \
2393                 .attr = { .name = __stringify(_name), .mode = 0444 }, \
2394                 .show = ramster_##_name##_show, \
2395         }
2396
2397 #define RAMSTER_SYSFS_RW(_name) \
2398         static ssize_t ramster_##_name##_show(struct kobject *kobj, \
2399                                 struct kobj_attribute *attr, char *buf) \
2400         { \
2401                 return sprintf(buf, "%lu\n", ramster_##_name); \
2402         } \
2403         static ssize_t ramster_##_name##_store(struct kobject *kobj, \
2404                 struct kobj_attribute *attr, const char *buf, size_t count) \
2405         { \
2406                 int err; \
2407                 unsigned long enable; \
2408                 err = strict_strtoul(buf, 10, &enable); \
2409                 if (err) \
2410                         return -EINVAL; \
2411                 ramster_##_name = enable; \
2412                 return count; \
2413         } \
2414         static struct kobj_attribute ramster_##_name##_attr = { \
2415                 .attr = { .name = __stringify(_name), .mode = 0644 }, \
2416                 .show = ramster_##_name##_show, \
2417                 .store = ramster_##_name##_store, \
2418         }
2419
2420 #define RAMSTER_SYSFS_RO_ATOMIC(_name) \
2421         static ssize_t ramster_##_name##_show(struct kobject *kobj, \
2422                                 struct kobj_attribute *attr, char *buf) \
2423         { \
2424             return sprintf(buf, "%d\n", atomic_read(&ramster_##_name)); \
2425         } \
2426         static struct kobj_attribute ramster_##_name##_attr = { \
2427                 .attr = { .name = __stringify(_name), .mode = 0444 }, \
2428                 .show = ramster_##_name##_show, \
2429         }
2430
2431 RAMSTER_SYSFS_RO_ATOMIC(remote_pers_pages);
2432 RAMSTER_SYSFS_RW(pers_remotify_enable);
2433 RAMSTER_SYSFS_RW(eph_remotify_enable);
2434 RAMSTER_SYSFS_RO(eph_pages_remoted);
2435 RAMSTER_SYSFS_RO(eph_pages_remote_failed);
2436 RAMSTER_SYSFS_RO(pers_pages_remoted);
2437 RAMSTER_SYSFS_RO(pers_pages_remote_failed);
2438 RAMSTER_SYSFS_RO(pers_pages_remote_nomem);
2439 RAMSTER_SYSFS_RO(remote_pages_flushed);
2440 RAMSTER_SYSFS_RO(remote_page_flushes_failed);
2441 RAMSTER_SYSFS_RO(remote_objects_flushed);
2442 RAMSTER_SYSFS_RO(remote_object_flushes_failed);
2443 RAMSTER_SYSFS_RO(remote_eph_pages_succ_get);
2444 RAMSTER_SYSFS_RO(remote_eph_pages_unsucc_get);
2445 RAMSTER_SYSFS_RO(remote_pers_pages_succ_get);
2446 RAMSTER_SYSFS_RO(remote_pers_pages_unsucc_get);
2447 RAMSTER_SYSFS_RO_ATOMIC(foreign_eph_pampd_count);
2448 RAMSTER_SYSFS_RO(foreign_eph_pampd_count_max);
2449 RAMSTER_SYSFS_RO_ATOMIC(foreign_pers_pampd_count);
2450 RAMSTER_SYSFS_RO(foreign_pers_pampd_count_max);
2451 RAMSTER_SYSFS_RO_ATOMIC(curr_flnode_count);
2452 RAMSTER_SYSFS_RO(curr_flnode_count_max);
2453
2454 #define MANUAL_NODES 8
2455 static bool ramster_nodes_manual_up[MANUAL_NODES];
2456 static ssize_t ramster_manual_node_up_show(struct kobject *kobj,
2457                                 struct kobj_attribute *attr, char *buf)
2458 {
2459         int i;
2460         char *p = buf;
2461         for (i = 0; i < MANUAL_NODES; i++)
2462                 if (ramster_nodes_manual_up[i])
2463                         p += sprintf(p, "%d ", i);
2464         p += sprintf(p, "\n");
2465         return p - buf;
2466 }
2467
2468 static ssize_t ramster_manual_node_up_store(struct kobject *kobj,
2469                 struct kobj_attribute *attr, const char *buf, size_t count)
2470 {
2471         int err;
2472         unsigned long node_num;
2473
2474         err = strict_strtoul(buf, 10, &node_num);
2475         if (err) {
2476                 pr_err("bad strtoul?\n");
2477                 return -EINVAL;
2478         }
2479         if (node_num >= MANUAL_NODES) {
2480                 pr_err("bad node_num=%lu?\n", node_num);
2481                 return -EINVAL;
2482         }
2483         if (ramster_nodes_manual_up[node_num]) {
2484                 pr_err("node %d already up, ignoring\n", (int)node_num);
2485         } else {
2486                 ramster_nodes_manual_up[node_num] = true;
2487                 o2net_hb_node_up_manual((int)node_num);
2488         }
2489         return count;
2490 }
2491
2492 static struct kobj_attribute ramster_manual_node_up_attr = {
2493         .attr = { .name = "manual_node_up", .mode = 0644 },
2494         .show = ramster_manual_node_up_show,
2495         .store = ramster_manual_node_up_store,
2496 };
2497
2498 static struct attribute *ramster_attrs[] = {
2499         &ramster_pers_remotify_enable_attr.attr,
2500         &ramster_eph_remotify_enable_attr.attr,
2501         &ramster_remote_pers_pages_attr.attr,
2502         &ramster_eph_pages_remoted_attr.attr,
2503         &ramster_eph_pages_remote_failed_attr.attr,
2504         &ramster_pers_pages_remoted_attr.attr,
2505         &ramster_pers_pages_remote_failed_attr.attr,
2506         &ramster_pers_pages_remote_nomem_attr.attr,
2507         &ramster_remote_pages_flushed_attr.attr,
2508         &ramster_remote_page_flushes_failed_attr.attr,
2509         &ramster_remote_objects_flushed_attr.attr,
2510         &ramster_remote_object_flushes_failed_attr.attr,
2511         &ramster_remote_eph_pages_succ_get_attr.attr,
2512         &ramster_remote_eph_pages_unsucc_get_attr.attr,
2513         &ramster_remote_pers_pages_succ_get_attr.attr,
2514         &ramster_remote_pers_pages_unsucc_get_attr.attr,
2515         &ramster_foreign_eph_pampd_count_attr.attr,
2516         &ramster_foreign_eph_pampd_count_max_attr.attr,
2517         &ramster_foreign_pers_pampd_count_attr.attr,
2518         &ramster_foreign_pers_pampd_count_max_attr.attr,
2519         &ramster_curr_flnode_count_attr.attr,
2520         &ramster_curr_flnode_count_max_attr.attr,
2521         &ramster_manual_node_up_attr.attr,
2522         NULL,
2523 };
2524
2525 static struct attribute_group ramster_attr_group = {
2526         .attrs = ramster_attrs,
2527         .name = "ramster",
2528 };
2529
2530 #endif /* CONFIG_SYSFS */
2531 /*
2532  * When zcache is disabled ("frozen"), pools can be created and destroyed,
2533  * but all puts (and thus all other operations that require memory allocation)
2534  * must fail.  If zcache is unfrozen, accepts some puts, and is then frozen
2535  * again, data consistency requires that any put attempted while frozen be
2536  * converted into a flush.
2537  */
2538 static bool zcache_freeze;
2539
2540 /*
2541  * zcache shrinker interface (only useful for ephemeral pages, so zbud only)
2542  */
2543 static int shrink_zcache_memory(struct shrinker *shrink,
2544                                 struct shrink_control *sc)
2545 {
2546         int ret = -1;
2547         int nr = sc->nr_to_scan;
2548         gfp_t gfp_mask = sc->gfp_mask;
2549
2550         if (nr >= 0) {
2551                 if (!(gfp_mask & __GFP_FS))
2552                         /* does this case really need to be skipped? */
2553                         goto out;
2554                 zbud_evict_pages(nr);
2555         }
2556         ret = (int)atomic_read(&zcache_zbud_curr_raw_pages);
2557 out:
2558         return ret;
2559 }
2560
2561 static struct shrinker zcache_shrinker = {
2562         .shrink = shrink_zcache_memory,
2563         .seeks = DEFAULT_SEEKS,
2564 };
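
/*
 * Illustrative sketch (not compiled into the driver): the shrinker above is
 * registered during initialization, roughly as follows, so that memory
 * pressure can trigger zbud eviction of ephemeral pages.
 */
#if 0
        register_shrinker(&zcache_shrinker);
#endif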
2565
2566 /*
2567  * zcache shims between cleancache/frontswap ops and tmem
2568  */
2569
2570 int zcache_put(int cli_id, int pool_id, struct tmem_oid *oidp,
2571                         uint32_t index, char *data, size_t size,
2572                         bool raw, int ephemeral)
2573 {
2574         struct tmem_pool *pool;
2575         int ret = -1;
2576
2577         BUG_ON(!irqs_disabled());
2578         pool = zcache_get_pool_by_id(cli_id, pool_id);
2579         if (unlikely(pool == NULL))
2580                 goto out;
2581         if (!zcache_freeze && zcache_do_preload(pool) == 0) {
2582                 /* preload does preempt_disable on success */
2583                 ret = tmem_put(pool, oidp, index, data, size, raw, ephemeral);
2584                 if (ret < 0) {
2585                         if (is_ephemeral(pool))
2586                                 zcache_failed_eph_puts++;
2587                         else
2588                                 zcache_failed_pers_puts++;
2589                 }
2590                 zcache_put_pool(pool);
2591                 preempt_enable_no_resched();
2592         } else {
2593                 zcache_put_to_flush++;
2594                 if (atomic_read(&pool->obj_count) > 0)
2595                         /* the put fails whether the flush succeeds or not */
2596                         (void)tmem_flush_page(pool, oidp, index);
2597                 zcache_put_pool(pool);
2598         }
2599 out:
2600         return ret;
2601 }
2602
2603 int zcache_get(int cli_id, int pool_id, struct tmem_oid *oidp,
2604                         uint32_t index, char *data, size_t *sizep,
2605                         bool raw, int get_and_free)
2606 {
2607         struct tmem_pool *pool;
2608         int ret = -1;
2609         bool eph;
2610
2611         if (!raw) {
2612                 BUG_ON(irqs_disabled());
2613                 BUG_ON(in_softirq());
2614         }
2615         pool = zcache_get_pool_by_id(cli_id, pool_id);
2616         eph = is_ephemeral(pool);
2617         if (likely(pool != NULL)) {
2618                 if (atomic_read(&pool->obj_count) > 0)
2619                         ret = tmem_get(pool, oidp, index, data, sizep,
2620                                         raw, get_and_free);
2621                 zcache_put_pool(pool);
2622         }
2623         WARN_ONCE((!eph && (ret != 0)), "zcache_get fails on persistent pool, "
2624                           "bad things are very likely to happen soon\n");
2625 #ifdef RAMSTER_TESTING
2626         if (ret != 0 && ret != -1 && !(ret == -EINVAL && is_ephemeral(pool)))
2627                 pr_err("TESTING zcache_get tmem_get returns ret=%d\n", ret);
2628 #endif
2629         if (ret == -EAGAIN)
2630                 BUG(); /* FIXME... don't need this anymore??? let's ensure */
2631         return ret;
2632 }
2633
2634 int zcache_flush(int cli_id, int pool_id,
2635                                 struct tmem_oid *oidp, uint32_t index)
2636 {
2637         struct tmem_pool *pool;
2638         int ret = -1;
2639         unsigned long flags;
2640
2641         local_irq_save(flags);
2642         zcache_flush_total++;
2643         pool = zcache_get_pool_by_id(cli_id, pool_id);
2644         ramster_do_preload_flnode_only(pool);
2645         if (likely(pool != NULL)) {
2646                 if (atomic_read(&pool->obj_count) > 0)
2647                         ret = tmem_flush_page(pool, oidp, index);
2648                 zcache_put_pool(pool);
2649         }
2650         if (ret >= 0)
2651                 zcache_flush_found++;
2652         local_irq_restore(flags);
2653         return ret;
2654 }
2655
2656 int zcache_flush_object(int cli_id, int pool_id, struct tmem_oid *oidp)
2657 {
2658         struct tmem_pool *pool;
2659         int ret = -1;
2660         unsigned long flags;
2661
2662         local_irq_save(flags);
2663         zcache_flobj_total++;
2664         pool = zcache_get_pool_by_id(cli_id, pool_id);
2665         ramster_do_preload_flnode_only(pool);
2666         if (likely(pool != NULL)) {
2667                 if (atomic_read(&pool->obj_count) > 0)
2668                         ret = tmem_flush_object(pool, oidp);
2669                 zcache_put_pool(pool);
2670         }
2671         if (ret >= 0)
2672                 zcache_flobj_found++;
2673         local_irq_restore(flags);
2674         return ret;
2675 }
2676
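/*
 * Detach and destroy a pool, spinning until all concurrent users of the
 * pool have dropped their references.
 */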
2677 int zcache_client_destroy_pool(int cli_id, int pool_id)
2678 {
2679         struct tmem_pool *pool = NULL;
2680         struct zcache_client *cli = NULL;
2681         int ret = -1;
2682
2683         if (pool_id < 0)
2684                 goto out;
2685         if (cli_id == LOCAL_CLIENT)
2686                 cli = &zcache_host;
2687         else if ((unsigned int)cli_id < MAX_CLIENTS)
2688                 cli = &zcache_clients[cli_id];
2689         if (cli == NULL)
2690                 goto out;
2691         atomic_inc(&cli->refcount);
2692         pool = cli->tmem_pools[pool_id];
2693         if (pool == NULL)
2694                 goto out;
2695         cli->tmem_pools[pool_id] = NULL;
2696         /* wait for pool activity on other cpus to quiesce */
2697         while (atomic_read(&pool->refcount) != 0)
2698                 ;
2699         atomic_dec(&cli->refcount);
2700         local_bh_disable();
2701         ret = tmem_destroy_pool(pool);
2702         local_bh_enable();
2703         kfree(pool);
2704         pr_info("ramster: destroyed pool id=%d cli_id=%d\n", pool_id, cli_id);
2705 out:
2706         return ret;
2707 }
2708
2709 static int zcache_destroy_pool(int pool_id)
2710 {
2711         return zcache_client_destroy_pool(LOCAL_CLIENT, pool_id);
2712 }
2713
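/* create a new pool for the given client; returns the new poolid, or -1 on failure */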
2714 int zcache_new_pool(uint16_t cli_id, uint32_t flags)
2715 {
2716         int poolid = -1;
2717         struct tmem_pool *pool;
2718         struct zcache_client *cli = NULL;
2719
2720         if (cli_id == LOCAL_CLIENT)
2721                 cli = &zcache_host;
2722         else if ((unsigned int)cli_id < MAX_CLIENTS)
2723                 cli = &zcache_clients[cli_id];
2724         if (cli == NULL)
2725                 goto out;
2726         atomic_inc(&cli->refcount);
2727         pool = kmalloc(sizeof(struct tmem_pool), GFP_ATOMIC);
2728         if (pool == NULL) {
2729                 pr_info("ramster: pool creation failed: out of memory\n");
2730                 goto out;
2731         }
2732
2733         for (poolid = 0; poolid < MAX_POOLS_PER_CLIENT; poolid++)
2734                 if (cli->tmem_pools[poolid] == NULL)
2735                         break;
2736         if (poolid >= MAX_POOLS_PER_CLIENT) {
2737                 pr_info("ramster: pool creation failed: max exceeded\n");
2738                 kfree(pool);
2739                 poolid = -1;
2740                 goto out;
2741         }
2742         atomic_set(&pool->refcount, 0);
2743         pool->client = cli;
2744         pool->pool_id = poolid;
2745         tmem_new_pool(pool, flags);
2746         cli->tmem_pools[poolid] = pool;
2747         pr_info("ramster: created %s tmem pool, id=%d, client=%d\n",
2748                 flags & TMEM_POOL_PERSIST ? "persistent" : "ephemeral",
2749                 poolid, cli_id);
2750 out:
2751         if (cli != NULL)
2752                 atomic_dec(&cli->refcount);
2753         return poolid;
2754 }
2755
2756 static int zcache_local_new_pool(uint32_t flags)
2757 {
2758         return zcache_new_pool(LOCAL_CLIENT, flags);
2759 }
2760
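/*
 * Create a pool (and, if necessary, the client) on behalf of a remote
 * client that references a pool which does not yet exist locally.
 */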
2761 int zcache_autocreate_pool(int cli_id, int pool_id, bool ephemeral)
2762 {
2763         struct tmem_pool *pool;
2764         struct zcache_client *cli = NULL;
2765         uint32_t flags = ephemeral ? 0 : TMEM_POOL_PERSIST;
2766         int ret = -1;
2767
2768         if (cli_id == LOCAL_CLIENT)
2769                 goto out;
2770         if (pool_id >= MAX_POOLS_PER_CLIENT)
2771                 goto out;
2772         else if ((unsigned int)cli_id < MAX_CLIENTS)
2773                 cli = &zcache_clients[cli_id];
2774         if ((ephemeral && !use_cleancache) || (!ephemeral && !use_frontswap))
2775                 BUG(); /* FIXME, handle more gracefully later */
2776         if (!cli->allocated) {
2777                 if (zcache_new_client(cli_id))
2778                         BUG(); /* FIXME, handle more gracefully later */
2779                 cli = &zcache_clients[cli_id];
2780         }
2781         atomic_inc(&cli->refcount);
2782         pool = cli->tmem_pools[pool_id];
2783         if (pool != NULL) {
2784                 if (pool->persistent && ephemeral) {
2785                         pr_err("zcache_autocreate_pool: type mismatch\n");
2786                         goto out;
2787                 }
2788                 ret = 0;
2789                 goto out;
2790         }
2791         pool = kmalloc(sizeof(struct tmem_pool), GFP_KERNEL);
2792         if (pool == NULL) {
2793                 pr_info("ramster: pool creation failed: out of memory\n");
2794                 goto out;
2795         }
2796         atomic_set(&pool->refcount, 0);
2797         pool->client = cli;
2798         pool->pool_id = pool_id;
2799         tmem_new_pool(pool, flags);
2800         cli->tmem_pools[pool_id] = pool;
2801         pr_info("ramster: AUTOcreated %s tmem poolid=%d, for remote client=%d\n",
2802                 flags & TMEM_POOL_PERSIST ? "persistent" : "ephemeral",
2803                 pool_id, cli_id);
2804         ret = 0;
2805 out:
2806         if (cli == NULL)
2807                 BUG(); /* FIXME, handle more gracefully later */
2808                 /* pr_err("zcache_autocreate_pool: failed\n"); */
2809         if (cli != NULL)
2810                 atomic_dec(&cli->refcount);
2811         return ret;
2812 }
2813
2814 /**********
 * Two kernel facilities can currently be layered on top of tmem.
2816  * These are "cleancache" which is used as a second-chance cache for clean
2817  * page cache pages; and "frontswap" which is used for swap pages
2818  * to avoid writes to disk.  A generic "shim" is provided here for each
2819  * to translate in-kernel semantics to zcache semantics.
2820  */
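/*
 * Both shims reuse the caller's handle directly: a cleancache file key
 * (or a swizzled swap type/offset, below) is reinterpreted as a
 * struct tmem_oid, and the page index (or its swizzled high bits)
 * becomes the tmem index.  The BUG_ON()s in the init routines guard the
 * size assumption behind the filekey cast.
 */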
2821
2822 #ifdef CONFIG_CLEANCACHE
2823 static void zcache_cleancache_put_page(int pool_id,
2824                                         struct cleancache_filekey key,
2825                                         pgoff_t index, struct page *page)
2826 {
2827         u32 ind = (u32) index;
2828         struct tmem_oid oid = *(struct tmem_oid *)&key;
2829
2830 #ifdef __PG_WAS_ACTIVE
2831         if (!PageWasActive(page)) {
2832                 zcache_nonactive_puts++;
2833                 return;
2834         }
2835 #endif
2836         if (likely(ind == index)) {
2837                 char *kva = page_address(page);
2838
2839                 (void)zcache_put(LOCAL_CLIENT, pool_id, &oid, index,
2840                         kva, PAGE_SIZE, 0, 1);
2841         }
2842 }
2843
2844 static int zcache_cleancache_get_page(int pool_id,
2845                                         struct cleancache_filekey key,
2846                                         pgoff_t index, struct page *page)
2847 {
2848         u32 ind = (u32) index;
2849         struct tmem_oid oid = *(struct tmem_oid *)&key;
2850         int ret = -1;
2851
2852         preempt_disable();
2853         if (likely(ind == index)) {
2854                 char *kva = page_address(page);
2855                 size_t size = PAGE_SIZE;
2856
2857                 ret = zcache_get(LOCAL_CLIENT, pool_id, &oid, index,
2858                         kva, &size, 0, 0);
2859 #ifdef __PG_WAS_ACTIVE
2860                 if (ret == 0)
2861                         SetPageWasActive(page);
2862 #endif
2863         }
2864         preempt_enable();
2865         return ret;
2866 }
2867
2868 static void zcache_cleancache_flush_page(int pool_id,
2869                                         struct cleancache_filekey key,
2870                                         pgoff_t index)
2871 {
2872         u32 ind = (u32) index;
2873         struct tmem_oid oid = *(struct tmem_oid *)&key;
2874
2875         if (likely(ind == index))
2876                 (void)zcache_flush(LOCAL_CLIENT, pool_id, &oid, ind);
2877 }
2878
2879 static void zcache_cleancache_flush_inode(int pool_id,
2880                                         struct cleancache_filekey key)
2881 {
2882         struct tmem_oid oid = *(struct tmem_oid *)&key;
2883
2884         (void)zcache_flush_object(LOCAL_CLIENT, pool_id, &oid);
2885 }
2886
2887 static void zcache_cleancache_flush_fs(int pool_id)
2888 {
2889         if (pool_id >= 0)
2890                 (void)zcache_destroy_pool(pool_id);
2891 }
2892
2893 static int zcache_cleancache_init_fs(size_t pagesize)
2894 {
2895         BUG_ON(sizeof(struct cleancache_filekey) !=
2896                                 sizeof(struct tmem_oid));
2897         BUG_ON(pagesize != PAGE_SIZE);
2898         return zcache_local_new_pool(0);
2899 }
2900
2901 static int zcache_cleancache_init_shared_fs(char *uuid, size_t pagesize)
2902 {
2903         /* shared pools are unsupported and map to private */
2904         BUG_ON(sizeof(struct cleancache_filekey) !=
2905                                 sizeof(struct tmem_oid));
2906         BUG_ON(pagesize != PAGE_SIZE);
2907         return zcache_local_new_pool(0);
2908 }
2909
2910 static struct cleancache_ops zcache_cleancache_ops = {
2911         .put_page = zcache_cleancache_put_page,
2912         .get_page = zcache_cleancache_get_page,
2913         .invalidate_page = zcache_cleancache_flush_page,
2914         .invalidate_inode = zcache_cleancache_flush_inode,
2915         .invalidate_fs = zcache_cleancache_flush_fs,
2916         .init_shared_fs = zcache_cleancache_init_shared_fs,
2917         .init_fs = zcache_cleancache_init_fs
2918 };
2919
2920 struct cleancache_ops zcache_cleancache_register_ops(void)
2921 {
2922         struct cleancache_ops old_ops =
2923                 cleancache_register_ops(&zcache_cleancache_ops);
2924
2925         return old_ops;
2926 }
2927 #endif
2928
2929 #ifdef CONFIG_FRONTSWAP
2930 /* a single tmem poolid is used for all frontswap "types" (swapfiles) */
2931 static int zcache_frontswap_poolid = -1;
2932
2933 /*
2934  * Swizzling increases objects per swaptype, increasing tmem concurrency
2935  * for heavy swaploads.  Later, larger nr_cpus -> larger SWIZ_BITS
2936  */
2937 #define SWIZ_BITS               8
2938 #define SWIZ_MASK               ((1 << SWIZ_BITS) - 1)
#define _oswiz(_type, _ind)     (((_type) << SWIZ_BITS) | ((_ind) & SWIZ_MASK))
#define iswiz(_ind)             ((_ind) >> SWIZ_BITS)
2941
2942 static inline struct tmem_oid oswiz(unsigned type, u32 ind)
2943 {
2944         struct tmem_oid oid = { .oid = { 0 } };
2945         oid.oid[0] = _oswiz(type, ind);
2946         return oid;
2947 }
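
#if 0	/* not built: a worked example of the swizzling above */
/*
 * A minimal illustration, assuming SWIZ_BITS == 8: swap type 2, page
 * offset 0x12345 is stored in tmem object 0x245 at index 0x123, so
 * consecutive offsets fan out across (SWIZ_MASK + 1) distinct objects.
 */
static void swiz_example(void)
{
        unsigned type = 2;
        u32 ind = 0x12345;
        struct tmem_oid oid = oswiz(type, ind); /* oid.oid[0] == 0x245 */
        u32 index = iswiz(ind);                 /* index == 0x123 */

        (void)oid;
        (void)index;
}
#endif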
2948
2949 static int zcache_frontswap_put_page(unsigned type, pgoff_t offset,
2950                                    struct page *page)
2951 {
2952         u64 ind64 = (u64)offset;
2953         u32 ind = (u32)offset;
2954         struct tmem_oid oid = oswiz(type, ind);
2955         int ret = -1;
2956         unsigned long flags;
2957         char *kva;
2958
2959         BUG_ON(!PageLocked(page));
2960         if (likely(ind64 == ind)) {
2961                 local_irq_save(flags);
2962                 kva = page_address(page);
2963                 ret = zcache_put(LOCAL_CLIENT, zcache_frontswap_poolid,
2964                                 &oid, iswiz(ind), kva, PAGE_SIZE, 0, 0);
2965                 local_irq_restore(flags);
2966         }
2967         return ret;
2968 }
2969
/*
 * Returns 0 if the page was successfully fetched from frontswap, or -1 if
 * it was not present (should never happen!).
 */
2972 static int zcache_frontswap_get_page(unsigned type, pgoff_t offset,
2973                                    struct page *page)
2974 {
2975         u64 ind64 = (u64)offset;
2976         u32 ind = (u32)offset;
2977         struct tmem_oid oid = oswiz(type, ind);
2978         int ret = -1;
2979
2980         preempt_disable(); /* FIXME, remove this? */
2981         BUG_ON(!PageLocked(page));
2982         if (likely(ind64 == ind)) {
2983                 char *kva = page_address(page);
2984                 size_t size = PAGE_SIZE;
2985
2986                 ret = zcache_get(LOCAL_CLIENT, zcache_frontswap_poolid,
2987                                         &oid, iswiz(ind), kva, &size, 0, -1);
2988         }
2989         preempt_enable(); /* FIXME, remove this? */
2990         return ret;
2991 }
2992
2993 /* flush a single page from frontswap */
2994 static void zcache_frontswap_flush_page(unsigned type, pgoff_t offset)
2995 {
2996         u64 ind64 = (u64)offset;
2997         u32 ind = (u32)offset;
2998         struct tmem_oid oid = oswiz(type, ind);
2999
3000         if (likely(ind64 == ind))
3001                 (void)zcache_flush(LOCAL_CLIENT, zcache_frontswap_poolid,
3002                                         &oid, iswiz(ind));
3003 }
3004
3005 /* flush all pages from the passed swaptype */
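/*
 * A put for (type, offset) always lands in object oswiz(type, offset),
 * i.e. one of only (SWIZ_MASK + 1) objects per type, so flushing each of
 * those objects is enough to drop every page stored for this swap type.
 */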
3006 static void zcache_frontswap_flush_area(unsigned type)
3007 {
3008         struct tmem_oid oid;
3009         int ind;
3010
3011         for (ind = SWIZ_MASK; ind >= 0; ind--) {
3012                 oid = oswiz(type, ind);
3013                 (void)zcache_flush_object(LOCAL_CLIENT,
3014                                                 zcache_frontswap_poolid, &oid);
3015         }
3016 }
3017
3018 static void zcache_frontswap_init(unsigned ignored)
3019 {
3020         /* a single tmem poolid is used for all frontswap "types" (swapfiles) */
3021         if (zcache_frontswap_poolid < 0)
3022                 zcache_frontswap_poolid =
3023                                 zcache_local_new_pool(TMEM_POOL_PERSIST);
3024 }
3025
3026 static struct frontswap_ops zcache_frontswap_ops = {
3027         .put_page = zcache_frontswap_put_page,
3028         .get_page = zcache_frontswap_get_page,
3029         .invalidate_page = zcache_frontswap_flush_page,
3030         .invalidate_area = zcache_frontswap_flush_area,
3031         .init = zcache_frontswap_init
3032 };
3033
3034 struct frontswap_ops zcache_frontswap_register_ops(void)
3035 {
3036         struct frontswap_ops old_ops =
3037                 frontswap_register_ops(&zcache_frontswap_ops);
3038
3039         return old_ops;
3040 }
3041 #endif
3042
3043 /*
3044  * frontswap selfshrinking
3045  */
3046
3047 #ifdef CONFIG_FRONTSWAP
/* Interval, in seconds, between selfshrink worker invocations. */
3049 static unsigned int selfshrink_interval __read_mostly = 5;
3050
3051 static void selfshrink_process(struct work_struct *work);
3052 static DECLARE_DELAYED_WORK(selfshrink_worker, selfshrink_process);
3053
3054 /* Enable/disable with sysfs. */
3055 static bool frontswap_selfshrinking __read_mostly;
3056
3057 /* Enable/disable with kernel boot option. */
3058 static bool use_frontswap_selfshrink __initdata = true;
3059
3060 /*
3061  * The default values for the following parameters were deemed reasonable
3062  * by experimentation, may be workload-dependent, and can all be
3063  * adjusted via sysfs.
3064  */
3065
3066 /* Control rate for frontswap shrinking. Higher hysteresis is slower. */
3067 static unsigned int frontswap_hysteresis __read_mostly = 20;
3068
3069 /*
 * Number of selfshrink worker invocations to wait, once frontswap usage
 * stops growing, before any shrinking is actually attempted.  Note that
 * selfshrinking does not use a separate worker thread.
3073  */
3074 static unsigned int frontswap_inertia __read_mostly = 3;
3075
3076 /* Countdown to next invocation of frontswap_shrink() */
3077 static unsigned long frontswap_inertia_counter;
3078
3079 /*
 * Invoked from the selfshrink worker, uses the current number of pages
3081  * in frontswap (frontswap_curr_pages()), previous status, and control
3082  * values (hysteresis and inertia) to determine if frontswap should be
3083  * shrunk and what the new frontswap size should be.  Note that
3084  * frontswap_shrink is essentially a partial swapoff that immediately
3085  * transfers pages from the "swap device" (frontswap) back into kernel
3086  * RAM; despite the name, frontswap "shrinking" is very different from
3087  * the "shrinker" interface used by the kernel MM subsystem to reclaim
3088  * memory.
3089  */
3090 static void frontswap_selfshrink(void)
3091 {
3092         static unsigned long cur_frontswap_pages;
3093         static unsigned long last_frontswap_pages;
3094         static unsigned long tgt_frontswap_pages;
3095
3096         last_frontswap_pages = cur_frontswap_pages;
3097         cur_frontswap_pages = frontswap_curr_pages();
3098         if (!cur_frontswap_pages ||
3099                         (cur_frontswap_pages > last_frontswap_pages)) {
3100                 frontswap_inertia_counter = frontswap_inertia;
3101                 return;
3102         }
3103         if (frontswap_inertia_counter && --frontswap_inertia_counter)
3104                 return;
3105         if (cur_frontswap_pages <= frontswap_hysteresis)
3106                 tgt_frontswap_pages = 0;
3107         else
3108                 tgt_frontswap_pages = cur_frontswap_pages -
3109                         (cur_frontswap_pages / frontswap_hysteresis);
3110         frontswap_shrink(tgt_frontswap_pages);
3111 }
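
/*
 * Worked example with the defaults above (interval = 5s, hysteresis = 20,
 * inertia = 3): once frontswap holds, say, 2000 pages and stops growing
 * for three consecutive intervals, the target becomes
 * 2000 - 2000/20 = 1900 pages, i.e. roughly 5% is pulled back into RAM
 * per interval; once 20 pages or fewer remain, the target drops to zero.
 */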
3112
3113 static int __init ramster_nofrontswap_selfshrink_setup(char *s)
3114 {
3115         use_frontswap_selfshrink = false;
3116         return 1;
3117 }
3118
3119 __setup("noselfshrink", ramster_nofrontswap_selfshrink_setup);
3120
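/*
 * Delayed-work handler: re-arms itself every selfshrink_interval seconds
 * for as long as both frontswap_selfshrinking and frontswap itself are
 * enabled; once either goes false the work simply stops rescheduling.
 */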
3121 static void selfshrink_process(struct work_struct *work)
3122 {
3123         if (frontswap_selfshrinking && frontswap_enabled) {
3124                 frontswap_selfshrink();
3125                 schedule_delayed_work(&selfshrink_worker,
3126                         selfshrink_interval * HZ);
3127         }
3128 }
3129
3130 static int ramster_enabled;
3131
3132 static int __init ramster_selfshrink_init(void)
3133 {
3134         frontswap_selfshrinking = ramster_enabled && use_frontswap_selfshrink;
3135         if (frontswap_selfshrinking)
3136                 pr_info("ramster: Initializing frontswap "
3137                                         "selfshrinking driver.\n");
3138         else
3139                 return -ENODEV;
3140
3141         schedule_delayed_work(&selfshrink_worker, selfshrink_interval * HZ);
3142
3143         return 0;
3144 }
3145
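/*
 * subsys_initcall runs before zcache_init() below (module_init maps to
 * device_initcall when built in) and after the boot command line has been
 * parsed, so ramster_enabled already reflects the "ramster" parameter.
 */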
3146 subsys_initcall(ramster_selfshrink_init);
3147 #endif
3148
3149 /*
3150  * zcache initialization
 * NOTE: for now, "ramster" MUST be provided as a kernel boot parameter
 * or NOTHING HAPPENS!
3153  */
3154
3155 static int ramster_enabled;
3156
3157 static int __init enable_ramster(char *s)
3158 {
3159         ramster_enabled = 1;
3160         return 1;
3161 }
3162 __setup("ramster", enable_ramster);
3163
/* allow cleancache and frontswap to be independently disabled at boot time */
3165
3166 static int use_cleancache = 1;
3167
3168 static int __init no_cleancache(char *s)
3169 {
3170         pr_info("INIT no_cleancache called\n");
3171         use_cleancache = 0;
3172         return 1;
3173 }
3174
3175 /*
3176  * FIXME: need to guarantee this gets checked before zcache_init is called
3177  * What is the correct way to achieve this?
3178  */
3179 early_param("nocleancache", no_cleancache);
3180
3181 static int use_frontswap = 1;
3182
3183 static int __init no_frontswap(char *s)
3184 {
3185         pr_info("INIT no_frontswap called\n");
3186         use_frontswap = 0;
3187         return 1;
3188 }
3189
3190 __setup("nofrontswap", no_frontswap);
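
/*
 * Boot parameters registered in this file: "ramster" enables the driver,
 * "nocleancache" and "nofrontswap" disable the respective shims, and
 * "noselfshrink" disables frontswap selfshrinking.
 */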
3191
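/*
 * Bring-up order: create the sysfs groups; when "ramster" was given,
 * register the o2net message handlers, the tmem host/PAM ops and the CPU
 * notifier; create the kmem caches; then hook cleancache and/or frontswap
 * and start the remotify machinery.
 */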
3192 static int __init zcache_init(void)
3193 {
3194         int ret = 0;
3195
3196 #ifdef CONFIG_SYSFS
        ret = sysfs_create_group(mm_kobj, &zcache_attr_group);
        if (!ret)
                ret = sysfs_create_group(mm_kobj, &ramster_attr_group);
3199         if (ret) {
3200                 pr_err("ramster: can't create sysfs\n");
3201                 goto out;
3202         }
3203 #endif /* CONFIG_SYSFS */
3204 #if defined(CONFIG_CLEANCACHE) || defined(CONFIG_FRONTSWAP)
3205         if (ramster_enabled) {
3206                 unsigned int cpu;
3207
3208                 (void)ramster_o2net_register_handlers();
3209                 tmem_register_hostops(&zcache_hostops);
3210                 tmem_register_pamops(&zcache_pamops);
3211                 ret = register_cpu_notifier(&zcache_cpu_notifier_block);
3212                 if (ret) {
3213                         pr_err("ramster: can't register cpu notifier\n");
3214                         goto out;
3215                 }
3216                 for_each_online_cpu(cpu) {
3217                         void *pcpu = (void *)(long)cpu;
3218                         zcache_cpu_notifier(&zcache_cpu_notifier_block,
3219                                 CPU_UP_PREPARE, pcpu);
3220                 }
3221         }
3222         zcache_objnode_cache = kmem_cache_create("zcache_objnode",
3223                                 sizeof(struct tmem_objnode), 0, 0, NULL);
3224         zcache_obj_cache = kmem_cache_create("zcache_obj",
3225                                 sizeof(struct tmem_obj), 0, 0, NULL);
3226         ramster_flnode_cache = kmem_cache_create("ramster_flnode",
3227                                 sizeof(struct flushlist_node), 0, 0, NULL);
3228 #endif
3229 #ifdef CONFIG_CLEANCACHE
3230         pr_info("INIT ramster_enabled=%d use_cleancache=%d\n",
3231                                         ramster_enabled, use_cleancache);
3232         if (ramster_enabled && use_cleancache) {
3233                 struct cleancache_ops old_ops;
3234
3235                 zbud_init();
3236                 register_shrinker(&zcache_shrinker);
3237                 old_ops = zcache_cleancache_register_ops();
3238                 pr_info("ramster: cleancache enabled using kernel "
3239                         "transcendent memory and compression buddies\n");
3240                 if (old_ops.init_fs != NULL)
                        pr_warning("ramster: cleancache_ops overridden\n");
3242         }
3243 #endif
3244 #ifdef CONFIG_FRONTSWAP
3245         pr_info("INIT ramster_enabled=%d use_frontswap=%d\n",
3246                                         ramster_enabled, use_frontswap);
3247         if (ramster_enabled && use_frontswap) {
3248                 struct frontswap_ops old_ops;
3249
3250                 zcache_new_client(LOCAL_CLIENT);
3251                 old_ops = zcache_frontswap_register_ops();
3252                 pr_info("ramster: frontswap enabled using kernel "
3253                         "transcendent memory and xvmalloc\n");
3254                 if (old_ops.init != NULL)
                        pr_warning("ramster: frontswap_ops overridden\n");
3256         }
3257         if (ramster_enabled && (use_frontswap || use_cleancache))
3258                 ramster_remotify_init();
3259 #endif
3260 out:
3261         return ret;
3262 }
3263
3264 module_init(zcache_init)