libceph: add support for primary_temp mappings
[firefly-linux-kernel-4.4.55.git] / net / ceph / osdmap.c
index b844a92736669017a6dc53492119afed25fa5ab7..20a38a37794cedd554c23f70ca97e02748e3d4a1 100644 (file)
@@ -343,7 +343,7 @@ bad:
 
 /*
  * rbtree of pg_mapping for handling pg_temp (explicit mapping of pgid
- * to a set of osds)
+ * to a set of osds) and primary_temp (explicit primary setting)
  */
 static int pgid_cmp(struct ceph_pg l, struct ceph_pg r)
 {
@@ -506,7 +506,7 @@ static void __remove_pg_pool(struct rb_root *root, struct ceph_pg_pool_info *pi)
        kfree(pi);
 }
 
-static int __decode_pool(void **p, void *end, struct ceph_pg_pool_info *pi)
+static int decode_pool(void **p, void *end, struct ceph_pg_pool_info *pi)
 {
        u8 ev, cv;
        unsigned len, num;
@@ -587,7 +587,7 @@ bad:
        return -EINVAL;
 }
 
-static int __decode_pool_names(void **p, void *end, struct ceph_osdmap *map)
+static int decode_pool_names(void **p, void *end, struct ceph_osdmap *map)
 {
        struct ceph_pg_pool_info *pi;
        u32 num, len;
@@ -633,6 +633,13 @@ void ceph_osdmap_destroy(struct ceph_osdmap *map)
                rb_erase(&pg->node, &map->pg_temp);
                kfree(pg);
        }
+       while (!RB_EMPTY_ROOT(&map->primary_temp)) {
+               struct ceph_pg_mapping *pg =
+                       rb_entry(rb_first(&map->primary_temp),
+                                struct ceph_pg_mapping, node);
+               rb_erase(&pg->node, &map->primary_temp);
+               kfree(pg);
+       }
        while (!RB_EMPTY_ROOT(&map->pg_pools)) {
                struct ceph_pg_pool_info *pi =
                        rb_entry(rb_first(&map->pg_pools),
@@ -642,94 +649,403 @@ void ceph_osdmap_destroy(struct ceph_osdmap *map)
        kfree(map->osd_state);
        kfree(map->osd_weight);
        kfree(map->osd_addr);
+       kfree(map->osd_primary_affinity);
        kfree(map);
 }
 
 /*
- * adjust max osd value.  reallocate arrays.
+ * Adjust max_osd value, (re)allocate arrays.
+ *
+ * The new elements are properly initialized.
  */
 static int osdmap_set_max_osd(struct ceph_osdmap *map, int max)
 {
        u8 *state;
-       struct ceph_entity_addr *addr;
        u32 *weight;
+       struct ceph_entity_addr *addr;
+       int i;
 
-       state = kcalloc(max, sizeof(*state), GFP_NOFS);
-       addr = kcalloc(max, sizeof(*addr), GFP_NOFS);
-       weight = kcalloc(max, sizeof(*weight), GFP_NOFS);
-       if (state == NULL || addr == NULL || weight == NULL) {
+       state = krealloc(map->osd_state, max*sizeof(*state), GFP_NOFS);
+       weight = krealloc(map->osd_weight, max*sizeof(*weight), GFP_NOFS);
+       addr = krealloc(map->osd_addr, max*sizeof(*addr), GFP_NOFS);
+       if (!state || !weight || !addr) {
                kfree(state);
-               kfree(addr);
                kfree(weight);
+               kfree(addr);
+
                return -ENOMEM;
        }
 
-       /* copy old? */
-       if (map->osd_state) {
-               memcpy(state, map->osd_state, map->max_osd*sizeof(*state));
-               memcpy(addr, map->osd_addr, map->max_osd*sizeof(*addr));
-               memcpy(weight, map->osd_weight, map->max_osd*sizeof(*weight));
-               kfree(map->osd_state);
-               kfree(map->osd_addr);
-               kfree(map->osd_weight);
+       for (i = map->max_osd; i < max; i++) {
+               state[i] = 0;
+               weight[i] = CEPH_OSD_OUT;
+               memset(addr + i, 0, sizeof(*addr));
        }
 
        map->osd_state = state;
        map->osd_weight = weight;
        map->osd_addr = addr;
+
+       if (map->osd_primary_affinity) {
+               u32 *affinity;
+
+               affinity = krealloc(map->osd_primary_affinity,
+                                   max*sizeof(*affinity), GFP_NOFS);
+               if (!affinity)
+                       return -ENOMEM;
+
+               for (i = map->max_osd; i < max; i++)
+                       affinity[i] = CEPH_OSD_DEFAULT_PRIMARY_AFFINITY;
+
+               map->osd_primary_affinity = affinity;
+       }
+
        map->max_osd = max;
+
        return 0;
 }
 
+#define OSDMAP_WRAPPER_COMPAT_VER      7
+#define OSDMAP_CLIENT_DATA_COMPAT_VER  1
+
+/*
+ * Return 0 or error.  On success, *v is set to 0 for old (v6) osdmaps,
+ * to struct_v of the client_data section for new (v7 and above)
+ * osdmaps.
+ */
+static int get_osdmap_client_data_v(void **p, void *end,
+                                   const char *prefix, u8 *v)
+{
+       u8 struct_v;
+
+       ceph_decode_8_safe(p, end, struct_v, e_inval);
+       if (struct_v >= 7) {
+               u8 struct_compat;
+
+               ceph_decode_8_safe(p, end, struct_compat, e_inval);
+               if (struct_compat > OSDMAP_WRAPPER_COMPAT_VER) {
+                       pr_warning("got v %d cv %d > %d of %s ceph_osdmap\n",
+                                  struct_v, struct_compat,
+                                  OSDMAP_WRAPPER_COMPAT_VER, prefix);
+                       return -EINVAL;
+               }
+               *p += 4; /* ignore wrapper struct_len */
+
+               ceph_decode_8_safe(p, end, struct_v, e_inval);
+               ceph_decode_8_safe(p, end, struct_compat, e_inval);
+               if (struct_compat > OSDMAP_CLIENT_DATA_COMPAT_VER) {
+                       pr_warning("got v %d cv %d > %d of %s ceph_osdmap client data\n",
+                                  struct_v, struct_compat,
+                                  OSDMAP_CLIENT_DATA_COMPAT_VER, prefix);
+                       return -EINVAL;
+               }
+               *p += 4; /* ignore client data struct_len */
+       } else {
+               u16 version;
+
+               *p -= 1;
+               ceph_decode_16_safe(p, end, version, e_inval);
+               if (version < 6) {
+                       pr_warning("got v %d < 6 of %s ceph_osdmap\n", version,
+                                  prefix);
+                       return -EINVAL;
+               }
+
+               /* old osdmap enconding */
+               struct_v = 0;
+       }
+
+       *v = struct_v;
+       return 0;
+
+e_inval:
+       return -EINVAL;
+}
+
+static int __decode_pools(void **p, void *end, struct ceph_osdmap *map,
+                         bool incremental)
+{
+       u32 n;
+
+       ceph_decode_32_safe(p, end, n, e_inval);
+       while (n--) {
+               struct ceph_pg_pool_info *pi;
+               u64 pool;
+               int ret;
+
+               ceph_decode_64_safe(p, end, pool, e_inval);
+
+               pi = __lookup_pg_pool(&map->pg_pools, pool);
+               if (!incremental || !pi) {
+                       pi = kzalloc(sizeof(*pi), GFP_NOFS);
+                       if (!pi)
+                               return -ENOMEM;
+
+                       pi->id = pool;
+
+                       ret = __insert_pg_pool(&map->pg_pools, pi);
+                       if (ret) {
+                               kfree(pi);
+                               return ret;
+                       }
+               }
+
+               ret = decode_pool(p, end, pi);
+               if (ret)
+                       return ret;
+       }
+
+       return 0;
+
+e_inval:
+       return -EINVAL;
+}
+
+static int decode_pools(void **p, void *end, struct ceph_osdmap *map)
+{
+       return __decode_pools(p, end, map, false);
+}
+
+static int decode_new_pools(void **p, void *end, struct ceph_osdmap *map)
+{
+       return __decode_pools(p, end, map, true);
+}
+
+static int __decode_pg_temp(void **p, void *end, struct ceph_osdmap *map,
+                           bool incremental)
+{
+       u32 n;
+
+       ceph_decode_32_safe(p, end, n, e_inval);
+       while (n--) {
+               struct ceph_pg pgid;
+               u32 len, i;
+               int ret;
+
+               ret = ceph_decode_pgid(p, end, &pgid);
+               if (ret)
+                       return ret;
+
+               ceph_decode_32_safe(p, end, len, e_inval);
+
+               ret = __remove_pg_mapping(&map->pg_temp, pgid);
+               BUG_ON(!incremental && ret != -ENOENT);
+
+               if (!incremental || len > 0) {
+                       struct ceph_pg_mapping *pg;
+
+                       ceph_decode_need(p, end, len*sizeof(u32), e_inval);
+
+                       if (len > (UINT_MAX - sizeof(*pg)) / sizeof(u32))
+                               return -EINVAL;
+
+                       pg = kzalloc(sizeof(*pg) + len*sizeof(u32), GFP_NOFS);
+                       if (!pg)
+                               return -ENOMEM;
+
+                       pg->pgid = pgid;
+                       pg->pg_temp.len = len;
+                       for (i = 0; i < len; i++)
+                               pg->pg_temp.osds[i] = ceph_decode_32(p);
+
+                       ret = __insert_pg_mapping(pg, &map->pg_temp);
+                       if (ret) {
+                               kfree(pg);
+                               return ret;
+                       }
+               }
+       }
+
+       return 0;
+
+e_inval:
+       return -EINVAL;
+}
+
+static int decode_pg_temp(void **p, void *end, struct ceph_osdmap *map)
+{
+       return __decode_pg_temp(p, end, map, false);
+}
+
+static int decode_new_pg_temp(void **p, void *end, struct ceph_osdmap *map)
+{
+       return __decode_pg_temp(p, end, map, true);
+}
+
+static int __decode_primary_temp(void **p, void *end, struct ceph_osdmap *map,
+                                bool incremental)
+{
+       u32 n;
+
+       ceph_decode_32_safe(p, end, n, e_inval);
+       while (n--) {
+               struct ceph_pg pgid;
+               u32 osd;
+               int ret;
+
+               ret = ceph_decode_pgid(p, end, &pgid);
+               if (ret)
+                       return ret;
+
+               ceph_decode_32_safe(p, end, osd, e_inval);
+
+               ret = __remove_pg_mapping(&map->primary_temp, pgid);
+               BUG_ON(!incremental && ret != -ENOENT);
+
+               if (!incremental || osd != (u32)-1) {
+                       struct ceph_pg_mapping *pg;
+
+                       pg = kzalloc(sizeof(*pg), GFP_NOFS);
+                       if (!pg)
+                               return -ENOMEM;
+
+                       pg->pgid = pgid;
+                       pg->primary_temp.osd = osd;
+
+                       ret = __insert_pg_mapping(pg, &map->primary_temp);
+                       if (ret) {
+                               kfree(pg);
+                               return ret;
+                       }
+               }
+       }
+
+       return 0;
+
+e_inval:
+       return -EINVAL;
+}
+
+static int decode_primary_temp(void **p, void *end, struct ceph_osdmap *map)
+{
+       return __decode_primary_temp(p, end, map, false);
+}
+
+static int decode_new_primary_temp(void **p, void *end,
+                                  struct ceph_osdmap *map)
+{
+       return __decode_primary_temp(p, end, map, true);
+}
+
+u32 ceph_get_primary_affinity(struct ceph_osdmap *map, int osd)
+{
+       BUG_ON(osd >= map->max_osd);
+
+       if (!map->osd_primary_affinity)
+               return CEPH_OSD_DEFAULT_PRIMARY_AFFINITY;
+
+       return map->osd_primary_affinity[osd];
+}
+
+static int set_primary_affinity(struct ceph_osdmap *map, int osd, u32 aff)
+{
+       BUG_ON(osd >= map->max_osd);
+
+       if (!map->osd_primary_affinity) {
+               int i;
+
+               map->osd_primary_affinity = kmalloc(map->max_osd*sizeof(u32),
+                                                   GFP_NOFS);
+               if (!map->osd_primary_affinity)
+                       return -ENOMEM;
+
+               for (i = 0; i < map->max_osd; i++)
+                       map->osd_primary_affinity[i] =
+                           CEPH_OSD_DEFAULT_PRIMARY_AFFINITY;
+       }
+
+       map->osd_primary_affinity[osd] = aff;
+
+       return 0;
+}
+
+static int decode_primary_affinity(void **p, void *end,
+                                  struct ceph_osdmap *map)
+{
+       u32 len, i;
+
+       ceph_decode_32_safe(p, end, len, e_inval);
+       if (len == 0) {
+               kfree(map->osd_primary_affinity);
+               map->osd_primary_affinity = NULL;
+               return 0;
+       }
+       if (len != map->max_osd)
+               goto e_inval;
+
+       ceph_decode_need(p, end, map->max_osd*sizeof(u32), e_inval);
+
+       for (i = 0; i < map->max_osd; i++) {
+               int ret;
+
+               ret = set_primary_affinity(map, i, ceph_decode_32(p));
+               if (ret)
+                       return ret;
+       }
+
+       return 0;
+
+e_inval:
+       return -EINVAL;
+}
+
+static int decode_new_primary_affinity(void **p, void *end,
+                                      struct ceph_osdmap *map)
+{
+       u32 n;
+
+       ceph_decode_32_safe(p, end, n, e_inval);
+       while (n--) {
+               u32 osd, aff;
+               int ret;
+
+               ceph_decode_32_safe(p, end, osd, e_inval);
+               ceph_decode_32_safe(p, end, aff, e_inval);
+
+               ret = set_primary_affinity(map, osd, aff);
+               if (ret)
+                       return ret;
+       }
+
+       return 0;
+
+e_inval:
+       return -EINVAL;
+}
+
 /*
  * decode a full map.
  */
 static int osdmap_decode(void **p, void *end, struct ceph_osdmap *map)
 {
-       u16 version;
+       u8 struct_v;
        u32 epoch = 0;
        void *start = *p;
        u32 max;
        u32 len, i;
        int err;
-       struct ceph_pg_pool_info *pi;
 
        dout("%s %p to %p len %d\n", __func__, *p, end, (int)(end - *p));
 
-       ceph_decode_16_safe(p, end, version, e_inval);
-       if (version > 6) {
-               pr_warning("got unknown v %d > 6 of osdmap\n", version);
-               goto e_inval;
-       }
-       if (version < 6) {
-               pr_warning("got old v %d < 6 of osdmap\n", version);
-               goto e_inval;
-       }
+       err = get_osdmap_client_data_v(p, end, "full", &struct_v);
+       if (err)
+               goto bad;
 
-       ceph_decode_need(p, end, 2*sizeof(u64)+6*sizeof(u32), e_inval);
+       /* fsid, epoch, created, modified */
+       ceph_decode_need(p, end, sizeof(map->fsid) + sizeof(u32) +
+                        sizeof(map->created) + sizeof(map->modified), e_inval);
        ceph_decode_copy(p, &map->fsid, sizeof(map->fsid));
        epoch = map->epoch = ceph_decode_32(p);
        ceph_decode_copy(p, &map->created, sizeof(map->created));
        ceph_decode_copy(p, &map->modified, sizeof(map->modified));
 
-       ceph_decode_32_safe(p, end, max, e_inval);
-       while (max--) {
-               ceph_decode_need(p, end, 8 + 2, e_inval);
-               pi = kzalloc(sizeof(*pi), GFP_NOFS);
-               if (!pi) {
-                       err = -ENOMEM;
-                       goto bad;
-               }
-               pi->id = ceph_decode_64(p);
-               err = __decode_pool(p, end, pi);
-               if (err < 0) {
-                       kfree(pi);
-                       goto bad;
-               }
-               __insert_pg_pool(&map->pg_pools, pi);
-       }
+       /* pools */
+       err = decode_pools(p, end, map);
+       if (err)
+               goto bad;
 
-       err = __decode_pool_names(p, end, map);
+       /* pool_name */
+       err = decode_pool_names(p, end, map);
        if (err)
                goto bad;
 
@@ -769,35 +1085,26 @@ static int osdmap_decode(void **p, void *end, struct ceph_osdmap *map)
                ceph_decode_addr(&map->osd_addr[i]);
 
        /* pg_temp */
-       ceph_decode_32_safe(p, end, len, e_inval);
-       for (i = 0; i < len; i++) {
-               int n, j;
-               struct ceph_pg pgid;
-               struct ceph_pg_mapping *pg;
+       err = decode_pg_temp(p, end, map);
+       if (err)
+               goto bad;
 
-               err = ceph_decode_pgid(p, end, &pgid);
+       /* primary_temp */
+       if (struct_v >= 1) {
+               err = decode_primary_temp(p, end, map);
                if (err)
                        goto bad;
-               ceph_decode_need(p, end, sizeof(u32), e_inval);
-               n = ceph_decode_32(p);
-               if (n > (UINT_MAX - sizeof(*pg)) / sizeof(u32))
-                       goto e_inval;
-               ceph_decode_need(p, end, n * sizeof(u32), e_inval);
-               pg = kmalloc(sizeof(*pg) + n*sizeof(u32), GFP_NOFS);
-               if (!pg) {
-                       err = -ENOMEM;
-                       goto bad;
-               }
-               pg->pgid = pgid;
-               pg->len = n;
-               for (j = 0; j < n; j++)
-                       pg->osds[j] = ceph_decode_32(p);
+       }
 
-               err = __insert_pg_mapping(pg, &map->pg_temp);
+       /* primary_affinity */
+       if (struct_v >= 2) {
+               err = decode_primary_affinity(p, end, map);
                if (err)
                        goto bad;
-               dout(" added pg_temp %lld.%x len %d\n", pgid.pool, pgid.seed,
-                    len);
+       } else {
+               /* XXX can this happen? */
+               kfree(map->osd_primary_affinity);
+               map->osd_primary_affinity = NULL;
        }
 
        /* crush */
@@ -840,6 +1147,7 @@ struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end)
                return ERR_PTR(-ENOMEM);
 
        map->pg_temp = RB_ROOT;
+       map->primary_temp = RB_ROOT;
        mutex_init(&map->crush_scratch_mutex);
 
        ret = osdmap_decode(p, end, map);
@@ -868,18 +1176,17 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
        __s32 new_flags, max;
        void *start = *p;
        int err;
-       u16 version;
+       u8 struct_v;
 
        dout("%s %p to %p len %d\n", __func__, *p, end, (int)(end - *p));
 
-       ceph_decode_16_safe(p, end, version, e_inval);
-       if (version != 6) {
-               pr_warning("got unknown v %d != 6 of inc osdmap\n", version);
-               goto e_inval;
-       }
+       err = get_osdmap_client_data_v(p, end, "inc", &struct_v);
+       if (err)
+               goto bad;
 
-       ceph_decode_need(p, end, sizeof(fsid)+sizeof(modified)+2*sizeof(u32),
-                        e_inval);
+       /* fsid, epoch, modified, new_pool_max, new_flags */
+       ceph_decode_need(p, end, sizeof(fsid) + sizeof(u32) + sizeof(modified) +
+                        sizeof(u64) + sizeof(u32), e_inval);
        ceph_decode_copy(p, &fsid, sizeof(fsid));
        epoch = ceph_decode_32(p);
        BUG_ON(epoch != map->epoch+1);
@@ -913,10 +1220,8 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
        if (new_pool_max >= 0)
                map->pool_max = new_pool_max;
 
-       ceph_decode_need(p, end, 5*sizeof(u32), e_inval);
-
        /* new max? */
-       max = ceph_decode_32(p);
+       ceph_decode_32_safe(p, end, max, e_inval);
        if (max >= 0) {
                err = osdmap_set_max_osd(map, max);
                if (err)
@@ -932,31 +1237,15 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
                newcrush = NULL;
        }
 
-       /* new_pool */
-       ceph_decode_32_safe(p, end, len, e_inval);
-       while (len--) {
-               struct ceph_pg_pool_info *pi;
+       /* new_pools */
+       err = decode_new_pools(p, end, map);
+       if (err)
+               goto bad;
 
-               ceph_decode_64_safe(p, end, pool, e_inval);
-               pi = __lookup_pg_pool(&map->pg_pools, pool);
-               if (!pi) {
-                       pi = kzalloc(sizeof(*pi), GFP_NOFS);
-                       if (!pi) {
-                               err = -ENOMEM;
-                               goto bad;
-                       }
-                       pi->id = pool;
-                       __insert_pg_pool(&map->pg_pools, pi);
-               }
-               err = __decode_pool(p, end, pi);
-               if (err < 0)
-                       goto bad;
-       }
-       if (version >= 5) {
-               err = __decode_pool_names(p, end, map);
-               if (err)
-                       goto bad;
-       }
+       /* new_pool_names */
+       err = decode_pool_names(p, end, map);
+       if (err)
+               goto bad;
 
        /* old_pool */
        ceph_decode_32_safe(p, end, len, e_inval);
@@ -1014,47 +1303,22 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
        }
 
        /* new_pg_temp */
-       ceph_decode_32_safe(p, end, len, e_inval);
-       while (len--) {
-               struct ceph_pg_mapping *pg;
-               int j;
-               struct ceph_pg pgid;
-               u32 pglen;
+       err = decode_new_pg_temp(p, end, map);
+       if (err)
+               goto bad;
 
-               err = ceph_decode_pgid(p, end, &pgid);
+       /* new_primary_temp */
+       if (struct_v >= 1) {
+               err = decode_new_primary_temp(p, end, map);
+               if (err)
+                       goto bad;
+       }
+
+       /* new_primary_affinity */
+       if (struct_v >= 2) {
+               err = decode_new_primary_affinity(p, end, map);
                if (err)
                        goto bad;
-               ceph_decode_need(p, end, sizeof(u32), e_inval);
-               pglen = ceph_decode_32(p);
-               if (pglen) {
-                       ceph_decode_need(p, end, pglen*sizeof(u32), e_inval);
-
-                       /* removing existing (if any) */
-                       (void) __remove_pg_mapping(&map->pg_temp, pgid);
-
-                       /* insert */
-                       if (pglen > (UINT_MAX - sizeof(*pg)) / sizeof(u32))
-                               goto e_inval;
-                       pg = kmalloc(sizeof(*pg) + sizeof(u32)*pglen, GFP_NOFS);
-                       if (!pg) {
-                               err = -ENOMEM;
-                               goto bad;
-                       }
-                       pg->pgid = pgid;
-                       pg->len = pglen;
-                       for (j = 0; j < pglen; j++)
-                               pg->osds[j] = ceph_decode_32(p);
-                       err = __insert_pg_mapping(pg, &map->pg_temp);
-                       if (err) {
-                               kfree(pg);
-                               goto bad;
-                       }
-                       dout(" added pg_temp %lld.%x len %d\n", pgid.pool,
-                            pgid.seed, pglen);
-               } else {
-                       /* remove */
-                       __remove_pg_mapping(&map->pg_temp, pgid);
-               }
        }
 
        /* ignore the rest */
@@ -1213,8 +1477,8 @@ static int *calc_pg_raw(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
                                    pool->pg_num_mask);
        pg = __lookup_pg_mapping(&osdmap->pg_temp, pgid);
        if (pg) {
-               *num = pg->len;
-               return pg->osds;
+               *num = pg->pg_temp.len;
+               return pg->pg_temp.osds;
        }
 
        /* crush */
@@ -1257,24 +1521,186 @@ static int *calc_pg_raw(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
 }
 
 /*
- * Return acting set for given pgid.
+ * Calculate raw (crush) set for given pgid.
+ *
+ * Return raw set length, or error.
+ */
+static int pg_to_raw_osds(struct ceph_osdmap *osdmap,
+                         struct ceph_pg_pool_info *pool,
+                         struct ceph_pg pgid, u32 pps, int *osds)
+{
+       int ruleno;
+       int len;
+
+       /* crush */
+       ruleno = crush_find_rule(osdmap->crush, pool->crush_ruleset,
+                                pool->type, pool->size);
+       if (ruleno < 0) {
+               pr_err("no crush rule: pool %lld ruleset %d type %d size %d\n",
+                      pgid.pool, pool->crush_ruleset, pool->type,
+                      pool->size);
+               return -ENOENT;
+       }
+
+       len = do_crush(osdmap, ruleno, pps, osds,
+                      min_t(int, pool->size, CEPH_PG_MAX_SIZE),
+                      osdmap->osd_weight, osdmap->max_osd);
+       if (len < 0) {
+               pr_err("error %d from crush rule %d: pool %lld ruleset %d type %d size %d\n",
+                      len, ruleno, pgid.pool, pool->crush_ruleset,
+                      pool->type, pool->size);
+               return len;
+       }
+
+       return len;
+}
+
+/*
+ * Given raw set, calculate up set and up primary.
+ *
+ * Return up set length.  *primary is set to up primary osd id, or -1
+ * if up set is empty.
+ */
+static int raw_to_up_osds(struct ceph_osdmap *osdmap,
+                         struct ceph_pg_pool_info *pool,
+                         int *osds, int len, int *primary)
+{
+       int up_primary = -1;
+       int i;
+
+       if (ceph_can_shift_osds(pool)) {
+               int removed = 0;
+
+               for (i = 0; i < len; i++) {
+                       if (ceph_osd_is_down(osdmap, osds[i])) {
+                               removed++;
+                               continue;
+                       }
+                       if (removed)
+                               osds[i - removed] = osds[i];
+               }
+
+               len -= removed;
+               if (len > 0)
+                       up_primary = osds[0];
+       } else {
+               for (i = len - 1; i >= 0; i--) {
+                       if (ceph_osd_is_down(osdmap, osds[i]))
+                               osds[i] = CRUSH_ITEM_NONE;
+                       else
+                               up_primary = osds[i];
+               }
+       }
+
+       *primary = up_primary;
+       return len;
+}
+
+/*
+ * Given up set, apply pg_temp and primary_temp mappings.
+ *
+ * Return acting set length.  *primary is set to acting primary osd id,
+ * or -1 if acting set is empty.
+ */
+static int apply_temps(struct ceph_osdmap *osdmap,
+                      struct ceph_pg_pool_info *pool, struct ceph_pg pgid,
+                      int *osds, int len, int *primary)
+{
+       struct ceph_pg_mapping *pg;
+       int temp_len;
+       int temp_primary;
+       int i;
+
+       /* raw_pg -> pg */
+       pgid.seed = ceph_stable_mod(pgid.seed, pool->pg_num,
+                                   pool->pg_num_mask);
+
+       /* pg_temp? */
+       pg = __lookup_pg_mapping(&osdmap->pg_temp, pgid);
+       if (pg) {
+               temp_len = 0;
+               temp_primary = -1;
+
+               for (i = 0; i < pg->pg_temp.len; i++) {
+                       if (ceph_osd_is_down(osdmap, pg->pg_temp.osds[i])) {
+                               if (ceph_can_shift_osds(pool))
+                                       continue;
+                               else
+                                       osds[temp_len++] = CRUSH_ITEM_NONE;
+                       } else {
+                               osds[temp_len++] = pg->pg_temp.osds[i];
+                       }
+               }
+
+               /* apply pg_temp's primary */
+               for (i = 0; i < temp_len; i++) {
+                       if (osds[i] != CRUSH_ITEM_NONE) {
+                               temp_primary = osds[i];
+                               break;
+                       }
+               }
+       } else {
+               temp_len = len;
+               temp_primary = *primary;
+       }
+
+       /* primary_temp? */
+       pg = __lookup_pg_mapping(&osdmap->primary_temp, pgid);
+       if (pg)
+               temp_primary = pg->primary_temp.osd;
+
+       *primary = temp_primary;
+       return temp_len;
+}
+
+/*
+ * Calculate acting set for given pgid.
+ *
+ * Return acting set length, or error.  *primary is set to acting
+ * primary osd id, or -1 if acting set is empty or on error.
  */
 int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
-                       int *acting)
+                       int *osds, int *primary)
 {
-       int rawosds[CEPH_PG_MAX_SIZE], *osds;
-       int i, o, num = CEPH_PG_MAX_SIZE;
+       struct ceph_pg_pool_info *pool;
+       u32 pps;
+       int len;
 
-       osds = calc_pg_raw(osdmap, pgid, rawosds, &num);
-       if (!osds)
-               return -1;
+       pool = __lookup_pg_pool(&osdmap->pg_pools, pgid.pool);
+       if (!pool) {
+               *primary = -1;
+               return -ENOENT;
+       }
 
-       /* primary is first up osd */
-       o = 0;
-       for (i = 0; i < num; i++)
-               if (ceph_osd_is_up(osdmap, osds[i]))
-                       acting[o++] = osds[i];
-       return o;
+       if (pool->flags & CEPH_POOL_FLAG_HASHPSPOOL) {
+               /* hash pool id and seed so that pool PGs do not overlap */
+               pps = crush_hash32_2(CRUSH_HASH_RJENKINS1,
+                                    ceph_stable_mod(pgid.seed, pool->pgp_num,
+                                                    pool->pgp_num_mask),
+                                    pgid.pool);
+       } else {
+               /*
+                * legacy behavior: add ps and pool together.  this is
+                * not a great approach because the PGs from each pool
+                * will overlap on top of each other: 0.5 == 1.4 ==
+                * 2.3 == ...
+                */
+               pps = ceph_stable_mod(pgid.seed, pool->pgp_num,
+                                     pool->pgp_num_mask) +
+                       (unsigned)pgid.pool;
+       }
+
+       len = pg_to_raw_osds(osdmap, pool, pgid, pps, osds);
+       if (len < 0) {
+               *primary = -1;
+               return len;
+       }
+
+       len = raw_to_up_osds(osdmap, pool, osds, len, primary);
+
+       len = apply_temps(osdmap, pool, pgid, osds, len, primary);
+
+       return len;
 }
 
 /*