libceph: add support for primary_temp mappings
[firefly-linux-kernel-4.4.55.git] / net / ceph / osdmap.c
index 6497322d2e3c780f99099630ee54d580617db5cb..20a38a37794cedd554c23f70ca97e02748e3d4a1 100644 (file)
@@ -343,7 +343,7 @@ bad:
 
 /*
  * rbtree of pg_mapping for handling pg_temp (explicit mapping of pgid
- * to a set of osds)
+ * to a set of osds) and primary_temp (explicit primary setting)
  */
 static int pgid_cmp(struct ceph_pg l, struct ceph_pg r)
 {
@@ -633,6 +633,13 @@ void ceph_osdmap_destroy(struct ceph_osdmap *map)
                rb_erase(&pg->node, &map->pg_temp);
                kfree(pg);
        }
+       while (!RB_EMPTY_ROOT(&map->primary_temp)) {
+               struct ceph_pg_mapping *pg =
+                       rb_entry(rb_first(&map->primary_temp),
+                                struct ceph_pg_mapping, node);
+               rb_erase(&pg->node, &map->primary_temp);
+               kfree(pg);
+       }
        while (!RB_EMPTY_ROOT(&map->pg_pools)) {
                struct ceph_pg_pool_info *pi =
                        rb_entry(rb_first(&map->pg_pools),
@@ -642,6 +649,7 @@ void ceph_osdmap_destroy(struct ceph_osdmap *map)
        kfree(map->osd_state);
        kfree(map->osd_weight);
        kfree(map->osd_addr);
+       kfree(map->osd_primary_affinity);
        kfree(map);
 }
 
@@ -678,11 +686,82 @@ static int osdmap_set_max_osd(struct ceph_osdmap *map, int max)
        map->osd_weight = weight;
        map->osd_addr = addr;
 
+       if (map->osd_primary_affinity) {
+               u32 *affinity;
+
+               affinity = krealloc(map->osd_primary_affinity,
+                                   max*sizeof(*affinity), GFP_NOFS);
+               if (!affinity)
+                       return -ENOMEM;
+
+               for (i = map->max_osd; i < max; i++)
+                       affinity[i] = CEPH_OSD_DEFAULT_PRIMARY_AFFINITY;
+
+               map->osd_primary_affinity = affinity;
+       }
+
        map->max_osd = max;
 
        return 0;
 }
 
+#define OSDMAP_WRAPPER_COMPAT_VER      7
+#define OSDMAP_CLIENT_DATA_COMPAT_VER  1
+
+/*
+ * Consume the versioning header of an osdmap and report which client
+ * data version follows.  *p is advanced past the header.
+ *
+ * If the first byte is >= 7 the map uses the wrapped encoding: wrapper
+ * struct_v, struct_compat and a 32-bit struct_len, followed by the
+ * client data struct_v, struct_compat and a 32-bit struct_len.  Both
+ * struct_len fields are skipped without being interpreted.  Otherwise
+ * the first byte is re-read as part of a 16-bit version, which must be
+ * at least 6.
+ *
+ * @prefix only labels the warnings (callers pass "full" or "inc").
+ *
+ * Return 0 or -EINVAL.  On success, *v is set to 0 for old (v6)
+ * osdmaps, to struct_v of the client_data section for new (v7 and
+ * above) osdmaps.
+ */
+static int get_osdmap_client_data_v(void **p, void *end,
+                                   const char *prefix, u8 *v)
+{
+       u8 struct_v;
+
+       ceph_decode_8_safe(p, end, struct_v, e_inval);
+       if (struct_v >= 7) {
+               u8 struct_compat;
+
+               ceph_decode_8_safe(p, end, struct_compat, e_inval);
+               if (struct_compat > OSDMAP_WRAPPER_COMPAT_VER) {
+                       pr_warning("got v %d cv %d > %d of %s ceph_osdmap\n",
+                                  struct_v, struct_compat,
+                                  OSDMAP_WRAPPER_COMPAT_VER, prefix);
+                       return -EINVAL;
+               }
+
+               /* ignore wrapper struct_len, but never step past end */
+               ceph_decode_need(p, end, sizeof(u32), e_inval);
+               *p += sizeof(u32);
+
+               ceph_decode_8_safe(p, end, struct_v, e_inval);
+               ceph_decode_8_safe(p, end, struct_compat, e_inval);
+               if (struct_compat > OSDMAP_CLIENT_DATA_COMPAT_VER) {
+                       pr_warning("got v %d cv %d > %d of %s ceph_osdmap client data\n",
+                                  struct_v, struct_compat,
+                                  OSDMAP_CLIENT_DATA_COMPAT_VER, prefix);
+                       return -EINVAL;
+               }
+
+               /* ignore client data struct_len, same bounds rule */
+               ceph_decode_need(p, end, sizeof(u32), e_inval);
+               *p += sizeof(u32);
+       } else {
+               u16 version;
+
+               /* un-read the byte: it belongs to the 16-bit version */
+               *p -= 1;
+               ceph_decode_16_safe(p, end, version, e_inval);
+               if (version < 6) {
+                       pr_warning("got v %d < 6 of %s ceph_osdmap\n", version,
+                                  prefix);
+                       return -EINVAL;
+               }
+
+               /* old osdmap encoding */
+               struct_v = 0;
+       }
+
+       *v = struct_v;
+       return 0;
+
+e_inval:
+       return -EINVAL;
+}
+
 static int __decode_pools(void **p, void *end, struct ceph_osdmap *map,
                          bool incremental)
 {
@@ -765,9 +844,9 @@ static int __decode_pg_temp(void **p, void *end, struct ceph_osdmap *map,
                                return -ENOMEM;
 
                        pg->pgid = pgid;
-                       pg->len = len;
+                       pg->pg_temp.len = len;
                        for (i = 0; i < len; i++)
-                               pg->osds[i] = ceph_decode_32(p);
+                               pg->pg_temp.osds[i] = ceph_decode_32(p);
 
                        ret = __insert_pg_mapping(pg, &map->pg_temp);
                        if (ret) {
@@ -793,12 +872,153 @@ static int decode_new_pg_temp(void **p, void *end, struct ceph_osdmap *map)
        return __decode_pg_temp(p, end, map, true);
 }
 
+/*
+ * Decode a counted list of (pgid, osd) primary_temp entries into
+ * map->primary_temp.
+ *
+ * Any existing mapping for a decoded pgid is removed first.  For an
+ * incremental map an osd of (u32)-1 means "remove only"; for a full
+ * map every entry is inserted, and a pgid that was already present
+ * makes the decode fail.
+ *
+ * Return 0, -EINVAL on truncated or inconsistent input, -ENOMEM, or
+ * the error returned by ceph_decode_pgid() / __insert_pg_mapping().
+ */
+static int __decode_primary_temp(void **p, void *end, struct ceph_osdmap *map,
+                                bool incremental)
+{
+       u32 n;
+
+       ceph_decode_32_safe(p, end, n, e_inval);
+       while (n--) {
+               struct ceph_pg pgid;
+               u32 osd;
+               int ret;
+
+               ret = ceph_decode_pgid(p, end, &pgid);
+               if (ret)
+                       return ret;
+
+               ceph_decode_32_safe(p, end, osd, e_inval);
+
+               ret = __remove_pg_mapping(&map->primary_temp, pgid);
+               /*
+                * A full map must not mention the same pgid twice.  The
+                * entries are decoded from an external buffer, so reject
+                * the map instead of halting the machine with BUG_ON().
+                */
+               if (!incremental && ret != -ENOENT)
+                       return -EINVAL;
+
+               if (!incremental || osd != (u32)-1) {
+                       struct ceph_pg_mapping *pg;
+
+                       pg = kzalloc(sizeof(*pg), GFP_NOFS);
+                       if (!pg)
+                               return -ENOMEM;
+
+                       pg->pgid = pgid;
+                       pg->primary_temp.osd = osd;
+
+                       ret = __insert_pg_mapping(pg, &map->primary_temp);
+                       if (ret) {
+                               /* not linked into the tree, still ours */
+                               kfree(pg);
+                               return ret;
+                       }
+               }
+       }
+
+       return 0;
+
+e_inval:
+       return -EINVAL;
+}
+
+/* Full-map flavour: decode primary_temp entries, non-incremental. */
+static int decode_primary_temp(void **p, void *end, struct ceph_osdmap *map)
+{
+       const bool incremental = false;
+
+       return __decode_primary_temp(p, end, map, incremental);
+}
+
+/* Incremental flavour: decode new_primary_temp entries. */
+static int decode_new_primary_temp(void **p, void *end,
+                                  struct ceph_osdmap *map)
+{
+       const bool incremental = true;
+
+       return __decode_primary_temp(p, end, map, incremental);
+}
+
+/*
+ * Look up the primary affinity of @osd.  When no affinity table has
+ * been allocated, every osd has CEPH_OSD_DEFAULT_PRIMARY_AFFINITY.
+ */
+u32 ceph_get_primary_affinity(struct ceph_osdmap *map, int osd)
+{
+       const u32 *table;
+
+       BUG_ON(osd >= map->max_osd);
+
+       table = map->osd_primary_affinity;
+       return table ? table[osd] : CEPH_OSD_DEFAULT_PRIMARY_AFFINITY;
+}
+
+/*
+ * Set the primary affinity of @osd to @aff.
+ *
+ * map->osd_primary_affinity is allocated on first use, sized for
+ * map->max_osd entries and filled with
+ * CEPH_OSD_DEFAULT_PRIMARY_AFFINITY before @osd is updated.
+ *
+ * Callers must pass an @osd within [0, map->max_osd).
+ *
+ * Return 0 or -ENOMEM.
+ */
+static int set_primary_affinity(struct ceph_osdmap *map, int osd, u32 aff)
+{
+       /* osd is signed: a negative id would index before the array */
+       BUG_ON(osd < 0 || osd >= map->max_osd);
+
+       if (!map->osd_primary_affinity) {
+               int i;
+
+               /* kmalloc_array() fails cleanly if count * size overflows */
+               map->osd_primary_affinity =
+                       kmalloc_array(map->max_osd,
+                                     sizeof(*map->osd_primary_affinity),
+                                     GFP_NOFS);
+               if (!map->osd_primary_affinity)
+                       return -ENOMEM;
+
+               for (i = 0; i < map->max_osd; i++)
+                       map->osd_primary_affinity[i] =
+                           CEPH_OSD_DEFAULT_PRIMARY_AFFINITY;
+       }
+
+       map->osd_primary_affinity[osd] = aff;
+
+       return 0;
+}
+
+/*
+ * Decode the primary_affinity vector of a full map.
+ *
+ * A zero length drops any existing affinity table.  Otherwise the
+ * length must equal map->max_osd and one 32-bit value per osd is
+ * stored through set_primary_affinity().
+ *
+ * Return 0, -EINVAL on bad input, or the error from
+ * set_primary_affinity().
+ */
+static int decode_primary_affinity(void **p, void *end,
+                                  struct ceph_osdmap *map)
+{
+       u32 count;
+       u32 osd;
+       int err;
+
+       ceph_decode_32_safe(p, end, count, e_inval);
+       if (!count) {
+               kfree(map->osd_primary_affinity);
+               map->osd_primary_affinity = NULL;
+               return 0;
+       }
+
+       if (count != map->max_osd)
+               goto e_inval;
+
+       /* one bounds check up front covers every value read below */
+       ceph_decode_need(p, end, map->max_osd*sizeof(u32), e_inval);
+
+       for (osd = 0; osd < map->max_osd; osd++) {
+               u32 aff = ceph_decode_32(p);
+
+               err = set_primary_affinity(map, osd, aff);
+               if (err)
+                       return err;
+       }
+
+       return 0;
+
+e_inval:
+       return -EINVAL;
+}
+
+/*
+ * Decode the new_primary_affinity section of an incremental map: a
+ * counted list of (osd, affinity) pairs, each applied through
+ * set_primary_affinity().
+ *
+ * Return 0, -EINVAL on truncated input or an osd id outside
+ * [0, map->max_osd), or the error from set_primary_affinity().
+ */
+static int decode_new_primary_affinity(void **p, void *end,
+                                      struct ceph_osdmap *map)
+{
+       u32 n;
+
+       ceph_decode_32_safe(p, end, n, e_inval);
+       while (n--) {
+               u32 osd, aff;
+               int ret;
+
+               ceph_decode_32_safe(p, end, osd, e_inval);
+               ceph_decode_32_safe(p, end, aff, e_inval);
+
+               /*
+                * osd was just decoded from the buffer: validate it here
+                * rather than tripping the BUG_ON() in
+                * set_primary_affinity().  The unsigned compare also
+                * rejects ids that would turn negative once passed as int.
+                */
+               if (osd >= (u32)map->max_osd)
+                       goto e_inval;
+
+               ret = set_primary_affinity(map, osd, aff);
+               if (ret)
+                       return ret;
+       }
+
+       return 0;
+
+e_inval:
+       return -EINVAL;
+}
+
 /*
  * decode a full map.
  */
 static int osdmap_decode(void **p, void *end, struct ceph_osdmap *map)
 {
-       u16 version;
+       u8 struct_v;
        u32 epoch = 0;
        void *start = *p;
        u32 max;
@@ -807,15 +1027,9 @@ static int osdmap_decode(void **p, void *end, struct ceph_osdmap *map)
 
        dout("%s %p to %p len %d\n", __func__, *p, end, (int)(end - *p));
 
-       ceph_decode_16_safe(p, end, version, e_inval);
-       if (version > 6) {
-               pr_warning("got unknown v %d > 6 of osdmap\n", version);
-               goto e_inval;
-       }
-       if (version < 6) {
-               pr_warning("got old v %d < 6 of osdmap\n", version);
-               goto e_inval;
-       }
+       err = get_osdmap_client_data_v(p, end, "full", &struct_v);
+       if (err)
+               goto bad;
 
        /* fsid, epoch, created, modified */
        ceph_decode_need(p, end, sizeof(map->fsid) + sizeof(u32) +
@@ -875,6 +1089,24 @@ static int osdmap_decode(void **p, void *end, struct ceph_osdmap *map)
        if (err)
                goto bad;
 
+       /* primary_temp */
+       if (struct_v >= 1) {
+               err = decode_primary_temp(p, end, map);
+               if (err)
+                       goto bad;
+       }
+
+       /* primary_affinity */
+       if (struct_v >= 2) {
+               err = decode_primary_affinity(p, end, map);
+               if (err)
+                       goto bad;
+       } else {
+               /* XXX can this happen? */
+               kfree(map->osd_primary_affinity);
+               map->osd_primary_affinity = NULL;
+       }
+
        /* crush */
        ceph_decode_32_safe(p, end, len, e_inval);
        map->crush = crush_decode(*p, min(*p + len, end));
@@ -915,6 +1147,7 @@ struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end)
                return ERR_PTR(-ENOMEM);
 
        map->pg_temp = RB_ROOT;
+       map->primary_temp = RB_ROOT;
        mutex_init(&map->crush_scratch_mutex);
 
        ret = osdmap_decode(p, end, map);
@@ -943,15 +1176,13 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
        __s32 new_flags, max;
        void *start = *p;
        int err;
-       u16 version;
+       u8 struct_v;
 
        dout("%s %p to %p len %d\n", __func__, *p, end, (int)(end - *p));
 
-       ceph_decode_16_safe(p, end, version, e_inval);
-       if (version != 6) {
-               pr_warning("got unknown v %d != 6 of inc osdmap\n", version);
-               goto e_inval;
-       }
+       err = get_osdmap_client_data_v(p, end, "inc", &struct_v);
+       if (err)
+               goto bad;
 
        /* fsid, epoch, modified, new_pool_max, new_flags */
        ceph_decode_need(p, end, sizeof(fsid) + sizeof(u32) + sizeof(modified) +
@@ -1076,6 +1307,20 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
        if (err)
                goto bad;
 
+       /* new_primary_temp */
+       if (struct_v >= 1) {
+               err = decode_new_primary_temp(p, end, map);
+               if (err)
+                       goto bad;
+       }
+
+       /* new_primary_affinity */
+       if (struct_v >= 2) {
+               err = decode_new_primary_affinity(p, end, map);
+               if (err)
+                       goto bad;
+       }
+
        /* ignore the rest */
        *p = end;
 
@@ -1232,8 +1477,8 @@ static int *calc_pg_raw(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
                                    pool->pg_num_mask);
        pg = __lookup_pg_mapping(&osdmap->pg_temp, pgid);
        if (pg) {
-               *num = pg->len;
-               return pg->osds;
+               *num = pg->pg_temp.len;
+               return pg->pg_temp.osds;
        }
 
        /* crush */
@@ -1276,24 +1521,186 @@ static int *calc_pg_raw(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
 }
 
 /*
- * Return acting set for given pgid.
+ * Calculate raw (crush) set for given pgid.
+ *
+ * Return raw set length, or error.
+ */
+static int pg_to_raw_osds(struct ceph_osdmap *osdmap,
+                         struct ceph_pg_pool_info *pool,
+                         struct ceph_pg pgid, u32 pps, int *osds)
+{
+       /* never ask crush for more than CEPH_PG_MAX_SIZE osds */
+       int want = min_t(int, pool->size, CEPH_PG_MAX_SIZE);
+       int rule;
+       int n;
+
+       /* pick the crush rule for this pool */
+       rule = crush_find_rule(osdmap->crush, pool->crush_ruleset,
+                              pool->type, pool->size);
+       if (rule < 0) {
+               pr_err("no crush rule: pool %lld ruleset %d type %d size %d\n",
+                      pgid.pool, pool->crush_ruleset, pool->type,
+                      pool->size);
+               return -ENOENT;
+       }
+
+       /* run it; a negative result is logged and handed to the caller */
+       n = do_crush(osdmap, rule, pps, osds, want,
+                    osdmap->osd_weight, osdmap->max_osd);
+       if (n < 0)
+               pr_err("error %d from crush rule %d: pool %lld ruleset %d type %d size %d\n",
+                      n, rule, pgid.pool, pool->crush_ruleset,
+                      pool->type, pool->size);
+
+       return n;
+}
+
+/*
+ * Reduce a raw set to the osds that are not down, in place.
+ *
+ * If the pool allows shifting, down osds are squeezed out and the
+ * remaining ones keep their relative order.  Otherwise the set keeps
+ * its length and every down osd is replaced with CRUSH_ITEM_NONE.
+ *
+ * Return up set length.  *primary is set to the first osd that is not
+ * down, or -1 if there is none.
+ */
+static int raw_to_up_osds(struct ceph_osdmap *osdmap,
+                         struct ceph_pg_pool_info *pool,
+                         int *osds, int len, int *primary)
+{
+       int first_up = -1;
+       int i;
+
+       if (!ceph_can_shift_osds(pool)) {
+               /* walk backwards so the lowest-index up osd wins */
+               for (i = len - 1; i >= 0; i--) {
+                       if (ceph_osd_is_down(osdmap, osds[i]))
+                               osds[i] = CRUSH_ITEM_NONE;
+                       else
+                               first_up = osds[i];
+               }
+       } else {
+               int dropped = 0;
+
+               for (i = 0; i < len; i++) {
+                       if (!ceph_osd_is_down(osdmap, osds[i])) {
+                               /* slide left over the holes seen so far */
+                               if (dropped)
+                                       osds[i - dropped] = osds[i];
+                       } else {
+                               dropped++;
+                       }
+               }
+
+               len -= dropped;
+               if (len > 0)
+                       first_up = osds[0];
+       }
+
+       *primary = first_up;
+       return len;
+}
+
+/*
+ * Given up set, apply pg_temp and primary_temp mappings.
+ *
+ * On entry osds[0..len) and *primary describe the up set.  If a pg_temp
+ * mapping exists for the pg, osds[] is overwritten from it and the
+ * primary becomes the first entry that is not CRUSH_ITEM_NONE;
+ * otherwise the up set and its primary are kept.  A primary_temp
+ * mapping, if present, then overrides the primary.
+ *
+ * Return acting set length.  *primary is set to acting primary osd id,
+ * or -1 if acting set is empty.
+ *
+ * NOTE(review): nothing in this function bounds pg->pg_temp.len against
+ * the capacity of osds[] -- confirm the decoder or the caller
+ * guarantees that a pg_temp mapping always fits.
+ */
+static int apply_temps(struct ceph_osdmap *osdmap,
+                      struct ceph_pg_pool_info *pool, struct ceph_pg pgid,
+                      int *osds, int len, int *primary)
+{
+       struct ceph_pg_mapping *pg;
+       int temp_len;
+       int temp_primary;
+       int i;
+
+       /*
+        * raw_pg -> pg
+        *
+        * pgid is passed by value, so folding the seed only affects the
+        * lookups below.
+        */
+       pgid.seed = ceph_stable_mod(pgid.seed, pool->pg_num,
+                                   pool->pg_num_mask);
+
+       /*
+        * pg_temp?
+        *
+        * Down osds are dropped when the pool can shift osds, and kept
+        * as CRUSH_ITEM_NONE placeholders when it cannot.
+        */
+       pg = __lookup_pg_mapping(&osdmap->pg_temp, pgid);
+       if (pg) {
+               temp_len = 0;
+               temp_primary = -1;
+
+               for (i = 0; i < pg->pg_temp.len; i++) {
+                       if (ceph_osd_is_down(osdmap, pg->pg_temp.osds[i])) {
+                               if (ceph_can_shift_osds(pool))
+                                       continue;
+                               else
+                                       osds[temp_len++] = CRUSH_ITEM_NONE;
+                       } else {
+                               osds[temp_len++] = pg->pg_temp.osds[i];
+                       }
+               }
+
+               /*
+                * apply pg_temp's primary: the first slot that holds a
+                * real osd
+                */
+               for (i = 0; i < temp_len; i++) {
+                       if (osds[i] != CRUSH_ITEM_NONE) {
+                               temp_primary = osds[i];
+                               break;
+                       }
+               }
+       } else {
+               temp_len = len;
+               temp_primary = *primary;
+       }
+
+       /*
+        * primary_temp?
+        *
+        * Overrides whichever primary was chosen above; the osd list
+        * itself is left as is.
+        */
+       pg = __lookup_pg_mapping(&osdmap->primary_temp, pgid);
+       if (pg)
+               temp_primary = pg->primary_temp.osd;
+
+       *primary = temp_primary;
+       return temp_len;
+}
+
+/*
+ * Calculate acting set for given pgid.
+ *
+ * Return acting set length, or error.  *primary is set to acting
+ * primary osd id, or -1 if acting set is empty or on error.
  */
 int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
-                       int *acting)
+                       int *osds, int *primary)
 {
-       int rawosds[CEPH_PG_MAX_SIZE], *osds;
-       int i, o, num = CEPH_PG_MAX_SIZE;
+       struct ceph_pg_pool_info *pool;
+       u32 pps;
+       int len;
 
-       osds = calc_pg_raw(osdmap, pgid, rawosds, &num);
-       if (!osds)
-               return -1;
+       pool = __lookup_pg_pool(&osdmap->pg_pools, pgid.pool);
+       if (!pool) {
+               *primary = -1;
+               return -ENOENT;
+       }
 
-       /* primary is first up osd */
-       o = 0;
-       for (i = 0; i < num; i++)
-               if (ceph_osd_is_up(osdmap, osds[i]))
-                       acting[o++] = osds[i];
-       return o;
+       if (pool->flags & CEPH_POOL_FLAG_HASHPSPOOL) {
+               /* hash pool id and seed so that pool PGs do not overlap */
+               pps = crush_hash32_2(CRUSH_HASH_RJENKINS1,
+                                    ceph_stable_mod(pgid.seed, pool->pgp_num,
+                                                    pool->pgp_num_mask),
+                                    pgid.pool);
+       } else {
+               /*
+                * legacy behavior: add ps and pool together.  this is
+                * not a great approach because the PGs from each pool
+                * will overlap on top of each other: 0.5 == 1.4 ==
+                * 2.3 == ...
+                */
+               pps = ceph_stable_mod(pgid.seed, pool->pgp_num,
+                                     pool->pgp_num_mask) +
+                       (unsigned)pgid.pool;
+       }
+
+       len = pg_to_raw_osds(osdmap, pool, pgid, pps, osds);
+       if (len < 0) {
+               *primary = -1;
+               return len;
+       }
+
+       len = raw_to_up_osds(osdmap, pool, osds, len, primary);
+
+       len = apply_temps(osdmap, pool, pgid, osds, len, primary);
+
+       return len;
 }
 
 /*