libceph: add support for primary_temp mappings
[firefly-linux-kernel-4.4.55.git] / net / ceph / osdmap.c
index c0fc517ab321b0eb15bd269ac3dd363e945f91b3..20a38a37794cedd554c23f70ca97e02748e3d4a1 100644 (file)
@@ -649,6 +649,7 @@ void ceph_osdmap_destroy(struct ceph_osdmap *map)
        kfree(map->osd_state);
        kfree(map->osd_weight);
        kfree(map->osd_addr);
+       kfree(map->osd_primary_affinity);
        kfree(map);
 }
 
@@ -685,6 +686,20 @@ static int osdmap_set_max_osd(struct ceph_osdmap *map, int max)
        map->osd_weight = weight;
        map->osd_addr = addr;
 
+       if (map->osd_primary_affinity) {
+               u32 *affinity;
+
+               affinity = krealloc(map->osd_primary_affinity,
+                                   max*sizeof(*affinity), GFP_NOFS);
+               if (!affinity)
+                       return -ENOMEM;
+
+               for (i = map->max_osd; i < max; i++)
+                       affinity[i] = CEPH_OSD_DEFAULT_PRIMARY_AFFINITY;
+
+               map->osd_primary_affinity = affinity;
+       }
+
        map->max_osd = max;
 
        return 0;
@@ -857,6 +872,147 @@ static int decode_new_pg_temp(void **p, void *end, struct ceph_osdmap *map)
        return __decode_pg_temp(p, end, map, true);
 }
 
+/*
+ * Decode a primary_temp section: a u32 count followed by that many
+ * (pgid, u32 osd) pairs.  Any existing mapping for a decoded pgid is
+ * removed first.  For a full map every pair is inserted; for an
+ * incremental an osd of (u32)-1 only removes the mapping.
+ *
+ * Returns 0 on success, -EINVAL if a ceph_decode_32_safe() check fails,
+ * -ENOMEM on allocation failure, or the error from ceph_decode_pgid()
+ * or __insert_pg_mapping().
+ */
+static int __decode_primary_temp(void **p, void *end, struct ceph_osdmap *map,
+                                bool incremental)
+{
+       u32 count;
+
+       ceph_decode_32_safe(p, end, count, e_inval);
+       for (; count; count--) {
+               struct ceph_pg_mapping *mapping;
+               struct ceph_pg pgid;
+               u32 osd;
+               int err;
+
+               err = ceph_decode_pgid(p, end, &pgid);
+               if (err)
+                       return err;
+
+               ceph_decode_32_safe(p, end, osd, e_inval);
+
+               /* for a full map the removal is expected to find nothing */
+               err = __remove_pg_mapping(&map->primary_temp, pgid);
+               BUG_ON(!incremental && err != -ENOENT);
+
+               /* in an incremental, (u32)-1 means "remove only" */
+               if (incremental && osd == (u32)-1)
+                       continue;
+
+               mapping = kzalloc(sizeof(*mapping), GFP_NOFS);
+               if (!mapping)
+                       return -ENOMEM;
+
+               mapping->pgid = pgid;
+               mapping->primary_temp.osd = osd;
+
+               err = __insert_pg_mapping(mapping, &map->primary_temp);
+               if (err) {
+                       kfree(mapping);
+                       return err;
+               }
+       }
+
+       return 0;
+
+e_inval:
+       return -EINVAL;
+}
+
+/* Decode a primary_temp section in full-map mode (every pair is inserted). */
+static int decode_primary_temp(void **p, void *end, struct ceph_osdmap *map)
+{
+       const bool incremental = false;
+
+       return __decode_primary_temp(p, end, map, incremental);
+}
+
+/*
+ * Decode a primary_temp section in incremental mode: an osd of (u32)-1
+ * removes the mapping instead of inserting one.
+ */
+static int decode_new_primary_temp(void **p, void *end,
+                                  struct ceph_osdmap *map)
+{
+       const bool incremental = true;
+
+       return __decode_primary_temp(p, end, map, incremental);
+}
+
+/*
+ * Return the primary affinity of @osd.  While no affinity array is
+ * attached to the map, every osd reports
+ * CEPH_OSD_DEFAULT_PRIMARY_AFFINITY.
+ */
+u32 ceph_get_primary_affinity(struct ceph_osdmap *map, int osd)
+{
+       const u32 *affinity = map->osd_primary_affinity;
+
+       BUG_ON(osd >= map->max_osd);
+
+       return affinity ? affinity[osd] : CEPH_OSD_DEFAULT_PRIMARY_AFFINITY;
+}
+
+/*
+ * Set the primary affinity of @osd to @aff.
+ *
+ * The affinity array is allocated on first use, sized for map->max_osd
+ * entries and filled with CEPH_OSD_DEFAULT_PRIMARY_AFFINITY before the
+ * requested entry is overwritten.
+ *
+ * Returns 0 on success or -ENOMEM if the array cannot be allocated.
+ */
+static int set_primary_affinity(struct ceph_osdmap *map, int osd, u32 aff)
+{
+       /* osd indexes the array below, so negative ids are just as fatal */
+       BUG_ON(osd < 0 || osd >= map->max_osd);
+
+       if (!map->osd_primary_affinity) {
+               u32 *affinity;
+               int i;
+
+               /* kmalloc_array() fails instead of letting n * size wrap */
+               affinity = kmalloc_array(map->max_osd, sizeof(*affinity),
+                                        GFP_NOFS);
+               if (!affinity)
+                       return -ENOMEM;
+
+               for (i = 0; i < map->max_osd; i++)
+                       affinity[i] = CEPH_OSD_DEFAULT_PRIMARY_AFFINITY;
+
+               /* publish only once fully initialized */
+               map->osd_primary_affinity = affinity;
+       }
+
+       map->osd_primary_affinity[osd] = aff;
+
+       return 0;
+}
+
+/*
+ * Decode a primary_affinity section: a u32 length followed by that many
+ * u32 affinity values, one per osd.
+ *
+ * A length of 0 frees the affinity array; any other length must equal
+ * map->max_osd.  Returns 0 on success, -EINVAL on a length mismatch or
+ * a failed decode check, or the error from set_primary_affinity().
+ */
+static int decode_primary_affinity(void **p, void *end,
+                                  struct ceph_osdmap *map)
+{
+       u32 len, osd;
+       int err;
+
+       ceph_decode_32_safe(p, end, len, e_inval);
+       if (!len) {
+               /* empty section: drop any array that is attached */
+               kfree(map->osd_primary_affinity);
+               map->osd_primary_affinity = NULL;
+               return 0;
+       }
+       if (len != map->max_osd)
+               goto e_inval;
+
+       /* one check up front covers every ceph_decode_32() below */
+       ceph_decode_need(p, end, map->max_osd*sizeof(u32), e_inval);
+
+       for (osd = 0; osd < map->max_osd; osd++) {
+               err = set_primary_affinity(map, osd, ceph_decode_32(p));
+               if (err)
+                       return err;
+       }
+
+       return 0;
+
+e_inval:
+       return -EINVAL;
+}
+
+/*
+ * Decode a new_primary_affinity section: a u32 count followed by that
+ * many (u32 osd, u32 affinity) pairs, each applied with
+ * set_primary_affinity().
+ *
+ * Returns 0 on success, -EINVAL if a decode check fails or an osd id is
+ * outside [0, map->max_osd), or the error from set_primary_affinity().
+ */
+static int decode_new_primary_affinity(void **p, void *end,
+                                      struct ceph_osdmap *map)
+{
+       u32 n;
+
+       ceph_decode_32_safe(p, end, n, e_inval);
+       while (n--) {
+               u32 osd, aff;
+               int ret;
+
+               ceph_decode_32_safe(p, end, osd, e_inval);
+               ceph_decode_32_safe(p, end, aff, e_inval);
+
+               /*
+                * osd is decoded from the message buffer: reject ids
+                * outside the map here rather than hitting the BUG_ON()
+                * in set_primary_affinity() or, for values that turn
+                * negative when converted to int, indexing the affinity
+                * array out of bounds.
+                */
+               if (map->max_osd <= 0 || osd >= (u32)map->max_osd)
+                       goto e_inval;
+
+               ret = set_primary_affinity(map, osd, aff);
+               if (ret)
+                       return ret;
+       }
+
+       return 0;
+
+e_inval:
+       return -EINVAL;
+}
+
 /*
  * decode a full map.
  */
@@ -933,6 +1089,24 @@ static int osdmap_decode(void **p, void *end, struct ceph_osdmap *map)
        if (err)
                goto bad;
 
+       /* primary_temp */
+       if (struct_v >= 1) {
+               err = decode_primary_temp(p, end, map);
+               if (err)
+                       goto bad;
+       }
+
+       /* primary_affinity */
+       if (struct_v >= 2) {
+               err = decode_primary_affinity(p, end, map);
+               if (err)
+                       goto bad;
+       } else {
+               /* XXX can this happen? */
+               kfree(map->osd_primary_affinity);
+               map->osd_primary_affinity = NULL;
+       }
+
        /* crush */
        ceph_decode_32_safe(p, end, len, e_inval);
        map->crush = crush_decode(*p, min(*p + len, end));
@@ -1133,6 +1307,20 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
        if (err)
                goto bad;
 
+       /* new_primary_temp */
+       if (struct_v >= 1) {
+               err = decode_new_primary_temp(p, end, map);
+               if (err)
+                       goto bad;
+       }
+
+       /* new_primary_affinity */
+       if (struct_v >= 2) {
+               err = decode_new_primary_affinity(p, end, map);
+               if (err)
+                       goto bad;
+       }
+
        /* ignore the rest */
        *p = end;
 
@@ -1333,24 +1521,186 @@ static int *calc_pg_raw(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
 }
 
 /*
- * Return acting set for given pgid.
+ * Calculate raw (crush) set for given pgid.
+ *
+ * Return raw set length, or error.
+ */
+static int pg_to_raw_osds(struct ceph_osdmap *osdmap,
+                         struct ceph_pg_pool_info *pool,
+                         struct ceph_pg pgid, u32 pps, int *osds)
+{
+       /* never ask crush for more than CEPH_PG_MAX_SIZE results */
+       int want = min_t(int, pool->size, CEPH_PG_MAX_SIZE);
+       int rule;
+       int n;
+
+       /* pick the crush rule from the pool's ruleset, type and size */
+       rule = crush_find_rule(osdmap->crush, pool->crush_ruleset,
+                              pool->type, pool->size);
+       if (rule < 0) {
+               pr_err("no crush rule: pool %lld ruleset %d type %d size %d\n",
+                      pgid.pool, pool->crush_ruleset, pool->type,
+                      pool->size);
+               return -ENOENT;
+       }
+
+       /* run the rule on pps; results land in osds[] */
+       n = do_crush(osdmap, rule, pps, osds, want,
+                    osdmap->osd_weight, osdmap->max_osd);
+       if (n < 0)
+               pr_err("error %d from crush rule %d: pool %lld ruleset %d type %d size %d\n",
+                      n, rule, pgid.pool, pool->crush_ruleset,
+                      pool->type, pool->size);
+
+       /* either the raw set length or the do_crush() error */
+       return n;
+}
+
+/*
+ * Given raw set, calculate up set and up primary.
+ *
+ * Return up set length.  *primary is set to up primary osd id, or -1
+ * if up set is empty.
+ */
+static int raw_to_up_osds(struct ceph_osdmap *osdmap,
+                         struct ceph_pg_pool_info *pool,
+                         int *osds, int len, int *primary)
+{
+       int first_up = -1;
+       int i;
+
+       if (!ceph_can_shift_osds(pool)) {
+               /*
+                * Positions are kept: down osds become CRUSH_ITEM_NONE.
+                * Walking backwards leaves first_up at the lowest-indexed
+                * osd that is not down.
+                */
+               i = len;
+               while (i-- > 0) {
+                       if (ceph_osd_is_down(osdmap, osds[i]))
+                               osds[i] = CRUSH_ITEM_NONE;
+                       else
+                               first_up = osds[i];
+               }
+       } else {
+               int kept = 0;
+
+               /* squeeze down osds out of the array, preserving order */
+               for (i = 0; i < len; i++) {
+                       if (ceph_osd_is_down(osdmap, osds[i]))
+                               continue;
+                       if (kept != i)
+                               osds[kept] = osds[i];
+                       kept++;
+               }
+
+               len = kept;
+               if (len > 0)
+                       first_up = osds[0];
+       }
+
+       *primary = first_up;
+       return len;
+}
+
+/*
+ * Given up set, apply pg_temp and primary_temp mappings.
+ *
+ * Return acting set length.  *primary is set to acting primary osd id,
+ * or -1 if acting set is empty.
+ */
+static int apply_temps(struct ceph_osdmap *osdmap,
+                      struct ceph_pg_pool_info *pool, struct ceph_pg pgid,
+                      int *osds, int len, int *primary)
+{
+       struct ceph_pg_mapping *pg;
+       int acting_len;
+       int acting_primary;
+       int i;
+
+       /* raw_pg -> pg */
+       pgid.seed = ceph_stable_mod(pgid.seed, pool->pg_num,
+                                   pool->pg_num_mask);
+
+       pg = __lookup_pg_mapping(&osdmap->pg_temp, pgid);
+       if (!pg) {
+               /* no pg_temp: the incoming set and primary carry over */
+               acting_len = len;
+               acting_primary = *primary;
+       } else {
+               acting_len = 0;
+               acting_primary = -1;
+
+               /*
+                * pg_temp overwrites osds[]; a down osd is either skipped
+                * or, when osds cannot shift, kept as CRUSH_ITEM_NONE.
+                */
+               for (i = 0; i < pg->pg_temp.len; i++) {
+                       if (!ceph_osd_is_down(osdmap, pg->pg_temp.osds[i]))
+                               osds[acting_len++] = pg->pg_temp.osds[i];
+                       else if (!ceph_can_shift_osds(pool))
+                               osds[acting_len++] = CRUSH_ITEM_NONE;
+               }
+
+               /* pg_temp's primary is its first non-NONE entry */
+               for (i = 0; i < acting_len; i++) {
+                       if (osds[i] == CRUSH_ITEM_NONE)
+                               continue;
+                       acting_primary = osds[i];
+                       break;
+               }
+       }
+
+       /* a primary_temp mapping overrides whatever was picked above */
+       pg = __lookup_pg_mapping(&osdmap->primary_temp, pgid);
+       if (pg)
+               acting_primary = pg->primary_temp.osd;
+
+       *primary = acting_primary;
+       return acting_len;
+}
+
+/*
+ * Calculate acting set for given pgid.
+ *
+ * The pool is looked up from pgid.pool and pps is derived from
+ * pgid.seed (hashed together with the pool id when the pool has
+ * CEPH_POOL_FLAG_HASHPSPOOL set, added to it otherwise).  The result
+ * is then built in osds[] in three steps: pg_to_raw_osds() (crush),
+ * raw_to_up_osds() and apply_temps().
+ *
+ * NOTE(review): osds[] presumably needs room for CEPH_PG_MAX_SIZE
+ * entries - confirm against callers.
+ *
+ * Return acting set length, or error: -ENOENT if the pool is not
+ * found, or the error from pg_to_raw_osds().  *primary is set to acting
+ * primary osd id, or -1 if acting set is empty or on error.
  */
 int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
-                       int *acting)
+                       int *osds, int *primary)
 {
-       int rawosds[CEPH_PG_MAX_SIZE], *osds;
-       int i, o, num = CEPH_PG_MAX_SIZE;
+       struct ceph_pg_pool_info *pool;
+       u32 pps;
+       int len;
 
-       osds = calc_pg_raw(osdmap, pgid, rawosds, &num);
-       if (!osds)
-               return -1;
+       pool = __lookup_pg_pool(&osdmap->pg_pools, pgid.pool);
+       if (!pool) {
+               *primary = -1;
+               return -ENOENT;
+       }
 
-       /* primary is first up osd */
-       o = 0;
-       for (i = 0; i < num; i++)
-               if (ceph_osd_is_up(osdmap, osds[i]))
-                       acting[o++] = osds[i];
-       return o;
+       if (pool->flags & CEPH_POOL_FLAG_HASHPSPOOL) {
+               /* hash pool id and seed so that pool PGs do not overlap */
+               pps = crush_hash32_2(CRUSH_HASH_RJENKINS1,
+                                    ceph_stable_mod(pgid.seed, pool->pgp_num,
+                                                    pool->pgp_num_mask),
+                                    pgid.pool);
+       } else {
+               /*
+                * legacy behavior: add ps and pool together.  this is
+                * not a great approach because the PGs from each pool
+                * will overlap on top of each other: 0.5 == 1.4 ==
+                * 2.3 == ...
+                */
+               pps = ceph_stable_mod(pgid.seed, pool->pgp_num,
+                                     pool->pgp_num_mask) +
+                       (unsigned)pgid.pool;
+       }
+
+       len = pg_to_raw_osds(osdmap, pool, pgid, pps, osds);
+       if (len < 0) {
+               *primary = -1;
+               return len;
+       }
+
+       len = raw_to_up_osds(osdmap, pool, osds, len, primary);
+
+       len = apply_temps(osdmap, pool, pgid, osds, len, primary);
+
+       return len;
 }
 
 /*