Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs
[firefly-linux-kernel-4.4.55.git] / net / ceph / osdmap.c
index df9389ddd56c0c662325a10b9d3fd68f2ba35bae..e632b5a52f5b89cb2e275b64494905cc7ebfc8e7 100644 (file)
@@ -1005,6 +1005,8 @@ static int decode_new_primary_affinity(void **p, void *end,
                ret = set_primary_affinity(map, osd, aff);
                if (ret)
                        return ret;
+
+               pr_info("osd%d primary-affinity 0x%x\n", osd, aff);
        }
 
        return 0;
@@ -1455,71 +1457,6 @@ static int do_crush(struct ceph_osdmap *map, int ruleno, int x,
        return r;
 }
 
-/*
- * Calculate raw osd vector for the given pgid.  Return pointer to osd
- * array, or NULL on failure.
- */
-static int *calc_pg_raw(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
-                       int *osds, int *num)
-{
-       struct ceph_pg_mapping *pg;
-       struct ceph_pg_pool_info *pool;
-       int ruleno;
-       int r;
-       u32 pps;
-
-       pool = __lookup_pg_pool(&osdmap->pg_pools, pgid.pool);
-       if (!pool)
-               return NULL;
-
-       /* pg_temp? */
-       pgid.seed = ceph_stable_mod(pgid.seed, pool->pg_num,
-                                   pool->pg_num_mask);
-       pg = __lookup_pg_mapping(&osdmap->pg_temp, pgid);
-       if (pg) {
-               *num = pg->pg_temp.len;
-               return pg->pg_temp.osds;
-       }
-
-       /* crush */
-       ruleno = crush_find_rule(osdmap->crush, pool->crush_ruleset,
-                                pool->type, pool->size);
-       if (ruleno < 0) {
-               pr_err("no crush rule pool %lld ruleset %d type %d size %d\n",
-                      pgid.pool, pool->crush_ruleset, pool->type,
-                      pool->size);
-               return NULL;
-       }
-
-       if (pool->flags & CEPH_POOL_FLAG_HASHPSPOOL) {
-               /* hash pool id and seed sothat pool PGs do not overlap */
-               pps = crush_hash32_2(CRUSH_HASH_RJENKINS1,
-                                    ceph_stable_mod(pgid.seed, pool->pgp_num,
-                                                    pool->pgp_num_mask),
-                                    pgid.pool);
-       } else {
-               /*
-                * legacy ehavior: add ps and pool together.  this is
-                * not a great approach because the PGs from each pool
-                * will overlap on top of each other: 0.5 == 1.4 ==
-                * 2.3 == ...
-                */
-               pps = ceph_stable_mod(pgid.seed, pool->pgp_num,
-                                     pool->pgp_num_mask) +
-                       (unsigned)pgid.pool;
-       }
-       r = do_crush(osdmap, ruleno, pps, osds, min_t(int, pool->size, *num),
-                    osdmap->osd_weight, osdmap->max_osd);
-       if (r < 0) {
-               pr_err("error %d from crush rule: pool %lld ruleset %d type %d"
-                      " size %d\n", r, pgid.pool, pool->crush_ruleset,
-                      pool->type, pool->size);
-               return NULL;
-       }
-       *num = r;
-       return osds;
-}
-
 /*
  * Calculate raw (crush) set for given pgid.
  *
@@ -1596,8 +1533,74 @@ static int raw_to_up_osds(struct ceph_osdmap *osdmap,
        return len;
 }
 
+/*
+ * Adjust the choice of primary for the up set in osds[0..len) based on
+ * per-osd primary_affinity.  *primary is updated if a primary is picked;
+ * if the pool's osds can be shifted, it is also moved to the front.
+ */
+static void apply_primary_affinity(struct ceph_osdmap *osdmap, u32 pps,
+                                  struct ceph_pg_pool_info *pool,
+                                  int *osds, int len, int *primary)
+{
+       int i;
+       int pos = -1;
+
+       /* Any non-default primary_affinity values for these osds? */
+       if (!osdmap->osd_primary_affinity)
+               return;
+
+       for (i = 0; i < len; i++) {
+               /* osd_primary_affinity is indexed by osd id, not slot */
+               if (osds[i] != CRUSH_ITEM_NONE &&
+                   osdmap->osd_primary_affinity[osds[i]] !=
+                                       CEPH_OSD_DEFAULT_PRIMARY_AFFINITY)
+                       break;
+       }
+       if (i == len)
+               return;
+
+       /*
+        * Pick the primary.  Feed both the seed (for the pg) and the
+        * osd into the hash/rng so that a proportional fraction of an
+        * osd's pgs get rejected as primary.
+        */
+       for (i = 0; i < len; i++) {
+               int osd = osds[i];
+               u32 aff;
+
+               if (osd == CRUSH_ITEM_NONE)
+                       continue;
+
+               aff = osdmap->osd_primary_affinity[osd];
+               if (aff < CEPH_OSD_MAX_PRIMARY_AFFINITY &&
+                   (crush_hash32_2(CRUSH_HASH_RJENKINS1,
+                                   pps, osd) >> 16) >= aff) {
+                       /*
+                        * Rejected as primary: keep the first such osd as
+                        * a fallback in case no one else is picked.
+                        */
+                       if (pos < 0)
+                               pos = i;
+               } else {
+                       pos = i;
+                       break;
+               }
+       }
+       if (pos < 0)
+               return;
+
+       *primary = osds[pos];
+
+       if (ceph_can_shift_osds(pool) && pos > 0) {
+               /* move the new primary to the front */
+               for (i = pos; i > 0; i--)
+                       osds[i] = osds[i - 1];
+               osds[0] = *primary;
+       }
+}
+
 /*
- * Given up set, apply pg_temp mapping.
+ * Given up set, apply pg_temp and primary_temp mappings.
  *
  * Return acting set length.  *primary is set to acting primary osd id,
  * or -1 if acting set is empty.
@@ -1644,6 +1647,11 @@ static int apply_temps(struct ceph_osdmap *osdmap,
                temp_primary = *primary;
        }
 
+       /* primary_temp? */
+       pg = __lookup_pg_mapping(&osdmap->primary_temp, pgid);
+       if (pg)
+               temp_primary = pg->primary_temp.osd;
+
        *primary = temp_primary;
        return temp_len;
 }
@@ -1693,6 +1701,8 @@ int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
 
        len = raw_to_up_osds(osdmap, pool, osds, len, primary);
 
+       apply_primary_affinity(osdmap, pps, pool, osds, len, primary);
+
        len = apply_temps(osdmap, pool, pgid, osds, len, primary);
 
        return len;
@@ -1703,17 +1713,11 @@ int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
  */
 int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, struct ceph_pg pgid)
 {
-       int rawosds[CEPH_PG_MAX_SIZE], *osds;
-       int i, num = CEPH_PG_MAX_SIZE;
+       int osds[CEPH_PG_MAX_SIZE];     /* filled by callee, not used here */
+       int primary;
 
-       osds = calc_pg_raw(osdmap, pgid, rawosds, &num);
-       if (!osds)
-               return -1;
+       ceph_calc_pg_acting(osdmap, pgid, osds, &primary);
 
-       /* primary is first up osd */
-       for (i = 0; i < num; i++)
-               if (ceph_osd_is_up(osdmap, osds[i]))
-                       return osds[i];
-       return -1;
+       return primary;
 }
 EXPORT_SYMBOL(ceph_calc_pg_primary);