ret = set_primary_affinity(map, osd, aff);
if (ret)
return ret;
+
+ pr_info("osd%d primary-affinity 0x%x\n", osd, aff);
}
return 0;
return r;
}
-/*
- * Calculate raw osd vector for the given pgid. Return pointer to osd
- * array, or NULL on failure.
- */
-static int *calc_pg_raw(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
- int *osds, int *num)
-{
- struct ceph_pg_mapping *pg;
- struct ceph_pg_pool_info *pool;
- int ruleno;
- int r;
- u32 pps;
-
- pool = __lookup_pg_pool(&osdmap->pg_pools, pgid.pool);
- if (!pool)
- return NULL;
-
- /* pg_temp? */
- pgid.seed = ceph_stable_mod(pgid.seed, pool->pg_num,
- pool->pg_num_mask);
- pg = __lookup_pg_mapping(&osdmap->pg_temp, pgid);
- if (pg) {
- *num = pg->pg_temp.len;
- return pg->pg_temp.osds;
- }
-
- /* crush */
- ruleno = crush_find_rule(osdmap->crush, pool->crush_ruleset,
- pool->type, pool->size);
- if (ruleno < 0) {
- pr_err("no crush rule pool %lld ruleset %d type %d size %d\n",
- pgid.pool, pool->crush_ruleset, pool->type,
- pool->size);
- return NULL;
- }
-
- if (pool->flags & CEPH_POOL_FLAG_HASHPSPOOL) {
- /* hash pool id and seed sothat pool PGs do not overlap */
- pps = crush_hash32_2(CRUSH_HASH_RJENKINS1,
- ceph_stable_mod(pgid.seed, pool->pgp_num,
- pool->pgp_num_mask),
- pgid.pool);
- } else {
- /*
- * legacy ehavior: add ps and pool together. this is
- * not a great approach because the PGs from each pool
- * will overlap on top of each other: 0.5 == 1.4 ==
- * 2.3 == ...
- */
- pps = ceph_stable_mod(pgid.seed, pool->pgp_num,
- pool->pgp_num_mask) +
- (unsigned)pgid.pool;
- }
- r = do_crush(osdmap, ruleno, pps, osds, min_t(int, pool->size, *num),
- osdmap->osd_weight, osdmap->max_osd);
- if (r < 0) {
- pr_err("error %d from crush rule: pool %lld ruleset %d type %d"
- " size %d\n", r, pgid.pool, pool->crush_ruleset,
- pool->type, pool->size);
- return NULL;
- }
- *num = r;
- return osds;
-}
-
/*
* Calculate raw (crush) set for given pgid.
*
return len;
}
+static void apply_primary_affinity(struct ceph_osdmap *osdmap, u32 pps,
+ struct ceph_pg_pool_info *pool,
+ int *osds, int len, int *primary)
+{
+ int i;
+ int pos = -1;
+
+ /*
+ * Do we have any non-default primary_affinity values for these
+ * osds?
+ */
+ if (!osdmap->osd_primary_affinity)
+ return;
+
+ for (i = 0; i < len; i++) {
+ if (osds[i] != CRUSH_ITEM_NONE &&
+ osdmap->osd_primary_affinity[i] !=
+ CEPH_OSD_DEFAULT_PRIMARY_AFFINITY) {
+ break;
+ }
+ }
+ if (i == len)
+ return;
+
+ /*
+ * Pick the primary. Feed both the seed (for the pg) and the
+ * osd into the hash/rng so that a proportional fraction of an
+ * osd's pgs get rejected as primary.
+ */
+ for (i = 0; i < len; i++) {
+ int osd;
+ u32 aff;
+
+ osd = osds[i];
+ if (osd == CRUSH_ITEM_NONE)
+ continue;
+
+ aff = osdmap->osd_primary_affinity[osd];
+ if (aff < CEPH_OSD_MAX_PRIMARY_AFFINITY &&
+ (crush_hash32_2(CRUSH_HASH_RJENKINS1,
+ pps, osd) >> 16) >= aff) {
+ /*
+ * We chose not to use this primary. Note it
+ * anyway as a fallback in case we don't pick
+ * anyone else, but keep looking.
+ */
+ if (pos < 0)
+ pos = i;
+ } else {
+ pos = i;
+ break;
+ }
+ }
+ if (pos < 0)
+ return;
+
+ *primary = osds[pos];
+
+ if (ceph_can_shift_osds(pool) && pos > 0) {
+ /* move the new primary to the front */
+ for (i = pos; i > 0; i--)
+ osds[i] = osds[i - 1];
+ osds[0] = *primary;
+ }
+}
+
/*
- * Given up set, apply pg_temp mapping.
+ * Given up set, apply pg_temp and primary_temp mappings.
*
* Return acting set length. *primary is set to acting primary osd id,
* or -1 if acting set is empty.
temp_primary = *primary;
}
+ /* primary_temp? */
+ pg = __lookup_pg_mapping(&osdmap->primary_temp, pgid);
+ if (pg)
+ temp_primary = pg->primary_temp.osd;
+
*primary = temp_primary;
return temp_len;
}
len = raw_to_up_osds(osdmap, pool, osds, len, primary);
+ apply_primary_affinity(osdmap, pps, pool, osds, len, primary);
+
len = apply_temps(osdmap, pool, pgid, osds, len, primary);
return len;
*/
int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, struct ceph_pg pgid)
{
- int rawosds[CEPH_PG_MAX_SIZE], *osds;
- int i, num = CEPH_PG_MAX_SIZE;
+ int osds[CEPH_PG_MAX_SIZE];
+ int primary;
- osds = calc_pg_raw(osdmap, pgid, rawosds, &num);
- if (!osds)
- return -1;
+ ceph_calc_pg_acting(osdmap, pgid, osds, &primary);
- /* primary is first up osd */
- for (i = 0; i < num; i++)
- if (ceph_osd_is_up(osdmap, osds[i]))
- return osds[i];
- return -1;
+ return primary;
}
EXPORT_SYMBOL(ceph_calc_pg_primary);