From d30291b985d1854565d7f2c82a4457869d5265e8 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Fri, 29 Apr 2016 19:54:20 +0200 Subject: libceph: variable-sized ceph_object_id Currently ceph_object_id can hold object names of up to 100 (CEPH_MAX_OID_NAME_LEN) characters. This is enough for all use cases, expect one - long rbd image names: - a format 1 header is named ".rbd" - an object that points to a format 2 header is named "rbd_id." We operate on these potentially long-named objects during rbd map, and, for format 1 images, during header refresh. (A format 2 header name is a small system-generated string.) Lift this 100 character limit by making ceph_object_id be able to point to an externally-allocated string. Apart from being able to work with almost arbitrarily-long named objects, this allows us to reduce the size of ceph_object_id from >100 bytes to 64 bytes. Signed-off-by: Ilya Dryomov --- net/ceph/osdmap.c | 93 ++++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 92 insertions(+), 1 deletion(-) (limited to 'net/ceph/osdmap.c') diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index 243574c8cf33..4668b871ca47 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -1381,8 +1381,99 @@ bad: return ERR_PTR(err); } +void ceph_oid_copy(struct ceph_object_id *dest, + const struct ceph_object_id *src) +{ + WARN_ON(!ceph_oid_empty(dest)); + + if (src->name != src->inline_name) { + /* very rare, see ceph_object_id definition */ + dest->name = kmalloc(src->name_len + 1, + GFP_NOIO | __GFP_NOFAIL); + } + memcpy(dest->name, src->name, src->name_len + 1); + dest->name_len = src->name_len; +} +EXPORT_SYMBOL(ceph_oid_copy); +static __printf(2, 0) +int oid_printf_vargs(struct ceph_object_id *oid, const char *fmt, va_list ap) +{ + int len; + + WARN_ON(!ceph_oid_empty(oid)); + + len = vsnprintf(oid->inline_name, sizeof(oid->inline_name), fmt, ap); + if (len >= sizeof(oid->inline_name)) + return len; + + oid->name_len = len; + return 0; +} + +/* + * If oid doesn't fit into inline buffer, BUG. + */ +void ceph_oid_printf(struct ceph_object_id *oid, const char *fmt, ...) +{ + va_list ap; + + va_start(ap, fmt); + BUG_ON(oid_printf_vargs(oid, fmt, ap)); + va_end(ap); +} +EXPORT_SYMBOL(ceph_oid_printf); + +static __printf(3, 0) +int oid_aprintf_vargs(struct ceph_object_id *oid, gfp_t gfp, + const char *fmt, va_list ap) +{ + va_list aq; + int len; + + va_copy(aq, ap); + len = oid_printf_vargs(oid, fmt, aq); + va_end(aq); + + if (len) { + char *external_name; + + external_name = kmalloc(len + 1, gfp); + if (!external_name) + return -ENOMEM; + + oid->name = external_name; + WARN_ON(vsnprintf(oid->name, len + 1, fmt, ap) != len); + oid->name_len = len; + } + + return 0; +} + +/* + * If oid doesn't fit into inline buffer, allocate. + */ +int ceph_oid_aprintf(struct ceph_object_id *oid, gfp_t gfp, + const char *fmt, ...) +{ + va_list ap; + int ret; + + va_start(ap, fmt); + ret = oid_aprintf_vargs(oid, gfp, fmt, ap); + va_end(ap); + + return ret; +} +EXPORT_SYMBOL(ceph_oid_aprintf); + +void ceph_oid_destroy(struct ceph_object_id *oid) +{ + if (oid->name != oid->inline_name) + kfree(oid->name); +} +EXPORT_SYMBOL(ceph_oid_destroy); /* * calculate file layout from given offset, length. @@ -1474,7 +1565,7 @@ int ceph_oloc_oid_to_pg(struct ceph_osdmap *osdmap, pg_out->seed = ceph_str_hash(pi->object_hash, oid->name, oid->name_len); - dout("%s '%.*s' pgid %llu.%x\n", __func__, oid->name_len, oid->name, + dout("%s %*pE pgid %llu.%x\n", __func__, oid->name_len, oid->name, pg_out->pool, pg_out->seed); return 0; } -- cgit v1.2.3-70-g09d2 From 0c0a8de13f9612a663b050afa955e6668858d1eb Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 28 Apr 2016 16:07:21 +0200 Subject: libceph: nuke unused fields and functions Either unused or useless: osdmap->mkfs_epoch osd->o_marked_for_keepalive monc->num_generic_requests osdc->map_waiters osdc->last_requested_map osdc->timeout_tid osd_req_op_cls_response_data() osdmap_apply_incremental() @msgr arg Signed-off-by: Ilya Dryomov --- include/linux/ceph/mon_client.h | 1 - include/linux/ceph/osd_client.h | 8 -------- include/linux/ceph/osdmap.h | 6 ++---- net/ceph/mon_client.c | 3 --- net/ceph/osd_client.c | 13 +------------ net/ceph/osdmap.c | 3 +-- 6 files changed, 4 insertions(+), 30 deletions(-) (limited to 'net/ceph/osdmap.c') diff --git a/include/linux/ceph/mon_client.h b/include/linux/ceph/mon_client.h index e230e7ed60d3..330d045e4092 100644 --- a/include/linux/ceph/mon_client.h +++ b/include/linux/ceph/mon_client.h @@ -77,7 +77,6 @@ struct ceph_mon_client { /* pending generic requests */ struct rb_root generic_request_tree; - int num_generic_requests; u64 last_tid; /* subs, indexed with CEPH_SUB_* */ diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index 66a1fcd5bff7..63854a8df183 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -37,11 +37,9 @@ struct ceph_osd { struct list_head o_osd_lru; struct ceph_auth_handshake o_auth; unsigned long lru_ttl; - int o_marked_for_keepalive; struct list_head o_keepalive_item; }; - #define CEPH_OSD_SLAB_OPS 2 #define CEPH_OSD_MAX_OPS 16 @@ -206,13 +204,10 @@ struct ceph_osd_client { struct ceph_osdmap *osdmap; /* current map */ struct rw_semaphore map_sem; - struct completion map_waiters; - u64 last_requested_map; struct mutex request_mutex; struct rb_root osds; /* osds */ struct list_head osd_lru; /* idle osds */ - u64 timeout_tid; /* tid of timeout triggering rq */ u64 last_tid; /* tid of last request */ struct rb_root requests; /* pending requests */ struct list_head req_lru; /* in-flight lru */ @@ -271,9 +266,6 @@ extern void osd_req_op_extent_dup_last(struct ceph_osd_request *osd_req, extern struct ceph_osd_data *osd_req_op_extent_osd_data( struct ceph_osd_request *osd_req, unsigned int which); -extern struct ceph_osd_data *osd_req_op_cls_response_data( - struct ceph_osd_request *osd_req, - unsigned int which); extern void osd_req_op_extent_osd_data_pages(struct ceph_osd_request *, unsigned int which, diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h index 777a29412706..ce7a41a182d4 100644 --- a/include/linux/ceph/osdmap.h +++ b/include/linux/ceph/osdmap.h @@ -123,7 +123,6 @@ struct ceph_pg_mapping { struct ceph_osdmap { struct ceph_fsid fsid; u32 epoch; - u32 mkfs_epoch; struct ceph_timespec created, modified; u32 flags; /* CEPH_OSDMAP_* */ @@ -205,9 +204,8 @@ static inline int ceph_decode_pgid(void **p, void *end, struct ceph_pg *pgid) } extern struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end); -extern struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, - struct ceph_osdmap *map, - struct ceph_messenger *msgr); +struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, + struct ceph_osdmap *map); extern void ceph_osdmap_destroy(struct ceph_osdmap *map); /* calculate mapping of a file extent to an object */ diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c index cf638c009cfa..3dfafdad92aa 100644 --- a/net/ceph/mon_client.c +++ b/net/ceph/mon_client.c @@ -579,7 +579,6 @@ static int __do_generic_request(struct ceph_mon_client *monc, u64 tid, req->tid = tid != 0 ? tid : ++monc->last_tid; req->request->hdr.tid = cpu_to_le64(req->tid); __insert_generic_request(monc, req); - monc->num_generic_requests++; ceph_con_send(&monc->con, ceph_msg_get(req->request)); mutex_unlock(&monc->mutex); @@ -587,7 +586,6 @@ static int __do_generic_request(struct ceph_mon_client *monc, u64 tid, mutex_lock(&monc->mutex); rb_erase(&req->node, &monc->generic_request_tree); - monc->num_generic_requests--; if (!err) err = req->result; @@ -914,7 +912,6 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl) INIT_DELAYED_WORK(&monc->delayed_work, delayed_work); monc->generic_request_tree = RB_ROOT; - monc->num_generic_requests = 0; monc->last_tid = 0; return 0; diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 95910aed8e2e..32ba09be6ee6 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -143,14 +143,6 @@ osd_req_op_extent_osd_data(struct ceph_osd_request *osd_req, } EXPORT_SYMBOL(osd_req_op_extent_osd_data); -struct ceph_osd_data * -osd_req_op_cls_response_data(struct ceph_osd_request *osd_req, - unsigned int which) -{ - return osd_req_op_data(osd_req, which, cls, response_data); -} -EXPORT_SYMBOL(osd_req_op_cls_response_data); /* ??? */ - void osd_req_op_raw_data_in_pages(struct ceph_osd_request *osd_req, unsigned int which, struct page **pages, u64 length, u32 alignment, @@ -2166,8 +2158,7 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg) dout("applying incremental map %u len %d\n", epoch, maplen); newmap = osdmap_apply_incremental(&p, next, - osdc->osdmap, - &osdc->client->msgr); + osdc->osdmap); if (IS_ERR(newmap)) { err = PTR_ERR(newmap); goto bad; @@ -2674,8 +2665,6 @@ int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client) osdc->client = client; osdc->osdmap = NULL; init_rwsem(&osdc->map_sem); - init_completion(&osdc->map_waiters); - osdc->last_requested_map = 0; mutex_init(&osdc->request_mutex); osdc->last_tid = 0; osdc->osds = RB_ROOT; diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index 4668b871ca47..9a0cc072a909 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -1204,8 +1204,7 @@ struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end) * decode and apply an incremental map update. */ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, - struct ceph_osdmap *map, - struct ceph_messenger *msgr) + struct ceph_osdmap *map) { struct crush_map *newcrush = NULL; struct ceph_fsid fsid; -- cgit v1.2.3-70-g09d2 From d9591f5e28686277d9312d3c7422faf1368b305e Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 28 Apr 2016 16:07:22 +0200 Subject: libceph: rename ceph_oloc_oid_to_pg() Rename ceph_oloc_oid_to_pg() to ceph_object_locator_to_pg(). Emphasise that returned is raw PG and return -ENOENT instead of -EIO if the pool doesn't exist. Signed-off-by: Ilya Dryomov --- fs/ceph/ioctl.c | 2 +- include/linux/ceph/osdmap.h | 9 ++++----- net/ceph/osd_client.c | 4 ++-- net/ceph/osdmap.c | 31 ++++++++++++++++--------------- 4 files changed, 23 insertions(+), 23 deletions(-) (limited to 'net/ceph/osdmap.c') diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c index db296709784a..cca7fff22725 100644 --- a/fs/ceph/ioctl.c +++ b/fs/ceph/ioctl.c @@ -215,7 +215,7 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg) oloc.pool = ceph_file_layout_pg_pool(ci->i_layout); ceph_oid_printf(&oid, "%s", dl.object_name); - r = ceph_oloc_oid_to_pg(osdc->osdmap, &oloc, &oid, &pgid); + r = ceph_object_locator_to_pg(osdc->osdmap, &oid, &oloc, &pgid); if (r < 0) { up_read(&osdc->map_sem); return r; diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h index ce7a41a182d4..b70440c05b49 100644 --- a/include/linux/ceph/osdmap.h +++ b/include/linux/ceph/osdmap.h @@ -213,11 +213,10 @@ extern int ceph_calc_file_object_mapping(struct ceph_file_layout *layout, u64 off, u64 len, u64 *bno, u64 *oxoff, u64 *oxlen); -/* calculate mapping of object to a placement group */ -extern int ceph_oloc_oid_to_pg(struct ceph_osdmap *osdmap, - struct ceph_object_locator *oloc, - struct ceph_object_id *oid, - struct ceph_pg *pg_out); +int ceph_object_locator_to_pg(struct ceph_osdmap *osdmap, + struct ceph_object_id *oid, + struct ceph_object_locator *oloc, + struct ceph_pg *raw_pgid); extern int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid, diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 8256051ed88f..cb9f1953f5fb 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -1324,8 +1324,8 @@ static int __calc_request_pg(struct ceph_osdmap *osdmap, /* !pi is caught in ceph_oloc_oid_to_pg() */ } - return ceph_oloc_oid_to_pg(osdmap, &req->r_target_oloc, - &req->r_target_oid, pg_out); + return ceph_object_locator_to_pg(osdmap, &req->r_target_oid, + &req->r_target_oloc, pg_out); } static void __enqueue_request(struct ceph_osd_request *req) diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index 9a0cc072a909..6267839cb246 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -1545,30 +1545,31 @@ invalid: EXPORT_SYMBOL(ceph_calc_file_object_mapping); /* - * Calculate mapping of a (oloc, oid) pair to a PG. Should only be - * called with target's (oloc, oid), since tiering isn't taken into - * account. + * Map an object into a PG. + * + * Should only be called with target_oid and target_oloc (as opposed to + * base_oid and base_oloc), since tiering isn't taken into account. */ -int ceph_oloc_oid_to_pg(struct ceph_osdmap *osdmap, - struct ceph_object_locator *oloc, - struct ceph_object_id *oid, - struct ceph_pg *pg_out) +int ceph_object_locator_to_pg(struct ceph_osdmap *osdmap, + struct ceph_object_id *oid, + struct ceph_object_locator *oloc, + struct ceph_pg *raw_pgid) { struct ceph_pg_pool_info *pi; - pi = __lookup_pg_pool(&osdmap->pg_pools, oloc->pool); + pi = ceph_pg_pool_by_id(osdmap, oloc->pool); if (!pi) - return -EIO; + return -ENOENT; - pg_out->pool = oloc->pool; - pg_out->seed = ceph_str_hash(pi->object_hash, oid->name, - oid->name_len); + raw_pgid->pool = oloc->pool; + raw_pgid->seed = ceph_str_hash(pi->object_hash, oid->name, + oid->name_len); - dout("%s %*pE pgid %llu.%x\n", __func__, oid->name_len, oid->name, - pg_out->pool, pg_out->seed); + dout("%s %*pE -> raw_pgid %llu.%x\n", __func__, oid->name_len, + oid->name, raw_pgid->pool, raw_pgid->seed); return 0; } -EXPORT_SYMBOL(ceph_oloc_oid_to_pg); +EXPORT_SYMBOL(ceph_object_locator_to_pg); static int do_crush(struct ceph_osdmap *map, int ruleno, int x, int *result, int result_max, -- cgit v1.2.3-70-g09d2 From 6f3bfd45cd233eea0b07e3cabc0386b5de9321d2 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 28 Apr 2016 16:07:22 +0200 Subject: libceph: ceph_osds, ceph_pg_to_up_acting_osds() Knowning just acting set isn't enough, we need to be able to record up set as well to detect interval changes. This means returning (up[], up_len, up_primary, acting[], acting_len, acting_primary) and passing it around. Introduce and switch to ceph_osds to help with that. Rename ceph_calc_pg_acting() to ceph_pg_to_up_acting_osds() and return both up and acting sets from it. Signed-off-by: Ilya Dryomov --- include/linux/ceph/osdmap.h | 21 ++- net/ceph/osd_client.c | 36 +++--- net/ceph/osdmap.c | 304 ++++++++++++++++++++++++++------------------ 3 files changed, 215 insertions(+), 146 deletions(-) (limited to 'net/ceph/osdmap.c') diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h index b70440c05b49..942189d311e0 100644 --- a/include/linux/ceph/osdmap.h +++ b/include/linux/ceph/osdmap.h @@ -208,6 +208,20 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, struct ceph_osdmap *map); extern void ceph_osdmap_destroy(struct ceph_osdmap *map); +struct ceph_osds { + int osds[CEPH_PG_MAX_SIZE]; + int size; + int primary; /* id, NOT index */ +}; + +static inline void ceph_osds_init(struct ceph_osds *set) +{ + set->size = 0; + set->primary = -1; +} + +void ceph_osds_copy(struct ceph_osds *dest, const struct ceph_osds *src); + /* calculate mapping of a file extent to an object */ extern int ceph_calc_file_object_mapping(struct ceph_file_layout *layout, u64 off, u64 len, @@ -218,9 +232,10 @@ int ceph_object_locator_to_pg(struct ceph_osdmap *osdmap, struct ceph_object_locator *oloc, struct ceph_pg *raw_pgid); -extern int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, - struct ceph_pg pgid, - int *osds, int *primary); +void ceph_pg_to_up_acting_osds(struct ceph_osdmap *osdmap, + const struct ceph_pg *raw_pgid, + struct ceph_osds *up, + struct ceph_osds *acting); extern int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, struct ceph_pg pgid); diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index cb9f1953f5fb..0ff400a56cd6 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -1358,8 +1358,7 @@ static int __map_request(struct ceph_osd_client *osdc, struct ceph_osd_request *req, int force_resend) { struct ceph_pg pgid; - int acting[CEPH_PG_MAX_SIZE]; - int num, o; + struct ceph_osds up, acting; int err; bool was_paused; @@ -1372,9 +1371,7 @@ static int __map_request(struct ceph_osd_client *osdc, } req->r_pgid = pgid; - num = ceph_calc_pg_acting(osdc->osdmap, pgid, acting, &o); - if (num < 0) - num = 0; + ceph_pg_to_up_acting_osds(osdc->osdmap, &pgid, &up, &acting); was_paused = req->r_paused; req->r_paused = __req_should_be_paused(osdc, req); @@ -1382,21 +1379,23 @@ static int __map_request(struct ceph_osd_client *osdc, force_resend = 1; if ((!force_resend && - req->r_osd && req->r_osd->o_osd == o && + req->r_osd && req->r_osd->o_osd == acting.primary && req->r_sent >= req->r_osd->o_incarnation && - req->r_num_pg_osds == num && - memcmp(req->r_pg_osds, acting, sizeof(acting[0])*num) == 0) || - (req->r_osd == NULL && o == -1) || + req->r_num_pg_osds == acting.size && + memcmp(req->r_pg_osds, acting.osds, + acting.size * sizeof(acting.osds[0])) == 0) || + (req->r_osd == NULL && acting.primary == -1) || req->r_paused) return 0; /* no change */ dout("map_request tid %llu pgid %lld.%x osd%d (was osd%d)\n", - req->r_tid, pgid.pool, pgid.seed, o, + req->r_tid, pgid.pool, pgid.seed, acting.primary, req->r_osd ? req->r_osd->o_osd : -1); /* record full pg acting set */ - memcpy(req->r_pg_osds, acting, sizeof(acting[0]) * num); - req->r_num_pg_osds = num; + memcpy(req->r_pg_osds, acting.osds, + acting.size * sizeof(acting.osds[0])); + req->r_num_pg_osds = acting.size; if (req->r_osd) { __cancel_request(req); @@ -1405,21 +1404,22 @@ static int __map_request(struct ceph_osd_client *osdc, req->r_osd = NULL; } - req->r_osd = lookup_osd(&osdc->osds, o); - if (!req->r_osd && o >= 0) { + req->r_osd = lookup_osd(&osdc->osds, acting.primary); + if (!req->r_osd && acting.primary >= 0) { err = -ENOMEM; - req->r_osd = create_osd(osdc, o); + req->r_osd = create_osd(osdc, acting.primary); if (!req->r_osd) { list_move(&req->r_req_lru_item, &osdc->req_notarget); goto out; } - dout("map_request osd %p is osd%d\n", req->r_osd, o); + dout("map_request osd %p is osd%d\n", req->r_osd, + acting.primary); insert_osd(&osdc->osds, req->r_osd); ceph_con_open(&req->r_osd->o_con, - CEPH_ENTITY_TYPE_OSD, o, - &osdc->osdmap->osd_addr[o]); + CEPH_ENTITY_TYPE_OSD, acting.primary, + &osdc->osdmap->osd_addr[acting.primary]); } __enqueue_request(req); diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index 6267839cb246..f5fc8fc63879 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -1474,6 +1474,38 @@ void ceph_oid_destroy(struct ceph_object_id *oid) } EXPORT_SYMBOL(ceph_oid_destroy); +static bool osds_valid(const struct ceph_osds *set) +{ + /* non-empty set */ + if (set->size > 0 && set->primary >= 0) + return true; + + /* empty can_shift_osds set */ + if (!set->size && set->primary == -1) + return true; + + /* empty !can_shift_osds set - all NONE */ + if (set->size > 0 && set->primary == -1) { + int i; + + for (i = 0; i < set->size; i++) { + if (set->osds[i] != CRUSH_ITEM_NONE) + break; + } + if (i == set->size) + return true; + } + + return false; +} + +void ceph_osds_copy(struct ceph_osds *dest, const struct ceph_osds *src) +{ + memcpy(dest->osds, src->osds, src->size * sizeof(src->osds[0])); + dest->size = src->size; + dest->primary = src->primary; +} + /* * calculate file layout from given offset, length. * fill in correct oid, logical length, and object extent @@ -1571,6 +1603,46 @@ int ceph_object_locator_to_pg(struct ceph_osdmap *osdmap, } EXPORT_SYMBOL(ceph_object_locator_to_pg); +/* + * Map a raw PG (full precision ps) into an actual PG. + */ +static void raw_pg_to_pg(struct ceph_pg_pool_info *pi, + const struct ceph_pg *raw_pgid, + struct ceph_pg *pgid) +{ + pgid->pool = raw_pgid->pool; + pgid->seed = ceph_stable_mod(raw_pgid->seed, pi->pg_num, + pi->pg_num_mask); +} + +/* + * Map a raw PG (full precision ps) into a placement ps (placement + * seed). Include pool id in that value so that different pools don't + * use the same seeds. + */ +static u32 raw_pg_to_pps(struct ceph_pg_pool_info *pi, + const struct ceph_pg *raw_pgid) +{ + if (pi->flags & CEPH_POOL_FLAG_HASHPSPOOL) { + /* hash pool id and seed so that pool PGs do not overlap */ + return crush_hash32_2(CRUSH_HASH_RJENKINS1, + ceph_stable_mod(raw_pgid->seed, + pi->pgp_num, + pi->pgp_num_mask), + raw_pgid->pool); + } else { + /* + * legacy behavior: add ps and pool together. this is + * not a great approach because the PGs from each pool + * will overlap on top of each other: 0.5 == 1.4 == + * 2.3 == ... + */ + return ceph_stable_mod(raw_pgid->seed, pi->pgp_num, + pi->pgp_num_mask) + + (unsigned)raw_pgid->pool; + } +} + static int do_crush(struct ceph_osdmap *map, int ruleno, int x, int *result, int result_max, const __u32 *weight, int weight_max) @@ -1588,84 +1660,92 @@ static int do_crush(struct ceph_osdmap *map, int ruleno, int x, } /* - * Calculate raw (crush) set for given pgid. + * Calculate raw set (CRUSH output) for given PG. The result may + * contain nonexistent OSDs. ->primary is undefined for a raw set. * - * Return raw set length, or error. + * Placement seed (CRUSH input) is returned through @ppps. */ -static int pg_to_raw_osds(struct ceph_osdmap *osdmap, - struct ceph_pg_pool_info *pool, - struct ceph_pg pgid, u32 pps, int *osds) +static void pg_to_raw_osds(struct ceph_osdmap *osdmap, + struct ceph_pg_pool_info *pi, + const struct ceph_pg *raw_pgid, + struct ceph_osds *raw, + u32 *ppps) { + u32 pps = raw_pg_to_pps(pi, raw_pgid); int ruleno; int len; - /* crush */ - ruleno = crush_find_rule(osdmap->crush, pool->crush_ruleset, - pool->type, pool->size); + ceph_osds_init(raw); + if (ppps) + *ppps = pps; + + ruleno = crush_find_rule(osdmap->crush, pi->crush_ruleset, pi->type, + pi->size); if (ruleno < 0) { pr_err("no crush rule: pool %lld ruleset %d type %d size %d\n", - pgid.pool, pool->crush_ruleset, pool->type, - pool->size); - return -ENOENT; + pi->id, pi->crush_ruleset, pi->type, pi->size); + return; } - len = do_crush(osdmap, ruleno, pps, osds, - min_t(int, pool->size, CEPH_PG_MAX_SIZE), + len = do_crush(osdmap, ruleno, pps, raw->osds, + min_t(int, pi->size, ARRAY_SIZE(raw->osds)), osdmap->osd_weight, osdmap->max_osd); if (len < 0) { pr_err("error %d from crush rule %d: pool %lld ruleset %d type %d size %d\n", - len, ruleno, pgid.pool, pool->crush_ruleset, - pool->type, pool->size); - return len; + len, ruleno, pi->id, pi->crush_ruleset, pi->type, + pi->size); + return; } - return len; + raw->size = len; } /* - * Given raw set, calculate up set and up primary. + * Given raw set, calculate up set and up primary. By definition of an + * up set, the result won't contain nonexistent or down OSDs. * - * Return up set length. *primary is set to up primary osd id, or -1 - * if up set is empty. + * This is done in-place - on return @set is the up set. If it's + * empty, ->primary will remain undefined. */ -static int raw_to_up_osds(struct ceph_osdmap *osdmap, - struct ceph_pg_pool_info *pool, - int *osds, int len, int *primary) +static void raw_to_up_osds(struct ceph_osdmap *osdmap, + struct ceph_pg_pool_info *pi, + struct ceph_osds *set) { - int up_primary = -1; int i; - if (ceph_can_shift_osds(pool)) { + /* ->primary is undefined for a raw set */ + BUG_ON(set->primary != -1); + + if (ceph_can_shift_osds(pi)) { int removed = 0; - for (i = 0; i < len; i++) { - if (ceph_osd_is_down(osdmap, osds[i])) { + /* shift left */ + for (i = 0; i < set->size; i++) { + if (ceph_osd_is_down(osdmap, set->osds[i])) { removed++; continue; } if (removed) - osds[i - removed] = osds[i]; + set->osds[i - removed] = set->osds[i]; } - - len -= removed; - if (len > 0) - up_primary = osds[0]; + set->size -= removed; + if (set->size > 0) + set->primary = set->osds[0]; } else { - for (i = len - 1; i >= 0; i--) { - if (ceph_osd_is_down(osdmap, osds[i])) - osds[i] = CRUSH_ITEM_NONE; + /* set down/dne devices to NONE */ + for (i = set->size - 1; i >= 0; i--) { + if (ceph_osd_is_down(osdmap, set->osds[i])) + set->osds[i] = CRUSH_ITEM_NONE; else - up_primary = osds[i]; + set->primary = set->osds[i]; } } - - *primary = up_primary; - return len; } -static void apply_primary_affinity(struct ceph_osdmap *osdmap, u32 pps, - struct ceph_pg_pool_info *pool, - int *osds, int len, int *primary) +static void apply_primary_affinity(struct ceph_osdmap *osdmap, + struct ceph_pg_pool_info *pi, + u32 pps, + struct ceph_osds *up) { int i; int pos = -1; @@ -1677,8 +1757,8 @@ static void apply_primary_affinity(struct ceph_osdmap *osdmap, u32 pps, if (!osdmap->osd_primary_affinity) return; - for (i = 0; i < len; i++) { - int osd = osds[i]; + for (i = 0; i < up->size; i++) { + int osd = up->osds[i]; if (osd != CRUSH_ITEM_NONE && osdmap->osd_primary_affinity[osd] != @@ -1686,7 +1766,7 @@ static void apply_primary_affinity(struct ceph_osdmap *osdmap, u32 pps, break; } } - if (i == len) + if (i == up->size) return; /* @@ -1694,8 +1774,8 @@ static void apply_primary_affinity(struct ceph_osdmap *osdmap, u32 pps, * osd into the hash/rng so that a proportional fraction of an * osd's pgs get rejected as primary. */ - for (i = 0; i < len; i++) { - int osd = osds[i]; + for (i = 0; i < up->size; i++) { + int osd = up->osds[i]; u32 aff; if (osd == CRUSH_ITEM_NONE) @@ -1720,123 +1800,99 @@ static void apply_primary_affinity(struct ceph_osdmap *osdmap, u32 pps, if (pos < 0) return; - *primary = osds[pos]; + up->primary = up->osds[pos]; - if (ceph_can_shift_osds(pool) && pos > 0) { + if (ceph_can_shift_osds(pi) && pos > 0) { /* move the new primary to the front */ for (i = pos; i > 0; i--) - osds[i] = osds[i - 1]; - osds[0] = *primary; + up->osds[i] = up->osds[i - 1]; + up->osds[0] = up->primary; } } /* - * Given up set, apply pg_temp and primary_temp mappings. + * Get pg_temp and primary_temp mappings for given PG. * - * Return acting set length. *primary is set to acting primary osd id, - * or -1 if acting set is empty. + * Note that a PG may have none, only pg_temp, only primary_temp or + * both pg_temp and primary_temp mappings. This means @temp isn't + * always a valid OSD set on return: in the "only primary_temp" case, + * @temp will have its ->primary >= 0 but ->size == 0. */ -static int apply_temps(struct ceph_osdmap *osdmap, - struct ceph_pg_pool_info *pool, struct ceph_pg pgid, - int *osds, int len, int *primary) +static void get_temp_osds(struct ceph_osdmap *osdmap, + struct ceph_pg_pool_info *pi, + const struct ceph_pg *raw_pgid, + struct ceph_osds *temp) { + struct ceph_pg pgid; struct ceph_pg_mapping *pg; - int temp_len; - int temp_primary; int i; - /* raw_pg -> pg */ - pgid.seed = ceph_stable_mod(pgid.seed, pool->pg_num, - pool->pg_num_mask); + raw_pg_to_pg(pi, raw_pgid, &pgid); + ceph_osds_init(temp); /* pg_temp? */ pg = __lookup_pg_mapping(&osdmap->pg_temp, pgid); if (pg) { - temp_len = 0; - temp_primary = -1; - for (i = 0; i < pg->pg_temp.len; i++) { if (ceph_osd_is_down(osdmap, pg->pg_temp.osds[i])) { - if (ceph_can_shift_osds(pool)) + if (ceph_can_shift_osds(pi)) continue; - else - osds[temp_len++] = CRUSH_ITEM_NONE; + + temp->osds[temp->size++] = CRUSH_ITEM_NONE; } else { - osds[temp_len++] = pg->pg_temp.osds[i]; + temp->osds[temp->size++] = pg->pg_temp.osds[i]; } } /* apply pg_temp's primary */ - for (i = 0; i < temp_len; i++) { - if (osds[i] != CRUSH_ITEM_NONE) { - temp_primary = osds[i]; + for (i = 0; i < temp->size; i++) { + if (temp->osds[i] != CRUSH_ITEM_NONE) { + temp->primary = temp->osds[i]; break; } } - } else { - temp_len = len; - temp_primary = *primary; } /* primary_temp? */ pg = __lookup_pg_mapping(&osdmap->primary_temp, pgid); if (pg) - temp_primary = pg->primary_temp.osd; - - *primary = temp_primary; - return temp_len; + temp->primary = pg->primary_temp.osd; } /* - * Calculate acting set for given pgid. + * Map a PG to its acting set as well as its up set. * - * Return acting set length, or error. *primary is set to acting - * primary osd id, or -1 if acting set is empty or on error. + * Acting set is used for data mapping purposes, while up set can be + * recorded for detecting interval changes and deciding whether to + * resend a request. */ -int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid, - int *osds, int *primary) +void ceph_pg_to_up_acting_osds(struct ceph_osdmap *osdmap, + const struct ceph_pg *raw_pgid, + struct ceph_osds *up, + struct ceph_osds *acting) { - struct ceph_pg_pool_info *pool; + struct ceph_pg_pool_info *pi; u32 pps; - int len; - pool = __lookup_pg_pool(&osdmap->pg_pools, pgid.pool); - if (!pool) { - *primary = -1; - return -ENOENT; + pi = ceph_pg_pool_by_id(osdmap, raw_pgid->pool); + if (!pi) { + ceph_osds_init(up); + ceph_osds_init(acting); + goto out; } - if (pool->flags & CEPH_POOL_FLAG_HASHPSPOOL) { - /* hash pool id and seed so that pool PGs do not overlap */ - pps = crush_hash32_2(CRUSH_HASH_RJENKINS1, - ceph_stable_mod(pgid.seed, pool->pgp_num, - pool->pgp_num_mask), - pgid.pool); - } else { - /* - * legacy behavior: add ps and pool together. this is - * not a great approach because the PGs from each pool - * will overlap on top of each other: 0.5 == 1.4 == - * 2.3 == ... - */ - pps = ceph_stable_mod(pgid.seed, pool->pgp_num, - pool->pgp_num_mask) + - (unsigned)pgid.pool; - } - - len = pg_to_raw_osds(osdmap, pool, pgid, pps, osds); - if (len < 0) { - *primary = -1; - return len; + pg_to_raw_osds(osdmap, pi, raw_pgid, up, &pps); + raw_to_up_osds(osdmap, pi, up); + apply_primary_affinity(osdmap, pi, pps, up); + get_temp_osds(osdmap, pi, raw_pgid, acting); + if (!acting->size) { + memcpy(acting->osds, up->osds, up->size * sizeof(up->osds[0])); + acting->size = up->size; + if (acting->primary == -1) + acting->primary = up->primary; } - - len = raw_to_up_osds(osdmap, pool, osds, len, primary); - - apply_primary_affinity(osdmap, pps, pool, osds, len, primary); - - len = apply_temps(osdmap, pool, pgid, osds, len, primary); - - return len; +out: + WARN_ON(!osds_valid(up) || !osds_valid(acting)); } /* @@ -1844,11 +1900,9 @@ int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid, */ int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, struct ceph_pg pgid) { - int osds[CEPH_PG_MAX_SIZE]; - int primary; - - ceph_calc_pg_acting(osdmap, pgid, osds, &primary); + struct ceph_osds up, acting; - return primary; + ceph_pg_to_up_acting_osds(osdmap, &pgid, &up, &acting); + return acting.primary; } EXPORT_SYMBOL(ceph_calc_pg_primary); -- cgit v1.2.3-70-g09d2 From f81f16339a05775df600b2ff75a79be1864975c1 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 28 Apr 2016 16:07:23 +0200 Subject: libceph: rename ceph_calc_pg_primary() Rename ceph_calc_pg_primary() to ceph_pg_to_acting_primary() to emphasise that it returns acting primary. Signed-off-by: Ilya Dryomov --- fs/ceph/ioctl.c | 2 +- include/linux/ceph/osdmap.h | 4 ++-- net/ceph/osdmap.c | 9 +++++---- 3 files changed, 8 insertions(+), 7 deletions(-) (limited to 'net/ceph/osdmap.c') diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c index cca7fff22725..1831ad6cf066 100644 --- a/fs/ceph/ioctl.c +++ b/fs/ceph/ioctl.c @@ -221,7 +221,7 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg) return r; } - dl.osd = ceph_calc_pg_primary(osdc->osdmap, pgid); + dl.osd = ceph_pg_to_acting_primary(osdc->osdmap, &pgid); if (dl.osd >= 0) { struct ceph_entity_addr *a = ceph_osd_addr(osdc->osdmap, dl.osd); diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h index 942189d311e0..3fd978a1639b 100644 --- a/include/linux/ceph/osdmap.h +++ b/include/linux/ceph/osdmap.h @@ -236,8 +236,8 @@ void ceph_pg_to_up_acting_osds(struct ceph_osdmap *osdmap, const struct ceph_pg *raw_pgid, struct ceph_osds *up, struct ceph_osds *acting); -extern int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, - struct ceph_pg pgid); +int ceph_pg_to_acting_primary(struct ceph_osdmap *osdmap, + const struct ceph_pg *raw_pgid); extern struct ceph_pg_pool_info *ceph_pg_pool_by_id(struct ceph_osdmap *map, u64 id); diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index f5fc8fc63879..656384a8fd1e 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -1896,13 +1896,14 @@ out: } /* - * Return primary osd for given pgid, or -1 if none. + * Return acting primary for given PG, or -1 if none. */ -int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, struct ceph_pg pgid) +int ceph_pg_to_acting_primary(struct ceph_osdmap *osdmap, + const struct ceph_pg *raw_pgid) { struct ceph_osds up, acting; - ceph_pg_to_up_acting_osds(osdmap, &pgid, &up, &acting); + ceph_pg_to_up_acting_osds(osdmap, raw_pgid, &up, &acting); return acting.primary; } -EXPORT_SYMBOL(ceph_calc_pg_primary); +EXPORT_SYMBOL(ceph_pg_to_acting_primary); -- cgit v1.2.3-70-g09d2 From f984cb76cc5fb9fc76d6abb6c4694a5412e3f49b Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 28 Apr 2016 16:07:23 +0200 Subject: libceph: make pgid_cmp() global calc_target() code is going to need to know how to compare PGs. Take lhs and rhs pgid by const * while at it. Signed-off-by: Ilya Dryomov --- include/linux/ceph/osdmap.h | 2 ++ net/ceph/osdmap.c | 23 ++++++++++++----------- 2 files changed, 14 insertions(+), 11 deletions(-) (limited to 'net/ceph/osdmap.c') diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h index 3fd978a1639b..7783237ab06c 100644 --- a/include/linux/ceph/osdmap.h +++ b/include/linux/ceph/osdmap.h @@ -24,6 +24,8 @@ struct ceph_pg { uint32_t seed; }; +int ceph_pg_compare(const struct ceph_pg *lhs, const struct ceph_pg *rhs); + #define CEPH_POOL_FLAG_HASHPSPOOL 1 struct ceph_pg_pool_info { diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index 656384a8fd1e..3c7dc5e581ab 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -380,23 +380,24 @@ bad: return ERR_PTR(err); } -/* - * rbtree of pg_mapping for handling pg_temp (explicit mapping of pgid - * to a set of osds) and primary_temp (explicit primary setting) - */ -static int pgid_cmp(struct ceph_pg l, struct ceph_pg r) +int ceph_pg_compare(const struct ceph_pg *lhs, const struct ceph_pg *rhs) { - if (l.pool < r.pool) + if (lhs->pool < rhs->pool) return -1; - if (l.pool > r.pool) + if (lhs->pool > rhs->pool) return 1; - if (l.seed < r.seed) + if (lhs->seed < rhs->seed) return -1; - if (l.seed > r.seed) + if (lhs->seed > rhs->seed) return 1; + return 0; } +/* + * rbtree of pg_mapping for handling pg_temp (explicit mapping of pgid + * to a set of osds) and primary_temp (explicit primary setting) + */ static int __insert_pg_mapping(struct ceph_pg_mapping *new, struct rb_root *root) { @@ -409,7 +410,7 @@ static int __insert_pg_mapping(struct ceph_pg_mapping *new, while (*p) { parent = *p; pg = rb_entry(parent, struct ceph_pg_mapping, node); - c = pgid_cmp(new->pgid, pg->pgid); + c = ceph_pg_compare(&new->pgid, &pg->pgid); if (c < 0) p = &(*p)->rb_left; else if (c > 0) @@ -432,7 +433,7 @@ static struct ceph_pg_mapping *__lookup_pg_mapping(struct rb_root *root, while (n) { pg = rb_entry(n, struct ceph_pg_mapping, node); - c = pgid_cmp(pgid, pg->pgid); + c = ceph_pg_compare(&pgid, &pg->pgid); if (c < 0) { n = n->rb_left; } else if (c > 0) { -- cgit v1.2.3-70-g09d2 From 04812acf572ef41fd51c11e0bf3385f34c0e1b5b Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 28 Apr 2016 16:07:23 +0200 Subject: libceph: pi->min_size, pi->last_force_request_resend Add and decode pi->min_size and pi->last_force_request_resend. These are going to be used by calc_target(). Signed-off-by: Ilya Dryomov --- include/linux/ceph/osdmap.h | 9 ++++++--- net/ceph/debugfs.c | 10 ++++++---- net/ceph/osdmap.c | 48 ++++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 59 insertions(+), 8 deletions(-) (limited to 'net/ceph/osdmap.c') diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h index 7783237ab06c..989294d0b8d2 100644 --- a/include/linux/ceph/osdmap.h +++ b/include/linux/ceph/osdmap.h @@ -26,20 +26,23 @@ struct ceph_pg { int ceph_pg_compare(const struct ceph_pg *lhs, const struct ceph_pg *rhs); -#define CEPH_POOL_FLAG_HASHPSPOOL 1 +#define CEPH_POOL_FLAG_HASHPSPOOL (1ULL << 0) /* hash pg seed and pool id + together */ struct ceph_pg_pool_info { struct rb_node node; s64 id; - u8 type; + u8 type; /* CEPH_POOL_TYPE_* */ u8 size; + u8 min_size; u8 crush_ruleset; u8 object_hash; + u32 last_force_request_resend; u32 pg_num, pgp_num; int pg_num_mask, pgp_num_mask; s64 read_tier; s64 write_tier; /* wins for read+write ops */ - u64 flags; + u64 flags; /* CEPH_POOL_FLAG_* */ char *name; }; diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c index 6f8413293d15..7f1cc22c3e8b 100644 --- a/net/ceph/debugfs.c +++ b/net/ceph/debugfs.c @@ -66,12 +66,14 @@ static int osdmap_show(struct seq_file *s, void *p) (map->flags & CEPH_OSDMAP_FULL) ? " FULL" : ""); for (n = rb_first(&map->pg_pools); n; n = rb_next(n)) { - struct ceph_pg_pool_info *pool = + struct ceph_pg_pool_info *pi = rb_entry(n, struct ceph_pg_pool_info, node); - seq_printf(s, "pool %lld pg_num %u (%d) read_tier %lld write_tier %lld\n", - pool->id, pool->pg_num, pool->pg_num_mask, - pool->read_tier, pool->write_tier); + seq_printf(s, "pool %lld '%s' type %d size %d min_size %d pg_num %u pg_num_mask %d flags 0x%llx lfor %u read_tier %lld write_tier %lld\n", + pi->id, pi->name, pi->type, pi->size, pi->min_size, + pi->pg_num, pi->pg_num_mask, pi->flags, + pi->last_force_request_resend, pi->read_tier, + pi->write_tier); } for (i = 0; i < map->max_osd; i++) { struct ceph_entity_addr *addr = &map->osd_addr[i]; diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index 3c7dc5e581ab..66c3ebead92f 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -597,7 +597,9 @@ static int decode_pool(void **p, void *end, struct ceph_pg_pool_info *pi) *p += 4; /* skip crash_replay_interval */ if (ev >= 7) - *p += 1; /* skip min_size */ + pi->min_size = ceph_decode_8(p); + else + pi->min_size = pi->size - pi->size / 2; if (ev >= 8) *p += 8 + 8; /* skip quota_max_* */ @@ -617,6 +619,50 @@ static int decode_pool(void **p, void *end, struct ceph_pg_pool_info *pi) pi->write_tier = -1; } + if (ev >= 10) { + /* skip properties */ + num = ceph_decode_32(p); + while (num--) { + len = ceph_decode_32(p); + *p += len; /* key */ + len = ceph_decode_32(p); + *p += len; /* val */ + } + } + + if (ev >= 11) { + /* skip hit_set_params */ + *p += 1 + 1; /* versions */ + len = ceph_decode_32(p); + *p += len; + + *p += 4; /* skip hit_set_period */ + *p += 4; /* skip hit_set_count */ + } + + if (ev >= 12) + *p += 4; /* skip stripe_width */ + + if (ev >= 13) { + *p += 8; /* skip target_max_bytes */ + *p += 8; /* skip target_max_objects */ + *p += 4; /* skip cache_target_dirty_ratio_micro */ + *p += 4; /* skip cache_target_full_ratio_micro */ + *p += 4; /* skip cache_min_flush_age */ + *p += 4; /* skip cache_min_evict_age */ + } + + if (ev >= 14) { + /* skip erasure_code_profile */ + len = ceph_decode_32(p); + *p += len; + } + + if (ev >= 15) + pi->last_force_request_resend = ceph_decode_32(p); + else + pi->last_force_request_resend = 0; + /* ignore the rest */ *p = pool_end; -- cgit v1.2.3-70-g09d2 From 63244fa123a755e4bbaee03022b68613c71d1332 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 28 Apr 2016 16:07:23 +0200 Subject: libceph: introduce ceph_osd_request_target, calc_target() Introduce ceph_osd_request_target, containing all mapping-related fields of ceph_osd_request and calc_target() for calculating mappings and populating it. Signed-off-by: Ilya Dryomov --- fs/ceph/addr.c | 2 +- fs/ceph/file.c | 2 +- include/linux/ceph/osd_client.h | 23 ++++++ include/linux/ceph/osdmap.h | 34 +++++++++ include/linux/ceph/rados.h | 5 ++ net/ceph/osd_client.c | 157 +++++++++++++++++++++++++++++++++++++++- net/ceph/osdmap.c | 121 +++++++++++++++++++++++++++++++ 7 files changed, 340 insertions(+), 4 deletions(-) (limited to 'net/ceph/osdmap.c') diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 6f28dd9bacb2..c5d75486823b 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -1774,7 +1774,7 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool) wr_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK; osd_req_op_init(wr_req, 0, CEPH_OSD_OP_CREATE, CEPH_OSD_OP_FLAG_EXCL); - wr_req->r_base_oloc.pool = pool; + ceph_oloc_copy(&wr_req->r_base_oloc, &rd_req->r_base_oloc); ceph_oid_copy(&wr_req->r_base_oid, &rd_req->r_base_oid); err = ceph_osdc_alloc_messages(wr_req, GFP_NOFS); diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 9d470397e249..36b4a41dfa67 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -714,7 +714,7 @@ static void ceph_aio_retry_work(struct work_struct *work) req->r_flags = CEPH_OSD_FLAG_ORDERSNAP | CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE; - req->r_base_oloc = orig_req->r_base_oloc; + ceph_oloc_copy(&req->r_base_oloc, &orig_req->r_base_oloc); ceph_oid_copy(&req->r_base_oid, &orig_req->r_base_oid); ret = ceph_osdc_alloc_messages(req, GFP_NOFS); diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index 63854a8df183..48806ee4488d 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -24,6 +24,8 @@ typedef void (*ceph_osdc_callback_t)(struct ceph_osd_request *, struct ceph_msg *); typedef void (*ceph_osdc_unsafe_callback_t)(struct ceph_osd_request *, bool); +#define CEPH_HOMELESS_OSD -1 + /* a given osd we're communicating with */ struct ceph_osd { atomic_t o_ref; @@ -118,6 +120,27 @@ struct ceph_osd_req_op { }; }; +struct ceph_osd_request_target { + struct ceph_object_id base_oid; + struct ceph_object_locator base_oloc; + struct ceph_object_id target_oid; + struct ceph_object_locator target_oloc; + + struct ceph_pg pgid; + u32 pg_num; + u32 pg_num_mask; + struct ceph_osds acting; + struct ceph_osds up; + int size; + int min_size; + bool sort_bitwise; + + unsigned int flags; /* CEPH_OSD_FLAG_* */ + bool paused; + + int osd; +}; + /* an in-flight request */ struct ceph_osd_request { u64 r_tid; /* unique for this client */ diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h index 989294d0b8d2..420bb7968b25 100644 --- a/include/linux/ceph/osdmap.h +++ b/include/linux/ceph/osdmap.h @@ -28,6 +28,7 @@ int ceph_pg_compare(const struct ceph_pg *lhs, const struct ceph_pg *rhs); #define CEPH_POOL_FLAG_HASHPSPOOL (1ULL << 0) /* hash pg seed and pool id together */ +#define CEPH_POOL_FLAG_FULL (1ULL << 1) /* pool is full */ struct ceph_pg_pool_info { struct rb_node node; @@ -62,6 +63,22 @@ struct ceph_object_locator { s64 pool; }; +static inline void ceph_oloc_init(struct ceph_object_locator *oloc) +{ + oloc->pool = -1; +} + +static inline bool ceph_oloc_empty(const struct ceph_object_locator *oloc) +{ + return oloc->pool == -1; +} + +static inline void ceph_oloc_copy(struct ceph_object_locator *dest, + const struct ceph_object_locator *src) +{ + dest->pool = src->pool; +} + /* * Maximum supported by kernel client object name length * @@ -227,6 +244,23 @@ static inline void ceph_osds_init(struct ceph_osds *set) void ceph_osds_copy(struct ceph_osds *dest, const struct ceph_osds *src); +bool ceph_is_new_interval(const struct ceph_osds *old_acting, + const struct ceph_osds *new_acting, + const struct ceph_osds *old_up, + const struct ceph_osds *new_up, + int old_size, + int new_size, + int old_min_size, + int new_min_size, + u32 old_pg_num, + u32 new_pg_num, + bool old_sort_bitwise, + bool new_sort_bitwise, + const struct ceph_pg *pgid); +bool ceph_osds_changed(const struct ceph_osds *old_acting, + const struct ceph_osds *new_acting, + bool any_change); + /* calculate mapping of a file extent to an object */ extern int ceph_calc_file_object_mapping(struct ceph_file_layout *layout, u64 off, u64 len, diff --git a/include/linux/ceph/rados.h b/include/linux/ceph/rados.h index 913c87c26d33..f28ed864e682 100644 --- a/include/linux/ceph/rados.h +++ b/include/linux/ceph/rados.h @@ -153,6 +153,11 @@ extern const char *ceph_osd_state_name(int s); #define CEPH_OSDMAP_NOIN (1<<8) /* block osd auto mark-in */ #define CEPH_OSDMAP_NOBACKFILL (1<<9) /* block osd backfill */ #define CEPH_OSDMAP_NORECOVER (1<<10) /* block osd recovery and backfill */ +#define CEPH_OSDMAP_NOSCRUB (1<<11) /* block periodic scrub */ +#define CEPH_OSDMAP_NODEEP_SCRUB (1<<12) /* block periodic deep-scrub */ +#define CEPH_OSDMAP_NOTIERAGENT (1<<13) /* disable tiering agent */ +#define CEPH_OSDMAP_NOREBALANCE (1<<14) /* block osd backfill unless pg is degraded */ +#define CEPH_OSDMAP_SORTBITWISE (1<<15) /* use bitwise hobject_t sort */ /* * The error code to return when an OSD can't handle a write diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 0ff400a56cd6..cff3a7e29233 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -298,6 +298,30 @@ static void osd_req_op_data_release(struct ceph_osd_request *osd_req, } } +/* + * Assumes @t is zero-initialized. + */ +static void target_init(struct ceph_osd_request_target *t) +{ + ceph_oid_init(&t->base_oid); + ceph_oloc_init(&t->base_oloc); + ceph_oid_init(&t->target_oid); + ceph_oloc_init(&t->target_oloc); + + ceph_osds_init(&t->acting); + ceph_osds_init(&t->up); + t->size = -1; + t->min_size = -1; + + t->osd = CEPH_HOMELESS_OSD; +} + +static void target_destroy(struct ceph_osd_request_target *t) +{ + ceph_oid_destroy(&t->base_oid); + ceph_oid_destroy(&t->target_oid); +} + /* * requests */ @@ -1273,6 +1297,11 @@ void ceph_osdc_set_request_linger(struct ceph_osd_client *osdc, } EXPORT_SYMBOL(ceph_osdc_set_request_linger); +static bool __pool_full(struct ceph_pg_pool_info *pi) +{ + return pi->flags & CEPH_POOL_FLAG_FULL; +} + /* * Returns whether a request should be blocked from being sent * based on the current osdmap and osd_client settings. @@ -1289,6 +1318,20 @@ static bool __req_should_be_paused(struct ceph_osd_client *osdc, (req->r_flags & CEPH_OSD_FLAG_WRITE && pausewr); } +static bool target_should_be_paused(struct ceph_osd_client *osdc, + const struct ceph_osd_request_target *t, + struct ceph_pg_pool_info *pi) +{ + bool pauserd = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSERD); + bool pausewr = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSEWR) || + ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL) || + __pool_full(pi); + + WARN_ON(pi->id != t->base_oloc.pool); + return (t->flags & CEPH_OSD_FLAG_READ && pauserd) || + (t->flags & CEPH_OSD_FLAG_WRITE && pausewr); +} + /* * Calculate mapping of a request to a PG. Takes tiering into account. */ @@ -1328,6 +1371,116 @@ static int __calc_request_pg(struct ceph_osdmap *osdmap, &req->r_target_oloc, pg_out); } +enum calc_target_result { + CALC_TARGET_NO_ACTION = 0, + CALC_TARGET_NEED_RESEND, + CALC_TARGET_POOL_DNE, +}; + +static enum calc_target_result calc_target(struct ceph_osd_client *osdc, + struct ceph_osd_request_target *t, + u32 *last_force_resend, + bool any_change) +{ + struct ceph_pg_pool_info *pi; + struct ceph_pg pgid, last_pgid; + struct ceph_osds up, acting; + bool force_resend = false; + bool need_check_tiering = false; + bool need_resend = false; + bool sort_bitwise = ceph_osdmap_flag(osdc->osdmap, + CEPH_OSDMAP_SORTBITWISE); + enum calc_target_result ct_res; + int ret; + + pi = ceph_pg_pool_by_id(osdc->osdmap, t->base_oloc.pool); + if (!pi) { + t->osd = CEPH_HOMELESS_OSD; + ct_res = CALC_TARGET_POOL_DNE; + goto out; + } + + if (osdc->osdmap->epoch == pi->last_force_request_resend) { + if (last_force_resend && + *last_force_resend < pi->last_force_request_resend) { + *last_force_resend = pi->last_force_request_resend; + force_resend = true; + } else if (!last_force_resend) { + force_resend = true; + } + } + if (ceph_oid_empty(&t->target_oid) || force_resend) { + ceph_oid_copy(&t->target_oid, &t->base_oid); + need_check_tiering = true; + } + if (ceph_oloc_empty(&t->target_oloc) || force_resend) { + ceph_oloc_copy(&t->target_oloc, &t->base_oloc); + need_check_tiering = true; + } + + if (need_check_tiering && + (t->flags & CEPH_OSD_FLAG_IGNORE_OVERLAY) == 0) { + if (t->flags & CEPH_OSD_FLAG_READ && pi->read_tier >= 0) + t->target_oloc.pool = pi->read_tier; + if (t->flags & CEPH_OSD_FLAG_WRITE && pi->write_tier >= 0) + t->target_oloc.pool = pi->write_tier; + } + + ret = ceph_object_locator_to_pg(osdc->osdmap, &t->target_oid, + &t->target_oloc, &pgid); + if (ret) { + WARN_ON(ret != -ENOENT); + t->osd = CEPH_HOMELESS_OSD; + ct_res = CALC_TARGET_POOL_DNE; + goto out; + } + last_pgid.pool = pgid.pool; + last_pgid.seed = ceph_stable_mod(pgid.seed, t->pg_num, t->pg_num_mask); + + ceph_pg_to_up_acting_osds(osdc->osdmap, &pgid, &up, &acting); + if (any_change && + ceph_is_new_interval(&t->acting, + &acting, + &t->up, + &up, + t->size, + pi->size, + t->min_size, + pi->min_size, + t->pg_num, + pi->pg_num, + t->sort_bitwise, + sort_bitwise, + &last_pgid)) + force_resend = true; + + if (t->paused && !target_should_be_paused(osdc, t, pi)) { + t->paused = false; + need_resend = true; + } + + if (ceph_pg_compare(&t->pgid, &pgid) || + ceph_osds_changed(&t->acting, &acting, any_change) || + force_resend) { + t->pgid = pgid; /* struct */ + ceph_osds_copy(&t->acting, &acting); + ceph_osds_copy(&t->up, &up); + t->size = pi->size; + t->min_size = pi->min_size; + t->pg_num = pi->pg_num; + t->pg_num_mask = pi->pg_num_mask; + t->sort_bitwise = sort_bitwise; + + t->osd = acting.primary; + need_resend = true; + } + + ct_res = need_resend ? CALC_TARGET_NEED_RESEND : CALC_TARGET_NO_ACTION; +out: + dout("%s t %p -> ct_res %d osd %d\n", __func__, t, ct_res, t->osd); + return ct_res; +} + static void __enqueue_request(struct ceph_osd_request *req) { struct ceph_osd_client *osdc = req->r_osdc; @@ -1805,12 +1958,12 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg) redir.oloc.pool = -1; } - if (redir.oloc.pool != -1) { + if (!ceph_oloc_empty(&redir.oloc)) { dout("redirect pool %lld\n", redir.oloc.pool); __unregister_request(osdc, req); - req->r_target_oloc = redir.oloc; /* struct */ + ceph_oloc_copy(&req->r_target_oloc, &redir.oloc); /* * Start redirect requests with nofail=true. If diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index 66c3ebead92f..7d4a5b43085e 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -1521,6 +1521,32 @@ void ceph_oid_destroy(struct ceph_object_id *oid) } EXPORT_SYMBOL(ceph_oid_destroy); +/* + * osds only + */ +static bool __osds_equal(const struct ceph_osds *lhs, + const struct ceph_osds *rhs) +{ + if (lhs->size == rhs->size && + !memcmp(lhs->osds, rhs->osds, rhs->size * sizeof(rhs->osds[0]))) + return true; + + return false; +} + +/* + * osds + primary + */ +static bool osds_equal(const struct ceph_osds *lhs, + const struct ceph_osds *rhs) +{ + if (__osds_equal(lhs, rhs) && + lhs->primary == rhs->primary) + return true; + + return false; +} + static bool osds_valid(const struct ceph_osds *set) { /* non-empty set */ @@ -1553,6 +1579,101 @@ void ceph_osds_copy(struct ceph_osds *dest, const struct ceph_osds *src) dest->primary = src->primary; } +static bool is_split(const struct ceph_pg *pgid, + u32 old_pg_num, + u32 new_pg_num) +{ + int old_bits = calc_bits_of(old_pg_num); + int old_mask = (1 << old_bits) - 1; + int n; + + WARN_ON(pgid->seed >= old_pg_num); + if (new_pg_num <= old_pg_num) + return false; + + for (n = 1; ; n++) { + int next_bit = n << (old_bits - 1); + u32 s = next_bit | pgid->seed; + + if (s < old_pg_num || s == pgid->seed) + continue; + if (s >= new_pg_num) + break; + + s = ceph_stable_mod(s, old_pg_num, old_mask); + if (s == pgid->seed) + return true; + } + + return false; +} + +bool ceph_is_new_interval(const struct ceph_osds *old_acting, + const struct ceph_osds *new_acting, + const struct ceph_osds *old_up, + const struct ceph_osds *new_up, + int old_size, + int new_size, + int old_min_size, + int new_min_size, + u32 old_pg_num, + u32 new_pg_num, + bool old_sort_bitwise, + bool new_sort_bitwise, + const struct ceph_pg *pgid) +{ + return !osds_equal(old_acting, new_acting) || + !osds_equal(old_up, new_up) || + old_size != new_size || + old_min_size != new_min_size || + is_split(pgid, old_pg_num, new_pg_num) || + old_sort_bitwise != new_sort_bitwise; +} + +static int calc_pg_rank(int osd, const struct ceph_osds *acting) +{ + int i; + + for (i = 0; i < acting->size; i++) { + if (acting->osds[i] == osd) + return i; + } + + return -1; +} + +static bool primary_changed(const struct ceph_osds *old_acting, + const struct ceph_osds *new_acting) +{ + if (!old_acting->size && !new_acting->size) + return false; /* both still empty */ + + if (!old_acting->size ^ !new_acting->size) + return true; /* was empty, now not, or vice versa */ + + if (old_acting->primary != new_acting->primary) + return true; /* primary changed */ + + if (calc_pg_rank(old_acting->primary, old_acting) != + calc_pg_rank(new_acting->primary, new_acting)) + return true; + + return false; /* same primary (tho replicas may have changed) */ +} + +bool ceph_osds_changed(const struct ceph_osds *old_acting, + const struct ceph_osds *new_acting, + bool any_change) +{ + if (primary_changed(old_acting, new_acting)) + return true; + + if (any_change && !__osds_equal(old_acting, new_acting)) + return true; + + return false; +} + /* * calculate file layout from given offset, length. * fill in correct oid, logical length, and object extent -- cgit v1.2.3-70-g09d2 From e5253a7bde13788d9dc75f42eb47ea119af5609f Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 28 Apr 2016 16:07:25 +0200 Subject: libceph: allocate dummy osdmap in ceph_osdc_init() This leads to a simpler osdmap handling code, particularly when dealing with pi->was_full, which is introduced in a later commit. Signed-off-by: Ilya Dryomov --- include/linux/ceph/osdmap.h | 1 + net/ceph/osd_client.c | 22 +++++++++++----------- net/ceph/osdmap.c | 23 ++++++++++++++++++----- 3 files changed, 30 insertions(+), 16 deletions(-) (limited to 'net/ceph/osdmap.c') diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h index 420bb7968b25..8468c734d712 100644 --- a/include/linux/ceph/osdmap.h +++ b/include/linux/ceph/osdmap.h @@ -225,6 +225,7 @@ static inline int ceph_decode_pgid(void **p, void *end, struct ceph_pg *pgid) return 0; } +struct ceph_osdmap *ceph_osdmap_alloc(void); extern struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end); struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, struct ceph_osdmap *map); diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 41dabce9c9c3..9c35fd84a410 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -2255,7 +2255,7 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg) struct ceph_fsid fsid; bool was_full; - dout("handle_map have %u\n", osdc->osdmap ? osdc->osdmap->epoch : 0); + dout("handle_map have %u\n", osdc->osdmap->epoch); p = msg->front.iov_base; end = p + msg->front.iov_len; @@ -2278,7 +2278,7 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg) maplen = ceph_decode_32(&p); ceph_decode_need(&p, end, maplen, bad); next = p + maplen; - if (osdc->osdmap && osdc->osdmap->epoch+1 == epoch) { + if (osdc->osdmap->epoch+1 == epoch) { dout("applying incremental map %u len %d\n", epoch, maplen); newmap = osdmap_apply_incremental(&p, next, @@ -2317,7 +2317,7 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg) if (nr_maps > 1) { dout("skipping non-latest full map %u len %d\n", epoch, maplen); - } else if (osdc->osdmap && osdc->osdmap->epoch >= epoch) { + } else if (osdc->osdmap->epoch >= epoch) { dout("skipping full map %u len %d, " "older than our %u\n", epoch, maplen, osdc->osdmap->epoch); @@ -2347,8 +2347,6 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg) nr_maps--; } - if (!osdc->osdmap) - goto bad; done: downgrade_write(&osdc->map_sem); ceph_monc_got_map(&osdc->client->monc, CEPH_SUB_OSDMAP, @@ -2690,7 +2688,6 @@ int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client) dout("init\n"); osdc->client = client; - osdc->osdmap = NULL; init_rwsem(&osdc->map_sem); mutex_init(&osdc->request_mutex); osdc->last_tid = 0; @@ -2709,10 +2706,14 @@ int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client) osdc->event_count = 0; err = -ENOMEM; + osdc->osdmap = ceph_osdmap_alloc(); + if (!osdc->osdmap) + goto out; + osdc->req_mempool = mempool_create_slab_pool(10, ceph_osd_request_cache); if (!osdc->req_mempool) - goto out; + goto out_map; err = ceph_msgpool_init(&osdc->msgpool_op, CEPH_MSG_OSD_OP, PAGE_SIZE, 10, true, "osd_op"); @@ -2741,6 +2742,8 @@ out_msgpool: ceph_msgpool_destroy(&osdc->msgpool_op); out_mempool: mempool_destroy(osdc->req_mempool); +out_map: + ceph_osdmap_destroy(osdc->osdmap); out: return err; } @@ -2760,10 +2763,7 @@ void ceph_osdc_stop(struct ceph_osd_client *osdc) } mutex_unlock(&osdc->request_mutex); - if (osdc->osdmap) { - ceph_osdmap_destroy(osdc->osdmap); - osdc->osdmap = NULL; - } + ceph_osdmap_destroy(osdc->osdmap); mempool_destroy(osdc->req_mempool); ceph_msgpool_destroy(&osdc->msgpool_op); ceph_msgpool_destroy(&osdc->msgpool_op_reply); diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index 7d4a5b43085e..cde52e94732f 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -707,6 +707,23 @@ bad: /* * osd map */ +struct ceph_osdmap *ceph_osdmap_alloc(void) +{ + struct ceph_osdmap *map; + + map = kzalloc(sizeof(*map), GFP_NOIO); + if (!map) + return NULL; + + map->pg_pools = RB_ROOT; + map->pool_max = -1; + map->pg_temp = RB_ROOT; + map->primary_temp = RB_ROOT; + mutex_init(&map->crush_scratch_mutex); + + return map; +} + void ceph_osdmap_destroy(struct ceph_osdmap *map) { dout("osdmap_destroy %p\n", map); @@ -1230,14 +1247,10 @@ struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end) struct ceph_osdmap *map; int ret; - map = kzalloc(sizeof(*map), GFP_NOFS); + map = ceph_osdmap_alloc(); if (!map) return ERR_PTR(-ENOMEM); - map->pg_temp = RB_ROOT; - map->primary_temp = RB_ROOT; - mutex_init(&map->crush_scratch_mutex); - ret = osdmap_decode(p, end, map); if (ret) { ceph_osdmap_destroy(map); -- cgit v1.2.3-70-g09d2