From 3593815022998ab75379f64735ff67b3ea388cbe Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Thu, 2 Aug 2012 11:29:46 -0500 Subject: rbd: simplify __rbd_init_snaps_header() The purpose of __rbd_init_snaps_header() is to compare a new snapshot context with an rbd device's list of existing snapshots. It updates the list by adding any new snapshots or removing any that are not present in the new snapshot context. The code as written is a little confusing, because it traverses both the existing snapshot list and the set of snapshots in the snapshot context in reverse. This was done based on an assumption about snapshots that is not true--namely that a duplicate snapshot name could cause an error in intepreting things if they were not processed in ascending order. These precautions are not necessary, because: - all snapshots are uniquely identified by their snapshot id - a new snapshot cannot be created if the rbd device has another snapshot with the same name (It is furthermore not currently possible to rename a snapshot.) This patch re-implements __rbd_init_snaps_header() so it passes through both the existing snapshot list and the entries in the snapshot context in forward order. It still does the same thing as before, but I find the logic considerably easier to understand. By going forward through the names in the snapshot context, there is no longer a need for the rbd_prev_snap_name() helper function. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 143 +++++++++++++++++++++++----------------------------- 1 file changed, 64 insertions(+), 79 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 54a55f03115d..bf418a6afbe0 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -2066,97 +2066,82 @@ err: } /* - * search for the previous snap in a null delimited string list - */ -const char *rbd_prev_snap_name(const char *name, const char *start) -{ - if (name < start + 2) - return NULL; - - name -= 2; - while (*name) { - if (name == start) - return start; - name--; - } - return name + 1; -} - -/* - * compare the old list of snapshots that we have to what's in the header - * and update it accordingly. Note that the header holds the snapshots - * in a reverse order (from newest to oldest) and we need to go from - * older to new so that we don't get a duplicate snap name when - * doing the process (e.g., removed snapshot and recreated a new - * one with the same name. + * Scan the rbd device's current snapshot list and compare it to the + * newly-received snapshot context. Remove any existing snapshots + * not present in the new snapshot context. Add a new snapshot for + * any snaphots in the snapshot context not in the current list. + * And verify there are no changes to snapshots we already know + * about. + * + * Assumes the snapshots in the snapshot context are sorted by + * snapshot id, highest id first. (Snapshots in the rbd_dev's list + * are also maintained in that order.) */ static int __rbd_init_snaps_header(struct rbd_device *rbd_dev) { - const char *name, *first_name; - int i = rbd_dev->header.total_snaps; - struct rbd_snap *snap, *old_snap = NULL; - struct list_head *p, *n; + struct ceph_snap_context *snapc = rbd_dev->header.snapc; + const u32 snap_count = snapc->num_snaps; + char *snap_name = rbd_dev->header.snap_names; + struct list_head *head = &rbd_dev->snaps; + struct list_head *links = head->next; + u32 index = 0; - first_name = rbd_dev->header.snap_names; - name = first_name + rbd_dev->header.snap_names_len; + while (index < snap_count || links != head) { + u64 snap_id; + struct rbd_snap *snap; - list_for_each_prev_safe(p, n, &rbd_dev->snaps) { - u64 cur_id; + snap_id = index < snap_count ? snapc->snaps[index] + : CEPH_NOSNAP; + snap = links != head ? list_entry(links, struct rbd_snap, node) + : NULL; + BUG_ON(snap && snap->id == CEPH_NOSNAP); - old_snap = list_entry(p, struct rbd_snap, node); + if (snap_id == CEPH_NOSNAP || (snap && snap->id > snap_id)) { + struct list_head *next = links->next; - if (i) - cur_id = rbd_dev->header.snapc->snaps[i - 1]; + /* Existing snapshot not in the new snap context */ - if (!i || old_snap->id < cur_id) { - /* - * old_snap->id was skipped, thus was - * removed. If this rbd_dev is mapped to - * the removed snapshot, record that it no - * longer exists, to prevent further I/O. - */ - if (rbd_dev->snap_id == old_snap->id) + if (rbd_dev->snap_id == snap->id) rbd_dev->snap_exists = false; - __rbd_remove_snap_dev(old_snap); - continue; - } - if (old_snap->id == cur_id) { - /* we have this snapshot already */ - i--; - name = rbd_prev_snap_name(name, first_name); + __rbd_remove_snap_dev(snap); + + /* Done with this list entry; advance */ + + links = next; continue; } - for (; i > 0; - i--, name = rbd_prev_snap_name(name, first_name)) { - if (!name) { - WARN_ON(1); - return -EINVAL; - } - cur_id = rbd_dev->header.snapc->snaps[i]; - /* snapshot removal? handle it above */ - if (cur_id >= old_snap->id) - break; - /* a new snapshot */ - snap = __rbd_add_snap_dev(rbd_dev, i - 1, name); - if (IS_ERR(snap)) - return PTR_ERR(snap); - - /* note that we add it backward so using n and not p */ - list_add(&snap->node, n); - p = &snap->node; - } - } - /* we're done going over the old snap list, just add what's left */ - for (; i > 0; i--) { - name = rbd_prev_snap_name(name, first_name); - if (!name) { - WARN_ON(1); - return -EINVAL; + + if (!snap || (snap_id != CEPH_NOSNAP && snap->id < snap_id)) { + struct rbd_snap *new_snap; + + /* We haven't seen this snapshot before */ + + new_snap = __rbd_add_snap_dev(rbd_dev, index, + snap_name); + if (IS_ERR(new_snap)) + return PTR_ERR(new_snap); + + /* New goes before existing, or at end of list */ + + if (snap) + list_add_tail(&new_snap->node, &snap->node); + else + list_add(&new_snap->node, head); + } else { + /* Already have this one */ + + BUG_ON(snap->size != rbd_dev->header.snap_sizes[index]); + BUG_ON(strcmp(snap->name, snap_name)); + + /* Done with this list entry; advance */ + + links = links->next; } - snap = __rbd_add_snap_dev(rbd_dev, i - 1, name); - if (IS_ERR(snap)) - return PTR_ERR(snap); - list_add(&snap->node, &rbd_dev->snaps); + + /* Advance to the next entry in the snapshot context */ + + index++; + snap_name += strlen(snap_name) + 1; } return 0; -- cgit v1.2.3-70-g09d2 From 0f1d3f938527f319d830ef3082c218c77cfd159f Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Thu, 2 Aug 2012 11:29:44 -0500 Subject: rbd: make snap_names_len a u64 The snap_names_len field of an rbd_image_header structure is defined with type size_t. That field is used as both the source and target of 64-bit byte-order swapping operations though, so it's best to define it with type u64 instead. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index bf418a6afbe0..02de524d4b67 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -81,7 +81,7 @@ struct rbd_image_header { __u8 crypt_type; __u8 comp_type; struct ceph_snap_context *snapc; - size_t snap_names_len; + u64 snap_names_len; u32 total_snaps; char *snap_names; @@ -510,6 +510,7 @@ static int rbd_header_from_disk(struct rbd_image_header *header, if (snap_count) { header->snap_names_len = le64_to_cpu(ondisk->snap_names_len); + BUG_ON(header->snap_names_len > (u64) SIZE_MAX); header->snap_names = kmalloc(header->snap_names_len, GFP_KERNEL); if (!header->snap_names) -- cgit v1.2.3-70-g09d2 From d78fd7ae03136c0610bee33eeebb4ffe67c752d5 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Thu, 26 Jul 2012 23:37:14 -0500 Subject: rbd: ensure invalid pointers are made null Fix a number of spots where a pointer value that is known to have become invalid but was not reset to null. Also, toss in a change so we use sizeof (object) rather than sizeof (type). Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 02de524d4b67..e5eaa70e8826 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -568,6 +568,7 @@ err_sizes: err_names: kfree(header->snap_names); header->snap_names = NULL; + header->snap_names_len = 0; err_snapc: kfree(header->snapc); header->snapc = NULL; @@ -631,9 +632,14 @@ done: static void rbd_header_free(struct rbd_image_header *header) { kfree(header->object_prefix); + header->object_prefix = NULL; kfree(header->snap_sizes); + header->snap_sizes = NULL; kfree(header->snap_names); + header->snap_names = NULL; + header->snap_names_len = 0; ceph_put_snap_context(header->snapc); + header->snapc = NULL; } /* @@ -2418,7 +2424,10 @@ static int rbd_add_parse_args(struct rbd_device *rbd_dev, out_err: kfree(rbd_dev->header_name); + rbd_dev->header_name = NULL; kfree(rbd_dev->image_name); + rbd_dev->image_name = NULL; + rbd_dev->image_name_len = 0; kfree(rbd_dev->pool_name); rbd_dev->pool_name = NULL; @@ -2470,6 +2479,7 @@ static ssize_t rbd_add(struct bus_type *bus, options); if (IS_ERR(rbd_dev->rbd_client)) { rc = PTR_ERR(rbd_dev->rbd_client); + rbd_dev->rbd_client = NULL; goto err_put_id; } -- cgit v1.2.3-70-g09d2 From d2bb24e506596ad0c10e4a7f4b2fca88cc75c0bc Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Thu, 26 Jul 2012 23:37:14 -0500 Subject: rbd: use sizeof (object) instead of sizeof (type) Fix a few spots in rbd_header_from_disk() to use sizeof (object) rather than sizeof (type). Use a local variable to record sizes to shorten some lines and improve readability. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index e5eaa70e8826..2a94f8e81f67 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -494,17 +494,21 @@ static int rbd_header_from_disk(struct rbd_image_header *header, u32 allocated_snaps) { u32 snap_count; + size_t size; if (!rbd_dev_ondisk_valid(ondisk)) return -ENXIO; snap_count = le32_to_cpu(ondisk->snap_count); - if (snap_count > (SIZE_MAX - sizeof(struct ceph_snap_context)) - / sizeof (u64)) + + /* Make sure we don't overflow below */ + size = SIZE_MAX - sizeof (struct ceph_snap_context); + if (snap_count > size / sizeof (header->snapc->snaps[0])) return -EINVAL; - header->snapc = kmalloc(sizeof(struct ceph_snap_context) + - snap_count * sizeof(u64), - GFP_KERNEL); + + size = sizeof (struct ceph_snap_context); + size += snap_count * sizeof (header->snapc->snaps[0]); + header->snapc = kmalloc(size, GFP_KERNEL); if (!header->snapc) return -ENOMEM; @@ -515,8 +519,8 @@ static int rbd_header_from_disk(struct rbd_image_header *header, GFP_KERNEL); if (!header->snap_names) goto err_snapc; - header->snap_sizes = kmalloc(snap_count * sizeof(u64), - GFP_KERNEL); + size = snap_count * sizeof (*header->snap_sizes); + header->snap_sizes = kmalloc(size, GFP_KERNEL); if (!header->snap_sizes) goto err_names; } else { @@ -526,14 +530,12 @@ static int rbd_header_from_disk(struct rbd_image_header *header, header->snap_sizes = NULL; } - header->object_prefix = kmalloc(sizeof (ondisk->block_name) + 1, - GFP_KERNEL); + size = sizeof (ondisk->block_name) + 1; + header->object_prefix = kmalloc(size, GFP_KERNEL); if (!header->object_prefix) goto err_sizes; - - memcpy(header->object_prefix, ondisk->block_name, - sizeof(ondisk->block_name)); - header->object_prefix[sizeof (ondisk->block_name)] = '\0'; + memcpy(header->object_prefix, ondisk->block_name, size - 1); + header->object_prefix[size - 1] = '\0'; header->image_size = le64_to_cpu(ondisk->image_size); header->obj_order = ondisk->options.order; -- cgit v1.2.3-70-g09d2 From 6a52325f61760c6a7d7f3ea9736029bc9f63e7f3 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Thu, 19 Jul 2012 17:12:59 -0500 Subject: rbd: rearrange rbd_header_from_disk() This just moves code around for the most part. It was pulled out as a separate patch to avoid cluttering up some upcoming patches which are more substantive. The point is basically to group everything related to initializing the snapshot context together. The only functional change is that rbd_header_from_disk() now ensures the (in-core) header it is passed is zero-filled. This allows a simpler error handling path in rbd_header_from_disk(). Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 41 ++++++++++++++++++++++------------------- 1 file changed, 22 insertions(+), 19 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 2a94f8e81f67..c9de0f8e808e 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -506,11 +506,14 @@ static int rbd_header_from_disk(struct rbd_image_header *header, if (snap_count > size / sizeof (header->snapc->snaps[0])) return -EINVAL; - size = sizeof (struct ceph_snap_context); - size += snap_count * sizeof (header->snapc->snaps[0]); - header->snapc = kmalloc(size, GFP_KERNEL); - if (!header->snapc) + memset(header, 0, sizeof (*header)); + + size = sizeof (ondisk->block_name) + 1; + header->object_prefix = kmalloc(size, GFP_KERNEL); + if (!header->object_prefix) return -ENOMEM; + memcpy(header->object_prefix, ondisk->block_name, size - 1); + header->object_prefix[size - 1] = '\0'; if (snap_count) { header->snap_names_len = le64_to_cpu(ondisk->snap_names_len); @@ -518,11 +521,12 @@ static int rbd_header_from_disk(struct rbd_image_header *header, header->snap_names = kmalloc(header->snap_names_len, GFP_KERNEL); if (!header->snap_names) - goto err_snapc; + goto out_err; + size = snap_count * sizeof (*header->snap_sizes); header->snap_sizes = kmalloc(size, GFP_KERNEL); if (!header->snap_sizes) - goto err_names; + goto out_err; } else { WARN_ON(ondisk->snap_names_len); header->snap_names_len = 0; @@ -530,22 +534,23 @@ static int rbd_header_from_disk(struct rbd_image_header *header, header->snap_sizes = NULL; } - size = sizeof (ondisk->block_name) + 1; - header->object_prefix = kmalloc(size, GFP_KERNEL); - if (!header->object_prefix) - goto err_sizes; - memcpy(header->object_prefix, ondisk->block_name, size - 1); - header->object_prefix[size - 1] = '\0'; - header->image_size = le64_to_cpu(ondisk->image_size); header->obj_order = ondisk->options.order; header->crypt_type = ondisk->options.crypt_type; header->comp_type = ondisk->options.comp_type; + header->total_snaps = snap_count; + + /* Set up the snapshot context */ + + size = sizeof (struct ceph_snap_context); + size += snap_count * sizeof (header->snapc->snaps[0]); + header->snapc = kzalloc(size, GFP_KERNEL); + if (!header->snapc) + goto out_err; atomic_set(&header->snapc->nref, 1); header->snapc->seq = le64_to_cpu(ondisk->snap_seq); header->snapc->num_snaps = snap_count; - header->total_snaps = snap_count; if (snap_count && allocated_snaps == snap_count) { int i; @@ -564,16 +569,14 @@ static int rbd_header_from_disk(struct rbd_image_header *header, return 0; -err_sizes: +out_err: kfree(header->snap_sizes); header->snap_sizes = NULL; -err_names: kfree(header->snap_names); header->snap_names = NULL; header->snap_names_len = 0; -err_snapc: - kfree(header->snapc); - header->snapc = NULL; + kfree(header->object_prefix); + header->object_prefix = NULL; return -ENOMEM; } -- cgit v1.2.3-70-g09d2 From 28cb775de1bd1bcc62c43f767ab81b7b9cfb6678 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Thu, 26 Jul 2012 23:37:15 -0500 Subject: rbd: return earlier in rbd_header_from_disk() The only caller of rbd_header_from_disk() is rbd_read_header(). It passes as allocated_snaps the number of snapshots it will have received from the server for the snapshot context that rbd_header_from_disk() is to interpret. The first time through it provides 0--mainly to extract the number of snapshots from the snapshot context header--so that it can allocate an appropriately-sized buffer to receive the entire snapshot context from the server in a second request. rbd_header_from_disk() will not fill in the array of snapshot ids unless the number in the snapshot matches the number the caller had allocated. This patch adjusts that logic a little further to be more efficient. rbd_read_header() doesn't even examine the snapshot context unless the snapshot count (stored in header->total_snaps) matches the number of snapshots allocated. So rbd_header_from_disk() doesn't need to allocate or fill in the snapshot context field at all in that case. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index c9de0f8e808e..aff4e8a01ea5 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -540,7 +540,14 @@ static int rbd_header_from_disk(struct rbd_image_header *header, header->comp_type = ondisk->options.comp_type; header->total_snaps = snap_count; - /* Set up the snapshot context */ + /* + * If the number of snapshot ids provided by the caller + * doesn't match the number in the entire context there's + * no point in going further. Caller will try again after + * getting an updated snapshot context from the server. + */ + if (allocated_snaps != snap_count) + return 0; size = sizeof (struct ceph_snap_context); size += snap_count * sizeof (header->snapc->snaps[0]); @@ -552,8 +559,10 @@ static int rbd_header_from_disk(struct rbd_image_header *header, header->snapc->seq = le64_to_cpu(ondisk->snap_seq); header->snapc->num_snaps = snap_count; - if (snap_count && allocated_snaps == snap_count) { - int i; + /* Fill in the snapshot information */ + + if (snap_count) { + u32 i; for (i = 0; i < snap_count; i++) { header->snapc->snaps[i] = -- cgit v1.2.3-70-g09d2 From 103a150f0cc57576b1c4b80bf07af60a14349eee Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Thu, 2 Aug 2012 11:29:45 -0500 Subject: rbd: expand rbd_dev_ondisk_valid() checks Add checks on the validity of the snap_count and snap_names_len field values in rbd_dev_ondisk_valid(). This eliminates the need to do them in rbd_header_from_disk(). Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 36 +++++++++++++++++++++++++++--------- 1 file changed, 27 insertions(+), 9 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index aff4e8a01ea5..5bcd4ebb22e7 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -481,8 +481,31 @@ static void rbd_coll_release(struct kref *kref) static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk) { - return !memcmp(&ondisk->text, - RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)); + size_t size; + u32 snap_count; + + /* The header has to start with the magic rbd header text */ + if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT))) + return false; + + /* + * The size of a snapshot header has to fit in a size_t, and + * that limits the number of snapshots. + */ + snap_count = le32_to_cpu(ondisk->snap_count); + size = SIZE_MAX - sizeof (struct ceph_snap_context); + if (snap_count > size / sizeof (__le64)) + return false; + + /* + * Not only that, but the size of the entire the snapshot + * header must also be representable in a size_t. + */ + size -= snap_count * sizeof (__le64); + if ((u64) size < le64_to_cpu(ondisk->snap_names_len)) + return false; + + return true; } /* @@ -499,15 +522,10 @@ static int rbd_header_from_disk(struct rbd_image_header *header, if (!rbd_dev_ondisk_valid(ondisk)) return -ENXIO; - snap_count = le32_to_cpu(ondisk->snap_count); - - /* Make sure we don't overflow below */ - size = SIZE_MAX - sizeof (struct ceph_snap_context); - if (snap_count > size / sizeof (header->snapc->snaps[0])) - return -EINVAL; - memset(header, 0, sizeof (*header)); + snap_count = le32_to_cpu(ondisk->snap_count); + size = sizeof (ondisk->block_name) + 1; header->object_prefix = kmalloc(size, GFP_KERNEL); if (!header->object_prefix) -- cgit v1.2.3-70-g09d2 From 4156d998409be065aa8141b6bd2c6f18be1b21e9 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Thu, 2 Aug 2012 11:29:46 -0500 Subject: rbd: separate reading header from decoding it Right now rbd_read_header() both reads the header object for an rbd image and decodes its contents. It does this repeatedly if needed, in order to ensure a complete and intact header is obtained. Separate this process into two steps--reading of the raw header data (in new function, rbd_dev_v1_header_read()) and separately decoding its contents (in rbd_header_from_disk()). As a result, the latter function no longer requires its allocated_snaps argument. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 136 ++++++++++++++++++++++++++++++---------------------- 1 file changed, 79 insertions(+), 57 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 5bcd4ebb22e7..8e6e29eacb1a 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -513,15 +513,11 @@ static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk) * header. */ static int rbd_header_from_disk(struct rbd_image_header *header, - struct rbd_image_header_ondisk *ondisk, - u32 allocated_snaps) + struct rbd_image_header_ondisk *ondisk) { u32 snap_count; size_t size; - if (!rbd_dev_ondisk_valid(ondisk)) - return -ENXIO; - memset(header, 0, sizeof (*header)); snap_count = le32_to_cpu(ondisk->snap_count); @@ -558,15 +554,6 @@ static int rbd_header_from_disk(struct rbd_image_header *header, header->comp_type = ondisk->options.comp_type; header->total_snaps = snap_count; - /* - * If the number of snapshot ids provided by the caller - * doesn't match the number in the entire context there's - * no point in going further. Caller will try again after - * getting an updated snapshot context from the server. - */ - if (allocated_snaps != snap_count) - return 0; - size = sizeof (struct ceph_snap_context); size += snap_count * sizeof (header->snapc->snaps[0]); header->snapc = kzalloc(size, GFP_KERNEL); @@ -1629,61 +1616,96 @@ static void rbd_free_disk(struct rbd_device *rbd_dev) } /* - * reload the ondisk the header + * Read the complete header for the given rbd device. + * + * Returns a pointer to a dynamically-allocated buffer containing + * the complete and validated header. Caller can pass the address + * of a variable that will be filled in with the version of the + * header object at the time it was read. + * + * Returns a pointer-coded errno if a failure occurs. */ -static int rbd_read_header(struct rbd_device *rbd_dev, - struct rbd_image_header *header) +static struct rbd_image_header_ondisk * +rbd_dev_v1_header_read(struct rbd_device *rbd_dev, u64 *version) { - ssize_t rc; - struct rbd_image_header_ondisk *dh; + struct rbd_image_header_ondisk *ondisk = NULL; u32 snap_count = 0; - u64 ver; - size_t len; + u64 names_size = 0; + u32 want_count; + int ret; /* - * First reads the fixed-size header to determine the number - * of snapshots, then re-reads it, along with all snapshot - * records as well as their stored names. + * The complete header will include an array of its 64-bit + * snapshot ids, followed by the names of those snapshots as + * a contiguous block of NUL-terminated strings. Note that + * the number of snapshots could change by the time we read + * it in, in which case we re-read it. */ - len = sizeof (*dh); - while (1) { - dh = kmalloc(len, GFP_KERNEL); - if (!dh) - return -ENOMEM; - - rc = rbd_req_sync_read(rbd_dev, - CEPH_NOSNAP, + do { + size_t size; + + kfree(ondisk); + + size = sizeof (*ondisk); + size += snap_count * sizeof (struct rbd_image_snap_ondisk); + size += names_size; + ondisk = kmalloc(size, GFP_KERNEL); + if (!ondisk) + return ERR_PTR(-ENOMEM); + + ret = rbd_req_sync_read(rbd_dev, CEPH_NOSNAP, rbd_dev->header_name, - 0, len, - (char *)dh, &ver); - if (rc < 0) - goto out_dh; - - rc = rbd_header_from_disk(header, dh, snap_count); - if (rc < 0) { - if (rc == -ENXIO) - pr_warning("unrecognized header format" - " for image %s\n", - rbd_dev->image_name); - goto out_dh; + 0, size, + (char *) ondisk, version); + + if (ret < 0) + goto out_err; + if (WARN_ON((size_t) ret < size)) { + ret = -ENXIO; + pr_warning("short header read for image %s" + " (want %zd got %d)\n", + rbd_dev->image_name, size, ret); + goto out_err; + } + if (!rbd_dev_ondisk_valid(ondisk)) { + ret = -ENXIO; + pr_warning("invalid header for image %s\n", + rbd_dev->image_name); + goto out_err; } - if (snap_count == header->total_snaps) - break; + names_size = le64_to_cpu(ondisk->snap_names_len); + want_count = snap_count; + snap_count = le32_to_cpu(ondisk->snap_count); + } while (snap_count != want_count); - snap_count = header->total_snaps; - len = sizeof (*dh) + - snap_count * sizeof(struct rbd_image_snap_ondisk) + - header->snap_names_len; + return ondisk; - rbd_header_free(header); - kfree(dh); - } - header->obj_version = ver; +out_err: + kfree(ondisk); -out_dh: - kfree(dh); - return rc; + return ERR_PTR(ret); +} + +/* + * reload the ondisk the header + */ +static int rbd_read_header(struct rbd_device *rbd_dev, + struct rbd_image_header *header) +{ + struct rbd_image_header_ondisk *ondisk; + u64 ver = 0; + int ret; + + ondisk = rbd_dev_v1_header_read(rbd_dev, &ver); + if (IS_ERR(ondisk)) + return PTR_ERR(ondisk); + ret = rbd_header_from_disk(header, ondisk); + if (ret >= 0) + header->obj_version = ver; + kfree(ondisk); + + return ret; } /* -- cgit v1.2.3-70-g09d2 From c98f533c9497e285109a047bfb955d683f33f7e4 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Thu, 9 Aug 2012 10:33:26 -0700 Subject: ceph: let path portion of mount "device" be optional A recent change to /sbin/mountall causes any trailing '/' character in the "device" (or fs_spec) field in /etc/fstab to be stripped. As a result, an entry for a ceph mount that intends to mount the root of the name space ends up with now path portion, and the ceph mount option processing code rejects this. That is, an entry in /etc/fstab like: cephserver:port:/ /mnt ceph defaults 0 0 provides to the ceph code just "cephserver:port:" as the "device," and that gets rejected. Although this is a bug in /sbin/mountall, we can have the ceph mount code support an empty/nonexistent path, interpreting it to mean the root of the name space. RFC 5952 offers recommendations for how to express IPv6 addresses, and recommends the usage found in RFC 3986 (which specifies the format for URI's) for representing both IPv4 and IPv6 addresses that include port numbers. (See in particular the definition of "authority" found in the Appendix of RFC 3986.) According to those standards, no host specification will ever contain a '/' character. As a result, it is sufficient to scan a provided "device" from an /etc/fstab entry for the first '/' character, and if it's found, treat that as the beginning of the path. If no '/' character is present, we can treat the entire string as the monitor host specification(s), and assume the path to be the root of the name space. We'll still require a ':' to separate the host portion from the (possibly empty) path portion. This means that we can more formally define how ceph will interpret the "device" it's provided when processing a mount request: "device" will look like: [,...]:[] where is [:] is optional, but if present must begin with '/' This addresses http://tracker.newdream.net/issues/2919 Signed-off-by: Alex Elder Reviewed-by: Dan Mick --- fs/ceph/super.c | 37 ++++++++++++++++++++++++++----------- 1 file changed, 26 insertions(+), 11 deletions(-) diff --git a/fs/ceph/super.c b/fs/ceph/super.c index b982239f38f9..2f586b0e5e0f 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -307,7 +307,10 @@ static int parse_mount_options(struct ceph_mount_options **pfsopt, { struct ceph_mount_options *fsopt; const char *dev_name_end; - int err = -ENOMEM; + int err; + + if (!dev_name || !*dev_name) + return -EINVAL; fsopt = kzalloc(sizeof(*fsopt), GFP_KERNEL); if (!fsopt) @@ -328,21 +331,33 @@ static int parse_mount_options(struct ceph_mount_options **pfsopt, fsopt->max_readdir_bytes = CEPH_MAX_READDIR_BYTES_DEFAULT; fsopt->congestion_kb = default_congestion_kb(); - /* ip1[:port1][,ip2[:port2]...]:/subdir/in/fs */ + /* + * Distinguish the server list from the path in "dev_name". + * Internally we do not include the leading '/' in the path. + * + * "dev_name" will look like: + * [,...]:[] + * where + * is [:] + * is optional, but if present must begin with '/' + */ + dev_name_end = strchr(dev_name, '/'); + if (dev_name_end) { + /* skip over leading '/' for path */ + *path = dev_name_end + 1; + } else { + /* path is empty */ + dev_name_end = dev_name + strlen(dev_name); + *path = dev_name_end; + } err = -EINVAL; - if (!dev_name) - goto out; - *path = strstr(dev_name, ":/"); - if (*path == NULL) { - pr_err("device name is missing path (no :/ in %s)\n", + dev_name_end--; /* back up to ':' separator */ + if (*dev_name_end != ':') { + pr_err("device name is missing path (no : separator in %s)\n", dev_name); goto out; } - dev_name_end = *path; dout("device name '%.*s'\n", (int)(dev_name_end - dev_name), dev_name); - - /* path on server */ - *path += 2; dout("server path '%s'\n", *path); *popt = ceph_parse_options(options, dev_name, dev_name_end, -- cgit v1.2.3-70-g09d2 From 290e33593d76d1cebf873da50e036559c4820af9 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Fri, 17 Aug 2012 09:47:49 -0700 Subject: libceph: remove unused monc->have_fsid This is unused; use monc->client->have_fsid. Signed-off-by: Sage Weil --- include/linux/ceph/mon_client.h | 1 - net/ceph/mon_client.c | 1 - 2 files changed, 2 deletions(-) diff --git a/include/linux/ceph/mon_client.h b/include/linux/ceph/mon_client.h index 2113e3850a4e..1dc508aeb73d 100644 --- a/include/linux/ceph/mon_client.h +++ b/include/linux/ceph/mon_client.h @@ -71,7 +71,6 @@ struct ceph_mon_client { int cur_mon; /* last monitor i contacted */ unsigned long sub_sent, sub_renew_after; struct ceph_connection con; - bool have_fsid; /* pending generic requests */ struct rb_root generic_request_tree; diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c index 900ea0f043fc..e98f6070b5ae 100644 --- a/net/ceph/mon_client.c +++ b/net/ceph/mon_client.c @@ -769,7 +769,6 @@ static int build_initial_monmap(struct ceph_mon_client *monc) monc->monmap->mon_inst[i].name.num = cpu_to_le64(i); } monc->monmap->num_mon = num_mon; - monc->have_fsid = false; return 0; } -- cgit v1.2.3-70-g09d2 From 7698f2f5e0d7b7a062213fa970b7c4e121adf38e Mon Sep 17 00:00:00 2001 From: Iulius Curt Date: Thu, 23 Aug 2012 15:14:29 +0300 Subject: libceph: Fix sparse warning Make ceph_monc_do_poolop() static to remove the following sparse warning: * net/ceph/mon_client.c:616:5: warning: symbol 'ceph_monc_do_poolop' was not declared. Should it be static? Also drops the 'ceph_monc_' prefix, now being a private function. Signed-off-by: Iulius Curt Signed-off-by: Sage Weil --- net/ceph/mon_client.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c index e98f6070b5ae..812eb3b46c1f 100644 --- a/net/ceph/mon_client.c +++ b/net/ceph/mon_client.c @@ -637,7 +637,7 @@ bad: /* * Do a synchronous pool op. */ -int ceph_monc_do_poolop(struct ceph_mon_client *monc, u32 op, +static int do_poolop(struct ceph_mon_client *monc, u32 op, u32 pool, u64 snapid, char *buf, int len) { @@ -687,7 +687,7 @@ out: int ceph_monc_create_snapid(struct ceph_mon_client *monc, u32 pool, u64 *snapid) { - return ceph_monc_do_poolop(monc, POOL_OP_CREATE_UNMANAGED_SNAP, + return do_poolop(monc, POOL_OP_CREATE_UNMANAGED_SNAP, pool, 0, (char *)snapid, sizeof(*snapid)); } @@ -696,7 +696,7 @@ EXPORT_SYMBOL(ceph_monc_create_snapid); int ceph_monc_delete_snapid(struct ceph_mon_client *monc, u32 pool, u64 snapid) { - return ceph_monc_do_poolop(monc, POOL_OP_CREATE_UNMANAGED_SNAP, + return do_poolop(monc, POOL_OP_CREATE_UNMANAGED_SNAP, pool, snapid, 0, 0); } -- cgit v1.2.3-70-g09d2 From 843a0d0879935742bb7270c9dc8d94abb8b39cee Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Fri, 31 Aug 2012 17:29:51 -0500 Subject: rbd: rename block_name -> object_prefix In the on-disk image header structure there is a field "block_name" which represents what we now call the "object prefix" for an rbd image. Rename this field "object_prefix" to be consistent with modern usage. This appears to be the only remaining vestige of the use of "block" in symbols that represent objects in the rbd code. This addresses http://tracker.newdream.net/issues/1761 Signed-off-by: Alex Elder Reviewed-by: Josh Durgin Reviewed-by: Dan Mick --- drivers/block/rbd.c | 4 ++-- drivers/block/rbd_types.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 8e6e29eacb1a..14bf83ba45d3 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -522,11 +522,11 @@ static int rbd_header_from_disk(struct rbd_image_header *header, snap_count = le32_to_cpu(ondisk->snap_count); - size = sizeof (ondisk->block_name) + 1; + size = sizeof (ondisk->object_prefix) + 1; header->object_prefix = kmalloc(size, GFP_KERNEL); if (!header->object_prefix) return -ENOMEM; - memcpy(header->object_prefix, ondisk->block_name, size - 1); + memcpy(header->object_prefix, ondisk->object_prefix, size - 1); header->object_prefix[size - 1] = '\0'; if (snap_count) { diff --git a/drivers/block/rbd_types.h b/drivers/block/rbd_types.h index 0924e9e41a60..d9d8a77748bc 100644 --- a/drivers/block/rbd_types.h +++ b/drivers/block/rbd_types.h @@ -47,7 +47,7 @@ struct rbd_image_snap_ondisk { struct rbd_image_header_ondisk { char text[40]; - char block_name[24]; + char object_prefix[24]; char signature[4]; char version[8]; struct { -- cgit v1.2.3-70-g09d2 From 523f32582f30768ab4e56b71b276fc1ea71f48cc Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Thu, 30 Aug 2012 00:16:37 -0500 Subject: rbd: add new snapshots at the tail This fixes a bug that went in with this commit: commit f6e0c99092cca7be00fca4080cfc7081739ca544 Author: Alex Elder Date: Thu Aug 2 11:29:46 2012 -0500 rbd: simplify __rbd_init_snaps_header() The problem is that a new rbd snapshot needs to go either after an existing snapshot entry, or at the *end* of an rbd device's snapshot list. As originally coded, it is placed at the beginning. This was based on the assumption the list would be empty (so it wouldn't matter), but in fact if multiple new snapshots are added to an empty list in one shot the list will be non-empty after the first one is added. This addresses http://tracker.newdream.net/issues/3063 Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 14bf83ba45d3..2b40a4af4d17 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -2187,7 +2187,7 @@ static int __rbd_init_snaps_header(struct rbd_device *rbd_dev) if (snap) list_add_tail(&new_snap->node, &snap->node); else - list_add(&new_snap->node, head); + list_add_tail(&new_snap->node, head); } else { /* Already have this one */ -- cgit v1.2.3-70-g09d2 From cc4829e5967de577794b25dfcd1a65e509d171ed Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Wed, 5 Sep 2012 14:34:32 +0800 Subject: ceph: use list_move_tail instead of list_del/list_add_tail Using list_move_tail() instead of list_del() + list_add_tail(). Signed-off-by: Wei Yongjun Signed-off-by: Sage Weil --- net/ceph/pagelist.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/net/ceph/pagelist.c b/net/ceph/pagelist.c index 665cd23020ff..92866bebb65f 100644 --- a/net/ceph/pagelist.c +++ b/net/ceph/pagelist.c @@ -1,4 +1,3 @@ - #include #include #include @@ -134,8 +133,8 @@ int ceph_pagelist_truncate(struct ceph_pagelist *pl, ceph_pagelist_unmap_tail(pl); while (pl->head.prev != c->page_lru) { page = list_entry(pl->head.prev, struct page, lru); - list_del(&page->lru); /* remove from pagelist */ - list_add_tail(&page->lru, &pl->free_list); /* add to reserve */ + /* move from pagelist to reserve */ + list_move_tail(&page->lru, &pl->free_list); ++pl->num_pages_free; } pl->room = c->room; -- cgit v1.2.3-70-g09d2 From 1f7ba3311530993801d6877889efff0382bcd641 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Fri, 10 Aug 2012 13:12:07 -0700 Subject: rbd: handle locking inside __rbd_client_find() There is only caller of __rbd_client_find(), and it somewhat clumsily gets the appropriate lock and gets a reference to the existing ceph_client structure if it's found. Instead, have that function handle its own locking, and acquire the reference if found while it holds the lock. Drop the underscores from the name because there's no need to signify anything special about this function. Signed-off-by: Alex Elder Reviewed-by: Yehuda Sadeh --- drivers/block/rbd.c | 29 ++++++++++++++++------------- 1 file changed, 16 insertions(+), 13 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 2b40a4af4d17..15bd3ecbcf34 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -322,19 +322,28 @@ out_opt: } /* - * Find a ceph client with specific addr and configuration. + * Find a ceph client with specific addr and configuration. If + * found, bump its reference count. */ -static struct rbd_client *__rbd_client_find(struct ceph_options *ceph_opts) +static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts) { struct rbd_client *client_node; + bool found = false; if (ceph_opts->flags & CEPH_OPT_NOSHARE) return NULL; - list_for_each_entry(client_node, &rbd_client_list, node) - if (!ceph_compare_options(ceph_opts, client_node->client)) - return client_node; - return NULL; + spin_lock(&rbd_client_list_lock); + list_for_each_entry(client_node, &rbd_client_list, node) { + if (!ceph_compare_options(ceph_opts, client_node->client)) { + kref_get(&client_node->kref); + found = true; + break; + } + } + spin_unlock(&rbd_client_list_lock); + + return found ? client_node : NULL; } /* @@ -416,22 +425,16 @@ static struct rbd_client *rbd_get_client(const char *mon_addr, return ERR_CAST(ceph_opts); } - spin_lock(&rbd_client_list_lock); - rbdc = __rbd_client_find(ceph_opts); + rbdc = rbd_client_find(ceph_opts); if (rbdc) { /* using an existing client */ - kref_get(&rbdc->kref); - spin_unlock(&rbd_client_list_lock); - ceph_destroy_options(ceph_opts); kfree(rbd_opts); return rbdc; } - spin_unlock(&rbd_client_list_lock); rbdc = rbd_client_create(ceph_opts, rbd_opts); - if (IS_ERR(rbdc)) kfree(rbd_opts); -- cgit v1.2.3-70-g09d2 From 58c17b0e1b2278824aedc5d1201f6a43a38d6a48 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Thu, 23 Aug 2012 23:22:06 -0500 Subject: rbd: don't over-allocate space for object prefix In rbd_header_from_disk() the object prefix buffer is sized based on the maximum size it's block_name equivalent on disk could be. Instead, only allocate enough to hold null-terminated string from the on-disk header--or the maximum size of no NUL is found. Signed-off-by: Alex Elder Reviewed-by: Yehuda Sadeh --- drivers/block/rbd.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 15bd3ecbcf34..a27167942a92 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -519,18 +519,19 @@ static int rbd_header_from_disk(struct rbd_image_header *header, struct rbd_image_header_ondisk *ondisk) { u32 snap_count; + size_t len; size_t size; memset(header, 0, sizeof (*header)); snap_count = le32_to_cpu(ondisk->snap_count); - size = sizeof (ondisk->object_prefix) + 1; - header->object_prefix = kmalloc(size, GFP_KERNEL); + len = strnlen(ondisk->object_prefix, sizeof (ondisk->object_prefix)); + header->object_prefix = kmalloc(len + 1, GFP_KERNEL); if (!header->object_prefix) return -ENOMEM; - memcpy(header->object_prefix, ondisk->object_prefix, size - 1); - header->object_prefix[size - 1] = '\0'; + memcpy(header->object_prefix, ondisk->object_prefix, len); + header->object_prefix[len] = '\0'; if (snap_count) { header->snap_names_len = le64_to_cpu(ondisk->snap_names_len); -- cgit v1.2.3-70-g09d2 From f785cc1dbe90b561b8ded92df4fe9732bdc54859 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Thu, 23 Aug 2012 23:22:06 -0500 Subject: rbd: kill incore snap_names_len The only thing the on-disk snap_names_len field is needed is to size the buffer allocated to hold a copy of the snapshot names for an rbd image. So don't bother saving it in the in-core rbd_image_header structure. Just use a local variable to hold the required buffer size while it's needed. Move the code that actually copies the snapshot names up closer to where the required length is saved. Signed-off-by: Alex Elder Reviewed-by: Yehuda Sadeh --- drivers/block/rbd.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index a27167942a92..163fd853a15f 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -81,7 +81,6 @@ struct rbd_image_header { __u8 crypt_type; __u8 comp_type; struct ceph_snap_context *snapc; - u64 snap_names_len; u32 total_snaps; char *snap_names; @@ -534,12 +533,21 @@ static int rbd_header_from_disk(struct rbd_image_header *header, header->object_prefix[len] = '\0'; if (snap_count) { - header->snap_names_len = le64_to_cpu(ondisk->snap_names_len); - BUG_ON(header->snap_names_len > (u64) SIZE_MAX); - header->snap_names = kmalloc(header->snap_names_len, - GFP_KERNEL); + u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len); + + if (snap_names_len > (u64) SIZE_MAX) + return -EIO; + header->snap_names = kmalloc(snap_names_len, GFP_KERNEL); if (!header->snap_names) goto out_err; + /* + * Note that rbd_dev_v1_header_read() guarantees + * the ondisk buffer we're working with has + * snap_names_len bytes beyond the end of the + * snapshot id array, this memcpy() is safe. + */ + memcpy(header->snap_names, &ondisk->snaps[snap_count], + snap_names_len); size = snap_count * sizeof (*header->snap_sizes); header->snap_sizes = kmalloc(size, GFP_KERNEL); @@ -547,7 +555,6 @@ static int rbd_header_from_disk(struct rbd_image_header *header, goto out_err; } else { WARN_ON(ondisk->snap_names_len); - header->snap_names_len = 0; header->snap_names = NULL; header->snap_sizes = NULL; } @@ -579,10 +586,6 @@ static int rbd_header_from_disk(struct rbd_image_header *header, header->snap_sizes[i] = le64_to_cpu(ondisk->snaps[i].image_size); } - - /* copy snapshot names */ - memcpy(header->snap_names, &ondisk->snaps[snap_count], - header->snap_names_len); } return 0; @@ -592,7 +595,6 @@ out_err: header->snap_sizes = NULL; kfree(header->snap_names); header->snap_names = NULL; - header->snap_names_len = 0; kfree(header->object_prefix); header->object_prefix = NULL; @@ -660,7 +662,6 @@ static void rbd_header_free(struct rbd_image_header *header) header->snap_sizes = NULL; kfree(header->snap_names); header->snap_names = NULL; - header->snap_names_len = 0; ceph_put_snap_context(header->snapc); header->snapc = NULL; } @@ -1800,7 +1801,6 @@ static int __rbd_refresh_header(struct rbd_device *rbd_dev, u64 *hver) rbd_dev->header.total_snaps = h.total_snaps; rbd_dev->header.snapc = h.snapc; rbd_dev->header.snap_names = h.snap_names; - rbd_dev->header.snap_names_len = h.snap_names_len; rbd_dev->header.snap_sizes = h.snap_sizes; /* Free the extra copy of the object prefix */ WARN_ON(strcmp(rbd_dev->header.object_prefix, h.object_prefix)); -- cgit v1.2.3-70-g09d2 From 621901d652db10ad8ceddd25dd4b883978a873e1 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Thu, 23 Aug 2012 23:22:06 -0500 Subject: rbd: more cleanup in rbd_header_from_disk() This just rearranges things a bit more in rbd_header_from_disk() so that the snapshot sizes are initialized right after the buffer to hold them is allocated and doing a little further consolidation that follows from that. Also adds a few simple comments. Signed-off-by: Alex Elder Reviewed-by: Yehuda Sadeh --- drivers/block/rbd.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 163fd853a15f..145fbf633621 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -520,6 +520,7 @@ static int rbd_header_from_disk(struct rbd_image_header *header, u32 snap_count; size_t len; size_t size; + u32 i; memset(header, 0, sizeof (*header)); @@ -535,6 +536,8 @@ static int rbd_header_from_disk(struct rbd_image_header *header, if (snap_count) { u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len); + /* Save a copy of the snapshot names */ + if (snap_names_len > (u64) SIZE_MAX) return -EIO; header->snap_names = kmalloc(snap_names_len, GFP_KERNEL); @@ -549,10 +552,15 @@ static int rbd_header_from_disk(struct rbd_image_header *header, memcpy(header->snap_names, &ondisk->snaps[snap_count], snap_names_len); + /* Record each snapshot's size */ + size = snap_count * sizeof (*header->snap_sizes); header->snap_sizes = kmalloc(size, GFP_KERNEL); if (!header->snap_sizes) goto out_err; + for (i = 0; i < snap_count; i++) + header->snap_sizes[i] = + le64_to_cpu(ondisk->snaps[i].image_size); } else { WARN_ON(ondisk->snap_names_len); header->snap_names = NULL; @@ -565,6 +573,8 @@ static int rbd_header_from_disk(struct rbd_image_header *header, header->comp_type = ondisk->options.comp_type; header->total_snaps = snap_count; + /* Allocate and fill in the snapshot context */ + size = sizeof (struct ceph_snap_context); size += snap_count * sizeof (header->snapc->snaps[0]); header->snapc = kzalloc(size, GFP_KERNEL); @@ -574,19 +584,9 @@ static int rbd_header_from_disk(struct rbd_image_header *header, atomic_set(&header->snapc->nref, 1); header->snapc->seq = le64_to_cpu(ondisk->snap_seq); header->snapc->num_snaps = snap_count; - - /* Fill in the snapshot information */ - - if (snap_count) { - u32 i; - - for (i = 0; i < snap_count; i++) { - header->snapc->snaps[i] = - le64_to_cpu(ondisk->snaps[i].id); - header->snap_sizes[i] = - le64_to_cpu(ondisk->snaps[i].image_size); - } - } + for (i = 0; i < snap_count; i++) + header->snapc->snaps[i] = + le64_to_cpu(ondisk->snaps[i].id); return 0; -- cgit v1.2.3-70-g09d2 From f8c3892911145db7cf895cc26f53ad73dd4e7b1f Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Fri, 10 Aug 2012 13:12:07 -0700 Subject: rbd: move rbd_opts to struct rbd_device The rbd options don't really apply to the ceph client. So don't store a pointer to it in the ceph_client structure, and put them (a struct, not a pointer) into the rbd_dev structure proper. Pass the rbd device structure to rbd_client_create() so it can assign rbd_dev->rbdc if successful, and have it return an error code instead of the rbd client pointer. Signed-off-by: Alex Elder Reviewed-by: Yehuda Sadeh --- drivers/block/rbd.c | 49 ++++++++++++++++--------------------------------- 1 file changed, 16 insertions(+), 33 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 145fbf633621..839ab730a1f3 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -98,7 +98,6 @@ struct rbd_options { */ struct rbd_client { struct ceph_client *client; - struct rbd_options *rbd_opts; struct kref kref; struct list_head node; }; @@ -152,6 +151,7 @@ struct rbd_device { struct gendisk *disk; /* blkdev's gendisk and rq */ struct request_queue *q; + struct rbd_options rbd_opts; struct rbd_client *rbd_client; char name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */ @@ -273,8 +273,7 @@ static const struct block_device_operations rbd_bd_ops = { * Initialize an rbd client instance. * We own *ceph_opts. */ -static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts, - struct rbd_options *rbd_opts) +static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts) { struct rbd_client *rbdc; int ret = -ENOMEM; @@ -298,8 +297,6 @@ static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts, if (ret < 0) goto out_err; - rbdc->rbd_opts = rbd_opts; - spin_lock(&rbd_client_list_lock); list_add_tail(&rbdc->node, &rbd_client_list); spin_unlock(&rbd_client_list_lock); @@ -402,42 +399,33 @@ static int parse_rbd_opts_token(char *c, void *private) * Get a ceph client with specific addr and configuration, if one does * not exist create it. */ -static struct rbd_client *rbd_get_client(const char *mon_addr, - size_t mon_addr_len, - char *options) +static int rbd_get_client(struct rbd_device *rbd_dev, const char *mon_addr, + size_t mon_addr_len, char *options) { - struct rbd_client *rbdc; + struct rbd_options *rbd_opts = &rbd_dev->rbd_opts; struct ceph_options *ceph_opts; - struct rbd_options *rbd_opts; - - rbd_opts = kzalloc(sizeof(*rbd_opts), GFP_KERNEL); - if (!rbd_opts) - return ERR_PTR(-ENOMEM); + struct rbd_client *rbdc; rbd_opts->notify_timeout = RBD_NOTIFY_TIMEOUT_DEFAULT; ceph_opts = ceph_parse_options(options, mon_addr, mon_addr + mon_addr_len, parse_rbd_opts_token, rbd_opts); - if (IS_ERR(ceph_opts)) { - kfree(rbd_opts); - return ERR_CAST(ceph_opts); - } + if (IS_ERR(ceph_opts)) + return PTR_ERR(ceph_opts); rbdc = rbd_client_find(ceph_opts); if (rbdc) { /* using an existing client */ ceph_destroy_options(ceph_opts); - kfree(rbd_opts); - - return rbdc; + } else { + rbdc = rbd_client_create(ceph_opts); + if (IS_ERR(rbdc)) + return PTR_ERR(rbdc); } + rbd_dev->rbd_client = rbdc; - rbdc = rbd_client_create(ceph_opts, rbd_opts); - if (IS_ERR(rbdc)) - kfree(rbd_opts); - - return rbdc; + return 0; } /* @@ -455,7 +443,6 @@ static void rbd_client_release(struct kref *kref) spin_unlock(&rbd_client_list_lock); ceph_destroy_client(rbdc->client); - kfree(rbdc->rbd_opts); kfree(rbdc); } @@ -2533,13 +2520,9 @@ static ssize_t rbd_add(struct bus_type *bus, if (rc) goto err_put_id; - rbd_dev->rbd_client = rbd_get_client(mon_addrs, mon_addrs_size - 1, - options); - if (IS_ERR(rbd_dev->rbd_client)) { - rc = PTR_ERR(rbd_dev->rbd_client); - rbd_dev->rbd_client = NULL; + rc = rbd_get_client(rbd_dev, mon_addrs, mon_addrs_size - 1, options); + if (rc < 0) goto err_put_id; - } /* pick the pool */ osdc = &rbd_dev->rbd_client->client->osdc; -- cgit v1.2.3-70-g09d2 From cc0538b62c839c2df7b9f8378bb37e3b35faa608 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Fri, 10 Aug 2012 13:12:07 -0700 Subject: rbd: add read_only rbd map option Add the ability to map an rbd image read-only, by specifying either "read_only" or "ro" as an option on the rbd "command line." Also allow the inverse to be explicitly specified using "read_write" or "rw". Signed-off-by: Alex Elder Reviewed-by: Yehuda Sadeh --- drivers/block/rbd.c | 28 ++++++++++++++++++++++++---- 1 file changed, 24 insertions(+), 4 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 839ab730a1f3..2db51cef9560 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -69,7 +69,8 @@ #define DEV_NAME_LEN 32 #define MAX_INT_FORMAT_WIDTH ((5 * sizeof (int)) / 2 + 1) -#define RBD_NOTIFY_TIMEOUT_DEFAULT 10 +#define RBD_NOTIFY_TIMEOUT_DEFAULT 10 +#define RBD_READ_ONLY_DEFAULT false /* * block device image metadata (in-memory version) @@ -91,6 +92,7 @@ struct rbd_image_header { struct rbd_options { int notify_timeout; + bool read_only; }; /* @@ -176,7 +178,7 @@ struct rbd_device { u64 snap_id; /* current snapshot id */ /* whether the snap_id this device reads from still exists */ bool snap_exists; - int read_only; + bool read_only; struct list_head node; @@ -351,12 +353,21 @@ enum { /* int args above */ Opt_last_string, /* string args above */ + Opt_read_only, + Opt_read_write, + /* Boolean args above */ + Opt_last_bool, }; static match_table_t rbd_opts_tokens = { {Opt_notify_timeout, "notify_timeout=%d"}, /* int args above */ /* string args above */ + {Opt_read_only, "read_only"}, + {Opt_read_only, "ro"}, /* Alternate spelling */ + {Opt_read_write, "read_write"}, + {Opt_read_write, "rw"}, /* Alternate spelling */ + /* Boolean args above */ {-1, NULL} }; @@ -381,6 +392,8 @@ static int parse_rbd_opts_token(char *c, void *private) } else if (token > Opt_last_int && token < Opt_last_string) { dout("got string token %d val %s\n", token, argstr[0].from); + } else if (token > Opt_last_string && token < Opt_last_bool) { + dout("got Boolean token %d\n", token); } else { dout("got token %d\n", token); } @@ -389,6 +402,12 @@ static int parse_rbd_opts_token(char *c, void *private) case Opt_notify_timeout: rbd_opts->notify_timeout = intval; break; + case Opt_read_only: + rbd_opts->read_only = true; + break; + case Opt_read_write: + rbd_opts->read_only = false; + break; default: BUG_ON(token); } @@ -407,6 +426,7 @@ static int rbd_get_client(struct rbd_device *rbd_dev, const char *mon_addr, struct rbd_client *rbdc; rbd_opts->notify_timeout = RBD_NOTIFY_TIMEOUT_DEFAULT; + rbd_opts->read_only = RBD_READ_ONLY_DEFAULT; ceph_opts = ceph_parse_options(options, mon_addr, mon_addr + mon_addr_len, @@ -620,7 +640,7 @@ static int rbd_header_set_snap(struct rbd_device *rbd_dev, u64 *size) sizeof (RBD_SNAP_HEAD_NAME))) { rbd_dev->snap_id = CEPH_NOSNAP; rbd_dev->snap_exists = false; - rbd_dev->read_only = 0; + rbd_dev->read_only = rbd_dev->rbd_opts.read_only; if (size) *size = rbd_dev->header.image_size; } else { @@ -632,7 +652,7 @@ static int rbd_header_set_snap(struct rbd_device *rbd_dev, u64 *size) goto done; rbd_dev->snap_id = snap_id; rbd_dev->snap_exists = true; - rbd_dev->read_only = 1; + rbd_dev->read_only = true; /* No choice for snapshots */ } ret = 0; -- cgit v1.2.3-70-g09d2 From 84d34dcc116e117a41c6fc8be13430529fc2d9e7 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Fri, 10 Aug 2012 13:12:07 -0700 Subject: rbd: kill notify_timeout option The "notify_timeout" rbd device option is never used, so get rid of it. Signed-off-by: Alex Elder Reviewed-by: Yehuda Sadeh --- drivers/block/rbd.c | 8 -------- 1 file changed, 8 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 2db51cef9560..cf44da832ca1 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -69,7 +69,6 @@ #define DEV_NAME_LEN 32 #define MAX_INT_FORMAT_WIDTH ((5 * sizeof (int)) / 2 + 1) -#define RBD_NOTIFY_TIMEOUT_DEFAULT 10 #define RBD_READ_ONLY_DEFAULT false /* @@ -91,7 +90,6 @@ struct rbd_image_header { }; struct rbd_options { - int notify_timeout; bool read_only; }; @@ -348,7 +346,6 @@ static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts) * mount options */ enum { - Opt_notify_timeout, Opt_last_int, /* int args above */ Opt_last_string, @@ -360,7 +357,6 @@ enum { }; static match_table_t rbd_opts_tokens = { - {Opt_notify_timeout, "notify_timeout=%d"}, /* int args above */ /* string args above */ {Opt_read_only, "read_only"}, @@ -399,9 +395,6 @@ static int parse_rbd_opts_token(char *c, void *private) } switch (token) { - case Opt_notify_timeout: - rbd_opts->notify_timeout = intval; - break; case Opt_read_only: rbd_opts->read_only = true; break; @@ -425,7 +418,6 @@ static int rbd_get_client(struct rbd_device *rbd_dev, const char *mon_addr, struct ceph_options *ceph_opts; struct rbd_client *rbdc; - rbd_opts->notify_timeout = RBD_NOTIFY_TIMEOUT_DEFAULT; rbd_opts->read_only = RBD_READ_ONLY_DEFAULT; ceph_opts = ceph_parse_options(options, mon_addr, -- cgit v1.2.3-70-g09d2 From 542582fce1700c01b12e7945aaf173074e008e3e Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Thu, 9 Aug 2012 10:33:25 -0700 Subject: rbd: bio_chain_clone() cleanups In bio_chain_clone(), at the end of the function the bi_next field of the tail of the new bio chain is nulled. This isn't necessary, because if "tail" is non-null, its value will be the last bio structure allocated at the top of the while loop in that function. And before that structure is added to the end of the new chain, its bi_next pointer is always made null. While touching that function, clean a few other things: - define each local variable on its own line - move the definition of "tmp" to an inner scope - move the modification of gfpmask closer to where it's used - rearrange the logic that sets the chain's tail pointer Signed-off-by: Alex Elder Reviewed-by: Yehuda Sadeh --- drivers/block/rbd.c | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index cf44da832ca1..43f6ef8d696f 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -754,7 +754,9 @@ static struct bio *bio_chain_clone(struct bio **old, struct bio **next, struct bio_pair **bp, int len, gfp_t gfpmask) { - struct bio *tmp, *old_chain = *old, *new_chain = NULL, *tail = NULL; + struct bio *old_chain = *old; + struct bio *new_chain = NULL; + struct bio *tail; int total = 0; if (*bp) { @@ -763,9 +765,12 @@ static struct bio *bio_chain_clone(struct bio **old, struct bio **next, } while (old_chain && (total < len)) { + struct bio *tmp; + tmp = bio_kmalloc(gfpmask, old_chain->bi_max_vecs); if (!tmp) goto err_out; + gfpmask &= ~__GFP_WAIT; /* can't wait after the first */ if (total + old_chain->bi_size > len) { struct bio_pair *bp; @@ -793,15 +798,12 @@ static struct bio *bio_chain_clone(struct bio **old, struct bio **next, } tmp->bi_bdev = NULL; - gfpmask &= ~__GFP_WAIT; tmp->bi_next = NULL; - - if (!new_chain) { - new_chain = tail = tmp; - } else { + if (new_chain) tail->bi_next = tmp; - tail = tmp; - } + else + new_chain = tmp; + tail = tmp; old_chain = old_chain->bi_next; total += tmp->bi_size; @@ -809,9 +811,6 @@ static struct bio *bio_chain_clone(struct bio **old, struct bio **next, BUG_ON(total < len); - if (tail) - tail->bi_next = NULL; - *old = old_chain; return new_chain; -- cgit v1.2.3-70-g09d2 From 38f5f65e9d25b0a7270c337a35c724ca3d56f4d8 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Thu, 9 Aug 2012 10:33:25 -0700 Subject: rbd: drop needless test in rbd_rq_fn() There's a test for null rq pointer inside the while loop in rbd_rq_fn() that's not needed. That same test already occurred in the immediatly preceding loop condition test. Signed-off-by: Alex Elder Reviewed-by: Yehuda Sadeh --- drivers/block/rbd.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 43f6ef8d696f..3475bb2e03ab 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1475,10 +1475,6 @@ static void rbd_rq_fn(struct request_queue *q) struct rbd_req_coll *coll; struct ceph_snap_context *snapc; - /* peek at request from block layer */ - if (!rq) - break; - dout("fetched request\n"); /* filter out block requests we don't understand */ -- cgit v1.2.3-70-g09d2 From df111be6310fc41d059a485368e3c51a684859c2 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Thu, 9 Aug 2012 10:33:26 -0700 Subject: rbd: check for overflow in rbd_get_num_segments() It is possible in rbd_get_num_segments() for an overflow to occur when adding the offset and length. This is easily avoided. Since the function returns an int and the one caller is already prepared to handle errors, have it return -ERANGE if overflow would occur. The overflow check would not work if a zero-length request was being tested, so short-circuit that case, returning 0 for the number of segments required. (This condition might be avoided elsewhere already, I don't know.) Have the caller end the request if either an error or 0 is returned. The returned value is passed to __blk_end_request_all(), meaning a 0 length request is not treated an error. Signed-off-by: Alex Elder Reviewed-by: Yehuda Sadeh --- drivers/block/rbd.c | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 3475bb2e03ab..6da6990a7b57 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -50,6 +50,10 @@ #define SECTOR_SHIFT 9 #define SECTOR_SIZE (1ULL << SECTOR_SHIFT) +/* It might be useful to have this defined elsewhere too */ + +#define U64_MAX ((u64) (~0ULL)) + #define RBD_DRV_NAME "rbd" #define RBD_DRV_NAME_LONG "rbd (rados block device)" @@ -691,8 +695,17 @@ static u64 rbd_get_segment(struct rbd_image_header *header, static int rbd_get_num_segments(struct rbd_image_header *header, u64 ofs, u64 len) { - u64 start_seg = ofs >> header->obj_order; - u64 end_seg = (ofs + len - 1) >> header->obj_order; + u64 start_seg; + u64 end_seg; + + if (!len) + return 0; + if (len - 1 > U64_MAX - ofs) + return -ERANGE; + + start_seg = ofs >> header->obj_order; + end_seg = (ofs + len - 1) >> header->obj_order; + return end_seg - start_seg + 1; } @@ -1515,6 +1528,12 @@ static void rbd_rq_fn(struct request_queue *q) size, (unsigned long long) blk_rq_pos(rq) * SECTOR_SIZE); num_segs = rbd_get_num_segments(&rbd_dev->header, ofs, size); + if (num_segs <= 0) { + spin_lock_irq(q->queue_lock); + __blk_end_request_all(rq, num_segs); + ceph_put_snap_context(snapc); + continue; + } coll = rbd_alloc_coll(num_segs); if (!coll) { spin_lock_irq(q->queue_lock); -- cgit v1.2.3-70-g09d2 From 65ccfe21dd8fb402547bb1c50bbc2737c4ef37b8 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Thu, 9 Aug 2012 10:33:26 -0700 Subject: rbd: split up rbd_get_segment() There are two places where rbd_get_segment() is called. One, in rbd_rq_fn(), only needs to know the length within a segment that an I/O request should be. The other, in rbd_do_op(), also needs the name of the object and the offset within it for the I/O request. Split out rbd_segment_name() into three dedicated functions: - rbd_segment_name() allocates and formats the name of the object for a segment containing a given rbd image offset - rbd_segment_offset() computes the offset within a segment for a given rbd image offset - rbd_segment_length() computes the length to use for I/O within a segment for a request, not to exceed the end of a segment object. In the new functions be a bit more careful, checking for possible error conditions: - watch for errors or overflows returned by snprintf() - catch (using BUG_ON()) potential overflow conditions when computing segment length Signed-off-by: Alex Elder Reviewed-by: Yehuda Sadeh --- drivers/block/rbd.c | 66 ++++++++++++++++++++++++++++++++--------------------- 1 file changed, 40 insertions(+), 26 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 6da6990a7b57..7ba70c49bbdb 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -669,27 +669,47 @@ static void rbd_header_free(struct rbd_image_header *header) header->snapc = NULL; } -/* - * get the actual striped segment name, offset and length - */ -static u64 rbd_get_segment(struct rbd_image_header *header, - const char *object_prefix, - u64 ofs, u64 len, - char *seg_name, u64 *segofs) +static char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset) +{ + char *name; + u64 segment; + int ret; + + name = kmalloc(RBD_MAX_SEG_NAME_LEN + 1, GFP_NOIO); + if (!name) + return NULL; + segment = offset >> rbd_dev->header.obj_order; + ret = snprintf(name, RBD_MAX_SEG_NAME_LEN, "%s.%012llx", + rbd_dev->header.object_prefix, segment); + if (ret < 0 || ret >= RBD_MAX_SEG_NAME_LEN) { + pr_err("error formatting segment name for #%llu (%d)\n", + segment, ret); + kfree(name); + name = NULL; + } + + return name; +} + +static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset) { - u64 seg = ofs >> header->obj_order; + u64 segment_size = (u64) 1 << rbd_dev->header.obj_order; - if (seg_name) - snprintf(seg_name, RBD_MAX_SEG_NAME_LEN, - "%s.%012llx", object_prefix, seg); + return offset & (segment_size - 1); +} + +static u64 rbd_segment_length(struct rbd_device *rbd_dev, + u64 offset, u64 length) +{ + u64 segment_size = (u64) 1 << rbd_dev->header.obj_order; - ofs = ofs & ((1 << header->obj_order) - 1); - len = min_t(u64, len, (1 << header->obj_order) - ofs); + offset &= segment_size - 1; - if (segofs) - *segofs = ofs; + BUG_ON(length > U64_MAX - offset); + if (offset + length > segment_size) + length = segment_size - offset; - return len; + return length; } static int rbd_get_num_segments(struct rbd_image_header *header, @@ -1127,14 +1147,11 @@ static int rbd_do_op(struct request *rq, struct ceph_osd_req_op *ops; u32 payload_len; - seg_name = kmalloc(RBD_MAX_SEG_NAME_LEN + 1, GFP_NOIO); + seg_name = rbd_segment_name(rbd_dev, ofs); if (!seg_name) return -ENOMEM; - - seg_len = rbd_get_segment(&rbd_dev->header, - rbd_dev->header.object_prefix, - ofs, len, - seg_name, &seg_ofs); + seg_len = rbd_segment_length(rbd_dev, ofs, len); + seg_ofs = rbd_segment_offset(rbd_dev, ofs); payload_len = (flags & CEPH_OSD_FLAG_WRITE ? seg_len : 0); @@ -1545,10 +1562,7 @@ static void rbd_rq_fn(struct request_queue *q) do { /* a bio clone to be passed down to OSD req */ dout("rq->bio->bi_vcnt=%hu\n", rq->bio->bi_vcnt); - op_size = rbd_get_segment(&rbd_dev->header, - rbd_dev->header.object_prefix, - ofs, size, - NULL, NULL); + op_size = rbd_segment_length(rbd_dev, ofs, size); kref_get(&coll->kref); bio = bio_chain_clone(&rq_bio, &next_bio, &bp, op_size, GFP_ATOMIC); -- cgit v1.2.3-70-g09d2 From aafb230ebc3bcdbbd1781f56e482ec75f7f1f263 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Thu, 6 Sep 2012 16:00:54 -0500 Subject: rbd: define rbd_assert() Define rbd_assert() and use it in place of various BUG_ON() calls now present in the code. By default assertion checking is enabled; we want to do this differently at some point. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 35 ++++++++++++++++++++++++++--------- 1 file changed, 26 insertions(+), 9 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 7ba70c49bbdb..d84b5341bea2 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -41,6 +41,8 @@ #include "rbd_types.h" +#define RBD_DEBUG /* Activate rbd_assert() calls */ + /* * The basic unit of block I/O is a sector. It is interpreted in a * number of contexts in Linux (blk, bio, genhd), but the default is @@ -232,6 +234,18 @@ static struct device rbd_root_dev = { .release = rbd_root_dev_release, }; +#ifdef RBD_DEBUG +#define rbd_assert(expr) \ + if (unlikely(!(expr))) { \ + printk(KERN_ERR "\nAssertion failure in %s() " \ + "at line %d:\n\n" \ + "\trbd_assert(%s);\n\n", \ + __func__, __LINE__, #expr); \ + BUG(); \ + } +#else /* !RBD_DEBUG */ +# define rbd_assert(expr) ((void) 0) +#endif /* !RBD_DEBUG */ static struct device *rbd_get_dev(struct rbd_device *rbd_dev) { @@ -406,7 +420,8 @@ static int parse_rbd_opts_token(char *c, void *private) rbd_opts->read_only = false; break; default: - BUG_ON(token); + rbd_assert(false); + break; } return 0; } @@ -705,7 +720,7 @@ static u64 rbd_segment_length(struct rbd_device *rbd_dev, offset &= segment_size - 1; - BUG_ON(length > U64_MAX - offset); + rbd_assert(length <= U64_MAX - offset); if (offset + length > segment_size) length = segment_size - offset; @@ -842,7 +857,7 @@ static struct bio *bio_chain_clone(struct bio **old, struct bio **next, total += tmp->bi_size; } - BUG_ON(total < len); + rbd_assert(total == len); *old = old_chain; @@ -1101,7 +1116,7 @@ static int rbd_req_sync_op(struct rbd_device *rbd_dev, struct page **pages; int num_pages; - BUG_ON(ops == NULL); + rbd_assert(ops != NULL); num_pages = calc_pages_for(ofs , len); pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL); @@ -1163,7 +1178,7 @@ static int rbd_do_op(struct request *rq, /* we've taken care of segment sizes earlier when we cloned the bios. We should never have a segment truncated at this point */ - BUG_ON(seg_len < len); + rbd_assert(seg_len == len); ret = rbd_do_request(rq, rbd_dev, snapc, snapid, seg_name, seg_ofs, seg_len, @@ -2186,7 +2201,7 @@ static int __rbd_init_snaps_header(struct rbd_device *rbd_dev) : CEPH_NOSNAP; snap = links != head ? list_entry(links, struct rbd_snap, node) : NULL; - BUG_ON(snap && snap->id == CEPH_NOSNAP); + rbd_assert(!snap || snap->id != CEPH_NOSNAP); if (snap_id == CEPH_NOSNAP || (snap && snap->id > snap_id)) { struct list_head *next = links->next; @@ -2222,8 +2237,9 @@ static int __rbd_init_snaps_header(struct rbd_device *rbd_dev) } else { /* Already have this one */ - BUG_ON(snap->size != rbd_dev->header.snap_sizes[index]); - BUG_ON(strcmp(snap->name, snap_name)); + rbd_assert(snap->size == + rbd_dev->header.snap_sizes[index]); + rbd_assert(!strcmp(snap->name, snap_name)); /* Done with this list entry; advance */ @@ -2313,7 +2329,7 @@ static void rbd_id_put(struct rbd_device *rbd_dev) int rbd_id = rbd_dev->dev_id; int max_id; - BUG_ON(rbd_id < 1); + rbd_assert(rbd_id > 0); spin_lock(&rbd_dev_list_lock); list_del_init(&rbd_dev->node); @@ -2705,6 +2721,7 @@ static ssize_t rbd_remove(struct bus_type *bus, done: mutex_unlock(&ctl_mutex); + return ret; } -- cgit v1.2.3-70-g09d2 From e28393082dd3991156d12a9e64b9584cef28fe25 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Wed, 29 Aug 2012 17:11:06 -0500 Subject: rbd: rename rbd_id_get() This should have been done as part of this commit: commit de71a2970d57463d3d965025e33ec3adcf391248 Author: Alex Elder Date: Tue Jul 3 16:01:19 2012 -0500 rbd: rename rbd_device->id rbd_id_get() is assigning the rbd_dev->dev_id field. Change the name of that function as well as rbd_id_put() and rbd_id_max to reflect what they are affecting. Add some dynamic debug statements related to rbd device id activity. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index d84b5341bea2..8cb8e0abfb33 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -2304,26 +2304,28 @@ static int rbd_init_watch_dev(struct rbd_device *rbd_dev) return ret; } -static atomic64_t rbd_id_max = ATOMIC64_INIT(0); +static atomic64_t rbd_dev_id_max = ATOMIC64_INIT(0); /* * Get a unique rbd identifier for the given new rbd_dev, and add * the rbd_dev to the global list. The minimum rbd id is 1. */ -static void rbd_id_get(struct rbd_device *rbd_dev) +static void rbd_dev_id_get(struct rbd_device *rbd_dev) { - rbd_dev->dev_id = atomic64_inc_return(&rbd_id_max); + rbd_dev->dev_id = atomic64_inc_return(&rbd_dev_id_max); spin_lock(&rbd_dev_list_lock); list_add_tail(&rbd_dev->node, &rbd_dev_list); spin_unlock(&rbd_dev_list_lock); + dout("rbd_dev %p given dev id %llu\n", rbd_dev, + (unsigned long long) rbd_dev->dev_id); } /* * Remove an rbd_dev from the global list, and record that its * identifier is no longer in use. */ -static void rbd_id_put(struct rbd_device *rbd_dev) +static void rbd_dev_id_put(struct rbd_device *rbd_dev) { struct list_head *tmp; int rbd_id = rbd_dev->dev_id; @@ -2331,6 +2333,8 @@ static void rbd_id_put(struct rbd_device *rbd_dev) rbd_assert(rbd_id > 0); + dout("rbd_dev %p released dev id %llu\n", rbd_dev, + (unsigned long long) rbd_dev->dev_id); spin_lock(&rbd_dev_list_lock); list_del_init(&rbd_dev->node); @@ -2338,7 +2342,7 @@ static void rbd_id_put(struct rbd_device *rbd_dev) * If the id being "put" is not the current maximum, there * is nothing special we need to do. */ - if (rbd_id != atomic64_read(&rbd_id_max)) { + if (rbd_id != atomic64_read(&rbd_dev_id_max)) { spin_unlock(&rbd_dev_list_lock); return; } @@ -2359,12 +2363,13 @@ static void rbd_id_put(struct rbd_device *rbd_dev) spin_unlock(&rbd_dev_list_lock); /* - * The max id could have been updated by rbd_id_get(), in + * The max id could have been updated by rbd_dev_id_get(), in * which case it now accurately reflects the new maximum. * Be careful not to overwrite the maximum value in that * case. */ - atomic64_cmpxchg(&rbd_id_max, rbd_id, max_id); + atomic64_cmpxchg(&rbd_dev_id_max, rbd_id, max_id); + dout(" max dev id has been reset\n"); } /* @@ -2563,7 +2568,7 @@ static ssize_t rbd_add(struct bus_type *bus, init_rwsem(&rbd_dev->header_rwsem); /* generate unique id: find highest unique id, add one */ - rbd_id_get(rbd_dev); + rbd_dev_id_get(rbd_dev); /* Fill in the device name, now that we have its id. */ BUILD_BUG_ON(DEV_NAME_LEN @@ -2631,7 +2636,7 @@ err_put_id: kfree(rbd_dev->image_name); kfree(rbd_dev->pool_name); } - rbd_id_put(rbd_dev); + rbd_dev_id_put(rbd_dev); err_nomem: kfree(rbd_dev); kfree(options); @@ -2683,7 +2688,7 @@ static void rbd_dev_release(struct device *dev) kfree(rbd_dev->header_name); kfree(rbd_dev->pool_name); kfree(rbd_dev->image_name); - rbd_id_put(rbd_dev); + rbd_dev_id_put(rbd_dev); kfree(rbd_dev); /* release module ref */ -- cgit v1.2.3-70-g09d2 From 9fcbb80024795dc1a282f11af593ae5aa3d1b67e Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Thu, 23 Aug 2012 23:48:49 -0500 Subject: rbd: rename __rbd_init_snaps_header() The name __rbd_init_snaps_header() doesn't really convey what that function does very well. Its purpose is to scan a new snapshot context and either create or destroy snapshot device entries so that local host's view is consistent with the reality maintained on the OSDs. This patch just changes the name of this function, to be rbd_dev_snap_devs_update(). Still not perfect, but I think better. Also add some dynamic debug statements to this function. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 27 +++++++++++++++++++++------ 1 file changed, 21 insertions(+), 6 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 8cb8e0abfb33..77263681dde2 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -201,7 +201,7 @@ static DEFINE_SPINLOCK(rbd_dev_list_lock); static LIST_HEAD(rbd_client_list); /* clients */ static DEFINE_SPINLOCK(rbd_client_list_lock); -static int __rbd_init_snaps_header(struct rbd_device *rbd_dev); +static int rbd_dev_snap_devs_update(struct rbd_device *rbd_dev); static void rbd_dev_release(struct device *dev); static ssize_t rbd_snap_add(struct device *dev, struct device_attribute *attr, @@ -1848,7 +1848,7 @@ static int __rbd_refresh_header(struct rbd_device *rbd_dev, u64 *hver) WARN_ON(strcmp(rbd_dev->header.object_prefix, h.object_prefix)); kfree(h.object_prefix); - ret = __rbd_init_snaps_header(rbd_dev); + ret = rbd_dev_snap_devs_update(rbd_dev); up_write(&rbd_dev->header_rwsem); @@ -1880,7 +1880,7 @@ static int rbd_init_disk(struct rbd_device *rbd_dev) return rc; /* no need to lock here, as rbd_dev is not registered yet */ - rc = __rbd_init_snaps_header(rbd_dev); + rc = rbd_dev_snap_devs_update(rbd_dev); if (rc) return rc; @@ -2184,7 +2184,7 @@ err: * snapshot id, highest id first. (Snapshots in the rbd_dev's list * are also maintained in that order.) */ -static int __rbd_init_snaps_header(struct rbd_device *rbd_dev) +static int rbd_dev_snap_devs_update(struct rbd_device *rbd_dev) { struct ceph_snap_context *snapc = rbd_dev->header.snapc; const u32 snap_count = snapc->num_snaps; @@ -2193,6 +2193,7 @@ static int __rbd_init_snaps_header(struct rbd_device *rbd_dev) struct list_head *links = head->next; u32 index = 0; + dout("%s: snap count is %u\n", __func__, (unsigned int) snap_count); while (index < snap_count || links != head) { u64 snap_id; struct rbd_snap *snap; @@ -2211,6 +2212,9 @@ static int __rbd_init_snaps_header(struct rbd_device *rbd_dev) if (rbd_dev->snap_id == snap->id) rbd_dev->snap_exists = false; __rbd_remove_snap_dev(snap); + dout("%ssnap id %llu has been removed\n", + rbd_dev->snap_id == snap->id ? "mapped " : "", + (unsigned long long) snap->id); /* Done with this list entry; advance */ @@ -2218,6 +2222,8 @@ static int __rbd_init_snaps_header(struct rbd_device *rbd_dev) continue; } + dout("entry %u: snap_id = %llu\n", (unsigned int) snap_count, + (unsigned long long) snap_id); if (!snap || (snap_id != CEPH_NOSNAP && snap->id < snap_id)) { struct rbd_snap *new_snap; @@ -2225,11 +2231,17 @@ static int __rbd_init_snaps_header(struct rbd_device *rbd_dev) new_snap = __rbd_add_snap_dev(rbd_dev, index, snap_name); - if (IS_ERR(new_snap)) - return PTR_ERR(new_snap); + if (IS_ERR(new_snap)) { + int err = PTR_ERR(new_snap); + + dout(" failed to add dev, error %d\n", err); + + return err; + } /* New goes before existing, or at end of list */ + dout(" added dev%s\n", snap ? "" : " at end\n"); if (snap) list_add_tail(&new_snap->node, &snap->node); else @@ -2237,6 +2249,8 @@ static int __rbd_init_snaps_header(struct rbd_device *rbd_dev) } else { /* Already have this one */ + dout(" already present\n"); + rbd_assert(snap->size == rbd_dev->header.snap_sizes[index]); rbd_assert(!strcmp(snap->name, snap_name)); @@ -2251,6 +2265,7 @@ static int __rbd_init_snaps_header(struct rbd_device *rbd_dev) index++; snap_name += strlen(snap_name) + 1; } + dout("%s: done\n", __func__); return 0; } -- cgit v1.2.3-70-g09d2 From 98cec111c08d8d52168e28648b4a1c2f5011cd70 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Wed, 29 Aug 2012 17:11:06 -0500 Subject: rbd: kill rbd_dev->q A copy of rbd_dev->disk->queue is held in rbd_dev->q, but it's never actually used. So get just get rid of the field. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 77263681dde2..774a36bb00ed 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -155,7 +155,6 @@ struct rbd_device { int major; /* blkdev assigned major */ struct gendisk *disk; /* blkdev's gendisk and rq */ - struct request_queue *q; struct rbd_options rbd_opts; struct rbd_client *rbd_client; @@ -1923,7 +1922,6 @@ static int rbd_init_disk(struct rbd_device *rbd_dev) q->queuedata = rbd_dev; rbd_dev->disk = disk; - rbd_dev->q = q; /* finally, announce the disk to the world */ set_capacity(disk, total_size / SECTOR_SIZE); -- cgit v1.2.3-70-g09d2 From c9aadfe7860f83ee17e55fe17398f3fe948a0a84 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Thu, 30 Aug 2012 14:42:15 -0500 Subject: rbd: kill rbd_image_header->total_snaps The "total_snaps" field in an rbd header structure is never any different from the value of "num_snaps" stored within a snapshot context. Avoid any confusion by just using the value held within the snapshot context, and get rid of the "total_snaps" field. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 774a36bb00ed..eb6b7723906b 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -87,7 +87,6 @@ struct rbd_image_header { __u8 crypt_type; __u8 comp_type; struct ceph_snap_context *snapc; - u32 total_snaps; char *snap_names; u64 *snap_sizes; @@ -588,7 +587,6 @@ static int rbd_header_from_disk(struct rbd_image_header *header, header->obj_order = ondisk->options.order; header->crypt_type = ondisk->options.crypt_type; header->comp_type = ondisk->options.comp_type; - header->total_snaps = snap_count; /* Allocate and fill in the snapshot context */ @@ -624,7 +622,8 @@ static int snap_by_name(struct rbd_image_header *header, const char *snap_name, int i; char *p = header->snap_names; - for (i = 0; i < header->total_snaps; i++) { + rbd_assert(header->snapc != NULL); + for (i = 0; i < header->snapc->num_snaps; i++) { if (!strcmp(snap_name, p)) { /* Found it. Pass back its id and/or size */ @@ -1839,7 +1838,6 @@ static int __rbd_refresh_header(struct rbd_device *rbd_dev, u64 *hver) *hver = h.obj_version; rbd_dev->header.obj_version = h.obj_version; rbd_dev->header.image_size = h.image_size; - rbd_dev->header.total_snaps = h.total_snaps; rbd_dev->header.snapc = h.snapc; rbd_dev->header.snap_names = h.snap_names; rbd_dev->header.snap_sizes = h.snap_sizes; -- cgit v1.2.3-70-g09d2 From f84344f334df8f1d41eba7cfa7eb1024da25e1fe Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Fri, 31 Aug 2012 17:29:51 -0500 Subject: rbd: separate mapping info in rbd_dev Several fields in a struct rbd_dev are related to what is mapped, as opposed to the actual base rbd image. If the base image is mapped these are almost unneeded, but if a snapshot is mapped they describe information about that snapshot. In some contexts this can be a little bit confusing. So group these mapping-related field into a structure to make it clear what they are describing. This also includes a minor change that rearranges the fields in the in-core image header structure so that invariant fields are at the top, followed by those that change. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 83 +++++++++++++++++++++++++++++------------------------ 1 file changed, 45 insertions(+), 38 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index eb6b7723906b..dff621060432 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -81,13 +81,15 @@ * block device image metadata (in-memory version) */ struct rbd_image_header { - u64 image_size; + /* These four fields never change for a given rbd image */ char *object_prefix; __u8 obj_order; __u8 crypt_type; __u8 comp_type; - struct ceph_snap_context *snapc; + /* The remaining fields need to be updated occasionally */ + u64 image_size; + struct ceph_snap_context *snapc; char *snap_names; u64 *snap_sizes; @@ -146,6 +148,13 @@ struct rbd_snap { u64 id; }; +struct rbd_mapping { + char *snap_name; + u64 snap_id; + bool snap_exists; + bool read_only; +}; + /* * a single device */ @@ -174,13 +183,8 @@ struct rbd_device { /* protects updating the header */ struct rw_semaphore header_rwsem; - /* name of the snapshot this device reads from */ - char *snap_name; - /* id of the snapshot this device reads from */ - u64 snap_id; /* current snapshot id */ - /* whether the snap_id this device reads from still exists */ - bool snap_exists; - bool read_only; + + struct rbd_mapping mapping; struct list_head node; @@ -261,11 +265,11 @@ static int rbd_open(struct block_device *bdev, fmode_t mode) { struct rbd_device *rbd_dev = bdev->bd_disk->private_data; - if ((mode & FMODE_WRITE) && rbd_dev->read_only) + if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only) return -EROFS; rbd_get_dev(rbd_dev); - set_device_ro(bdev, rbd_dev->read_only); + set_device_ro(bdev, rbd_dev->mapping.read_only); return 0; } @@ -375,7 +379,7 @@ enum { static match_table_t rbd_opts_tokens = { /* int args above */ /* string args above */ - {Opt_read_only, "read_only"}, + {Opt_read_only, "mapping.read_only"}, {Opt_read_only, "ro"}, /* Alternate spelling */ {Opt_read_write, "read_write"}, {Opt_read_write, "rw"}, /* Alternate spelling */ @@ -583,13 +587,13 @@ static int rbd_header_from_disk(struct rbd_image_header *header, header->snap_sizes = NULL; } - header->image_size = le64_to_cpu(ondisk->image_size); header->obj_order = ondisk->options.order; header->crypt_type = ondisk->options.crypt_type; header->comp_type = ondisk->options.comp_type; /* Allocate and fill in the snapshot context */ + header->image_size = le64_to_cpu(ondisk->image_size); size = sizeof (struct ceph_snap_context); size += snap_count * sizeof (header->snapc->snaps[0]); header->snapc = kzalloc(size, GFP_KERNEL); @@ -645,23 +649,24 @@ static int rbd_header_set_snap(struct rbd_device *rbd_dev, u64 *size) down_write(&rbd_dev->header_rwsem); - if (!memcmp(rbd_dev->snap_name, RBD_SNAP_HEAD_NAME, + if (!memcmp(rbd_dev->mapping.snap_name, RBD_SNAP_HEAD_NAME, sizeof (RBD_SNAP_HEAD_NAME))) { - rbd_dev->snap_id = CEPH_NOSNAP; - rbd_dev->snap_exists = false; - rbd_dev->read_only = rbd_dev->rbd_opts.read_only; + rbd_dev->mapping.snap_id = CEPH_NOSNAP; + rbd_dev->mapping.snap_exists = false; + rbd_dev->mapping.read_only = rbd_dev->rbd_opts.read_only; if (size) *size = rbd_dev->header.image_size; } else { u64 snap_id = 0; - ret = snap_by_name(&rbd_dev->header, rbd_dev->snap_name, + ret = snap_by_name(&rbd_dev->header, + rbd_dev->mapping.snap_name, &snap_id, size); if (ret < 0) goto done; - rbd_dev->snap_id = snap_id; - rbd_dev->snap_exists = true; - rbd_dev->read_only = true; /* No choice for snapshots */ + rbd_dev->mapping.snap_id = snap_id; + rbd_dev->mapping.snap_exists = true; + rbd_dev->mapping.read_only = true; } ret = 0; @@ -1532,7 +1537,7 @@ static void rbd_rq_fn(struct request_queue *q) size = blk_rq_bytes(rq); ofs = blk_rq_pos(rq) * SECTOR_SIZE; rq_bio = rq->bio; - if (do_write && rbd_dev->read_only) { + if (do_write && rbd_dev->mapping.read_only) { __blk_end_request_all(rq, -EROFS); continue; } @@ -1541,7 +1546,8 @@ static void rbd_rq_fn(struct request_queue *q) down_read(&rbd_dev->header_rwsem); - if (rbd_dev->snap_id != CEPH_NOSNAP && !rbd_dev->snap_exists) { + if (rbd_dev->mapping.snap_id != CEPH_NOSNAP && + !rbd_dev->mapping.snap_exists) { up_read(&rbd_dev->header_rwsem); dout("request for non-existent snapshot"); spin_lock_irq(q->queue_lock); @@ -1595,7 +1601,7 @@ static void rbd_rq_fn(struct request_queue *q) coll, cur_seg); else rbd_req_read(rq, rbd_dev, - rbd_dev->snap_id, + rbd_dev->mapping.snap_id, ofs, op_size, bio, coll, cur_seg); @@ -1767,7 +1773,7 @@ static int rbd_header_add_snap(struct rbd_device *rbd_dev, struct ceph_mon_client *monc; /* we should create a snapshot only if we're pointing at the head */ - if (rbd_dev->snap_id != CEPH_NOSNAP) + if (rbd_dev->mapping.snap_id != CEPH_NOSNAP) return -EINVAL; monc = &rbd_dev->rbd_client->client->monc; @@ -1821,7 +1827,7 @@ static int __rbd_refresh_header(struct rbd_device *rbd_dev, u64 *hver) down_write(&rbd_dev->header_rwsem); /* resized? */ - if (rbd_dev->snap_id == CEPH_NOSNAP) { + if (rbd_dev->mapping.snap_id == CEPH_NOSNAP) { sector_t size = (sector_t) h.image_size / SECTOR_SIZE; dout("setting size to %llu sectors", (unsigned long long) size); @@ -2004,7 +2010,7 @@ static ssize_t rbd_snap_show(struct device *dev, { struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); - return sprintf(buf, "%s\n", rbd_dev->snap_name); + return sprintf(buf, "%s\n", rbd_dev->mapping.snap_name); } static ssize_t rbd_image_refresh(struct device *dev, @@ -2205,11 +2211,12 @@ static int rbd_dev_snap_devs_update(struct rbd_device *rbd_dev) /* Existing snapshot not in the new snap context */ - if (rbd_dev->snap_id == snap->id) - rbd_dev->snap_exists = false; + if (rbd_dev->mapping.snap_id == snap->id) + rbd_dev->mapping.snap_exists = false; __rbd_remove_snap_dev(snap); dout("%ssnap id %llu has been removed\n", - rbd_dev->snap_id == snap->id ? "mapped " : "", + rbd_dev->mapping.snap_id == snap->id ? + "mapped " : "", (unsigned long long) snap->id); /* Done with this list entry; advance */ @@ -2522,18 +2529,18 @@ static int rbd_add_parse_args(struct rbd_device *rbd_dev, * The snapshot name is optional. If none is is supplied, * we use the default value. */ - rbd_dev->snap_name = dup_token(&buf, &len); - if (!rbd_dev->snap_name) + rbd_dev->mapping.snap_name = dup_token(&buf, &len); + if (!rbd_dev->mapping.snap_name) goto out_err; if (!len) { /* Replace the empty name with the default */ - kfree(rbd_dev->snap_name); - rbd_dev->snap_name + kfree(rbd_dev->mapping.snap_name); + rbd_dev->mapping.snap_name = kmalloc(sizeof (RBD_SNAP_HEAD_NAME), GFP_KERNEL); - if (!rbd_dev->snap_name) + if (!rbd_dev->mapping.snap_name) goto out_err; - memcpy(rbd_dev->snap_name, RBD_SNAP_HEAD_NAME, + memcpy(rbd_dev->mapping.snap_name, RBD_SNAP_HEAD_NAME, sizeof (RBD_SNAP_HEAD_NAME)); } @@ -2642,7 +2649,7 @@ err_out_client: rbd_put_client(rbd_dev); err_put_id: if (rbd_dev->pool_name) { - kfree(rbd_dev->snap_name); + kfree(rbd_dev->mapping.snap_name); kfree(rbd_dev->header_name); kfree(rbd_dev->image_name); kfree(rbd_dev->pool_name); @@ -2695,7 +2702,7 @@ static void rbd_dev_release(struct device *dev) unregister_blkdev(rbd_dev->major, rbd_dev->name); /* done with the id, and with the rbd_dev */ - kfree(rbd_dev->snap_name); + kfree(rbd_dev->mapping.snap_name); kfree(rbd_dev->header_name); kfree(rbd_dev->pool_name); kfree(rbd_dev->image_name); -- cgit v1.2.3-70-g09d2 From 99c1f08f6459cfa6fe1f5fb68706b437e006be2e Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Thu, 30 Aug 2012 14:42:15 -0500 Subject: rbd: record mapped size Add the size of the mapped image to the set of mapping-specific fields in an rbd_device, and use it when setting the capacity of the disk. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index dff621060432..4377a8302fc3 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -151,6 +151,7 @@ struct rbd_snap { struct rbd_mapping { char *snap_name; u64 snap_id; + u64 size; bool snap_exists; bool read_only; }; @@ -643,7 +644,7 @@ static int snap_by_name(struct rbd_image_header *header, const char *snap_name, return -ENOENT; } -static int rbd_header_set_snap(struct rbd_device *rbd_dev, u64 *size) +static int rbd_header_set_snap(struct rbd_device *rbd_dev) { int ret; @@ -652,19 +653,16 @@ static int rbd_header_set_snap(struct rbd_device *rbd_dev, u64 *size) if (!memcmp(rbd_dev->mapping.snap_name, RBD_SNAP_HEAD_NAME, sizeof (RBD_SNAP_HEAD_NAME))) { rbd_dev->mapping.snap_id = CEPH_NOSNAP; + rbd_dev->mapping.size = rbd_dev->header.image_size; rbd_dev->mapping.snap_exists = false; rbd_dev->mapping.read_only = rbd_dev->rbd_opts.read_only; - if (size) - *size = rbd_dev->header.image_size; } else { - u64 snap_id = 0; - ret = snap_by_name(&rbd_dev->header, rbd_dev->mapping.snap_name, - &snap_id, size); + &rbd_dev->mapping.snap_id, + &rbd_dev->mapping.size); if (ret < 0) goto done; - rbd_dev->mapping.snap_id = snap_id; rbd_dev->mapping.snap_exists = true; rbd_dev->mapping.read_only = true; } @@ -1830,8 +1828,12 @@ static int __rbd_refresh_header(struct rbd_device *rbd_dev, u64 *hver) if (rbd_dev->mapping.snap_id == CEPH_NOSNAP) { sector_t size = (sector_t) h.image_size / SECTOR_SIZE; - dout("setting size to %llu sectors", (unsigned long long) size); - set_capacity(rbd_dev->disk, size); + if (size != (sector_t) rbd_dev->mapping.size) { + dout("setting size to %llu sectors", + (unsigned long long) size); + rbd_dev->mapping.size = (u64) size; + set_capacity(rbd_dev->disk, size); + } } /* rbd_dev->header.object_prefix shouldn't change */ @@ -1875,7 +1877,6 @@ static int rbd_init_disk(struct rbd_device *rbd_dev) struct request_queue *q; int rc; u64 segment_size; - u64 total_size = 0; /* contact OSD, request size info about the object being mapped */ rc = rbd_read_header(rbd_dev, &rbd_dev->header); @@ -1887,7 +1888,7 @@ static int rbd_init_disk(struct rbd_device *rbd_dev) if (rc) return rc; - rc = rbd_header_set_snap(rbd_dev, &total_size); + rc = rbd_header_set_snap(rbd_dev); if (rc) return rc; @@ -1928,11 +1929,11 @@ static int rbd_init_disk(struct rbd_device *rbd_dev) rbd_dev->disk = disk; /* finally, announce the disk to the world */ - set_capacity(disk, total_size / SECTOR_SIZE); + set_capacity(disk, (sector_t) rbd_dev->mapping.size / SECTOR_SIZE); add_disk(disk); pr_info("%s: added with size 0x%llx\n", - disk->disk_name, (unsigned long long)total_size); + disk->disk_name, (unsigned long long) rbd_dev->mapping.size); return 0; out_disk: -- cgit v1.2.3-70-g09d2 From 3feeb8946739d980fb0922bf68363552a493a49c Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Fri, 31 Aug 2012 17:29:52 -0500 Subject: rbd: return snap name from rbd_add_parse_args() This is the first of two patches aimed at isolating the code that sets the mapping information into a single spot. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 72 +++++++++++++++++++++++++++-------------------------- 1 file changed, 37 insertions(+), 35 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 4377a8302fc3..1a64ba294a76 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -2477,28 +2477,31 @@ static inline char *dup_token(const char **buf, size_t *lenp) } /* - * This fills in the pool_name, image_name, image_name_len, snap_name, - * rbd_dev, rbd_md_name, and name fields of the given rbd_dev, based - * on the list of monitor addresses and other options provided via - * /sys/bus/rbd/add. + * This fills in the pool_name, image_name, image_name_len, rbd_dev, + * rbd_md_name, and name fields of the given rbd_dev, based on the + * list of monitor addresses and other options provided via + * /sys/bus/rbd/add. Returns a pointer to a dynamically-allocated + * copy of the snapshot name to map if successful, or a + * pointer-coded error otherwise. * * Note: rbd_dev is assumed to have been initially zero-filled. */ -static int rbd_add_parse_args(struct rbd_device *rbd_dev, - const char *buf, - const char **mon_addrs, - size_t *mon_addrs_size, - char *options, - size_t options_size) +static char *rbd_add_parse_args(struct rbd_device *rbd_dev, + const char *buf, + const char **mon_addrs, + size_t *mon_addrs_size, + char *options, + size_t options_size) { size_t len; - int ret; + char *err_ptr = ERR_PTR(-EINVAL); + char *snap_name; /* The first four tokens are required */ len = next_token(&buf); if (!len) - return -EINVAL; + return err_ptr; *mon_addrs_size = len + 1; *mon_addrs = buf; @@ -2506,9 +2509,9 @@ static int rbd_add_parse_args(struct rbd_device *rbd_dev, len = copy_token(&buf, options, options_size); if (!len || len >= options_size) - return -EINVAL; + return err_ptr; - ret = -ENOMEM; + err_ptr = ERR_PTR(-ENOMEM); rbd_dev->pool_name = dup_token(&buf, NULL); if (!rbd_dev->pool_name) goto out_err; @@ -2526,26 +2529,21 @@ static int rbd_add_parse_args(struct rbd_device *rbd_dev, goto out_err; sprintf(rbd_dev->header_name, "%s%s", rbd_dev->image_name, RBD_SUFFIX); - /* - * The snapshot name is optional. If none is is supplied, - * we use the default value. - */ - rbd_dev->mapping.snap_name = dup_token(&buf, &len); - if (!rbd_dev->mapping.snap_name) - goto out_err; + /* Snapshot name is optional */ + len = next_token(&buf); if (!len) { - /* Replace the empty name with the default */ - kfree(rbd_dev->mapping.snap_name); - rbd_dev->mapping.snap_name - = kmalloc(sizeof (RBD_SNAP_HEAD_NAME), GFP_KERNEL); - if (!rbd_dev->mapping.snap_name) - goto out_err; - - memcpy(rbd_dev->mapping.snap_name, RBD_SNAP_HEAD_NAME, - sizeof (RBD_SNAP_HEAD_NAME)); + buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */ + len = sizeof (RBD_SNAP_HEAD_NAME) - 1; } + snap_name = kmalloc(len + 1, GFP_KERNEL); + if (!snap_name) + goto out_err; + memcpy(snap_name, buf, len); + *(snap_name + len) = '\0'; - return 0; +dout(" SNAP_NAME is <%s>, len is %zd\n", snap_name, len); + + return snap_name; out_err: kfree(rbd_dev->header_name); @@ -2556,7 +2554,7 @@ out_err: kfree(rbd_dev->pool_name); rbd_dev->pool_name = NULL; - return ret; + return err_ptr; } static ssize_t rbd_add(struct bus_type *bus, @@ -2569,6 +2567,7 @@ static ssize_t rbd_add(struct bus_type *bus, size_t mon_addrs_size = 0; struct ceph_osd_client *osdc; int rc = -ENOMEM; + char *snap_name; if (!try_module_get(THIS_MODULE)) return -ENODEV; @@ -2595,10 +2594,13 @@ static ssize_t rbd_add(struct bus_type *bus, sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id); /* parse add command */ - rc = rbd_add_parse_args(rbd_dev, buf, &mon_addrs, &mon_addrs_size, - options, count); - if (rc) + snap_name = rbd_add_parse_args(rbd_dev, buf, + &mon_addrs, &mon_addrs_size, options, count); + if (IS_ERR(snap_name)) { + rc = PTR_ERR(snap_name); goto err_put_id; + } + rbd_dev->mapping.snap_name = snap_name; rc = rbd_get_client(rbd_dev, mon_addrs, mon_addrs_size - 1, options); if (rc < 0) -- cgit v1.2.3-70-g09d2 From 4e1105a299adf7ac421d42a8be05205f51610f3c Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Fri, 31 Aug 2012 17:29:52 -0500 Subject: rbd: set mapping name with the rest With the exception of the snapshot name, all of the mapping-specific fields in an rbd device structure are set in rbd_header_set_snap(). Pass the snapshot name to be assigned into rbd_header_set_snap() to keep all of the mapping assignments together. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 1a64ba294a76..23fa962fea36 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -644,21 +644,20 @@ static int snap_by_name(struct rbd_image_header *header, const char *snap_name, return -ENOENT; } -static int rbd_header_set_snap(struct rbd_device *rbd_dev) +static int rbd_header_set_snap(struct rbd_device *rbd_dev, char *snap_name) { int ret; down_write(&rbd_dev->header_rwsem); - if (!memcmp(rbd_dev->mapping.snap_name, RBD_SNAP_HEAD_NAME, + if (!memcmp(snap_name, RBD_SNAP_HEAD_NAME, sizeof (RBD_SNAP_HEAD_NAME))) { rbd_dev->mapping.snap_id = CEPH_NOSNAP; rbd_dev->mapping.size = rbd_dev->header.image_size; rbd_dev->mapping.snap_exists = false; rbd_dev->mapping.read_only = rbd_dev->rbd_opts.read_only; } else { - ret = snap_by_name(&rbd_dev->header, - rbd_dev->mapping.snap_name, + ret = snap_by_name(&rbd_dev->header, snap_name, &rbd_dev->mapping.snap_id, &rbd_dev->mapping.size); if (ret < 0) @@ -666,6 +665,7 @@ static int rbd_header_set_snap(struct rbd_device *rbd_dev) rbd_dev->mapping.snap_exists = true; rbd_dev->mapping.read_only = true; } + rbd_dev->mapping.snap_name = snap_name; ret = 0; done: @@ -1888,7 +1888,7 @@ static int rbd_init_disk(struct rbd_device *rbd_dev) if (rc) return rc; - rc = rbd_header_set_snap(rbd_dev); + rc = rbd_header_set_snap(rbd_dev, snap_name); if (rc) return rc; @@ -2600,7 +2600,6 @@ static ssize_t rbd_add(struct bus_type *bus, rc = PTR_ERR(snap_name); goto err_put_id; } - rbd_dev->mapping.snap_name = snap_name; rc = rbd_get_client(rbd_dev, mon_addrs, mon_addrs_size - 1, options); if (rc < 0) -- cgit v1.2.3-70-g09d2 From 8836b995fd192dba23d312d2a4fba68dd8ca7183 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Thu, 30 Aug 2012 14:42:15 -0500 Subject: rbd: simplify snap_by_name() interface There is only one caller of snap_by_name(), and it passes two values to be assigned, both of which are found within an rbd device structure. Change the interface so it just passes the address of the rbd_dev, and make the assignments to its fields directly. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 23fa962fea36..46b8f8e536be 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -621,10 +621,10 @@ out_err: return -ENOMEM; } -static int snap_by_name(struct rbd_image_header *header, const char *snap_name, - u64 *seq, u64 *size) +static int snap_by_name(struct rbd_device *rbd_dev, const char *snap_name) { int i; + struct rbd_image_header *header = &rbd_dev->header; char *p = header->snap_names; rbd_assert(header->snapc != NULL); @@ -633,10 +633,9 @@ static int snap_by_name(struct rbd_image_header *header, const char *snap_name, /* Found it. Pass back its id and/or size */ - if (seq) - *seq = header->snapc->snaps[i]; - if (size) - *size = header->snap_sizes[i]; + rbd_dev->mapping.snap_id = header->snapc->snaps[i]; + rbd_dev->mapping.size = header->snap_sizes[i]; + return i; } p += strlen(p) + 1; /* Skip ahead to the next name */ @@ -657,9 +656,7 @@ static int rbd_header_set_snap(struct rbd_device *rbd_dev, char *snap_name) rbd_dev->mapping.snap_exists = false; rbd_dev->mapping.read_only = rbd_dev->rbd_opts.read_only; } else { - ret = snap_by_name(&rbd_dev->header, snap_name, - &rbd_dev->mapping.snap_id, - &rbd_dev->mapping.size); + ret = snap_by_name(rbd_dev, snap_name); if (ret < 0) goto done; rbd_dev->mapping.snap_exists = true; -- cgit v1.2.3-70-g09d2 From 2ac4e75d89e9df8eea6390a759eac2b6df0ebff6 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Tue, 10 Jul 2012 20:30:10 -0500 Subject: rbd: do some header initialization earlier Move some of the code that initializes an rbd header out of rbd_init_disk() and into its caller. Move the code at the end of rbd_init_disk() that sets the device capacity and activates the Linux device out of that function and into the caller, ensuring we still have the disk size available where we need it. Update rbd_free_disk() so it still aligns well as an inverse of rbd_init_disk(), moving the rbd_header_free() call out to its caller. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 51 +++++++++++++++++++++++++++------------------------ 1 file changed, 27 insertions(+), 24 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 46b8f8e536be..6e735a754b5f 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1652,8 +1652,6 @@ static void rbd_free_disk(struct rbd_device *rbd_dev) if (!disk) return; - rbd_header_free(&rbd_dev->header); - if (disk->flags & GENHD_FL_UP) del_gendisk(disk); if (disk->queue) @@ -1875,20 +1873,6 @@ static int rbd_init_disk(struct rbd_device *rbd_dev) int rc; u64 segment_size; - /* contact OSD, request size info about the object being mapped */ - rc = rbd_read_header(rbd_dev, &rbd_dev->header); - if (rc) - return rc; - - /* no need to lock here, as rbd_dev is not registered yet */ - rc = rbd_dev_snap_devs_update(rbd_dev); - if (rc) - return rc; - - rc = rbd_header_set_snap(rbd_dev, snap_name); - if (rc) - return rc; - /* create gendisk info */ rc = -ENOMEM; disk = alloc_disk(RBD_MINORS_PER_MAJOR); @@ -1925,12 +1909,6 @@ static int rbd_init_disk(struct rbd_device *rbd_dev) rbd_dev->disk = disk; - /* finally, announce the disk to the world */ - set_capacity(disk, (sector_t) rbd_dev->mapping.size / SECTOR_SIZE); - add_disk(disk); - - pr_info("%s: added with size 0x%llx\n", - disk->disk_name, (unsigned long long) rbd_dev->mapping.size); return 0; out_disk: @@ -2622,13 +2600,35 @@ static ssize_t rbd_add(struct bus_type *bus, /* * At this point cleanup in the event of an error is the job * of the sysfs code (initiated by rbd_bus_del_dev()). - * - * Set up and announce blkdev mapping. */ + + /* contact OSD, request size info about the object being mapped */ + rc = rbd_read_header(rbd_dev, &rbd_dev->header); + if (rc) + goto err_out_bus; + + /* no need to lock here, as rbd_dev is not registered yet */ + rc = rbd_dev_snap_devs_update(rbd_dev); + if (rc) + goto err_out_bus; + + rc = rbd_header_set_snap(rbd_dev, snap_name); + if (rc) + goto err_out_bus; + + /* Set up the blkdev mapping. */ + rc = rbd_init_disk(rbd_dev); if (rc) goto err_out_bus; + /* Everything's ready. Announce the disk to the world. */ + + set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE); + add_disk(rbd_dev->disk); + pr_info("%s: added with size 0x%llx\n", rbd_dev->disk->disk_name, + (unsigned long long) rbd_dev->mapping.size); + rc = rbd_init_watch_dev(rbd_dev); if (rc) goto err_out_bus; @@ -2700,6 +2700,9 @@ static void rbd_dev_release(struct device *dev) rbd_free_disk(rbd_dev); unregister_blkdev(rbd_dev->major, rbd_dev->name); + /* release allocated disk header fields */ + rbd_header_free(&rbd_dev->header); + /* done with the id, and with the rbd_dev */ kfree(rbd_dev->mapping.snap_name); kfree(rbd_dev->header_name); -- cgit v1.2.3-70-g09d2 From 1fcdb8aa1f58af72eb8206ba97fab2df77df2b14 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Wed, 29 Aug 2012 17:11:06 -0500 Subject: rbd: simplify rbd_init_disk() a bit This just simplifies a few things in rbd_init_disk(), now that the previous patch has moved a bunch of initialization code out if it. Done separately to facilitate review. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 6e735a754b5f..634a16c40291 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1870,14 +1870,12 @@ static int rbd_init_disk(struct rbd_device *rbd_dev) { struct gendisk *disk; struct request_queue *q; - int rc; u64 segment_size; /* create gendisk info */ - rc = -ENOMEM; disk = alloc_disk(RBD_MINORS_PER_MAJOR); if (!disk) - goto out; + return -ENOMEM; snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d", rbd_dev->dev_id); @@ -1887,7 +1885,6 @@ static int rbd_init_disk(struct rbd_device *rbd_dev) disk->private_data = rbd_dev; /* init rq */ - rc = -ENOMEM; q = blk_init_queue(rbd_rq_fn, &rbd_dev->lock); if (!q) goto out_disk; @@ -1910,11 +1907,10 @@ static int rbd_init_disk(struct rbd_device *rbd_dev) rbd_dev->disk = disk; return 0; - out_disk: put_disk(disk); -out: - return rc; + + return -ENOMEM; } /* -- cgit v1.2.3-70-g09d2 From 4bb1f1ed0063870f34ae5783cda08924964bac0b Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Thu, 23 Aug 2012 23:48:49 -0500 Subject: rbd: move locking out of rbd_header_set_snap() Move the calls to get the header semaphore out of rbd_header_set_snap() and into its caller. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 634a16c40291..214c937a6de5 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -647,8 +647,6 @@ static int rbd_header_set_snap(struct rbd_device *rbd_dev, char *snap_name) { int ret; - down_write(&rbd_dev->header_rwsem); - if (!memcmp(snap_name, RBD_SNAP_HEAD_NAME, sizeof (RBD_SNAP_HEAD_NAME))) { rbd_dev->mapping.snap_id = CEPH_NOSNAP; @@ -666,7 +664,6 @@ static int rbd_header_set_snap(struct rbd_device *rbd_dev, char *snap_name) ret = 0; done: - up_write(&rbd_dev->header_rwsem); return ret; } @@ -2608,7 +2605,9 @@ static ssize_t rbd_add(struct bus_type *bus, if (rc) goto err_out_bus; + down_write(&rbd_dev->header_rwsem); rc = rbd_header_set_snap(rbd_dev, snap_name); + up_write(&rbd_dev->header_rwsem); if (rc) goto err_out_bus; -- cgit v1.2.3-70-g09d2 From cd789ab9cacbda1aad43304b89cff29004b793ea Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Thu, 30 Aug 2012 00:16:38 -0500 Subject: rbd: don't register snapshots in bus_add_dev() When rbd_bus_add_dev() is called (one spot--in rbd_add()), the rbd image header has not even been read yet. This means that the list of snapshots will be empty at the time of the call. As a result, there is no need for the code that calls rbd_register_snap_dev() for each entry in that list--so get rid of it. Once the header has been read (just after returning), a call will be made to rbd_dev_snap_devs_update(), which will then find every snapshot in the context to be new and will therefore call rbd_register_snap_dev() via __rbd_add_snap_dev() accomplishing the same thing. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 214c937a6de5..144694ee03a5 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -2244,29 +2244,21 @@ static int rbd_dev_snap_devs_update(struct rbd_device *rbd_dev) static int rbd_bus_add_dev(struct rbd_device *rbd_dev) { - int ret; struct device *dev; - struct rbd_snap *snap; + int ret; mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); - dev = &rbd_dev->dev; + dev = &rbd_dev->dev; dev->bus = &rbd_bus_type; dev->type = &rbd_device_type; dev->parent = &rbd_root_dev; dev->release = rbd_dev_release; dev_set_name(dev, "%d", rbd_dev->dev_id); ret = device_register(dev); - if (ret < 0) - goto out; - list_for_each_entry(snap, &rbd_dev->snaps, node) { - ret = rbd_register_snap_dev(snap, &rbd_dev->dev); - if (ret < 0) - break; - } -out: mutex_unlock(&ctl_mutex); + return ret; } -- cgit v1.2.3-70-g09d2 From e86924a8092fda66b859f12a4d7d37a4a458d74a Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Tue, 10 Jul 2012 20:30:11 -0500 Subject: rbd: use snaps list in rbd_snap_by_name() An rbd_dev structure maintains a list of current snapshots that have already been fully initialized. The entries on the list have type struct rbd_snap, and each entry contains a copy of information that's found in the rbd_dev's snapshot context and header. The only caller of snap_by_name() is rbd_header_set_snap(). In that call site any positive return value (the index in the snapshot array) is ignored, so there's no need to return the index in the snapshot context's id array when it's found. rbd_header_set_snap() also has only one caller--rbd_add()--and that call is made after a call to rbd_dev_snap_devs_update(). Because the rbd_snap structures are initialized in that function, the current snapshot list can be used instead of the snapshot context to look up a snapshot's information by name. Change snap_by_name() so it uses the snapshot list rather than the rbd_dev's snapshot context in looking up snapshot information. Return 0 if it's found rather than the snapshot id. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 22 ++++++++-------------- 1 file changed, 8 insertions(+), 14 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 144694ee03a5..1ecdeb15b618 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -623,23 +623,18 @@ out_err: static int snap_by_name(struct rbd_device *rbd_dev, const char *snap_name) { - int i; - struct rbd_image_header *header = &rbd_dev->header; - char *p = header->snap_names; - - rbd_assert(header->snapc != NULL); - for (i = 0; i < header->snapc->num_snaps; i++) { - if (!strcmp(snap_name, p)) { - /* Found it. Pass back its id and/or size */ + struct rbd_snap *snap; - rbd_dev->mapping.snap_id = header->snapc->snaps[i]; - rbd_dev->mapping.size = header->snap_sizes[i]; + list_for_each_entry(snap, &rbd_dev->snaps, node) { + if (!strcmp(snap_name, snap->name)) { + rbd_dev->mapping.snap_id = snap->id; + rbd_dev->mapping.size = snap->size; - return i; + return 0; } - p += strlen(p) + 1; /* Skip ahead to the next name */ } + return -ENOENT; } @@ -653,6 +648,7 @@ static int rbd_header_set_snap(struct rbd_device *rbd_dev, char *snap_name) rbd_dev->mapping.size = rbd_dev->header.image_size; rbd_dev->mapping.snap_exists = false; rbd_dev->mapping.read_only = rbd_dev->rbd_opts.read_only; + ret = 0; } else { ret = snap_by_name(rbd_dev, snap_name); if (ret < 0) @@ -661,8 +657,6 @@ static int rbd_header_set_snap(struct rbd_device *rbd_dev, char *snap_name) rbd_dev->mapping.read_only = true; } rbd_dev->mapping.snap_name = snap_name; - - ret = 0; done: return ret; } -- cgit v1.2.3-70-g09d2 From 3fcf2581c2c3c910aa46f6d205e502a97243ca2c Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Tue, 3 Jul 2012 16:01:19 -0500 Subject: rbd: assign header name later Move the assignment of the header name for an rbd image a bit later, outside rbd_add_parse_args() and into its caller. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 1ecdeb15b618..48901b51f648 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -2474,15 +2474,6 @@ static char *rbd_add_parse_args(struct rbd_device *rbd_dev, if (!rbd_dev->image_name) goto out_err; - /* Create the name of the header object */ - - rbd_dev->header_name = kmalloc(rbd_dev->image_name_len - + sizeof (RBD_SUFFIX), - GFP_KERNEL); - if (!rbd_dev->header_name) - goto out_err; - sprintf(rbd_dev->header_name, "%s%s", rbd_dev->image_name, RBD_SUFFIX); - /* Snapshot name is optional */ len = next_token(&buf); if (!len) { @@ -2500,8 +2491,6 @@ dout(" SNAP_NAME is <%s>, len is %zd\n", snap_name, len); return snap_name; out_err: - kfree(rbd_dev->header_name); - rbd_dev->header_name = NULL; kfree(rbd_dev->image_name); rbd_dev->image_name = NULL; rbd_dev->image_name_len = 0; @@ -2566,6 +2555,15 @@ static ssize_t rbd_add(struct bus_type *bus, goto err_out_client; rbd_dev->pool_id = rc; + /* Create the name of the header object */ + + rbd_dev->header_name = kmalloc(rbd_dev->image_name_len + + sizeof (RBD_SUFFIX), + GFP_KERNEL); + if (!rbd_dev->header_name) + goto err_out_client; + sprintf(rbd_dev->header_name, "%s%s", rbd_dev->image_name, RBD_SUFFIX); + /* register our block device */ rc = register_blkdev(0, rbd_dev->name); if (rc < 0) @@ -2626,11 +2624,11 @@ err_out_bus: err_out_blkdev: unregister_blkdev(rbd_dev->major, rbd_dev->name); err_out_client: + kfree(rbd_dev->header_name); rbd_put_client(rbd_dev); err_put_id: if (rbd_dev->pool_name) { kfree(rbd_dev->mapping.snap_name); - kfree(rbd_dev->header_name); kfree(rbd_dev->image_name); kfree(rbd_dev->pool_name); } -- cgit v1.2.3-70-g09d2 From 304f68086f8206da7c5930a9cb0207c91d1983a6 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Fri, 31 Aug 2012 17:29:52 -0500 Subject: rbd: defer registering snapshot devices When a new snapshot is found in an rbd device's updated snapshot context, __rbd_add_snap_dev() is called to create and insert an entry in the rbd devices list of snapshots. In addition, a Linux device is registered to represent the snapshot. For version 2 rbd images, it will be undesirable to initialize the device right away. So in anticipation of that, this patch separates the insertion of a snapshot entry in the snaps list from the creation of devices for those snapshots. To do this, create a new function rbd_dev_snaps_register() which traverses the list of snapshots and calls rbd_register_snap_dev() on any that have not yet been registered. Rename rbd_dev_snap_devs_update() to be rbd_dev_snaps_update() to better reflect that only the entry in the snaps list and not the snapshot's device is affected by the function. For now, call rbd_dev_snaps_register() immediately after each call to rbd_dev_snaps_update(). Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 60 ++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 50 insertions(+), 10 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 48901b51f648..0d812603e6d5 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -204,7 +204,9 @@ static DEFINE_SPINLOCK(rbd_dev_list_lock); static LIST_HEAD(rbd_client_list); /* clients */ static DEFINE_SPINLOCK(rbd_client_list_lock); -static int rbd_dev_snap_devs_update(struct rbd_device *rbd_dev); +static int rbd_dev_snaps_update(struct rbd_device *rbd_dev); +static int rbd_dev_snaps_register(struct rbd_device *rbd_dev); + static void rbd_dev_release(struct device *dev); static ssize_t rbd_snap_add(struct device *dev, struct device_attribute *attr, @@ -1839,7 +1841,9 @@ static int __rbd_refresh_header(struct rbd_device *rbd_dev, u64 *hver) WARN_ON(strcmp(rbd_dev->header.object_prefix, h.object_prefix)); kfree(h.object_prefix); - ret = rbd_dev_snap_devs_update(rbd_dev); + ret = rbd_dev_snaps_update(rbd_dev); + if (!ret) + ret = rbd_dev_snaps_register(rbd_dev); up_write(&rbd_dev->header_rwsem); @@ -2084,10 +2088,21 @@ static struct device_type rbd_snap_device_type = { .release = rbd_snap_dev_release, }; +static bool rbd_snap_registered(struct rbd_snap *snap) +{ + bool ret = snap->dev.type == &rbd_snap_device_type; + bool reg = device_is_registered(&snap->dev); + + rbd_assert(!ret ^ reg); + + return ret; +} + static void __rbd_remove_snap_dev(struct rbd_snap *snap) { list_del(&snap->node); - device_unregister(&snap->dev); + if (device_is_registered(&snap->dev)) + device_unregister(&snap->dev); } static int rbd_register_snap_dev(struct rbd_snap *snap, @@ -2100,6 +2115,8 @@ static int rbd_register_snap_dev(struct rbd_snap *snap, dev->parent = parent; dev->release = rbd_snap_dev_release; dev_set_name(dev, "snap_%s", snap->name); + dout("%s: registering device for snapshot %s\n", __func__, snap->name); + ret = device_register(dev); return ret; @@ -2122,11 +2139,6 @@ static struct rbd_snap *__rbd_add_snap_dev(struct rbd_device *rbd_dev, snap->size = rbd_dev->header.snap_sizes[i]; snap->id = rbd_dev->header.snapc->snaps[i]; - if (device_is_registered(&rbd_dev->dev)) { - ret = rbd_register_snap_dev(snap, &rbd_dev->dev); - if (ret < 0) - goto err; - } return snap; @@ -2149,7 +2161,7 @@ err: * snapshot id, highest id first. (Snapshots in the rbd_dev's list * are also maintained in that order.) */ -static int rbd_dev_snap_devs_update(struct rbd_device *rbd_dev) +static int rbd_dev_snaps_update(struct rbd_device *rbd_dev) { struct ceph_snap_context *snapc = rbd_dev->header.snapc; const u32 snap_count = snapc->num_snaps; @@ -2236,6 +2248,31 @@ static int rbd_dev_snap_devs_update(struct rbd_device *rbd_dev) return 0; } +/* + * Scan the list of snapshots and register the devices for any that + * have not already been registered. + */ +static int rbd_dev_snaps_register(struct rbd_device *rbd_dev) +{ + struct rbd_snap *snap; + int ret = 0; + + dout("%s called\n", __func__); + if (!device_is_registered(&rbd_dev->dev)) + return 0; + + list_for_each_entry(snap, &rbd_dev->snaps, node) { + if (!rbd_snap_registered(snap)) { + ret = rbd_register_snap_dev(snap, &rbd_dev->dev); + if (ret < 0) + break; + } + } + dout("%s: returning %d\n", __func__, ret); + + return ret; +} + static int rbd_bus_add_dev(struct rbd_device *rbd_dev) { struct device *dev; @@ -2585,7 +2622,10 @@ static ssize_t rbd_add(struct bus_type *bus, goto err_out_bus; /* no need to lock here, as rbd_dev is not registered yet */ - rc = rbd_dev_snap_devs_update(rbd_dev); + rc = rbd_dev_snaps_update(rbd_dev); + if (rc) + goto err_out_bus; + rc = rbd_dev_snaps_register(rbd_dev); if (rc) goto err_out_bus; -- cgit v1.2.3-70-g09d2 From 5ed1617731a1e9201c3541a9c05ce3ec73975589 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Wed, 29 Aug 2012 17:11:07 -0500 Subject: rbd: call set_snap() before snap_devs_update() rbd_header_set_snap() is a simple initialization routine for an rbd device's mapping. It has to be called after the snapshot context for the rbd_dev has been updated, but can be done before snapshot devices have been registered. Change the name to rbd_dev_set_mapping() to better reflect its purpose, and call it a little sooner, before registering snapshot devices. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 0d812603e6d5..a9f5de2706ec 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -640,7 +640,7 @@ static int snap_by_name(struct rbd_device *rbd_dev, const char *snap_name) return -ENOENT; } -static int rbd_header_set_snap(struct rbd_device *rbd_dev, char *snap_name) +static int rbd_dev_set_mapping(struct rbd_device *rbd_dev, char *snap_name) { int ret; @@ -2625,12 +2625,13 @@ static ssize_t rbd_add(struct bus_type *bus, rc = rbd_dev_snaps_update(rbd_dev); if (rc) goto err_out_bus; - rc = rbd_dev_snaps_register(rbd_dev); + + rc = rbd_dev_set_mapping(rbd_dev, snap_name); if (rc) goto err_out_bus; down_write(&rbd_dev->header_rwsem); - rc = rbd_header_set_snap(rbd_dev, snap_name); + rc = rbd_dev_snaps_register(rbd_dev); up_write(&rbd_dev->header_rwsem); if (rc) goto err_out_bus; -- cgit v1.2.3-70-g09d2 From 05fd6f6f8c7b07e746d513e4cf862675b70aac59 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Wed, 29 Aug 2012 17:11:07 -0500 Subject: rbd: read the header before registering device Read the rbd header information and call rbd_dev_set_mapping() earlier--before registering the block device or setting up the sysfs entries for the image. The sysfs entries provide users access to some information that's only available after doing the rbd header initialization, so this will make sure it's valid right away. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 33 ++++++++++++++++++--------------- 1 file changed, 18 insertions(+), 15 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index a9f5de2706ec..aa4752d9d9fa 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -2601,10 +2601,25 @@ static ssize_t rbd_add(struct bus_type *bus, goto err_out_client; sprintf(rbd_dev->header_name, "%s%s", rbd_dev->image_name, RBD_SUFFIX); + /* Get information about the image being mapped */ + + rc = rbd_read_header(rbd_dev, &rbd_dev->header); + if (rc) + goto err_out_client; + + /* no need to lock here, as rbd_dev is not registered yet */ + rc = rbd_dev_snaps_update(rbd_dev); + if (rc) + goto err_out_header; + + rc = rbd_dev_set_mapping(rbd_dev, snap_name); + if (rc) + goto err_out_header; + /* register our block device */ rc = register_blkdev(0, rbd_dev->name); if (rc < 0) - goto err_out_client; + goto err_out_header; rbd_dev->major = rc; rc = rbd_bus_add_dev(rbd_dev); @@ -2616,20 +2631,6 @@ static ssize_t rbd_add(struct bus_type *bus, * of the sysfs code (initiated by rbd_bus_del_dev()). */ - /* contact OSD, request size info about the object being mapped */ - rc = rbd_read_header(rbd_dev, &rbd_dev->header); - if (rc) - goto err_out_bus; - - /* no need to lock here, as rbd_dev is not registered yet */ - rc = rbd_dev_snaps_update(rbd_dev); - if (rc) - goto err_out_bus; - - rc = rbd_dev_set_mapping(rbd_dev, snap_name); - if (rc) - goto err_out_bus; - down_write(&rbd_dev->header_rwsem); rc = rbd_dev_snaps_register(rbd_dev); up_write(&rbd_dev->header_rwsem); @@ -2664,6 +2665,8 @@ err_out_bus: err_out_blkdev: unregister_blkdev(rbd_dev->major, rbd_dev->name); +err_out_header: + rbd_header_free(&rbd_dev->header); err_out_client: kfree(rbd_dev->header_name); rbd_put_client(rbd_dev); -- cgit v1.2.3-70-g09d2 From 85ae8926751db5e09b9a12ee44609ee9e74b7aad Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Thu, 26 Jul 2012 23:37:14 -0500 Subject: rbd: defer setting device id Hold off setting the device id and formatting the device name in rbd_add() until just before it's needed. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 44 ++++++++++++++++++++++---------------------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index aa4752d9d9fa..7a600ca2dbcf 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -2554,10 +2554,10 @@ static ssize_t rbd_add(struct bus_type *bus, options = kmalloc(count, GFP_KERNEL); if (!options) - goto err_nomem; + goto err_out_mem; rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL); if (!rbd_dev) - goto err_nomem; + goto err_out_mem; /* static rbd_device initialization */ spin_lock_init(&rbd_dev->lock); @@ -2565,25 +2565,17 @@ static ssize_t rbd_add(struct bus_type *bus, INIT_LIST_HEAD(&rbd_dev->snaps); init_rwsem(&rbd_dev->header_rwsem); - /* generate unique id: find highest unique id, add one */ - rbd_dev_id_get(rbd_dev); - - /* Fill in the device name, now that we have its id. */ - BUILD_BUG_ON(DEV_NAME_LEN - < sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH); - sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id); - /* parse add command */ snap_name = rbd_add_parse_args(rbd_dev, buf, &mon_addrs, &mon_addrs_size, options, count); if (IS_ERR(snap_name)) { rc = PTR_ERR(snap_name); - goto err_put_id; + goto err_out_mem; } rc = rbd_get_client(rbd_dev, mon_addrs, mon_addrs_size - 1, options); if (rc < 0) - goto err_put_id; + goto err_out_args; /* pick the pool */ osdc = &rbd_dev->rbd_client->client->osdc; @@ -2616,10 +2608,19 @@ static ssize_t rbd_add(struct bus_type *bus, if (rc) goto err_out_header; - /* register our block device */ + /* generate unique id: find highest unique id, add one */ + rbd_dev_id_get(rbd_dev); + + /* Fill in the device name, now that we have its id. */ + BUILD_BUG_ON(DEV_NAME_LEN + < sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH); + sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id); + + /* Get our block major device number. */ + rc = register_blkdev(0, rbd_dev->name); if (rc < 0) - goto err_out_header; + goto err_out_id; rbd_dev->major = rc; rc = rbd_bus_add_dev(rbd_dev); @@ -2665,19 +2666,18 @@ err_out_bus: err_out_blkdev: unregister_blkdev(rbd_dev->major, rbd_dev->name); +err_out_id: + rbd_dev_id_put(rbd_dev); err_out_header: rbd_header_free(&rbd_dev->header); err_out_client: kfree(rbd_dev->header_name); rbd_put_client(rbd_dev); -err_put_id: - if (rbd_dev->pool_name) { - kfree(rbd_dev->mapping.snap_name); - kfree(rbd_dev->image_name); - kfree(rbd_dev->pool_name); - } - rbd_dev_id_put(rbd_dev); -err_nomem: +err_out_args: + kfree(rbd_dev->mapping.snap_name); + kfree(rbd_dev->image_name); + kfree(rbd_dev->pool_name); +err_out_mem: kfree(rbd_dev); kfree(options); -- cgit v1.2.3-70-g09d2 From 0f308a3188b37f36bc5a078f5fe039a41714476e Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Wed, 29 Aug 2012 17:11:07 -0500 Subject: rbd: call rbd_init_disk() sooner Call rbd_init_disk() from rbd_add() as soon as we have the major device number for the mapping. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 7a600ca2dbcf..27988045b48e 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -2623,10 +2623,16 @@ static ssize_t rbd_add(struct bus_type *bus, goto err_out_id; rbd_dev->major = rc; - rc = rbd_bus_add_dev(rbd_dev); + /* Set up the blkdev mapping. */ + + rc = rbd_init_disk(rbd_dev); if (rc) goto err_out_blkdev; + rc = rbd_bus_add_dev(rbd_dev); + if (rc) + goto err_out_disk; + /* * At this point cleanup in the event of an error is the job * of the sysfs code (initiated by rbd_bus_del_dev()). @@ -2638,12 +2644,6 @@ static ssize_t rbd_add(struct bus_type *bus, if (rc) goto err_out_bus; - /* Set up the blkdev mapping. */ - - rc = rbd_init_disk(rbd_dev); - if (rc) - goto err_out_bus; - /* Everything's ready. Announce the disk to the world. */ set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE); @@ -2664,6 +2664,8 @@ err_out_bus: kfree(options); return rc; +err_out_disk: + rbd_free_disk(rbd_dev); err_out_blkdev: unregister_blkdev(rbd_dev->major, rbd_dev->name); err_out_id: -- cgit v1.2.3-70-g09d2 From 86ff77bb68c6cda783b195a260f68fd5d32f7aaf Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Fri, 31 Aug 2012 17:29:53 -0500 Subject: rbd: drop dev registration check for new snap By the time rbd_dev_snaps_register() gets called during rbd device initialization, the main device will have already been registered. Similarly, a header refresh will only occur for an rbd device whose Linux device is registered. There is therefore no need to verify the main device is registered when registering a snapshot device. For the time being, turn the check into a WARN_ON(), but it can eventually just go away. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 27988045b48e..fa99b94b9dbb 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -2258,8 +2258,8 @@ static int rbd_dev_snaps_register(struct rbd_device *rbd_dev) int ret = 0; dout("%s called\n", __func__); - if (!device_is_registered(&rbd_dev->dev)) - return 0; + if (WARN_ON(!device_is_registered(&rbd_dev->dev))) + return -EIO; list_for_each_entry(snap, &rbd_dev->snaps, node) { if (!rbd_snap_registered(snap)) { -- cgit v1.2.3-70-g09d2 From 12f029448c3d73e0f30bc5aee5964442aa95c0f4 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Wed, 29 Aug 2012 17:11:07 -0500 Subject: rbd: set initial capacity in rbd_init_disk() Move the setting of the initial capacity for an rbd image mapping into rb_init_disk(). Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index fa99b94b9dbb..3274943b2342 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1901,6 +1901,8 @@ static int rbd_init_disk(struct rbd_device *rbd_dev) rbd_dev->disk = disk; + set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE); + return 0; out_disk: put_disk(disk); @@ -2646,7 +2648,6 @@ static ssize_t rbd_add(struct bus_type *bus, /* Everything's ready. Announce the disk to the world. */ - set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE); add_disk(rbd_dev->disk); pr_info("%s: added with size 0x%llx\n", rbd_dev->disk->disk_name, (unsigned long long) rbd_dev->mapping.size); -- cgit v1.2.3-70-g09d2 From 3ee4001e0c875ce8ebcdf5ea305e9a105b3687bd Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Wed, 29 Aug 2012 17:11:07 -0500 Subject: rbd: set up watch before announcing disk We're ready to handle header object (refresh) events at the point we call rbd_bus_add_dev(). Set up the watch request on the rbd image header just after that, and after we've registered the devices for the snapshots for the initial snapshot context. Do this before announce the disk as available for use. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 3274943b2342..61807c32996e 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -2646,16 +2646,17 @@ static ssize_t rbd_add(struct bus_type *bus, if (rc) goto err_out_bus; + rc = rbd_init_watch_dev(rbd_dev); + if (rc) + goto err_out_bus; + /* Everything's ready. Announce the disk to the world. */ add_disk(rbd_dev->disk); + pr_info("%s: added with size 0x%llx\n", rbd_dev->disk->disk_name, (unsigned long long) rbd_dev->mapping.size); - rc = rbd_init_watch_dev(rbd_dev); - if (rc) - goto err_out_bus; - return count; err_out_bus: -- cgit v1.2.3-70-g09d2 From 3cb4a687c72bd16c95f514933d68884eacac4e4e Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Tue, 26 Jun 2012 12:57:03 -0700 Subject: rbd: pass flags to rbd_req_sync_exec() In order to allow both read requests and write requests to be initiated using rbd_req_sync_exec(), add an OSD flags value which can be passed down to rbd_req_sync_op(). Rename the "data" and "len" parameters to be more clear that they represent data that is outbound. At this point, this function is still only used (and only works) for write requests. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 31 +++++++++++++++++++++---------- 1 file changed, 21 insertions(+), 10 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 61807c32996e..ad26502f4b0f 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1437,23 +1437,33 @@ fail: } /* - * Request sync osd read + * Synchronous osd object method call */ static int rbd_req_sync_exec(struct rbd_device *rbd_dev, const char *object_name, const char *class_name, const char *method_name, - const char *data, - int len, + const char *outbound, + size_t outbound_size, + int flags, u64 *ver) { struct ceph_osd_req_op *ops; int class_name_len = strlen(class_name); int method_name_len = strlen(method_name); + int payload_size; int ret; - ops = rbd_create_rw_ops(1, CEPH_OSD_OP_CALL, - class_name_len + method_name_len + len); + /* + * Any input parameters required by the method we're calling + * will be sent along with the class and method names as + * part of the message payload. That data and its size are + * supplied via the indata and indata_len fields (named from + * the perspective of the server side) in the OSD request + * operation. + */ + payload_size = class_name_len + method_name_len + outbound_size; + ops = rbd_create_rw_ops(1, CEPH_OSD_OP_CALL, payload_size); if (!ops) return -ENOMEM; @@ -1462,13 +1472,12 @@ static int rbd_req_sync_exec(struct rbd_device *rbd_dev, ops[0].cls.method_name = method_name; ops[0].cls.method_len = (__u8) method_name_len; ops[0].cls.argc = 0; - ops[0].cls.indata = data; - ops[0].cls.indata_len = len; + ops[0].cls.indata = outbound; + ops[0].cls.indata_len = outbound_size; ret = rbd_req_sync_op(rbd_dev, NULL, CEPH_NOSNAP, - CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK, - ops, + flags, ops, object_name, 0, 0, NULL, NULL, ver); rbd_destroy_ops(ops); @@ -1780,7 +1789,9 @@ static int rbd_header_add_snap(struct rbd_device *rbd_dev, ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name, "rbd", "snap_add", - data, p - data, NULL); + data, (size_t) (p - data), + CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK, + NULL); kfree(data); -- cgit v1.2.3-70-g09d2 From f8d4de6e1c939d56f1ee0a21ad677401846f990c Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Tue, 3 Jul 2012 16:01:19 -0500 Subject: rbd: support data returned from OSD methods An OSD object method call can be made using rbd_req_sync_exec(). Until now this has only been used for creating a new RBD snapshot, and that has only required sending data out, not receiving anything back from the OSD. We will now need to get data back from an OSD on a method call, so add parameters to rbd_req_sync_exec() that allow a buffer into which returned data should be placed to be specified, along with its size. Previously, rbd_req_sync_exec() passed a null pointer and zero size to rbd_req_sync_op(); change this so the new inbound buffer information is provided instead. Rename the "buf" and "len" parameters in rbd_req_sync_op() to make it more obvious they are describing inbound data. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index ad26502f4b0f..b8956131950c 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1098,8 +1098,8 @@ static int rbd_req_sync_op(struct rbd_device *rbd_dev, int flags, struct ceph_osd_req_op *ops, const char *object_name, - u64 ofs, u64 len, - char *buf, + u64 ofs, u64 inbound_size, + char *inbound, struct ceph_osd_request **linger_req, u64 *ver) { @@ -1109,13 +1109,13 @@ static int rbd_req_sync_op(struct rbd_device *rbd_dev, rbd_assert(ops != NULL); - num_pages = calc_pages_for(ofs , len); + num_pages = calc_pages_for(ofs, inbound_size); pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL); if (IS_ERR(pages)) return PTR_ERR(pages); ret = rbd_do_request(NULL, rbd_dev, snapc, snapid, - object_name, ofs, len, NULL, + object_name, ofs, inbound_size, NULL, pages, num_pages, flags, ops, @@ -1125,8 +1125,8 @@ static int rbd_req_sync_op(struct rbd_device *rbd_dev, if (ret < 0) goto done; - if ((flags & CEPH_OSD_FLAG_READ) && buf) - ret = ceph_copy_from_page_vector(pages, buf, ofs, ret); + if ((flags & CEPH_OSD_FLAG_READ) && inbound) + ret = ceph_copy_from_page_vector(pages, inbound, ofs, ret); done: ceph_release_page_vector(pages, num_pages); @@ -1445,6 +1445,8 @@ static int rbd_req_sync_exec(struct rbd_device *rbd_dev, const char *method_name, const char *outbound, size_t outbound_size, + char *inbound, + size_t inbound_size, int flags, u64 *ver) { @@ -1478,7 +1480,8 @@ static int rbd_req_sync_exec(struct rbd_device *rbd_dev, ret = rbd_req_sync_op(rbd_dev, NULL, CEPH_NOSNAP, flags, ops, - object_name, 0, 0, NULL, NULL, ver); + object_name, 0, inbound_size, inbound, + NULL, ver); rbd_destroy_ops(ops); @@ -1789,7 +1792,7 @@ static int rbd_header_add_snap(struct rbd_device *rbd_dev, ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name, "rbd", "snap_add", - data, (size_t) (p - data), + data, (size_t) (p - data), NULL, 0, CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK, NULL); -- cgit v1.2.3-70-g09d2 From 3bb59ad515527fa75cf71d997d17cea18160da74 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Tue, 10 Jul 2012 20:30:10 -0500 Subject: rbd: define some new format constants Define constant symbols related to the rbd format 2 object names. This begins to bring this version of the "rbd_types.h" header more in line with the current user-space version of that file. Complete reconciliation of differences will be done at some point later, as a separate task. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd_types.h | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/drivers/block/rbd_types.h b/drivers/block/rbd_types.h index d9d8a77748bc..cbe77fa105ba 100644 --- a/drivers/block/rbd_types.h +++ b/drivers/block/rbd_types.h @@ -15,15 +15,30 @@ #include +/* For format version 2, rbd image 'foo' consists of objects + * rbd_id.foo - id of image + * rbd_header. - image metadata + * rbd_data..0000000000000000 + * rbd_data..0000000000000001 + * ... - data + * Clients do not access header data directly in rbd format 2. + */ + +#define RBD_HEADER_PREFIX "rbd_header." +#define RBD_DATA_PREFIX "rbd_data." +#define RBD_ID_PREFIX "rbd_id." + /* - * rbd image 'foo' consists of objects - * foo.rbd - image metadata - * foo.00000000 - * foo.00000001 - * ... - data + * For format version 1, rbd image 'foo' consists of objects + * foo.rbd - image metadata + * rb...00000000 + * rb...00000001 + * ... - data + * There is no notion of a persistent image id in rbd format 1. */ #define RBD_SUFFIX ".rbd" + #define RBD_DIRECTORY "rbd_directory" #define RBD_INFO "rbd_info" -- cgit v1.2.3-70-g09d2 From 589d30e0b3e649e2660f9a67be88e235b28bc319 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Tue, 10 Jul 2012 20:30:11 -0500 Subject: rbd: define rbd_dev_image_id() New format 2 rbd images are permanently identified by a unique image id. Each rbd image also has a name, but the name can be changed. A format 2 rbd image will have an object--whose name is based on the image name--which maps an image's name to its image id. Create a new function rbd_dev_image_id() that checks for the existence of the image id object, and if it's found, records the image id in the rbd_device structure. Create a new rbd device attribute (/sys/bus/rbd//image_id) that makes this information available. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- Documentation/ABI/testing/sysfs-bus-rbd | 5 ++ drivers/block/rbd.c | 100 ++++++++++++++++++++++++++++++++ 2 files changed, 105 insertions(+) diff --git a/Documentation/ABI/testing/sysfs-bus-rbd b/Documentation/ABI/testing/sysfs-bus-rbd index 3c17b62899f6..7cbbe343af5e 100644 --- a/Documentation/ABI/testing/sysfs-bus-rbd +++ b/Documentation/ABI/testing/sysfs-bus-rbd @@ -33,6 +33,11 @@ name The name of the rbd image. +image_id + + The unique id for the rbd image. (For rbd image format 1 + this is empty.) + pool The name of the storage pool where this rbd image resides. diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index b8956131950c..34f46c3b188f 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -66,6 +66,8 @@ #define RBD_SNAP_HEAD_NAME "-" +#define RBD_IMAGE_ID_LEN_MAX 64 + /* * An RBD device name will be "rbd#", where the "rbd" comes from * RBD_DRV_NAME above, and # is a unique integer identifier. @@ -173,6 +175,8 @@ struct rbd_device { spinlock_t lock; /* queue lock */ struct rbd_image_header header; + char *image_id; + size_t image_id_len; char *image_name; size_t image_name_len; char *header_name; @@ -1987,6 +1991,14 @@ static ssize_t rbd_name_show(struct device *dev, return sprintf(buf, "%s\n", rbd_dev->image_name); } +static ssize_t rbd_image_id_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); + + return sprintf(buf, "%s\n", rbd_dev->image_id); +} + static ssize_t rbd_snap_show(struct device *dev, struct device_attribute *attr, char *buf) @@ -2015,6 +2027,7 @@ static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL); static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL); static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL); static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL); +static DEVICE_ATTR(image_id, S_IRUGO, rbd_image_id_show, NULL); static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh); static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL); static DEVICE_ATTR(create_snap, S_IWUSR, NULL, rbd_snap_add); @@ -2026,6 +2039,7 @@ static struct attribute *rbd_attrs[] = { &dev_attr_pool.attr, &dev_attr_pool_id.attr, &dev_attr_name.attr, + &dev_attr_image_id.attr, &dev_attr_current_snap.attr, &dev_attr_refresh.attr, &dev_attr_create_snap.attr, @@ -2553,6 +2567,75 @@ out_err: return err_ptr; } +/* + * An rbd format 2 image has a unique identifier, distinct from the + * name given to it by the user. Internally, that identifier is + * what's used to specify the names of objects related to the image. + * + * A special "rbd id" object is used to map an rbd image name to its + * id. If that object doesn't exist, then there is no v2 rbd image + * with the supplied name. + * + * This function will record the given rbd_dev's image_id field if + * it can be determined, and in that case will return 0. If any + * errors occur a negative errno will be returned and the rbd_dev's + * image_id field will be unchanged (and should be NULL). + */ +static int rbd_dev_image_id(struct rbd_device *rbd_dev) +{ + int ret; + size_t size; + char *object_name; + void *response; + void *p; + + /* + * First, see if the format 2 image id file exists, and if + * so, get the image's persistent id from it. + */ + size = sizeof (RBD_ID_PREFIX) + rbd_dev->image_name_len; + object_name = kmalloc(size, GFP_NOIO); + if (!object_name) + return -ENOMEM; + sprintf(object_name, "%s%s", RBD_ID_PREFIX, rbd_dev->image_name); + dout("rbd id object name is %s\n", object_name); + + /* Response will be an encoded string, which includes a length */ + + size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX; + response = kzalloc(size, GFP_NOIO); + if (!response) { + ret = -ENOMEM; + goto out; + } + + ret = rbd_req_sync_exec(rbd_dev, object_name, + "rbd", "get_id", + NULL, 0, + response, RBD_IMAGE_ID_LEN_MAX, + CEPH_OSD_FLAG_READ, NULL); + dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret); + if (ret < 0) + goto out; + + p = response; + rbd_dev->image_id = ceph_extract_encoded_string(&p, + p + RBD_IMAGE_ID_LEN_MAX, + &rbd_dev->image_id_len, + GFP_NOIO); + if (IS_ERR(rbd_dev->image_id)) { + ret = PTR_ERR(rbd_dev->image_id); + rbd_dev->image_id = NULL; + } else { + dout("image_id is %s\n", rbd_dev->image_id); + } +out: + kfree(response); + kfree(object_name); + + return ret; +} + static ssize_t rbd_add(struct bus_type *bus, const char *buf, size_t count) @@ -2600,6 +2683,21 @@ static ssize_t rbd_add(struct bus_type *bus, goto err_out_client; rbd_dev->pool_id = rc; + rc = rbd_dev_image_id(rbd_dev); + if (!rc) { + rc = -ENOTSUPP; /* Not actually supporting format 2 yet */ + goto err_out_client; + } + + /* Version 1 images have no id; empty string is used */ + + rbd_dev->image_id = kstrdup("", GFP_KERNEL); + if (!rbd_dev->image_id) { + rc = -ENOMEM; + goto err_out_client; + } + rbd_dev->image_id_len = 0; + /* Create the name of the header object */ rbd_dev->header_name = kmalloc(rbd_dev->image_name_len @@ -2691,6 +2789,7 @@ err_out_header: err_out_client: kfree(rbd_dev->header_name); rbd_put_client(rbd_dev); + kfree(rbd_dev->image_id); err_out_args: kfree(rbd_dev->mapping.snap_name); kfree(rbd_dev->image_name); @@ -2746,6 +2845,7 @@ static void rbd_dev_release(struct device *dev) /* done with the id, and with the rbd_dev */ kfree(rbd_dev->mapping.snap_name); + kfree(rbd_dev->image_id); kfree(rbd_dev->header_name); kfree(rbd_dev->pool_name); kfree(rbd_dev->image_name); -- cgit v1.2.3-70-g09d2 From 02cdb02ceab1f3dd9ac2bc899fc51f0e0e744782 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Fri, 10 Aug 2012 13:12:10 -0700 Subject: rbd: kill create_snap sysfs entry Josh proposed the following change, and I don't think I could explain it any better than he did: From: Josh Durgin Date: Tue, 24 Jul 2012 14:22:11 -0700 To: ceph-devel Message-ID: <500F1203.9050605@inktank.com> Right now the kernel still has one piece of rbd management duplicated from the rbd command line tool: snapshot creation. There's nothing special about snapshot creation that makes it advantageous to do from the kernel, so I'd like to remove the create_snap sysfs interface. That is, /sys/bus/rbd/devices//create_snap would be removed. Does anyone rely on the sysfs interface for creating rbd snapshots? If so, how hard would it be to replace with: rbd snap create pool/image@snap Is there any benefit to the sysfs interface that I'm missing? Josh This patch implements this proposal, removing the code that implements the "snap_create" sysfs interface for rbd images. As a result, quite a lot of other supporting code goes away. Suggested-by: Josh Durgin Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- Documentation/ABI/testing/sysfs-bus-rbd | 6 -- drivers/block/rbd.c | 158 -------------------------------- 2 files changed, 164 deletions(-) diff --git a/Documentation/ABI/testing/sysfs-bus-rbd b/Documentation/ABI/testing/sysfs-bus-rbd index 7cbbe343af5e..6fe4224cc5bd 100644 --- a/Documentation/ABI/testing/sysfs-bus-rbd +++ b/Documentation/ABI/testing/sysfs-bus-rbd @@ -62,12 +62,6 @@ current_snap The current snapshot for which the device is mapped. -create_snap - - Create a snapshot: - - $ echo > /sys/bus/rbd/devices//snap_create - snap_* A directory per each snapshot diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 34f46c3b188f..e453f8cc8949 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -212,10 +212,6 @@ static int rbd_dev_snaps_update(struct rbd_device *rbd_dev); static int rbd_dev_snaps_register(struct rbd_device *rbd_dev); static void rbd_dev_release(struct device *dev); -static ssize_t rbd_snap_add(struct device *dev, - struct device_attribute *attr, - const char *buf, - size_t count); static void __rbd_remove_snap_dev(struct rbd_snap *snap); static ssize_t rbd_add(struct bus_type *bus, const char *buf, @@ -1375,71 +1371,6 @@ static int rbd_req_sync_unwatch(struct rbd_device *rbd_dev) return ret; } -struct rbd_notify_info { - struct rbd_device *rbd_dev; -}; - -static void rbd_notify_cb(u64 ver, u64 notify_id, u8 opcode, void *data) -{ - struct rbd_device *rbd_dev = (struct rbd_device *)data; - if (!rbd_dev) - return; - - dout("rbd_notify_cb %s notify_id=%llu opcode=%u\n", - rbd_dev->header_name, (unsigned long long) notify_id, - (unsigned int) opcode); -} - -/* - * Request sync osd notify - */ -static int rbd_req_sync_notify(struct rbd_device *rbd_dev) -{ - struct ceph_osd_req_op *ops; - struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; - struct ceph_osd_event *event; - struct rbd_notify_info info; - int payload_len = sizeof(u32) + sizeof(u32); - int ret; - - ops = rbd_create_rw_ops(1, CEPH_OSD_OP_NOTIFY, payload_len); - if (!ops) - return -ENOMEM; - - info.rbd_dev = rbd_dev; - - ret = ceph_osdc_create_event(osdc, rbd_notify_cb, 1, - (void *)&info, &event); - if (ret < 0) - goto fail; - - ops[0].watch.ver = 1; - ops[0].watch.flag = 1; - ops[0].watch.cookie = event->cookie; - ops[0].watch.prot_ver = RADOS_NOTIFY_VER; - ops[0].watch.timeout = 12; - - ret = rbd_req_sync_op(rbd_dev, NULL, - CEPH_NOSNAP, - CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK, - ops, - rbd_dev->header_name, - 0, 0, NULL, NULL, NULL); - if (ret < 0) - goto fail_event; - - ret = ceph_osdc_wait_event(event, CEPH_OSD_TIMEOUT_DEFAULT); - dout("ceph_osdc_wait_event returned %d\n", ret); - rbd_destroy_ops(ops); - return 0; - -fail_event: - ceph_osdc_cancel_event(event); -fail: - rbd_destroy_ops(ops); - return ret; -} - /* * Synchronous osd object method call */ @@ -1761,52 +1692,6 @@ static int rbd_read_header(struct rbd_device *rbd_dev, return ret; } -/* - * create a snapshot - */ -static int rbd_header_add_snap(struct rbd_device *rbd_dev, - const char *snap_name, - gfp_t gfp_flags) -{ - int name_len = strlen(snap_name); - u64 new_snapid; - int ret; - void *data, *p, *e; - struct ceph_mon_client *monc; - - /* we should create a snapshot only if we're pointing at the head */ - if (rbd_dev->mapping.snap_id != CEPH_NOSNAP) - return -EINVAL; - - monc = &rbd_dev->rbd_client->client->monc; - ret = ceph_monc_create_snapid(monc, rbd_dev->pool_id, &new_snapid); - dout("created snapid=%llu\n", (unsigned long long) new_snapid); - if (ret < 0) - return ret; - - data = kmalloc(name_len + 16, gfp_flags); - if (!data) - return -ENOMEM; - - p = data; - e = data + name_len + 16; - - ceph_encode_string_safe(&p, e, snap_name, name_len, bad); - ceph_encode_64_safe(&p, e, new_snapid, bad); - - ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name, - "rbd", "snap_add", - data, (size_t) (p - data), NULL, 0, - CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK, - NULL); - - kfree(data); - - return ret < 0 ? ret : 0; -bad: - return -ERANGE; -} - static void __rbd_remove_all_snaps(struct rbd_device *rbd_dev) { struct rbd_snap *snap; @@ -2030,7 +1915,6 @@ static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL); static DEVICE_ATTR(image_id, S_IRUGO, rbd_image_id_show, NULL); static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh); static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL); -static DEVICE_ATTR(create_snap, S_IWUSR, NULL, rbd_snap_add); static struct attribute *rbd_attrs[] = { &dev_attr_size.attr, @@ -2042,7 +1926,6 @@ static struct attribute *rbd_attrs[] = { &dev_attr_image_id.attr, &dev_attr_current_snap.attr, &dev_attr_refresh.attr, - &dev_attr_create_snap.attr, NULL }; @@ -2891,47 +2774,6 @@ done: return ret; } -static ssize_t rbd_snap_add(struct device *dev, - struct device_attribute *attr, - const char *buf, - size_t count) -{ - struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); - int ret; - char *name = kmalloc(count + 1, GFP_KERNEL); - if (!name) - return -ENOMEM; - - snprintf(name, count, "%s", buf); - - mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); - - ret = rbd_header_add_snap(rbd_dev, - name, GFP_KERNEL); - if (ret < 0) - goto err_unlock; - - ret = __rbd_refresh_header(rbd_dev, NULL); - if (ret < 0) - goto err_unlock; - - /* shouldn't hold ctl_mutex when notifying.. notify might - trigger a watch callback that would need to get that mutex */ - mutex_unlock(&ctl_mutex); - - /* make a best effort, don't error if failed */ - rbd_req_sync_notify(rbd_dev); - - ret = count; - kfree(name); - return ret; - -err_unlock: - mutex_unlock(&ctl_mutex); - kfree(name); - return ret; -} - /* * create control files in sysfs * /sys/bus/rbd/... -- cgit v1.2.3-70-g09d2 From c8d184250d8a47b1a958affcffe3ffdd85644301 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Tue, 10 Jul 2012 20:30:11 -0500 Subject: rbd: don't use index in __rbd_add_snap_dev() Pass the snapshot id and snapshot size rather than an index to __rbd_add_snap_dev() to specify values for a new snapshot. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index e453f8cc8949..8ac193ff4849 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -2036,7 +2036,8 @@ static int rbd_register_snap_dev(struct rbd_snap *snap, } static struct rbd_snap *__rbd_add_snap_dev(struct rbd_device *rbd_dev, - int i, const char *name) + const char *snap_name, + u64 snap_id, u64 snap_size) { struct rbd_snap *snap; int ret; @@ -2046,12 +2047,12 @@ static struct rbd_snap *__rbd_add_snap_dev(struct rbd_device *rbd_dev, return ERR_PTR(-ENOMEM); ret = -ENOMEM; - snap->name = kstrdup(name, GFP_KERNEL); + snap->name = kstrdup(snap_name, GFP_KERNEL); if (!snap->name) goto err; - snap->size = rbd_dev->header.snap_sizes[i]; - snap->id = rbd_dev->header.snapc->snaps[i]; + snap->id = snap_id; + snap->size = snap_size; return snap; @@ -2116,12 +2117,13 @@ static int rbd_dev_snaps_update(struct rbd_device *rbd_dev) dout("entry %u: snap_id = %llu\n", (unsigned int) snap_count, (unsigned long long) snap_id); if (!snap || (snap_id != CEPH_NOSNAP && snap->id < snap_id)) { + struct rbd_image_header *header = &rbd_dev->header; struct rbd_snap *new_snap; /* We haven't seen this snapshot before */ - new_snap = __rbd_add_snap_dev(rbd_dev, index, - snap_name); + new_snap = __rbd_add_snap_dev(rbd_dev, snap_name, + snap_id, header->snap_sizes[index]); if (IS_ERR(new_snap)) { int err = PTR_ERR(new_snap); -- cgit v1.2.3-70-g09d2 From 34b131849feb359f183907b467e9aa4d652b1baa Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Fri, 13 Jul 2012 20:35:12 -0500 Subject: rbd: add an rbd features field Record the features values for each rbd image and each of its snapshots. This is really something that only becomes meaningful for version 2 images, so this is just putting in place code that will form common infrastructure. It may be useful to expand the sysfs entries--and therefore the information we maintain--for the image and for each snapshot. But I'm going to hold off doing that until we start making active use of the feature bits. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- Documentation/ABI/testing/sysfs-bus-rbd | 7 ++++++ drivers/block/rbd.c | 43 +++++++++++++++++++++++++++++++-- 2 files changed, 48 insertions(+), 2 deletions(-) diff --git a/Documentation/ABI/testing/sysfs-bus-rbd b/Documentation/ABI/testing/sysfs-bus-rbd index 6fe4224cc5bd..1cf2adf46b11 100644 --- a/Documentation/ABI/testing/sysfs-bus-rbd +++ b/Documentation/ABI/testing/sysfs-bus-rbd @@ -25,6 +25,10 @@ client_id The ceph unique client id that was assigned for this specific session. +features + + A hexadecimal encoding of the feature bits for this image. + major The block device major number. @@ -78,4 +82,7 @@ snap_size The size of the image when this snapshot was taken. +snap_features + + A hexadecimal encoding of the feature bits for this snapshot. diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 8ac193ff4849..463f8b264c6f 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -85,6 +85,7 @@ struct rbd_image_header { /* These four fields never change for a given rbd image */ char *object_prefix; + u64 features; __u8 obj_order; __u8 crypt_type; __u8 comp_type; @@ -148,12 +149,14 @@ struct rbd_snap { u64 size; struct list_head node; u64 id; + u64 features; }; struct rbd_mapping { char *snap_name; u64 snap_id; u64 size; + u64 features; bool snap_exists; bool read_only; }; @@ -590,6 +593,7 @@ static int rbd_header_from_disk(struct rbd_image_header *header, header->snap_sizes = NULL; } + header->features = 0; /* No features support in v1 images */ header->obj_order = ondisk->options.order; header->crypt_type = ondisk->options.crypt_type; header->comp_type = ondisk->options.comp_type; @@ -632,6 +636,7 @@ static int snap_by_name(struct rbd_device *rbd_dev, const char *snap_name) if (!strcmp(snap_name, snap->name)) { rbd_dev->mapping.snap_id = snap->id; rbd_dev->mapping.size = snap->size; + rbd_dev->mapping.features = snap->features; return 0; } @@ -648,6 +653,7 @@ static int rbd_dev_set_mapping(struct rbd_device *rbd_dev, char *snap_name) sizeof (RBD_SNAP_HEAD_NAME))) { rbd_dev->mapping.snap_id = CEPH_NOSNAP; rbd_dev->mapping.size = rbd_dev->header.image_size; + rbd_dev->mapping.features = rbd_dev->header.features; rbd_dev->mapping.snap_exists = false; rbd_dev->mapping.read_only = rbd_dev->rbd_opts.read_only; ret = 0; @@ -1835,6 +1841,19 @@ static ssize_t rbd_size_show(struct device *dev, return sprintf(buf, "%llu\n", (unsigned long long) size * SECTOR_SIZE); } +/* + * Note this shows the features for whatever's mapped, which is not + * necessarily the base image. + */ +static ssize_t rbd_features_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); + + return sprintf(buf, "0x%016llx\n", + (unsigned long long) rbd_dev->mapping.features); +} + static ssize_t rbd_major_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -1884,6 +1903,10 @@ static ssize_t rbd_image_id_show(struct device *dev, return sprintf(buf, "%s\n", rbd_dev->image_id); } +/* + * Shows the name of the currently-mapped snapshot (or + * RBD_SNAP_HEAD_NAME for the base image). + */ static ssize_t rbd_snap_show(struct device *dev, struct device_attribute *attr, char *buf) @@ -1907,6 +1930,7 @@ static ssize_t rbd_image_refresh(struct device *dev, } static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL); +static DEVICE_ATTR(features, S_IRUGO, rbd_features_show, NULL); static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL); static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL); static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL); @@ -1918,6 +1942,7 @@ static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL); static struct attribute *rbd_attrs[] = { &dev_attr_size.attr, + &dev_attr_features.attr, &dev_attr_major.attr, &dev_attr_client_id.attr, &dev_attr_pool.attr, @@ -1971,12 +1996,24 @@ static ssize_t rbd_snap_id_show(struct device *dev, return sprintf(buf, "%llu\n", (unsigned long long)snap->id); } +static ssize_t rbd_snap_features_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev); + + return sprintf(buf, "0x%016llx\n", + (unsigned long long) snap->features); +} + static DEVICE_ATTR(snap_size, S_IRUGO, rbd_snap_size_show, NULL); static DEVICE_ATTR(snap_id, S_IRUGO, rbd_snap_id_show, NULL); +static DEVICE_ATTR(snap_features, S_IRUGO, rbd_snap_features_show, NULL); static struct attribute *rbd_snap_attrs[] = { &dev_attr_snap_size.attr, &dev_attr_snap_id.attr, + &dev_attr_snap_features.attr, NULL, }; @@ -2037,7 +2074,8 @@ static int rbd_register_snap_dev(struct rbd_snap *snap, static struct rbd_snap *__rbd_add_snap_dev(struct rbd_device *rbd_dev, const char *snap_name, - u64 snap_id, u64 snap_size) + u64 snap_id, u64 snap_size, + u64 snap_features) { struct rbd_snap *snap; int ret; @@ -2053,6 +2091,7 @@ static struct rbd_snap *__rbd_add_snap_dev(struct rbd_device *rbd_dev, snap->id = snap_id; snap->size = snap_size; + snap->features = snap_features; return snap; @@ -2123,7 +2162,7 @@ static int rbd_dev_snaps_update(struct rbd_device *rbd_dev) /* We haven't seen this snapshot before */ new_snap = __rbd_add_snap_dev(rbd_dev, snap_name, - snap_id, header->snap_sizes[index]); + snap_id, header->snap_sizes[index], 0); if (IS_ERR(new_snap)) { int err = PTR_ERR(new_snap); -- cgit v1.2.3-70-g09d2 From cd892126c617b3837b6088bf6c097ad2def4de83 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Tue, 3 Jul 2012 16:01:19 -0500 Subject: rbd: encapsulate code that gets snapshot info Create a function that encapsulates looking up the name, size and features related to a given snapshot, which is indicated by its index in an rbd device's snapshot context array of snapshot ids. This interface will be used to hide differences between the format 1 and format 2 images. At the moment this (looking up the name anyway) is slightly less efficient than what's done currently, but we may be able to optimize this a bit later on by cacheing the last lookup if it proves to be a problem. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 36 ++++++++++++++++++++++++++++++------ 1 file changed, 30 insertions(+), 6 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 463f8b264c6f..366a3a1f2aac 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -2102,6 +2102,25 @@ err: return ERR_PTR(ret); } +static char *rbd_dev_v1_snap_info(struct rbd_device *rbd_dev, u32 which, + u64 *snap_size, u64 *snap_features) +{ + char *snap_name; + + rbd_assert(which < rbd_dev->header.snapc->num_snaps); + + *snap_size = rbd_dev->header.snap_sizes[which]; + *snap_features = 0; /* No features for v1 */ + + /* Skip over names until we find the one we are looking for */ + + snap_name = rbd_dev->header.snap_names; + while (which--) + snap_name += strlen(snap_name) + 1; + + return snap_name; +} + /* * Scan the rbd device's current snapshot list and compare it to the * newly-received snapshot context. Remove any existing snapshots @@ -2118,7 +2137,6 @@ static int rbd_dev_snaps_update(struct rbd_device *rbd_dev) { struct ceph_snap_context *snapc = rbd_dev->header.snapc; const u32 snap_count = snapc->num_snaps; - char *snap_name = rbd_dev->header.snap_names; struct list_head *head = &rbd_dev->snaps; struct list_head *links = head->next; u32 index = 0; @@ -2127,6 +2145,9 @@ static int rbd_dev_snaps_update(struct rbd_device *rbd_dev) while (index < snap_count || links != head) { u64 snap_id; struct rbd_snap *snap; + char *snap_name; + u64 snap_size = 0; + u64 snap_features = 0; snap_id = index < snap_count ? snapc->snaps[index] : CEPH_NOSNAP; @@ -2153,16 +2174,20 @@ static int rbd_dev_snaps_update(struct rbd_device *rbd_dev) continue; } + snap_name = rbd_dev_v1_snap_info(rbd_dev, index, + &snap_size, &snap_features); + if (IS_ERR(snap_name)) + return PTR_ERR(snap_name); + dout("entry %u: snap_id = %llu\n", (unsigned int) snap_count, (unsigned long long) snap_id); if (!snap || (snap_id != CEPH_NOSNAP && snap->id < snap_id)) { - struct rbd_image_header *header = &rbd_dev->header; struct rbd_snap *new_snap; /* We haven't seen this snapshot before */ new_snap = __rbd_add_snap_dev(rbd_dev, snap_name, - snap_id, header->snap_sizes[index], 0); + snap_id, snap_size, snap_features); if (IS_ERR(new_snap)) { int err = PTR_ERR(new_snap); @@ -2183,9 +2208,9 @@ static int rbd_dev_snaps_update(struct rbd_device *rbd_dev) dout(" already present\n"); - rbd_assert(snap->size == - rbd_dev->header.snap_sizes[index]); + rbd_assert(snap->size == snap_size); rbd_assert(!strcmp(snap->name, snap_name)); + rbd_assert(snap->features == snap_features); /* Done with this list entry; advance */ @@ -2195,7 +2220,6 @@ static int rbd_dev_snaps_update(struct rbd_device *rbd_dev) /* Advance to the next entry in the snapshot context */ index++; - snap_name += strlen(snap_name) + 1; } dout("%s: done\n", __func__); -- cgit v1.2.3-70-g09d2 From a30b71b999c92071befec73434f4e67fd4b4734b Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Tue, 10 Jul 2012 20:30:11 -0500 Subject: rbd: lay out header probe infrastructure This defines a new function rbd_dev_probe() as a top-level function for populating detailed information about an rbd device. It first checks for the existence of a format 2 rbd image id object. If it exists, the image is assumed to be a format 2 rbd image, and another function rbd_dev_v2() is called to finish populating header data for that image. If it does not exist, it is assumed to be an old (format 1) rbd image, and calls a similar function rbd_dev_v1() to populate its header information. A new field, rbd_dev->format, is defined to record which version of the rbd image format the device represents. For a valid mapped rbd device it will have one of two values, 1 or 2. So far, the format 2 images are not really supported; this is laying out the infrastructure for fleshing out that support. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 127 ++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 99 insertions(+), 28 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 366a3a1f2aac..3b284d53a566 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -170,6 +170,7 @@ struct rbd_device { int major; /* blkdev assigned major */ struct gendisk *disk; /* blkdev's gendisk and rq */ + u32 image_format; /* Either 1 or 2 */ struct rbd_options rbd_opts; struct rbd_client *rbd_client; @@ -507,6 +508,11 @@ static void rbd_coll_release(struct kref *kref) kfree(coll); } +static bool rbd_image_format_valid(u32 image_format) +{ + return image_format == 1 || image_format == 2; +} + static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk) { size_t size; @@ -2584,6 +2590,96 @@ out: return ret; } +static int rbd_dev_v1_probe(struct rbd_device *rbd_dev) +{ + int ret; + size_t size; + + /* Version 1 images have no id; empty string is used */ + + rbd_dev->image_id = kstrdup("", GFP_KERNEL); + if (!rbd_dev->image_id) + return -ENOMEM; + rbd_dev->image_id_len = 0; + + /* Record the header object name for this rbd image. */ + + size = rbd_dev->image_name_len + sizeof (RBD_SUFFIX); + rbd_dev->header_name = kmalloc(size, GFP_KERNEL); + if (!rbd_dev->header_name) { + ret = -ENOMEM; + goto out_err; + } + sprintf(rbd_dev->header_name, "%s%s", rbd_dev->image_name, RBD_SUFFIX); + + /* Populate rbd image metadata */ + + ret = rbd_read_header(rbd_dev, &rbd_dev->header); + if (ret < 0) + goto out_err; + rbd_dev->image_format = 1; + + dout("discovered version 1 image, header name is %s\n", + rbd_dev->header_name); + + return 0; + +out_err: + kfree(rbd_dev->header_name); + rbd_dev->header_name = NULL; + kfree(rbd_dev->image_id); + rbd_dev->image_id = NULL; + + return ret; +} + +static int rbd_dev_v2_probe(struct rbd_device *rbd_dev) +{ + size_t size; + + /* + * Image id was filled in by the caller. Record the header + * object name for this rbd image. + */ + size = sizeof (RBD_HEADER_PREFIX) + rbd_dev->image_id_len; + rbd_dev->header_name = kmalloc(size, GFP_KERNEL); + if (!rbd_dev->header_name) + return -ENOMEM; + sprintf(rbd_dev->header_name, "%s%s", + RBD_HEADER_PREFIX, rbd_dev->image_id); + rbd_dev->image_format = 2; + + dout("discovered version 2 image, header name is %s\n", + rbd_dev->header_name); + + return -ENOTSUPP; +} + +/* + * Probe for the existence of the header object for the given rbd + * device. For format 2 images this includes determining the image + * id. + */ +static int rbd_dev_probe(struct rbd_device *rbd_dev) +{ + int ret; + + /* + * Get the id from the image id object. If it's not a + * format 2 image, we'll get ENOENT back, and we'll assume + * it's a format 1 image. + */ + ret = rbd_dev_image_id(rbd_dev); + if (ret) + ret = rbd_dev_v1_probe(rbd_dev); + else + ret = rbd_dev_v2_probe(rbd_dev); + if (ret) + dout("probe failed, returning %d\n", ret); + + return ret; +} + static ssize_t rbd_add(struct bus_type *bus, const char *buf, size_t count) @@ -2631,35 +2727,10 @@ static ssize_t rbd_add(struct bus_type *bus, goto err_out_client; rbd_dev->pool_id = rc; - rc = rbd_dev_image_id(rbd_dev); - if (!rc) { - rc = -ENOTSUPP; /* Not actually supporting format 2 yet */ - goto err_out_client; - } - - /* Version 1 images have no id; empty string is used */ - - rbd_dev->image_id = kstrdup("", GFP_KERNEL); - if (!rbd_dev->image_id) { - rc = -ENOMEM; - goto err_out_client; - } - rbd_dev->image_id_len = 0; - - /* Create the name of the header object */ - - rbd_dev->header_name = kmalloc(rbd_dev->image_name_len - + sizeof (RBD_SUFFIX), - GFP_KERNEL); - if (!rbd_dev->header_name) - goto err_out_client; - sprintf(rbd_dev->header_name, "%s%s", rbd_dev->image_name, RBD_SUFFIX); - - /* Get information about the image being mapped */ - - rc = rbd_read_header(rbd_dev, &rbd_dev->header); - if (rc) + rc = rbd_dev_probe(rbd_dev); + if (rc < 0) goto err_out_client; + rbd_assert(rbd_image_format_valid(rbd_dev->image_format)); /* no need to lock here, as rbd_dev is not registered yet */ rc = rbd_dev_snaps_update(rbd_dev); -- cgit v1.2.3-70-g09d2 From 9d475de5d12af8ac4c2101807e0a889ac7389c5a Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Tue, 3 Jul 2012 16:01:19 -0500 Subject: rbd: add code to get the size of a v2 rbd image The size of an rbd format 2 image is fetched from the server using a "get_size" method. The same method is used for getting the size of a snapshot, so structure this addition with a generic helper routine that we can get this information for either. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 53 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 3b284d53a566..061c62496fea 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -2127,6 +2127,47 @@ static char *rbd_dev_v1_snap_info(struct rbd_device *rbd_dev, u32 which, return snap_name; } +/* + * Get the size and object order for an image snapshot, or if + * snap_id is CEPH_NOSNAP, gets this information for the base + * image. + */ +static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id, + u8 *order, u64 *snap_size) +{ + __le64 snapid = cpu_to_le64(snap_id); + int ret; + struct { + u8 order; + __le64 size; + } __attribute__ ((packed)) size_buf = { 0 }; + + ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name, + "rbd", "get_size", + (char *) &snapid, sizeof (snapid), + (char *) &size_buf, sizeof (size_buf), + CEPH_OSD_FLAG_READ, NULL); + dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret); + if (ret < 0) + return ret; + + *order = size_buf.order; + *snap_size = le64_to_cpu(size_buf.size); + + dout(" snap_id 0x%016llx order = %u, snap_size = %llu\n", + (unsigned long long) snap_id, (unsigned int) *order, + (unsigned long long) *snap_size); + + return 0; +} + +static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev) +{ + return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP, + &rbd_dev->header.obj_order, + &rbd_dev->header.image_size); +} + /* * Scan the rbd device's current snapshot list and compare it to the * newly-received snapshot context. Remove any existing snapshots @@ -2636,6 +2677,7 @@ out_err: static int rbd_dev_v2_probe(struct rbd_device *rbd_dev) { size_t size; + int ret; /* * Image id was filled in by the caller. Record the header @@ -2647,12 +2689,23 @@ static int rbd_dev_v2_probe(struct rbd_device *rbd_dev) return -ENOMEM; sprintf(rbd_dev->header_name, "%s%s", RBD_HEADER_PREFIX, rbd_dev->image_id); + + /* Get the size and object order for the image */ + + ret = rbd_dev_v2_image_size(rbd_dev); + if (ret < 0) + goto out_err; rbd_dev->image_format = 2; dout("discovered version 2 image, header name is %s\n", rbd_dev->header_name); return -ENOTSUPP; +out_err: + kfree(rbd_dev->header_name); + rbd_dev->header_name = NULL; + + return ret; } /* -- cgit v1.2.3-70-g09d2 From 1e1301998ee80d9a8cc09297906293f16f8a6064 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Tue, 3 Jul 2012 16:01:19 -0500 Subject: rbd: get the object prefix for a v2 rbd image The object prefix of an rbd format 2 image is fetched from the server using a "get_object_prefix" method. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 48 +++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 47 insertions(+), 1 deletion(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 061c62496fea..64a4dd5f6f2b 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -66,7 +66,8 @@ #define RBD_SNAP_HEAD_NAME "-" -#define RBD_IMAGE_ID_LEN_MAX 64 +#define RBD_IMAGE_ID_LEN_MAX 64 +#define RBD_OBJ_PREFIX_LEN_MAX 64 /* * An RBD device name will be "rbd#", where the "rbd" comes from @@ -2168,6 +2169,43 @@ static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev) &rbd_dev->header.image_size); } +static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev) +{ + void *reply_buf; + int ret; + void *p; + + reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL); + if (!reply_buf) + return -ENOMEM; + + ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name, + "rbd", "get_object_prefix", + NULL, 0, + reply_buf, RBD_OBJ_PREFIX_LEN_MAX, + CEPH_OSD_FLAG_READ, NULL); + dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret); + if (ret < 0) + goto out; + + p = reply_buf; + rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p, + p + RBD_OBJ_PREFIX_LEN_MAX, + NULL, GFP_NOIO); + + if (IS_ERR(rbd_dev->header.object_prefix)) { + ret = PTR_ERR(rbd_dev->header.object_prefix); + rbd_dev->header.object_prefix = NULL; + } else { + dout(" object_prefix = %s\n", rbd_dev->header.object_prefix); + } + +out: + kfree(reply_buf); + + return ret; +} + /* * Scan the rbd device's current snapshot list and compare it to the * newly-received snapshot context. Remove any existing snapshots @@ -2693,6 +2731,12 @@ static int rbd_dev_v2_probe(struct rbd_device *rbd_dev) /* Get the size and object order for the image */ ret = rbd_dev_v2_image_size(rbd_dev); + if (ret < 0) + goto out_err; + + /* Get the object prefix (a.k.a. block_name) for the image */ + + ret = rbd_dev_v2_object_prefix(rbd_dev); if (ret < 0) goto out_err; rbd_dev->image_format = 2; @@ -2704,6 +2748,8 @@ static int rbd_dev_v2_probe(struct rbd_device *rbd_dev) out_err: kfree(rbd_dev->header_name); rbd_dev->header_name = NULL; + kfree(rbd_dev->header.object_prefix); + rbd_dev->header.object_prefix = NULL; return ret; } -- cgit v1.2.3-70-g09d2 From b1b5402aa9c4a9aeb8431886e41b0a1d127318d1 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Tue, 3 Jul 2012 16:01:19 -0500 Subject: rbd: get image features for a v2 image The features values for an rbd format 2 image are fetched from the server using a "get_features" method. The same method is used for getting the features for a snapshot, so structure this addition with a generic helper routine that can get this information for either. The server will provide two 64-bit feature masks, one representing the features potentially in use for this image (or its snapshot), and one representing features that must be supported by the client in order to work with the image. For the time being, neither of these is really used so we keep things simple and just record the first feature vector. Once we start using these feature masks, what we record and what we expose to the user will most likely change. Signed-off-by: Alex Elder --- drivers/block/rbd.c | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 64a4dd5f6f2b..b8b8271bd9e2 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -2206,6 +2206,40 @@ out: return ret; } +static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id, + u64 *snap_features) +{ + __le64 snapid = cpu_to_le64(snap_id); + struct { + __le64 features; + __le64 incompat; + } features_buf = { 0 }; + int ret; + + ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name, + "rbd", "get_features", + (char *) &snapid, sizeof (snapid), + (char *) &features_buf, sizeof (features_buf), + CEPH_OSD_FLAG_READ, NULL); + dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret); + if (ret < 0) + return ret; + *snap_features = le64_to_cpu(features_buf.features); + + dout(" snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n", + (unsigned long long) snap_id, + (unsigned long long) *snap_features, + (unsigned long long) le64_to_cpu(features_buf.incompat)); + + return 0; +} + +static int rbd_dev_v2_features(struct rbd_device *rbd_dev) +{ + return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP, + &rbd_dev->header.features); +} + /* * Scan the rbd device's current snapshot list and compare it to the * newly-received snapshot context. Remove any existing snapshots @@ -2737,6 +2771,12 @@ static int rbd_dev_v2_probe(struct rbd_device *rbd_dev) /* Get the object prefix (a.k.a. block_name) for the image */ ret = rbd_dev_v2_object_prefix(rbd_dev); + if (ret < 0) + goto out_err; + + /* Get the features for the image */ + + ret = rbd_dev_v2_features(rbd_dev); if (ret < 0) goto out_err; rbd_dev->image_format = 2; -- cgit v1.2.3-70-g09d2 From 35d489f94651ac19be55661732a7ee15c6304a55 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Tue, 3 Jul 2012 16:01:19 -0500 Subject: rbd: get the snapshot context for a v2 image Fetch the snapshot context for an rbd format 2 image by calling the "get_snapcontext" method on its header object. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 85 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 85 insertions(+) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index b8b8271bd9e2..b673a8dc161d 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -62,6 +62,7 @@ #define RBD_MINORS_PER_MAJOR 256 /* max minors per blkdev */ #define RBD_MAX_SNAP_NAME_LEN 32 +#define RBD_MAX_SNAP_COUNT 510 /* allows max snapc to fit in 4KB */ #define RBD_MAX_OPT_LEN 1024 #define RBD_SNAP_HEAD_NAME "-" @@ -2240,6 +2241,84 @@ static int rbd_dev_v2_features(struct rbd_device *rbd_dev) &rbd_dev->header.features); } +static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev) +{ + size_t size; + int ret; + void *reply_buf; + void *p; + void *end; + u64 seq; + u32 snap_count; + struct ceph_snap_context *snapc; + u32 i; + + /* + * We'll need room for the seq value (maximum snapshot id), + * snapshot count, and array of that many snapshot ids. + * For now we have a fixed upper limit on the number we're + * prepared to receive. + */ + size = sizeof (__le64) + sizeof (__le32) + + RBD_MAX_SNAP_COUNT * sizeof (__le64); + reply_buf = kzalloc(size, GFP_KERNEL); + if (!reply_buf) + return -ENOMEM; + + ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name, + "rbd", "get_snapcontext", + NULL, 0, + reply_buf, size, + CEPH_OSD_FLAG_READ, NULL); + dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret); + if (ret < 0) + goto out; + + ret = -ERANGE; + p = reply_buf; + end = (char *) reply_buf + size; + ceph_decode_64_safe(&p, end, seq, out); + ceph_decode_32_safe(&p, end, snap_count, out); + + /* + * Make sure the reported number of snapshot ids wouldn't go + * beyond the end of our buffer. But before checking that, + * make sure the computed size of the snapshot context we + * allocate is representable in a size_t. + */ + if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context)) + / sizeof (u64)) { + ret = -EINVAL; + goto out; + } + if (!ceph_has_room(&p, end, snap_count * sizeof (__le64))) + goto out; + + size = sizeof (struct ceph_snap_context) + + snap_count * sizeof (snapc->snaps[0]); + snapc = kmalloc(size, GFP_KERNEL); + if (!snapc) { + ret = -ENOMEM; + goto out; + } + + atomic_set(&snapc->nref, 1); + snapc->seq = seq; + snapc->num_snaps = snap_count; + for (i = 0; i < snap_count; i++) + snapc->snaps[i] = ceph_decode_64(&p); + + rbd_dev->header.snapc = snapc; + + dout(" snap context seq = %llu, snap_count = %u\n", + (unsigned long long) seq, (unsigned int) snap_count); + +out: + kfree(reply_buf); + + return 0; +} + /* * Scan the rbd device's current snapshot list and compare it to the * newly-received snapshot context. Remove any existing snapshots @@ -2779,6 +2858,12 @@ static int rbd_dev_v2_probe(struct rbd_device *rbd_dev) ret = rbd_dev_v2_features(rbd_dev); if (ret < 0) goto out_err; + + /* Get the snapshot context */ + + ret = rbd_dev_v2_snap_context(rbd_dev); + if (ret) + goto out_err; rbd_dev->image_format = 2; dout("discovered version 2 image, header name is %s\n", -- cgit v1.2.3-70-g09d2 From b8b1e2db52de61f575981d0c23da785a7c5b4a77 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Tue, 3 Jul 2012 16:01:19 -0500 Subject: rbd: get snapshot name for a v2 image Define rbd_dev_v2_snap_name() to fetch the name for a particular snapshot in a format 2 rbd image. Define rbd_dev_v2_snap_info() to to be a wrapper for getting the name, size, and features for a particular snapshot, using an interface that matches the equivalent function for version 1 images. Define rbd_dev_snap_info() wrapper function and use it to call the appropriate function for getting the snapshot name, size, and features, dependent on the rbd image format. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 81 +++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 79 insertions(+), 2 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index b673a8dc161d..b51f1c997c1d 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -2319,6 +2319,83 @@ out: return 0; } +static char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, u32 which) +{ + size_t size; + void *reply_buf; + __le64 snap_id; + int ret; + void *p; + void *end; + size_t snap_name_len; + char *snap_name; + + size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN; + reply_buf = kmalloc(size, GFP_KERNEL); + if (!reply_buf) + return ERR_PTR(-ENOMEM); + + snap_id = cpu_to_le64(rbd_dev->header.snapc->snaps[which]); + ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name, + "rbd", "get_snapshot_name", + (char *) &snap_id, sizeof (snap_id), + reply_buf, size, + CEPH_OSD_FLAG_READ, NULL); + dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret); + if (ret < 0) + goto out; + + p = reply_buf; + end = (char *) reply_buf + size; + snap_name_len = 0; + snap_name = ceph_extract_encoded_string(&p, end, &snap_name_len, + GFP_KERNEL); + if (IS_ERR(snap_name)) { + ret = PTR_ERR(snap_name); + goto out; + } else { + dout(" snap_id 0x%016llx snap_name = %s\n", + (unsigned long long) le64_to_cpu(snap_id), snap_name); + } + kfree(reply_buf); + + return snap_name; +out: + kfree(reply_buf); + + return ERR_PTR(ret); +} + +static char *rbd_dev_v2_snap_info(struct rbd_device *rbd_dev, u32 which, + u64 *snap_size, u64 *snap_features) +{ + __le64 snap_id; + u8 order; + int ret; + + snap_id = rbd_dev->header.snapc->snaps[which]; + ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, &order, snap_size); + if (ret) + return ERR_PTR(ret); + ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, snap_features); + if (ret) + return ERR_PTR(ret); + + return rbd_dev_v2_snap_name(rbd_dev, which); +} + +static char *rbd_dev_snap_info(struct rbd_device *rbd_dev, u32 which, + u64 *snap_size, u64 *snap_features) +{ + if (rbd_dev->image_format == 1) + return rbd_dev_v1_snap_info(rbd_dev, which, + snap_size, snap_features); + if (rbd_dev->image_format == 2) + return rbd_dev_v2_snap_info(rbd_dev, which, + snap_size, snap_features); + return ERR_PTR(-EINVAL); +} + /* * Scan the rbd device's current snapshot list and compare it to the * newly-received snapshot context. Remove any existing snapshots @@ -2372,8 +2449,8 @@ static int rbd_dev_snaps_update(struct rbd_device *rbd_dev) continue; } - snap_name = rbd_dev_v1_snap_info(rbd_dev, index, - &snap_size, &snap_features); + snap_name = rbd_dev_snap_info(rbd_dev, index, + &snap_size, &snap_features); if (IS_ERR(snap_name)) return PTR_ERR(snap_name); -- cgit v1.2.3-70-g09d2 From 6e14b1a6c3b8d7e48ece68733d2dac0464611ee4 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Tue, 3 Jul 2012 16:01:19 -0500 Subject: rbd: update remaining header fields for v2 There are three fields that are not yet updated for format 2 rbd image headers: the version of the header object; the encryption type; and the compression type. There is no interface defined for fetching the latter two, so just initialize them explicitly to 0 for now. Change rbd_dev_v2_snap_context() so the caller can be supplied the version for the header object. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index b51f1c997c1d..2f1bef8c1d88 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -2241,7 +2241,7 @@ static int rbd_dev_v2_features(struct rbd_device *rbd_dev) &rbd_dev->header.features); } -static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev) +static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev, u64 *ver) { size_t size; int ret; @@ -2269,7 +2269,7 @@ static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev) "rbd", "get_snapcontext", NULL, 0, reply_buf, size, - CEPH_OSD_FLAG_READ, NULL); + CEPH_OSD_FLAG_READ, ver); dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret); if (ret < 0) goto out; @@ -2906,6 +2906,7 @@ static int rbd_dev_v2_probe(struct rbd_device *rbd_dev) { size_t size; int ret; + u64 ver = 0; /* * Image id was filled in by the caller. Record the header @@ -2936,11 +2937,18 @@ static int rbd_dev_v2_probe(struct rbd_device *rbd_dev) if (ret < 0) goto out_err; - /* Get the snapshot context */ + /* crypto and compression type aren't (yet) supported for v2 images */ + + rbd_dev->header.crypt_type = 0; + rbd_dev->header.comp_type = 0; - ret = rbd_dev_v2_snap_context(rbd_dev); + /* Get the snapshot context, plus the header version */ + + ret = rbd_dev_v2_snap_context(rbd_dev, &ver); if (ret) goto out_err; + rbd_dev->header.obj_version = ver; + rbd_dev->image_format = 2; dout("discovered version 2 image, header name is %s\n", -- cgit v1.2.3-70-g09d2 From 3e8f43a089f06279c5f76a9ccd42578eebf7bfa5 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Thu, 20 Sep 2012 17:42:25 +0800 Subject: ceph: Fix oops when handling mdsmap that decreases max_mds When i >= newmap->m_max_mds, ceph_mdsmap_get_addr(newmap, i) return NULL. Passing NULL to memcmp() triggers oops. Signed-off-by: Yan, Zheng Signed-off-by: Sage Weil --- fs/ceph/mds_client.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index a5a735422aa7..1bcf712655d9 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -2625,7 +2625,8 @@ static void check_new_map(struct ceph_mds_client *mdsc, ceph_mdsmap_is_laggy(newmap, i) ? " (laggy)" : "", session_state_name(s->s_state)); - if (memcmp(ceph_mdsmap_get_addr(oldmap, i), + if (i >= newmap->m_max_mds || + memcmp(ceph_mdsmap_get_addr(oldmap, i), ceph_mdsmap_get_addr(newmap, i), sizeof(struct ceph_entity_addr))) { if (s->s_state == CEPH_MDS_SESSION_OPENING) { -- cgit v1.2.3-70-g09d2 From b905a7f8b7a61c192927d0324f2ea6c998f451ba Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Fri, 28 Sep 2012 12:59:16 +0800 Subject: ceph: convert to use le32_add_cpu() Convert cpu_to_le32(le32_to_cpu(E1) + E2) to use le32_add_cpu(). dpatch engine is used to auto generate this patch. (https://github.com/weiyj/dpatch) Signed-off-by: Wei Yongjun Signed-off-by: Sage Weil --- fs/ceph/caps.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 620daad201db..3251e9cc6401 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -1005,7 +1005,7 @@ static void __queue_cap_release(struct ceph_mds_session *session, BUG_ON(msg->front.iov_len + sizeof(*item) > PAGE_CACHE_SIZE); head = msg->front.iov_base; - head->num = cpu_to_le32(le32_to_cpu(head->num) + 1); + le32_add_cpu(&head->num, 1); item = msg->front.iov_base + msg->front.iov_len; item->ino = cpu_to_le64(ino); item->cap_id = cpu_to_le64(cap_id); -- cgit v1.2.3-70-g09d2 From d63b77f4c552cc3a20506871046ab0fcbc332609 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Mon, 24 Sep 2012 20:59:48 -0700 Subject: libceph: check for invalid mapping If we encounter an invalid (e.g., zeroed) mapping, return an error and avoid a divide by zero. Signed-off-by: Sage Weil Reviewed-by: Alex Elder --- include/linux/ceph/osd_client.h | 2 +- include/linux/ceph/osdmap.h | 6 +++--- net/ceph/osd_client.c | 32 ++++++++++++++++++++------------ net/ceph/osdmap.c | 18 ++++++++++++++++-- 4 files changed, 40 insertions(+), 18 deletions(-) diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index cedfb1a8434a..d9b880e977e6 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -207,7 +207,7 @@ extern void ceph_osdc_handle_reply(struct ceph_osd_client *osdc, extern void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg); -extern void ceph_calc_raw_layout(struct ceph_osd_client *osdc, +extern int ceph_calc_raw_layout(struct ceph_osd_client *osdc, struct ceph_file_layout *layout, u64 snapid, u64 off, u64 *plen, u64 *bno, diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h index 311ef8d6aa9e..e88a620b9f8a 100644 --- a/include/linux/ceph/osdmap.h +++ b/include/linux/ceph/osdmap.h @@ -109,9 +109,9 @@ extern struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, extern void ceph_osdmap_destroy(struct ceph_osdmap *map); /* calculate mapping of a file extent to an object */ -extern void ceph_calc_file_object_mapping(struct ceph_file_layout *layout, - u64 off, u64 *plen, - u64 *bno, u64 *oxoff, u64 *oxlen); +extern int ceph_calc_file_object_mapping(struct ceph_file_layout *layout, + u64 off, u64 *plen, + u64 *bno, u64 *oxoff, u64 *oxlen); /* calculate mapping of object to a placement group */ extern int ceph_calc_object_layout(struct ceph_object_layout *ol, diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 42119c05e82c..f7b56e23988c 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -52,7 +52,7 @@ static int op_has_extent(int op) op == CEPH_OSD_OP_WRITE); } -void ceph_calc_raw_layout(struct ceph_osd_client *osdc, +int ceph_calc_raw_layout(struct ceph_osd_client *osdc, struct ceph_file_layout *layout, u64 snapid, u64 off, u64 *plen, u64 *bno, @@ -62,12 +62,15 @@ void ceph_calc_raw_layout(struct ceph_osd_client *osdc, struct ceph_osd_request_head *reqhead = req->r_request->front.iov_base; u64 orig_len = *plen; u64 objoff, objlen; /* extent in object */ + int r; reqhead->snapid = cpu_to_le64(snapid); /* object extent? */ - ceph_calc_file_object_mapping(layout, off, plen, bno, - &objoff, &objlen); + r = ceph_calc_file_object_mapping(layout, off, plen, bno, + &objoff, &objlen); + if (r < 0) + return r; if (*plen < orig_len) dout(" skipping last %llu, final file extent %llu~%llu\n", orig_len - *plen, off, *plen); @@ -83,7 +86,7 @@ void ceph_calc_raw_layout(struct ceph_osd_client *osdc, dout("calc_layout bno=%llx %llu~%llu (%d pages)\n", *bno, objoff, objlen, req->r_num_pages); - + return 0; } EXPORT_SYMBOL(ceph_calc_raw_layout); @@ -112,20 +115,25 @@ EXPORT_SYMBOL(ceph_calc_raw_layout); * * fill osd op in request message. */ -static void calc_layout(struct ceph_osd_client *osdc, - struct ceph_vino vino, - struct ceph_file_layout *layout, - u64 off, u64 *plen, - struct ceph_osd_request *req, - struct ceph_osd_req_op *op) +static int calc_layout(struct ceph_osd_client *osdc, + struct ceph_vino vino, + struct ceph_file_layout *layout, + u64 off, u64 *plen, + struct ceph_osd_request *req, + struct ceph_osd_req_op *op) { u64 bno; + int r; - ceph_calc_raw_layout(osdc, layout, vino.snap, off, - plen, &bno, req, op); + r = ceph_calc_raw_layout(osdc, layout, vino.snap, off, + plen, &bno, req, op); + if (r < 0) + return r; snprintf(req->r_oid, sizeof(req->r_oid), "%llx.%08llx", vino.ino, bno); req->r_oid_len = strlen(req->r_oid); + + return r; } /* diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index 3124b71a8883..5433fb0eb3c6 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -984,7 +984,7 @@ bad: * for now, we write only a single su, until we can * pass a stride back to the caller. */ -void ceph_calc_file_object_mapping(struct ceph_file_layout *layout, +int ceph_calc_file_object_mapping(struct ceph_file_layout *layout, u64 off, u64 *plen, u64 *ono, u64 *oxoff, u64 *oxlen) @@ -998,11 +998,17 @@ void ceph_calc_file_object_mapping(struct ceph_file_layout *layout, dout("mapping %llu~%llu osize %u fl_su %u\n", off, *plen, osize, su); + if (su == 0 || sc == 0) + goto invalid; su_per_object = osize / su; + if (su_per_object == 0) + goto invalid; dout("osize %u / su %u = su_per_object %u\n", osize, su, su_per_object); - BUG_ON((su & ~PAGE_MASK) != 0); + if ((su & ~PAGE_MASK) != 0) + goto invalid; + /* bl = *off / su; */ t = off; do_div(t, su); @@ -1030,6 +1036,14 @@ void ceph_calc_file_object_mapping(struct ceph_file_layout *layout, *plen = *oxlen; dout(" obj extent %llu~%llu\n", *oxoff, *oxlen); + return 0; + +invalid: + dout(" invalid layout\n"); + *ono = 0; + *oxoff = 0; + *oxlen = 0; + return -EINVAL; } EXPORT_SYMBOL(ceph_calc_file_object_mapping); -- cgit v1.2.3-70-g09d2 From 6816282dab3a72efe8c0d182c1bc2960d87f4322 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Mon, 24 Sep 2012 21:01:02 -0700 Subject: ceph: propagate layout error on osd request creation If we are creating an osd request and get an invalid layout, return an EINVAL to the caller. We switch up the return to have an error code instead of NULL implying -ENOMEM. Signed-off-by: Sage Weil Reviewed-by: Alex Elder --- fs/ceph/addr.c | 8 ++++---- fs/ceph/file.c | 4 ++-- net/ceph/osd_client.c | 15 +++++++++------ 3 files changed, 15 insertions(+), 12 deletions(-) diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 452e71a1b753..4469b63c9b7b 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -308,8 +308,8 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max) NULL, 0, ci->i_truncate_seq, ci->i_truncate_size, NULL, false, 1, 0); - if (!req) - return -ENOMEM; + if (IS_ERR(req)) + return PTR_ERR(req); /* build page vector */ nr_pages = len >> PAGE_CACHE_SHIFT; @@ -832,8 +832,8 @@ get_more_pages: ci->i_truncate_size, &inode->i_mtime, true, 1, 0); - if (!req) { - rc = -ENOMEM; + if (IS_ERR(req)) { + rc = PTR_ERR(req); unlock_page(page); break; } diff --git a/fs/ceph/file.c b/fs/ceph/file.c index ecebbc09bfc7..5840d2aaed15 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -536,8 +536,8 @@ more: do_sync, ci->i_truncate_seq, ci->i_truncate_size, &mtime, false, 2, page_align); - if (!req) - return -ENOMEM; + if (IS_ERR(req)) + return PTR_ERR(req); if (file->f_flags & O_DIRECT) { pages = ceph_get_direct_page_vector(data, num_pages, false); diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index f7b56e23988c..ccbdfbba9e53 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -464,6 +464,7 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, { struct ceph_osd_req_op ops[3]; struct ceph_osd_request *req; + int r; ops[0].op = opcode; ops[0].extent.truncate_seq = truncate_seq; @@ -482,10 +483,12 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, use_mempool, GFP_NOFS, NULL, NULL); if (!req) - return NULL; + return ERR_PTR(-ENOMEM); /* calculate max write size */ - calc_layout(osdc, vino, layout, off, plen, req, ops); + r = calc_layout(osdc, vino, layout, off, plen, req, ops); + if (r < 0) + return ERR_PTR(r); req->r_file_layout = *layout; /* keep a copy */ /* in case it differs from natural (file) alignment that @@ -1928,8 +1931,8 @@ int ceph_osdc_readpages(struct ceph_osd_client *osdc, CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ, NULL, 0, truncate_seq, truncate_size, NULL, false, 1, page_align); - if (!req) - return -ENOMEM; + if (IS_ERR(req)) + return PTR_ERR(req); /* it may be a short read due to an object boundary */ req->r_pages = pages; @@ -1971,8 +1974,8 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino, snapc, do_sync, truncate_seq, truncate_size, mtime, nofail, 1, page_align); - if (!req) - return -ENOMEM; + if (IS_ERR(req)) + return PTR_ERR(req); /* it may be a short write due to an object boundary */ req->r_pages = pages; -- cgit v1.2.3-70-g09d2 From 6cae3717cddaf8e5e96e304733dca66e40d56f89 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Mon, 24 Sep 2012 21:02:47 -0700 Subject: rbd: BUG on invalid layout This shouldn't actually be possible because the layout struct is constructed from the RBD header and validated then. [elder@inktank.com: converted BUG() call to equivalent rbd_assert()] Signed-off-by: Sage Weil Reviewed-by: Alex Elder --- drivers/block/rbd.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 2f1bef8c1d88..bb3d9be3b1b4 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1020,8 +1020,9 @@ static int rbd_do_request(struct request *rq, layout->fl_stripe_count = cpu_to_le32(1); layout->fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER); layout->fl_pg_pool = cpu_to_le32(rbd_dev->pool_id); - ceph_calc_raw_layout(osdc, layout, snapid, ofs, &len, &bno, - req, ops); + ret = ceph_calc_raw_layout(osdc, layout, snapid, ofs, &len, &bno, + req, ops); + rbd_assert(ret == 0); ceph_osdc_build_request(req, ofs, &len, ops, -- cgit v1.2.3-70-g09d2 From 457712a0bc5389b75d2c93840a684fd77df2aabb Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Mon, 24 Sep 2012 21:04:57 -0700 Subject: ceph: return EIO on invalid layout on GET_DATALOC ioctl If the user calls GET_DATALOC on a file with an invalid (e.g., zeroed) layout, return EIO to userland. Signed-off-by: Sage Weil Reviewed-by: Alex Elder --- fs/ceph/ioctl.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c index 1396ceb46797..36549a46e311 100644 --- a/fs/ceph/ioctl.c +++ b/fs/ceph/ioctl.c @@ -187,14 +187,18 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg) u64 tmp; struct ceph_object_layout ol; struct ceph_pg pgid; + int r; /* copy and validate */ if (copy_from_user(&dl, arg, sizeof(dl))) return -EFAULT; down_read(&osdc->map_sem); - ceph_calc_file_object_mapping(&ci->i_layout, dl.file_offset, &len, - &dl.object_no, &dl.object_offset, &olen); + r = ceph_calc_file_object_mapping(&ci->i_layout, dl.file_offset, &len, + &dl.object_no, &dl.object_offset, + &olen); + if (r < 0) + return -EIO; dl.file_offset -= dl.object_offset; dl.object_size = ceph_file_layout_object_size(ci->i_layout); dl.block_size = ceph_file_layout_su(ci->i_layout); -- cgit v1.2.3-70-g09d2 From 6285bc231277419255f3498d3eb5ddc9f8e7fe79 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Tue, 2 Oct 2012 10:25:51 -0500 Subject: ceph: avoid 32-bit page index overflow A pgoff_t is defined (by default) to have type (unsigned long). On architectures such as i686 that's a 32-bit type. The ceph address space code was attempting to produce 64 bit offsets by shifting a page's index by PAGE_CACHE_SHIFT, but the result was not what was desired because the shift occurred before the result got promoted to 64 bits. Fix this by converting all uses of page->index used in this way to use the page_offset() macro, which ensures the 64-bit result has the intended value. This fixes http://tracker.newdream.net/issues/3112 Reported-by: Mohamed Pakkeer Signed-off-by: Alex Elder Reviewed-by: Sage Weil --- fs/ceph/addr.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 4469b63c9b7b..22b6e4583faa 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -205,7 +205,7 @@ static int readpage_nounlock(struct file *filp, struct page *page) dout("readpage inode %p file %p page %p index %lu\n", inode, filp, page, page->index); err = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout, - page->index << PAGE_CACHE_SHIFT, &len, + (u64) page_offset(page), &len, ci->i_truncate_seq, ci->i_truncate_size, &page, 1, 0); if (err == -ENOENT) @@ -286,7 +286,7 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max) int nr_pages = 0; int ret; - off = page->index << PAGE_CACHE_SHIFT; + off = (u64) page_offset(page); /* count pages */ next_index = page->index; @@ -426,7 +426,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) struct ceph_inode_info *ci; struct ceph_fs_client *fsc; struct ceph_osd_client *osdc; - loff_t page_off = page->index << PAGE_CACHE_SHIFT; + loff_t page_off = page_offset(page); int len = PAGE_CACHE_SIZE; loff_t i_size; int err = 0; @@ -817,8 +817,7 @@ get_more_pages: /* ok */ if (locked_pages == 0) { /* prepare async write request */ - offset = (unsigned long long)page->index - << PAGE_CACHE_SHIFT; + offset = (u64) page_offset(page); len = wsize; req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, @@ -1180,7 +1179,7 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) struct inode *inode = vma->vm_file->f_dentry->d_inode; struct page *page = vmf->page; struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; - loff_t off = page->index << PAGE_CACHE_SHIFT; + loff_t off = page_offset(page); loff_t size, len; int ret; -- cgit v1.2.3-70-g09d2