diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2020-12-17 11:53:52 -0800 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2020-12-17 11:53:52 -0800 |
commit | be695ee29e8fc0af266d9f1882868c47da01a790 (patch) | |
tree | 085cca4c1a124751d18cd2a06b2fe157daf67e40 /fs | |
parent | 92dbc9dedccb9759c7f9f2f0ae6242396376988f (diff) | |
parent | 2f0df6cfa325d7106b8a65bc0e02db1086e3f73b (diff) |
Merge tag 'ceph-for-5.11-rc1' of git://github.com/ceph/ceph-client
Pull ceph updates from Ilya Dryomov:
"The big ticket item here is support for msgr2 on-wire protocol, which
adds the option of full in-transit encryption using the AES-GCM algorithm
(myself).
On top of that, we have a series to avoid intermittent errors during
recovery with recover_session=clean and some MDS request encoding work
from Jeff; a cap handling fix and assorted observability improvements
from Luis and Xiubo; and a good number of cleanups.
Luis also ran into a corner case with quotas, which sadly means that we
are back to denying cross-quota-realm renames"
* tag 'ceph-for-5.11-rc1' of git://github.com/ceph/ceph-client: (59 commits)
libceph: drop ceph_auth_{create,update}_authorizer()
libceph, ceph: make use of __ceph_auth_get_authorizer() in msgr1
libceph, ceph: implement msgr2.1 protocol (crc and secure modes)
libceph: introduce connection modes and ms_mode option
libceph, rbd: ignore addr->type while comparing in some cases
libceph, ceph: get and handle cluster maps with addrvecs
libceph: factor out finish_auth()
libceph: drop ac->ops->name field
libceph: amend cephx init_protocol() and build_request()
libceph, ceph: incorporate nautilus cephx changes
libceph: safer en/decoding of cephx requests and replies
libceph: more insight into ticket expiry and invalidation
libceph: move msgr1 protocol specific fields to its own struct
libceph: move msgr1 protocol implementation to its own file
libceph: separate msgr1 protocol implementation
libceph: export remaining protocol independent infrastructure
libceph: export zero_page
libceph: rename and export con->flags bits
libceph: rename and export con->state states
libceph: make con->state an int
...
Diffstat (limited to 'fs')
-rw-r--r-- | fs/ceph/addr.c | 6 | ||||
-rw-r--r-- | fs/ceph/caps.c | 27 | ||||
-rw-r--r-- | fs/ceph/debugfs.c | 20 | ||||
-rw-r--r-- | fs/ceph/dir.c | 9 | ||||
-rw-r--r-- | fs/ceph/inode.c | 41 | ||||
-rw-r--r-- | fs/ceph/locks.c | 8 | ||||
-rw-r--r-- | fs/ceph/mds_client.c | 280 | ||||
-rw-r--r-- | fs/ceph/mds_client.h | 3 | ||||
-rw-r--r-- | fs/ceph/mdsmap.c | 25 | ||||
-rw-r--r-- | fs/ceph/metric.c | 18 | ||||
-rw-r--r-- | fs/ceph/metric.h | 14 | ||||
-rw-r--r-- | fs/ceph/quota.c | 58 | ||||
-rw-r--r-- | fs/ceph/super.c | 14 | ||||
-rw-r--r-- | fs/ceph/super.h | 7 | ||||
-rw-r--r-- | fs/ceph/xattr.c | 81 |
15 files changed, 410 insertions, 201 deletions
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 35c83f65475b..950552944436 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -840,7 +840,7 @@ static int ceph_writepages_start(struct address_space *mapping, wbc->sync_mode == WB_SYNC_NONE ? "NONE" : (wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD")); - if (READ_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) { + if (READ_ONCE(fsc->mount_state) >= CEPH_MOUNT_SHUTDOWN) { if (ci->i_wrbuffer_ref > 0) { pr_warn_ratelimited( "writepage_start %p %lld forced umount\n", @@ -1264,7 +1264,7 @@ ceph_find_incompatible(struct page *page) struct ceph_fs_client *fsc = ceph_inode_to_client(inode); struct ceph_inode_info *ci = ceph_inode(inode); - if (READ_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) { + if (READ_ONCE(fsc->mount_state) >= CEPH_MOUNT_SHUTDOWN) { dout(" page %p forced umount\n", page); return ERR_PTR(-EIO); } @@ -1321,7 +1321,7 @@ static int ceph_write_begin(struct file *file, struct address_space *mapping, dout("write_begin file %p inode %p page %p %d~%d\n", file, inode, page, (int)pos, (int)len); for (;;) { - page = grab_cache_page_write_begin(mapping, index, 0); + page = grab_cache_page_write_begin(mapping, index, flags); if (!page) { r = -ENOMEM; break; diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index ded4229c314a..255a512f1277 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -1140,16 +1140,24 @@ void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release) { struct ceph_mds_session *session = cap->session; struct ceph_inode_info *ci = cap->ci; - struct ceph_mds_client *mdsc = - ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc; + struct ceph_mds_client *mdsc; int removed = 0; + /* 'ci' being NULL means the remove have already occurred */ + if (!ci) { + dout("%s: cap inode is NULL\n", __func__); + return; + } + dout("__ceph_remove_cap %p from %p\n", cap, &ci->vfs_inode); + mdsc = ceph_inode_to_client(&ci->vfs_inode)->mdsc; + /* remove from inode's cap rbtree, and clear auth cap */ rb_erase(&cap->ci_node, 
&ci->i_caps); if (ci->i_auth_cap == cap) { - WARN_ON_ONCE(!list_empty(&ci->i_dirty_item)); + WARN_ON_ONCE(!list_empty(&ci->i_dirty_item) && + !mdsc->fsc->blocklisted); ci->i_auth_cap = NULL; } @@ -2746,7 +2754,7 @@ again: goto out_unlock; } - if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) { + if (READ_ONCE(mdsc->fsc->mount_state) >= CEPH_MOUNT_SHUTDOWN) { dout("get_cap_refs %p forced umount\n", inode); ret = -EIO; goto out_unlock; @@ -4027,15 +4035,13 @@ void ceph_handle_caps(struct ceph_mds_session *session, } if (msg_version >= 8) { - u64 flush_tid; - u32 caller_uid, caller_gid; u32 pool_ns_len; /* version >= 6 */ - ceph_decode_64_safe(&p, end, flush_tid, bad); + ceph_decode_skip_64(&p, end, bad); // flush_tid /* version >= 7 */ - ceph_decode_32_safe(&p, end, caller_uid, bad); - ceph_decode_32_safe(&p, end, caller_gid, bad); + ceph_decode_skip_32(&p, end, bad); // caller_uid + ceph_decode_skip_32(&p, end, bad); // caller_gid /* version >= 8 */ ceph_decode_32_safe(&p, end, pool_ns_len, bad); if (pool_ns_len > 0) { @@ -4058,9 +4064,8 @@ void ceph_handle_caps(struct ceph_mds_session *session, } if (msg_version >= 11) { - u32 flags; /* version >= 10 */ - ceph_decode_32_safe(&p, end, flags, bad); + ceph_decode_skip_32(&p, end, bad); // flags /* version >= 11 */ extra_info.dirstat_valid = true; ceph_decode_64_safe(&p, end, extra_info.nfiles, bad); diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c index 7a8fbe3e4751..66989c880adb 100644 --- a/fs/ceph/debugfs.c +++ b/fs/ceph/debugfs.c @@ -304,11 +304,25 @@ static int mds_sessions_show(struct seq_file *s, void *ptr) return 0; } +static int status_show(struct seq_file *s, void *p) +{ + struct ceph_fs_client *fsc = s->private; + struct ceph_entity_inst *inst = &fsc->client->msgr.inst; + struct ceph_entity_addr *client_addr = ceph_client_addr(fsc->client); + + seq_printf(s, "instance: %s.%lld %s/%u\n", ENTITY_NAME(inst->name), + ceph_pr_addr(client_addr), le32_to_cpu(client_addr->nonce)); + seq_printf(s, 
"blocklisted: %s\n", fsc->blocklisted ? "true" : "false"); + + return 0; +} + DEFINE_SHOW_ATTRIBUTE(mdsmap); DEFINE_SHOW_ATTRIBUTE(mdsc); DEFINE_SHOW_ATTRIBUTE(caps); DEFINE_SHOW_ATTRIBUTE(mds_sessions); DEFINE_SHOW_ATTRIBUTE(metric); +DEFINE_SHOW_ATTRIBUTE(status); /* @@ -394,6 +408,12 @@ void ceph_fs_debugfs_init(struct ceph_fs_client *fsc) fsc->client->debugfs_dir, fsc, &caps_fops); + + fsc->debugfs_status = debugfs_create_file("status", + 0400, + fsc->client->debugfs_dir, + fsc, + &status_fops); } diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index a4d48370b2b3..858ee7362ff5 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -1202,12 +1202,11 @@ static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry, op = CEPH_MDS_OP_RENAMESNAP; else return -EROFS; - } else if (old_dir != new_dir) { - err = ceph_quota_check_rename(mdsc, d_inode(old_dentry), - new_dir); - if (err) - return err; } + /* don't allow cross-quota renames */ + if ((old_dir != new_dir) && + (!ceph_quota_is_same_realm(old_dir, new_dir))) + return -EXDEV; dout("rename dir %p dentry %p to dir %p dentry %p\n", old_dir, old_dentry, new_dir, new_dentry); diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 526faf4778ce..adc8fc3c5d85 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -1315,15 +1315,10 @@ retry_lookup: } if (rinfo->head->is_target) { - tvino.ino = le64_to_cpu(rinfo->targeti.in->ino); - tvino.snap = le64_to_cpu(rinfo->targeti.in->snapid); - - in = ceph_get_inode(sb, tvino); - if (IS_ERR(in)) { - err = PTR_ERR(in); - goto done; - } + /* Should be filled in by handle_reply */ + BUG_ON(!req->r_target_inode); + in = req->r_target_inode; err = ceph_fill_inode(in, req->r_locked_page, &rinfo->targeti, NULL, session, (!test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags) && @@ -1333,11 +1328,13 @@ retry_lookup: if (err < 0) { pr_err("ceph_fill_inode badness %p %llx.%llx\n", in, ceph_vinop(in)); + req->r_target_inode = NULL; if (in->i_state & I_NEW) discard_new_inode(in); + else + iput(in); 
goto done; } - req->r_target_inode = in; if (in->i_state & I_NEW) unlock_new_inode(in); } @@ -1597,8 +1594,7 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req, struct dentry *dn; struct inode *in; int err = 0, skipped = 0, ret, i; - struct ceph_mds_request_head *rhead = req->r_request->front.iov_base; - u32 frag = le32_to_cpu(rhead->args.readdir.frag); + u32 frag = le32_to_cpu(req->r_args.readdir.frag); u32 last_hash = 0; u32 fpos_offset; struct ceph_readdir_cache_control cache_ctl = {}; @@ -1615,7 +1611,7 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req, } else if (rinfo->offset_hash) { /* mds understands offset_hash */ WARN_ON_ONCE(req->r_readdir_offset != 2); - last_hash = le32_to_cpu(rhead->args.readdir.offset_hash); + last_hash = le32_to_cpu(req->r_args.readdir.offset_hash); } } @@ -1888,7 +1884,7 @@ static void ceph_do_invalidate_pages(struct inode *inode) mutex_lock(&ci->i_truncate_mutex); - if (READ_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) { + if (READ_ONCE(fsc->mount_state) >= CEPH_MOUNT_SHUTDOWN) { pr_warn_ratelimited("invalidate_pages %p %lld forced umount\n", inode, ceph_ino(inode)); mapping_set_error(inode->i_mapping, -EIO); @@ -2340,15 +2336,23 @@ int ceph_permission(struct inode *inode, int mask) } /* Craft a mask of needed caps given a set of requested statx attrs. */ -static int statx_to_caps(u32 want) +static int statx_to_caps(u32 want, umode_t mode) { int mask = 0; if (want & (STATX_MODE|STATX_UID|STATX_GID|STATX_CTIME|STATX_BTIME)) mask |= CEPH_CAP_AUTH_SHARED; - if (want & (STATX_NLINK|STATX_CTIME)) - mask |= CEPH_CAP_LINK_SHARED; + if (want & (STATX_NLINK|STATX_CTIME)) { + /* + * The link count for directories depends on inode->i_subdirs, + * and that is only updated when Fs caps are held. 
+ */ + if (S_ISDIR(mode)) + mask |= CEPH_CAP_FILE_SHARED; + else + mask |= CEPH_CAP_LINK_SHARED; + } if (want & (STATX_ATIME|STATX_MTIME|STATX_CTIME|STATX_SIZE| STATX_BLOCKS)) @@ -2374,8 +2378,9 @@ int ceph_getattr(const struct path *path, struct kstat *stat, /* Skip the getattr altogether if we're asked not to sync */ if (!(flags & AT_STATX_DONT_SYNC)) { - err = ceph_do_getattr(inode, statx_to_caps(request_mask), - flags & AT_STATX_FORCE_SYNC); + err = ceph_do_getattr(inode, + statx_to_caps(request_mask, inode->i_mode), + flags & AT_STATX_FORCE_SYNC); if (err) return err; } diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c index 048a435a29be..fa8a847743d0 100644 --- a/fs/ceph/locks.c +++ b/fs/ceph/locks.c @@ -57,7 +57,7 @@ static const struct file_lock_operations ceph_fl_lock_ops = { .fl_release_private = ceph_fl_release_lock, }; -/** +/* * Implement fcntl and flock locking functions. */ static int ceph_lock_message(u8 lock_type, u16 operation, struct inode *inode, @@ -225,7 +225,7 @@ static int try_unlock_file(struct file *file, struct file_lock *fl) return 1; } -/** +/* * Attempt to set an fcntl lock. * For now, this just goes away to the server. Later it may be more awesome. */ @@ -408,7 +408,7 @@ static int lock_to_ceph_filelock(struct file_lock *lock, return err; } -/** +/* * Encode the flock and fcntl locks for the given inode into the ceph_filelock * array. Must be called with inode->i_lock already held. * If we encounter more of a specific lock type than expected, return -ENOSPC. @@ -458,7 +458,7 @@ fail: return err; } -/** +/* * Copy the encoded flock and fcntl locks into the pagelist. * Format is: #fcntl locks, sequential fcntl locks, #flock locks, * sequential flock locks. diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 8f1d7500a7ec..98c15ff2e599 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -516,13 +516,9 @@ static int parse_reply_info_create(void **p, void *end, /* Malformed reply? 
*/ info->has_create_ino = false; } else if (test_bit(CEPHFS_FEATURE_DELEG_INO, &s->s_features)) { - u8 struct_v, struct_compat; - u32 len; - info->has_create_ino = true; - ceph_decode_8_safe(p, end, struct_v, bad); - ceph_decode_8_safe(p, end, struct_compat, bad); - ceph_decode_32_safe(p, end, len, bad); + /* struct_v, struct_compat, and len */ + ceph_decode_skip_n(p, end, 2 + sizeof(u32), bad); ceph_decode_64_safe(p, end, info->ino, bad); ret = ceph_parse_deleg_inos(p, end, s); if (ret) @@ -837,6 +833,7 @@ void ceph_mdsc_release_request(struct kref *kref) } kfree(req->r_path1); kfree(req->r_path2); + put_cred(req->r_cred); if (req->r_pagelist) ceph_pagelist_release(req->r_pagelist); put_request_session(req); @@ -892,8 +889,7 @@ static void __register_request(struct ceph_mds_client *mdsc, ceph_mdsc_get_request(req); insert_request(&mdsc->request_tree, req); - req->r_uid = current_fsuid(); - req->r_gid = current_fsgid(); + req->r_cred = get_current_cred(); if (mdsc->oldest_tid == 0 && req->r_op != CEPH_MDS_OP_SETFILELOCK) mdsc->oldest_tid = req->r_tid; @@ -1243,7 +1239,7 @@ static struct ceph_msg *create_session_open_msg(struct ceph_mds_client *mdsc, u6 { struct ceph_msg *msg; struct ceph_mds_session_head *h; - int i = -1; + int i; int extra_bytes = 0; int metadata_key_count = 0; struct ceph_options *opt = mdsc->fsc->client->options; @@ -1595,7 +1591,7 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, struct ceph_cap_flush *cf; struct ceph_mds_client *mdsc = fsc->mdsc; - if (READ_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) { + if (READ_ONCE(fsc->mount_state) >= CEPH_MOUNT_SHUTDOWN) { if (inode->i_data.nrpages > 0) invalidate = true; if (ci->i_wrbuffer_ref > 0) @@ -2482,21 +2478,24 @@ static int set_request_path_attr(struct inode *rinode, struct dentry *rdentry, /* * called under mdsc->mutex */ -static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc, +static struct ceph_msg *create_request_message(struct 
ceph_mds_session *session, struct ceph_mds_request *req, - int mds, bool drop_cap_releases) + bool drop_cap_releases) { + int mds = session->s_mds; + struct ceph_mds_client *mdsc = session->s_mdsc; struct ceph_msg *msg; - struct ceph_mds_request_head *head; + struct ceph_mds_request_head_old *head; const char *path1 = NULL; const char *path2 = NULL; u64 ino1 = 0, ino2 = 0; int pathlen1 = 0, pathlen2 = 0; bool freepath1 = false, freepath2 = false; - int len; + int len, i; u16 releases; void *p, *end; int ret; + bool legacy = !(session->s_con.peer_features & CEPH_FEATURE_FS_BTIME); ret = set_request_path_attr(req->r_inode, req->r_dentry, req->r_parent, req->r_path1, req->r_ino1.ino, @@ -2518,14 +2517,23 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc, goto out_free1; } - len = sizeof(*head) + - pathlen1 + pathlen2 + 2*(1 + sizeof(u32) + sizeof(u64)) + + if (legacy) { + /* Old style */ + len = sizeof(*head); + } else { + /* New style: add gid_list and any later fields */ + len = sizeof(struct ceph_mds_request_head) + sizeof(u32) + + (sizeof(u64) * req->r_cred->group_info->ngroups); + } + + len += pathlen1 + pathlen2 + 2*(1 + sizeof(u32) + sizeof(u64)) + sizeof(struct ceph_timespec); /* calculate (max) length for cap releases */ len += sizeof(struct ceph_mds_request_release) * (!!req->r_inode_drop + !!req->r_dentry_drop + !!req->r_old_inode_drop + !!req->r_old_dentry_drop); + if (req->r_dentry_drop) len += pathlen1; if (req->r_old_dentry_drop) @@ -2537,17 +2545,33 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc, goto out_free2; } - msg->hdr.version = cpu_to_le16(2); msg->hdr.tid = cpu_to_le64(req->r_tid); - head = msg->front.iov_base; - p = msg->front.iov_base + sizeof(*head); + /* + * The old ceph_mds_request_header didn't contain a version field, and + * one was added when we moved the message version from 3->4. 
+ */ + if (legacy) { + msg->hdr.version = cpu_to_le16(3); + head = msg->front.iov_base; + p = msg->front.iov_base + sizeof(*head); + } else { + struct ceph_mds_request_head *new_head = msg->front.iov_base; + + msg->hdr.version = cpu_to_le16(4); + new_head->version = cpu_to_le16(CEPH_MDS_REQUEST_HEAD_VERSION); + head = (struct ceph_mds_request_head_old *)&new_head->oldest_client_tid; + p = msg->front.iov_base + sizeof(*new_head); + } + end = msg->front.iov_base + msg->front.iov_len; head->mdsmap_epoch = cpu_to_le32(mdsc->mdsmap->m_epoch); head->op = cpu_to_le32(req->r_op); - head->caller_uid = cpu_to_le32(from_kuid(&init_user_ns, req->r_uid)); - head->caller_gid = cpu_to_le32(from_kgid(&init_user_ns, req->r_gid)); + head->caller_uid = cpu_to_le32(from_kuid(&init_user_ns, + req->r_cred->fsuid)); + head->caller_gid = cpu_to_le32(from_kgid(&init_user_ns, + req->r_cred->fsgid)); head->ino = cpu_to_le64(req->r_deleg_ino); head->args = req->r_args; @@ -2592,6 +2616,14 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc, ceph_encode_copy(&p, &ts, sizeof(ts)); } + /* gid list */ + if (!legacy) { + ceph_encode_32(&p, req->r_cred->group_info->ngroups); + for (i = 0; i < req->r_cred->group_info->ngroups; i++) + ceph_encode_64(&p, from_kgid(&init_user_ns, + req->r_cred->group_info->gid[i])); + } + if (WARN_ON_ONCE(p > end)) { ceph_msg_put(msg); msg = ERR_PTR(-ERANGE); @@ -2635,14 +2667,28 @@ static void complete_request(struct ceph_mds_client *mdsc, complete_all(&req->r_completion); } +static struct ceph_mds_request_head_old * +find_old_request_head(void *p, u64 features) +{ + bool legacy = !(features & CEPH_FEATURE_FS_BTIME); + struct ceph_mds_request_head *new_head; + + if (legacy) + return (struct ceph_mds_request_head_old *)p; + new_head = (struct ceph_mds_request_head *)p; + return (struct ceph_mds_request_head_old *)&new_head->oldest_client_tid; +} + /* * called under mdsc->mutex */ -static int __prepare_send_request(struct ceph_mds_client 
*mdsc, +static int __prepare_send_request(struct ceph_mds_session *session, struct ceph_mds_request *req, - int mds, bool drop_cap_releases) + bool drop_cap_releases) { - struct ceph_mds_request_head *rhead; + int mds = session->s_mds; + struct ceph_mds_client *mdsc = session->s_mdsc; + struct ceph_mds_request_head_old *rhead; struct ceph_msg *msg; int flags = 0; @@ -2661,6 +2707,7 @@ static int __prepare_send_request(struct ceph_mds_client *mdsc, if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) { void *p; + /* * Replay. Do not regenerate message (and rebuild * paths, etc.); just use the original message. @@ -2668,7 +2715,8 @@ static int __prepare_send_request(struct ceph_mds_client *mdsc, * d_move mangles the src name. */ msg = req->r_request; - rhead = msg->front.iov_base; + rhead = find_old_request_head(msg->front.iov_base, + session->s_con.peer_features); flags = le32_to_cpu(rhead->flags); flags |= CEPH_MDS_FLAG_REPLAY; @@ -2699,14 +2747,15 @@ static int __prepare_send_request(struct ceph_mds_client *mdsc, ceph_msg_put(req->r_request); req->r_request = NULL; } - msg = create_request_message(mdsc, req, mds, drop_cap_releases); + msg = create_request_message(session, req, drop_cap_releases); if (IS_ERR(msg)) { req->r_err = PTR_ERR(msg); return PTR_ERR(msg); } req->r_request = msg; - rhead = msg->front.iov_base; + rhead = find_old_request_head(msg->front.iov_base, + session->s_con.peer_features); rhead->oldest_client_tid = cpu_to_le64(__get_oldest_tid(mdsc)); if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) flags |= CEPH_MDS_FLAG_REPLAY; @@ -2725,15 +2774,13 @@ static int __prepare_send_request(struct ceph_mds_client *mdsc, /* * called under mdsc->mutex */ -static int __send_request(struct ceph_mds_client *mdsc, - struct ceph_mds_session *session, +static int __send_request(struct ceph_mds_session *session, struct ceph_mds_request *req, bool drop_cap_releases) { int err; - err = __prepare_send_request(mdsc, req, session->s_mds, - drop_cap_releases); 
+ err = __prepare_send_request(session, req, drop_cap_releases); if (!err) { ceph_msg_get(req->r_request); ceph_con_send(&session->s_con, req->r_request); @@ -2818,10 +2865,6 @@ static void __do_request(struct ceph_mds_client *mdsc, ceph_session_state_name(session->s_state)); if (session->s_state != CEPH_MDS_SESSION_OPEN && session->s_state != CEPH_MDS_SESSION_HUNG) { - if (session->s_state == CEPH_MDS_SESSION_REJECTED) { - err = -EACCES; - goto out_session; - } /* * We cannot queue async requests since the caps and delegated * inodes are bound to the session. Just return -EJUKEBOX and @@ -2831,6 +2874,20 @@ static void __do_request(struct ceph_mds_client *mdsc, err = -EJUKEBOX; goto out_session; } + + /* + * If the session has been REJECTED, then return a hard error, + * unless it's a CLEANRECOVER mount, in which case we'll queue + * it to the mdsc queue. + */ + if (session->s_state == CEPH_MDS_SESSION_REJECTED) { + if (ceph_test_mount_opt(mdsc->fsc, CLEANRECOVER)) + list_add(&req->r_wait, &mdsc->waiting_for_map); + else + err = -EACCES; + goto out_session; + } + if (session->s_state == CEPH_MDS_SESSION_NEW || session->s_state == CEPH_MDS_SESSION_CLOSING) { err = __open_session(mdsc, session); @@ -2850,7 +2907,7 @@ static void __do_request(struct ceph_mds_client *mdsc, if (req->r_request_started == 0) /* note request start time */ req->r_request_started = jiffies; - err = __send_request(mdsc, session, req, false); + err = __send_request(session, req, false); out_session: ceph_put_mds_session(session); @@ -3173,6 +3230,23 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) err = parse_reply_info(session, msg, rinfo, session->s_con.peer_features); mutex_unlock(&mdsc->mutex); + /* Must find target inode outside of mutexes to avoid deadlocks */ + if ((err >= 0) && rinfo->head->is_target) { + struct inode *in; + struct ceph_vino tvino = { + .ino = le64_to_cpu(rinfo->targeti.in->ino), + .snap = le64_to_cpu(rinfo->targeti.in->snapid) + }; 
+ + in = ceph_get_inode(mdsc->fsc->sb, tvino); + if (IS_ERR(in)) { + err = PTR_ERR(in); + mutex_lock(&session->s_mutex); + goto out_err; + } + req->r_target_inode = in; + } + mutex_lock(&session->s_mutex); if (err < 0) { pr_err("mdsc_handle_reply got corrupt reply mds%d(tid:%lld)\n", mds, tid); @@ -3514,7 +3588,7 @@ static void replay_unsafe_requests(struct ceph_mds_client *mdsc, mutex_lock(&mdsc->mutex); list_for_each_entry_safe(req, nreq, &session->s_unsafe, r_unsafe_item) - __send_request(mdsc, session, req, true); + __send_request(session, req, true); /* * also re-send old requests when MDS enters reconnect stage. So that MDS @@ -3535,7 +3609,7 @@ static void replay_unsafe_requests(struct ceph_mds_client *mdsc, ceph_mdsc_release_dir_caps_no_check(req); - __send_request(mdsc, session, req, true); + __send_request(session, req, true); } mutex_unlock(&mdsc->mutex); } @@ -4374,12 +4448,7 @@ static void maybe_recover_session(struct ceph_mds_client *mdsc) if (!READ_ONCE(fsc->blocklisted)) return; - if (fsc->last_auto_reconnect && - time_before(jiffies, fsc->last_auto_reconnect + HZ * 60 * 30)) - return; - pr_info("auto reconnect after blocklisted\n"); - fsc->last_auto_reconnect = jiffies; ceph_force_reconnect(fsc->sb); } @@ -4678,7 +4747,7 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc) { u64 want_tid, want_flush; - if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) + if (READ_ONCE(mdsc->fsc->mount_state) >= CEPH_MOUNT_SHUTDOWN) return; dout("sync\n"); @@ -4855,10 +4924,8 @@ void ceph_mdsc_handle_fsmap(struct ceph_mds_client *mdsc, struct ceph_msg *msg) void *p = msg->front.iov_base; void *end = p + msg->front.iov_len; u32 epoch; - u32 map_len; u32 num_fs; u32 mount_fscid = (u32)-1; - u8 struct_v, struct_cv; int err = -EINVAL; ceph_decode_need(&p, end, sizeof(u32), bad); @@ -4866,24 +4933,17 @@ void ceph_mdsc_handle_fsmap(struct ceph_mds_client *mdsc, struct ceph_msg *msg) dout("handle_fsmap epoch %u\n", epoch); - ceph_decode_need(&p, end, 2 + 
sizeof(u32), bad); - struct_v = ceph_decode_8(&p); - struct_cv = ceph_decode_8(&p); - map_len = ceph_decode_32(&p); - - ceph_decode_need(&p, end, sizeof(u32) * 3, bad); - p += sizeof(u32) * 2; /* skip epoch and legacy_client_fscid */ + /* struct_v, struct_cv, map_len, epoch, legacy_client_fscid */ + ceph_decode_skip_n(&p, end, 2 + sizeof(u32) * 3, bad); - num_fs = ceph_decode_32(&p); + ceph_decode_32_safe(&p, end, num_fs, bad); while (num_fs-- > 0) { void *info_p, *info_end; u32 info_len; - u8 info_v, info_cv; u32 fscid, namelen; ceph_decode_need(&p, end, 2 + sizeof(u32), bad); - info_v = ceph_decode_8(&p); - info_cv = ceph_decode_8(&p); + p += 2; // info_v, info_cv info_len = ceph_decode_32(&p); ceph_decode_need(&p, end, info_len, bad); info_p = p; @@ -4954,7 +5014,7 @@ void ceph_mdsc_handle_mdsmap(struct ceph_mds_client *mdsc, struct ceph_msg *msg) return; } - newmap = ceph_mdsmap_decode(&p, end); + newmap = ceph_mdsmap_decode(&p, end, ceph_msgr2(mdsc->fsc->client)); if (IS_ERR(newmap)) { err = PTR_ERR(newmap); goto bad_unlock; @@ -5081,23 +5141,12 @@ static struct ceph_auth_handshake *get_authorizer(struct ceph_connection *con, struct ceph_mds_client *mdsc = s->s_mdsc; struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth; struct ceph_auth_handshake *auth = &s->s_auth; + int ret; - if (force_new && auth->authorizer) { - ceph_auth_destroy_authorizer(auth->authorizer); - auth->authorizer = NULL; - } - if (!auth->authorizer) { - int ret = ceph_auth_create_authorizer(ac, CEPH_ENTITY_TYPE_MDS, - auth); - if (ret) - return ERR_PTR(ret); - } else { - int ret = ceph_auth_update_authorizer(ac, CEPH_ENTITY_TYPE_MDS, - auth); - if (ret) - return ERR_PTR(ret); - } - *proto = ac->protocol; + ret = __ceph_auth_get_authorizer(ac, auth, CEPH_ENTITY_TYPE_MDS, + force_new, proto, NULL, NULL); + if (ret) + return ERR_PTR(ret); return auth; } @@ -5118,8 +5167,11 @@ static int verify_authorizer_reply(struct ceph_connection *con) struct ceph_mds_session *s = con->private; struct 
ceph_mds_client *mdsc = s->s_mdsc; struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth; + struct ceph_auth_handshake *auth = &s->s_auth; - return ceph_auth_verify_authorizer_reply(ac, s->s_auth.authorizer); + return ceph_auth_verify_authorizer_reply(ac, auth->authorizer, + auth->authorizer_reply_buf, auth->authorizer_reply_buf_len, + NULL, NULL, NULL, NULL); } static int invalidate_authorizer(struct ceph_connection *con) @@ -5133,6 +5185,80 @@ static int invalidate_authorizer(struct ceph_connection *con) return ceph_monc_validate_auth(&mdsc->fsc->client->monc); } +static int mds_get_auth_request(struct ceph_connection *con, + void *buf, int *buf_len, + void **authorizer, int *authorizer_len) +{ + struct ceph_mds_session *s = con->private; + struct ceph_auth_client *ac = s->s_mdsc->fsc->client->monc.auth; + struct ceph_auth_handshake *auth = &s->s_auth; + int ret; + + ret = ceph_auth_get_authorizer(ac, auth, CEPH_ENTITY_TYPE_MDS, + buf, buf_len); + if (ret) + return ret; + + *authorizer = auth->authorizer_buf; + *authorizer_len = auth->authorizer_buf_len; + return 0; +} + +static int mds_handle_auth_reply_more(struct ceph_connection *con, + void *reply, int reply_len, + void *buf, int *buf_len, + void **authorizer, int *authorizer_len) +{ + struct ceph_mds_session *s = con->private; + struct ceph_auth_client *ac = s->s_mdsc->fsc->client->monc.auth; + struct ceph_auth_handshake *auth = &s->s_auth; + int ret; + + ret = ceph_auth_handle_svc_reply_more(ac, auth, reply, reply_len, + buf, buf_len); + if (ret) + return ret; + + *authorizer = auth->authorizer_buf; + *authorizer_len = auth->authorizer_buf_len; + return 0; +} + +static int mds_handle_auth_done(struct ceph_connection *con, + u64 global_id, void *reply, int reply_len, + u8 *session_key, int *session_key_len, + u8 *con_secret, int *con_secret_len) +{ + struct ceph_mds_session *s = con->private; + struct ceph_auth_client *ac = s->s_mdsc->fsc->client->monc.auth; + struct ceph_auth_handshake *auth = 
&s->s_auth; + + return ceph_auth_handle_svc_reply_done(ac, auth, reply, reply_len, + session_key, session_key_len, + con_secret, con_secret_len); +} + +static int mds_handle_auth_bad_method(struct ceph_connection *con, + int used_proto, int result, + const int *allowed_protos, int proto_cnt, + const int *allowed_modes, int mode_cnt) +{ + struct ceph_mds_session *s = con->private; + struct ceph_mon_client *monc = &s->s_mdsc->fsc->client->monc; + int ret; + + if (ceph_auth_handle_bad_authorizer(monc->auth, CEPH_ENTITY_TYPE_MDS, + used_proto, result, + allowed_protos, proto_cnt, + allowed_modes, mode_cnt)) { + ret = ceph_monc_validate_auth(monc); + if (ret) + return ret; + } + + return -EACCES; +} + static struct ceph_msg *mds_alloc_msg(struct ceph_connection *con, struct ceph_msg_header *hdr, int *skip) { @@ -5182,6 +5308,10 @@ static const struct ceph_connection_operations mds_con_ops = { .alloc_msg = mds_alloc_msg, .sign_message = mds_sign_message, .check_message_signature = mds_check_message_signature, + .get_auth_request = mds_get_auth_request, + .handle_auth_reply_more = mds_handle_auth_reply_more, + .handle_auth_done = mds_handle_auth_done, + .handle_auth_bad_method = mds_handle_auth_bad_method, }; /* eof */ diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index f5adbebcb38e..eaa7c5422116 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -275,8 +275,7 @@ struct ceph_mds_request { union ceph_mds_request_args r_args; int r_fmode; /* file mode, if expecting cap */ - kuid_t r_uid; - kgid_t r_gid; + const struct cred *r_cred; int r_request_release_offset; struct timespec64 r_stamp; diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c index e4aba6c6d3b5..abd9af7727ad 100644 --- a/fs/ceph/mdsmap.c +++ b/fs/ceph/mdsmap.c @@ -114,7 +114,7 @@ bad: * Ignore any fields we don't care about (there are quite a few of * them). 
*/ -struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) +struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end, bool msgr2) { struct ceph_mdsmap *m; const void *start = *p; @@ -201,18 +201,19 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) namelen = ceph_decode_32(p); /* skip mds name */ *p += namelen; - ceph_decode_need(p, end, - 4*sizeof(u32) + sizeof(u64) + - sizeof(addr) + sizeof(struct ceph_timespec), - bad); - mds = ceph_decode_32(p); - inc = ceph_decode_32(p); - state = ceph_decode_32(p); + ceph_decode_32_safe(p, end, mds, bad); + ceph_decode_32_safe(p, end, inc, bad); + ceph_decode_32_safe(p, end, state, bad); *p += sizeof(u64); /* state_seq */ - err = ceph_decode_entity_addr(p, end, &addr); + if (info_v >= 8) + err = ceph_decode_entity_addrvec(p, end, msgr2, &addr); + else + err = ceph_decode_entity_addr(p, end, &addr); if (err) goto corrupt; - ceph_decode_copy(p, &laggy_since, sizeof(laggy_since)); + + ceph_decode_copy_safe(p, end, &laggy_since, sizeof(laggy_since), + bad); laggy = laggy_since.tv_sec != 0 || laggy_since.tv_nsec != 0; *p += sizeof(u32); ceph_decode_32_safe(p, end, namelen, bad); @@ -243,8 +244,8 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) } if (state <= 0) { - pr_warn("mdsmap_decode got incorrect state(%s)\n", - ceph_mds_state_name(state)); + dout("mdsmap_decode got incorrect state(%s)\n", + ceph_mds_state_name(state)); continue; } diff --git a/fs/ceph/metric.c b/fs/ceph/metric.c index fee4c4778313..5ec94bd4c1de 100644 --- a/fs/ceph/metric.c +++ b/fs/ceph/metric.c @@ -16,6 +16,7 @@ static bool ceph_mdsc_send_metrics(struct ceph_mds_client *mdsc, struct ceph_metric_read_latency *read; struct ceph_metric_write_latency *write; struct ceph_metric_metadata_latency *meta; + struct ceph_metric_dlease *dlease; struct ceph_client_metric *m = &mdsc->metric; u64 nr_caps = atomic64_read(&m->total_caps); struct ceph_msg *msg; @@ -25,7 +26,7 @@ static bool ceph_mdsc_send_metrics(struct ceph_mds_client 
*mdsc, s32 len; len = sizeof(*head) + sizeof(*cap) + sizeof(*read) + sizeof(*write) - + sizeof(*meta); + + sizeof(*meta) + sizeof(*dlease); msg = ceph_msg_new(CEPH_MSG_CLIENT_METRICS, len, GFP_NOFS, true); if (!msg) { @@ -42,8 +43,8 @@ static bool ceph_mdsc_send_metrics(struct ceph_mds_client *mdsc, cap->ver = 1; cap->compat = 1; cap->data_len = cpu_to_le32(sizeof(*cap) - 10); - cap->hit = cpu_to_le64(percpu_counter_sum(&mdsc->metric.i_caps_hit)); - cap->mis = cpu_to_le64(percpu_counter_sum(&mdsc->metric.i_caps_mis)); + cap->hit = cpu_to_le64(percpu_counter_sum(&m->i_caps_hit)); + cap->mis = cpu_to_le64(percpu_counter_sum(&m->i_caps_mis)); cap->total = cpu_to_le64(nr_caps); items++; @@ -83,6 +84,17 @@ static bool ceph_mdsc_send_metrics(struct ceph_mds_client *mdsc, meta->nsec = cpu_to_le32(ts.tv_nsec); items++; + /* encode the dentry lease metric */ + dlease = (struct ceph_metric_dlease *)(meta + 1); + dlease->type = cpu_to_le32(CLIENT_METRIC_TYPE_DENTRY_LEASE); + dlease->ver = 1; + dlease->compat = 1; + dlease->data_len = cpu_to_le32(sizeof(*dlease) - 10); + dlease->hit = cpu_to_le64(percpu_counter_sum(&m->d_lease_hit)); + dlease->mis = cpu_to_le64(percpu_counter_sum(&m->d_lease_mis)); + dlease->total = cpu_to_le64(atomic64_read(&m->total_dentries)); + items++; + put_unaligned_le32(items, &head->num); msg->front.iov_len = len; msg->hdr.version = cpu_to_le16(1); diff --git a/fs/ceph/metric.h b/fs/ceph/metric.h index 710f3f1dceab..af6038ff39d4 100644 --- a/fs/ceph/metric.h +++ b/fs/ceph/metric.h @@ -27,6 +27,7 @@ enum ceph_metric_type { CLIENT_METRIC_TYPE_READ_LATENCY, \ CLIENT_METRIC_TYPE_WRITE_LATENCY, \ CLIENT_METRIC_TYPE_METADATA_LATENCY, \ + CLIENT_METRIC_TYPE_DENTRY_LEASE, \ \ CLIENT_METRIC_TYPE_MAX, \ } @@ -80,6 +81,19 @@ struct ceph_metric_metadata_latency { __le32 nsec; } __packed; +/* metric dentry lease header */ +struct ceph_metric_dlease { + __le32 type; /* ceph metric type */ + + __u8 ver; + __u8 compat; + + __le32 data_len; /* length of sizeof(hit + 
mis + total) */ + __le64 hit; + __le64 mis; + __le64 total; +} __packed; + struct ceph_metric_head { __le32 num; /* the number of metrics that will be sent */ } __packed; diff --git a/fs/ceph/quota.c b/fs/ceph/quota.c index 9b785f11e95a..4e32c9600ecc 100644 --- a/fs/ceph/quota.c +++ b/fs/ceph/quota.c @@ -264,7 +264,7 @@ restart: return NULL; } -static bool ceph_quota_is_same_realm(struct inode *old, struct inode *new) +bool ceph_quota_is_same_realm(struct inode *old, struct inode *new) { struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(old->i_sb); struct ceph_snap_realm *old_realm, *new_realm; @@ -516,59 +516,3 @@ bool ceph_quota_update_statfs(struct ceph_fs_client *fsc, struct kstatfs *buf) return is_updated; } -/* - * ceph_quota_check_rename - check if a rename can be executed - * @mdsc: MDS client instance - * @old: inode to be copied - * @new: destination inode (directory) - * - * This function verifies if a rename (e.g. moving a file or directory) can be - * executed. It forces an rstat update in the @new target directory (and in the - * source @old as well, if it's a directory). The actual check is done both for - * max_files and max_bytes. - * - * This function returns 0 if it's OK to do the rename, or, if quotas are - * exceeded, -EXDEV (if @old is a directory) or -EDQUOT. 
- */ -int ceph_quota_check_rename(struct ceph_mds_client *mdsc, - struct inode *old, struct inode *new) -{ - struct ceph_inode_info *ci_old = ceph_inode(old); - int ret = 0; - - if (ceph_quota_is_same_realm(old, new)) - return 0; - - /* - * Get the latest rstat for target directory (and for source, if a - * directory) - */ - ret = ceph_do_getattr(new, CEPH_STAT_RSTAT, false); - if (ret) - return ret; - - if (S_ISDIR(old->i_mode)) { - ret = ceph_do_getattr(old, CEPH_STAT_RSTAT, false); - if (ret) - return ret; - ret = check_quota_exceeded(new, QUOTA_CHECK_MAX_BYTES_OP, - ci_old->i_rbytes); - if (!ret) - ret = check_quota_exceeded(new, - QUOTA_CHECK_MAX_FILES_OP, - ci_old->i_rfiles + - ci_old->i_rsubdirs); - if (ret) - ret = -EXDEV; - } else { - ret = check_quota_exceeded(new, QUOTA_CHECK_MAX_BYTES_OP, - i_size_read(old)); - if (!ret) - ret = check_quota_exceeded(new, - QUOTA_CHECK_MAX_FILES_OP, 1); - if (ret) - ret = -EDQUOT; - } - - return ret; -} diff --git a/fs/ceph/super.c b/fs/ceph/super.c index 33ba6f0aa55c..9b1b7f4cfdd4 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -831,6 +831,13 @@ static void destroy_caches(void) ceph_fscache_unregister(); } +static void __ceph_umount_begin(struct ceph_fs_client *fsc) +{ + ceph_osdc_abort_requests(&fsc->client->osdc, -EIO); + ceph_mdsc_force_umount(fsc->mdsc); + fsc->filp_gen++; // invalidate open files +} + /* * ceph_umount_begin - initiate forced umount. Tear down the * mount, skipping steps that may hang while waiting for server(s). 
@@ -843,9 +850,7 @@ static void ceph_umount_begin(struct super_block *sb) if (!fsc) return; fsc->mount_state = CEPH_MOUNT_SHUTDOWN; - ceph_osdc_abort_requests(&fsc->client->osdc, -EIO); - ceph_mdsc_force_umount(fsc->mdsc); - fsc->filp_gen++; // invalidate open files + __ceph_umount_begin(fsc); } static const struct super_operations ceph_super_ops = { @@ -1234,7 +1239,8 @@ int ceph_force_reconnect(struct super_block *sb) struct ceph_fs_client *fsc = ceph_sb_to_client(sb); int err = 0; - ceph_umount_begin(sb); + fsc->mount_state = CEPH_MOUNT_RECOVER; + __ceph_umount_begin(fsc); /* Make sure all page caches get invalidated. * see remove_session_caps_cb() */ diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 482473e4cce1..b62d8fee3b86 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -106,9 +106,8 @@ struct ceph_fs_client { struct ceph_mount_options *mount_options; struct ceph_client *client; - unsigned long mount_state; + int mount_state; - unsigned long last_auto_reconnect; bool blocklisted; bool have_copy_from2; @@ -129,6 +128,7 @@ struct ceph_fs_client { struct dentry *debugfs_bdi; struct dentry *debugfs_mdsc, *debugfs_mdsmap; struct dentry *debugfs_metric; + struct dentry *debugfs_status; struct dentry *debugfs_mds_sessions; #endif @@ -1222,14 +1222,13 @@ extern void ceph_handle_quota(struct ceph_mds_client *mdsc, struct ceph_mds_session *session, struct ceph_msg *msg); extern bool ceph_quota_is_max_files_exceeded(struct inode *inode); +extern bool ceph_quota_is_same_realm(struct inode *old, struct inode *new); extern bool ceph_quota_is_max_bytes_exceeded(struct inode *inode, loff_t newlen); extern bool ceph_quota_is_max_bytes_approaching(struct inode *inode, loff_t newlen); extern bool ceph_quota_update_statfs(struct ceph_fs_client *fsc, struct kstatfs *buf); -extern int ceph_quota_check_rename(struct ceph_mds_client *mdsc, - struct inode *old, struct inode *new); extern void ceph_cleanup_quotarealms_inodes(struct ceph_mds_client *mdsc); #endif /* 
_FS_CEPH_SUPER_H */ diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index 197cb1234341..24997982de01 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -42,6 +42,7 @@ struct ceph_vxattr { #define VXATTR_FLAG_READONLY (1<<0) #define VXATTR_FLAG_HIDDEN (1<<1) #define VXATTR_FLAG_RSTAT (1<<2) +#define VXATTR_FLAG_DIRSTAT (1<<3) /* layouts */ @@ -303,6 +304,36 @@ static ssize_t ceph_vxattrcb_snap_btime(struct ceph_inode_info *ci, char *val, ci->i_snap_btime.tv_nsec); } +static ssize_t ceph_vxattrcb_cluster_fsid(struct ceph_inode_info *ci, + char *val, size_t size) +{ + struct ceph_fs_client *fsc = ceph_sb_to_client(ci->vfs_inode.i_sb); + + return ceph_fmt_xattr(val, size, "%pU", &fsc->client->fsid); +} + +static ssize_t ceph_vxattrcb_client_id(struct ceph_inode_info *ci, + char *val, size_t size) +{ + struct ceph_fs_client *fsc = ceph_sb_to_client(ci->vfs_inode.i_sb); + + return ceph_fmt_xattr(val, size, "client%lld", + ceph_client_gid(fsc->client)); +} + +static ssize_t ceph_vxattrcb_caps(struct ceph_inode_info *ci, char *val, + size_t size) +{ + int issued; + + spin_lock(&ci->i_ceph_lock); + issued = __ceph_caps_issued(ci, NULL); + spin_unlock(&ci->i_ceph_lock); + + return ceph_fmt_xattr(val, size, "%s/0x%x", + ceph_cap_string(issued), issued); +} + #define CEPH_XATTR_NAME(_type, _name) XATTR_CEPH_PREFIX #_type "." #_name #define CEPH_XATTR_NAME2(_type, _name, _name2) \ XATTR_CEPH_PREFIX #_type "." #_name "." 
#_name2 @@ -347,9 +378,9 @@ static struct ceph_vxattr ceph_dir_vxattrs[] = { XATTR_LAYOUT_FIELD(dir, layout, object_size), XATTR_LAYOUT_FIELD(dir, layout, pool), XATTR_LAYOUT_FIELD(dir, layout, pool_namespace), - XATTR_NAME_CEPH(dir, entries, 0), - XATTR_NAME_CEPH(dir, files, 0), - XATTR_NAME_CEPH(dir, subdirs, 0), + XATTR_NAME_CEPH(dir, entries, VXATTR_FLAG_DIRSTAT), + XATTR_NAME_CEPH(dir, files, VXATTR_FLAG_DIRSTAT), + XATTR_NAME_CEPH(dir, subdirs, VXATTR_FLAG_DIRSTAT), XATTR_RSTAT_FIELD(dir, rentries), XATTR_RSTAT_FIELD(dir, rfiles), XATTR_RSTAT_FIELD(dir, rsubdirs), @@ -378,6 +409,13 @@ static struct ceph_vxattr ceph_dir_vxattrs[] = { .exists_cb = ceph_vxattrcb_snap_btime_exists, .flags = VXATTR_FLAG_READONLY, }, + { + .name = "ceph.caps", + .name_size = sizeof("ceph.caps"), + .getxattr_cb = ceph_vxattrcb_caps, + .exists_cb = NULL, + .flags = VXATTR_FLAG_HIDDEN, + }, { .name = NULL, 0 } /* Required table terminator */ }; @@ -403,6 +441,31 @@ static struct ceph_vxattr ceph_file_vxattrs[] = { .exists_cb = ceph_vxattrcb_snap_btime_exists, .flags = VXATTR_FLAG_READONLY, }, + { + .name = "ceph.caps", + .name_size = sizeof("ceph.caps"), + .getxattr_cb = ceph_vxattrcb_caps, + .exists_cb = NULL, + .flags = VXATTR_FLAG_HIDDEN, + }, + { .name = NULL, 0 } /* Required table terminator */ +}; + +static struct ceph_vxattr ceph_common_vxattrs[] = { + { + .name = "ceph.cluster_fsid", + .name_size = sizeof("ceph.cluster_fsid"), + .getxattr_cb = ceph_vxattrcb_cluster_fsid, + .exists_cb = NULL, + .flags = VXATTR_FLAG_READONLY, + }, + { + .name = "ceph.client_id", + .name_size = sizeof("ceph.client_id"), + .getxattr_cb = ceph_vxattrcb_client_id, + .exists_cb = NULL, + .flags = VXATTR_FLAG_READONLY, + }, { .name = NULL, 0 } /* Required table terminator */ }; @@ -428,6 +491,13 @@ static struct ceph_vxattr *ceph_match_vxattr(struct inode *inode, } } + vxattr = ceph_common_vxattrs; + while (vxattr->name) { + if (!strcmp(vxattr->name, name)) + return vxattr; + vxattr++; + } + return 
NULL; } @@ -837,6 +907,8 @@ ssize_t __ceph_getxattr(struct inode *inode, const char *name, void *value, int mask = 0; if (vxattr->flags & VXATTR_FLAG_RSTAT) mask |= CEPH_STAT_RSTAT; + if (vxattr->flags & VXATTR_FLAG_DIRSTAT) + mask |= CEPH_CAP_FILE_SHARED; err = ceph_do_getattr(inode, mask, true); if (err) return err; @@ -950,6 +1022,7 @@ static int ceph_sync_setxattr(struct inode *inode, const char *name, struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_mds_request *req; struct ceph_mds_client *mdsc = fsc->mdsc; + struct ceph_osd_client *osdc = &fsc->client->osdc; struct ceph_pagelist *pagelist = NULL; int op = CEPH_MDS_OP_SETXATTR; int err; @@ -988,6 +1061,8 @@ static int ceph_sync_setxattr(struct inode *inode, const char *name, if (op == CEPH_MDS_OP_SETXATTR) { req->r_args.setxattr.flags = cpu_to_le32(flags); + req->r_args.setxattr.osdmap_epoch = + cpu_to_le32(osdc->osdmap->epoch); req->r_pagelist = pagelist; pagelist = NULL; } |