Diffstat (limited to 'net')
112 files changed, 7893 insertions, 2748 deletions
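A recurring pattern in the net/9p hunks below is the move to reference-counted fids: p9_fid_create() now starts each fid with a count of one, and p9_client_clunk() sends TCLUNK only when the last reference is dropped (and now also rejects IS_ERR() fids, not just NULL). A minimal user-space sketch of that lifetime model, with simplified types and a hypothetical send_tclunk() helper standing in for the kernel's refcount_t API:

#include <stdio.h>

/* Simplified stand-ins for struct p9_fid and refcount_t. */
struct p9_fid {
	int fid;
	int count;			/* refcount_t in the kernel */
};

static void send_tclunk(struct p9_fid *fid)	/* hypothetical helper */
{
	printf("TCLUNK fid %d\n", fid->fid);
}

/* Mirrors p9_fid_create(): new fids start with one reference. */
static void fid_init(struct p9_fid *fid, int id)
{
	fid->fid = id;
	fid->count = 1;			/* refcount_set(&fid->count, 1) */
}

/* Other users (walks, opens) would take a reference here,
 * i.e. refcount_inc() in the kernel. */
static void fid_get(struct p9_fid *fid)
{
	fid->count++;
}

/* Mirrors p9_client_clunk(): only the final put actually clunks. */
static void fid_put(struct p9_fid *fid)
{
	if (--fid->count > 0)		/* refcount_dec_and_test() */
		return;
	send_tclunk(fid);
}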
diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c index f292e0267bb9..8b644113715e 100644 --- a/net/8021q/vlan.c +++ b/net/8021q/vlan.c @@ -284,8 +284,7 @@ static int register_vlan_device(struct net_device *real_dev, u16 vlan_id) return 0; out_free_newdev: - if (new_dev->reg_state == NETREG_UNINITIALIZED) - free_netdev(new_dev); + free_netdev(new_dev); return err; } diff --git a/net/9p/client.c b/net/9p/client.c index 785a7bb6a539..4f62f299da0c 100644 --- a/net/9p/client.c +++ b/net/9p/client.c @@ -903,6 +903,7 @@ static struct p9_fid *p9_fid_create(struct p9_client *clnt) fid->clnt = clnt; fid->rdir = NULL; fid->fid = 0; + refcount_set(&fid->count, 1); idr_preload(GFP_KERNEL); spin_lock_irq(&clnt->lock); @@ -910,7 +911,6 @@ static struct p9_fid *p9_fid_create(struct p9_client *clnt) GFP_NOWAIT); spin_unlock_irq(&clnt->lock); idr_preload_end(); - if (!ret) return fid; @@ -1189,7 +1189,6 @@ struct p9_fid *p9_client_walk(struct p9_fid *oldfid, uint16_t nwname, p9_debug(P9_DEBUG_9P, ">>> TWALK fids %d,%d nwname %ud wname[0] %s\n", oldfid->fid, fid->fid, nwname, wnames ? wnames[0] : NULL); - req = p9_client_rpc(clnt, P9_TWALK, "ddT", oldfid->fid, fid->fid, nwname, wnames); if (IS_ERR(req)) { @@ -1221,7 +1220,7 @@ struct p9_fid *p9_client_walk(struct p9_fid *oldfid, uint16_t nwname, if (nwname) memmove(&fid->qid, &wqids[nwqids - 1], sizeof(struct p9_qid)); else - fid->qid = oldfid->qid; + memmove(&fid->qid, &oldfid->qid, sizeof(struct p9_qid)); kfree(wqids); return fid; @@ -1274,6 +1273,7 @@ int p9_client_open(struct p9_fid *fid, int mode) p9_is_proto_dotl(clnt) ? "RLOPEN" : "ROPEN", qid.type, (unsigned long long)qid.path, qid.version, iounit); + memmove(&fid->qid, &qid, sizeof(struct p9_qid)); fid->mode = mode; fid->iounit = iounit; @@ -1319,6 +1319,7 @@ int p9_client_create_dotl(struct p9_fid *ofid, const char *name, u32 flags, u32 (unsigned long long)qid->path, qid->version, iounit); + memmove(&ofid->qid, qid, sizeof(struct p9_qid)); ofid->mode = mode; ofid->iounit = iounit; @@ -1364,6 +1365,7 @@ int p9_client_fcreate(struct p9_fid *fid, const char *name, u32 perm, int mode, (unsigned long long)qid.path, qid.version, iounit); + memmove(&fid->qid, &qid, sizeof(struct p9_qid)); fid->mode = mode; fid->iounit = iounit; @@ -1460,12 +1462,14 @@ int p9_client_clunk(struct p9_fid *fid) struct p9_req_t *req; int retries = 0; - if (!fid) { - pr_warn("%s (%d): Trying to clunk with NULL fid\n", + if (!fid || IS_ERR(fid)) { + pr_warn("%s (%d): Trying to clunk with invalid fid\n", __func__, task_pid_nr(current)); dump_stack(); return 0; } + if (!refcount_dec_and_test(&fid->count)) + return 0; again: p9_debug(P9_DEBUG_9P, ">>> TCLUNK fid %d (try %d)\n", fid->fid, diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index c1c30a9f76f3..8b796c499cbb 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -272,7 +272,8 @@ int bpf_prog_test_run_raw_tp(struct bpf_prog *prog, kattr->test.repeat) return -EINVAL; - if (ctx_size_in < prog->aux->max_ctx_offset) + if (ctx_size_in < prog->aux->max_ctx_offset || + ctx_size_in > MAX_BPF_FUNC_ARGS * sizeof(u64)) return -EINVAL; if ((kattr->test.flags & BPF_F_TEST_RUN_ON_CPU) == 0 && cpu != 0) diff --git a/net/can/isotp.c b/net/can/isotp.c index 7839c3b9e5be..3ef7f78e553b 100644 --- a/net/can/isotp.c +++ b/net/can/isotp.c @@ -1155,6 +1155,7 @@ static int isotp_getname(struct socket *sock, struct sockaddr *uaddr, int peer) if (peer) return -EOPNOTSUPP; + memset(addr, 0, sizeof(*addr)); addr->can_family = AF_CAN; addr->can_ifindex = so->ifindex; addr->can_addr.tp.rx_id = 
so->rxid; diff --git a/net/ceph/Kconfig b/net/ceph/Kconfig index f36f9a3a4e20..c5c4eef3a9ff 100644 --- a/net/ceph/Kconfig +++ b/net/ceph/Kconfig @@ -5,6 +5,9 @@ config CEPH_LIB select LIBCRC32C select CRYPTO_AES select CRYPTO_CBC + select CRYPTO_GCM + select CRYPTO_HMAC + select CRYPTO_SHA256 select CRYPTO select KEYS default n diff --git a/net/ceph/Makefile b/net/ceph/Makefile index ce09bb4fb249..8802a0c0155d 100644 --- a/net/ceph/Makefile +++ b/net/ceph/Makefile @@ -14,4 +14,5 @@ libceph-y := ceph_common.o messenger.o msgpool.o buffer.o pagelist.o \ crypto.o armor.o \ auth_x.o \ ceph_strings.o ceph_hash.o \ - pagevec.o snapshot.o string_table.o + pagevec.o snapshot.o string_table.o \ + messenger_v1.o messenger_v2.o diff --git a/net/ceph/auth.c b/net/ceph/auth.c index fbeee068ea14..eb261aa5fe18 100644 --- a/net/ceph/auth.c +++ b/net/ceph/auth.c @@ -21,28 +21,31 @@ static u32 supported_protocols[] = { CEPH_AUTH_CEPHX }; -static int ceph_auth_init_protocol(struct ceph_auth_client *ac, int protocol) +static int init_protocol(struct ceph_auth_client *ac, int proto) { - switch (protocol) { + dout("%s proto %d\n", __func__, proto); + + switch (proto) { case CEPH_AUTH_NONE: return ceph_auth_none_init(ac); case CEPH_AUTH_CEPHX: return ceph_x_init(ac); default: - return -ENOENT; + pr_err("bad auth protocol %d\n", proto); + return -EINVAL; } } /* * setup, teardown. */ -struct ceph_auth_client *ceph_auth_init(const char *name, const struct ceph_crypto_key *key) +struct ceph_auth_client *ceph_auth_init(const char *name, + const struct ceph_crypto_key *key, + const int *con_modes) { struct ceph_auth_client *ac; int ret; - dout("auth_init name '%s'\n", name); - ret = -ENOMEM; ac = kzalloc(sizeof(*ac), GFP_NOFS); if (!ac) @@ -54,8 +57,12 @@ struct ceph_auth_client *ceph_auth_init(const char *name, const struct ceph_cryp ac->name = name; else ac->name = CEPH_AUTH_NAME_DEFAULT; - dout("auth_init name %s\n", ac->name); ac->key = key; + ac->preferred_mode = con_modes[0]; + ac->fallback_mode = con_modes[1]; + + dout("%s name '%s' preferred_mode %d fallback_mode %d\n", __func__, + ac->name, ac->preferred_mode, ac->fallback_mode); return ac; out: @@ -145,31 +152,35 @@ bad: goto out; } -static int ceph_build_auth_request(struct ceph_auth_client *ac, - void *msg_buf, size_t msg_len) +static int build_request(struct ceph_auth_client *ac, bool add_header, + void *buf, int buf_len) { - struct ceph_mon_request_header *monhdr = msg_buf; - void *p = monhdr + 1; - void *end = msg_buf + msg_len; + void *end = buf + buf_len; + void *p; int ret; - monhdr->have_version = 0; - monhdr->session_mon = cpu_to_le16(-1); - monhdr->session_mon_tid = 0; - - ceph_encode_32(&p, ac->protocol); + p = buf; + if (add_header) { + /* struct ceph_mon_request_header + protocol */ + ceph_encode_64_safe(&p, end, 0, e_range); + ceph_encode_16_safe(&p, end, -1, e_range); + ceph_encode_64_safe(&p, end, 0, e_range); + ceph_encode_32_safe(&p, end, ac->protocol, e_range); + } + ceph_encode_need(&p, end, sizeof(u32), e_range); ret = ac->ops->build_request(ac, p + sizeof(u32), end); if (ret < 0) { - pr_err("error %d building auth method %s request\n", ret, - ac->ops->name); - goto out; + pr_err("auth protocol '%s' building request failed: %d\n", + ceph_auth_proto_name(ac->protocol), ret); + return ret; } dout(" built request %d bytes\n", ret); ceph_encode_32(&p, ret); - ret = p + ret - msg_buf; -out: - return ret; + return p + ret - buf; + +e_range: + return -ERANGE; } /* @@ -229,10 +240,10 @@ int ceph_handle_auth_reply(struct ceph_auth_client *ac, 
ac->ops = NULL; } if (ac->protocol != protocol) { - ret = ceph_auth_init_protocol(ac, protocol); + ret = init_protocol(ac, protocol); if (ret) { - pr_err("error %d on auth protocol %d init\n", - ret, protocol); + pr_err("auth protocol '%s' init failed: %d\n", + ceph_auth_proto_name(protocol), ret); goto out; } } @@ -240,12 +251,13 @@ int ceph_handle_auth_reply(struct ceph_auth_client *ac, ac->negotiating = false; } - ret = ac->ops->handle_reply(ac, result, payload, payload_end); - if (ret == -EAGAIN) { - ret = ceph_build_auth_request(ac, reply_buf, reply_len); - } else if (ret) { - pr_err("auth method '%s' error %d\n", ac->ops->name, ret); - } + ret = ac->ops->handle_reply(ac, result, payload, payload_end, + NULL, NULL, NULL, NULL); + if (ret == -EAGAIN) + ret = build_request(ac, true, reply_buf, reply_len); + else if (ret) + pr_err("auth protocol '%s' mauth authentication failed: %d\n", + ceph_auth_proto_name(ac->protocol), result); out: mutex_unlock(&ac->mutex); @@ -264,7 +276,7 @@ int ceph_build_auth(struct ceph_auth_client *ac, mutex_lock(&ac->mutex); if (ac->ops->should_authenticate(ac)) - ret = ceph_build_auth_request(ac, msg_buf, msg_len); + ret = build_request(ac, true, msg_buf, msg_len); mutex_unlock(&ac->mutex); return ret; } @@ -281,19 +293,38 @@ int ceph_auth_is_authenticated(struct ceph_auth_client *ac) } EXPORT_SYMBOL(ceph_auth_is_authenticated); -int ceph_auth_create_authorizer(struct ceph_auth_client *ac, - int peer_type, - struct ceph_auth_handshake *auth) +int __ceph_auth_get_authorizer(struct ceph_auth_client *ac, + struct ceph_auth_handshake *auth, + int peer_type, bool force_new, + int *proto, int *pref_mode, int *fallb_mode) { - int ret = 0; + int ret; mutex_lock(&ac->mutex); - if (ac->ops && ac->ops->create_authorizer) + if (force_new && auth->authorizer) { + ceph_auth_destroy_authorizer(auth->authorizer); + auth->authorizer = NULL; + } + if (!auth->authorizer) ret = ac->ops->create_authorizer(ac, peer_type, auth); + else if (ac->ops->update_authorizer) + ret = ac->ops->update_authorizer(ac, peer_type, auth); + else + ret = 0; + if (ret) + goto out; + + *proto = ac->protocol; + if (pref_mode && fallb_mode) { + *pref_mode = ac->preferred_mode; + *fallb_mode = ac->fallback_mode; + } + +out: mutex_unlock(&ac->mutex); return ret; } -EXPORT_SYMBOL(ceph_auth_create_authorizer); +EXPORT_SYMBOL(__ceph_auth_get_authorizer); void ceph_auth_destroy_authorizer(struct ceph_authorizer *a) { @@ -301,20 +332,6 @@ void ceph_auth_destroy_authorizer(struct ceph_authorizer *a) } EXPORT_SYMBOL(ceph_auth_destroy_authorizer); -int ceph_auth_update_authorizer(struct ceph_auth_client *ac, - int peer_type, - struct ceph_auth_handshake *a) -{ - int ret = 0; - - mutex_lock(&ac->mutex); - if (ac->ops && ac->ops->update_authorizer) - ret = ac->ops->update_authorizer(ac, peer_type, a); - mutex_unlock(&ac->mutex); - return ret; -} -EXPORT_SYMBOL(ceph_auth_update_authorizer); - int ceph_auth_add_authorizer_challenge(struct ceph_auth_client *ac, struct ceph_authorizer *a, void *challenge_buf, @@ -332,13 +349,18 @@ int ceph_auth_add_authorizer_challenge(struct ceph_auth_client *ac, EXPORT_SYMBOL(ceph_auth_add_authorizer_challenge); int ceph_auth_verify_authorizer_reply(struct ceph_auth_client *ac, - struct ceph_authorizer *a) + struct ceph_authorizer *a, + void *reply, int reply_len, + u8 *session_key, int *session_key_len, + u8 *con_secret, int *con_secret_len) { int ret = 0; mutex_lock(&ac->mutex); if (ac->ops && ac->ops->verify_authorizer_reply) - ret = ac->ops->verify_authorizer_reply(ac, a); + 
ret = ac->ops->verify_authorizer_reply(ac, a, + reply, reply_len, session_key, session_key_len, + con_secret, con_secret_len); mutex_unlock(&ac->mutex); return ret; } @@ -352,3 +374,279 @@ void ceph_auth_invalidate_authorizer(struct ceph_auth_client *ac, int peer_type) mutex_unlock(&ac->mutex); } EXPORT_SYMBOL(ceph_auth_invalidate_authorizer); + +/* + * msgr2 authentication + */ + +static bool contains(const int *arr, int cnt, int val) +{ + int i; + + for (i = 0; i < cnt; i++) { + if (arr[i] == val) + return true; + } + + return false; +} + +static int encode_con_modes(void **p, void *end, int pref_mode, int fallb_mode) +{ + WARN_ON(pref_mode == CEPH_CON_MODE_UNKNOWN); + if (fallb_mode != CEPH_CON_MODE_UNKNOWN) { + ceph_encode_32_safe(p, end, 2, e_range); + ceph_encode_32_safe(p, end, pref_mode, e_range); + ceph_encode_32_safe(p, end, fallb_mode, e_range); + } else { + ceph_encode_32_safe(p, end, 1, e_range); + ceph_encode_32_safe(p, end, pref_mode, e_range); + } + + return 0; + +e_range: + return -ERANGE; +} + +/* + * Similar to ceph_auth_build_hello(). + */ +int ceph_auth_get_request(struct ceph_auth_client *ac, void *buf, int buf_len) +{ + int proto = ac->key ? CEPH_AUTH_CEPHX : CEPH_AUTH_NONE; + void *end = buf + buf_len; + void *lenp; + void *p; + int ret; + + mutex_lock(&ac->mutex); + if (ac->protocol == CEPH_AUTH_UNKNOWN) { + ret = init_protocol(ac, proto); + if (ret) { + pr_err("auth protocol '%s' init failed: %d\n", + ceph_auth_proto_name(proto), ret); + goto out; + } + } else { + WARN_ON(ac->protocol != proto); + ac->ops->reset(ac); + } + + p = buf; + ceph_encode_32_safe(&p, end, ac->protocol, e_range); + ret = encode_con_modes(&p, end, ac->preferred_mode, ac->fallback_mode); + if (ret) + goto out; + + lenp = p; + p += 4; /* space for len */ + + ceph_encode_8_safe(&p, end, CEPH_AUTH_MODE_MON, e_range); + ret = ceph_auth_entity_name_encode(ac->name, &p, end); + if (ret) + goto out; + + ceph_encode_64_safe(&p, end, ac->global_id, e_range); + ceph_encode_32(&lenp, p - lenp - 4); + ret = p - buf; + +out: + mutex_unlock(&ac->mutex); + return ret; + +e_range: + ret = -ERANGE; + goto out; +} + +int ceph_auth_handle_reply_more(struct ceph_auth_client *ac, void *reply, + int reply_len, void *buf, int buf_len) +{ + int ret; + + mutex_lock(&ac->mutex); + ret = ac->ops->handle_reply(ac, 0, reply, reply + reply_len, + NULL, NULL, NULL, NULL); + if (ret == -EAGAIN) + ret = build_request(ac, false, buf, buf_len); + else + WARN_ON(ret >= 0); + mutex_unlock(&ac->mutex); + return ret; +} + +int ceph_auth_handle_reply_done(struct ceph_auth_client *ac, + u64 global_id, void *reply, int reply_len, + u8 *session_key, int *session_key_len, + u8 *con_secret, int *con_secret_len) +{ + int ret; + + mutex_lock(&ac->mutex); + if (global_id && ac->global_id != global_id) { + dout("%s global_id %llu -> %llu\n", __func__, ac->global_id, + global_id); + ac->global_id = global_id; + } + + ret = ac->ops->handle_reply(ac, 0, reply, reply + reply_len, + session_key, session_key_len, + con_secret, con_secret_len); + mutex_unlock(&ac->mutex); + return ret; +} + +bool ceph_auth_handle_bad_method(struct ceph_auth_client *ac, + int used_proto, int result, + const int *allowed_protos, int proto_cnt, + const int *allowed_modes, int mode_cnt) +{ + mutex_lock(&ac->mutex); + WARN_ON(used_proto != ac->protocol); + + if (result == -EOPNOTSUPP) { + if (!contains(allowed_protos, proto_cnt, ac->protocol)) { + pr_err("auth protocol '%s' not allowed\n", + ceph_auth_proto_name(ac->protocol)); + goto not_allowed; + } + if 
(!contains(allowed_modes, mode_cnt, ac->preferred_mode) && + (ac->fallback_mode == CEPH_CON_MODE_UNKNOWN || + !contains(allowed_modes, mode_cnt, ac->fallback_mode))) { + pr_err("preferred mode '%s' not allowed\n", + ceph_con_mode_name(ac->preferred_mode)); + if (ac->fallback_mode == CEPH_CON_MODE_UNKNOWN) + pr_err("no fallback mode\n"); + else + pr_err("fallback mode '%s' not allowed\n", + ceph_con_mode_name(ac->fallback_mode)); + goto not_allowed; + } + } + + WARN_ON(result == -EOPNOTSUPP || result >= 0); + pr_err("auth protocol '%s' msgr authentication failed: %d\n", + ceph_auth_proto_name(ac->protocol), result); + + mutex_unlock(&ac->mutex); + return true; + +not_allowed: + mutex_unlock(&ac->mutex); + return false; +} + +int ceph_auth_get_authorizer(struct ceph_auth_client *ac, + struct ceph_auth_handshake *auth, + int peer_type, void *buf, int *buf_len) +{ + void *end = buf + *buf_len; + int pref_mode, fallb_mode; + int proto; + void *p; + int ret; + + ret = __ceph_auth_get_authorizer(ac, auth, peer_type, true, &proto, + &pref_mode, &fallb_mode); + if (ret) + return ret; + + p = buf; + ceph_encode_32_safe(&p, end, proto, e_range); + ret = encode_con_modes(&p, end, pref_mode, fallb_mode); + if (ret) + return ret; + + ceph_encode_32_safe(&p, end, auth->authorizer_buf_len, e_range); + *buf_len = p - buf; + return 0; + +e_range: + return -ERANGE; +} +EXPORT_SYMBOL(ceph_auth_get_authorizer); + +int ceph_auth_handle_svc_reply_more(struct ceph_auth_client *ac, + struct ceph_auth_handshake *auth, + void *reply, int reply_len, + void *buf, int *buf_len) +{ + void *end = buf + *buf_len; + void *p; + int ret; + + ret = ceph_auth_add_authorizer_challenge(ac, auth->authorizer, + reply, reply_len); + if (ret) + return ret; + + p = buf; + ceph_encode_32_safe(&p, end, auth->authorizer_buf_len, e_range); + *buf_len = p - buf; + return 0; + +e_range: + return -ERANGE; +} +EXPORT_SYMBOL(ceph_auth_handle_svc_reply_more); + +int ceph_auth_handle_svc_reply_done(struct ceph_auth_client *ac, + struct ceph_auth_handshake *auth, + void *reply, int reply_len, + u8 *session_key, int *session_key_len, + u8 *con_secret, int *con_secret_len) +{ + return ceph_auth_verify_authorizer_reply(ac, auth->authorizer, + reply, reply_len, session_key, session_key_len, + con_secret, con_secret_len); +} +EXPORT_SYMBOL(ceph_auth_handle_svc_reply_done); + +bool ceph_auth_handle_bad_authorizer(struct ceph_auth_client *ac, + int peer_type, int used_proto, int result, + const int *allowed_protos, int proto_cnt, + const int *allowed_modes, int mode_cnt) +{ + mutex_lock(&ac->mutex); + WARN_ON(used_proto != ac->protocol); + + if (result == -EOPNOTSUPP) { + if (!contains(allowed_protos, proto_cnt, ac->protocol)) { + pr_err("auth protocol '%s' not allowed by %s\n", + ceph_auth_proto_name(ac->protocol), + ceph_entity_type_name(peer_type)); + goto not_allowed; + } + if (!contains(allowed_modes, mode_cnt, ac->preferred_mode) && + (ac->fallback_mode == CEPH_CON_MODE_UNKNOWN || + !contains(allowed_modes, mode_cnt, ac->fallback_mode))) { + pr_err("preferred mode '%s' not allowed by %s\n", + ceph_con_mode_name(ac->preferred_mode), + ceph_entity_type_name(peer_type)); + if (ac->fallback_mode == CEPH_CON_MODE_UNKNOWN) + pr_err("no fallback mode\n"); + else + pr_err("fallback mode '%s' not allowed by %s\n", + ceph_con_mode_name(ac->fallback_mode), + ceph_entity_type_name(peer_type)); + goto not_allowed; + } + } + + WARN_ON(result == -EOPNOTSUPP || result >= 0); + pr_err("auth protocol '%s' authorization to %s failed: %d\n", + 
ceph_auth_proto_name(ac->protocol), + ceph_entity_type_name(peer_type), result); + + if (ac->ops->invalidate_authorizer) + ac->ops->invalidate_authorizer(ac, peer_type); + + mutex_unlock(&ac->mutex); + return true; + +not_allowed: + mutex_unlock(&ac->mutex); + return false; +} +EXPORT_SYMBOL(ceph_auth_handle_bad_authorizer); diff --git a/net/ceph/auth_none.c b/net/ceph/auth_none.c index edb7042479ed..70e86e462250 100644 --- a/net/ceph/auth_none.c +++ b/net/ceph/auth_none.c @@ -70,7 +70,9 @@ static int build_request(struct ceph_auth_client *ac, void *buf, void *end) * authenticate state, so nothing happens here. */ static int handle_reply(struct ceph_auth_client *ac, int result, - void *buf, void *end) + void *buf, void *end, u8 *session_key, + int *session_key_len, u8 *con_secret, + int *con_secret_len) { struct ceph_auth_none_info *xi = ac->private; @@ -116,7 +118,6 @@ static int ceph_auth_none_create_authorizer( } static const struct ceph_auth_client_ops ceph_auth_none_ops = { - .name = "none", .reset = reset, .destroy = destroy, .is_authenticated = is_authenticated, diff --git a/net/ceph/auth_x.c b/net/ceph/auth_x.c index b52732337ca6..ca44c327bace 100644 --- a/net/ceph/auth_x.c +++ b/net/ceph/auth_x.c @@ -22,12 +22,15 @@ static void ceph_x_validate_tickets(struct ceph_auth_client *ac, int *pneed); static int ceph_x_is_authenticated(struct ceph_auth_client *ac) { struct ceph_x_info *xi = ac->private; - int need; + int missing; + int need; /* missing + need renewal */ ceph_x_validate_tickets(ac, &need); - dout("ceph_x_is_authenticated want=%d need=%d have=%d\n", - ac->want_keys, need, xi->have_keys); - return (ac->want_keys & xi->have_keys) == ac->want_keys; + missing = ac->want_keys & ~xi->have_keys; + WARN_ON((need & missing) != missing); + dout("%s want 0x%x have 0x%x missing 0x%x -> %d\n", __func__, + ac->want_keys, xi->have_keys, missing, !missing); + return !missing; } static int ceph_x_should_authenticate(struct ceph_auth_client *ac) @@ -36,9 +39,9 @@ static int ceph_x_should_authenticate(struct ceph_auth_client *ac) int need; ceph_x_validate_tickets(ac, &need); - dout("ceph_x_should_authenticate want=%d need=%d have=%d\n", - ac->want_keys, need, xi->have_keys); - return need != 0; + dout("%s want 0x%x have 0x%x need 0x%x -> %d\n", __func__, + ac->want_keys, xi->have_keys, need, !!need); + return !!need; } static int ceph_x_encrypt_offset(void) @@ -197,7 +200,7 @@ static int process_one_ticket(struct ceph_auth_client *ac, dout(" decrypted %d bytes\n", ret); dend = dp + ret; - tkt_struct_v = ceph_decode_8(&dp); + ceph_decode_8_safe(&dp, dend, tkt_struct_v, bad); if (tkt_struct_v != 1) goto bad; @@ -205,6 +208,7 @@ static int process_one_ticket(struct ceph_auth_client *ac, if (ret) goto out; + ceph_decode_need(&dp, dend, sizeof(struct ceph_timespec), bad); ceph_decode_timespec64(&validity, dp); dp += sizeof(struct ceph_timespec); new_expires = ktime_get_real_seconds() + validity.tv_sec; @@ -265,22 +269,21 @@ out: static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac, struct ceph_crypto_key *secret, - void *buf, void *end) + void **p, void *end) { - void *p = buf; u8 reply_struct_v; u32 num; int ret; - ceph_decode_8_safe(&p, end, reply_struct_v, bad); + ceph_decode_8_safe(p, end, reply_struct_v, bad); if (reply_struct_v != 1) return -EINVAL; - ceph_decode_32_safe(&p, end, num, bad); + ceph_decode_32_safe(p, end, num, bad); dout("%d tickets\n", num); while (num--) { - ret = process_one_ticket(ac, secret, &p, end); + ret = process_one_ticket(ac, secret, p, end); if (ret) 
return ret; } @@ -379,6 +382,7 @@ static int ceph_x_build_authorizer(struct ceph_auth_client *ac, } } au->service = th->service; + WARN_ON(!th->secret_id); au->secret_id = th->secret_id; msg_a = au->buf->vec.iov_base; @@ -442,9 +446,10 @@ static bool need_key(struct ceph_x_ticket_handler *th) static bool have_key(struct ceph_x_ticket_handler *th) { - if (th->have_key) { - if (ktime_get_real_seconds() >= th->expires) - th->have_key = false; + if (th->have_key && ktime_get_real_seconds() >= th->expires) { + dout("ticket %d (%s) secret_id %llu expired\n", th->service, + ceph_entity_type_name(th->service), th->secret_id); + th->have_key = false; } return th->have_key; @@ -486,6 +491,7 @@ static int ceph_x_build_request(struct ceph_auth_client *ac, struct ceph_x_info *xi = ac->private; int need; struct ceph_x_request_header *head = buf; + void *p; int ret; struct ceph_x_ticket_handler *th = get_ticket_handler(ac, CEPH_ENTITY_TYPE_AUTH); @@ -494,18 +500,17 @@ static int ceph_x_build_request(struct ceph_auth_client *ac, return PTR_ERR(th); ceph_x_validate_tickets(ac, &need); - - dout("build_request want %x have %x need %x\n", - ac->want_keys, xi->have_keys, need); + dout("%s want 0x%x have 0x%x need 0x%x\n", __func__, ac->want_keys, + xi->have_keys, need); if (need & CEPH_ENTITY_TYPE_AUTH) { struct ceph_x_authenticate *auth = (void *)(head + 1); - void *p = auth + 1; void *enc_buf = xi->auth_authorizer.enc_buf; struct ceph_x_challenge_blob *blob = enc_buf + ceph_x_encrypt_offset(); u64 *u; + p = auth + 1; if (p > end) return -ERANGE; @@ -521,7 +526,7 @@ static int ceph_x_build_request(struct ceph_auth_client *ac, if (ret < 0) return ret; - auth->struct_v = 1; + auth->struct_v = 2; /* nautilus+ */ auth->key = 0; for (u = (u64 *)enc_buf; u + 1 <= (u64 *)(enc_buf + ret); u++) auth->key ^= *(__le64 *)u; @@ -534,39 +539,137 @@ static int ceph_x_build_request(struct ceph_auth_client *ac, if (ret < 0) return ret; + /* nautilus+: request service tickets at the same time */ + need = ac->want_keys & ~CEPH_ENTITY_TYPE_AUTH; + WARN_ON(!need); + ceph_encode_32_safe(&p, end, need, e_range); return p - buf; } if (need) { - void *p = head + 1; - struct ceph_x_service_ticket_request *req; - - if (p > end) - return -ERANGE; - head->op = cpu_to_le16(CEPHX_GET_PRINCIPAL_SESSION_KEY); - + dout(" get_principal_session_key\n"); ret = ceph_x_build_authorizer(ac, th, &xi->auth_authorizer); if (ret) return ret; - ceph_encode_copy(&p, xi->auth_authorizer.buf->vec.iov_base, - xi->auth_authorizer.buf->vec.iov_len); - req = p; - req->keys = cpu_to_le32(need); - p += sizeof(*req); + p = buf; + ceph_encode_16_safe(&p, end, CEPHX_GET_PRINCIPAL_SESSION_KEY, + e_range); + ceph_encode_copy_safe(&p, end, + xi->auth_authorizer.buf->vec.iov_base, + xi->auth_authorizer.buf->vec.iov_len, e_range); + ceph_encode_8_safe(&p, end, 1, e_range); + ceph_encode_32_safe(&p, end, need, e_range); return p - buf; } return 0; + +e_range: + return -ERANGE; +} + +static int decode_con_secret(void **p, void *end, u8 *con_secret, + int *con_secret_len) +{ + int len; + + ceph_decode_32_safe(p, end, len, bad); + ceph_decode_need(p, end, len, bad); + + dout("%s len %d\n", __func__, len); + if (con_secret) { + if (len > CEPH_MAX_CON_SECRET_LEN) { + pr_err("connection secret too big %d\n", len); + goto bad_memzero; + } + memcpy(con_secret, *p, len); + *con_secret_len = len; + } + memzero_explicit(*p, len); + *p += len; + return 0; + +bad_memzero: + memzero_explicit(*p, len); +bad: + pr_err("failed to decode connection secret\n"); + return -EINVAL; +} + 
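decode_con_secret() just above is a good example of the bounded-copy-and-scrub discipline running through the msgr2 secret handling (see also the memzero_explicit() added to ceph_crypto_key_decode() and the kfree_sensitive() in ceph_crypto_key_destroy() further down): length-check against the destination, copy out, and zero the wire copy on every path. A stand-alone sketch of the same pattern, assuming a little-endian host and a hypothetical MAX_SECRET_LEN in place of CEPH_MAX_CON_SECRET_LEN:

#include <string.h>

#define MAX_SECRET_LEN 64	/* stands in for CEPH_MAX_CON_SECRET_LEN */

/*
 * Copy a length-prefixed secret out of a wire buffer, then scrub the
 * wire copy.  Returns 0 on success, -1 on a short or oversized blob.
 * Note: the kernel uses memzero_explicit() so the scrub cannot be
 * optimized away; plain memset() here is only illustrative.
 */
static int copy_and_scrub(unsigned char **p, unsigned char *end,
			  unsigned char *out, int *out_len)
{
	unsigned int len;

	if (end - *p < 4)			/* ceph_decode_32_safe() */
		return -1;
	memcpy(&len, *p, 4);			/* wire is little-endian */
	*p += 4;
	if ((unsigned int)(end - *p) < len)	/* ceph_decode_need() */
		return -1;
	if (len > MAX_SECRET_LEN) {
		memset(*p, 0, len);		/* scrub even on error */
		return -1;
	}
	memcpy(out, *p, len);
	*out_len = len;
	memset(*p, 0, len);			/* scrub the wire copy */
	*p += len;
	return 0;
}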
+static int handle_auth_session_key(struct ceph_auth_client *ac, + void **p, void *end, + u8 *session_key, int *session_key_len, + u8 *con_secret, int *con_secret_len) +{ + struct ceph_x_info *xi = ac->private; + struct ceph_x_ticket_handler *th; + void *dp, *dend; + int len; + int ret; + + /* AUTH ticket */ + ret = ceph_x_proc_ticket_reply(ac, &xi->secret, p, end); + if (ret) + return ret; + + if (*p == end) { + /* pre-nautilus (or didn't request service tickets!) */ + WARN_ON(session_key || con_secret); + return 0; + } + + th = get_ticket_handler(ac, CEPH_ENTITY_TYPE_AUTH); + if (IS_ERR(th)) + return PTR_ERR(th); + + if (session_key) { + memcpy(session_key, th->session_key.key, th->session_key.len); + *session_key_len = th->session_key.len; + } + + /* connection secret */ + ceph_decode_32_safe(p, end, len, e_inval); + dout("%s connection secret blob len %d\n", __func__, len); + if (len > 0) { + dp = *p + ceph_x_encrypt_offset(); + ret = ceph_x_decrypt(&th->session_key, p, *p + len); + if (ret < 0) + return ret; + + dout("%s decrypted %d bytes\n", __func__, ret); + dend = dp + ret; + + ret = decode_con_secret(&dp, dend, con_secret, con_secret_len); + if (ret) + return ret; + } + + /* service tickets */ + ceph_decode_32_safe(p, end, len, e_inval); + dout("%s service tickets blob len %d\n", __func__, len); + if (len > 0) { + ret = ceph_x_proc_ticket_reply(ac, &th->session_key, + p, *p + len); + if (ret) + return ret; + } + + return 0; + +e_inval: + return -EINVAL; } static int ceph_x_handle_reply(struct ceph_auth_client *ac, int result, - void *buf, void *end) + void *buf, void *end, + u8 *session_key, int *session_key_len, + u8 *con_secret, int *con_secret_len) { struct ceph_x_info *xi = ac->private; - struct ceph_x_reply_header *head = buf; struct ceph_x_ticket_handler *th; int len = end - buf; + void *p; int op; int ret; @@ -587,22 +690,25 @@ static int ceph_x_handle_reply(struct ceph_auth_client *ac, int result, return -EAGAIN; } - op = le16_to_cpu(head->op); - result = le32_to_cpu(head->result); + p = buf; + ceph_decode_16_safe(&p, end, op, e_inval); + ceph_decode_32_safe(&p, end, result, e_inval); dout("handle_reply op %d result %d\n", op, result); switch (op) { case CEPHX_GET_AUTH_SESSION_KEY: - /* verify auth key */ - ret = ceph_x_proc_ticket_reply(ac, &xi->secret, - buf + sizeof(*head), end); + /* AUTH ticket + [connection secret] + service tickets */ + ret = handle_auth_session_key(ac, &p, end, session_key, + session_key_len, con_secret, + con_secret_len); break; case CEPHX_GET_PRINCIPAL_SESSION_KEY: th = get_ticket_handler(ac, CEPH_ENTITY_TYPE_AUTH); if (IS_ERR(th)) return PTR_ERR(th); - ret = ceph_x_proc_ticket_reply(ac, &th->session_key, - buf + sizeof(*head), end); + + /* service tickets */ + ret = ceph_x_proc_ticket_reply(ac, &th->session_key, &p, end); break; default: @@ -613,6 +719,9 @@ static int ceph_x_handle_reply(struct ceph_auth_client *ac, int result, if (ac->want_keys == xi->have_keys) return 0; return -EAGAIN; + +e_inval: + return -EINVAL; } static void ceph_x_destroy_authorizer(struct ceph_authorizer *a) @@ -678,40 +787,44 @@ static int ceph_x_update_authorizer( return 0; } -static int decrypt_authorize_challenge(struct ceph_x_authorizer *au, - void *challenge_buf, - int challenge_buf_len, - u64 *server_challenge) +/* + * CephXAuthorizeChallenge + */ +static int decrypt_authorizer_challenge(struct ceph_crypto_key *secret, + void *challenge, int challenge_len, + u64 *server_challenge) { - struct ceph_x_authorize_challenge *ch = - challenge_buf + sizeof(struct 
ceph_x_encrypt_header); + void *dp, *dend; int ret; /* no leading len */ - ret = __ceph_x_decrypt(&au->session_key, challenge_buf, - challenge_buf_len); + ret = __ceph_x_decrypt(secret, challenge, challenge_len); if (ret < 0) return ret; - if (ret < sizeof(*ch)) { - pr_err("bad size %d for ceph_x_authorize_challenge\n", ret); - return -EINVAL; - } - *server_challenge = le64_to_cpu(ch->server_challenge); + dout("%s decrypted %d bytes\n", __func__, ret); + dp = challenge + sizeof(struct ceph_x_encrypt_header); + dend = dp + ret; + + ceph_decode_skip_8(&dp, dend, e_inval); /* struct_v */ + ceph_decode_64_safe(&dp, dend, *server_challenge, e_inval); + dout("%s server_challenge %llu\n", __func__, *server_challenge); return 0; + +e_inval: + return -EINVAL; } static int ceph_x_add_authorizer_challenge(struct ceph_auth_client *ac, struct ceph_authorizer *a, - void *challenge_buf, - int challenge_buf_len) + void *challenge, int challenge_len) { struct ceph_x_authorizer *au = (void *)a; u64 server_challenge; int ret; - ret = decrypt_authorize_challenge(au, challenge_buf, challenge_buf_len, - &server_challenge); + ret = decrypt_authorizer_challenge(&au->session_key, challenge, + challenge_len, &server_challenge); if (ret) { pr_err("failed to decrypt authorize challenge: %d", ret); return ret; @@ -726,29 +839,67 @@ static int ceph_x_add_authorizer_challenge(struct ceph_auth_client *ac, return 0; } +/* + * CephXAuthorizeReply + */ +static int decrypt_authorizer_reply(struct ceph_crypto_key *secret, + void **p, void *end, u64 *nonce_plus_one, + u8 *con_secret, int *con_secret_len) +{ + void *dp, *dend; + u8 struct_v; + int ret; + + dp = *p + ceph_x_encrypt_offset(); + ret = ceph_x_decrypt(secret, p, end); + if (ret < 0) + return ret; + + dout("%s decrypted %d bytes\n", __func__, ret); + dend = dp + ret; + + ceph_decode_8_safe(&dp, dend, struct_v, e_inval); + ceph_decode_64_safe(&dp, dend, *nonce_plus_one, e_inval); + dout("%s nonce_plus_one %llu\n", __func__, *nonce_plus_one); + if (struct_v >= 2) { + ret = decode_con_secret(&dp, dend, con_secret, con_secret_len); + if (ret) + return ret; + } + + return 0; + +e_inval: + return -EINVAL; +} + static int ceph_x_verify_authorizer_reply(struct ceph_auth_client *ac, - struct ceph_authorizer *a) + struct ceph_authorizer *a, + void *reply, int reply_len, + u8 *session_key, int *session_key_len, + u8 *con_secret, int *con_secret_len) { struct ceph_x_authorizer *au = (void *)a; - void *p = au->enc_buf; - struct ceph_x_authorize_reply *reply = p + ceph_x_encrypt_offset(); + u64 nonce_plus_one; int ret; - ret = ceph_x_decrypt(&au->session_key, &p, p + CEPHX_AU_ENC_BUF_LEN); - if (ret < 0) + if (session_key) { + memcpy(session_key, au->session_key.key, au->session_key.len); + *session_key_len = au->session_key.len; + } + + ret = decrypt_authorizer_reply(&au->session_key, &reply, + reply + reply_len, &nonce_plus_one, + con_secret, con_secret_len); + if (ret) return ret; - if (ret < sizeof(*reply)) { - pr_err("bad size %d for ceph_x_authorize_reply\n", ret); - return -EINVAL; + + if (nonce_plus_one != au->nonce + 1) { + pr_err("failed to authenticate server\n"); + return -EPERM; } - if (au->nonce + 1 != le64_to_cpu(reply->nonce_plus_one)) - ret = -EPERM; - else - ret = 0; - dout("verify_authorizer_reply nonce %llx got %llx ret %d\n", - au->nonce, le64_to_cpu(reply->nonce_plus_one), ret); - return ret; + return 0; } static void ceph_x_reset(struct ceph_auth_client *ac) @@ -785,8 +936,15 @@ static void invalidate_ticket(struct ceph_auth_client *ac, int peer_type) struct 
ceph_x_ticket_handler *th; th = get_ticket_handler(ac, peer_type); - if (!IS_ERR(th)) + if (IS_ERR(th)) + return; + + if (th->have_key) { + dout("ticket %d (%s) secret_id %llu invalidated\n", + th->service, ceph_entity_type_name(th->service), + th->secret_id); th->have_key = false; + } } static void ceph_x_invalidate_authorizer(struct ceph_auth_client *ac, @@ -911,7 +1069,6 @@ static int ceph_x_check_message_signature(struct ceph_auth_handshake *auth, } static const struct ceph_auth_client_ops ceph_x_ops = { - .name = "x", .is_authenticated = ceph_x_is_authenticated, .should_authenticate = ceph_x_should_authenticate, .build_request = ceph_x_build_request, diff --git a/net/ceph/auth_x_protocol.h b/net/ceph/auth_x_protocol.h index 24b0b74564d0..792fcb974dc3 100644 --- a/net/ceph/auth_x_protocol.h +++ b/net/ceph/auth_x_protocol.h @@ -38,7 +38,8 @@ struct ceph_x_authenticate { __u8 struct_v; __le64 client_challenge; __le64 key; - /* ticket blob */ + /* old_ticket blob */ + /* nautilus+: other_keys */ } __attribute__ ((packed)); struct ceph_x_service_ticket_request { diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c index 4e7edd707a14..271287c5ec12 100644 --- a/net/ceph/ceph_common.c +++ b/net/ceph/ceph_common.c @@ -265,6 +265,7 @@ enum { Opt_ip, Opt_crush_location, Opt_read_from_replica, + Opt_ms_mode, /* string args above */ Opt_share, Opt_crc, @@ -287,6 +288,23 @@ static const struct constant_table ceph_param_read_from_replica[] = { {} }; +enum ceph_ms_mode { + Opt_ms_mode_legacy, + Opt_ms_mode_crc, + Opt_ms_mode_secure, + Opt_ms_mode_prefer_crc, + Opt_ms_mode_prefer_secure +}; + +static const struct constant_table ceph_param_ms_mode[] = { + {"legacy", Opt_ms_mode_legacy}, + {"crc", Opt_ms_mode_crc}, + {"secure", Opt_ms_mode_secure}, + {"prefer-crc", Opt_ms_mode_prefer_crc}, + {"prefer-secure", Opt_ms_mode_prefer_secure}, + {} +}; + static const struct fs_parameter_spec ceph_parameters[] = { fsparam_flag ("abort_on_full", Opt_abort_on_full), fsparam_flag_no ("cephx_require_signatures", Opt_cephx_require_signatures), @@ -305,6 +323,8 @@ static const struct fs_parameter_spec ceph_parameters[] = { fs_param_deprecated, NULL), fsparam_enum ("read_from_replica", Opt_read_from_replica, ceph_param_read_from_replica), + fsparam_enum ("ms_mode", Opt_ms_mode, + ceph_param_ms_mode), fsparam_string ("secret", Opt_secret), fsparam_flag_no ("share", Opt_share), fsparam_flag_no ("tcp_nodelay", Opt_tcp_nodelay), @@ -333,6 +353,8 @@ struct ceph_options *ceph_alloc_options(void) opt->osd_idle_ttl = CEPH_OSD_IDLE_TTL_DEFAULT; opt->osd_request_timeout = CEPH_OSD_REQUEST_TIMEOUT_DEFAULT; opt->read_from_replica = CEPH_READ_FROM_REPLICA_DEFAULT; + opt->con_modes[0] = CEPH_CON_MODE_UNKNOWN; + opt->con_modes[1] = CEPH_CON_MODE_UNKNOWN; return opt; } EXPORT_SYMBOL(ceph_alloc_options); @@ -503,6 +525,32 @@ int ceph_parse_param(struct fs_parameter *param, struct ceph_options *opt, BUG(); } break; + case Opt_ms_mode: + switch (result.uint_32) { + case Opt_ms_mode_legacy: + opt->con_modes[0] = CEPH_CON_MODE_UNKNOWN; + opt->con_modes[1] = CEPH_CON_MODE_UNKNOWN; + break; + case Opt_ms_mode_crc: + opt->con_modes[0] = CEPH_CON_MODE_CRC; + opt->con_modes[1] = CEPH_CON_MODE_UNKNOWN; + break; + case Opt_ms_mode_secure: + opt->con_modes[0] = CEPH_CON_MODE_SECURE; + opt->con_modes[1] = CEPH_CON_MODE_UNKNOWN; + break; + case Opt_ms_mode_prefer_crc: + opt->con_modes[0] = CEPH_CON_MODE_CRC; + opt->con_modes[1] = CEPH_CON_MODE_SECURE; + break; + case Opt_ms_mode_prefer_secure: + opt->con_modes[0] = CEPH_CON_MODE_SECURE; + 
opt->con_modes[1] = CEPH_CON_MODE_CRC; + break; + default: + BUG(); + } + break; case Opt_osdtimeout: warn_plog(&log, "Ignoring osdtimeout"); @@ -616,6 +664,21 @@ int ceph_print_client_options(struct seq_file *m, struct ceph_client *client, } else if (opt->read_from_replica == CEPH_OSD_FLAG_LOCALIZE_READS) { seq_puts(m, "read_from_replica=localize,"); } + if (opt->con_modes[0] != CEPH_CON_MODE_UNKNOWN) { + if (opt->con_modes[0] == CEPH_CON_MODE_CRC && + opt->con_modes[1] == CEPH_CON_MODE_UNKNOWN) { + seq_puts(m, "ms_mode=crc,"); + } else if (opt->con_modes[0] == CEPH_CON_MODE_SECURE && + opt->con_modes[1] == CEPH_CON_MODE_UNKNOWN) { + seq_puts(m, "ms_mode=secure,"); + } else if (opt->con_modes[0] == CEPH_CON_MODE_CRC && + opt->con_modes[1] == CEPH_CON_MODE_SECURE) { + seq_puts(m, "ms_mode=prefer-crc,"); + } else if (opt->con_modes[0] == CEPH_CON_MODE_SECURE && + opt->con_modes[1] == CEPH_CON_MODE_CRC) { + seq_puts(m, "ms_mode=prefer-secure,"); + } + } if (opt->flags & CEPH_OPT_FSID) seq_printf(m, "fsid=%pU,", &opt->fsid); diff --git a/net/ceph/ceph_strings.c b/net/ceph/ceph_strings.c index 10e01494993c..355fea272120 100644 --- a/net/ceph/ceph_strings.c +++ b/net/ceph/ceph_strings.c @@ -18,6 +18,34 @@ const char *ceph_entity_type_name(int type) } EXPORT_SYMBOL(ceph_entity_type_name); +const char *ceph_auth_proto_name(int proto) +{ + switch (proto) { + case CEPH_AUTH_UNKNOWN: + return "unknown"; + case CEPH_AUTH_NONE: + return "none"; + case CEPH_AUTH_CEPHX: + return "cephx"; + default: + return "???"; + } +} + +const char *ceph_con_mode_name(int mode) +{ + switch (mode) { + case CEPH_CON_MODE_UNKNOWN: + return "unknown"; + case CEPH_CON_MODE_CRC: + return "crc"; + case CEPH_CON_MODE_SECURE: + return "secure"; + default: + return "???"; + } +} + const char *ceph_osd_op_name(int op) { switch (op) { diff --git a/net/ceph/crypto.c b/net/ceph/crypto.c index 4f75df40fb12..92d89b331645 100644 --- a/net/ceph/crypto.c +++ b/net/ceph/crypto.c @@ -96,6 +96,7 @@ int ceph_crypto_key_decode(struct ceph_crypto_key *key, void **p, void *end) key->len = ceph_decode_16(p); ceph_decode_need(p, end, key->len, bad); ret = set_secret(key, *p); + memzero_explicit(*p, key->len); *p += key->len; return ret; @@ -134,7 +135,7 @@ int ceph_crypto_key_unarmor(struct ceph_crypto_key *key, const char *inkey) void ceph_crypto_key_destroy(struct ceph_crypto_key *key) { if (key) { - kfree(key->key); + kfree_sensitive(key->key); key->key = NULL; if (key->tfm) { crypto_free_sync_skcipher(key->tfm); diff --git a/net/ceph/crypto.h b/net/ceph/crypto.h index 96ef4d860bc9..13bd526349fa 100644 --- a/net/ceph/crypto.h +++ b/net/ceph/crypto.h @@ -5,6 +5,9 @@ #include <linux/ceph/types.h> #include <linux/ceph/buffer.h> +#define CEPH_KEY_LEN 16 +#define CEPH_MAX_CON_SECRET_LEN 64 + /* * cryptographic secret */ diff --git a/net/ceph/decode.c b/net/ceph/decode.c index eea529595a7a..b44f7651be04 100644 --- a/net/ceph/decode.c +++ b/net/ceph/decode.c @@ -1,4 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 +#include <linux/ceph/ceph_debug.h> + +#include <linux/inet.h> #include <linux/ceph/decode.h> @@ -82,3 +85,101 @@ bad: } EXPORT_SYMBOL(ceph_decode_entity_addr); +/* + * Return addr of desired type (MSGR2 or LEGACY) or error. + * Make sure there is only one match. + * + * Assume encoding with MSG_ADDR2. + */ +int ceph_decode_entity_addrvec(void **p, void *end, bool msgr2, + struct ceph_entity_addr *addr) +{ + __le32 my_type = msgr2 ? 
CEPH_ENTITY_ADDR_TYPE_MSGR2 : + CEPH_ENTITY_ADDR_TYPE_LEGACY; + struct ceph_entity_addr tmp_addr; + int addr_cnt; + bool found; + u8 marker; + int ret; + int i; + + ceph_decode_8_safe(p, end, marker, e_inval); + if (marker != 2) { + pr_err("bad addrvec marker %d\n", marker); + return -EINVAL; + } + + ceph_decode_32_safe(p, end, addr_cnt, e_inval); + + found = false; + for (i = 0; i < addr_cnt; i++) { + ret = ceph_decode_entity_addr(p, end, &tmp_addr); + if (ret) + return ret; + + if (tmp_addr.type == my_type) { + if (found) { + pr_err("another match of type %d in addrvec\n", + le32_to_cpu(my_type)); + return -EINVAL; + } + + memcpy(addr, &tmp_addr, sizeof(*addr)); + found = true; + } + } + if (!found && addr_cnt != 0) { + pr_err("no match of type %d in addrvec\n", + le32_to_cpu(my_type)); + return -ENOENT; + } + + return 0; + +e_inval: + return -EINVAL; +} +EXPORT_SYMBOL(ceph_decode_entity_addrvec); + +static int get_sockaddr_encoding_len(sa_family_t family) +{ + union { + struct sockaddr sa; + struct sockaddr_in sin; + struct sockaddr_in6 sin6; + } u; + + switch (family) { + case AF_INET: + return sizeof(u.sin); + case AF_INET6: + return sizeof(u.sin6); + default: + return sizeof(u); + } +} + +int ceph_entity_addr_encoding_len(const struct ceph_entity_addr *addr) +{ + sa_family_t family = get_unaligned(&addr->in_addr.ss_family); + int addr_len = get_sockaddr_encoding_len(family); + + return 1 + CEPH_ENCODING_START_BLK_LEN + 4 + 4 + 4 + addr_len; +} + +void ceph_encode_entity_addr(void **p, const struct ceph_entity_addr *addr) +{ + sa_family_t family = get_unaligned(&addr->in_addr.ss_family); + int addr_len = get_sockaddr_encoding_len(family); + + ceph_encode_8(p, 1); /* marker */ + ceph_start_encoding(p, 1, 1, sizeof(addr->type) + + sizeof(addr->nonce) + + sizeof(u32) + addr_len); + ceph_encode_copy(p, &addr->type, sizeof(addr->type)); + ceph_encode_copy(p, &addr->nonce, sizeof(addr->nonce)); + + ceph_encode_32(p, addr_len); + ceph_encode_16(p, family); + ceph_encode_copy(p, addr->in_addr.__data, addr_len - sizeof(family)); +} diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index af0f1fa24937..57d043b382ed 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -82,71 +82,51 @@ #define CON_SOCK_STATE_CONNECTED 3 /* -> CLOSING or -> CLOSED */ #define CON_SOCK_STATE_CLOSING 4 /* -> CLOSED */ -/* - * connection states - */ -#define CON_STATE_CLOSED 1 /* -> PREOPEN */ -#define CON_STATE_PREOPEN 2 /* -> CONNECTING, CLOSED */ -#define CON_STATE_CONNECTING 3 /* -> NEGOTIATING, CLOSED */ -#define CON_STATE_NEGOTIATING 4 /* -> OPEN, CLOSED */ -#define CON_STATE_OPEN 5 /* -> STANDBY, CLOSED */ -#define CON_STATE_STANDBY 6 /* -> PREOPEN, CLOSED */ - -/* - * ceph_connection flag bits - */ -#define CON_FLAG_LOSSYTX 0 /* we can close channel or drop - * messages on errors */ -#define CON_FLAG_KEEPALIVE_PENDING 1 /* we need to send a keepalive */ -#define CON_FLAG_WRITE_PENDING 2 /* we have data ready to send */ -#define CON_FLAG_SOCK_CLOSED 3 /* socket state changed to closed */ -#define CON_FLAG_BACKOFF 4 /* need to retry queuing delayed work */ - static bool con_flag_valid(unsigned long con_flag) { switch (con_flag) { - case CON_FLAG_LOSSYTX: - case CON_FLAG_KEEPALIVE_PENDING: - case CON_FLAG_WRITE_PENDING: - case CON_FLAG_SOCK_CLOSED: - case CON_FLAG_BACKOFF: + case CEPH_CON_F_LOSSYTX: + case CEPH_CON_F_KEEPALIVE_PENDING: + case CEPH_CON_F_WRITE_PENDING: + case CEPH_CON_F_SOCK_CLOSED: + case CEPH_CON_F_BACKOFF: return true; default: return false; } } -static void 
con_flag_clear(struct ceph_connection *con, unsigned long con_flag) +void ceph_con_flag_clear(struct ceph_connection *con, unsigned long con_flag) { BUG_ON(!con_flag_valid(con_flag)); clear_bit(con_flag, &con->flags); } -static void con_flag_set(struct ceph_connection *con, unsigned long con_flag) +void ceph_con_flag_set(struct ceph_connection *con, unsigned long con_flag) { BUG_ON(!con_flag_valid(con_flag)); set_bit(con_flag, &con->flags); } -static bool con_flag_test(struct ceph_connection *con, unsigned long con_flag) +bool ceph_con_flag_test(struct ceph_connection *con, unsigned long con_flag) { BUG_ON(!con_flag_valid(con_flag)); return test_bit(con_flag, &con->flags); } -static bool con_flag_test_and_clear(struct ceph_connection *con, - unsigned long con_flag) +bool ceph_con_flag_test_and_clear(struct ceph_connection *con, + unsigned long con_flag) { BUG_ON(!con_flag_valid(con_flag)); return test_and_clear_bit(con_flag, &con->flags); } -static bool con_flag_test_and_set(struct ceph_connection *con, - unsigned long con_flag) +bool ceph_con_flag_test_and_set(struct ceph_connection *con, + unsigned long con_flag) { BUG_ON(!con_flag_valid(con_flag)); @@ -157,12 +137,6 @@ static bool con_flag_test_and_set(struct ceph_connection *con, static struct kmem_cache *ceph_msg_cache; -/* static tag bytes (protocol control messages) */ -static char tag_msg = CEPH_MSGR_TAG_MSG; -static char tag_ack = CEPH_MSGR_TAG_ACK; -static char tag_keepalive = CEPH_MSGR_TAG_KEEPALIVE; -static char tag_keepalive2 = CEPH_MSGR_TAG_KEEPALIVE2; - #ifdef CONFIG_LOCKDEP static struct lock_class_key socket_class; #endif @@ -184,7 +158,7 @@ static void con_fault(struct ceph_connection *con); static char addr_str[ADDR_STR_COUNT][MAX_ADDR_STR_LEN]; static atomic_t addr_str_seq = ATOMIC_INIT(0); -static struct page *zero_page; /* used in certain error cases */ +struct page *ceph_zero_page; /* used in certain error cases */ const char *ceph_pr_addr(const struct ceph_entity_addr *addr) { @@ -219,10 +193,13 @@ const char *ceph_pr_addr(const struct ceph_entity_addr *addr) } EXPORT_SYMBOL(ceph_pr_addr); -static void encode_my_addr(struct ceph_messenger *msgr) +void ceph_encode_my_addr(struct ceph_messenger *msgr) { - memcpy(&msgr->my_enc_addr, &msgr->inst.addr, sizeof(msgr->my_enc_addr)); - ceph_encode_banner_addr(&msgr->my_enc_addr); + if (!ceph_msgr2(from_msgr(msgr))) { + memcpy(&msgr->my_enc_addr, &msgr->inst.addr, + sizeof(msgr->my_enc_addr)); + ceph_encode_banner_addr(&msgr->my_enc_addr); + } } /* @@ -254,9 +231,9 @@ static void _ceph_msgr_exit(void) ceph_msgr_wq = NULL; } - BUG_ON(zero_page == NULL); - put_page(zero_page); - zero_page = NULL; + BUG_ON(!ceph_zero_page); + put_page(ceph_zero_page); + ceph_zero_page = NULL; ceph_msgr_slab_exit(); } @@ -266,9 +243,9 @@ int __init ceph_msgr_init(void) if (ceph_msgr_slab_init()) return -ENOMEM; - BUG_ON(zero_page != NULL); - zero_page = ZERO_PAGE(0); - get_page(zero_page); + BUG_ON(ceph_zero_page); + ceph_zero_page = ZERO_PAGE(0); + get_page(ceph_zero_page); /* * The number of active work items is limited by the number of @@ -372,7 +349,7 @@ static void ceph_sock_data_ready(struct sock *sk) } if (sk->sk_state != TCP_CLOSE_WAIT) { - dout("%s on %p state = %lu, queueing work\n", __func__, + dout("%s %p state = %d, queueing work\n", __func__, con, con->state); queue_con(con); } @@ -390,7 +367,7 @@ static void ceph_sock_write_space(struct sock *sk) * buffer. See net/ipv4/tcp_input.c:tcp_check_space() * and net/core/stream.c:sk_stream_write_space(). 
*/ - if (con_flag_test(con, CON_FLAG_WRITE_PENDING)) { + if (ceph_con_flag_test(con, CEPH_CON_F_WRITE_PENDING)) { if (sk_stream_is_writeable(sk)) { dout("%s %p queueing write work\n", __func__, con); clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags); @@ -406,7 +383,7 @@ static void ceph_sock_state_change(struct sock *sk) { struct ceph_connection *con = sk->sk_user_data; - dout("%s %p state = %lu sk_state = %u\n", __func__, + dout("%s %p state = %d sk_state = %u\n", __func__, con, con->state, sk->sk_state); switch (sk->sk_state) { @@ -416,7 +393,7 @@ static void ceph_sock_state_change(struct sock *sk) case TCP_CLOSE_WAIT: dout("%s TCP_CLOSE_WAIT\n", __func__); con_sock_state_closing(con); - con_flag_set(con, CON_FLAG_SOCK_CLOSED); + ceph_con_flag_set(con, CEPH_CON_F_SOCK_CLOSED); queue_con(con); break; case TCP_ESTABLISHED: @@ -450,13 +427,15 @@ static void set_sock_callbacks(struct socket *sock, /* * initiate connection to a remote socket. */ -static int ceph_tcp_connect(struct ceph_connection *con) +int ceph_tcp_connect(struct ceph_connection *con) { struct sockaddr_storage ss = con->peer_addr.in_addr; /* align */ struct socket *sock; unsigned int noio_flag; int ret; + dout("%s con %p peer_addr %s\n", __func__, con, + ceph_pr_addr(&con->peer_addr)); BUG_ON(con->sock); /* sock_create_kern() allocates with GFP_KERNEL */ @@ -474,8 +453,6 @@ static int ceph_tcp_connect(struct ceph_connection *con) set_sock_callbacks(sock, con); - dout("connect %s\n", ceph_pr_addr(&con->peer_addr)); - con_sock_state_connecting(con); ret = sock->ops->connect(sock, (struct sockaddr *)&ss, sizeof(ss), O_NONBLOCK); @@ -498,103 +475,13 @@ static int ceph_tcp_connect(struct ceph_connection *con) } /* - * If @buf is NULL, discard up to @len bytes. - */ -static int ceph_tcp_recvmsg(struct socket *sock, void *buf, size_t len) -{ - struct kvec iov = {buf, len}; - struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL }; - int r; - - if (!buf) - msg.msg_flags |= MSG_TRUNC; - - iov_iter_kvec(&msg.msg_iter, READ, &iov, 1, len); - r = sock_recvmsg(sock, &msg, msg.msg_flags); - if (r == -EAGAIN) - r = 0; - return r; -} - -static int ceph_tcp_recvpage(struct socket *sock, struct page *page, - int page_offset, size_t length) -{ - struct bio_vec bvec = { - .bv_page = page, - .bv_offset = page_offset, - .bv_len = length - }; - struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL }; - int r; - - BUG_ON(page_offset + length > PAGE_SIZE); - iov_iter_bvec(&msg.msg_iter, READ, &bvec, 1, length); - r = sock_recvmsg(sock, &msg, msg.msg_flags); - if (r == -EAGAIN) - r = 0; - return r; -} - -/* - * write something. @more is true if caller will be sending more data - * shortly. 
- */ -static int ceph_tcp_sendmsg(struct socket *sock, struct kvec *iov, - size_t kvlen, size_t len, bool more) -{ - struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL }; - int r; - - if (more) - msg.msg_flags |= MSG_MORE; - else - msg.msg_flags |= MSG_EOR; /* superfluous, but what the hell */ - - r = kernel_sendmsg(sock, &msg, iov, kvlen, len); - if (r == -EAGAIN) - r = 0; - return r; -} - -/* - * @more: either or both of MSG_MORE and MSG_SENDPAGE_NOTLAST - */ -static int ceph_tcp_sendpage(struct socket *sock, struct page *page, - int offset, size_t size, int more) -{ - ssize_t (*sendpage)(struct socket *sock, struct page *page, - int offset, size_t size, int flags); - int flags = MSG_DONTWAIT | MSG_NOSIGNAL | more; - int ret; - - /* - * sendpage cannot properly handle pages with page_count == 0, - * we need to fall back to sendmsg if that's the case. - * - * Same goes for slab pages: skb_can_coalesce() allows - * coalescing neighboring slab objects into a single frag which - * triggers one of hardened usercopy checks. - */ - if (sendpage_ok(page)) - sendpage = sock->ops->sendpage; - else - sendpage = sock_no_sendpage; - - ret = sendpage(sock, page, offset, size, flags); - if (ret == -EAGAIN) - ret = 0; - - return ret; -} - -/* * Shutdown/close the socket for the given connection. */ -static int con_close_socket(struct ceph_connection *con) +int ceph_con_close_socket(struct ceph_connection *con) { int rc = 0; - dout("con_close_socket on %p sock %p\n", con, con->sock); + dout("%s con %p sock %p\n", __func__, con, con->sock); if (con->sock) { rc = con->sock->ops->shutdown(con->sock, SHUT_RDWR); sock_release(con->sock); @@ -607,12 +494,34 @@ static int con_close_socket(struct ceph_connection *con) * received a socket close event before we had the chance to * shut the socket down. */ - con_flag_clear(con, CON_FLAG_SOCK_CLOSED); + ceph_con_flag_clear(con, CEPH_CON_F_SOCK_CLOSED); con_sock_state_closed(con); return rc; } +static void ceph_con_reset_protocol(struct ceph_connection *con) +{ + dout("%s con %p\n", __func__, con); + + ceph_con_close_socket(con); + if (con->in_msg) { + WARN_ON(con->in_msg->con != con); + ceph_msg_put(con->in_msg); + con->in_msg = NULL; + } + if (con->out_msg) { + WARN_ON(con->out_msg->con != con); + ceph_msg_put(con->out_msg); + con->out_msg = NULL; + } + + if (ceph_msgr2(from_msgr(con->msgr))) + ceph_con_v2_reset_protocol(con); + else + ceph_con_v1_reset_protocol(con); +} + /* * Reset a connection. Discard all incoming and outgoing messages * and clear *_seq state. 
@@ -623,6 +532,7 @@ static void ceph_msg_remove(struct ceph_msg *msg) ceph_msg_put(msg); } + static void ceph_msg_remove_list(struct list_head *head) { while (!list_empty(head)) { @@ -632,31 +542,22 @@ static void ceph_msg_remove_list(struct list_head *head) } } -static void reset_connection(struct ceph_connection *con) +void ceph_con_reset_session(struct ceph_connection *con) { - /* reset connection, out_queue, msg_ and connect_seq */ - /* discard existing out_queue and msg_seq */ - dout("reset_connection %p\n", con); + dout("%s con %p\n", __func__, con); + + WARN_ON(con->in_msg); + WARN_ON(con->out_msg); ceph_msg_remove_list(&con->out_queue); ceph_msg_remove_list(&con->out_sent); - - if (con->in_msg) { - BUG_ON(con->in_msg->con != con); - ceph_msg_put(con->in_msg); - con->in_msg = NULL; - } - - con->connect_seq = 0; con->out_seq = 0; - if (con->out_msg) { - BUG_ON(con->out_msg->con != con); - ceph_msg_put(con->out_msg); - con->out_msg = NULL; - } con->in_seq = 0; con->in_seq_acked = 0; - con->out_skip = 0; + if (ceph_msgr2(from_msgr(con->msgr))) + ceph_con_v2_reset_session(con); + else + ceph_con_v1_reset_session(con); } /* @@ -666,17 +567,17 @@ void ceph_con_close(struct ceph_connection *con) { mutex_lock(&con->mutex); dout("con_close %p peer %s\n", con, ceph_pr_addr(&con->peer_addr)); - con->state = CON_STATE_CLOSED; + con->state = CEPH_CON_S_CLOSED; - con_flag_clear(con, CON_FLAG_LOSSYTX); /* so we retry next connect */ - con_flag_clear(con, CON_FLAG_KEEPALIVE_PENDING); - con_flag_clear(con, CON_FLAG_WRITE_PENDING); - con_flag_clear(con, CON_FLAG_BACKOFF); + ceph_con_flag_clear(con, CEPH_CON_F_LOSSYTX); /* so we retry next + connect */ + ceph_con_flag_clear(con, CEPH_CON_F_KEEPALIVE_PENDING); + ceph_con_flag_clear(con, CEPH_CON_F_WRITE_PENDING); + ceph_con_flag_clear(con, CEPH_CON_F_BACKOFF); - reset_connection(con); - con->peer_global_seq = 0; + ceph_con_reset_protocol(con); + ceph_con_reset_session(con); cancel_con(con); - con_close_socket(con); mutex_unlock(&con->mutex); } EXPORT_SYMBOL(ceph_con_close); @@ -691,8 +592,8 @@ void ceph_con_open(struct ceph_connection *con, mutex_lock(&con->mutex); dout("con_open %p %s\n", con, ceph_pr_addr(addr)); - WARN_ON(con->state != CON_STATE_CLOSED); - con->state = CON_STATE_PREOPEN; + WARN_ON(con->state != CEPH_CON_S_CLOSED); + con->state = CEPH_CON_S_PREOPEN; con->peer_name.type = (__u8) entity_type; con->peer_name.num = cpu_to_le64(entity_num); @@ -709,7 +610,10 @@ EXPORT_SYMBOL(ceph_con_open); */ bool ceph_con_opened(struct ceph_connection *con) { - return con->connect_seq > 0; + if (ceph_msgr2(from_msgr(con->msgr))) + return ceph_con_v2_opened(con); + + return ceph_con_v1_opened(con); } /* @@ -732,16 +636,15 @@ void ceph_con_init(struct ceph_connection *con, void *private, INIT_LIST_HEAD(&con->out_sent); INIT_DELAYED_WORK(&con->work, ceph_con_workfn); - con->state = CON_STATE_CLOSED; + con->state = CEPH_CON_S_CLOSED; } EXPORT_SYMBOL(ceph_con_init); - /* * We maintain a global counter to order connection attempts. Get * a unique seq greater than @gt. 
*/ -static u32 get_global_seq(struct ceph_messenger *msgr, u32 gt) +u32 ceph_get_global_seq(struct ceph_messenger *msgr, u32 gt) { u32 ret; @@ -753,48 +656,53 @@ static u32 get_global_seq(struct ceph_messenger *msgr, u32 gt) return ret; } -static void con_out_kvec_reset(struct ceph_connection *con) -{ - BUG_ON(con->out_skip); - - con->out_kvec_left = 0; - con->out_kvec_bytes = 0; - con->out_kvec_cur = &con->out_kvec[0]; -} - -static void con_out_kvec_add(struct ceph_connection *con, - size_t size, void *data) +/* + * Discard messages that have been acked by the server. + */ +void ceph_con_discard_sent(struct ceph_connection *con, u64 ack_seq) { - int index = con->out_kvec_left; + struct ceph_msg *msg; + u64 seq; - BUG_ON(con->out_skip); - BUG_ON(index >= ARRAY_SIZE(con->out_kvec)); + dout("%s con %p ack_seq %llu\n", __func__, con, ack_seq); + while (!list_empty(&con->out_sent)) { + msg = list_first_entry(&con->out_sent, struct ceph_msg, + list_head); + WARN_ON(msg->needs_out_seq); + seq = le64_to_cpu(msg->hdr.seq); + if (seq > ack_seq) + break; - con->out_kvec[index].iov_len = size; - con->out_kvec[index].iov_base = data; - con->out_kvec_left++; - con->out_kvec_bytes += size; + dout("%s con %p discarding msg %p seq %llu\n", __func__, con, + msg, seq); + ceph_msg_remove(msg); + } } /* - * Chop off a kvec from the end. Return residual number of bytes for - * that kvec, i.e. how many bytes would have been written if the kvec - * hadn't been nuked. + * Discard messages that have been requeued in con_fault(), up to + * reconnect_seq. This avoids gratuitously resending messages that + * the server had received and handled prior to reconnect. */ -static int con_out_kvec_skip(struct ceph_connection *con) +void ceph_con_discard_requeued(struct ceph_connection *con, u64 reconnect_seq) { - int off = con->out_kvec_cur - con->out_kvec; - int skip = 0; + struct ceph_msg *msg; + u64 seq; - if (con->out_kvec_bytes > 0) { - skip = con->out_kvec[off + con->out_kvec_left - 1].iov_len; - BUG_ON(con->out_kvec_bytes < skip); - BUG_ON(!con->out_kvec_left); - con->out_kvec_bytes -= skip; - con->out_kvec_left--; - } + dout("%s con %p reconnect_seq %llu\n", __func__, con, reconnect_seq); + while (!list_empty(&con->out_queue)) { + msg = list_first_entry(&con->out_queue, struct ceph_msg, + list_head); + if (msg->needs_out_seq) + break; + seq = le64_to_cpu(msg->hdr.seq); + if (seq > reconnect_seq) + break; - return skip; + dout("%s con %p discarding msg %p seq %llu\n", __func__, con, + msg, seq); + ceph_msg_remove(msg); + } } #ifdef CONFIG_BLOCK @@ -1113,10 +1021,9 @@ static void __ceph_msg_data_cursor_init(struct ceph_msg_data_cursor *cursor) cursor->need_crc = true; } -static void ceph_msg_data_cursor_init(struct ceph_msg *msg, size_t length) +void ceph_msg_data_cursor_init(struct ceph_msg_data_cursor *cursor, + struct ceph_msg *msg, size_t length) { - struct ceph_msg_data_cursor *cursor = &msg->cursor; - BUG_ON(!length); BUG_ON(length > msg->data_length); BUG_ON(!msg->num_data_items); @@ -1132,9 +1039,9 @@ static void ceph_msg_data_cursor_init(struct ceph_msg *msg, size_t length) * data item, and supply the page offset and length of that piece. * Indicate whether this is the last piece in this data item. 
*/ -static struct page *ceph_msg_data_next(struct ceph_msg_data_cursor *cursor, - size_t *page_offset, size_t *length, - bool *last_piece) +struct page *ceph_msg_data_next(struct ceph_msg_data_cursor *cursor, + size_t *page_offset, size_t *length, + bool *last_piece) { struct page *page; @@ -1173,8 +1080,7 @@ static struct page *ceph_msg_data_next(struct ceph_msg_data_cursor *cursor, * Returns true if the result moves the cursor on to the next piece * of the data item. */ -static void ceph_msg_data_advance(struct ceph_msg_data_cursor *cursor, - size_t bytes) +void ceph_msg_data_advance(struct ceph_msg_data_cursor *cursor, size_t bytes) { bool new_piece; @@ -1210,328 +1116,8 @@ static void ceph_msg_data_advance(struct ceph_msg_data_cursor *cursor, cursor->need_crc = new_piece; } -static size_t sizeof_footer(struct ceph_connection *con) -{ - return (con->peer_features & CEPH_FEATURE_MSG_AUTH) ? - sizeof(struct ceph_msg_footer) : - sizeof(struct ceph_msg_footer_old); -} - -static void prepare_message_data(struct ceph_msg *msg, u32 data_len) -{ - /* Initialize data cursor */ - - ceph_msg_data_cursor_init(msg, (size_t)data_len); -} - -/* - * Prepare footer for currently outgoing message, and finish things - * off. Assumes out_kvec* are already valid.. we just add on to the end. - */ -static void prepare_write_message_footer(struct ceph_connection *con) -{ - struct ceph_msg *m = con->out_msg; - - m->footer.flags |= CEPH_MSG_FOOTER_COMPLETE; - - dout("prepare_write_message_footer %p\n", con); - con_out_kvec_add(con, sizeof_footer(con), &m->footer); - if (con->peer_features & CEPH_FEATURE_MSG_AUTH) { - if (con->ops->sign_message) - con->ops->sign_message(m); - else - m->footer.sig = 0; - } else { - m->old_footer.flags = m->footer.flags; - } - con->out_more = m->more_to_follow; - con->out_msg_done = true; -} - -/* - * Prepare headers for the next outgoing message. - */ -static void prepare_write_message(struct ceph_connection *con) -{ - struct ceph_msg *m; - u32 crc; - - con_out_kvec_reset(con); - con->out_msg_done = false; - - /* Sneak an ack in there first? If we can get it into the same - * TCP packet that's a good thing. */ - if (con->in_seq > con->in_seq_acked) { - con->in_seq_acked = con->in_seq; - con_out_kvec_add(con, sizeof (tag_ack), &tag_ack); - con->out_temp_ack = cpu_to_le64(con->in_seq_acked); - con_out_kvec_add(con, sizeof (con->out_temp_ack), - &con->out_temp_ack); - } - - BUG_ON(list_empty(&con->out_queue)); - m = list_first_entry(&con->out_queue, struct ceph_msg, list_head); - con->out_msg = m; - BUG_ON(m->con != con); - - /* put message on sent list */ - ceph_msg_get(m); - list_move_tail(&m->list_head, &con->out_sent); - - /* - * only assign outgoing seq # if we haven't sent this message - * yet. if it is requeued, resend with it's original seq. 
- */ - if (m->needs_out_seq) { - m->hdr.seq = cpu_to_le64(++con->out_seq); - m->needs_out_seq = false; - - if (con->ops->reencode_message) - con->ops->reencode_message(m); - } - - dout("prepare_write_message %p seq %lld type %d len %d+%d+%zd\n", - m, con->out_seq, le16_to_cpu(m->hdr.type), - le32_to_cpu(m->hdr.front_len), le32_to_cpu(m->hdr.middle_len), - m->data_length); - WARN_ON(m->front.iov_len != le32_to_cpu(m->hdr.front_len)); - WARN_ON(m->data_length != le32_to_cpu(m->hdr.data_len)); - - /* tag + hdr + front + middle */ - con_out_kvec_add(con, sizeof (tag_msg), &tag_msg); - con_out_kvec_add(con, sizeof(con->out_hdr), &con->out_hdr); - con_out_kvec_add(con, m->front.iov_len, m->front.iov_base); - - if (m->middle) - con_out_kvec_add(con, m->middle->vec.iov_len, - m->middle->vec.iov_base); - - /* fill in hdr crc and finalize hdr */ - crc = crc32c(0, &m->hdr, offsetof(struct ceph_msg_header, crc)); - con->out_msg->hdr.crc = cpu_to_le32(crc); - memcpy(&con->out_hdr, &con->out_msg->hdr, sizeof(con->out_hdr)); - - /* fill in front and middle crc, footer */ - crc = crc32c(0, m->front.iov_base, m->front.iov_len); - con->out_msg->footer.front_crc = cpu_to_le32(crc); - if (m->middle) { - crc = crc32c(0, m->middle->vec.iov_base, - m->middle->vec.iov_len); - con->out_msg->footer.middle_crc = cpu_to_le32(crc); - } else - con->out_msg->footer.middle_crc = 0; - dout("%s front_crc %u middle_crc %u\n", __func__, - le32_to_cpu(con->out_msg->footer.front_crc), - le32_to_cpu(con->out_msg->footer.middle_crc)); - con->out_msg->footer.flags = 0; - - /* is there a data payload? */ - con->out_msg->footer.data_crc = 0; - if (m->data_length) { - prepare_message_data(con->out_msg, m->data_length); - con->out_more = 1; /* data + footer will follow */ - } else { - /* no, queue up footer too and be done */ - prepare_write_message_footer(con); - } - - con_flag_set(con, CON_FLAG_WRITE_PENDING); -} - -/* - * Prepare an ack. - */ -static void prepare_write_ack(struct ceph_connection *con) -{ - dout("prepare_write_ack %p %llu -> %llu\n", con, - con->in_seq_acked, con->in_seq); - con->in_seq_acked = con->in_seq; - - con_out_kvec_reset(con); - - con_out_kvec_add(con, sizeof (tag_ack), &tag_ack); - - con->out_temp_ack = cpu_to_le64(con->in_seq_acked); - con_out_kvec_add(con, sizeof (con->out_temp_ack), - &con->out_temp_ack); - - con->out_more = 1; /* more will follow.. eventually.. */ - con_flag_set(con, CON_FLAG_WRITE_PENDING); -} - -/* - * Prepare to share the seq during handshake - */ -static void prepare_write_seq(struct ceph_connection *con) -{ - dout("prepare_write_seq %p %llu -> %llu\n", con, - con->in_seq_acked, con->in_seq); - con->in_seq_acked = con->in_seq; - - con_out_kvec_reset(con); - - con->out_temp_ack = cpu_to_le64(con->in_seq_acked); - con_out_kvec_add(con, sizeof (con->out_temp_ack), - &con->out_temp_ack); - - con_flag_set(con, CON_FLAG_WRITE_PENDING); -} - -/* - * Prepare to write keepalive byte. 
- */ -static void prepare_write_keepalive(struct ceph_connection *con) -{ - dout("prepare_write_keepalive %p\n", con); - con_out_kvec_reset(con); - if (con->peer_features & CEPH_FEATURE_MSGR_KEEPALIVE2) { - struct timespec64 now; - - ktime_get_real_ts64(&now); - con_out_kvec_add(con, sizeof(tag_keepalive2), &tag_keepalive2); - ceph_encode_timespec64(&con->out_temp_keepalive2, &now); - con_out_kvec_add(con, sizeof(con->out_temp_keepalive2), - &con->out_temp_keepalive2); - } else { - con_out_kvec_add(con, sizeof(tag_keepalive), &tag_keepalive); - } - con_flag_set(con, CON_FLAG_WRITE_PENDING); -} - -/* - * Connection negotiation. - */ - -static int get_connect_authorizer(struct ceph_connection *con) -{ - struct ceph_auth_handshake *auth; - int auth_proto; - - if (!con->ops->get_authorizer) { - con->auth = NULL; - con->out_connect.authorizer_protocol = CEPH_AUTH_UNKNOWN; - con->out_connect.authorizer_len = 0; - return 0; - } - - auth = con->ops->get_authorizer(con, &auth_proto, con->auth_retry); - if (IS_ERR(auth)) - return PTR_ERR(auth); - - con->auth = auth; - con->out_connect.authorizer_protocol = cpu_to_le32(auth_proto); - con->out_connect.authorizer_len = cpu_to_le32(auth->authorizer_buf_len); - return 0; -} - -/* - * We connected to a peer and are saying hello. - */ -static void prepare_write_banner(struct ceph_connection *con) -{ - con_out_kvec_add(con, strlen(CEPH_BANNER), CEPH_BANNER); - con_out_kvec_add(con, sizeof (con->msgr->my_enc_addr), - &con->msgr->my_enc_addr); - - con->out_more = 0; - con_flag_set(con, CON_FLAG_WRITE_PENDING); -} - -static void __prepare_write_connect(struct ceph_connection *con) -{ - con_out_kvec_add(con, sizeof(con->out_connect), &con->out_connect); - if (con->auth) - con_out_kvec_add(con, con->auth->authorizer_buf_len, - con->auth->authorizer_buf); - - con->out_more = 0; - con_flag_set(con, CON_FLAG_WRITE_PENDING); -} - -static int prepare_write_connect(struct ceph_connection *con) -{ - unsigned int global_seq = get_global_seq(con->msgr, 0); - int proto; - int ret; - - switch (con->peer_name.type) { - case CEPH_ENTITY_TYPE_MON: - proto = CEPH_MONC_PROTOCOL; - break; - case CEPH_ENTITY_TYPE_OSD: - proto = CEPH_OSDC_PROTOCOL; - break; - case CEPH_ENTITY_TYPE_MDS: - proto = CEPH_MDSC_PROTOCOL; - break; - default: - BUG(); - } - - dout("prepare_write_connect %p cseq=%d gseq=%d proto=%d\n", con, - con->connect_seq, global_seq, proto); - - con->out_connect.features = - cpu_to_le64(from_msgr(con->msgr)->supported_features); - con->out_connect.host_type = cpu_to_le32(CEPH_ENTITY_TYPE_CLIENT); - con->out_connect.connect_seq = cpu_to_le32(con->connect_seq); - con->out_connect.global_seq = cpu_to_le32(global_seq); - con->out_connect.protocol_version = cpu_to_le32(proto); - con->out_connect.flags = 0; - - ret = get_connect_authorizer(con); - if (ret) - return ret; - - __prepare_write_connect(con); - return 0; -} - -/* - * write as much of pending kvecs to the socket as we can. 
- * 1 -> done - * 0 -> socket full, but more to do - * <0 -> error - */ -static int write_partial_kvec(struct ceph_connection *con) -{ - int ret; - - dout("write_partial_kvec %p %d left\n", con, con->out_kvec_bytes); - while (con->out_kvec_bytes > 0) { - ret = ceph_tcp_sendmsg(con->sock, con->out_kvec_cur, - con->out_kvec_left, con->out_kvec_bytes, - con->out_more); - if (ret <= 0) - goto out; - con->out_kvec_bytes -= ret; - if (con->out_kvec_bytes == 0) - break; /* done */ - - /* account for full iov entries consumed */ - while (ret >= con->out_kvec_cur->iov_len) { - BUG_ON(!con->out_kvec_left); - ret -= con->out_kvec_cur->iov_len; - con->out_kvec_cur++; - con->out_kvec_left--; - } - /* and for a partially-consumed entry */ - if (ret) { - con->out_kvec_cur->iov_len -= ret; - con->out_kvec_cur->iov_base += ret; - } - } - con->out_kvec_left = 0; - ret = 1; -out: - dout("write_partial_kvec %p %d left in %d kvecs ret = %d\n", con, - con->out_kvec_bytes, con->out_kvec_left, ret); - return ret; /* done! */ -} - -static u32 ceph_crc32c_page(u32 crc, struct page *page, - unsigned int page_offset, - unsigned int length) +u32 ceph_crc32c_page(u32 crc, struct page *page, unsigned int page_offset, + unsigned int length) { char *kaddr; @@ -1542,257 +1128,8 @@ static u32 ceph_crc32c_page(u32 crc, struct page *page, return crc; } -/* - * Write as much message data payload as we can. If we finish, queue - * up the footer. - * 1 -> done, footer is now queued in out_kvec[]. - * 0 -> socket full, but more to do - * <0 -> error - */ -static int write_partial_message_data(struct ceph_connection *con) -{ - struct ceph_msg *msg = con->out_msg; - struct ceph_msg_data_cursor *cursor = &msg->cursor; - bool do_datacrc = !ceph_test_opt(from_msgr(con->msgr), NOCRC); - int more = MSG_MORE | MSG_SENDPAGE_NOTLAST; - u32 crc; - - dout("%s %p msg %p\n", __func__, con, msg); - - if (!msg->num_data_items) - return -EINVAL; - - /* - * Iterate through each page that contains data to be - * written, and send as much as possible for each. - * - * If we are calculating the data crc (the default), we will - * need to map the page. If we have no pages, they have - * been revoked, so use the zero page. - */ - crc = do_datacrc ? 
le32_to_cpu(msg->footer.data_crc) : 0; - while (cursor->total_resid) { - struct page *page; - size_t page_offset; - size_t length; - int ret; - - if (!cursor->resid) { - ceph_msg_data_advance(cursor, 0); - continue; - } - - page = ceph_msg_data_next(cursor, &page_offset, &length, NULL); - if (length == cursor->total_resid) - more = MSG_MORE; - ret = ceph_tcp_sendpage(con->sock, page, page_offset, length, - more); - if (ret <= 0) { - if (do_datacrc) - msg->footer.data_crc = cpu_to_le32(crc); - - return ret; - } - if (do_datacrc && cursor->need_crc) - crc = ceph_crc32c_page(crc, page, page_offset, length); - ceph_msg_data_advance(cursor, (size_t)ret); - } - - dout("%s %p msg %p done\n", __func__, con, msg); - - /* prepare and queue up footer, too */ - if (do_datacrc) - msg->footer.data_crc = cpu_to_le32(crc); - else - msg->footer.flags |= CEPH_MSG_FOOTER_NOCRC; - con_out_kvec_reset(con); - prepare_write_message_footer(con); - - return 1; /* must return > 0 to indicate success */ -} - -/* - * write some zeros - */ -static int write_partial_skip(struct ceph_connection *con) -{ - int more = MSG_MORE | MSG_SENDPAGE_NOTLAST; - int ret; - - dout("%s %p %d left\n", __func__, con, con->out_skip); - while (con->out_skip > 0) { - size_t size = min(con->out_skip, (int) PAGE_SIZE); - - if (size == con->out_skip) - more = MSG_MORE; - ret = ceph_tcp_sendpage(con->sock, zero_page, 0, size, more); - if (ret <= 0) - goto out; - con->out_skip -= ret; - } - ret = 1; -out: - return ret; -} - -/* - * Prepare to read connection handshake, or an ack. - */ -static void prepare_read_banner(struct ceph_connection *con) -{ - dout("prepare_read_banner %p\n", con); - con->in_base_pos = 0; -} - -static void prepare_read_connect(struct ceph_connection *con) -{ - dout("prepare_read_connect %p\n", con); - con->in_base_pos = 0; -} - -static void prepare_read_ack(struct ceph_connection *con) -{ - dout("prepare_read_ack %p\n", con); - con->in_base_pos = 0; -} - -static void prepare_read_seq(struct ceph_connection *con) -{ - dout("prepare_read_seq %p\n", con); - con->in_base_pos = 0; - con->in_tag = CEPH_MSGR_TAG_SEQ; -} - -static void prepare_read_tag(struct ceph_connection *con) -{ - dout("prepare_read_tag %p\n", con); - con->in_base_pos = 0; - con->in_tag = CEPH_MSGR_TAG_READY; -} - -static void prepare_read_keepalive_ack(struct ceph_connection *con) -{ - dout("prepare_read_keepalive_ack %p\n", con); - con->in_base_pos = 0; -} - -/* - * Prepare to read a message. 
- */ -static int prepare_read_message(struct ceph_connection *con) -{ - dout("prepare_read_message %p\n", con); - BUG_ON(con->in_msg != NULL); - con->in_base_pos = 0; - con->in_front_crc = con->in_middle_crc = con->in_data_crc = 0; - return 0; -} - - -static int read_partial(struct ceph_connection *con, - int end, int size, void *object) -{ - while (con->in_base_pos < end) { - int left = end - con->in_base_pos; - int have = size - left; - int ret = ceph_tcp_recvmsg(con->sock, object + have, left); - if (ret <= 0) - return ret; - con->in_base_pos += ret; - } - return 1; -} - - -/* - * Read all or part of the connect-side handshake on a new connection - */ -static int read_partial_banner(struct ceph_connection *con) -{ - int size; - int end; - int ret; - - dout("read_partial_banner %p at %d\n", con, con->in_base_pos); - - /* peer's banner */ - size = strlen(CEPH_BANNER); - end = size; - ret = read_partial(con, end, size, con->in_banner); - if (ret <= 0) - goto out; - - size = sizeof (con->actual_peer_addr); - end += size; - ret = read_partial(con, end, size, &con->actual_peer_addr); - if (ret <= 0) - goto out; - ceph_decode_banner_addr(&con->actual_peer_addr); - - size = sizeof (con->peer_addr_for_me); - end += size; - ret = read_partial(con, end, size, &con->peer_addr_for_me); - if (ret <= 0) - goto out; - ceph_decode_banner_addr(&con->peer_addr_for_me); - -out: - return ret; -} - -static int read_partial_connect(struct ceph_connection *con) -{ - int size; - int end; - int ret; - - dout("read_partial_connect %p at %d\n", con, con->in_base_pos); - - size = sizeof (con->in_reply); - end = size; - ret = read_partial(con, end, size, &con->in_reply); - if (ret <= 0) - goto out; - - if (con->auth) { - size = le32_to_cpu(con->in_reply.authorizer_len); - if (size > con->auth->authorizer_reply_buf_len) { - pr_err("authorizer reply too big: %d > %zu\n", size, - con->auth->authorizer_reply_buf_len); - ret = -EINVAL; - goto out; - } - - end += size; - ret = read_partial(con, end, size, - con->auth->authorizer_reply_buf); - if (ret <= 0) - goto out; - } - - dout("read_partial_connect %p tag %d, con_seq = %u, g_seq = %u\n", - con, (int)con->in_reply.tag, - le32_to_cpu(con->in_reply.connect_seq), - le32_to_cpu(con->in_reply.global_seq)); -out: - return ret; -} -/* - * Verify the hello banner looks okay. 
- */ -static int verify_hello(struct ceph_connection *con) -{ - if (memcmp(con->in_banner, CEPH_BANNER, strlen(CEPH_BANNER))) { - pr_err("connect to %s got bad banner\n", - ceph_pr_addr(&con->peer_addr)); - con->error_msg = "protocol error, bad banner"; - return -1; - } - return 0; -} - -static bool addr_is_blank(struct ceph_entity_addr *addr) +bool ceph_addr_is_blank(const struct ceph_entity_addr *addr) { struct sockaddr_storage ss = addr->in_addr; /* align */ struct in_addr *addr4 = &((struct sockaddr_in *)&ss)->sin_addr; @@ -1808,7 +1145,7 @@ static bool addr_is_blank(struct ceph_entity_addr *addr) } } -static int addr_port(struct ceph_entity_addr *addr) +int ceph_addr_port(const struct ceph_entity_addr *addr) { switch (get_unaligned(&addr->in_addr.ss_family)) { case AF_INET: @@ -1819,7 +1156,7 @@ static int addr_port(struct ceph_entity_addr *addr) return 0; } -static void addr_set_port(struct ceph_entity_addr *addr, int p) +void ceph_addr_set_port(struct ceph_entity_addr *addr, int p) { switch (get_unaligned(&addr->in_addr.ss_family)) { case AF_INET: @@ -1977,8 +1314,17 @@ int ceph_parse_ips(const char *c, const char *end, port = CEPH_MON_PORT; } - addr_set_port(&addr[i], port); + ceph_addr_set_port(&addr[i], port); + /* + * We want the type to be set according to ms_mode + * option, but options are normally parsed after mon + * addresses. Rather than complicating parsing, set + * to LEGACY and override in build_initial_monmap() + * for mon addresses and ceph_messenger_init() for + * ip option. + */ addr[i].type = CEPH_ENTITY_ADDR_TYPE_LEGACY; + addr[i].nonce = 0; dout("parse_ips got %s\n", ceph_pr_addr(&addr[i])); @@ -2000,521 +1346,12 @@ bad: return ret; } -static int process_banner(struct ceph_connection *con) -{ - dout("process_banner on %p\n", con); - - if (verify_hello(con) < 0) - return -1; - - /* - * Make sure the other end is who we wanted. note that the other - * end may not yet know their ip address, so if it's 0.0.0.0, give - * them the benefit of the doubt. - */ - if (memcmp(&con->peer_addr, &con->actual_peer_addr, - sizeof(con->peer_addr)) != 0 && - !(addr_is_blank(&con->actual_peer_addr) && - con->actual_peer_addr.nonce == con->peer_addr.nonce)) { - pr_warn("wrong peer, want %s/%u, got %s/%u\n", - ceph_pr_addr(&con->peer_addr), - le32_to_cpu(con->peer_addr.nonce), - ceph_pr_addr(&con->actual_peer_addr), - le32_to_cpu(con->actual_peer_addr.nonce)); - con->error_msg = "wrong peer at address"; - return -1; - } - - /* - * did we learn our address? - */ - if (addr_is_blank(&con->msgr->inst.addr)) { - int port = addr_port(&con->msgr->inst.addr); - - memcpy(&con->msgr->inst.addr.in_addr, - &con->peer_addr_for_me.in_addr, - sizeof(con->peer_addr_for_me.in_addr)); - addr_set_port(&con->msgr->inst.addr, port); - encode_my_addr(con->msgr); - dout("process_banner learned my addr is %s\n", - ceph_pr_addr(&con->msgr->inst.addr)); - } - - return 0; -} - -static int process_connect(struct ceph_connection *con) -{ - u64 sup_feat = from_msgr(con->msgr)->supported_features; - u64 req_feat = from_msgr(con->msgr)->required_features; - u64 server_feat = le64_to_cpu(con->in_reply.features); - int ret; - - dout("process_connect on %p tag %d\n", con, (int)con->in_tag); - - if (con->auth) { - int len = le32_to_cpu(con->in_reply.authorizer_len); - - /* - * Any connection that defines ->get_authorizer() - * should also define ->add_authorizer_challenge() and - * ->verify_authorizer_reply(). - * - * See get_connect_authorizer(). 
- */ - if (con->in_reply.tag == CEPH_MSGR_TAG_CHALLENGE_AUTHORIZER) { - ret = con->ops->add_authorizer_challenge( - con, con->auth->authorizer_reply_buf, len); - if (ret < 0) - return ret; - - con_out_kvec_reset(con); - __prepare_write_connect(con); - prepare_read_connect(con); - return 0; - } - - if (len) { - ret = con->ops->verify_authorizer_reply(con); - if (ret < 0) { - con->error_msg = "bad authorize reply"; - return ret; - } - } - } - - switch (con->in_reply.tag) { - case CEPH_MSGR_TAG_FEATURES: - pr_err("%s%lld %s feature set mismatch," - " my %llx < server's %llx, missing %llx\n", - ENTITY_NAME(con->peer_name), - ceph_pr_addr(&con->peer_addr), - sup_feat, server_feat, server_feat & ~sup_feat); - con->error_msg = "missing required protocol features"; - reset_connection(con); - return -1; - - case CEPH_MSGR_TAG_BADPROTOVER: - pr_err("%s%lld %s protocol version mismatch," - " my %d != server's %d\n", - ENTITY_NAME(con->peer_name), - ceph_pr_addr(&con->peer_addr), - le32_to_cpu(con->out_connect.protocol_version), - le32_to_cpu(con->in_reply.protocol_version)); - con->error_msg = "protocol version mismatch"; - reset_connection(con); - return -1; - - case CEPH_MSGR_TAG_BADAUTHORIZER: - con->auth_retry++; - dout("process_connect %p got BADAUTHORIZER attempt %d\n", con, - con->auth_retry); - if (con->auth_retry == 2) { - con->error_msg = "connect authorization failure"; - return -1; - } - con_out_kvec_reset(con); - ret = prepare_write_connect(con); - if (ret < 0) - return ret; - prepare_read_connect(con); - break; - - case CEPH_MSGR_TAG_RESETSESSION: - /* - * If we connected with a large connect_seq but the peer - * has no record of a session with us (no connection, or - * connect_seq == 0), they will send RESETSESION to indicate - * that they must have reset their session, and may have - * dropped messages. - */ - dout("process_connect got RESET peer seq %u\n", - le32_to_cpu(con->in_reply.connect_seq)); - pr_err("%s%lld %s connection reset\n", - ENTITY_NAME(con->peer_name), - ceph_pr_addr(&con->peer_addr)); - reset_connection(con); - con_out_kvec_reset(con); - ret = prepare_write_connect(con); - if (ret < 0) - return ret; - prepare_read_connect(con); - - /* Tell ceph about it. */ - mutex_unlock(&con->mutex); - pr_info("reset on %s%lld\n", ENTITY_NAME(con->peer_name)); - if (con->ops->peer_reset) - con->ops->peer_reset(con); - mutex_lock(&con->mutex); - if (con->state != CON_STATE_NEGOTIATING) - return -EAGAIN; - break; - - case CEPH_MSGR_TAG_RETRY_SESSION: - /* - * If we sent a smaller connect_seq than the peer has, try - * again with a larger value. - */ - dout("process_connect got RETRY_SESSION my seq %u, peer %u\n", - le32_to_cpu(con->out_connect.connect_seq), - le32_to_cpu(con->in_reply.connect_seq)); - con->connect_seq = le32_to_cpu(con->in_reply.connect_seq); - con_out_kvec_reset(con); - ret = prepare_write_connect(con); - if (ret < 0) - return ret; - prepare_read_connect(con); - break; - - case CEPH_MSGR_TAG_RETRY_GLOBAL: - /* - * If we sent a smaller global_seq than the peer has, try - * again with a larger value. 
- */ - dout("process_connect got RETRY_GLOBAL my %u peer_gseq %u\n", - con->peer_global_seq, - le32_to_cpu(con->in_reply.global_seq)); - get_global_seq(con->msgr, - le32_to_cpu(con->in_reply.global_seq)); - con_out_kvec_reset(con); - ret = prepare_write_connect(con); - if (ret < 0) - return ret; - prepare_read_connect(con); - break; - - case CEPH_MSGR_TAG_SEQ: - case CEPH_MSGR_TAG_READY: - if (req_feat & ~server_feat) { - pr_err("%s%lld %s protocol feature mismatch," - " my required %llx > server's %llx, need %llx\n", - ENTITY_NAME(con->peer_name), - ceph_pr_addr(&con->peer_addr), - req_feat, server_feat, req_feat & ~server_feat); - con->error_msg = "missing required protocol features"; - reset_connection(con); - return -1; - } - - WARN_ON(con->state != CON_STATE_NEGOTIATING); - con->state = CON_STATE_OPEN; - con->auth_retry = 0; /* we authenticated; clear flag */ - con->peer_global_seq = le32_to_cpu(con->in_reply.global_seq); - con->connect_seq++; - con->peer_features = server_feat; - dout("process_connect got READY gseq %d cseq %d (%d)\n", - con->peer_global_seq, - le32_to_cpu(con->in_reply.connect_seq), - con->connect_seq); - WARN_ON(con->connect_seq != - le32_to_cpu(con->in_reply.connect_seq)); - - if (con->in_reply.flags & CEPH_MSG_CONNECT_LOSSY) - con_flag_set(con, CON_FLAG_LOSSYTX); - - con->delay = 0; /* reset backoff memory */ - - if (con->in_reply.tag == CEPH_MSGR_TAG_SEQ) { - prepare_write_seq(con); - prepare_read_seq(con); - } else { - prepare_read_tag(con); - } - break; - - case CEPH_MSGR_TAG_WAIT: - /* - * If there is a connection race (we are opening - * connections to each other), one of us may just have - * to WAIT. This shouldn't happen if we are the - * client. - */ - con->error_msg = "protocol error, got WAIT as client"; - return -1; - - default: - con->error_msg = "protocol error, garbage tag during connect"; - return -1; - } - return 0; -} - - -/* - * read (part of) an ack - */ -static int read_partial_ack(struct ceph_connection *con) -{ - int size = sizeof (con->in_temp_ack); - int end = size; - - return read_partial(con, end, size, &con->in_temp_ack); -} - -/* - * We can finally discard anything that's been acked. - */ -static void process_ack(struct ceph_connection *con) -{ - struct ceph_msg *m; - u64 ack = le64_to_cpu(con->in_temp_ack); - u64 seq; - bool reconnect = (con->in_tag == CEPH_MSGR_TAG_SEQ); - struct list_head *list = reconnect ? &con->out_queue : &con->out_sent; - - /* - * In the reconnect case, con_fault() has requeued messages - * in out_sent. We should cleanup old messages according to - * the reconnect seq. 
- */ - while (!list_empty(list)) { - m = list_first_entry(list, struct ceph_msg, list_head); - if (reconnect && m->needs_out_seq) - break; - seq = le64_to_cpu(m->hdr.seq); - if (seq > ack) - break; - dout("got ack for seq %llu type %d at %p\n", seq, - le16_to_cpu(m->hdr.type), m); - m->ack_stamp = jiffies; - ceph_msg_remove(m); - } - - prepare_read_tag(con); -} - - -static int read_partial_message_section(struct ceph_connection *con, - struct kvec *section, - unsigned int sec_len, u32 *crc) -{ - int ret, left; - - BUG_ON(!section); - - while (section->iov_len < sec_len) { - BUG_ON(section->iov_base == NULL); - left = sec_len - section->iov_len; - ret = ceph_tcp_recvmsg(con->sock, (char *)section->iov_base + - section->iov_len, left); - if (ret <= 0) - return ret; - section->iov_len += ret; - } - if (section->iov_len == sec_len) - *crc = crc32c(0, section->iov_base, section->iov_len); - - return 1; -} - -static int read_partial_msg_data(struct ceph_connection *con) -{ - struct ceph_msg *msg = con->in_msg; - struct ceph_msg_data_cursor *cursor = &msg->cursor; - bool do_datacrc = !ceph_test_opt(from_msgr(con->msgr), NOCRC); - struct page *page; - size_t page_offset; - size_t length; - u32 crc = 0; - int ret; - - if (!msg->num_data_items) - return -EIO; - - if (do_datacrc) - crc = con->in_data_crc; - while (cursor->total_resid) { - if (!cursor->resid) { - ceph_msg_data_advance(cursor, 0); - continue; - } - - page = ceph_msg_data_next(cursor, &page_offset, &length, NULL); - ret = ceph_tcp_recvpage(con->sock, page, page_offset, length); - if (ret <= 0) { - if (do_datacrc) - con->in_data_crc = crc; - - return ret; - } - - if (do_datacrc) - crc = ceph_crc32c_page(crc, page, page_offset, ret); - ceph_msg_data_advance(cursor, (size_t)ret); - } - if (do_datacrc) - con->in_data_crc = crc; - - return 1; /* must return > 0 to indicate success */ -} - -/* - * read (part of) a message. 
- */ -static int ceph_con_in_msg_alloc(struct ceph_connection *con, int *skip); - -static int read_partial_message(struct ceph_connection *con) -{ - struct ceph_msg *m = con->in_msg; - int size; - int end; - int ret; - unsigned int front_len, middle_len, data_len; - bool do_datacrc = !ceph_test_opt(from_msgr(con->msgr), NOCRC); - bool need_sign = (con->peer_features & CEPH_FEATURE_MSG_AUTH); - u64 seq; - u32 crc; - - dout("read_partial_message con %p msg %p\n", con, m); - - /* header */ - size = sizeof (con->in_hdr); - end = size; - ret = read_partial(con, end, size, &con->in_hdr); - if (ret <= 0) - return ret; - - crc = crc32c(0, &con->in_hdr, offsetof(struct ceph_msg_header, crc)); - if (cpu_to_le32(crc) != con->in_hdr.crc) { - pr_err("read_partial_message bad hdr crc %u != expected %u\n", - crc, con->in_hdr.crc); - return -EBADMSG; - } - - front_len = le32_to_cpu(con->in_hdr.front_len); - if (front_len > CEPH_MSG_MAX_FRONT_LEN) - return -EIO; - middle_len = le32_to_cpu(con->in_hdr.middle_len); - if (middle_len > CEPH_MSG_MAX_MIDDLE_LEN) - return -EIO; - data_len = le32_to_cpu(con->in_hdr.data_len); - if (data_len > CEPH_MSG_MAX_DATA_LEN) - return -EIO; - - /* verify seq# */ - seq = le64_to_cpu(con->in_hdr.seq); - if ((s64)seq - (s64)con->in_seq < 1) { - pr_info("skipping %s%lld %s seq %lld expected %lld\n", - ENTITY_NAME(con->peer_name), - ceph_pr_addr(&con->peer_addr), - seq, con->in_seq + 1); - con->in_base_pos = -front_len - middle_len - data_len - - sizeof_footer(con); - con->in_tag = CEPH_MSGR_TAG_READY; - return 1; - } else if ((s64)seq - (s64)con->in_seq > 1) { - pr_err("read_partial_message bad seq %lld expected %lld\n", - seq, con->in_seq + 1); - con->error_msg = "bad message sequence # for incoming message"; - return -EBADE; - } - - /* allocate message? */ - if (!con->in_msg) { - int skip = 0; - - dout("got hdr type %d front %d data %d\n", con->in_hdr.type, - front_len, data_len); - ret = ceph_con_in_msg_alloc(con, &skip); - if (ret < 0) - return ret; - - BUG_ON(!con->in_msg ^ skip); - if (skip) { - /* skip this message */ - dout("alloc_msg said skip message\n"); - con->in_base_pos = -front_len - middle_len - data_len - - sizeof_footer(con); - con->in_tag = CEPH_MSGR_TAG_READY; - con->in_seq++; - return 1; - } - - BUG_ON(!con->in_msg); - BUG_ON(con->in_msg->con != con); - m = con->in_msg; - m->front.iov_len = 0; /* haven't read it yet */ - if (m->middle) - m->middle->vec.iov_len = 0; - - /* prepare for data payload, if any */ - - if (data_len) - prepare_message_data(con->in_msg, data_len); - } - - /* front */ - ret = read_partial_message_section(con, &m->front, front_len, - &con->in_front_crc); - if (ret <= 0) - return ret; - - /* middle */ - if (m->middle) { - ret = read_partial_message_section(con, &m->middle->vec, - middle_len, - &con->in_middle_crc); - if (ret <= 0) - return ret; - } - - /* (page) data */ - if (data_len) { - ret = read_partial_msg_data(con); - if (ret <= 0) - return ret; - } - - /* footer */ - size = sizeof_footer(con); - end += size; - ret = read_partial(con, end, size, &m->footer); - if (ret <= 0) - return ret; - - if (!need_sign) { - m->footer.flags = m->old_footer.flags; - m->footer.sig = 0; - } - - dout("read_partial_message got msg %p %d (%u) + %d (%u) + %d (%u)\n", - m, front_len, m->footer.front_crc, middle_len, - m->footer.middle_crc, data_len, m->footer.data_crc); - - /* crc ok? */ - if (con->in_front_crc != le32_to_cpu(m->footer.front_crc)) { - pr_err("read_partial_message %p front crc %u != exp. 
%u\n", - m, con->in_front_crc, m->footer.front_crc); - return -EBADMSG; - } - if (con->in_middle_crc != le32_to_cpu(m->footer.middle_crc)) { - pr_err("read_partial_message %p middle crc %u != exp %u\n", - m, con->in_middle_crc, m->footer.middle_crc); - return -EBADMSG; - } - if (do_datacrc && - (m->footer.flags & CEPH_MSG_FOOTER_NOCRC) == 0 && - con->in_data_crc != le32_to_cpu(m->footer.data_crc)) { - pr_err("read_partial_message %p data crc %u != exp. %u\n", m, - con->in_data_crc, le32_to_cpu(m->footer.data_crc)); - return -EBADMSG; - } - - if (need_sign && con->ops->check_message_signature && - con->ops->check_message_signature(m)) { - pr_err("read_partial_message %p signature check failed\n", m); - return -EBADMSG; - } - - return 1; /* done! */ -} - /* * Process message. This happens in the worker thread. The callback should * be careful not to do anything that waits on other incoming messages or it * may deadlock. */ -static void process_message(struct ceph_connection *con) +void ceph_con_process_message(struct ceph_connection *con) { struct ceph_msg *msg = con->in_msg; @@ -2528,12 +1365,13 @@ static void process_message(struct ceph_connection *con) con->in_seq++; mutex_unlock(&con->mutex); - dout("===== %p %llu from %s%lld %d=%s len %d+%d (%u %u %u) =====\n", + dout("===== %p %llu from %s%lld %d=%s len %d+%d+%d (%u %u %u) =====\n", msg, le64_to_cpu(msg->hdr.seq), ENTITY_NAME(msg->hdr.src), le16_to_cpu(msg->hdr.type), ceph_msg_type_name(le16_to_cpu(msg->hdr.type)), le32_to_cpu(msg->hdr.front_len), + le32_to_cpu(msg->hdr.middle_len), le32_to_cpu(msg->hdr.data_len), con->in_front_crc, con->in_middle_crc, con->in_data_crc); con->ops->dispatch(con, msg); @@ -2541,264 +1379,6 @@ static void process_message(struct ceph_connection *con) mutex_lock(&con->mutex); } -static int read_keepalive_ack(struct ceph_connection *con) -{ - struct ceph_timespec ceph_ts; - size_t size = sizeof(ceph_ts); - int ret = read_partial(con, size, size, &ceph_ts); - if (ret <= 0) - return ret; - ceph_decode_timespec64(&con->last_keepalive_ack, &ceph_ts); - prepare_read_tag(con); - return 1; -} - -/* - * Write something to the socket. Called in a worker thread when the - * socket appears to be writeable and we have something ready to send. - */ -static int try_write(struct ceph_connection *con) -{ - int ret = 1; - - dout("try_write start %p state %lu\n", con, con->state); - if (con->state != CON_STATE_PREOPEN && - con->state != CON_STATE_CONNECTING && - con->state != CON_STATE_NEGOTIATING && - con->state != CON_STATE_OPEN) - return 0; - - /* open the socket first? */ - if (con->state == CON_STATE_PREOPEN) { - BUG_ON(con->sock); - con->state = CON_STATE_CONNECTING; - - con_out_kvec_reset(con); - prepare_write_banner(con); - prepare_read_banner(con); - - BUG_ON(con->in_msg); - con->in_tag = CEPH_MSGR_TAG_READY; - dout("try_write initiating connect on %p new state %lu\n", - con, con->state); - ret = ceph_tcp_connect(con); - if (ret < 0) { - con->error_msg = "connect error"; - goto out; - } - } - -more: - dout("try_write out_kvec_bytes %d\n", con->out_kvec_bytes); - BUG_ON(!con->sock); - - /* kvec data queued? */ - if (con->out_kvec_left) { - ret = write_partial_kvec(con); - if (ret <= 0) - goto out; - } - if (con->out_skip) { - ret = write_partial_skip(con); - if (ret <= 0) - goto out; - } - - /* msg pages? 
*/ - if (con->out_msg) { - if (con->out_msg_done) { - ceph_msg_put(con->out_msg); - con->out_msg = NULL; /* we're done with this one */ - goto do_next; - } - - ret = write_partial_message_data(con); - if (ret == 1) - goto more; /* we need to send the footer, too! */ - if (ret == 0) - goto out; - if (ret < 0) { - dout("try_write write_partial_message_data err %d\n", - ret); - goto out; - } - } - -do_next: - if (con->state == CON_STATE_OPEN) { - if (con_flag_test_and_clear(con, CON_FLAG_KEEPALIVE_PENDING)) { - prepare_write_keepalive(con); - goto more; - } - /* is anything else pending? */ - if (!list_empty(&con->out_queue)) { - prepare_write_message(con); - goto more; - } - if (con->in_seq > con->in_seq_acked) { - prepare_write_ack(con); - goto more; - } - } - - /* Nothing to do! */ - con_flag_clear(con, CON_FLAG_WRITE_PENDING); - dout("try_write nothing else to write.\n"); - ret = 0; -out: - dout("try_write done on %p ret %d\n", con, ret); - return ret; -} - -/* - * Read what we can from the socket. - */ -static int try_read(struct ceph_connection *con) -{ - int ret = -1; - -more: - dout("try_read start on %p state %lu\n", con, con->state); - if (con->state != CON_STATE_CONNECTING && - con->state != CON_STATE_NEGOTIATING && - con->state != CON_STATE_OPEN) - return 0; - - BUG_ON(!con->sock); - - dout("try_read tag %d in_base_pos %d\n", (int)con->in_tag, - con->in_base_pos); - - if (con->state == CON_STATE_CONNECTING) { - dout("try_read connecting\n"); - ret = read_partial_banner(con); - if (ret <= 0) - goto out; - ret = process_banner(con); - if (ret < 0) - goto out; - - con->state = CON_STATE_NEGOTIATING; - - /* - * Received banner is good, exchange connection info. - * Do not reset out_kvec, as sending our banner raced - * with receiving peer banner after connect completed. - */ - ret = prepare_write_connect(con); - if (ret < 0) - goto out; - prepare_read_connect(con); - - /* Send connection info before awaiting response */ - goto out; - } - - if (con->state == CON_STATE_NEGOTIATING) { - dout("try_read negotiating\n"); - ret = read_partial_connect(con); - if (ret <= 0) - goto out; - ret = process_connect(con); - if (ret < 0) - goto out; - goto more; - } - - WARN_ON(con->state != CON_STATE_OPEN); - - if (con->in_base_pos < 0) { - /* - * skipping + discarding content. - */ - ret = ceph_tcp_recvmsg(con->sock, NULL, -con->in_base_pos); - if (ret <= 0) - goto out; - dout("skipped %d / %d bytes\n", ret, -con->in_base_pos); - con->in_base_pos += ret; - if (con->in_base_pos) - goto more; - } - if (con->in_tag == CEPH_MSGR_TAG_READY) { - /* - * what's next? 
- */ - ret = ceph_tcp_recvmsg(con->sock, &con->in_tag, 1); - if (ret <= 0) - goto out; - dout("try_read got tag %d\n", (int)con->in_tag); - switch (con->in_tag) { - case CEPH_MSGR_TAG_MSG: - prepare_read_message(con); - break; - case CEPH_MSGR_TAG_ACK: - prepare_read_ack(con); - break; - case CEPH_MSGR_TAG_KEEPALIVE2_ACK: - prepare_read_keepalive_ack(con); - break; - case CEPH_MSGR_TAG_CLOSE: - con_close_socket(con); - con->state = CON_STATE_CLOSED; - goto out; - default: - goto bad_tag; - } - } - if (con->in_tag == CEPH_MSGR_TAG_MSG) { - ret = read_partial_message(con); - if (ret <= 0) { - switch (ret) { - case -EBADMSG: - con->error_msg = "bad crc/signature"; - fallthrough; - case -EBADE: - ret = -EIO; - break; - case -EIO: - con->error_msg = "io error"; - break; - } - goto out; - } - if (con->in_tag == CEPH_MSGR_TAG_READY) - goto more; - process_message(con); - if (con->state == CON_STATE_OPEN) - prepare_read_tag(con); - goto more; - } - if (con->in_tag == CEPH_MSGR_TAG_ACK || - con->in_tag == CEPH_MSGR_TAG_SEQ) { - /* - * the final handshake seq exchange is semantically - * equivalent to an ACK - */ - ret = read_partial_ack(con); - if (ret <= 0) - goto out; - process_ack(con); - goto more; - } - if (con->in_tag == CEPH_MSGR_TAG_KEEPALIVE2_ACK) { - ret = read_keepalive_ack(con); - if (ret <= 0) - goto out; - goto more; - } - -out: - dout("try_read done on %p ret %d\n", con, ret); - return ret; - -bad_tag: - pr_err("try_read bad con->in_tag = %d\n", (int)con->in_tag); - con->error_msg = "protocol error, garbage tag"; - ret = -1; - goto out; -} - - /* * Atomically queue work on a connection after the specified delay. * Bump @con reference to avoid races with connection teardown. @@ -2811,6 +1391,9 @@ static int queue_con_delay(struct ceph_connection *con, unsigned long delay) return -ENOENT; } + if (delay >= HZ) + delay = round_jiffies_relative(delay); + dout("%s %p %lu\n", __func__, con, delay); if (!queue_delayed_work(ceph_msgr_wq, &con->work, delay)) { dout("%s %p - already queued\n", __func__, con); @@ -2836,27 +1419,30 @@ static void cancel_con(struct ceph_connection *con) static bool con_sock_closed(struct ceph_connection *con) { - if (!con_flag_test_and_clear(con, CON_FLAG_SOCK_CLOSED)) + if (!ceph_con_flag_test_and_clear(con, CEPH_CON_F_SOCK_CLOSED)) return false; #define CASE(x) \ - case CON_STATE_ ## x: \ + case CEPH_CON_S_ ## x: \ con->error_msg = "socket closed (con state " #x ")"; \ break; switch (con->state) { CASE(CLOSED); CASE(PREOPEN); - CASE(CONNECTING); - CASE(NEGOTIATING); + CASE(V1_BANNER); + CASE(V1_CONNECT_MSG); + CASE(V2_BANNER_PREFIX); + CASE(V2_BANNER_PAYLOAD); + CASE(V2_HELLO); + CASE(V2_AUTH); + CASE(V2_AUTH_SIGNATURE); + CASE(V2_SESSION_CONNECT); + CASE(V2_SESSION_RECONNECT); CASE(OPEN); CASE(STANDBY); default: - pr_warn("%s con %p unrecognized state %lu\n", - __func__, con, con->state); - con->error_msg = "unrecognized con state"; BUG(); - break; } #undef CASE @@ -2867,15 +1453,15 @@ static bool con_backoff(struct ceph_connection *con) { int ret; - if (!con_flag_test_and_clear(con, CON_FLAG_BACKOFF)) + if (!ceph_con_flag_test_and_clear(con, CEPH_CON_F_BACKOFF)) return false; - ret = queue_con_delay(con, round_jiffies_relative(con->delay)); + ret = queue_con_delay(con, con->delay); if (ret) { dout("%s: con %p FAILED to back off %lu\n", __func__, con, con->delay); BUG_ON(ret == -ENOENT); - con_flag_set(con, CON_FLAG_BACKOFF); + ceph_con_flag_set(con, CEPH_CON_F_BACKOFF); } return true; @@ -2891,11 +1477,11 @@ static void con_fault_finish(struct ceph_connection 
*con) * in case we faulted due to authentication, invalidate our * current tickets so that we can get new ones. */ - if (con->auth_retry) { - dout("auth_retry %d, invalidating\n", con->auth_retry); + if (con->v1.auth_retry) { + dout("auth_retry %d, invalidating\n", con->v1.auth_retry); if (con->ops->invalidate_authorizer) con->ops->invalidate_authorizer(con); - con->auth_retry = 0; + con->v1.auth_retry = 0; } if (con->ops->fault) @@ -2923,21 +1509,24 @@ static void ceph_con_workfn(struct work_struct *work) dout("%s: con %p BACKOFF\n", __func__, con); break; } - if (con->state == CON_STATE_STANDBY) { + if (con->state == CEPH_CON_S_STANDBY) { dout("%s: con %p STANDBY\n", __func__, con); break; } - if (con->state == CON_STATE_CLOSED) { + if (con->state == CEPH_CON_S_CLOSED) { dout("%s: con %p CLOSED\n", __func__, con); BUG_ON(con->sock); break; } - if (con->state == CON_STATE_PREOPEN) { + if (con->state == CEPH_CON_S_PREOPEN) { dout("%s: con %p PREOPEN\n", __func__, con); BUG_ON(con->sock); } - ret = try_read(con); + if (ceph_msgr2(from_msgr(con->msgr))) + ret = ceph_con_v2_try_read(con); + else + ret = ceph_con_v1_try_read(con); if (ret < 0) { if (ret == -EAGAIN) continue; @@ -2947,7 +1536,10 @@ static void ceph_con_workfn(struct work_struct *work) break; } - ret = try_write(con); + if (ceph_msgr2(from_msgr(con->msgr))) + ret = ceph_con_v2_try_write(con); + else + ret = ceph_con_v1_try_write(con); if (ret < 0) { if (ret == -EAGAIN) continue; @@ -2974,64 +1566,54 @@ static void ceph_con_workfn(struct work_struct *work) */ static void con_fault(struct ceph_connection *con) { - dout("fault %p state %lu to peer %s\n", + dout("fault %p state %d to peer %s\n", con, con->state, ceph_pr_addr(&con->peer_addr)); pr_warn("%s%lld %s %s\n", ENTITY_NAME(con->peer_name), ceph_pr_addr(&con->peer_addr), con->error_msg); con->error_msg = NULL; - WARN_ON(con->state != CON_STATE_CONNECTING && - con->state != CON_STATE_NEGOTIATING && - con->state != CON_STATE_OPEN); + WARN_ON(con->state == CEPH_CON_S_STANDBY || + con->state == CEPH_CON_S_CLOSED); - con_close_socket(con); + ceph_con_reset_protocol(con); - if (con_flag_test(con, CON_FLAG_LOSSYTX)) { + if (ceph_con_flag_test(con, CEPH_CON_F_LOSSYTX)) { dout("fault on LOSSYTX channel, marking CLOSED\n"); - con->state = CON_STATE_CLOSED; + con->state = CEPH_CON_S_CLOSED; return; } - if (con->in_msg) { - BUG_ON(con->in_msg->con != con); - ceph_msg_put(con->in_msg); - con->in_msg = NULL; - } - if (con->out_msg) { - BUG_ON(con->out_msg->con != con); - ceph_msg_put(con->out_msg); - con->out_msg = NULL; - } - /* Requeue anything that hasn't been acked */ list_splice_init(&con->out_sent, &con->out_queue); /* If there are no messages queued or keepalive pending, place * the connection in a STANDBY state */ if (list_empty(&con->out_queue) && - !con_flag_test(con, CON_FLAG_KEEPALIVE_PENDING)) { + !ceph_con_flag_test(con, CEPH_CON_F_KEEPALIVE_PENDING)) { dout("fault %p setting STANDBY clearing WRITE_PENDING\n", con); - con_flag_clear(con, CON_FLAG_WRITE_PENDING); - con->state = CON_STATE_STANDBY; + ceph_con_flag_clear(con, CEPH_CON_F_WRITE_PENDING); + con->state = CEPH_CON_S_STANDBY; } else { /* retry after a delay. 
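 *
 * The delay is a capped exponential backoff: BASE_DELAY_INTERVAL
 * on the first fault, doubled on each subsequent fault, clamped to
 * MAX_DELAY_INTERVAL.  CEPH_CON_F_BACKOFF makes the worker requeue
 * itself via queue_con_delay() instead of retrying immediately.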
*/ - con->state = CON_STATE_PREOPEN; - if (con->delay == 0) + con->state = CEPH_CON_S_PREOPEN; + if (!con->delay) { con->delay = BASE_DELAY_INTERVAL; - else if (con->delay < MAX_DELAY_INTERVAL) + } else if (con->delay < MAX_DELAY_INTERVAL) { con->delay *= 2; - con_flag_set(con, CON_FLAG_BACKOFF); + if (con->delay > MAX_DELAY_INTERVAL) + con->delay = MAX_DELAY_INTERVAL; + } + ceph_con_flag_set(con, CEPH_CON_F_BACKOFF); queue_con(con); } } - void ceph_messenger_reset_nonce(struct ceph_messenger *msgr) { u32 nonce = le32_to_cpu(msgr->inst.addr.nonce) + 1000000; msgr->inst.addr.nonce = cpu_to_le32(nonce); - encode_my_addr(msgr); + ceph_encode_my_addr(msgr); } /* @@ -3042,26 +1624,35 @@ void ceph_messenger_init(struct ceph_messenger *msgr, { spin_lock_init(&msgr->global_seq_lock); - if (myaddr) - msgr->inst.addr = *myaddr; + if (myaddr) { + memcpy(&msgr->inst.addr.in_addr, &myaddr->in_addr, + sizeof(msgr->inst.addr.in_addr)); + ceph_addr_set_port(&msgr->inst.addr, 0); + } - /* select a random nonce */ - msgr->inst.addr.type = 0; - get_random_bytes(&msgr->inst.addr.nonce, sizeof(msgr->inst.addr.nonce)); - encode_my_addr(msgr); + /* + * Since nautilus, clients are identified using type ANY. + * For msgr1, ceph_encode_banner_addr() munges it to NONE. + */ + msgr->inst.addr.type = CEPH_ENTITY_ADDR_TYPE_ANY; + + /* generate a random non-zero nonce */ + do { + get_random_bytes(&msgr->inst.addr.nonce, + sizeof(msgr->inst.addr.nonce)); + } while (!msgr->inst.addr.nonce); + ceph_encode_my_addr(msgr); atomic_set(&msgr->stopping, 0); write_pnet(&msgr->net, get_net(current->nsproxy->net_ns)); dout("%s %p\n", __func__, msgr); } -EXPORT_SYMBOL(ceph_messenger_init); void ceph_messenger_fini(struct ceph_messenger *msgr) { put_net(read_pnet(&msgr->net)); } -EXPORT_SYMBOL(ceph_messenger_fini); static void msg_con_set(struct ceph_msg *msg, struct ceph_connection *con) { @@ -3075,17 +1666,19 @@ static void msg_con_set(struct ceph_msg *msg, struct ceph_connection *con) static void clear_standby(struct ceph_connection *con) { /* come back from STANDBY? */ - if (con->state == CON_STATE_STANDBY) { + if (con->state == CEPH_CON_S_STANDBY) { dout("clear_standby %p and ++connect_seq\n", con); - con->state = CON_STATE_PREOPEN; - con->connect_seq++; - WARN_ON(con_flag_test(con, CON_FLAG_WRITE_PENDING)); - WARN_ON(con_flag_test(con, CON_FLAG_KEEPALIVE_PENDING)); + con->state = CEPH_CON_S_PREOPEN; + con->v1.connect_seq++; + WARN_ON(ceph_con_flag_test(con, CEPH_CON_F_WRITE_PENDING)); + WARN_ON(ceph_con_flag_test(con, CEPH_CON_F_KEEPALIVE_PENDING)); } } /* * Queue up an outgoing message on the given connection. + * + * Consumes a ref on @msg. 
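+ *
+ * A typical caller pattern, as a sketch (ceph_msg_new() arguments
+ * elided):
+ *
+ *	msg = ceph_msg_new(...);
+ *	...fill in msg->front...
+ *	ceph_con_send(con, msg);   (ref now owned by the connection)
+ *
+ * A caller that needs the message afterwards must take its own ref
+ * with ceph_msg_get() before sending.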
*/ void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg) { @@ -3096,7 +1689,7 @@ void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg) mutex_lock(&con->mutex); - if (con->state == CON_STATE_CLOSED) { + if (con->state == CEPH_CON_S_CLOSED) { dout("con_send %p closed, dropping %p\n", con, msg); ceph_msg_put(msg); mutex_unlock(&con->mutex); @@ -3119,7 +1712,7 @@ void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg) /* if there wasn't anything waiting to send before, queue * new work */ - if (con_flag_test_and_set(con, CON_FLAG_WRITE_PENDING) == 0) + if (!ceph_con_flag_test_and_set(con, CEPH_CON_F_WRITE_PENDING)) queue_con(con); } EXPORT_SYMBOL(ceph_con_send); @@ -3137,36 +1730,30 @@ void ceph_msg_revoke(struct ceph_msg *msg) } mutex_lock(&con->mutex); - if (!list_empty(&msg->list_head)) { - dout("%s %p msg %p - was on queue\n", __func__, con, msg); - list_del_init(&msg->list_head); - msg->hdr.seq = 0; - - ceph_msg_put(msg); + if (list_empty(&msg->list_head)) { + WARN_ON(con->out_msg == msg); + dout("%s con %p msg %p not linked\n", __func__, con, msg); + mutex_unlock(&con->mutex); + return; } + + dout("%s con %p msg %p was linked\n", __func__, con, msg); + msg->hdr.seq = 0; + ceph_msg_remove(msg); + if (con->out_msg == msg) { - BUG_ON(con->out_skip); - /* footer */ - if (con->out_msg_done) { - con->out_skip += con_out_kvec_skip(con); - } else { - BUG_ON(!msg->data_length); - con->out_skip += sizeof_footer(con); - } - /* data, middle, front */ - if (msg->data_length) - con->out_skip += msg->cursor.total_resid; - if (msg->middle) - con->out_skip += con_out_kvec_skip(con); - con->out_skip += con_out_kvec_skip(con); - - dout("%s %p msg %p - was sending, will write %d skip %d\n", - __func__, con, msg, con->out_kvec_bytes, con->out_skip); - msg->hdr.seq = 0; + WARN_ON(con->state != CEPH_CON_S_OPEN); + dout("%s con %p msg %p was sending\n", __func__, con, msg); + if (ceph_msgr2(from_msgr(con->msgr))) + ceph_con_v2_revoke(con); + else + ceph_con_v1_revoke(con); + ceph_msg_put(con->out_msg); con->out_msg = NULL; - ceph_msg_put(msg); + } else { + dout("%s con %p msg %p not current, out_msg %p\n", __func__, + con, msg, con->out_msg); } - mutex_unlock(&con->mutex); } @@ -3184,25 +1771,17 @@ void ceph_msg_revoke_incoming(struct ceph_msg *msg) mutex_lock(&con->mutex); if (con->in_msg == msg) { - unsigned int front_len = le32_to_cpu(con->in_hdr.front_len); - unsigned int middle_len = le32_to_cpu(con->in_hdr.middle_len); - unsigned int data_len = le32_to_cpu(con->in_hdr.data_len); - - /* skip rest of message */ - dout("%s %p msg %p revoked\n", __func__, con, msg); - con->in_base_pos = con->in_base_pos - - sizeof(struct ceph_msg_header) - - front_len - - middle_len - - data_len - - sizeof(struct ceph_msg_footer); + WARN_ON(con->state != CEPH_CON_S_OPEN); + dout("%s con %p msg %p was recving\n", __func__, con, msg); + if (ceph_msgr2(from_msgr(con->msgr))) + ceph_con_v2_revoke_incoming(con); + else + ceph_con_v1_revoke_incoming(con); ceph_msg_put(con->in_msg); con->in_msg = NULL; - con->in_tag = CEPH_MSGR_TAG_READY; - con->in_seq++; } else { - dout("%s %p in_msg %p msg %p no-op\n", - __func__, con, con->in_msg, msg); + dout("%s con %p msg %p not current, in_msg %p\n", __func__, + con, msg, con->in_msg); } mutex_unlock(&con->mutex); } @@ -3215,10 +1794,10 @@ void ceph_con_keepalive(struct ceph_connection *con) dout("con_keepalive %p\n", con); mutex_lock(&con->mutex); clear_standby(con); - con_flag_set(con, CON_FLAG_KEEPALIVE_PENDING); + ceph_con_flag_set(con, 
CEPH_CON_F_KEEPALIVE_PENDING); mutex_unlock(&con->mutex); - if (con_flag_test_and_set(con, CON_FLAG_WRITE_PENDING) == 0) + if (!ceph_con_flag_test_and_set(con, CEPH_CON_F_WRITE_PENDING)) queue_con(con); } EXPORT_SYMBOL(ceph_con_keepalive); @@ -3424,9 +2003,9 @@ static int ceph_alloc_middle(struct ceph_connection *con, struct ceph_msg *msg) * On error (ENOMEM, EAGAIN, ...), * - con->in_msg == NULL */ -static int ceph_con_in_msg_alloc(struct ceph_connection *con, int *skip) +int ceph_con_in_msg_alloc(struct ceph_connection *con, + struct ceph_msg_header *hdr, int *skip) { - struct ceph_msg_header *hdr = &con->in_hdr; int middle_len = le32_to_cpu(hdr->middle_len); struct ceph_msg *msg; int ret = 0; @@ -3437,7 +2016,7 @@ static int ceph_con_in_msg_alloc(struct ceph_connection *con, int *skip) mutex_unlock(&con->mutex); msg = con->ops->alloc_msg(con, hdr, skip); mutex_lock(&con->mutex); - if (con->state != CON_STATE_OPEN) { + if (con->state != CEPH_CON_S_OPEN) { if (msg) ceph_msg_put(msg); return -EAGAIN; @@ -3458,7 +2037,7 @@ static int ceph_con_in_msg_alloc(struct ceph_connection *con, int *skip) con->error_msg = "error allocating memory for incoming message"; return -ENOMEM; } - memcpy(&con->in_msg->hdr, &con->in_hdr, sizeof(con->in_hdr)); + memcpy(&con->in_msg->hdr, hdr, sizeof(*hdr)); if (middle_len && !con->in_msg->middle) { ret = ceph_alloc_middle(con, con->in_msg); @@ -3471,6 +2050,39 @@ static int ceph_con_in_msg_alloc(struct ceph_connection *con, int *skip) return ret; } +void ceph_con_get_out_msg(struct ceph_connection *con) +{ + struct ceph_msg *msg; + + BUG_ON(list_empty(&con->out_queue)); + msg = list_first_entry(&con->out_queue, struct ceph_msg, list_head); + WARN_ON(msg->con != con); + + /* + * Put the message on "sent" list using a ref from ceph_con_send(). + * It is put when the message is acked or revoked. + */ + list_move_tail(&msg->list_head, &con->out_sent); + + /* + * Only assign outgoing seq # if we haven't sent this message + * yet. If it is requeued, resend with its original seq. + */ + if (msg->needs_out_seq) { + msg->hdr.seq = cpu_to_le64(++con->out_seq); + msg->needs_out_seq = false; + + if (con->ops->reencode_message) + con->ops->reencode_message(msg); + } + + /* + * Get a ref for out_msg. It is put when we are done sending the + * message or in case of a fault. + */ + WARN_ON(con->out_msg); + con->out_msg = ceph_msg_get(msg); +} /* * Free a generically kmalloc'd message. diff --git a/net/ceph/messenger_v1.c b/net/ceph/messenger_v1.c new file mode 100644 index 000000000000..2cb5ffdf071a --- /dev/null +++ b/net/ceph/messenger_v1.c @@ -0,0 +1,1506 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/ceph/ceph_debug.h> + +#include <linux/bvec.h> +#include <linux/crc32c.h> +#include <linux/net.h> +#include <linux/socket.h> +#include <net/sock.h> + +#include <linux/ceph/ceph_features.h> +#include <linux/ceph/decode.h> +#include <linux/ceph/libceph.h> +#include <linux/ceph/messenger.h> + +/* static tag bytes (protocol control messages) */ +static char tag_msg = CEPH_MSGR_TAG_MSG; +static char tag_ack = CEPH_MSGR_TAG_ACK; +static char tag_keepalive = CEPH_MSGR_TAG_KEEPALIVE; +static char tag_keepalive2 = CEPH_MSGR_TAG_KEEPALIVE2; + +/* + * If @buf is NULL, discard up to @len bytes.
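+ *
+ * A NULL @buf maps to MSG_TRUNC below: the kernel drops the bytes
+ * without copying them anywhere.  The read path uses this to skip
+ * data it no longer wants, e.g. the remainder of a revoked
+ * incoming message.  A sketch:
+ *
+ *	ret = ceph_tcp_recvmsg(con->sock, NULL, to_skip);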
+ */ +static int ceph_tcp_recvmsg(struct socket *sock, void *buf, size_t len) +{ + struct kvec iov = {buf, len}; + struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL }; + int r; + + if (!buf) + msg.msg_flags |= MSG_TRUNC; + + iov_iter_kvec(&msg.msg_iter, READ, &iov, 1, len); + r = sock_recvmsg(sock, &msg, msg.msg_flags); + if (r == -EAGAIN) + r = 0; + return r; +} + +static int ceph_tcp_recvpage(struct socket *sock, struct page *page, + int page_offset, size_t length) +{ + struct bio_vec bvec = { + .bv_page = page, + .bv_offset = page_offset, + .bv_len = length + }; + struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL }; + int r; + + BUG_ON(page_offset + length > PAGE_SIZE); + iov_iter_bvec(&msg.msg_iter, READ, &bvec, 1, length); + r = sock_recvmsg(sock, &msg, msg.msg_flags); + if (r == -EAGAIN) + r = 0; + return r; +} + +/* + * write something. @more is true if caller will be sending more data + * shortly. + */ +static int ceph_tcp_sendmsg(struct socket *sock, struct kvec *iov, + size_t kvlen, size_t len, bool more) +{ + struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL }; + int r; + + if (more) + msg.msg_flags |= MSG_MORE; + else + msg.msg_flags |= MSG_EOR; /* superfluous, but what the hell */ + + r = kernel_sendmsg(sock, &msg, iov, kvlen, len); + if (r == -EAGAIN) + r = 0; + return r; +} + +/* + * @more: either or both of MSG_MORE and MSG_SENDPAGE_NOTLAST + */ +static int ceph_tcp_sendpage(struct socket *sock, struct page *page, + int offset, size_t size, int more) +{ + ssize_t (*sendpage)(struct socket *sock, struct page *page, + int offset, size_t size, int flags); + int flags = MSG_DONTWAIT | MSG_NOSIGNAL | more; + int ret; + + /* + * sendpage cannot properly handle pages with page_count == 0, + * we need to fall back to sendmsg if that's the case. + * + * Same goes for slab pages: skb_can_coalesce() allows + * coalescing neighboring slab objects into a single frag which + * triggers one of hardened usercopy checks. + */ + if (sendpage_ok(page)) + sendpage = sock->ops->sendpage; + else + sendpage = sock_no_sendpage; + + ret = sendpage(sock, page, offset, size, flags); + if (ret == -EAGAIN) + ret = 0; + + return ret; +} + +static void con_out_kvec_reset(struct ceph_connection *con) +{ + BUG_ON(con->v1.out_skip); + + con->v1.out_kvec_left = 0; + con->v1.out_kvec_bytes = 0; + con->v1.out_kvec_cur = &con->v1.out_kvec[0]; +} + +static void con_out_kvec_add(struct ceph_connection *con, + size_t size, void *data) +{ + int index = con->v1.out_kvec_left; + + BUG_ON(con->v1.out_skip); + BUG_ON(index >= ARRAY_SIZE(con->v1.out_kvec)); + + con->v1.out_kvec[index].iov_len = size; + con->v1.out_kvec[index].iov_base = data; + con->v1.out_kvec_left++; + con->v1.out_kvec_bytes += size; +} + +/* + * Chop off a kvec from the end. Return residual number of bytes for + * that kvec, i.e. how many bytes would have been written if the kvec + * hadn't been nuked. + */ +static int con_out_kvec_skip(struct ceph_connection *con) +{ + int skip = 0; + + if (con->v1.out_kvec_bytes > 0) { + skip = con->v1.out_kvec_cur[con->v1.out_kvec_left - 1].iov_len; + BUG_ON(con->v1.out_kvec_bytes < skip); + BUG_ON(!con->v1.out_kvec_left); + con->v1.out_kvec_bytes -= skip; + con->v1.out_kvec_left--; + } + + return skip; +} + +static size_t sizeof_footer(struct ceph_connection *con) +{ + return (con->peer_features & CEPH_FEATURE_MSG_AUTH) ? 
+ sizeof(struct ceph_msg_footer) : + sizeof(struct ceph_msg_footer_old); +} + +static void prepare_message_data(struct ceph_msg *msg, u32 data_len) +{ + /* Initialize data cursor */ + + ceph_msg_data_cursor_init(&msg->cursor, msg, data_len); +} + +/* + * Prepare footer for currently outgoing message, and finish things + * off. Assumes out_kvec* are already valid.. we just add on to the end. + */ +static void prepare_write_message_footer(struct ceph_connection *con) +{ + struct ceph_msg *m = con->out_msg; + + m->footer.flags |= CEPH_MSG_FOOTER_COMPLETE; + + dout("prepare_write_message_footer %p\n", con); + con_out_kvec_add(con, sizeof_footer(con), &m->footer); + if (con->peer_features & CEPH_FEATURE_MSG_AUTH) { + if (con->ops->sign_message) + con->ops->sign_message(m); + else + m->footer.sig = 0; + } else { + m->old_footer.flags = m->footer.flags; + } + con->v1.out_more = m->more_to_follow; + con->v1.out_msg_done = true; +} + +/* + * Prepare headers for the next outgoing message. + */ +static void prepare_write_message(struct ceph_connection *con) +{ + struct ceph_msg *m; + u32 crc; + + con_out_kvec_reset(con); + con->v1.out_msg_done = false; + + /* Sneak an ack in there first? If we can get it into the same + * TCP packet that's a good thing. */ + if (con->in_seq > con->in_seq_acked) { + con->in_seq_acked = con->in_seq; + con_out_kvec_add(con, sizeof (tag_ack), &tag_ack); + con->v1.out_temp_ack = cpu_to_le64(con->in_seq_acked); + con_out_kvec_add(con, sizeof(con->v1.out_temp_ack), + &con->v1.out_temp_ack); + } + + ceph_con_get_out_msg(con); + m = con->out_msg; + + dout("prepare_write_message %p seq %lld type %d len %d+%d+%zd\n", + m, con->out_seq, le16_to_cpu(m->hdr.type), + le32_to_cpu(m->hdr.front_len), le32_to_cpu(m->hdr.middle_len), + m->data_length); + WARN_ON(m->front.iov_len != le32_to_cpu(m->hdr.front_len)); + WARN_ON(m->data_length != le32_to_cpu(m->hdr.data_len)); + + /* tag + hdr + front + middle */ + con_out_kvec_add(con, sizeof (tag_msg), &tag_msg); + con_out_kvec_add(con, sizeof(con->v1.out_hdr), &con->v1.out_hdr); + con_out_kvec_add(con, m->front.iov_len, m->front.iov_base); + + if (m->middle) + con_out_kvec_add(con, m->middle->vec.iov_len, + m->middle->vec.iov_base); + + /* fill in hdr crc and finalize hdr */ + crc = crc32c(0, &m->hdr, offsetof(struct ceph_msg_header, crc)); + con->out_msg->hdr.crc = cpu_to_le32(crc); + memcpy(&con->v1.out_hdr, &con->out_msg->hdr, sizeof(con->v1.out_hdr)); + + /* fill in front and middle crc, footer */ + crc = crc32c(0, m->front.iov_base, m->front.iov_len); + con->out_msg->footer.front_crc = cpu_to_le32(crc); + if (m->middle) { + crc = crc32c(0, m->middle->vec.iov_base, + m->middle->vec.iov_len); + con->out_msg->footer.middle_crc = cpu_to_le32(crc); + } else + con->out_msg->footer.middle_crc = 0; + dout("%s front_crc %u middle_crc %u\n", __func__, + le32_to_cpu(con->out_msg->footer.front_crc), + le32_to_cpu(con->out_msg->footer.middle_crc)); + con->out_msg->footer.flags = 0; + + /* is there a data payload? */ + con->out_msg->footer.data_crc = 0; + if (m->data_length) { + prepare_message_data(con->out_msg, m->data_length); + con->v1.out_more = 1; /* data + footer will follow */ + } else { + /* no, queue up footer too and be done */ + prepare_write_message_footer(con); + } + + ceph_con_flag_set(con, CEPH_CON_F_WRITE_PENDING); +} + +/* + * Prepare an ack. 
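+ *
+ * On the wire this is just the tag byte followed by the highest
+ * message seq received so far:
+ *
+ *	[CEPH_MSGR_TAG_ACK][__le64 in_seq_acked]
+ *
+ * The peer uses it to drop acked entries from its out_sent list
+ * (see ceph_con_discard_sent()).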
+ */ +static void prepare_write_ack(struct ceph_connection *con) +{ + dout("prepare_write_ack %p %llu -> %llu\n", con, + con->in_seq_acked, con->in_seq); + con->in_seq_acked = con->in_seq; + + con_out_kvec_reset(con); + + con_out_kvec_add(con, sizeof (tag_ack), &tag_ack); + + con->v1.out_temp_ack = cpu_to_le64(con->in_seq_acked); + con_out_kvec_add(con, sizeof(con->v1.out_temp_ack), + &con->v1.out_temp_ack); + + con->v1.out_more = 1; /* more will follow.. eventually.. */ + ceph_con_flag_set(con, CEPH_CON_F_WRITE_PENDING); +} + +/* + * Prepare to share the seq during handshake + */ +static void prepare_write_seq(struct ceph_connection *con) +{ + dout("prepare_write_seq %p %llu -> %llu\n", con, + con->in_seq_acked, con->in_seq); + con->in_seq_acked = con->in_seq; + + con_out_kvec_reset(con); + + con->v1.out_temp_ack = cpu_to_le64(con->in_seq_acked); + con_out_kvec_add(con, sizeof(con->v1.out_temp_ack), + &con->v1.out_temp_ack); + + ceph_con_flag_set(con, CEPH_CON_F_WRITE_PENDING); +} + +/* + * Prepare to write keepalive byte. + */ +static void prepare_write_keepalive(struct ceph_connection *con) +{ + dout("prepare_write_keepalive %p\n", con); + con_out_kvec_reset(con); + if (con->peer_features & CEPH_FEATURE_MSGR_KEEPALIVE2) { + struct timespec64 now; + + ktime_get_real_ts64(&now); + con_out_kvec_add(con, sizeof(tag_keepalive2), &tag_keepalive2); + ceph_encode_timespec64(&con->v1.out_temp_keepalive2, &now); + con_out_kvec_add(con, sizeof(con->v1.out_temp_keepalive2), + &con->v1.out_temp_keepalive2); + } else { + con_out_kvec_add(con, sizeof(tag_keepalive), &tag_keepalive); + } + ceph_con_flag_set(con, CEPH_CON_F_WRITE_PENDING); +} + +/* + * Connection negotiation. + */ + +static int get_connect_authorizer(struct ceph_connection *con) +{ + struct ceph_auth_handshake *auth; + int auth_proto; + + if (!con->ops->get_authorizer) { + con->v1.auth = NULL; + con->v1.out_connect.authorizer_protocol = CEPH_AUTH_UNKNOWN; + con->v1.out_connect.authorizer_len = 0; + return 0; + } + + auth = con->ops->get_authorizer(con, &auth_proto, con->v1.auth_retry); + if (IS_ERR(auth)) + return PTR_ERR(auth); + + con->v1.auth = auth; + con->v1.out_connect.authorizer_protocol = cpu_to_le32(auth_proto); + con->v1.out_connect.authorizer_len = + cpu_to_le32(auth->authorizer_buf_len); + return 0; +} + +/* + * We connected to a peer and are saying hello. 
+ */ +static void prepare_write_banner(struct ceph_connection *con) +{ + con_out_kvec_add(con, strlen(CEPH_BANNER), CEPH_BANNER); + con_out_kvec_add(con, sizeof (con->msgr->my_enc_addr), + &con->msgr->my_enc_addr); + + con->v1.out_more = 0; + ceph_con_flag_set(con, CEPH_CON_F_WRITE_PENDING); +} + +static void __prepare_write_connect(struct ceph_connection *con) +{ + con_out_kvec_add(con, sizeof(con->v1.out_connect), + &con->v1.out_connect); + if (con->v1.auth) + con_out_kvec_add(con, con->v1.auth->authorizer_buf_len, + con->v1.auth->authorizer_buf); + + con->v1.out_more = 0; + ceph_con_flag_set(con, CEPH_CON_F_WRITE_PENDING); +} + +static int prepare_write_connect(struct ceph_connection *con) +{ + unsigned int global_seq = ceph_get_global_seq(con->msgr, 0); + int proto; + int ret; + + switch (con->peer_name.type) { + case CEPH_ENTITY_TYPE_MON: + proto = CEPH_MONC_PROTOCOL; + break; + case CEPH_ENTITY_TYPE_OSD: + proto = CEPH_OSDC_PROTOCOL; + break; + case CEPH_ENTITY_TYPE_MDS: + proto = CEPH_MDSC_PROTOCOL; + break; + default: + BUG(); + } + + dout("prepare_write_connect %p cseq=%d gseq=%d proto=%d\n", con, + con->v1.connect_seq, global_seq, proto); + + con->v1.out_connect.features = + cpu_to_le64(from_msgr(con->msgr)->supported_features); + con->v1.out_connect.host_type = cpu_to_le32(CEPH_ENTITY_TYPE_CLIENT); + con->v1.out_connect.connect_seq = cpu_to_le32(con->v1.connect_seq); + con->v1.out_connect.global_seq = cpu_to_le32(global_seq); + con->v1.out_connect.protocol_version = cpu_to_le32(proto); + con->v1.out_connect.flags = 0; + + ret = get_connect_authorizer(con); + if (ret) + return ret; + + __prepare_write_connect(con); + return 0; +} + +/* + * write as much of pending kvecs to the socket as we can. + * 1 -> done + * 0 -> socket full, but more to do + * <0 -> error + */ +static int write_partial_kvec(struct ceph_connection *con) +{ + int ret; + + dout("write_partial_kvec %p %d left\n", con, con->v1.out_kvec_bytes); + while (con->v1.out_kvec_bytes > 0) { + ret = ceph_tcp_sendmsg(con->sock, con->v1.out_kvec_cur, + con->v1.out_kvec_left, + con->v1.out_kvec_bytes, + con->v1.out_more); + if (ret <= 0) + goto out; + con->v1.out_kvec_bytes -= ret; + if (!con->v1.out_kvec_bytes) + break; /* done */ + + /* account for full iov entries consumed */ + while (ret >= con->v1.out_kvec_cur->iov_len) { + BUG_ON(!con->v1.out_kvec_left); + ret -= con->v1.out_kvec_cur->iov_len; + con->v1.out_kvec_cur++; + con->v1.out_kvec_left--; + } + /* and for a partially-consumed entry */ + if (ret) { + con->v1.out_kvec_cur->iov_len -= ret; + con->v1.out_kvec_cur->iov_base += ret; + } + } + con->v1.out_kvec_left = 0; + ret = 1; +out: + dout("write_partial_kvec %p %d left in %d kvecs ret = %d\n", con, + con->v1.out_kvec_bytes, con->v1.out_kvec_left, ret); + return ret; /* done! */ +} + +/* + * Write as much message data payload as we can. If we finish, queue + * up the footer. + * 1 -> done, footer is now queued in out_kvec[]. + * 0 -> socket full, but more to do + * <0 -> error + */ +static int write_partial_message_data(struct ceph_connection *con) +{ + struct ceph_msg *msg = con->out_msg; + struct ceph_msg_data_cursor *cursor = &msg->cursor; + bool do_datacrc = !ceph_test_opt(from_msgr(con->msgr), NOCRC); + int more = MSG_MORE | MSG_SENDPAGE_NOTLAST; + u32 crc; + + dout("%s %p msg %p\n", __func__, con, msg); + + if (!msg->num_data_items) + return -EINVAL; + + /* + * Iterate through each page that contains data to be + * written, and send as much as possible for each. 
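After a short send, write_partial_kvec() must skip the entries that went out in full and trim the one that was cut short. A standalone sketch of that bookkeeping over a plain iovec array; the helper name and calling convention are hypothetical:

#include <stddef.h>
#include <sys/uio.h>

static void iov_consume(struct iovec **cur, int *left, size_t sent)
{
    /* whole entries first */
    while (sent && sent >= (*cur)->iov_len) {
        sent -= (*cur)->iov_len;
        (*cur)++;
        (*left)--;
    }
    /* then the partially-written entry, if any */
    if (sent) {
        (*cur)->iov_base = (char *)(*cur)->iov_base + sent;
        (*cur)->iov_len -= sent;
    }
}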
+ * + * If we are calculating the data crc (the default), we will + * need to map the page. If we have no pages, they have + * been revoked, so use the zero page. + */ + crc = do_datacrc ? le32_to_cpu(msg->footer.data_crc) : 0; + while (cursor->total_resid) { + struct page *page; + size_t page_offset; + size_t length; + int ret; + + if (!cursor->resid) { + ceph_msg_data_advance(cursor, 0); + continue; + } + + page = ceph_msg_data_next(cursor, &page_offset, &length, NULL); + if (length == cursor->total_resid) + more = MSG_MORE; + ret = ceph_tcp_sendpage(con->sock, page, page_offset, length, + more); + if (ret <= 0) { + if (do_datacrc) + msg->footer.data_crc = cpu_to_le32(crc); + + return ret; + } + if (do_datacrc && cursor->need_crc) + crc = ceph_crc32c_page(crc, page, page_offset, length); + ceph_msg_data_advance(cursor, (size_t)ret); + } + + dout("%s %p msg %p done\n", __func__, con, msg); + + /* prepare and queue up footer, too */ + if (do_datacrc) + msg->footer.data_crc = cpu_to_le32(crc); + else + msg->footer.flags |= CEPH_MSG_FOOTER_NOCRC; + con_out_kvec_reset(con); + prepare_write_message_footer(con); + + return 1; /* must return > 0 to indicate success */ +} + +/* + * write some zeros + */ +static int write_partial_skip(struct ceph_connection *con) +{ + int more = MSG_MORE | MSG_SENDPAGE_NOTLAST; + int ret; + + dout("%s %p %d left\n", __func__, con, con->v1.out_skip); + while (con->v1.out_skip > 0) { + size_t size = min(con->v1.out_skip, (int)PAGE_SIZE); + + if (size == con->v1.out_skip) + more = MSG_MORE; + ret = ceph_tcp_sendpage(con->sock, ceph_zero_page, 0, size, + more); + if (ret <= 0) + goto out; + con->v1.out_skip -= ret; + } + ret = 1; +out: + return ret; +} + +/* + * Prepare to read connection handshake, or an ack. + */ +static void prepare_read_banner(struct ceph_connection *con) +{ + dout("prepare_read_banner %p\n", con); + con->v1.in_base_pos = 0; +} + +static void prepare_read_connect(struct ceph_connection *con) +{ + dout("prepare_read_connect %p\n", con); + con->v1.in_base_pos = 0; +} + +static void prepare_read_ack(struct ceph_connection *con) +{ + dout("prepare_read_ack %p\n", con); + con->v1.in_base_pos = 0; +} + +static void prepare_read_seq(struct ceph_connection *con) +{ + dout("prepare_read_seq %p\n", con); + con->v1.in_base_pos = 0; + con->v1.in_tag = CEPH_MSGR_TAG_SEQ; +} + +static void prepare_read_tag(struct ceph_connection *con) +{ + dout("prepare_read_tag %p\n", con); + con->v1.in_base_pos = 0; + con->v1.in_tag = CEPH_MSGR_TAG_READY; +} + +static void prepare_read_keepalive_ack(struct ceph_connection *con) +{ + dout("prepare_read_keepalive_ack %p\n", con); + con->v1.in_base_pos = 0; +} + +/* + * Prepare to read a message. 
+ */ +static int prepare_read_message(struct ceph_connection *con) +{ + dout("prepare_read_message %p\n", con); + BUG_ON(con->in_msg != NULL); + con->v1.in_base_pos = 0; + con->in_front_crc = con->in_middle_crc = con->in_data_crc = 0; + return 0; +} + +static int read_partial(struct ceph_connection *con, + int end, int size, void *object) +{ + while (con->v1.in_base_pos < end) { + int left = end - con->v1.in_base_pos; + int have = size - left; + int ret = ceph_tcp_recvmsg(con->sock, object + have, left); + if (ret <= 0) + return ret; + con->v1.in_base_pos += ret; + } + return 1; +} + +/* + * Read all or part of the connect-side handshake on a new connection + */ +static int read_partial_banner(struct ceph_connection *con) +{ + int size; + int end; + int ret; + + dout("read_partial_banner %p at %d\n", con, con->v1.in_base_pos); + + /* peer's banner */ + size = strlen(CEPH_BANNER); + end = size; + ret = read_partial(con, end, size, con->v1.in_banner); + if (ret <= 0) + goto out; + + size = sizeof(con->v1.actual_peer_addr); + end += size; + ret = read_partial(con, end, size, &con->v1.actual_peer_addr); + if (ret <= 0) + goto out; + ceph_decode_banner_addr(&con->v1.actual_peer_addr); + + size = sizeof(con->v1.peer_addr_for_me); + end += size; + ret = read_partial(con, end, size, &con->v1.peer_addr_for_me); + if (ret <= 0) + goto out; + ceph_decode_banner_addr(&con->v1.peer_addr_for_me); + +out: + return ret; +} + +static int read_partial_connect(struct ceph_connection *con) +{ + int size; + int end; + int ret; + + dout("read_partial_connect %p at %d\n", con, con->v1.in_base_pos); + + size = sizeof(con->v1.in_reply); + end = size; + ret = read_partial(con, end, size, &con->v1.in_reply); + if (ret <= 0) + goto out; + + if (con->v1.auth) { + size = le32_to_cpu(con->v1.in_reply.authorizer_len); + if (size > con->v1.auth->authorizer_reply_buf_len) { + pr_err("authorizer reply too big: %d > %zu\n", size, + con->v1.auth->authorizer_reply_buf_len); + ret = -EINVAL; + goto out; + } + + end += size; + ret = read_partial(con, end, size, + con->v1.auth->authorizer_reply_buf); + if (ret <= 0) + goto out; + } + + dout("read_partial_connect %p tag %d, con_seq = %u, g_seq = %u\n", + con, con->v1.in_reply.tag, + le32_to_cpu(con->v1.in_reply.connect_seq), + le32_to_cpu(con->v1.in_reply.global_seq)); +out: + return ret; +} + +/* + * Verify the hello banner looks okay. + */ +static int verify_hello(struct ceph_connection *con) +{ + if (memcmp(con->v1.in_banner, CEPH_BANNER, strlen(CEPH_BANNER))) { + pr_err("connect to %s got bad banner\n", + ceph_pr_addr(&con->peer_addr)); + con->error_msg = "protocol error, bad banner"; + return -1; + } + return 0; +} + +static int process_banner(struct ceph_connection *con) +{ + struct ceph_entity_addr *my_addr = &con->msgr->inst.addr; + + dout("process_banner on %p\n", con); + + if (verify_hello(con) < 0) + return -1; + + /* + * Make sure the other end is who we wanted. note that the other + * end may not yet know their ip address, so if it's 0.0.0.0, give + * them the benefit of the doubt. 
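read_partial() keys everything off one stream position: end is where this object finishes in the overall byte stream and size is the object's size, so size - left is how much of the object has already arrived. That makes the read resumable after a 0 ("socket empty") return. A sketch under the same conventions, with a hypothetical read callback:

typedef int (*read_fn)(void *buf, int len);  /* >0 read, 0 wait, <0 error */

static int read_partial_demo(int *pos, int end, int size, void *obj,
                             read_fn rd)
{
    while (*pos < end) {
        int left = end - *pos;          /* bytes of obj still missing */
        int have = size - left;         /* offset already filled */
        int ret = rd((char *)obj + have, left);

        if (ret <= 0)
            return ret;                 /* 0: retry later, <0: error */
        *pos += ret;
    }
    return 1;                           /* object complete */
}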
+ */
+ if (memcmp(&con->peer_addr, &con->v1.actual_peer_addr,
+ sizeof(con->peer_addr)) != 0 &&
+ !(ceph_addr_is_blank(&con->v1.actual_peer_addr) &&
+ con->v1.actual_peer_addr.nonce == con->peer_addr.nonce)) {
+ pr_warn("wrong peer, want %s/%u, got %s/%u\n",
+ ceph_pr_addr(&con->peer_addr),
+ le32_to_cpu(con->peer_addr.nonce),
+ ceph_pr_addr(&con->v1.actual_peer_addr),
+ le32_to_cpu(con->v1.actual_peer_addr.nonce));
+ con->error_msg = "wrong peer at address";
+ return -1;
+ }
+
+ /*
+ * did we learn our address?
+ */
+ if (ceph_addr_is_blank(my_addr)) {
+ memcpy(&my_addr->in_addr,
+ &con->v1.peer_addr_for_me.in_addr,
+ sizeof(con->v1.peer_addr_for_me.in_addr));
+ ceph_addr_set_port(my_addr, 0);
+ ceph_encode_my_addr(con->msgr);
+ dout("process_banner learned my addr is %s\n",
+ ceph_pr_addr(my_addr));
+ }
+
+ return 0;
+}
+
+static int process_connect(struct ceph_connection *con)
+{
+ u64 sup_feat = from_msgr(con->msgr)->supported_features;
+ u64 req_feat = from_msgr(con->msgr)->required_features;
+ u64 server_feat = le64_to_cpu(con->v1.in_reply.features);
+ int ret;
+
+ dout("process_connect on %p tag %d\n", con, con->v1.in_tag);
+
+ if (con->v1.auth) {
+ int len = le32_to_cpu(con->v1.in_reply.authorizer_len);
+
+ /*
+ * Any connection that defines ->get_authorizer()
+ * should also define ->add_authorizer_challenge() and
+ * ->verify_authorizer_reply().
+ *
+ * See get_connect_authorizer().
+ */
+ if (con->v1.in_reply.tag ==
+ CEPH_MSGR_TAG_CHALLENGE_AUTHORIZER) {
+ ret = con->ops->add_authorizer_challenge(
+ con, con->v1.auth->authorizer_reply_buf, len);
+ if (ret < 0)
+ return ret;
+
+ con_out_kvec_reset(con);
+ __prepare_write_connect(con);
+ prepare_read_connect(con);
+ return 0;
+ }
+
+ if (len) {
+ ret = con->ops->verify_authorizer_reply(con);
+ if (ret < 0) {
+ con->error_msg = "bad authorize reply";
+ return ret;
+ }
+ }
+ }
+
+ switch (con->v1.in_reply.tag) {
+ case CEPH_MSGR_TAG_FEATURES:
+ pr_err("%s%lld %s feature set mismatch,"
+ " my %llx < server's %llx, missing %llx\n",
+ ENTITY_NAME(con->peer_name),
+ ceph_pr_addr(&con->peer_addr),
+ sup_feat, server_feat, server_feat & ~sup_feat);
+ con->error_msg = "missing required protocol features";
+ return -1;
+
+ case CEPH_MSGR_TAG_BADPROTOVER:
+ pr_err("%s%lld %s protocol version mismatch,"
+ " my %d != server's %d\n",
+ ENTITY_NAME(con->peer_name),
+ ceph_pr_addr(&con->peer_addr),
+ le32_to_cpu(con->v1.out_connect.protocol_version),
+ le32_to_cpu(con->v1.in_reply.protocol_version));
+ con->error_msg = "protocol version mismatch";
+ return -1;
+
+ case CEPH_MSGR_TAG_BADAUTHORIZER:
+ con->v1.auth_retry++;
+ dout("process_connect %p got BADAUTHORIZER attempt %d\n", con,
+ con->v1.auth_retry);
+ if (con->v1.auth_retry == 2) {
+ con->error_msg = "connect authorization failure";
+ return -1;
+ }
+ con_out_kvec_reset(con);
+ ret = prepare_write_connect(con);
+ if (ret < 0)
+ return ret;
+ prepare_read_connect(con);
+ break;
+
+ case CEPH_MSGR_TAG_RESETSESSION:
+ /*
+ * If we connected with a large connect_seq but the peer
+ * has no record of a session with us (no connection, or
+ * connect_seq == 0), they will send RESETSESSION to indicate
+ * that they must have reset their session, and may have
+ * dropped messages.
+ */ + dout("process_connect got RESET peer seq %u\n", + le32_to_cpu(con->v1.in_reply.connect_seq)); + pr_info("%s%lld %s session reset\n", + ENTITY_NAME(con->peer_name), + ceph_pr_addr(&con->peer_addr)); + ceph_con_reset_session(con); + con_out_kvec_reset(con); + ret = prepare_write_connect(con); + if (ret < 0) + return ret; + prepare_read_connect(con); + + /* Tell ceph about it. */ + mutex_unlock(&con->mutex); + if (con->ops->peer_reset) + con->ops->peer_reset(con); + mutex_lock(&con->mutex); + if (con->state != CEPH_CON_S_V1_CONNECT_MSG) + return -EAGAIN; + break; + + case CEPH_MSGR_TAG_RETRY_SESSION: + /* + * If we sent a smaller connect_seq than the peer has, try + * again with a larger value. + */ + dout("process_connect got RETRY_SESSION my seq %u, peer %u\n", + le32_to_cpu(con->v1.out_connect.connect_seq), + le32_to_cpu(con->v1.in_reply.connect_seq)); + con->v1.connect_seq = le32_to_cpu(con->v1.in_reply.connect_seq); + con_out_kvec_reset(con); + ret = prepare_write_connect(con); + if (ret < 0) + return ret; + prepare_read_connect(con); + break; + + case CEPH_MSGR_TAG_RETRY_GLOBAL: + /* + * If we sent a smaller global_seq than the peer has, try + * again with a larger value. + */ + dout("process_connect got RETRY_GLOBAL my %u peer_gseq %u\n", + con->v1.peer_global_seq, + le32_to_cpu(con->v1.in_reply.global_seq)); + ceph_get_global_seq(con->msgr, + le32_to_cpu(con->v1.in_reply.global_seq)); + con_out_kvec_reset(con); + ret = prepare_write_connect(con); + if (ret < 0) + return ret; + prepare_read_connect(con); + break; + + case CEPH_MSGR_TAG_SEQ: + case CEPH_MSGR_TAG_READY: + if (req_feat & ~server_feat) { + pr_err("%s%lld %s protocol feature mismatch," + " my required %llx > server's %llx, need %llx\n", + ENTITY_NAME(con->peer_name), + ceph_pr_addr(&con->peer_addr), + req_feat, server_feat, req_feat & ~server_feat); + con->error_msg = "missing required protocol features"; + return -1; + } + + WARN_ON(con->state != CEPH_CON_S_V1_CONNECT_MSG); + con->state = CEPH_CON_S_OPEN; + con->v1.auth_retry = 0; /* we authenticated; clear flag */ + con->v1.peer_global_seq = + le32_to_cpu(con->v1.in_reply.global_seq); + con->v1.connect_seq++; + con->peer_features = server_feat; + dout("process_connect got READY gseq %d cseq %d (%d)\n", + con->v1.peer_global_seq, + le32_to_cpu(con->v1.in_reply.connect_seq), + con->v1.connect_seq); + WARN_ON(con->v1.connect_seq != + le32_to_cpu(con->v1.in_reply.connect_seq)); + + if (con->v1.in_reply.flags & CEPH_MSG_CONNECT_LOSSY) + ceph_con_flag_set(con, CEPH_CON_F_LOSSYTX); + + con->delay = 0; /* reset backoff memory */ + + if (con->v1.in_reply.tag == CEPH_MSGR_TAG_SEQ) { + prepare_write_seq(con); + prepare_read_seq(con); + } else { + prepare_read_tag(con); + } + break; + + case CEPH_MSGR_TAG_WAIT: + /* + * If there is a connection race (we are opening + * connections to each other), one of us may just have + * to WAIT. This shouldn't happen if we are the + * client. + */ + con->error_msg = "protocol error, got WAIT as client"; + return -1; + + default: + con->error_msg = "protocol error, garbage tag during connect"; + return -1; + } + return 0; +} + +/* + * read (part of) an ack + */ +static int read_partial_ack(struct ceph_connection *con) +{ + int size = sizeof(con->v1.in_temp_ack); + int end = size; + + return read_partial(con, end, size, &con->v1.in_temp_ack); +} + +/* + * We can finally discard anything that's been acked. 
+ */ +static void process_ack(struct ceph_connection *con) +{ + u64 ack = le64_to_cpu(con->v1.in_temp_ack); + + if (con->v1.in_tag == CEPH_MSGR_TAG_ACK) + ceph_con_discard_sent(con, ack); + else + ceph_con_discard_requeued(con, ack); + + prepare_read_tag(con); +} + +static int read_partial_message_section(struct ceph_connection *con, + struct kvec *section, + unsigned int sec_len, u32 *crc) +{ + int ret, left; + + BUG_ON(!section); + + while (section->iov_len < sec_len) { + BUG_ON(section->iov_base == NULL); + left = sec_len - section->iov_len; + ret = ceph_tcp_recvmsg(con->sock, (char *)section->iov_base + + section->iov_len, left); + if (ret <= 0) + return ret; + section->iov_len += ret; + } + if (section->iov_len == sec_len) + *crc = crc32c(0, section->iov_base, section->iov_len); + + return 1; +} + +static int read_partial_msg_data(struct ceph_connection *con) +{ + struct ceph_msg *msg = con->in_msg; + struct ceph_msg_data_cursor *cursor = &msg->cursor; + bool do_datacrc = !ceph_test_opt(from_msgr(con->msgr), NOCRC); + struct page *page; + size_t page_offset; + size_t length; + u32 crc = 0; + int ret; + + if (!msg->num_data_items) + return -EIO; + + if (do_datacrc) + crc = con->in_data_crc; + while (cursor->total_resid) { + if (!cursor->resid) { + ceph_msg_data_advance(cursor, 0); + continue; + } + + page = ceph_msg_data_next(cursor, &page_offset, &length, NULL); + ret = ceph_tcp_recvpage(con->sock, page, page_offset, length); + if (ret <= 0) { + if (do_datacrc) + con->in_data_crc = crc; + + return ret; + } + + if (do_datacrc) + crc = ceph_crc32c_page(crc, page, page_offset, ret); + ceph_msg_data_advance(cursor, (size_t)ret); + } + if (do_datacrc) + con->in_data_crc = crc; + + return 1; /* must return > 0 to indicate success */ +} + +/* + * read (part of) a message. + */ +static int read_partial_message(struct ceph_connection *con) +{ + struct ceph_msg *m = con->in_msg; + int size; + int end; + int ret; + unsigned int front_len, middle_len, data_len; + bool do_datacrc = !ceph_test_opt(from_msgr(con->msgr), NOCRC); + bool need_sign = (con->peer_features & CEPH_FEATURE_MSG_AUTH); + u64 seq; + u32 crc; + + dout("read_partial_message con %p msg %p\n", con, m); + + /* header */ + size = sizeof(con->v1.in_hdr); + end = size; + ret = read_partial(con, end, size, &con->v1.in_hdr); + if (ret <= 0) + return ret; + + crc = crc32c(0, &con->v1.in_hdr, offsetof(struct ceph_msg_header, crc)); + if (cpu_to_le32(crc) != con->v1.in_hdr.crc) { + pr_err("read_partial_message bad hdr crc %u != expected %u\n", + crc, con->v1.in_hdr.crc); + return -EBADMSG; + } + + front_len = le32_to_cpu(con->v1.in_hdr.front_len); + if (front_len > CEPH_MSG_MAX_FRONT_LEN) + return -EIO; + middle_len = le32_to_cpu(con->v1.in_hdr.middle_len); + if (middle_len > CEPH_MSG_MAX_MIDDLE_LEN) + return -EIO; + data_len = le32_to_cpu(con->v1.in_hdr.data_len); + if (data_len > CEPH_MSG_MAX_DATA_LEN) + return -EIO; + + /* verify seq# */ + seq = le64_to_cpu(con->v1.in_hdr.seq); + if ((s64)seq - (s64)con->in_seq < 1) { + pr_info("skipping %s%lld %s seq %lld expected %lld\n", + ENTITY_NAME(con->peer_name), + ceph_pr_addr(&con->peer_addr), + seq, con->in_seq + 1); + con->v1.in_base_pos = -front_len - middle_len - data_len - + sizeof_footer(con); + con->v1.in_tag = CEPH_MSGR_TAG_READY; + return 1; + } else if ((s64)seq - (s64)con->in_seq > 1) { + pr_err("read_partial_message bad seq %lld expected %lld\n", + seq, con->in_seq + 1); + con->error_msg = "bad message sequence # for incoming message"; + return -EBADE; + } + + /* allocate message? 
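The seq verification above classifies an incoming message by the signed 64-bit difference from the last delivered sequence number: a difference of at most 0 is a duplicate to skip, exactly 1 is the expected next message, and anything larger means messages were lost. A compact sketch of that decision:

#include <stdint.h>

enum seq_verdict { SEQ_DUP, SEQ_OK, SEQ_GAP };

static enum seq_verdict classify_seq(uint64_t seq, uint64_t last_delivered)
{
    int64_t d = (int64_t)seq - (int64_t)last_delivered;

    if (d < 1)
        return SEQ_DUP;   /* resent after reconnect; consume and drop */
    if (d > 1)
        return SEQ_GAP;   /* hole in the stream; fail the connection */
    return SEQ_OK;        /* seq == last_delivered + 1 */
}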
*/ + if (!con->in_msg) { + int skip = 0; + + dout("got hdr type %d front %d data %d\n", con->v1.in_hdr.type, + front_len, data_len); + ret = ceph_con_in_msg_alloc(con, &con->v1.in_hdr, &skip); + if (ret < 0) + return ret; + + BUG_ON((!con->in_msg) ^ skip); + if (skip) { + /* skip this message */ + dout("alloc_msg said skip message\n"); + con->v1.in_base_pos = -front_len - middle_len - + data_len - sizeof_footer(con); + con->v1.in_tag = CEPH_MSGR_TAG_READY; + con->in_seq++; + return 1; + } + + BUG_ON(!con->in_msg); + BUG_ON(con->in_msg->con != con); + m = con->in_msg; + m->front.iov_len = 0; /* haven't read it yet */ + if (m->middle) + m->middle->vec.iov_len = 0; + + /* prepare for data payload, if any */ + + if (data_len) + prepare_message_data(con->in_msg, data_len); + } + + /* front */ + ret = read_partial_message_section(con, &m->front, front_len, + &con->in_front_crc); + if (ret <= 0) + return ret; + + /* middle */ + if (m->middle) { + ret = read_partial_message_section(con, &m->middle->vec, + middle_len, + &con->in_middle_crc); + if (ret <= 0) + return ret; + } + + /* (page) data */ + if (data_len) { + ret = read_partial_msg_data(con); + if (ret <= 0) + return ret; + } + + /* footer */ + size = sizeof_footer(con); + end += size; + ret = read_partial(con, end, size, &m->footer); + if (ret <= 0) + return ret; + + if (!need_sign) { + m->footer.flags = m->old_footer.flags; + m->footer.sig = 0; + } + + dout("read_partial_message got msg %p %d (%u) + %d (%u) + %d (%u)\n", + m, front_len, m->footer.front_crc, middle_len, + m->footer.middle_crc, data_len, m->footer.data_crc); + + /* crc ok? */ + if (con->in_front_crc != le32_to_cpu(m->footer.front_crc)) { + pr_err("read_partial_message %p front crc %u != exp. %u\n", + m, con->in_front_crc, m->footer.front_crc); + return -EBADMSG; + } + if (con->in_middle_crc != le32_to_cpu(m->footer.middle_crc)) { + pr_err("read_partial_message %p middle crc %u != exp %u\n", + m, con->in_middle_crc, m->footer.middle_crc); + return -EBADMSG; + } + if (do_datacrc && + (m->footer.flags & CEPH_MSG_FOOTER_NOCRC) == 0 && + con->in_data_crc != le32_to_cpu(m->footer.data_crc)) { + pr_err("read_partial_message %p data crc %u != exp. %u\n", m, + con->in_data_crc, le32_to_cpu(m->footer.data_crc)); + return -EBADMSG; + } + + if (need_sign && con->ops->check_message_signature && + con->ops->check_message_signature(m)) { + pr_err("read_partial_message %p signature check failed\n", m); + return -EBADMSG; + } + + return 1; /* done! */ +} + +static int read_keepalive_ack(struct ceph_connection *con) +{ + struct ceph_timespec ceph_ts; + size_t size = sizeof(ceph_ts); + int ret = read_partial(con, size, size, &ceph_ts); + if (ret <= 0) + return ret; + ceph_decode_timespec64(&con->last_keepalive_ack, &ceph_ts); + prepare_read_tag(con); + return 1; +} + +/* + * Read what we can from the socket. + */ +int ceph_con_v1_try_read(struct ceph_connection *con) +{ + int ret = -1; + +more: + dout("try_read start %p state %d\n", con, con->state); + if (con->state != CEPH_CON_S_V1_BANNER && + con->state != CEPH_CON_S_V1_CONNECT_MSG && + con->state != CEPH_CON_S_OPEN) + return 0; + + BUG_ON(!con->sock); + + dout("try_read tag %d in_base_pos %d\n", con->v1.in_tag, + con->v1.in_base_pos); + + if (con->state == CEPH_CON_S_V1_BANNER) { + ret = read_partial_banner(con); + if (ret <= 0) + goto out; + ret = process_banner(con); + if (ret < 0) + goto out; + + con->state = CEPH_CON_S_V1_CONNECT_MSG; + + /* + * Received banner is good, exchange connection info. 
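When a message is skipped, in_base_pos is set to minus the number of on-wire bytes still unread, and the try_read loop later drains that deficit (via MSG_TRUNC reads into a NULL buffer) until the position climbs back to zero. A sketch of the convention, with a hypothetical discard callback:

static int drain_skip(int *in_base_pos, int (*discard)(int len))
{
    while (*in_base_pos < 0) {
        int ret = discard(-*in_base_pos);  /* read and throw away */

        if (ret <= 0)
            return ret;                    /* 0: wait, <0: error */
        *in_base_pos += ret;               /* counts up toward 0 */
    }
    return 1;
}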
+ * Do not reset out_kvec, as sending our banner raced + * with receiving peer banner after connect completed. + */ + ret = prepare_write_connect(con); + if (ret < 0) + goto out; + prepare_read_connect(con); + + /* Send connection info before awaiting response */ + goto out; + } + + if (con->state == CEPH_CON_S_V1_CONNECT_MSG) { + ret = read_partial_connect(con); + if (ret <= 0) + goto out; + ret = process_connect(con); + if (ret < 0) + goto out; + goto more; + } + + WARN_ON(con->state != CEPH_CON_S_OPEN); + + if (con->v1.in_base_pos < 0) { + /* + * skipping + discarding content. + */ + ret = ceph_tcp_recvmsg(con->sock, NULL, -con->v1.in_base_pos); + if (ret <= 0) + goto out; + dout("skipped %d / %d bytes\n", ret, -con->v1.in_base_pos); + con->v1.in_base_pos += ret; + if (con->v1.in_base_pos) + goto more; + } + if (con->v1.in_tag == CEPH_MSGR_TAG_READY) { + /* + * what's next? + */ + ret = ceph_tcp_recvmsg(con->sock, &con->v1.in_tag, 1); + if (ret <= 0) + goto out; + dout("try_read got tag %d\n", con->v1.in_tag); + switch (con->v1.in_tag) { + case CEPH_MSGR_TAG_MSG: + prepare_read_message(con); + break; + case CEPH_MSGR_TAG_ACK: + prepare_read_ack(con); + break; + case CEPH_MSGR_TAG_KEEPALIVE2_ACK: + prepare_read_keepalive_ack(con); + break; + case CEPH_MSGR_TAG_CLOSE: + ceph_con_close_socket(con); + con->state = CEPH_CON_S_CLOSED; + goto out; + default: + goto bad_tag; + } + } + if (con->v1.in_tag == CEPH_MSGR_TAG_MSG) { + ret = read_partial_message(con); + if (ret <= 0) { + switch (ret) { + case -EBADMSG: + con->error_msg = "bad crc/signature"; + fallthrough; + case -EBADE: + ret = -EIO; + break; + case -EIO: + con->error_msg = "io error"; + break; + } + goto out; + } + if (con->v1.in_tag == CEPH_MSGR_TAG_READY) + goto more; + ceph_con_process_message(con); + if (con->state == CEPH_CON_S_OPEN) + prepare_read_tag(con); + goto more; + } + if (con->v1.in_tag == CEPH_MSGR_TAG_ACK || + con->v1.in_tag == CEPH_MSGR_TAG_SEQ) { + /* + * the final handshake seq exchange is semantically + * equivalent to an ACK + */ + ret = read_partial_ack(con); + if (ret <= 0) + goto out; + process_ack(con); + goto more; + } + if (con->v1.in_tag == CEPH_MSGR_TAG_KEEPALIVE2_ACK) { + ret = read_keepalive_ack(con); + if (ret <= 0) + goto out; + goto more; + } + +out: + dout("try_read done on %p ret %d\n", con, ret); + return ret; + +bad_tag: + pr_err("try_read bad tag %d\n", con->v1.in_tag); + con->error_msg = "protocol error, garbage tag"; + ret = -1; + goto out; +} + +/* + * Write something to the socket. Called in a worker thread when the + * socket appears to be writeable and we have something ready to send. + */ +int ceph_con_v1_try_write(struct ceph_connection *con) +{ + int ret = 1; + + dout("try_write start %p state %d\n", con, con->state); + if (con->state != CEPH_CON_S_PREOPEN && + con->state != CEPH_CON_S_V1_BANNER && + con->state != CEPH_CON_S_V1_CONNECT_MSG && + con->state != CEPH_CON_S_OPEN) + return 0; + + /* open the socket first? */ + if (con->state == CEPH_CON_S_PREOPEN) { + BUG_ON(con->sock); + con->state = CEPH_CON_S_V1_BANNER; + + con_out_kvec_reset(con); + prepare_write_banner(con); + prepare_read_banner(con); + + BUG_ON(con->in_msg); + con->v1.in_tag = CEPH_MSGR_TAG_READY; + dout("try_write initiating connect on %p new state %d\n", + con, con->state); + ret = ceph_tcp_connect(con); + if (ret < 0) { + con->error_msg = "connect error"; + goto out; + } + } + +more: + dout("try_write out_kvec_bytes %d\n", con->v1.out_kvec_bytes); + BUG_ON(!con->sock); + + /* kvec data queued? 
*/ + if (con->v1.out_kvec_left) { + ret = write_partial_kvec(con); + if (ret <= 0) + goto out; + } + if (con->v1.out_skip) { + ret = write_partial_skip(con); + if (ret <= 0) + goto out; + } + + /* msg pages? */ + if (con->out_msg) { + if (con->v1.out_msg_done) { + ceph_msg_put(con->out_msg); + con->out_msg = NULL; /* we're done with this one */ + goto do_next; + } + + ret = write_partial_message_data(con); + if (ret == 1) + goto more; /* we need to send the footer, too! */ + if (ret == 0) + goto out; + if (ret < 0) { + dout("try_write write_partial_message_data err %d\n", + ret); + goto out; + } + } + +do_next: + if (con->state == CEPH_CON_S_OPEN) { + if (ceph_con_flag_test_and_clear(con, + CEPH_CON_F_KEEPALIVE_PENDING)) { + prepare_write_keepalive(con); + goto more; + } + /* is anything else pending? */ + if (!list_empty(&con->out_queue)) { + prepare_write_message(con); + goto more; + } + if (con->in_seq > con->in_seq_acked) { + prepare_write_ack(con); + goto more; + } + } + + /* Nothing to do! */ + ceph_con_flag_clear(con, CEPH_CON_F_WRITE_PENDING); + dout("try_write nothing else to write.\n"); + ret = 0; +out: + dout("try_write done on %p ret %d\n", con, ret); + return ret; +} + +void ceph_con_v1_revoke(struct ceph_connection *con) +{ + struct ceph_msg *msg = con->out_msg; + + WARN_ON(con->v1.out_skip); + /* footer */ + if (con->v1.out_msg_done) { + con->v1.out_skip += con_out_kvec_skip(con); + } else { + WARN_ON(!msg->data_length); + con->v1.out_skip += sizeof_footer(con); + } + /* data, middle, front */ + if (msg->data_length) + con->v1.out_skip += msg->cursor.total_resid; + if (msg->middle) + con->v1.out_skip += con_out_kvec_skip(con); + con->v1.out_skip += con_out_kvec_skip(con); + + dout("%s con %p out_kvec_bytes %d out_skip %d\n", __func__, con, + con->v1.out_kvec_bytes, con->v1.out_skip); +} + +void ceph_con_v1_revoke_incoming(struct ceph_connection *con) +{ + unsigned int front_len = le32_to_cpu(con->v1.in_hdr.front_len); + unsigned int middle_len = le32_to_cpu(con->v1.in_hdr.middle_len); + unsigned int data_len = le32_to_cpu(con->v1.in_hdr.data_len); + + /* skip rest of message */ + con->v1.in_base_pos = con->v1.in_base_pos - + sizeof(struct ceph_msg_header) - + front_len - + middle_len - + data_len - + sizeof(struct ceph_msg_footer); + + con->v1.in_tag = CEPH_MSGR_TAG_READY; + con->in_seq++; + + dout("%s con %p in_base_pos %d\n", __func__, con, con->v1.in_base_pos); +} + +bool ceph_con_v1_opened(struct ceph_connection *con) +{ + return con->v1.connect_seq; +} + +void ceph_con_v1_reset_session(struct ceph_connection *con) +{ + con->v1.connect_seq = 0; + con->v1.peer_global_seq = 0; +} + +void ceph_con_v1_reset_protocol(struct ceph_connection *con) +{ + con->v1.out_skip = 0; +} diff --git a/net/ceph/messenger_v2.c b/net/ceph/messenger_v2.c new file mode 100644 index 000000000000..cc40ce4e02fb --- /dev/null +++ b/net/ceph/messenger_v2.c @@ -0,0 +1,3459 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Ceph msgr2 protocol implementation + * + * Copyright (C) 2020 Ilya Dryomov <idryomov@gmail.com> + */ + +#include <linux/ceph/ceph_debug.h> + +#include <crypto/aead.h> +#include <crypto/algapi.h> /* for crypto_memneq() */ +#include <crypto/hash.h> +#include <crypto/sha2.h> +#include <linux/bvec.h> +#include <linux/crc32c.h> +#include <linux/net.h> +#include <linux/scatterlist.h> +#include <linux/socket.h> +#include <linux/sched/mm.h> +#include <net/sock.h> +#include <net/tcp.h> + +#include <linux/ceph/ceph_features.h> +#include <linux/ceph/decode.h> +#include <linux/ceph/libceph.h> 
+#include <linux/ceph/messenger.h> + +#include "crypto.h" /* for CEPH_KEY_LEN and CEPH_MAX_CON_SECRET_LEN */ + +#define FRAME_TAG_HELLO 1 +#define FRAME_TAG_AUTH_REQUEST 2 +#define FRAME_TAG_AUTH_BAD_METHOD 3 +#define FRAME_TAG_AUTH_REPLY_MORE 4 +#define FRAME_TAG_AUTH_REQUEST_MORE 5 +#define FRAME_TAG_AUTH_DONE 6 +#define FRAME_TAG_AUTH_SIGNATURE 7 +#define FRAME_TAG_CLIENT_IDENT 8 +#define FRAME_TAG_SERVER_IDENT 9 +#define FRAME_TAG_IDENT_MISSING_FEATURES 10 +#define FRAME_TAG_SESSION_RECONNECT 11 +#define FRAME_TAG_SESSION_RESET 12 +#define FRAME_TAG_SESSION_RETRY 13 +#define FRAME_TAG_SESSION_RETRY_GLOBAL 14 +#define FRAME_TAG_SESSION_RECONNECT_OK 15 +#define FRAME_TAG_WAIT 16 +#define FRAME_TAG_MESSAGE 17 +#define FRAME_TAG_KEEPALIVE2 18 +#define FRAME_TAG_KEEPALIVE2_ACK 19 +#define FRAME_TAG_ACK 20 + +#define FRAME_LATE_STATUS_ABORTED 0x1 +#define FRAME_LATE_STATUS_COMPLETE 0xe +#define FRAME_LATE_STATUS_ABORTED_MASK 0xf + +#define IN_S_HANDLE_PREAMBLE 1 +#define IN_S_HANDLE_CONTROL 2 +#define IN_S_HANDLE_CONTROL_REMAINDER 3 +#define IN_S_PREPARE_READ_DATA 4 +#define IN_S_PREPARE_READ_DATA_CONT 5 +#define IN_S_HANDLE_EPILOGUE 6 +#define IN_S_FINISH_SKIP 7 + +#define OUT_S_QUEUE_DATA 1 +#define OUT_S_QUEUE_DATA_CONT 2 +#define OUT_S_QUEUE_ENC_PAGE 3 +#define OUT_S_QUEUE_ZEROS 4 +#define OUT_S_FINISH_MESSAGE 5 +#define OUT_S_GET_NEXT 6 + +#define CTRL_BODY(p) ((void *)(p) + CEPH_PREAMBLE_LEN) +#define FRONT_PAD(p) ((void *)(p) + CEPH_EPILOGUE_SECURE_LEN) +#define MIDDLE_PAD(p) (FRONT_PAD(p) + CEPH_GCM_BLOCK_LEN) +#define DATA_PAD(p) (MIDDLE_PAD(p) + CEPH_GCM_BLOCK_LEN) + +#define CEPH_MSG_FLAGS (MSG_DONTWAIT | MSG_NOSIGNAL) + +static int do_recvmsg(struct socket *sock, struct iov_iter *it) +{ + struct msghdr msg = { .msg_flags = CEPH_MSG_FLAGS }; + int ret; + + msg.msg_iter = *it; + while (iov_iter_count(it)) { + ret = sock_recvmsg(sock, &msg, msg.msg_flags); + if (ret <= 0) { + if (ret == -EAGAIN) + ret = 0; + return ret; + } + + iov_iter_advance(it, ret); + } + + WARN_ON(msg_data_left(&msg)); + return 1; +} + +/* + * Read as much as possible. + * + * Return: + * 1 - done, nothing (else) to read + * 0 - socket is empty, need to wait + * <0 - error + */ +static int ceph_tcp_recv(struct ceph_connection *con) +{ + int ret; + + dout("%s con %p %s %zu\n", __func__, con, + iov_iter_is_discard(&con->v2.in_iter) ? "discard" : "need", + iov_iter_count(&con->v2.in_iter)); + ret = do_recvmsg(con->sock, &con->v2.in_iter); + dout("%s con %p ret %d left %zu\n", __func__, con, ret, + iov_iter_count(&con->v2.in_iter)); + return ret; +} + +static int do_sendmsg(struct socket *sock, struct iov_iter *it) +{ + struct msghdr msg = { .msg_flags = CEPH_MSG_FLAGS }; + int ret; + + msg.msg_iter = *it; + while (iov_iter_count(it)) { + ret = sock_sendmsg(sock, &msg); + if (ret <= 0) { + if (ret == -EAGAIN) + ret = 0; + return ret; + } + + iov_iter_advance(it, ret); + } + + WARN_ON(msg_data_left(&msg)); + return 1; +} + +static int do_try_sendpage(struct socket *sock, struct iov_iter *it) +{ + struct msghdr msg = { .msg_flags = CEPH_MSG_FLAGS }; + struct bio_vec bv; + int ret; + + if (WARN_ON(!iov_iter_is_bvec(it))) + return -EINVAL; + + while (iov_iter_count(it)) { + /* iov_iter_iovec() for ITER_BVEC */ + bv.bv_page = it->bvec->bv_page; + bv.bv_offset = it->bvec->bv_offset + it->iov_offset; + bv.bv_len = min(iov_iter_count(it), + it->bvec->bv_len - it->iov_offset); + + /* + * sendpage cannot properly handle pages with + * page_count == 0, we need to fall back to sendmsg if + * that's the case. 
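do_recvmsg() and do_sendmsg() share one shape: keep issuing I/O until the iterator is drained, map -EAGAIN to "return 0 and come back later", and return 1 only once everything has moved. A standalone sketch with a simplified iterator standing in for iov_iter:

#include <errno.h>
#include <stddef.h>

struct demo_iter { const char *p; size_t left; };

/* xfer returns bytes transferred or a negative errno */
static int drain(struct demo_iter *it,
                 long (*xfer)(const char *p, size_t len))
{
    while (it->left) {
        long ret = xfer(it->p, it->left);

        if (ret <= 0)
            return ret == -EAGAIN ? 0 : (int)ret;
        it->p += ret;     /* advance past what was transferred */
        it->left -= ret;
    }
    return 1;             /* done, nothing else to transfer */
}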
+ * + * Same goes for slab pages: skb_can_coalesce() allows + * coalescing neighboring slab objects into a single frag + * which triggers one of hardened usercopy checks. + */ + if (sendpage_ok(bv.bv_page)) { + ret = sock->ops->sendpage(sock, bv.bv_page, + bv.bv_offset, bv.bv_len, + CEPH_MSG_FLAGS); + } else { + iov_iter_bvec(&msg.msg_iter, WRITE, &bv, 1, bv.bv_len); + ret = sock_sendmsg(sock, &msg); + } + if (ret <= 0) { + if (ret == -EAGAIN) + ret = 0; + return ret; + } + + iov_iter_advance(it, ret); + } + + return 1; +} + +/* + * Write as much as possible. The socket is expected to be corked, + * so we don't bother with MSG_MORE/MSG_SENDPAGE_NOTLAST here. + * + * Return: + * 1 - done, nothing (else) to write + * 0 - socket is full, need to wait + * <0 - error + */ +static int ceph_tcp_send(struct ceph_connection *con) +{ + int ret; + + dout("%s con %p have %zu try_sendpage %d\n", __func__, con, + iov_iter_count(&con->v2.out_iter), con->v2.out_iter_sendpage); + if (con->v2.out_iter_sendpage) + ret = do_try_sendpage(con->sock, &con->v2.out_iter); + else + ret = do_sendmsg(con->sock, &con->v2.out_iter); + dout("%s con %p ret %d left %zu\n", __func__, con, ret, + iov_iter_count(&con->v2.out_iter)); + return ret; +} + +static void add_in_kvec(struct ceph_connection *con, void *buf, int len) +{ + BUG_ON(con->v2.in_kvec_cnt >= ARRAY_SIZE(con->v2.in_kvecs)); + WARN_ON(!iov_iter_is_kvec(&con->v2.in_iter)); + + con->v2.in_kvecs[con->v2.in_kvec_cnt].iov_base = buf; + con->v2.in_kvecs[con->v2.in_kvec_cnt].iov_len = len; + con->v2.in_kvec_cnt++; + + con->v2.in_iter.nr_segs++; + con->v2.in_iter.count += len; +} + +static void reset_in_kvecs(struct ceph_connection *con) +{ + WARN_ON(iov_iter_count(&con->v2.in_iter)); + + con->v2.in_kvec_cnt = 0; + iov_iter_kvec(&con->v2.in_iter, READ, con->v2.in_kvecs, 0, 0); +} + +static void set_in_bvec(struct ceph_connection *con, const struct bio_vec *bv) +{ + WARN_ON(iov_iter_count(&con->v2.in_iter)); + + con->v2.in_bvec = *bv; + iov_iter_bvec(&con->v2.in_iter, READ, &con->v2.in_bvec, 1, bv->bv_len); +} + +static void set_in_skip(struct ceph_connection *con, int len) +{ + WARN_ON(iov_iter_count(&con->v2.in_iter)); + + dout("%s con %p len %d\n", __func__, con, len); + iov_iter_discard(&con->v2.in_iter, READ, len); +} + +static void add_out_kvec(struct ceph_connection *con, void *buf, int len) +{ + BUG_ON(con->v2.out_kvec_cnt >= ARRAY_SIZE(con->v2.out_kvecs)); + WARN_ON(!iov_iter_is_kvec(&con->v2.out_iter)); + WARN_ON(con->v2.out_zero); + + con->v2.out_kvecs[con->v2.out_kvec_cnt].iov_base = buf; + con->v2.out_kvecs[con->v2.out_kvec_cnt].iov_len = len; + con->v2.out_kvec_cnt++; + + con->v2.out_iter.nr_segs++; + con->v2.out_iter.count += len; +} + +static void reset_out_kvecs(struct ceph_connection *con) +{ + WARN_ON(iov_iter_count(&con->v2.out_iter)); + WARN_ON(con->v2.out_zero); + + con->v2.out_kvec_cnt = 0; + + iov_iter_kvec(&con->v2.out_iter, WRITE, con->v2.out_kvecs, 0, 0); + con->v2.out_iter_sendpage = false; +} + +static void set_out_bvec(struct ceph_connection *con, const struct bio_vec *bv, + bool zerocopy) +{ + WARN_ON(iov_iter_count(&con->v2.out_iter)); + WARN_ON(con->v2.out_zero); + + con->v2.out_bvec = *bv; + con->v2.out_iter_sendpage = zerocopy; + iov_iter_bvec(&con->v2.out_iter, WRITE, &con->v2.out_bvec, 1, + con->v2.out_bvec.bv_len); +} + +static void set_out_bvec_zero(struct ceph_connection *con) +{ + WARN_ON(iov_iter_count(&con->v2.out_iter)); + WARN_ON(!con->v2.out_zero); + + con->v2.out_bvec.bv_page = ceph_zero_page; + con->v2.out_bvec.bv_offset = 
0; + con->v2.out_bvec.bv_len = min(con->v2.out_zero, (int)PAGE_SIZE); + con->v2.out_iter_sendpage = true; + iov_iter_bvec(&con->v2.out_iter, WRITE, &con->v2.out_bvec, 1, + con->v2.out_bvec.bv_len); +} + +static void out_zero_add(struct ceph_connection *con, int len) +{ + dout("%s con %p len %d\n", __func__, con, len); + con->v2.out_zero += len; +} + +static void *alloc_conn_buf(struct ceph_connection *con, int len) +{ + void *buf; + + dout("%s con %p len %d\n", __func__, con, len); + + if (WARN_ON(con->v2.conn_buf_cnt >= ARRAY_SIZE(con->v2.conn_bufs))) + return NULL; + + buf = ceph_kvmalloc(len, GFP_NOIO); + if (!buf) + return NULL; + + con->v2.conn_bufs[con->v2.conn_buf_cnt++] = buf; + return buf; +} + +static void free_conn_bufs(struct ceph_connection *con) +{ + while (con->v2.conn_buf_cnt) + kvfree(con->v2.conn_bufs[--con->v2.conn_buf_cnt]); +} + +static void add_in_sign_kvec(struct ceph_connection *con, void *buf, int len) +{ + BUG_ON(con->v2.in_sign_kvec_cnt >= ARRAY_SIZE(con->v2.in_sign_kvecs)); + + con->v2.in_sign_kvecs[con->v2.in_sign_kvec_cnt].iov_base = buf; + con->v2.in_sign_kvecs[con->v2.in_sign_kvec_cnt].iov_len = len; + con->v2.in_sign_kvec_cnt++; +} + +static void clear_in_sign_kvecs(struct ceph_connection *con) +{ + con->v2.in_sign_kvec_cnt = 0; +} + +static void add_out_sign_kvec(struct ceph_connection *con, void *buf, int len) +{ + BUG_ON(con->v2.out_sign_kvec_cnt >= ARRAY_SIZE(con->v2.out_sign_kvecs)); + + con->v2.out_sign_kvecs[con->v2.out_sign_kvec_cnt].iov_base = buf; + con->v2.out_sign_kvecs[con->v2.out_sign_kvec_cnt].iov_len = len; + con->v2.out_sign_kvec_cnt++; +} + +static void clear_out_sign_kvecs(struct ceph_connection *con) +{ + con->v2.out_sign_kvec_cnt = 0; +} + +static bool con_secure(struct ceph_connection *con) +{ + return con->v2.con_mode == CEPH_CON_MODE_SECURE; +} + +static int front_len(const struct ceph_msg *msg) +{ + return le32_to_cpu(msg->hdr.front_len); +} + +static int middle_len(const struct ceph_msg *msg) +{ + return le32_to_cpu(msg->hdr.middle_len); +} + +static int data_len(const struct ceph_msg *msg) +{ + return le32_to_cpu(msg->hdr.data_len); +} + +static bool need_padding(int len) +{ + return !IS_ALIGNED(len, CEPH_GCM_BLOCK_LEN); +} + +static int padded_len(int len) +{ + return ALIGN(len, CEPH_GCM_BLOCK_LEN); +} + +static int padding_len(int len) +{ + return padded_len(len) - len; +} + +/* preamble + control segment */ +static int head_onwire_len(int ctrl_len, bool secure) +{ + int head_len; + int rem_len; + + if (secure) { + head_len = CEPH_PREAMBLE_SECURE_LEN; + if (ctrl_len > CEPH_PREAMBLE_INLINE_LEN) { + rem_len = ctrl_len - CEPH_PREAMBLE_INLINE_LEN; + head_len += padded_len(rem_len) + CEPH_GCM_TAG_LEN; + } + } else { + head_len = CEPH_PREAMBLE_PLAIN_LEN; + if (ctrl_len) + head_len += ctrl_len + CEPH_CRC_LEN; + } + return head_len; +} + +/* front, middle and data segments + epilogue */ +static int __tail_onwire_len(int front_len, int middle_len, int data_len, + bool secure) +{ + if (!front_len && !middle_len && !data_len) + return 0; + + if (!secure) + return front_len + middle_len + data_len + + CEPH_EPILOGUE_PLAIN_LEN; + + return padded_len(front_len) + padded_len(middle_len) + + padded_len(data_len) + CEPH_EPILOGUE_SECURE_LEN; +} + +static int tail_onwire_len(const struct ceph_msg *msg, bool secure) +{ + return __tail_onwire_len(front_len(msg), middle_len(msg), + data_len(msg), secure); +} + +/* head_onwire_len(sizeof(struct ceph_msg_header2), false) */ +#define MESSAGE_HEAD_PLAIN_LEN (CEPH_PREAMBLE_PLAIN_LEN + \ + sizeof(struct 
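The secure-mode length helpers round every segment up to the AES-GCM block size before it is encrypted. A small worked example of the padded_len()/padding_len() math defined above, with the 16-byte block size the code appears to use:

#include <stdio.h>

#define GCM_BLOCK_LEN 16

static int pad_to(int len)   { return (len + GCM_BLOCK_LEN - 1) & ~(GCM_BLOCK_LEN - 1); }
static int pad_need(int len) { return pad_to(len) - len; }

int main(void)
{
    /* a 100-byte front segment occupies 112 padded bytes on the wire */
    printf("100 -> %d (pad %d)\n", pad_to(100), pad_need(100));
    printf("  0 -> %d (pad %d)\n", pad_to(0), pad_need(0));
    printf(" 16 -> %d (pad %d)\n", pad_to(16), pad_need(16));
    return 0;
}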
ceph_msg_header2) + \ + CEPH_CRC_LEN) + +static const int frame_aligns[] = { + sizeof(void *), + sizeof(void *), + sizeof(void *), + PAGE_SIZE +}; + +/* + * Discards trailing empty segments, unless there is just one segment. + * A frame always has at least one (possibly empty) segment. + */ +static int calc_segment_count(const int *lens, int len_cnt) +{ + int i; + + for (i = len_cnt - 1; i >= 0; i--) { + if (lens[i]) + return i + 1; + } + + return 1; +} + +static void init_frame_desc(struct ceph_frame_desc *desc, int tag, + const int *lens, int len_cnt) +{ + int i; + + memset(desc, 0, sizeof(*desc)); + + desc->fd_tag = tag; + desc->fd_seg_cnt = calc_segment_count(lens, len_cnt); + BUG_ON(desc->fd_seg_cnt > CEPH_FRAME_MAX_SEGMENT_COUNT); + for (i = 0; i < desc->fd_seg_cnt; i++) { + desc->fd_lens[i] = lens[i]; + desc->fd_aligns[i] = frame_aligns[i]; + } +} + +/* + * Preamble crc covers everything up to itself (28 bytes) and + * is calculated and verified irrespective of the connection mode + * (i.e. even if the frame is encrypted). + */ +static void encode_preamble(const struct ceph_frame_desc *desc, void *p) +{ + void *crcp = p + CEPH_PREAMBLE_LEN - CEPH_CRC_LEN; + void *start = p; + int i; + + memset(p, 0, CEPH_PREAMBLE_LEN); + + ceph_encode_8(&p, desc->fd_tag); + ceph_encode_8(&p, desc->fd_seg_cnt); + for (i = 0; i < desc->fd_seg_cnt; i++) { + ceph_encode_32(&p, desc->fd_lens[i]); + ceph_encode_16(&p, desc->fd_aligns[i]); + } + + put_unaligned_le32(crc32c(0, start, crcp - start), crcp); +} + +static int decode_preamble(void *p, struct ceph_frame_desc *desc) +{ + void *crcp = p + CEPH_PREAMBLE_LEN - CEPH_CRC_LEN; + u32 crc, expected_crc; + int i; + + crc = crc32c(0, p, crcp - p); + expected_crc = get_unaligned_le32(crcp); + if (crc != expected_crc) { + pr_err("bad preamble crc, calculated %u, expected %u\n", + crc, expected_crc); + return -EBADMSG; + } + + memset(desc, 0, sizeof(*desc)); + + desc->fd_tag = ceph_decode_8(&p); + desc->fd_seg_cnt = ceph_decode_8(&p); + if (desc->fd_seg_cnt < 1 || + desc->fd_seg_cnt > CEPH_FRAME_MAX_SEGMENT_COUNT) { + pr_err("bad segment count %d\n", desc->fd_seg_cnt); + return -EINVAL; + } + for (i = 0; i < desc->fd_seg_cnt; i++) { + desc->fd_lens[i] = ceph_decode_32(&p); + desc->fd_aligns[i] = ceph_decode_16(&p); + } + + /* + * This would fire for FRAME_TAG_WAIT (it has one empty + * segment), but we should never get it as client. + */ + if (!desc->fd_lens[desc->fd_seg_cnt - 1]) { + pr_err("last segment empty\n"); + return -EINVAL; + } + + if (desc->fd_lens[0] > CEPH_MSG_MAX_CONTROL_LEN) { + pr_err("control segment too big %d\n", desc->fd_lens[0]); + return -EINVAL; + } + if (desc->fd_lens[1] > CEPH_MSG_MAX_FRONT_LEN) { + pr_err("front segment too big %d\n", desc->fd_lens[1]); + return -EINVAL; + } + if (desc->fd_lens[2] > CEPH_MSG_MAX_MIDDLE_LEN) { + pr_err("middle segment too big %d\n", desc->fd_lens[2]); + return -EINVAL; + } + if (desc->fd_lens[3] > CEPH_MSG_MAX_DATA_LEN) { + pr_err("data segment too big %d\n", desc->fd_lens[3]); + return -EINVAL; + } + + return 0; +} + +static void encode_epilogue_plain(struct ceph_connection *con, bool aborted) +{ + con->v2.out_epil.late_status = aborted ? 
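encode_preamble() lays a frame preamble out as one tag byte, one segment-count byte, four (le32 length, le16 alignment) pairs, zero padding, and a crc32c over the preceding 28 bytes in the final four bytes. A sketch of that layout; the raw memcpy stores assume a little-endian host, and the CRC routine is passed in rather than redefined here:

#include <stdint.h>
#include <string.h>

#define PREAMBLE_LEN 32
#define CRC_LEN      4

typedef uint32_t (*crc_fn)(uint32_t seed, const void *p, size_t len);

static void encode_preamble_demo(uint8_t buf[PREAMBLE_LEN], uint8_t tag,
                                 const uint32_t *lens, const uint16_t *aligns,
                                 int seg_cnt, crc_fn crc32c)
{
    uint32_t crc;

    memset(buf, 0, PREAMBLE_LEN);
    buf[0] = tag;
    buf[1] = (uint8_t)seg_cnt;                    /* 1..4 */
    for (int i = 0; i < seg_cnt; i++) {
        memcpy(&buf[2 + i * 6], &lens[i], 4);     /* le32 length */
        memcpy(&buf[6 + i * 6], &aligns[i], 2);   /* le16 alignment */
    }
    crc = crc32c(0, buf, PREAMBLE_LEN - CRC_LEN); /* covers bytes 0..27 */
    memcpy(&buf[PREAMBLE_LEN - CRC_LEN], &crc, CRC_LEN);
}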
FRAME_LATE_STATUS_ABORTED : + FRAME_LATE_STATUS_COMPLETE; + cpu_to_le32s(&con->v2.out_epil.front_crc); + cpu_to_le32s(&con->v2.out_epil.middle_crc); + cpu_to_le32s(&con->v2.out_epil.data_crc); +} + +static void encode_epilogue_secure(struct ceph_connection *con, bool aborted) +{ + memset(&con->v2.out_epil, 0, sizeof(con->v2.out_epil)); + con->v2.out_epil.late_status = aborted ? FRAME_LATE_STATUS_ABORTED : + FRAME_LATE_STATUS_COMPLETE; +} + +static int decode_epilogue(void *p, u32 *front_crc, u32 *middle_crc, + u32 *data_crc) +{ + u8 late_status; + + late_status = ceph_decode_8(&p); + if ((late_status & FRAME_LATE_STATUS_ABORTED_MASK) != + FRAME_LATE_STATUS_COMPLETE) { + /* we should never get an aborted message as client */ + pr_err("bad late_status 0x%x\n", late_status); + return -EINVAL; + } + + if (front_crc && middle_crc && data_crc) { + *front_crc = ceph_decode_32(&p); + *middle_crc = ceph_decode_32(&p); + *data_crc = ceph_decode_32(&p); + } + + return 0; +} + +static void fill_header(struct ceph_msg_header *hdr, + const struct ceph_msg_header2 *hdr2, + int front_len, int middle_len, int data_len, + const struct ceph_entity_name *peer_name) +{ + hdr->seq = hdr2->seq; + hdr->tid = hdr2->tid; + hdr->type = hdr2->type; + hdr->priority = hdr2->priority; + hdr->version = hdr2->version; + hdr->front_len = cpu_to_le32(front_len); + hdr->middle_len = cpu_to_le32(middle_len); + hdr->data_len = cpu_to_le32(data_len); + hdr->data_off = hdr2->data_off; + hdr->src = *peer_name; + hdr->compat_version = hdr2->compat_version; + hdr->reserved = 0; + hdr->crc = 0; +} + +static void fill_header2(struct ceph_msg_header2 *hdr2, + const struct ceph_msg_header *hdr, u64 ack_seq) +{ + hdr2->seq = hdr->seq; + hdr2->tid = hdr->tid; + hdr2->type = hdr->type; + hdr2->priority = hdr->priority; + hdr2->version = hdr->version; + hdr2->data_pre_padding_len = 0; + hdr2->data_off = hdr->data_off; + hdr2->ack_seq = cpu_to_le64(ack_seq); + hdr2->flags = 0; + hdr2->compat_version = hdr->compat_version; + hdr2->reserved = 0; +} + +static int verify_control_crc(struct ceph_connection *con) +{ + int ctrl_len = con->v2.in_desc.fd_lens[0]; + u32 crc, expected_crc; + + WARN_ON(con->v2.in_kvecs[0].iov_len != ctrl_len); + WARN_ON(con->v2.in_kvecs[1].iov_len != CEPH_CRC_LEN); + + crc = crc32c(-1, con->v2.in_kvecs[0].iov_base, ctrl_len); + expected_crc = get_unaligned_le32(con->v2.in_kvecs[1].iov_base); + if (crc != expected_crc) { + pr_err("bad control crc, calculated %u, expected %u\n", + crc, expected_crc); + return -EBADMSG; + } + + return 0; +} + +static int verify_epilogue_crcs(struct ceph_connection *con, u32 front_crc, + u32 middle_crc, u32 data_crc) +{ + if (front_len(con->in_msg)) { + con->in_front_crc = crc32c(-1, con->in_msg->front.iov_base, + front_len(con->in_msg)); + } else { + WARN_ON(!middle_len(con->in_msg) && !data_len(con->in_msg)); + con->in_front_crc = -1; + } + + if (middle_len(con->in_msg)) + con->in_middle_crc = crc32c(-1, + con->in_msg->middle->vec.iov_base, + middle_len(con->in_msg)); + else if (data_len(con->in_msg)) + con->in_middle_crc = -1; + else + con->in_middle_crc = 0; + + if (!data_len(con->in_msg)) + con->in_data_crc = 0; + + dout("%s con %p msg %p crcs %u %u %u\n", __func__, con, con->in_msg, + con->in_front_crc, con->in_middle_crc, con->in_data_crc); + + if (con->in_front_crc != front_crc) { + pr_err("bad front crc, calculated %u, expected %u\n", + con->in_front_crc, front_crc); + return -EBADMSG; + } + if (con->in_middle_crc != middle_crc) { + pr_err("bad middle crc, calculated %u, expected 
%u\n", + con->in_middle_crc, middle_crc); + return -EBADMSG; + } + if (con->in_data_crc != data_crc) { + pr_err("bad data crc, calculated %u, expected %u\n", + con->in_data_crc, data_crc); + return -EBADMSG; + } + + return 0; +} + +static int setup_crypto(struct ceph_connection *con, + const u8 *session_key, int session_key_len, + const u8 *con_secret, int con_secret_len) +{ + unsigned int noio_flag; + int ret; + + dout("%s con %p con_mode %d session_key_len %d con_secret_len %d\n", + __func__, con, con->v2.con_mode, session_key_len, con_secret_len); + WARN_ON(con->v2.hmac_tfm || con->v2.gcm_tfm || con->v2.gcm_req); + + if (con->v2.con_mode != CEPH_CON_MODE_CRC && + con->v2.con_mode != CEPH_CON_MODE_SECURE) { + pr_err("bad con_mode %d\n", con->v2.con_mode); + return -EINVAL; + } + + if (!session_key_len) { + WARN_ON(con->v2.con_mode != CEPH_CON_MODE_CRC); + WARN_ON(con_secret_len); + return 0; /* auth_none */ + } + + noio_flag = memalloc_noio_save(); + con->v2.hmac_tfm = crypto_alloc_shash("hmac(sha256)", 0, 0); + memalloc_noio_restore(noio_flag); + if (IS_ERR(con->v2.hmac_tfm)) { + ret = PTR_ERR(con->v2.hmac_tfm); + con->v2.hmac_tfm = NULL; + pr_err("failed to allocate hmac tfm context: %d\n", ret); + return ret; + } + + WARN_ON((unsigned long)session_key & + crypto_shash_alignmask(con->v2.hmac_tfm)); + ret = crypto_shash_setkey(con->v2.hmac_tfm, session_key, + session_key_len); + if (ret) { + pr_err("failed to set hmac key: %d\n", ret); + return ret; + } + + if (con->v2.con_mode == CEPH_CON_MODE_CRC) { + WARN_ON(con_secret_len); + return 0; /* auth_x, plain mode */ + } + + if (con_secret_len < CEPH_GCM_KEY_LEN + 2 * CEPH_GCM_IV_LEN) { + pr_err("con_secret too small %d\n", con_secret_len); + return -EINVAL; + } + + noio_flag = memalloc_noio_save(); + con->v2.gcm_tfm = crypto_alloc_aead("gcm(aes)", 0, 0); + memalloc_noio_restore(noio_flag); + if (IS_ERR(con->v2.gcm_tfm)) { + ret = PTR_ERR(con->v2.gcm_tfm); + con->v2.gcm_tfm = NULL; + pr_err("failed to allocate gcm tfm context: %d\n", ret); + return ret; + } + + WARN_ON((unsigned long)con_secret & + crypto_aead_alignmask(con->v2.gcm_tfm)); + ret = crypto_aead_setkey(con->v2.gcm_tfm, con_secret, CEPH_GCM_KEY_LEN); + if (ret) { + pr_err("failed to set gcm key: %d\n", ret); + return ret; + } + + WARN_ON(crypto_aead_ivsize(con->v2.gcm_tfm) != CEPH_GCM_IV_LEN); + ret = crypto_aead_setauthsize(con->v2.gcm_tfm, CEPH_GCM_TAG_LEN); + if (ret) { + pr_err("failed to set gcm tag size: %d\n", ret); + return ret; + } + + con->v2.gcm_req = aead_request_alloc(con->v2.gcm_tfm, GFP_NOIO); + if (!con->v2.gcm_req) { + pr_err("failed to allocate gcm request\n"); + return -ENOMEM; + } + + crypto_init_wait(&con->v2.gcm_wait); + aead_request_set_callback(con->v2.gcm_req, CRYPTO_TFM_REQ_MAY_BACKLOG, + crypto_req_done, &con->v2.gcm_wait); + + memcpy(&con->v2.in_gcm_nonce, con_secret + CEPH_GCM_KEY_LEN, + CEPH_GCM_IV_LEN); + memcpy(&con->v2.out_gcm_nonce, + con_secret + CEPH_GCM_KEY_LEN + CEPH_GCM_IV_LEN, + CEPH_GCM_IV_LEN); + return 0; /* auth_x, secure mode */ +} + +static int hmac_sha256(struct ceph_connection *con, const struct kvec *kvecs, + int kvec_cnt, u8 *hmac) +{ + SHASH_DESC_ON_STACK(desc, con->v2.hmac_tfm); /* tfm arg is ignored */ + int ret; + int i; + + dout("%s con %p hmac_tfm %p kvec_cnt %d\n", __func__, con, + con->v2.hmac_tfm, kvec_cnt); + + if (!con->v2.hmac_tfm) { + memset(hmac, 0, SHA256_DIGEST_SIZE); + return 0; /* auth_none */ + } + + desc->tfm = con->v2.hmac_tfm; + ret = crypto_shash_init(desc); + if (ret) + goto out; + + for (i = 0; i < 
kvec_cnt; i++) { + WARN_ON((unsigned long)kvecs[i].iov_base & + crypto_shash_alignmask(con->v2.hmac_tfm)); + ret = crypto_shash_update(desc, kvecs[i].iov_base, + kvecs[i].iov_len); + if (ret) + goto out; + } + + ret = crypto_shash_final(desc, hmac); + +out: + shash_desc_zero(desc); + return ret; /* auth_x, both plain and secure modes */ +} + +static void gcm_inc_nonce(struct ceph_gcm_nonce *nonce) +{ + u64 counter; + + counter = le64_to_cpu(nonce->counter); + nonce->counter = cpu_to_le64(counter + 1); +} + +static int gcm_crypt(struct ceph_connection *con, bool encrypt, + struct scatterlist *src, struct scatterlist *dst, + int src_len) +{ + struct ceph_gcm_nonce *nonce; + int ret; + + nonce = encrypt ? &con->v2.out_gcm_nonce : &con->v2.in_gcm_nonce; + + aead_request_set_ad(con->v2.gcm_req, 0); /* no AAD */ + aead_request_set_crypt(con->v2.gcm_req, src, dst, src_len, (u8 *)nonce); + ret = crypto_wait_req(encrypt ? crypto_aead_encrypt(con->v2.gcm_req) : + crypto_aead_decrypt(con->v2.gcm_req), + &con->v2.gcm_wait); + if (ret) + return ret; + + gcm_inc_nonce(nonce); + return 0; +} + +static void get_bvec_at(struct ceph_msg_data_cursor *cursor, + struct bio_vec *bv) +{ + struct page *page; + size_t off, len; + + WARN_ON(!cursor->total_resid); + + /* skip zero-length data items */ + while (!cursor->resid) + ceph_msg_data_advance(cursor, 0); + + /* get a piece of data, cursor isn't advanced */ + page = ceph_msg_data_next(cursor, &off, &len, NULL); + + bv->bv_page = page; + bv->bv_offset = off; + bv->bv_len = len; +} + +static int calc_sg_cnt(void *buf, int buf_len) +{ + int sg_cnt; + + if (!buf_len) + return 0; + + sg_cnt = need_padding(buf_len) ? 1 : 0; + if (is_vmalloc_addr(buf)) { + WARN_ON(offset_in_page(buf)); + sg_cnt += PAGE_ALIGN(buf_len) >> PAGE_SHIFT; + } else { + sg_cnt++; + } + + return sg_cnt; +} + +static int calc_sg_cnt_cursor(struct ceph_msg_data_cursor *cursor) +{ + int data_len = cursor->total_resid; + struct bio_vec bv; + int sg_cnt; + + if (!data_len) + return 0; + + sg_cnt = need_padding(data_len) ? 
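gcm_inc_nonce() bumps the 64-bit little-endian counter inside the 96-bit GCM nonce after every successful AEAD operation, which is what guarantees a nonce is never reused under the same key. A sketch with an assumed salt-plus-counter layout matching the split the code reads out of con_secret:

#include <stdint.h>

struct gcm_nonce {
    uint32_t fixed;        /* connection-lifetime salt */
    uint64_t counter_le;   /* little-endian, bumped per frame */
} __attribute__((packed));

static uint64_t le64_swap(uint64_t v)
{
#if defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
    return __builtin_bswap64(v);   /* to/from LE on big-endian hosts */
#else
    return v;
#endif
}

static void nonce_inc(struct gcm_nonce *n)
{
    /* le64_swap() is its own inverse, so it converts both ways */
    n->counter_le = le64_swap(le64_swap(n->counter_le) + 1);
}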
1 : 0; + do { + get_bvec_at(cursor, &bv); + sg_cnt++; + + ceph_msg_data_advance(cursor, bv.bv_len); + } while (cursor->total_resid); + + return sg_cnt; +} + +static void init_sgs(struct scatterlist **sg, void *buf, int buf_len, u8 *pad) +{ + void *end = buf + buf_len; + struct page *page; + int len; + void *p; + + if (!buf_len) + return; + + if (is_vmalloc_addr(buf)) { + p = buf; + do { + page = vmalloc_to_page(p); + len = min_t(int, end - p, PAGE_SIZE); + WARN_ON(!page || !len || offset_in_page(p)); + sg_set_page(*sg, page, len, 0); + *sg = sg_next(*sg); + p += len; + } while (p != end); + } else { + sg_set_buf(*sg, buf, buf_len); + *sg = sg_next(*sg); + } + + if (need_padding(buf_len)) { + sg_set_buf(*sg, pad, padding_len(buf_len)); + *sg = sg_next(*sg); + } +} + +static void init_sgs_cursor(struct scatterlist **sg, + struct ceph_msg_data_cursor *cursor, u8 *pad) +{ + int data_len = cursor->total_resid; + struct bio_vec bv; + + if (!data_len) + return; + + do { + get_bvec_at(cursor, &bv); + sg_set_page(*sg, bv.bv_page, bv.bv_len, bv.bv_offset); + *sg = sg_next(*sg); + + ceph_msg_data_advance(cursor, bv.bv_len); + } while (cursor->total_resid); + + if (need_padding(data_len)) { + sg_set_buf(*sg, pad, padding_len(data_len)); + *sg = sg_next(*sg); + } +} + +static int setup_message_sgs(struct sg_table *sgt, struct ceph_msg *msg, + u8 *front_pad, u8 *middle_pad, u8 *data_pad, + void *epilogue, bool add_tag) +{ + struct ceph_msg_data_cursor cursor; + struct scatterlist *cur_sg; + int sg_cnt; + int ret; + + if (!front_len(msg) && !middle_len(msg) && !data_len(msg)) + return 0; + + sg_cnt = 1; /* epilogue + [auth tag] */ + if (front_len(msg)) + sg_cnt += calc_sg_cnt(msg->front.iov_base, + front_len(msg)); + if (middle_len(msg)) + sg_cnt += calc_sg_cnt(msg->middle->vec.iov_base, + middle_len(msg)); + if (data_len(msg)) { + ceph_msg_data_cursor_init(&cursor, msg, data_len(msg)); + sg_cnt += calc_sg_cnt_cursor(&cursor); + } + + ret = sg_alloc_table(sgt, sg_cnt, GFP_NOIO); + if (ret) + return ret; + + cur_sg = sgt->sgl; + if (front_len(msg)) + init_sgs(&cur_sg, msg->front.iov_base, front_len(msg), + front_pad); + if (middle_len(msg)) + init_sgs(&cur_sg, msg->middle->vec.iov_base, middle_len(msg), + middle_pad); + if (data_len(msg)) { + ceph_msg_data_cursor_init(&cursor, msg, data_len(msg)); + init_sgs_cursor(&cur_sg, &cursor, data_pad); + } + + WARN_ON(!sg_is_last(cur_sg)); + sg_set_buf(cur_sg, epilogue, + CEPH_GCM_BLOCK_LEN + (add_tag ? 
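calc_sg_cnt() sizes the scatterlist: a physically contiguous buffer takes one entry, a vmalloc'ed buffer takes one entry per page, and a length that is not a multiple of the GCM block size takes one more entry for the zero padding appended before encryption. A standalone sketch of that accounting, with assumed page and block sizes:

#include <stdbool.h>

#define GCM_BLOCK_LEN 16
#define PAGE_SZ       4096

static int sg_entries(int buf_len, bool is_vmalloc)
{
    int cnt;

    if (!buf_len)
        return 0;
    cnt = (buf_len % GCM_BLOCK_LEN) ? 1 : 0;          /* padding entry */
    if (is_vmalloc)
        cnt += (buf_len + PAGE_SZ - 1) / PAGE_SZ;     /* one per page */
    else
        cnt += 1;                                     /* single run */
    return cnt;
}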
CEPH_GCM_TAG_LEN : 0)); + return 0; +} + +static int decrypt_preamble(struct ceph_connection *con) +{ + struct scatterlist sg; + + sg_init_one(&sg, con->v2.in_buf, CEPH_PREAMBLE_SECURE_LEN); + return gcm_crypt(con, false, &sg, &sg, CEPH_PREAMBLE_SECURE_LEN); +} + +static int decrypt_control_remainder(struct ceph_connection *con) +{ + int ctrl_len = con->v2.in_desc.fd_lens[0]; + int rem_len = ctrl_len - CEPH_PREAMBLE_INLINE_LEN; + int pt_len = padding_len(rem_len) + CEPH_GCM_TAG_LEN; + struct scatterlist sgs[2]; + + WARN_ON(con->v2.in_kvecs[0].iov_len != rem_len); + WARN_ON(con->v2.in_kvecs[1].iov_len != pt_len); + + sg_init_table(sgs, 2); + sg_set_buf(&sgs[0], con->v2.in_kvecs[0].iov_base, rem_len); + sg_set_buf(&sgs[1], con->v2.in_buf, pt_len); + + return gcm_crypt(con, false, sgs, sgs, + padded_len(rem_len) + CEPH_GCM_TAG_LEN); +} + +static int decrypt_message(struct ceph_connection *con) +{ + struct sg_table sgt = {}; + int ret; + + ret = setup_message_sgs(&sgt, con->in_msg, FRONT_PAD(con->v2.in_buf), + MIDDLE_PAD(con->v2.in_buf), DATA_PAD(con->v2.in_buf), + con->v2.in_buf, true); + if (ret) + goto out; + + ret = gcm_crypt(con, false, sgt.sgl, sgt.sgl, + tail_onwire_len(con->in_msg, true)); + +out: + sg_free_table(&sgt); + return ret; +} + +static int prepare_banner(struct ceph_connection *con) +{ + int buf_len = CEPH_BANNER_V2_LEN + 2 + 8 + 8; + void *buf, *p; + + buf = alloc_conn_buf(con, buf_len); + if (!buf) + return -ENOMEM; + + p = buf; + ceph_encode_copy(&p, CEPH_BANNER_V2, CEPH_BANNER_V2_LEN); + ceph_encode_16(&p, sizeof(u64) + sizeof(u64)); + ceph_encode_64(&p, CEPH_MSGR2_SUPPORTED_FEATURES); + ceph_encode_64(&p, CEPH_MSGR2_REQUIRED_FEATURES); + WARN_ON(p != buf + buf_len); + + add_out_kvec(con, buf, buf_len); + add_out_sign_kvec(con, buf, buf_len); + ceph_con_flag_set(con, CEPH_CON_F_WRITE_PENDING); + return 0; +} + +/* + * base: + * preamble + * control body (ctrl_len bytes) + * space for control crc + * + * extdata (optional): + * control body (extdata_len bytes) + * + * Compute control crc and gather base and extdata into: + * + * preamble + * control body (ctrl_len + extdata_len bytes) + * control crc + * + * Preamble should already be encoded at the start of base. + */ +static void prepare_head_plain(struct ceph_connection *con, void *base, + int ctrl_len, void *extdata, int extdata_len, + bool to_be_signed) +{ + int base_len = CEPH_PREAMBLE_LEN + ctrl_len + CEPH_CRC_LEN; + void *crcp = base + base_len - CEPH_CRC_LEN; + u32 crc; + + crc = crc32c(-1, CTRL_BODY(base), ctrl_len); + if (extdata_len) + crc = crc32c(crc, extdata, extdata_len); + put_unaligned_le32(crc, crcp); + + if (!extdata_len) { + add_out_kvec(con, base, base_len); + if (to_be_signed) + add_out_sign_kvec(con, base, base_len); + return; + } + + add_out_kvec(con, base, crcp - base); + add_out_kvec(con, extdata, extdata_len); + add_out_kvec(con, crcp, CEPH_CRC_LEN); + if (to_be_signed) { + add_out_sign_kvec(con, base, crcp - base); + add_out_sign_kvec(con, extdata, extdata_len); + add_out_sign_kvec(con, crcp, CEPH_CRC_LEN); + } +} + +static int prepare_head_secure_small(struct ceph_connection *con, + void *base, int ctrl_len) +{ + struct scatterlist sg; + int ret; + + /* inline buffer padding? 
*/ + if (ctrl_len < CEPH_PREAMBLE_INLINE_LEN) + memset(CTRL_BODY(base) + ctrl_len, 0, + CEPH_PREAMBLE_INLINE_LEN - ctrl_len); + + sg_init_one(&sg, base, CEPH_PREAMBLE_SECURE_LEN); + ret = gcm_crypt(con, true, &sg, &sg, + CEPH_PREAMBLE_SECURE_LEN - CEPH_GCM_TAG_LEN); + if (ret) + return ret; + + add_out_kvec(con, base, CEPH_PREAMBLE_SECURE_LEN); + return 0; +} + +/* + * base: + * preamble + * control body (ctrl_len bytes) + * space for padding, if needed + * space for control remainder auth tag + * space for preamble auth tag + * + * Encrypt preamble and the inline portion, then encrypt the remainder + * and gather into: + * + * preamble + * control body (48 bytes) + * preamble auth tag + * control body (ctrl_len - 48 bytes) + * zero padding, if needed + * control remainder auth tag + * + * Preamble should already be encoded at the start of base. + */ +static int prepare_head_secure_big(struct ceph_connection *con, + void *base, int ctrl_len) +{ + int rem_len = ctrl_len - CEPH_PREAMBLE_INLINE_LEN; + void *rem = CTRL_BODY(base) + CEPH_PREAMBLE_INLINE_LEN; + void *rem_tag = rem + padded_len(rem_len); + void *pmbl_tag = rem_tag + CEPH_GCM_TAG_LEN; + struct scatterlist sgs[2]; + int ret; + + sg_init_table(sgs, 2); + sg_set_buf(&sgs[0], base, rem - base); + sg_set_buf(&sgs[1], pmbl_tag, CEPH_GCM_TAG_LEN); + ret = gcm_crypt(con, true, sgs, sgs, rem - base); + if (ret) + return ret; + + /* control remainder padding? */ + if (need_padding(rem_len)) + memset(rem + rem_len, 0, padding_len(rem_len)); + + sg_init_one(&sgs[0], rem, pmbl_tag - rem); + ret = gcm_crypt(con, true, sgs, sgs, rem_tag - rem); + if (ret) + return ret; + + add_out_kvec(con, base, rem - base); + add_out_kvec(con, pmbl_tag, CEPH_GCM_TAG_LEN); + add_out_kvec(con, rem, pmbl_tag - rem); + return 0; +} + +static int __prepare_control(struct ceph_connection *con, int tag, + void *base, int ctrl_len, void *extdata, + int extdata_len, bool to_be_signed) +{ + int total_len = ctrl_len + extdata_len; + struct ceph_frame_desc desc; + int ret; + + dout("%s con %p tag %d len %d (%d+%d)\n", __func__, con, tag, + total_len, ctrl_len, extdata_len); + + /* extdata may be vmalloc'ed but not base */ + if (WARN_ON(is_vmalloc_addr(base) || !ctrl_len)) + return -EINVAL; + + init_frame_desc(&desc, tag, &total_len, 1); + encode_preamble(&desc, base); + + if (con_secure(con)) { + if (WARN_ON(extdata_len || to_be_signed)) + return -EINVAL; + + if (ctrl_len <= CEPH_PREAMBLE_INLINE_LEN) + /* fully inlined, inline buffer may need padding */ + ret = prepare_head_secure_small(con, base, ctrl_len); + else + /* partially inlined, inline buffer is full */ + ret = prepare_head_secure_big(con, base, ctrl_len); + if (ret) + return ret; + } else { + prepare_head_plain(con, base, ctrl_len, extdata, extdata_len, + to_be_signed); + } + + ceph_con_flag_set(con, CEPH_CON_F_WRITE_PENDING); + return 0; +} + +static int prepare_control(struct ceph_connection *con, int tag, + void *base, int ctrl_len) +{ + return __prepare_control(con, tag, base, ctrl_len, NULL, 0, false); +} + +static int prepare_hello(struct ceph_connection *con) +{ + void *buf, *p; + int ctrl_len; + + ctrl_len = 1 + ceph_entity_addr_encoding_len(&con->peer_addr); + buf = alloc_conn_buf(con, head_onwire_len(ctrl_len, false)); + if (!buf) + return -ENOMEM; + + p = CTRL_BODY(buf); + ceph_encode_8(&p, CEPH_ENTITY_TYPE_CLIENT); + ceph_encode_entity_addr(&p, &con->peer_addr); + WARN_ON(p != CTRL_BODY(buf) + ctrl_len); + + return __prepare_control(con, FRAME_TAG_HELLO, buf, ctrl_len, + NULL, 0, true); +} + +/* so 
that head_onwire_len(AUTH_BUF_LEN, false) is 512 */ +#define AUTH_BUF_LEN (512 - CEPH_CRC_LEN - CEPH_PREAMBLE_PLAIN_LEN) + +static int prepare_auth_request(struct ceph_connection *con) +{ + void *authorizer, *authorizer_copy; + int ctrl_len, authorizer_len; + void *buf; + int ret; + + ctrl_len = AUTH_BUF_LEN; + buf = alloc_conn_buf(con, head_onwire_len(ctrl_len, false)); + if (!buf) + return -ENOMEM; + + mutex_unlock(&con->mutex); + ret = con->ops->get_auth_request(con, CTRL_BODY(buf), &ctrl_len, + &authorizer, &authorizer_len); + mutex_lock(&con->mutex); + if (con->state != CEPH_CON_S_V2_HELLO) { + dout("%s con %p state changed to %d\n", __func__, con, + con->state); + return -EAGAIN; + } + + dout("%s con %p get_auth_request ret %d\n", __func__, con, ret); + if (ret) + return ret; + + authorizer_copy = alloc_conn_buf(con, authorizer_len); + if (!authorizer_copy) + return -ENOMEM; + + memcpy(authorizer_copy, authorizer, authorizer_len); + + return __prepare_control(con, FRAME_TAG_AUTH_REQUEST, buf, ctrl_len, + authorizer_copy, authorizer_len, true); +} + +static int prepare_auth_request_more(struct ceph_connection *con, + void *reply, int reply_len) +{ + int ctrl_len, authorizer_len; + void *authorizer; + void *buf; + int ret; + + ctrl_len = AUTH_BUF_LEN; + buf = alloc_conn_buf(con, head_onwire_len(ctrl_len, false)); + if (!buf) + return -ENOMEM; + + mutex_unlock(&con->mutex); + ret = con->ops->handle_auth_reply_more(con, reply, reply_len, + CTRL_BODY(buf), &ctrl_len, + &authorizer, &authorizer_len); + mutex_lock(&con->mutex); + if (con->state != CEPH_CON_S_V2_AUTH) { + dout("%s con %p state changed to %d\n", __func__, con, + con->state); + return -EAGAIN; + } + + dout("%s con %p handle_auth_reply_more ret %d\n", __func__, con, ret); + if (ret) + return ret; + + return __prepare_control(con, FRAME_TAG_AUTH_REQUEST_MORE, buf, + ctrl_len, authorizer, authorizer_len, true); +} + +static int prepare_auth_signature(struct ceph_connection *con) +{ + void *buf; + int ret; + + buf = alloc_conn_buf(con, head_onwire_len(SHA256_DIGEST_SIZE, + con_secure(con))); + if (!buf) + return -ENOMEM; + + ret = hmac_sha256(con, con->v2.in_sign_kvecs, con->v2.in_sign_kvec_cnt, + CTRL_BODY(buf)); + if (ret) + return ret; + + return prepare_control(con, FRAME_TAG_AUTH_SIGNATURE, buf, + SHA256_DIGEST_SIZE); +} + +static int prepare_client_ident(struct ceph_connection *con) +{ + struct ceph_entity_addr *my_addr = &con->msgr->inst.addr; + struct ceph_client *client = from_msgr(con->msgr); + u64 global_id = ceph_client_gid(client); + void *buf, *p; + int ctrl_len; + + WARN_ON(con->v2.server_cookie); + WARN_ON(con->v2.connect_seq); + WARN_ON(con->v2.peer_global_seq); + + if (!con->v2.client_cookie) { + do { + get_random_bytes(&con->v2.client_cookie, + sizeof(con->v2.client_cookie)); + } while (!con->v2.client_cookie); + dout("%s con %p generated cookie 0x%llx\n", __func__, con, + con->v2.client_cookie); + } else { + dout("%s con %p cookie already set 0x%llx\n", __func__, con, + con->v2.client_cookie); + } + + dout("%s con %p my_addr %s/%u peer_addr %s/%u global_id %llu global_seq %llu features 0x%llx required_features 0x%llx cookie 0x%llx\n", + __func__, con, ceph_pr_addr(my_addr), le32_to_cpu(my_addr->nonce), + ceph_pr_addr(&con->peer_addr), le32_to_cpu(con->peer_addr.nonce), + global_id, con->v2.global_seq, client->supported_features, + client->required_features, con->v2.client_cookie); + + ctrl_len = 1 + 4 + ceph_entity_addr_encoding_len(my_addr) + + ceph_entity_addr_encoding_len(&con->peer_addr) + 6 * 8; + buf = 
alloc_conn_buf(con, head_onwire_len(ctrl_len, con_secure(con))); + if (!buf) + return -ENOMEM; + + p = CTRL_BODY(buf); + ceph_encode_8(&p, 2); /* addrvec marker */ + ceph_encode_32(&p, 1); /* addr_cnt */ + ceph_encode_entity_addr(&p, my_addr); + ceph_encode_entity_addr(&p, &con->peer_addr); + ceph_encode_64(&p, global_id); + ceph_encode_64(&p, con->v2.global_seq); + ceph_encode_64(&p, client->supported_features); + ceph_encode_64(&p, client->required_features); + ceph_encode_64(&p, 0); /* flags */ + ceph_encode_64(&p, con->v2.client_cookie); + WARN_ON(p != CTRL_BODY(buf) + ctrl_len); + + return prepare_control(con, FRAME_TAG_CLIENT_IDENT, buf, ctrl_len); +} + +static int prepare_session_reconnect(struct ceph_connection *con) +{ + struct ceph_entity_addr *my_addr = &con->msgr->inst.addr; + void *buf, *p; + int ctrl_len; + + WARN_ON(!con->v2.client_cookie); + WARN_ON(!con->v2.server_cookie); + WARN_ON(!con->v2.connect_seq); + WARN_ON(!con->v2.peer_global_seq); + + dout("%s con %p my_addr %s/%u client_cookie 0x%llx server_cookie 0x%llx global_seq %llu connect_seq %llu in_seq %llu\n", + __func__, con, ceph_pr_addr(my_addr), le32_to_cpu(my_addr->nonce), + con->v2.client_cookie, con->v2.server_cookie, con->v2.global_seq, + con->v2.connect_seq, con->in_seq); + + ctrl_len = 1 + 4 + ceph_entity_addr_encoding_len(my_addr) + 5 * 8; + buf = alloc_conn_buf(con, head_onwire_len(ctrl_len, con_secure(con))); + if (!buf) + return -ENOMEM; + + p = CTRL_BODY(buf); + ceph_encode_8(&p, 2); /* entity_addrvec_t marker */ + ceph_encode_32(&p, 1); /* my_addrs len */ + ceph_encode_entity_addr(&p, my_addr); + ceph_encode_64(&p, con->v2.client_cookie); + ceph_encode_64(&p, con->v2.server_cookie); + ceph_encode_64(&p, con->v2.global_seq); + ceph_encode_64(&p, con->v2.connect_seq); + ceph_encode_64(&p, con->in_seq); + WARN_ON(p != CTRL_BODY(buf) + ctrl_len); + + return prepare_control(con, FRAME_TAG_SESSION_RECONNECT, buf, ctrl_len); +} + +static int prepare_keepalive2(struct ceph_connection *con) +{ + struct ceph_timespec *ts = CTRL_BODY(con->v2.out_buf); + struct timespec64 now; + + ktime_get_real_ts64(&now); + dout("%s con %p timestamp %lld.%09ld\n", __func__, con, now.tv_sec, + now.tv_nsec); + + ceph_encode_timespec64(ts, &now); + + reset_out_kvecs(con); + return prepare_control(con, FRAME_TAG_KEEPALIVE2, con->v2.out_buf, + sizeof(struct ceph_timespec)); +} + +static int prepare_ack(struct ceph_connection *con) +{ + void *p; + + dout("%s con %p in_seq_acked %llu -> %llu\n", __func__, con, + con->in_seq_acked, con->in_seq); + con->in_seq_acked = con->in_seq; + + p = CTRL_BODY(con->v2.out_buf); + ceph_encode_64(&p, con->in_seq_acked); + + reset_out_kvecs(con); + return prepare_control(con, FRAME_TAG_ACK, con->v2.out_buf, 8); +} + +static void prepare_epilogue_plain(struct ceph_connection *con, bool aborted) +{ + dout("%s con %p msg %p aborted %d crcs %u %u %u\n", __func__, con, + con->out_msg, aborted, con->v2.out_epil.front_crc, + con->v2.out_epil.middle_crc, con->v2.out_epil.data_crc); + + encode_epilogue_plain(con, aborted); + add_out_kvec(con, &con->v2.out_epil, CEPH_EPILOGUE_PLAIN_LEN); +} + +/* + * For "used" empty segments, crc is -1. For unused (trailing) + * segments, crc is 0. 
+ */ +static void prepare_message_plain(struct ceph_connection *con) +{ + struct ceph_msg *msg = con->out_msg; + + prepare_head_plain(con, con->v2.out_buf, + sizeof(struct ceph_msg_header2), NULL, 0, false); + + if (!front_len(msg) && !middle_len(msg)) { + if (!data_len(msg)) { + /* + * Empty message: once the head is written, + * we are done -- there is no epilogue. + */ + con->v2.out_state = OUT_S_FINISH_MESSAGE; + return; + } + + con->v2.out_epil.front_crc = -1; + con->v2.out_epil.middle_crc = -1; + con->v2.out_state = OUT_S_QUEUE_DATA; + return; + } + + if (front_len(msg)) { + con->v2.out_epil.front_crc = crc32c(-1, msg->front.iov_base, + front_len(msg)); + add_out_kvec(con, msg->front.iov_base, front_len(msg)); + } else { + /* middle (at least) is there, checked above */ + con->v2.out_epil.front_crc = -1; + } + + if (middle_len(msg)) { + con->v2.out_epil.middle_crc = + crc32c(-1, msg->middle->vec.iov_base, middle_len(msg)); + add_out_kvec(con, msg->middle->vec.iov_base, middle_len(msg)); + } else { + con->v2.out_epil.middle_crc = data_len(msg) ? -1 : 0; + } + + if (data_len(msg)) { + con->v2.out_state = OUT_S_QUEUE_DATA; + } else { + con->v2.out_epil.data_crc = 0; + prepare_epilogue_plain(con, false); + con->v2.out_state = OUT_S_FINISH_MESSAGE; + } +} + +/* + * Unfortunately the kernel crypto API doesn't support streaming + * (piecewise) operation for AEAD algorithms, so we can't get away + * with a fixed size buffer and a couple sgs. Instead, we have to + * allocate pages for the entire tail of the message (currently up + * to ~32M) and two sgs arrays (up to ~256K each)... + */ +static int prepare_message_secure(struct ceph_connection *con) +{ + void *zerop = page_address(ceph_zero_page); + struct sg_table enc_sgt = {}; + struct sg_table sgt = {}; + struct page **enc_pages; + int enc_page_cnt; + int tail_len; + int ret; + + ret = prepare_head_secure_small(con, con->v2.out_buf, + sizeof(struct ceph_msg_header2)); + if (ret) + return ret; + + tail_len = tail_onwire_len(con->out_msg, true); + if (!tail_len) { + /* + * Empty message: once the head is written, + * we are done -- there is no epilogue. 
+ */ + con->v2.out_state = OUT_S_FINISH_MESSAGE; + return 0; + } + + encode_epilogue_secure(con, false); + ret = setup_message_sgs(&sgt, con->out_msg, zerop, zerop, zerop, + &con->v2.out_epil, false); + if (ret) + goto out; + + enc_page_cnt = calc_pages_for(0, tail_len); + enc_pages = ceph_alloc_page_vector(enc_page_cnt, GFP_NOIO); + if (IS_ERR(enc_pages)) { + ret = PTR_ERR(enc_pages); + goto out; + } + + WARN_ON(con->v2.out_enc_pages || con->v2.out_enc_page_cnt); + con->v2.out_enc_pages = enc_pages; + con->v2.out_enc_page_cnt = enc_page_cnt; + con->v2.out_enc_resid = tail_len; + con->v2.out_enc_i = 0; + + ret = sg_alloc_table_from_pages(&enc_sgt, enc_pages, enc_page_cnt, + 0, tail_len, GFP_NOIO); + if (ret) + goto out; + + ret = gcm_crypt(con, true, sgt.sgl, enc_sgt.sgl, + tail_len - CEPH_GCM_TAG_LEN); + if (ret) + goto out; + + dout("%s con %p msg %p sg_cnt %d enc_page_cnt %d\n", __func__, con, + con->out_msg, sgt.orig_nents, enc_page_cnt); + con->v2.out_state = OUT_S_QUEUE_ENC_PAGE; + +out: + sg_free_table(&sgt); + sg_free_table(&enc_sgt); + return ret; +} + +static int prepare_message(struct ceph_connection *con) +{ + int lens[] = { + sizeof(struct ceph_msg_header2), + front_len(con->out_msg), + middle_len(con->out_msg), + data_len(con->out_msg) + }; + struct ceph_frame_desc desc; + int ret; + + dout("%s con %p msg %p logical %d+%d+%d+%d\n", __func__, con, + con->out_msg, lens[0], lens[1], lens[2], lens[3]); + + if (con->in_seq > con->in_seq_acked) { + dout("%s con %p in_seq_acked %llu -> %llu\n", __func__, con, + con->in_seq_acked, con->in_seq); + con->in_seq_acked = con->in_seq; + } + + reset_out_kvecs(con); + init_frame_desc(&desc, FRAME_TAG_MESSAGE, lens, 4); + encode_preamble(&desc, con->v2.out_buf); + fill_header2(CTRL_BODY(con->v2.out_buf), &con->out_msg->hdr, + con->in_seq_acked); + + if (con_secure(con)) { + ret = prepare_message_secure(con); + if (ret) + return ret; + } else { + prepare_message_plain(con); + } + + ceph_con_flag_set(con, CEPH_CON_F_WRITE_PENDING); + return 0; +} + +static int prepare_read_banner_prefix(struct ceph_connection *con) +{ + void *buf; + + buf = alloc_conn_buf(con, CEPH_BANNER_V2_PREFIX_LEN); + if (!buf) + return -ENOMEM; + + reset_in_kvecs(con); + add_in_kvec(con, buf, CEPH_BANNER_V2_PREFIX_LEN); + add_in_sign_kvec(con, buf, CEPH_BANNER_V2_PREFIX_LEN); + con->state = CEPH_CON_S_V2_BANNER_PREFIX; + return 0; +} + +static int prepare_read_banner_payload(struct ceph_connection *con, + int payload_len) +{ + void *buf; + + buf = alloc_conn_buf(con, payload_len); + if (!buf) + return -ENOMEM; + + reset_in_kvecs(con); + add_in_kvec(con, buf, payload_len); + add_in_sign_kvec(con, buf, payload_len); + con->state = CEPH_CON_S_V2_BANNER_PAYLOAD; + return 0; +} + +static void prepare_read_preamble(struct ceph_connection *con) +{ + reset_in_kvecs(con); + add_in_kvec(con, con->v2.in_buf, + con_secure(con) ? 
CEPH_PREAMBLE_SECURE_LEN : + CEPH_PREAMBLE_PLAIN_LEN); + con->v2.in_state = IN_S_HANDLE_PREAMBLE; +} + +static int prepare_read_control(struct ceph_connection *con) +{ + int ctrl_len = con->v2.in_desc.fd_lens[0]; + int head_len; + void *buf; + + reset_in_kvecs(con); + if (con->state == CEPH_CON_S_V2_HELLO || + con->state == CEPH_CON_S_V2_AUTH) { + head_len = head_onwire_len(ctrl_len, false); + buf = alloc_conn_buf(con, head_len); + if (!buf) + return -ENOMEM; + + /* preserve preamble */ + memcpy(buf, con->v2.in_buf, CEPH_PREAMBLE_LEN); + + add_in_kvec(con, CTRL_BODY(buf), ctrl_len); + add_in_kvec(con, CTRL_BODY(buf) + ctrl_len, CEPH_CRC_LEN); + add_in_sign_kvec(con, buf, head_len); + } else { + if (ctrl_len > CEPH_PREAMBLE_INLINE_LEN) { + buf = alloc_conn_buf(con, ctrl_len); + if (!buf) + return -ENOMEM; + + add_in_kvec(con, buf, ctrl_len); + } else { + add_in_kvec(con, CTRL_BODY(con->v2.in_buf), ctrl_len); + } + add_in_kvec(con, con->v2.in_buf, CEPH_CRC_LEN); + } + con->v2.in_state = IN_S_HANDLE_CONTROL; + return 0; +} + +static int prepare_read_control_remainder(struct ceph_connection *con) +{ + int ctrl_len = con->v2.in_desc.fd_lens[0]; + int rem_len = ctrl_len - CEPH_PREAMBLE_INLINE_LEN; + void *buf; + + buf = alloc_conn_buf(con, ctrl_len); + if (!buf) + return -ENOMEM; + + memcpy(buf, CTRL_BODY(con->v2.in_buf), CEPH_PREAMBLE_INLINE_LEN); + + reset_in_kvecs(con); + add_in_kvec(con, buf + CEPH_PREAMBLE_INLINE_LEN, rem_len); + add_in_kvec(con, con->v2.in_buf, + padding_len(rem_len) + CEPH_GCM_TAG_LEN); + con->v2.in_state = IN_S_HANDLE_CONTROL_REMAINDER; + return 0; +} + +static void prepare_read_data(struct ceph_connection *con) +{ + struct bio_vec bv; + + if (!con_secure(con)) + con->in_data_crc = -1; + ceph_msg_data_cursor_init(&con->v2.in_cursor, con->in_msg, + data_len(con->in_msg)); + + get_bvec_at(&con->v2.in_cursor, &bv); + set_in_bvec(con, &bv); + con->v2.in_state = IN_S_PREPARE_READ_DATA_CONT; +} + +static void prepare_read_data_cont(struct ceph_connection *con) +{ + struct bio_vec bv; + + if (!con_secure(con)) + con->in_data_crc = ceph_crc32c_page(con->in_data_crc, + con->v2.in_bvec.bv_page, + con->v2.in_bvec.bv_offset, + con->v2.in_bvec.bv_len); + + ceph_msg_data_advance(&con->v2.in_cursor, con->v2.in_bvec.bv_len); + if (con->v2.in_cursor.total_resid) { + get_bvec_at(&con->v2.in_cursor, &bv); + set_in_bvec(con, &bv); + WARN_ON(con->v2.in_state != IN_S_PREPARE_READ_DATA_CONT); + return; + } + + /* + * We've read all data. Prepare to read data padding (if any) + * and epilogue. 
+ */ + reset_in_kvecs(con); + if (con_secure(con)) { + if (need_padding(data_len(con->in_msg))) + add_in_kvec(con, DATA_PAD(con->v2.in_buf), + padding_len(data_len(con->in_msg))); + add_in_kvec(con, con->v2.in_buf, CEPH_EPILOGUE_SECURE_LEN); + } else { + add_in_kvec(con, con->v2.in_buf, CEPH_EPILOGUE_PLAIN_LEN); + } + con->v2.in_state = IN_S_HANDLE_EPILOGUE; +} + +static void __finish_skip(struct ceph_connection *con) +{ + con->in_seq++; + prepare_read_preamble(con); +} + +static void prepare_skip_message(struct ceph_connection *con) +{ + struct ceph_frame_desc *desc = &con->v2.in_desc; + int tail_len; + + dout("%s con %p %d+%d+%d\n", __func__, con, desc->fd_lens[1], + desc->fd_lens[2], desc->fd_lens[3]); + + tail_len = __tail_onwire_len(desc->fd_lens[1], desc->fd_lens[2], + desc->fd_lens[3], con_secure(con)); + if (!tail_len) { + __finish_skip(con); + } else { + set_in_skip(con, tail_len); + con->v2.in_state = IN_S_FINISH_SKIP; + } +} + +static int process_banner_prefix(struct ceph_connection *con) +{ + int payload_len; + void *p; + + WARN_ON(con->v2.in_kvecs[0].iov_len != CEPH_BANNER_V2_PREFIX_LEN); + + p = con->v2.in_kvecs[0].iov_base; + if (memcmp(p, CEPH_BANNER_V2, CEPH_BANNER_V2_LEN)) { + if (!memcmp(p, CEPH_BANNER, CEPH_BANNER_LEN)) + con->error_msg = "server is speaking msgr1 protocol"; + else + con->error_msg = "protocol error, bad banner"; + return -EINVAL; + } + + p += CEPH_BANNER_V2_LEN; + payload_len = ceph_decode_16(&p); + dout("%s con %p payload_len %d\n", __func__, con, payload_len); + + return prepare_read_banner_payload(con, payload_len); +} + +static int process_banner_payload(struct ceph_connection *con) +{ + void *end = con->v2.in_kvecs[0].iov_base + con->v2.in_kvecs[0].iov_len; + u64 feat = CEPH_MSGR2_SUPPORTED_FEATURES; + u64 req_feat = CEPH_MSGR2_REQUIRED_FEATURES; + u64 server_feat, server_req_feat; + void *p; + int ret; + + p = con->v2.in_kvecs[0].iov_base; + ceph_decode_64_safe(&p, end, server_feat, bad); + ceph_decode_64_safe(&p, end, server_req_feat, bad); + + dout("%s con %p server_feat 0x%llx server_req_feat 0x%llx\n", + __func__, con, server_feat, server_req_feat); + + if (req_feat & ~server_feat) { + pr_err("msgr2 feature set mismatch: my required > server's supported 0x%llx, need 0x%llx\n", + server_feat, req_feat & ~server_feat); + con->error_msg = "missing required protocol features"; + return -EINVAL; + } + if (server_req_feat & ~feat) { + pr_err("msgr2 feature set mismatch: server's required > my supported 0x%llx, missing 0x%llx\n", + feat, server_req_feat & ~feat); + con->error_msg = "missing required protocol features"; + return -EINVAL; + } + + /* no reset_out_kvecs() as our banner may still be pending */ + ret = prepare_hello(con); + if (ret) { + pr_err("prepare_hello failed: %d\n", ret); + return ret; + } + + con->state = CEPH_CON_S_V2_HELLO; + prepare_read_preamble(con); + return 0; + +bad: + pr_err("failed to decode banner payload\n"); + return -EINVAL; +} + +static int process_hello(struct ceph_connection *con, void *p, void *end) +{ + struct ceph_entity_addr *my_addr = &con->msgr->inst.addr; + struct ceph_entity_addr addr_for_me; + u8 entity_type; + int ret; + + if (con->state != CEPH_CON_S_V2_HELLO) { + con->error_msg = "protocol error, unexpected hello"; + return -EINVAL; + } + + ceph_decode_8_safe(&p, end, entity_type, bad); + ret = ceph_decode_entity_addr(&p, end, &addr_for_me); + if (ret) { + pr_err("failed to decode addr_for_me: %d\n", ret); + return ret; + } + + dout("%s con %p entity_type %d addr_for_me %s\n", __func__, con, + 
entity_type, ceph_pr_addr(&addr_for_me)); + + if (entity_type != con->peer_name.type) { + pr_err("bad peer type, want %d, got %d\n", + con->peer_name.type, entity_type); + con->error_msg = "wrong peer at address"; + return -EINVAL; + } + + /* + * Set our address to the address our first peer (i.e. monitor) + * sees that we are connecting from. If we are behind some sort + * of NAT and want to be identified by some private (not NATed) + * address, ip option should be used. + */ + if (ceph_addr_is_blank(my_addr)) { + memcpy(&my_addr->in_addr, &addr_for_me.in_addr, + sizeof(my_addr->in_addr)); + ceph_addr_set_port(my_addr, 0); + dout("%s con %p set my addr %s, as seen by peer %s\n", + __func__, con, ceph_pr_addr(my_addr), + ceph_pr_addr(&con->peer_addr)); + } else { + dout("%s con %p my addr already set %s\n", + __func__, con, ceph_pr_addr(my_addr)); + } + + WARN_ON(ceph_addr_is_blank(my_addr) || ceph_addr_port(my_addr)); + WARN_ON(my_addr->type != CEPH_ENTITY_ADDR_TYPE_ANY); + WARN_ON(!my_addr->nonce); + + /* no reset_out_kvecs() as our hello may still be pending */ + ret = prepare_auth_request(con); + if (ret) { + if (ret != -EAGAIN) + pr_err("prepare_auth_request failed: %d\n", ret); + return ret; + } + + con->state = CEPH_CON_S_V2_AUTH; + return 0; + +bad: + pr_err("failed to decode hello\n"); + return -EINVAL; +} + +static int process_auth_bad_method(struct ceph_connection *con, + void *p, void *end) +{ + int allowed_protos[8], allowed_modes[8]; + int allowed_proto_cnt, allowed_mode_cnt; + int used_proto, result; + int ret; + int i; + + if (con->state != CEPH_CON_S_V2_AUTH) { + con->error_msg = "protocol error, unexpected auth_bad_method"; + return -EINVAL; + } + + ceph_decode_32_safe(&p, end, used_proto, bad); + ceph_decode_32_safe(&p, end, result, bad); + dout("%s con %p used_proto %d result %d\n", __func__, con, used_proto, + result); + + ceph_decode_32_safe(&p, end, allowed_proto_cnt, bad); + if (allowed_proto_cnt > ARRAY_SIZE(allowed_protos)) { + pr_err("allowed_protos too big %d\n", allowed_proto_cnt); + return -EINVAL; + } + for (i = 0; i < allowed_proto_cnt; i++) { + ceph_decode_32_safe(&p, end, allowed_protos[i], bad); + dout("%s con %p allowed_protos[%d] %d\n", __func__, con, + i, allowed_protos[i]); + } + + ceph_decode_32_safe(&p, end, allowed_mode_cnt, bad); + if (allowed_mode_cnt > ARRAY_SIZE(allowed_modes)) { + pr_err("allowed_modes too big %d\n", allowed_mode_cnt); + return -EINVAL; + } + for (i = 0; i < allowed_mode_cnt; i++) { + ceph_decode_32_safe(&p, end, allowed_modes[i], bad); + dout("%s con %p allowed_modes[%d] %d\n", __func__, con, + i, allowed_modes[i]); + } + + mutex_unlock(&con->mutex); + ret = con->ops->handle_auth_bad_method(con, used_proto, result, + allowed_protos, + allowed_proto_cnt, + allowed_modes, + allowed_mode_cnt); + mutex_lock(&con->mutex); + if (con->state != CEPH_CON_S_V2_AUTH) { + dout("%s con %p state changed to %d\n", __func__, con, + con->state); + return -EAGAIN; + } + + dout("%s con %p handle_auth_bad_method ret %d\n", __func__, con, ret); + return ret; + +bad: + pr_err("failed to decode auth_bad_method\n"); + return -EINVAL; +} + +static int process_auth_reply_more(struct ceph_connection *con, + void *p, void *end) +{ + int payload_len; + int ret; + + if (con->state != CEPH_CON_S_V2_AUTH) { + con->error_msg = "protocol error, unexpected auth_reply_more"; + return -EINVAL; + } + + ceph_decode_32_safe(&p, end, payload_len, bad); + ceph_decode_need(&p, end, payload_len, bad); + + dout("%s con %p payload_len %d\n", __func__, con, payload_len); 
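The ceph_decode_*_safe() calls above share one bounds-check-then-consume shape: verify that enough bytes remain before reading, advance the cursor, and jump to the function's bad: label on short input. A minimal userspace sketch of that pattern, assuming nothing about the kernel helpers -- decode_le32_safe() is an illustrative stand-in, not the actual implementation:

#include <endian.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* bounds-check, consume, convert from wire (little-endian) order */
static bool decode_le32_safe(const uint8_t **p, const uint8_t *end,
			     uint32_t *val)
{
	uint32_t le;

	if (end - *p < (ptrdiff_t)sizeof(le))
		return false;	/* caller bails out, as with "goto bad" */

	memcpy(&le, *p, sizeof(le));
	*p += sizeof(le);
	*val = le32toh(le);
	return true;
}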
+ + reset_out_kvecs(con); + ret = prepare_auth_request_more(con, p, payload_len); + if (ret) { + if (ret != -EAGAIN) + pr_err("prepare_auth_request_more failed: %d\n", ret); + return ret; + } + + return 0; + +bad: + pr_err("failed to decode auth_reply_more\n"); + return -EINVAL; +} + +/* + * Align session_key and con_secret to avoid GFP_ATOMIC allocation + * inside crypto_shash_setkey() and crypto_aead_setkey() called from + * setup_crypto(). __aligned(16) isn't guaranteed to work for stack + * objects, so do it by hand. + */ +static int process_auth_done(struct ceph_connection *con, void *p, void *end) +{ + u8 session_key_buf[CEPH_KEY_LEN + 16]; + u8 con_secret_buf[CEPH_MAX_CON_SECRET_LEN + 16]; + u8 *session_key = PTR_ALIGN(&session_key_buf[0], 16); + u8 *con_secret = PTR_ALIGN(&con_secret_buf[0], 16); + int session_key_len, con_secret_len; + int payload_len; + u64 global_id; + int ret; + + if (con->state != CEPH_CON_S_V2_AUTH) { + con->error_msg = "protocol error, unexpected auth_done"; + return -EINVAL; + } + + ceph_decode_64_safe(&p, end, global_id, bad); + ceph_decode_32_safe(&p, end, con->v2.con_mode, bad); + ceph_decode_32_safe(&p, end, payload_len, bad); + + dout("%s con %p global_id %llu con_mode %d payload_len %d\n", + __func__, con, global_id, con->v2.con_mode, payload_len); + + mutex_unlock(&con->mutex); + session_key_len = 0; + con_secret_len = 0; + ret = con->ops->handle_auth_done(con, global_id, p, payload_len, + session_key, &session_key_len, + con_secret, &con_secret_len); + mutex_lock(&con->mutex); + if (con->state != CEPH_CON_S_V2_AUTH) { + dout("%s con %p state changed to %d\n", __func__, con, + con->state); + ret = -EAGAIN; + goto out; + } + + dout("%s con %p handle_auth_done ret %d\n", __func__, con, ret); + if (ret) + goto out; + + ret = setup_crypto(con, session_key, session_key_len, con_secret, + con_secret_len); + if (ret) + goto out; + + reset_out_kvecs(con); + ret = prepare_auth_signature(con); + if (ret) { + pr_err("prepare_auth_signature failed: %d\n", ret); + goto out; + } + + con->state = CEPH_CON_S_V2_AUTH_SIGNATURE; + +out: + memzero_explicit(session_key_buf, sizeof(session_key_buf)); + memzero_explicit(con_secret_buf, sizeof(con_secret_buf)); + return ret; + +bad: + pr_err("failed to decode auth_done\n"); + return -EINVAL; +} + +static int process_auth_signature(struct ceph_connection *con, + void *p, void *end) +{ + u8 hmac[SHA256_DIGEST_SIZE]; + int ret; + + if (con->state != CEPH_CON_S_V2_AUTH_SIGNATURE) { + con->error_msg = "protocol error, unexpected auth_signature"; + return -EINVAL; + } + + ret = hmac_sha256(con, con->v2.out_sign_kvecs, + con->v2.out_sign_kvec_cnt, hmac); + if (ret) + return ret; + + ceph_decode_need(&p, end, SHA256_DIGEST_SIZE, bad); + if (crypto_memneq(p, hmac, SHA256_DIGEST_SIZE)) { + con->error_msg = "integrity error, bad auth signature"; + return -EBADMSG; + } + + dout("%s con %p auth signature ok\n", __func__, con); + + /* no reset_out_kvecs() as our auth_signature may still be pending */ + if (!con->v2.server_cookie) { + ret = prepare_client_ident(con); + if (ret) { + pr_err("prepare_client_ident failed: %d\n", ret); + return ret; + } + + con->state = CEPH_CON_S_V2_SESSION_CONNECT; + } else { + ret = prepare_session_reconnect(con); + if (ret) { + pr_err("prepare_session_reconnect failed: %d\n", ret); + return ret; + } + + con->state = CEPH_CON_S_V2_SESSION_RECONNECT; + } + + return 0; + +bad: + pr_err("failed to decode auth_signature\n"); + return -EINVAL; +} + +static int process_server_ident(struct ceph_connection *con, + 
void *p, void *end) +{ + struct ceph_client *client = from_msgr(con->msgr); + u64 features, required_features; + struct ceph_entity_addr addr; + u64 global_seq; + u64 global_id; + u64 cookie; + u64 flags; + int ret; + + if (con->state != CEPH_CON_S_V2_SESSION_CONNECT) { + con->error_msg = "protocol error, unexpected server_ident"; + return -EINVAL; + } + + ret = ceph_decode_entity_addrvec(&p, end, true, &addr); + if (ret) { + pr_err("failed to decode server addrs: %d\n", ret); + return ret; + } + + ceph_decode_64_safe(&p, end, global_id, bad); + ceph_decode_64_safe(&p, end, global_seq, bad); + ceph_decode_64_safe(&p, end, features, bad); + ceph_decode_64_safe(&p, end, required_features, bad); + ceph_decode_64_safe(&p, end, flags, bad); + ceph_decode_64_safe(&p, end, cookie, bad); + + dout("%s con %p addr %s/%u global_id %llu global_seq %llu features 0x%llx required_features 0x%llx flags 0x%llx cookie 0x%llx\n", + __func__, con, ceph_pr_addr(&addr), le32_to_cpu(addr.nonce), + global_id, global_seq, features, required_features, flags, cookie); + + /* is this who we intended to talk to? */ + if (memcmp(&addr, &con->peer_addr, sizeof(con->peer_addr))) { + pr_err("bad peer addr/nonce, want %s/%u, got %s/%u\n", + ceph_pr_addr(&con->peer_addr), + le32_to_cpu(con->peer_addr.nonce), + ceph_pr_addr(&addr), le32_to_cpu(addr.nonce)); + con->error_msg = "wrong peer at address"; + return -EINVAL; + } + + if (client->required_features & ~features) { + pr_err("RADOS feature set mismatch: my required > server's supported 0x%llx, need 0x%llx\n", + features, client->required_features & ~features); + con->error_msg = "missing required protocol features"; + return -EINVAL; + } + + /* + * Both name->type and name->num are set in ceph_con_open() but + * name->num may be bogus in the initial monmap. name->type is + * verified in process_hello(). 
+ */ + WARN_ON(!con->peer_name.type); + con->peer_name.num = cpu_to_le64(global_id); + con->v2.peer_global_seq = global_seq; + con->peer_features = features; + WARN_ON(required_features & ~client->supported_features); + con->v2.server_cookie = cookie; + + if (flags & CEPH_MSG_CONNECT_LOSSY) { + ceph_con_flag_set(con, CEPH_CON_F_LOSSYTX); + WARN_ON(con->v2.server_cookie); + } else { + WARN_ON(!con->v2.server_cookie); + } + + clear_in_sign_kvecs(con); + clear_out_sign_kvecs(con); + free_conn_bufs(con); + con->delay = 0; /* reset backoff memory */ + + con->state = CEPH_CON_S_OPEN; + con->v2.out_state = OUT_S_GET_NEXT; + return 0; + +bad: + pr_err("failed to decode server_ident\n"); + return -EINVAL; +} + +static int process_ident_missing_features(struct ceph_connection *con, + void *p, void *end) +{ + struct ceph_client *client = from_msgr(con->msgr); + u64 missing_features; + + if (con->state != CEPH_CON_S_V2_SESSION_CONNECT) { + con->error_msg = "protocol error, unexpected ident_missing_features"; + return -EINVAL; + } + + ceph_decode_64_safe(&p, end, missing_features, bad); + pr_err("RADOS feature set mismatch: server's required > my supported 0x%llx, missing 0x%llx\n", + client->supported_features, missing_features); + con->error_msg = "missing required protocol features"; + return -EINVAL; + +bad: + pr_err("failed to decode ident_missing_features\n"); + return -EINVAL; +} + +static int process_session_reconnect_ok(struct ceph_connection *con, + void *p, void *end) +{ + u64 seq; + + if (con->state != CEPH_CON_S_V2_SESSION_RECONNECT) { + con->error_msg = "protocol error, unexpected session_reconnect_ok"; + return -EINVAL; + } + + ceph_decode_64_safe(&p, end, seq, bad); + + dout("%s con %p seq %llu\n", __func__, con, seq); + ceph_con_discard_requeued(con, seq); + + clear_in_sign_kvecs(con); + clear_out_sign_kvecs(con); + free_conn_bufs(con); + con->delay = 0; /* reset backoff memory */ + + con->state = CEPH_CON_S_OPEN; + con->v2.out_state = OUT_S_GET_NEXT; + return 0; + +bad: + pr_err("failed to decode session_reconnect_ok\n"); + return -EINVAL; +} + +static int process_session_retry(struct ceph_connection *con, + void *p, void *end) +{ + u64 connect_seq; + int ret; + + if (con->state != CEPH_CON_S_V2_SESSION_RECONNECT) { + con->error_msg = "protocol error, unexpected session_retry"; + return -EINVAL; + } + + ceph_decode_64_safe(&p, end, connect_seq, bad); + + dout("%s con %p connect_seq %llu\n", __func__, con, connect_seq); + WARN_ON(connect_seq <= con->v2.connect_seq); + con->v2.connect_seq = connect_seq + 1; + + free_conn_bufs(con); + + reset_out_kvecs(con); + ret = prepare_session_reconnect(con); + if (ret) { + pr_err("prepare_session_reconnect (cseq) failed: %d\n", ret); + return ret; + } + + return 0; + +bad: + pr_err("failed to decode session_retry\n"); + return -EINVAL; +} + +static int process_session_retry_global(struct ceph_connection *con, + void *p, void *end) +{ + u64 global_seq; + int ret; + + if (con->state != CEPH_CON_S_V2_SESSION_RECONNECT) { + con->error_msg = "protocol error, unexpected session_retry_global"; + return -EINVAL; + } + + ceph_decode_64_safe(&p, end, global_seq, bad); + + dout("%s con %p global_seq %llu\n", __func__, con, global_seq); + WARN_ON(global_seq <= con->v2.global_seq); + con->v2.global_seq = ceph_get_global_seq(con->msgr, global_seq); + + free_conn_bufs(con); + + reset_out_kvecs(con); + ret = prepare_session_reconnect(con); + if (ret) { + pr_err("prepare_session_reconnect (gseq) failed: %d\n", ret); + return ret; + } + + return 0; + +bad: + 
pr_err("failed to decode session_retry_global\n"); + return -EINVAL; +} + +static int process_session_reset(struct ceph_connection *con, + void *p, void *end) +{ + bool full; + int ret; + + if (con->state != CEPH_CON_S_V2_SESSION_RECONNECT) { + con->error_msg = "protocol error, unexpected session_reset"; + return -EINVAL; + } + + ceph_decode_8_safe(&p, end, full, bad); + if (!full) { + con->error_msg = "protocol error, bad session_reset"; + return -EINVAL; + } + + pr_info("%s%lld %s session reset\n", ENTITY_NAME(con->peer_name), + ceph_pr_addr(&con->peer_addr)); + ceph_con_reset_session(con); + + mutex_unlock(&con->mutex); + if (con->ops->peer_reset) + con->ops->peer_reset(con); + mutex_lock(&con->mutex); + if (con->state != CEPH_CON_S_V2_SESSION_RECONNECT) { + dout("%s con %p state changed to %d\n", __func__, con, + con->state); + return -EAGAIN; + } + + free_conn_bufs(con); + + reset_out_kvecs(con); + ret = prepare_client_ident(con); + if (ret) { + pr_err("prepare_client_ident (rst) failed: %d\n", ret); + return ret; + } + + con->state = CEPH_CON_S_V2_SESSION_CONNECT; + return 0; + +bad: + pr_err("failed to decode session_reset\n"); + return -EINVAL; +} + +static int process_keepalive2_ack(struct ceph_connection *con, + void *p, void *end) +{ + if (con->state != CEPH_CON_S_OPEN) { + con->error_msg = "protocol error, unexpected keepalive2_ack"; + return -EINVAL; + } + + ceph_decode_need(&p, end, sizeof(struct ceph_timespec), bad); + ceph_decode_timespec64(&con->last_keepalive_ack, p); + + dout("%s con %p timestamp %lld.%09ld\n", __func__, con, + con->last_keepalive_ack.tv_sec, con->last_keepalive_ack.tv_nsec); + + return 0; + +bad: + pr_err("failed to decode keepalive2_ack\n"); + return -EINVAL; +} + +static int process_ack(struct ceph_connection *con, void *p, void *end) +{ + u64 seq; + + if (con->state != CEPH_CON_S_OPEN) { + con->error_msg = "protocol error, unexpected ack"; + return -EINVAL; + } + + ceph_decode_64_safe(&p, end, seq, bad); + + dout("%s con %p seq %llu\n", __func__, con, seq); + ceph_con_discard_sent(con, seq); + return 0; + +bad: + pr_err("failed to decode ack\n"); + return -EINVAL; +} + +static int process_control(struct ceph_connection *con, void *p, void *end) +{ + int tag = con->v2.in_desc.fd_tag; + int ret; + + dout("%s con %p tag %d len %d\n", __func__, con, tag, (int)(end - p)); + + switch (tag) { + case FRAME_TAG_HELLO: + ret = process_hello(con, p, end); + break; + case FRAME_TAG_AUTH_BAD_METHOD: + ret = process_auth_bad_method(con, p, end); + break; + case FRAME_TAG_AUTH_REPLY_MORE: + ret = process_auth_reply_more(con, p, end); + break; + case FRAME_TAG_AUTH_DONE: + ret = process_auth_done(con, p, end); + break; + case FRAME_TAG_AUTH_SIGNATURE: + ret = process_auth_signature(con, p, end); + break; + case FRAME_TAG_SERVER_IDENT: + ret = process_server_ident(con, p, end); + break; + case FRAME_TAG_IDENT_MISSING_FEATURES: + ret = process_ident_missing_features(con, p, end); + break; + case FRAME_TAG_SESSION_RECONNECT_OK: + ret = process_session_reconnect_ok(con, p, end); + break; + case FRAME_TAG_SESSION_RETRY: + ret = process_session_retry(con, p, end); + break; + case FRAME_TAG_SESSION_RETRY_GLOBAL: + ret = process_session_retry_global(con, p, end); + break; + case FRAME_TAG_SESSION_RESET: + ret = process_session_reset(con, p, end); + break; + case FRAME_TAG_KEEPALIVE2_ACK: + ret = process_keepalive2_ack(con, p, end); + break; + case FRAME_TAG_ACK: + ret = process_ack(con, p, end); + break; + default: + pr_err("bad tag %d\n", tag); + con->error_msg = "protocol 
error, bad tag"; + return -EINVAL; + } + if (ret) { + dout("%s con %p error %d\n", __func__, con, ret); + return ret; + } + + prepare_read_preamble(con); + return 0; +} + +/* + * Return: + * 1 - con->in_msg set, read message + * 0 - skip message + * <0 - error + */ +static int process_message_header(struct ceph_connection *con, + void *p, void *end) +{ + struct ceph_frame_desc *desc = &con->v2.in_desc; + struct ceph_msg_header2 *hdr2 = p; + struct ceph_msg_header hdr; + int skip; + int ret; + u64 seq; + + /* verify seq# */ + seq = le64_to_cpu(hdr2->seq); + if ((s64)seq - (s64)con->in_seq < 1) { + pr_info("%s%lld %s skipping old message: seq %llu, expected %llu\n", + ENTITY_NAME(con->peer_name), + ceph_pr_addr(&con->peer_addr), + seq, con->in_seq + 1); + return 0; + } + if ((s64)seq - (s64)con->in_seq > 1) { + pr_err("bad seq %llu, expected %llu\n", seq, con->in_seq + 1); + con->error_msg = "bad message sequence # for incoming message"; + return -EBADE; + } + + ceph_con_discard_sent(con, le64_to_cpu(hdr2->ack_seq)); + + fill_header(&hdr, hdr2, desc->fd_lens[1], desc->fd_lens[2], + desc->fd_lens[3], &con->peer_name); + ret = ceph_con_in_msg_alloc(con, &hdr, &skip); + if (ret) + return ret; + + WARN_ON(!con->in_msg ^ skip); + if (skip) + return 0; + + WARN_ON(!con->in_msg); + WARN_ON(con->in_msg->con != con); + return 1; +} + +static int process_message(struct ceph_connection *con) +{ + ceph_con_process_message(con); + + /* + * We could have been closed by ceph_con_close() because + * ceph_con_process_message() temporarily drops con->mutex. + */ + if (con->state != CEPH_CON_S_OPEN) { + dout("%s con %p state changed to %d\n", __func__, con, + con->state); + return -EAGAIN; + } + + prepare_read_preamble(con); + return 0; +} + +static int __handle_control(struct ceph_connection *con, void *p) +{ + void *end = p + con->v2.in_desc.fd_lens[0]; + struct ceph_msg *msg; + int ret; + + if (con->v2.in_desc.fd_tag != FRAME_TAG_MESSAGE) + return process_control(con, p, end); + + ret = process_message_header(con, p, end); + if (ret < 0) + return ret; + if (ret == 0) { + prepare_skip_message(con); + return 0; + } + + msg = con->in_msg; /* set in process_message_header() */ + if (!front_len(msg) && !middle_len(msg)) { + if (!data_len(msg)) + return process_message(con); + + prepare_read_data(con); + return 0; + } + + reset_in_kvecs(con); + if (front_len(msg)) { + WARN_ON(front_len(msg) > msg->front_alloc_len); + add_in_kvec(con, msg->front.iov_base, front_len(msg)); + msg->front.iov_len = front_len(msg); + + if (con_secure(con) && need_padding(front_len(msg))) + add_in_kvec(con, FRONT_PAD(con->v2.in_buf), + padding_len(front_len(msg))); + } else { + msg->front.iov_len = 0; + } + if (middle_len(msg)) { + WARN_ON(middle_len(msg) > msg->middle->alloc_len); + add_in_kvec(con, msg->middle->vec.iov_base, middle_len(msg)); + msg->middle->vec.iov_len = middle_len(msg); + + if (con_secure(con) && need_padding(middle_len(msg))) + add_in_kvec(con, MIDDLE_PAD(con->v2.in_buf), + padding_len(middle_len(msg))); + } else if (msg->middle) { + msg->middle->vec.iov_len = 0; + } + + if (data_len(msg)) { + con->v2.in_state = IN_S_PREPARE_READ_DATA; + } else { + add_in_kvec(con, con->v2.in_buf, + con_secure(con) ? 
CEPH_EPILOGUE_SECURE_LEN : + CEPH_EPILOGUE_PLAIN_LEN); + con->v2.in_state = IN_S_HANDLE_EPILOGUE; + } + return 0; +} + +static int handle_preamble(struct ceph_connection *con) +{ + struct ceph_frame_desc *desc = &con->v2.in_desc; + int ret; + + if (con_secure(con)) { + ret = decrypt_preamble(con); + if (ret) { + if (ret == -EBADMSG) + con->error_msg = "integrity error, bad preamble auth tag"; + return ret; + } + } + + ret = decode_preamble(con->v2.in_buf, desc); + if (ret) { + if (ret == -EBADMSG) + con->error_msg = "integrity error, bad crc"; + else + con->error_msg = "protocol error, bad preamble"; + return ret; + } + + dout("%s con %p tag %d seg_cnt %d %d+%d+%d+%d\n", __func__, + con, desc->fd_tag, desc->fd_seg_cnt, desc->fd_lens[0], + desc->fd_lens[1], desc->fd_lens[2], desc->fd_lens[3]); + + if (!con_secure(con)) + return prepare_read_control(con); + + if (desc->fd_lens[0] > CEPH_PREAMBLE_INLINE_LEN) + return prepare_read_control_remainder(con); + + return __handle_control(con, CTRL_BODY(con->v2.in_buf)); +} + +static int handle_control(struct ceph_connection *con) +{ + int ctrl_len = con->v2.in_desc.fd_lens[0]; + void *buf; + int ret; + + WARN_ON(con_secure(con)); + + ret = verify_control_crc(con); + if (ret) { + con->error_msg = "integrity error, bad crc"; + return ret; + } + + if (con->state == CEPH_CON_S_V2_AUTH) { + buf = alloc_conn_buf(con, ctrl_len); + if (!buf) + return -ENOMEM; + + memcpy(buf, con->v2.in_kvecs[0].iov_base, ctrl_len); + return __handle_control(con, buf); + } + + return __handle_control(con, con->v2.in_kvecs[0].iov_base); +} + +static int handle_control_remainder(struct ceph_connection *con) +{ + int ret; + + WARN_ON(!con_secure(con)); + + ret = decrypt_control_remainder(con); + if (ret) { + if (ret == -EBADMSG) + con->error_msg = "integrity error, bad control remainder auth tag"; + return ret; + } + + return __handle_control(con, con->v2.in_kvecs[0].iov_base - + CEPH_PREAMBLE_INLINE_LEN); +} + +static int handle_epilogue(struct ceph_connection *con) +{ + u32 front_crc, middle_crc, data_crc; + int ret; + + if (con_secure(con)) { + ret = decrypt_message(con); + if (ret) { + if (ret == -EBADMSG) + con->error_msg = "integrity error, bad epilogue auth tag"; + return ret; + } + + /* just late_status */ + ret = decode_epilogue(con->v2.in_buf, NULL, NULL, NULL); + if (ret) { + con->error_msg = "protocol error, bad epilogue"; + return ret; + } + } else { + ret = decode_epilogue(con->v2.in_buf, &front_crc, + &middle_crc, &data_crc); + if (ret) { + con->error_msg = "protocol error, bad epilogue"; + return ret; + } + + ret = verify_epilogue_crcs(con, front_crc, middle_crc, + data_crc); + if (ret) { + con->error_msg = "integrity error, bad crc"; + return ret; + } + } + + return process_message(con); +} + +static void finish_skip(struct ceph_connection *con) +{ + dout("%s con %p\n", __func__, con); + + if (con_secure(con)) + gcm_inc_nonce(&con->v2.in_gcm_nonce); + + __finish_skip(con); +} + +static int populate_in_iter(struct ceph_connection *con) +{ + int ret; + + dout("%s con %p state %d in_state %d\n", __func__, con, con->state, + con->v2.in_state); + WARN_ON(iov_iter_count(&con->v2.in_iter)); + + if (con->state == CEPH_CON_S_V2_BANNER_PREFIX) { + ret = process_banner_prefix(con); + } else if (con->state == CEPH_CON_S_V2_BANNER_PAYLOAD) { + ret = process_banner_payload(con); + } else if ((con->state >= CEPH_CON_S_V2_HELLO && + con->state <= CEPH_CON_S_V2_SESSION_RECONNECT) || + con->state == CEPH_CON_S_OPEN) { + switch (con->v2.in_state) { + case IN_S_HANDLE_PREAMBLE: + 
ret = handle_preamble(con); + break; + case IN_S_HANDLE_CONTROL: + ret = handle_control(con); + break; + case IN_S_HANDLE_CONTROL_REMAINDER: + ret = handle_control_remainder(con); + break; + case IN_S_PREPARE_READ_DATA: + prepare_read_data(con); + ret = 0; + break; + case IN_S_PREPARE_READ_DATA_CONT: + prepare_read_data_cont(con); + ret = 0; + break; + case IN_S_HANDLE_EPILOGUE: + ret = handle_epilogue(con); + break; + case IN_S_FINISH_SKIP: + finish_skip(con); + ret = 0; + break; + default: + WARN(1, "bad in_state %d", con->v2.in_state); + return -EINVAL; + } + } else { + WARN(1, "bad state %d", con->state); + return -EINVAL; + } + if (ret) { + dout("%s con %p error %d\n", __func__, con, ret); + return ret; + } + + if (WARN_ON(!iov_iter_count(&con->v2.in_iter))) + return -ENODATA; + dout("%s con %p populated %zu\n", __func__, con, + iov_iter_count(&con->v2.in_iter)); + return 1; +} + +int ceph_con_v2_try_read(struct ceph_connection *con) +{ + int ret; + + dout("%s con %p state %d need %zu\n", __func__, con, con->state, + iov_iter_count(&con->v2.in_iter)); + + if (con->state == CEPH_CON_S_PREOPEN) + return 0; + + /* + * We should always have something pending here. If not, + * avoid calling populate_in_iter() as if we read something + * (ceph_tcp_recv() would immediately return 1). + */ + if (WARN_ON(!iov_iter_count(&con->v2.in_iter))) + return -ENODATA; + + for (;;) { + ret = ceph_tcp_recv(con); + if (ret <= 0) + return ret; + + ret = populate_in_iter(con); + if (ret <= 0) { + if (ret && ret != -EAGAIN && !con->error_msg) + con->error_msg = "read processing error"; + return ret; + } + } +} + +static void queue_data(struct ceph_connection *con) +{ + struct bio_vec bv; + + con->v2.out_epil.data_crc = -1; + ceph_msg_data_cursor_init(&con->v2.out_cursor, con->out_msg, + data_len(con->out_msg)); + + get_bvec_at(&con->v2.out_cursor, &bv); + set_out_bvec(con, &bv, true); + con->v2.out_state = OUT_S_QUEUE_DATA_CONT; +} + +static void queue_data_cont(struct ceph_connection *con) +{ + struct bio_vec bv; + + con->v2.out_epil.data_crc = ceph_crc32c_page( + con->v2.out_epil.data_crc, con->v2.out_bvec.bv_page, + con->v2.out_bvec.bv_offset, con->v2.out_bvec.bv_len); + + ceph_msg_data_advance(&con->v2.out_cursor, con->v2.out_bvec.bv_len); + if (con->v2.out_cursor.total_resid) { + get_bvec_at(&con->v2.out_cursor, &bv); + set_out_bvec(con, &bv, true); + WARN_ON(con->v2.out_state != OUT_S_QUEUE_DATA_CONT); + return; + } + + /* + * We've written all data. Queue epilogue. Once it's written, + * we are done. + */ + reset_out_kvecs(con); + prepare_epilogue_plain(con, false); + con->v2.out_state = OUT_S_FINISH_MESSAGE; +} + +static void queue_enc_page(struct ceph_connection *con) +{ + struct bio_vec bv; + + dout("%s con %p i %d resid %d\n", __func__, con, con->v2.out_enc_i, + con->v2.out_enc_resid); + WARN_ON(!con->v2.out_enc_resid); + + bv.bv_page = con->v2.out_enc_pages[con->v2.out_enc_i]; + bv.bv_offset = 0; + bv.bv_len = min(con->v2.out_enc_resid, (int)PAGE_SIZE); + + set_out_bvec(con, &bv, false); + con->v2.out_enc_i++; + con->v2.out_enc_resid -= bv.bv_len; + + if (con->v2.out_enc_resid) { + WARN_ON(con->v2.out_state != OUT_S_QUEUE_ENC_PAGE); + return; + } + + /* + * We've queued the last piece of ciphertext (ending with + * epilogue) + auth tag. Once it's written, we are done. 
+ */ + WARN_ON(con->v2.out_enc_i != con->v2.out_enc_page_cnt); + con->v2.out_state = OUT_S_FINISH_MESSAGE; +} + +static void queue_zeros(struct ceph_connection *con) +{ + dout("%s con %p out_zero %d\n", __func__, con, con->v2.out_zero); + + if (con->v2.out_zero) { + set_out_bvec_zero(con); + con->v2.out_zero -= con->v2.out_bvec.bv_len; + con->v2.out_state = OUT_S_QUEUE_ZEROS; + return; + } + + /* + * We've zero-filled everything up to epilogue. Queue epilogue + * with late_status set to ABORTED and crcs adjusted for zeros. + * Once it's written, we are done patching up for the revoke. + */ + reset_out_kvecs(con); + prepare_epilogue_plain(con, true); + con->v2.out_state = OUT_S_FINISH_MESSAGE; +} + +static void finish_message(struct ceph_connection *con) +{ + dout("%s con %p msg %p\n", __func__, con, con->out_msg); + + /* we end up here both plain and secure modes */ + if (con->v2.out_enc_pages) { + WARN_ON(!con->v2.out_enc_page_cnt); + ceph_release_page_vector(con->v2.out_enc_pages, + con->v2.out_enc_page_cnt); + con->v2.out_enc_pages = NULL; + con->v2.out_enc_page_cnt = 0; + } + /* message may have been revoked */ + if (con->out_msg) { + ceph_msg_put(con->out_msg); + con->out_msg = NULL; + } + + con->v2.out_state = OUT_S_GET_NEXT; +} + +static int populate_out_iter(struct ceph_connection *con) +{ + int ret; + + dout("%s con %p state %d out_state %d\n", __func__, con, con->state, + con->v2.out_state); + WARN_ON(iov_iter_count(&con->v2.out_iter)); + + if (con->state != CEPH_CON_S_OPEN) { + WARN_ON(con->state < CEPH_CON_S_V2_BANNER_PREFIX || + con->state > CEPH_CON_S_V2_SESSION_RECONNECT); + goto nothing_pending; + } + + switch (con->v2.out_state) { + case OUT_S_QUEUE_DATA: + WARN_ON(!con->out_msg); + queue_data(con); + goto populated; + case OUT_S_QUEUE_DATA_CONT: + WARN_ON(!con->out_msg); + queue_data_cont(con); + goto populated; + case OUT_S_QUEUE_ENC_PAGE: + queue_enc_page(con); + goto populated; + case OUT_S_QUEUE_ZEROS: + WARN_ON(con->out_msg); /* revoked */ + queue_zeros(con); + goto populated; + case OUT_S_FINISH_MESSAGE: + finish_message(con); + break; + case OUT_S_GET_NEXT: + break; + default: + WARN(1, "bad out_state %d", con->v2.out_state); + return -EINVAL; + } + + WARN_ON(con->v2.out_state != OUT_S_GET_NEXT); + if (ceph_con_flag_test_and_clear(con, CEPH_CON_F_KEEPALIVE_PENDING)) { + ret = prepare_keepalive2(con); + if (ret) { + pr_err("prepare_keepalive2 failed: %d\n", ret); + return ret; + } + } else if (!list_empty(&con->out_queue)) { + ceph_con_get_out_msg(con); + ret = prepare_message(con); + if (ret) { + pr_err("prepare_message failed: %d\n", ret); + return ret; + } + } else if (con->in_seq > con->in_seq_acked) { + ret = prepare_ack(con); + if (ret) { + pr_err("prepare_ack failed: %d\n", ret); + return ret; + } + } else { + goto nothing_pending; + } + +populated: + if (WARN_ON(!iov_iter_count(&con->v2.out_iter))) + return -ENODATA; + dout("%s con %p populated %zu\n", __func__, con, + iov_iter_count(&con->v2.out_iter)); + return 1; + +nothing_pending: + WARN_ON(iov_iter_count(&con->v2.out_iter)); + dout("%s con %p nothing pending\n", __func__, con); + ceph_con_flag_clear(con, CEPH_CON_F_WRITE_PENDING); + return 0; +} + +int ceph_con_v2_try_write(struct ceph_connection *con) +{ + int ret; + + dout("%s con %p state %d have %zu\n", __func__, con, con->state, + iov_iter_count(&con->v2.out_iter)); + + /* open the socket first? */ + if (con->state == CEPH_CON_S_PREOPEN) { + WARN_ON(con->peer_addr.type != CEPH_ENTITY_ADDR_TYPE_MSGR2); + + /* + * Always bump global_seq. 
Bump connect_seq only if + * there is a session (i.e. we are reconnecting and will + * send session_reconnect instead of client_ident). + */ + con->v2.global_seq = ceph_get_global_seq(con->msgr, 0); + if (con->v2.server_cookie) + con->v2.connect_seq++; + + ret = prepare_read_banner_prefix(con); + if (ret) { + pr_err("prepare_read_banner_prefix failed: %d\n", ret); + con->error_msg = "connect error"; + return ret; + } + + reset_out_kvecs(con); + ret = prepare_banner(con); + if (ret) { + pr_err("prepare_banner failed: %d\n", ret); + con->error_msg = "connect error"; + return ret; + } + + ret = ceph_tcp_connect(con); + if (ret) { + pr_err("ceph_tcp_connect failed: %d\n", ret); + con->error_msg = "connect error"; + return ret; + } + } + + if (!iov_iter_count(&con->v2.out_iter)) { + ret = populate_out_iter(con); + if (ret <= 0) { + if (ret && ret != -EAGAIN && !con->error_msg) + con->error_msg = "write processing error"; + return ret; + } + } + + tcp_sock_set_cork(con->sock->sk, true); + for (;;) { + ret = ceph_tcp_send(con); + if (ret <= 0) + break; + + ret = populate_out_iter(con); + if (ret <= 0) { + if (ret && ret != -EAGAIN && !con->error_msg) + con->error_msg = "write processing error"; + break; + } + } + + tcp_sock_set_cork(con->sock->sk, false); + return ret; +} + +static u32 crc32c_zeros(u32 crc, int zero_len) +{ + int len; + + while (zero_len) { + len = min(zero_len, (int)PAGE_SIZE); + crc = crc32c(crc, page_address(ceph_zero_page), len); + zero_len -= len; + } + + return crc; +} + +static void prepare_zero_front(struct ceph_connection *con, int resid) +{ + int sent; + + WARN_ON(!resid || resid > front_len(con->out_msg)); + sent = front_len(con->out_msg) - resid; + dout("%s con %p sent %d resid %d\n", __func__, con, sent, resid); + + if (sent) { + con->v2.out_epil.front_crc = + crc32c(-1, con->out_msg->front.iov_base, sent); + con->v2.out_epil.front_crc = + crc32c_zeros(con->v2.out_epil.front_crc, resid); + } else { + con->v2.out_epil.front_crc = crc32c_zeros(-1, resid); + } + + con->v2.out_iter.count -= resid; + out_zero_add(con, resid); +} + +static void prepare_zero_middle(struct ceph_connection *con, int resid) +{ + int sent; + + WARN_ON(!resid || resid > middle_len(con->out_msg)); + sent = middle_len(con->out_msg) - resid; + dout("%s con %p sent %d resid %d\n", __func__, con, sent, resid); + + if (sent) { + con->v2.out_epil.middle_crc = + crc32c(-1, con->out_msg->middle->vec.iov_base, sent); + con->v2.out_epil.middle_crc = + crc32c_zeros(con->v2.out_epil.middle_crc, resid); + } else { + con->v2.out_epil.middle_crc = crc32c_zeros(-1, resid); + } + + con->v2.out_iter.count -= resid; + out_zero_add(con, resid); +} + +static void prepare_zero_data(struct ceph_connection *con) +{ + dout("%s con %p\n", __func__, con); + con->v2.out_epil.data_crc = crc32c_zeros(-1, data_len(con->out_msg)); + out_zero_add(con, data_len(con->out_msg)); +} + +static void revoke_at_queue_data(struct ceph_connection *con) +{ + int boundary; + int resid; + + WARN_ON(!data_len(con->out_msg)); + WARN_ON(!iov_iter_is_kvec(&con->v2.out_iter)); + resid = iov_iter_count(&con->v2.out_iter); + + boundary = front_len(con->out_msg) + middle_len(con->out_msg); + if (resid > boundary) { + resid -= boundary; + WARN_ON(resid > MESSAGE_HEAD_PLAIN_LEN); + dout("%s con %p was sending head\n", __func__, con); + if (front_len(con->out_msg)) + prepare_zero_front(con, front_len(con->out_msg)); + if (middle_len(con->out_msg)) + prepare_zero_middle(con, middle_len(con->out_msg)); + prepare_zero_data(con); + 
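crc32c_zeros() and the prepare_zero_*() helpers above rely on crc32c() being streamable: continuing the crc of the already-sent bytes over the remaining zeros yields the crc of the whole zero-filled segment, which is what the receiver will verify. A small userspace check of that property; crc32c_sw() is a plain bitwise CRC32C stand-in for the kernel's crc32c():

#include <assert.h>
#include <stdint.h>

/* reflected CRC32C (Castagnoli polynomial), same convention as the
 * kernel's crc32c(): seed passed in explicitly, no final inversion */
static uint32_t crc32c_sw(uint32_t crc, const uint8_t *p, int len)
{
	while (len--) {
		crc ^= *p++;
		for (int k = 0; k < 8; k++)
			crc = (crc >> 1) ^ (0x82F63B78 & -(crc & 1));
	}
	return crc;
}

int main(void)
{
	uint8_t seg[16] = { 1, 2, 3, 4, 5 };	/* bytes 5..15 stay zero */
	uint32_t crc;

	/* crc of the five sent bytes, continued over eleven zeros ... */
	crc = crc32c_sw(crc32c_sw(-1, seg, 5), seg + 5, 11);

	/* ... equals the crc of the full zero-filled segment */
	assert(crc == crc32c_sw(-1, seg, 16));
	return 0;
}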
WARN_ON(iov_iter_count(&con->v2.out_iter) != resid); + con->v2.out_state = OUT_S_QUEUE_ZEROS; + return; + } + + boundary = middle_len(con->out_msg); + if (resid > boundary) { + resid -= boundary; + dout("%s con %p was sending front\n", __func__, con); + prepare_zero_front(con, resid); + if (middle_len(con->out_msg)) + prepare_zero_middle(con, middle_len(con->out_msg)); + prepare_zero_data(con); + queue_zeros(con); + return; + } + + WARN_ON(!resid); + dout("%s con %p was sending middle\n", __func__, con); + prepare_zero_middle(con, resid); + prepare_zero_data(con); + queue_zeros(con); +} + +static void revoke_at_queue_data_cont(struct ceph_connection *con) +{ + int sent, resid; /* current piece of data */ + + WARN_ON(!data_len(con->out_msg)); + WARN_ON(!iov_iter_is_bvec(&con->v2.out_iter)); + resid = iov_iter_count(&con->v2.out_iter); + WARN_ON(!resid || resid > con->v2.out_bvec.bv_len); + sent = con->v2.out_bvec.bv_len - resid; + dout("%s con %p sent %d resid %d\n", __func__, con, sent, resid); + + if (sent) { + con->v2.out_epil.data_crc = ceph_crc32c_page( + con->v2.out_epil.data_crc, con->v2.out_bvec.bv_page, + con->v2.out_bvec.bv_offset, sent); + ceph_msg_data_advance(&con->v2.out_cursor, sent); + } + WARN_ON(resid > con->v2.out_cursor.total_resid); + con->v2.out_epil.data_crc = crc32c_zeros(con->v2.out_epil.data_crc, + con->v2.out_cursor.total_resid); + + con->v2.out_iter.count -= resid; + out_zero_add(con, con->v2.out_cursor.total_resid); + queue_zeros(con); +} + +static void revoke_at_finish_message(struct ceph_connection *con) +{ + int boundary; + int resid; + + WARN_ON(!iov_iter_is_kvec(&con->v2.out_iter)); + resid = iov_iter_count(&con->v2.out_iter); + + if (!front_len(con->out_msg) && !middle_len(con->out_msg) && + !data_len(con->out_msg)) { + WARN_ON(!resid || resid > MESSAGE_HEAD_PLAIN_LEN); + dout("%s con %p was sending head (empty message) - noop\n", + __func__, con); + return; + } + + boundary = front_len(con->out_msg) + middle_len(con->out_msg) + + CEPH_EPILOGUE_PLAIN_LEN; + if (resid > boundary) { + resid -= boundary; + WARN_ON(resid > MESSAGE_HEAD_PLAIN_LEN); + dout("%s con %p was sending head\n", __func__, con); + if (front_len(con->out_msg)) + prepare_zero_front(con, front_len(con->out_msg)); + if (middle_len(con->out_msg)) + prepare_zero_middle(con, middle_len(con->out_msg)); + con->v2.out_iter.count -= CEPH_EPILOGUE_PLAIN_LEN; + WARN_ON(iov_iter_count(&con->v2.out_iter) != resid); + con->v2.out_state = OUT_S_QUEUE_ZEROS; + return; + } + + boundary = middle_len(con->out_msg) + CEPH_EPILOGUE_PLAIN_LEN; + if (resid > boundary) { + resid -= boundary; + dout("%s con %p was sending front\n", __func__, con); + prepare_zero_front(con, resid); + if (middle_len(con->out_msg)) + prepare_zero_middle(con, middle_len(con->out_msg)); + con->v2.out_iter.count -= CEPH_EPILOGUE_PLAIN_LEN; + queue_zeros(con); + return; + } + + boundary = CEPH_EPILOGUE_PLAIN_LEN; + if (resid > boundary) { + resid -= boundary; + dout("%s con %p was sending middle\n", __func__, con); + prepare_zero_middle(con, resid); + con->v2.out_iter.count -= CEPH_EPILOGUE_PLAIN_LEN; + queue_zeros(con); + return; + } + + WARN_ON(!resid); + dout("%s con %p was sending epilogue - noop\n", __func__, con); +} + +void ceph_con_v2_revoke(struct ceph_connection *con) +{ + WARN_ON(con->v2.out_zero); + + if (con_secure(con)) { + WARN_ON(con->v2.out_state != OUT_S_QUEUE_ENC_PAGE && + con->v2.out_state != OUT_S_FINISH_MESSAGE); + dout("%s con %p secure - noop\n", __func__, con); + return; + } + + switch (con->v2.out_state) { + 
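+	/*
+	 * Plain mode: patch up what has already been queued in place.
+	 * Which helper runs depends on how far into the message the
+	 * write sequence got before the revoke.
+	 */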
case OUT_S_QUEUE_DATA: + revoke_at_queue_data(con); + break; + case OUT_S_QUEUE_DATA_CONT: + revoke_at_queue_data_cont(con); + break; + case OUT_S_FINISH_MESSAGE: + revoke_at_finish_message(con); + break; + default: + WARN(1, "bad out_state %d", con->v2.out_state); + break; + } +} + +static void revoke_at_prepare_read_data(struct ceph_connection *con) +{ + int remaining; /* data + [data padding] + epilogue */ + int resid; + + WARN_ON(!data_len(con->in_msg)); + WARN_ON(!iov_iter_is_kvec(&con->v2.in_iter)); + resid = iov_iter_count(&con->v2.in_iter); + WARN_ON(!resid); + + if (con_secure(con)) + remaining = padded_len(data_len(con->in_msg)) + + CEPH_EPILOGUE_SECURE_LEN; + else + remaining = data_len(con->in_msg) + CEPH_EPILOGUE_PLAIN_LEN; + + dout("%s con %p resid %d remaining %d\n", __func__, con, resid, + remaining); + con->v2.in_iter.count -= resid; + set_in_skip(con, resid + remaining); + con->v2.in_state = IN_S_FINISH_SKIP; +} + +static void revoke_at_prepare_read_data_cont(struct ceph_connection *con) +{ + int recved, resid; /* current piece of data */ + int remaining; /* [data padding] + epilogue */ + + WARN_ON(!data_len(con->in_msg)); + WARN_ON(!iov_iter_is_bvec(&con->v2.in_iter)); + resid = iov_iter_count(&con->v2.in_iter); + WARN_ON(!resid || resid > con->v2.in_bvec.bv_len); + recved = con->v2.in_bvec.bv_len - resid; + dout("%s con %p recved %d resid %d\n", __func__, con, recved, resid); + + if (recved) + ceph_msg_data_advance(&con->v2.in_cursor, recved); + WARN_ON(resid > con->v2.in_cursor.total_resid); + + if (con_secure(con)) + remaining = padding_len(data_len(con->in_msg)) + + CEPH_EPILOGUE_SECURE_LEN; + else + remaining = CEPH_EPILOGUE_PLAIN_LEN; + + dout("%s con %p total_resid %zu remaining %d\n", __func__, con, + con->v2.in_cursor.total_resid, remaining); + con->v2.in_iter.count -= resid; + set_in_skip(con, con->v2.in_cursor.total_resid + remaining); + con->v2.in_state = IN_S_FINISH_SKIP; +} + +static void revoke_at_handle_epilogue(struct ceph_connection *con) +{ + int resid; + + WARN_ON(!iov_iter_is_kvec(&con->v2.in_iter)); + resid = iov_iter_count(&con->v2.in_iter); + WARN_ON(!resid); + + dout("%s con %p resid %d\n", __func__, con, resid); + con->v2.in_iter.count -= resid; + set_in_skip(con, resid); + con->v2.in_state = IN_S_FINISH_SKIP; +} + +void ceph_con_v2_revoke_incoming(struct ceph_connection *con) +{ + switch (con->v2.in_state) { + case IN_S_PREPARE_READ_DATA: + revoke_at_prepare_read_data(con); + break; + case IN_S_PREPARE_READ_DATA_CONT: + revoke_at_prepare_read_data_cont(con); + break; + case IN_S_HANDLE_EPILOGUE: + revoke_at_handle_epilogue(con); + break; + default: + WARN(1, "bad in_state %d", con->v2.in_state); + break; + } +} + +bool ceph_con_v2_opened(struct ceph_connection *con) +{ + return con->v2.peer_global_seq; +} + +void ceph_con_v2_reset_session(struct ceph_connection *con) +{ + con->v2.client_cookie = 0; + con->v2.server_cookie = 0; + con->v2.global_seq = 0; + con->v2.connect_seq = 0; + con->v2.peer_global_seq = 0; +} + +void ceph_con_v2_reset_protocol(struct ceph_connection *con) +{ + iov_iter_truncate(&con->v2.in_iter, 0); + iov_iter_truncate(&con->v2.out_iter, 0); + con->v2.out_zero = 0; + + clear_in_sign_kvecs(con); + clear_out_sign_kvecs(con); + free_conn_bufs(con); + + if (con->v2.out_enc_pages) { + WARN_ON(!con->v2.out_enc_page_cnt); + ceph_release_page_vector(con->v2.out_enc_pages, + con->v2.out_enc_page_cnt); + con->v2.out_enc_pages = NULL; + con->v2.out_enc_page_cnt = 0; + } + + con->v2.con_mode = CEPH_CON_MODE_UNKNOWN; + 
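+	/*
+	 * Wipe the GCM nonces and drop all crypto state so that nothing
+	 * keyed to the old session can be reused on a reconnect.
+	 */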
memzero_explicit(&con->v2.in_gcm_nonce, CEPH_GCM_IV_LEN); + memzero_explicit(&con->v2.out_gcm_nonce, CEPH_GCM_IV_LEN); + + if (con->v2.hmac_tfm) { + crypto_free_shash(con->v2.hmac_tfm); + con->v2.hmac_tfm = NULL; + } + if (con->v2.gcm_req) { + aead_request_free(con->v2.gcm_req); + con->v2.gcm_req = NULL; + } + if (con->v2.gcm_tfm) { + crypto_free_aead(con->v2.gcm_tfm); + con->v2.gcm_tfm = NULL; + } +} diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c index c4cf2529d08b..195ceb8afb06 100644 --- a/net/ceph/mon_client.c +++ b/net/ceph/mon_client.c @@ -36,57 +36,122 @@ static const struct ceph_connection_operations mon_con_ops; static int __validate_auth(struct ceph_mon_client *monc); +static int decode_mon_info(void **p, void *end, bool msgr2, + struct ceph_entity_addr *addr) +{ + void *mon_info_end; + u32 struct_len; + u8 struct_v; + int ret; + + ret = ceph_start_decoding(p, end, 1, "mon_info_t", &struct_v, + &struct_len); + if (ret) + return ret; + + mon_info_end = *p + struct_len; + ceph_decode_skip_string(p, end, e_inval); /* skip mon name */ + ret = ceph_decode_entity_addrvec(p, end, msgr2, addr); + if (ret) + return ret; + + *p = mon_info_end; + return 0; + +e_inval: + return -EINVAL; +} + /* * Decode a monmap blob (e.g., during mount). + * + * Assume MonMap v3 (i.e. encoding with MONNAMES and MONENC). */ -static struct ceph_monmap *ceph_monmap_decode(void *p, void *end) +static struct ceph_monmap *ceph_monmap_decode(void **p, void *end, bool msgr2) { - struct ceph_monmap *m = NULL; - int i, err = -EINVAL; + struct ceph_monmap *monmap = NULL; struct ceph_fsid fsid; - u32 epoch, num_mon; - u32 len; + u32 struct_len; + int blob_len; + int num_mon; + u8 struct_v; + u32 epoch; + int ret; + int i; + + ceph_decode_32_safe(p, end, blob_len, e_inval); + ceph_decode_need(p, end, blob_len, e_inval); + + ret = ceph_start_decoding(p, end, 6, "monmap", &struct_v, &struct_len); + if (ret) + goto fail; - ceph_decode_32_safe(&p, end, len, bad); - ceph_decode_need(&p, end, len, bad); + dout("%s struct_v %d\n", __func__, struct_v); + ceph_decode_copy_safe(p, end, &fsid, sizeof(fsid), e_inval); + ceph_decode_32_safe(p, end, epoch, e_inval); + if (struct_v >= 6) { + u32 feat_struct_len; + u8 feat_struct_v; - dout("monmap_decode %p %p len %d (%d)\n", p, end, len, (int)(end-p)); - p += sizeof(u16); /* skip version */ + *p += sizeof(struct ceph_timespec); /* skip last_changed */ + *p += sizeof(struct ceph_timespec); /* skip created */ - ceph_decode_need(&p, end, sizeof(fsid) + 2*sizeof(u32), bad); - ceph_decode_copy(&p, &fsid, sizeof(fsid)); - epoch = ceph_decode_32(&p); + ret = ceph_start_decoding(p, end, 1, "mon_feature_t", + &feat_struct_v, &feat_struct_len); + if (ret) + goto fail; - num_mon = ceph_decode_32(&p); + *p += feat_struct_len; /* skip persistent_features */ + + ret = ceph_start_decoding(p, end, 1, "mon_feature_t", + &feat_struct_v, &feat_struct_len); + if (ret) + goto fail; + + *p += feat_struct_len; /* skip optional_features */ + } + ceph_decode_32_safe(p, end, num_mon, e_inval); + dout("%s fsid %pU epoch %u num_mon %d\n", __func__, &fsid, epoch, + num_mon); if (num_mon > CEPH_MAX_MON) - goto bad; - m = kmalloc(struct_size(m, mon_inst, num_mon), GFP_NOFS); - if (m == NULL) - return ERR_PTR(-ENOMEM); - m->fsid = fsid; - m->epoch = epoch; - m->num_mon = num_mon; - for (i = 0; i < num_mon; ++i) { - struct ceph_entity_inst *inst = &m->mon_inst[i]; - - /* copy name portion */ - ceph_decode_copy_safe(&p, end, &inst->name, - sizeof(inst->name), bad); - err = ceph_decode_entity_addr(&p, end, 
&inst->addr); - if (err) - goto bad; + goto e_inval; + + monmap = kmalloc(struct_size(monmap, mon_inst, num_mon), GFP_NOIO); + if (!monmap) { + ret = -ENOMEM; + goto fail; } - dout("monmap_decode epoch %d, num_mon %d\n", m->epoch, - m->num_mon); - for (i = 0; i < m->num_mon; i++) - dout("monmap_decode mon%d is %s\n", i, - ceph_pr_addr(&m->mon_inst[i].addr)); - return m; -bad: - dout("monmap_decode failed with %d\n", err); - kfree(m); - return ERR_PTR(err); + monmap->fsid = fsid; + monmap->epoch = epoch; + monmap->num_mon = num_mon; + + /* legacy_mon_addr map or mon_info map */ + for (i = 0; i < num_mon; i++) { + struct ceph_entity_inst *inst = &monmap->mon_inst[i]; + + ceph_decode_skip_string(p, end, e_inval); /* skip mon name */ + inst->name.type = CEPH_ENTITY_TYPE_MON; + inst->name.num = cpu_to_le64(i); + + if (struct_v >= 6) + ret = decode_mon_info(p, end, msgr2, &inst->addr); + else + ret = ceph_decode_entity_addr(p, end, &inst->addr); + if (ret) + goto fail; + + dout("%s mon%d addr %s\n", __func__, i, + ceph_pr_addr(&inst->addr)); + } + + return monmap; + +e_inval: + ret = -EINVAL; +fail: + kfree(monmap); + return ERR_PTR(ret); } /* @@ -96,9 +161,11 @@ int ceph_monmap_contains(struct ceph_monmap *m, struct ceph_entity_addr *addr) { int i; - for (i = 0; i < m->num_mon; i++) - if (memcmp(addr, &m->mon_inst[i].addr, sizeof(*addr)) == 0) + for (i = 0; i < m->num_mon; i++) { + if (ceph_addr_equal_no_type(addr, &m->mon_inst[i].addr)) return 1; + } + return 0; } @@ -190,10 +257,16 @@ static void __open_session(struct ceph_mon_client *monc) &monc->monmap->mon_inst[monc->cur_mon].addr); /* - * send an initial keepalive to ensure our timestamp is valid - * by the time we are in an OPENED state + * Queue a keepalive to ensure that in case of an early fault + * the messenger doesn't put us into STANDBY state and instead + * retries. This also ensures that our timestamp is valid by + * the time we finish hunting and delayed_work() checks it. */ ceph_con_keepalive(&monc->con); + if (ceph_msgr2(monc->client)) { + monc->pending_auth = 1; + return; + } /* initiate authentication handshake */ ret = ceph_auth_build_hello(monc->auth, @@ -476,7 +549,7 @@ static void ceph_monc_handle_map(struct ceph_mon_client *monc, p = msg->front.iov_base; end = p + msg->front.iov_len; - monmap = ceph_monmap_decode(p, end); + monmap = ceph_monmap_decode(&p, end, ceph_msgr2(client)); if (IS_ERR(monmap)) { pr_err("problem decoding monmap, %d\n", (int)PTR_ERR(monmap)); @@ -1052,8 +1125,9 @@ static void delayed_work(struct work_struct *work) */ static int build_initial_monmap(struct ceph_mon_client *monc) { + __le32 my_type = ceph_msgr2(monc->client) ? 
+ CEPH_ENTITY_ADDR_TYPE_MSGR2 : CEPH_ENTITY_ADDR_TYPE_LEGACY; struct ceph_options *opt = monc->client->options; - struct ceph_entity_addr *mon_addr = opt->mon_addr; int num_mon = opt->num_mon; int i; @@ -1062,12 +1136,16 @@ static int build_initial_monmap(struct ceph_mon_client *monc) GFP_KERNEL); if (!monc->monmap) return -ENOMEM; + for (i = 0; i < num_mon; i++) { - monc->monmap->mon_inst[i].addr = mon_addr[i]; - monc->monmap->mon_inst[i].addr.nonce = 0; - monc->monmap->mon_inst[i].name.type = - CEPH_ENTITY_TYPE_MON; - monc->monmap->mon_inst[i].name.num = cpu_to_le64(i); + struct ceph_entity_inst *inst = &monc->monmap->mon_inst[i]; + + memcpy(&inst->addr.in_addr, &opt->mon_addr[i].in_addr, + sizeof(inst->addr.in_addr)); + inst->addr.type = my_type; + inst->addr.nonce = 0; + inst->name.type = CEPH_ENTITY_TYPE_MON; + inst->name.num = cpu_to_le64(i); } monc->monmap->num_mon = num_mon; return 0; @@ -1089,8 +1167,8 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl) /* connection */ /* authentication */ - monc->auth = ceph_auth_init(cl->options->name, - cl->options->key); + monc->auth = ceph_auth_init(cl->options->name, cl->options->key, + cl->options->con_modes); if (IS_ERR(monc->auth)) { err = PTR_ERR(monc->auth); goto out_monmap; @@ -1194,30 +1272,22 @@ static void finish_hunting(struct ceph_mon_client *monc) } } -static void handle_auth_reply(struct ceph_mon_client *monc, - struct ceph_msg *msg) +static void finish_auth(struct ceph_mon_client *monc, int auth_err, + bool was_authed) { - int ret; - int was_auth = 0; + dout("%s auth_err %d was_authed %d\n", __func__, auth_err, was_authed); + WARN_ON(auth_err > 0); - mutex_lock(&monc->mutex); - was_auth = ceph_auth_is_authenticated(monc->auth); monc->pending_auth = 0; - ret = ceph_handle_auth_reply(monc->auth, msg->front.iov_base, - msg->front.iov_len, - monc->m_auth->front.iov_base, - monc->m_auth->front_alloc_len); - if (ret > 0) { - __send_prepared_auth_request(monc, ret); - goto out; + if (auth_err) { + monc->client->auth_err = auth_err; + wake_up_all(&monc->client->auth_wq); + return; } - finish_hunting(monc); - - if (ret < 0) { - monc->client->auth_err = ret; - } else if (!was_auth && ceph_auth_is_authenticated(monc->auth)) { - dout("authenticated, starting session\n"); + if (!was_authed && ceph_auth_is_authenticated(monc->auth)) { + dout("%s authenticated, starting session global_id %llu\n", + __func__, monc->auth->global_id); monc->client->msgr.inst.name.type = CEPH_ENTITY_TYPE_CLIENT; monc->client->msgr.inst.name.num = @@ -1229,11 +1299,27 @@ static void handle_auth_reply(struct ceph_mon_client *monc, pr_info("mon%d %s session established\n", monc->cur_mon, ceph_pr_addr(&monc->con.peer_addr)); } +} -out: +static void handle_auth_reply(struct ceph_mon_client *monc, + struct ceph_msg *msg) +{ + bool was_authed; + int ret; + + mutex_lock(&monc->mutex); + was_authed = ceph_auth_is_authenticated(monc->auth); + ret = ceph_handle_auth_reply(monc->auth, msg->front.iov_base, + msg->front.iov_len, + monc->m_auth->front.iov_base, + monc->m_auth->front_alloc_len); + if (ret > 0) { + __send_prepared_auth_request(monc, ret); + } else { + finish_auth(monc, ret, was_authed); + finish_hunting(monc); + } mutex_unlock(&monc->mutex); - if (monc->client->auth_err < 0) - wake_up_all(&monc->client->auth_wq); } static int __validate_auth(struct ceph_mon_client *monc) @@ -1262,10 +1348,92 @@ int ceph_monc_validate_auth(struct ceph_mon_client *monc) } EXPORT_SYMBOL(ceph_monc_validate_auth); +static int mon_get_auth_request(struct 
ceph_connection *con, + void *buf, int *buf_len, + void **authorizer, int *authorizer_len) +{ + struct ceph_mon_client *monc = con->private; + int ret; + + mutex_lock(&monc->mutex); + ret = ceph_auth_get_request(monc->auth, buf, *buf_len); + mutex_unlock(&monc->mutex); + if (ret < 0) + return ret; + + *buf_len = ret; + *authorizer = NULL; + *authorizer_len = 0; + return 0; +} + +static int mon_handle_auth_reply_more(struct ceph_connection *con, + void *reply, int reply_len, + void *buf, int *buf_len, + void **authorizer, int *authorizer_len) +{ + struct ceph_mon_client *monc = con->private; + int ret; + + mutex_lock(&monc->mutex); + ret = ceph_auth_handle_reply_more(monc->auth, reply, reply_len, + buf, *buf_len); + mutex_unlock(&monc->mutex); + if (ret < 0) + return ret; + + *buf_len = ret; + *authorizer = NULL; + *authorizer_len = 0; + return 0; +} + +static int mon_handle_auth_done(struct ceph_connection *con, + u64 global_id, void *reply, int reply_len, + u8 *session_key, int *session_key_len, + u8 *con_secret, int *con_secret_len) +{ + struct ceph_mon_client *monc = con->private; + bool was_authed; + int ret; + + mutex_lock(&monc->mutex); + WARN_ON(!monc->hunting); + was_authed = ceph_auth_is_authenticated(monc->auth); + ret = ceph_auth_handle_reply_done(monc->auth, global_id, + reply, reply_len, + session_key, session_key_len, + con_secret, con_secret_len); + finish_auth(monc, ret, was_authed); + if (!ret) + finish_hunting(monc); + mutex_unlock(&monc->mutex); + return 0; +} + +static int mon_handle_auth_bad_method(struct ceph_connection *con, + int used_proto, int result, + const int *allowed_protos, int proto_cnt, + const int *allowed_modes, int mode_cnt) +{ + struct ceph_mon_client *monc = con->private; + bool was_authed; + + mutex_lock(&monc->mutex); + WARN_ON(!monc->hunting); + was_authed = ceph_auth_is_authenticated(monc->auth); + ceph_auth_handle_bad_method(monc->auth, used_proto, result, + allowed_protos, proto_cnt, + allowed_modes, mode_cnt); + finish_auth(monc, -EACCES, was_authed); + mutex_unlock(&monc->mutex); + return 0; +} + /* * handle incoming message */ -static void dispatch(struct ceph_connection *con, struct ceph_msg *msg) +static void mon_dispatch(struct ceph_connection *con, struct ceph_msg *msg) { struct ceph_mon_client *monc = con->private; int type = le16_to_cpu(msg->hdr.type); @@ -1397,19 +1565,23 @@ static void mon_fault(struct ceph_connection *con) * will come from the messenger workqueue, which is drained prior to * mon_client destruction. 
*/ -static struct ceph_connection *con_get(struct ceph_connection *con) +static struct ceph_connection *mon_get_con(struct ceph_connection *con) { return con; } -static void con_put(struct ceph_connection *con) +static void mon_put_con(struct ceph_connection *con) { } static const struct ceph_connection_operations mon_con_ops = { - .get = con_get, - .put = con_put, - .dispatch = dispatch, - .fault = mon_fault, + .get = mon_get_con, + .put = mon_put_con, .alloc_msg = mon_alloc_msg, + .dispatch = mon_dispatch, + .fault = mon_fault, + .get_auth_request = mon_get_auth_request, + .handle_auth_reply_more = mon_handle_auth_reply_more, + .handle_auth_done = mon_handle_auth_done, + .handle_auth_bad_method = mon_handle_auth_bad_method, }; diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 7901ab6c79fd..ff8624a7c964 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -3918,9 +3918,11 @@ static int handle_one_map(struct ceph_osd_client *osdc, set_pool_was_full(osdc); if (incremental) - newmap = osdmap_apply_incremental(&p, end, osdc->osdmap); + newmap = osdmap_apply_incremental(&p, end, + ceph_msgr2(osdc->client), + osdc->osdmap); else - newmap = ceph_osdmap_decode(&p, end); + newmap = ceph_osdmap_decode(&p, end, ceph_msgr2(osdc->client)); if (IS_ERR(newmap)) return PTR_ERR(newmap); @@ -5410,7 +5412,7 @@ void ceph_osdc_cleanup(void) /* * handle incoming message */ -static void dispatch(struct ceph_connection *con, struct ceph_msg *msg) +static void osd_dispatch(struct ceph_connection *con, struct ceph_msg *msg) { struct ceph_osd *osd = con->private; struct ceph_osd_client *osdc = osd->o_osdc; @@ -5532,9 +5534,9 @@ static struct ceph_msg *alloc_msg_with_page_vector(struct ceph_msg_header *hdr) return m; } -static struct ceph_msg *alloc_msg(struct ceph_connection *con, - struct ceph_msg_header *hdr, - int *skip) +static struct ceph_msg *osd_alloc_msg(struct ceph_connection *con, + struct ceph_msg_header *hdr, + int *skip) { struct ceph_osd *osd = con->private; int type = le16_to_cpu(hdr->type); @@ -5558,7 +5560,7 @@ static struct ceph_msg *alloc_msg(struct ceph_connection *con, /* * Wrappers to refcount containing ceph_osd struct */ -static struct ceph_connection *get_osd_con(struct ceph_connection *con) +static struct ceph_connection *osd_get_con(struct ceph_connection *con) { struct ceph_osd *osd = con->private; if (get_osd(osd)) @@ -5566,7 +5568,7 @@ static struct ceph_connection *get_osd_con(struct ceph_connection *con) return NULL; } -static void put_osd_con(struct ceph_connection *con) +static void osd_put_con(struct ceph_connection *con) { struct ceph_osd *osd = con->private; put_osd(osd); @@ -5575,39 +5577,29 @@ static void put_osd_con(struct ceph_connection *con) /* * authentication */ + /* * Note: returned pointer is the address of a structure that's * managed separately. Caller must *not* attempt to free it. 
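 *
 * This is the legacy msgr1 authorizer path; msgr2 connections use the
 * get_auth_request/handle_auth_done callbacks added further below.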
*/ -static struct ceph_auth_handshake *get_authorizer(struct ceph_connection *con, - int *proto, int force_new) +static struct ceph_auth_handshake * +osd_get_authorizer(struct ceph_connection *con, int *proto, int force_new) { struct ceph_osd *o = con->private; struct ceph_osd_client *osdc = o->o_osdc; struct ceph_auth_client *ac = osdc->client->monc.auth; struct ceph_auth_handshake *auth = &o->o_auth; + int ret; - if (force_new && auth->authorizer) { - ceph_auth_destroy_authorizer(auth->authorizer); - auth->authorizer = NULL; - } - if (!auth->authorizer) { - int ret = ceph_auth_create_authorizer(ac, CEPH_ENTITY_TYPE_OSD, - auth); - if (ret) - return ERR_PTR(ret); - } else { - int ret = ceph_auth_update_authorizer(ac, CEPH_ENTITY_TYPE_OSD, - auth); - if (ret) - return ERR_PTR(ret); - } - *proto = ac->protocol; + ret = __ceph_auth_get_authorizer(ac, auth, CEPH_ENTITY_TYPE_OSD, + force_new, proto, NULL, NULL); + if (ret) + return ERR_PTR(ret); return auth; } -static int add_authorizer_challenge(struct ceph_connection *con, +static int osd_add_authorizer_challenge(struct ceph_connection *con, void *challenge_buf, int challenge_buf_len) { struct ceph_osd *o = con->private; @@ -5618,16 +5610,19 @@ static int add_authorizer_challenge(struct ceph_connection *con, challenge_buf, challenge_buf_len); } -static int verify_authorizer_reply(struct ceph_connection *con) +static int osd_verify_authorizer_reply(struct ceph_connection *con) { struct ceph_osd *o = con->private; struct ceph_osd_client *osdc = o->o_osdc; struct ceph_auth_client *ac = osdc->client->monc.auth; + struct ceph_auth_handshake *auth = &o->o_auth; - return ceph_auth_verify_authorizer_reply(ac, o->o_auth.authorizer); + return ceph_auth_verify_authorizer_reply(ac, auth->authorizer, + auth->authorizer_reply_buf, auth->authorizer_reply_buf_len, + NULL, NULL, NULL, NULL); } -static int invalidate_authorizer(struct ceph_connection *con) +static int osd_invalidate_authorizer(struct ceph_connection *con) { struct ceph_osd *o = con->private; struct ceph_osd_client *osdc = o->o_osdc; @@ -5637,6 +5632,80 @@ static int invalidate_authorizer(struct ceph_connection *con) return ceph_monc_validate_auth(&osdc->client->monc); } +static int osd_get_auth_request(struct ceph_connection *con, + void *buf, int *buf_len, + void **authorizer, int *authorizer_len) +{ + struct ceph_osd *o = con->private; + struct ceph_auth_client *ac = o->o_osdc->client->monc.auth; + struct ceph_auth_handshake *auth = &o->o_auth; + int ret; + + ret = ceph_auth_get_authorizer(ac, auth, CEPH_ENTITY_TYPE_OSD, + buf, buf_len); + if (ret) + return ret; + + *authorizer = auth->authorizer_buf; + *authorizer_len = auth->authorizer_buf_len; + return 0; +} + +static int osd_handle_auth_reply_more(struct ceph_connection *con, + void *reply, int reply_len, + void *buf, int *buf_len, + void **authorizer, int *authorizer_len) +{ + struct ceph_osd *o = con->private; + struct ceph_auth_client *ac = o->o_osdc->client->monc.auth; + struct ceph_auth_handshake *auth = &o->o_auth; + int ret; + + ret = ceph_auth_handle_svc_reply_more(ac, auth, reply, reply_len, + buf, buf_len); + if (ret) + return ret; + + *authorizer = auth->authorizer_buf; + *authorizer_len = auth->authorizer_buf_len; + return 0; +} + +static int osd_handle_auth_done(struct ceph_connection *con, + u64 global_id, void *reply, int reply_len, + u8 *session_key, int *session_key_len, + u8 *con_secret, int *con_secret_len) +{ + struct ceph_osd *o = con->private; + struct ceph_auth_client *ac = o->o_osdc->client->monc.auth; + struct 
ceph_auth_handshake *auth = &o->o_auth; + + return ceph_auth_handle_svc_reply_done(ac, auth, reply, reply_len, + session_key, session_key_len, + con_secret, con_secret_len); +} + +static int osd_handle_auth_bad_method(struct ceph_connection *con, + int used_proto, int result, + const int *allowed_protos, int proto_cnt, + const int *allowed_modes, int mode_cnt) +{ + struct ceph_osd *o = con->private; + struct ceph_mon_client *monc = &o->o_osdc->client->monc; + int ret; + + if (ceph_auth_handle_bad_authorizer(monc->auth, CEPH_ENTITY_TYPE_OSD, + used_proto, result, + allowed_protos, proto_cnt, + allowed_modes, mode_cnt)) { + ret = ceph_monc_validate_auth(monc); + if (ret) + return ret; + } + + return -EACCES; +} + static void osd_reencode_message(struct ceph_msg *msg) { int type = le16_to_cpu(msg->hdr.type); @@ -5662,16 +5731,20 @@ static int osd_check_message_signature(struct ceph_msg *msg) } static const struct ceph_connection_operations osd_con_ops = { - .get = get_osd_con, - .put = put_osd_con, - .dispatch = dispatch, - .get_authorizer = get_authorizer, - .add_authorizer_challenge = add_authorizer_challenge, - .verify_authorizer_reply = verify_authorizer_reply, - .invalidate_authorizer = invalidate_authorizer, - .alloc_msg = alloc_msg, + .get = osd_get_con, + .put = osd_put_con, + .alloc_msg = osd_alloc_msg, + .dispatch = osd_dispatch, + .fault = osd_fault, .reencode_message = osd_reencode_message, + .get_authorizer = osd_get_authorizer, + .add_authorizer_challenge = osd_add_authorizer_challenge, + .verify_authorizer_reply = osd_verify_authorizer_reply, + .invalidate_authorizer = osd_invalidate_authorizer, .sign_message = osd_sign_message, .check_message_signature = osd_check_message_signature, - .fault = osd_fault, + .get_auth_request = osd_get_auth_request, + .handle_auth_reply_more = osd_handle_auth_reply_more, + .handle_auth_done = osd_handle_auth_done, + .handle_auth_bad_method = osd_handle_auth_bad_method, }; diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index fa08c15be0c0..2b1dd252f231 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -1647,7 +1647,8 @@ static int decode_old_pg_upmap_items(void **p, void *end, /* * decode a full map. */ -static int osdmap_decode(void **p, void *end, struct ceph_osdmap *map) +static int osdmap_decode(void **p, void *end, bool msgr2, + struct ceph_osdmap *map) { u8 struct_v; u32 epoch = 0; @@ -1718,9 +1719,16 @@ static int osdmap_decode(void **p, void *end, struct ceph_osdmap *map) goto e_inval; for (i = 0; i < map->max_osd; i++) { - err = ceph_decode_entity_addr(p, end, &map->osd_addr[i]); + struct ceph_entity_addr *addr = &map->osd_addr[i]; + + if (struct_v >= 8) + err = ceph_decode_entity_addrvec(p, end, msgr2, addr); + else + err = ceph_decode_entity_addr(p, end, addr); if (err) goto bad; + + dout("%s osd%d addr %s\n", __func__, i, ceph_pr_addr(addr)); } /* pg_temp */ @@ -1790,7 +1798,7 @@ bad: /* * Allocate and decode a full map. 
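 *
 * Callers now indicate whether msgr2-style addresses are preferred
 * when entity addrvecs are decoded, e.g. (as in handle_one_map()
 * earlier):
 *
 *	newmap = ceph_osdmap_decode(&p, end, ceph_msgr2(osdc->client));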
*/ -struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end) +struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end, bool msgr2) { struct ceph_osdmap *map; int ret; @@ -1799,7 +1807,7 @@ struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end) if (!map) return ERR_PTR(-ENOMEM); - ret = osdmap_decode(p, end, map); + ret = osdmap_decode(p, end, msgr2, map); if (ret) { ceph_osdmap_destroy(map); return ERR_PTR(ret); @@ -1817,12 +1825,13 @@ struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end) * new_state: { osd=6, xorstate=EXISTS } # clear osd_state */ static int decode_new_up_state_weight(void **p, void *end, u8 struct_v, - struct ceph_osdmap *map) + bool msgr2, struct ceph_osdmap *map) { void *new_up_client; void *new_state; void *new_weight_end; u32 len; + int ret; int i; new_up_client = *p; @@ -1831,8 +1840,12 @@ static int decode_new_up_state_weight(void **p, void *end, u8 struct_v, struct ceph_entity_addr addr; ceph_decode_skip_32(p, end, e_inval); - if (ceph_decode_entity_addr(p, end, &addr)) - goto e_inval; + if (struct_v >= 7) + ret = ceph_decode_entity_addrvec(p, end, msgr2, &addr); + else + ret = ceph_decode_entity_addr(p, end, &addr); + if (ret) + return ret; } new_state = *p; @@ -1874,7 +1887,6 @@ static int decode_new_up_state_weight(void **p, void *end, u8 struct_v, while (len--) { s32 osd; u32 xorstate; - int ret; osd = ceph_decode_32(p); if (struct_v >= 5) @@ -1910,8 +1922,15 @@ static int decode_new_up_state_weight(void **p, void *end, u8 struct_v, osd = ceph_decode_32(p); BUG_ON(osd >= map->max_osd); - if (ceph_decode_entity_addr(p, end, &addr)) - goto e_inval; + if (struct_v >= 7) + ret = ceph_decode_entity_addrvec(p, end, msgr2, &addr); + else + ret = ceph_decode_entity_addr(p, end, &addr); + if (ret) + return ret; + + dout("%s osd%d addr %s\n", __func__, osd, ceph_pr_addr(&addr)); + pr_info("osd%d up\n", osd); map->osd_state[osd] |= CEPH_OSD_EXISTS | CEPH_OSD_UP; map->osd_addr[osd] = addr; @@ -1927,7 +1946,7 @@ e_inval: /* * decode and apply an incremental map update. */ -struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, +struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, bool msgr2, struct ceph_osdmap *map) { struct ceph_fsid fsid; @@ -1962,7 +1981,7 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, if (len > 0) { dout("apply_incremental full map len %d, %p to %p\n", len, *p, end); - return ceph_osdmap_decode(p, min(*p+len, end)); + return ceph_osdmap_decode(p, min(*p+len, end), msgr2); } /* new crush? */ @@ -2014,7 +2033,7 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, } /* new_up_client, new_state, new_weight */ - err = decode_new_up_state_weight(p, end, struct_v, map); + err = decode_new_up_state_weight(p, end, struct_v, msgr2, map); if (err) goto bad; diff --git a/net/core/dev.c b/net/core/dev.c index a46334906c94..a979b86dbacd 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1457,6 +1457,25 @@ void netdev_state_change(struct net_device *dev) EXPORT_SYMBOL(netdev_state_change); /** + * __netdev_notify_peers - notify network peers about existence of @dev, + * to be called when rtnl lock is already held. + * @dev: network device + * + * Generate traffic such that interested network peers are aware of + * @dev, such as by generating a gratuitous ARP. This may be used when + * a device wants to inform the rest of the network about some sort of + * reconfiguration such as a failover event or virtual machine + * migration. 
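+ *
+ * A minimal sketch of the intended calling pattern (illustrative):
+ *
+ *	ASSERT_RTNL();
+ *	__netdev_notify_peers(dev);
+ *
+ * Callers that do not already hold the lock should keep using
+ * netdev_notify_peers(), which is now a rtnl_lock()/rtnl_unlock()
+ * wrapper around this helper.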
+ */ +void __netdev_notify_peers(struct net_device *dev) +{ + ASSERT_RTNL(); + call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev); + call_netdevice_notifiers(NETDEV_RESEND_IGMP, dev); +} +EXPORT_SYMBOL(__netdev_notify_peers); + +/** * netdev_notify_peers - notify network peers about existence of @dev * @dev: network device * @@ -1469,8 +1488,7 @@ EXPORT_SYMBOL(netdev_state_change); void netdev_notify_peers(struct net_device *dev) { rtnl_lock(); - call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev); - call_netdevice_notifiers(NETDEV_RESEND_IGMP, dev); + __netdev_notify_peers(dev); rtnl_unlock(); } EXPORT_SYMBOL(netdev_notify_peers); @@ -9643,9 +9661,20 @@ static netdev_features_t netdev_fix_features(struct net_device *dev, } } - if ((features & NETIF_F_HW_TLS_TX) && !(features & NETIF_F_HW_CSUM)) { - netdev_dbg(dev, "Dropping TLS TX HW offload feature since no CSUM feature.\n"); - features &= ~NETIF_F_HW_TLS_TX; + if (features & NETIF_F_HW_TLS_TX) { + bool ip_csum = (features & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM)) == + (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM); + bool hw_csum = features & NETIF_F_HW_CSUM; + + if (!ip_csum && !hw_csum) { + netdev_dbg(dev, "Dropping TLS TX HW offload feature since no CSUM feature.\n"); + features &= ~NETIF_F_HW_TLS_TX; + } + } + + if ((features & NETIF_F_HW_TLS_RX) && !(features & NETIF_F_RXCSUM)) { + netdev_dbg(dev, "Dropping TLS RX HW offload feature since no RXCSUM feature.\n"); + features &= ~NETIF_F_HW_TLS_RX; } return features; @@ -10059,17 +10088,11 @@ int register_netdevice(struct net_device *dev) ret = call_netdevice_notifiers(NETDEV_REGISTER, dev); ret = notifier_to_errno(ret); if (ret) { + /* Expect explicit free_netdev() on failure */ + dev->needs_free_netdev = false; rollback_registered(dev); - rcu_barrier(); - - dev->reg_state = NETREG_UNREGISTERED; - /* We should put the kobject that hold in - * netdev_unregister_kobject(), otherwise - * the net device cannot be freed when - * driver calls free_netdev(), because the - * kobject is being hold. - */ - kobject_put(&dev->dev.kobj); + net_set_todo(dev); + goto out; } /* * Prevent userspace races by waiting until the network @@ -10613,6 +10636,17 @@ void free_netdev(struct net_device *dev) struct napi_struct *p, *n; might_sleep(); + + /* When called immediately after register_netdevice() failed the unwind + * handling may still be dismantling the device. Handle that case by + * deferring the free. 
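+	 *
+	 * This is what lets error paths simply do (illustrative):
+	 *
+	 *	err = register_netdevice(dev);
+	 *	if (err)
+	 *		free_netdev(dev);
+	 *
+	 * without checking reg_state themselves: if the unwind is still
+	 * in flight, the device is only marked here and the actual free
+	 * happens once unregistration completes.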
+ */ + if (dev->reg_state == NETREG_UNREGISTERING) { + ASSERT_RTNL(); + dev->needs_free_netdev = true; + return; + } + netif_free_tx_queues(dev); netif_free_rx_queues(dev); diff --git a/net/core/devlink.c b/net/core/devlink.c index ee828e4b1007..738d4344d679 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -4146,7 +4146,7 @@ out: static int devlink_nl_cmd_port_param_get_doit(struct sk_buff *skb, struct genl_info *info) { - struct devlink_port *devlink_port = info->user_ptr[0]; + struct devlink_port *devlink_port = info->user_ptr[1]; struct devlink_param_item *param_item; struct sk_buff *msg; int err; @@ -4175,7 +4175,7 @@ static int devlink_nl_cmd_port_param_get_doit(struct sk_buff *skb, static int devlink_nl_cmd_port_param_set_doit(struct sk_buff *skb, struct genl_info *info) { - struct devlink_port *devlink_port = info->user_ptr[0]; + struct devlink_port *devlink_port = info->user_ptr[1]; return __devlink_nl_cmd_param_set_doit(devlink_port->devlink, devlink_port->index, diff --git a/net/core/gen_estimator.c b/net/core/gen_estimator.c index 80dbf2f4016e..8e582e29a41e 100644 --- a/net/core/gen_estimator.c +++ b/net/core/gen_estimator.c @@ -80,11 +80,11 @@ static void est_timer(struct timer_list *t) u64 rate, brate; est_fetch_counters(est, &b); - brate = (b.bytes - est->last_bytes) << (10 - est->ewma_log - est->intvl_log); - brate -= (est->avbps >> est->ewma_log); + brate = (b.bytes - est->last_bytes) << (10 - est->intvl_log); + brate = (brate >> est->ewma_log) - (est->avbps >> est->ewma_log); - rate = (b.packets - est->last_packets) << (10 - est->ewma_log - est->intvl_log); - rate -= (est->avpps >> est->ewma_log); + rate = (b.packets - est->last_packets) << (10 - est->intvl_log); + rate = (rate >> est->ewma_log) - (est->avpps >> est->ewma_log); write_seqcount_begin(&est->seq); est->avbps += brate; @@ -143,6 +143,9 @@ int gen_new_estimator(struct gnet_stats_basic_packed *bstats, if (parm->interval < -2 || parm->interval > 3) return -EINVAL; + if (parm->ewma_log == 0 || parm->ewma_log >= 31) + return -EINVAL; + est = kzalloc(sizeof(*est), GFP_KERNEL); if (!est) return -ENOBUFS; diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 9500d28a43b0..277ed854aef1 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -1569,10 +1569,8 @@ static void neigh_proxy_process(struct timer_list *t) void pneigh_enqueue(struct neigh_table *tbl, struct neigh_parms *p, struct sk_buff *skb) { - unsigned long now = jiffies; - - unsigned long sched_next = now + (prandom_u32() % - NEIGH_VAR(p, PROXY_DELAY)); + unsigned long sched_next = jiffies + + prandom_u32_max(NEIGH_VAR(p, PROXY_DELAY)); if (tbl->proxy_queue.qlen > NEIGH_VAR(p, PROXY_QLEN)) { kfree_skb(skb); diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 999b70c59761..daf502c13d6d 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -1317,8 +1317,8 @@ static const struct attribute_group dql_group = { static ssize_t xps_cpus_show(struct netdev_queue *queue, char *buf) { + int cpu, len, ret, num_tc = 1, tc = 0; struct net_device *dev = queue->dev; - int cpu, len, num_tc = 1, tc = 0; struct xps_dev_maps *dev_maps; cpumask_var_t mask; unsigned long index; @@ -1328,22 +1328,31 @@ static ssize_t xps_cpus_show(struct netdev_queue *queue, index = get_netdev_queue_index(queue); + if (!rtnl_trylock()) + return restart_syscall(); + if (dev->num_tc) { /* Do not allow XPS on subordinate device directly */ num_tc = dev->num_tc; - if (num_tc < 0) - return -EINVAL; + if (num_tc < 0) { + ret = -EINVAL; + goto 
err_rtnl_unlock; + } /* If queue belongs to subordinate dev use its map */ dev = netdev_get_tx_queue(dev, index)->sb_dev ? : dev; tc = netdev_txq_to_tc(dev, index); - if (tc < 0) - return -EINVAL; + if (tc < 0) { + ret = -EINVAL; + goto err_rtnl_unlock; + } } - if (!zalloc_cpumask_var(&mask, GFP_KERNEL)) - return -ENOMEM; + if (!zalloc_cpumask_var(&mask, GFP_KERNEL)) { + ret = -ENOMEM; + goto err_rtnl_unlock; + } rcu_read_lock(); dev_maps = rcu_dereference(dev->xps_cpus_map); @@ -1366,9 +1375,15 @@ static ssize_t xps_cpus_show(struct netdev_queue *queue, } rcu_read_unlock(); + rtnl_unlock(); + len = snprintf(buf, PAGE_SIZE, "%*pb\n", cpumask_pr_args(mask)); free_cpumask_var(mask); return len < PAGE_SIZE ? len : -EINVAL; + +err_rtnl_unlock: + rtnl_unlock(); + return ret; } static ssize_t xps_cpus_store(struct netdev_queue *queue, @@ -1396,7 +1411,13 @@ static ssize_t xps_cpus_store(struct netdev_queue *queue, return err; } + if (!rtnl_trylock()) { + free_cpumask_var(mask); + return restart_syscall(); + } + err = netif_set_xps_queue(dev, mask, index); + rtnl_unlock(); free_cpumask_var(mask); @@ -1408,22 +1429,29 @@ static struct netdev_queue_attribute xps_cpus_attribute __ro_after_init static ssize_t xps_rxqs_show(struct netdev_queue *queue, char *buf) { + int j, len, ret, num_tc = 1, tc = 0; struct net_device *dev = queue->dev; struct xps_dev_maps *dev_maps; unsigned long *mask, index; - int j, len, num_tc = 1, tc = 0; index = get_netdev_queue_index(queue); + if (!rtnl_trylock()) + return restart_syscall(); + if (dev->num_tc) { num_tc = dev->num_tc; tc = netdev_txq_to_tc(dev, index); - if (tc < 0) - return -EINVAL; + if (tc < 0) { + ret = -EINVAL; + goto err_rtnl_unlock; + } } mask = bitmap_zalloc(dev->num_rx_queues, GFP_KERNEL); - if (!mask) - return -ENOMEM; + if (!mask) { + ret = -ENOMEM; + goto err_rtnl_unlock; + } rcu_read_lock(); dev_maps = rcu_dereference(dev->xps_rxqs_map); @@ -1449,10 +1477,16 @@ static ssize_t xps_rxqs_show(struct netdev_queue *queue, char *buf) out_no_maps: rcu_read_unlock(); + rtnl_unlock(); + len = bitmap_print_to_pagebuf(false, buf, mask, dev->num_rx_queues); bitmap_free(mask); return len < PAGE_SIZE ? len : -EINVAL; + +err_rtnl_unlock: + rtnl_unlock(); + return ret; } static ssize_t xps_rxqs_store(struct netdev_queue *queue, const char *buf, @@ -1478,10 +1512,17 @@ static ssize_t xps_rxqs_store(struct netdev_queue *queue, const char *buf, return err; } + if (!rtnl_trylock()) { + bitmap_free(mask); + return restart_syscall(); + } + cpus_read_lock(); err = __netif_set_xps_queue(dev, mask, index, true); cpus_read_unlock(); + rtnl_unlock(); + bitmap_free(mask); return err ? : len; } diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index bb0596c41b3e..3d6ab194d0f5 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -3439,26 +3439,15 @@ replay: dev->ifindex = ifm->ifi_index; - if (ops->newlink) { + if (ops->newlink) err = ops->newlink(link_net ? : net, dev, tb, data, extack); - /* Drivers should call free_netdev() in ->destructor - * and unregister it on failure after registration - * so that device could be finally freed in rtnl_unlock. 
- */ - if (err < 0) { - /* If device is not registered at all, free it now */ - if (dev->reg_state == NETREG_UNINITIALIZED || - dev->reg_state == NETREG_UNREGISTERED) - free_netdev(dev); - goto out; - } - } else { + else err = register_netdevice(dev); - if (err < 0) { - free_netdev(dev); - goto out; - } + if (err < 0) { + free_netdev(dev); + goto out; } + err = rtnl_configure_link(dev, ifm); if (err < 0) goto out_unregister; diff --git a/net/core/skbuff.c b/net/core/skbuff.c index f62cae3f75d8..785daff48030 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -437,7 +437,11 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len, len += NET_SKB_PAD; - if ((len > SKB_WITH_OVERHEAD(PAGE_SIZE)) || + /* If requested length is either too small or too big, + * we use kmalloc() for skb->head allocation. + */ + if (len <= SKB_WITH_OVERHEAD(1024) || + len > SKB_WITH_OVERHEAD(PAGE_SIZE) || (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) { skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX, NUMA_NO_NODE); if (!skb) @@ -501,13 +505,17 @@ EXPORT_SYMBOL(__netdev_alloc_skb); struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len, gfp_t gfp_mask) { - struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache); + struct napi_alloc_cache *nc; struct sk_buff *skb; void *data; len += NET_SKB_PAD + NET_IP_ALIGN; - if ((len > SKB_WITH_OVERHEAD(PAGE_SIZE)) || + /* If requested length is either too small or too big, + * we use kmalloc() for skb->head allocation. + */ + if (len <= SKB_WITH_OVERHEAD(1024) || + len > SKB_WITH_OVERHEAD(PAGE_SIZE) || (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) { skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX, NUMA_NO_NODE); if (!skb) @@ -515,6 +523,7 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len, goto skb_success; } + nc = this_cpu_ptr(&napi_alloc_cache); len += SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); len = SKB_DATA_ALIGN(len); @@ -3442,6 +3451,7 @@ void skb_prepare_seq_read(struct sk_buff *skb, unsigned int from, st->root_skb = st->cur_skb = skb; st->frag_idx = st->stepped_offset = 0; st->frag_data = NULL; + st->frag_off = 0; } EXPORT_SYMBOL(skb_prepare_seq_read); @@ -3496,14 +3506,27 @@ next_skb: st->stepped_offset += skb_headlen(st->cur_skb); while (st->frag_idx < skb_shinfo(st->cur_skb)->nr_frags) { + unsigned int pg_idx, pg_off, pg_sz; + frag = &skb_shinfo(st->cur_skb)->frags[st->frag_idx]; - block_limit = skb_frag_size(frag) + st->stepped_offset; + pg_idx = 0; + pg_off = skb_frag_off(frag); + pg_sz = skb_frag_size(frag); + + if (skb_frag_must_loop(skb_frag_page(frag))) { + pg_idx = (pg_off + st->frag_off) >> PAGE_SHIFT; + pg_off = offset_in_page(pg_off + st->frag_off); + pg_sz = min_t(unsigned int, pg_sz - st->frag_off, + PAGE_SIZE - pg_off); + } + + block_limit = pg_sz + st->stepped_offset; if (abs_offset < block_limit) { if (!st->frag_data) - st->frag_data = kmap_atomic(skb_frag_page(frag)); + st->frag_data = kmap_atomic(skb_frag_page(frag) + pg_idx); - *data = (u8 *) st->frag_data + skb_frag_off(frag) + + *data = (u8 *)st->frag_data + pg_off + (abs_offset - st->stepped_offset); return block_limit - abs_offset; @@ -3514,8 +3537,12 @@ next_skb: st->frag_data = NULL; } - st->frag_idx++; - st->stepped_offset += skb_frag_size(frag); + st->stepped_offset += pg_sz; + st->frag_off += pg_sz; + if (st->frag_off == skb_frag_size(frag)) { + st->frag_off = 0; + st->frag_idx++; + } } if (st->frag_data) { @@ -3655,7 +3682,8 @@ struct sk_buff *skb_segment_list(struct sk_buff *skb, unsigned int 
delta_truesize = 0; unsigned int delta_len = 0; struct sk_buff *tail = NULL; - struct sk_buff *nskb; + struct sk_buff *nskb, *tmp; + int err; skb_push(skb, -skb_network_offset(skb) + offset); @@ -3665,11 +3693,28 @@ struct sk_buff *skb_segment_list(struct sk_buff *skb, nskb = list_skb; list_skb = list_skb->next; + err = 0; + if (skb_shared(nskb)) { + tmp = skb_clone(nskb, GFP_ATOMIC); + if (tmp) { + consume_skb(nskb); + nskb = tmp; + err = skb_unclone(nskb, GFP_ATOMIC); + } else { + err = -ENOMEM; + } + } + if (!tail) skb->next = nskb; else tail->next = nskb; + if (unlikely(err)) { + nskb->next = list_skb; + goto err_linearize; + } + tail = nskb; delta_len += nskb->len; diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c index bbdd3c7b6cb5..b065f0a103ed 100644 --- a/net/core/sock_reuseport.c +++ b/net/core/sock_reuseport.c @@ -293,7 +293,7 @@ select_by_hash: i = j = reciprocal_scale(hash, socks); while (reuse->socks[i]->sk_state == TCP_ESTABLISHED) { i++; - if (i >= reuse->num_socks) + if (i >= socks) i = 0; if (i == j) goto out; diff --git a/net/dcb/dcbnl.c b/net/dcb/dcbnl.c index 084e159a12ba..653e3bc9c87b 100644 --- a/net/dcb/dcbnl.c +++ b/net/dcb/dcbnl.c @@ -1765,6 +1765,8 @@ static int dcb_doit(struct sk_buff *skb, struct nlmsghdr *nlh, fn = &reply_funcs[dcb->cmd]; if (!fn->cb) return -EOPNOTSUPP; + if (fn->type == RTM_SETDCB && !netlink_capable(skb, CAP_NET_ADMIN)) + return -EPERM; if (!tb[DCB_ATTR_IFNAME]) return -EINVAL; diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index 183003e45762..a47e0f9b20d0 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -353,9 +353,13 @@ static int dsa_port_devlink_setup(struct dsa_port *dp) static void dsa_port_teardown(struct dsa_port *dp) { + struct devlink_port *dlp = &dp->devlink_port; + if (!dp->setup) return; + devlink_port_type_clear(dlp); + switch (dp->type) { case DSA_PORT_TYPE_UNUSED: break; diff --git a/net/dsa/master.c b/net/dsa/master.c index 5a0f6fec4271..cb3a5cf99b25 100644 --- a/net/dsa/master.c +++ b/net/dsa/master.c @@ -309,8 +309,18 @@ static struct lock_class_key dsa_master_addr_list_lock_key; int dsa_master_setup(struct net_device *dev, struct dsa_port *cpu_dp) { int mtu = ETH_DATA_LEN + cpu_dp->tag_ops->overhead; + struct dsa_switch *ds = cpu_dp->ds; + struct device_link *consumer_link; int ret; + /* The DSA master must use SET_NETDEV_DEV for this to work. 
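+	 *
+	 * That is, the master driver is expected to have done something
+	 * like the following in its probe path (hypothetical example):
+	 *
+	 *	SET_NETDEV_DEV(master_netdev, &pdev->dev);
+	 *
+	 * so that dev->dev.parent below points at a real struct device
+	 * for the device link to attach to.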
*/ + consumer_link = device_link_add(ds->dev, dev->dev.parent, + DL_FLAG_AUTOREMOVE_CONSUMER); + if (!consumer_link) + netdev_err(dev, + "Failed to create a device link to DSA switch %s\n", + dev_name(ds->dev)); + rtnl_lock(); ret = dev_set_mtu(dev, mtu); rtnl_unlock(); diff --git a/net/ethtool/channels.c b/net/ethtool/channels.c index 5635604cb9ba..25a9e566ef5c 100644 --- a/net/ethtool/channels.c +++ b/net/ethtool/channels.c @@ -194,8 +194,9 @@ int ethnl_set_channels(struct sk_buff *skb, struct genl_info *info) if (netif_is_rxfh_configured(dev) && !ethtool_get_max_rxfh_channel(dev, &max_rx_in_use) && (channels.combined_count + channels.rx_count) <= max_rx_in_use) { + ret = -EINVAL; GENL_SET_ERR_MSG(info, "requested channel counts are too low for existing indirection table settings"); - return -EINVAL; + goto out_ops; } /* Disabling channels, query zero-copy AF_XDP sockets */ @@ -203,8 +204,9 @@ int ethnl_set_channels(struct sk_buff *skb, struct genl_info *info) min(channels.rx_count, channels.tx_count); for (i = from_channel; i < old_total; i++) if (xsk_get_pool_from_qid(dev, i)) { + ret = -EINVAL; GENL_SET_ERR_MSG(info, "requested channel counts are too low for existing zerocopy AF_XDP sockets"); - return -EINVAL; + goto out_ops; } ret = dev->ethtool_ops->set_channels(dev, &channels); diff --git a/net/ethtool/strset.c b/net/ethtool/strset.c index 0baad0ce1832..c3a5489964cd 100644 --- a/net/ethtool/strset.c +++ b/net/ethtool/strset.c @@ -182,7 +182,7 @@ static int strset_parse_request(struct ethnl_req_info *req_base, ret = strset_get_id(attr, &id, extack); if (ret < 0) return ret; - if (ret >= ETH_SS_COUNT) { + if (id >= ETH_SS_COUNT) { NL_SET_ERR_MSG_ATTR(extack, attr, "unknown string set id"); return -EOPNOTSUPP; diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index 8b07f3a4f2db..a3271ec3e162 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -443,7 +443,6 @@ static int esp_output_encap(struct xfrm_state *x, struct sk_buff *skb, int esp_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info *esp) { u8 *tail; - u8 *vaddr; int nfrags; int esph_offset; struct page *page; @@ -485,14 +484,10 @@ int esp_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info * page = pfrag->page; get_page(page); - vaddr = kmap_atomic(page); - - tail = vaddr + pfrag->offset; + tail = page_address(page) + pfrag->offset; esp_output_fill_trailer(tail, esp->tfclen, esp->plen, esp->proto); - kunmap_atomic(vaddr); - nfrags = skb_shinfo(skb)->nr_frags; __skb_fill_page_desc(skb, nfrags, page, pfrag->offset, diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index cdf6ec5aa45d..84bb707bd88d 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -292,7 +292,7 @@ __be32 fib_compute_spec_dst(struct sk_buff *skb) .flowi4_iif = LOOPBACK_IFINDEX, .flowi4_oif = l3mdev_master_ifindex_rcu(dev), .daddr = ip_hdr(skb)->saddr, - .flowi4_tos = RT_TOS(ip_hdr(skb)->tos), + .flowi4_tos = ip_hdr(skb)->tos & IPTOS_RT_MASK, .flowi4_scope = scope, .flowi4_mark = vmark ? 
skb->mark : 0, }; diff --git a/net/ipv4/gre_demux.c b/net/ipv4/gre_demux.c index 66fdbfe5447c..5d1e6fe9d838 100644 --- a/net/ipv4/gre_demux.c +++ b/net/ipv4/gre_demux.c @@ -128,7 +128,7 @@ int gre_parse_header(struct sk_buff *skb, struct tnl_ptk_info *tpi, * to 0 and sets the configured key in the * inner erspan header field */ - if (greh->protocol == htons(ETH_P_ERSPAN) || + if ((greh->protocol == htons(ETH_P_ERSPAN) && hdr_len != 4) || greh->protocol == htons(ETH_P_ERSPAN2)) { struct erspan_base_hdr *ershdr; diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index fd8b8800a2c3..6bd7ca09af03 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -851,6 +851,7 @@ struct sock *inet_csk_clone_lock(const struct sock *sk, newicsk->icsk_retransmits = 0; newicsk->icsk_backoff = 0; newicsk->icsk_probes_out = 0; + newicsk->icsk_probes_tstamp = 0; /* Deinitialize accept_queue to trap illegal accesses. */ memset(&newicsk->icsk_accept_queue, 0, sizeof(newicsk->icsk_accept_queue)); diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 89fff5f59eea..2ed0b01f72f0 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -302,7 +302,7 @@ static int __ip_finish_output(struct net *net, struct sock *sk, struct sk_buff * if (skb_is_gso(skb)) return ip_finish_output_gso(net, sk, skb, mtu); - if (skb->len > mtu || (IPCB(skb)->flags & IPSKB_FRAG_PMTU)) + if (skb->len > mtu || IPCB(skb)->frag_max_size) return ip_fragment(net, sk, skb, mtu, ip_finish_output2); return ip_finish_output2(net, sk, skb); diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index ee65c9225178..64594aa755f0 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -759,8 +759,11 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, goto tx_error; } - if (tnl_update_pmtu(dev, skb, rt, tnl_params->frag_off, inner_iph, - 0, 0, false)) { + df = tnl_params->frag_off; + if (skb->protocol == htons(ETH_P_IP) && !tunnel->ignore_df) + df |= (inner_iph->frag_off & htons(IP_DF)); + + if (tnl_update_pmtu(dev, skb, rt, df, inner_iph, 0, 0, false)) { ip_rt_put(rt); goto tx_error; } @@ -788,10 +791,6 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, ttl = ip4_dst_hoplimit(&rt->dst); } - df = tnl_params->frag_off; - if (skb->protocol == htons(ETH_P_IP) && !tunnel->ignore_df) - df |= (inner_iph->frag_off&htons(IP_DF)); - max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr) + rt->dst.header_len + ip_encap_hlen(&tunnel->encap); if (max_headroom > dev->needed_headroom) diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index 563b62b76a5f..c576a63d09db 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -1379,7 +1379,7 @@ static int compat_get_entries(struct net *net, xt_compat_lock(NFPROTO_ARP); t = xt_find_table_lock(net, NFPROTO_ARP, get.name); if (!IS_ERR(t)) { - const struct xt_table_info *private = t->private; + const struct xt_table_info *private = xt_table_get_private_protected(t); struct xt_table_info info; ret = compat_table_info(private, &info); diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index 6e2851f8d3a3..e8f6f9d86237 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -1589,7 +1589,7 @@ compat_get_entries(struct net *net, struct compat_ipt_get_entries __user *uptr, xt_compat_lock(AF_INET); t = xt_find_table_lock(net, AF_INET, get.name); if (!IS_ERR(t)) { - const struct xt_table_info 
*private = t->private; + const struct xt_table_info *private = xt_table_get_private_protected(t); struct xt_table_info info; ret = compat_table_info(private, &info); if (!ret && get.size == info.size) diff --git a/net/ipv4/netfilter/ipt_rpfilter.c b/net/ipv4/netfilter/ipt_rpfilter.c index cc23f1ce239c..8cd3224d913e 100644 --- a/net/ipv4/netfilter/ipt_rpfilter.c +++ b/net/ipv4/netfilter/ipt_rpfilter.c @@ -76,7 +76,7 @@ static bool rpfilter_mt(const struct sk_buff *skb, struct xt_action_param *par) flow.daddr = iph->saddr; flow.saddr = rpfilter_get_saddr(iph->daddr); flow.flowi4_mark = info->flags & XT_RPFILTER_VALID_MARK ? skb->mark : 0; - flow.flowi4_tos = RT_TOS(iph->tos); + flow.flowi4_tos = iph->tos & IPTOS_RT_MASK; flow.flowi4_scope = RT_SCOPE_UNIVERSE; flow.flowi4_oif = l3mdev_master_ifindex_rcu(xt_in(par)); diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c index 5e1b22d4f939..e53e43aef785 100644 --- a/net/ipv4/nexthop.c +++ b/net/ipv4/nexthop.c @@ -627,7 +627,7 @@ static int nh_check_attr_group(struct net *net, struct nlattr *tb[], for (i = NHA_GROUP_TYPE + 1; i < __NHA_MAX; ++i) { if (!tb[i]) continue; - if (tb[NHA_FDB]) + if (i == NHA_FDB) continue; NL_SET_ERR_MSG(extack, "No other attributes can be set in nexthop groups"); @@ -1459,8 +1459,10 @@ static struct nexthop *nexthop_create_group(struct net *net, return nh; out_no_nh: - for (; i >= 0; --i) + for (i--; i >= 0; --i) { + list_del(&nhg->nh_entries[i].nh_list); nexthop_put(nhg->nh_entries[i].nh); + } kfree(nhg->spare); kfree(nhg); diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index ed42d2193c5c..32545ecf2ab1 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2937,6 +2937,7 @@ int tcp_disconnect(struct sock *sk, int flags) icsk->icsk_backoff = 0; icsk->icsk_probes_out = 0; + icsk->icsk_probes_tstamp = 0; icsk->icsk_rto = TCP_TIMEOUT_INIT; icsk->icsk_rto_min = TCP_RTO_MIN; icsk->icsk_delack_max = TCP_DELACK_MAX; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index c7e16b0ed791..a7dfca0a38cd 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3384,6 +3384,7 @@ static void tcp_ack_probe(struct sock *sk) return; if (!after(TCP_SKB_CB(head)->end_seq, tcp_wnd_end(tp))) { icsk->icsk_backoff = 0; + icsk->icsk_probes_tstamp = 0; inet_csk_clear_xmit_timer(sk, ICSK_TIME_PROBE0); /* Socket must be waked up by subsequent tcp_data_snd_check(). * This function is not for random using! @@ -4396,10 +4397,9 @@ static void tcp_rcv_spurious_retrans(struct sock *sk, const struct sk_buff *skb) * The receiver remembers and reflects via DSACKs. Leverage the * DSACK state and change the txhash to re-route speculatively. 
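 *
 * Note: sk_rethink_txhash() now returns whether the hash was in fact
 * changed, so LINUX_MIB_TCPDUPLICATEDATAREHASH below is only bumped
 * when a re-route can actually happen.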
*/ - if (TCP_SKB_CB(skb)->seq == tcp_sk(sk)->duplicate_sack[0].start_seq) { - sk_rethink_txhash(sk); + if (TCP_SKB_CB(skb)->seq == tcp_sk(sk)->duplicate_sack[0].start_seq && + sk_rethink_txhash(sk)) NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDUPLICATEDATAREHASH); - } } static void tcp_send_dupack(struct sock *sk, const struct sk_buff *skb) diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 58207c7769d0..777306b5bc22 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1595,6 +1595,8 @@ struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, tcp_move_syn(newtp, req); ireq->ireq_opt = NULL; } else { + newinet->inet_opt = NULL; + if (!req_unhash && found_dup_sk) { /* This code path should only be executed in the * syncookie case only @@ -1602,8 +1604,6 @@ struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, bh_unlock_sock(newsk); sock_put(newsk); newsk = NULL; - } else { - newinet->inet_opt = NULL; } } return newsk; @@ -1760,6 +1760,7 @@ int tcp_v4_early_demux(struct sk_buff *skb) bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb) { u32 limit = READ_ONCE(sk->sk_rcvbuf) + READ_ONCE(sk->sk_sndbuf); + u32 tail_gso_size, tail_gso_segs; struct skb_shared_info *shinfo; const struct tcphdr *th; struct tcphdr *thtail; @@ -1767,6 +1768,7 @@ bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb) unsigned int hdrlen; bool fragstolen; u32 gso_segs; + u32 gso_size; int delta; /* In case all data was pulled from skb frags (in __pskb_pull_tail()), @@ -1792,13 +1794,6 @@ bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb) */ th = (const struct tcphdr *)skb->data; hdrlen = th->doff * 4; - shinfo = skb_shinfo(skb); - - if (!shinfo->gso_size) - shinfo->gso_size = skb->len - hdrlen; - - if (!shinfo->gso_segs) - shinfo->gso_segs = 1; tail = sk->sk_backlog.tail; if (!tail) @@ -1821,6 +1816,15 @@ bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb) goto no_coalesce; __skb_pull(skb, hdrlen); + + shinfo = skb_shinfo(skb); + gso_size = shinfo->gso_size ?: skb->len; + gso_segs = shinfo->gso_segs ?: 1; + + shinfo = skb_shinfo(tail); + tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen); + tail_gso_segs = shinfo->gso_segs ?: 1; + if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) { TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq; @@ -1847,11 +1851,8 @@ bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb) } /* Not as strict as GRO. We only need to carry mss max value */ - skb_shinfo(tail)->gso_size = max(shinfo->gso_size, - skb_shinfo(tail)->gso_size); - - gso_segs = skb_shinfo(tail)->gso_segs + shinfo->gso_segs; - skb_shinfo(tail)->gso_segs = min_t(u32, gso_segs, 0xFFFF); + shinfo->gso_size = max(gso_size, tail_gso_size); + shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF); sk->sk_backlog.len += delta; __NET_INC_STATS(sock_net(sk), diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index f322e798a351..ab458697881e 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -4084,6 +4084,7 @@ void tcp_send_probe0(struct sock *sk) /* Cancel probe timer, if it is not required. 
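 *
 * icsk_probes_tstamp is cleared together with icsk_probes_out and
 * icsk_backoff here, matching tcp_ack_probe() and tcp_disconnect()
 * earlier, so that a later zero-window probe run restarts its
 * user-timeout accounting from scratch (see tcp_probe_timer()).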
*/ icsk->icsk_probes_out = 0; icsk->icsk_backoff = 0; + icsk->icsk_probes_tstamp = 0; return; } diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 6c62b9ea1320..faa92948441b 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -219,14 +219,8 @@ static int tcp_write_timeout(struct sock *sk) int retry_until; if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) { - if (icsk->icsk_retransmits) { - dst_negative_advice(sk); - } else { - sk_rethink_txhash(sk); - tp->timeout_rehash++; - __NET_INC_STATS(sock_net(sk), - LINUX_MIB_TCPTIMEOUTREHASH); - } + if (icsk->icsk_retransmits) + __dst_negative_advice(sk); retry_until = icsk->icsk_syn_retries ? : net->ipv4.sysctl_tcp_syn_retries; expired = icsk->icsk_retransmits >= retry_until; } else { @@ -234,12 +228,7 @@ static int tcp_write_timeout(struct sock *sk) /* Black hole detection */ tcp_mtu_probing(icsk, sk); - dst_negative_advice(sk); - } else { - sk_rethink_txhash(sk); - tp->timeout_rehash++; - __NET_INC_STATS(sock_net(sk), - LINUX_MIB_TCPTIMEOUTREHASH); + __dst_negative_advice(sk); } retry_until = net->ipv4.sysctl_tcp_retries2; @@ -270,6 +259,11 @@ static int tcp_write_timeout(struct sock *sk) return 1; } + if (sk_rethink_txhash(sk)) { + tp->timeout_rehash++; + __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPTIMEOUTREHASH); + } + return 0; } @@ -349,6 +343,7 @@ static void tcp_probe_timer(struct sock *sk) if (tp->packets_out || !skb) { icsk->icsk_probes_out = 0; + icsk->icsk_probes_tstamp = 0; return; } @@ -360,13 +355,12 @@ static void tcp_probe_timer(struct sock *sk) * corresponding system limit. We also implement similar policy when * we use RTO to probe window in tcp_retransmit_timer(). */ - if (icsk->icsk_user_timeout) { - u32 elapsed = tcp_model_timeout(sk, icsk->icsk_probes_out, - tcp_probe0_base(sk)); - - if (elapsed >= icsk->icsk_user_timeout) - goto abort; - } + if (!icsk->icsk_probes_tstamp) + icsk->icsk_probes_tstamp = tcp_jiffies32; + else if (icsk->icsk_user_timeout && + (s32)(tcp_jiffies32 - icsk->icsk_probes_tstamp) >= + msecs_to_jiffies(icsk->icsk_user_timeout)) + goto abort; max_probes = sock_net(sk)->ipv4.sysctl_tcp_retries2; if (sock_flag(sk, SOCK_DEAD)) { diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 7103b0a89756..69ea76578abb 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -2555,7 +2555,8 @@ int udp_v4_early_demux(struct sk_buff *skb) */ if (!inet_sk(sk)->inet_daddr && in_dev) return ip_mc_validate_source(skb, iph->daddr, - iph->saddr, iph->tos, + iph->saddr, + iph->tos & IPTOS_RT_MASK, skb->dev, in_dev, &itag); } return 0; diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index eff2cacd5209..9edc5bb2d531 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -2467,8 +2467,9 @@ static void addrconf_add_mroute(struct net_device *dev) .fc_ifindex = dev->ifindex, .fc_dst_len = 8, .fc_flags = RTF_UP, - .fc_type = RTN_UNICAST, + .fc_type = RTN_MULTICAST, .fc_nlinfo.nl_net = dev_net(dev), + .fc_protocol = RTPROT_KERNEL, }; ipv6_addr_set(&cfg.fc_dst, htonl(0xFF000000), 0, 0, 0); diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c index 52c2f063529f..2b804fcebcc6 100644 --- a/net/ipv6/esp6.c +++ b/net/ipv6/esp6.c @@ -478,7 +478,6 @@ static int esp6_output_encap(struct xfrm_state *x, struct sk_buff *skb, int esp6_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info *esp) { u8 *tail; - u8 *vaddr; int nfrags; int esph_offset; struct page *page; @@ -519,14 +518,10 @@ int esp6_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info page = pfrag->page; 
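/* Annotation, not part of the patch: the kmap_atomic()/kunmap_atomic() pair removed in the hunk below is unnecessary because these pages come from the socket page-frag allocator, which does not allocate highmem; lowmem pages always have a kernel virtual address, so page_address() suffices. A highmem page would still need kmap. */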
get_page(page); - vaddr = kmap_atomic(page); - - tail = vaddr + pfrag->offset; + tail = page_address(page) + pfrag->offset; esp_output_fill_trailer(tail, esp->tfclen, esp->plen, esp->proto); - kunmap_atomic(vaddr); - nfrags = skb_shinfo(skb)->nr_frags; __skb_fill_page_desc(skb, nfrags, page, pfrag->offset, diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 605cdd38a919..f43e27555725 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -1025,6 +1025,8 @@ static void fib6_purge_rt(struct fib6_info *rt, struct fib6_node *fn, { struct fib6_table *table = rt->fib6_table; + /* Flush all cached dst in exception table */ + rt6_flush_exceptions(rt); fib6_drop_pcpu_from(rt, table); if (rt->nh && !list_empty(&rt->nh_list)) @@ -1927,9 +1929,6 @@ static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn, net->ipv6.rt6_stats->fib_rt_entries--; net->ipv6.rt6_stats->fib_discarded_routes++; - /* Flush all cached dst in exception table */ - rt6_flush_exceptions(rt); - /* Reset round-robin state, if necessary */ if (rcu_access_pointer(fn->rr_ptr) == rt) fn->rr_ptr = NULL; diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 749ad72386b2..077d43af8226 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -125,8 +125,43 @@ static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff * return -EINVAL; } +static int +ip6_finish_output_gso_slowpath_drop(struct net *net, struct sock *sk, + struct sk_buff *skb, unsigned int mtu) +{ + struct sk_buff *segs, *nskb; + netdev_features_t features; + int ret = 0; + + /* Please see corresponding comment in ip_finish_output_gso + * describing the cases where GSO segment length exceeds the + * egress MTU. + */ + features = netif_skb_features(skb); + segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK); + if (IS_ERR_OR_NULL(segs)) { + kfree_skb(skb); + return -ENOMEM; + } + + consume_skb(skb); + + skb_list_walk_safe(segs, segs, nskb) { + int err; + + skb_mark_not_on_list(segs); + err = ip6_fragment(net, sk, segs, ip6_finish_output2); + if (err && ret == 0) + ret = err; + } + + return ret; +} + static int __ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb) { + unsigned int mtu; + #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM) /* Policy lookup after SNAT yielded a new policy */ if (skb_dst(skb)->xfrm) { @@ -135,7 +170,11 @@ static int __ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff } #endif - if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) || + mtu = ip6_skb_dst_mtu(skb); + if (skb_is_gso(skb) && !skb_gso_validate_network_len(skb, mtu)) + return ip6_finish_output_gso_slowpath_drop(net, sk, skb, mtu); + + if ((skb->len > mtu && !skb_is_gso(skb)) || dst_allfrag(skb_dst(skb)) || (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size)) return ip6_fragment(net, sk, skb, ip6_finish_output2); diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index c4f532f4d311..0d453fa9e327 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -1598,7 +1598,7 @@ compat_get_entries(struct net *net, struct compat_ip6t_get_entries __user *uptr, xt_compat_lock(AF_INET6); t = xt_find_table_lock(net, AF_INET6, get.name); if (!IS_ERR(t)) { - const struct xt_table_info *private = t->private; + const struct xt_table_info *private = xt_table_get_private_protected(t); struct xt_table_info info; ret = compat_table_info(private, &info); if (!ret && get.size == info.size) diff --git a/net/ipv6/sit.c 
b/net/ipv6/sit.c index 2da0ee703779..93636867aee2 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -1645,8 +1645,11 @@ static int ipip6_newlink(struct net *src_net, struct net_device *dev, } #ifdef CONFIG_IPV6_SIT_6RD - if (ipip6_netlink_6rd_parms(data, &ip6rd)) + if (ipip6_netlink_6rd_parms(data, &ip6rd)) { err = ipip6_tunnel_update_6rd(nt, &ip6rd); + if (err < 0) + unregister_netdevice_queue(dev, NULL); + } #endif return err; diff --git a/net/lapb/lapb_iface.c b/net/lapb/lapb_iface.c index 213ea7abc9ab..40961889e9c0 100644 --- a/net/lapb/lapb_iface.c +++ b/net/lapb/lapb_iface.c @@ -489,6 +489,7 @@ static int lapb_device_event(struct notifier_block *this, unsigned long event, break; } + lapb_put(lapb); return NOTIFY_DONE; } diff --git a/net/mac80211/debugfs.c b/net/mac80211/debugfs.c index 48f144f107d5..9e723d943421 100644 --- a/net/mac80211/debugfs.c +++ b/net/mac80211/debugfs.c @@ -120,18 +120,17 @@ static ssize_t aqm_write(struct file *file, { struct ieee80211_local *local = file->private_data; char buf[100]; - size_t len; - if (count > sizeof(buf)) + if (count >= sizeof(buf)) return -EINVAL; if (copy_from_user(buf, user_buf, count)) return -EFAULT; - buf[sizeof(buf) - 1] = '\0'; - len = strlen(buf); - if (len > 0 && buf[len-1] == '\n') - buf[len-1] = 0; + if (count && buf[count - 1] == '\n') + buf[count - 1] = '\0'; + else + buf[count] = '\0'; if (sscanf(buf, "fq_limit %u", &local->fq.limit) == 1) return count; @@ -177,18 +176,17 @@ static ssize_t airtime_flags_write(struct file *file, { struct ieee80211_local *local = file->private_data; char buf[16]; - size_t len; - if (count > sizeof(buf)) + if (count >= sizeof(buf)) return -EINVAL; if (copy_from_user(buf, user_buf, count)) return -EFAULT; - buf[sizeof(buf) - 1] = 0; - len = strlen(buf); - if (len > 0 && buf[len - 1] == '\n') - buf[len - 1] = 0; + if (count && buf[count - 1] == '\n') + buf[count - 1] = '\0'; + else + buf[count] = '\0'; if (kstrtou16(buf, 0, &local->airtime_flags)) return -EINVAL; @@ -237,20 +235,19 @@ static ssize_t aql_txq_limit_write(struct file *file, { struct ieee80211_local *local = file->private_data; char buf[100]; - size_t len; u32 ac, q_limit_low, q_limit_high, q_limit_low_old, q_limit_high_old; struct sta_info *sta; - if (count > sizeof(buf)) + if (count >= sizeof(buf)) return -EINVAL; if (copy_from_user(buf, user_buf, count)) return -EFAULT; - buf[sizeof(buf) - 1] = 0; - len = strlen(buf); - if (len > 0 && buf[len - 1] == '\n') - buf[len - 1] = 0; + if (count && buf[count - 1] == '\n') + buf[count - 1] = '\0'; + else + buf[count] = '\0'; if (sscanf(buf, "%u %u %u", &ac, &q_limit_low, &q_limit_high) != 3) return -EINVAL; @@ -306,18 +303,17 @@ static ssize_t force_tx_status_write(struct file *file, { struct ieee80211_local *local = file->private_data; char buf[3]; - size_t len; - if (count > sizeof(buf)) + if (count >= sizeof(buf)) return -EINVAL; if (copy_from_user(buf, user_buf, count)) return -EFAULT; - buf[sizeof(buf) - 1] = '\0'; - len = strlen(buf); - if (len > 0 && buf[len - 1] == '\n') - buf[len - 1] = 0; + if (count && buf[count - 1] == '\n') + buf[count - 1] = '\0'; + else + buf[count] = '\0'; if (buf[0] == '0' && buf[1] == '\0') local->force_tx_status = 0; diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 13b9bcc4865d..972895e9f22d 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -4176,6 +4176,8 @@ void ieee80211_check_fast_rx(struct sta_info *sta) rcu_read_lock(); key = rcu_dereference(sta->ptk[sta->ptk_idx]); + if (!key) + key = rcu_dereference(sdata->default_unicast_key); 
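/* Annotation, not part of the patch: sta->ptk[] is empty when the link is protected by a non-pairwise key (e.g. WEP), in which case unicast traffic uses the interface's default key. Without this fallback, fast-rx would treat such links as unencrypted. The tx.c hunk further down adds the same fallback to the 802.3 TX offload path. */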
if (key) { switch (key->conf.cipher) { case WLAN_CIPHER_SUITE_TKIP: diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 6422da6690f7..ebb3228ce971 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -649,7 +649,7 @@ ieee80211_tx_h_select_key(struct ieee80211_tx_data *tx) if (!skip_hw && tx->key && tx->key->flags & KEY_FLAG_UPLOADED_TO_HARDWARE) info->control.hw_key = &tx->key->conf; - } else if (!ieee80211_is_mgmt(hdr->frame_control) && tx->sta && + } else if (ieee80211_is_data_present(hdr->frame_control) && tx->sta && test_sta_flag(tx->sta, WLAN_STA_USES_ENCRYPTION)) { return TX_DROP; } @@ -3809,7 +3809,7 @@ void __ieee80211_schedule_txq(struct ieee80211_hw *hw, * get immediately moved to the back of the list on the next * call to ieee80211_next_txq(). */ - if (txqi->txq.sta && + if (txqi->txq.sta && local->airtime_flags && wiphy_ext_feature_isset(local->hw.wiphy, NL80211_EXT_FEATURE_AIRTIME_FAIRNESS)) list_add(&txqi->schedule_order, @@ -4251,7 +4251,6 @@ netdev_tx_t ieee80211_subif_start_xmit_8023(struct sk_buff *skb, struct ethhdr *ehdr = (struct ethhdr *)skb->data; struct ieee80211_key *key; struct sta_info *sta; - bool offload = true; if (unlikely(skb->len < ETH_HLEN)) { kfree_skb(skb); @@ -4267,18 +4266,22 @@ netdev_tx_t ieee80211_subif_start_xmit_8023(struct sk_buff *skb, if (unlikely(IS_ERR_OR_NULL(sta) || !sta->uploaded || !test_sta_flag(sta, WLAN_STA_AUTHORIZED) || - sdata->control_port_protocol == ehdr->h_proto)) - offload = false; - else if ((key = rcu_dereference(sta->ptk[sta->ptk_idx])) && - (!(key->flags & KEY_FLAG_UPLOADED_TO_HARDWARE) || - key->conf.cipher == WLAN_CIPHER_SUITE_TKIP)) - offload = false; - - if (offload) - ieee80211_8023_xmit(sdata, dev, sta, key, skb); - else - ieee80211_subif_start_xmit(skb, dev); + sdata->control_port_protocol == ehdr->h_proto)) + goto skip_offload; + + key = rcu_dereference(sta->ptk[sta->ptk_idx]); + if (!key) + key = rcu_dereference(sdata->default_unicast_key); + + if (key && (!(key->flags & KEY_FLAG_UPLOADED_TO_HARDWARE) || + key->conf.cipher == WLAN_CIPHER_SUITE_TKIP)) + goto skip_offload; + + ieee80211_8023_xmit(sdata, dev, sta, key, skb); + goto out; +skip_offload: + ieee80211_subif_start_xmit(skb, dev); out: rcu_read_unlock(); diff --git a/net/mptcp/options.c b/net/mptcp/options.c index c5328f407aab..e0d21c0607e5 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -606,6 +606,8 @@ static bool mptcp_established_options_add_addr(struct sock *sk, struct sk_buff * skb && skb_is_tcp_pure_ack(skb)) { pr_debug("drop other suboptions"); opts->suboptions = 0; + opts->ext_copy.use_ack = 0; + opts->ext_copy.use_map = 0; remaining += opt_size; drop_other_suboptions = true; } @@ -873,10 +875,13 @@ static void ack_update_msk(struct mptcp_sock *msk, new_wnd_end = new_snd_una + tcp_sk(ssk)->snd_wnd; - if (after64(new_wnd_end, msk->wnd_end)) { + if (after64(new_wnd_end, msk->wnd_end)) msk->wnd_end = new_wnd_end; - __mptcp_wnd_updated(sk, ssk); - } + + /* this assumes mptcp_incoming_options() is invoked after tcp_ack() */ + if (after64(msk->wnd_end, READ_ONCE(msk->snd_nxt)) && + sk_stream_memory_free(ssk)) + __mptcp_check_push(sk, ssk); if (after64(new_snd_una, old_snd_una)) { msk->snd_una = new_snd_una; @@ -942,8 +947,8 @@ void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb) * helpers are cheap. 
*/ mptcp_data_lock(subflow->conn); - if (mptcp_send_head(subflow->conn)) - __mptcp_wnd_updated(subflow->conn, sk); + if (sk_stream_memory_free(sk)) + __mptcp_check_push(subflow->conn, sk); __mptcp_data_acked(subflow->conn); mptcp_data_unlock(subflow->conn); return; diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index b812aaae8044..f998a077c7dd 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -427,7 +427,7 @@ static bool mptcp_subflow_active(struct mptcp_subflow_context *subflow) static bool tcp_can_send_ack(const struct sock *ssk) { return !((1 << inet_sk_state_load(ssk)) & - (TCPF_SYN_SENT | TCPF_SYN_RECV | TCPF_TIME_WAIT | TCPF_CLOSE)); + (TCPF_SYN_SENT | TCPF_SYN_RECV | TCPF_TIME_WAIT | TCPF_CLOSE | TCPF_LISTEN)); } static void mptcp_send_ack(struct mptcp_sock *msk) @@ -877,6 +877,9 @@ static void __mptcp_wmem_reserve(struct sock *sk, int size) struct mptcp_sock *msk = mptcp_sk(sk); WARN_ON_ONCE(msk->wmem_reserved); + if (WARN_ON_ONCE(amount < 0)) + amount = 0; + if (amount <= sk->sk_forward_alloc) goto reserve; @@ -1587,7 +1590,7 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) if (msg->msg_flags & ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL)) return -EOPNOTSUPP; - mptcp_lock_sock(sk, __mptcp_wmem_reserve(sk, len)); + mptcp_lock_sock(sk, __mptcp_wmem_reserve(sk, min_t(size_t, 1 << 20, len))); timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); @@ -1658,6 +1661,7 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) frag_truesize += psize; pfrag->offset += frag_truesize; WRITE_ONCE(msk->write_seq, msk->write_seq + psize); + msk->tx_pending_data += psize; /* charge data on mptcp pending queue to the msk socket * Note: we charge such data both to sk and ssk @@ -1683,10 +1687,8 @@ wait_for_memory: goto out; } - if (copied) { - msk->tx_pending_data += copied; + if (copied) mptcp_push_pending(sk, msg->msg_flags); - } out: release_sock(sk); @@ -2119,7 +2121,7 @@ void __mptcp_close_ssk(struct sock *sk, struct sock *ssk, list_del(&subflow->node); - lock_sock(ssk); + lock_sock_nested(ssk, SINGLE_DEPTH_NESTING); /* if we are invoked by the msk cleanup code, the subflow is * already orphaned @@ -2640,11 +2642,17 @@ static void mptcp_copy_inaddrs(struct sock *msk, const struct sock *ssk) static int mptcp_disconnect(struct sock *sk, int flags) { - /* Should never be called. - * inet_stream_connect() calls ->disconnect, but that - * refers to the subflow socket, not the mptcp one. 
- */ - WARN_ON_ONCE(1); + struct mptcp_subflow_context *subflow; + struct mptcp_sock *msk = mptcp_sk(sk); + + __mptcp_flush_join_list(msk); + mptcp_for_each_subflow(msk, subflow) { + struct sock *ssk = mptcp_subflow_tcp_sock(subflow); + + lock_sock(ssk); + tcp_disconnect(ssk, flags); + release_sock(ssk); + } return 0; } @@ -2699,6 +2707,8 @@ struct sock *mptcp_sk_clone(const struct sock *sk, sock_reset_flag(nsk, SOCK_RCU_FREE); /* will be fully established after successful MPC subflow creation */ inet_sk_state_store(nsk, TCP_SYN_RECV); + + security_inet_csk_clone(nsk, req); bh_unlock_sock(nsk); /* keep a single reference */ @@ -2913,7 +2923,7 @@ void __mptcp_data_acked(struct sock *sk) mptcp_schedule_work(sk); } -void __mptcp_wnd_updated(struct sock *sk, struct sock *ssk) +void __mptcp_check_push(struct sock *sk, struct sock *ssk) { if (!mptcp_send_head(sk)) return; @@ -3085,6 +3095,14 @@ bool mptcp_finish_join(struct sock *ssk) return true; } +static void mptcp_shutdown(struct sock *sk, int how) +{ + pr_debug("sk=%p, how=%d", sk, how); + + if ((how & SEND_SHUTDOWN) && mptcp_close_state(sk)) + __mptcp_wr_shutdown(sk); +} + static struct proto mptcp_prot = { .name = "MPTCP", .owner = THIS_MODULE, @@ -3094,7 +3112,7 @@ static struct proto mptcp_prot = { .accept = mptcp_accept, .setsockopt = mptcp_setsockopt, .getsockopt = mptcp_getsockopt, - .shutdown = tcp_shutdown, + .shutdown = mptcp_shutdown, .destroy = mptcp_destroy, .sendmsg = mptcp_sendmsg, .recvmsg = mptcp_recvmsg, @@ -3340,43 +3358,6 @@ static __poll_t mptcp_poll(struct file *file, struct socket *sock, return mask; } -static int mptcp_shutdown(struct socket *sock, int how) -{ - struct mptcp_sock *msk = mptcp_sk(sock->sk); - struct sock *sk = sock->sk; - int ret = 0; - - pr_debug("sk=%p, how=%d", msk, how); - - lock_sock(sk); - - how++; - if ((how & ~SHUTDOWN_MASK) || !how) { - ret = -EINVAL; - goto out_unlock; - } - - if (sock->state == SS_CONNECTING) { - if ((1 << sk->sk_state) & - (TCPF_SYN_SENT | TCPF_SYN_RECV | TCPF_CLOSE)) - sock->state = SS_DISCONNECTING; - else - sock->state = SS_CONNECTED; - } - - sk->sk_shutdown |= how; - if ((how & SEND_SHUTDOWN) && mptcp_close_state(sk)) - __mptcp_wr_shutdown(sk); - - /* Wake up anyone sleeping in poll. 
*/ - sk->sk_state_change(sk); - -out_unlock: - release_sock(sk); - - return ret; -} - static const struct proto_ops mptcp_stream_ops = { .family = PF_INET, .owner = THIS_MODULE, @@ -3390,7 +3371,7 @@ static const struct proto_ops mptcp_stream_ops = { .ioctl = inet_ioctl, .gettstamp = sock_gettstamp, .listen = mptcp_listen, - .shutdown = mptcp_shutdown, + .shutdown = inet_shutdown, .setsockopt = sock_common_setsockopt, .getsockopt = sock_common_getsockopt, .sendmsg = inet_sendmsg, @@ -3440,7 +3421,7 @@ static const struct proto_ops mptcp_v6_stream_ops = { .ioctl = inet6_ioctl, .gettstamp = sock_gettstamp, .listen = mptcp_listen, - .shutdown = mptcp_shutdown, + .shutdown = inet_shutdown, .setsockopt = sock_common_setsockopt, .getsockopt = sock_common_getsockopt, .sendmsg = inet6_sendmsg, diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 7cf9d110b85f..d67de793d363 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -503,7 +503,7 @@ void mptcp_rcv_space_init(struct mptcp_sock *msk, const struct sock *ssk); void mptcp_data_ready(struct sock *sk, struct sock *ssk); bool mptcp_finish_join(struct sock *sk); bool mptcp_schedule_work(struct sock *sk); -void __mptcp_wnd_updated(struct sock *sk, struct sock *ssk); +void __mptcp_check_push(struct sock *sk, struct sock *ssk); void __mptcp_data_acked(struct sock *sk); void mptcp_subflow_eof(struct sock *sk); bool mptcp_update_rcv_data_fin(struct mptcp_sock *msk, u64 data_fin_seq, bool use_64bit); diff --git a/net/ncsi/ncsi-rsp.c b/net/ncsi/ncsi-rsp.c index 5b1f4ec66dd9..888ccc2d4e34 100644 --- a/net/ncsi/ncsi-rsp.c +++ b/net/ncsi/ncsi-rsp.c @@ -1120,7 +1120,7 @@ int ncsi_rcv_rsp(struct sk_buff *skb, struct net_device *dev, int payload, i, ret; /* Find the NCSI device */ - nd = ncsi_find_dev(dev); + nd = ncsi_find_dev(orig_dev); ndp = nd ? 
TO_NCSI_DEV_PRIV(nd) : NULL; if (!ndp) return -ENODEV; diff --git a/net/netfilter/ipset/ip_set_hash_gen.h b/net/netfilter/ipset/ip_set_hash_gen.h index 5f1208ad049e..6186358eac7c 100644 --- a/net/netfilter/ipset/ip_set_hash_gen.h +++ b/net/netfilter/ipset/ip_set_hash_gen.h @@ -141,20 +141,6 @@ htable_size(u8 hbits) return hsize * sizeof(struct hbucket *) + sizeof(struct htable); } -/* Compute htable_bits from the user input parameter hashsize */ -static u8 -htable_bits(u32 hashsize) -{ - /* Assume that hashsize == 2^htable_bits */ - u8 bits = fls(hashsize - 1); - - if (jhash_size(bits) != hashsize) - /* Round up to the first 2^n value */ - bits = fls(hashsize); - - return bits; -} - #ifdef IP_SET_HASH_WITH_NETS #if IPSET_NET_COUNT > 1 #define __CIDR(cidr, i) (cidr[i]) @@ -640,7 +626,7 @@ mtype_resize(struct ip_set *set, bool retried) struct htype *h = set->data; struct htable *t, *orig; u8 htable_bits; - size_t dsize = set->dsize; + size_t hsize, dsize = set->dsize; #ifdef IP_SET_HASH_WITH_NETS u8 flags; struct mtype_elem *tmp; @@ -664,14 +650,12 @@ mtype_resize(struct ip_set *set, bool retried) retry: ret = 0; htable_bits++; - if (!htable_bits) { - /* In case we have plenty of memory :-) */ - pr_warn("Cannot increase the hashsize of set %s further\n", - set->name); - ret = -IPSET_ERR_HASH_FULL; - goto out; - } - t = ip_set_alloc(htable_size(htable_bits)); + if (!htable_bits) + goto hbwarn; + hsize = htable_size(htable_bits); + if (!hsize) + goto hbwarn; + t = ip_set_alloc(hsize); if (!t) { ret = -ENOMEM; goto out; @@ -813,6 +797,12 @@ cleanup: if (ret == -EAGAIN) goto retry; goto out; + +hbwarn: + /* In case we have plenty of memory :-) */ + pr_warn("Cannot increase the hashsize of set %s further\n", set->name); + ret = -IPSET_ERR_HASH_FULL; + goto out; } /* Get the current number of elements and ext_size in the set */ @@ -1521,7 +1511,11 @@ IPSET_TOKEN(HTYPE, _create)(struct net *net, struct ip_set *set, if (!h) return -ENOMEM; - hbits = htable_bits(hashsize); + /* Compute htable_bits from the user input parameter hashsize. + * Assume that hashsize == 2^htable_bits, + * otherwise round up to the first 2^n value. 
+ */ + hbits = fls(hashsize - 1); hsize = htable_size(hbits); if (hsize == 0) { kfree(h); diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index 46c5557c1fec..0ee702d374b0 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -523,6 +523,9 @@ nf_conntrack_hash_sysctl(struct ctl_table *table, int write, { int ret; + /* module_param hashsize could have changed value */ + nf_conntrack_htable_size_user = nf_conntrack_htable_size; + ret = proc_dointvec(table, write, buffer, lenp, ppos); if (ret < 0 || !write) return ret; diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c index ea923f8cf9c4..b7c3c902290f 100644 --- a/net/netfilter/nf_nat_core.c +++ b/net/netfilter/nf_nat_core.c @@ -1174,6 +1174,7 @@ static int __init nf_nat_init(void) ret = register_pernet_subsys(&nat_net_ops); if (ret < 0) { nf_ct_extend_unregister(&nat_extend); + kvfree(nf_nat_bysource); return ret; } diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 8d5aa0ac45f4..15c467f1a9dd 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -4162,7 +4162,7 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk, if (flags & ~(NFT_SET_ANONYMOUS | NFT_SET_CONSTANT | NFT_SET_INTERVAL | NFT_SET_TIMEOUT | NFT_SET_MAP | NFT_SET_EVAL | - NFT_SET_OBJECT | NFT_SET_CONCAT)) + NFT_SET_OBJECT | NFT_SET_CONCAT | NFT_SET_EXPR)) return -EOPNOTSUPP; /* Only one of these operations is supported */ if ((flags & (NFT_SET_MAP | NFT_SET_OBJECT)) == @@ -4304,6 +4304,10 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk, struct nlattr *tmp; int left; + if (!(flags & NFT_SET_EXPR)) { + err = -EINVAL; + goto err_set_alloc_name; + } i = 0; nla_for_each_nested(tmp, nla[NFTA_SET_EXPRESSIONS], left) { if (i == NFT_SET_EXPR_MAX) { @@ -5254,8 +5258,8 @@ static int nft_set_elem_expr_clone(const struct nft_ctx *ctx, return 0; err_expr: - for (k = i - 1; k >= 0; k++) - nft_expr_destroy(ctx, expr_array[i]); + for (k = i - 1; k >= 0; k--) + nft_expr_destroy(ctx, expr_array[k]); return -ENOMEM; } diff --git a/net/netfilter/nft_dynset.c b/net/netfilter/nft_dynset.c index 983a1d5ca3ab..0b053f75cd60 100644 --- a/net/netfilter/nft_dynset.c +++ b/net/netfilter/nft_dynset.c @@ -19,6 +19,7 @@ struct nft_dynset { enum nft_registers sreg_key:8; enum nft_registers sreg_data:8; bool invert; + bool expr; u8 num_exprs; u64 timeout; struct nft_expr *expr_array[NFT_SET_EXPR_MAX]; @@ -175,11 +176,12 @@ static int nft_dynset_init(const struct nft_ctx *ctx, if (tb[NFTA_DYNSET_FLAGS]) { u32 flags = ntohl(nla_get_be32(tb[NFTA_DYNSET_FLAGS])); - - if (flags & ~NFT_DYNSET_F_INV) - return -EINVAL; + if (flags & ~(NFT_DYNSET_F_INV | NFT_DYNSET_F_EXPR)) + return -EOPNOTSUPP; if (flags & NFT_DYNSET_F_INV) priv->invert = true; + if (flags & NFT_DYNSET_F_EXPR) + priv->expr = true; } set = nft_set_lookup_global(ctx->net, ctx->table, @@ -210,7 +212,7 @@ static int nft_dynset_init(const struct nft_ctx *ctx, timeout = 0; if (tb[NFTA_DYNSET_TIMEOUT] != NULL) { if (!(set->flags & NFT_SET_TIMEOUT)) - return -EINVAL; + return -EOPNOTSUPP; err = nf_msecs_to_jiffies64(tb[NFTA_DYNSET_TIMEOUT], &timeout); if (err) @@ -224,7 +226,7 @@ static int nft_dynset_init(const struct nft_ctx *ctx, if (tb[NFTA_DYNSET_SREG_DATA] != NULL) { if (!(set->flags & NFT_SET_MAP)) - return -EINVAL; + return -EOPNOTSUPP; if (set->dtype == NFT_DATA_VERDICT) return -EOPNOTSUPP; @@ -261,6 +263,9 @@ static int nft_dynset_init(const struct 
nft_ctx *ctx, struct nlattr *tmp; int left; + if (!priv->expr) + return -EINVAL; + i = 0; nla_for_each_nested(tmp, tb[NFTA_DYNSET_EXPRESSIONS], left) { if (i == NFT_SET_EXPR_MAX) { diff --git a/net/netfilter/xt_RATEEST.c b/net/netfilter/xt_RATEEST.c index 37253d399c6b..0d5c422f8745 100644 --- a/net/netfilter/xt_RATEEST.c +++ b/net/netfilter/xt_RATEEST.c @@ -115,6 +115,9 @@ static int xt_rateest_tg_checkentry(const struct xt_tgchk_param *par) } cfg; int ret; + if (strnlen(info->name, sizeof(est->name)) >= sizeof(est->name)) + return -ENAMETOOLONG; + net_get_random_once(&jhash_rnd, sizeof(jhash_rnd)); mutex_lock(&xn->hash_lock); diff --git a/net/nfc/nci/core.c b/net/nfc/nci/core.c index e64727e1a72f..02a1f13f0798 100644 --- a/net/nfc/nci/core.c +++ b/net/nfc/nci/core.c @@ -508,7 +508,7 @@ static int nci_open_device(struct nci_dev *ndev) }; unsigned long opt = 0; - if (!(ndev->nci_ver & NCI_VER_2_MASK)) + if (ndev->nci_ver & NCI_VER_2_MASK) opt = (unsigned long)&nci_init_v2_cmd; rc = __nci_request(ndev, nci_init_req, opt, diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index de8e8dbbdeb8..6bbc7a448593 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -4595,7 +4595,9 @@ static void packet_seq_stop(struct seq_file *seq, void *v) static int packet_seq_show(struct seq_file *seq, void *v) { if (v == SEQ_START_TOKEN) - seq_puts(seq, "sk RefCnt Type Proto Iface R Rmem User Inode\n"); + seq_printf(seq, + "%*sRefCnt Type Proto Iface R Rmem User Inode\n", + IS_ENABLED(CONFIG_64BIT) ? -17 : -9, "sk"); else { struct sock *s = sk_entry(v); const struct packet_sock *po = pkt_sk(s); diff --git a/net/qrtr/ns.c b/net/qrtr/ns.c index 56aaf8cb6527..8d00dfe8139e 100644 --- a/net/qrtr/ns.c +++ b/net/qrtr/ns.c @@ -755,7 +755,7 @@ static void qrtr_ns_data_ready(struct sock *sk) queue_work(qrtr_ns.workqueue, &qrtr_ns.work); } -void qrtr_ns_init(void) +int qrtr_ns_init(void) { struct sockaddr_qrtr sq; int ret; @@ -766,7 +766,7 @@ void qrtr_ns_init(void) ret = sock_create_kern(&init_net, AF_QIPCRTR, SOCK_DGRAM, PF_QIPCRTR, &qrtr_ns.sock); if (ret < 0) - return; + return ret; ret = kernel_getsockname(qrtr_ns.sock, (struct sockaddr *)&sq); if (ret < 0) { @@ -797,12 +797,13 @@ void qrtr_ns_init(void) if (ret < 0) goto err_wq; - return; + return 0; err_wq: destroy_workqueue(qrtr_ns.workqueue); err_sock: sock_release(qrtr_ns.sock); + return ret; } EXPORT_SYMBOL_GPL(qrtr_ns_init); diff --git a/net/qrtr/qrtr.c b/net/qrtr/qrtr.c index f4ab3ca6d73b..b34358282f37 100644 --- a/net/qrtr/qrtr.c +++ b/net/qrtr/qrtr.c @@ -1287,13 +1287,19 @@ static int __init qrtr_proto_init(void) return rc; rc = sock_register(&qrtr_family); - if (rc) { - proto_unregister(&qrtr_proto); - return rc; - } + if (rc) + goto err_proto; - qrtr_ns_init(); + rc = qrtr_ns_init(); + if (rc) + goto err_sock; + return 0; + +err_sock: + sock_unregister(qrtr_family.family); +err_proto: + proto_unregister(&qrtr_proto); return rc; } postcore_initcall(qrtr_proto_init); diff --git a/net/qrtr/qrtr.h b/net/qrtr/qrtr.h index dc2b67f17927..3f2d28696062 100644 --- a/net/qrtr/qrtr.h +++ b/net/qrtr/qrtr.h @@ -29,7 +29,7 @@ void qrtr_endpoint_unregister(struct qrtr_endpoint *ep); int qrtr_endpoint_post(struct qrtr_endpoint *ep, const void *data, size_t len); -void qrtr_ns_init(void); +int qrtr_ns_init(void); void qrtr_ns_remove(void); diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index 667c44aa5a63..dc201363f2c4 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -430,7 +430,7 @@ static void rxrpc_input_data(struct rxrpc_call *call, 
struct sk_buff *skb) return; } - if (call->state == RXRPC_CALL_SERVER_RECV_REQUEST) { + if (state == RXRPC_CALL_SERVER_RECV_REQUEST) { unsigned long timo = READ_ONCE(call->next_req_timo); unsigned long now, expect_req_by; diff --git a/net/rxrpc/key.c b/net/rxrpc/key.c index 9631aa8543b5..8d2073e0e3da 100644 --- a/net/rxrpc/key.c +++ b/net/rxrpc/key.c @@ -598,7 +598,7 @@ static long rxrpc_read(const struct key *key, default: /* we have a ticket we can't encode */ pr_err("Unsupported key token type (%u)\n", token->security_index); - continue; + return -ENOPKG; } _debug("token[%u]: toksize=%u", ntoks, toksize); @@ -674,7 +674,9 @@ static long rxrpc_read(const struct key *key, break; default: - break; + pr_err("Unsupported key token type (%u)\n", + token->security_index); + return -ENOPKG; } ASSERTCMP((unsigned long)xdr - (unsigned long)oldxdr, ==, diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index 1319986693fc..84f932532db7 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -1272,6 +1272,10 @@ static int fl_set_enc_opt(struct nlattr **tb, struct fl_flow_key *key, nla_opt_msk = nla_data(tb[TCA_FLOWER_KEY_ENC_OPTS_MASK]); msk_depth = nla_len(tb[TCA_FLOWER_KEY_ENC_OPTS_MASK]); + if (!nla_ok(nla_opt_msk, msk_depth)) { + NL_SET_ERR_MSG(extack, "Invalid nested attribute for masks"); + return -EINVAL; + } } nla_for_each_attr(nla_opt_key, nla_enc_key, @@ -1307,9 +1311,6 @@ static int fl_set_enc_opt(struct nlattr **tb, struct fl_flow_key *key, NL_SET_ERR_MSG(extack, "Key and mask miss aligned"); return -EINVAL; } - - if (msk_depth) - nla_opt_msk = nla_next(nla_opt_msk, &msk_depth); break; case TCA_FLOWER_KEY_ENC_OPTS_VXLAN: if (key->enc_opts.dst_opt_type) { @@ -1340,9 +1341,6 @@ static int fl_set_enc_opt(struct nlattr **tb, struct fl_flow_key *key, NL_SET_ERR_MSG(extack, "Key and mask miss aligned"); return -EINVAL; } - - if (msk_depth) - nla_opt_msk = nla_next(nla_opt_msk, &msk_depth); break; case TCA_FLOWER_KEY_ENC_OPTS_ERSPAN: if (key->enc_opts.dst_opt_type) { @@ -1373,14 +1371,20 @@ static int fl_set_enc_opt(struct nlattr **tb, struct fl_flow_key *key, NL_SET_ERR_MSG(extack, "Key and mask miss aligned"); return -EINVAL; } - - if (msk_depth) - nla_opt_msk = nla_next(nla_opt_msk, &msk_depth); break; default: NL_SET_ERR_MSG(extack, "Unknown tunnel option type"); return -EINVAL; } + + if (!msk_depth) + continue; + + if (!nla_ok(nla_opt_msk, msk_depth)) { + NL_SET_ERR_MSG(extack, "A mask attribute is invalid"); + return -EINVAL; + } + nla_opt_msk = nla_next(nla_opt_msk, &msk_depth); } return 0; diff --git a/net/sched/cls_tcindex.c b/net/sched/cls_tcindex.c index 78bec347b8b6..c4007b9cd16d 100644 --- a/net/sched/cls_tcindex.c +++ b/net/sched/cls_tcindex.c @@ -366,9 +366,13 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base, if (tb[TCA_TCINDEX_MASK]) cp->mask = nla_get_u16(tb[TCA_TCINDEX_MASK]); - if (tb[TCA_TCINDEX_SHIFT]) + if (tb[TCA_TCINDEX_SHIFT]) { cp->shift = nla_get_u32(tb[TCA_TCINDEX_SHIFT]); - + if (cp->shift > 16) { + err = -EINVAL; + goto errout; + } + } if (!cp->hash) { /* Hash not specified, use perfect hash if the upper limit * of the hashing index is below the threshold. 
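A pattern worth noting before the qdisc hunks that follow (sch_api, sch_choke, sch_gred, sch_red, sch_sfq): each one rejects log-scaled parameters before they are used as shift counts, because a cell_log or Scell_log of 32 or more makes a shift on a 32-bit value undefined behaviour. A minimal user-space sketch of the check, assuming the semantics implied by red_check_params(); red_params_ok and fls32 are illustrative names, not the kernel API:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* fls(): position of the highest set bit, 1-based; 0 when x == 0. */
static int fls32(uint32_t x)
{
	int r = 0;

	while (x) {
		x >>= 1;
		r++;
	}
	return r;
}

/* Wlog and Scell_log are exponents later used as shift counts on u32
 * values, so the scaled thresholds must fit in 32 bits and Scell_log
 * itself must stay below 32. */
static bool red_params_ok(uint32_t qth_min, uint32_t qth_max,
			  uint8_t Wlog, uint8_t Scell_log)
{
	if (fls32(qth_min) + Wlog > 32)
		return false;
	if (fls32(qth_max) + Wlog > 32)
		return false;
	if (Scell_log >= 32)		/* the new check in these hunks */
		return false;
	return qth_max >= qth_min;
}

int main(void)
{
	printf("%d\n", red_params_ok(5, 30, 9, 20));	/* 1: accepted */
	printf("%d\n", red_params_ok(5, 30, 9, 32));	/* 0: shift would be UB */
	return 0;
}

The same reasoning applies to qdisc_get_rtab() below: r->cell_log is used as a shift count when the rate table is consulted, so values of 32 or more are rejected up front.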
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index 51cb553e4317..6fe4e5cc807c 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -412,7 +412,8 @@ struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, { struct qdisc_rate_table *rtab; - if (tab == NULL || r->rate == 0 || r->cell_log == 0 || + if (tab == NULL || r->rate == 0 || + r->cell_log == 0 || r->cell_log >= 32 || nla_len(tab) != TC_RTAB_SIZE) { NL_SET_ERR_MSG(extack, "Invalid rate table parameters for searching"); return NULL; diff --git a/net/sched/sch_choke.c b/net/sched/sch_choke.c index bd618b00d319..50f680f03a54 100644 --- a/net/sched/sch_choke.c +++ b/net/sched/sch_choke.c @@ -362,7 +362,7 @@ static int choke_change(struct Qdisc *sch, struct nlattr *opt, ctl = nla_data(tb[TCA_CHOKE_PARMS]); - if (!red_check_params(ctl->qth_min, ctl->qth_max, ctl->Wlog)) + if (!red_check_params(ctl->qth_min, ctl->qth_max, ctl->Wlog, ctl->Scell_log)) return -EINVAL; if (ctl->limit > CHOKE_MAX_QUEUE) diff --git a/net/sched/sch_gred.c b/net/sched/sch_gred.c index 8599c6f31b05..e0bc77533acc 100644 --- a/net/sched/sch_gred.c +++ b/net/sched/sch_gred.c @@ -480,7 +480,7 @@ static inline int gred_change_vq(struct Qdisc *sch, int dp, struct gred_sched *table = qdisc_priv(sch); struct gred_sched_data *q = table->tab[dp]; - if (!red_check_params(ctl->qth_min, ctl->qth_max, ctl->Wlog)) { + if (!red_check_params(ctl->qth_min, ctl->qth_max, ctl->Wlog, ctl->Scell_log)) { NL_SET_ERR_MSG_MOD(extack, "invalid RED parameters"); return -EINVAL; } diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c index e89fab6ccb34..b4ae34d7aa96 100644 --- a/net/sched/sch_red.c +++ b/net/sched/sch_red.c @@ -250,7 +250,7 @@ static int __red_change(struct Qdisc *sch, struct nlattr **tb, max_P = tb[TCA_RED_MAX_P] ? 
nla_get_u32(tb[TCA_RED_MAX_P]) : 0; ctl = nla_data(tb[TCA_RED_PARMS]); - if (!red_check_params(ctl->qth_min, ctl->qth_max, ctl->Wlog)) + if (!red_check_params(ctl->qth_min, ctl->qth_max, ctl->Wlog, ctl->Scell_log)) return -EINVAL; err = red_get_flags(ctl->flags, TC_RED_HISTORIC_FLAGS, diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c index bca2be57d9fc..b25e51440623 100644 --- a/net/sched/sch_sfq.c +++ b/net/sched/sch_sfq.c @@ -647,7 +647,7 @@ static int sfq_change(struct Qdisc *sch, struct nlattr *opt) } if (ctl_v1 && !red_check_params(ctl_v1->qth_min, ctl_v1->qth_max, - ctl_v1->Wlog)) + ctl_v1->Wlog, ctl_v1->Scell_log)) return -EINVAL; if (ctl_v1 && ctl_v1->qth_min) { p = kmalloc(sizeof(*p), GFP_KERNEL); diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c index 26fb8a62996b..6f775275826a 100644 --- a/net/sched/sch_taprio.c +++ b/net/sched/sch_taprio.c @@ -1597,6 +1597,22 @@ free_sched: return err; } +static void taprio_reset(struct Qdisc *sch) +{ + struct taprio_sched *q = qdisc_priv(sch); + struct net_device *dev = qdisc_dev(sch); + int i; + + hrtimer_cancel(&q->advance_timer); + if (q->qdiscs) { + for (i = 0; i < dev->num_tx_queues; i++) + if (q->qdiscs[i]) + qdisc_reset(q->qdiscs[i]); + } + sch->qstats.backlog = 0; + sch->q.qlen = 0; +} + static void taprio_destroy(struct Qdisc *sch) { struct taprio_sched *q = qdisc_priv(sch); @@ -1607,12 +1623,11 @@ static void taprio_destroy(struct Qdisc *sch) list_del(&q->taprio_list); spin_unlock(&taprio_list_lock); - hrtimer_cancel(&q->advance_timer); taprio_disable_offload(dev, q, NULL); if (q->qdiscs) { - for (i = 0; i < dev->num_tx_queues && q->qdiscs[i]; i++) + for (i = 0; i < dev->num_tx_queues; i++) qdisc_put(q->qdiscs[i]); kfree(q->qdiscs); @@ -1954,6 +1969,7 @@ static struct Qdisc_ops taprio_qdisc_ops __read_mostly = { .init = taprio_init, .change = taprio_change, .destroy = taprio_destroy, + .reset = taprio_reset, .peek = taprio_peek, .dequeue = taprio_dequeue, .enqueue = taprio_enqueue, diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 59342b519e34..0df85a12651e 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -246,7 +246,8 @@ int smc_nl_get_sys_info(struct sk_buff *skb, struct netlink_callback *cb) goto errattr; smc_clc_get_hostname(&host); if (host) { - snprintf(hostname, sizeof(hostname), "%s", host); + memcpy(hostname, host, SMC_MAX_HOSTNAME_LEN); + hostname[SMC_MAX_HOSTNAME_LEN] = 0; if (nla_put_string(skb, SMC_NLA_SYS_LOCAL_HOST, hostname)) goto errattr; } @@ -257,7 +258,8 @@ int smc_nl_get_sys_info(struct sk_buff *skb, struct netlink_callback *cb) smc_ism_get_system_eid(smcd_dev, &seid); mutex_unlock(&smcd_dev_list.mutex); if (seid && smc_ism_is_v2_capable()) { - snprintf(smc_seid, sizeof(smc_seid), "%s", seid); + memcpy(smc_seid, seid, SMC_MAX_EID_LEN); + smc_seid[SMC_MAX_EID_LEN] = 0; if (nla_put_string(skb, SMC_NLA_SYS_SEID, smc_seid)) goto errattr; } @@ -295,7 +297,8 @@ static int smc_nl_fill_lgr(struct smc_link_group *lgr, goto errattr; if (nla_put_u8(skb, SMC_NLA_LGR_R_VLAN_ID, lgr->vlan_id)) goto errattr; - snprintf(smc_target, sizeof(smc_target), "%s", lgr->pnet_id); + memcpy(smc_target, lgr->pnet_id, SMC_MAX_PNETID_LEN); + smc_target[SMC_MAX_PNETID_LEN] = 0; if (nla_put_string(skb, SMC_NLA_LGR_R_PNETID, smc_target)) goto errattr; @@ -312,7 +315,7 @@ static int smc_nl_fill_lgr_link(struct smc_link_group *lgr, struct sk_buff *skb, struct netlink_callback *cb) { - char smc_ibname[IB_DEVICE_NAME_MAX + 1]; + char smc_ibname[IB_DEVICE_NAME_MAX]; u8 smc_gid_target[41]; struct nlattr *attrs; u32 
link_uid = 0; @@ -461,7 +464,8 @@ static int smc_nl_fill_smcd_lgr(struct smc_link_group *lgr, goto errattr; if (nla_put_u32(skb, SMC_NLA_LGR_D_CHID, smc_ism_get_chid(lgr->smcd))) goto errattr; - snprintf(smc_pnet, sizeof(smc_pnet), "%s", lgr->smcd->pnetid); + memcpy(smc_pnet, lgr->smcd->pnetid, SMC_MAX_PNETID_LEN); + smc_pnet[SMC_MAX_PNETID_LEN] = 0; if (nla_put_string(skb, SMC_NLA_LGR_D_PNETID, smc_pnet)) goto errattr; @@ -474,10 +478,12 @@ static int smc_nl_fill_smcd_lgr(struct smc_link_group *lgr, goto errv2attr; if (nla_put_u8(skb, SMC_NLA_LGR_V2_OS, lgr->peer_os)) goto errv2attr; - snprintf(smc_host, sizeof(smc_host), "%s", lgr->peer_hostname); + memcpy(smc_host, lgr->peer_hostname, SMC_MAX_HOSTNAME_LEN); + smc_host[SMC_MAX_HOSTNAME_LEN] = 0; if (nla_put_string(skb, SMC_NLA_LGR_V2_PEER_HOST, smc_host)) goto errv2attr; - snprintf(smc_eid, sizeof(smc_eid), "%s", lgr->negotiated_eid); + memcpy(smc_eid, lgr->negotiated_eid, SMC_MAX_EID_LEN); + smc_eid[SMC_MAX_EID_LEN] = 0; if (nla_put_string(skb, SMC_NLA_LGR_V2_NEG_EID, smc_eid)) goto errv2attr; diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c index 89ea10675a7d..7d7ba0320d5a 100644 --- a/net/smc/smc_ib.c +++ b/net/smc/smc_ib.c @@ -371,8 +371,8 @@ static int smc_nl_handle_dev_port(struct sk_buff *skb, if (nla_put_u8(skb, SMC_NLA_DEV_PORT_PNET_USR, smcibdev->pnetid_by_user[port])) goto errattr; - snprintf(smc_pnet, sizeof(smc_pnet), "%s", - (char *)&smcibdev->pnetid[port]); + memcpy(smc_pnet, &smcibdev->pnetid[port], SMC_MAX_PNETID_LEN); + smc_pnet[SMC_MAX_PNETID_LEN] = 0; if (nla_put_string(skb, SMC_NLA_DEV_PORT_PNETID, smc_pnet)) goto errattr; if (nla_put_u32(skb, SMC_NLA_DEV_PORT_NETDEV, @@ -394,11 +394,27 @@ errout: return -EMSGSIZE; } +static bool smc_nl_handle_pci_values(const struct smc_pci_dev *smc_pci_dev, + struct sk_buff *skb) +{ + if (nla_put_u32(skb, SMC_NLA_DEV_PCI_FID, smc_pci_dev->pci_fid)) + return false; + if (nla_put_u16(skb, SMC_NLA_DEV_PCI_CHID, smc_pci_dev->pci_pchid)) + return false; + if (nla_put_u16(skb, SMC_NLA_DEV_PCI_VENDOR, smc_pci_dev->pci_vendor)) + return false; + if (nla_put_u16(skb, SMC_NLA_DEV_PCI_DEVICE, smc_pci_dev->pci_device)) + return false; + if (nla_put_string(skb, SMC_NLA_DEV_PCI_ID, smc_pci_dev->pci_id)) + return false; + return true; +} + static int smc_nl_handle_smcr_dev(struct smc_ib_device *smcibdev, struct sk_buff *skb, struct netlink_callback *cb) { - char smc_ibname[IB_DEVICE_NAME_MAX + 1]; + char smc_ibname[IB_DEVICE_NAME_MAX]; struct smc_pci_dev smc_pci_dev; struct pci_dev *pci_dev; unsigned char is_crit; @@ -417,19 +433,13 @@ static int smc_nl_handle_smcr_dev(struct smc_ib_device *smcibdev, is_crit = smcr_diag_is_dev_critical(&smc_lgr_list, smcibdev); if (nla_put_u8(skb, SMC_NLA_DEV_IS_CRIT, is_crit)) goto errattr; - memset(&smc_pci_dev, 0, sizeof(smc_pci_dev)); - pci_dev = to_pci_dev(smcibdev->ibdev->dev.parent); - smc_set_pci_values(pci_dev, &smc_pci_dev); - if (nla_put_u32(skb, SMC_NLA_DEV_PCI_FID, smc_pci_dev.pci_fid)) - goto errattr; - if (nla_put_u16(skb, SMC_NLA_DEV_PCI_CHID, smc_pci_dev.pci_pchid)) - goto errattr; - if (nla_put_u16(skb, SMC_NLA_DEV_PCI_VENDOR, smc_pci_dev.pci_vendor)) - goto errattr; - if (nla_put_u16(skb, SMC_NLA_DEV_PCI_DEVICE, smc_pci_dev.pci_device)) - goto errattr; - if (nla_put_string(skb, SMC_NLA_DEV_PCI_ID, smc_pci_dev.pci_id)) - goto errattr; + if (smcibdev->ibdev->dev.parent) { + memset(&smc_pci_dev, 0, sizeof(smc_pci_dev)); + pci_dev = to_pci_dev(smcibdev->ibdev->dev.parent); + smc_set_pci_values(pci_dev, &smc_pci_dev); + if 
(!smc_nl_handle_pci_values(&smc_pci_dev, skb)) + goto errattr; + } snprintf(smc_ibname, sizeof(smc_ibname), "%s", smcibdev->ibdev->name); if (nla_put_string(skb, SMC_NLA_DEV_IB_NAME, smc_ibname)) goto errattr; diff --git a/net/smc/smc_ism.c b/net/smc/smc_ism.c index 524ef64a191a..9c6e95882553 100644 --- a/net/smc/smc_ism.c +++ b/net/smc/smc_ism.c @@ -250,7 +250,8 @@ static int smc_nl_handle_smcd_dev(struct smcd_dev *smcd, goto errattr; if (nla_put_u8(skb, SMC_NLA_DEV_PORT_PNET_USR, smcd->pnetid_by_user)) goto errportattr; - snprintf(smc_pnet, sizeof(smc_pnet), "%s", smcd->pnetid); + memcpy(smc_pnet, smcd->pnetid, SMC_MAX_PNETID_LEN); + smc_pnet[SMC_MAX_PNETID_LEN] = 0; if (nla_put_string(skb, SMC_NLA_DEV_PORT_PNETID, smc_pnet)) goto errportattr; diff --git a/net/sunrpc/addr.c b/net/sunrpc/addr.c index 010dcb876f9d..6e4dbd577a39 100644 --- a/net/sunrpc/addr.c +++ b/net/sunrpc/addr.c @@ -185,7 +185,7 @@ static int rpc_parse_scope_id(struct net *net, const char *buf, scope_id = dev->ifindex; dev_put(dev); } else { - if (kstrtou32(p, 10, &scope_id) == 0) { + if (kstrtou32(p, 10, &scope_id) != 0) { kfree(p); return 0; } diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 3259120462ed..612f0a641f4c 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -1251,10 +1251,7 @@ void rpc_prepare_reply_pages(struct rpc_rqst *req, struct page **pages, unsigned int base, unsigned int len, unsigned int hdrsize) { - /* Subtract one to force an extra word of buffer space for the - * payload's XDR pad to fall into the rcv_buf's tail iovec. - */ - hdrsize += RPC_REPHDRSIZE + req->rq_cred->cr_auth->au_ralign - 1; + hdrsize += RPC_REPHDRSIZE + req->rq_cred->cr_auth->au_ralign; xdr_inline_pages(&req->rq_rcv_buf, hdrsize << 2, pages, base, len); trace_rpc_xdr_reply_pages(req->rq_task, &req->rq_rcv_buf); diff --git a/net/sunrpc/debugfs.c b/net/sunrpc/debugfs.c index fd9bca242724..56029e3af6ff 100644 --- a/net/sunrpc/debugfs.c +++ b/net/sunrpc/debugfs.c @@ -128,13 +128,13 @@ static int do_xprt_debugfs(struct rpc_clnt *clnt, struct rpc_xprt *xprt, void *n return 0; len = snprintf(name, sizeof(name), "../../rpc_xprt/%s", xprt->debugfs->d_name.name); - if (len > sizeof(name)) + if (len >= sizeof(name)) return -1; if (*nump == 0) strcpy(link, "xprt"); else { len = snprintf(link, sizeof(link), "xprt%d", *nump); - if (len > sizeof(link)) + if (len >= sizeof(link)) return -1; } debugfs_create_symlink(link, clnt->cl_debugfs, name); diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index f06d7c315017..cf702a5f7fe5 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -676,6 +676,23 @@ struct rpc_task *rpc_wake_up_next(struct rpc_wait_queue *queue) EXPORT_SYMBOL_GPL(rpc_wake_up_next); /** + * rpc_wake_up_locked - wake up all rpc_tasks + * @queue: rpc_wait_queue on which the tasks are sleeping + * + */ +static void rpc_wake_up_locked(struct rpc_wait_queue *queue) +{ + struct rpc_task *task; + + for (;;) { + task = __rpc_find_next_queued(queue); + if (task == NULL) + break; + rpc_wake_up_task_queue_locked(queue, task); + } +} + +/** * rpc_wake_up - wake up all rpc_tasks * @queue: rpc_wait_queue on which the tasks are sleeping * @@ -683,25 +700,28 @@ EXPORT_SYMBOL_GPL(rpc_wake_up_next); */ void rpc_wake_up(struct rpc_wait_queue *queue) { - struct list_head *head; - spin_lock(&queue->lock); - head = &queue->tasks[queue->maxpriority]; + rpc_wake_up_locked(queue); + spin_unlock(&queue->lock); +} +EXPORT_SYMBOL_GPL(rpc_wake_up); + +/** + * rpc_wake_up_status_locked - wake up all rpc_tasks and set their status value. 
+ * @queue: rpc_wait_queue on which the tasks are sleeping + * @status: status value to set + */ +static void rpc_wake_up_status_locked(struct rpc_wait_queue *queue, int status) +{ + struct rpc_task *task; + for (;;) { - while (!list_empty(head)) { - struct rpc_task *task; - task = list_first_entry(head, - struct rpc_task, - u.tk_wait.list); - rpc_wake_up_task_queue_locked(queue, task); - } - if (head == &queue->tasks[0]) + task = __rpc_find_next_queued(queue); + if (task == NULL) break; - head--; + rpc_wake_up_task_queue_set_status_locked(queue, task, status); } - spin_unlock(&queue->lock); } -EXPORT_SYMBOL_GPL(rpc_wake_up); /** * rpc_wake_up_status - wake up all rpc_tasks and set their status value. @@ -712,23 +732,8 @@ EXPORT_SYMBOL_GPL(rpc_wake_up); */ void rpc_wake_up_status(struct rpc_wait_queue *queue, int status) { - struct list_head *head; - spin_lock(&queue->lock); - head = &queue->tasks[queue->maxpriority]; - for (;;) { - while (!list_empty(head)) { - struct rpc_task *task; - task = list_first_entry(head, - struct rpc_task, - u.tk_wait.list); - task->tk_status = status; - rpc_wake_up_task_queue_locked(queue, task); - } - if (head == &queue->tasks[0]) - break; - head--; - } + rpc_wake_up_status_locked(queue, status); spin_unlock(&queue->lock); } EXPORT_SYMBOL_GPL(rpc_wake_up_status); diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index 5fb9164aa690..dcc50ae54550 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -857,6 +857,7 @@ int svc_recv(struct svc_rqst *rqstp, long timeout) err = -EAGAIN; if (len <= 0) goto out_release; + trace_svc_xdr_recvfrom(&rqstp->rq_arg); clear_bit(XPT_OLD, &xprt->xpt_flags); @@ -866,7 +867,6 @@ int svc_recv(struct svc_rqst *rqstp, long timeout) if (serv->sv_stats) serv->sv_stats->netcnt++; - trace_svc_xdr_recvfrom(rqstp, &rqstp->rq_arg); return len; out_release: rqstp->rq_res.len = 0; @@ -904,7 +904,7 @@ int svc_send(struct svc_rqst *rqstp) xb->len = xb->head[0].iov_len + xb->page_len + xb->tail[0].iov_len; - trace_svc_xdr_sendto(rqstp, xb); + trace_svc_xdr_sendto(rqstp->rq_xid, xb); trace_svc_stats_latency(rqstp); len = xprt->xpt_ops->xpo_sendto(rqstp); diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index b248f2349437..c9766d07eb81 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -1062,6 +1062,90 @@ err_noclose: return 0; /* record not complete */ } +static int svc_tcp_send_kvec(struct socket *sock, const struct kvec *vec, + int flags) +{ + return kernel_sendpage(sock, virt_to_page(vec->iov_base), + offset_in_page(vec->iov_base), + vec->iov_len, flags); +} + +/* + * kernel_sendpage() is used exclusively to reduce the number of + * copy operations in this path. Therefore the caller must ensure + * that the pages backing @xdr are unchanging. + * + * In addition, the logic assumes that * .bv_len is never larger + * than PAGE_SIZE. + */ +static int svc_tcp_sendmsg(struct socket *sock, struct msghdr *msg, + struct xdr_buf *xdr, rpc_fraghdr marker, + unsigned int *sentp) +{ + const struct kvec *head = xdr->head; + const struct kvec *tail = xdr->tail; + struct kvec rm = { + .iov_base = &marker, + .iov_len = sizeof(marker), + }; + int flags, ret; + + *sentp = 0; + xdr_alloc_bvec(xdr, GFP_KERNEL); + + msg->msg_flags = MSG_MORE; + ret = kernel_sendmsg(sock, msg, &rm, 1, rm.iov_len); + if (ret < 0) + return ret; + *sentp += ret; + if (ret != rm.iov_len) + return -EAGAIN; + + flags = head->iov_len < xdr->len ? 
MSG_MORE | MSG_SENDPAGE_NOTLAST : 0; + ret = svc_tcp_send_kvec(sock, head, flags); + if (ret < 0) + return ret; + *sentp += ret; + if (ret != head->iov_len) + goto out; + + if (xdr->page_len) { + unsigned int offset, len, remaining; + struct bio_vec *bvec; + + bvec = xdr->bvec; + offset = xdr->page_base; + remaining = xdr->page_len; + flags = MSG_MORE | MSG_SENDPAGE_NOTLAST; + while (remaining > 0) { + if (remaining <= PAGE_SIZE && tail->iov_len == 0) + flags = 0; + len = min(remaining, bvec->bv_len); + ret = kernel_sendpage(sock, bvec->bv_page, + bvec->bv_offset + offset, + len, flags); + if (ret < 0) + return ret; + *sentp += ret; + if (ret != len) + goto out; + remaining -= len; + offset = 0; + bvec++; + } + } + + if (tail->iov_len) { + ret = svc_tcp_send_kvec(sock, tail, 0); + if (ret < 0) + return ret; + *sentp += ret; + } + +out: + return 0; +} + /** * svc_tcp_sendto - Send out a reply on a TCP socket * @rqstp: completed svc_rqst @@ -1089,7 +1173,7 @@ static int svc_tcp_sendto(struct svc_rqst *rqstp) mutex_lock(&xprt->xpt_mutex); if (svc_xprt_is_dead(xprt)) goto out_notconn; - err = xprt_sock_sendmsg(svsk->sk_sock, &msg, xdr, 0, marker, &sent); + err = svc_tcp_sendmsg(svsk->sk_sock, &msg, xdr, marker, &sent); xdr_free_bvec(xdr); trace_svcsock_tcp_send(xprt, err < 0 ? err : sent); if (err < 0 || sent != (xdr->len + sizeof(marker))) diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c index 757560a3b06b..3964ff74ee51 100644 --- a/net/sunrpc/xdr.c +++ b/net/sunrpc/xdr.c @@ -123,8 +123,7 @@ EXPORT_SYMBOL_GPL(xdr_decode_string_inplace); * @len: length of string, in bytes * */ -void -xdr_terminate_string(struct xdr_buf *buf, const u32 len) +void xdr_terminate_string(const struct xdr_buf *buf, const u32 len) { char *kaddr; @@ -134,8 +133,7 @@ xdr_terminate_string(struct xdr_buf *buf, const u32 len) } EXPORT_SYMBOL_GPL(xdr_terminate_string); -size_t -xdr_buf_pagecount(struct xdr_buf *buf) +size_t xdr_buf_pagecount(const struct xdr_buf *buf) { if (!buf->page_len) return 0; @@ -193,9 +191,6 @@ xdr_inline_pages(struct xdr_buf *xdr, unsigned int offset, tail->iov_base = buf + offset; tail->iov_len = buflen - offset; - if ((xdr->page_len & 3) == 0) - tail->iov_len -= sizeof(__be32); - xdr->buflen += len; } EXPORT_SYMBOL_GPL(xdr_inline_pages); @@ -228,6 +223,9 @@ _shift_data_left_pages(struct page **pages, size_t pgto_base, BUG_ON(pgfrom_base <= pgto_base); + if (!len) + return; + pgto = pages + (pgto_base >> PAGE_SHIFT); pgfrom = pages + (pgfrom_base >> PAGE_SHIFT); @@ -266,26 +264,6 @@ _shift_data_left_pages(struct page **pages, size_t pgto_base, } while ((len -= copy) != 0); } -static void -_shift_data_left_tail(struct xdr_buf *buf, unsigned int pgto, size_t len) -{ - struct kvec *tail = buf->tail; - - if (len > tail->iov_len) - len = tail->iov_len; - - _copy_to_pages(buf->pages, - buf->page_base + pgto, - (char *)tail->iov_base, - len); - tail->iov_len -= len; - - if (tail->iov_len > 0) - memmove((char *)tail->iov_base, - tail->iov_base + len, - tail->iov_len); -} - /** * _shift_data_right_pages * @pages: vector of pages containing both the source and dest memory area. 
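Before the remaining xdr.c hunks, a note on the recurring "if (!len) return;" guards: the copy/shift/zero helpers in this file are built around do/while loops, so their bodies, including kmap and page-pointer arithmetic, execute at least once even when asked to move zero bytes. The guards turn the zero-length case into a true no-op, which matters because the new shift helpers below routinely call them with len == 0. A toy model of the control flow (illustrative names, not the kernel API):

#include <stdio.h>

/* Mirrors the shape of the kernel's do/while copy loops: the body
 * always runs at least once, so a zero-length call would still touch
 * page pointers unless callers bail out early. */
static int passes(unsigned int len, unsigned int chunk)
{
	int n = 0;
	unsigned int copy;

	do {
		copy = chunk < len ? chunk : len;
		n++;	/* kmap/copy/flush would happen here */
	} while ((len -= copy) != 0);
	return n;
}

int main(void)
{
	printf("%d\n", passes(10, 4));	/* 3 chunks */
	printf("%d\n", passes(0, 4));	/* 1: body ran despite len == 0 */
	return 0;
}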
@@ -310,6 +288,9 @@ _shift_data_right_pages(struct page **pages, size_t pgto_base, BUG_ON(pgto_base <= pgfrom_base); + if (!len) + return; + pgto_base += len; pgfrom_base += len; @@ -351,46 +332,6 @@ _shift_data_right_pages(struct page **pages, size_t pgto_base, } while ((len -= copy) != 0); } -static unsigned int -_shift_data_right_tail(struct xdr_buf *buf, unsigned int pgfrom, size_t len) -{ - struct kvec *tail = buf->tail; - unsigned int tailbuf_len; - unsigned int result = 0; - size_t copy; - - tailbuf_len = buf->buflen - buf->head->iov_len - buf->page_len; - - /* Shift the tail first */ - if (tailbuf_len != 0) { - unsigned int free_space = tailbuf_len - tail->iov_len; - - if (len < free_space) - free_space = len; - if (len > free_space) - len = free_space; - - tail->iov_len += free_space; - copy = len; - - if (tail->iov_len > len) { - char *p = (char *)tail->iov_base + len; - memmove(p, tail->iov_base, tail->iov_len - free_space); - result += tail->iov_len - free_space; - } else - copy = tail->iov_len; - - /* Copy from the inlined pages into the tail */ - _copy_from_pages((char *)tail->iov_base, - buf->pages, - buf->page_base + pgfrom, - copy); - result += copy; - } - - return result; -} - /** * _copy_to_pages * @pages: array of pages @@ -408,6 +349,9 @@ _copy_to_pages(struct page **pages, size_t pgbase, const char *p, size_t len) char *vto; size_t copy; + if (!len) + return; + pgto = pages + (pgbase >> PAGE_SHIFT); pgbase &= ~PAGE_MASK; @@ -452,6 +396,9 @@ _copy_from_pages(char *p, struct page **pages, size_t pgbase, size_t len) char *vfrom; size_t copy; + if (!len) + return; + pgfrom = pages + (pgbase >> PAGE_SHIFT); pgbase &= ~PAGE_MASK; @@ -475,18 +422,42 @@ _copy_from_pages(char *p, struct page **pages, size_t pgbase, size_t len) } EXPORT_SYMBOL_GPL(_copy_from_pages); +static void xdr_buf_iov_zero(const struct kvec *iov, unsigned int base, + unsigned int len) +{ + if (base >= iov->iov_len) + return; + if (len > iov->iov_len - base) + len = iov->iov_len - base; + memset(iov->iov_base + base, 0, len); +} + /** - * _zero_pages - * @pages: array of pages - * @pgbase: beginning page vector address + * xdr_buf_pages_zero + * @buf: xdr_buf + * @pgbase: beginning offset + * @len: length */ -static void -_zero_pages(struct page **pages, size_t pgbase, size_t len) +static void xdr_buf_pages_zero(const struct xdr_buf *buf, unsigned int pgbase, + unsigned int len) { + struct page **pages = buf->pages; struct page **page; char *vpage; - size_t zero; + unsigned int zero; + + if (!len) + return; + if (pgbase >= buf->page_len) { + xdr_buf_iov_zero(buf->tail, pgbase - buf->page_len, len); + return; + } + if (pgbase + len > buf->page_len) { + xdr_buf_iov_zero(buf->tail, 0, pgbase + len - buf->page_len); + len = buf->page_len - pgbase; + } + + pgbase += buf->page_base; page = pages + (pgbase >> PAGE_SHIFT); pgbase &= ~PAGE_MASK; @@ -507,122 +478,367 @@ _zero_pages(struct page **pages, size_t pgbase, size_t len) } while ((len -= zero) != 0); } +static unsigned int xdr_buf_pages_fill_sparse(const struct xdr_buf *buf, + unsigned int buflen, gfp_t gfp) +{ + unsigned int i, npages, pagelen; + + if (!(buf->flags & XDRBUF_SPARSE_PAGES)) + return buflen; + if (buflen <= buf->head->iov_len) + return buflen; + pagelen = buflen - buf->head->iov_len; + if (pagelen > buf->page_len) + pagelen = buf->page_len; + npages = (pagelen + buf->page_base + PAGE_SIZE - 1) >> PAGE_SHIFT; + for (i = 0; i < npages; i++) { + if (buf->pages[i]) + continue; + buf->pages[i] = alloc_page(gfp); + if (likely(buf->pages[i]))
continue; + buflen -= pagelen; + pagelen = i << PAGE_SHIFT; + if (pagelen > buf->page_base) + buflen += pagelen - buf->page_base; + break; + } + return buflen; +} + +static void xdr_buf_try_expand(struct xdr_buf *buf, unsigned int len) +{ + struct kvec *head = buf->head; + struct kvec *tail = buf->tail; + unsigned int sum = head->iov_len + buf->page_len + tail->iov_len; + unsigned int free_space, newlen; + + if (sum > buf->len) { + free_space = min_t(unsigned int, sum - buf->len, len); + newlen = xdr_buf_pages_fill_sparse(buf, buf->len + free_space, + GFP_KERNEL); + free_space = newlen - buf->len; + buf->len = newlen; + len -= free_space; + if (!len) + return; + } + + if (buf->buflen > sum) { + /* Expand the tail buffer */ + free_space = min_t(unsigned int, buf->buflen - sum, len); + tail->iov_len += free_space; + buf->len += free_space; + } +} + +static void xdr_buf_tail_copy_right(const struct xdr_buf *buf, + unsigned int base, unsigned int len, + unsigned int shift) +{ + const struct kvec *tail = buf->tail; + unsigned int to = base + shift; + + if (to >= tail->iov_len) + return; + if (len + to > tail->iov_len) + len = tail->iov_len - to; + memmove(tail->iov_base + to, tail->iov_base + base, len); +} + +static void xdr_buf_pages_copy_right(const struct xdr_buf *buf, + unsigned int base, unsigned int len, + unsigned int shift) +{ + const struct kvec *tail = buf->tail; + unsigned int to = base + shift; + unsigned int pglen = 0; + unsigned int talen = 0, tato = 0; + + if (base >= buf->page_len) + return; + if (len > buf->page_len - base) + len = buf->page_len - base; + if (to >= buf->page_len) { + tato = to - buf->page_len; + if (tail->iov_len >= len + tato) + talen = len; + else if (tail->iov_len > tato) + talen = tail->iov_len - tato; + } else if (len + to >= buf->page_len) { + pglen = buf->page_len - to; + talen = len - pglen; + if (talen > tail->iov_len) + talen = tail->iov_len; + } else + pglen = len; + + _copy_from_pages(tail->iov_base + tato, buf->pages, + buf->page_base + base + pglen, talen); + _shift_data_right_pages(buf->pages, buf->page_base + to, + buf->page_base + base, pglen); +} + +static void xdr_buf_head_copy_right(const struct xdr_buf *buf, + unsigned int base, unsigned int len, + unsigned int shift) +{ + const struct kvec *head = buf->head; + const struct kvec *tail = buf->tail; + unsigned int to = base + shift; + unsigned int pglen = 0, pgto = 0; + unsigned int talen = 0, tato = 0; + + if (base >= head->iov_len) + return; + if (len > head->iov_len - base) + len = head->iov_len - base; + if (to >= buf->page_len + head->iov_len) { + tato = to - buf->page_len - head->iov_len; + talen = len; + } else if (to >= head->iov_len) { + pgto = to - head->iov_len; + pglen = len; + if (pgto + pglen > buf->page_len) { + talen = pgto + pglen - buf->page_len; + pglen -= talen; + } + } else { + pglen = len - to; + if (pglen > buf->page_len) { + talen = pglen - buf->page_len; + pglen = buf->page_len; + } + } + + len -= talen; + base += len; + if (talen + tato > tail->iov_len) + talen = tail->iov_len > tato ? 
tail->iov_len - tato : 0; + memcpy(tail->iov_base + tato, head->iov_base + base, talen); + + len -= pglen; + base -= pglen; + _copy_to_pages(buf->pages, buf->page_base + pgto, head->iov_base + base, + pglen); + + base -= len; + memmove(head->iov_base + to, head->iov_base + base, len); +} + +static void xdr_buf_tail_shift_right(const struct xdr_buf *buf, + unsigned int base, unsigned int len, + unsigned int shift) +{ + const struct kvec *tail = buf->tail; + + if (base >= tail->iov_len || !shift || !len) + return; + xdr_buf_tail_copy_right(buf, base, len, shift); +} + +static void xdr_buf_pages_shift_right(const struct xdr_buf *buf, + unsigned int base, unsigned int len, + unsigned int shift) +{ + if (!shift || !len) + return; + if (base >= buf->page_len) { + xdr_buf_tail_shift_right(buf, base - buf->page_len, len, shift); + return; + } + if (base + len > buf->page_len) + xdr_buf_tail_shift_right(buf, 0, base + len - buf->page_len, + shift); + xdr_buf_pages_copy_right(buf, base, len, shift); +} + +static void xdr_buf_head_shift_right(const struct xdr_buf *buf, + unsigned int base, unsigned int len, + unsigned int shift) +{ + const struct kvec *head = buf->head; + + if (!shift) + return; + if (base >= head->iov_len) { + xdr_buf_pages_shift_right(buf, head->iov_len - base, len, + shift); + return; + } + if (base + len > head->iov_len) + xdr_buf_pages_shift_right(buf, 0, base + len - head->iov_len, + shift); + xdr_buf_head_copy_right(buf, base, len, shift); +} + +static void xdr_buf_tail_copy_left(const struct xdr_buf *buf, unsigned int base, + unsigned int len, unsigned int shift) +{ + const struct kvec *tail = buf->tail; + + if (base >= tail->iov_len) + return; + if (len > tail->iov_len - base) + len = tail->iov_len - base; + /* Shift data into head */ + if (shift > buf->page_len + base) { + const struct kvec *head = buf->head; + unsigned int hdto = + head->iov_len + buf->page_len + base - shift; + unsigned int hdlen = len; + + if (WARN_ONCE(shift > head->iov_len + buf->page_len + base, + "SUNRPC: Misaligned data.\n")) + return; + if (hdto + hdlen > head->iov_len) + hdlen = head->iov_len - hdto; + memcpy(head->iov_base + hdto, tail->iov_base + base, hdlen); + base += hdlen; + len -= hdlen; + if (!len) + return; + } + /* Shift data into pages */ + if (shift > base) { + unsigned int pgto = buf->page_len + base - shift; + unsigned int pglen = len; + + if (pgto + pglen > buf->page_len) + pglen = buf->page_len - pgto; + _copy_to_pages(buf->pages, buf->page_base + pgto, + tail->iov_base + base, pglen); + base += pglen; + len -= pglen; + if (!len) + return; + } + memmove(tail->iov_base + base - shift, tail->iov_base + base, len); +} + +static void xdr_buf_pages_copy_left(const struct xdr_buf *buf, + unsigned int base, unsigned int len, + unsigned int shift) +{ + unsigned int pgto; + + if (base >= buf->page_len) + return; + if (len > buf->page_len - base) + len = buf->page_len - base; + /* Shift data into head */ + if (shift > base) { + const struct kvec *head = buf->head; + unsigned int hdto = head->iov_len + base - shift; + unsigned int hdlen = len; + + if (WARN_ONCE(shift > head->iov_len + base, + "SUNRPC: Misaligned data.\n")) + return; + if (hdto + hdlen > head->iov_len) + hdlen = head->iov_len - hdto; + _copy_from_pages(head->iov_base + hdto, buf->pages, + buf->page_base + base, hdlen); + base += hdlen; + len -= hdlen; + if (!len) + return; + } + pgto = base - shift; + _shift_data_left_pages(buf->pages, buf->page_base + pgto, + buf->page_base + base, len); +} + +static void 
xdr_buf_tail_shift_left(const struct xdr_buf *buf, + unsigned int base, unsigned int len, + unsigned int shift) +{ + if (!shift || !len) + return; + xdr_buf_tail_copy_left(buf, base, len, shift); +} + +static void xdr_buf_pages_shift_left(const struct xdr_buf *buf, + unsigned int base, unsigned int len, + unsigned int shift) +{ + if (!shift || !len) + return; + if (base >= buf->page_len) { + xdr_buf_tail_shift_left(buf, base - buf->page_len, len, shift); + return; + } + xdr_buf_pages_copy_left(buf, base, len, shift); + len += base; + if (len <= buf->page_len) + return; + xdr_buf_tail_copy_left(buf, 0, len - buf->page_len, shift); +} + /** * xdr_shrink_bufhead * @buf: xdr_buf - * @len: bytes to remove from buf->head[0] + * @len: new length of buf->head[0] * - * Shrinks XDR buffer's header kvec buf->head[0] by + * Shrinks XDR buffer's header kvec buf->head[0], setting it to * 'len' bytes. The extra data is not lost, but is instead * moved into the inlined pages and/or the tail. */ -static unsigned int -xdr_shrink_bufhead(struct xdr_buf *buf, size_t len) +static unsigned int xdr_shrink_bufhead(struct xdr_buf *buf, unsigned int len) { - struct kvec *head, *tail; - size_t copy, offs; - unsigned int pglen = buf->page_len; - unsigned int result; - - result = 0; - tail = buf->tail; - head = buf->head; + struct kvec *head = buf->head; + unsigned int shift, buflen = max(buf->len, len); WARN_ON_ONCE(len > head->iov_len); - if (len > head->iov_len) - len = head->iov_len; - - /* Shift the tail first */ - if (tail->iov_len != 0) { - if (tail->iov_len > len) { - copy = tail->iov_len - len; - memmove((char *)tail->iov_base + len, - tail->iov_base, copy); - result += copy; - } - /* Copy from the inlined pages into the tail */ - copy = len; - if (copy > pglen) - copy = pglen; - offs = len - copy; - if (offs >= tail->iov_len) - copy = 0; - else if (copy > tail->iov_len - offs) - copy = tail->iov_len - offs; - if (copy != 0) { - _copy_from_pages((char *)tail->iov_base + offs, - buf->pages, - buf->page_base + pglen + offs - len, - copy); - result += copy; - } - /* Do we also need to copy data from the head into the tail ? */ - if (len > pglen) { - offs = copy = len - pglen; - if (copy > tail->iov_len) - copy = tail->iov_len; - memcpy(tail->iov_base, - (char *)head->iov_base + - head->iov_len - offs, - copy); - result += copy; - } - } - /* Now handle pages */ - if (pglen != 0) { - if (pglen > len) - _shift_data_right_pages(buf->pages, - buf->page_base + len, - buf->page_base, - pglen - len); - copy = len; - if (len > pglen) - copy = pglen; - _copy_to_pages(buf->pages, buf->page_base, - (char *)head->iov_base + head->iov_len - len, - copy); - result += copy; + if (head->iov_len > buflen) { + buf->buflen -= head->iov_len - buflen; + head->iov_len = buflen; } - head->iov_len -= len; - buf->buflen -= len; - /* Have we truncated the message? */ - if (buf->len > buf->buflen) - buf->len = buf->buflen; - - return result; + if (len >= head->iov_len) + return 0; + shift = head->iov_len - len; + xdr_buf_try_expand(buf, shift); + xdr_buf_head_shift_right(buf, len, buflen - len, shift); + head->iov_len = len; + buf->buflen -= shift; + buf->len -= shift; + return shift; } /** - * xdr_shrink_pagelen - shrinks buf->pages by up to @len bytes + * xdr_shrink_pagelen - shrinks buf->pages to @len bytes * @buf: xdr_buf - * @len: bytes to remove from buf->pages + * @len: new page buffer length * * The extra data is not lost, but is instead moved into buf->tail. * Returns the actual number of bytes moved. 
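 * For example, with buf->page_len == 4096 and @len == 1024, the
 * 3072 bytes beyond the new length are shifted right into buf->tail
 * (expanding the tail if buf->buflen permits) and 3072 is returned.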
*/ -static unsigned int -xdr_shrink_pagelen(struct xdr_buf *buf, size_t len) +static unsigned int xdr_shrink_pagelen(struct xdr_buf *buf, unsigned int len) { - unsigned int pglen = buf->page_len; - unsigned int result; + unsigned int shift, buflen = buf->len - buf->head->iov_len; - if (len > buf->page_len) - len = buf-> page_len; - - result = _shift_data_right_tail(buf, pglen - len, len); - buf->page_len -= len; - buf->buflen -= len; - /* Have we truncated the message? */ - if (buf->len > buf->buflen) - buf->len = buf->buflen; - - return result; + WARN_ON_ONCE(len > buf->page_len); + if (buf->head->iov_len >= buf->len || len > buflen) + buflen = len; + if (buf->page_len > buflen) { + buf->buflen -= buf->page_len - buflen; + buf->page_len = buflen; + } + if (len >= buf->page_len) + return 0; + shift = buf->page_len - len; + xdr_buf_try_expand(buf, shift); + xdr_buf_pages_shift_right(buf, len, buflen - len, shift); + buf->page_len = len; + buf->len -= shift; + buf->buflen -= shift; + return shift; } void xdr_shift_buf(struct xdr_buf *buf, size_t len) { - xdr_shrink_bufhead(buf, len); + xdr_shrink_bufhead(buf, buf->head->iov_len - len); } EXPORT_SYMBOL_GPL(xdr_shift_buf); @@ -636,6 +852,18 @@ unsigned int xdr_stream_pos(const struct xdr_stream *xdr) } EXPORT_SYMBOL_GPL(xdr_stream_pos); +static void xdr_stream_set_pos(struct xdr_stream *xdr, unsigned int pos) +{ + unsigned int blen = xdr->buf->len; + + xdr->nwords = blen > pos ? XDR_QUADLEN(blen) - XDR_QUADLEN(pos) : 0; +} + +static void xdr_stream_page_set_pos(struct xdr_stream *xdr, unsigned int pos) +{ + xdr_stream_set_pos(xdr, pos + xdr->buf->head[0].iov_len); +} + /** * xdr_page_pos - Return the current offset from the start of the xdr pages * @xdr: pointer to struct xdr_stream @@ -969,19 +1197,31 @@ void xdr_write_pages(struct xdr_stream *xdr, struct page **pages, unsigned int b } EXPORT_SYMBOL_GPL(xdr_write_pages); -static void xdr_set_iov(struct xdr_stream *xdr, struct kvec *iov, - unsigned int len) +static unsigned int xdr_set_iov(struct xdr_stream *xdr, struct kvec *iov, + unsigned int base, unsigned int len) { if (len > iov->iov_len) len = iov->iov_len; - xdr->p = (__be32*)iov->iov_base; + if (unlikely(base > len)) + base = len; + xdr->p = (__be32*)(iov->iov_base + base); xdr->end = (__be32*)(iov->iov_base + len); xdr->iov = iov; xdr->page_ptr = NULL; + return len - base; +} + +static unsigned int xdr_set_tail_base(struct xdr_stream *xdr, + unsigned int base, unsigned int len) +{ + struct xdr_buf *buf = xdr->buf; + + xdr_stream_set_pos(xdr, base + buf->page_len + buf->head->iov_len); + return xdr_set_iov(xdr, buf->tail, base, len); } -static int xdr_set_page_base(struct xdr_stream *xdr, - unsigned int base, unsigned int len) +static unsigned int xdr_set_page_base(struct xdr_stream *xdr, + unsigned int base, unsigned int len) { unsigned int pgnr; unsigned int maxlen; @@ -990,12 +1230,15 @@ static int xdr_set_page_base(struct xdr_stream *xdr, void *kaddr; maxlen = xdr->buf->page_len; - if (base >= maxlen) - return -EINVAL; - maxlen -= base; + if (base >= maxlen) { + base = maxlen; + maxlen = 0; + } else + maxlen -= base; if (len > maxlen) len = maxlen; + xdr_stream_page_set_pos(xdr, base); base += xdr->buf->page_base; pgnr = base >> PAGE_SHIFT; @@ -1010,14 +1253,16 @@ static int xdr_set_page_base(struct xdr_stream *xdr, pgend = PAGE_SIZE; xdr->end = (__be32*)(kaddr + pgend); xdr->iov = NULL; - return 0; + return len; } static void xdr_set_page(struct xdr_stream *xdr, unsigned int base, unsigned int len) { - if (xdr_set_page_base(xdr, 
base, len) < 0) - xdr_set_iov(xdr, xdr->buf->tail, xdr->nwords << 2); + if (xdr_set_page_base(xdr, base, len) == 0) { + base -= xdr->buf->page_len; + xdr_set_tail_base(xdr, base, len); + } } static void xdr_set_next_page(struct xdr_stream *xdr) @@ -1026,17 +1271,18 @@ static void xdr_set_next_page(struct xdr_stream *xdr) newbase = (1 + xdr->page_ptr - xdr->buf->pages) << PAGE_SHIFT; newbase -= xdr->buf->page_base; - - xdr_set_page(xdr, newbase, PAGE_SIZE); + if (newbase < xdr->buf->page_len) + xdr_set_page_base(xdr, newbase, xdr_stream_remaining(xdr)); + else + xdr_set_tail_base(xdr, 0, xdr_stream_remaining(xdr)); } static bool xdr_set_next_buffer(struct xdr_stream *xdr) { if (xdr->page_ptr != NULL) xdr_set_next_page(xdr); - else if (xdr->iov == xdr->buf->head) { - xdr_set_page(xdr, 0, PAGE_SIZE); - } + else if (xdr->iov == xdr->buf->head) + xdr_set_page(xdr, 0, xdr_stream_remaining(xdr)); return xdr->p != xdr->end; } @@ -1053,12 +1299,9 @@ void xdr_init_decode(struct xdr_stream *xdr, struct xdr_buf *buf, __be32 *p, xdr->buf = buf; xdr_reset_scratch_buffer(xdr); xdr->nwords = XDR_QUADLEN(buf->len); - if (buf->head[0].iov_len != 0) - xdr_set_iov(xdr, buf->head, buf->len); - else if (buf->page_len != 0) - xdr_set_page_base(xdr, 0, buf->len); - else - xdr_set_iov(xdr, buf->head, buf->len); + if (xdr_set_iov(xdr, buf->head, 0, buf->len) == 0 && + xdr_set_page_base(xdr, 0, buf->len) == 0) + xdr_set_iov(xdr, buf->tail, 0, buf->len); if (p != NULL && p > xdr->p && xdr->end >= p) { xdr->nwords -= p - xdr->p; xdr->p = p; @@ -1158,14 +1401,13 @@ static void xdr_realign_pages(struct xdr_stream *xdr) struct xdr_buf *buf = xdr->buf; struct kvec *iov = buf->head; unsigned int cur = xdr_stream_pos(xdr); - unsigned int copied, offset; + unsigned int copied; /* Realign pages to current pointer position */ if (iov->iov_len > cur) { - offset = iov->iov_len - cur; - copied = xdr_shrink_bufhead(buf, offset); - trace_rpc_xdr_alignment(xdr, offset, copied); - xdr->nwords = XDR_QUADLEN(buf->len - cur); + copied = xdr_shrink_bufhead(buf, cur); + trace_rpc_xdr_alignment(xdr, cur, copied); + xdr_set_page(xdr, 0, buf->page_len); } } @@ -1173,8 +1415,7 @@ static unsigned int xdr_align_pages(struct xdr_stream *xdr, unsigned int len) { struct xdr_buf *buf = xdr->buf; unsigned int nwords = XDR_QUADLEN(len); - unsigned int cur = xdr_stream_pos(xdr); - unsigned int copied, offset; + unsigned int copied; if (xdr->nwords == 0) return 0; @@ -1188,125 +1429,103 @@ static unsigned int xdr_align_pages(struct xdr_stream *xdr, unsigned int len) len = buf->page_len; else if (nwords < xdr->nwords) { /* Truncate page data and move it into the tail */ - offset = buf->page_len - len; - copied = xdr_shrink_pagelen(buf, offset); - trace_rpc_xdr_alignment(xdr, offset, copied); - xdr->nwords = XDR_QUADLEN(buf->len - cur); + copied = xdr_shrink_pagelen(buf, len); + trace_rpc_xdr_alignment(xdr, len, copied); } return len; } /** - * xdr_read_pages - Ensure page-based XDR data to decode is aligned at current pointer position + * xdr_read_pages - align page-based XDR data to current pointer position * @xdr: pointer to xdr_stream struct * @len: number of bytes of page data * * Moves data beyond the current pointer position from the XDR head[] buffer - * into the page list. Any data that lies beyond current position + "len" - * bytes is moved into the XDR tail[]. + * into the page list. Any data that lies beyond current position + @len + * bytes is moved into the XDR tail[]. 
The xdr_stream current position is + * then advanced past that data to align to the next XDR object in the tail. * * Returns the number of XDR encoded bytes now contained in the pages */ unsigned int xdr_read_pages(struct xdr_stream *xdr, unsigned int len) { - struct xdr_buf *buf = xdr->buf; - struct kvec *iov; - unsigned int nwords; - unsigned int end; - unsigned int padding; + unsigned int nwords = XDR_QUADLEN(len); + unsigned int base, end, pglen; - len = xdr_align_pages(xdr, len); - if (len == 0) + pglen = xdr_align_pages(xdr, nwords << 2); + if (pglen == 0) return 0; - nwords = XDR_QUADLEN(len); - padding = (nwords << 2) - len; - xdr->iov = iov = buf->tail; - /* Compute remaining message length. */ - end = ((xdr->nwords - nwords) << 2) + padding; - if (end > iov->iov_len) - end = iov->iov_len; - /* - * Position current pointer at beginning of tail, and - * set remaining message length. - */ - xdr->p = (__be32 *)((char *)iov->iov_base + padding); - xdr->end = (__be32 *)((char *)iov->iov_base + end); - xdr->page_ptr = NULL; - xdr->nwords = XDR_QUADLEN(end - padding); - return len; + base = (nwords << 2) - pglen; + end = xdr_stream_remaining(xdr) - pglen; + + xdr_set_tail_base(xdr, base, end); + return len <= pglen ? len : pglen; } EXPORT_SYMBOL_GPL(xdr_read_pages); -uint64_t xdr_align_data(struct xdr_stream *xdr, uint64_t offset, uint32_t length) +unsigned int xdr_align_data(struct xdr_stream *xdr, unsigned int offset, + unsigned int length) { struct xdr_buf *buf = xdr->buf; - unsigned int from, bytes; - unsigned int shift = 0; - - if ((offset + length) < offset || - (offset + length) > buf->page_len) - length = buf->page_len - offset; + unsigned int from, bytes, len; + unsigned int shift; xdr_realign_pages(xdr); from = xdr_page_pos(xdr); - bytes = xdr->nwords << 2; - if (length < bytes) - bytes = length; + + if (from >= buf->page_len + buf->tail->iov_len) + return 0; + if (from + buf->head->iov_len >= buf->len) + return 0; + + len = buf->len - buf->head->iov_len; + + /* We only shift data left! */ + if (WARN_ONCE(from < offset, "SUNRPC: misaligned data src=%u dst=%u\n", + from, offset)) + return 0; + if (WARN_ONCE(offset > buf->page_len, + "SUNRPC: buffer overflow. 
offset=%u, page_len=%u\n", + offset, buf->page_len)) + return 0; /* Move page data to the left */ - if (from > offset) { - shift = min_t(unsigned int, bytes, buf->page_len - from); - _shift_data_left_pages(buf->pages, - buf->page_base + offset, - buf->page_base + from, - shift); - bytes -= shift; - - /* Move tail data into the pages, if necessary */ - if (bytes > 0) - _shift_data_left_tail(buf, offset + shift, bytes); - } + shift = from - offset; + xdr_buf_pages_shift_left(buf, from, len, shift); + + bytes = xdr_stream_remaining(xdr); + if (length > bytes) + length = bytes; + bytes -= length; - xdr->nwords -= XDR_QUADLEN(length); - xdr_set_page(xdr, from + length, PAGE_SIZE); + xdr->buf->len -= shift; + xdr_set_page(xdr, offset + length, bytes); return length; } EXPORT_SYMBOL_GPL(xdr_align_data); -uint64_t xdr_expand_hole(struct xdr_stream *xdr, uint64_t offset, uint64_t length) +unsigned int xdr_expand_hole(struct xdr_stream *xdr, unsigned int offset, + unsigned int length) { struct xdr_buf *buf = xdr->buf; - unsigned int bytes; - unsigned int from; - unsigned int truncated = 0; - - if ((offset + length) < offset || - (offset + length) > buf->page_len) - length = buf->page_len - offset; + unsigned int from, to, shift; xdr_realign_pages(xdr); from = xdr_page_pos(xdr); - bytes = xdr->nwords << 2; - - if (offset + length + bytes > buf->page_len) { - unsigned int shift = (offset + length + bytes) - buf->page_len; - unsigned int res = _shift_data_right_tail(buf, from + bytes - shift, shift); - truncated = shift - res; - xdr->nwords -= XDR_QUADLEN(truncated); - bytes -= shift; - } - - /* Now move the page data over and zero pages */ - if (bytes > 0) - _shift_data_right_pages(buf->pages, - buf->page_base + offset + length, - buf->page_base + from, - bytes); - _zero_pages(buf->pages, buf->page_base + offset, length); + to = xdr_align_size(offset + length); + + /* Could the hole be behind us? 
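	 * If @to lies beyond the current read position, the page data is
	 * shifted right to open up the hole; otherwise it is realigned
	 * left to @to. The hole itself is then zeroed in place.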
*/ + if (to > from) { + unsigned int buflen = buf->len - buf->head->iov_len; + shift = to - from; + xdr_buf_try_expand(buf, shift); + xdr_buf_pages_shift_right(buf, from, buflen, shift); + xdr_set_page(xdr, to, xdr_stream_remaining(xdr)); + } else if (to != from) + xdr_align_data(xdr, to, 0); + xdr_buf_pages_zero(buf, offset, length); - buf->len += length - (from - offset) - truncated; - xdr_set_page(xdr, offset + length, PAGE_SIZE); return length; } EXPORT_SYMBOL_GPL(xdr_expand_hole); @@ -1335,8 +1554,7 @@ EXPORT_SYMBOL_GPL(xdr_enter_page); static const struct kvec empty_iov = {.iov_base = NULL, .iov_len = 0}; -void -xdr_buf_from_iov(struct kvec *iov, struct xdr_buf *buf) +void xdr_buf_from_iov(const struct kvec *iov, struct xdr_buf *buf) { buf->head[0] = *iov; buf->tail[0] = empty_iov; @@ -1493,7 +1711,8 @@ fix_len: } EXPORT_SYMBOL_GPL(xdr_buf_trim); -static void __read_bytes_from_xdr_buf(struct xdr_buf *subbuf, void *obj, unsigned int len) +static void __read_bytes_from_xdr_buf(const struct xdr_buf *subbuf, + void *obj, unsigned int len) { unsigned int this_len; @@ -1502,8 +1721,7 @@ static void __read_bytes_from_xdr_buf(struct xdr_buf *subbuf, void *obj, unsigne len -= this_len; obj += this_len; this_len = min_t(unsigned int, len, subbuf->page_len); - if (this_len) - _copy_from_pages(obj, subbuf->pages, subbuf->page_base, this_len); + _copy_from_pages(obj, subbuf->pages, subbuf->page_base, this_len); len -= this_len; obj += this_len; this_len = min_t(unsigned int, len, subbuf->tail[0].iov_len); @@ -1511,7 +1729,8 @@ static void __read_bytes_from_xdr_buf(struct xdr_buf *subbuf, void *obj, unsigne } /* obj is assumed to point to allocated memory of size at least len: */ -int read_bytes_from_xdr_buf(struct xdr_buf *buf, unsigned int base, void *obj, unsigned int len) +int read_bytes_from_xdr_buf(const struct xdr_buf *buf, unsigned int base, + void *obj, unsigned int len) { struct xdr_buf subbuf; int status; @@ -1524,7 +1743,8 @@ int read_bytes_from_xdr_buf(struct xdr_buf *buf, unsigned int base, void *obj, u } EXPORT_SYMBOL_GPL(read_bytes_from_xdr_buf); -static void __write_bytes_to_xdr_buf(struct xdr_buf *subbuf, void *obj, unsigned int len) +static void __write_bytes_to_xdr_buf(const struct xdr_buf *subbuf, + void *obj, unsigned int len) { unsigned int this_len; @@ -1533,8 +1753,7 @@ static void __write_bytes_to_xdr_buf(struct xdr_buf *subbuf, void *obj, unsigned len -= this_len; obj += this_len; this_len = min_t(unsigned int, len, subbuf->page_len); - if (this_len) - _copy_to_pages(subbuf->pages, subbuf->page_base, obj, this_len); + _copy_to_pages(subbuf->pages, subbuf->page_base, obj, this_len); len -= this_len; obj += this_len; this_len = min_t(unsigned int, len, subbuf->tail[0].iov_len); @@ -1542,7 +1761,8 @@ static void __write_bytes_to_xdr_buf(struct xdr_buf *subbuf, void *obj, unsigned } /* obj is assumed to point to allocated memory of size at least len: */ -int write_bytes_to_xdr_buf(struct xdr_buf *buf, unsigned int base, void *obj, unsigned int len) +int write_bytes_to_xdr_buf(const struct xdr_buf *buf, unsigned int base, + void *obj, unsigned int len) { struct xdr_buf subbuf; int status; @@ -1555,8 +1775,7 @@ int write_bytes_to_xdr_buf(struct xdr_buf *buf, unsigned int base, void *obj, un } EXPORT_SYMBOL_GPL(write_bytes_to_xdr_buf); -int -xdr_decode_word(struct xdr_buf *buf, unsigned int base, u32 *obj) +int xdr_decode_word(const struct xdr_buf *buf, unsigned int base, u32 *obj) { __be32 raw; int status; @@ -1569,8 +1788,7 @@ xdr_decode_word(struct xdr_buf *buf, unsigned int 
base, u32 *obj) } EXPORT_SYMBOL_GPL(xdr_decode_word); -int -xdr_encode_word(struct xdr_buf *buf, unsigned int base, u32 obj) +int xdr_encode_word(const struct xdr_buf *buf, unsigned int base, u32 obj) { __be32 raw = cpu_to_be32(obj); @@ -1579,9 +1797,8 @@ xdr_encode_word(struct xdr_buf *buf, unsigned int base, u32 obj) EXPORT_SYMBOL_GPL(xdr_encode_word); /* Returns 0 on success, or else a negative error code. */ -static int -xdr_xcode_array2(struct xdr_buf *buf, unsigned int base, - struct xdr_array2_desc *desc, int encode) +static int xdr_xcode_array2(const struct xdr_buf *buf, unsigned int base, + struct xdr_array2_desc *desc, int encode) { char *elem = NULL, *c; unsigned int copied = 0, todo, avail_here; @@ -1773,9 +1990,8 @@ out: return err; } -int -xdr_decode_array2(struct xdr_buf *buf, unsigned int base, - struct xdr_array2_desc *desc) +int xdr_decode_array2(const struct xdr_buf *buf, unsigned int base, + struct xdr_array2_desc *desc) { if (base >= buf->len) return -EINVAL; @@ -1784,9 +2000,8 @@ xdr_decode_array2(struct xdr_buf *buf, unsigned int base, } EXPORT_SYMBOL_GPL(xdr_decode_array2); -int -xdr_encode_array2(struct xdr_buf *buf, unsigned int base, - struct xdr_array2_desc *desc) +int xdr_encode_array2(const struct xdr_buf *buf, unsigned int base, + struct xdr_array2_desc *desc) { if ((unsigned long) base + 4 + desc->array_len * desc->elem_size > buf->head->iov_len + buf->page_len + buf->tail->iov_len) @@ -1796,9 +2011,9 @@ xdr_encode_array2(struct xdr_buf *buf, unsigned int base, } EXPORT_SYMBOL_GPL(xdr_encode_array2); -int -xdr_process_buf(struct xdr_buf *buf, unsigned int offset, unsigned int len, - int (*actor)(struct scatterlist *, void *), void *data) +int xdr_process_buf(const struct xdr_buf *buf, unsigned int offset, + unsigned int len, + int (*actor)(struct scatterlist *, void *), void *data) { int i, ret = 0; unsigned int page_len, thislen, page_offset; @@ -1966,10 +2181,8 @@ ssize_t xdr_stream_decode_string_dup(struct xdr_stream *xdr, char **str, ret = xdr_stream_decode_opaque_inline(xdr, &p, maxlen); if (ret > 0) { - char *s = kmalloc(ret + 1, gfp_flags); + char *s = kmemdup_nul(p, ret, gfp_flags); if (s != NULL) { - memcpy(s, p, ret); - s[ret] = '\0'; *str = s; return strlen(s); } diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index f6c17e75f20e..691ccf8049a4 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -151,33 +151,94 @@ out: } EXPORT_SYMBOL_GPL(xprt_unregister_transport); -/** - * xprt_load_transport - load a transport implementation - * @transport_name: transport to load - * - * Returns: - * 0: transport successfully loaded - * -ENOENT: transport module not available - */ -int xprt_load_transport(const char *transport_name) +static void +xprt_class_release(const struct xprt_class *t) { - struct xprt_class *t; - int result; + module_put(t->owner); +} + +static const struct xprt_class * +xprt_class_find_by_ident_locked(int ident) +{ + const struct xprt_class *t; + + list_for_each_entry(t, &xprt_list, list) { + if (t->ident != ident) + continue; + if (!try_module_get(t->owner)) + continue; + return t; + } + return NULL; +} + +static const struct xprt_class * +xprt_class_find_by_ident(int ident) +{ + const struct xprt_class *t; - result = 0; spin_lock(&xprt_list_lock); + t = xprt_class_find_by_ident_locked(ident); + spin_unlock(&xprt_list_lock); + return t; +} + +static const struct xprt_class * +xprt_class_find_by_netid_locked(const char *netid) +{ + const struct xprt_class *t; + unsigned int i; + list_for_each_entry(t, &xprt_list, list) { - if 
(strcmp(t->name, transport_name) == 0) { - spin_unlock(&xprt_list_lock); - goto out; + for (i = 0; t->netid[i][0] != '\0'; i++) { + if (strcmp(t->netid[i], netid) != 0) + continue; + if (!try_module_get(t->owner)) + continue; + return t; } } + return NULL; +} + +static const struct xprt_class * +xprt_class_find_by_netid(const char *netid) +{ + const struct xprt_class *t; + + spin_lock(&xprt_list_lock); + t = xprt_class_find_by_netid_locked(netid); + if (!t) { + spin_unlock(&xprt_list_lock); + request_module("rpc%s", netid); + spin_lock(&xprt_list_lock); + t = xprt_class_find_by_netid_locked(netid); + } spin_unlock(&xprt_list_lock); - result = request_module("xprt%s", transport_name); -out: - return result; + return t; +} + +/** + * xprt_find_transport_ident - convert a netid into a transport identifier + * @netid: transport to load + * + * Returns: + * > 0: transport identifier + * -ENOENT: transport module not available + */ +int xprt_find_transport_ident(const char *netid) +{ + const struct xprt_class *t; + int ret; + + t = xprt_class_find_by_netid(netid); + if (!t) + return -ENOENT; + ret = t->ident; + xprt_class_release(t); + return ret; } -EXPORT_SYMBOL_GPL(xprt_load_transport); +EXPORT_SYMBOL_GPL(xprt_find_transport_ident); static void xprt_clear_locked(struct rpc_xprt *xprt) { @@ -1896,21 +1957,17 @@ static void xprt_init(struct rpc_xprt *xprt, struct net *net) struct rpc_xprt *xprt_create_transport(struct xprt_create *args) { struct rpc_xprt *xprt; - struct xprt_class *t; + const struct xprt_class *t; - spin_lock(&xprt_list_lock); - list_for_each_entry(t, &xprt_list, list) { - if (t->ident == args->ident) { - spin_unlock(&xprt_list_lock); - goto found; - } + t = xprt_class_find_by_ident(args->ident); + if (!t) { + dprintk("RPC: transport (%d) not supported\n", args->ident); + return ERR_PTR(-EIO); } - spin_unlock(&xprt_list_lock); - dprintk("RPC: transport (%d) not supported\n", args->ident); - return ERR_PTR(-EIO); -found: xprt = t->setup(args); + xprt_class_release(t); + if (IS_ERR(xprt)) goto out; if (args->flags & XPRT_CREATE_NO_IDLE_TIMEOUT) diff --git a/net/sunrpc/xprtrdma/backchannel.c b/net/sunrpc/xprtrdma/backchannel.c index c92c1aac270a..946edf2db646 100644 --- a/net/sunrpc/xprtrdma/backchannel.c +++ b/net/sunrpc/xprtrdma/backchannel.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* - * Copyright (c) 2015 Oracle. All rights reserved. + * Copyright (c) 2015-2020, Oracle and/or its affiliates. * * Support for backward direction RPCs on RPC/RDMA. 
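 * A "backward direction" RPC travels from server to client on an
 * existing connection; NFSv4.1 uses this for callback requests.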
*/ @@ -82,7 +82,7 @@ static int rpcrdma_bc_marshal_reply(struct rpc_rqst *rqst) &rqst->rq_snd_buf, rpcrdma_noch_pullup)) return -EIO; - trace_xprtrdma_cb_reply(rqst); + trace_xprtrdma_cb_reply(r_xprt, rqst); return 0; } @@ -260,7 +260,7 @@ void rpcrdma_bc_receive_call(struct rpcrdma_xprt *r_xprt, */ req = rpcr_to_rdmar(rqst); req->rl_reply = rep; - trace_xprtrdma_cb_call(rqst); + trace_xprtrdma_cb_call(r_xprt, rqst); /* Queue rqst for ULP's callback service */ bc_serv = xprt->bc_serv; diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index 44888f5badef..baca49fe83af 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -65,18 +65,23 @@ void frwr_release_mr(struct rpcrdma_mr *mr) kfree(mr); } +static void frwr_mr_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr *mr) +{ + if (mr->mr_device) { + trace_xprtrdma_mr_unmap(mr); + ib_dma_unmap_sg(mr->mr_device, mr->mr_sg, mr->mr_nents, + mr->mr_dir); + mr->mr_device = NULL; + } +} + static void frwr_mr_recycle(struct rpcrdma_mr *mr) { struct rpcrdma_xprt *r_xprt = mr->mr_xprt; trace_xprtrdma_mr_recycle(mr); - if (mr->mr_dir != DMA_NONE) { - trace_xprtrdma_mr_unmap(mr); - ib_dma_unmap_sg(r_xprt->rx_ep->re_id->device, - mr->mr_sg, mr->mr_nents, mr->mr_dir); - mr->mr_dir = DMA_NONE; - } + frwr_mr_unmap(r_xprt, mr); spin_lock(&r_xprt->rx_buf.rb_lock); list_del(&mr->mr_all); @@ -86,6 +91,16 @@ static void frwr_mr_recycle(struct rpcrdma_mr *mr) frwr_release_mr(mr); } +static void frwr_mr_put(struct rpcrdma_mr *mr) +{ + frwr_mr_unmap(mr->mr_xprt, mr); + + /* The MR is returned to the req's MR free list instead + * of to the xprt's MR free list. No spinlock is needed. + */ + rpcrdma_mr_push(mr, &mr->mr_req->rl_free_mrs); +} + /* frwr_reset - Place MRs back on the free list * @req: request to reset * @@ -101,7 +116,7 @@ void frwr_reset(struct rpcrdma_req *req) struct rpcrdma_mr *mr; while ((mr = rpcrdma_mr_pop(&req->rl_registered))) - rpcrdma_mr_put(mr); + frwr_mr_put(mr); } /** @@ -130,7 +145,7 @@ int frwr_mr_init(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr *mr) mr->mr_xprt = r_xprt; mr->frwr.fr_mr = frmr; - mr->mr_dir = DMA_NONE; + mr->mr_device = NULL; INIT_LIST_HEAD(&mr->mr_list); init_completion(&mr->frwr.fr_linv_done); @@ -315,6 +330,7 @@ struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt, mr->mr_dir); if (!dma_nents) goto out_dmamap_err; + mr->mr_device = ep->re_id->device; ibmr = mr->frwr.fr_mr; n = ib_map_mr_sg(ibmr, mr->mr_sg, dma_nents, NULL, PAGE_SIZE); @@ -341,7 +357,6 @@ struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt, return seg; out_dmamap_err: - mr->mr_dir = DMA_NONE; trace_xprtrdma_frwr_sgerr(mr, i); return ERR_PTR(-EIO); @@ -363,12 +378,21 @@ static void frwr_wc_fastreg(struct ib_cq *cq, struct ib_wc *wc) container_of(cqe, struct rpcrdma_frwr, fr_cqe); /* WARNING: Only wr_cqe and status are reliable at this point */ - trace_xprtrdma_wc_fastreg(wc, frwr); + trace_xprtrdma_wc_fastreg(wc, &frwr->fr_cid); /* The MR will get recycled when the associated req is retransmitted */ rpcrdma_flush_disconnect(cq->cq_context, wc); } +static void frwr_cid_init(struct rpcrdma_ep *ep, + struct rpcrdma_frwr *frwr) +{ + struct rpc_rdma_cid *cid = &frwr->fr_cid; + + cid->ci_queue_id = ep->re_attr.send_cq->res.id; + cid->ci_completion_id = frwr->fr_mr->res.id; +} + /** * frwr_send - post Send WRs containing the RPC Call message * @r_xprt: controlling transport instance @@ -385,6 +409,7 @@ static void frwr_wc_fastreg(struct ib_cq *cq, struct ib_wc *wc) */ int frwr_send(struct 
rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) { + struct rpcrdma_ep *ep = r_xprt->rx_ep; struct ib_send_wr *post_wr; struct rpcrdma_mr *mr; @@ -395,6 +420,7 @@ int frwr_send(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) frwr = &mr->frwr; frwr->fr_cqe.done = frwr_wc_fastreg; + frwr_cid_init(ep, frwr); frwr->fr_regwr.wr.next = post_wr; frwr->fr_regwr.wr.wr_cqe = &frwr->fr_cqe; frwr->fr_regwr.wr.num_sge = 0; @@ -404,7 +430,7 @@ int frwr_send(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) post_wr = &frwr->fr_regwr.wr; } - return ib_post_send(r_xprt->rx_ep->re_id->qp, post_wr, NULL); + return ib_post_send(ep->re_id->qp, post_wr, NULL); } /** @@ -420,18 +446,17 @@ void frwr_reminv(struct rpcrdma_rep *rep, struct list_head *mrs) list_for_each_entry(mr, mrs, mr_list) if (mr->mr_handle == rep->rr_inv_rkey) { list_del_init(&mr->mr_list); - trace_xprtrdma_mr_reminv(mr); - rpcrdma_mr_put(mr); + frwr_mr_put(mr); break; /* only one invalidated MR per RPC */ } } -static void __frwr_release_mr(struct ib_wc *wc, struct rpcrdma_mr *mr) +static void frwr_mr_done(struct ib_wc *wc, struct rpcrdma_mr *mr) { if (wc->status != IB_WC_SUCCESS) frwr_mr_recycle(mr); else - rpcrdma_mr_put(mr); + frwr_mr_put(mr); } /** @@ -448,8 +473,8 @@ static void frwr_wc_localinv(struct ib_cq *cq, struct ib_wc *wc) struct rpcrdma_mr *mr = container_of(frwr, struct rpcrdma_mr, frwr); /* WARNING: Only wr_cqe and status are reliable at this point */ - trace_xprtrdma_wc_li(wc, frwr); - __frwr_release_mr(wc, mr); + trace_xprtrdma_wc_li(wc, &frwr->fr_cid); + frwr_mr_done(wc, mr); rpcrdma_flush_disconnect(cq->cq_context, wc); } @@ -469,8 +494,8 @@ static void frwr_wc_localinv_wake(struct ib_cq *cq, struct ib_wc *wc) struct rpcrdma_mr *mr = container_of(frwr, struct rpcrdma_mr, frwr); /* WARNING: Only wr_cqe and status are reliable at this point */ - trace_xprtrdma_wc_li_wake(wc, frwr); - __frwr_release_mr(wc, mr); + trace_xprtrdma_wc_li_wake(wc, &frwr->fr_cid); + frwr_mr_done(wc, mr); complete(&frwr->fr_linv_done); rpcrdma_flush_disconnect(cq->cq_context, wc); @@ -490,6 +515,7 @@ static void frwr_wc_localinv_wake(struct ib_cq *cq, struct ib_wc *wc) void frwr_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) { struct ib_send_wr *first, **prev, *last; + struct rpcrdma_ep *ep = r_xprt->rx_ep; const struct ib_send_wr *bad_wr; struct rpcrdma_frwr *frwr; struct rpcrdma_mr *mr; @@ -509,6 +535,7 @@ void frwr_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) frwr = &mr->frwr; frwr->fr_cqe.done = frwr_wc_localinv; + frwr_cid_init(ep, frwr); last = &frwr->fr_invwr; last->next = NULL; last->wr_cqe = &frwr->fr_cqe; @@ -534,7 +561,7 @@ void frwr_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) * unless re_id->qp is a valid pointer. */ bad_wr = NULL; - rc = ib_post_send(r_xprt->rx_ep->re_id->qp, first, &bad_wr); + rc = ib_post_send(ep->re_id->qp, first, &bad_wr); /* The final LOCAL_INV WR in the chain is supposed to * do the wake. If it was never posted, the wake will @@ -547,7 +574,7 @@ void frwr_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) /* Recycle MRs in the LOCAL_INV chain that did not get posted. 
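	 * ib_post_send() leaves @bad_wr pointing at the first WR that
	 * failed to post, so every MR from there to the end of the
	 * chain is still registered and must be recycled by hand.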
*/ - trace_xprtrdma_post_linv(req, rc); + trace_xprtrdma_post_linv_err(req, rc); while (bad_wr) { frwr = container_of(bad_wr, struct rpcrdma_frwr, fr_invwr); @@ -574,10 +601,10 @@ static void frwr_wc_localinv_done(struct ib_cq *cq, struct ib_wc *wc) struct rpcrdma_rep *rep = mr->mr_req->rl_reply; /* WARNING: Only wr_cqe and status are reliable at this point */ - trace_xprtrdma_wc_li_done(wc, frwr); - __frwr_release_mr(wc, mr); + trace_xprtrdma_wc_li_done(wc, &frwr->fr_cid); + frwr_mr_done(wc, mr); - /* Ensure @rep is generated before __frwr_release_mr */ + /* Ensure @rep is generated before frwr_mr_done */ smp_rmb(); rpcrdma_complete_rqst(rep); @@ -597,6 +624,7 @@ static void frwr_wc_localinv_done(struct ib_cq *cq, struct ib_wc *wc) void frwr_unmap_async(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) { struct ib_send_wr *first, *last, **prev; + struct rpcrdma_ep *ep = r_xprt->rx_ep; const struct ib_send_wr *bad_wr; struct rpcrdma_frwr *frwr; struct rpcrdma_mr *mr; @@ -614,6 +642,7 @@ void frwr_unmap_async(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) frwr = &mr->frwr; frwr->fr_cqe.done = frwr_wc_localinv; + frwr_cid_init(ep, frwr); last = &frwr->fr_invwr; last->next = NULL; last->wr_cqe = &frwr->fr_cqe; @@ -639,13 +668,13 @@ void frwr_unmap_async(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) * unless re_id->qp is a valid pointer. */ bad_wr = NULL; - rc = ib_post_send(r_xprt->rx_ep->re_id->qp, first, &bad_wr); + rc = ib_post_send(ep->re_id->qp, first, &bad_wr); if (!rc) return; /* Recycle MRs in the LOCAL_INV chain that did not get posted. */ - trace_xprtrdma_post_linv(req, rc); + trace_xprtrdma_post_linv_err(req, rc); while (bad_wr) { frwr = container_of(bad_wr, struct rpcrdma_frwr, fr_invwr); mr = container_of(frwr, struct rpcrdma_mr, frwr); diff --git a/net/sunrpc/xprtrdma/module.c b/net/sunrpc/xprtrdma/module.c index 620327c01302..45c5b41ac8dc 100644 --- a/net/sunrpc/xprtrdma/module.c +++ b/net/sunrpc/xprtrdma/module.c @@ -24,6 +24,7 @@ MODULE_DESCRIPTION("RPC/RDMA Transport"); MODULE_LICENSE("Dual BSD/GPL"); MODULE_ALIAS("svcrdma"); MODULE_ALIAS("xprtrdma"); +MODULE_ALIAS("rpcrdma6"); static void __exit rpc_rdma_cleanup(void) { diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index 0f5120c7668f..8f5d0cb68360 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause /* - * Copyright (c) 2014-2017 Oracle. All rights reserved. + * Copyright (c) 2014-2020, Oracle and/or its affiliates. * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved. * * This software is available to you under a choice of one of two @@ -179,6 +179,31 @@ rpcrdma_nonpayload_inline(const struct rpcrdma_xprt *r_xprt, r_xprt->rx_ep->re_max_inline_recv; } +/* ACL likes to be lazy in allocating pages. For TCP, these + * pages can be allocated during receive processing. Not true + * for RDMA, which must always provision receive buffers + * up front. + */ +static noinline int +rpcrdma_alloc_sparse_pages(struct xdr_buf *buf) +{ + struct page **ppages; + int len; + + len = buf->page_len; + ppages = buf->pages + (buf->page_base >> PAGE_SHIFT); + while (len > 0) { + if (!*ppages) + *ppages = alloc_page(GFP_NOWAIT | __GFP_NOWARN); + if (!*ppages) + return -ENOBUFS; + ppages++; + len -= PAGE_SIZE; + } + + return 0; +} + /* Split @vec on page boundaries into SGEs. FMR registers pages, not * a byte range. Other modes coalesce these SGEs into a single MR * when they can. 
@@ -233,15 +258,6 @@ rpcrdma_convert_iovs(struct rpcrdma_xprt *r_xprt, struct xdr_buf *xdrbuf, ppages = xdrbuf->pages + (xdrbuf->page_base >> PAGE_SHIFT); page_base = offset_in_page(xdrbuf->page_base); while (len) { - /* ACL likes to be lazy in allocating pages - ACLs - * are small by default but can get huge. - */ - if (unlikely(xdrbuf->flags & XDRBUF_SPARSE_PAGES)) { - if (!*ppages) - *ppages = alloc_page(GFP_NOWAIT | __GFP_NOWARN); - if (!*ppages) - return -ENOBUFS; - } seg->mr_page = *ppages; seg->mr_offset = (char *)page_base; seg->mr_len = min_t(u32, PAGE_SIZE - page_base, len); @@ -315,7 +331,6 @@ static struct rpcrdma_mr_seg *rpcrdma_mr_prepare(struct rpcrdma_xprt *r_xprt, *mr = rpcrdma_mr_get(r_xprt); if (!*mr) goto out_getmr_err; - trace_xprtrdma_mr_get(req); (*mr)->mr_req = req; } @@ -323,7 +338,7 @@ static struct rpcrdma_mr_seg *rpcrdma_mr_prepare(struct rpcrdma_xprt *r_xprt, return frwr_map(r_xprt, seg, nsegs, writing, req->rl_slot.rq_xid, *mr); out_getmr_err: - trace_xprtrdma_nomrs(req); + trace_xprtrdma_nomrs_err(r_xprt, req); xprt_wait_for_buffer_space(&r_xprt->rx_xprt); rpcrdma_mrs_refresh(r_xprt); return ERR_PTR(-EAGAIN); @@ -867,6 +882,12 @@ rpcrdma_marshal_req(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst) __be32 *p; int ret; + if (unlikely(rqst->rq_rcv_buf.flags & XDRBUF_SPARSE_PAGES)) { + ret = rpcrdma_alloc_sparse_pages(&rqst->rq_rcv_buf); + if (ret) + return ret; + } + rpcrdma_set_xdrlen(&req->rl_hdrbuf, 0); xdr_init_encode(xdr, &req->rl_hdrbuf, rdmab_data(req->rl_rdmabuf), rqst); @@ -1322,20 +1343,13 @@ rpcrdma_decode_error(struct rpcrdma_xprt *r_xprt, struct rpcrdma_rep *rep, p = xdr_inline_decode(xdr, 2 * sizeof(*p)); if (!p) break; - dprintk("RPC: %s: server reports " - "version error (%u-%u), xid %08x\n", __func__, - be32_to_cpup(p), be32_to_cpu(*(p + 1)), - be32_to_cpu(rep->rr_xid)); + trace_xprtrdma_err_vers(rqst, p, p + 1); break; case err_chunk: - dprintk("RPC: %s: server reports " - "header decoding error, xid %08x\n", __func__, - be32_to_cpu(rep->rr_xid)); + trace_xprtrdma_err_chunk(rqst); break; default: - dprintk("RPC: %s: server reports " - "unrecognized error %d, xid %08x\n", __func__, - be32_to_cpup(p), be32_to_cpu(rep->rr_xid)); + trace_xprtrdma_err_unrecognized(rqst, p); } return -EIO; @@ -1376,7 +1390,7 @@ out: return; out_badheader: - trace_xprtrdma_reply_hdr(rep); + trace_xprtrdma_reply_hdr_err(rep); r_xprt->rx_stats.bad_reply_count++; rqst->rq_task->tk_status = status; status = 0; @@ -1450,14 +1464,12 @@ void rpcrdma_reply_handler(struct rpcrdma_rep *rep) rpcrdma_post_recvs(r_xprt, false); req = rpcr_to_rdmar(rqst); - if (req->rl_reply) { - trace_xprtrdma_leaked_rep(rqst, req->rl_reply); + if (unlikely(req->rl_reply)) rpcrdma_recv_buffer_put(req->rl_reply); - } req->rl_reply = rep; rep->rr_rqst = rqst; - trace_xprtrdma_reply(rqst->rq_task, rep, req, credits); + trace_xprtrdma_reply(rqst->rq_task, rep, credits); if (rep->rr_wc_flags & IB_WC_WITH_INVALIDATE) frwr_reminv(rep, &req->rl_registered); @@ -1469,16 +1481,16 @@ void rpcrdma_reply_handler(struct rpcrdma_rep *rep) return; out_badversion: - trace_xprtrdma_reply_vers(rep); + trace_xprtrdma_reply_vers_err(rep); goto out; out_norqst: spin_unlock(&xprt->queue_lock); - trace_xprtrdma_reply_rqst(rep); + trace_xprtrdma_reply_rqst_err(rep); goto out; out_shortreply: - trace_xprtrdma_reply_short(rep); + trace_xprtrdma_reply_short_err(rep); out: rpcrdma_recv_buffer_put(rep); diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index 8915e42240d3..78d29d1bcc20 100644 --- 
a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -599,11 +599,12 @@ static void xprt_rdma_free(struct rpc_task *task) { struct rpc_rqst *rqst = task->tk_rqstp; - struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(rqst->rq_xprt); struct rpcrdma_req *req = rpcr_to_rdmar(rqst); - if (!list_empty(&req->rl_registered)) - frwr_unmap_sync(r_xprt, req); + if (unlikely(!list_empty(&req->rl_registered))) { + trace_xprtrdma_mrs_zap(task); + frwr_unmap_sync(rpcx_to_rdmax(rqst->rq_xprt), req); + } /* XXX: If the RPC is completing because of a signal and * not because a reply was received, we ought to ensure @@ -768,6 +769,7 @@ static struct xprt_class xprt_rdma = { .owner = THIS_MODULE, .ident = XPRT_TRANSPORT_RDMA, .setup = xprt_setup_rdma, + .netid = { "rdma", "rdma6", "" }, }; void xprt_rdma_cleanup(void) diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index ad6e2e4994ce..ec912cf9c618 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -167,7 +167,7 @@ static void rpcrdma_wc_send(struct ib_cq *cq, struct ib_wc *wc) struct rpcrdma_xprt *r_xprt = cq->cq_context; /* WARNING: Only wr_cqe and status are reliable at this point */ - trace_xprtrdma_wc_send(sc, wc); + trace_xprtrdma_wc_send(wc, &sc->sc_cid); rpcrdma_sendctx_put_locked(r_xprt, sc); rpcrdma_flush_disconnect(r_xprt, wc); } @@ -186,7 +186,7 @@ static void rpcrdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc) struct rpcrdma_xprt *r_xprt = cq->cq_context; /* WARNING: Only wr_cqe and status are reliable at this point */ - trace_xprtrdma_wc_receive(wc); + trace_xprtrdma_wc_receive(wc, &rep->rr_cid); --r_xprt->rx_ep->re_receive_count; if (wc->status != IB_WC_SUCCESS) goto out_flushed; @@ -643,6 +643,9 @@ static struct rpcrdma_sendctx *rpcrdma_sendctx_create(struct rpcrdma_ep *ep) return NULL; sc->sc_cqe.done = rpcrdma_wc_send; + sc->sc_cid.ci_queue_id = ep->re_attr.send_cq->res.id; + sc->sc_cid.ci_completion_id = + atomic_inc_return(&ep->re_completion_ids); return sc; } @@ -972,6 +975,9 @@ struct rpcrdma_rep *rpcrdma_rep_create(struct rpcrdma_xprt *r_xprt, if (!rpcrdma_regbuf_dma_map(r_xprt, rep->rr_rdmabuf)) goto out_free_regbuf; + rep->rr_cid.ci_completion_id = + atomic_inc_return(&r_xprt->rx_ep->re_completion_ids); + xdr_buf_init(&rep->rr_hdrbuf, rdmab_data(rep->rr_rdmabuf), rdmab_length(rep->rr_rdmabuf)); rep->rr_cqe.done = rpcrdma_wc_receive; @@ -1179,25 +1185,6 @@ rpcrdma_mr_get(struct rpcrdma_xprt *r_xprt) } /** - * rpcrdma_mr_put - DMA unmap an MR and release it - * @mr: MR to release - * - */ -void rpcrdma_mr_put(struct rpcrdma_mr *mr) -{ - struct rpcrdma_xprt *r_xprt = mr->mr_xprt; - - if (mr->mr_dir != DMA_NONE) { - trace_xprtrdma_mr_unmap(mr); - ib_dma_unmap_sg(r_xprt->rx_ep->re_id->device, - mr->mr_sg, mr->mr_nents, mr->mr_dir); - mr->mr_dir = DMA_NONE; - } - - rpcrdma_mr_push(mr, &mr->mr_req->rl_free_mrs); -} - -/** * rpcrdma_buffer_get - Get a request buffer * @buffers: Buffer pool from which to obtain a buffer * @@ -1411,6 +1398,7 @@ void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, bool temp) if (!rep) break; + rep->rr_cid.ci_queue_id = ep->re_attr.recv_cq->res.id; trace_xprtrdma_post_recv(rep); rep->rr_recv_wr.next = wr; wr = &rep->rr_recv_wr; diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 43974ef39a50..94b28657aeeb 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -53,6 +53,7 @@ #include <rdma/ib_verbs.h> /* RDMA verbs api */ #include <linux/sunrpc/clnt.h> /* rpc_xprt */ +#include 
<linux/sunrpc/rpc_rdma_cid.h> /* completion IDs */ #include <linux/sunrpc/rpc_rdma.h> /* RPC/RDMA protocol */ #include <linux/sunrpc/xprtrdma.h> /* xprt parameters */ @@ -93,6 +94,8 @@ struct rpcrdma_ep { unsigned int re_max_requests; /* depends on device */ unsigned int re_inline_send; /* negotiated */ unsigned int re_inline_recv; /* negotiated */ + + atomic_t re_completion_ids; }; /* Pre-allocate extra Work Requests for handling backward receives @@ -180,6 +183,8 @@ enum { struct rpcrdma_rep { struct ib_cqe rr_cqe; + struct rpc_rdma_cid rr_cid; + __be32 rr_xid; __be32 rr_vers; __be32 rr_proc; @@ -211,6 +216,7 @@ enum { struct rpcrdma_req; struct rpcrdma_sendctx { struct ib_cqe sc_cqe; + struct rpc_rdma_cid sc_cid; struct rpcrdma_req *sc_req; unsigned int sc_unmap_count; struct ib_sge sc_sges[]; @@ -225,6 +231,7 @@ struct rpcrdma_sendctx { struct rpcrdma_frwr { struct ib_mr *fr_mr; struct ib_cqe fr_cqe; + struct rpc_rdma_cid fr_cid; struct completion fr_linv_done; union { struct ib_reg_wr fr_regwr; @@ -236,6 +243,7 @@ struct rpcrdma_req; struct rpcrdma_mr { struct list_head mr_list; struct rpcrdma_req *mr_req; + struct ib_device *mr_device; struct scatterlist *mr_sg; int mr_nents; enum dma_data_direction mr_dir; @@ -466,7 +474,6 @@ void rpcrdma_buffer_destroy(struct rpcrdma_buffer *); struct rpcrdma_sendctx *rpcrdma_sendctx_get_locked(struct rpcrdma_xprt *r_xprt); struct rpcrdma_mr *rpcrdma_mr_get(struct rpcrdma_xprt *r_xprt); -void rpcrdma_mr_put(struct rpcrdma_mr *mr); void rpcrdma_mrs_refresh(struct rpcrdma_xprt *r_xprt); struct rpcrdma_req *rpcrdma_buffer_get(struct rpcrdma_buffer *); diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 7090bbee0ec5..c56a66cdf4ac 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -433,7 +433,8 @@ xs_read_xdr_buf(struct socket *sock, struct msghdr *msg, int flags, if (ret <= 0) goto sock_err; xs_flush_bvec(buf->bvec, ret, seek + buf->page_base); - offset += ret - buf->page_base; + ret -= buf->page_base; + offset += ret; if (offset == count || msg->msg_flags & (MSG_EOR|MSG_TRUNC)) goto out; if (ret != want) @@ -3059,6 +3060,7 @@ static struct xprt_class xs_local_transport = { .owner = THIS_MODULE, .ident = XPRT_TRANSPORT_LOCAL, .setup = xs_setup_local, + .netid = { "" }, }; static struct xprt_class xs_udp_transport = { @@ -3067,6 +3069,7 @@ static struct xprt_class xs_udp_transport = { .owner = THIS_MODULE, .ident = XPRT_TRANSPORT_UDP, .setup = xs_setup_udp, + .netid = { "udp", "udp6", "" }, }; static struct xprt_class xs_tcp_transport = { @@ -3075,6 +3078,7 @@ static struct xprt_class xs_tcp_transport = { .owner = THIS_MODULE, .ident = XPRT_TRANSPORT_TCP, .setup = xs_setup_tcp, + .netid = { "tcp", "tcp6", "" }, }; static struct xprt_class xs_bc_tcp_transport = { @@ -3083,6 +3087,7 @@ static struct xprt_class xs_bc_tcp_transport = { .owner = THIS_MODULE, .ident = XPRT_TRANSPORT_BC_TCP, .setup = xs_setup_bc_tcp, + .netid = { "" }, }; /** diff --git a/net/tipc/link.c b/net/tipc/link.c index 6ae2140eb4f7..115109259430 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -1030,7 +1030,6 @@ void tipc_link_reset(struct tipc_link *l) int tipc_link_xmit(struct tipc_link *l, struct sk_buff_head *list, struct sk_buff_head *xmitq) { - struct tipc_msg *hdr = buf_msg(skb_peek(list)); struct sk_buff_head *backlogq = &l->backlogq; struct sk_buff_head *transmq = &l->transmq; struct sk_buff *skb, *_skb; @@ -1038,13 +1037,18 @@ int tipc_link_xmit(struct tipc_link *l, struct sk_buff_head *list, u16 ack = l->rcv_nxt - 1; u16 seqno = l->snd_nxt; 
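	/* skb_peek() returns NULL for an empty list, so reading the
	 * message header is deferred until pkt_cnt is known to be
	 * positive.
	 */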
int pkt_cnt = skb_queue_len(list); - int imp = msg_importance(hdr); unsigned int mss = tipc_link_mss(l); unsigned int cwin = l->window; unsigned int mtu = l->mtu; + struct tipc_msg *hdr; bool new_bundle; int rc = 0; + int imp; + + if (pkt_cnt <= 0) + return 0; + hdr = buf_msg(skb_peek(list)); if (unlikely(msg_size(hdr) > mtu)) { pr_warn("Too large msg, purging xmit list %d %d %d %d %d!\n", skb_queue_len(list), msg_user(hdr), @@ -1053,6 +1057,7 @@ int tipc_link_xmit(struct tipc_link *l, struct sk_buff_head *list, return -EMSGSIZE; } + imp = msg_importance(hdr); /* Allow oversubscription of one data msg per source at congestion */ if (unlikely(l->backlog[imp].len >= l->backlog[imp].limit)) { if (imp == TIPC_SYSTEM_IMPORTANCE) { @@ -2539,7 +2544,7 @@ void tipc_link_set_queue_limits(struct tipc_link *l, u32 min_win, u32 max_win) } /** - * link_reset_stats - reset link statistics + * tipc_link_reset_stats - reset link statistics * @l: pointer to link */ void tipc_link_reset_stats(struct tipc_link *l) diff --git a/net/tipc/netlink_compat.c b/net/tipc/netlink_compat.c index 82f154989418..5a1ce64039f7 100644 --- a/net/tipc/netlink_compat.c +++ b/net/tipc/netlink_compat.c @@ -213,12 +213,14 @@ static int __tipc_nl_compat_dumpit(struct tipc_nl_compat_cmd_dump *cmd, } info.attrs = attrbuf; - err = nlmsg_parse_deprecated(cb.nlh, GENL_HDRLEN, attrbuf, - tipc_genl_family.maxattr, - tipc_genl_family.policy, NULL); - if (err) - goto err_out; + if (nlmsg_len(cb.nlh) > 0) { + err = nlmsg_parse_deprecated(cb.nlh, GENL_HDRLEN, attrbuf, + tipc_genl_family.maxattr, + tipc_genl_family.policy, NULL); + if (err) + goto err_out; + } do { int rem; diff --git a/net/tipc/node.c b/net/tipc/node.c index 83d9eb830592..008670d1f43e 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -1665,7 +1665,7 @@ static void tipc_lxc_xmit(struct net *peer_net, struct sk_buff_head *list) } /** - * tipc_node_xmit() is the general link level function for message sending + * tipc_node_xmit() - general link level function for message sending * @net: the applicable net namespace * @list: chain of buffers containing message * @dnode: address of destination node diff --git a/net/wireless/Kconfig b/net/wireless/Kconfig index 27026f587fa6..f620acd2a0f5 100644 --- a/net/wireless/Kconfig +++ b/net/wireless/Kconfig @@ -21,6 +21,7 @@ config CFG80211 tristate "cfg80211 - wireless configuration API" depends on RFKILL || !RFKILL select FW_LOADER + select CRC32 # may need to update this when certificates are changed and are # using a different algorithm, though right now they shouldn't # (this is here rather than below to allow it to be a module) diff --git a/net/wireless/reg.c b/net/wireless/reg.c index bb72447ad960..8114bba8556c 100644 --- a/net/wireless/reg.c +++ b/net/wireless/reg.c @@ -5,7 +5,7 @@ * Copyright 2008-2011 Luis R. Rodriguez <mcgrof@qca.qualcomm.com> * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright 2017 Intel Deutschland GmbH - * Copyright (C) 2018 - 2019 Intel Corporation + * Copyright (C) 2018 - 2021 Intel Corporation * * Permission to use, copy, modify, and/or distribute this software for any * purpose with or without fee is hereby granted, provided that the above @@ -139,6 +139,11 @@ static const struct ieee80211_regdomain *get_cfg80211_regdom(void) return rcu_dereference_rtnl(cfg80211_regdomain); } +/* + * Returns the regulatory domain associated with the wiphy. 
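+ * (That is, wiphy->regd, as opposed to the global cfg80211_regdomain.)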
+ * + * Requires either RTNL or RCU protection + */ const struct ieee80211_regdomain *get_wiphy_regdom(struct wiphy *wiphy) { return rcu_dereference_rtnl(wiphy->regd); @@ -2571,9 +2576,13 @@ void wiphy_apply_custom_regulatory(struct wiphy *wiphy, if (IS_ERR(new_regd)) return; + rtnl_lock(); + tmp = get_wiphy_regdom(wiphy); rcu_assign_pointer(wiphy->regd, new_regd); rcu_free_regdom(tmp); + + rtnl_unlock(); } EXPORT_SYMBOL(wiphy_apply_custom_regulatory); diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index ac4a317038f1..4a83117507f5 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -108,9 +108,9 @@ EXPORT_SYMBOL(xsk_get_pool_from_qid); void xsk_clear_pool_at_qid(struct net_device *dev, u16 queue_id) { - if (queue_id < dev->real_num_rx_queues) + if (queue_id < dev->num_rx_queues) dev->_rx[queue_id].pool = NULL; - if (queue_id < dev->real_num_tx_queues) + if (queue_id < dev->num_tx_queues) dev->_tx[queue_id].pool = NULL; } @@ -423,9 +423,9 @@ static void xsk_destruct_skb(struct sk_buff *skb) struct xdp_sock *xs = xdp_sk(skb->sk); unsigned long flags; - spin_lock_irqsave(&xs->tx_completion_lock, flags); + spin_lock_irqsave(&xs->pool->cq_lock, flags); xskq_prod_submit_addr(xs->pool->cq, addr); - spin_unlock_irqrestore(&xs->tx_completion_lock, flags); + spin_unlock_irqrestore(&xs->pool->cq_lock, flags); sock_wfree(skb); } @@ -437,6 +437,7 @@ static int xsk_generic_xmit(struct sock *sk) bool sent_frame = false; struct xdp_desc desc; struct sk_buff *skb; + unsigned long flags; int err = 0; mutex_lock(&xs->mutex); @@ -468,10 +469,13 @@ static int xsk_generic_xmit(struct sock *sk) * if there is space in it. This avoids having to implement * any buffering in the Tx path. */ + spin_lock_irqsave(&xs->pool->cq_lock, flags); if (unlikely(err) || xskq_prod_reserve(xs->pool->cq)) { + spin_unlock_irqrestore(&xs->pool->cq_lock, flags); kfree_skb(skb); goto out; } + spin_unlock_irqrestore(&xs->pool->cq_lock, flags); skb->dev = xs->dev; skb->priority = sk->sk_priority; @@ -483,6 +487,9 @@ static int xsk_generic_xmit(struct sock *sk) if (err == NETDEV_TX_BUSY) { /* Tell user-space to retry the send */ skb->destructor = sock_wfree; + spin_lock_irqsave(&xs->pool->cq_lock, flags); + xskq_prod_cancel(xs->pool->cq); + spin_unlock_irqrestore(&xs->pool->cq_lock, flags); /* Free skb without triggering the perf drop trace */ consume_skb(skb); err = -EAGAIN; @@ -878,6 +885,10 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len) } } + /* FQ and CQ are now owned by the buffer pool and cleaned up with it. 
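+	 * Clearing the temporary pointers here keeps the socket teardown
+	 * path from destroying rings that the pool will free itself.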
*/ + xs->fq_tmp = NULL; + xs->cq_tmp = NULL; + xs->dev = dev; xs->zc = xs->umem->zc; xs->queue_id = qid; @@ -1299,7 +1310,6 @@ static int xsk_create(struct net *net, struct socket *sock, int protocol, xs->state = XSK_READY; mutex_init(&xs->mutex); spin_lock_init(&xs->rx_lock); - spin_lock_init(&xs->tx_completion_lock); INIT_LIST_HEAD(&xs->map_list); spin_lock_init(&xs->map_list_lock); diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c index 67a4494d63b6..20598eea658c 100644 --- a/net/xdp/xsk_buff_pool.c +++ b/net/xdp/xsk_buff_pool.c @@ -71,12 +71,11 @@ struct xsk_buff_pool *xp_create_and_assign_umem(struct xdp_sock *xs, INIT_LIST_HEAD(&pool->free_list); INIT_LIST_HEAD(&pool->xsk_tx_list); spin_lock_init(&pool->xsk_tx_list_lock); + spin_lock_init(&pool->cq_lock); refcount_set(&pool->users, 1); pool->fq = xs->fq_tmp; pool->cq = xs->cq_tmp; - xs->fq_tmp = NULL; - xs->cq_tmp = NULL; for (i = 0; i < pool->free_heads_cnt; i++) { xskb = &pool->heads[i]; diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h index 4a9663aa7afe..2823b7c3302d 100644 --- a/net/xdp/xsk_queue.h +++ b/net/xdp/xsk_queue.h @@ -334,6 +334,11 @@ static inline bool xskq_prod_is_full(struct xsk_queue *q) return xskq_prod_nb_free(q, 1) ? false : true; } +static inline void xskq_prod_cancel(struct xsk_queue *q) +{ + q->cached_prod--; +} + static inline int xskq_prod_reserve(struct xsk_queue *q) { if (xskq_prod_is_full(q)) |