Diffstat (limited to 'net')
112 files changed, 7893 insertions, 2748 deletions
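A recurring pattern in the net/9p hunks below is the move to reference-counted fids: p9_fid_create() now starts each fid with a count of one, and p9_client_clunk() sends TCLUNK only when the last reference is dropped (and now also rejects IS_ERR() fids, not just NULL). A minimal user-space sketch of that lifetime model, with simplified types and a hypothetical send_tclunk() helper standing in for the kernel's refcount_t API:

#include <stdio.h>

/* Simplified stand-ins for struct p9_fid and refcount_t. */
struct p9_fid {
	int fid;
	int count;			/* refcount_t in the kernel */
};

static void send_tclunk(struct p9_fid *fid)	/* hypothetical helper */
{
	printf("TCLUNK fid %d\n", fid->fid);
}

/* Mirrors p9_fid_create(): new fids start with one reference. */
static void fid_init(struct p9_fid *fid, int id)
{
	fid->fid = id;
	fid->count = 1;			/* refcount_set(&fid->count, 1) */
}

/* Other users (walks, opens) would take a reference here,
 * i.e. refcount_inc() in the kernel. */
static void fid_get(struct p9_fid *fid)
{
	fid->count++;
}

/* Mirrors p9_client_clunk(): only the final put actually clunks. */
static void fid_put(struct p9_fid *fid)
{
	if (--fid->count > 0)		/* refcount_dec_and_test() */
		return;
	send_tclunk(fid);
}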
diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c index f292e0267bb9..8b644113715e 100644 --- a/net/8021q/vlan.c +++ b/net/8021q/vlan.c @@ -284,8 +284,7 @@ static int register_vlan_device(struct net_device *real_dev, u16 vlan_id) return 0; out_free_newdev: - if (new_dev->reg_state == NETREG_UNINITIALIZED) - free_netdev(new_dev); + free_netdev(new_dev); return err; } diff --git a/net/9p/client.c b/net/9p/client.c index 785a7bb6a539..4f62f299da0c 100644 --- a/net/9p/client.c +++ b/net/9p/client.c @@ -903,6 +903,7 @@ static struct p9_fid *p9_fid_create(struct p9_client *clnt) fid->clnt = clnt; fid->rdir = NULL; fid->fid = 0; + refcount_set(&fid->count, 1); idr_preload(GFP_KERNEL); spin_lock_irq(&clnt->lock); @@ -910,7 +911,6 @@ static struct p9_fid *p9_fid_create(struct p9_client *clnt) GFP_NOWAIT); spin_unlock_irq(&clnt->lock); idr_preload_end(); - if (!ret) return fid; @@ -1189,7 +1189,6 @@ struct p9_fid *p9_client_walk(struct p9_fid *oldfid, uint16_t nwname, p9_debug(P9_DEBUG_9P, ">>> TWALK fids %d,%d nwname %ud wname[0] %s\n", oldfid->fid, fid->fid, nwname, wnames ? wnames[0] : NULL); - req = p9_client_rpc(clnt, P9_TWALK, "ddT", oldfid->fid, fid->fid, nwname, wnames); if (IS_ERR(req)) { @@ -1221,7 +1220,7 @@ struct p9_fid *p9_client_walk(struct p9_fid *oldfid, uint16_t nwname, if (nwname) memmove(&fid->qid, &wqids[nwqids - 1], sizeof(struct p9_qid)); else - fid->qid = oldfid->qid; + memmove(&fid->qid, &oldfid->qid, sizeof(struct p9_qid)); kfree(wqids); return fid; @@ -1274,6 +1273,7 @@ int p9_client_open(struct p9_fid *fid, int mode) p9_is_proto_dotl(clnt) ? "RLOPEN" : "ROPEN", qid.type, (unsigned long long)qid.path, qid.version, iounit); + memmove(&fid->qid, &qid, sizeof(struct p9_qid)); fid->mode = mode; fid->iounit = iounit; @@ -1319,6 +1319,7 @@ int p9_client_create_dotl(struct p9_fid *ofid, const char *name, u32 flags, u32 (unsigned long long)qid->path, qid->version, iounit); + memmove(&ofid->qid, qid, sizeof(struct p9_qid)); ofid->mode = mode; ofid->iounit = iounit; @@ -1364,6 +1365,7 @@ int p9_client_fcreate(struct p9_fid *fid, const char *name, u32 perm, int mode, (unsigned long long)qid.path, qid.version, iounit); + memmove(&fid->qid, &qid, sizeof(struct p9_qid)); fid->mode = mode; fid->iounit = iounit; @@ -1460,12 +1462,14 @@ int p9_client_clunk(struct p9_fid *fid) struct p9_req_t *req; int retries = 0; - if (!fid) { - pr_warn("%s (%d): Trying to clunk with NULL fid\n", + if (!fid || IS_ERR(fid)) { + pr_warn("%s (%d): Trying to clunk with invalid fid\n", __func__, task_pid_nr(current)); dump_stack(); return 0; } + if (!refcount_dec_and_test(&fid->count)) + return 0; again: p9_debug(P9_DEBUG_9P, ">>> TCLUNK fid %d (try %d)\n", fid->fid, diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index c1c30a9f76f3..8b796c499cbb 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -272,7 +272,8 @@ int bpf_prog_test_run_raw_tp(struct bpf_prog *prog, kattr->test.repeat) return -EINVAL; - if (ctx_size_in < prog->aux->max_ctx_offset) + if (ctx_size_in < prog->aux->max_ctx_offset || + ctx_size_in > MAX_BPF_FUNC_ARGS * sizeof(u64)) return -EINVAL; if ((kattr->test.flags & BPF_F_TEST_RUN_ON_CPU) == 0 && cpu != 0) diff --git a/net/can/isotp.c b/net/can/isotp.c index 7839c3b9e5be..3ef7f78e553b 100644 --- a/net/can/isotp.c +++ b/net/can/isotp.c @@ -1155,6 +1155,7 @@ static int isotp_getname(struct socket *sock, struct sockaddr *uaddr, int peer) if (peer) return -EOPNOTSUPP; + memset(addr, 0, sizeof(*addr)); addr->can_family = AF_CAN; addr->can_ifindex = so->ifindex; addr->can_addr.tp.rx_id = 
so->rxid; diff --git a/net/ceph/Kconfig b/net/ceph/Kconfig index f36f9a3a4e20..c5c4eef3a9ff 100644 --- a/net/ceph/Kconfig +++ b/net/ceph/Kconfig @@ -5,6 +5,9 @@ config CEPH_LIB select LIBCRC32C select CRYPTO_AES select CRYPTO_CBC + select CRYPTO_GCM + select CRYPTO_HMAC + select CRYPTO_SHA256 select CRYPTO select KEYS default n diff --git a/net/ceph/Makefile b/net/ceph/Makefile index ce09bb4fb249..8802a0c0155d 100644 --- a/net/ceph/Makefile +++ b/net/ceph/Makefile @@ -14,4 +14,5 @@ libceph-y := ceph_common.o messenger.o msgpool.o buffer.o pagelist.o \ crypto.o armor.o \ auth_x.o \ ceph_strings.o ceph_hash.o \ - pagevec.o snapshot.o string_table.o + pagevec.o snapshot.o string_table.o \ + messenger_v1.o messenger_v2.o diff --git a/net/ceph/auth.c b/net/ceph/auth.c index fbeee068ea14..eb261aa5fe18 100644 --- a/net/ceph/auth.c +++ b/net/ceph/auth.c @@ -21,28 +21,31 @@ static u32 supported_protocols[] = { CEPH_AUTH_CEPHX }; -static int ceph_auth_init_protocol(struct ceph_auth_client *ac, int protocol) +static int init_protocol(struct ceph_auth_client *ac, int proto) { - switch (protocol) { + dout("%s proto %d\n", __func__, proto); + + switch (proto) { case CEPH_AUTH_NONE: return ceph_auth_none_init(ac); case CEPH_AUTH_CEPHX: return ceph_x_init(ac); default: - return -ENOENT; + pr_err("bad auth protocol %d\n", proto); + return -EINVAL; } } /* * setup, teardown. */ -struct ceph_auth_client *ceph_auth_init(const char *name, const struct ceph_crypto_key *key) +struct ceph_auth_client *ceph_auth_init(const char *name, + const struct ceph_crypto_key *key, + const int *con_modes) { struct ceph_auth_client *ac; int ret; - dout("auth_init name '%s'\n", name); - ret = -ENOMEM; ac = kzalloc(sizeof(*ac), GFP_NOFS); if (!ac) @@ -54,8 +57,12 @@ struct ceph_auth_client *ceph_auth_init(const char *name, const struct ceph_cryp ac->name = name; else ac->name = CEPH_AUTH_NAME_DEFAULT; - dout("auth_init name %s\n", ac->name); ac->key = key; + ac->preferred_mode = con_modes[0]; + ac->fallback_mode = con_modes[1]; + + dout("%s name '%s' preferred_mode %d fallback_mode %d\n", __func__, + ac->name, ac->preferred_mode, ac->fallback_mode); return ac; out: @@ -145,31 +152,35 @@ bad: goto out; } -static int ceph_build_auth_request(struct ceph_auth_client *ac, - void *msg_buf, size_t msg_len) +static int build_request(struct ceph_auth_client *ac, bool add_header, + void *buf, int buf_len) { - struct ceph_mon_request_header *monhdr = msg_buf; - void *p = monhdr + 1; - void *end = msg_buf + msg_len; + void *end = buf + buf_len; + void *p; int ret; - monhdr->have_version = 0; - monhdr->session_mon = cpu_to_le16(-1); - monhdr->session_mon_tid = 0; - - ceph_encode_32(&p, ac->protocol); + p = buf; + if (add_header) { + /* struct ceph_mon_request_header + protocol */ + ceph_encode_64_safe(&p, end, 0, e_range); + ceph_encode_16_safe(&p, end, -1, e_range); + ceph_encode_64_safe(&p, end, 0, e_range); + ceph_encode_32_safe(&p, end, ac->protocol, e_range); + } + ceph_encode_need(&p, end, sizeof(u32), e_range); ret = ac->ops->build_request(ac, p + sizeof(u32), end); if (ret < 0) { - pr_err("error %d building auth method %s request\n", ret, - ac->ops->name); - goto out; + pr_err("auth protocol '%s' building request failed: %d\n", + ceph_auth_proto_name(ac->protocol), ret); + return ret; } dout(" built request %d bytes\n", ret); ceph_encode_32(&p, ret); - ret = p + ret - msg_buf; -out: - return ret; + return p + ret - buf; + +e_range: + return -ERANGE; } /* @@ -229,10 +240,10 @@ int ceph_handle_auth_reply(struct ceph_auth_client *ac, 
ac->ops = NULL; } if (ac->protocol != protocol) { - ret = ceph_auth_init_protocol(ac, protocol); + ret = init_protocol(ac, protocol); if (ret) { - pr_err("error %d on auth protocol %d init\n", - ret, protocol); + pr_err("auth protocol '%s' init failed: %d\n", + ceph_auth_proto_name(protocol), ret); goto out; } } @@ -240,12 +251,13 @@ int ceph_handle_auth_reply(struct ceph_auth_client *ac, ac->negotiating = false; } - ret = ac->ops->handle_reply(ac, result, payload, payload_end); - if (ret == -EAGAIN) { - ret = ceph_build_auth_request(ac, reply_buf, reply_len); - } else if (ret) { - pr_err("auth method '%s' error %d\n", ac->ops->name, ret); - } + ret = ac->ops->handle_reply(ac, result, payload, payload_end, + NULL, NULL, NULL, NULL); + if (ret == -EAGAIN) + ret = build_request(ac, true, reply_buf, reply_len); + else if (ret) + pr_err("auth protocol '%s' mauth authentication failed: %d\n", + ceph_auth_proto_name(ac->protocol), result); out: mutex_unlock(&ac->mutex); @@ -264,7 +276,7 @@ int ceph_build_auth(struct ceph_auth_client *ac, mutex_lock(&ac->mutex); if (ac->ops->should_authenticate(ac)) - ret = ceph_build_auth_request(ac, msg_buf, msg_len); + ret = build_request(ac, true, msg_buf, msg_len); mutex_unlock(&ac->mutex); return ret; } @@ -281,19 +293,38 @@ int ceph_auth_is_authenticated(struct ceph_auth_client *ac) } EXPORT_SYMBOL(ceph_auth_is_authenticated); -int ceph_auth_create_authorizer(struct ceph_auth_client *ac, - int peer_type, - struct ceph_auth_handshake *auth) +int __ceph_auth_get_authorizer(struct ceph_auth_client *ac, + struct ceph_auth_handshake *auth, + int peer_type, bool force_new, + int *proto, int *pref_mode, int *fallb_mode) { - int ret = 0; + int ret; mutex_lock(&ac->mutex); - if (ac->ops && ac->ops->create_authorizer) + if (force_new && auth->authorizer) { + ceph_auth_destroy_authorizer(auth->authorizer); + auth->authorizer = NULL; + } + if (!auth->authorizer) ret = ac->ops->create_authorizer(ac, peer_type, auth); + else if (ac->ops->update_authorizer) + ret = ac->ops->update_authorizer(ac, peer_type, auth); + else + ret = 0; + if (ret) + goto out; + + *proto = ac->protocol; + if (pref_mode && fallb_mode) { + *pref_mode = ac->preferred_mode; + *fallb_mode = ac->fallback_mode; + } + +out: mutex_unlock(&ac->mutex); return ret; } -EXPORT_SYMBOL(ceph_auth_create_authorizer); +EXPORT_SYMBOL(__ceph_auth_get_authorizer); void ceph_auth_destroy_authorizer(struct ceph_authorizer *a) { @@ -301,20 +332,6 @@ void ceph_auth_destroy_authorizer(struct ceph_authorizer *a) } EXPORT_SYMBOL(ceph_auth_destroy_authorizer); -int ceph_auth_update_authorizer(struct ceph_auth_client *ac, - int peer_type, - struct ceph_auth_handshake *a) -{ - int ret = 0; - - mutex_lock(&ac->mutex); - if (ac->ops && ac->ops->update_authorizer) - ret = ac->ops->update_authorizer(ac, peer_type, a); - mutex_unlock(&ac->mutex); - return ret; -} -EXPORT_SYMBOL(ceph_auth_update_authorizer); - int ceph_auth_add_authorizer_challenge(struct ceph_auth_client *ac, struct ceph_authorizer *a, void *challenge_buf, @@ -332,13 +349,18 @@ int ceph_auth_add_authorizer_challenge(struct ceph_auth_client *ac, EXPORT_SYMBOL(ceph_auth_add_authorizer_challenge); int ceph_auth_verify_authorizer_reply(struct ceph_auth_client *ac, - struct ceph_authorizer *a) + struct ceph_authorizer *a, + void *reply, int reply_len, + u8 *session_key, int *session_key_len, + u8 *con_secret, int *con_secret_len) { int ret = 0; mutex_lock(&ac->mutex); if (ac->ops && ac->ops->verify_authorizer_reply) - ret = ac->ops->verify_authorizer_reply(ac, a); + 
ret = ac->ops->verify_authorizer_reply(ac, a, + reply, reply_len, session_key, session_key_len, + con_secret, con_secret_len); mutex_unlock(&ac->mutex); return ret; } @@ -352,3 +374,279 @@ void ceph_auth_invalidate_authorizer(struct ceph_auth_client *ac, int peer_type) mutex_unlock(&ac->mutex); } EXPORT_SYMBOL(ceph_auth_invalidate_authorizer); + +/* + * msgr2 authentication + */ + +static bool contains(const int *arr, int cnt, int val) +{ + int i; + + for (i = 0; i < cnt; i++) { + if (arr[i] == val) + return true; + } + + return false; +} + +static int encode_con_modes(void **p, void *end, int pref_mode, int fallb_mode) +{ + WARN_ON(pref_mode == CEPH_CON_MODE_UNKNOWN); + if (fallb_mode != CEPH_CON_MODE_UNKNOWN) { + ceph_encode_32_safe(p, end, 2, e_range); + ceph_encode_32_safe(p, end, pref_mode, e_range); + ceph_encode_32_safe(p, end, fallb_mode, e_range); + } else { + ceph_encode_32_safe(p, end, 1, e_range); + ceph_encode_32_safe(p, end, pref_mode, e_range); + } + + return 0; + +e_range: + return -ERANGE; +} + +/* + * Similar to ceph_auth_build_hello(). + */ +int ceph_auth_get_request(struct ceph_auth_client *ac, void *buf, int buf_len) +{ + int proto = ac->key ? CEPH_AUTH_CEPHX : CEPH_AUTH_NONE; + void *end = buf + buf_len; + void *lenp; + void *p; + int ret; + + mutex_lock(&ac->mutex); + if (ac->protocol == CEPH_AUTH_UNKNOWN) { + ret = init_protocol(ac, proto); + if (ret) { + pr_err("auth protocol '%s' init failed: %d\n", + ceph_auth_proto_name(proto), ret); + goto out; + } + } else { + WARN_ON(ac->protocol != proto); + ac->ops->reset(ac); + } + + p = buf; + ceph_encode_32_safe(&p, end, ac->protocol, e_range); + ret = encode_con_modes(&p, end, ac->preferred_mode, ac->fallback_mode); + if (ret) + goto out; + + lenp = p; + p += 4; /* space for len */ + + ceph_encode_8_safe(&p, end, CEPH_AUTH_MODE_MON, e_range); + ret = ceph_auth_entity_name_encode(ac->name, &p, end); + if (ret) + goto out; + + ceph_encode_64_safe(&p, end, ac->global_id, e_range); + ceph_encode_32(&lenp, p - lenp - 4); + ret = p - buf; + +out: + mutex_unlock(&ac->mutex); + return ret; + +e_range: + ret = -ERANGE; + goto out; +} + +int ceph_auth_handle_reply_more(struct ceph_auth_client *ac, void *reply, + int reply_len, void *buf, int buf_len) +{ + int ret; + + mutex_lock(&ac->mutex); + ret = ac->ops->handle_reply(ac, 0, reply, reply + reply_len, + NULL, NULL, NULL, NULL); + if (ret == -EAGAIN) + ret = build_request(ac, false, buf, buf_len); + else + WARN_ON(ret >= 0); + mutex_unlock(&ac->mutex); + return ret; +} + +int ceph_auth_handle_reply_done(struct ceph_auth_client *ac, + u64 global_id, void *reply, int reply_len, + u8 *session_key, int *session_key_len, + u8 *con_secret, int *con_secret_len) +{ + int ret; + + mutex_lock(&ac->mutex); + if (global_id && ac->global_id != global_id) { + dout("%s global_id %llu -> %llu\n", __func__, ac->global_id, + global_id); + ac->global_id = global_id; + } + + ret = ac->ops->handle_reply(ac, 0, reply, reply + reply_len, + session_key, session_key_len, + con_secret, con_secret_len); + mutex_unlock(&ac->mutex); + return ret; +} + +bool ceph_auth_handle_bad_method(struct ceph_auth_client *ac, + int used_proto, int result, + const int *allowed_protos, int proto_cnt, + const int *allowed_modes, int mode_cnt) +{ + mutex_lock(&ac->mutex); + WARN_ON(used_proto != ac->protocol); + + if (result == -EOPNOTSUPP) { + if (!contains(allowed_protos, proto_cnt, ac->protocol)) { + pr_err("auth protocol '%s' not allowed\n", + ceph_auth_proto_name(ac->protocol)); + goto not_allowed; + } + if 
(!contains(allowed_modes, mode_cnt, ac->preferred_mode) && + (ac->fallback_mode == CEPH_CON_MODE_UNKNOWN || + !contains(allowed_modes, mode_cnt, ac->fallback_mode))) { + pr_err("preferred mode '%s' not allowed\n", + ceph_con_mode_name(ac->preferred_mode)); + if (ac->fallback_mode == CEPH_CON_MODE_UNKNOWN) + pr_err("no fallback mode\n"); + else + pr_err("fallback mode '%s' not allowed\n", + ceph_con_mode_name(ac->fallback_mode)); + goto not_allowed; + } + } + + WARN_ON(result == -EOPNOTSUPP || result >= 0); + pr_err("auth protocol '%s' msgr authentication failed: %d\n", + ceph_auth_proto_name(ac->protocol), result); + + mutex_unlock(&ac->mutex); + return true; + +not_allowed: + mutex_unlock(&ac->mutex); + return false; +} + +int ceph_auth_get_authorizer(struct ceph_auth_client *ac, + struct ceph_auth_handshake *auth, + int peer_type, void *buf, int *buf_len) +{ + void *end = buf + *buf_len; + int pref_mode, fallb_mode; + int proto; + void *p; + int ret; + + ret = __ceph_auth_get_authorizer(ac, auth, peer_type, true, &proto, + &pref_mode, &fallb_mode); + if (ret) + return ret; + + p = buf; + ceph_encode_32_safe(&p, end, proto, e_range); + ret = encode_con_modes(&p, end, pref_mode, fallb_mode); + if (ret) + return ret; + + ceph_encode_32_safe(&p, end, auth->authorizer_buf_len, e_range); + *buf_len = p - buf; + return 0; + +e_range: + return -ERANGE; +} +EXPORT_SYMBOL(ceph_auth_get_authorizer); + +int ceph_auth_handle_svc_reply_more(struct ceph_auth_client *ac, + struct ceph_auth_handshake *auth, + void *reply, int reply_len, + void *buf, int *buf_len) +{ + void *end = buf + *buf_len; + void *p; + int ret; + + ret = ceph_auth_add_authorizer_challenge(ac, auth->authorizer, + reply, reply_len); + if (ret) + return ret; + + p = buf; + ceph_encode_32_safe(&p, end, auth->authorizer_buf_len, e_range); + *buf_len = p - buf; + return 0; + +e_range: + return -ERANGE; +} +EXPORT_SYMBOL(ceph_auth_handle_svc_reply_more); + +int ceph_auth_handle_svc_reply_done(struct ceph_auth_client *ac, + struct ceph_auth_handshake *auth, + void *reply, int reply_len, + u8 *session_key, int *session_key_len, + u8 *con_secret, int *con_secret_len) +{ + return ceph_auth_verify_authorizer_reply(ac, auth->authorizer, + reply, reply_len, session_key, session_key_len, + con_secret, con_secret_len); +} +EXPORT_SYMBOL(ceph_auth_handle_svc_reply_done); + +bool ceph_auth_handle_bad_authorizer(struct ceph_auth_client *ac, + int peer_type, int used_proto, int result, + const int *allowed_protos, int proto_cnt, + const int *allowed_modes, int mode_cnt) +{ + mutex_lock(&ac->mutex); + WARN_ON(used_proto != ac->protocol); + + if (result == -EOPNOTSUPP) { + if (!contains(allowed_protos, proto_cnt, ac->protocol)) { + pr_err("auth protocol '%s' not allowed by %s\n", + ceph_auth_proto_name(ac->protocol), + ceph_entity_type_name(peer_type)); + goto not_allowed; + } + if (!contains(allowed_modes, mode_cnt, ac->preferred_mode) && + (ac->fallback_mode == CEPH_CON_MODE_UNKNOWN || + !contains(allowed_modes, mode_cnt, ac->fallback_mode))) { + pr_err("preferred mode '%s' not allowed by %s\n", + ceph_con_mode_name(ac->preferred_mode), + ceph_entity_type_name(peer_type)); + if (ac->fallback_mode == CEPH_CON_MODE_UNKNOWN) + pr_err("no fallback mode\n"); + else + pr_err("fallback mode '%s' not allowed by %s\n", + ceph_con_mode_name(ac->fallback_mode), + ceph_entity_type_name(peer_type)); + goto not_allowed; + } + } + + WARN_ON(result == -EOPNOTSUPP || result >= 0); + pr_err("auth protocol '%s' authorization to %s failed: %d\n", + 
ceph_auth_proto_name(ac->protocol), + ceph_entity_type_name(peer_type), result); + + if (ac->ops->invalidate_authorizer) + ac->ops->invalidate_authorizer(ac, peer_type); + + mutex_unlock(&ac->mutex); + return true; + +not_allowed: + mutex_unlock(&ac->mutex); + return false; +} +EXPORT_SYMBOL(ceph_auth_handle_bad_authorizer); diff --git a/net/ceph/auth_none.c b/net/ceph/auth_none.c index edb7042479ed..70e86e462250 100644 --- a/net/ceph/auth_none.c +++ b/net/ceph/auth_none.c @@ -70,7 +70,9 @@ static int build_request(struct ceph_auth_client *ac, void *buf, void *end) * authenticate state, so nothing happens here. */ static int handle_reply(struct ceph_auth_client *ac, int result, - void *buf, void *end) + void *buf, void *end, u8 *session_key, + int *session_key_len, u8 *con_secret, + int *con_secret_len) { struct ceph_auth_none_info *xi = ac->private; @@ -116,7 +118,6 @@ static int ceph_auth_none_create_authorizer( } static const struct ceph_auth_client_ops ceph_auth_none_ops = { - .name = "none", .reset = reset, .destroy = destroy, .is_authenticated = is_authenticated, diff --git a/net/ceph/auth_x.c b/net/ceph/auth_x.c index b52732337ca6..ca44c327bace 100644 --- a/net/ceph/auth_x.c +++ b/net/ceph/auth_x.c @@ -22,12 +22,15 @@ static void ceph_x_validate_tickets(struct ceph_auth_client *ac, int *pneed); static int ceph_x_is_authenticated(struct ceph_auth_client *ac) { struct ceph_x_info *xi = ac->private; - int need; + int missing; + int need; /* missing + need renewal */ ceph_x_validate_tickets(ac, &need); - dout("ceph_x_is_authenticated want=%d need=%d have=%d\n", - ac->want_keys, need, xi->have_keys); - return (ac->want_keys & xi->have_keys) == ac->want_keys; + missing = ac->want_keys & ~xi->have_keys; + WARN_ON((need & missing) != missing); + dout("%s want 0x%x have 0x%x missing 0x%x -> %d\n", __func__, + ac->want_keys, xi->have_keys, missing, !missing); + return !missing; } static int ceph_x_should_authenticate(struct ceph_auth_client *ac) @@ -36,9 +39,9 @@ static int ceph_x_should_authenticate(struct ceph_auth_client *ac) int need; ceph_x_validate_tickets(ac, &need); - dout("ceph_x_should_authenticate want=%d need=%d have=%d\n", - ac->want_keys, need, xi->have_keys); - return need != 0; + dout("%s want 0x%x have 0x%x need 0x%x -> %d\n", __func__, + ac->want_keys, xi->have_keys, need, !!need); + return !!need; } static int ceph_x_encrypt_offset(void) @@ -197,7 +200,7 @@ static int process_one_ticket(struct ceph_auth_client *ac, dout(" decrypted %d bytes\n", ret); dend = dp + ret; - tkt_struct_v = ceph_decode_8(&dp); + ceph_decode_8_safe(&dp, dend, tkt_struct_v, bad); if (tkt_struct_v != 1) goto bad; @@ -205,6 +208,7 @@ static int process_one_ticket(struct ceph_auth_client *ac, if (ret) goto out; + ceph_decode_need(&dp, dend, sizeof(struct ceph_timespec), bad); ceph_decode_timespec64(&validity, dp); dp += sizeof(struct ceph_timespec); new_expires = ktime_get_real_seconds() + validity.tv_sec; @@ -265,22 +269,21 @@ out: static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac, struct ceph_crypto_key *secret, - void *buf, void *end) + void **p, void *end) { - void *p = buf; u8 reply_struct_v; u32 num; int ret; - ceph_decode_8_safe(&p, end, reply_struct_v, bad); + ceph_decode_8_safe(p, end, reply_struct_v, bad); if (reply_struct_v != 1) return -EINVAL; - ceph_decode_32_safe(&p, end, num, bad); + ceph_decode_32_safe(p, end, num, bad); dout("%d tickets\n", num); while (num--) { - ret = process_one_ticket(ac, secret, &p, end); + ret = process_one_ticket(ac, secret, p, end); if (ret) 
return ret; } @@ -379,6 +382,7 @@ static int ceph_x_build_authorizer(struct ceph_auth_client *ac, } } au->service = th->service; + WARN_ON(!th->secret_id); au->secret_id = th->secret_id; msg_a = au->buf->vec.iov_base; @@ -442,9 +446,10 @@ static bool need_key(struct ceph_x_ticket_handler *th) static bool have_key(struct ceph_x_ticket_handler *th) { - if (th->have_key) { - if (ktime_get_real_seconds() >= th->expires) - th->have_key = false; + if (th->have_key && ktime_get_real_seconds() >= th->expires) { + dout("ticket %d (%s) secret_id %llu expired\n", th->service, + ceph_entity_type_name(th->service), th->secret_id); + th->have_key = false; } return th->have_key; @@ -486,6 +491,7 @@ static int ceph_x_build_request(struct ceph_auth_client *ac, struct ceph_x_info *xi = ac->private; int need; struct ceph_x_request_header *head = buf; + void *p; int ret; struct ceph_x_ticket_handler *th = get_ticket_handler(ac, CEPH_ENTITY_TYPE_AUTH); @@ -494,18 +500,17 @@ static int ceph_x_build_request(struct ceph_auth_client *ac, return PTR_ERR(th); ceph_x_validate_tickets(ac, &need); - - dout("build_request want %x have %x need %x\n", - ac->want_keys, xi->have_keys, need); + dout("%s want 0x%x have 0x%x need 0x%x\n", __func__, ac->want_keys, + xi->have_keys, need); if (need & CEPH_ENTITY_TYPE_AUTH) { struct ceph_x_authenticate *auth = (void *)(head + 1); - void *p = auth + 1; void *enc_buf = xi->auth_authorizer.enc_buf; struct ceph_x_challenge_blob *blob = enc_buf + ceph_x_encrypt_offset(); u64 *u; + p = auth + 1; if (p > end) return -ERANGE; @@ -521,7 +526,7 @@ static int ceph_x_build_request(struct ceph_auth_client *ac, if (ret < 0) return ret; - auth->struct_v = 1; + auth->struct_v = 2; /* nautilus+ */ auth->key = 0; for (u = (u64 *)enc_buf; u + 1 <= (u64 *)(enc_buf + ret); u++) auth->key ^= *(__le64 *)u; @@ -534,39 +539,137 @@ static int ceph_x_build_request(struct ceph_auth_client *ac, if (ret < 0) return ret; + /* nautilus+: request service tickets at the same time */ + need = ac->want_keys & ~CEPH_ENTITY_TYPE_AUTH; + WARN_ON(!need); + ceph_encode_32_safe(&p, end, need, e_range); return p - buf; } if (need) { - void *p = head + 1; - struct ceph_x_service_ticket_request *req; - - if (p > end) - return -ERANGE; - head->op = cpu_to_le16(CEPHX_GET_PRINCIPAL_SESSION_KEY); - + dout(" get_principal_session_key\n"); ret = ceph_x_build_authorizer(ac, th, &xi->auth_authorizer); if (ret) return ret; - ceph_encode_copy(&p, xi->auth_authorizer.buf->vec.iov_base, - xi->auth_authorizer.buf->vec.iov_len); - req = p; - req->keys = cpu_to_le32(need); - p += sizeof(*req); + p = buf; + ceph_encode_16_safe(&p, end, CEPHX_GET_PRINCIPAL_SESSION_KEY, + e_range); + ceph_encode_copy_safe(&p, end, + xi->auth_authorizer.buf->vec.iov_base, + xi->auth_authorizer.buf->vec.iov_len, e_range); + ceph_encode_8_safe(&p, end, 1, e_range); + ceph_encode_32_safe(&p, end, need, e_range); return p - buf; } return 0; + +e_range: + return -ERANGE; +} + +static int decode_con_secret(void **p, void *end, u8 *con_secret, + int *con_secret_len) +{ + int len; + + ceph_decode_32_safe(p, end, len, bad); + ceph_decode_need(p, end, len, bad); + + dout("%s len %d\n", __func__, len); + if (con_secret) { + if (len > CEPH_MAX_CON_SECRET_LEN) { + pr_err("connection secret too big %d\n", len); + goto bad_memzero; + } + memcpy(con_secret, *p, len); + *con_secret_len = len; + } + memzero_explicit(*p, len); + *p += len; + return 0; + +bad_memzero: + memzero_explicit(*p, len); +bad: + pr_err("failed to decode connection secret\n"); + return -EINVAL; +} + 
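decode_con_secret() just above is a good example of the bounded-copy-and-scrub discipline running through the msgr2 secret handling (see also the memzero_explicit() added to ceph_crypto_key_decode() and the kfree_sensitive() in ceph_crypto_key_destroy() further down): length-check against the destination, copy out, and zero the wire copy on every path. A stand-alone sketch of the same pattern, assuming a little-endian host and a hypothetical MAX_SECRET_LEN in place of CEPH_MAX_CON_SECRET_LEN:

#include <string.h>

#define MAX_SECRET_LEN 64	/* stands in for CEPH_MAX_CON_SECRET_LEN */

/*
 * Copy a length-prefixed secret out of a wire buffer, then scrub the
 * wire copy.  Returns 0 on success, -1 on a short or oversized blob.
 * Note: the kernel uses memzero_explicit() so the scrub cannot be
 * optimized away; plain memset() here is only illustrative.
 */
static int copy_and_scrub(unsigned char **p, unsigned char *end,
			  unsigned char *out, int *out_len)
{
	unsigned int len;

	if (end - *p < 4)			/* ceph_decode_32_safe() */
		return -1;
	memcpy(&len, *p, 4);			/* wire is little-endian */
	*p += 4;
	if ((unsigned int)(end - *p) < len)	/* ceph_decode_need() */
		return -1;
	if (len > MAX_SECRET_LEN) {
		memset(*p, 0, len);		/* scrub even on error */
		return -1;
	}
	memcpy(out, *p, len);
	*out_len = len;
	memset(*p, 0, len);			/* scrub the wire copy */
	*p += len;
	return 0;
}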
+static int handle_auth_session_key(struct ceph_auth_client *ac, + void **p, void *end, + u8 *session_key, int *session_key_len, + u8 *con_secret, int *con_secret_len) +{ + struct ceph_x_info *xi = ac->private; + struct ceph_x_ticket_handler *th; + void *dp, *dend; + int len; + int ret; + + /* AUTH ticket */ + ret = ceph_x_proc_ticket_reply(ac, &xi->secret, p, end); + if (ret) + return ret; + + if (*p == end) { + /* pre-nautilus (or didn't request service tickets!) */ + WARN_ON(session_key || con_secret); + return 0; + } + + th = get_ticket_handler(ac, CEPH_ENTITY_TYPE_AUTH); + if (IS_ERR(th)) + return PTR_ERR(th); + + if (session_key) { + memcpy(session_key, th->session_key.key, th->session_key.len); + *session_key_len = th->session_key.len; + } + + /* connection secret */ + ceph_decode_32_safe(p, end, len, e_inval); + dout("%s connection secret blob len %d\n", __func__, len); + if (len > 0) { + dp = *p + ceph_x_encrypt_offset(); + ret = ceph_x_decrypt(&th->session_key, p, *p + len); + if (ret < 0) + return ret; + + dout("%s decrypted %d bytes\n", __func__, ret); + dend = dp + ret; + + ret = decode_con_secret(&dp, dend, con_secret, con_secret_len); + if (ret) + return ret; + } + + /* service tickets */ + ceph_decode_32_safe(p, end, len, e_inval); + dout("%s service tickets blob len %d\n", __func__, len); + if (len > 0) { + ret = ceph_x_proc_ticket_reply(ac, &th->session_key, + p, *p + len); + if (ret) + return ret; + } + + return 0; + +e_inval: + return -EINVAL; } static int ceph_x_handle_reply(struct ceph_auth_client *ac, int result, - void *buf, void *end) + void *buf, void *end, + u8 *session_key, int *session_key_len, + u8 *con_secret, int *con_secret_len) { struct ceph_x_info *xi = ac->private; - struct ceph_x_reply_header *head = buf; struct ceph_x_ticket_handler *th; int len = end - buf; + void *p; int op; int ret; @@ -587,22 +690,25 @@ static int ceph_x_handle_reply(struct ceph_auth_client *ac, int result, return -EAGAIN; } - op = le16_to_cpu(head->op); - result = le32_to_cpu(head->result); + p = buf; + ceph_decode_16_safe(&p, end, op, e_inval); + ceph_decode_32_safe(&p, end, result, e_inval); dout("handle_reply op %d result %d\n", op, result); switch (op) { case CEPHX_GET_AUTH_SESSION_KEY: - /* verify auth key */ - ret = ceph_x_proc_ticket_reply(ac, &xi->secret, - buf + sizeof(*head), end); + /* AUTH ticket + [connection secret] + service tickets */ + ret = handle_auth_session_key(ac, &p, end, session_key, + session_key_len, con_secret, + con_secret_len); break; case CEPHX_GET_PRINCIPAL_SESSION_KEY: th = get_ticket_handler(ac, CEPH_ENTITY_TYPE_AUTH); if (IS_ERR(th)) return PTR_ERR(th); - ret = ceph_x_proc_ticket_reply(ac, &th->session_key, - buf + sizeof(*head), end); + + /* service tickets */ + ret = ceph_x_proc_ticket_reply(ac, &th->session_key, &p, end); break; default: @@ -613,6 +719,9 @@ static int ceph_x_handle_reply(struct ceph_auth_client *ac, int result, if (ac->want_keys == xi->have_keys) return 0; return -EAGAIN; + +e_inval: + return -EINVAL; } static void ceph_x_destroy_authorizer(struct ceph_authorizer *a) @@ -678,40 +787,44 @@ static int ceph_x_update_authorizer( return 0; } -static int decrypt_authorize_challenge(struct ceph_x_authorizer *au, - void *challenge_buf, - int challenge_buf_len, - u64 *server_challenge) +/* + * CephXAuthorizeChallenge + */ +static int decrypt_authorizer_challenge(struct ceph_crypto_key *secret, + void *challenge, int challenge_len, + u64 *server_challenge) { - struct ceph_x_authorize_challenge *ch = - challenge_buf + sizeof(struct 
ceph_x_encrypt_header); + void *dp, *dend; int ret; /* no leading len */ - ret = __ceph_x_decrypt(&au->session_key, challenge_buf, - challenge_buf_len); + ret = __ceph_x_decrypt(secret, challenge, challenge_len); if (ret < 0) return ret; - if (ret < sizeof(*ch)) { - pr_err("bad size %d for ceph_x_authorize_challenge\n", ret); - return -EINVAL; - } - *server_challenge = le64_to_cpu(ch->server_challenge); + dout("%s decrypted %d bytes\n", __func__, ret); + dp = challenge + sizeof(struct ceph_x_encrypt_header); + dend = dp + ret; + + ceph_decode_skip_8(&dp, dend, e_inval); /* struct_v */ + ceph_decode_64_safe(&dp, dend, *server_challenge, e_inval); + dout("%s server_challenge %llu\n", __func__, *server_challenge); return 0; + +e_inval: + return -EINVAL; } static int ceph_x_add_authorizer_challenge(struct ceph_auth_client *ac, struct ceph_authorizer *a, - void *challenge_buf, - int challenge_buf_len) + void *challenge, int challenge_len) { struct ceph_x_authorizer *au = (void *)a; u64 server_challenge; int ret; - ret = decrypt_authorize_challenge(au, challenge_buf, challenge_buf_len, - &server_challenge); + ret = decrypt_authorizer_challenge(&au->session_key, challenge, + challenge_len, &server_challenge); if (ret) { pr_err("failed to decrypt authorize challenge: %d", ret); return ret; @@ -726,29 +839,67 @@ static int ceph_x_add_authorizer_challenge(struct ceph_auth_client *ac, return 0; } +/* + * CephXAuthorizeReply + */ +static int decrypt_authorizer_reply(struct ceph_crypto_key *secret, + void **p, void *end, u64 *nonce_plus_one, + u8 *con_secret, int *con_secret_len) +{ + void *dp, *dend; + u8 struct_v; + int ret; + + dp = *p + ceph_x_encrypt_offset(); + ret = ceph_x_decrypt(secret, p, end); + if (ret < 0) + return ret; + + dout("%s decrypted %d bytes\n", __func__, ret); + dend = dp + ret; + + ceph_decode_8_safe(&dp, dend, struct_v, e_inval); + ceph_decode_64_safe(&dp, dend, *nonce_plus_one, e_inval); + dout("%s nonce_plus_one %llu\n", __func__, *nonce_plus_one); + if (struct_v >= 2) { + ret = decode_con_secret(&dp, dend, con_secret, con_secret_len); + if (ret) + return ret; + } + + return 0; + +e_inval: + return -EINVAL; +} + static int ceph_x_verify_authorizer_reply(struct ceph_auth_client *ac, - struct ceph_authorizer *a) + struct ceph_authorizer *a, + void *reply, int reply_len, + u8 *session_key, int *session_key_len, + u8 *con_secret, int *con_secret_len) { struct ceph_x_authorizer *au = (void *)a; - void *p = au->enc_buf; - struct ceph_x_authorize_reply *reply = p + ceph_x_encrypt_offset(); + u64 nonce_plus_one; int ret; - ret = ceph_x_decrypt(&au->session_key, &p, p + CEPHX_AU_ENC_BUF_LEN); - if (ret < 0) + if (session_key) { + memcpy(session_key, au->session_key.key, au->session_key.len); + *session_key_len = au->session_key.len; + } + + ret = decrypt_authorizer_reply(&au->session_key, &reply, + reply + reply_len, &nonce_plus_one, + con_secret, con_secret_len); + if (ret) return ret; - if (ret < sizeof(*reply)) { - pr_err("bad size %d for ceph_x_authorize_reply\n", ret); - return -EINVAL; + + if (nonce_plus_one != au->nonce + 1) { + pr_err("failed to authenticate server\n"); + return -EPERM; } - if (au->nonce + 1 != le64_to_cpu(reply->nonce_plus_one)) - ret = -EPERM; - else - ret = 0; - dout("verify_authorizer_reply nonce %llx got %llx ret %d\n", - au->nonce, le64_to_cpu(reply->nonce_plus_one), ret); - return ret; + return 0; } static void ceph_x_reset(struct ceph_auth_client *ac) @@ -785,8 +936,15 @@ static void invalidate_ticket(struct ceph_auth_client *ac, int peer_type) struct 
ceph_x_ticket_handler *th; th = get_ticket_handler(ac, peer_type); - if (!IS_ERR(th)) + if (IS_ERR(th)) + return; + + if (th->have_key) { + dout("ticket %d (%s) secret_id %llu invalidated\n", + th->service, ceph_entity_type_name(th->service), + th->secret_id); th->have_key = false; + } } static void ceph_x_invalidate_authorizer(struct ceph_auth_client *ac, @@ -911,7 +1069,6 @@ static int ceph_x_check_message_signature(struct ceph_auth_handshake *auth, } static const struct ceph_auth_client_ops ceph_x_ops = { - .name = "x", .is_authenticated = ceph_x_is_authenticated, .should_authenticate = ceph_x_should_authenticate, .build_request = ceph_x_build_request, diff --git a/net/ceph/auth_x_protocol.h b/net/ceph/auth_x_protocol.h index 24b0b74564d0..792fcb974dc3 100644 --- a/net/ceph/auth_x_protocol.h +++ b/net/ceph/auth_x_protocol.h @@ -38,7 +38,8 @@ struct ceph_x_authenticate { __u8 struct_v; __le64 client_challenge; __le64 key; - /* ticket blob */ + /* old_ticket blob */ + /* nautilus+: other_keys */ } __attribute__ ((packed)); struct ceph_x_service_ticket_request { diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c index 4e7edd707a14..271287c5ec12 100644 --- a/net/ceph/ceph_common.c +++ b/net/ceph/ceph_common.c @@ -265,6 +265,7 @@ enum { Opt_ip, Opt_crush_location, Opt_read_from_replica, + Opt_ms_mode, /* string args above */ Opt_share, Opt_crc, @@ -287,6 +288,23 @@ static const struct constant_table ceph_param_read_from_replica[] = { {} }; +enum ceph_ms_mode { + Opt_ms_mode_legacy, + Opt_ms_mode_crc, + Opt_ms_mode_secure, + Opt_ms_mode_prefer_crc, + Opt_ms_mode_prefer_secure +}; + +static const struct constant_table ceph_param_ms_mode[] = { + {"legacy", Opt_ms_mode_legacy}, + {"crc", Opt_ms_mode_crc}, + {"secure", Opt_ms_mode_secure}, + {"prefer-crc", Opt_ms_mode_prefer_crc}, + {"prefer-secure", Opt_ms_mode_prefer_secure}, + {} +}; + static const struct fs_parameter_spec ceph_parameters[] = { fsparam_flag ("abort_on_full", Opt_abort_on_full), fsparam_flag_no ("cephx_require_signatures", Opt_cephx_require_signatures), @@ -305,6 +323,8 @@ static const struct fs_parameter_spec ceph_parameters[] = { fs_param_deprecated, NULL), fsparam_enum ("read_from_replica", Opt_read_from_replica, ceph_param_read_from_replica), + fsparam_enum ("ms_mode", Opt_ms_mode, + ceph_param_ms_mode), fsparam_string ("secret", Opt_secret), fsparam_flag_no ("share", Opt_share), fsparam_flag_no ("tcp_nodelay", Opt_tcp_nodelay), @@ -333,6 +353,8 @@ struct ceph_options *ceph_alloc_options(void) opt->osd_idle_ttl = CEPH_OSD_IDLE_TTL_DEFAULT; opt->osd_request_timeout = CEPH_OSD_REQUEST_TIMEOUT_DEFAULT; opt->read_from_replica = CEPH_READ_FROM_REPLICA_DEFAULT; + opt->con_modes[0] = CEPH_CON_MODE_UNKNOWN; + opt->con_modes[1] = CEPH_CON_MODE_UNKNOWN; return opt; } EXPORT_SYMBOL(ceph_alloc_options); @@ -503,6 +525,32 @@ int ceph_parse_param(struct fs_parameter *param, struct ceph_options *opt, BUG(); } break; + case Opt_ms_mode: + switch (result.uint_32) { + case Opt_ms_mode_legacy: + opt->con_modes[0] = CEPH_CON_MODE_UNKNOWN; + opt->con_modes[1] = CEPH_CON_MODE_UNKNOWN; + break; + case Opt_ms_mode_crc: + opt->con_modes[0] = CEPH_CON_MODE_CRC; + opt->con_modes[1] = CEPH_CON_MODE_UNKNOWN; + break; + case Opt_ms_mode_secure: + opt->con_modes[0] = CEPH_CON_MODE_SECURE; + opt->con_modes[1] = CEPH_CON_MODE_UNKNOWN; + break; + case Opt_ms_mode_prefer_crc: + opt->con_modes[0] = CEPH_CON_MODE_CRC; + opt->con_modes[1] = CEPH_CON_MODE_SECURE; + break; + case Opt_ms_mode_prefer_secure: + opt->con_modes[0] = CEPH_CON_MODE_SECURE; + 
opt->con_modes[1] = CEPH_CON_MODE_CRC; + break; + default: + BUG(); + } + break; case Opt_osdtimeout: warn_plog(&log, "Ignoring osdtimeout"); @@ -616,6 +664,21 @@ int ceph_print_client_options(struct seq_file *m, struct ceph_client *client, } else if (opt->read_from_replica == CEPH_OSD_FLAG_LOCALIZE_READS) { seq_puts(m, "read_from_replica=localize,"); } + if (opt->con_modes[0] != CEPH_CON_MODE_UNKNOWN) { + if (opt->con_modes[0] == CEPH_CON_MODE_CRC && + opt->con_modes[1] == CEPH_CON_MODE_UNKNOWN) { + seq_puts(m, "ms_mode=crc,"); + } else if (opt->con_modes[0] == CEPH_CON_MODE_SECURE && + opt->con_modes[1] == CEPH_CON_MODE_UNKNOWN) { + seq_puts(m, "ms_mode=secure,"); + } else if (opt->con_modes[0] == CEPH_CON_MODE_CRC && + opt->con_modes[1] == CEPH_CON_MODE_SECURE) { + seq_puts(m, "ms_mode=prefer-crc,"); + } else if (opt->con_modes[0] == CEPH_CON_MODE_SECURE && + opt->con_modes[1] == CEPH_CON_MODE_CRC) { + seq_puts(m, "ms_mode=prefer-secure,"); + } + } if (opt->flags & CEPH_OPT_FSID) seq_printf(m, "fsid=%pU,", &opt->fsid); diff --git a/net/ceph/ceph_strings.c b/net/ceph/ceph_strings.c index 10e01494993c..355fea272120 100644 --- a/net/ceph/ceph_strings.c +++ b/net/ceph/ceph_strings.c @@ -18,6 +18,34 @@ const char *ceph_entity_type_name(int type) } EXPORT_SYMBOL(ceph_entity_type_name); +const char *ceph_auth_proto_name(int proto) +{ + switch (proto) { + case CEPH_AUTH_UNKNOWN: + return "unknown"; + case CEPH_AUTH_NONE: + return "none"; + case CEPH_AUTH_CEPHX: + return "cephx"; + default: + return "???"; + } +} + +const char *ceph_con_mode_name(int mode) +{ + switch (mode) { + case CEPH_CON_MODE_UNKNOWN: + return "unknown"; + case CEPH_CON_MODE_CRC: + return "crc"; + case CEPH_CON_MODE_SECURE: + return "secure"; + default: + return "???"; + } +} + const char *ceph_osd_op_name(int op) { switch (op) { diff --git a/net/ceph/crypto.c b/net/ceph/crypto.c index 4f75df40fb12..92d89b331645 100644 --- a/net/ceph/crypto.c +++ b/net/ceph/crypto.c @@ -96,6 +96,7 @@ int ceph_crypto_key_decode(struct ceph_crypto_key *key, void **p, void *end) key->len = ceph_decode_16(p); ceph_decode_need(p, end, key->len, bad); ret = set_secret(key, *p); + memzero_explicit(*p, key->len); *p += key->len; return ret; @@ -134,7 +135,7 @@ int ceph_crypto_key_unarmor(struct ceph_crypto_key *key, const char *inkey) void ceph_crypto_key_destroy(struct ceph_crypto_key *key) { if (key) { - kfree(key->key); + kfree_sensitive(key->key); key->key = NULL; if (key->tfm) { crypto_free_sync_skcipher(key->tfm); diff --git a/net/ceph/crypto.h b/net/ceph/crypto.h index 96ef4d860bc9..13bd526349fa 100644 --- a/net/ceph/crypto.h +++ b/net/ceph/crypto.h @@ -5,6 +5,9 @@ #include <linux/ceph/types.h> #include <linux/ceph/buffer.h> +#define CEPH_KEY_LEN 16 +#define CEPH_MAX_CON_SECRET_LEN 64 + /* * cryptographic secret */ diff --git a/net/ceph/decode.c b/net/ceph/decode.c index eea529595a7a..b44f7651be04 100644 --- a/net/ceph/decode.c +++ b/net/ceph/decode.c @@ -1,4 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 +#include <linux/ceph/ceph_debug.h> + +#include <linux/inet.h> #include <linux/ceph/decode.h> @@ -82,3 +85,101 @@ bad: } EXPORT_SYMBOL(ceph_decode_entity_addr); +/* + * Return addr of desired type (MSGR2 or LEGACY) or error. + * Make sure there is only one match. + * + * Assume encoding with MSG_ADDR2. + */ +int ceph_decode_entity_addrvec(void **p, void *end, bool msgr2, + struct ceph_entity_addr *addr) +{ + __le32 my_type = msgr2 ? 
CEPH_ENTITY_ADDR_TYPE_MSGR2 : + CEPH_ENTITY_ADDR_TYPE_LEGACY; + struct ceph_entity_addr tmp_addr; + int addr_cnt; + bool found; + u8 marker; + int ret; + int i; + + ceph_decode_8_safe(p, end, marker, e_inval); + if (marker != 2) { + pr_err("bad addrvec marker %d\n", marker); + return -EINVAL; + } + + ceph_decode_32_safe(p, end, addr_cnt, e_inval); + + found = false; + for (i = 0; i < addr_cnt; i++) { + ret = ceph_decode_entity_addr(p, end, &tmp_addr); + if (ret) + return ret; + + if (tmp_addr.type == my_type) { + if (found) { + pr_err("another match of type %d in addrvec\n", + le32_to_cpu(my_type)); + return -EINVAL; + } + + memcpy(addr, &tmp_addr, sizeof(*addr)); + found = true; + } + } + if (!found && addr_cnt != 0) { + pr_err("no match of type %d in addrvec\n", + le32_to_cpu(my_type)); + return -ENOENT; + } + + return 0; + +e_inval: + return -EINVAL; +} +EXPORT_SYMBOL(ceph_decode_entity_addrvec); + +static int get_sockaddr_encoding_len(sa_family_t family) +{ + union { + struct sockaddr sa; + struct sockaddr_in sin; + struct sockaddr_in6 sin6; + } u; + + switch (family) { + case AF_INET: + return sizeof(u.sin); + case AF_INET6: + return sizeof(u.sin6); + default: + return sizeof(u); + } +} + +int ceph_entity_addr_encoding_len(const struct ceph_entity_addr *addr) +{ + sa_family_t family = get_unaligned(&addr->in_addr.ss_family); + int addr_len = get_sockaddr_encoding_len(family); + + return 1 + CEPH_ENCODING_START_BLK_LEN + 4 + 4 + 4 + addr_len; +} + +void ceph_encode_entity_addr(void **p, const struct ceph_entity_addr *addr) +{ + sa_family_t family = get_unaligned(&addr->in_addr.ss_family); + int addr_len = get_sockaddr_encoding_len(family); + + ceph_encode_8(p, 1); /* marker */ + ceph_start_encoding(p, 1, 1, sizeof(addr->type) + + sizeof(addr->nonce) + + sizeof(u32) + addr_len); + ceph_encode_copy(p, &addr->type, sizeof(addr->type)); + ceph_encode_copy(p, &addr->nonce, sizeof(addr->nonce)); + + ceph_encode_32(p, addr_len); + ceph_encode_16(p, family); + ceph_encode_copy(p, addr->in_addr.__data, addr_len - sizeof(family)); +} diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index af0f1fa24937..57d043b382ed 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -82,71 +82,51 @@ #define CON_SOCK_STATE_CONNECTED 3 /* -> CLOSING or -> CLOSED */ #define CON_SOCK_STATE_CLOSING 4 /* -> CLOSED */ -/* - * connection states - */ -#define CON_STATE_CLOSED 1 /* -> PREOPEN */ -#define CON_STATE_PREOPEN 2 /* -> CONNECTING, CLOSED */ -#define CON_STATE_CONNECTING 3 /* -> NEGOTIATING, CLOSED */ -#define CON_STATE_NEGOTIATING 4 /* -> OPEN, CLOSED */ -#define CON_STATE_OPEN 5 /* -> STANDBY, CLOSED */ -#define CON_STATE_STANDBY 6 /* -> PREOPEN, CLOSED */ - -/* - * ceph_connection flag bits - */ -#define CON_FLAG_LOSSYTX 0 /* we can close channel or drop - * messages on errors */ -#define CON_FLAG_KEEPALIVE_PENDING 1 /* we need to send a keepalive */ -#define CON_FLAG_WRITE_PENDING 2 /* we have data ready to send */ -#define CON_FLAG_SOCK_CLOSED 3 /* socket state changed to closed */ -#define CON_FLAG_BACKOFF 4 /* need to retry queuing delayed work */ - static bool con_flag_valid(unsigned long con_flag) { switch (con_flag) { - case CON_FLAG_LOSSYTX: - case CON_FLAG_KEEPALIVE_PENDING: - case CON_FLAG_WRITE_PENDING: - case CON_FLAG_SOCK_CLOSED: - case CON_FLAG_BACKOFF: + case CEPH_CON_F_LOSSYTX: + case CEPH_CON_F_KEEPALIVE_PENDING: + case CEPH_CON_F_WRITE_PENDING: + case CEPH_CON_F_SOCK_CLOSED: + case CEPH_CON_F_BACKOFF: return true; default: return false; } } -static void 
con_flag_clear(struct ceph_connection *con, unsigned long con_flag) +void ceph_con_flag_clear(struct ceph_connection *con, unsigned long con_flag) { BUG_ON(!con_flag_valid(con_flag)); clear_bit(con_flag, &con->flags); } -static void con_flag_set(struct ceph_connection *con, unsigned long con_flag) +void ceph_con_flag_set(struct ceph_connection *con, unsigned long con_flag) { BUG_ON(!con_flag_valid(con_flag)); set_bit(con_flag, &con->flags); } -static bool con_flag_test(struct ceph_connection *con, unsigned long con_flag) +bool ceph_con_flag_test(struct ceph_connection *con, unsigned long con_flag) { BUG_ON(!con_flag_valid(con_flag)); return test_bit(con_flag, &con->flags); } -static bool con_flag_test_and_clear(struct ceph_connection *con, - unsigned long con_flag) +bool ceph_con_flag_test_and_clear(struct ceph_connection *con, + unsigned long con_flag) { BUG_ON(!con_flag_valid(con_flag)); return test_and_clear_bit(con_flag, &con->flags); } -static bool con_flag_test_and_set(struct ceph_connection *con, - unsigned long con_flag) +bool ceph_con_flag_test_and_set(struct ceph_connection *con, + unsigned long con_flag) { BUG_ON(!con_flag_valid(con_flag)); @@ -157,12 +137,6 @@ static bool con_flag_test_and_set(struct ceph_connection *con, static struct kmem_cache *ceph_msg_cache; -/* static tag bytes (protocol control messages) */ -static char tag_msg = CEPH_MSGR_TAG_MSG; -static char tag_ack = CEPH_MSGR_TAG_ACK; -static char tag_keepalive = CEPH_MSGR_TAG_KEEPALIVE; -static char tag_keepalive2 = CEPH_MSGR_TAG_KEEPALIVE2; - #ifdef CONFIG_LOCKDEP static struct lock_class_key socket_class; #endif @@ -184,7 +158,7 @@ static void con_fault(struct ceph_connection *con); static char addr_str[ADDR_STR_COUNT][MAX_ADDR_STR_LEN]; static atomic_t addr_str_seq = ATOMIC_INIT(0); -static struct page *zero_page; /* used in certain error cases */ +struct page *ceph_zero_page; /* used in certain error cases */ const char *ceph_pr_addr(const struct ceph_entity_addr *addr) { @@ -219,10 +193,13 @@ const char *ceph_pr_addr(const struct ceph_entity_addr *addr) } EXPORT_SYMBOL(ceph_pr_addr); -static void encode_my_addr(struct ceph_messenger *msgr) +void ceph_encode_my_addr(struct ceph_messenger *msgr) { - memcpy(&msgr->my_enc_addr, &msgr->inst.addr, sizeof(msgr->my_enc_addr)); - ceph_encode_banner_addr(&msgr->my_enc_addr); + if (!ceph_msgr2(from_msgr(msgr))) { + memcpy(&msgr->my_enc_addr, &msgr->inst.addr, + sizeof(msgr->my_enc_addr)); + ceph_encode_banner_addr(&msgr->my_enc_addr); + } } /* @@ -254,9 +231,9 @@ static void _ceph_msgr_exit(void) ceph_msgr_wq = NULL; } - BUG_ON(zero_page == NULL); - put_page(zero_page); - zero_page = NULL; + BUG_ON(!ceph_zero_page); + put_page(ceph_zero_page); + ceph_zero_page = NULL; ceph_msgr_slab_exit(); } @@ -266,9 +243,9 @@ int __init ceph_msgr_init(void) if (ceph_msgr_slab_init()) return -ENOMEM; - BUG_ON(zero_page != NULL); - zero_page = ZERO_PAGE(0); - get_page(zero_page); + BUG_ON(ceph_zero_page); + ceph_zero_page = ZERO_PAGE(0); + get_page(ceph_zero_page); /* * The number of active work items is limited by the number of @@ -372,7 +349,7 @@ static void ceph_sock_data_ready(struct sock *sk) } if (sk->sk_state != TCP_CLOSE_WAIT) { - dout("%s on %p state = %lu, queueing work\n", __func__, + dout("%s %p state = %d, queueing work\n", __func__, con, con->state); queue_con(con); } @@ -390,7 +367,7 @@ static void ceph_sock_write_space(struct sock *sk) * buffer. See net/ipv4/tcp_input.c:tcp_check_space() * and net/core/stream.c:sk_stream_write_space(). 
*/ - if (con_flag_test(con, CON_FLAG_WRITE_PENDING)) { + if (ceph_con_flag_test(con, CEPH_CON_F_WRITE_PENDING)) { if (sk_stream_is_writeable(sk)) { dout("%s %p queueing write work\n", __func__, con); clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags); @@ -406,7 +383,7 @@ static void ceph_sock_state_change(struct sock *sk) { struct ceph_connection *con = sk->sk_user_data; - dout("%s %p state = %lu sk_state = %u\n", __func__, + dout("%s %p state = %d sk_state = %u\n", __func__, con, con->state, sk->sk_state); switch (sk->sk_state) { @@ -416,7 +393,7 @@ static void ceph_sock_state_change(struct sock *sk) case TCP_CLOSE_WAIT: dout("%s TCP_CLOSE_WAIT\n", __func__); con_sock_state_closing(con); - con_flag_set(con, CON_FLAG_SOCK_CLOSED); + ceph_con_flag_set(con, CEPH_CON_F_SOCK_CLOSED); queue_con(con); break; case TCP_ESTABLISHED: @@ -450,13 +427,15 @@ static void set_sock_callbacks(struct socket *sock, /* * initiate connection to a remote socket. */ -static int ceph_tcp_connect(struct ceph_connection *con) +int ceph_tcp_connect(struct ceph_connection *con) { struct sockaddr_storage ss = con->peer_addr.in_addr; /* align */ struct socket *sock; unsigned int noio_flag; int ret; + dout("%s con %p peer_addr %s\n", __func__, con, + ceph_pr_addr(&con->peer_addr)); BUG_ON(con->sock); /* sock_create_kern() allocates with GFP_KERNEL */ @@ -474,8 +453,6 @@ static int ceph_tcp_connect(struct ceph_connection *con) set_sock_callbacks(sock, con); - dout("connect %s\n", ceph_pr_addr(&con->peer_addr)); - con_sock_state_connecting(con); ret = sock->ops->connect(sock, (struct sockaddr *)&ss, sizeof(ss), O_NONBLOCK); @@ -498,103 +475,13 @@ static int ceph_tcp_connect(struct ceph_connection *con) } /* - * If @buf is NULL, discard up to @len bytes. - */ -static int ceph_tcp_recvmsg(struct socket *sock, void *buf, size_t len) -{ - struct kvec iov = {buf, len}; - struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL }; - int r; - - if (!buf) - msg.msg_flags |= MSG_TRUNC; - - iov_iter_kvec(&msg.msg_iter, READ, &iov, 1, len); - r = sock_recvmsg(sock, &msg, msg.msg_flags); - if (r == -EAGAIN) - r = 0; - return r; -} - -static int ceph_tcp_recvpage(struct socket *sock, struct page *page, - int page_offset, size_t length) -{ - struct bio_vec bvec = { - .bv_page = page, - .bv_offset = page_offset, - .bv_len = length - }; - struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL }; - int r; - - BUG_ON(page_offset + length > PAGE_SIZE); - iov_iter_bvec(&msg.msg_iter, READ, &bvec, 1, length); - r = sock_recvmsg(sock, &msg, msg.msg_flags); - if (r == -EAGAIN) - r = 0; - return r; -} - -/* - * write something. @more is true if caller will be sending more data - * shortly. 
- */ -static int ceph_tcp_sendmsg(struct socket *sock, struct kvec *iov, - size_t kvlen, size_t len, bool more) -{ - struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL }; - int r; - - if (more) - msg.msg_flags |= MSG_MORE; - else - msg.msg_flags |= MSG_EOR; /* superfluous, but what the hell */ - - r = kernel_sendmsg(sock, &msg, iov, kvlen, len); - if (r == -EAGAIN) - r = 0; - return r; -} - -/* - * @more: either or both of MSG_MORE and MSG_SENDPAGE_NOTLAST - */ -static int ceph_tcp_sendpage(struct socket *sock, struct page *page, - int offset, size_t size, int more) -{ - ssize_t (*sendpage)(struct socket *sock, struct page *page, - int offset, size_t size, int flags); - int flags = MSG_DONTWAIT | MSG_NOSIGNAL | more; - int ret; - - /* - * sendpage cannot properly handle pages with page_count == 0, - * we need to fall back to sendmsg if that's the case. - * - * Same goes for slab pages: skb_can_coalesce() allows - * coalescing neighboring slab objects into a single frag which - * triggers one of hardened usercopy checks. - */ - if (sendpage_ok(page)) - sendpage = sock->ops->sendpage; - else - sendpage = sock_no_sendpage; - - ret = sendpage(sock, page, offset, size, flags); - if (ret == -EAGAIN) - ret = 0; - - return ret; -} - -/* * Shutdown/close the socket for the given connection. */ -static int con_close_socket(struct ceph_connection *con) +int ceph_con_close_socket(struct ceph_connection *con) { int rc = 0; - dout("con_close_socket on %p sock %p\n", con, con->sock); + dout("%s con %p sock %p\n", __func__, con, con->sock); if (con->sock) { rc = con->sock->ops->shutdown(con->sock, SHUT_RDWR); sock_release(con->sock); @@ -607,12 +494,34 @@ static int con_close_socket(struct ceph_connection *con) * received a socket close event before we had the chance to * shut the socket down. */ - con_flag_clear(con, CON_FLAG_SOCK_CLOSED); + ceph_con_flag_clear(con, CEPH_CON_F_SOCK_CLOSED); con_sock_state_closed(con); return rc; } +static void ceph_con_reset_protocol(struct ceph_connection *con) +{ + dout("%s con %p\n", __func__, con); + + ceph_con_close_socket(con); + if (con->in_msg) { + WARN_ON(con->in_msg->con != con); + ceph_msg_put(con->in_msg); + con->in_msg = NULL; + } + if (con->out_msg) { + WARN_ON(con->out_msg->con != con); + ceph_msg_put(con->out_msg); + con->out_msg = NULL; + } + + if (ceph_msgr2(from_msgr(con->msgr))) + ceph_con_v2_reset_protocol(con); + else + ceph_con_v1_reset_protocol(con); +} + /* * Reset a connection. Discard all incoming and outgoing messages * and clear *_seq state. 
@@ -623,6 +532,7 @@ static void ceph_msg_remove(struct ceph_msg *msg) ceph_msg_put(msg); } + static void ceph_msg_remove_list(struct list_head *head) { while (!list_empty(head)) { @@ -632,31 +542,22 @@ static void ceph_msg_remove_list(struct list_head *head) } } -static void reset_connection(struct ceph_connection *con) +void ceph_con_reset_session(struct ceph_connection *con) { - /* reset connection, out_queue, msg_ and connect_seq */ - /* discard existing out_queue and msg_seq */ - dout("reset_connection %p\n", con); + dout("%s con %p\n", __func__, con); + + WARN_ON(con->in_msg); + WARN_ON(con->out_msg); ceph_msg_remove_list(&con->out_queue); ceph_msg_remove_list(&con->out_sent); - - if (con->in_msg) { - BUG_ON(con->in_msg->con != con); - ceph_msg_put(con->in_msg); - con->in_msg = NULL; - } - - con->connect_seq = 0; con->out_seq = 0; - if (con->out_msg) { - BUG_ON(con->out_msg->con != con); - ceph_msg_put(con->out_msg); - con->out_msg = NULL; - } con->in_seq = 0; con->in_seq_acked = 0; - con->out_skip = 0; + if (ceph_msgr2(from_msgr(con->msgr))) + ceph_con_v2_reset_session(con); + else + ceph_con_v1_reset_session(con); } /* @@ -666,17 +567,17 @@ void ceph_con_close(struct ceph_connection *con) { mutex_lock(&con->mutex); dout("con_close %p peer %s\n", con, ceph_pr_addr(&con->peer_addr)); - con->state = CON_STATE_CLOSED; + con->state = CEPH_CON_S_CLOSED; - con_flag_clear(con, CON_FLAG_LOSSYTX); /* so we retry next connect */ - con_flag_clear(con, CON_FLAG_KEEPALIVE_PENDING); - con_flag_clear(con, CON_FLAG_WRITE_PENDING); - con_flag_clear(con, CON_FLAG_BACKOFF); + ceph_con_flag_clear(con, CEPH_CON_F_LOSSYTX); /* so we retry next + connect */ + ceph_con_flag_clear(con, CEPH_CON_F_KEEPALIVE_PENDING); + ceph_con_flag_clear(con, CEPH_CON_F_WRITE_PENDING); + ceph_con_flag_clear(con, CEPH_CON_F_BACKOFF); - reset_connection(con); - con->peer_global_seq = 0; + ceph_con_reset_protocol(con); + ceph_con_reset_session(con); cancel_con(con); - con_close_socket(con); mutex_unlock(&con->mutex); } EXPORT_SYMBOL(ceph_con_close); @@ -691,8 +592,8 @@ void ceph_con_open(struct ceph_connection *con, mutex_lock(&con->mutex); dout("con_open %p %s\n", con, ceph_pr_addr(addr)); - WARN_ON(con->state != CON_STATE_CLOSED); - con->state = CON_STATE_PREOPEN; + WARN_ON(con->state != CEPH_CON_S_CLOSED); + con->state = CEPH_CON_S_PREOPEN; con->peer_name.type = (__u8) entity_type; con->peer_name.num = cpu_to_le64(entity_num); @@ -709,7 +610,10 @@ EXPORT_SYMBOL(ceph_con_open); */ bool ceph_con_opened(struct ceph_connection *con) { - return con->connect_seq > 0; + if (ceph_msgr2(from_msgr(con->msgr))) + return ceph_con_v2_opened(con); + + return ceph_con_v1_opened(con); } /* @@ -732,16 +636,15 @@ void ceph_con_init(struct ceph_connection *con, void *private, INIT_LIST_HEAD(&con->out_sent); INIT_DELAYED_WORK(&con->work, ceph_con_workfn); - con->state = CON_STATE_CLOSED; + con->state = CEPH_CON_S_CLOSED; } EXPORT_SYMBOL(ceph_con_init); - /* * We maintain a global counter to order connection attempts. Get * a unique seq greater than @gt. 
*/ -static u32 get_global_seq(struct ceph_messenger *msgr, u32 gt) +u32 ceph_get_global_seq(struct ceph_messenger *msgr, u32 gt) { u32 ret; @@ -753,48 +656,53 @@ static u32 get_global_seq(struct ceph_messenger *msgr, u32 gt) return ret; } -static void con_out_kvec_reset(struct ceph_connection *con) -{ - BUG_ON(con->out_skip); - - con->out_kvec_left = 0; - con->out_kvec_bytes = 0; - con->out_kvec_cur = &con->out_kvec[0]; -} - -static void con_out_kvec_add(struct ceph_connection *con, - size_t size, void *data) +/* + * Discard messages that have been acked by the server. + */ +void ceph_con_discard_sent(struct ceph_connection *con, u64 ack_seq) { - int index = con->out_kvec_left; + struct ceph_msg *msg; + u64 seq; - BUG_ON(con->out_skip); - BUG_ON(index >= ARRAY_SIZE(con->out_kvec)); + dout("%s con %p ack_seq %llu\n", __func__, con, ack_seq); + while (!list_empty(&con->out_sent)) { + msg = list_first_entry(&con->out_sent, struct ceph_msg, + list_head); + WARN_ON(msg->needs_out_seq); + seq = le64_to_cpu(msg->hdr.seq); + if (seq > ack_seq) + break; - con->out_kvec[index].iov_len = size; - con->out_kvec[index].iov_base = data; - con->out_kvec_left++; - con->out_kvec_bytes += size; + dout("%s con %p discarding msg %p seq %llu\n", __func__, con, + msg, seq); + ceph_msg_remove(msg); + } } /* - * Chop off a kvec from the end. Return residual number of bytes for - * that kvec, i.e. how many bytes would have been written if the kvec - * hadn't been nuked. + * Discard messages that have been requeued in con_fault(), up to + * reconnect_seq. This avoids gratuitously resending messages that + * the server had received and handled prior to reconnect. */ -static int con_out_kvec_skip(struct ceph_connection *con) +void ceph_con_discard_requeued(struct ceph_connection *con, u64 reconnect_seq) { - int off = con->out_kvec_cur - con->out_kvec; - int skip = 0; + struct ceph_msg *msg; + u64 seq; - if (con->out_kvec_bytes > 0) { - skip = con->out_kvec[off + con->out_kvec_left - 1].iov_len; - BUG_ON(con->out_kvec_bytes < skip); - BUG_ON(!con->out_kvec_left); - con->out_kvec_bytes -= skip; - con->out_kvec_left--; - } + dout("%s con %p reconnect_seq %llu\n", __func__, con, reconnect_seq); + while (!list_empty(&con->out_queue)) { + msg = list_first_entry(&con->out_queue, struct ceph_msg, + list_head); + if (msg->needs_out_seq) + break; + seq = le64_to_cpu(msg->hdr.seq); + if (seq > reconnect_seq) + break; - return skip; + dout("%s con %p discarding msg %p seq %llu\n", __func__, con, + msg, seq); + ceph_msg_remove(msg); + } } #ifdef CONFIG_BLOCK @@ -1113,10 +1021,9 @@ static void __ceph_msg_data_cursor_init(struct ceph_msg_data_cursor *cursor) cursor->need_crc = true; } -static void ceph_msg_data_cursor_init(struct ceph_msg *msg, size_t length) +void ceph_msg_data_cursor_init(struct ceph_msg_data_cursor *cursor, + struct ceph_msg *msg, size_t length) { - struct ceph_msg_data_cursor *cursor = &msg->cursor; - BUG_ON(!length); BUG_ON(length > msg->data_length); BUG_ON(!msg->num_data_items); @@ -1132,9 +1039,9 @@ static void ceph_msg_data_cursor_init(struct ceph_msg *msg, size_t length) * data item, and supply the page offset and length of that piece. * Indicate whether this is the last piece in this data item. 
*/ -static struct page *ceph_msg_data_next(struct ceph_msg_data_cursor *cursor, - size_t *page_offset, size_t *length, - bool *last_piece) +struct page *ceph_msg_data_next(struct ceph_msg_data_cursor *cursor, + size_t *page_offset, size_t *length, + bool *last_piece) { struct page *page; @@ -1173,8 +1080,7 @@ static struct page *ceph_msg_data_next(struct ceph_msg_data_cursor *cursor, * Returns true if the result moves the cursor on to the next piece * of the data item. */ -static void ceph_msg_data_advance(struct ceph_msg_data_cursor *cursor, - size_t bytes) +void ceph_msg_data_advance(struct ceph_msg_data_cursor *cursor, size_t bytes) { bool new_piece; @@ -1210,328 +1116,8 @@ static void ceph_msg_data_advance(struct ceph_msg_data_cursor *cursor, cursor->need_crc = new_piece; } -static size_t sizeof_footer(struct ceph_connection *con) -{ - return (con->peer_features & CEPH_FEATURE_MSG_AUTH) ? - sizeof(struct ceph_msg_footer) : - sizeof(struct ceph_msg_footer_old); -} - -static void prepare_message_data(struct ceph_msg *msg, u32 data_len) -{ - /* Initialize data cursor */ - - ceph_msg_data_cursor_init(msg, (size_t)data_len); -} - -/* - * Prepare footer for currently outgoing message, and finish things - * off. Assumes out_kvec* are already valid.. we just add on to the end. - */ -static void prepare_write_message_footer(struct ceph_connection *con) -{ - struct ceph_msg *m = con->out_msg; - - m->footer.flags |= CEPH_MSG_FOOTER_COMPLETE; - - dout("prepare_write_message_footer %p\n", con); - con_out_kvec_add(con, sizeof_footer(con), &m->footer); - if (con->peer_features & CEPH_FEATURE_MSG_AUTH) { - if (con->ops->sign_message) - con->ops->sign_message(m); - else - m->footer.sig = 0; - } else { - m->old_footer.flags = m->footer.flags; - } - con->out_more = m->more_to_follow; - con->out_msg_done = true; -} - -/* - * Prepare headers for the next outgoing message. - */ -static void prepare_write_message(struct ceph_connection *con) -{ - struct ceph_msg *m; - u32 crc; - - con_out_kvec_reset(con); - con->out_msg_done = false; - - /* Sneak an ack in there first? If we can get it into the same - * TCP packet that's a good thing. */ - if (con->in_seq > con->in_seq_acked) { - con->in_seq_acked = con->in_seq; - con_out_kvec_add(con, sizeof (tag_ack), &tag_ack); - con->out_temp_ack = cpu_to_le64(con->in_seq_acked); - con_out_kvec_add(con, sizeof (con->out_temp_ack), - &con->out_temp_ack); - } - - BUG_ON(list_empty(&con->out_queue)); - m = list_first_entry(&con->out_queue, struct ceph_msg, list_head); - con->out_msg = m; - BUG_ON(m->con != con); - - /* put message on sent list */ - ceph_msg_get(m); - list_move_tail(&m->list_head, &con->out_sent); - - /* - * only assign outgoing seq # if we haven't sent this message - * yet. if it is requeued, resend with it's original seq. 
- */ - if (m->needs_out_seq) { - m->hdr.seq = cpu_to_le64(++con->out_seq); - m->needs_out_seq = false; - - if (con->ops->reencode_message) - con->ops->reencode_message(m); - } - - dout("prepare_write_message %p seq %lld type %d len %d+%d+%zd\n", - m, con->out_seq, le16_to_cpu(m->hdr.type), - le32_to_cpu(m->hdr.front_len), le32_to_cpu(m->hdr.middle_len), - m->data_length); - WARN_ON(m->front.iov_len != le32_to_cpu(m->hdr.front_len)); - WARN_ON(m->data_length != le32_to_cpu(m->hdr.data_len)); - - /* tag + hdr + front + middle */ - con_out_kvec_add(con, sizeof (tag_msg), &tag_msg); - con_out_kvec_add(con, sizeof(con->out_hdr), &con->out_hdr); - con_out_kvec_add(con, m->front.iov_len, m->front.iov_base); - - if (m->middle) - con_out_kvec_add(con, m->middle->vec.iov_len, - m->middle->vec.iov_base); - - /* fill in hdr crc and finalize hdr */ - crc = crc32c(0, &m->hdr, offsetof(struct ceph_msg_header, crc)); - con->out_msg->hdr.crc = cpu_to_le32(crc); - memcpy(&con->out_hdr, &con->out_msg->hdr, sizeof(con->out_hdr)); - - /* fill in front and middle crc, footer */ - crc = crc32c(0, m->front.iov_base, m->front.iov_len); - con->out_msg->footer.front_crc = cpu_to_le32(crc); - if (m->middle) { - crc = crc32c(0, m->middle->vec.iov_base, - m->middle->vec.iov_len); - con->out_msg->footer.middle_crc = cpu_to_le32(crc); - } else - con->out_msg->footer.middle_crc = 0; - dout("%s front_crc %u middle_crc %u\n", __func__, - le32_to_cpu(con->out_msg->footer.front_crc), - le32_to_cpu(con->out_msg->footer.middle_crc)); - con->out_msg->footer.flags = 0; - - /* is there a data payload? */ - con->out_msg->footer.data_crc = 0; - if (m->data_length) { - prepare_message_data(con->out_msg, m->data_length); - con->out_more = 1; /* data + footer will follow */ - } else { - /* no, queue up footer too and be done */ - prepare_write_message_footer(con); - } - - con_flag_set(con, CON_FLAG_WRITE_PENDING); -} - -/* - * Prepare an ack. - */ -static void prepare_write_ack(struct ceph_connection *con) -{ - dout("prepare_write_ack %p %llu -> %llu\n", con, - con->in_seq_acked, con->in_seq); - con->in_seq_acked = con->in_seq; - - con_out_kvec_reset(con); - - con_out_kvec_add(con, sizeof (tag_ack), &tag_ack); - - con->out_temp_ack = cpu_to_le64(con->in_seq_acked); - con_out_kvec_add(con, sizeof (con->out_temp_ack), - &con->out_temp_ack); - - con->out_more = 1; /* more will follow.. eventually.. */ - con_flag_set(con, CON_FLAG_WRITE_PENDING); -} - -/* - * Prepare to share the seq during handshake - */ -static void prepare_write_seq(struct ceph_connection *con) -{ - dout("prepare_write_seq %p %llu -> %llu\n", con, - con->in_seq_acked, con->in_seq); - con->in_seq_acked = con->in_seq; - - con_out_kvec_reset(con); - - con->out_temp_ack = cpu_to_le64(con->in_seq_acked); - con_out_kvec_add(con, sizeof (con->out_temp_ack), - &con->out_temp_ack); - - con_flag_set(con, CON_FLAG_WRITE_PENDING); -} - -/* - * Prepare to write keepalive byte. 
- */ -static void prepare_write_keepalive(struct ceph_connection *con) -{ - dout("prepare_write_keepalive %p\n", con); - con_out_kvec_reset(con); - if (con->peer_features & CEPH_FEATURE_MSGR_KEEPALIVE2) { - struct timespec64 now; - - ktime_get_real_ts64(&now); - con_out_kvec_add(con, sizeof(tag_keepalive2), &tag_keepalive2); - ceph_encode_timespec64(&con->out_temp_keepalive2, &now); - con_out_kvec_add(con, sizeof(con->out_temp_keepalive2), - &con->out_temp_keepalive2); - } else { - con_out_kvec_add(con, sizeof(tag_keepalive), &tag_keepalive); - } - con_flag_set(con, CON_FLAG_WRITE_PENDING); -} - -/* - * Connection negotiation. - */ - -static int get_connect_authorizer(struct ceph_connection *con) -{ - struct ceph_auth_handshake *auth; - int auth_proto; - - if (!con->ops->get_authorizer) { - con->auth = NULL; - con->out_connect.authorizer_protocol = CEPH_AUTH_UNKNOWN; - con->out_connect.authorizer_len = 0; - return 0; - } - - auth = con->ops->get_authorizer(con, &auth_proto, con->auth_retry); - if (IS_ERR(auth)) - return PTR_ERR(auth); - - con->auth = auth; - con->out_connect.authorizer_protocol = cpu_to_le32(auth_proto); - con->out_connect.authorizer_len = cpu_to_le32(auth->authorizer_buf_len); - return 0; -} - -/* - * We connected to a peer and are saying hello. - */ -static void prepare_write_banner(struct ceph_connection *con) -{ - con_out_kvec_add(con, strlen(CEPH_BANNER), CEPH_BANNER); - con_out_kvec_add(con, sizeof (con->msgr->my_enc_addr), - &con->msgr->my_enc_addr); - - con->out_more = 0; - con_flag_set(con, CON_FLAG_WRITE_PENDING); -} - -static void __prepare_write_connect(struct ceph_connection *con) -{ - con_out_kvec_add(con, sizeof(con->out_connect), &con->out_connect); - if (con->auth) - con_out_kvec_add(con, con->auth->authorizer_buf_len, - con->auth->authorizer_buf); - - con->out_more = 0; - con_flag_set(con, CON_FLAG_WRITE_PENDING); -} - -static int prepare_write_connect(struct ceph_connection *con) -{ - unsigned int global_seq = get_global_seq(con->msgr, 0); - int proto; - int ret; - - switch (con->peer_name.type) { - case CEPH_ENTITY_TYPE_MON: - proto = CEPH_MONC_PROTOCOL; - break; - case CEPH_ENTITY_TYPE_OSD: - proto = CEPH_OSDC_PROTOCOL; - break; - case CEPH_ENTITY_TYPE_MDS: - proto = CEPH_MDSC_PROTOCOL; - break; - default: - BUG(); - } - - dout("prepare_write_connect %p cseq=%d gseq=%d proto=%d\n", con, - con->connect_seq, global_seq, proto); - - con->out_connect.features = - cpu_to_le64(from_msgr(con->msgr)->supported_features); - con->out_connect.host_type = cpu_to_le32(CEPH_ENTITY_TYPE_CLIENT); - con->out_connect.connect_seq = cpu_to_le32(con->connect_seq); - con->out_connect.global_seq = cpu_to_le32(global_seq); - con->out_connect.protocol_version = cpu_to_le32(proto); - con->out_connect.flags = 0; - - ret = get_connect_authorizer(con); - if (ret) - return ret; - - __prepare_write_connect(con); - return 0; -} - -/* - * write as much of pending kvecs to the socket as we can. 
- * 1 -> done - * 0 -> socket full, but more to do - * <0 -> error - */ -static int write_partial_kvec(struct ceph_connection *con) -{ - int ret; - - dout("write_partial_kvec %p %d left\n", con, con->out_kvec_bytes); - while (con->out_kvec_bytes > 0) { - ret = ceph_tcp_sendmsg(con->sock, con->out_kvec_cur, - con->out_kvec_left, con->out_kvec_bytes, - con->out_more); - if (ret <= 0) - goto out; - con->out_kvec_bytes -= ret; - if (con->out_kvec_bytes == 0) - break; /* done */ - - /* account for full iov entries consumed */ - while (ret >= con->out_kvec_cur->iov_len) { - BUG_ON(!con->out_kvec_left); - ret -= con->out_kvec_cur->iov_len; - con->out_kvec_cur++; - con->out_kvec_left--; - } - /* and for a partially-consumed entry */ - if (ret) { - con->out_kvec_cur->iov_len -= ret; - con->out_kvec_cur->iov_base += ret; - } - } - con->out_kvec_left = 0; - ret = 1; -out: - dout("write_partial_kvec %p %d left in %d kvecs ret = %d\n", con, - con->out_kvec_bytes, con->out_kvec_left, ret); - return ret; /* done! */ -} - -static u32 ceph_crc32c_page(u32 crc, struct page *page, - unsigned int page_offset, - unsigned int length) +u32 ceph_crc32c_page(u32 crc, struct page *page, unsigned int page_offset, + unsigned int length) { char *kaddr; @@ -1542,257 +1128,8 @@ static u32 ceph_crc32c_page(u32 crc, struct page *page, return crc; } -/* - * Write as much message data payload as we can. If we finish, queue - * up the footer. - * 1 -> done, footer is now queued in out_kvec[]. - * 0 -> socket full, but more to do - * <0 -> error - */ -static int write_partial_message_data(struct ceph_connection *con) -{ - struct ceph_msg *msg = con->out_msg; - struct ceph_msg_data_cursor *cursor = &msg->cursor; - bool do_datacrc = !ceph_test_opt(from_msgr(con->msgr), NOCRC); - int more = MSG_MORE | MSG_SENDPAGE_NOTLAST; - u32 crc; - - dout("%s %p msg %p\n", __func__, con, msg); - - if (!msg->num_data_items) - return -EINVAL; - - /* - * Iterate through each page that contains data to be - * written, and send as much as possible for each. - * - * If we are calculating the data crc (the default), we will - * need to map the page. If we have no pages, they have - * been revoked, so use the zero page. - */ - crc = do_datacrc ? 
le32_to_cpu(msg->footer.data_crc) : 0; - while (cursor->total_resid) { - struct page *page; - size_t page_offset; - size_t length; - int ret; - - if (!cursor->resid) { - ceph_msg_data_advance(cursor, 0); - continue; - } - - page = ceph_msg_data_next(cursor, &page_offset, &length, NULL); - if (length == cursor->total_resid) - more = MSG_MORE; - ret = ceph_tcp_sendpage(con->sock, page, page_offset, length, - more); - if (ret <= 0) { - if (do_datacrc) - msg->footer.data_crc = cpu_to_le32(crc); - - return ret; - } - if (do_datacrc && cursor->need_crc) - crc = ceph_crc32c_page(crc, page, page_offset, length); - ceph_msg_data_advance(cursor, (size_t)ret); - } - - dout("%s %p msg %p done\n", __func__, con, msg); - - /* prepare and queue up footer, too */ - if (do_datacrc) - msg->footer.data_crc = cpu_to_le32(crc); - else - msg->footer.flags |= CEPH_MSG_FOOTER_NOCRC; - con_out_kvec_reset(con); - prepare_write_message_footer(con); - - return 1; /* must return > 0 to indicate success */ -} - -/* - * write some zeros - */ -static int write_partial_skip(struct ceph_connection *con) -{ - int more = MSG_MORE | MSG_SENDPAGE_NOTLAST; - int ret; - - dout("%s %p %d left\n", __func__, con, con->out_skip); - while (con->out_skip > 0) { - size_t size = min(con->out_skip, (int) PAGE_SIZE); - - if (size == con->out_skip) - more = MSG_MORE; - ret = ceph_tcp_sendpage(con->sock, zero_page, 0, size, more); - if (ret <= 0) - goto out; - con->out_skip -= ret; - } - ret = 1; -out: - return ret; -} - -/* - * Prepare to read connection handshake, or an ack. - */ -static void prepare_read_banner(struct ceph_connection *con) -{ - dout("prepare_read_banner %p\n", con); - con->in_base_pos = 0; -} - -static void prepare_read_connect(struct ceph_connection *con) -{ - dout("prepare_read_connect %p\n", con); - con->in_base_pos = 0; -} - -static void prepare_read_ack(struct ceph_connection *con) -{ - dout("prepare_read_ack %p\n", con); - con->in_base_pos = 0; -} - -static void prepare_read_seq(struct ceph_connection *con) -{ - dout("prepare_read_seq %p\n", con); - con->in_base_pos = 0; - con->in_tag = CEPH_MSGR_TAG_SEQ; -} - -static void prepare_read_tag(struct ceph_connection *con) -{ - dout("prepare_read_tag %p\n", con); - con->in_base_pos = 0; - con->in_tag = CEPH_MSGR_TAG_READY; -} - -static void prepare_read_keepalive_ack(struct ceph_connection *con) -{ - dout("prepare_read_keepalive_ack %p\n", con); - con->in_base_pos = 0; -} - -/* - * Prepare to read a message. 
- */ -static int prepare_read_message(struct ceph_connection *con) -{ - dout("prepare_read_message %p\n", con); - BUG_ON(con->in_msg != NULL); - con->in_base_pos = 0; - con->in_front_crc = con->in_middle_crc = con->in_data_crc = 0; - return 0; -} - - -static int read_partial(struct ceph_connection *con, - int end, int size, void *object) -{ - while (con->in_base_pos < end) { - int left = end - con->in_base_pos; - int have = size - left; - int ret = ceph_tcp_recvmsg(con->sock, object + have, left); - if (ret <= 0) - return ret; - con->in_base_pos += ret; - } - return 1; -} - - -/* - * Read all or part of the connect-side handshake on a new connection - */ -static int read_partial_banner(struct ceph_connection *con) -{ - int size; - int end; - int ret; - - dout("read_partial_banner %p at %d\n", con, con->in_base_pos); - - /* peer's banner */ - size = strlen(CEPH_BANNER); - end = size; - ret = read_partial(con, end, size, con->in_banner); - if (ret <= 0) - goto out; - - size = sizeof (con->actual_peer_addr); - end += size; - ret = read_partial(con, end, size, &con->actual_peer_addr); - if (ret <= 0) - goto out; - ceph_decode_banner_addr(&con->actual_peer_addr); - - size = sizeof (con->peer_addr_for_me); - end += size; - ret = read_partial(con, end, size, &con->peer_addr_for_me); - if (ret <= 0) - goto out; - ceph_decode_banner_addr(&con->peer_addr_for_me); - -out: - return ret; -} - -static int read_partial_connect(struct ceph_connection *con) -{ - int size; - int end; - int ret; - - dout("read_partial_connect %p at %d\n", con, con->in_base_pos); - - size = sizeof (con->in_reply); - end = size; - ret = read_partial(con, end, size, &con->in_reply); - if (ret <= 0) - goto out; - - if (con->auth) { - size = le32_to_cpu(con->in_reply.authorizer_len); - if (size > con->auth->authorizer_reply_buf_len) { - pr_err("authorizer reply too big: %d > %zu\n", size, - con->auth->authorizer_reply_buf_len); - ret = -EINVAL; - goto out; - } - - end += size; - ret = read_partial(con, end, size, - con->auth->authorizer_reply_buf); - if (ret <= 0) - goto out; - } - - dout("read_partial_connect %p tag %d, con_seq = %u, g_seq = %u\n", - con, (int)con->in_reply.tag, - le32_to_cpu(con->in_reply.connect_seq), - le32_to_cpu(con->in_reply.global_seq)); -out: - return ret; -} -/* - * Verify the hello banner looks okay. 
- */ -static int verify_hello(struct ceph_connection *con) -{ - if (memcmp(con->in_banner, CEPH_BANNER, strlen(CEPH_BANNER))) { - pr_err("connect to %s got bad banner\n", - ceph_pr_addr(&con->peer_addr)); - con->error_msg = "protocol error, bad banner"; - return -1; - } - return 0; -} - -static bool addr_is_blank(struct ceph_entity_addr *addr) +bool ceph_addr_is_blank(const struct ceph_entity_addr *addr) { struct sockaddr_storage ss = addr->in_addr; /* align */ struct in_addr *addr4 = &((struct sockaddr_in *)&ss)->sin_addr; @@ -1808,7 +1145,7 @@ static bool addr_is_blank(struct ceph_entity_addr *addr) } } -static int addr_port(struct ceph_entity_addr *addr) +int ceph_addr_port(const struct ceph_entity_addr *addr) { switch (get_unaligned(&addr->in_addr.ss_family)) { case AF_INET: @@ -1819,7 +1156,7 @@ static int addr_port(struct ceph_entity_addr *addr) return 0; } -static void addr_set_port(struct ceph_entity_addr *addr, int p) +void ceph_addr_set_port(struct ceph_entity_addr *addr, int p) { switch (get_unaligned(&addr->in_addr.ss_family)) { case AF_INET: @@ -1977,8 +1314,17 @@ int ceph_parse_ips(const char *c, const char *end, port = CEPH_MON_PORT; } - addr_set_port(&addr[i], port); + ceph_addr_set_port(&addr[i], port); + /* + * We want the type to be set according to ms_mode + * option, but options are normally parsed after mon + * addresses. Rather than complicating parsing, set + * to LEGACY and override in build_initial_monmap() + * for mon addresses and ceph_messenger_init() for + * ip option. + */ addr[i].type = CEPH_ENTITY_ADDR_TYPE_LEGACY; + addr[i].nonce = 0; dout("parse_ips got %s\n", ceph_pr_addr(&addr[i])); @@ -2000,521 +1346,12 @@ bad: return ret; } -static int process_banner(struct ceph_connection *con) -{ - dout("process_banner on %p\n", con); - - if (verify_hello(con) < 0) - return -1; - - /* - * Make sure the other end is who we wanted. note that the other - * end may not yet know their ip address, so if it's 0.0.0.0, give - * them the benefit of the doubt. - */ - if (memcmp(&con->peer_addr, &con->actual_peer_addr, - sizeof(con->peer_addr)) != 0 && - !(addr_is_blank(&con->actual_peer_addr) && - con->actual_peer_addr.nonce == con->peer_addr.nonce)) { - pr_warn("wrong peer, want %s/%u, got %s/%u\n", - ceph_pr_addr(&con->peer_addr), - le32_to_cpu(con->peer_addr.nonce), - ceph_pr_addr(&con->actual_peer_addr), - le32_to_cpu(con->actual_peer_addr.nonce)); - con->error_msg = "wrong peer at address"; - return -1; - } - - /* - * did we learn our address? - */ - if (addr_is_blank(&con->msgr->inst.addr)) { - int port = addr_port(&con->msgr->inst.addr); - - memcpy(&con->msgr->inst.addr.in_addr, - &con->peer_addr_for_me.in_addr, - sizeof(con->peer_addr_for_me.in_addr)); - addr_set_port(&con->msgr->inst.addr, port); - encode_my_addr(con->msgr); - dout("process_banner learned my addr is %s\n", - ceph_pr_addr(&con->msgr->inst.addr)); - } - - return 0; -} - -static int process_connect(struct ceph_connection *con) -{ - u64 sup_feat = from_msgr(con->msgr)->supported_features; - u64 req_feat = from_msgr(con->msgr)->required_features; - u64 server_feat = le64_to_cpu(con->in_reply.features); - int ret; - - dout("process_connect on %p tag %d\n", con, (int)con->in_tag); - - if (con->auth) { - int len = le32_to_cpu(con->in_reply.authorizer_len); - - /* - * Any connection that defines ->get_authorizer() - * should also define ->add_authorizer_challenge() and - * ->verify_authorizer_reply(). - * - * See get_connect_authorizer(). 
- */ - if (con->in_reply.tag == CEPH_MSGR_TAG_CHALLENGE_AUTHORIZER) { - ret = con->ops->add_authorizer_challenge( - con, con->auth->authorizer_reply_buf, len); - if (ret < 0) - return ret; - - con_out_kvec_reset(con); - __prepare_write_connect(con); - prepare_read_connect(con); - return 0; - } - - if (len) { - ret = con->ops->verify_authorizer_reply(con); - if (ret < 0) { - con->error_msg = "bad authorize reply"; - return ret; - } - } - } - - switch (con->in_reply.tag) { - case CEPH_MSGR_TAG_FEATURES: - pr_err("%s%lld %s feature set mismatch," - " my %llx < server's %llx, missing %llx\n", - ENTITY_NAME(con->peer_name), - ceph_pr_addr(&con->peer_addr), - sup_feat, server_feat, server_feat & ~sup_feat); - con->error_msg = "missing required protocol features"; - reset_connection(con); - return -1; - - case CEPH_MSGR_TAG_BADPROTOVER: - pr_err("%s%lld %s protocol version mismatch," - " my %d != server's %d\n", - ENTITY_NAME(con->peer_name), - ceph_pr_addr(&con->peer_addr), - le32_to_cpu(con->out_connect.protocol_version), - le32_to_cpu(con->in_reply.protocol_version)); - con->error_msg = "protocol version mismatch"; - reset_connection(con); - return -1; - - case CEPH_MSGR_TAG_BADAUTHORIZER: - con->auth_retry++; - dout("process_connect %p got BADAUTHORIZER attempt %d\n", con, - con->auth_retry); - if (con->auth_retry == 2) { - con->error_msg = "connect authorization failure"; - return -1; - } - con_out_kvec_reset(con); - ret = prepare_write_connect(con); - if (ret < 0) - return ret; - prepare_read_connect(con); - break; - - case CEPH_MSGR_TAG_RESETSESSION: - /* - * If we connected with a large connect_seq but the peer - * has no record of a session with us (no connection, or - * connect_seq == 0), they will send RESETSESION to indicate - * that they must have reset their session, and may have - * dropped messages. - */ - dout("process_connect got RESET peer seq %u\n", - le32_to_cpu(con->in_reply.connect_seq)); - pr_err("%s%lld %s connection reset\n", - ENTITY_NAME(con->peer_name), - ceph_pr_addr(&con->peer_addr)); - reset_connection(con); - con_out_kvec_reset(con); - ret = prepare_write_connect(con); - if (ret < 0) - return ret; - prepare_read_connect(con); - - /* Tell ceph about it. */ - mutex_unlock(&con->mutex); - pr_info("reset on %s%lld\n", ENTITY_NAME(con->peer_name)); - if (con->ops->peer_reset) - con->ops->peer_reset(con); - mutex_lock(&con->mutex); - if (con->state != CON_STATE_NEGOTIATING) - return -EAGAIN; - break; - - case CEPH_MSGR_TAG_RETRY_SESSION: - /* - * If we sent a smaller connect_seq than the peer has, try - * again with a larger value. - */ - dout("process_connect got RETRY_SESSION my seq %u, peer %u\n", - le32_to_cpu(con->out_connect.connect_seq), - le32_to_cpu(con->in_reply.connect_seq)); - con->connect_seq = le32_to_cpu(con->in_reply.connect_seq); - con_out_kvec_reset(con); - ret = prepare_write_connect(con); - if (ret < 0) - return ret; - prepare_read_connect(con); - break; - - case CEPH_MSGR_TAG_RETRY_GLOBAL: - /* - * If we sent a smaller global_seq than the peer has, try - * again with a larger value. 
- */ - dout("process_connect got RETRY_GLOBAL my %u peer_gseq %u\n", - con->peer_global_seq, - le32_to_cpu(con->in_reply.global_seq)); - get_global_seq(con->msgr, - le32_to_cpu(con->in_reply.global_seq)); - con_out_kvec_reset(con); - ret = prepare_write_connect(con); - if (ret < 0) - return ret; - prepare_read_connect(con); - break; - - case CEPH_MSGR_TAG_SEQ: - case CEPH_MSGR_TAG_READY: - if (req_feat & ~server_feat) { - pr_err("%s%lld %s protocol feature mismatch," - " my required %llx > server's %llx, need %llx\n", - ENTITY_NAME(con->peer_name), - ceph_pr_addr(&con->peer_addr), - req_feat, server_feat, req_feat & ~server_feat); - con->error_msg = "missing required protocol features"; - reset_connection(con); - return -1; - } - - WARN_ON(con->state != CON_STATE_NEGOTIATING); - con->state = CON_STATE_OPEN; - con->auth_retry = 0; /* we authenticated; clear flag */ - con->peer_global_seq = le32_to_cpu(con->in_reply.global_seq); - con->connect_seq++; - con->peer_features = server_feat; - dout("process_connect got READY gseq %d cseq %d (%d)\n", - con->peer_global_seq, - le32_to_cpu(con->in_reply.connect_seq), - con->connect_seq); - WARN_ON(con->connect_seq != - le32_to_cpu(con->in_reply.connect_seq)); - - if (con->in_reply.flags & CEPH_MSG_CONNECT_LOSSY) - con_flag_set(con, CON_FLAG_LOSSYTX); - - con->delay = 0; /* reset backoff memory */ - - if (con->in_reply.tag == CEPH_MSGR_TAG_SEQ) { - prepare_write_seq(con); - prepare_read_seq(con); - } else { - prepare_read_tag(con); - } - break; - - case CEPH_MSGR_TAG_WAIT: - /* - * If there is a connection race (we are opening - * connections to each other), one of us may just have - * to WAIT. This shouldn't happen if we are the - * client. - */ - con->error_msg = "protocol error, got WAIT as client"; - return -1; - - default: - con->error_msg = "protocol error, garbage tag during connect"; - return -1; - } - return 0; -} - - -/* - * read (part of) an ack - */ -static int read_partial_ack(struct ceph_connection *con) -{ - int size = sizeof (con->in_temp_ack); - int end = size; - - return read_partial(con, end, size, &con->in_temp_ack); -} - -/* - * We can finally discard anything that's been acked. - */ -static void process_ack(struct ceph_connection *con) -{ - struct ceph_msg *m; - u64 ack = le64_to_cpu(con->in_temp_ack); - u64 seq; - bool reconnect = (con->in_tag == CEPH_MSGR_TAG_SEQ); - struct list_head *list = reconnect ? &con->out_queue : &con->out_sent; - - /* - * In the reconnect case, con_fault() has requeued messages - * in out_sent. We should cleanup old messages according to - * the reconnect seq. 
- */ - while (!list_empty(list)) { - m = list_first_entry(list, struct ceph_msg, list_head); - if (reconnect && m->needs_out_seq) - break; - seq = le64_to_cpu(m->hdr.seq); - if (seq > ack) - break; - dout("got ack for seq %llu type %d at %p\n", seq, - le16_to_cpu(m->hdr.type), m); - m->ack_stamp = jiffies; - ceph_msg_remove(m); - } - - prepare_read_tag(con); -} - - -static int read_partial_message_section(struct ceph_connection *con, - struct kvec *section, - unsigned int sec_len, u32 *crc) -{ - int ret, left; - - BUG_ON(!section); - - while (section->iov_len < sec_len) { - BUG_ON(section->iov_base == NULL); - left = sec_len - section->iov_len; - ret = ceph_tcp_recvmsg(con->sock, (char *)section->iov_base + - section->iov_len, left); - if (ret <= 0) - return ret; - section->iov_len += ret; - } - if (section->iov_len == sec_len) - *crc = crc32c(0, section->iov_base, section->iov_len); - - return 1; -} - -static int read_partial_msg_data(struct ceph_connection *con) -{ - struct ceph_msg *msg = con->in_msg; - struct ceph_msg_data_cursor *cursor = &msg->cursor; - bool do_datacrc = !ceph_test_opt(from_msgr(con->msgr), NOCRC); - struct page *page; - size_t page_offset; - size_t length; - u32 crc = 0; - int ret; - - if (!msg->num_data_items) - return -EIO; - - if (do_datacrc) - crc = con->in_data_crc; - while (cursor->total_resid) { - if (!cursor->resid) { - ceph_msg_data_advance(cursor, 0); - continue; - } - - page = ceph_msg_data_next(cursor, &page_offset, &length, NULL); - ret = ceph_tcp_recvpage(con->sock, page, page_offset, length); - if (ret <= 0) { - if (do_datacrc) - con->in_data_crc = crc; - - return ret; - } - - if (do_datacrc) - crc = ceph_crc32c_page(crc, page, page_offset, ret); - ceph_msg_data_advance(cursor, (size_t)ret); - } - if (do_datacrc) - con->in_data_crc = crc; - - return 1; /* must return > 0 to indicate success */ -} - -/* - * read (part of) a message. 
- */ -static int ceph_con_in_msg_alloc(struct ceph_connection *con, int *skip); - -static int read_partial_message(struct ceph_connection *con) -{ - struct ceph_msg *m = con->in_msg; - int size; - int end; - int ret; - unsigned int front_len, middle_len, data_len; - bool do_datacrc = !ceph_test_opt(from_msgr(con->msgr), NOCRC); - bool need_sign = (con->peer_features & CEPH_FEATURE_MSG_AUTH); - u64 seq; - u32 crc; - - dout("read_partial_message con %p msg %p\n", con, m); - - /* header */ - size = sizeof (con->in_hdr); - end = size; - ret = read_partial(con, end, size, &con->in_hdr); - if (ret <= 0) - return ret; - - crc = crc32c(0, &con->in_hdr, offsetof(struct ceph_msg_header, crc)); - if (cpu_to_le32(crc) != con->in_hdr.crc) { - pr_err("read_partial_message bad hdr crc %u != expected %u\n", - crc, con->in_hdr.crc); - return -EBADMSG; - } - - front_len = le32_to_cpu(con->in_hdr.front_len); - if (front_len > CEPH_MSG_MAX_FRONT_LEN) - return -EIO; - middle_len = le32_to_cpu(con->in_hdr.middle_len); - if (middle_len > CEPH_MSG_MAX_MIDDLE_LEN) - return -EIO; - data_len = le32_to_cpu(con->in_hdr.data_len); - if (data_len > CEPH_MSG_MAX_DATA_LEN) - return -EIO; - - /* verify seq# */ - seq = le64_to_cpu(con->in_hdr.seq); - if ((s64)seq - (s64)con->in_seq < 1) { - pr_info("skipping %s%lld %s seq %lld expected %lld\n", - ENTITY_NAME(con->peer_name), - ceph_pr_addr(&con->peer_addr), - seq, con->in_seq + 1); - con->in_base_pos = -front_len - middle_len - data_len - - sizeof_footer(con); - con->in_tag = CEPH_MSGR_TAG_READY; - return 1; - } else if ((s64)seq - (s64)con->in_seq > 1) { - pr_err("read_partial_message bad seq %lld expected %lld\n", - seq, con->in_seq + 1); - con->error_msg = "bad message sequence # for incoming message"; - return -EBADE; - } - - /* allocate message? */ - if (!con->in_msg) { - int skip = 0; - - dout("got hdr type %d front %d data %d\n", con->in_hdr.type, - front_len, data_len); - ret = ceph_con_in_msg_alloc(con, &skip); - if (ret < 0) - return ret; - - BUG_ON(!con->in_msg ^ skip); - if (skip) { - /* skip this message */ - dout("alloc_msg said skip message\n"); - con->in_base_pos = -front_len - middle_len - data_len - - sizeof_footer(con); - con->in_tag = CEPH_MSGR_TAG_READY; - con->in_seq++; - return 1; - } - - BUG_ON(!con->in_msg); - BUG_ON(con->in_msg->con != con); - m = con->in_msg; - m->front.iov_len = 0; /* haven't read it yet */ - if (m->middle) - m->middle->vec.iov_len = 0; - - /* prepare for data payload, if any */ - - if (data_len) - prepare_message_data(con->in_msg, data_len); - } - - /* front */ - ret = read_partial_message_section(con, &m->front, front_len, - &con->in_front_crc); - if (ret <= 0) - return ret; - - /* middle */ - if (m->middle) { - ret = read_partial_message_section(con, &m->middle->vec, - middle_len, - &con->in_middle_crc); - if (ret <= 0) - return ret; - } - - /* (page) data */ - if (data_len) { - ret = read_partial_msg_data(con); - if (ret <= 0) - return ret; - } - - /* footer */ - size = sizeof_footer(con); - end += size; - ret = read_partial(con, end, size, &m->footer); - if (ret <= 0) - return ret; - - if (!need_sign) { - m->footer.flags = m->old_footer.flags; - m->footer.sig = 0; - } - - dout("read_partial_message got msg %p %d (%u) + %d (%u) + %d (%u)\n", - m, front_len, m->footer.front_crc, middle_len, - m->footer.middle_crc, data_len, m->footer.data_crc); - - /* crc ok? */ - if (con->in_front_crc != le32_to_cpu(m->footer.front_crc)) { - pr_err("read_partial_message %p front crc %u != exp. 
%u\n", - m, con->in_front_crc, m->footer.front_crc); - return -EBADMSG; - } - if (con->in_middle_crc != le32_to_cpu(m->footer.middle_crc)) { - pr_err("read_partial_message %p middle crc %u != exp %u\n", - m, con->in_middle_crc, m->footer.middle_crc); - return -EBADMSG; - } - if (do_datacrc && - (m->footer.flags & CEPH_MSG_FOOTER_NOCRC) == 0 && - con->in_data_crc != le32_to_cpu(m->footer.data_crc)) { - pr_err("read_partial_message %p data crc %u != exp. %u\n", m, - con->in_data_crc, le32_to_cpu(m->footer.data_crc)); - return -EBADMSG; - } - - if (need_sign && con->ops->check_message_signature && - con->ops->check_message_signature(m)) { - pr_err("read_partial_message %p signature check failed\n", m); - return -EBADMSG; - } - - return 1; /* done! */ -} - /* * Process message. This happens in the worker thread. The callback should * be careful not to do anything that waits on other incoming messages or it * may deadlock. */ -static void process_message(struct ceph_connection *con) +void ceph_con_process_message(struct ceph_connection *con) { struct ceph_msg *msg = con->in_msg; @@ -2528,12 +1365,13 @@ static void process_message(struct ceph_connection *con) con->in_seq++; mutex_unlock(&con->mutex); - dout("===== %p %llu from %s%lld %d=%s len %d+%d (%u %u %u) =====\n", + dout("===== %p %llu from %s%lld %d=%s len %d+%d+%d (%u %u %u) =====\n", msg, le64_to_cpu(msg->hdr.seq), ENTITY_NAME(msg->hdr.src), le16_to_cpu(msg->hdr.type), ceph_msg_type_name(le16_to_cpu(msg->hdr.type)), le32_to_cpu(msg->hdr.front_len), + le32_to_cpu(msg->hdr.middle_len), le32_to_cpu(msg->hdr.data_len), con->in_front_crc, con->in_middle_crc, con->in_data_crc); con->ops->dispatch(con, msg); @@ -2541,264 +1379,6 @@ static void process_message(struct ceph_connection *con) mutex_lock(&con->mutex); } -static int read_keepalive_ack(struct ceph_connection *con) -{ - struct ceph_timespec ceph_ts; - size_t size = sizeof(ceph_ts); - int ret = read_partial(con, size, size, &ceph_ts); - if (ret <= 0) - return ret; - ceph_decode_timespec64(&con->last_keepalive_ack, &ceph_ts); - prepare_read_tag(con); - return 1; -} - -/* - * Write something to the socket. Called in a worker thread when the - * socket appears to be writeable and we have something ready to send. - */ -static int try_write(struct ceph_connection *con) -{ - int ret = 1; - - dout("try_write start %p state %lu\n", con, con->state); - if (con->state != CON_STATE_PREOPEN && - con->state != CON_STATE_CONNECTING && - con->state != CON_STATE_NEGOTIATING && - con->state != CON_STATE_OPEN) - return 0; - - /* open the socket first? */ - if (con->state == CON_STATE_PREOPEN) { - BUG_ON(con->sock); - con->state = CON_STATE_CONNECTING; - - con_out_kvec_reset(con); - prepare_write_banner(con); - prepare_read_banner(con); - - BUG_ON(con->in_msg); - con->in_tag = CEPH_MSGR_TAG_READY; - dout("try_write initiating connect on %p new state %lu\n", - con, con->state); - ret = ceph_tcp_connect(con); - if (ret < 0) { - con->error_msg = "connect error"; - goto out; - } - } - -more: - dout("try_write out_kvec_bytes %d\n", con->out_kvec_bytes); - BUG_ON(!con->sock); - - /* kvec data queued? */ - if (con->out_kvec_left) { - ret = write_partial_kvec(con); - if (ret <= 0) - goto out; - } - if (con->out_skip) { - ret = write_partial_skip(con); - if (ret <= 0) - goto out; - } - - /* msg pages? 
*/ - if (con->out_msg) { - if (con->out_msg_done) { - ceph_msg_put(con->out_msg); - con->out_msg = NULL; /* we're done with this one */ - goto do_next; - } - - ret = write_partial_message_data(con); - if (ret == 1) - goto more; /* we need to send the footer, too! */ - if (ret == 0) - goto out; - if (ret < 0) { - dout("try_write write_partial_message_data err %d\n", - ret); - goto out; - } - } - -do_next: - if (con->state == CON_STATE_OPEN) { - if (con_flag_test_and_clear(con, CON_FLAG_KEEPALIVE_PENDING)) { - prepare_write_keepalive(con); - goto more; - } - /* is anything else pending? */ - if (!list_empty(&con->out_queue)) { - prepare_write_message(con); - goto more; - } - if (con->in_seq > con->in_seq_acked) { - prepare_write_ack(con); - goto more; - } - } - - /* Nothing to do! */ - con_flag_clear(con, CON_FLAG_WRITE_PENDING); - dout("try_write nothing else to write.\n"); - ret = 0; -out: - dout("try_write done on %p ret %d\n", con, ret); - return ret; -} - -/* - * Read what we can from the socket. - */ -static int try_read(struct ceph_connection *con) -{ - int ret = -1; - -more: - dout("try_read start on %p state %lu\n", con, con->state); - if (con->state != CON_STATE_CONNECTING && - con->state != CON_STATE_NEGOTIATING && - con->state != CON_STATE_OPEN) - return 0; - - BUG_ON(!con->sock); - - dout("try_read tag %d in_base_pos %d\n", (int)con->in_tag, - con->in_base_pos); - - if (con->state == CON_STATE_CONNECTING) { - dout("try_read connecting\n"); - ret = read_partial_banner(con); - if (ret <= 0) - goto out; - ret = process_banner(con); - if (ret < 0) - goto out; - - con->state = CON_STATE_NEGOTIATING; - - /* - * Received banner is good, exchange connection info. - * Do not reset out_kvec, as sending our banner raced - * with receiving peer banner after connect completed. - */ - ret = prepare_write_connect(con); - if (ret < 0) - goto out; - prepare_read_connect(con); - - /* Send connection info before awaiting response */ - goto out; - } - - if (con->state == CON_STATE_NEGOTIATING) { - dout("try_read negotiating\n"); - ret = read_partial_connect(con); - if (ret <= 0) - goto out; - ret = process_connect(con); - if (ret < 0) - goto out; - goto more; - } - - WARN_ON(con->state != CON_STATE_OPEN); - - if (con->in_base_pos < 0) { - /* - * skipping + discarding content. - */ - ret = ceph_tcp_recvmsg(con->sock, NULL, -con->in_base_pos); - if (ret <= 0) - goto out; - dout("skipped %d / %d bytes\n", ret, -con->in_base_pos); - con->in_base_pos += ret; - if (con->in_base_pos) - goto more; - } - if (con->in_tag == CEPH_MSGR_TAG_READY) { - /* - * what's next? 
- */ - ret = ceph_tcp_recvmsg(con->sock, &con->in_tag, 1); - if (ret <= 0) - goto out; - dout("try_read got tag %d\n", (int)con->in_tag); - switch (con->in_tag) { - case CEPH_MSGR_TAG_MSG: - prepare_read_message(con); - break; - case CEPH_MSGR_TAG_ACK: - prepare_read_ack(con); - break; - case CEPH_MSGR_TAG_KEEPALIVE2_ACK: - prepare_read_keepalive_ack(con); - break; - case CEPH_MSGR_TAG_CLOSE: - con_close_socket(con); - con->state = CON_STATE_CLOSED; - goto out; - default: - goto bad_tag; - } - } - if (con->in_tag == CEPH_MSGR_TAG_MSG) { - ret = read_partial_message(con); - if (ret <= 0) { - switch (ret) { - case -EBADMSG: - con->error_msg = "bad crc/signature"; - fallthrough; - case -EBADE: - ret = -EIO; - break; - case -EIO: - con->error_msg = "io error"; - break; - } - goto out; - } - if (con->in_tag == CEPH_MSGR_TAG_READY) - goto more; - process_message(con); - if (con->state == CON_STATE_OPEN) - prepare_read_tag(con); - goto more; - } - if (con->in_tag == CEPH_MSGR_TAG_ACK || - con->in_tag == CEPH_MSGR_TAG_SEQ) { - /* - * the final handshake seq exchange is semantically - * equivalent to an ACK - */ - ret = read_partial_ack(con); - if (ret <= 0) - goto out; - process_ack(con); - goto more; - } - if (con->in_tag == CEPH_MSGR_TAG_KEEPALIVE2_ACK) { - ret = read_keepalive_ack(con); - if (ret <= 0) - goto out; - goto more; - } - -out: - dout("try_read done on %p ret %d\n", con, ret); - return ret; - -bad_tag: - pr_err("try_read bad con->in_tag = %d\n", (int)con->in_tag); - con->error_msg = "protocol error, garbage tag"; - ret = -1; - goto out; -} - - /* * Atomically queue work on a connection after the specified delay. * Bump @con reference to avoid races with connection teardown. @@ -2811,6 +1391,9 @@ static int queue_con_delay(struct ceph_connection *con, unsigned long delay) return -ENOENT; } + if (delay >= HZ) + delay = round_jiffies_relative(delay); + dout("%s %p %lu\n", __func__, con, delay); if (!queue_delayed_work(ceph_msgr_wq, &con->work, delay)) { dout("%s %p - already queued\n", __func__, con); @@ -2836,27 +1419,30 @@ static void cancel_con(struct ceph_connection *con) static bool con_sock_closed(struct ceph_connection *con) { - if (!con_flag_test_and_clear(con, CON_FLAG_SOCK_CLOSED)) + if (!ceph_con_flag_test_and_clear(con, CEPH_CON_F_SOCK_CLOSED)) return false; #define CASE(x) \ - case CON_STATE_ ## x: \ + case CEPH_CON_S_ ## x: \ con->error_msg = "socket closed (con state " #x ")"; \ break; switch (con->state) { CASE(CLOSED); CASE(PREOPEN); - CASE(CONNECTING); - CASE(NEGOTIATING); + CASE(V1_BANNER); + CASE(V1_CONNECT_MSG); + CASE(V2_BANNER_PREFIX); + CASE(V2_BANNER_PAYLOAD); + CASE(V2_HELLO); + CASE(V2_AUTH); + CASE(V2_AUTH_SIGNATURE); + CASE(V2_SESSION_CONNECT); + CASE(V2_SESSION_RECONNECT); CASE(OPEN); CASE(STANDBY); default: - pr_warn("%s con %p unrecognized state %lu\n", - __func__, con, con->state); - con->error_msg = "unrecognized con state"; BUG(); - break; } #undef CASE @@ -2867,15 +1453,15 @@ static bool con_backoff(struct ceph_connection *con) { int ret; - if (!con_flag_test_and_clear(con, CON_FLAG_BACKOFF)) + if (!ceph_con_flag_test_and_clear(con, CEPH_CON_F_BACKOFF)) return false; - ret = queue_con_delay(con, round_jiffies_relative(con->delay)); + ret = queue_con_delay(con, con->delay); if (ret) { dout("%s: con %p FAILED to back off %lu\n", __func__, con, con->delay); BUG_ON(ret == -ENOENT); - con_flag_set(con, CON_FLAG_BACKOFF); + ceph_con_flag_set(con, CEPH_CON_F_BACKOFF); } return true; @@ -2891,11 +1477,11 @@ static void con_fault_finish(struct ceph_connection 
*con) * in case we faulted due to authentication, invalidate our * current tickets so that we can get new ones. */ - if (con->auth_retry) { - dout("auth_retry %d, invalidating\n", con->auth_retry); + if (con->v1.auth_retry) { + dout("auth_retry %d, invalidating\n", con->v1.auth_retry); if (con->ops->invalidate_authorizer) con->ops->invalidate_authorizer(con); - con->auth_retry = 0; + con->v1.auth_retry = 0; } if (con->ops->fault) @@ -2923,21 +1509,24 @@ static void ceph_con_workfn(struct work_struct *work) dout("%s: con %p BACKOFF\n", __func__, con); break; } - if (con->state == CON_STATE_STANDBY) { + if (con->state == CEPH_CON_S_STANDBY) { dout("%s: con %p STANDBY\n", __func__, con); break; } - if (con->state == CON_STATE_CLOSED) { + if (con->state == CEPH_CON_S_CLOSED) { dout("%s: con %p CLOSED\n", __func__, con); BUG_ON(con->sock); break; } - if (con->state == CON_STATE_PREOPEN) { + if (con->state == CEPH_CON_S_PREOPEN) { dout("%s: con %p PREOPEN\n", __func__, con); BUG_ON(con->sock); } - ret = try_read(con); + if (ceph_msgr2(from_msgr(con->msgr))) + ret = ceph_con_v2_try_read(con); + else + ret = ceph_con_v1_try_read(con); if (ret < 0) { if (ret == -EAGAIN) continue; @@ -2947,7 +1536,10 @@ static void ceph_con_workfn(struct work_struct *work) break; } - ret = try_write(con); + if (ceph_msgr2(from_msgr(con->msgr))) + ret = ceph_con_v2_try_write(con); + else + ret = ceph_con_v1_try_write(con); if (ret < 0) { if (ret == -EAGAIN) continue; @@ -2974,64 +1566,54 @@ static void ceph_con_workfn(struct work_struct *work) */ static void con_fault(struct ceph_connection *con) { - dout("fault %p state %lu to peer %s\n", + dout("fault %p state %d to peer %s\n", con, con->state, ceph_pr_addr(&con->peer_addr)); pr_warn("%s%lld %s %s\n", ENTITY_NAME(con->peer_name), ceph_pr_addr(&con->peer_addr), con->error_msg); con->error_msg = NULL; - WARN_ON(con->state != CON_STATE_CONNECTING && - con->state != CON_STATE_NEGOTIATING && - con->state != CON_STATE_OPEN); + WARN_ON(con->state == CEPH_CON_S_STANDBY || + con->state == CEPH_CON_S_CLOSED); - con_close_socket(con); + ceph_con_reset_protocol(con); - if (con_flag_test(con, CON_FLAG_LOSSYTX)) { + if (ceph_con_flag_test(con, CEPH_CON_F_LOSSYTX)) { dout("fault on LOSSYTX channel, marking CLOSED\n"); - con->state = CON_STATE_CLOSED; + con->state = CEPH_CON_S_CLOSED; return; } - if (con->in_msg) { - BUG_ON(con->in_msg->con != con); - ceph_msg_put(con->in_msg); - con->in_msg = NULL; - } - if (con->out_msg) { - BUG_ON(con->out_msg->con != con); - ceph_msg_put(con->out_msg); - con->out_msg = NULL; - } - /* Requeue anything that hasn't been acked */ list_splice_init(&con->out_sent, &con->out_queue); /* If there are no messages queued or keepalive pending, place * the connection in a STANDBY state */ if (list_empty(&con->out_queue) && - !con_flag_test(con, CON_FLAG_KEEPALIVE_PENDING)) { + !ceph_con_flag_test(con, CEPH_CON_F_KEEPALIVE_PENDING)) { dout("fault %p setting STANDBY clearing WRITE_PENDING\n", con); - con_flag_clear(con, CON_FLAG_WRITE_PENDING); - con->state = CON_STATE_STANDBY; + ceph_con_flag_clear(con, CEPH_CON_F_WRITE_PENDING); + con->state = CEPH_CON_S_STANDBY; } else { /* retry after a delay. 
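 *
 * The delay is a capped exponential backoff: BASE_DELAY_INTERVAL
 * on the first fault, doubled on each subsequent fault, clamped to
 * MAX_DELAY_INTERVAL.  CEPH_CON_F_BACKOFF makes the worker requeue
 * itself via queue_con_delay() instead of retrying immediately.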
*/ - con->state = CON_STATE_PREOPEN; - if (con->delay == 0) + con->state = CEPH_CON_S_PREOPEN; + if (!con->delay) { con->delay = BASE_DELAY_INTERVAL; - else if (con->delay < MAX_DELAY_INTERVAL) + } else if (con->delay < MAX_DELAY_INTERVAL) { con->delay *= 2; - con_flag_set(con, CON_FLAG_BACKOFF); + if (con->delay > MAX_DELAY_INTERVAL) + con->delay = MAX_DELAY_INTERVAL; + } + ceph_con_flag_set(con, CEPH_CON_F_BACKOFF); queue_con(con); } } - void ceph_messenger_reset_nonce(struct ceph_messenger *msgr) { u32 nonce = le32_to_cpu(msgr->inst.addr.nonce) + 1000000; msgr->inst.addr.nonce = cpu_to_le32(nonce); - encode_my_addr(msgr); + ceph_encode_my_addr(msgr); } /* @@ -3042,26 +1624,35 @@ void ceph_messenger_init(struct ceph_messenger *msgr, { spin_lock_init(&msgr->global_seq_lock); - if (myaddr) - msgr->inst.addr = *myaddr; + if (myaddr) { + memcpy(&msgr->inst.addr.in_addr, &myaddr->in_addr, + sizeof(msgr->inst.addr.in_addr)); + ceph_addr_set_port(&msgr->inst.addr, 0); + } - /* select a random nonce */ - msgr->inst.addr.type = 0; - get_random_bytes(&msgr->inst.addr.nonce, sizeof(msgr->inst.addr.nonce)); - encode_my_addr(msgr); + /* + * Since nautilus, clients are identified using type ANY. + * For msgr1, ceph_encode_banner_addr() munges it to NONE. + */ + msgr->inst.addr.type = CEPH_ENTITY_ADDR_TYPE_ANY; + + /* generate a random non-zero nonce */ + do { + get_random_bytes(&msgr->inst.addr.nonce, + sizeof(msgr->inst.addr.nonce)); + } while (!msgr->inst.addr.nonce); + ceph_encode_my_addr(msgr); atomic_set(&msgr->stopping, 0); write_pnet(&msgr->net, get_net(current->nsproxy->net_ns)); dout("%s %p\n", __func__, msgr); } -EXPORT_SYMBOL(ceph_messenger_init); void ceph_messenger_fini(struct ceph_messenger *msgr) { put_net(read_pnet(&msgr->net)); } -EXPORT_SYMBOL(ceph_messenger_fini); static void msg_con_set(struct ceph_msg *msg, struct ceph_connection *con) { @@ -3075,17 +1666,19 @@ static void msg_con_set(struct ceph_msg *msg, struct ceph_connection *con) static void clear_standby(struct ceph_connection *con) { /* come back from STANDBY? */ - if (con->state == CON_STATE_STANDBY) { + if (con->state == CEPH_CON_S_STANDBY) { dout("clear_standby %p and ++connect_seq\n", con); - con->state = CON_STATE_PREOPEN; - con->connect_seq++; - WARN_ON(con_flag_test(con, CON_FLAG_WRITE_PENDING)); - WARN_ON(con_flag_test(con, CON_FLAG_KEEPALIVE_PENDING)); + con->state = CEPH_CON_S_PREOPEN; + con->v1.connect_seq++; + WARN_ON(ceph_con_flag_test(con, CEPH_CON_F_WRITE_PENDING)); + WARN_ON(ceph_con_flag_test(con, CEPH_CON_F_KEEPALIVE_PENDING)); } } /* * Queue up an outgoing message on the given connection. + * + * Consumes a ref on @msg. 
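+ *
+ * A typical caller pattern, as a sketch (ceph_msg_new() arguments
+ * elided):
+ *
+ *	msg = ceph_msg_new(...);
+ *	...fill in msg->front...
+ *	ceph_con_send(con, msg);   (ref now owned by the connection)
+ *
+ * A caller that needs the message afterwards must take its own ref
+ * with ceph_msg_get() before sending.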
*/ void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg) { @@ -3096,7 +1689,7 @@ void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg) mutex_lock(&con->mutex); - if (con->state == CON_STATE_CLOSED) { + if (con->state == CEPH_CON_S_CLOSED) { dout("con_send %p closed, dropping %p\n", con, msg); ceph_msg_put(msg); mutex_unlock(&con->mutex); @@ -3119,7 +1712,7 @@ void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg) /* if there wasn't anything waiting to send before, queue * new work */ - if (con_flag_test_and_set(con, CON_FLAG_WRITE_PENDING) == 0) + if (!ceph_con_flag_test_and_set(con, CEPH_CON_F_WRITE_PENDING)) queue_con(con); } EXPORT_SYMBOL(ceph_con_send); @@ -3137,36 +1730,30 @@ void ceph_msg_revoke(struct ceph_msg *msg) } mutex_lock(&con->mutex); - if (!list_empty(&msg->list_head)) { - dout("%s %p msg %p - was on queue\n", __func__, con, msg); - list_del_init(&msg->list_head); - msg->hdr.seq = 0; - - ceph_msg_put(msg); + if (list_empty(&msg->list_head)) { + WARN_ON(con->out_msg == msg); + dout("%s con %p msg %p not linked\n", __func__, con, msg); + mutex_unlock(&con->mutex); + return; } + + dout("%s con %p msg %p was linked\n", __func__, con, msg); + msg->hdr.seq = 0; + ceph_msg_remove(msg); + if (con->out_msg == msg) { - BUG_ON(con->out_skip); - /* footer */ - if (con->out_msg_done) { - con->out_skip += con_out_kvec_skip(con); - } else { - BUG_ON(!msg->data_length); - con->out_skip += sizeof_footer(con); - } - /* data, middle, front */ - if (msg->data_length) - con->out_skip += msg->cursor.total_resid; - if (msg->middle) - con->out_skip += con_out_kvec_skip(con); - con->out_skip += con_out_kvec_skip(con); - - dout("%s %p msg %p - was sending, will write %d skip %d\n", - __func__, con, msg, con->out_kvec_bytes, con->out_skip); - msg->hdr.seq = 0; + WARN_ON(con->state != CEPH_CON_S_OPEN); + dout("%s con %p msg %p was sending\n", __func__, con, msg); + if (ceph_msgr2(from_msgr(con->msgr))) + ceph_con_v2_revoke(con); + else + ceph_con_v1_revoke(con); + ceph_msg_put(con->out_msg); con->out_msg = NULL; - ceph_msg_put(msg); + } else { + dout("%s con %p msg %p not current, out_msg %p\n", __func__, + con, msg, con->out_msg); } - mutex_unlock(&con->mutex); } @@ -3184,25 +1771,17 @@ void ceph_msg_revoke_incoming(struct ceph_msg *msg) mutex_lock(&con->mutex); if (con->in_msg == msg) { - unsigned int front_len = le32_to_cpu(con->in_hdr.front_len); - unsigned int middle_len = le32_to_cpu(con->in_hdr.middle_len); - unsigned int data_len = le32_to_cpu(con->in_hdr.data_len); - - /* skip rest of message */ - dout("%s %p msg %p revoked\n", __func__, con, msg); - con->in_base_pos = con->in_base_pos - - sizeof(struct ceph_msg_header) - - front_len - - middle_len - - data_len - - sizeof(struct ceph_msg_footer); + WARN_ON(con->state != CEPH_CON_S_OPEN); + dout("%s con %p msg %p was recving\n", __func__, con, msg); + if (ceph_msgr2(from_msgr(con->msgr))) + ceph_con_v2_revoke_incoming(con); + else + ceph_con_v1_revoke_incoming(con); ceph_msg_put(con->in_msg); con->in_msg = NULL; - con->in_tag = CEPH_MSGR_TAG_READY; - con->in_seq++; } else { - dout("%s %p in_msg %p msg %p no-op\n", - __func__, con, con->in_msg, msg); + dout("%s con %p msg %p not current, in_msg %p\n", __func__, + con, msg, con->in_msg); } mutex_unlock(&con->mutex); } @@ -3215,10 +1794,10 @@ void ceph_con_keepalive(struct ceph_connection *con) dout("con_keepalive %p\n", con); mutex_lock(&con->mutex); clear_standby(con); - con_flag_set(con, CON_FLAG_KEEPALIVE_PENDING); + ceph_con_flag_set(con, 
CEPH_CON_F_KEEPALIVE_PENDING); mutex_unlock(&con->mutex); - if (con_flag_test_and_set(con, CON_FLAG_WRITE_PENDING) == 0) + if (!ceph_con_flag_test_and_set(con, CEPH_CON_F_WRITE_PENDING)) queue_con(con); } EXPORT_SYMBOL(ceph_con_keepalive); @@ -3424,9 +2003,9 @@ static int ceph_alloc_middle(struct ceph_connection *con, struct ceph_msg *msg) * On error (ENOMEM, EAGAIN, ...), * - con->in_msg == NULL */ -static int ceph_con_in_msg_alloc(struct ceph_connection *con, int *skip) +int ceph_con_in_msg_alloc(struct ceph_connection *con, + struct ceph_msg_header *hdr, int *skip) { - struct ceph_msg_header *hdr = &con->in_hdr; int middle_len = le32_to_cpu(hdr->middle_len); struct ceph_msg *msg; int ret = 0; @@ -3437,7 +2016,7 @@ static int ceph_con_in_msg_alloc(struct ceph_connection *con, int *skip) mutex_unlock(&con->mutex); msg = con->ops->alloc_msg(con, hdr, skip); mutex_lock(&con->mutex); - if (con->state != CON_STATE_OPEN) { + if (con->state != CEPH_CON_S_OPEN) { if (msg) ceph_msg_put(msg); return -EAGAIN; @@ -3458,7 +2037,7 @@ static int ceph_con_in_msg_alloc(struct ceph_connection *con, int *skip) con->error_msg = "error allocating memory for incoming message"; return -ENOMEM; } - memcpy(&con->in_msg->hdr, &con->in_hdr, sizeof(con->in_hdr)); + memcpy(&con->in_msg->hdr, hdr, sizeof(*hdr)); if (middle_len && !con->in_msg->middle) { ret = ceph_alloc_middle(con, con->in_msg); @@ -3471,6 +2050,39 @@ static int ceph_con_in_msg_alloc(struct ceph_connection *con, int *skip) return ret; } +void ceph_con_get_out_msg(struct ceph_connection *con) +{ + struct ceph_msg *msg; + + BUG_ON(list_empty(&con->out_queue)); + msg = list_first_entry(&con->out_queue, struct ceph_msg, list_head); + WARN_ON(msg->con != con); + + /* + * Put the message on "sent" list using a ref from ceph_con_send(). + * It is put when the message is acked or revoked. + */ + list_move_tail(&msg->list_head, &con->out_sent); + + /* + * Only assign outgoing seq # if we haven't sent this message + * yet. If it is requeued, resend with its original seq. + */ + if (msg->needs_out_seq) { + msg->hdr.seq = cpu_to_le64(++con->out_seq); + msg->needs_out_seq = false; + + if (con->ops->reencode_message) + con->ops->reencode_message(msg); + } + + /* + * Get a ref for out_msg. It is put when we are done sending the + * message or in case of a fault. + */ + WARN_ON(con->out_msg); + con->out_msg = ceph_msg_get(msg); +} /* * Free a generically kmalloc'd message. diff --git a/net/ceph/messenger_v1.c b/net/ceph/messenger_v1.c new file mode 100644 index 000000000000..2cb5ffdf071a --- /dev/null +++ b/net/ceph/messenger_v1.c @@ -0,0 +1,1506 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/ceph/ceph_debug.h> + +#include <linux/bvec.h> +#include <linux/crc32c.h> +#include <linux/net.h> +#include <linux/socket.h> +#include <net/sock.h> + +#include <linux/ceph/ceph_features.h> +#include <linux/ceph/decode.h> +#include <linux/ceph/libceph.h> +#include <linux/ceph/messenger.h> + +/* static tag bytes (protocol control messages) */ +static char tag_msg = CEPH_MSGR_TAG_MSG; +static char tag_ack = CEPH_MSGR_TAG_ACK; +static char tag_keepalive = CEPH_MSGR_TAG_KEEPALIVE; +static char tag_keepalive2 = CEPH_MSGR_TAG_KEEPALIVE2; + +/* + * If @buf is NULL, discard up to @len bytes.
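+ *
+ * A NULL @buf maps to MSG_TRUNC below: the kernel drops the bytes
+ * without copying them anywhere.  The read path uses this to skip
+ * data it no longer wants, e.g. the remainder of a revoked
+ * incoming message.  A sketch:
+ *
+ *	ret = ceph_tcp_recvmsg(con->sock, NULL, to_skip);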
+ */ +static int ceph_tcp_recvmsg(struct socket *sock, void *buf, size_t len) +{ + struct kvec iov = {buf, len}; + struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL }; + int r; + + if (!buf) + msg.msg_flags |= MSG_TRUNC; + + iov_iter_kvec(&msg.msg_iter, READ, &iov, 1, len); + r = sock_recvmsg(sock, &msg, msg.msg_flags); + if (r == -EAGAIN) + r = 0; + return r; +} + +static int ceph_tcp_recvpage(struct socket *sock, struct page *page, + int page_offset, size_t length) +{ + struct bio_vec bvec = { + .bv_page = page, + .bv_offset = page_offset, + .bv_len = length + }; + struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL }; + int r; + + BUG_ON(page_offset + length > PAGE_SIZE); + iov_iter_bvec(&msg.msg_iter, READ, &bvec, 1, length); + r = sock_recvmsg(sock, &msg, msg.msg_flags); + if (r == -EAGAIN) + r = 0; + return r; +} + +/* + * write something. @more is true if caller will be sending more data + * shortly. + */ +static int ceph_tcp_sendmsg(struct socket *sock, struct kvec *iov, + size_t kvlen, size_t len, bool more) +{ + struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL }; + int r; + + if (more) + msg.msg_flags |= MSG_MORE; + else + msg.msg_flags |= MSG_EOR; /* superfluous, but what the hell */ + + r = kernel_sendmsg(sock, &msg, iov, kvlen, len); + if (r == -EAGAIN) + r = 0; + return r; +} + +/* + * @more: either or both of MSG_MORE and MSG_SENDPAGE_NOTLAST + */ +static int ceph_tcp_sendpage(struct socket *sock, struct page *page, + int offset, size_t size, int more) +{ + ssize_t (*sendpage)(struct socket *sock, struct page *page, + int offset, size_t size, int flags); + int flags = MSG_DONTWAIT | MSG_NOSIGNAL | more; + int ret; + + /* + * sendpage cannot properly handle pages with page_count == 0, + * we need to fall back to sendmsg if that's the case. + * + * Same goes for slab pages: skb_can_coalesce() allows + * coalescing neighboring slab objects into a single frag which + * triggers one of hardened usercopy checks. + */ + if (sendpage_ok(page)) + sendpage = sock->ops->sendpage; + else + sendpage = sock_no_sendpage; + + ret = sendpage(sock, page, offset, size, flags); + if (ret == -EAGAIN) + ret = 0; + + return ret; +} + +static void con_out_kvec_reset(struct ceph_connection *con) +{ + BUG_ON(con->v1.out_skip); + + con->v1.out_kvec_left = 0; + con->v1.out_kvec_bytes = 0; + con->v1.out_kvec_cur = &con->v1.out_kvec[0]; +} + +static void con_out_kvec_add(struct ceph_connection *con, + size_t size, void *data) +{ + int index = con->v1.out_kvec_left; + + BUG_ON(con->v1.out_skip); + BUG_ON(index >= ARRAY_SIZE(con->v1.out_kvec)); + + con->v1.out_kvec[index].iov_len = size; + con->v1.out_kvec[index].iov_base = data; + con->v1.out_kvec_left++; + con->v1.out_kvec_bytes += size; +} + +/* + * Chop off a kvec from the end. Return residual number of bytes for + * that kvec, i.e. how many bytes would have been written if the kvec + * hadn't been nuked. + */ +static int con_out_kvec_skip(struct ceph_connection *con) +{ + int skip = 0; + + if (con->v1.out_kvec_bytes > 0) { + skip = con->v1.out_kvec_cur[con->v1.out_kvec_left - 1].iov_len; + BUG_ON(con->v1.out_kvec_bytes < skip); + BUG_ON(!con->v1.out_kvec_left); + con->v1.out_kvec_bytes -= skip; + con->v1.out_kvec_left--; + } + + return skip; +} + +static size_t sizeof_footer(struct ceph_connection *con) +{ + return (con->peer_features & CEPH_FEATURE_MSG_AUTH) ? 
+ sizeof(struct ceph_msg_footer) : + sizeof(struct ceph_msg_footer_old); +} + +static void prepare_message_data(struct ceph_msg *msg, u32 data_len) +{ + /* Initialize data cursor */ + + ceph_msg_data_cursor_init(&msg->cursor, msg, data_len); +} + +/* + * Prepare footer for currently outgoing message, and finish things + * off. Assumes out_kvec* are already valid.. we just add on to the end. + */ +static void prepare_write_message_footer(struct ceph_connection *con) +{ + struct ceph_msg *m = con->out_msg; + + m->footer.flags |= CEPH_MSG_FOOTER_COMPLETE; + + dout("prepare_write_message_footer %p\n", con); + con_out_kvec_add(con, sizeof_footer(con), &m->footer); + if (con->peer_features & CEPH_FEATURE_MSG_AUTH) { + if (con->ops->sign_message) + con->ops->sign_message(m); + else + m->footer.sig = 0; + } else { + m->old_footer.flags = m->footer.flags; + } + con->v1.out_more = m->more_to_follow; + con->v1.out_msg_done = true; +} + +/* + * Prepare headers for the next outgoing message. + */ +static void prepare_write_message(struct ceph_connection *con) +{ + struct ceph_msg *m; + u32 crc; + + con_out_kvec_reset(con); + con->v1.out_msg_done = false; + + /* Sneak an ack in there first? If we can get it into the same + * TCP packet that's a good thing. */ + if (con->in_seq > con->in_seq_acked) { + con->in_seq_acked = con->in_seq; + con_out_kvec_add(con, sizeof (tag_ack), &tag_ack); + con->v1.out_temp_ack = cpu_to_le64(con->in_seq_acked); + con_out_kvec_add(con, sizeof(con->v1.out_temp_ack), + &con->v1.out_temp_ack); + } + + ceph_con_get_out_msg(con); + m = con->out_msg; + + dout("prepare_write_message %p seq %lld type %d len %d+%d+%zd\n", + m, con->out_seq, le16_to_cpu(m->hdr.type), + le32_to_cpu(m->hdr.front_len), le32_to_cpu(m->hdr.middle_len), + m->data_length); + WARN_ON(m->front.iov_len != le32_to_cpu(m->hdr.front_len)); + WARN_ON(m->data_length != le32_to_cpu(m->hdr.data_len)); + + /* tag + hdr + front + middle */ + con_out_kvec_add(con, sizeof (tag_msg), &tag_msg); + con_out_kvec_add(con, sizeof(con->v1.out_hdr), &con->v1.out_hdr); + con_out_kvec_add(con, m->front.iov_len, m->front.iov_base); + + if (m->middle) + con_out_kvec_add(con, m->middle->vec.iov_len, + m->middle->vec.iov_base); + + /* fill in hdr crc and finalize hdr */ + crc = crc32c(0, &m->hdr, offsetof(struct ceph_msg_header, crc)); + con->out_msg->hdr.crc = cpu_to_le32(crc); + memcpy(&con->v1.out_hdr, &con->out_msg->hdr, sizeof(con->v1.out_hdr)); + + /* fill in front and middle crc, footer */ + crc = crc32c(0, m->front.iov_base, m->front.iov_len); + con->out_msg->footer.front_crc = cpu_to_le32(crc); + if (m->middle) { + crc = crc32c(0, m->middle->vec.iov_base, + m->middle->vec.iov_len); + con->out_msg->footer.middle_crc = cpu_to_le32(crc); + } else + con->out_msg->footer.middle_crc = 0; + dout("%s front_crc %u middle_crc %u\n", __func__, + le32_to_cpu(con->out_msg->footer.front_crc), + le32_to_cpu(con->out_msg->footer.middle_crc)); + con->out_msg->footer.flags = 0; + + /* is there a data payload? */ + con->out_msg->footer.data_crc = 0; + if (m->data_length) { + prepare_message_data(con->out_msg, m->data_length); + con->v1.out_more = 1; /* data + footer will follow */ + } else { + /* no, queue up footer too and be done */ + prepare_write_message_footer(con); + } + + ceph_con_flag_set(con, CEPH_CON_F_WRITE_PENDING); +} + +/* + * Prepare an ack. 
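+ *
+ * On the wire this is just the tag byte followed by the highest
+ * message seq received so far:
+ *
+ *	[CEPH_MSGR_TAG_ACK][__le64 in_seq_acked]
+ *
+ * The peer uses it to drop acked entries from its out_sent list
+ * (see ceph_con_discard_sent()).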
+ */ +static void prepare_write_ack(struct ceph_connection *con) +{ + dout("prepare_write_ack %p %llu -> %llu\n", con, + con->in_seq_acked, con->in_seq); + con->in_seq_acked = con->in_seq; + + con_out_kvec_reset(con); + + con_out_kvec_add(con, sizeof (tag_ack), &tag_ack); + + con->v1.out_temp_ack = cpu_to_le64(con->in_seq_acked); + con_out_kvec_add(con, sizeof(con->v1.out_temp_ack), + &con->v1.out_temp_ack); + + con->v1.out_more = 1; /* more will follow.. eventually.. */ + ceph_con_flag_set(con, CEPH_CON_F_WRITE_PENDING); +} + +/* + * Prepare to share the seq during handshake + */ +static void prepare_write_seq(struct ceph_connection *con) +{ + dout("prepare_write_seq %p %llu -> %llu\n", con, + con->in_seq_acked, con->in_seq); + con->in_seq_acked = con->in_seq; + + con_out_kvec_reset(con); + + con->v1.out_temp_ack = cpu_to_le64(con->in_seq_acked); + con_out_kvec_add(con, sizeof(con->v1.out_temp_ack), + &con->v1.out_temp_ack); + + ceph_con_flag_set(con, CEPH_CON_F_WRITE_PENDING); +} + +/* + * Prepare to write keepalive byte. + */ +static void prepare_write_keepalive(struct ceph_connection *con) +{ + dout("prepare_write_keepalive %p\n", con); + con_out_kvec_reset(con); + if (con->peer_features & CEPH_FEATURE_MSGR_KEEPALIVE2) { + struct timespec64 now; + + ktime_get_real_ts64(&now); + con_out_kvec_add(con, sizeof(tag_keepalive2), &tag_keepalive2); + ceph_encode_timespec64(&con->v1.out_temp_keepalive2, &now); + con_out_kvec_add(con, sizeof(con->v1.out_temp_keepalive2), + &con->v1.out_temp_keepalive2); + } else { + con_out_kvec_add(con, sizeof(tag_keepalive), &tag_keepalive); + } + ceph_con_flag_set(con, CEPH_CON_F_WRITE_PENDING); +} + +/* + * Connection negotiation. + */ + +static int get_connect_authorizer(struct ceph_connection *con) +{ + struct ceph_auth_handshake *auth; + int auth_proto; + + if (!con->ops->get_authorizer) { + con->v1.auth = NULL; + con->v1.out_connect.authorizer_protocol = CEPH_AUTH_UNKNOWN; + con->v1.out_connect.authorizer_len = 0; + return 0; + } + + auth = con->ops->get_authorizer(con, &auth_proto, con->v1.auth_retry); + if (IS_ERR(auth)) + return PTR_ERR(auth); + + con->v1.auth = auth; + con->v1.out_connect.authorizer_protocol = cpu_to_le32(auth_proto); + con->v1.out_connect.authorizer_len = + cpu_to_le32(auth->authorizer_buf_len); + return 0; +} + +/* + * We connected to a peer and are saying hello. 
+ */ +static void prepare_write_banner(struct ceph_connection *con) +{ + con_out_kvec_add(con, strlen(CEPH_BANNER), CEPH_BANNER); + con_out_kvec_add(con, sizeof (con->msgr->my_enc_addr), + &con->msgr->my_enc_addr); + + con->v1.out_more = 0; + ceph_con_flag_set(con, CEPH_CON_F_WRITE_PENDING); +} + +static void __prepare_write_connect(struct ceph_connection *con) +{ + con_out_kvec_add(con, sizeof(con->v1.out_connect), + &con->v1.out_connect); + if (con->v1.auth) + con_out_kvec_add(con, con->v1.auth->authorizer_buf_len, + con->v1.auth->authorizer_buf); + + con->v1.out_more = 0; + ceph_con_flag_set(con, CEPH_CON_F_WRITE_PENDING); +} + +static int prepare_write_connect(struct ceph_connection *con) +{ + unsigned int global_seq = ceph_get_global_seq(con->msgr, 0); + int proto; + int ret; + + switch (con->peer_name.type) { + case CEPH_ENTITY_TYPE_MON: + proto = CEPH_MONC_PROTOCOL; + break; + case CEPH_ENTITY_TYPE_OSD: + proto = CEPH_OSDC_PROTOCOL; + break; + case CEPH_ENTITY_TYPE_MDS: + proto = CEPH_MDSC_PROTOCOL; + break; + default: + BUG(); + } + + dout("prepare_write_connect %p cseq=%d gseq=%d proto=%d\n", con, + con->v1.connect_seq, global_seq, proto); + + con->v1.out_connect.features = + cpu_to_le64(from_msgr(con->msgr)->supported_features); + con->v1.out_connect.host_type = cpu_to_le32(CEPH_ENTITY_TYPE_CLIENT); + con->v1.out_connect.connect_seq = cpu_to_le32(con->v1.connect_seq); + con->v1.out_connect.global_seq = cpu_to_le32(global_seq); + con->v1.out_connect.protocol_version = cpu_to_le32(proto); + con->v1.out_connect.flags = 0; + + ret = get_connect_authorizer(con); + if (ret) + return ret; + + __prepare_write_connect(con); + return 0; +} + +/* + * write as much of pending kvecs to the socket as we can. + * 1 -> done + * 0 -> socket full, but more to do + * <0 -> error + */ +static int write_partial_kvec(struct ceph_connection *con) +{ + int ret; + + dout("write_partial_kvec %p %d left\n", con, con->v1.out_kvec_bytes); + while (con->v1.out_kvec_bytes > 0) { + ret = ceph_tcp_sendmsg(con->sock, con->v1.out_kvec_cur, + con->v1.out_kvec_left, + con->v1.out_kvec_bytes, + con->v1.out_more); + if (ret <= 0) + goto out; + con->v1.out_kvec_bytes -= ret; + if (!con->v1.out_kvec_bytes) + break; /* done */ + + /* account for full iov entries consumed */ + while (ret >= con->v1.out_kvec_cur->iov_len) { + BUG_ON(!con->v1.out_kvec_left); + ret -= con->v1.out_kvec_cur->iov_len; + con->v1.out_kvec_cur++; + con->v1.out_kvec_left--; + } + /* and for a partially-consumed entry */ + if (ret) { + con->v1.out_kvec_cur->iov_len -= ret; + con->v1.out_kvec_cur->iov_base += ret; + } + } + con->v1.out_kvec_left = 0; + ret = 1; +out: + dout("write_partial_kvec %p %d left in %d kvecs ret = %d\n", con, + con->v1.out_kvec_bytes, con->v1.out_kvec_left, ret); + return ret; /* done! */ +} + +/* + * Write as much message data payload as we can. If we finish, queue + * up the footer. + * 1 -> done, footer is now queued in out_kvec[]. + * 0 -> socket full, but more to do + * <0 -> error + */ +static int write_partial_message_data(struct ceph_connection *con) +{ + struct ceph_msg *msg = con->out_msg; + struct ceph_msg_data_cursor *cursor = &msg->cursor; + bool do_datacrc = !ceph_test_opt(from_msgr(con->msgr), NOCRC); + int more = MSG_MORE | MSG_SENDPAGE_NOTLAST; + u32 crc; + + dout("%s %p msg %p\n", __func__, con, msg); + + if (!msg->num_data_items) + return -EINVAL; + + /* + * Iterate through each page that contains data to be + * written, and send as much as possible for each. 
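After a short send, write_partial_kvec() must skip the entries that went out in full and trim the one that was cut short. A standalone sketch of that bookkeeping over a plain iovec array; the helper name and calling convention are hypothetical:

#include <stddef.h>
#include <sys/uio.h>

static void iov_consume(struct iovec **cur, int *left, size_t sent)
{
    /* whole entries first */
    while (sent && sent >= (*cur)->iov_len) {
        sent -= (*cur)->iov_len;
        (*cur)++;
        (*left)--;
    }
    /* then the partially-written entry, if any */
    if (sent) {
        (*cur)->iov_base = (char *)(*cur)->iov_base + sent;
        (*cur)->iov_len -= sent;
    }
}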
+ * + * If we are calculating the data crc (the default), we will + * need to map the page. If we have no pages, they have + * been revoked, so use the zero page. + */ + crc = do_datacrc ? le32_to_cpu(msg->footer.data_crc) : 0; + while (cursor->total_resid) { + struct page *page; + size_t page_offset; + size_t length; + int ret; + + if (!cursor->resid) { + ceph_msg_data_advance(cursor, 0); + continue; + } + + page = ceph_msg_data_next(cursor, &page_offset, &length, NULL); + if (length == cursor->total_resid) + more = MSG_MORE; + ret = ceph_tcp_sendpage(con->sock, page, page_offset, length, + more); + if (ret <= 0) { + if (do_datacrc) + msg->footer.data_crc = cpu_to_le32(crc); + + return ret; + } + if (do_datacrc && cursor->need_crc) + crc = ceph_crc32c_page(crc, page, page_offset, length); + ceph_msg_data_advance(cursor, (size_t)ret); + } + + dout("%s %p msg %p done\n", __func__, con, msg); + + /* prepare and queue up footer, too */ + if (do_datacrc) + msg->footer.data_crc = cpu_to_le32(crc); + else + msg->footer.flags |= CEPH_MSG_FOOTER_NOCRC; + con_out_kvec_reset(con); + prepare_write_message_footer(con); + + return 1; /* must return > 0 to indicate success */ +} + +/* + * write some zeros + */ +static int write_partial_skip(struct ceph_connection *con) +{ + int more = MSG_MORE | MSG_SENDPAGE_NOTLAST; + int ret; + + dout("%s %p %d left\n", __func__, con, con->v1.out_skip); + while (con->v1.out_skip > 0) { + size_t size = min(con->v1.out_skip, (int)PAGE_SIZE); + + if (size == con->v1.out_skip) + more = MSG_MORE; + ret = ceph_tcp_sendpage(con->sock, ceph_zero_page, 0, size, + more); + if (ret <= 0) + goto out; + con->v1.out_skip -= ret; + } + ret = 1; +out: + return ret; +} + +/* + * Prepare to read connection handshake, or an ack. + */ +static void prepare_read_banner(struct ceph_connection *con) +{ + dout("prepare_read_banner %p\n", con); + con->v1.in_base_pos = 0; +} + +static void prepare_read_connect(struct ceph_connection *con) +{ + dout("prepare_read_connect %p\n", con); + con->v1.in_base_pos = 0; +} + +static void prepare_read_ack(struct ceph_connection *con) +{ + dout("prepare_read_ack %p\n", con); + con->v1.in_base_pos = 0; +} + +static void prepare_read_seq(struct ceph_connection *con) +{ + dout("prepare_read_seq %p\n", con); + con->v1.in_base_pos = 0; + con->v1.in_tag = CEPH_MSGR_TAG_SEQ; +} + +static void prepare_read_tag(struct ceph_connection *con) +{ + dout("prepare_read_tag %p\n", con); + con->v1.in_base_pos = 0; + con->v1.in_tag = CEPH_MSGR_TAG_READY; +} + +static void prepare_read_keepalive_ack(struct ceph_connection *con) +{ + dout("prepare_read_keepalive_ack %p\n", con); + con->v1.in_base_pos = 0; +} + +/* + * Prepare to read a message. 
+ */ +static int prepare_read_message(struct ceph_connection *con) +{ + dout("prepare_read_message %p\n", con); + BUG_ON(con->in_msg != NULL); + con->v1.in_base_pos = 0; + con->in_front_crc = con->in_middle_crc = con->in_data_crc = 0; + return 0; +} + +static int read_partial(struct ceph_connection *con, + int end, int size, void *object) +{ + while (con->v1.in_base_pos < end) { + int left = end - con->v1.in_base_pos; + int have = size - left; + int ret = ceph_tcp_recvmsg(con->sock, object + have, left); + if (ret <= 0) + return ret; + con->v1.in_base_pos += ret; + } + return 1; +} + +/* + * Read all or part of the connect-side handshake on a new connection + */ +static int read_partial_banner(struct ceph_connection *con) +{ + int size; + int end; + int ret; + + dout("read_partial_banner %p at %d\n", con, con->v1.in_base_pos); + + /* peer's banner */ + size = strlen(CEPH_BANNER); + end = size; + ret = read_partial(con, end, size, con->v1.in_banner); + if (ret <= 0) + goto out; + + size = sizeof(con->v1.actual_peer_addr); + end += size; + ret = read_partial(con, end, size, &con->v1.actual_peer_addr); + if (ret <= 0) + goto out; + ceph_decode_banner_addr(&con->v1.actual_peer_addr); + + size = sizeof(con->v1.peer_addr_for_me); + end += size; + ret = read_partial(con, end, size, &con->v1.peer_addr_for_me); + if (ret <= 0) + goto out; + ceph_decode_banner_addr(&con->v1.peer_addr_for_me); + +out: + return ret; +} + +static int read_partial_connect(struct ceph_connection *con) +{ + int size; + int end; + int ret; + + dout("read_partial_connect %p at %d\n", con, con->v1.in_base_pos); + + size = sizeof(con->v1.in_reply); + end = size; + ret = read_partial(con, end, size, &con->v1.in_reply); + if (ret <= 0) + goto out; + + if (con->v1.auth) { + size = le32_to_cpu(con->v1.in_reply.authorizer_len); + if (size > con->v1.auth->authorizer_reply_buf_len) { + pr_err("authorizer reply too big: %d > %zu\n", size, + con->v1.auth->authorizer_reply_buf_len); + ret = -EINVAL; + goto out; + } + + end += size; + ret = read_partial(con, end, size, + con->v1.auth->authorizer_reply_buf); + if (ret <= 0) + goto out; + } + + dout("read_partial_connect %p tag %d, con_seq = %u, g_seq = %u\n", + con, con->v1.in_reply.tag, + le32_to_cpu(con->v1.in_reply.connect_seq), + le32_to_cpu(con->v1.in_reply.global_seq)); +out: + return ret; +} + +/* + * Verify the hello banner looks okay. + */ +static int verify_hello(struct ceph_connection *con) +{ + if (memcmp(con->v1.in_banner, CEPH_BANNER, strlen(CEPH_BANNER))) { + pr_err("connect to %s got bad banner\n", + ceph_pr_addr(&con->peer_addr)); + con->error_msg = "protocol error, bad banner"; + return -1; + } + return 0; +} + +static int process_banner(struct ceph_connection *con) +{ + struct ceph_entity_addr *my_addr = &con->msgr->inst.addr; + + dout("process_banner on %p\n", con); + + if (verify_hello(con) < 0) + return -1; + + /* + * Make sure the other end is who we wanted. note that the other + * end may not yet know their ip address, so if it's 0.0.0.0, give + * them the benefit of the doubt. 
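read_partial() keys everything off one stream position: end is where this object finishes in the overall byte stream and size is the object's size, so size - left is how much of the object has already arrived. That makes the read resumable after a 0 ("socket empty") return. A sketch under the same conventions, with a hypothetical read callback:

typedef int (*read_fn)(void *buf, int len);  /* >0 read, 0 wait, <0 error */

static int read_partial_demo(int *pos, int end, int size, void *obj,
                             read_fn rd)
{
    while (*pos < end) {
        int left = end - *pos;          /* bytes of obj still missing */
        int have = size - left;         /* offset already filled */
        int ret = rd((char *)obj + have, left);

        if (ret <= 0)
            return ret;                 /* 0: retry later, <0: error */
        *pos += ret;
    }
    return 1;                           /* object complete */
}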
+ */
+ if (memcmp(&con->peer_addr, &con->v1.actual_peer_addr,
+ sizeof(con->peer_addr)) != 0 &&
+ !(ceph_addr_is_blank(&con->v1.actual_peer_addr) &&
+ con->v1.actual_peer_addr.nonce == con->peer_addr.nonce)) {
+ pr_warn("wrong peer, want %s/%u, got %s/%u\n",
+ ceph_pr_addr(&con->peer_addr),
+ le32_to_cpu(con->peer_addr.nonce),
+ ceph_pr_addr(&con->v1.actual_peer_addr),
+ le32_to_cpu(con->v1.actual_peer_addr.nonce));
+ con->error_msg = "wrong peer at address";
+ return -1;
+ }
+
+ /*
+ * did we learn our address?
+ */
+ if (ceph_addr_is_blank(my_addr)) {
+ memcpy(&my_addr->in_addr,
+ &con->v1.peer_addr_for_me.in_addr,
+ sizeof(con->v1.peer_addr_for_me.in_addr));
+ ceph_addr_set_port(my_addr, 0);
+ ceph_encode_my_addr(con->msgr);
+ dout("process_banner learned my addr is %s\n",
+ ceph_pr_addr(my_addr));
+ }
+
+ return 0;
+}
+
+static int process_connect(struct ceph_connection *con)
+{
+ u64 sup_feat = from_msgr(con->msgr)->supported_features;
+ u64 req_feat = from_msgr(con->msgr)->required_features;
+ u64 server_feat = le64_to_cpu(con->v1.in_reply.features);
+ int ret;
+
+ dout("process_connect on %p tag %d\n", con, con->v1.in_tag);
+
+ if (con->v1.auth) {
+ int len = le32_to_cpu(con->v1.in_reply.authorizer_len);
+
+ /*
+ * Any connection that defines ->get_authorizer()
+ * should also define ->add_authorizer_challenge() and
+ * ->verify_authorizer_reply().
+ *
+ * See get_connect_authorizer().
+ */
+ if (con->v1.in_reply.tag ==
+ CEPH_MSGR_TAG_CHALLENGE_AUTHORIZER) {
+ ret = con->ops->add_authorizer_challenge(
+ con, con->v1.auth->authorizer_reply_buf, len);
+ if (ret < 0)
+ return ret;
+
+ con_out_kvec_reset(con);
+ __prepare_write_connect(con);
+ prepare_read_connect(con);
+ return 0;
+ }
+
+ if (len) {
+ ret = con->ops->verify_authorizer_reply(con);
+ if (ret < 0) {
+ con->error_msg = "bad authorize reply";
+ return ret;
+ }
+ }
+ }
+
+ switch (con->v1.in_reply.tag) {
+ case CEPH_MSGR_TAG_FEATURES:
+ pr_err("%s%lld %s feature set mismatch,"
+ " my %llx < server's %llx, missing %llx\n",
+ ENTITY_NAME(con->peer_name),
+ ceph_pr_addr(&con->peer_addr),
+ sup_feat, server_feat, server_feat & ~sup_feat);
+ con->error_msg = "missing required protocol features";
+ return -1;
+
+ case CEPH_MSGR_TAG_BADPROTOVER:
+ pr_err("%s%lld %s protocol version mismatch,"
+ " my %d != server's %d\n",
+ ENTITY_NAME(con->peer_name),
+ ceph_pr_addr(&con->peer_addr),
+ le32_to_cpu(con->v1.out_connect.protocol_version),
+ le32_to_cpu(con->v1.in_reply.protocol_version));
+ con->error_msg = "protocol version mismatch";
+ return -1;
+
+ case CEPH_MSGR_TAG_BADAUTHORIZER:
+ con->v1.auth_retry++;
+ dout("process_connect %p got BADAUTHORIZER attempt %d\n", con,
+ con->v1.auth_retry);
+ if (con->v1.auth_retry == 2) {
+ con->error_msg = "connect authorization failure";
+ return -1;
+ }
+ con_out_kvec_reset(con);
+ ret = prepare_write_connect(con);
+ if (ret < 0)
+ return ret;
+ prepare_read_connect(con);
+ break;
+
+ case CEPH_MSGR_TAG_RESETSESSION:
+ /*
+ * If we connected with a large connect_seq but the peer
+ * has no record of a session with us (no connection, or
+ * connect_seq == 0), they will send RESETSESSION to indicate
+ * that they must have reset their session, and may have
+ * dropped messages.
+ */ + dout("process_connect got RESET peer seq %u\n", + le32_to_cpu(con->v1.in_reply.connect_seq)); + pr_info("%s%lld %s session reset\n", + ENTITY_NAME(con->peer_name), + ceph_pr_addr(&con->peer_addr)); + ceph_con_reset_session(con); + con_out_kvec_reset(con); + ret = prepare_write_connect(con); + if (ret < 0) + return ret; + prepare_read_connect(con); + + /* Tell ceph about it. */ + mutex_unlock(&con->mutex); + if (con->ops->peer_reset) + con->ops->peer_reset(con); + mutex_lock(&con->mutex); + if (con->state != CEPH_CON_S_V1_CONNECT_MSG) + return -EAGAIN; + break; + + case CEPH_MSGR_TAG_RETRY_SESSION: + /* + * If we sent a smaller connect_seq than the peer has, try + * again with a larger value. + */ + dout("process_connect got RETRY_SESSION my seq %u, peer %u\n", + le32_to_cpu(con->v1.out_connect.connect_seq), + le32_to_cpu(con->v1.in_reply.connect_seq)); + con->v1.connect_seq = le32_to_cpu(con->v1.in_reply.connect_seq); + con_out_kvec_reset(con); + ret = prepare_write_connect(con); + if (ret < 0) + return ret; + prepare_read_connect(con); + break; + + case CEPH_MSGR_TAG_RETRY_GLOBAL: + /* + * If we sent a smaller global_seq than the peer has, try + * again with a larger value. + */ + dout("process_connect got RETRY_GLOBAL my %u peer_gseq %u\n", + con->v1.peer_global_seq, + le32_to_cpu(con->v1.in_reply.global_seq)); + ceph_get_global_seq(con->msgr, + le32_to_cpu(con->v1.in_reply.global_seq)); + con_out_kvec_reset(con); + ret = prepare_write_connect(con); + if (ret < 0) + return ret; + prepare_read_connect(con); + break; + + case CEPH_MSGR_TAG_SEQ: + case CEPH_MSGR_TAG_READY: + if (req_feat & ~server_feat) { + pr_err("%s%lld %s protocol feature mismatch," + " my required %llx > server's %llx, need %llx\n", + ENTITY_NAME(con->peer_name), + ceph_pr_addr(&con->peer_addr), + req_feat, server_feat, req_feat & ~server_feat); + con->error_msg = "missing required protocol features"; + return -1; + } + + WARN_ON(con->state != CEPH_CON_S_V1_CONNECT_MSG); + con->state = CEPH_CON_S_OPEN; + con->v1.auth_retry = 0; /* we authenticated; clear flag */ + con->v1.peer_global_seq = + le32_to_cpu(con->v1.in_reply.global_seq); + con->v1.connect_seq++; + con->peer_features = server_feat; + dout("process_connect got READY gseq %d cseq %d (%d)\n", + con->v1.peer_global_seq, + le32_to_cpu(con->v1.in_reply.connect_seq), + con->v1.connect_seq); + WARN_ON(con->v1.connect_seq != + le32_to_cpu(con->v1.in_reply.connect_seq)); + + if (con->v1.in_reply.flags & CEPH_MSG_CONNECT_LOSSY) + ceph_con_flag_set(con, CEPH_CON_F_LOSSYTX); + + con->delay = 0; /* reset backoff memory */ + + if (con->v1.in_reply.tag == CEPH_MSGR_TAG_SEQ) { + prepare_write_seq(con); + prepare_read_seq(con); + } else { + prepare_read_tag(con); + } + break; + + case CEPH_MSGR_TAG_WAIT: + /* + * If there is a connection race (we are opening + * connections to each other), one of us may just have + * to WAIT. This shouldn't happen if we are the + * client. + */ + con->error_msg = "protocol error, got WAIT as client"; + return -1; + + default: + con->error_msg = "protocol error, garbage tag during connect"; + return -1; + } + return 0; +} + +/* + * read (part of) an ack + */ +static int read_partial_ack(struct ceph_connection *con) +{ + int size = sizeof(con->v1.in_temp_ack); + int end = size; + + return read_partial(con, end, size, &con->v1.in_temp_ack); +} + +/* + * We can finally discard anything that's been acked. 
+ */ +static void process_ack(struct ceph_connection *con) +{ + u64 ack = le64_to_cpu(con->v1.in_temp_ack); + + if (con->v1.in_tag == CEPH_MSGR_TAG_ACK) + ceph_con_discard_sent(con, ack); + else + ceph_con_discard_requeued(con, ack); + + prepare_read_tag(con); +} + +static int read_partial_message_section(struct ceph_connection *con, + struct kvec *section, + unsigned int sec_len, u32 *crc) +{ + int ret, left; + + BUG_ON(!section); + + while (section->iov_len < sec_len) { + BUG_ON(section->iov_base == NULL); + left = sec_len - section->iov_len; + ret = ceph_tcp_recvmsg(con->sock, (char *)section->iov_base + + section->iov_len, left); + if (ret <= 0) + return ret; + section->iov_len += ret; + } + if (section->iov_len == sec_len) + *crc = crc32c(0, section->iov_base, section->iov_len); + + return 1; +} + +static int read_partial_msg_data(struct ceph_connection *con) +{ + struct ceph_msg *msg = con->in_msg; + struct ceph_msg_data_cursor *cursor = &msg->cursor; + bool do_datacrc = !ceph_test_opt(from_msgr(con->msgr), NOCRC); + struct page *page; + size_t page_offset; + size_t length; + u32 crc = 0; + int ret; + + if (!msg->num_data_items) + return -EIO; + + if (do_datacrc) + crc = con->in_data_crc; + while (cursor->total_resid) { + if (!cursor->resid) { + ceph_msg_data_advance(cursor, 0); + continue; + } + + page = ceph_msg_data_next(cursor, &page_offset, &length, NULL); + ret = ceph_tcp_recvpage(con->sock, page, page_offset, length); + if (ret <= 0) { + if (do_datacrc) + con->in_data_crc = crc; + + return ret; + } + + if (do_datacrc) + crc = ceph_crc32c_page(crc, page, page_offset, ret); + ceph_msg_data_advance(cursor, (size_t)ret); + } + if (do_datacrc) + con->in_data_crc = crc; + + return 1; /* must return > 0 to indicate success */ +} + +/* + * read (part of) a message. + */ +static int read_partial_message(struct ceph_connection *con) +{ + struct ceph_msg *m = con->in_msg; + int size; + int end; + int ret; + unsigned int front_len, middle_len, data_len; + bool do_datacrc = !ceph_test_opt(from_msgr(con->msgr), NOCRC); + bool need_sign = (con->peer_features & CEPH_FEATURE_MSG_AUTH); + u64 seq; + u32 crc; + + dout("read_partial_message con %p msg %p\n", con, m); + + /* header */ + size = sizeof(con->v1.in_hdr); + end = size; + ret = read_partial(con, end, size, &con->v1.in_hdr); + if (ret <= 0) + return ret; + + crc = crc32c(0, &con->v1.in_hdr, offsetof(struct ceph_msg_header, crc)); + if (cpu_to_le32(crc) != con->v1.in_hdr.crc) { + pr_err("read_partial_message bad hdr crc %u != expected %u\n", + crc, con->v1.in_hdr.crc); + return -EBADMSG; + } + + front_len = le32_to_cpu(con->v1.in_hdr.front_len); + if (front_len > CEPH_MSG_MAX_FRONT_LEN) + return -EIO; + middle_len = le32_to_cpu(con->v1.in_hdr.middle_len); + if (middle_len > CEPH_MSG_MAX_MIDDLE_LEN) + return -EIO; + data_len = le32_to_cpu(con->v1.in_hdr.data_len); + if (data_len > CEPH_MSG_MAX_DATA_LEN) + return -EIO; + + /* verify seq# */ + seq = le64_to_cpu(con->v1.in_hdr.seq); + if ((s64)seq - (s64)con->in_seq < 1) { + pr_info("skipping %s%lld %s seq %lld expected %lld\n", + ENTITY_NAME(con->peer_name), + ceph_pr_addr(&con->peer_addr), + seq, con->in_seq + 1); + con->v1.in_base_pos = -front_len - middle_len - data_len - + sizeof_footer(con); + con->v1.in_tag = CEPH_MSGR_TAG_READY; + return 1; + } else if ((s64)seq - (s64)con->in_seq > 1) { + pr_err("read_partial_message bad seq %lld expected %lld\n", + seq, con->in_seq + 1); + con->error_msg = "bad message sequence # for incoming message"; + return -EBADE; + } + + /* allocate message? 
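The seq verification above classifies an incoming message by the signed 64-bit difference from the last delivered sequence number: a difference of at most 0 is a duplicate to skip, exactly 1 is the expected next message, and anything larger means messages were lost. A compact sketch of that decision:

#include <stdint.h>

enum seq_verdict { SEQ_DUP, SEQ_OK, SEQ_GAP };

static enum seq_verdict classify_seq(uint64_t seq, uint64_t last_delivered)
{
    int64_t d = (int64_t)seq - (int64_t)last_delivered;

    if (d < 1)
        return SEQ_DUP;   /* resent after reconnect; consume and drop */
    if (d > 1)
        return SEQ_GAP;   /* hole in the stream; fail the connection */
    return SEQ_OK;        /* seq == last_delivered + 1 */
}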
*/ + if (!con->in_msg) { + int skip = 0; + + dout("got hdr type %d front %d data %d\n", con->v1.in_hdr.type, + front_len, data_len); + ret = ceph_con_in_msg_alloc(con, &con->v1.in_hdr, &skip); + if (ret < 0) + return ret; + + BUG_ON((!con->in_msg) ^ skip); + if (skip) { + /* skip this message */ + dout("alloc_msg said skip message\n"); + con->v1.in_base_pos = -front_len - middle_len - + data_len - sizeof_footer(con); + con->v1.in_tag = CEPH_MSGR_TAG_READY; + con->in_seq++; + return 1; + } + + BUG_ON(!con->in_msg); + BUG_ON(con->in_msg->con != con); + m = con->in_msg; + m->front.iov_len = 0; /* haven't read it yet */ + if (m->middle) + m->middle->vec.iov_len = 0; + + /* prepare for data payload, if any */ + + if (data_len) + prepare_message_data(con->in_msg, data_len); + } + + /* front */ + ret = read_partial_message_section(con, &m->front, front_len, + &con->in_front_crc); + if (ret <= 0) + return ret; + + /* middle */ + if (m->middle) { + ret = read_partial_message_section(con, &m->middle->vec, + middle_len, + &con->in_middle_crc); + if (ret <= 0) + return ret; + } + + /* (page) data */ + if (data_len) { + ret = read_partial_msg_data(con); + if (ret <= 0) + return ret; + } + + /* footer */ + size = sizeof_footer(con); + end += size; + ret = read_partial(con, end, size, &m->footer); + if (ret <= 0) + return ret; + + if (!need_sign) { + m->footer.flags = m->old_footer.flags; + m->footer.sig = 0; + } + + dout("read_partial_message got msg %p %d (%u) + %d (%u) + %d (%u)\n", + m, front_len, m->footer.front_crc, middle_len, + m->footer.middle_crc, data_len, m->footer.data_crc); + + /* crc ok? */ + if (con->in_front_crc != le32_to_cpu(m->footer.front_crc)) { + pr_err("read_partial_message %p front crc %u != exp. %u\n", + m, con->in_front_crc, m->footer.front_crc); + return -EBADMSG; + } + if (con->in_middle_crc != le32_to_cpu(m->footer.middle_crc)) { + pr_err("read_partial_message %p middle crc %u != exp %u\n", + m, con->in_middle_crc, m->footer.middle_crc); + return -EBADMSG; + } + if (do_datacrc && + (m->footer.flags & CEPH_MSG_FOOTER_NOCRC) == 0 && + con->in_data_crc != le32_to_cpu(m->footer.data_crc)) { + pr_err("read_partial_message %p data crc %u != exp. %u\n", m, + con->in_data_crc, le32_to_cpu(m->footer.data_crc)); + return -EBADMSG; + } + + if (need_sign && con->ops->check_message_signature && + con->ops->check_message_signature(m)) { + pr_err("read_partial_message %p signature check failed\n", m); + return -EBADMSG; + } + + return 1; /* done! */ +} + +static int read_keepalive_ack(struct ceph_connection *con) +{ + struct ceph_timespec ceph_ts; + size_t size = sizeof(ceph_ts); + int ret = read_partial(con, size, size, &ceph_ts); + if (ret <= 0) + return ret; + ceph_decode_timespec64(&con->last_keepalive_ack, &ceph_ts); + prepare_read_tag(con); + return 1; +} + +/* + * Read what we can from the socket. + */ +int ceph_con_v1_try_read(struct ceph_connection *con) +{ + int ret = -1; + +more: + dout("try_read start %p state %d\n", con, con->state); + if (con->state != CEPH_CON_S_V1_BANNER && + con->state != CEPH_CON_S_V1_CONNECT_MSG && + con->state != CEPH_CON_S_OPEN) + return 0; + + BUG_ON(!con->sock); + + dout("try_read tag %d in_base_pos %d\n", con->v1.in_tag, + con->v1.in_base_pos); + + if (con->state == CEPH_CON_S_V1_BANNER) { + ret = read_partial_banner(con); + if (ret <= 0) + goto out; + ret = process_banner(con); + if (ret < 0) + goto out; + + con->state = CEPH_CON_S_V1_CONNECT_MSG; + + /* + * Received banner is good, exchange connection info. 
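When a message is skipped, in_base_pos is set to minus the number of on-wire bytes still unread, and the try_read loop later drains that deficit (via MSG_TRUNC reads into a NULL buffer) until the position climbs back to zero. A sketch of the convention, with a hypothetical discard callback:

static int drain_skip(int *in_base_pos, int (*discard)(int len))
{
    while (*in_base_pos < 0) {
        int ret = discard(-*in_base_pos);  /* read and throw away */

        if (ret <= 0)
            return ret;                    /* 0: wait, <0: error */
        *in_base_pos += ret;               /* counts up toward 0 */
    }
    return 1;
}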
+ * Do not reset out_kvec, as sending our banner raced + * with receiving peer banner after connect completed. + */ + ret = prepare_write_connect(con); + if (ret < 0) + goto out; + prepare_read_connect(con); + + /* Send connection info before awaiting response */ + goto out; + } + + if (con->state == CEPH_CON_S_V1_CONNECT_MSG) { + ret = read_partial_connect(con); + if (ret <= 0) + goto out; + ret = process_connect(con); + if (ret < 0) + goto out; + goto more; + } + + WARN_ON(con->state != CEPH_CON_S_OPEN); + + if (con->v1.in_base_pos < 0) { + /* + * skipping + discarding content. + */ + ret = ceph_tcp_recvmsg(con->sock, NULL, -con->v1.in_base_pos); + if (ret <= 0) + goto out; + dout("skipped %d / %d bytes\n", ret, -con->v1.in_base_pos); + con->v1.in_base_pos += ret; + if (con->v1.in_base_pos) + goto more; + } + if (con->v1.in_tag == CEPH_MSGR_TAG_READY) { + /* + * what's next? + */ + ret = ceph_tcp_recvmsg(con->sock, &con->v1.in_tag, 1); + if (ret <= 0) + goto out; + dout("try_read got tag %d\n", con->v1.in_tag); + switch (con->v1.in_tag) { + case CEPH_MSGR_TAG_MSG: + prepare_read_message(con); + break; + case CEPH_MSGR_TAG_ACK: + prepare_read_ack(con); + break; + case CEPH_MSGR_TAG_KEEPALIVE2_ACK: + prepare_read_keepalive_ack(con); + break; + case CEPH_MSGR_TAG_CLOSE: + ceph_con_close_socket(con); + con->state = CEPH_CON_S_CLOSED; + goto out; + default: + goto bad_tag; + } + } + if (con->v1.in_tag == CEPH_MSGR_TAG_MSG) { + ret = read_partial_message(con); + if (ret <= 0) { + switch (ret) { + case -EBADMSG: + con->error_msg = "bad crc/signature"; + fallthrough; + case -EBADE: + ret = -EIO; + break; + case -EIO: + con->error_msg = "io error"; + break; + } + goto out; + } + if (con->v1.in_tag == CEPH_MSGR_TAG_READY) + goto more; + ceph_con_process_message(con); + if (con->state == CEPH_CON_S_OPEN) + prepare_read_tag(con); + goto more; + } + if (con->v1.in_tag == CEPH_MSGR_TAG_ACK || + con->v1.in_tag == CEPH_MSGR_TAG_SEQ) { + /* + * the final handshake seq exchange is semantically + * equivalent to an ACK + */ + ret = read_partial_ack(con); + if (ret <= 0) + goto out; + process_ack(con); + goto more; + } + if (con->v1.in_tag == CEPH_MSGR_TAG_KEEPALIVE2_ACK) { + ret = read_keepalive_ack(con); + if (ret <= 0) + goto out; + goto more; + } + +out: + dout("try_read done on %p ret %d\n", con, ret); + return ret; + +bad_tag: + pr_err("try_read bad tag %d\n", con->v1.in_tag); + con->error_msg = "protocol error, garbage tag"; + ret = -1; + goto out; +} + +/* + * Write something to the socket. Called in a worker thread when the + * socket appears to be writeable and we have something ready to send. + */ +int ceph_con_v1_try_write(struct ceph_connection *con) +{ + int ret = 1; + + dout("try_write start %p state %d\n", con, con->state); + if (con->state != CEPH_CON_S_PREOPEN && + con->state != CEPH_CON_S_V1_BANNER && + con->state != CEPH_CON_S_V1_CONNECT_MSG && + con->state != CEPH_CON_S_OPEN) + return 0; + + /* open the socket first? */ + if (con->state == CEPH_CON_S_PREOPEN) { + BUG_ON(con->sock); + con->state = CEPH_CON_S_V1_BANNER; + + con_out_kvec_reset(con); + prepare_write_banner(con); + prepare_read_banner(con); + + BUG_ON(con->in_msg); + con->v1.in_tag = CEPH_MSGR_TAG_READY; + dout("try_write initiating connect on %p new state %d\n", + con, con->state); + ret = ceph_tcp_connect(con); + if (ret < 0) { + con->error_msg = "connect error"; + goto out; + } + } + +more: + dout("try_write out_kvec_bytes %d\n", con->v1.out_kvec_bytes); + BUG_ON(!con->sock); + + /* kvec data queued? 
*/ + if (con->v1.out_kvec_left) { + ret = write_partial_kvec(con); + if (ret <= 0) + goto out; + } + if (con->v1.out_skip) { + ret = write_partial_skip(con); + if (ret <= 0) + goto out; + } + + /* msg pages? */ + if (con->out_msg) { + if (con->v1.out_msg_done) { + ceph_msg_put(con->out_msg); + con->out_msg = NULL; /* we're done with this one */ + goto do_next; + } + + ret = write_partial_message_data(con); + if (ret == 1) + goto more; /* we need to send the footer, too! */ + if (ret == 0) + goto out; + if (ret < 0) { + dout("try_write write_partial_message_data err %d\n", + ret); + goto out; + } + } + +do_next: + if (con->state == CEPH_CON_S_OPEN) { + if (ceph_con_flag_test_and_clear(con, + CEPH_CON_F_KEEPALIVE_PENDING)) { + prepare_write_keepalive(con); + goto more; + } + /* is anything else pending? */ + if (!list_empty(&con->out_queue)) { + prepare_write_message(con); + goto more; + } + if (con->in_seq > con->in_seq_acked) { + prepare_write_ack(con); + goto more; + } + } + + /* Nothing to do! */ + ceph_con_flag_clear(con, CEPH_CON_F_WRITE_PENDING); + dout("try_write nothing else to write.\n"); + ret = 0; +out: + dout("try_write done on %p ret %d\n", con, ret); + return ret; +} + +void ceph_con_v1_revoke(struct ceph_connection *con) +{ + struct ceph_msg *msg = con->out_msg; + + WARN_ON(con->v1.out_skip); + /* footer */ + if (con->v1.out_msg_done) { + con->v1.out_skip += con_out_kvec_skip(con); + } else { + WARN_ON(!msg->data_length); + con->v1.out_skip += sizeof_footer(con); + } + /* data, middle, front */ + if (msg->data_length) + con->v1.out_skip += msg->cursor.total_resid; + if (msg->middle) + con->v1.out_skip += con_out_kvec_skip(con); + con->v1.out_skip += con_out_kvec_skip(con); + + dout("%s con %p out_kvec_bytes %d out_skip %d\n", __func__, con, + con->v1.out_kvec_bytes, con->v1.out_skip); +} + +void ceph_con_v1_revoke_incoming(struct ceph_connection *con) +{ + unsigned int front_len = le32_to_cpu(con->v1.in_hdr.front_len); + unsigned int middle_len = le32_to_cpu(con->v1.in_hdr.middle_len); + unsigned int data_len = le32_to_cpu(con->v1.in_hdr.data_len); + + /* skip rest of message */ + con->v1.in_base_pos = con->v1.in_base_pos - + sizeof(struct ceph_msg_header) - + front_len - + middle_len - + data_len - + sizeof(struct ceph_msg_footer); + + con->v1.in_tag = CEPH_MSGR_TAG_READY; + con->in_seq++; + + dout("%s con %p in_base_pos %d\n", __func__, con, con->v1.in_base_pos); +} + +bool ceph_con_v1_opened(struct ceph_connection *con) +{ + return con->v1.connect_seq; +} + +void ceph_con_v1_reset_session(struct ceph_connection *con) +{ + con->v1.connect_seq = 0; + con->v1.peer_global_seq = 0; +} + +void ceph_con_v1_reset_protocol(struct ceph_connection *con) +{ + con->v1.out_skip = 0; +} diff --git a/net/ceph/messenger_v2.c b/net/ceph/messenger_v2.c new file mode 100644 index 000000000000..cc40ce4e02fb --- /dev/null +++ b/net/ceph/messenger_v2.c @@ -0,0 +1,3459 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Ceph msgr2 protocol implementation + * + * Copyright (C) 2020 Ilya Dryomov <idryomov@gmail.com> + */ + +#include <linux/ceph/ceph_debug.h> + +#include <crypto/aead.h> +#include <crypto/algapi.h> /* for crypto_memneq() */ +#include <crypto/hash.h> +#include <crypto/sha2.h> +#include <linux/bvec.h> +#include <linux/crc32c.h> +#include <linux/net.h> +#include <linux/scatterlist.h> +#include <linux/socket.h> +#include <linux/sched/mm.h> +#include <net/sock.h> +#include <net/tcp.h> + +#include <linux/ceph/ceph_features.h> +#include <linux/ceph/decode.h> +#include <linux/ceph/libceph.h> 
+#include <linux/ceph/messenger.h> + +#include "crypto.h" /* for CEPH_KEY_LEN and CEPH_MAX_CON_SECRET_LEN */ + +#define FRAME_TAG_HELLO 1 +#define FRAME_TAG_AUTH_REQUEST 2 +#define FRAME_TAG_AUTH_BAD_METHOD 3 +#define FRAME_TAG_AUTH_REPLY_MORE 4 +#define FRAME_TAG_AUTH_REQUEST_MORE 5 +#define FRAME_TAG_AUTH_DONE 6 +#define FRAME_TAG_AUTH_SIGNATURE 7 +#define FRAME_TAG_CLIENT_IDENT 8 +#define FRAME_TAG_SERVER_IDENT 9 +#define FRAME_TAG_IDENT_MISSING_FEATURES 10 +#define FRAME_TAG_SESSION_RECONNECT 11 +#define FRAME_TAG_SESSION_RESET 12 +#define FRAME_TAG_SESSION_RETRY 13 +#define FRAME_TAG_SESSION_RETRY_GLOBAL 14 +#define FRAME_TAG_SESSION_RECONNECT_OK 15 +#define FRAME_TAG_WAIT 16 +#define FRAME_TAG_MESSAGE 17 +#define FRAME_TAG_KEEPALIVE2 18 +#define FRAME_TAG_KEEPALIVE2_ACK 19 +#define FRAME_TAG_ACK 20 + +#define FRAME_LATE_STATUS_ABORTED 0x1 +#define FRAME_LATE_STATUS_COMPLETE 0xe +#define FRAME_LATE_STATUS_ABORTED_MASK 0xf + +#define IN_S_HANDLE_PREAMBLE 1 +#define IN_S_HANDLE_CONTROL 2 +#define IN_S_HANDLE_CONTROL_REMAINDER 3 +#define IN_S_PREPARE_READ_DATA 4 +#define IN_S_PREPARE_READ_DATA_CONT 5 +#define IN_S_HANDLE_EPILOGUE 6 +#define IN_S_FINISH_SKIP 7 + +#define OUT_S_QUEUE_DATA 1 +#define OUT_S_QUEUE_DATA_CONT 2 +#define OUT_S_QUEUE_ENC_PAGE 3 +#define OUT_S_QUEUE_ZEROS 4 +#define OUT_S_FINISH_MESSAGE 5 +#define OUT_S_GET_NEXT 6 + +#define CTRL_BODY(p) ((void *)(p) + CEPH_PREAMBLE_LEN) +#define FRONT_PAD(p) ((void *)(p) + CEPH_EPILOGUE_SECURE_LEN) +#define MIDDLE_PAD(p) (FRONT_PAD(p) + CEPH_GCM_BLOCK_LEN) +#define DATA_PAD(p) (MIDDLE_PAD(p) + CEPH_GCM_BLOCK_LEN) + +#define CEPH_MSG_FLAGS (MSG_DONTWAIT | MSG_NOSIGNAL) + +static int do_recvmsg(struct socket *sock, struct iov_iter *it) +{ + struct msghdr msg = { .msg_flags = CEPH_MSG_FLAGS }; + int ret; + + msg.msg_iter = *it; + while (iov_iter_count(it)) { + ret = sock_recvmsg(sock, &msg, msg.msg_flags); + if (ret <= 0) { + if (ret == -EAGAIN) + ret = 0; + return ret; + } + + iov_iter_advance(it, ret); + } + + WARN_ON(msg_data_left(&msg)); + return 1; +} + +/* + * Read as much as possible. + * + * Return: + * 1 - done, nothing (else) to read + * 0 - socket is empty, need to wait + * <0 - error + */ +static int ceph_tcp_recv(struct ceph_connection *con) +{ + int ret; + + dout("%s con %p %s %zu\n", __func__, con, + iov_iter_is_discard(&con->v2.in_iter) ? "discard" : "need", + iov_iter_count(&con->v2.in_iter)); + ret = do_recvmsg(con->sock, &con->v2.in_iter); + dout("%s con %p ret %d left %zu\n", __func__, con, ret, + iov_iter_count(&con->v2.in_iter)); + return ret; +} + +static int do_sendmsg(struct socket *sock, struct iov_iter *it) +{ + struct msghdr msg = { .msg_flags = CEPH_MSG_FLAGS }; + int ret; + + msg.msg_iter = *it; + while (iov_iter_count(it)) { + ret = sock_sendmsg(sock, &msg); + if (ret <= 0) { + if (ret == -EAGAIN) + ret = 0; + return ret; + } + + iov_iter_advance(it, ret); + } + + WARN_ON(msg_data_left(&msg)); + return 1; +} + +static int do_try_sendpage(struct socket *sock, struct iov_iter *it) +{ + struct msghdr msg = { .msg_flags = CEPH_MSG_FLAGS }; + struct bio_vec bv; + int ret; + + if (WARN_ON(!iov_iter_is_bvec(it))) + return -EINVAL; + + while (iov_iter_count(it)) { + /* iov_iter_iovec() for ITER_BVEC */ + bv.bv_page = it->bvec->bv_page; + bv.bv_offset = it->bvec->bv_offset + it->iov_offset; + bv.bv_len = min(iov_iter_count(it), + it->bvec->bv_len - it->iov_offset); + + /* + * sendpage cannot properly handle pages with + * page_count == 0, we need to fall back to sendmsg if + * that's the case. 
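do_recvmsg() and do_sendmsg() share one shape: keep issuing I/O until the iterator is drained, map -EAGAIN to "return 0 and come back later", and return 1 only once everything has moved. A standalone sketch with a simplified iterator standing in for iov_iter:

#include <errno.h>
#include <stddef.h>

struct demo_iter { const char *p; size_t left; };

/* xfer returns bytes transferred or a negative errno */
static int drain(struct demo_iter *it,
                 long (*xfer)(const char *p, size_t len))
{
    while (it->left) {
        long ret = xfer(it->p, it->left);

        if (ret <= 0)
            return ret == -EAGAIN ? 0 : (int)ret;
        it->p += ret;     /* advance past what was transferred */
        it->left -= ret;
    }
    return 1;             /* done, nothing else to transfer */
}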
+ * + * Same goes for slab pages: skb_can_coalesce() allows + * coalescing neighboring slab objects into a single frag + * which triggers one of hardened usercopy checks. + */ + if (sendpage_ok(bv.bv_page)) { + ret = sock->ops->sendpage(sock, bv.bv_page, + bv.bv_offset, bv.bv_len, + CEPH_MSG_FLAGS); + } else { + iov_iter_bvec(&msg.msg_iter, WRITE, &bv, 1, bv.bv_len); + ret = sock_sendmsg(sock, &msg); + } + if (ret <= 0) { + if (ret == -EAGAIN) + ret = 0; + return ret; + } + + iov_iter_advance(it, ret); + } + + return 1; +} + +/* + * Write as much as possible. The socket is expected to be corked, + * so we don't bother with MSG_MORE/MSG_SENDPAGE_NOTLAST here. + * + * Return: + * 1 - done, nothing (else) to write + * 0 - socket is full, need to wait + * <0 - error + */ +static int ceph_tcp_send(struct ceph_connection *con) +{ + int ret; + + dout("%s con %p have %zu try_sendpage %d\n", __func__, con, + iov_iter_count(&con->v2.out_iter), con->v2.out_iter_sendpage); + if (con->v2.out_iter_sendpage) + ret = do_try_sendpage(con->sock, &con->v2.out_iter); + else + ret = do_sendmsg(con->sock, &con->v2.out_iter); + dout("%s con %p ret %d left %zu\n", __func__, con, ret, + iov_iter_count(&con->v2.out_iter)); + return ret; +} + +static void add_in_kvec(struct ceph_connection *con, void *buf, int len) +{ + BUG_ON(con->v2.in_kvec_cnt >= ARRAY_SIZE(con->v2.in_kvecs)); + WARN_ON(!iov_iter_is_kvec(&con->v2.in_iter)); + + con->v2.in_kvecs[con->v2.in_kvec_cnt].iov_base = buf; + con->v2.in_kvecs[con->v2.in_kvec_cnt].iov_len = len; + con->v2.in_kvec_cnt++; + + con->v2.in_iter.nr_segs++; + con->v2.in_iter.count += len; +} + +static void reset_in_kvecs(struct ceph_connection *con) +{ + WARN_ON(iov_iter_count(&con->v2.in_iter)); + + con->v2.in_kvec_cnt = 0; + iov_iter_kvec(&con->v2.in_iter, READ, con->v2.in_kvecs, 0, 0); +} + +static void set_in_bvec(struct ceph_connection *con, const struct bio_vec *bv) +{ + WARN_ON(iov_iter_count(&con->v2.in_iter)); + + con->v2.in_bvec = *bv; + iov_iter_bvec(&con->v2.in_iter, READ, &con->v2.in_bvec, 1, bv->bv_len); +} + +static void set_in_skip(struct ceph_connection *con, int len) +{ + WARN_ON(iov_iter_count(&con->v2.in_iter)); + + dout("%s con %p len %d\n", __func__, con, len); + iov_iter_discard(&con->v2.in_iter, READ, len); +} + +static void add_out_kvec(struct ceph_connection *con, void *buf, int len) +{ + BUG_ON(con->v2.out_kvec_cnt >= ARRAY_SIZE(con->v2.out_kvecs)); + WARN_ON(!iov_iter_is_kvec(&con->v2.out_iter)); + WARN_ON(con->v2.out_zero); + + con->v2.out_kvecs[con->v2.out_kvec_cnt].iov_base = buf; + con->v2.out_kvecs[con->v2.out_kvec_cnt].iov_len = len; + con->v2.out_kvec_cnt++; + + con->v2.out_iter.nr_segs++; + con->v2.out_iter.count += len; +} + +static void reset_out_kvecs(struct ceph_connection *con) +{ + WARN_ON(iov_iter_count(&con->v2.out_iter)); + WARN_ON(con->v2.out_zero); + + con->v2.out_kvec_cnt = 0; + + iov_iter_kvec(&con->v2.out_iter, WRITE, con->v2.out_kvecs, 0, 0); + con->v2.out_iter_sendpage = false; +} + +static void set_out_bvec(struct ceph_connection *con, const struct bio_vec *bv, + bool zerocopy) +{ + WARN_ON(iov_iter_count(&con->v2.out_iter)); + WARN_ON(con->v2.out_zero); + + con->v2.out_bvec = *bv; + con->v2.out_iter_sendpage = zerocopy; + iov_iter_bvec(&con->v2.out_iter, WRITE, &con->v2.out_bvec, 1, + con->v2.out_bvec.bv_len); +} + +static void set_out_bvec_zero(struct ceph_connection *con) +{ + WARN_ON(iov_iter_count(&con->v2.out_iter)); + WARN_ON(!con->v2.out_zero); + + con->v2.out_bvec.bv_page = ceph_zero_page; + con->v2.out_bvec.bv_offset = 
0; + con->v2.out_bvec.bv_len = min(con->v2.out_zero, (int)PAGE_SIZE); + con->v2.out_iter_sendpage = true; + iov_iter_bvec(&con->v2.out_iter, WRITE, &con->v2.out_bvec, 1, + con->v2.out_bvec.bv_len); +} + +static void out_zero_add(struct ceph_connection *con, int len) +{ + dout("%s con %p len %d\n", __func__, con, len); + con->v2.out_zero += len; +} + +static void *alloc_conn_buf(struct ceph_connection *con, int len) +{ + void *buf; + + dout("%s con %p len %d\n", __func__, con, len); + + if (WARN_ON(con->v2.conn_buf_cnt >= ARRAY_SIZE(con->v2.conn_bufs))) + return NULL; + + buf = ceph_kvmalloc(len, GFP_NOIO); + if (!buf) + return NULL; + + con->v2.conn_bufs[con->v2.conn_buf_cnt++] = buf; + return buf; +} + +static void free_conn_bufs(struct ceph_connection *con) +{ + while (con->v2.conn_buf_cnt) + kvfree(con->v2.conn_bufs[--con->v2.conn_buf_cnt]); +} + +static void add_in_sign_kvec(struct ceph_connection *con, void *buf, int len) +{ + BUG_ON(con->v2.in_sign_kvec_cnt >= ARRAY_SIZE(con->v2.in_sign_kvecs)); + + con->v2.in_sign_kvecs[con->v2.in_sign_kvec_cnt].iov_base = buf; + con->v2.in_sign_kvecs[con->v2.in_sign_kvec_cnt].iov_len = len; + con->v2.in_sign_kvec_cnt++; +} + +static void clear_in_sign_kvecs(struct ceph_connection *con) +{ + con->v2.in_sign_kvec_cnt = 0; +} + +static void add_out_sign_kvec(struct ceph_connection *con, void *buf, int len) +{ + BUG_ON(con->v2.out_sign_kvec_cnt >= ARRAY_SIZE(con->v2.out_sign_kvecs)); + + con->v2.out_sign_kvecs[con->v2.out_sign_kvec_cnt].iov_base = buf; + con->v2.out_sign_kvecs[con->v2.out_sign_kvec_cnt].iov_len = len; + con->v2.out_sign_kvec_cnt++; +} + +static void clear_out_sign_kvecs(struct ceph_connection *con) +{ + con->v2.out_sign_kvec_cnt = 0; +} + +static bool con_secure(struct ceph_connection *con) +{ + return con->v2.con_mode == CEPH_CON_MODE_SECURE; +} + +static int front_len(const struct ceph_msg *msg) +{ + return le32_to_cpu(msg->hdr.front_len); +} + +static int middle_len(const struct ceph_msg *msg) +{ + return le32_to_cpu(msg->hdr.middle_len); +} + +static int data_len(const struct ceph_msg *msg) +{ + return le32_to_cpu(msg->hdr.data_len); +} + +static bool need_padding(int len) +{ + return !IS_ALIGNED(len, CEPH_GCM_BLOCK_LEN); +} + +static int padded_len(int len) +{ + return ALIGN(len, CEPH_GCM_BLOCK_LEN); +} + +static int padding_len(int len) +{ + return padded_len(len) - len; +} + +/* preamble + control segment */ +static int head_onwire_len(int ctrl_len, bool secure) +{ + int head_len; + int rem_len; + + if (secure) { + head_len = CEPH_PREAMBLE_SECURE_LEN; + if (ctrl_len > CEPH_PREAMBLE_INLINE_LEN) { + rem_len = ctrl_len - CEPH_PREAMBLE_INLINE_LEN; + head_len += padded_len(rem_len) + CEPH_GCM_TAG_LEN; + } + } else { + head_len = CEPH_PREAMBLE_PLAIN_LEN; + if (ctrl_len) + head_len += ctrl_len + CEPH_CRC_LEN; + } + return head_len; +} + +/* front, middle and data segments + epilogue */ +static int __tail_onwire_len(int front_len, int middle_len, int data_len, + bool secure) +{ + if (!front_len && !middle_len && !data_len) + return 0; + + if (!secure) + return front_len + middle_len + data_len + + CEPH_EPILOGUE_PLAIN_LEN; + + return padded_len(front_len) + padded_len(middle_len) + + padded_len(data_len) + CEPH_EPILOGUE_SECURE_LEN; +} + +static int tail_onwire_len(const struct ceph_msg *msg, bool secure) +{ + return __tail_onwire_len(front_len(msg), middle_len(msg), + data_len(msg), secure); +} + +/* head_onwire_len(sizeof(struct ceph_msg_header2), false) */ +#define MESSAGE_HEAD_PLAIN_LEN (CEPH_PREAMBLE_PLAIN_LEN + \ + sizeof(struct 
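The secure-mode length helpers round every segment up to the AES-GCM block size before it is encrypted. A small worked example of the padded_len()/padding_len() math defined above, with the 16-byte block size the code appears to use:

#include <stdio.h>

#define GCM_BLOCK_LEN 16

static int pad_to(int len)   { return (len + GCM_BLOCK_LEN - 1) & ~(GCM_BLOCK_LEN - 1); }
static int pad_need(int len) { return pad_to(len) - len; }

int main(void)
{
    /* a 100-byte front segment occupies 112 padded bytes on the wire */
    printf("100 -> %d (pad %d)\n", pad_to(100), pad_need(100));
    printf("  0 -> %d (pad %d)\n", pad_to(0), pad_need(0));
    printf(" 16 -> %d (pad %d)\n", pad_to(16), pad_need(16));
    return 0;
}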
ceph_msg_header2) + \ + CEPH_CRC_LEN) + +static const int frame_aligns[] = { + sizeof(void *), + sizeof(void *), + sizeof(void *), + PAGE_SIZE +}; + +/* + * Discards trailing empty segments, unless there is just one segment. + * A frame always has at least one (possibly empty) segment. + */ +static int calc_segment_count(const int *lens, int len_cnt) +{ + int i; + + for (i = len_cnt - 1; i >= 0; i--) { + if (lens[i]) + return i + 1; + } + + return 1; +} + +static void init_frame_desc(struct ceph_frame_desc *desc, int tag, + const int *lens, int len_cnt) +{ + int i; + + memset(desc, 0, sizeof(*desc)); + + desc->fd_tag = tag; + desc->fd_seg_cnt = calc_segment_count(lens, len_cnt); + BUG_ON(desc->fd_seg_cnt > CEPH_FRAME_MAX_SEGMENT_COUNT); + for (i = 0; i < desc->fd_seg_cnt; i++) { + desc->fd_lens[i] = lens[i]; + desc->fd_aligns[i] = frame_aligns[i]; + } +} + +/* + * Preamble crc covers everything up to itself (28 bytes) and + * is calculated and verified irrespective of the connection mode + * (i.e. even if the frame is encrypted). + */ +static void encode_preamble(const struct ceph_frame_desc *desc, void *p) +{ + void *crcp = p + CEPH_PREAMBLE_LEN - CEPH_CRC_LEN; + void *start = p; + int i; + + memset(p, 0, CEPH_PREAMBLE_LEN); + + ceph_encode_8(&p, desc->fd_tag); + ceph_encode_8(&p, desc->fd_seg_cnt); + for (i = 0; i < desc->fd_seg_cnt; i++) { + ceph_encode_32(&p, desc->fd_lens[i]); + ceph_encode_16(&p, desc->fd_aligns[i]); + } + + put_unaligned_le32(crc32c(0, start, crcp - start), crcp); +} + +static int decode_preamble(void *p, struct ceph_frame_desc *desc) +{ + void *crcp = p + CEPH_PREAMBLE_LEN - CEPH_CRC_LEN; + u32 crc, expected_crc; + int i; + + crc = crc32c(0, p, crcp - p); + expected_crc = get_unaligned_le32(crcp); + if (crc != expected_crc) { + pr_err("bad preamble crc, calculated %u, expected %u\n", + crc, expected_crc); + return -EBADMSG; + } + + memset(desc, 0, sizeof(*desc)); + + desc->fd_tag = ceph_decode_8(&p); + desc->fd_seg_cnt = ceph_decode_8(&p); + if (desc->fd_seg_cnt < 1 || + desc->fd_seg_cnt > CEPH_FRAME_MAX_SEGMENT_COUNT) { + pr_err("bad segment count %d\n", desc->fd_seg_cnt); + return -EINVAL; + } + for (i = 0; i < desc->fd_seg_cnt; i++) { + desc->fd_lens[i] = ceph_decode_32(&p); + desc->fd_aligns[i] = ceph_decode_16(&p); + } + + /* + * This would fire for FRAME_TAG_WAIT (it has one empty + * segment), but we should never get it as client. + */ + if (!desc->fd_lens[desc->fd_seg_cnt - 1]) { + pr_err("last segment empty\n"); + return -EINVAL; + } + + if (desc->fd_lens[0] > CEPH_MSG_MAX_CONTROL_LEN) { + pr_err("control segment too big %d\n", desc->fd_lens[0]); + return -EINVAL; + } + if (desc->fd_lens[1] > CEPH_MSG_MAX_FRONT_LEN) { + pr_err("front segment too big %d\n", desc->fd_lens[1]); + return -EINVAL; + } + if (desc->fd_lens[2] > CEPH_MSG_MAX_MIDDLE_LEN) { + pr_err("middle segment too big %d\n", desc->fd_lens[2]); + return -EINVAL; + } + if (desc->fd_lens[3] > CEPH_MSG_MAX_DATA_LEN) { + pr_err("data segment too big %d\n", desc->fd_lens[3]); + return -EINVAL; + } + + return 0; +} + +static void encode_epilogue_plain(struct ceph_connection *con, bool aborted) +{ + con->v2.out_epil.late_status = aborted ? 
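encode_preamble() lays a frame preamble out as one tag byte, one segment-count byte, four (le32 length, le16 alignment) pairs, zero padding, and a crc32c over the preceding 28 bytes in the final four bytes. A sketch of that layout; the raw memcpy stores assume a little-endian host, and the CRC routine is passed in rather than redefined here:

#include <stdint.h>
#include <string.h>

#define PREAMBLE_LEN 32
#define CRC_LEN      4

typedef uint32_t (*crc_fn)(uint32_t seed, const void *p, size_t len);

static void encode_preamble_demo(uint8_t buf[PREAMBLE_LEN], uint8_t tag,
                                 const uint32_t *lens, const uint16_t *aligns,
                                 int seg_cnt, crc_fn crc32c)
{
    uint32_t crc;

    memset(buf, 0, PREAMBLE_LEN);
    buf[0] = tag;
    buf[1] = (uint8_t)seg_cnt;                    /* 1..4 */
    for (int i = 0; i < seg_cnt; i++) {
        memcpy(&buf[2 + i * 6], &lens[i], 4);     /* le32 length */
        memcpy(&buf[6 + i * 6], &aligns[i], 2);   /* le16 alignment */
    }
    crc = crc32c(0, buf, PREAMBLE_LEN - CRC_LEN); /* covers bytes 0..27 */
    memcpy(&buf[PREAMBLE_LEN - CRC_LEN], &crc, CRC_LEN);
}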
FRAME_LATE_STATUS_ABORTED : + FRAME_LATE_STATUS_COMPLETE; + cpu_to_le32s(&con->v2.out_epil.front_crc); + cpu_to_le32s(&con->v2.out_epil.middle_crc); + cpu_to_le32s(&con->v2.out_epil.data_crc); +} + +static void encode_epilogue_secure(struct ceph_connection *con, bool aborted) +{ + memset(&con->v2.out_epil, 0, sizeof(con->v2.out_epil)); + con->v2.out_epil.late_status = aborted ? FRAME_LATE_STATUS_ABORTED : + FRAME_LATE_STATUS_COMPLETE; +} + +static int decode_epilogue(void *p, u32 *front_crc, u32 *middle_crc, + u32 *data_crc) +{ + u8 late_status; + + late_status = ceph_decode_8(&p); + if ((late_status & FRAME_LATE_STATUS_ABORTED_MASK) != + FRAME_LATE_STATUS_COMPLETE) { + /* we should never get an aborted message as client */ + pr_err("bad late_status 0x%x\n", late_status); + return -EINVAL; + } + + if (front_crc && middle_crc && data_crc) { + *front_crc = ceph_decode_32(&p); + *middle_crc = ceph_decode_32(&p); + *data_crc = ceph_decode_32(&p); + } + + return 0; +} + +static void fill_header(struct ceph_msg_header *hdr, + const struct ceph_msg_header2 *hdr2, + int front_len, int middle_len, int data_len, + const struct ceph_entity_name *peer_name) +{ + hdr->seq = hdr2->seq; + hdr->tid = hdr2->tid; + hdr->type = hdr2->type; + hdr->priority = hdr2->priority; + hdr->version = hdr2->version; + hdr->front_len = cpu_to_le32(front_len); + hdr->middle_len = cpu_to_le32(middle_len); + hdr->data_len = cpu_to_le32(data_len); + hdr->data_off = hdr2->data_off; + hdr->src = *peer_name; + hdr->compat_version = hdr2->compat_version; + hdr->reserved = 0; + hdr->crc = 0; +} + +static void fill_header2(struct ceph_msg_header2 *hdr2, + const struct ceph_msg_header *hdr, u64 ack_seq) +{ + hdr2->seq = hdr->seq; + hdr2->tid = hdr->tid; + hdr2->type = hdr->type; + hdr2->priority = hdr->priority; + hdr2->version = hdr->version; + hdr2->data_pre_padding_len = 0; + hdr2->data_off = hdr->data_off; + hdr2->ack_seq = cpu_to_le64(ack_seq); + hdr2->flags = 0; + hdr2->compat_version = hdr->compat_version; + hdr2->reserved = 0; +} + +static int verify_control_crc(struct ceph_connection *con) +{ + int ctrl_len = con->v2.in_desc.fd_lens[0]; + u32 crc, expected_crc; + + WARN_ON(con->v2.in_kvecs[0].iov_len != ctrl_len); + WARN_ON(con->v2.in_kvecs[1].iov_len != CEPH_CRC_LEN); + + crc = crc32c(-1, con->v2.in_kvecs[0].iov_base, ctrl_len); + expected_crc = get_unaligned_le32(con->v2.in_kvecs[1].iov_base); + if (crc != expected_crc) { + pr_err("bad control crc, calculated %u, expected %u\n", + crc, expected_crc); + return -EBADMSG; + } + + return 0; +} + +static int verify_epilogue_crcs(struct ceph_connection *con, u32 front_crc, + u32 middle_crc, u32 data_crc) +{ + if (front_len(con->in_msg)) { + con->in_front_crc = crc32c(-1, con->in_msg->front.iov_base, + front_len(con->in_msg)); + } else { + WARN_ON(!middle_len(con->in_msg) && !data_len(con->in_msg)); + con->in_front_crc = -1; + } + + if (middle_len(con->in_msg)) + con->in_middle_crc = crc32c(-1, + con->in_msg->middle->vec.iov_base, + middle_len(con->in_msg)); + else if (data_len(con->in_msg)) + con->in_middle_crc = -1; + else + con->in_middle_crc = 0; + + if (!data_len(con->in_msg)) + con->in_data_crc = 0; + + dout("%s con %p msg %p crcs %u %u %u\n", __func__, con, con->in_msg, + con->in_front_crc, con->in_middle_crc, con->in_data_crc); + + if (con->in_front_crc != front_crc) { + pr_err("bad front crc, calculated %u, expected %u\n", + con->in_front_crc, front_crc); + return -EBADMSG; + } + if (con->in_middle_crc != middle_crc) { + pr_err("bad middle crc, calculated %u, expected 
%u\n", + con->in_middle_crc, middle_crc); + return -EBADMSG; + } + if (con->in_data_crc != data_crc) { + pr_err("bad data crc, calculated %u, expected %u\n", + con->in_data_crc, data_crc); + return -EBADMSG; + } + + return 0; +} + +static int setup_crypto(struct ceph_connection *con, + const u8 *session_key, int session_key_len, + const u8 *con_secret, int con_secret_len) +{ + unsigned int noio_flag; + int ret; + + dout("%s con %p con_mode %d session_key_len %d con_secret_len %d\n", + __func__, con, con->v2.con_mode, session_key_len, con_secret_len); + WARN_ON(con->v2.hmac_tfm || con->v2.gcm_tfm || con->v2.gcm_req); + + if (con->v2.con_mode != CEPH_CON_MODE_CRC && + con->v2.con_mode != CEPH_CON_MODE_SECURE) { + pr_err("bad con_mode %d\n", con->v2.con_mode); + return -EINVAL; + } + + if (!session_key_len) { + WARN_ON(con->v2.con_mode != CEPH_CON_MODE_CRC); + WARN_ON(con_secret_len); + return 0; /* auth_none */ + } + + noio_flag = memalloc_noio_save(); + con->v2.hmac_tfm = crypto_alloc_shash("hmac(sha256)", 0, 0); + memalloc_noio_restore(noio_flag); + if (IS_ERR(con->v2.hmac_tfm)) { + ret = PTR_ERR(con->v2.hmac_tfm); + con->v2.hmac_tfm = NULL; + pr_err("failed to allocate hmac tfm context: %d\n", ret); + return ret; + } + + WARN_ON((unsigned long)session_key & + crypto_shash_alignmask(con->v2.hmac_tfm)); + ret = crypto_shash_setkey(con->v2.hmac_tfm, session_key, + session_key_len); + if (ret) { + pr_err("failed to set hmac key: %d\n", ret); + return ret; + } + + if (con->v2.con_mode == CEPH_CON_MODE_CRC) { + WARN_ON(con_secret_len); + return 0; /* auth_x, plain mode */ + } + + if (con_secret_len < CEPH_GCM_KEY_LEN + 2 * CEPH_GCM_IV_LEN) { + pr_err("con_secret too small %d\n", con_secret_len); + return -EINVAL; + } + + noio_flag = memalloc_noio_save(); + con->v2.gcm_tfm = crypto_alloc_aead("gcm(aes)", 0, 0); + memalloc_noio_restore(noio_flag); + if (IS_ERR(con->v2.gcm_tfm)) { + ret = PTR_ERR(con->v2.gcm_tfm); + con->v2.gcm_tfm = NULL; + pr_err("failed to allocate gcm tfm context: %d\n", ret); + return ret; + } + + WARN_ON((unsigned long)con_secret & + crypto_aead_alignmask(con->v2.gcm_tfm)); + ret = crypto_aead_setkey(con->v2.gcm_tfm, con_secret, CEPH_GCM_KEY_LEN); + if (ret) { + pr_err("failed to set gcm key: %d\n", ret); + return ret; + } + + WARN_ON(crypto_aead_ivsize(con->v2.gcm_tfm) != CEPH_GCM_IV_LEN); + ret = crypto_aead_setauthsize(con->v2.gcm_tfm, CEPH_GCM_TAG_LEN); + if (ret) { + pr_err("failed to set gcm tag size: %d\n", ret); + return ret; + } + + con->v2.gcm_req = aead_request_alloc(con->v2.gcm_tfm, GFP_NOIO); + if (!con->v2.gcm_req) { + pr_err("failed to allocate gcm request\n"); + return -ENOMEM; + } + + crypto_init_wait(&con->v2.gcm_wait); + aead_request_set_callback(con->v2.gcm_req, CRYPTO_TFM_REQ_MAY_BACKLOG, + crypto_req_done, &con->v2.gcm_wait); + + memcpy(&con->v2.in_gcm_nonce, con_secret + CEPH_GCM_KEY_LEN, + CEPH_GCM_IV_LEN); + memcpy(&con->v2.out_gcm_nonce, + con_secret + CEPH_GCM_KEY_LEN + CEPH_GCM_IV_LEN, + CEPH_GCM_IV_LEN); + return 0; /* auth_x, secure mode */ +} + +static int hmac_sha256(struct ceph_connection *con, const struct kvec *kvecs, + int kvec_cnt, u8 *hmac) +{ + SHASH_DESC_ON_STACK(desc, con->v2.hmac_tfm); /* tfm arg is ignored */ + int ret; + int i; + + dout("%s con %p hmac_tfm %p kvec_cnt %d\n", __func__, con, + con->v2.hmac_tfm, kvec_cnt); + + if (!con->v2.hmac_tfm) { + memset(hmac, 0, SHA256_DIGEST_SIZE); + return 0; /* auth_none */ + } + + desc->tfm = con->v2.hmac_tfm; + ret = crypto_shash_init(desc); + if (ret) + goto out; + + for (i = 0; i < 
kvec_cnt; i++) { + WARN_ON((unsigned long)kvecs[i].iov_base & + crypto_shash_alignmask(con->v2.hmac_tfm)); + ret = crypto_shash_update(desc, kvecs[i].iov_base, + kvecs[i].iov_len); + if (ret) + goto out; + } + + ret = crypto_shash_final(desc, hmac); + +out: + shash_desc_zero(desc); + return ret; /* auth_x, both plain and secure modes */ +} + +static void gcm_inc_nonce(struct ceph_gcm_nonce *nonce) +{ + u64 counter; + + counter = le64_to_cpu(nonce->counter); + nonce->counter = cpu_to_le64(counter + 1); +} + +static int gcm_crypt(struct ceph_connection *con, bool encrypt, + struct scatterlist *src, struct scatterlist *dst, + int src_len) +{ + struct ceph_gcm_nonce *nonce; + int ret; + + nonce = encrypt ? &con->v2.out_gcm_nonce : &con->v2.in_gcm_nonce; + + aead_request_set_ad(con->v2.gcm_req, 0); /* no AAD */ + aead_request_set_crypt(con->v2.gcm_req, src, dst, src_len, (u8 *)nonce); + ret = crypto_wait_req(encrypt ? crypto_aead_encrypt(con->v2.gcm_req) : + crypto_aead_decrypt(con->v2.gcm_req), + &con->v2.gcm_wait); + if (ret) + return ret; + + gcm_inc_nonce(nonce); + return 0; +} + +static void get_bvec_at(struct ceph_msg_data_cursor *cursor, + struct bio_vec *bv) +{ + struct page *page; + size_t off, len; + + WARN_ON(!cursor->total_resid); + + /* skip zero-length data items */ + while (!cursor->resid) + ceph_msg_data_advance(cursor, 0); + + /* get a piece of data, cursor isn't advanced */ + page = ceph_msg_data_next(cursor, &off, &len, NULL); + + bv->bv_page = page; + bv->bv_offset = off; + bv->bv_len = len; +} + +static int calc_sg_cnt(void *buf, int buf_len) +{ + int sg_cnt; + + if (!buf_len) + return 0; + + sg_cnt = need_padding(buf_len) ? 1 : 0; + if (is_vmalloc_addr(buf)) { + WARN_ON(offset_in_page(buf)); + sg_cnt += PAGE_ALIGN(buf_len) >> PAGE_SHIFT; + } else { + sg_cnt++; + } + + return sg_cnt; +} + +static int calc_sg_cnt_cursor(struct ceph_msg_data_cursor *cursor) +{ + int data_len = cursor->total_resid; + struct bio_vec bv; + int sg_cnt; + + if (!data_len) + return 0; + + sg_cnt = need_padding(data_len) ? 
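gcm_inc_nonce() bumps the 64-bit little-endian counter inside the 96-bit GCM nonce after every successful AEAD operation, which is what guarantees a nonce is never reused under the same key. A sketch with an assumed salt-plus-counter layout matching the split the code reads out of con_secret:

#include <stdint.h>

struct gcm_nonce {
    uint32_t fixed;        /* connection-lifetime salt */
    uint64_t counter_le;   /* little-endian, bumped per frame */
} __attribute__((packed));

static uint64_t le64_swap(uint64_t v)
{
#if defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
    return __builtin_bswap64(v);   /* to/from LE on big-endian hosts */
#else
    return v;
#endif
}

static void nonce_inc(struct gcm_nonce *n)
{
    /* le64_swap() is its own inverse, so it converts both ways */
    n->counter_le = le64_swap(le64_swap(n->counter_le) + 1);
}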
1 : 0; + do { + get_bvec_at(cursor, &bv); + sg_cnt++; + + ceph_msg_data_advance(cursor, bv.bv_len); + } while (cursor->total_resid); + + return sg_cnt; +} + +static void init_sgs(struct scatterlist **sg, void *buf, int buf_len, u8 *pad) +{ + void *end = buf + buf_len; + struct page *page; + int len; + void *p; + + if (!buf_len) + return; + + if (is_vmalloc_addr(buf)) { + p = buf; + do { + page = vmalloc_to_page(p); + len = min_t(int, end - p, PAGE_SIZE); + WARN_ON(!page || !len || offset_in_page(p)); + sg_set_page(*sg, page, len, 0); + *sg = sg_next(*sg); + p += len; + } while (p != end); + } else { + sg_set_buf(*sg, buf, buf_len); + *sg = sg_next(*sg); + } + + if (need_padding(buf_len)) { + sg_set_buf(*sg, pad, padding_len(buf_len)); + *sg = sg_next(*sg); + } +} + +static void init_sgs_cursor(struct scatterlist **sg, + struct ceph_msg_data_cursor *cursor, u8 *pad) +{ + int data_len = cursor->total_resid; + struct bio_vec bv; + + if (!data_len) + return; + + do { + get_bvec_at(cursor, &bv); + sg_set_page(*sg, bv.bv_page, bv.bv_len, bv.bv_offset); + *sg = sg_next(*sg); + + ceph_msg_data_advance(cursor, bv.bv_len); + } while (cursor->total_resid); + + if (need_padding(data_len)) { + sg_set_buf(*sg, pad, padding_len(data_len)); + *sg = sg_next(*sg); + } +} + +static int setup_message_sgs(struct sg_table *sgt, struct ceph_msg *msg, + u8 *front_pad, u8 *middle_pad, u8 *data_pad, + void *epilogue, bool add_tag) +{ + struct ceph_msg_data_cursor cursor; + struct scatterlist *cur_sg; + int sg_cnt; + int ret; + + if (!front_len(msg) && !middle_len(msg) && !data_len(msg)) + return 0; + + sg_cnt = 1; /* epilogue + [auth tag] */ + if (front_len(msg)) + sg_cnt += calc_sg_cnt(msg->front.iov_base, + front_len(msg)); + if (middle_len(msg)) + sg_cnt += calc_sg_cnt(msg->middle->vec.iov_base, + middle_len(msg)); + if (data_len(msg)) { + ceph_msg_data_cursor_init(&cursor, msg, data_len(msg)); + sg_cnt += calc_sg_cnt_cursor(&cursor); + } + + ret = sg_alloc_table(sgt, sg_cnt, GFP_NOIO); + if (ret) + return ret; + + cur_sg = sgt->sgl; + if (front_len(msg)) + init_sgs(&cur_sg, msg->front.iov_base, front_len(msg), + front_pad); + if (middle_len(msg)) + init_sgs(&cur_sg, msg->middle->vec.iov_base, middle_len(msg), + middle_pad); + if (data_len(msg)) { + ceph_msg_data_cursor_init(&cursor, msg, data_len(msg)); + init_sgs_cursor(&cur_sg, &cursor, data_pad); + } + + WARN_ON(!sg_is_last(cur_sg)); + sg_set_buf(cur_sg, epilogue, + CEPH_GCM_BLOCK_LEN + (add_tag ? 
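calc_sg_cnt() sizes the scatterlist: a physically contiguous buffer takes one entry, a vmalloc'ed buffer takes one entry per page, and a length that is not a multiple of the GCM block size takes one more entry for the zero padding appended before encryption. A standalone sketch of that accounting, with assumed page and block sizes:

#include <stdbool.h>

#define GCM_BLOCK_LEN 16
#define PAGE_SZ       4096

static int sg_entries(int buf_len, bool is_vmalloc)
{
    int cnt;

    if (!buf_len)
        return 0;
    cnt = (buf_len % GCM_BLOCK_LEN) ? 1 : 0;          /* padding entry */
    if (is_vmalloc)
        cnt += (buf_len + PAGE_SZ - 1) / PAGE_SZ;     /* one per page */
    else
        cnt += 1;                                     /* single run */
    return cnt;
}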
CEPH_GCM_TAG_LEN : 0)); + return 0; +} + +static int decrypt_preamble(struct ceph_connection *con) +{ + struct scatterlist sg; + + sg_init_one(&sg, con->v2.in_buf, CEPH_PREAMBLE_SECURE_LEN); + return gcm_crypt(con, false, &sg, &sg, CEPH_PREAMBLE_SECURE_LEN); +} + +static int decrypt_control_remainder(struct ceph_connection *con) +{ + int ctrl_len = con->v2.in_desc.fd_lens[0]; + int rem_len = ctrl_len - CEPH_PREAMBLE_INLINE_LEN; + int pt_len = padding_len(rem_len) + CEPH_GCM_TAG_LEN; + struct scatterlist sgs[2]; + + WARN_ON(con->v2.in_kvecs[0].iov_len != rem_len); + WARN_ON(con->v2.in_kvecs[1].iov_len != pt_len); + + sg_init_table(sgs, 2); + sg_set_buf(&sgs[0], con->v2.in_kvecs[0].iov_base, rem_len); + sg_set_buf(&sgs[1], con->v2.in_buf, pt_len); + + return gcm_crypt(con, false, sgs, sgs, + padded_len(rem_len) + CEPH_GCM_TAG_LEN); +} + +static int decrypt_message(struct ceph_connection *con) +{ + struct sg_table sgt = {}; + int ret; + + ret = setup_message_sgs(&sgt, con->in_msg, FRONT_PAD(con->v2.in_buf), + MIDDLE_PAD(con->v2.in_buf), DATA_PAD(con->v2.in_buf), + con->v2.in_buf, true); + if (ret) + goto out; + + ret = gcm_crypt(con, false, sgt.sgl, sgt.sgl, + tail_onwire_len(con->in_msg, true)); + +out: + sg_free_table(&sgt); + return ret; +} + +static int prepare_banner(struct ceph_connection *con) +{ + int buf_len = CEPH_BANNER_V2_LEN + 2 + 8 + 8; + void *buf, *p; + + buf = alloc_conn_buf(con, buf_len); + if (!buf) + return -ENOMEM; + + p = buf; + ceph_encode_copy(&p, CEPH_BANNER_V2, CEPH_BANNER_V2_LEN); + ceph_encode_16(&p, sizeof(u64) + sizeof(u64)); + ceph_encode_64(&p, CEPH_MSGR2_SUPPORTED_FEATURES); + ceph_encode_64(&p, CEPH_MSGR2_REQUIRED_FEATURES); + WARN_ON(p != buf + buf_len); + + add_out_kvec(con, buf, buf_len); + add_out_sign_kvec(con, buf, buf_len); + ceph_con_flag_set(con, CEPH_CON_F_WRITE_PENDING); + return 0; +} + +/* + * base: + * preamble + * control body (ctrl_len bytes) + * space for control crc + * + * extdata (optional): + * control body (extdata_len bytes) + * + * Compute control crc and gather base and extdata into: + * + * preamble + * control body (ctrl_len + extdata_len bytes) + * control crc + * + * Preamble should already be encoded at the start of base. + */ +static void prepare_head_plain(struct ceph_connection *con, void *base, + int ctrl_len, void *extdata, int extdata_len, + bool to_be_signed) +{ + int base_len = CEPH_PREAMBLE_LEN + ctrl_len + CEPH_CRC_LEN; + void *crcp = base + base_len - CEPH_CRC_LEN; + u32 crc; + + crc = crc32c(-1, CTRL_BODY(base), ctrl_len); + if (extdata_len) + crc = crc32c(crc, extdata, extdata_len); + put_unaligned_le32(crc, crcp); + + if (!extdata_len) { + add_out_kvec(con, base, base_len); + if (to_be_signed) + add_out_sign_kvec(con, base, base_len); + return; + } + + add_out_kvec(con, base, crcp - base); + add_out_kvec(con, extdata, extdata_len); + add_out_kvec(con, crcp, CEPH_CRC_LEN); + if (to_be_signed) { + add_out_sign_kvec(con, base, crcp - base); + add_out_sign_kvec(con, extdata, extdata_len); + add_out_sign_kvec(con, crcp, CEPH_CRC_LEN); + } +} + +static int prepare_head_secure_small(struct ceph_connection *con, + void *base, int ctrl_len) +{ + struct scatterlist sg; + int ret; + + /* inline buffer padding? 
*/ + if (ctrl_len < CEPH_PREAMBLE_INLINE_LEN) + memset(CTRL_BODY(base) + ctrl_len, 0, + CEPH_PREAMBLE_INLINE_LEN - ctrl_len); + + sg_init_one(&sg, base, CEPH_PREAMBLE_SECURE_LEN); + ret = gcm_crypt(con, true, &sg, &sg, + CEPH_PREAMBLE_SECURE_LEN - CEPH_GCM_TAG_LEN); + if (ret) + return ret; + + add_out_kvec(con, base, CEPH_PREAMBLE_SECURE_LEN); + return 0; +} + +/* + * base: + * preamble + * control body (ctrl_len bytes) + * space for padding, if needed + * space for control remainder auth tag + * space for preamble auth tag + * + * Encrypt preamble and the inline portion, then encrypt the remainder + * and gather into: + * + * preamble + * control body (48 bytes) + * preamble auth tag + * control body (ctrl_len - 48 bytes) + * zero padding, if needed + * control remainder auth tag + * + * Preamble should already be encoded at the start of base. + */ +static int prepare_head_secure_big(struct ceph_connection *con, + void *base, int ctrl_len) +{ + int rem_len = ctrl_len - CEPH_PREAMBLE_INLINE_LEN; + void *rem = CTRL_BODY(base) + CEPH_PREAMBLE_INLINE_LEN; + void *rem_tag = rem + padded_len(rem_len); + void *pmbl_tag = rem_tag + CEPH_GCM_TAG_LEN; + struct scatterlist sgs[2]; + int ret; + + sg_init_table(sgs, 2); + sg_set_buf(&sgs[0], base, rem - base); + sg_set_buf(&sgs[1], pmbl_tag, CEPH_GCM_TAG_LEN); + ret = gcm_crypt(con, true, sgs, sgs, rem - base); + if (ret) + return ret; + + /* control remainder padding? */ + if (need_padding(rem_len)) + memset(rem + rem_len, 0, padding_len(rem_len)); + + sg_init_one(&sgs[0], rem, pmbl_tag - rem); + ret = gcm_crypt(con, true, sgs, sgs, rem_tag - rem); + if (ret) + return ret; + + add_out_kvec(con, base, rem - base); + add_out_kvec(con, pmbl_tag, CEPH_GCM_TAG_LEN); + add_out_kvec(con, rem, pmbl_tag - rem); + return 0; +} + +static int __prepare_control(struct ceph_connection *con, int tag, + void *base, int ctrl_len, void *extdata, + int extdata_len, bool to_be_signed) +{ + int total_len = ctrl_len + extdata_len; + struct ceph_frame_desc desc; + int ret; + + dout("%s con %p tag %d len %d (%d+%d)\n", __func__, con, tag, + total_len, ctrl_len, extdata_len); + + /* extdata may be vmalloc'ed but not base */ + if (WARN_ON(is_vmalloc_addr(base) || !ctrl_len)) + return -EINVAL; + + init_frame_desc(&desc, tag, &total_len, 1); + encode_preamble(&desc, base); + + if (con_secure(con)) { + if (WARN_ON(extdata_len || to_be_signed)) + return -EINVAL; + + if (ctrl_len <= CEPH_PREAMBLE_INLINE_LEN) + /* fully inlined, inline buffer may need padding */ + ret = prepare_head_secure_small(con, base, ctrl_len); + else + /* partially inlined, inline buffer is full */ + ret = prepare_head_secure_big(con, base, ctrl_len); + if (ret) + return ret; + } else { + prepare_head_plain(con, base, ctrl_len, extdata, extdata_len, + to_be_signed); + } + + ceph_con_flag_set(con, CEPH_CON_F_WRITE_PENDING); + return 0; +} + +static int prepare_control(struct ceph_connection *con, int tag, + void *base, int ctrl_len) +{ + return __prepare_control(con, tag, base, ctrl_len, NULL, 0, false); +} + +static int prepare_hello(struct ceph_connection *con) +{ + void *buf, *p; + int ctrl_len; + + ctrl_len = 1 + ceph_entity_addr_encoding_len(&con->peer_addr); + buf = alloc_conn_buf(con, head_onwire_len(ctrl_len, false)); + if (!buf) + return -ENOMEM; + + p = CTRL_BODY(buf); + ceph_encode_8(&p, CEPH_ENTITY_TYPE_CLIENT); + ceph_encode_entity_addr(&p, &con->peer_addr); + WARN_ON(p != CTRL_BODY(buf) + ctrl_len); + + return __prepare_control(con, FRAME_TAG_HELLO, buf, ctrl_len, + NULL, 0, true); +} + +/* so 
that head_onwire_len(AUTH_BUF_LEN, false) is 512 */ +#define AUTH_BUF_LEN (512 - CEPH_CRC_LEN - CEPH_PREAMBLE_PLAIN_LEN) + +static int prepare_auth_request(struct ceph_connection *con) +{ + void *authorizer, *authorizer_copy; + int ctrl_len, authorizer_len; + void *buf; + int ret; + + ctrl_len = AUTH_BUF_LEN; + buf = alloc_conn_buf(con, head_onwire_len(ctrl_len, false)); + if (!buf) + return -ENOMEM; + + mutex_unlock(&con->mutex); + ret = con->ops->get_auth_request(con, CTRL_BODY(buf), &ctrl_len, + &authorizer, &authorizer_len); + mutex_lock(&con->mutex); + if (con->state != CEPH_CON_S_V2_HELLO) { + dout("%s con %p state changed to %d\n", __func__, con, + con->state); + return -EAGAIN; + } + + dout("%s con %p get_auth_request ret %d\n", __func__, con, ret); + if (ret) + return ret; + + authorizer_copy = alloc_conn_buf(con, authorizer_len); + if (!authorizer_copy) + return -ENOMEM; + + memcpy(authorizer_copy, authorizer, authorizer_len); + + return __prepare_control(con, FRAME_TAG_AUTH_REQUEST, buf, ctrl_len, + authorizer_copy, authorizer_len, true); +} + +static int prepare_auth_request_more(struct ceph_connection *con, + void *reply, int reply_len) +{ + int ctrl_len, authorizer_len; + void *authorizer; + void *buf; + int ret; + + ctrl_len = AUTH_BUF_LEN; + buf = alloc_conn_buf(con, head_onwire_len(ctrl_len, false)); + if (!buf) + return -ENOMEM; + + mutex_unlock(&con->mutex); + ret = con->ops->handle_auth_reply_more(con, reply, reply_len, + CTRL_BODY(buf), &ctrl_len, + &authorizer, &authorizer_len); + mutex_lock(&con->mutex); + if (con->state != CEPH_CON_S_V2_AUTH) { + dout("%s con %p state changed to %d\n", __func__, con, + con->state); + return -EAGAIN; + } + + dout("%s con %p handle_auth_reply_more ret %d\n", __func__, con, ret); + if (ret) + return ret; + + return __prepare_control(con, FRAME_TAG_AUTH_REQUEST_MORE, buf, + ctrl_len, authorizer, authorizer_len, true); +} + +static int prepare_auth_signature(struct ceph_connection *con) +{ + void *buf; + int ret; + + buf = alloc_conn_buf(con, head_onwire_len(SHA256_DIGEST_SIZE, + con_secure(con))); + if (!buf) + return -ENOMEM; + + ret = hmac_sha256(con, con->v2.in_sign_kvecs, con->v2.in_sign_kvec_cnt, + CTRL_BODY(buf)); + if (ret) + return ret; + + return prepare_control(con, FRAME_TAG_AUTH_SIGNATURE, buf, + SHA256_DIGEST_SIZE); +} + +static int prepare_client_ident(struct ceph_connection *con) +{ + struct ceph_entity_addr *my_addr = &con->msgr->inst.addr; + struct ceph_client *client = from_msgr(con->msgr); + u64 global_id = ceph_client_gid(client); + void *buf, *p; + int ctrl_len; + + WARN_ON(con->v2.server_cookie); + WARN_ON(con->v2.connect_seq); + WARN_ON(con->v2.peer_global_seq); + + if (!con->v2.client_cookie) { + do { + get_random_bytes(&con->v2.client_cookie, + sizeof(con->v2.client_cookie)); + } while (!con->v2.client_cookie); + dout("%s con %p generated cookie 0x%llx\n", __func__, con, + con->v2.client_cookie); + } else { + dout("%s con %p cookie already set 0x%llx\n", __func__, con, + con->v2.client_cookie); + } + + dout("%s con %p my_addr %s/%u peer_addr %s/%u global_id %llu global_seq %llu features 0x%llx required_features 0x%llx cookie 0x%llx\n", + __func__, con, ceph_pr_addr(my_addr), le32_to_cpu(my_addr->nonce), + ceph_pr_addr(&con->peer_addr), le32_to_cpu(con->peer_addr.nonce), + global_id, con->v2.global_seq, client->supported_features, + client->required_features, con->v2.client_cookie); + + ctrl_len = 1 + 4 + ceph_entity_addr_encoding_len(my_addr) + + ceph_entity_addr_encoding_len(&con->peer_addr) + 6 * 8; + buf = 
alloc_conn_buf(con, head_onwire_len(ctrl_len, con_secure(con))); + if (!buf) + return -ENOMEM; + + p = CTRL_BODY(buf); + ceph_encode_8(&p, 2); /* addrvec marker */ + ceph_encode_32(&p, 1); /* addr_cnt */ + ceph_encode_entity_addr(&p, my_addr); + ceph_encode_entity_addr(&p, &con->peer_addr); + ceph_encode_64(&p, global_id); + ceph_encode_64(&p, con->v2.global_seq); + ceph_encode_64(&p, client->supported_features); + ceph_encode_64(&p, client->required_features); + ceph_encode_64(&p, 0); /* flags */ + ceph_encode_64(&p, con->v2.client_cookie); + WARN_ON(p != CTRL_BODY(buf) + ctrl_len); + + return prepare_control(con, FRAME_TAG_CLIENT_IDENT, buf, ctrl_len); +} + +static int prepare_session_reconnect(struct ceph_connection *con) +{ + struct ceph_entity_addr *my_addr = &con->msgr->inst.addr; + void *buf, *p; + int ctrl_len; + + WARN_ON(!con->v2.client_cookie); + WARN_ON(!con->v2.server_cookie); + WARN_ON(!con->v2.connect_seq); + WARN_ON(!con->v2.peer_global_seq); + + dout("%s con %p my_addr %s/%u client_cookie 0x%llx server_cookie 0x%llx global_seq %llu connect_seq %llu in_seq %llu\n", + __func__, con, ceph_pr_addr(my_addr), le32_to_cpu(my_addr->nonce), + con->v2.client_cookie, con->v2.server_cookie, con->v2.global_seq, + con->v2.connect_seq, con->in_seq); + + ctrl_len = 1 + 4 + ceph_entity_addr_encoding_len(my_addr) + 5 * 8; + buf = alloc_conn_buf(con, head_onwire_len(ctrl_len, con_secure(con))); + if (!buf) + return -ENOMEM; + + p = CTRL_BODY(buf); + ceph_encode_8(&p, 2); /* entity_addrvec_t marker */ + ceph_encode_32(&p, 1); /* my_addrs len */ + ceph_encode_entity_addr(&p, my_addr); + ceph_encode_64(&p, con->v2.client_cookie); + ceph_encode_64(&p, con->v2.server_cookie); + ceph_encode_64(&p, con->v2.global_seq); + ceph_encode_64(&p, con->v2.connect_seq); + ceph_encode_64(&p, con->in_seq); + WARN_ON(p != CTRL_BODY(buf) + ctrl_len); + + return prepare_control(con, FRAME_TAG_SESSION_RECONNECT, buf, ctrl_len); +} + +static int prepare_keepalive2(struct ceph_connection *con) +{ + struct ceph_timespec *ts = CTRL_BODY(con->v2.out_buf); + struct timespec64 now; + + ktime_get_real_ts64(&now); + dout("%s con %p timestamp %lld.%09ld\n", __func__, con, now.tv_sec, + now.tv_nsec); + + ceph_encode_timespec64(ts, &now); + + reset_out_kvecs(con); + return prepare_control(con, FRAME_TAG_KEEPALIVE2, con->v2.out_buf, + sizeof(struct ceph_timespec)); +} + +static int prepare_ack(struct ceph_connection *con) +{ + void *p; + + dout("%s con %p in_seq_acked %llu -> %llu\n", __func__, con, + con->in_seq_acked, con->in_seq); + con->in_seq_acked = con->in_seq; + + p = CTRL_BODY(con->v2.out_buf); + ceph_encode_64(&p, con->in_seq_acked); + + reset_out_kvecs(con); + return prepare_control(con, FRAME_TAG_ACK, con->v2.out_buf, 8); +} + +static void prepare_epilogue_plain(struct ceph_connection *con, bool aborted) +{ + dout("%s con %p msg %p aborted %d crcs %u %u %u\n", __func__, con, + con->out_msg, aborted, con->v2.out_epil.front_crc, + con->v2.out_epil.middle_crc, con->v2.out_epil.data_crc); + + encode_epilogue_plain(con, aborted); + add_out_kvec(con, &con->v2.out_epil, CEPH_EPILOGUE_PLAIN_LEN); +} + +/* + * For "used" empty segments, crc is -1. For unused (trailing) + * segments, crc is 0. 
+ */ +static void prepare_message_plain(struct ceph_connection *con) +{ + struct ceph_msg *msg = con->out_msg; + + prepare_head_plain(con, con->v2.out_buf, + sizeof(struct ceph_msg_header2), NULL, 0, false); + + if (!front_len(msg) && !middle_len(msg)) { + if (!data_len(msg)) { + /* + * Empty message: once the head is written, + * we are done -- there is no epilogue. + */ + con->v2.out_state = OUT_S_FINISH_MESSAGE; + return; + } + + con->v2.out_epil.front_crc = -1; + con->v2.out_epil.middle_crc = -1; + con->v2.out_state = OUT_S_QUEUE_DATA; + return; + } + + if (front_len(msg)) { + con->v2.out_epil.front_crc = crc32c(-1, msg->front.iov_base, + front_len(msg)); + add_out_kvec(con, msg->front.iov_base, front_len(msg)); + } else { + /* middle (at least) is there, checked above */ + con->v2.out_epil.front_crc = -1; + } + + if (middle_len(msg)) { + con->v2.out_epil.middle_crc = + crc32c(-1, msg->middle->vec.iov_base, middle_len(msg)); + add_out_kvec(con, msg->middle->vec.iov_base, middle_len(msg)); + } else { + con->v2.out_epil.middle_crc = data_len(msg) ? -1 : 0; + } + + if (data_len(msg)) { + con->v2.out_state = OUT_S_QUEUE_DATA; + } else { + con->v2.out_epil.data_crc = 0; + prepare_epilogue_plain(con, false); + con->v2.out_state = OUT_S_FINISH_MESSAGE; + } +} + +/* + * Unfortunately the kernel crypto API doesn't support streaming + * (piecewise) operation for AEAD algorithms, so we can't get away + * with a fixed size buffer and a couple sgs. Instead, we have to + * allocate pages for the entire tail of the message (currently up + * to ~32M) and two sgs arrays (up to ~256K each)... + */ +static int prepare_message_secure(struct ceph_connection *con) +{ + void *zerop = page_address(ceph_zero_page); + struct sg_table enc_sgt = {}; + struct sg_table sgt = {}; + struct page **enc_pages; + int enc_page_cnt; + int tail_len; + int ret; + + ret = prepare_head_secure_small(con, con->v2.out_buf, + sizeof(struct ceph_msg_header2)); + if (ret) + return ret; + + tail_len = tail_onwire_len(con->out_msg, true); + if (!tail_len) { + /* + * Empty message: once the head is written, + * we are done -- there is no epilogue. 
+ */ + con->v2.out_state = OUT_S_FINISH_MESSAGE; + return 0; + } + + encode_epilogue_secure(con, false); + ret = setup_message_sgs(&sgt, con->out_msg, zerop, zerop, zerop, + &con->v2.out_epil, false); + if (ret) + goto out; + + enc_page_cnt = calc_pages_for(0, tail_len); + enc_pages = ceph_alloc_page_vector(enc_page_cnt, GFP_NOIO); + if (IS_ERR(enc_pages)) { + ret = PTR_ERR(enc_pages); + goto out; + } + + WARN_ON(con->v2.out_enc_pages || con->v2.out_enc_page_cnt); + con->v2.out_enc_pages = enc_pages; + con->v2.out_enc_page_cnt = enc_page_cnt; + con->v2.out_enc_resid = tail_len; + con->v2.out_enc_i = 0; + + ret = sg_alloc_table_from_pages(&enc_sgt, enc_pages, enc_page_cnt, + 0, tail_len, GFP_NOIO); + if (ret) + goto out; + + ret = gcm_crypt(con, true, sgt.sgl, enc_sgt.sgl, + tail_len - CEPH_GCM_TAG_LEN); + if (ret) + goto out; + + dout("%s con %p msg %p sg_cnt %d enc_page_cnt %d\n", __func__, con, + con->out_msg, sgt.orig_nents, enc_page_cnt); + con->v2.out_state = OUT_S_QUEUE_ENC_PAGE; + +out: + sg_free_table(&sgt); + sg_free_table(&enc_sgt); + return ret; +} + +static int prepare_message(struct ceph_connection *con) +{ + int lens[] = { + sizeof(struct ceph_msg_header2), + front_len(con->out_msg), + middle_len(con->out_msg), + data_len(con->out_msg) + }; + struct ceph_frame_desc desc; + int ret; + + dout("%s con %p msg %p logical %d+%d+%d+%d\n", __func__, con, + con->out_msg, lens[0], lens[1], lens[2], lens[3]); + + if (con->in_seq > con->in_seq_acked) { + dout("%s con %p in_seq_acked %llu -> %llu\n", __func__, con, + con->in_seq_acked, con->in_seq); + con->in_seq_acked = con->in_seq; + } + + reset_out_kvecs(con); + init_frame_desc(&desc, FRAME_TAG_MESSAGE, lens, 4); + encode_preamble(&desc, con->v2.out_buf); + fill_header2(CTRL_BODY(con->v2.out_buf), &con->out_msg->hdr, + con->in_seq_acked); + + if (con_secure(con)) { + ret = prepare_message_secure(con); + if (ret) + return ret; + } else { + prepare_message_plain(con); + } + + ceph_con_flag_set(con, CEPH_CON_F_WRITE_PENDING); + return 0; +} + +static int prepare_read_banner_prefix(struct ceph_connection *con) +{ + void *buf; + + buf = alloc_conn_buf(con, CEPH_BANNER_V2_PREFIX_LEN); + if (!buf) + return -ENOMEM; + + reset_in_kvecs(con); + add_in_kvec(con, buf, CEPH_BANNER_V2_PREFIX_LEN); + add_in_sign_kvec(con, buf, CEPH_BANNER_V2_PREFIX_LEN); + con->state = CEPH_CON_S_V2_BANNER_PREFIX; + return 0; +} + +static int prepare_read_banner_payload(struct ceph_connection *con, + int payload_len) +{ + void *buf; + + buf = alloc_conn_buf(con, payload_len); + if (!buf) + return -ENOMEM; + + reset_in_kvecs(con); + add_in_kvec(con, buf, payload_len); + add_in_sign_kvec(con, buf, payload_len); + con->state = CEPH_CON_S_V2_BANNER_PAYLOAD; + return 0; +} + +static void prepare_read_preamble(struct ceph_connection *con) +{ + reset_in_kvecs(con); + add_in_kvec(con, con->v2.in_buf, + con_secure(con) ? 
CEPH_PREAMBLE_SECURE_LEN : + CEPH_PREAMBLE_PLAIN_LEN); + con->v2.in_state = IN_S_HANDLE_PREAMBLE; +} + +static int prepare_read_control(struct ceph_connection *con) +{ + int ctrl_len = con->v2.in_desc.fd_lens[0]; + int head_len; + void *buf; + + reset_in_kvecs(con); + if (con->state == CEPH_CON_S_V2_HELLO || + con->state == CEPH_CON_S_V2_AUTH) { + head_len = head_onwire_len(ctrl_len, false); + buf = alloc_conn_buf(con, head_len); + if (!buf) + return -ENOMEM; + + /* preserve preamble */ + memcpy(buf, con->v2.in_buf, CEPH_PREAMBLE_LEN); + + add_in_kvec(con, CTRL_BODY(buf), ctrl_len); + add_in_kvec(con, CTRL_BODY(buf) + ctrl_len, CEPH_CRC_LEN); + add_in_sign_kvec(con, buf, head_len); + } else { + if (ctrl_len > CEPH_PREAMBLE_INLINE_LEN) { + buf = alloc_conn_buf(con, ctrl_len); + if (!buf) + return -ENOMEM; + + add_in_kvec(con, buf, ctrl_len); + } else { + add_in_kvec(con, CTRL_BODY(con->v2.in_buf), ctrl_len); + } + add_in_kvec(con, con->v2.in_buf, CEPH_CRC_LEN); + } + con->v2.in_state = IN_S_HANDLE_CONTROL; + return 0; +} + +static int prepare_read_control_remainder(struct ceph_connection *con) +{ + int ctrl_len = con->v2.in_desc.fd_lens[0]; + int rem_len = ctrl_len - CEPH_PREAMBLE_INLINE_LEN; + void *buf; + + buf = alloc_conn_buf(con, ctrl_len); + if (!buf) + return -ENOMEM; + + memcpy(buf, CTRL_BODY(con->v2.in_buf), CEPH_PREAMBLE_INLINE_LEN); + + reset_in_kvecs(con); + add_in_kvec(con, buf + CEPH_PREAMBLE_INLINE_LEN, rem_len); + add_in_kvec(con, con->v2.in_buf, + padding_len(rem_len) + CEPH_GCM_TAG_LEN); + con->v2.in_state = IN_S_HANDLE_CONTROL_REMAINDER; + return 0; +} + +static void prepare_read_data(struct ceph_connection *con) +{ + struct bio_vec bv; + + if (!con_secure(con)) + con->in_data_crc = -1; + ceph_msg_data_cursor_init(&con->v2.in_cursor, con->in_msg, + data_len(con->in_msg)); + + get_bvec_at(&con->v2.in_cursor, &bv); + set_in_bvec(con, &bv); + con->v2.in_state = IN_S_PREPARE_READ_DATA_CONT; +} + +static void prepare_read_data_cont(struct ceph_connection *con) +{ + struct bio_vec bv; + + if (!con_secure(con)) + con->in_data_crc = ceph_crc32c_page(con->in_data_crc, + con->v2.in_bvec.bv_page, + con->v2.in_bvec.bv_offset, + con->v2.in_bvec.bv_len); + + ceph_msg_data_advance(&con->v2.in_cursor, con->v2.in_bvec.bv_len); + if (con->v2.in_cursor.total_resid) { + get_bvec_at(&con->v2.in_cursor, &bv); + set_in_bvec(con, &bv); + WARN_ON(con->v2.in_state != IN_S_PREPARE_READ_DATA_CONT); + return; + } + + /* + * We've read all data. Prepare to read data padding (if any) + * and epilogue. 
+ */ + reset_in_kvecs(con); + if (con_secure(con)) { + if (need_padding(data_len(con->in_msg))) + add_in_kvec(con, DATA_PAD(con->v2.in_buf), + padding_len(data_len(con->in_msg))); + add_in_kvec(con, con->v2.in_buf, CEPH_EPILOGUE_SECURE_LEN); + } else { + add_in_kvec(con, con->v2.in_buf, CEPH_EPILOGUE_PLAIN_LEN); + } + con->v2.in_state = IN_S_HANDLE_EPILOGUE; +} + +static void __finish_skip(struct ceph_connection *con) +{ + con->in_seq++; + prepare_read_preamble(con); +} + +static void prepare_skip_message(struct ceph_connection *con) +{ + struct ceph_frame_desc *desc = &con->v2.in_desc; + int tail_len; + + dout("%s con %p %d+%d+%d\n", __func__, con, desc->fd_lens[1], + desc->fd_lens[2], desc->fd_lens[3]); + + tail_len = __tail_onwire_len(desc->fd_lens[1], desc->fd_lens[2], + desc->fd_lens[3], con_secure(con)); + if (!tail_len) { + __finish_skip(con); + } else { + set_in_skip(con, tail_len); + con->v2.in_state = IN_S_FINISH_SKIP; + } +} + +static int process_banner_prefix(struct ceph_connection *con) +{ + int payload_len; + void *p; + + WARN_ON(con->v2.in_kvecs[0].iov_len != CEPH_BANNER_V2_PREFIX_LEN); + + p = con->v2.in_kvecs[0].iov_base; + if (memcmp(p, CEPH_BANNER_V2, CEPH_BANNER_V2_LEN)) { + if (!memcmp(p, CEPH_BANNER, CEPH_BANNER_LEN)) + con->error_msg = "server is speaking msgr1 protocol"; + else + con->error_msg = "protocol error, bad banner"; + return -EINVAL; + } + + p += CEPH_BANNER_V2_LEN; + payload_len = ceph_decode_16(&p); + dout("%s con %p payload_len %d\n", __func__, con, payload_len); + + return prepare_read_banner_payload(con, payload_len); +} + +static int process_banner_payload(struct ceph_connection *con) +{ + void *end = con->v2.in_kvecs[0].iov_base + con->v2.in_kvecs[0].iov_len; + u64 feat = CEPH_MSGR2_SUPPORTED_FEATURES; + u64 req_feat = CEPH_MSGR2_REQUIRED_FEATURES; + u64 server_feat, server_req_feat; + void *p; + int ret; + + p = con->v2.in_kvecs[0].iov_base; + ceph_decode_64_safe(&p, end, server_feat, bad); + ceph_decode_64_safe(&p, end, server_req_feat, bad); + + dout("%s con %p server_feat 0x%llx server_req_feat 0x%llx\n", + __func__, con, server_feat, server_req_feat); + + if (req_feat & ~server_feat) { + pr_err("msgr2 feature set mismatch: my required > server's supported 0x%llx, need 0x%llx\n", + server_feat, req_feat & ~server_feat); + con->error_msg = "missing required protocol features"; + return -EINVAL; + } + if (server_req_feat & ~feat) { + pr_err("msgr2 feature set mismatch: server's required > my supported 0x%llx, missing 0x%llx\n", + feat, server_req_feat & ~feat); + con->error_msg = "missing required protocol features"; + return -EINVAL; + } + + /* no reset_out_kvecs() as our banner may still be pending */ + ret = prepare_hello(con); + if (ret) { + pr_err("prepare_hello failed: %d\n", ret); + return ret; + } + + con->state = CEPH_CON_S_V2_HELLO; + prepare_read_preamble(con); + return 0; + +bad: + pr_err("failed to decode banner payload\n"); + return -EINVAL; +} + +static int process_hello(struct ceph_connection *con, void *p, void *end) +{ + struct ceph_entity_addr *my_addr = &con->msgr->inst.addr; + struct ceph_entity_addr addr_for_me; + u8 entity_type; + int ret; + + if (con->state != CEPH_CON_S_V2_HELLO) { + con->error_msg = "protocol error, unexpected hello"; + return -EINVAL; + } + + ceph_decode_8_safe(&p, end, entity_type, bad); + ret = ceph_decode_entity_addr(&p, end, &addr_for_me); + if (ret) { + pr_err("failed to decode addr_for_me: %d\n", ret); + return ret; + } + + dout("%s con %p entity_type %d addr_for_me %s\n", __func__, con, + 
entity_type, ceph_pr_addr(&addr_for_me)); + + if (entity_type != con->peer_name.type) { + pr_err("bad peer type, want %d, got %d\n", + con->peer_name.type, entity_type); + con->error_msg = "wrong peer at address"; + return -EINVAL; + } + + /* + * Set our address to the address our first peer (i.e. monitor) + * sees that we are connecting from. If we are behind some sort + * of NAT and want to be identified by some private (not NATed) + * address, ip option should be used. + */ + if (ceph_addr_is_blank(my_addr)) { + memcpy(&my_addr->in_addr, &addr_for_me.in_addr, + sizeof(my_addr->in_addr)); + ceph_addr_set_port(my_addr, 0); + dout("%s con %p set my addr %s, as seen by peer %s\n", + __func__, con, ceph_pr_addr(my_addr), + ceph_pr_addr(&con->peer_addr)); + } else { + dout("%s con %p my addr already set %s\n", + __func__, con, ceph_pr_addr(my_addr)); + } + + WARN_ON(ceph_addr_is_blank(my_addr) || ceph_addr_port(my_addr)); + WARN_ON(my_addr->type != CEPH_ENTITY_ADDR_TYPE_ANY); + WARN_ON(!my_addr->nonce); + + /* no reset_out_kvecs() as our hello may still be pending */ + ret = prepare_auth_request(con); + if (ret) { + if (ret != -EAGAIN) + pr_err("prepare_auth_request failed: %d\n", ret); + return ret; + } + + con->state = CEPH_CON_S_V2_AUTH; + return 0; + +bad: + pr_err("failed to decode hello\n"); + return -EINVAL; +} + +static int process_auth_bad_method(struct ceph_connection *con, + void *p, void *end) +{ + int allowed_protos[8], allowed_modes[8]; + int allowed_proto_cnt, allowed_mode_cnt; + int used_proto, result; + int ret; + int i; + + if (con->state != CEPH_CON_S_V2_AUTH) { + con->error_msg = "protocol error, unexpected auth_bad_method"; + return -EINVAL; + } + + ceph_decode_32_safe(&p, end, used_proto, bad); + ceph_decode_32_safe(&p, end, result, bad); + dout("%s con %p used_proto %d result %d\n", __func__, con, used_proto, + result); + + ceph_decode_32_safe(&p, end, allowed_proto_cnt, bad); + if (allowed_proto_cnt > ARRAY_SIZE(allowed_protos)) { + pr_err("allowed_protos too big %d\n", allowed_proto_cnt); + return -EINVAL; + } + for (i = 0; i < allowed_proto_cnt; i++) { + ceph_decode_32_safe(&p, end, allowed_protos[i], bad); + dout("%s con %p allowed_protos[%d] %d\n", __func__, con, + i, allowed_protos[i]); + } + + ceph_decode_32_safe(&p, end, allowed_mode_cnt, bad); + if (allowed_mode_cnt > ARRAY_SIZE(allowed_modes)) { + pr_err("allowed_modes too big %d\n", allowed_mode_cnt); + return -EINVAL; + } + for (i = 0; i < allowed_mode_cnt; i++) { + ceph_decode_32_safe(&p, end, allowed_modes[i], bad); + dout("%s con %p allowed_modes[%d] %d\n", __func__, con, + i, allowed_modes[i]); + } + + mutex_unlock(&con->mutex); + ret = con->ops->handle_auth_bad_method(con, used_proto, result, + allowed_protos, + allowed_proto_cnt, + allowed_modes, + allowed_mode_cnt); + mutex_lock(&con->mutex); + if (con->state != CEPH_CON_S_V2_AUTH) { + dout("%s con %p state changed to %d\n", __func__, con, + con->state); + return -EAGAIN; + } + + dout("%s con %p handle_auth_bad_method ret %d\n", __func__, con, ret); + return ret; + +bad: + pr_err("failed to decode auth_bad_method\n"); + return -EINVAL; +} + +static int process_auth_reply_more(struct ceph_connection *con, + void *p, void *end) +{ + int payload_len; + int ret; + + if (con->state != CEPH_CON_S_V2_AUTH) { + con->error_msg = "protocol error, unexpected auth_reply_more"; + return -EINVAL; + } + + ceph_decode_32_safe(&p, end, payload_len, bad); + ceph_decode_need(&p, end, payload_len, bad); + + dout("%s con %p payload_len %d\n", __func__, con, payload_len); 
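The ceph_decode_*_safe() calls above share one bounds-check-then-consume shape: verify that enough bytes remain before reading, advance the cursor, and jump to the function's bad: label on short input. A minimal userspace sketch of that pattern, assuming nothing about the kernel helpers -- decode_le32_safe() is an illustrative stand-in, not the actual implementation:

#include <endian.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* bounds-check, consume, convert from wire (little-endian) order */
static bool decode_le32_safe(const uint8_t **p, const uint8_t *end,
			     uint32_t *val)
{
	uint32_t le;

	if (end - *p < (ptrdiff_t)sizeof(le))
		return false;	/* caller bails out, as with "goto bad" */

	memcpy(&le, *p, sizeof(le));
	*p += sizeof(le);
	*val = le32toh(le);
	return true;
}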
+ + reset_out_kvecs(con); + ret = prepare_auth_request_more(con, p, payload_len); + if (ret) { + if (ret != -EAGAIN) + pr_err("prepare_auth_request_more failed: %d\n", ret); + return ret; + } + + return 0; + +bad: + pr_err("failed to decode auth_reply_more\n"); + return -EINVAL; +} + +/* + * Align session_key and con_secret to avoid GFP_ATOMIC allocation + * inside crypto_shash_setkey() and crypto_aead_setkey() called from + * setup_crypto(). __aligned(16) isn't guaranteed to work for stack + * objects, so do it by hand. + */ +static int process_auth_done(struct ceph_connection *con, void *p, void *end) +{ + u8 session_key_buf[CEPH_KEY_LEN + 16]; + u8 con_secret_buf[CEPH_MAX_CON_SECRET_LEN + 16]; + u8 *session_key = PTR_ALIGN(&session_key_buf[0], 16); + u8 *con_secret = PTR_ALIGN(&con_secret_buf[0], 16); + int session_key_len, con_secret_len; + int payload_len; + u64 global_id; + int ret; + + if (con->state != CEPH_CON_S_V2_AUTH) { + con->error_msg = "protocol error, unexpected auth_done"; + return -EINVAL; + } + + ceph_decode_64_safe(&p, end, global_id, bad); + ceph_decode_32_safe(&p, end, con->v2.con_mode, bad); + ceph_decode_32_safe(&p, end, payload_len, bad); + + dout("%s con %p global_id %llu con_mode %d payload_len %d\n", + __func__, con, global_id, con->v2.con_mode, payload_len); + + mutex_unlock(&con->mutex); + session_key_len = 0; + con_secret_len = 0; + ret = con->ops->handle_auth_done(con, global_id, p, payload_len, + session_key, &session_key_len, + con_secret, &con_secret_len); + mutex_lock(&con->mutex); + if (con->state != CEPH_CON_S_V2_AUTH) { + dout("%s con %p state changed to %d\n", __func__, con, + con->state); + ret = -EAGAIN; + goto out; + } + + dout("%s con %p handle_auth_done ret %d\n", __func__, con, ret); + if (ret) + goto out; + + ret = setup_crypto(con, session_key, session_key_len, con_secret, + con_secret_len); + if (ret) + goto out; + + reset_out_kvecs(con); + ret = prepare_auth_signature(con); + if (ret) { + pr_err("prepare_auth_signature failed: %d\n", ret); + goto out; + } + + con->state = CEPH_CON_S_V2_AUTH_SIGNATURE; + +out: + memzero_explicit(session_key_buf, sizeof(session_key_buf)); + memzero_explicit(con_secret_buf, sizeof(con_secret_buf)); + return ret; + +bad: + pr_err("failed to decode auth_done\n"); + return -EINVAL; +} + +static int process_auth_signature(struct ceph_connection *con, + void *p, void *end) +{ + u8 hmac[SHA256_DIGEST_SIZE]; + int ret; + + if (con->state != CEPH_CON_S_V2_AUTH_SIGNATURE) { + con->error_msg = "protocol error, unexpected auth_signature"; + return -EINVAL; + } + + ret = hmac_sha256(con, con->v2.out_sign_kvecs, + con->v2.out_sign_kvec_cnt, hmac); + if (ret) + return ret; + + ceph_decode_need(&p, end, SHA256_DIGEST_SIZE, bad); + if (crypto_memneq(p, hmac, SHA256_DIGEST_SIZE)) { + con->error_msg = "integrity error, bad auth signature"; + return -EBADMSG; + } + + dout("%s con %p auth signature ok\n", __func__, con); + + /* no reset_out_kvecs() as our auth_signature may still be pending */ + if (!con->v2.server_cookie) { + ret = prepare_client_ident(con); + if (ret) { + pr_err("prepare_client_ident failed: %d\n", ret); + return ret; + } + + con->state = CEPH_CON_S_V2_SESSION_CONNECT; + } else { + ret = prepare_session_reconnect(con); + if (ret) { + pr_err("prepare_session_reconnect failed: %d\n", ret); + return ret; + } + + con->state = CEPH_CON_S_V2_SESSION_RECONNECT; + } + + return 0; + +bad: + pr_err("failed to decode auth_signature\n"); + return -EINVAL; +} + +static int process_server_ident(struct ceph_connection *con, + 
void *p, void *end) +{ + struct ceph_client *client = from_msgr(con->msgr); + u64 features, required_features; + struct ceph_entity_addr addr; + u64 global_seq; + u64 global_id; + u64 cookie; + u64 flags; + int ret; + + if (con->state != CEPH_CON_S_V2_SESSION_CONNECT) { + con->error_msg = "protocol error, unexpected server_ident"; + return -EINVAL; + } + + ret = ceph_decode_entity_addrvec(&p, end, true, &addr); + if (ret) { + pr_err("failed to decode server addrs: %d\n", ret); + return ret; + } + + ceph_decode_64_safe(&p, end, global_id, bad); + ceph_decode_64_safe(&p, end, global_seq, bad); + ceph_decode_64_safe(&p, end, features, bad); + ceph_decode_64_safe(&p, end, required_features, bad); + ceph_decode_64_safe(&p, end, flags, bad); + ceph_decode_64_safe(&p, end, cookie, bad); + + dout("%s con %p addr %s/%u global_id %llu global_seq %llu features 0x%llx required_features 0x%llx flags 0x%llx cookie 0x%llx\n", + __func__, con, ceph_pr_addr(&addr), le32_to_cpu(addr.nonce), + global_id, global_seq, features, required_features, flags, cookie); + + /* is this who we intended to talk to? */ + if (memcmp(&addr, &con->peer_addr, sizeof(con->peer_addr))) { + pr_err("bad peer addr/nonce, want %s/%u, got %s/%u\n", + ceph_pr_addr(&con->peer_addr), + le32_to_cpu(con->peer_addr.nonce), + ceph_pr_addr(&addr), le32_to_cpu(addr.nonce)); + con->error_msg = "wrong peer at address"; + return -EINVAL; + } + + if (client->required_features & ~features) { + pr_err("RADOS feature set mismatch: my required > server's supported 0x%llx, need 0x%llx\n", + features, client->required_features & ~features); + con->error_msg = "missing required protocol features"; + return -EINVAL; + } + + /* + * Both name->type and name->num are set in ceph_con_open() but + * name->num may be bogus in the initial monmap. name->type is + * verified in process_hello(). 
+ */ + WARN_ON(!con->peer_name.type); + con->peer_name.num = cpu_to_le64(global_id); + con->v2.peer_global_seq = global_seq; + con->peer_features = features; + WARN_ON(required_features & ~client->supported_features); + con->v2.server_cookie = cookie; + + if (flags & CEPH_MSG_CONNECT_LOSSY) { + ceph_con_flag_set(con, CEPH_CON_F_LOSSYTX); + WARN_ON(con->v2.server_cookie); + } else { + WARN_ON(!con->v2.server_cookie); + } + + clear_in_sign_kvecs(con); + clear_out_sign_kvecs(con); + free_conn_bufs(con); + con->delay = 0; /* reset backoff memory */ + + con->state = CEPH_CON_S_OPEN; + con->v2.out_state = OUT_S_GET_NEXT; + return 0; + +bad: + pr_err("failed to decode server_ident\n"); + return -EINVAL; +} + +static int process_ident_missing_features(struct ceph_connection *con, + void *p, void *end) +{ + struct ceph_client *client = from_msgr(con->msgr); + u64 missing_features; + + if (con->state != CEPH_CON_S_V2_SESSION_CONNECT) { + con->error_msg = "protocol error, unexpected ident_missing_features"; + return -EINVAL; + } + + ceph_decode_64_safe(&p, end, missing_features, bad); + pr_err("RADOS feature set mismatch: server's required > my supported 0x%llx, missing 0x%llx\n", + client->supported_features, missing_features); + con->error_msg = "missing required protocol features"; + return -EINVAL; + +bad: + pr_err("failed to decode ident_missing_features\n"); + return -EINVAL; +} + +static int process_session_reconnect_ok(struct ceph_connection *con, + void *p, void *end) +{ + u64 seq; + + if (con->state != CEPH_CON_S_V2_SESSION_RECONNECT) { + con->error_msg = "protocol error, unexpected session_reconnect_ok"; + return -EINVAL; + } + + ceph_decode_64_safe(&p, end, seq, bad); + + dout("%s con %p seq %llu\n", __func__, con, seq); + ceph_con_discard_requeued(con, seq); + + clear_in_sign_kvecs(con); + clear_out_sign_kvecs(con); + free_conn_bufs(con); + con->delay = 0; /* reset backoff memory */ + + con->state = CEPH_CON_S_OPEN; + con->v2.out_state = OUT_S_GET_NEXT; + return 0; + +bad: + pr_err("failed to decode session_reconnect_ok\n"); + return -EINVAL; +} + +static int process_session_retry(struct ceph_connection *con, + void *p, void *end) +{ + u64 connect_seq; + int ret; + + if (con->state != CEPH_CON_S_V2_SESSION_RECONNECT) { + con->error_msg = "protocol error, unexpected session_retry"; + return -EINVAL; + } + + ceph_decode_64_safe(&p, end, connect_seq, bad); + + dout("%s con %p connect_seq %llu\n", __func__, con, connect_seq); + WARN_ON(connect_seq <= con->v2.connect_seq); + con->v2.connect_seq = connect_seq + 1; + + free_conn_bufs(con); + + reset_out_kvecs(con); + ret = prepare_session_reconnect(con); + if (ret) { + pr_err("prepare_session_reconnect (cseq) failed: %d\n", ret); + return ret; + } + + return 0; + +bad: + pr_err("failed to decode session_retry\n"); + return -EINVAL; +} + +static int process_session_retry_global(struct ceph_connection *con, + void *p, void *end) +{ + u64 global_seq; + int ret; + + if (con->state != CEPH_CON_S_V2_SESSION_RECONNECT) { + con->error_msg = "protocol error, unexpected session_retry_global"; + return -EINVAL; + } + + ceph_decode_64_safe(&p, end, global_seq, bad); + + dout("%s con %p global_seq %llu\n", __func__, con, global_seq); + WARN_ON(global_seq <= con->v2.global_seq); + con->v2.global_seq = ceph_get_global_seq(con->msgr, global_seq); + + free_conn_bufs(con); + + reset_out_kvecs(con); + ret = prepare_session_reconnect(con); + if (ret) { + pr_err("prepare_session_reconnect (gseq) failed: %d\n", ret); + return ret; + } + + return 0; + +bad: + 
pr_err("failed to decode session_retry_global\n"); + return -EINVAL; +} + +static int process_session_reset(struct ceph_connection *con, + void *p, void *end) +{ + bool full; + int ret; + + if (con->state != CEPH_CON_S_V2_SESSION_RECONNECT) { + con->error_msg = "protocol error, unexpected session_reset"; + return -EINVAL; + } + + ceph_decode_8_safe(&p, end, full, bad); + if (!full) { + con->error_msg = "protocol error, bad session_reset"; + return -EINVAL; + } + + pr_info("%s%lld %s session reset\n", ENTITY_NAME(con->peer_name), + ceph_pr_addr(&con->peer_addr)); + ceph_con_reset_session(con); + + mutex_unlock(&con->mutex); + if (con->ops->peer_reset) + con->ops->peer_reset(con); + mutex_lock(&con->mutex); + if (con->state != CEPH_CON_S_V2_SESSION_RECONNECT) { + dout("%s con %p state changed to %d\n", __func__, con, + con->state); + return -EAGAIN; + } + + free_conn_bufs(con); + + reset_out_kvecs(con); + ret = prepare_client_ident(con); + if (ret) { + pr_err("prepare_client_ident (rst) failed: %d\n", ret); + return ret; + } + + con->state = CEPH_CON_S_V2_SESSION_CONNECT; + return 0; + +bad: + pr_err("failed to decode session_reset\n"); + return -EINVAL; +} + +static int process_keepalive2_ack(struct ceph_connection *con, + void *p, void *end) +{ + if (con->state != CEPH_CON_S_OPEN) { + con->error_msg = "protocol error, unexpected keepalive2_ack"; + return -EINVAL; + } + + ceph_decode_need(&p, end, sizeof(struct ceph_timespec), bad); + ceph_decode_timespec64(&con->last_keepalive_ack, p); + + dout("%s con %p timestamp %lld.%09ld\n", __func__, con, + con->last_keepalive_ack.tv_sec, con->last_keepalive_ack.tv_nsec); + + return 0; + +bad: + pr_err("failed to decode keepalive2_ack\n"); + return -EINVAL; +} + +static int process_ack(struct ceph_connection *con, void *p, void *end) +{ + u64 seq; + + if (con->state != CEPH_CON_S_OPEN) { + con->error_msg = "protocol error, unexpected ack"; + return -EINVAL; + } + + ceph_decode_64_safe(&p, end, seq, bad); + + dout("%s con %p seq %llu\n", __func__, con, seq); + ceph_con_discard_sent(con, seq); + return 0; + +bad: + pr_err("failed to decode ack\n"); + return -EINVAL; +} + +static int process_control(struct ceph_connection *con, void *p, void *end) +{ + int tag = con->v2.in_desc.fd_tag; + int ret; + + dout("%s con %p tag %d len %d\n", __func__, con, tag, (int)(end - p)); + + switch (tag) { + case FRAME_TAG_HELLO: + ret = process_hello(con, p, end); + break; + case FRAME_TAG_AUTH_BAD_METHOD: + ret = process_auth_bad_method(con, p, end); + break; + case FRAME_TAG_AUTH_REPLY_MORE: + ret = process_auth_reply_more(con, p, end); + break; + case FRAME_TAG_AUTH_DONE: + ret = process_auth_done(con, p, end); + break; + case FRAME_TAG_AUTH_SIGNATURE: + ret = process_auth_signature(con, p, end); + break; + case FRAME_TAG_SERVER_IDENT: + ret = process_server_ident(con, p, end); + break; + case FRAME_TAG_IDENT_MISSING_FEATURES: + ret = process_ident_missing_features(con, p, end); + break; + case FRAME_TAG_SESSION_RECONNECT_OK: + ret = process_session_reconnect_ok(con, p, end); + break; + case FRAME_TAG_SESSION_RETRY: + ret = process_session_retry(con, p, end); + break; + case FRAME_TAG_SESSION_RETRY_GLOBAL: + ret = process_session_retry_global(con, p, end); + break; + case FRAME_TAG_SESSION_RESET: + ret = process_session_reset(con, p, end); + break; + case FRAME_TAG_KEEPALIVE2_ACK: + ret = process_keepalive2_ack(con, p, end); + break; + case FRAME_TAG_ACK: + ret = process_ack(con, p, end); + break; + default: + pr_err("bad tag %d\n", tag); + con->error_msg = "protocol 
error, bad tag"; + return -EINVAL; + } + if (ret) { + dout("%s con %p error %d\n", __func__, con, ret); + return ret; + } + + prepare_read_preamble(con); + return 0; +} + +/* + * Return: + * 1 - con->in_msg set, read message + * 0 - skip message + * <0 - error + */ +static int process_message_header(struct ceph_connection *con, + void *p, void *end) +{ + struct ceph_frame_desc *desc = &con->v2.in_desc; + struct ceph_msg_header2 *hdr2 = p; + struct ceph_msg_header hdr; + int skip; + int ret; + u64 seq; + + /* verify seq# */ + seq = le64_to_cpu(hdr2->seq); + if ((s64)seq - (s64)con->in_seq < 1) { + pr_info("%s%lld %s skipping old message: seq %llu, expected %llu\n", + ENTITY_NAME(con->peer_name), + ceph_pr_addr(&con->peer_addr), + seq, con->in_seq + 1); + return 0; + } + if ((s64)seq - (s64)con->in_seq > 1) { + pr_err("bad seq %llu, expected %llu\n", seq, con->in_seq + 1); + con->error_msg = "bad message sequence # for incoming message"; + return -EBADE; + } + + ceph_con_discard_sent(con, le64_to_cpu(hdr2->ack_seq)); + + fill_header(&hdr, hdr2, desc->fd_lens[1], desc->fd_lens[2], + desc->fd_lens[3], &con->peer_name); + ret = ceph_con_in_msg_alloc(con, &hdr, &skip); + if (ret) + return ret; + + WARN_ON(!con->in_msg ^ skip); + if (skip) + return 0; + + WARN_ON(!con->in_msg); + WARN_ON(con->in_msg->con != con); + return 1; +} + +static int process_message(struct ceph_connection *con) +{ + ceph_con_process_message(con); + + /* + * We could have been closed by ceph_con_close() because + * ceph_con_process_message() temporarily drops con->mutex. + */ + if (con->state != CEPH_CON_S_OPEN) { + dout("%s con %p state changed to %d\n", __func__, con, + con->state); + return -EAGAIN; + } + + prepare_read_preamble(con); + return 0; +} + +static int __handle_control(struct ceph_connection *con, void *p) +{ + void *end = p + con->v2.in_desc.fd_lens[0]; + struct ceph_msg *msg; + int ret; + + if (con->v2.in_desc.fd_tag != FRAME_TAG_MESSAGE) + return process_control(con, p, end); + + ret = process_message_header(con, p, end); + if (ret < 0) + return ret; + if (ret == 0) { + prepare_skip_message(con); + return 0; + } + + msg = con->in_msg; /* set in process_message_header() */ + if (!front_len(msg) && !middle_len(msg)) { + if (!data_len(msg)) + return process_message(con); + + prepare_read_data(con); + return 0; + } + + reset_in_kvecs(con); + if (front_len(msg)) { + WARN_ON(front_len(msg) > msg->front_alloc_len); + add_in_kvec(con, msg->front.iov_base, front_len(msg)); + msg->front.iov_len = front_len(msg); + + if (con_secure(con) && need_padding(front_len(msg))) + add_in_kvec(con, FRONT_PAD(con->v2.in_buf), + padding_len(front_len(msg))); + } else { + msg->front.iov_len = 0; + } + if (middle_len(msg)) { + WARN_ON(middle_len(msg) > msg->middle->alloc_len); + add_in_kvec(con, msg->middle->vec.iov_base, middle_len(msg)); + msg->middle->vec.iov_len = middle_len(msg); + + if (con_secure(con) && need_padding(middle_len(msg))) + add_in_kvec(con, MIDDLE_PAD(con->v2.in_buf), + padding_len(middle_len(msg))); + } else if (msg->middle) { + msg->middle->vec.iov_len = 0; + } + + if (data_len(msg)) { + con->v2.in_state = IN_S_PREPARE_READ_DATA; + } else { + add_in_kvec(con, con->v2.in_buf, + con_secure(con) ? 
CEPH_EPILOGUE_SECURE_LEN : + CEPH_EPILOGUE_PLAIN_LEN); + con->v2.in_state = IN_S_HANDLE_EPILOGUE; + } + return 0; +} + +static int handle_preamble(struct ceph_connection *con) +{ + struct ceph_frame_desc *desc = &con->v2.in_desc; + int ret; + + if (con_secure(con)) { + ret = decrypt_preamble(con); + if (ret) { + if (ret == -EBADMSG) + con->error_msg = "integrity error, bad preamble auth tag"; + return ret; + } + } + + ret = decode_preamble(con->v2.in_buf, desc); + if (ret) { + if (ret == -EBADMSG) + con->error_msg = "integrity error, bad crc"; + else + con->error_msg = "protocol error, bad preamble"; + return ret; + } + + dout("%s con %p tag %d seg_cnt %d %d+%d+%d+%d\n", __func__, + con, desc->fd_tag, desc->fd_seg_cnt, desc->fd_lens[0], + desc->fd_lens[1], desc->fd_lens[2], desc->fd_lens[3]); + + if (!con_secure(con)) + return prepare_read_control(con); + + if (desc->fd_lens[0] > CEPH_PREAMBLE_INLINE_LEN) + return prepare_read_control_remainder(con); + + return __handle_control(con, CTRL_BODY(con->v2.in_buf)); +} + +static int handle_control(struct ceph_connection *con) +{ + int ctrl_len = con->v2.in_desc.fd_lens[0]; + void *buf; + int ret; + + WARN_ON(con_secure(con)); + + ret = verify_control_crc(con); + if (ret) { + con->error_msg = "integrity error, bad crc"; + return ret; + } + + if (con->state == CEPH_CON_S_V2_AUTH) { + buf = alloc_conn_buf(con, ctrl_len); + if (!buf) + return -ENOMEM; + + memcpy(buf, con->v2.in_kvecs[0].iov_base, ctrl_len); + return __handle_control(con, buf); + } + + return __handle_control(con, con->v2.in_kvecs[0].iov_base); +} + +static int handle_control_remainder(struct ceph_connection *con) +{ + int ret; + + WARN_ON(!con_secure(con)); + + ret = decrypt_control_remainder(con); + if (ret) { + if (ret == -EBADMSG) + con->error_msg = "integrity error, bad control remainder auth tag"; + return ret; + } + + return __handle_control(con, con->v2.in_kvecs[0].iov_base - + CEPH_PREAMBLE_INLINE_LEN); +} + +static int handle_epilogue(struct ceph_connection *con) +{ + u32 front_crc, middle_crc, data_crc; + int ret; + + if (con_secure(con)) { + ret = decrypt_message(con); + if (ret) { + if (ret == -EBADMSG) + con->error_msg = "integrity error, bad epilogue auth tag"; + return ret; + } + + /* just late_status */ + ret = decode_epilogue(con->v2.in_buf, NULL, NULL, NULL); + if (ret) { + con->error_msg = "protocol error, bad epilogue"; + return ret; + } + } else { + ret = decode_epilogue(con->v2.in_buf, &front_crc, + &middle_crc, &data_crc); + if (ret) { + con->error_msg = "protocol error, bad epilogue"; + return ret; + } + + ret = verify_epilogue_crcs(con, front_crc, middle_crc, + data_crc); + if (ret) { + con->error_msg = "integrity error, bad crc"; + return ret; + } + } + + return process_message(con); +} + +static void finish_skip(struct ceph_connection *con) +{ + dout("%s con %p\n", __func__, con); + + if (con_secure(con)) + gcm_inc_nonce(&con->v2.in_gcm_nonce); + + __finish_skip(con); +} + +static int populate_in_iter(struct ceph_connection *con) +{ + int ret; + + dout("%s con %p state %d in_state %d\n", __func__, con, con->state, + con->v2.in_state); + WARN_ON(iov_iter_count(&con->v2.in_iter)); + + if (con->state == CEPH_CON_S_V2_BANNER_PREFIX) { + ret = process_banner_prefix(con); + } else if (con->state == CEPH_CON_S_V2_BANNER_PAYLOAD) { + ret = process_banner_payload(con); + } else if ((con->state >= CEPH_CON_S_V2_HELLO && + con->state <= CEPH_CON_S_V2_SESSION_RECONNECT) || + con->state == CEPH_CON_S_OPEN) { + switch (con->v2.in_state) { + case IN_S_HANDLE_PREAMBLE: + 
ret = handle_preamble(con); + break; + case IN_S_HANDLE_CONTROL: + ret = handle_control(con); + break; + case IN_S_HANDLE_CONTROL_REMAINDER: + ret = handle_control_remainder(con); + break; + case IN_S_PREPARE_READ_DATA: + prepare_read_data(con); + ret = 0; + break; + case IN_S_PREPARE_READ_DATA_CONT: + prepare_read_data_cont(con); + ret = 0; + break; + case IN_S_HANDLE_EPILOGUE: + ret = handle_epilogue(con); + break; + case IN_S_FINISH_SKIP: + finish_skip(con); + ret = 0; + break; + default: + WARN(1, "bad in_state %d", con->v2.in_state); + return -EINVAL; + } + } else { + WARN(1, "bad state %d", con->state); + return -EINVAL; + } + if (ret) { + dout("%s con %p error %d\n", __func__, con, ret); + return ret; + } + + if (WARN_ON(!iov_iter_count(&con->v2.in_iter))) + return -ENODATA; + dout("%s con %p populated %zu\n", __func__, con, + iov_iter_count(&con->v2.in_iter)); + return 1; +} + +int ceph_con_v2_try_read(struct ceph_connection *con) +{ + int ret; + + dout("%s con %p state %d need %zu\n", __func__, con, con->state, + iov_iter_count(&con->v2.in_iter)); + + if (con->state == CEPH_CON_S_PREOPEN) + return 0; + + /* + * We should always have something pending here. If not, + * avoid calling populate_in_iter() as if we read something + * (ceph_tcp_recv() would immediately return 1). + */ + if (WARN_ON(!iov_iter_count(&con->v2.in_iter))) + return -ENODATA; + + for (;;) { + ret = ceph_tcp_recv(con); + if (ret <= 0) + return ret; + + ret = populate_in_iter(con); + if (ret <= 0) { + if (ret && ret != -EAGAIN && !con->error_msg) + con->error_msg = "read processing error"; + return ret; + } + } +} + +static void queue_data(struct ceph_connection *con) +{ + struct bio_vec bv; + + con->v2.out_epil.data_crc = -1; + ceph_msg_data_cursor_init(&con->v2.out_cursor, con->out_msg, + data_len(con->out_msg)); + + get_bvec_at(&con->v2.out_cursor, &bv); + set_out_bvec(con, &bv, true); + con->v2.out_state = OUT_S_QUEUE_DATA_CONT; +} + +static void queue_data_cont(struct ceph_connection *con) +{ + struct bio_vec bv; + + con->v2.out_epil.data_crc = ceph_crc32c_page( + con->v2.out_epil.data_crc, con->v2.out_bvec.bv_page, + con->v2.out_bvec.bv_offset, con->v2.out_bvec.bv_len); + + ceph_msg_data_advance(&con->v2.out_cursor, con->v2.out_bvec.bv_len); + if (con->v2.out_cursor.total_resid) { + get_bvec_at(&con->v2.out_cursor, &bv); + set_out_bvec(con, &bv, true); + WARN_ON(con->v2.out_state != OUT_S_QUEUE_DATA_CONT); + return; + } + + /* + * We've written all data. Queue epilogue. Once it's written, + * we are done. + */ + reset_out_kvecs(con); + prepare_epilogue_plain(con, false); + con->v2.out_state = OUT_S_FINISH_MESSAGE; +} + +static void queue_enc_page(struct ceph_connection *con) +{ + struct bio_vec bv; + + dout("%s con %p i %d resid %d\n", __func__, con, con->v2.out_enc_i, + con->v2.out_enc_resid); + WARN_ON(!con->v2.out_enc_resid); + + bv.bv_page = con->v2.out_enc_pages[con->v2.out_enc_i]; + bv.bv_offset = 0; + bv.bv_len = min(con->v2.out_enc_resid, (int)PAGE_SIZE); + + set_out_bvec(con, &bv, false); + con->v2.out_enc_i++; + con->v2.out_enc_resid -= bv.bv_len; + + if (con->v2.out_enc_resid) { + WARN_ON(con->v2.out_state != OUT_S_QUEUE_ENC_PAGE); + return; + } + + /* + * We've queued the last piece of ciphertext (ending with + * epilogue) + auth tag. Once it's written, we are done. 
+ */ + WARN_ON(con->v2.out_enc_i != con->v2.out_enc_page_cnt); + con->v2.out_state = OUT_S_FINISH_MESSAGE; +} + +static void queue_zeros(struct ceph_connection *con) +{ + dout("%s con %p out_zero %d\n", __func__, con, con->v2.out_zero); + + if (con->v2.out_zero) { + set_out_bvec_zero(con); + con->v2.out_zero -= con->v2.out_bvec.bv_len; + con->v2.out_state = OUT_S_QUEUE_ZEROS; + return; + } + + /* + * We've zero-filled everything up to epilogue. Queue epilogue + * with late_status set to ABORTED and crcs adjusted for zeros. + * Once it's written, we are done patching up for the revoke. + */ + reset_out_kvecs(con); + prepare_epilogue_plain(con, true); + con->v2.out_state = OUT_S_FINISH_MESSAGE; +} + +static void finish_message(struct ceph_connection *con) +{ + dout("%s con %p msg %p\n", __func__, con, con->out_msg); + + /* we end up here both plain and secure modes */ + if (con->v2.out_enc_pages) { + WARN_ON(!con->v2.out_enc_page_cnt); + ceph_release_page_vector(con->v2.out_enc_pages, + con->v2.out_enc_page_cnt); + con->v2.out_enc_pages = NULL; + con->v2.out_enc_page_cnt = 0; + } + /* message may have been revoked */ + if (con->out_msg) { + ceph_msg_put(con->out_msg); + con->out_msg = NULL; + } + + con->v2.out_state = OUT_S_GET_NEXT; +} + +static int populate_out_iter(struct ceph_connection *con) +{ + int ret; + + dout("%s con %p state %d out_state %d\n", __func__, con, con->state, + con->v2.out_state); + WARN_ON(iov_iter_count(&con->v2.out_iter)); + + if (con->state != CEPH_CON_S_OPEN) { + WARN_ON(con->state < CEPH_CON_S_V2_BANNER_PREFIX || + con->state > CEPH_CON_S_V2_SESSION_RECONNECT); + goto nothing_pending; + } + + switch (con->v2.out_state) { + case OUT_S_QUEUE_DATA: + WARN_ON(!con->out_msg); + queue_data(con); + goto populated; + case OUT_S_QUEUE_DATA_CONT: + WARN_ON(!con->out_msg); + queue_data_cont(con); + goto populated; + case OUT_S_QUEUE_ENC_PAGE: + queue_enc_page(con); + goto populated; + case OUT_S_QUEUE_ZEROS: + WARN_ON(con->out_msg); /* revoked */ + queue_zeros(con); + goto populated; + case OUT_S_FINISH_MESSAGE: + finish_message(con); + break; + case OUT_S_GET_NEXT: + break; + default: + WARN(1, "bad out_state %d", con->v2.out_state); + return -EINVAL; + } + + WARN_ON(con->v2.out_state != OUT_S_GET_NEXT); + if (ceph_con_flag_test_and_clear(con, CEPH_CON_F_KEEPALIVE_PENDING)) { + ret = prepare_keepalive2(con); + if (ret) { + pr_err("prepare_keepalive2 failed: %d\n", ret); + return ret; + } + } else if (!list_empty(&con->out_queue)) { + ceph_con_get_out_msg(con); + ret = prepare_message(con); + if (ret) { + pr_err("prepare_message failed: %d\n", ret); + return ret; + } + } else if (con->in_seq > con->in_seq_acked) { + ret = prepare_ack(con); + if (ret) { + pr_err("prepare_ack failed: %d\n", ret); + return ret; + } + } else { + goto nothing_pending; + } + +populated: + if (WARN_ON(!iov_iter_count(&con->v2.out_iter))) + return -ENODATA; + dout("%s con %p populated %zu\n", __func__, con, + iov_iter_count(&con->v2.out_iter)); + return 1; + +nothing_pending: + WARN_ON(iov_iter_count(&con->v2.out_iter)); + dout("%s con %p nothing pending\n", __func__, con); + ceph_con_flag_clear(con, CEPH_CON_F_WRITE_PENDING); + return 0; +} + +int ceph_con_v2_try_write(struct ceph_connection *con) +{ + int ret; + + dout("%s con %p state %d have %zu\n", __func__, con, con->state, + iov_iter_count(&con->v2.out_iter)); + + /* open the socket first? */ + if (con->state == CEPH_CON_S_PREOPEN) { + WARN_ON(con->peer_addr.type != CEPH_ENTITY_ADDR_TYPE_MSGR2); + + /* + * Always bump global_seq. 
Bump connect_seq only if + * there is a session (i.e. we are reconnecting and will + * send session_reconnect instead of client_ident). + */ + con->v2.global_seq = ceph_get_global_seq(con->msgr, 0); + if (con->v2.server_cookie) + con->v2.connect_seq++; + + ret = prepare_read_banner_prefix(con); + if (ret) { + pr_err("prepare_read_banner_prefix failed: %d\n", ret); + con->error_msg = "connect error"; + return ret; + } + + reset_out_kvecs(con); + ret = prepare_banner(con); + if (ret) { + pr_err("prepare_banner failed: %d\n", ret); + con->error_msg = "connect error"; + return ret; + } + + ret = ceph_tcp_connect(con); + if (ret) { + pr_err("ceph_tcp_connect failed: %d\n", ret); + con->error_msg = "connect error"; + return ret; + } + } + + if (!iov_iter_count(&con->v2.out_iter)) { + ret = populate_out_iter(con); + if (ret <= 0) { + if (ret && ret != -EAGAIN && !con->error_msg) + con->error_msg = "write processing error"; + return ret; + } + } + + tcp_sock_set_cork(con->sock->sk, true); + for (;;) { + ret = ceph_tcp_send(con); + if (ret <= 0) + break; + + ret = populate_out_iter(con); + if (ret <= 0) { + if (ret && ret != -EAGAIN && !con->error_msg) + con->error_msg = "write processing error"; + break; + } + } + + tcp_sock_set_cork(con->sock->sk, false); + return ret; +} + +static u32 crc32c_zeros(u32 crc, int zero_len) +{ + int len; + + while (zero_len) { + len = min(zero_len, (int)PAGE_SIZE); + crc = crc32c(crc, page_address(ceph_zero_page), len); + zero_len -= len; + } + + return crc; +} + +static void prepare_zero_front(struct ceph_connection *con, int resid) +{ + int sent; + + WARN_ON(!resid || resid > front_len(con->out_msg)); + sent = front_len(con->out_msg) - resid; + dout("%s con %p sent %d resid %d\n", __func__, con, sent, resid); + + if (sent) { + con->v2.out_epil.front_crc = + crc32c(-1, con->out_msg->front.iov_base, sent); + con->v2.out_epil.front_crc = + crc32c_zeros(con->v2.out_epil.front_crc, resid); + } else { + con->v2.out_epil.front_crc = crc32c_zeros(-1, resid); + } + + con->v2.out_iter.count -= resid; + out_zero_add(con, resid); +} + +static void prepare_zero_middle(struct ceph_connection *con, int resid) +{ + int sent; + + WARN_ON(!resid || resid > middle_len(con->out_msg)); + sent = middle_len(con->out_msg) - resid; + dout("%s con %p sent %d resid %d\n", __func__, con, sent, resid); + + if (sent) { + con->v2.out_epil.middle_crc = + crc32c(-1, con->out_msg->middle->vec.iov_base, sent); + con->v2.out_epil.middle_crc = + crc32c_zeros(con->v2.out_epil.middle_crc, resid); + } else { + con->v2.out_epil.middle_crc = crc32c_zeros(-1, resid); + } + + con->v2.out_iter.count -= resid; + out_zero_add(con, resid); +} + +static void prepare_zero_data(struct ceph_connection *con) +{ + dout("%s con %p\n", __func__, con); + con->v2.out_epil.data_crc = crc32c_zeros(-1, data_len(con->out_msg)); + out_zero_add(con, data_len(con->out_msg)); +} + +static void revoke_at_queue_data(struct ceph_connection *con) +{ + int boundary; + int resid; + + WARN_ON(!data_len(con->out_msg)); + WARN_ON(!iov_iter_is_kvec(&con->v2.out_iter)); + resid = iov_iter_count(&con->v2.out_iter); + + boundary = front_len(con->out_msg) + middle_len(con->out_msg); + if (resid > boundary) { + resid -= boundary; + WARN_ON(resid > MESSAGE_HEAD_PLAIN_LEN); + dout("%s con %p was sending head\n", __func__, con); + if (front_len(con->out_msg)) + prepare_zero_front(con, front_len(con->out_msg)); + if (middle_len(con->out_msg)) + prepare_zero_middle(con, middle_len(con->out_msg)); + prepare_zero_data(con); + 
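crc32c_zeros() and the prepare_zero_*() helpers above rely on crc32c() being streamable: continuing the crc of the already-sent bytes over the remaining zeros yields the crc of the whole zero-filled segment, which is what the receiver will verify. A small userspace check of that property; crc32c_sw() is a plain bitwise CRC32C stand-in for the kernel's crc32c():

#include <assert.h>
#include <stdint.h>

/* reflected CRC32C (Castagnoli polynomial), same convention as the
 * kernel's crc32c(): seed passed in explicitly, no final inversion */
static uint32_t crc32c_sw(uint32_t crc, const uint8_t *p, int len)
{
	while (len--) {
		crc ^= *p++;
		for (int k = 0; k < 8; k++)
			crc = (crc >> 1) ^ (0x82F63B78 & -(crc & 1));
	}
	return crc;
}

int main(void)
{
	uint8_t seg[16] = { 1, 2, 3, 4, 5 };	/* bytes 5..15 stay zero */
	uint32_t crc;

	/* crc of the five sent bytes, continued over eleven zeros ... */
	crc = crc32c_sw(crc32c_sw(-1, seg, 5), seg + 5, 11);

	/* ... equals the crc of the full zero-filled segment */
	assert(crc == crc32c_sw(-1, seg, 16));
	return 0;
}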
WARN_ON(iov_iter_count(&con->v2.out_iter) != resid); + con->v2.out_state = OUT_S_QUEUE_ZEROS; + return; + } + + boundary = middle_len(con->out_msg); + if (resid > boundary) { + resid -= boundary; + dout("%s con %p was sending front\n", __func__, con); + prepare_zero_front(con, resid); + if (middle_len(con->out_msg)) + prepare_zero_middle(con, middle_len(con->out_msg)); + prepare_zero_data(con); + queue_zeros(con); + return; + } + + WARN_ON(!resid); + dout("%s con %p was sending middle\n", __func__, con); + prepare_zero_middle(con, resid); + prepare_zero_data(con); + queue_zeros(con); +} + +static void revoke_at_queue_data_cont(struct ceph_connection *con) +{ + int sent, resid; /* current piece of data */ + + WARN_ON(!data_len(con->out_msg)); + WARN_ON(!iov_iter_is_bvec(&con->v2.out_iter)); + resid = iov_iter_count(&con->v2.out_iter); + WARN_ON(!resid || resid > con->v2.out_bvec.bv_len); + sent = con->v2.out_bvec.bv_len - resid; + dout("%s con %p sent %d resid %d\n", __func__, con, sent, resid); + + if (sent) { + con->v2.out_epil.data_crc = ceph_crc32c_page( + con->v2.out_epil.data_crc, con->v2.out_bvec.bv_page, + con->v2.out_bvec.bv_offset, sent); + ceph_msg_data_advance(&con->v2.out_cursor, sent); + } + WARN_ON(resid > con->v2.out_cursor.total_resid); + con->v2.out_epil.data_crc = crc32c_zeros(con->v2.out_epil.data_crc, + con->v2.out_cursor.total_resid); + + con->v2.out_iter.count -= resid; + out_zero_add(con, con->v2.out_cursor.total_resid); + queue_zeros(con); +} + +static void revoke_at_finish_message(struct ceph_connection *con) +{ + int boundary; + int resid; + + WARN_ON(!iov_iter_is_kvec(&con->v2.out_iter)); + resid = iov_iter_count(&con->v2.out_iter); + + if (!front_len(con->out_msg) && !middle_len(con->out_msg) && + !data_len(con->out_msg)) { + WARN_ON(!resid || resid > MESSAGE_HEAD_PLAIN_LEN); + dout("%s con %p was sending head (empty message) - noop\n", + __func__, con); + return; + } + + boundary = front_len(con->out_msg) + middle_len(con->out_msg) + + CEPH_EPILOGUE_PLAIN_LEN; + if (resid > boundary) { + resid -= boundary; + WARN_ON(resid > MESSAGE_HEAD_PLAIN_LEN); + dout("%s con %p was sending head\n", __func__, con); + if (front_len(con->out_msg)) + prepare_zero_front(con, front_len(con->out_msg)); + if (middle_len(con->out_msg)) + prepare_zero_middle(con, middle_len(con->out_msg)); + con->v2.out_iter.count -= CEPH_EPILOGUE_PLAIN_LEN; + WARN_ON(iov_iter_count(&con->v2.out_iter) != resid); + con->v2.out_state = OUT_S_QUEUE_ZEROS; + return; + } + + boundary = middle_len(con->out_msg) + CEPH_EPILOGUE_PLAIN_LEN; + if (resid > boundary) { + resid -= boundary; + dout("%s con %p was sending front\n", __func__, con); + prepare_zero_front(con, resid); + if (middle_len(con->out_msg)) + prepare_zero_middle(con, middle_len(con->out_msg)); + con->v2.out_iter.count -= CEPH_EPILOGUE_PLAIN_LEN; + queue_zeros(con); + return; + } + + boundary = CEPH_EPILOGUE_PLAIN_LEN; + if (resid > boundary) { + resid -= boundary; + dout("%s con %p was sending middle\n", __func__, con); + prepare_zero_middle(con, resid); + con->v2.out_iter.count -= CEPH_EPILOGUE_PLAIN_LEN; + queue_zeros(con); + return; + } + + WARN_ON(!resid); + dout("%s con %p was sending epilogue - noop\n", __func__, con); +} + +void ceph_con_v2_revoke(struct ceph_connection *con) +{ + WARN_ON(con->v2.out_zero); + + if (con_secure(con)) { + WARN_ON(con->v2.out_state != OUT_S_QUEUE_ENC_PAGE && + con->v2.out_state != OUT_S_FINISH_MESSAGE); + dout("%s con %p secure - noop\n", __func__, con); + return; + } + + switch (con->v2.out_state) { + 
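+	/*
+	 * Plain mode: patch up what has already been queued in place.
+	 * Which helper runs depends on how far into the message the
+	 * write sequence got before the revoke.
+	 */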
case OUT_S_QUEUE_DATA: + revoke_at_queue_data(con); + break; + case OUT_S_QUEUE_DATA_CONT: + revoke_at_queue_data_cont(con); + break; + case OUT_S_FINISH_MESSAGE: + revoke_at_finish_message(con); + break; + default: + WARN(1, "bad out_state %d", con->v2.out_state); + break; + } +} + +static void revoke_at_prepare_read_data(struct ceph_connection *con) +{ + int remaining; /* data + [data padding] + epilogue */ + int resid; + + WARN_ON(!data_len(con->in_msg)); + WARN_ON(!iov_iter_is_kvec(&con->v2.in_iter)); + resid = iov_iter_count(&con->v2.in_iter); + WARN_ON(!resid); + + if (con_secure(con)) + remaining = padded_len(data_len(con->in_msg)) + + CEPH_EPILOGUE_SECURE_LEN; + else + remaining = data_len(con->in_msg) + CEPH_EPILOGUE_PLAIN_LEN; + + dout("%s con %p resid %d remaining %d\n", __func__, con, resid, + remaining); + con->v2.in_iter.count -= resid; + set_in_skip(con, resid + remaining); + con->v2.in_state = IN_S_FINISH_SKIP; +} + +static void revoke_at_prepare_read_data_cont(struct ceph_connection *con) +{ + int recved, resid; /* current piece of data */ + int remaining; /* [data padding] + epilogue */ + + WARN_ON(!data_len(con->in_msg)); + WARN_ON(!iov_iter_is_bvec(&con->v2.in_iter)); + resid = iov_iter_count(&con->v2.in_iter); + WARN_ON(!resid || resid > con->v2.in_bvec.bv_len); + recved = con->v2.in_bvec.bv_len - resid; + dout("%s con %p recved %d resid %d\n", __func__, con, recved, resid); + + if (recved) + ceph_msg_data_advance(&con->v2.in_cursor, recved); + WARN_ON(resid > con->v2.in_cursor.total_resid); + + if (con_secure(con)) + remaining = padding_len(data_len(con->in_msg)) + + CEPH_EPILOGUE_SECURE_LEN; + else + remaining = CEPH_EPILOGUE_PLAIN_LEN; + + dout("%s con %p total_resid %zu remaining %d\n", __func__, con, + con->v2.in_cursor.total_resid, remaining); + con->v2.in_iter.count -= resid; + set_in_skip(con, con->v2.in_cursor.total_resid + remaining); + con->v2.in_state = IN_S_FINISH_SKIP; +} + +static void revoke_at_handle_epilogue(struct ceph_connection *con) +{ + int resid; + + WARN_ON(!iov_iter_is_kvec(&con->v2.in_iter)); + resid = iov_iter_count(&con->v2.in_iter); + WARN_ON(!resid); + + dout("%s con %p resid %d\n", __func__, con, resid); + con->v2.in_iter.count -= resid; + set_in_skip(con, resid); + con->v2.in_state = IN_S_FINISH_SKIP; +} + +void ceph_con_v2_revoke_incoming(struct ceph_connection *con) +{ + switch (con->v2.in_state) { + case IN_S_PREPARE_READ_DATA: + revoke_at_prepare_read_data(con); + break; + case IN_S_PREPARE_READ_DATA_CONT: + revoke_at_prepare_read_data_cont(con); + break; + case IN_S_HANDLE_EPILOGUE: + revoke_at_handle_epilogue(con); + break; + default: + WARN(1, "bad in_state %d", con->v2.in_state); + break; + } +} + +bool ceph_con_v2_opened(struct ceph_connection *con) +{ + return con->v2.peer_global_seq; +} + +void ceph_con_v2_reset_session(struct ceph_connection *con) +{ + con->v2.client_cookie = 0; + con->v2.server_cookie = 0; + con->v2.global_seq = 0; + con->v2.connect_seq = 0; + con->v2.peer_global_seq = 0; +} + +void ceph_con_v2_reset_protocol(struct ceph_connection *con) +{ + iov_iter_truncate(&con->v2.in_iter, 0); + iov_iter_truncate(&con->v2.out_iter, 0); + con->v2.out_zero = 0; + + clear_in_sign_kvecs(con); + clear_out_sign_kvecs(con); + free_conn_bufs(con); + + if (con->v2.out_enc_pages) { + WARN_ON(!con->v2.out_enc_page_cnt); + ceph_release_page_vector(con->v2.out_enc_pages, + con->v2.out_enc_page_cnt); + con->v2.out_enc_pages = NULL; + con->v2.out_enc_page_cnt = 0; + } + + con->v2.con_mode = CEPH_CON_MODE_UNKNOWN; + 
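+	/*
+	 * Wipe the GCM nonces and drop all crypto state so that nothing
+	 * keyed to the old session can be reused on a reconnect.
+	 */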
memzero_explicit(&con->v2.in_gcm_nonce, CEPH_GCM_IV_LEN); + memzero_explicit(&con->v2.out_gcm_nonce, CEPH_GCM_IV_LEN); + + if (con->v2.hmac_tfm) { + crypto_free_shash(con->v2.hmac_tfm); + con->v2.hmac_tfm = NULL; + } + if (con->v2.gcm_req) { + aead_request_free(con->v2.gcm_req); + con->v2.gcm_req = NULL; + } + if (con->v2.gcm_tfm) { + crypto_free_aead(con->v2.gcm_tfm); + con->v2.gcm_tfm = NULL; + } +} diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c index c4cf2529d08b..195ceb8afb06 100644 --- a/net/ceph/mon_client.c +++ b/net/ceph/mon_client.c @@ -36,57 +36,122 @@ static const struct ceph_connection_operations mon_con_ops; static int __validate_auth(struct ceph_mon_client *monc); +static int decode_mon_info(void **p, void *end, bool msgr2, + struct ceph_entity_addr *addr) +{ + void *mon_info_end; + u32 struct_len; + u8 struct_v; + int ret; + + ret = ceph_start_decoding(p, end, 1, "mon_info_t", &struct_v, + &struct_len); + if (ret) + return ret; + + mon_info_end = *p + struct_len; + ceph_decode_skip_string(p, end, e_inval); /* skip mon name */ + ret = ceph_decode_entity_addrvec(p, end, msgr2, addr); + if (ret) + return ret; + + *p = mon_info_end; + return 0; + +e_inval: + return -EINVAL; +} + /* * Decode a monmap blob (e.g., during mount). + * + * Assume MonMap v3 (i.e. encoding with MONNAMES and MONENC). */ -static struct ceph_monmap *ceph_monmap_decode(void *p, void *end) +static struct ceph_monmap *ceph_monmap_decode(void **p, void *end, bool msgr2) { - struct ceph_monmap *m = NULL; - int i, err = -EINVAL; + struct ceph_monmap *monmap = NULL; struct ceph_fsid fsid; - u32 epoch, num_mon; - u32 len; + u32 struct_len; + int blob_len; + int num_mon; + u8 struct_v; + u32 epoch; + int ret; + int i; + + ceph_decode_32_safe(p, end, blob_len, e_inval); + ceph_decode_need(p, end, blob_len, e_inval); + + ret = ceph_start_decoding(p, end, 6, "monmap", &struct_v, &struct_len); + if (ret) + goto fail; - ceph_decode_32_safe(&p, end, len, bad); - ceph_decode_need(&p, end, len, bad); + dout("%s struct_v %d\n", __func__, struct_v); + ceph_decode_copy_safe(p, end, &fsid, sizeof(fsid), e_inval); + ceph_decode_32_safe(p, end, epoch, e_inval); + if (struct_v >= 6) { + u32 feat_struct_len; + u8 feat_struct_v; - dout("monmap_decode %p %p len %d (%d)\n", p, end, len, (int)(end-p)); - p += sizeof(u16); /* skip version */ + *p += sizeof(struct ceph_timespec); /* skip last_changed */ + *p += sizeof(struct ceph_timespec); /* skip created */ - ceph_decode_need(&p, end, sizeof(fsid) + 2*sizeof(u32), bad); - ceph_decode_copy(&p, &fsid, sizeof(fsid)); - epoch = ceph_decode_32(&p); + ret = ceph_start_decoding(p, end, 1, "mon_feature_t", + &feat_struct_v, &feat_struct_len); + if (ret) + goto fail; - num_mon = ceph_decode_32(&p); + *p += feat_struct_len; /* skip persistent_features */ + + ret = ceph_start_decoding(p, end, 1, "mon_feature_t", + &feat_struct_v, &feat_struct_len); + if (ret) + goto fail; + + *p += feat_struct_len; /* skip optional_features */ + } + ceph_decode_32_safe(p, end, num_mon, e_inval); + dout("%s fsid %pU epoch %u num_mon %d\n", __func__, &fsid, epoch, + num_mon); if (num_mon > CEPH_MAX_MON) - goto bad; - m = kmalloc(struct_size(m, mon_inst, num_mon), GFP_NOFS); - if (m == NULL) - return ERR_PTR(-ENOMEM); - m->fsid = fsid; - m->epoch = epoch; - m->num_mon = num_mon; - for (i = 0; i < num_mon; ++i) { - struct ceph_entity_inst *inst = &m->mon_inst[i]; - - /* copy name portion */ - ceph_decode_copy_safe(&p, end, &inst->name, - sizeof(inst->name), bad); - err = ceph_decode_entity_addr(&p, end, 
&inst->addr); - if (err) - goto bad; + goto e_inval; + + monmap = kmalloc(struct_size(monmap, mon_inst, num_mon), GFP_NOIO); + if (!monmap) { + ret = -ENOMEM; + goto fail; } - dout("monmap_decode epoch %d, num_mon %d\n", m->epoch, - m->num_mon); - for (i = 0; i < m->num_mon; i++) - dout("monmap_decode mon%d is %s\n", i, - ceph_pr_addr(&m->mon_inst[i].addr)); - return m; -bad: - dout("monmap_decode failed with %d\n", err); - kfree(m); - return ERR_PTR(err); + monmap->fsid = fsid; + monmap->epoch = epoch; + monmap->num_mon = num_mon; + + /* legacy_mon_addr map or mon_info map */ + for (i = 0; i < num_mon; i++) { + struct ceph_entity_inst *inst = &monmap->mon_inst[i]; + + ceph_decode_skip_string(p, end, e_inval); /* skip mon name */ + inst->name.type = CEPH_ENTITY_TYPE_MON; + inst->name.num = cpu_to_le64(i); + + if (struct_v >= 6) + ret = decode_mon_info(p, end, msgr2, &inst->addr); + else + ret = ceph_decode_entity_addr(p, end, &inst->addr); + if (ret) + goto fail; + + dout("%s mon%d addr %s\n", __func__, i, + ceph_pr_addr(&inst->addr)); + } + + return monmap; + +e_inval: + ret = -EINVAL; +fail: + kfree(monmap); + return ERR_PTR(ret); } /* @@ -96,9 +161,11 @@ int ceph_monmap_contains(struct ceph_monmap *m, struct ceph_entity_addr *addr) { int i; - for (i = 0; i < m->num_mon; i++) - if (memcmp(addr, &m->mon_inst[i].addr, sizeof(*addr)) == 0) + for (i = 0; i < m->num_mon; i++) { + if (ceph_addr_equal_no_type(addr, &m->mon_inst[i].addr)) return 1; + } + return 0; } @@ -190,10 +257,16 @@ static void __open_session(struct ceph_mon_client *monc) &monc->monmap->mon_inst[monc->cur_mon].addr); /* - * send an initial keepalive to ensure our timestamp is valid - * by the time we are in an OPENED state + * Queue a keepalive to ensure that in case of an early fault + * the messenger doesn't put us into STANDBY state and instead + * retries. This also ensures that our timestamp is valid by + * the time we finish hunting and delayed_work() checks it. */ ceph_con_keepalive(&monc->con); + if (ceph_msgr2(monc->client)) { + monc->pending_auth = 1; + return; + } /* initiate authentication handshake */ ret = ceph_auth_build_hello(monc->auth, @@ -476,7 +549,7 @@ static void ceph_monc_handle_map(struct ceph_mon_client *monc, p = msg->front.iov_base; end = p + msg->front.iov_len; - monmap = ceph_monmap_decode(p, end); + monmap = ceph_monmap_decode(&p, end, ceph_msgr2(client)); if (IS_ERR(monmap)) { pr_err("problem decoding monmap, %d\n", (int)PTR_ERR(monmap)); @@ -1052,8 +1125,9 @@ static void delayed_work(struct work_struct *work) */ static int build_initial_monmap(struct ceph_mon_client *monc) { + __le32 my_type = ceph_msgr2(monc->client) ? 
+ CEPH_ENTITY_ADDR_TYPE_MSGR2 : CEPH_ENTITY_ADDR_TYPE_LEGACY; struct ceph_options *opt = monc->client->options; - struct ceph_entity_addr *mon_addr = opt->mon_addr; int num_mon = opt->num_mon; int i; @@ -1062,12 +1136,16 @@ static int build_initial_monmap(struct ceph_mon_client *monc) GFP_KERNEL); if (!monc->monmap) return -ENOMEM; + for (i = 0; i < num_mon; i++) { - monc->monmap->mon_inst[i].addr = mon_addr[i]; - monc->monmap->mon_inst[i].addr.nonce = 0; - monc->monmap->mon_inst[i].name.type = - CEPH_ENTITY_TYPE_MON; - monc->monmap->mon_inst[i].name.num = cpu_to_le64(i); + struct ceph_entity_inst *inst = &monc->monmap->mon_inst[i]; + + memcpy(&inst->addr.in_addr, &opt->mon_addr[i].in_addr, + sizeof(inst->addr.in_addr)); + inst->addr.type = my_type; + inst->addr.nonce = 0; + inst->name.type = CEPH_ENTITY_TYPE_MON; + inst->name.num = cpu_to_le64(i); } monc->monmap->num_mon = num_mon; return 0; @@ -1089,8 +1167,8 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl) /* connection */ /* authentication */ - monc->auth = ceph_auth_init(cl->options->name, - cl->options->key); + monc->auth = ceph_auth_init(cl->options->name, cl->options->key, + cl->options->con_modes); if (IS_ERR(monc->auth)) { err = PTR_ERR(monc->auth); goto out_monmap; @@ -1194,30 +1272,22 @@ static void finish_hunting(struct ceph_mon_client *monc) } } -static void handle_auth_reply(struct ceph_mon_client *monc, - struct ceph_msg *msg) +static void finish_auth(struct ceph_mon_client *monc, int auth_err, + bool was_authed) { - int ret; - int was_auth = 0; + dout("%s auth_err %d was_authed %d\n", __func__, auth_err, was_authed); + WARN_ON(auth_err > 0); - mutex_lock(&monc->mutex); - was_auth = ceph_auth_is_authenticated(monc->auth); monc->pending_auth = 0; - ret = ceph_handle_auth_reply(monc->auth, msg->front.iov_base, - msg->front.iov_len, - monc->m_auth->front.iov_base, - monc->m_auth->front_alloc_len); - if (ret > 0) { - __send_prepared_auth_request(monc, ret); - goto out; + if (auth_err) { + monc->client->auth_err = auth_err; + wake_up_all(&monc->client->auth_wq); + return; } - finish_hunting(monc); - - if (ret < 0) { - monc->client->auth_err = ret; - } else if (!was_auth && ceph_auth_is_authenticated(monc->auth)) { - dout("authenticated, starting session\n"); + if (!was_authed && ceph_auth_is_authenticated(monc->auth)) { + dout("%s authenticated, starting session global_id %llu\n", + __func__, monc->auth->global_id); monc->client->msgr.inst.name.type = CEPH_ENTITY_TYPE_CLIENT; monc->client->msgr.inst.name.num = @@ -1229,11 +1299,27 @@ static void handle_auth_reply(struct ceph_mon_client *monc, pr_info("mon%d %s session established\n", monc->cur_mon, ceph_pr_addr(&monc->con.peer_addr)); } +} -out: +static void handle_auth_reply(struct ceph_mon_client *monc, + struct ceph_msg *msg) +{ + bool was_authed; + int ret; + + mutex_lock(&monc->mutex); + was_authed = ceph_auth_is_authenticated(monc->auth); + ret = ceph_handle_auth_reply(monc->auth, msg->front.iov_base, + msg->front.iov_len, + monc->m_auth->front.iov_base, + monc->m_auth->front_alloc_len); + if (ret > 0) { + __send_prepared_auth_request(monc, ret); + } else { + finish_auth(monc, ret, was_authed); + finish_hunting(monc); + } mutex_unlock(&monc->mutex); - if (monc->client->auth_err < 0) - wake_up_all(&monc->client->auth_wq); } static int __validate_auth(struct ceph_mon_client *monc) @@ -1262,10 +1348,92 @@ int ceph_monc_validate_auth(struct ceph_mon_client *monc) } EXPORT_SYMBOL(ceph_monc_validate_auth); +static int mon_get_auth_request(struct 
ceph_connection *con, + void *buf, int *buf_len, + void **authorizer, int *authorizer_len) +{ + struct ceph_mon_client *monc = con->private; + int ret; + + mutex_lock(&monc->mutex); + ret = ceph_auth_get_request(monc->auth, buf, *buf_len); + mutex_unlock(&monc->mutex); + if (ret < 0) + return ret; + + *buf_len = ret; + *authorizer = NULL; + *authorizer_len = 0; + return 0; +} + +static int mon_handle_auth_reply_more(struct ceph_connection *con, + void *reply, int reply_len, + void *buf, int *buf_len, + void **authorizer, int *authorizer_len) +{ + struct ceph_mon_client *monc = con->private; + int ret; + + mutex_lock(&monc->mutex); + ret = ceph_auth_handle_reply_more(monc->auth, reply, reply_len, + buf, *buf_len); + mutex_unlock(&monc->mutex); + if (ret < 0) + return ret; + + *buf_len = ret; + *authorizer = NULL; + *authorizer_len = 0; + return 0; +} + +static int mon_handle_auth_done(struct ceph_connection *con, + u64 global_id, void *reply, int reply_len, + u8 *session_key, int *session_key_len, + u8 *con_secret, int *con_secret_len) +{ + struct ceph_mon_client *monc = con->private; + bool was_authed; + int ret; + + mutex_lock(&monc->mutex); + WARN_ON(!monc->hunting); + was_authed = ceph_auth_is_authenticated(monc->auth); + ret = ceph_auth_handle_reply_done(monc->auth, global_id, + reply, reply_len, + session_key, session_key_len, + con_secret, con_secret_len); + finish_auth(monc, ret, was_authed); + if (!ret) + finish_hunting(monc); + mutex_unlock(&monc->mutex); + return 0; +} + +static int mon_handle_auth_bad_method(struct ceph_connection *con, + int used_proto, int result, + const int *allowed_protos, int proto_cnt, + const int *allowed_modes, int mode_cnt) +{ + struct ceph_mon_client *monc = con->private; + bool was_authed; + + mutex_lock(&monc->mutex); + WARN_ON(!monc->hunting); + was_authed = ceph_auth_is_authenticated(monc->auth); + ceph_auth_handle_bad_method(monc->auth, used_proto, result, + allowed_protos, proto_cnt, + allowed_modes, mode_cnt); + finish_auth(monc, -EACCES, was_authed); + mutex_unlock(&monc->mutex); + return 0; +} + /* * handle incoming message */ -static void dispatch(struct ceph_connection *con, struct ceph_msg *msg) +static void mon_dispatch(struct ceph_connection *con, struct ceph_msg *msg) { struct ceph_mon_client *monc = con->private; int type = le16_to_cpu(msg->hdr.type); @@ -1397,19 +1565,23 @@ static void mon_fault(struct ceph_connection *con) * will come from the messenger workqueue, which is drained prior to * mon_client destruction. 
*/ -static struct ceph_connection *con_get(struct ceph_connection *con) +static struct ceph_connection *mon_get_con(struct ceph_connection *con) { return con; } -static void con_put(struct ceph_connection *con) +static void mon_put_con(struct ceph_connection *con) { } static const struct ceph_connection_operations mon_con_ops = { - .get = con_get, - .put = con_put, - .dispatch = dispatch, - .fault = mon_fault, + .get = mon_get_con, + .put = mon_put_con, .alloc_msg = mon_alloc_msg, + .dispatch = mon_dispatch, + .fault = mon_fault, + .get_auth_request = mon_get_auth_request, + .handle_auth_reply_more = mon_handle_auth_reply_more, + .handle_auth_done = mon_handle_auth_done, + .handle_auth_bad_method = mon_handle_auth_bad_method, }; diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 7901ab6c79fd..ff8624a7c964 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -3918,9 +3918,11 @@ static int handle_one_map(struct ceph_osd_client *osdc, set_pool_was_full(osdc); if (incremental) - newmap = osdmap_apply_incremental(&p, end, osdc->osdmap); + newmap = osdmap_apply_incremental(&p, end, + ceph_msgr2(osdc->client), + osdc->osdmap); else - newmap = ceph_osdmap_decode(&p, end); + newmap = ceph_osdmap_decode(&p, end, ceph_msgr2(osdc->client)); if (IS_ERR(newmap)) return PTR_ERR(newmap); @@ -5410,7 +5412,7 @@ void ceph_osdc_cleanup(void) /* * handle incoming message */ -static void dispatch(struct ceph_connection *con, struct ceph_msg *msg) +static void osd_dispatch(struct ceph_connection *con, struct ceph_msg *msg) { struct ceph_osd *osd = con->private; struct ceph_osd_client *osdc = osd->o_osdc; @@ -5532,9 +5534,9 @@ static struct ceph_msg *alloc_msg_with_page_vector(struct ceph_msg_header *hdr) return m; } -static struct ceph_msg *alloc_msg(struct ceph_connection *con, - struct ceph_msg_header *hdr, - int *skip) +static struct ceph_msg *osd_alloc_msg(struct ceph_connection *con, + struct ceph_msg_header *hdr, + int *skip) { struct ceph_osd *osd = con->private; int type = le16_to_cpu(hdr->type); @@ -5558,7 +5560,7 @@ static struct ceph_msg *alloc_msg(struct ceph_connection *con, /* * Wrappers to refcount containing ceph_osd struct */ -static struct ceph_connection *get_osd_con(struct ceph_connection *con) +static struct ceph_connection *osd_get_con(struct ceph_connection *con) { struct ceph_osd *osd = con->private; if (get_osd(osd)) @@ -5566,7 +5568,7 @@ static struct ceph_connection *get_osd_con(struct ceph_connection *con) return NULL; } -static void put_osd_con(struct ceph_connection *con) +static void osd_put_con(struct ceph_connection *con) { struct ceph_osd *osd = con->private; put_osd(osd); @@ -5575,39 +5577,29 @@ static void put_osd_con(struct ceph_connection *con) /* * authentication */ + /* * Note: returned pointer is the address of a structure that's * managed separately. Caller must *not* attempt to free it. 
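 *
 * This is the legacy msgr1 authorizer path; msgr2 connections use the
 * get_auth_request/handle_auth_done callbacks added further below.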
*/ -static struct ceph_auth_handshake *get_authorizer(struct ceph_connection *con, - int *proto, int force_new) +static struct ceph_auth_handshake * +osd_get_authorizer(struct ceph_connection *con, int *proto, int force_new) { struct ceph_osd *o = con->private; struct ceph_osd_client *osdc = o->o_osdc; struct ceph_auth_client *ac = osdc->client->monc.auth; struct ceph_auth_handshake *auth = &o->o_auth; + int ret; - if (force_new && auth->authorizer) { - ceph_auth_destroy_authorizer(auth->authorizer); - auth->authorizer = NULL; - } - if (!auth->authorizer) { - int ret = ceph_auth_create_authorizer(ac, CEPH_ENTITY_TYPE_OSD, - auth); - if (ret) - return ERR_PTR(ret); - } else { - int ret = ceph_auth_update_authorizer(ac, CEPH_ENTITY_TYPE_OSD, - auth); - if (ret) - return ERR_PTR(ret); - } - *proto = ac->protocol; + ret = __ceph_auth_get_authorizer(ac, auth, CEPH_ENTITY_TYPE_OSD, + force_new, proto, NULL, NULL); + if (ret) + return ERR_PTR(ret); return auth; } -static int add_authorizer_challenge(struct ceph_connection *con, +static int osd_add_authorizer_challenge(struct ceph_connection *con, void *challenge_buf, int challenge_buf_len) { struct ceph_osd *o = con->private; @@ -5618,16 +5610,19 @@ static int add_authorizer_challenge(struct ceph_connection *con, challenge_buf, challenge_buf_len); } -static int verify_authorizer_reply(struct ceph_connection *con) +static int osd_verify_authorizer_reply(struct ceph_connection *con) { struct ceph_osd *o = con->private; struct ceph_osd_client *osdc = o->o_osdc; struct ceph_auth_client *ac = osdc->client->monc.auth; + struct ceph_auth_handshake *auth = &o->o_auth; - return ceph_auth_verify_authorizer_reply(ac, o->o_auth.authorizer); + return ceph_auth_verify_authorizer_reply(ac, auth->authorizer, + auth->authorizer_reply_buf, auth->authorizer_reply_buf_len, + NULL, NULL, NULL, NULL); } -static int invalidate_authorizer(struct ceph_connection *con) +static int osd_invalidate_authorizer(struct ceph_connection *con) { struct ceph_osd *o = con->private; struct ceph_osd_client *osdc = o->o_osdc; @@ -5637,6 +5632,80 @@ static int invalidate_authorizer(struct ceph_connection *con) return ceph_monc_validate_auth(&osdc->client->monc); } +static int osd_get_auth_request(struct ceph_connection *con, + void *buf, int *buf_len, + void **authorizer, int *authorizer_len) +{ + struct ceph_osd *o = con->private; + struct ceph_auth_client *ac = o->o_osdc->client->monc.auth; + struct ceph_auth_handshake *auth = &o->o_auth; + int ret; + + ret = ceph_auth_get_authorizer(ac, auth, CEPH_ENTITY_TYPE_OSD, + buf, buf_len); + if (ret) + return ret; + + *authorizer = auth->authorizer_buf; + *authorizer_len = auth->authorizer_buf_len; + return 0; +} + +static int osd_handle_auth_reply_more(struct ceph_connection *con, + void *reply, int reply_len, + void *buf, int *buf_len, + void **authorizer, int *authorizer_len) +{ + struct ceph_osd *o = con->private; + struct ceph_auth_client *ac = o->o_osdc->client->monc.auth; + struct ceph_auth_handshake *auth = &o->o_auth; + int ret; + + ret = ceph_auth_handle_svc_reply_more(ac, auth, reply, reply_len, + buf, buf_len); + if (ret) + return ret; + + *authorizer = auth->authorizer_buf; + *authorizer_len = auth->authorizer_buf_len; + return 0; +} + +static int osd_handle_auth_done(struct ceph_connection *con, + u64 global_id, void *reply, int reply_len, + u8 *session_key, int *session_key_len, + u8 *con_secret, int *con_secret_len) +{ + struct ceph_osd *o = con->private; + struct ceph_auth_client *ac = o->o_osdc->client->monc.auth; + struct 
ceph_auth_handshake *auth = &o->o_auth; + + return ceph_auth_handle_svc_reply_done(ac, auth, reply, reply_len, + session_key, session_key_len, + con_secret, con_secret_len); +} + +static int osd_handle_auth_bad_method(struct ceph_connection *con, + int used_proto, int result, + const int *allowed_protos, int proto_cnt, + const int *allowed_modes, int mode_cnt) +{ + struct ceph_osd *o = con->private; + struct ceph_mon_client *monc = &o->o_osdc->client->monc; + int ret; + + if (ceph_auth_handle_bad_authorizer(monc->auth, CEPH_ENTITY_TYPE_OSD, + used_proto, result, + allowed_protos, proto_cnt, + allowed_modes, mode_cnt)) { + ret = ceph_monc_validate_auth(monc); + if (ret) + return ret; + } + + return -EACCES; +} + static void osd_reencode_message(struct ceph_msg *msg) { int type = le16_to_cpu(msg->hdr.type); @@ -5662,16 +5731,20 @@ static int osd_check_message_signature(struct ceph_msg *msg) } static const struct ceph_connection_operations osd_con_ops = { - .get = get_osd_con, - .put = put_osd_con, - .dispatch = dispatch, - .get_authorizer = get_authorizer, - .add_authorizer_challenge = add_authorizer_challenge, - .verify_authorizer_reply = verify_authorizer_reply, - .invalidate_authorizer = invalidate_authorizer, - .alloc_msg = alloc_msg, + .get = osd_get_con, + .put = osd_put_con, + .alloc_msg = osd_alloc_msg, + .dispatch = osd_dispatch, + .fault = osd_fault, .reencode_message = osd_reencode_message, + .get_authorizer = osd_get_authorizer, + .add_authorizer_challenge = osd_add_authorizer_challenge, + .verify_authorizer_reply = osd_verify_authorizer_reply, + .invalidate_authorizer = osd_invalidate_authorizer, .sign_message = osd_sign_message, .check_message_signature = osd_check_message_signature, - .fault = osd_fault, + .get_auth_request = osd_get_auth_request, + .handle_auth_reply_more = osd_handle_auth_reply_more, + .handle_auth_done = osd_handle_auth_done, + .handle_auth_bad_method = osd_handle_auth_bad_method, }; diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index fa08c15be0c0..2b1dd252f231 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -1647,7 +1647,8 @@ static int decode_old_pg_upmap_items(void **p, void *end, /* * decode a full map. */ -static int osdmap_decode(void **p, void *end, struct ceph_osdmap *map) +static int osdmap_decode(void **p, void *end, bool msgr2, + struct ceph_osdmap *map) { u8 struct_v; u32 epoch = 0; @@ -1718,9 +1719,16 @@ static int osdmap_decode(void **p, void *end, struct ceph_osdmap *map) goto e_inval; for (i = 0; i < map->max_osd; i++) { - err = ceph_decode_entity_addr(p, end, &map->osd_addr[i]); + struct ceph_entity_addr *addr = &map->osd_addr[i]; + + if (struct_v >= 8) + err = ceph_decode_entity_addrvec(p, end, msgr2, addr); + else + err = ceph_decode_entity_addr(p, end, addr); if (err) goto bad; + + dout("%s osd%d addr %s\n", __func__, i, ceph_pr_addr(addr)); } /* pg_temp */ @@ -1790,7 +1798,7 @@ bad: /* * Allocate and decode a full map. 
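 *
 * Callers now indicate whether msgr2-style addresses are preferred
 * when entity addrvecs are decoded, e.g. (as in handle_one_map()
 * earlier):
 *
 *	newmap = ceph_osdmap_decode(&p, end, ceph_msgr2(osdc->client));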
*/ -struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end) +struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end, bool msgr2) { struct ceph_osdmap *map; int ret; @@ -1799,7 +1807,7 @@ struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end) if (!map) return ERR_PTR(-ENOMEM); - ret = osdmap_decode(p, end, map); + ret = osdmap_decode(p, end, msgr2, map); if (ret) { ceph_osdmap_destroy(map); return ERR_PTR(ret); @@ -1817,12 +1825,13 @@ struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end) * new_state: { osd=6, xorstate=EXISTS } # clear osd_state */ static int decode_new_up_state_weight(void **p, void *end, u8 struct_v, - struct ceph_osdmap *map) + bool msgr2, struct ceph_osdmap *map) { void *new_up_client; void *new_state; void *new_weight_end; u32 len; + int ret; int i; new_up_client = *p; @@ -1831,8 +1840,12 @@ static int decode_new_up_state_weight(void **p, void *end, u8 struct_v, struct ceph_entity_addr addr; ceph_decode_skip_32(p, end, e_inval); - if (ceph_decode_entity_addr(p, end, &addr)) - goto e_inval; + if (struct_v >= 7) + ret = ceph_decode_entity_addrvec(p, end, msgr2, &addr); + else + ret = ceph_decode_entity_addr(p, end, &addr); + if (ret) + return ret; } new_state = *p; @@ -1874,7 +1887,6 @@ static int decode_new_up_state_weight(void **p, void *end, u8 struct_v, while (len--) { s32 osd; u32 xorstate; - int ret; osd = ceph_decode_32(p); if (struct_v >= 5) @@ -1910,8 +1922,15 @@ static int decode_new_up_state_weight(void **p, void *end, u8 struct_v, osd = ceph_decode_32(p); BUG_ON(osd >= map->max_osd); - if (ceph_decode_entity_addr(p, end, &addr)) - goto e_inval; + if (struct_v >= 7) + ret = ceph_decode_entity_addrvec(p, end, msgr2, &addr); + else + ret = ceph_decode_entity_addr(p, end, &addr); + if (ret) + return ret; + + dout("%s osd%d addr %s\n", __func__, osd, ceph_pr_addr(&addr)); + pr_info("osd%d up\n", osd); map->osd_state[osd] |= CEPH_OSD_EXISTS | CEPH_OSD_UP; map->osd_addr[osd] = addr; @@ -1927,7 +1946,7 @@ e_inval: /* * decode and apply an incremental map update. */ -struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, +struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, bool msgr2, struct ceph_osdmap *map) { struct ceph_fsid fsid; @@ -1962,7 +1981,7 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, if (len > 0) { dout("apply_incremental full map len %d, %p to %p\n", len, *p, end); - return ceph_osdmap_decode(p, min(*p+len, end)); + return ceph_osdmap_decode(p, min(*p+len, end), msgr2); } /* new crush? */ @@ -2014,7 +2033,7 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, } /* new_up_client, new_state, new_weight */ - err = decode_new_up_state_weight(p, end, struct_v, map); + err = decode_new_up_state_weight(p, end, struct_v, msgr2, map); if (err) goto bad; diff --git a/net/core/dev.c b/net/core/dev.c index a46334906c94..a979b86dbacd 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1457,6 +1457,25 @@ void netdev_state_change(struct net_device *dev) EXPORT_SYMBOL(netdev_state_change); /** + * __netdev_notify_peers - notify network peers about existence of @dev, + * to be called when rtnl lock is already held. + * @dev: network device + * + * Generate traffic such that interested network peers are aware of + * @dev, such as by generating a gratuitous ARP. This may be used when + * a device wants to inform the rest of the network about some sort of + * reconfiguration such as a failover event or virtual machine + * migration. 
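+ *
+ * A minimal sketch of the intended calling pattern (illustrative):
+ *
+ *	ASSERT_RTNL();
+ *	__netdev_notify_peers(dev);
+ *
+ * Callers that do not already hold the lock should keep using
+ * netdev_notify_peers(), which is now a rtnl_lock()/rtnl_unlock()
+ * wrapper around this helper.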
+ */ +void __netdev_notify_peers(struct net_device *dev) +{ + ASSERT_RTNL(); + call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev); + call_netdevice_notifiers(NETDEV_RESEND_IGMP, dev); +} +EXPORT_SYMBOL(__netdev_notify_peers); + +/** * netdev_notify_peers - notify network peers about existence of @dev * @dev: network device * @@ -1469,8 +1488,7 @@ EXPORT_SYMBOL(netdev_state_change); void netdev_notify_peers(struct net_device *dev) { rtnl_lock(); - call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev); - call_netdevice_notifiers(NETDEV_RESEND_IGMP, dev); + __netdev_notify_peers(dev); rtnl_unlock(); } EXPORT_SYMBOL(netdev_notify_peers); @@ -9643,9 +9661,20 @@ static netdev_features_t netdev_fix_features(struct net_device *dev, } } - if ((features & NETIF_F_HW_TLS_TX) && !(features & NETIF_F_HW_CSUM)) { - netdev_dbg(dev, "Dropping TLS TX HW offload feature since no CSUM feature.\n"); - features &= ~NETIF_F_HW_TLS_TX; + if (features & NETIF_F_HW_TLS_TX) { + bool ip_csum = (features & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM)) == + (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM); + bool hw_csum = features & NETIF_F_HW_CSUM; + + if (!ip_csum && !hw_csum) { + netdev_dbg(dev, "Dropping TLS TX HW offload feature since no CSUM feature.\n"); + features &= ~NETIF_F_HW_TLS_TX; + } + } + + if ((features & NETIF_F_HW_TLS_RX) && !(features & NETIF_F_RXCSUM)) { + netdev_dbg(dev, "Dropping TLS RX HW offload feature since no RXCSUM feature.\n"); + features &= ~NETIF_F_HW_TLS_RX; } return features; @@ -10059,17 +10088,11 @@ int register_netdevice(struct net_device *dev) ret = call_netdevice_notifiers(NETDEV_REGISTER, dev); ret = notifier_to_errno(ret); if (ret) { + /* Expect explicit free_netdev() on failure */ + dev->needs_free_netdev = false; rollback_registered(dev); - rcu_barrier(); - - dev->reg_state = NETREG_UNREGISTERED; - /* We should put the kobject that hold in - * netdev_unregister_kobject(), otherwise - * the net device cannot be freed when - * driver calls free_netdev(), because the - * kobject is being hold. - */ - kobject_put(&dev->dev.kobj); + net_set_todo(dev); + goto out; } /* * Prevent userspace races by waiting until the network @@ -10613,6 +10636,17 @@ void free_netdev(struct net_device *dev) struct napi_struct *p, *n; might_sleep(); + + /* When called immediately after register_netdevice() failed the unwind + * handling may still be dismantling the device. Handle that case by + * deferring the free. 
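+	 *
+	 * This is what lets error paths simply do (illustrative):
+	 *
+	 *	err = register_netdevice(dev);
+	 *	if (err)
+	 *		free_netdev(dev);
+	 *
+	 * without checking reg_state themselves: if the unwind is still
+	 * in flight, the device is only marked here and the actual free
+	 * happens once unregistration completes.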
+ */ + if (dev->reg_state == NETREG_UNREGISTERING) { + ASSERT_RTNL(); + dev->needs_free_netdev = true; + return; + } + netif_free_tx_queues(dev); netif_free_rx_queues(dev); diff --git a/net/core/devlink.c b/net/core/devlink.c index ee828e4b1007..738d4344d679 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -4146,7 +4146,7 @@ out: static int devlink_nl_cmd_port_param_get_doit(struct sk_buff *skb, struct genl_info *info) { - struct devlink_port *devlink_port = info->user_ptr[0]; + struct devlink_port *devlink_port = info->user_ptr[1]; struct devlink_param_item *param_item; struct sk_buff *msg; int err; @@ -4175,7 +4175,7 @@ static int devlink_nl_cmd_port_param_get_doit(struct sk_buff *skb, static int devlink_nl_cmd_port_param_set_doit(struct sk_buff *skb, struct genl_info *info) { - struct devlink_port *devlink_port = info->user_ptr[0]; + struct devlink_port *devlink_port = info->user_ptr[1]; return __devlink_nl_cmd_param_set_doit(devlink_port->devlink, devlink_port->index, diff --git a/net/core/gen_estimator.c b/net/core/gen_estimator.c index 80dbf2f4016e..8e582e29a41e 100644 --- a/net/core/gen_estimator.c +++ b/net/core/gen_estimator.c @@ -80,11 +80,11 @@ static void est_timer(struct timer_list *t) u64 rate, brate; est_fetch_counters(est, &b); - brate = (b.bytes - est->last_bytes) << (10 - est->ewma_log - est->intvl_log); - brate -= (est->avbps >> est->ewma_log); + brate = (b.bytes - est->last_bytes) << (10 - est->intvl_log); + brate = (brate >> est->ewma_log) - (est->avbps >> est->ewma_log); - rate = (b.packets - est->last_packets) << (10 - est->ewma_log - est->intvl_log); - rate -= (est->avpps >> est->ewma_log); + rate = (b.packets - est->last_packets) << (10 - est->intvl_log); + rate = (rate >> est->ewma_log) - (est->avpps >> est->ewma_log); write_seqcount_begin(&est->seq); est->avbps += brate; @@ -143,6 +143,9 @@ int gen_new_estimator(struct gnet_stats_basic_packed *bstats, if (parm->interval < -2 || parm->interval > 3) return -EINVAL; + if (parm->ewma_log == 0 || parm->ewma_log >= 31) + return -EINVAL; + est = kzalloc(sizeof(*est), GFP_KERNEL); if (!est) return -ENOBUFS; diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 9500d28a43b0..277ed854aef1 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -1569,10 +1569,8 @@ static void neigh_proxy_process(struct timer_list *t) void pneigh_enqueue(struct neigh_table *tbl, struct neigh_parms *p, struct sk_buff *skb) { - unsigned long now = jiffies; - - unsigned long sched_next = now + (prandom_u32() % - NEIGH_VAR(p, PROXY_DELAY)); + unsigned long sched_next = jiffies + + prandom_u32_max(NEIGH_VAR(p, PROXY_DELAY)); if (tbl->proxy_queue.qlen > NEIGH_VAR(p, PROXY_QLEN)) { kfree_skb(skb); diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 999b70c59761..daf502c13d6d 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -1317,8 +1317,8 @@ static const struct attribute_group dql_group = { static ssize_t xps_cpus_show(struct netdev_queue *queue, char *buf) { + int cpu, len, ret, num_tc = 1, tc = 0; struct net_device *dev = queue->dev; - int cpu, len, num_tc = 1, tc = 0; struct xps_dev_maps *dev_maps; cpumask_var_t mask; unsigned long index; @@ -1328,22 +1328,31 @@ static ssize_t xps_cpus_show(struct netdev_queue *queue, index = get_netdev_queue_index(queue); + if (!rtnl_trylock()) + return restart_syscall(); + if (dev->num_tc) { /* Do not allow XPS on subordinate device directly */ num_tc = dev->num_tc; - if (num_tc < 0) - return -EINVAL; + if (num_tc < 0) { + ret = -EINVAL; + goto 
err_rtnl_unlock; + } /* If queue belongs to subordinate dev use its map */ dev = netdev_get_tx_queue(dev, index)->sb_dev ? : dev; tc = netdev_txq_to_tc(dev, index); - if (tc < 0) - return -EINVAL; + if (tc < 0) { + ret = -EINVAL; + goto err_rtnl_unlock; + } } - if (!zalloc_cpumask_var(&mask, GFP_KERNEL)) - return -ENOMEM; + if (!zalloc_cpumask_var(&mask, GFP_KERNEL)) { + ret = -ENOMEM; + goto err_rtnl_unlock; + } rcu_read_lock(); dev_maps = rcu_dereference(dev->xps_cpus_map); @@ -1366,9 +1375,15 @@ static ssize_t xps_cpus_show(struct netdev_queue *queue, } rcu_read_unlock(); + rtnl_unlock(); + len = snprintf(buf, PAGE_SIZE, "%*pb\n", cpumask_pr_args(mask)); free_cpumask_var(mask); return len < PAGE_SIZE ? len : -EINVAL; + +err_rtnl_unlock: + rtnl_unlock(); + return ret; } static ssize_t xps_cpus_store(struct netdev_queue *queue, @@ -1396,7 +1411,13 @@ static ssize_t xps_cpus_store(struct netdev_queue *queue, return err; } + if (!rtnl_trylock()) { + free_cpumask_var(mask); + return restart_syscall(); + } + err = netif_set_xps_queue(dev, mask, index); + rtnl_unlock(); free_cpumask_var(mask); @@ -1408,22 +1429,29 @@ static struct netdev_queue_attribute xps_cpus_attribute __ro_after_init static ssize_t xps_rxqs_show(struct netdev_queue *queue, char *buf) { + int j, len, ret, num_tc = 1, tc = 0; struct net_device *dev = queue->dev; struct xps_dev_maps *dev_maps; unsigned long *mask, index; - int j, len, num_tc = 1, tc = 0; index = get_netdev_queue_index(queue); + if (!rtnl_trylock()) + return restart_syscall(); + if (dev->num_tc) { num_tc = dev->num_tc; tc = netdev_txq_to_tc(dev, index); - if (tc < 0) - return -EINVAL; + if (tc < 0) { + ret = -EINVAL; + goto err_rtnl_unlock; + } } mask = bitmap_zalloc(dev->num_rx_queues, GFP_KERNEL); - if (!mask) - return -ENOMEM; + if (!mask) { + ret = -ENOMEM; + goto err_rtnl_unlock; + } rcu_read_lock(); dev_maps = rcu_dereference(dev->xps_rxqs_map); @@ -1449,10 +1477,16 @@ static ssize_t xps_rxqs_show(struct netdev_queue *queue, char *buf) out_no_maps: rcu_read_unlock(); + rtnl_unlock(); + len = bitmap_print_to_pagebuf(false, buf, mask, dev->num_rx_queues); bitmap_free(mask); return len < PAGE_SIZE ? len : -EINVAL; + +err_rtnl_unlock: + rtnl_unlock(); + return ret; } static ssize_t xps_rxqs_store(struct netdev_queue *queue, const char *buf, @@ -1478,10 +1512,17 @@ static ssize_t xps_rxqs_store(struct netdev_queue *queue, const char *buf, return err; } + if (!rtnl_trylock()) { + bitmap_free(mask); + return restart_syscall(); + } + cpus_read_lock(); err = __netif_set_xps_queue(dev, mask, index, true); cpus_read_unlock(); + rtnl_unlock(); + bitmap_free(mask); return err ? : len; } diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index bb0596c41b3e..3d6ab194d0f5 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -3439,26 +3439,15 @@ replay: dev->ifindex = ifm->ifi_index; - if (ops->newlink) { + if (ops->newlink) err = ops->newlink(link_net ? : net, dev, tb, data, extack); - /* Drivers should call free_netdev() in ->destructor - * and unregister it on failure after registration - * so that device could be finally freed in rtnl_unlock. 
- */ - if (err < 0) { - /* If device is not registered at all, free it now */ - if (dev->reg_state == NETREG_UNINITIALIZED || - dev->reg_state == NETREG_UNREGISTERED) - free_netdev(dev); - goto out; - } - } else { + else err = register_netdevice(dev); - if (err < 0) { - free_netdev(dev); - goto out; - } + if (err < 0) { + free_netdev(dev); + goto out; } + err = rtnl_configure_link(dev, ifm); if (err < 0) goto out_unregister; diff --git a/net/core/skbuff.c b/net/core/skbuff.c index f62cae3f75d8..785daff48030 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -437,7 +437,11 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len, len += NET_SKB_PAD; - if ((len > SKB_WITH_OVERHEAD(PAGE_SIZE)) || + /* If requested length is either too small or too big, + * we use kmalloc() for skb->head allocation. + */ + if (len <= SKB_WITH_OVERHEAD(1024) || + len > SKB_WITH_OVERHEAD(PAGE_SIZE) || (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) { skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX, NUMA_NO_NODE); if (!skb) @@ -501,13 +505,17 @@ EXPORT_SYMBOL(__netdev_alloc_skb); struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len, gfp_t gfp_mask) { - struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache); + struct napi_alloc_cache *nc; struct sk_buff *skb; void *data; len += NET_SKB_PAD + NET_IP_ALIGN; - if ((len > SKB_WITH_OVERHEAD(PAGE_SIZE)) || + /* If requested length is either too small or too big, + * we use kmalloc() for skb->head allocation. + */ + if (len <= SKB_WITH_OVERHEAD(1024) || + len > SKB_WITH_OVERHEAD(PAGE_SIZE) || (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) { skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX, NUMA_NO_NODE); if (!skb) @@ -515,6 +523,7 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len, goto skb_success; } + nc = this_cpu_ptr(&napi_alloc_cache); len += SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); len = SKB_DATA_ALIGN(len); @@ -3442,6 +3451,7 @@ void skb_prepare_seq_read(struct sk_buff *skb, unsigned int from, st->root_skb = st->cur_skb = skb; st->frag_idx = st->stepped_offset = 0; st->frag_data = NULL; + st->frag_off = 0; } EXPORT_SYMBOL(skb_prepare_seq_read); @@ -3496,14 +3506,27 @@ next_skb: st->stepped_offset += skb_headlen(st->cur_skb); while (st->frag_idx < skb_shinfo(st->cur_skb)->nr_frags) { + unsigned int pg_idx, pg_off, pg_sz; + frag = &skb_shinfo(st->cur_skb)->frags[st->frag_idx]; - block_limit = skb_frag_size(frag) + st->stepped_offset; + pg_idx = 0; + pg_off = skb_frag_off(frag); + pg_sz = skb_frag_size(frag); + + if (skb_frag_must_loop(skb_frag_page(frag))) { + pg_idx = (pg_off + st->frag_off) >> PAGE_SHIFT; + pg_off = offset_in_page(pg_off + st->frag_off); + pg_sz = min_t(unsigned int, pg_sz - st->frag_off, + PAGE_SIZE - pg_off); + } + + block_limit = pg_sz + st->stepped_offset; if (abs_offset < block_limit) { if (!st->frag_data) - st->frag_data = kmap_atomic(skb_frag_page(frag)); + st->frag_data = kmap_atomic(skb_frag_page(frag) + pg_idx); - *data = (u8 *) st->frag_data + skb_frag_off(frag) + + *data = (u8 *)st->frag_data + pg_off + (abs_offset - st->stepped_offset); return block_limit - abs_offset; @@ -3514,8 +3537,12 @@ next_skb: st->frag_data = NULL; } - st->frag_idx++; - st->stepped_offset += skb_frag_size(frag); + st->stepped_offset += pg_sz; + st->frag_off += pg_sz; + if (st->frag_off == skb_frag_size(frag)) { + st->frag_off = 0; + st->frag_idx++; + } } if (st->frag_data) { @@ -3655,7 +3682,8 @@ struct sk_buff *skb_segment_list(struct sk_buff *skb, unsigned int 
delta_truesize = 0; unsigned int delta_len = 0; struct sk_buff *tail = NULL; - struct sk_buff *nskb; + struct sk_buff *nskb, *tmp; + int err; skb_push(skb, -skb_network_offset(skb) + offset); @@ -3665,11 +3693,28 @@ struct sk_buff *skb_segment_list(struct sk_buff *skb, nskb = list_skb; list_skb = list_skb->next; + err = 0; + if (skb_shared(nskb)) { + tmp = skb_clone(nskb, GFP_ATOMIC); + if (tmp) { + consume_skb(nskb); + nskb = tmp; + err = skb_unclone(nskb, GFP_ATOMIC); + } else { + err = -ENOMEM; + } + } + if (!tail) skb->next = nskb; else tail->next = nskb; + if (unlikely(err)) { + nskb->next = list_skb; + goto err_linearize; + } + tail = nskb; delta_len += nskb->len; diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c index bbdd3c7b6cb5..b065f0a103ed 100644 --- a/net/core/sock_reuseport.c +++ b/net/core/sock_reuseport.c @@ -293,7 +293,7 @@ select_by_hash: i = j = reciprocal_scale(hash, socks); while (reuse->socks[i]->sk_state == TCP_ESTABLISHED) { i++; - if (i >= reuse->num_socks) + if (i >= socks) i = 0; if (i == j) goto out; diff --git a/net/dcb/dcbnl.c b/net/dcb/dcbnl.c index 084e159a12ba..653e3bc9c87b 100644 --- a/net/dcb/dcbnl.c +++ b/net/dcb/dcbnl.c @@ -1765,6 +1765,8 @@ static int dcb_doit(struct sk_buff *skb, struct nlmsghdr *nlh, fn = &reply_funcs[dcb->cmd]; if (!fn->cb) return -EOPNOTSUPP; + if (fn->type == RTM_SETDCB && !netlink_capable(skb, CAP_NET_ADMIN)) + return -EPERM; if (!tb[DCB_ATTR_IFNAME]) return -EINVAL; diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index 183003e45762..a47e0f9b20d0 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -353,9 +353,13 @@ static int dsa_port_devlink_setup(struct dsa_port *dp) static void dsa_port_teardown(struct dsa_port *dp) { + struct devlink_port *dlp = &dp->devlink_port; + if (!dp->setup) return; + devlink_port_type_clear(dlp); + switch (dp->type) { case DSA_PORT_TYPE_UNUSED: break; diff --git a/net/dsa/master.c b/net/dsa/master.c index 5a0f6fec4271..cb3a5cf99b25 100644 --- a/net/dsa/master.c +++ b/net/dsa/master.c @@ -309,8 +309,18 @@ static struct lock_class_key dsa_master_addr_list_lock_key; int dsa_master_setup(struct net_device *dev, struct dsa_port *cpu_dp) { int mtu = ETH_DATA_LEN + cpu_dp->tag_ops->overhead; + struct dsa_switch *ds = cpu_dp->ds; + struct device_link *consumer_link; int ret; + /* The DSA master must use SET_NETDEV_DEV for this to work. 
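+	 *
+	 * That is, the master driver is expected to have done something
+	 * like the following in its probe path (hypothetical example):
+	 *
+	 *	SET_NETDEV_DEV(master_netdev, &pdev->dev);
+	 *
+	 * so that dev->dev.parent below points at a real struct device
+	 * for the device link to attach to.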
*/ + consumer_link = device_link_add(ds->dev, dev->dev.parent, + DL_FLAG_AUTOREMOVE_CONSUMER); + if (!consumer_link) + netdev_err(dev, + "Failed to create a device link to DSA switch %s\n", + dev_name(ds->dev)); + rtnl_lock(); ret = dev_set_mtu(dev, mtu); rtnl_unlock(); diff --git a/net/ethtool/channels.c b/net/ethtool/channels.c index 5635604cb9ba..25a9e566ef5c 100644 --- a/net/ethtool/channels.c +++ b/net/ethtool/channels.c @@ -194,8 +194,9 @@ int ethnl_set_channels(struct sk_buff *skb, struct genl_info *info) if (netif_is_rxfh_configured(dev) && !ethtool_get_max_rxfh_channel(dev, &max_rx_in_use) && (channels.combined_count + channels.rx_count) <= max_rx_in_use) { + ret = -EINVAL; GENL_SET_ERR_MSG(info, "requested channel counts are too low for existing indirection table settings"); - return -EINVAL; + goto out_ops; } /* Disabling channels, query zero-copy AF_XDP sockets */ @@ -203,8 +204,9 @@ int ethnl_set_channels(struct sk_buff *skb, struct genl_info *info) min(channels.rx_count, channels.tx_count); for (i = from_channel; i < old_total; i++) if (xsk_get_pool_from_qid(dev, i)) { + ret = -EINVAL; GENL_SET_ERR_MSG(info, "requested channel counts are too low for existing zerocopy AF_XDP sockets"); - return -EINVAL; + goto out_ops; } ret = dev->ethtool_ops->set_channels(dev, &channels); diff --git a/net/ethtool/strset.c b/net/ethtool/strset.c index 0baad0ce1832..c3a5489964cd 100644 --- a/net/ethtool/strset.c +++ b/net/ethtool/strset.c @@ -182,7 +182,7 @@ static int strset_parse_request(struct ethnl_req_info *req_base, ret = strset_get_id(attr, &id, extack); if (ret < 0) return ret; - if (ret >= ETH_SS_COUNT) { + if (id >= ETH_SS_COUNT) { NL_SET_ERR_MSG_ATTR(extack, attr, "unknown string set id"); return -EOPNOTSUPP; diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index 8b07f3a4f2db..a3271ec3e162 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -443,7 +443,6 @@ static int esp_output_encap(struct xfrm_state *x, struct sk_buff *skb, int esp_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info *esp) { u8 *tail; - u8 *vaddr; int nfrags; int esph_offset; struct page *page; @@ -485,14 +484,10 @@ int esp_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info * page = pfrag->page; get_page(page); - vaddr = kmap_atomic(page); - - tail = vaddr + pfrag->offset; + tail = page_address(page) + pfrag->offset; esp_output_fill_trailer(tail, esp->tfclen, esp->plen, esp->proto); - kunmap_atomic(vaddr); - nfrags = skb_shinfo(skb)->nr_frags; __skb_fill_page_desc(skb, nfrags, page, pfrag->offset, diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index cdf6ec5aa45d..84bb707bd88d 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -292,7 +292,7 @@ __be32 fib_compute_spec_dst(struct sk_buff *skb) .flowi4_iif = LOOPBACK_IFINDEX, .flowi4_oif = l3mdev_master_ifindex_rcu(dev), .daddr = ip_hdr(skb)->saddr, - .flowi4_tos = RT_TOS(ip_hdr(skb)->tos), + .flowi4_tos = ip_hdr(skb)->tos & IPTOS_RT_MASK, .flowi4_scope = scope, .flowi4_mark = vmark ? 
skb->mark : 0, }; diff --git a/net/ipv4/gre_demux.c b/net/ipv4/gre_demux.c index 66fdbfe5447c..5d1e6fe9d838 100644 --- a/net/ipv4/gre_demux.c +++ b/net/ipv4/gre_demux.c @@ -128,7 +128,7 @@ int gre_parse_header(struct sk_buff *skb, struct tnl_ptk_info *tpi, * to 0 and sets the configured key in the * inner erspan header field */ - if (greh->protocol == htons(ETH_P_ERSPAN) || + if ((greh->protocol == htons(ETH_P_ERSPAN) && hdr_len != 4) || greh->protocol == htons(ETH_P_ERSPAN2)) { struct erspan_base_hdr *ershdr; diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index fd8b8800a2c3..6bd7ca09af03 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -851,6 +851,7 @@ struct sock *inet_csk_clone_lock(const struct sock *sk, newicsk->icsk_retransmits = 0; newicsk->icsk_backoff = 0; newicsk->icsk_probes_out = 0; + newicsk->icsk_probes_tstamp = 0; /* Deinitialize accept_queue to trap illegal accesses. */ memset(&newicsk->icsk_accept_queue, 0, sizeof(newicsk->icsk_accept_queue)); diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 89fff5f59eea..2ed0b01f72f0 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -302,7 +302,7 @@ static int __ip_finish_output(struct net *net, struct sock *sk, struct sk_buff * if (skb_is_gso(skb)) return ip_finish_output_gso(net, sk, skb, mtu); - if (skb->len > mtu || (IPCB(skb)->flags & IPSKB_FRAG_PMTU)) + if (skb->len > mtu || IPCB(skb)->frag_max_size) return ip_fragment(net, sk, skb, mtu, ip_finish_output2); return ip_finish_output2(net, sk, skb); diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index ee65c9225178..64594aa755f0 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -759,8 +759,11 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, goto tx_error; } - if (tnl_update_pmtu(dev, skb, rt, tnl_params->frag_off, inner_iph, - 0, 0, false)) { + df = tnl_params->frag_off; + if (skb->protocol == htons(ETH_P_IP) && !tunnel->ignore_df) + df |= (inner_iph->frag_off & htons(IP_DF)); + + if (tnl_update_pmtu(dev, skb, rt, df, inner_iph, 0, 0, false)) { ip_rt_put(rt); goto tx_error; } @@ -788,10 +791,6 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, ttl = ip4_dst_hoplimit(&rt->dst); } - df = tnl_params->frag_off; - if (skb->protocol == htons(ETH_P_IP) && !tunnel->ignore_df) - df |= (inner_iph->frag_off&htons(IP_DF)); - max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr) + rt->dst.header_len + ip_encap_hlen(&tunnel->encap); if (max_headroom > dev->needed_headroom) diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index 563b62b76a5f..c576a63d09db 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -1379,7 +1379,7 @@ static int compat_get_entries(struct net *net, xt_compat_lock(NFPROTO_ARP); t = xt_find_table_lock(net, NFPROTO_ARP, get.name); if (!IS_ERR(t)) { - const struct xt_table_info *private = t->private; + const struct xt_table_info *private = xt_table_get_private_protected(t); struct xt_table_info info; ret = compat_table_info(private, &info); diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index 6e2851f8d3a3..e8f6f9d86237 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -1589,7 +1589,7 @@ compat_get_entries(struct net *net, struct compat_ipt_get_entries __user *uptr, xt_compat_lock(AF_INET); t = xt_find_table_lock(net, AF_INET, get.name); if (!IS_ERR(t)) { - const struct xt_table_info 
*private = t->private; + const struct xt_table_info *private = xt_table_get_private_protected(t); struct xt_table_info info; ret = compat_table_info(private, &info); if (!ret && get.size == info.size) diff --git a/net/ipv4/netfilter/ipt_rpfilter.c b/net/ipv4/netfilter/ipt_rpfilter.c index cc23f1ce239c..8cd3224d913e 100644 --- a/net/ipv4/netfilter/ipt_rpfilter.c +++ b/net/ipv4/netfilter/ipt_rpfilter.c @@ -76,7 +76,7 @@ static bool rpfilter_mt(const struct sk_buff *skb, struct xt_action_param *par) flow.daddr = iph->saddr; flow.saddr = rpfilter_get_saddr(iph->daddr); flow.flowi4_mark = info->flags & XT_RPFILTER_VALID_MARK ? skb->mark : 0; - flow.flowi4_tos = RT_TOS(iph->tos); + flow.flowi4_tos = iph->tos & IPTOS_RT_MASK; flow.flowi4_scope = RT_SCOPE_UNIVERSE; flow.flowi4_oif = l3mdev_master_ifindex_rcu(xt_in(par)); diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c index 5e1b22d4f939..e53e43aef785 100644 --- a/net/ipv4/nexthop.c +++ b/net/ipv4/nexthop.c @@ -627,7 +627,7 @@ static int nh_check_attr_group(struct net *net, struct nlattr *tb[], for (i = NHA_GROUP_TYPE + 1; i < __NHA_MAX; ++i) { if (!tb[i]) continue; - if (tb[NHA_FDB]) + if (i == NHA_FDB) continue; NL_SET_ERR_MSG(extack, "No other attributes can be set in nexthop groups"); @@ -1459,8 +1459,10 @@ static struct nexthop *nexthop_create_group(struct net *net, return nh; out_no_nh: - for (; i >= 0; --i) + for (i--; i >= 0; --i) { + list_del(&nhg->nh_entries[i].nh_list); nexthop_put(nhg->nh_entries[i].nh); + } kfree(nhg->spare); kfree(nhg); diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index ed42d2193c5c..32545ecf2ab1 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2937,6 +2937,7 @@ int tcp_disconnect(struct sock *sk, int flags) icsk->icsk_backoff = 0; icsk->icsk_probes_out = 0; + icsk->icsk_probes_tstamp = 0; icsk->icsk_rto = TCP_TIMEOUT_INIT; icsk->icsk_rto_min = TCP_RTO_MIN; icsk->icsk_delack_max = TCP_DELACK_MAX; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index c7e16b0ed791..a7dfca0a38cd 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3384,6 +3384,7 @@ static void tcp_ack_probe(struct sock *sk) return; if (!after(TCP_SKB_CB(head)->end_seq, tcp_wnd_end(tp))) { icsk->icsk_backoff = 0; + icsk->icsk_probes_tstamp = 0; inet_csk_clear_xmit_timer(sk, ICSK_TIME_PROBE0); /* Socket must be waked up by subsequent tcp_data_snd_check(). * This function is not for random using! @@ -4396,10 +4397,9 @@ static void tcp_rcv_spurious_retrans(struct sock *sk, const struct sk_buff *skb) * The receiver remembers and reflects via DSACKs. Leverage the * DSACK state and change the txhash to re-route speculatively. 
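 *
 * Note: sk_rethink_txhash() now returns whether the hash was in fact
 * changed, so LINUX_MIB_TCPDUPLICATEDATAREHASH below is only bumped
 * when a re-route can actually happen.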
*/ - if (TCP_SKB_CB(skb)->seq == tcp_sk(sk)->duplicate_sack[0].start_seq) { - sk_rethink_txhash(sk); + if (TCP_SKB_CB(skb)->seq == tcp_sk(sk)->duplicate_sack[0].start_seq && + sk_rethink_txhash(sk)) NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDUPLICATEDATAREHASH); - } } static void tcp_send_dupack(struct sock *sk, const struct sk_buff *skb) diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 58207c7769d0..777306b5bc22 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1595,6 +1595,8 @@ struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, tcp_move_syn(newtp, req); ireq->ireq_opt = NULL; } else { + newinet->inet_opt = NULL; + if (!req_unhash && found_dup_sk) { /* This code path should only be executed in the * syncookie case only @@ -1602,8 +1604,6 @@ struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, bh_unlock_sock(newsk); sock_put(newsk); newsk = NULL; - } else { - newinet->inet_opt = NULL; } } return newsk; @@ -1760,6 +1760,7 @@ int tcp_v4_early_demux(struct sk_buff *skb) bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb) { u32 limit = READ_ONCE(sk->sk_rcvbuf) + READ_ONCE(sk->sk_sndbuf); + u32 tail_gso_size, tail_gso_segs; struct skb_shared_info *shinfo; const struct tcphdr *th; struct tcphdr *thtail; @@ -1767,6 +1768,7 @@ bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb) unsigned int hdrlen; bool fragstolen; u32 gso_segs; + u32 gso_size; int delta; /* In case all data was pulled from skb frags (in __pskb_pull_tail()), @@ -1792,13 +1794,6 @@ bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb) */ th = (const struct tcphdr *)skb->data; hdrlen = th->doff * 4; - shinfo = skb_shinfo(skb); - - if (!shinfo->gso_size) - shinfo->gso_size = skb->len - hdrlen; - - if (!shinfo->gso_segs) - shinfo->gso_segs = 1; tail = sk->sk_backlog.tail; if (!tail) @@ -1821,6 +1816,15 @@ bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb) goto no_coalesce; __skb_pull(skb, hdrlen); + + shinfo = skb_shinfo(skb); + gso_size = shinfo->gso_size ?: skb->len; + gso_segs = shinfo->gso_segs ?: 1; + + shinfo = skb_shinfo(tail); + tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen); + tail_gso_segs = shinfo->gso_segs ?: 1; + if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) { TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq; @@ -1847,11 +1851,8 @@ bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb) } /* Not as strict as GRO. We only need to carry mss max value */ - skb_shinfo(tail)->gso_size = max(shinfo->gso_size, - skb_shinfo(tail)->gso_size); - - gso_segs = skb_shinfo(tail)->gso_segs + shinfo->gso_segs; - skb_shinfo(tail)->gso_segs = min_t(u32, gso_segs, 0xFFFF); + shinfo->gso_size = max(gso_size, tail_gso_size); + shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF); sk->sk_backlog.len += delta; __NET_INC_STATS(sock_net(sk), diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index f322e798a351..ab458697881e 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -4084,6 +4084,7 @@ void tcp_send_probe0(struct sock *sk) /* Cancel probe timer, if it is not required. 
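 *
 * icsk_probes_tstamp is cleared together with icsk_probes_out and
 * icsk_backoff here, matching tcp_ack_probe() and tcp_disconnect()
 * earlier, so that a later zero-window probe run restarts its
 * user-timeout accounting from scratch (see tcp_probe_timer()).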
*/ icsk->icsk_probes_out = 0; icsk->icsk_backoff = 0; + icsk->icsk_probes_tstamp = 0; return; } diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 6c62b9ea1320..faa92948441b 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -219,14 +219,8 @@ static int tcp_write_timeout(struct sock *sk) int retry_until; if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) { - if (icsk->icsk_retransmits) { - dst_negative_advice(sk); - } else { - sk_rethink_txhash(sk); - tp->timeout_rehash++; - __NET_INC_STATS(sock_net(sk), - LINUX_MIB_TCPTIMEOUTREHASH); - } + if (icsk->icsk_retransmits) + __dst_negative_advice(sk); retry_until = icsk->icsk_syn_retries ? : net->ipv4.sysctl_tcp_syn_retries; expired = icsk->icsk_retransmits >= retry_until; } else { @@ -234,12 +228,7 @@ static int tcp_write_timeout(struct sock *sk) /* Black hole detection */ tcp_mtu_probing(icsk, sk); - dst_negative_advice(sk); - } else { - sk_rethink_txhash(sk); - tp->timeout_rehash++; - __NET_INC_STATS(sock_net(sk), - LINUX_MIB_TCPTIMEOUTREHASH); + __dst_negative_advice(sk); } retry_until = net->ipv4.sysctl_tcp_retries2; @@ -270,6 +259,11 @@ static int tcp_write_timeout(struct sock *sk) return 1; } + if (sk_rethink_txhash(sk)) { + tp->timeout_rehash++; + __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPTIMEOUTREHASH); + } + return 0; } @@ -349,6 +343,7 @@ static void tcp_probe_timer(struct sock *sk) if (tp->packets_out || !skb) { icsk->icsk_probes_out = 0; + icsk->icsk_probes_tstamp = 0; return; } @@ -360,13 +355,12 @@ static void tcp_probe_timer(struct sock *sk) * corresponding system limit. We also implement similar policy when * we use RTO to probe window in tcp_retransmit_timer(). */ - if (icsk->icsk_user_timeout) { - u32 elapsed = tcp_model_timeout(sk, icsk->icsk_probes_out, - tcp_probe0_base(sk)); - - if (elapsed >= icsk->icsk_user_timeout) - goto abort; - } + if (!icsk->icsk_probes_tstamp) + icsk->icsk_probes_tstamp = tcp_jiffies32; + else if (icsk->icsk_user_timeout && + (s32)(tcp_jiffies32 - icsk->icsk_probes_tstamp) >= + msecs_to_jiffies(icsk->icsk_user_timeout)) + goto abort; max_probes = sock_net(sk)->ipv4.sysctl_tcp_retries2; if (sock_flag(sk, SOCK_DEAD)) { diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 7103b0a89756..69ea76578abb 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -2555,7 +2555,8 @@ int udp_v4_early_demux(struct sk_buff *skb) */ if (!inet_sk(sk)->inet_daddr && in_dev) return ip_mc_validate_source(skb, iph->daddr, - iph->saddr, iph->tos, + iph->saddr, + iph->tos & IPTOS_RT_MASK, skb->dev, in_dev, &itag); } return 0; diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index eff2cacd5209..9edc5bb2d531 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -2467,8 +2467,9 @@ static void addrconf_add_mroute(struct net_device *dev) .fc_ifindex = dev->ifindex, .fc_dst_len = 8, .fc_flags = RTF_UP, - .fc_type = RTN_UNICAST, + .fc_type = RTN_MULTICAST, .fc_nlinfo.nl_net = dev_net(dev), + .fc_protocol = RTPROT_KERNEL, }; ipv6_addr_set(&cfg.fc_dst, htonl(0xFF000000), 0, 0, 0); diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c index 52c2f063529f..2b804fcebcc6 100644 --- a/net/ipv6/esp6.c +++ b/net/ipv6/esp6.c @@ -478,7 +478,6 @@ static int esp6_output_encap(struct xfrm_state *x, struct sk_buff *skb, int esp6_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info *esp) { u8 *tail; - u8 *vaddr; int nfrags; int esph_offset; struct page *page; @@ -519,14 +518,10 @@ int esp6_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info page = pfrag->page; 
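/* Annotation, not part of the patch: the kmap_atomic()/kunmap_atomic() pair removed in the hunk below is unnecessary because these pages come from the socket page-frag allocator, which does not allocate highmem; lowmem pages always have a kernel virtual address, so page_address() suffices. A highmem page would still need kmap. */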
get_page(page); - vaddr = kmap_atomic(page); - - tail = vaddr + pfrag->offset; + tail = page_address(page) + pfrag->offset; esp_output_fill_trailer(tail, esp->tfclen, esp->plen, esp->proto); - kunmap_atomic(vaddr); - nfrags = skb_shinfo(skb)->nr_frags; __skb_fill_page_desc(skb, nfrags, page, pfrag->offset, diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 605cdd38a919..f43e27555725 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -1025,6 +1025,8 @@ static void fib6_purge_rt(struct fib6_info *rt, struct fib6_node *fn, { struct fib6_table *table = rt->fib6_table; + /* Flush all cached dst in exception table */ + rt6_flush_exceptions(rt); fib6_drop_pcpu_from(rt, table); if (rt->nh && !list_empty(&rt->nh_list)) @@ -1927,9 +1929,6 @@ static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn, net->ipv6.rt6_stats->fib_rt_entries--; net->ipv6.rt6_stats->fib_discarded_routes++; - /* Flush all cached dst in exception table */ - rt6_flush_exceptions(rt); - /* Reset round-robin state, if necessary */ if (rcu_access_pointer(fn->rr_ptr) == rt) fn->rr_ptr = NULL; diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 749ad72386b2..077d43af8226 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -125,8 +125,43 @@ static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff * return -EINVAL; } +static int +ip6_finish_output_gso_slowpath_drop(struct net *net, struct sock *sk, + struct sk_buff *skb, unsigned int mtu) +{ + struct sk_buff *segs, *nskb; + netdev_features_t features; + int ret = 0; + + /* Please see corresponding comment in ip_finish_output_gso + * describing the cases where GSO segment length exceeds the + * egress MTU. + */ + features = netif_skb_features(skb); + segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK); + if (IS_ERR_OR_NULL(segs)) { + kfree_skb(skb); + return -ENOMEM; + } + + consume_skb(skb); + + skb_list_walk_safe(segs, segs, nskb) { + int err; + + skb_mark_not_on_list(segs); + err = ip6_fragment(net, sk, segs, ip6_finish_output2); + if (err && ret == 0) + ret = err; + } + + return ret; +} + static int __ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb) { + unsigned int mtu; + #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM) /* Policy lookup after SNAT yielded a new policy */ if (skb_dst(skb)->xfrm) { @@ -135,7 +170,11 @@ static int __ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff } #endif - if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) || + mtu = ip6_skb_dst_mtu(skb); + if (skb_is_gso(skb) && !skb_gso_validate_network_len(skb, mtu)) + return ip6_finish_output_gso_slowpath_drop(net, sk, skb, mtu); + + if ((skb->len > mtu && !skb_is_gso(skb)) || dst_allfrag(skb_dst(skb)) || (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size)) return ip6_fragment(net, sk, skb, ip6_finish_output2); diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index c4f532f4d311..0d453fa9e327 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -1598,7 +1598,7 @@ compat_get_entries(struct net *net, struct compat_ip6t_get_entries __user *uptr, xt_compat_lock(AF_INET6); t = xt_find_table_lock(net, AF_INET6, get.name); if (!IS_ERR(t)) { - const struct xt_table_info *private = t->private; + const struct xt_table_info *private = xt_table_get_private_protected(t); struct xt_table_info info; ret = compat_table_info(private, &info); if (!ret && get.size == info.size) diff --git a/net/ipv6/sit.c 
b/net/ipv6/sit.c index 2da0ee703779..93636867aee2 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -1645,8 +1645,11 @@ static int ipip6_newlink(struct net *src_net, struct net_device *dev, } #ifdef CONFIG_IPV6_SIT_6RD - if (ipip6_netlink_6rd_parms(data, &ip6rd)) + if (ipip6_netlink_6rd_parms(data, &ip6rd)) { err = ipip6_tunnel_update_6rd(nt, &ip6rd); + if (err < 0) + unregister_netdevice_queue(dev, NULL); + } #endif return err; diff --git a/net/lapb/lapb_iface.c b/net/lapb/lapb_iface.c index 213ea7abc9ab..40961889e9c0 100644 --- a/net/lapb/lapb_iface.c +++ b/net/lapb/lapb_iface.c @@ -489,6 +489,7 @@ static int lapb_device_event(struct notifier_block *this, unsigned long event, break; } + lapb_put(lapb); return NOTIFY_DONE; } diff --git a/net/mac80211/debugfs.c b/net/mac80211/debugfs.c index 48f144f107d5..9e723d943421 100644 --- a/net/mac80211/debugfs.c +++ b/net/mac80211/debugfs.c @@ -120,18 +120,17 @@ static ssize_t aqm_write(struct file *file, { struct ieee80211_local *local = file->private_data; char buf[100]; - size_t len; - if (count > sizeof(buf)) + if (count >= sizeof(buf)) return -EINVAL; if (copy_from_user(buf, user_buf, count)) return -EFAULT; - buf[sizeof(buf) - 1] = '\0'; - len = strlen(buf); - if (len > 0 && buf[len-1] == '\n') - buf[len-1] = 0; + if (count && buf[count - 1] == '\n') + buf[count - 1] = '\0'; + else + buf[count] = '\0'; if (sscanf(buf, "fq_limit %u", &local->fq.limit) == 1) return count; @@ -177,18 +176,17 @@ static ssize_t airtime_flags_write(struct file *file, { struct ieee80211_local *local = file->private_data; char buf[16]; - size_t len; - if (count > sizeof(buf)) + if (count >= sizeof(buf)) return -EINVAL; if (copy_from_user(buf, user_buf, count)) return -EFAULT; - buf[sizeof(buf) - 1] = 0; - len = strlen(buf); - if (len > 0 && buf[len - 1] == '\n') - buf[len - 1] = 0; + if (count && buf[count - 1] == '\n') + buf[count - 1] = '\0'; + else + buf[count] = '\0'; if (kstrtou16(buf, 0, &local->airtime_flags)) return -EINVAL; @@ -237,20 +235,19 @@ static ssize_t aql_txq_limit_write(struct file *file, { struct ieee80211_local *local = file->private_data; char buf[100]; - size_t len; u32 ac, q_limit_low, q_limit_high, q_limit_low_old, q_limit_high_old; struct sta_info *sta; - if (count > sizeof(buf)) + if (count >= sizeof(buf)) return -EINVAL; if (copy_from_user(buf, user_buf, count)) return -EFAULT; - buf[sizeof(buf) - 1] = 0; - len = strlen(buf); - if (len > 0 && buf[len - 1] == '\n') - buf[len - 1] = 0; + if (count && buf[count - 1] == '\n') + buf[count - 1] = '\0'; + else + buf[count] = '\0'; if (sscanf(buf, "%u %u %u", &ac, &q_limit_low, &q_limit_high) != 3) return -EINVAL; @@ -306,18 +303,17 @@ static ssize_t force_tx_status_write(struct file *file, { struct ieee80211_local *local = file->private_data; char buf[3]; - size_t len; - if (count > sizeof(buf)) + if (count >= sizeof(buf)) return -EINVAL; if (copy_from_user(buf, user_buf, count)) return -EFAULT; - buf[sizeof(buf) - 1] = '\0'; - len = strlen(buf); - if (len > 0 && buf[len - 1] == '\n') - buf[len - 1] = 0; + if (count && buf[count - 1] == '\n') + buf[count - 1] = '\0'; + else + buf[count] = '\0'; if (buf[0] == '0' && buf[1] == '\0') local->force_tx_status = 0; diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 13b9bcc4865d..972895e9f22d 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -4176,6 +4176,8 @@ void ieee80211_check_fast_rx(struct sta_info *sta) rcu_read_lock(); key = rcu_dereference(sta->ptk[sta->ptk_idx]); + if (!key) + key = rcu_dereference(sdata->default_unicast_key); 
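/* Annotation, not part of the patch: sta->ptk[] is empty when the link is protected by a non-pairwise key (e.g. WEP), in which case unicast traffic uses the interface's default key. Without this fallback, fast-rx would treat such links as unencrypted. The tx.c hunk further down adds the same fallback to the 802.3 TX offload path. */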
if (key) { switch (key->conf.cipher) { case WLAN_CIPHER_SUITE_TKIP: diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 6422da6690f7..ebb3228ce971 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -649,7 +649,7 @@ ieee80211_tx_h_select_key(struct ieee80211_tx_data *tx) if (!skip_hw && tx->key && tx->key->flags & KEY_FLAG_UPLOADED_TO_HARDWARE) info->control.hw_key = &tx->key->conf; - } else if (!ieee80211_is_mgmt(hdr->frame_control) && tx->sta && + } else if (ieee80211_is_data_present(hdr->frame_control) && tx->sta && test_sta_flag(tx->sta, WLAN_STA_USES_ENCRYPTION)) { return TX_DROP; } @@ -3809,7 +3809,7 @@ void __ieee80211_schedule_txq(struct ieee80211_hw *hw, * get immediately moved to the back of the list on the next * call to ieee80211_next_txq(). */ - if (txqi->txq.sta && + if (txqi->txq.sta && local->airtime_flags && wiphy_ext_feature_isset(local->hw.wiphy, NL80211_EXT_FEATURE_AIRTIME_FAIRNESS)) list_add(&txqi->schedule_order, @@ -4251,7 +4251,6 @@ netdev_tx_t ieee80211_subif_start_xmit_8023(struct sk_buff *skb, struct ethhdr *ehdr = (struct ethhdr *)skb->data; struct ieee80211_key *key; struct sta_info *sta; - bool offload = true; if (unlikely(skb->len < ETH_HLEN)) { kfree_skb(skb); @@ -4267,18 +4266,22 @@ netdev_tx_t ieee80211_subif_start_xmit_8023(struct sk_buff *skb, if (unlikely(IS_ERR_OR_NULL(sta) || !sta->uploaded || !test_sta_flag(sta, WLAN_STA_AUTHORIZED) || - sdata->control_port_protocol == ehdr->h_proto)) - offload = false; - else if ((key = rcu_dereference(sta->ptk[sta->ptk_idx])) && - (!(key->flags & KEY_FLAG_UPLOADED_TO_HARDWARE) || - key->conf.cipher == WLAN_CIPHER_SUITE_TKIP)) - offload = false; - - if (offload) - ieee80211_8023_xmit(sdata, dev, sta, key, skb); - else - ieee80211_subif_start_xmit(skb, dev); + sdata->control_port_protocol == ehdr->h_proto)) + goto skip_offload; + + key = rcu_dereference(sta->ptk[sta->ptk_idx]); + if (!key) + key = rcu_dereference(sdata->default_unicast_key); + + if (key && (!(key->flags & KEY_FLAG_UPLOADED_TO_HARDWARE) || + key->conf.cipher == WLAN_CIPHER_SUITE_TKIP)) + goto skip_offload; + + ieee80211_8023_xmit(sdata, dev, sta, key, skb); + goto out; +skip_offload: + ieee80211_subif_start_xmit(skb, dev); out: rcu_read_unlock(); diff --git a/net/mptcp/options.c b/net/mptcp/options.c index c5328f407aab..e0d21c0607e5 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -606,6 +606,8 @@ static bool mptcp_established_options_add_addr(struct sock *sk, struct sk_buff * skb && skb_is_tcp_pure_ack(skb)) { pr_debug("drop other suboptions"); opts->suboptions = 0; + opts->ext_copy.use_ack = 0; + opts->ext_copy.use_map = 0; remaining += opt_size; drop_other_suboptions = true; } @@ -873,10 +875,13 @@ static void ack_update_msk(struct mptcp_sock *msk, new_wnd_end = new_snd_una + tcp_sk(ssk)->snd_wnd; - if (after64(new_wnd_end, msk->wnd_end)) { + if (after64(new_wnd_end, msk->wnd_end)) msk->wnd_end = new_wnd_end; - __mptcp_wnd_updated(sk, ssk); - } + + /* this assumes mptcp_incoming_options() is invoked after tcp_ack() */ + if (after64(msk->wnd_end, READ_ONCE(msk->snd_nxt)) && + sk_stream_memory_free(ssk)) + __mptcp_check_push(sk, ssk); if (after64(new_snd_una, old_snd_una)) { msk->snd_una = new_snd_una; @@ -942,8 +947,8 @@ void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb) * helpers are cheap. 
*/ mptcp_data_lock(subflow->conn); - if (mptcp_send_head(subflow->conn)) - __mptcp_wnd_updated(subflow->conn, sk); + if (sk_stream_memory_free(sk)) + __mptcp_check_push(subflow->conn, sk); __mptcp_data_acked(subflow->conn); mptcp_data_unlock(subflow->conn); return; diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index b812aaae8044..f998a077c7dd 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -427,7 +427,7 @@ static bool mptcp_subflow_active(struct mptcp_subflow_context *subflow) static bool tcp_can_send_ack(const struct sock *ssk) { return !((1 << inet_sk_state_load(ssk)) & - (TCPF_SYN_SENT | TCPF_SYN_RECV | TCPF_TIME_WAIT | TCPF_CLOSE)); + (TCPF_SYN_SENT | TCPF_SYN_RECV | TCPF_TIME_WAIT | TCPF_CLOSE | TCPF_LISTEN)); } static void mptcp_send_ack(struct mptcp_sock *msk) @@ -877,6 +877,9 @@ static void __mptcp_wmem_reserve(struct sock *sk, int size) struct mptcp_sock *msk = mptcp_sk(sk); WARN_ON_ONCE(msk->wmem_reserved); + if (WARN_ON_ONCE(amount < 0)) + amount = 0; + if (amount <= sk->sk_forward_alloc) goto reserve; @@ -1587,7 +1590,7 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) if (msg->msg_flags & ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL)) return -EOPNOTSUPP; - mptcp_lock_sock(sk, __mptcp_wmem_reserve(sk, len)); + mptcp_lock_sock(sk, __mptcp_wmem_reserve(sk, min_t(size_t, 1 << 20, len))); timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); @@ -1658,6 +1661,7 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) frag_truesize += psize; pfrag->offset += frag_truesize; WRITE_ONCE(msk->write_seq, msk->write_seq + psize); + msk->tx_pending_data += psize; /* charge data on mptcp pending queue to the msk socket * Note: we charge such data both to sk and ssk @@ -1683,10 +1687,8 @@ wait_for_memory: goto out; } - if (copied) { - msk->tx_pending_data += copied; + if (copied) mptcp_push_pending(sk, msg->msg_flags); - } out: release_sock(sk); @@ -2119,7 +2121,7 @@ void __mptcp_close_ssk(struct sock *sk, struct sock *ssk, list_del(&subflow->node); - lock_sock(ssk); + lock_sock_nested(ssk, SINGLE_DEPTH_NESTING); /* if we are invoked by the msk cleanup code, the subflow is * already orphaned @@ -2640,11 +2642,17 @@ static void mptcp_copy_inaddrs(struct sock *msk, const struct sock *ssk) static int mptcp_disconnect(struct sock *sk, int flags) { - /* Should never be called. - * inet_stream_connect() calls ->disconnect, but that - * refers to the subflow socket, not the mptcp one. 
- */ - WARN_ON_ONCE(1); + struct mptcp_subflow_context *subflow; + struct mptcp_sock *msk = mptcp_sk(sk); + + __mptcp_flush_join_list(msk); + mptcp_for_each_subflow(msk, subflow) { + struct sock *ssk = mptcp_subflow_tcp_sock(subflow); + + lock_sock(ssk); + tcp_disconnect(ssk, flags); + release_sock(ssk); + } return 0; } @@ -2699,6 +2707,8 @@ struct sock *mptcp_sk_clone(const struct sock *sk, sock_reset_flag(nsk, SOCK_RCU_FREE); /* will be fully established after successful MPC subflow creation */ inet_sk_state_store(nsk, TCP_SYN_RECV); + + security_inet_csk_clone(nsk, req); bh_unlock_sock(nsk); /* keep a single reference */ @@ -2913,7 +2923,7 @@ void __mptcp_data_acked(struct sock *sk) mptcp_schedule_work(sk); } -void __mptcp_wnd_updated(struct sock *sk, struct sock *ssk) +void __mptcp_check_push(struct sock *sk, struct sock *ssk) { if (!mptcp_send_head(sk)) return; @@ -3085,6 +3095,14 @@ bool mptcp_finish_join(struct sock *ssk) return true; } +static void mptcp_shutdown(struct sock *sk, int how) +{ + pr_debug("sk=%p, how=%d", sk, how); + + if ((how & SEND_SHUTDOWN) && mptcp_close_state(sk)) + __mptcp_wr_shutdown(sk); +} + static struct proto mptcp_prot = { .name = "MPTCP", .owner = THIS_MODULE, @@ -3094,7 +3112,7 @@ static struct proto mptcp_prot = { .accept = mptcp_accept, .setsockopt = mptcp_setsockopt, .getsockopt = mptcp_getsockopt, - .shutdown = tcp_shutdown, + .shutdown = mptcp_shutdown, .destroy = mptcp_destroy, .sendmsg = mptcp_sendmsg, .recvmsg = mptcp_recvmsg, @@ -3340,43 +3358,6 @@ static __poll_t mptcp_poll(struct file *file, struct socket *sock, return mask; } -static int mptcp_shutdown(struct socket *sock, int how) -{ - struct mptcp_sock *msk = mptcp_sk(sock->sk); - struct sock *sk = sock->sk; - int ret = 0; - - pr_debug("sk=%p, how=%d", msk, how); - - lock_sock(sk); - - how++; - if ((how & ~SHUTDOWN_MASK) || !how) { - ret = -EINVAL; - goto out_unlock; - } - - if (sock->state == SS_CONNECTING) { - if ((1 << sk->sk_state) & - (TCPF_SYN_SENT | TCPF_SYN_RECV | TCPF_CLOSE)) - sock->state = SS_DISCONNECTING; - else - sock->state = SS_CONNECTED; - } - - sk->sk_shutdown |= how; - if ((how & SEND_SHUTDOWN) && mptcp_close_state(sk)) - __mptcp_wr_shutdown(sk); - - /* Wake up anyone sleeping in poll. 
*/ - sk->sk_state_change(sk); - -out_unlock: - release_sock(sk); - - return ret; -} - static const struct proto_ops mptcp_stream_ops = { .family = PF_INET, .owner = THIS_MODULE, @@ -3390,7 +3371,7 @@ static const struct proto_ops mptcp_stream_ops = { .ioctl = inet_ioctl, .gettstamp = sock_gettstamp, .listen = mptcp_listen, - .shutdown = mptcp_shutdown, + .shutdown = inet_shutdown, .setsockopt = sock_common_setsockopt, .getsockopt = sock_common_getsockopt, .sendmsg = inet_sendmsg, @@ -3440,7 +3421,7 @@ static const struct proto_ops mptcp_v6_stream_ops = { .ioctl = inet6_ioctl, .gettstamp = sock_gettstamp, .listen = mptcp_listen, - .shutdown = mptcp_shutdown, + .shutdown = inet_shutdown, .setsockopt = sock_common_setsockopt, .getsockopt = sock_common_getsockopt, .sendmsg = inet6_sendmsg, diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 7cf9d110b85f..d67de793d363 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -503,7 +503,7 @@ void mptcp_rcv_space_init(struct mptcp_sock *msk, const struct sock *ssk); void mptcp_data_ready(struct sock *sk, struct sock *ssk); bool mptcp_finish_join(struct sock *sk); bool mptcp_schedule_work(struct sock *sk); -void __mptcp_wnd_updated(struct sock *sk, struct sock *ssk); +void __mptcp_check_push(struct sock *sk, struct sock *ssk); void __mptcp_data_acked(struct sock *sk); void mptcp_subflow_eof(struct sock *sk); bool mptcp_update_rcv_data_fin(struct mptcp_sock *msk, u64 data_fin_seq, bool use_64bit); diff --git a/net/ncsi/ncsi-rsp.c b/net/ncsi/ncsi-rsp.c index 5b1f4ec66dd9..888ccc2d4e34 100644 --- a/net/ncsi/ncsi-rsp.c +++ b/net/ncsi/ncsi-rsp.c @@ -1120,7 +1120,7 @@ int ncsi_rcv_rsp(struct sk_buff *skb, struct net_device *dev, int payload, i, ret; /* Find the NCSI device */ - nd = ncsi_find_dev(dev); + nd = ncsi_find_dev(orig_dev); ndp = nd ? 
TO_NCSI_DEV_PRIV(nd) : NULL; if (!ndp) return -ENODEV; diff --git a/net/netfilter/ipset/ip_set_hash_gen.h b/net/netfilter/ipset/ip_set_hash_gen.h index 5f1208ad049e..6186358eac7c 100644 --- a/net/netfilter/ipset/ip_set_hash_gen.h +++ b/net/netfilter/ipset/ip_set_hash_gen.h @@ -141,20 +141,6 @@ htable_size(u8 hbits) return hsize * sizeof(struct hbucket *) + sizeof(struct htable); } -/* Compute htable_bits from the user input parameter hashsize */ -static u8 -htable_bits(u32 hashsize) -{ - /* Assume that hashsize == 2^htable_bits */ - u8 bits = fls(hashsize - 1); - - if (jhash_size(bits) != hashsize) - /* Round up to the first 2^n value */ - bits = fls(hashsize); - - return bits; -} - #ifdef IP_SET_HASH_WITH_NETS #if IPSET_NET_COUNT > 1 #define __CIDR(cidr, i) (cidr[i]) @@ -640,7 +626,7 @@ mtype_resize(struct ip_set *set, bool retried) struct htype *h = set->data; struct htable *t, *orig; u8 htable_bits; - size_t dsize = set->dsize; + size_t hsize, dsize = set->dsize; #ifdef IP_SET_HASH_WITH_NETS u8 flags; struct mtype_elem *tmp; @@ -664,14 +650,12 @@ mtype_resize(struct ip_set *set, bool retried) retry: ret = 0; htable_bits++; - if (!htable_bits) { - /* In case we have plenty of memory :-) */ - pr_warn("Cannot increase the hashsize of set %s further\n", - set->name); - ret = -IPSET_ERR_HASH_FULL; - goto out; - } - t = ip_set_alloc(htable_size(htable_bits)); + if (!htable_bits) + goto hbwarn; + hsize = htable_size(htable_bits); + if (!hsize) + goto hbwarn; + t = ip_set_alloc(hsize); if (!t) { ret = -ENOMEM; goto out; @@ -813,6 +797,12 @@ cleanup: if (ret == -EAGAIN) goto retry; goto out; + +hbwarn: + /* In case we have plenty of memory :-) */ + pr_warn("Cannot increase the hashsize of set %s further\n", set->name); + ret = -IPSET_ERR_HASH_FULL; + goto out; } /* Get the current number of elements and ext_size in the set */ @@ -1521,7 +1511,11 @@ IPSET_TOKEN(HTYPE, _create)(struct net *net, struct ip_set *set, if (!h) return -ENOMEM; - hbits = htable_bits(hashsize); + /* Compute htable_bits from the user input parameter hashsize. + * Assume that hashsize == 2^htable_bits, + * otherwise round up to the first 2^n value. 
+ */ + hbits = fls(hashsize - 1); hsize = htable_size(hbits); if (hsize == 0) { kfree(h); diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index 46c5557c1fec..0ee702d374b0 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -523,6 +523,9 @@ nf_conntrack_hash_sysctl(struct ctl_table *table, int write, { int ret; + /* module_param hashsize could have changed value */ + nf_conntrack_htable_size_user = nf_conntrack_htable_size; + ret = proc_dointvec(table, write, buffer, lenp, ppos); if (ret < 0 || !write) return ret; diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c index ea923f8cf9c4..b7c3c902290f 100644 --- a/net/netfilter/nf_nat_core.c +++ b/net/netfilter/nf_nat_core.c @@ -1174,6 +1174,7 @@ static int __init nf_nat_init(void) ret = register_pernet_subsys(&nat_net_ops); if (ret < 0) { nf_ct_extend_unregister(&nat_extend); + kvfree(nf_nat_bysource); return ret; } diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 8d5aa0ac45f4..15c467f1a9dd 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -4162,7 +4162,7 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk, if (flags & ~(NFT_SET_ANONYMOUS | NFT_SET_CONSTANT | NFT_SET_INTERVAL | NFT_SET_TIMEOUT | NFT_SET_MAP | NFT_SET_EVAL | - NFT_SET_OBJECT | NFT_SET_CONCAT)) + NFT_SET_OBJECT | NFT_SET_CONCAT | NFT_SET_EXPR)) return -EOPNOTSUPP; /* Only one of these operations is supported */ if ((flags & (NFT_SET_MAP | NFT_SET_OBJECT)) == @@ -4304,6 +4304,10 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk, struct nlattr *tmp; int left; + if (!(flags & NFT_SET_EXPR)) { + err = -EINVAL; + goto err_set_alloc_name; + } i = 0; nla_for_each_nested(tmp, nla[NFTA_SET_EXPRESSIONS], left) { if (i == NFT_SET_EXPR_MAX) { @@ -5254,8 +5258,8 @@ static int nft_set_elem_expr_clone(const struct nft_ctx *ctx, return 0; err_expr: - for (k = i - 1; k >= 0; k++) - nft_expr_destroy(ctx, expr_array[i]); + for (k = i - 1; k >= 0; k--) + nft_expr_destroy(ctx, expr_array[k]); return -ENOMEM; } diff --git a/net/netfilter/nft_dynset.c b/net/netfilter/nft_dynset.c index 983a1d5ca3ab..0b053f75cd60 100644 --- a/net/netfilter/nft_dynset.c +++ b/net/netfilter/nft_dynset.c @@ -19,6 +19,7 @@ struct nft_dynset { enum nft_registers sreg_key:8; enum nft_registers sreg_data:8; bool invert; + bool expr; u8 num_exprs; u64 timeout; struct nft_expr *expr_array[NFT_SET_EXPR_MAX]; @@ -175,11 +176,12 @@ static int nft_dynset_init(const struct nft_ctx *ctx, if (tb[NFTA_DYNSET_FLAGS]) { u32 flags = ntohl(nla_get_be32(tb[NFTA_DYNSET_FLAGS])); - - if (flags & ~NFT_DYNSET_F_INV) - return -EINVAL; + if (flags & ~(NFT_DYNSET_F_INV | NFT_DYNSET_F_EXPR)) + return -EOPNOTSUPP; if (flags & NFT_DYNSET_F_INV) priv->invert = true; + if (flags & NFT_DYNSET_F_EXPR) + priv->expr = true; } set = nft_set_lookup_global(ctx->net, ctx->table, @@ -210,7 +212,7 @@ static int nft_dynset_init(const struct nft_ctx *ctx, timeout = 0; if (tb[NFTA_DYNSET_TIMEOUT] != NULL) { if (!(set->flags & NFT_SET_TIMEOUT)) - return -EINVAL; + return -EOPNOTSUPP; err = nf_msecs_to_jiffies64(tb[NFTA_DYNSET_TIMEOUT], &timeout); if (err) @@ -224,7 +226,7 @@ static int nft_dynset_init(const struct nft_ctx *ctx, if (tb[NFTA_DYNSET_SREG_DATA] != NULL) { if (!(set->flags & NFT_SET_MAP)) - return -EINVAL; + return -EOPNOTSUPP; if (set->dtype == NFT_DATA_VERDICT) return -EOPNOTSUPP; @@ -261,6 +263,9 @@ static int nft_dynset_init(const struct 
nft_ctx *ctx, struct nlattr *tmp; int left; + if (!priv->expr) + return -EINVAL; + i = 0; nla_for_each_nested(tmp, tb[NFTA_DYNSET_EXPRESSIONS], left) { if (i == NFT_SET_EXPR_MAX) { diff --git a/net/netfilter/xt_RATEEST.c b/net/netfilter/xt_RATEEST.c index 37253d399c6b..0d5c422f8745 100644 --- a/net/netfilter/xt_RATEEST.c +++ b/net/netfilter/xt_RATEEST.c @@ -115,6 +115,9 @@ static int xt_rateest_tg_checkentry(const struct xt_tgchk_param *par) } cfg; int ret; + if (strnlen(info->name, sizeof(est->name)) >= sizeof(est->name)) + return -ENAMETOOLONG; + net_get_random_once(&jhash_rnd, sizeof(jhash_rnd)); mutex_lock(&xn->hash_lock); diff --git a/net/nfc/nci/core.c b/net/nfc/nci/core.c index e64727e1a72f..02a1f13f0798 100644 --- a/net/nfc/nci/core.c +++ b/net/nfc/nci/core.c @@ -508,7 +508,7 @@ static int nci_open_device(struct nci_dev *ndev) }; unsigned long opt = 0; - if (!(ndev->nci_ver & NCI_VER_2_MASK)) + if (ndev->nci_ver & NCI_VER_2_MASK) opt = (unsigned long)&nci_init_v2_cmd; rc = __nci_request(ndev, nci_init_req, opt, diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index de8e8dbbdeb8..6bbc7a448593 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -4595,7 +4595,9 @@ static void packet_seq_stop(struct seq_file *seq, void *v) static int packet_seq_show(struct seq_file *seq, void *v) { if (v == SEQ_START_TOKEN) - seq_puts(seq, "sk RefCnt Type Proto Iface R Rmem User Inode\n"); + seq_printf(seq, + "%*sRefCnt Type Proto Iface R Rmem User Inode\n", + IS_ENABLED(CONFIG_64BIT) ? -17 : -9, "sk"); else { struct sock *s = sk_entry(v); const struct packet_sock *po = pkt_sk(s); diff --git a/net/qrtr/ns.c b/net/qrtr/ns.c index 56aaf8cb6527..8d00dfe8139e 100644 --- a/net/qrtr/ns.c +++ b/net/qrtr/ns.c @@ -755,7 +755,7 @@ static void qrtr_ns_data_ready(struct sock *sk) queue_work(qrtr_ns.workqueue, &qrtr_ns.work); } -void qrtr_ns_init(void) +int qrtr_ns_init(void) { struct sockaddr_qrtr sq; int ret; @@ -766,7 +766,7 @@ void qrtr_ns_init(void) ret = sock_create_kern(&init_net, AF_QIPCRTR, SOCK_DGRAM, PF_QIPCRTR, &qrtr_ns.sock); if (ret < 0) - return; + return ret; ret = kernel_getsockname(qrtr_ns.sock, (struct sockaddr *)&sq); if (ret < 0) { @@ -797,12 +797,13 @@ void qrtr_ns_init(void) if (ret < 0) goto err_wq; - return; + return 0; err_wq: destroy_workqueue(qrtr_ns.workqueue); err_sock: sock_release(qrtr_ns.sock); + return ret; } EXPORT_SYMBOL_GPL(qrtr_ns_init); diff --git a/net/qrtr/qrtr.c b/net/qrtr/qrtr.c index f4ab3ca6d73b..b34358282f37 100644 --- a/net/qrtr/qrtr.c +++ b/net/qrtr/qrtr.c @@ -1287,13 +1287,19 @@ static int __init qrtr_proto_init(void) return rc; rc = sock_register(&qrtr_family); - if (rc) { - proto_unregister(&qrtr_proto); - return rc; - } + if (rc) + goto err_proto; - qrtr_ns_init(); + rc = qrtr_ns_init(); + if (rc) + goto err_sock; + return 0; + +err_sock: + sock_unregister(qrtr_family.family); +err_proto: + proto_unregister(&qrtr_proto); return rc; } postcore_initcall(qrtr_proto_init); diff --git a/net/qrtr/qrtr.h b/net/qrtr/qrtr.h index dc2b67f17927..3f2d28696062 100644 --- a/net/qrtr/qrtr.h +++ b/net/qrtr/qrtr.h @@ -29,7 +29,7 @@ void qrtr_endpoint_unregister(struct qrtr_endpoint *ep); int qrtr_endpoint_post(struct qrtr_endpoint *ep, const void *data, size_t len); -void qrtr_ns_init(void); +int qrtr_ns_init(void); void qrtr_ns_remove(void); diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index 667c44aa5a63..dc201363f2c4 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -430,7 +430,7 @@ static void rxrpc_input_data(struct rxrpc_call *call, 
struct sk_buff *skb) return; } - if (call->state == RXRPC_CALL_SERVER_RECV_REQUEST) { + if (state == RXRPC_CALL_SERVER_RECV_REQUEST) { unsigned long timo = READ_ONCE(call->next_req_timo); unsigned long now, expect_req_by; diff --git a/net/rxrpc/key.c b/net/rxrpc/key.c index 9631aa8543b5..8d2073e0e3da 100644 --- a/net/rxrpc/key.c +++ b/net/rxrpc/key.c @@ -598,7 +598,7 @@ static long rxrpc_read(const struct key *key, default: /* we have a ticket we can't encode */ pr_err("Unsupported key token type (%u)\n", token->security_index); - continue; + return -ENOPKG; } _debug("token[%u]: toksize=%u", ntoks, toksize); @@ -674,7 +674,9 @@ static long rxrpc_read(const struct key *key, break; default: - break; + pr_err("Unsupported key token type (%u)\n", + token->security_index); + return -ENOPKG; } ASSERTCMP((unsigned long)xdr - (unsigned long)oldxdr, ==, diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index 1319986693fc..84f932532db7 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -1272,6 +1272,10 @@ static int fl_set_enc_opt(struct nlattr **tb, struct fl_flow_key *key, nla_opt_msk = nla_data(tb[TCA_FLOWER_KEY_ENC_OPTS_MASK]); msk_depth = nla_len(tb[TCA_FLOWER_KEY_ENC_OPTS_MASK]); + if (!nla_ok(nla_opt_msk, msk_depth)) { + NL_SET_ERR_MSG(extack, "Invalid nested attribute for masks"); + return -EINVAL; + } } nla_for_each_attr(nla_opt_key, nla_enc_key, @@ -1307,9 +1311,6 @@ static int fl_set_enc_opt(struct nlattr **tb, struct fl_flow_key *key, NL_SET_ERR_MSG(extack, "Key and mask miss aligned"); return -EINVAL; } - - if (msk_depth) - nla_opt_msk = nla_next(nla_opt_msk, &msk_depth); break; case TCA_FLOWER_KEY_ENC_OPTS_VXLAN: if (key->enc_opts.dst_opt_type) { @@ -1340,9 +1341,6 @@ static int fl_set_enc_opt(struct nlattr **tb, struct fl_flow_key *key, NL_SET_ERR_MSG(extack, "Key and mask miss aligned"); return -EINVAL; } - - if (msk_depth) - nla_opt_msk = nla_next(nla_opt_msk, &msk_depth); break; case TCA_FLOWER_KEY_ENC_OPTS_ERSPAN: if (key->enc_opts.dst_opt_type) { @@ -1373,14 +1371,20 @@ static int fl_set_enc_opt(struct nlattr **tb, struct fl_flow_key *key, NL_SET_ERR_MSG(extack, "Key and mask miss aligned"); return -EINVAL; } - - if (msk_depth) - nla_opt_msk = nla_next(nla_opt_msk, &msk_depth); break; default: NL_SET_ERR_MSG(extack, "Unknown tunnel option type"); return -EINVAL; } + + if (!msk_depth) + continue; + + if (!nla_ok(nla_opt_msk, msk_depth)) { + NL_SET_ERR_MSG(extack, "A mask attribute is invalid"); + return -EINVAL; + } + nla_opt_msk = nla_next(nla_opt_msk, &msk_depth); } return 0; diff --git a/net/sched/cls_tcindex.c b/net/sched/cls_tcindex.c index 78bec347b8b6..c4007b9cd16d 100644 --- a/net/sched/cls_tcindex.c +++ b/net/sched/cls_tcindex.c @@ -366,9 +366,13 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base, if (tb[TCA_TCINDEX_MASK]) cp->mask = nla_get_u16(tb[TCA_TCINDEX_MASK]); - if (tb[TCA_TCINDEX_SHIFT]) + if (tb[TCA_TCINDEX_SHIFT]) { cp->shift = nla_get_u32(tb[TCA_TCINDEX_SHIFT]); - + if (cp->shift > 16) { + err = -EINVAL; + goto errout; + } + } if (!cp->hash) { /* Hash not specified, use perfect hash if the upper limit * of the hashing index is below the threshold. 
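A pattern worth noting before the qdisc hunks that follow (sch_api, sch_choke, sch_gred, sch_red, sch_sfq): each one rejects log-scaled parameters before they are used as shift counts, because a cell_log or Scell_log of 32 or more makes a shift on a 32-bit value undefined behaviour. A minimal user-space sketch of the check, assuming the semantics implied by red_check_params(); red_params_ok and fls32 are illustrative names, not the kernel API:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* fls(): position of the highest set bit, 1-based; 0 when x == 0. */
static int fls32(uint32_t x)
{
	int r = 0;

	while (x) {
		x >>= 1;
		r++;
	}
	return r;
}

/* Wlog and Scell_log are exponents later used as shift counts on u32
 * values, so the scaled thresholds must fit in 32 bits and Scell_log
 * itself must stay below 32. */
static bool red_params_ok(uint32_t qth_min, uint32_t qth_max,
			  uint8_t Wlog, uint8_t Scell_log)
{
	if (fls32(qth_min) + Wlog > 32)
		return false;
	if (fls32(qth_max) + Wlog > 32)
		return false;
	if (Scell_log >= 32)		/* the new check in these hunks */
		return false;
	return qth_max >= qth_min;
}

int main(void)
{
	printf("%d\n", red_params_ok(5, 30, 9, 20));	/* 1: accepted */
	printf("%d\n", red_params_ok(5, 30, 9, 32));	/* 0: shift would be UB */
	return 0;
}

The same reasoning applies to qdisc_get_rtab() below: r->cell_log is used as a shift count when the rate table is consulted, so values of 32 or more are rejected up front.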
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index 51cb553e4317..6fe4e5cc807c 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -412,7 +412,8 @@ struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, { struct qdisc_rate_table *rtab; - if (tab == NULL || r->rate == 0 || r->cell_log == 0 || + if (tab == NULL || r->rate == 0 || + r->cell_log == 0 || r->cell_log >= 32 || nla_len(tab) != TC_RTAB_SIZE) { NL_SET_ERR_MSG(extack, "Invalid rate table parameters for searching"); return NULL; diff --git a/net/sched/sch_choke.c b/net/sched/sch_choke.c index bd618b00d319..50f680f03a54 100644 --- a/net/sched/sch_choke.c +++ b/net/sched/sch_choke.c @@ -362,7 +362,7 @@ static int choke_change(struct Qdisc *sch, struct nlattr *opt, ctl = nla_data(tb[TCA_CHOKE_PARMS]); - if (!red_check_params(ctl->qth_min, ctl->qth_max, ctl->Wlog)) + if (!red_check_params(ctl->qth_min, ctl->qth_max, ctl->Wlog, ctl->Scell_log)) return -EINVAL; if (ctl->limit > CHOKE_MAX_QUEUE) diff --git a/net/sched/sch_gred.c b/net/sched/sch_gred.c index 8599c6f31b05..e0bc77533acc 100644 --- a/net/sched/sch_gred.c +++ b/net/sched/sch_gred.c @@ -480,7 +480,7 @@ static inline int gred_change_vq(struct Qdisc *sch, int dp, struct gred_sched *table = qdisc_priv(sch); struct gred_sched_data *q = table->tab[dp]; - if (!red_check_params(ctl->qth_min, ctl->qth_max, ctl->Wlog)) { + if (!red_check_params(ctl->qth_min, ctl->qth_max, ctl->Wlog, ctl->Scell_log)) { NL_SET_ERR_MSG_MOD(extack, "invalid RED parameters"); return -EINVAL; } diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c index e89fab6ccb34..b4ae34d7aa96 100644 --- a/net/sched/sch_red.c +++ b/net/sched/sch_red.c @@ -250,7 +250,7 @@ static int __red_change(struct Qdisc *sch, struct nlattr **tb, max_P = tb[TCA_RED_MAX_P] ? 
nla_get_u32(tb[TCA_RED_MAX_P]) : 0; ctl = nla_data(tb[TCA_RED_PARMS]); - if (!red_check_params(ctl->qth_min, ctl->qth_max, ctl->Wlog)) + if (!red_check_params(ctl->qth_min, ctl->qth_max, ctl->Wlog, ctl->Scell_log)) return -EINVAL; err = red_get_flags(ctl->flags, TC_RED_HISTORIC_FLAGS, diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c index bca2be57d9fc..b25e51440623 100644 --- a/net/sched/sch_sfq.c +++ b/net/sched/sch_sfq.c @@ -647,7 +647,7 @@ static int sfq_change(struct Qdisc *sch, struct nlattr *opt) } if (ctl_v1 && !red_check_params(ctl_v1->qth_min, ctl_v1->qth_max, - ctl_v1->Wlog)) + ctl_v1->Wlog, ctl_v1->Scell_log)) return -EINVAL; if (ctl_v1 && ctl_v1->qth_min) { p = kmalloc(sizeof(*p), GFP_KERNEL); diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c index 26fb8a62996b..6f775275826a 100644 --- a/net/sched/sch_taprio.c +++ b/net/sched/sch_taprio.c @@ -1597,6 +1597,22 @@ free_sched: return err; } +static void taprio_reset(struct Qdisc *sch) +{ + struct taprio_sched *q = qdisc_priv(sch); + struct net_device *dev = qdisc_dev(sch); + int i; + + hrtimer_cancel(&q->advance_timer); + if (q->qdiscs) { + for (i = 0; i < dev->num_tx_queues; i++) + if (q->qdiscs[i]) + qdisc_reset(q->qdiscs[i]); + } + sch->qstats.backlog = 0; + sch->q.qlen = 0; +} + static void taprio_destroy(struct Qdisc *sch) { struct taprio_sched *q = qdisc_priv(sch); @@ -1607,12 +1623,11 @@ static void taprio_destroy(struct Qdisc *sch) list_del(&q->taprio_list); spin_unlock(&taprio_list_lock); - hrtimer_cancel(&q->advance_timer); taprio_disable_offload(dev, q, NULL); if (q->qdiscs) { - for (i = 0; i < dev->num_tx_queues && q->qdiscs[i]; i++) + for (i = 0; i < dev->num_tx_queues; i++) qdisc_put(q->qdiscs[i]); kfree(q->qdiscs); @@ -1954,6 +1969,7 @@ static struct Qdisc_ops taprio_qdisc_ops __read_mostly = { .init = taprio_init, .change = taprio_change, .destroy = taprio_destroy, + .reset = taprio_reset, .peek = taprio_peek, .dequeue = taprio_dequeue, .enqueue = taprio_enqueue, diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 59342b519e34..0df85a12651e 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -246,7 +246,8 @@ int smc_nl_get_sys_info(struct sk_buff *skb, struct netlink_callback *cb) goto errattr; smc_clc_get_hostname(&host); if (host) { - snprintf(hostname, sizeof(hostname), "%s", host); + memcpy(hostname, host, SMC_MAX_HOSTNAME_LEN); + hostname[SMC_MAX_HOSTNAME_LEN] = 0; if (nla_put_string(skb, SMC_NLA_SYS_LOCAL_HOST, hostname)) goto errattr; } @@ -257,7 +258,8 @@ int smc_nl_get_sys_info(struct sk_buff *skb, struct netlink_callback *cb) smc_ism_get_system_eid(smcd_dev, &seid); mutex_unlock(&smcd_dev_list.mutex); if (seid && smc_ism_is_v2_capable()) { - snprintf(smc_seid, sizeof(smc_seid), "%s", seid); + memcpy(smc_seid, seid, SMC_MAX_EID_LEN); + smc_seid[SMC_MAX_EID_LEN] = 0; if (nla_put_string(skb, SMC_NLA_SYS_SEID, smc_seid)) goto errattr; } @@ -295,7 +297,8 @@ static int smc_nl_fill_lgr(struct smc_link_group *lgr, goto errattr; if (nla_put_u8(skb, SMC_NLA_LGR_R_VLAN_ID, lgr->vlan_id)) goto errattr; - snprintf(smc_target, sizeof(smc_target), "%s", lgr->pnet_id); + memcpy(smc_target, lgr->pnet_id, SMC_MAX_PNETID_LEN); + smc_target[SMC_MAX_PNETID_LEN] = 0; if (nla_put_string(skb, SMC_NLA_LGR_R_PNETID, smc_target)) goto errattr; @@ -312,7 +315,7 @@ static int smc_nl_fill_lgr_link(struct smc_link_group *lgr, struct sk_buff *skb, struct netlink_callback *cb) { - char smc_ibname[IB_DEVICE_NAME_MAX + 1]; + char smc_ibname[IB_DEVICE_NAME_MAX]; u8 smc_gid_target[41]; struct nlattr *attrs; u32 
link_uid = 0; @@ -461,7 +464,8 @@ static int smc_nl_fill_smcd_lgr(struct smc_link_group *lgr, goto errattr; if (nla_put_u32(skb, SMC_NLA_LGR_D_CHID, smc_ism_get_chid(lgr->smcd))) goto errattr; - snprintf(smc_pnet, sizeof(smc_pnet), "%s", lgr->smcd->pnetid); + memcpy(smc_pnet, lgr->smcd->pnetid, SMC_MAX_PNETID_LEN); + smc_pnet[SMC_MAX_PNETID_LEN] = 0; if (nla_put_string(skb, SMC_NLA_LGR_D_PNETID, smc_pnet)) goto errattr; @@ -474,10 +478,12 @@ static int smc_nl_fill_smcd_lgr(struct smc_link_group *lgr, goto errv2attr; if (nla_put_u8(skb, SMC_NLA_LGR_V2_OS, lgr->peer_os)) goto errv2attr; - snprintf(smc_host, sizeof(smc_host), "%s", lgr->peer_hostname); + memcpy(smc_host, lgr->peer_hostname, SMC_MAX_HOSTNAME_LEN); + smc_host[SMC_MAX_HOSTNAME_LEN] = 0; if (nla_put_string(skb, SMC_NLA_LGR_V2_PEER_HOST, smc_host)) goto errv2attr; - snprintf(smc_eid, sizeof(smc_eid), "%s", lgr->negotiated_eid); + memcpy(smc_eid, lgr->negotiated_eid, SMC_MAX_EID_LEN); + smc_eid[SMC_MAX_EID_LEN] = 0; if (nla_put_string(skb, SMC_NLA_LGR_V2_NEG_EID, smc_eid)) goto errv2attr; diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c index 89ea10675a7d..7d7ba0320d5a 100644 --- a/net/smc/smc_ib.c +++ b/net/smc/smc_ib.c @@ -371,8 +371,8 @@ static int smc_nl_handle_dev_port(struct sk_buff *skb, if (nla_put_u8(skb, SMC_NLA_DEV_PORT_PNET_USR, smcibdev->pnetid_by_user[port])) goto errattr; - snprintf(smc_pnet, sizeof(smc_pnet), "%s", - (char *)&smcibdev->pnetid[port]); + memcpy(smc_pnet, &smcibdev->pnetid[port], SMC_MAX_PNETID_LEN); + smc_pnet[SMC_MAX_PNETID_LEN] = 0; if (nla_put_string(skb, SMC_NLA_DEV_PORT_PNETID, smc_pnet)) goto errattr; if (nla_put_u32(skb, SMC_NLA_DEV_PORT_NETDEV, @@ -394,11 +394,27 @@ errout: return -EMSGSIZE; } +static bool smc_nl_handle_pci_values(const struct smc_pci_dev *smc_pci_dev, + struct sk_buff *skb) +{ + if (nla_put_u32(skb, SMC_NLA_DEV_PCI_FID, smc_pci_dev->pci_fid)) + return false; + if (nla_put_u16(skb, SMC_NLA_DEV_PCI_CHID, smc_pci_dev->pci_pchid)) + return false; + if (nla_put_u16(skb, SMC_NLA_DEV_PCI_VENDOR, smc_pci_dev->pci_vendor)) + return false; + if (nla_put_u16(skb, SMC_NLA_DEV_PCI_DEVICE, smc_pci_dev->pci_device)) + return false; + if (nla_put_string(skb, SMC_NLA_DEV_PCI_ID, smc_pci_dev->pci_id)) + return false; + return true; +} + static int smc_nl_handle_smcr_dev(struct smc_ib_device *smcibdev, struct sk_buff *skb, struct netlink_callback *cb) { - char smc_ibname[IB_DEVICE_NAME_MAX + 1]; + char smc_ibname[IB_DEVICE_NAME_MAX]; struct smc_pci_dev smc_pci_dev; struct pci_dev *pci_dev; unsigned char is_crit; @@ -417,19 +433,13 @@ static int smc_nl_handle_smcr_dev(struct smc_ib_device *smcibdev, is_crit = smcr_diag_is_dev_critical(&smc_lgr_list, smcibdev); if (nla_put_u8(skb, SMC_NLA_DEV_IS_CRIT, is_crit)) goto errattr; - memset(&smc_pci_dev, 0, sizeof(smc_pci_dev)); - pci_dev = to_pci_dev(smcibdev->ibdev->dev.parent); - smc_set_pci_values(pci_dev, &smc_pci_dev); - if (nla_put_u32(skb, SMC_NLA_DEV_PCI_FID, smc_pci_dev.pci_fid)) - goto errattr; - if (nla_put_u16(skb, SMC_NLA_DEV_PCI_CHID, smc_pci_dev.pci_pchid)) - goto errattr; - if (nla_put_u16(skb, SMC_NLA_DEV_PCI_VENDOR, smc_pci_dev.pci_vendor)) - goto errattr; - if (nla_put_u16(skb, SMC_NLA_DEV_PCI_DEVICE, smc_pci_dev.pci_device)) - goto errattr; - if (nla_put_string(skb, SMC_NLA_DEV_PCI_ID, smc_pci_dev.pci_id)) - goto errattr; + if (smcibdev->ibdev->dev.parent) { + memset(&smc_pci_dev, 0, sizeof(smc_pci_dev)); + pci_dev = to_pci_dev(smcibdev->ibdev->dev.parent); + smc_set_pci_values(pci_dev, &smc_pci_dev); + if 
(!smc_nl_handle_pci_values(&smc_pci_dev, skb)) + goto errattr; + } snprintf(smc_ibname, sizeof(smc_ibname), "%s", smcibdev->ibdev->name); if (nla_put_string(skb, SMC_NLA_DEV_IB_NAME, smc_ibname)) goto errattr; diff --git a/net/smc/smc_ism.c b/net/smc/smc_ism.c index 524ef64a191a..9c6e95882553 100644 --- a/net/smc/smc_ism.c +++ b/net/smc/smc_ism.c @@ -250,7 +250,8 @@ static int smc_nl_handle_smcd_dev(struct smcd_dev *smcd, goto errattr; if (nla_put_u8(skb, SMC_NLA_DEV_PORT_PNET_USR, smcd->pnetid_by_user)) goto errportattr; - snprintf(smc_pnet, sizeof(smc_pnet), "%s", smcd->pnetid); + memcpy(smc_pnet, smcd->pnetid, SMC_MAX_PNETID_LEN); + smc_pnet[SMC_MAX_PNETID_LEN] = 0; if (nla_put_string(skb, SMC_NLA_DEV_PORT_PNETID, smc_pnet)) goto errportattr; diff --git a/net/sunrpc/addr.c b/net/sunrpc/addr.c index 010dcb876f9d..6e4dbd577a39 100644 --- a/net/sunrpc/addr.c +++ b/net/sunrpc/addr.c @@ -185,7 +185,7 @@ static int rpc_parse_scope_id(struct net *net, const char *buf, scope_id = dev->ifindex; dev_put(dev); } else { - if (kstrtou32(p, 10, &scope_id) == 0) { + if (kstrtou32(p, 10, &scope_id) != 0) { kfree(p); return 0; } diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 3259120462ed..612f0a641f4c 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -1251,10 +1251,7 @@ void rpc_prepare_reply_pages(struct rpc_rqst *req, struct page **pages, unsigned int base, unsigned int len, unsigned int hdrsize) { - /* Subtract one to force an extra word of buffer space for the - * payload's XDR pad to fall into the rcv_buf's tail iovec. - */ - hdrsize += RPC_REPHDRSIZE + req->rq_cred->cr_auth->au_ralign - 1; + hdrsize += RPC_REPHDRSIZE + req->rq_cred->cr_auth->au_ralign; xdr_inline_pages(&req->rq_rcv_buf, hdrsize << 2, pages, base, len); trace_rpc_xdr_reply_pages(req->rq_task, &req->rq_rcv_buf); diff --git a/net/sunrpc/debugfs.c b/net/sunrpc/debugfs.c index fd9bca242724..56029e3af6ff 100644 --- a/net/sunrpc/debugfs.c +++ b/net/sunrpc/debugfs.c @@ -128,13 +128,13 @@ static int do_xprt_debugfs(struct rpc_clnt *clnt, struct rpc_xprt *xprt, void *n return 0; len = snprintf(name, sizeof(name), "../../rpc_xprt/%s", xprt->debugfs->d_name.name); - if (len > sizeof(name)) + if (len >= sizeof(name)) return -1; if (*nump == 0) strcpy(link, "xprt"); else { len = snprintf(link, sizeof(link), "xprt%d", *nump); - if (len > sizeof(link)) + if (len >= sizeof(link)) return -1; } debugfs_create_symlink(link, clnt->cl_debugfs, name); diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index f06d7c315017..cf702a5f7fe5 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -676,6 +676,23 @@ struct rpc_task *rpc_wake_up_next(struct rpc_wait_queue *queue) EXPORT_SYMBOL_GPL(rpc_wake_up_next); /** + * rpc_wake_up_locked - wake up all rpc_tasks + * @queue: rpc_wait_queue on which the tasks are sleeping + * + */ +static void rpc_wake_up_locked(struct rpc_wait_queue *queue) +{ + struct rpc_task *task; + + for (;;) { + task = __rpc_find_next_queued(queue); + if (task == NULL) + break; + rpc_wake_up_task_queue_locked(queue, task); + } +} + +/** * rpc_wake_up - wake up all rpc_tasks * @queue: rpc_wait_queue on which the tasks are sleeping * @@ -683,25 +700,28 @@ EXPORT_SYMBOL_GPL(rpc_wake_up_next); */ void rpc_wake_up(struct rpc_wait_queue *queue) { - struct list_head *head; - spin_lock(&queue->lock); - head = &queue->tasks[queue->maxpriority]; + rpc_wake_up_locked(queue); + spin_unlock(&queue->lock); +} +EXPORT_SYMBOL_GPL(rpc_wake_up); + +/** + * rpc_wake_up_status_locked - wake up all rpc_tasks and set their status value. 
+ * @queue: rpc_wait_queue on which the tasks are sleeping + * @status: status value to set + */ +static void rpc_wake_up_status_locked(struct rpc_wait_queue *queue, int status) +{ + struct rpc_task *task; + for (;;) { - while (!list_empty(head)) { - struct rpc_task *task; - task = list_first_entry(head, - struct rpc_task, - u.tk_wait.list); - rpc_wake_up_task_queue_locked(queue, task); - } - if (head == &queue->tasks[0]) + task = __rpc_find_next_queued(queue); + if (task == NULL) break; - head--; + rpc_wake_up_task_queue_set_status_locked(queue, task, status); } - spin_unlock(&queue->lock); } -EXPORT_SYMBOL_GPL(rpc_wake_up); /** * rpc_wake_up_status - wake up all rpc_tasks and set their status value. @@ -712,23 +732,8 @@ EXPORT_SYMBOL_GPL(rpc_wake_up); */ void rpc_wake_up_status(struct rpc_wait_queue *queue, int status) { - struct list_head *head; - spin_lock(&queue->lock); - head = &queue->tasks[queue->maxpriority]; - for (;;) { - while (!list_empty(head)) { - struct rpc_task *task; - task = list_first_entry(head, - struct rpc_task, - u.tk_wait.list); - task->tk_status = status; - rpc_wake_up_task_queue_locked(queue, task); - } - if (head == &queue->tasks[0]) - break; - head--; - } + rpc_wake_up_status_locked(queue, status); spin_unlock(&queue->lock); } EXPORT_SYMBOL_GPL(rpc_wake_up_status); diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index 5fb9164aa690..dcc50ae54550 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -857,6 +857,7 @@ int svc_recv(struct svc_rqst *rqstp, long timeout) err = -EAGAIN; if (len <= 0) goto out_release; + trace_svc_xdr_recvfrom(&rqstp->rq_arg); clear_bit(XPT_OLD, &xprt->xpt_flags); @@ -866,7 +867,6 @@ int svc_recv(struct svc_rqst *rqstp, long timeout) if (serv->sv_stats) serv->sv_stats->netcnt++; - trace_svc_xdr_recvfrom(rqstp, &rqstp->rq_arg); return len; out_release: rqstp->rq_res.len = 0; @@ -904,7 +904,7 @@ int svc_send(struct svc_rqst *rqstp) xb->len = xb->head[0].iov_len + xb->page_len + xb->tail[0].iov_len; - trace_svc_xdr_sendto(rqstp, xb); + trace_svc_xdr_sendto(rqstp->rq_xid, xb); trace_svc_stats_latency(rqstp); len = xprt->xpt_ops->xpo_sendto(rqstp); diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index b248f2349437..c9766d07eb81 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -1062,6 +1062,90 @@ err_noclose: return 0; /* record not complete */ } +static int svc_tcp_send_kvec(struct socket *sock, const struct kvec *vec, + int flags) +{ + return kernel_sendpage(sock, virt_to_page(vec->iov_base), + offset_in_page(vec->iov_base), + vec->iov_len, flags); +} + +/* + * kernel_sendpage() is used exclusively to reduce the number of + * copy operations in this path. Therefore the caller must ensure + * that the pages backing @xdr are unchanging. + * + * In addition, the logic assumes that * .bv_len is never larger + * than PAGE_SIZE. + */ +static int svc_tcp_sendmsg(struct socket *sock, struct msghdr *msg, + struct xdr_buf *xdr, rpc_fraghdr marker, + unsigned int *sentp) +{ + const struct kvec *head = xdr->head; + const struct kvec *tail = xdr->tail; + struct kvec rm = { + .iov_base = &marker, + .iov_len = sizeof(marker), + }; + int flags, ret; + + *sentp = 0; + xdr_alloc_bvec(xdr, GFP_KERNEL); + + msg->msg_flags = MSG_MORE; + ret = kernel_sendmsg(sock, msg, &rm, 1, rm.iov_len); + if (ret < 0) + return ret; + *sentp += ret; + if (ret != rm.iov_len) + return -EAGAIN; + + flags = head->iov_len < xdr->len ? 
MSG_MORE | MSG_SENDPAGE_NOTLAST : 0; + ret = svc_tcp_send_kvec(sock, head, flags); + if (ret < 0) + return ret; + *sentp += ret; + if (ret != head->iov_len) + goto out; + + if (xdr->page_len) { + unsigned int offset, len, remaining; + struct bio_vec *bvec; + + bvec = xdr->bvec; + offset = xdr->page_base; + remaining = xdr->page_len; + flags = MSG_MORE | MSG_SENDPAGE_NOTLAST; + while (remaining > 0) { + if (remaining <= PAGE_SIZE && tail->iov_len == 0) + flags = 0; + len = min(remaining, bvec->bv_len); + ret = kernel_sendpage(sock, bvec->bv_page, + bvec->bv_offset + offset, + len, flags); + if (ret < 0) + return ret; + *sentp += ret; + if (ret != len) + goto out; + remaining -= len; + offset = 0; + bvec++; + } + } + + if (tail->iov_len) { + ret = svc_tcp_send_kvec(sock, tail, 0); + if (ret < 0) + return ret; + *sentp += ret; + } + +out: + return 0; +} + /** * svc_tcp_sendto - Send out a reply on a TCP socket * @rqstp: completed svc_rqst @@ -1089,7 +1173,7 @@ static int svc_tcp_sendto(struct svc_rqst *rqstp) mutex_lock(&xprt->xpt_mutex); if (svc_xprt_is_dead(xprt)) goto out_notconn; - err = xprt_sock_sendmsg(svsk->sk_sock, &msg, xdr, 0, marker, &sent); + err = svc_tcp_sendmsg(svsk->sk_sock, &msg, xdr, marker, &sent); xdr_free_bvec(xdr); trace_svcsock_tcp_send(xprt, err < 0 ? err : sent); if (err < 0 || sent != (xdr->len + sizeof(marker))) diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c index 757560a3b06b..3964ff74ee51 100644 --- a/net/sunrpc/xdr.c +++ b/net/sunrpc/xdr.c @@ -123,8 +123,7 @@ EXPORT_SYMBOL_GPL(xdr_decode_string_inplace); * @len: length of string, in bytes * */ -void -xdr_terminate_string(struct xdr_buf *buf, const u32 len) +void xdr_terminate_string(const struct xdr_buf *buf, const u32 len) { char *kaddr; @@ -134,8 +133,7 @@ xdr_terminate_string(struct xdr_buf *buf, const u32 len) } EXPORT_SYMBOL_GPL(xdr_terminate_string); -size_t -xdr_buf_pagecount(struct xdr_buf *buf) +size_t xdr_buf_pagecount(const struct xdr_buf *buf) { if (!buf->page_len) return 0; @@ -193,9 +191,6 @@ xdr_inline_pages(struct xdr_buf *xdr, unsigned int offset, tail->iov_base = buf + offset; tail->iov_len = buflen - offset; - if ((xdr->page_len & 3) == 0) - tail->iov_len -= sizeof(__be32); - xdr->buflen += len; } EXPORT_SYMBOL_GPL(xdr_inline_pages); @@ -228,6 +223,9 @@ _shift_data_left_pages(struct page **pages, size_t pgto_base, BUG_ON(pgfrom_base <= pgto_base); + if (!len) + return; + pgto = pages + (pgto_base >> PAGE_SHIFT); pgfrom = pages + (pgfrom_base >> PAGE_SHIFT); @@ -266,26 +264,6 @@ _shift_data_left_pages(struct page **pages, size_t pgto_base, } while ((len -= copy) != 0); } -static void -_shift_data_left_tail(struct xdr_buf *buf, unsigned int pgto, size_t len) -{ - struct kvec *tail = buf->tail; - - if (len > tail->iov_len) - len = tail->iov_len; - - _copy_to_pages(buf->pages, - buf->page_base + pgto, - (char *)tail->iov_base, - len); - tail->iov_len -= len; - - if (tail->iov_len > 0) - memmove((char *)tail->iov_base, - tail->iov_base + len, - tail->iov_len); -} - /** * _shift_data_right_pages * @pages: vector of pages containing both the source and dest memory area. 
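Before the remaining xdr.c hunks, a note on the recurring "if (!len) return;" guards: the copy/shift/zero helpers in this file are built around do/while loops, so their bodies, including kmap and page-pointer arithmetic, execute at least once even when asked to move zero bytes. The guards turn the zero-length case into a true no-op, which matters because the new shift helpers below routinely call them with len == 0. A toy model of the control flow (illustrative names, not the kernel API):

#include <stdio.h>

/* Mirrors the shape of the kernel's do/while copy loops: the body
 * always runs at least once, so a zero-length call would still touch
 * page pointers unless callers bail out early. */
static int passes(unsigned int len, unsigned int chunk)
{
	int n = 0;
	unsigned int copy;

	do {
		copy = chunk < len ? chunk : len;
		n++;	/* kmap/copy/flush would happen here */
	} while ((len -= copy) != 0);
	return n;
}

int main(void)
{
	printf("%d\n", passes(10, 4));	/* 3 chunks */
	printf("%d\n", passes(0, 4));	/* 1: body ran despite len == 0 */
	return 0;
}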
@@ -310,6 +288,9 @@ _shift_data_right_pages(struct page **pages, size_t pgto_base, BUG_ON(pgto_base <= pgfrom_base); + if (!len) + return; + pgto_base += len; pgfrom_base += len; @@ -351,46 +332,6 @@ _shift_data_right_pages(struct page **pages, size_t pgto_base, } while ((len -= copy) != 0); } -static unsigned int -_shift_data_right_tail(struct xdr_buf *buf, unsigned int pgfrom, size_t len) -{ - struct kvec *tail = buf->tail; - unsigned int tailbuf_len; - unsigned int result = 0; - size_t copy; - - tailbuf_len = buf->buflen - buf->head->iov_len - buf->page_len; - - /* Shift the tail first */ - if (tailbuf_len != 0) { - unsigned int free_space = tailbuf_len - tail->iov_len; - - if (len < free_space) - free_space = len; - if (len > free_space) - len = free_space; - - tail->iov_len += free_space; - copy = len; - - if (tail->iov_len > len) { - char *p = (char *)tail->iov_base + len; - memmove(p, tail->iov_base, tail->iov_len - free_space); - result += tail->iov_len - free_space; - } else - copy = tail->iov_len; - - /* Copy from the inlined pages into the tail */ - _copy_from_pages((char *)tail->iov_base, - buf->pages, - buf->page_base + pgfrom, - copy); - result += copy; - } - - return result; -} - /** * _copy_to_pages * @pages: array of pages @@ -408,6 +349,9 @@ _copy_to_pages(struct page **pages, size_t pgbase, const char *p, size_t len) char *vto; size_t copy; + if (!len) + return; + pgto = pages + (pgbase >> PAGE_SHIFT); pgbase &= ~PAGE_MASK; @@ -452,6 +396,9 @@ _copy_from_pages(char *p, struct page **pages, size_t pgbase, size_t len) char *vfrom; size_t copy; + if (!len) + return; + pgfrom = pages + (pgbase >> PAGE_SHIFT); pgbase &= ~PAGE_MASK; @@ -475,18 +422,42 @@ _copy_from_pages(char *p, struct page **pages, size_t pgbase, size_t len) } EXPORT_SYMBOL_GPL(_copy_from_pages); +static void xdr_buf_iov_zero(const struct kvec *iov, unsigned int base, + unsigned int len) +{ + if (base >= iov->iov_len) + return; + if (len > iov->iov_len - base) + len = iov->iov_len - base; + memset(iov->iov_base + base, 0, len); +} + /** - * _zero_pages - * @pages: array of pages - * @pgbase: beginning page vector address + * xdr_buf_pages_zero + * @buf: xdr_buf + * @pgbase: beginning offset + * @len: length */ -static void -_zero_pages(struct page **pages, size_t pgbase, size_t len) +static void xdr_buf_pages_zero(const struct xdr_buf *buf, unsigned int pgbase, + unsigned int len) { + struct page **pages = buf->pages; struct page **page; char *vpage; - size_t zero; + unsigned int zero; + + if (!len) + return; + if (pgbase >= buf->page_len) { + xdr_buf_iov_zero(buf->tail, pgbase - buf->page_len, len); + return; + } + if (pgbase + len > buf->page_len) { + xdr_buf_iov_zero(buf->tail, 0, pgbase + len - buf->page_len); + len = buf->page_len - pgbase; + } + + pgbase += buf->page_base; page = pages + (pgbase >> PAGE_SHIFT); pgbase &= ~PAGE_MASK; @@ -507,122 +478,367 @@ _zero_pages(struct page **pages, size_t pgbase, size_t len) } while ((len -= zero) != 0); } +static unsigned int xdr_buf_pages_fill_sparse(const struct xdr_buf *buf, + unsigned int buflen, gfp_t gfp) +{ + unsigned int i, npages, pagelen; + + if (!(buf->flags & XDRBUF_SPARSE_PAGES)) + return buflen; + if (buflen <= buf->head->iov_len) + return buflen; + pagelen = buflen - buf->head->iov_len; + if (pagelen > buf->page_len) + pagelen = buf->page_len; + npages = (pagelen + buf->page_base + PAGE_SIZE - 1) >> PAGE_SHIFT; + for (i = 0; i < npages; i++) { + if (buf->pages[i]) + continue; + buf->pages[i] = alloc_page(gfp); + if (likely(buf->pages[i]))
continue; + buflen -= pagelen; + pagelen = i << PAGE_SHIFT; + if (pagelen > buf->page_base) + buflen += pagelen - buf->page_base; + break; + } + return buflen; +} + +static void xdr_buf_try_expand(struct xdr_buf *buf, unsigned int len) +{ + struct kvec *head = buf->head; + struct kvec *tail = buf->tail; + unsigned int sum = head->iov_len + buf->page_len + tail->iov_len; + unsigned int free_space, newlen; + + if (sum > buf->len) { + free_space = min_t(unsigned int, sum - buf->len, len); + newlen = xdr_buf_pages_fill_sparse(buf, buf->len + free_space, + GFP_KERNEL); + free_space = newlen - buf->len; + buf->len = newlen; + len -= free_space; + if (!len) + return; + } + + if (buf->buflen > sum) { + /* Expand the tail buffer */ + free_space = min_t(unsigned int, buf->buflen - sum, len); + tail->iov_len += free_space; + buf->len += free_space; + } +} + +static void xdr_buf_tail_copy_right(const struct xdr_buf *buf, + unsigned int base, unsigned int len, + unsigned int shift) +{ + const struct kvec *tail = buf->tail; + unsigned int to = base + shift; + + if (to >= tail->iov_len) + return; + if (len + to > tail->iov_len) + len = tail->iov_len - to; + memmove(tail->iov_base + to, tail->iov_base + base, len); +} + +static void xdr_buf_pages_copy_right(const struct xdr_buf *buf, + unsigned int base, unsigned int len, + unsigned int shift) +{ + const struct kvec *tail = buf->tail; + unsigned int to = base + shift; + unsigned int pglen = 0; + unsigned int talen = 0, tato = 0; + + if (base >= buf->page_len) + return; + if (len > buf->page_len - base) + len = buf->page_len - base; + if (to >= buf->page_len) { + tato = to - buf->page_len; + if (tail->iov_len >= len + tato) + talen = len; + else if (tail->iov_len > tato) + talen = tail->iov_len - tato; + } else if (len + to >= buf->page_len) { + pglen = buf->page_len - to; + talen = len - pglen; + if (talen > tail->iov_len) + talen = tail->iov_len; + } else + pglen = len; + + _copy_from_pages(tail->iov_base + tato, buf->pages, + buf->page_base + base + pglen, talen); + _shift_data_right_pages(buf->pages, buf->page_base + to, + buf->page_base + base, pglen); +} + +static void xdr_buf_head_copy_right(const struct xdr_buf *buf, + unsigned int base, unsigned int len, + unsigned int shift) +{ + const struct kvec *head = buf->head; + const struct kvec *tail = buf->tail; + unsigned int to = base + shift; + unsigned int pglen = 0, pgto = 0; + unsigned int talen = 0, tato = 0; + + if (base >= head->iov_len) + return; + if (len > head->iov_len - base) + len = head->iov_len - base; + if (to >= buf->page_len + head->iov_len) { + tato = to - buf->page_len - head->iov_len; + talen = len; + } else if (to >= head->iov_len) { + pgto = to - head->iov_len; + pglen = len; + if (pgto + pglen > buf->page_len) { + talen = pgto + pglen - buf->page_len; + pglen -= talen; + } + } else { + pglen = len - to; + if (pglen > buf->page_len) { + talen = pglen - buf->page_len; + pglen = buf->page_len; + } + } + + len -= talen; + base += len; + if (talen + tato > tail->iov_len) + talen = tail->iov_len > tato ? 
tail->iov_len - tato : 0; + memcpy(tail->iov_base + tato, head->iov_base + base, talen); + + len -= pglen; + base -= pglen; + _copy_to_pages(buf->pages, buf->page_base + pgto, head->iov_base + base, + pglen); + + base -= len; + memmove(head->iov_base + to, head->iov_base + base, len); +} + +static void xdr_buf_tail_shift_right(const struct xdr_buf *buf, + unsigned int base, unsigned int len, + unsigned int shift) +{ + const struct kvec *tail = buf->tail; + + if (base >= tail->iov_len || !shift || !len) + return; + xdr_buf_tail_copy_right(buf, base, len, shift); +} + +static void xdr_buf_pages_shift_right(const struct xdr_buf *buf, + unsigned int base, unsigned int len, + unsigned int shift) +{ + if (!shift || !len) + return; + if (base >= buf->page_len) { + xdr_buf_tail_shift_right(buf, base - buf->page_len, len, shift); + return; + } + if (base + len > buf->page_len) + xdr_buf_tail_shift_right(buf, 0, base + len - buf->page_len, + shift); + xdr_buf_pages_copy_right(buf, base, len, shift); +} + +static void xdr_buf_head_shift_right(const struct xdr_buf *buf, + unsigned int base, unsigned int len, + unsigned int shift) +{ + const struct kvec *head = buf->head; + + if (!shift) + return; + if (base >= head->iov_len) { + xdr_buf_pages_shift_right(buf, head->iov_len - base, len, + shift); + return; + } + if (base + len > head->iov_len) + xdr_buf_pages_shift_right(buf, 0, base + len - head->iov_len, + shift); + xdr_buf_head_copy_right(buf, base, len, shift); +} + +static void xdr_buf_tail_copy_left(const struct xdr_buf *buf, unsigned int base, + unsigned int len, unsigned int shift) +{ + const struct kvec *tail = buf->tail; + + if (base >= tail->iov_len) + return; + if (len > tail->iov_len - base) + len = tail->iov_len - base; + /* Shift data into head */ + if (shift > buf->page_len + base) { + const struct kvec *head = buf->head; + unsigned int hdto = + head->iov_len + buf->page_len + base - shift; + unsigned int hdlen = len; + + if (WARN_ONCE(shift > head->iov_len + buf->page_len + base, + "SUNRPC: Misaligned data.\n")) + return; + if (hdto + hdlen > head->iov_len) + hdlen = head->iov_len - hdto; + memcpy(head->iov_base + hdto, tail->iov_base + base, hdlen); + base += hdlen; + len -= hdlen; + if (!len) + return; + } + /* Shift data into pages */ + if (shift > base) { + unsigned int pgto = buf->page_len + base - shift; + unsigned int pglen = len; + + if (pgto + pglen > buf->page_len) + pglen = buf->page_len - pgto; + _copy_to_pages(buf->pages, buf->page_base + pgto, + tail->iov_base + base, pglen); + base += pglen; + len -= pglen; + if (!len) + return; + } + memmove(tail->iov_base + base - shift, tail->iov_base + base, len); +} + +static void xdr_buf_pages_copy_left(const struct xdr_buf *buf, + unsigned int base, unsigned int len, + unsigned int shift) +{ + unsigned int pgto; + + if (base >= buf->page_len) + return; + if (len > buf->page_len - base) + len = buf->page_len - base; + /* Shift data into head */ + if (shift > base) { + const struct kvec *head = buf->head; + unsigned int hdto = head->iov_len + base - shift; + unsigned int hdlen = len; + + if (WARN_ONCE(shift > head->iov_len + base, + "SUNRPC: Misaligned data.\n")) + return; + if (hdto + hdlen > head->iov_len) + hdlen = head->iov_len - hdto; + _copy_from_pages(head->iov_base + hdto, buf->pages, + buf->page_base + base, hdlen); + base += hdlen; + len -= hdlen; + if (!len) + return; + } + pgto = base - shift; + _shift_data_left_pages(buf->pages, buf->page_base + pgto, + buf->page_base + base, len); +} + +static void 
xdr_buf_tail_shift_left(const struct xdr_buf *buf, + unsigned int base, unsigned int len, + unsigned int shift) +{ + if (!shift || !len) + return; + xdr_buf_tail_copy_left(buf, base, len, shift); +} + +static void xdr_buf_pages_shift_left(const struct xdr_buf *buf, + unsigned int base, unsigned int len, + unsigned int shift) +{ + if (!shift || !len) + return; + if (base >= buf->page_len) { + xdr_buf_tail_shift_left(buf, base - buf->page_len, len, shift); + return; + } + xdr_buf_pages_copy_left(buf, base, len, shift); + len += base; + if (len <= buf->page_len) + return; + xdr_buf_tail_copy_left(buf, 0, len - buf->page_len, shift); +} + /** * xdr_shrink_bufhead * @buf: xdr_buf - * @len: bytes to remove from buf->head[0] + * @len: new length of buf->head[0] * - * Shrinks XDR buffer's header kvec buf->head[0] by + * Shrinks XDR buffer's header kvec buf->head[0], setting it to * 'len' bytes. The extra data is not lost, but is instead * moved into the inlined pages and/or the tail. */ -static unsigned int -xdr_shrink_bufhead(struct xdr_buf *buf, size_t len) +static unsigned int xdr_shrink_bufhead(struct xdr_buf *buf, unsigned int len) { - struct kvec *head, *tail; - size_t copy, offs; - unsigned int pglen = buf->page_len; - unsigned int result; - - result = 0; - tail = buf->tail; - head = buf->head; + struct kvec *head = buf->head; + unsigned int shift, buflen = max(buf->len, len); WARN_ON_ONCE(len > head->iov_len); - if (len > head->iov_len) - len = head->iov_len; - - /* Shift the tail first */ - if (tail->iov_len != 0) { - if (tail->iov_len > len) { - copy = tail->iov_len - len; - memmove((char *)tail->iov_base + len, - tail->iov_base, copy); - result += copy; - } - /* Copy from the inlined pages into the tail */ - copy = len; - if (copy > pglen) - copy = pglen; - offs = len - copy; - if (offs >= tail->iov_len) - copy = 0; - else if (copy > tail->iov_len - offs) - copy = tail->iov_len - offs; - if (copy != 0) { - _copy_from_pages((char *)tail->iov_base + offs, - buf->pages, - buf->page_base + pglen + offs - len, - copy); - result += copy; - } - /* Do we also need to copy data from the head into the tail ? */ - if (len > pglen) { - offs = copy = len - pglen; - if (copy > tail->iov_len) - copy = tail->iov_len; - memcpy(tail->iov_base, - (char *)head->iov_base + - head->iov_len - offs, - copy); - result += copy; - } - } - /* Now handle pages */ - if (pglen != 0) { - if (pglen > len) - _shift_data_right_pages(buf->pages, - buf->page_base + len, - buf->page_base, - pglen - len); - copy = len; - if (len > pglen) - copy = pglen; - _copy_to_pages(buf->pages, buf->page_base, - (char *)head->iov_base + head->iov_len - len, - copy); - result += copy; + if (head->iov_len > buflen) { + buf->buflen -= head->iov_len - buflen; + head->iov_len = buflen; } - head->iov_len -= len; - buf->buflen -= len; - /* Have we truncated the message? */ - if (buf->len > buf->buflen) - buf->len = buf->buflen; - - return result; + if (len >= head->iov_len) + return 0; + shift = head->iov_len - len; + xdr_buf_try_expand(buf, shift); + xdr_buf_head_shift_right(buf, len, buflen - len, shift); + head->iov_len = len; + buf->buflen -= shift; + buf->len -= shift; + return shift; } /** - * xdr_shrink_pagelen - shrinks buf->pages by up to @len bytes + * xdr_shrink_pagelen - shrinks buf->pages to @len bytes * @buf: xdr_buf - * @len: bytes to remove from buf->pages + * @len: new page buffer length * * The extra data is not lost, but is instead moved into buf->tail. * Returns the actual number of bytes moved. 
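 * For example, with buf->page_len == 4096 and @len == 1024, the
 * 3072 bytes beyond the new length are shifted right into buf->tail
 * (expanding the tail if buf->buflen permits) and 3072 is returned.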
*/ -static unsigned int -xdr_shrink_pagelen(struct xdr_buf *buf, size_t len) +static unsigned int xdr_shrink_pagelen(struct xdr_buf *buf, unsigned int len) { - unsigned int pglen = buf->page_len; - unsigned int result; + unsigned int shift, buflen = buf->len - buf->head->iov_len; - if (len > buf->page_len) - len = buf-> page_len; - - result = _shift_data_right_tail(buf, pglen - len, len); - buf->page_len -= len; - buf->buflen -= len; - /* Have we truncated the message? */ - if (buf->len > buf->buflen) - buf->len = buf->buflen; - - return result; + WARN_ON_ONCE(len > buf->page_len); + if (buf->head->iov_len >= buf->len || len > buflen) + buflen = len; + if (buf->page_len > buflen) { + buf->buflen -= buf->page_len - buflen; + buf->page_len = buflen; + } + if (len >= buf->page_len) + return 0; + shift = buf->page_len - len; + xdr_buf_try_expand(buf, shift); + xdr_buf_pages_shift_right(buf, len, buflen - len, shift); + buf->page_len = len; + buf->len -= shift; + buf->buflen -= shift; + return shift; } void xdr_shift_buf(struct xdr_buf *buf, size_t len) { - xdr_shrink_bufhead(buf, len); + xdr_shrink_bufhead(buf, buf->head->iov_len - len); } EXPORT_SYMBOL_GPL(xdr_shift_buf); @@ -636,6 +852,18 @@ unsigned int xdr_stream_pos(const struct xdr_stream *xdr) } EXPORT_SYMBOL_GPL(xdr_stream_pos); +static void xdr_stream_set_pos(struct xdr_stream *xdr, unsigned int pos) +{ + unsigned int blen = xdr->buf->len; + + xdr->nwords = blen > pos ? XDR_QUADLEN(blen) - XDR_QUADLEN(pos) : 0; +} + +static void xdr_stream_page_set_pos(struct xdr_stream *xdr, unsigned int pos) +{ + xdr_stream_set_pos(xdr, pos + xdr->buf->head[0].iov_len); +} + /** * xdr_page_pos - Return the current offset from the start of the xdr pages * @xdr: pointer to struct xdr_stream @@ -969,19 +1197,31 @@ void xdr_write_pages(struct xdr_stream *xdr, struct page **pages, unsigned int b } EXPORT_SYMBOL_GPL(xdr_write_pages); -static void xdr_set_iov(struct xdr_stream *xdr, struct kvec *iov, - unsigned int len) +static unsigned int xdr_set_iov(struct xdr_stream *xdr, struct kvec *iov, + unsigned int base, unsigned int len) { if (len > iov->iov_len) len = iov->iov_len; - xdr->p = (__be32*)iov->iov_base; + if (unlikely(base > len)) + base = len; + xdr->p = (__be32*)(iov->iov_base + base); xdr->end = (__be32*)(iov->iov_base + len); xdr->iov = iov; xdr->page_ptr = NULL; + return len - base; +} + +static unsigned int xdr_set_tail_base(struct xdr_stream *xdr, + unsigned int base, unsigned int len) +{ + struct xdr_buf *buf = xdr->buf; + + xdr_stream_set_pos(xdr, base + buf->page_len + buf->head->iov_len); + return xdr_set_iov(xdr, buf->tail, base, len); } -static int xdr_set_page_base(struct xdr_stream *xdr, - unsigned int base, unsigned int len) +static unsigned int xdr_set_page_base(struct xdr_stream *xdr, + unsigned int base, unsigned int len) { unsigned int pgnr; unsigned int maxlen; @@ -990,12 +1230,15 @@ static int xdr_set_page_base(struct xdr_stream *xdr, void *kaddr; maxlen = xdr->buf->page_len; - if (base >= maxlen) - return -EINVAL; - maxlen -= base; + if (base >= maxlen) { + base = maxlen; + maxlen = 0; + } else + maxlen -= base; if (len > maxlen) len = maxlen; + xdr_stream_page_set_pos(xdr, base); base += xdr->buf->page_base; pgnr = base >> PAGE_SHIFT; @@ -1010,14 +1253,16 @@ static int xdr_set_page_base(struct xdr_stream *xdr, pgend = PAGE_SIZE; xdr->end = (__be32*)(kaddr + pgend); xdr->iov = NULL; - return 0; + return len; } static void xdr_set_page(struct xdr_stream *xdr, unsigned int base, unsigned int len) { - if (xdr_set_page_base(xdr, 
base, len) < 0) - xdr_set_iov(xdr, xdr->buf->tail, xdr->nwords << 2); + if (xdr_set_page_base(xdr, base, len) == 0) { + base -= xdr->buf->page_len; + xdr_set_tail_base(xdr, base, len); + } } static void xdr_set_next_page(struct xdr_stream *xdr) @@ -1026,17 +1271,18 @@ static void xdr_set_next_page(struct xdr_stream *xdr) newbase = (1 + xdr->page_ptr - xdr->buf->pages) << PAGE_SHIFT; newbase -= xdr->buf->page_base; - - xdr_set_page(xdr, newbase, PAGE_SIZE); + if (newbase < xdr->buf->page_len) + xdr_set_page_base(xdr, newbase, xdr_stream_remaining(xdr)); + else + xdr_set_tail_base(xdr, 0, xdr_stream_remaining(xdr)); } static bool xdr_set_next_buffer(struct xdr_stream *xdr) { if (xdr->page_ptr != NULL) xdr_set_next_page(xdr); - else if (xdr->iov == xdr->buf->head) { - xdr_set_page(xdr, 0, PAGE_SIZE); - } + else if (xdr->iov == xdr->buf->head) + xdr_set_page(xdr, 0, xdr_stream_remaining(xdr)); return xdr->p != xdr->end; } @@ -1053,12 +1299,9 @@ void xdr_init_decode(struct xdr_stream *xdr, struct xdr_buf *buf, __be32 *p, xdr->buf = buf; xdr_reset_scratch_buffer(xdr); xdr->nwords = XDR_QUADLEN(buf->len); - if (buf->head[0].iov_len != 0) - xdr_set_iov(xdr, buf->head, buf->len); - else if (buf->page_len != 0) - xdr_set_page_base(xdr, 0, buf->len); - else - xdr_set_iov(xdr, buf->head, buf->len); + if (xdr_set_iov(xdr, buf->head, 0, buf->len) == 0 && + xdr_set_page_base(xdr, 0, buf->len) == 0) + xdr_set_iov(xdr, buf->tail, 0, buf->len); if (p != NULL && p > xdr->p && xdr->end >= p) { xdr->nwords -= p - xdr->p; xdr->p = p; @@ -1158,14 +1401,13 @@ static void xdr_realign_pages(struct xdr_stream *xdr) struct xdr_buf *buf = xdr->buf; struct kvec *iov = buf->head; unsigned int cur = xdr_stream_pos(xdr); - unsigned int copied, offset; + unsigned int copied; /* Realign pages to current pointer position */ if (iov->iov_len > cur) { - offset = iov->iov_len - cur; - copied = xdr_shrink_bufhead(buf, offset); - trace_rpc_xdr_alignment(xdr, offset, copied); - xdr->nwords = XDR_QUADLEN(buf->len - cur); + copied = xdr_shrink_bufhead(buf, cur); + trace_rpc_xdr_alignment(xdr, cur, copied); + xdr_set_page(xdr, 0, buf->page_len); } } @@ -1173,8 +1415,7 @@ static unsigned int xdr_align_pages(struct xdr_stream *xdr, unsigned int len) { struct xdr_buf *buf = xdr->buf; unsigned int nwords = XDR_QUADLEN(len); - unsigned int cur = xdr_stream_pos(xdr); - unsigned int copied, offset; + unsigned int copied; if (xdr->nwords == 0) return 0; @@ -1188,125 +1429,103 @@ static unsigned int xdr_align_pages(struct xdr_stream *xdr, unsigned int len) len = buf->page_len; else if (nwords < xdr->nwords) { /* Truncate page data and move it into the tail */ - offset = buf->page_len - len; - copied = xdr_shrink_pagelen(buf, offset); - trace_rpc_xdr_alignment(xdr, offset, copied); - xdr->nwords = XDR_QUADLEN(buf->len - cur); + copied = xdr_shrink_pagelen(buf, len); + trace_rpc_xdr_alignment(xdr, len, copied); } return len; } /** - * xdr_read_pages - Ensure page-based XDR data to decode is aligned at current pointer position + * xdr_read_pages - align page-based XDR data to current pointer position * @xdr: pointer to xdr_stream struct * @len: number of bytes of page data * * Moves data beyond the current pointer position from the XDR head[] buffer - * into the page list. Any data that lies beyond current position + "len" - * bytes is moved into the XDR tail[]. + * into the page list. Any data that lies beyond current position + @len + * bytes is moved into the XDR tail[]. 
The xdr_stream current position is + * then advanced past that data to align to the next XDR object in the tail. * * Returns the number of XDR encoded bytes now contained in the pages */ unsigned int xdr_read_pages(struct xdr_stream *xdr, unsigned int len) { - struct xdr_buf *buf = xdr->buf; - struct kvec *iov; - unsigned int nwords; - unsigned int end; - unsigned int padding; + unsigned int nwords = XDR_QUADLEN(len); + unsigned int base, end, pglen; - len = xdr_align_pages(xdr, len); - if (len == 0) + pglen = xdr_align_pages(xdr, nwords << 2); + if (pglen == 0) return 0; - nwords = XDR_QUADLEN(len); - padding = (nwords << 2) - len; - xdr->iov = iov = buf->tail; - /* Compute remaining message length. */ - end = ((xdr->nwords - nwords) << 2) + padding; - if (end > iov->iov_len) - end = iov->iov_len; - /* - * Position current pointer at beginning of tail, and - * set remaining message length. - */ - xdr->p = (__be32 *)((char *)iov->iov_base + padding); - xdr->end = (__be32 *)((char *)iov->iov_base + end); - xdr->page_ptr = NULL; - xdr->nwords = XDR_QUADLEN(end - padding); - return len; + base = (nwords << 2) - pglen; + end = xdr_stream_remaining(xdr) - pglen; + + xdr_set_tail_base(xdr, base, end); + return len <= pglen ? len : pglen; } EXPORT_SYMBOL_GPL(xdr_read_pages); -uint64_t xdr_align_data(struct xdr_stream *xdr, uint64_t offset, uint32_t length) +unsigned int xdr_align_data(struct xdr_stream *xdr, unsigned int offset, + unsigned int length) { struct xdr_buf *buf = xdr->buf; - unsigned int from, bytes; - unsigned int shift = 0; - - if ((offset + length) < offset || - (offset + length) > buf->page_len) - length = buf->page_len - offset; + unsigned int from, bytes, len; + unsigned int shift; xdr_realign_pages(xdr); from = xdr_page_pos(xdr); - bytes = xdr->nwords << 2; - if (length < bytes) - bytes = length; + + if (from >= buf->page_len + buf->tail->iov_len) + return 0; + if (from + buf->head->iov_len >= buf->len) + return 0; + + len = buf->len - buf->head->iov_len; + + /* We only shift data left! */ + if (WARN_ONCE(from < offset, "SUNRPC: misaligned data src=%u dst=%u\n", + from, offset)) + return 0; + if (WARN_ONCE(offset > buf->page_len, + "SUNRPC: buffer overflow. 
offset=%u, page_len=%u\n", + offset, buf->page_len)) + return 0; /* Move page data to the left */ - if (from > offset) { - shift = min_t(unsigned int, bytes, buf->page_len - from); - _shift_data_left_pages(buf->pages, - buf->page_base + offset, - buf->page_base + from, - shift); - bytes -= shift; - - /* Move tail data into the pages, if necessary */ - if (bytes > 0) - _shift_data_left_tail(buf, offset + shift, bytes); - } + shift = from - offset; + xdr_buf_pages_shift_left(buf, from, len, shift); + + bytes = xdr_stream_remaining(xdr); + if (length > bytes) + length = bytes; + bytes -= length; - xdr->nwords -= XDR_QUADLEN(length); - xdr_set_page(xdr, from + length, PAGE_SIZE); + xdr->buf->len -= shift; + xdr_set_page(xdr, offset + length, bytes); return length; } EXPORT_SYMBOL_GPL(xdr_align_data); -uint64_t xdr_expand_hole(struct xdr_stream *xdr, uint64_t offset, uint64_t length) +unsigned int xdr_expand_hole(struct xdr_stream *xdr, unsigned int offset, + unsigned int length) { struct xdr_buf *buf = xdr->buf; - unsigned int bytes; - unsigned int from; - unsigned int truncated = 0; - - if ((offset + length) < offset || - (offset + length) > buf->page_len) - length = buf->page_len - offset; + unsigned int from, to, shift; xdr_realign_pages(xdr); from = xdr_page_pos(xdr); - bytes = xdr->nwords << 2; - - if (offset + length + bytes > buf->page_len) { - unsigned int shift = (offset + length + bytes) - buf->page_len; - unsigned int res = _shift_data_right_tail(buf, from + bytes - shift, shift); - truncated = shift - res; - xdr->nwords -= XDR_QUADLEN(truncated); - bytes -= shift; - } - - /* Now move the page data over and zero pages */ - if (bytes > 0) - _shift_data_right_pages(buf->pages, - buf->page_base + offset + length, - buf->page_base + from, - bytes); - _zero_pages(buf->pages, buf->page_base + offset, length); + to = xdr_align_size(offset + length); + + /* Could the hole be behind us? 
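	 * If @to lies beyond the current read position, the page data is
	 * shifted right to open up the hole; otherwise it is realigned
	 * left to @to. The hole itself is then zeroed in place.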
*/ + if (to > from) { + unsigned int buflen = buf->len - buf->head->iov_len; + shift = to - from; + xdr_buf_try_expand(buf, shift); + xdr_buf_pages_shift_right(buf, from, buflen, shift); + xdr_set_page(xdr, to, xdr_stream_remaining(xdr)); + } else if (to != from) + xdr_align_data(xdr, to, 0); + xdr_buf_pages_zero(buf, offset, length); - buf->len += length - (from - offset) - truncated; - xdr_set_page(xdr, offset + length, PAGE_SIZE); return length; } EXPORT_SYMBOL_GPL(xdr_expand_hole); @@ -1335,8 +1554,7 @@ EXPORT_SYMBOL_GPL(xdr_enter_page); static const struct kvec empty_iov = {.iov_base = NULL, .iov_len = 0}; -void -xdr_buf_from_iov(struct kvec *iov, struct xdr_buf *buf) +void xdr_buf_from_iov(const struct kvec *iov, struct xdr_buf *buf) { buf->head[0] = *iov; buf->tail[0] = empty_iov; @@ -1493,7 +1711,8 @@ fix_len: } EXPORT_SYMBOL_GPL(xdr_buf_trim); -static void __read_bytes_from_xdr_buf(struct xdr_buf *subbuf, void *obj, unsigned int len) +static void __read_bytes_from_xdr_buf(const struct xdr_buf *subbuf, + void *obj, unsigned int len) { unsigned int this_len; @@ -1502,8 +1721,7 @@ static void __read_bytes_from_xdr_buf(struct xdr_buf *subbuf, void *obj, unsigne len -= this_len; obj += this_len; this_len = min_t(unsigned int, len, subbuf->page_len); - if (this_len) - _copy_from_pages(obj, subbuf->pages, subbuf->page_base, this_len); + _copy_from_pages(obj, subbuf->pages, subbuf->page_base, this_len); len -= this_len; obj += this_len; this_len = min_t(unsigned int, len, subbuf->tail[0].iov_len); @@ -1511,7 +1729,8 @@ static void __read_bytes_from_xdr_buf(struct xdr_buf *subbuf, void *obj, unsigne } /* obj is assumed to point to allocated memory of size at least len: */ -int read_bytes_from_xdr_buf(struct xdr_buf *buf, unsigned int base, void *obj, unsigned int len) +int read_bytes_from_xdr_buf(const struct xdr_buf *buf, unsigned int base, + void *obj, unsigned int len) { struct xdr_buf subbuf; int status; @@ -1524,7 +1743,8 @@ int read_bytes_from_xdr_buf(struct xdr_buf *buf, unsigned int base, void *obj, u } EXPORT_SYMBOL_GPL(read_bytes_from_xdr_buf); -static void __write_bytes_to_xdr_buf(struct xdr_buf *subbuf, void *obj, unsigned int len) +static void __write_bytes_to_xdr_buf(const struct xdr_buf *subbuf, + void *obj, unsigned int len) { unsigned int this_len; @@ -1533,8 +1753,7 @@ static void __write_bytes_to_xdr_buf(struct xdr_buf *subbuf, void *obj, unsigned len -= this_len; obj += this_len; this_len = min_t(unsigned int, len, subbuf->page_len); - if (this_len) - _copy_to_pages(subbuf->pages, subbuf->page_base, obj, this_len); + _copy_to_pages(subbuf->pages, subbuf->page_base, obj, this_len); len -= this_len; obj += this_len; this_len = min_t(unsigned int, len, subbuf->tail[0].iov_len); @@ -1542,7 +1761,8 @@ static void __write_bytes_to_xdr_buf(struct xdr_buf *subbuf, void *obj, unsigned } /* obj is assumed to point to allocated memory of size at least len: */ -int write_bytes_to_xdr_buf(struct xdr_buf *buf, unsigned int base, void *obj, unsigned int len) +int write_bytes_to_xdr_buf(const struct xdr_buf *buf, unsigned int base, + void *obj, unsigned int len) { struct xdr_buf subbuf; int status; @@ -1555,8 +1775,7 @@ int write_bytes_to_xdr_buf(struct xdr_buf *buf, unsigned int base, void *obj, un } EXPORT_SYMBOL_GPL(write_bytes_to_xdr_buf); -int -xdr_decode_word(struct xdr_buf *buf, unsigned int base, u32 *obj) +int xdr_decode_word(const struct xdr_buf *buf, unsigned int base, u32 *obj) { __be32 raw; int status; @@ -1569,8 +1788,7 @@ xdr_decode_word(struct xdr_buf *buf, unsigned int 
base, u32 *obj) } EXPORT_SYMBOL_GPL(xdr_decode_word); -int -xdr_encode_word(struct xdr_buf *buf, unsigned int base, u32 obj) +int xdr_encode_word(const struct xdr_buf *buf, unsigned int base, u32 obj) { __be32 raw = cpu_to_be32(obj); @@ -1579,9 +1797,8 @@ xdr_encode_word(struct xdr_buf *buf, unsigned int base, u32 obj) EXPORT_SYMBOL_GPL(xdr_encode_word); /* Returns 0 on success, or else a negative error code. */ -static int -xdr_xcode_array2(struct xdr_buf *buf, unsigned int base, - struct xdr_array2_desc *desc, int encode) +static int xdr_xcode_array2(const struct xdr_buf *buf, unsigned int base, + struct xdr_array2_desc *desc, int encode) { char *elem = NULL, *c; unsigned int copied = 0, todo, avail_here; @@ -1773,9 +1990,8 @@ out: return err; } -int -xdr_decode_array2(struct xdr_buf *buf, unsigned int base, - struct xdr_array2_desc *desc) +int xdr_decode_array2(const struct xdr_buf *buf, unsigned int base, + struct xdr_array2_desc *desc) { if (base >= buf->len) return -EINVAL; @@ -1784,9 +2000,8 @@ xdr_decode_array2(struct xdr_buf *buf, unsigned int base, } EXPORT_SYMBOL_GPL(xdr_decode_array2); -int -xdr_encode_array2(struct xdr_buf *buf, unsigned int base, - struct xdr_array2_desc *desc) +int xdr_encode_array2(const struct xdr_buf *buf, unsigned int base, + struct xdr_array2_desc *desc) { if ((unsigned long) base + 4 + desc->array_len * desc->elem_size > buf->head->iov_len + buf->page_len + buf->tail->iov_len) @@ -1796,9 +2011,9 @@ xdr_encode_array2(struct xdr_buf *buf, unsigned int base, } EXPORT_SYMBOL_GPL(xdr_encode_array2); -int -xdr_process_buf(struct xdr_buf *buf, unsigned int offset, unsigned int len, - int (*actor)(struct scatterlist *, void *), void *data) +int xdr_process_buf(const struct xdr_buf *buf, unsigned int offset, + unsigned int len, + int (*actor)(struct scatterlist *, void *), void *data) { int i, ret = 0; unsigned int page_len, thislen, page_offset; @@ -1966,10 +2181,8 @@ ssize_t xdr_stream_decode_string_dup(struct xdr_stream *xdr, char **str, ret = xdr_stream_decode_opaque_inline(xdr, &p, maxlen); if (ret > 0) { - char *s = kmalloc(ret + 1, gfp_flags); + char *s = kmemdup_nul(p, ret, gfp_flags); if (s != NULL) { - memcpy(s, p, ret); - s[ret] = '\0'; *str = s; return strlen(s); } diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index f6c17e75f20e..691ccf8049a4 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -151,33 +151,94 @@ out: } EXPORT_SYMBOL_GPL(xprt_unregister_transport); -/** - * xprt_load_transport - load a transport implementation - * @transport_name: transport to load - * - * Returns: - * 0: transport successfully loaded - * -ENOENT: transport module not available - */ -int xprt_load_transport(const char *transport_name) +static void +xprt_class_release(const struct xprt_class *t) { - struct xprt_class *t; - int result; + module_put(t->owner); +} + +static const struct xprt_class * +xprt_class_find_by_ident_locked(int ident) +{ + const struct xprt_class *t; + + list_for_each_entry(t, &xprt_list, list) { + if (t->ident != ident) + continue; + if (!try_module_get(t->owner)) + continue; + return t; + } + return NULL; +} + +static const struct xprt_class * +xprt_class_find_by_ident(int ident) +{ + const struct xprt_class *t; - result = 0; spin_lock(&xprt_list_lock); + t = xprt_class_find_by_ident_locked(ident); + spin_unlock(&xprt_list_lock); + return t; +} + +static const struct xprt_class * +xprt_class_find_by_netid_locked(const char *netid) +{ + const struct xprt_class *t; + unsigned int i; + list_for_each_entry(t, &xprt_list, list) { - if 
(strcmp(t->name, transport_name) == 0) { - spin_unlock(&xprt_list_lock); - goto out; + for (i = 0; t->netid[i][0] != '\0'; i++) { + if (strcmp(t->netid[i], netid) != 0) + continue; + if (!try_module_get(t->owner)) + continue; + return t; } } + return NULL; +} + +static const struct xprt_class * +xprt_class_find_by_netid(const char *netid) +{ + const struct xprt_class *t; + + spin_lock(&xprt_list_lock); + t = xprt_class_find_by_netid_locked(netid); + if (!t) { + spin_unlock(&xprt_list_lock); + request_module("rpc%s", netid); + spin_lock(&xprt_list_lock); + t = xprt_class_find_by_netid_locked(netid); + } spin_unlock(&xprt_list_lock); - result = request_module("xprt%s", transport_name); -out: - return result; + return t; +} + +/** + * xprt_find_transport_ident - convert a netid into a transport identifier + * @netid: transport to load + * + * Returns: + * > 0: transport identifier + * -ENOENT: transport module not available + */ +int xprt_find_transport_ident(const char *netid) +{ + const struct xprt_class *t; + int ret; + + t = xprt_class_find_by_netid(netid); + if (!t) + return -ENOENT; + ret = t->ident; + xprt_class_release(t); + return ret; } -EXPORT_SYMBOL_GPL(xprt_load_transport); +EXPORT_SYMBOL_GPL(xprt_find_transport_ident); static void xprt_clear_locked(struct rpc_xprt *xprt) { @@ -1896,21 +1957,17 @@ static void xprt_init(struct rpc_xprt *xprt, struct net *net) struct rpc_xprt *xprt_create_transport(struct xprt_create *args) { struct rpc_xprt *xprt; - struct xprt_class *t; + const struct xprt_class *t; - spin_lock(&xprt_list_lock); - list_for_each_entry(t, &xprt_list, list) { - if (t->ident == args->ident) { - spin_unlock(&xprt_list_lock); - goto found; - } + t = xprt_class_find_by_ident(args->ident); + if (!t) { + dprintk("RPC: transport (%d) not supported\n", args->ident); + return ERR_PTR(-EIO); } - spin_unlock(&xprt_list_lock); - dprintk("RPC: transport (%d) not supported\n", args->ident); - return ERR_PTR(-EIO); -found: xprt = t->setup(args); + xprt_class_release(t); + if (IS_ERR(xprt)) goto out; if (args->flags & XPRT_CREATE_NO_IDLE_TIMEOUT) diff --git a/net/sunrpc/xprtrdma/backchannel.c b/net/sunrpc/xprtrdma/backchannel.c index c92c1aac270a..946edf2db646 100644 --- a/net/sunrpc/xprtrdma/backchannel.c +++ b/net/sunrpc/xprtrdma/backchannel.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* - * Copyright (c) 2015 Oracle. All rights reserved. + * Copyright (c) 2015-2020, Oracle and/or its affiliates. * * Support for backward direction RPCs on RPC/RDMA. 
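 * A "backward direction" RPC travels from server to client on an
 * existing connection; NFSv4.1 uses this for callback requests.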
*/ @@ -82,7 +82,7 @@ static int rpcrdma_bc_marshal_reply(struct rpc_rqst *rqst) &rqst->rq_snd_buf, rpcrdma_noch_pullup)) return -EIO; - trace_xprtrdma_cb_reply(rqst); + trace_xprtrdma_cb_reply(r_xprt, rqst); return 0; } @@ -260,7 +260,7 @@ void rpcrdma_bc_receive_call(struct rpcrdma_xprt *r_xprt, */ req = rpcr_to_rdmar(rqst); req->rl_reply = rep; - trace_xprtrdma_cb_call(rqst); + trace_xprtrdma_cb_call(r_xprt, rqst); /* Queue rqst for ULP's callback service */ bc_serv = xprt->bc_serv; diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index 44888f5badef..baca49fe83af 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -65,18 +65,23 @@ void frwr_release_mr(struct rpcrdma_mr *mr) kfree(mr); } +static void frwr_mr_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr *mr) +{ + if (mr->mr_device) { + trace_xprtrdma_mr_unmap(mr); + ib_dma_unmap_sg(mr->mr_device, mr->mr_sg, mr->mr_nents, + mr->mr_dir); + mr->mr_device = NULL; + } +} + static void frwr_mr_recycle(struct rpcrdma_mr *mr) { struct rpcrdma_xprt *r_xprt = mr->mr_xprt; trace_xprtrdma_mr_recycle(mr); - if (mr->mr_dir != DMA_NONE) { - trace_xprtrdma_mr_unmap(mr); - ib_dma_unmap_sg(r_xprt->rx_ep->re_id->device, - mr->mr_sg, mr->mr_nents, mr->mr_dir); - mr->mr_dir = DMA_NONE; - } + frwr_mr_unmap(r_xprt, mr); spin_lock(&r_xprt->rx_buf.rb_lock); list_del(&mr->mr_all); @@ -86,6 +91,16 @@ static void frwr_mr_recycle(struct rpcrdma_mr *mr) frwr_release_mr(mr); } +static void frwr_mr_put(struct rpcrdma_mr *mr) +{ + frwr_mr_unmap(mr->mr_xprt, mr); + + /* The MR is returned to the req's MR free list instead + * of to the xprt's MR free list. No spinlock is needed. + */ + rpcrdma_mr_push(mr, &mr->mr_req->rl_free_mrs); +} + /* frwr_reset - Place MRs back on the free list * @req: request to reset * @@ -101,7 +116,7 @@ void frwr_reset(struct rpcrdma_req *req) struct rpcrdma_mr *mr; while ((mr = rpcrdma_mr_pop(&req->rl_registered))) - rpcrdma_mr_put(mr); + frwr_mr_put(mr); } /** @@ -130,7 +145,7 @@ int frwr_mr_init(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr *mr) mr->mr_xprt = r_xprt; mr->frwr.fr_mr = frmr; - mr->mr_dir = DMA_NONE; + mr->mr_device = NULL; INIT_LIST_HEAD(&mr->mr_list); init_completion(&mr->frwr.fr_linv_done); @@ -315,6 +330,7 @@ struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt, mr->mr_dir); if (!dma_nents) goto out_dmamap_err; + mr->mr_device = ep->re_id->device; ibmr = mr->frwr.fr_mr; n = ib_map_mr_sg(ibmr, mr->mr_sg, dma_nents, NULL, PAGE_SIZE); @@ -341,7 +357,6 @@ struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt, return seg; out_dmamap_err: - mr->mr_dir = DMA_NONE; trace_xprtrdma_frwr_sgerr(mr, i); return ERR_PTR(-EIO); @@ -363,12 +378,21 @@ static void frwr_wc_fastreg(struct ib_cq *cq, struct ib_wc *wc) container_of(cqe, struct rpcrdma_frwr, fr_cqe); /* WARNING: Only wr_cqe and status are reliable at this point */ - trace_xprtrdma_wc_fastreg(wc, frwr); + trace_xprtrdma_wc_fastreg(wc, &frwr->fr_cid); /* The MR will get recycled when the associated req is retransmitted */ rpcrdma_flush_disconnect(cq->cq_context, wc); } +static void frwr_cid_init(struct rpcrdma_ep *ep, + struct rpcrdma_frwr *frwr) +{ + struct rpc_rdma_cid *cid = &frwr->fr_cid; + + cid->ci_queue_id = ep->re_attr.send_cq->res.id; + cid->ci_completion_id = frwr->fr_mr->res.id; +} + /** * frwr_send - post Send WRs containing the RPC Call message * @r_xprt: controlling transport instance @@ -385,6 +409,7 @@ static void frwr_wc_fastreg(struct ib_cq *cq, struct ib_wc *wc) */ int frwr_send(struct 
rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) { + struct rpcrdma_ep *ep = r_xprt->rx_ep; struct ib_send_wr *post_wr; struct rpcrdma_mr *mr; @@ -395,6 +420,7 @@ int frwr_send(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) frwr = &mr->frwr; frwr->fr_cqe.done = frwr_wc_fastreg; + frwr_cid_init(ep, frwr); frwr->fr_regwr.wr.next = post_wr; frwr->fr_regwr.wr.wr_cqe = &frwr->fr_cqe; frwr->fr_regwr.wr.num_sge = 0; @@ -404,7 +430,7 @@ int frwr_send(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) post_wr = &frwr->fr_regwr.wr; } - return ib_post_send(r_xprt->rx_ep->re_id->qp, post_wr, NULL); + return ib_post_send(ep->re_id->qp, post_wr, NULL); } /** @@ -420,18 +446,17 @@ void frwr_reminv(struct rpcrdma_rep *rep, struct list_head *mrs) list_for_each_entry(mr, mrs, mr_list) if (mr->mr_handle == rep->rr_inv_rkey) { list_del_init(&mr->mr_list); - trace_xprtrdma_mr_reminv(mr); - rpcrdma_mr_put(mr); + frwr_mr_put(mr); break; /* only one invalidated MR per RPC */ } } -static void __frwr_release_mr(struct ib_wc *wc, struct rpcrdma_mr *mr) +static void frwr_mr_done(struct ib_wc *wc, struct rpcrdma_mr *mr) { if (wc->status != IB_WC_SUCCESS) frwr_mr_recycle(mr); else - rpcrdma_mr_put(mr); + frwr_mr_put(mr); } /** @@ -448,8 +473,8 @@ static void frwr_wc_localinv(struct ib_cq *cq, struct ib_wc *wc) struct rpcrdma_mr *mr = container_of(frwr, struct rpcrdma_mr, frwr); /* WARNING: Only wr_cqe and status are reliable at this point */ - trace_xprtrdma_wc_li(wc, frwr); - __frwr_release_mr(wc, mr); + trace_xprtrdma_wc_li(wc, &frwr->fr_cid); + frwr_mr_done(wc, mr); rpcrdma_flush_disconnect(cq->cq_context, wc); } @@ -469,8 +494,8 @@ static void frwr_wc_localinv_wake(struct ib_cq *cq, struct ib_wc *wc) struct rpcrdma_mr *mr = container_of(frwr, struct rpcrdma_mr, frwr); /* WARNING: Only wr_cqe and status are reliable at this point */ - trace_xprtrdma_wc_li_wake(wc, frwr); - __frwr_release_mr(wc, mr); + trace_xprtrdma_wc_li_wake(wc, &frwr->fr_cid); + frwr_mr_done(wc, mr); complete(&frwr->fr_linv_done); rpcrdma_flush_disconnect(cq->cq_context, wc); @@ -490,6 +515,7 @@ static void frwr_wc_localinv_wake(struct ib_cq *cq, struct ib_wc *wc) void frwr_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) { struct ib_send_wr *first, **prev, *last; + struct rpcrdma_ep *ep = r_xprt->rx_ep; const struct ib_send_wr *bad_wr; struct rpcrdma_frwr *frwr; struct rpcrdma_mr *mr; @@ -509,6 +535,7 @@ void frwr_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) frwr = &mr->frwr; frwr->fr_cqe.done = frwr_wc_localinv; + frwr_cid_init(ep, frwr); last = &frwr->fr_invwr; last->next = NULL; last->wr_cqe = &frwr->fr_cqe; @@ -534,7 +561,7 @@ void frwr_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) * unless re_id->qp is a valid pointer. */ bad_wr = NULL; - rc = ib_post_send(r_xprt->rx_ep->re_id->qp, first, &bad_wr); + rc = ib_post_send(ep->re_id->qp, first, &bad_wr); /* The final LOCAL_INV WR in the chain is supposed to * do the wake. If it was never posted, the wake will @@ -547,7 +574,7 @@ void frwr_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) /* Recycle MRs in the LOCAL_INV chain that did not get posted. 
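	 * ib_post_send() leaves @bad_wr pointing at the first WR that
	 * failed to post, so every MR from there to the end of the
	 * chain is still registered and must be recycled by hand.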
*/ - trace_xprtrdma_post_linv(req, rc); + trace_xprtrdma_post_linv_err(req, rc); while (bad_wr) { frwr = container_of(bad_wr, struct rpcrdma_frwr, fr_invwr); @@ -574,10 +601,10 @@ static void frwr_wc_localinv_done(struct ib_cq *cq, struct ib_wc *wc) struct rpcrdma_rep *rep = mr->mr_req->rl_reply; /* WARNING: Only wr_cqe and status are reliable at this point */ - trace_xprtrdma_wc_li_done(wc, frwr); - __frwr_release_mr(wc, mr); + trace_xprtrdma_wc_li_done(wc, &frwr->fr_cid); + frwr_mr_done(wc, mr); - /* Ensure @rep is generated before __frwr_release_mr */ + /* Ensure @rep is generated before frwr_mr_done */ smp_rmb(); rpcrdma_complete_rqst(rep); @@ -597,6 +624,7 @@ static void frwr_wc_localinv_done(struct ib_cq *cq, struct ib_wc *wc) void frwr_unmap_async(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) { struct ib_send_wr *first, *last, **prev; + struct rpcrdma_ep *ep = r_xprt->rx_ep; const struct ib_send_wr *bad_wr; struct rpcrdma_frwr *frwr; struct rpcrdma_mr *mr; @@ -614,6 +642,7 @@ void frwr_unmap_async(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) frwr = &mr->frwr; frwr->fr_cqe.done = frwr_wc_localinv; + frwr_cid_init(ep, frwr); last = &frwr->fr_invwr; last->next = NULL; last->wr_cqe = &frwr->fr_cqe; @@ -639,13 +668,13 @@ void frwr_unmap_async(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) * unless re_id->qp is a valid pointer. */ bad_wr = NULL; - rc = ib_post_send(r_xprt->rx_ep->re_id->qp, first, &bad_wr); + rc = ib_post_send(ep->re_id->qp, first, &bad_wr); if (!rc) return; /* Recycle MRs in the LOCAL_INV chain that did not get posted. */ - trace_xprtrdma_post_linv(req, rc); + trace_xprtrdma_post_linv_err(req, rc); while (bad_wr) { frwr = container_of(bad_wr, struct rpcrdma_frwr, fr_invwr); mr = container_of(frwr, struct rpcrdma_mr, frwr); diff --git a/net/sunrpc/xprtrdma/module.c b/net/sunrpc/xprtrdma/module.c index 620327c01302..45c5b41ac8dc 100644 --- a/net/sunrpc/xprtrdma/module.c +++ b/net/sunrpc/xprtrdma/module.c @@ -24,6 +24,7 @@ MODULE_DESCRIPTION("RPC/RDMA Transport"); MODULE_LICENSE("Dual BSD/GPL"); MODULE_ALIAS("svcrdma"); MODULE_ALIAS("xprtrdma"); +MODULE_ALIAS("rpcrdma6"); static void __exit rpc_rdma_cleanup(void) { diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index 0f5120c7668f..8f5d0cb68360 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause /* - * Copyright (c) 2014-2017 Oracle. All rights reserved. + * Copyright (c) 2014-2020, Oracle and/or its affiliates. * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved. * * This software is available to you under a choice of one of two @@ -179,6 +179,31 @@ rpcrdma_nonpayload_inline(const struct rpcrdma_xprt *r_xprt, r_xprt->rx_ep->re_max_inline_recv; } +/* ACL likes to be lazy in allocating pages. For TCP, these + * pages can be allocated during receive processing. Not true + * for RDMA, which must always provision receive buffers + * up front. + */ +static noinline int +rpcrdma_alloc_sparse_pages(struct xdr_buf *buf) +{ + struct page **ppages; + int len; + + len = buf->page_len; + ppages = buf->pages + (buf->page_base >> PAGE_SHIFT); + while (len > 0) { + if (!*ppages) + *ppages = alloc_page(GFP_NOWAIT | __GFP_NOWARN); + if (!*ppages) + return -ENOBUFS; + ppages++; + len -= PAGE_SIZE; + } + + return 0; +} + /* Split @vec on page boundaries into SGEs. FMR registers pages, not * a byte range. Other modes coalesce these SGEs into a single MR * when they can. 
@@ -233,15 +258,6 @@ rpcrdma_convert_iovs(struct rpcrdma_xprt *r_xprt, struct xdr_buf *xdrbuf, ppages = xdrbuf->pages + (xdrbuf->page_base >> PAGE_SHIFT); page_base = offset_in_page(xdrbuf->page_base); while (len) { - /* ACL likes to be lazy in allocating pages - ACLs - * are small by default but can get huge. - */ - if (unlikely(xdrbuf->flags & XDRBUF_SPARSE_PAGES)) { - if (!*ppages) - *ppages = alloc_page(GFP_NOWAIT | __GFP_NOWARN); - if (!*ppages) - return -ENOBUFS; - } seg->mr_page = *ppages; seg->mr_offset = (char *)page_base; seg->mr_len = min_t(u32, PAGE_SIZE - page_base, len); @@ -315,7 +331,6 @@ static struct rpcrdma_mr_seg *rpcrdma_mr_prepare(struct rpcrdma_xprt *r_xprt, *mr = rpcrdma_mr_get(r_xprt); if (!*mr) goto out_getmr_err; - trace_xprtrdma_mr_get(req); (*mr)->mr_req = req; } @@ -323,7 +338,7 @@ static struct rpcrdma_mr_seg *rpcrdma_mr_prepare(struct rpcrdma_xprt *r_xprt, return frwr_map(r_xprt, seg, nsegs, writing, req->rl_slot.rq_xid, *mr); out_getmr_err: - trace_xprtrdma_nomrs(req); + trace_xprtrdma_nomrs_err(r_xprt, req); xprt_wait_for_buffer_space(&r_xprt->rx_xprt); rpcrdma_mrs_refresh(r_xprt); return ERR_PTR(-EAGAIN); @@ -867,6 +882,12 @@ rpcrdma_marshal_req(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst) __be32 *p; int ret; + if (unlikely(rqst->rq_rcv_buf.flags & XDRBUF_SPARSE_PAGES)) { + ret = rpcrdma_alloc_sparse_pages(&rqst->rq_rcv_buf); + if (ret) + return ret; + } + rpcrdma_set_xdrlen(&req->rl_hdrbuf, 0); xdr_init_encode(xdr, &req->rl_hdrbuf, rdmab_data(req->rl_rdmabuf), rqst); @@ -1322,20 +1343,13 @@ rpcrdma_decode_error(struct rpcrdma_xprt *r_xprt, struct rpcrdma_rep *rep, p = xdr_inline_decode(xdr, 2 * sizeof(*p)); if (!p) break; - dprintk("RPC: %s: server reports " - "version error (%u-%u), xid %08x\n", __func__, - be32_to_cpup(p), be32_to_cpu(*(p + 1)), - be32_to_cpu(rep->rr_xid)); + trace_xprtrdma_err_vers(rqst, p, p + 1); break; case err_chunk: - dprintk("RPC: %s: server reports " - "header decoding error, xid %08x\n", __func__, - be32_to_cpu(rep->rr_xid)); + trace_xprtrdma_err_chunk(rqst); break; default: - dprintk("RPC: %s: server reports " - "unrecognized error %d, xid %08x\n", __func__, - be32_to_cpup(p), be32_to_cpu(rep->rr_xid)); + trace_xprtrdma_err_unrecognized(rqst, p); } return -EIO; @@ -1376,7 +1390,7 @@ out: return; out_badheader: - trace_xprtrdma_reply_hdr(rep); + trace_xprtrdma_reply_hdr_err(rep); r_xprt->rx_stats.bad_reply_count++; rqst->rq_task->tk_status = status; status = 0; @@ -1450,14 +1464,12 @@ void rpcrdma_reply_handler(struct rpcrdma_rep *rep) rpcrdma_post_recvs(r_xprt, false); req = rpcr_to_rdmar(rqst); - if (req->rl_reply) { - trace_xprtrdma_leaked_rep(rqst, req->rl_reply); + if (unlikely(req->rl_reply)) rpcrdma_recv_buffer_put(req->rl_reply); - } req->rl_reply = rep; rep->rr_rqst = rqst; - trace_xprtrdma_reply(rqst->rq_task, rep, req, credits); + trace_xprtrdma_reply(rqst->rq_task, rep, credits); if (rep->rr_wc_flags & IB_WC_WITH_INVALIDATE) frwr_reminv(rep, &req->rl_registered); @@ -1469,16 +1481,16 @@ void rpcrdma_reply_handler(struct rpcrdma_rep *rep) return; out_badversion: - trace_xprtrdma_reply_vers(rep); + trace_xprtrdma_reply_vers_err(rep); goto out; out_norqst: spin_unlock(&xprt->queue_lock); - trace_xprtrdma_reply_rqst(rep); + trace_xprtrdma_reply_rqst_err(rep); goto out; out_shortreply: - trace_xprtrdma_reply_short(rep); + trace_xprtrdma_reply_short_err(rep); out: rpcrdma_recv_buffer_put(rep); diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index 8915e42240d3..78d29d1bcc20 100644 --- 
a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -599,11 +599,12 @@ static void xprt_rdma_free(struct rpc_task *task) { struct rpc_rqst *rqst = task->tk_rqstp; - struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(rqst->rq_xprt); struct rpcrdma_req *req = rpcr_to_rdmar(rqst); - if (!list_empty(&req->rl_registered)) - frwr_unmap_sync(r_xprt, req); + if (unlikely(!list_empty(&req->rl_registered))) { + trace_xprtrdma_mrs_zap(task); + frwr_unmap_sync(rpcx_to_rdmax(rqst->rq_xprt), req); + } /* XXX: If the RPC is completing because of a signal and * not because a reply was received, we ought to ensure @@ -768,6 +769,7 @@ static struct xprt_class xprt_rdma = { .owner = THIS_MODULE, .ident = XPRT_TRANSPORT_RDMA, .setup = xprt_setup_rdma, + .netid = { "rdma", "rdma6", "" }, }; void xprt_rdma_cleanup(void) diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index ad6e2e4994ce..ec912cf9c618 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -167,7 +167,7 @@ static void rpcrdma_wc_send(struct ib_cq *cq, struct ib_wc *wc) struct rpcrdma_xprt *r_xprt = cq->cq_context; /* WARNING: Only wr_cqe and status are reliable at this point */ - trace_xprtrdma_wc_send(sc, wc); + trace_xprtrdma_wc_send(wc, &sc->sc_cid); rpcrdma_sendctx_put_locked(r_xprt, sc); rpcrdma_flush_disconnect(r_xprt, wc); } @@ -186,7 +186,7 @@ static void rpcrdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc) struct rpcrdma_xprt *r_xprt = cq->cq_context; /* WARNING: Only wr_cqe and status are reliable at this point */ - trace_xprtrdma_wc_receive(wc); + trace_xprtrdma_wc_receive(wc, &rep->rr_cid); --r_xprt->rx_ep->re_receive_count; if (wc->status != IB_WC_SUCCESS) goto out_flushed; @@ -643,6 +643,9 @@ static struct rpcrdma_sendctx *rpcrdma_sendctx_create(struct rpcrdma_ep *ep) return NULL; sc->sc_cqe.done = rpcrdma_wc_send; + sc->sc_cid.ci_queue_id = ep->re_attr.send_cq->res.id; + sc->sc_cid.ci_completion_id = + atomic_inc_return(&ep->re_completion_ids); return sc; } @@ -972,6 +975,9 @@ struct rpcrdma_rep *rpcrdma_rep_create(struct rpcrdma_xprt *r_xprt, if (!rpcrdma_regbuf_dma_map(r_xprt, rep->rr_rdmabuf)) goto out_free_regbuf; + rep->rr_cid.ci_completion_id = + atomic_inc_return(&r_xprt->rx_ep->re_completion_ids); + xdr_buf_init(&rep->rr_hdrbuf, rdmab_data(rep->rr_rdmabuf), rdmab_length(rep->rr_rdmabuf)); rep->rr_cqe.done = rpcrdma_wc_receive; @@ -1179,25 +1185,6 @@ rpcrdma_mr_get(struct rpcrdma_xprt *r_xprt) } /** - * rpcrdma_mr_put - DMA unmap an MR and release it - * @mr: MR to release - * - */ -void rpcrdma_mr_put(struct rpcrdma_mr *mr) -{ - struct rpcrdma_xprt *r_xprt = mr->mr_xprt; - - if (mr->mr_dir != DMA_NONE) { - trace_xprtrdma_mr_unmap(mr); - ib_dma_unmap_sg(r_xprt->rx_ep->re_id->device, - mr->mr_sg, mr->mr_nents, mr->mr_dir); - mr->mr_dir = DMA_NONE; - } - - rpcrdma_mr_push(mr, &mr->mr_req->rl_free_mrs); -} - -/** * rpcrdma_buffer_get - Get a request buffer * @buffers: Buffer pool from which to obtain a buffer * @@ -1411,6 +1398,7 @@ void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, bool temp) if (!rep) break; + rep->rr_cid.ci_queue_id = ep->re_attr.recv_cq->res.id; trace_xprtrdma_post_recv(rep); rep->rr_recv_wr.next = wr; wr = &rep->rr_recv_wr; diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 43974ef39a50..94b28657aeeb 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -53,6 +53,7 @@ #include <rdma/ib_verbs.h> /* RDMA verbs api */ #include <linux/sunrpc/clnt.h> /* rpc_xprt */ +#include 
<linux/sunrpc/rpc_rdma_cid.h> /* completion IDs */ #include <linux/sunrpc/rpc_rdma.h> /* RPC/RDMA protocol */ #include <linux/sunrpc/xprtrdma.h> /* xprt parameters */ @@ -93,6 +94,8 @@ struct rpcrdma_ep { unsigned int re_max_requests; /* depends on device */ unsigned int re_inline_send; /* negotiated */ unsigned int re_inline_recv; /* negotiated */ + + atomic_t re_completion_ids; }; /* Pre-allocate extra Work Requests for handling backward receives @@ -180,6 +183,8 @@ enum { struct rpcrdma_rep { struct ib_cqe rr_cqe; + struct rpc_rdma_cid rr_cid; + __be32 rr_xid; __be32 rr_vers; __be32 rr_proc; @@ -211,6 +216,7 @@ enum { struct rpcrdma_req; struct rpcrdma_sendctx { struct ib_cqe sc_cqe; + struct rpc_rdma_cid sc_cid; struct rpcrdma_req *sc_req; unsigned int sc_unmap_count; struct ib_sge sc_sges[]; @@ -225,6 +231,7 @@ struct rpcrdma_sendctx { struct rpcrdma_frwr { struct ib_mr *fr_mr; struct ib_cqe fr_cqe; + struct rpc_rdma_cid fr_cid; struct completion fr_linv_done; union { struct ib_reg_wr fr_regwr; @@ -236,6 +243,7 @@ struct rpcrdma_req; struct rpcrdma_mr { struct list_head mr_list; struct rpcrdma_req *mr_req; + struct ib_device *mr_device; struct scatterlist *mr_sg; int mr_nents; enum dma_data_direction mr_dir; @@ -466,7 +474,6 @@ void rpcrdma_buffer_destroy(struct rpcrdma_buffer *); struct rpcrdma_sendctx *rpcrdma_sendctx_get_locked(struct rpcrdma_xprt *r_xprt); struct rpcrdma_mr *rpcrdma_mr_get(struct rpcrdma_xprt *r_xprt); -void rpcrdma_mr_put(struct rpcrdma_mr *mr); void rpcrdma_mrs_refresh(struct rpcrdma_xprt *r_xprt); struct rpcrdma_req *rpcrdma_buffer_get(struct rpcrdma_buffer *); diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 7090bbee0ec5..c56a66cdf4ac 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -433,7 +433,8 @@ xs_read_xdr_buf(struct socket *sock, struct msghdr *msg, int flags, if (ret <= 0) goto sock_err; xs_flush_bvec(buf->bvec, ret, seek + buf->page_base); - offset += ret - buf->page_base; + ret -= buf->page_base; + offset += ret; if (offset == count || msg->msg_flags & (MSG_EOR|MSG_TRUNC)) goto out; if (ret != want) @@ -3059,6 +3060,7 @@ static struct xprt_class xs_local_transport = { .owner = THIS_MODULE, .ident = XPRT_TRANSPORT_LOCAL, .setup = xs_setup_local, + .netid = { "" }, }; static struct xprt_class xs_udp_transport = { @@ -3067,6 +3069,7 @@ static struct xprt_class xs_udp_transport = { .owner = THIS_MODULE, .ident = XPRT_TRANSPORT_UDP, .setup = xs_setup_udp, + .netid = { "udp", "udp6", "" }, }; static struct xprt_class xs_tcp_transport = { @@ -3075,6 +3078,7 @@ static struct xprt_class xs_tcp_transport = { .owner = THIS_MODULE, .ident = XPRT_TRANSPORT_TCP, .setup = xs_setup_tcp, + .netid = { "tcp", "tcp6", "" }, }; static struct xprt_class xs_bc_tcp_transport = { @@ -3083,6 +3087,7 @@ static struct xprt_class xs_bc_tcp_transport = { .owner = THIS_MODULE, .ident = XPRT_TRANSPORT_BC_TCP, .setup = xs_setup_bc_tcp, + .netid = { "" }, }; /** diff --git a/net/tipc/link.c b/net/tipc/link.c index 6ae2140eb4f7..115109259430 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -1030,7 +1030,6 @@ void tipc_link_reset(struct tipc_link *l) int tipc_link_xmit(struct tipc_link *l, struct sk_buff_head *list, struct sk_buff_head *xmitq) { - struct tipc_msg *hdr = buf_msg(skb_peek(list)); struct sk_buff_head *backlogq = &l->backlogq; struct sk_buff_head *transmq = &l->transmq; struct sk_buff *skb, *_skb; @@ -1038,13 +1037,18 @@ int tipc_link_xmit(struct tipc_link *l, struct sk_buff_head *list, u16 ack = l->rcv_nxt - 1; u16 seqno = l->snd_nxt; 
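	/* skb_peek() returns NULL for an empty list, so reading the
	 * message header is deferred until pkt_cnt is known to be
	 * positive.
	 */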
int pkt_cnt = skb_queue_len(list); - int imp = msg_importance(hdr); unsigned int mss = tipc_link_mss(l); unsigned int cwin = l->window; unsigned int mtu = l->mtu; + struct tipc_msg *hdr; bool new_bundle; int rc = 0; + int imp; + + if (pkt_cnt <= 0) + return 0; + hdr = buf_msg(skb_peek(list)); if (unlikely(msg_size(hdr) > mtu)) { pr_warn("Too large msg, purging xmit list %d %d %d %d %d!\n", skb_queue_len(list), msg_user(hdr), @@ -1053,6 +1057,7 @@ int tipc_link_xmit(struct tipc_link *l, struct sk_buff_head *list, return -EMSGSIZE; } + imp = msg_importance(hdr); /* Allow oversubscription of one data msg per source at congestion */ if (unlikely(l->backlog[imp].len >= l->backlog[imp].limit)) { if (imp == TIPC_SYSTEM_IMPORTANCE) { @@ -2539,7 +2544,7 @@ void tipc_link_set_queue_limits(struct tipc_link *l, u32 min_win, u32 max_win) } /** - * link_reset_stats - reset link statistics + * tipc_link_reset_stats - reset link statistics * @l: pointer to link */ void tipc_link_reset_stats(struct tipc_link *l) diff --git a/net/tipc/netlink_compat.c b/net/tipc/netlink_compat.c index 82f154989418..5a1ce64039f7 100644 --- a/net/tipc/netlink_compat.c +++ b/net/tipc/netlink_compat.c @@ -213,12 +213,14 @@ static int __tipc_nl_compat_dumpit(struct tipc_nl_compat_cmd_dump *cmd, } info.attrs = attrbuf; - err = nlmsg_parse_deprecated(cb.nlh, GENL_HDRLEN, attrbuf, - tipc_genl_family.maxattr, - tipc_genl_family.policy, NULL); - if (err) - goto err_out; + if (nlmsg_len(cb.nlh) > 0) { + err = nlmsg_parse_deprecated(cb.nlh, GENL_HDRLEN, attrbuf, + tipc_genl_family.maxattr, + tipc_genl_family.policy, NULL); + if (err) + goto err_out; + } do { int rem; diff --git a/net/tipc/node.c b/net/tipc/node.c index 83d9eb830592..008670d1f43e 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -1665,7 +1665,7 @@ static void tipc_lxc_xmit(struct net *peer_net, struct sk_buff_head *list) } /** - * tipc_node_xmit() is the general link level function for message sending + * tipc_node_xmit() - general link level function for message sending * @net: the applicable net namespace * @list: chain of buffers containing message * @dnode: address of destination node diff --git a/net/wireless/Kconfig b/net/wireless/Kconfig index 27026f587fa6..f620acd2a0f5 100644 --- a/net/wireless/Kconfig +++ b/net/wireless/Kconfig @@ -21,6 +21,7 @@ config CFG80211 tristate "cfg80211 - wireless configuration API" depends on RFKILL || !RFKILL select FW_LOADER + select CRC32 # may need to update this when certificates are changed and are # using a different algorithm, though right now they shouldn't # (this is here rather than below to allow it to be a module) diff --git a/net/wireless/reg.c b/net/wireless/reg.c index bb72447ad960..8114bba8556c 100644 --- a/net/wireless/reg.c +++ b/net/wireless/reg.c @@ -5,7 +5,7 @@ * Copyright 2008-2011 Luis R. Rodriguez <mcgrof@qca.qualcomm.com> * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright 2017 Intel Deutschland GmbH - * Copyright (C) 2018 - 2019 Intel Corporation + * Copyright (C) 2018 - 2021 Intel Corporation * * Permission to use, copy, modify, and/or distribute this software for any * purpose with or without fee is hereby granted, provided that the above @@ -139,6 +139,11 @@ static const struct ieee80211_regdomain *get_cfg80211_regdom(void) return rcu_dereference_rtnl(cfg80211_regdomain); } +/* + * Returns the regulatory domain associated with the wiphy. 
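+ * (That is, wiphy->regd, as opposed to the global cfg80211_regdomain.)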
+ * + * Requires either RTNL or RCU protection + */ const struct ieee80211_regdomain *get_wiphy_regdom(struct wiphy *wiphy) { return rcu_dereference_rtnl(wiphy->regd); @@ -2571,9 +2576,13 @@ void wiphy_apply_custom_regulatory(struct wiphy *wiphy, if (IS_ERR(new_regd)) return; + rtnl_lock(); + tmp = get_wiphy_regdom(wiphy); rcu_assign_pointer(wiphy->regd, new_regd); rcu_free_regdom(tmp); + + rtnl_unlock(); } EXPORT_SYMBOL(wiphy_apply_custom_regulatory); diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index ac4a317038f1..4a83117507f5 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -108,9 +108,9 @@ EXPORT_SYMBOL(xsk_get_pool_from_qid); void xsk_clear_pool_at_qid(struct net_device *dev, u16 queue_id) { - if (queue_id < dev->real_num_rx_queues) + if (queue_id < dev->num_rx_queues) dev->_rx[queue_id].pool = NULL; - if (queue_id < dev->real_num_tx_queues) + if (queue_id < dev->num_tx_queues) dev->_tx[queue_id].pool = NULL; } @@ -423,9 +423,9 @@ static void xsk_destruct_skb(struct sk_buff *skb) struct xdp_sock *xs = xdp_sk(skb->sk); unsigned long flags; - spin_lock_irqsave(&xs->tx_completion_lock, flags); + spin_lock_irqsave(&xs->pool->cq_lock, flags); xskq_prod_submit_addr(xs->pool->cq, addr); - spin_unlock_irqrestore(&xs->tx_completion_lock, flags); + spin_unlock_irqrestore(&xs->pool->cq_lock, flags); sock_wfree(skb); } @@ -437,6 +437,7 @@ static int xsk_generic_xmit(struct sock *sk) bool sent_frame = false; struct xdp_desc desc; struct sk_buff *skb; + unsigned long flags; int err = 0; mutex_lock(&xs->mutex); @@ -468,10 +469,13 @@ static int xsk_generic_xmit(struct sock *sk) * if there is space in it. This avoids having to implement * any buffering in the Tx path. */ + spin_lock_irqsave(&xs->pool->cq_lock, flags); if (unlikely(err) || xskq_prod_reserve(xs->pool->cq)) { + spin_unlock_irqrestore(&xs->pool->cq_lock, flags); kfree_skb(skb); goto out; } + spin_unlock_irqrestore(&xs->pool->cq_lock, flags); skb->dev = xs->dev; skb->priority = sk->sk_priority; @@ -483,6 +487,9 @@ static int xsk_generic_xmit(struct sock *sk) if (err == NETDEV_TX_BUSY) { /* Tell user-space to retry the send */ skb->destructor = sock_wfree; + spin_lock_irqsave(&xs->pool->cq_lock, flags); + xskq_prod_cancel(xs->pool->cq); + spin_unlock_irqrestore(&xs->pool->cq_lock, flags); /* Free skb without triggering the perf drop trace */ consume_skb(skb); err = -EAGAIN; @@ -878,6 +885,10 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len) } } + /* FQ and CQ are now owned by the buffer pool and cleaned up with it. 
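+	 * Clearing the temporary pointers here keeps the socket teardown
+	 * path from destroying rings that the pool will free itself.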
*/ + xs->fq_tmp = NULL; + xs->cq_tmp = NULL; + xs->dev = dev; xs->zc = xs->umem->zc; xs->queue_id = qid; @@ -1299,7 +1310,6 @@ static int xsk_create(struct net *net, struct socket *sock, int protocol, xs->state = XSK_READY; mutex_init(&xs->mutex); spin_lock_init(&xs->rx_lock); - spin_lock_init(&xs->tx_completion_lock); INIT_LIST_HEAD(&xs->map_list); spin_lock_init(&xs->map_list_lock); diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c index 67a4494d63b6..20598eea658c 100644 --- a/net/xdp/xsk_buff_pool.c +++ b/net/xdp/xsk_buff_pool.c @@ -71,12 +71,11 @@ struct xsk_buff_pool *xp_create_and_assign_umem(struct xdp_sock *xs, INIT_LIST_HEAD(&pool->free_list); INIT_LIST_HEAD(&pool->xsk_tx_list); spin_lock_init(&pool->xsk_tx_list_lock); + spin_lock_init(&pool->cq_lock); refcount_set(&pool->users, 1); pool->fq = xs->fq_tmp; pool->cq = xs->cq_tmp; - xs->fq_tmp = NULL; - xs->cq_tmp = NULL; for (i = 0; i < pool->free_heads_cnt; i++) { xskb = &pool->heads[i]; diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h index 4a9663aa7afe..2823b7c3302d 100644 --- a/net/xdp/xsk_queue.h +++ b/net/xdp/xsk_queue.h @@ -334,6 +334,11 @@ static inline bool xskq_prod_is_full(struct xsk_queue *q) return xskq_prod_nb_free(q, 1) ? false : true; } +static inline void xskq_prod_cancel(struct xsk_queue *q) +{ + q->cached_prod--; +} + static inline int xskq_prod_reserve(struct xsk_queue *q) { if (xskq_prod_is_full(q)) |