Diffstat (limited to 'net')
-rw-r--r--  net/6lowpan/Kconfig  2
-rw-r--r--  net/8021q/vlan_core.c  53
-rw-r--r--  net/atm/lec.c  7
-rw-r--r--  net/atm/svc.c  60
-rw-r--r--  net/batman-adv/fragmentation.c  2
-rw-r--r--  net/batman-adv/multicast.c  1
-rw-r--r--  net/bridge/br_vlan.c  2
-rw-r--r--  net/bridge/netfilter/ebtables.c  10
-rw-r--r--  net/ceph/messenger.c  47
-rw-r--r--  net/ceph/osd_client.c  129
-rw-r--r--  net/core/dev.c  2
-rw-r--r--  net/core/net_namespace.c  10
-rw-r--r--  net/core/rtnetlink.c  3
-rw-r--r--  net/core/skbuff.c  53
-rw-r--r--  net/ipv4/route.c  2
-rw-r--r--  net/ipv4/tcp.c  14
-rw-r--r--  net/ipv4/tcp_input.c  27
-rw-r--r--  net/ipv4/tcp_ipv4.c  5
-rw-r--r--  net/ipv4/tcp_metrics.c  6
-rw-r--r--  net/ipv4/tcp_output.c  24
-rw-r--r--  net/ipv6/ip6_fib.c  2
-rw-r--r--  net/ipv6/sit.c  6
-rw-r--r--  net/ipv6/tcp_ipv6.c  3
-rw-r--r--  net/irda/irlap_frame.c  2
-rw-r--r--  net/mac80211/chan.c  2
-rw-r--r--  net/netfilter/core.c  11
-rw-r--r--  net/netfilter/ipvs/ip_vs_ctl.c  19
-rw-r--r--  net/netfilter/nf_sockopt.c  8
-rw-r--r--  net/netfilter/nf_tables_api.c  30
-rw-r--r--  net/netfilter/x_tables.c  47
-rw-r--r--  net/netlink/af_netlink.c  4
-rw-r--r--  net/openvswitch/actions.c  5
-rw-r--r--  net/openvswitch/datapath.c  2
-rw-r--r--  net/openvswitch/vport.c  4
-rw-r--r--  net/packet/af_packet.c  17
-rw-r--r--  net/packet/internal.h  1
-rw-r--r--  net/sched/sch_cbq.c  48
-rw-r--r--  net/sctp/associola.c  12
-rw-r--r--  net/sunrpc/addr.c  16
-rw-r--r--  net/sunrpc/auth.c  68
-rw-r--r--  net/sunrpc/auth_generic.c  6
-rw-r--r--  net/sunrpc/auth_gss/auth_gss.c  126
-rw-r--r--  net/sunrpc/auth_gss/gss_krb5_crypto.c  9
-rw-r--r--  net/sunrpc/auth_gss/gss_krb5_seal.c  28
-rw-r--r--  net/sunrpc/auth_gss/gss_krb5_wrap.c  20
-rw-r--r--  net/sunrpc/auth_gss/svcauth_gss.c  2
-rw-r--r--  net/sunrpc/auth_null.c  2
-rw-r--r--  net/sunrpc/clnt.c  5
-rw-r--r--  net/sunrpc/rpc_pipe.c  2
-rw-r--r--  net/sunrpc/svc.c  4
-rw-r--r--  net/sunrpc/svc_xprt.c  27
-rw-r--r--  net/sunrpc/svcsock.c  50
-rw-r--r--  net/sunrpc/xdr.c  3
-rw-r--r--  net/sunrpc/xprt.c  3
-rw-r--r--  net/sunrpc/xprtrdma/rpc_rdma.c  83
-rw-r--r--  net/sunrpc/xprtrdma/svc_rdma_recvfrom.c  28
-rw-r--r--  net/sunrpc/xprtrdma/svc_rdma_sendto.c  39
-rw-r--r--  net/sunrpc/xprtrdma/svc_rdma_transport.c  20
-rw-r--r--  net/sunrpc/xprtrdma/transport.c  17
-rw-r--r--  net/sunrpc/xprtrdma/verbs.c  739
-rw-r--r--  net/sunrpc/xprtrdma/xprt_rdma.h  61
-rw-r--r--  net/sunrpc/xprtsock.c  9
-rw-r--r--  net/tipc/port.h  5
-rw-r--r--  net/tipc/socket.c  2
64 files changed, 1283 insertions, 773 deletions
diff --git a/net/6lowpan/Kconfig b/net/6lowpan/Kconfig
index 028a5c6d1f61..e4a02ef55102 100644
--- a/net/6lowpan/Kconfig
+++ b/net/6lowpan/Kconfig
@@ -1,5 +1,5 @@
config 6LOWPAN
- bool "6LoWPAN Support"
+ tristate "6LoWPAN Support"
depends on IPV6
---help---
This enables IPv6 over Low power Wireless Personal Area Network -
diff --git a/net/8021q/vlan_core.c b/net/8021q/vlan_core.c
index 75d427763992..90cc2bdd4064 100644
--- a/net/8021q/vlan_core.c
+++ b/net/8021q/vlan_core.c
@@ -112,59 +112,6 @@ __be16 vlan_dev_vlan_proto(const struct net_device *dev)
}
EXPORT_SYMBOL(vlan_dev_vlan_proto);
-static struct sk_buff *vlan_reorder_header(struct sk_buff *skb)
-{
- if (skb_cow(skb, skb_headroom(skb)) < 0) {
- kfree_skb(skb);
- return NULL;
- }
-
- memmove(skb->data - ETH_HLEN, skb->data - VLAN_ETH_HLEN, 2 * ETH_ALEN);
- skb->mac_header += VLAN_HLEN;
- return skb;
-}
-
-struct sk_buff *vlan_untag(struct sk_buff *skb)
-{
- struct vlan_hdr *vhdr;
- u16 vlan_tci;
-
- if (unlikely(vlan_tx_tag_present(skb))) {
- /* vlan_tci is already set-up so leave this for another time */
- return skb;
- }
-
- skb = skb_share_check(skb, GFP_ATOMIC);
- if (unlikely(!skb))
- goto err_free;
-
- if (unlikely(!pskb_may_pull(skb, VLAN_HLEN)))
- goto err_free;
-
- vhdr = (struct vlan_hdr *) skb->data;
- vlan_tci = ntohs(vhdr->h_vlan_TCI);
- __vlan_hwaccel_put_tag(skb, skb->protocol, vlan_tci);
-
- skb_pull_rcsum(skb, VLAN_HLEN);
- vlan_set_encap_proto(skb, vhdr);
-
- skb = vlan_reorder_header(skb);
- if (unlikely(!skb))
- goto err_free;
-
- skb_reset_network_header(skb);
- skb_reset_transport_header(skb);
- skb_reset_mac_len(skb);
-
- return skb;
-
-err_free:
- kfree_skb(skb);
- return NULL;
-}
-EXPORT_SYMBOL(vlan_untag);
-
-
/*
* vlan info and vid list
*/
diff --git a/net/atm/lec.c b/net/atm/lec.c
index 4c5b8ba0f84f..4b98f897044a 100644
--- a/net/atm/lec.c
+++ b/net/atm/lec.c
@@ -410,9 +410,11 @@ static int lec_atm_send(struct atm_vcc *vcc, struct sk_buff *skb)
priv->lane2_ops = NULL;
if (priv->lane_version > 1)
priv->lane2_ops = &lane2_ops;
+ rtnl_lock();
if (dev_set_mtu(dev, mesg->content.config.mtu))
pr_info("%s: change_mtu to %d failed\n",
dev->name, mesg->content.config.mtu);
+ rtnl_unlock();
priv->is_proxy = mesg->content.config.is_proxy;
break;
case l_flush_tran_id:
@@ -833,7 +835,6 @@ static void *lec_tbl_walk(struct lec_state *state, struct hlist_head *tbl,
loff_t *l)
{
struct hlist_node *e = state->node;
- struct lec_arp_table *tmp;
if (!e)
e = tbl->first;
@@ -842,9 +843,7 @@ static void *lec_tbl_walk(struct lec_state *state, struct hlist_head *tbl,
--*l;
}
- tmp = container_of(e, struct lec_arp_table, next);
-
- hlist_for_each_entry_from(tmp, next) {
+ for (; e; e = e->next) {
if (--*l < 0)
break;
}
diff --git a/net/atm/svc.c b/net/atm/svc.c
index d8e5d0c2ebbc..1ba23f5018e7 100644
--- a/net/atm/svc.c
+++ b/net/atm/svc.c
@@ -50,12 +50,12 @@ static void svc_disconnect(struct atm_vcc *vcc)
pr_debug("%p\n", vcc);
if (test_bit(ATM_VF_REGIS, &vcc->flags)) {
- prepare_to_wait(sk_sleep(sk), &wait, TASK_UNINTERRUPTIBLE);
sigd_enq(vcc, as_close, NULL, NULL, NULL);
- while (!test_bit(ATM_VF_RELEASED, &vcc->flags) && sigd) {
+ for (;;) {
+ prepare_to_wait(sk_sleep(sk), &wait, TASK_UNINTERRUPTIBLE);
+ if (test_bit(ATM_VF_RELEASED, &vcc->flags) || !sigd)
+ break;
schedule();
- prepare_to_wait(sk_sleep(sk), &wait,
- TASK_UNINTERRUPTIBLE);
}
finish_wait(sk_sleep(sk), &wait);
}
@@ -126,11 +126,12 @@ static int svc_bind(struct socket *sock, struct sockaddr *sockaddr,
}
vcc->local = *addr;
set_bit(ATM_VF_WAITING, &vcc->flags);
- prepare_to_wait(sk_sleep(sk), &wait, TASK_UNINTERRUPTIBLE);
sigd_enq(vcc, as_bind, NULL, NULL, &vcc->local);
- while (test_bit(ATM_VF_WAITING, &vcc->flags) && sigd) {
- schedule();
+ for (;;) {
prepare_to_wait(sk_sleep(sk), &wait, TASK_UNINTERRUPTIBLE);
+ if (!test_bit(ATM_VF_WAITING, &vcc->flags) || !sigd)
+ break;
+ schedule();
}
finish_wait(sk_sleep(sk), &wait);
clear_bit(ATM_VF_REGIS, &vcc->flags); /* doesn't count */
@@ -202,15 +203,14 @@ static int svc_connect(struct socket *sock, struct sockaddr *sockaddr,
}
vcc->remote = *addr;
set_bit(ATM_VF_WAITING, &vcc->flags);
- prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
sigd_enq(vcc, as_connect, NULL, NULL, &vcc->remote);
if (flags & O_NONBLOCK) {
- finish_wait(sk_sleep(sk), &wait);
sock->state = SS_CONNECTING;
error = -EINPROGRESS;
goto out;
}
error = 0;
+ prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
while (test_bit(ATM_VF_WAITING, &vcc->flags) && sigd) {
schedule();
if (!signal_pending(current)) {
@@ -297,11 +297,12 @@ static int svc_listen(struct socket *sock, int backlog)
goto out;
}
set_bit(ATM_VF_WAITING, &vcc->flags);
- prepare_to_wait(sk_sleep(sk), &wait, TASK_UNINTERRUPTIBLE);
sigd_enq(vcc, as_listen, NULL, NULL, &vcc->local);
- while (test_bit(ATM_VF_WAITING, &vcc->flags) && sigd) {
- schedule();
+ for (;;) {
prepare_to_wait(sk_sleep(sk), &wait, TASK_UNINTERRUPTIBLE);
+ if (!test_bit(ATM_VF_WAITING, &vcc->flags) || !sigd)
+ break;
+ schedule();
}
finish_wait(sk_sleep(sk), &wait);
if (!sigd) {
@@ -387,15 +388,15 @@ static int svc_accept(struct socket *sock, struct socket *newsock, int flags)
}
/* wait should be short, so we ignore the non-blocking flag */
set_bit(ATM_VF_WAITING, &new_vcc->flags);
- prepare_to_wait(sk_sleep(sk_atm(new_vcc)), &wait,
- TASK_UNINTERRUPTIBLE);
sigd_enq(new_vcc, as_accept, old_vcc, NULL, NULL);
- while (test_bit(ATM_VF_WAITING, &new_vcc->flags) && sigd) {
+ for (;;) {
+ prepare_to_wait(sk_sleep(sk_atm(new_vcc)), &wait,
+ TASK_UNINTERRUPTIBLE);
+ if (!test_bit(ATM_VF_WAITING, &new_vcc->flags) || !sigd)
+ break;
release_sock(sk);
schedule();
lock_sock(sk);
- prepare_to_wait(sk_sleep(sk_atm(new_vcc)), &wait,
- TASK_UNINTERRUPTIBLE);
}
finish_wait(sk_sleep(sk_atm(new_vcc)), &wait);
if (!sigd) {
@@ -433,12 +434,14 @@ int svc_change_qos(struct atm_vcc *vcc, struct atm_qos *qos)
DEFINE_WAIT(wait);
set_bit(ATM_VF_WAITING, &vcc->flags);
- prepare_to_wait(sk_sleep(sk), &wait, TASK_UNINTERRUPTIBLE);
sigd_enq2(vcc, as_modify, NULL, NULL, &vcc->local, qos, 0);
- while (test_bit(ATM_VF_WAITING, &vcc->flags) &&
- !test_bit(ATM_VF_RELEASED, &vcc->flags) && sigd) {
- schedule();
+ for (;;) {
prepare_to_wait(sk_sleep(sk), &wait, TASK_UNINTERRUPTIBLE);
+ if (!test_bit(ATM_VF_WAITING, &vcc->flags) ||
+ test_bit(ATM_VF_RELEASED, &vcc->flags) || !sigd) {
+ break;
+ }
+ schedule();
}
finish_wait(sk_sleep(sk), &wait);
if (!sigd)
@@ -529,18 +532,18 @@ static int svc_addparty(struct socket *sock, struct sockaddr *sockaddr,
lock_sock(sk);
set_bit(ATM_VF_WAITING, &vcc->flags);
- prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
sigd_enq(vcc, as_addparty, NULL, NULL,
(struct sockaddr_atmsvc *) sockaddr);
if (flags & O_NONBLOCK) {
- finish_wait(sk_sleep(sk), &wait);
error = -EINPROGRESS;
goto out;
}
pr_debug("added wait queue\n");
- while (test_bit(ATM_VF_WAITING, &vcc->flags) && sigd) {
- schedule();
+ for (;;) {
prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
+ if (!test_bit(ATM_VF_WAITING, &vcc->flags) || !sigd)
+ break;
+ schedule();
}
finish_wait(sk_sleep(sk), &wait);
error = xchg(&sk->sk_err_soft, 0);
@@ -558,11 +561,12 @@ static int svc_dropparty(struct socket *sock, int ep_ref)
lock_sock(sk);
set_bit(ATM_VF_WAITING, &vcc->flags);
- prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
sigd_enq2(vcc, as_dropparty, NULL, NULL, NULL, NULL, ep_ref);
- while (test_bit(ATM_VF_WAITING, &vcc->flags) && sigd) {
- schedule();
+ for (;;) {
prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
+ if (!test_bit(ATM_VF_WAITING, &vcc->flags) || !sigd)
+ break;
+ schedule();
}
finish_wait(sk_sleep(sk), &wait);
if (!sigd) {
diff --git a/net/batman-adv/fragmentation.c b/net/batman-adv/fragmentation.c
index 52c43f904220..fc1835c6bb40 100644
--- a/net/batman-adv/fragmentation.c
+++ b/net/batman-adv/fragmentation.c
@@ -188,7 +188,7 @@ static bool batadv_frag_insert_packet(struct batadv_orig_node *orig_node,
/* Reached the end of the list, so insert after 'frag_entry_last'. */
if (likely(frag_entry_last)) {
- hlist_add_behind(&frag_entry_last->list, &frag_entry_new->list);
+ hlist_add_behind(&frag_entry_new->list, &frag_entry_last->list);
chain->size += skb->len - hdr_size;
chain->timestamp = jiffies;
ret = true;
diff --git a/net/batman-adv/multicast.c b/net/batman-adv/multicast.c
index 96b66fd30f96..ab6bb2af1d45 100644
--- a/net/batman-adv/multicast.c
+++ b/net/batman-adv/multicast.c
@@ -20,7 +20,6 @@
#include "originator.h"
#include "hard-interface.h"
#include "translation-table.h"
-#include "multicast.h"
/**
* batadv_mcast_mla_softif_get - get softif multicast listeners
diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c
index febb0f87fa37..e1bcd653899b 100644
--- a/net/bridge/br_vlan.c
+++ b/net/bridge/br_vlan.c
@@ -181,7 +181,7 @@ bool br_allowed_ingress(struct net_bridge *br, struct net_port_vlans *v,
*/
if (unlikely(!vlan_tx_tag_present(skb) &&
skb->protocol == proto)) {
- skb = vlan_untag(skb);
+ skb = skb_vlan_untag(skb);
if (unlikely(!skb))
return false;
}
diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c
index 1059ed3bc255..6d69631b9f4d 100644
--- a/net/bridge/netfilter/ebtables.c
+++ b/net/bridge/netfilter/ebtables.c
@@ -327,10 +327,7 @@ find_inlist_lock_noload(struct list_head *head, const char *name, int *error,
char name[EBT_FUNCTION_MAXNAMELEN];
} *e;
- *error = mutex_lock_interruptible(mutex);
- if (*error != 0)
- return NULL;
-
+ mutex_lock(mutex);
list_for_each_entry(e, head, list) {
if (strcmp(e->name, name) == 0)
return e;
@@ -1203,10 +1200,7 @@ ebt_register_table(struct net *net, const struct ebt_table *input_table)
table->private = newinfo;
rwlock_init(&table->lock);
- ret = mutex_lock_interruptible(&ebt_mutex);
- if (ret != 0)
- goto free_chainstack;
-
+ mutex_lock(&ebt_mutex);
list_for_each_entry(t, &net->xt.tables[NFPROTO_BRIDGE], list) {
if (strcmp(t->name, table->name) == 0) {
ret = -EEXIST;
diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index 1948d592aa54..b2f571dd933d 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -174,6 +174,7 @@ static struct lock_class_key socket_class;
#define SKIP_BUF_SIZE 1024
static void queue_con(struct ceph_connection *con);
+static void cancel_con(struct ceph_connection *con);
static void con_work(struct work_struct *);
static void con_fault(struct ceph_connection *con);
@@ -680,7 +681,7 @@ void ceph_con_close(struct ceph_connection *con)
reset_connection(con);
con->peer_global_seq = 0;
- cancel_delayed_work(&con->work);
+ cancel_con(con);
con_close_socket(con);
mutex_unlock(&con->mutex);
}
@@ -900,7 +901,7 @@ static void ceph_msg_data_pages_cursor_init(struct ceph_msg_data_cursor *cursor,
BUG_ON(page_count > (int)USHRT_MAX);
cursor->page_count = (unsigned short)page_count;
BUG_ON(length > SIZE_MAX - cursor->page_offset);
- cursor->last_piece = (size_t)cursor->page_offset + length <= PAGE_SIZE;
+ cursor->last_piece = cursor->page_offset + cursor->resid <= PAGE_SIZE;
}
static struct page *
@@ -2667,19 +2668,16 @@ static int queue_con_delay(struct ceph_connection *con, unsigned long delay)
{
if (!con->ops->get(con)) {
dout("%s %p ref count 0\n", __func__, con);
-
return -ENOENT;
}
if (!queue_delayed_work(ceph_msgr_wq, &con->work, delay)) {
dout("%s %p - already queued\n", __func__, con);
con->ops->put(con);
-
return -EBUSY;
}
dout("%s %p %lu\n", __func__, con, delay);
-
return 0;
}
@@ -2688,6 +2686,14 @@ static void queue_con(struct ceph_connection *con)
(void) queue_con_delay(con, 0);
}
+static void cancel_con(struct ceph_connection *con)
+{
+ if (cancel_delayed_work(&con->work)) {
+ dout("%s %p\n", __func__, con);
+ con->ops->put(con);
+ }
+}
+
static bool con_sock_closed(struct ceph_connection *con)
{
if (!con_flag_test_and_clear(con, CON_FLAG_SOCK_CLOSED))
@@ -3269,24 +3275,21 @@ static int ceph_con_in_msg_alloc(struct ceph_connection *con, int *skip)
/*
* Free a generically kmalloc'd message.
*/
-void ceph_msg_kfree(struct ceph_msg *m)
+static void ceph_msg_free(struct ceph_msg *m)
{
- dout("msg_kfree %p\n", m);
+ dout("%s %p\n", __func__, m);
ceph_kvfree(m->front.iov_base);
kmem_cache_free(ceph_msg_cache, m);
}
-/*
- * Drop a msg ref. Destroy as needed.
- */
-void ceph_msg_last_put(struct kref *kref)
+static void ceph_msg_release(struct kref *kref)
{
struct ceph_msg *m = container_of(kref, struct ceph_msg, kref);
LIST_HEAD(data);
struct list_head *links;
struct list_head *next;
- dout("ceph_msg_put last one on %p\n", m);
+ dout("%s %p\n", __func__, m);
WARN_ON(!list_empty(&m->list_head));
/* drop middle, data, if any */
@@ -3308,9 +3311,25 @@ void ceph_msg_last_put(struct kref *kref)
if (m->pool)
ceph_msgpool_put(m->pool, m);
else
- ceph_msg_kfree(m);
+ ceph_msg_free(m);
+}
+
+struct ceph_msg *ceph_msg_get(struct ceph_msg *msg)
+{
+ dout("%s %p (was %d)\n", __func__, msg,
+ atomic_read(&msg->kref.refcount));
+ kref_get(&msg->kref);
+ return msg;
+}
+EXPORT_SYMBOL(ceph_msg_get);
+
+void ceph_msg_put(struct ceph_msg *msg)
+{
+ dout("%s %p (was %d)\n", __func__, msg,
+ atomic_read(&msg->kref.refcount));
+ kref_put(&msg->kref, ceph_msg_release);
}
-EXPORT_SYMBOL(ceph_msg_last_put);
+EXPORT_SYMBOL(ceph_msg_put);
void ceph_msg_dump(struct ceph_msg *msg)
{
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 05be0c181695..30f6faf3584f 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -297,12 +297,21 @@ static void osd_req_op_data_release(struct ceph_osd_request *osd_req,
/*
* requests
*/
-void ceph_osdc_release_request(struct kref *kref)
+static void ceph_osdc_release_request(struct kref *kref)
{
- struct ceph_osd_request *req;
+ struct ceph_osd_request *req = container_of(kref,
+ struct ceph_osd_request, r_kref);
unsigned int which;
- req = container_of(kref, struct ceph_osd_request, r_kref);
+ dout("%s %p (r_request %p r_reply %p)\n", __func__, req,
+ req->r_request, req->r_reply);
+ WARN_ON(!RB_EMPTY_NODE(&req->r_node));
+ WARN_ON(!list_empty(&req->r_req_lru_item));
+ WARN_ON(!list_empty(&req->r_osd_item));
+ WARN_ON(!list_empty(&req->r_linger_item));
+ WARN_ON(!list_empty(&req->r_linger_osd_item));
+ WARN_ON(req->r_osd);
+
if (req->r_request)
ceph_msg_put(req->r_request);
if (req->r_reply) {
@@ -320,7 +329,22 @@ void ceph_osdc_release_request(struct kref *kref)
kmem_cache_free(ceph_osd_request_cache, req);
}
-EXPORT_SYMBOL(ceph_osdc_release_request);
+
+void ceph_osdc_get_request(struct ceph_osd_request *req)
+{
+ dout("%s %p (was %d)\n", __func__, req,
+ atomic_read(&req->r_kref.refcount));
+ kref_get(&req->r_kref);
+}
+EXPORT_SYMBOL(ceph_osdc_get_request);
+
+void ceph_osdc_put_request(struct ceph_osd_request *req)
+{
+ dout("%s %p (was %d)\n", __func__, req,
+ atomic_read(&req->r_kref.refcount));
+ kref_put(&req->r_kref, ceph_osdc_release_request);
+}
+EXPORT_SYMBOL(ceph_osdc_put_request);
struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
struct ceph_snap_context *snapc,
@@ -364,7 +388,7 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
RB_CLEAR_NODE(&req->r_node);
INIT_LIST_HEAD(&req->r_unsafe_item);
INIT_LIST_HEAD(&req->r_linger_item);
- INIT_LIST_HEAD(&req->r_linger_osd);
+ INIT_LIST_HEAD(&req->r_linger_osd_item);
INIT_LIST_HEAD(&req->r_req_lru_item);
INIT_LIST_HEAD(&req->r_osd_item);
@@ -916,7 +940,7 @@ static void __kick_osd_requests(struct ceph_osd_client *osdc,
* list at the end to keep things in tid order.
*/
list_for_each_entry_safe(req, nreq, &osd->o_linger_requests,
- r_linger_osd) {
+ r_linger_osd_item) {
/*
* reregister request prior to unregistering linger so
* that r_osd is preserved.
@@ -1008,6 +1032,8 @@ static void __remove_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
{
dout("__remove_osd %p\n", osd);
BUG_ON(!list_empty(&osd->o_requests));
+ BUG_ON(!list_empty(&osd->o_linger_requests));
+
rb_erase(&osd->o_node, &osdc->osds);
list_del_init(&osd->o_osd_lru);
ceph_con_close(&osd->o_con);
@@ -1029,12 +1055,23 @@ static void remove_all_osds(struct ceph_osd_client *osdc)
static void __move_osd_to_lru(struct ceph_osd_client *osdc,
struct ceph_osd *osd)
{
- dout("__move_osd_to_lru %p\n", osd);
+ dout("%s %p\n", __func__, osd);
BUG_ON(!list_empty(&osd->o_osd_lru));
+
list_add_tail(&osd->o_osd_lru, &osdc->osd_lru);
osd->lru_ttl = jiffies + osdc->client->options->osd_idle_ttl * HZ;
}
+static void maybe_move_osd_to_lru(struct ceph_osd_client *osdc,
+ struct ceph_osd *osd)
+{
+ dout("%s %p\n", __func__, osd);
+
+ if (list_empty(&osd->o_requests) &&
+ list_empty(&osd->o_linger_requests))
+ __move_osd_to_lru(osdc, osd);
+}
+
static void __remove_osd_from_lru(struct ceph_osd *osd)
{
dout("__remove_osd_from_lru %p\n", osd);
@@ -1175,6 +1212,7 @@ static void __unregister_request(struct ceph_osd_client *osdc,
dout("__unregister_request %p tid %lld\n", req, req->r_tid);
rb_erase(&req->r_node, &osdc->requests);
+ RB_CLEAR_NODE(&req->r_node);
osdc->num_requests--;
if (req->r_osd) {
@@ -1182,12 +1220,8 @@ static void __unregister_request(struct ceph_osd_client *osdc,
ceph_msg_revoke(req->r_request);
list_del_init(&req->r_osd_item);
- if (list_empty(&req->r_osd->o_requests) &&
- list_empty(&req->r_osd->o_linger_requests)) {
- dout("moving osd to %p lru\n", req->r_osd);
- __move_osd_to_lru(osdc, req->r_osd);
- }
- if (list_empty(&req->r_linger_item))
+ maybe_move_osd_to_lru(osdc, req->r_osd);
+ if (list_empty(&req->r_linger_osd_item))
req->r_osd = NULL;
}
@@ -1214,45 +1248,39 @@ static void __cancel_request(struct ceph_osd_request *req)
static void __register_linger_request(struct ceph_osd_client *osdc,
struct ceph_osd_request *req)
{
- dout("__register_linger_request %p\n", req);
+ dout("%s %p tid %llu\n", __func__, req, req->r_tid);
+ WARN_ON(!req->r_linger);
+
ceph_osdc_get_request(req);
list_add_tail(&req->r_linger_item, &osdc->req_linger);
if (req->r_osd)
- list_add_tail(&req->r_linger_osd,
+ list_add_tail(&req->r_linger_osd_item,
&req->r_osd->o_linger_requests);
}
static void __unregister_linger_request(struct ceph_osd_client *osdc,
struct ceph_osd_request *req)
{
- dout("__unregister_linger_request %p\n", req);
+ WARN_ON(!req->r_linger);
+
+ if (list_empty(&req->r_linger_item)) {
+ dout("%s %p tid %llu not registered\n", __func__, req,
+ req->r_tid);
+ return;
+ }
+
+ dout("%s %p tid %llu\n", __func__, req, req->r_tid);
list_del_init(&req->r_linger_item);
- if (req->r_osd) {
- list_del_init(&req->r_linger_osd);
- if (list_empty(&req->r_osd->o_requests) &&
- list_empty(&req->r_osd->o_linger_requests)) {
- dout("moving osd to %p lru\n", req->r_osd);
- __move_osd_to_lru(osdc, req->r_osd);
- }
+ if (req->r_osd) {
+ list_del_init(&req->r_linger_osd_item);
+ maybe_move_osd_to_lru(osdc, req->r_osd);
if (list_empty(&req->r_osd_item))
req->r_osd = NULL;
}
ceph_osdc_put_request(req);
}
-void ceph_osdc_unregister_linger_request(struct ceph_osd_client *osdc,
- struct ceph_osd_request *req)
-{
- mutex_lock(&osdc->request_mutex);
- if (req->r_linger) {
- req->r_linger = 0;
- __unregister_linger_request(osdc, req);
- }
- mutex_unlock(&osdc->request_mutex);
-}
-EXPORT_SYMBOL(ceph_osdc_unregister_linger_request);
-
void ceph_osdc_set_request_linger(struct ceph_osd_client *osdc,
struct ceph_osd_request *req)
{
@@ -2430,6 +2458,25 @@ int ceph_osdc_start_request(struct ceph_osd_client *osdc,
EXPORT_SYMBOL(ceph_osdc_start_request);
/*
+ * Unregister a registered request. The request is not completed (i.e.
+ * no callbacks or wakeups) - higher layers are supposed to know what
+ * they are canceling.
+ */
+void ceph_osdc_cancel_request(struct ceph_osd_request *req)
+{
+ struct ceph_osd_client *osdc = req->r_osdc;
+
+ mutex_lock(&osdc->request_mutex);
+ if (req->r_linger)
+ __unregister_linger_request(osdc, req);
+ __unregister_request(osdc, req);
+ mutex_unlock(&osdc->request_mutex);
+
+ dout("%s %p tid %llu canceled\n", __func__, req, req->r_tid);
+}
+EXPORT_SYMBOL(ceph_osdc_cancel_request);
+
+/*
* wait for a request to complete
*/
int ceph_osdc_wait_request(struct ceph_osd_client *osdc,
@@ -2437,18 +2484,18 @@ int ceph_osdc_wait_request(struct ceph_osd_client *osdc,
{
int rc;
+ dout("%s %p tid %llu\n", __func__, req, req->r_tid);
+
rc = wait_for_completion_interruptible(&req->r_completion);
if (rc < 0) {
- mutex_lock(&osdc->request_mutex);
- __cancel_request(req);
- __unregister_request(osdc, req);
- mutex_unlock(&osdc->request_mutex);
+ dout("%s %p tid %llu interrupted\n", __func__, req, req->r_tid);
+ ceph_osdc_cancel_request(req);
complete_request(req);
- dout("wait_request tid %llu canceled/timed out\n", req->r_tid);
return rc;
}
- dout("wait_request tid %llu result %d\n", req->r_tid, req->r_result);
+ dout("%s %p tid %llu result %d\n", __func__, req, req->r_tid,
+ req->r_result);
return req->r_result;
}
EXPORT_SYMBOL(ceph_osdc_wait_request);
diff --git a/net/core/dev.c b/net/core/dev.c
index 1c15b189c52b..b65a5051361f 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3602,7 +3602,7 @@ another_round:
if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
- skb = vlan_untag(skb);
+ skb = skb_vlan_untag(skb);
if (unlikely(!skb))
goto unlock;
}
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index 85b62691f4f2..7c6b51a58968 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -373,9 +373,11 @@ struct net *get_net_ns_by_pid(pid_t pid)
tsk = find_task_by_vpid(pid);
if (tsk) {
struct nsproxy *nsproxy;
- nsproxy = task_nsproxy(tsk);
+ task_lock(tsk);
+ nsproxy = tsk->nsproxy;
if (nsproxy)
net = get_net(nsproxy->net_ns);
+ task_unlock(tsk);
}
rcu_read_unlock();
return net;
@@ -632,11 +634,11 @@ static void *netns_get(struct task_struct *task)
struct net *net = NULL;
struct nsproxy *nsproxy;
- rcu_read_lock();
- nsproxy = task_nsproxy(task);
+ task_lock(task);
+ nsproxy = task->nsproxy;
if (nsproxy)
net = get_net(nsproxy->net_ns);
- rcu_read_unlock();
+ task_unlock(task);
return net;
}
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 8d39071f32d7..f0493e3b7471 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -804,7 +804,8 @@ static inline int rtnl_vfinfo_size(const struct net_device *dev,
(nla_total_size(sizeof(struct ifla_vf_mac)) +
nla_total_size(sizeof(struct ifla_vf_vlan)) +
nla_total_size(sizeof(struct ifla_vf_spoofchk)) +
- nla_total_size(sizeof(struct ifla_vf_rate)));
+ nla_total_size(sizeof(struct ifla_vf_rate)) +
+ nla_total_size(sizeof(struct ifla_vf_link_state)));
return size;
} else
return 0;
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 224506a6fa80..163b673f9e62 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -62,6 +62,7 @@
#include <linux/scatterlist.h>
#include <linux/errqueue.h>
#include <linux/prefetch.h>
+#include <linux/if_vlan.h>
#include <net/protocol.h>
#include <net/dst.h>
@@ -3973,3 +3974,55 @@ unsigned int skb_gso_transport_seglen(const struct sk_buff *skb)
return shinfo->gso_size;
}
EXPORT_SYMBOL_GPL(skb_gso_transport_seglen);
+
+static struct sk_buff *skb_reorder_vlan_header(struct sk_buff *skb)
+{
+ if (skb_cow(skb, skb_headroom(skb)) < 0) {
+ kfree_skb(skb);
+ return NULL;
+ }
+
+ memmove(skb->data - ETH_HLEN, skb->data - VLAN_ETH_HLEN, 2 * ETH_ALEN);
+ skb->mac_header += VLAN_HLEN;
+ return skb;
+}
+
+struct sk_buff *skb_vlan_untag(struct sk_buff *skb)
+{
+ struct vlan_hdr *vhdr;
+ u16 vlan_tci;
+
+ if (unlikely(vlan_tx_tag_present(skb))) {
+ /* vlan_tci is already set-up so leave this for another time */
+ return skb;
+ }
+
+ skb = skb_share_check(skb, GFP_ATOMIC);
+ if (unlikely(!skb))
+ goto err_free;
+
+ if (unlikely(!pskb_may_pull(skb, VLAN_HLEN)))
+ goto err_free;
+
+ vhdr = (struct vlan_hdr *)skb->data;
+ vlan_tci = ntohs(vhdr->h_vlan_TCI);
+ __vlan_hwaccel_put_tag(skb, skb->protocol, vlan_tci);
+
+ skb_pull_rcsum(skb, VLAN_HLEN);
+ vlan_set_encap_proto(skb, vhdr);
+
+ skb = skb_reorder_vlan_header(skb);
+ if (unlikely(!skb))
+ goto err_free;
+
+ skb_reset_network_header(skb);
+ skb_reset_transport_header(skb);
+ skb_reset_mac_len(skb);
+
+ return skb;
+
+err_free:
+ kfree_skb(skb);
+ return NULL;
+}
+EXPORT_SYMBOL(skb_vlan_untag);
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 190199851c9a..eaa4b000c7b4 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1798,8 +1798,6 @@ local_input:
no_route:
RT_CACHE_STAT_INC(in_no_route);
res.type = RTN_UNREACHABLE;
- if (err == -ESRCH)
- err = -ENETUNREACH;
goto local_input;
/*
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 181b70ebd964..541f26a67ba2 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1188,13 +1188,6 @@ new_segment:
goto wait_for_memory;
/*
- * All packets are restored as if they have
- * already been sent.
- */
- if (tp->repair)
- TCP_SKB_CB(skb)->when = tcp_time_stamp;
-
- /*
* Check whether we can use HW checksum.
*/
if (sk->sk_route_caps & NETIF_F_ALL_CSUM)
@@ -1203,6 +1196,13 @@ new_segment:
skb_entail(sk, skb);
copy = size_goal;
max = size_goal;
+
+ /* All packets are restored as if they have
+ * already been sent. skb_mstamp isn't set to
+ * avoid wrong rtt estimation.
+ */
+ if (tp->repair)
+ TCP_SKB_CB(skb)->sacked |= TCPCB_REPAIRED;
}
/* Try to append data to the end of skb. */
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index a3d47af01906..a906e0200ff2 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -2687,7 +2687,6 @@ static void tcp_enter_recovery(struct sock *sk, bool ece_ack)
*/
static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack)
{
- struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
bool recovered = !before(tp->snd_una, tp->high_seq);
@@ -2713,12 +2712,9 @@ static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack)
if (recovered) {
/* F-RTO RFC5682 sec 3.1 step 2.a and 1st part of step 3.a */
- icsk->icsk_retransmits = 0;
tcp_try_undo_recovery(sk);
return;
}
- if (flag & FLAG_DATA_ACKED)
- icsk->icsk_retransmits = 0;
if (tcp_is_reno(tp)) {
/* A Reno DUPACK means new data in F-RTO step 2.b above are
* delivered. Lower inflight to clock out (re)tranmissions.
@@ -3050,10 +3046,15 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
first_ackt.v64 = 0;
while ((skb = tcp_write_queue_head(sk)) && skb != tcp_send_head(sk)) {
+ struct skb_shared_info *shinfo = skb_shinfo(skb);
struct tcp_skb_cb *scb = TCP_SKB_CB(skb);
u8 sacked = scb->sacked;
u32 acked_pcount;
+ if (unlikely(shinfo->tx_flags & SKBTX_ACK_TSTAMP) &&
+ between(shinfo->tskey, prior_snd_una, tp->snd_una - 1))
+ __skb_tstamp_tx(skb, NULL, sk, SCM_TSTAMP_ACK);
+
/* Determine how many packets and what bytes were acked, tso and else */
if (after(scb->end_seq, tp->snd_una)) {
if (tcp_skb_pcount(skb) == 1 ||
@@ -3107,11 +3108,6 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
tp->retrans_stamp = 0;
}
- if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_ACK_TSTAMP) &&
- between(skb_shinfo(skb)->tskey, prior_snd_una,
- tp->snd_una + 1))
- __skb_tstamp_tx(skb, NULL, sk, SCM_TSTAMP_ACK);
-
if (!fully_acked)
break;
@@ -3405,8 +3401,10 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)
tcp_rearm_rto(sk);
- if (after(ack, prior_snd_una))
+ if (after(ack, prior_snd_una)) {
flag |= FLAG_SND_UNA_ADVANCED;
+ icsk->icsk_retransmits = 0;
+ }
prior_fackets = tp->fackets_out;
@@ -5979,12 +5977,14 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
* timewait bucket, so that all the necessary checks
* are made in the function processing timewait state.
*/
- if (tmp_opt.saw_tstamp && tcp_death_row.sysctl_tw_recycle) {
+ if (tcp_death_row.sysctl_tw_recycle) {
bool strict;
dst = af_ops->route_req(sk, &fl, req, &strict);
+
if (dst && strict &&
- !tcp_peer_is_proven(req, dst, true)) {
+ !tcp_peer_is_proven(req, dst, true,
+ tmp_opt.saw_tstamp)) {
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
goto drop_and_release;
}
@@ -5993,7 +5993,8 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
else if (!sysctl_tcp_syncookies &&
(sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
(sysctl_max_syn_backlog >> 2)) &&
- !tcp_peer_is_proven(req, dst, false)) {
+ !tcp_peer_is_proven(req, dst, false,
+ tmp_opt.saw_tstamp)) {
/* Without syncookies last quarter of
* backlog is filled with destinations,
* proven to be alive.
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index dceff5fe8e66..cd17f009aede 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -271,7 +271,7 @@ EXPORT_SYMBOL(tcp_v4_connect);
* It can be called through tcp_release_cb() if socket was owned by user
* at the time tcp_v4_err() was called to handle ICMP message.
*/
-static void tcp_v4_mtu_reduced(struct sock *sk)
+void tcp_v4_mtu_reduced(struct sock *sk)
{
struct dst_entry *dst;
struct inet_sock *inet = inet_sk(sk);
@@ -302,6 +302,7 @@ static void tcp_v4_mtu_reduced(struct sock *sk)
tcp_simple_retransmit(sk);
} /* else let the usual retransmit timer handle it */
}
+EXPORT_SYMBOL(tcp_v4_mtu_reduced);
static void do_redirect(struct sk_buff *skb, struct sock *sk)
{
@@ -1787,6 +1788,7 @@ const struct inet_connection_sock_af_ops ipv4_specific = {
.compat_setsockopt = compat_ip_setsockopt,
.compat_getsockopt = compat_ip_getsockopt,
#endif
+ .mtu_reduced = tcp_v4_mtu_reduced,
};
EXPORT_SYMBOL(ipv4_specific);
@@ -2406,7 +2408,6 @@ struct proto tcp_prot = {
.sendpage = tcp_sendpage,
.backlog_rcv = tcp_v4_do_rcv,
.release_cb = tcp_release_cb,
- .mtu_reduced = tcp_v4_mtu_reduced,
.hash = inet_hash,
.unhash = inet_unhash,
.get_port = inet_csk_get_port,
diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c
index 0d54e59b9ea8..ed9c9a91851c 100644
--- a/net/ipv4/tcp_metrics.c
+++ b/net/ipv4/tcp_metrics.c
@@ -576,7 +576,8 @@ reset:
tp->snd_cwnd_stamp = tcp_time_stamp;
}
-bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst, bool paws_check)
+bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst,
+ bool paws_check, bool timestamps)
{
struct tcp_metrics_block *tm;
bool ret;
@@ -589,7 +590,8 @@ bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst, bool pa
if (paws_check) {
if (tm &&
(u32)get_seconds() - tm->tcpm_ts_stamp < TCP_PAWS_MSL &&
- (s32)(tm->tcpm_ts - req->ts_recent) > TCP_PAWS_WINDOW)
+ ((s32)(tm->tcpm_ts - req->ts_recent) > TCP_PAWS_WINDOW ||
+ !timestamps))
ret = false;
else
ret = true;
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 8fcfc91964ec..5a7c41fbc6d3 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -800,7 +800,7 @@ void tcp_release_cb(struct sock *sk)
__sock_put(sk);
}
if (flags & (1UL << TCP_MTU_REDUCED_DEFERRED)) {
- sk->sk_prot->mtu_reduced(sk);
+ inet_csk(sk)->icsk_af_ops->mtu_reduced(sk);
__sock_put(sk);
}
}
@@ -1069,6 +1069,21 @@ static void tcp_adjust_pcount(struct sock *sk, const struct sk_buff *skb, int de
tcp_verify_left_out(tp);
}
+static void tcp_fragment_tstamp(struct sk_buff *skb, struct sk_buff *skb2)
+{
+ struct skb_shared_info *shinfo = skb_shinfo(skb);
+
+ if (unlikely(shinfo->tx_flags & SKBTX_ANY_TSTAMP) &&
+ !before(shinfo->tskey, TCP_SKB_CB(skb2)->seq)) {
+ struct skb_shared_info *shinfo2 = skb_shinfo(skb2);
+ u8 tsflags = shinfo->tx_flags & SKBTX_ANY_TSTAMP;
+
+ shinfo->tx_flags &= ~tsflags;
+ shinfo2->tx_flags |= tsflags;
+ swap(shinfo->tskey, shinfo2->tskey);
+ }
+}
+
/* Function to create two new TCP segments. Shrinks the given segment
* to the specified size and appends a new segment with the rest of the
* packet to the list. This won't be called frequently, I hope.
@@ -1136,6 +1151,7 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len,
*/
TCP_SKB_CB(buff)->when = TCP_SKB_CB(skb)->when;
buff->tstamp = skb->tstamp;
+ tcp_fragment_tstamp(skb, buff);
old_factor = tcp_skb_pcount(skb);
@@ -1652,6 +1668,7 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
buff->ip_summed = skb->ip_summed = CHECKSUM_PARTIAL;
skb_split(skb, buff, len);
+ tcp_fragment_tstamp(skb, buff);
/* Fix up tso_factor for both original and new SKB. */
tcp_set_skb_tso_segs(sk, skb, mss_now);
@@ -1917,8 +1934,11 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
tso_segs = tcp_init_tso_segs(sk, skb, mss_now);
BUG_ON(!tso_segs);
- if (unlikely(tp->repair) && tp->repair_queue == TCP_SEND_QUEUE)
+ if (unlikely(tp->repair) && tp->repair_queue == TCP_SEND_QUEUE) {
+ /* "when" is used as a start point for the retransmit timer */
+ TCP_SKB_CB(skb)->when = tcp_time_stamp;
goto repair; /* Skip network transmission */
+ }
cwnd_quota = tcp_cwnd_test(tp, skb);
if (!cwnd_quota) {
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index cb4459bd1d29..76b7f5ee8f4c 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -643,7 +643,7 @@ static int fib6_commit_metrics(struct dst_entry *dst,
if (dst->flags & DST_HOST) {
mp = dst_metrics_write_ptr(dst);
} else {
- mp = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
+ mp = kzalloc(sizeof(u32) * RTAX_MAX, GFP_ATOMIC);
if (!mp)
return -ENOMEM;
dst_init_metrics(dst, mp, 0);
diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c
index 2e9ba035fb5f..6163f851dc01 100644
--- a/net/ipv6/sit.c
+++ b/net/ipv6/sit.c
@@ -101,19 +101,19 @@ static struct ip_tunnel *ipip6_tunnel_lookup(struct net *net,
for_each_ip_tunnel_rcu(t, sitn->tunnels_r_l[h0 ^ h1]) {
if (local == t->parms.iph.saddr &&
remote == t->parms.iph.daddr &&
- (!dev || !t->parms.link || dev->iflink == t->parms.link) &&
+ (!dev || !t->parms.link || dev->ifindex == t->parms.link) &&
(t->dev->flags & IFF_UP))
return t;
}
for_each_ip_tunnel_rcu(t, sitn->tunnels_r[h0]) {
if (remote == t->parms.iph.daddr &&
- (!dev || !t->parms.link || dev->iflink == t->parms.link) &&
+ (!dev || !t->parms.link || dev->ifindex == t->parms.link) &&
(t->dev->flags & IFF_UP))
return t;
}
for_each_ip_tunnel_rcu(t, sitn->tunnels_l[h1]) {
if (local == t->parms.iph.saddr &&
- (!dev || !t->parms.link || dev->iflink == t->parms.link) &&
+ (!dev || !t->parms.link || dev->ifindex == t->parms.link) &&
(t->dev->flags & IFF_UP))
return t;
}
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index f2ce95502392..29964c3d363c 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1595,6 +1595,7 @@ static const struct inet_connection_sock_af_ops ipv6_specific = {
.compat_setsockopt = compat_ipv6_setsockopt,
.compat_getsockopt = compat_ipv6_getsockopt,
#endif
+ .mtu_reduced = tcp_v6_mtu_reduced,
};
#ifdef CONFIG_TCP_MD5SIG
@@ -1625,6 +1626,7 @@ static const struct inet_connection_sock_af_ops ipv6_mapped = {
.compat_setsockopt = compat_ipv6_setsockopt,
.compat_getsockopt = compat_ipv6_getsockopt,
#endif
+ .mtu_reduced = tcp_v4_mtu_reduced,
};
#ifdef CONFIG_TCP_MD5SIG
@@ -1864,7 +1866,6 @@ struct proto tcpv6_prot = {
.sendpage = tcp_sendpage,
.backlog_rcv = tcp_v6_do_rcv,
.release_cb = tcp_release_cb,
- .mtu_reduced = tcp_v6_mtu_reduced,
.hash = tcp_v6_hash,
.unhash = inet_unhash,
.get_port = inet_csk_get_port,
diff --git a/net/irda/irlap_frame.c b/net/irda/irlap_frame.c
index 9ea0c933b9ff..a37998c6273d 100644
--- a/net/irda/irlap_frame.c
+++ b/net/irda/irlap_frame.c
@@ -622,7 +622,7 @@ void irlap_send_rd_frame(struct irlap_cb *self)
frame = (struct rd_frame *)skb_put(tx_skb, 2);
frame->caddr = self->caddr;
- frame->caddr = RD_RSP | PF_BIT;
+ frame->control = RD_RSP | PF_BIT;
irlap_queue_xmit(self, tx_skb);
}
diff --git a/net/mac80211/chan.c b/net/mac80211/chan.c
index 6d537f03c0ba..0375009ddc0d 100644
--- a/net/mac80211/chan.c
+++ b/net/mac80211/chan.c
@@ -1444,7 +1444,7 @@ ieee80211_vif_use_reserved_switch(struct ieee80211_local *local)
list_del(&sdata->reserved_chanctx_list);
list_move(&sdata->assigned_chanctx_list,
- &new_ctx->assigned_vifs);
+ &ctx->assigned_vifs);
sdata->reserved_chanctx = NULL;
ieee80211_vif_chanctx_reservation_complete(sdata);
diff --git a/net/netfilter/core.c b/net/netfilter/core.c
index 1fbab0cdd302..a93c97f106d4 100644
--- a/net/netfilter/core.c
+++ b/net/netfilter/core.c
@@ -35,11 +35,7 @@ EXPORT_SYMBOL_GPL(nf_ipv6_ops);
int nf_register_afinfo(const struct nf_afinfo *afinfo)
{
- int err;
-
- err = mutex_lock_interruptible(&afinfo_mutex);
- if (err < 0)
- return err;
+ mutex_lock(&afinfo_mutex);
RCU_INIT_POINTER(nf_afinfo[afinfo->family], afinfo);
mutex_unlock(&afinfo_mutex);
return 0;
@@ -68,11 +64,8 @@ static DEFINE_MUTEX(nf_hook_mutex);
int nf_register_hook(struct nf_hook_ops *reg)
{
struct nf_hook_ops *elem;
- int err;
- err = mutex_lock_interruptible(&nf_hook_mutex);
- if (err < 0)
- return err;
+ mutex_lock(&nf_hook_mutex);
list_for_each_entry(elem, &nf_hooks[reg->pf][reg->hooknum], list) {
if (reg->priority < elem->priority)
break;
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index 8416307fdd1d..fd3f444a4f96 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -2271,10 +2271,7 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
cmd == IP_VS_SO_SET_STOPDAEMON) {
struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg;
- if (mutex_lock_interruptible(&ipvs->sync_mutex)) {
- ret = -ERESTARTSYS;
- goto out_dec;
- }
+ mutex_lock(&ipvs->sync_mutex);
if (cmd == IP_VS_SO_SET_STARTDAEMON)
ret = start_sync_thread(net, dm->state, dm->mcast_ifn,
dm->syncid);
@@ -2284,11 +2281,7 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
goto out_dec;
}
- if (mutex_lock_interruptible(&__ip_vs_mutex)) {
- ret = -ERESTARTSYS;
- goto out_dec;
- }
-
+ mutex_lock(&__ip_vs_mutex);
if (cmd == IP_VS_SO_SET_FLUSH) {
/* Flush the virtual service */
ret = ip_vs_flush(net, false);
@@ -2573,9 +2566,7 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
struct ip_vs_daemon_user d[2];
memset(&d, 0, sizeof(d));
- if (mutex_lock_interruptible(&ipvs->sync_mutex))
- return -ERESTARTSYS;
-
+ mutex_lock(&ipvs->sync_mutex);
if (ipvs->sync_state & IP_VS_STATE_MASTER) {
d[0].state = IP_VS_STATE_MASTER;
strlcpy(d[0].mcast_ifn, ipvs->master_mcast_ifn,
@@ -2594,9 +2585,7 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
return ret;
}
- if (mutex_lock_interruptible(&__ip_vs_mutex))
- return -ERESTARTSYS;
-
+ mutex_lock(&__ip_vs_mutex);
switch (cmd) {
case IP_VS_SO_GET_VERSION:
{
diff --git a/net/netfilter/nf_sockopt.c b/net/netfilter/nf_sockopt.c
index f042ae521557..c68c1e58b362 100644
--- a/net/netfilter/nf_sockopt.c
+++ b/net/netfilter/nf_sockopt.c
@@ -26,9 +26,7 @@ int nf_register_sockopt(struct nf_sockopt_ops *reg)
struct nf_sockopt_ops *ops;
int ret = 0;
- if (mutex_lock_interruptible(&nf_sockopt_mutex) != 0)
- return -EINTR;
-
+ mutex_lock(&nf_sockopt_mutex);
list_for_each_entry(ops, &nf_sockopts, list) {
if (ops->pf == reg->pf
&& (overlap(ops->set_optmin, ops->set_optmax,
@@ -65,9 +63,7 @@ static struct nf_sockopt_ops *nf_sockopt_find(struct sock *sk, u_int8_t pf,
{
struct nf_sockopt_ops *ops;
- if (mutex_lock_interruptible(&nf_sockopt_mutex) != 0)
- return ERR_PTR(-EINTR);
-
+ mutex_lock(&nf_sockopt_mutex);
list_for_each_entry(ops, &nf_sockopts, list) {
if (ops->pf == pf) {
if (!try_module_get(ops->owner))
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index b8035c2d6667..deeb95fb7028 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -899,6 +899,9 @@ static struct nft_stats __percpu *nft_stats_alloc(const struct nlattr *attr)
static void nft_chain_stats_replace(struct nft_base_chain *chain,
struct nft_stats __percpu *newstats)
{
+ if (newstats == NULL)
+ return;
+
if (chain->stats) {
struct nft_stats __percpu *oldstats =
nft_dereference(chain->stats);
@@ -3134,16 +3137,13 @@ static int nft_del_setelem(struct nft_ctx *ctx, struct nft_set *set,
goto err2;
trans = nft_trans_elem_alloc(ctx, NFT_MSG_DELSETELEM, set);
- if (trans == NULL)
+ if (trans == NULL) {
+ err = -ENOMEM;
goto err2;
+ }
nft_trans_elem(trans) = elem;
list_add_tail(&trans->list, &ctx->net->nft.commit_list);
-
- nft_data_uninit(&elem.key, NFT_DATA_VALUE);
- if (set->flags & NFT_SET_MAP)
- nft_data_uninit(&elem.data, set->dtype);
-
return 0;
err2:
nft_data_uninit(&elem.key, desc.type);
@@ -3310,7 +3310,7 @@ static int nf_tables_commit(struct sk_buff *skb)
{
struct net *net = sock_net(skb->sk);
struct nft_trans *trans, *next;
- struct nft_set *set;
+ struct nft_trans_elem *te;
/* Bump generation counter, invalidate any dump in progress */
while (++net->nft.base_seq == 0);
@@ -3396,13 +3396,17 @@ static int nf_tables_commit(struct sk_buff *skb)
nft_trans_destroy(trans);
break;
case NFT_MSG_DELSETELEM:
- nf_tables_setelem_notify(&trans->ctx,
- nft_trans_elem_set(trans),
- &nft_trans_elem(trans),
+ te = (struct nft_trans_elem *)trans->data;
+ nf_tables_setelem_notify(&trans->ctx, te->set,
+ &te->elem,
NFT_MSG_DELSETELEM, 0);
- set = nft_trans_elem_set(trans);
- set->ops->get(set, &nft_trans_elem(trans));
- set->ops->remove(set, &nft_trans_elem(trans));
+ te->set->ops->get(te->set, &te->elem);
+ te->set->ops->remove(te->set, &te->elem);
+ nft_data_uninit(&te->elem.key, NFT_DATA_VALUE);
+ if (te->elem.flags & NFT_SET_MAP) {
+ nft_data_uninit(&te->elem.data,
+ te->set->dtype);
+ }
nft_trans_destroy(trans);
break;
}
diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c
index 47b978bc3100..272ae4d6fdf4 100644
--- a/net/netfilter/x_tables.c
+++ b/net/netfilter/x_tables.c
@@ -71,18 +71,14 @@ static const char *const xt_prefix[NFPROTO_NUMPROTO] = {
static const unsigned int xt_jumpstack_multiplier = 2;
/* Registration hooks for targets. */
-int
-xt_register_target(struct xt_target *target)
+int xt_register_target(struct xt_target *target)
{
u_int8_t af = target->family;
- int ret;
- ret = mutex_lock_interruptible(&xt[af].mutex);
- if (ret != 0)
- return ret;
+ mutex_lock(&xt[af].mutex);
list_add(&target->list, &xt[af].target);
mutex_unlock(&xt[af].mutex);
- return ret;
+ return 0;
}
EXPORT_SYMBOL(xt_register_target);
@@ -125,20 +121,14 @@ xt_unregister_targets(struct xt_target *target, unsigned int n)
}
EXPORT_SYMBOL(xt_unregister_targets);
-int
-xt_register_match(struct xt_match *match)
+int xt_register_match(struct xt_match *match)
{
u_int8_t af = match->family;
- int ret;
-
- ret = mutex_lock_interruptible(&xt[af].mutex);
- if (ret != 0)
- return ret;
+ mutex_lock(&xt[af].mutex);
list_add(&match->list, &xt[af].match);
mutex_unlock(&xt[af].mutex);
-
- return ret;
+ return 0;
}
EXPORT_SYMBOL(xt_register_match);
@@ -194,9 +184,7 @@ struct xt_match *xt_find_match(u8 af, const char *name, u8 revision)
struct xt_match *m;
int err = -ENOENT;
- if (mutex_lock_interruptible(&xt[af].mutex) != 0)
- return ERR_PTR(-EINTR);
-
+ mutex_lock(&xt[af].mutex);
list_for_each_entry(m, &xt[af].match, list) {
if (strcmp(m->name, name) == 0) {
if (m->revision == revision) {
@@ -239,9 +227,7 @@ struct xt_target *xt_find_target(u8 af, const char *name, u8 revision)
struct xt_target *t;
int err = -ENOENT;
- if (mutex_lock_interruptible(&xt[af].mutex) != 0)
- return ERR_PTR(-EINTR);
-
+ mutex_lock(&xt[af].mutex);
list_for_each_entry(t, &xt[af].target, list) {
if (strcmp(t->name, name) == 0) {
if (t->revision == revision) {
@@ -323,10 +309,7 @@ int xt_find_revision(u8 af, const char *name, u8 revision, int target,
{
int have_rev, best = -1;
- if (mutex_lock_interruptible(&xt[af].mutex) != 0) {
- *err = -EINTR;
- return 1;
- }
+ mutex_lock(&xt[af].mutex);
if (target == 1)
have_rev = target_revfn(af, name, revision, &best);
else
@@ -732,9 +715,7 @@ struct xt_table *xt_find_table_lock(struct net *net, u_int8_t af,
{
struct xt_table *t;
- if (mutex_lock_interruptible(&xt[af].mutex) != 0)
- return ERR_PTR(-EINTR);
-
+ mutex_lock(&xt[af].mutex);
list_for_each_entry(t, &net->xt.tables[af], list)
if (strcmp(t->name, name) == 0 && try_module_get(t->me))
return t;
@@ -883,10 +864,7 @@ struct xt_table *xt_register_table(struct net *net,
goto out;
}
- ret = mutex_lock_interruptible(&xt[table->af].mutex);
- if (ret != 0)
- goto out_free;
-
+ mutex_lock(&xt[table->af].mutex);
/* Don't autoload: we'd eat our tail... */
list_for_each_entry(t, &net->xt.tables[table->af], list) {
if (strcmp(t->name, table->name) == 0) {
@@ -911,9 +889,8 @@ struct xt_table *xt_register_table(struct net *net,
mutex_unlock(&xt[table->af].mutex);
return table;
- unlock:
+unlock:
mutex_unlock(&xt[table->af].mutex);
-out_free:
kfree(table);
out:
return ERR_PTR(ret);
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index a324b4b34c90..c416725d28c4 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -213,7 +213,7 @@ static int __netlink_deliver_tap_skb(struct sk_buff *skb,
nskb->protocol = htons((u16) sk->sk_protocol);
nskb->pkt_type = netlink_is_kernel(sk) ?
PACKET_KERNEL : PACKET_USER;
-
+ skb_reset_network_header(nskb);
ret = dev_queue_xmit(nskb);
if (unlikely(ret > 0))
ret = net_xmit_errno(ret);
@@ -2921,6 +2921,7 @@ static struct sock *netlink_seq_socket_idx(struct seq_file *seq, loff_t pos)
}
static void *netlink_seq_start(struct seq_file *seq, loff_t *pos)
+ __acquires(RCU)
{
rcu_read_lock();
return *pos ? netlink_seq_socket_idx(seq, *pos - 1) : SEQ_START_TOKEN;
@@ -2970,6 +2971,7 @@ static void *netlink_seq_next(struct seq_file *seq, void *v, loff_t *pos)
}
static void netlink_seq_stop(struct seq_file *seq, void *v)
+ __releases(RCU)
{
rcu_read_unlock();
}
diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
index fe5cda0deb39..5231652a95d9 100644
--- a/net/openvswitch/actions.c
+++ b/net/openvswitch/actions.c
@@ -42,6 +42,9 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb,
static int make_writable(struct sk_buff *skb, int write_len)
{
+ if (!pskb_may_pull(skb, write_len))
+ return -ENOMEM;
+
if (!skb_cloned(skb) || skb_clone_writable(skb, write_len))
return 0;
@@ -70,6 +73,8 @@ static int __pop_vlan_tci(struct sk_buff *skb, __be16 *current_tci)
vlan_set_encap_proto(skb, vhdr);
skb->mac_header += VLAN_HLEN;
+ if (skb_network_offset(skb) < ETH_HLEN)
+ skb_set_network_header(skb, ETH_HLEN);
skb_reset_mac_len(skb);
return 0;
diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
index 7ad3f029baae..7228ec3faf19 100644
--- a/net/openvswitch/datapath.c
+++ b/net/openvswitch/datapath.c
@@ -47,8 +47,6 @@
#include <linux/openvswitch.h>
#include <linux/rculist.h>
#include <linux/dmi.h>
-#include <linux/genetlink.h>
-#include <net/genetlink.h>
#include <net/genetlink.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
diff --git a/net/openvswitch/vport.c b/net/openvswitch/vport.c
index 702fb21bfe15..6d8f2ec481d9 100644
--- a/net/openvswitch/vport.c
+++ b/net/openvswitch/vport.c
@@ -137,8 +137,10 @@ struct vport *ovs_vport_alloc(int priv_size, const struct vport_ops *ops,
vport->ops = ops;
INIT_HLIST_NODE(&vport->dp_hash_node);
- if (ovs_vport_set_upcall_portids(vport, parms->upcall_portids))
+ if (ovs_vport_set_upcall_portids(vport, parms->upcall_portids)) {
+ kfree(vport);
return ERR_PTR(-EINVAL);
+ }
vport->percpu_stats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
if (!vport->percpu_stats) {
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 8d9f8042705a..93896d2092f6 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -632,6 +632,7 @@ static void init_prb_bdqc(struct packet_sock *po,
p1->tov_in_jiffies = msecs_to_jiffies(p1->retire_blk_tov);
p1->blk_sizeof_priv = req_u->req3.tp_sizeof_priv;
+ p1->max_frame_len = p1->kblk_size - BLK_PLUS_PRIV(p1->blk_sizeof_priv);
prb_init_ft_ops(p1, req_u);
prb_setup_retire_blk_timer(po, tx_ring);
prb_open_block(p1, pbd);
@@ -1942,6 +1943,18 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
if ((int)snaplen < 0)
snaplen = 0;
}
+ } else if (unlikely(macoff + snaplen >
+ GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len)) {
+ u32 nval;
+
+ nval = GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len - macoff;
+ pr_err_once("tpacket_rcv: packet too big, clamped from %u to %u. macoff=%u\n",
+ snaplen, nval, macoff);
+ snaplen = nval;
+ if (unlikely((int)snaplen < 0)) {
+ snaplen = 0;
+ macoff = GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len;
+ }
}
spin_lock(&sk->sk_receive_queue.lock);
h.raw = packet_current_rx_frame(po, skb,
@@ -3783,6 +3796,10 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
goto out;
if (unlikely(req->tp_block_size & (PAGE_SIZE - 1)))
goto out;
+ if (po->tp_version >= TPACKET_V3 &&
+ (int)(req->tp_block_size -
+ BLK_PLUS_PRIV(req_u->req3.tp_sizeof_priv)) <= 0)
+ goto out;
if (unlikely(req->tp_frame_size < po->tp_hdrlen +
po->tp_reserve))
goto out;
diff --git a/net/packet/internal.h b/net/packet/internal.h
index eb9580a6b25f..cdddf6a30399 100644
--- a/net/packet/internal.h
+++ b/net/packet/internal.h
@@ -29,6 +29,7 @@ struct tpacket_kbdq_core {
char *pkblk_start;
char *pkblk_end;
int kblk_size;
+ unsigned int max_frame_len;
unsigned int knum_blocks;
uint64_t knxt_seq_num;
char *prev;
diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c
index ead526467cca..762a04bb8f6d 100644
--- a/net/sched/sch_cbq.c
+++ b/net/sched/sch_cbq.c
@@ -159,7 +159,6 @@ struct cbq_sched_data {
struct cbq_class *tx_borrowed;
int tx_len;
psched_time_t now; /* Cached timestamp */
- psched_time_t now_rt; /* Cached real time */
unsigned int pmask;
struct hrtimer delay_timer;
@@ -353,12 +352,7 @@ cbq_mark_toplevel(struct cbq_sched_data *q, struct cbq_class *cl)
int toplevel = q->toplevel;
if (toplevel > cl->level && !(qdisc_is_throttled(cl->q))) {
- psched_time_t now;
- psched_tdiff_t incr;
-
- now = psched_get_time();
- incr = now - q->now_rt;
- now = q->now + incr;
+ psched_time_t now = psched_get_time();
do {
if (cl->undertime < now) {
@@ -700,8 +694,13 @@ cbq_update(struct cbq_sched_data *q)
struct cbq_class *this = q->tx_class;
struct cbq_class *cl = this;
int len = q->tx_len;
+ psched_time_t now;
q->tx_class = NULL;
+ /* Time integrator. We calculate EOS time
+ * by adding expected packet transmission time.
+ */
+ now = q->now + L2T(&q->link, len);
for ( ; cl; cl = cl->share) {
long avgidle = cl->avgidle;
@@ -717,7 +716,7 @@ cbq_update(struct cbq_sched_data *q)
* idle = (now - last) - last_pktlen/rate
*/
- idle = q->now - cl->last;
+ idle = now - cl->last;
if ((unsigned long)idle > 128*1024*1024) {
avgidle = cl->maxidle;
} else {
@@ -761,7 +760,7 @@ cbq_update(struct cbq_sched_data *q)
idle -= L2T(&q->link, len);
idle += L2T(cl, len);
- cl->undertime = q->now + idle;
+ cl->undertime = now + idle;
} else {
/* Underlimit */
@@ -771,7 +770,8 @@ cbq_update(struct cbq_sched_data *q)
else
cl->avgidle = avgidle;
}
- cl->last = q->now;
+ if ((s64)(now - cl->last) > 0)
+ cl->last = now;
}
cbq_update_toplevel(q, this, q->tx_borrowed);
@@ -943,31 +943,13 @@ cbq_dequeue(struct Qdisc *sch)
struct sk_buff *skb;
struct cbq_sched_data *q = qdisc_priv(sch);
psched_time_t now;
- psched_tdiff_t incr;
now = psched_get_time();
- incr = now - q->now_rt;
-
- if (q->tx_class) {
- psched_tdiff_t incr2;
- /* Time integrator. We calculate EOS time
- * by adding expected packet transmission time.
- * If real time is greater, we warp artificial clock,
- * so that:
- *
- * cbq_time = max(real_time, work);
- */
- incr2 = L2T(&q->link, q->tx_len);
- q->now += incr2;
+
+ if (q->tx_class)
cbq_update(q);
- if ((incr -= incr2) < 0)
- incr = 0;
- q->now += incr;
- } else {
- if (now > q->now)
- q->now = now;
- }
- q->now_rt = now;
+
+ q->now = now;
for (;;) {
q->wd_expires = 0;
@@ -1223,7 +1205,6 @@ cbq_reset(struct Qdisc *sch)
hrtimer_cancel(&q->delay_timer);
q->toplevel = TC_CBQ_MAXLEVEL;
q->now = psched_get_time();
- q->now_rt = q->now;
for (prio = 0; prio <= TC_CBQ_MAXPRIO; prio++)
q->active[prio] = NULL;
@@ -1407,7 +1388,6 @@ static int cbq_init(struct Qdisc *sch, struct nlattr *opt)
q->delay_timer.function = cbq_undelay;
q->toplevel = TC_CBQ_MAXLEVEL;
q->now = psched_get_time();
- q->now_rt = q->now;
cbq_link_class(&q->link);
diff --git a/net/sctp/associola.c b/net/sctp/associola.c
index 06a9ee6b2d3a..a88b8524846e 100644
--- a/net/sctp/associola.c
+++ b/net/sctp/associola.c
@@ -813,6 +813,7 @@ void sctp_assoc_control_transport(struct sctp_association *asoc,
else {
dst_release(transport->dst);
transport->dst = NULL;
+ ulp_notify = false;
}
spc_state = SCTP_ADDR_UNREACHABLE;
@@ -1244,7 +1245,7 @@ static struct sctp_transport *sctp_trans_elect_best(struct sctp_transport *curr,
{
u8 score_curr, score_best;
- if (best == NULL)
+ if (best == NULL || curr == best)
return curr;
score_curr = sctp_trans_score(curr);
@@ -1355,14 +1356,11 @@ static void sctp_select_active_and_retran_path(struct sctp_association *asoc)
trans_sec = trans_pri;
/* If we failed to find a usable transport, just camp on the
- * primary or retran, even if they are inactive, if possible
- * pick a PF iff it's the better choice.
+ * active or pick a PF iff it's the better choice.
*/
if (trans_pri == NULL) {
- trans_pri = sctp_trans_elect_best(asoc->peer.primary_path,
- asoc->peer.retran_path);
- trans_pri = sctp_trans_elect_best(trans_pri, trans_pf);
- trans_sec = asoc->peer.primary_path;
+ trans_pri = sctp_trans_elect_best(asoc->peer.active_path, trans_pf);
+ trans_sec = trans_pri;
}
/* Set the active and retran transports. */
diff --git a/net/sunrpc/addr.c b/net/sunrpc/addr.c
index a622ad64acd8..2e0a6f92e563 100644
--- a/net/sunrpc/addr.c
+++ b/net/sunrpc/addr.c
@@ -176,7 +176,7 @@ static int rpc_parse_scope_id(struct net *net, const char *buf,
len = (buf + buflen) - delim - 1;
p = kstrndup(delim + 1, len, GFP_KERNEL);
if (p) {
- unsigned long scope_id = 0;
+ u32 scope_id = 0;
struct net_device *dev;
dev = dev_get_by_name(net, p);
@@ -184,7 +184,7 @@ static int rpc_parse_scope_id(struct net *net, const char *buf,
scope_id = dev->ifindex;
dev_put(dev);
} else {
- if (strict_strtoul(p, 10, &scope_id) == 0) {
+ if (kstrtou32(p, 10, &scope_id) == 0) {
kfree(p);
return 0;
}
@@ -304,7 +304,7 @@ char *rpc_sockaddr2uaddr(const struct sockaddr *sap, gfp_t gfp_flags)
* @sap: buffer into which to plant socket address
* @salen: size of buffer
*
- * @uaddr does not have to be '\0'-terminated, but strict_strtoul() and
+ * @uaddr does not have to be '\0'-terminated, but kstrtou8() and
* rpc_pton() require proper string termination to be successful.
*
* Returns the size of the socket address if successful; otherwise
@@ -315,7 +315,7 @@ size_t rpc_uaddr2sockaddr(struct net *net, const char *uaddr,
const size_t salen)
{
char *c, buf[RPCBIND_MAXUADDRLEN + sizeof('\0')];
- unsigned long portlo, porthi;
+ u8 portlo, porthi;
unsigned short port;
if (uaddr_len > RPCBIND_MAXUADDRLEN)
@@ -327,18 +327,14 @@ size_t rpc_uaddr2sockaddr(struct net *net, const char *uaddr,
c = strrchr(buf, '.');
if (unlikely(c == NULL))
return 0;
- if (unlikely(strict_strtoul(c + 1, 10, &portlo) != 0))
- return 0;
- if (unlikely(portlo > 255))
+ if (unlikely(kstrtou8(c + 1, 10, &portlo) != 0))
return 0;
*c = '\0';
c = strrchr(buf, '.');
if (unlikely(c == NULL))
return 0;
- if (unlikely(strict_strtoul(c + 1, 10, &porthi) != 0))
- return 0;
- if (unlikely(porthi > 255))
+ if (unlikely(kstrtou8(c + 1, 10, &porthi) != 0))
return 0;
port = (unsigned short)((porthi << 8) | portlo);
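A hedged worked example of the arithmetic above (the address is made up, not taken from this diff): in an RPC universal address the last two dot-separated decimal octets carry the port, so after the two kstrtou8() conversions the shift-and-or recovers it directly.

/* Illustrative only: for the uaddr "192.0.2.10.8.1" the code parses
 * portlo = 1 (last octet) and porthi = 8 (next-to-last), giving
 *     port = (porthi << 8) | portlo = (8 << 8) | 1 = 2049
 * which happens to be the conventional NFS port.
 */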
diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c
index f77366717420..383eb919ac0b 100644
--- a/net/sunrpc/auth.c
+++ b/net/sunrpc/auth.c
@@ -48,7 +48,7 @@ static int param_set_hashtbl_sz(const char *val, const struct kernel_param *kp)
if (!val)
goto out_inval;
- ret = strict_strtoul(val, 0, &num);
+ ret = kstrtoul(val, 0, &num);
if (ret == -EINVAL)
goto out_inval;
nbits = fls(num);
@@ -80,6 +80,10 @@ static struct kernel_param_ops param_ops_hashtbl_sz = {
module_param_named(auth_hashtable_size, auth_hashbits, hashtbl_sz, 0644);
MODULE_PARM_DESC(auth_hashtable_size, "RPC credential cache hashtable size");
+static unsigned long auth_max_cred_cachesize = ULONG_MAX;
+module_param(auth_max_cred_cachesize, ulong, 0644);
+MODULE_PARM_DESC(auth_max_cred_cachesize, "RPC credential maximum total cache size");
+
static u32
pseudoflavor_to_flavor(u32 flavor) {
if (flavor > RPC_AUTH_MAXFLAVOR)
@@ -363,6 +367,15 @@ rpcauth_cred_key_to_expire(struct rpc_cred *cred)
}
EXPORT_SYMBOL_GPL(rpcauth_cred_key_to_expire);
+char *
+rpcauth_stringify_acceptor(struct rpc_cred *cred)
+{
+ if (!cred->cr_ops->crstringify_acceptor)
+ return NULL;
+ return cred->cr_ops->crstringify_acceptor(cred);
+}
+EXPORT_SYMBOL_GPL(rpcauth_stringify_acceptor);
+
/*
* Destroy a list of credentials
*/
@@ -472,6 +485,20 @@ rpcauth_prune_expired(struct list_head *free, int nr_to_scan)
return freed;
}
+static unsigned long
+rpcauth_cache_do_shrink(int nr_to_scan)
+{
+ LIST_HEAD(free);
+ unsigned long freed;
+
+ spin_lock(&rpc_credcache_lock);
+ freed = rpcauth_prune_expired(&free, nr_to_scan);
+ spin_unlock(&rpc_credcache_lock);
+ rpcauth_destroy_credlist(&free);
+
+ return freed;
+}
+
/*
* Run memory cache shrinker.
*/
@@ -479,9 +506,6 @@ static unsigned long
rpcauth_cache_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
{
- LIST_HEAD(free);
- unsigned long freed;
-
if ((sc->gfp_mask & GFP_KERNEL) != GFP_KERNEL)
return SHRINK_STOP;
@@ -489,12 +513,7 @@ rpcauth_cache_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
if (list_empty(&cred_unused))
return SHRINK_STOP;
- spin_lock(&rpc_credcache_lock);
- freed = rpcauth_prune_expired(&free, sc->nr_to_scan);
- spin_unlock(&rpc_credcache_lock);
- rpcauth_destroy_credlist(&free);
-
- return freed;
+ return rpcauth_cache_do_shrink(sc->nr_to_scan);
}
static unsigned long
@@ -504,6 +523,21 @@ rpcauth_cache_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
return (number_cred_unused / 100) * sysctl_vfs_cache_pressure;
}
+static void
+rpcauth_cache_enforce_limit(void)
+{
+ unsigned long diff;
+ unsigned int nr_to_scan;
+
+ if (number_cred_unused <= auth_max_cred_cachesize)
+ return;
+ diff = number_cred_unused - auth_max_cred_cachesize;
+ nr_to_scan = 100;
+ if (diff < nr_to_scan)
+ nr_to_scan = diff;
+ rpcauth_cache_do_shrink(nr_to_scan);
+}
+
/*
* Look up a process' credentials in the authentication cache
*/
@@ -523,6 +557,12 @@ rpcauth_lookup_credcache(struct rpc_auth *auth, struct auth_cred * acred,
hlist_for_each_entry_rcu(entry, &cache->hashtable[nr], cr_hash) {
if (!entry->cr_ops->crmatch(acred, entry, flags))
continue;
+ if (flags & RPCAUTH_LOOKUP_RCU) {
+ if (test_bit(RPCAUTH_CRED_HASHED, &entry->cr_flags) &&
+ !test_bit(RPCAUTH_CRED_NEW, &entry->cr_flags))
+ cred = entry;
+ break;
+ }
spin_lock(&cache->lock);
if (test_bit(RPCAUTH_CRED_HASHED, &entry->cr_flags) == 0) {
spin_unlock(&cache->lock);
@@ -537,6 +577,9 @@ rpcauth_lookup_credcache(struct rpc_auth *auth, struct auth_cred * acred,
if (cred != NULL)
goto found;
+ if (flags & RPCAUTH_LOOKUP_RCU)
+ return ERR_PTR(-ECHILD);
+
new = auth->au_ops->crcreate(auth, acred, flags);
if (IS_ERR(new)) {
cred = new;
@@ -557,6 +600,7 @@ rpcauth_lookup_credcache(struct rpc_auth *auth, struct auth_cred * acred,
} else
list_add_tail(&new->cr_lru, &free);
spin_unlock(&cache->lock);
+ rpcauth_cache_enforce_limit();
found:
if (test_bit(RPCAUTH_CRED_NEW, &cred->cr_flags) &&
cred->cr_ops->cr_init != NULL &&
@@ -586,10 +630,8 @@ rpcauth_lookupcred(struct rpc_auth *auth, int flags)
memset(&acred, 0, sizeof(acred));
acred.uid = cred->fsuid;
acred.gid = cred->fsgid;
- acred.group_info = get_group_info(((struct cred *)cred)->group_info);
-
+ acred.group_info = cred->group_info;
ret = auth->au_ops->lookup_cred(auth, &acred, flags);
- put_group_info(acred.group_info);
return ret;
}
EXPORT_SYMBOL_GPL(rpcauth_lookupcred);
diff --git a/net/sunrpc/auth_generic.c b/net/sunrpc/auth_generic.c
index ed04869b2d4f..6f6b829c9e8e 100644
--- a/net/sunrpc/auth_generic.c
+++ b/net/sunrpc/auth_generic.c
@@ -38,6 +38,12 @@ struct rpc_cred *rpc_lookup_cred(void)
}
EXPORT_SYMBOL_GPL(rpc_lookup_cred);
+struct rpc_cred *rpc_lookup_cred_nonblock(void)
+{
+ return rpcauth_lookupcred(&generic_auth, RPCAUTH_LOOKUP_RCU);
+}
+EXPORT_SYMBOL_GPL(rpc_lookup_cred_nonblock);
+
/*
* Public call interface for looking up machine creds.
*/
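A minimal caller sketch, assuming the RPCAUTH_LOOKUP_RCU semantics introduced above (the helper name is hypothetical, not part of this diff): rpc_lookup_cred_nonblock() never sleeps and returns ERR_PTR(-ECHILD) when only a blocking lookup can succeed, so a caller might try it first and fall back to the blocking variant on that error.

static struct rpc_cred *lookup_cred_rcu_then_block(void)
{
	struct rpc_cred *cred;

	/* RCU-only lookup: fails fast instead of sleeping */
	cred = rpc_lookup_cred_nonblock();
	if (IS_ERR(cred) && PTR_ERR(cred) == -ECHILD)
		cred = rpc_lookup_cred();	/* blocking fallback */
	return cred;				/* may still be an ERR_PTR */
}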
diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c
index b6e440baccc3..afb292cd797d 100644
--- a/net/sunrpc/auth_gss/auth_gss.c
+++ b/net/sunrpc/auth_gss/auth_gss.c
@@ -183,8 +183,9 @@ gss_cred_get_ctx(struct rpc_cred *cred)
struct gss_cl_ctx *ctx = NULL;
rcu_read_lock();
- if (gss_cred->gc_ctx)
- ctx = gss_get_ctx(gss_cred->gc_ctx);
+ ctx = rcu_dereference(gss_cred->gc_ctx);
+ if (ctx)
+ gss_get_ctx(ctx);
rcu_read_unlock();
return ctx;
}
@@ -262,9 +263,22 @@ gss_fill_context(const void *p, const void *end, struct gss_cl_ctx *ctx, struct
p = ERR_PTR(ret);
goto err;
}
- dprintk("RPC: %s Success. gc_expiry %lu now %lu timeout %u\n",
- __func__, ctx->gc_expiry, now, timeout);
- return q;
+
+ /* is there any trailing data? */
+ if (q == end) {
+ p = q;
+ goto done;
+ }
+
+ /* pull in acceptor name (if there is one) */
+ p = simple_get_netobj(q, end, &ctx->gc_acceptor);
+ if (IS_ERR(p))
+ goto err;
+done:
+ dprintk("RPC: %s Success. gc_expiry %lu now %lu timeout %u acceptor %.*s\n",
+ __func__, ctx->gc_expiry, now, timeout, ctx->gc_acceptor.len,
+ ctx->gc_acceptor.data);
+ return p;
err:
dprintk("RPC: %s returns error %ld\n", __func__, -PTR_ERR(p));
return p;
@@ -1194,13 +1208,13 @@ gss_destroying_context(struct rpc_cred *cred)
{
struct gss_cred *gss_cred = container_of(cred, struct gss_cred, gc_base);
struct gss_auth *gss_auth = container_of(cred->cr_auth, struct gss_auth, rpc_auth);
+ struct gss_cl_ctx *ctx = rcu_dereference_protected(gss_cred->gc_ctx, 1);
struct rpc_task *task;
- if (gss_cred->gc_ctx == NULL ||
- test_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags) == 0)
+ if (test_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags) == 0)
return 0;
- gss_cred->gc_ctx->gc_proc = RPC_GSS_PROC_DESTROY;
+ ctx->gc_proc = RPC_GSS_PROC_DESTROY;
cred->cr_ops = &gss_nullops;
/* Take a reference to ensure the cred will be destroyed either
@@ -1225,6 +1239,7 @@ gss_do_free_ctx(struct gss_cl_ctx *ctx)
gss_delete_sec_context(&ctx->gc_gss_ctx);
kfree(ctx->gc_wire_ctx.data);
+ kfree(ctx->gc_acceptor.data);
kfree(ctx);
}
@@ -1260,7 +1275,7 @@ gss_destroy_nullcred(struct rpc_cred *cred)
{
struct gss_cred *gss_cred = container_of(cred, struct gss_cred, gc_base);
struct gss_auth *gss_auth = container_of(cred->cr_auth, struct gss_auth, rpc_auth);
- struct gss_cl_ctx *ctx = gss_cred->gc_ctx;
+ struct gss_cl_ctx *ctx = rcu_dereference_protected(gss_cred->gc_ctx, 1);
RCU_INIT_POINTER(gss_cred->gc_ctx, NULL);
call_rcu(&cred->cr_rcu, gss_free_cred_callback);
@@ -1332,6 +1347,36 @@ gss_cred_init(struct rpc_auth *auth, struct rpc_cred *cred)
return err;
}
+static char *
+gss_stringify_acceptor(struct rpc_cred *cred)
+{
+ char *string = NULL;
+ struct gss_cred *gss_cred = container_of(cred, struct gss_cred, gc_base);
+ struct gss_cl_ctx *ctx;
+ struct xdr_netobj *acceptor;
+
+ rcu_read_lock();
+ ctx = rcu_dereference(gss_cred->gc_ctx);
+ if (!ctx)
+ goto out;
+
+ acceptor = &ctx->gc_acceptor;
+
+ /* no point if there's no string */
+ if (!acceptor->len)
+ goto out;
+
+ string = kmalloc(acceptor->len + 1, GFP_KERNEL);
+ if (!string)
+ goto out;
+
+ memcpy(string, acceptor->data, acceptor->len);
+ string[acceptor->len] = '\0';
+out:
+ rcu_read_unlock();
+ return string;
+}
+
/*
* Returns -EACCES if GSS context is NULL or will expire within the
 * timeout (milliseconds)
@@ -1340,15 +1385,16 @@ static int
gss_key_timeout(struct rpc_cred *rc)
{
struct gss_cred *gss_cred = container_of(rc, struct gss_cred, gc_base);
+ struct gss_cl_ctx *ctx;
unsigned long now = jiffies;
unsigned long expire;
- if (gss_cred->gc_ctx == NULL)
- return -EACCES;
-
- expire = gss_cred->gc_ctx->gc_expiry - (gss_key_expire_timeo * HZ);
-
- if (time_after(now, expire))
+ rcu_read_lock();
+ ctx = rcu_dereference(gss_cred->gc_ctx);
+ if (ctx)
+ expire = ctx->gc_expiry - (gss_key_expire_timeo * HZ);
+ rcu_read_unlock();
+ if (!ctx || time_after(now, expire))
return -EACCES;
return 0;
}
@@ -1357,13 +1403,19 @@ static int
gss_match(struct auth_cred *acred, struct rpc_cred *rc, int flags)
{
struct gss_cred *gss_cred = container_of(rc, struct gss_cred, gc_base);
+ struct gss_cl_ctx *ctx;
int ret;
if (test_bit(RPCAUTH_CRED_NEW, &rc->cr_flags))
goto out;
/* Don't match with creds that have expired. */
- if (time_after(jiffies, gss_cred->gc_ctx->gc_expiry))
+ rcu_read_lock();
+ ctx = rcu_dereference(gss_cred->gc_ctx);
+ if (!ctx || time_after(jiffies, ctx->gc_expiry)) {
+ rcu_read_unlock();
return 0;
+ }
+ rcu_read_unlock();
if (!test_bit(RPCAUTH_CRED_UPTODATE, &rc->cr_flags))
return 0;
out:
@@ -1909,29 +1961,31 @@ static const struct rpc_authops authgss_ops = {
};
static const struct rpc_credops gss_credops = {
- .cr_name = "AUTH_GSS",
- .crdestroy = gss_destroy_cred,
- .cr_init = gss_cred_init,
- .crbind = rpcauth_generic_bind_cred,
- .crmatch = gss_match,
- .crmarshal = gss_marshal,
- .crrefresh = gss_refresh,
- .crvalidate = gss_validate,
- .crwrap_req = gss_wrap_req,
- .crunwrap_resp = gss_unwrap_resp,
- .crkey_timeout = gss_key_timeout,
+ .cr_name = "AUTH_GSS",
+ .crdestroy = gss_destroy_cred,
+ .cr_init = gss_cred_init,
+ .crbind = rpcauth_generic_bind_cred,
+ .crmatch = gss_match,
+ .crmarshal = gss_marshal,
+ .crrefresh = gss_refresh,
+ .crvalidate = gss_validate,
+ .crwrap_req = gss_wrap_req,
+ .crunwrap_resp = gss_unwrap_resp,
+ .crkey_timeout = gss_key_timeout,
+ .crstringify_acceptor = gss_stringify_acceptor,
};
static const struct rpc_credops gss_nullops = {
- .cr_name = "AUTH_GSS",
- .crdestroy = gss_destroy_nullcred,
- .crbind = rpcauth_generic_bind_cred,
- .crmatch = gss_match,
- .crmarshal = gss_marshal,
- .crrefresh = gss_refresh_null,
- .crvalidate = gss_validate,
- .crwrap_req = gss_wrap_req,
- .crunwrap_resp = gss_unwrap_resp,
+ .cr_name = "AUTH_GSS",
+ .crdestroy = gss_destroy_nullcred,
+ .crbind = rpcauth_generic_bind_cred,
+ .crmatch = gss_match,
+ .crmarshal = gss_marshal,
+ .crrefresh = gss_refresh_null,
+ .crvalidate = gss_validate,
+ .crwrap_req = gss_wrap_req,
+ .crunwrap_resp = gss_unwrap_resp,
+ .crstringify_acceptor = gss_stringify_acceptor,
};
static const struct rpc_pipe_ops gss_upcall_ops_v0 = {
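A hedged usage note (assumed, not shown in this diff): the acceptor string handed out through rpcauth_stringify_acceptor() is the kmalloc'd, NUL-terminated copy built by gss_stringify_acceptor() above, so the consumer owns it and must kfree() it. The helper below is hypothetical.

static void log_gss_acceptor(struct rpc_cred *cred)
{
	char *acceptor = rpcauth_stringify_acceptor(cred);

	if (acceptor) {
		pr_debug("GSS acceptor: %s\n", acceptor);
		kfree(acceptor);	/* caller owns the copy */
	}
}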
diff --git a/net/sunrpc/auth_gss/gss_krb5_crypto.c b/net/sunrpc/auth_gss/gss_krb5_crypto.c
index 0f43e894bc0a..f5ed9f6ece06 100644
--- a/net/sunrpc/auth_gss/gss_krb5_crypto.c
+++ b/net/sunrpc/auth_gss/gss_krb5_crypto.c
@@ -641,7 +641,7 @@ out:
u32
gss_krb5_aes_encrypt(struct krb5_ctx *kctx, u32 offset,
- struct xdr_buf *buf, int ec, struct page **pages)
+ struct xdr_buf *buf, struct page **pages)
{
u32 err;
struct xdr_netobj hmac;
@@ -684,13 +684,8 @@ gss_krb5_aes_encrypt(struct krb5_ctx *kctx, u32 offset,
ecptr = buf->tail[0].iov_base;
}
- memset(ecptr, 'X', ec);
- buf->tail[0].iov_len += ec;
- buf->len += ec;
-
/* copy plaintext gss token header after filler (if any) */
- memcpy(ecptr + ec, buf->head[0].iov_base + offset,
- GSS_KRB5_TOK_HDR_LEN);
+ memcpy(ecptr, buf->head[0].iov_base + offset, GSS_KRB5_TOK_HDR_LEN);
buf->tail[0].iov_len += GSS_KRB5_TOK_HDR_LEN;
buf->len += GSS_KRB5_TOK_HDR_LEN;
diff --git a/net/sunrpc/auth_gss/gss_krb5_seal.c b/net/sunrpc/auth_gss/gss_krb5_seal.c
index 62ae3273186c..42768e5c3994 100644
--- a/net/sunrpc/auth_gss/gss_krb5_seal.c
+++ b/net/sunrpc/auth_gss/gss_krb5_seal.c
@@ -70,31 +70,37 @@
DEFINE_SPINLOCK(krb5_seq_lock);
-static char *
+static void *
setup_token(struct krb5_ctx *ctx, struct xdr_netobj *token)
{
- __be16 *ptr, *krb5_hdr;
+ u16 *ptr;
+ void *krb5_hdr;
int body_size = GSS_KRB5_TOK_HDR_LEN + ctx->gk5e->cksumlength;
token->len = g_token_size(&ctx->mech_used, body_size);
- ptr = (__be16 *)token->data;
+ ptr = (u16 *)token->data;
g_make_token_header(&ctx->mech_used, body_size, (unsigned char **)&ptr);
/* ptr now at start of header described in rfc 1964, section 1.2.1: */
krb5_hdr = ptr;
*ptr++ = KG_TOK_MIC_MSG;
- *ptr++ = cpu_to_le16(ctx->gk5e->signalg);
+ /*
+ * signalg is stored as if it were converted from LE to host endian, even
+ * though it's an opaque pair of bytes according to the RFC.
+ */
+ *ptr++ = (__force u16)cpu_to_le16(ctx->gk5e->signalg);
*ptr++ = SEAL_ALG_NONE;
- *ptr++ = 0xffff;
+ *ptr = 0xffff;
- return (char *)krb5_hdr;
+ return krb5_hdr;
}
static void *
setup_token_v2(struct krb5_ctx *ctx, struct xdr_netobj *token)
{
- __be16 *ptr, *krb5_hdr;
+ u16 *ptr;
+ void *krb5_hdr;
u8 *p, flags = 0x00;
if ((ctx->flags & KRB5_CTX_FLAG_INITIATOR) == 0)
@@ -104,15 +110,15 @@ setup_token_v2(struct krb5_ctx *ctx, struct xdr_netobj *token)
/* Per rfc 4121, sec 4.2.6.1, there is no header,
* just start the token */
- krb5_hdr = ptr = (__be16 *)token->data;
+ krb5_hdr = ptr = (u16 *)token->data;
*ptr++ = KG2_TOK_MIC;
p = (u8 *)ptr;
*p++ = flags;
*p++ = 0xff;
- ptr = (__be16 *)p;
- *ptr++ = 0xffff;
+ ptr = (u16 *)p;
*ptr++ = 0xffff;
+ *ptr = 0xffff;
token->len = GSS_KRB5_TOK_HDR_LEN + ctx->gk5e->cksumlength;
return krb5_hdr;
@@ -181,7 +187,7 @@ gss_get_mic_v2(struct krb5_ctx *ctx, struct xdr_buf *text,
spin_lock(&krb5_seq_lock);
seq_send = ctx->seq_send64++;
spin_unlock(&krb5_seq_lock);
- *((u64 *)(krb5_hdr + 8)) = cpu_to_be64(seq_send);
+ *((__be64 *)(krb5_hdr + 8)) = cpu_to_be64(seq_send);
if (ctx->initiate) {
cksumkey = ctx->initiator_sign;
diff --git a/net/sunrpc/auth_gss/gss_krb5_wrap.c b/net/sunrpc/auth_gss/gss_krb5_wrap.c
index 42560e55d978..4b614c604fe0 100644
--- a/net/sunrpc/auth_gss/gss_krb5_wrap.c
+++ b/net/sunrpc/auth_gss/gss_krb5_wrap.c
@@ -201,9 +201,15 @@ gss_wrap_kerberos_v1(struct krb5_ctx *kctx, int offset,
msg_start = ptr + GSS_KRB5_TOK_HDR_LEN + kctx->gk5e->cksumlength;
- *(__be16 *)(ptr + 2) = cpu_to_le16(kctx->gk5e->signalg);
- memset(ptr + 4, 0xff, 4);
- *(__be16 *)(ptr + 4) = cpu_to_le16(kctx->gk5e->sealalg);
+ /*
+ * signalg and sealalg are stored as if they were converted from LE
+ * to host endian, even though they're opaque pairs of bytes according
+ * to the RFC.
+ */
+ *(__le16 *)(ptr + 2) = cpu_to_le16(kctx->gk5e->signalg);
+ *(__le16 *)(ptr + 4) = cpu_to_le16(kctx->gk5e->sealalg);
+ ptr[6] = 0xff;
+ ptr[7] = 0xff;
gss_krb5_make_confounder(msg_start, conflen);
@@ -438,7 +444,7 @@ gss_wrap_kerberos_v2(struct krb5_ctx *kctx, u32 offset,
u8 *ptr, *plainhdr;
s32 now;
u8 flags = 0x00;
- __be16 *be16ptr, ec = 0;
+ __be16 *be16ptr;
__be64 *be64ptr;
u32 err;
@@ -468,16 +474,16 @@ gss_wrap_kerberos_v2(struct krb5_ctx *kctx, u32 offset,
be16ptr = (__be16 *)ptr;
blocksize = crypto_blkcipher_blocksize(kctx->acceptor_enc);
- *be16ptr++ = cpu_to_be16(ec);
+ *be16ptr++ = 0;
/* "inner" token header always uses 0 for RRC */
- *be16ptr++ = cpu_to_be16(0);
+ *be16ptr++ = 0;
be64ptr = (__be64 *)be16ptr;
spin_lock(&krb5_seq_lock);
*be64ptr = cpu_to_be64(kctx->seq_send64++);
spin_unlock(&krb5_seq_lock);
- err = (*kctx->gk5e->encrypt_v2)(kctx, offset, buf, ec, pages);
+ err = (*kctx->gk5e->encrypt_v2)(kctx, offset, buf, pages);
if (err)
return err;
diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c
index 4ce5eccec1f6..c548ab213f76 100644
--- a/net/sunrpc/auth_gss/svcauth_gss.c
+++ b/net/sunrpc/auth_gss/svcauth_gss.c
@@ -886,7 +886,7 @@ unwrap_priv_data(struct svc_rqst *rqstp, struct xdr_buf *buf, u32 seq, struct gs
u32 priv_len, maj_stat;
int pad, saved_len, remaining_len, offset;
- rqstp->rq_splice_ok = 0;
+ rqstp->rq_splice_ok = false;
priv_len = svc_getnl(&buf->head[0]);
if (rqstp->rq_deferred) {
diff --git a/net/sunrpc/auth_null.c b/net/sunrpc/auth_null.c
index f0ebe07978a2..712c123e04e9 100644
--- a/net/sunrpc/auth_null.c
+++ b/net/sunrpc/auth_null.c
@@ -35,6 +35,8 @@ nul_destroy(struct rpc_auth *auth)
static struct rpc_cred *
nul_lookup_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags)
{
+ if (flags & RPCAUTH_LOOKUP_RCU)
+ return &null_cred;
return get_rpccred(&null_cred);
}
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 2e6ab10734f6..488ddeed9363 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -1746,6 +1746,7 @@ call_bind_status(struct rpc_task *task)
case -EHOSTDOWN:
case -EHOSTUNREACH:
case -ENETUNREACH:
+ case -ENOBUFS:
case -EPIPE:
dprintk("RPC: %5u remote rpcbind unreachable: %d\n",
task->tk_pid, task->tk_status);
@@ -1812,6 +1813,8 @@ call_connect_status(struct rpc_task *task)
case -ECONNABORTED:
case -ENETUNREACH:
case -EHOSTUNREACH:
+ case -ENOBUFS:
+ case -EPIPE:
if (RPC_IS_SOFTCONN(task))
break;
/* retry with existing socket, after a delay */
@@ -1918,6 +1921,7 @@ call_transmit_status(struct rpc_task *task)
case -ECONNRESET:
case -ECONNABORTED:
case -ENOTCONN:
+ case -ENOBUFS:
case -EPIPE:
rpc_task_force_reencode(task);
}
@@ -2034,6 +2038,7 @@ call_status(struct rpc_task *task)
case -ECONNRESET:
case -ECONNABORTED:
rpc_force_rebind(clnt);
+ case -ENOBUFS:
rpc_delay(task, 3*HZ);
case -EPIPE:
case -ENOTCONN:
diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c
index b18554898562..2d12b76b5a64 100644
--- a/net/sunrpc/rpc_pipe.c
+++ b/net/sunrpc/rpc_pipe.c
@@ -195,7 +195,7 @@ static struct inode *
rpc_alloc_inode(struct super_block *sb)
{
struct rpc_inode *rpci;
- rpci = (struct rpc_inode *)kmem_cache_alloc(rpc_inode_cachep, GFP_KERNEL);
+ rpci = kmem_cache_alloc(rpc_inode_cachep, GFP_KERNEL);
if (!rpci)
return NULL;
return &rpci->vfs_inode;
diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index 5de6801cd924..1db5007ddbce 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -1086,9 +1086,9 @@ svc_process_common(struct svc_rqst *rqstp, struct kvec *argv, struct kvec *resv)
goto err_short_len;
/* Will be turned off only in gss privacy case: */
- rqstp->rq_splice_ok = 1;
+ rqstp->rq_splice_ok = true;
/* Will be turned off only when NFSv4 Sessions are used */
- rqstp->rq_usedeferral = 1;
+ rqstp->rq_usedeferral = true;
rqstp->rq_dropme = false;
/* Setup reply header */
diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c
index b4737fbdec13..6666c6745858 100644
--- a/net/sunrpc/svc_xprt.c
+++ b/net/sunrpc/svc_xprt.c
@@ -23,6 +23,7 @@ static int svc_deferred_recv(struct svc_rqst *rqstp);
static struct cache_deferred_req *svc_defer(struct cache_req *req);
static void svc_age_temp_xprts(unsigned long closure);
static void svc_delete_xprt(struct svc_xprt *xprt);
+static void svc_xprt_do_enqueue(struct svc_xprt *xprt);
/* apparently the "standard" is that clients close
* idle connections after 5 minutes, servers after
@@ -222,11 +223,12 @@ static void svc_xprt_received(struct svc_xprt *xprt)
if (!test_bit(XPT_BUSY, &xprt->xpt_flags))
return;
/* As soon as we clear busy, the xprt could be closed and
- * 'put', so we need a reference to call svc_xprt_enqueue with:
+ * 'put', so we need a reference to call svc_xprt_do_enqueue with:
*/
svc_xprt_get(xprt);
+ smp_mb__before_atomic();
clear_bit(XPT_BUSY, &xprt->xpt_flags);
- svc_xprt_enqueue(xprt);
+ svc_xprt_do_enqueue(xprt);
svc_xprt_put(xprt);
}
@@ -335,12 +337,7 @@ static bool svc_xprt_has_something_to_do(struct svc_xprt *xprt)
return false;
}
-/*
- * Queue up a transport with data pending. If there are idle nfsd
- * processes, wake 'em up.
- *
- */
-void svc_xprt_enqueue(struct svc_xprt *xprt)
+static void svc_xprt_do_enqueue(struct svc_xprt *xprt)
{
struct svc_pool *pool;
struct svc_rqst *rqstp;
@@ -398,6 +395,18 @@ void svc_xprt_enqueue(struct svc_xprt *xprt)
out_unlock:
spin_unlock_bh(&pool->sp_lock);
}
+
+/*
+ * Queue up a transport with data pending. If there are idle nfsd
+ * processes, wake 'em up.
+ *
+ */
+void svc_xprt_enqueue(struct svc_xprt *xprt)
+{
+ if (test_bit(XPT_BUSY, &xprt->xpt_flags))
+ return;
+ svc_xprt_do_enqueue(xprt);
+}
EXPORT_SYMBOL_GPL(svc_xprt_enqueue);
/*
@@ -439,6 +448,8 @@ void svc_reserve(struct svc_rqst *rqstp, int space)
atomic_sub((rqstp->rq_reserved - space), &xprt->xpt_reserved);
rqstp->rq_reserved = space;
+ if (xprt->xpt_ops->xpo_adjust_wspace)
+ xprt->xpt_ops->xpo_adjust_wspace(xprt);
svc_xprt_enqueue(xprt);
}
}
diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index b507cd327d9b..c24a8ff33f8f 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -446,15 +446,43 @@ static void svc_write_space(struct sock *sk)
}
}
+static int svc_tcp_has_wspace(struct svc_xprt *xprt)
+{
+ struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt);
+ struct svc_serv *serv = svsk->sk_xprt.xpt_server;
+ int required;
+
+ if (test_bit(XPT_LISTENER, &xprt->xpt_flags))
+ return 1;
+ required = atomic_read(&xprt->xpt_reserved) + serv->sv_max_mesg;
+ if (sk_stream_wspace(svsk->sk_sk) >= required ||
+ (sk_stream_min_wspace(svsk->sk_sk) == 0 &&
+ atomic_read(&xprt->xpt_reserved) == 0))
+ return 1;
+ set_bit(SOCK_NOSPACE, &svsk->sk_sock->flags);
+ return 0;
+}
+
static void svc_tcp_write_space(struct sock *sk)
{
+ struct svc_sock *svsk = (struct svc_sock *)(sk->sk_user_data);
struct socket *sock = sk->sk_socket;
- if (sk_stream_is_writeable(sk) && sock)
+ if (!sk_stream_is_writeable(sk) || !sock)
+ return;
+ if (!svsk || svc_tcp_has_wspace(&svsk->sk_xprt))
clear_bit(SOCK_NOSPACE, &sock->flags);
svc_write_space(sk);
}
+static void svc_tcp_adjust_wspace(struct svc_xprt *xprt)
+{
+ struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt);
+
+ if (svc_tcp_has_wspace(xprt))
+ clear_bit(SOCK_NOSPACE, &svsk->sk_sock->flags);
+}
+
/*
* See net/ipv6/ip_sockglue.c : ip_cmsg_recv_pktinfo
*/
@@ -692,6 +720,7 @@ static struct svc_xprt_class svc_udp_class = {
.xcl_owner = THIS_MODULE,
.xcl_ops = &svc_udp_ops,
.xcl_max_payload = RPCSVC_MAXPAYLOAD_UDP,
+ .xcl_ident = XPRT_TRANSPORT_UDP,
};
static void svc_udp_init(struct svc_sock *svsk, struct svc_serv *serv)
@@ -1197,23 +1226,6 @@ static void svc_tcp_prep_reply_hdr(struct svc_rqst *rqstp)
svc_putnl(resv, 0);
}
-static int svc_tcp_has_wspace(struct svc_xprt *xprt)
-{
- struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt);
- struct svc_serv *serv = svsk->sk_xprt.xpt_server;
- int required;
-
- if (test_bit(XPT_LISTENER, &xprt->xpt_flags))
- return 1;
- required = atomic_read(&xprt->xpt_reserved) + serv->sv_max_mesg;
- if (sk_stream_wspace(svsk->sk_sk) >= required ||
- (sk_stream_min_wspace(svsk->sk_sk) == 0 &&
- atomic_read(&xprt->xpt_reserved) == 0))
- return 1;
- set_bit(SOCK_NOSPACE, &svsk->sk_sock->flags);
- return 0;
-}
-
static struct svc_xprt *svc_tcp_create(struct svc_serv *serv,
struct net *net,
struct sockaddr *sa, int salen,
@@ -1285,6 +1297,7 @@ static struct svc_xprt_ops svc_tcp_ops = {
.xpo_has_wspace = svc_tcp_has_wspace,
.xpo_accept = svc_tcp_accept,
.xpo_secure_port = svc_sock_secure_port,
+ .xpo_adjust_wspace = svc_tcp_adjust_wspace,
};
static struct svc_xprt_class svc_tcp_class = {
@@ -1292,6 +1305,7 @@ static struct svc_xprt_class svc_tcp_class = {
.xcl_owner = THIS_MODULE,
.xcl_ops = &svc_tcp_ops,
.xcl_max_payload = RPCSVC_MAXPAYLOAD_TCP,
+ .xcl_ident = XPRT_TRANSPORT_TCP,
};
void svc_init_xprt_sock(void)
diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c
index 23fb4e75e245..290af97bf6f9 100644
--- a/net/sunrpc/xdr.c
+++ b/net/sunrpc/xdr.c
@@ -509,7 +509,8 @@ void xdr_commit_encode(struct xdr_stream *xdr)
}
EXPORT_SYMBOL_GPL(xdr_commit_encode);
-__be32 *xdr_get_next_encode_buffer(struct xdr_stream *xdr, size_t nbytes)
+static __be32 *xdr_get_next_encode_buffer(struct xdr_stream *xdr,
+ size_t nbytes)
{
static __be32 *p;
int space_left;
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index c3b2b3369e52..56e4e150e80e 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -744,6 +744,7 @@ static void xprt_connect_status(struct rpc_task *task)
case -ECONNABORTED:
case -ENETUNREACH:
case -EHOSTUNREACH:
+ case -EPIPE:
case -EAGAIN:
dprintk("RPC: %5u xprt_connect_status: retrying\n", task->tk_pid);
break;
@@ -1306,7 +1307,7 @@ struct rpc_xprt *xprt_create_transport(struct xprt_create *args)
}
}
spin_unlock(&xprt_list_lock);
- printk(KERN_ERR "RPC: transport (%d) not supported\n", args->ident);
+ dprintk("RPC: transport (%d) not supported\n", args->ident);
return ERR_PTR(-EIO);
found:
diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index 693966d3f33b..6166c985fe24 100644
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -53,14 +53,6 @@
# define RPCDBG_FACILITY RPCDBG_TRANS
#endif
-enum rpcrdma_chunktype {
- rpcrdma_noch = 0,
- rpcrdma_readch,
- rpcrdma_areadch,
- rpcrdma_writech,
- rpcrdma_replych
-};
-
#ifdef RPC_DEBUG
static const char transfertypes[][12] = {
"pure inline", /* no chunks */
@@ -279,13 +271,37 @@ rpcrdma_create_chunks(struct rpc_rqst *rqst, struct xdr_buf *target,
return (unsigned char *)iptr - (unsigned char *)headerp;
out:
- for (pos = 0; nchunks--;)
- pos += rpcrdma_deregister_external(
- &req->rl_segments[pos], r_xprt);
+ if (r_xprt->rx_ia.ri_memreg_strategy != RPCRDMA_FRMR) {
+ for (pos = 0; nchunks--;)
+ pos += rpcrdma_deregister_external(
+ &req->rl_segments[pos], r_xprt);
+ }
return n;
}
/*
+ * Marshal chunks. This routine returns the header length
+ * consumed by marshaling.
+ *
+ * Returns positive RPC/RDMA header size, or negative errno.
+ */
+
+ssize_t
+rpcrdma_marshal_chunks(struct rpc_rqst *rqst, ssize_t result)
+{
+ struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
+ struct rpcrdma_msg *headerp = (struct rpcrdma_msg *)req->rl_base;
+
+ if (req->rl_rtype != rpcrdma_noch)
+ result = rpcrdma_create_chunks(rqst, &rqst->rq_snd_buf,
+ headerp, req->rl_rtype);
+ else if (req->rl_wtype != rpcrdma_noch)
+ result = rpcrdma_create_chunks(rqst, &rqst->rq_rcv_buf,
+ headerp, req->rl_wtype);
+ return result;
+}
+
+/*
* Copy write data inline.
* This function is used for "small" requests. Data which is passed
* to RPC via iovecs (or page list) is copied directly into the
@@ -377,7 +393,6 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
char *base;
size_t rpclen, padlen;
ssize_t hdrlen;
- enum rpcrdma_chunktype rtype, wtype;
struct rpcrdma_msg *headerp;
/*
@@ -415,13 +430,13 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
* into pages; otherwise use reply chunks.
*/
if (rqst->rq_rcv_buf.buflen <= RPCRDMA_INLINE_READ_THRESHOLD(rqst))
- wtype = rpcrdma_noch;
+ req->rl_wtype = rpcrdma_noch;
else if (rqst->rq_rcv_buf.page_len == 0)
- wtype = rpcrdma_replych;
+ req->rl_wtype = rpcrdma_replych;
else if (rqst->rq_rcv_buf.flags & XDRBUF_READ)
- wtype = rpcrdma_writech;
+ req->rl_wtype = rpcrdma_writech;
else
- wtype = rpcrdma_replych;
+ req->rl_wtype = rpcrdma_replych;
/*
* Chunks needed for arguments?
@@ -438,16 +453,16 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
* TBD check NFSv4 setacl
*/
if (rqst->rq_snd_buf.len <= RPCRDMA_INLINE_WRITE_THRESHOLD(rqst))
- rtype = rpcrdma_noch;
+ req->rl_rtype = rpcrdma_noch;
else if (rqst->rq_snd_buf.page_len == 0)
- rtype = rpcrdma_areadch;
+ req->rl_rtype = rpcrdma_areadch;
else
- rtype = rpcrdma_readch;
+ req->rl_rtype = rpcrdma_readch;
/* The following simplification is not true forever */
- if (rtype != rpcrdma_noch && wtype == rpcrdma_replych)
- wtype = rpcrdma_noch;
- if (rtype != rpcrdma_noch && wtype != rpcrdma_noch) {
+ if (req->rl_rtype != rpcrdma_noch && req->rl_wtype == rpcrdma_replych)
+ req->rl_wtype = rpcrdma_noch;
+ if (req->rl_rtype != rpcrdma_noch && req->rl_wtype != rpcrdma_noch) {
dprintk("RPC: %s: cannot marshal multiple chunk lists\n",
__func__);
return -EIO;
@@ -461,7 +476,7 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
* When padding is in use and applies to the transfer, insert
* it and change the message type.
*/
- if (rtype == rpcrdma_noch) {
+ if (req->rl_rtype == rpcrdma_noch) {
padlen = rpcrdma_inline_pullup(rqst,
RPCRDMA_INLINE_PAD_VALUE(rqst));
@@ -476,7 +491,7 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
headerp->rm_body.rm_padded.rm_pempty[1] = xdr_zero;
headerp->rm_body.rm_padded.rm_pempty[2] = xdr_zero;
hdrlen += 2 * sizeof(u32); /* extra words in padhdr */
- if (wtype != rpcrdma_noch) {
+ if (req->rl_wtype != rpcrdma_noch) {
dprintk("RPC: %s: invalid chunk list\n",
__func__);
return -EIO;
@@ -497,30 +512,18 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
* on receive. Therefore, we request a reply chunk
* for non-writes wherever feasible and efficient.
*/
- if (wtype == rpcrdma_noch)
- wtype = rpcrdma_replych;
+ if (req->rl_wtype == rpcrdma_noch)
+ req->rl_wtype = rpcrdma_replych;
}
}
- /*
- * Marshal chunks. This routine will return the header length
- * consumed by marshaling.
- */
- if (rtype != rpcrdma_noch) {
- hdrlen = rpcrdma_create_chunks(rqst,
- &rqst->rq_snd_buf, headerp, rtype);
- wtype = rtype; /* simplify dprintk */
-
- } else if (wtype != rpcrdma_noch) {
- hdrlen = rpcrdma_create_chunks(rqst,
- &rqst->rq_rcv_buf, headerp, wtype);
- }
+ hdrlen = rpcrdma_marshal_chunks(rqst, hdrlen);
if (hdrlen < 0)
return hdrlen;
dprintk("RPC: %s: %s: hdrlen %zd rpclen %zd padlen %zd"
" headerp 0x%p base 0x%p lkey 0x%x\n",
- __func__, transfertypes[wtype], hdrlen, rpclen, padlen,
+ __func__, transfertypes[req->rl_wtype], hdrlen, rpclen, padlen,
headerp, base, req->rl_iov.lkey);
/*
diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
index 8f92a61ee2df..e0110270d650 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
@@ -43,6 +43,7 @@
#include <linux/sunrpc/debug.h>
#include <linux/sunrpc/rpc_rdma.h>
#include <linux/spinlock.h>
+#include <linux/highmem.h>
#include <asm/unaligned.h>
#include <rdma/ib_verbs.h>
#include <rdma/rdma_cm.h>
@@ -435,6 +436,32 @@ static int rdma_read_chunks(struct svcxprt_rdma *xprt,
return ret;
}
+/*
+ * To avoid a separate RDMA READ just for a handful of zero bytes,
+ * RFC 5666 section 3.7 allows the client to omit the XDR zero pad
+ * in chunk lists.
+ */
+static void
+rdma_fix_xdr_pad(struct xdr_buf *buf)
+{
+ unsigned int page_len = buf->page_len;
+ unsigned int size = (XDR_QUADLEN(page_len) << 2) - page_len;
+ unsigned int offset, pg_no;
+ char *p;
+
+ if (size == 0)
+ return;
+
+ pg_no = page_len >> PAGE_SHIFT;
+ offset = page_len & ~PAGE_MASK;
+ p = page_address(buf->pages[pg_no]);
+ memset(p + offset, 0, size);
+
+ buf->page_len += size;
+ buf->buflen += size;
+ buf->len += size;
+}
+
static int rdma_read_complete(struct svc_rqst *rqstp,
struct svc_rdma_op_ctxt *head)
{
@@ -449,6 +476,7 @@ static int rdma_read_complete(struct svc_rqst *rqstp,
rqstp->rq_pages[page_no] = head->pages[page_no];
}
/* Point rq_arg.pages past header */
+ rdma_fix_xdr_pad(&head->arg);
rqstp->rq_arg.pages = &rqstp->rq_pages[head->hdr_count];
rqstp->rq_arg.page_len = head->arg.page_len;
rqstp->rq_arg.page_base = head->arg.page_base;
diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
index 49fd21a5c215..9f1b50689c0f 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
@@ -192,6 +192,8 @@ static int send_write(struct svcxprt_rdma *xprt, struct svc_rqst *rqstp,
xdr_sge_no++;
BUG_ON(xdr_sge_no > vec->count);
bc -= sge_bytes;
+ if (sge_no == xprt->sc_max_sge)
+ break;
}
/* Prepare WRITE WR */
@@ -209,7 +211,7 @@ static int send_write(struct svcxprt_rdma *xprt, struct svc_rqst *rqstp,
atomic_inc(&rdma_stat_write);
if (svc_rdma_send(xprt, &write_wr))
goto err;
- return 0;
+ return write_len - bc;
err:
svc_rdma_unmap_dma(ctxt);
svc_rdma_put_context(ctxt, 0);
@@ -225,7 +227,6 @@ static int send_write_chunks(struct svcxprt_rdma *xprt,
{
u32 xfer_len = rqstp->rq_res.page_len + rqstp->rq_res.tail[0].iov_len;
int write_len;
- int max_write;
u32 xdr_off;
int chunk_off;
int chunk_no;
@@ -239,8 +240,6 @@ static int send_write_chunks(struct svcxprt_rdma *xprt,
res_ary = (struct rpcrdma_write_array *)
&rdma_resp->rm_body.rm_chunks[1];
- max_write = xprt->sc_max_sge * PAGE_SIZE;
-
/* Write chunks start at the pagelist */
for (xdr_off = rqstp->rq_res.head[0].iov_len, chunk_no = 0;
xfer_len && chunk_no < arg_ary->wc_nchunks;
@@ -260,23 +259,21 @@ static int send_write_chunks(struct svcxprt_rdma *xprt,
write_len);
chunk_off = 0;
while (write_len) {
- int this_write;
- this_write = min(write_len, max_write);
ret = send_write(xprt, rqstp,
ntohl(arg_ch->rs_handle),
rs_offset + chunk_off,
xdr_off,
- this_write,
+ write_len,
vec);
- if (ret) {
+ if (ret <= 0) {
dprintk("svcrdma: RDMA_WRITE failed, ret=%d\n",
ret);
return -EIO;
}
- chunk_off += this_write;
- xdr_off += this_write;
- xfer_len -= this_write;
- write_len -= this_write;
+ chunk_off += ret;
+ xdr_off += ret;
+ xfer_len -= ret;
+ write_len -= ret;
}
}
/* Update the req with the number of chunks actually used */
@@ -293,7 +290,6 @@ static int send_reply_chunks(struct svcxprt_rdma *xprt,
{
u32 xfer_len = rqstp->rq_res.len;
int write_len;
- int max_write;
u32 xdr_off;
int chunk_no;
int chunk_off;
@@ -311,8 +307,6 @@ static int send_reply_chunks(struct svcxprt_rdma *xprt,
res_ary = (struct rpcrdma_write_array *)
&rdma_resp->rm_body.rm_chunks[2];
- max_write = xprt->sc_max_sge * PAGE_SIZE;
-
/* xdr offset starts at RPC message */
nchunks = ntohl(arg_ary->wc_nchunks);
for (xdr_off = 0, chunk_no = 0;
@@ -330,24 +324,21 @@ static int send_reply_chunks(struct svcxprt_rdma *xprt,
write_len);
chunk_off = 0;
while (write_len) {
- int this_write;
-
- this_write = min(write_len, max_write);
ret = send_write(xprt, rqstp,
ntohl(ch->rs_handle),
rs_offset + chunk_off,
xdr_off,
- this_write,
+ write_len,
vec);
- if (ret) {
+ if (ret <= 0) {
dprintk("svcrdma: RDMA_WRITE failed, ret=%d\n",
ret);
return -EIO;
}
- chunk_off += this_write;
- xdr_off += this_write;
- xfer_len -= this_write;
- write_len -= this_write;
+ chunk_off += ret;
+ xdr_off += ret;
+ xfer_len -= ret;
+ write_len -= ret;
}
}
/* Update the req with the number of chunks actually used */
diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
index e7323fbbd348..374feb44afea 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
@@ -92,6 +92,7 @@ struct svc_xprt_class svc_rdma_class = {
.xcl_owner = THIS_MODULE,
.xcl_ops = &svc_rdma_ops,
.xcl_max_payload = RPCSVC_MAXPAYLOAD_TCP,
+ .xcl_ident = XPRT_TRANSPORT_RDMA,
};
struct svc_rdma_op_ctxt *svc_rdma_get_context(struct svcxprt_rdma *xprt)
@@ -942,23 +943,8 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
ret = rdma_create_qp(newxprt->sc_cm_id, newxprt->sc_pd, &qp_attr);
if (ret) {
- /*
- * XXX: This is a hack. We need a xx_request_qp interface
- * that will adjust the qp_attr's with a best-effort
- * number
- */
- qp_attr.cap.max_send_sge -= 2;
- qp_attr.cap.max_recv_sge -= 2;
- ret = rdma_create_qp(newxprt->sc_cm_id, newxprt->sc_pd,
- &qp_attr);
- if (ret) {
- dprintk("svcrdma: failed to create QP, ret=%d\n", ret);
- goto errout;
- }
- newxprt->sc_max_sge = qp_attr.cap.max_send_sge;
- newxprt->sc_max_sge = qp_attr.cap.max_recv_sge;
- newxprt->sc_sq_depth = qp_attr.cap.max_send_wr;
- newxprt->sc_max_requests = qp_attr.cap.max_recv_wr;
+ dprintk("svcrdma: failed to create QP, ret=%d\n", ret);
+ goto errout;
}
newxprt->sc_qp = newxprt->sc_cm_id->qp;
diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
index 66f91f0d071a..2faac4940563 100644
--- a/net/sunrpc/xprtrdma/transport.c
+++ b/net/sunrpc/xprtrdma/transport.c
@@ -296,7 +296,6 @@ xprt_setup_rdma(struct xprt_create *args)
xprt->resvport = 0; /* privileged port not needed */
xprt->tsh_size = 0; /* RPC-RDMA handles framing */
- xprt->max_payload = RPCRDMA_MAX_DATA_SEGS * PAGE_SIZE;
xprt->ops = &xprt_rdma_procs;
/*
@@ -382,6 +381,9 @@ xprt_setup_rdma(struct xprt_create *args)
new_ep->rep_xprt = xprt;
xprt_rdma_format_addresses(xprt);
+ xprt->max_payload = rpcrdma_max_payload(new_xprt);
+ dprintk("RPC: %s: transport data payload maximum: %zu bytes\n",
+ __func__, xprt->max_payload);
if (!try_module_get(THIS_MODULE))
goto out4;
@@ -412,7 +414,7 @@ xprt_rdma_close(struct rpc_xprt *xprt)
if (r_xprt->rx_ep.rep_connected > 0)
xprt->reestablish_timeout = 0;
xprt_disconnect_done(xprt);
- (void) rpcrdma_ep_disconnect(&r_xprt->rx_ep, &r_xprt->rx_ia);
+ rpcrdma_ep_disconnect(&r_xprt->rx_ep, &r_xprt->rx_ia);
}
static void
@@ -595,13 +597,14 @@ xprt_rdma_send_request(struct rpc_task *task)
struct rpc_xprt *xprt = rqst->rq_xprt;
struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
- int rc;
+ int rc = 0;
- if (req->rl_niovs == 0) {
+ if (req->rl_niovs == 0)
rc = rpcrdma_marshal_req(rqst);
- if (rc < 0)
- goto failed_marshal;
- }
+ else if (r_xprt->rx_ia.ri_memreg_strategy == RPCRDMA_FRMR)
+ rc = rpcrdma_marshal_chunks(rqst, 0);
+ if (rc < 0)
+ goto failed_marshal;
if (req->rl_reply == NULL) /* e.g. reconnection */
rpcrdma_recv_buffer_get(req);
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index 13dbd1c389ff..61c41298b4ea 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -61,6 +61,8 @@
# define RPCDBG_FACILITY RPCDBG_TRANS
#endif
+static void rpcrdma_reset_frmrs(struct rpcrdma_ia *);
+
/*
* internal functions
*/
@@ -103,17 +105,6 @@ rpcrdma_run_tasklet(unsigned long data)
static DECLARE_TASKLET(rpcrdma_tasklet_g, rpcrdma_run_tasklet, 0UL);
-static inline void
-rpcrdma_schedule_tasklet(struct rpcrdma_rep *rep)
-{
- unsigned long flags;
-
- spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
- list_add_tail(&rep->rr_list, &rpcrdma_tasklets_g);
- spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
- tasklet_schedule(&rpcrdma_tasklet_g);
-}
-
static void
rpcrdma_qp_async_error_upcall(struct ib_event *event, void *context)
{
@@ -153,12 +144,7 @@ rpcrdma_sendcq_process_wc(struct ib_wc *wc)
if (wc->wr_id == 0ULL)
return;
if (wc->status != IB_WC_SUCCESS)
- return;
-
- if (wc->opcode == IB_WC_FAST_REG_MR)
- frmr->r.frmr.state = FRMR_IS_VALID;
- else if (wc->opcode == IB_WC_LOCAL_INV)
- frmr->r.frmr.state = FRMR_IS_INVALID;
+ frmr->r.frmr.fr_state = FRMR_IS_STALE;
}
static int
@@ -217,7 +203,7 @@ rpcrdma_sendcq_upcall(struct ib_cq *cq, void *cq_context)
}
static void
-rpcrdma_recvcq_process_wc(struct ib_wc *wc)
+rpcrdma_recvcq_process_wc(struct ib_wc *wc, struct list_head *sched_list)
{
struct rpcrdma_rep *rep =
(struct rpcrdma_rep *)(unsigned long)wc->wr_id;
@@ -248,28 +234,38 @@ rpcrdma_recvcq_process_wc(struct ib_wc *wc)
}
out_schedule:
- rpcrdma_schedule_tasklet(rep);
+ list_add_tail(&rep->rr_list, sched_list);
}
static int
rpcrdma_recvcq_poll(struct ib_cq *cq, struct rpcrdma_ep *ep)
{
+ struct list_head sched_list;
struct ib_wc *wcs;
int budget, count, rc;
+ unsigned long flags;
+ INIT_LIST_HEAD(&sched_list);
budget = RPCRDMA_WC_BUDGET / RPCRDMA_POLLSIZE;
do {
wcs = ep->rep_recv_wcs;
rc = ib_poll_cq(cq, RPCRDMA_POLLSIZE, wcs);
if (rc <= 0)
- return rc;
+ goto out_schedule;
count = rc;
while (count-- > 0)
- rpcrdma_recvcq_process_wc(wcs++);
+ rpcrdma_recvcq_process_wc(wcs++, &sched_list);
} while (rc == RPCRDMA_POLLSIZE && --budget);
- return 0;
+ rc = 0;
+
+out_schedule:
+ spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
+ list_splice_tail(&sched_list, &rpcrdma_tasklets_g);
+ spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
+ tasklet_schedule(&rpcrdma_tasklet_g);
+ return rc;
}
/*
@@ -310,6 +306,13 @@ rpcrdma_recvcq_upcall(struct ib_cq *cq, void *cq_context)
rpcrdma_recvcq_poll(cq, ep);
}
+static void
+rpcrdma_flush_cqs(struct rpcrdma_ep *ep)
+{
+ rpcrdma_recvcq_upcall(ep->rep_attr.recv_cq, ep);
+ rpcrdma_sendcq_upcall(ep->rep_attr.send_cq, ep);
+}
+
#ifdef RPC_DEBUG
static const char * const conn[] = {
"address resolved",
@@ -323,8 +326,16 @@ static const char * const conn[] = {
"rejected",
"established",
"disconnected",
- "device removal"
+ "device removal",
+ "multicast join",
+ "multicast error",
+ "address change",
+ "timewait exit",
};
+
+#define CONNECTION_MSG(status) \
+ ((status) < ARRAY_SIZE(conn) ? \
+ conn[(status)] : "unrecognized connection error")
#endif
static int
@@ -382,23 +393,18 @@ rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event)
case RDMA_CM_EVENT_DEVICE_REMOVAL:
connstate = -ENODEV;
connected:
- dprintk("RPC: %s: %s: %pI4:%u (ep 0x%p event 0x%x)\n",
- __func__,
- (event->event <= 11) ? conn[event->event] :
- "unknown connection error",
- &addr->sin_addr.s_addr,
- ntohs(addr->sin_port),
- ep, event->event);
atomic_set(&rpcx_to_rdmax(ep->rep_xprt)->rx_buf.rb_credits, 1);
dprintk("RPC: %s: %sconnected\n",
__func__, connstate > 0 ? "" : "dis");
ep->rep_connected = connstate;
ep->rep_func(ep);
wake_up_all(&ep->rep_connect_wait);
- break;
+ /*FALLTHROUGH*/
default:
- dprintk("RPC: %s: unexpected CM event %d\n",
- __func__, event->event);
+ dprintk("RPC: %s: %pI4:%u (ep 0x%p): %s\n",
+ __func__, &addr->sin_addr.s_addr,
+ ntohs(addr->sin_port), ep,
+ CONNECTION_MSG(event->event));
break;
}
@@ -558,12 +564,7 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
if (!ia->ri_id->device->alloc_fmr) {
dprintk("RPC: %s: MTHCAFMR registration "
"not supported by HCA\n", __func__);
-#if RPCRDMA_PERSISTENT_REGISTRATION
memreg = RPCRDMA_ALLPHYSICAL;
-#else
- rc = -ENOMEM;
- goto out2;
-#endif
}
}
@@ -578,20 +579,16 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
switch (memreg) {
case RPCRDMA_FRMR:
break;
-#if RPCRDMA_PERSISTENT_REGISTRATION
case RPCRDMA_ALLPHYSICAL:
mem_priv = IB_ACCESS_LOCAL_WRITE |
IB_ACCESS_REMOTE_WRITE |
IB_ACCESS_REMOTE_READ;
goto register_setup;
-#endif
case RPCRDMA_MTHCAFMR:
if (ia->ri_have_dma_lkey)
break;
mem_priv = IB_ACCESS_LOCAL_WRITE;
-#if RPCRDMA_PERSISTENT_REGISTRATION
register_setup:
-#endif
ia->ri_bind_mem = ib_get_dma_mr(ia->ri_pd, mem_priv);
if (IS_ERR(ia->ri_bind_mem)) {
printk(KERN_ALERT "%s: ib_get_dma_mr for "
@@ -613,6 +610,7 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
/* Else will do memory reg/dereg for each chunk */
ia->ri_memreg_strategy = memreg;
+ rwlock_init(&ia->ri_qplock);
return 0;
out2:
rdma_destroy_id(ia->ri_id);
@@ -826,10 +824,7 @@ rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
cancel_delayed_work_sync(&ep->rep_connect_worker);
if (ia->ri_id->qp) {
- rc = rpcrdma_ep_disconnect(ep, ia);
- if (rc)
- dprintk("RPC: %s: rpcrdma_ep_disconnect"
- " returned %i\n", __func__, rc);
+ rpcrdma_ep_disconnect(ep, ia);
rdma_destroy_qp(ia->ri_id);
ia->ri_id->qp = NULL;
}
@@ -859,7 +854,7 @@ rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
int
rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
{
- struct rdma_cm_id *id;
+ struct rdma_cm_id *id, *old;
int rc = 0;
int retry_count = 0;
@@ -867,13 +862,12 @@ rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
struct rpcrdma_xprt *xprt;
retry:
dprintk("RPC: %s: reconnecting...\n", __func__);
- rc = rpcrdma_ep_disconnect(ep, ia);
- if (rc && rc != -ENOTCONN)
- dprintk("RPC: %s: rpcrdma_ep_disconnect"
- " status %i\n", __func__, rc);
- rpcrdma_clean_cq(ep->rep_attr.recv_cq);
- rpcrdma_clean_cq(ep->rep_attr.send_cq);
+ rpcrdma_ep_disconnect(ep, ia);
+ rpcrdma_flush_cqs(ep);
+
+ if (ia->ri_memreg_strategy == RPCRDMA_FRMR)
+ rpcrdma_reset_frmrs(ia);
xprt = container_of(ia, struct rpcrdma_xprt, rx_ia);
id = rpcrdma_create_id(xprt, ia,
@@ -905,9 +899,14 @@ retry:
rc = -ENETUNREACH;
goto out;
}
- rdma_destroy_qp(ia->ri_id);
- rdma_destroy_id(ia->ri_id);
+
+ write_lock(&ia->ri_qplock);
+ old = ia->ri_id;
ia->ri_id = id;
+ write_unlock(&ia->ri_qplock);
+
+ rdma_destroy_qp(old);
+ rdma_destroy_id(old);
} else {
dprintk("RPC: %s: connecting...\n", __func__);
rc = rdma_create_qp(ia->ri_id, ia->ri_pd, &ep->rep_attr);
@@ -974,13 +973,12 @@ out:
* This call is not reentrant, and must not be made in parallel
* on the same endpoint.
*/
-int
+void
rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
{
int rc;
- rpcrdma_clean_cq(ep->rep_attr.recv_cq);
- rpcrdma_clean_cq(ep->rep_attr.send_cq);
+ rpcrdma_flush_cqs(ep);
rc = rdma_disconnect(ia->ri_id);
if (!rc) {
/* returns without wait if not connected */
@@ -992,12 +990,93 @@ rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
dprintk("RPC: %s: rdma_disconnect %i\n", __func__, rc);
ep->rep_connected = rc;
}
+}
+
+static int
+rpcrdma_init_fmrs(struct rpcrdma_ia *ia, struct rpcrdma_buffer *buf)
+{
+ int mr_access_flags = IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_READ;
+ struct ib_fmr_attr fmr_attr = {
+ .max_pages = RPCRDMA_MAX_DATA_SEGS,
+ .max_maps = 1,
+ .page_shift = PAGE_SHIFT
+ };
+ struct rpcrdma_mw *r;
+ int i, rc;
+
+ i = (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS;
+ dprintk("RPC: %s: initalizing %d FMRs\n", __func__, i);
+
+ while (i--) {
+ r = kzalloc(sizeof(*r), GFP_KERNEL);
+ if (r == NULL)
+ return -ENOMEM;
+
+ r->r.fmr = ib_alloc_fmr(ia->ri_pd, mr_access_flags, &fmr_attr);
+ if (IS_ERR(r->r.fmr)) {
+ rc = PTR_ERR(r->r.fmr);
+ dprintk("RPC: %s: ib_alloc_fmr failed %i\n",
+ __func__, rc);
+ goto out_free;
+ }
+
+ list_add(&r->mw_list, &buf->rb_mws);
+ list_add(&r->mw_all, &buf->rb_all);
+ }
+ return 0;
+
+out_free:
+ kfree(r);
+ return rc;
+}
+
+static int
+rpcrdma_init_frmrs(struct rpcrdma_ia *ia, struct rpcrdma_buffer *buf)
+{
+ struct rpcrdma_frmr *f;
+ struct rpcrdma_mw *r;
+ int i, rc;
+
+ i = (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS;
+ dprintk("RPC: %s: initalizing %d FRMRs\n", __func__, i);
+
+ while (i--) {
+ r = kzalloc(sizeof(*r), GFP_KERNEL);
+ if (r == NULL)
+ return -ENOMEM;
+ f = &r->r.frmr;
+
+ f->fr_mr = ib_alloc_fast_reg_mr(ia->ri_pd,
+ ia->ri_max_frmr_depth);
+ if (IS_ERR(f->fr_mr)) {
+ rc = PTR_ERR(f->fr_mr);
+ dprintk("RPC: %s: ib_alloc_fast_reg_mr "
+ "failed %i\n", __func__, rc);
+ goto out_free;
+ }
+
+ f->fr_pgl = ib_alloc_fast_reg_page_list(ia->ri_id->device,
+ ia->ri_max_frmr_depth);
+ if (IS_ERR(f->fr_pgl)) {
+ rc = PTR_ERR(f->fr_pgl);
+ dprintk("RPC: %s: ib_alloc_fast_reg_page_list "
+ "failed %i\n", __func__, rc);
+
+ ib_dereg_mr(f->fr_mr);
+ goto out_free;
+ }
+
+ list_add(&r->mw_list, &buf->rb_mws);
+ list_add(&r->mw_all, &buf->rb_all);
+ }
+
+ return 0;
+
+out_free:
+ kfree(r);
return rc;
}
-/*
- * Initialize buffer memory
- */
int
rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
struct rpcrdma_ia *ia, struct rpcrdma_create_data_internal *cdata)
@@ -1005,7 +1084,6 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
char *p;
size_t len, rlen, wlen;
int i, rc;
- struct rpcrdma_mw *r;
buf->rb_max_requests = cdata->max_requests;
spin_lock_init(&buf->rb_lock);
@@ -1016,28 +1094,12 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
* 2. arrays of struct rpcrdma_req to fill in pointers
* 3. array of struct rpcrdma_rep for replies
* 4. padding, if any
- * 5. mw's, fmr's or frmr's, if any
* Send/recv buffers in req/rep need to be registered
*/
-
len = buf->rb_max_requests *
(sizeof(struct rpcrdma_req *) + sizeof(struct rpcrdma_rep *));
len += cdata->padding;
- switch (ia->ri_memreg_strategy) {
- case RPCRDMA_FRMR:
- len += buf->rb_max_requests * RPCRDMA_MAX_SEGS *
- sizeof(struct rpcrdma_mw);
- break;
- case RPCRDMA_MTHCAFMR:
- /* TBD we are perhaps overallocating here */
- len += (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS *
- sizeof(struct rpcrdma_mw);
- break;
- default:
- break;
- }
- /* allocate 1, 4 and 5 in one shot */
p = kzalloc(len, GFP_KERNEL);
if (p == NULL) {
dprintk("RPC: %s: req_t/rep_t/pad kzalloc(%zd) failed\n",
@@ -1064,51 +1126,17 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
p += cdata->padding;
INIT_LIST_HEAD(&buf->rb_mws);
- r = (struct rpcrdma_mw *)p;
+ INIT_LIST_HEAD(&buf->rb_all);
switch (ia->ri_memreg_strategy) {
case RPCRDMA_FRMR:
- for (i = buf->rb_max_requests * RPCRDMA_MAX_SEGS; i; i--) {
- r->r.frmr.fr_mr = ib_alloc_fast_reg_mr(ia->ri_pd,
- ia->ri_max_frmr_depth);
- if (IS_ERR(r->r.frmr.fr_mr)) {
- rc = PTR_ERR(r->r.frmr.fr_mr);
- dprintk("RPC: %s: ib_alloc_fast_reg_mr"
- " failed %i\n", __func__, rc);
- goto out;
- }
- r->r.frmr.fr_pgl = ib_alloc_fast_reg_page_list(
- ia->ri_id->device,
- ia->ri_max_frmr_depth);
- if (IS_ERR(r->r.frmr.fr_pgl)) {
- rc = PTR_ERR(r->r.frmr.fr_pgl);
- dprintk("RPC: %s: "
- "ib_alloc_fast_reg_page_list "
- "failed %i\n", __func__, rc);
-
- ib_dereg_mr(r->r.frmr.fr_mr);
- goto out;
- }
- list_add(&r->mw_list, &buf->rb_mws);
- ++r;
- }
+ rc = rpcrdma_init_frmrs(ia, buf);
+ if (rc)
+ goto out;
break;
case RPCRDMA_MTHCAFMR:
- /* TBD we are perhaps overallocating here */
- for (i = (buf->rb_max_requests+1) * RPCRDMA_MAX_SEGS; i; i--) {
- static struct ib_fmr_attr fa =
- { RPCRDMA_MAX_DATA_SEGS, 1, PAGE_SHIFT };
- r->r.fmr = ib_alloc_fmr(ia->ri_pd,
- IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_READ,
- &fa);
- if (IS_ERR(r->r.fmr)) {
- rc = PTR_ERR(r->r.fmr);
- dprintk("RPC: %s: ib_alloc_fmr"
- " failed %i\n", __func__, rc);
- goto out;
- }
- list_add(&r->mw_list, &buf->rb_mws);
- ++r;
- }
+ rc = rpcrdma_init_fmrs(ia, buf);
+ if (rc)
+ goto out;
break;
default:
break;
@@ -1176,24 +1204,57 @@ out:
return rc;
}
-/*
- * Unregister and destroy buffer memory. Need to deal with
- * partial initialization, so it's callable from failed create.
- * Must be called before destroying endpoint, as registrations
- * reference it.
- */
+static void
+rpcrdma_destroy_fmrs(struct rpcrdma_buffer *buf)
+{
+ struct rpcrdma_mw *r;
+ int rc;
+
+ while (!list_empty(&buf->rb_all)) {
+ r = list_entry(buf->rb_all.next, struct rpcrdma_mw, mw_all);
+ list_del(&r->mw_all);
+ list_del(&r->mw_list);
+
+ rc = ib_dealloc_fmr(r->r.fmr);
+ if (rc)
+ dprintk("RPC: %s: ib_dealloc_fmr failed %i\n",
+ __func__, rc);
+
+ kfree(r);
+ }
+}
+
+static void
+rpcrdma_destroy_frmrs(struct rpcrdma_buffer *buf)
+{
+ struct rpcrdma_mw *r;
+ int rc;
+
+ while (!list_empty(&buf->rb_all)) {
+ r = list_entry(buf->rb_all.next, struct rpcrdma_mw, mw_all);
+ list_del(&r->mw_all);
+ list_del(&r->mw_list);
+
+ rc = ib_dereg_mr(r->r.frmr.fr_mr);
+ if (rc)
+ dprintk("RPC: %s: ib_dereg_mr failed %i\n",
+ __func__, rc);
+ ib_free_fast_reg_page_list(r->r.frmr.fr_pgl);
+
+ kfree(r);
+ }
+}
+
void
rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
{
- int rc, i;
struct rpcrdma_ia *ia = rdmab_to_ia(buf);
- struct rpcrdma_mw *r;
+ int i;
/* clean up in reverse order from create
* 1. recv mr memory (mr free, then kfree)
* 2. send mr memory (mr free, then kfree)
- * 3. padding (if any) [moved to rpcrdma_ep_destroy]
- * 4. arrays
+ * 3. MWs
*/
dprintk("RPC: %s: entering\n", __func__);
@@ -1212,34 +1273,217 @@ rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
}
}
+ switch (ia->ri_memreg_strategy) {
+ case RPCRDMA_FRMR:
+ rpcrdma_destroy_frmrs(buf);
+ break;
+ case RPCRDMA_MTHCAFMR:
+ rpcrdma_destroy_fmrs(buf);
+ break;
+ default:
+ break;
+ }
+
+ kfree(buf->rb_pool);
+}
+
+/* After a disconnect, a flushed FAST_REG_MR can leave an FRMR in
+ * an unusable state. Find FRMRs in this state and dereg / reg
+ * each. FRMRs that are VALID and attached to an rpcrdma_req are
+ * also torn down.
+ *
+ * This gives all in-use FRMRs a fresh rkey and leaves them INVALID.
+ *
+ * This is invoked only in the transport connect worker in order
+ * to serialize with rpcrdma_register_frmr_external().
+ */
+static void
+rpcrdma_reset_frmrs(struct rpcrdma_ia *ia)
+{
+ struct rpcrdma_xprt *r_xprt =
+ container_of(ia, struct rpcrdma_xprt, rx_ia);
+ struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
+ struct list_head *pos;
+ struct rpcrdma_mw *r;
+ int rc;
+
+ list_for_each(pos, &buf->rb_all) {
+ r = list_entry(pos, struct rpcrdma_mw, mw_all);
+
+ if (r->r.frmr.fr_state == FRMR_IS_INVALID)
+ continue;
+
+ rc = ib_dereg_mr(r->r.frmr.fr_mr);
+ if (rc)
+ dprintk("RPC: %s: ib_dereg_mr failed %i\n",
+ __func__, rc);
+ ib_free_fast_reg_page_list(r->r.frmr.fr_pgl);
+
+ r->r.frmr.fr_mr = ib_alloc_fast_reg_mr(ia->ri_pd,
+ ia->ri_max_frmr_depth);
+ if (IS_ERR(r->r.frmr.fr_mr)) {
+ rc = PTR_ERR(r->r.frmr.fr_mr);
+ dprintk("RPC: %s: ib_alloc_fast_reg_mr"
+ " failed %i\n", __func__, rc);
+ continue;
+ }
+ r->r.frmr.fr_pgl = ib_alloc_fast_reg_page_list(
+ ia->ri_id->device,
+ ia->ri_max_frmr_depth);
+ if (IS_ERR(r->r.frmr.fr_pgl)) {
+ rc = PTR_ERR(r->r.frmr.fr_pgl);
+ dprintk("RPC: %s: "
+ "ib_alloc_fast_reg_page_list "
+ "failed %i\n", __func__, rc);
+
+ ib_dereg_mr(r->r.frmr.fr_mr);
+ continue;
+ }
+ r->r.frmr.fr_state = FRMR_IS_INVALID;
+ }
+}
+
+/* "*mw" can be NULL when rpcrdma_buffer_get_mrs() fails, leaving
+ * some req segments uninitialized.
+ */
+static void
+rpcrdma_buffer_put_mr(struct rpcrdma_mw **mw, struct rpcrdma_buffer *buf)
+{
+ if (*mw) {
+ list_add_tail(&(*mw)->mw_list, &buf->rb_mws);
+ *mw = NULL;
+ }
+}
+
+/* Cycle mw's back in reverse order, and "spin" them.
+ * This delays and scrambles reuse as much as possible.
+ */
+static void
+rpcrdma_buffer_put_mrs(struct rpcrdma_req *req, struct rpcrdma_buffer *buf)
+{
+ struct rpcrdma_mr_seg *seg = req->rl_segments;
+ struct rpcrdma_mr_seg *seg1 = seg;
+ int i;
+
+ for (i = 1, seg++; i < RPCRDMA_MAX_SEGS; seg++, i++)
+ rpcrdma_buffer_put_mr(&seg->mr_chunk.rl_mw, buf);
+ rpcrdma_buffer_put_mr(&seg1->mr_chunk.rl_mw, buf);
+}
+
+static void
+rpcrdma_buffer_put_sendbuf(struct rpcrdma_req *req, struct rpcrdma_buffer *buf)
+{
+ buf->rb_send_bufs[--buf->rb_send_index] = req;
+ req->rl_niovs = 0;
+ if (req->rl_reply) {
+ buf->rb_recv_bufs[--buf->rb_recv_index] = req->rl_reply;
+ req->rl_reply->rr_func = NULL;
+ req->rl_reply = NULL;
+ }
+}
+
+/* rpcrdma_unmap_one() was already done by rpcrdma_deregister_frmr_external().
+ * Redo only the ib_post_send().
+ */
+static void
+rpcrdma_retry_local_inv(struct rpcrdma_mw *r, struct rpcrdma_ia *ia)
+{
+ struct rpcrdma_xprt *r_xprt =
+ container_of(ia, struct rpcrdma_xprt, rx_ia);
+ struct ib_send_wr invalidate_wr, *bad_wr;
+ int rc;
+
+ dprintk("RPC: %s: FRMR %p is stale\n", __func__, r);
+
+ /* When this FRMR is re-inserted into rb_mws, it is no longer stale */
+ r->r.frmr.fr_state = FRMR_IS_INVALID;
+
+ memset(&invalidate_wr, 0, sizeof(invalidate_wr));
+ invalidate_wr.wr_id = (unsigned long)(void *)r;
+ invalidate_wr.opcode = IB_WR_LOCAL_INV;
+ invalidate_wr.ex.invalidate_rkey = r->r.frmr.fr_mr->rkey;
+ DECR_CQCOUNT(&r_xprt->rx_ep);
+
+ dprintk("RPC: %s: frmr %p invalidating rkey %08x\n",
+ __func__, r, r->r.frmr.fr_mr->rkey);
+
+ read_lock(&ia->ri_qplock);
+ rc = ib_post_send(ia->ri_id->qp, &invalidate_wr, &bad_wr);
+ read_unlock(&ia->ri_qplock);
+ if (rc) {
+ /* Force rpcrdma_buffer_get() to retry */
+ r->r.frmr.fr_state = FRMR_IS_STALE;
+ dprintk("RPC: %s: ib_post_send failed, %i\n",
+ __func__, rc);
+ }
+}
+
+static void
+rpcrdma_retry_flushed_linv(struct list_head *stale,
+ struct rpcrdma_buffer *buf)
+{
+ struct rpcrdma_ia *ia = rdmab_to_ia(buf);
+ struct list_head *pos;
+ struct rpcrdma_mw *r;
+ unsigned long flags;
+
+ list_for_each(pos, stale) {
+ r = list_entry(pos, struct rpcrdma_mw, mw_list);
+ rpcrdma_retry_local_inv(r, ia);
+ }
+
+ spin_lock_irqsave(&buf->rb_lock, flags);
+ list_splice_tail(stale, &buf->rb_mws);
+ spin_unlock_irqrestore(&buf->rb_lock, flags);
+}
+
+static struct rpcrdma_req *
+rpcrdma_buffer_get_frmrs(struct rpcrdma_req *req, struct rpcrdma_buffer *buf,
+ struct list_head *stale)
+{
+ struct rpcrdma_mw *r;
+ int i;
+
+ i = RPCRDMA_MAX_SEGS - 1;
while (!list_empty(&buf->rb_mws)) {
r = list_entry(buf->rb_mws.next,
- struct rpcrdma_mw, mw_list);
+ struct rpcrdma_mw, mw_list);
list_del(&r->mw_list);
- switch (ia->ri_memreg_strategy) {
- case RPCRDMA_FRMR:
- rc = ib_dereg_mr(r->r.frmr.fr_mr);
- if (rc)
- dprintk("RPC: %s:"
- " ib_dereg_mr"
- " failed %i\n",
- __func__, rc);
- ib_free_fast_reg_page_list(r->r.frmr.fr_pgl);
- break;
- case RPCRDMA_MTHCAFMR:
- rc = ib_dealloc_fmr(r->r.fmr);
- if (rc)
- dprintk("RPC: %s:"
- " ib_dealloc_fmr"
- " failed %i\n",
- __func__, rc);
- break;
- default:
- break;
+ if (r->r.frmr.fr_state == FRMR_IS_STALE) {
+ list_add(&r->mw_list, stale);
+ continue;
}
+ req->rl_segments[i].mr_chunk.rl_mw = r;
+ if (unlikely(i-- == 0))
+ return req; /* Success */
}
- kfree(buf->rb_pool);
+ /* Not enough entries on rb_mws for this req */
+ rpcrdma_buffer_put_sendbuf(req, buf);
+ rpcrdma_buffer_put_mrs(req, buf);
+ return NULL;
+}
+
+static struct rpcrdma_req *
+rpcrdma_buffer_get_fmrs(struct rpcrdma_req *req, struct rpcrdma_buffer *buf)
+{
+ struct rpcrdma_mw *r;
+ int i;
+
+ i = RPCRDMA_MAX_SEGS - 1;
+ while (!list_empty(&buf->rb_mws)) {
+ r = list_entry(buf->rb_mws.next,
+ struct rpcrdma_mw, mw_list);
+ list_del(&r->mw_list);
+ req->rl_segments[i].mr_chunk.rl_mw = r;
+ if (unlikely(i-- == 0))
+ return req; /* Success */
+ }
+
+ /* Not enough entries on rb_mws for this req */
+ rpcrdma_buffer_put_sendbuf(req, buf);
+ rpcrdma_buffer_put_mrs(req, buf);
+ return NULL;
}
/*
@@ -1254,10 +1498,10 @@ rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
struct rpcrdma_req *
rpcrdma_buffer_get(struct rpcrdma_buffer *buffers)
{
+ struct rpcrdma_ia *ia = rdmab_to_ia(buffers);
+ struct list_head stale;
struct rpcrdma_req *req;
unsigned long flags;
- int i;
- struct rpcrdma_mw *r;
spin_lock_irqsave(&buffers->rb_lock, flags);
if (buffers->rb_send_index == buffers->rb_max_requests) {
@@ -1277,16 +1521,21 @@ rpcrdma_buffer_get(struct rpcrdma_buffer *buffers)
buffers->rb_recv_bufs[buffers->rb_recv_index++] = NULL;
}
buffers->rb_send_bufs[buffers->rb_send_index++] = NULL;
- if (!list_empty(&buffers->rb_mws)) {
- i = RPCRDMA_MAX_SEGS - 1;
- do {
- r = list_entry(buffers->rb_mws.next,
- struct rpcrdma_mw, mw_list);
- list_del(&r->mw_list);
- req->rl_segments[i].mr_chunk.rl_mw = r;
- } while (--i >= 0);
+
+ INIT_LIST_HEAD(&stale);
+ switch (ia->ri_memreg_strategy) {
+ case RPCRDMA_FRMR:
+ req = rpcrdma_buffer_get_frmrs(req, buffers, &stale);
+ break;
+ case RPCRDMA_MTHCAFMR:
+ req = rpcrdma_buffer_get_fmrs(req, buffers);
+ break;
+ default:
+ break;
}
spin_unlock_irqrestore(&buffers->rb_lock, flags);
+ if (!list_empty(&stale))
+ rpcrdma_retry_flushed_linv(&stale, buffers);
return req;
}
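/* A minimal sketch (editorial, not verbatim kernel code) of how a caller is
 * expected to pair these two entry points; "r_xprt", the helper names and
 * the error handling are illustrative assumptions:
 */
static void *example_rpcrdma_allocate(struct rpcrdma_xprt *r_xprt)
{
	struct rpcrdma_req *req;

	/* NULL now also covers "not enough usable MWs on rb_mws" */
	req = rpcrdma_buffer_get(&r_xprt->rx_buf);
	if (req == NULL)
		return NULL;
	return req;
}

static void example_rpcrdma_free(struct rpcrdma_req *req)
{
	/* hands back the send buffer, any reply buffer, and the MWs */
	rpcrdma_buffer_put(req);
}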
@@ -1299,34 +1548,14 @@ rpcrdma_buffer_put(struct rpcrdma_req *req)
{
struct rpcrdma_buffer *buffers = req->rl_buffer;
struct rpcrdma_ia *ia = rdmab_to_ia(buffers);
- int i;
unsigned long flags;
spin_lock_irqsave(&buffers->rb_lock, flags);
- buffers->rb_send_bufs[--buffers->rb_send_index] = req;
- req->rl_niovs = 0;
- if (req->rl_reply) {
- buffers->rb_recv_bufs[--buffers->rb_recv_index] = req->rl_reply;
- req->rl_reply->rr_func = NULL;
- req->rl_reply = NULL;
- }
+ rpcrdma_buffer_put_sendbuf(req, buffers);
switch (ia->ri_memreg_strategy) {
case RPCRDMA_FRMR:
case RPCRDMA_MTHCAFMR:
- /*
- * Cycle mw's back in reverse order, and "spin" them.
- * This delays and scrambles reuse as much as possible.
- */
- i = 1;
- do {
- struct rpcrdma_mw **mw;
- mw = &req->rl_segments[i].mr_chunk.rl_mw;
- list_add_tail(&(*mw)->mw_list, &buffers->rb_mws);
- *mw = NULL;
- } while (++i < RPCRDMA_MAX_SEGS);
- list_add_tail(&req->rl_segments[0].mr_chunk.rl_mw->mw_list,
- &buffers->rb_mws);
- req->rl_segments[0].mr_chunk.rl_mw = NULL;
+ rpcrdma_buffer_put_mrs(req, buffers);
break;
default:
break;
@@ -1388,6 +1617,9 @@ rpcrdma_register_internal(struct rpcrdma_ia *ia, void *va, int len,
*/
iov->addr = ib_dma_map_single(ia->ri_id->device,
va, len, DMA_BIDIRECTIONAL);
+ if (ib_dma_mapping_error(ia->ri_id->device, iov->addr))
+ return -ENOMEM;
+
iov->length = len;
if (ia->ri_have_dma_lkey) {
@@ -1483,8 +1715,10 @@ rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg,
struct rpcrdma_xprt *r_xprt)
{
struct rpcrdma_mr_seg *seg1 = seg;
- struct ib_send_wr invalidate_wr, frmr_wr, *bad_wr, *post_wr;
-
+ struct rpcrdma_mw *mw = seg1->mr_chunk.rl_mw;
+ struct rpcrdma_frmr *frmr = &mw->r.frmr;
+ struct ib_mr *mr = frmr->fr_mr;
+ struct ib_send_wr fastreg_wr, *bad_wr;
u8 key;
int len, pageoff;
int i, rc;
@@ -1502,8 +1736,7 @@ rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg,
rpcrdma_map_one(ia, seg, writing);
pa = seg->mr_dma;
for (seg_len = seg->mr_len; seg_len > 0; seg_len -= PAGE_SIZE) {
- seg1->mr_chunk.rl_mw->r.frmr.fr_pgl->
- page_list[page_no++] = pa;
+ frmr->fr_pgl->page_list[page_no++] = pa;
pa += PAGE_SIZE;
}
len += seg->mr_len;
@@ -1515,65 +1748,51 @@ rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg,
break;
}
dprintk("RPC: %s: Using frmr %p to map %d segments\n",
- __func__, seg1->mr_chunk.rl_mw, i);
-
- if (unlikely(seg1->mr_chunk.rl_mw->r.frmr.state == FRMR_IS_VALID)) {
- dprintk("RPC: %s: frmr %x left valid, posting invalidate.\n",
- __func__,
- seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey);
- /* Invalidate before using. */
- memset(&invalidate_wr, 0, sizeof invalidate_wr);
- invalidate_wr.wr_id = (unsigned long)(void *)seg1->mr_chunk.rl_mw;
- invalidate_wr.next = &frmr_wr;
- invalidate_wr.opcode = IB_WR_LOCAL_INV;
- invalidate_wr.send_flags = IB_SEND_SIGNALED;
- invalidate_wr.ex.invalidate_rkey =
- seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey;
- DECR_CQCOUNT(&r_xprt->rx_ep);
- post_wr = &invalidate_wr;
- } else
- post_wr = &frmr_wr;
-
- /* Prepare FRMR WR */
- memset(&frmr_wr, 0, sizeof frmr_wr);
- frmr_wr.wr_id = (unsigned long)(void *)seg1->mr_chunk.rl_mw;
- frmr_wr.opcode = IB_WR_FAST_REG_MR;
- frmr_wr.send_flags = IB_SEND_SIGNALED;
- frmr_wr.wr.fast_reg.iova_start = seg1->mr_dma;
- frmr_wr.wr.fast_reg.page_list = seg1->mr_chunk.rl_mw->r.frmr.fr_pgl;
- frmr_wr.wr.fast_reg.page_list_len = page_no;
- frmr_wr.wr.fast_reg.page_shift = PAGE_SHIFT;
- frmr_wr.wr.fast_reg.length = page_no << PAGE_SHIFT;
- if (frmr_wr.wr.fast_reg.length < len) {
- while (seg1->mr_nsegs--)
- rpcrdma_unmap_one(ia, seg++);
- return -EIO;
+ __func__, mw, i);
+
+ frmr->fr_state = FRMR_IS_VALID;
+
+ memset(&fastreg_wr, 0, sizeof(fastreg_wr));
+ fastreg_wr.wr_id = (unsigned long)(void *)mw;
+ fastreg_wr.opcode = IB_WR_FAST_REG_MR;
+ fastreg_wr.wr.fast_reg.iova_start = seg1->mr_dma;
+ fastreg_wr.wr.fast_reg.page_list = frmr->fr_pgl;
+ fastreg_wr.wr.fast_reg.page_list_len = page_no;
+ fastreg_wr.wr.fast_reg.page_shift = PAGE_SHIFT;
+ fastreg_wr.wr.fast_reg.length = page_no << PAGE_SHIFT;
+ if (fastreg_wr.wr.fast_reg.length < len) {
+ rc = -EIO;
+ goto out_err;
}
/* Bump the key */
- key = (u8)(seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey & 0x000000FF);
- ib_update_fast_reg_key(seg1->mr_chunk.rl_mw->r.frmr.fr_mr, ++key);
+ key = (u8)(mr->rkey & 0x000000FF);
+ ib_update_fast_reg_key(mr, ++key);
- frmr_wr.wr.fast_reg.access_flags = (writing ?
+ fastreg_wr.wr.fast_reg.access_flags = (writing ?
IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE :
IB_ACCESS_REMOTE_READ);
- frmr_wr.wr.fast_reg.rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey;
+ fastreg_wr.wr.fast_reg.rkey = mr->rkey;
DECR_CQCOUNT(&r_xprt->rx_ep);
- rc = ib_post_send(ia->ri_id->qp, post_wr, &bad_wr);
-
+ rc = ib_post_send(ia->ri_id->qp, &fastreg_wr, &bad_wr);
if (rc) {
dprintk("RPC: %s: failed ib_post_send for register,"
" status %i\n", __func__, rc);
- while (i--)
- rpcrdma_unmap_one(ia, --seg);
+ ib_update_fast_reg_key(mr, --key);
+ goto out_err;
} else {
- seg1->mr_rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey;
+ seg1->mr_rkey = mr->rkey;
seg1->mr_base = seg1->mr_dma + pageoff;
seg1->mr_nsegs = i;
seg1->mr_len = len;
}
*nsegs = i;
+ return 0;
+out_err:
+ frmr->fr_state = FRMR_IS_INVALID;
+ while (i--)
+ rpcrdma_unmap_one(ia, --seg);
return rc;
}
@@ -1585,20 +1804,25 @@ rpcrdma_deregister_frmr_external(struct rpcrdma_mr_seg *seg,
struct ib_send_wr invalidate_wr, *bad_wr;
int rc;
- while (seg1->mr_nsegs--)
- rpcrdma_unmap_one(ia, seg++);
+ seg1->mr_chunk.rl_mw->r.frmr.fr_state = FRMR_IS_INVALID;
memset(&invalidate_wr, 0, sizeof invalidate_wr);
invalidate_wr.wr_id = (unsigned long)(void *)seg1->mr_chunk.rl_mw;
invalidate_wr.opcode = IB_WR_LOCAL_INV;
- invalidate_wr.send_flags = IB_SEND_SIGNALED;
invalidate_wr.ex.invalidate_rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey;
DECR_CQCOUNT(&r_xprt->rx_ep);
+ read_lock(&ia->ri_qplock);
+ while (seg1->mr_nsegs--)
+ rpcrdma_unmap_one(ia, seg++);
rc = ib_post_send(ia->ri_id->qp, &invalidate_wr, &bad_wr);
- if (rc)
+ read_unlock(&ia->ri_qplock);
+ if (rc) {
+ /* Force rpcrdma_buffer_get() to retry */
+ seg1->mr_chunk.rl_mw->r.frmr.fr_state = FRMR_IS_STALE;
dprintk("RPC: %s: failed ib_post_send for invalidate,"
" status %i\n", __func__, rc);
+ }
return rc;
}
@@ -1656,8 +1880,10 @@ rpcrdma_deregister_fmr_external(struct rpcrdma_mr_seg *seg,
list_add(&seg1->mr_chunk.rl_mw->r.fmr->list, &l);
rc = ib_unmap_fmr(&l);
+ read_lock(&ia->ri_qplock);
while (seg1->mr_nsegs--)
rpcrdma_unmap_one(ia, seg++);
+ read_unlock(&ia->ri_qplock);
if (rc)
dprintk("RPC: %s: failed ib_unmap_fmr,"
" status %i\n", __func__, rc);
@@ -1673,7 +1899,6 @@ rpcrdma_register_external(struct rpcrdma_mr_seg *seg,
switch (ia->ri_memreg_strategy) {
-#if RPCRDMA_PERSISTENT_REGISTRATION
case RPCRDMA_ALLPHYSICAL:
rpcrdma_map_one(ia, seg, writing);
seg->mr_rkey = ia->ri_bind_mem->rkey;
@@ -1681,7 +1906,6 @@ rpcrdma_register_external(struct rpcrdma_mr_seg *seg,
seg->mr_nsegs = 1;
nsegs = 1;
break;
-#endif
/* Registration using frmr registration */
case RPCRDMA_FRMR:
@@ -1711,11 +1935,11 @@ rpcrdma_deregister_external(struct rpcrdma_mr_seg *seg,
switch (ia->ri_memreg_strategy) {
-#if RPCRDMA_PERSISTENT_REGISTRATION
case RPCRDMA_ALLPHYSICAL:
+ read_lock(&ia->ri_qplock);
rpcrdma_unmap_one(ia, seg);
+ read_unlock(&ia->ri_qplock);
break;
-#endif
case RPCRDMA_FRMR:
rc = rpcrdma_deregister_frmr_external(seg, ia, r_xprt);
@@ -1809,3 +2033,44 @@ rpcrdma_ep_post_recv(struct rpcrdma_ia *ia,
rc);
return rc;
}
+
+/* Physical mapping means one Read/Write list entry per-page.
+ * All list entries must fit within an inline buffer.
+ *
+ * NB: The server must return a Write list for NFS READ,
+ * which has the same constraint. Factor in the inline
+ * rsize as well.
+ */
+static size_t
+rpcrdma_physical_max_payload(struct rpcrdma_xprt *r_xprt)
+{
+ struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data;
+ unsigned int inline_size, pages;
+
+ inline_size = min_t(unsigned int,
+ cdata->inline_wsize, cdata->inline_rsize);
+ inline_size -= RPCRDMA_HDRLEN_MIN;
+ pages = inline_size / sizeof(struct rpcrdma_segment);
+ return pages << PAGE_SHIFT;
+}
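/* Worked example (editorial; every constant below is an assumption, not
 * taken from this hunk): with 1024-byte inline thresholds, a 28-byte
 * RPCRDMA_HDRLEN_MIN, a 16-byte struct rpcrdma_segment and 4 KiB pages:
 *
 *	inline_size = 1024 - 28        = 996
 *	pages       = 996 / 16         = 62 read/write list entries
 *	payload     = 62 << PAGE_SHIFT = 253952 bytes (~248 KiB)
 *
 * which is why ALLPHYSICAL advertises a smaller maximum than the MR-based
 * strategies handled below.
 */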
+
+static size_t
+rpcrdma_mr_max_payload(struct rpcrdma_xprt *r_xprt)
+{
+ return RPCRDMA_MAX_DATA_SEGS << PAGE_SHIFT;
+}
+
+size_t
+rpcrdma_max_payload(struct rpcrdma_xprt *r_xprt)
+{
+ size_t result;
+
+ switch (r_xprt->rx_ia.ri_memreg_strategy) {
+ case RPCRDMA_ALLPHYSICAL:
+ result = rpcrdma_physical_max_payload(r_xprt);
+ break;
+ default:
+ result = rpcrdma_mr_max_payload(r_xprt);
+ }
+ return result;
+}
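/* Editorial note: the intended consumer of rpcrdma_max_payload() is the
 * transport setup path, which this series uses to cap the generic xprt's
 * maximum payload per memory-registration strategy.  Roughly (variable
 * names assumed, not verbatim):
 *
 *	xprt->max_payload = rpcrdma_max_payload(new_xprt);
 */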
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index 89e7cd479705..c419498b8f46 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -59,6 +59,7 @@
* Interface Adapter -- one per transport instance
*/
struct rpcrdma_ia {
+ rwlock_t ri_qplock;
struct rdma_cm_id *ri_id;
struct ib_pd *ri_pd;
struct ib_mr *ri_bind_mem;
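/* Editorial note on ri_qplock (a reading of this patch, not text from it):
 * paths that dereference ia->ri_id->qp after the connection may have been
 * replaced -- posting LOCAL_INV retries, unmapping ALLPHYSICAL segments,
 * FMR unmapping -- now take read_lock(&ia->ri_qplock), as in the verbs.c
 * hunks above.  The connect/reconnect path (in an earlier hunk of this
 * patch) is expected to hold the write side while it swaps in a fresh
 * rdma_cm_id and QP, so readers never see a half-replaced ri_id.
 */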
@@ -98,6 +99,14 @@ struct rpcrdma_ep {
#define INIT_CQCOUNT(ep) atomic_set(&(ep)->rep_cqcount, (ep)->rep_cqinit)
#define DECR_CQCOUNT(ep) atomic_sub_return(1, &(ep)->rep_cqcount)
+enum rpcrdma_chunktype {
+ rpcrdma_noch = 0,
+ rpcrdma_readch,
+ rpcrdma_areadch,
+ rpcrdma_writech,
+ rpcrdma_replych
+};
+
/*
* struct rpcrdma_rep -- this structure encapsulates state required to recv
* and complete a reply, asynchronously. It needs several pieces of
@@ -137,6 +146,40 @@ struct rpcrdma_rep {
};
/*
+ * struct rpcrdma_mw - external memory region metadata
+ *
+ * An external memory region is any buffer or page that is registered
+ * on the fly (i.e., not pre-registered).
+ *
+ * Each rpcrdma_buffer has a list of free MWs anchored in rb_mws. During
+ * call_allocate, rpcrdma_buffer_get() assigns one to each segment in
+ * an rpcrdma_req. Then rpcrdma_register_external() grabs these to keep
+ * track of registration metadata while each RPC is pending.
+ * rpcrdma_deregister_external() uses this metadata to unmap and
+ * release these resources when an RPC is complete.
+ */
+enum rpcrdma_frmr_state {
+ FRMR_IS_INVALID, /* ready to be used */
+ FRMR_IS_VALID, /* in use */
+ FRMR_IS_STALE, /* failed completion */
+};
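/* Editorial summary of the state transitions as this patch uses them:
 *
 *	FRMR_IS_INVALID --(rpcrdma_register_frmr_external posts FAST_REG_MR)-->
 *	FRMR_IS_VALID   --(rpcrdma_deregister_frmr_external posts LOCAL_INV)-->
 *	FRMR_IS_INVALID
 *
 * A failed post (or a completion flushed by connection loss) leaves the
 * FRMR in FRMR_IS_STALE; rpcrdma_buffer_get_frmrs() then diverts it to the
 * stale list and rpcrdma_retry_local_inv() re-invalidates it before reuse.
 */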
+
+struct rpcrdma_frmr {
+ struct ib_fast_reg_page_list *fr_pgl;
+ struct ib_mr *fr_mr;
+ enum rpcrdma_frmr_state fr_state;
+};
+
+struct rpcrdma_mw {
+ union {
+ struct ib_fmr *fmr;
+ struct rpcrdma_frmr frmr;
+ } r;
+ struct list_head mw_list;
+ struct list_head mw_all;
+};
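/* Editorial sketch of the lifecycle described in the comment above; the
 * sequence is illustrative and the local names are assumptions:
 *
 *	req = rpcrdma_buffer_get(buf);       // MW: rb_mws -> rl_segments[i]
 *	rpcrdma_register_external(seg, nsegs, writing, r_xprt);
 *	                                     // MW maps a chunk, RPC pending
 *	rpcrdma_deregister_external(seg, r_xprt);
 *	                                     // mapping invalidated after reply
 *	rpcrdma_buffer_put(req);             // MW: rl_segments[i] -> rb_mws
 */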
+
+/*
* struct rpcrdma_req -- structure central to the request/reply sequence.
*
* N of these are associated with a transport instance, and stored in
@@ -163,17 +206,7 @@ struct rpcrdma_rep {
struct rpcrdma_mr_seg { /* chunk descriptors */
union { /* chunk memory handles */
struct ib_mr *rl_mr; /* if registered directly */
- struct rpcrdma_mw { /* if registered from region */
- union {
- struct ib_fmr *fmr;
- struct {
- struct ib_fast_reg_page_list *fr_pgl;
- struct ib_mr *fr_mr;
- enum { FRMR_IS_INVALID, FRMR_IS_VALID } state;
- } frmr;
- } r;
- struct list_head mw_list;
- } *rl_mw;
+ struct rpcrdma_mw *rl_mw; /* if registered from region */
} mr_chunk;
u64 mr_base; /* registration result */
u32 mr_rkey; /* registration result */
@@ -191,6 +224,7 @@ struct rpcrdma_req {
unsigned int rl_niovs; /* 0, 2 or 4 */
unsigned int rl_nchunks; /* non-zero if chunks */
unsigned int rl_connect_cookie; /* retry detection */
+ enum rpcrdma_chunktype rl_rtype, rl_wtype;
struct rpcrdma_buffer *rl_buffer; /* home base for this structure */
struct rpcrdma_rep *rl_reply;/* holder for reply buffer */
struct rpcrdma_mr_seg rl_segments[RPCRDMA_MAX_SEGS];/* chunk segments */
@@ -214,6 +248,7 @@ struct rpcrdma_buffer {
atomic_t rb_credits; /* most recent server credits */
int rb_max_requests;/* client max requests */
struct list_head rb_mws; /* optional memory windows/fmrs/frmrs */
+ struct list_head rb_all;
int rb_send_index;
struct rpcrdma_req **rb_send_bufs;
int rb_recv_index;
@@ -306,7 +341,7 @@ int rpcrdma_ep_create(struct rpcrdma_ep *, struct rpcrdma_ia *,
struct rpcrdma_create_data_internal *);
void rpcrdma_ep_destroy(struct rpcrdma_ep *, struct rpcrdma_ia *);
int rpcrdma_ep_connect(struct rpcrdma_ep *, struct rpcrdma_ia *);
-int rpcrdma_ep_disconnect(struct rpcrdma_ep *, struct rpcrdma_ia *);
+void rpcrdma_ep_disconnect(struct rpcrdma_ep *, struct rpcrdma_ia *);
int rpcrdma_ep_post(struct rpcrdma_ia *, struct rpcrdma_ep *,
struct rpcrdma_req *);
@@ -346,7 +381,9 @@ void rpcrdma_reply_handler(struct rpcrdma_rep *);
/*
* RPC/RDMA protocol calls - xprtrdma/rpc_rdma.c
*/
+ssize_t rpcrdma_marshal_chunks(struct rpc_rqst *, ssize_t);
int rpcrdma_marshal_req(struct rpc_rqst *);
+size_t rpcrdma_max_payload(struct rpcrdma_xprt *);
/* Temporary NFS request map cache. Created in svc_rdma.c */
extern struct kmem_cache *svc_rdma_map_cachep;
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index be8bbd5d65ec..43cd89eacfab 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -594,6 +594,7 @@ static int xs_local_send_request(struct rpc_task *task)
}
switch (status) {
+ case -ENOBUFS:
case -EAGAIN:
status = xs_nospace(task);
break;
@@ -661,6 +662,7 @@ static int xs_udp_send_request(struct rpc_task *task)
dprintk("RPC: sendmsg returned unrecognized error %d\n",
-status);
case -ENETUNREACH:
+ case -ENOBUFS:
case -EPIPE:
case -ECONNREFUSED:
/* When the server has died, an ICMP port unreachable message
@@ -758,6 +760,7 @@ static int xs_tcp_send_request(struct rpc_task *task)
status = -ENOTCONN;
/* Should we call xs_close() here? */
break;
+ case -ENOBUFS:
case -EAGAIN:
status = xs_nospace(task);
break;
@@ -1946,6 +1949,7 @@ static int xs_local_setup_socket(struct sock_xprt *transport)
dprintk("RPC: xprt %p connected to %s\n",
xprt, xprt->address_strings[RPC_DISPLAY_ADDR]);
xprt_set_connected(xprt);
+ case -ENOBUFS:
break;
case -ENOENT:
dprintk("RPC: xprt %p: socket %s does not exist\n",
@@ -2281,6 +2285,7 @@ static void xs_tcp_setup_socket(struct work_struct *work)
case -ECONNREFUSED:
case -ECONNRESET:
case -ENETUNREACH:
+ case -ENOBUFS:
/* retry with existing socket, after a delay */
goto out;
}
@@ -3054,12 +3059,12 @@ static int param_set_uint_minmax(const char *val,
const struct kernel_param *kp,
unsigned int min, unsigned int max)
{
- unsigned long num;
+ unsigned int num;
int ret;
if (!val)
return -EINVAL;
- ret = strict_strtoul(val, 0, &num);
+ ret = kstrtouint(val, 0, &num);
if (ret == -EINVAL || num < min || num > max)
return -EINVAL;
*((unsigned int *)kp->arg) = num;
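/* Editorial note: kstrtouint() parses straight into an unsigned int and
 * fails with -ERANGE when the value does not fit, so the old unsigned long
 * temporary and the deprecated strict_strtoul() call are no longer needed.
 * A minimal illustration (values are examples only):
 *
 *	unsigned int num;
 *
 *	kstrtouint("660", 0, &num);        // returns 0, num == 660
 *	kstrtouint("4294967296", 0, &num); // returns -ERANGE (> UINT_MAX)
 *	kstrtouint("0x1a", 0, &num);       // base 0 accepts hex: num == 26
 */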
diff --git a/net/tipc/port.h b/net/tipc/port.h
index 3f93454592b6..3087da39ee47 100644
--- a/net/tipc/port.h
+++ b/net/tipc/port.h
@@ -179,9 +179,12 @@ static inline int tipc_port_importance(struct tipc_port *port)
return msg_importance(&port->phdr);
}
-static inline void tipc_port_set_importance(struct tipc_port *port, int imp)
+static inline int tipc_port_set_importance(struct tipc_port *port, int imp)
{
+ if (imp > TIPC_CRITICAL_IMPORTANCE)
+ return -EINVAL;
msg_set_importance(&port->phdr, (u32)imp);
+ return 0;
}
#endif
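/* A self-contained user-space illustration of the new behaviour (editorial,
 * not part of the patch; the constants come from <linux/tipc.h>): an
 * importance value above TIPC_CRITICAL_IMPORTANCE is now rejected with
 * EINVAL instead of being silently accepted.
 */
#include <stdio.h>
#include <string.h>
#include <errno.h>
#include <sys/socket.h>
#include <linux/tipc.h>

int main(void)
{
	int fd = socket(AF_TIPC, SOCK_RDM, 0);
	int imp = TIPC_CRITICAL_IMPORTANCE + 1;	/* deliberately out of range */

	if (fd < 0)
		return 1;
	if (setsockopt(fd, SOL_TIPC, TIPC_IMPORTANCE, &imp, sizeof(imp)) < 0)
		printf("rejected as expected: %s\n", strerror(errno));
	return 0;
}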
diff --git a/net/tipc/socket.c b/net/tipc/socket.c
index 7d423ee10897..ff8c8118d56e 100644
--- a/net/tipc/socket.c
+++ b/net/tipc/socket.c
@@ -1973,7 +1973,7 @@ static int tipc_setsockopt(struct socket *sock, int lvl, int opt,
switch (opt) {
case TIPC_IMPORTANCE:
- tipc_port_set_importance(port, value);
+ res = tipc_port_set_importance(port, value);
break;
case TIPC_SRC_DROPPABLE:
if (sock->type != SOCK_STREAM)