diff options
-rw-r--r-- | drivers/net/vxlan.c | 342 | ||||
-rw-r--r-- | include/net/vxlan.h | 39 | ||||
-rw-r--r-- | include/uapi/linux/openvswitch.h | 11 | ||||
-rw-r--r-- | net/openvswitch/Kconfig | 13 | ||||
-rw-r--r-- | net/openvswitch/Makefile | 4 | ||||
-rw-r--r-- | net/openvswitch/vport-vxlan.c | 204 | ||||
-rw-r--r-- | net/openvswitch/vport.c | 3 | ||||
-rw-r--r-- | net/openvswitch/vport.h | 1 |
8 files changed, 477 insertions, 140 deletions
diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index 570ad7aa7204..b9401b53b699 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -27,6 +27,7 @@ #include <linux/igmp.h> #include <linux/etherdevice.h> #include <linux/if_ether.h> +#include <linux/if_vlan.h> #include <linux/hash.h> #include <linux/ethtool.h> #include <net/arp.h> @@ -41,6 +42,7 @@ #include <net/inet_ecn.h> #include <net/net_namespace.h> #include <net/netns/generic.h> +#include <net/vxlan.h> #define VXLAN_VERSION "0.1" @@ -57,6 +59,7 @@ #define VXLAN_VID_MASK (VXLAN_N_VID - 1) /* IP header + UDP + VXLAN + Ethernet header */ #define VXLAN_HEADROOM (20 + 8 + 8 + 14) +#define VXLAN_HLEN (sizeof(struct udphdr) + sizeof(struct vxlanhdr)) #define VXLAN_FLAGS 0x08000000 /* struct vxlanhdr.vx_flags required value. */ @@ -82,16 +85,6 @@ static int vxlan_net_id; static const u8 all_zeros_mac[ETH_ALEN]; -/* per UDP socket information */ -struct vxlan_sock { - struct hlist_node hlist; - struct rcu_head rcu; - struct work_struct del_work; - atomic_t refcnt; - struct socket *sock; - struct hlist_head vni_list[VNI_HASH_SIZE]; -}; - /* per-network namespace private data for this module */ struct vxlan_net { struct list_head vxlan_list; @@ -188,7 +181,7 @@ static inline struct vxlan_rdst *first_remote_rtnl(struct vxlan_fdb *fdb) } /* Find VXLAN socket based on network namespace and UDP port */ -static struct vxlan_sock *vxlan_find_port(struct net *net, __be16 port) +static struct vxlan_sock *vxlan_find_sock(struct net *net, __be16 port) { struct vxlan_sock *vs; @@ -199,16 +192,10 @@ static struct vxlan_sock *vxlan_find_port(struct net *net, __be16 port) return NULL; } -/* Look up VNI in a per net namespace table */ -static struct vxlan_dev *vxlan_find_vni(struct net *net, u32 id, __be16 port) +static struct vxlan_dev *vxlan_vs_find_vni(struct vxlan_sock *vs, u32 id) { - struct vxlan_sock *vs; struct vxlan_dev *vxlan; - vs = vxlan_find_port(net, port); - if (!vs) - return NULL; - hlist_for_each_entry_rcu(vxlan, vni_head(vs, id), hlist) { if (vxlan->default_dst.remote_vni == id) return vxlan; @@ -217,6 +204,18 @@ static struct vxlan_dev *vxlan_find_vni(struct net *net, u32 id, __be16 port) return NULL; } +/* Look up VNI in a per net namespace table */ +static struct vxlan_dev *vxlan_find_vni(struct net *net, u32 id, __be16 port) +{ + struct vxlan_sock *vs; + + vs = vxlan_find_sock(net, port); + if (!vs) + return NULL; + + return vxlan_vs_find_vni(vs, id); +} + /* Fill in neighbour message in skbuff. */ static int vxlan_fdb_info(struct sk_buff *skb, struct vxlan_dev *vxlan, const struct vxlan_fdb *fdb, @@ -802,8 +801,10 @@ static void vxlan_sock_hold(struct vxlan_sock *vs) atomic_inc(&vs->refcnt); } -static void vxlan_sock_release(struct vxlan_net *vn, struct vxlan_sock *vs) +void vxlan_sock_release(struct vxlan_sock *vs) { + struct vxlan_net *vn = net_generic(sock_net(vs->sock->sk), vxlan_net_id); + if (!atomic_dec_and_test(&vs->refcnt)) return; @@ -813,6 +814,7 @@ static void vxlan_sock_release(struct vxlan_net *vn, struct vxlan_sock *vs) queue_work(vxlan_wq, &vs->del_work); } +EXPORT_SYMBOL_GPL(vxlan_sock_release); /* Callback to update multicast group membership when first VNI on * multicast asddress is brought up @@ -821,7 +823,6 @@ static void vxlan_sock_release(struct vxlan_net *vn, struct vxlan_sock *vs) static void vxlan_igmp_join(struct work_struct *work) { struct vxlan_dev *vxlan = container_of(work, struct vxlan_dev, igmp_join); - struct vxlan_net *vn = net_generic(dev_net(vxlan->dev), vxlan_net_id); struct vxlan_sock *vs = vxlan->vn_sock; struct sock *sk = vs->sock->sk; struct ip_mreqn mreq = { @@ -833,7 +834,7 @@ static void vxlan_igmp_join(struct work_struct *work) ip_mc_join_group(sk, &mreq); release_sock(sk); - vxlan_sock_release(vn, vs); + vxlan_sock_release(vs); dev_put(vxlan->dev); } @@ -841,7 +842,6 @@ static void vxlan_igmp_join(struct work_struct *work) static void vxlan_igmp_leave(struct work_struct *work) { struct vxlan_dev *vxlan = container_of(work, struct vxlan_dev, igmp_leave); - struct vxlan_net *vn = net_generic(dev_net(vxlan->dev), vxlan_net_id); struct vxlan_sock *vs = vxlan->vn_sock; struct sock *sk = vs->sock->sk; struct ip_mreqn mreq = { @@ -853,30 +853,23 @@ static void vxlan_igmp_leave(struct work_struct *work) ip_mc_leave_group(sk, &mreq); release_sock(sk); - vxlan_sock_release(vn, vs); + vxlan_sock_release(vs); dev_put(vxlan->dev); } /* Callback from net/ipv4/udp.c to receive packets */ static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb) { - struct iphdr *oip; + struct vxlan_sock *vs; struct vxlanhdr *vxh; - struct vxlan_dev *vxlan; - struct pcpu_tstats *stats; __be16 port; - __u32 vni; - int err; - - /* pop off outer UDP header */ - __skb_pull(skb, sizeof(struct udphdr)); /* Need Vxlan and inner Ethernet header to be present */ - if (!pskb_may_pull(skb, sizeof(struct vxlanhdr))) + if (!pskb_may_pull(skb, VXLAN_HLEN)) goto error; - /* Drop packets with reserved bits set */ - vxh = (struct vxlanhdr *) skb->data; + /* Return packets with reserved bits set */ + vxh = (struct vxlanhdr *)(udp_hdr(skb) + 1); if (vxh->vx_flags != htonl(VXLAN_FLAGS) || (vxh->vx_vni & htonl(0xff))) { netdev_dbg(skb->dev, "invalid vxlan flags=%#x vni=%#x\n", @@ -884,28 +877,44 @@ static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb) goto error; } - __skb_pull(skb, sizeof(struct vxlanhdr)); + if (iptunnel_pull_header(skb, VXLAN_HLEN, htons(ETH_P_TEB))) + goto drop; - /* Is this VNI defined? */ - vni = ntohl(vxh->vx_vni) >> 8; port = inet_sk(sk)->inet_sport; - vxlan = vxlan_find_vni(sock_net(sk), vni, port); - if (!vxlan) { - netdev_dbg(skb->dev, "unknown vni %d port %u\n", - vni, ntohs(port)); + + vs = vxlan_find_sock(sock_net(sk), port); + if (!vs) goto drop; - } - if (!pskb_may_pull(skb, ETH_HLEN)) { - vxlan->dev->stats.rx_length_errors++; - vxlan->dev->stats.rx_errors++; + vs->rcv(vs, skb, vxh->vx_vni); + return 0; + +drop: + /* Consume bad packet */ + kfree_skb(skb); + return 0; + +error: + /* Return non vxlan pkt */ + return 1; +} + +static void vxlan_rcv(struct vxlan_sock *vs, + struct sk_buff *skb, __be32 vx_vni) +{ + struct iphdr *oip; + struct vxlan_dev *vxlan; + struct pcpu_tstats *stats; + __u32 vni; + int err; + + vni = ntohl(vx_vni) >> 8; + /* Is this VNI defined? */ + vxlan = vxlan_vs_find_vni(vs, vni); + if (!vxlan) goto drop; - } skb_reset_mac_header(skb); - - /* Re-examine inner Ethernet packet */ - oip = ip_hdr(skb); skb->protocol = eth_type_trans(skb, vxlan->dev); /* Ignore packet loops (and multicast echo) */ @@ -913,11 +922,12 @@ static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb) vxlan->dev->dev_addr) == 0) goto drop; + /* Re-examine inner Ethernet packet */ + oip = ip_hdr(skb); if ((vxlan->flags & VXLAN_F_LEARN) && vxlan_snoop(skb->dev, oip->saddr, eth_hdr(skb)->h_source)) goto drop; - __skb_tunnel_rx(skb, vxlan->dev); skb_reset_network_header(skb); /* If the NIC driver gave us an encapsulated packet with @@ -951,16 +961,10 @@ static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb) netif_rx(skb); - return 0; -error: - /* Put UDP header back */ - __skb_push(skb, sizeof(struct udphdr)); - - return 1; + return; drop: /* Consume bad packet */ kfree_skb(skb); - return 0; } static int arp_reduce(struct net_device *dev, struct sk_buff *skb) @@ -1079,11 +1083,8 @@ static void vxlan_sock_put(struct sk_buff *skb) } /* On transmit, associate with the tunnel socket */ -static void vxlan_set_owner(struct net_device *dev, struct sk_buff *skb) +static void vxlan_set_owner(struct sock *sk, struct sk_buff *skb) { - struct vxlan_dev *vxlan = netdev_priv(dev); - struct sock *sk = vxlan->vn_sock->sock->sk; - skb_orphan(skb); sock_hold(sk); skb->sk = sk; @@ -1095,9 +1096,9 @@ static void vxlan_set_owner(struct net_device *dev, struct sk_buff *skb) * better and maybe available from hardware * secondary choice is to use jhash on the Ethernet header */ -static __be16 vxlan_src_port(const struct vxlan_dev *vxlan, struct sk_buff *skb) +__be16 vxlan_src_port(__u16 port_min, __u16 port_max, struct sk_buff *skb) { - unsigned int range = (vxlan->port_max - vxlan->port_min) + 1; + unsigned int range = (port_max - port_min) + 1; u32 hash; hash = skb_get_rxhash(skb); @@ -1105,8 +1106,9 @@ static __be16 vxlan_src_port(const struct vxlan_dev *vxlan, struct sk_buff *skb) hash = jhash(skb->data, 2 * ETH_ALEN, (__force u32) skb->protocol); - return htons((((u64) hash * range) >> 32) + vxlan->port_min); + return htons((((u64) hash * range) >> 32) + port_min); } +EXPORT_SYMBOL_GPL(vxlan_src_port); static int handle_offloads(struct sk_buff *skb) { @@ -1122,6 +1124,64 @@ static int handle_offloads(struct sk_buff *skb) return 0; } +int vxlan_xmit_skb(struct net *net, struct vxlan_sock *vs, + struct rtable *rt, struct sk_buff *skb, + __be32 src, __be32 dst, __u8 tos, __u8 ttl, __be16 df, + __be16 src_port, __be16 dst_port, __be32 vni) +{ + struct vxlanhdr *vxh; + struct udphdr *uh; + int min_headroom; + int err; + + if (!skb->encapsulation) { + skb_reset_inner_headers(skb); + skb->encapsulation = 1; + } + + min_headroom = LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len + + VXLAN_HLEN + sizeof(struct iphdr) + + (vlan_tx_tag_present(skb) ? VLAN_HLEN : 0); + + /* Need space for new headers (invalidates iph ptr) */ + err = skb_cow_head(skb, min_headroom); + if (unlikely(err)) + return err; + + if (vlan_tx_tag_present(skb)) { + if (WARN_ON(!__vlan_put_tag(skb, + skb->vlan_proto, + vlan_tx_tag_get(skb)))) + return -ENOMEM; + + skb->vlan_tci = 0; + } + + vxh = (struct vxlanhdr *) __skb_push(skb, sizeof(*vxh)); + vxh->vx_flags = htonl(VXLAN_FLAGS); + vxh->vx_vni = vni; + + __skb_push(skb, sizeof(*uh)); + skb_reset_transport_header(skb); + uh = udp_hdr(skb); + + uh->dest = dst_port; + uh->source = src_port; + + uh->len = htons(skb->len); + uh->check = 0; + + vxlan_set_owner(vs->sock->sk, skb); + + err = handle_offloads(skb); + if (err) + return err; + + return iptunnel_xmit(net, rt, skb, src, dst, + IPPROTO_UDP, tos, ttl, df); +} +EXPORT_SYMBOL_GPL(vxlan_xmit_skb); + /* Bypass encapsulation if the destination is local */ static void vxlan_encap_bypass(struct sk_buff *skb, struct vxlan_dev *src_vxlan, struct vxlan_dev *dst_vxlan) @@ -1159,8 +1219,6 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, struct vxlan_dev *vxlan = netdev_priv(dev); struct rtable *rt; const struct iphdr *old_iph; - struct vxlanhdr *vxh; - struct udphdr *uh; struct flowi4 fl4; __be32 dst; __be16 src_port, dst_port; @@ -1182,15 +1240,6 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, goto drop; } - if (!skb->encapsulation) { - skb_reset_inner_headers(skb); - skb->encapsulation = 1; - } - - /* Need space for new headers (invalidates iph ptr) */ - if (skb_cow_head(skb, VXLAN_HEADROOM)) - goto drop; - old_iph = ip_hdr(skb); ttl = vxlan->ttl; @@ -1201,7 +1250,7 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, if (tos == 1) tos = ip_tunnel_get_dsfield(old_iph, skb); - src_port = vxlan_src_port(vxlan, skb); + src_port = vxlan_src_port(vxlan->port_min, vxlan->port_max, skb); memset(&fl4, 0, sizeof(fl4)); fl4.flowi4_oif = rdst->remote_ifindex; @@ -1218,9 +1267,8 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, if (rt->dst.dev == dev) { netdev_dbg(dev, "circular route to %pI4\n", &dst); - ip_rt_put(rt); dev->stats.collisions++; - goto tx_error; + goto rt_tx_error; } /* Bypass encapsulation if the destination is local */ @@ -1235,30 +1283,16 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, vxlan_encap_bypass(skb, vxlan, dst_vxlan); return; } - vxh = (struct vxlanhdr *) __skb_push(skb, sizeof(*vxh)); - vxh->vx_flags = htonl(VXLAN_FLAGS); - vxh->vx_vni = htonl(vni << 8); - - __skb_push(skb, sizeof(*uh)); - skb_reset_transport_header(skb); - uh = udp_hdr(skb); - - uh->dest = dst_port; - uh->source = src_port; - - uh->len = htons(skb->len); - uh->check = 0; - - vxlan_set_owner(dev, skb); - - if (handle_offloads(skb)) - goto drop; tos = ip_tunnel_ecn_encap(tos, old_iph, skb); ttl = ttl ? : ip4_dst_hoplimit(&rt->dst); - err = iptunnel_xmit(dev_net(dev), rt, skb, fl4.saddr, dst, - IPPROTO_UDP, tos, ttl, df); + err = vxlan_xmit_skb(dev_net(dev), vxlan->vn_sock, rt, skb, + fl4.saddr, dst, tos, ttl, df, + src_port, dst_port, htonl(vni << 8)); + + if (err < 0) + goto rt_tx_error; iptunnel_xmit_stats(err, &dev->stats, dev->tstats); return; @@ -1267,6 +1301,8 @@ drop: dev->stats.tx_dropped++; goto tx_free; +rt_tx_error: + ip_rt_put(rt); tx_error: dev->stats.tx_errors++; tx_free: @@ -1365,25 +1401,31 @@ static void vxlan_cleanup(unsigned long arg) mod_timer(&vxlan->age_timer, next_timer); } +static void vxlan_vs_add_dev(struct vxlan_sock *vs, struct vxlan_dev *vxlan) +{ + __u32 vni = vxlan->default_dst.remote_vni; + + vxlan->vn_sock = vs; + hlist_add_head_rcu(&vxlan->hlist, vni_head(vs, vni)); +} + /* Setup stats when device is created */ static int vxlan_init(struct net_device *dev) { struct vxlan_dev *vxlan = netdev_priv(dev); struct vxlan_net *vn = net_generic(dev_net(dev), vxlan_net_id); struct vxlan_sock *vs; - __u32 vni = vxlan->default_dst.remote_vni; dev->tstats = alloc_percpu(struct pcpu_tstats); if (!dev->tstats) return -ENOMEM; spin_lock(&vn->sock_lock); - vs = vxlan_find_port(dev_net(dev), vxlan->dst_port); + vs = vxlan_find_sock(dev_net(dev), vxlan->dst_port); if (vs) { /* If we have a socket with same port already, reuse it */ atomic_inc(&vs->refcnt); - vxlan->vn_sock = vs; - hlist_add_head_rcu(&vxlan->hlist, vni_head(vs, vni)); + vxlan_vs_add_dev(vs, vxlan); } else { /* otherwise make new socket outside of RTNL */ dev_hold(dev); @@ -1408,13 +1450,12 @@ static void vxlan_fdb_delete_default(struct vxlan_dev *vxlan) static void vxlan_uninit(struct net_device *dev) { struct vxlan_dev *vxlan = netdev_priv(dev); - struct vxlan_net *vn = net_generic(dev_net(dev), vxlan_net_id); struct vxlan_sock *vs = vxlan->vn_sock; vxlan_fdb_delete_default(vxlan); if (vs) - vxlan_sock_release(vn, vs); + vxlan_sock_release(vs); free_percpu(dev->tstats); } @@ -1530,8 +1571,11 @@ static void vxlan_setup(struct net_device *dev) dev->features |= NETIF_F_RXCSUM; dev->features |= NETIF_F_GSO_SOFTWARE; + dev->vlan_features = dev->features; + dev->features |= NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_STAG_TX; dev->hw_features |= NETIF_F_SG | NETIF_F_HW_CSUM | NETIF_F_RXCSUM; dev->hw_features |= NETIF_F_GSO_SOFTWARE; + dev->hw_features |= NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_STAG_TX; dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; @@ -1631,8 +1675,10 @@ static void vxlan_del_work(struct work_struct *work) kfree_rcu(vs, rcu); } -static struct vxlan_sock *vxlan_socket_create(struct net *net, __be16 port) +static struct vxlan_sock *vxlan_socket_create(struct net *net, __be16 port, + vxlan_rcv_t *rcv, void *data) { + struct vxlan_net *vn = net_generic(net, vxlan_net_id); struct vxlan_sock *vs; struct sock *sk; struct sockaddr_in vxlan_addr = { @@ -1644,8 +1690,10 @@ static struct vxlan_sock *vxlan_socket_create(struct net *net, __be16 port) unsigned int h; vs = kmalloc(sizeof(*vs), GFP_KERNEL); - if (!vs) + if (!vs) { + pr_debug("memory alocation failure\n"); return ERR_PTR(-ENOMEM); + } for (h = 0; h < VNI_HASH_SIZE; ++h) INIT_HLIST_HEAD(&vs->vni_list[h]); @@ -1673,57 +1721,70 @@ static struct vxlan_sock *vxlan_socket_create(struct net *net, __be16 port) kfree(vs); return ERR_PTR(rc); } + atomic_set(&vs->refcnt, 1); + vs->rcv = rcv; + vs->data = data; /* Disable multicast loopback */ inet_sk(sk)->mc_loop = 0; + spin_lock(&vn->sock_lock); + hlist_add_head_rcu(&vs->hlist, vs_head(net, port)); + spin_unlock(&vn->sock_lock); /* Mark socket as an encapsulation socket. */ udp_sk(sk)->encap_type = 1; udp_sk(sk)->encap_rcv = vxlan_udp_encap_recv; udp_encap_enable(); - atomic_set(&vs->refcnt, 1); + return vs; +} + +struct vxlan_sock *vxlan_sock_add(struct net *net, __be16 port, + vxlan_rcv_t *rcv, void *data, + bool no_share) +{ + struct vxlan_net *vn = net_generic(net, vxlan_net_id); + struct vxlan_sock *vs; + + vs = vxlan_socket_create(net, port, rcv, data); + if (!IS_ERR(vs)) + return vs; + + if (no_share) /* Return error if sharing is not allowed. */ + return vs; + + spin_lock(&vn->sock_lock); + vs = vxlan_find_sock(net, port); + if (vs) { + if (vs->rcv == rcv) + atomic_inc(&vs->refcnt); + else + vs = ERR_PTR(-EBUSY); + } + spin_unlock(&vn->sock_lock); + + if (!vs) + vs = ERR_PTR(-EINVAL); return vs; } +EXPORT_SYMBOL_GPL(vxlan_sock_add); /* Scheduled at device creation to bind to a socket */ static void vxlan_sock_work(struct work_struct *work) { - struct vxlan_dev *vxlan - = container_of(work, struct vxlan_dev, sock_work); - struct net_device *dev = vxlan->dev; - struct net *net = dev_net(dev); - __u32 vni = vxlan->default_dst.remote_vni; - __be16 port = vxlan->dst_port; + struct vxlan_dev *vxlan = container_of(work, struct vxlan_dev, sock_work); + struct net *net = dev_net(vxlan->dev); struct vxlan_net *vn = net_generic(net, vxlan_net_id); - struct vxlan_sock *nvs, *ovs; - - nvs = vxlan_socket_create(net, port); - if (IS_ERR(nvs)) { - netdev_err(vxlan->dev, "Can not create UDP socket, %ld\n", - PTR_ERR(nvs)); - goto out; - } + __be16 port = vxlan->dst_port; + struct vxlan_sock *nvs; + nvs = vxlan_sock_add(net, port, vxlan_rcv, NULL, false); spin_lock(&vn->sock_lock); - /* Look again to see if can reuse socket */ - ovs = vxlan_find_port(net, port); - if (ovs) { - atomic_inc(&ovs->refcnt); - vxlan->vn_sock = ovs; - hlist_add_head_rcu(&vxlan->hlist, vni_head(ovs, vni)); - spin_unlock(&vn->sock_lock); - - sk_release_kernel(nvs->sock->sk); - kfree(nvs); - } else { - vxlan->vn_sock = nvs; - hlist_add_head_rcu(&nvs->hlist, vs_head(net, port)); - hlist_add_head_rcu(&vxlan->hlist, vni_head(nvs, vni)); - spin_unlock(&vn->sock_lock); - } -out: - dev_put(dev); + if (!IS_ERR(nvs)) + vxlan_vs_add_dev(nvs, vxlan); + spin_unlock(&vn->sock_lock); + + dev_put(vxlan->dev); } static int vxlan_newlink(struct net *net, struct net_device *dev, @@ -1838,7 +1899,8 @@ static void vxlan_dellink(struct net_device *dev, struct list_head *head) struct vxlan_dev *vxlan = netdev_priv(dev); spin_lock(&vn->sock_lock); - hlist_del_rcu(&vxlan->hlist); + if (!hlist_unhashed(&vxlan->hlist)) + hlist_del_rcu(&vxlan->hlist); spin_unlock(&vn->sock_lock); list_del(&vxlan->next); diff --git a/include/net/vxlan.h b/include/net/vxlan.h new file mode 100644 index 000000000000..ad342e3688a0 --- /dev/null +++ b/include/net/vxlan.h @@ -0,0 +1,39 @@ +#ifndef __NET_VXLAN_H +#define __NET_VXLAN_H 1 + +#include <linux/skbuff.h> +#include <linux/netdevice.h> +#include <linux/udp.h> + +#define VNI_HASH_BITS 10 +#define VNI_HASH_SIZE (1<<VNI_HASH_BITS) + +struct vxlan_sock; +typedef void (vxlan_rcv_t)(struct vxlan_sock *vh, struct sk_buff *skb, __be32 key); + +/* per UDP socket information */ +struct vxlan_sock { + struct hlist_node hlist; + vxlan_rcv_t *rcv; + void *data; + struct work_struct del_work; + struct socket *sock; + struct rcu_head rcu; + struct hlist_head vni_list[VNI_HASH_SIZE]; + atomic_t refcnt; +}; + +struct vxlan_sock *vxlan_sock_add(struct net *net, __be16 port, + vxlan_rcv_t *rcv, void *data, + bool no_share); + +void vxlan_sock_release(struct vxlan_sock *vs); + +int vxlan_xmit_skb(struct net *net, struct vxlan_sock *vs, + struct rtable *rt, struct sk_buff *skb, + __be32 src, __be32 dst, __u8 tos, __u8 ttl, __be16 df, + __be16 src_port, __be16 dst_port, __be32 vni); + +__be16 vxlan_src_port(__u16 port_min, __u16 port_max, struct sk_buff *skb); + +#endif diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h index c55efaaa9bb4..52490b0e62b5 100644 --- a/include/uapi/linux/openvswitch.h +++ b/include/uapi/linux/openvswitch.h @@ -165,6 +165,7 @@ enum ovs_vport_type { OVS_VPORT_TYPE_NETDEV, /* network device */ OVS_VPORT_TYPE_INTERNAL, /* network device implemented by datapath */ OVS_VPORT_TYPE_GRE, /* GRE tunnel. */ + OVS_VPORT_TYPE_VXLAN, /* VXLAN tunnel. */ __OVS_VPORT_TYPE_MAX }; @@ -211,6 +212,16 @@ enum ovs_vport_attr { #define OVS_VPORT_ATTR_MAX (__OVS_VPORT_ATTR_MAX - 1) +/* OVS_VPORT_ATTR_OPTIONS attributes for tunnels. + */ +enum { + OVS_TUNNEL_ATTR_UNSPEC, + OVS_TUNNEL_ATTR_DST_PORT, /* 16-bit UDP port, used by L4 tunnels. */ + __OVS_TUNNEL_ATTR_MAX +}; + +#define OVS_TUNNEL_ATTR_MAX (__OVS_TUNNEL_ATTR_MAX - 1) + /* Flows. */ #define OVS_FLOW_FAMILY "ovs_flow" diff --git a/net/openvswitch/Kconfig b/net/openvswitch/Kconfig index 27ee56b688a3..bed30e69baa7 100644 --- a/net/openvswitch/Kconfig +++ b/net/openvswitch/Kconfig @@ -40,3 +40,16 @@ config OPENVSWITCH_GRE Say N to exclude this support and reduce the binary size. If unsure, say Y. + +config OPENVSWITCH_VXLAN + bool "Open vSwitch VXLAN tunneling support" + depends on INET + depends on OPENVSWITCH + depends on VXLAN && !(OPENVSWITCH=y && VXLAN=m) + default y + ---help--- + If you say Y here, then the Open vSwitch will be able create vxlan vport. + + Say N to exclude this support and reduce the binary size. + + If unsure, say Y. diff --git a/net/openvswitch/Makefile b/net/openvswitch/Makefile index 01bddb2991e3..82e4ee54a44b 100644 --- a/net/openvswitch/Makefile +++ b/net/openvswitch/Makefile @@ -13,3 +13,7 @@ openvswitch-y := \ vport-gre.o \ vport-internal_dev.o \ vport-netdev.o + +ifneq ($(CONFIG_OPENVSWITCH_VXLAN),) +openvswitch-y += vport-vxlan.o +endif diff --git a/net/openvswitch/vport-vxlan.c b/net/openvswitch/vport-vxlan.c new file mode 100644 index 000000000000..36848bd54a77 --- /dev/null +++ b/net/openvswitch/vport-vxlan.c @@ -0,0 +1,204 @@ +/* + * Copyright (c) 2013 Nicira, Inc. + * Copyright (c) 2013 Cisco Systems, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/in.h> +#include <linux/ip.h> +#include <linux/net.h> +#include <linux/rculist.h> +#include <linux/udp.h> + +#include <net/icmp.h> +#include <net/ip.h> +#include <net/udp.h> +#include <net/ip_tunnels.h> +#include <net/udp.h> +#include <net/rtnetlink.h> +#include <net/route.h> +#include <net/dsfield.h> +#include <net/inet_ecn.h> +#include <net/net_namespace.h> +#include <net/netns/generic.h> +#include <net/vxlan.h> + +#include "datapath.h" +#include "vport.h" + +/** + * struct vxlan_port - Keeps track of open UDP ports + * @vs: vxlan_sock created for the port. + * @name: vport name. + */ +struct vxlan_port { + struct vxlan_sock *vs; + char name[IFNAMSIZ]; +}; + +static inline struct vxlan_port *vxlan_vport(const struct vport *vport) +{ + return vport_priv(vport); +} + +/* Called with rcu_read_lock and BH disabled. */ +static void vxlan_rcv(struct vxlan_sock *vs, struct sk_buff *skb, __be32 vx_vni) +{ + struct ovs_key_ipv4_tunnel tun_key; + struct vport *vport = vs->data; + struct iphdr *iph; + __be64 key; + + /* Save outer tunnel values */ + iph = ip_hdr(skb); + key = cpu_to_be64(ntohl(vx_vni) >> 8); + ovs_flow_tun_key_init(&tun_key, iph, key, TUNNEL_KEY); + + ovs_vport_receive(vport, skb, &tun_key); +} + +static int vxlan_get_options(const struct vport *vport, struct sk_buff *skb) +{ + struct vxlan_port *vxlan_port = vxlan_vport(vport); + __be16 dst_port = inet_sk(vxlan_port->vs->sock->sk)->inet_sport; + + if (nla_put_u16(skb, OVS_TUNNEL_ATTR_DST_PORT, ntohs(dst_port))) + return -EMSGSIZE; + return 0; +} + +static void vxlan_tnl_destroy(struct vport *vport) +{ + struct vxlan_port *vxlan_port = vxlan_vport(vport); + + vxlan_sock_release(vxlan_port->vs); + + ovs_vport_deferred_free(vport); +} + +static struct vport *vxlan_tnl_create(const struct vport_parms *parms) +{ + struct net *net = ovs_dp_get_net(parms->dp); + struct nlattr *options = parms->options; + struct vxlan_port *vxlan_port; + struct vxlan_sock *vs; + struct vport *vport; + struct nlattr *a; + u16 dst_port; + int err; + + if (!options) { + err = -EINVAL; + goto error; + } + a = nla_find_nested(options, OVS_TUNNEL_ATTR_DST_PORT); + if (a && nla_len(a) == sizeof(u16)) { + dst_port = nla_get_u16(a); + } else { + /* Require destination port from userspace. */ + err = -EINVAL; + goto error; + } + + vport = ovs_vport_alloc(sizeof(struct vxlan_port), + &ovs_vxlan_vport_ops, parms); + if (IS_ERR(vport)) + return vport; + + vxlan_port = vxlan_vport(vport); + strncpy(vxlan_port->name, parms->name, IFNAMSIZ); + + vs = vxlan_sock_add(net, htons(dst_port), vxlan_rcv, vport, true); + if (IS_ERR(vs)) { + ovs_vport_free(vport); + return (void *)vs; + } + vxlan_port->vs = vs; + + return vport; + +error: + return ERR_PTR(err); +} + +static int vxlan_tnl_send(struct vport *vport, struct sk_buff *skb) +{ + struct net *net = ovs_dp_get_net(vport->dp); + struct vxlan_port *vxlan_port = vxlan_vport(vport); + __be16 dst_port = inet_sk(vxlan_port->vs->sock->sk)->inet_sport; + struct rtable *rt; + struct flowi4 fl; + __be16 src_port; + int port_min; + int port_max; + __be16 df; + int err; + + if (unlikely(!OVS_CB(skb)->tun_key)) { + err = -EINVAL; + goto error; + } + + /* Route lookup */ + memset(&fl, 0, sizeof(fl)); + fl.daddr = OVS_CB(skb)->tun_key->ipv4_dst; + fl.saddr = OVS_CB(skb)->tun_key->ipv4_src; + fl.flowi4_tos = RT_TOS(OVS_CB(skb)->tun_key->ipv4_tos); + fl.flowi4_mark = skb->mark; + fl.flowi4_proto = IPPROTO_UDP; + + rt = ip_route_output_key(net, &fl); + if (IS_ERR(rt)) { + err = PTR_ERR(rt); + goto error; + } + + df = OVS_CB(skb)->tun_key->tun_flags & TUNNEL_DONT_FRAGMENT ? + htons(IP_DF) : 0; + + skb->local_df = 1; + + inet_get_local_port_range(&port_min, &port_max); + src_port = vxlan_src_port(port_min, port_max, skb); + + err = vxlan_xmit_skb(net, vxlan_port->vs, rt, skb, + fl.saddr, OVS_CB(skb)->tun_key->ipv4_dst, + OVS_CB(skb)->tun_key->ipv4_tos, + OVS_CB(skb)->tun_key->ipv4_ttl, df, + src_port, dst_port, + htonl(be64_to_cpu(OVS_CB(skb)->tun_key->tun_id) << 8)); + if (err < 0) + ip_rt_put(rt); +error: + return err; +} + +static const char *vxlan_get_name(const struct vport *vport) +{ + struct vxlan_port *vxlan_port = vxlan_vport(vport); + return vxlan_port->name; +} + +const struct vport_ops ovs_vxlan_vport_ops = { + .type = OVS_VPORT_TYPE_VXLAN, + .create = vxlan_tnl_create, + .destroy = vxlan_tnl_destroy, + .get_name = vxlan_get_name, + .get_options = vxlan_get_options, + .send = vxlan_tnl_send, +}; diff --git a/net/openvswitch/vport.c b/net/openvswitch/vport.c index d4c7fa04ce08..d69e0c06dfde 100644 --- a/net/openvswitch/vport.c +++ b/net/openvswitch/vport.c @@ -42,6 +42,9 @@ static const struct vport_ops *vport_ops_list[] = { #ifdef CONFIG_OPENVSWITCH_GRE &ovs_gre_vport_ops, #endif +#ifdef CONFIG_OPENVSWITCH_VXLAN + &ovs_vxlan_vport_ops, +#endif }; /* Protected by RCU read lock for reading, ovs_mutex for writing. */ diff --git a/net/openvswitch/vport.h b/net/openvswitch/vport.h index 376045c42f8b..1a9fbcec6e1b 100644 --- a/net/openvswitch/vport.h +++ b/net/openvswitch/vport.h @@ -199,6 +199,7 @@ void ovs_vport_record_error(struct vport *, enum vport_err_type err_type); extern const struct vport_ops ovs_netdev_vport_ops; extern const struct vport_ops ovs_internal_vport_ops; extern const struct vport_ops ovs_gre_vport_ops; +extern const struct vport_ops ovs_vxlan_vport_ops; static inline void ovs_skb_postpush_rcsum(struct sk_buff *skb, const void *start, unsigned int len) |