diff options
Diffstat (limited to 'net/core')
-rw-r--r-- | net/core/datagram.c | 45 | ||||
-rw-r--r-- | net/core/dev.c | 108 | ||||
-rw-r--r-- | net/core/dev_addr_lists.c | 97 | ||||
-rw-r--r-- | net/core/dev_ioctl.c | 2 | ||||
-rw-r--r-- | net/core/devlink.c | 5 | ||||
-rw-r--r-- | net/core/filter.c | 409 | ||||
-rw-r--r-- | net/core/flow_dissector.c | 3 | ||||
-rw-r--r-- | net/core/neighbour.c | 174 | ||||
-rw-r--r-- | net/core/net-sysfs.c | 2 | ||||
-rw-r--r-- | net/core/net_namespace.c | 159 | ||||
-rw-r--r-- | net/core/netpoll.c | 2 | ||||
-rw-r--r-- | net/core/rtnetlink.c | 353 | ||||
-rw-r--r-- | net/core/skbuff.c | 115 | ||||
-rw-r--r-- | net/core/sock.c | 14 | ||||
-rw-r--r-- | net/core/sock_reuseport.c | 1 | ||||
-rw-r--r-- | net/core/stream.c | 2 |
16 files changed, 1084 insertions, 407 deletions
diff --git a/net/core/datagram.c b/net/core/datagram.c index 57f3a6fcfc1e..4bf62b1afa3b 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -728,49 +728,6 @@ fault: return -EFAULT; } -__sum16 __skb_checksum_complete_head(struct sk_buff *skb, int len) -{ - __sum16 sum; - - sum = csum_fold(skb_checksum(skb, 0, len, skb->csum)); - if (likely(!sum)) { - if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) && - !skb->csum_complete_sw) - netdev_rx_csum_fault(skb->dev); - } - if (!skb_shared(skb)) - skb->csum_valid = !sum; - return sum; -} -EXPORT_SYMBOL(__skb_checksum_complete_head); - -__sum16 __skb_checksum_complete(struct sk_buff *skb) -{ - __wsum csum; - __sum16 sum; - - csum = skb_checksum(skb, 0, skb->len, 0); - - /* skb->csum holds pseudo checksum */ - sum = csum_fold(csum_add(skb->csum, csum)); - if (likely(!sum)) { - if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) && - !skb->csum_complete_sw) - netdev_rx_csum_fault(skb->dev); - } - - if (!skb_shared(skb)) { - /* Save full packet checksum */ - skb->csum = csum; - skb->ip_summed = CHECKSUM_COMPLETE; - skb->csum_complete_sw = 1; - skb->csum_valid = !sum; - } - - return sum; -} -EXPORT_SYMBOL(__skb_checksum_complete); - /** * skb_copy_and_csum_datagram_msg - Copy and checksum skb to user iovec. * @skb: skbuff @@ -810,7 +767,7 @@ int skb_copy_and_csum_datagram_msg(struct sk_buff *skb, if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) && !skb->csum_complete_sw) - netdev_rx_csum_fault(NULL); + netdev_rx_csum_fault(NULL, skb); } return 0; fault: diff --git a/net/core/dev.c b/net/core/dev.c index 722d50dbf8a4..754284873355 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -162,6 +162,9 @@ static struct list_head offload_base __read_mostly; static int netif_rx_internal(struct sk_buff *skb); static int call_netdevice_notifiers_info(unsigned long val, struct netdev_notifier_info *info); +static int call_netdevice_notifiers_extack(unsigned long val, + struct net_device *dev, + struct netlink_ext_ack *extack); static struct napi_struct *napi_by_id(unsigned int napi_id); /* @@ -1361,7 +1364,7 @@ void netdev_notify_peers(struct net_device *dev) } EXPORT_SYMBOL(netdev_notify_peers); -static int __dev_open(struct net_device *dev) +static int __dev_open(struct net_device *dev, struct netlink_ext_ack *extack) { const struct net_device_ops *ops = dev->netdev_ops; int ret; @@ -1377,7 +1380,7 @@ static int __dev_open(struct net_device *dev) */ netpoll_poll_disable(dev); - ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev); + ret = call_netdevice_notifiers_extack(NETDEV_PRE_UP, dev, extack); ret = notifier_to_errno(ret); if (ret) return ret; @@ -1406,7 +1409,8 @@ static int __dev_open(struct net_device *dev) /** * dev_open - prepare an interface for use. - * @dev: device to open + * @dev: device to open + * @extack: netlink extended ack * * Takes a device from down to up state. The device's private open * function is invoked and then the multicast lists are loaded. Finally @@ -1416,14 +1420,14 @@ static int __dev_open(struct net_device *dev) * Calling this function on an active interface is a nop. On a failure * a negative errno code is returned. */ -int dev_open(struct net_device *dev) +int dev_open(struct net_device *dev, struct netlink_ext_ack *extack) { int ret; if (dev->flags & IFF_UP) return 0; - ret = __dev_open(dev); + ret = __dev_open(dev, extack); if (ret < 0) return ret; @@ -1733,6 +1737,18 @@ static int call_netdevice_notifiers_info(unsigned long val, return raw_notifier_call_chain(&netdev_chain, val, info); } +static int call_netdevice_notifiers_extack(unsigned long val, + struct net_device *dev, + struct netlink_ext_ack *extack) +{ + struct netdev_notifier_info info = { + .dev = dev, + .extack = extack, + }; + + return call_netdevice_notifiers_info(val, &info); +} + /** * call_netdevice_notifiers - call all network notifier blocks * @val: value passed unmodified to notifier function @@ -1744,11 +1760,7 @@ static int call_netdevice_notifiers_info(unsigned long val, int call_netdevice_notifiers(unsigned long val, struct net_device *dev) { - struct netdev_notifier_info info = { - .dev = dev, - }; - - return call_netdevice_notifiers_info(val, &info); + return call_netdevice_notifiers_extack(val, dev, NULL); } EXPORT_SYMBOL(call_netdevice_notifiers); @@ -3096,10 +3108,17 @@ EXPORT_SYMBOL(__skb_gso_segment); /* Take action when hardware reception checksum errors are detected. */ #ifdef CONFIG_BUG -void netdev_rx_csum_fault(struct net_device *dev) +void netdev_rx_csum_fault(struct net_device *dev, struct sk_buff *skb) { if (net_ratelimit()) { pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>"); + if (dev) + pr_err("dev features: %pNF\n", &dev->features); + pr_err("skb len=%u data_len=%u pkt_type=%u gso_size=%u gso_type=%u nr_frags=%u ip_summed=%u csum=%x csum_complete_sw=%d csum_valid=%d csum_level=%u\n", + skb->len, skb->data_len, skb->pkt_type, + skb_shinfo(skb)->gso_size, skb_shinfo(skb)->gso_type, + skb_shinfo(skb)->nr_frags, skb->ip_summed, skb->csum, + skb->csum_complete_sw, skb->csum_valid, skb->csum_level); dump_stack(); } } @@ -4525,9 +4544,14 @@ static int netif_rx_internal(struct sk_buff *skb) int netif_rx(struct sk_buff *skb) { + int ret; + trace_netif_rx_entry(skb); - return netif_rx_internal(skb); + ret = netif_rx_internal(skb); + trace_netif_rx_exit(ret); + + return ret; } EXPORT_SYMBOL(netif_rx); @@ -4542,6 +4566,7 @@ int netif_rx_ni(struct sk_buff *skb) if (local_softirq_pending()) do_softirq(); preempt_enable(); + trace_netif_rx_ni_exit(err); return err; } @@ -4894,7 +4919,7 @@ skip_classify: * and set skb->priority like in vlan_do_receive() * For the time being, just ignore Priority Code Point */ - skb->vlan_tci = 0; + __vlan_hwaccel_clear_tag(skb); } type = skb->protocol; @@ -5227,9 +5252,14 @@ static void netif_receive_skb_list_internal(struct list_head *head) */ int netif_receive_skb(struct sk_buff *skb) { + int ret; + trace_netif_receive_skb_entry(skb); - return netif_receive_skb_internal(skb); + ret = netif_receive_skb_internal(skb); + trace_netif_receive_skb_exit(ret); + + return ret; } EXPORT_SYMBOL(netif_receive_skb); @@ -5249,9 +5279,12 @@ void netif_receive_skb_list(struct list_head *head) if (list_empty(head)) return; - list_for_each_entry(skb, head, list) - trace_netif_receive_skb_list_entry(skb); + if (trace_netif_receive_skb_list_entry_enabled()) { + list_for_each_entry(skb, head, list) + trace_netif_receive_skb_list_entry(skb); + } netif_receive_skb_list_internal(head); + trace_netif_receive_skb_list_exit(0); } EXPORT_SYMBOL(netif_receive_skb_list); @@ -5362,11 +5395,13 @@ static void __napi_gro_flush_chain(struct napi_struct *napi, u32 index, */ void napi_gro_flush(struct napi_struct *napi, bool flush_old) { - u32 i; + unsigned long bitmask = napi->gro_bitmask; + unsigned int i, base = ~0U; - for (i = 0; i < GRO_HASH_BUCKETS; i++) { - if (test_bit(i, &napi->gro_bitmask)) - __napi_gro_flush_chain(napi, i, flush_old); + while ((i = ffs(bitmask)) != 0) { + bitmask >>= i; + base += i; + __napi_gro_flush_chain(napi, base, flush_old); } } EXPORT_SYMBOL(napi_gro_flush); @@ -5391,7 +5426,9 @@ static struct list_head *gro_list_prepare(struct napi_struct *napi, } diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev; - diffs |= p->vlan_tci ^ skb->vlan_tci; + diffs |= skb_vlan_tag_present(p) ^ skb_vlan_tag_present(skb); + if (skb_vlan_tag_present(p)) + diffs |= p->vlan_tci ^ skb->vlan_tci; diffs |= skb_metadata_dst_cmp(p, skb); diffs |= skb_metadata_differs(p, skb); if (maclen == ETH_HLEN) @@ -5639,12 +5676,17 @@ static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb) gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb) { + gro_result_t ret; + skb_mark_napi_id(skb, napi); trace_napi_gro_receive_entry(skb); skb_gro_reset_offset(skb); - return napi_skb_finish(dev_gro_receive(napi, skb), skb); + ret = napi_skb_finish(dev_gro_receive(napi, skb), skb); + trace_napi_gro_receive_exit(ret); + + return ret; } EXPORT_SYMBOL(napi_gro_receive); @@ -5657,7 +5699,7 @@ static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb) __skb_pull(skb, skb_headlen(skb)); /* restore the reserve we had after netdev_alloc_skb_ip_align() */ skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb)); - skb->vlan_tci = 0; + __vlan_hwaccel_clear_tag(skb); skb->dev = napi->dev; skb->skb_iif = 0; @@ -5762,6 +5804,7 @@ static struct sk_buff *napi_frags_skb(struct napi_struct *napi) gro_result_t napi_gro_frags(struct napi_struct *napi) { + gro_result_t ret; struct sk_buff *skb = napi_frags_skb(napi); if (!skb) @@ -5769,7 +5812,10 @@ gro_result_t napi_gro_frags(struct napi_struct *napi) trace_napi_gro_frags_entry(skb); - return napi_frags_finish(napi, skb, dev_gro_receive(napi, skb)); + ret = napi_frags_finish(napi, skb, dev_gro_receive(napi, skb)); + trace_napi_gro_frags_exit(ret); + + return ret; } EXPORT_SYMBOL(napi_gro_frags); @@ -5785,10 +5831,11 @@ __sum16 __skb_gro_checksum_complete(struct sk_buff *skb) /* NAPI_GRO_CB(skb)->csum holds pseudo checksum */ sum = csum_fold(csum_add(NAPI_GRO_CB(skb)->csum, wsum)); + /* See comments in __skb_checksum_complete(). */ if (likely(!sum)) { if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) && !skb->csum_complete_sw) - netdev_rx_csum_fault(skb->dev); + netdev_rx_csum_fault(skb->dev, skb); } NAPI_GRO_CB(skb)->csum = wsum; @@ -7467,7 +7514,8 @@ unsigned int dev_get_flags(const struct net_device *dev) } EXPORT_SYMBOL(dev_get_flags); -int __dev_change_flags(struct net_device *dev, unsigned int flags) +int __dev_change_flags(struct net_device *dev, unsigned int flags, + struct netlink_ext_ack *extack) { unsigned int old_flags = dev->flags; int ret; @@ -7504,7 +7552,7 @@ int __dev_change_flags(struct net_device *dev, unsigned int flags) if (old_flags & IFF_UP) __dev_close(dev); else - ret = __dev_open(dev); + ret = __dev_open(dev, extack); } if ((flags ^ dev->gflags) & IFF_PROMISC) { @@ -7564,16 +7612,18 @@ void __dev_notify_flags(struct net_device *dev, unsigned int old_flags, * dev_change_flags - change device settings * @dev: device * @flags: device state flags + * @extack: netlink extended ack * * Change settings on device based state flags. The flags are * in the userspace exported format. */ -int dev_change_flags(struct net_device *dev, unsigned int flags) +int dev_change_flags(struct net_device *dev, unsigned int flags, + struct netlink_ext_ack *extack) { int ret; unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags; - ret = __dev_change_flags(dev, flags); + ret = __dev_change_flags(dev, flags, extack); if (ret < 0) return ret; diff --git a/net/core/dev_addr_lists.c b/net/core/dev_addr_lists.c index d884d8f5f0e5..81a8cd4ea3bd 100644 --- a/net/core/dev_addr_lists.c +++ b/net/core/dev_addr_lists.c @@ -278,6 +278,103 @@ int __hw_addr_sync_dev(struct netdev_hw_addr_list *list, EXPORT_SYMBOL(__hw_addr_sync_dev); /** + * __hw_addr_ref_sync_dev - Synchronize device's multicast address list taking + * into account references + * @list: address list to synchronize + * @dev: device to sync + * @sync: function to call if address or reference on it should be added + * @unsync: function to call if address or some reference on it should removed + * + * This function is intended to be called from the ndo_set_rx_mode + * function of devices that require explicit address or references on it + * add/remove notifications. The unsync function may be NULL in which case + * the addresses or references on it requiring removal will simply be + * removed without any notification to the device. That is responsibility of + * the driver to identify and distribute address or references on it between + * internal address tables. + **/ +int __hw_addr_ref_sync_dev(struct netdev_hw_addr_list *list, + struct net_device *dev, + int (*sync)(struct net_device *, + const unsigned char *, int), + int (*unsync)(struct net_device *, + const unsigned char *, int)) +{ + struct netdev_hw_addr *ha, *tmp; + int err, ref_cnt; + + /* first go through and flush out any unsynced/stale entries */ + list_for_each_entry_safe(ha, tmp, &list->list, list) { + /* sync if address is not used */ + if ((ha->sync_cnt << 1) <= ha->refcount) + continue; + + /* if fails defer unsyncing address */ + ref_cnt = ha->refcount - ha->sync_cnt; + if (unsync && unsync(dev, ha->addr, ref_cnt)) + continue; + + ha->refcount = (ref_cnt << 1) + 1; + ha->sync_cnt = ref_cnt; + __hw_addr_del_entry(list, ha, false, false); + } + + /* go through and sync updated/new entries to the list */ + list_for_each_entry_safe(ha, tmp, &list->list, list) { + /* sync if address added or reused */ + if ((ha->sync_cnt << 1) >= ha->refcount) + continue; + + ref_cnt = ha->refcount - ha->sync_cnt; + err = sync(dev, ha->addr, ref_cnt); + if (err) + return err; + + ha->refcount = ref_cnt << 1; + ha->sync_cnt = ref_cnt; + } + + return 0; +} +EXPORT_SYMBOL(__hw_addr_ref_sync_dev); + +/** + * __hw_addr_ref_unsync_dev - Remove synchronized addresses and references on + * it from device + * @list: address list to remove synchronized addresses (references on it) from + * @dev: device to sync + * @unsync: function to call if address and references on it should be removed + * + * Remove all addresses that were added to the device by + * __hw_addr_ref_sync_dev(). This function is intended to be called from the + * ndo_stop or ndo_open functions on devices that require explicit address (or + * references on it) add/remove notifications. If the unsync function pointer + * is NULL then this function can be used to just reset the sync_cnt for the + * addresses in the list. + **/ +void __hw_addr_ref_unsync_dev(struct netdev_hw_addr_list *list, + struct net_device *dev, + int (*unsync)(struct net_device *, + const unsigned char *, int)) +{ + struct netdev_hw_addr *ha, *tmp; + + list_for_each_entry_safe(ha, tmp, &list->list, list) { + if (!ha->sync_cnt) + continue; + + /* if fails defer unsyncing address */ + if (unsync && unsync(dev, ha->addr, ha->sync_cnt)) + continue; + + ha->refcount -= ha->sync_cnt - 1; + ha->sync_cnt = 0; + __hw_addr_del_entry(list, ha, false, false); + } +} +EXPORT_SYMBOL(__hw_addr_ref_unsync_dev); + +/** * __hw_addr_unsync_dev - Remove synchronized addresses from device * @list: address list to remove synchronized addresses from * @dev: device to sync diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c index 90e8aa36881e..da273ec3cc57 100644 --- a/net/core/dev_ioctl.c +++ b/net/core/dev_ioctl.c @@ -234,7 +234,7 @@ static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd) switch (cmd) { case SIOCSIFFLAGS: /* Set interface flags */ - return dev_change_flags(dev, ifr->ifr_flags); + return dev_change_flags(dev, ifr->ifr_flags, NULL); case SIOCSIFMETRIC: /* Set the metric on the interface (currently unused) */ diff --git a/net/core/devlink.c b/net/core/devlink.c index 3a4b29a13d31..abb0da9d7b4b 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -2692,6 +2692,11 @@ static const struct devlink_param devlink_param_generic[] = { .name = DEVLINK_PARAM_GENERIC_MSIX_VEC_PER_PF_MIN_NAME, .type = DEVLINK_PARAM_GENERIC_MSIX_VEC_PER_PF_MIN_TYPE, }, + { + .id = DEVLINK_PARAM_GENERIC_ID_FW_LOAD_POLICY, + .name = DEVLINK_PARAM_GENERIC_FW_LOAD_POLICY_NAME, + .type = DEVLINK_PARAM_GENERIC_FW_LOAD_POLICY_TYPE, + }, }; static int devlink_param_generic_verify(const struct devlink_param *param) diff --git a/net/core/filter.c b/net/core/filter.c index 8d2c629501e2..8659b40172d1 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -296,22 +296,18 @@ static u32 convert_skb_access(int skb_field, int dst_reg, int src_reg, break; case SKF_AD_VLAN_TAG: - case SKF_AD_VLAN_TAG_PRESENT: BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, vlan_tci) != 2); - BUILD_BUG_ON(VLAN_TAG_PRESENT != 0x1000); /* dst_reg = *(u16 *) (src_reg + offsetof(vlan_tci)) */ *insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg, offsetof(struct sk_buff, vlan_tci)); - if (skb_field == SKF_AD_VLAN_TAG) { - *insn++ = BPF_ALU32_IMM(BPF_AND, dst_reg, - ~VLAN_TAG_PRESENT); - } else { - /* dst_reg >>= 12 */ - *insn++ = BPF_ALU32_IMM(BPF_RSH, dst_reg, 12); - /* dst_reg &= 1 */ + break; + case SKF_AD_VLAN_TAG_PRESENT: + *insn++ = BPF_LDX_MEM(BPF_B, dst_reg, src_reg, PKT_VLAN_PRESENT_OFFSET()); + if (PKT_VLAN_PRESENT_BIT) + *insn++ = BPF_ALU32_IMM(BPF_RSH, dst_reg, PKT_VLAN_PRESENT_BIT); + if (PKT_VLAN_PRESENT_BIT < 7) *insn++ = BPF_ALU32_IMM(BPF_AND, dst_reg, 1); - } break; } @@ -467,7 +463,8 @@ static bool convert_bpf_ld_abs(struct sock_filter *fp, struct bpf_insn **insnp) bool ldx_off_ok = offset <= S16_MAX; *insn++ = BPF_MOV64_REG(BPF_REG_TMP, BPF_REG_H); - *insn++ = BPF_ALU64_IMM(BPF_SUB, BPF_REG_TMP, offset); + if (offset) + *insn++ = BPF_ALU64_IMM(BPF_SUB, BPF_REG_TMP, offset); *insn++ = BPF_JMP_IMM(BPF_JSLT, BPF_REG_TMP, size, 2 + endian + (!ldx_off_ok * 2)); if (ldx_off_ok) { @@ -2428,6 +2425,174 @@ static const struct bpf_func_proto bpf_msg_push_data_proto = { .arg4_type = ARG_ANYTHING, }; +static void sk_msg_shift_left(struct sk_msg *msg, int i) +{ + int prev; + + do { + prev = i; + sk_msg_iter_var_next(i); + msg->sg.data[prev] = msg->sg.data[i]; + } while (i != msg->sg.end); + + sk_msg_iter_prev(msg, end); +} + +static void sk_msg_shift_right(struct sk_msg *msg, int i) +{ + struct scatterlist tmp, sge; + + sk_msg_iter_next(msg, end); + sge = sk_msg_elem_cpy(msg, i); + sk_msg_iter_var_next(i); + tmp = sk_msg_elem_cpy(msg, i); + + while (i != msg->sg.end) { + msg->sg.data[i] = sge; + sk_msg_iter_var_next(i); + sge = tmp; + tmp = sk_msg_elem_cpy(msg, i); + } +} + +BPF_CALL_4(bpf_msg_pop_data, struct sk_msg *, msg, u32, start, + u32, len, u64, flags) +{ + u32 i = 0, l, space, offset = 0; + u64 last = start + len; + int pop; + + if (unlikely(flags)) + return -EINVAL; + + /* First find the starting scatterlist element */ + i = msg->sg.start; + do { + l = sk_msg_elem(msg, i)->length; + + if (start < offset + l) + break; + offset += l; + sk_msg_iter_var_next(i); + } while (i != msg->sg.end); + + /* Bounds checks: start and pop must be inside message */ + if (start >= offset + l || last >= msg->sg.size) + return -EINVAL; + + space = MAX_MSG_FRAGS - sk_msg_elem_used(msg); + + pop = len; + /* --------------| offset + * -| start |-------- len -------| + * + * |----- a ----|-------- pop -------|----- b ----| + * |______________________________________________| length + * + * + * a: region at front of scatter element to save + * b: region at back of scatter element to save when length > A + pop + * pop: region to pop from element, same as input 'pop' here will be + * decremented below per iteration. + * + * Two top-level cases to handle when start != offset, first B is non + * zero and second B is zero corresponding to when a pop includes more + * than one element. + * + * Then if B is non-zero AND there is no space allocate space and + * compact A, B regions into page. If there is space shift ring to + * the rigth free'ing the next element in ring to place B, leaving + * A untouched except to reduce length. + */ + if (start != offset) { + struct scatterlist *nsge, *sge = sk_msg_elem(msg, i); + int a = start; + int b = sge->length - pop - a; + + sk_msg_iter_var_next(i); + + if (pop < sge->length - a) { + if (space) { + sge->length = a; + sk_msg_shift_right(msg, i); + nsge = sk_msg_elem(msg, i); + get_page(sg_page(sge)); + sg_set_page(nsge, + sg_page(sge), + b, sge->offset + pop + a); + } else { + struct page *page, *orig; + u8 *to, *from; + + page = alloc_pages(__GFP_NOWARN | + __GFP_COMP | GFP_ATOMIC, + get_order(a + b)); + if (unlikely(!page)) + return -ENOMEM; + + sge->length = a; + orig = sg_page(sge); + from = sg_virt(sge); + to = page_address(page); + memcpy(to, from, a); + memcpy(to + a, from + a + pop, b); + sg_set_page(sge, page, a + b, 0); + put_page(orig); + } + pop = 0; + } else if (pop >= sge->length - a) { + sge->length = a; + pop -= (sge->length - a); + } + } + + /* From above the current layout _must_ be as follows, + * + * -| offset + * -| start + * + * |---- pop ---|---------------- b ------------| + * |____________________________________________| length + * + * Offset and start of the current msg elem are equal because in the + * previous case we handled offset != start and either consumed the + * entire element and advanced to the next element OR pop == 0. + * + * Two cases to handle here are first pop is less than the length + * leaving some remainder b above. Simply adjust the element's layout + * in this case. Or pop >= length of the element so that b = 0. In this + * case advance to next element decrementing pop. + */ + while (pop) { + struct scatterlist *sge = sk_msg_elem(msg, i); + + if (pop < sge->length) { + sge->length -= pop; + sge->offset += pop; + pop = 0; + } else { + pop -= sge->length; + sk_msg_shift_left(msg, i); + } + sk_msg_iter_var_next(i); + } + + sk_mem_uncharge(msg->sk, len - pop); + msg->sg.size -= (len - pop); + sk_msg_compute_data_pointers(msg); + return 0; +} + +static const struct bpf_func_proto bpf_msg_pop_data_proto = { + .func = bpf_msg_pop_data, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_ANYTHING, +}; + BPF_CALL_1(bpf_get_cgroup_classid, const struct sk_buff *, skb) { return task_get_classid(skb); @@ -3908,6 +4073,26 @@ static const struct bpf_func_proto bpf_get_socket_uid_proto = { .arg1_type = ARG_PTR_TO_CTX, }; +BPF_CALL_5(bpf_sockopt_event_output, struct bpf_sock_ops_kern *, bpf_sock, + struct bpf_map *, map, u64, flags, void *, data, u64, size) +{ + if (unlikely(flags & ~(BPF_F_INDEX_MASK))) + return -EINVAL; + + return bpf_event_output(map, flags, data, size, NULL, 0, NULL); +} + +static const struct bpf_func_proto bpf_sockopt_event_output_proto = { + .func = bpf_sockopt_event_output, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_PTR_TO_MEM, + .arg5_type = ARG_CONST_SIZE_OR_ZERO, +}; + BPF_CALL_5(bpf_setsockopt, struct bpf_sock_ops_kern *, bpf_sock, int, level, int, optname, char *, optval, int, optlen) { @@ -4825,37 +5010,31 @@ static const struct bpf_func_proto bpf_lwt_seg6_adjust_srh_proto = { #ifdef CONFIG_INET static struct sock *sk_lookup(struct net *net, struct bpf_sock_tuple *tuple, - struct sk_buff *skb, u8 family, u8 proto) + int dif, int sdif, u8 family, u8 proto) { bool refcounted = false; struct sock *sk = NULL; - int dif = 0; - - if (skb->dev) - dif = skb->dev->ifindex; if (family == AF_INET) { __be32 src4 = tuple->ipv4.saddr; __be32 dst4 = tuple->ipv4.daddr; - int sdif = inet_sdif(skb); if (proto == IPPROTO_TCP) - sk = __inet_lookup(net, &tcp_hashinfo, skb, 0, + sk = __inet_lookup(net, &tcp_hashinfo, NULL, 0, src4, tuple->ipv4.sport, dst4, tuple->ipv4.dport, dif, sdif, &refcounted); else sk = __udp4_lib_lookup(net, src4, tuple->ipv4.sport, dst4, tuple->ipv4.dport, - dif, sdif, &udp_table, skb); + dif, sdif, &udp_table, NULL); #if IS_ENABLED(CONFIG_IPV6) } else { struct in6_addr *src6 = (struct in6_addr *)&tuple->ipv6.saddr; struct in6_addr *dst6 = (struct in6_addr *)&tuple->ipv6.daddr; - int sdif = inet6_sdif(skb); if (proto == IPPROTO_TCP) - sk = __inet6_lookup(net, &tcp_hashinfo, skb, 0, + sk = __inet6_lookup(net, &tcp_hashinfo, NULL, 0, src6, tuple->ipv6.sport, dst6, ntohs(tuple->ipv6.dport), dif, sdif, &refcounted); @@ -4864,7 +5043,7 @@ static struct sock *sk_lookup(struct net *net, struct bpf_sock_tuple *tuple, src6, tuple->ipv6.sport, dst6, tuple->ipv6.dport, dif, sdif, - &udp_table, skb); + &udp_table, NULL); #endif } @@ -4881,31 +5060,33 @@ static struct sock *sk_lookup(struct net *net, struct bpf_sock_tuple *tuple, * callers to satisfy BPF_CALL declarations. */ static unsigned long -bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len, - u8 proto, u64 netns_id, u64 flags) +__bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len, + struct net *caller_net, u32 ifindex, u8 proto, u64 netns_id, + u64 flags) { - struct net *caller_net; struct sock *sk = NULL; u8 family = AF_UNSPEC; struct net *net; + int sdif; family = len == sizeof(tuple->ipv4) ? AF_INET : AF_INET6; if (unlikely(family == AF_UNSPEC || flags || !((s32)netns_id < 0 || netns_id <= S32_MAX))) goto out; - if (skb->dev) - caller_net = dev_net(skb->dev); + if (family == AF_INET) + sdif = inet_sdif(skb); else - caller_net = sock_net(skb->sk); + sdif = inet6_sdif(skb); + if ((s32)netns_id < 0) { net = caller_net; - sk = sk_lookup(net, tuple, skb, family, proto); + sk = sk_lookup(net, tuple, ifindex, sdif, family, proto); } else { net = get_net_ns_by_id(caller_net, netns_id); if (unlikely(!net)) goto out; - sk = sk_lookup(net, tuple, skb, family, proto); + sk = sk_lookup(net, tuple, ifindex, sdif, family, proto); put_net(net); } @@ -4915,6 +5096,25 @@ out: return (unsigned long) sk; } +static unsigned long +bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len, + u8 proto, u64 netns_id, u64 flags) +{ + struct net *caller_net; + int ifindex; + + if (skb->dev) { + caller_net = dev_net(skb->dev); + ifindex = skb->dev->ifindex; + } else { + caller_net = sock_net(skb->sk); + ifindex = 0; + } + + return __bpf_sk_lookup(skb, tuple, len, caller_net, ifindex, + proto, netns_id, flags); +} + BPF_CALL_5(bpf_sk_lookup_tcp, struct sk_buff *, skb, struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags) { @@ -4964,6 +5164,87 @@ static const struct bpf_func_proto bpf_sk_release_proto = { .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_SOCKET, }; + +BPF_CALL_5(bpf_xdp_sk_lookup_udp, struct xdp_buff *, ctx, + struct bpf_sock_tuple *, tuple, u32, len, u32, netns_id, u64, flags) +{ + struct net *caller_net = dev_net(ctx->rxq->dev); + int ifindex = ctx->rxq->dev->ifindex; + + return __bpf_sk_lookup(NULL, tuple, len, caller_net, ifindex, + IPPROTO_UDP, netns_id, flags); +} + +static const struct bpf_func_proto bpf_xdp_sk_lookup_udp_proto = { + .func = bpf_xdp_sk_lookup_udp, + .gpl_only = false, + .pkt_access = true, + .ret_type = RET_PTR_TO_SOCKET_OR_NULL, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_CONST_SIZE, + .arg4_type = ARG_ANYTHING, + .arg5_type = ARG_ANYTHING, +}; + +BPF_CALL_5(bpf_xdp_sk_lookup_tcp, struct xdp_buff *, ctx, + struct bpf_sock_tuple *, tuple, u32, len, u32, netns_id, u64, flags) +{ + struct net *caller_net = dev_net(ctx->rxq->dev); + int ifindex = ctx->rxq->dev->ifindex; + + return __bpf_sk_lookup(NULL, tuple, len, caller_net, ifindex, + IPPROTO_TCP, netns_id, flags); +} + +static const struct bpf_func_proto bpf_xdp_sk_lookup_tcp_proto = { + .func = bpf_xdp_sk_lookup_tcp, + .gpl_only = false, + .pkt_access = true, + .ret_type = RET_PTR_TO_SOCKET_OR_NULL, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_CONST_SIZE, + .arg4_type = ARG_ANYTHING, + .arg5_type = ARG_ANYTHING, +}; + +BPF_CALL_5(bpf_sock_addr_sk_lookup_tcp, struct bpf_sock_addr_kern *, ctx, + struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags) +{ + return __bpf_sk_lookup(NULL, tuple, len, sock_net(ctx->sk), 0, + IPPROTO_TCP, netns_id, flags); +} + +static const struct bpf_func_proto bpf_sock_addr_sk_lookup_tcp_proto = { + .func = bpf_sock_addr_sk_lookup_tcp, + .gpl_only = false, + .ret_type = RET_PTR_TO_SOCKET_OR_NULL, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_CONST_SIZE, + .arg4_type = ARG_ANYTHING, + .arg5_type = ARG_ANYTHING, +}; + +BPF_CALL_5(bpf_sock_addr_sk_lookup_udp, struct bpf_sock_addr_kern *, ctx, + struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags) +{ + return __bpf_sk_lookup(NULL, tuple, len, sock_net(ctx->sk), 0, + IPPROTO_UDP, netns_id, flags); +} + +static const struct bpf_func_proto bpf_sock_addr_sk_lookup_udp_proto = { + .func = bpf_sock_addr_sk_lookup_udp, + .gpl_only = false, + .ret_type = RET_PTR_TO_SOCKET_OR_NULL, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_CONST_SIZE, + .arg4_type = ARG_ANYTHING, + .arg5_type = ARG_ANYTHING, +}; + #endif /* CONFIG_INET */ bool bpf_helper_changes_pkt_data(void *func) @@ -4986,6 +5267,7 @@ bool bpf_helper_changes_pkt_data(void *func) func == bpf_xdp_adjust_meta || func == bpf_msg_pull_data || func == bpf_msg_push_data || + func == bpf_msg_pop_data || func == bpf_xdp_adjust_tail || #if IS_ENABLED(CONFIG_IPV6_SEG6_BPF) func == bpf_lwt_seg6_store_bytes || @@ -5070,6 +5352,14 @@ sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_get_socket_cookie_sock_addr_proto; case BPF_FUNC_get_local_storage: return &bpf_get_local_storage_proto; +#ifdef CONFIG_INET + case BPF_FUNC_sk_lookup_tcp: + return &bpf_sock_addr_sk_lookup_tcp_proto; + case BPF_FUNC_sk_lookup_udp: + return &bpf_sock_addr_sk_lookup_udp_proto; + case BPF_FUNC_sk_release: + return &bpf_sk_release_proto; +#endif /* CONFIG_INET */ default: return bpf_base_func_proto(func_id); } @@ -5214,6 +5504,14 @@ xdp_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_xdp_adjust_tail_proto; case BPF_FUNC_fib_lookup: return &bpf_xdp_fib_lookup_proto; +#ifdef CONFIG_INET + case BPF_FUNC_sk_lookup_udp: + return &bpf_xdp_sk_lookup_udp_proto; + case BPF_FUNC_sk_lookup_tcp: + return &bpf_xdp_sk_lookup_tcp_proto; + case BPF_FUNC_sk_release: + return &bpf_sk_release_proto; +#endif default: return bpf_base_func_proto(func_id); } @@ -5240,6 +5538,8 @@ sock_ops_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_get_socket_cookie_sock_ops_proto; case BPF_FUNC_get_local_storage: return &bpf_get_local_storage_proto; + case BPF_FUNC_perf_event_output: + return &bpf_sockopt_event_output_proto; default: return bpf_base_func_proto(func_id); } @@ -5264,6 +5564,8 @@ sk_msg_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_msg_pull_data_proto; case BPF_FUNC_msg_push_data: return &bpf_msg_push_data_proto; + case BPF_FUNC_msg_pop_data: + return &bpf_msg_pop_data_proto; default: return bpf_base_func_proto(func_id); } @@ -5440,6 +5742,10 @@ static bool bpf_skb_is_valid_access(int off, int size, enum bpf_access_type type if (size != sizeof(__u64)) return false; break; + case bpf_ctx_range(struct __sk_buff, tstamp): + if (size != sizeof(__u64)) + return false; + break; default: /* Only narrow read access allowed for now. */ if (type == BPF_WRITE) { @@ -5467,6 +5773,7 @@ static bool sk_filter_is_valid_access(int off, int size, case bpf_ctx_range(struct __sk_buff, data_end): case bpf_ctx_range_ptr(struct __sk_buff, flow_keys): case bpf_ctx_range_till(struct __sk_buff, family, local_port): + case bpf_ctx_range(struct __sk_buff, tstamp): return false; } @@ -5505,6 +5812,10 @@ static bool cg_skb_is_valid_access(int off, int size, case bpf_ctx_range(struct __sk_buff, priority): case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]): break; + case bpf_ctx_range(struct __sk_buff, tstamp): + if (!capable(CAP_SYS_ADMIN)) + return false; + break; default: return false; } @@ -5532,6 +5843,7 @@ static bool lwt_is_valid_access(int off, int size, case bpf_ctx_range_till(struct __sk_buff, family, local_port): case bpf_ctx_range(struct __sk_buff, data_meta): case bpf_ctx_range_ptr(struct __sk_buff, flow_keys): + case bpf_ctx_range(struct __sk_buff, tstamp): return false; } @@ -5741,6 +6053,7 @@ static bool tc_cls_act_is_valid_access(int off, int size, case bpf_ctx_range(struct __sk_buff, priority): case bpf_ctx_range(struct __sk_buff, tc_classid): case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]): + case bpf_ctx_range(struct __sk_buff, tstamp): break; default: return false; @@ -5960,6 +6273,7 @@ static bool sk_skb_is_valid_access(int off, int size, case bpf_ctx_range(struct __sk_buff, tc_classid): case bpf_ctx_range(struct __sk_buff, data_meta): case bpf_ctx_range_ptr(struct __sk_buff, flow_keys): + case bpf_ctx_range(struct __sk_buff, tstamp): return false; } @@ -6046,6 +6360,7 @@ static bool flow_dissector_is_valid_access(int off, int size, case bpf_ctx_range(struct __sk_buff, tc_classid): case bpf_ctx_range(struct __sk_buff, data_meta): case bpf_ctx_range_till(struct __sk_buff, family, local_port): + case bpf_ctx_range(struct __sk_buff, tstamp): return false; } @@ -6140,19 +6455,19 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type, break; case offsetof(struct __sk_buff, vlan_present): - case offsetof(struct __sk_buff, vlan_tci): - BUILD_BUG_ON(VLAN_TAG_PRESENT != 0x1000); + *target_size = 1; + *insn++ = BPF_LDX_MEM(BPF_B, si->dst_reg, si->src_reg, + PKT_VLAN_PRESENT_OFFSET()); + if (PKT_VLAN_PRESENT_BIT) + *insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, PKT_VLAN_PRESENT_BIT); + if (PKT_VLAN_PRESENT_BIT < 7) + *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, 1); + break; + case offsetof(struct __sk_buff, vlan_tci): *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, bpf_target_off(struct sk_buff, vlan_tci, 2, target_size)); - if (si->off == offsetof(struct __sk_buff, vlan_tci)) { - *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, - ~VLAN_TAG_PRESENT); - } else { - *insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, 12); - *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, 1); - } break; case offsetof(struct __sk_buff, cb[0]) ... @@ -6355,6 +6670,22 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type, *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg, si->src_reg, off); break; + + case offsetof(struct __sk_buff, tstamp): + BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, tstamp) != 8); + + if (type == BPF_WRITE) + *insn++ = BPF_STX_MEM(BPF_DW, + si->dst_reg, si->src_reg, + bpf_target_off(struct sk_buff, + tstamp, 8, + target_size)); + else + *insn++ = BPF_LDX_MEM(BPF_DW, + si->dst_reg, si->src_reg, + bpf_target_off(struct sk_buff, + tstamp, 8, + target_size)); } return insn - insn_buf; diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 588f475019d4..2e8d91e54179 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -952,8 +952,7 @@ proto_again: if (!vlan) { key_vlan->vlan_id = skb_vlan_tag_get_id(skb); - key_vlan->vlan_priority = - (skb_vlan_tag_get_prio(skb) >> VLAN_PRIO_SHIFT); + key_vlan->vlan_priority = skb_vlan_tag_get_prio(skb); } else { key_vlan->vlan_id = ntohs(vlan->h_vlan_TCI) & VLAN_VID_MASK; diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 41954e42a2de..c3b58712e98b 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -118,6 +118,34 @@ unsigned long neigh_rand_reach_time(unsigned long base) } EXPORT_SYMBOL(neigh_rand_reach_time); +static void neigh_mark_dead(struct neighbour *n) +{ + n->dead = 1; + if (!list_empty(&n->gc_list)) { + list_del_init(&n->gc_list); + atomic_dec(&n->tbl->gc_entries); + } +} + +static void neigh_change_state(struct neighbour *n, u8 new) +{ + bool on_gc_list = !list_empty(&n->gc_list); + bool new_is_perm = new & NUD_PERMANENT; + + n->nud_state = new; + + /* remove from the gc list if new state is permanent; + * add to the gc list if new state is not permanent + */ + if (new_is_perm && on_gc_list) { + list_del_init(&n->gc_list); + atomic_dec(&n->tbl->gc_entries); + } else if (!new_is_perm && !on_gc_list) { + /* add entries to the tail; cleaning removes from the front */ + list_add_tail(&n->gc_list, &n->tbl->gc_list); + atomic_inc(&n->tbl->gc_entries); + } +} static bool neigh_del(struct neighbour *n, __u8 state, __u8 flags, struct neighbour __rcu **np, struct neigh_table *tbl) @@ -132,7 +160,7 @@ static bool neigh_del(struct neighbour *n, __u8 state, __u8 flags, neigh = rcu_dereference_protected(n->next, lockdep_is_held(&tbl->lock)); rcu_assign_pointer(*np, neigh); - n->dead = 1; + neigh_mark_dead(n); retval = true; } write_unlock(&n->lock); @@ -166,32 +194,31 @@ bool neigh_remove_one(struct neighbour *ndel, struct neigh_table *tbl) static int neigh_forced_gc(struct neigh_table *tbl) { + int max_clean = atomic_read(&tbl->gc_entries) - tbl->gc_thresh2; + unsigned long tref = jiffies - 5 * HZ; + u8 flags = NTF_EXT_LEARNED; + struct neighbour *n, *tmp; + u8 state = NUD_PERMANENT; int shrunk = 0; - int i; - struct neigh_hash_table *nht; NEIGH_CACHE_STAT_INC(tbl, forced_gc_runs); write_lock_bh(&tbl->lock); - nht = rcu_dereference_protected(tbl->nht, - lockdep_is_held(&tbl->lock)); - for (i = 0; i < (1 << nht->hash_shift); i++) { - struct neighbour *n; - struct neighbour __rcu **np; - np = &nht->hash_buckets[i]; - while ((n = rcu_dereference_protected(*np, - lockdep_is_held(&tbl->lock))) != NULL) { - /* Neighbour record may be discarded if: - * - nobody refers to it. - * - it is not permanent - */ - if (neigh_del(n, NUD_PERMANENT, NTF_EXT_LEARNED, np, - tbl)) { - shrunk = 1; - continue; - } - np = &n->next; + list_for_each_entry_safe(n, tmp, &tbl->gc_list, gc_list) { + if (refcount_read(&n->refcnt) == 1) { + bool remove = false; + + write_lock(&n->lock); + if (!(n->nud_state & state) && !(n->flags & flags) && + time_after(tref, n->updated)) + remove = true; + write_unlock(&n->lock); + + if (remove && neigh_remove_one(n, tbl)) + shrunk++; + if (shrunk >= max_clean) + break; } } @@ -260,8 +287,7 @@ static void neigh_flush_dev(struct neigh_table *tbl, struct net_device *dev, lockdep_is_held(&tbl->lock))); write_lock(&n->lock); neigh_del_timer(n); - n->dead = 1; - + neigh_mark_dead(n); if (refcount_read(&n->refcnt) != 1) { /* The most unpleasant situation. We must destroy neighbour entry, @@ -321,13 +347,18 @@ int neigh_ifdown(struct neigh_table *tbl, struct net_device *dev) } EXPORT_SYMBOL(neigh_ifdown); -static struct neighbour *neigh_alloc(struct neigh_table *tbl, struct net_device *dev) +static struct neighbour *neigh_alloc(struct neigh_table *tbl, + struct net_device *dev, + bool permanent) { struct neighbour *n = NULL; unsigned long now = jiffies; int entries; - entries = atomic_inc_return(&tbl->entries) - 1; + if (permanent) + goto do_alloc; + + entries = atomic_inc_return(&tbl->gc_entries) - 1; if (entries >= tbl->gc_thresh3 || (entries >= tbl->gc_thresh2 && time_after(now, tbl->last_flush + 5 * HZ))) { @@ -340,6 +371,7 @@ static struct neighbour *neigh_alloc(struct neigh_table *tbl, struct net_device } } +do_alloc: n = kzalloc(tbl->entry_size + dev->neigh_priv_len, GFP_ATOMIC); if (!n) goto out_entries; @@ -358,11 +390,19 @@ static struct neighbour *neigh_alloc(struct neigh_table *tbl, struct net_device n->tbl = tbl; refcount_set(&n->refcnt, 1); n->dead = 1; + + if (!permanent) + list_add_tail(&n->gc_list, &n->tbl->gc_list); + else + INIT_LIST_HEAD(&n->gc_list); + + atomic_inc(&tbl->entries); out: return n; out_entries: - atomic_dec(&tbl->entries); + if (!permanent) + atomic_dec(&tbl->gc_entries); goto out; } @@ -505,13 +545,15 @@ struct neighbour *neigh_lookup_nodev(struct neigh_table *tbl, struct net *net, } EXPORT_SYMBOL(neigh_lookup_nodev); -struct neighbour *__neigh_create(struct neigh_table *tbl, const void *pkey, - struct net_device *dev, bool want_ref) +static struct neighbour *___neigh_create(struct neigh_table *tbl, + const void *pkey, + struct net_device *dev, + bool permanent, bool want_ref) { + struct neighbour *n1, *rc, *n = neigh_alloc(tbl, dev, permanent); u32 hash_val; unsigned int key_len = tbl->key_len; int error; - struct neighbour *n1, *rc, *n = neigh_alloc(tbl, dev); struct neigh_hash_table *nht; if (!n) { @@ -591,6 +633,12 @@ out_neigh_release: neigh_release(n); goto out; } + +struct neighbour *__neigh_create(struct neigh_table *tbl, const void *pkey, + struct net_device *dev, bool want_ref) +{ + return ___neigh_create(tbl, pkey, dev, false, want_ref); +} EXPORT_SYMBOL(__neigh_create); static u32 pneigh_hash(const void *pkey, unsigned int key_len) @@ -854,7 +902,7 @@ static void neigh_periodic_work(struct work_struct *work) (state == NUD_FAILED || time_after(jiffies, n->used + NEIGH_VAR(n->parms, GC_STALETIME)))) { *np = n->next; - n->dead = 1; + neigh_mark_dead(n); write_unlock(&n->lock); neigh_cleanup_and_release(n); continue; @@ -1137,8 +1185,9 @@ static void neigh_update_hhs(struct neighbour *neigh) Caller MUST hold reference count on the entry. */ -int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new, - u32 flags, u32 nlmsg_pid) +static int __neigh_update(struct neighbour *neigh, const u8 *lladdr, + u8 new, u32 flags, u32 nlmsg_pid, + struct netlink_ext_ack *extack) { u8 old; int err; @@ -1155,8 +1204,10 @@ int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new, if (!(flags & NEIGH_UPDATE_F_ADMIN) && (old & (NUD_NOARP | NUD_PERMANENT))) goto out; - if (neigh->dead) + if (neigh->dead) { + NL_SET_ERR_MSG(extack, "Neighbor entry is now dead"); goto out; + } neigh_update_ext_learned(neigh, flags, ¬ify); @@ -1164,7 +1215,7 @@ int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new, neigh_del_timer(neigh); if (old & NUD_CONNECTED) neigh_suspect(neigh); - neigh->nud_state = new; + neigh_change_state(neigh, new); err = 0; notify = old & NUD_VALID; if ((old & (NUD_INCOMPLETE | NUD_PROBE)) && @@ -1193,8 +1244,10 @@ int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new, use it, otherwise discard the request. */ err = -EINVAL; - if (!(old & NUD_VALID)) + if (!(old & NUD_VALID)) { + NL_SET_ERR_MSG(extack, "No link layer address given"); goto out; + } lladdr = neigh->ha; } @@ -1241,7 +1294,7 @@ int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new, ((new & NUD_REACHABLE) ? neigh->parms->reachable_time : 0))); - neigh->nud_state = new; + neigh_change_state(neigh, new); notify = 1; } @@ -1307,6 +1360,12 @@ out: return err; } + +int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new, + u32 flags, u32 nlmsg_pid) +{ + return __neigh_update(neigh, lladdr, new, flags, nlmsg_pid, NULL); +} EXPORT_SYMBOL(neigh_update); /* Update the neigh to listen temporarily for probe responses, even if it is @@ -1571,6 +1630,7 @@ void neigh_table_init(int index, struct neigh_table *tbl) unsigned long phsize; INIT_LIST_HEAD(&tbl->parms_list); + INIT_LIST_HEAD(&tbl->gc_list); list_add(&tbl->parms.list, &tbl->parms_list); write_pnet(&tbl->parms.net, &init_net); refcount_set(&tbl->parms.refcnt, 1); @@ -1678,8 +1738,10 @@ static int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh, goto out; dst_attr = nlmsg_find_attr(nlh, sizeof(*ndm), NDA_DST); - if (dst_attr == NULL) + if (!dst_attr) { + NL_SET_ERR_MSG(extack, "Network address not specified"); goto out; + } ndm = nlmsg_data(nlh); if (ndm->ndm_ifindex) { @@ -1694,8 +1756,10 @@ static int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh, if (tbl == NULL) return -EAFNOSUPPORT; - if (nla_len(dst_attr) < (int)tbl->key_len) + if (nla_len(dst_attr) < (int)tbl->key_len) { + NL_SET_ERR_MSG(extack, "Invalid network address"); goto out; + } if (ndm->ndm_flags & NTF_PROXY) { err = pneigh_delete(tbl, net, nla_data(dst_attr), dev); @@ -1711,10 +1775,9 @@ static int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh, goto out; } - err = neigh_update(neigh, NULL, NUD_FAILED, - NEIGH_UPDATE_F_OVERRIDE | - NEIGH_UPDATE_F_ADMIN, - NETLINK_CB(skb).portid); + err = __neigh_update(neigh, NULL, NUD_FAILED, + NEIGH_UPDATE_F_OVERRIDE | NEIGH_UPDATE_F_ADMIN, + NETLINK_CB(skb).portid, extack); write_lock_bh(&tbl->lock); neigh_release(neigh); neigh_remove_one(neigh, tbl); @@ -1744,8 +1807,10 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, goto out; err = -EINVAL; - if (tb[NDA_DST] == NULL) + if (!tb[NDA_DST]) { + NL_SET_ERR_MSG(extack, "Network address not specified"); goto out; + } ndm = nlmsg_data(nlh); if (ndm->ndm_ifindex) { @@ -1755,16 +1820,21 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, goto out; } - if (tb[NDA_LLADDR] && nla_len(tb[NDA_LLADDR]) < dev->addr_len) + if (tb[NDA_LLADDR] && nla_len(tb[NDA_LLADDR]) < dev->addr_len) { + NL_SET_ERR_MSG(extack, "Invalid link address"); goto out; + } } tbl = neigh_find_table(ndm->ndm_family); if (tbl == NULL) return -EAFNOSUPPORT; - if (nla_len(tb[NDA_DST]) < (int)tbl->key_len) + if (nla_len(tb[NDA_DST]) < (int)tbl->key_len) { + NL_SET_ERR_MSG(extack, "Invalid network address"); goto out; + } + dst = nla_data(tb[NDA_DST]); lladdr = tb[NDA_LLADDR] ? nla_data(tb[NDA_LLADDR]) : NULL; @@ -1780,8 +1850,10 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, goto out; } - if (dev == NULL) + if (!dev) { + NL_SET_ERR_MSG(extack, "Device not specified"); goto out; + } neigh = neigh_lookup(tbl, dst, dev); if (neigh == NULL) { @@ -1790,7 +1862,9 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, goto out; } - neigh = __neigh_lookup_errno(tbl, dst, dev); + neigh = ___neigh_create(tbl, dst, dev, + ndm->ndm_state & NUD_PERMANENT, + true); if (IS_ERR(neigh)) { err = PTR_ERR(neigh); goto out; @@ -1817,8 +1891,8 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, neigh_event_send(neigh, NULL); err = 0; } else - err = neigh_update(neigh, lladdr, ndm->ndm_state, flags, - NETLINK_CB(skb).portid); + err = __neigh_update(neigh, lladdr, ndm->ndm_state, flags, + NETLINK_CB(skb).portid, extack); neigh_release(neigh); out: @@ -2631,7 +2705,7 @@ void __neigh_for_each_release(struct neigh_table *tbl, rcu_assign_pointer(*np, rcu_dereference_protected(n->next, lockdep_is_held(&tbl->lock))); - n->dead = 1; + neigh_mark_dead(n); } else np = &n->next; write_unlock(&n->lock); diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index bd67c4d0fcfd..ff9fd2bb4ce4 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -337,7 +337,7 @@ NETDEVICE_SHOW_RW(mtu, fmt_dec); static int change_flags(struct net_device *dev, unsigned long new_flags) { - return dev_change_flags(dev, (unsigned int)new_flags); + return dev_change_flags(dev, (unsigned int)new_flags, NULL); } static ssize_t flags_store(struct device *dev, struct device_attribute *attr, diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index fefe72774aeb..05b23b285058 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -669,6 +669,7 @@ static const struct nla_policy rtnl_net_policy[NETNSA_MAX + 1] = { [NETNSA_NSID] = { .type = NLA_S32 }, [NETNSA_PID] = { .type = NLA_U32 }, [NETNSA_FD] = { .type = NLA_U32 }, + [NETNSA_TARGET_NSID] = { .type = NLA_S32 }, }; static int rtnl_net_newid(struct sk_buff *skb, struct nlmsghdr *nlh, @@ -735,23 +736,38 @@ static int rtnl_net_get_size(void) { return NLMSG_ALIGN(sizeof(struct rtgenmsg)) + nla_total_size(sizeof(s32)) /* NETNSA_NSID */ + + nla_total_size(sizeof(s32)) /* NETNSA_CURRENT_NSID */ ; } -static int rtnl_net_fill(struct sk_buff *skb, u32 portid, u32 seq, int flags, - int cmd, struct net *net, int nsid) +struct net_fill_args { + u32 portid; + u32 seq; + int flags; + int cmd; + int nsid; + bool add_ref; + int ref_nsid; +}; + +static int rtnl_net_fill(struct sk_buff *skb, struct net_fill_args *args) { struct nlmsghdr *nlh; struct rtgenmsg *rth; - nlh = nlmsg_put(skb, portid, seq, cmd, sizeof(*rth), flags); + nlh = nlmsg_put(skb, args->portid, args->seq, args->cmd, sizeof(*rth), + args->flags); if (!nlh) return -EMSGSIZE; rth = nlmsg_data(nlh); rth->rtgen_family = AF_UNSPEC; - if (nla_put_s32(skb, NETNSA_NSID, nsid)) + if (nla_put_s32(skb, NETNSA_NSID, args->nsid)) + goto nla_put_failure; + + if (args->add_ref && + nla_put_s32(skb, NETNSA_CURRENT_NSID, args->ref_nsid)) goto nla_put_failure; nlmsg_end(skb, nlh); @@ -767,10 +783,15 @@ static int rtnl_net_getid(struct sk_buff *skb, struct nlmsghdr *nlh, { struct net *net = sock_net(skb->sk); struct nlattr *tb[NETNSA_MAX + 1]; + struct net_fill_args fillargs = { + .portid = NETLINK_CB(skb).portid, + .seq = nlh->nlmsg_seq, + .cmd = RTM_NEWNSID, + }; + struct net *peer, *target = net; struct nlattr *nla; struct sk_buff *msg; - struct net *peer; - int err, id; + int err; err = nlmsg_parse(nlh, sizeof(struct rtgenmsg), tb, NETNSA_MAX, rtnl_net_policy, extack); @@ -782,6 +803,11 @@ static int rtnl_net_getid(struct sk_buff *skb, struct nlmsghdr *nlh, } else if (tb[NETNSA_FD]) { peer = get_net_ns_by_fd(nla_get_u32(tb[NETNSA_FD])); nla = tb[NETNSA_FD]; + } else if (tb[NETNSA_NSID]) { + peer = get_net_ns_by_id(net, nla_get_u32(tb[NETNSA_NSID])); + if (!peer) + peer = ERR_PTR(-ENOENT); + nla = tb[NETNSA_NSID]; } else { NL_SET_ERR_MSG(extack, "Peer netns reference is missing"); return -EINVAL; @@ -793,15 +819,29 @@ static int rtnl_net_getid(struct sk_buff *skb, struct nlmsghdr *nlh, return PTR_ERR(peer); } + if (tb[NETNSA_TARGET_NSID]) { + int id = nla_get_s32(tb[NETNSA_TARGET_NSID]); + + target = rtnl_get_net_ns_capable(NETLINK_CB(skb).sk, id); + if (IS_ERR(target)) { + NL_SET_BAD_ATTR(extack, tb[NETNSA_TARGET_NSID]); + NL_SET_ERR_MSG(extack, + "Target netns reference is invalid"); + err = PTR_ERR(target); + goto out; + } + fillargs.add_ref = true; + fillargs.ref_nsid = peernet2id(net, peer); + } + msg = nlmsg_new(rtnl_net_get_size(), GFP_KERNEL); if (!msg) { err = -ENOMEM; goto out; } - id = peernet2id(net, peer); - err = rtnl_net_fill(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq, 0, - RTM_NEWNSID, net, id); + fillargs.nsid = peernet2id(target, peer); + err = rtnl_net_fill(msg, &fillargs); if (err < 0) goto err_out; @@ -811,14 +851,17 @@ static int rtnl_net_getid(struct sk_buff *skb, struct nlmsghdr *nlh, err_out: nlmsg_free(msg); out: + if (fillargs.add_ref) + put_net(target); put_net(peer); return err; } struct rtnl_net_dump_cb { - struct net *net; + struct net *tgt_net; + struct net *ref_net; struct sk_buff *skb; - struct netlink_callback *cb; + struct net_fill_args fillargs; int idx; int s_idx; }; @@ -831,9 +874,10 @@ static int rtnl_net_dumpid_one(int id, void *peer, void *data) if (net_cb->idx < net_cb->s_idx) goto cont; - ret = rtnl_net_fill(net_cb->skb, NETLINK_CB(net_cb->cb->skb).portid, - net_cb->cb->nlh->nlmsg_seq, NLM_F_MULTI, - RTM_NEWNSID, net_cb->net, id); + net_cb->fillargs.nsid = id; + if (net_cb->fillargs.add_ref) + net_cb->fillargs.ref_nsid = __peernet2id(net_cb->ref_net, peer); + ret = rtnl_net_fill(net_cb->skb, &net_cb->fillargs); if (ret < 0) return ret; @@ -842,33 +886,96 @@ cont: return 0; } +static int rtnl_valid_dump_net_req(const struct nlmsghdr *nlh, struct sock *sk, + struct rtnl_net_dump_cb *net_cb, + struct netlink_callback *cb) +{ + struct netlink_ext_ack *extack = cb->extack; + struct nlattr *tb[NETNSA_MAX + 1]; + int err, i; + + err = nlmsg_parse_strict(nlh, sizeof(struct rtgenmsg), tb, NETNSA_MAX, + rtnl_net_policy, extack); + if (err < 0) + return err; + + for (i = 0; i <= NETNSA_MAX; i++) { + if (!tb[i]) + continue; + + if (i == NETNSA_TARGET_NSID) { + struct net *net; + + net = rtnl_get_net_ns_capable(sk, nla_get_s32(tb[i])); + if (IS_ERR(net)) { + NL_SET_BAD_ATTR(extack, tb[i]); + NL_SET_ERR_MSG(extack, + "Invalid target network namespace id"); + return PTR_ERR(net); + } + net_cb->fillargs.add_ref = true; + net_cb->ref_net = net_cb->tgt_net; + net_cb->tgt_net = net; + } else { + NL_SET_BAD_ATTR(extack, tb[i]); + NL_SET_ERR_MSG(extack, + "Unsupported attribute in dump request"); + return -EINVAL; + } + } + + return 0; +} + static int rtnl_net_dumpid(struct sk_buff *skb, struct netlink_callback *cb) { - struct net *net = sock_net(skb->sk); struct rtnl_net_dump_cb net_cb = { - .net = net, + .tgt_net = sock_net(skb->sk), .skb = skb, - .cb = cb, + .fillargs = { + .portid = NETLINK_CB(cb->skb).portid, + .seq = cb->nlh->nlmsg_seq, + .flags = NLM_F_MULTI, + .cmd = RTM_NEWNSID, + }, .idx = 0, .s_idx = cb->args[0], }; + int err = 0; - if (cb->strict_check && - nlmsg_attrlen(cb->nlh, sizeof(struct rtgenmsg))) { - NL_SET_ERR_MSG(cb->extack, "Unknown data in network namespace id dump request"); - return -EINVAL; + if (cb->strict_check) { + err = rtnl_valid_dump_net_req(cb->nlh, skb->sk, &net_cb, cb); + if (err < 0) + goto end; } - spin_lock_bh(&net->nsid_lock); - idr_for_each(&net->netns_ids, rtnl_net_dumpid_one, &net_cb); - spin_unlock_bh(&net->nsid_lock); + spin_lock_bh(&net_cb.tgt_net->nsid_lock); + if (net_cb.fillargs.add_ref && + !net_eq(net_cb.ref_net, net_cb.tgt_net) && + !spin_trylock_bh(&net_cb.ref_net->nsid_lock)) { + spin_unlock_bh(&net_cb.tgt_net->nsid_lock); + err = -EAGAIN; + goto end; + } + idr_for_each(&net_cb.tgt_net->netns_ids, rtnl_net_dumpid_one, &net_cb); + if (net_cb.fillargs.add_ref && + !net_eq(net_cb.ref_net, net_cb.tgt_net)) + spin_unlock_bh(&net_cb.ref_net->nsid_lock); + spin_unlock_bh(&net_cb.tgt_net->nsid_lock); cb->args[0] = net_cb.idx; - return skb->len; +end: + if (net_cb.fillargs.add_ref) + put_net(net_cb.tgt_net); + return err < 0 ? err : skb->len; } static void rtnl_net_notifyid(struct net *net, int cmd, int id) { + struct net_fill_args fillargs = { + .cmd = cmd, + .nsid = id, + }; struct sk_buff *msg; int err = -ENOMEM; @@ -876,7 +983,7 @@ static void rtnl_net_notifyid(struct net *net, int cmd, int id) if (!msg) goto out; - err = rtnl_net_fill(msg, 0, 0, 0, cmd, net, id); + err = rtnl_net_fill(msg, &fillargs); if (err < 0) goto err_out; diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 2b9fdbc43205..36a2b63ffd6d 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -663,7 +663,7 @@ int netpoll_setup(struct netpoll *np) np_info(np, "device %s not up yet, forcing it\n", np->dev_name); - err = dev_open(ndev); + err = dev_open(ndev, NULL); if (err) { np_err(np, "failed to open %s\n", ndev->name); diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 7819f7804eeb..c9c0407a7ee0 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -59,7 +59,7 @@ #include <net/rtnetlink.h> #include <net/net_namespace.h> -#define RTNL_MAX_TYPE 49 +#define RTNL_MAX_TYPE 50 #define RTNL_SLAVE_MAX_TYPE 36 struct rtnl_link { @@ -2489,7 +2489,8 @@ static int do_setlink(const struct sk_buff *skb, } if (ifm->ifi_flags || ifm->ifi_change) { - err = dev_change_flags(dev, rtnl_dev_combine_flags(dev, ifm)); + err = dev_change_flags(dev, rtnl_dev_combine_flags(dev, ifm), + extack); if (err < 0) goto errout; } @@ -2870,7 +2871,8 @@ int rtnl_configure_link(struct net_device *dev, const struct ifinfomsg *ifm) old_flags = dev->flags; if (ifm && (ifm->ifi_flags || ifm->ifi_change)) { - err = __dev_change_flags(dev, rtnl_dev_combine_flags(dev, ifm)); + err = __dev_change_flags(dev, rtnl_dev_combine_flags(dev, ifm), + NULL); if (err < 0) return err; } @@ -2885,9 +2887,11 @@ int rtnl_configure_link(struct net_device *dev, const struct ifinfomsg *ifm) } EXPORT_SYMBOL(rtnl_configure_link); -struct net_device *rtnl_create_link(struct net *net, - const char *ifname, unsigned char name_assign_type, - const struct rtnl_link_ops *ops, struct nlattr *tb[]) +struct net_device *rtnl_create_link(struct net *net, const char *ifname, + unsigned char name_assign_type, + const struct rtnl_link_ops *ops, + struct nlattr *tb[], + struct netlink_ext_ack *extack) { struct net_device *dev; unsigned int num_tx_queues = 1; @@ -2903,11 +2907,15 @@ struct net_device *rtnl_create_link(struct net *net, else if (ops->get_num_rx_queues) num_rx_queues = ops->get_num_rx_queues(); - if (num_tx_queues < 1 || num_tx_queues > 4096) + if (num_tx_queues < 1 || num_tx_queues > 4096) { + NL_SET_ERR_MSG(extack, "Invalid number of transmit queues"); return ERR_PTR(-EINVAL); + } - if (num_rx_queues < 1 || num_rx_queues > 4096) + if (num_rx_queues < 1 || num_rx_queues > 4096) { + NL_SET_ERR_MSG(extack, "Invalid number of receive queues"); return ERR_PTR(-EINVAL); + } dev = alloc_netdev_mqs(ops->priv_size, ifname, name_assign_type, ops->setup, num_tx_queues, num_rx_queues); @@ -2965,20 +2973,24 @@ static int rtnl_group_changelink(const struct sk_buff *skb, return 0; } -static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, - struct netlink_ext_ack *extack) +static int __rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, + struct nlattr **attr, struct netlink_ext_ack *extack) { + struct nlattr *slave_attr[RTNL_SLAVE_MAX_TYPE + 1]; + unsigned char name_assign_type = NET_NAME_USER; + struct nlattr *linkinfo[IFLA_INFO_MAX + 1]; + const struct rtnl_link_ops *m_ops = NULL; + struct net_device *master_dev = NULL; struct net *net = sock_net(skb->sk); const struct rtnl_link_ops *ops; - const struct rtnl_link_ops *m_ops = NULL; + struct nlattr *tb[IFLA_MAX + 1]; + struct net *dest_net, *link_net; + struct nlattr **slave_data; + char kind[MODULE_NAME_LEN]; struct net_device *dev; - struct net_device *master_dev = NULL; struct ifinfomsg *ifm; - char kind[MODULE_NAME_LEN]; char ifname[IFNAMSIZ]; - struct nlattr *tb[IFLA_MAX+1]; - struct nlattr *linkinfo[IFLA_INFO_MAX+1]; - unsigned char name_assign_type = NET_NAME_USER; + struct nlattr **data; int err; #ifdef CONFIG_MODULES @@ -3034,193 +3046,200 @@ replay: ops = NULL; } - if (1) { - struct nlattr *attr[RTNL_MAX_TYPE + 1]; - struct nlattr *slave_attr[RTNL_SLAVE_MAX_TYPE + 1]; - struct nlattr **data = NULL; - struct nlattr **slave_data = NULL; - struct net *dest_net, *link_net = NULL; - - if (ops) { - if (ops->maxtype > RTNL_MAX_TYPE) - return -EINVAL; + data = NULL; + if (ops) { + if (ops->maxtype > RTNL_MAX_TYPE) + return -EINVAL; - if (ops->maxtype && linkinfo[IFLA_INFO_DATA]) { - err = nla_parse_nested(attr, ops->maxtype, - linkinfo[IFLA_INFO_DATA], - ops->policy, NULL); - if (err < 0) - return err; - data = attr; - } - if (ops->validate) { - err = ops->validate(tb, data, extack); - if (err < 0) - return err; - } + if (ops->maxtype && linkinfo[IFLA_INFO_DATA]) { + err = nla_parse_nested(attr, ops->maxtype, + linkinfo[IFLA_INFO_DATA], + ops->policy, extack); + if (err < 0) + return err; + data = attr; + } + if (ops->validate) { + err = ops->validate(tb, data, extack); + if (err < 0) + return err; } + } - if (m_ops) { - if (m_ops->slave_maxtype > RTNL_SLAVE_MAX_TYPE) - return -EINVAL; + slave_data = NULL; + if (m_ops) { + if (m_ops->slave_maxtype > RTNL_SLAVE_MAX_TYPE) + return -EINVAL; - if (m_ops->slave_maxtype && - linkinfo[IFLA_INFO_SLAVE_DATA]) { - err = nla_parse_nested(slave_attr, - m_ops->slave_maxtype, - linkinfo[IFLA_INFO_SLAVE_DATA], - m_ops->slave_policy, - NULL); - if (err < 0) - return err; - slave_data = slave_attr; - } + if (m_ops->slave_maxtype && + linkinfo[IFLA_INFO_SLAVE_DATA]) { + err = nla_parse_nested(slave_attr, m_ops->slave_maxtype, + linkinfo[IFLA_INFO_SLAVE_DATA], + m_ops->slave_policy, extack); + if (err < 0) + return err; + slave_data = slave_attr; } + } - if (dev) { - int status = 0; - - if (nlh->nlmsg_flags & NLM_F_EXCL) - return -EEXIST; - if (nlh->nlmsg_flags & NLM_F_REPLACE) - return -EOPNOTSUPP; + if (dev) { + int status = 0; - if (linkinfo[IFLA_INFO_DATA]) { - if (!ops || ops != dev->rtnl_link_ops || - !ops->changelink) - return -EOPNOTSUPP; + if (nlh->nlmsg_flags & NLM_F_EXCL) + return -EEXIST; + if (nlh->nlmsg_flags & NLM_F_REPLACE) + return -EOPNOTSUPP; - err = ops->changelink(dev, tb, data, extack); - if (err < 0) - return err; - status |= DO_SETLINK_NOTIFY; - } + if (linkinfo[IFLA_INFO_DATA]) { + if (!ops || ops != dev->rtnl_link_ops || + !ops->changelink) + return -EOPNOTSUPP; - if (linkinfo[IFLA_INFO_SLAVE_DATA]) { - if (!m_ops || !m_ops->slave_changelink) - return -EOPNOTSUPP; + err = ops->changelink(dev, tb, data, extack); + if (err < 0) + return err; + status |= DO_SETLINK_NOTIFY; + } - err = m_ops->slave_changelink(master_dev, dev, - tb, slave_data, - extack); - if (err < 0) - return err; - status |= DO_SETLINK_NOTIFY; - } + if (linkinfo[IFLA_INFO_SLAVE_DATA]) { + if (!m_ops || !m_ops->slave_changelink) + return -EOPNOTSUPP; - return do_setlink(skb, dev, ifm, extack, tb, ifname, - status); + err = m_ops->slave_changelink(master_dev, dev, tb, + slave_data, extack); + if (err < 0) + return err; + status |= DO_SETLINK_NOTIFY; } - if (!(nlh->nlmsg_flags & NLM_F_CREATE)) { - if (ifm->ifi_index == 0 && tb[IFLA_GROUP]) - return rtnl_group_changelink(skb, net, + return do_setlink(skb, dev, ifm, extack, tb, ifname, status); + } + + if (!(nlh->nlmsg_flags & NLM_F_CREATE)) { + if (ifm->ifi_index == 0 && tb[IFLA_GROUP]) + return rtnl_group_changelink(skb, net, nla_get_u32(tb[IFLA_GROUP]), ifm, extack, tb); - return -ENODEV; - } + return -ENODEV; + } - if (tb[IFLA_MAP] || tb[IFLA_PROTINFO]) - return -EOPNOTSUPP; + if (tb[IFLA_MAP] || tb[IFLA_PROTINFO]) + return -EOPNOTSUPP; - if (!ops) { + if (!ops) { #ifdef CONFIG_MODULES - if (kind[0]) { - __rtnl_unlock(); - request_module("rtnl-link-%s", kind); - rtnl_lock(); - ops = rtnl_link_ops_get(kind); - if (ops) - goto replay; - } -#endif - return -EOPNOTSUPP; + if (kind[0]) { + __rtnl_unlock(); + request_module("rtnl-link-%s", kind); + rtnl_lock(); + ops = rtnl_link_ops_get(kind); + if (ops) + goto replay; } +#endif + NL_SET_ERR_MSG(extack, "Unknown device type"); + return -EOPNOTSUPP; + } - if (!ops->setup) - return -EOPNOTSUPP; - - if (!ifname[0]) { - snprintf(ifname, IFNAMSIZ, "%s%%d", ops->kind); - name_assign_type = NET_NAME_ENUM; - } + if (!ops->setup) + return -EOPNOTSUPP; - dest_net = rtnl_link_get_net_capable(skb, net, tb, CAP_NET_ADMIN); - if (IS_ERR(dest_net)) - return PTR_ERR(dest_net); + if (!ifname[0]) { + snprintf(ifname, IFNAMSIZ, "%s%%d", ops->kind); + name_assign_type = NET_NAME_ENUM; + } - if (tb[IFLA_LINK_NETNSID]) { - int id = nla_get_s32(tb[IFLA_LINK_NETNSID]); + dest_net = rtnl_link_get_net_capable(skb, net, tb, CAP_NET_ADMIN); + if (IS_ERR(dest_net)) + return PTR_ERR(dest_net); - link_net = get_net_ns_by_id(dest_net, id); - if (!link_net) { - err = -EINVAL; - goto out; - } - err = -EPERM; - if (!netlink_ns_capable(skb, link_net->user_ns, CAP_NET_ADMIN)) - goto out; - } + if (tb[IFLA_LINK_NETNSID]) { + int id = nla_get_s32(tb[IFLA_LINK_NETNSID]); - dev = rtnl_create_link(link_net ? : dest_net, ifname, - name_assign_type, ops, tb); - if (IS_ERR(dev)) { - err = PTR_ERR(dev); + link_net = get_net_ns_by_id(dest_net, id); + if (!link_net) { + NL_SET_ERR_MSG(extack, "Unknown network namespace id"); + err = -EINVAL; goto out; } + err = -EPERM; + if (!netlink_ns_capable(skb, link_net->user_ns, CAP_NET_ADMIN)) + goto out; + } else { + link_net = NULL; + } - dev->ifindex = ifm->ifi_index; + dev = rtnl_create_link(link_net ? : dest_net, ifname, + name_assign_type, ops, tb, extack); + if (IS_ERR(dev)) { + err = PTR_ERR(dev); + goto out; + } - if (ops->newlink) { - err = ops->newlink(link_net ? : net, dev, tb, data, - extack); - /* Drivers should call free_netdev() in ->destructor - * and unregister it on failure after registration - * so that device could be finally freed in rtnl_unlock. - */ - if (err < 0) { - /* If device is not registered at all, free it now */ - if (dev->reg_state == NETREG_UNINITIALIZED) - free_netdev(dev); - goto out; - } - } else { - err = register_netdevice(dev); - if (err < 0) { + dev->ifindex = ifm->ifi_index; + + if (ops->newlink) { + err = ops->newlink(link_net ? : net, dev, tb, data, extack); + /* Drivers should call free_netdev() in ->destructor + * and unregister it on failure after registration + * so that device could be finally freed in rtnl_unlock. + */ + if (err < 0) { + /* If device is not registered at all, free it now */ + if (dev->reg_state == NETREG_UNINITIALIZED) free_netdev(dev); - goto out; - } + goto out; + } + } else { + err = register_netdevice(dev); + if (err < 0) { + free_netdev(dev); + goto out; } - err = rtnl_configure_link(dev, ifm); + } + err = rtnl_configure_link(dev, ifm); + if (err < 0) + goto out_unregister; + if (link_net) { + err = dev_change_net_namespace(dev, dest_net, ifname); if (err < 0) goto out_unregister; - if (link_net) { - err = dev_change_net_namespace(dev, dest_net, ifname); - if (err < 0) - goto out_unregister; - } - if (tb[IFLA_MASTER]) { - err = do_set_master(dev, nla_get_u32(tb[IFLA_MASTER]), - extack); - if (err) - goto out_unregister; - } + } + if (tb[IFLA_MASTER]) { + err = do_set_master(dev, nla_get_u32(tb[IFLA_MASTER]), extack); + if (err) + goto out_unregister; + } out: - if (link_net) - put_net(link_net); - put_net(dest_net); - return err; + if (link_net) + put_net(link_net); + put_net(dest_net); + return err; out_unregister: - if (ops->newlink) { - LIST_HEAD(list_kill); + if (ops->newlink) { + LIST_HEAD(list_kill); - ops->dellink(dev, &list_kill); - unregister_netdevice_many(&list_kill); - } else { - unregister_netdevice(dev); - } - goto out; + ops->dellink(dev, &list_kill); + unregister_netdevice_many(&list_kill); + } else { + unregister_netdevice(dev); } + goto out; +} + +static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct nlattr **attr; + int ret; + + attr = kmalloc_array(RTNL_MAX_TYPE + 1, sizeof(*attr), GFP_KERNEL); + if (!attr) + return -ENOMEM; + + ret = __rtnl_newlink(skb, nlh, attr, extack); + kfree(attr); + return ret; } static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr *nlh, diff --git a/net/core/skbuff.c b/net/core/skbuff.c index a8217e221e19..40552547c69a 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -1089,7 +1089,7 @@ void sock_zerocopy_put(struct ubuf_info *uarg) } EXPORT_SYMBOL_GPL(sock_zerocopy_put); -void sock_zerocopy_put_abort(struct ubuf_info *uarg) +void sock_zerocopy_put_abort(struct ubuf_info *uarg, bool have_uref) { if (uarg) { struct sock *sk = skb_from_uarg(uarg)->sk; @@ -1097,7 +1097,8 @@ void sock_zerocopy_put_abort(struct ubuf_info *uarg) atomic_dec(&sk->sk_zckey); uarg->len--; - sock_zerocopy_put(uarg); + if (have_uref) + sock_zerocopy_put(uarg); } } EXPORT_SYMBOL_GPL(sock_zerocopy_put_abort); @@ -1105,6 +1106,12 @@ EXPORT_SYMBOL_GPL(sock_zerocopy_put_abort); extern int __zerocopy_sg_from_iter(struct sock *sk, struct sk_buff *skb, struct iov_iter *from, size_t length); +int skb_zerocopy_iter_dgram(struct sk_buff *skb, struct msghdr *msg, int len) +{ + return __zerocopy_sg_from_iter(skb->sk, skb, &msg->msg_iter, len); +} +EXPORT_SYMBOL_GPL(skb_zerocopy_iter_dgram); + int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb, struct msghdr *msg, int len, struct ubuf_info *uarg) @@ -1131,7 +1138,7 @@ int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb, return err; } - skb_zcopy_set(skb, uarg); + skb_zcopy_set(skb, uarg, NULL); return skb->len - orig_len; } EXPORT_SYMBOL_GPL(skb_zerocopy_iter_stream); @@ -1151,7 +1158,7 @@ static int skb_zerocopy_clone(struct sk_buff *nskb, struct sk_buff *orig, if (skb_copy_ubufs(nskb, GFP_ATOMIC)) return -EIO; } - skb_zcopy_set(nskb, skb_uarg(orig)); + skb_zcopy_set(nskb, skb_uarg(orig), NULL); } return 0; } @@ -1925,8 +1932,6 @@ void *__pskb_pull_tail(struct sk_buff *skb, int delta) struct sk_buff *insp = NULL; do { - BUG_ON(!list); - if (list->len <= eat) { /* Eaten as whole. */ eat -= list->len; @@ -2366,19 +2371,6 @@ error: } EXPORT_SYMBOL_GPL(skb_send_sock_locked); -/* Send skb data on a socket. */ -int skb_send_sock(struct sock *sk, struct sk_buff *skb, int offset, int len) -{ - int ret = 0; - - lock_sock(sk); - ret = skb_send_sock_locked(sk, skb, offset, len); - release_sock(sk); - - return ret; -} -EXPORT_SYMBOL_GPL(skb_send_sock); - /** * skb_store_bits - store bits from kernel buffer to skb * @skb: destination buffer @@ -2645,6 +2637,65 @@ __wsum skb_copy_and_csum_bits(const struct sk_buff *skb, int offset, } EXPORT_SYMBOL(skb_copy_and_csum_bits); +__sum16 __skb_checksum_complete_head(struct sk_buff *skb, int len) +{ + __sum16 sum; + + sum = csum_fold(skb_checksum(skb, 0, len, skb->csum)); + /* See comments in __skb_checksum_complete(). */ + if (likely(!sum)) { + if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) && + !skb->csum_complete_sw) + netdev_rx_csum_fault(skb->dev, skb); + } + if (!skb_shared(skb)) + skb->csum_valid = !sum; + return sum; +} +EXPORT_SYMBOL(__skb_checksum_complete_head); + +/* This function assumes skb->csum already holds pseudo header's checksum, + * which has been changed from the hardware checksum, for example, by + * __skb_checksum_validate_complete(). And, the original skb->csum must + * have been validated unsuccessfully for CHECKSUM_COMPLETE case. + * + * It returns non-zero if the recomputed checksum is still invalid, otherwise + * zero. The new checksum is stored back into skb->csum unless the skb is + * shared. + */ +__sum16 __skb_checksum_complete(struct sk_buff *skb) +{ + __wsum csum; + __sum16 sum; + + csum = skb_checksum(skb, 0, skb->len, 0); + + sum = csum_fold(csum_add(skb->csum, csum)); + /* This check is inverted, because we already knew the hardware + * checksum is invalid before calling this function. So, if the + * re-computed checksum is valid instead, then we have a mismatch + * between the original skb->csum and skb_checksum(). This means either + * the original hardware checksum is incorrect or we screw up skb->csum + * when moving skb->data around. + */ + if (likely(!sum)) { + if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) && + !skb->csum_complete_sw) + netdev_rx_csum_fault(skb->dev, skb); + } + + if (!skb_shared(skb)) { + /* Save full packet checksum */ + skb->csum = csum; + skb->ip_summed = CHECKSUM_COMPLETE; + skb->csum_complete_sw = 1; + skb->csum_valid = !sum; + } + + return sum; +} +EXPORT_SYMBOL(__skb_checksum_complete); + static __wsum warn_crc32c_csum_update(const void *buff, int len, __wsum sum) { net_warn_ratelimited( @@ -2962,28 +3013,6 @@ void skb_append(struct sk_buff *old, struct sk_buff *newsk, struct sk_buff_head } EXPORT_SYMBOL(skb_append); -/** - * skb_insert - insert a buffer - * @old: buffer to insert before - * @newsk: buffer to insert - * @list: list to use - * - * Place a packet before a given packet in a list. The list locks are - * taken and this function is atomic with respect to other list locked - * calls. - * - * A buffer cannot be placed on two lists at the same time. - */ -void skb_insert(struct sk_buff *old, struct sk_buff *newsk, struct sk_buff_head *list) -{ - unsigned long flags; - - spin_lock_irqsave(&list->lock, flags); - __skb_insert(newsk, old->prev, old, list); - spin_unlock_irqrestore(&list->lock, flags); -} -EXPORT_SYMBOL(skb_insert); - static inline void skb_split_inside_header(struct sk_buff *skb, struct sk_buff* skb1, const u32 len, const int pos) @@ -4856,7 +4885,7 @@ void skb_scrub_packet(struct sk_buff *skb, bool xnet) #ifdef CONFIG_NET_SWITCHDEV skb->offload_fwd_mark = 0; - skb->offload_mr_fwd_mark = 0; + skb->offload_l3_fwd_mark = 0; #endif if (!xnet) @@ -5128,7 +5157,7 @@ int skb_vlan_pop(struct sk_buff *skb) int err; if (likely(skb_vlan_tag_present(skb))) { - skb->vlan_tci = 0; + __vlan_hwaccel_clear_tag(skb); } else { if (unlikely(!eth_type_vlan(skb->protocol))) return 0; diff --git a/net/core/sock.c b/net/core/sock.c index 080a880a1761..f00902c532cc 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -567,6 +567,8 @@ static int sock_setbindtodevice(struct sock *sk, char __user *optval, lock_sock(sk); sk->sk_bound_dev_if = index; + if (sk->sk_prot->rehash) + sk->sk_prot->rehash(sk); sk_dst_reset(sk); release_sock(sk); @@ -698,6 +700,7 @@ int sock_setsockopt(struct socket *sock, int level, int optname, break; case SO_DONTROUTE: sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool); + sk_dst_reset(sk); break; case SO_BROADCAST: sock_valbool_flag(sk, SOCK_BROADCAST, valbool); @@ -950,10 +953,12 @@ set_rcvbuf: clear_bit(SOCK_PASSSEC, &sock->flags); break; case SO_MARK: - if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) + if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) { ret = -EPERM; - else + } else if (val != sk->sk_mark) { sk->sk_mark = val; + sk_dst_reset(sk); + } break; case SO_RXQ_OVFL: @@ -1014,7 +1019,10 @@ set_rcvbuf: case SO_ZEROCOPY: if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) { - if (sk->sk_protocol != IPPROTO_TCP) + if (!((sk->sk_type == SOCK_STREAM && + sk->sk_protocol == IPPROTO_TCP) || + (sk->sk_type == SOCK_DGRAM && + sk->sk_protocol == IPPROTO_UDP))) ret = -ENOTSUPP; } else if (sk->sk_family != PF_RDS) { ret = -ENOTSUPP; diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c index ba5cba56f574..d8fe3e549373 100644 --- a/net/core/sock_reuseport.c +++ b/net/core/sock_reuseport.c @@ -187,6 +187,7 @@ int reuseport_add_sock(struct sock *sk, struct sock *sk2, bool bind_inany) call_rcu(&old_reuse->rcu, reuseport_free_rcu); return 0; } +EXPORT_SYMBOL(reuseport_add_sock); void reuseport_detach_sock(struct sock *sk) { diff --git a/net/core/stream.c b/net/core/stream.c index 7d329fb1f553..e94bb02a5629 100644 --- a/net/core/stream.c +++ b/net/core/stream.c @@ -32,7 +32,7 @@ void sk_stream_write_space(struct sock *sk) struct socket *sock = sk->sk_socket; struct socket_wq *wq; - if (sk_stream_is_writeable(sk) && sock) { + if (__sk_stream_is_writeable(sk, 1) && sock) { clear_bit(SOCK_NOSPACE, &sock->flags); rcu_read_lock(); |