summaryrefslogtreecommitdiff
path: root/net/ipv4
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2021-11-11 09:49:36 -0800
committerLinus Torvalds <torvalds@linux-foundation.org>2021-11-11 09:49:36 -0800
commitf54ca91fe6f25c2028f953ce82f19ca2ea0f07bb (patch)
tree6a16c8a40cb30141aa3c26fa8408a5085d6750e1 /net/ipv4
parentc55a04176cba5a2de085461438b65e1c46541ef3 (diff)
parentd336509cb9d03970911878bb77f0497f64fda061 (diff)
Merge tag 'net-5.16-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net
Pull networking fixes from Jakub Kicinski: "Including fixes from bpf, can and netfilter. Current release - regressions: - bpf: do not reject when the stack read size is different from the tracked scalar size - net: fix premature exit from NAPI state polling in napi_disable() - riscv, bpf: fix RV32 broken build, and silence RV64 warning Current release - new code bugs: - net: fix possible NULL deref in sock_reserve_memory - amt: fix error return code in amt_init(); fix stopping the workqueue - ax88796c: use the correct ioctl callback Previous releases - always broken: - bpf: stop caching subprog index in the bpf_pseudo_func insn - security: fixups for the security hooks in sctp - nfc: add necessary privilege flags in netlink layer, limit operations to admin only - vsock: prevent unnecessary refcnt inc for non-blocking connect - net/smc: fix sk_refcnt underflow on link down and fallback - nfnetlink_queue: fix OOB when mac header was cleared - can: j1939: ignore invalid messages per standard - bpf, sockmap: - fix race in ingress receive verdict with redirect to self - fix incorrect sk_skb data_end access when src_reg = dst_reg - strparser, and tls are reusing qdisc_skb_cb and colliding - ethtool: fix ethtool msg len calculation for pause stats - vlan: fix a UAF in vlan_dev_real_dev() when ref-holder tries to access an unregistering real_dev - udp6: make encap_rcv() bump the v6 not v4 stats - drv: prestera: add explicit padding to fix m68k build - drv: felix: fix broken VLAN-tagged PTP under VLAN-aware bridge - drv: mvpp2: fix wrong SerDes reconfiguration order Misc & small latecomers: - ipvs: auto-load ipvs on genl access - mctp: sanity check the struct sockaddr_mctp padding fields - libfs: support RENAME_EXCHANGE in simple_rename() - avoid double accounting for pure zerocopy skbs" * tag 'net-5.16-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net: (123 commits) selftests/net: udpgso_bench_rx: fix port argument net: wwan: iosm: fix compilation warning cxgb4: fix eeprom len when diagnostics not implemented net: fix premature exit from NAPI state polling in napi_disable() net/smc: fix sk_refcnt underflow on linkdown and fallback net/mlx5: Lag, fix a potential Oops with mlx5_lag_create_definer() gve: fix unmatched u64_stats_update_end() net: ethernet: lantiq_etop: Fix compilation error selftests: forwarding: Fix packet matching in mirroring selftests vsock: prevent unnecessary refcnt inc for nonblocking connect net: marvell: mvpp2: Fix wrong SerDes reconfiguration order net: ethernet: ti: cpsw_ale: Fix access to un-initialized memory net: stmmac: allow a tc-taprio base-time of zero selftests: net: test_vxlan_under_vrf: fix HV connectivity test net: hns3: allow configure ETS bandwidth of all TCs net: hns3: remove check VF uc mac exist when set by PF net: hns3: fix some mac statistics is always 0 in device version V2 net: hns3: fix kernel crash when unload VF while it is being reset net: hns3: sync rx ring head in echo common pull net: hns3: fix pfc packet number incorrect after querying pfc parameters ...
Diffstat (limited to 'net/ipv4')
-rw-r--r--net/ipv4/tcp.c22
-rw-r--r--net/ipv4/tcp_bpf.c48
-rw-r--r--net/ipv4/tcp_output.c27
3 files changed, 82 insertions, 15 deletions
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 3c6498dab6bd..b7796b4cf0a0 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -862,6 +862,7 @@ struct sk_buff *tcp_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp,
if (likely(skb)) {
bool mem_scheduled;
+ skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
if (force_schedule) {
mem_scheduled = true;
sk_forced_mem_schedule(sk, skb->truesize);
@@ -1318,6 +1319,15 @@ new_segment:
copy = min_t(int, copy, pfrag->size - pfrag->offset);
+ /* skb changing from pure zc to mixed, must charge zc */
+ if (unlikely(skb_zcopy_pure(skb))) {
+ if (!sk_wmem_schedule(sk, skb->data_len))
+ goto wait_for_space;
+
+ sk_mem_charge(sk, skb->data_len);
+ skb_shinfo(skb)->flags &= ~SKBFL_PURE_ZEROCOPY;
+ }
+
if (!sk_wmem_schedule(sk, copy))
goto wait_for_space;
@@ -1338,8 +1348,16 @@ new_segment:
}
pfrag->offset += copy;
} else {
- if (!sk_wmem_schedule(sk, copy))
- goto wait_for_space;
+ /* First append to a fragless skb builds initial
+ * pure zerocopy skb
+ */
+ if (!skb->len)
+ skb_shinfo(skb)->flags |= SKBFL_PURE_ZEROCOPY;
+
+ if (!skb_zcopy_pure(skb)) {
+ if (!sk_wmem_schedule(sk, copy))
+ goto wait_for_space;
+ }
err = skb_zerocopy_iter_stream(sk, skb, msg, copy, uarg);
if (err == -EMSGSIZE || err == -EEXIST) {
diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c
index 5f4d6f45d87f..f70aa0932bd6 100644
--- a/net/ipv4/tcp_bpf.c
+++ b/net/ipv4/tcp_bpf.c
@@ -172,6 +172,41 @@ static int tcp_msg_wait_data(struct sock *sk, struct sk_psock *psock,
return ret;
}
+static int tcp_bpf_recvmsg_parser(struct sock *sk,
+ struct msghdr *msg,
+ size_t len,
+ int nonblock,
+ int flags,
+ int *addr_len)
+{
+ struct sk_psock *psock;
+ int copied;
+
+ if (unlikely(flags & MSG_ERRQUEUE))
+ return inet_recv_error(sk, msg, len, addr_len);
+
+ psock = sk_psock_get(sk);
+ if (unlikely(!psock))
+ return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len);
+
+ lock_sock(sk);
+msg_bytes_ready:
+ copied = sk_msg_recvmsg(sk, psock, msg, len, flags);
+ if (!copied) {
+ long timeo;
+ int data;
+
+ timeo = sock_rcvtimeo(sk, nonblock);
+ data = tcp_msg_wait_data(sk, psock, timeo);
+ if (data && !sk_psock_queue_empty(psock))
+ goto msg_bytes_ready;
+ copied = -EAGAIN;
+ }
+ release_sock(sk);
+ sk_psock_put(sk, psock);
+ return copied;
+}
+
static int tcp_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
int nonblock, int flags, int *addr_len)
{
@@ -464,6 +499,8 @@ enum {
enum {
TCP_BPF_BASE,
TCP_BPF_TX,
+ TCP_BPF_RX,
+ TCP_BPF_TXRX,
TCP_BPF_NUM_CFGS,
};
@@ -475,7 +512,6 @@ static void tcp_bpf_rebuild_protos(struct proto prot[TCP_BPF_NUM_CFGS],
struct proto *base)
{
prot[TCP_BPF_BASE] = *base;
- prot[TCP_BPF_BASE].unhash = sock_map_unhash;
prot[TCP_BPF_BASE].close = sock_map_close;
prot[TCP_BPF_BASE].recvmsg = tcp_bpf_recvmsg;
prot[TCP_BPF_BASE].sock_is_readable = sk_msg_is_readable;
@@ -483,6 +519,12 @@ static void tcp_bpf_rebuild_protos(struct proto prot[TCP_BPF_NUM_CFGS],
prot[TCP_BPF_TX] = prot[TCP_BPF_BASE];
prot[TCP_BPF_TX].sendmsg = tcp_bpf_sendmsg;
prot[TCP_BPF_TX].sendpage = tcp_bpf_sendpage;
+
+ prot[TCP_BPF_RX] = prot[TCP_BPF_BASE];
+ prot[TCP_BPF_RX].recvmsg = tcp_bpf_recvmsg_parser;
+
+ prot[TCP_BPF_TXRX] = prot[TCP_BPF_TX];
+ prot[TCP_BPF_TXRX].recvmsg = tcp_bpf_recvmsg_parser;
}
static void tcp_bpf_check_v6_needs_rebuild(struct proto *ops)
@@ -520,6 +562,10 @@ int tcp_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore)
int family = sk->sk_family == AF_INET6 ? TCP_BPF_IPV6 : TCP_BPF_IPV4;
int config = psock->progs.msg_parser ? TCP_BPF_TX : TCP_BPF_BASE;
+ if (psock->progs.stream_verdict || psock->progs.skb_verdict) {
+ config = (config == TCP_BPF_TX) ? TCP_BPF_TXRX : TCP_BPF_RX;
+ }
+
if (restore) {
if (inet_csk_has_ulp(sk)) {
/* TLS does not have an unhash proto in SW cases,
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 6fbbf1558033..2e6e5a70168e 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -408,13 +408,13 @@ static inline bool tcp_urg_mode(const struct tcp_sock *tp)
return tp->snd_una != tp->snd_up;
}
-#define OPTION_SACK_ADVERTISE (1 << 0)
-#define OPTION_TS (1 << 1)
-#define OPTION_MD5 (1 << 2)
-#define OPTION_WSCALE (1 << 3)
-#define OPTION_FAST_OPEN_COOKIE (1 << 8)
-#define OPTION_SMC (1 << 9)
-#define OPTION_MPTCP (1 << 10)
+#define OPTION_SACK_ADVERTISE BIT(0)
+#define OPTION_TS BIT(1)
+#define OPTION_MD5 BIT(2)
+#define OPTION_WSCALE BIT(3)
+#define OPTION_FAST_OPEN_COOKIE BIT(8)
+#define OPTION_SMC BIT(9)
+#define OPTION_MPTCP BIT(10)
static void smc_options_write(__be32 *ptr, u16 *options)
{
@@ -1559,7 +1559,7 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue,
return -ENOMEM;
}
- if (skb_unclone(skb, gfp))
+ if (skb_unclone_keeptruesize(skb, gfp))
return -ENOMEM;
/* Get a new skb... force flag on. */
@@ -1667,7 +1667,7 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
{
u32 delta_truesize;
- if (skb_unclone(skb, GFP_ATOMIC))
+ if (skb_unclone_keeptruesize(skb, GFP_ATOMIC))
return -ENOMEM;
delta_truesize = __pskb_trim_head(skb, len);
@@ -1677,7 +1677,8 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
if (delta_truesize) {
skb->truesize -= delta_truesize;
sk_wmem_queued_add(sk, -delta_truesize);
- sk_mem_uncharge(sk, delta_truesize);
+ if (!skb_zcopy_pure(skb))
+ sk_mem_uncharge(sk, delta_truesize);
}
/* Any change of skb->len requires recalculation of tso factor. */
@@ -2295,7 +2296,9 @@ static bool tcp_can_coalesce_send_queue_head(struct sock *sk, int len)
if (len <= skb->len)
break;
- if (unlikely(TCP_SKB_CB(skb)->eor) || tcp_has_tx_tstamp(skb))
+ if (unlikely(TCP_SKB_CB(skb)->eor) ||
+ tcp_has_tx_tstamp(skb) ||
+ !skb_pure_zcopy_same(skb, next))
return false;
len -= skb->len;
@@ -3166,7 +3169,7 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
cur_mss, GFP_ATOMIC))
return -ENOMEM; /* We'll try again later. */
} else {
- if (skb_unclone(skb, GFP_ATOMIC))
+ if (skb_unclone_keeptruesize(skb, GFP_ATOMIC))
return -ENOMEM;
diff = tcp_skb_pcount(skb);