summaryrefslogtreecommitdiff
path: root/fs
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2024-11-21 08:28:08 -0800
committerLinus Torvalds <torvalds@linux-foundation.org>2024-11-21 08:28:08 -0800
commitfcc79e1714e8c2b8e216dc3149812edd37884eef (patch)
tree17a51d29db810b81412be040aaf380936b3261b4 /fs
parent6e95ef0258ff4ee23ae3b06bf6b00b33dbbd5ef7 (diff)
parentdd7207838d38780b51e4690ee508ab2d5057e099 (diff)
Merge tag 'net-next-6.13' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next
Pull networking updates from Paolo Abeni: "The most significant set of changes is the per netns RTNL. The new behavior is disabled by default, regression risk should be contained. Notably the new config knob PTP_1588_CLOCK_VMCLOCK will inherit its default value from PTP_1588_CLOCK_KVM, as the first is intended to be a more reliable replacement for the latter. Core: - Started a very large, in-progress, effort to make the RTNL lock scope per network-namespace, thus reducing the lock contention significantly in the containerized use-case, comprising: - RCU-ified some relevant slices of the FIB control path - introduce basic per netns locking helpers - namespacified the IPv4 address hash table - remove rtnl_register{,_module}() in favour of rtnl_register_many() - refactor rtnl_{new,del,set}link() moving as much validation as possible out of RTNL lock - convert all phonet doit() and dumpit() handlers to RCU - convert IPv4 addresses manipulation to per-netns RTNL - convert virtual interface creation to per-netns RTNL the per-netns lock infrastructure is guarded by the CONFIG_DEBUG_NET_SMALL_RTNL knob, disabled by default ad interim. - Introduce NAPI suspension, to efficiently switching between busy polling (NAPI processing suspended) and normal processing. - Migrate the IPv4 routing input, output and control path from direct ToS usage to DSCP macros. This is a work in progress to make ECN handling consistent and reliable. - Add drop reasons support to the IPv4 rotue input path, allowing better introspection in case of packets drop. - Make FIB seqnum lockless, dropping RTNL protection for read access. - Make inet{,v6} addresses hashing less predicable. - Allow providing timestamp OPT_ID via cmsg, to correlate TX packets and timestamps Things we sprinkled into general kernel code: - Add small file operations for debugfs, to reduce the struct ops size. - Refactoring and optimization for the implementation of page_frag API, This is a preparatory work to consolidate the page_frag implementation. Netfilter: - Optimize set element transactions to reduce memory consumption - Extended netlink error reporting for attribute parser failure. - Make legacy xtables configs user selectable, giving users the option to configure iptables without enabling any other config. - Address a lot of false-positive RCU issues, pointed by recent CI improvements. BPF: - Put xsk sockets on a struct diet and add various cleanups. Overall, this helps to bump performance by 12% for some workloads. - Extend BPF selftests to increase coverage of XDP features in combination with BPF cpumap. - Optimize and homogenize bpf_csum_diff helper for all archs and also add a batch of new BPF selftests for it. - Extend netkit with an option to delegate skb->{mark,priority} scrubbing to its BPF program. - Make the bpf_get_netns_cookie() helper available also to tc(x) BPF programs. Protocols: - Introduces 4-tuple hash for connected udp sockets, speeding-up significantly connected sockets lookup. - Add a fastpath for some TCP timers that usually expires after close, the socket lock contention. - Add inbound and outbound xfrm state caches to speed up state lookups. - Avoid sending MPTCP advertisements on stale subflows, reducing risks on loosing them. - Make neighbours table flushing more scalable, maintaining per device neigh lists. Driver API: - Introduce a unified interface to configure transmission H/W shaping, and expose it to user-space via generic-netlink. - Add support for per-NAPI config via netlink. This makes napi configuration persistent across queues removal and re-creation. Requires driver updates, currently supported drivers are: nVidia/Mellanox mlx4 and mlx5, Broadcom brcm and Intel ice. - Add ethtool support for writing SFP / PHY firmware blocks. - Track RSS context allocation from ethtool core. - Implement support for mirroring to DSA CPU port, via TC mirror offload. - Consolidate FDB updates notification, to avoid duplicates on device-specific entries. - Expose DPLL clock quality level to the user-space. - Support master-slave PHY config via device tree. Tests and tooling: - forwarding: introduce deferred commands, to simplify the cleanup phase Drivers: - Updated several drivers - Amazon vNic, Google vNic, Microsoft vNic, Intel e1000e and Broadcom Tigon3 - to use netdev-genl to link the IRQs and queues to NAPI IDs, allowing busy polling and better introspection. - Ethernet high-speed NICs: - nVidia/Mellanox: - mlx5: - a large refactor to implement support for cross E-Switch scheduling - refactor H/W conter management to let it scale better - H/W GRO cleanups - Intel (100G, ice):: - add support for ethtool reset - implement support for per TX queue H/W shaping - AMD/Solarflare: - implement per device queue stats support - Broadcom (bnxt): - improve wildcard l4proto on IPv4/IPv6 ntuple rules - Marvell Octeon: - Add representor support for each Resource Virtualization Unit (RVU) device. - Hisilicon: - add support for the BMC Gigabit Ethernet - IBM (EMAC): - driver cleanup and modernization - Cisco (VIC): - raise the queues number limit to 256 - Ethernet virtual: - Google vNIC: - implement page pool support - macsec: - inherit lower device's features and TSO limits when offloading - virtio_net: - enable premapped mode by default - support for XDP socket(AF_XDP) zerocopy TX - wireguard: - set the TSO max size to be GSO_MAX_SIZE, to aggregate larger packets. - Ethernet NICs embedded and virtual: - Broadcom ASP: - enable software timestamping - Freescale: - add enetc4 PF driver - MediaTek: Airoha SoC: - implement BQL support - RealTek r8169: - enable TSO by default on r8168/r8125 - implement extended ethtool stats - Renesas AVB: - enable TX checksum offload - Synopsys (stmmac): - support header splitting for vlan tagged packets - move common code for DWMAC4 and DWXGMAC into a separate FPE module. - add dwmac driver support for T-HEAD TH1520 SoC - Synopsys (xpcs): - driver refactor and cleanup - TI: - icssg_prueth: add VLAN offload support - Xilinx emaclite: - add clock support - Ethernet switches: - Microchip: - implement support for the lan969x Ethernet switch family - add LAN9646 switch support to KSZ DSA driver - Ethernet PHYs: - Marvel: 88q2x: enable auto negotiation - Microchip: add support for LAN865X Rev B1 and LAN867X Rev C1/C2 - PTP: - Add support for the Amazon virtual clock device - Add PtP driver for s390 clocks - WiFi: - mac80211 - EHT 1024 aggregation size for transmissions - new operation to indicate that a new interface is to be added - support radio separation of multi-band devices - move wireless extension spy implementation to libiw - Broadcom: - brcmfmac: optional LPO clock support - Microchip: - add support for Atmel WILC3000 - Qualcomm (ath12k): - firmware coredump collection support - add debugfs support for a multitude of statistics - Qualcomm (ath5k): - Arcadyan ARV45XX AR2417 & Gigaset SX76[23] AR241[34]A support - Realtek: - rtw88: 8821au and 8812au USB adapters support - rtw89: add thermal protection - rtw89: fine tune BT-coexsitence to improve user experience - rtw89: firmware secure boot for WiFi 6 chip - Bluetooth - add Qualcomm WCN785x support for ids Foxconn 0xe0fc/0xe0f3 and 0x13d3:0x3623 - add Realtek RTL8852BE support for id Foxconn 0xe123 - add MediaTek MT7920 support for wireless module ids - btintel_pcie: add handshake between driver and firmware - btintel_pcie: add recovery mechanism - btnxpuart: add GPIO support to power save feature" * tag 'net-next-6.13' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next: (1475 commits) mm: page_frag: fix a compile error when kernel is not compiled Documentation: tipc: fix formatting issue in tipc.rst selftests: nic_performance: Add selftest for performance of NIC driver selftests: nic_link_layer: Add selftest case for speed and duplex states selftests: nic_link_layer: Add link layer selftest for NIC driver bnxt_en: Add FW trace coredump segments to the coredump bnxt_en: Add a new ethtool -W dump flag bnxt_en: Add 2 parameters to bnxt_fill_coredump_seg_hdr() bnxt_en: Add functions to copy host context memory bnxt_en: Do not free FW log context memory bnxt_en: Manage the FW trace context memory bnxt_en: Allocate backing store memory for FW trace logs bnxt_en: Add a 'force' parameter to bnxt_free_ctx_mem() bnxt_en: Refactor bnxt_free_ctx_mem() bnxt_en: Add mem_valid bit to struct bnxt_ctx_mem_type bnxt_en: Update firmware interface spec to 1.10.3.85 selftests/bpf: Add some tests with sockmap SK_PASS bpf: fix recursive lock when verdict program return SK_PASS wireguard: device: support big tcp GSO wireguard: selftests: load nf_conntrack if not present ...
Diffstat (limited to 'fs')
-rw-r--r--fs/debugfs/file.c100
-rw-r--r--fs/debugfs/inode.c63
-rw-r--r--fs/debugfs/internal.h6
-rw-r--r--fs/eventpoll.c36
4 files changed, 136 insertions, 69 deletions
diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c
index 67299e8b734e..47dc96dfe386 100644
--- a/fs/debugfs/file.c
+++ b/fs/debugfs/file.c
@@ -100,8 +100,16 @@ int debugfs_file_get(struct dentry *dentry)
if (!fsd)
return -ENOMEM;
- fsd->real_fops = (void *)((unsigned long)d_fsd &
- ~DEBUGFS_FSDATA_IS_REAL_FOPS_BIT);
+ if ((unsigned long)d_fsd & DEBUGFS_FSDATA_IS_SHORT_FOPS_BIT) {
+ fsd->real_fops = NULL;
+ fsd->short_fops = (void *)((unsigned long)d_fsd &
+ ~(DEBUGFS_FSDATA_IS_REAL_FOPS_BIT |
+ DEBUGFS_FSDATA_IS_SHORT_FOPS_BIT));
+ } else {
+ fsd->real_fops = (void *)((unsigned long)d_fsd &
+ ~DEBUGFS_FSDATA_IS_REAL_FOPS_BIT);
+ fsd->short_fops = NULL;
+ }
refcount_set(&fsd->active_users, 1);
init_completion(&fsd->active_users_drained);
INIT_LIST_HEAD(&fsd->cancellations);
@@ -241,9 +249,10 @@ static int debugfs_locked_down(struct inode *inode,
{
if ((inode->i_mode & 07777 & ~0444) == 0 &&
!(filp->f_mode & FMODE_WRITE) &&
- !real_fops->unlocked_ioctl &&
- !real_fops->compat_ioctl &&
- !real_fops->mmap)
+ (!real_fops ||
+ (!real_fops->unlocked_ioctl &&
+ !real_fops->compat_ioctl &&
+ !real_fops->mmap)))
return 0;
if (security_locked_down(LOCKDOWN_DEBUGFS))
@@ -316,19 +325,38 @@ static ret_type full_proxy_ ## name(proto) \
return r; \
}
-FULL_PROXY_FUNC(llseek, loff_t, filp,
- PROTO(struct file *filp, loff_t offset, int whence),
- ARGS(filp, offset, whence));
+#define FULL_PROXY_FUNC_BOTH(name, ret_type, filp, proto, args) \
+static ret_type full_proxy_ ## name(proto) \
+{ \
+ struct dentry *dentry = F_DENTRY(filp); \
+ struct debugfs_fsdata *fsd; \
+ ret_type r; \
+ \
+ r = debugfs_file_get(dentry); \
+ if (unlikely(r)) \
+ return r; \
+ fsd = dentry->d_fsdata; \
+ if (fsd->real_fops) \
+ r = fsd->real_fops->name(args); \
+ else \
+ r = fsd->short_fops->name(args); \
+ debugfs_file_put(dentry); \
+ return r; \
+}
+
+FULL_PROXY_FUNC_BOTH(llseek, loff_t, filp,
+ PROTO(struct file *filp, loff_t offset, int whence),
+ ARGS(filp, offset, whence));
-FULL_PROXY_FUNC(read, ssize_t, filp,
- PROTO(struct file *filp, char __user *buf, size_t size,
- loff_t *ppos),
- ARGS(filp, buf, size, ppos));
+FULL_PROXY_FUNC_BOTH(read, ssize_t, filp,
+ PROTO(struct file *filp, char __user *buf, size_t size,
+ loff_t *ppos),
+ ARGS(filp, buf, size, ppos));
-FULL_PROXY_FUNC(write, ssize_t, filp,
- PROTO(struct file *filp, const char __user *buf, size_t size,
- loff_t *ppos),
- ARGS(filp, buf, size, ppos));
+FULL_PROXY_FUNC_BOTH(write, ssize_t, filp,
+ PROTO(struct file *filp, const char __user *buf,
+ size_t size, loff_t *ppos),
+ ARGS(filp, buf, size, ppos));
FULL_PROXY_FUNC(unlocked_ioctl, long, filp,
PROTO(struct file *filp, unsigned int cmd, unsigned long arg),
@@ -363,7 +391,7 @@ static int full_proxy_release(struct inode *inode, struct file *filp)
* not to leak any resources. Releasers must not assume that
* ->i_private is still being meaningful here.
*/
- if (real_fops->release)
+ if (real_fops && real_fops->release)
r = real_fops->release(inode, filp);
replace_fops(filp, d_inode(dentry)->i_fop);
@@ -373,39 +401,48 @@ static int full_proxy_release(struct inode *inode, struct file *filp)
}
static void __full_proxy_fops_init(struct file_operations *proxy_fops,
- const struct file_operations *real_fops)
+ struct debugfs_fsdata *fsd)
{
proxy_fops->release = full_proxy_release;
- if (real_fops->llseek)
+
+ if ((fsd->real_fops && fsd->real_fops->llseek) ||
+ (fsd->short_fops && fsd->short_fops->llseek))
proxy_fops->llseek = full_proxy_llseek;
- if (real_fops->read)
+
+ if ((fsd->real_fops && fsd->real_fops->read) ||
+ (fsd->short_fops && fsd->short_fops->read))
proxy_fops->read = full_proxy_read;
- if (real_fops->write)
+
+ if ((fsd->real_fops && fsd->real_fops->write) ||
+ (fsd->short_fops && fsd->short_fops->write))
proxy_fops->write = full_proxy_write;
- if (real_fops->poll)
+
+ if (fsd->real_fops && fsd->real_fops->poll)
proxy_fops->poll = full_proxy_poll;
- if (real_fops->unlocked_ioctl)
+
+ if (fsd->real_fops && fsd->real_fops->unlocked_ioctl)
proxy_fops->unlocked_ioctl = full_proxy_unlocked_ioctl;
}
static int full_proxy_open(struct inode *inode, struct file *filp)
{
struct dentry *dentry = F_DENTRY(filp);
- const struct file_operations *real_fops = NULL;
+ const struct file_operations *real_fops;
struct file_operations *proxy_fops = NULL;
+ struct debugfs_fsdata *fsd;
int r;
r = debugfs_file_get(dentry);
if (r)
return r == -EIO ? -ENOENT : r;
- real_fops = debugfs_real_fops(filp);
-
+ fsd = dentry->d_fsdata;
+ real_fops = fsd->real_fops;
r = debugfs_locked_down(inode, filp, real_fops);
if (r)
goto out;
- if (!fops_get(real_fops)) {
+ if (real_fops && !fops_get(real_fops)) {
#ifdef CONFIG_MODULES
if (real_fops->owner &&
real_fops->owner->state == MODULE_STATE_GOING) {
@@ -426,11 +463,14 @@ static int full_proxy_open(struct inode *inode, struct file *filp)
r = -ENOMEM;
goto free_proxy;
}
- __full_proxy_fops_init(proxy_fops, real_fops);
+ __full_proxy_fops_init(proxy_fops, fsd);
replace_fops(filp, proxy_fops);
- if (real_fops->open) {
- r = real_fops->open(inode, filp);
+ if (!real_fops || real_fops->open) {
+ if (real_fops)
+ r = real_fops->open(inode, filp);
+ else
+ r = simple_open(inode, filp);
if (r) {
replace_fops(filp, d_inode(dentry)->i_fop);
goto free_proxy;
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index 66d9b3b4c588..38a9c7eb97e6 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -412,7 +412,7 @@ static struct dentry *end_creating(struct dentry *dentry)
static struct dentry *__debugfs_create_file(const char *name, umode_t mode,
struct dentry *parent, void *data,
const struct file_operations *proxy_fops,
- const struct file_operations *real_fops)
+ const void *real_fops)
{
struct dentry *dentry;
struct inode *inode;
@@ -450,49 +450,38 @@ static struct dentry *__debugfs_create_file(const char *name, umode_t mode,
return end_creating(dentry);
}
-/**
- * debugfs_create_file - create a file in the debugfs filesystem
- * @name: a pointer to a string containing the name of the file to create.
- * @mode: the permission that the file should have.
- * @parent: a pointer to the parent dentry for this file. This should be a
- * directory dentry if set. If this parameter is NULL, then the
- * file will be created in the root of the debugfs filesystem.
- * @data: a pointer to something that the caller will want to get to later
- * on. The inode.i_private pointer will point to this value on
- * the open() call.
- * @fops: a pointer to a struct file_operations that should be used for
- * this file.
- *
- * This is the basic "create a file" function for debugfs. It allows for a
- * wide range of flexibility in creating a file, or a directory (if you want
- * to create a directory, the debugfs_create_dir() function is
- * recommended to be used instead.)
- *
- * This function will return a pointer to a dentry if it succeeds. This
- * pointer must be passed to the debugfs_remove() function when the file is
- * to be removed (no automatic cleanup happens if your module is unloaded,
- * you are responsible here.) If an error occurs, ERR_PTR(-ERROR) will be
- * returned.
- *
- * If debugfs is not enabled in the kernel, the value -%ENODEV will be
- * returned.
- *
- * NOTE: it's expected that most callers should _ignore_ the errors returned
- * by this function. Other debugfs functions handle the fact that the "dentry"
- * passed to them could be an error and they don't crash in that case.
- * Drivers should generally work fine even if debugfs fails to init anyway.
- */
-struct dentry *debugfs_create_file(const char *name, umode_t mode,
- struct dentry *parent, void *data,
- const struct file_operations *fops)
+struct dentry *debugfs_create_file_full(const char *name, umode_t mode,
+ struct dentry *parent, void *data,
+ const struct file_operations *fops)
{
+ if (WARN_ON((unsigned long)fops &
+ (DEBUGFS_FSDATA_IS_SHORT_FOPS_BIT |
+ DEBUGFS_FSDATA_IS_REAL_FOPS_BIT)))
+ return ERR_PTR(-EINVAL);
return __debugfs_create_file(name, mode, parent, data,
fops ? &debugfs_full_proxy_file_operations :
&debugfs_noop_file_operations,
fops);
}
-EXPORT_SYMBOL_GPL(debugfs_create_file);
+EXPORT_SYMBOL_GPL(debugfs_create_file_full);
+
+struct dentry *debugfs_create_file_short(const char *name, umode_t mode,
+ struct dentry *parent, void *data,
+ const struct debugfs_short_fops *fops)
+{
+ if (WARN_ON((unsigned long)fops &
+ (DEBUGFS_FSDATA_IS_SHORT_FOPS_BIT |
+ DEBUGFS_FSDATA_IS_REAL_FOPS_BIT)))
+ return ERR_PTR(-EINVAL);
+
+ return __debugfs_create_file(name, mode, parent, data,
+ fops ? &debugfs_full_proxy_file_operations :
+ &debugfs_noop_file_operations,
+ (const void *)((unsigned long)fops |
+ DEBUGFS_FSDATA_IS_SHORT_FOPS_BIT));
+}
+EXPORT_SYMBOL_GPL(debugfs_create_file_short);
/**
* debugfs_create_file_unsafe - create a file in the debugfs filesystem
diff --git a/fs/debugfs/internal.h b/fs/debugfs/internal.h
index dae80c2a469e..a3edfa4f0d8e 100644
--- a/fs/debugfs/internal.h
+++ b/fs/debugfs/internal.h
@@ -18,6 +18,7 @@ extern const struct file_operations debugfs_full_proxy_file_operations;
struct debugfs_fsdata {
const struct file_operations *real_fops;
+ const struct debugfs_short_fops *short_fops;
union {
/* automount_fn is used when real_fops is NULL */
debugfs_automount_t automount;
@@ -39,6 +40,11 @@ struct debugfs_fsdata {
* pointer gets its lowest bit set.
*/
#define DEBUGFS_FSDATA_IS_REAL_FOPS_BIT BIT(0)
+/*
+ * A dentry's ->d_fsdata, when pointing to real fops, is with
+ * short fops instead of full fops.
+ */
+#define DEBUGFS_FSDATA_IS_SHORT_FOPS_BIT BIT(1)
/* Access BITS */
#define DEBUGFS_ALLOW_API BIT(0)
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 62433cb3d2c2..f9898e60dd8b 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -420,7 +420,9 @@ static bool busy_loop_ep_timeout(unsigned long start_time,
static bool ep_busy_loop_on(struct eventpoll *ep)
{
- return !!READ_ONCE(ep->busy_poll_usecs) || net_busy_loop_on();
+ return !!READ_ONCE(ep->busy_poll_usecs) ||
+ READ_ONCE(ep->prefer_busy_poll) ||
+ net_busy_loop_on();
}
static bool ep_busy_loop_end(void *p, unsigned long start_time)
@@ -455,6 +457,8 @@ static bool ep_busy_loop(struct eventpoll *ep, int nonblock)
* it back in when we have moved a socket with a valid NAPI
* ID onto the ready list.
*/
+ if (prefer_busy_poll)
+ napi_resume_irqs(napi_id);
ep->napi_id = 0;
return false;
}
@@ -538,6 +542,22 @@ static long ep_eventpoll_bp_ioctl(struct file *file, unsigned int cmd,
}
}
+static void ep_suspend_napi_irqs(struct eventpoll *ep)
+{
+ unsigned int napi_id = READ_ONCE(ep->napi_id);
+
+ if (napi_id >= MIN_NAPI_ID && READ_ONCE(ep->prefer_busy_poll))
+ napi_suspend_irqs(napi_id);
+}
+
+static void ep_resume_napi_irqs(struct eventpoll *ep)
+{
+ unsigned int napi_id = READ_ONCE(ep->napi_id);
+
+ if (napi_id >= MIN_NAPI_ID && READ_ONCE(ep->prefer_busy_poll))
+ napi_resume_irqs(napi_id);
+}
+
#else
static inline bool ep_busy_loop(struct eventpoll *ep, int nonblock)
@@ -555,6 +575,14 @@ static long ep_eventpoll_bp_ioctl(struct file *file, unsigned int cmd,
return -EOPNOTSUPP;
}
+static void ep_suspend_napi_irqs(struct eventpoll *ep)
+{
+}
+
+static void ep_resume_napi_irqs(struct eventpoll *ep)
+{
+}
+
#endif /* CONFIG_NET_RX_BUSY_POLL */
/*
@@ -786,6 +814,7 @@ static bool ep_refcount_dec_and_test(struct eventpoll *ep)
static void ep_free(struct eventpoll *ep)
{
+ ep_resume_napi_irqs(ep);
mutex_destroy(&ep->mtx);
free_uid(ep->user);
wakeup_source_unregister(ep->ws);
@@ -2008,8 +2037,11 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
* trying again in search of more luck.
*/
res = ep_send_events(ep, events, maxevents);
- if (res)
+ if (res) {
+ if (res > 0)
+ ep_suspend_napi_irqs(ep);
return res;
+ }
}
if (timed_out)