From f03e94f23b04c2b71c0044c1534921b3975ef10c Mon Sep 17 00:00:00 2001 From: Chaotian Jing Date: Tue, 13 Aug 2024 13:34:10 +0800 Subject: scsi: core: Fix the return value of scsi_logical_block_count() scsi_logical_block_count() should return the block count of a given SCSI command. The original implementation ended up shifting twice, leading to an incorrect count being returned. Fix the conversion between bytes and logical blocks. Cc: stable@vger.kernel.org Fixes: 6a20e21ae1e2 ("scsi: core: Add helper to return number of logical blocks in a request") Signed-off-by: Chaotian Jing Link: https://lore.kernel.org/r/20240813053534.7720-1-chaotian.jing@mediatek.com Reviewed-by: Bart Van Assche Signed-off-by: Martin K. Petersen --- include/scsi/scsi_cmnd.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/scsi/scsi_cmnd.h b/include/scsi/scsi_cmnd.h index 45c40d200154..8ecfb94049db 100644 --- a/include/scsi/scsi_cmnd.h +++ b/include/scsi/scsi_cmnd.h @@ -234,7 +234,7 @@ static inline sector_t scsi_get_lba(struct scsi_cmnd *scmd) static inline unsigned int scsi_logical_block_count(struct scsi_cmnd *scmd) { - unsigned int shift = ilog2(scmd->device->sector_size) - SECTOR_SHIFT; + unsigned int shift = ilog2(scmd->device->sector_size); return blk_rq_bytes(scsi_cmd_to_rq(scmd)) >> shift; } -- cgit v1.3.1 From cd06b713a6880997ca5aecac8e33d5f9c541749e Mon Sep 17 00:00:00 2001 From: Manivannan Sadhasivam Date: Fri, 16 Aug 2024 11:55:10 +0530 Subject: scsi: ufs: core: Add a quirk for handling broken LSDBS field in controller capabilities register 'Legacy Queue & Single Doorbell Support (LSDBS)' field in the controller capabilities register is supposed to report whether the legacy single doorbell mode is supported in the controller or not. But some controllers report '1' in this field which corresponds to 'LSDB not supported', but they indeed support LSDB. So let's add a quirk to handle those controllers. If the quirk is enabled by the controller driver, then LSDBS register field will be ignored and legacy single doorbell mode is assumed to be enabled always. Tested-by: Amit Pundir Signed-off-by: Manivannan Sadhasivam Link: https://lore.kernel.org/r/20240816-ufs-bug-fix-v3-1-e6fe0e18e2a3@linaro.org Reviewed-by: Bart Van Assche Signed-off-by: Martin K. Petersen --- drivers/ufs/core/ufshcd.c | 6 +++++- include/ufs/ufshcd.h | 8 ++++++++ 2 files changed, 13 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/drivers/ufs/core/ufshcd.c b/drivers/ufs/core/ufshcd.c index 0b3d0c8e0dda..a6f818cdef0e 100644 --- a/drivers/ufs/core/ufshcd.c +++ b/drivers/ufs/core/ufshcd.c @@ -2426,7 +2426,11 @@ static inline int ufshcd_hba_capabilities(struct ufs_hba *hba) * 0h: legacy single doorbell support is available * 1h: indicate that legacy single doorbell support has been removed */ - hba->lsdb_sup = !FIELD_GET(MASK_LSDB_SUPPORT, hba->capabilities); + if (!(hba->quirks & UFSHCD_QUIRK_BROKEN_LSDBS_CAP)) + hba->lsdb_sup = !FIELD_GET(MASK_LSDB_SUPPORT, hba->capabilities); + else + hba->lsdb_sup = true; + if (!hba->mcq_sup) return 0; diff --git a/include/ufs/ufshcd.h b/include/ufs/ufshcd.h index cac0cdb9a916..0fd2aebac728 100644 --- a/include/ufs/ufshcd.h +++ b/include/ufs/ufshcd.h @@ -676,6 +676,14 @@ enum ufshcd_quirks { * the standard best practice for managing keys). */ UFSHCD_QUIRK_KEYS_IN_PRDT = 1 << 24, + + /* + * This quirk indicates that the controller reports the value 1 (not + * supported) in the Legacy Single DoorBell Support (LSDBS) bit of the + * Controller Capabilities register although it supports the legacy + * single doorbell mode. + */ + UFSHCD_QUIRK_BROKEN_LSDBS_CAP = 1 << 25, }; enum ufshcd_caps { -- cgit v1.3.1 From cd8e468efb4fb2742e06328a75b282c35c1abf8d Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Wed, 14 Aug 2024 21:01:57 +0200 Subject: ACPI: video: Add Dell UART backlight controller detection Dell All In One (AIO) models released after 2017 use a backlight controller board connected to an UART. In DSDT this uart port will be defined as: Name (_HID, "DELL0501") Name (_CID, EisaId ("PNP0501") Commit 484bae9e4d6a ("platform/x86: Add new Dell UART backlight driver") has added support for this, but I neglected to tie this into acpi_video_get_backlight_type(). Now the first AIO has turned up which has not only the DSDT bits for this, but also an actual controller attached to the UART, yet it is not using this controller for backlight control. Add support to acpi_video_get_backlight_type() for a new dell_uart backlight type. So that the existing infra to override the backlight control method on the commandline or with DMI quirks can be used. Fixes: 484bae9e4d6a ("platform/x86: Add new Dell UART backlight driver") Cc: All applicable Signed-off-by: Hans de Goede Reviewed-by: Andy Shevchenko Link: https://patch.msgid.link/20240814190159.15650-2-hdegoede@redhat.com Signed-off-by: Rafael J. Wysocki --- drivers/acpi/video_detect.c | 7 +++++++ include/acpi/video.h | 1 + 2 files changed, 8 insertions(+) (limited to 'include') diff --git a/drivers/acpi/video_detect.c b/drivers/acpi/video_detect.c index c11cbe5b6eaa..e509dcbf3090 100644 --- a/drivers/acpi/video_detect.c +++ b/drivers/acpi/video_detect.c @@ -54,6 +54,8 @@ static void acpi_video_parse_cmdline(void) acpi_backlight_cmdline = acpi_backlight_nvidia_wmi_ec; if (!strcmp("apple_gmux", acpi_video_backlight_string)) acpi_backlight_cmdline = acpi_backlight_apple_gmux; + if (!strcmp("dell_uart", acpi_video_backlight_string)) + acpi_backlight_cmdline = acpi_backlight_dell_uart; if (!strcmp("none", acpi_video_backlight_string)) acpi_backlight_cmdline = acpi_backlight_none; } @@ -918,6 +920,7 @@ enum acpi_backlight_type __acpi_video_get_backlight_type(bool native, bool *auto static DEFINE_MUTEX(init_mutex); static bool nvidia_wmi_ec_present; static bool apple_gmux_present; + static bool dell_uart_present; static bool native_available; static bool init_done; static long video_caps; @@ -932,6 +935,7 @@ enum acpi_backlight_type __acpi_video_get_backlight_type(bool native, bool *auto &video_caps, NULL); nvidia_wmi_ec_present = nvidia_wmi_ec_supported(); apple_gmux_present = apple_gmux_detect(NULL, NULL); + dell_uart_present = acpi_dev_present("DELL0501", NULL, -1); init_done = true; } if (native) @@ -962,6 +966,9 @@ enum acpi_backlight_type __acpi_video_get_backlight_type(bool native, bool *auto if (apple_gmux_present) return acpi_backlight_apple_gmux; + if (dell_uart_present) + return acpi_backlight_dell_uart; + /* Use ACPI video if available, except when native should be preferred. */ if ((video_caps & ACPI_VIDEO_BACKLIGHT) && !(native_available && prefer_native_over_acpi_video())) diff --git a/include/acpi/video.h b/include/acpi/video.h index 3d538d4178ab..044c463138df 100644 --- a/include/acpi/video.h +++ b/include/acpi/video.h @@ -50,6 +50,7 @@ enum acpi_backlight_type { acpi_backlight_native, acpi_backlight_nvidia_wmi_ec, acpi_backlight_apple_gmux, + acpi_backlight_dell_uart, }; #if IS_ENABLED(CONFIG_ACPI_VIDEO) -- cgit v1.3.1 From ad614a706b1ac83b95b333f44b8f5e70bcb37dc5 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Mon, 29 Jul 2024 11:26:34 +0200 Subject: drm/xe/oa/uapi: Make bit masks unsigned MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When building with gcc-5: In function ‘decode_oa_format.isra.26’, inlined from ‘xe_oa_set_prop_oa_format’ at drivers/gpu/drm/xe/xe_oa.c:1664:6: ././include/linux/compiler_types.h:510:38: error: call to ‘__compiletime_assert_1336’ declared with attribute error: FIELD_GET: mask is not constant [...] ./include/linux/bitfield.h:155:3: note: in expansion of macro ‘__BF_FIELD_CHECK’ __BF_FIELD_CHECK(_mask, _reg, 0U, "FIELD_GET: "); \ ^ drivers/gpu/drm/xe/xe_oa.c:1573:18: note: in expansion of macro ‘FIELD_GET’ u32 bc_report = FIELD_GET(DRM_XE_OA_FORMAT_MASK_BC_REPORT, fmt); ^ Fixes: b6fd51c62119 ("drm/xe/oa/uapi: Define and parse OA stream properties") Signed-off-by: Geert Uytterhoeven Reviewed-by: Lucas De Marchi Link: https://patchwork.freedesktop.org/patch/msgid/20240729092634.2227611-1-geert+renesas@glider.be Signed-off-by: Lucas De Marchi (cherry picked from commit f2881dfdaaa9ec873dbd383ef5512fc31e576cbb) Signed-off-by: Rodrigo Vivi --- include/uapi/drm/xe_drm.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/uapi/drm/xe_drm.h b/include/uapi/drm/xe_drm.h index 19619d4952a8..db232a25189e 100644 --- a/include/uapi/drm/xe_drm.h +++ b/include/uapi/drm/xe_drm.h @@ -1590,10 +1590,10 @@ enum drm_xe_oa_property_id { * b. Counter select c. Counter size and d. BC report. Also refer to the * oa_formats array in drivers/gpu/drm/xe/xe_oa.c. */ -#define DRM_XE_OA_FORMAT_MASK_FMT_TYPE (0xff << 0) -#define DRM_XE_OA_FORMAT_MASK_COUNTER_SEL (0xff << 8) -#define DRM_XE_OA_FORMAT_MASK_COUNTER_SIZE (0xff << 16) -#define DRM_XE_OA_FORMAT_MASK_BC_REPORT (0xff << 24) +#define DRM_XE_OA_FORMAT_MASK_FMT_TYPE (0xffu << 0) +#define DRM_XE_OA_FORMAT_MASK_COUNTER_SEL (0xffu << 8) +#define DRM_XE_OA_FORMAT_MASK_COUNTER_SIZE (0xffu << 16) +#define DRM_XE_OA_FORMAT_MASK_BC_REPORT (0xffu << 24) /** * @DRM_XE_OA_PROPERTY_OA_PERIOD_EXPONENT: Requests periodic OA unit -- cgit v1.3.1 From 81475beb1b5996505a39cd1d9316ce1e668932a2 Mon Sep 17 00:00:00 2001 From: John Garry Date: Thu, 15 Aug 2024 16:32:28 +0000 Subject: block: Drop NULL check in bdev_write_zeroes_sectors() Function bdev_get_queue() must not return NULL, so drop the check in bdev_write_zeroes_sectors(). Reviewed-by: Christoph Hellwig Signed-off-by: John Garry Reviewed-by: Martin K. Petersen Reviewed-by: Nitesh Shetty Link: https://lore.kernel.org/r/20240815163228.216051-3-john.g.garry@oracle.com Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index e85ec73a07d5..b7664d593486 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1296,12 +1296,7 @@ bdev_max_secure_erase_sectors(struct block_device *bdev) static inline unsigned int bdev_write_zeroes_sectors(struct block_device *bdev) { - struct request_queue *q = bdev_get_queue(bdev); - - if (q) - return q->limits.max_write_zeroes_sectors; - - return 0; + return bdev_get_queue(bdev)->limits.max_write_zeroes_sectors; } static inline bool bdev_nonrot(struct block_device *bdev) -- cgit v1.3.1 From dc0112e6d8b42b39f9d283bab489a757e9d284f0 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 12 Aug 2024 11:47:59 -0400 Subject: rpcrdma: Trace connection registration and unregistration These new trace points record xarray indices and the time of endpoint registration and unregistration, to co-ordinate with device removal events. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- include/trace/events/rpcrdma.h | 36 ++++++++++++++++++++++++++++++++++++ net/sunrpc/xprtrdma/ib_client.c | 2 ++ 2 files changed, 38 insertions(+) (limited to 'include') diff --git a/include/trace/events/rpcrdma.h b/include/trace/events/rpcrdma.h index ba2d6a0e41cc..a96a985c49b3 100644 --- a/include/trace/events/rpcrdma.h +++ b/include/trace/events/rpcrdma.h @@ -2277,6 +2277,42 @@ DEFINE_CLIENT_DEVICE_EVENT(rpcrdma_client_remove_one); DEFINE_CLIENT_DEVICE_EVENT(rpcrdma_client_wait_on); DEFINE_CLIENT_DEVICE_EVENT(rpcrdma_client_remove_one_done); +DECLARE_EVENT_CLASS(rpcrdma_client_register_class, + TP_PROTO( + const struct ib_device *device, + const struct rpcrdma_notification *rn + ), + + TP_ARGS(device, rn), + + TP_STRUCT__entry( + __string(name, device->name) + __field(void *, callback) + __field(u32, index) + ), + + TP_fast_assign( + __assign_str(name); + __entry->callback = rn->rn_done; + __entry->index = rn->rn_index; + ), + + TP_printk("device=%s index=%u done callback=%pS\n", + __get_str(name), __entry->index, __entry->callback + ) +); + +#define DEFINE_CLIENT_REGISTER_EVENT(name) \ + DEFINE_EVENT(rpcrdma_client_register_class, name, \ + TP_PROTO( \ + const struct ib_device *device, \ + const struct rpcrdma_notification *rn \ + ), \ + TP_ARGS(device, rn)) + +DEFINE_CLIENT_REGISTER_EVENT(rpcrdma_client_register); +DEFINE_CLIENT_REGISTER_EVENT(rpcrdma_client_unregister); + #endif /* _TRACE_RPCRDMA_H */ #include diff --git a/net/sunrpc/xprtrdma/ib_client.c b/net/sunrpc/xprtrdma/ib_client.c index 7913d7bad23d..8507cd4d8921 100644 --- a/net/sunrpc/xprtrdma/ib_client.c +++ b/net/sunrpc/xprtrdma/ib_client.c @@ -66,6 +66,7 @@ int rpcrdma_rn_register(struct ib_device *device, return -ENOMEM; kref_get(&rd->rd_kref); rn->rn_done = done; + trace_rpcrdma_client_register(device, rn); return 0; } @@ -91,6 +92,7 @@ void rpcrdma_rn_unregister(struct ib_device *device, if (!rd) return; + trace_rpcrdma_client_unregister(device, rn); xa_erase(&rd->rd_xa, rn->rn_index); kref_put(&rd->rd_kref, rpcrdma_rn_release); } -- cgit v1.3.1 From 5fd0628918977a0afdc2e6bc562d8751b5d3b8c5 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 26 Aug 2024 12:45:22 +0200 Subject: netfilter: nf_tables: restore IP sanity checks for netdev/egress Subtract network offset to skb->len before performing IPv4 header sanity checks, then adjust transport offset from offset from mac header. Jorge Ortiz says: When small UDP packets (< 4 bytes payload) are sent from eth0, `meta l4proto udp` condition is not met because `NFT_PKTINFO_L4PROTO` is not set. This happens because there is a comparison that checks if the transport header offset exceeds the total length. This comparison does not take into account the fact that the skb network offset might be non-zero in egress mode (e.g., 14 bytes for Ethernet header). Fixes: 0ae8e4cca787 ("netfilter: nf_tables: set transport offset from mac header for netdev/egress") Reported-by: Jorge Ortiz Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables_ipv4.h | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_tables_ipv4.h b/include/net/netfilter/nf_tables_ipv4.h index 60a7d0ce3080..fcf967286e37 100644 --- a/include/net/netfilter/nf_tables_ipv4.h +++ b/include/net/netfilter/nf_tables_ipv4.h @@ -19,7 +19,7 @@ static inline void nft_set_pktinfo_ipv4(struct nft_pktinfo *pkt) static inline int __nft_set_pktinfo_ipv4_validate(struct nft_pktinfo *pkt) { struct iphdr *iph, _iph; - u32 len, thoff; + u32 len, thoff, skb_len; iph = skb_header_pointer(pkt->skb, skb_network_offset(pkt->skb), sizeof(*iph), &_iph); @@ -30,8 +30,10 @@ static inline int __nft_set_pktinfo_ipv4_validate(struct nft_pktinfo *pkt) return -1; len = iph_totlen(pkt->skb, iph); - thoff = skb_network_offset(pkt->skb) + (iph->ihl * 4); - if (pkt->skb->len < len) + thoff = iph->ihl * 4; + skb_len = pkt->skb->len - skb_network_offset(pkt->skb); + + if (skb_len < len) return -1; else if (len < thoff) return -1; @@ -40,7 +42,7 @@ static inline int __nft_set_pktinfo_ipv4_validate(struct nft_pktinfo *pkt) pkt->flags = NFT_PKTINFO_L4PROTO; pkt->tprot = iph->protocol; - pkt->thoff = thoff; + pkt->thoff = skb_network_offset(pkt->skb) + thoff; pkt->fragoff = ntohs(iph->frag_off) & IP_OFFSET; return 0; -- cgit v1.3.1 From 7e8ae8486e4471513e2111aba6ac29f2357bed2a Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 26 Aug 2024 10:32:34 -0400 Subject: fs/nfsd: fix update of inode attrs in CB_GETATTR Currently, we copy the mtime and ctime to the in-core inode and then mark the inode dirty. This is fine for certain types of filesystems, but not all. Some require a real setattr to properly change these values (e.g. ceph or reexported NFS). Fix this code to call notify_change() instead, which is the proper way to effect a setattr. There is one problem though: In this case, the client is holding a write delegation and has sent us attributes to update our cache. We don't want to break the delegation for this since that would defeat the purpose. Add a new ATTR_DELEG flag that makes notify_change bypass the try_break_deleg call. Fixes: c5967721e106 ("NFSD: handle GETATTR conflict with write delegation") Reviewed-by: Christian Brauner Signed-off-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/attr.c | 14 +++++++++++--- fs/nfsd/nfs4state.c | 18 +++++++++++++----- fs/nfsd/nfs4xdr.c | 2 +- fs/nfsd/state.h | 2 +- include/linux/fs.h | 1 + 5 files changed, 27 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/fs/attr.c b/fs/attr.c index 960a310581eb..0dbf43b6555c 100644 --- a/fs/attr.c +++ b/fs/attr.c @@ -489,9 +489,17 @@ int notify_change(struct mnt_idmap *idmap, struct dentry *dentry, error = security_inode_setattr(idmap, dentry, attr); if (error) return error; - error = try_break_deleg(inode, delegated_inode); - if (error) - return error; + + /* + * If ATTR_DELEG is set, then these attributes are being set on + * behalf of the holder of a write delegation. We want to avoid + * breaking the delegation in this case. + */ + if (!(ia_valid & ATTR_DELEG)) { + error = try_break_deleg(inode, delegated_inode); + if (error) + return error; + } if (inode->i_op->setattr) error = inode->i_op->setattr(idmap, dentry, attr); diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 02d43f95146e..07f2496850c4 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -8815,7 +8815,7 @@ nfsd4_get_writestateid(struct nfsd4_compound_state *cstate, /** * nfsd4_deleg_getattr_conflict - Recall if GETATTR causes conflict * @rqstp: RPC transaction context - * @inode: file to be checked for a conflict + * @dentry: dentry of inode to be checked for a conflict * @modified: return true if file was modified * @size: new size of file if modified is true * @@ -8830,7 +8830,7 @@ nfsd4_get_writestateid(struct nfsd4_compound_state *cstate, * code is returned. */ __be32 -nfsd4_deleg_getattr_conflict(struct svc_rqst *rqstp, struct inode *inode, +nfsd4_deleg_getattr_conflict(struct svc_rqst *rqstp, struct dentry *dentry, bool *modified, u64 *size) { __be32 status; @@ -8839,6 +8839,7 @@ nfsd4_deleg_getattr_conflict(struct svc_rqst *rqstp, struct inode *inode, struct file_lease *fl; struct iattr attrs; struct nfs4_cb_fattr *ncf; + struct inode *inode = d_inode(dentry); *modified = false; ctx = locks_inode_context(inode); @@ -8890,15 +8891,22 @@ break_lease: ncf->ncf_cur_fsize != ncf->ncf_cb_fsize)) ncf->ncf_file_modified = true; if (ncf->ncf_file_modified) { + int err; + /* * Per section 10.4.3 of RFC 8881, the server would * not update the file's metadata with the client's * modified size */ attrs.ia_mtime = attrs.ia_ctime = current_time(inode); - attrs.ia_valid = ATTR_MTIME | ATTR_CTIME; - setattr_copy(&nop_mnt_idmap, inode, &attrs); - mark_inode_dirty(inode); + attrs.ia_valid = ATTR_MTIME | ATTR_CTIME | ATTR_DELEG; + inode_lock(inode); + err = notify_change(&nop_mnt_idmap, dentry, &attrs, NULL); + inode_unlock(inode); + if (err) { + nfs4_put_stid(&dp->dl_stid); + return nfserrno(err); + } ncf->ncf_cur_fsize = ncf->ncf_cb_fsize; *size = ncf->ncf_cur_fsize; *modified = true; diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 43ccf6119cf1..97f583777972 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -3565,7 +3565,7 @@ nfsd4_encode_fattr4(struct svc_rqst *rqstp, struct xdr_stream *xdr, } args.size = 0; if (attrmask[0] & (FATTR4_WORD0_CHANGE | FATTR4_WORD0_SIZE)) { - status = nfsd4_deleg_getattr_conflict(rqstp, d_inode(dentry), + status = nfsd4_deleg_getattr_conflict(rqstp, dentry, &file_modified, &size); if (status) goto out; diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index ffc217099d19..ec4559ecd193 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -781,5 +781,5 @@ static inline bool try_to_expire_client(struct nfs4_client *clp) } extern __be32 nfsd4_deleg_getattr_conflict(struct svc_rqst *rqstp, - struct inode *inode, bool *file_modified, u64 *size); + struct dentry *dentry, bool *file_modified, u64 *size); #endif /* NFSD4_STATE_H */ diff --git a/include/linux/fs.h b/include/linux/fs.h index 0283cf366c2a..bafc1d134b94 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -208,6 +208,7 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset, #define ATTR_OPEN (1 << 15) /* Truncating from open(O_TRUNC) */ #define ATTR_TIMES_SET (1 << 16) #define ATTR_TOUCH (1 << 17) +#define ATTR_DELEG (1 << 18) /* Delegated attrs. Don't break write delegations */ /* * Whiteout is represented by a char device. The following constants define the -- cgit v1.3.1 From 70c261d500951cf3ea0fcf32651aab9a65a91471 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 26 Aug 2024 15:03:23 +0200 Subject: netfilter: nf_tables_ipv6: consider network offset in netdev/egress validation From netdev/egress, skb->len can include the ethernet header, therefore, subtract network offset from skb->len when validating IPv6 packet length. Fixes: 42df6e1d221d ("netfilter: Introduce egress hook") Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables_ipv6.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_tables_ipv6.h b/include/net/netfilter/nf_tables_ipv6.h index 467d59b9e533..a0633eeaec97 100644 --- a/include/net/netfilter/nf_tables_ipv6.h +++ b/include/net/netfilter/nf_tables_ipv6.h @@ -31,8 +31,8 @@ static inline int __nft_set_pktinfo_ipv6_validate(struct nft_pktinfo *pkt) struct ipv6hdr *ip6h, _ip6h; unsigned int thoff = 0; unsigned short frag_off; + u32 pkt_len, skb_len; int protohdr; - u32 pkt_len; ip6h = skb_header_pointer(pkt->skb, skb_network_offset(pkt->skb), sizeof(*ip6h), &_ip6h); @@ -43,7 +43,8 @@ static inline int __nft_set_pktinfo_ipv6_validate(struct nft_pktinfo *pkt) return -1; pkt_len = ntohs(ip6h->payload_len); - if (pkt_len + sizeof(*ip6h) > pkt->skb->len) + skb_len = pkt->skb->len - skb_network_offset(pkt->skb); + if (pkt_len + sizeof(*ip6h) > skb_len) return -1; protohdr = ipv6_find_hdr(pkt->skb, &thoff, -1, &frag_off, &flags); -- cgit v1.3.1 From 2aeeef906d5a526dc60cf4af92eda69836c39b1f Mon Sep 17 00:00:00 2001 From: Jianbo Liu Date: Fri, 23 Aug 2024 06:10:56 +0300 Subject: bonding: change ipsec_lock from spin lock to mutex In the cited commit, bond->ipsec_lock is added to protect ipsec_list, hence xdo_dev_state_add and xdo_dev_state_delete are called inside this lock. As ipsec_lock is a spin lock and such xfrmdev ops may sleep, "scheduling while atomic" will be triggered when changing bond's active slave. [ 101.055189] BUG: scheduling while atomic: bash/902/0x00000200 [ 101.055726] Modules linked in: [ 101.058211] CPU: 3 PID: 902 Comm: bash Not tainted 6.9.0-rc4+ #1 [ 101.058760] Hardware name: [ 101.059434] Call Trace: [ 101.059436] [ 101.060873] dump_stack_lvl+0x51/0x60 [ 101.061275] __schedule_bug+0x4e/0x60 [ 101.061682] __schedule+0x612/0x7c0 [ 101.062078] ? __mod_timer+0x25c/0x370 [ 101.062486] schedule+0x25/0xd0 [ 101.062845] schedule_timeout+0x77/0xf0 [ 101.063265] ? asm_common_interrupt+0x22/0x40 [ 101.063724] ? __bpf_trace_itimer_state+0x10/0x10 [ 101.064215] __wait_for_common+0x87/0x190 [ 101.064648] ? usleep_range_state+0x90/0x90 [ 101.065091] cmd_exec+0x437/0xb20 [mlx5_core] [ 101.065569] mlx5_cmd_do+0x1e/0x40 [mlx5_core] [ 101.066051] mlx5_cmd_exec+0x18/0x30 [mlx5_core] [ 101.066552] mlx5_crypto_create_dek_key+0xea/0x120 [mlx5_core] [ 101.067163] ? bonding_sysfs_store_option+0x4d/0x80 [bonding] [ 101.067738] ? kmalloc_trace+0x4d/0x350 [ 101.068156] mlx5_ipsec_create_sa_ctx+0x33/0x100 [mlx5_core] [ 101.068747] mlx5e_xfrm_add_state+0x47b/0xaa0 [mlx5_core] [ 101.069312] bond_change_active_slave+0x392/0x900 [bonding] [ 101.069868] bond_option_active_slave_set+0x1c2/0x240 [bonding] [ 101.070454] __bond_opt_set+0xa6/0x430 [bonding] [ 101.070935] __bond_opt_set_notify+0x2f/0x90 [bonding] [ 101.071453] bond_opt_tryset_rtnl+0x72/0xb0 [bonding] [ 101.071965] bonding_sysfs_store_option+0x4d/0x80 [bonding] [ 101.072567] kernfs_fop_write_iter+0x10c/0x1a0 [ 101.073033] vfs_write+0x2d8/0x400 [ 101.073416] ? alloc_fd+0x48/0x180 [ 101.073798] ksys_write+0x5f/0xe0 [ 101.074175] do_syscall_64+0x52/0x110 [ 101.074576] entry_SYSCALL_64_after_hwframe+0x4b/0x53 As bond_ipsec_add_sa_all and bond_ipsec_del_sa_all are only called from bond_change_active_slave, which requires holding the RTNL lock. And bond_ipsec_add_sa and bond_ipsec_del_sa are xfrm state xdo_dev_state_add and xdo_dev_state_delete APIs, which are in user context. So ipsec_lock doesn't have to be spin lock, change it to mutex, and thus the above issue can be resolved. Fixes: 9a5605505d9c ("bonding: Add struct bond_ipesc to manage SA") Signed-off-by: Jianbo Liu Signed-off-by: Tariq Toukan Reviewed-by: Hangbin Liu Acked-by: Jay Vosburgh Link: https://patch.msgid.link/20240823031056.110999-4-jianbol@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/bonding/bond_main.c | 79 ++++++++++++++++++++++------------------- include/net/bonding.h | 2 +- 2 files changed, 44 insertions(+), 37 deletions(-) (limited to 'include') diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index f98491748420..bb9c3d6ef435 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -428,6 +428,7 @@ static int bond_ipsec_add_sa(struct xfrm_state *xs, { struct net_device *bond_dev = xs->xso.dev; struct net_device *real_dev; + netdevice_tracker tracker; struct bond_ipsec *ipsec; struct bonding *bond; struct slave *slave; @@ -439,24 +440,26 @@ static int bond_ipsec_add_sa(struct xfrm_state *xs, rcu_read_lock(); bond = netdev_priv(bond_dev); slave = rcu_dereference(bond->curr_active_slave); - if (!slave) { - rcu_read_unlock(); - return -ENODEV; + real_dev = slave ? slave->dev : NULL; + netdev_hold(real_dev, &tracker, GFP_ATOMIC); + rcu_read_unlock(); + if (!real_dev) { + err = -ENODEV; + goto out; } - real_dev = slave->dev; if (!real_dev->xfrmdev_ops || !real_dev->xfrmdev_ops->xdo_dev_state_add || netif_is_bond_master(real_dev)) { NL_SET_ERR_MSG_MOD(extack, "Slave does not support ipsec offload"); - rcu_read_unlock(); - return -EINVAL; + err = -EINVAL; + goto out; } - ipsec = kmalloc(sizeof(*ipsec), GFP_ATOMIC); + ipsec = kmalloc(sizeof(*ipsec), GFP_KERNEL); if (!ipsec) { - rcu_read_unlock(); - return -ENOMEM; + err = -ENOMEM; + goto out; } xs->xso.real_dev = real_dev; @@ -464,13 +467,14 @@ static int bond_ipsec_add_sa(struct xfrm_state *xs, if (!err) { ipsec->xs = xs; INIT_LIST_HEAD(&ipsec->list); - spin_lock_bh(&bond->ipsec_lock); + mutex_lock(&bond->ipsec_lock); list_add(&ipsec->list, &bond->ipsec_list); - spin_unlock_bh(&bond->ipsec_lock); + mutex_unlock(&bond->ipsec_lock); } else { kfree(ipsec); } - rcu_read_unlock(); +out: + netdev_put(real_dev, &tracker); return err; } @@ -481,35 +485,35 @@ static void bond_ipsec_add_sa_all(struct bonding *bond) struct bond_ipsec *ipsec; struct slave *slave; - rcu_read_lock(); - slave = rcu_dereference(bond->curr_active_slave); - if (!slave) - goto out; + slave = rtnl_dereference(bond->curr_active_slave); + real_dev = slave ? slave->dev : NULL; + if (!real_dev) + return; - real_dev = slave->dev; + mutex_lock(&bond->ipsec_lock); if (!real_dev->xfrmdev_ops || !real_dev->xfrmdev_ops->xdo_dev_state_add || netif_is_bond_master(real_dev)) { - spin_lock_bh(&bond->ipsec_lock); if (!list_empty(&bond->ipsec_list)) slave_warn(bond_dev, real_dev, "%s: no slave xdo_dev_state_add\n", __func__); - spin_unlock_bh(&bond->ipsec_lock); goto out; } - spin_lock_bh(&bond->ipsec_lock); list_for_each_entry(ipsec, &bond->ipsec_list, list) { + /* If new state is added before ipsec_lock acquired */ + if (ipsec->xs->xso.real_dev == real_dev) + continue; + ipsec->xs->xso.real_dev = real_dev; if (real_dev->xfrmdev_ops->xdo_dev_state_add(ipsec->xs, NULL)) { slave_warn(bond_dev, real_dev, "%s: failed to add SA\n", __func__); ipsec->xs->xso.real_dev = NULL; } } - spin_unlock_bh(&bond->ipsec_lock); out: - rcu_read_unlock(); + mutex_unlock(&bond->ipsec_lock); } /** @@ -520,6 +524,7 @@ static void bond_ipsec_del_sa(struct xfrm_state *xs) { struct net_device *bond_dev = xs->xso.dev; struct net_device *real_dev; + netdevice_tracker tracker; struct bond_ipsec *ipsec; struct bonding *bond; struct slave *slave; @@ -530,6 +535,9 @@ static void bond_ipsec_del_sa(struct xfrm_state *xs) rcu_read_lock(); bond = netdev_priv(bond_dev); slave = rcu_dereference(bond->curr_active_slave); + real_dev = slave ? slave->dev : NULL; + netdev_hold(real_dev, &tracker, GFP_ATOMIC); + rcu_read_unlock(); if (!slave) goto out; @@ -537,7 +545,6 @@ static void bond_ipsec_del_sa(struct xfrm_state *xs) if (!xs->xso.real_dev) goto out; - real_dev = slave->dev; WARN_ON(xs->xso.real_dev != real_dev); if (!real_dev->xfrmdev_ops || @@ -549,7 +556,8 @@ static void bond_ipsec_del_sa(struct xfrm_state *xs) real_dev->xfrmdev_ops->xdo_dev_state_delete(xs); out: - spin_lock_bh(&bond->ipsec_lock); + netdev_put(real_dev, &tracker); + mutex_lock(&bond->ipsec_lock); list_for_each_entry(ipsec, &bond->ipsec_list, list) { if (ipsec->xs == xs) { list_del(&ipsec->list); @@ -557,8 +565,7 @@ out: break; } } - spin_unlock_bh(&bond->ipsec_lock); - rcu_read_unlock(); + mutex_unlock(&bond->ipsec_lock); } static void bond_ipsec_del_sa_all(struct bonding *bond) @@ -568,15 +575,12 @@ static void bond_ipsec_del_sa_all(struct bonding *bond) struct bond_ipsec *ipsec; struct slave *slave; - rcu_read_lock(); - slave = rcu_dereference(bond->curr_active_slave); - if (!slave) { - rcu_read_unlock(); + slave = rtnl_dereference(bond->curr_active_slave); + real_dev = slave ? slave->dev : NULL; + if (!real_dev) return; - } - real_dev = slave->dev; - spin_lock_bh(&bond->ipsec_lock); + mutex_lock(&bond->ipsec_lock); list_for_each_entry(ipsec, &bond->ipsec_list, list) { if (!ipsec->xs->xso.real_dev) continue; @@ -593,8 +597,7 @@ static void bond_ipsec_del_sa_all(struct bonding *bond) real_dev->xfrmdev_ops->xdo_dev_state_free(ipsec->xs); } } - spin_unlock_bh(&bond->ipsec_lock); - rcu_read_unlock(); + mutex_unlock(&bond->ipsec_lock); } static void bond_ipsec_free_sa(struct xfrm_state *xs) @@ -5921,7 +5924,7 @@ void bond_setup(struct net_device *bond_dev) /* set up xfrm device ops (only supported in active-backup right now) */ bond_dev->xfrmdev_ops = &bond_xfrmdev_ops; INIT_LIST_HEAD(&bond->ipsec_list); - spin_lock_init(&bond->ipsec_lock); + mutex_init(&bond->ipsec_lock); #endif /* CONFIG_XFRM_OFFLOAD */ /* don't acquire bond device's netif_tx_lock when transmitting */ @@ -5970,6 +5973,10 @@ static void bond_uninit(struct net_device *bond_dev) __bond_release_one(bond_dev, slave->dev, true, true); netdev_info(bond_dev, "Released all slaves\n"); +#ifdef CONFIG_XFRM_OFFLOAD + mutex_destroy(&bond->ipsec_lock); +#endif /* CONFIG_XFRM_OFFLOAD */ + bond_set_slave_arr(bond, NULL, NULL); list_del_rcu(&bond->bond_list); diff --git a/include/net/bonding.h b/include/net/bonding.h index b61fb1aa3a56..8bb5f016969f 100644 --- a/include/net/bonding.h +++ b/include/net/bonding.h @@ -260,7 +260,7 @@ struct bonding { #ifdef CONFIG_XFRM_OFFLOAD struct list_head ipsec_list; /* protecting ipsec_list */ - spinlock_t ipsec_lock; + struct mutex ipsec_lock; #endif /* CONFIG_XFRM_OFFLOAD */ struct bpf_prog *xdp_prog; }; -- cgit v1.3.1 From 0870b0d8b393dde53106678a1e2cec9dfa52f9b7 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 27 Aug 2024 11:49:16 +0000 Subject: net: busy-poll: use ktime_get_ns() instead of local_clock() Typically, busy-polling durations are below 100 usec. When/if the busy-poller thread migrates to another cpu, local_clock() can be off by +/-2msec or more for small values of HZ, depending on the platform. Use ktimer_get_ns() to ensure deterministic behavior, which is the whole point of busy-polling. Fixes: 060212928670 ("net: add low latency socket poll") Fixes: 9a3c71aa8024 ("net: convert low latency sockets to sched_clock()") Fixes: 37089834528b ("sched, net: Fixup busy_loop_us_clock()") Signed-off-by: Eric Dumazet Cc: Mina Almasry Cc: Willem de Bruijn Reviewed-by: Joe Damato Link: https://patch.msgid.link/20240827114916.223377-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/busy_poll.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/busy_poll.h b/include/net/busy_poll.h index 9b09acac538e..522f1da8b747 100644 --- a/include/net/busy_poll.h +++ b/include/net/busy_poll.h @@ -68,7 +68,7 @@ static inline bool sk_can_busy_loop(struct sock *sk) static inline unsigned long busy_loop_current_time(void) { #ifdef CONFIG_NET_RX_BUSY_POLL - return (unsigned long)(local_clock() >> 10); + return (unsigned long)(ktime_get_ns() >> 10); #else return 0; #endif -- cgit v1.3.1